{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 16182, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0001853911753800519, "grad_norm": 20.296875, "learning_rate": 9.999814608824621e-06, "loss": 3.4381, "mean_token_accuracy": 0.3741410027996946, "step": 1 }, { "epoch": 0.0003707823507601038, "grad_norm": 33.5, "learning_rate": 9.99962921764924e-06, "loss": 3.499, "mean_token_accuracy": 0.3633383761976803, "step": 2 }, { "epoch": 0.0005561735261401557, "grad_norm": 23.40625, "learning_rate": 9.99944382647386e-06, "loss": 3.8999, "mean_token_accuracy": 0.3120863309352518, "step": 3 }, { "epoch": 0.0007415647015202076, "grad_norm": 25.8125, "learning_rate": 9.99925843529848e-06, "loss": 3.698, "mean_token_accuracy": 0.3301929116195603, "step": 4 }, { "epoch": 0.0009269558769002596, "grad_norm": 14.3359375, "learning_rate": 9.999073044123102e-06, "loss": 3.7705, "mean_token_accuracy": 0.3312273154993951, "step": 5 }, { "epoch": 0.0011123470522803114, "grad_norm": 15.921875, "learning_rate": 9.99888765294772e-06, "loss": 4.0455, "mean_token_accuracy": 0.3116751269035533, "step": 6 }, { "epoch": 0.0012977382276603633, "grad_norm": 23.28125, "learning_rate": 9.998702261772341e-06, "loss": 3.6514, "mean_token_accuracy": 0.3503990362897154, "step": 7 }, { "epoch": 0.0014831294030404152, "grad_norm": 21.71875, "learning_rate": 9.99851687059696e-06, "loss": 3.4188, "mean_token_accuracy": 0.37161084529505584, "step": 8 }, { "epoch": 0.0016685205784204673, "grad_norm": 17.125, "learning_rate": 9.99833147942158e-06, "loss": 4.2302, "mean_token_accuracy": 0.3071310572687225, "step": 9 }, { "epoch": 0.0018539117538005192, "grad_norm": 25.53125, "learning_rate": 9.9981460882462e-06, "loss": 3.4282, "mean_token_accuracy": 0.364314715241962, "step": 10 }, { "epoch": 0.002039302929180571, "grad_norm": 15.6484375, "learning_rate": 9.99796069707082e-06, "loss": 3.8982, "mean_token_accuracy": 0.33768844221105526, "step": 11 }, { "epoch": 0.002224694104560623, "grad_norm": 21.6875, "learning_rate": 9.99777530589544e-06, "loss": 3.6119, "mean_token_accuracy": 0.3585068198133525, "step": 12 }, { "epoch": 0.002410085279940675, "grad_norm": 26.03125, "learning_rate": 9.99758991472006e-06, "loss": 3.1203, "mean_token_accuracy": 0.4051309460181721, "step": 13 }, { "epoch": 0.0025954764553207266, "grad_norm": 22.875, "learning_rate": 9.997404523544681e-06, "loss": 3.7581, "mean_token_accuracy": 0.3534979423868313, "step": 14 }, { "epoch": 0.0027808676307007787, "grad_norm": 18.421875, "learning_rate": 9.9972191323693e-06, "loss": 4.1867, "mean_token_accuracy": 0.32308126410835214, "step": 15 }, { "epoch": 0.0029662588060808304, "grad_norm": 18.203125, "learning_rate": 9.99703374119392e-06, "loss": 3.9706, "mean_token_accuracy": 0.3233863546733272, "step": 16 }, { "epoch": 0.0031516499814608825, "grad_norm": 13.8828125, "learning_rate": 9.996848350018539e-06, "loss": 4.0017, "mean_token_accuracy": 0.34279940438204637, "step": 17 }, { "epoch": 0.0033370411568409346, "grad_norm": 14.1015625, "learning_rate": 9.99666295884316e-06, "loss": 3.5922, "mean_token_accuracy": 0.36894628705606103, "step": 18 }, { "epoch": 0.0035224323322209862, "grad_norm": 13.609375, "learning_rate": 9.99647756766778e-06, "loss": 3.7713, "mean_token_accuracy": 0.348929889298893, "step": 19 }, { "epoch": 0.0037078235076010383, "grad_norm": 13.65625, "learning_rate": 9.996292176492399e-06, "loss": 3.7535, "mean_token_accuracy": 0.34879288437102923, "step": 20 }, { "epoch": 0.00389321468298109, "grad_norm": 12.71875, "learning_rate": 9.996106785317021e-06, "loss": 3.9474, "mean_token_accuracy": 0.33545048223947305, "step": 21 }, { "epoch": 0.004078605858361142, "grad_norm": 11.3359375, "learning_rate": 9.99592139414164e-06, "loss": 3.6182, "mean_token_accuracy": 0.37765814266487213, "step": 22 }, { "epoch": 0.004263997033741194, "grad_norm": 16.9375, "learning_rate": 9.99573600296626e-06, "loss": 3.4863, "mean_token_accuracy": 0.36946011281224816, "step": 23 }, { "epoch": 0.004449388209121246, "grad_norm": 20.5625, "learning_rate": 9.995550611790879e-06, "loss": 3.5888, "mean_token_accuracy": 0.36218828957486543, "step": 24 }, { "epoch": 0.0046347793845012975, "grad_norm": 10.9765625, "learning_rate": 9.9953652206155e-06, "loss": 3.732, "mean_token_accuracy": 0.36657595645910035, "step": 25 }, { "epoch": 0.00482017055988135, "grad_norm": 13.1640625, "learning_rate": 9.99517982944012e-06, "loss": 3.8653, "mean_token_accuracy": 0.3446526555544052, "step": 26 }, { "epoch": 0.005005561735261402, "grad_norm": 17.203125, "learning_rate": 9.994994438264739e-06, "loss": 3.6163, "mean_token_accuracy": 0.35840407470288627, "step": 27 }, { "epoch": 0.005190952910641453, "grad_norm": 11.7890625, "learning_rate": 9.99480904708936e-06, "loss": 3.7773, "mean_token_accuracy": 0.3563114134542706, "step": 28 }, { "epoch": 0.005376344086021506, "grad_norm": 9.875, "learning_rate": 9.99462365591398e-06, "loss": 3.8295, "mean_token_accuracy": 0.36033011610015386, "step": 29 }, { "epoch": 0.0055617352614015575, "grad_norm": 15.3125, "learning_rate": 9.9944382647386e-06, "loss": 3.7272, "mean_token_accuracy": 0.36409055425448866, "step": 30 }, { "epoch": 0.005747126436781609, "grad_norm": 22.390625, "learning_rate": 9.994252873563219e-06, "loss": 3.0721, "mean_token_accuracy": 0.3995652729829945, "step": 31 }, { "epoch": 0.005932517612161661, "grad_norm": 12.671875, "learning_rate": 9.99406748238784e-06, "loss": 3.0176, "mean_token_accuracy": 0.43850538315389487, "step": 32 }, { "epoch": 0.006117908787541713, "grad_norm": 14.109375, "learning_rate": 9.993882091212458e-06, "loss": 3.7233, "mean_token_accuracy": 0.3470967741935484, "step": 33 }, { "epoch": 0.006303299962921765, "grad_norm": 22.578125, "learning_rate": 9.993696700037079e-06, "loss": 3.1235, "mean_token_accuracy": 0.39344262295081966, "step": 34 }, { "epoch": 0.006488691138301817, "grad_norm": 13.0859375, "learning_rate": 9.9935113088617e-06, "loss": 3.4322, "mean_token_accuracy": 0.3896842518140812, "step": 35 }, { "epoch": 0.006674082313681869, "grad_norm": 8.328125, "learning_rate": 9.993325917686318e-06, "loss": 3.2582, "mean_token_accuracy": 0.38849028400597907, "step": 36 }, { "epoch": 0.006859473489061921, "grad_norm": 13.234375, "learning_rate": 9.993140526510939e-06, "loss": 3.039, "mean_token_accuracy": 0.4251584592881521, "step": 37 }, { "epoch": 0.0070448646644419724, "grad_norm": 12.5859375, "learning_rate": 9.99295513533556e-06, "loss": 3.4285, "mean_token_accuracy": 0.39289994008388257, "step": 38 }, { "epoch": 0.007230255839822024, "grad_norm": 9.6640625, "learning_rate": 9.99276974416018e-06, "loss": 3.6749, "mean_token_accuracy": 0.35631067961165047, "step": 39 }, { "epoch": 0.007415647015202077, "grad_norm": 12.328125, "learning_rate": 9.992584352984798e-06, "loss": 4.1607, "mean_token_accuracy": 0.32176572264326553, "step": 40 }, { "epoch": 0.007601038190582128, "grad_norm": 20.578125, "learning_rate": 9.992398961809419e-06, "loss": 3.6988, "mean_token_accuracy": 0.36075949367088606, "step": 41 }, { "epoch": 0.00778642936596218, "grad_norm": 10.4140625, "learning_rate": 9.992213570634038e-06, "loss": 3.7204, "mean_token_accuracy": 0.3503425466572171, "step": 42 }, { "epoch": 0.007971820541342232, "grad_norm": 13.578125, "learning_rate": 9.992028179458658e-06, "loss": 4.0198, "mean_token_accuracy": 0.3351313969571231, "step": 43 }, { "epoch": 0.008157211716722283, "grad_norm": 15.03125, "learning_rate": 9.991842788283279e-06, "loss": 3.2864, "mean_token_accuracy": 0.4051724137931034, "step": 44 }, { "epoch": 0.008342602892102336, "grad_norm": 10.1875, "learning_rate": 9.9916573971079e-06, "loss": 3.4034, "mean_token_accuracy": 0.39417650076962546, "step": 45 }, { "epoch": 0.008527994067482388, "grad_norm": 18.015625, "learning_rate": 9.991472005932518e-06, "loss": 3.285, "mean_token_accuracy": 0.39734923790589793, "step": 46 }, { "epoch": 0.008713385242862439, "grad_norm": 8.9921875, "learning_rate": 9.991286614757138e-06, "loss": 3.4491, "mean_token_accuracy": 0.38653762819586884, "step": 47 }, { "epoch": 0.008898776418242492, "grad_norm": 8.640625, "learning_rate": 9.991101223581759e-06, "loss": 3.5815, "mean_token_accuracy": 0.373163027056723, "step": 48 }, { "epoch": 0.009084167593622544, "grad_norm": 8.3828125, "learning_rate": 9.990915832406378e-06, "loss": 3.9502, "mean_token_accuracy": 0.3483087234265368, "step": 49 }, { "epoch": 0.009269558769002595, "grad_norm": 11.3203125, "learning_rate": 9.990730441230998e-06, "loss": 3.0887, "mean_token_accuracy": 0.4165704758240349, "step": 50 }, { "epoch": 0.009454949944382647, "grad_norm": 17.828125, "learning_rate": 9.990545050055617e-06, "loss": 3.2425, "mean_token_accuracy": 0.3741918458899591, "step": 51 }, { "epoch": 0.0096403411197627, "grad_norm": 8.21875, "learning_rate": 9.990359658880238e-06, "loss": 3.7171, "mean_token_accuracy": 0.3701393497013935, "step": 52 }, { "epoch": 0.00982573229514275, "grad_norm": 28.3125, "learning_rate": 9.990174267704858e-06, "loss": 3.6864, "mean_token_accuracy": 0.3573947313835304, "step": 53 }, { "epoch": 0.010011123470522803, "grad_norm": 13.5703125, "learning_rate": 9.989988876529479e-06, "loss": 3.4587, "mean_token_accuracy": 0.37662811929397444, "step": 54 }, { "epoch": 0.010196514645902856, "grad_norm": 36.65625, "learning_rate": 9.989803485354097e-06, "loss": 3.4809, "mean_token_accuracy": 0.3548863636363636, "step": 55 }, { "epoch": 0.010381905821282907, "grad_norm": 10.4453125, "learning_rate": 9.989618094178718e-06, "loss": 3.6576, "mean_token_accuracy": 0.3727217125382263, "step": 56 }, { "epoch": 0.010567296996662959, "grad_norm": 9.6484375, "learning_rate": 9.989432703003338e-06, "loss": 4.2375, "mean_token_accuracy": 0.322178762856729, "step": 57 }, { "epoch": 0.010752688172043012, "grad_norm": 10.4375, "learning_rate": 9.989247311827957e-06, "loss": 3.9134, "mean_token_accuracy": 0.34256864918113533, "step": 58 }, { "epoch": 0.010938079347423062, "grad_norm": 16.984375, "learning_rate": 9.989061920652578e-06, "loss": 3.7722, "mean_token_accuracy": 0.3469330332020259, "step": 59 }, { "epoch": 0.011123470522803115, "grad_norm": 13.1015625, "learning_rate": 9.988876529477196e-06, "loss": 3.5898, "mean_token_accuracy": 0.3768393172454385, "step": 60 }, { "epoch": 0.011308861698183166, "grad_norm": 9.15625, "learning_rate": 9.988691138301819e-06, "loss": 3.7143, "mean_token_accuracy": 0.36265190711105766, "step": 61 }, { "epoch": 0.011494252873563218, "grad_norm": 9.6484375, "learning_rate": 9.988505747126437e-06, "loss": 3.339, "mean_token_accuracy": 0.40487172095704815, "step": 62 }, { "epoch": 0.01167964404894327, "grad_norm": 22.046875, "learning_rate": 9.988320355951058e-06, "loss": 3.3481, "mean_token_accuracy": 0.3734547820429408, "step": 63 }, { "epoch": 0.011865035224323322, "grad_norm": 12.84375, "learning_rate": 9.988134964775678e-06, "loss": 3.0666, "mean_token_accuracy": 0.4502140588316531, "step": 64 }, { "epoch": 0.012050426399703374, "grad_norm": 12.1171875, "learning_rate": 9.987949573600297e-06, "loss": 3.1151, "mean_token_accuracy": 0.4391958559447459, "step": 65 }, { "epoch": 0.012235817575083427, "grad_norm": 16.9375, "learning_rate": 9.987764182424918e-06, "loss": 3.5205, "mean_token_accuracy": 0.38388911209321014, "step": 66 }, { "epoch": 0.012421208750463477, "grad_norm": 8.7890625, "learning_rate": 9.987578791249536e-06, "loss": 3.6644, "mean_token_accuracy": 0.35716242125445086, "step": 67 }, { "epoch": 0.01260659992584353, "grad_norm": 11.5390625, "learning_rate": 9.987393400074157e-06, "loss": 3.1804, "mean_token_accuracy": 0.4036207345662933, "step": 68 }, { "epoch": 0.012791991101223582, "grad_norm": 9.9765625, "learning_rate": 9.987208008898777e-06, "loss": 3.261, "mean_token_accuracy": 0.4166935744268647, "step": 69 }, { "epoch": 0.012977382276603633, "grad_norm": 11.2265625, "learning_rate": 9.987022617723398e-06, "loss": 3.7567, "mean_token_accuracy": 0.3566405193400054, "step": 70 }, { "epoch": 0.013162773451983686, "grad_norm": 10.5234375, "learning_rate": 9.986837226548017e-06, "loss": 3.6839, "mean_token_accuracy": 0.3646567164179104, "step": 71 }, { "epoch": 0.013348164627363738, "grad_norm": 8.7421875, "learning_rate": 9.986651835372637e-06, "loss": 3.7606, "mean_token_accuracy": 0.36064139941690965, "step": 72 }, { "epoch": 0.013533555802743789, "grad_norm": 15.328125, "learning_rate": 9.986466444197258e-06, "loss": 3.7036, "mean_token_accuracy": 0.35925591008913577, "step": 73 }, { "epoch": 0.013718946978123842, "grad_norm": 10.015625, "learning_rate": 9.986281053021877e-06, "loss": 3.4698, "mean_token_accuracy": 0.4113996069101066, "step": 74 }, { "epoch": 0.013904338153503892, "grad_norm": 23.09375, "learning_rate": 9.986095661846497e-06, "loss": 3.0644, "mean_token_accuracy": 0.4284487385936661, "step": 75 }, { "epoch": 0.014089729328883945, "grad_norm": 13.78125, "learning_rate": 9.985910270671116e-06, "loss": 3.1365, "mean_token_accuracy": 0.44471413160733547, "step": 76 }, { "epoch": 0.014275120504263997, "grad_norm": 12.7734375, "learning_rate": 9.985724879495738e-06, "loss": 3.8205, "mean_token_accuracy": 0.3564191533657183, "step": 77 }, { "epoch": 0.014460511679644048, "grad_norm": 9.6328125, "learning_rate": 9.985539488320357e-06, "loss": 3.8257, "mean_token_accuracy": 0.3372511848341232, "step": 78 }, { "epoch": 0.0146459028550241, "grad_norm": 14.3828125, "learning_rate": 9.985354097144977e-06, "loss": 3.4389, "mean_token_accuracy": 0.3951518691588785, "step": 79 }, { "epoch": 0.014831294030404153, "grad_norm": 16.0, "learning_rate": 9.985168705969596e-06, "loss": 3.5757, "mean_token_accuracy": 0.3722309647742404, "step": 80 }, { "epoch": 0.015016685205784204, "grad_norm": 10.5078125, "learning_rate": 9.984983314794217e-06, "loss": 3.3855, "mean_token_accuracy": 0.3938829787234043, "step": 81 }, { "epoch": 0.015202076381164257, "grad_norm": 10.875, "learning_rate": 9.984797923618837e-06, "loss": 3.9025, "mean_token_accuracy": 0.35135884636716586, "step": 82 }, { "epoch": 0.015387467556544309, "grad_norm": 17.1875, "learning_rate": 9.984612532443456e-06, "loss": 3.5976, "mean_token_accuracy": 0.37413598089732314, "step": 83 }, { "epoch": 0.01557285873192436, "grad_norm": 11.40625, "learning_rate": 9.984427141268076e-06, "loss": 3.571, "mean_token_accuracy": 0.3723349820910796, "step": 84 }, { "epoch": 0.01575824990730441, "grad_norm": 10.9140625, "learning_rate": 9.984241750092697e-06, "loss": 3.4574, "mean_token_accuracy": 0.38086979722518677, "step": 85 }, { "epoch": 0.015943641082684465, "grad_norm": 13.5859375, "learning_rate": 9.984056358917317e-06, "loss": 3.154, "mean_token_accuracy": 0.43227402969523315, "step": 86 }, { "epoch": 0.016129032258064516, "grad_norm": 13.484375, "learning_rate": 9.983870967741936e-06, "loss": 3.2812, "mean_token_accuracy": 0.38634423897581793, "step": 87 }, { "epoch": 0.016314423433444566, "grad_norm": 9.8203125, "learning_rate": 9.983685576566557e-06, "loss": 2.9024, "mean_token_accuracy": 0.4394335212429534, "step": 88 }, { "epoch": 0.01649981460882462, "grad_norm": 11.59375, "learning_rate": 9.983500185391175e-06, "loss": 3.1165, "mean_token_accuracy": 0.4171966420758077, "step": 89 }, { "epoch": 0.01668520578420467, "grad_norm": 8.03125, "learning_rate": 9.983314794215796e-06, "loss": 3.6869, "mean_token_accuracy": 0.36991727834090643, "step": 90 }, { "epoch": 0.016870596959584722, "grad_norm": 8.71875, "learning_rate": 9.983129403040416e-06, "loss": 3.5475, "mean_token_accuracy": 0.38127165046373895, "step": 91 }, { "epoch": 0.017055988134964777, "grad_norm": 8.7109375, "learning_rate": 9.982944011865035e-06, "loss": 3.5756, "mean_token_accuracy": 0.39057239057239057, "step": 92 }, { "epoch": 0.017241379310344827, "grad_norm": 7.44921875, "learning_rate": 9.982758620689656e-06, "loss": 3.6211, "mean_token_accuracy": 0.3890165441176471, "step": 93 }, { "epoch": 0.017426770485724878, "grad_norm": 7.49609375, "learning_rate": 9.982573229514276e-06, "loss": 3.7896, "mean_token_accuracy": 0.3646804260985353, "step": 94 }, { "epoch": 0.017612161661104932, "grad_norm": 10.015625, "learning_rate": 9.982387838338897e-06, "loss": 3.4047, "mean_token_accuracy": 0.39285714285714285, "step": 95 }, { "epoch": 0.017797552836484983, "grad_norm": 10.53125, "learning_rate": 9.982202447163515e-06, "loss": 3.2716, "mean_token_accuracy": 0.40177011761965764, "step": 96 }, { "epoch": 0.017982944011865034, "grad_norm": 10.6484375, "learning_rate": 9.982017055988136e-06, "loss": 3.4895, "mean_token_accuracy": 0.3729986431478969, "step": 97 }, { "epoch": 0.018168335187245088, "grad_norm": 8.875, "learning_rate": 9.981831664812755e-06, "loss": 2.9862, "mean_token_accuracy": 0.4264252696456086, "step": 98 }, { "epoch": 0.01835372636262514, "grad_norm": 9.46875, "learning_rate": 9.981646273637375e-06, "loss": 3.5209, "mean_token_accuracy": 0.3851343753514, "step": 99 }, { "epoch": 0.01853911753800519, "grad_norm": 9.0, "learning_rate": 9.981460882461996e-06, "loss": 3.9077, "mean_token_accuracy": 0.35223367697594504, "step": 100 }, { "epoch": 0.018724508713385244, "grad_norm": 10.828125, "learning_rate": 9.981275491286616e-06, "loss": 3.6151, "mean_token_accuracy": 0.38634969325153373, "step": 101 }, { "epoch": 0.018909899888765295, "grad_norm": 9.140625, "learning_rate": 9.981090100111237e-06, "loss": 3.8936, "mean_token_accuracy": 0.3572481572481572, "step": 102 }, { "epoch": 0.019095291064145346, "grad_norm": 9.3984375, "learning_rate": 9.980904708935856e-06, "loss": 3.404, "mean_token_accuracy": 0.39367396593673964, "step": 103 }, { "epoch": 0.0192806822395254, "grad_norm": 9.5078125, "learning_rate": 9.980719317760476e-06, "loss": 3.8367, "mean_token_accuracy": 0.3672547018515819, "step": 104 }, { "epoch": 0.01946607341490545, "grad_norm": 21.1875, "learning_rate": 9.980533926585095e-06, "loss": 3.2517, "mean_token_accuracy": 0.3914650537634409, "step": 105 }, { "epoch": 0.0196514645902855, "grad_norm": 10.515625, "learning_rate": 9.980348535409715e-06, "loss": 3.2583, "mean_token_accuracy": 0.39715972554651346, "step": 106 }, { "epoch": 0.019836855765665556, "grad_norm": 7.29296875, "learning_rate": 9.980163144234336e-06, "loss": 3.4838, "mean_token_accuracy": 0.3822471011595362, "step": 107 }, { "epoch": 0.020022246941045607, "grad_norm": 11.2421875, "learning_rate": 9.979977753058955e-06, "loss": 3.5272, "mean_token_accuracy": 0.3844527565457798, "step": 108 }, { "epoch": 0.020207638116425657, "grad_norm": 10.4453125, "learning_rate": 9.979792361883575e-06, "loss": 3.931, "mean_token_accuracy": 0.3280475718533201, "step": 109 }, { "epoch": 0.02039302929180571, "grad_norm": 8.34375, "learning_rate": 9.979606970708196e-06, "loss": 3.5288, "mean_token_accuracy": 0.37078954459714364, "step": 110 }, { "epoch": 0.020578420467185762, "grad_norm": 9.125, "learning_rate": 9.979421579532816e-06, "loss": 3.1308, "mean_token_accuracy": 0.40451592288848653, "step": 111 }, { "epoch": 0.020763811642565813, "grad_norm": 9.8828125, "learning_rate": 9.979236188357435e-06, "loss": 3.8475, "mean_token_accuracy": 0.35536105032822757, "step": 112 }, { "epoch": 0.020949202817945867, "grad_norm": 9.5703125, "learning_rate": 9.979050797182055e-06, "loss": 2.8274, "mean_token_accuracy": 0.4463582677165354, "step": 113 }, { "epoch": 0.021134593993325918, "grad_norm": 7.41015625, "learning_rate": 9.978865406006674e-06, "loss": 3.2743, "mean_token_accuracy": 0.4164364640883978, "step": 114 }, { "epoch": 0.02131998516870597, "grad_norm": 7.42578125, "learning_rate": 9.978680014831295e-06, "loss": 3.0824, "mean_token_accuracy": 0.42395408548557617, "step": 115 }, { "epoch": 0.021505376344086023, "grad_norm": 14.859375, "learning_rate": 9.978494623655915e-06, "loss": 3.2852, "mean_token_accuracy": 0.39287063861947347, "step": 116 }, { "epoch": 0.021690767519466074, "grad_norm": 11.65625, "learning_rate": 9.978309232480536e-06, "loss": 3.6064, "mean_token_accuracy": 0.37898894154818324, "step": 117 }, { "epoch": 0.021876158694846125, "grad_norm": 9.359375, "learning_rate": 9.978123841305154e-06, "loss": 3.9121, "mean_token_accuracy": 0.33431998753699954, "step": 118 }, { "epoch": 0.022061549870226176, "grad_norm": 10.34375, "learning_rate": 9.977938450129775e-06, "loss": 3.1628, "mean_token_accuracy": 0.41136394604169507, "step": 119 }, { "epoch": 0.02224694104560623, "grad_norm": 10.7109375, "learning_rate": 9.977753058954395e-06, "loss": 3.2838, "mean_token_accuracy": 0.3993351640410464, "step": 120 }, { "epoch": 0.02243233222098628, "grad_norm": 9.1171875, "learning_rate": 9.977567667779014e-06, "loss": 3.3766, "mean_token_accuracy": 0.3800491266375546, "step": 121 }, { "epoch": 0.02261772339636633, "grad_norm": 13.5546875, "learning_rate": 9.977382276603635e-06, "loss": 3.2509, "mean_token_accuracy": 0.3941845468401923, "step": 122 }, { "epoch": 0.022803114571746386, "grad_norm": 8.546875, "learning_rate": 9.977196885428253e-06, "loss": 3.4839, "mean_token_accuracy": 0.37543383129621244, "step": 123 }, { "epoch": 0.022988505747126436, "grad_norm": 11.265625, "learning_rate": 9.977011494252874e-06, "loss": 3.9086, "mean_token_accuracy": 0.339407160272039, "step": 124 }, { "epoch": 0.023173896922506487, "grad_norm": 11.7109375, "learning_rate": 9.976826103077494e-06, "loss": 3.5963, "mean_token_accuracy": 0.3780616276007374, "step": 125 }, { "epoch": 0.02335928809788654, "grad_norm": 8.8203125, "learning_rate": 9.976640711902115e-06, "loss": 3.1831, "mean_token_accuracy": 0.41598277949683843, "step": 126 }, { "epoch": 0.023544679273266592, "grad_norm": 10.078125, "learning_rate": 9.976455320726734e-06, "loss": 2.7696, "mean_token_accuracy": 0.44809306434987484, "step": 127 }, { "epoch": 0.023730070448646643, "grad_norm": 10.1328125, "learning_rate": 9.976269929551354e-06, "loss": 3.2666, "mean_token_accuracy": 0.3986628211851075, "step": 128 }, { "epoch": 0.023915461624026697, "grad_norm": 12.2734375, "learning_rate": 9.976084538375975e-06, "loss": 3.0797, "mean_token_accuracy": 0.41573642570716374, "step": 129 }, { "epoch": 0.024100852799406748, "grad_norm": 12.1171875, "learning_rate": 9.975899147200594e-06, "loss": 3.4001, "mean_token_accuracy": 0.3711010397227406, "step": 130 }, { "epoch": 0.0242862439747868, "grad_norm": 8.40625, "learning_rate": 9.975713756025214e-06, "loss": 3.7864, "mean_token_accuracy": 0.35381828316610925, "step": 131 }, { "epoch": 0.024471635150166853, "grad_norm": 12.3984375, "learning_rate": 9.975528364849833e-06, "loss": 3.9022, "mean_token_accuracy": 0.3447947811905409, "step": 132 }, { "epoch": 0.024657026325546904, "grad_norm": 9.1796875, "learning_rate": 9.975342973674453e-06, "loss": 3.5614, "mean_token_accuracy": 0.36817218327082996, "step": 133 }, { "epoch": 0.024842417500926955, "grad_norm": 13.34375, "learning_rate": 9.975157582499074e-06, "loss": 3.3685, "mean_token_accuracy": 0.3716986017607457, "step": 134 }, { "epoch": 0.02502780867630701, "grad_norm": 12.578125, "learning_rate": 9.974972191323694e-06, "loss": 3.3435, "mean_token_accuracy": 0.3836438269655826, "step": 135 }, { "epoch": 0.02521319985168706, "grad_norm": 15.3984375, "learning_rate": 9.974786800148313e-06, "loss": 4.2441, "mean_token_accuracy": 0.32710989678202795, "step": 136 }, { "epoch": 0.02539859102706711, "grad_norm": 11.7265625, "learning_rate": 9.974601408972934e-06, "loss": 3.5882, "mean_token_accuracy": 0.3715994020926756, "step": 137 }, { "epoch": 0.025583982202447165, "grad_norm": 14.1875, "learning_rate": 9.974416017797554e-06, "loss": 3.8388, "mean_token_accuracy": 0.3429962894248609, "step": 138 }, { "epoch": 0.025769373377827216, "grad_norm": 18.828125, "learning_rate": 9.974230626622173e-06, "loss": 2.9681, "mean_token_accuracy": 0.42415471648467307, "step": 139 }, { "epoch": 0.025954764553207266, "grad_norm": 11.5, "learning_rate": 9.974045235446793e-06, "loss": 3.1797, "mean_token_accuracy": 0.4128205128205128, "step": 140 }, { "epoch": 0.02614015572858732, "grad_norm": 14.578125, "learning_rate": 9.973859844271412e-06, "loss": 3.1276, "mean_token_accuracy": 0.40506966593923116, "step": 141 }, { "epoch": 0.02632554690396737, "grad_norm": 14.421875, "learning_rate": 9.973674453096034e-06, "loss": 3.2541, "mean_token_accuracy": 0.3984329991934555, "step": 142 }, { "epoch": 0.026510938079347422, "grad_norm": 10.0234375, "learning_rate": 9.973489061920653e-06, "loss": 2.8935, "mean_token_accuracy": 0.43506763787721126, "step": 143 }, { "epoch": 0.026696329254727477, "grad_norm": 7.671875, "learning_rate": 9.973303670745274e-06, "loss": 4.186, "mean_token_accuracy": 0.3357953919082791, "step": 144 }, { "epoch": 0.026881720430107527, "grad_norm": 14.6171875, "learning_rate": 9.973118279569894e-06, "loss": 3.2942, "mean_token_accuracy": 0.39072847682119205, "step": 145 }, { "epoch": 0.027067111605487578, "grad_norm": 13.1953125, "learning_rate": 9.972932888394513e-06, "loss": 3.8442, "mean_token_accuracy": 0.34437676422522656, "step": 146 }, { "epoch": 0.027252502780867632, "grad_norm": 8.4375, "learning_rate": 9.972747497219133e-06, "loss": 3.5772, "mean_token_accuracy": 0.3763837638376384, "step": 147 }, { "epoch": 0.027437893956247683, "grad_norm": 14.8515625, "learning_rate": 9.972562106043752e-06, "loss": 3.3499, "mean_token_accuracy": 0.38854277465430787, "step": 148 }, { "epoch": 0.027623285131627734, "grad_norm": 11.40625, "learning_rate": 9.972376714868373e-06, "loss": 3.354, "mean_token_accuracy": 0.3941627358490566, "step": 149 }, { "epoch": 0.027808676307007785, "grad_norm": 11.2734375, "learning_rate": 9.972191323692993e-06, "loss": 3.3128, "mean_token_accuracy": 0.4013104013104013, "step": 150 }, { "epoch": 0.02799406748238784, "grad_norm": 10.078125, "learning_rate": 9.972005932517614e-06, "loss": 3.3116, "mean_token_accuracy": 0.40372204712591925, "step": 151 }, { "epoch": 0.02817945865776789, "grad_norm": 18.765625, "learning_rate": 9.971820541342232e-06, "loss": 2.9742, "mean_token_accuracy": 0.42819805430521274, "step": 152 }, { "epoch": 0.02836484983314794, "grad_norm": 9.9375, "learning_rate": 9.971635150166853e-06, "loss": 3.9243, "mean_token_accuracy": 0.36853853383458646, "step": 153 }, { "epoch": 0.028550241008527995, "grad_norm": 11.7265625, "learning_rate": 9.971449758991473e-06, "loss": 2.927, "mean_token_accuracy": 0.4494630448515477, "step": 154 }, { "epoch": 0.028735632183908046, "grad_norm": 8.96875, "learning_rate": 9.971264367816092e-06, "loss": 3.3215, "mean_token_accuracy": 0.38849487785658, "step": 155 }, { "epoch": 0.028921023359288096, "grad_norm": 9.0703125, "learning_rate": 9.971078976640713e-06, "loss": 3.3514, "mean_token_accuracy": 0.3982483882739326, "step": 156 }, { "epoch": 0.02910641453466815, "grad_norm": 7.76171875, "learning_rate": 9.970893585465332e-06, "loss": 3.5172, "mean_token_accuracy": 0.3770096463022508, "step": 157 }, { "epoch": 0.0292918057100482, "grad_norm": 9.5625, "learning_rate": 9.970708194289954e-06, "loss": 3.5161, "mean_token_accuracy": 0.39988837728477744, "step": 158 }, { "epoch": 0.029477196885428252, "grad_norm": 24.421875, "learning_rate": 9.970522803114573e-06, "loss": 2.3581, "mean_token_accuracy": 0.4728568434290505, "step": 159 }, { "epoch": 0.029662588060808306, "grad_norm": 9.3828125, "learning_rate": 9.970337411939193e-06, "loss": 3.7167, "mean_token_accuracy": 0.3625486922648859, "step": 160 }, { "epoch": 0.029847979236188357, "grad_norm": 10.859375, "learning_rate": 9.970152020763812e-06, "loss": 3.6196, "mean_token_accuracy": 0.36540371789413933, "step": 161 }, { "epoch": 0.030033370411568408, "grad_norm": 10.0390625, "learning_rate": 9.969966629588432e-06, "loss": 3.631, "mean_token_accuracy": 0.38195468561469365, "step": 162 }, { "epoch": 0.030218761586948462, "grad_norm": 13.46875, "learning_rate": 9.969781238413053e-06, "loss": 2.9555, "mean_token_accuracy": 0.41974308529105425, "step": 163 }, { "epoch": 0.030404152762328513, "grad_norm": 8.59375, "learning_rate": 9.969595847237672e-06, "loss": 4.2115, "mean_token_accuracy": 0.31930638391870997, "step": 164 }, { "epoch": 0.030589543937708564, "grad_norm": 16.109375, "learning_rate": 9.969410456062292e-06, "loss": 3.4556, "mean_token_accuracy": 0.380061394532963, "step": 165 }, { "epoch": 0.030774935113088618, "grad_norm": 13.3984375, "learning_rate": 9.969225064886913e-06, "loss": 3.4162, "mean_token_accuracy": 0.3842827318818968, "step": 166 }, { "epoch": 0.03096032628846867, "grad_norm": 12.0, "learning_rate": 9.969039673711533e-06, "loss": 3.7262, "mean_token_accuracy": 0.35910087719298245, "step": 167 }, { "epoch": 0.03114571746384872, "grad_norm": 9.1171875, "learning_rate": 9.968854282536152e-06, "loss": 3.3072, "mean_token_accuracy": 0.39448027598620067, "step": 168 }, { "epoch": 0.03133110863922877, "grad_norm": 14.0, "learning_rate": 9.968668891360772e-06, "loss": 2.9263, "mean_token_accuracy": 0.42758710348413936, "step": 169 }, { "epoch": 0.03151649981460882, "grad_norm": 11.15625, "learning_rate": 9.968483500185391e-06, "loss": 3.6215, "mean_token_accuracy": 0.36975368102380624, "step": 170 }, { "epoch": 0.03170189098998888, "grad_norm": 12.546875, "learning_rate": 9.968298109010012e-06, "loss": 2.6465, "mean_token_accuracy": 0.4780934039480019, "step": 171 }, { "epoch": 0.03188728216536893, "grad_norm": 13.9296875, "learning_rate": 9.968112717834632e-06, "loss": 3.106, "mean_token_accuracy": 0.4053084648493544, "step": 172 }, { "epoch": 0.03207267334074898, "grad_norm": 21.8125, "learning_rate": 9.967927326659251e-06, "loss": 3.0122, "mean_token_accuracy": 0.41927899686520376, "step": 173 }, { "epoch": 0.03225806451612903, "grad_norm": 12.4609375, "learning_rate": 9.967741935483871e-06, "loss": 2.9418, "mean_token_accuracy": 0.44761374704327256, "step": 174 }, { "epoch": 0.03244345569150908, "grad_norm": 9.9921875, "learning_rate": 9.967556544308492e-06, "loss": 3.6329, "mean_token_accuracy": 0.3670821673874332, "step": 175 }, { "epoch": 0.03262884686688913, "grad_norm": 8.359375, "learning_rate": 9.967371153133112e-06, "loss": 3.4014, "mean_token_accuracy": 0.38483363425328865, "step": 176 }, { "epoch": 0.03281423804226919, "grad_norm": 8.84375, "learning_rate": 9.967185761957731e-06, "loss": 3.0306, "mean_token_accuracy": 0.42600828093850635, "step": 177 }, { "epoch": 0.03299962921764924, "grad_norm": 9.25, "learning_rate": 9.967000370782352e-06, "loss": 3.6401, "mean_token_accuracy": 0.36178257933828495, "step": 178 }, { "epoch": 0.03318502039302929, "grad_norm": 9.5703125, "learning_rate": 9.96681497960697e-06, "loss": 3.1834, "mean_token_accuracy": 0.41987080898185175, "step": 179 }, { "epoch": 0.03337041156840934, "grad_norm": 8.7734375, "learning_rate": 9.966629588431591e-06, "loss": 3.1498, "mean_token_accuracy": 0.4112565798353354, "step": 180 }, { "epoch": 0.033555802743789394, "grad_norm": 9.984375, "learning_rate": 9.966444197256211e-06, "loss": 3.0118, "mean_token_accuracy": 0.42468716205777846, "step": 181 }, { "epoch": 0.033741193919169445, "grad_norm": 10.5078125, "learning_rate": 9.966258806080832e-06, "loss": 3.4268, "mean_token_accuracy": 0.39398592450415865, "step": 182 }, { "epoch": 0.0339265850945495, "grad_norm": 6.7109375, "learning_rate": 9.966073414905452e-06, "loss": 3.2557, "mean_token_accuracy": 0.4154325798908807, "step": 183 }, { "epoch": 0.03411197626992955, "grad_norm": 7.58203125, "learning_rate": 9.965888023730071e-06, "loss": 3.4369, "mean_token_accuracy": 0.3830580713905168, "step": 184 }, { "epoch": 0.034297367445309604, "grad_norm": 7.80078125, "learning_rate": 9.965702632554692e-06, "loss": 3.4169, "mean_token_accuracy": 0.39291857798165136, "step": 185 }, { "epoch": 0.034482758620689655, "grad_norm": 8.203125, "learning_rate": 9.96551724137931e-06, "loss": 3.8411, "mean_token_accuracy": 0.35609889359377134, "step": 186 }, { "epoch": 0.034668149796069705, "grad_norm": 12.53125, "learning_rate": 9.965331850203931e-06, "loss": 3.2983, "mean_token_accuracy": 0.4042931688804554, "step": 187 }, { "epoch": 0.034853540971449756, "grad_norm": 14.8671875, "learning_rate": 9.965146459028552e-06, "loss": 3.6223, "mean_token_accuracy": 0.39536835039436147, "step": 188 }, { "epoch": 0.035038932146829814, "grad_norm": 9.34375, "learning_rate": 9.96496106785317e-06, "loss": 3.7538, "mean_token_accuracy": 0.3595505617977528, "step": 189 }, { "epoch": 0.035224323322209865, "grad_norm": 9.46875, "learning_rate": 9.96477567667779e-06, "loss": 3.6352, "mean_token_accuracy": 0.37782231128924515, "step": 190 }, { "epoch": 0.035409714497589916, "grad_norm": 7.44140625, "learning_rate": 9.964590285502411e-06, "loss": 2.884, "mean_token_accuracy": 0.4645614843840037, "step": 191 }, { "epoch": 0.035595105672969966, "grad_norm": 9.296875, "learning_rate": 9.964404894327032e-06, "loss": 3.215, "mean_token_accuracy": 0.39757904622769485, "step": 192 }, { "epoch": 0.03578049684835002, "grad_norm": 9.9453125, "learning_rate": 9.96421950315165e-06, "loss": 3.123, "mean_token_accuracy": 0.42290748898678415, "step": 193 }, { "epoch": 0.03596588802373007, "grad_norm": 12.703125, "learning_rate": 9.964034111976271e-06, "loss": 3.7299, "mean_token_accuracy": 0.37048567870485677, "step": 194 }, { "epoch": 0.036151279199110126, "grad_norm": 7.2890625, "learning_rate": 9.96384872080089e-06, "loss": 3.1822, "mean_token_accuracy": 0.4138442855215221, "step": 195 }, { "epoch": 0.036336670374490176, "grad_norm": 9.765625, "learning_rate": 9.96366332962551e-06, "loss": 3.5492, "mean_token_accuracy": 0.37267384916748286, "step": 196 }, { "epoch": 0.03652206154987023, "grad_norm": 8.5625, "learning_rate": 9.963477938450131e-06, "loss": 3.4134, "mean_token_accuracy": 0.37505614612966015, "step": 197 }, { "epoch": 0.03670745272525028, "grad_norm": 16.390625, "learning_rate": 9.963292547274751e-06, "loss": 3.2484, "mean_token_accuracy": 0.3829763744756017, "step": 198 }, { "epoch": 0.03689284390063033, "grad_norm": 11.140625, "learning_rate": 9.96310715609937e-06, "loss": 3.1324, "mean_token_accuracy": 0.41386061080657793, "step": 199 }, { "epoch": 0.03707823507601038, "grad_norm": 10.8203125, "learning_rate": 9.96292176492399e-06, "loss": 3.4555, "mean_token_accuracy": 0.3731955844891027, "step": 200 }, { "epoch": 0.03726362625139043, "grad_norm": 10.96875, "learning_rate": 9.962736373748611e-06, "loss": 3.8081, "mean_token_accuracy": 0.34589411129119396, "step": 201 }, { "epoch": 0.03744901742677049, "grad_norm": 9.234375, "learning_rate": 9.96255098257323e-06, "loss": 3.3706, "mean_token_accuracy": 0.38206023006333206, "step": 202 }, { "epoch": 0.03763440860215054, "grad_norm": 9.5234375, "learning_rate": 9.96236559139785e-06, "loss": 3.1445, "mean_token_accuracy": 0.40518207282913166, "step": 203 }, { "epoch": 0.03781979977753059, "grad_norm": 14.484375, "learning_rate": 9.96218020022247e-06, "loss": 2.4262, "mean_token_accuracy": 0.47050754458161864, "step": 204 }, { "epoch": 0.03800519095291064, "grad_norm": 16.5625, "learning_rate": 9.96199480904709e-06, "loss": 3.4291, "mean_token_accuracy": 0.39842632331902716, "step": 205 }, { "epoch": 0.03819058212829069, "grad_norm": 34.4375, "learning_rate": 9.96180941787171e-06, "loss": 3.1806, "mean_token_accuracy": 0.3886689003959793, "step": 206 }, { "epoch": 0.03837597330367074, "grad_norm": 12.4375, "learning_rate": 9.96162402669633e-06, "loss": 2.9821, "mean_token_accuracy": 0.41936780069649077, "step": 207 }, { "epoch": 0.0385613644790508, "grad_norm": 11.6953125, "learning_rate": 9.96143863552095e-06, "loss": 3.2933, "mean_token_accuracy": 0.3872255489021956, "step": 208 }, { "epoch": 0.03874675565443085, "grad_norm": 9.515625, "learning_rate": 9.96125324434557e-06, "loss": 3.2637, "mean_token_accuracy": 0.40516499282639884, "step": 209 }, { "epoch": 0.0389321468298109, "grad_norm": 12.3125, "learning_rate": 9.96106785317019e-06, "loss": 3.7229, "mean_token_accuracy": 0.37159841479524436, "step": 210 }, { "epoch": 0.03911753800519095, "grad_norm": 21.125, "learning_rate": 9.96088246199481e-06, "loss": 3.1191, "mean_token_accuracy": 0.403714893140875, "step": 211 }, { "epoch": 0.039302929180571, "grad_norm": 18.21875, "learning_rate": 9.96069707081943e-06, "loss": 3.2282, "mean_token_accuracy": 0.39150075288606323, "step": 212 }, { "epoch": 0.039488320355951054, "grad_norm": 10.8359375, "learning_rate": 9.960511679644049e-06, "loss": 3.0031, "mean_token_accuracy": 0.4197100641204349, "step": 213 }, { "epoch": 0.03967371153133111, "grad_norm": 7.24609375, "learning_rate": 9.96032628846867e-06, "loss": 3.4147, "mean_token_accuracy": 0.38453572661373836, "step": 214 }, { "epoch": 0.03985910270671116, "grad_norm": 7.6796875, "learning_rate": 9.96014089729329e-06, "loss": 3.7369, "mean_token_accuracy": 0.3621285418106427, "step": 215 }, { "epoch": 0.04004449388209121, "grad_norm": 19.859375, "learning_rate": 9.95995550611791e-06, "loss": 3.1417, "mean_token_accuracy": 0.3868991517436381, "step": 216 }, { "epoch": 0.040229885057471264, "grad_norm": 12.640625, "learning_rate": 9.959770114942529e-06, "loss": 3.2834, "mean_token_accuracy": 0.38451840645486635, "step": 217 }, { "epoch": 0.040415276232851315, "grad_norm": 9.75, "learning_rate": 9.95958472376715e-06, "loss": 3.4675, "mean_token_accuracy": 0.387737843551797, "step": 218 }, { "epoch": 0.040600667408231365, "grad_norm": 10.328125, "learning_rate": 9.95939933259177e-06, "loss": 3.1948, "mean_token_accuracy": 0.40479477846115225, "step": 219 }, { "epoch": 0.04078605858361142, "grad_norm": 10.1640625, "learning_rate": 9.959213941416389e-06, "loss": 3.5703, "mean_token_accuracy": 0.39093041438623927, "step": 220 }, { "epoch": 0.040971449758991474, "grad_norm": 8.7421875, "learning_rate": 9.959028550241009e-06, "loss": 3.5699, "mean_token_accuracy": 0.3735813366960908, "step": 221 }, { "epoch": 0.041156840934371525, "grad_norm": 11.46875, "learning_rate": 9.95884315906563e-06, "loss": 3.1396, "mean_token_accuracy": 0.4228142076502732, "step": 222 }, { "epoch": 0.041342232109751575, "grad_norm": 9.6484375, "learning_rate": 9.95865776789025e-06, "loss": 3.3994, "mean_token_accuracy": 0.39686998394863565, "step": 223 }, { "epoch": 0.041527623285131626, "grad_norm": 7.89453125, "learning_rate": 9.958472376714869e-06, "loss": 3.2472, "mean_token_accuracy": 0.39676926017969794, "step": 224 }, { "epoch": 0.04171301446051168, "grad_norm": 8.6484375, "learning_rate": 9.95828698553949e-06, "loss": 3.7331, "mean_token_accuracy": 0.35888177052999415, "step": 225 }, { "epoch": 0.041898405635891735, "grad_norm": 9.3671875, "learning_rate": 9.95810159436411e-06, "loss": 3.2112, "mean_token_accuracy": 0.4221007418976962, "step": 226 }, { "epoch": 0.042083796811271786, "grad_norm": 9.5390625, "learning_rate": 9.957916203188729e-06, "loss": 3.3106, "mean_token_accuracy": 0.3904628658038941, "step": 227 }, { "epoch": 0.042269187986651836, "grad_norm": 9.296875, "learning_rate": 9.95773081201335e-06, "loss": 2.8992, "mean_token_accuracy": 0.43082386363636366, "step": 228 }, { "epoch": 0.04245457916203189, "grad_norm": 10.25, "learning_rate": 9.957545420837968e-06, "loss": 2.9827, "mean_token_accuracy": 0.42902469306111185, "step": 229 }, { "epoch": 0.04263997033741194, "grad_norm": 23.0625, "learning_rate": 9.95736002966259e-06, "loss": 2.8586, "mean_token_accuracy": 0.42935244806220385, "step": 230 }, { "epoch": 0.04282536151279199, "grad_norm": 15.5, "learning_rate": 9.957174638487209e-06, "loss": 2.8888, "mean_token_accuracy": 0.4355179704016913, "step": 231 }, { "epoch": 0.043010752688172046, "grad_norm": 8.359375, "learning_rate": 9.95698924731183e-06, "loss": 3.4322, "mean_token_accuracy": 0.38012439920836866, "step": 232 }, { "epoch": 0.0431961438635521, "grad_norm": 11.8671875, "learning_rate": 9.956803856136448e-06, "loss": 2.7555, "mean_token_accuracy": 0.454233801851217, "step": 233 }, { "epoch": 0.04338153503893215, "grad_norm": 10.0, "learning_rate": 9.956618464961069e-06, "loss": 3.2944, "mean_token_accuracy": 0.375397019301246, "step": 234 }, { "epoch": 0.0435669262143122, "grad_norm": 15.9296875, "learning_rate": 9.95643307378569e-06, "loss": 2.945, "mean_token_accuracy": 0.40634441087613293, "step": 235 }, { "epoch": 0.04375231738969225, "grad_norm": 13.296875, "learning_rate": 9.956247682610308e-06, "loss": 3.6443, "mean_token_accuracy": 0.3519572143832499, "step": 236 }, { "epoch": 0.0439377085650723, "grad_norm": 10.359375, "learning_rate": 9.956062291434929e-06, "loss": 3.0666, "mean_token_accuracy": 0.42105263157894735, "step": 237 }, { "epoch": 0.04412309974045235, "grad_norm": 9.8046875, "learning_rate": 9.955876900259549e-06, "loss": 3.062, "mean_token_accuracy": 0.4213204592901879, "step": 238 }, { "epoch": 0.04430849091583241, "grad_norm": 13.15625, "learning_rate": 9.95569150908417e-06, "loss": 2.9448, "mean_token_accuracy": 0.4407938849403245, "step": 239 }, { "epoch": 0.04449388209121246, "grad_norm": 10.4140625, "learning_rate": 9.955506117908788e-06, "loss": 3.5474, "mean_token_accuracy": 0.3649173256649892, "step": 240 }, { "epoch": 0.04467927326659251, "grad_norm": 7.78125, "learning_rate": 9.955320726733409e-06, "loss": 3.6224, "mean_token_accuracy": 0.36583052276559863, "step": 241 }, { "epoch": 0.04486466444197256, "grad_norm": 10.3515625, "learning_rate": 9.955135335558028e-06, "loss": 3.6969, "mean_token_accuracy": 0.35927545284197376, "step": 242 }, { "epoch": 0.04505005561735261, "grad_norm": 8.8203125, "learning_rate": 9.954949944382648e-06, "loss": 3.5021, "mean_token_accuracy": 0.36568775642409845, "step": 243 }, { "epoch": 0.04523544679273266, "grad_norm": 11.8359375, "learning_rate": 9.954764553207269e-06, "loss": 3.0969, "mean_token_accuracy": 0.40544398001835047, "step": 244 }, { "epoch": 0.04542083796811272, "grad_norm": 13.359375, "learning_rate": 9.954579162031887e-06, "loss": 3.5776, "mean_token_accuracy": 0.3670543684068163, "step": 245 }, { "epoch": 0.04560622914349277, "grad_norm": 9.265625, "learning_rate": 9.954393770856508e-06, "loss": 3.5283, "mean_token_accuracy": 0.3642548737406777, "step": 246 }, { "epoch": 0.04579162031887282, "grad_norm": 9.765625, "learning_rate": 9.954208379681128e-06, "loss": 3.191, "mean_token_accuracy": 0.4285278878390735, "step": 247 }, { "epoch": 0.04597701149425287, "grad_norm": 10.75, "learning_rate": 9.954022988505749e-06, "loss": 3.3679, "mean_token_accuracy": 0.3937433722163309, "step": 248 }, { "epoch": 0.046162402669632924, "grad_norm": 10.5078125, "learning_rate": 9.953837597330368e-06, "loss": 3.2337, "mean_token_accuracy": 0.4002500852563374, "step": 249 }, { "epoch": 0.046347793845012975, "grad_norm": 10.1796875, "learning_rate": 9.953652206154988e-06, "loss": 3.5841, "mean_token_accuracy": 0.37393986121819583, "step": 250 }, { "epoch": 0.04653318502039303, "grad_norm": 13.875, "learning_rate": 9.953466814979607e-06, "loss": 3.2986, "mean_token_accuracy": 0.3838593425794474, "step": 251 }, { "epoch": 0.04671857619577308, "grad_norm": 22.546875, "learning_rate": 9.953281423804227e-06, "loss": 2.995, "mean_token_accuracy": 0.398406374501992, "step": 252 }, { "epoch": 0.046903967371153134, "grad_norm": 13.015625, "learning_rate": 9.953096032628848e-06, "loss": 2.9509, "mean_token_accuracy": 0.421219646799117, "step": 253 }, { "epoch": 0.047089358546533185, "grad_norm": 10.4609375, "learning_rate": 9.952910641453467e-06, "loss": 3.1138, "mean_token_accuracy": 0.41052864410528644, "step": 254 }, { "epoch": 0.047274749721913235, "grad_norm": 10.5390625, "learning_rate": 9.952725250278087e-06, "loss": 2.6581, "mean_token_accuracy": 0.4651910950971561, "step": 255 }, { "epoch": 0.047460140897293286, "grad_norm": 20.65625, "learning_rate": 9.952539859102708e-06, "loss": 2.5778, "mean_token_accuracy": 0.4835465372739303, "step": 256 }, { "epoch": 0.047645532072673344, "grad_norm": 14.8828125, "learning_rate": 9.952354467927328e-06, "loss": 2.8463, "mean_token_accuracy": 0.4397463002114165, "step": 257 }, { "epoch": 0.047830923248053395, "grad_norm": 10.515625, "learning_rate": 9.952169076751947e-06, "loss": 3.0023, "mean_token_accuracy": 0.4152579107065453, "step": 258 }, { "epoch": 0.048016314423433445, "grad_norm": 10.2421875, "learning_rate": 9.951983685576567e-06, "loss": 3.172, "mean_token_accuracy": 0.41314093583636874, "step": 259 }, { "epoch": 0.048201705598813496, "grad_norm": 8.796875, "learning_rate": 9.951798294401186e-06, "loss": 4.0024, "mean_token_accuracy": 0.3308007718282682, "step": 260 }, { "epoch": 0.04838709677419355, "grad_norm": 8.859375, "learning_rate": 9.951612903225807e-06, "loss": 3.3318, "mean_token_accuracy": 0.3919406771392331, "step": 261 }, { "epoch": 0.0485724879495736, "grad_norm": 8.7109375, "learning_rate": 9.951427512050427e-06, "loss": 3.3641, "mean_token_accuracy": 0.39799622477130825, "step": 262 }, { "epoch": 0.048757879124953656, "grad_norm": 7.203125, "learning_rate": 9.951242120875048e-06, "loss": 3.9871, "mean_token_accuracy": 0.32790697674418606, "step": 263 }, { "epoch": 0.048943270300333706, "grad_norm": 9.84375, "learning_rate": 9.951056729699668e-06, "loss": 3.6859, "mean_token_accuracy": 0.37404908856035596, "step": 264 }, { "epoch": 0.04912866147571376, "grad_norm": 9.0703125, "learning_rate": 9.950871338524287e-06, "loss": 3.5331, "mean_token_accuracy": 0.3781861292234736, "step": 265 }, { "epoch": 0.04931405265109381, "grad_norm": 8.8125, "learning_rate": 9.950685947348908e-06, "loss": 3.0759, "mean_token_accuracy": 0.39980732177263967, "step": 266 }, { "epoch": 0.04949944382647386, "grad_norm": 9.359375, "learning_rate": 9.950500556173526e-06, "loss": 3.3699, "mean_token_accuracy": 0.3899159663865546, "step": 267 }, { "epoch": 0.04968483500185391, "grad_norm": 11.328125, "learning_rate": 9.950315164998147e-06, "loss": 3.1082, "mean_token_accuracy": 0.4157303370786517, "step": 268 }, { "epoch": 0.04987022617723396, "grad_norm": 7.4296875, "learning_rate": 9.950129773822767e-06, "loss": 2.8153, "mean_token_accuracy": 0.4501039501039501, "step": 269 }, { "epoch": 0.05005561735261402, "grad_norm": 8.8671875, "learning_rate": 9.949944382647386e-06, "loss": 2.8924, "mean_token_accuracy": 0.42946110828673106, "step": 270 }, { "epoch": 0.05024100852799407, "grad_norm": 8.671875, "learning_rate": 9.949758991472007e-06, "loss": 3.1582, "mean_token_accuracy": 0.4190692395005675, "step": 271 }, { "epoch": 0.05042639970337412, "grad_norm": 10.5703125, "learning_rate": 9.949573600296627e-06, "loss": 2.8333, "mean_token_accuracy": 0.44422398589065254, "step": 272 }, { "epoch": 0.05061179087875417, "grad_norm": 8.78125, "learning_rate": 9.949388209121248e-06, "loss": 3.1242, "mean_token_accuracy": 0.40270494065691415, "step": 273 }, { "epoch": 0.05079718205413422, "grad_norm": 8.234375, "learning_rate": 9.949202817945866e-06, "loss": 3.5041, "mean_token_accuracy": 0.39269535673839184, "step": 274 }, { "epoch": 0.05098257322951427, "grad_norm": 6.578125, "learning_rate": 9.949017426770487e-06, "loss": 3.1813, "mean_token_accuracy": 0.40011883541295307, "step": 275 }, { "epoch": 0.05116796440489433, "grad_norm": 7.16796875, "learning_rate": 9.948832035595106e-06, "loss": 3.5754, "mean_token_accuracy": 0.36409686187299234, "step": 276 }, { "epoch": 0.05135335558027438, "grad_norm": 7.625, "learning_rate": 9.948646644419726e-06, "loss": 3.3745, "mean_token_accuracy": 0.3978096788515613, "step": 277 }, { "epoch": 0.05153874675565443, "grad_norm": 7.98046875, "learning_rate": 9.948461253244347e-06, "loss": 3.459, "mean_token_accuracy": 0.3805266579973992, "step": 278 }, { "epoch": 0.05172413793103448, "grad_norm": 7.01953125, "learning_rate": 9.948275862068967e-06, "loss": 3.7506, "mean_token_accuracy": 0.36843579330625964, "step": 279 }, { "epoch": 0.05190952910641453, "grad_norm": 6.90625, "learning_rate": 9.948090470893586e-06, "loss": 3.0627, "mean_token_accuracy": 0.4097297297297297, "step": 280 }, { "epoch": 0.052094920281794584, "grad_norm": 8.015625, "learning_rate": 9.947905079718206e-06, "loss": 3.4648, "mean_token_accuracy": 0.3831575729787559, "step": 281 }, { "epoch": 0.05228031145717464, "grad_norm": 10.25, "learning_rate": 9.947719688542827e-06, "loss": 2.8794, "mean_token_accuracy": 0.4348494554772582, "step": 282 }, { "epoch": 0.05246570263255469, "grad_norm": 7.5234375, "learning_rate": 9.947534297367446e-06, "loss": 3.5441, "mean_token_accuracy": 0.3593377483443709, "step": 283 }, { "epoch": 0.05265109380793474, "grad_norm": 10.484375, "learning_rate": 9.947348906192066e-06, "loss": 3.3728, "mean_token_accuracy": 0.3731917993547716, "step": 284 }, { "epoch": 0.052836484983314794, "grad_norm": 13.0390625, "learning_rate": 9.947163515016685e-06, "loss": 3.1653, "mean_token_accuracy": 0.3817497876471302, "step": 285 }, { "epoch": 0.053021876158694844, "grad_norm": 9.828125, "learning_rate": 9.946978123841305e-06, "loss": 2.8067, "mean_token_accuracy": 0.4605310155970949, "step": 286 }, { "epoch": 0.053207267334074895, "grad_norm": 14.625, "learning_rate": 9.946792732665926e-06, "loss": 3.2071, "mean_token_accuracy": 0.40423710678365077, "step": 287 }, { "epoch": 0.05339265850945495, "grad_norm": 11.171875, "learning_rate": 9.946607341490546e-06, "loss": 3.2238, "mean_token_accuracy": 0.4034090909090909, "step": 288 }, { "epoch": 0.053578049684835004, "grad_norm": 13.09375, "learning_rate": 9.946421950315165e-06, "loss": 3.3854, "mean_token_accuracy": 0.3754677754677755, "step": 289 }, { "epoch": 0.053763440860215055, "grad_norm": 13.9921875, "learning_rate": 9.946236559139786e-06, "loss": 3.2726, "mean_token_accuracy": 0.38038548752834467, "step": 290 }, { "epoch": 0.053948832035595105, "grad_norm": 12.5546875, "learning_rate": 9.946051167964406e-06, "loss": 3.324, "mean_token_accuracy": 0.392080137736735, "step": 291 }, { "epoch": 0.054134223210975156, "grad_norm": 8.515625, "learning_rate": 9.945865776789025e-06, "loss": 3.2803, "mean_token_accuracy": 0.38600987091875477, "step": 292 }, { "epoch": 0.05431961438635521, "grad_norm": 8.1796875, "learning_rate": 9.945680385613646e-06, "loss": 3.2993, "mean_token_accuracy": 0.40348886682740037, "step": 293 }, { "epoch": 0.054505005561735265, "grad_norm": 12.3359375, "learning_rate": 9.945494994438264e-06, "loss": 3.2883, "mean_token_accuracy": 0.39398750926219966, "step": 294 }, { "epoch": 0.054690396737115315, "grad_norm": 9.5546875, "learning_rate": 9.945309603262887e-06, "loss": 3.0073, "mean_token_accuracy": 0.43836276083467096, "step": 295 }, { "epoch": 0.054875787912495366, "grad_norm": 17.671875, "learning_rate": 9.945124212087505e-06, "loss": 2.9189, "mean_token_accuracy": 0.4206308169596691, "step": 296 }, { "epoch": 0.05506117908787542, "grad_norm": 9.5078125, "learning_rate": 9.944938820912126e-06, "loss": 3.2944, "mean_token_accuracy": 0.3911849512563504, "step": 297 }, { "epoch": 0.05524657026325547, "grad_norm": 8.2578125, "learning_rate": 9.944753429736745e-06, "loss": 2.8802, "mean_token_accuracy": 0.435657629927785, "step": 298 }, { "epoch": 0.05543196143863552, "grad_norm": 9.2265625, "learning_rate": 9.944568038561365e-06, "loss": 2.9501, "mean_token_accuracy": 0.4169729368526562, "step": 299 }, { "epoch": 0.05561735261401557, "grad_norm": 7.32421875, "learning_rate": 9.944382647385986e-06, "loss": 3.428, "mean_token_accuracy": 0.392517725147081, "step": 300 }, { "epoch": 0.05580274378939563, "grad_norm": 7.921875, "learning_rate": 9.944197256210604e-06, "loss": 3.4753, "mean_token_accuracy": 0.36510085284960064, "step": 301 }, { "epoch": 0.05598813496477568, "grad_norm": 8.2265625, "learning_rate": 9.944011865035225e-06, "loss": 2.6533, "mean_token_accuracy": 0.4446389496717724, "step": 302 }, { "epoch": 0.05617352614015573, "grad_norm": 8.9453125, "learning_rate": 9.943826473859845e-06, "loss": 3.4928, "mean_token_accuracy": 0.35917901938426455, "step": 303 }, { "epoch": 0.05635891731553578, "grad_norm": 6.96484375, "learning_rate": 9.943641082684466e-06, "loss": 3.2542, "mean_token_accuracy": 0.4170755642787046, "step": 304 }, { "epoch": 0.05654430849091583, "grad_norm": 9.4453125, "learning_rate": 9.943455691509085e-06, "loss": 2.977, "mean_token_accuracy": 0.43080593849416754, "step": 305 }, { "epoch": 0.05672969966629588, "grad_norm": 8.109375, "learning_rate": 9.943270300333705e-06, "loss": 3.3331, "mean_token_accuracy": 0.3952975753122704, "step": 306 }, { "epoch": 0.05691509084167594, "grad_norm": 7.30859375, "learning_rate": 9.943084909158326e-06, "loss": 3.6364, "mean_token_accuracy": 0.371665582303188, "step": 307 }, { "epoch": 0.05710048201705599, "grad_norm": 7.671875, "learning_rate": 9.942899517982944e-06, "loss": 3.5286, "mean_token_accuracy": 0.3780568407138136, "step": 308 }, { "epoch": 0.05728587319243604, "grad_norm": 7.01171875, "learning_rate": 9.942714126807565e-06, "loss": 3.4153, "mean_token_accuracy": 0.38449289563939243, "step": 309 }, { "epoch": 0.05747126436781609, "grad_norm": 7.1953125, "learning_rate": 9.942528735632184e-06, "loss": 3.0392, "mean_token_accuracy": 0.3950374646849281, "step": 310 }, { "epoch": 0.05765665554319614, "grad_norm": 7.28515625, "learning_rate": 9.942343344456806e-06, "loss": 2.8286, "mean_token_accuracy": 0.4390590464161667, "step": 311 }, { "epoch": 0.05784204671857619, "grad_norm": 7.40625, "learning_rate": 9.942157953281425e-06, "loss": 3.4708, "mean_token_accuracy": 0.36556501659691154, "step": 312 }, { "epoch": 0.05802743789395625, "grad_norm": 12.171875, "learning_rate": 9.941972562106045e-06, "loss": 3.1521, "mean_token_accuracy": 0.41821471652593484, "step": 313 }, { "epoch": 0.0582128290693363, "grad_norm": 8.125, "learning_rate": 9.941787170930664e-06, "loss": 3.3218, "mean_token_accuracy": 0.39399853622834835, "step": 314 }, { "epoch": 0.05839822024471635, "grad_norm": 9.796875, "learning_rate": 9.941601779755284e-06, "loss": 3.2287, "mean_token_accuracy": 0.40054598512661205, "step": 315 }, { "epoch": 0.0585836114200964, "grad_norm": 9.4140625, "learning_rate": 9.941416388579905e-06, "loss": 3.0306, "mean_token_accuracy": 0.41148190045248867, "step": 316 }, { "epoch": 0.058769002595476454, "grad_norm": 7.82421875, "learning_rate": 9.941230997404524e-06, "loss": 3.1382, "mean_token_accuracy": 0.42365652544782484, "step": 317 }, { "epoch": 0.058954393770856504, "grad_norm": 7.359375, "learning_rate": 9.941045606229144e-06, "loss": 3.0658, "mean_token_accuracy": 0.4279584775086505, "step": 318 }, { "epoch": 0.05913978494623656, "grad_norm": 7.0390625, "learning_rate": 9.940860215053765e-06, "loss": 3.0132, "mean_token_accuracy": 0.4064222712238148, "step": 319 }, { "epoch": 0.05932517612161661, "grad_norm": 8.1875, "learning_rate": 9.940674823878385e-06, "loss": 3.3134, "mean_token_accuracy": 0.3796250207400033, "step": 320 }, { "epoch": 0.059510567296996664, "grad_norm": 8.390625, "learning_rate": 9.940489432703004e-06, "loss": 2.6812, "mean_token_accuracy": 0.47064432638605097, "step": 321 }, { "epoch": 0.059695958472376714, "grad_norm": 8.2265625, "learning_rate": 9.940304041527625e-06, "loss": 3.2405, "mean_token_accuracy": 0.3894899536321484, "step": 322 }, { "epoch": 0.059881349647756765, "grad_norm": 7.78125, "learning_rate": 9.940118650352243e-06, "loss": 3.5864, "mean_token_accuracy": 0.3572093023255814, "step": 323 }, { "epoch": 0.060066740823136816, "grad_norm": 6.88671875, "learning_rate": 9.939933259176864e-06, "loss": 3.335, "mean_token_accuracy": 0.3887285690035648, "step": 324 }, { "epoch": 0.060252131998516874, "grad_norm": 8.1796875, "learning_rate": 9.939747868001484e-06, "loss": 3.171, "mean_token_accuracy": 0.4153274407392527, "step": 325 }, { "epoch": 0.060437523173896925, "grad_norm": 9.6640625, "learning_rate": 9.939562476826103e-06, "loss": 2.8717, "mean_token_accuracy": 0.42344694887300716, "step": 326 }, { "epoch": 0.060622914349276975, "grad_norm": 10.09375, "learning_rate": 9.939377085650724e-06, "loss": 3.0943, "mean_token_accuracy": 0.4212737127371274, "step": 327 }, { "epoch": 0.060808305524657026, "grad_norm": 8.15625, "learning_rate": 9.939191694475344e-06, "loss": 2.8487, "mean_token_accuracy": 0.4289034132171387, "step": 328 }, { "epoch": 0.06099369670003708, "grad_norm": 7.00390625, "learning_rate": 9.939006303299965e-06, "loss": 3.358, "mean_token_accuracy": 0.3919404517453799, "step": 329 }, { "epoch": 0.06117908787541713, "grad_norm": 11.09375, "learning_rate": 9.938820912124583e-06, "loss": 2.9496, "mean_token_accuracy": 0.41870147406323843, "step": 330 }, { "epoch": 0.061364479050797185, "grad_norm": 8.1875, "learning_rate": 9.938635520949204e-06, "loss": 3.0177, "mean_token_accuracy": 0.4148455622024173, "step": 331 }, { "epoch": 0.061549870226177236, "grad_norm": 8.4140625, "learning_rate": 9.938450129773823e-06, "loss": 3.4472, "mean_token_accuracy": 0.36963210702341137, "step": 332 }, { "epoch": 0.06173526140155729, "grad_norm": 9.328125, "learning_rate": 9.938264738598443e-06, "loss": 3.0677, "mean_token_accuracy": 0.39379347244515783, "step": 333 }, { "epoch": 0.06192065257693734, "grad_norm": 8.75, "learning_rate": 9.938079347423064e-06, "loss": 3.2314, "mean_token_accuracy": 0.38148958213422357, "step": 334 }, { "epoch": 0.06210604375231739, "grad_norm": 10.4609375, "learning_rate": 9.937893956247684e-06, "loss": 3.256, "mean_token_accuracy": 0.37461950246667364, "step": 335 }, { "epoch": 0.06229143492769744, "grad_norm": 8.3359375, "learning_rate": 9.937708565072303e-06, "loss": 3.2837, "mean_token_accuracy": 0.39097202192096914, "step": 336 }, { "epoch": 0.06247682610307749, "grad_norm": 6.83984375, "learning_rate": 9.937523173896923e-06, "loss": 3.0507, "mean_token_accuracy": 0.4053655660377358, "step": 337 }, { "epoch": 0.06266221727845754, "grad_norm": 8.921875, "learning_rate": 9.937337782721544e-06, "loss": 2.9651, "mean_token_accuracy": 0.4084235503485096, "step": 338 }, { "epoch": 0.06284760845383759, "grad_norm": 11.9453125, "learning_rate": 9.937152391546163e-06, "loss": 3.2857, "mean_token_accuracy": 0.3715688462396786, "step": 339 }, { "epoch": 0.06303299962921764, "grad_norm": 7.9140625, "learning_rate": 9.936967000370783e-06, "loss": 2.8766, "mean_token_accuracy": 0.43730668570069, "step": 340 }, { "epoch": 0.06321839080459771, "grad_norm": 32.53125, "learning_rate": 9.936781609195402e-06, "loss": 3.2508, "mean_token_accuracy": 0.3707114026236125, "step": 341 }, { "epoch": 0.06340378197997776, "grad_norm": 8.7109375, "learning_rate": 9.936596218020022e-06, "loss": 3.0083, "mean_token_accuracy": 0.4149775645984837, "step": 342 }, { "epoch": 0.06358917315535781, "grad_norm": 8.6171875, "learning_rate": 9.936410826844643e-06, "loss": 3.0531, "mean_token_accuracy": 0.4118731078057128, "step": 343 }, { "epoch": 0.06377456433073786, "grad_norm": 7.37109375, "learning_rate": 9.936225435669263e-06, "loss": 3.1703, "mean_token_accuracy": 0.39235832396853426, "step": 344 }, { "epoch": 0.06395995550611791, "grad_norm": 6.90625, "learning_rate": 9.936040044493884e-06, "loss": 3.3839, "mean_token_accuracy": 0.37969796752583174, "step": 345 }, { "epoch": 0.06414534668149796, "grad_norm": 7.08203125, "learning_rate": 9.935854653318503e-06, "loss": 3.2928, "mean_token_accuracy": 0.39885729832675826, "step": 346 }, { "epoch": 0.06433073785687801, "grad_norm": 9.21875, "learning_rate": 9.935669262143123e-06, "loss": 3.1548, "mean_token_accuracy": 0.40253516772298653, "step": 347 }, { "epoch": 0.06451612903225806, "grad_norm": 7.96484375, "learning_rate": 9.935483870967742e-06, "loss": 3.379, "mean_token_accuracy": 0.38612143742255267, "step": 348 }, { "epoch": 0.06470152020763811, "grad_norm": 14.0859375, "learning_rate": 9.935298479792363e-06, "loss": 3.0443, "mean_token_accuracy": 0.3883242732214606, "step": 349 }, { "epoch": 0.06488691138301816, "grad_norm": 7.8984375, "learning_rate": 9.935113088616983e-06, "loss": 3.0287, "mean_token_accuracy": 0.4172011182371321, "step": 350 }, { "epoch": 0.06507230255839822, "grad_norm": 9.2890625, "learning_rate": 9.934927697441604e-06, "loss": 2.8132, "mean_token_accuracy": 0.4324363849078678, "step": 351 }, { "epoch": 0.06525769373377827, "grad_norm": 7.52734375, "learning_rate": 9.934742306266222e-06, "loss": 3.1722, "mean_token_accuracy": 0.37766203703703705, "step": 352 }, { "epoch": 0.06544308490915833, "grad_norm": 14.6796875, "learning_rate": 9.934556915090843e-06, "loss": 3.2414, "mean_token_accuracy": 0.4192169837331128, "step": 353 }, { "epoch": 0.06562847608453838, "grad_norm": 11.1171875, "learning_rate": 9.934371523915463e-06, "loss": 3.0014, "mean_token_accuracy": 0.4087623220153341, "step": 354 }, { "epoch": 0.06581386725991843, "grad_norm": 8.15625, "learning_rate": 9.934186132740082e-06, "loss": 3.0859, "mean_token_accuracy": 0.4166019166019166, "step": 355 }, { "epoch": 0.06599925843529848, "grad_norm": 7.78125, "learning_rate": 9.934000741564703e-06, "loss": 3.2385, "mean_token_accuracy": 0.39176453530520516, "step": 356 }, { "epoch": 0.06618464961067853, "grad_norm": 10.15625, "learning_rate": 9.933815350389321e-06, "loss": 2.9305, "mean_token_accuracy": 0.4167066730676908, "step": 357 }, { "epoch": 0.06637004078605858, "grad_norm": 8.0703125, "learning_rate": 9.933629959213942e-06, "loss": 3.4065, "mean_token_accuracy": 0.3885509838998211, "step": 358 }, { "epoch": 0.06655543196143864, "grad_norm": 13.6328125, "learning_rate": 9.933444568038562e-06, "loss": 3.1736, "mean_token_accuracy": 0.3686071473670918, "step": 359 }, { "epoch": 0.06674082313681869, "grad_norm": 10.0078125, "learning_rate": 9.933259176863183e-06, "loss": 3.4273, "mean_token_accuracy": 0.37640109057861254, "step": 360 }, { "epoch": 0.06692621431219874, "grad_norm": 13.15625, "learning_rate": 9.933073785687802e-06, "loss": 3.1109, "mean_token_accuracy": 0.4104465835568881, "step": 361 }, { "epoch": 0.06711160548757879, "grad_norm": 9.8359375, "learning_rate": 9.932888394512422e-06, "loss": 2.9534, "mean_token_accuracy": 0.41854456724558636, "step": 362 }, { "epoch": 0.06729699666295884, "grad_norm": 9.203125, "learning_rate": 9.932703003337043e-06, "loss": 3.1304, "mean_token_accuracy": 0.39285248467457934, "step": 363 }, { "epoch": 0.06748238783833889, "grad_norm": 14.5546875, "learning_rate": 9.932517612161661e-06, "loss": 3.5019, "mean_token_accuracy": 0.36822405782137324, "step": 364 }, { "epoch": 0.06766777901371895, "grad_norm": 13.6640625, "learning_rate": 9.932332220986282e-06, "loss": 2.765, "mean_token_accuracy": 0.41987221280479853, "step": 365 }, { "epoch": 0.067853170189099, "grad_norm": 8.484375, "learning_rate": 9.9321468298109e-06, "loss": 3.935, "mean_token_accuracy": 0.3432546470521154, "step": 366 }, { "epoch": 0.06803856136447906, "grad_norm": 11.6640625, "learning_rate": 9.931961438635523e-06, "loss": 2.8361, "mean_token_accuracy": 0.42698781989872725, "step": 367 }, { "epoch": 0.0682239525398591, "grad_norm": 7.37890625, "learning_rate": 9.931776047460142e-06, "loss": 3.107, "mean_token_accuracy": 0.40388738030584537, "step": 368 }, { "epoch": 0.06840934371523916, "grad_norm": 8.2265625, "learning_rate": 9.931590656284762e-06, "loss": 3.3055, "mean_token_accuracy": 0.3911715713492504, "step": 369 }, { "epoch": 0.06859473489061921, "grad_norm": 10.2421875, "learning_rate": 9.931405265109381e-06, "loss": 3.2748, "mean_token_accuracy": 0.373602667189645, "step": 370 }, { "epoch": 0.06878012606599926, "grad_norm": 9.9921875, "learning_rate": 9.931219873934002e-06, "loss": 3.1735, "mean_token_accuracy": 0.40048396854204477, "step": 371 }, { "epoch": 0.06896551724137931, "grad_norm": 12.8359375, "learning_rate": 9.931034482758622e-06, "loss": 3.1394, "mean_token_accuracy": 0.3843685537474215, "step": 372 }, { "epoch": 0.06915090841675936, "grad_norm": 19.09375, "learning_rate": 9.93084909158324e-06, "loss": 2.8149, "mean_token_accuracy": 0.42526997840172787, "step": 373 }, { "epoch": 0.06933629959213941, "grad_norm": 13.1328125, "learning_rate": 9.930663700407861e-06, "loss": 2.9503, "mean_token_accuracy": 0.422032262512064, "step": 374 }, { "epoch": 0.06952169076751946, "grad_norm": 12.9140625, "learning_rate": 9.93047830923248e-06, "loss": 3.3978, "mean_token_accuracy": 0.39074910450085654, "step": 375 }, { "epoch": 0.06970708194289951, "grad_norm": 8.9453125, "learning_rate": 9.930292918057102e-06, "loss": 3.4736, "mean_token_accuracy": 0.38575803981623275, "step": 376 }, { "epoch": 0.06989247311827956, "grad_norm": 7.98828125, "learning_rate": 9.930107526881721e-06, "loss": 3.3877, "mean_token_accuracy": 0.3846567967698519, "step": 377 }, { "epoch": 0.07007786429365963, "grad_norm": 6.87890625, "learning_rate": 9.929922135706342e-06, "loss": 3.6124, "mean_token_accuracy": 0.365993811698836, "step": 378 }, { "epoch": 0.07026325546903968, "grad_norm": 14.9375, "learning_rate": 9.92973674453096e-06, "loss": 2.9345, "mean_token_accuracy": 0.4124666751301257, "step": 379 }, { "epoch": 0.07044864664441973, "grad_norm": 17.203125, "learning_rate": 9.92955135335558e-06, "loss": 2.9832, "mean_token_accuracy": 0.4029434719589698, "step": 380 }, { "epoch": 0.07063403781979978, "grad_norm": 15.25, "learning_rate": 9.929365962180201e-06, "loss": 2.7983, "mean_token_accuracy": 0.4282109177845663, "step": 381 }, { "epoch": 0.07081942899517983, "grad_norm": 9.1171875, "learning_rate": 9.92918057100482e-06, "loss": 2.9869, "mean_token_accuracy": 0.4103250478011472, "step": 382 }, { "epoch": 0.07100482017055988, "grad_norm": 8.8515625, "learning_rate": 9.92899517982944e-06, "loss": 2.9022, "mean_token_accuracy": 0.40809928151534947, "step": 383 }, { "epoch": 0.07119021134593993, "grad_norm": 6.50390625, "learning_rate": 9.928809788654061e-06, "loss": 3.49, "mean_token_accuracy": 0.37555982085732564, "step": 384 }, { "epoch": 0.07137560252131998, "grad_norm": 8.5546875, "learning_rate": 9.928624397478682e-06, "loss": 3.0957, "mean_token_accuracy": 0.40704993705413345, "step": 385 }, { "epoch": 0.07156099369670003, "grad_norm": 7.35546875, "learning_rate": 9.9284390063033e-06, "loss": 3.3227, "mean_token_accuracy": 0.3978283621140764, "step": 386 }, { "epoch": 0.07174638487208009, "grad_norm": 9.7421875, "learning_rate": 9.928253615127921e-06, "loss": 2.8282, "mean_token_accuracy": 0.4238902114549783, "step": 387 }, { "epoch": 0.07193177604746014, "grad_norm": 10.140625, "learning_rate": 9.928068223952541e-06, "loss": 3.4092, "mean_token_accuracy": 0.3781851274050962, "step": 388 }, { "epoch": 0.07211716722284019, "grad_norm": 7.36328125, "learning_rate": 9.92788283277716e-06, "loss": 3.3315, "mean_token_accuracy": 0.3859465128474043, "step": 389 }, { "epoch": 0.07230255839822025, "grad_norm": 7.24609375, "learning_rate": 9.92769744160178e-06, "loss": 3.0029, "mean_token_accuracy": 0.4217097277614011, "step": 390 }, { "epoch": 0.0724879495736003, "grad_norm": 12.328125, "learning_rate": 9.9275120504264e-06, "loss": 3.1637, "mean_token_accuracy": 0.39295695919508106, "step": 391 }, { "epoch": 0.07267334074898035, "grad_norm": 6.78125, "learning_rate": 9.927326659251022e-06, "loss": 3.2558, "mean_token_accuracy": 0.38784781374219196, "step": 392 }, { "epoch": 0.0728587319243604, "grad_norm": 5.69921875, "learning_rate": 9.92714126807564e-06, "loss": 3.2201, "mean_token_accuracy": 0.3929712460063898, "step": 393 }, { "epoch": 0.07304412309974045, "grad_norm": 7.26953125, "learning_rate": 9.926955876900261e-06, "loss": 3.357, "mean_token_accuracy": 0.39025779803446803, "step": 394 }, { "epoch": 0.0732295142751205, "grad_norm": 8.6875, "learning_rate": 9.92677048572488e-06, "loss": 3.0221, "mean_token_accuracy": 0.40974671369028537, "step": 395 }, { "epoch": 0.07341490545050056, "grad_norm": 11.1328125, "learning_rate": 9.9265850945495e-06, "loss": 3.2178, "mean_token_accuracy": 0.39492710679151355, "step": 396 }, { "epoch": 0.0736002966258806, "grad_norm": 6.71875, "learning_rate": 9.92639970337412e-06, "loss": 3.4733, "mean_token_accuracy": 0.37597911227154046, "step": 397 }, { "epoch": 0.07378568780126066, "grad_norm": 10.546875, "learning_rate": 9.92621431219874e-06, "loss": 3.004, "mean_token_accuracy": 0.4068651374654303, "step": 398 }, { "epoch": 0.07397107897664071, "grad_norm": 6.87109375, "learning_rate": 9.92602892102336e-06, "loss": 3.0414, "mean_token_accuracy": 0.42359005457853244, "step": 399 }, { "epoch": 0.07415647015202076, "grad_norm": 7.52734375, "learning_rate": 9.92584352984798e-06, "loss": 3.3743, "mean_token_accuracy": 0.3960429621254946, "step": 400 }, { "epoch": 0.07434186132740081, "grad_norm": 16.5625, "learning_rate": 9.925658138672601e-06, "loss": 3.0779, "mean_token_accuracy": 0.4026806526806527, "step": 401 }, { "epoch": 0.07452725250278086, "grad_norm": 6.66015625, "learning_rate": 9.92547274749722e-06, "loss": 3.1661, "mean_token_accuracy": 0.39506468615237184, "step": 402 }, { "epoch": 0.07471264367816093, "grad_norm": 6.7109375, "learning_rate": 9.92528735632184e-06, "loss": 3.1679, "mean_token_accuracy": 0.4066322370209296, "step": 403 }, { "epoch": 0.07489803485354098, "grad_norm": 6.30859375, "learning_rate": 9.925101965146459e-06, "loss": 2.8993, "mean_token_accuracy": 0.42763237979306146, "step": 404 }, { "epoch": 0.07508342602892103, "grad_norm": 8.0078125, "learning_rate": 9.92491657397108e-06, "loss": 3.0765, "mean_token_accuracy": 0.4217510457233521, "step": 405 }, { "epoch": 0.07526881720430108, "grad_norm": 7.9609375, "learning_rate": 9.9247311827957e-06, "loss": 3.0178, "mean_token_accuracy": 0.4194839857651246, "step": 406 }, { "epoch": 0.07545420837968113, "grad_norm": 31.734375, "learning_rate": 9.924545791620319e-06, "loss": 3.415, "mean_token_accuracy": 0.36237650933040616, "step": 407 }, { "epoch": 0.07563959955506118, "grad_norm": 9.1953125, "learning_rate": 9.92436040044494e-06, "loss": 3.025, "mean_token_accuracy": 0.42097791798107254, "step": 408 }, { "epoch": 0.07582499073044123, "grad_norm": 7.0859375, "learning_rate": 9.92417500926956e-06, "loss": 2.794, "mean_token_accuracy": 0.4515760040671073, "step": 409 }, { "epoch": 0.07601038190582128, "grad_norm": 7.0859375, "learning_rate": 9.92398961809418e-06, "loss": 3.2994, "mean_token_accuracy": 0.3854875283446712, "step": 410 }, { "epoch": 0.07619577308120133, "grad_norm": 7.4375, "learning_rate": 9.923804226918799e-06, "loss": 3.2092, "mean_token_accuracy": 0.40295767465578786, "step": 411 }, { "epoch": 0.07638116425658138, "grad_norm": 7.83984375, "learning_rate": 9.92361883574342e-06, "loss": 3.7227, "mean_token_accuracy": 0.35226628895184137, "step": 412 }, { "epoch": 0.07656655543196143, "grad_norm": 6.5390625, "learning_rate": 9.923433444568038e-06, "loss": 3.0243, "mean_token_accuracy": 0.4165524801315429, "step": 413 }, { "epoch": 0.07675194660734148, "grad_norm": 8.03125, "learning_rate": 9.923248053392659e-06, "loss": 3.0522, "mean_token_accuracy": 0.4143056200650255, "step": 414 }, { "epoch": 0.07693733778272155, "grad_norm": 6.66796875, "learning_rate": 9.92306266221728e-06, "loss": 2.7394, "mean_token_accuracy": 0.4457690812491901, "step": 415 }, { "epoch": 0.0771227289581016, "grad_norm": 6.54296875, "learning_rate": 9.9228772710419e-06, "loss": 2.3393, "mean_token_accuracy": 0.5100199071001991, "step": 416 }, { "epoch": 0.07730812013348165, "grad_norm": 11.5234375, "learning_rate": 9.922691879866519e-06, "loss": 2.9385, "mean_token_accuracy": 0.4253193580085162, "step": 417 }, { "epoch": 0.0774935113088617, "grad_norm": 8.1171875, "learning_rate": 9.92250648869114e-06, "loss": 3.3027, "mean_token_accuracy": 0.3856152512998267, "step": 418 }, { "epoch": 0.07767890248424175, "grad_norm": 8.890625, "learning_rate": 9.92232109751576e-06, "loss": 3.2797, "mean_token_accuracy": 0.38766840635999517, "step": 419 }, { "epoch": 0.0778642936596218, "grad_norm": 7.9609375, "learning_rate": 9.922135706340378e-06, "loss": 3.0887, "mean_token_accuracy": 0.4093029118870541, "step": 420 }, { "epoch": 0.07804968483500185, "grad_norm": 9.3671875, "learning_rate": 9.921950315164999e-06, "loss": 3.1051, "mean_token_accuracy": 0.39662897375720074, "step": 421 }, { "epoch": 0.0782350760103819, "grad_norm": 6.80859375, "learning_rate": 9.921764923989618e-06, "loss": 3.2713, "mean_token_accuracy": 0.39184581171237953, "step": 422 }, { "epoch": 0.07842046718576196, "grad_norm": 8.7578125, "learning_rate": 9.921579532814238e-06, "loss": 2.8083, "mean_token_accuracy": 0.43615977575332865, "step": 423 }, { "epoch": 0.078605858361142, "grad_norm": 12.53125, "learning_rate": 9.921394141638859e-06, "loss": 2.6467, "mean_token_accuracy": 0.4571752694271129, "step": 424 }, { "epoch": 0.07879124953652206, "grad_norm": 8.2890625, "learning_rate": 9.92120875046348e-06, "loss": 3.017, "mean_token_accuracy": 0.4204812974104107, "step": 425 }, { "epoch": 0.07897664071190211, "grad_norm": 12.5234375, "learning_rate": 9.9210233592881e-06, "loss": 2.9766, "mean_token_accuracy": 0.4030904489143466, "step": 426 }, { "epoch": 0.07916203188728217, "grad_norm": 9.5625, "learning_rate": 9.920837968112719e-06, "loss": 3.3189, "mean_token_accuracy": 0.39043691484618814, "step": 427 }, { "epoch": 0.07934742306266222, "grad_norm": 8.703125, "learning_rate": 9.920652576937339e-06, "loss": 3.2277, "mean_token_accuracy": 0.3963214915595868, "step": 428 }, { "epoch": 0.07953281423804227, "grad_norm": 13.1171875, "learning_rate": 9.920467185761958e-06, "loss": 2.9014, "mean_token_accuracy": 0.44818840579710145, "step": 429 }, { "epoch": 0.07971820541342232, "grad_norm": 8.7265625, "learning_rate": 9.920281794586578e-06, "loss": 3.318, "mean_token_accuracy": 0.3807124443402859, "step": 430 }, { "epoch": 0.07990359658880238, "grad_norm": 11.0703125, "learning_rate": 9.920096403411199e-06, "loss": 3.514, "mean_token_accuracy": 0.36650659037498723, "step": 431 }, { "epoch": 0.08008898776418243, "grad_norm": 8.796875, "learning_rate": 9.91991101223582e-06, "loss": 3.5072, "mean_token_accuracy": 0.3688764829030007, "step": 432 }, { "epoch": 0.08027437893956248, "grad_norm": 7.54296875, "learning_rate": 9.919725621060438e-06, "loss": 3.6727, "mean_token_accuracy": 0.35138888888888886, "step": 433 }, { "epoch": 0.08045977011494253, "grad_norm": 7.65625, "learning_rate": 9.919540229885059e-06, "loss": 3.4004, "mean_token_accuracy": 0.4000682128240109, "step": 434 }, { "epoch": 0.08064516129032258, "grad_norm": 6.19921875, "learning_rate": 9.919354838709679e-06, "loss": 3.1433, "mean_token_accuracy": 0.40173325499412454, "step": 435 }, { "epoch": 0.08083055246570263, "grad_norm": 8.6484375, "learning_rate": 9.919169447534298e-06, "loss": 3.269, "mean_token_accuracy": 0.39300750503386417, "step": 436 }, { "epoch": 0.08101594364108268, "grad_norm": 7.29296875, "learning_rate": 9.918984056358918e-06, "loss": 3.4682, "mean_token_accuracy": 0.3660212367270456, "step": 437 }, { "epoch": 0.08120133481646273, "grad_norm": 10.25, "learning_rate": 9.918798665183537e-06, "loss": 3.6457, "mean_token_accuracy": 0.3582458307597282, "step": 438 }, { "epoch": 0.08138672599184278, "grad_norm": 9.8671875, "learning_rate": 9.918613274008158e-06, "loss": 2.6709, "mean_token_accuracy": 0.450360162856248, "step": 439 }, { "epoch": 0.08157211716722285, "grad_norm": 11.671875, "learning_rate": 9.918427882832778e-06, "loss": 3.1539, "mean_token_accuracy": 0.39481687161179424, "step": 440 }, { "epoch": 0.0817575083426029, "grad_norm": 11.8984375, "learning_rate": 9.918242491657399e-06, "loss": 3.2612, "mean_token_accuracy": 0.39783751010509294, "step": 441 }, { "epoch": 0.08194289951798295, "grad_norm": 9.015625, "learning_rate": 9.918057100482017e-06, "loss": 3.0612, "mean_token_accuracy": 0.4160719006079831, "step": 442 }, { "epoch": 0.082128290693363, "grad_norm": 9.84375, "learning_rate": 9.917871709306638e-06, "loss": 2.8218, "mean_token_accuracy": 0.4304481097649978, "step": 443 }, { "epoch": 0.08231368186874305, "grad_norm": 8.2578125, "learning_rate": 9.917686318131258e-06, "loss": 3.3211, "mean_token_accuracy": 0.41575370961031755, "step": 444 }, { "epoch": 0.0824990730441231, "grad_norm": 10.90625, "learning_rate": 9.917500926955877e-06, "loss": 2.7982, "mean_token_accuracy": 0.433960281601345, "step": 445 }, { "epoch": 0.08268446421950315, "grad_norm": 12.140625, "learning_rate": 9.917315535780498e-06, "loss": 3.3681, "mean_token_accuracy": 0.38062105398788815, "step": 446 }, { "epoch": 0.0828698553948832, "grad_norm": 9.21875, "learning_rate": 9.917130144605116e-06, "loss": 2.8688, "mean_token_accuracy": 0.4178797033781928, "step": 447 }, { "epoch": 0.08305524657026325, "grad_norm": 8.5859375, "learning_rate": 9.916944753429739e-06, "loss": 2.6755, "mean_token_accuracy": 0.45230017641471026, "step": 448 }, { "epoch": 0.0832406377456433, "grad_norm": 8.296875, "learning_rate": 9.916759362254357e-06, "loss": 3.1206, "mean_token_accuracy": 0.4315057671038079, "step": 449 }, { "epoch": 0.08342602892102335, "grad_norm": 8.1328125, "learning_rate": 9.916573971078978e-06, "loss": 3.3526, "mean_token_accuracy": 0.3785459823195672, "step": 450 }, { "epoch": 0.0836114200964034, "grad_norm": 8.6015625, "learning_rate": 9.916388579903597e-06, "loss": 3.3528, "mean_token_accuracy": 0.38765657620041755, "step": 451 }, { "epoch": 0.08379681127178347, "grad_norm": 8.3984375, "learning_rate": 9.916203188728217e-06, "loss": 2.3142, "mean_token_accuracy": 0.5174269005847953, "step": 452 }, { "epoch": 0.08398220244716352, "grad_norm": 16.234375, "learning_rate": 9.916017797552838e-06, "loss": 2.3648, "mean_token_accuracy": 0.4698336085355011, "step": 453 }, { "epoch": 0.08416759362254357, "grad_norm": 8.7578125, "learning_rate": 9.915832406377457e-06, "loss": 3.0772, "mean_token_accuracy": 0.4197109067017083, "step": 454 }, { "epoch": 0.08435298479792362, "grad_norm": 8.21875, "learning_rate": 9.915647015202077e-06, "loss": 3.1772, "mean_token_accuracy": 0.4105668684645019, "step": 455 }, { "epoch": 0.08453837597330367, "grad_norm": 8.03125, "learning_rate": 9.915461624026698e-06, "loss": 3.3031, "mean_token_accuracy": 0.39910955636826206, "step": 456 }, { "epoch": 0.08472376714868372, "grad_norm": 8.4921875, "learning_rate": 9.915276232851318e-06, "loss": 3.0853, "mean_token_accuracy": 0.4126109169131487, "step": 457 }, { "epoch": 0.08490915832406377, "grad_norm": 6.7890625, "learning_rate": 9.915090841675937e-06, "loss": 3.3507, "mean_token_accuracy": 0.3846563665423548, "step": 458 }, { "epoch": 0.08509454949944383, "grad_norm": 9.0546875, "learning_rate": 9.914905450500557e-06, "loss": 3.0794, "mean_token_accuracy": 0.44009632751354605, "step": 459 }, { "epoch": 0.08527994067482388, "grad_norm": 6.64453125, "learning_rate": 9.914720059325176e-06, "loss": 3.2129, "mean_token_accuracy": 0.40235094179294717, "step": 460 }, { "epoch": 0.08546533185020393, "grad_norm": 6.09375, "learning_rate": 9.914534668149797e-06, "loss": 3.1745, "mean_token_accuracy": 0.4080825038973498, "step": 461 }, { "epoch": 0.08565072302558398, "grad_norm": 6.06640625, "learning_rate": 9.914349276974417e-06, "loss": 3.2488, "mean_token_accuracy": 0.40218763146823727, "step": 462 }, { "epoch": 0.08583611420096403, "grad_norm": 8.1015625, "learning_rate": 9.914163885799036e-06, "loss": 3.1788, "mean_token_accuracy": 0.39791580968051554, "step": 463 }, { "epoch": 0.08602150537634409, "grad_norm": 7.64453125, "learning_rate": 9.913978494623658e-06, "loss": 3.1277, "mean_token_accuracy": 0.3926210607225211, "step": 464 }, { "epoch": 0.08620689655172414, "grad_norm": 7.4765625, "learning_rate": 9.913793103448277e-06, "loss": 3.1341, "mean_token_accuracy": 0.39659232580689846, "step": 465 }, { "epoch": 0.0863922877271042, "grad_norm": 7.7734375, "learning_rate": 9.913607712272897e-06, "loss": 3.2317, "mean_token_accuracy": 0.3909190480931992, "step": 466 }, { "epoch": 0.08657767890248425, "grad_norm": 6.375, "learning_rate": 9.913422321097516e-06, "loss": 3.2833, "mean_token_accuracy": 0.3947193843704392, "step": 467 }, { "epoch": 0.0867630700778643, "grad_norm": 7.23046875, "learning_rate": 9.913236929922137e-06, "loss": 2.9127, "mean_token_accuracy": 0.42207425798442616, "step": 468 }, { "epoch": 0.08694846125324435, "grad_norm": 7.61328125, "learning_rate": 9.913051538746757e-06, "loss": 3.0064, "mean_token_accuracy": 0.3991683991683992, "step": 469 }, { "epoch": 0.0871338524286244, "grad_norm": 9.0234375, "learning_rate": 9.912866147571376e-06, "loss": 2.9705, "mean_token_accuracy": 0.42190547636909226, "step": 470 }, { "epoch": 0.08731924360400445, "grad_norm": 6.9921875, "learning_rate": 9.912680756395996e-06, "loss": 2.6942, "mean_token_accuracy": 0.4473190348525469, "step": 471 }, { "epoch": 0.0875046347793845, "grad_norm": 8.6640625, "learning_rate": 9.912495365220617e-06, "loss": 2.7824, "mean_token_accuracy": 0.4219273223365993, "step": 472 }, { "epoch": 0.08769002595476455, "grad_norm": 9.8515625, "learning_rate": 9.912309974045237e-06, "loss": 2.9691, "mean_token_accuracy": 0.4296920395119117, "step": 473 }, { "epoch": 0.0878754171301446, "grad_norm": 11.4921875, "learning_rate": 9.912124582869856e-06, "loss": 3.1641, "mean_token_accuracy": 0.3890962671905697, "step": 474 }, { "epoch": 0.08806080830552465, "grad_norm": 7.84765625, "learning_rate": 9.911939191694477e-06, "loss": 3.0651, "mean_token_accuracy": 0.40698455339153794, "step": 475 }, { "epoch": 0.0882461994809047, "grad_norm": 6.94140625, "learning_rate": 9.911753800519095e-06, "loss": 2.7195, "mean_token_accuracy": 0.45902570657577046, "step": 476 }, { "epoch": 0.08843159065628477, "grad_norm": 9.984375, "learning_rate": 9.911568409343716e-06, "loss": 2.4893, "mean_token_accuracy": 0.4737275064267352, "step": 477 }, { "epoch": 0.08861698183166482, "grad_norm": 12.2734375, "learning_rate": 9.911383018168336e-06, "loss": 3.2334, "mean_token_accuracy": 0.4050864361702128, "step": 478 }, { "epoch": 0.08880237300704487, "grad_norm": 9.875, "learning_rate": 9.911197626992955e-06, "loss": 3.0104, "mean_token_accuracy": 0.42554298348289765, "step": 479 }, { "epoch": 0.08898776418242492, "grad_norm": 10.0, "learning_rate": 9.911012235817576e-06, "loss": 3.0575, "mean_token_accuracy": 0.40624231998033916, "step": 480 }, { "epoch": 0.08917315535780497, "grad_norm": 10.25, "learning_rate": 9.910826844642196e-06, "loss": 2.7407, "mean_token_accuracy": 0.4360285006195787, "step": 481 }, { "epoch": 0.08935854653318502, "grad_norm": 6.3125, "learning_rate": 9.910641453466817e-06, "loss": 3.2688, "mean_token_accuracy": 0.39638615112458936, "step": 482 }, { "epoch": 0.08954393770856507, "grad_norm": 9.703125, "learning_rate": 9.910456062291436e-06, "loss": 2.6631, "mean_token_accuracy": 0.4357098701833917, "step": 483 }, { "epoch": 0.08972932888394512, "grad_norm": 10.0859375, "learning_rate": 9.910270671116056e-06, "loss": 3.1845, "mean_token_accuracy": 0.3989728341667793, "step": 484 }, { "epoch": 0.08991472005932517, "grad_norm": 11.1015625, "learning_rate": 9.910085279940675e-06, "loss": 2.7018, "mean_token_accuracy": 0.4283249460819554, "step": 485 }, { "epoch": 0.09010011123470522, "grad_norm": 5.74609375, "learning_rate": 9.909899888765295e-06, "loss": 3.1636, "mean_token_accuracy": 0.40561257632843967, "step": 486 }, { "epoch": 0.09028550241008527, "grad_norm": 10.40625, "learning_rate": 9.909714497589916e-06, "loss": 3.0088, "mean_token_accuracy": 0.4089074098189868, "step": 487 }, { "epoch": 0.09047089358546533, "grad_norm": 12.03125, "learning_rate": 9.909529106414536e-06, "loss": 3.1917, "mean_token_accuracy": 0.39633614422797325, "step": 488 }, { "epoch": 0.09065628476084539, "grad_norm": 8.4296875, "learning_rate": 9.909343715239155e-06, "loss": 3.1894, "mean_token_accuracy": 0.394391623540739, "step": 489 }, { "epoch": 0.09084167593622544, "grad_norm": 8.765625, "learning_rate": 9.909158324063776e-06, "loss": 3.2605, "mean_token_accuracy": 0.4022971360381862, "step": 490 }, { "epoch": 0.09102706711160549, "grad_norm": 10.6015625, "learning_rate": 9.908972932888396e-06, "loss": 2.7009, "mean_token_accuracy": 0.442383273070272, "step": 491 }, { "epoch": 0.09121245828698554, "grad_norm": 11.5078125, "learning_rate": 9.908787541713015e-06, "loss": 3.1025, "mean_token_accuracy": 0.4097849102864148, "step": 492 }, { "epoch": 0.0913978494623656, "grad_norm": 7.56640625, "learning_rate": 9.908602150537635e-06, "loss": 3.0982, "mean_token_accuracy": 0.41829085457271364, "step": 493 }, { "epoch": 0.09158324063774564, "grad_norm": 11.234375, "learning_rate": 9.908416759362254e-06, "loss": 2.9722, "mean_token_accuracy": 0.38609790569189256, "step": 494 }, { "epoch": 0.0917686318131257, "grad_norm": 7.73046875, "learning_rate": 9.908231368186875e-06, "loss": 3.3339, "mean_token_accuracy": 0.38137963178746215, "step": 495 }, { "epoch": 0.09195402298850575, "grad_norm": 5.5625, "learning_rate": 9.908045977011495e-06, "loss": 3.4163, "mean_token_accuracy": 0.3942775733111718, "step": 496 }, { "epoch": 0.0921394141638858, "grad_norm": 12.203125, "learning_rate": 9.907860585836116e-06, "loss": 3.5249, "mean_token_accuracy": 0.3502646694447573, "step": 497 }, { "epoch": 0.09232480533926585, "grad_norm": 8.84375, "learning_rate": 9.907675194660734e-06, "loss": 3.2102, "mean_token_accuracy": 0.39647936581953835, "step": 498 }, { "epoch": 0.0925101965146459, "grad_norm": 9.0078125, "learning_rate": 9.907489803485355e-06, "loss": 3.1365, "mean_token_accuracy": 0.3905526397036122, "step": 499 }, { "epoch": 0.09269558769002595, "grad_norm": 9.2109375, "learning_rate": 9.907304412309975e-06, "loss": 3.1293, "mean_token_accuracy": 0.4135810217145726, "step": 500 }, { "epoch": 0.092880978865406, "grad_norm": 16.0, "learning_rate": 9.907119021134594e-06, "loss": 2.68, "mean_token_accuracy": 0.42712124114154376, "step": 501 }, { "epoch": 0.09306637004078606, "grad_norm": 7.4765625, "learning_rate": 9.906933629959215e-06, "loss": 3.1706, "mean_token_accuracy": 0.39808671501311527, "step": 502 }, { "epoch": 0.09325176121616612, "grad_norm": 7.7578125, "learning_rate": 9.906748238783834e-06, "loss": 3.3471, "mean_token_accuracy": 0.37984034314309545, "step": 503 }, { "epoch": 0.09343715239154617, "grad_norm": 9.3125, "learning_rate": 9.906562847608454e-06, "loss": 3.1901, "mean_token_accuracy": 0.39259415106248685, "step": 504 }, { "epoch": 0.09362254356692622, "grad_norm": 6.3828125, "learning_rate": 9.906377456433074e-06, "loss": 2.8863, "mean_token_accuracy": 0.4313641704946053, "step": 505 }, { "epoch": 0.09380793474230627, "grad_norm": 9.4765625, "learning_rate": 9.906192065257695e-06, "loss": 2.8253, "mean_token_accuracy": 0.4282671344443052, "step": 506 }, { "epoch": 0.09399332591768632, "grad_norm": 7.8671875, "learning_rate": 9.906006674082315e-06, "loss": 3.6233, "mean_token_accuracy": 0.3689356207341819, "step": 507 }, { "epoch": 0.09417871709306637, "grad_norm": 8.4765625, "learning_rate": 9.905821282906934e-06, "loss": 3.4721, "mean_token_accuracy": 0.37449329591518554, "step": 508 }, { "epoch": 0.09436410826844642, "grad_norm": 6.2890625, "learning_rate": 9.905635891731555e-06, "loss": 3.756, "mean_token_accuracy": 0.3437748871781258, "step": 509 }, { "epoch": 0.09454949944382647, "grad_norm": 11.421875, "learning_rate": 9.905450500556174e-06, "loss": 3.1838, "mean_token_accuracy": 0.41393114491593275, "step": 510 }, { "epoch": 0.09473489061920652, "grad_norm": 10.515625, "learning_rate": 9.905265109380794e-06, "loss": 2.8957, "mean_token_accuracy": 0.4046385949110561, "step": 511 }, { "epoch": 0.09492028179458657, "grad_norm": 5.91796875, "learning_rate": 9.905079718205415e-06, "loss": 2.7827, "mean_token_accuracy": 0.43753187149413564, "step": 512 }, { "epoch": 0.09510567296996662, "grad_norm": 7.1171875, "learning_rate": 9.904894327030035e-06, "loss": 2.9564, "mean_token_accuracy": 0.44311168044718763, "step": 513 }, { "epoch": 0.09529106414534669, "grad_norm": 10.2421875, "learning_rate": 9.904708935854654e-06, "loss": 3.4504, "mean_token_accuracy": 0.35724786827403704, "step": 514 }, { "epoch": 0.09547645532072674, "grad_norm": 9.0234375, "learning_rate": 9.904523544679274e-06, "loss": 3.5049, "mean_token_accuracy": 0.3726017781937295, "step": 515 }, { "epoch": 0.09566184649610679, "grad_norm": 9.2890625, "learning_rate": 9.904338153503895e-06, "loss": 3.1349, "mean_token_accuracy": 0.3977930733092975, "step": 516 }, { "epoch": 0.09584723767148684, "grad_norm": 9.9453125, "learning_rate": 9.904152762328514e-06, "loss": 3.0061, "mean_token_accuracy": 0.4089308176100629, "step": 517 }, { "epoch": 0.09603262884686689, "grad_norm": 7.37890625, "learning_rate": 9.903967371153134e-06, "loss": 3.3977, "mean_token_accuracy": 0.3812831077104179, "step": 518 }, { "epoch": 0.09621802002224694, "grad_norm": 6.78515625, "learning_rate": 9.903781979977753e-06, "loss": 3.0701, "mean_token_accuracy": 0.41572499107886285, "step": 519 }, { "epoch": 0.09640341119762699, "grad_norm": 13.4375, "learning_rate": 9.903596588802373e-06, "loss": 2.8534, "mean_token_accuracy": 0.41735941320293396, "step": 520 }, { "epoch": 0.09658880237300704, "grad_norm": 9.203125, "learning_rate": 9.903411197626994e-06, "loss": 3.3144, "mean_token_accuracy": 0.3787737317149082, "step": 521 }, { "epoch": 0.0967741935483871, "grad_norm": 9.21875, "learning_rate": 9.903225806451614e-06, "loss": 2.9402, "mean_token_accuracy": 0.42441036216007216, "step": 522 }, { "epoch": 0.09695958472376714, "grad_norm": 7.1640625, "learning_rate": 9.903040415276233e-06, "loss": 2.9591, "mean_token_accuracy": 0.4172654884443581, "step": 523 }, { "epoch": 0.0971449758991472, "grad_norm": 10.4609375, "learning_rate": 9.902855024100854e-06, "loss": 2.715, "mean_token_accuracy": 0.44952380952380955, "step": 524 }, { "epoch": 0.09733036707452725, "grad_norm": 8.4921875, "learning_rate": 9.902669632925474e-06, "loss": 3.3993, "mean_token_accuracy": 0.3869340061080866, "step": 525 }, { "epoch": 0.09751575824990731, "grad_norm": 7.46484375, "learning_rate": 9.902484241750093e-06, "loss": 3.2045, "mean_token_accuracy": 0.3825547206795165, "step": 526 }, { "epoch": 0.09770114942528736, "grad_norm": 11.1640625, "learning_rate": 9.902298850574713e-06, "loss": 2.7646, "mean_token_accuracy": 0.44712335757111876, "step": 527 }, { "epoch": 0.09788654060066741, "grad_norm": 6.7421875, "learning_rate": 9.902113459399332e-06, "loss": 3.063, "mean_token_accuracy": 0.40072365445499775, "step": 528 }, { "epoch": 0.09807193177604746, "grad_norm": 9.7890625, "learning_rate": 9.901928068223954e-06, "loss": 2.7241, "mean_token_accuracy": 0.44005805515239477, "step": 529 }, { "epoch": 0.09825732295142751, "grad_norm": 5.5703125, "learning_rate": 9.901742677048573e-06, "loss": 2.8571, "mean_token_accuracy": 0.43361597080605346, "step": 530 }, { "epoch": 0.09844271412680757, "grad_norm": 6.72265625, "learning_rate": 9.901557285873194e-06, "loss": 3.2402, "mean_token_accuracy": 0.4043173089656999, "step": 531 }, { "epoch": 0.09862810530218762, "grad_norm": 9.7109375, "learning_rate": 9.901371894697813e-06, "loss": 2.8658, "mean_token_accuracy": 0.44091611634115496, "step": 532 }, { "epoch": 0.09881349647756767, "grad_norm": 21.25, "learning_rate": 9.901186503522433e-06, "loss": 3.1902, "mean_token_accuracy": 0.3711459403905447, "step": 533 }, { "epoch": 0.09899888765294772, "grad_norm": 7.953125, "learning_rate": 9.901001112347053e-06, "loss": 2.6368, "mean_token_accuracy": 0.4520154748533633, "step": 534 }, { "epoch": 0.09918427882832777, "grad_norm": 12.546875, "learning_rate": 9.900815721171672e-06, "loss": 2.7065, "mean_token_accuracy": 0.4474727452923687, "step": 535 }, { "epoch": 0.09936967000370782, "grad_norm": 5.83203125, "learning_rate": 9.900630329996293e-06, "loss": 3.2118, "mean_token_accuracy": 0.43668559973270965, "step": 536 }, { "epoch": 0.09955506117908787, "grad_norm": 7.09375, "learning_rate": 9.900444938820913e-06, "loss": 2.9621, "mean_token_accuracy": 0.41491299069202753, "step": 537 }, { "epoch": 0.09974045235446792, "grad_norm": 8.4296875, "learning_rate": 9.900259547645534e-06, "loss": 2.8767, "mean_token_accuracy": 0.4382619568615192, "step": 538 }, { "epoch": 0.09992584352984799, "grad_norm": 11.125, "learning_rate": 9.900074156470153e-06, "loss": 3.0035, "mean_token_accuracy": 0.4055264167839325, "step": 539 }, { "epoch": 0.10011123470522804, "grad_norm": 8.6796875, "learning_rate": 9.899888765294773e-06, "loss": 2.8824, "mean_token_accuracy": 0.4249751573368665, "step": 540 }, { "epoch": 0.10029662588060809, "grad_norm": 8.390625, "learning_rate": 9.899703374119392e-06, "loss": 2.7838, "mean_token_accuracy": 0.4313384113166485, "step": 541 }, { "epoch": 0.10048201705598814, "grad_norm": 11.5078125, "learning_rate": 9.899517982944012e-06, "loss": 2.934, "mean_token_accuracy": 0.414598961338719, "step": 542 }, { "epoch": 0.10066740823136819, "grad_norm": 9.9921875, "learning_rate": 9.899332591768633e-06, "loss": 2.5087, "mean_token_accuracy": 0.4635171902052931, "step": 543 }, { "epoch": 0.10085279940674824, "grad_norm": 7.94921875, "learning_rate": 9.899147200593252e-06, "loss": 2.9936, "mean_token_accuracy": 0.40667330677290836, "step": 544 }, { "epoch": 0.10103819058212829, "grad_norm": 7.38671875, "learning_rate": 9.898961809417874e-06, "loss": 2.9504, "mean_token_accuracy": 0.42535932830510886, "step": 545 }, { "epoch": 0.10122358175750834, "grad_norm": 6.96875, "learning_rate": 9.898776418242493e-06, "loss": 3.1097, "mean_token_accuracy": 0.409689557855127, "step": 546 }, { "epoch": 0.10140897293288839, "grad_norm": 7.33984375, "learning_rate": 9.898591027067113e-06, "loss": 3.0158, "mean_token_accuracy": 0.4100429645542428, "step": 547 }, { "epoch": 0.10159436410826844, "grad_norm": 7.33203125, "learning_rate": 9.898405635891732e-06, "loss": 3.0057, "mean_token_accuracy": 0.42205900975053817, "step": 548 }, { "epoch": 0.1017797552836485, "grad_norm": 15.953125, "learning_rate": 9.898220244716352e-06, "loss": 2.541, "mean_token_accuracy": 0.45236523652365235, "step": 549 }, { "epoch": 0.10196514645902854, "grad_norm": 12.8828125, "learning_rate": 9.898034853540973e-06, "loss": 2.7674, "mean_token_accuracy": 0.4315550265122822, "step": 550 }, { "epoch": 0.10215053763440861, "grad_norm": 10.515625, "learning_rate": 9.897849462365592e-06, "loss": 3.2461, "mean_token_accuracy": 0.39080459770114945, "step": 551 }, { "epoch": 0.10233592880978866, "grad_norm": 7.88671875, "learning_rate": 9.897664071190212e-06, "loss": 3.2235, "mean_token_accuracy": 0.4100203276336243, "step": 552 }, { "epoch": 0.10252131998516871, "grad_norm": 12.7109375, "learning_rate": 9.897478680014833e-06, "loss": 2.5025, "mean_token_accuracy": 0.4781881199952193, "step": 553 }, { "epoch": 0.10270671116054876, "grad_norm": 16.125, "learning_rate": 9.897293288839453e-06, "loss": 2.6751, "mean_token_accuracy": 0.4566615226337449, "step": 554 }, { "epoch": 0.10289210233592881, "grad_norm": 13.90625, "learning_rate": 9.897107897664072e-06, "loss": 3.0205, "mean_token_accuracy": 0.402683780630105, "step": 555 }, { "epoch": 0.10307749351130886, "grad_norm": 8.1015625, "learning_rate": 9.896922506488692e-06, "loss": 3.1493, "mean_token_accuracy": 0.4079531051964512, "step": 556 }, { "epoch": 0.10326288468668891, "grad_norm": 8.625, "learning_rate": 9.896737115313311e-06, "loss": 2.9197, "mean_token_accuracy": 0.42426735218509, "step": 557 }, { "epoch": 0.10344827586206896, "grad_norm": 8.3203125, "learning_rate": 9.896551724137932e-06, "loss": 3.653, "mean_token_accuracy": 0.3520766061685781, "step": 558 }, { "epoch": 0.10363366703744901, "grad_norm": 8.3984375, "learning_rate": 9.896366332962552e-06, "loss": 3.2426, "mean_token_accuracy": 0.3994003471674294, "step": 559 }, { "epoch": 0.10381905821282907, "grad_norm": 7.9375, "learning_rate": 9.896180941787171e-06, "loss": 3.0199, "mean_token_accuracy": 0.40754315441002603, "step": 560 }, { "epoch": 0.10400444938820912, "grad_norm": 10.2578125, "learning_rate": 9.895995550611792e-06, "loss": 3.2425, "mean_token_accuracy": 0.38952962460425145, "step": 561 }, { "epoch": 0.10418984056358917, "grad_norm": 10.0546875, "learning_rate": 9.895810159436412e-06, "loss": 2.8934, "mean_token_accuracy": 0.4219742755307609, "step": 562 }, { "epoch": 0.10437523173896923, "grad_norm": 13.40625, "learning_rate": 9.895624768261032e-06, "loss": 2.7176, "mean_token_accuracy": 0.4490403213807469, "step": 563 }, { "epoch": 0.10456062291434928, "grad_norm": 5.7109375, "learning_rate": 9.895439377085651e-06, "loss": 3.0299, "mean_token_accuracy": 0.42014849323045567, "step": 564 }, { "epoch": 0.10474601408972933, "grad_norm": 7.70703125, "learning_rate": 9.895253985910272e-06, "loss": 3.0085, "mean_token_accuracy": 0.4144959128065395, "step": 565 }, { "epoch": 0.10493140526510938, "grad_norm": 11.046875, "learning_rate": 9.89506859473489e-06, "loss": 2.7066, "mean_token_accuracy": 0.4501766784452297, "step": 566 }, { "epoch": 0.10511679644048944, "grad_norm": 7.453125, "learning_rate": 9.894883203559511e-06, "loss": 3.4936, "mean_token_accuracy": 0.3604332669322709, "step": 567 }, { "epoch": 0.10530218761586949, "grad_norm": 7.2109375, "learning_rate": 9.894697812384132e-06, "loss": 3.2169, "mean_token_accuracy": 0.39522168073220015, "step": 568 }, { "epoch": 0.10548757879124954, "grad_norm": 6.40234375, "learning_rate": 9.894512421208752e-06, "loss": 2.944, "mean_token_accuracy": 0.4186946902654867, "step": 569 }, { "epoch": 0.10567296996662959, "grad_norm": 7.5703125, "learning_rate": 9.894327030033371e-06, "loss": 3.4881, "mean_token_accuracy": 0.3633245382585752, "step": 570 }, { "epoch": 0.10585836114200964, "grad_norm": 5.36328125, "learning_rate": 9.894141638857991e-06, "loss": 3.308, "mean_token_accuracy": 0.39363320295523685, "step": 571 }, { "epoch": 0.10604375231738969, "grad_norm": 6.37890625, "learning_rate": 9.893956247682612e-06, "loss": 3.7202, "mean_token_accuracy": 0.34663497623217415, "step": 572 }, { "epoch": 0.10622914349276974, "grad_norm": 11.0390625, "learning_rate": 9.89377085650723e-06, "loss": 2.4134, "mean_token_accuracy": 0.46348547717842326, "step": 573 }, { "epoch": 0.10641453466814979, "grad_norm": 8.2734375, "learning_rate": 9.893585465331851e-06, "loss": 2.4189, "mean_token_accuracy": 0.4828383641674781, "step": 574 }, { "epoch": 0.10659992584352984, "grad_norm": 6.29296875, "learning_rate": 9.89340007415647e-06, "loss": 3.4033, "mean_token_accuracy": 0.36513242478786323, "step": 575 }, { "epoch": 0.1067853170189099, "grad_norm": 15.03125, "learning_rate": 9.89321468298109e-06, "loss": 2.7942, "mean_token_accuracy": 0.4102470041574957, "step": 576 }, { "epoch": 0.10697070819428996, "grad_norm": 8.2109375, "learning_rate": 9.893029291805711e-06, "loss": 3.0124, "mean_token_accuracy": 0.40227895571902494, "step": 577 }, { "epoch": 0.10715609936967001, "grad_norm": 7.24609375, "learning_rate": 9.892843900630331e-06, "loss": 2.9258, "mean_token_accuracy": 0.42379448909299655, "step": 578 }, { "epoch": 0.10734149054505006, "grad_norm": 8.28125, "learning_rate": 9.89265850945495e-06, "loss": 2.8692, "mean_token_accuracy": 0.4168611435239207, "step": 579 }, { "epoch": 0.10752688172043011, "grad_norm": 7.74609375, "learning_rate": 9.89247311827957e-06, "loss": 3.1486, "mean_token_accuracy": 0.3992425981179711, "step": 580 }, { "epoch": 0.10771227289581016, "grad_norm": 12.3203125, "learning_rate": 9.892287727104191e-06, "loss": 3.4727, "mean_token_accuracy": 0.3803249328386849, "step": 581 }, { "epoch": 0.10789766407119021, "grad_norm": 8.46875, "learning_rate": 9.89210233592881e-06, "loss": 2.8421, "mean_token_accuracy": 0.42551020408163265, "step": 582 }, { "epoch": 0.10808305524657026, "grad_norm": 7.3203125, "learning_rate": 9.89191694475343e-06, "loss": 3.3514, "mean_token_accuracy": 0.3931807884005437, "step": 583 }, { "epoch": 0.10826844642195031, "grad_norm": 7.66796875, "learning_rate": 9.89173155357805e-06, "loss": 3.1343, "mean_token_accuracy": 0.40583554376657827, "step": 584 }, { "epoch": 0.10845383759733036, "grad_norm": 6.328125, "learning_rate": 9.891546162402671e-06, "loss": 3.0979, "mean_token_accuracy": 0.3949170397682381, "step": 585 }, { "epoch": 0.10863922877271041, "grad_norm": 7.5234375, "learning_rate": 9.89136077122729e-06, "loss": 2.7459, "mean_token_accuracy": 0.4419388585377194, "step": 586 }, { "epoch": 0.10882461994809046, "grad_norm": 6.80859375, "learning_rate": 9.89117538005191e-06, "loss": 2.948, "mean_token_accuracy": 0.4154088463052894, "step": 587 }, { "epoch": 0.10901001112347053, "grad_norm": 7.26171875, "learning_rate": 9.890989988876531e-06, "loss": 3.3296, "mean_token_accuracy": 0.38610216290842153, "step": 588 }, { "epoch": 0.10919540229885058, "grad_norm": 8.1328125, "learning_rate": 9.89080459770115e-06, "loss": 2.9242, "mean_token_accuracy": 0.4286278641032394, "step": 589 }, { "epoch": 0.10938079347423063, "grad_norm": 9.171875, "learning_rate": 9.89061920652577e-06, "loss": 2.9386, "mean_token_accuracy": 0.3997347731363077, "step": 590 }, { "epoch": 0.10956618464961068, "grad_norm": 7.13671875, "learning_rate": 9.89043381535039e-06, "loss": 2.7333, "mean_token_accuracy": 0.4522235576923077, "step": 591 }, { "epoch": 0.10975157582499073, "grad_norm": 9.3515625, "learning_rate": 9.89024842417501e-06, "loss": 2.7505, "mean_token_accuracy": 0.43312723722746505, "step": 592 }, { "epoch": 0.10993696700037078, "grad_norm": 8.1328125, "learning_rate": 9.89006303299963e-06, "loss": 3.2415, "mean_token_accuracy": 0.39282921925611525, "step": 593 }, { "epoch": 0.11012235817575083, "grad_norm": 7.2734375, "learning_rate": 9.88987764182425e-06, "loss": 2.6174, "mean_token_accuracy": 0.47191466378611935, "step": 594 }, { "epoch": 0.11030774935113088, "grad_norm": 13.7734375, "learning_rate": 9.88969225064887e-06, "loss": 2.5852, "mean_token_accuracy": 0.4635452423451874, "step": 595 }, { "epoch": 0.11049314052651094, "grad_norm": 6.69921875, "learning_rate": 9.88950685947349e-06, "loss": 3.0751, "mean_token_accuracy": 0.4011641177813285, "step": 596 }, { "epoch": 0.11067853170189099, "grad_norm": 7.35546875, "learning_rate": 9.88932146829811e-06, "loss": 2.8879, "mean_token_accuracy": 0.43366461587001104, "step": 597 }, { "epoch": 0.11086392287727104, "grad_norm": 7.484375, "learning_rate": 9.88913607712273e-06, "loss": 3.0651, "mean_token_accuracy": 0.4127983599355689, "step": 598 }, { "epoch": 0.11104931405265109, "grad_norm": 7.33203125, "learning_rate": 9.88895068594735e-06, "loss": 2.7786, "mean_token_accuracy": 0.4375, "step": 599 }, { "epoch": 0.11123470522803114, "grad_norm": 6.90234375, "learning_rate": 9.888765294771969e-06, "loss": 2.6548, "mean_token_accuracy": 0.44922443044110516, "step": 600 }, { "epoch": 0.1114200964034112, "grad_norm": 5.953125, "learning_rate": 9.88857990359659e-06, "loss": 3.0822, "mean_token_accuracy": 0.39007669068092027, "step": 601 }, { "epoch": 0.11160548757879125, "grad_norm": 5.98828125, "learning_rate": 9.88839451242121e-06, "loss": 3.364, "mean_token_accuracy": 0.380719794344473, "step": 602 }, { "epoch": 0.1117908787541713, "grad_norm": 10.3984375, "learning_rate": 9.88820912124583e-06, "loss": 3.4323, "mean_token_accuracy": 0.3897013641391176, "step": 603 }, { "epoch": 0.11197626992955136, "grad_norm": 7.0078125, "learning_rate": 9.888023730070449e-06, "loss": 2.8606, "mean_token_accuracy": 0.4246031746031746, "step": 604 }, { "epoch": 0.1121616611049314, "grad_norm": 6.8125, "learning_rate": 9.88783833889507e-06, "loss": 3.077, "mean_token_accuracy": 0.3956997359486986, "step": 605 }, { "epoch": 0.11234705228031146, "grad_norm": 10.3671875, "learning_rate": 9.88765294771969e-06, "loss": 2.9796, "mean_token_accuracy": 0.41631701631701634, "step": 606 }, { "epoch": 0.11253244345569151, "grad_norm": 9.625, "learning_rate": 9.887467556544309e-06, "loss": 3.1043, "mean_token_accuracy": 0.41048087300635566, "step": 607 }, { "epoch": 0.11271783463107156, "grad_norm": 9.3125, "learning_rate": 9.88728216536893e-06, "loss": 3.0037, "mean_token_accuracy": 0.4041991601679664, "step": 608 }, { "epoch": 0.11290322580645161, "grad_norm": 6.3984375, "learning_rate": 9.88709677419355e-06, "loss": 3.1005, "mean_token_accuracy": 0.39710365853658536, "step": 609 }, { "epoch": 0.11308861698183166, "grad_norm": 7.8828125, "learning_rate": 9.88691138301817e-06, "loss": 3.1799, "mean_token_accuracy": 0.3976857490864799, "step": 610 }, { "epoch": 0.11327400815721171, "grad_norm": 7.828125, "learning_rate": 9.886725991842789e-06, "loss": 2.9804, "mean_token_accuracy": 0.414394497666421, "step": 611 }, { "epoch": 0.11345939933259176, "grad_norm": 15.3046875, "learning_rate": 9.88654060066741e-06, "loss": 2.7082, "mean_token_accuracy": 0.43707273338563263, "step": 612 }, { "epoch": 0.11364479050797183, "grad_norm": 6.1796875, "learning_rate": 9.886355209492028e-06, "loss": 3.1872, "mean_token_accuracy": 0.41010071090047395, "step": 613 }, { "epoch": 0.11383018168335188, "grad_norm": 7.55078125, "learning_rate": 9.886169818316649e-06, "loss": 2.9411, "mean_token_accuracy": 0.43027276219746446, "step": 614 }, { "epoch": 0.11401557285873193, "grad_norm": 6.59765625, "learning_rate": 9.88598442714127e-06, "loss": 2.7976, "mean_token_accuracy": 0.4294573643410853, "step": 615 }, { "epoch": 0.11420096403411198, "grad_norm": 6.69921875, "learning_rate": 9.885799035965888e-06, "loss": 2.6748, "mean_token_accuracy": 0.4445882704371204, "step": 616 }, { "epoch": 0.11438635520949203, "grad_norm": 6.07421875, "learning_rate": 9.885613644790509e-06, "loss": 2.8201, "mean_token_accuracy": 0.43463191459864137, "step": 617 }, { "epoch": 0.11457174638487208, "grad_norm": 8.046875, "learning_rate": 9.885428253615129e-06, "loss": 3.2382, "mean_token_accuracy": 0.40473330129745316, "step": 618 }, { "epoch": 0.11475713756025213, "grad_norm": 5.34765625, "learning_rate": 9.88524286243975e-06, "loss": 2.7734, "mean_token_accuracy": 0.4668, "step": 619 }, { "epoch": 0.11494252873563218, "grad_norm": 7.04296875, "learning_rate": 9.885057471264368e-06, "loss": 2.7128, "mean_token_accuracy": 0.44738628649015616, "step": 620 }, { "epoch": 0.11512791991101223, "grad_norm": 8.78125, "learning_rate": 9.884872080088989e-06, "loss": 2.4175, "mean_token_accuracy": 0.4752225322173509, "step": 621 }, { "epoch": 0.11531331108639228, "grad_norm": 6.45703125, "learning_rate": 9.884686688913608e-06, "loss": 2.963, "mean_token_accuracy": 0.42620897018291154, "step": 622 }, { "epoch": 0.11549870226177233, "grad_norm": 7.15234375, "learning_rate": 9.884501297738228e-06, "loss": 3.2089, "mean_token_accuracy": 0.38625725519565624, "step": 623 }, { "epoch": 0.11568409343715239, "grad_norm": 6.80078125, "learning_rate": 9.884315906562849e-06, "loss": 2.9824, "mean_token_accuracy": 0.4298110067752288, "step": 624 }, { "epoch": 0.11586948461253245, "grad_norm": 6.421875, "learning_rate": 9.884130515387467e-06, "loss": 2.9457, "mean_token_accuracy": 0.4301253496322387, "step": 625 }, { "epoch": 0.1160548757879125, "grad_norm": 6.5546875, "learning_rate": 9.88394512421209e-06, "loss": 3.0506, "mean_token_accuracy": 0.40153886280646844, "step": 626 }, { "epoch": 0.11624026696329255, "grad_norm": 7.3828125, "learning_rate": 9.883759733036708e-06, "loss": 2.7686, "mean_token_accuracy": 0.4399644233619923, "step": 627 }, { "epoch": 0.1164256581386726, "grad_norm": 7.7109375, "learning_rate": 9.883574341861329e-06, "loss": 3.2799, "mean_token_accuracy": 0.4031502212965374, "step": 628 }, { "epoch": 0.11661104931405265, "grad_norm": 6.26953125, "learning_rate": 9.883388950685948e-06, "loss": 2.7435, "mean_token_accuracy": 0.45799031476997576, "step": 629 }, { "epoch": 0.1167964404894327, "grad_norm": 6.80078125, "learning_rate": 9.883203559510568e-06, "loss": 3.0745, "mean_token_accuracy": 0.42391716125773454, "step": 630 }, { "epoch": 0.11698183166481275, "grad_norm": 6.9140625, "learning_rate": 9.883018168335189e-06, "loss": 3.0794, "mean_token_accuracy": 0.4025974025974026, "step": 631 }, { "epoch": 0.1171672228401928, "grad_norm": 8.546875, "learning_rate": 9.882832777159807e-06, "loss": 2.3033, "mean_token_accuracy": 0.494098955969133, "step": 632 }, { "epoch": 0.11735261401557286, "grad_norm": 10.3515625, "learning_rate": 9.882647385984428e-06, "loss": 2.9588, "mean_token_accuracy": 0.4038412617654541, "step": 633 }, { "epoch": 0.11753800519095291, "grad_norm": 7.515625, "learning_rate": 9.882461994809048e-06, "loss": 2.9061, "mean_token_accuracy": 0.41789748045178104, "step": 634 }, { "epoch": 0.11772339636633296, "grad_norm": 7.6015625, "learning_rate": 9.882276603633669e-06, "loss": 2.8987, "mean_token_accuracy": 0.4271176294522598, "step": 635 }, { "epoch": 0.11790878754171301, "grad_norm": 8.34375, "learning_rate": 9.882091212458288e-06, "loss": 3.0162, "mean_token_accuracy": 0.4147084421235857, "step": 636 }, { "epoch": 0.11809417871709306, "grad_norm": 10.03125, "learning_rate": 9.881905821282908e-06, "loss": 2.5506, "mean_token_accuracy": 0.4669016411499064, "step": 637 }, { "epoch": 0.11827956989247312, "grad_norm": 6.921875, "learning_rate": 9.881720430107527e-06, "loss": 2.8896, "mean_token_accuracy": 0.42791265427912656, "step": 638 }, { "epoch": 0.11846496106785318, "grad_norm": 7.60546875, "learning_rate": 9.881535038932147e-06, "loss": 3.1056, "mean_token_accuracy": 0.39513721029432675, "step": 639 }, { "epoch": 0.11865035224323323, "grad_norm": 7.984375, "learning_rate": 9.881349647756768e-06, "loss": 3.2403, "mean_token_accuracy": 0.3993878094224115, "step": 640 }, { "epoch": 0.11883574341861328, "grad_norm": 11.3046875, "learning_rate": 9.881164256581387e-06, "loss": 3.0229, "mean_token_accuracy": 0.42289750766360124, "step": 641 }, { "epoch": 0.11902113459399333, "grad_norm": 10.1640625, "learning_rate": 9.880978865406007e-06, "loss": 3.3999, "mean_token_accuracy": 0.35867973414441817, "step": 642 }, { "epoch": 0.11920652576937338, "grad_norm": 12.3828125, "learning_rate": 9.880793474230628e-06, "loss": 2.7568, "mean_token_accuracy": 0.452468380252958, "step": 643 }, { "epoch": 0.11939191694475343, "grad_norm": 5.3671875, "learning_rate": 9.880608083055248e-06, "loss": 2.9786, "mean_token_accuracy": 0.4222486615110054, "step": 644 }, { "epoch": 0.11957730812013348, "grad_norm": 8.6796875, "learning_rate": 9.880422691879867e-06, "loss": 2.6843, "mean_token_accuracy": 0.4693542272210287, "step": 645 }, { "epoch": 0.11976269929551353, "grad_norm": 8.1484375, "learning_rate": 9.880237300704488e-06, "loss": 2.7184, "mean_token_accuracy": 0.44553226696083836, "step": 646 }, { "epoch": 0.11994809047089358, "grad_norm": 13.5546875, "learning_rate": 9.880051909529106e-06, "loss": 3.0748, "mean_token_accuracy": 0.3765888825649782, "step": 647 }, { "epoch": 0.12013348164627363, "grad_norm": 9.3984375, "learning_rate": 9.879866518353727e-06, "loss": 2.9943, "mean_token_accuracy": 0.40639350052984813, "step": 648 }, { "epoch": 0.12031887282165368, "grad_norm": 5.76171875, "learning_rate": 9.879681127178347e-06, "loss": 3.0106, "mean_token_accuracy": 0.40795701675834717, "step": 649 }, { "epoch": 0.12050426399703375, "grad_norm": 6.57421875, "learning_rate": 9.879495736002968e-06, "loss": 3.25, "mean_token_accuracy": 0.39554494828957837, "step": 650 }, { "epoch": 0.1206896551724138, "grad_norm": 8.0546875, "learning_rate": 9.879310344827587e-06, "loss": 2.7496, "mean_token_accuracy": 0.44060114503816794, "step": 651 }, { "epoch": 0.12087504634779385, "grad_norm": 7.1875, "learning_rate": 9.879124953652207e-06, "loss": 2.7566, "mean_token_accuracy": 0.43593967811430745, "step": 652 }, { "epoch": 0.1210604375231739, "grad_norm": 6.5859375, "learning_rate": 9.878939562476828e-06, "loss": 3.0319, "mean_token_accuracy": 0.4093316803786769, "step": 653 }, { "epoch": 0.12124582869855395, "grad_norm": 6.0390625, "learning_rate": 9.878754171301446e-06, "loss": 2.7842, "mean_token_accuracy": 0.45045698534745393, "step": 654 }, { "epoch": 0.121431219873934, "grad_norm": 9.3046875, "learning_rate": 9.878568780126067e-06, "loss": 3.4031, "mean_token_accuracy": 0.37794656888423256, "step": 655 }, { "epoch": 0.12161661104931405, "grad_norm": 7.484375, "learning_rate": 9.878383388950686e-06, "loss": 2.7452, "mean_token_accuracy": 0.4482017481014472, "step": 656 }, { "epoch": 0.1218020022246941, "grad_norm": 9.3984375, "learning_rate": 9.878197997775306e-06, "loss": 3.2324, "mean_token_accuracy": 0.3964526605046215, "step": 657 }, { "epoch": 0.12198739340007415, "grad_norm": 11.0, "learning_rate": 9.878012606599927e-06, "loss": 3.3105, "mean_token_accuracy": 0.37773512476007676, "step": 658 }, { "epoch": 0.1221727845754542, "grad_norm": 11.0703125, "learning_rate": 9.877827215424547e-06, "loss": 3.3004, "mean_token_accuracy": 0.38499690018598887, "step": 659 }, { "epoch": 0.12235817575083426, "grad_norm": 8.171875, "learning_rate": 9.877641824249166e-06, "loss": 3.3313, "mean_token_accuracy": 0.3730419902593129, "step": 660 }, { "epoch": 0.1225435669262143, "grad_norm": 7.3203125, "learning_rate": 9.877456433073786e-06, "loss": 2.6276, "mean_token_accuracy": 0.44212479703085134, "step": 661 }, { "epoch": 0.12272895810159437, "grad_norm": 6.55859375, "learning_rate": 9.877271041898407e-06, "loss": 3.1038, "mean_token_accuracy": 0.3980857851825594, "step": 662 }, { "epoch": 0.12291434927697442, "grad_norm": 6.03515625, "learning_rate": 9.877085650723026e-06, "loss": 3.2664, "mean_token_accuracy": 0.39626853771328335, "step": 663 }, { "epoch": 0.12309974045235447, "grad_norm": 7.05078125, "learning_rate": 9.876900259547646e-06, "loss": 3.1763, "mean_token_accuracy": 0.3953143934293793, "step": 664 }, { "epoch": 0.12328513162773452, "grad_norm": 8.28125, "learning_rate": 9.876714868372265e-06, "loss": 2.8905, "mean_token_accuracy": 0.423560281729043, "step": 665 }, { "epoch": 0.12347052280311457, "grad_norm": 6.21484375, "learning_rate": 9.876529477196887e-06, "loss": 2.899, "mean_token_accuracy": 0.425415735914619, "step": 666 }, { "epoch": 0.12365591397849462, "grad_norm": 9.3984375, "learning_rate": 9.876344086021506e-06, "loss": 2.8276, "mean_token_accuracy": 0.4398491945618645, "step": 667 }, { "epoch": 0.12384130515387468, "grad_norm": 9.8984375, "learning_rate": 9.876158694846126e-06, "loss": 3.076, "mean_token_accuracy": 0.40351941747572817, "step": 668 }, { "epoch": 0.12402669632925473, "grad_norm": 8.0546875, "learning_rate": 9.875973303670747e-06, "loss": 2.9766, "mean_token_accuracy": 0.42509241139378123, "step": 669 }, { "epoch": 0.12421208750463478, "grad_norm": 8.265625, "learning_rate": 9.875787912495366e-06, "loss": 2.8745, "mean_token_accuracy": 0.42353594227033026, "step": 670 }, { "epoch": 0.12439747868001483, "grad_norm": 9.1796875, "learning_rate": 9.875602521319986e-06, "loss": 3.0462, "mean_token_accuracy": 0.41735537190082644, "step": 671 }, { "epoch": 0.12458286985539488, "grad_norm": 8.96875, "learning_rate": 9.875417130144605e-06, "loss": 2.8012, "mean_token_accuracy": 0.4233263840630067, "step": 672 }, { "epoch": 0.12476826103077493, "grad_norm": 7.2734375, "learning_rate": 9.875231738969226e-06, "loss": 3.5253, "mean_token_accuracy": 0.3751308153363143, "step": 673 }, { "epoch": 0.12495365220615498, "grad_norm": 13.3671875, "learning_rate": 9.875046347793846e-06, "loss": 2.8286, "mean_token_accuracy": 0.4217142857142857, "step": 674 }, { "epoch": 0.12513904338153503, "grad_norm": 11.5703125, "learning_rate": 9.874860956618467e-06, "loss": 3.2542, "mean_token_accuracy": 0.3853965183752418, "step": 675 }, { "epoch": 0.12532443455691508, "grad_norm": 11.171875, "learning_rate": 9.874675565443085e-06, "loss": 2.5511, "mean_token_accuracy": 0.4512676056338028, "step": 676 }, { "epoch": 0.12550982573229513, "grad_norm": 6.765625, "learning_rate": 9.874490174267706e-06, "loss": 2.9929, "mean_token_accuracy": 0.41382038770934987, "step": 677 }, { "epoch": 0.12569521690767518, "grad_norm": 18.234375, "learning_rate": 9.874304783092326e-06, "loss": 3.1176, "mean_token_accuracy": 0.38240428255556846, "step": 678 }, { "epoch": 0.12588060808305523, "grad_norm": 12.8671875, "learning_rate": 9.874119391916945e-06, "loss": 2.2764, "mean_token_accuracy": 0.485667382245233, "step": 679 }, { "epoch": 0.12606599925843529, "grad_norm": 8.578125, "learning_rate": 9.873934000741566e-06, "loss": 3.1105, "mean_token_accuracy": 0.3994442006728097, "step": 680 }, { "epoch": 0.12625139043381536, "grad_norm": 8.40625, "learning_rate": 9.873748609566184e-06, "loss": 3.0083, "mean_token_accuracy": 0.4122506786262245, "step": 681 }, { "epoch": 0.12643678160919541, "grad_norm": 15.21875, "learning_rate": 9.873563218390807e-06, "loss": 2.8403, "mean_token_accuracy": 0.42344559585492225, "step": 682 }, { "epoch": 0.12662217278457547, "grad_norm": 11.9296875, "learning_rate": 9.873377827215425e-06, "loss": 2.9019, "mean_token_accuracy": 0.427928870292887, "step": 683 }, { "epoch": 0.12680756395995552, "grad_norm": 8.6640625, "learning_rate": 9.873192436040046e-06, "loss": 3.0652, "mean_token_accuracy": 0.3993808049535604, "step": 684 }, { "epoch": 0.12699295513533557, "grad_norm": 10.5078125, "learning_rate": 9.873007044864665e-06, "loss": 2.6465, "mean_token_accuracy": 0.4363766339869281, "step": 685 }, { "epoch": 0.12717834631071562, "grad_norm": 6.3359375, "learning_rate": 9.872821653689285e-06, "loss": 2.899, "mean_token_accuracy": 0.442664311845591, "step": 686 }, { "epoch": 0.12736373748609567, "grad_norm": 10.1953125, "learning_rate": 9.872636262513906e-06, "loss": 3.2416, "mean_token_accuracy": 0.39878197320341047, "step": 687 }, { "epoch": 0.12754912866147572, "grad_norm": 10.109375, "learning_rate": 9.872450871338524e-06, "loss": 2.6962, "mean_token_accuracy": 0.4372042227884965, "step": 688 }, { "epoch": 0.12773451983685577, "grad_norm": 20.03125, "learning_rate": 9.872265480163145e-06, "loss": 3.223, "mean_token_accuracy": 0.36838487972508593, "step": 689 }, { "epoch": 0.12791991101223582, "grad_norm": 8.6640625, "learning_rate": 9.872080088987765e-06, "loss": 3.2226, "mean_token_accuracy": 0.39855274144169217, "step": 690 }, { "epoch": 0.12810530218761587, "grad_norm": 10.3125, "learning_rate": 9.871894697812386e-06, "loss": 2.9064, "mean_token_accuracy": 0.41621691081984064, "step": 691 }, { "epoch": 0.12829069336299592, "grad_norm": 8.640625, "learning_rate": 9.871709306637005e-06, "loss": 2.9699, "mean_token_accuracy": 0.4099699183498066, "step": 692 }, { "epoch": 0.12847608453837597, "grad_norm": 6.9765625, "learning_rate": 9.871523915461625e-06, "loss": 3.0353, "mean_token_accuracy": 0.4178212787593746, "step": 693 }, { "epoch": 0.12866147571375602, "grad_norm": 7.75, "learning_rate": 9.871338524286244e-06, "loss": 2.6648, "mean_token_accuracy": 0.45852187028657615, "step": 694 }, { "epoch": 0.12884686688913607, "grad_norm": 6.50390625, "learning_rate": 9.871153133110865e-06, "loss": 3.0548, "mean_token_accuracy": 0.41835616438356166, "step": 695 }, { "epoch": 0.12903225806451613, "grad_norm": 8.875, "learning_rate": 9.870967741935485e-06, "loss": 3.2201, "mean_token_accuracy": 0.40557903634826714, "step": 696 }, { "epoch": 0.12921764923989618, "grad_norm": 6.6484375, "learning_rate": 9.870782350760104e-06, "loss": 2.9487, "mean_token_accuracy": 0.4257959388317874, "step": 697 }, { "epoch": 0.12940304041527623, "grad_norm": 15.0703125, "learning_rate": 9.870596959584724e-06, "loss": 3.0872, "mean_token_accuracy": 0.38031737565008666, "step": 698 }, { "epoch": 0.12958843159065628, "grad_norm": 7.10546875, "learning_rate": 9.870411568409345e-06, "loss": 3.2629, "mean_token_accuracy": 0.40667157223448613, "step": 699 }, { "epoch": 0.12977382276603633, "grad_norm": 6.98046875, "learning_rate": 9.870226177233965e-06, "loss": 2.8492, "mean_token_accuracy": 0.43875347115087937, "step": 700 }, { "epoch": 0.12995921394141638, "grad_norm": 5.9765625, "learning_rate": 9.870040786058584e-06, "loss": 2.978, "mean_token_accuracy": 0.40629453681710215, "step": 701 }, { "epoch": 0.13014460511679643, "grad_norm": 7.44140625, "learning_rate": 9.869855394883205e-06, "loss": 3.2567, "mean_token_accuracy": 0.3889651386220148, "step": 702 }, { "epoch": 0.13032999629217648, "grad_norm": 8.0625, "learning_rate": 9.869670003707823e-06, "loss": 2.9607, "mean_token_accuracy": 0.4, "step": 703 }, { "epoch": 0.13051538746755653, "grad_norm": 6.07421875, "learning_rate": 9.869484612532444e-06, "loss": 2.8669, "mean_token_accuracy": 0.4226688303582574, "step": 704 }, { "epoch": 0.13070077864293658, "grad_norm": 6.5546875, "learning_rate": 9.869299221357064e-06, "loss": 2.8474, "mean_token_accuracy": 0.4220518867924528, "step": 705 }, { "epoch": 0.13088616981831666, "grad_norm": 9.2734375, "learning_rate": 9.869113830181685e-06, "loss": 2.7965, "mean_token_accuracy": 0.4494828957836118, "step": 706 }, { "epoch": 0.1310715609936967, "grad_norm": 8.4296875, "learning_rate": 9.868928439006305e-06, "loss": 2.6898, "mean_token_accuracy": 0.45481171548117155, "step": 707 }, { "epoch": 0.13125695216907676, "grad_norm": 7.35546875, "learning_rate": 9.868743047830924e-06, "loss": 3.0509, "mean_token_accuracy": 0.4099435917617736, "step": 708 }, { "epoch": 0.1314423433444568, "grad_norm": 10.2890625, "learning_rate": 9.868557656655545e-06, "loss": 2.332, "mean_token_accuracy": 0.5083138656039576, "step": 709 }, { "epoch": 0.13162773451983686, "grad_norm": 7.9609375, "learning_rate": 9.868372265480163e-06, "loss": 3.4955, "mean_token_accuracy": 0.3647892949870182, "step": 710 }, { "epoch": 0.13181312569521692, "grad_norm": 9.828125, "learning_rate": 9.868186874304784e-06, "loss": 3.0041, "mean_token_accuracy": 0.40580575797395774, "step": 711 }, { "epoch": 0.13199851687059697, "grad_norm": 5.58984375, "learning_rate": 9.868001483129403e-06, "loss": 3.4287, "mean_token_accuracy": 0.35912722069870034, "step": 712 }, { "epoch": 0.13218390804597702, "grad_norm": 9.453125, "learning_rate": 9.867816091954023e-06, "loss": 2.9458, "mean_token_accuracy": 0.4134373760084645, "step": 713 }, { "epoch": 0.13236929922135707, "grad_norm": 11.046875, "learning_rate": 9.867630700778644e-06, "loss": 2.8708, "mean_token_accuracy": 0.4248428532385898, "step": 714 }, { "epoch": 0.13255469039673712, "grad_norm": 10.4765625, "learning_rate": 9.867445309603264e-06, "loss": 3.3693, "mean_token_accuracy": 0.37528634954857837, "step": 715 }, { "epoch": 0.13274008157211717, "grad_norm": 6.6953125, "learning_rate": 9.867259918427885e-06, "loss": 3.1915, "mean_token_accuracy": 0.38454580695180257, "step": 716 }, { "epoch": 0.13292547274749722, "grad_norm": 11.0234375, "learning_rate": 9.867074527252503e-06, "loss": 3.0738, "mean_token_accuracy": 0.4051822976381564, "step": 717 }, { "epoch": 0.13311086392287727, "grad_norm": 7.00390625, "learning_rate": 9.866889136077124e-06, "loss": 3.4506, "mean_token_accuracy": 0.36359537050954455, "step": 718 }, { "epoch": 0.13329625509825732, "grad_norm": 7.140625, "learning_rate": 9.866703744901743e-06, "loss": 3.1755, "mean_token_accuracy": 0.41415640302715534, "step": 719 }, { "epoch": 0.13348164627363737, "grad_norm": 10.53125, "learning_rate": 9.866518353726363e-06, "loss": 2.8631, "mean_token_accuracy": 0.4196456985255118, "step": 720 }, { "epoch": 0.13366703744901742, "grad_norm": 9.1484375, "learning_rate": 9.866332962550984e-06, "loss": 3.9717, "mean_token_accuracy": 0.32326324194836303, "step": 721 }, { "epoch": 0.13385242862439747, "grad_norm": 7.6015625, "learning_rate": 9.866147571375604e-06, "loss": 3.2262, "mean_token_accuracy": 0.3940175953079179, "step": 722 }, { "epoch": 0.13403781979977752, "grad_norm": 7.48046875, "learning_rate": 9.865962180200223e-06, "loss": 3.1626, "mean_token_accuracy": 0.38652597402597405, "step": 723 }, { "epoch": 0.13422321097515758, "grad_norm": 9.5546875, "learning_rate": 9.865776789024844e-06, "loss": 3.1989, "mean_token_accuracy": 0.4015409018808067, "step": 724 }, { "epoch": 0.13440860215053763, "grad_norm": 12.34375, "learning_rate": 9.865591397849464e-06, "loss": 2.7034, "mean_token_accuracy": 0.45056320400500627, "step": 725 }, { "epoch": 0.13459399332591768, "grad_norm": 12.1875, "learning_rate": 9.865406006674083e-06, "loss": 2.7679, "mean_token_accuracy": 0.44550369948776325, "step": 726 }, { "epoch": 0.13477938450129773, "grad_norm": 8.0, "learning_rate": 9.865220615498703e-06, "loss": 2.7701, "mean_token_accuracy": 0.4434520357267138, "step": 727 }, { "epoch": 0.13496477567667778, "grad_norm": 9.140625, "learning_rate": 9.865035224323322e-06, "loss": 2.7807, "mean_token_accuracy": 0.4473251780171627, "step": 728 }, { "epoch": 0.13515016685205783, "grad_norm": 7.7734375, "learning_rate": 9.864849833147943e-06, "loss": 3.026, "mean_token_accuracy": 0.4076058772687986, "step": 729 }, { "epoch": 0.1353355580274379, "grad_norm": 9.4609375, "learning_rate": 9.864664441972563e-06, "loss": 2.5247, "mean_token_accuracy": 0.46952672795369427, "step": 730 }, { "epoch": 0.13552094920281796, "grad_norm": 6.6953125, "learning_rate": 9.864479050797184e-06, "loss": 3.3639, "mean_token_accuracy": 0.38154450261780104, "step": 731 }, { "epoch": 0.135706340378198, "grad_norm": 5.75, "learning_rate": 9.864293659621802e-06, "loss": 3.377, "mean_token_accuracy": 0.3736515641855448, "step": 732 }, { "epoch": 0.13589173155357806, "grad_norm": 5.90625, "learning_rate": 9.864108268446423e-06, "loss": 3.122, "mean_token_accuracy": 0.40932708148523633, "step": 733 }, { "epoch": 0.1360771227289581, "grad_norm": 7.84375, "learning_rate": 9.863922877271043e-06, "loss": 3.045, "mean_token_accuracy": 0.407824455031038, "step": 734 }, { "epoch": 0.13626251390433816, "grad_norm": 5.93359375, "learning_rate": 9.863737486095662e-06, "loss": 2.9052, "mean_token_accuracy": 0.4120286164665939, "step": 735 }, { "epoch": 0.1364479050797182, "grad_norm": 6.86328125, "learning_rate": 9.863552094920283e-06, "loss": 2.9854, "mean_token_accuracy": 0.39720634920634923, "step": 736 }, { "epoch": 0.13663329625509826, "grad_norm": 8.3828125, "learning_rate": 9.863366703744901e-06, "loss": 2.5673, "mean_token_accuracy": 0.46193265007320644, "step": 737 }, { "epoch": 0.13681868743047831, "grad_norm": 6.7734375, "learning_rate": 9.863181312569524e-06, "loss": 3.1197, "mean_token_accuracy": 0.4005707878911426, "step": 738 }, { "epoch": 0.13700407860585836, "grad_norm": 6.328125, "learning_rate": 9.862995921394142e-06, "loss": 2.9011, "mean_token_accuracy": 0.4281341821743389, "step": 739 }, { "epoch": 0.13718946978123842, "grad_norm": 7.7578125, "learning_rate": 9.862810530218763e-06, "loss": 3.2804, "mean_token_accuracy": 0.39657297830374755, "step": 740 }, { "epoch": 0.13737486095661847, "grad_norm": 10.203125, "learning_rate": 9.862625139043382e-06, "loss": 2.973, "mean_token_accuracy": 0.40604960677555957, "step": 741 }, { "epoch": 0.13756025213199852, "grad_norm": 10.1015625, "learning_rate": 9.862439747868002e-06, "loss": 2.4093, "mean_token_accuracy": 0.47001584068793845, "step": 742 }, { "epoch": 0.13774564330737857, "grad_norm": 11.5234375, "learning_rate": 9.862254356692623e-06, "loss": 3.3727, "mean_token_accuracy": 0.36067346308310166, "step": 743 }, { "epoch": 0.13793103448275862, "grad_norm": 7.5859375, "learning_rate": 9.862068965517241e-06, "loss": 2.5664, "mean_token_accuracy": 0.49264998013508143, "step": 744 }, { "epoch": 0.13811642565813867, "grad_norm": 10.546875, "learning_rate": 9.861883574341862e-06, "loss": 2.6881, "mean_token_accuracy": 0.4430678466076696, "step": 745 }, { "epoch": 0.13830181683351872, "grad_norm": 7.5703125, "learning_rate": 9.86169818316648e-06, "loss": 2.9677, "mean_token_accuracy": 0.41450480149061203, "step": 746 }, { "epoch": 0.13848720800889877, "grad_norm": 8.171875, "learning_rate": 9.861512791991103e-06, "loss": 2.9933, "mean_token_accuracy": 0.41045454545454546, "step": 747 }, { "epoch": 0.13867259918427882, "grad_norm": 8.8046875, "learning_rate": 9.861327400815722e-06, "loss": 3.0249, "mean_token_accuracy": 0.4226044226044226, "step": 748 }, { "epoch": 0.13885799035965887, "grad_norm": 7.42578125, "learning_rate": 9.861142009640342e-06, "loss": 3.2242, "mean_token_accuracy": 0.39226460953186093, "step": 749 }, { "epoch": 0.13904338153503892, "grad_norm": 5.9609375, "learning_rate": 9.860956618464963e-06, "loss": 2.6403, "mean_token_accuracy": 0.4613674263479711, "step": 750 }, { "epoch": 0.13922877271041897, "grad_norm": 10.25, "learning_rate": 9.860771227289582e-06, "loss": 2.806, "mean_token_accuracy": 0.42448889865904593, "step": 751 }, { "epoch": 0.13941416388579903, "grad_norm": 7.19921875, "learning_rate": 9.860585836114202e-06, "loss": 2.7607, "mean_token_accuracy": 0.4307969615037981, "step": 752 }, { "epoch": 0.13959955506117908, "grad_norm": 7.1484375, "learning_rate": 9.86040044493882e-06, "loss": 3.3084, "mean_token_accuracy": 0.388254940161425, "step": 753 }, { "epoch": 0.13978494623655913, "grad_norm": 9.265625, "learning_rate": 9.860215053763441e-06, "loss": 2.8351, "mean_token_accuracy": 0.4377808988764045, "step": 754 }, { "epoch": 0.1399703374119392, "grad_norm": 10.8515625, "learning_rate": 9.860029662588062e-06, "loss": 2.7376, "mean_token_accuracy": 0.4554277498202732, "step": 755 }, { "epoch": 0.14015572858731926, "grad_norm": 6.63671875, "learning_rate": 9.859844271412682e-06, "loss": 3.2321, "mean_token_accuracy": 0.3894289864789362, "step": 756 }, { "epoch": 0.1403411197626993, "grad_norm": 7.44921875, "learning_rate": 9.859658880237301e-06, "loss": 2.6218, "mean_token_accuracy": 0.4551959489211801, "step": 757 }, { "epoch": 0.14052651093807936, "grad_norm": 5.6484375, "learning_rate": 9.859473489061922e-06, "loss": 3.1867, "mean_token_accuracy": 0.3927792915531335, "step": 758 }, { "epoch": 0.1407119021134594, "grad_norm": 7.45703125, "learning_rate": 9.859288097886542e-06, "loss": 3.4918, "mean_token_accuracy": 0.3734524369221125, "step": 759 }, { "epoch": 0.14089729328883946, "grad_norm": 6.5625, "learning_rate": 9.859102706711161e-06, "loss": 2.9985, "mean_token_accuracy": 0.41660759493670885, "step": 760 }, { "epoch": 0.1410826844642195, "grad_norm": 6.5546875, "learning_rate": 9.858917315535781e-06, "loss": 3.0234, "mean_token_accuracy": 0.40899110135213224, "step": 761 }, { "epoch": 0.14126807563959956, "grad_norm": 6.26171875, "learning_rate": 9.8587319243604e-06, "loss": 2.6739, "mean_token_accuracy": 0.4517402749341913, "step": 762 }, { "epoch": 0.1414534668149796, "grad_norm": 6.48828125, "learning_rate": 9.858546533185022e-06, "loss": 3.0713, "mean_token_accuracy": 0.4106090373280943, "step": 763 }, { "epoch": 0.14163885799035966, "grad_norm": 6.41015625, "learning_rate": 9.858361142009641e-06, "loss": 2.7636, "mean_token_accuracy": 0.44991534988713316, "step": 764 }, { "epoch": 0.1418242491657397, "grad_norm": 7.10546875, "learning_rate": 9.858175750834262e-06, "loss": 2.5906, "mean_token_accuracy": 0.4720234604105572, "step": 765 }, { "epoch": 0.14200964034111976, "grad_norm": 6.07421875, "learning_rate": 9.85799035965888e-06, "loss": 3.2116, "mean_token_accuracy": 0.3828075105377443, "step": 766 }, { "epoch": 0.14219503151649981, "grad_norm": 7.2890625, "learning_rate": 9.857804968483501e-06, "loss": 2.509, "mean_token_accuracy": 0.4621794037234735, "step": 767 }, { "epoch": 0.14238042269187987, "grad_norm": 8.46875, "learning_rate": 9.857619577308121e-06, "loss": 2.2698, "mean_token_accuracy": 0.5228154690218119, "step": 768 }, { "epoch": 0.14256581386725992, "grad_norm": 7.3359375, "learning_rate": 9.85743418613274e-06, "loss": 3.0399, "mean_token_accuracy": 0.41105919003115265, "step": 769 }, { "epoch": 0.14275120504263997, "grad_norm": 7.34375, "learning_rate": 9.85724879495736e-06, "loss": 2.9676, "mean_token_accuracy": 0.41291251964379255, "step": 770 }, { "epoch": 0.14293659621802002, "grad_norm": 6.56640625, "learning_rate": 9.857063403781981e-06, "loss": 2.975, "mean_token_accuracy": 0.4261959929126346, "step": 771 }, { "epoch": 0.14312198739340007, "grad_norm": 8.7265625, "learning_rate": 9.856878012606602e-06, "loss": 2.6482, "mean_token_accuracy": 0.45067458843916325, "step": 772 }, { "epoch": 0.14330737856878012, "grad_norm": 8.15625, "learning_rate": 9.85669262143122e-06, "loss": 3.1154, "mean_token_accuracy": 0.40451977401129946, "step": 773 }, { "epoch": 0.14349276974416017, "grad_norm": 6.75, "learning_rate": 9.856507230255841e-06, "loss": 3.0661, "mean_token_accuracy": 0.4118069520252801, "step": 774 }, { "epoch": 0.14367816091954022, "grad_norm": 7.4375, "learning_rate": 9.85632183908046e-06, "loss": 3.6535, "mean_token_accuracy": 0.36909323116219667, "step": 775 }, { "epoch": 0.14386355209492027, "grad_norm": 10.1484375, "learning_rate": 9.85613644790508e-06, "loss": 2.8093, "mean_token_accuracy": 0.42281771501925547, "step": 776 }, { "epoch": 0.14404894327030032, "grad_norm": 8.8203125, "learning_rate": 9.8559510567297e-06, "loss": 2.7525, "mean_token_accuracy": 0.44396351831813263, "step": 777 }, { "epoch": 0.14423433444568037, "grad_norm": 6.06640625, "learning_rate": 9.85576566555432e-06, "loss": 3.2859, "mean_token_accuracy": 0.3887994052781564, "step": 778 }, { "epoch": 0.14441972562106042, "grad_norm": 8.9140625, "learning_rate": 9.85558027437894e-06, "loss": 2.5743, "mean_token_accuracy": 0.4697271176805003, "step": 779 }, { "epoch": 0.1446051167964405, "grad_norm": 6.3203125, "learning_rate": 9.85539488320356e-06, "loss": 3.4889, "mean_token_accuracy": 0.3886387253204018, "step": 780 }, { "epoch": 0.14479050797182055, "grad_norm": 8.671875, "learning_rate": 9.855209492028181e-06, "loss": 3.4763, "mean_token_accuracy": 0.38178107208078205, "step": 781 }, { "epoch": 0.1449758991472006, "grad_norm": 8.0859375, "learning_rate": 9.8550241008528e-06, "loss": 2.5951, "mean_token_accuracy": 0.4481503345139709, "step": 782 }, { "epoch": 0.14516129032258066, "grad_norm": 7.65234375, "learning_rate": 9.85483870967742e-06, "loss": 2.8362, "mean_token_accuracy": 0.4366177099672038, "step": 783 }, { "epoch": 0.1453466814979607, "grad_norm": 7.78515625, "learning_rate": 9.854653318502039e-06, "loss": 2.9784, "mean_token_accuracy": 0.40820424555364315, "step": 784 }, { "epoch": 0.14553207267334076, "grad_norm": 8.5390625, "learning_rate": 9.85446792732666e-06, "loss": 2.7548, "mean_token_accuracy": 0.44809133071708884, "step": 785 }, { "epoch": 0.1457174638487208, "grad_norm": 6.03125, "learning_rate": 9.85428253615128e-06, "loss": 3.6404, "mean_token_accuracy": 0.3442300118156755, "step": 786 }, { "epoch": 0.14590285502410086, "grad_norm": 9.578125, "learning_rate": 9.8540971449759e-06, "loss": 3.0064, "mean_token_accuracy": 0.4064428721962969, "step": 787 }, { "epoch": 0.1460882461994809, "grad_norm": 7.6171875, "learning_rate": 9.853911753800521e-06, "loss": 2.9781, "mean_token_accuracy": 0.4061837258622614, "step": 788 }, { "epoch": 0.14627363737486096, "grad_norm": 5.88671875, "learning_rate": 9.85372636262514e-06, "loss": 3.2133, "mean_token_accuracy": 0.38737244897959183, "step": 789 }, { "epoch": 0.146459028550241, "grad_norm": 7.0234375, "learning_rate": 9.85354097144976e-06, "loss": 3.3739, "mean_token_accuracy": 0.3860845839017735, "step": 790 }, { "epoch": 0.14664441972562106, "grad_norm": 9.7265625, "learning_rate": 9.85335558027438e-06, "loss": 2.9039, "mean_token_accuracy": 0.42658672126352737, "step": 791 }, { "epoch": 0.1468298109010011, "grad_norm": 8.3828125, "learning_rate": 9.853170189099e-06, "loss": 3.1087, "mean_token_accuracy": 0.4005235602094241, "step": 792 }, { "epoch": 0.14701520207638116, "grad_norm": 5.421875, "learning_rate": 9.852984797923618e-06, "loss": 3.0261, "mean_token_accuracy": 0.4257202881152461, "step": 793 }, { "epoch": 0.1472005932517612, "grad_norm": 9.875, "learning_rate": 9.852799406748239e-06, "loss": 2.646, "mean_token_accuracy": 0.459673730751639, "step": 794 }, { "epoch": 0.14738598442714126, "grad_norm": 9.3203125, "learning_rate": 9.85261401557286e-06, "loss": 2.7779, "mean_token_accuracy": 0.44240048250904707, "step": 795 }, { "epoch": 0.14757137560252132, "grad_norm": 7.65625, "learning_rate": 9.85242862439748e-06, "loss": 2.5365, "mean_token_accuracy": 0.4512967610852157, "step": 796 }, { "epoch": 0.14775676677790137, "grad_norm": 9.2890625, "learning_rate": 9.8522432332221e-06, "loss": 3.1935, "mean_token_accuracy": 0.369736621372253, "step": 797 }, { "epoch": 0.14794215795328142, "grad_norm": 8.5234375, "learning_rate": 9.85205784204672e-06, "loss": 2.4953, "mean_token_accuracy": 0.48764492329312564, "step": 798 }, { "epoch": 0.14812754912866147, "grad_norm": 6.46875, "learning_rate": 9.85187245087134e-06, "loss": 2.7617, "mean_token_accuracy": 0.4278936196778624, "step": 799 }, { "epoch": 0.14831294030404152, "grad_norm": 8.2265625, "learning_rate": 9.851687059695958e-06, "loss": 2.8215, "mean_token_accuracy": 0.4289504036908881, "step": 800 }, { "epoch": 0.14849833147942157, "grad_norm": 7.09765625, "learning_rate": 9.851501668520579e-06, "loss": 3.0931, "mean_token_accuracy": 0.40274963820549925, "step": 801 }, { "epoch": 0.14868372265480162, "grad_norm": 6.65625, "learning_rate": 9.8513162773452e-06, "loss": 3.3212, "mean_token_accuracy": 0.3741313606814616, "step": 802 }, { "epoch": 0.14886911383018167, "grad_norm": 7.2421875, "learning_rate": 9.85113088616982e-06, "loss": 2.6435, "mean_token_accuracy": 0.4512826282628263, "step": 803 }, { "epoch": 0.14905450500556172, "grad_norm": 7.01171875, "learning_rate": 9.850945494994439e-06, "loss": 2.4941, "mean_token_accuracy": 0.45881397238017874, "step": 804 }, { "epoch": 0.1492398961809418, "grad_norm": 7.5859375, "learning_rate": 9.85076010381906e-06, "loss": 3.2045, "mean_token_accuracy": 0.39095197774283297, "step": 805 }, { "epoch": 0.14942528735632185, "grad_norm": 5.65234375, "learning_rate": 9.85057471264368e-06, "loss": 2.8881, "mean_token_accuracy": 0.4630516592541909, "step": 806 }, { "epoch": 0.1496106785317019, "grad_norm": 5.91796875, "learning_rate": 9.850389321468299e-06, "loss": 2.9351, "mean_token_accuracy": 0.4184427394146064, "step": 807 }, { "epoch": 0.14979606970708195, "grad_norm": 8.6328125, "learning_rate": 9.850203930292919e-06, "loss": 2.6712, "mean_token_accuracy": 0.4437269372693727, "step": 808 }, { "epoch": 0.149981460882462, "grad_norm": 7.703125, "learning_rate": 9.850018539117538e-06, "loss": 2.8661, "mean_token_accuracy": 0.40842204132748905, "step": 809 }, { "epoch": 0.15016685205784205, "grad_norm": 14.6171875, "learning_rate": 9.849833147942158e-06, "loss": 2.5576, "mean_token_accuracy": 0.4495798319327731, "step": 810 }, { "epoch": 0.1503522432332221, "grad_norm": 7.91015625, "learning_rate": 9.849647756766779e-06, "loss": 2.9951, "mean_token_accuracy": 0.42762465811066697, "step": 811 }, { "epoch": 0.15053763440860216, "grad_norm": 7.69921875, "learning_rate": 9.8494623655914e-06, "loss": 2.8932, "mean_token_accuracy": 0.4242622950819672, "step": 812 }, { "epoch": 0.1507230255839822, "grad_norm": 9.75, "learning_rate": 9.849276974416018e-06, "loss": 3.1525, "mean_token_accuracy": 0.39692242833052277, "step": 813 }, { "epoch": 0.15090841675936226, "grad_norm": 9.359375, "learning_rate": 9.849091583240639e-06, "loss": 3.0924, "mean_token_accuracy": 0.40548554484803556, "step": 814 }, { "epoch": 0.1510938079347423, "grad_norm": 9.0078125, "learning_rate": 9.848906192065259e-06, "loss": 3.1264, "mean_token_accuracy": 0.37162837162837165, "step": 815 }, { "epoch": 0.15127919911012236, "grad_norm": 6.90625, "learning_rate": 9.848720800889878e-06, "loss": 2.5484, "mean_token_accuracy": 0.4744904418280795, "step": 816 }, { "epoch": 0.1514645902855024, "grad_norm": 9.2109375, "learning_rate": 9.848535409714498e-06, "loss": 3.0358, "mean_token_accuracy": 0.4024390243902439, "step": 817 }, { "epoch": 0.15164998146088246, "grad_norm": 8.9921875, "learning_rate": 9.848350018539117e-06, "loss": 2.6225, "mean_token_accuracy": 0.44414292175486203, "step": 818 }, { "epoch": 0.1518353726362625, "grad_norm": 9.203125, "learning_rate": 9.84816462736374e-06, "loss": 3.1218, "mean_token_accuracy": 0.4061935172912399, "step": 819 }, { "epoch": 0.15202076381164256, "grad_norm": 10.4765625, "learning_rate": 9.847979236188358e-06, "loss": 2.6542, "mean_token_accuracy": 0.45192066281697213, "step": 820 }, { "epoch": 0.1522061549870226, "grad_norm": 8.6484375, "learning_rate": 9.847793845012979e-06, "loss": 2.8867, "mean_token_accuracy": 0.4193506993455665, "step": 821 }, { "epoch": 0.15239154616240266, "grad_norm": 9.6640625, "learning_rate": 9.847608453837597e-06, "loss": 3.1338, "mean_token_accuracy": 0.3854957507082153, "step": 822 }, { "epoch": 0.15257693733778271, "grad_norm": 7.94921875, "learning_rate": 9.847423062662218e-06, "loss": 3.2794, "mean_token_accuracy": 0.3942507645259939, "step": 823 }, { "epoch": 0.15276232851316277, "grad_norm": 6.21875, "learning_rate": 9.847237671486838e-06, "loss": 2.9785, "mean_token_accuracy": 0.41218826835265193, "step": 824 }, { "epoch": 0.15294771968854282, "grad_norm": 12.703125, "learning_rate": 9.847052280311457e-06, "loss": 2.5551, "mean_token_accuracy": 0.48514375075183447, "step": 825 }, { "epoch": 0.15313311086392287, "grad_norm": 7.171875, "learning_rate": 9.846866889136078e-06, "loss": 3.058, "mean_token_accuracy": 0.4107867521926954, "step": 826 }, { "epoch": 0.15331850203930292, "grad_norm": 7.75, "learning_rate": 9.846681497960698e-06, "loss": 3.0017, "mean_token_accuracy": 0.4088648332358104, "step": 827 }, { "epoch": 0.15350389321468297, "grad_norm": 6.71484375, "learning_rate": 9.846496106785319e-06, "loss": 3.0407, "mean_token_accuracy": 0.41314935064935066, "step": 828 }, { "epoch": 0.15368928439006305, "grad_norm": 6.59375, "learning_rate": 9.846310715609937e-06, "loss": 3.0433, "mean_token_accuracy": 0.4014628199918732, "step": 829 }, { "epoch": 0.1538746755654431, "grad_norm": 6.4296875, "learning_rate": 9.846125324434558e-06, "loss": 2.5572, "mean_token_accuracy": 0.47892011834319526, "step": 830 }, { "epoch": 0.15406006674082315, "grad_norm": 9.234375, "learning_rate": 9.845939933259177e-06, "loss": 2.5824, "mean_token_accuracy": 0.4693154034229829, "step": 831 }, { "epoch": 0.1542454579162032, "grad_norm": 6.515625, "learning_rate": 9.845754542083797e-06, "loss": 3.3004, "mean_token_accuracy": 0.38484621155288823, "step": 832 }, { "epoch": 0.15443084909158325, "grad_norm": 7.5234375, "learning_rate": 9.845569150908418e-06, "loss": 2.9477, "mean_token_accuracy": 0.42285553839674295, "step": 833 }, { "epoch": 0.1546162402669633, "grad_norm": 10.75, "learning_rate": 9.845383759733037e-06, "loss": 2.4653, "mean_token_accuracy": 0.461119927454092, "step": 834 }, { "epoch": 0.15480163144234335, "grad_norm": 9.875, "learning_rate": 9.845198368557659e-06, "loss": 2.6306, "mean_token_accuracy": 0.45259219668626405, "step": 835 }, { "epoch": 0.1549870226177234, "grad_norm": 10.59375, "learning_rate": 9.845012977382278e-06, "loss": 2.7674, "mean_token_accuracy": 0.438645585560375, "step": 836 }, { "epoch": 0.15517241379310345, "grad_norm": 7.1640625, "learning_rate": 9.844827586206898e-06, "loss": 3.2029, "mean_token_accuracy": 0.38777717685235263, "step": 837 }, { "epoch": 0.1553578049684835, "grad_norm": 7.6796875, "learning_rate": 9.844642195031517e-06, "loss": 3.1878, "mean_token_accuracy": 0.42232225300092335, "step": 838 }, { "epoch": 0.15554319614386355, "grad_norm": 11.7109375, "learning_rate": 9.844456803856137e-06, "loss": 3.455, "mean_token_accuracy": 0.36750832408435075, "step": 839 }, { "epoch": 0.1557285873192436, "grad_norm": 6.89453125, "learning_rate": 9.844271412680758e-06, "loss": 3.1067, "mean_token_accuracy": 0.4134655101197852, "step": 840 }, { "epoch": 0.15591397849462366, "grad_norm": 8.921875, "learning_rate": 9.844086021505377e-06, "loss": 2.8434, "mean_token_accuracy": 0.42536265793167993, "step": 841 }, { "epoch": 0.1560993696700037, "grad_norm": 8.6953125, "learning_rate": 9.843900630329997e-06, "loss": 2.9312, "mean_token_accuracy": 0.4146890113598021, "step": 842 }, { "epoch": 0.15628476084538376, "grad_norm": 10.875, "learning_rate": 9.843715239154618e-06, "loss": 2.7913, "mean_token_accuracy": 0.42761656341701987, "step": 843 }, { "epoch": 0.1564701520207638, "grad_norm": 8.6875, "learning_rate": 9.843529847979238e-06, "loss": 3.2152, "mean_token_accuracy": 0.3933711737279546, "step": 844 }, { "epoch": 0.15665554319614386, "grad_norm": 7.53125, "learning_rate": 9.843344456803857e-06, "loss": 2.8734, "mean_token_accuracy": 0.4027480916030534, "step": 845 }, { "epoch": 0.1568409343715239, "grad_norm": 8.125, "learning_rate": 9.843159065628477e-06, "loss": 2.6002, "mean_token_accuracy": 0.47222923504165615, "step": 846 }, { "epoch": 0.15702632554690396, "grad_norm": 9.6796875, "learning_rate": 9.842973674453096e-06, "loss": 3.3205, "mean_token_accuracy": 0.3829444891391794, "step": 847 }, { "epoch": 0.157211716722284, "grad_norm": 7.296875, "learning_rate": 9.842788283277717e-06, "loss": 3.3651, "mean_token_accuracy": 0.37548226509023025, "step": 848 }, { "epoch": 0.15739710789766406, "grad_norm": 8.7578125, "learning_rate": 9.842602892102337e-06, "loss": 3.1086, "mean_token_accuracy": 0.3885805763073639, "step": 849 }, { "epoch": 0.1575824990730441, "grad_norm": 8.8984375, "learning_rate": 9.842417500926956e-06, "loss": 3.0153, "mean_token_accuracy": 0.40098593242755803, "step": 850 }, { "epoch": 0.15776789024842416, "grad_norm": 8.84375, "learning_rate": 9.842232109751576e-06, "loss": 2.7477, "mean_token_accuracy": 0.44428880682541705, "step": 851 }, { "epoch": 0.15795328142380421, "grad_norm": 7.90234375, "learning_rate": 9.842046718576197e-06, "loss": 3.3033, "mean_token_accuracy": 0.38580060422960727, "step": 852 }, { "epoch": 0.15813867259918427, "grad_norm": 13.7109375, "learning_rate": 9.841861327400817e-06, "loss": 2.4676, "mean_token_accuracy": 0.4555924958442175, "step": 853 }, { "epoch": 0.15832406377456434, "grad_norm": 9.6171875, "learning_rate": 9.841675936225436e-06, "loss": 2.9422, "mean_token_accuracy": 0.40244248792956544, "step": 854 }, { "epoch": 0.1585094549499444, "grad_norm": 7.1640625, "learning_rate": 9.841490545050057e-06, "loss": 2.8004, "mean_token_accuracy": 0.423541915769895, "step": 855 }, { "epoch": 0.15869484612532445, "grad_norm": 8.84375, "learning_rate": 9.841305153874676e-06, "loss": 2.7139, "mean_token_accuracy": 0.4201044119152421, "step": 856 }, { "epoch": 0.1588802373007045, "grad_norm": 8.7109375, "learning_rate": 9.841119762699296e-06, "loss": 2.7176, "mean_token_accuracy": 0.4476150510560918, "step": 857 }, { "epoch": 0.15906562847608455, "grad_norm": 8.09375, "learning_rate": 9.840934371523916e-06, "loss": 3.0865, "mean_token_accuracy": 0.405218525766471, "step": 858 }, { "epoch": 0.1592510196514646, "grad_norm": 7.15625, "learning_rate": 9.840748980348537e-06, "loss": 3.3838, "mean_token_accuracy": 0.36763754045307445, "step": 859 }, { "epoch": 0.15943641082684465, "grad_norm": 10.640625, "learning_rate": 9.840563589173156e-06, "loss": 3.0425, "mean_token_accuracy": 0.4020387588215526, "step": 860 }, { "epoch": 0.1596218020022247, "grad_norm": 7.328125, "learning_rate": 9.840378197997776e-06, "loss": 2.8725, "mean_token_accuracy": 0.43518271539077086, "step": 861 }, { "epoch": 0.15980719317760475, "grad_norm": 5.49609375, "learning_rate": 9.840192806822397e-06, "loss": 3.0704, "mean_token_accuracy": 0.39563271395632715, "step": 862 }, { "epoch": 0.1599925843529848, "grad_norm": 8.953125, "learning_rate": 9.840007415647016e-06, "loss": 2.8088, "mean_token_accuracy": 0.42668037527310115, "step": 863 }, { "epoch": 0.16017797552836485, "grad_norm": 9.2109375, "learning_rate": 9.839822024471636e-06, "loss": 2.9694, "mean_token_accuracy": 0.41420861051838836, "step": 864 }, { "epoch": 0.1603633667037449, "grad_norm": 6.83984375, "learning_rate": 9.839636633296255e-06, "loss": 2.9639, "mean_token_accuracy": 0.426615064007145, "step": 865 }, { "epoch": 0.16054875787912495, "grad_norm": 6.4609375, "learning_rate": 9.839451242120875e-06, "loss": 2.4992, "mean_token_accuracy": 0.465635507733692, "step": 866 }, { "epoch": 0.160734149054505, "grad_norm": 10.6640625, "learning_rate": 9.839265850945496e-06, "loss": 2.6242, "mean_token_accuracy": 0.42900403768506057, "step": 867 }, { "epoch": 0.16091954022988506, "grad_norm": 6.3359375, "learning_rate": 9.839080459770116e-06, "loss": 3.102, "mean_token_accuracy": 0.4029791195637718, "step": 868 }, { "epoch": 0.1611049314052651, "grad_norm": 7.25390625, "learning_rate": 9.838895068594737e-06, "loss": 3.1046, "mean_token_accuracy": 0.40268617340208657, "step": 869 }, { "epoch": 0.16129032258064516, "grad_norm": 7.2421875, "learning_rate": 9.838709677419356e-06, "loss": 2.9799, "mean_token_accuracy": 0.4182015167930661, "step": 870 }, { "epoch": 0.1614757137560252, "grad_norm": 7.86328125, "learning_rate": 9.838524286243976e-06, "loss": 2.428, "mean_token_accuracy": 0.4929448786925185, "step": 871 }, { "epoch": 0.16166110493140526, "grad_norm": 7.06640625, "learning_rate": 9.838338895068595e-06, "loss": 3.1815, "mean_token_accuracy": 0.37569850552306694, "step": 872 }, { "epoch": 0.1618464961067853, "grad_norm": 8.1640625, "learning_rate": 9.838153503893215e-06, "loss": 2.9097, "mean_token_accuracy": 0.4155888748998512, "step": 873 }, { "epoch": 0.16203188728216536, "grad_norm": 6.66796875, "learning_rate": 9.837968112717834e-06, "loss": 2.9409, "mean_token_accuracy": 0.4218113975576662, "step": 874 }, { "epoch": 0.1622172784575454, "grad_norm": 8.9453125, "learning_rate": 9.837782721542455e-06, "loss": 2.719, "mean_token_accuracy": 0.4326935631283457, "step": 875 }, { "epoch": 0.16240266963292546, "grad_norm": 14.890625, "learning_rate": 9.837597330367075e-06, "loss": 2.5295, "mean_token_accuracy": 0.4559766108174969, "step": 876 }, { "epoch": 0.1625880608083055, "grad_norm": 7.08984375, "learning_rate": 9.837411939191696e-06, "loss": 2.9903, "mean_token_accuracy": 0.42188698542572234, "step": 877 }, { "epoch": 0.16277345198368556, "grad_norm": 7.546875, "learning_rate": 9.837226548016316e-06, "loss": 2.9975, "mean_token_accuracy": 0.4095304835318851, "step": 878 }, { "epoch": 0.16295884315906564, "grad_norm": 8.1953125, "learning_rate": 9.837041156840935e-06, "loss": 2.8369, "mean_token_accuracy": 0.42687011537030145, "step": 879 }, { "epoch": 0.1631442343344457, "grad_norm": 7.5078125, "learning_rate": 9.836855765665555e-06, "loss": 2.4864, "mean_token_accuracy": 0.4581965142712806, "step": 880 }, { "epoch": 0.16332962550982574, "grad_norm": 8.7265625, "learning_rate": 9.836670374490174e-06, "loss": 2.2708, "mean_token_accuracy": 0.49271422357546757, "step": 881 }, { "epoch": 0.1635150166852058, "grad_norm": 10.0390625, "learning_rate": 9.836484983314795e-06, "loss": 2.6299, "mean_token_accuracy": 0.45739005046863734, "step": 882 }, { "epoch": 0.16370040786058584, "grad_norm": 9.0859375, "learning_rate": 9.836299592139415e-06, "loss": 2.5935, "mean_token_accuracy": 0.4530878115996493, "step": 883 }, { "epoch": 0.1638857990359659, "grad_norm": 6.1875, "learning_rate": 9.836114200964036e-06, "loss": 2.7587, "mean_token_accuracy": 0.43889588821440845, "step": 884 }, { "epoch": 0.16407119021134595, "grad_norm": 11.046875, "learning_rate": 9.835928809788655e-06, "loss": 2.566, "mean_token_accuracy": 0.4770997846374731, "step": 885 }, { "epoch": 0.164256581386726, "grad_norm": 6.640625, "learning_rate": 9.835743418613275e-06, "loss": 3.6766, "mean_token_accuracy": 0.3553476682490924, "step": 886 }, { "epoch": 0.16444197256210605, "grad_norm": 5.9609375, "learning_rate": 9.835558027437895e-06, "loss": 2.9207, "mean_token_accuracy": 0.4325127334465195, "step": 887 }, { "epoch": 0.1646273637374861, "grad_norm": 7.98828125, "learning_rate": 9.835372636262514e-06, "loss": 3.0047, "mean_token_accuracy": 0.42887544802867383, "step": 888 }, { "epoch": 0.16481275491286615, "grad_norm": 7.10546875, "learning_rate": 9.835187245087135e-06, "loss": 3.0301, "mean_token_accuracy": 0.41472980825101685, "step": 889 }, { "epoch": 0.1649981460882462, "grad_norm": 5.5546875, "learning_rate": 9.835001853911754e-06, "loss": 3.1827, "mean_token_accuracy": 0.3912381780628329, "step": 890 }, { "epoch": 0.16518353726362625, "grad_norm": 6.95703125, "learning_rate": 9.834816462736374e-06, "loss": 3.2358, "mean_token_accuracy": 0.39298099597725067, "step": 891 }, { "epoch": 0.1653689284390063, "grad_norm": 7.484375, "learning_rate": 9.834631071560995e-06, "loss": 3.115, "mean_token_accuracy": 0.4002200522624123, "step": 892 }, { "epoch": 0.16555431961438635, "grad_norm": 6.36328125, "learning_rate": 9.834445680385615e-06, "loss": 2.7536, "mean_token_accuracy": 0.4423791821561338, "step": 893 }, { "epoch": 0.1657397107897664, "grad_norm": 10.5390625, "learning_rate": 9.834260289210234e-06, "loss": 2.8993, "mean_token_accuracy": 0.42203570161957676, "step": 894 }, { "epoch": 0.16592510196514645, "grad_norm": 6.16015625, "learning_rate": 9.834074898034854e-06, "loss": 3.2818, "mean_token_accuracy": 0.39052152317880795, "step": 895 }, { "epoch": 0.1661104931405265, "grad_norm": 6.75390625, "learning_rate": 9.833889506859475e-06, "loss": 3.1184, "mean_token_accuracy": 0.40248468345813476, "step": 896 }, { "epoch": 0.16629588431590656, "grad_norm": 8.109375, "learning_rate": 9.833704115684094e-06, "loss": 2.8024, "mean_token_accuracy": 0.4240407204385278, "step": 897 }, { "epoch": 0.1664812754912866, "grad_norm": 6.67578125, "learning_rate": 9.833518724508714e-06, "loss": 3.0508, "mean_token_accuracy": 0.3948530339346537, "step": 898 }, { "epoch": 0.16666666666666666, "grad_norm": 9.546875, "learning_rate": 9.833333333333333e-06, "loss": 3.1853, "mean_token_accuracy": 0.391392610637434, "step": 899 }, { "epoch": 0.1668520578420467, "grad_norm": 6.68359375, "learning_rate": 9.833147942157955e-06, "loss": 2.9536, "mean_token_accuracy": 0.43108345732449616, "step": 900 }, { "epoch": 0.16703744901742676, "grad_norm": 6.625, "learning_rate": 9.832962550982574e-06, "loss": 2.5083, "mean_token_accuracy": 0.4743078500626638, "step": 901 }, { "epoch": 0.1672228401928068, "grad_norm": 12.2265625, "learning_rate": 9.832777159807194e-06, "loss": 3.0457, "mean_token_accuracy": 0.4022612723062253, "step": 902 }, { "epoch": 0.16740823136818686, "grad_norm": 8.796875, "learning_rate": 9.832591768631813e-06, "loss": 3.3727, "mean_token_accuracy": 0.3941884355738186, "step": 903 }, { "epoch": 0.16759362254356694, "grad_norm": 11.078125, "learning_rate": 9.832406377456434e-06, "loss": 2.8555, "mean_token_accuracy": 0.4276098719809951, "step": 904 }, { "epoch": 0.167779013718947, "grad_norm": 11.4453125, "learning_rate": 9.832220986281054e-06, "loss": 2.6867, "mean_token_accuracy": 0.4577971646673937, "step": 905 }, { "epoch": 0.16796440489432704, "grad_norm": 8.9140625, "learning_rate": 9.832035595105673e-06, "loss": 2.989, "mean_token_accuracy": 0.4172646227440748, "step": 906 }, { "epoch": 0.1681497960697071, "grad_norm": 8.3515625, "learning_rate": 9.831850203930293e-06, "loss": 2.9702, "mean_token_accuracy": 0.42941757156959526, "step": 907 }, { "epoch": 0.16833518724508714, "grad_norm": 8.8359375, "learning_rate": 9.831664812754914e-06, "loss": 2.8582, "mean_token_accuracy": 0.4194536322430479, "step": 908 }, { "epoch": 0.1685205784204672, "grad_norm": 8.8125, "learning_rate": 9.831479421579534e-06, "loss": 3.2169, "mean_token_accuracy": 0.37999403697078116, "step": 909 }, { "epoch": 0.16870596959584724, "grad_norm": 9.7734375, "learning_rate": 9.831294030404153e-06, "loss": 3.4214, "mean_token_accuracy": 0.3838383838383838, "step": 910 }, { "epoch": 0.1688913607712273, "grad_norm": 7.76953125, "learning_rate": 9.831108639228774e-06, "loss": 2.8021, "mean_token_accuracy": 0.449582236465606, "step": 911 }, { "epoch": 0.16907675194660735, "grad_norm": 9.09375, "learning_rate": 9.830923248053393e-06, "loss": 2.8376, "mean_token_accuracy": 0.4329275103317324, "step": 912 }, { "epoch": 0.1692621431219874, "grad_norm": 7.80078125, "learning_rate": 9.830737856878013e-06, "loss": 3.6041, "mean_token_accuracy": 0.35638665132336017, "step": 913 }, { "epoch": 0.16944753429736745, "grad_norm": 8.125, "learning_rate": 9.830552465702634e-06, "loss": 3.0355, "mean_token_accuracy": 0.4033574618820268, "step": 914 }, { "epoch": 0.1696329254727475, "grad_norm": 7.69140625, "learning_rate": 9.830367074527252e-06, "loss": 3.2878, "mean_token_accuracy": 0.37535730404693846, "step": 915 }, { "epoch": 0.16981831664812755, "grad_norm": 10.15625, "learning_rate": 9.830181683351874e-06, "loss": 2.7633, "mean_token_accuracy": 0.4202977735282065, "step": 916 }, { "epoch": 0.1700037078235076, "grad_norm": 11.078125, "learning_rate": 9.829996292176493e-06, "loss": 2.9776, "mean_token_accuracy": 0.42373367771559767, "step": 917 }, { "epoch": 0.17018909899888765, "grad_norm": 10.7421875, "learning_rate": 9.829810901001114e-06, "loss": 3.1208, "mean_token_accuracy": 0.39517230909366935, "step": 918 }, { "epoch": 0.1703744901742677, "grad_norm": 6.74609375, "learning_rate": 9.829625509825733e-06, "loss": 3.0683, "mean_token_accuracy": 0.3957353928811283, "step": 919 }, { "epoch": 0.17055988134964775, "grad_norm": 9.53125, "learning_rate": 9.829440118650353e-06, "loss": 3.0807, "mean_token_accuracy": 0.41297676457693994, "step": 920 }, { "epoch": 0.1707452725250278, "grad_norm": 10.5703125, "learning_rate": 9.829254727474974e-06, "loss": 2.9419, "mean_token_accuracy": 0.4179579707068563, "step": 921 }, { "epoch": 0.17093066370040785, "grad_norm": 8.4140625, "learning_rate": 9.829069336299592e-06, "loss": 2.5302, "mean_token_accuracy": 0.4503933011925907, "step": 922 }, { "epoch": 0.1711160548757879, "grad_norm": 8.875, "learning_rate": 9.828883945124213e-06, "loss": 2.6073, "mean_token_accuracy": 0.44781718963165074, "step": 923 }, { "epoch": 0.17130144605116795, "grad_norm": 7.0546875, "learning_rate": 9.828698553948833e-06, "loss": 3.2873, "mean_token_accuracy": 0.37283500455788515, "step": 924 }, { "epoch": 0.171486837226548, "grad_norm": 9.1328125, "learning_rate": 9.828513162773454e-06, "loss": 2.7549, "mean_token_accuracy": 0.4229826353421859, "step": 925 }, { "epoch": 0.17167222840192806, "grad_norm": 6.18359375, "learning_rate": 9.828327771598073e-06, "loss": 2.8575, "mean_token_accuracy": 0.4302970541106865, "step": 926 }, { "epoch": 0.1718576195773081, "grad_norm": 5.80859375, "learning_rate": 9.828142380422693e-06, "loss": 3.078, "mean_token_accuracy": 0.41122956645344705, "step": 927 }, { "epoch": 0.17204301075268819, "grad_norm": 7.80859375, "learning_rate": 9.827956989247312e-06, "loss": 3.1563, "mean_token_accuracy": 0.40778401122019636, "step": 928 }, { "epoch": 0.17222840192806824, "grad_norm": 6.5546875, "learning_rate": 9.827771598071932e-06, "loss": 2.8492, "mean_token_accuracy": 0.42769857433808556, "step": 929 }, { "epoch": 0.1724137931034483, "grad_norm": 6.0859375, "learning_rate": 9.827586206896553e-06, "loss": 2.9861, "mean_token_accuracy": 0.3925370941677685, "step": 930 }, { "epoch": 0.17259918427882834, "grad_norm": 7.4921875, "learning_rate": 9.827400815721172e-06, "loss": 2.8912, "mean_token_accuracy": 0.41513458608430676, "step": 931 }, { "epoch": 0.1727845754542084, "grad_norm": 6.90234375, "learning_rate": 9.827215424545792e-06, "loss": 2.6906, "mean_token_accuracy": 0.45125628140703516, "step": 932 }, { "epoch": 0.17296996662958844, "grad_norm": 7.34375, "learning_rate": 9.827030033370413e-06, "loss": 2.5551, "mean_token_accuracy": 0.45125895125895127, "step": 933 }, { "epoch": 0.1731553578049685, "grad_norm": 9.640625, "learning_rate": 9.826844642195033e-06, "loss": 3.7212, "mean_token_accuracy": 0.3551655083048651, "step": 934 }, { "epoch": 0.17334074898034854, "grad_norm": 6.47265625, "learning_rate": 9.826659251019652e-06, "loss": 2.6948, "mean_token_accuracy": 0.4425101782956619, "step": 935 }, { "epoch": 0.1735261401557286, "grad_norm": 6.6640625, "learning_rate": 9.826473859844272e-06, "loss": 3.0416, "mean_token_accuracy": 0.41005147656461666, "step": 936 }, { "epoch": 0.17371153133110864, "grad_norm": 6.3671875, "learning_rate": 9.826288468668891e-06, "loss": 2.8398, "mean_token_accuracy": 0.43751487741014045, "step": 937 }, { "epoch": 0.1738969225064887, "grad_norm": 8.2890625, "learning_rate": 9.826103077493512e-06, "loss": 2.6089, "mean_token_accuracy": 0.4540399858038566, "step": 938 }, { "epoch": 0.17408231368186874, "grad_norm": 6.36328125, "learning_rate": 9.825917686318132e-06, "loss": 3.3504, "mean_token_accuracy": 0.3643364034425487, "step": 939 }, { "epoch": 0.1742677048572488, "grad_norm": 10.8515625, "learning_rate": 9.825732295142753e-06, "loss": 2.6028, "mean_token_accuracy": 0.4558444902162719, "step": 940 }, { "epoch": 0.17445309603262885, "grad_norm": 5.93359375, "learning_rate": 9.825546903967372e-06, "loss": 3.5589, "mean_token_accuracy": 0.3567421566590484, "step": 941 }, { "epoch": 0.1746384872080089, "grad_norm": 6.6015625, "learning_rate": 9.825361512791992e-06, "loss": 2.5763, "mean_token_accuracy": 0.4640759150474469, "step": 942 }, { "epoch": 0.17482387838338895, "grad_norm": 8.5546875, "learning_rate": 9.825176121616613e-06, "loss": 2.8157, "mean_token_accuracy": 0.43114952987564453, "step": 943 }, { "epoch": 0.175009269558769, "grad_norm": 8.8984375, "learning_rate": 9.824990730441231e-06, "loss": 2.6311, "mean_token_accuracy": 0.45034553365753777, "step": 944 }, { "epoch": 0.17519466073414905, "grad_norm": 7.09765625, "learning_rate": 9.824805339265852e-06, "loss": 3.032, "mean_token_accuracy": 0.4153272101033295, "step": 945 }, { "epoch": 0.1753800519095291, "grad_norm": 5.87890625, "learning_rate": 9.82461994809047e-06, "loss": 3.0415, "mean_token_accuracy": 0.4156521739130435, "step": 946 }, { "epoch": 0.17556544308490915, "grad_norm": 6.6171875, "learning_rate": 9.824434556915091e-06, "loss": 2.9468, "mean_token_accuracy": 0.41203838325094505, "step": 947 }, { "epoch": 0.1757508342602892, "grad_norm": 6.09375, "learning_rate": 9.824249165739712e-06, "loss": 2.6942, "mean_token_accuracy": 0.44108949416342413, "step": 948 }, { "epoch": 0.17593622543566925, "grad_norm": 6.671875, "learning_rate": 9.824063774564332e-06, "loss": 2.8791, "mean_token_accuracy": 0.4248938087269919, "step": 949 }, { "epoch": 0.1761216166110493, "grad_norm": 6.6875, "learning_rate": 9.823878383388951e-06, "loss": 3.0908, "mean_token_accuracy": 0.40885381837635415, "step": 950 }, { "epoch": 0.17630700778642935, "grad_norm": 5.80859375, "learning_rate": 9.823692992213571e-06, "loss": 2.7206, "mean_token_accuracy": 0.44763470781684794, "step": 951 }, { "epoch": 0.1764923989618094, "grad_norm": 15.25, "learning_rate": 9.823507601038192e-06, "loss": 2.9571, "mean_token_accuracy": 0.40001937608990507, "step": 952 }, { "epoch": 0.17667779013718948, "grad_norm": 6.4921875, "learning_rate": 9.82332220986281e-06, "loss": 3.2931, "mean_token_accuracy": 0.37936225537156154, "step": 953 }, { "epoch": 0.17686318131256953, "grad_norm": 7.2734375, "learning_rate": 9.823136818687431e-06, "loss": 2.6598, "mean_token_accuracy": 0.45132444744390077, "step": 954 }, { "epoch": 0.17704857248794958, "grad_norm": 7.16015625, "learning_rate": 9.82295142751205e-06, "loss": 2.9082, "mean_token_accuracy": 0.43122376493380693, "step": 955 }, { "epoch": 0.17723396366332964, "grad_norm": 5.98046875, "learning_rate": 9.822766036336672e-06, "loss": 3.1786, "mean_token_accuracy": 0.41091632475444484, "step": 956 }, { "epoch": 0.1774193548387097, "grad_norm": 8.875, "learning_rate": 9.822580645161291e-06, "loss": 3.1282, "mean_token_accuracy": 0.3942307692307692, "step": 957 }, { "epoch": 0.17760474601408974, "grad_norm": 7.92578125, "learning_rate": 9.822395253985911e-06, "loss": 3.2286, "mean_token_accuracy": 0.38834370512206795, "step": 958 }, { "epoch": 0.1777901371894698, "grad_norm": 8.1328125, "learning_rate": 9.822209862810532e-06, "loss": 2.6381, "mean_token_accuracy": 0.4401952807160293, "step": 959 }, { "epoch": 0.17797552836484984, "grad_norm": 14.2421875, "learning_rate": 9.82202447163515e-06, "loss": 2.5369, "mean_token_accuracy": 0.4442168804570443, "step": 960 }, { "epoch": 0.1781609195402299, "grad_norm": 7.26171875, "learning_rate": 9.821839080459771e-06, "loss": 2.8189, "mean_token_accuracy": 0.43766816143497755, "step": 961 }, { "epoch": 0.17834631071560994, "grad_norm": 5.9609375, "learning_rate": 9.82165368928439e-06, "loss": 3.3976, "mean_token_accuracy": 0.3637658637658638, "step": 962 }, { "epoch": 0.17853170189099, "grad_norm": 6.27734375, "learning_rate": 9.82146829810901e-06, "loss": 3.1762, "mean_token_accuracy": 0.39179153094462543, "step": 963 }, { "epoch": 0.17871709306637004, "grad_norm": 7.55078125, "learning_rate": 9.821282906933631e-06, "loss": 3.3903, "mean_token_accuracy": 0.37871967991998, "step": 964 }, { "epoch": 0.1789024842417501, "grad_norm": 9.734375, "learning_rate": 9.821097515758251e-06, "loss": 2.9571, "mean_token_accuracy": 0.41045498547918685, "step": 965 }, { "epoch": 0.17908787541713014, "grad_norm": 6.859375, "learning_rate": 9.82091212458287e-06, "loss": 3.5961, "mean_token_accuracy": 0.3788939206806642, "step": 966 }, { "epoch": 0.1792732665925102, "grad_norm": 8.3671875, "learning_rate": 9.82072673340749e-06, "loss": 2.6454, "mean_token_accuracy": 0.4683882457702582, "step": 967 }, { "epoch": 0.17945865776789025, "grad_norm": 6.90234375, "learning_rate": 9.820541342232111e-06, "loss": 2.8829, "mean_token_accuracy": 0.4283302855535237, "step": 968 }, { "epoch": 0.1796440489432703, "grad_norm": 11.0546875, "learning_rate": 9.82035595105673e-06, "loss": 3.1915, "mean_token_accuracy": 0.3899988316392102, "step": 969 }, { "epoch": 0.17982944011865035, "grad_norm": 7.70703125, "learning_rate": 9.82017055988135e-06, "loss": 2.7976, "mean_token_accuracy": 0.44193806727175655, "step": 970 }, { "epoch": 0.1800148312940304, "grad_norm": 7.35546875, "learning_rate": 9.81998516870597e-06, "loss": 2.9728, "mean_token_accuracy": 0.4155251141552511, "step": 971 }, { "epoch": 0.18020022246941045, "grad_norm": 7.4375, "learning_rate": 9.819799777530592e-06, "loss": 2.7307, "mean_token_accuracy": 0.4378502658427935, "step": 972 }, { "epoch": 0.1803856136447905, "grad_norm": 5.796875, "learning_rate": 9.81961438635521e-06, "loss": 2.8629, "mean_token_accuracy": 0.4325975807457289, "step": 973 }, { "epoch": 0.18057100482017055, "grad_norm": 6.0625, "learning_rate": 9.81942899517983e-06, "loss": 2.9161, "mean_token_accuracy": 0.4185038868357334, "step": 974 }, { "epoch": 0.1807563959955506, "grad_norm": 7.34765625, "learning_rate": 9.81924360400445e-06, "loss": 2.4493, "mean_token_accuracy": 0.4842044801838024, "step": 975 }, { "epoch": 0.18094178717093065, "grad_norm": 10.140625, "learning_rate": 9.81905821282907e-06, "loss": 3.1247, "mean_token_accuracy": 0.39074141932331746, "step": 976 }, { "epoch": 0.1811271783463107, "grad_norm": 8.859375, "learning_rate": 9.81887282165369e-06, "loss": 2.941, "mean_token_accuracy": 0.4139446316097847, "step": 977 }, { "epoch": 0.18131256952169078, "grad_norm": 8.6953125, "learning_rate": 9.81868743047831e-06, "loss": 2.9445, "mean_token_accuracy": 0.42589613970588236, "step": 978 }, { "epoch": 0.18149796069707083, "grad_norm": 8.34375, "learning_rate": 9.81850203930293e-06, "loss": 2.6382, "mean_token_accuracy": 0.449826443673083, "step": 979 }, { "epoch": 0.18168335187245088, "grad_norm": 6.3046875, "learning_rate": 9.81831664812755e-06, "loss": 2.3357, "mean_token_accuracy": 0.5011212241129138, "step": 980 }, { "epoch": 0.18186874304783093, "grad_norm": 7.3515625, "learning_rate": 9.818131256952171e-06, "loss": 2.9433, "mean_token_accuracy": 0.4157101369105241, "step": 981 }, { "epoch": 0.18205413422321098, "grad_norm": 6.50390625, "learning_rate": 9.81794586577679e-06, "loss": 3.0002, "mean_token_accuracy": 0.40797546012269936, "step": 982 }, { "epoch": 0.18223952539859103, "grad_norm": 8.359375, "learning_rate": 9.81776047460141e-06, "loss": 2.7719, "mean_token_accuracy": 0.43075258475595096, "step": 983 }, { "epoch": 0.18242491657397109, "grad_norm": 7.80859375, "learning_rate": 9.817575083426029e-06, "loss": 3.0833, "mean_token_accuracy": 0.40502056745069087, "step": 984 }, { "epoch": 0.18261030774935114, "grad_norm": 7.89453125, "learning_rate": 9.81738969225065e-06, "loss": 2.9024, "mean_token_accuracy": 0.41554229372080953, "step": 985 }, { "epoch": 0.1827956989247312, "grad_norm": 8.921875, "learning_rate": 9.81720430107527e-06, "loss": 2.6053, "mean_token_accuracy": 0.46339239187076603, "step": 986 }, { "epoch": 0.18298109010011124, "grad_norm": 8.703125, "learning_rate": 9.817018909899889e-06, "loss": 2.9021, "mean_token_accuracy": 0.4314046877561056, "step": 987 }, { "epoch": 0.1831664812754913, "grad_norm": 7.66796875, "learning_rate": 9.816833518724511e-06, "loss": 2.5, "mean_token_accuracy": 0.463427432049533, "step": 988 }, { "epoch": 0.18335187245087134, "grad_norm": 5.86328125, "learning_rate": 9.81664812754913e-06, "loss": 2.8316, "mean_token_accuracy": 0.4455082176479975, "step": 989 }, { "epoch": 0.1835372636262514, "grad_norm": 7.21484375, "learning_rate": 9.81646273637375e-06, "loss": 2.2925, "mean_token_accuracy": 0.4972521181589192, "step": 990 }, { "epoch": 0.18372265480163144, "grad_norm": 8.140625, "learning_rate": 9.816277345198369e-06, "loss": 3.1826, "mean_token_accuracy": 0.39731561115454783, "step": 991 }, { "epoch": 0.1839080459770115, "grad_norm": 7.06640625, "learning_rate": 9.81609195402299e-06, "loss": 3.2356, "mean_token_accuracy": 0.3774932282688993, "step": 992 }, { "epoch": 0.18409343715239154, "grad_norm": 6.515625, "learning_rate": 9.815906562847608e-06, "loss": 2.7563, "mean_token_accuracy": 0.4504526851047997, "step": 993 }, { "epoch": 0.1842788283277716, "grad_norm": 8.9921875, "learning_rate": 9.815721171672229e-06, "loss": 2.9787, "mean_token_accuracy": 0.4243169398907104, "step": 994 }, { "epoch": 0.18446421950315164, "grad_norm": 9.03125, "learning_rate": 9.81553578049685e-06, "loss": 2.6147, "mean_token_accuracy": 0.47397880195307845, "step": 995 }, { "epoch": 0.1846496106785317, "grad_norm": 8.5078125, "learning_rate": 9.815350389321468e-06, "loss": 2.3522, "mean_token_accuracy": 0.47869738705110054, "step": 996 }, { "epoch": 0.18483500185391175, "grad_norm": 7.359375, "learning_rate": 9.81516499814609e-06, "loss": 3.2108, "mean_token_accuracy": 0.40973544973544973, "step": 997 }, { "epoch": 0.1850203930292918, "grad_norm": 9.28125, "learning_rate": 9.814979606970709e-06, "loss": 2.8617, "mean_token_accuracy": 0.4271401536235111, "step": 998 }, { "epoch": 0.18520578420467185, "grad_norm": 7.3828125, "learning_rate": 9.81479421579533e-06, "loss": 3.1549, "mean_token_accuracy": 0.41374045801526715, "step": 999 }, { "epoch": 0.1853911753800519, "grad_norm": 6.76953125, "learning_rate": 9.814608824619948e-06, "loss": 2.7471, "mean_token_accuracy": 0.4567751869775627, "step": 1000 }, { "epoch": 0.18557656655543195, "grad_norm": 6.453125, "learning_rate": 9.814423433444569e-06, "loss": 2.6422, "mean_token_accuracy": 0.46346863468634686, "step": 1001 }, { "epoch": 0.185761957730812, "grad_norm": 14.5625, "learning_rate": 9.81423804226919e-06, "loss": 3.2903, "mean_token_accuracy": 0.3851472471190781, "step": 1002 }, { "epoch": 0.18594734890619208, "grad_norm": 7.12890625, "learning_rate": 9.814052651093808e-06, "loss": 3.0695, "mean_token_accuracy": 0.4044219253800092, "step": 1003 }, { "epoch": 0.18613274008157213, "grad_norm": 8.4609375, "learning_rate": 9.813867259918429e-06, "loss": 2.9237, "mean_token_accuracy": 0.42072978924189997, "step": 1004 }, { "epoch": 0.18631813125695218, "grad_norm": 11.5, "learning_rate": 9.813681868743049e-06, "loss": 3.2533, "mean_token_accuracy": 0.39268362389254075, "step": 1005 }, { "epoch": 0.18650352243233223, "grad_norm": 7.06640625, "learning_rate": 9.81349647756767e-06, "loss": 2.8126, "mean_token_accuracy": 0.4327493447372051, "step": 1006 }, { "epoch": 0.18668891360771228, "grad_norm": 9.015625, "learning_rate": 9.813311086392288e-06, "loss": 2.5965, "mean_token_accuracy": 0.44166666666666665, "step": 1007 }, { "epoch": 0.18687430478309233, "grad_norm": 10.53125, "learning_rate": 9.813125695216909e-06, "loss": 2.9113, "mean_token_accuracy": 0.40158777711204313, "step": 1008 }, { "epoch": 0.18705969595847238, "grad_norm": 13.765625, "learning_rate": 9.812940304041528e-06, "loss": 3.1759, "mean_token_accuracy": 0.3845419847328244, "step": 1009 }, { "epoch": 0.18724508713385243, "grad_norm": 9.390625, "learning_rate": 9.812754912866148e-06, "loss": 2.8142, "mean_token_accuracy": 0.42088565763384006, "step": 1010 }, { "epoch": 0.18743047830923248, "grad_norm": 6.74609375, "learning_rate": 9.812569521690769e-06, "loss": 2.9207, "mean_token_accuracy": 0.4175474156652671, "step": 1011 }, { "epoch": 0.18761586948461254, "grad_norm": 9.2421875, "learning_rate": 9.812384130515387e-06, "loss": 2.6521, "mean_token_accuracy": 0.4331787521079258, "step": 1012 }, { "epoch": 0.18780126065999259, "grad_norm": 6.53515625, "learning_rate": 9.812198739340008e-06, "loss": 2.6721, "mean_token_accuracy": 0.4418887537321685, "step": 1013 }, { "epoch": 0.18798665183537264, "grad_norm": 12.53125, "learning_rate": 9.812013348164628e-06, "loss": 2.6866, "mean_token_accuracy": 0.44471544715447153, "step": 1014 }, { "epoch": 0.1881720430107527, "grad_norm": 10.296875, "learning_rate": 9.811827956989249e-06, "loss": 2.7858, "mean_token_accuracy": 0.4494631617919289, "step": 1015 }, { "epoch": 0.18835743418613274, "grad_norm": 6.3125, "learning_rate": 9.811642565813868e-06, "loss": 2.9027, "mean_token_accuracy": 0.43593202050175345, "step": 1016 }, { "epoch": 0.1885428253615128, "grad_norm": 9.453125, "learning_rate": 9.811457174638488e-06, "loss": 2.7057, "mean_token_accuracy": 0.44948985312254736, "step": 1017 }, { "epoch": 0.18872821653689284, "grad_norm": 12.8359375, "learning_rate": 9.811271783463107e-06, "loss": 2.7934, "mean_token_accuracy": 0.4257238469372909, "step": 1018 }, { "epoch": 0.1889136077122729, "grad_norm": 6.63671875, "learning_rate": 9.811086392287728e-06, "loss": 2.9675, "mean_token_accuracy": 0.40269897255022236, "step": 1019 }, { "epoch": 0.18909899888765294, "grad_norm": 9.4765625, "learning_rate": 9.810901001112348e-06, "loss": 2.5321, "mean_token_accuracy": 0.4683893195521103, "step": 1020 }, { "epoch": 0.189284390063033, "grad_norm": 6.25390625, "learning_rate": 9.810715609936968e-06, "loss": 2.8699, "mean_token_accuracy": 0.4243516687986975, "step": 1021 }, { "epoch": 0.18946978123841304, "grad_norm": 6.52734375, "learning_rate": 9.810530218761587e-06, "loss": 3.0337, "mean_token_accuracy": 0.41924460431654675, "step": 1022 }, { "epoch": 0.1896551724137931, "grad_norm": 5.95703125, "learning_rate": 9.810344827586208e-06, "loss": 3.127, "mean_token_accuracy": 0.40585569030993507, "step": 1023 }, { "epoch": 0.18984056358917314, "grad_norm": 7.51953125, "learning_rate": 9.810159436410828e-06, "loss": 2.5803, "mean_token_accuracy": 0.45498717775678227, "step": 1024 }, { "epoch": 0.1900259547645532, "grad_norm": 8.5234375, "learning_rate": 9.809974045235447e-06, "loss": 2.7258, "mean_token_accuracy": 0.43440384865427123, "step": 1025 }, { "epoch": 0.19021134593993325, "grad_norm": 6.23046875, "learning_rate": 9.809788654060068e-06, "loss": 2.7584, "mean_token_accuracy": 0.46048304796462186, "step": 1026 }, { "epoch": 0.19039673711531332, "grad_norm": 8.78125, "learning_rate": 9.809603262884686e-06, "loss": 2.7614, "mean_token_accuracy": 0.4380333715450261, "step": 1027 }, { "epoch": 0.19058212829069338, "grad_norm": 7.125, "learning_rate": 9.809417871709307e-06, "loss": 2.8784, "mean_token_accuracy": 0.43032699294721094, "step": 1028 }, { "epoch": 0.19076751946607343, "grad_norm": 6.69140625, "learning_rate": 9.809232480533927e-06, "loss": 2.8202, "mean_token_accuracy": 0.42830009496676164, "step": 1029 }, { "epoch": 0.19095291064145348, "grad_norm": 8.3828125, "learning_rate": 9.809047089358548e-06, "loss": 2.5601, "mean_token_accuracy": 0.46968545813706125, "step": 1030 }, { "epoch": 0.19113830181683353, "grad_norm": 6.8515625, "learning_rate": 9.808861698183167e-06, "loss": 2.8931, "mean_token_accuracy": 0.4442016806722689, "step": 1031 }, { "epoch": 0.19132369299221358, "grad_norm": 6.6875, "learning_rate": 9.808676307007787e-06, "loss": 3.2197, "mean_token_accuracy": 0.4037063435495367, "step": 1032 }, { "epoch": 0.19150908416759363, "grad_norm": 6.2734375, "learning_rate": 9.808490915832408e-06, "loss": 3.1832, "mean_token_accuracy": 0.3869047619047619, "step": 1033 }, { "epoch": 0.19169447534297368, "grad_norm": 7.70703125, "learning_rate": 9.808305524657026e-06, "loss": 2.9157, "mean_token_accuracy": 0.42330226364846874, "step": 1034 }, { "epoch": 0.19187986651835373, "grad_norm": 7.453125, "learning_rate": 9.808120133481647e-06, "loss": 2.5107, "mean_token_accuracy": 0.48006245496036515, "step": 1035 }, { "epoch": 0.19206525769373378, "grad_norm": 5.61328125, "learning_rate": 9.807934742306266e-06, "loss": 2.8004, "mean_token_accuracy": 0.44635676371240146, "step": 1036 }, { "epoch": 0.19225064886911383, "grad_norm": 7.3359375, "learning_rate": 9.807749351130888e-06, "loss": 2.853, "mean_token_accuracy": 0.4311504424778761, "step": 1037 }, { "epoch": 0.19243604004449388, "grad_norm": 7.734375, "learning_rate": 9.807563959955507e-06, "loss": 2.7605, "mean_token_accuracy": 0.43736263736263736, "step": 1038 }, { "epoch": 0.19262143121987393, "grad_norm": 6.08203125, "learning_rate": 9.807378568780127e-06, "loss": 3.1788, "mean_token_accuracy": 0.39853353461289626, "step": 1039 }, { "epoch": 0.19280682239525399, "grad_norm": 16.40625, "learning_rate": 9.807193177604748e-06, "loss": 2.5596, "mean_token_accuracy": 0.4575200270788672, "step": 1040 }, { "epoch": 0.19299221357063404, "grad_norm": 8.359375, "learning_rate": 9.807007786429366e-06, "loss": 2.8605, "mean_token_accuracy": 0.4215727209464161, "step": 1041 }, { "epoch": 0.1931776047460141, "grad_norm": 11.6953125, "learning_rate": 9.806822395253987e-06, "loss": 2.8117, "mean_token_accuracy": 0.4198639061821341, "step": 1042 }, { "epoch": 0.19336299592139414, "grad_norm": 8.6875, "learning_rate": 9.806637004078606e-06, "loss": 3.0622, "mean_token_accuracy": 0.39813895781637715, "step": 1043 }, { "epoch": 0.1935483870967742, "grad_norm": 7.8046875, "learning_rate": 9.806451612903226e-06, "loss": 3.1685, "mean_token_accuracy": 0.3736914600550964, "step": 1044 }, { "epoch": 0.19373377827215424, "grad_norm": 5.85546875, "learning_rate": 9.806266221727847e-06, "loss": 3.3404, "mean_token_accuracy": 0.3777093925608777, "step": 1045 }, { "epoch": 0.1939191694475343, "grad_norm": 9.625, "learning_rate": 9.806080830552467e-06, "loss": 2.6389, "mean_token_accuracy": 0.447255880256593, "step": 1046 }, { "epoch": 0.19410456062291434, "grad_norm": 9.625, "learning_rate": 9.805895439377086e-06, "loss": 2.9779, "mean_token_accuracy": 0.42087752131420253, "step": 1047 }, { "epoch": 0.1942899517982944, "grad_norm": 7.984375, "learning_rate": 9.805710048201707e-06, "loss": 2.931, "mean_token_accuracy": 0.42249962847377026, "step": 1048 }, { "epoch": 0.19447534297367444, "grad_norm": 6.66796875, "learning_rate": 9.805524657026327e-06, "loss": 2.8385, "mean_token_accuracy": 0.4353510895883777, "step": 1049 }, { "epoch": 0.1946607341490545, "grad_norm": 10.6640625, "learning_rate": 9.805339265850946e-06, "loss": 2.5728, "mean_token_accuracy": 0.4534898891063275, "step": 1050 }, { "epoch": 0.19484612532443454, "grad_norm": 6.71484375, "learning_rate": 9.805153874675566e-06, "loss": 2.8725, "mean_token_accuracy": 0.44359083795703513, "step": 1051 }, { "epoch": 0.19503151649981462, "grad_norm": 6.45703125, "learning_rate": 9.804968483500185e-06, "loss": 2.8243, "mean_token_accuracy": 0.43431442928930364, "step": 1052 }, { "epoch": 0.19521690767519467, "grad_norm": 5.70703125, "learning_rate": 9.804783092324807e-06, "loss": 3.1598, "mean_token_accuracy": 0.40442149854381787, "step": 1053 }, { "epoch": 0.19540229885057472, "grad_norm": 6.97265625, "learning_rate": 9.804597701149426e-06, "loss": 3.4106, "mean_token_accuracy": 0.36759236300520703, "step": 1054 }, { "epoch": 0.19558769002595477, "grad_norm": 15.3203125, "learning_rate": 9.804412309974047e-06, "loss": 2.222, "mean_token_accuracy": 0.46537396121883656, "step": 1055 }, { "epoch": 0.19577308120133483, "grad_norm": 8.5859375, "learning_rate": 9.804226918798665e-06, "loss": 2.6373, "mean_token_accuracy": 0.45280784844384303, "step": 1056 }, { "epoch": 0.19595847237671488, "grad_norm": 7.50390625, "learning_rate": 9.804041527623286e-06, "loss": 3.2508, "mean_token_accuracy": 0.3787171622883051, "step": 1057 }, { "epoch": 0.19614386355209493, "grad_norm": 6.8046875, "learning_rate": 9.803856136447906e-06, "loss": 2.7307, "mean_token_accuracy": 0.44530870199319394, "step": 1058 }, { "epoch": 0.19632925472747498, "grad_norm": 8.1796875, "learning_rate": 9.803670745272525e-06, "loss": 2.7371, "mean_token_accuracy": 0.4481875240169079, "step": 1059 }, { "epoch": 0.19651464590285503, "grad_norm": 10.265625, "learning_rate": 9.803485354097146e-06, "loss": 2.8798, "mean_token_accuracy": 0.4278382907415165, "step": 1060 }, { "epoch": 0.19670003707823508, "grad_norm": 9.09375, "learning_rate": 9.803299962921766e-06, "loss": 2.5168, "mean_token_accuracy": 0.47246184472461844, "step": 1061 }, { "epoch": 0.19688542825361513, "grad_norm": 5.7109375, "learning_rate": 9.803114571746387e-06, "loss": 3.0063, "mean_token_accuracy": 0.408772874058127, "step": 1062 }, { "epoch": 0.19707081942899518, "grad_norm": 8.1171875, "learning_rate": 9.802929180571005e-06, "loss": 2.9961, "mean_token_accuracy": 0.42447696214778086, "step": 1063 }, { "epoch": 0.19725621060437523, "grad_norm": 7.59765625, "learning_rate": 9.802743789395626e-06, "loss": 3.3244, "mean_token_accuracy": 0.36967936543402974, "step": 1064 }, { "epoch": 0.19744160177975528, "grad_norm": 7.47265625, "learning_rate": 9.802558398220245e-06, "loss": 2.9134, "mean_token_accuracy": 0.43033292231812575, "step": 1065 }, { "epoch": 0.19762699295513533, "grad_norm": 6.15234375, "learning_rate": 9.802373007044865e-06, "loss": 2.9202, "mean_token_accuracy": 0.4260120350109409, "step": 1066 }, { "epoch": 0.19781238413051538, "grad_norm": 8.234375, "learning_rate": 9.802187615869486e-06, "loss": 3.2256, "mean_token_accuracy": 0.4077703087615358, "step": 1067 }, { "epoch": 0.19799777530589543, "grad_norm": 10.2109375, "learning_rate": 9.802002224694104e-06, "loss": 2.8775, "mean_token_accuracy": 0.42418032786885246, "step": 1068 }, { "epoch": 0.19818316648127549, "grad_norm": 7.55859375, "learning_rate": 9.801816833518727e-06, "loss": 3.1701, "mean_token_accuracy": 0.39383715699505173, "step": 1069 }, { "epoch": 0.19836855765665554, "grad_norm": 9.703125, "learning_rate": 9.801631442343345e-06, "loss": 2.8559, "mean_token_accuracy": 0.41557115507338865, "step": 1070 }, { "epoch": 0.1985539488320356, "grad_norm": 11.65625, "learning_rate": 9.801446051167966e-06, "loss": 2.6661, "mean_token_accuracy": 0.43855539287457296, "step": 1071 }, { "epoch": 0.19873934000741564, "grad_norm": 9.28125, "learning_rate": 9.801260659992585e-06, "loss": 2.6959, "mean_token_accuracy": 0.47648841528986213, "step": 1072 }, { "epoch": 0.1989247311827957, "grad_norm": 6.59765625, "learning_rate": 9.801075268817205e-06, "loss": 2.7289, "mean_token_accuracy": 0.44233420125593503, "step": 1073 }, { "epoch": 0.19911012235817574, "grad_norm": 8.2890625, "learning_rate": 9.800889877641824e-06, "loss": 3.0608, "mean_token_accuracy": 0.43058682275251386, "step": 1074 }, { "epoch": 0.1992955135335558, "grad_norm": 8.2421875, "learning_rate": 9.800704486466445e-06, "loss": 3.4542, "mean_token_accuracy": 0.38519845644983464, "step": 1075 }, { "epoch": 0.19948090470893584, "grad_norm": 7.58203125, "learning_rate": 9.800519095291065e-06, "loss": 2.809, "mean_token_accuracy": 0.4396375701888719, "step": 1076 }, { "epoch": 0.19966629588431592, "grad_norm": 7.734375, "learning_rate": 9.800333704115686e-06, "loss": 3.1592, "mean_token_accuracy": 0.39954392702832453, "step": 1077 }, { "epoch": 0.19985168705969597, "grad_norm": 8.828125, "learning_rate": 9.800148312940306e-06, "loss": 2.6994, "mean_token_accuracy": 0.45293150684931505, "step": 1078 }, { "epoch": 0.20003707823507602, "grad_norm": 6.16796875, "learning_rate": 9.799962921764925e-06, "loss": 3.5462, "mean_token_accuracy": 0.3581456953642384, "step": 1079 }, { "epoch": 0.20022246941045607, "grad_norm": 8.953125, "learning_rate": 9.799777530589545e-06, "loss": 2.9314, "mean_token_accuracy": 0.42709660973744584, "step": 1080 }, { "epoch": 0.20040786058583612, "grad_norm": 6.36328125, "learning_rate": 9.799592139414164e-06, "loss": 3.286, "mean_token_accuracy": 0.38740204051456456, "step": 1081 }, { "epoch": 0.20059325176121617, "grad_norm": 6.39453125, "learning_rate": 9.799406748238785e-06, "loss": 2.9125, "mean_token_accuracy": 0.4343953838434521, "step": 1082 }, { "epoch": 0.20077864293659622, "grad_norm": 7.03125, "learning_rate": 9.799221357063405e-06, "loss": 3.0987, "mean_token_accuracy": 0.40029286474973375, "step": 1083 }, { "epoch": 0.20096403411197628, "grad_norm": 8.609375, "learning_rate": 9.799035965888024e-06, "loss": 2.7017, "mean_token_accuracy": 0.4223855285472018, "step": 1084 }, { "epoch": 0.20114942528735633, "grad_norm": 7.8515625, "learning_rate": 9.798850574712644e-06, "loss": 2.577, "mean_token_accuracy": 0.4589793915603533, "step": 1085 }, { "epoch": 0.20133481646273638, "grad_norm": 6.96875, "learning_rate": 9.798665183537265e-06, "loss": 2.5672, "mean_token_accuracy": 0.47958900305470703, "step": 1086 }, { "epoch": 0.20152020763811643, "grad_norm": 6.26953125, "learning_rate": 9.798479792361885e-06, "loss": 2.8188, "mean_token_accuracy": 0.44358299875398033, "step": 1087 }, { "epoch": 0.20170559881349648, "grad_norm": 6.5625, "learning_rate": 9.798294401186504e-06, "loss": 3.2931, "mean_token_accuracy": 0.38922655225837205, "step": 1088 }, { "epoch": 0.20189098998887653, "grad_norm": 8.109375, "learning_rate": 9.798109010011125e-06, "loss": 3.3934, "mean_token_accuracy": 0.3894822225226443, "step": 1089 }, { "epoch": 0.20207638116425658, "grad_norm": 11.4453125, "learning_rate": 9.797923618835743e-06, "loss": 2.5942, "mean_token_accuracy": 0.4601639946151022, "step": 1090 }, { "epoch": 0.20226177233963663, "grad_norm": 8.6328125, "learning_rate": 9.797738227660364e-06, "loss": 2.9568, "mean_token_accuracy": 0.4112517580872011, "step": 1091 }, { "epoch": 0.20244716351501668, "grad_norm": 7.6953125, "learning_rate": 9.797552836484984e-06, "loss": 2.7999, "mean_token_accuracy": 0.4473579129508851, "step": 1092 }, { "epoch": 0.20263255469039673, "grad_norm": 6.0703125, "learning_rate": 9.797367445309605e-06, "loss": 2.4338, "mean_token_accuracy": 0.47934721440630274, "step": 1093 }, { "epoch": 0.20281794586577678, "grad_norm": 6.34375, "learning_rate": 9.797182054134224e-06, "loss": 2.4816, "mean_token_accuracy": 0.47784146511953024, "step": 1094 }, { "epoch": 0.20300333704115683, "grad_norm": 8.6875, "learning_rate": 9.796996662958844e-06, "loss": 3.0137, "mean_token_accuracy": 0.4271518905665247, "step": 1095 }, { "epoch": 0.20318872821653688, "grad_norm": 5.96484375, "learning_rate": 9.796811271783465e-06, "loss": 3.0285, "mean_token_accuracy": 0.4043307585158921, "step": 1096 }, { "epoch": 0.20337411939191694, "grad_norm": 6.15625, "learning_rate": 9.796625880608083e-06, "loss": 2.756, "mean_token_accuracy": 0.4442196141912856, "step": 1097 }, { "epoch": 0.203559510567297, "grad_norm": 10.0234375, "learning_rate": 9.796440489432704e-06, "loss": 2.5152, "mean_token_accuracy": 0.4513156299260337, "step": 1098 }, { "epoch": 0.20374490174267704, "grad_norm": 7.69921875, "learning_rate": 9.796255098257323e-06, "loss": 2.8944, "mean_token_accuracy": 0.41858353510895885, "step": 1099 }, { "epoch": 0.2039302929180571, "grad_norm": 5.78125, "learning_rate": 9.796069707081943e-06, "loss": 3.1327, "mean_token_accuracy": 0.4126669965363681, "step": 1100 }, { "epoch": 0.20411568409343714, "grad_norm": 6.609375, "learning_rate": 9.795884315906564e-06, "loss": 3.2587, "mean_token_accuracy": 0.39227285331361494, "step": 1101 }, { "epoch": 0.20430107526881722, "grad_norm": 8.1875, "learning_rate": 9.795698924731184e-06, "loss": 2.6494, "mean_token_accuracy": 0.4513888888888889, "step": 1102 }, { "epoch": 0.20448646644419727, "grad_norm": 9.0, "learning_rate": 9.795513533555803e-06, "loss": 3.3998, "mean_token_accuracy": 0.3798411728772144, "step": 1103 }, { "epoch": 0.20467185761957732, "grad_norm": 6.46875, "learning_rate": 9.795328142380424e-06, "loss": 2.4871, "mean_token_accuracy": 0.4683992003690604, "step": 1104 }, { "epoch": 0.20485724879495737, "grad_norm": 5.7421875, "learning_rate": 9.795142751205044e-06, "loss": 2.6401, "mean_token_accuracy": 0.46563407550822844, "step": 1105 }, { "epoch": 0.20504263997033742, "grad_norm": 11.2890625, "learning_rate": 9.794957360029663e-06, "loss": 2.6295, "mean_token_accuracy": 0.4567544809334777, "step": 1106 }, { "epoch": 0.20522803114571747, "grad_norm": 5.75, "learning_rate": 9.794771968854283e-06, "loss": 2.4093, "mean_token_accuracy": 0.5070866141732283, "step": 1107 }, { "epoch": 0.20541342232109752, "grad_norm": 6.33984375, "learning_rate": 9.794586577678902e-06, "loss": 2.6209, "mean_token_accuracy": 0.4647347623039157, "step": 1108 }, { "epoch": 0.20559881349647757, "grad_norm": 6.78515625, "learning_rate": 9.794401186503524e-06, "loss": 2.8667, "mean_token_accuracy": 0.4042606832219841, "step": 1109 }, { "epoch": 0.20578420467185762, "grad_norm": 5.6953125, "learning_rate": 9.794215795328143e-06, "loss": 3.1588, "mean_token_accuracy": 0.4005568333131582, "step": 1110 }, { "epoch": 0.20596959584723767, "grad_norm": 6.08984375, "learning_rate": 9.794030404152764e-06, "loss": 2.8841, "mean_token_accuracy": 0.42847533632286994, "step": 1111 }, { "epoch": 0.20615498702261773, "grad_norm": 8.171875, "learning_rate": 9.793845012977382e-06, "loss": 2.8357, "mean_token_accuracy": 0.4432359550561798, "step": 1112 }, { "epoch": 0.20634037819799778, "grad_norm": 7.10546875, "learning_rate": 9.793659621802003e-06, "loss": 2.9058, "mean_token_accuracy": 0.42099605876030094, "step": 1113 }, { "epoch": 0.20652576937337783, "grad_norm": 9.3203125, "learning_rate": 9.793474230626623e-06, "loss": 2.768, "mean_token_accuracy": 0.4348502528199144, "step": 1114 }, { "epoch": 0.20671116054875788, "grad_norm": 7.4375, "learning_rate": 9.793288839451242e-06, "loss": 2.9841, "mean_token_accuracy": 0.4286407766990291, "step": 1115 }, { "epoch": 0.20689655172413793, "grad_norm": 7.03125, "learning_rate": 9.793103448275863e-06, "loss": 2.8867, "mean_token_accuracy": 0.41017344033134867, "step": 1116 }, { "epoch": 0.20708194289951798, "grad_norm": 7.63671875, "learning_rate": 9.792918057100481e-06, "loss": 2.7961, "mean_token_accuracy": 0.44319258713277876, "step": 1117 }, { "epoch": 0.20726733407489803, "grad_norm": 8.6328125, "learning_rate": 9.792732665925104e-06, "loss": 2.687, "mean_token_accuracy": 0.4358334427952187, "step": 1118 }, { "epoch": 0.20745272525027808, "grad_norm": 6.98046875, "learning_rate": 9.792547274749722e-06, "loss": 3.0901, "mean_token_accuracy": 0.389687235841082, "step": 1119 }, { "epoch": 0.20763811642565813, "grad_norm": 5.7734375, "learning_rate": 9.792361883574343e-06, "loss": 2.5734, "mean_token_accuracy": 0.46616753778782316, "step": 1120 }, { "epoch": 0.20782350760103818, "grad_norm": 9.6484375, "learning_rate": 9.792176492398963e-06, "loss": 2.7232, "mean_token_accuracy": 0.45009185548071035, "step": 1121 }, { "epoch": 0.20800889877641823, "grad_norm": 8.6328125, "learning_rate": 9.791991101223582e-06, "loss": 3.2123, "mean_token_accuracy": 0.3910644742535699, "step": 1122 }, { "epoch": 0.20819428995179828, "grad_norm": 7.87109375, "learning_rate": 9.791805710048203e-06, "loss": 2.5421, "mean_token_accuracy": 0.44792528667691217, "step": 1123 }, { "epoch": 0.20837968112717833, "grad_norm": 6.19140625, "learning_rate": 9.791620318872821e-06, "loss": 2.7563, "mean_token_accuracy": 0.4462567963195316, "step": 1124 }, { "epoch": 0.20856507230255839, "grad_norm": 6.9921875, "learning_rate": 9.791434927697442e-06, "loss": 2.8709, "mean_token_accuracy": 0.4415215989684075, "step": 1125 }, { "epoch": 0.20875046347793846, "grad_norm": 6.0390625, "learning_rate": 9.791249536522062e-06, "loss": 2.9661, "mean_token_accuracy": 0.4276850958224183, "step": 1126 }, { "epoch": 0.20893585465331851, "grad_norm": 5.79296875, "learning_rate": 9.791064145346683e-06, "loss": 3.3114, "mean_token_accuracy": 0.3963323971584338, "step": 1127 }, { "epoch": 0.20912124582869857, "grad_norm": 8.234375, "learning_rate": 9.790878754171302e-06, "loss": 3.0924, "mean_token_accuracy": 0.3976329917032699, "step": 1128 }, { "epoch": 0.20930663700407862, "grad_norm": 8.484375, "learning_rate": 9.790693362995922e-06, "loss": 2.2897, "mean_token_accuracy": 0.5095942120163574, "step": 1129 }, { "epoch": 0.20949202817945867, "grad_norm": 6.53515625, "learning_rate": 9.790507971820543e-06, "loss": 2.709, "mean_token_accuracy": 0.44899978017146625, "step": 1130 }, { "epoch": 0.20967741935483872, "grad_norm": 7.0546875, "learning_rate": 9.790322580645162e-06, "loss": 3.2162, "mean_token_accuracy": 0.3875460405156538, "step": 1131 }, { "epoch": 0.20986281053021877, "grad_norm": 5.94921875, "learning_rate": 9.790137189469782e-06, "loss": 3.2783, "mean_token_accuracy": 0.40025209121118366, "step": 1132 }, { "epoch": 0.21004820170559882, "grad_norm": 7.921875, "learning_rate": 9.7899517982944e-06, "loss": 2.6593, "mean_token_accuracy": 0.4517487639626442, "step": 1133 }, { "epoch": 0.21023359288097887, "grad_norm": 6.6015625, "learning_rate": 9.789766407119023e-06, "loss": 3.0492, "mean_token_accuracy": 0.415185588199734, "step": 1134 }, { "epoch": 0.21041898405635892, "grad_norm": 7.4765625, "learning_rate": 9.789581015943642e-06, "loss": 3.0749, "mean_token_accuracy": 0.4105833598979917, "step": 1135 }, { "epoch": 0.21060437523173897, "grad_norm": 5.6796875, "learning_rate": 9.789395624768262e-06, "loss": 3.0489, "mean_token_accuracy": 0.41921847613155994, "step": 1136 }, { "epoch": 0.21078976640711902, "grad_norm": 7.2109375, "learning_rate": 9.789210233592881e-06, "loss": 2.7799, "mean_token_accuracy": 0.43996188055908514, "step": 1137 }, { "epoch": 0.21097515758249907, "grad_norm": 7.53125, "learning_rate": 9.789024842417502e-06, "loss": 3.2321, "mean_token_accuracy": 0.3952324195470799, "step": 1138 }, { "epoch": 0.21116054875787912, "grad_norm": 6.9609375, "learning_rate": 9.788839451242122e-06, "loss": 2.8977, "mean_token_accuracy": 0.4318699680996506, "step": 1139 }, { "epoch": 0.21134593993325917, "grad_norm": 6.29296875, "learning_rate": 9.788654060066741e-06, "loss": 3.1301, "mean_token_accuracy": 0.4022207707380797, "step": 1140 }, { "epoch": 0.21153133110863923, "grad_norm": 5.1171875, "learning_rate": 9.788468668891361e-06, "loss": 2.7294, "mean_token_accuracy": 0.4444444444444444, "step": 1141 }, { "epoch": 0.21171672228401928, "grad_norm": 8.375, "learning_rate": 9.788283277715982e-06, "loss": 2.7615, "mean_token_accuracy": 0.4364328706547107, "step": 1142 }, { "epoch": 0.21190211345939933, "grad_norm": 6.0703125, "learning_rate": 9.788097886540602e-06, "loss": 2.7843, "mean_token_accuracy": 0.4460633230596806, "step": 1143 }, { "epoch": 0.21208750463477938, "grad_norm": 8.75, "learning_rate": 9.787912495365221e-06, "loss": 2.8443, "mean_token_accuracy": 0.4269622093023256, "step": 1144 }, { "epoch": 0.21227289581015943, "grad_norm": 5.890625, "learning_rate": 9.787727104189842e-06, "loss": 2.6167, "mean_token_accuracy": 0.44726810673443457, "step": 1145 }, { "epoch": 0.21245828698553948, "grad_norm": 16.140625, "learning_rate": 9.78754171301446e-06, "loss": 2.5632, "mean_token_accuracy": 0.45762554534484357, "step": 1146 }, { "epoch": 0.21264367816091953, "grad_norm": 6.9375, "learning_rate": 9.787356321839081e-06, "loss": 2.8794, "mean_token_accuracy": 0.4282909930715935, "step": 1147 }, { "epoch": 0.21282906933629958, "grad_norm": 5.86328125, "learning_rate": 9.787170930663701e-06, "loss": 2.6344, "mean_token_accuracy": 0.4487199872793767, "step": 1148 }, { "epoch": 0.21301446051167963, "grad_norm": 6.14453125, "learning_rate": 9.78698553948832e-06, "loss": 2.9224, "mean_token_accuracy": 0.42774301846482526, "step": 1149 }, { "epoch": 0.21319985168705968, "grad_norm": 6.9140625, "learning_rate": 9.78680014831294e-06, "loss": 3.3961, "mean_token_accuracy": 0.3858891288696904, "step": 1150 }, { "epoch": 0.21338524286243976, "grad_norm": 9.546875, "learning_rate": 9.786614757137561e-06, "loss": 2.9146, "mean_token_accuracy": 0.4180452654014006, "step": 1151 }, { "epoch": 0.2135706340378198, "grad_norm": 5.9921875, "learning_rate": 9.786429365962182e-06, "loss": 2.4937, "mean_token_accuracy": 0.49495576594754, "step": 1152 }, { "epoch": 0.21375602521319986, "grad_norm": 10.1171875, "learning_rate": 9.7862439747868e-06, "loss": 2.7787, "mean_token_accuracy": 0.43734939759036146, "step": 1153 }, { "epoch": 0.2139414163885799, "grad_norm": 9.8203125, "learning_rate": 9.786058583611421e-06, "loss": 3.0979, "mean_token_accuracy": 0.3965096881817015, "step": 1154 }, { "epoch": 0.21412680756395996, "grad_norm": 9.015625, "learning_rate": 9.78587319243604e-06, "loss": 2.6714, "mean_token_accuracy": 0.43896882494004796, "step": 1155 }, { "epoch": 0.21431219873934002, "grad_norm": 6.4921875, "learning_rate": 9.78568780126066e-06, "loss": 3.2057, "mean_token_accuracy": 0.4044987146529563, "step": 1156 }, { "epoch": 0.21449758991472007, "grad_norm": 7.4453125, "learning_rate": 9.78550241008528e-06, "loss": 2.8015, "mean_token_accuracy": 0.4235974409448819, "step": 1157 }, { "epoch": 0.21468298109010012, "grad_norm": 7.97265625, "learning_rate": 9.785317018909901e-06, "loss": 2.7838, "mean_token_accuracy": 0.4342302690807242, "step": 1158 }, { "epoch": 0.21486837226548017, "grad_norm": 7.109375, "learning_rate": 9.785131627734522e-06, "loss": 2.8609, "mean_token_accuracy": 0.4235016934177588, "step": 1159 }, { "epoch": 0.21505376344086022, "grad_norm": 5.87890625, "learning_rate": 9.78494623655914e-06, "loss": 3.052, "mean_token_accuracy": 0.411, "step": 1160 }, { "epoch": 0.21523915461624027, "grad_norm": 9.328125, "learning_rate": 9.784760845383761e-06, "loss": 3.1273, "mean_token_accuracy": 0.3937896207155904, "step": 1161 }, { "epoch": 0.21542454579162032, "grad_norm": 8.2109375, "learning_rate": 9.78457545420838e-06, "loss": 2.7098, "mean_token_accuracy": 0.44323027718550106, "step": 1162 }, { "epoch": 0.21560993696700037, "grad_norm": 7.64453125, "learning_rate": 9.784390063033e-06, "loss": 2.7971, "mean_token_accuracy": 0.46920380570856285, "step": 1163 }, { "epoch": 0.21579532814238042, "grad_norm": 5.63671875, "learning_rate": 9.78420467185762e-06, "loss": 2.8516, "mean_token_accuracy": 0.4115293420272673, "step": 1164 }, { "epoch": 0.21598071931776047, "grad_norm": 8.78125, "learning_rate": 9.78401928068224e-06, "loss": 2.7246, "mean_token_accuracy": 0.4480143263457284, "step": 1165 }, { "epoch": 0.21616611049314052, "grad_norm": 5.77734375, "learning_rate": 9.78383388950686e-06, "loss": 2.8348, "mean_token_accuracy": 0.43222976796830787, "step": 1166 }, { "epoch": 0.21635150166852057, "grad_norm": 5.77734375, "learning_rate": 9.78364849833148e-06, "loss": 2.831, "mean_token_accuracy": 0.41485913737222635, "step": 1167 }, { "epoch": 0.21653689284390062, "grad_norm": 7.9921875, "learning_rate": 9.783463107156101e-06, "loss": 2.8661, "mean_token_accuracy": 0.4450830140485313, "step": 1168 }, { "epoch": 0.21672228401928068, "grad_norm": 5.6640625, "learning_rate": 9.78327771598072e-06, "loss": 3.0035, "mean_token_accuracy": 0.41605335786568537, "step": 1169 }, { "epoch": 0.21690767519466073, "grad_norm": 7.4921875, "learning_rate": 9.78309232480534e-06, "loss": 2.8281, "mean_token_accuracy": 0.4479394449116905, "step": 1170 }, { "epoch": 0.21709306637004078, "grad_norm": 7.08984375, "learning_rate": 9.78290693362996e-06, "loss": 3.1391, "mean_token_accuracy": 0.40062272963155166, "step": 1171 }, { "epoch": 0.21727845754542083, "grad_norm": 6.1484375, "learning_rate": 9.78272154245458e-06, "loss": 2.7473, "mean_token_accuracy": 0.4432754468246926, "step": 1172 }, { "epoch": 0.21746384872080088, "grad_norm": 6.8046875, "learning_rate": 9.7825361512792e-06, "loss": 2.7259, "mean_token_accuracy": 0.45585822120118147, "step": 1173 }, { "epoch": 0.21764923989618093, "grad_norm": 7.04296875, "learning_rate": 9.78235076010382e-06, "loss": 3.261, "mean_token_accuracy": 0.38257439942631766, "step": 1174 }, { "epoch": 0.21783463107156098, "grad_norm": 6.07421875, "learning_rate": 9.78216536892844e-06, "loss": 2.6737, "mean_token_accuracy": 0.4419543429844098, "step": 1175 }, { "epoch": 0.21802002224694106, "grad_norm": 10.0390625, "learning_rate": 9.78197997775306e-06, "loss": 2.4634, "mean_token_accuracy": 0.4530123759009928, "step": 1176 }, { "epoch": 0.2182054134223211, "grad_norm": 9.296875, "learning_rate": 9.78179458657768e-06, "loss": 2.839, "mean_token_accuracy": 0.42993858020282816, "step": 1177 }, { "epoch": 0.21839080459770116, "grad_norm": 17.25, "learning_rate": 9.7816091954023e-06, "loss": 2.197, "mean_token_accuracy": 0.4897648847818458, "step": 1178 }, { "epoch": 0.2185761957730812, "grad_norm": 5.33203125, "learning_rate": 9.78142380422692e-06, "loss": 2.4788, "mean_token_accuracy": 0.48766217247519716, "step": 1179 }, { "epoch": 0.21876158694846126, "grad_norm": 6.4453125, "learning_rate": 9.781238413051539e-06, "loss": 2.9532, "mean_token_accuracy": 0.42524773804394655, "step": 1180 }, { "epoch": 0.2189469781238413, "grad_norm": 7.6328125, "learning_rate": 9.781053021876159e-06, "loss": 2.6732, "mean_token_accuracy": 0.4523900088753645, "step": 1181 }, { "epoch": 0.21913236929922136, "grad_norm": 10.0078125, "learning_rate": 9.78086763070078e-06, "loss": 2.2347, "mean_token_accuracy": 0.5073031170004517, "step": 1182 }, { "epoch": 0.21931776047460141, "grad_norm": 10.1328125, "learning_rate": 9.7806822395254e-06, "loss": 3.0516, "mean_token_accuracy": 0.40474940960377853, "step": 1183 }, { "epoch": 0.21950315164998146, "grad_norm": 7.4609375, "learning_rate": 9.780496848350019e-06, "loss": 2.6507, "mean_token_accuracy": 0.44364672364672364, "step": 1184 }, { "epoch": 0.21968854282536152, "grad_norm": 7.55078125, "learning_rate": 9.78031145717464e-06, "loss": 2.9071, "mean_token_accuracy": 0.42946872546453807, "step": 1185 }, { "epoch": 0.21987393400074157, "grad_norm": 5.5625, "learning_rate": 9.78012606599926e-06, "loss": 3.155, "mean_token_accuracy": 0.4002143048486472, "step": 1186 }, { "epoch": 0.22005932517612162, "grad_norm": 7.703125, "learning_rate": 9.779940674823879e-06, "loss": 2.5232, "mean_token_accuracy": 0.4785413744740533, "step": 1187 }, { "epoch": 0.22024471635150167, "grad_norm": 5.546875, "learning_rate": 9.779755283648499e-06, "loss": 3.1383, "mean_token_accuracy": 0.40865491858130715, "step": 1188 }, { "epoch": 0.22043010752688172, "grad_norm": 9.578125, "learning_rate": 9.779569892473118e-06, "loss": 2.5066, "mean_token_accuracy": 0.4689073343312015, "step": 1189 }, { "epoch": 0.22061549870226177, "grad_norm": 7.33984375, "learning_rate": 9.77938450129774e-06, "loss": 2.5506, "mean_token_accuracy": 0.47013341419041843, "step": 1190 }, { "epoch": 0.22080088987764182, "grad_norm": 6.7421875, "learning_rate": 9.779199110122359e-06, "loss": 2.8008, "mean_token_accuracy": 0.42859139183901623, "step": 1191 }, { "epoch": 0.22098628105302187, "grad_norm": 6.66796875, "learning_rate": 9.77901371894698e-06, "loss": 2.8468, "mean_token_accuracy": 0.4329132690882135, "step": 1192 }, { "epoch": 0.22117167222840192, "grad_norm": 5.90234375, "learning_rate": 9.778828327771598e-06, "loss": 3.2222, "mean_token_accuracy": 0.38843111404087016, "step": 1193 }, { "epoch": 0.22135706340378197, "grad_norm": 5.609375, "learning_rate": 9.778642936596219e-06, "loss": 3.2838, "mean_token_accuracy": 0.3781668656320467, "step": 1194 }, { "epoch": 0.22154245457916202, "grad_norm": 7.5703125, "learning_rate": 9.778457545420839e-06, "loss": 2.6905, "mean_token_accuracy": 0.4725356294536817, "step": 1195 }, { "epoch": 0.22172784575454207, "grad_norm": 6.109375, "learning_rate": 9.778272154245458e-06, "loss": 3.0235, "mean_token_accuracy": 0.42206235011990406, "step": 1196 }, { "epoch": 0.22191323692992213, "grad_norm": 7.73828125, "learning_rate": 9.778086763070078e-06, "loss": 3.3764, "mean_token_accuracy": 0.3687551428235571, "step": 1197 }, { "epoch": 0.22209862810530218, "grad_norm": 6.09375, "learning_rate": 9.777901371894699e-06, "loss": 2.9945, "mean_token_accuracy": 0.4039287906691222, "step": 1198 }, { "epoch": 0.22228401928068223, "grad_norm": 9.2109375, "learning_rate": 9.77771598071932e-06, "loss": 2.336, "mean_token_accuracy": 0.5019560232024821, "step": 1199 }, { "epoch": 0.22246941045606228, "grad_norm": 8.640625, "learning_rate": 9.777530589543938e-06, "loss": 2.6591, "mean_token_accuracy": 0.41135487696412687, "step": 1200 }, { "epoch": 0.22265480163144236, "grad_norm": 7.15234375, "learning_rate": 9.777345198368559e-06, "loss": 2.5751, "mean_token_accuracy": 0.45745992601726265, "step": 1201 }, { "epoch": 0.2228401928068224, "grad_norm": 10.21875, "learning_rate": 9.77715980719318e-06, "loss": 2.1021, "mean_token_accuracy": 0.5066852367688023, "step": 1202 }, { "epoch": 0.22302558398220246, "grad_norm": 6.53125, "learning_rate": 9.776974416017798e-06, "loss": 3.0944, "mean_token_accuracy": 0.41747450585421125, "step": 1203 }, { "epoch": 0.2232109751575825, "grad_norm": 6.16796875, "learning_rate": 9.776789024842418e-06, "loss": 2.7315, "mean_token_accuracy": 0.4425815342214056, "step": 1204 }, { "epoch": 0.22339636633296256, "grad_norm": 6.91796875, "learning_rate": 9.776603633667037e-06, "loss": 3.0057, "mean_token_accuracy": 0.4061111111111111, "step": 1205 }, { "epoch": 0.2235817575083426, "grad_norm": 6.828125, "learning_rate": 9.77641824249166e-06, "loss": 2.4656, "mean_token_accuracy": 0.48159316497047366, "step": 1206 }, { "epoch": 0.22376714868372266, "grad_norm": 6.4453125, "learning_rate": 9.776232851316278e-06, "loss": 3.223, "mean_token_accuracy": 0.37722624382396874, "step": 1207 }, { "epoch": 0.2239525398591027, "grad_norm": 6.19140625, "learning_rate": 9.776047460140899e-06, "loss": 3.1511, "mean_token_accuracy": 0.39880636604774533, "step": 1208 }, { "epoch": 0.22413793103448276, "grad_norm": 8.9453125, "learning_rate": 9.775862068965518e-06, "loss": 2.5288, "mean_token_accuracy": 0.4620878466849185, "step": 1209 }, { "epoch": 0.2243233222098628, "grad_norm": 7.5546875, "learning_rate": 9.775676677790138e-06, "loss": 2.7911, "mean_token_accuracy": 0.4411728009981285, "step": 1210 }, { "epoch": 0.22450871338524286, "grad_norm": 5.4140625, "learning_rate": 9.775491286614759e-06, "loss": 3.3141, "mean_token_accuracy": 0.3771800177357375, "step": 1211 }, { "epoch": 0.22469410456062291, "grad_norm": 8.46875, "learning_rate": 9.775305895439377e-06, "loss": 2.681, "mean_token_accuracy": 0.4592499715034766, "step": 1212 }, { "epoch": 0.22487949573600297, "grad_norm": 6.85546875, "learning_rate": 9.775120504263998e-06, "loss": 3.2486, "mean_token_accuracy": 0.40227934044616875, "step": 1213 }, { "epoch": 0.22506488691138302, "grad_norm": 6.77734375, "learning_rate": 9.774935113088618e-06, "loss": 3.0283, "mean_token_accuracy": 0.40954332552693207, "step": 1214 }, { "epoch": 0.22525027808676307, "grad_norm": 7.9375, "learning_rate": 9.774749721913239e-06, "loss": 3.007, "mean_token_accuracy": 0.4057942057942058, "step": 1215 }, { "epoch": 0.22543566926214312, "grad_norm": 6.203125, "learning_rate": 9.774564330737858e-06, "loss": 2.9439, "mean_token_accuracy": 0.4070287539936102, "step": 1216 }, { "epoch": 0.22562106043752317, "grad_norm": 7.83203125, "learning_rate": 9.774378939562478e-06, "loss": 2.9385, "mean_token_accuracy": 0.4318747255963706, "step": 1217 }, { "epoch": 0.22580645161290322, "grad_norm": 7.75390625, "learning_rate": 9.774193548387097e-06, "loss": 2.8201, "mean_token_accuracy": 0.4237463976945245, "step": 1218 }, { "epoch": 0.22599184278828327, "grad_norm": 7.43359375, "learning_rate": 9.774008157211717e-06, "loss": 2.781, "mean_token_accuracy": 0.4406497292794669, "step": 1219 }, { "epoch": 0.22617723396366332, "grad_norm": 6.20703125, "learning_rate": 9.773822766036338e-06, "loss": 2.9886, "mean_token_accuracy": 0.4228863425209043, "step": 1220 }, { "epoch": 0.22636262513904337, "grad_norm": 10.5703125, "learning_rate": 9.773637374860957e-06, "loss": 2.855, "mean_token_accuracy": 0.4242585450192845, "step": 1221 }, { "epoch": 0.22654801631442342, "grad_norm": 7.5546875, "learning_rate": 9.773451983685577e-06, "loss": 2.6326, "mean_token_accuracy": 0.4650132860938884, "step": 1222 }, { "epoch": 0.22673340748980347, "grad_norm": 8.2265625, "learning_rate": 9.773266592510198e-06, "loss": 2.7328, "mean_token_accuracy": 0.433692264097478, "step": 1223 }, { "epoch": 0.22691879866518352, "grad_norm": 6.9453125, "learning_rate": 9.773081201334818e-06, "loss": 3.3291, "mean_token_accuracy": 0.3768449559918754, "step": 1224 }, { "epoch": 0.2271041898405636, "grad_norm": 6.04296875, "learning_rate": 9.772895810159437e-06, "loss": 3.1182, "mean_token_accuracy": 0.41483343808925205, "step": 1225 }, { "epoch": 0.22728958101594365, "grad_norm": 6.71875, "learning_rate": 9.772710418984057e-06, "loss": 3.0041, "mean_token_accuracy": 0.4258512183314507, "step": 1226 }, { "epoch": 0.2274749721913237, "grad_norm": 7.5234375, "learning_rate": 9.772525027808676e-06, "loss": 2.9577, "mean_token_accuracy": 0.4220503866300823, "step": 1227 }, { "epoch": 0.22766036336670376, "grad_norm": 6.5, "learning_rate": 9.772339636633297e-06, "loss": 3.2815, "mean_token_accuracy": 0.37471541449042456, "step": 1228 }, { "epoch": 0.2278457545420838, "grad_norm": 5.76953125, "learning_rate": 9.772154245457917e-06, "loss": 3.1414, "mean_token_accuracy": 0.40584045584045586, "step": 1229 }, { "epoch": 0.22803114571746386, "grad_norm": 5.59375, "learning_rate": 9.771968854282538e-06, "loss": 2.7879, "mean_token_accuracy": 0.4580934101087652, "step": 1230 }, { "epoch": 0.2282165368928439, "grad_norm": 5.82421875, "learning_rate": 9.771783463107156e-06, "loss": 2.8759, "mean_token_accuracy": 0.42318092428711895, "step": 1231 }, { "epoch": 0.22840192806822396, "grad_norm": 7.36328125, "learning_rate": 9.771598071931777e-06, "loss": 2.6194, "mean_token_accuracy": 0.4455120693695805, "step": 1232 }, { "epoch": 0.228587319243604, "grad_norm": 5.5078125, "learning_rate": 9.771412680756397e-06, "loss": 2.8271, "mean_token_accuracy": 0.43392952482648156, "step": 1233 }, { "epoch": 0.22877271041898406, "grad_norm": 6.28125, "learning_rate": 9.771227289581016e-06, "loss": 2.832, "mean_token_accuracy": 0.4328850855745721, "step": 1234 }, { "epoch": 0.2289581015943641, "grad_norm": 7.5234375, "learning_rate": 9.771041898405637e-06, "loss": 3.2831, "mean_token_accuracy": 0.404125, "step": 1235 }, { "epoch": 0.22914349276974416, "grad_norm": 5.5390625, "learning_rate": 9.770856507230256e-06, "loss": 3.1512, "mean_token_accuracy": 0.39688625537353317, "step": 1236 }, { "epoch": 0.2293288839451242, "grad_norm": 6.58984375, "learning_rate": 9.770671116054876e-06, "loss": 2.8749, "mean_token_accuracy": 0.42575301204819277, "step": 1237 }, { "epoch": 0.22951427512050426, "grad_norm": 6.14453125, "learning_rate": 9.770485724879497e-06, "loss": 2.996, "mean_token_accuracy": 0.4151395545531435, "step": 1238 }, { "epoch": 0.2296996662958843, "grad_norm": 5.39453125, "learning_rate": 9.770300333704117e-06, "loss": 3.1185, "mean_token_accuracy": 0.39124326855537345, "step": 1239 }, { "epoch": 0.22988505747126436, "grad_norm": 9.96875, "learning_rate": 9.770114942528738e-06, "loss": 2.6221, "mean_token_accuracy": 0.4539731393396754, "step": 1240 }, { "epoch": 0.23007044864664442, "grad_norm": 6.39453125, "learning_rate": 9.769929551353356e-06, "loss": 2.9471, "mean_token_accuracy": 0.4212497325058849, "step": 1241 }, { "epoch": 0.23025583982202447, "grad_norm": 5.69921875, "learning_rate": 9.769744160177977e-06, "loss": 3.1697, "mean_token_accuracy": 0.4084507042253521, "step": 1242 }, { "epoch": 0.23044123099740452, "grad_norm": 5.9375, "learning_rate": 9.769558769002596e-06, "loss": 3.2961, "mean_token_accuracy": 0.3853113358169239, "step": 1243 }, { "epoch": 0.23062662217278457, "grad_norm": 7.0703125, "learning_rate": 9.769373377827216e-06, "loss": 3.0753, "mean_token_accuracy": 0.40752205540937936, "step": 1244 }, { "epoch": 0.23081201334816462, "grad_norm": 6.08984375, "learning_rate": 9.769187986651837e-06, "loss": 2.8466, "mean_token_accuracy": 0.46470323741007197, "step": 1245 }, { "epoch": 0.23099740452354467, "grad_norm": 6.36328125, "learning_rate": 9.769002595476455e-06, "loss": 3.1019, "mean_token_accuracy": 0.400490647040785, "step": 1246 }, { "epoch": 0.23118279569892472, "grad_norm": 6.15234375, "learning_rate": 9.768817204301076e-06, "loss": 2.9671, "mean_token_accuracy": 0.4104171690378587, "step": 1247 }, { "epoch": 0.23136818687430477, "grad_norm": 6.24609375, "learning_rate": 9.768631813125696e-06, "loss": 3.2391, "mean_token_accuracy": 0.40015255530129673, "step": 1248 }, { "epoch": 0.23155357804968482, "grad_norm": 6.89453125, "learning_rate": 9.768446421950317e-06, "loss": 3.279, "mean_token_accuracy": 0.38253638253638256, "step": 1249 }, { "epoch": 0.2317389692250649, "grad_norm": 7.25390625, "learning_rate": 9.768261030774936e-06, "loss": 3.0371, "mean_token_accuracy": 0.41655985644706484, "step": 1250 }, { "epoch": 0.23192436040044495, "grad_norm": 6.33984375, "learning_rate": 9.768075639599556e-06, "loss": 2.8232, "mean_token_accuracy": 0.45692940997103215, "step": 1251 }, { "epoch": 0.232109751575825, "grad_norm": 9.03125, "learning_rate": 9.767890248424175e-06, "loss": 2.6696, "mean_token_accuracy": 0.44629927594529367, "step": 1252 }, { "epoch": 0.23229514275120505, "grad_norm": 7.4609375, "learning_rate": 9.767704857248795e-06, "loss": 2.6691, "mean_token_accuracy": 0.4613948526470196, "step": 1253 }, { "epoch": 0.2324805339265851, "grad_norm": 7.44921875, "learning_rate": 9.767519466073416e-06, "loss": 2.8696, "mean_token_accuracy": 0.409873027174558, "step": 1254 }, { "epoch": 0.23266592510196515, "grad_norm": 5.17578125, "learning_rate": 9.767334074898036e-06, "loss": 2.9466, "mean_token_accuracy": 0.4135889846866555, "step": 1255 }, { "epoch": 0.2328513162773452, "grad_norm": 7.4609375, "learning_rate": 9.767148683722655e-06, "loss": 3.0508, "mean_token_accuracy": 0.4241460541813899, "step": 1256 }, { "epoch": 0.23303670745272526, "grad_norm": 9.6328125, "learning_rate": 9.766963292547276e-06, "loss": 2.5362, "mean_token_accuracy": 0.46022511698495006, "step": 1257 }, { "epoch": 0.2332220986281053, "grad_norm": 9.9921875, "learning_rate": 9.766777901371896e-06, "loss": 2.7343, "mean_token_accuracy": 0.45231062410671746, "step": 1258 }, { "epoch": 0.23340748980348536, "grad_norm": 6.421875, "learning_rate": 9.766592510196515e-06, "loss": 2.7206, "mean_token_accuracy": 0.438365947721754, "step": 1259 }, { "epoch": 0.2335928809788654, "grad_norm": 12.46875, "learning_rate": 9.766407119021135e-06, "loss": 2.9712, "mean_token_accuracy": 0.3971388783390303, "step": 1260 }, { "epoch": 0.23377827215424546, "grad_norm": 6.7890625, "learning_rate": 9.766221727845754e-06, "loss": 2.8796, "mean_token_accuracy": 0.426182092555332, "step": 1261 }, { "epoch": 0.2339636633296255, "grad_norm": 6.63671875, "learning_rate": 9.766036336670375e-06, "loss": 2.8516, "mean_token_accuracy": 0.4217195641875888, "step": 1262 }, { "epoch": 0.23414905450500556, "grad_norm": 7.84765625, "learning_rate": 9.765850945494995e-06, "loss": 2.8203, "mean_token_accuracy": 0.43216805644644, "step": 1263 }, { "epoch": 0.2343344456803856, "grad_norm": 8.1953125, "learning_rate": 9.765665554319616e-06, "loss": 3.1013, "mean_token_accuracy": 0.3959471112388617, "step": 1264 }, { "epoch": 0.23451983685576566, "grad_norm": 10.4375, "learning_rate": 9.765480163144235e-06, "loss": 2.2, "mean_token_accuracy": 0.5128857779191854, "step": 1265 }, { "epoch": 0.2347052280311457, "grad_norm": 6.26953125, "learning_rate": 9.765294771968855e-06, "loss": 2.6077, "mean_token_accuracy": 0.4615115465360392, "step": 1266 }, { "epoch": 0.23489061920652576, "grad_norm": 5.57421875, "learning_rate": 9.765109380793476e-06, "loss": 2.6928, "mean_token_accuracy": 0.4464652854657599, "step": 1267 }, { "epoch": 0.23507601038190581, "grad_norm": 7.76953125, "learning_rate": 9.764923989618094e-06, "loss": 3.0746, "mean_token_accuracy": 0.4199491740787802, "step": 1268 }, { "epoch": 0.23526140155728587, "grad_norm": 6.234375, "learning_rate": 9.764738598442715e-06, "loss": 3.3705, "mean_token_accuracy": 0.3713670613562971, "step": 1269 }, { "epoch": 0.23544679273266592, "grad_norm": 8.4609375, "learning_rate": 9.764553207267334e-06, "loss": 2.8245, "mean_token_accuracy": 0.43195439263265606, "step": 1270 }, { "epoch": 0.23563218390804597, "grad_norm": 6.828125, "learning_rate": 9.764367816091956e-06, "loss": 2.8508, "mean_token_accuracy": 0.4360630328080599, "step": 1271 }, { "epoch": 0.23581757508342602, "grad_norm": 6.20703125, "learning_rate": 9.764182424916575e-06, "loss": 2.4791, "mean_token_accuracy": 0.468463462072798, "step": 1272 }, { "epoch": 0.23600296625880607, "grad_norm": 9.46875, "learning_rate": 9.763997033741195e-06, "loss": 2.819, "mean_token_accuracy": 0.4251429992539169, "step": 1273 }, { "epoch": 0.23618835743418612, "grad_norm": 6.28515625, "learning_rate": 9.763811642565814e-06, "loss": 2.9617, "mean_token_accuracy": 0.4247799067840497, "step": 1274 }, { "epoch": 0.2363737486095662, "grad_norm": 6.19140625, "learning_rate": 9.763626251390434e-06, "loss": 2.2891, "mean_token_accuracy": 0.5138172819137966, "step": 1275 }, { "epoch": 0.23655913978494625, "grad_norm": 6.63671875, "learning_rate": 9.763440860215055e-06, "loss": 2.7527, "mean_token_accuracy": 0.4379596307307151, "step": 1276 }, { "epoch": 0.2367445309603263, "grad_norm": 6.84375, "learning_rate": 9.763255469039674e-06, "loss": 3.0745, "mean_token_accuracy": 0.4180413137167484, "step": 1277 }, { "epoch": 0.23692992213570635, "grad_norm": 5.80078125, "learning_rate": 9.763070077864294e-06, "loss": 2.9202, "mean_token_accuracy": 0.428454619787408, "step": 1278 }, { "epoch": 0.2371153133110864, "grad_norm": 6.23046875, "learning_rate": 9.762884686688915e-06, "loss": 2.6945, "mean_token_accuracy": 0.4513642669955295, "step": 1279 }, { "epoch": 0.23730070448646645, "grad_norm": 6.18359375, "learning_rate": 9.762699295513535e-06, "loss": 2.7081, "mean_token_accuracy": 0.4679776048067732, "step": 1280 }, { "epoch": 0.2374860956618465, "grad_norm": 12.7890625, "learning_rate": 9.762513904338154e-06, "loss": 2.842, "mean_token_accuracy": 0.43618162316767967, "step": 1281 }, { "epoch": 0.23767148683722655, "grad_norm": 7.0078125, "learning_rate": 9.762328513162774e-06, "loss": 3.2867, "mean_token_accuracy": 0.39192649495405935, "step": 1282 }, { "epoch": 0.2378568780126066, "grad_norm": 6.4453125, "learning_rate": 9.762143121987395e-06, "loss": 2.7202, "mean_token_accuracy": 0.45325260490002817, "step": 1283 }, { "epoch": 0.23804226918798665, "grad_norm": 7.87890625, "learning_rate": 9.761957730812014e-06, "loss": 3.1419, "mean_token_accuracy": 0.38686757319612886, "step": 1284 }, { "epoch": 0.2382276603633667, "grad_norm": 8.109375, "learning_rate": 9.761772339636634e-06, "loss": 2.7957, "mean_token_accuracy": 0.4225908372827804, "step": 1285 }, { "epoch": 0.23841305153874676, "grad_norm": 5.97265625, "learning_rate": 9.761586948461253e-06, "loss": 3.1813, "mean_token_accuracy": 0.3837990802971348, "step": 1286 }, { "epoch": 0.2385984427141268, "grad_norm": 6.9140625, "learning_rate": 9.761401557285875e-06, "loss": 2.5476, "mean_token_accuracy": 0.4707103825136612, "step": 1287 }, { "epoch": 0.23878383388950686, "grad_norm": 5.55859375, "learning_rate": 9.761216166110494e-06, "loss": 3.4871, "mean_token_accuracy": 0.3607650685793381, "step": 1288 }, { "epoch": 0.2389692250648869, "grad_norm": 6.7421875, "learning_rate": 9.761030774935114e-06, "loss": 3.2974, "mean_token_accuracy": 0.38389800495809234, "step": 1289 }, { "epoch": 0.23915461624026696, "grad_norm": 5.3828125, "learning_rate": 9.760845383759733e-06, "loss": 3.3738, "mean_token_accuracy": 0.37438625204582654, "step": 1290 }, { "epoch": 0.239340007415647, "grad_norm": 11.28125, "learning_rate": 9.760659992584354e-06, "loss": 2.8283, "mean_token_accuracy": 0.42223692918596956, "step": 1291 }, { "epoch": 0.23952539859102706, "grad_norm": 6.765625, "learning_rate": 9.760474601408974e-06, "loss": 2.8955, "mean_token_accuracy": 0.44009746852578857, "step": 1292 }, { "epoch": 0.2397107897664071, "grad_norm": 5.83203125, "learning_rate": 9.760289210233593e-06, "loss": 2.6797, "mean_token_accuracy": 0.4535785785785786, "step": 1293 }, { "epoch": 0.23989618094178716, "grad_norm": 5.9375, "learning_rate": 9.760103819058214e-06, "loss": 3.1694, "mean_token_accuracy": 0.4032159824582775, "step": 1294 }, { "epoch": 0.2400815721171672, "grad_norm": 6.390625, "learning_rate": 9.759918427882834e-06, "loss": 2.8147, "mean_token_accuracy": 0.4276430496116659, "step": 1295 }, { "epoch": 0.24026696329254726, "grad_norm": 5.81640625, "learning_rate": 9.759733036707455e-06, "loss": 2.7573, "mean_token_accuracy": 0.44427414057368075, "step": 1296 }, { "epoch": 0.24045235446792732, "grad_norm": 8.109375, "learning_rate": 9.759547645532073e-06, "loss": 2.5628, "mean_token_accuracy": 0.45892900411921494, "step": 1297 }, { "epoch": 0.24063774564330737, "grad_norm": 7.2578125, "learning_rate": 9.759362254356694e-06, "loss": 2.6494, "mean_token_accuracy": 0.46351824087956023, "step": 1298 }, { "epoch": 0.24082313681868742, "grad_norm": 6.3515625, "learning_rate": 9.759176863181313e-06, "loss": 2.7979, "mean_token_accuracy": 0.45256453234998323, "step": 1299 }, { "epoch": 0.2410085279940675, "grad_norm": 6.58203125, "learning_rate": 9.758991472005933e-06, "loss": 3.4636, "mean_token_accuracy": 0.3567902975602877, "step": 1300 }, { "epoch": 0.24119391916944755, "grad_norm": 10.4609375, "learning_rate": 9.758806080830554e-06, "loss": 2.7455, "mean_token_accuracy": 0.44208829365079366, "step": 1301 }, { "epoch": 0.2413793103448276, "grad_norm": 8.2265625, "learning_rate": 9.758620689655172e-06, "loss": 2.7685, "mean_token_accuracy": 0.4325964010282776, "step": 1302 }, { "epoch": 0.24156470152020765, "grad_norm": 5.99609375, "learning_rate": 9.758435298479793e-06, "loss": 2.6965, "mean_token_accuracy": 0.44941574415744157, "step": 1303 }, { "epoch": 0.2417500926955877, "grad_norm": 5.52734375, "learning_rate": 9.758249907304413e-06, "loss": 2.7061, "mean_token_accuracy": 0.43315508021390375, "step": 1304 }, { "epoch": 0.24193548387096775, "grad_norm": 6.0234375, "learning_rate": 9.758064516129034e-06, "loss": 2.7839, "mean_token_accuracy": 0.4376158940397351, "step": 1305 }, { "epoch": 0.2421208750463478, "grad_norm": 6.1875, "learning_rate": 9.757879124953653e-06, "loss": 2.2454, "mean_token_accuracy": 0.5157614325884268, "step": 1306 }, { "epoch": 0.24230626622172785, "grad_norm": 8.640625, "learning_rate": 9.757693733778273e-06, "loss": 3.3332, "mean_token_accuracy": 0.3804717623043774, "step": 1307 }, { "epoch": 0.2424916573971079, "grad_norm": 6.8046875, "learning_rate": 9.757508342602892e-06, "loss": 3.3125, "mean_token_accuracy": 0.36640094478889873, "step": 1308 }, { "epoch": 0.24267704857248795, "grad_norm": 9.09375, "learning_rate": 9.757322951427512e-06, "loss": 3.2817, "mean_token_accuracy": 0.414235548352242, "step": 1309 }, { "epoch": 0.242862439747868, "grad_norm": 7.89453125, "learning_rate": 9.757137560252133e-06, "loss": 2.6533, "mean_token_accuracy": 0.459037711313394, "step": 1310 }, { "epoch": 0.24304783092324805, "grad_norm": 6.0234375, "learning_rate": 9.756952169076753e-06, "loss": 3.1318, "mean_token_accuracy": 0.3974814814814815, "step": 1311 }, { "epoch": 0.2432332220986281, "grad_norm": 5.12109375, "learning_rate": 9.756766777901372e-06, "loss": 2.9293, "mean_token_accuracy": 0.4233637116818558, "step": 1312 }, { "epoch": 0.24341861327400816, "grad_norm": 7.2890625, "learning_rate": 9.756581386725993e-06, "loss": 3.1417, "mean_token_accuracy": 0.3913650645874549, "step": 1313 }, { "epoch": 0.2436040044493882, "grad_norm": 6.51953125, "learning_rate": 9.756395995550613e-06, "loss": 2.8922, "mean_token_accuracy": 0.42792228141648386, "step": 1314 }, { "epoch": 0.24378939562476826, "grad_norm": 5.84765625, "learning_rate": 9.756210604375232e-06, "loss": 2.9139, "mean_token_accuracy": 0.41500443262411346, "step": 1315 }, { "epoch": 0.2439747868001483, "grad_norm": 5.44921875, "learning_rate": 9.756025213199852e-06, "loss": 3.0479, "mean_token_accuracy": 0.4154043097930446, "step": 1316 }, { "epoch": 0.24416017797552836, "grad_norm": 11.21875, "learning_rate": 9.755839822024471e-06, "loss": 2.8901, "mean_token_accuracy": 0.4377629971494503, "step": 1317 }, { "epoch": 0.2443455691509084, "grad_norm": 6.60546875, "learning_rate": 9.755654430849092e-06, "loss": 2.859, "mean_token_accuracy": 0.4385823600483286, "step": 1318 }, { "epoch": 0.24453096032628846, "grad_norm": 11.6171875, "learning_rate": 9.755469039673712e-06, "loss": 2.7477, "mean_token_accuracy": 0.4397597898160891, "step": 1319 }, { "epoch": 0.2447163515016685, "grad_norm": 7.5078125, "learning_rate": 9.755283648498333e-06, "loss": 3.0473, "mean_token_accuracy": 0.4154910096818811, "step": 1320 }, { "epoch": 0.24490174267704856, "grad_norm": 6.65234375, "learning_rate": 9.755098257322953e-06, "loss": 3.5174, "mean_token_accuracy": 0.3668742701440249, "step": 1321 }, { "epoch": 0.2450871338524286, "grad_norm": 5.94921875, "learning_rate": 9.754912866147572e-06, "loss": 3.4391, "mean_token_accuracy": 0.3920505871725384, "step": 1322 }, { "epoch": 0.24527252502780866, "grad_norm": 4.921875, "learning_rate": 9.754727474972193e-06, "loss": 3.4771, "mean_token_accuracy": 0.37844001824540063, "step": 1323 }, { "epoch": 0.24545791620318874, "grad_norm": 8.0078125, "learning_rate": 9.754542083796811e-06, "loss": 2.6134, "mean_token_accuracy": 0.4582752944156009, "step": 1324 }, { "epoch": 0.2456433073785688, "grad_norm": 8.625, "learning_rate": 9.754356692621432e-06, "loss": 2.4128, "mean_token_accuracy": 0.4834384858044164, "step": 1325 }, { "epoch": 0.24582869855394884, "grad_norm": 9.578125, "learning_rate": 9.754171301446052e-06, "loss": 2.2431, "mean_token_accuracy": 0.49921895339755273, "step": 1326 }, { "epoch": 0.2460140897293289, "grad_norm": 13.484375, "learning_rate": 9.753985910270673e-06, "loss": 3.1383, "mean_token_accuracy": 0.42633156559833985, "step": 1327 }, { "epoch": 0.24619948090470894, "grad_norm": 7.03125, "learning_rate": 9.753800519095292e-06, "loss": 2.7401, "mean_token_accuracy": 0.4392338943702844, "step": 1328 }, { "epoch": 0.246384872080089, "grad_norm": 6.89453125, "learning_rate": 9.753615127919912e-06, "loss": 2.7483, "mean_token_accuracy": 0.43293539325842695, "step": 1329 }, { "epoch": 0.24657026325546905, "grad_norm": 6.3984375, "learning_rate": 9.753429736744533e-06, "loss": 2.4277, "mean_token_accuracy": 0.49623865110246435, "step": 1330 }, { "epoch": 0.2467556544308491, "grad_norm": 5.31640625, "learning_rate": 9.753244345569151e-06, "loss": 2.7119, "mean_token_accuracy": 0.4484464172479391, "step": 1331 }, { "epoch": 0.24694104560622915, "grad_norm": 6.2265625, "learning_rate": 9.753058954393772e-06, "loss": 2.8247, "mean_token_accuracy": 0.43430369787568845, "step": 1332 }, { "epoch": 0.2471264367816092, "grad_norm": 6.14453125, "learning_rate": 9.75287356321839e-06, "loss": 2.9627, "mean_token_accuracy": 0.42333004277722935, "step": 1333 }, { "epoch": 0.24731182795698925, "grad_norm": 6.52734375, "learning_rate": 9.752688172043011e-06, "loss": 2.7786, "mean_token_accuracy": 0.46099806201550386, "step": 1334 }, { "epoch": 0.2474972191323693, "grad_norm": 5.91796875, "learning_rate": 9.752502780867632e-06, "loss": 2.8768, "mean_token_accuracy": 0.4107065452969224, "step": 1335 }, { "epoch": 0.24768261030774935, "grad_norm": 6.6484375, "learning_rate": 9.752317389692252e-06, "loss": 2.73, "mean_token_accuracy": 0.4367726920093095, "step": 1336 }, { "epoch": 0.2478680014831294, "grad_norm": 8.46875, "learning_rate": 9.752131998516871e-06, "loss": 2.9268, "mean_token_accuracy": 0.4272496642292195, "step": 1337 }, { "epoch": 0.24805339265850945, "grad_norm": 6.2109375, "learning_rate": 9.751946607341491e-06, "loss": 3.1549, "mean_token_accuracy": 0.408105147864184, "step": 1338 }, { "epoch": 0.2482387838338895, "grad_norm": 5.72265625, "learning_rate": 9.751761216166112e-06, "loss": 2.8773, "mean_token_accuracy": 0.43840800879013553, "step": 1339 }, { "epoch": 0.24842417500926955, "grad_norm": 5.6171875, "learning_rate": 9.75157582499073e-06, "loss": 3.3699, "mean_token_accuracy": 0.3798907426546582, "step": 1340 }, { "epoch": 0.2486095661846496, "grad_norm": 5.484375, "learning_rate": 9.751390433815351e-06, "loss": 2.9551, "mean_token_accuracy": 0.4093256603216763, "step": 1341 }, { "epoch": 0.24879495736002966, "grad_norm": 5.96875, "learning_rate": 9.75120504263997e-06, "loss": 2.7049, "mean_token_accuracy": 0.45079138402951324, "step": 1342 }, { "epoch": 0.2489803485354097, "grad_norm": 6.171875, "learning_rate": 9.751019651464592e-06, "loss": 2.9436, "mean_token_accuracy": 0.418928133096959, "step": 1343 }, { "epoch": 0.24916573971078976, "grad_norm": 8.6171875, "learning_rate": 9.750834260289211e-06, "loss": 2.6344, "mean_token_accuracy": 0.45914198161389175, "step": 1344 }, { "epoch": 0.2493511308861698, "grad_norm": 5.96875, "learning_rate": 9.750648869113831e-06, "loss": 2.7301, "mean_token_accuracy": 0.44885799404170806, "step": 1345 }, { "epoch": 0.24953652206154986, "grad_norm": 5.296875, "learning_rate": 9.75046347793845e-06, "loss": 2.7683, "mean_token_accuracy": 0.43739515022113773, "step": 1346 }, { "epoch": 0.2497219132369299, "grad_norm": 10.84375, "learning_rate": 9.75027808676307e-06, "loss": 3.3328, "mean_token_accuracy": 0.4147045420021267, "step": 1347 }, { "epoch": 0.24990730441230996, "grad_norm": 8.3828125, "learning_rate": 9.750092695587691e-06, "loss": 3.163, "mean_token_accuracy": 0.390068233510235, "step": 1348 }, { "epoch": 0.25009269558769004, "grad_norm": 8.765625, "learning_rate": 9.74990730441231e-06, "loss": 3.2138, "mean_token_accuracy": 0.38623503092512174, "step": 1349 }, { "epoch": 0.25027808676307006, "grad_norm": 6.94921875, "learning_rate": 9.74972191323693e-06, "loss": 2.5857, "mean_token_accuracy": 0.4518430439952438, "step": 1350 }, { "epoch": 0.25046347793845014, "grad_norm": 8.6953125, "learning_rate": 9.749536522061551e-06, "loss": 2.9861, "mean_token_accuracy": 0.41586292976285033, "step": 1351 }, { "epoch": 0.25064886911383016, "grad_norm": 12.875, "learning_rate": 9.749351130886172e-06, "loss": 2.8859, "mean_token_accuracy": 0.4280434539142121, "step": 1352 }, { "epoch": 0.25083426028921024, "grad_norm": 9.796875, "learning_rate": 9.74916573971079e-06, "loss": 3.4162, "mean_token_accuracy": 0.36652010050251255, "step": 1353 }, { "epoch": 0.25101965146459027, "grad_norm": 5.6875, "learning_rate": 9.74898034853541e-06, "loss": 2.991, "mean_token_accuracy": 0.40743527995183626, "step": 1354 }, { "epoch": 0.25120504263997034, "grad_norm": 11.6875, "learning_rate": 9.74879495736003e-06, "loss": 2.7729, "mean_token_accuracy": 0.4278676099039919, "step": 1355 }, { "epoch": 0.25139043381535037, "grad_norm": 10.28125, "learning_rate": 9.74860956618465e-06, "loss": 2.769, "mean_token_accuracy": 0.44195710455764076, "step": 1356 }, { "epoch": 0.25157582499073045, "grad_norm": 5.84765625, "learning_rate": 9.74842417500927e-06, "loss": 3.0844, "mean_token_accuracy": 0.40223727745391524, "step": 1357 }, { "epoch": 0.25176121616611047, "grad_norm": 9.578125, "learning_rate": 9.74823878383389e-06, "loss": 3.0617, "mean_token_accuracy": 0.39359449444150346, "step": 1358 }, { "epoch": 0.25194660734149055, "grad_norm": 7.79296875, "learning_rate": 9.748053392658512e-06, "loss": 2.722, "mean_token_accuracy": 0.4307601649970536, "step": 1359 }, { "epoch": 0.25213199851687057, "grad_norm": 8.5234375, "learning_rate": 9.74786800148313e-06, "loss": 2.6621, "mean_token_accuracy": 0.44982698961937717, "step": 1360 }, { "epoch": 0.25231738969225065, "grad_norm": 7.453125, "learning_rate": 9.747682610307751e-06, "loss": 2.4154, "mean_token_accuracy": 0.48718294657312466, "step": 1361 }, { "epoch": 0.2525027808676307, "grad_norm": 5.8125, "learning_rate": 9.74749721913237e-06, "loss": 2.762, "mean_token_accuracy": 0.4316137566137566, "step": 1362 }, { "epoch": 0.25268817204301075, "grad_norm": 8.21875, "learning_rate": 9.74731182795699e-06, "loss": 2.4951, "mean_token_accuracy": 0.47023953544640695, "step": 1363 }, { "epoch": 0.25287356321839083, "grad_norm": 10.625, "learning_rate": 9.74712643678161e-06, "loss": 2.5648, "mean_token_accuracy": 0.4565192285483312, "step": 1364 }, { "epoch": 0.25305895439377085, "grad_norm": 7.83984375, "learning_rate": 9.74694104560623e-06, "loss": 2.6409, "mean_token_accuracy": 0.4434259954921112, "step": 1365 }, { "epoch": 0.25324434556915093, "grad_norm": 5.21875, "learning_rate": 9.74675565443085e-06, "loss": 3.0594, "mean_token_accuracy": 0.425772921108742, "step": 1366 }, { "epoch": 0.25342973674453095, "grad_norm": 8.453125, "learning_rate": 9.746570263255469e-06, "loss": 2.5717, "mean_token_accuracy": 0.47274540217150457, "step": 1367 }, { "epoch": 0.25361512791991103, "grad_norm": 6.94921875, "learning_rate": 9.746384872080091e-06, "loss": 3.0585, "mean_token_accuracy": 0.409443185882914, "step": 1368 }, { "epoch": 0.25380051909529106, "grad_norm": 8.8125, "learning_rate": 9.74619948090471e-06, "loss": 2.9503, "mean_token_accuracy": 0.41759367681498827, "step": 1369 }, { "epoch": 0.25398591027067113, "grad_norm": 5.33203125, "learning_rate": 9.74601408972933e-06, "loss": 2.6677, "mean_token_accuracy": 0.46084710743801655, "step": 1370 }, { "epoch": 0.25417130144605116, "grad_norm": 7.37109375, "learning_rate": 9.745828698553949e-06, "loss": 2.63, "mean_token_accuracy": 0.46974107553240385, "step": 1371 }, { "epoch": 0.25435669262143124, "grad_norm": 7.20703125, "learning_rate": 9.74564330737857e-06, "loss": 2.5187, "mean_token_accuracy": 0.4754081632653061, "step": 1372 }, { "epoch": 0.25454208379681126, "grad_norm": 5.7578125, "learning_rate": 9.74545791620319e-06, "loss": 2.7271, "mean_token_accuracy": 0.44862518089725034, "step": 1373 }, { "epoch": 0.25472747497219134, "grad_norm": 7.703125, "learning_rate": 9.745272525027809e-06, "loss": 2.508, "mean_token_accuracy": 0.4699866065992938, "step": 1374 }, { "epoch": 0.25491286614757136, "grad_norm": 5.9375, "learning_rate": 9.74508713385243e-06, "loss": 3.0368, "mean_token_accuracy": 0.41518443356810314, "step": 1375 }, { "epoch": 0.25509825732295144, "grad_norm": 7.25, "learning_rate": 9.74490174267705e-06, "loss": 2.7556, "mean_token_accuracy": 0.4320270924044509, "step": 1376 }, { "epoch": 0.25528364849833146, "grad_norm": 6.1640625, "learning_rate": 9.74471635150167e-06, "loss": 2.841, "mean_token_accuracy": 0.4222616933096507, "step": 1377 }, { "epoch": 0.25546903967371154, "grad_norm": 6.140625, "learning_rate": 9.744530960326289e-06, "loss": 3.1002, "mean_token_accuracy": 0.40975118653818493, "step": 1378 }, { "epoch": 0.25565443084909156, "grad_norm": 6.98046875, "learning_rate": 9.74434556915091e-06, "loss": 2.8626, "mean_token_accuracy": 0.4357833358907473, "step": 1379 }, { "epoch": 0.25583982202447164, "grad_norm": 8.0234375, "learning_rate": 9.744160177975528e-06, "loss": 2.7693, "mean_token_accuracy": 0.45619126589275844, "step": 1380 }, { "epoch": 0.25602521319985166, "grad_norm": 6.13671875, "learning_rate": 9.743974786800149e-06, "loss": 2.7307, "mean_token_accuracy": 0.43463391885589925, "step": 1381 }, { "epoch": 0.25621060437523174, "grad_norm": 6.14453125, "learning_rate": 9.74378939562477e-06, "loss": 2.795, "mean_token_accuracy": 0.4477874034184032, "step": 1382 }, { "epoch": 0.25639599555061177, "grad_norm": 5.65234375, "learning_rate": 9.743604004449388e-06, "loss": 2.4475, "mean_token_accuracy": 0.4994080292756431, "step": 1383 }, { "epoch": 0.25658138672599184, "grad_norm": 7.2890625, "learning_rate": 9.743418613274009e-06, "loss": 2.8074, "mean_token_accuracy": 0.4427828348504551, "step": 1384 }, { "epoch": 0.25676677790137187, "grad_norm": 9.359375, "learning_rate": 9.743233222098629e-06, "loss": 2.7038, "mean_token_accuracy": 0.44256410256410256, "step": 1385 }, { "epoch": 0.25695216907675195, "grad_norm": 7.14453125, "learning_rate": 9.74304783092325e-06, "loss": 3.1913, "mean_token_accuracy": 0.40829756795422034, "step": 1386 }, { "epoch": 0.257137560252132, "grad_norm": 8.4609375, "learning_rate": 9.742862439747868e-06, "loss": 2.4865, "mean_token_accuracy": 0.4666666666666667, "step": 1387 }, { "epoch": 0.25732295142751205, "grad_norm": 5.6015625, "learning_rate": 9.742677048572489e-06, "loss": 2.7275, "mean_token_accuracy": 0.43402292423818845, "step": 1388 }, { "epoch": 0.2575083426028921, "grad_norm": 6.515625, "learning_rate": 9.742491657397108e-06, "loss": 2.8597, "mean_token_accuracy": 0.44301730706433, "step": 1389 }, { "epoch": 0.25769373377827215, "grad_norm": 8.6796875, "learning_rate": 9.742306266221728e-06, "loss": 2.7388, "mean_token_accuracy": 0.42249240121580545, "step": 1390 }, { "epoch": 0.25787912495365223, "grad_norm": 8.2734375, "learning_rate": 9.742120875046349e-06, "loss": 2.9099, "mean_token_accuracy": 0.41530627527332636, "step": 1391 }, { "epoch": 0.25806451612903225, "grad_norm": 6.96484375, "learning_rate": 9.74193548387097e-06, "loss": 2.7478, "mean_token_accuracy": 0.45724076281287246, "step": 1392 }, { "epoch": 0.25824990730441233, "grad_norm": 9.578125, "learning_rate": 9.741750092695588e-06, "loss": 2.9084, "mean_token_accuracy": 0.41853372434017594, "step": 1393 }, { "epoch": 0.25843529847979235, "grad_norm": 6.71484375, "learning_rate": 9.741564701520208e-06, "loss": 3.1329, "mean_token_accuracy": 0.4059267867518884, "step": 1394 }, { "epoch": 0.25862068965517243, "grad_norm": 7.83984375, "learning_rate": 9.741379310344829e-06, "loss": 3.1425, "mean_token_accuracy": 0.3849829351535836, "step": 1395 }, { "epoch": 0.25880608083055245, "grad_norm": 6.578125, "learning_rate": 9.741193919169448e-06, "loss": 2.8825, "mean_token_accuracy": 0.4265486725663717, "step": 1396 }, { "epoch": 0.25899147200593253, "grad_norm": 8.828125, "learning_rate": 9.741008527994068e-06, "loss": 2.7531, "mean_token_accuracy": 0.43152962515114873, "step": 1397 }, { "epoch": 0.25917686318131256, "grad_norm": 8.15625, "learning_rate": 9.740823136818687e-06, "loss": 2.5507, "mean_token_accuracy": 0.450074294205052, "step": 1398 }, { "epoch": 0.25936225435669263, "grad_norm": 8.4296875, "learning_rate": 9.740637745643308e-06, "loss": 2.8377, "mean_token_accuracy": 0.4302340960190451, "step": 1399 }, { "epoch": 0.25954764553207266, "grad_norm": 7.1875, "learning_rate": 9.740452354467928e-06, "loss": 2.8668, "mean_token_accuracy": 0.42975495915985995, "step": 1400 }, { "epoch": 0.25973303670745274, "grad_norm": 8.8125, "learning_rate": 9.740266963292549e-06, "loss": 3.134, "mean_token_accuracy": 0.3986462415390096, "step": 1401 }, { "epoch": 0.25991842788283276, "grad_norm": 10.2265625, "learning_rate": 9.740081572117169e-06, "loss": 2.8421, "mean_token_accuracy": 0.4289079822616408, "step": 1402 }, { "epoch": 0.26010381905821284, "grad_norm": 6.34765625, "learning_rate": 9.739896180941788e-06, "loss": 2.7373, "mean_token_accuracy": 0.4688512783579402, "step": 1403 }, { "epoch": 0.26028921023359286, "grad_norm": 10.5, "learning_rate": 9.739710789766408e-06, "loss": 2.878, "mean_token_accuracy": 0.42034638789202683, "step": 1404 }, { "epoch": 0.26047460140897294, "grad_norm": 8.578125, "learning_rate": 9.739525398591027e-06, "loss": 2.4952, "mean_token_accuracy": 0.46924726328856786, "step": 1405 }, { "epoch": 0.26065999258435296, "grad_norm": 6.1640625, "learning_rate": 9.739340007415648e-06, "loss": 2.6958, "mean_token_accuracy": 0.44245943129373233, "step": 1406 }, { "epoch": 0.26084538375973304, "grad_norm": 5.765625, "learning_rate": 9.739154616240268e-06, "loss": 2.801, "mean_token_accuracy": 0.430661659976693, "step": 1407 }, { "epoch": 0.26103077493511306, "grad_norm": 7.328125, "learning_rate": 9.738969225064889e-06, "loss": 2.4835, "mean_token_accuracy": 0.480924568162048, "step": 1408 }, { "epoch": 0.26121616611049314, "grad_norm": 6.109375, "learning_rate": 9.738783833889507e-06, "loss": 3.0193, "mean_token_accuracy": 0.4113686391298718, "step": 1409 }, { "epoch": 0.26140155728587317, "grad_norm": 5.4609375, "learning_rate": 9.738598442714128e-06, "loss": 3.0447, "mean_token_accuracy": 0.41317440401505645, "step": 1410 }, { "epoch": 0.26158694846125324, "grad_norm": 5.95703125, "learning_rate": 9.738413051538748e-06, "loss": 3.2015, "mean_token_accuracy": 0.40329271454498317, "step": 1411 }, { "epoch": 0.2617723396366333, "grad_norm": 7.953125, "learning_rate": 9.738227660363367e-06, "loss": 2.9105, "mean_token_accuracy": 0.42351323478858716, "step": 1412 }, { "epoch": 0.26195773081201335, "grad_norm": 8.34375, "learning_rate": 9.738042269187988e-06, "loss": 2.6713, "mean_token_accuracy": 0.4406269592476489, "step": 1413 }, { "epoch": 0.2621431219873934, "grad_norm": 7.45703125, "learning_rate": 9.737856878012606e-06, "loss": 2.7808, "mean_token_accuracy": 0.43470550981633943, "step": 1414 }, { "epoch": 0.26232851316277345, "grad_norm": 5.7421875, "learning_rate": 9.737671486837227e-06, "loss": 3.4136, "mean_token_accuracy": 0.3727969348659004, "step": 1415 }, { "epoch": 0.2625139043381535, "grad_norm": 6.578125, "learning_rate": 9.737486095661847e-06, "loss": 3.2447, "mean_token_accuracy": 0.39578976718248987, "step": 1416 }, { "epoch": 0.26269929551353355, "grad_norm": 5.47265625, "learning_rate": 9.737300704486468e-06, "loss": 2.8844, "mean_token_accuracy": 0.43197332909985714, "step": 1417 }, { "epoch": 0.2628846866889136, "grad_norm": 6.51171875, "learning_rate": 9.737115313311087e-06, "loss": 2.7373, "mean_token_accuracy": 0.4543935850317407, "step": 1418 }, { "epoch": 0.26307007786429365, "grad_norm": 7.1875, "learning_rate": 9.736929922135707e-06, "loss": 2.3903, "mean_token_accuracy": 0.4763619467998606, "step": 1419 }, { "epoch": 0.26325546903967373, "grad_norm": 7.484375, "learning_rate": 9.736744530960328e-06, "loss": 3.2405, "mean_token_accuracy": 0.3924422673198041, "step": 1420 }, { "epoch": 0.26344086021505375, "grad_norm": 6.7578125, "learning_rate": 9.736559139784946e-06, "loss": 2.9292, "mean_token_accuracy": 0.42827523649086435, "step": 1421 }, { "epoch": 0.26362625139043383, "grad_norm": 5.89453125, "learning_rate": 9.736373748609567e-06, "loss": 2.8133, "mean_token_accuracy": 0.4319627618308767, "step": 1422 }, { "epoch": 0.26381164256581385, "grad_norm": 7.66015625, "learning_rate": 9.736188357434186e-06, "loss": 2.2175, "mean_token_accuracy": 0.5247799348848426, "step": 1423 }, { "epoch": 0.26399703374119393, "grad_norm": 5.9140625, "learning_rate": 9.736002966258808e-06, "loss": 2.9609, "mean_token_accuracy": 0.4266355140186916, "step": 1424 }, { "epoch": 0.26418242491657395, "grad_norm": 8.7265625, "learning_rate": 9.735817575083427e-06, "loss": 2.7507, "mean_token_accuracy": 0.44058205335489087, "step": 1425 }, { "epoch": 0.26436781609195403, "grad_norm": 5.9140625, "learning_rate": 9.735632183908047e-06, "loss": 2.8315, "mean_token_accuracy": 0.45780809728607447, "step": 1426 }, { "epoch": 0.26455320726733406, "grad_norm": 5.3515625, "learning_rate": 9.735446792732666e-06, "loss": 2.9292, "mean_token_accuracy": 0.41445945945945944, "step": 1427 }, { "epoch": 0.26473859844271413, "grad_norm": 6.02734375, "learning_rate": 9.735261401557287e-06, "loss": 3.3309, "mean_token_accuracy": 0.37969094922737306, "step": 1428 }, { "epoch": 0.26492398961809416, "grad_norm": 5.86328125, "learning_rate": 9.735076010381907e-06, "loss": 2.6712, "mean_token_accuracy": 0.45440470661093046, "step": 1429 }, { "epoch": 0.26510938079347424, "grad_norm": 7.6640625, "learning_rate": 9.734890619206526e-06, "loss": 2.9152, "mean_token_accuracy": 0.4271628125366827, "step": 1430 }, { "epoch": 0.26529477196885426, "grad_norm": 7.703125, "learning_rate": 9.734705228031146e-06, "loss": 3.0648, "mean_token_accuracy": 0.4190012180267966, "step": 1431 }, { "epoch": 0.26548016314423434, "grad_norm": 7.69921875, "learning_rate": 9.734519836855767e-06, "loss": 2.6966, "mean_token_accuracy": 0.4501140250855188, "step": 1432 }, { "epoch": 0.26566555431961436, "grad_norm": 9.7265625, "learning_rate": 9.734334445680387e-06, "loss": 2.7761, "mean_token_accuracy": 0.43670392513647, "step": 1433 }, { "epoch": 0.26585094549499444, "grad_norm": 10.53125, "learning_rate": 9.734149054505006e-06, "loss": 3.0316, "mean_token_accuracy": 0.40456284540172877, "step": 1434 }, { "epoch": 0.26603633667037446, "grad_norm": 12.7109375, "learning_rate": 9.733963663329627e-06, "loss": 3.2042, "mean_token_accuracy": 0.3867345877949891, "step": 1435 }, { "epoch": 0.26622172784575454, "grad_norm": 14.8125, "learning_rate": 9.733778272154245e-06, "loss": 2.9063, "mean_token_accuracy": 0.42193010580314205, "step": 1436 }, { "epoch": 0.2664071190211346, "grad_norm": 13.6484375, "learning_rate": 9.733592880978866e-06, "loss": 2.8842, "mean_token_accuracy": 0.419578745811393, "step": 1437 }, { "epoch": 0.26659251019651464, "grad_norm": 7.6796875, "learning_rate": 9.733407489803486e-06, "loss": 2.8189, "mean_token_accuracy": 0.45008299739151053, "step": 1438 }, { "epoch": 0.2667779013718947, "grad_norm": 8.5078125, "learning_rate": 9.733222098628105e-06, "loss": 2.9613, "mean_token_accuracy": 0.4184129645152277, "step": 1439 }, { "epoch": 0.26696329254727474, "grad_norm": 11.828125, "learning_rate": 9.733036707452727e-06, "loss": 2.6886, "mean_token_accuracy": 0.44348598249397436, "step": 1440 }, { "epoch": 0.2671486837226548, "grad_norm": 7.8671875, "learning_rate": 9.732851316277346e-06, "loss": 3.1343, "mean_token_accuracy": 0.4058238490720804, "step": 1441 }, { "epoch": 0.26733407489803485, "grad_norm": 7.24609375, "learning_rate": 9.732665925101967e-06, "loss": 2.6312, "mean_token_accuracy": 0.44463832487309646, "step": 1442 }, { "epoch": 0.2675194660734149, "grad_norm": 5.7734375, "learning_rate": 9.732480533926585e-06, "loss": 3.0352, "mean_token_accuracy": 0.41620846282572316, "step": 1443 }, { "epoch": 0.26770485724879495, "grad_norm": 7.19921875, "learning_rate": 9.732295142751206e-06, "loss": 2.8949, "mean_token_accuracy": 0.4381679389312977, "step": 1444 }, { "epoch": 0.267890248424175, "grad_norm": 10.4609375, "learning_rate": 9.732109751575826e-06, "loss": 2.3839, "mean_token_accuracy": 0.4860557768924303, "step": 1445 }, { "epoch": 0.26807563959955505, "grad_norm": 7.82421875, "learning_rate": 9.731924360400445e-06, "loss": 3.2139, "mean_token_accuracy": 0.39789695057833857, "step": 1446 }, { "epoch": 0.2682610307749351, "grad_norm": 6.6875, "learning_rate": 9.731738969225066e-06, "loss": 3.0859, "mean_token_accuracy": 0.4082422901396764, "step": 1447 }, { "epoch": 0.26844642195031515, "grad_norm": 11.6640625, "learning_rate": 9.731553578049686e-06, "loss": 2.4961, "mean_token_accuracy": 0.4647495361781076, "step": 1448 }, { "epoch": 0.26863181312569523, "grad_norm": 6.8828125, "learning_rate": 9.731368186874307e-06, "loss": 2.6263, "mean_token_accuracy": 0.46710291493158834, "step": 1449 }, { "epoch": 0.26881720430107525, "grad_norm": 6.015625, "learning_rate": 9.731182795698925e-06, "loss": 2.7822, "mean_token_accuracy": 0.45086133860491384, "step": 1450 }, { "epoch": 0.26900259547645533, "grad_norm": 7.38671875, "learning_rate": 9.730997404523546e-06, "loss": 2.6993, "mean_token_accuracy": 0.4557204404021063, "step": 1451 }, { "epoch": 0.26918798665183535, "grad_norm": 7.96484375, "learning_rate": 9.730812013348165e-06, "loss": 2.765, "mean_token_accuracy": 0.43163357400722024, "step": 1452 }, { "epoch": 0.26937337782721543, "grad_norm": 7.8125, "learning_rate": 9.730626622172785e-06, "loss": 3.3332, "mean_token_accuracy": 0.38722490606548576, "step": 1453 }, { "epoch": 0.26955876900259546, "grad_norm": 6.484375, "learning_rate": 9.730441230997406e-06, "loss": 2.83, "mean_token_accuracy": 0.43372591006423983, "step": 1454 }, { "epoch": 0.26974416017797553, "grad_norm": 7.3984375, "learning_rate": 9.730255839822025e-06, "loss": 2.6394, "mean_token_accuracy": 0.4675309229305423, "step": 1455 }, { "epoch": 0.26992955135335556, "grad_norm": 12.46875, "learning_rate": 9.730070448646645e-06, "loss": 2.6017, "mean_token_accuracy": 0.4224, "step": 1456 }, { "epoch": 0.27011494252873564, "grad_norm": 9.84375, "learning_rate": 9.729885057471266e-06, "loss": 2.7468, "mean_token_accuracy": 0.4348642403235124, "step": 1457 }, { "epoch": 0.27030033370411566, "grad_norm": 5.75390625, "learning_rate": 9.729699666295886e-06, "loss": 2.808, "mean_token_accuracy": 0.4474291140957808, "step": 1458 }, { "epoch": 0.27048572487949574, "grad_norm": 10.015625, "learning_rate": 9.729514275120505e-06, "loss": 3.0718, "mean_token_accuracy": 0.4106830122591944, "step": 1459 }, { "epoch": 0.2706711160548758, "grad_norm": 5.84765625, "learning_rate": 9.729328883945125e-06, "loss": 2.732, "mean_token_accuracy": 0.43124665596575706, "step": 1460 }, { "epoch": 0.27085650723025584, "grad_norm": 7.26953125, "learning_rate": 9.729143492769744e-06, "loss": 2.9076, "mean_token_accuracy": 0.44059610873464883, "step": 1461 }, { "epoch": 0.2710418984056359, "grad_norm": 9.65625, "learning_rate": 9.728958101594365e-06, "loss": 2.8904, "mean_token_accuracy": 0.408169580690082, "step": 1462 }, { "epoch": 0.27122728958101594, "grad_norm": 12.515625, "learning_rate": 9.728772710418985e-06, "loss": 2.5016, "mean_token_accuracy": 0.472670715449563, "step": 1463 }, { "epoch": 0.271412680756396, "grad_norm": 7.953125, "learning_rate": 9.728587319243606e-06, "loss": 2.7231, "mean_token_accuracy": 0.433780385582565, "step": 1464 }, { "epoch": 0.27159807193177604, "grad_norm": 6.52734375, "learning_rate": 9.728401928068224e-06, "loss": 3.2028, "mean_token_accuracy": 0.3951143854207057, "step": 1465 }, { "epoch": 0.2717834631071561, "grad_norm": 9.046875, "learning_rate": 9.728216536892845e-06, "loss": 2.8251, "mean_token_accuracy": 0.44425935417734497, "step": 1466 }, { "epoch": 0.27196885428253614, "grad_norm": 7.19140625, "learning_rate": 9.728031145717465e-06, "loss": 2.4967, "mean_token_accuracy": 0.4601251497803222, "step": 1467 }, { "epoch": 0.2721542454579162, "grad_norm": 6.2890625, "learning_rate": 9.727845754542084e-06, "loss": 2.8892, "mean_token_accuracy": 0.43394886363636365, "step": 1468 }, { "epoch": 0.27233963663329624, "grad_norm": 8.609375, "learning_rate": 9.727660363366705e-06, "loss": 2.4747, "mean_token_accuracy": 0.4760112888052681, "step": 1469 }, { "epoch": 0.2725250278086763, "grad_norm": 6.0, "learning_rate": 9.727474972191323e-06, "loss": 3.0878, "mean_token_accuracy": 0.40199735290578753, "step": 1470 }, { "epoch": 0.27271041898405635, "grad_norm": 6.12890625, "learning_rate": 9.727289581015944e-06, "loss": 2.8279, "mean_token_accuracy": 0.4272195936543279, "step": 1471 }, { "epoch": 0.2728958101594364, "grad_norm": 8.078125, "learning_rate": 9.727104189840564e-06, "loss": 3.0491, "mean_token_accuracy": 0.377616555661275, "step": 1472 }, { "epoch": 0.27308120133481645, "grad_norm": 6.65625, "learning_rate": 9.726918798665185e-06, "loss": 3.0419, "mean_token_accuracy": 0.42346089850249585, "step": 1473 }, { "epoch": 0.2732665925101965, "grad_norm": 5.8984375, "learning_rate": 9.726733407489804e-06, "loss": 3.0996, "mean_token_accuracy": 0.4057115315098205, "step": 1474 }, { "epoch": 0.27345198368557655, "grad_norm": 5.53515625, "learning_rate": 9.726548016314424e-06, "loss": 3.0038, "mean_token_accuracy": 0.40247599797877714, "step": 1475 }, { "epoch": 0.27363737486095663, "grad_norm": 6.890625, "learning_rate": 9.726362625139045e-06, "loss": 3.4582, "mean_token_accuracy": 0.38304054946506405, "step": 1476 }, { "epoch": 0.27382276603633665, "grad_norm": 5.87109375, "learning_rate": 9.726177233963664e-06, "loss": 2.8892, "mean_token_accuracy": 0.43549280177187155, "step": 1477 }, { "epoch": 0.27400815721171673, "grad_norm": 5.5234375, "learning_rate": 9.725991842788284e-06, "loss": 3.3496, "mean_token_accuracy": 0.3818830242510699, "step": 1478 }, { "epoch": 0.27419354838709675, "grad_norm": 5.57421875, "learning_rate": 9.725806451612903e-06, "loss": 3.038, "mean_token_accuracy": 0.41250959324635456, "step": 1479 }, { "epoch": 0.27437893956247683, "grad_norm": 8.9375, "learning_rate": 9.725621060437525e-06, "loss": 2.832, "mean_token_accuracy": 0.4136631330977621, "step": 1480 }, { "epoch": 0.27456433073785685, "grad_norm": 6.94921875, "learning_rate": 9.725435669262144e-06, "loss": 2.3572, "mean_token_accuracy": 0.48827844096073636, "step": 1481 }, { "epoch": 0.27474972191323693, "grad_norm": 6.62890625, "learning_rate": 9.725250278086764e-06, "loss": 3.1226, "mean_token_accuracy": 0.40572369254147145, "step": 1482 }, { "epoch": 0.27493511308861696, "grad_norm": 7.2890625, "learning_rate": 9.725064886911385e-06, "loss": 2.6334, "mean_token_accuracy": 0.4463179628355127, "step": 1483 }, { "epoch": 0.27512050426399703, "grad_norm": 6.63671875, "learning_rate": 9.724879495736004e-06, "loss": 2.5939, "mean_token_accuracy": 0.45490981963927857, "step": 1484 }, { "epoch": 0.2753058954393771, "grad_norm": 5.890625, "learning_rate": 9.724694104560624e-06, "loss": 3.0145, "mean_token_accuracy": 0.4225153085256712, "step": 1485 }, { "epoch": 0.27549128661475714, "grad_norm": 5.76171875, "learning_rate": 9.724508713385243e-06, "loss": 3.1575, "mean_token_accuracy": 0.39855274144169217, "step": 1486 }, { "epoch": 0.2756766777901372, "grad_norm": 8.90625, "learning_rate": 9.724323322209863e-06, "loss": 2.738, "mean_token_accuracy": 0.43975542500899173, "step": 1487 }, { "epoch": 0.27586206896551724, "grad_norm": 6.0390625, "learning_rate": 9.724137931034484e-06, "loss": 3.0126, "mean_token_accuracy": 0.42025748817656333, "step": 1488 }, { "epoch": 0.2760474601408973, "grad_norm": 5.6875, "learning_rate": 9.723952539859104e-06, "loss": 3.1654, "mean_token_accuracy": 0.40727994705493054, "step": 1489 }, { "epoch": 0.27623285131627734, "grad_norm": 8.203125, "learning_rate": 9.723767148683723e-06, "loss": 2.8417, "mean_token_accuracy": 0.4226177787252185, "step": 1490 }, { "epoch": 0.2764182424916574, "grad_norm": 5.9921875, "learning_rate": 9.723581757508344e-06, "loss": 3.0496, "mean_token_accuracy": 0.4052637448421992, "step": 1491 }, { "epoch": 0.27660363366703744, "grad_norm": 9.4609375, "learning_rate": 9.723396366332964e-06, "loss": 3.1385, "mean_token_accuracy": 0.39399348944660295, "step": 1492 }, { "epoch": 0.2767890248424175, "grad_norm": 6.3359375, "learning_rate": 9.723210975157583e-06, "loss": 2.9989, "mean_token_accuracy": 0.4144329896907217, "step": 1493 }, { "epoch": 0.27697441601779754, "grad_norm": 10.5546875, "learning_rate": 9.723025583982203e-06, "loss": 2.5572, "mean_token_accuracy": 0.45578146438817513, "step": 1494 }, { "epoch": 0.2771598071931776, "grad_norm": 5.9765625, "learning_rate": 9.722840192806822e-06, "loss": 3.2437, "mean_token_accuracy": 0.39901112484548823, "step": 1495 }, { "epoch": 0.27734519836855764, "grad_norm": 8.375, "learning_rate": 9.722654801631443e-06, "loss": 2.857, "mean_token_accuracy": 0.43163303672139364, "step": 1496 }, { "epoch": 0.2775305895439377, "grad_norm": 5.890625, "learning_rate": 9.722469410456063e-06, "loss": 2.6753, "mean_token_accuracy": 0.44427023945267957, "step": 1497 }, { "epoch": 0.27771598071931775, "grad_norm": 15.15625, "learning_rate": 9.722284019280684e-06, "loss": 3.1551, "mean_token_accuracy": 0.41700504491202167, "step": 1498 }, { "epoch": 0.2779013718946978, "grad_norm": 7.15234375, "learning_rate": 9.722098628105302e-06, "loss": 2.4456, "mean_token_accuracy": 0.48917511647026585, "step": 1499 }, { "epoch": 0.27808676307007785, "grad_norm": 8.2109375, "learning_rate": 9.721913236929923e-06, "loss": 2.8178, "mean_token_accuracy": 0.4240829592942269, "step": 1500 }, { "epoch": 0.2782721542454579, "grad_norm": 7.01171875, "learning_rate": 9.721727845754543e-06, "loss": 2.7176, "mean_token_accuracy": 0.4411332941867293, "step": 1501 }, { "epoch": 0.27845754542083795, "grad_norm": 6.453125, "learning_rate": 9.721542454579162e-06, "loss": 2.6789, "mean_token_accuracy": 0.46482445601388334, "step": 1502 }, { "epoch": 0.278642936596218, "grad_norm": 7.99609375, "learning_rate": 9.721357063403783e-06, "loss": 2.6514, "mean_token_accuracy": 0.4524929444967074, "step": 1503 }, { "epoch": 0.27882832777159805, "grad_norm": 11.296875, "learning_rate": 9.721171672228402e-06, "loss": 2.9413, "mean_token_accuracy": 0.41957160856782866, "step": 1504 }, { "epoch": 0.27901371894697813, "grad_norm": 6.66015625, "learning_rate": 9.720986281053024e-06, "loss": 3.2105, "mean_token_accuracy": 0.38066789215686275, "step": 1505 }, { "epoch": 0.27919911012235815, "grad_norm": 5.65234375, "learning_rate": 9.720800889877643e-06, "loss": 2.9903, "mean_token_accuracy": 0.41111873713109126, "step": 1506 }, { "epoch": 0.27938450129773823, "grad_norm": 8.875, "learning_rate": 9.720615498702263e-06, "loss": 3.3211, "mean_token_accuracy": 0.3888682285855956, "step": 1507 }, { "epoch": 0.27956989247311825, "grad_norm": 5.9765625, "learning_rate": 9.720430107526882e-06, "loss": 3.24, "mean_token_accuracy": 0.4085076869726043, "step": 1508 }, { "epoch": 0.27975528364849833, "grad_norm": 4.9296875, "learning_rate": 9.720244716351502e-06, "loss": 2.7364, "mean_token_accuracy": 0.4503887205165371, "step": 1509 }, { "epoch": 0.2799406748238784, "grad_norm": 9.65625, "learning_rate": 9.720059325176123e-06, "loss": 2.9812, "mean_token_accuracy": 0.421281390856407, "step": 1510 }, { "epoch": 0.28012606599925843, "grad_norm": 8.78125, "learning_rate": 9.719873934000742e-06, "loss": 2.9257, "mean_token_accuracy": 0.41490618029130033, "step": 1511 }, { "epoch": 0.2803114571746385, "grad_norm": 8.7578125, "learning_rate": 9.719688542825362e-06, "loss": 2.6762, "mean_token_accuracy": 0.44573863636363636, "step": 1512 }, { "epoch": 0.28049684835001854, "grad_norm": 8.4765625, "learning_rate": 9.719503151649983e-06, "loss": 2.224, "mean_token_accuracy": 0.5133496560568963, "step": 1513 }, { "epoch": 0.2806822395253986, "grad_norm": 8.9453125, "learning_rate": 9.719317760474603e-06, "loss": 2.6199, "mean_token_accuracy": 0.436931473620376, "step": 1514 }, { "epoch": 0.28086763070077864, "grad_norm": 5.90234375, "learning_rate": 9.719132369299222e-06, "loss": 2.9279, "mean_token_accuracy": 0.4355122263065367, "step": 1515 }, { "epoch": 0.2810530218761587, "grad_norm": 6.04296875, "learning_rate": 9.718946978123842e-06, "loss": 2.8352, "mean_token_accuracy": 0.4333778966131907, "step": 1516 }, { "epoch": 0.28123841305153874, "grad_norm": 7.5703125, "learning_rate": 9.718761586948461e-06, "loss": 3.4339, "mean_token_accuracy": 0.37950404164954105, "step": 1517 }, { "epoch": 0.2814238042269188, "grad_norm": 7.578125, "learning_rate": 9.718576195773082e-06, "loss": 2.4594, "mean_token_accuracy": 0.4937257079400333, "step": 1518 }, { "epoch": 0.28160919540229884, "grad_norm": 8.140625, "learning_rate": 9.718390804597702e-06, "loss": 2.9719, "mean_token_accuracy": 0.4154550658271322, "step": 1519 }, { "epoch": 0.2817945865776789, "grad_norm": 7.875, "learning_rate": 9.718205413422321e-06, "loss": 2.5271, "mean_token_accuracy": 0.461518572782901, "step": 1520 }, { "epoch": 0.28197997775305894, "grad_norm": 8.65625, "learning_rate": 9.718020022246943e-06, "loss": 2.6488, "mean_token_accuracy": 0.45604468679345656, "step": 1521 }, { "epoch": 0.282165368928439, "grad_norm": 6.9140625, "learning_rate": 9.717834631071562e-06, "loss": 3.4569, "mean_token_accuracy": 0.36576034977455935, "step": 1522 }, { "epoch": 0.28235076010381904, "grad_norm": 7.4296875, "learning_rate": 9.717649239896182e-06, "loss": 2.6898, "mean_token_accuracy": 0.45914967346394775, "step": 1523 }, { "epoch": 0.2825361512791991, "grad_norm": 7.2421875, "learning_rate": 9.717463848720801e-06, "loss": 3.1185, "mean_token_accuracy": 0.40501277139208175, "step": 1524 }, { "epoch": 0.28272154245457914, "grad_norm": 8.9296875, "learning_rate": 9.717278457545422e-06, "loss": 2.6993, "mean_token_accuracy": 0.44654151189639596, "step": 1525 }, { "epoch": 0.2829069336299592, "grad_norm": 7.52734375, "learning_rate": 9.717093066370042e-06, "loss": 2.9636, "mean_token_accuracy": 0.4294286103913814, "step": 1526 }, { "epoch": 0.28309232480533925, "grad_norm": 6.4765625, "learning_rate": 9.716907675194661e-06, "loss": 2.7059, "mean_token_accuracy": 0.4510593852581319, "step": 1527 }, { "epoch": 0.2832777159807193, "grad_norm": 6.90625, "learning_rate": 9.716722284019281e-06, "loss": 2.9323, "mean_token_accuracy": 0.4316329213778596, "step": 1528 }, { "epoch": 0.28346310715609935, "grad_norm": 7.1875, "learning_rate": 9.716536892843902e-06, "loss": 3.0733, "mean_token_accuracy": 0.40403549288926704, "step": 1529 }, { "epoch": 0.2836484983314794, "grad_norm": 7.52734375, "learning_rate": 9.716351501668522e-06, "loss": 2.7215, "mean_token_accuracy": 0.4451308730287758, "step": 1530 }, { "epoch": 0.28383388950685945, "grad_norm": 9.421875, "learning_rate": 9.716166110493141e-06, "loss": 3.0258, "mean_token_accuracy": 0.42528989508558807, "step": 1531 }, { "epoch": 0.2840192806822395, "grad_norm": 7.984375, "learning_rate": 9.715980719317762e-06, "loss": 2.7971, "mean_token_accuracy": 0.45991451884470924, "step": 1532 }, { "epoch": 0.28420467185761955, "grad_norm": 10.4140625, "learning_rate": 9.71579532814238e-06, "loss": 2.821, "mean_token_accuracy": 0.42618769263471146, "step": 1533 }, { "epoch": 0.28439006303299963, "grad_norm": 9.3671875, "learning_rate": 9.715609936967001e-06, "loss": 2.8666, "mean_token_accuracy": 0.4384868072480661, "step": 1534 }, { "epoch": 0.2845754542083797, "grad_norm": 8.1875, "learning_rate": 9.715424545791622e-06, "loss": 2.8801, "mean_token_accuracy": 0.44099762959909306, "step": 1535 }, { "epoch": 0.28476084538375973, "grad_norm": 6.37890625, "learning_rate": 9.71523915461624e-06, "loss": 2.8135, "mean_token_accuracy": 0.43271767810026385, "step": 1536 }, { "epoch": 0.2849462365591398, "grad_norm": 7.80859375, "learning_rate": 9.71505376344086e-06, "loss": 2.7241, "mean_token_accuracy": 0.4506398537477148, "step": 1537 }, { "epoch": 0.28513162773451983, "grad_norm": 9.359375, "learning_rate": 9.714868372265481e-06, "loss": 3.1724, "mean_token_accuracy": 0.40512629090456637, "step": 1538 }, { "epoch": 0.2853170189098999, "grad_norm": 8.6875, "learning_rate": 9.714682981090102e-06, "loss": 3.2547, "mean_token_accuracy": 0.39429928741092635, "step": 1539 }, { "epoch": 0.28550241008527993, "grad_norm": 7.0703125, "learning_rate": 9.71449758991472e-06, "loss": 2.5974, "mean_token_accuracy": 0.4521172638436482, "step": 1540 }, { "epoch": 0.28568780126066, "grad_norm": 9.75, "learning_rate": 9.714312198739341e-06, "loss": 1.9374, "mean_token_accuracy": 0.5431305715783954, "step": 1541 }, { "epoch": 0.28587319243604004, "grad_norm": 6.546875, "learning_rate": 9.71412680756396e-06, "loss": 2.5831, "mean_token_accuracy": 0.49724061810154524, "step": 1542 }, { "epoch": 0.2860585836114201, "grad_norm": 7.0234375, "learning_rate": 9.71394141638858e-06, "loss": 3.1661, "mean_token_accuracy": 0.38490813648293964, "step": 1543 }, { "epoch": 0.28624397478680014, "grad_norm": 6.5859375, "learning_rate": 9.7137560252132e-06, "loss": 2.9869, "mean_token_accuracy": 0.41638769328869757, "step": 1544 }, { "epoch": 0.2864293659621802, "grad_norm": 6.390625, "learning_rate": 9.713570634037821e-06, "loss": 2.7195, "mean_token_accuracy": 0.4671212859230395, "step": 1545 }, { "epoch": 0.28661475713756024, "grad_norm": 5.890625, "learning_rate": 9.71338524286244e-06, "loss": 3.8243, "mean_token_accuracy": 0.3476795436868032, "step": 1546 }, { "epoch": 0.2868001483129403, "grad_norm": 7.81640625, "learning_rate": 9.71319985168706e-06, "loss": 2.8274, "mean_token_accuracy": 0.41514726507713884, "step": 1547 }, { "epoch": 0.28698553948832034, "grad_norm": 7.2109375, "learning_rate": 9.713014460511681e-06, "loss": 2.7271, "mean_token_accuracy": 0.4655148583275743, "step": 1548 }, { "epoch": 0.2871709306637004, "grad_norm": 7.57421875, "learning_rate": 9.7128290693363e-06, "loss": 2.549, "mean_token_accuracy": 0.4806169237182159, "step": 1549 }, { "epoch": 0.28735632183908044, "grad_norm": 6.91796875, "learning_rate": 9.71264367816092e-06, "loss": 2.791, "mean_token_accuracy": 0.44431065623118604, "step": 1550 }, { "epoch": 0.2875417130144605, "grad_norm": 5.5390625, "learning_rate": 9.71245828698554e-06, "loss": 3.0578, "mean_token_accuracy": 0.40838820498139133, "step": 1551 }, { "epoch": 0.28772710418984054, "grad_norm": 5.5078125, "learning_rate": 9.71227289581016e-06, "loss": 2.9479, "mean_token_accuracy": 0.4099685675797036, "step": 1552 }, { "epoch": 0.2879124953652206, "grad_norm": 7.28125, "learning_rate": 9.71208750463478e-06, "loss": 2.7056, "mean_token_accuracy": 0.4480243161094225, "step": 1553 }, { "epoch": 0.28809788654060065, "grad_norm": 6.79296875, "learning_rate": 9.7119021134594e-06, "loss": 2.9637, "mean_token_accuracy": 0.42044134727061555, "step": 1554 }, { "epoch": 0.2882832777159807, "grad_norm": 5.94921875, "learning_rate": 9.71171672228402e-06, "loss": 3.207, "mean_token_accuracy": 0.396993152724025, "step": 1555 }, { "epoch": 0.28846866889136075, "grad_norm": 8.9453125, "learning_rate": 9.71153133110864e-06, "loss": 2.4577, "mean_token_accuracy": 0.48194807190044553, "step": 1556 }, { "epoch": 0.2886540600667408, "grad_norm": 5.97265625, "learning_rate": 9.71134593993326e-06, "loss": 2.6438, "mean_token_accuracy": 0.453713670613563, "step": 1557 }, { "epoch": 0.28883945124212085, "grad_norm": 7.21484375, "learning_rate": 9.71116054875788e-06, "loss": 2.8583, "mean_token_accuracy": 0.41842634489693314, "step": 1558 }, { "epoch": 0.2890248424175009, "grad_norm": 5.4765625, "learning_rate": 9.7109751575825e-06, "loss": 2.8726, "mean_token_accuracy": 0.430997526793075, "step": 1559 }, { "epoch": 0.289210233592881, "grad_norm": 4.859375, "learning_rate": 9.710789766407119e-06, "loss": 2.675, "mean_token_accuracy": 0.4443155452436195, "step": 1560 }, { "epoch": 0.28939562476826103, "grad_norm": 5.77734375, "learning_rate": 9.71060437523174e-06, "loss": 2.7436, "mean_token_accuracy": 0.46791685494803437, "step": 1561 }, { "epoch": 0.2895810159436411, "grad_norm": 6.62890625, "learning_rate": 9.71041898405636e-06, "loss": 2.7468, "mean_token_accuracy": 0.4430458109781263, "step": 1562 }, { "epoch": 0.28976640711902113, "grad_norm": 9.9140625, "learning_rate": 9.71023359288098e-06, "loss": 2.8519, "mean_token_accuracy": 0.41906180193596426, "step": 1563 }, { "epoch": 0.2899517982944012, "grad_norm": 6.0703125, "learning_rate": 9.7100482017056e-06, "loss": 3.0702, "mean_token_accuracy": 0.41227700519735916, "step": 1564 }, { "epoch": 0.29013718946978123, "grad_norm": 6.65234375, "learning_rate": 9.70986281053022e-06, "loss": 2.5877, "mean_token_accuracy": 0.4474368982565704, "step": 1565 }, { "epoch": 0.2903225806451613, "grad_norm": 5.2734375, "learning_rate": 9.70967741935484e-06, "loss": 2.9484, "mean_token_accuracy": 0.4313041049986037, "step": 1566 }, { "epoch": 0.29050797182054133, "grad_norm": 6.99609375, "learning_rate": 9.709492028179459e-06, "loss": 2.7334, "mean_token_accuracy": 0.4450015669069257, "step": 1567 }, { "epoch": 0.2906933629959214, "grad_norm": 6.76953125, "learning_rate": 9.709306637004079e-06, "loss": 2.7194, "mean_token_accuracy": 0.4512799339388935, "step": 1568 }, { "epoch": 0.29087875417130143, "grad_norm": 6.90625, "learning_rate": 9.7091212458287e-06, "loss": 2.6758, "mean_token_accuracy": 0.4457377610407395, "step": 1569 }, { "epoch": 0.2910641453466815, "grad_norm": 5.21875, "learning_rate": 9.70893585465332e-06, "loss": 3.2338, "mean_token_accuracy": 0.3921968787515006, "step": 1570 }, { "epoch": 0.29124953652206154, "grad_norm": 7.515625, "learning_rate": 9.708750463477939e-06, "loss": 2.917, "mean_token_accuracy": 0.42592592592592593, "step": 1571 }, { "epoch": 0.2914349276974416, "grad_norm": 6.25390625, "learning_rate": 9.70856507230256e-06, "loss": 3.2265, "mean_token_accuracy": 0.4004566210045662, "step": 1572 }, { "epoch": 0.29162031887282164, "grad_norm": 7.76953125, "learning_rate": 9.70837968112718e-06, "loss": 2.3289, "mean_token_accuracy": 0.49734349734349736, "step": 1573 }, { "epoch": 0.2918057100482017, "grad_norm": 6.171875, "learning_rate": 9.708194289951799e-06, "loss": 3.2519, "mean_token_accuracy": 0.3765662490002666, "step": 1574 }, { "epoch": 0.29199110122358174, "grad_norm": 8.2109375, "learning_rate": 9.708008898776419e-06, "loss": 1.9985, "mean_token_accuracy": 0.5430408381950232, "step": 1575 }, { "epoch": 0.2921764923989618, "grad_norm": 7.39453125, "learning_rate": 9.707823507601038e-06, "loss": 2.8638, "mean_token_accuracy": 0.4476762906514973, "step": 1576 }, { "epoch": 0.29236188357434184, "grad_norm": 8.390625, "learning_rate": 9.70763811642566e-06, "loss": 2.8807, "mean_token_accuracy": 0.4274538745387454, "step": 1577 }, { "epoch": 0.2925472747497219, "grad_norm": 5.953125, "learning_rate": 9.707452725250279e-06, "loss": 2.8645, "mean_token_accuracy": 0.428875, "step": 1578 }, { "epoch": 0.29273266592510194, "grad_norm": 5.9765625, "learning_rate": 9.7072673340749e-06, "loss": 2.8208, "mean_token_accuracy": 0.42228277958285293, "step": 1579 }, { "epoch": 0.292918057100482, "grad_norm": 5.3359375, "learning_rate": 9.707081942899518e-06, "loss": 3.3632, "mean_token_accuracy": 0.37762669962917184, "step": 1580 }, { "epoch": 0.29310344827586204, "grad_norm": 7.4375, "learning_rate": 9.706896551724139e-06, "loss": 2.9174, "mean_token_accuracy": 0.4350547730829421, "step": 1581 }, { "epoch": 0.2932888394512421, "grad_norm": 6.2421875, "learning_rate": 9.70671116054876e-06, "loss": 3.2318, "mean_token_accuracy": 0.3927633498686898, "step": 1582 }, { "epoch": 0.29347423062662215, "grad_norm": 7.203125, "learning_rate": 9.706525769373378e-06, "loss": 2.6148, "mean_token_accuracy": 0.44075, "step": 1583 }, { "epoch": 0.2936596218020022, "grad_norm": 6.8359375, "learning_rate": 9.706340378197998e-06, "loss": 2.794, "mean_token_accuracy": 0.4410968577144875, "step": 1584 }, { "epoch": 0.2938450129773823, "grad_norm": 5.83984375, "learning_rate": 9.706154987022619e-06, "loss": 2.6049, "mean_token_accuracy": 0.4608023072889355, "step": 1585 }, { "epoch": 0.2940304041527623, "grad_norm": 7.8515625, "learning_rate": 9.70596959584724e-06, "loss": 2.6257, "mean_token_accuracy": 0.44883203559510565, "step": 1586 }, { "epoch": 0.2942157953281424, "grad_norm": 6.53125, "learning_rate": 9.705784204671858e-06, "loss": 2.7849, "mean_token_accuracy": 0.44210905596925293, "step": 1587 }, { "epoch": 0.2944011865035224, "grad_norm": 6.75390625, "learning_rate": 9.705598813496479e-06, "loss": 2.6874, "mean_token_accuracy": 0.43751891074130106, "step": 1588 }, { "epoch": 0.2945865776789025, "grad_norm": 5.18359375, "learning_rate": 9.705413422321098e-06, "loss": 3.0166, "mean_token_accuracy": 0.4035426365391166, "step": 1589 }, { "epoch": 0.29477196885428253, "grad_norm": 9.4609375, "learning_rate": 9.705228031145718e-06, "loss": 3.122, "mean_token_accuracy": 0.3881394756935607, "step": 1590 }, { "epoch": 0.2949573600296626, "grad_norm": 7.8046875, "learning_rate": 9.705042639970339e-06, "loss": 3.1501, "mean_token_accuracy": 0.4048533251420673, "step": 1591 }, { "epoch": 0.29514275120504263, "grad_norm": 6.26953125, "learning_rate": 9.704857248794957e-06, "loss": 3.2137, "mean_token_accuracy": 0.3904655612244898, "step": 1592 }, { "epoch": 0.2953281423804227, "grad_norm": 8.1015625, "learning_rate": 9.704671857619578e-06, "loss": 2.8807, "mean_token_accuracy": 0.4326157860404436, "step": 1593 }, { "epoch": 0.29551353355580273, "grad_norm": 7.43359375, "learning_rate": 9.704486466444198e-06, "loss": 2.2527, "mean_token_accuracy": 0.5080840743734842, "step": 1594 }, { "epoch": 0.2956989247311828, "grad_norm": 11.875, "learning_rate": 9.704301075268819e-06, "loss": 2.7885, "mean_token_accuracy": 0.42993070638121833, "step": 1595 }, { "epoch": 0.29588431590656283, "grad_norm": 9.015625, "learning_rate": 9.704115684093438e-06, "loss": 3.3049, "mean_token_accuracy": 0.36860940695296524, "step": 1596 }, { "epoch": 0.2960697070819429, "grad_norm": 9.6875, "learning_rate": 9.703930292918058e-06, "loss": 3.0422, "mean_token_accuracy": 0.3937613019891501, "step": 1597 }, { "epoch": 0.29625509825732294, "grad_norm": 6.13671875, "learning_rate": 9.703744901742677e-06, "loss": 3.2141, "mean_token_accuracy": 0.3995263026676639, "step": 1598 }, { "epoch": 0.296440489432703, "grad_norm": 6.0546875, "learning_rate": 9.703559510567297e-06, "loss": 2.4137, "mean_token_accuracy": 0.4966857142857143, "step": 1599 }, { "epoch": 0.29662588060808304, "grad_norm": 6.80859375, "learning_rate": 9.703374119391918e-06, "loss": 2.7705, "mean_token_accuracy": 0.43312744232950967, "step": 1600 }, { "epoch": 0.2968112717834631, "grad_norm": 7.23046875, "learning_rate": 9.703188728216538e-06, "loss": 2.9913, "mean_token_accuracy": 0.41335978835978837, "step": 1601 }, { "epoch": 0.29699666295884314, "grad_norm": 6.15234375, "learning_rate": 9.703003337041159e-06, "loss": 2.6985, "mean_token_accuracy": 0.4531224786186865, "step": 1602 }, { "epoch": 0.2971820541342232, "grad_norm": 5.7265625, "learning_rate": 9.702817945865778e-06, "loss": 3.1851, "mean_token_accuracy": 0.4013269557167104, "step": 1603 }, { "epoch": 0.29736744530960324, "grad_norm": 7.75390625, "learning_rate": 9.702632554690398e-06, "loss": 2.8564, "mean_token_accuracy": 0.43471473176270226, "step": 1604 }, { "epoch": 0.2975528364849833, "grad_norm": 7.609375, "learning_rate": 9.702447163515017e-06, "loss": 2.7967, "mean_token_accuracy": 0.4384229779162715, "step": 1605 }, { "epoch": 0.29773822766036334, "grad_norm": 4.98828125, "learning_rate": 9.702261772339637e-06, "loss": 3.0895, "mean_token_accuracy": 0.40297766749379654, "step": 1606 }, { "epoch": 0.2979236188357434, "grad_norm": 6.76171875, "learning_rate": 9.702076381164258e-06, "loss": 3.1842, "mean_token_accuracy": 0.39545519508360727, "step": 1607 }, { "epoch": 0.29810901001112344, "grad_norm": 7.15234375, "learning_rate": 9.701890989988877e-06, "loss": 2.9224, "mean_token_accuracy": 0.3972678349661929, "step": 1608 }, { "epoch": 0.2982944011865035, "grad_norm": 5.2265625, "learning_rate": 9.701705598813497e-06, "loss": 2.869, "mean_token_accuracy": 0.43312101910828027, "step": 1609 }, { "epoch": 0.2984797923618836, "grad_norm": 10.0625, "learning_rate": 9.701520207638118e-06, "loss": 2.9442, "mean_token_accuracy": 0.42415384615384616, "step": 1610 }, { "epoch": 0.2986651835372636, "grad_norm": 9.90625, "learning_rate": 9.701334816462738e-06, "loss": 2.9575, "mean_token_accuracy": 0.43625827814569534, "step": 1611 }, { "epoch": 0.2988505747126437, "grad_norm": 7.046875, "learning_rate": 9.701149425287357e-06, "loss": 2.6565, "mean_token_accuracy": 0.4426437429537768, "step": 1612 }, { "epoch": 0.2990359658880237, "grad_norm": 7.44921875, "learning_rate": 9.700964034111977e-06, "loss": 3.2336, "mean_token_accuracy": 0.3844438249233343, "step": 1613 }, { "epoch": 0.2992213570634038, "grad_norm": 8.1171875, "learning_rate": 9.700778642936596e-06, "loss": 3.1014, "mean_token_accuracy": 0.39703597466236407, "step": 1614 }, { "epoch": 0.2994067482387838, "grad_norm": 9.6171875, "learning_rate": 9.700593251761217e-06, "loss": 2.1395, "mean_token_accuracy": 0.5212129840546698, "step": 1615 }, { "epoch": 0.2995921394141639, "grad_norm": 8.3984375, "learning_rate": 9.700407860585837e-06, "loss": 2.5254, "mean_token_accuracy": 0.49286624203821655, "step": 1616 }, { "epoch": 0.29977753058954393, "grad_norm": 6.72265625, "learning_rate": 9.700222469410456e-06, "loss": 3.1263, "mean_token_accuracy": 0.40148428405122233, "step": 1617 }, { "epoch": 0.299962921764924, "grad_norm": 8.6015625, "learning_rate": 9.700037078235077e-06, "loss": 2.5444, "mean_token_accuracy": 0.4650430146613353, "step": 1618 }, { "epoch": 0.30014831294030403, "grad_norm": 5.89453125, "learning_rate": 9.699851687059697e-06, "loss": 3.2236, "mean_token_accuracy": 0.3995475752863, "step": 1619 }, { "epoch": 0.3003337041156841, "grad_norm": 5.81640625, "learning_rate": 9.699666295884318e-06, "loss": 3.2149, "mean_token_accuracy": 0.389470664180175, "step": 1620 }, { "epoch": 0.30051909529106413, "grad_norm": 6.26171875, "learning_rate": 9.699480904708936e-06, "loss": 2.8352, "mean_token_accuracy": 0.4374616799509503, "step": 1621 }, { "epoch": 0.3007044864664442, "grad_norm": 10.703125, "learning_rate": 9.699295513533557e-06, "loss": 2.7288, "mean_token_accuracy": 0.42482803799541435, "step": 1622 }, { "epoch": 0.30088987764182423, "grad_norm": 5.34765625, "learning_rate": 9.699110122358176e-06, "loss": 3.3044, "mean_token_accuracy": 0.3780892103676914, "step": 1623 }, { "epoch": 0.3010752688172043, "grad_norm": 6.02734375, "learning_rate": 9.698924731182796e-06, "loss": 3.2094, "mean_token_accuracy": 0.40568475452196384, "step": 1624 }, { "epoch": 0.30126065999258433, "grad_norm": 6.4296875, "learning_rate": 9.698739340007417e-06, "loss": 2.6631, "mean_token_accuracy": 0.4592358604091456, "step": 1625 }, { "epoch": 0.3014460511679644, "grad_norm": 5.2109375, "learning_rate": 9.698553948832037e-06, "loss": 3.0968, "mean_token_accuracy": 0.42143127603180613, "step": 1626 }, { "epoch": 0.30163144234334444, "grad_norm": 5.97265625, "learning_rate": 9.698368557656656e-06, "loss": 3.0453, "mean_token_accuracy": 0.40907880724174656, "step": 1627 }, { "epoch": 0.3018168335187245, "grad_norm": 23.8125, "learning_rate": 9.698183166481276e-06, "loss": 3.1499, "mean_token_accuracy": 0.36487980105001383, "step": 1628 }, { "epoch": 0.30200222469410454, "grad_norm": 9.4609375, "learning_rate": 9.697997775305897e-06, "loss": 2.9914, "mean_token_accuracy": 0.4153034868704262, "step": 1629 }, { "epoch": 0.3021876158694846, "grad_norm": 8.40625, "learning_rate": 9.697812384130516e-06, "loss": 3.042, "mean_token_accuracy": 0.41618709440431, "step": 1630 }, { "epoch": 0.30237300704486464, "grad_norm": 6.01171875, "learning_rate": 9.697626992955136e-06, "loss": 2.9885, "mean_token_accuracy": 0.39879502627868224, "step": 1631 }, { "epoch": 0.3025583982202447, "grad_norm": 20.09375, "learning_rate": 9.697441601779755e-06, "loss": 3.3641, "mean_token_accuracy": 0.36503928170594835, "step": 1632 }, { "epoch": 0.30274378939562474, "grad_norm": 9.0390625, "learning_rate": 9.697256210604375e-06, "loss": 3.5279, "mean_token_accuracy": 0.38456375838926177, "step": 1633 }, { "epoch": 0.3029291805710048, "grad_norm": 7.66015625, "learning_rate": 9.697070819428996e-06, "loss": 3.0595, "mean_token_accuracy": 0.39925612415031425, "step": 1634 }, { "epoch": 0.3031145717463849, "grad_norm": 6.16796875, "learning_rate": 9.696885428253616e-06, "loss": 3.0098, "mean_token_accuracy": 0.41105289421157687, "step": 1635 }, { "epoch": 0.3032999629217649, "grad_norm": 7.1328125, "learning_rate": 9.696700037078235e-06, "loss": 3.1967, "mean_token_accuracy": 0.3983202533388407, "step": 1636 }, { "epoch": 0.303485354097145, "grad_norm": 12.1953125, "learning_rate": 9.696514645902856e-06, "loss": 2.3336, "mean_token_accuracy": 0.4772754965273547, "step": 1637 }, { "epoch": 0.303670745272525, "grad_norm": 11.484375, "learning_rate": 9.696329254727476e-06, "loss": 2.2429, "mean_token_accuracy": 0.48963903743315507, "step": 1638 }, { "epoch": 0.3038561364479051, "grad_norm": 6.25, "learning_rate": 9.696143863552095e-06, "loss": 3.1508, "mean_token_accuracy": 0.381734404536862, "step": 1639 }, { "epoch": 0.3040415276232851, "grad_norm": 6.671875, "learning_rate": 9.695958472376715e-06, "loss": 2.8728, "mean_token_accuracy": 0.41678843968176654, "step": 1640 }, { "epoch": 0.3042269187986652, "grad_norm": 7.46875, "learning_rate": 9.695773081201334e-06, "loss": 2.5174, "mean_token_accuracy": 0.45927700348432055, "step": 1641 }, { "epoch": 0.3044123099740452, "grad_norm": 6.21484375, "learning_rate": 9.695587690025956e-06, "loss": 2.865, "mean_token_accuracy": 0.4286121808586507, "step": 1642 }, { "epoch": 0.3045977011494253, "grad_norm": 8.734375, "learning_rate": 9.695402298850575e-06, "loss": 2.7297, "mean_token_accuracy": 0.4509151414309484, "step": 1643 }, { "epoch": 0.3047830923248053, "grad_norm": 6.77734375, "learning_rate": 9.695216907675196e-06, "loss": 2.6571, "mean_token_accuracy": 0.44252054794520546, "step": 1644 }, { "epoch": 0.3049684835001854, "grad_norm": 5.8359375, "learning_rate": 9.695031516499816e-06, "loss": 2.7976, "mean_token_accuracy": 0.42959958126144987, "step": 1645 }, { "epoch": 0.30515387467556543, "grad_norm": 5.12109375, "learning_rate": 9.694846125324435e-06, "loss": 2.9451, "mean_token_accuracy": 0.4317892593535251, "step": 1646 }, { "epoch": 0.3053392658509455, "grad_norm": 6.2890625, "learning_rate": 9.694660734149056e-06, "loss": 2.6259, "mean_token_accuracy": 0.4798870853916726, "step": 1647 }, { "epoch": 0.30552465702632553, "grad_norm": 6.23828125, "learning_rate": 9.694475342973674e-06, "loss": 2.114, "mean_token_accuracy": 0.5342583321413246, "step": 1648 }, { "epoch": 0.3057100482017056, "grad_norm": 5.484375, "learning_rate": 9.694289951798295e-06, "loss": 3.1981, "mean_token_accuracy": 0.4018670565740637, "step": 1649 }, { "epoch": 0.30589543937708563, "grad_norm": 6.02734375, "learning_rate": 9.694104560622915e-06, "loss": 2.872, "mean_token_accuracy": 0.4292744479495268, "step": 1650 }, { "epoch": 0.3060808305524657, "grad_norm": 7.1328125, "learning_rate": 9.693919169447536e-06, "loss": 2.9167, "mean_token_accuracy": 0.4280237937871778, "step": 1651 }, { "epoch": 0.30626622172784573, "grad_norm": 5.359375, "learning_rate": 9.693733778272155e-06, "loss": 3.0145, "mean_token_accuracy": 0.4054091158704009, "step": 1652 }, { "epoch": 0.3064516129032258, "grad_norm": 11.6875, "learning_rate": 9.693548387096775e-06, "loss": 2.6207, "mean_token_accuracy": 0.4476362094551622, "step": 1653 }, { "epoch": 0.30663700407860583, "grad_norm": 7.13671875, "learning_rate": 9.693362995921396e-06, "loss": 2.897, "mean_token_accuracy": 0.4419168941461935, "step": 1654 }, { "epoch": 0.3068223952539859, "grad_norm": 6.25, "learning_rate": 9.693177604746014e-06, "loss": 2.8809, "mean_token_accuracy": 0.43045134479972297, "step": 1655 }, { "epoch": 0.30700778642936594, "grad_norm": 5.46875, "learning_rate": 9.692992213570635e-06, "loss": 2.7272, "mean_token_accuracy": 0.44576226012793174, "step": 1656 }, { "epoch": 0.307193177604746, "grad_norm": 4.80859375, "learning_rate": 9.692806822395254e-06, "loss": 3.0342, "mean_token_accuracy": 0.4126184478829315, "step": 1657 }, { "epoch": 0.3073785687801261, "grad_norm": 5.4765625, "learning_rate": 9.692621431219876e-06, "loss": 2.9861, "mean_token_accuracy": 0.43495693495693494, "step": 1658 }, { "epoch": 0.3075639599555061, "grad_norm": 7.91796875, "learning_rate": 9.692436040044495e-06, "loss": 2.6858, "mean_token_accuracy": 0.45570006096321886, "step": 1659 }, { "epoch": 0.3077493511308862, "grad_norm": 6.1796875, "learning_rate": 9.692250648869115e-06, "loss": 2.869, "mean_token_accuracy": 0.4214335745972449, "step": 1660 }, { "epoch": 0.3079347423062662, "grad_norm": 6.16796875, "learning_rate": 9.692065257693734e-06, "loss": 3.0517, "mean_token_accuracy": 0.4017615971814445, "step": 1661 }, { "epoch": 0.3081201334816463, "grad_norm": 7.6328125, "learning_rate": 9.691879866518354e-06, "loss": 2.9657, "mean_token_accuracy": 0.4263743115452562, "step": 1662 }, { "epoch": 0.3083055246570263, "grad_norm": 5.5625, "learning_rate": 9.691694475342975e-06, "loss": 2.7808, "mean_token_accuracy": 0.45369195635625476, "step": 1663 }, { "epoch": 0.3084909158324064, "grad_norm": 6.21875, "learning_rate": 9.691509084167594e-06, "loss": 2.7273, "mean_token_accuracy": 0.4305889423076923, "step": 1664 }, { "epoch": 0.3086763070077864, "grad_norm": 6.234375, "learning_rate": 9.691323692992214e-06, "loss": 2.8099, "mean_token_accuracy": 0.4431085770946951, "step": 1665 }, { "epoch": 0.3088616981831665, "grad_norm": 5.4921875, "learning_rate": 9.691138301816835e-06, "loss": 2.9046, "mean_token_accuracy": 0.4237653074956264, "step": 1666 }, { "epoch": 0.3090470893585465, "grad_norm": 5.36328125, "learning_rate": 9.690952910641455e-06, "loss": 2.7998, "mean_token_accuracy": 0.4306955025804866, "step": 1667 }, { "epoch": 0.3092324805339266, "grad_norm": 5.0546875, "learning_rate": 9.690767519466074e-06, "loss": 2.7191, "mean_token_accuracy": 0.45101694915254237, "step": 1668 }, { "epoch": 0.3094178717093066, "grad_norm": 5.0625, "learning_rate": 9.690582128290694e-06, "loss": 2.4871, "mean_token_accuracy": 0.4999378495960224, "step": 1669 }, { "epoch": 0.3096032628846867, "grad_norm": 7.10546875, "learning_rate": 9.690396737115313e-06, "loss": 2.5897, "mean_token_accuracy": 0.45039991690038433, "step": 1670 }, { "epoch": 0.3097886540600667, "grad_norm": 5.171875, "learning_rate": 9.690211345939934e-06, "loss": 2.7431, "mean_token_accuracy": 0.44445851804939834, "step": 1671 }, { "epoch": 0.3099740452354468, "grad_norm": 6.48828125, "learning_rate": 9.690025954764554e-06, "loss": 3.3281, "mean_token_accuracy": 0.3828210424006804, "step": 1672 }, { "epoch": 0.3101594364108268, "grad_norm": 6.9375, "learning_rate": 9.689840563589173e-06, "loss": 2.6799, "mean_token_accuracy": 0.4417074877536739, "step": 1673 }, { "epoch": 0.3103448275862069, "grad_norm": 17.34375, "learning_rate": 9.689655172413794e-06, "loss": 2.7592, "mean_token_accuracy": 0.43471357029436913, "step": 1674 }, { "epoch": 0.31053021876158693, "grad_norm": 5.328125, "learning_rate": 9.689469781238414e-06, "loss": 3.0806, "mean_token_accuracy": 0.4251152073732719, "step": 1675 }, { "epoch": 0.310715609936967, "grad_norm": 5.52734375, "learning_rate": 9.689284390063035e-06, "loss": 2.8179, "mean_token_accuracy": 0.4434006031083275, "step": 1676 }, { "epoch": 0.31090100111234703, "grad_norm": 5.7578125, "learning_rate": 9.689098998887653e-06, "loss": 3.0635, "mean_token_accuracy": 0.41468771448181196, "step": 1677 }, { "epoch": 0.3110863922877271, "grad_norm": 5.20703125, "learning_rate": 9.688913607712274e-06, "loss": 3.1912, "mean_token_accuracy": 0.3950810508664058, "step": 1678 }, { "epoch": 0.31127178346310713, "grad_norm": 5.6875, "learning_rate": 9.688728216536893e-06, "loss": 3.0033, "mean_token_accuracy": 0.41600604001510005, "step": 1679 }, { "epoch": 0.3114571746384872, "grad_norm": 5.87890625, "learning_rate": 9.688542825361513e-06, "loss": 2.7496, "mean_token_accuracy": 0.4387962586417243, "step": 1680 }, { "epoch": 0.31164256581386723, "grad_norm": 7.33203125, "learning_rate": 9.688357434186134e-06, "loss": 2.9002, "mean_token_accuracy": 0.4183506280720918, "step": 1681 }, { "epoch": 0.3118279569892473, "grad_norm": 6.1015625, "learning_rate": 9.688172043010754e-06, "loss": 3.0304, "mean_token_accuracy": 0.4251551043429216, "step": 1682 }, { "epoch": 0.3120133481646274, "grad_norm": 5.22265625, "learning_rate": 9.687986651835375e-06, "loss": 2.7653, "mean_token_accuracy": 0.43670137245622337, "step": 1683 }, { "epoch": 0.3121987393400074, "grad_norm": 8.390625, "learning_rate": 9.687801260659993e-06, "loss": 2.2176, "mean_token_accuracy": 0.5166324903167009, "step": 1684 }, { "epoch": 0.3123841305153875, "grad_norm": 6.85546875, "learning_rate": 9.687615869484614e-06, "loss": 2.5245, "mean_token_accuracy": 0.45152299422655784, "step": 1685 }, { "epoch": 0.3125695216907675, "grad_norm": 7.98046875, "learning_rate": 9.687430478309233e-06, "loss": 2.1454, "mean_token_accuracy": 0.5059038515603036, "step": 1686 }, { "epoch": 0.3127549128661476, "grad_norm": 9.4765625, "learning_rate": 9.687245087133853e-06, "loss": 2.4795, "mean_token_accuracy": 0.4844777841892672, "step": 1687 }, { "epoch": 0.3129403040415276, "grad_norm": 7.30859375, "learning_rate": 9.687059695958474e-06, "loss": 2.8524, "mean_token_accuracy": 0.42684824902723734, "step": 1688 }, { "epoch": 0.3131256952169077, "grad_norm": 6.8046875, "learning_rate": 9.686874304783092e-06, "loss": 2.7992, "mean_token_accuracy": 0.4192585220204031, "step": 1689 }, { "epoch": 0.3133110863922877, "grad_norm": 5.8984375, "learning_rate": 9.686688913607713e-06, "loss": 2.5237, "mean_token_accuracy": 0.47726646459079636, "step": 1690 }, { "epoch": 0.3134964775676678, "grad_norm": 6.15625, "learning_rate": 9.686503522432333e-06, "loss": 3.0805, "mean_token_accuracy": 0.41820681228043377, "step": 1691 }, { "epoch": 0.3136818687430478, "grad_norm": 6.44140625, "learning_rate": 9.686318131256954e-06, "loss": 3.1015, "mean_token_accuracy": 0.4072164948453608, "step": 1692 }, { "epoch": 0.3138672599184279, "grad_norm": 6.98828125, "learning_rate": 9.686132740081573e-06, "loss": 3.0463, "mean_token_accuracy": 0.41208998366218424, "step": 1693 }, { "epoch": 0.3140526510938079, "grad_norm": 5.71875, "learning_rate": 9.685947348906193e-06, "loss": 3.1958, "mean_token_accuracy": 0.39973127309371853, "step": 1694 }, { "epoch": 0.314238042269188, "grad_norm": 7.55078125, "learning_rate": 9.685761957730812e-06, "loss": 2.7279, "mean_token_accuracy": 0.42528162695643507, "step": 1695 }, { "epoch": 0.314423433444568, "grad_norm": 7.3046875, "learning_rate": 9.685576566555433e-06, "loss": 2.7829, "mean_token_accuracy": 0.4410958904109589, "step": 1696 }, { "epoch": 0.3146088246199481, "grad_norm": 7.1640625, "learning_rate": 9.685391175380053e-06, "loss": 3.5399, "mean_token_accuracy": 0.3669496487119438, "step": 1697 }, { "epoch": 0.3147942157953281, "grad_norm": 5.3828125, "learning_rate": 9.685205784204673e-06, "loss": 3.1652, "mean_token_accuracy": 0.40021281626862143, "step": 1698 }, { "epoch": 0.3149796069707082, "grad_norm": 5.72265625, "learning_rate": 9.685020393029292e-06, "loss": 2.8891, "mean_token_accuracy": 0.42942030899247324, "step": 1699 }, { "epoch": 0.3151649981460882, "grad_norm": 7.73046875, "learning_rate": 9.684835001853913e-06, "loss": 2.4826, "mean_token_accuracy": 0.4716366007389698, "step": 1700 }, { "epoch": 0.3153503893214683, "grad_norm": 5.73828125, "learning_rate": 9.684649610678533e-06, "loss": 3.078, "mean_token_accuracy": 0.4200402819738167, "step": 1701 }, { "epoch": 0.31553578049684833, "grad_norm": 6.10546875, "learning_rate": 9.684464219503152e-06, "loss": 2.8713, "mean_token_accuracy": 0.4183555775251277, "step": 1702 }, { "epoch": 0.3157211716722284, "grad_norm": 6.82421875, "learning_rate": 9.684278828327773e-06, "loss": 3.1498, "mean_token_accuracy": 0.393552036199095, "step": 1703 }, { "epoch": 0.31590656284760843, "grad_norm": 6.7890625, "learning_rate": 9.684093437152391e-06, "loss": 3.0194, "mean_token_accuracy": 0.41530170136320405, "step": 1704 }, { "epoch": 0.3160919540229885, "grad_norm": 6.609375, "learning_rate": 9.683908045977012e-06, "loss": 3.0437, "mean_token_accuracy": 0.40755467196819084, "step": 1705 }, { "epoch": 0.31627734519836853, "grad_norm": 6.60546875, "learning_rate": 9.683722654801632e-06, "loss": 3.3171, "mean_token_accuracy": 0.37373876986869387, "step": 1706 }, { "epoch": 0.3164627363737486, "grad_norm": 5.73046875, "learning_rate": 9.683537263626253e-06, "loss": 3.1819, "mean_token_accuracy": 0.3945742117942809, "step": 1707 }, { "epoch": 0.3166481275491287, "grad_norm": 5.36328125, "learning_rate": 9.683351872450872e-06, "loss": 3.0673, "mean_token_accuracy": 0.40986908358509566, "step": 1708 }, { "epoch": 0.3168335187245087, "grad_norm": 7.39453125, "learning_rate": 9.683166481275492e-06, "loss": 2.9868, "mean_token_accuracy": 0.42795151877899146, "step": 1709 }, { "epoch": 0.3170189098998888, "grad_norm": 7.265625, "learning_rate": 9.682981090100113e-06, "loss": 2.932, "mean_token_accuracy": 0.4205020920502092, "step": 1710 }, { "epoch": 0.3172043010752688, "grad_norm": 5.5, "learning_rate": 9.682795698924731e-06, "loss": 3.1569, "mean_token_accuracy": 0.40234159779614326, "step": 1711 }, { "epoch": 0.3173896922506489, "grad_norm": 6.0, "learning_rate": 9.682610307749352e-06, "loss": 3.0538, "mean_token_accuracy": 0.41827991113932084, "step": 1712 }, { "epoch": 0.3175750834260289, "grad_norm": 6.87109375, "learning_rate": 9.68242491657397e-06, "loss": 2.3621, "mean_token_accuracy": 0.4851256175759487, "step": 1713 }, { "epoch": 0.317760474601409, "grad_norm": 5.84375, "learning_rate": 9.682239525398593e-06, "loss": 3.4894, "mean_token_accuracy": 0.38137913866069206, "step": 1714 }, { "epoch": 0.317945865776789, "grad_norm": 5.33203125, "learning_rate": 9.682054134223212e-06, "loss": 2.4059, "mean_token_accuracy": 0.48142816009213935, "step": 1715 }, { "epoch": 0.3181312569521691, "grad_norm": 8.2734375, "learning_rate": 9.681868743047832e-06, "loss": 2.6264, "mean_token_accuracy": 0.4357818837314105, "step": 1716 }, { "epoch": 0.3183166481275491, "grad_norm": 6.76953125, "learning_rate": 9.681683351872451e-06, "loss": 2.5407, "mean_token_accuracy": 0.4632869365342868, "step": 1717 }, { "epoch": 0.3185020393029292, "grad_norm": 6.52734375, "learning_rate": 9.681497960697071e-06, "loss": 2.871, "mean_token_accuracy": 0.4162655806491423, "step": 1718 }, { "epoch": 0.3186874304783092, "grad_norm": 6.296875, "learning_rate": 9.681312569521692e-06, "loss": 2.9749, "mean_token_accuracy": 0.41883035039072347, "step": 1719 }, { "epoch": 0.3188728216536893, "grad_norm": 5.98046875, "learning_rate": 9.68112717834631e-06, "loss": 2.7921, "mean_token_accuracy": 0.4387661743562933, "step": 1720 }, { "epoch": 0.3190582128290693, "grad_norm": 6.87109375, "learning_rate": 9.680941787170931e-06, "loss": 2.734, "mean_token_accuracy": 0.44269226374128673, "step": 1721 }, { "epoch": 0.3192436040044494, "grad_norm": 8.6015625, "learning_rate": 9.680756395995552e-06, "loss": 3.1061, "mean_token_accuracy": 0.4065630397236615, "step": 1722 }, { "epoch": 0.3194289951798294, "grad_norm": 8.40625, "learning_rate": 9.680571004820172e-06, "loss": 2.1459, "mean_token_accuracy": 0.5106964582838127, "step": 1723 }, { "epoch": 0.3196143863552095, "grad_norm": 5.9921875, "learning_rate": 9.680385613644791e-06, "loss": 3.2803, "mean_token_accuracy": 0.3998935745643209, "step": 1724 }, { "epoch": 0.3197997775305895, "grad_norm": 7.1796875, "learning_rate": 9.680200222469412e-06, "loss": 2.739, "mean_token_accuracy": 0.4352580480327031, "step": 1725 }, { "epoch": 0.3199851687059696, "grad_norm": 7.71484375, "learning_rate": 9.680014831294032e-06, "loss": 2.7612, "mean_token_accuracy": 0.44147023571713945, "step": 1726 }, { "epoch": 0.3201705598813496, "grad_norm": 6.05078125, "learning_rate": 9.67982944011865e-06, "loss": 2.6809, "mean_token_accuracy": 0.4778831752371443, "step": 1727 }, { "epoch": 0.3203559510567297, "grad_norm": 5.7734375, "learning_rate": 9.679644048943271e-06, "loss": 2.6775, "mean_token_accuracy": 0.4397283531409168, "step": 1728 }, { "epoch": 0.3205413422321097, "grad_norm": 6.265625, "learning_rate": 9.67945865776789e-06, "loss": 2.7387, "mean_token_accuracy": 0.47771696637998434, "step": 1729 }, { "epoch": 0.3207267334074898, "grad_norm": 6.71484375, "learning_rate": 9.679273266592512e-06, "loss": 3.1066, "mean_token_accuracy": 0.41353059465670783, "step": 1730 }, { "epoch": 0.32091212458286983, "grad_norm": 6.3203125, "learning_rate": 9.679087875417131e-06, "loss": 2.9148, "mean_token_accuracy": 0.4270994332818135, "step": 1731 }, { "epoch": 0.3210975157582499, "grad_norm": 6.63671875, "learning_rate": 9.678902484241752e-06, "loss": 2.845, "mean_token_accuracy": 0.4361012596306714, "step": 1732 }, { "epoch": 0.32128290693363, "grad_norm": 9.5625, "learning_rate": 9.67871709306637e-06, "loss": 2.777, "mean_token_accuracy": 0.42577943229409027, "step": 1733 }, { "epoch": 0.32146829810901, "grad_norm": 12.59375, "learning_rate": 9.678531701890991e-06, "loss": 2.6566, "mean_token_accuracy": 0.4566291517979687, "step": 1734 }, { "epoch": 0.3216536892843901, "grad_norm": 9.359375, "learning_rate": 9.678346310715611e-06, "loss": 3.4603, "mean_token_accuracy": 0.3717342622542921, "step": 1735 }, { "epoch": 0.3218390804597701, "grad_norm": 7.83203125, "learning_rate": 9.67816091954023e-06, "loss": 3.1582, "mean_token_accuracy": 0.3794565729542956, "step": 1736 }, { "epoch": 0.3220244716351502, "grad_norm": 6.53125, "learning_rate": 9.67797552836485e-06, "loss": 2.8251, "mean_token_accuracy": 0.44359316011547856, "step": 1737 }, { "epoch": 0.3222098628105302, "grad_norm": 6.9453125, "learning_rate": 9.67779013718947e-06, "loss": 2.8963, "mean_token_accuracy": 0.4299089393356419, "step": 1738 }, { "epoch": 0.3223952539859103, "grad_norm": 5.625, "learning_rate": 9.677604746014092e-06, "loss": 2.9299, "mean_token_accuracy": 0.4466729589428976, "step": 1739 }, { "epoch": 0.3225806451612903, "grad_norm": 6.921875, "learning_rate": 9.67741935483871e-06, "loss": 3.1636, "mean_token_accuracy": 0.3961205319019153, "step": 1740 }, { "epoch": 0.3227660363366704, "grad_norm": 7.13671875, "learning_rate": 9.677233963663331e-06, "loss": 3.4525, "mean_token_accuracy": 0.3732336956521739, "step": 1741 }, { "epoch": 0.3229514275120504, "grad_norm": 9.46875, "learning_rate": 9.67704857248795e-06, "loss": 2.564, "mean_token_accuracy": 0.46214676125130827, "step": 1742 }, { "epoch": 0.3231368186874305, "grad_norm": 5.5859375, "learning_rate": 9.67686318131257e-06, "loss": 2.8633, "mean_token_accuracy": 0.42934385503434164, "step": 1743 }, { "epoch": 0.3233222098628105, "grad_norm": 5.2109375, "learning_rate": 9.67667779013719e-06, "loss": 3.2499, "mean_token_accuracy": 0.40780548888657, "step": 1744 }, { "epoch": 0.3235076010381906, "grad_norm": 8.3359375, "learning_rate": 9.67649239896181e-06, "loss": 3.282, "mean_token_accuracy": 0.3755483072770217, "step": 1745 }, { "epoch": 0.3236929922135706, "grad_norm": 7.11328125, "learning_rate": 9.67630700778643e-06, "loss": 2.9008, "mean_token_accuracy": 0.4235454634051262, "step": 1746 }, { "epoch": 0.3238783833889507, "grad_norm": 6.26953125, "learning_rate": 9.67612161661105e-06, "loss": 2.5453, "mean_token_accuracy": 0.4646326446846354, "step": 1747 }, { "epoch": 0.3240637745643307, "grad_norm": 8.171875, "learning_rate": 9.675936225435671e-06, "loss": 2.8352, "mean_token_accuracy": 0.4373008175142026, "step": 1748 }, { "epoch": 0.3242491657397108, "grad_norm": 6.8203125, "learning_rate": 9.67575083426029e-06, "loss": 2.7385, "mean_token_accuracy": 0.45920954511558537, "step": 1749 }, { "epoch": 0.3244345569150908, "grad_norm": 14.2265625, "learning_rate": 9.67556544308491e-06, "loss": 3.2331, "mean_token_accuracy": 0.3976919087136929, "step": 1750 }, { "epoch": 0.3246199480904709, "grad_norm": 7.21484375, "learning_rate": 9.675380051909529e-06, "loss": 2.8482, "mean_token_accuracy": 0.4298931456867344, "step": 1751 }, { "epoch": 0.3248053392658509, "grad_norm": 6.3515625, "learning_rate": 9.67519466073415e-06, "loss": 2.7205, "mean_token_accuracy": 0.4370629370629371, "step": 1752 }, { "epoch": 0.324990730441231, "grad_norm": 6.4765625, "learning_rate": 9.67500926955877e-06, "loss": 2.5599, "mean_token_accuracy": 0.47160762942779294, "step": 1753 }, { "epoch": 0.325176121616611, "grad_norm": 5.1328125, "learning_rate": 9.674823878383389e-06, "loss": 2.7731, "mean_token_accuracy": 0.44122328331059574, "step": 1754 }, { "epoch": 0.3253615127919911, "grad_norm": 14.21875, "learning_rate": 9.67463848720801e-06, "loss": 2.6185, "mean_token_accuracy": 0.4422258111877845, "step": 1755 }, { "epoch": 0.3255469039673711, "grad_norm": 5.453125, "learning_rate": 9.67445309603263e-06, "loss": 1.9098, "mean_token_accuracy": 0.572147291800471, "step": 1756 }, { "epoch": 0.3257322951427512, "grad_norm": 8.328125, "learning_rate": 9.67426770485725e-06, "loss": 2.5541, "mean_token_accuracy": 0.46827717736808644, "step": 1757 }, { "epoch": 0.3259176863181313, "grad_norm": 6.53515625, "learning_rate": 9.674082313681869e-06, "loss": 2.9023, "mean_token_accuracy": 0.41196777905638665, "step": 1758 }, { "epoch": 0.3261030774935113, "grad_norm": 9.5390625, "learning_rate": 9.67389692250649e-06, "loss": 2.7743, "mean_token_accuracy": 0.4165660468485873, "step": 1759 }, { "epoch": 0.3262884686688914, "grad_norm": 6.38671875, "learning_rate": 9.673711531331108e-06, "loss": 3.3841, "mean_token_accuracy": 0.3949424788365531, "step": 1760 }, { "epoch": 0.3264738598442714, "grad_norm": 7.859375, "learning_rate": 9.673526140155729e-06, "loss": 2.4976, "mean_token_accuracy": 0.48478513356562136, "step": 1761 }, { "epoch": 0.3266592510196515, "grad_norm": 6.859375, "learning_rate": 9.67334074898035e-06, "loss": 2.7439, "mean_token_accuracy": 0.44480195273493844, "step": 1762 }, { "epoch": 0.3268446421950315, "grad_norm": 6.5546875, "learning_rate": 9.67315535780497e-06, "loss": 2.8163, "mean_token_accuracy": 0.4390068886337543, "step": 1763 }, { "epoch": 0.3270300333704116, "grad_norm": 7.3125, "learning_rate": 9.67296996662959e-06, "loss": 3.3593, "mean_token_accuracy": 0.37644341801385683, "step": 1764 }, { "epoch": 0.3272154245457916, "grad_norm": 6.953125, "learning_rate": 9.672784575454209e-06, "loss": 2.8754, "mean_token_accuracy": 0.4099620893007582, "step": 1765 }, { "epoch": 0.3274008157211717, "grad_norm": 6.23828125, "learning_rate": 9.67259918427883e-06, "loss": 2.6543, "mean_token_accuracy": 0.456540825285338, "step": 1766 }, { "epoch": 0.3275862068965517, "grad_norm": 5.48828125, "learning_rate": 9.672413793103448e-06, "loss": 2.957, "mean_token_accuracy": 0.42774711490215755, "step": 1767 }, { "epoch": 0.3277715980719318, "grad_norm": 5.72265625, "learning_rate": 9.672228401928069e-06, "loss": 2.9854, "mean_token_accuracy": 0.418646346929628, "step": 1768 }, { "epoch": 0.3279569892473118, "grad_norm": 5.98828125, "learning_rate": 9.67204301075269e-06, "loss": 3.1308, "mean_token_accuracy": 0.3994402239104358, "step": 1769 }, { "epoch": 0.3281423804226919, "grad_norm": 6.85546875, "learning_rate": 9.671857619577308e-06, "loss": 3.219, "mean_token_accuracy": 0.3911631846414455, "step": 1770 }, { "epoch": 0.3283277715980719, "grad_norm": 6.1953125, "learning_rate": 9.671672228401929e-06, "loss": 2.8306, "mean_token_accuracy": 0.43488399207182, "step": 1771 }, { "epoch": 0.328513162773452, "grad_norm": 6.69921875, "learning_rate": 9.67148683722655e-06, "loss": 2.681, "mean_token_accuracy": 0.4502289077828646, "step": 1772 }, { "epoch": 0.328698553948832, "grad_norm": 6.5625, "learning_rate": 9.67130144605117e-06, "loss": 2.3006, "mean_token_accuracy": 0.5141921397379913, "step": 1773 }, { "epoch": 0.3288839451242121, "grad_norm": 7.2734375, "learning_rate": 9.671116054875788e-06, "loss": 2.703, "mean_token_accuracy": 0.4460529909860694, "step": 1774 }, { "epoch": 0.3290693362995921, "grad_norm": 6.60546875, "learning_rate": 9.670930663700409e-06, "loss": 2.7383, "mean_token_accuracy": 0.4535855186818287, "step": 1775 }, { "epoch": 0.3292547274749722, "grad_norm": 6.47265625, "learning_rate": 9.670745272525028e-06, "loss": 3.224, "mean_token_accuracy": 0.4026381909547739, "step": 1776 }, { "epoch": 0.3294401186503522, "grad_norm": 6.6640625, "learning_rate": 9.670559881349648e-06, "loss": 2.6125, "mean_token_accuracy": 0.4525202520252025, "step": 1777 }, { "epoch": 0.3296255098257323, "grad_norm": 5.8359375, "learning_rate": 9.670374490174269e-06, "loss": 3.0713, "mean_token_accuracy": 0.4307923771313942, "step": 1778 }, { "epoch": 0.3298109010011123, "grad_norm": 6.484375, "learning_rate": 9.67018909899889e-06, "loss": 2.965, "mean_token_accuracy": 0.42915698865965296, "step": 1779 }, { "epoch": 0.3299962921764924, "grad_norm": 5.875, "learning_rate": 9.670003707823508e-06, "loss": 2.7129, "mean_token_accuracy": 0.45958656895986, "step": 1780 }, { "epoch": 0.3301816833518724, "grad_norm": 8.3359375, "learning_rate": 9.669818316648129e-06, "loss": 2.8907, "mean_token_accuracy": 0.42188604552523684, "step": 1781 }, { "epoch": 0.3303670745272525, "grad_norm": 7.3515625, "learning_rate": 9.669632925472749e-06, "loss": 2.9502, "mean_token_accuracy": 0.41456907551328154, "step": 1782 }, { "epoch": 0.3305524657026326, "grad_norm": 5.71484375, "learning_rate": 9.669447534297368e-06, "loss": 3.1144, "mean_token_accuracy": 0.42750506072874495, "step": 1783 }, { "epoch": 0.3307378568780126, "grad_norm": 9.8984375, "learning_rate": 9.669262143121988e-06, "loss": 2.7557, "mean_token_accuracy": 0.44374486723241174, "step": 1784 }, { "epoch": 0.3309232480533927, "grad_norm": 9.7421875, "learning_rate": 9.669076751946607e-06, "loss": 2.9642, "mean_token_accuracy": 0.41742654508611954, "step": 1785 }, { "epoch": 0.3311086392287727, "grad_norm": 7.61328125, "learning_rate": 9.668891360771228e-06, "loss": 2.3543, "mean_token_accuracy": 0.484174989449993, "step": 1786 }, { "epoch": 0.3312940304041528, "grad_norm": 5.734375, "learning_rate": 9.668705969595848e-06, "loss": 3.0609, "mean_token_accuracy": 0.4087670049665299, "step": 1787 }, { "epoch": 0.3314794215795328, "grad_norm": 7.79296875, "learning_rate": 9.668520578420469e-06, "loss": 2.8457, "mean_token_accuracy": 0.43902818875564553, "step": 1788 }, { "epoch": 0.3316648127549129, "grad_norm": 5.80859375, "learning_rate": 9.668335187245087e-06, "loss": 2.9401, "mean_token_accuracy": 0.41848347730700675, "step": 1789 }, { "epoch": 0.3318502039302929, "grad_norm": 6.88671875, "learning_rate": 9.668149796069708e-06, "loss": 2.6617, "mean_token_accuracy": 0.4510108864696734, "step": 1790 }, { "epoch": 0.332035595105673, "grad_norm": 8.1796875, "learning_rate": 9.667964404894328e-06, "loss": 3.1361, "mean_token_accuracy": 0.40336134453781514, "step": 1791 }, { "epoch": 0.332220986281053, "grad_norm": 5.65625, "learning_rate": 9.667779013718947e-06, "loss": 3.0346, "mean_token_accuracy": 0.4303886925795053, "step": 1792 }, { "epoch": 0.3324063774564331, "grad_norm": 6.5, "learning_rate": 9.667593622543568e-06, "loss": 3.0776, "mean_token_accuracy": 0.3987851886461862, "step": 1793 }, { "epoch": 0.3325917686318131, "grad_norm": 6.625, "learning_rate": 9.667408231368186e-06, "loss": 2.4369, "mean_token_accuracy": 0.45948150833937634, "step": 1794 }, { "epoch": 0.3327771598071932, "grad_norm": 5.5, "learning_rate": 9.667222840192809e-06, "loss": 3.008, "mean_token_accuracy": 0.40268242056522435, "step": 1795 }, { "epoch": 0.3329625509825732, "grad_norm": 7.26171875, "learning_rate": 9.667037449017427e-06, "loss": 3.1859, "mean_token_accuracy": 0.40474287236877166, "step": 1796 }, { "epoch": 0.3331479421579533, "grad_norm": 6.78515625, "learning_rate": 9.666852057842048e-06, "loss": 2.7784, "mean_token_accuracy": 0.45059793335655407, "step": 1797 }, { "epoch": 0.3333333333333333, "grad_norm": 12.9453125, "learning_rate": 9.666666666666667e-06, "loss": 2.9279, "mean_token_accuracy": 0.4151468612017662, "step": 1798 }, { "epoch": 0.3335187245087134, "grad_norm": 7.61328125, "learning_rate": 9.666481275491287e-06, "loss": 3.095, "mean_token_accuracy": 0.41327210783587615, "step": 1799 }, { "epoch": 0.3337041156840934, "grad_norm": 5.34765625, "learning_rate": 9.666295884315908e-06, "loss": 2.6735, "mean_token_accuracy": 0.45320796460176993, "step": 1800 }, { "epoch": 0.3338895068594735, "grad_norm": 6.32421875, "learning_rate": 9.666110493140527e-06, "loss": 3.0857, "mean_token_accuracy": 0.4119214586255259, "step": 1801 }, { "epoch": 0.3340748980348535, "grad_norm": 8.3671875, "learning_rate": 9.665925101965147e-06, "loss": 2.6713, "mean_token_accuracy": 0.463898005554153, "step": 1802 }, { "epoch": 0.3342602892102336, "grad_norm": 7.9921875, "learning_rate": 9.665739710789767e-06, "loss": 3.2288, "mean_token_accuracy": 0.388215859030837, "step": 1803 }, { "epoch": 0.3344456803856136, "grad_norm": 6.77734375, "learning_rate": 9.665554319614388e-06, "loss": 3.2347, "mean_token_accuracy": 0.3862101646724322, "step": 1804 }, { "epoch": 0.3346310715609937, "grad_norm": 5.921875, "learning_rate": 9.665368928439007e-06, "loss": 3.3388, "mean_token_accuracy": 0.3841633612457764, "step": 1805 }, { "epoch": 0.3348164627363737, "grad_norm": 6.61328125, "learning_rate": 9.665183537263627e-06, "loss": 3.128, "mean_token_accuracy": 0.3998084749820445, "step": 1806 }, { "epoch": 0.3350018539117538, "grad_norm": 8.2109375, "learning_rate": 9.664998146088248e-06, "loss": 2.3419, "mean_token_accuracy": 0.516506273602144, "step": 1807 }, { "epoch": 0.3351872450871339, "grad_norm": 7.46484375, "learning_rate": 9.664812754912867e-06, "loss": 2.7152, "mean_token_accuracy": 0.4534505208333333, "step": 1808 }, { "epoch": 0.3353726362625139, "grad_norm": 7.28125, "learning_rate": 9.664627363737487e-06, "loss": 2.9138, "mean_token_accuracy": 0.4178109062377403, "step": 1809 }, { "epoch": 0.335558027437894, "grad_norm": 10.1015625, "learning_rate": 9.664441972562106e-06, "loss": 2.8979, "mean_token_accuracy": 0.41860166773572804, "step": 1810 }, { "epoch": 0.335743418613274, "grad_norm": 9.2421875, "learning_rate": 9.664256581386728e-06, "loss": 2.5886, "mean_token_accuracy": 0.4322150492115568, "step": 1811 }, { "epoch": 0.3359288097886541, "grad_norm": 7.828125, "learning_rate": 9.664071190211347e-06, "loss": 2.706, "mean_token_accuracy": 0.4560032477418045, "step": 1812 }, { "epoch": 0.3361142009640341, "grad_norm": 7.421875, "learning_rate": 9.663885799035967e-06, "loss": 3.3723, "mean_token_accuracy": 0.3758598550857562, "step": 1813 }, { "epoch": 0.3362995921394142, "grad_norm": 6.99609375, "learning_rate": 9.663700407860586e-06, "loss": 2.7197, "mean_token_accuracy": 0.4407239819004525, "step": 1814 }, { "epoch": 0.3364849833147942, "grad_norm": 8.40625, "learning_rate": 9.663515016685207e-06, "loss": 2.7143, "mean_token_accuracy": 0.45119164218086844, "step": 1815 }, { "epoch": 0.3366703744901743, "grad_norm": 6.83984375, "learning_rate": 9.663329625509827e-06, "loss": 3.1273, "mean_token_accuracy": 0.40207253886010363, "step": 1816 }, { "epoch": 0.3368557656655543, "grad_norm": 9.4140625, "learning_rate": 9.663144234334446e-06, "loss": 2.5952, "mean_token_accuracy": 0.4694956790919644, "step": 1817 }, { "epoch": 0.3370411568409344, "grad_norm": 7.5625, "learning_rate": 9.662958843159066e-06, "loss": 2.9857, "mean_token_accuracy": 0.41074353095316085, "step": 1818 }, { "epoch": 0.3372265480163144, "grad_norm": 5.953125, "learning_rate": 9.662773451983687e-06, "loss": 2.9329, "mean_token_accuracy": 0.43982074263764404, "step": 1819 }, { "epoch": 0.3374119391916945, "grad_norm": 5.75, "learning_rate": 9.662588060808307e-06, "loss": 3.2117, "mean_token_accuracy": 0.3819151575291491, "step": 1820 }, { "epoch": 0.3375973303670745, "grad_norm": 8.5625, "learning_rate": 9.662402669632926e-06, "loss": 2.708, "mean_token_accuracy": 0.44365049517362415, "step": 1821 }, { "epoch": 0.3377827215424546, "grad_norm": 7.19140625, "learning_rate": 9.662217278457547e-06, "loss": 2.4305, "mean_token_accuracy": 0.48782911077993046, "step": 1822 }, { "epoch": 0.3379681127178346, "grad_norm": 7.75390625, "learning_rate": 9.662031887282165e-06, "loss": 2.6495, "mean_token_accuracy": 0.46012980992118685, "step": 1823 }, { "epoch": 0.3381535038932147, "grad_norm": 5.7578125, "learning_rate": 9.661846496106786e-06, "loss": 2.4253, "mean_token_accuracy": 0.4933253036234066, "step": 1824 }, { "epoch": 0.3383388950685947, "grad_norm": 6.16796875, "learning_rate": 9.661661104931406e-06, "loss": 3.1025, "mean_token_accuracy": 0.40685640362225095, "step": 1825 }, { "epoch": 0.3385242862439748, "grad_norm": 12.7890625, "learning_rate": 9.661475713756025e-06, "loss": 3.1683, "mean_token_accuracy": 0.38286549097359907, "step": 1826 }, { "epoch": 0.3387096774193548, "grad_norm": 6.5546875, "learning_rate": 9.661290322580646e-06, "loss": 3.7282, "mean_token_accuracy": 0.3434858135495078, "step": 1827 }, { "epoch": 0.3388950685947349, "grad_norm": 5.74609375, "learning_rate": 9.661104931405266e-06, "loss": 3.2869, "mean_token_accuracy": 0.3789942378208486, "step": 1828 }, { "epoch": 0.3390804597701149, "grad_norm": 6.3984375, "learning_rate": 9.660919540229887e-06, "loss": 2.7726, "mean_token_accuracy": 0.43242467718794836, "step": 1829 }, { "epoch": 0.339265850945495, "grad_norm": 5.9296875, "learning_rate": 9.660734149054506e-06, "loss": 2.9348, "mean_token_accuracy": 0.42353618184433894, "step": 1830 }, { "epoch": 0.3394512421208751, "grad_norm": 5.8203125, "learning_rate": 9.660548757879126e-06, "loss": 2.6674, "mean_token_accuracy": 0.46368715083798884, "step": 1831 }, { "epoch": 0.3396366332962551, "grad_norm": 9.5390625, "learning_rate": 9.660363366703745e-06, "loss": 3.0592, "mean_token_accuracy": 0.4122794832615641, "step": 1832 }, { "epoch": 0.3398220244716352, "grad_norm": 8.3984375, "learning_rate": 9.660177975528365e-06, "loss": 3.1722, "mean_token_accuracy": 0.4271512205898597, "step": 1833 }, { "epoch": 0.3400074156470152, "grad_norm": 11.796875, "learning_rate": 9.659992584352986e-06, "loss": 1.8828, "mean_token_accuracy": 0.5495128869157726, "step": 1834 }, { "epoch": 0.3401928068223953, "grad_norm": 10.7578125, "learning_rate": 9.659807193177606e-06, "loss": 2.7099, "mean_token_accuracy": 0.4307116104868914, "step": 1835 }, { "epoch": 0.3403781979977753, "grad_norm": 9.875, "learning_rate": 9.659621802002225e-06, "loss": 2.9467, "mean_token_accuracy": 0.4245494294550496, "step": 1836 }, { "epoch": 0.3405635891731554, "grad_norm": 8.7578125, "learning_rate": 9.659436410826846e-06, "loss": 2.7993, "mean_token_accuracy": 0.4289288692958418, "step": 1837 }, { "epoch": 0.3407489803485354, "grad_norm": 6.98828125, "learning_rate": 9.659251019651466e-06, "loss": 2.7647, "mean_token_accuracy": 0.42599067599067597, "step": 1838 }, { "epoch": 0.3409343715239155, "grad_norm": 8.625, "learning_rate": 9.659065628476085e-06, "loss": 2.7569, "mean_token_accuracy": 0.448, "step": 1839 }, { "epoch": 0.3411197626992955, "grad_norm": 9.015625, "learning_rate": 9.658880237300705e-06, "loss": 2.7342, "mean_token_accuracy": 0.4415486103601168, "step": 1840 }, { "epoch": 0.3413051538746756, "grad_norm": 10.4921875, "learning_rate": 9.658694846125324e-06, "loss": 2.8034, "mean_token_accuracy": 0.43900889453621345, "step": 1841 }, { "epoch": 0.3414905450500556, "grad_norm": 6.94921875, "learning_rate": 9.658509454949945e-06, "loss": 3.235, "mean_token_accuracy": 0.39839766933721776, "step": 1842 }, { "epoch": 0.3416759362254357, "grad_norm": 6.2890625, "learning_rate": 9.658324063774565e-06, "loss": 2.6881, "mean_token_accuracy": 0.4708338450202628, "step": 1843 }, { "epoch": 0.3418613274008157, "grad_norm": 5.8125, "learning_rate": 9.658138672599186e-06, "loss": 2.4531, "mean_token_accuracy": 0.4862250520386923, "step": 1844 }, { "epoch": 0.3420467185761958, "grad_norm": 9.5, "learning_rate": 9.657953281423806e-06, "loss": 2.5952, "mean_token_accuracy": 0.46513274336283184, "step": 1845 }, { "epoch": 0.3422321097515758, "grad_norm": 8.5859375, "learning_rate": 9.657767890248425e-06, "loss": 2.7283, "mean_token_accuracy": 0.43087971274685816, "step": 1846 }, { "epoch": 0.3424175009269559, "grad_norm": 7.5234375, "learning_rate": 9.657582499073045e-06, "loss": 2.942, "mean_token_accuracy": 0.43703616444810933, "step": 1847 }, { "epoch": 0.3426028921023359, "grad_norm": 6.984375, "learning_rate": 9.657397107897664e-06, "loss": 2.8717, "mean_token_accuracy": 0.41526894158473104, "step": 1848 }, { "epoch": 0.342788283277716, "grad_norm": 8.078125, "learning_rate": 9.657211716722285e-06, "loss": 3.0608, "mean_token_accuracy": 0.43319352905931696, "step": 1849 }, { "epoch": 0.342973674453096, "grad_norm": 6.25390625, "learning_rate": 9.657026325546905e-06, "loss": 2.5444, "mean_token_accuracy": 0.4657965088850448, "step": 1850 }, { "epoch": 0.3431590656284761, "grad_norm": 6.44921875, "learning_rate": 9.656840934371526e-06, "loss": 2.763, "mean_token_accuracy": 0.44189958592132506, "step": 1851 }, { "epoch": 0.3433444568038561, "grad_norm": 7.09765625, "learning_rate": 9.656655543196144e-06, "loss": 2.7354, "mean_token_accuracy": 0.45300296256396444, "step": 1852 }, { "epoch": 0.3435298479792362, "grad_norm": 6.80859375, "learning_rate": 9.656470152020765e-06, "loss": 2.9231, "mean_token_accuracy": 0.4343457410367664, "step": 1853 }, { "epoch": 0.3437152391546162, "grad_norm": 8.7890625, "learning_rate": 9.656284760845385e-06, "loss": 2.6444, "mean_token_accuracy": 0.44942541813630676, "step": 1854 }, { "epoch": 0.3439006303299963, "grad_norm": 6.75390625, "learning_rate": 9.656099369670004e-06, "loss": 2.6828, "mean_token_accuracy": 0.44754450195682366, "step": 1855 }, { "epoch": 0.34408602150537637, "grad_norm": 6.73046875, "learning_rate": 9.655913978494625e-06, "loss": 3.2105, "mean_token_accuracy": 0.3884664131812421, "step": 1856 }, { "epoch": 0.3442714126807564, "grad_norm": 6.640625, "learning_rate": 9.655728587319244e-06, "loss": 2.9995, "mean_token_accuracy": 0.4199944918755164, "step": 1857 }, { "epoch": 0.3444568038561365, "grad_norm": 5.91015625, "learning_rate": 9.655543196143864e-06, "loss": 2.8224, "mean_token_accuracy": 0.43491882654514386, "step": 1858 }, { "epoch": 0.3446421950315165, "grad_norm": 7.09765625, "learning_rate": 9.655357804968485e-06, "loss": 2.5926, "mean_token_accuracy": 0.4549968963376785, "step": 1859 }, { "epoch": 0.3448275862068966, "grad_norm": 9.671875, "learning_rate": 9.655172413793105e-06, "loss": 2.7478, "mean_token_accuracy": 0.44606323620582766, "step": 1860 }, { "epoch": 0.3450129773822766, "grad_norm": 8.5390625, "learning_rate": 9.654987022617724e-06, "loss": 3.1071, "mean_token_accuracy": 0.39231212208465305, "step": 1861 }, { "epoch": 0.3451983685576567, "grad_norm": 7.03125, "learning_rate": 9.654801631442344e-06, "loss": 2.7875, "mean_token_accuracy": 0.44498217014773306, "step": 1862 }, { "epoch": 0.3453837597330367, "grad_norm": 8.5859375, "learning_rate": 9.654616240266965e-06, "loss": 2.7759, "mean_token_accuracy": 0.4514241554427026, "step": 1863 }, { "epoch": 0.3455691509084168, "grad_norm": 7.3515625, "learning_rate": 9.654430849091584e-06, "loss": 2.6367, "mean_token_accuracy": 0.4512278776238167, "step": 1864 }, { "epoch": 0.3457545420837968, "grad_norm": 6.63671875, "learning_rate": 9.654245457916204e-06, "loss": 2.7567, "mean_token_accuracy": 0.439669634791586, "step": 1865 }, { "epoch": 0.3459399332591769, "grad_norm": 7.5546875, "learning_rate": 9.654060066740823e-06, "loss": 2.7328, "mean_token_accuracy": 0.43555612440803787, "step": 1866 }, { "epoch": 0.3461253244345569, "grad_norm": 9.390625, "learning_rate": 9.653874675565443e-06, "loss": 2.3983, "mean_token_accuracy": 0.4868520722865491, "step": 1867 }, { "epoch": 0.346310715609937, "grad_norm": 7.36328125, "learning_rate": 9.653689284390064e-06, "loss": 3.2376, "mean_token_accuracy": 0.40642750373692077, "step": 1868 }, { "epoch": 0.346496106785317, "grad_norm": 6.7421875, "learning_rate": 9.653503893214684e-06, "loss": 3.0626, "mean_token_accuracy": 0.4048688253367998, "step": 1869 }, { "epoch": 0.3466814979606971, "grad_norm": 6.7421875, "learning_rate": 9.653318502039303e-06, "loss": 3.1158, "mean_token_accuracy": 0.39600111080255485, "step": 1870 }, { "epoch": 0.3468668891360771, "grad_norm": 12.8203125, "learning_rate": 9.653133110863924e-06, "loss": 2.4466, "mean_token_accuracy": 0.4811119978717744, "step": 1871 }, { "epoch": 0.3470522803114572, "grad_norm": 23.453125, "learning_rate": 9.652947719688544e-06, "loss": 2.7485, "mean_token_accuracy": 0.423905625426538, "step": 1872 }, { "epoch": 0.3472376714868372, "grad_norm": 9.1015625, "learning_rate": 9.652762328513163e-06, "loss": 2.9885, "mean_token_accuracy": 0.40101343389111477, "step": 1873 }, { "epoch": 0.3474230626622173, "grad_norm": 13.6875, "learning_rate": 9.652576937337783e-06, "loss": 2.41, "mean_token_accuracy": 0.4916629777187546, "step": 1874 }, { "epoch": 0.3476084538375973, "grad_norm": 14.8359375, "learning_rate": 9.652391546162402e-06, "loss": 3.1352, "mean_token_accuracy": 0.3867057413081124, "step": 1875 }, { "epoch": 0.3477938450129774, "grad_norm": 15.296875, "learning_rate": 9.652206154987024e-06, "loss": 2.6709, "mean_token_accuracy": 0.45244316697151815, "step": 1876 }, { "epoch": 0.3479792361883574, "grad_norm": 9.65625, "learning_rate": 9.652020763811643e-06, "loss": 2.7074, "mean_token_accuracy": 0.4298653319032297, "step": 1877 }, { "epoch": 0.3481646273637375, "grad_norm": 6.35546875, "learning_rate": 9.651835372636264e-06, "loss": 2.8416, "mean_token_accuracy": 0.42488425925925927, "step": 1878 }, { "epoch": 0.3483500185391175, "grad_norm": 12.8984375, "learning_rate": 9.651649981460882e-06, "loss": 2.9167, "mean_token_accuracy": 0.4296344647519582, "step": 1879 }, { "epoch": 0.3485354097144976, "grad_norm": 18.671875, "learning_rate": 9.651464590285503e-06, "loss": 2.3264, "mean_token_accuracy": 0.4827279654559309, "step": 1880 }, { "epoch": 0.34872080088987767, "grad_norm": 9.953125, "learning_rate": 9.651279199110123e-06, "loss": 2.7094, "mean_token_accuracy": 0.428470629740695, "step": 1881 }, { "epoch": 0.3489061920652577, "grad_norm": 9.984375, "learning_rate": 9.651093807934742e-06, "loss": 2.7984, "mean_token_accuracy": 0.44339106654512306, "step": 1882 }, { "epoch": 0.34909158324063777, "grad_norm": 7.53515625, "learning_rate": 9.650908416759363e-06, "loss": 3.1309, "mean_token_accuracy": 0.4012059868633574, "step": 1883 }, { "epoch": 0.3492769744160178, "grad_norm": 11.796875, "learning_rate": 9.650723025583983e-06, "loss": 2.7937, "mean_token_accuracy": 0.4226482923906531, "step": 1884 }, { "epoch": 0.34946236559139787, "grad_norm": 7.69921875, "learning_rate": 9.650537634408604e-06, "loss": 2.6282, "mean_token_accuracy": 0.4644159000173581, "step": 1885 }, { "epoch": 0.3496477567667779, "grad_norm": 6.515625, "learning_rate": 9.650352243233223e-06, "loss": 2.9318, "mean_token_accuracy": 0.4122948614474038, "step": 1886 }, { "epoch": 0.349833147942158, "grad_norm": 9.875, "learning_rate": 9.650166852057843e-06, "loss": 2.9755, "mean_token_accuracy": 0.4144013880855986, "step": 1887 }, { "epoch": 0.350018539117538, "grad_norm": 9.3046875, "learning_rate": 9.649981460882464e-06, "loss": 2.8052, "mean_token_accuracy": 0.4299972655181843, "step": 1888 }, { "epoch": 0.3502039302929181, "grad_norm": 8.21875, "learning_rate": 9.649796069707082e-06, "loss": 3.0619, "mean_token_accuracy": 0.40293767368003175, "step": 1889 }, { "epoch": 0.3503893214682981, "grad_norm": 9.21875, "learning_rate": 9.649610678531703e-06, "loss": 2.9595, "mean_token_accuracy": 0.4108836744882676, "step": 1890 }, { "epoch": 0.3505747126436782, "grad_norm": 6.78125, "learning_rate": 9.649425287356322e-06, "loss": 3.1242, "mean_token_accuracy": 0.37603132429030905, "step": 1891 }, { "epoch": 0.3507601038190582, "grad_norm": 5.51953125, "learning_rate": 9.649239896180944e-06, "loss": 3.0956, "mean_token_accuracy": 0.4022231370934541, "step": 1892 }, { "epoch": 0.3509454949944383, "grad_norm": 4.9375, "learning_rate": 9.649054505005563e-06, "loss": 2.3185, "mean_token_accuracy": 0.5050731477111845, "step": 1893 }, { "epoch": 0.3511308861698183, "grad_norm": 9.7421875, "learning_rate": 9.648869113830183e-06, "loss": 2.7189, "mean_token_accuracy": 0.44263959390862945, "step": 1894 }, { "epoch": 0.3513162773451984, "grad_norm": 8.9765625, "learning_rate": 9.648683722654802e-06, "loss": 2.3112, "mean_token_accuracy": 0.48070460076486266, "step": 1895 }, { "epoch": 0.3515016685205784, "grad_norm": 5.01171875, "learning_rate": 9.648498331479422e-06, "loss": 2.7872, "mean_token_accuracy": 0.444634703196347, "step": 1896 }, { "epoch": 0.3516870596959585, "grad_norm": 7.40234375, "learning_rate": 9.648312940304043e-06, "loss": 2.8237, "mean_token_accuracy": 0.43508510373959497, "step": 1897 }, { "epoch": 0.3518724508713385, "grad_norm": 7.28515625, "learning_rate": 9.648127549128662e-06, "loss": 2.6371, "mean_token_accuracy": 0.4533402651416688, "step": 1898 }, { "epoch": 0.3520578420467186, "grad_norm": 6.265625, "learning_rate": 9.647942157953282e-06, "loss": 2.6901, "mean_token_accuracy": 0.4568251446726427, "step": 1899 }, { "epoch": 0.3522432332220986, "grad_norm": 10.5625, "learning_rate": 9.647756766777903e-06, "loss": 2.709, "mean_token_accuracy": 0.43246174237859986, "step": 1900 }, { "epoch": 0.3524286243974787, "grad_norm": 7.90234375, "learning_rate": 9.647571375602523e-06, "loss": 2.8995, "mean_token_accuracy": 0.4257629443364956, "step": 1901 }, { "epoch": 0.3526140155728587, "grad_norm": 5.9765625, "learning_rate": 9.647385984427142e-06, "loss": 2.99, "mean_token_accuracy": 0.41589648798521256, "step": 1902 }, { "epoch": 0.3527994067482388, "grad_norm": 7.33203125, "learning_rate": 9.647200593251762e-06, "loss": 2.621, "mean_token_accuracy": 0.45819659321769024, "step": 1903 }, { "epoch": 0.3529847979236188, "grad_norm": 15.6953125, "learning_rate": 9.647015202076381e-06, "loss": 3.3667, "mean_token_accuracy": 0.41359284256788575, "step": 1904 }, { "epoch": 0.3531701890989989, "grad_norm": 10.171875, "learning_rate": 9.646829810901002e-06, "loss": 3.0174, "mean_token_accuracy": 0.42861201875266375, "step": 1905 }, { "epoch": 0.35335558027437897, "grad_norm": 12.7734375, "learning_rate": 9.646644419725622e-06, "loss": 3.2239, "mean_token_accuracy": 0.4074025634318598, "step": 1906 }, { "epoch": 0.353540971449759, "grad_norm": 6.48828125, "learning_rate": 9.646459028550241e-06, "loss": 2.8519, "mean_token_accuracy": 0.428043400500775, "step": 1907 }, { "epoch": 0.35372636262513907, "grad_norm": 6.0703125, "learning_rate": 9.646273637374861e-06, "loss": 2.8792, "mean_token_accuracy": 0.4269440316988608, "step": 1908 }, { "epoch": 0.3539117538005191, "grad_norm": 10.8359375, "learning_rate": 9.646088246199482e-06, "loss": 2.7078, "mean_token_accuracy": 0.4334143899299099, "step": 1909 }, { "epoch": 0.35409714497589917, "grad_norm": 10.7109375, "learning_rate": 9.645902855024102e-06, "loss": 2.6368, "mean_token_accuracy": 0.46637820137995556, "step": 1910 }, { "epoch": 0.3542825361512792, "grad_norm": 5.84375, "learning_rate": 9.645717463848721e-06, "loss": 3.0133, "mean_token_accuracy": 0.4191351180104546, "step": 1911 }, { "epoch": 0.35446792732665927, "grad_norm": 6.60546875, "learning_rate": 9.645532072673342e-06, "loss": 2.6503, "mean_token_accuracy": 0.48240764011058057, "step": 1912 }, { "epoch": 0.3546533185020393, "grad_norm": 6.57421875, "learning_rate": 9.64534668149796e-06, "loss": 3.0997, "mean_token_accuracy": 0.4120201096892139, "step": 1913 }, { "epoch": 0.3548387096774194, "grad_norm": 5.5546875, "learning_rate": 9.645161290322581e-06, "loss": 2.6042, "mean_token_accuracy": 0.4689585694496915, "step": 1914 }, { "epoch": 0.3550241008527994, "grad_norm": 5.1796875, "learning_rate": 9.644975899147202e-06, "loss": 2.813, "mean_token_accuracy": 0.4416645712848881, "step": 1915 }, { "epoch": 0.3552094920281795, "grad_norm": 6.484375, "learning_rate": 9.644790507971822e-06, "loss": 2.7345, "mean_token_accuracy": 0.4460013218770654, "step": 1916 }, { "epoch": 0.3553948832035595, "grad_norm": 6.7265625, "learning_rate": 9.64460511679644e-06, "loss": 2.6803, "mean_token_accuracy": 0.468959629223076, "step": 1917 }, { "epoch": 0.3555802743789396, "grad_norm": 6.67578125, "learning_rate": 9.644419725621061e-06, "loss": 2.9918, "mean_token_accuracy": 0.4214811335525286, "step": 1918 }, { "epoch": 0.3557656655543196, "grad_norm": 5.6015625, "learning_rate": 9.644234334445682e-06, "loss": 2.5548, "mean_token_accuracy": 0.47409695817490494, "step": 1919 }, { "epoch": 0.3559510567296997, "grad_norm": 7.625, "learning_rate": 9.6440489432703e-06, "loss": 3.0147, "mean_token_accuracy": 0.4262320894347347, "step": 1920 }, { "epoch": 0.3561364479050797, "grad_norm": 5.5859375, "learning_rate": 9.643863552094921e-06, "loss": 3.1127, "mean_token_accuracy": 0.4058050383351588, "step": 1921 }, { "epoch": 0.3563218390804598, "grad_norm": 5.57421875, "learning_rate": 9.64367816091954e-06, "loss": 2.7192, "mean_token_accuracy": 0.44304980638269464, "step": 1922 }, { "epoch": 0.3565072302558398, "grad_norm": 5.28515625, "learning_rate": 9.64349276974416e-06, "loss": 2.9308, "mean_token_accuracy": 0.43669330055316535, "step": 1923 }, { "epoch": 0.3566926214312199, "grad_norm": 5.22265625, "learning_rate": 9.643307378568781e-06, "loss": 2.8075, "mean_token_accuracy": 0.4549289832653635, "step": 1924 }, { "epoch": 0.3568780126065999, "grad_norm": 6.4765625, "learning_rate": 9.643121987393401e-06, "loss": 2.9893, "mean_token_accuracy": 0.41002720559657985, "step": 1925 }, { "epoch": 0.35706340378198, "grad_norm": 5.23828125, "learning_rate": 9.642936596218022e-06, "loss": 3.3922, "mean_token_accuracy": 0.39280898876404496, "step": 1926 }, { "epoch": 0.35724879495736, "grad_norm": 7.203125, "learning_rate": 9.64275120504264e-06, "loss": 3.1683, "mean_token_accuracy": 0.414902170999732, "step": 1927 }, { "epoch": 0.3574341861327401, "grad_norm": 7.08984375, "learning_rate": 9.642565813867261e-06, "loss": 3.0045, "mean_token_accuracy": 0.4277951208232166, "step": 1928 }, { "epoch": 0.3576195773081201, "grad_norm": 7.1640625, "learning_rate": 9.64238042269188e-06, "loss": 2.8381, "mean_token_accuracy": 0.41798127484930103, "step": 1929 }, { "epoch": 0.3578049684835002, "grad_norm": 5.40625, "learning_rate": 9.6421950315165e-06, "loss": 3.2355, "mean_token_accuracy": 0.4127672297802362, "step": 1930 }, { "epoch": 0.35799035965888026, "grad_norm": 8.6640625, "learning_rate": 9.642009640341121e-06, "loss": 3.0069, "mean_token_accuracy": 0.4026418786692759, "step": 1931 }, { "epoch": 0.3581757508342603, "grad_norm": 8.125, "learning_rate": 9.641824249165741e-06, "loss": 2.3459, "mean_token_accuracy": 0.508284023668639, "step": 1932 }, { "epoch": 0.35836114200964037, "grad_norm": 7.8671875, "learning_rate": 9.64163885799036e-06, "loss": 2.3111, "mean_token_accuracy": 0.4984088498257312, "step": 1933 }, { "epoch": 0.3585465331850204, "grad_norm": 5.6484375, "learning_rate": 9.64145346681498e-06, "loss": 2.7673, "mean_token_accuracy": 0.44067255507076725, "step": 1934 }, { "epoch": 0.35873192436040047, "grad_norm": 13.4375, "learning_rate": 9.641268075639601e-06, "loss": 2.7675, "mean_token_accuracy": 0.4404713531084925, "step": 1935 }, { "epoch": 0.3589173155357805, "grad_norm": 7.22265625, "learning_rate": 9.64108268446422e-06, "loss": 3.1261, "mean_token_accuracy": 0.4106641721234799, "step": 1936 }, { "epoch": 0.35910270671116057, "grad_norm": 5.984375, "learning_rate": 9.64089729328884e-06, "loss": 3.6334, "mean_token_accuracy": 0.3606942889137738, "step": 1937 }, { "epoch": 0.3592880978865406, "grad_norm": 10.3515625, "learning_rate": 9.64071190211346e-06, "loss": 2.8474, "mean_token_accuracy": 0.4440717326796542, "step": 1938 }, { "epoch": 0.35947348906192067, "grad_norm": 7.02734375, "learning_rate": 9.64052651093808e-06, "loss": 2.7504, "mean_token_accuracy": 0.43622412045750686, "step": 1939 }, { "epoch": 0.3596588802373007, "grad_norm": 5.91015625, "learning_rate": 9.6403411197627e-06, "loss": 3.3064, "mean_token_accuracy": 0.41315177681833276, "step": 1940 }, { "epoch": 0.35984427141268077, "grad_norm": 8.578125, "learning_rate": 9.64015572858732e-06, "loss": 2.3217, "mean_token_accuracy": 0.4976589324732078, "step": 1941 }, { "epoch": 0.3600296625880608, "grad_norm": 9.734375, "learning_rate": 9.63997033741194e-06, "loss": 3.0893, "mean_token_accuracy": 0.40950704225352114, "step": 1942 }, { "epoch": 0.3602150537634409, "grad_norm": 9.0234375, "learning_rate": 9.63978494623656e-06, "loss": 2.7221, "mean_token_accuracy": 0.45620661494487547, "step": 1943 }, { "epoch": 0.3604004449388209, "grad_norm": 5.83203125, "learning_rate": 9.63959955506118e-06, "loss": 3.4177, "mean_token_accuracy": 0.3831385642737897, "step": 1944 }, { "epoch": 0.360585836114201, "grad_norm": 8.5390625, "learning_rate": 9.6394141638858e-06, "loss": 3.0043, "mean_token_accuracy": 0.43035900491583495, "step": 1945 }, { "epoch": 0.360771227289581, "grad_norm": 13.6484375, "learning_rate": 9.63922877271042e-06, "loss": 2.5515, "mean_token_accuracy": 0.45373272959479855, "step": 1946 }, { "epoch": 0.3609566184649611, "grad_norm": 7.6328125, "learning_rate": 9.639043381535039e-06, "loss": 2.7821, "mean_token_accuracy": 0.44484864232817983, "step": 1947 }, { "epoch": 0.3611420096403411, "grad_norm": 8.4921875, "learning_rate": 9.63885799035966e-06, "loss": 3.1947, "mean_token_accuracy": 0.41179495971198354, "step": 1948 }, { "epoch": 0.3613274008157212, "grad_norm": 7.62109375, "learning_rate": 9.63867259918428e-06, "loss": 3.1753, "mean_token_accuracy": 0.4215343203230148, "step": 1949 }, { "epoch": 0.3615127919911012, "grad_norm": 12.1328125, "learning_rate": 9.6384872080089e-06, "loss": 2.5528, "mean_token_accuracy": 0.4636156186612576, "step": 1950 }, { "epoch": 0.3616981831664813, "grad_norm": 8.046875, "learning_rate": 9.638301816833519e-06, "loss": 3.0346, "mean_token_accuracy": 0.4150741681143926, "step": 1951 }, { "epoch": 0.3618835743418613, "grad_norm": 5.7734375, "learning_rate": 9.63811642565814e-06, "loss": 2.8098, "mean_token_accuracy": 0.4306280367104553, "step": 1952 }, { "epoch": 0.3620689655172414, "grad_norm": 8.5390625, "learning_rate": 9.63793103448276e-06, "loss": 2.5817, "mean_token_accuracy": 0.4619392185238784, "step": 1953 }, { "epoch": 0.3622543566926214, "grad_norm": 7.296875, "learning_rate": 9.637745643307379e-06, "loss": 2.732, "mean_token_accuracy": 0.45091623036649214, "step": 1954 }, { "epoch": 0.3624397478680015, "grad_norm": 5.2734375, "learning_rate": 9.637560252132e-06, "loss": 2.9312, "mean_token_accuracy": 0.42870165386117987, "step": 1955 }, { "epoch": 0.36262513904338156, "grad_norm": 7.265625, "learning_rate": 9.63737486095662e-06, "loss": 3.1522, "mean_token_accuracy": 0.40306534014520035, "step": 1956 }, { "epoch": 0.3628105302187616, "grad_norm": 8.4453125, "learning_rate": 9.63718946978124e-06, "loss": 2.9769, "mean_token_accuracy": 0.41661151555261416, "step": 1957 }, { "epoch": 0.36299592139414166, "grad_norm": 6.8046875, "learning_rate": 9.637004078605859e-06, "loss": 3.3178, "mean_token_accuracy": 0.4092255125284738, "step": 1958 }, { "epoch": 0.3631813125695217, "grad_norm": 8.2265625, "learning_rate": 9.63681868743048e-06, "loss": 2.4737, "mean_token_accuracy": 0.4673913043478261, "step": 1959 }, { "epoch": 0.36336670374490176, "grad_norm": 5.98046875, "learning_rate": 9.636633296255098e-06, "loss": 2.8654, "mean_token_accuracy": 0.434767401189227, "step": 1960 }, { "epoch": 0.3635520949202818, "grad_norm": 6.94140625, "learning_rate": 9.636447905079719e-06, "loss": 2.8771, "mean_token_accuracy": 0.4288475836431227, "step": 1961 }, { "epoch": 0.36373748609566187, "grad_norm": 8.5234375, "learning_rate": 9.63626251390434e-06, "loss": 2.7533, "mean_token_accuracy": 0.44155649038461536, "step": 1962 }, { "epoch": 0.3639228772710419, "grad_norm": 6.21875, "learning_rate": 9.636077122728958e-06, "loss": 2.7725, "mean_token_accuracy": 0.43539630836047777, "step": 1963 }, { "epoch": 0.36410826844642197, "grad_norm": 10.1953125, "learning_rate": 9.63589173155358e-06, "loss": 2.9197, "mean_token_accuracy": 0.4285538461538462, "step": 1964 }, { "epoch": 0.364293659621802, "grad_norm": 10.8984375, "learning_rate": 9.635706340378199e-06, "loss": 3.3504, "mean_token_accuracy": 0.38387329013678906, "step": 1965 }, { "epoch": 0.36447905079718207, "grad_norm": 10.453125, "learning_rate": 9.63552094920282e-06, "loss": 2.8602, "mean_token_accuracy": 0.43345965225144895, "step": 1966 }, { "epoch": 0.3646644419725621, "grad_norm": 6.8125, "learning_rate": 9.635335558027438e-06, "loss": 3.106, "mean_token_accuracy": 0.4032211676732816, "step": 1967 }, { "epoch": 0.36484983314794217, "grad_norm": 5.28125, "learning_rate": 9.635150166852059e-06, "loss": 2.9347, "mean_token_accuracy": 0.42733545066257805, "step": 1968 }, { "epoch": 0.3650352243233222, "grad_norm": 12.46875, "learning_rate": 9.63496477567668e-06, "loss": 3.0404, "mean_token_accuracy": 0.422849277357192, "step": 1969 }, { "epoch": 0.3652206154987023, "grad_norm": 11.9765625, "learning_rate": 9.634779384501298e-06, "loss": 3.107, "mean_token_accuracy": 0.4027143738433066, "step": 1970 }, { "epoch": 0.3654060066740823, "grad_norm": 11.1953125, "learning_rate": 9.634593993325919e-06, "loss": 2.8292, "mean_token_accuracy": 0.4358153189218041, "step": 1971 }, { "epoch": 0.3655913978494624, "grad_norm": 5.3671875, "learning_rate": 9.634408602150539e-06, "loss": 3.1158, "mean_token_accuracy": 0.42786385572771146, "step": 1972 }, { "epoch": 0.3657767890248424, "grad_norm": 7.6796875, "learning_rate": 9.63422321097516e-06, "loss": 2.6339, "mean_token_accuracy": 0.4522490221642764, "step": 1973 }, { "epoch": 0.3659621802002225, "grad_norm": 8.34375, "learning_rate": 9.634037819799778e-06, "loss": 2.389, "mean_token_accuracy": 0.4936310505020231, "step": 1974 }, { "epoch": 0.3661475713756025, "grad_norm": 8.5546875, "learning_rate": 9.633852428624399e-06, "loss": 2.9325, "mean_token_accuracy": 0.4143014604574263, "step": 1975 }, { "epoch": 0.3663329625509826, "grad_norm": 8.015625, "learning_rate": 9.633667037449018e-06, "loss": 3.1065, "mean_token_accuracy": 0.409533352419124, "step": 1976 }, { "epoch": 0.3665183537263626, "grad_norm": 6.515625, "learning_rate": 9.633481646273638e-06, "loss": 2.0513, "mean_token_accuracy": 0.5394495412844037, "step": 1977 }, { "epoch": 0.3667037449017427, "grad_norm": 10.2109375, "learning_rate": 9.633296255098259e-06, "loss": 2.736, "mean_token_accuracy": 0.4470061940812113, "step": 1978 }, { "epoch": 0.3668891360771227, "grad_norm": 6.28515625, "learning_rate": 9.633110863922877e-06, "loss": 2.8101, "mean_token_accuracy": 0.4428782166857892, "step": 1979 }, { "epoch": 0.3670745272525028, "grad_norm": 5.7421875, "learning_rate": 9.632925472747498e-06, "loss": 2.7887, "mean_token_accuracy": 0.44549583648750946, "step": 1980 }, { "epoch": 0.36725991842788286, "grad_norm": 10.1953125, "learning_rate": 9.632740081572118e-06, "loss": 2.6611, "mean_token_accuracy": 0.46782544378698226, "step": 1981 }, { "epoch": 0.3674453096032629, "grad_norm": 7.98828125, "learning_rate": 9.632554690396739e-06, "loss": 2.8348, "mean_token_accuracy": 0.43077601410934746, "step": 1982 }, { "epoch": 0.36763070077864296, "grad_norm": 9.1328125, "learning_rate": 9.632369299221358e-06, "loss": 2.5115, "mean_token_accuracy": 0.46553715825953024, "step": 1983 }, { "epoch": 0.367816091954023, "grad_norm": 6.26171875, "learning_rate": 9.632183908045978e-06, "loss": 2.6701, "mean_token_accuracy": 0.4568436258577104, "step": 1984 }, { "epoch": 0.36800148312940306, "grad_norm": 8.3359375, "learning_rate": 9.631998516870597e-06, "loss": 2.8724, "mean_token_accuracy": 0.4445274003285053, "step": 1985 }, { "epoch": 0.3681868743047831, "grad_norm": 8.8671875, "learning_rate": 9.631813125695217e-06, "loss": 3.4052, "mean_token_accuracy": 0.39631197097944376, "step": 1986 }, { "epoch": 0.36837226548016316, "grad_norm": 10.453125, "learning_rate": 9.631627734519838e-06, "loss": 3.0992, "mean_token_accuracy": 0.40095208845208846, "step": 1987 }, { "epoch": 0.3685576566555432, "grad_norm": 6.3125, "learning_rate": 9.631442343344457e-06, "loss": 2.8029, "mean_token_accuracy": 0.4380938459289578, "step": 1988 }, { "epoch": 0.36874304783092327, "grad_norm": 7.2109375, "learning_rate": 9.631256952169077e-06, "loss": 3.1823, "mean_token_accuracy": 0.3995384197664947, "step": 1989 }, { "epoch": 0.3689284390063033, "grad_norm": 8.3671875, "learning_rate": 9.631071560993698e-06, "loss": 2.175, "mean_token_accuracy": 0.5395224109309156, "step": 1990 }, { "epoch": 0.36911383018168337, "grad_norm": 9.765625, "learning_rate": 9.630886169818318e-06, "loss": 3.2129, "mean_token_accuracy": 0.4031246436309727, "step": 1991 }, { "epoch": 0.3692992213570634, "grad_norm": 9.125, "learning_rate": 9.630700778642937e-06, "loss": 2.8497, "mean_token_accuracy": 0.42178414409303644, "step": 1992 }, { "epoch": 0.36948461253244347, "grad_norm": 6.91015625, "learning_rate": 9.630515387467557e-06, "loss": 2.8404, "mean_token_accuracy": 0.43538393449878293, "step": 1993 }, { "epoch": 0.3696700037078235, "grad_norm": 6.37890625, "learning_rate": 9.630329996292176e-06, "loss": 2.8295, "mean_token_accuracy": 0.45617752007136486, "step": 1994 }, { "epoch": 0.36985539488320357, "grad_norm": 9.3203125, "learning_rate": 9.630144605116797e-06, "loss": 3.0167, "mean_token_accuracy": 0.4160714285714286, "step": 1995 }, { "epoch": 0.3700407860585836, "grad_norm": 5.3203125, "learning_rate": 9.629959213941417e-06, "loss": 3.2577, "mean_token_accuracy": 0.38896687254351486, "step": 1996 }, { "epoch": 0.37022617723396367, "grad_norm": 6.15625, "learning_rate": 9.629773822766038e-06, "loss": 3.1339, "mean_token_accuracy": 0.40636223704463825, "step": 1997 }, { "epoch": 0.3704115684093437, "grad_norm": 6.83203125, "learning_rate": 9.629588431590657e-06, "loss": 3.4045, "mean_token_accuracy": 0.3807138384470883, "step": 1998 }, { "epoch": 0.3705969595847238, "grad_norm": 9.1484375, "learning_rate": 9.629403040415277e-06, "loss": 2.821, "mean_token_accuracy": 0.43425551756294684, "step": 1999 }, { "epoch": 0.3707823507601038, "grad_norm": 6.15234375, "learning_rate": 9.629217649239898e-06, "loss": 2.3541, "mean_token_accuracy": 0.49463428410053656, "step": 2000 }, { "epoch": 0.3709677419354839, "grad_norm": 7.265625, "learning_rate": 9.629032258064516e-06, "loss": 2.8611, "mean_token_accuracy": 0.4388441746932313, "step": 2001 }, { "epoch": 0.3711531331108639, "grad_norm": 6.91796875, "learning_rate": 9.628846866889137e-06, "loss": 2.633, "mean_token_accuracy": 0.46019615335626035, "step": 2002 }, { "epoch": 0.371338524286244, "grad_norm": 6.11328125, "learning_rate": 9.628661475713756e-06, "loss": 2.7864, "mean_token_accuracy": 0.45384073291050037, "step": 2003 }, { "epoch": 0.371523915461624, "grad_norm": 6.64453125, "learning_rate": 9.628476084538376e-06, "loss": 2.5784, "mean_token_accuracy": 0.47295758610873895, "step": 2004 }, { "epoch": 0.3717093066370041, "grad_norm": 5.67578125, "learning_rate": 9.628290693362997e-06, "loss": 2.4409, "mean_token_accuracy": 0.509175465057818, "step": 2005 }, { "epoch": 0.37189469781238416, "grad_norm": 7.19140625, "learning_rate": 9.628105302187617e-06, "loss": 2.668, "mean_token_accuracy": 0.45245486949522823, "step": 2006 }, { "epoch": 0.3720800889877642, "grad_norm": 5.4921875, "learning_rate": 9.627919911012238e-06, "loss": 2.4966, "mean_token_accuracy": 0.46609897172236503, "step": 2007 }, { "epoch": 0.37226548016314426, "grad_norm": 4.8984375, "learning_rate": 9.627734519836856e-06, "loss": 3.14, "mean_token_accuracy": 0.4135108891663199, "step": 2008 }, { "epoch": 0.3724508713385243, "grad_norm": 8.265625, "learning_rate": 9.627549128661477e-06, "loss": 2.587, "mean_token_accuracy": 0.45553964327561514, "step": 2009 }, { "epoch": 0.37263626251390436, "grad_norm": 7.1484375, "learning_rate": 9.627363737486096e-06, "loss": 3.1487, "mean_token_accuracy": 0.3924928066963118, "step": 2010 }, { "epoch": 0.3728216536892844, "grad_norm": 5.28515625, "learning_rate": 9.627178346310716e-06, "loss": 3.2453, "mean_token_accuracy": 0.3857559836544075, "step": 2011 }, { "epoch": 0.37300704486466446, "grad_norm": 9.2890625, "learning_rate": 9.626992955135337e-06, "loss": 2.9718, "mean_token_accuracy": 0.4310854776693286, "step": 2012 }, { "epoch": 0.3731924360400445, "grad_norm": 6.68359375, "learning_rate": 9.626807563959957e-06, "loss": 3.0917, "mean_token_accuracy": 0.41037366083093807, "step": 2013 }, { "epoch": 0.37337782721542456, "grad_norm": 6.5546875, "learning_rate": 9.626622172784576e-06, "loss": 2.8792, "mean_token_accuracy": 0.42188208616780043, "step": 2014 }, { "epoch": 0.3735632183908046, "grad_norm": 5.8046875, "learning_rate": 9.626436781609196e-06, "loss": 2.5818, "mean_token_accuracy": 0.4514761765565624, "step": 2015 }, { "epoch": 0.37374860956618466, "grad_norm": 5.53125, "learning_rate": 9.626251390433817e-06, "loss": 3.1144, "mean_token_accuracy": 0.4027985328080424, "step": 2016 }, { "epoch": 0.3739340007415647, "grad_norm": 5.94921875, "learning_rate": 9.626065999258436e-06, "loss": 2.3861, "mean_token_accuracy": 0.4908996359854394, "step": 2017 }, { "epoch": 0.37411939191694477, "grad_norm": 5.4296875, "learning_rate": 9.625880608083056e-06, "loss": 2.9198, "mean_token_accuracy": 0.4199238041484408, "step": 2018 }, { "epoch": 0.3743047830923248, "grad_norm": 6.28515625, "learning_rate": 9.625695216907675e-06, "loss": 2.7408, "mean_token_accuracy": 0.44878048780487806, "step": 2019 }, { "epoch": 0.37449017426770487, "grad_norm": 8.7578125, "learning_rate": 9.625509825732296e-06, "loss": 2.7781, "mean_token_accuracy": 0.41345080034743764, "step": 2020 }, { "epoch": 0.3746755654430849, "grad_norm": 6.57421875, "learning_rate": 9.625324434556916e-06, "loss": 2.8937, "mean_token_accuracy": 0.4264958127333266, "step": 2021 }, { "epoch": 0.37486095661846497, "grad_norm": 5.33984375, "learning_rate": 9.625139043381536e-06, "loss": 2.3729, "mean_token_accuracy": 0.5096894409937888, "step": 2022 }, { "epoch": 0.375046347793845, "grad_norm": 9.1640625, "learning_rate": 9.624953652206155e-06, "loss": 3.0185, "mean_token_accuracy": 0.43386636915829924, "step": 2023 }, { "epoch": 0.37523173896922507, "grad_norm": 9.671875, "learning_rate": 9.624768261030776e-06, "loss": 2.5644, "mean_token_accuracy": 0.47907502827698883, "step": 2024 }, { "epoch": 0.3754171301446051, "grad_norm": 5.9375, "learning_rate": 9.624582869855396e-06, "loss": 2.9866, "mean_token_accuracy": 0.4180306230200634, "step": 2025 }, { "epoch": 0.37560252131998517, "grad_norm": 8.796875, "learning_rate": 9.624397478680015e-06, "loss": 2.6531, "mean_token_accuracy": 0.45761967501097933, "step": 2026 }, { "epoch": 0.3757879124953652, "grad_norm": 7.0, "learning_rate": 9.624212087504636e-06, "loss": 2.9049, "mean_token_accuracy": 0.42466502597757727, "step": 2027 }, { "epoch": 0.3759733036707453, "grad_norm": 6.56640625, "learning_rate": 9.624026696329254e-06, "loss": 3.2015, "mean_token_accuracy": 0.3909145248057382, "step": 2028 }, { "epoch": 0.37615869484612535, "grad_norm": 5.1015625, "learning_rate": 9.623841305153877e-06, "loss": 2.7174, "mean_token_accuracy": 0.4525670313815078, "step": 2029 }, { "epoch": 0.3763440860215054, "grad_norm": 7.5703125, "learning_rate": 9.623655913978495e-06, "loss": 2.8468, "mean_token_accuracy": 0.43388305847076464, "step": 2030 }, { "epoch": 0.37652947719688545, "grad_norm": 8.1953125, "learning_rate": 9.623470522803116e-06, "loss": 2.7492, "mean_token_accuracy": 0.44240525216353327, "step": 2031 }, { "epoch": 0.3767148683722655, "grad_norm": 5.515625, "learning_rate": 9.623285131627735e-06, "loss": 2.6994, "mean_token_accuracy": 0.4571729464076761, "step": 2032 }, { "epoch": 0.37690025954764556, "grad_norm": 9.9453125, "learning_rate": 9.623099740452355e-06, "loss": 2.6625, "mean_token_accuracy": 0.454396504642272, "step": 2033 }, { "epoch": 0.3770856507230256, "grad_norm": 6.41796875, "learning_rate": 9.622914349276976e-06, "loss": 3.2146, "mean_token_accuracy": 0.3953187485297577, "step": 2034 }, { "epoch": 0.37727104189840566, "grad_norm": 8.875, "learning_rate": 9.622728958101594e-06, "loss": 3.5484, "mean_token_accuracy": 0.3681206685690991, "step": 2035 }, { "epoch": 0.3774564330737857, "grad_norm": 5.72265625, "learning_rate": 9.622543566926215e-06, "loss": 3.07, "mean_token_accuracy": 0.40710232473000185, "step": 2036 }, { "epoch": 0.37764182424916576, "grad_norm": 6.0, "learning_rate": 9.622358175750835e-06, "loss": 2.6312, "mean_token_accuracy": 0.4592440215993829, "step": 2037 }, { "epoch": 0.3778272154245458, "grad_norm": 5.8828125, "learning_rate": 9.622172784575456e-06, "loss": 2.754, "mean_token_accuracy": 0.4361594751450921, "step": 2038 }, { "epoch": 0.37801260659992586, "grad_norm": 9.125, "learning_rate": 9.621987393400075e-06, "loss": 2.3551, "mean_token_accuracy": 0.46254248810333104, "step": 2039 }, { "epoch": 0.3781979977753059, "grad_norm": 7.25, "learning_rate": 9.621802002224695e-06, "loss": 2.9968, "mean_token_accuracy": 0.42607640994542145, "step": 2040 }, { "epoch": 0.37838338895068596, "grad_norm": 6.359375, "learning_rate": 9.621616611049314e-06, "loss": 3.0349, "mean_token_accuracy": 0.4182504556105181, "step": 2041 }, { "epoch": 0.378568780126066, "grad_norm": 7.11328125, "learning_rate": 9.621431219873934e-06, "loss": 2.6081, "mean_token_accuracy": 0.44713656387665196, "step": 2042 }, { "epoch": 0.37875417130144606, "grad_norm": 6.11328125, "learning_rate": 9.621245828698555e-06, "loss": 3.1703, "mean_token_accuracy": 0.4067410035478966, "step": 2043 }, { "epoch": 0.3789395624768261, "grad_norm": 9.84375, "learning_rate": 9.621060437523174e-06, "loss": 2.898, "mean_token_accuracy": 0.4383773626616558, "step": 2044 }, { "epoch": 0.37912495365220616, "grad_norm": 5.88671875, "learning_rate": 9.620875046347796e-06, "loss": 2.8187, "mean_token_accuracy": 0.4430740037950664, "step": 2045 }, { "epoch": 0.3793103448275862, "grad_norm": 7.15234375, "learning_rate": 9.620689655172415e-06, "loss": 2.8045, "mean_token_accuracy": 0.4574505858681501, "step": 2046 }, { "epoch": 0.37949573600296627, "grad_norm": 6.42578125, "learning_rate": 9.620504263997035e-06, "loss": 3.0148, "mean_token_accuracy": 0.4261457934380186, "step": 2047 }, { "epoch": 0.3796811271783463, "grad_norm": 6.83984375, "learning_rate": 9.620318872821654e-06, "loss": 2.8098, "mean_token_accuracy": 0.45230682139824513, "step": 2048 }, { "epoch": 0.37986651835372637, "grad_norm": 7.46484375, "learning_rate": 9.620133481646275e-06, "loss": 2.9976, "mean_token_accuracy": 0.40401785714285715, "step": 2049 }, { "epoch": 0.3800519095291064, "grad_norm": 7.50390625, "learning_rate": 9.619948090470895e-06, "loss": 2.6342, "mean_token_accuracy": 0.4509297520661157, "step": 2050 }, { "epoch": 0.38023730070448647, "grad_norm": 7.31640625, "learning_rate": 9.619762699295514e-06, "loss": 2.8952, "mean_token_accuracy": 0.4513000702740689, "step": 2051 }, { "epoch": 0.3804226918798665, "grad_norm": 6.8203125, "learning_rate": 9.619577308120134e-06, "loss": 3.02, "mean_token_accuracy": 0.4429545989179017, "step": 2052 }, { "epoch": 0.38060808305524657, "grad_norm": 5.8203125, "learning_rate": 9.619391916944755e-06, "loss": 2.9102, "mean_token_accuracy": 0.41605153670648237, "step": 2053 }, { "epoch": 0.38079347423062665, "grad_norm": 7.296875, "learning_rate": 9.619206525769375e-06, "loss": 2.9864, "mean_token_accuracy": 0.42070532237126784, "step": 2054 }, { "epoch": 0.3809788654060067, "grad_norm": 6.36328125, "learning_rate": 9.619021134593994e-06, "loss": 3.3018, "mean_token_accuracy": 0.38093232238135355, "step": 2055 }, { "epoch": 0.38116425658138675, "grad_norm": 8.0078125, "learning_rate": 9.618835743418615e-06, "loss": 2.9429, "mean_token_accuracy": 0.4352600274515785, "step": 2056 }, { "epoch": 0.3813496477567668, "grad_norm": 7.66796875, "learning_rate": 9.618650352243233e-06, "loss": 2.8707, "mean_token_accuracy": 0.4282781275663838, "step": 2057 }, { "epoch": 0.38153503893214685, "grad_norm": 6.72265625, "learning_rate": 9.618464961067854e-06, "loss": 3.138, "mean_token_accuracy": 0.4047550432276657, "step": 2058 }, { "epoch": 0.3817204301075269, "grad_norm": 7.92578125, "learning_rate": 9.618279569892474e-06, "loss": 2.4854, "mean_token_accuracy": 0.47245017584994137, "step": 2059 }, { "epoch": 0.38190582128290695, "grad_norm": 7.2734375, "learning_rate": 9.618094178717093e-06, "loss": 2.6575, "mean_token_accuracy": 0.4482477587612062, "step": 2060 }, { "epoch": 0.382091212458287, "grad_norm": 6.23046875, "learning_rate": 9.617908787541714e-06, "loss": 2.7923, "mean_token_accuracy": 0.44017611447440835, "step": 2061 }, { "epoch": 0.38227660363366706, "grad_norm": 7.7265625, "learning_rate": 9.617723396366334e-06, "loss": 2.9259, "mean_token_accuracy": 0.4344192392972881, "step": 2062 }, { "epoch": 0.3824619948090471, "grad_norm": 5.5546875, "learning_rate": 9.617538005190955e-06, "loss": 2.775, "mean_token_accuracy": 0.45398860398860397, "step": 2063 }, { "epoch": 0.38264738598442716, "grad_norm": 7.3828125, "learning_rate": 9.617352614015573e-06, "loss": 3.0458, "mean_token_accuracy": 0.40997804342522565, "step": 2064 }, { "epoch": 0.3828327771598072, "grad_norm": 60.53125, "learning_rate": 9.617167222840194e-06, "loss": 2.622, "mean_token_accuracy": 0.46342464348004514, "step": 2065 }, { "epoch": 0.38301816833518726, "grad_norm": 5.21484375, "learning_rate": 9.616981831664813e-06, "loss": 2.9078, "mean_token_accuracy": 0.43449565504344956, "step": 2066 }, { "epoch": 0.3832035595105673, "grad_norm": 5.4140625, "learning_rate": 9.616796440489433e-06, "loss": 3.0555, "mean_token_accuracy": 0.40692484014959585, "step": 2067 }, { "epoch": 0.38338895068594736, "grad_norm": 6.81640625, "learning_rate": 9.616611049314054e-06, "loss": 2.8763, "mean_token_accuracy": 0.4387933547070825, "step": 2068 }, { "epoch": 0.3835743418613274, "grad_norm": 8.703125, "learning_rate": 9.616425658138674e-06, "loss": 2.7112, "mean_token_accuracy": 0.4468868821292776, "step": 2069 }, { "epoch": 0.38375973303670746, "grad_norm": 7.13671875, "learning_rate": 9.616240266963293e-06, "loss": 2.7574, "mean_token_accuracy": 0.4438002371853999, "step": 2070 }, { "epoch": 0.3839451242120875, "grad_norm": 7.109375, "learning_rate": 9.616054875787913e-06, "loss": 2.5961, "mean_token_accuracy": 0.4589008924377642, "step": 2071 }, { "epoch": 0.38413051538746756, "grad_norm": 5.78515625, "learning_rate": 9.615869484612534e-06, "loss": 3.002, "mean_token_accuracy": 0.4162524850894632, "step": 2072 }, { "epoch": 0.3843159065628476, "grad_norm": 5.44140625, "learning_rate": 9.615684093437153e-06, "loss": 2.7992, "mean_token_accuracy": 0.444640234948605, "step": 2073 }, { "epoch": 0.38450129773822767, "grad_norm": 5.65234375, "learning_rate": 9.615498702261773e-06, "loss": 3.0131, "mean_token_accuracy": 0.41701769165964614, "step": 2074 }, { "epoch": 0.3846866889136077, "grad_norm": 8.9765625, "learning_rate": 9.615313311086392e-06, "loss": 2.7005, "mean_token_accuracy": 0.4645742697327533, "step": 2075 }, { "epoch": 0.38487208008898777, "grad_norm": 5.578125, "learning_rate": 9.615127919911013e-06, "loss": 2.5209, "mean_token_accuracy": 0.4868745793134395, "step": 2076 }, { "epoch": 0.3850574712643678, "grad_norm": 5.46875, "learning_rate": 9.614942528735633e-06, "loss": 2.8761, "mean_token_accuracy": 0.44999288661260495, "step": 2077 }, { "epoch": 0.38524286243974787, "grad_norm": 5.83984375, "learning_rate": 9.614757137560254e-06, "loss": 2.5907, "mean_token_accuracy": 0.46751907609816457, "step": 2078 }, { "epoch": 0.38542825361512795, "grad_norm": 5.703125, "learning_rate": 9.614571746384872e-06, "loss": 3.0025, "mean_token_accuracy": 0.4178844056706652, "step": 2079 }, { "epoch": 0.38561364479050797, "grad_norm": 7.05078125, "learning_rate": 9.614386355209493e-06, "loss": 2.6778, "mean_token_accuracy": 0.44808281398542305, "step": 2080 }, { "epoch": 0.38579903596588805, "grad_norm": 5.7890625, "learning_rate": 9.614200964034113e-06, "loss": 3.0175, "mean_token_accuracy": 0.39784572619874914, "step": 2081 }, { "epoch": 0.38598442714126807, "grad_norm": 5.87890625, "learning_rate": 9.614015572858732e-06, "loss": 3.2212, "mean_token_accuracy": 0.38601868067717454, "step": 2082 }, { "epoch": 0.38616981831664815, "grad_norm": 5.25390625, "learning_rate": 9.613830181683353e-06, "loss": 2.4727, "mean_token_accuracy": 0.4817673378076063, "step": 2083 }, { "epoch": 0.3863552094920282, "grad_norm": 6.05859375, "learning_rate": 9.613644790507971e-06, "loss": 2.5637, "mean_token_accuracy": 0.4649891981192019, "step": 2084 }, { "epoch": 0.38654060066740825, "grad_norm": 6.51171875, "learning_rate": 9.613459399332594e-06, "loss": 2.94, "mean_token_accuracy": 0.42481442205726405, "step": 2085 }, { "epoch": 0.3867259918427883, "grad_norm": 6.11328125, "learning_rate": 9.613274008157212e-06, "loss": 3.0626, "mean_token_accuracy": 0.41205965543841605, "step": 2086 }, { "epoch": 0.38691138301816835, "grad_norm": 7.16796875, "learning_rate": 9.613088616981833e-06, "loss": 2.6915, "mean_token_accuracy": 0.4504725236261813, "step": 2087 }, { "epoch": 0.3870967741935484, "grad_norm": 6.8828125, "learning_rate": 9.612903225806453e-06, "loss": 2.4441, "mean_token_accuracy": 0.4721268789205823, "step": 2088 }, { "epoch": 0.38728216536892845, "grad_norm": 9.1796875, "learning_rate": 9.612717834631072e-06, "loss": 2.735, "mean_token_accuracy": 0.4508590001547748, "step": 2089 }, { "epoch": 0.3874675565443085, "grad_norm": 11.9453125, "learning_rate": 9.612532443455693e-06, "loss": 2.2656, "mean_token_accuracy": 0.5066921606118547, "step": 2090 }, { "epoch": 0.38765294771968856, "grad_norm": 9.65625, "learning_rate": 9.612347052280311e-06, "loss": 2.6518, "mean_token_accuracy": 0.448006509357201, "step": 2091 }, { "epoch": 0.3878383388950686, "grad_norm": 5.96875, "learning_rate": 9.612161661104932e-06, "loss": 2.9964, "mean_token_accuracy": 0.39593674246796723, "step": 2092 }, { "epoch": 0.38802373007044866, "grad_norm": 7.15625, "learning_rate": 9.611976269929552e-06, "loss": 2.7437, "mean_token_accuracy": 0.45685146443514646, "step": 2093 }, { "epoch": 0.3882091212458287, "grad_norm": 5.50390625, "learning_rate": 9.611790878754173e-06, "loss": 3.1872, "mean_token_accuracy": 0.39611407082419303, "step": 2094 }, { "epoch": 0.38839451242120876, "grad_norm": 6.734375, "learning_rate": 9.611605487578792e-06, "loss": 2.7417, "mean_token_accuracy": 0.4251856082238721, "step": 2095 }, { "epoch": 0.3885799035965888, "grad_norm": 7.10546875, "learning_rate": 9.611420096403412e-06, "loss": 2.5485, "mean_token_accuracy": 0.45921203204185057, "step": 2096 }, { "epoch": 0.38876529477196886, "grad_norm": 7.171875, "learning_rate": 9.611234705228033e-06, "loss": 2.637, "mean_token_accuracy": 0.46487006737247355, "step": 2097 }, { "epoch": 0.3889506859473489, "grad_norm": 5.96484375, "learning_rate": 9.611049314052651e-06, "loss": 2.7538, "mean_token_accuracy": 0.44766657674669547, "step": 2098 }, { "epoch": 0.38913607712272896, "grad_norm": 9.1953125, "learning_rate": 9.610863922877272e-06, "loss": 2.4073, "mean_token_accuracy": 0.49343533546986457, "step": 2099 }, { "epoch": 0.389321468298109, "grad_norm": 7.26953125, "learning_rate": 9.61067853170189e-06, "loss": 3.1512, "mean_token_accuracy": 0.40235878336437, "step": 2100 }, { "epoch": 0.38950685947348906, "grad_norm": 6.18359375, "learning_rate": 9.610493140526513e-06, "loss": 2.995, "mean_token_accuracy": 0.41995542152877935, "step": 2101 }, { "epoch": 0.3896922506488691, "grad_norm": 8.2578125, "learning_rate": 9.610307749351132e-06, "loss": 3.0167, "mean_token_accuracy": 0.4134461134606971, "step": 2102 }, { "epoch": 0.38987764182424917, "grad_norm": 7.35546875, "learning_rate": 9.610122358175752e-06, "loss": 3.1865, "mean_token_accuracy": 0.4180364952495853, "step": 2103 }, { "epoch": 0.39006303299962924, "grad_norm": 6.3828125, "learning_rate": 9.609936967000371e-06, "loss": 2.7165, "mean_token_accuracy": 0.45424430641821945, "step": 2104 }, { "epoch": 0.39024842417500927, "grad_norm": 6.5859375, "learning_rate": 9.609751575824992e-06, "loss": 2.8965, "mean_token_accuracy": 0.44299552906110284, "step": 2105 }, { "epoch": 0.39043381535038935, "grad_norm": 6.00390625, "learning_rate": 9.609566184649612e-06, "loss": 3.0302, "mean_token_accuracy": 0.4079285822363444, "step": 2106 }, { "epoch": 0.39061920652576937, "grad_norm": 10.0703125, "learning_rate": 9.60938079347423e-06, "loss": 3.2112, "mean_token_accuracy": 0.38524853019775523, "step": 2107 }, { "epoch": 0.39080459770114945, "grad_norm": 5.13671875, "learning_rate": 9.609195402298851e-06, "loss": 2.4987, "mean_token_accuracy": 0.47512437810945274, "step": 2108 }, { "epoch": 0.39098998887652947, "grad_norm": 7.76171875, "learning_rate": 9.60901001112347e-06, "loss": 2.7383, "mean_token_accuracy": 0.44074074074074077, "step": 2109 }, { "epoch": 0.39117538005190955, "grad_norm": 9.0625, "learning_rate": 9.608824619948092e-06, "loss": 2.9336, "mean_token_accuracy": 0.4272752782714433, "step": 2110 }, { "epoch": 0.39136077122728957, "grad_norm": 5.51171875, "learning_rate": 9.608639228772711e-06, "loss": 3.1542, "mean_token_accuracy": 0.42014487754398067, "step": 2111 }, { "epoch": 0.39154616240266965, "grad_norm": 5.9140625, "learning_rate": 9.608453837597332e-06, "loss": 2.8332, "mean_token_accuracy": 0.42817153734184304, "step": 2112 }, { "epoch": 0.3917315535780497, "grad_norm": 7.25390625, "learning_rate": 9.60826844642195e-06, "loss": 3.1417, "mean_token_accuracy": 0.39766662529477476, "step": 2113 }, { "epoch": 0.39191694475342975, "grad_norm": 5.25390625, "learning_rate": 9.608083055246571e-06, "loss": 2.6411, "mean_token_accuracy": 0.47155025553662694, "step": 2114 }, { "epoch": 0.3921023359288098, "grad_norm": 11.09375, "learning_rate": 9.607897664071191e-06, "loss": 3.0105, "mean_token_accuracy": 0.42351854295895475, "step": 2115 }, { "epoch": 0.39228772710418985, "grad_norm": 10.3203125, "learning_rate": 9.60771227289581e-06, "loss": 3.0379, "mean_token_accuracy": 0.400325545767905, "step": 2116 }, { "epoch": 0.3924731182795699, "grad_norm": 9.875, "learning_rate": 9.60752688172043e-06, "loss": 2.7898, "mean_token_accuracy": 0.4152623976889745, "step": 2117 }, { "epoch": 0.39265850945494996, "grad_norm": 6.390625, "learning_rate": 9.607341490545051e-06, "loss": 2.8519, "mean_token_accuracy": 0.4348458406050029, "step": 2118 }, { "epoch": 0.39284390063033, "grad_norm": 7.98046875, "learning_rate": 9.607156099369672e-06, "loss": 2.4599, "mean_token_accuracy": 0.4906051191004587, "step": 2119 }, { "epoch": 0.39302929180571006, "grad_norm": 14.265625, "learning_rate": 9.60697070819429e-06, "loss": 2.804, "mean_token_accuracy": 0.439311098961181, "step": 2120 }, { "epoch": 0.3932146829810901, "grad_norm": 10.46875, "learning_rate": 9.606785317018911e-06, "loss": 2.7183, "mean_token_accuracy": 0.4366297243535095, "step": 2121 }, { "epoch": 0.39340007415647016, "grad_norm": 7.3671875, "learning_rate": 9.60659992584353e-06, "loss": 2.9443, "mean_token_accuracy": 0.4348396501457726, "step": 2122 }, { "epoch": 0.3935854653318502, "grad_norm": 6.40625, "learning_rate": 9.60641453466815e-06, "loss": 2.7636, "mean_token_accuracy": 0.4421972860125261, "step": 2123 }, { "epoch": 0.39377085650723026, "grad_norm": 7.66015625, "learning_rate": 9.60622914349277e-06, "loss": 2.6197, "mean_token_accuracy": 0.467395600052694, "step": 2124 }, { "epoch": 0.3939562476826103, "grad_norm": 10.09375, "learning_rate": 9.60604375231739e-06, "loss": 2.3695, "mean_token_accuracy": 0.48359945537814086, "step": 2125 }, { "epoch": 0.39414163885799036, "grad_norm": 7.42578125, "learning_rate": 9.605858361142012e-06, "loss": 3.2091, "mean_token_accuracy": 0.39211837535859884, "step": 2126 }, { "epoch": 0.3943270300333704, "grad_norm": 6.9453125, "learning_rate": 9.60567296996663e-06, "loss": 2.318, "mean_token_accuracy": 0.5069799906933458, "step": 2127 }, { "epoch": 0.39451242120875046, "grad_norm": 6.08984375, "learning_rate": 9.605487578791251e-06, "loss": 2.9046, "mean_token_accuracy": 0.4173882311362431, "step": 2128 }, { "epoch": 0.39469781238413054, "grad_norm": 6.1875, "learning_rate": 9.60530218761587e-06, "loss": 2.9756, "mean_token_accuracy": 0.4227377560710894, "step": 2129 }, { "epoch": 0.39488320355951056, "grad_norm": 7.2890625, "learning_rate": 9.60511679644049e-06, "loss": 2.7194, "mean_token_accuracy": 0.4433718558803535, "step": 2130 }, { "epoch": 0.39506859473489064, "grad_norm": 6.6484375, "learning_rate": 9.60493140526511e-06, "loss": 3.232, "mean_token_accuracy": 0.39049103663289164, "step": 2131 }, { "epoch": 0.39525398591027067, "grad_norm": 6.50390625, "learning_rate": 9.60474601408973e-06, "loss": 2.607, "mean_token_accuracy": 0.4528549551520455, "step": 2132 }, { "epoch": 0.39543937708565075, "grad_norm": 7.03515625, "learning_rate": 9.60456062291435e-06, "loss": 2.6888, "mean_token_accuracy": 0.43695872230345295, "step": 2133 }, { "epoch": 0.39562476826103077, "grad_norm": 7.9140625, "learning_rate": 9.60437523173897e-06, "loss": 2.9753, "mean_token_accuracy": 0.41482404235440673, "step": 2134 }, { "epoch": 0.39581015943641085, "grad_norm": 6.6875, "learning_rate": 9.604189840563591e-06, "loss": 2.8712, "mean_token_accuracy": 0.43239524702939336, "step": 2135 }, { "epoch": 0.39599555061179087, "grad_norm": 5.7890625, "learning_rate": 9.60400444938821e-06, "loss": 2.8143, "mean_token_accuracy": 0.44885033732617374, "step": 2136 }, { "epoch": 0.39618094178717095, "grad_norm": 7.94921875, "learning_rate": 9.60381905821283e-06, "loss": 2.8667, "mean_token_accuracy": 0.42611118146131444, "step": 2137 }, { "epoch": 0.39636633296255097, "grad_norm": 9.1640625, "learning_rate": 9.603633667037449e-06, "loss": 2.9556, "mean_token_accuracy": 0.4074408343361412, "step": 2138 }, { "epoch": 0.39655172413793105, "grad_norm": 6.80859375, "learning_rate": 9.60344827586207e-06, "loss": 2.8473, "mean_token_accuracy": 0.43704873646209386, "step": 2139 }, { "epoch": 0.3967371153133111, "grad_norm": 5.84375, "learning_rate": 9.60326288468669e-06, "loss": 3.104, "mean_token_accuracy": 0.4044405197426517, "step": 2140 }, { "epoch": 0.39692250648869115, "grad_norm": 7.203125, "learning_rate": 9.603077493511309e-06, "loss": 2.7151, "mean_token_accuracy": 0.4464461185718965, "step": 2141 }, { "epoch": 0.3971078976640712, "grad_norm": 6.8203125, "learning_rate": 9.60289210233593e-06, "loss": 3.3042, "mean_token_accuracy": 0.403771491957848, "step": 2142 }, { "epoch": 0.39729328883945125, "grad_norm": 7.71484375, "learning_rate": 9.60270671116055e-06, "loss": 2.7911, "mean_token_accuracy": 0.44799419132328916, "step": 2143 }, { "epoch": 0.3974786800148313, "grad_norm": 6.07421875, "learning_rate": 9.60252131998517e-06, "loss": 3.0516, "mean_token_accuracy": 0.4171237777247794, "step": 2144 }, { "epoch": 0.39766407119021135, "grad_norm": 7.16796875, "learning_rate": 9.60233592880979e-06, "loss": 3.2054, "mean_token_accuracy": 0.39799222797927464, "step": 2145 }, { "epoch": 0.3978494623655914, "grad_norm": 10.6875, "learning_rate": 9.60215053763441e-06, "loss": 2.9818, "mean_token_accuracy": 0.4114414541355502, "step": 2146 }, { "epoch": 0.39803485354097146, "grad_norm": 5.796875, "learning_rate": 9.601965146459028e-06, "loss": 2.4614, "mean_token_accuracy": 0.4758304412493803, "step": 2147 }, { "epoch": 0.3982202447163515, "grad_norm": 8.21875, "learning_rate": 9.601779755283649e-06, "loss": 2.9336, "mean_token_accuracy": 0.4114009953249887, "step": 2148 }, { "epoch": 0.39840563589173156, "grad_norm": 9.296875, "learning_rate": 9.60159436410827e-06, "loss": 2.7118, "mean_token_accuracy": 0.4494267885647947, "step": 2149 }, { "epoch": 0.3985910270671116, "grad_norm": 5.3203125, "learning_rate": 9.60140897293289e-06, "loss": 3.2308, "mean_token_accuracy": 0.39852803006576887, "step": 2150 }, { "epoch": 0.39877641824249166, "grad_norm": 7.66796875, "learning_rate": 9.601223581757509e-06, "loss": 3.5244, "mean_token_accuracy": 0.38055772230889234, "step": 2151 }, { "epoch": 0.3989618094178717, "grad_norm": 5.3046875, "learning_rate": 9.60103819058213e-06, "loss": 2.3049, "mean_token_accuracy": 0.49959094627761114, "step": 2152 }, { "epoch": 0.39914720059325176, "grad_norm": 6.0859375, "learning_rate": 9.60085279940675e-06, "loss": 2.7274, "mean_token_accuracy": 0.4641638225255973, "step": 2153 }, { "epoch": 0.39933259176863184, "grad_norm": 13.671875, "learning_rate": 9.600667408231369e-06, "loss": 2.176, "mean_token_accuracy": 0.5069835824552805, "step": 2154 }, { "epoch": 0.39951798294401186, "grad_norm": 12.1640625, "learning_rate": 9.600482017055989e-06, "loss": 2.4325, "mean_token_accuracy": 0.4857462965268268, "step": 2155 }, { "epoch": 0.39970337411939194, "grad_norm": 6.87109375, "learning_rate": 9.600296625880608e-06, "loss": 2.8785, "mean_token_accuracy": 0.42439628482972136, "step": 2156 }, { "epoch": 0.39988876529477196, "grad_norm": 7.71484375, "learning_rate": 9.600111234705228e-06, "loss": 2.919, "mean_token_accuracy": 0.40865543442352165, "step": 2157 }, { "epoch": 0.40007415647015204, "grad_norm": 6.7890625, "learning_rate": 9.599925843529849e-06, "loss": 2.5982, "mean_token_accuracy": 0.460278276481149, "step": 2158 }, { "epoch": 0.40025954764553207, "grad_norm": 6.23046875, "learning_rate": 9.59974045235447e-06, "loss": 3.1905, "mean_token_accuracy": 0.40333660451422965, "step": 2159 }, { "epoch": 0.40044493882091214, "grad_norm": 7.12890625, "learning_rate": 9.599555061179088e-06, "loss": 3.0669, "mean_token_accuracy": 0.42526118403428875, "step": 2160 }, { "epoch": 0.40063032999629217, "grad_norm": 9.078125, "learning_rate": 9.599369670003709e-06, "loss": 2.6433, "mean_token_accuracy": 0.45611033892868597, "step": 2161 }, { "epoch": 0.40081572117167225, "grad_norm": 5.37109375, "learning_rate": 9.599184278828329e-06, "loss": 3.0843, "mean_token_accuracy": 0.4155622060709705, "step": 2162 }, { "epoch": 0.40100111234705227, "grad_norm": 7.4921875, "learning_rate": 9.598998887652948e-06, "loss": 2.3621, "mean_token_accuracy": 0.5131480890024487, "step": 2163 }, { "epoch": 0.40118650352243235, "grad_norm": 8.5, "learning_rate": 9.598813496477568e-06, "loss": 2.8468, "mean_token_accuracy": 0.4180476047002109, "step": 2164 }, { "epoch": 0.40137189469781237, "grad_norm": 5.24609375, "learning_rate": 9.598628105302187e-06, "loss": 2.9088, "mean_token_accuracy": 0.421875, "step": 2165 }, { "epoch": 0.40155728587319245, "grad_norm": 7.4765625, "learning_rate": 9.59844271412681e-06, "loss": 3.1, "mean_token_accuracy": 0.4135415146651102, "step": 2166 }, { "epoch": 0.40174267704857247, "grad_norm": 6.86328125, "learning_rate": 9.598257322951428e-06, "loss": 3.2294, "mean_token_accuracy": 0.3907189916929247, "step": 2167 }, { "epoch": 0.40192806822395255, "grad_norm": 11.4375, "learning_rate": 9.598071931776049e-06, "loss": 2.7301, "mean_token_accuracy": 0.44564518204039627, "step": 2168 }, { "epoch": 0.4021134593993326, "grad_norm": 7.42578125, "learning_rate": 9.597886540600669e-06, "loss": 2.537, "mean_token_accuracy": 0.46511627906976744, "step": 2169 }, { "epoch": 0.40229885057471265, "grad_norm": 7.734375, "learning_rate": 9.597701149425288e-06, "loss": 2.9266, "mean_token_accuracy": 0.43805704099821746, "step": 2170 }, { "epoch": 0.4024842417500927, "grad_norm": 6.42578125, "learning_rate": 9.597515758249908e-06, "loss": 3.4523, "mean_token_accuracy": 0.35805357351737044, "step": 2171 }, { "epoch": 0.40266963292547275, "grad_norm": 5.9765625, "learning_rate": 9.597330367074527e-06, "loss": 2.7003, "mean_token_accuracy": 0.452103467879477, "step": 2172 }, { "epoch": 0.4028550241008528, "grad_norm": 8.0625, "learning_rate": 9.597144975899148e-06, "loss": 2.5305, "mean_token_accuracy": 0.4557803877175605, "step": 2173 }, { "epoch": 0.40304041527623286, "grad_norm": 6.46484375, "learning_rate": 9.596959584723768e-06, "loss": 2.6961, "mean_token_accuracy": 0.47148495984755684, "step": 2174 }, { "epoch": 0.4032258064516129, "grad_norm": 5.25390625, "learning_rate": 9.596774193548389e-06, "loss": 2.7688, "mean_token_accuracy": 0.4398087787918296, "step": 2175 }, { "epoch": 0.40341119762699296, "grad_norm": 8.953125, "learning_rate": 9.596588802373007e-06, "loss": 2.7808, "mean_token_accuracy": 0.44367781610919454, "step": 2176 }, { "epoch": 0.403596588802373, "grad_norm": 5.23828125, "learning_rate": 9.596403411197628e-06, "loss": 2.7649, "mean_token_accuracy": 0.4540554414784394, "step": 2177 }, { "epoch": 0.40378197997775306, "grad_norm": 7.40625, "learning_rate": 9.596218020022248e-06, "loss": 2.4962, "mean_token_accuracy": 0.4739084132055378, "step": 2178 }, { "epoch": 0.40396737115313314, "grad_norm": 6.1015625, "learning_rate": 9.596032628846867e-06, "loss": 3.4253, "mean_token_accuracy": 0.37210464922512915, "step": 2179 }, { "epoch": 0.40415276232851316, "grad_norm": 7.3125, "learning_rate": 9.595847237671488e-06, "loss": 3.0844, "mean_token_accuracy": 0.42624969219404085, "step": 2180 }, { "epoch": 0.40433815350389324, "grad_norm": 5.7890625, "learning_rate": 9.595661846496107e-06, "loss": 2.9273, "mean_token_accuracy": 0.4272886552781428, "step": 2181 }, { "epoch": 0.40452354467927326, "grad_norm": 8.8125, "learning_rate": 9.595476455320729e-06, "loss": 2.5441, "mean_token_accuracy": 0.4539132484677039, "step": 2182 }, { "epoch": 0.40470893585465334, "grad_norm": 6.84765625, "learning_rate": 9.595291064145348e-06, "loss": 2.5789, "mean_token_accuracy": 0.45889023896314296, "step": 2183 }, { "epoch": 0.40489432703003336, "grad_norm": 7.02734375, "learning_rate": 9.595105672969968e-06, "loss": 2.5914, "mean_token_accuracy": 0.4703813903164728, "step": 2184 }, { "epoch": 0.40507971820541344, "grad_norm": 5.83984375, "learning_rate": 9.594920281794587e-06, "loss": 3.1223, "mean_token_accuracy": 0.3999721176634602, "step": 2185 }, { "epoch": 0.40526510938079346, "grad_norm": 7.41796875, "learning_rate": 9.594734890619207e-06, "loss": 2.6828, "mean_token_accuracy": 0.4577973015374961, "step": 2186 }, { "epoch": 0.40545050055617354, "grad_norm": 10.6796875, "learning_rate": 9.594549499443828e-06, "loss": 3.0733, "mean_token_accuracy": 0.4001912829621533, "step": 2187 }, { "epoch": 0.40563589173155357, "grad_norm": 6.8359375, "learning_rate": 9.594364108268447e-06, "loss": 2.7907, "mean_token_accuracy": 0.4379369476322385, "step": 2188 }, { "epoch": 0.40582128290693364, "grad_norm": 7.05078125, "learning_rate": 9.594178717093067e-06, "loss": 2.5177, "mean_token_accuracy": 0.4611839262566614, "step": 2189 }, { "epoch": 0.40600667408231367, "grad_norm": 11.4375, "learning_rate": 9.593993325917688e-06, "loss": 2.8458, "mean_token_accuracy": 0.4365539858728557, "step": 2190 }, { "epoch": 0.40619206525769375, "grad_norm": 9.5703125, "learning_rate": 9.593807934742308e-06, "loss": 3.1345, "mean_token_accuracy": 0.40004364906154516, "step": 2191 }, { "epoch": 0.40637745643307377, "grad_norm": 6.9921875, "learning_rate": 9.593622543566927e-06, "loss": 2.7532, "mean_token_accuracy": 0.437296858071506, "step": 2192 }, { "epoch": 0.40656284760845385, "grad_norm": 5.94921875, "learning_rate": 9.593437152391547e-06, "loss": 2.9164, "mean_token_accuracy": 0.4266436979615274, "step": 2193 }, { "epoch": 0.40674823878383387, "grad_norm": 8.4296875, "learning_rate": 9.593251761216166e-06, "loss": 2.961, "mean_token_accuracy": 0.40921889665241423, "step": 2194 }, { "epoch": 0.40693362995921395, "grad_norm": 7.375, "learning_rate": 9.593066370040787e-06, "loss": 3.1608, "mean_token_accuracy": 0.3966967583093968, "step": 2195 }, { "epoch": 0.407119021134594, "grad_norm": 8.5625, "learning_rate": 9.592880978865407e-06, "loss": 2.8243, "mean_token_accuracy": 0.4401072011186204, "step": 2196 }, { "epoch": 0.40730441230997405, "grad_norm": 5.88671875, "learning_rate": 9.592695587690026e-06, "loss": 3.4901, "mean_token_accuracy": 0.37498164733519307, "step": 2197 }, { "epoch": 0.4074898034853541, "grad_norm": 7.1171875, "learning_rate": 9.592510196514646e-06, "loss": 2.6724, "mean_token_accuracy": 0.45298109549200194, "step": 2198 }, { "epoch": 0.40767519466073415, "grad_norm": 8.03125, "learning_rate": 9.592324805339267e-06, "loss": 2.5993, "mean_token_accuracy": 0.4460420765854397, "step": 2199 }, { "epoch": 0.4078605858361142, "grad_norm": 6.4921875, "learning_rate": 9.592139414163887e-06, "loss": 2.5064, "mean_token_accuracy": 0.49355747936380107, "step": 2200 }, { "epoch": 0.40804597701149425, "grad_norm": 6.87109375, "learning_rate": 9.591954022988506e-06, "loss": 2.891, "mean_token_accuracy": 0.4084868619752341, "step": 2201 }, { "epoch": 0.4082313681868743, "grad_norm": 7.6640625, "learning_rate": 9.591768631813127e-06, "loss": 2.9136, "mean_token_accuracy": 0.42668661588683354, "step": 2202 }, { "epoch": 0.40841675936225436, "grad_norm": 7.66015625, "learning_rate": 9.591583240637745e-06, "loss": 3.4947, "mean_token_accuracy": 0.3673661555017487, "step": 2203 }, { "epoch": 0.40860215053763443, "grad_norm": 6.83984375, "learning_rate": 9.591397849462366e-06, "loss": 3.3369, "mean_token_accuracy": 0.4024370691037358, "step": 2204 }, { "epoch": 0.40878754171301446, "grad_norm": 6.0546875, "learning_rate": 9.591212458286986e-06, "loss": 2.8524, "mean_token_accuracy": 0.4304034197168047, "step": 2205 }, { "epoch": 0.40897293288839454, "grad_norm": 6.95703125, "learning_rate": 9.591027067111607e-06, "loss": 3.0428, "mean_token_accuracy": 0.4013611872577748, "step": 2206 }, { "epoch": 0.40915832406377456, "grad_norm": 6.42578125, "learning_rate": 9.590841675936227e-06, "loss": 2.4655, "mean_token_accuracy": 0.46412838263058526, "step": 2207 }, { "epoch": 0.40934371523915464, "grad_norm": 6.75390625, "learning_rate": 9.590656284760846e-06, "loss": 2.2468, "mean_token_accuracy": 0.5171443193449334, "step": 2208 }, { "epoch": 0.40952910641453466, "grad_norm": 6.0859375, "learning_rate": 9.590470893585467e-06, "loss": 3.1099, "mean_token_accuracy": 0.4083229391087425, "step": 2209 }, { "epoch": 0.40971449758991474, "grad_norm": 7.49609375, "learning_rate": 9.590285502410086e-06, "loss": 2.7473, "mean_token_accuracy": 0.44744659782147406, "step": 2210 }, { "epoch": 0.40989988876529476, "grad_norm": 6.5546875, "learning_rate": 9.590100111234706e-06, "loss": 2.9004, "mean_token_accuracy": 0.42825878812747914, "step": 2211 }, { "epoch": 0.41008527994067484, "grad_norm": 6.71484375, "learning_rate": 9.589914720059327e-06, "loss": 2.6701, "mean_token_accuracy": 0.4524519747046892, "step": 2212 }, { "epoch": 0.41027067111605486, "grad_norm": 6.25390625, "learning_rate": 9.589729328883945e-06, "loss": 2.6886, "mean_token_accuracy": 0.4447219666097016, "step": 2213 }, { "epoch": 0.41045606229143494, "grad_norm": 5.359375, "learning_rate": 9.589543937708566e-06, "loss": 3.2223, "mean_token_accuracy": 0.39273674532638675, "step": 2214 }, { "epoch": 0.41064145346681497, "grad_norm": 8.75, "learning_rate": 9.589358546533186e-06, "loss": 2.7485, "mean_token_accuracy": 0.43857634902411025, "step": 2215 }, { "epoch": 0.41082684464219504, "grad_norm": 6.03515625, "learning_rate": 9.589173155357807e-06, "loss": 3.0011, "mean_token_accuracy": 0.4093845630737844, "step": 2216 }, { "epoch": 0.41101223581757507, "grad_norm": 5.87109375, "learning_rate": 9.588987764182426e-06, "loss": 2.8778, "mean_token_accuracy": 0.4264786870800749, "step": 2217 }, { "epoch": 0.41119762699295515, "grad_norm": 5.26171875, "learning_rate": 9.588802373007046e-06, "loss": 2.7295, "mean_token_accuracy": 0.44439317028149516, "step": 2218 }, { "epoch": 0.41138301816833517, "grad_norm": 5.40625, "learning_rate": 9.588616981831665e-06, "loss": 2.6191, "mean_token_accuracy": 0.47444470981609743, "step": 2219 }, { "epoch": 0.41156840934371525, "grad_norm": 4.84375, "learning_rate": 9.588431590656285e-06, "loss": 2.5209, "mean_token_accuracy": 0.48569458807307825, "step": 2220 }, { "epoch": 0.41175380051909527, "grad_norm": 5.85546875, "learning_rate": 9.588246199480906e-06, "loss": 2.6633, "mean_token_accuracy": 0.4565012773967998, "step": 2221 }, { "epoch": 0.41193919169447535, "grad_norm": 5.6171875, "learning_rate": 9.588060808305526e-06, "loss": 2.7262, "mean_token_accuracy": 0.44534466728649225, "step": 2222 }, { "epoch": 0.41212458286985537, "grad_norm": 6.2578125, "learning_rate": 9.587875417130145e-06, "loss": 2.4175, "mean_token_accuracy": 0.473217166828009, "step": 2223 }, { "epoch": 0.41230997404523545, "grad_norm": 6.3515625, "learning_rate": 9.587690025954766e-06, "loss": 2.209, "mean_token_accuracy": 0.5235075442816531, "step": 2224 }, { "epoch": 0.4124953652206155, "grad_norm": 5.91796875, "learning_rate": 9.587504634779386e-06, "loss": 3.1116, "mean_token_accuracy": 0.40691489361702127, "step": 2225 }, { "epoch": 0.41268075639599555, "grad_norm": 6.20703125, "learning_rate": 9.587319243604005e-06, "loss": 2.4577, "mean_token_accuracy": 0.4869445716903344, "step": 2226 }, { "epoch": 0.41286614757137563, "grad_norm": 6.37109375, "learning_rate": 9.587133852428625e-06, "loss": 3.2775, "mean_token_accuracy": 0.3961429799778897, "step": 2227 }, { "epoch": 0.41305153874675565, "grad_norm": 6.19140625, "learning_rate": 9.586948461253244e-06, "loss": 2.9515, "mean_token_accuracy": 0.4275176877916604, "step": 2228 }, { "epoch": 0.41323692992213573, "grad_norm": 5.85546875, "learning_rate": 9.586763070077865e-06, "loss": 2.2488, "mean_token_accuracy": 0.5189208128941836, "step": 2229 }, { "epoch": 0.41342232109751575, "grad_norm": 5.3984375, "learning_rate": 9.586577678902485e-06, "loss": 3.2799, "mean_token_accuracy": 0.4077181208053691, "step": 2230 }, { "epoch": 0.41360771227289583, "grad_norm": 5.49609375, "learning_rate": 9.586392287727106e-06, "loss": 2.805, "mean_token_accuracy": 0.44299973732597847, "step": 2231 }, { "epoch": 0.41379310344827586, "grad_norm": 7.0078125, "learning_rate": 9.586206896551724e-06, "loss": 2.9177, "mean_token_accuracy": 0.42305061559507523, "step": 2232 }, { "epoch": 0.41397849462365593, "grad_norm": 6.40234375, "learning_rate": 9.586021505376345e-06, "loss": 2.9239, "mean_token_accuracy": 0.4374907966426152, "step": 2233 }, { "epoch": 0.41416388579903596, "grad_norm": 5.00390625, "learning_rate": 9.585836114200965e-06, "loss": 3.1282, "mean_token_accuracy": 0.4031123139377537, "step": 2234 }, { "epoch": 0.41434927697441604, "grad_norm": 5.84765625, "learning_rate": 9.585650723025584e-06, "loss": 2.6302, "mean_token_accuracy": 0.4598634196624146, "step": 2235 }, { "epoch": 0.41453466814979606, "grad_norm": 5.20703125, "learning_rate": 9.585465331850205e-06, "loss": 3.2291, "mean_token_accuracy": 0.41088498304039467, "step": 2236 }, { "epoch": 0.41472005932517614, "grad_norm": 6.34765625, "learning_rate": 9.585279940674824e-06, "loss": 2.522, "mean_token_accuracy": 0.4711177794448612, "step": 2237 }, { "epoch": 0.41490545050055616, "grad_norm": 6.484375, "learning_rate": 9.585094549499444e-06, "loss": 3.0161, "mean_token_accuracy": 0.4154883901932806, "step": 2238 }, { "epoch": 0.41509084167593624, "grad_norm": 5.9453125, "learning_rate": 9.584909158324065e-06, "loss": 3.1629, "mean_token_accuracy": 0.41485998193315266, "step": 2239 }, { "epoch": 0.41527623285131626, "grad_norm": 5.953125, "learning_rate": 9.584723767148685e-06, "loss": 2.2707, "mean_token_accuracy": 0.5178206251825884, "step": 2240 }, { "epoch": 0.41546162402669634, "grad_norm": 6.41796875, "learning_rate": 9.584538375973304e-06, "loss": 2.8583, "mean_token_accuracy": 0.4417053364269142, "step": 2241 }, { "epoch": 0.41564701520207636, "grad_norm": 6.52734375, "learning_rate": 9.584352984797924e-06, "loss": 2.8294, "mean_token_accuracy": 0.4526962457337884, "step": 2242 }, { "epoch": 0.41583240637745644, "grad_norm": 6.11328125, "learning_rate": 9.584167593622545e-06, "loss": 2.9048, "mean_token_accuracy": 0.4266694403994175, "step": 2243 }, { "epoch": 0.41601779755283647, "grad_norm": 6.24609375, "learning_rate": 9.583982202447164e-06, "loss": 2.4896, "mean_token_accuracy": 0.48919753086419754, "step": 2244 }, { "epoch": 0.41620318872821654, "grad_norm": 7.5859375, "learning_rate": 9.583796811271784e-06, "loss": 2.288, "mean_token_accuracy": 0.51, "step": 2245 }, { "epoch": 0.41638857990359657, "grad_norm": 6.41015625, "learning_rate": 9.583611420096403e-06, "loss": 2.6851, "mean_token_accuracy": 0.4567398119122257, "step": 2246 }, { "epoch": 0.41657397107897665, "grad_norm": 6.890625, "learning_rate": 9.583426028921025e-06, "loss": 2.6854, "mean_token_accuracy": 0.44308614923307577, "step": 2247 }, { "epoch": 0.41675936225435667, "grad_norm": 5.95703125, "learning_rate": 9.583240637745644e-06, "loss": 2.4149, "mean_token_accuracy": 0.4935596302470071, "step": 2248 }, { "epoch": 0.41694475342973675, "grad_norm": 5.53515625, "learning_rate": 9.583055246570264e-06, "loss": 2.9091, "mean_token_accuracy": 0.4063740368900304, "step": 2249 }, { "epoch": 0.41713014460511677, "grad_norm": 9.59375, "learning_rate": 9.582869855394885e-06, "loss": 2.9351, "mean_token_accuracy": 0.41020106781778937, "step": 2250 }, { "epoch": 0.41731553578049685, "grad_norm": 7.80859375, "learning_rate": 9.582684464219504e-06, "loss": 3.2058, "mean_token_accuracy": 0.41066003866335266, "step": 2251 }, { "epoch": 0.4175009269558769, "grad_norm": 6.73828125, "learning_rate": 9.582499073044124e-06, "loss": 2.7334, "mean_token_accuracy": 0.4456549935149157, "step": 2252 }, { "epoch": 0.41768631813125695, "grad_norm": 8.125, "learning_rate": 9.582313681868743e-06, "loss": 2.6819, "mean_token_accuracy": 0.4686708131766874, "step": 2253 }, { "epoch": 0.41787170930663703, "grad_norm": 8.140625, "learning_rate": 9.582128290693363e-06, "loss": 3.0016, "mean_token_accuracy": 0.4097625968992248, "step": 2254 }, { "epoch": 0.41805710048201705, "grad_norm": 7.0390625, "learning_rate": 9.581942899517984e-06, "loss": 3.5263, "mean_token_accuracy": 0.371763423276097, "step": 2255 }, { "epoch": 0.41824249165739713, "grad_norm": 6.765625, "learning_rate": 9.581757508342604e-06, "loss": 3.1005, "mean_token_accuracy": 0.39565943238731216, "step": 2256 }, { "epoch": 0.41842788283277715, "grad_norm": 6.11328125, "learning_rate": 9.581572117167223e-06, "loss": 2.8462, "mean_token_accuracy": 0.4264867237217468, "step": 2257 }, { "epoch": 0.41861327400815723, "grad_norm": 6.43359375, "learning_rate": 9.581386725991844e-06, "loss": 3.4261, "mean_token_accuracy": 0.3723460721868365, "step": 2258 }, { "epoch": 0.41879866518353726, "grad_norm": 5.55078125, "learning_rate": 9.581201334816464e-06, "loss": 2.8604, "mean_token_accuracy": 0.45174699471969443, "step": 2259 }, { "epoch": 0.41898405635891733, "grad_norm": 5.453125, "learning_rate": 9.581015943641083e-06, "loss": 2.8714, "mean_token_accuracy": 0.4332957534761368, "step": 2260 }, { "epoch": 0.41916944753429736, "grad_norm": 5.42578125, "learning_rate": 9.580830552465703e-06, "loss": 2.8619, "mean_token_accuracy": 0.4447099429178282, "step": 2261 }, { "epoch": 0.41935483870967744, "grad_norm": 5.90234375, "learning_rate": 9.580645161290322e-06, "loss": 2.4376, "mean_token_accuracy": 0.4953051643192488, "step": 2262 }, { "epoch": 0.41954022988505746, "grad_norm": 6.1484375, "learning_rate": 9.580459770114944e-06, "loss": 2.6206, "mean_token_accuracy": 0.45974770642201834, "step": 2263 }, { "epoch": 0.41972562106043754, "grad_norm": 6.81640625, "learning_rate": 9.580274378939563e-06, "loss": 2.6863, "mean_token_accuracy": 0.4523254005288536, "step": 2264 }, { "epoch": 0.41991101223581756, "grad_norm": 11.6953125, "learning_rate": 9.580088987764184e-06, "loss": 2.9287, "mean_token_accuracy": 0.4245207667731629, "step": 2265 }, { "epoch": 0.42009640341119764, "grad_norm": 6.78125, "learning_rate": 9.579903596588803e-06, "loss": 2.9945, "mean_token_accuracy": 0.417077570655442, "step": 2266 }, { "epoch": 0.42028179458657766, "grad_norm": 7.203125, "learning_rate": 9.579718205413423e-06, "loss": 2.9123, "mean_token_accuracy": 0.42065454545454545, "step": 2267 }, { "epoch": 0.42046718576195774, "grad_norm": 6.73828125, "learning_rate": 9.579532814238044e-06, "loss": 2.767, "mean_token_accuracy": 0.4510226049515608, "step": 2268 }, { "epoch": 0.42065257693733776, "grad_norm": 7.34375, "learning_rate": 9.579347423062662e-06, "loss": 2.5501, "mean_token_accuracy": 0.46499921346547113, "step": 2269 }, { "epoch": 0.42083796811271784, "grad_norm": 5.9921875, "learning_rate": 9.579162031887283e-06, "loss": 2.6606, "mean_token_accuracy": 0.44438164141985076, "step": 2270 }, { "epoch": 0.42102335928809786, "grad_norm": 5.4375, "learning_rate": 9.578976640711903e-06, "loss": 2.7844, "mean_token_accuracy": 0.44346289752650175, "step": 2271 }, { "epoch": 0.42120875046347794, "grad_norm": 7.71484375, "learning_rate": 9.578791249536524e-06, "loss": 2.4964, "mean_token_accuracy": 0.46576007770762506, "step": 2272 }, { "epoch": 0.42139414163885797, "grad_norm": 8.7421875, "learning_rate": 9.578605858361143e-06, "loss": 2.7703, "mean_token_accuracy": 0.4587604478788835, "step": 2273 }, { "epoch": 0.42157953281423804, "grad_norm": 5.65625, "learning_rate": 9.578420467185763e-06, "loss": 2.9912, "mean_token_accuracy": 0.4190771349862259, "step": 2274 }, { "epoch": 0.42176492398961807, "grad_norm": 6.38671875, "learning_rate": 9.578235076010382e-06, "loss": 3.0331, "mean_token_accuracy": 0.41849039749888345, "step": 2275 }, { "epoch": 0.42195031516499815, "grad_norm": 5.7578125, "learning_rate": 9.578049684835002e-06, "loss": 3.2431, "mean_token_accuracy": 0.39229741222208725, "step": 2276 }, { "epoch": 0.4221357063403782, "grad_norm": 6.25390625, "learning_rate": 9.577864293659623e-06, "loss": 2.6444, "mean_token_accuracy": 0.4572508842541773, "step": 2277 }, { "epoch": 0.42232109751575825, "grad_norm": 7.94921875, "learning_rate": 9.577678902484242e-06, "loss": 3.124, "mean_token_accuracy": 0.41318891366677285, "step": 2278 }, { "epoch": 0.4225064886911383, "grad_norm": 5.48046875, "learning_rate": 9.577493511308862e-06, "loss": 2.6967, "mean_token_accuracy": 0.4566266721831472, "step": 2279 }, { "epoch": 0.42269187986651835, "grad_norm": 9.8359375, "learning_rate": 9.577308120133483e-06, "loss": 3.0674, "mean_token_accuracy": 0.40586592178770947, "step": 2280 }, { "epoch": 0.42287727104189843, "grad_norm": 8.9375, "learning_rate": 9.577122728958103e-06, "loss": 3.1422, "mean_token_accuracy": 0.4288283303361461, "step": 2281 }, { "epoch": 0.42306266221727845, "grad_norm": 5.9765625, "learning_rate": 9.576937337782722e-06, "loss": 2.8583, "mean_token_accuracy": 0.4208240652473255, "step": 2282 }, { "epoch": 0.42324805339265853, "grad_norm": 6.3828125, "learning_rate": 9.576751946607342e-06, "loss": 2.57, "mean_token_accuracy": 0.4715649104458099, "step": 2283 }, { "epoch": 0.42343344456803855, "grad_norm": 8.6796875, "learning_rate": 9.576566555431961e-06, "loss": 2.7068, "mean_token_accuracy": 0.44070002892681515, "step": 2284 }, { "epoch": 0.42361883574341863, "grad_norm": 6.45703125, "learning_rate": 9.576381164256582e-06, "loss": 2.7965, "mean_token_accuracy": 0.43934426229508194, "step": 2285 }, { "epoch": 0.42380422691879865, "grad_norm": 6.77734375, "learning_rate": 9.576195773081202e-06, "loss": 2.7099, "mean_token_accuracy": 0.45409778403095324, "step": 2286 }, { "epoch": 0.42398961809417873, "grad_norm": 5.3046875, "learning_rate": 9.576010381905823e-06, "loss": 2.6851, "mean_token_accuracy": 0.4520547945205479, "step": 2287 }, { "epoch": 0.42417500926955876, "grad_norm": 6.34765625, "learning_rate": 9.575824990730443e-06, "loss": 2.902, "mean_token_accuracy": 0.4282057532910775, "step": 2288 }, { "epoch": 0.42436040044493883, "grad_norm": 5.8671875, "learning_rate": 9.575639599555062e-06, "loss": 3.0857, "mean_token_accuracy": 0.4180987600609093, "step": 2289 }, { "epoch": 0.42454579162031886, "grad_norm": 6.171875, "learning_rate": 9.575454208379682e-06, "loss": 2.998, "mean_token_accuracy": 0.42883945322969713, "step": 2290 }, { "epoch": 0.42473118279569894, "grad_norm": 6.640625, "learning_rate": 9.575268817204301e-06, "loss": 2.8029, "mean_token_accuracy": 0.44484672942312337, "step": 2291 }, { "epoch": 0.42491657397107896, "grad_norm": 6.3515625, "learning_rate": 9.575083426028922e-06, "loss": 2.6079, "mean_token_accuracy": 0.4652828533840082, "step": 2292 }, { "epoch": 0.42510196514645904, "grad_norm": 7.15625, "learning_rate": 9.57489803485354e-06, "loss": 3.2783, "mean_token_accuracy": 0.3789332738228074, "step": 2293 }, { "epoch": 0.42528735632183906, "grad_norm": 5.84765625, "learning_rate": 9.574712643678161e-06, "loss": 3.2932, "mean_token_accuracy": 0.39693890352527916, "step": 2294 }, { "epoch": 0.42547274749721914, "grad_norm": 6.05859375, "learning_rate": 9.574527252502782e-06, "loss": 2.7962, "mean_token_accuracy": 0.4486226497595103, "step": 2295 }, { "epoch": 0.42565813867259916, "grad_norm": 7.83984375, "learning_rate": 9.574341861327402e-06, "loss": 2.7918, "mean_token_accuracy": 0.4354358082940154, "step": 2296 }, { "epoch": 0.42584352984797924, "grad_norm": 7.33203125, "learning_rate": 9.574156470152023e-06, "loss": 2.8192, "mean_token_accuracy": 0.43056141831996625, "step": 2297 }, { "epoch": 0.42602892102335926, "grad_norm": 6.8203125, "learning_rate": 9.573971078976641e-06, "loss": 2.6568, "mean_token_accuracy": 0.45289427052569403, "step": 2298 }, { "epoch": 0.42621431219873934, "grad_norm": 6.17578125, "learning_rate": 9.573785687801262e-06, "loss": 3.233, "mean_token_accuracy": 0.40052459016393444, "step": 2299 }, { "epoch": 0.42639970337411937, "grad_norm": 6.828125, "learning_rate": 9.57360029662588e-06, "loss": 3.5155, "mean_token_accuracy": 0.3681762210972773, "step": 2300 }, { "epoch": 0.42658509454949944, "grad_norm": 8.125, "learning_rate": 9.573414905450501e-06, "loss": 2.7311, "mean_token_accuracy": 0.4388185654008439, "step": 2301 }, { "epoch": 0.4267704857248795, "grad_norm": 6.0, "learning_rate": 9.573229514275122e-06, "loss": 3.2209, "mean_token_accuracy": 0.38390630083505045, "step": 2302 }, { "epoch": 0.42695587690025955, "grad_norm": 6.203125, "learning_rate": 9.573044123099742e-06, "loss": 2.3343, "mean_token_accuracy": 0.482729089351984, "step": 2303 }, { "epoch": 0.4271412680756396, "grad_norm": 6.90234375, "learning_rate": 9.572858731924361e-06, "loss": 2.3338, "mean_token_accuracy": 0.49918330308529946, "step": 2304 }, { "epoch": 0.42732665925101965, "grad_norm": 4.89453125, "learning_rate": 9.572673340748981e-06, "loss": 2.7545, "mean_token_accuracy": 0.43779510266827715, "step": 2305 }, { "epoch": 0.4275120504263997, "grad_norm": 7.27734375, "learning_rate": 9.572487949573602e-06, "loss": 2.5788, "mean_token_accuracy": 0.4614881082260528, "step": 2306 }, { "epoch": 0.42769744160177975, "grad_norm": 8.140625, "learning_rate": 9.57230255839822e-06, "loss": 2.7198, "mean_token_accuracy": 0.4642602368383507, "step": 2307 }, { "epoch": 0.4278828327771598, "grad_norm": 6.69921875, "learning_rate": 9.572117167222841e-06, "loss": 2.688, "mean_token_accuracy": 0.45572126171307903, "step": 2308 }, { "epoch": 0.42806822395253985, "grad_norm": 6.46875, "learning_rate": 9.57193177604746e-06, "loss": 3.0823, "mean_token_accuracy": 0.4181265382944839, "step": 2309 }, { "epoch": 0.42825361512791993, "grad_norm": 5.6171875, "learning_rate": 9.57174638487208e-06, "loss": 3.3536, "mean_token_accuracy": 0.3746069182389937, "step": 2310 }, { "epoch": 0.42843900630329995, "grad_norm": 6.59765625, "learning_rate": 9.571560993696701e-06, "loss": 3.0761, "mean_token_accuracy": 0.40766457470957407, "step": 2311 }, { "epoch": 0.42862439747868003, "grad_norm": 4.83203125, "learning_rate": 9.571375602521321e-06, "loss": 3.0865, "mean_token_accuracy": 0.4098548073625243, "step": 2312 }, { "epoch": 0.42880978865406005, "grad_norm": 5.90234375, "learning_rate": 9.57119021134594e-06, "loss": 2.7534, "mean_token_accuracy": 0.4348197748967089, "step": 2313 }, { "epoch": 0.42899517982944013, "grad_norm": 5.88671875, "learning_rate": 9.57100482017056e-06, "loss": 2.7977, "mean_token_accuracy": 0.44222160044767767, "step": 2314 }, { "epoch": 0.42918057100482015, "grad_norm": 6.6484375, "learning_rate": 9.570819428995181e-06, "loss": 2.5792, "mean_token_accuracy": 0.46234522942461764, "step": 2315 }, { "epoch": 0.42936596218020023, "grad_norm": 6.4765625, "learning_rate": 9.5706340378198e-06, "loss": 2.7415, "mean_token_accuracy": 0.44592592592592595, "step": 2316 }, { "epoch": 0.42955135335558026, "grad_norm": 6.0390625, "learning_rate": 9.57044864664442e-06, "loss": 3.2823, "mean_token_accuracy": 0.39098291116228334, "step": 2317 }, { "epoch": 0.42973674453096034, "grad_norm": 7.40234375, "learning_rate": 9.57026325546904e-06, "loss": 3.1571, "mean_token_accuracy": 0.40427046263345195, "step": 2318 }, { "epoch": 0.42992213570634036, "grad_norm": 10.71875, "learning_rate": 9.570077864293661e-06, "loss": 2.784, "mean_token_accuracy": 0.43703358208955223, "step": 2319 }, { "epoch": 0.43010752688172044, "grad_norm": 5.8515625, "learning_rate": 9.56989247311828e-06, "loss": 2.9161, "mean_token_accuracy": 0.4267010088001717, "step": 2320 }, { "epoch": 0.43029291805710046, "grad_norm": 8.1640625, "learning_rate": 9.5697070819429e-06, "loss": 2.8264, "mean_token_accuracy": 0.44167794316644116, "step": 2321 }, { "epoch": 0.43047830923248054, "grad_norm": 6.33984375, "learning_rate": 9.56952169076752e-06, "loss": 2.9504, "mean_token_accuracy": 0.42596030272740254, "step": 2322 }, { "epoch": 0.43066370040786056, "grad_norm": 7.79296875, "learning_rate": 9.56933629959214e-06, "loss": 2.7541, "mean_token_accuracy": 0.44746650549007755, "step": 2323 }, { "epoch": 0.43084909158324064, "grad_norm": 5.8515625, "learning_rate": 9.56915090841676e-06, "loss": 2.5844, "mean_token_accuracy": 0.45316896690339004, "step": 2324 }, { "epoch": 0.43103448275862066, "grad_norm": 6.2265625, "learning_rate": 9.56896551724138e-06, "loss": 2.8174, "mean_token_accuracy": 0.4273395532937518, "step": 2325 }, { "epoch": 0.43121987393400074, "grad_norm": 6.6328125, "learning_rate": 9.568780126066e-06, "loss": 2.8076, "mean_token_accuracy": 0.43569154091097884, "step": 2326 }, { "epoch": 0.4314052651093808, "grad_norm": 7.4453125, "learning_rate": 9.56859473489062e-06, "loss": 2.7529, "mean_token_accuracy": 0.4272092627277371, "step": 2327 }, { "epoch": 0.43159065628476084, "grad_norm": 6.33203125, "learning_rate": 9.56840934371524e-06, "loss": 2.7995, "mean_token_accuracy": 0.44236709478133635, "step": 2328 }, { "epoch": 0.4317760474601409, "grad_norm": 5.62109375, "learning_rate": 9.56822395253986e-06, "loss": 3.1791, "mean_token_accuracy": 0.41021897810218977, "step": 2329 }, { "epoch": 0.43196143863552094, "grad_norm": 8.34375, "learning_rate": 9.56803856136448e-06, "loss": 2.577, "mean_token_accuracy": 0.46273964131106987, "step": 2330 }, { "epoch": 0.432146829810901, "grad_norm": 6.609375, "learning_rate": 9.5678531701891e-06, "loss": 2.8645, "mean_token_accuracy": 0.42329700272479565, "step": 2331 }, { "epoch": 0.43233222098628105, "grad_norm": 8.234375, "learning_rate": 9.56766777901372e-06, "loss": 2.4701, "mean_token_accuracy": 0.48970716149608584, "step": 2332 }, { "epoch": 0.4325176121616611, "grad_norm": 6.546875, "learning_rate": 9.56748238783834e-06, "loss": 3.0891, "mean_token_accuracy": 0.4025445292620865, "step": 2333 }, { "epoch": 0.43270300333704115, "grad_norm": 7.6484375, "learning_rate": 9.567296996662959e-06, "loss": 3.1577, "mean_token_accuracy": 0.4045156407669021, "step": 2334 }, { "epoch": 0.4328883945124212, "grad_norm": 6.54296875, "learning_rate": 9.567111605487581e-06, "loss": 2.8963, "mean_token_accuracy": 0.41485784163864264, "step": 2335 }, { "epoch": 0.43307378568780125, "grad_norm": 5.66796875, "learning_rate": 9.5669262143122e-06, "loss": 2.4262, "mean_token_accuracy": 0.498676293622142, "step": 2336 }, { "epoch": 0.43325917686318133, "grad_norm": 6.58984375, "learning_rate": 9.56674082313682e-06, "loss": 2.8299, "mean_token_accuracy": 0.43812036688026135, "step": 2337 }, { "epoch": 0.43344456803856135, "grad_norm": 5.890625, "learning_rate": 9.566555431961439e-06, "loss": 3.2132, "mean_token_accuracy": 0.4051290374939133, "step": 2338 }, { "epoch": 0.43362995921394143, "grad_norm": 6.421875, "learning_rate": 9.56637004078606e-06, "loss": 3.0045, "mean_token_accuracy": 0.41964285714285715, "step": 2339 }, { "epoch": 0.43381535038932145, "grad_norm": 7.3125, "learning_rate": 9.56618464961068e-06, "loss": 2.7718, "mean_token_accuracy": 0.4379874213836478, "step": 2340 }, { "epoch": 0.43400074156470153, "grad_norm": 6.0703125, "learning_rate": 9.565999258435299e-06, "loss": 3.4193, "mean_token_accuracy": 0.37652681890600104, "step": 2341 }, { "epoch": 0.43418613274008155, "grad_norm": 9.0, "learning_rate": 9.56581386725992e-06, "loss": 3.2395, "mean_token_accuracy": 0.402314137518287, "step": 2342 }, { "epoch": 0.43437152391546163, "grad_norm": 9.6875, "learning_rate": 9.56562847608454e-06, "loss": 3.2184, "mean_token_accuracy": 0.37963930998431783, "step": 2343 }, { "epoch": 0.43455691509084166, "grad_norm": 6.58203125, "learning_rate": 9.56544308490916e-06, "loss": 3.0456, "mean_token_accuracy": 0.4261033877716291, "step": 2344 }, { "epoch": 0.43474230626622173, "grad_norm": 8.9453125, "learning_rate": 9.565257693733779e-06, "loss": 3.0722, "mean_token_accuracy": 0.4226220223221723, "step": 2345 }, { "epoch": 0.43492769744160176, "grad_norm": 6.67578125, "learning_rate": 9.5650723025584e-06, "loss": 2.719, "mean_token_accuracy": 0.44951830443159924, "step": 2346 }, { "epoch": 0.43511308861698184, "grad_norm": 5.71484375, "learning_rate": 9.564886911383018e-06, "loss": 2.4749, "mean_token_accuracy": 0.4772456870910173, "step": 2347 }, { "epoch": 0.43529847979236186, "grad_norm": 7.4921875, "learning_rate": 9.564701520207639e-06, "loss": 2.8291, "mean_token_accuracy": 0.44114394059093065, "step": 2348 }, { "epoch": 0.43548387096774194, "grad_norm": 9.1328125, "learning_rate": 9.56451612903226e-06, "loss": 2.6548, "mean_token_accuracy": 0.4633623768033138, "step": 2349 }, { "epoch": 0.43566926214312196, "grad_norm": 6.78125, "learning_rate": 9.564330737856878e-06, "loss": 2.4607, "mean_token_accuracy": 0.486877405808935, "step": 2350 }, { "epoch": 0.43585465331850204, "grad_norm": 8.0625, "learning_rate": 9.564145346681499e-06, "loss": 2.5791, "mean_token_accuracy": 0.46268896751367, "step": 2351 }, { "epoch": 0.4360400444938821, "grad_norm": 10.15625, "learning_rate": 9.563959955506119e-06, "loss": 2.9099, "mean_token_accuracy": 0.4356060606060606, "step": 2352 }, { "epoch": 0.43622543566926214, "grad_norm": 8.171875, "learning_rate": 9.56377456433074e-06, "loss": 2.9472, "mean_token_accuracy": 0.4171475680131904, "step": 2353 }, { "epoch": 0.4364108268446422, "grad_norm": 5.796875, "learning_rate": 9.563589173155358e-06, "loss": 3.0594, "mean_token_accuracy": 0.4031359906213365, "step": 2354 }, { "epoch": 0.43659621802002224, "grad_norm": 7.07421875, "learning_rate": 9.563403781979979e-06, "loss": 2.4322, "mean_token_accuracy": 0.49316615836439626, "step": 2355 }, { "epoch": 0.4367816091954023, "grad_norm": 8.3203125, "learning_rate": 9.563218390804598e-06, "loss": 2.6021, "mean_token_accuracy": 0.4652005799903335, "step": 2356 }, { "epoch": 0.43696700037078234, "grad_norm": 9.109375, "learning_rate": 9.563032999629218e-06, "loss": 3.1881, "mean_token_accuracy": 0.40682058246567077, "step": 2357 }, { "epoch": 0.4371523915461624, "grad_norm": 7.75, "learning_rate": 9.562847608453839e-06, "loss": 2.2675, "mean_token_accuracy": 0.5075212557226946, "step": 2358 }, { "epoch": 0.43733778272154245, "grad_norm": 8.625, "learning_rate": 9.562662217278457e-06, "loss": 3.2012, "mean_token_accuracy": 0.4028294862248697, "step": 2359 }, { "epoch": 0.4375231738969225, "grad_norm": 8.6171875, "learning_rate": 9.562476826103078e-06, "loss": 2.4626, "mean_token_accuracy": 0.4647638292498106, "step": 2360 }, { "epoch": 0.43770856507230255, "grad_norm": 6.73046875, "learning_rate": 9.562291434927698e-06, "loss": 2.8055, "mean_token_accuracy": 0.4294911734164071, "step": 2361 }, { "epoch": 0.4378939562476826, "grad_norm": 5.05859375, "learning_rate": 9.562106043752319e-06, "loss": 2.6261, "mean_token_accuracy": 0.4549322584257082, "step": 2362 }, { "epoch": 0.43807934742306265, "grad_norm": 6.28515625, "learning_rate": 9.561920652576938e-06, "loss": 2.6777, "mean_token_accuracy": 0.4393713588944587, "step": 2363 }, { "epoch": 0.4382647385984427, "grad_norm": 4.9375, "learning_rate": 9.561735261401558e-06, "loss": 2.9382, "mean_token_accuracy": 0.43802674457804247, "step": 2364 }, { "epoch": 0.43845012977382275, "grad_norm": 6.2421875, "learning_rate": 9.561549870226177e-06, "loss": 3.0734, "mean_token_accuracy": 0.4129256428075052, "step": 2365 }, { "epoch": 0.43863552094920283, "grad_norm": 5.89453125, "learning_rate": 9.561364479050797e-06, "loss": 2.3614, "mean_token_accuracy": 0.48459586806814064, "step": 2366 }, { "epoch": 0.43882091212458285, "grad_norm": 6.14453125, "learning_rate": 9.561179087875418e-06, "loss": 3.1097, "mean_token_accuracy": 0.4179781275006668, "step": 2367 }, { "epoch": 0.43900630329996293, "grad_norm": 6.59765625, "learning_rate": 9.560993696700038e-06, "loss": 2.5272, "mean_token_accuracy": 0.4798509201024924, "step": 2368 }, { "epoch": 0.43919169447534295, "grad_norm": 6.15234375, "learning_rate": 9.560808305524659e-06, "loss": 2.8779, "mean_token_accuracy": 0.44212888746393075, "step": 2369 }, { "epoch": 0.43937708565072303, "grad_norm": 5.94140625, "learning_rate": 9.560622914349278e-06, "loss": 3.344, "mean_token_accuracy": 0.3872888396566048, "step": 2370 }, { "epoch": 0.43956247682610305, "grad_norm": 4.64453125, "learning_rate": 9.560437523173898e-06, "loss": 2.633, "mean_token_accuracy": 0.46213997132803336, "step": 2371 }, { "epoch": 0.43974786800148313, "grad_norm": 6.6328125, "learning_rate": 9.560252131998517e-06, "loss": 2.9029, "mean_token_accuracy": 0.42407795732083386, "step": 2372 }, { "epoch": 0.43993325917686316, "grad_norm": 6.07421875, "learning_rate": 9.560066740823138e-06, "loss": 2.5744, "mean_token_accuracy": 0.4665350010972131, "step": 2373 }, { "epoch": 0.44011865035224323, "grad_norm": 6.22265625, "learning_rate": 9.559881349647756e-06, "loss": 2.4988, "mean_token_accuracy": 0.4788270473618212, "step": 2374 }, { "epoch": 0.44030404152762326, "grad_norm": 7.6640625, "learning_rate": 9.559695958472377e-06, "loss": 2.77, "mean_token_accuracy": 0.4401874310915105, "step": 2375 }, { "epoch": 0.44048943270300334, "grad_norm": 7.8515625, "learning_rate": 9.559510567296997e-06, "loss": 3.1516, "mean_token_accuracy": 0.40589600109185203, "step": 2376 }, { "epoch": 0.4406748238783834, "grad_norm": 8.8203125, "learning_rate": 9.559325176121618e-06, "loss": 2.8167, "mean_token_accuracy": 0.4267715043577113, "step": 2377 }, { "epoch": 0.44086021505376344, "grad_norm": 6.8515625, "learning_rate": 9.559139784946238e-06, "loss": 3.1831, "mean_token_accuracy": 0.4064989845336666, "step": 2378 }, { "epoch": 0.4410456062291435, "grad_norm": 5.78515625, "learning_rate": 9.558954393770857e-06, "loss": 3.0379, "mean_token_accuracy": 0.4132280649486072, "step": 2379 }, { "epoch": 0.44123099740452354, "grad_norm": 4.94921875, "learning_rate": 9.558769002595478e-06, "loss": 2.871, "mean_token_accuracy": 0.4444712117562033, "step": 2380 }, { "epoch": 0.4414163885799036, "grad_norm": 5.73828125, "learning_rate": 9.558583611420096e-06, "loss": 2.5853, "mean_token_accuracy": 0.47842866988283944, "step": 2381 }, { "epoch": 0.44160177975528364, "grad_norm": 6.67578125, "learning_rate": 9.558398220244717e-06, "loss": 2.8789, "mean_token_accuracy": 0.42940461725394896, "step": 2382 }, { "epoch": 0.4417871709306637, "grad_norm": 5.6171875, "learning_rate": 9.558212829069337e-06, "loss": 3.0894, "mean_token_accuracy": 0.3994656917885264, "step": 2383 }, { "epoch": 0.44197256210604374, "grad_norm": 5.92578125, "learning_rate": 9.558027437893958e-06, "loss": 2.6715, "mean_token_accuracy": 0.46623990245389424, "step": 2384 }, { "epoch": 0.4421579532814238, "grad_norm": 7.34765625, "learning_rate": 9.557842046718577e-06, "loss": 4.0918, "mean_token_accuracy": 0.33005341579983133, "step": 2385 }, { "epoch": 0.44234334445680384, "grad_norm": 7.55859375, "learning_rate": 9.557656655543197e-06, "loss": 3.1334, "mean_token_accuracy": 0.40554048265029274, "step": 2386 }, { "epoch": 0.4425287356321839, "grad_norm": 11.5859375, "learning_rate": 9.557471264367818e-06, "loss": 2.6694, "mean_token_accuracy": 0.44031130457723255, "step": 2387 }, { "epoch": 0.44271412680756395, "grad_norm": 8.2890625, "learning_rate": 9.557285873192436e-06, "loss": 2.7522, "mean_token_accuracy": 0.4521486643437863, "step": 2388 }, { "epoch": 0.442899517982944, "grad_norm": 7.2265625, "learning_rate": 9.557100482017057e-06, "loss": 2.7545, "mean_token_accuracy": 0.4453001371452685, "step": 2389 }, { "epoch": 0.44308490915832405, "grad_norm": 5.26171875, "learning_rate": 9.556915090841676e-06, "loss": 2.7894, "mean_token_accuracy": 0.44506866416978774, "step": 2390 }, { "epoch": 0.4432703003337041, "grad_norm": 5.91015625, "learning_rate": 9.556729699666296e-06, "loss": 3.3722, "mean_token_accuracy": 0.37681856438619693, "step": 2391 }, { "epoch": 0.44345569150908415, "grad_norm": 7.51171875, "learning_rate": 9.556544308490917e-06, "loss": 2.9046, "mean_token_accuracy": 0.42397876419407166, "step": 2392 }, { "epoch": 0.4436410826844642, "grad_norm": 5.38671875, "learning_rate": 9.556358917315537e-06, "loss": 2.8799, "mean_token_accuracy": 0.420169014084507, "step": 2393 }, { "epoch": 0.44382647385984425, "grad_norm": 5.63671875, "learning_rate": 9.556173526140156e-06, "loss": 3.1938, "mean_token_accuracy": 0.39748180663047245, "step": 2394 }, { "epoch": 0.44401186503522433, "grad_norm": 5.4921875, "learning_rate": 9.555988134964776e-06, "loss": 2.8314, "mean_token_accuracy": 0.4373899924566256, "step": 2395 }, { "epoch": 0.44419725621060435, "grad_norm": 5.92578125, "learning_rate": 9.555802743789397e-06, "loss": 2.5548, "mean_token_accuracy": 0.4647790055248619, "step": 2396 }, { "epoch": 0.44438264738598443, "grad_norm": 6.48828125, "learning_rate": 9.555617352614016e-06, "loss": 2.4911, "mean_token_accuracy": 0.48619145362940847, "step": 2397 }, { "epoch": 0.44456803856136445, "grad_norm": 9.25, "learning_rate": 9.555431961438636e-06, "loss": 3.1625, "mean_token_accuracy": 0.38819238659676314, "step": 2398 }, { "epoch": 0.44475342973674453, "grad_norm": 10.421875, "learning_rate": 9.555246570263255e-06, "loss": 2.6799, "mean_token_accuracy": 0.4431665421956684, "step": 2399 }, { "epoch": 0.44493882091212456, "grad_norm": 7.46484375, "learning_rate": 9.555061179087877e-06, "loss": 2.6659, "mean_token_accuracy": 0.45439935717155483, "step": 2400 }, { "epoch": 0.44512421208750463, "grad_norm": 6.1796875, "learning_rate": 9.554875787912496e-06, "loss": 2.7457, "mean_token_accuracy": 0.45843536538213686, "step": 2401 }, { "epoch": 0.4453096032628847, "grad_norm": 6.33203125, "learning_rate": 9.554690396737117e-06, "loss": 3.0558, "mean_token_accuracy": 0.4189925681255161, "step": 2402 }, { "epoch": 0.44549499443826474, "grad_norm": 8.8046875, "learning_rate": 9.554505005561735e-06, "loss": 3.0269, "mean_token_accuracy": 0.40975212382171533, "step": 2403 }, { "epoch": 0.4456803856136448, "grad_norm": 11.3671875, "learning_rate": 9.554319614386356e-06, "loss": 2.4509, "mean_token_accuracy": 0.4916887496988677, "step": 2404 }, { "epoch": 0.44586577678902484, "grad_norm": 5.82421875, "learning_rate": 9.554134223210976e-06, "loss": 2.7319, "mean_token_accuracy": 0.45618293306080737, "step": 2405 }, { "epoch": 0.4460511679644049, "grad_norm": 6.3671875, "learning_rate": 9.553948832035595e-06, "loss": 2.8403, "mean_token_accuracy": 0.4468239039212849, "step": 2406 }, { "epoch": 0.44623655913978494, "grad_norm": 6.171875, "learning_rate": 9.553763440860216e-06, "loss": 2.9621, "mean_token_accuracy": 0.41920103092783506, "step": 2407 }, { "epoch": 0.446421950315165, "grad_norm": 6.06640625, "learning_rate": 9.553578049684836e-06, "loss": 2.7995, "mean_token_accuracy": 0.4360921285750443, "step": 2408 }, { "epoch": 0.44660734149054504, "grad_norm": 4.71484375, "learning_rate": 9.553392658509457e-06, "loss": 2.3697, "mean_token_accuracy": 0.4950892385679586, "step": 2409 }, { "epoch": 0.4467927326659251, "grad_norm": 8.390625, "learning_rate": 9.553207267334075e-06, "loss": 3.0177, "mean_token_accuracy": 0.4204398447606727, "step": 2410 }, { "epoch": 0.44697812384130514, "grad_norm": 5.50390625, "learning_rate": 9.553021876158696e-06, "loss": 2.6889, "mean_token_accuracy": 0.45392249527410206, "step": 2411 }, { "epoch": 0.4471635150166852, "grad_norm": 7.3046875, "learning_rate": 9.552836484983316e-06, "loss": 2.1962, "mean_token_accuracy": 0.5082932049224184, "step": 2412 }, { "epoch": 0.44734890619206524, "grad_norm": 5.8203125, "learning_rate": 9.552651093807935e-06, "loss": 3.007, "mean_token_accuracy": 0.4300649901102006, "step": 2413 }, { "epoch": 0.4475342973674453, "grad_norm": 5.09375, "learning_rate": 9.552465702632556e-06, "loss": 3.198, "mean_token_accuracy": 0.40052682656314986, "step": 2414 }, { "epoch": 0.44771968854282534, "grad_norm": 6.99609375, "learning_rate": 9.552280311457174e-06, "loss": 3.1008, "mean_token_accuracy": 0.411509900990099, "step": 2415 }, { "epoch": 0.4479050797182054, "grad_norm": 6.76171875, "learning_rate": 9.552094920281797e-06, "loss": 2.7771, "mean_token_accuracy": 0.4511797679572416, "step": 2416 }, { "epoch": 0.44809047089358545, "grad_norm": 7.44140625, "learning_rate": 9.551909529106415e-06, "loss": 2.5745, "mean_token_accuracy": 0.45569935291189645, "step": 2417 }, { "epoch": 0.4482758620689655, "grad_norm": 5.48046875, "learning_rate": 9.551724137931036e-06, "loss": 2.9116, "mean_token_accuracy": 0.4231526447040294, "step": 2418 }, { "epoch": 0.44846125324434555, "grad_norm": 5.53515625, "learning_rate": 9.551538746755655e-06, "loss": 2.9503, "mean_token_accuracy": 0.4347772277227723, "step": 2419 }, { "epoch": 0.4486466444197256, "grad_norm": 10.3984375, "learning_rate": 9.551353355580275e-06, "loss": 3.1357, "mean_token_accuracy": 0.4181628392484342, "step": 2420 }, { "epoch": 0.44883203559510565, "grad_norm": 11.2421875, "learning_rate": 9.551167964404896e-06, "loss": 2.8802, "mean_token_accuracy": 0.4169553026554511, "step": 2421 }, { "epoch": 0.44901742677048573, "grad_norm": 6.265625, "learning_rate": 9.550982573229514e-06, "loss": 2.4579, "mean_token_accuracy": 0.4744744744744745, "step": 2422 }, { "epoch": 0.44920281794586575, "grad_norm": 9.625, "learning_rate": 9.550797182054135e-06, "loss": 2.4746, "mean_token_accuracy": 0.48040533553924103, "step": 2423 }, { "epoch": 0.44938820912124583, "grad_norm": 6.62109375, "learning_rate": 9.550611790878755e-06, "loss": 2.922, "mean_token_accuracy": 0.4366966215447617, "step": 2424 }, { "epoch": 0.4495736002966259, "grad_norm": 6.5546875, "learning_rate": 9.550426399703376e-06, "loss": 2.6024, "mean_token_accuracy": 0.4517479074347612, "step": 2425 }, { "epoch": 0.44975899147200593, "grad_norm": 6.3515625, "learning_rate": 9.550241008527995e-06, "loss": 2.8228, "mean_token_accuracy": 0.4199166053470689, "step": 2426 }, { "epoch": 0.449944382647386, "grad_norm": 6.91796875, "learning_rate": 9.550055617352615e-06, "loss": 2.7367, "mean_token_accuracy": 0.4427722772277228, "step": 2427 }, { "epoch": 0.45012977382276603, "grad_norm": 9.546875, "learning_rate": 9.549870226177234e-06, "loss": 2.388, "mean_token_accuracy": 0.4941823566744235, "step": 2428 }, { "epoch": 0.4503151649981461, "grad_norm": 6.58984375, "learning_rate": 9.549684835001855e-06, "loss": 2.5457, "mean_token_accuracy": 0.4641255605381166, "step": 2429 }, { "epoch": 0.45050055617352613, "grad_norm": 7.33984375, "learning_rate": 9.549499443826475e-06, "loss": 2.837, "mean_token_accuracy": 0.4302994011976048, "step": 2430 }, { "epoch": 0.4506859473489062, "grad_norm": 6.92578125, "learning_rate": 9.549314052651094e-06, "loss": 3.2055, "mean_token_accuracy": 0.39765851091817944, "step": 2431 }, { "epoch": 0.45087133852428624, "grad_norm": 8.984375, "learning_rate": 9.549128661475714e-06, "loss": 2.5708, "mean_token_accuracy": 0.4770676226974904, "step": 2432 }, { "epoch": 0.4510567296996663, "grad_norm": 5.28125, "learning_rate": 9.548943270300335e-06, "loss": 3.1242, "mean_token_accuracy": 0.40918282383727844, "step": 2433 }, { "epoch": 0.45124212087504634, "grad_norm": 6.37109375, "learning_rate": 9.548757879124955e-06, "loss": 2.972, "mean_token_accuracy": 0.4305274971941639, "step": 2434 }, { "epoch": 0.4514275120504264, "grad_norm": 6.40625, "learning_rate": 9.548572487949574e-06, "loss": 2.81, "mean_token_accuracy": 0.4394027254276602, "step": 2435 }, { "epoch": 0.45161290322580644, "grad_norm": 5.71875, "learning_rate": 9.548387096774195e-06, "loss": 3.103, "mean_token_accuracy": 0.44838590892353014, "step": 2436 }, { "epoch": 0.4517982944011865, "grad_norm": 6.60546875, "learning_rate": 9.548201705598813e-06, "loss": 2.5761, "mean_token_accuracy": 0.46043364176218754, "step": 2437 }, { "epoch": 0.45198368557656654, "grad_norm": 7.34765625, "learning_rate": 9.548016314423434e-06, "loss": 2.6479, "mean_token_accuracy": 0.46520433476955214, "step": 2438 }, { "epoch": 0.4521690767519466, "grad_norm": 5.79296875, "learning_rate": 9.547830923248054e-06, "loss": 3.2091, "mean_token_accuracy": 0.40176177709689775, "step": 2439 }, { "epoch": 0.45235446792732664, "grad_norm": 6.54296875, "learning_rate": 9.547645532072675e-06, "loss": 3.4627, "mean_token_accuracy": 0.3813646670789528, "step": 2440 }, { "epoch": 0.4525398591027067, "grad_norm": 4.88671875, "learning_rate": 9.547460140897294e-06, "loss": 2.8834, "mean_token_accuracy": 0.4404322614654718, "step": 2441 }, { "epoch": 0.45272525027808674, "grad_norm": 6.4453125, "learning_rate": 9.547274749721914e-06, "loss": 2.9165, "mean_token_accuracy": 0.4363368134362233, "step": 2442 }, { "epoch": 0.4529106414534668, "grad_norm": 7.5, "learning_rate": 9.547089358546535e-06, "loss": 2.6656, "mean_token_accuracy": 0.4488610478359909, "step": 2443 }, { "epoch": 0.45309603262884685, "grad_norm": 6.4453125, "learning_rate": 9.546903967371153e-06, "loss": 2.9692, "mean_token_accuracy": 0.4321608040201005, "step": 2444 }, { "epoch": 0.4532814238042269, "grad_norm": 5.8515625, "learning_rate": 9.546718576195774e-06, "loss": 2.8917, "mean_token_accuracy": 0.4230594643667726, "step": 2445 }, { "epoch": 0.45346681497960695, "grad_norm": 7.15625, "learning_rate": 9.546533185020393e-06, "loss": 2.5575, "mean_token_accuracy": 0.46420598303318944, "step": 2446 }, { "epoch": 0.453652206154987, "grad_norm": 5.75, "learning_rate": 9.546347793845013e-06, "loss": 2.8305, "mean_token_accuracy": 0.43079412496888225, "step": 2447 }, { "epoch": 0.45383759733036705, "grad_norm": 5.39453125, "learning_rate": 9.546162402669634e-06, "loss": 2.5969, "mean_token_accuracy": 0.4633488600147095, "step": 2448 }, { "epoch": 0.4540229885057471, "grad_norm": 6.09765625, "learning_rate": 9.545977011494254e-06, "loss": 2.3082, "mean_token_accuracy": 0.4954656686339427, "step": 2449 }, { "epoch": 0.4542083796811272, "grad_norm": 6.16796875, "learning_rate": 9.545791620318875e-06, "loss": 3.1047, "mean_token_accuracy": 0.3958333333333333, "step": 2450 }, { "epoch": 0.45439377085650723, "grad_norm": 6.1484375, "learning_rate": 9.545606229143493e-06, "loss": 2.9332, "mean_token_accuracy": 0.43058846761453395, "step": 2451 }, { "epoch": 0.4545791620318873, "grad_norm": 8.9921875, "learning_rate": 9.545420837968114e-06, "loss": 2.7718, "mean_token_accuracy": 0.44430459408432976, "step": 2452 }, { "epoch": 0.45476455320726733, "grad_norm": 5.671875, "learning_rate": 9.545235446792733e-06, "loss": 2.9261, "mean_token_accuracy": 0.4274848746758859, "step": 2453 }, { "epoch": 0.4549499443826474, "grad_norm": 9.6640625, "learning_rate": 9.545050055617353e-06, "loss": 2.756, "mean_token_accuracy": 0.4496743271475974, "step": 2454 }, { "epoch": 0.45513533555802743, "grad_norm": 5.26953125, "learning_rate": 9.544864664441972e-06, "loss": 3.2629, "mean_token_accuracy": 0.38518107395587625, "step": 2455 }, { "epoch": 0.4553207267334075, "grad_norm": 7.94140625, "learning_rate": 9.544679273266594e-06, "loss": 2.7715, "mean_token_accuracy": 0.4507629816664837, "step": 2456 }, { "epoch": 0.45550611790878753, "grad_norm": 5.82421875, "learning_rate": 9.544493882091213e-06, "loss": 2.6489, "mean_token_accuracy": 0.4534412955465587, "step": 2457 }, { "epoch": 0.4556915090841676, "grad_norm": 6.69921875, "learning_rate": 9.544308490915834e-06, "loss": 2.5872, "mean_token_accuracy": 0.46634486457495306, "step": 2458 }, { "epoch": 0.45587690025954763, "grad_norm": 8.296875, "learning_rate": 9.544123099740454e-06, "loss": 2.778, "mean_token_accuracy": 0.4428692340245451, "step": 2459 }, { "epoch": 0.4560622914349277, "grad_norm": 7.578125, "learning_rate": 9.543937708565073e-06, "loss": 2.6885, "mean_token_accuracy": 0.45277008310249306, "step": 2460 }, { "epoch": 0.45624768261030774, "grad_norm": 7.8515625, "learning_rate": 9.543752317389693e-06, "loss": 2.6234, "mean_token_accuracy": 0.46833013435700577, "step": 2461 }, { "epoch": 0.4564330737856878, "grad_norm": 6.19140625, "learning_rate": 9.543566926214312e-06, "loss": 2.5895, "mean_token_accuracy": 0.473430458023268, "step": 2462 }, { "epoch": 0.45661846496106784, "grad_norm": 6.98828125, "learning_rate": 9.543381535038933e-06, "loss": 3.2186, "mean_token_accuracy": 0.4083129584352078, "step": 2463 }, { "epoch": 0.4568038561364479, "grad_norm": 7.05859375, "learning_rate": 9.543196143863553e-06, "loss": 3.2707, "mean_token_accuracy": 0.39071450105517036, "step": 2464 }, { "epoch": 0.45698924731182794, "grad_norm": 9.28125, "learning_rate": 9.543010752688174e-06, "loss": 2.7024, "mean_token_accuracy": 0.45409096451652237, "step": 2465 }, { "epoch": 0.457174638487208, "grad_norm": 7.96484375, "learning_rate": 9.542825361512792e-06, "loss": 2.4721, "mean_token_accuracy": 0.46788790753462, "step": 2466 }, { "epoch": 0.45736002966258804, "grad_norm": 8.09375, "learning_rate": 9.542639970337413e-06, "loss": 3.2281, "mean_token_accuracy": 0.3880880545312325, "step": 2467 }, { "epoch": 0.4575454208379681, "grad_norm": 7.71484375, "learning_rate": 9.542454579162033e-06, "loss": 2.6807, "mean_token_accuracy": 0.45368811047490737, "step": 2468 }, { "epoch": 0.45773081201334814, "grad_norm": 7.4921875, "learning_rate": 9.542269187986652e-06, "loss": 2.9751, "mean_token_accuracy": 0.4145720649061406, "step": 2469 }, { "epoch": 0.4579162031887282, "grad_norm": 5.96875, "learning_rate": 9.542083796811273e-06, "loss": 3.0603, "mean_token_accuracy": 0.40944977529454635, "step": 2470 }, { "epoch": 0.45810159436410824, "grad_norm": 7.0078125, "learning_rate": 9.541898405635891e-06, "loss": 3.3703, "mean_token_accuracy": 0.37236731255265376, "step": 2471 }, { "epoch": 0.4582869855394883, "grad_norm": 6.28125, "learning_rate": 9.541713014460514e-06, "loss": 2.7076, "mean_token_accuracy": 0.46057263553156536, "step": 2472 }, { "epoch": 0.45847237671486835, "grad_norm": 6.23046875, "learning_rate": 9.541527623285132e-06, "loss": 3.3086, "mean_token_accuracy": 0.38600031041440325, "step": 2473 }, { "epoch": 0.4586577678902484, "grad_norm": 6.66796875, "learning_rate": 9.541342232109753e-06, "loss": 3.055, "mean_token_accuracy": 0.42121102684482964, "step": 2474 }, { "epoch": 0.4588431590656285, "grad_norm": 7.5625, "learning_rate": 9.541156840934372e-06, "loss": 3.066, "mean_token_accuracy": 0.4144700713893465, "step": 2475 }, { "epoch": 0.4590285502410085, "grad_norm": 7.6640625, "learning_rate": 9.540971449758992e-06, "loss": 2.2571, "mean_token_accuracy": 0.5086698043101313, "step": 2476 }, { "epoch": 0.4592139414163886, "grad_norm": 6.546875, "learning_rate": 9.540786058583613e-06, "loss": 2.9718, "mean_token_accuracy": 0.40262197594269494, "step": 2477 }, { "epoch": 0.4593993325917686, "grad_norm": 9.4375, "learning_rate": 9.540600667408232e-06, "loss": 3.2434, "mean_token_accuracy": 0.39374124241008873, "step": 2478 }, { "epoch": 0.4595847237671487, "grad_norm": 5.97265625, "learning_rate": 9.540415276232852e-06, "loss": 2.9946, "mean_token_accuracy": 0.43155586334256696, "step": 2479 }, { "epoch": 0.45977011494252873, "grad_norm": 6.875, "learning_rate": 9.54022988505747e-06, "loss": 2.8389, "mean_token_accuracy": 0.44116073816528484, "step": 2480 }, { "epoch": 0.4599555061179088, "grad_norm": 5.921875, "learning_rate": 9.540044493882093e-06, "loss": 3.0083, "mean_token_accuracy": 0.420907418761496, "step": 2481 }, { "epoch": 0.46014089729328883, "grad_norm": 5.1875, "learning_rate": 9.539859102706712e-06, "loss": 2.8483, "mean_token_accuracy": 0.42932015573525006, "step": 2482 }, { "epoch": 0.4603262884686689, "grad_norm": 5.86328125, "learning_rate": 9.539673711531332e-06, "loss": 2.5072, "mean_token_accuracy": 0.46636389440817483, "step": 2483 }, { "epoch": 0.46051167964404893, "grad_norm": 9.84375, "learning_rate": 9.539488320355951e-06, "loss": 2.5965, "mean_token_accuracy": 0.4535322078347812, "step": 2484 }, { "epoch": 0.460697070819429, "grad_norm": 6.734375, "learning_rate": 9.539302929180572e-06, "loss": 2.7789, "mean_token_accuracy": 0.4339500582222797, "step": 2485 }, { "epoch": 0.46088246199480903, "grad_norm": 6.59375, "learning_rate": 9.539117538005192e-06, "loss": 3.76, "mean_token_accuracy": 0.3526640493550196, "step": 2486 }, { "epoch": 0.4610678531701891, "grad_norm": 6.25390625, "learning_rate": 9.538932146829811e-06, "loss": 2.9707, "mean_token_accuracy": 0.4208, "step": 2487 }, { "epoch": 0.46125324434556914, "grad_norm": 8.21875, "learning_rate": 9.538746755654431e-06, "loss": 2.8035, "mean_token_accuracy": 0.43923737447242034, "step": 2488 }, { "epoch": 0.4614386355209492, "grad_norm": 8.3671875, "learning_rate": 9.538561364479052e-06, "loss": 2.4987, "mean_token_accuracy": 0.4725289523296526, "step": 2489 }, { "epoch": 0.46162402669632924, "grad_norm": 5.30859375, "learning_rate": 9.538375973303672e-06, "loss": 3.0194, "mean_token_accuracy": 0.4134504275479547, "step": 2490 }, { "epoch": 0.4618094178717093, "grad_norm": 7.19921875, "learning_rate": 9.538190582128291e-06, "loss": 2.3885, "mean_token_accuracy": 0.4854383582314773, "step": 2491 }, { "epoch": 0.46199480904708934, "grad_norm": 17.171875, "learning_rate": 9.538005190952912e-06, "loss": 2.308, "mean_token_accuracy": 0.5088640190155491, "step": 2492 }, { "epoch": 0.4621802002224694, "grad_norm": 5.63671875, "learning_rate": 9.53781979977753e-06, "loss": 2.9352, "mean_token_accuracy": 0.42948717948717946, "step": 2493 }, { "epoch": 0.46236559139784944, "grad_norm": 6.2734375, "learning_rate": 9.537634408602151e-06, "loss": 2.7943, "mean_token_accuracy": 0.43605045436050455, "step": 2494 }, { "epoch": 0.4625509825732295, "grad_norm": 8.21875, "learning_rate": 9.537449017426771e-06, "loss": 2.868, "mean_token_accuracy": 0.42641439090315963, "step": 2495 }, { "epoch": 0.46273637374860954, "grad_norm": 6.87109375, "learning_rate": 9.53726362625139e-06, "loss": 2.3044, "mean_token_accuracy": 0.5143839899937461, "step": 2496 }, { "epoch": 0.4629217649239896, "grad_norm": 7.29296875, "learning_rate": 9.537078235076012e-06, "loss": 2.463, "mean_token_accuracy": 0.49532119372787053, "step": 2497 }, { "epoch": 0.46310715609936964, "grad_norm": 7.12890625, "learning_rate": 9.536892843900631e-06, "loss": 2.5569, "mean_token_accuracy": 0.47036823935558114, "step": 2498 }, { "epoch": 0.4632925472747497, "grad_norm": 6.7265625, "learning_rate": 9.536707452725252e-06, "loss": 3.2376, "mean_token_accuracy": 0.41187021033751836, "step": 2499 }, { "epoch": 0.4634779384501298, "grad_norm": 7.58984375, "learning_rate": 9.53652206154987e-06, "loss": 2.7457, "mean_token_accuracy": 0.45398856520555403, "step": 2500 }, { "epoch": 0.4636633296255098, "grad_norm": 4.49609375, "learning_rate": 9.536336670374491e-06, "loss": 2.5008, "mean_token_accuracy": 0.49551397077672393, "step": 2501 }, { "epoch": 0.4638487208008899, "grad_norm": 5.359375, "learning_rate": 9.536151279199111e-06, "loss": 2.8365, "mean_token_accuracy": 0.426619294889314, "step": 2502 }, { "epoch": 0.4640341119762699, "grad_norm": 7.375, "learning_rate": 9.53596588802373e-06, "loss": 1.9702, "mean_token_accuracy": 0.552683615819209, "step": 2503 }, { "epoch": 0.46421950315165, "grad_norm": 6.2421875, "learning_rate": 9.53578049684835e-06, "loss": 3.0862, "mean_token_accuracy": 0.4169428609306642, "step": 2504 }, { "epoch": 0.46440489432703, "grad_norm": 6.49609375, "learning_rate": 9.535595105672971e-06, "loss": 2.8983, "mean_token_accuracy": 0.41797346200241253, "step": 2505 }, { "epoch": 0.4645902855024101, "grad_norm": 9.9453125, "learning_rate": 9.535409714497592e-06, "loss": 2.4401, "mean_token_accuracy": 0.4834067928441794, "step": 2506 }, { "epoch": 0.46477567667779013, "grad_norm": 6.71875, "learning_rate": 9.53522432332221e-06, "loss": 2.7282, "mean_token_accuracy": 0.44796316964285715, "step": 2507 }, { "epoch": 0.4649610678531702, "grad_norm": 6.34375, "learning_rate": 9.535038932146831e-06, "loss": 2.747, "mean_token_accuracy": 0.436739785840524, "step": 2508 }, { "epoch": 0.46514645902855023, "grad_norm": 6.4921875, "learning_rate": 9.53485354097145e-06, "loss": 2.8133, "mean_token_accuracy": 0.4511774411279436, "step": 2509 }, { "epoch": 0.4653318502039303, "grad_norm": 6.75390625, "learning_rate": 9.53466814979607e-06, "loss": 2.789, "mean_token_accuracy": 0.43570419218087614, "step": 2510 }, { "epoch": 0.46551724137931033, "grad_norm": 5.8671875, "learning_rate": 9.53448275862069e-06, "loss": 3.1694, "mean_token_accuracy": 0.40257203320853, "step": 2511 }, { "epoch": 0.4657026325546904, "grad_norm": 6.3828125, "learning_rate": 9.53429736744531e-06, "loss": 3.3972, "mean_token_accuracy": 0.39360568383658967, "step": 2512 }, { "epoch": 0.46588802373007043, "grad_norm": 6.6796875, "learning_rate": 9.53411197626993e-06, "loss": 2.8469, "mean_token_accuracy": 0.43317972350230416, "step": 2513 }, { "epoch": 0.4660734149054505, "grad_norm": 5.5234375, "learning_rate": 9.53392658509455e-06, "loss": 3.1231, "mean_token_accuracy": 0.3945369916707496, "step": 2514 }, { "epoch": 0.46625880608083053, "grad_norm": 6.4375, "learning_rate": 9.533741193919171e-06, "loss": 2.9609, "mean_token_accuracy": 0.413142306102769, "step": 2515 }, { "epoch": 0.4664441972562106, "grad_norm": 9.6875, "learning_rate": 9.53355580274379e-06, "loss": 2.7374, "mean_token_accuracy": 0.4649122807017544, "step": 2516 }, { "epoch": 0.46662958843159064, "grad_norm": 6.59375, "learning_rate": 9.53337041156841e-06, "loss": 2.9772, "mean_token_accuracy": 0.43154246100519933, "step": 2517 }, { "epoch": 0.4668149796069707, "grad_norm": 5.12890625, "learning_rate": 9.533185020393029e-06, "loss": 2.6205, "mean_token_accuracy": 0.4678420310296192, "step": 2518 }, { "epoch": 0.46700037078235074, "grad_norm": 7.30859375, "learning_rate": 9.53299962921765e-06, "loss": 2.3635, "mean_token_accuracy": 0.497495183044316, "step": 2519 }, { "epoch": 0.4671857619577308, "grad_norm": 5.89453125, "learning_rate": 9.53281423804227e-06, "loss": 2.9411, "mean_token_accuracy": 0.43390482294504085, "step": 2520 }, { "epoch": 0.46737115313311084, "grad_norm": 5.62109375, "learning_rate": 9.53262884686689e-06, "loss": 2.9649, "mean_token_accuracy": 0.41629432455556614, "step": 2521 }, { "epoch": 0.4675565443084909, "grad_norm": 5.3359375, "learning_rate": 9.53244345569151e-06, "loss": 2.3771, "mean_token_accuracy": 0.494625, "step": 2522 }, { "epoch": 0.46774193548387094, "grad_norm": 8.609375, "learning_rate": 9.53225806451613e-06, "loss": 3.0223, "mean_token_accuracy": 0.424265605875153, "step": 2523 }, { "epoch": 0.467927326659251, "grad_norm": 7.71875, "learning_rate": 9.53207267334075e-06, "loss": 2.8401, "mean_token_accuracy": 0.4319723537693588, "step": 2524 }, { "epoch": 0.4681127178346311, "grad_norm": 6.328125, "learning_rate": 9.53188728216537e-06, "loss": 2.7406, "mean_token_accuracy": 0.4533976450560363, "step": 2525 }, { "epoch": 0.4682981090100111, "grad_norm": 6.8203125, "learning_rate": 9.53170189098999e-06, "loss": 2.9489, "mean_token_accuracy": 0.4349461668180049, "step": 2526 }, { "epoch": 0.4684835001853912, "grad_norm": 7.328125, "learning_rate": 9.531516499814608e-06, "loss": 2.8208, "mean_token_accuracy": 0.42471204188481676, "step": 2527 }, { "epoch": 0.4686688913607712, "grad_norm": 5.5078125, "learning_rate": 9.531331108639229e-06, "loss": 2.9949, "mean_token_accuracy": 0.4132737504137703, "step": 2528 }, { "epoch": 0.4688542825361513, "grad_norm": 6.22265625, "learning_rate": 9.53114571746385e-06, "loss": 2.8006, "mean_token_accuracy": 0.4358610914245216, "step": 2529 }, { "epoch": 0.4690396737115313, "grad_norm": 5.265625, "learning_rate": 9.53096032628847e-06, "loss": 3.2051, "mean_token_accuracy": 0.4030714210581677, "step": 2530 }, { "epoch": 0.4692250648869114, "grad_norm": 5.21484375, "learning_rate": 9.53077493511309e-06, "loss": 2.8869, "mean_token_accuracy": 0.44106123362812044, "step": 2531 }, { "epoch": 0.4694104560622914, "grad_norm": 5.37890625, "learning_rate": 9.53058954393771e-06, "loss": 2.2242, "mean_token_accuracy": 0.5402336860670194, "step": 2532 }, { "epoch": 0.4695958472376715, "grad_norm": 5.4765625, "learning_rate": 9.53040415276233e-06, "loss": 2.8237, "mean_token_accuracy": 0.43513177242393397, "step": 2533 }, { "epoch": 0.4697812384130515, "grad_norm": 4.8984375, "learning_rate": 9.530218761586949e-06, "loss": 2.9234, "mean_token_accuracy": 0.423139598044541, "step": 2534 }, { "epoch": 0.4699666295884316, "grad_norm": 5.37890625, "learning_rate": 9.530033370411569e-06, "loss": 3.3577, "mean_token_accuracy": 0.3962329961632368, "step": 2535 }, { "epoch": 0.47015202076381163, "grad_norm": 6.08203125, "learning_rate": 9.529847979236188e-06, "loss": 3.0031, "mean_token_accuracy": 0.4164496527777778, "step": 2536 }, { "epoch": 0.4703374119391917, "grad_norm": 7.65625, "learning_rate": 9.52966258806081e-06, "loss": 2.822, "mean_token_accuracy": 0.40602721970187944, "step": 2537 }, { "epoch": 0.47052280311457173, "grad_norm": 5.77734375, "learning_rate": 9.529477196885429e-06, "loss": 3.0819, "mean_token_accuracy": 0.42841490138787436, "step": 2538 }, { "epoch": 0.4707081942899518, "grad_norm": 6.7109375, "learning_rate": 9.52929180571005e-06, "loss": 2.4889, "mean_token_accuracy": 0.46345205479452056, "step": 2539 }, { "epoch": 0.47089358546533183, "grad_norm": 9.0859375, "learning_rate": 9.52910641453467e-06, "loss": 2.8023, "mean_token_accuracy": 0.4428508707741841, "step": 2540 }, { "epoch": 0.4710789766407119, "grad_norm": 6.16796875, "learning_rate": 9.528921023359289e-06, "loss": 3.001, "mean_token_accuracy": 0.4205665024630542, "step": 2541 }, { "epoch": 0.47126436781609193, "grad_norm": 8.0078125, "learning_rate": 9.528735632183909e-06, "loss": 3.2606, "mean_token_accuracy": 0.40451467268623026, "step": 2542 }, { "epoch": 0.471449758991472, "grad_norm": 9.1875, "learning_rate": 9.528550241008528e-06, "loss": 2.4531, "mean_token_accuracy": 0.467595818815331, "step": 2543 }, { "epoch": 0.47163515016685204, "grad_norm": 5.53125, "learning_rate": 9.528364849833148e-06, "loss": 2.7229, "mean_token_accuracy": 0.4499524865378524, "step": 2544 }, { "epoch": 0.4718205413422321, "grad_norm": 5.14453125, "learning_rate": 9.528179458657769e-06, "loss": 2.6169, "mean_token_accuracy": 0.4622204578536456, "step": 2545 }, { "epoch": 0.47200593251761214, "grad_norm": 7.33984375, "learning_rate": 9.52799406748239e-06, "loss": 3.1118, "mean_token_accuracy": 0.40165752305357766, "step": 2546 }, { "epoch": 0.4721913236929922, "grad_norm": 7.00390625, "learning_rate": 9.527808676307008e-06, "loss": 2.9195, "mean_token_accuracy": 0.4091026635634859, "step": 2547 }, { "epoch": 0.47237671486837224, "grad_norm": 5.65234375, "learning_rate": 9.527623285131629e-06, "loss": 2.8457, "mean_token_accuracy": 0.442680262199563, "step": 2548 }, { "epoch": 0.4725621060437523, "grad_norm": 6.60546875, "learning_rate": 9.527437893956249e-06, "loss": 2.8663, "mean_token_accuracy": 0.4360231832367365, "step": 2549 }, { "epoch": 0.4727474972191324, "grad_norm": 5.76953125, "learning_rate": 9.527252502780868e-06, "loss": 2.7839, "mean_token_accuracy": 0.45588972431077696, "step": 2550 }, { "epoch": 0.4729328883945124, "grad_norm": 5.3671875, "learning_rate": 9.527067111605488e-06, "loss": 3.3261, "mean_token_accuracy": 0.3899767218951116, "step": 2551 }, { "epoch": 0.4731182795698925, "grad_norm": 7.52734375, "learning_rate": 9.526881720430107e-06, "loss": 2.8318, "mean_token_accuracy": 0.43180246913580245, "step": 2552 }, { "epoch": 0.4733036707452725, "grad_norm": 5.60546875, "learning_rate": 9.52669632925473e-06, "loss": 3.0814, "mean_token_accuracy": 0.4179741433412828, "step": 2553 }, { "epoch": 0.4734890619206526, "grad_norm": 5.6015625, "learning_rate": 9.526510938079348e-06, "loss": 2.7128, "mean_token_accuracy": 0.45361599759434673, "step": 2554 }, { "epoch": 0.4736744530960326, "grad_norm": 5.703125, "learning_rate": 9.526325546903969e-06, "loss": 3.2983, "mean_token_accuracy": 0.3863498789346247, "step": 2555 }, { "epoch": 0.4738598442714127, "grad_norm": 5.09765625, "learning_rate": 9.526140155728587e-06, "loss": 2.8733, "mean_token_accuracy": 0.43207261724659607, "step": 2556 }, { "epoch": 0.4740452354467927, "grad_norm": 5.56640625, "learning_rate": 9.525954764553208e-06, "loss": 3.0442, "mean_token_accuracy": 0.42383275261324044, "step": 2557 }, { "epoch": 0.4742306266221728, "grad_norm": 6.015625, "learning_rate": 9.525769373377828e-06, "loss": 2.6299, "mean_token_accuracy": 0.4565102793885082, "step": 2558 }, { "epoch": 0.4744160177975528, "grad_norm": 4.765625, "learning_rate": 9.525583982202447e-06, "loss": 2.909, "mean_token_accuracy": 0.4377313432835821, "step": 2559 }, { "epoch": 0.4746014089729329, "grad_norm": 5.94140625, "learning_rate": 9.525398591027068e-06, "loss": 2.9877, "mean_token_accuracy": 0.4233587128286166, "step": 2560 }, { "epoch": 0.4747868001483129, "grad_norm": 9.765625, "learning_rate": 9.525213199851688e-06, "loss": 2.5516, "mean_token_accuracy": 0.4771517525359044, "step": 2561 }, { "epoch": 0.474972191323693, "grad_norm": 9.1875, "learning_rate": 9.525027808676309e-06, "loss": 3.4366, "mean_token_accuracy": 0.3910933485583785, "step": 2562 }, { "epoch": 0.47515758249907303, "grad_norm": 7.6484375, "learning_rate": 9.524842417500928e-06, "loss": 2.6874, "mean_token_accuracy": 0.46093085429578323, "step": 2563 }, { "epoch": 0.4753429736744531, "grad_norm": 6.0078125, "learning_rate": 9.524657026325548e-06, "loss": 2.7747, "mean_token_accuracy": 0.4500293083235639, "step": 2564 }, { "epoch": 0.47552836484983313, "grad_norm": 15.1484375, "learning_rate": 9.524471635150167e-06, "loss": 2.1854, "mean_token_accuracy": 0.5015339305436249, "step": 2565 }, { "epoch": 0.4757137560252132, "grad_norm": 6.10546875, "learning_rate": 9.524286243974787e-06, "loss": 2.9815, "mean_token_accuracy": 0.40945397407358913, "step": 2566 }, { "epoch": 0.47589914720059323, "grad_norm": 4.8828125, "learning_rate": 9.524100852799408e-06, "loss": 2.9804, "mean_token_accuracy": 0.40863838260070456, "step": 2567 }, { "epoch": 0.4760845383759733, "grad_norm": 5.58203125, "learning_rate": 9.523915461624027e-06, "loss": 2.7612, "mean_token_accuracy": 0.4418451833355237, "step": 2568 }, { "epoch": 0.47626992955135333, "grad_norm": 7.234375, "learning_rate": 9.523730070448649e-06, "loss": 2.4868, "mean_token_accuracy": 0.4754227912122649, "step": 2569 }, { "epoch": 0.4764553207267334, "grad_norm": 11.84375, "learning_rate": 9.523544679273268e-06, "loss": 2.5776, "mean_token_accuracy": 0.44609753217617193, "step": 2570 }, { "epoch": 0.47664071190211343, "grad_norm": 7.37109375, "learning_rate": 9.523359288097888e-06, "loss": 2.9286, "mean_token_accuracy": 0.4316546762589928, "step": 2571 }, { "epoch": 0.4768261030774935, "grad_norm": 9.1171875, "learning_rate": 9.523173896922507e-06, "loss": 2.6578, "mean_token_accuracy": 0.4539438856537851, "step": 2572 }, { "epoch": 0.47701149425287354, "grad_norm": 6.38671875, "learning_rate": 9.522988505747127e-06, "loss": 2.7293, "mean_token_accuracy": 0.45932469012679866, "step": 2573 }, { "epoch": 0.4771968854282536, "grad_norm": 7.27734375, "learning_rate": 9.522803114571746e-06, "loss": 2.2807, "mean_token_accuracy": 0.5143958559209734, "step": 2574 }, { "epoch": 0.4773822766036337, "grad_norm": 9.8359375, "learning_rate": 9.522617723396367e-06, "loss": 2.2991, "mean_token_accuracy": 0.48370237931508736, "step": 2575 }, { "epoch": 0.4775676677790137, "grad_norm": 6.01171875, "learning_rate": 9.522432332220987e-06, "loss": 2.7155, "mean_token_accuracy": 0.46001386001386, "step": 2576 }, { "epoch": 0.4777530589543938, "grad_norm": 8.609375, "learning_rate": 9.522246941045608e-06, "loss": 2.3429, "mean_token_accuracy": 0.49522673031026254, "step": 2577 }, { "epoch": 0.4779384501297738, "grad_norm": 8.359375, "learning_rate": 9.522061549870228e-06, "loss": 2.4308, "mean_token_accuracy": 0.47777003484320557, "step": 2578 }, { "epoch": 0.4781238413051539, "grad_norm": 7.51953125, "learning_rate": 9.521876158694847e-06, "loss": 2.6773, "mean_token_accuracy": 0.45350270167788415, "step": 2579 }, { "epoch": 0.4783092324805339, "grad_norm": 7.32421875, "learning_rate": 9.521690767519467e-06, "loss": 3.1179, "mean_token_accuracy": 0.41104594330400784, "step": 2580 }, { "epoch": 0.478494623655914, "grad_norm": 6.58203125, "learning_rate": 9.521505376344086e-06, "loss": 2.5288, "mean_token_accuracy": 0.4715392838054516, "step": 2581 }, { "epoch": 0.478680014831294, "grad_norm": 7.203125, "learning_rate": 9.521319985168707e-06, "loss": 2.8641, "mean_token_accuracy": 0.43681248401125605, "step": 2582 }, { "epoch": 0.4788654060066741, "grad_norm": 5.4765625, "learning_rate": 9.521134593993327e-06, "loss": 2.7677, "mean_token_accuracy": 0.44071315178898524, "step": 2583 }, { "epoch": 0.4790507971820541, "grad_norm": 6.39453125, "learning_rate": 9.520949202817946e-06, "loss": 2.9314, "mean_token_accuracy": 0.42054263565891475, "step": 2584 }, { "epoch": 0.4792361883574342, "grad_norm": 5.1640625, "learning_rate": 9.520763811642566e-06, "loss": 3.0261, "mean_token_accuracy": 0.420222503872694, "step": 2585 }, { "epoch": 0.4794215795328142, "grad_norm": 6.7421875, "learning_rate": 9.520578420467187e-06, "loss": 3.0652, "mean_token_accuracy": 0.4460624071322437, "step": 2586 }, { "epoch": 0.4796069707081943, "grad_norm": 4.5078125, "learning_rate": 9.520393029291807e-06, "loss": 2.6088, "mean_token_accuracy": 0.46253943217665616, "step": 2587 }, { "epoch": 0.4797923618835743, "grad_norm": 5.88671875, "learning_rate": 9.520207638116426e-06, "loss": 3.6518, "mean_token_accuracy": 0.374085033483881, "step": 2588 }, { "epoch": 0.4799777530589544, "grad_norm": 6.9296875, "learning_rate": 9.520022246941047e-06, "loss": 3.0874, "mean_token_accuracy": 0.41317710334229735, "step": 2589 }, { "epoch": 0.4801631442343344, "grad_norm": 5.2890625, "learning_rate": 9.519836855765666e-06, "loss": 2.8353, "mean_token_accuracy": 0.4370590419606387, "step": 2590 }, { "epoch": 0.4803485354097145, "grad_norm": 5.0, "learning_rate": 9.519651464590286e-06, "loss": 2.6998, "mean_token_accuracy": 0.44785600847009, "step": 2591 }, { "epoch": 0.48053392658509453, "grad_norm": 5.69921875, "learning_rate": 9.519466073414907e-06, "loss": 2.5143, "mean_token_accuracy": 0.5007346459006758, "step": 2592 }, { "epoch": 0.4807193177604746, "grad_norm": 6.3125, "learning_rate": 9.519280682239527e-06, "loss": 2.726, "mean_token_accuracy": 0.4499440089585666, "step": 2593 }, { "epoch": 0.48090470893585463, "grad_norm": 6.98046875, "learning_rate": 9.519095291064146e-06, "loss": 2.6136, "mean_token_accuracy": 0.4663587065675838, "step": 2594 }, { "epoch": 0.4810901001112347, "grad_norm": 6.859375, "learning_rate": 9.518909899888766e-06, "loss": 2.7489, "mean_token_accuracy": 0.44330023292877285, "step": 2595 }, { "epoch": 0.48127549128661473, "grad_norm": 4.94140625, "learning_rate": 9.518724508713387e-06, "loss": 2.8618, "mean_token_accuracy": 0.4410502398384246, "step": 2596 }, { "epoch": 0.4814608824619948, "grad_norm": 7.0, "learning_rate": 9.518539117538006e-06, "loss": 2.785, "mean_token_accuracy": 0.4428932322829411, "step": 2597 }, { "epoch": 0.48164627363737483, "grad_norm": 7.4453125, "learning_rate": 9.518353726362626e-06, "loss": 2.8649, "mean_token_accuracy": 0.4088635732147819, "step": 2598 }, { "epoch": 0.4818316648127549, "grad_norm": 5.3515625, "learning_rate": 9.518168335187245e-06, "loss": 3.4449, "mean_token_accuracy": 0.3666745116498, "step": 2599 }, { "epoch": 0.482017055988135, "grad_norm": 6.28515625, "learning_rate": 9.517982944011865e-06, "loss": 3.191, "mean_token_accuracy": 0.41676519271695434, "step": 2600 }, { "epoch": 0.482202447163515, "grad_norm": 6.23046875, "learning_rate": 9.517797552836486e-06, "loss": 2.7283, "mean_token_accuracy": 0.4406799259944496, "step": 2601 }, { "epoch": 0.4823878383388951, "grad_norm": 9.4375, "learning_rate": 9.517612161661106e-06, "loss": 2.4779, "mean_token_accuracy": 0.48483920367534455, "step": 2602 }, { "epoch": 0.4825732295142751, "grad_norm": 5.5234375, "learning_rate": 9.517426770485725e-06, "loss": 2.9092, "mean_token_accuracy": 0.42191036668175647, "step": 2603 }, { "epoch": 0.4827586206896552, "grad_norm": 5.578125, "learning_rate": 9.517241379310346e-06, "loss": 3.0931, "mean_token_accuracy": 0.4219535389213422, "step": 2604 }, { "epoch": 0.4829440118650352, "grad_norm": 8.4765625, "learning_rate": 9.517055988134966e-06, "loss": 2.9972, "mean_token_accuracy": 0.41296813862493015, "step": 2605 }, { "epoch": 0.4831294030404153, "grad_norm": 9.1640625, "learning_rate": 9.516870596959585e-06, "loss": 3.2786, "mean_token_accuracy": 0.39085204755614267, "step": 2606 }, { "epoch": 0.4833147942157953, "grad_norm": 5.3671875, "learning_rate": 9.516685205784205e-06, "loss": 2.6184, "mean_token_accuracy": 0.44262749445676275, "step": 2607 }, { "epoch": 0.4835001853911754, "grad_norm": 6.28125, "learning_rate": 9.516499814608824e-06, "loss": 3.1467, "mean_token_accuracy": 0.39843943522417635, "step": 2608 }, { "epoch": 0.4836855765665554, "grad_norm": 6.33984375, "learning_rate": 9.516314423433445e-06, "loss": 2.7146, "mean_token_accuracy": 0.4528700906344411, "step": 2609 }, { "epoch": 0.4838709677419355, "grad_norm": 7.44921875, "learning_rate": 9.516129032258065e-06, "loss": 2.6833, "mean_token_accuracy": 0.45627687896784763, "step": 2610 }, { "epoch": 0.4840563589173155, "grad_norm": 6.20703125, "learning_rate": 9.515943641082686e-06, "loss": 3.2867, "mean_token_accuracy": 0.3927054708968274, "step": 2611 }, { "epoch": 0.4842417500926956, "grad_norm": 5.28125, "learning_rate": 9.515758249907305e-06, "loss": 2.9137, "mean_token_accuracy": 0.450512682137075, "step": 2612 }, { "epoch": 0.4844271412680756, "grad_norm": 7.13671875, "learning_rate": 9.515572858731925e-06, "loss": 2.8444, "mean_token_accuracy": 0.43803859691710156, "step": 2613 }, { "epoch": 0.4846125324434557, "grad_norm": 7.19921875, "learning_rate": 9.515387467556545e-06, "loss": 2.7975, "mean_token_accuracy": 0.4414752305047664, "step": 2614 }, { "epoch": 0.4847979236188357, "grad_norm": 7.40234375, "learning_rate": 9.515202076381164e-06, "loss": 3.154, "mean_token_accuracy": 0.408891671884784, "step": 2615 }, { "epoch": 0.4849833147942158, "grad_norm": 8.1484375, "learning_rate": 9.515016685205785e-06, "loss": 2.5771, "mean_token_accuracy": 0.48056780595369347, "step": 2616 }, { "epoch": 0.4851687059695958, "grad_norm": 7.91796875, "learning_rate": 9.514831294030404e-06, "loss": 2.6237, "mean_token_accuracy": 0.44602415702063414, "step": 2617 }, { "epoch": 0.4853540971449759, "grad_norm": 8.375, "learning_rate": 9.514645902855026e-06, "loss": 2.9895, "mean_token_accuracy": 0.4290997013227137, "step": 2618 }, { "epoch": 0.4855394883203559, "grad_norm": 10.25, "learning_rate": 9.514460511679645e-06, "loss": 3.4011, "mean_token_accuracy": 0.378950378950379, "step": 2619 }, { "epoch": 0.485724879495736, "grad_norm": 8.4140625, "learning_rate": 9.514275120504265e-06, "loss": 2.6076, "mean_token_accuracy": 0.4735290783357896, "step": 2620 }, { "epoch": 0.48591027067111603, "grad_norm": 6.03515625, "learning_rate": 9.514089729328886e-06, "loss": 2.7027, "mean_token_accuracy": 0.43832715722604554, "step": 2621 }, { "epoch": 0.4860956618464961, "grad_norm": 7.0, "learning_rate": 9.513904338153504e-06, "loss": 2.8333, "mean_token_accuracy": 0.4431761786600496, "step": 2622 }, { "epoch": 0.4862810530218762, "grad_norm": 9.71875, "learning_rate": 9.513718946978125e-06, "loss": 2.7316, "mean_token_accuracy": 0.4498026640355205, "step": 2623 }, { "epoch": 0.4864664441972562, "grad_norm": 8.3203125, "learning_rate": 9.513533555802744e-06, "loss": 2.6717, "mean_token_accuracy": 0.4897985705003249, "step": 2624 }, { "epoch": 0.4866518353726363, "grad_norm": 5.35546875, "learning_rate": 9.513348164627364e-06, "loss": 3.2429, "mean_token_accuracy": 0.3886930776822219, "step": 2625 }, { "epoch": 0.4868372265480163, "grad_norm": 9.0, "learning_rate": 9.513162773451985e-06, "loss": 2.8459, "mean_token_accuracy": 0.45525027203482044, "step": 2626 }, { "epoch": 0.4870226177233964, "grad_norm": 10.0390625, "learning_rate": 9.512977382276605e-06, "loss": 3.2139, "mean_token_accuracy": 0.39025394646533973, "step": 2627 }, { "epoch": 0.4872080088987764, "grad_norm": 6.40234375, "learning_rate": 9.512791991101224e-06, "loss": 3.1823, "mean_token_accuracy": 0.3923456790123457, "step": 2628 }, { "epoch": 0.4873934000741565, "grad_norm": 9.203125, "learning_rate": 9.512606599925844e-06, "loss": 2.0783, "mean_token_accuracy": 0.5319126710540061, "step": 2629 }, { "epoch": 0.4875787912495365, "grad_norm": 8.734375, "learning_rate": 9.512421208750465e-06, "loss": 2.5481, "mean_token_accuracy": 0.46162112373349706, "step": 2630 }, { "epoch": 0.4877641824249166, "grad_norm": 9.40625, "learning_rate": 9.512235817575084e-06, "loss": 2.5611, "mean_token_accuracy": 0.45832326648949023, "step": 2631 }, { "epoch": 0.4879495736002966, "grad_norm": 9.3125, "learning_rate": 9.512050426399704e-06, "loss": 2.6694, "mean_token_accuracy": 0.4398270812996793, "step": 2632 }, { "epoch": 0.4881349647756767, "grad_norm": 6.90234375, "learning_rate": 9.511865035224323e-06, "loss": 3.3448, "mean_token_accuracy": 0.39632380712577114, "step": 2633 }, { "epoch": 0.4883203559510567, "grad_norm": 5.89453125, "learning_rate": 9.511679644048945e-06, "loss": 2.9128, "mean_token_accuracy": 0.428627399764921, "step": 2634 }, { "epoch": 0.4885057471264368, "grad_norm": 9.4921875, "learning_rate": 9.511494252873564e-06, "loss": 2.8528, "mean_token_accuracy": 0.4206848357791754, "step": 2635 }, { "epoch": 0.4886911383018168, "grad_norm": 10.640625, "learning_rate": 9.511308861698184e-06, "loss": 2.6495, "mean_token_accuracy": 0.45034324942791765, "step": 2636 }, { "epoch": 0.4888765294771969, "grad_norm": 8.0546875, "learning_rate": 9.511123470522803e-06, "loss": 2.6312, "mean_token_accuracy": 0.4571123321123321, "step": 2637 }, { "epoch": 0.4890619206525769, "grad_norm": 6.859375, "learning_rate": 9.510938079347424e-06, "loss": 2.8677, "mean_token_accuracy": 0.43048845947396674, "step": 2638 }, { "epoch": 0.489247311827957, "grad_norm": 7.72265625, "learning_rate": 9.510752688172044e-06, "loss": 3.0873, "mean_token_accuracy": 0.4007341206511331, "step": 2639 }, { "epoch": 0.489432703003337, "grad_norm": 8.375, "learning_rate": 9.510567296996663e-06, "loss": 3.0211, "mean_token_accuracy": 0.40501132502831255, "step": 2640 }, { "epoch": 0.4896180941787171, "grad_norm": 7.2578125, "learning_rate": 9.510381905821284e-06, "loss": 2.8712, "mean_token_accuracy": 0.44693473961766644, "step": 2641 }, { "epoch": 0.4898034853540971, "grad_norm": 6.77734375, "learning_rate": 9.510196514645904e-06, "loss": 2.7596, "mean_token_accuracy": 0.43508673754896476, "step": 2642 }, { "epoch": 0.4899888765294772, "grad_norm": 7.62890625, "learning_rate": 9.510011123470524e-06, "loss": 2.5579, "mean_token_accuracy": 0.4839506172839506, "step": 2643 }, { "epoch": 0.4901742677048572, "grad_norm": 8.1171875, "learning_rate": 9.509825732295143e-06, "loss": 2.8385, "mean_token_accuracy": 0.4312160129584635, "step": 2644 }, { "epoch": 0.4903596588802373, "grad_norm": 6.19921875, "learning_rate": 9.509640341119764e-06, "loss": 2.9947, "mean_token_accuracy": 0.4269778030734206, "step": 2645 }, { "epoch": 0.4905450500556173, "grad_norm": 5.9609375, "learning_rate": 9.509454949944383e-06, "loss": 2.9403, "mean_token_accuracy": 0.4356181150550796, "step": 2646 }, { "epoch": 0.4907304412309974, "grad_norm": 9.0703125, "learning_rate": 9.509269558769003e-06, "loss": 2.8397, "mean_token_accuracy": 0.46260843553694286, "step": 2647 }, { "epoch": 0.4909158324063775, "grad_norm": 10.15625, "learning_rate": 9.509084167593624e-06, "loss": 2.9034, "mean_token_accuracy": 0.42648767264747645, "step": 2648 }, { "epoch": 0.4911012235817575, "grad_norm": 7.3828125, "learning_rate": 9.508898776418242e-06, "loss": 2.739, "mean_token_accuracy": 0.45829566003616634, "step": 2649 }, { "epoch": 0.4912866147571376, "grad_norm": 5.94140625, "learning_rate": 9.508713385242865e-06, "loss": 3.1552, "mean_token_accuracy": 0.4168056018672891, "step": 2650 }, { "epoch": 0.4914720059325176, "grad_norm": 10.7890625, "learning_rate": 9.508527994067483e-06, "loss": 3.2833, "mean_token_accuracy": 0.40257744147552615, "step": 2651 }, { "epoch": 0.4916573971078977, "grad_norm": 11.8515625, "learning_rate": 9.508342602892104e-06, "loss": 3.1695, "mean_token_accuracy": 0.40180666353824496, "step": 2652 }, { "epoch": 0.4918427882832777, "grad_norm": 9.828125, "learning_rate": 9.508157211716723e-06, "loss": 2.7199, "mean_token_accuracy": 0.4552501033484911, "step": 2653 }, { "epoch": 0.4920281794586578, "grad_norm": 6.8984375, "learning_rate": 9.507971820541343e-06, "loss": 2.5699, "mean_token_accuracy": 0.46013491098086917, "step": 2654 }, { "epoch": 0.4922135706340378, "grad_norm": 8.9296875, "learning_rate": 9.507786429365962e-06, "loss": 3.3893, "mean_token_accuracy": 0.3743869209809264, "step": 2655 }, { "epoch": 0.4923989618094179, "grad_norm": 8.0546875, "learning_rate": 9.507601038190582e-06, "loss": 2.3037, "mean_token_accuracy": 0.4937106918238994, "step": 2656 }, { "epoch": 0.4925843529847979, "grad_norm": 9.3359375, "learning_rate": 9.507415647015203e-06, "loss": 3.2925, "mean_token_accuracy": 0.3945649333681735, "step": 2657 }, { "epoch": 0.492769744160178, "grad_norm": 10.0859375, "learning_rate": 9.507230255839823e-06, "loss": 2.4257, "mean_token_accuracy": 0.4860200668896321, "step": 2658 }, { "epoch": 0.492955135335558, "grad_norm": 7.91796875, "learning_rate": 9.507044864664444e-06, "loss": 2.6985, "mean_token_accuracy": 0.44668624696718173, "step": 2659 }, { "epoch": 0.4931405265109381, "grad_norm": 7.203125, "learning_rate": 9.506859473489063e-06, "loss": 2.6933, "mean_token_accuracy": 0.47582001682085784, "step": 2660 }, { "epoch": 0.4933259176863181, "grad_norm": 6.875, "learning_rate": 9.506674082313683e-06, "loss": 2.6479, "mean_token_accuracy": 0.47790279549936204, "step": 2661 }, { "epoch": 0.4935113088616982, "grad_norm": 8.0625, "learning_rate": 9.506488691138302e-06, "loss": 2.5836, "mean_token_accuracy": 0.4588976674191121, "step": 2662 }, { "epoch": 0.4936967000370782, "grad_norm": 5.453125, "learning_rate": 9.506303299962922e-06, "loss": 2.9709, "mean_token_accuracy": 0.4241708957866163, "step": 2663 }, { "epoch": 0.4938820912124583, "grad_norm": 6.23828125, "learning_rate": 9.506117908787543e-06, "loss": 2.7588, "mean_token_accuracy": 0.44397024753078895, "step": 2664 }, { "epoch": 0.4940674823878383, "grad_norm": 7.92578125, "learning_rate": 9.505932517612162e-06, "loss": 2.7507, "mean_token_accuracy": 0.4496847414880202, "step": 2665 }, { "epoch": 0.4942528735632184, "grad_norm": 5.56640625, "learning_rate": 9.505747126436782e-06, "loss": 3.2054, "mean_token_accuracy": 0.41854815661617606, "step": 2666 }, { "epoch": 0.4944382647385984, "grad_norm": 5.14453125, "learning_rate": 9.505561735261403e-06, "loss": 2.8713, "mean_token_accuracy": 0.4441292356185973, "step": 2667 }, { "epoch": 0.4946236559139785, "grad_norm": 6.95703125, "learning_rate": 9.505376344086023e-06, "loss": 2.5086, "mean_token_accuracy": 0.46820491690701826, "step": 2668 }, { "epoch": 0.4948090470893585, "grad_norm": 5.3828125, "learning_rate": 9.505190952910642e-06, "loss": 2.6267, "mean_token_accuracy": 0.4463042313872523, "step": 2669 }, { "epoch": 0.4949944382647386, "grad_norm": 5.78125, "learning_rate": 9.505005561735263e-06, "loss": 2.7403, "mean_token_accuracy": 0.4408957415565345, "step": 2670 }, { "epoch": 0.4951798294401186, "grad_norm": 8.4453125, "learning_rate": 9.504820170559881e-06, "loss": 2.5424, "mean_token_accuracy": 0.46377183967112023, "step": 2671 }, { "epoch": 0.4953652206154987, "grad_norm": 4.98046875, "learning_rate": 9.504634779384502e-06, "loss": 3.6251, "mean_token_accuracy": 0.36990865126276196, "step": 2672 }, { "epoch": 0.4955506117908788, "grad_norm": 8.328125, "learning_rate": 9.504449388209122e-06, "loss": 2.6056, "mean_token_accuracy": 0.4459787750926991, "step": 2673 }, { "epoch": 0.4957360029662588, "grad_norm": 6.75390625, "learning_rate": 9.504263997033743e-06, "loss": 2.7746, "mean_token_accuracy": 0.4481585564265641, "step": 2674 }, { "epoch": 0.4959213941416389, "grad_norm": 9.2109375, "learning_rate": 9.504078605858362e-06, "loss": 2.5874, "mean_token_accuracy": 0.45452066003000136, "step": 2675 }, { "epoch": 0.4961067853170189, "grad_norm": 5.12109375, "learning_rate": 9.503893214682982e-06, "loss": 2.5571, "mean_token_accuracy": 0.4779324055666004, "step": 2676 }, { "epoch": 0.496292176492399, "grad_norm": 5.68359375, "learning_rate": 9.503707823507603e-06, "loss": 2.579, "mean_token_accuracy": 0.4846173086543272, "step": 2677 }, { "epoch": 0.496477567667779, "grad_norm": 5.39453125, "learning_rate": 9.503522432332221e-06, "loss": 2.6925, "mean_token_accuracy": 0.4710943396226415, "step": 2678 }, { "epoch": 0.4966629588431591, "grad_norm": 6.53515625, "learning_rate": 9.503337041156842e-06, "loss": 3.1251, "mean_token_accuracy": 0.40388460439874324, "step": 2679 }, { "epoch": 0.4968483500185391, "grad_norm": 5.42578125, "learning_rate": 9.50315164998146e-06, "loss": 2.7812, "mean_token_accuracy": 0.45014992503748125, "step": 2680 }, { "epoch": 0.4970337411939192, "grad_norm": 5.76953125, "learning_rate": 9.502966258806081e-06, "loss": 3.1132, "mean_token_accuracy": 0.42337114217884464, "step": 2681 }, { "epoch": 0.4972191323692992, "grad_norm": 6.53515625, "learning_rate": 9.502780867630702e-06, "loss": 3.1612, "mean_token_accuracy": 0.41472089067573836, "step": 2682 }, { "epoch": 0.4974045235446793, "grad_norm": 6.48828125, "learning_rate": 9.502595476455322e-06, "loss": 3.0188, "mean_token_accuracy": 0.41335342529268576, "step": 2683 }, { "epoch": 0.4975899147200593, "grad_norm": 6.6328125, "learning_rate": 9.502410085279941e-06, "loss": 2.4824, "mean_token_accuracy": 0.48311729876780285, "step": 2684 }, { "epoch": 0.4977753058954394, "grad_norm": 5.77734375, "learning_rate": 9.502224694104561e-06, "loss": 2.7865, "mean_token_accuracy": 0.45614275909402885, "step": 2685 }, { "epoch": 0.4979606970708194, "grad_norm": 5.45703125, "learning_rate": 9.502039302929182e-06, "loss": 2.9237, "mean_token_accuracy": 0.4332480818414322, "step": 2686 }, { "epoch": 0.4981460882461995, "grad_norm": 5.42578125, "learning_rate": 9.5018539117538e-06, "loss": 3.0401, "mean_token_accuracy": 0.4134461134606971, "step": 2687 }, { "epoch": 0.4983314794215795, "grad_norm": 7.046875, "learning_rate": 9.501668520578421e-06, "loss": 2.8413, "mean_token_accuracy": 0.42769500438212094, "step": 2688 }, { "epoch": 0.4985168705969596, "grad_norm": 9.171875, "learning_rate": 9.50148312940304e-06, "loss": 2.9259, "mean_token_accuracy": 0.4205286239184544, "step": 2689 }, { "epoch": 0.4987022617723396, "grad_norm": 8.6640625, "learning_rate": 9.501297738227662e-06, "loss": 2.8987, "mean_token_accuracy": 0.4297736506094022, "step": 2690 }, { "epoch": 0.4988876529477197, "grad_norm": 5.125, "learning_rate": 9.501112347052281e-06, "loss": 2.5725, "mean_token_accuracy": 0.4769962397699624, "step": 2691 }, { "epoch": 0.4990730441230997, "grad_norm": 8.828125, "learning_rate": 9.500926955876901e-06, "loss": 3.1194, "mean_token_accuracy": 0.40703212078224743, "step": 2692 }, { "epoch": 0.4992584352984798, "grad_norm": 6.15625, "learning_rate": 9.50074156470152e-06, "loss": 2.9328, "mean_token_accuracy": 0.42680071615479964, "step": 2693 }, { "epoch": 0.4994438264738598, "grad_norm": 7.77734375, "learning_rate": 9.50055617352614e-06, "loss": 2.282, "mean_token_accuracy": 0.5137240800671221, "step": 2694 }, { "epoch": 0.4996292176492399, "grad_norm": 7.3515625, "learning_rate": 9.500370782350761e-06, "loss": 2.8449, "mean_token_accuracy": 0.4427303283633543, "step": 2695 }, { "epoch": 0.4998146088246199, "grad_norm": 10.6953125, "learning_rate": 9.50018539117538e-06, "loss": 2.6711, "mean_token_accuracy": 0.4525009693679721, "step": 2696 }, { "epoch": 0.5, "grad_norm": 10.8984375, "learning_rate": 9.5e-06, "loss": 2.7526, "mean_token_accuracy": 0.43572216097023153, "step": 2697 }, { "epoch": 0.5001853911753801, "grad_norm": 6.4140625, "learning_rate": 9.499814608824621e-06, "loss": 2.5365, "mean_token_accuracy": 0.4977139124755062, "step": 2698 }, { "epoch": 0.5003707823507602, "grad_norm": 7.4296875, "learning_rate": 9.499629217649242e-06, "loss": 2.7446, "mean_token_accuracy": 0.46442222875624817, "step": 2699 }, { "epoch": 0.5005561735261401, "grad_norm": 6.88671875, "learning_rate": 9.49944382647386e-06, "loss": 3.187, "mean_token_accuracy": 0.3971187427240978, "step": 2700 }, { "epoch": 0.5007415647015202, "grad_norm": 9.015625, "learning_rate": 9.49925843529848e-06, "loss": 2.3528, "mean_token_accuracy": 0.4882262996941896, "step": 2701 }, { "epoch": 0.5009269558769003, "grad_norm": 5.5859375, "learning_rate": 9.499073044123101e-06, "loss": 3.0496, "mean_token_accuracy": 0.4168702584544169, "step": 2702 }, { "epoch": 0.5011123470522804, "grad_norm": 6.26953125, "learning_rate": 9.49888765294772e-06, "loss": 2.7018, "mean_token_accuracy": 0.4654474199869366, "step": 2703 }, { "epoch": 0.5012977382276603, "grad_norm": 7.234375, "learning_rate": 9.49870226177234e-06, "loss": 2.6871, "mean_token_accuracy": 0.46608023457283754, "step": 2704 }, { "epoch": 0.5014831294030404, "grad_norm": 6.125, "learning_rate": 9.49851687059696e-06, "loss": 3.4367, "mean_token_accuracy": 0.3709294033753268, "step": 2705 }, { "epoch": 0.5016685205784205, "grad_norm": 6.77734375, "learning_rate": 9.498331479421582e-06, "loss": 2.7517, "mean_token_accuracy": 0.45950763061156286, "step": 2706 }, { "epoch": 0.5018539117538006, "grad_norm": 5.58984375, "learning_rate": 9.4981460882462e-06, "loss": 2.9244, "mean_token_accuracy": 0.43453510436432635, "step": 2707 }, { "epoch": 0.5020393029291805, "grad_norm": 4.984375, "learning_rate": 9.49796069707082e-06, "loss": 2.5513, "mean_token_accuracy": 0.4632984901277584, "step": 2708 }, { "epoch": 0.5022246941045606, "grad_norm": 5.41796875, "learning_rate": 9.49777530589544e-06, "loss": 2.7881, "mean_token_accuracy": 0.43689671814671815, "step": 2709 }, { "epoch": 0.5024100852799407, "grad_norm": 5.58984375, "learning_rate": 9.49758991472006e-06, "loss": 2.3617, "mean_token_accuracy": 0.5068433630241147, "step": 2710 }, { "epoch": 0.5025954764553208, "grad_norm": 5.72265625, "learning_rate": 9.49740452354468e-06, "loss": 2.8423, "mean_token_accuracy": 0.4383446956105729, "step": 2711 }, { "epoch": 0.5027808676307007, "grad_norm": 5.66796875, "learning_rate": 9.4972191323693e-06, "loss": 2.8709, "mean_token_accuracy": 0.41535626535626535, "step": 2712 }, { "epoch": 0.5029662588060808, "grad_norm": 5.625, "learning_rate": 9.49703374119392e-06, "loss": 2.7515, "mean_token_accuracy": 0.46480419665731365, "step": 2713 }, { "epoch": 0.5031516499814609, "grad_norm": 5.10546875, "learning_rate": 9.49684835001854e-06, "loss": 2.6408, "mean_token_accuracy": 0.45881662017324915, "step": 2714 }, { "epoch": 0.503337041156841, "grad_norm": 6.875, "learning_rate": 9.496662958843161e-06, "loss": 2.5316, "mean_token_accuracy": 0.481951560316721, "step": 2715 }, { "epoch": 0.5035224323322209, "grad_norm": 6.984375, "learning_rate": 9.49647756766778e-06, "loss": 2.8637, "mean_token_accuracy": 0.4162470182878346, "step": 2716 }, { "epoch": 0.503707823507601, "grad_norm": 6.65234375, "learning_rate": 9.4962921764924e-06, "loss": 2.5862, "mean_token_accuracy": 0.46290111804849715, "step": 2717 }, { "epoch": 0.5038932146829811, "grad_norm": 4.9609375, "learning_rate": 9.496106785317019e-06, "loss": 2.6496, "mean_token_accuracy": 0.4557100027255383, "step": 2718 }, { "epoch": 0.5040786058583612, "grad_norm": 8.3125, "learning_rate": 9.49592139414164e-06, "loss": 3.0468, "mean_token_accuracy": 0.43645242180282434, "step": 2719 }, { "epoch": 0.5042639970337411, "grad_norm": 6.69140625, "learning_rate": 9.49573600296626e-06, "loss": 3.3426, "mean_token_accuracy": 0.3912852112676056, "step": 2720 }, { "epoch": 0.5044493882091212, "grad_norm": 7.30078125, "learning_rate": 9.495550611790879e-06, "loss": 2.9825, "mean_token_accuracy": 0.41556928096958395, "step": 2721 }, { "epoch": 0.5046347793845013, "grad_norm": 7.2109375, "learning_rate": 9.4953652206155e-06, "loss": 2.686, "mean_token_accuracy": 0.44554198076423185, "step": 2722 }, { "epoch": 0.5048201705598814, "grad_norm": 5.9921875, "learning_rate": 9.49517982944012e-06, "loss": 2.9192, "mean_token_accuracy": 0.43998040176384123, "step": 2723 }, { "epoch": 0.5050055617352615, "grad_norm": 5.3671875, "learning_rate": 9.49499443826474e-06, "loss": 2.9849, "mean_token_accuracy": 0.441846787237171, "step": 2724 }, { "epoch": 0.5051909529106414, "grad_norm": 8.0078125, "learning_rate": 9.494809047089359e-06, "loss": 2.3941, "mean_token_accuracy": 0.5016685205784205, "step": 2725 }, { "epoch": 0.5053763440860215, "grad_norm": 7.22265625, "learning_rate": 9.49462365591398e-06, "loss": 3.0942, "mean_token_accuracy": 0.418109474137004, "step": 2726 }, { "epoch": 0.5055617352614016, "grad_norm": 6.7734375, "learning_rate": 9.494438264738598e-06, "loss": 2.6008, "mean_token_accuracy": 0.4416214673137311, "step": 2727 }, { "epoch": 0.5057471264367817, "grad_norm": 7.0390625, "learning_rate": 9.494252873563219e-06, "loss": 2.5043, "mean_token_accuracy": 0.4632187285728279, "step": 2728 }, { "epoch": 0.5059325176121616, "grad_norm": 5.99609375, "learning_rate": 9.49406748238784e-06, "loss": 2.9766, "mean_token_accuracy": 0.4378268758672217, "step": 2729 }, { "epoch": 0.5061179087875417, "grad_norm": 6.14453125, "learning_rate": 9.493882091212458e-06, "loss": 2.963, "mean_token_accuracy": 0.4203009828009828, "step": 2730 }, { "epoch": 0.5063032999629218, "grad_norm": 6.5546875, "learning_rate": 9.493696700037079e-06, "loss": 3.1021, "mean_token_accuracy": 0.4140923801117976, "step": 2731 }, { "epoch": 0.5064886911383019, "grad_norm": 6.10546875, "learning_rate": 9.493511308861699e-06, "loss": 3.0982, "mean_token_accuracy": 0.41766954551890134, "step": 2732 }, { "epoch": 0.5066740823136818, "grad_norm": 4.8828125, "learning_rate": 9.49332591768632e-06, "loss": 2.7017, "mean_token_accuracy": 0.4389864099660249, "step": 2733 }, { "epoch": 0.5068594734890619, "grad_norm": 6.3671875, "learning_rate": 9.493140526510938e-06, "loss": 3.0815, "mean_token_accuracy": 0.40806017063313876, "step": 2734 }, { "epoch": 0.507044864664442, "grad_norm": 6.89453125, "learning_rate": 9.492955135335559e-06, "loss": 3.1153, "mean_token_accuracy": 0.4172641238887296, "step": 2735 }, { "epoch": 0.5072302558398221, "grad_norm": 6.1328125, "learning_rate": 9.492769744160178e-06, "loss": 2.9954, "mean_token_accuracy": 0.4334806787394838, "step": 2736 }, { "epoch": 0.507415647015202, "grad_norm": 6.17578125, "learning_rate": 9.492584352984798e-06, "loss": 2.9505, "mean_token_accuracy": 0.4669270505228676, "step": 2737 }, { "epoch": 0.5076010381905821, "grad_norm": 5.71484375, "learning_rate": 9.492398961809419e-06, "loss": 2.5795, "mean_token_accuracy": 0.4636956823545745, "step": 2738 }, { "epoch": 0.5077864293659622, "grad_norm": 5.875, "learning_rate": 9.492213570634039e-06, "loss": 2.6799, "mean_token_accuracy": 0.45851578704770685, "step": 2739 }, { "epoch": 0.5079718205413423, "grad_norm": 5.18359375, "learning_rate": 9.49202817945866e-06, "loss": 2.5443, "mean_token_accuracy": 0.4568421052631579, "step": 2740 }, { "epoch": 0.5081572117167222, "grad_norm": 5.99609375, "learning_rate": 9.491842788283278e-06, "loss": 2.5423, "mean_token_accuracy": 0.4743191591017678, "step": 2741 }, { "epoch": 0.5083426028921023, "grad_norm": 7.44921875, "learning_rate": 9.491657397107899e-06, "loss": 2.299, "mean_token_accuracy": 0.5021197668256492, "step": 2742 }, { "epoch": 0.5085279940674824, "grad_norm": 6.265625, "learning_rate": 9.491472005932518e-06, "loss": 3.1371, "mean_token_accuracy": 0.399055640632976, "step": 2743 }, { "epoch": 0.5087133852428625, "grad_norm": 5.484375, "learning_rate": 9.491286614757138e-06, "loss": 3.2764, "mean_token_accuracy": 0.40266328471781865, "step": 2744 }, { "epoch": 0.5088987764182424, "grad_norm": 6.82421875, "learning_rate": 9.491101223581759e-06, "loss": 3.1831, "mean_token_accuracy": 0.38971827594509995, "step": 2745 }, { "epoch": 0.5090841675936225, "grad_norm": 9.015625, "learning_rate": 9.490915832406377e-06, "loss": 3.3505, "mean_token_accuracy": 0.36485324818862824, "step": 2746 }, { "epoch": 0.5092695587690026, "grad_norm": 5.9921875, "learning_rate": 9.490730441230998e-06, "loss": 3.2547, "mean_token_accuracy": 0.39100684261974583, "step": 2747 }, { "epoch": 0.5094549499443827, "grad_norm": 5.828125, "learning_rate": 9.490545050055618e-06, "loss": 2.7544, "mean_token_accuracy": 0.4356773134705214, "step": 2748 }, { "epoch": 0.5096403411197628, "grad_norm": 7.28515625, "learning_rate": 9.490359658880239e-06, "loss": 3.0145, "mean_token_accuracy": 0.4150917176209005, "step": 2749 }, { "epoch": 0.5098257322951427, "grad_norm": 5.71875, "learning_rate": 9.490174267704858e-06, "loss": 2.9295, "mean_token_accuracy": 0.4291912530371399, "step": 2750 }, { "epoch": 0.5100111234705228, "grad_norm": 9.578125, "learning_rate": 9.489988876529478e-06, "loss": 3.1402, "mean_token_accuracy": 0.4268292682926829, "step": 2751 }, { "epoch": 0.5101965146459029, "grad_norm": 7.3671875, "learning_rate": 9.489803485354097e-06, "loss": 2.4615, "mean_token_accuracy": 0.4672114191146049, "step": 2752 }, { "epoch": 0.510381905821283, "grad_norm": 6.49609375, "learning_rate": 9.489618094178718e-06, "loss": 2.4887, "mean_token_accuracy": 0.47094484696140765, "step": 2753 }, { "epoch": 0.5105672969966629, "grad_norm": 7.0546875, "learning_rate": 9.489432703003338e-06, "loss": 2.8353, "mean_token_accuracy": 0.43817899637868596, "step": 2754 }, { "epoch": 0.510752688172043, "grad_norm": 6.00390625, "learning_rate": 9.489247311827959e-06, "loss": 2.5583, "mean_token_accuracy": 0.45448397132422563, "step": 2755 }, { "epoch": 0.5109380793474231, "grad_norm": 6.25390625, "learning_rate": 9.489061920652577e-06, "loss": 3.03, "mean_token_accuracy": 0.4084987593052109, "step": 2756 }, { "epoch": 0.5111234705228032, "grad_norm": 6.3046875, "learning_rate": 9.488876529477198e-06, "loss": 2.9225, "mean_token_accuracy": 0.4413754227733935, "step": 2757 }, { "epoch": 0.5113088616981831, "grad_norm": 6.08984375, "learning_rate": 9.488691138301818e-06, "loss": 3.0192, "mean_token_accuracy": 0.42981410867492853, "step": 2758 }, { "epoch": 0.5114942528735632, "grad_norm": 6.72265625, "learning_rate": 9.488505747126437e-06, "loss": 3.2597, "mean_token_accuracy": 0.411298457991065, "step": 2759 }, { "epoch": 0.5116796440489433, "grad_norm": 7.00390625, "learning_rate": 9.488320355951058e-06, "loss": 2.5509, "mean_token_accuracy": 0.45740392367643107, "step": 2760 }, { "epoch": 0.5118650352243234, "grad_norm": 4.81640625, "learning_rate": 9.488134964775676e-06, "loss": 2.827, "mean_token_accuracy": 0.4475289431096726, "step": 2761 }, { "epoch": 0.5120504263997033, "grad_norm": 6.72265625, "learning_rate": 9.487949573600297e-06, "loss": 3.2967, "mean_token_accuracy": 0.3988713607797871, "step": 2762 }, { "epoch": 0.5122358175750834, "grad_norm": 6.5625, "learning_rate": 9.487764182424917e-06, "loss": 2.554, "mean_token_accuracy": 0.4677708146821844, "step": 2763 }, { "epoch": 0.5124212087504635, "grad_norm": 6.1171875, "learning_rate": 9.487578791249538e-06, "loss": 3.162, "mean_token_accuracy": 0.39616128141397405, "step": 2764 }, { "epoch": 0.5126065999258436, "grad_norm": 9.375, "learning_rate": 9.487393400074157e-06, "loss": 2.555, "mean_token_accuracy": 0.46008708272859217, "step": 2765 }, { "epoch": 0.5127919911012235, "grad_norm": 5.953125, "learning_rate": 9.487208008898777e-06, "loss": 2.7105, "mean_token_accuracy": 0.4464145614310372, "step": 2766 }, { "epoch": 0.5129773822766036, "grad_norm": 7.1953125, "learning_rate": 9.487022617723398e-06, "loss": 2.2738, "mean_token_accuracy": 0.5102252673102875, "step": 2767 }, { "epoch": 0.5131627734519837, "grad_norm": 5.84375, "learning_rate": 9.486837226548016e-06, "loss": 3.2163, "mean_token_accuracy": 0.40253447633246364, "step": 2768 }, { "epoch": 0.5133481646273638, "grad_norm": 6.72265625, "learning_rate": 9.486651835372637e-06, "loss": 2.6652, "mean_token_accuracy": 0.4756300193852119, "step": 2769 }, { "epoch": 0.5135335558027437, "grad_norm": 7.3828125, "learning_rate": 9.486466444197256e-06, "loss": 2.3139, "mean_token_accuracy": 0.5039230574538092, "step": 2770 }, { "epoch": 0.5137189469781238, "grad_norm": 5.6875, "learning_rate": 9.486281053021878e-06, "loss": 2.7999, "mean_token_accuracy": 0.44335587139506, "step": 2771 }, { "epoch": 0.5139043381535039, "grad_norm": 8.09375, "learning_rate": 9.486095661846497e-06, "loss": 2.737, "mean_token_accuracy": 0.45554956896551724, "step": 2772 }, { "epoch": 0.514089729328884, "grad_norm": 10.46875, "learning_rate": 9.485910270671117e-06, "loss": 2.5759, "mean_token_accuracy": 0.44888295288975233, "step": 2773 }, { "epoch": 0.514275120504264, "grad_norm": 5.73828125, "learning_rate": 9.485724879495736e-06, "loss": 2.8279, "mean_token_accuracy": 0.4282669290047549, "step": 2774 }, { "epoch": 0.514460511679644, "grad_norm": 8.78125, "learning_rate": 9.485539488320356e-06, "loss": 3.4112, "mean_token_accuracy": 0.37783711615487314, "step": 2775 }, { "epoch": 0.5146459028550241, "grad_norm": 7.6328125, "learning_rate": 9.485354097144977e-06, "loss": 2.8436, "mean_token_accuracy": 0.43803834398444014, "step": 2776 }, { "epoch": 0.5148312940304042, "grad_norm": 8.3984375, "learning_rate": 9.485168705969596e-06, "loss": 2.7303, "mean_token_accuracy": 0.43912205249888775, "step": 2777 }, { "epoch": 0.5150166852057843, "grad_norm": 6.1015625, "learning_rate": 9.484983314794216e-06, "loss": 3.2752, "mean_token_accuracy": 0.4089532144059239, "step": 2778 }, { "epoch": 0.5152020763811642, "grad_norm": 5.79296875, "learning_rate": 9.484797923618837e-06, "loss": 2.4319, "mean_token_accuracy": 0.47305389221556887, "step": 2779 }, { "epoch": 0.5153874675565443, "grad_norm": 6.140625, "learning_rate": 9.484612532443457e-06, "loss": 2.9958, "mean_token_accuracy": 0.4305191873589165, "step": 2780 }, { "epoch": 0.5155728587319244, "grad_norm": 7.5703125, "learning_rate": 9.484427141268076e-06, "loss": 3.0578, "mean_token_accuracy": 0.41756548536209553, "step": 2781 }, { "epoch": 0.5157582499073045, "grad_norm": 8.640625, "learning_rate": 9.484241750092697e-06, "loss": 2.9061, "mean_token_accuracy": 0.42881982177787437, "step": 2782 }, { "epoch": 0.5159436410826844, "grad_norm": 7.73828125, "learning_rate": 9.484056358917317e-06, "loss": 2.7509, "mean_token_accuracy": 0.4404933881542394, "step": 2783 }, { "epoch": 0.5161290322580645, "grad_norm": 8.4296875, "learning_rate": 9.483870967741936e-06, "loss": 2.7492, "mean_token_accuracy": 0.4400839195359743, "step": 2784 }, { "epoch": 0.5163144234334446, "grad_norm": 8.8515625, "learning_rate": 9.483685576566556e-06, "loss": 2.8196, "mean_token_accuracy": 0.44691430242761043, "step": 2785 }, { "epoch": 0.5164998146088247, "grad_norm": 5.12109375, "learning_rate": 9.483500185391175e-06, "loss": 2.4553, "mean_token_accuracy": 0.485009910802775, "step": 2786 }, { "epoch": 0.5166852057842046, "grad_norm": 12.375, "learning_rate": 9.483314794215797e-06, "loss": 2.4553, "mean_token_accuracy": 0.4479944674965422, "step": 2787 }, { "epoch": 0.5168705969595847, "grad_norm": 7.98046875, "learning_rate": 9.483129403040416e-06, "loss": 3.0793, "mean_token_accuracy": 0.40118066658467594, "step": 2788 }, { "epoch": 0.5170559881349648, "grad_norm": 6.27734375, "learning_rate": 9.482944011865037e-06, "loss": 2.7857, "mean_token_accuracy": 0.43041949258980156, "step": 2789 }, { "epoch": 0.5172413793103449, "grad_norm": 6.078125, "learning_rate": 9.482758620689655e-06, "loss": 3.4801, "mean_token_accuracy": 0.38734323194464465, "step": 2790 }, { "epoch": 0.5174267704857248, "grad_norm": 8.015625, "learning_rate": 9.482573229514276e-06, "loss": 3.0161, "mean_token_accuracy": 0.42345244086562656, "step": 2791 }, { "epoch": 0.5176121616611049, "grad_norm": 6.78515625, "learning_rate": 9.482387838338896e-06, "loss": 2.7716, "mean_token_accuracy": 0.43864651649082786, "step": 2792 }, { "epoch": 0.517797552836485, "grad_norm": 7.0078125, "learning_rate": 9.482202447163515e-06, "loss": 2.2453, "mean_token_accuracy": 0.5304557865994539, "step": 2793 }, { "epoch": 0.5179829440118651, "grad_norm": 7.5390625, "learning_rate": 9.482017055988136e-06, "loss": 2.8603, "mean_token_accuracy": 0.44338899954037075, "step": 2794 }, { "epoch": 0.518168335187245, "grad_norm": 5.796875, "learning_rate": 9.481831664812756e-06, "loss": 2.6136, "mean_token_accuracy": 0.4619238476953908, "step": 2795 }, { "epoch": 0.5183537263626251, "grad_norm": 5.04296875, "learning_rate": 9.481646273637377e-06, "loss": 2.4041, "mean_token_accuracy": 0.49196282121377805, "step": 2796 }, { "epoch": 0.5185391175380052, "grad_norm": 6.6171875, "learning_rate": 9.481460882461995e-06, "loss": 3.0594, "mean_token_accuracy": 0.4235943917292168, "step": 2797 }, { "epoch": 0.5187245087133853, "grad_norm": 8.25, "learning_rate": 9.481275491286616e-06, "loss": 2.6918, "mean_token_accuracy": 0.44341522351993556, "step": 2798 }, { "epoch": 0.5189098998887653, "grad_norm": 10.234375, "learning_rate": 9.481090100111235e-06, "loss": 2.7453, "mean_token_accuracy": 0.4241340782122905, "step": 2799 }, { "epoch": 0.5190952910641453, "grad_norm": 7.2421875, "learning_rate": 9.480904708935855e-06, "loss": 3.0141, "mean_token_accuracy": 0.43438238586888916, "step": 2800 }, { "epoch": 0.5192806822395254, "grad_norm": 8.0625, "learning_rate": 9.480719317760476e-06, "loss": 2.551, "mean_token_accuracy": 0.47785081451843386, "step": 2801 }, { "epoch": 0.5194660734149055, "grad_norm": 6.31640625, "learning_rate": 9.480533926585095e-06, "loss": 2.6175, "mean_token_accuracy": 0.4487438313144908, "step": 2802 }, { "epoch": 0.5196514645902856, "grad_norm": 9.0703125, "learning_rate": 9.480348535409715e-06, "loss": 2.4346, "mean_token_accuracy": 0.4910071942446043, "step": 2803 }, { "epoch": 0.5198368557656655, "grad_norm": 6.10546875, "learning_rate": 9.480163144234335e-06, "loss": 2.4634, "mean_token_accuracy": 0.48520070279767535, "step": 2804 }, { "epoch": 0.5200222469410456, "grad_norm": 7.00390625, "learning_rate": 9.479977753058956e-06, "loss": 2.956, "mean_token_accuracy": 0.44159426286029513, "step": 2805 }, { "epoch": 0.5202076381164257, "grad_norm": 5.78515625, "learning_rate": 9.479792361883575e-06, "loss": 3.2777, "mean_token_accuracy": 0.39315770215879986, "step": 2806 }, { "epoch": 0.5203930292918058, "grad_norm": 6.43359375, "learning_rate": 9.479606970708195e-06, "loss": 2.0691, "mean_token_accuracy": 0.5525837444819527, "step": 2807 }, { "epoch": 0.5205784204671857, "grad_norm": 4.953125, "learning_rate": 9.479421579532814e-06, "loss": 3.038, "mean_token_accuracy": 0.42873596314425566, "step": 2808 }, { "epoch": 0.5207638116425658, "grad_norm": 5.3984375, "learning_rate": 9.479236188357435e-06, "loss": 2.6201, "mean_token_accuracy": 0.46547153137230146, "step": 2809 }, { "epoch": 0.5209492028179459, "grad_norm": 7.6171875, "learning_rate": 9.479050797182055e-06, "loss": 3.1412, "mean_token_accuracy": 0.408819287038945, "step": 2810 }, { "epoch": 0.521134593993326, "grad_norm": 5.4609375, "learning_rate": 9.478865406006676e-06, "loss": 3.0968, "mean_token_accuracy": 0.39909946786737616, "step": 2811 }, { "epoch": 0.5213199851687059, "grad_norm": 8.5078125, "learning_rate": 9.478680014831294e-06, "loss": 2.7351, "mean_token_accuracy": 0.4606436603334626, "step": 2812 }, { "epoch": 0.521505376344086, "grad_norm": 6.375, "learning_rate": 9.478494623655915e-06, "loss": 3.3519, "mean_token_accuracy": 0.39111052838599714, "step": 2813 }, { "epoch": 0.5216907675194661, "grad_norm": 7.5625, "learning_rate": 9.478309232480535e-06, "loss": 2.3896, "mean_token_accuracy": 0.4905318623415116, "step": 2814 }, { "epoch": 0.5218761586948462, "grad_norm": 7.05859375, "learning_rate": 9.478123841305154e-06, "loss": 2.9364, "mean_token_accuracy": 0.4266785767455709, "step": 2815 }, { "epoch": 0.5220615498702261, "grad_norm": 7.43359375, "learning_rate": 9.477938450129775e-06, "loss": 2.4496, "mean_token_accuracy": 0.4929934605631923, "step": 2816 }, { "epoch": 0.5222469410456062, "grad_norm": 4.859375, "learning_rate": 9.477753058954393e-06, "loss": 2.8325, "mean_token_accuracy": 0.4291353622819305, "step": 2817 }, { "epoch": 0.5224323322209863, "grad_norm": 10.5625, "learning_rate": 9.477567667779014e-06, "loss": 1.9907, "mean_token_accuracy": 0.517725258493353, "step": 2818 }, { "epoch": 0.5226177233963664, "grad_norm": 6.02734375, "learning_rate": 9.477382276603634e-06, "loss": 2.7372, "mean_token_accuracy": 0.4534977759805904, "step": 2819 }, { "epoch": 0.5228031145717463, "grad_norm": 6.3828125, "learning_rate": 9.477196885428255e-06, "loss": 2.8026, "mean_token_accuracy": 0.43660800886795065, "step": 2820 }, { "epoch": 0.5229885057471264, "grad_norm": 6.84765625, "learning_rate": 9.477011494252875e-06, "loss": 3.0844, "mean_token_accuracy": 0.40865892291446676, "step": 2821 }, { "epoch": 0.5231738969225065, "grad_norm": 7.56640625, "learning_rate": 9.476826103077494e-06, "loss": 3.0595, "mean_token_accuracy": 0.4151254117050925, "step": 2822 }, { "epoch": 0.5233592880978866, "grad_norm": 9.1328125, "learning_rate": 9.476640711902115e-06, "loss": 2.7696, "mean_token_accuracy": 0.43795883981225175, "step": 2823 }, { "epoch": 0.5235446792732666, "grad_norm": 7.6640625, "learning_rate": 9.476455320726733e-06, "loss": 2.5474, "mean_token_accuracy": 0.45962732919254656, "step": 2824 }, { "epoch": 0.5237300704486466, "grad_norm": 5.39453125, "learning_rate": 9.476269929551354e-06, "loss": 3.0124, "mean_token_accuracy": 0.4206396385685412, "step": 2825 }, { "epoch": 0.5239154616240267, "grad_norm": 7.1328125, "learning_rate": 9.476084538375974e-06, "loss": 2.9489, "mean_token_accuracy": 0.41460750853242323, "step": 2826 }, { "epoch": 0.5241008527994068, "grad_norm": 6.3046875, "learning_rate": 9.475899147200595e-06, "loss": 2.6346, "mean_token_accuracy": 0.4561946902654867, "step": 2827 }, { "epoch": 0.5242862439747868, "grad_norm": 5.92578125, "learning_rate": 9.475713756025214e-06, "loss": 3.0241, "mean_token_accuracy": 0.4243879582991683, "step": 2828 }, { "epoch": 0.5244716351501668, "grad_norm": 5.6484375, "learning_rate": 9.475528364849834e-06, "loss": 2.9676, "mean_token_accuracy": 0.44555521378037527, "step": 2829 }, { "epoch": 0.5246570263255469, "grad_norm": 6.5, "learning_rate": 9.475342973674455e-06, "loss": 2.8413, "mean_token_accuracy": 0.45789821546596166, "step": 2830 }, { "epoch": 0.524842417500927, "grad_norm": 7.89453125, "learning_rate": 9.475157582499074e-06, "loss": 2.3546, "mean_token_accuracy": 0.5083143507972665, "step": 2831 }, { "epoch": 0.525027808676307, "grad_norm": 6.69921875, "learning_rate": 9.474972191323694e-06, "loss": 3.0864, "mean_token_accuracy": 0.4277812895069532, "step": 2832 }, { "epoch": 0.525213199851687, "grad_norm": 5.90625, "learning_rate": 9.474786800148313e-06, "loss": 3.0219, "mean_token_accuracy": 0.42331007663273473, "step": 2833 }, { "epoch": 0.5253985910270671, "grad_norm": 7.84375, "learning_rate": 9.474601408972933e-06, "loss": 2.6578, "mean_token_accuracy": 0.4644268774703557, "step": 2834 }, { "epoch": 0.5255839822024472, "grad_norm": 7.78125, "learning_rate": 9.474416017797554e-06, "loss": 2.7074, "mean_token_accuracy": 0.4436526150501268, "step": 2835 }, { "epoch": 0.5257693733778273, "grad_norm": 7.140625, "learning_rate": 9.474230626622174e-06, "loss": 2.656, "mean_token_accuracy": 0.4467005076142132, "step": 2836 }, { "epoch": 0.5259547645532072, "grad_norm": 7.32421875, "learning_rate": 9.474045235446793e-06, "loss": 3.4224, "mean_token_accuracy": 0.37782245592329106, "step": 2837 }, { "epoch": 0.5261401557285873, "grad_norm": 5.98046875, "learning_rate": 9.473859844271414e-06, "loss": 2.6069, "mean_token_accuracy": 0.4603002840524821, "step": 2838 }, { "epoch": 0.5263255469039674, "grad_norm": 5.59765625, "learning_rate": 9.473674453096034e-06, "loss": 2.7357, "mean_token_accuracy": 0.4543397080457169, "step": 2839 }, { "epoch": 0.5265109380793475, "grad_norm": 5.859375, "learning_rate": 9.473489061920653e-06, "loss": 2.7469, "mean_token_accuracy": 0.4561710137133638, "step": 2840 }, { "epoch": 0.5266963292547274, "grad_norm": 5.734375, "learning_rate": 9.473303670745273e-06, "loss": 2.8876, "mean_token_accuracy": 0.43061488030197675, "step": 2841 }, { "epoch": 0.5268817204301075, "grad_norm": 7.40234375, "learning_rate": 9.473118279569892e-06, "loss": 3.0507, "mean_token_accuracy": 0.4270610596341174, "step": 2842 }, { "epoch": 0.5270671116054876, "grad_norm": 6.23828125, "learning_rate": 9.472932888394514e-06, "loss": 2.785, "mean_token_accuracy": 0.4538633461047254, "step": 2843 }, { "epoch": 0.5272525027808677, "grad_norm": 6.234375, "learning_rate": 9.472747497219133e-06, "loss": 2.3599, "mean_token_accuracy": 0.49093484419263456, "step": 2844 }, { "epoch": 0.5274378939562476, "grad_norm": 6.41015625, "learning_rate": 9.472562106043754e-06, "loss": 3.3318, "mean_token_accuracy": 0.41741799347784386, "step": 2845 }, { "epoch": 0.5276232851316277, "grad_norm": 7.20703125, "learning_rate": 9.472376714868372e-06, "loss": 2.8259, "mean_token_accuracy": 0.4221891288160834, "step": 2846 }, { "epoch": 0.5278086763070078, "grad_norm": 6.28515625, "learning_rate": 9.472191323692993e-06, "loss": 2.5206, "mean_token_accuracy": 0.46992431284026026, "step": 2847 }, { "epoch": 0.5279940674823879, "grad_norm": 5.93359375, "learning_rate": 9.472005932517613e-06, "loss": 3.0519, "mean_token_accuracy": 0.4161189899688689, "step": 2848 }, { "epoch": 0.5281794586577679, "grad_norm": 6.86328125, "learning_rate": 9.471820541342232e-06, "loss": 2.8259, "mean_token_accuracy": 0.45199303207295827, "step": 2849 }, { "epoch": 0.5283648498331479, "grad_norm": 6.11328125, "learning_rate": 9.471635150166853e-06, "loss": 2.6738, "mean_token_accuracy": 0.46475475743768424, "step": 2850 }, { "epoch": 0.528550241008528, "grad_norm": 6.125, "learning_rate": 9.471449758991471e-06, "loss": 2.4583, "mean_token_accuracy": 0.48299732815156665, "step": 2851 }, { "epoch": 0.5287356321839081, "grad_norm": 8.609375, "learning_rate": 9.471264367816094e-06, "loss": 2.8479, "mean_token_accuracy": 0.4226044226044226, "step": 2852 }, { "epoch": 0.5289210233592881, "grad_norm": 5.45703125, "learning_rate": 9.471078976640712e-06, "loss": 2.8219, "mean_token_accuracy": 0.4557321225879682, "step": 2853 }, { "epoch": 0.5291064145346681, "grad_norm": 5.55078125, "learning_rate": 9.470893585465333e-06, "loss": 2.5461, "mean_token_accuracy": 0.4616444015975761, "step": 2854 }, { "epoch": 0.5292918057100482, "grad_norm": 5.9375, "learning_rate": 9.470708194289952e-06, "loss": 3.7396, "mean_token_accuracy": 0.35271155722849573, "step": 2855 }, { "epoch": 0.5294771968854283, "grad_norm": 7.33203125, "learning_rate": 9.470522803114572e-06, "loss": 3.5868, "mean_token_accuracy": 0.38662379421221865, "step": 2856 }, { "epoch": 0.5296625880608083, "grad_norm": 7.640625, "learning_rate": 9.470337411939193e-06, "loss": 2.6328, "mean_token_accuracy": 0.46317171938272955, "step": 2857 }, { "epoch": 0.5298479792361883, "grad_norm": 7.84765625, "learning_rate": 9.470152020763812e-06, "loss": 2.7643, "mean_token_accuracy": 0.446376181679887, "step": 2858 }, { "epoch": 0.5300333704115684, "grad_norm": 7.23046875, "learning_rate": 9.469966629588432e-06, "loss": 2.6675, "mean_token_accuracy": 0.4337309107122952, "step": 2859 }, { "epoch": 0.5302187615869485, "grad_norm": 7.19921875, "learning_rate": 9.469781238413053e-06, "loss": 2.8339, "mean_token_accuracy": 0.43744787322768974, "step": 2860 }, { "epoch": 0.5304041527623286, "grad_norm": 6.71875, "learning_rate": 9.469595847237673e-06, "loss": 2.5633, "mean_token_accuracy": 0.48539857932123126, "step": 2861 }, { "epoch": 0.5305895439377085, "grad_norm": 7.05859375, "learning_rate": 9.469410456062292e-06, "loss": 2.7932, "mean_token_accuracy": 0.4282890401932792, "step": 2862 }, { "epoch": 0.5307749351130886, "grad_norm": 7.375, "learning_rate": 9.469225064886912e-06, "loss": 2.0528, "mean_token_accuracy": 0.5328171091445427, "step": 2863 }, { "epoch": 0.5309603262884687, "grad_norm": 5.9921875, "learning_rate": 9.469039673711533e-06, "loss": 2.9425, "mean_token_accuracy": 0.45242537313432835, "step": 2864 }, { "epoch": 0.5311457174638488, "grad_norm": 6.5859375, "learning_rate": 9.468854282536152e-06, "loss": 2.9216, "mean_token_accuracy": 0.4252167982071519, "step": 2865 }, { "epoch": 0.5313311086392287, "grad_norm": 5.25390625, "learning_rate": 9.468668891360772e-06, "loss": 2.718, "mean_token_accuracy": 0.45558112773302645, "step": 2866 }, { "epoch": 0.5315164998146088, "grad_norm": 7.66015625, "learning_rate": 9.468483500185391e-06, "loss": 2.9829, "mean_token_accuracy": 0.43725187472430527, "step": 2867 }, { "epoch": 0.5317018909899889, "grad_norm": 8.1328125, "learning_rate": 9.468298109010013e-06, "loss": 2.4871, "mean_token_accuracy": 0.46457098685905696, "step": 2868 }, { "epoch": 0.531887282165369, "grad_norm": 7.640625, "learning_rate": 9.468112717834632e-06, "loss": 2.441, "mean_token_accuracy": 0.47692307692307695, "step": 2869 }, { "epoch": 0.5320726733407489, "grad_norm": 7.26953125, "learning_rate": 9.467927326659252e-06, "loss": 2.3449, "mean_token_accuracy": 0.49765411893071465, "step": 2870 }, { "epoch": 0.532258064516129, "grad_norm": 7.14453125, "learning_rate": 9.467741935483871e-06, "loss": 2.8053, "mean_token_accuracy": 0.43725915221579964, "step": 2871 }, { "epoch": 0.5324434556915091, "grad_norm": 6.2421875, "learning_rate": 9.467556544308492e-06, "loss": 2.9417, "mean_token_accuracy": 0.42860831396849985, "step": 2872 }, { "epoch": 0.5326288468668892, "grad_norm": 8.1171875, "learning_rate": 9.467371153133112e-06, "loss": 2.6035, "mean_token_accuracy": 0.46498980285520053, "step": 2873 }, { "epoch": 0.5328142380422692, "grad_norm": 5.73828125, "learning_rate": 9.467185761957731e-06, "loss": 3.1883, "mean_token_accuracy": 0.4301543824701195, "step": 2874 }, { "epoch": 0.5329996292176492, "grad_norm": 5.6640625, "learning_rate": 9.467000370782351e-06, "loss": 2.8045, "mean_token_accuracy": 0.43950039032006244, "step": 2875 }, { "epoch": 0.5331850203930293, "grad_norm": 8.4296875, "learning_rate": 9.466814979606972e-06, "loss": 2.2215, "mean_token_accuracy": 0.4894362671472915, "step": 2876 }, { "epoch": 0.5333704115684094, "grad_norm": 6.4765625, "learning_rate": 9.466629588431592e-06, "loss": 3.0898, "mean_token_accuracy": 0.39997390056113796, "step": 2877 }, { "epoch": 0.5335558027437894, "grad_norm": 6.125, "learning_rate": 9.466444197256211e-06, "loss": 2.4839, "mean_token_accuracy": 0.5142328164776672, "step": 2878 }, { "epoch": 0.5337411939191694, "grad_norm": 5.7734375, "learning_rate": 9.466258806080832e-06, "loss": 2.9468, "mean_token_accuracy": 0.4346468561584841, "step": 2879 }, { "epoch": 0.5339265850945495, "grad_norm": 6.4296875, "learning_rate": 9.46607341490545e-06, "loss": 3.1948, "mean_token_accuracy": 0.4038237953752745, "step": 2880 }, { "epoch": 0.5341119762699296, "grad_norm": 6.2734375, "learning_rate": 9.465888023730071e-06, "loss": 2.7831, "mean_token_accuracy": 0.443679880329095, "step": 2881 }, { "epoch": 0.5342973674453096, "grad_norm": 6.4375, "learning_rate": 9.465702632554691e-06, "loss": 2.5558, "mean_token_accuracy": 0.4688601645123384, "step": 2882 }, { "epoch": 0.5344827586206896, "grad_norm": 5.3125, "learning_rate": 9.46551724137931e-06, "loss": 3.1536, "mean_token_accuracy": 0.4200789343857918, "step": 2883 }, { "epoch": 0.5346681497960697, "grad_norm": 4.85546875, "learning_rate": 9.46533185020393e-06, "loss": 3.0953, "mean_token_accuracy": 0.43098938298199724, "step": 2884 }, { "epoch": 0.5348535409714498, "grad_norm": 5.578125, "learning_rate": 9.465146459028551e-06, "loss": 3.1416, "mean_token_accuracy": 0.40680737217598095, "step": 2885 }, { "epoch": 0.5350389321468298, "grad_norm": 5.78125, "learning_rate": 9.464961067853172e-06, "loss": 2.9831, "mean_token_accuracy": 0.42898587285570133, "step": 2886 }, { "epoch": 0.5352243233222098, "grad_norm": 6.07421875, "learning_rate": 9.46477567667779e-06, "loss": 2.8269, "mean_token_accuracy": 0.4522691705790297, "step": 2887 }, { "epoch": 0.5354097144975899, "grad_norm": 7.375, "learning_rate": 9.464590285502411e-06, "loss": 2.4955, "mean_token_accuracy": 0.4757738896366083, "step": 2888 }, { "epoch": 0.53559510567297, "grad_norm": 4.9921875, "learning_rate": 9.46440489432703e-06, "loss": 2.8393, "mean_token_accuracy": 0.4341190108191654, "step": 2889 }, { "epoch": 0.53578049684835, "grad_norm": 7.0, "learning_rate": 9.46421950315165e-06, "loss": 3.0145, "mean_token_accuracy": 0.40296851158274377, "step": 2890 }, { "epoch": 0.53596588802373, "grad_norm": 5.4296875, "learning_rate": 9.46403411197627e-06, "loss": 3.3636, "mean_token_accuracy": 0.38802889576883387, "step": 2891 }, { "epoch": 0.5361512791991101, "grad_norm": 8.4609375, "learning_rate": 9.463848720800891e-06, "loss": 2.6097, "mean_token_accuracy": 0.46801470588235294, "step": 2892 }, { "epoch": 0.5363366703744902, "grad_norm": 6.0703125, "learning_rate": 9.46366332962551e-06, "loss": 2.7892, "mean_token_accuracy": 0.4475542431634356, "step": 2893 }, { "epoch": 0.5365220615498703, "grad_norm": 11.4921875, "learning_rate": 9.46347793845013e-06, "loss": 2.626, "mean_token_accuracy": 0.47231705506902183, "step": 2894 }, { "epoch": 0.5367074527252503, "grad_norm": 5.21484375, "learning_rate": 9.463292547274751e-06, "loss": 2.7213, "mean_token_accuracy": 0.45841784989858014, "step": 2895 }, { "epoch": 0.5368928439006303, "grad_norm": 6.36328125, "learning_rate": 9.46310715609937e-06, "loss": 2.6026, "mean_token_accuracy": 0.4678819444444444, "step": 2896 }, { "epoch": 0.5370782350760104, "grad_norm": 6.03515625, "learning_rate": 9.46292176492399e-06, "loss": 2.8458, "mean_token_accuracy": 0.4189961880559085, "step": 2897 }, { "epoch": 0.5372636262513905, "grad_norm": 9.3671875, "learning_rate": 9.46273637374861e-06, "loss": 2.5886, "mean_token_accuracy": 0.4768760907504363, "step": 2898 }, { "epoch": 0.5374490174267705, "grad_norm": 6.703125, "learning_rate": 9.46255098257323e-06, "loss": 3.2033, "mean_token_accuracy": 0.39785843415380334, "step": 2899 }, { "epoch": 0.5376344086021505, "grad_norm": 7.3671875, "learning_rate": 9.46236559139785e-06, "loss": 3.301, "mean_token_accuracy": 0.38910632746249185, "step": 2900 }, { "epoch": 0.5378197997775306, "grad_norm": 5.90234375, "learning_rate": 9.46218020022247e-06, "loss": 3.1398, "mean_token_accuracy": 0.41653112524027797, "step": 2901 }, { "epoch": 0.5380051909529107, "grad_norm": 5.52734375, "learning_rate": 9.461994809047091e-06, "loss": 2.8023, "mean_token_accuracy": 0.44234960767218834, "step": 2902 }, { "epoch": 0.5381905821282907, "grad_norm": 7.58984375, "learning_rate": 9.46180941787171e-06, "loss": 2.8442, "mean_token_accuracy": 0.4365420812046249, "step": 2903 }, { "epoch": 0.5383759733036707, "grad_norm": 10.3671875, "learning_rate": 9.46162402669633e-06, "loss": 2.2696, "mean_token_accuracy": 0.5064337150277417, "step": 2904 }, { "epoch": 0.5385613644790508, "grad_norm": 6.4609375, "learning_rate": 9.46143863552095e-06, "loss": 2.5671, "mean_token_accuracy": 0.4740759116844456, "step": 2905 }, { "epoch": 0.5387467556544309, "grad_norm": 6.0, "learning_rate": 9.46125324434557e-06, "loss": 3.1581, "mean_token_accuracy": 0.40246105215624295, "step": 2906 }, { "epoch": 0.5389321468298109, "grad_norm": 6.59765625, "learning_rate": 9.46106785317019e-06, "loss": 3.2238, "mean_token_accuracy": 0.4177092021128718, "step": 2907 }, { "epoch": 0.5391175380051909, "grad_norm": 6.35546875, "learning_rate": 9.46088246199481e-06, "loss": 3.2745, "mean_token_accuracy": 0.41226740179186766, "step": 2908 }, { "epoch": 0.539302929180571, "grad_norm": 5.359375, "learning_rate": 9.46069707081943e-06, "loss": 3.2445, "mean_token_accuracy": 0.3871749313519625, "step": 2909 }, { "epoch": 0.5394883203559511, "grad_norm": 8.71875, "learning_rate": 9.46051167964405e-06, "loss": 2.5877, "mean_token_accuracy": 0.483264761481152, "step": 2910 }, { "epoch": 0.5396737115313311, "grad_norm": 6.671875, "learning_rate": 9.46032628846867e-06, "loss": 2.737, "mean_token_accuracy": 0.4463423253622335, "step": 2911 }, { "epoch": 0.5398591027067111, "grad_norm": 5.40625, "learning_rate": 9.46014089729329e-06, "loss": 2.9582, "mean_token_accuracy": 0.43233743409490333, "step": 2912 }, { "epoch": 0.5400444938820912, "grad_norm": 5.703125, "learning_rate": 9.45995550611791e-06, "loss": 2.5401, "mean_token_accuracy": 0.4796905222437137, "step": 2913 }, { "epoch": 0.5402298850574713, "grad_norm": 4.90234375, "learning_rate": 9.459770114942529e-06, "loss": 2.5644, "mean_token_accuracy": 0.4775074183976261, "step": 2914 }, { "epoch": 0.5404152762328513, "grad_norm": 7.8984375, "learning_rate": 9.459584723767149e-06, "loss": 2.5894, "mean_token_accuracy": 0.46721132897603485, "step": 2915 }, { "epoch": 0.5406006674082313, "grad_norm": 8.1015625, "learning_rate": 9.45939933259177e-06, "loss": 3.0651, "mean_token_accuracy": 0.41386580677167306, "step": 2916 }, { "epoch": 0.5407860585836114, "grad_norm": 6.64453125, "learning_rate": 9.45921394141639e-06, "loss": 2.3646, "mean_token_accuracy": 0.4939033348024093, "step": 2917 }, { "epoch": 0.5409714497589915, "grad_norm": 10.0, "learning_rate": 9.459028550241009e-06, "loss": 2.7577, "mean_token_accuracy": 0.44311887515977844, "step": 2918 }, { "epoch": 0.5411568409343716, "grad_norm": 7.28515625, "learning_rate": 9.45884315906563e-06, "loss": 2.7212, "mean_token_accuracy": 0.43828880511391577, "step": 2919 }, { "epoch": 0.5413422321097516, "grad_norm": 6.32421875, "learning_rate": 9.45865776789025e-06, "loss": 2.5071, "mean_token_accuracy": 0.47138209422822025, "step": 2920 }, { "epoch": 0.5415276232851316, "grad_norm": 5.4609375, "learning_rate": 9.458472376714869e-06, "loss": 3.0111, "mean_token_accuracy": 0.4313127892538661, "step": 2921 }, { "epoch": 0.5417130144605117, "grad_norm": 8.234375, "learning_rate": 9.458286985539489e-06, "loss": 2.5636, "mean_token_accuracy": 0.4563631790744467, "step": 2922 }, { "epoch": 0.5418984056358918, "grad_norm": 6.828125, "learning_rate": 9.458101594364108e-06, "loss": 2.6473, "mean_token_accuracy": 0.4421920872361247, "step": 2923 }, { "epoch": 0.5420837968112718, "grad_norm": 5.9453125, "learning_rate": 9.45791620318873e-06, "loss": 2.7071, "mean_token_accuracy": 0.44041031178296663, "step": 2924 }, { "epoch": 0.5422691879866518, "grad_norm": 6.38671875, "learning_rate": 9.457730812013349e-06, "loss": 2.4567, "mean_token_accuracy": 0.4776157585060118, "step": 2925 }, { "epoch": 0.5424545791620319, "grad_norm": 12.4921875, "learning_rate": 9.45754542083797e-06, "loss": 2.6182, "mean_token_accuracy": 0.45106642291285803, "step": 2926 }, { "epoch": 0.542639970337412, "grad_norm": 7.875, "learning_rate": 9.457360029662588e-06, "loss": 2.6773, "mean_token_accuracy": 0.4423870383415518, "step": 2927 }, { "epoch": 0.542825361512792, "grad_norm": 9.3984375, "learning_rate": 9.457174638487209e-06, "loss": 2.6585, "mean_token_accuracy": 0.441566356849002, "step": 2928 }, { "epoch": 0.543010752688172, "grad_norm": 6.26171875, "learning_rate": 9.456989247311829e-06, "loss": 2.9728, "mean_token_accuracy": 0.428536375904797, "step": 2929 }, { "epoch": 0.5431961438635521, "grad_norm": 7.55859375, "learning_rate": 9.456803856136448e-06, "loss": 2.774, "mean_token_accuracy": 0.4648145067376476, "step": 2930 }, { "epoch": 0.5433815350389322, "grad_norm": 5.61328125, "learning_rate": 9.456618464961068e-06, "loss": 3.0686, "mean_token_accuracy": 0.42081862168170525, "step": 2931 }, { "epoch": 0.5435669262143122, "grad_norm": 4.7578125, "learning_rate": 9.456433073785689e-06, "loss": 3.1302, "mean_token_accuracy": 0.4389503573876432, "step": 2932 }, { "epoch": 0.5437523173896922, "grad_norm": 7.01171875, "learning_rate": 9.45624768261031e-06, "loss": 3.0709, "mean_token_accuracy": 0.4141399829980164, "step": 2933 }, { "epoch": 0.5439377085650723, "grad_norm": 5.66796875, "learning_rate": 9.456062291434928e-06, "loss": 3.158, "mean_token_accuracy": 0.4203651685393258, "step": 2934 }, { "epoch": 0.5441230997404524, "grad_norm": 6.2890625, "learning_rate": 9.455876900259549e-06, "loss": 2.6696, "mean_token_accuracy": 0.45157593123209167, "step": 2935 }, { "epoch": 0.5443084909158324, "grad_norm": 5.53125, "learning_rate": 9.455691509084168e-06, "loss": 2.4949, "mean_token_accuracy": 0.48263291139240505, "step": 2936 }, { "epoch": 0.5444938820912124, "grad_norm": 5.703125, "learning_rate": 9.455506117908788e-06, "loss": 3.5146, "mean_token_accuracy": 0.38159203980099504, "step": 2937 }, { "epoch": 0.5446792732665925, "grad_norm": 7.42578125, "learning_rate": 9.455320726733408e-06, "loss": 3.0979, "mean_token_accuracy": 0.40545303752850925, "step": 2938 }, { "epoch": 0.5448646644419726, "grad_norm": 5.65625, "learning_rate": 9.455135335558027e-06, "loss": 2.6137, "mean_token_accuracy": 0.49430765157532436, "step": 2939 }, { "epoch": 0.5450500556173526, "grad_norm": 7.38671875, "learning_rate": 9.45494994438265e-06, "loss": 2.5808, "mean_token_accuracy": 0.4534042843498431, "step": 2940 }, { "epoch": 0.5452354467927326, "grad_norm": 6.89453125, "learning_rate": 9.454764553207268e-06, "loss": 2.7685, "mean_token_accuracy": 0.43670779111035996, "step": 2941 }, { "epoch": 0.5454208379681127, "grad_norm": 5.9453125, "learning_rate": 9.454579162031889e-06, "loss": 2.3338, "mean_token_accuracy": 0.5088801184015787, "step": 2942 }, { "epoch": 0.5456062291434928, "grad_norm": 6.41796875, "learning_rate": 9.454393770856508e-06, "loss": 2.7579, "mean_token_accuracy": 0.4438618925831202, "step": 2943 }, { "epoch": 0.5457916203188728, "grad_norm": 5.4921875, "learning_rate": 9.454208379681128e-06, "loss": 2.7417, "mean_token_accuracy": 0.4478921463727798, "step": 2944 }, { "epoch": 0.5459770114942529, "grad_norm": 7.62109375, "learning_rate": 9.454022988505749e-06, "loss": 3.536, "mean_token_accuracy": 0.36312514955731034, "step": 2945 }, { "epoch": 0.5461624026696329, "grad_norm": 5.5859375, "learning_rate": 9.453837597330367e-06, "loss": 3.0594, "mean_token_accuracy": 0.410159416373953, "step": 2946 }, { "epoch": 0.546347793845013, "grad_norm": 7.42578125, "learning_rate": 9.453652206154988e-06, "loss": 2.5315, "mean_token_accuracy": 0.45857005038053383, "step": 2947 }, { "epoch": 0.546533185020393, "grad_norm": 7.29296875, "learning_rate": 9.453466814979608e-06, "loss": 3.0364, "mean_token_accuracy": 0.40353121801432956, "step": 2948 }, { "epoch": 0.5467185761957731, "grad_norm": 5.26171875, "learning_rate": 9.453281423804229e-06, "loss": 2.588, "mean_token_accuracy": 0.47316704459561604, "step": 2949 }, { "epoch": 0.5469039673711531, "grad_norm": 5.5390625, "learning_rate": 9.453096032628848e-06, "loss": 2.7083, "mean_token_accuracy": 0.44113295286408793, "step": 2950 }, { "epoch": 0.5470893585465332, "grad_norm": 7.05859375, "learning_rate": 9.452910641453468e-06, "loss": 2.768, "mean_token_accuracy": 0.4349468713105077, "step": 2951 }, { "epoch": 0.5472747497219133, "grad_norm": 6.171875, "learning_rate": 9.452725250278087e-06, "loss": 3.013, "mean_token_accuracy": 0.4283493132935994, "step": 2952 }, { "epoch": 0.5474601408972933, "grad_norm": 5.9609375, "learning_rate": 9.452539859102707e-06, "loss": 3.329, "mean_token_accuracy": 0.38268927444794953, "step": 2953 }, { "epoch": 0.5476455320726733, "grad_norm": 6.0078125, "learning_rate": 9.452354467927328e-06, "loss": 2.7009, "mean_token_accuracy": 0.4593716143011918, "step": 2954 }, { "epoch": 0.5478309232480534, "grad_norm": 6.05078125, "learning_rate": 9.452169076751947e-06, "loss": 2.8836, "mean_token_accuracy": 0.4247697031729785, "step": 2955 }, { "epoch": 0.5480163144234335, "grad_norm": 6.375, "learning_rate": 9.451983685576567e-06, "loss": 2.8153, "mean_token_accuracy": 0.43841548847280604, "step": 2956 }, { "epoch": 0.5482017055988135, "grad_norm": 6.5390625, "learning_rate": 9.451798294401188e-06, "loss": 3.0976, "mean_token_accuracy": 0.4335485606672047, "step": 2957 }, { "epoch": 0.5483870967741935, "grad_norm": 7.2890625, "learning_rate": 9.451612903225808e-06, "loss": 2.5472, "mean_token_accuracy": 0.4703525641025641, "step": 2958 }, { "epoch": 0.5485724879495736, "grad_norm": 5.52734375, "learning_rate": 9.451427512050427e-06, "loss": 2.7522, "mean_token_accuracy": 0.4346079246328623, "step": 2959 }, { "epoch": 0.5487578791249537, "grad_norm": 7.03515625, "learning_rate": 9.451242120875047e-06, "loss": 2.7747, "mean_token_accuracy": 0.4698614125897479, "step": 2960 }, { "epoch": 0.5489432703003337, "grad_norm": 6.85546875, "learning_rate": 9.451056729699666e-06, "loss": 2.9089, "mean_token_accuracy": 0.4175675675675676, "step": 2961 }, { "epoch": 0.5491286614757137, "grad_norm": 5.9765625, "learning_rate": 9.450871338524287e-06, "loss": 3.1658, "mean_token_accuracy": 0.39394642235880784, "step": 2962 }, { "epoch": 0.5493140526510938, "grad_norm": 6.37109375, "learning_rate": 9.450685947348907e-06, "loss": 2.9053, "mean_token_accuracy": 0.4261866744084773, "step": 2963 }, { "epoch": 0.5494994438264739, "grad_norm": 7.328125, "learning_rate": 9.450500556173528e-06, "loss": 2.8157, "mean_token_accuracy": 0.43747619652151837, "step": 2964 }, { "epoch": 0.5496848350018539, "grad_norm": 6.5859375, "learning_rate": 9.450315164998147e-06, "loss": 3.4574, "mean_token_accuracy": 0.3892110586648685, "step": 2965 }, { "epoch": 0.5498702261772339, "grad_norm": 10.390625, "learning_rate": 9.450129773822767e-06, "loss": 2.0949, "mean_token_accuracy": 0.5319148936170213, "step": 2966 }, { "epoch": 0.550055617352614, "grad_norm": 6.546875, "learning_rate": 9.449944382647387e-06, "loss": 2.7709, "mean_token_accuracy": 0.44807162534435263, "step": 2967 }, { "epoch": 0.5502410085279941, "grad_norm": 6.1640625, "learning_rate": 9.449758991472006e-06, "loss": 3.0282, "mean_token_accuracy": 0.4307030514854147, "step": 2968 }, { "epoch": 0.5504263997033741, "grad_norm": 5.515625, "learning_rate": 9.449573600296627e-06, "loss": 2.5391, "mean_token_accuracy": 0.4617539585870889, "step": 2969 }, { "epoch": 0.5506117908787542, "grad_norm": 6.375, "learning_rate": 9.449388209121246e-06, "loss": 2.6059, "mean_token_accuracy": 0.4758575197889182, "step": 2970 }, { "epoch": 0.5507971820541342, "grad_norm": 4.83203125, "learning_rate": 9.449202817945866e-06, "loss": 2.8657, "mean_token_accuracy": 0.42155331534857304, "step": 2971 }, { "epoch": 0.5509825732295143, "grad_norm": 5.75390625, "learning_rate": 9.449017426770487e-06, "loss": 3.047, "mean_token_accuracy": 0.414181732801127, "step": 2972 }, { "epoch": 0.5511679644048944, "grad_norm": 5.71484375, "learning_rate": 9.448832035595107e-06, "loss": 3.8133, "mean_token_accuracy": 0.3526508742244783, "step": 2973 }, { "epoch": 0.5513533555802744, "grad_norm": 6.3515625, "learning_rate": 9.448646644419726e-06, "loss": 2.6387, "mean_token_accuracy": 0.4614587381785205, "step": 2974 }, { "epoch": 0.5515387467556544, "grad_norm": 4.8984375, "learning_rate": 9.448461253244346e-06, "loss": 3.3632, "mean_token_accuracy": 0.3841771332383789, "step": 2975 }, { "epoch": 0.5517241379310345, "grad_norm": 6.38671875, "learning_rate": 9.448275862068967e-06, "loss": 2.5651, "mean_token_accuracy": 0.4772513335425165, "step": 2976 }, { "epoch": 0.5519095291064146, "grad_norm": 4.96484375, "learning_rate": 9.448090470893586e-06, "loss": 2.7462, "mean_token_accuracy": 0.45734399236367973, "step": 2977 }, { "epoch": 0.5520949202817946, "grad_norm": 6.16796875, "learning_rate": 9.447905079718206e-06, "loss": 3.2056, "mean_token_accuracy": 0.4206884315117104, "step": 2978 }, { "epoch": 0.5522803114571746, "grad_norm": 6.34375, "learning_rate": 9.447719688542825e-06, "loss": 2.9688, "mean_token_accuracy": 0.4270218151138966, "step": 2979 }, { "epoch": 0.5524657026325547, "grad_norm": 5.66796875, "learning_rate": 9.447534297367445e-06, "loss": 3.1863, "mean_token_accuracy": 0.39348863317429134, "step": 2980 }, { "epoch": 0.5526510938079348, "grad_norm": 7.2421875, "learning_rate": 9.447348906192066e-06, "loss": 2.5402, "mean_token_accuracy": 0.47466196355085244, "step": 2981 }, { "epoch": 0.5528364849833148, "grad_norm": 9.1796875, "learning_rate": 9.447163515016686e-06, "loss": 2.5158, "mean_token_accuracy": 0.4834305960817007, "step": 2982 }, { "epoch": 0.5530218761586948, "grad_norm": 7.0, "learning_rate": 9.446978123841307e-06, "loss": 2.8011, "mean_token_accuracy": 0.4426123921164498, "step": 2983 }, { "epoch": 0.5532072673340749, "grad_norm": 6.33984375, "learning_rate": 9.446792732665926e-06, "loss": 3.1543, "mean_token_accuracy": 0.41539482415394824, "step": 2984 }, { "epoch": 0.553392658509455, "grad_norm": 8.7890625, "learning_rate": 9.446607341490546e-06, "loss": 3.1289, "mean_token_accuracy": 0.40719923463286295, "step": 2985 }, { "epoch": 0.553578049684835, "grad_norm": 7.9375, "learning_rate": 9.446421950315165e-06, "loss": 2.9219, "mean_token_accuracy": 0.4214385033497661, "step": 2986 }, { "epoch": 0.553763440860215, "grad_norm": 6.62109375, "learning_rate": 9.446236559139785e-06, "loss": 2.798, "mean_token_accuracy": 0.4602012808783166, "step": 2987 }, { "epoch": 0.5539488320355951, "grad_norm": 5.36328125, "learning_rate": 9.446051167964406e-06, "loss": 3.1805, "mean_token_accuracy": 0.40885446769159633, "step": 2988 }, { "epoch": 0.5541342232109752, "grad_norm": 9.7734375, "learning_rate": 9.445865776789026e-06, "loss": 2.7911, "mean_token_accuracy": 0.4408098028769313, "step": 2989 }, { "epoch": 0.5543196143863552, "grad_norm": 6.3359375, "learning_rate": 9.445680385613645e-06, "loss": 2.9232, "mean_token_accuracy": 0.4124911785462244, "step": 2990 }, { "epoch": 0.5545050055617352, "grad_norm": 5.9296875, "learning_rate": 9.445494994438266e-06, "loss": 2.7367, "mean_token_accuracy": 0.4612546125461255, "step": 2991 }, { "epoch": 0.5546903967371153, "grad_norm": 6.3828125, "learning_rate": 9.445309603262886e-06, "loss": 2.4229, "mean_token_accuracy": 0.48038161784466543, "step": 2992 }, { "epoch": 0.5548757879124954, "grad_norm": 6.16015625, "learning_rate": 9.445124212087505e-06, "loss": 2.948, "mean_token_accuracy": 0.43003091813415784, "step": 2993 }, { "epoch": 0.5550611790878754, "grad_norm": 6.21484375, "learning_rate": 9.444938820912126e-06, "loss": 3.1607, "mean_token_accuracy": 0.40956511546125896, "step": 2994 }, { "epoch": 0.5552465702632555, "grad_norm": 6.21484375, "learning_rate": 9.444753429736744e-06, "loss": 2.9538, "mean_token_accuracy": 0.42231172952520446, "step": 2995 }, { "epoch": 0.5554319614386355, "grad_norm": 5.97265625, "learning_rate": 9.444568038561365e-06, "loss": 2.6489, "mean_token_accuracy": 0.45564516129032256, "step": 2996 }, { "epoch": 0.5556173526140156, "grad_norm": 6.0703125, "learning_rate": 9.444382647385985e-06, "loss": 2.4631, "mean_token_accuracy": 0.49040011725047633, "step": 2997 }, { "epoch": 0.5558027437893956, "grad_norm": 6.2734375, "learning_rate": 9.444197256210606e-06, "loss": 2.9895, "mean_token_accuracy": 0.42658137882018476, "step": 2998 }, { "epoch": 0.5559881349647757, "grad_norm": 7.36328125, "learning_rate": 9.444011865035225e-06, "loss": 2.9292, "mean_token_accuracy": 0.40367264621590065, "step": 2999 }, { "epoch": 0.5561735261401557, "grad_norm": 6.921875, "learning_rate": 9.443826473859845e-06, "loss": 2.6649, "mean_token_accuracy": 0.45190246516613075, "step": 3000 }, { "epoch": 0.5563589173155358, "grad_norm": 5.45703125, "learning_rate": 9.443641082684466e-06, "loss": 2.5858, "mean_token_accuracy": 0.46935707678075855, "step": 3001 }, { "epoch": 0.5565443084909159, "grad_norm": 5.80078125, "learning_rate": 9.443455691509084e-06, "loss": 3.2635, "mean_token_accuracy": 0.40212363330529854, "step": 3002 }, { "epoch": 0.5567296996662959, "grad_norm": 9.0546875, "learning_rate": 9.443270300333705e-06, "loss": 2.4568, "mean_token_accuracy": 0.46580683863227357, "step": 3003 }, { "epoch": 0.5569150908416759, "grad_norm": 10.28125, "learning_rate": 9.443084909158324e-06, "loss": 3.0562, "mean_token_accuracy": 0.4003463476070529, "step": 3004 }, { "epoch": 0.557100482017056, "grad_norm": 5.1875, "learning_rate": 9.442899517982946e-06, "loss": 2.8459, "mean_token_accuracy": 0.43414223593771917, "step": 3005 }, { "epoch": 0.557285873192436, "grad_norm": 7.66796875, "learning_rate": 9.442714126807565e-06, "loss": 2.4622, "mean_token_accuracy": 0.4695671042638603, "step": 3006 }, { "epoch": 0.5574712643678161, "grad_norm": 6.7578125, "learning_rate": 9.442528735632185e-06, "loss": 3.1757, "mean_token_accuracy": 0.4162598889075913, "step": 3007 }, { "epoch": 0.5576566555431961, "grad_norm": 6.91796875, "learning_rate": 9.442343344456804e-06, "loss": 2.7553, "mean_token_accuracy": 0.4263616274535292, "step": 3008 }, { "epoch": 0.5578420467185762, "grad_norm": 8.3203125, "learning_rate": 9.442157953281424e-06, "loss": 2.3425, "mean_token_accuracy": 0.4893809893809894, "step": 3009 }, { "epoch": 0.5580274378939563, "grad_norm": 5.45703125, "learning_rate": 9.441972562106045e-06, "loss": 2.6553, "mean_token_accuracy": 0.4653410660038663, "step": 3010 }, { "epoch": 0.5582128290693363, "grad_norm": 5.81640625, "learning_rate": 9.441787170930664e-06, "loss": 3.0349, "mean_token_accuracy": 0.3961129106894956, "step": 3011 }, { "epoch": 0.5583982202447163, "grad_norm": 11.078125, "learning_rate": 9.441601779755284e-06, "loss": 2.738, "mean_token_accuracy": 0.4572911122937733, "step": 3012 }, { "epoch": 0.5585836114200964, "grad_norm": 8.1328125, "learning_rate": 9.441416388579905e-06, "loss": 3.2793, "mean_token_accuracy": 0.39879518072289155, "step": 3013 }, { "epoch": 0.5587690025954765, "grad_norm": 6.56640625, "learning_rate": 9.441230997404525e-06, "loss": 2.9748, "mean_token_accuracy": 0.418001800180018, "step": 3014 }, { "epoch": 0.5589543937708565, "grad_norm": 5.921875, "learning_rate": 9.441045606229144e-06, "loss": 3.1678, "mean_token_accuracy": 0.3848596468944494, "step": 3015 }, { "epoch": 0.5591397849462365, "grad_norm": 6.35546875, "learning_rate": 9.440860215053764e-06, "loss": 2.815, "mean_token_accuracy": 0.4421269768507907, "step": 3016 }, { "epoch": 0.5593251761216166, "grad_norm": 7.61328125, "learning_rate": 9.440674823878383e-06, "loss": 3.1755, "mean_token_accuracy": 0.4004226954966672, "step": 3017 }, { "epoch": 0.5595105672969967, "grad_norm": 9.4140625, "learning_rate": 9.440489432703004e-06, "loss": 2.7947, "mean_token_accuracy": 0.4178360199466543, "step": 3018 }, { "epoch": 0.5596959584723767, "grad_norm": 6.328125, "learning_rate": 9.440304041527624e-06, "loss": 2.8986, "mean_token_accuracy": 0.43343792021515015, "step": 3019 }, { "epoch": 0.5598813496477568, "grad_norm": 6.81640625, "learning_rate": 9.440118650352243e-06, "loss": 3.1107, "mean_token_accuracy": 0.39828693790149894, "step": 3020 }, { "epoch": 0.5600667408231368, "grad_norm": 6.640625, "learning_rate": 9.439933259176865e-06, "loss": 3.0485, "mean_token_accuracy": 0.4369486892781712, "step": 3021 }, { "epoch": 0.5602521319985169, "grad_norm": 5.734375, "learning_rate": 9.439747868001484e-06, "loss": 2.9654, "mean_token_accuracy": 0.41912688442211055, "step": 3022 }, { "epoch": 0.560437523173897, "grad_norm": 7.82421875, "learning_rate": 9.439562476826105e-06, "loss": 3.0825, "mean_token_accuracy": 0.42078722510155486, "step": 3023 }, { "epoch": 0.560622914349277, "grad_norm": 6.18359375, "learning_rate": 9.439377085650723e-06, "loss": 2.9724, "mean_token_accuracy": 0.41904761904761906, "step": 3024 }, { "epoch": 0.560808305524657, "grad_norm": 5.14453125, "learning_rate": 9.439191694475344e-06, "loss": 3.0113, "mean_token_accuracy": 0.4302105661328962, "step": 3025 }, { "epoch": 0.5609936967000371, "grad_norm": 5.43359375, "learning_rate": 9.439006303299964e-06, "loss": 2.8233, "mean_token_accuracy": 0.44569871545348383, "step": 3026 }, { "epoch": 0.5611790878754171, "grad_norm": 7.515625, "learning_rate": 9.438820912124583e-06, "loss": 2.7591, "mean_token_accuracy": 0.4564459930313589, "step": 3027 }, { "epoch": 0.5613644790507972, "grad_norm": 7.69921875, "learning_rate": 9.438635520949204e-06, "loss": 2.3939, "mean_token_accuracy": 0.5017647728566549, "step": 3028 }, { "epoch": 0.5615498702261772, "grad_norm": 6.22265625, "learning_rate": 9.438450129773824e-06, "loss": 2.9177, "mean_token_accuracy": 0.4369124851052562, "step": 3029 }, { "epoch": 0.5617352614015573, "grad_norm": 7.1796875, "learning_rate": 9.438264738598445e-06, "loss": 2.5704, "mean_token_accuracy": 0.4731827262947595, "step": 3030 }, { "epoch": 0.5619206525769374, "grad_norm": 4.796875, "learning_rate": 9.438079347423063e-06, "loss": 2.8088, "mean_token_accuracy": 0.44042695130086723, "step": 3031 }, { "epoch": 0.5621060437523174, "grad_norm": 5.140625, "learning_rate": 9.437893956247684e-06, "loss": 3.172, "mean_token_accuracy": 0.40962025316455697, "step": 3032 }, { "epoch": 0.5622914349276974, "grad_norm": 8.8046875, "learning_rate": 9.437708565072303e-06, "loss": 2.9087, "mean_token_accuracy": 0.4396851122945126, "step": 3033 }, { "epoch": 0.5624768261030775, "grad_norm": 6.28125, "learning_rate": 9.437523173896923e-06, "loss": 3.0916, "mean_token_accuracy": 0.4091234091234091, "step": 3034 }, { "epoch": 0.5626622172784576, "grad_norm": 4.9453125, "learning_rate": 9.437337782721544e-06, "loss": 2.9472, "mean_token_accuracy": 0.4344988344988345, "step": 3035 }, { "epoch": 0.5628476084538376, "grad_norm": 7.94140625, "learning_rate": 9.437152391546162e-06, "loss": 3.5935, "mean_token_accuracy": 0.3597399959374365, "step": 3036 }, { "epoch": 0.5630329996292176, "grad_norm": 9.0703125, "learning_rate": 9.436967000370783e-06, "loss": 3.0361, "mean_token_accuracy": 0.425531914893617, "step": 3037 }, { "epoch": 0.5632183908045977, "grad_norm": 6.4140625, "learning_rate": 9.436781609195403e-06, "loss": 2.7566, "mean_token_accuracy": 0.4479725700655933, "step": 3038 }, { "epoch": 0.5634037819799778, "grad_norm": 4.75, "learning_rate": 9.436596218020024e-06, "loss": 2.9319, "mean_token_accuracy": 0.430103995621237, "step": 3039 }, { "epoch": 0.5635891731553578, "grad_norm": 6.10546875, "learning_rate": 9.436410826844643e-06, "loss": 3.2311, "mean_token_accuracy": 0.41284403669724773, "step": 3040 }, { "epoch": 0.5637745643307378, "grad_norm": 6.05859375, "learning_rate": 9.436225435669263e-06, "loss": 2.6596, "mean_token_accuracy": 0.44805485169175735, "step": 3041 }, { "epoch": 0.5639599555061179, "grad_norm": 8.3515625, "learning_rate": 9.436040044493882e-06, "loss": 2.4301, "mean_token_accuracy": 0.46982656212877166, "step": 3042 }, { "epoch": 0.564145346681498, "grad_norm": 6.3515625, "learning_rate": 9.435854653318502e-06, "loss": 2.5456, "mean_token_accuracy": 0.46542587566240146, "step": 3043 }, { "epoch": 0.564330737856878, "grad_norm": 6.8828125, "learning_rate": 9.435669262143123e-06, "loss": 2.4448, "mean_token_accuracy": 0.5168363351605325, "step": 3044 }, { "epoch": 0.5645161290322581, "grad_norm": 7.12890625, "learning_rate": 9.435483870967743e-06, "loss": 2.7131, "mean_token_accuracy": 0.4623695071619325, "step": 3045 }, { "epoch": 0.5647015202076381, "grad_norm": 6.79296875, "learning_rate": 9.435298479792362e-06, "loss": 2.8111, "mean_token_accuracy": 0.4423773460651959, "step": 3046 }, { "epoch": 0.5648869113830182, "grad_norm": 5.36328125, "learning_rate": 9.435113088616983e-06, "loss": 3.3342, "mean_token_accuracy": 0.4002873563218391, "step": 3047 }, { "epoch": 0.5650723025583982, "grad_norm": 5.7890625, "learning_rate": 9.434927697441603e-06, "loss": 2.7687, "mean_token_accuracy": 0.4469172932330827, "step": 3048 }, { "epoch": 0.5652576937337783, "grad_norm": 9.8046875, "learning_rate": 9.434742306266222e-06, "loss": 2.4697, "mean_token_accuracy": 0.4825788402848423, "step": 3049 }, { "epoch": 0.5654430849091583, "grad_norm": 6.234375, "learning_rate": 9.434556915090843e-06, "loss": 2.7925, "mean_token_accuracy": 0.43545302414535025, "step": 3050 }, { "epoch": 0.5656284760845384, "grad_norm": 5.92578125, "learning_rate": 9.434371523915461e-06, "loss": 3.2016, "mean_token_accuracy": 0.41889014155079546, "step": 3051 }, { "epoch": 0.5658138672599184, "grad_norm": 5.90625, "learning_rate": 9.434186132740082e-06, "loss": 2.6429, "mean_token_accuracy": 0.4780015753347586, "step": 3052 }, { "epoch": 0.5659992584352985, "grad_norm": 7.63671875, "learning_rate": 9.434000741564702e-06, "loss": 2.3174, "mean_token_accuracy": 0.47181788333150926, "step": 3053 }, { "epoch": 0.5661846496106785, "grad_norm": 5.14453125, "learning_rate": 9.433815350389323e-06, "loss": 2.4469, "mean_token_accuracy": 0.48305843242552104, "step": 3054 }, { "epoch": 0.5663700407860586, "grad_norm": 5.12890625, "learning_rate": 9.433629959213942e-06, "loss": 2.4647, "mean_token_accuracy": 0.48691174367043344, "step": 3055 }, { "epoch": 0.5665554319614386, "grad_norm": 5.25, "learning_rate": 9.433444568038562e-06, "loss": 2.5619, "mean_token_accuracy": 0.4682970012172181, "step": 3056 }, { "epoch": 0.5667408231368187, "grad_norm": 5.8359375, "learning_rate": 9.433259176863183e-06, "loss": 2.6465, "mean_token_accuracy": 0.46729328083322746, "step": 3057 }, { "epoch": 0.5669262143121987, "grad_norm": 5.24609375, "learning_rate": 9.433073785687801e-06, "loss": 2.7637, "mean_token_accuracy": 0.4496858694494729, "step": 3058 }, { "epoch": 0.5671116054875788, "grad_norm": 5.82421875, "learning_rate": 9.432888394512422e-06, "loss": 2.3114, "mean_token_accuracy": 0.5154596674453759, "step": 3059 }, { "epoch": 0.5672969966629589, "grad_norm": 6.65625, "learning_rate": 9.43270300333704e-06, "loss": 2.9054, "mean_token_accuracy": 0.42919094728282964, "step": 3060 }, { "epoch": 0.5674823878383389, "grad_norm": 5.03125, "learning_rate": 9.432517612161663e-06, "loss": 3.0225, "mean_token_accuracy": 0.41094117647058825, "step": 3061 }, { "epoch": 0.5676677790137189, "grad_norm": 6.265625, "learning_rate": 9.432332220986282e-06, "loss": 2.4744, "mean_token_accuracy": 0.47181405289874434, "step": 3062 }, { "epoch": 0.567853170189099, "grad_norm": 7.41796875, "learning_rate": 9.432146829810902e-06, "loss": 2.8384, "mean_token_accuracy": 0.4414233805287558, "step": 3063 }, { "epoch": 0.568038561364479, "grad_norm": 7.90234375, "learning_rate": 9.431961438635523e-06, "loss": 3.2231, "mean_token_accuracy": 0.4073697585768742, "step": 3064 }, { "epoch": 0.5682239525398591, "grad_norm": 6.03515625, "learning_rate": 9.431776047460141e-06, "loss": 2.9904, "mean_token_accuracy": 0.4246031746031746, "step": 3065 }, { "epoch": 0.5684093437152391, "grad_norm": 6.94921875, "learning_rate": 9.431590656284762e-06, "loss": 3.1075, "mean_token_accuracy": 0.41306808992398514, "step": 3066 }, { "epoch": 0.5685947348906192, "grad_norm": 10.65625, "learning_rate": 9.43140526510938e-06, "loss": 2.9475, "mean_token_accuracy": 0.41959101237061347, "step": 3067 }, { "epoch": 0.5687801260659993, "grad_norm": 7.07421875, "learning_rate": 9.431219873934001e-06, "loss": 2.9984, "mean_token_accuracy": 0.417858038625533, "step": 3068 }, { "epoch": 0.5689655172413793, "grad_norm": 6.29296875, "learning_rate": 9.431034482758622e-06, "loss": 3.2892, "mean_token_accuracy": 0.3775735919686915, "step": 3069 }, { "epoch": 0.5691509084167594, "grad_norm": 6.75, "learning_rate": 9.430849091583242e-06, "loss": 2.4359, "mean_token_accuracy": 0.47815592565773596, "step": 3070 }, { "epoch": 0.5693362995921394, "grad_norm": 6.02734375, "learning_rate": 9.430663700407861e-06, "loss": 2.946, "mean_token_accuracy": 0.42396166134185304, "step": 3071 }, { "epoch": 0.5695216907675195, "grad_norm": 5.40625, "learning_rate": 9.430478309232481e-06, "loss": 2.2017, "mean_token_accuracy": 0.5070720570219401, "step": 3072 }, { "epoch": 0.5697070819428995, "grad_norm": 5.26953125, "learning_rate": 9.430292918057102e-06, "loss": 3.1126, "mean_token_accuracy": 0.3952633728052266, "step": 3073 }, { "epoch": 0.5698924731182796, "grad_norm": 6.08203125, "learning_rate": 9.43010752688172e-06, "loss": 2.5552, "mean_token_accuracy": 0.4871575342465753, "step": 3074 }, { "epoch": 0.5700778642936596, "grad_norm": 6.4609375, "learning_rate": 9.429922135706341e-06, "loss": 2.6921, "mean_token_accuracy": 0.4547441058079356, "step": 3075 }, { "epoch": 0.5702632554690397, "grad_norm": 5.91015625, "learning_rate": 9.42973674453096e-06, "loss": 2.5115, "mean_token_accuracy": 0.4806201550387597, "step": 3076 }, { "epoch": 0.5704486466444197, "grad_norm": 5.80859375, "learning_rate": 9.429551353355582e-06, "loss": 2.8453, "mean_token_accuracy": 0.4449781193297043, "step": 3077 }, { "epoch": 0.5706340378197998, "grad_norm": 6.28125, "learning_rate": 9.429365962180201e-06, "loss": 2.7013, "mean_token_accuracy": 0.44952352590827876, "step": 3078 }, { "epoch": 0.5708194289951798, "grad_norm": 8.234375, "learning_rate": 9.429180571004822e-06, "loss": 2.4455, "mean_token_accuracy": 0.4864663256606991, "step": 3079 }, { "epoch": 0.5710048201705599, "grad_norm": 9.0625, "learning_rate": 9.42899517982944e-06, "loss": 2.5219, "mean_token_accuracy": 0.46737326012003577, "step": 3080 }, { "epoch": 0.57119021134594, "grad_norm": 6.39453125, "learning_rate": 9.42880978865406e-06, "loss": 3.1202, "mean_token_accuracy": 0.4183418579754966, "step": 3081 }, { "epoch": 0.57137560252132, "grad_norm": 4.84375, "learning_rate": 9.428624397478681e-06, "loss": 2.5251, "mean_token_accuracy": 0.4836318715256331, "step": 3082 }, { "epoch": 0.5715609936967, "grad_norm": 7.4453125, "learning_rate": 9.4284390063033e-06, "loss": 3.7518, "mean_token_accuracy": 0.3526537260757432, "step": 3083 }, { "epoch": 0.5717463848720801, "grad_norm": 8.25, "learning_rate": 9.42825361512792e-06, "loss": 2.6698, "mean_token_accuracy": 0.48333604291985044, "step": 3084 }, { "epoch": 0.5719317760474601, "grad_norm": 5.54296875, "learning_rate": 9.428068223952541e-06, "loss": 2.6782, "mean_token_accuracy": 0.4541989425796139, "step": 3085 }, { "epoch": 0.5721171672228402, "grad_norm": 6.4609375, "learning_rate": 9.427882832777162e-06, "loss": 2.8835, "mean_token_accuracy": 0.4227449888641425, "step": 3086 }, { "epoch": 0.5723025583982202, "grad_norm": 7.18359375, "learning_rate": 9.42769744160178e-06, "loss": 2.543, "mean_token_accuracy": 0.4923646459972235, "step": 3087 }, { "epoch": 0.5724879495736003, "grad_norm": 5.51953125, "learning_rate": 9.427512050426401e-06, "loss": 3.0018, "mean_token_accuracy": 0.43718455872323014, "step": 3088 }, { "epoch": 0.5726733407489804, "grad_norm": 6.63671875, "learning_rate": 9.42732665925102e-06, "loss": 2.4993, "mean_token_accuracy": 0.46388151460108984, "step": 3089 }, { "epoch": 0.5728587319243604, "grad_norm": 5.53125, "learning_rate": 9.42714126807564e-06, "loss": 3.2172, "mean_token_accuracy": 0.3991467576791809, "step": 3090 }, { "epoch": 0.5730441230997404, "grad_norm": 8.890625, "learning_rate": 9.42695587690026e-06, "loss": 2.4552, "mean_token_accuracy": 0.5087814988545871, "step": 3091 }, { "epoch": 0.5732295142751205, "grad_norm": 6.484375, "learning_rate": 9.42677048572488e-06, "loss": 2.621, "mean_token_accuracy": 0.45777655324424854, "step": 3092 }, { "epoch": 0.5734149054505006, "grad_norm": 6.6015625, "learning_rate": 9.4265850945495e-06, "loss": 2.9773, "mean_token_accuracy": 0.4281560041053712, "step": 3093 }, { "epoch": 0.5736002966258806, "grad_norm": 6.46484375, "learning_rate": 9.42639970337412e-06, "loss": 3.3191, "mean_token_accuracy": 0.4092181069958848, "step": 3094 }, { "epoch": 0.5737856878012607, "grad_norm": 7.8203125, "learning_rate": 9.426214312198741e-06, "loss": 2.7018, "mean_token_accuracy": 0.45648134914963556, "step": 3095 }, { "epoch": 0.5739710789766407, "grad_norm": 5.60546875, "learning_rate": 9.42602892102336e-06, "loss": 3.103, "mean_token_accuracy": 0.4109918055948008, "step": 3096 }, { "epoch": 0.5741564701520208, "grad_norm": 5.66796875, "learning_rate": 9.42584352984798e-06, "loss": 2.5586, "mean_token_accuracy": 0.4963943601334625, "step": 3097 }, { "epoch": 0.5743418613274008, "grad_norm": 6.38671875, "learning_rate": 9.425658138672599e-06, "loss": 2.8668, "mean_token_accuracy": 0.43670886075949367, "step": 3098 }, { "epoch": 0.5745272525027809, "grad_norm": 4.734375, "learning_rate": 9.42547274749722e-06, "loss": 2.8494, "mean_token_accuracy": 0.43486114247008356, "step": 3099 }, { "epoch": 0.5747126436781609, "grad_norm": 9.703125, "learning_rate": 9.42528735632184e-06, "loss": 2.6443, "mean_token_accuracy": 0.47657945118059986, "step": 3100 }, { "epoch": 0.574898034853541, "grad_norm": 7.10546875, "learning_rate": 9.425101965146459e-06, "loss": 3.0981, "mean_token_accuracy": 0.4058312472389928, "step": 3101 }, { "epoch": 0.575083426028921, "grad_norm": 7.1171875, "learning_rate": 9.424916573971081e-06, "loss": 2.6452, "mean_token_accuracy": 0.4586416707778326, "step": 3102 }, { "epoch": 0.5752688172043011, "grad_norm": 8.265625, "learning_rate": 9.4247311827957e-06, "loss": 2.6154, "mean_token_accuracy": 0.45869731800766284, "step": 3103 }, { "epoch": 0.5754542083796811, "grad_norm": 6.46484375, "learning_rate": 9.42454579162032e-06, "loss": 2.8656, "mean_token_accuracy": 0.42342899554675906, "step": 3104 }, { "epoch": 0.5756395995550612, "grad_norm": 5.28515625, "learning_rate": 9.424360400444939e-06, "loss": 3.1534, "mean_token_accuracy": 0.41033966033966035, "step": 3105 }, { "epoch": 0.5758249907304412, "grad_norm": 6.9921875, "learning_rate": 9.42417500926956e-06, "loss": 3.0204, "mean_token_accuracy": 0.41279450261780104, "step": 3106 }, { "epoch": 0.5760103819058213, "grad_norm": 5.96484375, "learning_rate": 9.42398961809418e-06, "loss": 2.4845, "mean_token_accuracy": 0.4784120219804867, "step": 3107 }, { "epoch": 0.5761957730812013, "grad_norm": 5.765625, "learning_rate": 9.423804226918799e-06, "loss": 3.2799, "mean_token_accuracy": 0.4043154761904762, "step": 3108 }, { "epoch": 0.5763811642565814, "grad_norm": 5.36328125, "learning_rate": 9.42361883574342e-06, "loss": 2.9894, "mean_token_accuracy": 0.4227910817506193, "step": 3109 }, { "epoch": 0.5765665554319614, "grad_norm": 6.2890625, "learning_rate": 9.42343344456804e-06, "loss": 2.8693, "mean_token_accuracy": 0.429007245455701, "step": 3110 }, { "epoch": 0.5767519466073415, "grad_norm": 4.91796875, "learning_rate": 9.42324805339266e-06, "loss": 3.0737, "mean_token_accuracy": 0.4101744573178852, "step": 3111 }, { "epoch": 0.5769373377827215, "grad_norm": 6.09375, "learning_rate": 9.423062662217279e-06, "loss": 3.3594, "mean_token_accuracy": 0.38680348097155476, "step": 3112 }, { "epoch": 0.5771227289581016, "grad_norm": 6.92578125, "learning_rate": 9.4228772710419e-06, "loss": 2.4139, "mean_token_accuracy": 0.49304463529696385, "step": 3113 }, { "epoch": 0.5773081201334817, "grad_norm": 6.859375, "learning_rate": 9.422691879866518e-06, "loss": 2.9333, "mean_token_accuracy": 0.420321086089861, "step": 3114 }, { "epoch": 0.5774935113088617, "grad_norm": 5.953125, "learning_rate": 9.422506488691139e-06, "loss": 3.0873, "mean_token_accuracy": 0.41862627872990565, "step": 3115 }, { "epoch": 0.5776789024842417, "grad_norm": 8.640625, "learning_rate": 9.42232109751576e-06, "loss": 2.8899, "mean_token_accuracy": 0.4374440131382502, "step": 3116 }, { "epoch": 0.5778642936596218, "grad_norm": 6.23828125, "learning_rate": 9.422135706340378e-06, "loss": 2.745, "mean_token_accuracy": 0.4514747276955085, "step": 3117 }, { "epoch": 0.5780496848350019, "grad_norm": 6.96484375, "learning_rate": 9.421950315164999e-06, "loss": 3.0952, "mean_token_accuracy": 0.4092219020172911, "step": 3118 }, { "epoch": 0.5782350760103819, "grad_norm": 8.5703125, "learning_rate": 9.42176492398962e-06, "loss": 3.1313, "mean_token_accuracy": 0.41048436541998773, "step": 3119 }, { "epoch": 0.578420467185762, "grad_norm": 10.3515625, "learning_rate": 9.42157953281424e-06, "loss": 2.6825, "mean_token_accuracy": 0.45455987311657414, "step": 3120 }, { "epoch": 0.578605858361142, "grad_norm": 7.3515625, "learning_rate": 9.421394141638858e-06, "loss": 2.8339, "mean_token_accuracy": 0.4226912138901362, "step": 3121 }, { "epoch": 0.5787912495365221, "grad_norm": 5.71875, "learning_rate": 9.421208750463479e-06, "loss": 3.2622, "mean_token_accuracy": 0.39472198701665256, "step": 3122 }, { "epoch": 0.5789766407119021, "grad_norm": 8.0625, "learning_rate": 9.421023359288098e-06, "loss": 2.5602, "mean_token_accuracy": 0.4640846271967241, "step": 3123 }, { "epoch": 0.5791620318872822, "grad_norm": 10.1328125, "learning_rate": 9.420837968112718e-06, "loss": 2.4376, "mean_token_accuracy": 0.47997032640949555, "step": 3124 }, { "epoch": 0.5793474230626622, "grad_norm": 8.3359375, "learning_rate": 9.420652576937339e-06, "loss": 2.4903, "mean_token_accuracy": 0.4672224327396741, "step": 3125 }, { "epoch": 0.5795328142380423, "grad_norm": 6.12890625, "learning_rate": 9.42046718576196e-06, "loss": 3.4373, "mean_token_accuracy": 0.37851057115809406, "step": 3126 }, { "epoch": 0.5797182054134223, "grad_norm": 7.875, "learning_rate": 9.420281794586578e-06, "loss": 3.1728, "mean_token_accuracy": 0.4037108125399872, "step": 3127 }, { "epoch": 0.5799035965888024, "grad_norm": 6.1953125, "learning_rate": 9.420096403411198e-06, "loss": 3.1119, "mean_token_accuracy": 0.41893115942028986, "step": 3128 }, { "epoch": 0.5800889877641824, "grad_norm": 6.67578125, "learning_rate": 9.419911012235819e-06, "loss": 2.9689, "mean_token_accuracy": 0.4304301646309081, "step": 3129 }, { "epoch": 0.5802743789395625, "grad_norm": 6.609375, "learning_rate": 9.419725621060438e-06, "loss": 3.0626, "mean_token_accuracy": 0.39611964430072755, "step": 3130 }, { "epoch": 0.5804597701149425, "grad_norm": 10.2578125, "learning_rate": 9.419540229885058e-06, "loss": 3.0787, "mean_token_accuracy": 0.4143338517366511, "step": 3131 }, { "epoch": 0.5806451612903226, "grad_norm": 7.58984375, "learning_rate": 9.419354838709677e-06, "loss": 2.5569, "mean_token_accuracy": 0.4489263257800451, "step": 3132 }, { "epoch": 0.5808305524657026, "grad_norm": 6.14453125, "learning_rate": 9.419169447534298e-06, "loss": 2.4433, "mean_token_accuracy": 0.48450106157112527, "step": 3133 }, { "epoch": 0.5810159436410827, "grad_norm": 8.0234375, "learning_rate": 9.418984056358918e-06, "loss": 2.9791, "mean_token_accuracy": 0.4270921131848284, "step": 3134 }, { "epoch": 0.5812013348164627, "grad_norm": 6.51953125, "learning_rate": 9.418798665183539e-06, "loss": 2.679, "mean_token_accuracy": 0.4571132800946466, "step": 3135 }, { "epoch": 0.5813867259918428, "grad_norm": 7.90234375, "learning_rate": 9.418613274008157e-06, "loss": 2.878, "mean_token_accuracy": 0.43769416637901276, "step": 3136 }, { "epoch": 0.5815721171672228, "grad_norm": 5.34375, "learning_rate": 9.418427882832778e-06, "loss": 2.7799, "mean_token_accuracy": 0.4715629198387819, "step": 3137 }, { "epoch": 0.5817575083426029, "grad_norm": 5.83984375, "learning_rate": 9.418242491657398e-06, "loss": 2.9228, "mean_token_accuracy": 0.42862664261788197, "step": 3138 }, { "epoch": 0.581942899517983, "grad_norm": 7.40625, "learning_rate": 9.418057100482017e-06, "loss": 2.633, "mean_token_accuracy": 0.45174825174825173, "step": 3139 }, { "epoch": 0.582128290693363, "grad_norm": 5.8984375, "learning_rate": 9.417871709306638e-06, "loss": 3.0056, "mean_token_accuracy": 0.43594380303241065, "step": 3140 }, { "epoch": 0.582313681868743, "grad_norm": 5.59765625, "learning_rate": 9.417686318131256e-06, "loss": 3.3847, "mean_token_accuracy": 0.4069647905422771, "step": 3141 }, { "epoch": 0.5824990730441231, "grad_norm": 8.5703125, "learning_rate": 9.417500926955879e-06, "loss": 2.718, "mean_token_accuracy": 0.4511930585683297, "step": 3142 }, { "epoch": 0.5826844642195032, "grad_norm": 7.01171875, "learning_rate": 9.417315535780497e-06, "loss": 2.7042, "mean_token_accuracy": 0.43966395112016293, "step": 3143 }, { "epoch": 0.5828698553948832, "grad_norm": 7.0390625, "learning_rate": 9.417130144605118e-06, "loss": 2.6724, "mean_token_accuracy": 0.45600246571120356, "step": 3144 }, { "epoch": 0.5830552465702633, "grad_norm": 7.95703125, "learning_rate": 9.416944753429738e-06, "loss": 2.5946, "mean_token_accuracy": 0.4699317226890756, "step": 3145 }, { "epoch": 0.5832406377456433, "grad_norm": 13.1796875, "learning_rate": 9.416759362254357e-06, "loss": 2.614, "mean_token_accuracy": 0.4659346144247567, "step": 3146 }, { "epoch": 0.5834260289210234, "grad_norm": 7.125, "learning_rate": 9.416573971078978e-06, "loss": 2.941, "mean_token_accuracy": 0.425314333612741, "step": 3147 }, { "epoch": 0.5836114200964034, "grad_norm": 5.44140625, "learning_rate": 9.416388579903596e-06, "loss": 2.9847, "mean_token_accuracy": 0.4335957126109529, "step": 3148 }, { "epoch": 0.5837968112717835, "grad_norm": 8.1796875, "learning_rate": 9.416203188728217e-06, "loss": 2.8809, "mean_token_accuracy": 0.4257296009529482, "step": 3149 }, { "epoch": 0.5839822024471635, "grad_norm": 7.1171875, "learning_rate": 9.416017797552837e-06, "loss": 3.099, "mean_token_accuracy": 0.4249722706385676, "step": 3150 }, { "epoch": 0.5841675936225436, "grad_norm": 5.73828125, "learning_rate": 9.415832406377458e-06, "loss": 2.13, "mean_token_accuracy": 0.5275523797989584, "step": 3151 }, { "epoch": 0.5843529847979236, "grad_norm": 5.15234375, "learning_rate": 9.415647015202077e-06, "loss": 3.1375, "mean_token_accuracy": 0.4089178901576944, "step": 3152 }, { "epoch": 0.5845383759733037, "grad_norm": 5.16015625, "learning_rate": 9.415461624026697e-06, "loss": 3.0362, "mean_token_accuracy": 0.4301772589710333, "step": 3153 }, { "epoch": 0.5847237671486837, "grad_norm": 5.859375, "learning_rate": 9.415276232851318e-06, "loss": 3.0178, "mean_token_accuracy": 0.42737763629128533, "step": 3154 }, { "epoch": 0.5849091583240638, "grad_norm": 4.859375, "learning_rate": 9.415090841675937e-06, "loss": 2.5903, "mean_token_accuracy": 0.4624968217645563, "step": 3155 }, { "epoch": 0.5850945494994438, "grad_norm": 7.86328125, "learning_rate": 9.414905450500557e-06, "loss": 2.8503, "mean_token_accuracy": 0.4461461628100549, "step": 3156 }, { "epoch": 0.5852799406748239, "grad_norm": 7.390625, "learning_rate": 9.414720059325176e-06, "loss": 3.2267, "mean_token_accuracy": 0.38434163701067614, "step": 3157 }, { "epoch": 0.5854653318502039, "grad_norm": 6.2109375, "learning_rate": 9.414534668149798e-06, "loss": 3.1227, "mean_token_accuracy": 0.40094665220392467, "step": 3158 }, { "epoch": 0.585650723025584, "grad_norm": 5.16796875, "learning_rate": 9.414349276974417e-06, "loss": 2.7075, "mean_token_accuracy": 0.44745296007342816, "step": 3159 }, { "epoch": 0.585836114200964, "grad_norm": 6.80078125, "learning_rate": 9.414163885799037e-06, "loss": 2.695, "mean_token_accuracy": 0.45565845236226654, "step": 3160 }, { "epoch": 0.5860215053763441, "grad_norm": 5.59375, "learning_rate": 9.413978494623656e-06, "loss": 2.7929, "mean_token_accuracy": 0.4571939688218758, "step": 3161 }, { "epoch": 0.5862068965517241, "grad_norm": 7.390625, "learning_rate": 9.413793103448277e-06, "loss": 2.9735, "mean_token_accuracy": 0.4155455904334828, "step": 3162 }, { "epoch": 0.5863922877271042, "grad_norm": 7.06640625, "learning_rate": 9.413607712272897e-06, "loss": 2.8829, "mean_token_accuracy": 0.42983074753173484, "step": 3163 }, { "epoch": 0.5865776789024842, "grad_norm": 5.98046875, "learning_rate": 9.413422321097516e-06, "loss": 3.0941, "mean_token_accuracy": 0.40365177698076293, "step": 3164 }, { "epoch": 0.5867630700778643, "grad_norm": 5.37109375, "learning_rate": 9.413236929922136e-06, "loss": 2.9516, "mean_token_accuracy": 0.44508346191611725, "step": 3165 }, { "epoch": 0.5869484612532443, "grad_norm": 5.265625, "learning_rate": 9.413051538746757e-06, "loss": 2.9248, "mean_token_accuracy": 0.4228206945428774, "step": 3166 }, { "epoch": 0.5871338524286244, "grad_norm": 6.24609375, "learning_rate": 9.412866147571377e-06, "loss": 2.9648, "mean_token_accuracy": 0.41889450867052025, "step": 3167 }, { "epoch": 0.5873192436040044, "grad_norm": 7.1171875, "learning_rate": 9.412680756395996e-06, "loss": 2.7879, "mean_token_accuracy": 0.4654662725073905, "step": 3168 }, { "epoch": 0.5875046347793845, "grad_norm": 9.0703125, "learning_rate": 9.412495365220617e-06, "loss": 2.644, "mean_token_accuracy": 0.4652708541043536, "step": 3169 }, { "epoch": 0.5876900259547646, "grad_norm": 6.83984375, "learning_rate": 9.412309974045235e-06, "loss": 2.8828, "mean_token_accuracy": 0.4394069004847448, "step": 3170 }, { "epoch": 0.5878754171301446, "grad_norm": 5.2421875, "learning_rate": 9.412124582869856e-06, "loss": 2.6269, "mean_token_accuracy": 0.46454767726161367, "step": 3171 }, { "epoch": 0.5880608083055247, "grad_norm": 6.3125, "learning_rate": 9.411939191694476e-06, "loss": 3.0064, "mean_token_accuracy": 0.42638785691540176, "step": 3172 }, { "epoch": 0.5882461994809047, "grad_norm": 5.69140625, "learning_rate": 9.411753800519095e-06, "loss": 2.785, "mean_token_accuracy": 0.459290823314564, "step": 3173 }, { "epoch": 0.5884315906562848, "grad_norm": 5.109375, "learning_rate": 9.411568409343716e-06, "loss": 2.736, "mean_token_accuracy": 0.4481346678798908, "step": 3174 }, { "epoch": 0.5886169818316648, "grad_norm": 6.171875, "learning_rate": 9.411383018168336e-06, "loss": 2.9975, "mean_token_accuracy": 0.433127424220658, "step": 3175 }, { "epoch": 0.5888023730070449, "grad_norm": 5.63671875, "learning_rate": 9.411197626992957e-06, "loss": 3.1198, "mean_token_accuracy": 0.40941879637262985, "step": 3176 }, { "epoch": 0.5889877641824249, "grad_norm": 4.984375, "learning_rate": 9.411012235817575e-06, "loss": 2.9452, "mean_token_accuracy": 0.4280962128966223, "step": 3177 }, { "epoch": 0.589173155357805, "grad_norm": 5.953125, "learning_rate": 9.410826844642196e-06, "loss": 2.4706, "mean_token_accuracy": 0.4766761661916904, "step": 3178 }, { "epoch": 0.589358546533185, "grad_norm": 6.5546875, "learning_rate": 9.410641453466815e-06, "loss": 2.3804, "mean_token_accuracy": 0.5072129255626082, "step": 3179 }, { "epoch": 0.5895439377085651, "grad_norm": 7.2734375, "learning_rate": 9.410456062291435e-06, "loss": 2.8843, "mean_token_accuracy": 0.43477728830151735, "step": 3180 }, { "epoch": 0.5897293288839451, "grad_norm": 5.76171875, "learning_rate": 9.410270671116056e-06, "loss": 2.6801, "mean_token_accuracy": 0.4577025823686554, "step": 3181 }, { "epoch": 0.5899147200593252, "grad_norm": 5.390625, "learning_rate": 9.410085279940676e-06, "loss": 3.212, "mean_token_accuracy": 0.3875552747946936, "step": 3182 }, { "epoch": 0.5901001112347052, "grad_norm": 5.1328125, "learning_rate": 9.409899888765297e-06, "loss": 2.9633, "mean_token_accuracy": 0.41951912827194127, "step": 3183 }, { "epoch": 0.5902855024100853, "grad_norm": 7.2578125, "learning_rate": 9.409714497589916e-06, "loss": 2.8132, "mean_token_accuracy": 0.42996997780968543, "step": 3184 }, { "epoch": 0.5904708935854653, "grad_norm": 6.26953125, "learning_rate": 9.409529106414536e-06, "loss": 3.0153, "mean_token_accuracy": 0.4072061767229053, "step": 3185 }, { "epoch": 0.5906562847608454, "grad_norm": 5.76953125, "learning_rate": 9.409343715239155e-06, "loss": 2.6653, "mean_token_accuracy": 0.45952178662459037, "step": 3186 }, { "epoch": 0.5908416759362254, "grad_norm": 8.0546875, "learning_rate": 9.409158324063775e-06, "loss": 2.3456, "mean_token_accuracy": 0.47361751152073733, "step": 3187 }, { "epoch": 0.5910270671116055, "grad_norm": 7.6875, "learning_rate": 9.408972932888396e-06, "loss": 3.007, "mean_token_accuracy": 0.42670001190901513, "step": 3188 }, { "epoch": 0.5912124582869855, "grad_norm": 7.1171875, "learning_rate": 9.408787541713015e-06, "loss": 3.3277, "mean_token_accuracy": 0.41493368265924346, "step": 3189 }, { "epoch": 0.5913978494623656, "grad_norm": 4.9921875, "learning_rate": 9.408602150537635e-06, "loss": 3.1807, "mean_token_accuracy": 0.4102823857160947, "step": 3190 }, { "epoch": 0.5915832406377456, "grad_norm": 7.2890625, "learning_rate": 9.408416759362256e-06, "loss": 2.5719, "mean_token_accuracy": 0.46932750136686713, "step": 3191 }, { "epoch": 0.5917686318131257, "grad_norm": 8.3671875, "learning_rate": 9.408231368186876e-06, "loss": 2.7128, "mean_token_accuracy": 0.4530053754683173, "step": 3192 }, { "epoch": 0.5919540229885057, "grad_norm": 5.2421875, "learning_rate": 9.408045977011495e-06, "loss": 2.9932, "mean_token_accuracy": 0.42836106611691943, "step": 3193 }, { "epoch": 0.5921394141638858, "grad_norm": 7.671875, "learning_rate": 9.407860585836115e-06, "loss": 3.129, "mean_token_accuracy": 0.40983031012287885, "step": 3194 }, { "epoch": 0.5923248053392659, "grad_norm": 6.48828125, "learning_rate": 9.407675194660734e-06, "loss": 2.7888, "mean_token_accuracy": 0.45134032634032634, "step": 3195 }, { "epoch": 0.5925101965146459, "grad_norm": 6.671875, "learning_rate": 9.407489803485355e-06, "loss": 2.9888, "mean_token_accuracy": 0.41494845360824745, "step": 3196 }, { "epoch": 0.592695587690026, "grad_norm": 6.5, "learning_rate": 9.407304412309975e-06, "loss": 2.6612, "mean_token_accuracy": 0.4678086237448317, "step": 3197 }, { "epoch": 0.592880978865406, "grad_norm": 5.296875, "learning_rate": 9.407119021134596e-06, "loss": 2.7146, "mean_token_accuracy": 0.4420306965761511, "step": 3198 }, { "epoch": 0.5930663700407861, "grad_norm": 10.25, "learning_rate": 9.406933629959214e-06, "loss": 2.7925, "mean_token_accuracy": 0.43722078532800374, "step": 3199 }, { "epoch": 0.5932517612161661, "grad_norm": 10.703125, "learning_rate": 9.406748238783835e-06, "loss": 2.5244, "mean_token_accuracy": 0.470515014781391, "step": 3200 }, { "epoch": 0.5934371523915462, "grad_norm": 6.234375, "learning_rate": 9.406562847608455e-06, "loss": 2.39, "mean_token_accuracy": 0.489131902254624, "step": 3201 }, { "epoch": 0.5936225435669262, "grad_norm": 8.5859375, "learning_rate": 9.406377456433074e-06, "loss": 2.4699, "mean_token_accuracy": 0.4670863706648995, "step": 3202 }, { "epoch": 0.5938079347423063, "grad_norm": 8.640625, "learning_rate": 9.406192065257695e-06, "loss": 3.0087, "mean_token_accuracy": 0.42319375135604254, "step": 3203 }, { "epoch": 0.5939933259176863, "grad_norm": 8.109375, "learning_rate": 9.406006674082313e-06, "loss": 3.024, "mean_token_accuracy": 0.4209846650524617, "step": 3204 }, { "epoch": 0.5941787170930664, "grad_norm": 5.328125, "learning_rate": 9.405821282906934e-06, "loss": 2.8248, "mean_token_accuracy": 0.45910360420268653, "step": 3205 }, { "epoch": 0.5943641082684464, "grad_norm": 5.72265625, "learning_rate": 9.405635891731554e-06, "loss": 3.0778, "mean_token_accuracy": 0.3947733333333333, "step": 3206 }, { "epoch": 0.5945494994438265, "grad_norm": 6.28125, "learning_rate": 9.405450500556175e-06, "loss": 2.4583, "mean_token_accuracy": 0.4950823177250374, "step": 3207 }, { "epoch": 0.5947348906192065, "grad_norm": 6.96875, "learning_rate": 9.405265109380794e-06, "loss": 3.269, "mean_token_accuracy": 0.3958907887479316, "step": 3208 }, { "epoch": 0.5949202817945866, "grad_norm": 6.640625, "learning_rate": 9.405079718205414e-06, "loss": 3.2785, "mean_token_accuracy": 0.4062180143295803, "step": 3209 }, { "epoch": 0.5951056729699666, "grad_norm": 6.359375, "learning_rate": 9.404894327030035e-06, "loss": 2.9842, "mean_token_accuracy": 0.4102499048585564, "step": 3210 }, { "epoch": 0.5952910641453467, "grad_norm": 5.4453125, "learning_rate": 9.404708935854654e-06, "loss": 2.7399, "mean_token_accuracy": 0.450886370241209, "step": 3211 }, { "epoch": 0.5954764553207267, "grad_norm": 5.25390625, "learning_rate": 9.404523544679274e-06, "loss": 3.0271, "mean_token_accuracy": 0.41641375821952453, "step": 3212 }, { "epoch": 0.5956618464961068, "grad_norm": 5.953125, "learning_rate": 9.404338153503893e-06, "loss": 2.786, "mean_token_accuracy": 0.44434735117422175, "step": 3213 }, { "epoch": 0.5958472376714868, "grad_norm": 6.70703125, "learning_rate": 9.404152762328515e-06, "loss": 2.2414, "mean_token_accuracy": 0.5388317448334434, "step": 3214 }, { "epoch": 0.5960326288468669, "grad_norm": 6.76171875, "learning_rate": 9.403967371153134e-06, "loss": 2.7463, "mean_token_accuracy": 0.45646088109621336, "step": 3215 }, { "epoch": 0.5962180200222469, "grad_norm": 6.57421875, "learning_rate": 9.403781979977754e-06, "loss": 1.7965, "mean_token_accuracy": 0.5882433169667213, "step": 3216 }, { "epoch": 0.596403411197627, "grad_norm": 7.59765625, "learning_rate": 9.403596588802373e-06, "loss": 2.9055, "mean_token_accuracy": 0.4407884279128914, "step": 3217 }, { "epoch": 0.596588802373007, "grad_norm": 5.9609375, "learning_rate": 9.403411197626994e-06, "loss": 3.2324, "mean_token_accuracy": 0.4034789987271956, "step": 3218 }, { "epoch": 0.5967741935483871, "grad_norm": 5.55078125, "learning_rate": 9.403225806451614e-06, "loss": 2.8423, "mean_token_accuracy": 0.43194422437014734, "step": 3219 }, { "epoch": 0.5969595847237672, "grad_norm": 8.53125, "learning_rate": 9.403040415276233e-06, "loss": 2.2564, "mean_token_accuracy": 0.5041362530413626, "step": 3220 }, { "epoch": 0.5971449758991472, "grad_norm": 7.5859375, "learning_rate": 9.402855024100853e-06, "loss": 2.3895, "mean_token_accuracy": 0.477368290873114, "step": 3221 }, { "epoch": 0.5973303670745272, "grad_norm": 5.11328125, "learning_rate": 9.402669632925472e-06, "loss": 2.7984, "mean_token_accuracy": 0.44868248653370213, "step": 3222 }, { "epoch": 0.5975157582499073, "grad_norm": 5.51953125, "learning_rate": 9.402484241750094e-06, "loss": 2.9314, "mean_token_accuracy": 0.4287475345167653, "step": 3223 }, { "epoch": 0.5977011494252874, "grad_norm": 9.75, "learning_rate": 9.402298850574713e-06, "loss": 2.9953, "mean_token_accuracy": 0.42247493274639275, "step": 3224 }, { "epoch": 0.5978865406006674, "grad_norm": 7.171875, "learning_rate": 9.402113459399334e-06, "loss": 2.4435, "mean_token_accuracy": 0.4856185745639367, "step": 3225 }, { "epoch": 0.5980719317760474, "grad_norm": 7.3984375, "learning_rate": 9.401928068223954e-06, "loss": 2.6402, "mean_token_accuracy": 0.45655426401196914, "step": 3226 }, { "epoch": 0.5982573229514275, "grad_norm": 4.9765625, "learning_rate": 9.401742677048573e-06, "loss": 3.177, "mean_token_accuracy": 0.4127769919849128, "step": 3227 }, { "epoch": 0.5984427141268076, "grad_norm": 5.859375, "learning_rate": 9.401557285873193e-06, "loss": 2.7735, "mean_token_accuracy": 0.463007840342124, "step": 3228 }, { "epoch": 0.5986281053021876, "grad_norm": 7.21484375, "learning_rate": 9.401371894697812e-06, "loss": 2.0978, "mean_token_accuracy": 0.5410216718266254, "step": 3229 }, { "epoch": 0.5988134964775677, "grad_norm": 7.84765625, "learning_rate": 9.401186503522433e-06, "loss": 2.8163, "mean_token_accuracy": 0.47610241820768134, "step": 3230 }, { "epoch": 0.5989988876529477, "grad_norm": 4.95703125, "learning_rate": 9.401001112347053e-06, "loss": 3.2793, "mean_token_accuracy": 0.3946295037389531, "step": 3231 }, { "epoch": 0.5991842788283278, "grad_norm": 6.203125, "learning_rate": 9.400815721171674e-06, "loss": 2.4602, "mean_token_accuracy": 0.46789871594863797, "step": 3232 }, { "epoch": 0.5993696700037078, "grad_norm": 6.484375, "learning_rate": 9.400630329996292e-06, "loss": 2.8624, "mean_token_accuracy": 0.4425796220430818, "step": 3233 }, { "epoch": 0.5995550611790879, "grad_norm": 6.35546875, "learning_rate": 9.400444938820913e-06, "loss": 3.099, "mean_token_accuracy": 0.4287498438865992, "step": 3234 }, { "epoch": 0.5997404523544679, "grad_norm": 6.37890625, "learning_rate": 9.400259547645533e-06, "loss": 2.6663, "mean_token_accuracy": 0.45685347738014853, "step": 3235 }, { "epoch": 0.599925843529848, "grad_norm": 12.265625, "learning_rate": 9.400074156470152e-06, "loss": 3.1976, "mean_token_accuracy": 0.37873172740374716, "step": 3236 }, { "epoch": 0.600111234705228, "grad_norm": 8.203125, "learning_rate": 9.399888765294773e-06, "loss": 3.3292, "mean_token_accuracy": 0.3841243862520458, "step": 3237 }, { "epoch": 0.6002966258806081, "grad_norm": 8.546875, "learning_rate": 9.399703374119392e-06, "loss": 2.8227, "mean_token_accuracy": 0.4499832346037778, "step": 3238 }, { "epoch": 0.6004820170559881, "grad_norm": 6.89453125, "learning_rate": 9.399517982944014e-06, "loss": 2.9902, "mean_token_accuracy": 0.4328996918863403, "step": 3239 }, { "epoch": 0.6006674082313682, "grad_norm": 8.5859375, "learning_rate": 9.399332591768633e-06, "loss": 2.6093, "mean_token_accuracy": 0.4751387226634751, "step": 3240 }, { "epoch": 0.6008527994067482, "grad_norm": 8.484375, "learning_rate": 9.399147200593253e-06, "loss": 2.7762, "mean_token_accuracy": 0.4507522878858384, "step": 3241 }, { "epoch": 0.6010381905821283, "grad_norm": 6.5625, "learning_rate": 9.398961809417872e-06, "loss": 2.8965, "mean_token_accuracy": 0.4195001125872551, "step": 3242 }, { "epoch": 0.6012235817575083, "grad_norm": 7.34765625, "learning_rate": 9.398776418242492e-06, "loss": 2.8068, "mean_token_accuracy": 0.43954829998772554, "step": 3243 }, { "epoch": 0.6014089729328884, "grad_norm": 6.53125, "learning_rate": 9.398591027067113e-06, "loss": 2.3951, "mean_token_accuracy": 0.4932870719565545, "step": 3244 }, { "epoch": 0.6015943641082685, "grad_norm": 7.41015625, "learning_rate": 9.398405635891732e-06, "loss": 2.7258, "mean_token_accuracy": 0.45349918623575913, "step": 3245 }, { "epoch": 0.6017797552836485, "grad_norm": 7.03125, "learning_rate": 9.398220244716352e-06, "loss": 2.2905, "mean_token_accuracy": 0.49455469506292354, "step": 3246 }, { "epoch": 0.6019651464590285, "grad_norm": 7.0859375, "learning_rate": 9.398034853540973e-06, "loss": 2.4706, "mean_token_accuracy": 0.4789283564118001, "step": 3247 }, { "epoch": 0.6021505376344086, "grad_norm": 5.5859375, "learning_rate": 9.397849462365593e-06, "loss": 2.7963, "mean_token_accuracy": 0.4542505744019462, "step": 3248 }, { "epoch": 0.6023359288097887, "grad_norm": 5.265625, "learning_rate": 9.397664071190212e-06, "loss": 2.7657, "mean_token_accuracy": 0.4407371060953004, "step": 3249 }, { "epoch": 0.6025213199851687, "grad_norm": 6.60546875, "learning_rate": 9.397478680014832e-06, "loss": 2.6778, "mean_token_accuracy": 0.4477392836171462, "step": 3250 }, { "epoch": 0.6027067111605487, "grad_norm": 5.90234375, "learning_rate": 9.397293288839451e-06, "loss": 2.5877, "mean_token_accuracy": 0.47302258315719214, "step": 3251 }, { "epoch": 0.6028921023359288, "grad_norm": 4.54296875, "learning_rate": 9.397107897664072e-06, "loss": 2.8369, "mean_token_accuracy": 0.4482680552411139, "step": 3252 }, { "epoch": 0.6030774935113089, "grad_norm": 6.625, "learning_rate": 9.396922506488692e-06, "loss": 2.6702, "mean_token_accuracy": 0.4484597801240884, "step": 3253 }, { "epoch": 0.6032628846866889, "grad_norm": 6.1015625, "learning_rate": 9.396737115313311e-06, "loss": 2.9859, "mean_token_accuracy": 0.4129094645231476, "step": 3254 }, { "epoch": 0.603448275862069, "grad_norm": 7.5390625, "learning_rate": 9.396551724137931e-06, "loss": 2.4414, "mean_token_accuracy": 0.49485783424077434, "step": 3255 }, { "epoch": 0.603633667037449, "grad_norm": 5.6328125, "learning_rate": 9.396366332962552e-06, "loss": 2.5731, "mean_token_accuracy": 0.4683647359851128, "step": 3256 }, { "epoch": 0.6038190582128291, "grad_norm": 5.01953125, "learning_rate": 9.396180941787172e-06, "loss": 2.7707, "mean_token_accuracy": 0.44610957918714783, "step": 3257 }, { "epoch": 0.6040044493882091, "grad_norm": 5.95703125, "learning_rate": 9.395995550611791e-06, "loss": 3.7238, "mean_token_accuracy": 0.36298568507157464, "step": 3258 }, { "epoch": 0.6041898405635892, "grad_norm": 6.37890625, "learning_rate": 9.395810159436412e-06, "loss": 3.1295, "mean_token_accuracy": 0.4135831381733021, "step": 3259 }, { "epoch": 0.6043752317389692, "grad_norm": 4.82421875, "learning_rate": 9.39562476826103e-06, "loss": 2.2456, "mean_token_accuracy": 0.5299048482102402, "step": 3260 }, { "epoch": 0.6045606229143493, "grad_norm": 5.17578125, "learning_rate": 9.395439377085651e-06, "loss": 2.6502, "mean_token_accuracy": 0.4715215158086976, "step": 3261 }, { "epoch": 0.6047460140897293, "grad_norm": 7.81640625, "learning_rate": 9.395253985910271e-06, "loss": 2.8411, "mean_token_accuracy": 0.44408375263755884, "step": 3262 }, { "epoch": 0.6049314052651094, "grad_norm": 6.7265625, "learning_rate": 9.395068594734892e-06, "loss": 2.5927, "mean_token_accuracy": 0.4578479990773844, "step": 3263 }, { "epoch": 0.6051167964404894, "grad_norm": 4.8671875, "learning_rate": 9.394883203559512e-06, "loss": 2.6667, "mean_token_accuracy": 0.464867243655567, "step": 3264 }, { "epoch": 0.6053021876158695, "grad_norm": 4.9453125, "learning_rate": 9.394697812384131e-06, "loss": 2.7551, "mean_token_accuracy": 0.4578907435508346, "step": 3265 }, { "epoch": 0.6054875787912495, "grad_norm": 5.23828125, "learning_rate": 9.394512421208752e-06, "loss": 2.2817, "mean_token_accuracy": 0.5130372492836677, "step": 3266 }, { "epoch": 0.6056729699666296, "grad_norm": 6.29296875, "learning_rate": 9.39432703003337e-06, "loss": 2.9149, "mean_token_accuracy": 0.42651359254846744, "step": 3267 }, { "epoch": 0.6058583611420096, "grad_norm": 6.28515625, "learning_rate": 9.394141638857991e-06, "loss": 3.215, "mean_token_accuracy": 0.40575175085680226, "step": 3268 }, { "epoch": 0.6060437523173897, "grad_norm": 8.28125, "learning_rate": 9.393956247682612e-06, "loss": 2.9843, "mean_token_accuracy": 0.41335075424634754, "step": 3269 }, { "epoch": 0.6062291434927698, "grad_norm": 7.546875, "learning_rate": 9.39377085650723e-06, "loss": 2.4258, "mean_token_accuracy": 0.48796944621260346, "step": 3270 }, { "epoch": 0.6064145346681498, "grad_norm": 6.2578125, "learning_rate": 9.39358546533185e-06, "loss": 2.3022, "mean_token_accuracy": 0.5031607262945528, "step": 3271 }, { "epoch": 0.6065999258435298, "grad_norm": 5.59375, "learning_rate": 9.393400074156471e-06, "loss": 2.6021, "mean_token_accuracy": 0.47356884992264053, "step": 3272 }, { "epoch": 0.6067853170189099, "grad_norm": 7.0078125, "learning_rate": 9.393214682981092e-06, "loss": 2.7835, "mean_token_accuracy": 0.45358269490761605, "step": 3273 }, { "epoch": 0.60697070819429, "grad_norm": 6.0546875, "learning_rate": 9.39302929180571e-06, "loss": 3.0623, "mean_token_accuracy": 0.40664601520698396, "step": 3274 }, { "epoch": 0.60715609936967, "grad_norm": 8.1015625, "learning_rate": 9.392843900630331e-06, "loss": 2.6747, "mean_token_accuracy": 0.4606703146374829, "step": 3275 }, { "epoch": 0.60734149054505, "grad_norm": 6.23046875, "learning_rate": 9.39265850945495e-06, "loss": 2.6422, "mean_token_accuracy": 0.4656387665198238, "step": 3276 }, { "epoch": 0.6075268817204301, "grad_norm": 10.453125, "learning_rate": 9.39247311827957e-06, "loss": 2.9345, "mean_token_accuracy": 0.4209857364778986, "step": 3277 }, { "epoch": 0.6077122728958102, "grad_norm": 12.8515625, "learning_rate": 9.392287727104191e-06, "loss": 2.2339, "mean_token_accuracy": 0.4914018904452796, "step": 3278 }, { "epoch": 0.6078976640711902, "grad_norm": 7.12890625, "learning_rate": 9.392102335928811e-06, "loss": 3.0181, "mean_token_accuracy": 0.425511315502844, "step": 3279 }, { "epoch": 0.6080830552465702, "grad_norm": 6.5859375, "learning_rate": 9.39191694475343e-06, "loss": 2.9267, "mean_token_accuracy": 0.4270289942981924, "step": 3280 }, { "epoch": 0.6082684464219503, "grad_norm": 5.00390625, "learning_rate": 9.39173155357805e-06, "loss": 2.8606, "mean_token_accuracy": 0.4277681660899654, "step": 3281 }, { "epoch": 0.6084538375973304, "grad_norm": 6.2890625, "learning_rate": 9.391546162402671e-06, "loss": 2.7298, "mean_token_accuracy": 0.4482192680124733, "step": 3282 }, { "epoch": 0.6086392287727104, "grad_norm": 5.66015625, "learning_rate": 9.39136077122729e-06, "loss": 2.784, "mean_token_accuracy": 0.45640050697084916, "step": 3283 }, { "epoch": 0.6088246199480905, "grad_norm": 4.328125, "learning_rate": 9.39117538005191e-06, "loss": 2.8484, "mean_token_accuracy": 0.4346054597310879, "step": 3284 }, { "epoch": 0.6090100111234705, "grad_norm": 6.34765625, "learning_rate": 9.39098998887653e-06, "loss": 2.8435, "mean_token_accuracy": 0.43119266055045874, "step": 3285 }, { "epoch": 0.6091954022988506, "grad_norm": 6.19921875, "learning_rate": 9.39080459770115e-06, "loss": 2.9239, "mean_token_accuracy": 0.41387166181496016, "step": 3286 }, { "epoch": 0.6093807934742306, "grad_norm": 6.265625, "learning_rate": 9.39061920652577e-06, "loss": 2.9919, "mean_token_accuracy": 0.4200537719436976, "step": 3287 }, { "epoch": 0.6095661846496107, "grad_norm": 5.60546875, "learning_rate": 9.39043381535039e-06, "loss": 3.0201, "mean_token_accuracy": 0.42201715622563035, "step": 3288 }, { "epoch": 0.6097515758249907, "grad_norm": 6.1796875, "learning_rate": 9.39024842417501e-06, "loss": 2.6116, "mean_token_accuracy": 0.47947504959560505, "step": 3289 }, { "epoch": 0.6099369670003708, "grad_norm": 5.91796875, "learning_rate": 9.39006303299963e-06, "loss": 2.5304, "mean_token_accuracy": 0.4594727161250766, "step": 3290 }, { "epoch": 0.6101223581757509, "grad_norm": 5.3828125, "learning_rate": 9.38987764182425e-06, "loss": 3.0694, "mean_token_accuracy": 0.42217059409226687, "step": 3291 }, { "epoch": 0.6103077493511309, "grad_norm": 6.8515625, "learning_rate": 9.38969225064887e-06, "loss": 2.5858, "mean_token_accuracy": 0.4573710073710074, "step": 3292 }, { "epoch": 0.6104931405265109, "grad_norm": 5.09375, "learning_rate": 9.38950685947349e-06, "loss": 3.1484, "mean_token_accuracy": 0.4094962362478286, "step": 3293 }, { "epoch": 0.610678531701891, "grad_norm": 5.35546875, "learning_rate": 9.389321468298109e-06, "loss": 3.1756, "mean_token_accuracy": 0.40341952804860814, "step": 3294 }, { "epoch": 0.6108639228772711, "grad_norm": 6.796875, "learning_rate": 9.38913607712273e-06, "loss": 3.1556, "mean_token_accuracy": 0.4105771612583602, "step": 3295 }, { "epoch": 0.6110493140526511, "grad_norm": 6.515625, "learning_rate": 9.38895068594735e-06, "loss": 2.8727, "mean_token_accuracy": 0.4280675973548861, "step": 3296 }, { "epoch": 0.6112347052280311, "grad_norm": 5.3515625, "learning_rate": 9.38876529477197e-06, "loss": 2.7825, "mean_token_accuracy": 0.4493711091347986, "step": 3297 }, { "epoch": 0.6114200964034112, "grad_norm": 5.53515625, "learning_rate": 9.388579903596589e-06, "loss": 3.6544, "mean_token_accuracy": 0.345622633103141, "step": 3298 }, { "epoch": 0.6116054875787913, "grad_norm": 5.69921875, "learning_rate": 9.38839451242121e-06, "loss": 2.2065, "mean_token_accuracy": 0.5098673207715464, "step": 3299 }, { "epoch": 0.6117908787541713, "grad_norm": 7.46484375, "learning_rate": 9.38820912124583e-06, "loss": 2.5895, "mean_token_accuracy": 0.46120689655172414, "step": 3300 }, { "epoch": 0.6119762699295513, "grad_norm": 5.28125, "learning_rate": 9.388023730070449e-06, "loss": 3.0934, "mean_token_accuracy": 0.42255468946442043, "step": 3301 }, { "epoch": 0.6121616611049314, "grad_norm": 7.06640625, "learning_rate": 9.387838338895069e-06, "loss": 2.4248, "mean_token_accuracy": 0.48392728766560544, "step": 3302 }, { "epoch": 0.6123470522803115, "grad_norm": 6.16015625, "learning_rate": 9.38765294771969e-06, "loss": 3.0696, "mean_token_accuracy": 0.4247218358831711, "step": 3303 }, { "epoch": 0.6125324434556915, "grad_norm": 6.4921875, "learning_rate": 9.38746755654431e-06, "loss": 3.324, "mean_token_accuracy": 0.3956128240527688, "step": 3304 }, { "epoch": 0.6127178346310715, "grad_norm": 6.69921875, "learning_rate": 9.387282165368929e-06, "loss": 2.43, "mean_token_accuracy": 0.4745679012345679, "step": 3305 }, { "epoch": 0.6129032258064516, "grad_norm": 5.046875, "learning_rate": 9.38709677419355e-06, "loss": 3.329, "mean_token_accuracy": 0.4011318619128466, "step": 3306 }, { "epoch": 0.6130886169818317, "grad_norm": 6.234375, "learning_rate": 9.38691138301817e-06, "loss": 2.5997, "mean_token_accuracy": 0.46308243727598564, "step": 3307 }, { "epoch": 0.6132740081572117, "grad_norm": 6.16796875, "learning_rate": 9.386725991842789e-06, "loss": 3.0764, "mean_token_accuracy": 0.41845096484271743, "step": 3308 }, { "epoch": 0.6134593993325917, "grad_norm": 5.24609375, "learning_rate": 9.38654060066741e-06, "loss": 3.6871, "mean_token_accuracy": 0.37120400142908183, "step": 3309 }, { "epoch": 0.6136447905079718, "grad_norm": 7.46875, "learning_rate": 9.386355209492028e-06, "loss": 2.9533, "mean_token_accuracy": 0.4210170636931914, "step": 3310 }, { "epoch": 0.6138301816833519, "grad_norm": 6.796875, "learning_rate": 9.38616981831665e-06, "loss": 2.6124, "mean_token_accuracy": 0.45919871365402654, "step": 3311 }, { "epoch": 0.6140155728587319, "grad_norm": 7.38671875, "learning_rate": 9.385984427141269e-06, "loss": 2.7932, "mean_token_accuracy": 0.43405167550490803, "step": 3312 }, { "epoch": 0.614200964034112, "grad_norm": 5.44140625, "learning_rate": 9.38579903596589e-06, "loss": 2.9543, "mean_token_accuracy": 0.4246153846153846, "step": 3313 }, { "epoch": 0.614386355209492, "grad_norm": 8.5078125, "learning_rate": 9.385613644790508e-06, "loss": 2.5322, "mean_token_accuracy": 0.4759493670886076, "step": 3314 }, { "epoch": 0.6145717463848721, "grad_norm": 6.4609375, "learning_rate": 9.385428253615129e-06, "loss": 3.202, "mean_token_accuracy": 0.4118089507333584, "step": 3315 }, { "epoch": 0.6147571375602522, "grad_norm": 5.24609375, "learning_rate": 9.38524286243975e-06, "loss": 3.0243, "mean_token_accuracy": 0.4185344827586207, "step": 3316 }, { "epoch": 0.6149425287356322, "grad_norm": 7.359375, "learning_rate": 9.385057471264368e-06, "loss": 2.7972, "mean_token_accuracy": 0.4398455412091227, "step": 3317 }, { "epoch": 0.6151279199110122, "grad_norm": 5.3671875, "learning_rate": 9.384872080088989e-06, "loss": 2.629, "mean_token_accuracy": 0.47803946530872055, "step": 3318 }, { "epoch": 0.6153133110863923, "grad_norm": 6.03125, "learning_rate": 9.384686688913609e-06, "loss": 2.508, "mean_token_accuracy": 0.4892681665373675, "step": 3319 }, { "epoch": 0.6154987022617724, "grad_norm": 6.0234375, "learning_rate": 9.38450129773823e-06, "loss": 2.7281, "mean_token_accuracy": 0.45826056584002794, "step": 3320 }, { "epoch": 0.6156840934371524, "grad_norm": 6.12109375, "learning_rate": 9.384315906562848e-06, "loss": 2.8716, "mean_token_accuracy": 0.43319630010277493, "step": 3321 }, { "epoch": 0.6158694846125324, "grad_norm": 6.80859375, "learning_rate": 9.384130515387469e-06, "loss": 2.6051, "mean_token_accuracy": 0.509854528390427, "step": 3322 }, { "epoch": 0.6160548757879125, "grad_norm": 6.69921875, "learning_rate": 9.383945124212088e-06, "loss": 3.0674, "mean_token_accuracy": 0.43159138655462187, "step": 3323 }, { "epoch": 0.6162402669632926, "grad_norm": 4.95703125, "learning_rate": 9.383759733036708e-06, "loss": 2.8316, "mean_token_accuracy": 0.4422546314544738, "step": 3324 }, { "epoch": 0.6164256581386726, "grad_norm": 6.0, "learning_rate": 9.383574341861329e-06, "loss": 2.849, "mean_token_accuracy": 0.4341824391632276, "step": 3325 }, { "epoch": 0.6166110493140526, "grad_norm": 8.96875, "learning_rate": 9.383388950685947e-06, "loss": 2.7341, "mean_token_accuracy": 0.4353089136928156, "step": 3326 }, { "epoch": 0.6167964404894327, "grad_norm": 6.73828125, "learning_rate": 9.383203559510568e-06, "loss": 2.59, "mean_token_accuracy": 0.4655715263518138, "step": 3327 }, { "epoch": 0.6169818316648128, "grad_norm": 6.63671875, "learning_rate": 9.383018168335188e-06, "loss": 2.289, "mean_token_accuracy": 0.5260273972602739, "step": 3328 }, { "epoch": 0.6171672228401928, "grad_norm": 5.90234375, "learning_rate": 9.382832777159809e-06, "loss": 2.7164, "mean_token_accuracy": 0.4584216431866351, "step": 3329 }, { "epoch": 0.6173526140155728, "grad_norm": 9.5078125, "learning_rate": 9.382647385984428e-06, "loss": 2.6979, "mean_token_accuracy": 0.45758832785009296, "step": 3330 }, { "epoch": 0.6175380051909529, "grad_norm": 8.0859375, "learning_rate": 9.382461994809048e-06, "loss": 2.7213, "mean_token_accuracy": 0.45431161195492403, "step": 3331 }, { "epoch": 0.617723396366333, "grad_norm": 8.6484375, "learning_rate": 9.382276603633667e-06, "loss": 3.341, "mean_token_accuracy": 0.41003102378490175, "step": 3332 }, { "epoch": 0.617908787541713, "grad_norm": 12.0625, "learning_rate": 9.382091212458287e-06, "loss": 2.9069, "mean_token_accuracy": 0.41904145077720206, "step": 3333 }, { "epoch": 0.618094178717093, "grad_norm": 8.6953125, "learning_rate": 9.381905821282908e-06, "loss": 2.999, "mean_token_accuracy": 0.4226509467667024, "step": 3334 }, { "epoch": 0.6182795698924731, "grad_norm": 8.390625, "learning_rate": 9.381720430107528e-06, "loss": 3.5712, "mean_token_accuracy": 0.37801374141161775, "step": 3335 }, { "epoch": 0.6184649610678532, "grad_norm": 10.1953125, "learning_rate": 9.381535038932147e-06, "loss": 2.9991, "mean_token_accuracy": 0.4127486885083638, "step": 3336 }, { "epoch": 0.6186503522432332, "grad_norm": 13.0859375, "learning_rate": 9.381349647756768e-06, "loss": 2.6949, "mean_token_accuracy": 0.464, "step": 3337 }, { "epoch": 0.6188357434186132, "grad_norm": 13.0859375, "learning_rate": 9.381164256581388e-06, "loss": 2.5913, "mean_token_accuracy": 0.4687024559878287, "step": 3338 }, { "epoch": 0.6190211345939933, "grad_norm": 7.24609375, "learning_rate": 9.380978865406007e-06, "loss": 2.3745, "mean_token_accuracy": 0.49349442379182157, "step": 3339 }, { "epoch": 0.6192065257693734, "grad_norm": 9.3125, "learning_rate": 9.380793474230627e-06, "loss": 2.6357, "mean_token_accuracy": 0.45356075697211157, "step": 3340 }, { "epoch": 0.6193919169447535, "grad_norm": 9.5859375, "learning_rate": 9.380608083055246e-06, "loss": 3.3249, "mean_token_accuracy": 0.4005258545135846, "step": 3341 }, { "epoch": 0.6195773081201335, "grad_norm": 8.3671875, "learning_rate": 9.380422691879867e-06, "loss": 2.8209, "mean_token_accuracy": 0.4487102579484103, "step": 3342 }, { "epoch": 0.6197626992955135, "grad_norm": 6.44140625, "learning_rate": 9.380237300704487e-06, "loss": 2.9379, "mean_token_accuracy": 0.4540797658251006, "step": 3343 }, { "epoch": 0.6199480904708936, "grad_norm": 7.47265625, "learning_rate": 9.380051909529108e-06, "loss": 2.9086, "mean_token_accuracy": 0.4359072375127421, "step": 3344 }, { "epoch": 0.6201334816462737, "grad_norm": 6.87109375, "learning_rate": 9.379866518353728e-06, "loss": 2.4419, "mean_token_accuracy": 0.49290819131390873, "step": 3345 }, { "epoch": 0.6203188728216537, "grad_norm": 6.98046875, "learning_rate": 9.379681127178347e-06, "loss": 2.5521, "mean_token_accuracy": 0.4635254723750299, "step": 3346 }, { "epoch": 0.6205042639970337, "grad_norm": 7.640625, "learning_rate": 9.379495736002968e-06, "loss": 2.9333, "mean_token_accuracy": 0.4259064239506544, "step": 3347 }, { "epoch": 0.6206896551724138, "grad_norm": 6.13671875, "learning_rate": 9.379310344827586e-06, "loss": 2.6625, "mean_token_accuracy": 0.4558954558954559, "step": 3348 }, { "epoch": 0.6208750463477939, "grad_norm": 7.41796875, "learning_rate": 9.379124953652207e-06, "loss": 2.6743, "mean_token_accuracy": 0.45361296822053304, "step": 3349 }, { "epoch": 0.6210604375231739, "grad_norm": 6.86328125, "learning_rate": 9.378939562476827e-06, "loss": 2.6462, "mean_token_accuracy": 0.448040172966941, "step": 3350 }, { "epoch": 0.6212458286985539, "grad_norm": 5.06640625, "learning_rate": 9.378754171301446e-06, "loss": 2.8061, "mean_token_accuracy": 0.45023380093520377, "step": 3351 }, { "epoch": 0.621431219873934, "grad_norm": 5.3515625, "learning_rate": 9.378568780126067e-06, "loss": 3.5268, "mean_token_accuracy": 0.3732539252842447, "step": 3352 }, { "epoch": 0.6216166110493141, "grad_norm": 6.68359375, "learning_rate": 9.378383388950687e-06, "loss": 2.8244, "mean_token_accuracy": 0.4448719846085541, "step": 3353 }, { "epoch": 0.6218020022246941, "grad_norm": 6.40625, "learning_rate": 9.378197997775308e-06, "loss": 2.66, "mean_token_accuracy": 0.48051044083526684, "step": 3354 }, { "epoch": 0.6219873934000741, "grad_norm": 7.7421875, "learning_rate": 9.378012606599926e-06, "loss": 2.508, "mean_token_accuracy": 0.49870466321243523, "step": 3355 }, { "epoch": 0.6221727845754542, "grad_norm": 7.2890625, "learning_rate": 9.377827215424547e-06, "loss": 2.7415, "mean_token_accuracy": 0.45779456632282434, "step": 3356 }, { "epoch": 0.6223581757508343, "grad_norm": 5.30078125, "learning_rate": 9.377641824249166e-06, "loss": 2.6945, "mean_token_accuracy": 0.4684053651266766, "step": 3357 }, { "epoch": 0.6225435669262143, "grad_norm": 10.09375, "learning_rate": 9.377456433073786e-06, "loss": 2.6231, "mean_token_accuracy": 0.46809078771695595, "step": 3358 }, { "epoch": 0.6227289581015943, "grad_norm": 7.38671875, "learning_rate": 9.377271041898407e-06, "loss": 2.2324, "mean_token_accuracy": 0.5388011152416357, "step": 3359 }, { "epoch": 0.6229143492769744, "grad_norm": 5.41015625, "learning_rate": 9.377085650723027e-06, "loss": 2.488, "mean_token_accuracy": 0.49058006941001486, "step": 3360 }, { "epoch": 0.6230997404523545, "grad_norm": 6.15625, "learning_rate": 9.376900259547646e-06, "loss": 2.7676, "mean_token_accuracy": 0.44492722137736257, "step": 3361 }, { "epoch": 0.6232851316277345, "grad_norm": 5.32421875, "learning_rate": 9.376714868372266e-06, "loss": 3.2614, "mean_token_accuracy": 0.39415961945031713, "step": 3362 }, { "epoch": 0.6234705228031145, "grad_norm": 5.484375, "learning_rate": 9.376529477196887e-06, "loss": 3.1397, "mean_token_accuracy": 0.4145861703737328, "step": 3363 }, { "epoch": 0.6236559139784946, "grad_norm": 6.328125, "learning_rate": 9.376344086021506e-06, "loss": 3.0144, "mean_token_accuracy": 0.42267415389285173, "step": 3364 }, { "epoch": 0.6238413051538747, "grad_norm": 6.828125, "learning_rate": 9.376158694846126e-06, "loss": 3.0636, "mean_token_accuracy": 0.4309231353678612, "step": 3365 }, { "epoch": 0.6240266963292548, "grad_norm": 10.1015625, "learning_rate": 9.375973303670745e-06, "loss": 2.5562, "mean_token_accuracy": 0.47425434040658854, "step": 3366 }, { "epoch": 0.6242120875046347, "grad_norm": 5.80859375, "learning_rate": 9.375787912495365e-06, "loss": 3.35, "mean_token_accuracy": 0.3857797676153928, "step": 3367 }, { "epoch": 0.6243974786800148, "grad_norm": 8.53125, "learning_rate": 9.375602521319986e-06, "loss": 2.4645, "mean_token_accuracy": 0.5040338579552969, "step": 3368 }, { "epoch": 0.6245828698553949, "grad_norm": 6.828125, "learning_rate": 9.375417130144606e-06, "loss": 2.8201, "mean_token_accuracy": 0.448992133726647, "step": 3369 }, { "epoch": 0.624768261030775, "grad_norm": 10.5546875, "learning_rate": 9.375231738969225e-06, "loss": 2.0473, "mean_token_accuracy": 0.5443393815486839, "step": 3370 }, { "epoch": 0.624953652206155, "grad_norm": 7.02734375, "learning_rate": 9.375046347793846e-06, "loss": 2.5197, "mean_token_accuracy": 0.480499653018737, "step": 3371 }, { "epoch": 0.625139043381535, "grad_norm": 5.94921875, "learning_rate": 9.374860956618466e-06, "loss": 2.9721, "mean_token_accuracy": 0.43727947438861176, "step": 3372 }, { "epoch": 0.6253244345569151, "grad_norm": 9.7421875, "learning_rate": 9.374675565443085e-06, "loss": 2.6704, "mean_token_accuracy": 0.45610978000435637, "step": 3373 }, { "epoch": 0.6255098257322952, "grad_norm": 4.7578125, "learning_rate": 9.374490174267706e-06, "loss": 2.5385, "mean_token_accuracy": 0.45998494731560463, "step": 3374 }, { "epoch": 0.6256952169076752, "grad_norm": 7.27734375, "learning_rate": 9.374304783092324e-06, "loss": 2.9244, "mean_token_accuracy": 0.4298538381862125, "step": 3375 }, { "epoch": 0.6258806080830552, "grad_norm": 8.7421875, "learning_rate": 9.374119391916947e-06, "loss": 2.7912, "mean_token_accuracy": 0.4508859452950284, "step": 3376 }, { "epoch": 0.6260659992584353, "grad_norm": 6.1875, "learning_rate": 9.373934000741565e-06, "loss": 2.4233, "mean_token_accuracy": 0.5111460549440385, "step": 3377 }, { "epoch": 0.6262513904338154, "grad_norm": 5.48046875, "learning_rate": 9.373748609566186e-06, "loss": 2.9703, "mean_token_accuracy": 0.42111818433434817, "step": 3378 }, { "epoch": 0.6264367816091954, "grad_norm": 6.0546875, "learning_rate": 9.373563218390805e-06, "loss": 2.5065, "mean_token_accuracy": 0.4824498049978333, "step": 3379 }, { "epoch": 0.6266221727845754, "grad_norm": 8.734375, "learning_rate": 9.373377827215425e-06, "loss": 2.7124, "mean_token_accuracy": 0.4636157246982913, "step": 3380 }, { "epoch": 0.6268075639599555, "grad_norm": 6.43359375, "learning_rate": 9.373192436040046e-06, "loss": 2.4534, "mean_token_accuracy": 0.48397490249279296, "step": 3381 }, { "epoch": 0.6269929551353356, "grad_norm": 6.84375, "learning_rate": 9.373007044864664e-06, "loss": 2.6778, "mean_token_accuracy": 0.4515189346650021, "step": 3382 }, { "epoch": 0.6271783463107156, "grad_norm": 7.12890625, "learning_rate": 9.372821653689285e-06, "loss": 3.0729, "mean_token_accuracy": 0.4134971141038923, "step": 3383 }, { "epoch": 0.6273637374860956, "grad_norm": 6.58203125, "learning_rate": 9.372636262513905e-06, "loss": 2.471, "mean_token_accuracy": 0.49773276904474, "step": 3384 }, { "epoch": 0.6275491286614757, "grad_norm": 5.4140625, "learning_rate": 9.372450871338526e-06, "loss": 2.8204, "mean_token_accuracy": 0.4537483559842174, "step": 3385 }, { "epoch": 0.6277345198368558, "grad_norm": 6.85546875, "learning_rate": 9.372265480163145e-06, "loss": 3.3555, "mean_token_accuracy": 0.38568665377176015, "step": 3386 }, { "epoch": 0.6279199110122358, "grad_norm": 7.2421875, "learning_rate": 9.372080088987765e-06, "loss": 2.6807, "mean_token_accuracy": 0.4566077003121748, "step": 3387 }, { "epoch": 0.6281053021876158, "grad_norm": 9.921875, "learning_rate": 9.371894697812386e-06, "loss": 2.8964, "mean_token_accuracy": 0.43160588611644274, "step": 3388 }, { "epoch": 0.6282906933629959, "grad_norm": 6.61328125, "learning_rate": 9.371709306637004e-06, "loss": 3.0339, "mean_token_accuracy": 0.4283946798639035, "step": 3389 }, { "epoch": 0.628476084538376, "grad_norm": 6.99609375, "learning_rate": 9.371523915461625e-06, "loss": 2.6339, "mean_token_accuracy": 0.46181924368520955, "step": 3390 }, { "epoch": 0.6286614757137561, "grad_norm": 4.53125, "learning_rate": 9.371338524286244e-06, "loss": 2.8568, "mean_token_accuracy": 0.43065971780856743, "step": 3391 }, { "epoch": 0.628846866889136, "grad_norm": 5.05078125, "learning_rate": 9.371153133110866e-06, "loss": 2.8206, "mean_token_accuracy": 0.4509913432002234, "step": 3392 }, { "epoch": 0.6290322580645161, "grad_norm": 4.921875, "learning_rate": 9.370967741935485e-06, "loss": 2.823, "mean_token_accuracy": 0.45094746941712643, "step": 3393 }, { "epoch": 0.6292176492398962, "grad_norm": 5.8359375, "learning_rate": 9.370782350760105e-06, "loss": 3.0987, "mean_token_accuracy": 0.4206886182528943, "step": 3394 }, { "epoch": 0.6294030404152763, "grad_norm": 7.1796875, "learning_rate": 9.370596959584724e-06, "loss": 2.0849, "mean_token_accuracy": 0.5452862212621509, "step": 3395 }, { "epoch": 0.6295884315906563, "grad_norm": 5.4375, "learning_rate": 9.370411568409344e-06, "loss": 2.9556, "mean_token_accuracy": 0.4313694766530868, "step": 3396 }, { "epoch": 0.6297738227660363, "grad_norm": 6.7421875, "learning_rate": 9.370226177233965e-06, "loss": 2.9863, "mean_token_accuracy": 0.4477933261571582, "step": 3397 }, { "epoch": 0.6299592139414164, "grad_norm": 6.2890625, "learning_rate": 9.370040786058584e-06, "loss": 3.0477, "mean_token_accuracy": 0.41886670959433353, "step": 3398 }, { "epoch": 0.6301446051167965, "grad_norm": 7.265625, "learning_rate": 9.369855394883204e-06, "loss": 3.1983, "mean_token_accuracy": 0.4104352318222911, "step": 3399 }, { "epoch": 0.6303299962921765, "grad_norm": 6.0078125, "learning_rate": 9.369670003707825e-06, "loss": 2.8942, "mean_token_accuracy": 0.42099284334885834, "step": 3400 }, { "epoch": 0.6305153874675565, "grad_norm": 5.25, "learning_rate": 9.369484612532445e-06, "loss": 2.9969, "mean_token_accuracy": 0.4339466039562872, "step": 3401 }, { "epoch": 0.6307007786429366, "grad_norm": 7.46875, "learning_rate": 9.369299221357064e-06, "loss": 3.0149, "mean_token_accuracy": 0.4330156643663052, "step": 3402 }, { "epoch": 0.6308861698183167, "grad_norm": 7.26953125, "learning_rate": 9.369113830181685e-06, "loss": 3.1874, "mean_token_accuracy": 0.4159938059949121, "step": 3403 }, { "epoch": 0.6310715609936967, "grad_norm": 6.59765625, "learning_rate": 9.368928439006303e-06, "loss": 2.5668, "mean_token_accuracy": 0.45945689327763006, "step": 3404 }, { "epoch": 0.6312569521690767, "grad_norm": 7.23046875, "learning_rate": 9.368743047830924e-06, "loss": 2.5722, "mean_token_accuracy": 0.49531893646236425, "step": 3405 }, { "epoch": 0.6314423433444568, "grad_norm": 9.359375, "learning_rate": 9.368557656655544e-06, "loss": 2.8377, "mean_token_accuracy": 0.436622691292876, "step": 3406 }, { "epoch": 0.6316277345198369, "grad_norm": 5.88671875, "learning_rate": 9.368372265480163e-06, "loss": 3.3391, "mean_token_accuracy": 0.3802051155319412, "step": 3407 }, { "epoch": 0.6318131256952169, "grad_norm": 8.625, "learning_rate": 9.368186874304784e-06, "loss": 2.7484, "mean_token_accuracy": 0.45124890446976335, "step": 3408 }, { "epoch": 0.6319985168705969, "grad_norm": 5.72265625, "learning_rate": 9.368001483129404e-06, "loss": 2.7559, "mean_token_accuracy": 0.44590581247348327, "step": 3409 }, { "epoch": 0.632183908045977, "grad_norm": 9.8046875, "learning_rate": 9.367816091954025e-06, "loss": 2.2211, "mean_token_accuracy": 0.5271361929684834, "step": 3410 }, { "epoch": 0.6323692992213571, "grad_norm": 7.87109375, "learning_rate": 9.367630700778643e-06, "loss": 2.7195, "mean_token_accuracy": 0.4528119386486113, "step": 3411 }, { "epoch": 0.6325546903967371, "grad_norm": 6.83203125, "learning_rate": 9.367445309603264e-06, "loss": 3.4319, "mean_token_accuracy": 0.3698408416509307, "step": 3412 }, { "epoch": 0.6327400815721171, "grad_norm": 6.3125, "learning_rate": 9.367259918427883e-06, "loss": 2.8753, "mean_token_accuracy": 0.43326460481099655, "step": 3413 }, { "epoch": 0.6329254727474972, "grad_norm": 8.546875, "learning_rate": 9.367074527252503e-06, "loss": 2.911, "mean_token_accuracy": 0.4267657023716445, "step": 3414 }, { "epoch": 0.6331108639228773, "grad_norm": 5.71875, "learning_rate": 9.366889136077124e-06, "loss": 2.8491, "mean_token_accuracy": 0.4377935847537243, "step": 3415 }, { "epoch": 0.6332962550982574, "grad_norm": 5.57421875, "learning_rate": 9.366703744901744e-06, "loss": 3.1554, "mean_token_accuracy": 0.41284728366660767, "step": 3416 }, { "epoch": 0.6334816462736373, "grad_norm": 5.2265625, "learning_rate": 9.366518353726363e-06, "loss": 2.8792, "mean_token_accuracy": 0.43496309963099633, "step": 3417 }, { "epoch": 0.6336670374490174, "grad_norm": 8.2890625, "learning_rate": 9.366332962550983e-06, "loss": 2.5987, "mean_token_accuracy": 0.4505840071877808, "step": 3418 }, { "epoch": 0.6338524286243975, "grad_norm": 5.77734375, "learning_rate": 9.366147571375604e-06, "loss": 2.9068, "mean_token_accuracy": 0.43318025258323767, "step": 3419 }, { "epoch": 0.6340378197997776, "grad_norm": 5.296875, "learning_rate": 9.365962180200223e-06, "loss": 2.8174, "mean_token_accuracy": 0.4445637007620479, "step": 3420 }, { "epoch": 0.6342232109751575, "grad_norm": 6.2890625, "learning_rate": 9.365776789024843e-06, "loss": 2.3499, "mean_token_accuracy": 0.4919716206123973, "step": 3421 }, { "epoch": 0.6344086021505376, "grad_norm": 6.53125, "learning_rate": 9.365591397849462e-06, "loss": 2.6285, "mean_token_accuracy": 0.4574826352346874, "step": 3422 }, { "epoch": 0.6345939933259177, "grad_norm": 6.4296875, "learning_rate": 9.365406006674082e-06, "loss": 2.64, "mean_token_accuracy": 0.46707583480703213, "step": 3423 }, { "epoch": 0.6347793845012978, "grad_norm": 7.98828125, "learning_rate": 9.365220615498703e-06, "loss": 2.4538, "mean_token_accuracy": 0.45767987549383454, "step": 3424 }, { "epoch": 0.6349647756766778, "grad_norm": 9.109375, "learning_rate": 9.365035224323323e-06, "loss": 2.4588, "mean_token_accuracy": 0.47116652679534826, "step": 3425 }, { "epoch": 0.6351501668520578, "grad_norm": 6.43359375, "learning_rate": 9.364849833147944e-06, "loss": 2.8657, "mean_token_accuracy": 0.4372764435360245, "step": 3426 }, { "epoch": 0.6353355580274379, "grad_norm": 6.38671875, "learning_rate": 9.364664441972563e-06, "loss": 3.0479, "mean_token_accuracy": 0.42502555117535407, "step": 3427 }, { "epoch": 0.635520949202818, "grad_norm": 6.81640625, "learning_rate": 9.364479050797183e-06, "loss": 3.1238, "mean_token_accuracy": 0.41693548387096774, "step": 3428 }, { "epoch": 0.635706340378198, "grad_norm": 8.34375, "learning_rate": 9.364293659621802e-06, "loss": 2.6401, "mean_token_accuracy": 0.47024151811385856, "step": 3429 }, { "epoch": 0.635891731553578, "grad_norm": 9.5390625, "learning_rate": 9.364108268446423e-06, "loss": 2.7015, "mean_token_accuracy": 0.42945050199719315, "step": 3430 }, { "epoch": 0.6360771227289581, "grad_norm": 5.296875, "learning_rate": 9.363922877271043e-06, "loss": 3.0209, "mean_token_accuracy": 0.42886755400392756, "step": 3431 }, { "epoch": 0.6362625139043382, "grad_norm": 7.28515625, "learning_rate": 9.363737486095664e-06, "loss": 2.7207, "mean_token_accuracy": 0.44628422425032593, "step": 3432 }, { "epoch": 0.6364479050797182, "grad_norm": 5.9375, "learning_rate": 9.363552094920282e-06, "loss": 2.9606, "mean_token_accuracy": 0.4298418972332016, "step": 3433 }, { "epoch": 0.6366332962550982, "grad_norm": 7.65234375, "learning_rate": 9.363366703744903e-06, "loss": 2.6574, "mean_token_accuracy": 0.44995836802664446, "step": 3434 }, { "epoch": 0.6368186874304783, "grad_norm": 7.69140625, "learning_rate": 9.363181312569523e-06, "loss": 2.6233, "mean_token_accuracy": 0.47205138718053846, "step": 3435 }, { "epoch": 0.6370040786058584, "grad_norm": 6.51171875, "learning_rate": 9.362995921394142e-06, "loss": 3.042, "mean_token_accuracy": 0.45814415437003403, "step": 3436 }, { "epoch": 0.6371894697812384, "grad_norm": 7.6171875, "learning_rate": 9.362810530218763e-06, "loss": 2.9216, "mean_token_accuracy": 0.4396541786743516, "step": 3437 }, { "epoch": 0.6373748609566184, "grad_norm": 5.59375, "learning_rate": 9.362625139043381e-06, "loss": 2.9363, "mean_token_accuracy": 0.42065188351589633, "step": 3438 }, { "epoch": 0.6375602521319985, "grad_norm": 5.125, "learning_rate": 9.362439747868002e-06, "loss": 2.992, "mean_token_accuracy": 0.42803487592219985, "step": 3439 }, { "epoch": 0.6377456433073786, "grad_norm": 6.515625, "learning_rate": 9.362254356692622e-06, "loss": 2.9044, "mean_token_accuracy": 0.42348104157149385, "step": 3440 }, { "epoch": 0.6379310344827587, "grad_norm": 7.8046875, "learning_rate": 9.362068965517243e-06, "loss": 2.9058, "mean_token_accuracy": 0.4381881533101045, "step": 3441 }, { "epoch": 0.6381164256581386, "grad_norm": 5.99609375, "learning_rate": 9.361883574341862e-06, "loss": 3.1916, "mean_token_accuracy": 0.39945581756931847, "step": 3442 }, { "epoch": 0.6383018168335187, "grad_norm": 6.5234375, "learning_rate": 9.361698183166482e-06, "loss": 2.9828, "mean_token_accuracy": 0.4289079229122056, "step": 3443 }, { "epoch": 0.6384872080088988, "grad_norm": 6.25, "learning_rate": 9.361512791991103e-06, "loss": 2.6105, "mean_token_accuracy": 0.47671568627450983, "step": 3444 }, { "epoch": 0.6386725991842789, "grad_norm": 6.74609375, "learning_rate": 9.361327400815721e-06, "loss": 2.9073, "mean_token_accuracy": 0.40586001085187196, "step": 3445 }, { "epoch": 0.6388579903596588, "grad_norm": 6.45703125, "learning_rate": 9.361142009640342e-06, "loss": 2.4019, "mean_token_accuracy": 0.5067737681333173, "step": 3446 }, { "epoch": 0.6390433815350389, "grad_norm": 6.69921875, "learning_rate": 9.36095661846496e-06, "loss": 2.7421, "mean_token_accuracy": 0.44170537096582113, "step": 3447 }, { "epoch": 0.639228772710419, "grad_norm": 7.24609375, "learning_rate": 9.360771227289583e-06, "loss": 2.6149, "mean_token_accuracy": 0.46155446883779006, "step": 3448 }, { "epoch": 0.6394141638857991, "grad_norm": 6.859375, "learning_rate": 9.360585836114202e-06, "loss": 2.679, "mean_token_accuracy": 0.4713182221096619, "step": 3449 }, { "epoch": 0.639599555061179, "grad_norm": 5.70703125, "learning_rate": 9.360400444938822e-06, "loss": 2.4673, "mean_token_accuracy": 0.48581647755303403, "step": 3450 }, { "epoch": 0.6397849462365591, "grad_norm": 10.8125, "learning_rate": 9.360215053763441e-06, "loss": 2.0817, "mean_token_accuracy": 0.5285530771925727, "step": 3451 }, { "epoch": 0.6399703374119392, "grad_norm": 5.14453125, "learning_rate": 9.360029662588062e-06, "loss": 2.5627, "mean_token_accuracy": 0.46903520208604954, "step": 3452 }, { "epoch": 0.6401557285873193, "grad_norm": 10.59375, "learning_rate": 9.359844271412682e-06, "loss": 2.7413, "mean_token_accuracy": 0.44132718421726874, "step": 3453 }, { "epoch": 0.6403411197626993, "grad_norm": 8.25, "learning_rate": 9.3596588802373e-06, "loss": 2.1288, "mean_token_accuracy": 0.530690985619081, "step": 3454 }, { "epoch": 0.6405265109380793, "grad_norm": 5.68359375, "learning_rate": 9.359473489061921e-06, "loss": 2.8097, "mean_token_accuracy": 0.4371443352302121, "step": 3455 }, { "epoch": 0.6407119021134594, "grad_norm": 7.2890625, "learning_rate": 9.359288097886542e-06, "loss": 2.7236, "mean_token_accuracy": 0.4595070422535211, "step": 3456 }, { "epoch": 0.6408972932888395, "grad_norm": 8.078125, "learning_rate": 9.359102706711162e-06, "loss": 2.8474, "mean_token_accuracy": 0.45810428119273694, "step": 3457 }, { "epoch": 0.6410826844642195, "grad_norm": 6.2265625, "learning_rate": 9.358917315535781e-06, "loss": 2.5827, "mean_token_accuracy": 0.46051919956733367, "step": 3458 }, { "epoch": 0.6412680756395995, "grad_norm": 5.42578125, "learning_rate": 9.358731924360402e-06, "loss": 2.9487, "mean_token_accuracy": 0.43360737419033385, "step": 3459 }, { "epoch": 0.6414534668149796, "grad_norm": 7.15234375, "learning_rate": 9.35854653318502e-06, "loss": 2.611, "mean_token_accuracy": 0.4728219594228057, "step": 3460 }, { "epoch": 0.6416388579903597, "grad_norm": 7.2890625, "learning_rate": 9.35836114200964e-06, "loss": 2.8574, "mean_token_accuracy": 0.41953610712577716, "step": 3461 }, { "epoch": 0.6418242491657397, "grad_norm": 6.40234375, "learning_rate": 9.358175750834261e-06, "loss": 2.9932, "mean_token_accuracy": 0.4303088275485105, "step": 3462 }, { "epoch": 0.6420096403411197, "grad_norm": 5.97265625, "learning_rate": 9.35799035965888e-06, "loss": 2.9876, "mean_token_accuracy": 0.417957027967831, "step": 3463 }, { "epoch": 0.6421950315164998, "grad_norm": 7.6171875, "learning_rate": 9.357804968483502e-06, "loss": 3.167, "mean_token_accuracy": 0.4123508157232016, "step": 3464 }, { "epoch": 0.6423804226918799, "grad_norm": 6.31640625, "learning_rate": 9.357619577308121e-06, "loss": 2.8087, "mean_token_accuracy": 0.450201126307321, "step": 3465 }, { "epoch": 0.64256581386726, "grad_norm": 6.37109375, "learning_rate": 9.357434186132742e-06, "loss": 2.9415, "mean_token_accuracy": 0.43822721924434566, "step": 3466 }, { "epoch": 0.6427512050426399, "grad_norm": 10.6640625, "learning_rate": 9.35724879495736e-06, "loss": 2.1166, "mean_token_accuracy": 0.5081081081081081, "step": 3467 }, { "epoch": 0.64293659621802, "grad_norm": 7.3671875, "learning_rate": 9.357063403781981e-06, "loss": 2.8479, "mean_token_accuracy": 0.444538407329105, "step": 3468 }, { "epoch": 0.6431219873934001, "grad_norm": 6.68359375, "learning_rate": 9.356878012606601e-06, "loss": 2.5465, "mean_token_accuracy": 0.47539884703043306, "step": 3469 }, { "epoch": 0.6433073785687802, "grad_norm": 7.33203125, "learning_rate": 9.35669262143122e-06, "loss": 3.033, "mean_token_accuracy": 0.4175976045145687, "step": 3470 }, { "epoch": 0.6434927697441601, "grad_norm": 7.2578125, "learning_rate": 9.35650723025584e-06, "loss": 2.9691, "mean_token_accuracy": 0.4239418913256041, "step": 3471 }, { "epoch": 0.6436781609195402, "grad_norm": 5.11328125, "learning_rate": 9.35632183908046e-06, "loss": 2.6975, "mean_token_accuracy": 0.47306485355648537, "step": 3472 }, { "epoch": 0.6438635520949203, "grad_norm": 7.625, "learning_rate": 9.356136447905082e-06, "loss": 2.9957, "mean_token_accuracy": 0.42177998894416807, "step": 3473 }, { "epoch": 0.6440489432703004, "grad_norm": 5.9609375, "learning_rate": 9.3559510567297e-06, "loss": 2.8651, "mean_token_accuracy": 0.4343128781331029, "step": 3474 }, { "epoch": 0.6442343344456803, "grad_norm": 6.6484375, "learning_rate": 9.355765665554321e-06, "loss": 2.6631, "mean_token_accuracy": 0.4327756746855538, "step": 3475 }, { "epoch": 0.6444197256210604, "grad_norm": 5.8515625, "learning_rate": 9.35558027437894e-06, "loss": 3.2914, "mean_token_accuracy": 0.3833744543556652, "step": 3476 }, { "epoch": 0.6446051167964405, "grad_norm": 7.13671875, "learning_rate": 9.35539488320356e-06, "loss": 2.7057, "mean_token_accuracy": 0.4630716934487021, "step": 3477 }, { "epoch": 0.6447905079718206, "grad_norm": 7.453125, "learning_rate": 9.35520949202818e-06, "loss": 2.6218, "mean_token_accuracy": 0.47800216372160115, "step": 3478 }, { "epoch": 0.6449758991472005, "grad_norm": 5.609375, "learning_rate": 9.3550241008528e-06, "loss": 2.5635, "mean_token_accuracy": 0.48227341702303766, "step": 3479 }, { "epoch": 0.6451612903225806, "grad_norm": 5.51171875, "learning_rate": 9.35483870967742e-06, "loss": 2.6569, "mean_token_accuracy": 0.4987199180747568, "step": 3480 }, { "epoch": 0.6453466814979607, "grad_norm": 7.7734375, "learning_rate": 9.35465331850204e-06, "loss": 2.7052, "mean_token_accuracy": 0.46191391805945864, "step": 3481 }, { "epoch": 0.6455320726733408, "grad_norm": 10.234375, "learning_rate": 9.354467927326661e-06, "loss": 2.9382, "mean_token_accuracy": 0.4238544474393531, "step": 3482 }, { "epoch": 0.6457174638487208, "grad_norm": 5.73828125, "learning_rate": 9.35428253615128e-06, "loss": 2.4242, "mean_token_accuracy": 0.5008727795461546, "step": 3483 }, { "epoch": 0.6459028550241008, "grad_norm": 9.2109375, "learning_rate": 9.3540971449759e-06, "loss": 2.6135, "mean_token_accuracy": 0.4593809364174768, "step": 3484 }, { "epoch": 0.6460882461994809, "grad_norm": 6.9453125, "learning_rate": 9.353911753800519e-06, "loss": 3.5662, "mean_token_accuracy": 0.3657446551515884, "step": 3485 }, { "epoch": 0.646273637374861, "grad_norm": 7.0390625, "learning_rate": 9.35372636262514e-06, "loss": 2.7669, "mean_token_accuracy": 0.4478002378121284, "step": 3486 }, { "epoch": 0.646459028550241, "grad_norm": 5.45703125, "learning_rate": 9.35354097144976e-06, "loss": 2.672, "mean_token_accuracy": 0.4561678146524734, "step": 3487 }, { "epoch": 0.646644419725621, "grad_norm": 4.61328125, "learning_rate": 9.353355580274379e-06, "loss": 3.1046, "mean_token_accuracy": 0.4125609634716831, "step": 3488 }, { "epoch": 0.6468298109010011, "grad_norm": 9.1640625, "learning_rate": 9.353170189099e-06, "loss": 2.2336, "mean_token_accuracy": 0.49825970548862114, "step": 3489 }, { "epoch": 0.6470152020763812, "grad_norm": 6.0546875, "learning_rate": 9.35298479792362e-06, "loss": 2.8191, "mean_token_accuracy": 0.4522131378629828, "step": 3490 }, { "epoch": 0.6472005932517613, "grad_norm": 6.5390625, "learning_rate": 9.35279940674824e-06, "loss": 2.5341, "mean_token_accuracy": 0.49609282500591995, "step": 3491 }, { "epoch": 0.6473859844271412, "grad_norm": 5.953125, "learning_rate": 9.352614015572859e-06, "loss": 3.1905, "mean_token_accuracy": 0.4015497967479675, "step": 3492 }, { "epoch": 0.6475713756025213, "grad_norm": 6.8359375, "learning_rate": 9.35242862439748e-06, "loss": 3.4976, "mean_token_accuracy": 0.3823094004441155, "step": 3493 }, { "epoch": 0.6477567667779014, "grad_norm": 6.34375, "learning_rate": 9.352243233222098e-06, "loss": 3.1655, "mean_token_accuracy": 0.4075322561897013, "step": 3494 }, { "epoch": 0.6479421579532815, "grad_norm": 6.6484375, "learning_rate": 9.352057842046719e-06, "loss": 2.9146, "mean_token_accuracy": 0.4371799062874578, "step": 3495 }, { "epoch": 0.6481275491286614, "grad_norm": 9.140625, "learning_rate": 9.35187245087134e-06, "loss": 3.2363, "mean_token_accuracy": 0.3997482693517936, "step": 3496 }, { "epoch": 0.6483129403040415, "grad_norm": 6.51953125, "learning_rate": 9.35168705969596e-06, "loss": 2.5667, "mean_token_accuracy": 0.4763842643975829, "step": 3497 }, { "epoch": 0.6484983314794216, "grad_norm": 6.015625, "learning_rate": 9.351501668520579e-06, "loss": 3.1906, "mean_token_accuracy": 0.43952318460192474, "step": 3498 }, { "epoch": 0.6486837226548017, "grad_norm": 5.328125, "learning_rate": 9.3513162773452e-06, "loss": 2.4719, "mean_token_accuracy": 0.4950111515436084, "step": 3499 }, { "epoch": 0.6488691138301816, "grad_norm": 9.34375, "learning_rate": 9.35113088616982e-06, "loss": 2.8902, "mean_token_accuracy": 0.4483165977554637, "step": 3500 }, { "epoch": 0.6490545050055617, "grad_norm": 7.40625, "learning_rate": 9.350945494994438e-06, "loss": 2.7395, "mean_token_accuracy": 0.4655295972481009, "step": 3501 }, { "epoch": 0.6492398961809418, "grad_norm": 4.96484375, "learning_rate": 9.350760103819059e-06, "loss": 2.8185, "mean_token_accuracy": 0.45587241519124205, "step": 3502 }, { "epoch": 0.6494252873563219, "grad_norm": 6.4921875, "learning_rate": 9.350574712643678e-06, "loss": 2.7182, "mean_token_accuracy": 0.46024662360540225, "step": 3503 }, { "epoch": 0.6496106785317018, "grad_norm": 6.37109375, "learning_rate": 9.350389321468298e-06, "loss": 2.6893, "mean_token_accuracy": 0.4928236464402444, "step": 3504 }, { "epoch": 0.6497960697070819, "grad_norm": 5.8984375, "learning_rate": 9.350203930292919e-06, "loss": 2.4816, "mean_token_accuracy": 0.47257634153572486, "step": 3505 }, { "epoch": 0.649981460882462, "grad_norm": 5.99609375, "learning_rate": 9.35001853911754e-06, "loss": 2.4901, "mean_token_accuracy": 0.4763820399688554, "step": 3506 }, { "epoch": 0.6501668520578421, "grad_norm": 7.16015625, "learning_rate": 9.34983314794216e-06, "loss": 2.4037, "mean_token_accuracy": 0.46898115008351227, "step": 3507 }, { "epoch": 0.650352243233222, "grad_norm": 6.37890625, "learning_rate": 9.349647756766779e-06, "loss": 2.6365, "mean_token_accuracy": 0.4667563125653668, "step": 3508 }, { "epoch": 0.6505376344086021, "grad_norm": 5.93359375, "learning_rate": 9.349462365591399e-06, "loss": 3.0668, "mean_token_accuracy": 0.41711040113596026, "step": 3509 }, { "epoch": 0.6507230255839822, "grad_norm": 11.3671875, "learning_rate": 9.349276974416018e-06, "loss": 2.9947, "mean_token_accuracy": 0.42299602710913764, "step": 3510 }, { "epoch": 0.6509084167593623, "grad_norm": 10.796875, "learning_rate": 9.349091583240638e-06, "loss": 2.9209, "mean_token_accuracy": 0.4326204586446792, "step": 3511 }, { "epoch": 0.6510938079347423, "grad_norm": 5.5546875, "learning_rate": 9.348906192065259e-06, "loss": 2.8007, "mean_token_accuracy": 0.43424317617866004, "step": 3512 }, { "epoch": 0.6512791991101223, "grad_norm": 6.32421875, "learning_rate": 9.34872080088988e-06, "loss": 3.0032, "mean_token_accuracy": 0.4412568306010929, "step": 3513 }, { "epoch": 0.6514645902855024, "grad_norm": 10.3984375, "learning_rate": 9.348535409714498e-06, "loss": 2.7403, "mean_token_accuracy": 0.45398080180688877, "step": 3514 }, { "epoch": 0.6516499814608825, "grad_norm": 6.60546875, "learning_rate": 9.348350018539119e-06, "loss": 2.8257, "mean_token_accuracy": 0.4348434716212777, "step": 3515 }, { "epoch": 0.6518353726362626, "grad_norm": 5.42578125, "learning_rate": 9.348164627363739e-06, "loss": 2.3386, "mean_token_accuracy": 0.5146098683506368, "step": 3516 }, { "epoch": 0.6520207638116425, "grad_norm": 8.3046875, "learning_rate": 9.347979236188358e-06, "loss": 2.7749, "mean_token_accuracy": 0.44810450092290216, "step": 3517 }, { "epoch": 0.6522061549870226, "grad_norm": 9.5625, "learning_rate": 9.347793845012978e-06, "loss": 2.8623, "mean_token_accuracy": 0.4358325957769629, "step": 3518 }, { "epoch": 0.6523915461624027, "grad_norm": 6.359375, "learning_rate": 9.347608453837597e-06, "loss": 3.0063, "mean_token_accuracy": 0.4304139172165567, "step": 3519 }, { "epoch": 0.6525769373377828, "grad_norm": 5.15625, "learning_rate": 9.347423062662218e-06, "loss": 2.7654, "mean_token_accuracy": 0.4473443820957849, "step": 3520 }, { "epoch": 0.6527623285131627, "grad_norm": 7.5625, "learning_rate": 9.347237671486838e-06, "loss": 3.2825, "mean_token_accuracy": 0.38628239499553174, "step": 3521 }, { "epoch": 0.6529477196885428, "grad_norm": 10.4609375, "learning_rate": 9.347052280311459e-06, "loss": 2.5936, "mean_token_accuracy": 0.46189454668623686, "step": 3522 }, { "epoch": 0.6531331108639229, "grad_norm": 6.03125, "learning_rate": 9.346866889136077e-06, "loss": 2.7846, "mean_token_accuracy": 0.4692909651410956, "step": 3523 }, { "epoch": 0.653318502039303, "grad_norm": 6.234375, "learning_rate": 9.346681497960698e-06, "loss": 2.7169, "mean_token_accuracy": 0.4525537450362865, "step": 3524 }, { "epoch": 0.6535038932146829, "grad_norm": 5.1953125, "learning_rate": 9.346496106785318e-06, "loss": 3.1949, "mean_token_accuracy": 0.40904212503353904, "step": 3525 }, { "epoch": 0.653689284390063, "grad_norm": 5.85546875, "learning_rate": 9.346310715609937e-06, "loss": 3.1406, "mean_token_accuracy": 0.43195342820181115, "step": 3526 }, { "epoch": 0.6538746755654431, "grad_norm": 5.17578125, "learning_rate": 9.346125324434558e-06, "loss": 3.0217, "mean_token_accuracy": 0.41850097520200613, "step": 3527 }, { "epoch": 0.6540600667408232, "grad_norm": 7.1875, "learning_rate": 9.345939933259176e-06, "loss": 3.3386, "mean_token_accuracy": 0.3845357776463631, "step": 3528 }, { "epoch": 0.6542454579162031, "grad_norm": 4.8828125, "learning_rate": 9.345754542083799e-06, "loss": 3.3616, "mean_token_accuracy": 0.3970710909259431, "step": 3529 }, { "epoch": 0.6544308490915832, "grad_norm": 6.96875, "learning_rate": 9.345569150908417e-06, "loss": 2.746, "mean_token_accuracy": 0.44024289263041677, "step": 3530 }, { "epoch": 0.6546162402669633, "grad_norm": 5.8203125, "learning_rate": 9.345383759733038e-06, "loss": 2.6817, "mean_token_accuracy": 0.46041506533435816, "step": 3531 }, { "epoch": 0.6548016314423434, "grad_norm": 7.046875, "learning_rate": 9.345198368557657e-06, "loss": 2.7184, "mean_token_accuracy": 0.44545743665853915, "step": 3532 }, { "epoch": 0.6549870226177233, "grad_norm": 8.0703125, "learning_rate": 9.345012977382277e-06, "loss": 2.948, "mean_token_accuracy": 0.42253978564468986, "step": 3533 }, { "epoch": 0.6551724137931034, "grad_norm": 6.4140625, "learning_rate": 9.344827586206898e-06, "loss": 2.5301, "mean_token_accuracy": 0.470106810930781, "step": 3534 }, { "epoch": 0.6553578049684835, "grad_norm": 6.37109375, "learning_rate": 9.344642195031517e-06, "loss": 2.5228, "mean_token_accuracy": 0.49033329379670265, "step": 3535 }, { "epoch": 0.6555431961438636, "grad_norm": 5.30078125, "learning_rate": 9.344456803856137e-06, "loss": 2.3649, "mean_token_accuracy": 0.4945665298237141, "step": 3536 }, { "epoch": 0.6557285873192435, "grad_norm": 6.3671875, "learning_rate": 9.344271412680758e-06, "loss": 2.6566, "mean_token_accuracy": 0.4783092324805339, "step": 3537 }, { "epoch": 0.6559139784946236, "grad_norm": 6.16015625, "learning_rate": 9.344086021505378e-06, "loss": 2.6125, "mean_token_accuracy": 0.46535962110364576, "step": 3538 }, { "epoch": 0.6560993696700037, "grad_norm": 6.12890625, "learning_rate": 9.343900630329997e-06, "loss": 2.7781, "mean_token_accuracy": 0.45773028540511296, "step": 3539 }, { "epoch": 0.6562847608453838, "grad_norm": 7.63671875, "learning_rate": 9.343715239154617e-06, "loss": 3.2221, "mean_token_accuracy": 0.37738353849867246, "step": 3540 }, { "epoch": 0.6564701520207639, "grad_norm": 5.53125, "learning_rate": 9.343529847979236e-06, "loss": 2.6657, "mean_token_accuracy": 0.47190366972477066, "step": 3541 }, { "epoch": 0.6566555431961438, "grad_norm": 6.375, "learning_rate": 9.343344456803857e-06, "loss": 3.0669, "mean_token_accuracy": 0.4262676939616556, "step": 3542 }, { "epoch": 0.6568409343715239, "grad_norm": 6.69921875, "learning_rate": 9.343159065628477e-06, "loss": 2.1978, "mean_token_accuracy": 0.5372761349437735, "step": 3543 }, { "epoch": 0.657026325546904, "grad_norm": 7.1953125, "learning_rate": 9.342973674453096e-06, "loss": 2.8334, "mean_token_accuracy": 0.43960423251339836, "step": 3544 }, { "epoch": 0.6572117167222841, "grad_norm": 6.515625, "learning_rate": 9.342788283277718e-06, "loss": 3.3283, "mean_token_accuracy": 0.4047042545831892, "step": 3545 }, { "epoch": 0.657397107897664, "grad_norm": 11.53125, "learning_rate": 9.342602892102337e-06, "loss": 2.5772, "mean_token_accuracy": 0.4546969114443163, "step": 3546 }, { "epoch": 0.6575824990730441, "grad_norm": 8.4140625, "learning_rate": 9.342417500926957e-06, "loss": 2.6725, "mean_token_accuracy": 0.4457540851861773, "step": 3547 }, { "epoch": 0.6577678902484242, "grad_norm": 5.33203125, "learning_rate": 9.342232109751576e-06, "loss": 2.8483, "mean_token_accuracy": 0.4439712284686731, "step": 3548 }, { "epoch": 0.6579532814238043, "grad_norm": 6.4453125, "learning_rate": 9.342046718576197e-06, "loss": 3.2162, "mean_token_accuracy": 0.4350282485875706, "step": 3549 }, { "epoch": 0.6581386725991842, "grad_norm": 7.57421875, "learning_rate": 9.341861327400817e-06, "loss": 3.2735, "mean_token_accuracy": 0.39293333333333336, "step": 3550 }, { "epoch": 0.6583240637745643, "grad_norm": 11.0859375, "learning_rate": 9.341675936225436e-06, "loss": 2.5121, "mean_token_accuracy": 0.4711549759135237, "step": 3551 }, { "epoch": 0.6585094549499444, "grad_norm": 5.39453125, "learning_rate": 9.341490545050056e-06, "loss": 2.9018, "mean_token_accuracy": 0.4497547621763431, "step": 3552 }, { "epoch": 0.6586948461253245, "grad_norm": 9.5625, "learning_rate": 9.341305153874677e-06, "loss": 2.9469, "mean_token_accuracy": 0.4166553272554089, "step": 3553 }, { "epoch": 0.6588802373007044, "grad_norm": 7.9765625, "learning_rate": 9.341119762699297e-06, "loss": 2.7343, "mean_token_accuracy": 0.4546106067845198, "step": 3554 }, { "epoch": 0.6590656284760845, "grad_norm": 8.3359375, "learning_rate": 9.340934371523916e-06, "loss": 2.4479, "mean_token_accuracy": 0.48914141414141415, "step": 3555 }, { "epoch": 0.6592510196514646, "grad_norm": 9.1484375, "learning_rate": 9.340748980348537e-06, "loss": 2.8444, "mean_token_accuracy": 0.4476233050216063, "step": 3556 }, { "epoch": 0.6594364108268447, "grad_norm": 6.23046875, "learning_rate": 9.340563589173155e-06, "loss": 2.8782, "mean_token_accuracy": 0.4375287092328893, "step": 3557 }, { "epoch": 0.6596218020022246, "grad_norm": 6.015625, "learning_rate": 9.340378197997776e-06, "loss": 2.8338, "mean_token_accuracy": 0.44735807001442246, "step": 3558 }, { "epoch": 0.6598071931776047, "grad_norm": 6.58984375, "learning_rate": 9.340192806822396e-06, "loss": 2.8294, "mean_token_accuracy": 0.4365513809405325, "step": 3559 }, { "epoch": 0.6599925843529848, "grad_norm": 5.046875, "learning_rate": 9.340007415647015e-06, "loss": 2.9411, "mean_token_accuracy": 0.43771464252263503, "step": 3560 }, { "epoch": 0.6601779755283649, "grad_norm": 8.265625, "learning_rate": 9.339822024471636e-06, "loss": 2.851, "mean_token_accuracy": 0.4304155814743517, "step": 3561 }, { "epoch": 0.6603633667037448, "grad_norm": 7.7578125, "learning_rate": 9.339636633296256e-06, "loss": 3.0229, "mean_token_accuracy": 0.3970982687149476, "step": 3562 }, { "epoch": 0.6605487578791249, "grad_norm": 4.55078125, "learning_rate": 9.339451242120877e-06, "loss": 2.9341, "mean_token_accuracy": 0.42667102824119507, "step": 3563 }, { "epoch": 0.660734149054505, "grad_norm": 6.25390625, "learning_rate": 9.339265850945496e-06, "loss": 3.1075, "mean_token_accuracy": 0.4174991766384894, "step": 3564 }, { "epoch": 0.6609195402298851, "grad_norm": 6.296875, "learning_rate": 9.339080459770116e-06, "loss": 2.8463, "mean_token_accuracy": 0.4427916029473099, "step": 3565 }, { "epoch": 0.6611049314052652, "grad_norm": 6.21875, "learning_rate": 9.338895068594735e-06, "loss": 2.7832, "mean_token_accuracy": 0.44074028679985294, "step": 3566 }, { "epoch": 0.6612903225806451, "grad_norm": 5.0703125, "learning_rate": 9.338709677419355e-06, "loss": 2.4891, "mean_token_accuracy": 0.4980330448465775, "step": 3567 }, { "epoch": 0.6614757137560252, "grad_norm": 9.015625, "learning_rate": 9.338524286243976e-06, "loss": 2.8717, "mean_token_accuracy": 0.443019943019943, "step": 3568 }, { "epoch": 0.6616611049314053, "grad_norm": 7.08984375, "learning_rate": 9.338338895068596e-06, "loss": 3.3111, "mean_token_accuracy": 0.3847064393939394, "step": 3569 }, { "epoch": 0.6618464961067854, "grad_norm": 9.21875, "learning_rate": 9.338153503893215e-06, "loss": 2.8339, "mean_token_accuracy": 0.4424587364826409, "step": 3570 }, { "epoch": 0.6620318872821653, "grad_norm": 6.94140625, "learning_rate": 9.337968112717836e-06, "loss": 2.4328, "mean_token_accuracy": 0.48807964369924023, "step": 3571 }, { "epoch": 0.6622172784575454, "grad_norm": 7.51171875, "learning_rate": 9.337782721542456e-06, "loss": 2.9582, "mean_token_accuracy": 0.42636655948553054, "step": 3572 }, { "epoch": 0.6624026696329255, "grad_norm": 6.56640625, "learning_rate": 9.337597330367075e-06, "loss": 3.6296, "mean_token_accuracy": 0.36961099412612214, "step": 3573 }, { "epoch": 0.6625880608083056, "grad_norm": 7.24609375, "learning_rate": 9.337411939191695e-06, "loss": 2.4658, "mean_token_accuracy": 0.497765136123527, "step": 3574 }, { "epoch": 0.6627734519836855, "grad_norm": 4.41015625, "learning_rate": 9.337226548016314e-06, "loss": 2.7012, "mean_token_accuracy": 0.4707991803278688, "step": 3575 }, { "epoch": 0.6629588431590656, "grad_norm": 4.90625, "learning_rate": 9.337041156840935e-06, "loss": 3.1077, "mean_token_accuracy": 0.424341656433347, "step": 3576 }, { "epoch": 0.6631442343344457, "grad_norm": 5.11328125, "learning_rate": 9.336855765665555e-06, "loss": 3.0677, "mean_token_accuracy": 0.42668621700879766, "step": 3577 }, { "epoch": 0.6633296255098258, "grad_norm": 9.4140625, "learning_rate": 9.336670374490176e-06, "loss": 2.6996, "mean_token_accuracy": 0.46281843616551316, "step": 3578 }, { "epoch": 0.6635150166852057, "grad_norm": 6.95703125, "learning_rate": 9.336484983314794e-06, "loss": 2.8427, "mean_token_accuracy": 0.45353524349057633, "step": 3579 }, { "epoch": 0.6637004078605858, "grad_norm": 6.21875, "learning_rate": 9.336299592139415e-06, "loss": 3.0039, "mean_token_accuracy": 0.41881824675987345, "step": 3580 }, { "epoch": 0.6638857990359659, "grad_norm": 9.5, "learning_rate": 9.336114200964035e-06, "loss": 3.1638, "mean_token_accuracy": 0.4225332784410594, "step": 3581 }, { "epoch": 0.664071190211346, "grad_norm": 6.62109375, "learning_rate": 9.335928809788654e-06, "loss": 3.1135, "mean_token_accuracy": 0.4169104463527413, "step": 3582 }, { "epoch": 0.6642565813867259, "grad_norm": 7.87890625, "learning_rate": 9.335743418613275e-06, "loss": 3.2602, "mean_token_accuracy": 0.397458318759473, "step": 3583 }, { "epoch": 0.664441972562106, "grad_norm": 6.90625, "learning_rate": 9.335558027437894e-06, "loss": 2.7016, "mean_token_accuracy": 0.4630177514792899, "step": 3584 }, { "epoch": 0.6646273637374861, "grad_norm": 8.0, "learning_rate": 9.335372636262516e-06, "loss": 2.6147, "mean_token_accuracy": 0.47836630504148053, "step": 3585 }, { "epoch": 0.6648127549128662, "grad_norm": 7.9140625, "learning_rate": 9.335187245087134e-06, "loss": 2.5899, "mean_token_accuracy": 0.4601692121744863, "step": 3586 }, { "epoch": 0.6649981460882461, "grad_norm": 8.6015625, "learning_rate": 9.335001853911755e-06, "loss": 3.018, "mean_token_accuracy": 0.4043239369326326, "step": 3587 }, { "epoch": 0.6651835372636262, "grad_norm": 6.171875, "learning_rate": 9.334816462736375e-06, "loss": 2.7246, "mean_token_accuracy": 0.44770132675100277, "step": 3588 }, { "epoch": 0.6653689284390063, "grad_norm": 6.9140625, "learning_rate": 9.334631071560994e-06, "loss": 2.5851, "mean_token_accuracy": 0.4643243243243243, "step": 3589 }, { "epoch": 0.6655543196143864, "grad_norm": 6.421875, "learning_rate": 9.334445680385615e-06, "loss": 2.7557, "mean_token_accuracy": 0.4528301886792453, "step": 3590 }, { "epoch": 0.6657397107897665, "grad_norm": 8.4140625, "learning_rate": 9.334260289210234e-06, "loss": 2.7999, "mean_token_accuracy": 0.4544693906010319, "step": 3591 }, { "epoch": 0.6659251019651464, "grad_norm": 7.125, "learning_rate": 9.334074898034854e-06, "loss": 2.5648, "mean_token_accuracy": 0.4726477024070022, "step": 3592 }, { "epoch": 0.6661104931405265, "grad_norm": 7.953125, "learning_rate": 9.333889506859475e-06, "loss": 2.7071, "mean_token_accuracy": 0.47197408461764995, "step": 3593 }, { "epoch": 0.6662958843159066, "grad_norm": 6.5078125, "learning_rate": 9.333704115684095e-06, "loss": 2.8733, "mean_token_accuracy": 0.4277215189873418, "step": 3594 }, { "epoch": 0.6664812754912867, "grad_norm": 6.2890625, "learning_rate": 9.333518724508714e-06, "loss": 2.603, "mean_token_accuracy": 0.4681585677749361, "step": 3595 }, { "epoch": 0.6666666666666666, "grad_norm": 7.38671875, "learning_rate": 9.333333333333334e-06, "loss": 2.3882, "mean_token_accuracy": 0.49659304511278196, "step": 3596 }, { "epoch": 0.6668520578420467, "grad_norm": 7.92578125, "learning_rate": 9.333147942157955e-06, "loss": 2.6524, "mean_token_accuracy": 0.45775144590210187, "step": 3597 }, { "epoch": 0.6670374490174268, "grad_norm": 6.0625, "learning_rate": 9.332962550982574e-06, "loss": 3.2304, "mean_token_accuracy": 0.40518134715025905, "step": 3598 }, { "epoch": 0.6672228401928069, "grad_norm": 7.97265625, "learning_rate": 9.332777159807194e-06, "loss": 2.674, "mean_token_accuracy": 0.4665566696281254, "step": 3599 }, { "epoch": 0.6674082313681868, "grad_norm": 7.56640625, "learning_rate": 9.332591768631813e-06, "loss": 2.5623, "mean_token_accuracy": 0.48282009724473257, "step": 3600 }, { "epoch": 0.6675936225435669, "grad_norm": 7.265625, "learning_rate": 9.332406377456433e-06, "loss": 3.2064, "mean_token_accuracy": 0.39840054066231134, "step": 3601 }, { "epoch": 0.667779013718947, "grad_norm": 8.9921875, "learning_rate": 9.332220986281054e-06, "loss": 2.6115, "mean_token_accuracy": 0.4723478260869565, "step": 3602 }, { "epoch": 0.6679644048943271, "grad_norm": 10.6796875, "learning_rate": 9.332035595105674e-06, "loss": 2.4115, "mean_token_accuracy": 0.4828637815858263, "step": 3603 }, { "epoch": 0.668149796069707, "grad_norm": 7.5390625, "learning_rate": 9.331850203930293e-06, "loss": 2.9932, "mean_token_accuracy": 0.43436265183667494, "step": 3604 }, { "epoch": 0.6683351872450871, "grad_norm": 7.359375, "learning_rate": 9.331664812754914e-06, "loss": 2.8746, "mean_token_accuracy": 0.4319745353732416, "step": 3605 }, { "epoch": 0.6685205784204672, "grad_norm": 8.265625, "learning_rate": 9.331479421579534e-06, "loss": 2.4358, "mean_token_accuracy": 0.4887960501329282, "step": 3606 }, { "epoch": 0.6687059695958473, "grad_norm": 7.2578125, "learning_rate": 9.331294030404153e-06, "loss": 2.8172, "mean_token_accuracy": 0.439513998943476, "step": 3607 }, { "epoch": 0.6688913607712272, "grad_norm": 9.0859375, "learning_rate": 9.331108639228773e-06, "loss": 3.0082, "mean_token_accuracy": 0.42958129418162044, "step": 3608 }, { "epoch": 0.6690767519466073, "grad_norm": 11.0, "learning_rate": 9.330923248053392e-06, "loss": 2.7527, "mean_token_accuracy": 0.4611883691529709, "step": 3609 }, { "epoch": 0.6692621431219874, "grad_norm": 10.78125, "learning_rate": 9.330737856878014e-06, "loss": 2.5035, "mean_token_accuracy": 0.46387246078683464, "step": 3610 }, { "epoch": 0.6694475342973675, "grad_norm": 5.68359375, "learning_rate": 9.330552465702633e-06, "loss": 3.1006, "mean_token_accuracy": 0.4197340797760672, "step": 3611 }, { "epoch": 0.6696329254727474, "grad_norm": 6.234375, "learning_rate": 9.330367074527254e-06, "loss": 2.6293, "mean_token_accuracy": 0.4637002341920375, "step": 3612 }, { "epoch": 0.6698183166481275, "grad_norm": 5.10546875, "learning_rate": 9.330181683351873e-06, "loss": 2.7377, "mean_token_accuracy": 0.4565049044914817, "step": 3613 }, { "epoch": 0.6700037078235076, "grad_norm": 6.05859375, "learning_rate": 9.329996292176493e-06, "loss": 2.7477, "mean_token_accuracy": 0.47724265678750993, "step": 3614 }, { "epoch": 0.6701890989988877, "grad_norm": 5.47265625, "learning_rate": 9.329810901001113e-06, "loss": 3.0433, "mean_token_accuracy": 0.42712964075453563, "step": 3615 }, { "epoch": 0.6703744901742678, "grad_norm": 8.515625, "learning_rate": 9.329625509825732e-06, "loss": 2.761, "mean_token_accuracy": 0.43816049239345023, "step": 3616 }, { "epoch": 0.6705598813496477, "grad_norm": 6.9140625, "learning_rate": 9.329440118650353e-06, "loss": 2.5539, "mean_token_accuracy": 0.45009666505558243, "step": 3617 }, { "epoch": 0.6707452725250278, "grad_norm": 8.578125, "learning_rate": 9.329254727474973e-06, "loss": 2.8629, "mean_token_accuracy": 0.4499082989454379, "step": 3618 }, { "epoch": 0.6709306637004079, "grad_norm": 6.234375, "learning_rate": 9.329069336299594e-06, "loss": 2.3471, "mean_token_accuracy": 0.5064914992272025, "step": 3619 }, { "epoch": 0.671116054875788, "grad_norm": 8.8515625, "learning_rate": 9.328883945124213e-06, "loss": 2.9939, "mean_token_accuracy": 0.39386302994367034, "step": 3620 }, { "epoch": 0.6713014460511679, "grad_norm": 6.03125, "learning_rate": 9.328698553948833e-06, "loss": 2.8051, "mean_token_accuracy": 0.439498504715896, "step": 3621 }, { "epoch": 0.671486837226548, "grad_norm": 6.9453125, "learning_rate": 9.328513162773452e-06, "loss": 3.2633, "mean_token_accuracy": 0.4141247182569497, "step": 3622 }, { "epoch": 0.6716722284019281, "grad_norm": 6.828125, "learning_rate": 9.328327771598072e-06, "loss": 2.487, "mean_token_accuracy": 0.48067349926793557, "step": 3623 }, { "epoch": 0.6718576195773082, "grad_norm": 5.90234375, "learning_rate": 9.328142380422693e-06, "loss": 2.6918, "mean_token_accuracy": 0.4550006955070246, "step": 3624 }, { "epoch": 0.6720430107526881, "grad_norm": 5.55078125, "learning_rate": 9.327956989247312e-06, "loss": 2.6263, "mean_token_accuracy": 0.484351302909141, "step": 3625 }, { "epoch": 0.6722284019280682, "grad_norm": 6.0625, "learning_rate": 9.327771598071934e-06, "loss": 2.9171, "mean_token_accuracy": 0.42613138686131385, "step": 3626 }, { "epoch": 0.6724137931034483, "grad_norm": 10.1171875, "learning_rate": 9.327586206896553e-06, "loss": 2.7052, "mean_token_accuracy": 0.4434087882822903, "step": 3627 }, { "epoch": 0.6725991842788284, "grad_norm": 6.015625, "learning_rate": 9.327400815721173e-06, "loss": 3.1685, "mean_token_accuracy": 0.40927152317880794, "step": 3628 }, { "epoch": 0.6727845754542083, "grad_norm": 9.3046875, "learning_rate": 9.327215424545792e-06, "loss": 2.6157, "mean_token_accuracy": 0.45728921500761455, "step": 3629 }, { "epoch": 0.6729699666295884, "grad_norm": 11.625, "learning_rate": 9.327030033370412e-06, "loss": 3.1095, "mean_token_accuracy": 0.3970977917981073, "step": 3630 }, { "epoch": 0.6731553578049685, "grad_norm": 9.953125, "learning_rate": 9.326844642195033e-06, "loss": 2.5221, "mean_token_accuracy": 0.4811220420101037, "step": 3631 }, { "epoch": 0.6733407489803486, "grad_norm": 5.68359375, "learning_rate": 9.326659251019652e-06, "loss": 3.0323, "mean_token_accuracy": 0.4242041435068216, "step": 3632 }, { "epoch": 0.6735261401557285, "grad_norm": 7.34765625, "learning_rate": 9.326473859844272e-06, "loss": 3.0189, "mean_token_accuracy": 0.4166666666666667, "step": 3633 }, { "epoch": 0.6737115313311086, "grad_norm": 8.359375, "learning_rate": 9.326288468668893e-06, "loss": 2.5294, "mean_token_accuracy": 0.47661037214168284, "step": 3634 }, { "epoch": 0.6738969225064887, "grad_norm": 5.91015625, "learning_rate": 9.326103077493513e-06, "loss": 2.7859, "mean_token_accuracy": 0.4438757706176573, "step": 3635 }, { "epoch": 0.6740823136818688, "grad_norm": 5.80078125, "learning_rate": 9.325917686318132e-06, "loss": 2.6873, "mean_token_accuracy": 0.45653973509933776, "step": 3636 }, { "epoch": 0.6742677048572487, "grad_norm": 6.5234375, "learning_rate": 9.325732295142752e-06, "loss": 2.5995, "mean_token_accuracy": 0.46665730731433386, "step": 3637 }, { "epoch": 0.6744530960326288, "grad_norm": 6.0078125, "learning_rate": 9.325546903967371e-06, "loss": 3.8748, "mean_token_accuracy": 0.37796052631578947, "step": 3638 }, { "epoch": 0.6746384872080089, "grad_norm": 6.66796875, "learning_rate": 9.325361512791992e-06, "loss": 2.6198, "mean_token_accuracy": 0.4668118766479422, "step": 3639 }, { "epoch": 0.674823878383389, "grad_norm": 5.34375, "learning_rate": 9.325176121616612e-06, "loss": 3.0215, "mean_token_accuracy": 0.4283849918433931, "step": 3640 }, { "epoch": 0.675009269558769, "grad_norm": 5.93359375, "learning_rate": 9.324990730441231e-06, "loss": 2.859, "mean_token_accuracy": 0.43352295277153063, "step": 3641 }, { "epoch": 0.675194660734149, "grad_norm": 5.39453125, "learning_rate": 9.324805339265852e-06, "loss": 2.3806, "mean_token_accuracy": 0.5106893792157311, "step": 3642 }, { "epoch": 0.6753800519095291, "grad_norm": 8.796875, "learning_rate": 9.324619948090472e-06, "loss": 2.5917, "mean_token_accuracy": 0.4822549647661755, "step": 3643 }, { "epoch": 0.6755654430849092, "grad_norm": 6.16015625, "learning_rate": 9.324434556915092e-06, "loss": 2.8296, "mean_token_accuracy": 0.45179584120982985, "step": 3644 }, { "epoch": 0.6757508342602893, "grad_norm": 5.8984375, "learning_rate": 9.324249165739711e-06, "loss": 2.427, "mean_token_accuracy": 0.48312611012433393, "step": 3645 }, { "epoch": 0.6759362254356692, "grad_norm": 5.2109375, "learning_rate": 9.324063774564332e-06, "loss": 2.7307, "mean_token_accuracy": 0.4411764705882353, "step": 3646 }, { "epoch": 0.6761216166110493, "grad_norm": 6.37109375, "learning_rate": 9.32387838338895e-06, "loss": 2.5482, "mean_token_accuracy": 0.4753647452762497, "step": 3647 }, { "epoch": 0.6763070077864294, "grad_norm": 6.9453125, "learning_rate": 9.323692992213571e-06, "loss": 2.7346, "mean_token_accuracy": 0.4473684210526316, "step": 3648 }, { "epoch": 0.6764923989618095, "grad_norm": 8.453125, "learning_rate": 9.323507601038192e-06, "loss": 2.7356, "mean_token_accuracy": 0.42458928090492865, "step": 3649 }, { "epoch": 0.6766777901371894, "grad_norm": 6.015625, "learning_rate": 9.323322209862812e-06, "loss": 2.4363, "mean_token_accuracy": 0.4764303241637608, "step": 3650 }, { "epoch": 0.6768631813125695, "grad_norm": 6.28515625, "learning_rate": 9.323136818687431e-06, "loss": 2.6247, "mean_token_accuracy": 0.46879449454200284, "step": 3651 }, { "epoch": 0.6770485724879496, "grad_norm": 10.5390625, "learning_rate": 9.322951427512051e-06, "loss": 2.6109, "mean_token_accuracy": 0.4587063422014878, "step": 3652 }, { "epoch": 0.6772339636633297, "grad_norm": 6.2890625, "learning_rate": 9.322766036336672e-06, "loss": 3.1121, "mean_token_accuracy": 0.408524771657902, "step": 3653 }, { "epoch": 0.6774193548387096, "grad_norm": 5.26953125, "learning_rate": 9.32258064516129e-06, "loss": 3.0713, "mean_token_accuracy": 0.41240128692600175, "step": 3654 }, { "epoch": 0.6776047460140897, "grad_norm": 6.5078125, "learning_rate": 9.322395253985911e-06, "loss": 2.8869, "mean_token_accuracy": 0.43451864700780574, "step": 3655 }, { "epoch": 0.6777901371894698, "grad_norm": 5.71875, "learning_rate": 9.32220986281053e-06, "loss": 2.2131, "mean_token_accuracy": 0.5030404378230465, "step": 3656 }, { "epoch": 0.6779755283648499, "grad_norm": 6.50390625, "learning_rate": 9.32202447163515e-06, "loss": 3.1317, "mean_token_accuracy": 0.41773084479371314, "step": 3657 }, { "epoch": 0.6781609195402298, "grad_norm": 7.51953125, "learning_rate": 9.321839080459771e-06, "loss": 2.5512, "mean_token_accuracy": 0.45736724008975316, "step": 3658 }, { "epoch": 0.6783463107156099, "grad_norm": 8.6015625, "learning_rate": 9.321653689284391e-06, "loss": 2.4088, "mean_token_accuracy": 0.48516607545952917, "step": 3659 }, { "epoch": 0.67853170189099, "grad_norm": 5.21484375, "learning_rate": 9.32146829810901e-06, "loss": 3.0705, "mean_token_accuracy": 0.40913770913770914, "step": 3660 }, { "epoch": 0.6787170930663701, "grad_norm": 5.55078125, "learning_rate": 9.32128290693363e-06, "loss": 2.5839, "mean_token_accuracy": 0.47993527508090617, "step": 3661 }, { "epoch": 0.6789024842417501, "grad_norm": 5.76171875, "learning_rate": 9.321097515758251e-06, "loss": 2.7031, "mean_token_accuracy": 0.44138892409073627, "step": 3662 }, { "epoch": 0.6790878754171301, "grad_norm": 6.15234375, "learning_rate": 9.32091212458287e-06, "loss": 2.707, "mean_token_accuracy": 0.47384799521244764, "step": 3663 }, { "epoch": 0.6792732665925102, "grad_norm": 5.34765625, "learning_rate": 9.32072673340749e-06, "loss": 2.3473, "mean_token_accuracy": 0.5033642249587407, "step": 3664 }, { "epoch": 0.6794586577678903, "grad_norm": 4.86328125, "learning_rate": 9.32054134223211e-06, "loss": 2.9438, "mean_token_accuracy": 0.45585274662065, "step": 3665 }, { "epoch": 0.6796440489432704, "grad_norm": 6.765625, "learning_rate": 9.320355951056731e-06, "loss": 2.416, "mean_token_accuracy": 0.4872469103339469, "step": 3666 }, { "epoch": 0.6798294401186503, "grad_norm": 11.1796875, "learning_rate": 9.32017055988135e-06, "loss": 3.1018, "mean_token_accuracy": 0.4056912616469403, "step": 3667 }, { "epoch": 0.6800148312940304, "grad_norm": 7.0078125, "learning_rate": 9.31998516870597e-06, "loss": 2.7959, "mean_token_accuracy": 0.45966535165283634, "step": 3668 }, { "epoch": 0.6802002224694105, "grad_norm": 6.23828125, "learning_rate": 9.319799777530591e-06, "loss": 2.6445, "mean_token_accuracy": 0.4571267922406522, "step": 3669 }, { "epoch": 0.6803856136447906, "grad_norm": 6.6875, "learning_rate": 9.31961438635521e-06, "loss": 2.7879, "mean_token_accuracy": 0.43941166638809964, "step": 3670 }, { "epoch": 0.6805710048201705, "grad_norm": 6.3359375, "learning_rate": 9.31942899517983e-06, "loss": 2.9837, "mean_token_accuracy": 0.41839007317849186, "step": 3671 }, { "epoch": 0.6807563959955506, "grad_norm": 8.0859375, "learning_rate": 9.31924360400445e-06, "loss": 2.927, "mean_token_accuracy": 0.4268956342967711, "step": 3672 }, { "epoch": 0.6809417871709307, "grad_norm": 7.43359375, "learning_rate": 9.31905821282907e-06, "loss": 2.7616, "mean_token_accuracy": 0.44950055493895674, "step": 3673 }, { "epoch": 0.6811271783463108, "grad_norm": 4.94921875, "learning_rate": 9.31887282165369e-06, "loss": 3.2001, "mean_token_accuracy": 0.4100163309744148, "step": 3674 }, { "epoch": 0.6813125695216907, "grad_norm": 7.0234375, "learning_rate": 9.31868743047831e-06, "loss": 2.5693, "mean_token_accuracy": 0.4498342874359747, "step": 3675 }, { "epoch": 0.6814979606970708, "grad_norm": 5.60546875, "learning_rate": 9.31850203930293e-06, "loss": 2.8776, "mean_token_accuracy": 0.44008662175168434, "step": 3676 }, { "epoch": 0.6816833518724509, "grad_norm": 7.2421875, "learning_rate": 9.31831664812755e-06, "loss": 1.9909, "mean_token_accuracy": 0.5634180610550518, "step": 3677 }, { "epoch": 0.681868743047831, "grad_norm": 5.671875, "learning_rate": 9.31813125695217e-06, "loss": 2.9693, "mean_token_accuracy": 0.4477510990869124, "step": 3678 }, { "epoch": 0.6820541342232109, "grad_norm": 5.640625, "learning_rate": 9.31794586577679e-06, "loss": 2.9028, "mean_token_accuracy": 0.4410116856090924, "step": 3679 }, { "epoch": 0.682239525398591, "grad_norm": 6.74609375, "learning_rate": 9.31776047460141e-06, "loss": 3.2196, "mean_token_accuracy": 0.4018082701768382, "step": 3680 }, { "epoch": 0.6824249165739711, "grad_norm": 6.75, "learning_rate": 9.317575083426029e-06, "loss": 2.5036, "mean_token_accuracy": 0.47283682338996447, "step": 3681 }, { "epoch": 0.6826103077493512, "grad_norm": 6.3671875, "learning_rate": 9.31738969225065e-06, "loss": 2.2638, "mean_token_accuracy": 0.5205215587494444, "step": 3682 }, { "epoch": 0.6827956989247311, "grad_norm": 6.4296875, "learning_rate": 9.31720430107527e-06, "loss": 2.7664, "mean_token_accuracy": 0.4615570599613153, "step": 3683 }, { "epoch": 0.6829810901001112, "grad_norm": 7.7734375, "learning_rate": 9.31701890989989e-06, "loss": 2.6736, "mean_token_accuracy": 0.4461025559612637, "step": 3684 }, { "epoch": 0.6831664812754913, "grad_norm": 7.00390625, "learning_rate": 9.316833518724509e-06, "loss": 3.0958, "mean_token_accuracy": 0.40964610084559977, "step": 3685 }, { "epoch": 0.6833518724508714, "grad_norm": 6.24609375, "learning_rate": 9.31664812754913e-06, "loss": 2.3771, "mean_token_accuracy": 0.49792919171676686, "step": 3686 }, { "epoch": 0.6835372636262514, "grad_norm": 7.484375, "learning_rate": 9.31646273637375e-06, "loss": 2.2255, "mean_token_accuracy": 0.5134324916407241, "step": 3687 }, { "epoch": 0.6837226548016314, "grad_norm": 5.6796875, "learning_rate": 9.316277345198369e-06, "loss": 3.3625, "mean_token_accuracy": 0.3951213763846335, "step": 3688 }, { "epoch": 0.6839080459770115, "grad_norm": 6.546875, "learning_rate": 9.31609195402299e-06, "loss": 2.6319, "mean_token_accuracy": 0.45058626465661644, "step": 3689 }, { "epoch": 0.6840934371523916, "grad_norm": 6.29296875, "learning_rate": 9.31590656284761e-06, "loss": 3.2104, "mean_token_accuracy": 0.42191029364311067, "step": 3690 }, { "epoch": 0.6842788283277716, "grad_norm": 8.8359375, "learning_rate": 9.31572117167223e-06, "loss": 2.5383, "mean_token_accuracy": 0.47611464968152867, "step": 3691 }, { "epoch": 0.6844642195031516, "grad_norm": 10.796875, "learning_rate": 9.315535780496849e-06, "loss": 2.3693, "mean_token_accuracy": 0.49222011385199244, "step": 3692 }, { "epoch": 0.6846496106785317, "grad_norm": 6.03125, "learning_rate": 9.31535038932147e-06, "loss": 2.7409, "mean_token_accuracy": 0.4267748001880583, "step": 3693 }, { "epoch": 0.6848350018539118, "grad_norm": 7.640625, "learning_rate": 9.315164998146088e-06, "loss": 2.974, "mean_token_accuracy": 0.4579218250104326, "step": 3694 }, { "epoch": 0.6850203930292919, "grad_norm": 6.83984375, "learning_rate": 9.314979606970709e-06, "loss": 2.7889, "mean_token_accuracy": 0.4270300179831235, "step": 3695 }, { "epoch": 0.6852057842046718, "grad_norm": 9.203125, "learning_rate": 9.31479421579533e-06, "loss": 2.6061, "mean_token_accuracy": 0.46867833433916717, "step": 3696 }, { "epoch": 0.6853911753800519, "grad_norm": 7.875, "learning_rate": 9.314608824619948e-06, "loss": 2.4554, "mean_token_accuracy": 0.49648886896757805, "step": 3697 }, { "epoch": 0.685576566555432, "grad_norm": 9.3046875, "learning_rate": 9.314423433444569e-06, "loss": 3.0638, "mean_token_accuracy": 0.4152993097204934, "step": 3698 }, { "epoch": 0.685761957730812, "grad_norm": 7.10546875, "learning_rate": 9.314238042269189e-06, "loss": 3.1631, "mean_token_accuracy": 0.3914081145584726, "step": 3699 }, { "epoch": 0.685947348906192, "grad_norm": 5.57421875, "learning_rate": 9.31405265109381e-06, "loss": 2.5044, "mean_token_accuracy": 0.4739136753378869, "step": 3700 }, { "epoch": 0.6861327400815721, "grad_norm": 7.57421875, "learning_rate": 9.313867259918428e-06, "loss": 3.2191, "mean_token_accuracy": 0.40308471454880296, "step": 3701 }, { "epoch": 0.6863181312569522, "grad_norm": 7.0234375, "learning_rate": 9.313681868743049e-06, "loss": 2.4089, "mean_token_accuracy": 0.4782207882000461, "step": 3702 }, { "epoch": 0.6865035224323323, "grad_norm": 6.0546875, "learning_rate": 9.313496477567668e-06, "loss": 2.6639, "mean_token_accuracy": 0.45160481444333, "step": 3703 }, { "epoch": 0.6866889136077122, "grad_norm": 5.6484375, "learning_rate": 9.313311086392288e-06, "loss": 2.7807, "mean_token_accuracy": 0.46417019158994777, "step": 3704 }, { "epoch": 0.6868743047830923, "grad_norm": 5.578125, "learning_rate": 9.313125695216909e-06, "loss": 2.6647, "mean_token_accuracy": 0.4640559756235188, "step": 3705 }, { "epoch": 0.6870596959584724, "grad_norm": 4.796875, "learning_rate": 9.312940304041529e-06, "loss": 2.7506, "mean_token_accuracy": 0.4486500794070937, "step": 3706 }, { "epoch": 0.6872450871338525, "grad_norm": 5.99609375, "learning_rate": 9.31275491286615e-06, "loss": 2.5841, "mean_token_accuracy": 0.4736072168718761, "step": 3707 }, { "epoch": 0.6874304783092324, "grad_norm": 5.796875, "learning_rate": 9.312569521690768e-06, "loss": 3.3288, "mean_token_accuracy": 0.3968909664093923, "step": 3708 }, { "epoch": 0.6876158694846125, "grad_norm": 6.33984375, "learning_rate": 9.312384130515389e-06, "loss": 2.7566, "mean_token_accuracy": 0.44651213407334855, "step": 3709 }, { "epoch": 0.6878012606599926, "grad_norm": 6.4375, "learning_rate": 9.312198739340008e-06, "loss": 3.3169, "mean_token_accuracy": 0.40241286863270775, "step": 3710 }, { "epoch": 0.6879866518353727, "grad_norm": 5.07421875, "learning_rate": 9.312013348164628e-06, "loss": 2.6545, "mean_token_accuracy": 0.48212258796821794, "step": 3711 }, { "epoch": 0.6881720430107527, "grad_norm": 6.87109375, "learning_rate": 9.311827956989249e-06, "loss": 2.463, "mean_token_accuracy": 0.47846695557963165, "step": 3712 }, { "epoch": 0.6883574341861327, "grad_norm": 7.05078125, "learning_rate": 9.311642565813867e-06, "loss": 2.4716, "mean_token_accuracy": 0.4891616011010437, "step": 3713 }, { "epoch": 0.6885428253615128, "grad_norm": 7.87890625, "learning_rate": 9.311457174638488e-06, "loss": 3.0846, "mean_token_accuracy": 0.41446028513238287, "step": 3714 }, { "epoch": 0.6887282165368929, "grad_norm": 7.09765625, "learning_rate": 9.311271783463108e-06, "loss": 2.5664, "mean_token_accuracy": 0.48602180598266703, "step": 3715 }, { "epoch": 0.688913607712273, "grad_norm": 7.54296875, "learning_rate": 9.311086392287729e-06, "loss": 2.0557, "mean_token_accuracy": 0.527019174898315, "step": 3716 }, { "epoch": 0.6890989988876529, "grad_norm": 5.75390625, "learning_rate": 9.310901001112348e-06, "loss": 3.0696, "mean_token_accuracy": 0.4094616639477977, "step": 3717 }, { "epoch": 0.689284390063033, "grad_norm": 6.77734375, "learning_rate": 9.310715609936968e-06, "loss": 2.8445, "mean_token_accuracy": 0.43648902821316615, "step": 3718 }, { "epoch": 0.6894697812384131, "grad_norm": 6.6875, "learning_rate": 9.310530218761587e-06, "loss": 3.2139, "mean_token_accuracy": 0.4118034948895483, "step": 3719 }, { "epoch": 0.6896551724137931, "grad_norm": 9.75, "learning_rate": 9.310344827586207e-06, "loss": 2.3257, "mean_token_accuracy": 0.49427772126144454, "step": 3720 }, { "epoch": 0.6898405635891731, "grad_norm": 6.1328125, "learning_rate": 9.310159436410828e-06, "loss": 2.9005, "mean_token_accuracy": 0.4299740644683216, "step": 3721 }, { "epoch": 0.6900259547645532, "grad_norm": 5.66015625, "learning_rate": 9.309974045235447e-06, "loss": 3.058, "mean_token_accuracy": 0.4259070871481858, "step": 3722 }, { "epoch": 0.6902113459399333, "grad_norm": 7.125, "learning_rate": 9.309788654060067e-06, "loss": 2.4398, "mean_token_accuracy": 0.47286742034943474, "step": 3723 }, { "epoch": 0.6903967371153134, "grad_norm": 5.88671875, "learning_rate": 9.309603262884688e-06, "loss": 2.3354, "mean_token_accuracy": 0.5140526837870605, "step": 3724 }, { "epoch": 0.6905821282906933, "grad_norm": 6.12890625, "learning_rate": 9.309417871709308e-06, "loss": 2.544, "mean_token_accuracy": 0.49151989562948467, "step": 3725 }, { "epoch": 0.6907675194660734, "grad_norm": 6.08203125, "learning_rate": 9.309232480533927e-06, "loss": 2.8818, "mean_token_accuracy": 0.43655339187985237, "step": 3726 }, { "epoch": 0.6909529106414535, "grad_norm": 7.25, "learning_rate": 9.309047089358548e-06, "loss": 2.6161, "mean_token_accuracy": 0.4860829225862569, "step": 3727 }, { "epoch": 0.6911383018168336, "grad_norm": 6.18359375, "learning_rate": 9.308861698183166e-06, "loss": 2.4938, "mean_token_accuracy": 0.48931829092654827, "step": 3728 }, { "epoch": 0.6913236929922135, "grad_norm": 5.98046875, "learning_rate": 9.308676307007787e-06, "loss": 2.8052, "mean_token_accuracy": 0.45561899818323387, "step": 3729 }, { "epoch": 0.6915090841675936, "grad_norm": 10.5390625, "learning_rate": 9.308490915832407e-06, "loss": 2.4368, "mean_token_accuracy": 0.49649520112153567, "step": 3730 }, { "epoch": 0.6916944753429737, "grad_norm": 7.1328125, "learning_rate": 9.308305524657028e-06, "loss": 2.1498, "mean_token_accuracy": 0.5312959153687923, "step": 3731 }, { "epoch": 0.6918798665183538, "grad_norm": 5.2734375, "learning_rate": 9.308120133481647e-06, "loss": 2.9191, "mean_token_accuracy": 0.4215308370044053, "step": 3732 }, { "epoch": 0.6920652576937337, "grad_norm": 5.68359375, "learning_rate": 9.307934742306267e-06, "loss": 2.9049, "mean_token_accuracy": 0.45426829268292684, "step": 3733 }, { "epoch": 0.6922506488691138, "grad_norm": 6.23046875, "learning_rate": 9.307749351130888e-06, "loss": 2.7945, "mean_token_accuracy": 0.4490644490644491, "step": 3734 }, { "epoch": 0.6924360400444939, "grad_norm": 7.4375, "learning_rate": 9.307563959955506e-06, "loss": 2.7582, "mean_token_accuracy": 0.44988569365900616, "step": 3735 }, { "epoch": 0.692621431219874, "grad_norm": 10.484375, "learning_rate": 9.307378568780127e-06, "loss": 2.1572, "mean_token_accuracy": 0.5419969894631209, "step": 3736 }, { "epoch": 0.692806822395254, "grad_norm": 8.5, "learning_rate": 9.307193177604746e-06, "loss": 2.9078, "mean_token_accuracy": 0.43260485950861954, "step": 3737 }, { "epoch": 0.692992213570634, "grad_norm": 7.1171875, "learning_rate": 9.307007786429366e-06, "loss": 2.7767, "mean_token_accuracy": 0.4397390653406053, "step": 3738 }, { "epoch": 0.6931776047460141, "grad_norm": 8.0625, "learning_rate": 9.306822395253987e-06, "loss": 3.3308, "mean_token_accuracy": 0.3990030117353827, "step": 3739 }, { "epoch": 0.6933629959213942, "grad_norm": 8.171875, "learning_rate": 9.306637004078607e-06, "loss": 2.8292, "mean_token_accuracy": 0.4578696343402226, "step": 3740 }, { "epoch": 0.6935483870967742, "grad_norm": 5.1015625, "learning_rate": 9.306451612903226e-06, "loss": 2.9635, "mean_token_accuracy": 0.4141553082405902, "step": 3741 }, { "epoch": 0.6937337782721542, "grad_norm": 9.8984375, "learning_rate": 9.306266221727846e-06, "loss": 2.6526, "mean_token_accuracy": 0.44365446142907927, "step": 3742 }, { "epoch": 0.6939191694475343, "grad_norm": 6.1328125, "learning_rate": 9.306080830552467e-06, "loss": 3.2672, "mean_token_accuracy": 0.4056435137895812, "step": 3743 }, { "epoch": 0.6941045606229144, "grad_norm": 7.33984375, "learning_rate": 9.305895439377086e-06, "loss": 2.6097, "mean_token_accuracy": 0.4550501156515035, "step": 3744 }, { "epoch": 0.6942899517982944, "grad_norm": 9.5859375, "learning_rate": 9.305710048201706e-06, "loss": 2.9654, "mean_token_accuracy": 0.42474916387959866, "step": 3745 }, { "epoch": 0.6944753429736744, "grad_norm": 6.09375, "learning_rate": 9.305524657026325e-06, "loss": 3.088, "mean_token_accuracy": 0.41641952008627664, "step": 3746 }, { "epoch": 0.6946607341490545, "grad_norm": 5.1328125, "learning_rate": 9.305339265850947e-06, "loss": 3.0227, "mean_token_accuracy": 0.4286058851905451, "step": 3747 }, { "epoch": 0.6948461253244346, "grad_norm": 4.90625, "learning_rate": 9.305153874675566e-06, "loss": 3.0253, "mean_token_accuracy": 0.43477609704957315, "step": 3748 }, { "epoch": 0.6950315164998146, "grad_norm": 5.79296875, "learning_rate": 9.304968483500186e-06, "loss": 3.0712, "mean_token_accuracy": 0.4207501512401694, "step": 3749 }, { "epoch": 0.6952169076751946, "grad_norm": 6.63671875, "learning_rate": 9.304783092324807e-06, "loss": 2.9522, "mean_token_accuracy": 0.42444910807974817, "step": 3750 }, { "epoch": 0.6954022988505747, "grad_norm": 5.82421875, "learning_rate": 9.304597701149426e-06, "loss": 3.2942, "mean_token_accuracy": 0.40330220598186495, "step": 3751 }, { "epoch": 0.6955876900259548, "grad_norm": 5.1875, "learning_rate": 9.304412309974046e-06, "loss": 3.1218, "mean_token_accuracy": 0.4129016670693404, "step": 3752 }, { "epoch": 0.6957730812013349, "grad_norm": 5.9375, "learning_rate": 9.304226918798665e-06, "loss": 3.5558, "mean_token_accuracy": 0.37955361723961006, "step": 3753 }, { "epoch": 0.6959584723767148, "grad_norm": 6.16796875, "learning_rate": 9.304041527623286e-06, "loss": 2.867, "mean_token_accuracy": 0.42647903503733486, "step": 3754 }, { "epoch": 0.6961438635520949, "grad_norm": 5.96484375, "learning_rate": 9.303856136447906e-06, "loss": 3.0602, "mean_token_accuracy": 0.41829268292682925, "step": 3755 }, { "epoch": 0.696329254727475, "grad_norm": 6.09765625, "learning_rate": 9.303670745272527e-06, "loss": 3.001, "mean_token_accuracy": 0.43562795585916975, "step": 3756 }, { "epoch": 0.6965146459028551, "grad_norm": 5.86328125, "learning_rate": 9.303485354097145e-06, "loss": 2.9652, "mean_token_accuracy": 0.4584017169549299, "step": 3757 }, { "epoch": 0.696700037078235, "grad_norm": 5.88671875, "learning_rate": 9.303299962921766e-06, "loss": 3.0853, "mean_token_accuracy": 0.41798523206751054, "step": 3758 }, { "epoch": 0.6968854282536151, "grad_norm": 5.46484375, "learning_rate": 9.303114571746386e-06, "loss": 2.1892, "mean_token_accuracy": 0.5260829774252593, "step": 3759 }, { "epoch": 0.6970708194289952, "grad_norm": 6.81640625, "learning_rate": 9.302929180571005e-06, "loss": 2.3943, "mean_token_accuracy": 0.5048891008824231, "step": 3760 }, { "epoch": 0.6972562106043753, "grad_norm": 8.2578125, "learning_rate": 9.302743789395626e-06, "loss": 3.0268, "mean_token_accuracy": 0.40733812949640286, "step": 3761 }, { "epoch": 0.6974416017797553, "grad_norm": 5.42578125, "learning_rate": 9.302558398220244e-06, "loss": 3.0889, "mean_token_accuracy": 0.4246820140859876, "step": 3762 }, { "epoch": 0.6976269929551353, "grad_norm": 5.4765625, "learning_rate": 9.302373007044867e-06, "loss": 2.5036, "mean_token_accuracy": 0.4927477840451249, "step": 3763 }, { "epoch": 0.6978123841305154, "grad_norm": 8.0390625, "learning_rate": 9.302187615869485e-06, "loss": 2.7553, "mean_token_accuracy": 0.45544554455445546, "step": 3764 }, { "epoch": 0.6979977753058955, "grad_norm": 6.69140625, "learning_rate": 9.302002224694106e-06, "loss": 3.2619, "mean_token_accuracy": 0.3847634322373697, "step": 3765 }, { "epoch": 0.6981831664812755, "grad_norm": 8.046875, "learning_rate": 9.301816833518725e-06, "loss": 2.7133, "mean_token_accuracy": 0.4454521556256572, "step": 3766 }, { "epoch": 0.6983685576566555, "grad_norm": 7.96875, "learning_rate": 9.301631442343345e-06, "loss": 2.3942, "mean_token_accuracy": 0.4794557097118463, "step": 3767 }, { "epoch": 0.6985539488320356, "grad_norm": 8.5, "learning_rate": 9.301446051167966e-06, "loss": 2.5215, "mean_token_accuracy": 0.46710526315789475, "step": 3768 }, { "epoch": 0.6987393400074157, "grad_norm": 6.44140625, "learning_rate": 9.301260659992584e-06, "loss": 3.127, "mean_token_accuracy": 0.44329799318439983, "step": 3769 }, { "epoch": 0.6989247311827957, "grad_norm": 5.23046875, "learning_rate": 9.301075268817205e-06, "loss": 2.5763, "mean_token_accuracy": 0.4542807992589652, "step": 3770 }, { "epoch": 0.6991101223581757, "grad_norm": 6.578125, "learning_rate": 9.300889877641825e-06, "loss": 2.4968, "mean_token_accuracy": 0.4939518643222347, "step": 3771 }, { "epoch": 0.6992955135335558, "grad_norm": 6.35546875, "learning_rate": 9.300704486466446e-06, "loss": 2.8044, "mean_token_accuracy": 0.4478944562899787, "step": 3772 }, { "epoch": 0.6994809047089359, "grad_norm": 7.02734375, "learning_rate": 9.300519095291065e-06, "loss": 3.0017, "mean_token_accuracy": 0.4395097332372026, "step": 3773 }, { "epoch": 0.699666295884316, "grad_norm": 6.16796875, "learning_rate": 9.300333704115685e-06, "loss": 2.7093, "mean_token_accuracy": 0.4448196677022829, "step": 3774 }, { "epoch": 0.6998516870596959, "grad_norm": 5.9453125, "learning_rate": 9.300148312940304e-06, "loss": 3.1104, "mean_token_accuracy": 0.4289184169167024, "step": 3775 }, { "epoch": 0.700037078235076, "grad_norm": 9.3984375, "learning_rate": 9.299962921764925e-06, "loss": 2.483, "mean_token_accuracy": 0.47798807905468993, "step": 3776 }, { "epoch": 0.7002224694104561, "grad_norm": 7.31640625, "learning_rate": 9.299777530589545e-06, "loss": 2.7822, "mean_token_accuracy": 0.4536370315944159, "step": 3777 }, { "epoch": 0.7004078605858362, "grad_norm": 4.8984375, "learning_rate": 9.299592139414164e-06, "loss": 3.2263, "mean_token_accuracy": 0.42537724418988876, "step": 3778 }, { "epoch": 0.7005932517612161, "grad_norm": 7.25, "learning_rate": 9.299406748238784e-06, "loss": 3.2735, "mean_token_accuracy": 0.3994807540354442, "step": 3779 }, { "epoch": 0.7007786429365962, "grad_norm": 6.94921875, "learning_rate": 9.299221357063405e-06, "loss": 2.6055, "mean_token_accuracy": 0.46255506607929514, "step": 3780 }, { "epoch": 0.7009640341119763, "grad_norm": 5.84765625, "learning_rate": 9.299035965888025e-06, "loss": 2.7275, "mean_token_accuracy": 0.4404685287681812, "step": 3781 }, { "epoch": 0.7011494252873564, "grad_norm": 9.3046875, "learning_rate": 9.298850574712644e-06, "loss": 2.3886, "mean_token_accuracy": 0.48823060043885896, "step": 3782 }, { "epoch": 0.7013348164627363, "grad_norm": 9.34375, "learning_rate": 9.298665183537265e-06, "loss": 2.7863, "mean_token_accuracy": 0.4582369942196532, "step": 3783 }, { "epoch": 0.7015202076381164, "grad_norm": 6.734375, "learning_rate": 9.298479792361883e-06, "loss": 2.377, "mean_token_accuracy": 0.514839409134029, "step": 3784 }, { "epoch": 0.7017055988134965, "grad_norm": 6.5, "learning_rate": 9.298294401186504e-06, "loss": 2.5365, "mean_token_accuracy": 0.4765061642242382, "step": 3785 }, { "epoch": 0.7018909899888766, "grad_norm": 11.890625, "learning_rate": 9.298109010011124e-06, "loss": 3.0053, "mean_token_accuracy": 0.4141820067409904, "step": 3786 }, { "epoch": 0.7020763811642566, "grad_norm": 13.9140625, "learning_rate": 9.297923618835745e-06, "loss": 3.1148, "mean_token_accuracy": 0.4159607668301382, "step": 3787 }, { "epoch": 0.7022617723396366, "grad_norm": 8.9921875, "learning_rate": 9.297738227660365e-06, "loss": 2.724, "mean_token_accuracy": 0.45027988146196907, "step": 3788 }, { "epoch": 0.7024471635150167, "grad_norm": 7.4609375, "learning_rate": 9.297552836484984e-06, "loss": 2.5233, "mean_token_accuracy": 0.479294745389166, "step": 3789 }, { "epoch": 0.7026325546903968, "grad_norm": 10.8203125, "learning_rate": 9.297367445309605e-06, "loss": 2.9294, "mean_token_accuracy": 0.42621160409556313, "step": 3790 }, { "epoch": 0.7028179458657768, "grad_norm": 11.359375, "learning_rate": 9.297182054134223e-06, "loss": 2.9372, "mean_token_accuracy": 0.4305572343768887, "step": 3791 }, { "epoch": 0.7030033370411568, "grad_norm": 5.5, "learning_rate": 9.296996662958844e-06, "loss": 2.7663, "mean_token_accuracy": 0.4328397673188789, "step": 3792 }, { "epoch": 0.7031887282165369, "grad_norm": 7.640625, "learning_rate": 9.296811271783464e-06, "loss": 2.926, "mean_token_accuracy": 0.4357455075279262, "step": 3793 }, { "epoch": 0.703374119391917, "grad_norm": 11.1875, "learning_rate": 9.296625880608083e-06, "loss": 2.7833, "mean_token_accuracy": 0.4382592286826277, "step": 3794 }, { "epoch": 0.703559510567297, "grad_norm": 9.609375, "learning_rate": 9.296440489432704e-06, "loss": 2.4775, "mean_token_accuracy": 0.5096978958504761, "step": 3795 }, { "epoch": 0.703744901742677, "grad_norm": 7.75390625, "learning_rate": 9.296255098257324e-06, "loss": 2.7215, "mean_token_accuracy": 0.46455589801064723, "step": 3796 }, { "epoch": 0.7039302929180571, "grad_norm": 8.21875, "learning_rate": 9.296069707081945e-06, "loss": 2.905, "mean_token_accuracy": 0.4429616322340562, "step": 3797 }, { "epoch": 0.7041156840934372, "grad_norm": 9.171875, "learning_rate": 9.295884315906563e-06, "loss": 2.7986, "mean_token_accuracy": 0.4398177445892925, "step": 3798 }, { "epoch": 0.7043010752688172, "grad_norm": 7.015625, "learning_rate": 9.295698924731184e-06, "loss": 2.6195, "mean_token_accuracy": 0.45451508740674756, "step": 3799 }, { "epoch": 0.7044864664441972, "grad_norm": 6.9453125, "learning_rate": 9.295513533555803e-06, "loss": 3.3091, "mean_token_accuracy": 0.40289952798381656, "step": 3800 }, { "epoch": 0.7046718576195773, "grad_norm": 5.9296875, "learning_rate": 9.295328142380423e-06, "loss": 2.9801, "mean_token_accuracy": 0.41936181719848564, "step": 3801 }, { "epoch": 0.7048572487949574, "grad_norm": 7.79296875, "learning_rate": 9.295142751205044e-06, "loss": 2.9512, "mean_token_accuracy": 0.431203007518797, "step": 3802 }, { "epoch": 0.7050426399703374, "grad_norm": 7.09375, "learning_rate": 9.294957360029664e-06, "loss": 3.1607, "mean_token_accuracy": 0.4059082338152106, "step": 3803 }, { "epoch": 0.7052280311457174, "grad_norm": 6.7734375, "learning_rate": 9.294771968854283e-06, "loss": 3.0824, "mean_token_accuracy": 0.4139832377591531, "step": 3804 }, { "epoch": 0.7054134223210975, "grad_norm": 9.4921875, "learning_rate": 9.294586577678904e-06, "loss": 2.8677, "mean_token_accuracy": 0.43493115229319806, "step": 3805 }, { "epoch": 0.7055988134964776, "grad_norm": 8.75, "learning_rate": 9.294401186503524e-06, "loss": 3.4116, "mean_token_accuracy": 0.4032863849765258, "step": 3806 }, { "epoch": 0.7057842046718577, "grad_norm": 5.8046875, "learning_rate": 9.294215795328143e-06, "loss": 2.6219, "mean_token_accuracy": 0.4749961053123539, "step": 3807 }, { "epoch": 0.7059695958472376, "grad_norm": 6.96875, "learning_rate": 9.294030404152763e-06, "loss": 2.3789, "mean_token_accuracy": 0.482174566316077, "step": 3808 }, { "epoch": 0.7061549870226177, "grad_norm": 5.390625, "learning_rate": 9.293845012977382e-06, "loss": 3.0125, "mean_token_accuracy": 0.4328379743182802, "step": 3809 }, { "epoch": 0.7063403781979978, "grad_norm": 5.82421875, "learning_rate": 9.293659621802003e-06, "loss": 3.1878, "mean_token_accuracy": 0.4178818520489622, "step": 3810 }, { "epoch": 0.7065257693733779, "grad_norm": 6.16796875, "learning_rate": 9.293474230626623e-06, "loss": 3.0366, "mean_token_accuracy": 0.41045465035771983, "step": 3811 }, { "epoch": 0.7067111605487579, "grad_norm": 4.390625, "learning_rate": 9.293288839451244e-06, "loss": 2.9539, "mean_token_accuracy": 0.42405305445733965, "step": 3812 }, { "epoch": 0.7068965517241379, "grad_norm": 5.0546875, "learning_rate": 9.293103448275862e-06, "loss": 2.8925, "mean_token_accuracy": 0.42827186382138893, "step": 3813 }, { "epoch": 0.707081942899518, "grad_norm": 5.96875, "learning_rate": 9.292918057100483e-06, "loss": 3.8499, "mean_token_accuracy": 0.35547122074636306, "step": 3814 }, { "epoch": 0.7072673340748981, "grad_norm": 5.66015625, "learning_rate": 9.292732665925103e-06, "loss": 2.6861, "mean_token_accuracy": 0.45479583283439107, "step": 3815 }, { "epoch": 0.7074527252502781, "grad_norm": 7.9453125, "learning_rate": 9.292547274749722e-06, "loss": 2.6778, "mean_token_accuracy": 0.4694493336685944, "step": 3816 }, { "epoch": 0.7076381164256581, "grad_norm": 7.5390625, "learning_rate": 9.292361883574343e-06, "loss": 2.5824, "mean_token_accuracy": 0.4599832515851178, "step": 3817 }, { "epoch": 0.7078235076010382, "grad_norm": 6.671875, "learning_rate": 9.292176492398961e-06, "loss": 2.8358, "mean_token_accuracy": 0.44285218999191034, "step": 3818 }, { "epoch": 0.7080088987764183, "grad_norm": 5.57421875, "learning_rate": 9.291991101223584e-06, "loss": 2.9752, "mean_token_accuracy": 0.43184402924451665, "step": 3819 }, { "epoch": 0.7081942899517983, "grad_norm": 8.2734375, "learning_rate": 9.291805710048202e-06, "loss": 2.7237, "mean_token_accuracy": 0.44651842233403666, "step": 3820 }, { "epoch": 0.7083796811271783, "grad_norm": 9.6640625, "learning_rate": 9.291620318872823e-06, "loss": 3.382, "mean_token_accuracy": 0.4221745542297895, "step": 3821 }, { "epoch": 0.7085650723025584, "grad_norm": 6.81640625, "learning_rate": 9.291434927697442e-06, "loss": 3.2157, "mean_token_accuracy": 0.4180610236220472, "step": 3822 }, { "epoch": 0.7087504634779385, "grad_norm": 9.0546875, "learning_rate": 9.291249536522062e-06, "loss": 2.7699, "mean_token_accuracy": 0.43652790484903936, "step": 3823 }, { "epoch": 0.7089358546533185, "grad_norm": 7.609375, "learning_rate": 9.291064145346683e-06, "loss": 2.7713, "mean_token_accuracy": 0.44808743169398907, "step": 3824 }, { "epoch": 0.7091212458286985, "grad_norm": 7.0234375, "learning_rate": 9.290878754171301e-06, "loss": 3.01, "mean_token_accuracy": 0.43073742246726393, "step": 3825 }, { "epoch": 0.7093066370040786, "grad_norm": 9.2109375, "learning_rate": 9.290693362995922e-06, "loss": 2.5356, "mean_token_accuracy": 0.48332617718620297, "step": 3826 }, { "epoch": 0.7094920281794587, "grad_norm": 6.37890625, "learning_rate": 9.290507971820542e-06, "loss": 2.5125, "mean_token_accuracy": 0.4970414201183432, "step": 3827 }, { "epoch": 0.7096774193548387, "grad_norm": 6.9765625, "learning_rate": 9.290322580645163e-06, "loss": 2.6819, "mean_token_accuracy": 0.46208977744247454, "step": 3828 }, { "epoch": 0.7098628105302187, "grad_norm": 5.46875, "learning_rate": 9.290137189469782e-06, "loss": 2.7806, "mean_token_accuracy": 0.4633494527869687, "step": 3829 }, { "epoch": 0.7100482017055988, "grad_norm": 10.4296875, "learning_rate": 9.289951798294402e-06, "loss": 2.8999, "mean_token_accuracy": 0.4438202247191011, "step": 3830 }, { "epoch": 0.7102335928809789, "grad_norm": 5.4609375, "learning_rate": 9.289766407119023e-06, "loss": 3.1191, "mean_token_accuracy": 0.424476736435142, "step": 3831 }, { "epoch": 0.710418984056359, "grad_norm": 5.45703125, "learning_rate": 9.289581015943642e-06, "loss": 2.7851, "mean_token_accuracy": 0.4512842588606687, "step": 3832 }, { "epoch": 0.7106043752317389, "grad_norm": 6.0703125, "learning_rate": 9.289395624768262e-06, "loss": 2.1701, "mean_token_accuracy": 0.5377923559612093, "step": 3833 }, { "epoch": 0.710789766407119, "grad_norm": 6.67578125, "learning_rate": 9.28921023359288e-06, "loss": 2.8134, "mean_token_accuracy": 0.4498439589835042, "step": 3834 }, { "epoch": 0.7109751575824991, "grad_norm": 5.94140625, "learning_rate": 9.289024842417503e-06, "loss": 2.5722, "mean_token_accuracy": 0.4611955951756686, "step": 3835 }, { "epoch": 0.7111605487578792, "grad_norm": 7.8046875, "learning_rate": 9.288839451242122e-06, "loss": 2.4677, "mean_token_accuracy": 0.47398629883354937, "step": 3836 }, { "epoch": 0.7113459399332592, "grad_norm": 7.45703125, "learning_rate": 9.288654060066742e-06, "loss": 3.3085, "mean_token_accuracy": 0.40940927712435576, "step": 3837 }, { "epoch": 0.7115313311086392, "grad_norm": 9.2265625, "learning_rate": 9.288468668891361e-06, "loss": 3.2965, "mean_token_accuracy": 0.4, "step": 3838 }, { "epoch": 0.7117167222840193, "grad_norm": 9.1796875, "learning_rate": 9.288283277715982e-06, "loss": 2.437, "mean_token_accuracy": 0.49358407079646016, "step": 3839 }, { "epoch": 0.7119021134593994, "grad_norm": 6.3359375, "learning_rate": 9.288097886540602e-06, "loss": 3.2576, "mean_token_accuracy": 0.384384834407313, "step": 3840 }, { "epoch": 0.7120875046347794, "grad_norm": 6.6875, "learning_rate": 9.287912495365221e-06, "loss": 3.1605, "mean_token_accuracy": 0.4100135317997294, "step": 3841 }, { "epoch": 0.7122728958101594, "grad_norm": 8.34375, "learning_rate": 9.287727104189841e-06, "loss": 3.1242, "mean_token_accuracy": 0.4092058674759737, "step": 3842 }, { "epoch": 0.7124582869855395, "grad_norm": 8.0546875, "learning_rate": 9.28754171301446e-06, "loss": 2.8401, "mean_token_accuracy": 0.4523113708820404, "step": 3843 }, { "epoch": 0.7126436781609196, "grad_norm": 6.55859375, "learning_rate": 9.287356321839082e-06, "loss": 2.7778, "mean_token_accuracy": 0.4600933047534989, "step": 3844 }, { "epoch": 0.7128290693362996, "grad_norm": 6.4765625, "learning_rate": 9.287170930663701e-06, "loss": 2.5912, "mean_token_accuracy": 0.46860986547085204, "step": 3845 }, { "epoch": 0.7130144605116796, "grad_norm": 7.53515625, "learning_rate": 9.286985539488322e-06, "loss": 2.9713, "mean_token_accuracy": 0.42661576938265106, "step": 3846 }, { "epoch": 0.7131998516870597, "grad_norm": 5.55859375, "learning_rate": 9.28680014831294e-06, "loss": 2.3253, "mean_token_accuracy": 0.5053908355795148, "step": 3847 }, { "epoch": 0.7133852428624398, "grad_norm": 5.30078125, "learning_rate": 9.286614757137561e-06, "loss": 2.8588, "mean_token_accuracy": 0.44854713868798374, "step": 3848 }, { "epoch": 0.7135706340378198, "grad_norm": 7.5390625, "learning_rate": 9.286429365962181e-06, "loss": 3.4518, "mean_token_accuracy": 0.3614772103239978, "step": 3849 }, { "epoch": 0.7137560252131998, "grad_norm": 8.6953125, "learning_rate": 9.2862439747868e-06, "loss": 2.8411, "mean_token_accuracy": 0.4513692162417375, "step": 3850 }, { "epoch": 0.7139414163885799, "grad_norm": 5.234375, "learning_rate": 9.28605858361142e-06, "loss": 2.9319, "mean_token_accuracy": 0.42496640286695536, "step": 3851 }, { "epoch": 0.71412680756396, "grad_norm": 5.2734375, "learning_rate": 9.285873192436041e-06, "loss": 2.2036, "mean_token_accuracy": 0.5415636789384676, "step": 3852 }, { "epoch": 0.71431219873934, "grad_norm": 8.078125, "learning_rate": 9.285687801260662e-06, "loss": 2.8999, "mean_token_accuracy": 0.44234296194406236, "step": 3853 }, { "epoch": 0.71449758991472, "grad_norm": 5.58203125, "learning_rate": 9.28550241008528e-06, "loss": 2.7645, "mean_token_accuracy": 0.44176706827309237, "step": 3854 }, { "epoch": 0.7146829810901001, "grad_norm": 6.55859375, "learning_rate": 9.285317018909901e-06, "loss": 2.7683, "mean_token_accuracy": 0.4656292491312887, "step": 3855 }, { "epoch": 0.7148683722654802, "grad_norm": 5.40625, "learning_rate": 9.28513162773452e-06, "loss": 2.8877, "mean_token_accuracy": 0.43333333333333335, "step": 3856 }, { "epoch": 0.7150537634408602, "grad_norm": 5.3671875, "learning_rate": 9.28494623655914e-06, "loss": 3.0976, "mean_token_accuracy": 0.4210226192739023, "step": 3857 }, { "epoch": 0.7152391546162402, "grad_norm": 5.078125, "learning_rate": 9.28476084538376e-06, "loss": 2.9418, "mean_token_accuracy": 0.4238421955403088, "step": 3858 }, { "epoch": 0.7154245457916203, "grad_norm": 5.5703125, "learning_rate": 9.28457545420838e-06, "loss": 3.224, "mean_token_accuracy": 0.41537043438184124, "step": 3859 }, { "epoch": 0.7156099369670004, "grad_norm": 7.21484375, "learning_rate": 9.284390063033e-06, "loss": 2.3118, "mean_token_accuracy": 0.4927465362673187, "step": 3860 }, { "epoch": 0.7157953281423804, "grad_norm": 6.81640625, "learning_rate": 9.28420467185762e-06, "loss": 3.2337, "mean_token_accuracy": 0.39963503649635035, "step": 3861 }, { "epoch": 0.7159807193177605, "grad_norm": 5.6484375, "learning_rate": 9.284019280682241e-06, "loss": 3.0017, "mean_token_accuracy": 0.41752975730406555, "step": 3862 }, { "epoch": 0.7161661104931405, "grad_norm": 13.1953125, "learning_rate": 9.28383388950686e-06, "loss": 2.4888, "mean_token_accuracy": 0.5243933918430562, "step": 3863 }, { "epoch": 0.7163515016685206, "grad_norm": 6.21875, "learning_rate": 9.28364849833148e-06, "loss": 2.9504, "mean_token_accuracy": 0.4504148053605616, "step": 3864 }, { "epoch": 0.7165368928439007, "grad_norm": 5.4765625, "learning_rate": 9.283463107156099e-06, "loss": 3.0543, "mean_token_accuracy": 0.4324521688330656, "step": 3865 }, { "epoch": 0.7167222840192807, "grad_norm": 5.99609375, "learning_rate": 9.28327771598072e-06, "loss": 3.1813, "mean_token_accuracy": 0.3868104860731841, "step": 3866 }, { "epoch": 0.7169076751946607, "grad_norm": 7.8125, "learning_rate": 9.28309232480534e-06, "loss": 2.6599, "mean_token_accuracy": 0.47185174785904815, "step": 3867 }, { "epoch": 0.7170930663700408, "grad_norm": 5.69140625, "learning_rate": 9.28290693362996e-06, "loss": 2.1933, "mean_token_accuracy": 0.5489057151747744, "step": 3868 }, { "epoch": 0.7172784575454209, "grad_norm": 12.296875, "learning_rate": 9.282721542454581e-06, "loss": 2.3496, "mean_token_accuracy": 0.509635477130253, "step": 3869 }, { "epoch": 0.7174638487208009, "grad_norm": 7.18359375, "learning_rate": 9.2825361512792e-06, "loss": 3.1087, "mean_token_accuracy": 0.40987944722140546, "step": 3870 }, { "epoch": 0.7176492398961809, "grad_norm": 5.81640625, "learning_rate": 9.28235076010382e-06, "loss": 2.3658, "mean_token_accuracy": 0.507732670533002, "step": 3871 }, { "epoch": 0.717834631071561, "grad_norm": 7.91015625, "learning_rate": 9.28216536892844e-06, "loss": 3.4865, "mean_token_accuracy": 0.38901449660859155, "step": 3872 }, { "epoch": 0.7180200222469411, "grad_norm": 6.78515625, "learning_rate": 9.28197997775306e-06, "loss": 2.8264, "mean_token_accuracy": 0.453559990145356, "step": 3873 }, { "epoch": 0.7182054134223211, "grad_norm": 9.84375, "learning_rate": 9.28179458657768e-06, "loss": 2.7351, "mean_token_accuracy": 0.4610535794687078, "step": 3874 }, { "epoch": 0.7183908045977011, "grad_norm": 8.2578125, "learning_rate": 9.281609195402299e-06, "loss": 2.4851, "mean_token_accuracy": 0.47504223621563507, "step": 3875 }, { "epoch": 0.7185761957730812, "grad_norm": 8.109375, "learning_rate": 9.28142380422692e-06, "loss": 2.5902, "mean_token_accuracy": 0.4603890611784607, "step": 3876 }, { "epoch": 0.7187615869484613, "grad_norm": 10.796875, "learning_rate": 9.28123841305154e-06, "loss": 2.6374, "mean_token_accuracy": 0.46257758305951074, "step": 3877 }, { "epoch": 0.7189469781238413, "grad_norm": 6.4140625, "learning_rate": 9.28105302187616e-06, "loss": 2.6443, "mean_token_accuracy": 0.49711732487748633, "step": 3878 }, { "epoch": 0.7191323692992213, "grad_norm": 5.3984375, "learning_rate": 9.28086763070078e-06, "loss": 2.6785, "mean_token_accuracy": 0.48767682660055417, "step": 3879 }, { "epoch": 0.7193177604746014, "grad_norm": 5.8125, "learning_rate": 9.2806822395254e-06, "loss": 2.7603, "mean_token_accuracy": 0.45819867921877194, "step": 3880 }, { "epoch": 0.7195031516499815, "grad_norm": 7.64453125, "learning_rate": 9.280496848350018e-06, "loss": 2.7929, "mean_token_accuracy": 0.45042286380869057, "step": 3881 }, { "epoch": 0.7196885428253615, "grad_norm": 5.43359375, "learning_rate": 9.280311457174639e-06, "loss": 3.2275, "mean_token_accuracy": 0.417011751538892, "step": 3882 }, { "epoch": 0.7198739340007415, "grad_norm": 5.95703125, "learning_rate": 9.28012606599926e-06, "loss": 2.6321, "mean_token_accuracy": 0.4506729773702115, "step": 3883 }, { "epoch": 0.7200593251761216, "grad_norm": 7.32421875, "learning_rate": 9.27994067482388e-06, "loss": 3.0024, "mean_token_accuracy": 0.4272026661112268, "step": 3884 }, { "epoch": 0.7202447163515017, "grad_norm": 4.953125, "learning_rate": 9.279755283648499e-06, "loss": 3.21, "mean_token_accuracy": 0.42186761229314423, "step": 3885 }, { "epoch": 0.7204301075268817, "grad_norm": 8.796875, "learning_rate": 9.27956989247312e-06, "loss": 2.1783, "mean_token_accuracy": 0.5267933087609866, "step": 3886 }, { "epoch": 0.7206154987022618, "grad_norm": 7.46875, "learning_rate": 9.27938450129774e-06, "loss": 2.1276, "mean_token_accuracy": 0.5390696260261477, "step": 3887 }, { "epoch": 0.7208008898776418, "grad_norm": 5.8828125, "learning_rate": 9.279199110122359e-06, "loss": 3.1521, "mean_token_accuracy": 0.4157950583598605, "step": 3888 }, { "epoch": 0.7209862810530219, "grad_norm": 5.6015625, "learning_rate": 9.279013718946979e-06, "loss": 2.8625, "mean_token_accuracy": 0.4286047053342651, "step": 3889 }, { "epoch": 0.721171672228402, "grad_norm": 6.70703125, "learning_rate": 9.278828327771598e-06, "loss": 3.0683, "mean_token_accuracy": 0.4291015107341638, "step": 3890 }, { "epoch": 0.721357063403782, "grad_norm": 5.06640625, "learning_rate": 9.278642936596218e-06, "loss": 2.922, "mean_token_accuracy": 0.4243416743089218, "step": 3891 }, { "epoch": 0.721542454579162, "grad_norm": 6.21875, "learning_rate": 9.278457545420839e-06, "loss": 2.7284, "mean_token_accuracy": 0.43791408420602695, "step": 3892 }, { "epoch": 0.7217278457545421, "grad_norm": 6.51953125, "learning_rate": 9.27827215424546e-06, "loss": 3.149, "mean_token_accuracy": 0.4257305194805195, "step": 3893 }, { "epoch": 0.7219132369299222, "grad_norm": 5.5234375, "learning_rate": 9.278086763070078e-06, "loss": 2.558, "mean_token_accuracy": 0.48107681910340166, "step": 3894 }, { "epoch": 0.7220986281053022, "grad_norm": 6.19140625, "learning_rate": 9.277901371894699e-06, "loss": 2.7365, "mean_token_accuracy": 0.46621621621621623, "step": 3895 }, { "epoch": 0.7222840192806822, "grad_norm": 7.63671875, "learning_rate": 9.277715980719319e-06, "loss": 2.7897, "mean_token_accuracy": 0.45120702267739576, "step": 3896 }, { "epoch": 0.7224694104560623, "grad_norm": 7.1015625, "learning_rate": 9.277530589543938e-06, "loss": 2.8925, "mean_token_accuracy": 0.4322405018345366, "step": 3897 }, { "epoch": 0.7226548016314424, "grad_norm": 5.7109375, "learning_rate": 9.277345198368558e-06, "loss": 2.3219, "mean_token_accuracy": 0.5061345158906134, "step": 3898 }, { "epoch": 0.7228401928068224, "grad_norm": 7.9921875, "learning_rate": 9.277159807193177e-06, "loss": 3.243, "mean_token_accuracy": 0.41237307258367806, "step": 3899 }, { "epoch": 0.7230255839822024, "grad_norm": 7.29296875, "learning_rate": 9.2769744160178e-06, "loss": 2.445, "mean_token_accuracy": 0.48598811112459056, "step": 3900 }, { "epoch": 0.7232109751575825, "grad_norm": 5.3125, "learning_rate": 9.276789024842418e-06, "loss": 2.9842, "mean_token_accuracy": 0.43967031482232133, "step": 3901 }, { "epoch": 0.7233963663329626, "grad_norm": 8.046875, "learning_rate": 9.276603633667039e-06, "loss": 3.0261, "mean_token_accuracy": 0.4347930992882133, "step": 3902 }, { "epoch": 0.7235817575083426, "grad_norm": 5.58203125, "learning_rate": 9.276418242491657e-06, "loss": 2.539, "mean_token_accuracy": 0.473052394647993, "step": 3903 }, { "epoch": 0.7237671486837226, "grad_norm": 5.4296875, "learning_rate": 9.276232851316278e-06, "loss": 3.0278, "mean_token_accuracy": 0.4216754540128881, "step": 3904 }, { "epoch": 0.7239525398591027, "grad_norm": 6.11328125, "learning_rate": 9.276047460140898e-06, "loss": 3.0518, "mean_token_accuracy": 0.4218213058419244, "step": 3905 }, { "epoch": 0.7241379310344828, "grad_norm": 6.6953125, "learning_rate": 9.275862068965517e-06, "loss": 2.9169, "mean_token_accuracy": 0.44007569386038686, "step": 3906 }, { "epoch": 0.7243233222098628, "grad_norm": 7.86328125, "learning_rate": 9.275676677790138e-06, "loss": 2.9699, "mean_token_accuracy": 0.43709173530247086, "step": 3907 }, { "epoch": 0.7245087133852428, "grad_norm": 9.3671875, "learning_rate": 9.275491286614758e-06, "loss": 2.5806, "mean_token_accuracy": 0.46972526006935184, "step": 3908 }, { "epoch": 0.7246941045606229, "grad_norm": 8.46875, "learning_rate": 9.275305895439379e-06, "loss": 2.8157, "mean_token_accuracy": 0.4554568891435119, "step": 3909 }, { "epoch": 0.724879495736003, "grad_norm": 8.71875, "learning_rate": 9.275120504263997e-06, "loss": 2.4995, "mean_token_accuracy": 0.4755811681969135, "step": 3910 }, { "epoch": 0.725064886911383, "grad_norm": 5.46484375, "learning_rate": 9.274935113088618e-06, "loss": 2.6272, "mean_token_accuracy": 0.4806324110671937, "step": 3911 }, { "epoch": 0.7252502780867631, "grad_norm": 6.1796875, "learning_rate": 9.274749721913238e-06, "loss": 2.6474, "mean_token_accuracy": 0.46493150684931506, "step": 3912 }, { "epoch": 0.7254356692621431, "grad_norm": 5.62109375, "learning_rate": 9.274564330737857e-06, "loss": 2.5719, "mean_token_accuracy": 0.4821671195652174, "step": 3913 }, { "epoch": 0.7256210604375232, "grad_norm": 4.84375, "learning_rate": 9.274378939562478e-06, "loss": 2.7854, "mean_token_accuracy": 0.46368325665690785, "step": 3914 }, { "epoch": 0.7258064516129032, "grad_norm": 4.8671875, "learning_rate": 9.274193548387097e-06, "loss": 2.9438, "mean_token_accuracy": 0.4207401270367302, "step": 3915 }, { "epoch": 0.7259918427882833, "grad_norm": 11.0703125, "learning_rate": 9.274008157211719e-06, "loss": 1.9193, "mean_token_accuracy": 0.580967454743298, "step": 3916 }, { "epoch": 0.7261772339636633, "grad_norm": 6.09765625, "learning_rate": 9.273822766036338e-06, "loss": 3.1129, "mean_token_accuracy": 0.4153397027600849, "step": 3917 }, { "epoch": 0.7263626251390434, "grad_norm": 4.90625, "learning_rate": 9.273637374860958e-06, "loss": 3.0335, "mean_token_accuracy": 0.41949556918882075, "step": 3918 }, { "epoch": 0.7265480163144235, "grad_norm": 5.63671875, "learning_rate": 9.273451983685577e-06, "loss": 2.201, "mean_token_accuracy": 0.5010224948875256, "step": 3919 }, { "epoch": 0.7267334074898035, "grad_norm": 7.890625, "learning_rate": 9.273266592510197e-06, "loss": 2.8647, "mean_token_accuracy": 0.4287729196050776, "step": 3920 }, { "epoch": 0.7269187986651835, "grad_norm": 5.484375, "learning_rate": 9.273081201334818e-06, "loss": 3.0507, "mean_token_accuracy": 0.4214655810510733, "step": 3921 }, { "epoch": 0.7271041898405636, "grad_norm": 5.8125, "learning_rate": 9.272895810159437e-06, "loss": 2.4188, "mean_token_accuracy": 0.4922339405560882, "step": 3922 }, { "epoch": 0.7272895810159437, "grad_norm": 7.46484375, "learning_rate": 9.272710418984057e-06, "loss": 2.7332, "mean_token_accuracy": 0.44634912326616066, "step": 3923 }, { "epoch": 0.7274749721913237, "grad_norm": 5.8984375, "learning_rate": 9.272525027808678e-06, "loss": 2.873, "mean_token_accuracy": 0.45482246952835187, "step": 3924 }, { "epoch": 0.7276603633667037, "grad_norm": 5.8984375, "learning_rate": 9.272339636633298e-06, "loss": 3.0332, "mean_token_accuracy": 0.41354611711485295, "step": 3925 }, { "epoch": 0.7278457545420838, "grad_norm": 5.5625, "learning_rate": 9.272154245457917e-06, "loss": 2.5159, "mean_token_accuracy": 0.4718878345843365, "step": 3926 }, { "epoch": 0.7280311457174639, "grad_norm": 5.13671875, "learning_rate": 9.271968854282537e-06, "loss": 2.9081, "mean_token_accuracy": 0.4388247168330156, "step": 3927 }, { "epoch": 0.7282165368928439, "grad_norm": 7.15625, "learning_rate": 9.271783463107156e-06, "loss": 2.3861, "mean_token_accuracy": 0.48508600043544525, "step": 3928 }, { "epoch": 0.7284019280682239, "grad_norm": 5.6875, "learning_rate": 9.271598071931777e-06, "loss": 3.0912, "mean_token_accuracy": 0.41877880184331795, "step": 3929 }, { "epoch": 0.728587319243604, "grad_norm": 5.79296875, "learning_rate": 9.271412680756397e-06, "loss": 2.6762, "mean_token_accuracy": 0.46303162486368593, "step": 3930 }, { "epoch": 0.7287727104189841, "grad_norm": 7.54296875, "learning_rate": 9.271227289581016e-06, "loss": 2.7837, "mean_token_accuracy": 0.44346617238183506, "step": 3931 }, { "epoch": 0.7289581015943641, "grad_norm": 8.2109375, "learning_rate": 9.271041898405636e-06, "loss": 2.4994, "mean_token_accuracy": 0.4934435261707989, "step": 3932 }, { "epoch": 0.7291434927697441, "grad_norm": 6.4765625, "learning_rate": 9.270856507230257e-06, "loss": 2.6423, "mean_token_accuracy": 0.48200403109703427, "step": 3933 }, { "epoch": 0.7293288839451242, "grad_norm": 11.6640625, "learning_rate": 9.270671116054877e-06, "loss": 2.3081, "mean_token_accuracy": 0.47759405703330937, "step": 3934 }, { "epoch": 0.7295142751205043, "grad_norm": 7.94921875, "learning_rate": 9.270485724879496e-06, "loss": 2.7722, "mean_token_accuracy": 0.4535424697594045, "step": 3935 }, { "epoch": 0.7296996662958843, "grad_norm": 7.17578125, "learning_rate": 9.270300333704117e-06, "loss": 2.5789, "mean_token_accuracy": 0.4907862407862408, "step": 3936 }, { "epoch": 0.7298850574712644, "grad_norm": 8.6640625, "learning_rate": 9.270114942528736e-06, "loss": 2.7871, "mean_token_accuracy": 0.45421519393097554, "step": 3937 }, { "epoch": 0.7300704486466444, "grad_norm": 8.6171875, "learning_rate": 9.269929551353356e-06, "loss": 2.9112, "mean_token_accuracy": 0.44194796817888626, "step": 3938 }, { "epoch": 0.7302558398220245, "grad_norm": 6.140625, "learning_rate": 9.269744160177976e-06, "loss": 3.073, "mean_token_accuracy": 0.4136210384356035, "step": 3939 }, { "epoch": 0.7304412309974045, "grad_norm": 9.1328125, "learning_rate": 9.269558769002597e-06, "loss": 2.2808, "mean_token_accuracy": 0.5192578930237605, "step": 3940 }, { "epoch": 0.7306266221727846, "grad_norm": 5.49609375, "learning_rate": 9.269373377827216e-06, "loss": 3.019, "mean_token_accuracy": 0.42737547090832984, "step": 3941 }, { "epoch": 0.7308120133481646, "grad_norm": 9.234375, "learning_rate": 9.269187986651836e-06, "loss": 2.9588, "mean_token_accuracy": 0.4270209157716224, "step": 3942 }, { "epoch": 0.7309974045235447, "grad_norm": 5.640625, "learning_rate": 9.269002595476457e-06, "loss": 2.6464, "mean_token_accuracy": 0.464031007751938, "step": 3943 }, { "epoch": 0.7311827956989247, "grad_norm": 5.68359375, "learning_rate": 9.268817204301076e-06, "loss": 2.9776, "mean_token_accuracy": 0.44745502413339183, "step": 3944 }, { "epoch": 0.7313681868743048, "grad_norm": 7.96484375, "learning_rate": 9.268631813125696e-06, "loss": 2.3587, "mean_token_accuracy": 0.4868006518196632, "step": 3945 }, { "epoch": 0.7315535780496848, "grad_norm": 6.171875, "learning_rate": 9.268446421950315e-06, "loss": 3.0466, "mean_token_accuracy": 0.4223285978999382, "step": 3946 }, { "epoch": 0.7317389692250649, "grad_norm": 8.71875, "learning_rate": 9.268261030774935e-06, "loss": 3.4374, "mean_token_accuracy": 0.3771629587374025, "step": 3947 }, { "epoch": 0.731924360400445, "grad_norm": 6.46484375, "learning_rate": 9.268075639599556e-06, "loss": 2.8142, "mean_token_accuracy": 0.43495196052973256, "step": 3948 }, { "epoch": 0.732109751575825, "grad_norm": 6.921875, "learning_rate": 9.267890248424176e-06, "loss": 2.7643, "mean_token_accuracy": 0.43204502017413465, "step": 3949 }, { "epoch": 0.732295142751205, "grad_norm": 6.92578125, "learning_rate": 9.267704857248797e-06, "loss": 2.6094, "mean_token_accuracy": 0.4581783500238436, "step": 3950 }, { "epoch": 0.7324805339265851, "grad_norm": 8.875, "learning_rate": 9.267519466073416e-06, "loss": 2.612, "mean_token_accuracy": 0.4797630799605133, "step": 3951 }, { "epoch": 0.7326659251019652, "grad_norm": 6.375, "learning_rate": 9.267334074898036e-06, "loss": 2.8794, "mean_token_accuracy": 0.4428646105593309, "step": 3952 }, { "epoch": 0.7328513162773452, "grad_norm": 6.95703125, "learning_rate": 9.267148683722655e-06, "loss": 2.7098, "mean_token_accuracy": 0.46692131398013753, "step": 3953 }, { "epoch": 0.7330367074527252, "grad_norm": 8.5, "learning_rate": 9.266963292547275e-06, "loss": 2.6399, "mean_token_accuracy": 0.4718202141428403, "step": 3954 }, { "epoch": 0.7332220986281053, "grad_norm": 6.76953125, "learning_rate": 9.266777901371894e-06, "loss": 2.771, "mean_token_accuracy": 0.4521072796934866, "step": 3955 }, { "epoch": 0.7334074898034854, "grad_norm": 5.94921875, "learning_rate": 9.266592510196516e-06, "loss": 3.0159, "mean_token_accuracy": 0.4052898142937535, "step": 3956 }, { "epoch": 0.7335928809788654, "grad_norm": 7.39453125, "learning_rate": 9.266407119021135e-06, "loss": 2.3836, "mean_token_accuracy": 0.5151616499442586, "step": 3957 }, { "epoch": 0.7337782721542454, "grad_norm": 8.7265625, "learning_rate": 9.266221727845756e-06, "loss": 3.2985, "mean_token_accuracy": 0.39811815517507326, "step": 3958 }, { "epoch": 0.7339636633296255, "grad_norm": 5.12109375, "learning_rate": 9.266036336670376e-06, "loss": 3.0436, "mean_token_accuracy": 0.4278858625162127, "step": 3959 }, { "epoch": 0.7341490545050056, "grad_norm": 7.32421875, "learning_rate": 9.265850945494995e-06, "loss": 3.0014, "mean_token_accuracy": 0.4146214777301649, "step": 3960 }, { "epoch": 0.7343344456803856, "grad_norm": 7.359375, "learning_rate": 9.265665554319615e-06, "loss": 2.9896, "mean_token_accuracy": 0.4469578783151326, "step": 3961 }, { "epoch": 0.7345198368557657, "grad_norm": 5.09765625, "learning_rate": 9.265480163144234e-06, "loss": 2.8436, "mean_token_accuracy": 0.44556256062075655, "step": 3962 }, { "epoch": 0.7347052280311457, "grad_norm": 5.6953125, "learning_rate": 9.265294771968855e-06, "loss": 2.9729, "mean_token_accuracy": 0.4538906934048863, "step": 3963 }, { "epoch": 0.7348906192065258, "grad_norm": 6.21484375, "learning_rate": 9.265109380793475e-06, "loss": 2.7902, "mean_token_accuracy": 0.4344487737795095, "step": 3964 }, { "epoch": 0.7350760103819058, "grad_norm": 5.546875, "learning_rate": 9.264923989618096e-06, "loss": 2.4585, "mean_token_accuracy": 0.48615253515125695, "step": 3965 }, { "epoch": 0.7352614015572859, "grad_norm": 6.36328125, "learning_rate": 9.264738598442715e-06, "loss": 2.7901, "mean_token_accuracy": 0.4600071736011478, "step": 3966 }, { "epoch": 0.7354467927326659, "grad_norm": 5.7890625, "learning_rate": 9.264553207267335e-06, "loss": 2.5737, "mean_token_accuracy": 0.48561987516827804, "step": 3967 }, { "epoch": 0.735632183908046, "grad_norm": 6.7109375, "learning_rate": 9.264367816091955e-06, "loss": 2.6687, "mean_token_accuracy": 0.463013306624696, "step": 3968 }, { "epoch": 0.735817575083426, "grad_norm": 8.03125, "learning_rate": 9.264182424916574e-06, "loss": 2.9828, "mean_token_accuracy": 0.4225283432890406, "step": 3969 }, { "epoch": 0.7360029662588061, "grad_norm": 6.984375, "learning_rate": 9.263997033741195e-06, "loss": 3.1496, "mean_token_accuracy": 0.4039049235993209, "step": 3970 }, { "epoch": 0.7361883574341861, "grad_norm": 6.4765625, "learning_rate": 9.263811642565814e-06, "loss": 3.1434, "mean_token_accuracy": 0.416026474412008, "step": 3971 }, { "epoch": 0.7363737486095662, "grad_norm": 5.33203125, "learning_rate": 9.263626251390434e-06, "loss": 2.6145, "mean_token_accuracy": 0.44239226033421286, "step": 3972 }, { "epoch": 0.7365591397849462, "grad_norm": 5.59375, "learning_rate": 9.263440860215055e-06, "loss": 2.8441, "mean_token_accuracy": 0.4342137145626363, "step": 3973 }, { "epoch": 0.7367445309603263, "grad_norm": 6.73828125, "learning_rate": 9.263255469039675e-06, "loss": 2.8735, "mean_token_accuracy": 0.43698378709085345, "step": 3974 }, { "epoch": 0.7369299221357063, "grad_norm": 4.671875, "learning_rate": 9.263070077864294e-06, "loss": 2.7441, "mean_token_accuracy": 0.45172155688622756, "step": 3975 }, { "epoch": 0.7371153133110864, "grad_norm": 5.19921875, "learning_rate": 9.262884686688914e-06, "loss": 2.7072, "mean_token_accuracy": 0.43791544801914384, "step": 3976 }, { "epoch": 0.7373007044864665, "grad_norm": 5.77734375, "learning_rate": 9.262699295513535e-06, "loss": 2.6231, "mean_token_accuracy": 0.47018794556059623, "step": 3977 }, { "epoch": 0.7374860956618465, "grad_norm": 6.37890625, "learning_rate": 9.262513904338154e-06, "loss": 2.9971, "mean_token_accuracy": 0.4163275686673449, "step": 3978 }, { "epoch": 0.7376714868372265, "grad_norm": 8.53125, "learning_rate": 9.262328513162774e-06, "loss": 2.1871, "mean_token_accuracy": 0.49588719153936545, "step": 3979 }, { "epoch": 0.7378568780126066, "grad_norm": 5.8046875, "learning_rate": 9.262143121987393e-06, "loss": 3.2811, "mean_token_accuracy": 0.40675324675324676, "step": 3980 }, { "epoch": 0.7380422691879867, "grad_norm": 5.5859375, "learning_rate": 9.261957730812015e-06, "loss": 2.8501, "mean_token_accuracy": 0.4452680344142952, "step": 3981 }, { "epoch": 0.7382276603633667, "grad_norm": 5.8046875, "learning_rate": 9.261772339636634e-06, "loss": 2.8001, "mean_token_accuracy": 0.4525697102241662, "step": 3982 }, { "epoch": 0.7384130515387467, "grad_norm": 6.05078125, "learning_rate": 9.261586948461254e-06, "loss": 3.3582, "mean_token_accuracy": 0.3859226087954989, "step": 3983 }, { "epoch": 0.7385984427141268, "grad_norm": 6.234375, "learning_rate": 9.261401557285873e-06, "loss": 3.0026, "mean_token_accuracy": 0.432781364019085, "step": 3984 }, { "epoch": 0.7387838338895069, "grad_norm": 6.8515625, "learning_rate": 9.261216166110494e-06, "loss": 2.7704, "mean_token_accuracy": 0.45082823459185195, "step": 3985 }, { "epoch": 0.7389692250648869, "grad_norm": 6.04296875, "learning_rate": 9.261030774935114e-06, "loss": 2.7325, "mean_token_accuracy": 0.4503319251659626, "step": 3986 }, { "epoch": 0.739154616240267, "grad_norm": 9.2734375, "learning_rate": 9.260845383759733e-06, "loss": 2.5619, "mean_token_accuracy": 0.47041593438781487, "step": 3987 }, { "epoch": 0.739340007415647, "grad_norm": 6.09765625, "learning_rate": 9.260659992584353e-06, "loss": 2.197, "mean_token_accuracy": 0.5171734234234234, "step": 3988 }, { "epoch": 0.7395253985910271, "grad_norm": 5.44140625, "learning_rate": 9.260474601408974e-06, "loss": 2.8406, "mean_token_accuracy": 0.45006105006105007, "step": 3989 }, { "epoch": 0.7397107897664071, "grad_norm": 6.8359375, "learning_rate": 9.260289210233594e-06, "loss": 2.5518, "mean_token_accuracy": 0.46858606807368547, "step": 3990 }, { "epoch": 0.7398961809417872, "grad_norm": 5.69921875, "learning_rate": 9.260103819058213e-06, "loss": 2.5798, "mean_token_accuracy": 0.4768961493582264, "step": 3991 }, { "epoch": 0.7400815721171672, "grad_norm": 7.15234375, "learning_rate": 9.259918427882834e-06, "loss": 2.6615, "mean_token_accuracy": 0.47436245252306025, "step": 3992 }, { "epoch": 0.7402669632925473, "grad_norm": 5.3046875, "learning_rate": 9.259733036707454e-06, "loss": 2.9924, "mean_token_accuracy": 0.41944739638682255, "step": 3993 }, { "epoch": 0.7404523544679273, "grad_norm": 5.19921875, "learning_rate": 9.259547645532073e-06, "loss": 3.1124, "mean_token_accuracy": 0.40746870797558704, "step": 3994 }, { "epoch": 0.7406377456433074, "grad_norm": 7.765625, "learning_rate": 9.259362254356694e-06, "loss": 2.6614, "mean_token_accuracy": 0.4447263501268576, "step": 3995 }, { "epoch": 0.7408231368186874, "grad_norm": 6.56640625, "learning_rate": 9.259176863181312e-06, "loss": 2.7696, "mean_token_accuracy": 0.44548369855692144, "step": 3996 }, { "epoch": 0.7410085279940675, "grad_norm": 6.67578125, "learning_rate": 9.258991472005934e-06, "loss": 2.866, "mean_token_accuracy": 0.444063245823389, "step": 3997 }, { "epoch": 0.7411939191694475, "grad_norm": 9.203125, "learning_rate": 9.258806080830553e-06, "loss": 2.7762, "mean_token_accuracy": 0.4304151144225652, "step": 3998 }, { "epoch": 0.7413793103448276, "grad_norm": 8.2734375, "learning_rate": 9.258620689655174e-06, "loss": 2.6137, "mean_token_accuracy": 0.4697869873931314, "step": 3999 }, { "epoch": 0.7415647015202076, "grad_norm": 6.875, "learning_rate": 9.258435298479793e-06, "loss": 2.8907, "mean_token_accuracy": 0.4239005000641108, "step": 4000 }, { "epoch": 0.7417500926955877, "grad_norm": 10.171875, "learning_rate": 9.258249907304413e-06, "loss": 3.0554, "mean_token_accuracy": 0.4281957633308985, "step": 4001 }, { "epoch": 0.7419354838709677, "grad_norm": 9.3046875, "learning_rate": 9.258064516129034e-06, "loss": 3.4044, "mean_token_accuracy": 0.40085942295887045, "step": 4002 }, { "epoch": 0.7421208750463478, "grad_norm": 5.41015625, "learning_rate": 9.257879124953652e-06, "loss": 2.9375, "mean_token_accuracy": 0.4327706635622817, "step": 4003 }, { "epoch": 0.7423062662217278, "grad_norm": 5.734375, "learning_rate": 9.257693733778273e-06, "loss": 2.8585, "mean_token_accuracy": 0.4462566844919786, "step": 4004 }, { "epoch": 0.7424916573971079, "grad_norm": 7.7734375, "learning_rate": 9.257508342602893e-06, "loss": 2.667, "mean_token_accuracy": 0.4480372776051963, "step": 4005 }, { "epoch": 0.742677048572488, "grad_norm": 6.5859375, "learning_rate": 9.257322951427514e-06, "loss": 2.8993, "mean_token_accuracy": 0.43162175902389427, "step": 4006 }, { "epoch": 0.742862439747868, "grad_norm": 5.58984375, "learning_rate": 9.257137560252133e-06, "loss": 3.0923, "mean_token_accuracy": 0.42166563595135026, "step": 4007 }, { "epoch": 0.743047830923248, "grad_norm": 7.46484375, "learning_rate": 9.256952169076753e-06, "loss": 3.0216, "mean_token_accuracy": 0.42411232304478996, "step": 4008 }, { "epoch": 0.7432332220986281, "grad_norm": 6.97265625, "learning_rate": 9.256766777901372e-06, "loss": 2.441, "mean_token_accuracy": 0.5162257131045886, "step": 4009 }, { "epoch": 0.7434186132740082, "grad_norm": 5.59765625, "learning_rate": 9.256581386725992e-06, "loss": 2.7914, "mean_token_accuracy": 0.45378044115772026, "step": 4010 }, { "epoch": 0.7436040044493882, "grad_norm": 7.3984375, "learning_rate": 9.256395995550613e-06, "loss": 2.3503, "mean_token_accuracy": 0.5054146856840993, "step": 4011 }, { "epoch": 0.7437893956247683, "grad_norm": 11.78125, "learning_rate": 9.256210604375232e-06, "loss": 2.4851, "mean_token_accuracy": 0.4681976674281592, "step": 4012 }, { "epoch": 0.7439747868001483, "grad_norm": 5.75, "learning_rate": 9.256025213199852e-06, "loss": 3.202, "mean_token_accuracy": 0.3881325455946571, "step": 4013 }, { "epoch": 0.7441601779755284, "grad_norm": 7.36328125, "learning_rate": 9.255839822024473e-06, "loss": 2.4837, "mean_token_accuracy": 0.5, "step": 4014 }, { "epoch": 0.7443455691509084, "grad_norm": 7.609375, "learning_rate": 9.255654430849093e-06, "loss": 2.8105, "mean_token_accuracy": 0.43729754743174454, "step": 4015 }, { "epoch": 0.7445309603262885, "grad_norm": 9.671875, "learning_rate": 9.255469039673712e-06, "loss": 2.4517, "mean_token_accuracy": 0.4835727492533068, "step": 4016 }, { "epoch": 0.7447163515016685, "grad_norm": 6.0859375, "learning_rate": 9.255283648498332e-06, "loss": 2.7048, "mean_token_accuracy": 0.46290762634792776, "step": 4017 }, { "epoch": 0.7449017426770486, "grad_norm": 8.4453125, "learning_rate": 9.255098257322951e-06, "loss": 2.7873, "mean_token_accuracy": 0.43796042178246425, "step": 4018 }, { "epoch": 0.7450871338524286, "grad_norm": 9.2890625, "learning_rate": 9.254912866147572e-06, "loss": 2.8432, "mean_token_accuracy": 0.45245460659045056, "step": 4019 }, { "epoch": 0.7452725250278087, "grad_norm": 6.61328125, "learning_rate": 9.254727474972192e-06, "loss": 3.0157, "mean_token_accuracy": 0.4337513969949087, "step": 4020 }, { "epoch": 0.7454579162031887, "grad_norm": 8.1796875, "learning_rate": 9.254542083796813e-06, "loss": 2.4191, "mean_token_accuracy": 0.4832964601769911, "step": 4021 }, { "epoch": 0.7456433073785688, "grad_norm": 7.21484375, "learning_rate": 9.254356692621432e-06, "loss": 2.5919, "mean_token_accuracy": 0.4706507868991918, "step": 4022 }, { "epoch": 0.7458286985539488, "grad_norm": 6.3125, "learning_rate": 9.254171301446052e-06, "loss": 2.9334, "mean_token_accuracy": 0.4576191225035383, "step": 4023 }, { "epoch": 0.7460140897293289, "grad_norm": 6.734375, "learning_rate": 9.253985910270673e-06, "loss": 3.5581, "mean_token_accuracy": 0.3625686199412741, "step": 4024 }, { "epoch": 0.7461994809047089, "grad_norm": 8.453125, "learning_rate": 9.253800519095291e-06, "loss": 2.3195, "mean_token_accuracy": 0.5072563135641641, "step": 4025 }, { "epoch": 0.746384872080089, "grad_norm": 7.3203125, "learning_rate": 9.253615127919912e-06, "loss": 2.86, "mean_token_accuracy": 0.443035745729882, "step": 4026 }, { "epoch": 0.746570263255469, "grad_norm": 5.99609375, "learning_rate": 9.25342973674453e-06, "loss": 2.8685, "mean_token_accuracy": 0.4346381093057607, "step": 4027 }, { "epoch": 0.7467556544308491, "grad_norm": 7.05078125, "learning_rate": 9.253244345569151e-06, "loss": 2.5989, "mean_token_accuracy": 0.4639344262295082, "step": 4028 }, { "epoch": 0.7469410456062291, "grad_norm": 6.8359375, "learning_rate": 9.253058954393772e-06, "loss": 3.0865, "mean_token_accuracy": 0.41778762462414937, "step": 4029 }, { "epoch": 0.7471264367816092, "grad_norm": 11.7890625, "learning_rate": 9.252873563218392e-06, "loss": 2.4202, "mean_token_accuracy": 0.4833012202954399, "step": 4030 }, { "epoch": 0.7473118279569892, "grad_norm": 5.91796875, "learning_rate": 9.252688172043013e-06, "loss": 2.8348, "mean_token_accuracy": 0.4420213389611912, "step": 4031 }, { "epoch": 0.7474972191323693, "grad_norm": 8.5, "learning_rate": 9.252502780867631e-06, "loss": 3.1172, "mean_token_accuracy": 0.4095894703854591, "step": 4032 }, { "epoch": 0.7476826103077493, "grad_norm": 7.8125, "learning_rate": 9.252317389692252e-06, "loss": 3.0157, "mean_token_accuracy": 0.43785725951331045, "step": 4033 }, { "epoch": 0.7478680014831294, "grad_norm": 5.390625, "learning_rate": 9.25213199851687e-06, "loss": 2.8455, "mean_token_accuracy": 0.44370701098105464, "step": 4034 }, { "epoch": 0.7480533926585095, "grad_norm": 6.7890625, "learning_rate": 9.251946607341491e-06, "loss": 2.8581, "mean_token_accuracy": 0.4390838867055157, "step": 4035 }, { "epoch": 0.7482387838338895, "grad_norm": 8.0390625, "learning_rate": 9.25176121616611e-06, "loss": 3.1941, "mean_token_accuracy": 0.4116521114965905, "step": 4036 }, { "epoch": 0.7484241750092696, "grad_norm": 7.90234375, "learning_rate": 9.251575824990732e-06, "loss": 2.5129, "mean_token_accuracy": 0.4682274247491639, "step": 4037 }, { "epoch": 0.7486095661846496, "grad_norm": 7.92578125, "learning_rate": 9.251390433815351e-06, "loss": 2.7454, "mean_token_accuracy": 0.43207514350321796, "step": 4038 }, { "epoch": 0.7487949573600297, "grad_norm": 7.74609375, "learning_rate": 9.251205042639971e-06, "loss": 2.7854, "mean_token_accuracy": 0.44140323824209715, "step": 4039 }, { "epoch": 0.7489803485354097, "grad_norm": 7.953125, "learning_rate": 9.251019651464592e-06, "loss": 2.7242, "mean_token_accuracy": 0.44894155238982825, "step": 4040 }, { "epoch": 0.7491657397107898, "grad_norm": 5.1015625, "learning_rate": 9.25083426028921e-06, "loss": 2.8172, "mean_token_accuracy": 0.42783505154639173, "step": 4041 }, { "epoch": 0.7493511308861698, "grad_norm": 6.78125, "learning_rate": 9.250648869113831e-06, "loss": 2.8606, "mean_token_accuracy": 0.4449648711943794, "step": 4042 }, { "epoch": 0.7495365220615499, "grad_norm": 10.40625, "learning_rate": 9.25046347793845e-06, "loss": 1.8386, "mean_token_accuracy": 0.5713257225136275, "step": 4043 }, { "epoch": 0.7497219132369299, "grad_norm": 5.234375, "learning_rate": 9.25027808676307e-06, "loss": 3.2198, "mean_token_accuracy": 0.4348412406522006, "step": 4044 }, { "epoch": 0.74990730441231, "grad_norm": 6.68359375, "learning_rate": 9.250092695587691e-06, "loss": 3.0722, "mean_token_accuracy": 0.41564605021432943, "step": 4045 }, { "epoch": 0.75009269558769, "grad_norm": 9.515625, "learning_rate": 9.249907304412311e-06, "loss": 3.2434, "mean_token_accuracy": 0.4159209296113263, "step": 4046 }, { "epoch": 0.7502780867630701, "grad_norm": 9.4140625, "learning_rate": 9.24972191323693e-06, "loss": 3.0931, "mean_token_accuracy": 0.4188545609964587, "step": 4047 }, { "epoch": 0.7504634779384501, "grad_norm": 6.98828125, "learning_rate": 9.24953652206155e-06, "loss": 2.9671, "mean_token_accuracy": 0.44380995888533575, "step": 4048 }, { "epoch": 0.7506488691138302, "grad_norm": 7.59375, "learning_rate": 9.249351130886171e-06, "loss": 2.9613, "mean_token_accuracy": 0.42479476833171004, "step": 4049 }, { "epoch": 0.7508342602892102, "grad_norm": 7.47265625, "learning_rate": 9.24916573971079e-06, "loss": 2.692, "mean_token_accuracy": 0.4621295279912184, "step": 4050 }, { "epoch": 0.7510196514645903, "grad_norm": 7.5625, "learning_rate": 9.24898034853541e-06, "loss": 2.9086, "mean_token_accuracy": 0.43951965065502185, "step": 4051 }, { "epoch": 0.7512050426399703, "grad_norm": 7.203125, "learning_rate": 9.24879495736003e-06, "loss": 3.0167, "mean_token_accuracy": 0.4137583469824871, "step": 4052 }, { "epoch": 0.7513904338153504, "grad_norm": 7.21484375, "learning_rate": 9.248609566184652e-06, "loss": 2.6742, "mean_token_accuracy": 0.45527648168370893, "step": 4053 }, { "epoch": 0.7515758249907304, "grad_norm": 10.125, "learning_rate": 9.24842417500927e-06, "loss": 2.7397, "mean_token_accuracy": 0.4435548438751001, "step": 4054 }, { "epoch": 0.7517612161661105, "grad_norm": 8.09375, "learning_rate": 9.24823878383389e-06, "loss": 2.4387, "mean_token_accuracy": 0.48414350434676134, "step": 4055 }, { "epoch": 0.7519466073414905, "grad_norm": 6.52734375, "learning_rate": 9.24805339265851e-06, "loss": 3.4818, "mean_token_accuracy": 0.4006942722539053, "step": 4056 }, { "epoch": 0.7521319985168706, "grad_norm": 6.36328125, "learning_rate": 9.24786800148313e-06, "loss": 2.9611, "mean_token_accuracy": 0.4347442680776014, "step": 4057 }, { "epoch": 0.7523173896922507, "grad_norm": 6.1328125, "learning_rate": 9.24768261030775e-06, "loss": 2.4162, "mean_token_accuracy": 0.48296957671957674, "step": 4058 }, { "epoch": 0.7525027808676307, "grad_norm": 5.9375, "learning_rate": 9.24749721913237e-06, "loss": 3.4389, "mean_token_accuracy": 0.40722114764667955, "step": 4059 }, { "epoch": 0.7526881720430108, "grad_norm": 6.28125, "learning_rate": 9.24731182795699e-06, "loss": 2.2765, "mean_token_accuracy": 0.5191870739712194, "step": 4060 }, { "epoch": 0.7528735632183908, "grad_norm": 6.0390625, "learning_rate": 9.24712643678161e-06, "loss": 2.9485, "mean_token_accuracy": 0.4388227927363807, "step": 4061 }, { "epoch": 0.7530589543937709, "grad_norm": 5.42578125, "learning_rate": 9.246941045606231e-06, "loss": 2.8608, "mean_token_accuracy": 0.4466173962478681, "step": 4062 }, { "epoch": 0.7532443455691509, "grad_norm": 5.73046875, "learning_rate": 9.24675565443085e-06, "loss": 2.5679, "mean_token_accuracy": 0.48073022312373226, "step": 4063 }, { "epoch": 0.753429736744531, "grad_norm": 8.84375, "learning_rate": 9.24657026325547e-06, "loss": 2.5652, "mean_token_accuracy": 0.4735376044568245, "step": 4064 }, { "epoch": 0.753615127919911, "grad_norm": 6.10546875, "learning_rate": 9.246384872080089e-06, "loss": 3.3759, "mean_token_accuracy": 0.39525230250356724, "step": 4065 }, { "epoch": 0.7538005190952911, "grad_norm": 7.8046875, "learning_rate": 9.24619948090471e-06, "loss": 2.9538, "mean_token_accuracy": 0.41501494851068543, "step": 4066 }, { "epoch": 0.7539859102706711, "grad_norm": 6.828125, "learning_rate": 9.24601408972933e-06, "loss": 2.7162, "mean_token_accuracy": 0.44494777903611427, "step": 4067 }, { "epoch": 0.7541713014460512, "grad_norm": 7.671875, "learning_rate": 9.245828698553949e-06, "loss": 2.5477, "mean_token_accuracy": 0.47215865751334857, "step": 4068 }, { "epoch": 0.7543566926214312, "grad_norm": 6.35546875, "learning_rate": 9.245643307378571e-06, "loss": 2.6635, "mean_token_accuracy": 0.4515411973918198, "step": 4069 }, { "epoch": 0.7545420837968113, "grad_norm": 6.12890625, "learning_rate": 9.24545791620319e-06, "loss": 2.3599, "mean_token_accuracy": 0.4759839893262175, "step": 4070 }, { "epoch": 0.7547274749721913, "grad_norm": 5.16796875, "learning_rate": 9.24527252502781e-06, "loss": 3.1606, "mean_token_accuracy": 0.4105604793409063, "step": 4071 }, { "epoch": 0.7549128661475714, "grad_norm": 5.8671875, "learning_rate": 9.245087133852429e-06, "loss": 2.9074, "mean_token_accuracy": 0.43739304050199657, "step": 4072 }, { "epoch": 0.7550982573229514, "grad_norm": 6.62109375, "learning_rate": 9.24490174267705e-06, "loss": 2.6379, "mean_token_accuracy": 0.44672545901402344, "step": 4073 }, { "epoch": 0.7552836484983315, "grad_norm": 6.48046875, "learning_rate": 9.244716351501668e-06, "loss": 2.5613, "mean_token_accuracy": 0.47007863521258164, "step": 4074 }, { "epoch": 0.7554690396737115, "grad_norm": 6.41015625, "learning_rate": 9.244530960326289e-06, "loss": 2.751, "mean_token_accuracy": 0.4574554294975689, "step": 4075 }, { "epoch": 0.7556544308490916, "grad_norm": 6.6875, "learning_rate": 9.24434556915091e-06, "loss": 3.018, "mean_token_accuracy": 0.41948938321536905, "step": 4076 }, { "epoch": 0.7558398220244716, "grad_norm": 5.5703125, "learning_rate": 9.24416017797553e-06, "loss": 2.8349, "mean_token_accuracy": 0.43303638834365094, "step": 4077 }, { "epoch": 0.7560252131998517, "grad_norm": 7.48828125, "learning_rate": 9.24397478680015e-06, "loss": 2.5955, "mean_token_accuracy": 0.46965012205044754, "step": 4078 }, { "epoch": 0.7562106043752317, "grad_norm": 5.27734375, "learning_rate": 9.243789395624769e-06, "loss": 2.9772, "mean_token_accuracy": 0.435501257635645, "step": 4079 }, { "epoch": 0.7563959955506118, "grad_norm": 7.5390625, "learning_rate": 9.24360400444939e-06, "loss": 2.6255, "mean_token_accuracy": 0.45963926670609107, "step": 4080 }, { "epoch": 0.7565813867259918, "grad_norm": 8.3828125, "learning_rate": 9.243418613274008e-06, "loss": 2.2806, "mean_token_accuracy": 0.524390243902439, "step": 4081 }, { "epoch": 0.7567667779013719, "grad_norm": 5.97265625, "learning_rate": 9.243233222098629e-06, "loss": 2.6644, "mean_token_accuracy": 0.4671719867620369, "step": 4082 }, { "epoch": 0.756952169076752, "grad_norm": 6.71875, "learning_rate": 9.24304783092325e-06, "loss": 2.9834, "mean_token_accuracy": 0.42625, "step": 4083 }, { "epoch": 0.757137560252132, "grad_norm": 5.94140625, "learning_rate": 9.242862439747868e-06, "loss": 3.0921, "mean_token_accuracy": 0.4190197123068727, "step": 4084 }, { "epoch": 0.757322951427512, "grad_norm": 6.18359375, "learning_rate": 9.242677048572489e-06, "loss": 2.4582, "mean_token_accuracy": 0.485120718697361, "step": 4085 }, { "epoch": 0.7575083426028921, "grad_norm": 5.72265625, "learning_rate": 9.242491657397109e-06, "loss": 3.0648, "mean_token_accuracy": 0.42507645259938837, "step": 4086 }, { "epoch": 0.7576937337782722, "grad_norm": 5.2734375, "learning_rate": 9.24230626622173e-06, "loss": 2.83, "mean_token_accuracy": 0.45915450579208666, "step": 4087 }, { "epoch": 0.7578791249536522, "grad_norm": 5.53515625, "learning_rate": 9.242120875046348e-06, "loss": 2.8726, "mean_token_accuracy": 0.45125827814569536, "step": 4088 }, { "epoch": 0.7580645161290323, "grad_norm": 5.2109375, "learning_rate": 9.241935483870969e-06, "loss": 2.5661, "mean_token_accuracy": 0.47138519037608034, "step": 4089 }, { "epoch": 0.7582499073044123, "grad_norm": 10.5625, "learning_rate": 9.241750092695588e-06, "loss": 2.572, "mean_token_accuracy": 0.4420249186216186, "step": 4090 }, { "epoch": 0.7584352984797924, "grad_norm": 5.64453125, "learning_rate": 9.241564701520208e-06, "loss": 2.398, "mean_token_accuracy": 0.5001670936838587, "step": 4091 }, { "epoch": 0.7586206896551724, "grad_norm": 5.54296875, "learning_rate": 9.241379310344829e-06, "loss": 2.6036, "mean_token_accuracy": 0.461252721916229, "step": 4092 }, { "epoch": 0.7588060808305525, "grad_norm": 5.11328125, "learning_rate": 9.241193919169447e-06, "loss": 3.0353, "mean_token_accuracy": 0.44257630625969996, "step": 4093 }, { "epoch": 0.7589914720059325, "grad_norm": 6.734375, "learning_rate": 9.241008527994068e-06, "loss": 2.5399, "mean_token_accuracy": 0.48105911652903394, "step": 4094 }, { "epoch": 0.7591768631813126, "grad_norm": 6.78125, "learning_rate": 9.240823136818688e-06, "loss": 2.7591, "mean_token_accuracy": 0.45267997308813634, "step": 4095 }, { "epoch": 0.7593622543566926, "grad_norm": 6.1015625, "learning_rate": 9.240637745643309e-06, "loss": 3.0812, "mean_token_accuracy": 0.4214317375886525, "step": 4096 }, { "epoch": 0.7595476455320727, "grad_norm": 5.484375, "learning_rate": 9.240452354467928e-06, "loss": 2.8984, "mean_token_accuracy": 0.4418202052677136, "step": 4097 }, { "epoch": 0.7597330367074527, "grad_norm": 7.09375, "learning_rate": 9.240266963292548e-06, "loss": 2.6811, "mean_token_accuracy": 0.45031395031395033, "step": 4098 }, { "epoch": 0.7599184278828328, "grad_norm": 6.44921875, "learning_rate": 9.240081572117167e-06, "loss": 2.801, "mean_token_accuracy": 0.4308017372931095, "step": 4099 }, { "epoch": 0.7601038190582128, "grad_norm": 7.0703125, "learning_rate": 9.239896180941788e-06, "loss": 2.4322, "mean_token_accuracy": 0.49240473738414003, "step": 4100 }, { "epoch": 0.7602892102335929, "grad_norm": 8.546875, "learning_rate": 9.239710789766408e-06, "loss": 2.8106, "mean_token_accuracy": 0.4342989571263036, "step": 4101 }, { "epoch": 0.7604746014089729, "grad_norm": 8.0234375, "learning_rate": 9.239525398591028e-06, "loss": 2.3234, "mean_token_accuracy": 0.5059210526315789, "step": 4102 }, { "epoch": 0.760659992584353, "grad_norm": 5.4296875, "learning_rate": 9.239340007415647e-06, "loss": 3.6483, "mean_token_accuracy": 0.3774027715690657, "step": 4103 }, { "epoch": 0.760845383759733, "grad_norm": 7.73828125, "learning_rate": 9.239154616240268e-06, "loss": 2.407, "mean_token_accuracy": 0.5005283550545967, "step": 4104 }, { "epoch": 0.7610307749351131, "grad_norm": 7.4375, "learning_rate": 9.238969225064888e-06, "loss": 2.9656, "mean_token_accuracy": 0.4222117350951707, "step": 4105 }, { "epoch": 0.7612161661104931, "grad_norm": 6.25, "learning_rate": 9.238783833889507e-06, "loss": 2.7779, "mean_token_accuracy": 0.4521497919556172, "step": 4106 }, { "epoch": 0.7614015572858732, "grad_norm": 5.4375, "learning_rate": 9.238598442714128e-06, "loss": 3.346, "mean_token_accuracy": 0.40461971830985916, "step": 4107 }, { "epoch": 0.7615869484612533, "grad_norm": 5.40234375, "learning_rate": 9.238413051538746e-06, "loss": 2.6364, "mean_token_accuracy": 0.4711763178395222, "step": 4108 }, { "epoch": 0.7617723396366333, "grad_norm": 7.16796875, "learning_rate": 9.238227660363367e-06, "loss": 2.8688, "mean_token_accuracy": 0.4258139235619611, "step": 4109 }, { "epoch": 0.7619577308120133, "grad_norm": 6.2890625, "learning_rate": 9.238042269187987e-06, "loss": 2.4537, "mean_token_accuracy": 0.49286823894930093, "step": 4110 }, { "epoch": 0.7621431219873934, "grad_norm": 5.89453125, "learning_rate": 9.237856878012608e-06, "loss": 2.7115, "mean_token_accuracy": 0.4467812259553732, "step": 4111 }, { "epoch": 0.7623285131627735, "grad_norm": 5.5390625, "learning_rate": 9.237671486837228e-06, "loss": 3.4207, "mean_token_accuracy": 0.3841204057149089, "step": 4112 }, { "epoch": 0.7625139043381535, "grad_norm": 7.17578125, "learning_rate": 9.237486095661847e-06, "loss": 2.9447, "mean_token_accuracy": 0.4154901169826543, "step": 4113 }, { "epoch": 0.7626992955135335, "grad_norm": 11.8671875, "learning_rate": 9.237300704486468e-06, "loss": 2.6685, "mean_token_accuracy": 0.46116449971735446, "step": 4114 }, { "epoch": 0.7628846866889136, "grad_norm": 7.30859375, "learning_rate": 9.237115313311086e-06, "loss": 2.5841, "mean_token_accuracy": 0.4966654083301875, "step": 4115 }, { "epoch": 0.7630700778642937, "grad_norm": 6.00390625, "learning_rate": 9.236929922135707e-06, "loss": 3.0009, "mean_token_accuracy": 0.4423462390547526, "step": 4116 }, { "epoch": 0.7632554690396737, "grad_norm": 7.40625, "learning_rate": 9.236744530960326e-06, "loss": 2.7945, "mean_token_accuracy": 0.44911616161616164, "step": 4117 }, { "epoch": 0.7634408602150538, "grad_norm": 6.44140625, "learning_rate": 9.236559139784948e-06, "loss": 2.9311, "mean_token_accuracy": 0.42918210316329436, "step": 4118 }, { "epoch": 0.7636262513904338, "grad_norm": 6.15234375, "learning_rate": 9.236373748609567e-06, "loss": 2.4578, "mean_token_accuracy": 0.4905496415381273, "step": 4119 }, { "epoch": 0.7638116425658139, "grad_norm": 5.078125, "learning_rate": 9.236188357434187e-06, "loss": 2.7956, "mean_token_accuracy": 0.44071315178898524, "step": 4120 }, { "epoch": 0.7639970337411939, "grad_norm": 8.375, "learning_rate": 9.236002966258808e-06, "loss": 2.423, "mean_token_accuracy": 0.4873111339298812, "step": 4121 }, { "epoch": 0.764182424916574, "grad_norm": 6.265625, "learning_rate": 9.235817575083426e-06, "loss": 3.3581, "mean_token_accuracy": 0.38879070941681393, "step": 4122 }, { "epoch": 0.764367816091954, "grad_norm": 6.5390625, "learning_rate": 9.235632183908047e-06, "loss": 2.7319, "mean_token_accuracy": 0.4613774335356919, "step": 4123 }, { "epoch": 0.7645532072673341, "grad_norm": 8.75, "learning_rate": 9.235446792732666e-06, "loss": 2.7523, "mean_token_accuracy": 0.4466780724265754, "step": 4124 }, { "epoch": 0.7647385984427141, "grad_norm": 5.28515625, "learning_rate": 9.235261401557286e-06, "loss": 2.7499, "mean_token_accuracy": 0.45650557620817844, "step": 4125 }, { "epoch": 0.7649239896180942, "grad_norm": 8.6796875, "learning_rate": 9.235076010381907e-06, "loss": 2.835, "mean_token_accuracy": 0.4424226324877142, "step": 4126 }, { "epoch": 0.7651093807934742, "grad_norm": 11.7734375, "learning_rate": 9.234890619206527e-06, "loss": 3.3589, "mean_token_accuracy": 0.3951010410287814, "step": 4127 }, { "epoch": 0.7652947719688543, "grad_norm": 10.640625, "learning_rate": 9.234705228031146e-06, "loss": 2.7618, "mean_token_accuracy": 0.4427973699940227, "step": 4128 }, { "epoch": 0.7654801631442343, "grad_norm": 9.0859375, "learning_rate": 9.234519836855767e-06, "loss": 2.836, "mean_token_accuracy": 0.44959816303099887, "step": 4129 }, { "epoch": 0.7656655543196144, "grad_norm": 9.59375, "learning_rate": 9.234334445680387e-06, "loss": 2.873, "mean_token_accuracy": 0.4414353419092756, "step": 4130 }, { "epoch": 0.7658509454949944, "grad_norm": 6.01953125, "learning_rate": 9.234149054505006e-06, "loss": 2.663, "mean_token_accuracy": 0.4666402953586498, "step": 4131 }, { "epoch": 0.7660363366703745, "grad_norm": 5.85546875, "learning_rate": 9.233963663329626e-06, "loss": 2.9813, "mean_token_accuracy": 0.4395527603074773, "step": 4132 }, { "epoch": 0.7662217278457546, "grad_norm": 6.4296875, "learning_rate": 9.233778272154245e-06, "loss": 2.3067, "mean_token_accuracy": 0.5062653957373889, "step": 4133 }, { "epoch": 0.7664071190211346, "grad_norm": 6.859375, "learning_rate": 9.233592880978867e-06, "loss": 2.3876, "mean_token_accuracy": 0.49891316782976775, "step": 4134 }, { "epoch": 0.7665925101965146, "grad_norm": 6.0078125, "learning_rate": 9.233407489803486e-06, "loss": 3.1111, "mean_token_accuracy": 0.40258924082453496, "step": 4135 }, { "epoch": 0.7667779013718947, "grad_norm": 5.53515625, "learning_rate": 9.233222098628107e-06, "loss": 3.1174, "mean_token_accuracy": 0.42552891396332865, "step": 4136 }, { "epoch": 0.7669632925472748, "grad_norm": 7.84765625, "learning_rate": 9.233036707452725e-06, "loss": 2.1024, "mean_token_accuracy": 0.5307548134264781, "step": 4137 }, { "epoch": 0.7671486837226548, "grad_norm": 5.40234375, "learning_rate": 9.232851316277346e-06, "loss": 3.1837, "mean_token_accuracy": 0.40727453911310413, "step": 4138 }, { "epoch": 0.7673340748980348, "grad_norm": 7.0, "learning_rate": 9.232665925101966e-06, "loss": 2.6321, "mean_token_accuracy": 0.46714182169606183, "step": 4139 }, { "epoch": 0.7675194660734149, "grad_norm": 7.41796875, "learning_rate": 9.232480533926585e-06, "loss": 2.7366, "mean_token_accuracy": 0.4531009738595592, "step": 4140 }, { "epoch": 0.767704857248795, "grad_norm": 5.37890625, "learning_rate": 9.232295142751206e-06, "loss": 2.981, "mean_token_accuracy": 0.43306672279520103, "step": 4141 }, { "epoch": 0.767890248424175, "grad_norm": 6.7421875, "learning_rate": 9.232109751575826e-06, "loss": 2.3775, "mean_token_accuracy": 0.5042005600746766, "step": 4142 }, { "epoch": 0.768075639599555, "grad_norm": 6.7890625, "learning_rate": 9.231924360400447e-06, "loss": 2.8148, "mean_token_accuracy": 0.4447220487195503, "step": 4143 }, { "epoch": 0.7682610307749351, "grad_norm": 5.5859375, "learning_rate": 9.231738969225065e-06, "loss": 2.9425, "mean_token_accuracy": 0.4293394777265745, "step": 4144 }, { "epoch": 0.7684464219503152, "grad_norm": 7.19140625, "learning_rate": 9.231553578049686e-06, "loss": 2.7067, "mean_token_accuracy": 0.4512902827509569, "step": 4145 }, { "epoch": 0.7686318131256952, "grad_norm": 6.92578125, "learning_rate": 9.231368186874305e-06, "loss": 2.9598, "mean_token_accuracy": 0.4323198667221991, "step": 4146 }, { "epoch": 0.7688172043010753, "grad_norm": 6.99609375, "learning_rate": 9.231182795698925e-06, "loss": 2.915, "mean_token_accuracy": 0.4269088844734601, "step": 4147 }, { "epoch": 0.7690025954764553, "grad_norm": 6.97265625, "learning_rate": 9.230997404523546e-06, "loss": 3.2325, "mean_token_accuracy": 0.40055370985603544, "step": 4148 }, { "epoch": 0.7691879866518354, "grad_norm": 8.5390625, "learning_rate": 9.230812013348164e-06, "loss": 2.9646, "mean_token_accuracy": 0.4490572565300121, "step": 4149 }, { "epoch": 0.7693733778272154, "grad_norm": 6.50390625, "learning_rate": 9.230626622172787e-06, "loss": 2.5494, "mean_token_accuracy": 0.4760922925871379, "step": 4150 }, { "epoch": 0.7695587690025955, "grad_norm": 7.953125, "learning_rate": 9.230441230997405e-06, "loss": 2.389, "mean_token_accuracy": 0.4922844877596009, "step": 4151 }, { "epoch": 0.7697441601779755, "grad_norm": 5.26953125, "learning_rate": 9.230255839822026e-06, "loss": 3.2441, "mean_token_accuracy": 0.398993158642055, "step": 4152 }, { "epoch": 0.7699295513533556, "grad_norm": 6.30078125, "learning_rate": 9.230070448646645e-06, "loss": 2.7994, "mean_token_accuracy": 0.4609221069223696, "step": 4153 }, { "epoch": 0.7701149425287356, "grad_norm": 7.23046875, "learning_rate": 9.229885057471265e-06, "loss": 2.3821, "mean_token_accuracy": 0.49879027123392333, "step": 4154 }, { "epoch": 0.7703003337041157, "grad_norm": 4.71484375, "learning_rate": 9.229699666295884e-06, "loss": 2.9933, "mean_token_accuracy": 0.41575956073595993, "step": 4155 }, { "epoch": 0.7704857248794957, "grad_norm": 4.921875, "learning_rate": 9.229514275120505e-06, "loss": 2.5524, "mean_token_accuracy": 0.4837806301050175, "step": 4156 }, { "epoch": 0.7706711160548758, "grad_norm": 9.0, "learning_rate": 9.229328883945125e-06, "loss": 2.834, "mean_token_accuracy": 0.45734400883489784, "step": 4157 }, { "epoch": 0.7708565072302559, "grad_norm": 6.671875, "learning_rate": 9.229143492769746e-06, "loss": 2.9378, "mean_token_accuracy": 0.4311533159748813, "step": 4158 }, { "epoch": 0.7710418984056359, "grad_norm": 7.515625, "learning_rate": 9.228958101594366e-06, "loss": 2.7893, "mean_token_accuracy": 0.4407091125283753, "step": 4159 }, { "epoch": 0.7712272895810159, "grad_norm": 6.25, "learning_rate": 9.228772710418985e-06, "loss": 3.0159, "mean_token_accuracy": 0.4393793985924504, "step": 4160 }, { "epoch": 0.771412680756396, "grad_norm": 7.11328125, "learning_rate": 9.228587319243605e-06, "loss": 3.1598, "mean_token_accuracy": 0.425963808025177, "step": 4161 }, { "epoch": 0.7715980719317761, "grad_norm": 7.25, "learning_rate": 9.228401928068224e-06, "loss": 2.897, "mean_token_accuracy": 0.4387332521315469, "step": 4162 }, { "epoch": 0.7717834631071561, "grad_norm": 5.83203125, "learning_rate": 9.228216536892845e-06, "loss": 2.7121, "mean_token_accuracy": 0.45878378378378376, "step": 4163 }, { "epoch": 0.7719688542825361, "grad_norm": 5.7734375, "learning_rate": 9.228031145717465e-06, "loss": 2.8085, "mean_token_accuracy": 0.47136273864384465, "step": 4164 }, { "epoch": 0.7721542454579162, "grad_norm": 5.9765625, "learning_rate": 9.227845754542084e-06, "loss": 2.8246, "mean_token_accuracy": 0.44946401225114857, "step": 4165 }, { "epoch": 0.7723396366332963, "grad_norm": 7.42578125, "learning_rate": 9.227660363366704e-06, "loss": 3.3767, "mean_token_accuracy": 0.39490445859872614, "step": 4166 }, { "epoch": 0.7725250278086763, "grad_norm": 5.83203125, "learning_rate": 9.227474972191325e-06, "loss": 2.823, "mean_token_accuracy": 0.4447122200170116, "step": 4167 }, { "epoch": 0.7727104189840563, "grad_norm": 6.62109375, "learning_rate": 9.227289581015945e-06, "loss": 2.51, "mean_token_accuracy": 0.4813487322447896, "step": 4168 }, { "epoch": 0.7728958101594364, "grad_norm": 7.06640625, "learning_rate": 9.227104189840564e-06, "loss": 2.4711, "mean_token_accuracy": 0.4863375161252492, "step": 4169 }, { "epoch": 0.7730812013348165, "grad_norm": 5.90625, "learning_rate": 9.226918798665185e-06, "loss": 2.754, "mean_token_accuracy": 0.4450073323556859, "step": 4170 }, { "epoch": 0.7732665925101965, "grad_norm": 5.4609375, "learning_rate": 9.226733407489803e-06, "loss": 2.7942, "mean_token_accuracy": 0.4496114314364502, "step": 4171 }, { "epoch": 0.7734519836855765, "grad_norm": 7.18359375, "learning_rate": 9.226548016314424e-06, "loss": 2.5152, "mean_token_accuracy": 0.4770898341271145, "step": 4172 }, { "epoch": 0.7736373748609566, "grad_norm": 8.15625, "learning_rate": 9.226362625139044e-06, "loss": 2.8384, "mean_token_accuracy": 0.4507244297805193, "step": 4173 }, { "epoch": 0.7738227660363367, "grad_norm": 5.48046875, "learning_rate": 9.226177233963665e-06, "loss": 2.9016, "mean_token_accuracy": 0.4589261744966443, "step": 4174 }, { "epoch": 0.7740081572117167, "grad_norm": 5.5625, "learning_rate": 9.225991842788284e-06, "loss": 2.3125, "mean_token_accuracy": 0.501249256395003, "step": 4175 }, { "epoch": 0.7741935483870968, "grad_norm": 6.63671875, "learning_rate": 9.225806451612904e-06, "loss": 2.8131, "mean_token_accuracy": 0.444589820751014, "step": 4176 }, { "epoch": 0.7743789395624768, "grad_norm": 6.24609375, "learning_rate": 9.225621060437525e-06, "loss": 3.6458, "mean_token_accuracy": 0.38483241169168264, "step": 4177 }, { "epoch": 0.7745643307378569, "grad_norm": 7.4296875, "learning_rate": 9.225435669262143e-06, "loss": 3.0916, "mean_token_accuracy": 0.4105047748976808, "step": 4178 }, { "epoch": 0.7747497219132369, "grad_norm": 4.484375, "learning_rate": 9.225250278086764e-06, "loss": 2.8858, "mean_token_accuracy": 0.4352795373175434, "step": 4179 }, { "epoch": 0.774935113088617, "grad_norm": 8.21875, "learning_rate": 9.225064886911383e-06, "loss": 2.6781, "mean_token_accuracy": 0.4359771258080557, "step": 4180 }, { "epoch": 0.775120504263997, "grad_norm": 6.90625, "learning_rate": 9.224879495736003e-06, "loss": 2.8168, "mean_token_accuracy": 0.4501251340722202, "step": 4181 }, { "epoch": 0.7753058954393771, "grad_norm": 6.359375, "learning_rate": 9.224694104560624e-06, "loss": 3.041, "mean_token_accuracy": 0.41057134971018494, "step": 4182 }, { "epoch": 0.7754912866147572, "grad_norm": 6.23046875, "learning_rate": 9.224508713385244e-06, "loss": 3.0925, "mean_token_accuracy": 0.4010741943542343, "step": 4183 }, { "epoch": 0.7756766777901372, "grad_norm": 8.8828125, "learning_rate": 9.224323322209863e-06, "loss": 3.3827, "mean_token_accuracy": 0.3868409458070201, "step": 4184 }, { "epoch": 0.7758620689655172, "grad_norm": 6.06640625, "learning_rate": 9.224137931034484e-06, "loss": 3.0692, "mean_token_accuracy": 0.4360035100915131, "step": 4185 }, { "epoch": 0.7760474601408973, "grad_norm": 5.78125, "learning_rate": 9.223952539859104e-06, "loss": 2.7654, "mean_token_accuracy": 0.4539018250471995, "step": 4186 }, { "epoch": 0.7762328513162774, "grad_norm": 7.01953125, "learning_rate": 9.223767148683723e-06, "loss": 3.183, "mean_token_accuracy": 0.4066538355787737, "step": 4187 }, { "epoch": 0.7764182424916574, "grad_norm": 5.99609375, "learning_rate": 9.223581757508343e-06, "loss": 3.0951, "mean_token_accuracy": 0.41848477583313354, "step": 4188 }, { "epoch": 0.7766036336670374, "grad_norm": 6.4609375, "learning_rate": 9.223396366332962e-06, "loss": 3.4678, "mean_token_accuracy": 0.36137474834627553, "step": 4189 }, { "epoch": 0.7767890248424175, "grad_norm": 5.66015625, "learning_rate": 9.223210975157584e-06, "loss": 2.8044, "mean_token_accuracy": 0.447041166380789, "step": 4190 }, { "epoch": 0.7769744160177976, "grad_norm": 4.93359375, "learning_rate": 9.223025583982203e-06, "loss": 2.6011, "mean_token_accuracy": 0.4681701030927835, "step": 4191 }, { "epoch": 0.7771598071931776, "grad_norm": 7.42578125, "learning_rate": 9.222840192806824e-06, "loss": 2.639, "mean_token_accuracy": 0.471261309207025, "step": 4192 }, { "epoch": 0.7773451983685576, "grad_norm": 6.11328125, "learning_rate": 9.222654801631442e-06, "loss": 3.0532, "mean_token_accuracy": 0.4170703575547866, "step": 4193 }, { "epoch": 0.7775305895439377, "grad_norm": 6.3671875, "learning_rate": 9.222469410456063e-06, "loss": 2.9521, "mean_token_accuracy": 0.4315774996561683, "step": 4194 }, { "epoch": 0.7777159807193178, "grad_norm": 6.93359375, "learning_rate": 9.222284019280683e-06, "loss": 3.1715, "mean_token_accuracy": 0.41615853658536583, "step": 4195 }, { "epoch": 0.7779013718946978, "grad_norm": 5.68359375, "learning_rate": 9.222098628105302e-06, "loss": 3.0222, "mean_token_accuracy": 0.43789691330674935, "step": 4196 }, { "epoch": 0.7780867630700778, "grad_norm": 5.74609375, "learning_rate": 9.221913236929923e-06, "loss": 3.1901, "mean_token_accuracy": 0.40142570728447313, "step": 4197 }, { "epoch": 0.7782721542454579, "grad_norm": 6.7734375, "learning_rate": 9.221727845754543e-06, "loss": 3.0015, "mean_token_accuracy": 0.4218315203642803, "step": 4198 }, { "epoch": 0.778457545420838, "grad_norm": 6.7265625, "learning_rate": 9.221542454579164e-06, "loss": 2.5276, "mean_token_accuracy": 0.47587523870146403, "step": 4199 }, { "epoch": 0.778642936596218, "grad_norm": 6.7890625, "learning_rate": 9.221357063403782e-06, "loss": 2.8134, "mean_token_accuracy": 0.439819252362043, "step": 4200 }, { "epoch": 0.778828327771598, "grad_norm": 6.171875, "learning_rate": 9.221171672228403e-06, "loss": 3.1924, "mean_token_accuracy": 0.42214285714285715, "step": 4201 }, { "epoch": 0.7790137189469781, "grad_norm": 11.046875, "learning_rate": 9.220986281053023e-06, "loss": 2.6358, "mean_token_accuracy": 0.46728077521234596, "step": 4202 }, { "epoch": 0.7791991101223582, "grad_norm": 9.5078125, "learning_rate": 9.220800889877642e-06, "loss": 2.4564, "mean_token_accuracy": 0.48792212474462204, "step": 4203 }, { "epoch": 0.7793845012977382, "grad_norm": 6.88671875, "learning_rate": 9.220615498702263e-06, "loss": 3.0597, "mean_token_accuracy": 0.42879090490090643, "step": 4204 }, { "epoch": 0.7795698924731183, "grad_norm": 6.1875, "learning_rate": 9.220430107526881e-06, "loss": 3.0234, "mean_token_accuracy": 0.4236957581667479, "step": 4205 }, { "epoch": 0.7797552836484983, "grad_norm": 7.80078125, "learning_rate": 9.220244716351504e-06, "loss": 2.927, "mean_token_accuracy": 0.43412563667232595, "step": 4206 }, { "epoch": 0.7799406748238784, "grad_norm": 6.96484375, "learning_rate": 9.220059325176122e-06, "loss": 2.4846, "mean_token_accuracy": 0.493886230728336, "step": 4207 }, { "epoch": 0.7801260659992585, "grad_norm": 5.63671875, "learning_rate": 9.219873934000743e-06, "loss": 2.2268, "mean_token_accuracy": 0.5343340785691473, "step": 4208 }, { "epoch": 0.7803114571746385, "grad_norm": 7.1484375, "learning_rate": 9.219688542825362e-06, "loss": 2.542, "mean_token_accuracy": 0.4632611064035831, "step": 4209 }, { "epoch": 0.7804968483500185, "grad_norm": 6.21484375, "learning_rate": 9.219503151649982e-06, "loss": 2.7727, "mean_token_accuracy": 0.47639831878435174, "step": 4210 }, { "epoch": 0.7806822395253986, "grad_norm": 6.9609375, "learning_rate": 9.219317760474603e-06, "loss": 2.7549, "mean_token_accuracy": 0.45268024851900013, "step": 4211 }, { "epoch": 0.7808676307007787, "grad_norm": 5.37890625, "learning_rate": 9.219132369299222e-06, "loss": 2.7618, "mean_token_accuracy": 0.47688641779189833, "step": 4212 }, { "epoch": 0.7810530218761587, "grad_norm": 6.6640625, "learning_rate": 9.218946978123842e-06, "loss": 2.7575, "mean_token_accuracy": 0.4550787280024699, "step": 4213 }, { "epoch": 0.7812384130515387, "grad_norm": 8.4921875, "learning_rate": 9.21876158694846e-06, "loss": 2.2068, "mean_token_accuracy": 0.5121336274818783, "step": 4214 }, { "epoch": 0.7814238042269188, "grad_norm": 5.1640625, "learning_rate": 9.218576195773083e-06, "loss": 2.683, "mean_token_accuracy": 0.45683105401298524, "step": 4215 }, { "epoch": 0.7816091954022989, "grad_norm": 10.03125, "learning_rate": 9.218390804597702e-06, "loss": 2.8649, "mean_token_accuracy": 0.463226649248857, "step": 4216 }, { "epoch": 0.7817945865776789, "grad_norm": 7.4921875, "learning_rate": 9.218205413422322e-06, "loss": 2.687, "mean_token_accuracy": 0.46336526784509513, "step": 4217 }, { "epoch": 0.7819799777530589, "grad_norm": 5.69921875, "learning_rate": 9.218020022246941e-06, "loss": 2.9644, "mean_token_accuracy": 0.44504943401633473, "step": 4218 }, { "epoch": 0.782165368928439, "grad_norm": 6.95703125, "learning_rate": 9.217834631071562e-06, "loss": 2.7178, "mean_token_accuracy": 0.4540048083006453, "step": 4219 }, { "epoch": 0.7823507601038191, "grad_norm": 6.05078125, "learning_rate": 9.217649239896182e-06, "loss": 3.5448, "mean_token_accuracy": 0.4, "step": 4220 }, { "epoch": 0.7825361512791991, "grad_norm": 5.82421875, "learning_rate": 9.217463848720801e-06, "loss": 3.0608, "mean_token_accuracy": 0.42347792508688376, "step": 4221 }, { "epoch": 0.7827215424545791, "grad_norm": 6.08203125, "learning_rate": 9.217278457545421e-06, "loss": 3.1666, "mean_token_accuracy": 0.4160839160839161, "step": 4222 }, { "epoch": 0.7829069336299592, "grad_norm": 6.21484375, "learning_rate": 9.217093066370042e-06, "loss": 3.1761, "mean_token_accuracy": 0.4104833219877468, "step": 4223 }, { "epoch": 0.7830923248053393, "grad_norm": 7.6640625, "learning_rate": 9.216907675194662e-06, "loss": 3.0579, "mean_token_accuracy": 0.4257872999483738, "step": 4224 }, { "epoch": 0.7832777159807193, "grad_norm": 12.203125, "learning_rate": 9.216722284019281e-06, "loss": 2.5231, "mean_token_accuracy": 0.4465006729475101, "step": 4225 }, { "epoch": 0.7834631071560993, "grad_norm": 8.015625, "learning_rate": 9.216536892843902e-06, "loss": 3.2124, "mean_token_accuracy": 0.43701142513529767, "step": 4226 }, { "epoch": 0.7836484983314794, "grad_norm": 9.4140625, "learning_rate": 9.21635150166852e-06, "loss": 2.6305, "mean_token_accuracy": 0.4792484243072898, "step": 4227 }, { "epoch": 0.7838338895068595, "grad_norm": 5.5, "learning_rate": 9.216166110493141e-06, "loss": 2.9033, "mean_token_accuracy": 0.4327833050230527, "step": 4228 }, { "epoch": 0.7840192806822395, "grad_norm": 7.10546875, "learning_rate": 9.215980719317761e-06, "loss": 2.9471, "mean_token_accuracy": 0.44229017566688356, "step": 4229 }, { "epoch": 0.7842046718576196, "grad_norm": 7.00390625, "learning_rate": 9.21579532814238e-06, "loss": 2.9005, "mean_token_accuracy": 0.438558752352783, "step": 4230 }, { "epoch": 0.7843900630329996, "grad_norm": 8.0234375, "learning_rate": 9.215609936967002e-06, "loss": 2.5694, "mean_token_accuracy": 0.46709549727857497, "step": 4231 }, { "epoch": 0.7845754542083797, "grad_norm": 6.8125, "learning_rate": 9.215424545791621e-06, "loss": 2.976, "mean_token_accuracy": 0.4227266155847911, "step": 4232 }, { "epoch": 0.7847608453837598, "grad_norm": 8.890625, "learning_rate": 9.215239154616242e-06, "loss": 3.0059, "mean_token_accuracy": 0.44510181618051736, "step": 4233 }, { "epoch": 0.7849462365591398, "grad_norm": 6.21484375, "learning_rate": 9.21505376344086e-06, "loss": 2.9082, "mean_token_accuracy": 0.45241162158293974, "step": 4234 }, { "epoch": 0.7851316277345198, "grad_norm": 8.96875, "learning_rate": 9.214868372265481e-06, "loss": 3.0119, "mean_token_accuracy": 0.4393545592376002, "step": 4235 }, { "epoch": 0.7853170189098999, "grad_norm": 8.21875, "learning_rate": 9.2146829810901e-06, "loss": 2.8862, "mean_token_accuracy": 0.4429034783581255, "step": 4236 }, { "epoch": 0.78550241008528, "grad_norm": 12.0234375, "learning_rate": 9.21449758991472e-06, "loss": 2.8171, "mean_token_accuracy": 0.44640914036996737, "step": 4237 }, { "epoch": 0.78568780126066, "grad_norm": 7.8125, "learning_rate": 9.21431219873934e-06, "loss": 2.8472, "mean_token_accuracy": 0.4338443396226415, "step": 4238 }, { "epoch": 0.78587319243604, "grad_norm": 5.44140625, "learning_rate": 9.214126807563961e-06, "loss": 2.9486, "mean_token_accuracy": 0.42700889601866704, "step": 4239 }, { "epoch": 0.7860585836114201, "grad_norm": 6.48828125, "learning_rate": 9.213941416388582e-06, "loss": 2.7167, "mean_token_accuracy": 0.461724041941171, "step": 4240 }, { "epoch": 0.7862439747868002, "grad_norm": 14.625, "learning_rate": 9.2137560252132e-06, "loss": 2.5803, "mean_token_accuracy": 0.46241979835013747, "step": 4241 }, { "epoch": 0.7864293659621802, "grad_norm": 6.8515625, "learning_rate": 9.213570634037821e-06, "loss": 2.4896, "mean_token_accuracy": 0.48712849408905096, "step": 4242 }, { "epoch": 0.7866147571375602, "grad_norm": 6.703125, "learning_rate": 9.21338524286244e-06, "loss": 2.9513, "mean_token_accuracy": 0.4453387671930718, "step": 4243 }, { "epoch": 0.7868001483129403, "grad_norm": 6.03125, "learning_rate": 9.21319985168706e-06, "loss": 3.1561, "mean_token_accuracy": 0.43246578415974873, "step": 4244 }, { "epoch": 0.7869855394883204, "grad_norm": 7.828125, "learning_rate": 9.21301446051168e-06, "loss": 3.2209, "mean_token_accuracy": 0.41520035487209817, "step": 4245 }, { "epoch": 0.7871709306637004, "grad_norm": 7.53515625, "learning_rate": 9.2128290693363e-06, "loss": 2.8638, "mean_token_accuracy": 0.43796268877702105, "step": 4246 }, { "epoch": 0.7873563218390804, "grad_norm": 5.20703125, "learning_rate": 9.21264367816092e-06, "loss": 2.7127, "mean_token_accuracy": 0.4641888838680524, "step": 4247 }, { "epoch": 0.7875417130144605, "grad_norm": 6.609375, "learning_rate": 9.21245828698554e-06, "loss": 2.5332, "mean_token_accuracy": 0.4695767195767196, "step": 4248 }, { "epoch": 0.7877271041898406, "grad_norm": 7.23828125, "learning_rate": 9.212272895810161e-06, "loss": 2.6835, "mean_token_accuracy": 0.4561465304684748, "step": 4249 }, { "epoch": 0.7879124953652206, "grad_norm": 6.90625, "learning_rate": 9.21208750463478e-06, "loss": 2.5804, "mean_token_accuracy": 0.4583576684966709, "step": 4250 }, { "epoch": 0.7880978865406006, "grad_norm": 9.71875, "learning_rate": 9.2119021134594e-06, "loss": 2.0372, "mean_token_accuracy": 0.5474542758279782, "step": 4251 }, { "epoch": 0.7882832777159807, "grad_norm": 10.0859375, "learning_rate": 9.21171672228402e-06, "loss": 2.9795, "mean_token_accuracy": 0.4281604602844814, "step": 4252 }, { "epoch": 0.7884686688913608, "grad_norm": 11.0859375, "learning_rate": 9.21153133110864e-06, "loss": 2.0194, "mean_token_accuracy": 0.531136449066533, "step": 4253 }, { "epoch": 0.7886540600667408, "grad_norm": 6.4296875, "learning_rate": 9.21134593993326e-06, "loss": 2.8925, "mean_token_accuracy": 0.43632596685082875, "step": 4254 }, { "epoch": 0.7888394512421208, "grad_norm": 7.734375, "learning_rate": 9.21116054875788e-06, "loss": 2.3587, "mean_token_accuracy": 0.5004965243296922, "step": 4255 }, { "epoch": 0.7890248424175009, "grad_norm": 7.890625, "learning_rate": 9.2109751575825e-06, "loss": 2.9216, "mean_token_accuracy": 0.430595286745675, "step": 4256 }, { "epoch": 0.789210233592881, "grad_norm": 5.43359375, "learning_rate": 9.21078976640712e-06, "loss": 2.8603, "mean_token_accuracy": 0.44718485301444943, "step": 4257 }, { "epoch": 0.7893956247682611, "grad_norm": 6.83984375, "learning_rate": 9.21060437523174e-06, "loss": 2.569, "mean_token_accuracy": 0.47357954545454545, "step": 4258 }, { "epoch": 0.789581015943641, "grad_norm": 7.9765625, "learning_rate": 9.21041898405636e-06, "loss": 2.7196, "mean_token_accuracy": 0.4508126603934987, "step": 4259 }, { "epoch": 0.7897664071190211, "grad_norm": 6.51171875, "learning_rate": 9.21023359288098e-06, "loss": 2.4467, "mean_token_accuracy": 0.49926144756277696, "step": 4260 }, { "epoch": 0.7899517982944012, "grad_norm": 6.328125, "learning_rate": 9.210048201705599e-06, "loss": 2.5325, "mean_token_accuracy": 0.47717231222385864, "step": 4261 }, { "epoch": 0.7901371894697813, "grad_norm": 5.34765625, "learning_rate": 9.209862810530219e-06, "loss": 3.2595, "mean_token_accuracy": 0.41034952337721287, "step": 4262 }, { "epoch": 0.7903225806451613, "grad_norm": 7.26953125, "learning_rate": 9.20967741935484e-06, "loss": 2.4621, "mean_token_accuracy": 0.4769585253456221, "step": 4263 }, { "epoch": 0.7905079718205413, "grad_norm": 5.50390625, "learning_rate": 9.20949202817946e-06, "loss": 3.29, "mean_token_accuracy": 0.40071852977753214, "step": 4264 }, { "epoch": 0.7906933629959214, "grad_norm": 7.828125, "learning_rate": 9.209306637004079e-06, "loss": 2.6563, "mean_token_accuracy": 0.47397908366533864, "step": 4265 }, { "epoch": 0.7908787541713015, "grad_norm": 5.2734375, "learning_rate": 9.2091212458287e-06, "loss": 2.8021, "mean_token_accuracy": 0.4498983444670346, "step": 4266 }, { "epoch": 0.7910641453466815, "grad_norm": 6.89453125, "learning_rate": 9.20893585465332e-06, "loss": 2.5447, "mean_token_accuracy": 0.4834315169366716, "step": 4267 }, { "epoch": 0.7912495365220615, "grad_norm": 6.0546875, "learning_rate": 9.208750463477939e-06, "loss": 2.9085, "mean_token_accuracy": 0.4450795785698274, "step": 4268 }, { "epoch": 0.7914349276974416, "grad_norm": 7.21484375, "learning_rate": 9.208565072302559e-06, "loss": 2.8274, "mean_token_accuracy": 0.43431008391800796, "step": 4269 }, { "epoch": 0.7916203188728217, "grad_norm": 6.37109375, "learning_rate": 9.208379681127178e-06, "loss": 2.661, "mean_token_accuracy": 0.46020338983050846, "step": 4270 }, { "epoch": 0.7918057100482017, "grad_norm": 6.85546875, "learning_rate": 9.2081942899518e-06, "loss": 2.4655, "mean_token_accuracy": 0.4787803360298693, "step": 4271 }, { "epoch": 0.7919911012235817, "grad_norm": 8.734375, "learning_rate": 9.208008898776419e-06, "loss": 2.7239, "mean_token_accuracy": 0.4778111739745403, "step": 4272 }, { "epoch": 0.7921764923989618, "grad_norm": 8.09375, "learning_rate": 9.20782350760104e-06, "loss": 2.6054, "mean_token_accuracy": 0.4744768912393654, "step": 4273 }, { "epoch": 0.7923618835743419, "grad_norm": 6.390625, "learning_rate": 9.207638116425658e-06, "loss": 2.9531, "mean_token_accuracy": 0.43011452368558045, "step": 4274 }, { "epoch": 0.7925472747497219, "grad_norm": 8.984375, "learning_rate": 9.207452725250279e-06, "loss": 3.0171, "mean_token_accuracy": 0.4263424782212976, "step": 4275 }, { "epoch": 0.7927326659251019, "grad_norm": 10.0, "learning_rate": 9.207267334074899e-06, "loss": 2.8858, "mean_token_accuracy": 0.43310782824796357, "step": 4276 }, { "epoch": 0.792918057100482, "grad_norm": 6.796875, "learning_rate": 9.207081942899518e-06, "loss": 2.9521, "mean_token_accuracy": 0.425511958730655, "step": 4277 }, { "epoch": 0.7931034482758621, "grad_norm": 7.0078125, "learning_rate": 9.206896551724138e-06, "loss": 2.802, "mean_token_accuracy": 0.48226661231145534, "step": 4278 }, { "epoch": 0.7932888394512421, "grad_norm": 11.0625, "learning_rate": 9.206711160548759e-06, "loss": 2.8343, "mean_token_accuracy": 0.4278266050945926, "step": 4279 }, { "epoch": 0.7934742306266221, "grad_norm": 9.3828125, "learning_rate": 9.20652576937338e-06, "loss": 2.8539, "mean_token_accuracy": 0.42800055119195257, "step": 4280 }, { "epoch": 0.7936596218020022, "grad_norm": 7.02734375, "learning_rate": 9.206340378197998e-06, "loss": 2.6687, "mean_token_accuracy": 0.46119336025123375, "step": 4281 }, { "epoch": 0.7938450129773823, "grad_norm": 8.015625, "learning_rate": 9.206154987022619e-06, "loss": 2.7846, "mean_token_accuracy": 0.4513231756214916, "step": 4282 }, { "epoch": 0.7940304041527624, "grad_norm": 12.0703125, "learning_rate": 9.20596959584724e-06, "loss": 2.5652, "mean_token_accuracy": 0.45425616547334924, "step": 4283 }, { "epoch": 0.7942157953281423, "grad_norm": 9.3125, "learning_rate": 9.205784204671858e-06, "loss": 3.1699, "mean_token_accuracy": 0.4195605146305363, "step": 4284 }, { "epoch": 0.7944011865035224, "grad_norm": 7.20703125, "learning_rate": 9.205598813496478e-06, "loss": 2.6675, "mean_token_accuracy": 0.4723760160207327, "step": 4285 }, { "epoch": 0.7945865776789025, "grad_norm": 11.3671875, "learning_rate": 9.205413422321097e-06, "loss": 3.2542, "mean_token_accuracy": 0.3997289972899729, "step": 4286 }, { "epoch": 0.7947719688542826, "grad_norm": 13.953125, "learning_rate": 9.20522803114572e-06, "loss": 2.7178, "mean_token_accuracy": 0.46816364195632115, "step": 4287 }, { "epoch": 0.7949573600296626, "grad_norm": 16.828125, "learning_rate": 9.205042639970338e-06, "loss": 2.9686, "mean_token_accuracy": 0.4101194522676508, "step": 4288 }, { "epoch": 0.7951427512050426, "grad_norm": 17.046875, "learning_rate": 9.204857248794959e-06, "loss": 2.6443, "mean_token_accuracy": 0.462800875273523, "step": 4289 }, { "epoch": 0.7953281423804227, "grad_norm": 6.62890625, "learning_rate": 9.204671857619578e-06, "loss": 2.4414, "mean_token_accuracy": 0.487135749822317, "step": 4290 }, { "epoch": 0.7955135335558028, "grad_norm": 10.7890625, "learning_rate": 9.204486466444198e-06, "loss": 2.5573, "mean_token_accuracy": 0.4753227821740941, "step": 4291 }, { "epoch": 0.7956989247311828, "grad_norm": 16.375, "learning_rate": 9.204301075268819e-06, "loss": 2.8874, "mean_token_accuracy": 0.4446066267698146, "step": 4292 }, { "epoch": 0.7958843159065628, "grad_norm": 9.859375, "learning_rate": 9.204115684093437e-06, "loss": 2.809, "mean_token_accuracy": 0.43368990849336253, "step": 4293 }, { "epoch": 0.7960697070819429, "grad_norm": 8.8359375, "learning_rate": 9.203930292918058e-06, "loss": 2.5457, "mean_token_accuracy": 0.47324630612537655, "step": 4294 }, { "epoch": 0.796255098257323, "grad_norm": 5.6640625, "learning_rate": 9.203744901742678e-06, "loss": 3.0402, "mean_token_accuracy": 0.428014440433213, "step": 4295 }, { "epoch": 0.796440489432703, "grad_norm": 10.5234375, "learning_rate": 9.203559510567299e-06, "loss": 2.7035, "mean_token_accuracy": 0.4578252544593369, "step": 4296 }, { "epoch": 0.796625880608083, "grad_norm": 14.6875, "learning_rate": 9.203374119391918e-06, "loss": 2.4191, "mean_token_accuracy": 0.4831178611773566, "step": 4297 }, { "epoch": 0.7968112717834631, "grad_norm": 12.6796875, "learning_rate": 9.203188728216538e-06, "loss": 2.6992, "mean_token_accuracy": 0.4486863711001642, "step": 4298 }, { "epoch": 0.7969966629588432, "grad_norm": 7.52734375, "learning_rate": 9.203003337041157e-06, "loss": 2.374, "mean_token_accuracy": 0.5143055555555556, "step": 4299 }, { "epoch": 0.7971820541342232, "grad_norm": 6.7578125, "learning_rate": 9.202817945865777e-06, "loss": 2.5986, "mean_token_accuracy": 0.47138174483106404, "step": 4300 }, { "epoch": 0.7973674453096032, "grad_norm": 8.171875, "learning_rate": 9.202632554690398e-06, "loss": 3.1837, "mean_token_accuracy": 0.41209372637944064, "step": 4301 }, { "epoch": 0.7975528364849833, "grad_norm": 7.3671875, "learning_rate": 9.202447163515017e-06, "loss": 2.8176, "mean_token_accuracy": 0.4446984541693882, "step": 4302 }, { "epoch": 0.7977382276603634, "grad_norm": 6.18359375, "learning_rate": 9.202261772339637e-06, "loss": 3.0559, "mean_token_accuracy": 0.4158186864014801, "step": 4303 }, { "epoch": 0.7979236188357434, "grad_norm": 5.4609375, "learning_rate": 9.202076381164258e-06, "loss": 2.8058, "mean_token_accuracy": 0.4567886079057206, "step": 4304 }, { "epoch": 0.7981090100111234, "grad_norm": 6.33203125, "learning_rate": 9.201890989988878e-06, "loss": 3.1767, "mean_token_accuracy": 0.40560920756713853, "step": 4305 }, { "epoch": 0.7982944011865035, "grad_norm": 10.4140625, "learning_rate": 9.201705598813497e-06, "loss": 3.1918, "mean_token_accuracy": 0.4176054071451561, "step": 4306 }, { "epoch": 0.7984797923618836, "grad_norm": 6.4296875, "learning_rate": 9.201520207638117e-06, "loss": 3.1628, "mean_token_accuracy": 0.4073435985786583, "step": 4307 }, { "epoch": 0.7986651835372637, "grad_norm": 5.53515625, "learning_rate": 9.201334816462736e-06, "loss": 2.556, "mean_token_accuracy": 0.4820005496015389, "step": 4308 }, { "epoch": 0.7988505747126436, "grad_norm": 7.2734375, "learning_rate": 9.201149425287357e-06, "loss": 2.3083, "mean_token_accuracy": 0.49704287826515525, "step": 4309 }, { "epoch": 0.7990359658880237, "grad_norm": 7.9609375, "learning_rate": 9.200964034111977e-06, "loss": 2.7381, "mean_token_accuracy": 0.4722558759389387, "step": 4310 }, { "epoch": 0.7992213570634038, "grad_norm": 7.6171875, "learning_rate": 9.200778642936598e-06, "loss": 3.7411, "mean_token_accuracy": 0.3593866866118175, "step": 4311 }, { "epoch": 0.7994067482387839, "grad_norm": 5.4609375, "learning_rate": 9.200593251761216e-06, "loss": 3.0992, "mean_token_accuracy": 0.409438202247191, "step": 4312 }, { "epoch": 0.7995921394141638, "grad_norm": 8.8203125, "learning_rate": 9.200407860585837e-06, "loss": 2.8268, "mean_token_accuracy": 0.463680387409201, "step": 4313 }, { "epoch": 0.7997775305895439, "grad_norm": 7.953125, "learning_rate": 9.200222469410457e-06, "loss": 2.8848, "mean_token_accuracy": 0.4409634993350081, "step": 4314 }, { "epoch": 0.799962921764924, "grad_norm": 6.1796875, "learning_rate": 9.200037078235076e-06, "loss": 2.6711, "mean_token_accuracy": 0.4618150438659571, "step": 4315 }, { "epoch": 0.8001483129403041, "grad_norm": 9.515625, "learning_rate": 9.199851687059697e-06, "loss": 2.9536, "mean_token_accuracy": 0.4299923488905891, "step": 4316 }, { "epoch": 0.800333704115684, "grad_norm": 8.3671875, "learning_rate": 9.199666295884316e-06, "loss": 2.9296, "mean_token_accuracy": 0.4384624567267518, "step": 4317 }, { "epoch": 0.8005190952910641, "grad_norm": 6.2578125, "learning_rate": 9.199480904708936e-06, "loss": 2.8893, "mean_token_accuracy": 0.4388138138138138, "step": 4318 }, { "epoch": 0.8007044864664442, "grad_norm": 5.95703125, "learning_rate": 9.199295513533557e-06, "loss": 2.6951, "mean_token_accuracy": 0.4737760059798181, "step": 4319 }, { "epoch": 0.8008898776418243, "grad_norm": 6.7109375, "learning_rate": 9.199110122358177e-06, "loss": 2.9045, "mean_token_accuracy": 0.442278686622578, "step": 4320 }, { "epoch": 0.8010752688172043, "grad_norm": 5.79296875, "learning_rate": 9.198924731182798e-06, "loss": 2.9112, "mean_token_accuracy": 0.4473835537665325, "step": 4321 }, { "epoch": 0.8012606599925843, "grad_norm": 7.9375, "learning_rate": 9.198739340007416e-06, "loss": 2.5082, "mean_token_accuracy": 0.46463897131552917, "step": 4322 }, { "epoch": 0.8014460511679644, "grad_norm": 6.73046875, "learning_rate": 9.198553948832037e-06, "loss": 2.7017, "mean_token_accuracy": 0.4599382786604183, "step": 4323 }, { "epoch": 0.8016314423433445, "grad_norm": 8.484375, "learning_rate": 9.198368557656656e-06, "loss": 2.9793, "mean_token_accuracy": 0.41475295755045233, "step": 4324 }, { "epoch": 0.8018168335187245, "grad_norm": 5.00390625, "learning_rate": 9.198183166481276e-06, "loss": 2.8176, "mean_token_accuracy": 0.4557622504537205, "step": 4325 }, { "epoch": 0.8020022246941045, "grad_norm": 6.57421875, "learning_rate": 9.197997775305897e-06, "loss": 2.7547, "mean_token_accuracy": 0.44747225647348954, "step": 4326 }, { "epoch": 0.8021876158694846, "grad_norm": 5.34375, "learning_rate": 9.197812384130517e-06, "loss": 2.8768, "mean_token_accuracy": 0.4258984258984259, "step": 4327 }, { "epoch": 0.8023730070448647, "grad_norm": 6.4765625, "learning_rate": 9.197626992955136e-06, "loss": 2.8811, "mean_token_accuracy": 0.46300512236767216, "step": 4328 }, { "epoch": 0.8025583982202447, "grad_norm": 6.2734375, "learning_rate": 9.197441601779756e-06, "loss": 2.2912, "mean_token_accuracy": 0.5259444109801222, "step": 4329 }, { "epoch": 0.8027437893956247, "grad_norm": 6.32421875, "learning_rate": 9.197256210604377e-06, "loss": 2.6276, "mean_token_accuracy": 0.48133225893096965, "step": 4330 }, { "epoch": 0.8029291805710048, "grad_norm": 5.3046875, "learning_rate": 9.197070819428996e-06, "loss": 3.2074, "mean_token_accuracy": 0.4130811078140455, "step": 4331 }, { "epoch": 0.8031145717463849, "grad_norm": 7.03125, "learning_rate": 9.196885428253616e-06, "loss": 2.9675, "mean_token_accuracy": 0.4250409612233752, "step": 4332 }, { "epoch": 0.803299962921765, "grad_norm": 9.2890625, "learning_rate": 9.196700037078235e-06, "loss": 2.8195, "mean_token_accuracy": 0.44739944975763135, "step": 4333 }, { "epoch": 0.8034853540971449, "grad_norm": 7.71484375, "learning_rate": 9.196514645902855e-06, "loss": 3.6465, "mean_token_accuracy": 0.3808119117853631, "step": 4334 }, { "epoch": 0.803670745272525, "grad_norm": 6.828125, "learning_rate": 9.196329254727476e-06, "loss": 2.5206, "mean_token_accuracy": 0.47836326170150134, "step": 4335 }, { "epoch": 0.8038561364479051, "grad_norm": 7.046875, "learning_rate": 9.196143863552096e-06, "loss": 2.4567, "mean_token_accuracy": 0.48352090032154343, "step": 4336 }, { "epoch": 0.8040415276232852, "grad_norm": 6.1171875, "learning_rate": 9.195958472376715e-06, "loss": 2.7378, "mean_token_accuracy": 0.43367760226812474, "step": 4337 }, { "epoch": 0.8042269187986651, "grad_norm": 6.66015625, "learning_rate": 9.195773081201336e-06, "loss": 2.8549, "mean_token_accuracy": 0.43636363636363634, "step": 4338 }, { "epoch": 0.8044123099740452, "grad_norm": 7.421875, "learning_rate": 9.195587690025956e-06, "loss": 3.0817, "mean_token_accuracy": 0.4194732641660016, "step": 4339 }, { "epoch": 0.8045977011494253, "grad_norm": 7.19921875, "learning_rate": 9.195402298850575e-06, "loss": 2.5639, "mean_token_accuracy": 0.492090395480226, "step": 4340 }, { "epoch": 0.8047830923248054, "grad_norm": 5.65234375, "learning_rate": 9.195216907675195e-06, "loss": 2.8191, "mean_token_accuracy": 0.45700613129953643, "step": 4341 }, { "epoch": 0.8049684835001854, "grad_norm": 8.09375, "learning_rate": 9.195031516499814e-06, "loss": 2.5304, "mean_token_accuracy": 0.47136815636072693, "step": 4342 }, { "epoch": 0.8051538746755654, "grad_norm": 6.2421875, "learning_rate": 9.194846125324435e-06, "loss": 3.0323, "mean_token_accuracy": 0.43833881995441887, "step": 4343 }, { "epoch": 0.8053392658509455, "grad_norm": 6.484375, "learning_rate": 9.194660734149055e-06, "loss": 2.9594, "mean_token_accuracy": 0.43760642179518366, "step": 4344 }, { "epoch": 0.8055246570263256, "grad_norm": 11.984375, "learning_rate": 9.194475342973676e-06, "loss": 3.015, "mean_token_accuracy": 0.4372923588039867, "step": 4345 }, { "epoch": 0.8057100482017056, "grad_norm": 7.43359375, "learning_rate": 9.194289951798295e-06, "loss": 3.3535, "mean_token_accuracy": 0.38111430813497255, "step": 4346 }, { "epoch": 0.8058954393770856, "grad_norm": 6.1796875, "learning_rate": 9.194104560622915e-06, "loss": 2.9831, "mean_token_accuracy": 0.4292652552926525, "step": 4347 }, { "epoch": 0.8060808305524657, "grad_norm": 7.0078125, "learning_rate": 9.193919169447536e-06, "loss": 2.1176, "mean_token_accuracy": 0.5565469293163383, "step": 4348 }, { "epoch": 0.8062662217278458, "grad_norm": 6.53515625, "learning_rate": 9.193733778272154e-06, "loss": 2.8928, "mean_token_accuracy": 0.42583960798374565, "step": 4349 }, { "epoch": 0.8064516129032258, "grad_norm": 9.703125, "learning_rate": 9.193548387096775e-06, "loss": 2.4723, "mean_token_accuracy": 0.5023058252427185, "step": 4350 }, { "epoch": 0.8066370040786058, "grad_norm": 9.09375, "learning_rate": 9.193362995921394e-06, "loss": 3.0332, "mean_token_accuracy": 0.4210726995424504, "step": 4351 }, { "epoch": 0.8068223952539859, "grad_norm": 5.953125, "learning_rate": 9.193177604746016e-06, "loss": 2.6328, "mean_token_accuracy": 0.47431526977783, "step": 4352 }, { "epoch": 0.807007786429366, "grad_norm": 8.0, "learning_rate": 9.192992213570635e-06, "loss": 2.8475, "mean_token_accuracy": 0.45464494163424124, "step": 4353 }, { "epoch": 0.807193177604746, "grad_norm": 5.60546875, "learning_rate": 9.192806822395255e-06, "loss": 2.7887, "mean_token_accuracy": 0.45560481317289425, "step": 4354 }, { "epoch": 0.807378568780126, "grad_norm": 6.41796875, "learning_rate": 9.192621431219874e-06, "loss": 2.923, "mean_token_accuracy": 0.42575241340147646, "step": 4355 }, { "epoch": 0.8075639599555061, "grad_norm": 6.3828125, "learning_rate": 9.192436040044494e-06, "loss": 3.0856, "mean_token_accuracy": 0.4237333691708104, "step": 4356 }, { "epoch": 0.8077493511308862, "grad_norm": 5.92578125, "learning_rate": 9.192250648869115e-06, "loss": 3.1427, "mean_token_accuracy": 0.394333936106088, "step": 4357 }, { "epoch": 0.8079347423062663, "grad_norm": 5.5625, "learning_rate": 9.192065257693734e-06, "loss": 2.9239, "mean_token_accuracy": 0.43770192442758815, "step": 4358 }, { "epoch": 0.8081201334816462, "grad_norm": 8.9765625, "learning_rate": 9.191879866518354e-06, "loss": 2.1578, "mean_token_accuracy": 0.527965889139704, "step": 4359 }, { "epoch": 0.8083055246570263, "grad_norm": 5.86328125, "learning_rate": 9.191694475342975e-06, "loss": 3.2135, "mean_token_accuracy": 0.41491976290299265, "step": 4360 }, { "epoch": 0.8084909158324064, "grad_norm": 6.76171875, "learning_rate": 9.191509084167595e-06, "loss": 3.4393, "mean_token_accuracy": 0.37162805458584575, "step": 4361 }, { "epoch": 0.8086763070077865, "grad_norm": 9.578125, "learning_rate": 9.191323692992214e-06, "loss": 2.3494, "mean_token_accuracy": 0.49133102160576153, "step": 4362 }, { "epoch": 0.8088616981831664, "grad_norm": 5.51171875, "learning_rate": 9.191138301816834e-06, "loss": 2.8712, "mean_token_accuracy": 0.4543939393939394, "step": 4363 }, { "epoch": 0.8090470893585465, "grad_norm": 5.99609375, "learning_rate": 9.190952910641455e-06, "loss": 2.7282, "mean_token_accuracy": 0.4700222057735011, "step": 4364 }, { "epoch": 0.8092324805339266, "grad_norm": 7.73046875, "learning_rate": 9.190767519466074e-06, "loss": 2.9636, "mean_token_accuracy": 0.41929848138691034, "step": 4365 }, { "epoch": 0.8094178717093067, "grad_norm": 7.4609375, "learning_rate": 9.190582128290694e-06, "loss": 2.5495, "mean_token_accuracy": 0.4725782727463753, "step": 4366 }, { "epoch": 0.8096032628846866, "grad_norm": 5.02734375, "learning_rate": 9.190396737115313e-06, "loss": 2.867, "mean_token_accuracy": 0.43786839889329965, "step": 4367 }, { "epoch": 0.8097886540600667, "grad_norm": 8.8046875, "learning_rate": 9.190211345939935e-06, "loss": 2.7449, "mean_token_accuracy": 0.4451419213973799, "step": 4368 }, { "epoch": 0.8099740452354468, "grad_norm": 6.109375, "learning_rate": 9.190025954764554e-06, "loss": 2.7499, "mean_token_accuracy": 0.4649339334978946, "step": 4369 }, { "epoch": 0.8101594364108269, "grad_norm": 5.03125, "learning_rate": 9.189840563589174e-06, "loss": 2.9489, "mean_token_accuracy": 0.4270986745213549, "step": 4370 }, { "epoch": 0.8103448275862069, "grad_norm": 11.8203125, "learning_rate": 9.189655172413793e-06, "loss": 2.5224, "mean_token_accuracy": 0.46034543531899896, "step": 4371 }, { "epoch": 0.8105302187615869, "grad_norm": 5.9140625, "learning_rate": 9.189469781238414e-06, "loss": 2.3851, "mean_token_accuracy": 0.5172600514417219, "step": 4372 }, { "epoch": 0.810715609936967, "grad_norm": 8.3359375, "learning_rate": 9.189284390063034e-06, "loss": 2.6574, "mean_token_accuracy": 0.45333491855902985, "step": 4373 }, { "epoch": 0.8109010011123471, "grad_norm": 6.796875, "learning_rate": 9.189098998887653e-06, "loss": 2.722, "mean_token_accuracy": 0.43742113112309283, "step": 4374 }, { "epoch": 0.811086392287727, "grad_norm": 5.18359375, "learning_rate": 9.188913607712274e-06, "loss": 3.0958, "mean_token_accuracy": 0.41936231884057973, "step": 4375 }, { "epoch": 0.8112717834631071, "grad_norm": 5.671875, "learning_rate": 9.188728216536894e-06, "loss": 2.6637, "mean_token_accuracy": 0.4841623360554318, "step": 4376 }, { "epoch": 0.8114571746384872, "grad_norm": 9.1328125, "learning_rate": 9.188542825361515e-06, "loss": 2.7581, "mean_token_accuracy": 0.46237996839674245, "step": 4377 }, { "epoch": 0.8116425658138673, "grad_norm": 8.4453125, "learning_rate": 9.188357434186133e-06, "loss": 2.4451, "mean_token_accuracy": 0.4785202863961814, "step": 4378 }, { "epoch": 0.8118279569892473, "grad_norm": 4.91015625, "learning_rate": 9.188172043010754e-06, "loss": 3.722, "mean_token_accuracy": 0.3836290784201488, "step": 4379 }, { "epoch": 0.8120133481646273, "grad_norm": 10.6015625, "learning_rate": 9.187986651835373e-06, "loss": 2.4961, "mean_token_accuracy": 0.46847190439867786, "step": 4380 }, { "epoch": 0.8121987393400074, "grad_norm": 7.9609375, "learning_rate": 9.187801260659993e-06, "loss": 2.7773, "mean_token_accuracy": 0.45524908528004504, "step": 4381 }, { "epoch": 0.8123841305153875, "grad_norm": 7.86328125, "learning_rate": 9.187615869484614e-06, "loss": 2.825, "mean_token_accuracy": 0.45626389918458116, "step": 4382 }, { "epoch": 0.8125695216907676, "grad_norm": 11.4453125, "learning_rate": 9.187430478309232e-06, "loss": 2.8006, "mean_token_accuracy": 0.44647184604419105, "step": 4383 }, { "epoch": 0.8127549128661475, "grad_norm": 8.8671875, "learning_rate": 9.187245087133853e-06, "loss": 2.1886, "mean_token_accuracy": 0.5300118114463653, "step": 4384 }, { "epoch": 0.8129403040415276, "grad_norm": 6.63671875, "learning_rate": 9.187059695958473e-06, "loss": 2.8098, "mean_token_accuracy": 0.46061902365374935, "step": 4385 }, { "epoch": 0.8131256952169077, "grad_norm": 6.078125, "learning_rate": 9.186874304783094e-06, "loss": 2.8019, "mean_token_accuracy": 0.45435909803448693, "step": 4386 }, { "epoch": 0.8133110863922878, "grad_norm": 9.0078125, "learning_rate": 9.186688913607713e-06, "loss": 3.0977, "mean_token_accuracy": 0.42460186710598574, "step": 4387 }, { "epoch": 0.8134964775676677, "grad_norm": 7.4921875, "learning_rate": 9.186503522432333e-06, "loss": 2.7664, "mean_token_accuracy": 0.45870144439770016, "step": 4388 }, { "epoch": 0.8136818687430478, "grad_norm": 5.77734375, "learning_rate": 9.186318131256952e-06, "loss": 2.8988, "mean_token_accuracy": 0.4316022799240025, "step": 4389 }, { "epoch": 0.8138672599184279, "grad_norm": 8.21875, "learning_rate": 9.186132740081572e-06, "loss": 2.9953, "mean_token_accuracy": 0.4364118092354277, "step": 4390 }, { "epoch": 0.814052651093808, "grad_norm": 10.1328125, "learning_rate": 9.185947348906193e-06, "loss": 2.5499, "mean_token_accuracy": 0.45371953826421546, "step": 4391 }, { "epoch": 0.814238042269188, "grad_norm": 5.546875, "learning_rate": 9.185761957730813e-06, "loss": 2.7855, "mean_token_accuracy": 0.44581519109820994, "step": 4392 }, { "epoch": 0.814423433444568, "grad_norm": 5.88671875, "learning_rate": 9.185576566555432e-06, "loss": 2.57, "mean_token_accuracy": 0.4981707954426675, "step": 4393 }, { "epoch": 0.8146088246199481, "grad_norm": 8.1171875, "learning_rate": 9.185391175380053e-06, "loss": 2.9629, "mean_token_accuracy": 0.42522106881968474, "step": 4394 }, { "epoch": 0.8147942157953282, "grad_norm": 8.7421875, "learning_rate": 9.185205784204673e-06, "loss": 2.6271, "mean_token_accuracy": 0.46552150271873455, "step": 4395 }, { "epoch": 0.8149796069707081, "grad_norm": 6.4375, "learning_rate": 9.185020393029292e-06, "loss": 2.3936, "mean_token_accuracy": 0.4743816254416961, "step": 4396 }, { "epoch": 0.8151649981460882, "grad_norm": 11.171875, "learning_rate": 9.184835001853912e-06, "loss": 2.6163, "mean_token_accuracy": 0.4695697796432319, "step": 4397 }, { "epoch": 0.8153503893214683, "grad_norm": 8.25, "learning_rate": 9.184649610678531e-06, "loss": 2.9028, "mean_token_accuracy": 0.44392655367231637, "step": 4398 }, { "epoch": 0.8155357804968484, "grad_norm": 9.265625, "learning_rate": 9.184464219503152e-06, "loss": 2.586, "mean_token_accuracy": 0.46875800256081945, "step": 4399 }, { "epoch": 0.8157211716722284, "grad_norm": 6.31640625, "learning_rate": 9.184278828327772e-06, "loss": 2.8025, "mean_token_accuracy": 0.46569129480614485, "step": 4400 }, { "epoch": 0.8159065628476084, "grad_norm": 5.33203125, "learning_rate": 9.184093437152393e-06, "loss": 2.9963, "mean_token_accuracy": 0.4361865709892363, "step": 4401 }, { "epoch": 0.8160919540229885, "grad_norm": 8.21875, "learning_rate": 9.183908045977013e-06, "loss": 2.8542, "mean_token_accuracy": 0.4517895809451025, "step": 4402 }, { "epoch": 0.8162773451983686, "grad_norm": 5.1328125, "learning_rate": 9.183722654801632e-06, "loss": 2.761, "mean_token_accuracy": 0.47304810248972684, "step": 4403 }, { "epoch": 0.8164627363737486, "grad_norm": 5.21484375, "learning_rate": 9.183537263626253e-06, "loss": 3.0951, "mean_token_accuracy": 0.4181457262961233, "step": 4404 }, { "epoch": 0.8166481275491286, "grad_norm": 7.67578125, "learning_rate": 9.183351872450871e-06, "loss": 2.9994, "mean_token_accuracy": 0.43290482634190347, "step": 4405 }, { "epoch": 0.8168335187245087, "grad_norm": 8.84375, "learning_rate": 9.183166481275492e-06, "loss": 2.68, "mean_token_accuracy": 0.47159940209267565, "step": 4406 }, { "epoch": 0.8170189098998888, "grad_norm": 7.48046875, "learning_rate": 9.182981090100112e-06, "loss": 2.6111, "mean_token_accuracy": 0.4858509366281387, "step": 4407 }, { "epoch": 0.8172043010752689, "grad_norm": 6.8125, "learning_rate": 9.182795698924733e-06, "loss": 2.5157, "mean_token_accuracy": 0.4702430846605197, "step": 4408 }, { "epoch": 0.8173896922506488, "grad_norm": 7.22265625, "learning_rate": 9.182610307749352e-06, "loss": 3.9058, "mean_token_accuracy": 0.36253776435045315, "step": 4409 }, { "epoch": 0.8175750834260289, "grad_norm": 7.32421875, "learning_rate": 9.182424916573972e-06, "loss": 2.9919, "mean_token_accuracy": 0.4170714781401804, "step": 4410 }, { "epoch": 0.817760474601409, "grad_norm": 7.578125, "learning_rate": 9.182239525398593e-06, "loss": 2.5637, "mean_token_accuracy": 0.4683528836754643, "step": 4411 }, { "epoch": 0.8179458657767891, "grad_norm": 6.359375, "learning_rate": 9.182054134223211e-06, "loss": 2.3226, "mean_token_accuracy": 0.49394166043380705, "step": 4412 }, { "epoch": 0.818131256952169, "grad_norm": 9.515625, "learning_rate": 9.181868743047832e-06, "loss": 3.0967, "mean_token_accuracy": 0.42943155657871435, "step": 4413 }, { "epoch": 0.8183166481275491, "grad_norm": 10.28125, "learning_rate": 9.18168335187245e-06, "loss": 2.5182, "mean_token_accuracy": 0.49058516801853996, "step": 4414 }, { "epoch": 0.8185020393029292, "grad_norm": 7.3671875, "learning_rate": 9.181497960697071e-06, "loss": 3.2666, "mean_token_accuracy": 0.4092020129403307, "step": 4415 }, { "epoch": 0.8186874304783093, "grad_norm": 7.53125, "learning_rate": 9.181312569521692e-06, "loss": 2.5404, "mean_token_accuracy": 0.4786148081147435, "step": 4416 }, { "epoch": 0.8188728216536892, "grad_norm": 7.44140625, "learning_rate": 9.181127178346312e-06, "loss": 2.816, "mean_token_accuracy": 0.4342594889605557, "step": 4417 }, { "epoch": 0.8190582128290693, "grad_norm": 9.546875, "learning_rate": 9.180941787170931e-06, "loss": 2.8622, "mean_token_accuracy": 0.43490304709141275, "step": 4418 }, { "epoch": 0.8192436040044494, "grad_norm": 5.7421875, "learning_rate": 9.180756395995551e-06, "loss": 3.0761, "mean_token_accuracy": 0.43468502169684275, "step": 4419 }, { "epoch": 0.8194289951798295, "grad_norm": 5.40234375, "learning_rate": 9.180571004820172e-06, "loss": 2.7504, "mean_token_accuracy": 0.4477154247163447, "step": 4420 }, { "epoch": 0.8196143863552094, "grad_norm": 7.3203125, "learning_rate": 9.18038561364479e-06, "loss": 2.4056, "mean_token_accuracy": 0.48785185185185187, "step": 4421 }, { "epoch": 0.8197997775305895, "grad_norm": 8.3359375, "learning_rate": 9.180200222469411e-06, "loss": 3.0176, "mean_token_accuracy": 0.42487629329734594, "step": 4422 }, { "epoch": 0.8199851687059696, "grad_norm": 6.42578125, "learning_rate": 9.18001483129403e-06, "loss": 2.2026, "mean_token_accuracy": 0.5066332916145182, "step": 4423 }, { "epoch": 0.8201705598813497, "grad_norm": 6.1875, "learning_rate": 9.179829440118652e-06, "loss": 2.4433, "mean_token_accuracy": 0.4946714031971581, "step": 4424 }, { "epoch": 0.8203559510567296, "grad_norm": 6.58203125, "learning_rate": 9.179644048943271e-06, "loss": 3.5816, "mean_token_accuracy": 0.39780658025922233, "step": 4425 }, { "epoch": 0.8205413422321097, "grad_norm": 5.0703125, "learning_rate": 9.179458657767891e-06, "loss": 2.8443, "mean_token_accuracy": 0.45802161263507896, "step": 4426 }, { "epoch": 0.8207267334074898, "grad_norm": 7.21875, "learning_rate": 9.17927326659251e-06, "loss": 2.6387, "mean_token_accuracy": 0.4645669291338583, "step": 4427 }, { "epoch": 0.8209121245828699, "grad_norm": 6.21484375, "learning_rate": 9.17908787541713e-06, "loss": 2.6687, "mean_token_accuracy": 0.4563298843578819, "step": 4428 }, { "epoch": 0.8210975157582499, "grad_norm": 4.69921875, "learning_rate": 9.178902484241751e-06, "loss": 3.1248, "mean_token_accuracy": 0.4328288707799767, "step": 4429 }, { "epoch": 0.8212829069336299, "grad_norm": 12.9921875, "learning_rate": 9.17871709306637e-06, "loss": 2.8007, "mean_token_accuracy": 0.4327208061647896, "step": 4430 }, { "epoch": 0.82146829810901, "grad_norm": 7.8984375, "learning_rate": 9.17853170189099e-06, "loss": 2.497, "mean_token_accuracy": 0.4802651401024405, "step": 4431 }, { "epoch": 0.8216536892843901, "grad_norm": 6.5390625, "learning_rate": 9.178346310715611e-06, "loss": 2.8033, "mean_token_accuracy": 0.4549240897487572, "step": 4432 }, { "epoch": 0.8218390804597702, "grad_norm": 6.30859375, "learning_rate": 9.178160919540232e-06, "loss": 3.034, "mean_token_accuracy": 0.43412010755900804, "step": 4433 }, { "epoch": 0.8220244716351501, "grad_norm": 6.6171875, "learning_rate": 9.17797552836485e-06, "loss": 3.1262, "mean_token_accuracy": 0.429, "step": 4434 }, { "epoch": 0.8222098628105302, "grad_norm": 5.41796875, "learning_rate": 9.17779013718947e-06, "loss": 2.863, "mean_token_accuracy": 0.43613707165109034, "step": 4435 }, { "epoch": 0.8223952539859103, "grad_norm": 7.05078125, "learning_rate": 9.17760474601409e-06, "loss": 3.2017, "mean_token_accuracy": 0.3989723189126471, "step": 4436 }, { "epoch": 0.8225806451612904, "grad_norm": 6.02734375, "learning_rate": 9.17741935483871e-06, "loss": 2.1676, "mean_token_accuracy": 0.5301844235106806, "step": 4437 }, { "epoch": 0.8227660363366703, "grad_norm": 5.06640625, "learning_rate": 9.17723396366333e-06, "loss": 2.6664, "mean_token_accuracy": 0.45574341123818995, "step": 4438 }, { "epoch": 0.8229514275120504, "grad_norm": 6.3125, "learning_rate": 9.17704857248795e-06, "loss": 3.1937, "mean_token_accuracy": 0.40545004128819156, "step": 4439 }, { "epoch": 0.8231368186874305, "grad_norm": 6.31640625, "learning_rate": 9.176863181312572e-06, "loss": 3.2905, "mean_token_accuracy": 0.40193732193732196, "step": 4440 }, { "epoch": 0.8233222098628106, "grad_norm": 5.796875, "learning_rate": 9.17667779013719e-06, "loss": 2.8553, "mean_token_accuracy": 0.43833652007648183, "step": 4441 }, { "epoch": 0.8235076010381905, "grad_norm": 7.5234375, "learning_rate": 9.176492398961811e-06, "loss": 2.7666, "mean_token_accuracy": 0.44815032295948326, "step": 4442 }, { "epoch": 0.8236929922135706, "grad_norm": 5.4765625, "learning_rate": 9.17630700778643e-06, "loss": 2.8879, "mean_token_accuracy": 0.4640972136982939, "step": 4443 }, { "epoch": 0.8238783833889507, "grad_norm": 5.08203125, "learning_rate": 9.17612161661105e-06, "loss": 2.7549, "mean_token_accuracy": 0.4592476489028213, "step": 4444 }, { "epoch": 0.8240637745643308, "grad_norm": 6.1796875, "learning_rate": 9.17593622543567e-06, "loss": 2.9661, "mean_token_accuracy": 0.4505977067577458, "step": 4445 }, { "epoch": 0.8242491657397107, "grad_norm": 6.21875, "learning_rate": 9.17575083426029e-06, "loss": 2.681, "mean_token_accuracy": 0.45746164574616455, "step": 4446 }, { "epoch": 0.8244345569150908, "grad_norm": 6.2421875, "learning_rate": 9.17556544308491e-06, "loss": 2.966, "mean_token_accuracy": 0.42812330989724173, "step": 4447 }, { "epoch": 0.8246199480904709, "grad_norm": 7.2734375, "learning_rate": 9.17538005190953e-06, "loss": 3.4448, "mean_token_accuracy": 0.4073864280049847, "step": 4448 }, { "epoch": 0.824805339265851, "grad_norm": 5.31640625, "learning_rate": 9.175194660734151e-06, "loss": 3.4096, "mean_token_accuracy": 0.39215435727063636, "step": 4449 }, { "epoch": 0.824990730441231, "grad_norm": 6.15625, "learning_rate": 9.17500926955877e-06, "loss": 3.16, "mean_token_accuracy": 0.41272123893805307, "step": 4450 }, { "epoch": 0.825176121616611, "grad_norm": 6.6328125, "learning_rate": 9.17482387838339e-06, "loss": 2.8029, "mean_token_accuracy": 0.43438287153652394, "step": 4451 }, { "epoch": 0.8253615127919911, "grad_norm": 9.484375, "learning_rate": 9.174638487208009e-06, "loss": 1.838, "mean_token_accuracy": 0.5679012345679012, "step": 4452 }, { "epoch": 0.8255469039673712, "grad_norm": 6.46484375, "learning_rate": 9.17445309603263e-06, "loss": 2.9247, "mean_token_accuracy": 0.4426871516794698, "step": 4453 }, { "epoch": 0.8257322951427513, "grad_norm": 6.0625, "learning_rate": 9.17426770485725e-06, "loss": 3.4541, "mean_token_accuracy": 0.4095221958658082, "step": 4454 }, { "epoch": 0.8259176863181312, "grad_norm": 5.27734375, "learning_rate": 9.174082313681869e-06, "loss": 2.9604, "mean_token_accuracy": 0.4255979314802844, "step": 4455 }, { "epoch": 0.8261030774935113, "grad_norm": 7.05859375, "learning_rate": 9.17389692250649e-06, "loss": 2.9849, "mean_token_accuracy": 0.4316702819956616, "step": 4456 }, { "epoch": 0.8262884686688914, "grad_norm": 6.07421875, "learning_rate": 9.17371153133111e-06, "loss": 3.1013, "mean_token_accuracy": 0.41817192600652886, "step": 4457 }, { "epoch": 0.8264738598442715, "grad_norm": 5.66796875, "learning_rate": 9.17352614015573e-06, "loss": 2.755, "mean_token_accuracy": 0.46497797356828197, "step": 4458 }, { "epoch": 0.8266592510196514, "grad_norm": 6.0546875, "learning_rate": 9.173340748980349e-06, "loss": 2.4322, "mean_token_accuracy": 0.48446383710167923, "step": 4459 }, { "epoch": 0.8268446421950315, "grad_norm": 6.18359375, "learning_rate": 9.17315535780497e-06, "loss": 2.5635, "mean_token_accuracy": 0.48299968223705114, "step": 4460 }, { "epoch": 0.8270300333704116, "grad_norm": 7.15625, "learning_rate": 9.172969966629588e-06, "loss": 3.3559, "mean_token_accuracy": 0.4238287309959665, "step": 4461 }, { "epoch": 0.8272154245457917, "grad_norm": 6.93359375, "learning_rate": 9.172784575454209e-06, "loss": 2.8607, "mean_token_accuracy": 0.4582779991146525, "step": 4462 }, { "epoch": 0.8274008157211716, "grad_norm": 8.609375, "learning_rate": 9.17259918427883e-06, "loss": 2.6722, "mean_token_accuracy": 0.44519621109607577, "step": 4463 }, { "epoch": 0.8275862068965517, "grad_norm": 6.734375, "learning_rate": 9.172413793103448e-06, "loss": 2.7328, "mean_token_accuracy": 0.47839933536416507, "step": 4464 }, { "epoch": 0.8277715980719318, "grad_norm": 9.1796875, "learning_rate": 9.172228401928069e-06, "loss": 3.0857, "mean_token_accuracy": 0.40721142013290673, "step": 4465 }, { "epoch": 0.8279569892473119, "grad_norm": 6.1015625, "learning_rate": 9.172043010752689e-06, "loss": 3.448, "mean_token_accuracy": 0.3917889857631608, "step": 4466 }, { "epoch": 0.8281423804226918, "grad_norm": 6.04296875, "learning_rate": 9.17185761957731e-06, "loss": 3.0551, "mean_token_accuracy": 0.4189074740630955, "step": 4467 }, { "epoch": 0.8283277715980719, "grad_norm": 6.32421875, "learning_rate": 9.171672228401928e-06, "loss": 3.0626, "mean_token_accuracy": 0.43795171459326465, "step": 4468 }, { "epoch": 0.828513162773452, "grad_norm": 5.796875, "learning_rate": 9.171486837226549e-06, "loss": 2.9824, "mean_token_accuracy": 0.44318351500671943, "step": 4469 }, { "epoch": 0.8286985539488321, "grad_norm": 7.21484375, "learning_rate": 9.171301446051168e-06, "loss": 2.2097, "mean_token_accuracy": 0.5083261379055137, "step": 4470 }, { "epoch": 0.828883945124212, "grad_norm": 7.8046875, "learning_rate": 9.171116054875788e-06, "loss": 2.9279, "mean_token_accuracy": 0.432826281477694, "step": 4471 }, { "epoch": 0.8290693362995921, "grad_norm": 8.625, "learning_rate": 9.170930663700409e-06, "loss": 2.7359, "mean_token_accuracy": 0.4522503998172264, "step": 4472 }, { "epoch": 0.8292547274749722, "grad_norm": 7.01953125, "learning_rate": 9.17074527252503e-06, "loss": 2.6236, "mean_token_accuracy": 0.4603505843071786, "step": 4473 }, { "epoch": 0.8294401186503523, "grad_norm": 11.4765625, "learning_rate": 9.170559881349648e-06, "loss": 2.6719, "mean_token_accuracy": 0.4640632122777693, "step": 4474 }, { "epoch": 0.8296255098257322, "grad_norm": 6.19140625, "learning_rate": 9.170374490174268e-06, "loss": 3.0002, "mean_token_accuracy": 0.42925474463817315, "step": 4475 }, { "epoch": 0.8298109010011123, "grad_norm": 4.96484375, "learning_rate": 9.170189098998889e-06, "loss": 3.2094, "mean_token_accuracy": 0.42144004282655245, "step": 4476 }, { "epoch": 0.8299962921764924, "grad_norm": 5.7421875, "learning_rate": 9.170003707823508e-06, "loss": 2.2935, "mean_token_accuracy": 0.5074814711229199, "step": 4477 }, { "epoch": 0.8301816833518725, "grad_norm": 6.69140625, "learning_rate": 9.169818316648128e-06, "loss": 2.9473, "mean_token_accuracy": 0.43269918466599916, "step": 4478 }, { "epoch": 0.8303670745272526, "grad_norm": 6.27734375, "learning_rate": 9.169632925472747e-06, "loss": 2.7936, "mean_token_accuracy": 0.4447274579724911, "step": 4479 }, { "epoch": 0.8305524657026325, "grad_norm": 5.90625, "learning_rate": 9.169447534297368e-06, "loss": 2.9834, "mean_token_accuracy": 0.4481435813125086, "step": 4480 }, { "epoch": 0.8307378568780126, "grad_norm": 5.05078125, "learning_rate": 9.169262143121988e-06, "loss": 2.7337, "mean_token_accuracy": 0.4577012563983248, "step": 4481 }, { "epoch": 0.8309232480533927, "grad_norm": 4.71875, "learning_rate": 9.169076751946609e-06, "loss": 3.0813, "mean_token_accuracy": 0.4241962305986696, "step": 4482 }, { "epoch": 0.8311086392287728, "grad_norm": 7.10546875, "learning_rate": 9.168891360771229e-06, "loss": 2.8972, "mean_token_accuracy": 0.4438296229571484, "step": 4483 }, { "epoch": 0.8312940304041527, "grad_norm": 7.5625, "learning_rate": 9.168705969595848e-06, "loss": 2.7619, "mean_token_accuracy": 0.4556809024979855, "step": 4484 }, { "epoch": 0.8314794215795328, "grad_norm": 5.2734375, "learning_rate": 9.168520578420468e-06, "loss": 3.0797, "mean_token_accuracy": 0.4078762306610408, "step": 4485 }, { "epoch": 0.8316648127549129, "grad_norm": 6.66796875, "learning_rate": 9.168335187245087e-06, "loss": 3.0787, "mean_token_accuracy": 0.41968911917098445, "step": 4486 }, { "epoch": 0.831850203930293, "grad_norm": 5.67578125, "learning_rate": 9.168149796069708e-06, "loss": 2.9729, "mean_token_accuracy": 0.43402545210984594, "step": 4487 }, { "epoch": 0.8320355951056729, "grad_norm": 6.03515625, "learning_rate": 9.167964404894328e-06, "loss": 2.4158, "mean_token_accuracy": 0.48830011142751023, "step": 4488 }, { "epoch": 0.832220986281053, "grad_norm": 5.890625, "learning_rate": 9.167779013718949e-06, "loss": 2.8134, "mean_token_accuracy": 0.43807242496829646, "step": 4489 }, { "epoch": 0.8324063774564331, "grad_norm": 6.03515625, "learning_rate": 9.167593622543567e-06, "loss": 2.6587, "mean_token_accuracy": 0.4577040990121494, "step": 4490 }, { "epoch": 0.8325917686318132, "grad_norm": 6.01171875, "learning_rate": 9.167408231368188e-06, "loss": 3.3818, "mean_token_accuracy": 0.3963035903650962, "step": 4491 }, { "epoch": 0.8327771598071931, "grad_norm": 6.265625, "learning_rate": 9.167222840192808e-06, "loss": 3.0869, "mean_token_accuracy": 0.4224632391281677, "step": 4492 }, { "epoch": 0.8329625509825732, "grad_norm": 5.95703125, "learning_rate": 9.167037449017427e-06, "loss": 3.0958, "mean_token_accuracy": 0.4208832238959701, "step": 4493 }, { "epoch": 0.8331479421579533, "grad_norm": 5.125, "learning_rate": 9.166852057842048e-06, "loss": 3.4583, "mean_token_accuracy": 0.39850357839947953, "step": 4494 }, { "epoch": 0.8333333333333334, "grad_norm": 6.74609375, "learning_rate": 9.166666666666666e-06, "loss": 2.7788, "mean_token_accuracy": 0.44272873934086193, "step": 4495 }, { "epoch": 0.8335187245087133, "grad_norm": 9.3359375, "learning_rate": 9.166481275491287e-06, "loss": 3.1218, "mean_token_accuracy": 0.44252463230044264, "step": 4496 }, { "epoch": 0.8337041156840934, "grad_norm": 7.5625, "learning_rate": 9.166295884315907e-06, "loss": 2.8186, "mean_token_accuracy": 0.44422015879424037, "step": 4497 }, { "epoch": 0.8338895068594735, "grad_norm": 5.2421875, "learning_rate": 9.166110493140528e-06, "loss": 3.4462, "mean_token_accuracy": 0.4011142061281337, "step": 4498 }, { "epoch": 0.8340748980348536, "grad_norm": 10.8125, "learning_rate": 9.165925101965147e-06, "loss": 3.1669, "mean_token_accuracy": 0.4164394234514998, "step": 4499 }, { "epoch": 0.8342602892102335, "grad_norm": 9.0625, "learning_rate": 9.165739710789767e-06, "loss": 3.1177, "mean_token_accuracy": 0.4205810828107001, "step": 4500 }, { "epoch": 0.8344456803856136, "grad_norm": 6.87109375, "learning_rate": 9.165554319614388e-06, "loss": 2.9973, "mean_token_accuracy": 0.4370713305898491, "step": 4501 }, { "epoch": 0.8346310715609937, "grad_norm": 5.80078125, "learning_rate": 9.165368928439006e-06, "loss": 2.7574, "mean_token_accuracy": 0.4560009487666034, "step": 4502 }, { "epoch": 0.8348164627363738, "grad_norm": 5.7734375, "learning_rate": 9.165183537263627e-06, "loss": 2.842, "mean_token_accuracy": 0.4471161657189277, "step": 4503 }, { "epoch": 0.8350018539117539, "grad_norm": 6.53515625, "learning_rate": 9.164998146088246e-06, "loss": 2.5804, "mean_token_accuracy": 0.4946051986267778, "step": 4504 }, { "epoch": 0.8351872450871338, "grad_norm": 6.55859375, "learning_rate": 9.164812754912868e-06, "loss": 2.6598, "mean_token_accuracy": 0.46208024147905924, "step": 4505 }, { "epoch": 0.8353726362625139, "grad_norm": 5.85546875, "learning_rate": 9.164627363737487e-06, "loss": 3.0803, "mean_token_accuracy": 0.4314106395696354, "step": 4506 }, { "epoch": 0.835558027437894, "grad_norm": 8.0, "learning_rate": 9.164441972562107e-06, "loss": 2.7174, "mean_token_accuracy": 0.4535132466677637, "step": 4507 }, { "epoch": 0.8357434186132741, "grad_norm": 6.66796875, "learning_rate": 9.164256581386726e-06, "loss": 3.0228, "mean_token_accuracy": 0.42738359201773835, "step": 4508 }, { "epoch": 0.835928809788654, "grad_norm": 5.4921875, "learning_rate": 9.164071190211347e-06, "loss": 2.278, "mean_token_accuracy": 0.5260058881256133, "step": 4509 }, { "epoch": 0.8361142009640341, "grad_norm": 7.8828125, "learning_rate": 9.163885799035967e-06, "loss": 2.9531, "mean_token_accuracy": 0.42908555537379356, "step": 4510 }, { "epoch": 0.8362995921394142, "grad_norm": 5.7890625, "learning_rate": 9.163700407860586e-06, "loss": 3.0234, "mean_token_accuracy": 0.4222619047619048, "step": 4511 }, { "epoch": 0.8364849833147943, "grad_norm": 8.5234375, "learning_rate": 9.163515016685206e-06, "loss": 2.7892, "mean_token_accuracy": 0.43832629139975465, "step": 4512 }, { "epoch": 0.8366703744901742, "grad_norm": 6.73046875, "learning_rate": 9.163329625509827e-06, "loss": 3.4254, "mean_token_accuracy": 0.3868283739952052, "step": 4513 }, { "epoch": 0.8368557656655543, "grad_norm": 7.078125, "learning_rate": 9.163144234334447e-06, "loss": 2.6741, "mean_token_accuracy": 0.44507827009816925, "step": 4514 }, { "epoch": 0.8370411568409344, "grad_norm": 5.953125, "learning_rate": 9.162958843159066e-06, "loss": 2.9697, "mean_token_accuracy": 0.43411778760004777, "step": 4515 }, { "epoch": 0.8372265480163145, "grad_norm": 9.34375, "learning_rate": 9.162773451983687e-06, "loss": 2.7934, "mean_token_accuracy": 0.43499617945639124, "step": 4516 }, { "epoch": 0.8374119391916944, "grad_norm": 6.8359375, "learning_rate": 9.162588060808305e-06, "loss": 2.5805, "mean_token_accuracy": 0.4850143017037682, "step": 4517 }, { "epoch": 0.8375973303670745, "grad_norm": 6.0859375, "learning_rate": 9.162402669632926e-06, "loss": 2.9658, "mean_token_accuracy": 0.4365394149727318, "step": 4518 }, { "epoch": 0.8377827215424546, "grad_norm": 6.00390625, "learning_rate": 9.162217278457546e-06, "loss": 2.9585, "mean_token_accuracy": 0.44438246122949904, "step": 4519 }, { "epoch": 0.8379681127178347, "grad_norm": 6.390625, "learning_rate": 9.162031887282165e-06, "loss": 2.5613, "mean_token_accuracy": 0.46315653632726805, "step": 4520 }, { "epoch": 0.8381535038932146, "grad_norm": 5.28125, "learning_rate": 9.161846496106787e-06, "loss": 2.7581, "mean_token_accuracy": 0.44249965814303294, "step": 4521 }, { "epoch": 0.8383388950685947, "grad_norm": 6.85546875, "learning_rate": 9.161661104931406e-06, "loss": 2.8028, "mean_token_accuracy": 0.4475448168355417, "step": 4522 }, { "epoch": 0.8385242862439748, "grad_norm": 5.875, "learning_rate": 9.161475713756027e-06, "loss": 2.0241, "mean_token_accuracy": 0.5649028801071668, "step": 4523 }, { "epoch": 0.8387096774193549, "grad_norm": 7.078125, "learning_rate": 9.161290322580645e-06, "loss": 2.8361, "mean_token_accuracy": 0.4526646588974728, "step": 4524 }, { "epoch": 0.8388950685947348, "grad_norm": 5.59375, "learning_rate": 9.161104931405266e-06, "loss": 2.6212, "mean_token_accuracy": 0.475275624931776, "step": 4525 }, { "epoch": 0.8390804597701149, "grad_norm": 6.7890625, "learning_rate": 9.160919540229886e-06, "loss": 2.6739, "mean_token_accuracy": 0.44297445255474455, "step": 4526 }, { "epoch": 0.839265850945495, "grad_norm": 9.3515625, "learning_rate": 9.160734149054505e-06, "loss": 2.8221, "mean_token_accuracy": 0.4253541076487252, "step": 4527 }, { "epoch": 0.8394512421208751, "grad_norm": 6.51171875, "learning_rate": 9.160548757879126e-06, "loss": 2.92, "mean_token_accuracy": 0.44187898089171973, "step": 4528 }, { "epoch": 0.8396366332962552, "grad_norm": 7.48828125, "learning_rate": 9.160363366703746e-06, "loss": 2.6921, "mean_token_accuracy": 0.4561128526645768, "step": 4529 }, { "epoch": 0.8398220244716351, "grad_norm": 7.375, "learning_rate": 9.160177975528367e-06, "loss": 2.5289, "mean_token_accuracy": 0.4934302488118535, "step": 4530 }, { "epoch": 0.8400074156470152, "grad_norm": 5.5625, "learning_rate": 9.159992584352985e-06, "loss": 3.1383, "mean_token_accuracy": 0.4193734828343544, "step": 4531 }, { "epoch": 0.8401928068223953, "grad_norm": 6.4609375, "learning_rate": 9.159807193177606e-06, "loss": 2.9498, "mean_token_accuracy": 0.4337333153566132, "step": 4532 }, { "epoch": 0.8403781979977754, "grad_norm": 5.78125, "learning_rate": 9.159621802002225e-06, "loss": 2.8378, "mean_token_accuracy": 0.4612153825961412, "step": 4533 }, { "epoch": 0.8405635891731553, "grad_norm": 6.8828125, "learning_rate": 9.159436410826845e-06, "loss": 2.9113, "mean_token_accuracy": 0.45061196345457677, "step": 4534 }, { "epoch": 0.8407489803485354, "grad_norm": 6.4921875, "learning_rate": 9.159251019651466e-06, "loss": 2.7866, "mean_token_accuracy": 0.47717231222385864, "step": 4535 }, { "epoch": 0.8409343715239155, "grad_norm": 5.6484375, "learning_rate": 9.159065628476085e-06, "loss": 2.7865, "mean_token_accuracy": 0.46956619213092704, "step": 4536 }, { "epoch": 0.8411197626992956, "grad_norm": 5.12890625, "learning_rate": 9.158880237300705e-06, "loss": 2.8227, "mean_token_accuracy": 0.44983105626560893, "step": 4537 }, { "epoch": 0.8413051538746755, "grad_norm": 7.85546875, "learning_rate": 9.158694846125326e-06, "loss": 3.0909, "mean_token_accuracy": 0.4114566284779051, "step": 4538 }, { "epoch": 0.8414905450500556, "grad_norm": 5.93359375, "learning_rate": 9.158509454949946e-06, "loss": 2.5092, "mean_token_accuracy": 0.48292682926829267, "step": 4539 }, { "epoch": 0.8416759362254357, "grad_norm": 5.75390625, "learning_rate": 9.158324063774565e-06, "loss": 2.928, "mean_token_accuracy": 0.4328305444261136, "step": 4540 }, { "epoch": 0.8418613274008158, "grad_norm": 7.18359375, "learning_rate": 9.158138672599185e-06, "loss": 2.5774, "mean_token_accuracy": 0.4721444133208839, "step": 4541 }, { "epoch": 0.8420467185761957, "grad_norm": 6.1953125, "learning_rate": 9.157953281423804e-06, "loss": 2.9274, "mean_token_accuracy": 0.42944187180112114, "step": 4542 }, { "epoch": 0.8422321097515758, "grad_norm": 5.26953125, "learning_rate": 9.157767890248425e-06, "loss": 2.7483, "mean_token_accuracy": 0.459679378188001, "step": 4543 }, { "epoch": 0.8424175009269559, "grad_norm": 5.48046875, "learning_rate": 9.157582499073045e-06, "loss": 2.6776, "mean_token_accuracy": 0.466697790227202, "step": 4544 }, { "epoch": 0.842602892102336, "grad_norm": 4.78515625, "learning_rate": 9.157397107897666e-06, "loss": 2.8987, "mean_token_accuracy": 0.4579643178025927, "step": 4545 }, { "epoch": 0.8427882832777159, "grad_norm": 7.140625, "learning_rate": 9.157211716722284e-06, "loss": 1.943, "mean_token_accuracy": 0.5541836490050348, "step": 4546 }, { "epoch": 0.842973674453096, "grad_norm": 6.5625, "learning_rate": 9.157026325546905e-06, "loss": 2.7451, "mean_token_accuracy": 0.451925666576892, "step": 4547 }, { "epoch": 0.8431590656284761, "grad_norm": 5.22265625, "learning_rate": 9.156840934371525e-06, "loss": 2.748, "mean_token_accuracy": 0.44850163686728783, "step": 4548 }, { "epoch": 0.8433444568038562, "grad_norm": 5.54296875, "learning_rate": 9.156655543196144e-06, "loss": 2.8334, "mean_token_accuracy": 0.44949894514767935, "step": 4549 }, { "epoch": 0.8435298479792361, "grad_norm": 6.34375, "learning_rate": 9.156470152020765e-06, "loss": 2.6232, "mean_token_accuracy": 0.495189050006776, "step": 4550 }, { "epoch": 0.8437152391546162, "grad_norm": 8.5390625, "learning_rate": 9.156284760845383e-06, "loss": 3.0298, "mean_token_accuracy": 0.4131203511766858, "step": 4551 }, { "epoch": 0.8439006303299963, "grad_norm": 8.203125, "learning_rate": 9.156099369670004e-06, "loss": 3.5428, "mean_token_accuracy": 0.37496542185338866, "step": 4552 }, { "epoch": 0.8440860215053764, "grad_norm": 5.4921875, "learning_rate": 9.155913978494624e-06, "loss": 3.0428, "mean_token_accuracy": 0.4306308384269404, "step": 4553 }, { "epoch": 0.8442714126807565, "grad_norm": 6.296875, "learning_rate": 9.155728587319245e-06, "loss": 2.9943, "mean_token_accuracy": 0.4311404507170499, "step": 4554 }, { "epoch": 0.8444568038561364, "grad_norm": 5.27734375, "learning_rate": 9.155543196143864e-06, "loss": 2.9444, "mean_token_accuracy": 0.4255471320268917, "step": 4555 }, { "epoch": 0.8446421950315165, "grad_norm": 5.046875, "learning_rate": 9.155357804968484e-06, "loss": 2.5808, "mean_token_accuracy": 0.4636100893326327, "step": 4556 }, { "epoch": 0.8448275862068966, "grad_norm": 5.6640625, "learning_rate": 9.155172413793105e-06, "loss": 2.9773, "mean_token_accuracy": 0.4701862669458834, "step": 4557 }, { "epoch": 0.8450129773822767, "grad_norm": 6.21875, "learning_rate": 9.154987022617723e-06, "loss": 3.02, "mean_token_accuracy": 0.42545855744035305, "step": 4558 }, { "epoch": 0.8451983685576566, "grad_norm": 4.875, "learning_rate": 9.154801631442344e-06, "loss": 2.3505, "mean_token_accuracy": 0.5260732476352171, "step": 4559 }, { "epoch": 0.8453837597330367, "grad_norm": 5.93359375, "learning_rate": 9.154616240266963e-06, "loss": 2.4597, "mean_token_accuracy": 0.48118081180811806, "step": 4560 }, { "epoch": 0.8455691509084168, "grad_norm": 6.3203125, "learning_rate": 9.154430849091585e-06, "loss": 2.7555, "mean_token_accuracy": 0.4737773152965661, "step": 4561 }, { "epoch": 0.8457545420837969, "grad_norm": 6.48046875, "learning_rate": 9.154245457916204e-06, "loss": 2.6971, "mean_token_accuracy": 0.4532354163779964, "step": 4562 }, { "epoch": 0.8459399332591768, "grad_norm": 8.5859375, "learning_rate": 9.154060066740824e-06, "loss": 2.9542, "mean_token_accuracy": 0.44809030056437854, "step": 4563 }, { "epoch": 0.8461253244345569, "grad_norm": 5.671875, "learning_rate": 9.153874675565445e-06, "loss": 2.9057, "mean_token_accuracy": 0.4352548036758563, "step": 4564 }, { "epoch": 0.846310715609937, "grad_norm": 6.62109375, "learning_rate": 9.153689284390064e-06, "loss": 2.9032, "mean_token_accuracy": 0.4343241869918699, "step": 4565 }, { "epoch": 0.8464961067853171, "grad_norm": 5.52734375, "learning_rate": 9.153503893214684e-06, "loss": 3.5836, "mean_token_accuracy": 0.37475984630163306, "step": 4566 }, { "epoch": 0.846681497960697, "grad_norm": 8.9453125, "learning_rate": 9.153318502039303e-06, "loss": 3.2336, "mean_token_accuracy": 0.416925562321362, "step": 4567 }, { "epoch": 0.8468668891360771, "grad_norm": 8.1328125, "learning_rate": 9.153133110863923e-06, "loss": 2.8175, "mean_token_accuracy": 0.4372207327971403, "step": 4568 }, { "epoch": 0.8470522803114572, "grad_norm": 6.37890625, "learning_rate": 9.152947719688544e-06, "loss": 2.8959, "mean_token_accuracy": 0.43572564160799454, "step": 4569 }, { "epoch": 0.8472376714868373, "grad_norm": 5.62890625, "learning_rate": 9.152762328513164e-06, "loss": 2.8433, "mean_token_accuracy": 0.4590340996759759, "step": 4570 }, { "epoch": 0.8474230626622172, "grad_norm": 7.0703125, "learning_rate": 9.152576937337783e-06, "loss": 2.7195, "mean_token_accuracy": 0.4498693664271718, "step": 4571 }, { "epoch": 0.8476084538375973, "grad_norm": 6.0, "learning_rate": 9.152391546162404e-06, "loss": 2.9667, "mean_token_accuracy": 0.44157918758557735, "step": 4572 }, { "epoch": 0.8477938450129774, "grad_norm": 7.96484375, "learning_rate": 9.152206154987024e-06, "loss": 3.1863, "mean_token_accuracy": 0.42868217054263563, "step": 4573 }, { "epoch": 0.8479792361883575, "grad_norm": 7.76953125, "learning_rate": 9.152020763811643e-06, "loss": 2.9351, "mean_token_accuracy": 0.4354955739370193, "step": 4574 }, { "epoch": 0.8481646273637374, "grad_norm": 5.41796875, "learning_rate": 9.151835372636263e-06, "loss": 3.3094, "mean_token_accuracy": 0.4124111182934712, "step": 4575 }, { "epoch": 0.8483500185391175, "grad_norm": 6.15625, "learning_rate": 9.151649981460882e-06, "loss": 3.3164, "mean_token_accuracy": 0.41254469606674615, "step": 4576 }, { "epoch": 0.8485354097144976, "grad_norm": 7.04296875, "learning_rate": 9.151464590285504e-06, "loss": 2.6545, "mean_token_accuracy": 0.4619790920807676, "step": 4577 }, { "epoch": 0.8487208008898777, "grad_norm": 5.16796875, "learning_rate": 9.151279199110123e-06, "loss": 2.4827, "mean_token_accuracy": 0.47848007870142645, "step": 4578 }, { "epoch": 0.8489061920652577, "grad_norm": 7.0546875, "learning_rate": 9.151093807934744e-06, "loss": 2.7514, "mean_token_accuracy": 0.46519377931374967, "step": 4579 }, { "epoch": 0.8490915832406377, "grad_norm": 6.7421875, "learning_rate": 9.150908416759362e-06, "loss": 3.1352, "mean_token_accuracy": 0.43538355217691777, "step": 4580 }, { "epoch": 0.8492769744160178, "grad_norm": 5.42578125, "learning_rate": 9.150723025583983e-06, "loss": 2.8222, "mean_token_accuracy": 0.4423152644704029, "step": 4581 }, { "epoch": 0.8494623655913979, "grad_norm": 6.22265625, "learning_rate": 9.150537634408603e-06, "loss": 3.3133, "mean_token_accuracy": 0.3756401384083045, "step": 4582 }, { "epoch": 0.849647756766778, "grad_norm": 5.81640625, "learning_rate": 9.150352243233222e-06, "loss": 3.0334, "mean_token_accuracy": 0.4282283884738527, "step": 4583 }, { "epoch": 0.8498331479421579, "grad_norm": 7.27734375, "learning_rate": 9.150166852057843e-06, "loss": 2.9441, "mean_token_accuracy": 0.4366584226435834, "step": 4584 }, { "epoch": 0.850018539117538, "grad_norm": 8.1484375, "learning_rate": 9.149981460882462e-06, "loss": 2.6243, "mean_token_accuracy": 0.46327752657949406, "step": 4585 }, { "epoch": 0.8502039302929181, "grad_norm": 7.125, "learning_rate": 9.149796069707084e-06, "loss": 2.9501, "mean_token_accuracy": 0.4533790650406504, "step": 4586 }, { "epoch": 0.8503893214682982, "grad_norm": 7.58984375, "learning_rate": 9.149610678531703e-06, "loss": 2.5239, "mean_token_accuracy": 0.47652723442672285, "step": 4587 }, { "epoch": 0.8505747126436781, "grad_norm": 9.046875, "learning_rate": 9.149425287356323e-06, "loss": 2.4094, "mean_token_accuracy": 0.4916861957370829, "step": 4588 }, { "epoch": 0.8507601038190582, "grad_norm": 5.71875, "learning_rate": 9.149239896180942e-06, "loss": 2.8939, "mean_token_accuracy": 0.43102124392116714, "step": 4589 }, { "epoch": 0.8509454949944383, "grad_norm": 11.1875, "learning_rate": 9.149054505005562e-06, "loss": 2.7724, "mean_token_accuracy": 0.45392528424472117, "step": 4590 }, { "epoch": 0.8511308861698184, "grad_norm": 7.3828125, "learning_rate": 9.148869113830183e-06, "loss": 2.5521, "mean_token_accuracy": 0.4860710854947166, "step": 4591 }, { "epoch": 0.8513162773451983, "grad_norm": 7.29296875, "learning_rate": 9.148683722654802e-06, "loss": 2.5617, "mean_token_accuracy": 0.47959889349930845, "step": 4592 }, { "epoch": 0.8515016685205784, "grad_norm": 7.82421875, "learning_rate": 9.148498331479422e-06, "loss": 2.6148, "mean_token_accuracy": 0.4486041982254923, "step": 4593 }, { "epoch": 0.8516870596959585, "grad_norm": 8.921875, "learning_rate": 9.148312940304043e-06, "loss": 2.2292, "mean_token_accuracy": 0.5175372192783245, "step": 4594 }, { "epoch": 0.8518724508713386, "grad_norm": 7.83984375, "learning_rate": 9.148127549128663e-06, "loss": 3.4512, "mean_token_accuracy": 0.3966630785791173, "step": 4595 }, { "epoch": 0.8520578420467185, "grad_norm": 8.0234375, "learning_rate": 9.147942157953282e-06, "loss": 2.3139, "mean_token_accuracy": 0.5229763912310287, "step": 4596 }, { "epoch": 0.8522432332220986, "grad_norm": 8.5546875, "learning_rate": 9.147756766777902e-06, "loss": 2.5622, "mean_token_accuracy": 0.4678040020523345, "step": 4597 }, { "epoch": 0.8524286243974787, "grad_norm": 6.10546875, "learning_rate": 9.147571375602521e-06, "loss": 2.3706, "mean_token_accuracy": 0.4842660052705007, "step": 4598 }, { "epoch": 0.8526140155728588, "grad_norm": 6.26953125, "learning_rate": 9.147385984427142e-06, "loss": 3.1207, "mean_token_accuracy": 0.4147045420021267, "step": 4599 }, { "epoch": 0.8527994067482387, "grad_norm": 7.98828125, "learning_rate": 9.147200593251762e-06, "loss": 2.8139, "mean_token_accuracy": 0.4517083271705369, "step": 4600 }, { "epoch": 0.8529847979236188, "grad_norm": 7.890625, "learning_rate": 9.147015202076381e-06, "loss": 2.5942, "mean_token_accuracy": 0.4730592584294836, "step": 4601 }, { "epoch": 0.8531701890989989, "grad_norm": 8.078125, "learning_rate": 9.146829810901003e-06, "loss": 3.352, "mean_token_accuracy": 0.41313535122786976, "step": 4602 }, { "epoch": 0.853355580274379, "grad_norm": 8.3203125, "learning_rate": 9.146644419725622e-06, "loss": 3.0535, "mean_token_accuracy": 0.4485408560311284, "step": 4603 }, { "epoch": 0.853540971449759, "grad_norm": 6.125, "learning_rate": 9.146459028550242e-06, "loss": 2.922, "mean_token_accuracy": 0.43643805035346644, "step": 4604 }, { "epoch": 0.853726362625139, "grad_norm": 8.7421875, "learning_rate": 9.146273637374861e-06, "loss": 2.6824, "mean_token_accuracy": 0.4525065963060686, "step": 4605 }, { "epoch": 0.8539117538005191, "grad_norm": 4.66796875, "learning_rate": 9.146088246199482e-06, "loss": 3.4257, "mean_token_accuracy": 0.39802850672705475, "step": 4606 }, { "epoch": 0.8540971449758992, "grad_norm": 7.77734375, "learning_rate": 9.145902855024102e-06, "loss": 2.9667, "mean_token_accuracy": 0.43364904776453816, "step": 4607 }, { "epoch": 0.8542825361512792, "grad_norm": 6.8828125, "learning_rate": 9.145717463848721e-06, "loss": 2.7681, "mean_token_accuracy": 0.4552487502975482, "step": 4608 }, { "epoch": 0.8544679273266592, "grad_norm": 7.34765625, "learning_rate": 9.145532072673341e-06, "loss": 2.7308, "mean_token_accuracy": 0.4507749712973594, "step": 4609 }, { "epoch": 0.8546533185020393, "grad_norm": 6.96875, "learning_rate": 9.145346681497962e-06, "loss": 3.0644, "mean_token_accuracy": 0.44049967126890205, "step": 4610 }, { "epoch": 0.8548387096774194, "grad_norm": 5.68359375, "learning_rate": 9.145161290322582e-06, "loss": 3.0464, "mean_token_accuracy": 0.43349701110162253, "step": 4611 }, { "epoch": 0.8550241008527995, "grad_norm": 5.13671875, "learning_rate": 9.144975899147201e-06, "loss": 2.9461, "mean_token_accuracy": 0.4390436153441934, "step": 4612 }, { "epoch": 0.8552094920281794, "grad_norm": 7.390625, "learning_rate": 9.144790507971822e-06, "loss": 2.2476, "mean_token_accuracy": 0.5270928687435406, "step": 4613 }, { "epoch": 0.8553948832035595, "grad_norm": 5.6953125, "learning_rate": 9.14460511679644e-06, "loss": 2.4497, "mean_token_accuracy": 0.49178303410278335, "step": 4614 }, { "epoch": 0.8555802743789396, "grad_norm": 5.51953125, "learning_rate": 9.144419725621061e-06, "loss": 3.1192, "mean_token_accuracy": 0.42038555756736606, "step": 4615 }, { "epoch": 0.8557656655543197, "grad_norm": 8.2890625, "learning_rate": 9.144234334445682e-06, "loss": 2.8057, "mean_token_accuracy": 0.4424301134791032, "step": 4616 }, { "epoch": 0.8559510567296996, "grad_norm": 7.5859375, "learning_rate": 9.1440489432703e-06, "loss": 3.3223, "mean_token_accuracy": 0.3915565979508703, "step": 4617 }, { "epoch": 0.8561364479050797, "grad_norm": 6.34765625, "learning_rate": 9.14386355209492e-06, "loss": 2.6406, "mean_token_accuracy": 0.4780232558139535, "step": 4618 }, { "epoch": 0.8563218390804598, "grad_norm": 9.078125, "learning_rate": 9.143678160919541e-06, "loss": 2.8121, "mean_token_accuracy": 0.44038956677487967, "step": 4619 }, { "epoch": 0.8565072302558399, "grad_norm": 11.7734375, "learning_rate": 9.143492769744162e-06, "loss": 2.4807, "mean_token_accuracy": 0.4883034987794955, "step": 4620 }, { "epoch": 0.8566926214312198, "grad_norm": 13.0078125, "learning_rate": 9.14330737856878e-06, "loss": 2.5472, "mean_token_accuracy": 0.4632571354772052, "step": 4621 }, { "epoch": 0.8568780126065999, "grad_norm": 8.2890625, "learning_rate": 9.143121987393401e-06, "loss": 2.6146, "mean_token_accuracy": 0.4562543192812716, "step": 4622 }, { "epoch": 0.85706340378198, "grad_norm": 6.00390625, "learning_rate": 9.14293659621802e-06, "loss": 3.1022, "mean_token_accuracy": 0.40301702171032516, "step": 4623 }, { "epoch": 0.8572487949573601, "grad_norm": 8.5546875, "learning_rate": 9.14275120504264e-06, "loss": 2.5246, "mean_token_accuracy": 0.47846543612015924, "step": 4624 }, { "epoch": 0.85743418613274, "grad_norm": 9.28125, "learning_rate": 9.14256581386726e-06, "loss": 2.8902, "mean_token_accuracy": 0.43981791404471815, "step": 4625 }, { "epoch": 0.8576195773081201, "grad_norm": 6.1640625, "learning_rate": 9.142380422691881e-06, "loss": 3.3155, "mean_token_accuracy": 0.3850203804347826, "step": 4626 }, { "epoch": 0.8578049684835002, "grad_norm": 10.2734375, "learning_rate": 9.1421950315165e-06, "loss": 2.6996, "mean_token_accuracy": 0.4652313452794865, "step": 4627 }, { "epoch": 0.8579903596588803, "grad_norm": 6.99609375, "learning_rate": 9.14200964034112e-06, "loss": 2.7668, "mean_token_accuracy": 0.45175537938844845, "step": 4628 }, { "epoch": 0.8581757508342603, "grad_norm": 7.375, "learning_rate": 9.141824249165741e-06, "loss": 2.7455, "mean_token_accuracy": 0.4558682443236329, "step": 4629 }, { "epoch": 0.8583611420096403, "grad_norm": 5.51953125, "learning_rate": 9.14163885799036e-06, "loss": 2.9031, "mean_token_accuracy": 0.45484994640943194, "step": 4630 }, { "epoch": 0.8585465331850204, "grad_norm": 5.62890625, "learning_rate": 9.14145346681498e-06, "loss": 2.7776, "mean_token_accuracy": 0.4491803278688525, "step": 4631 }, { "epoch": 0.8587319243604005, "grad_norm": 5.734375, "learning_rate": 9.1412680756396e-06, "loss": 2.8336, "mean_token_accuracy": 0.458836716901233, "step": 4632 }, { "epoch": 0.8589173155357805, "grad_norm": 5.46875, "learning_rate": 9.14108268446422e-06, "loss": 2.9393, "mean_token_accuracy": 0.44688060593980466, "step": 4633 }, { "epoch": 0.8591027067111605, "grad_norm": 7.41796875, "learning_rate": 9.14089729328884e-06, "loss": 2.9663, "mean_token_accuracy": 0.45310198068423635, "step": 4634 }, { "epoch": 0.8592880978865406, "grad_norm": 6.44921875, "learning_rate": 9.14071190211346e-06, "loss": 2.7802, "mean_token_accuracy": 0.4563735120767364, "step": 4635 }, { "epoch": 0.8594734890619207, "grad_norm": 7.6015625, "learning_rate": 9.14052651093808e-06, "loss": 2.467, "mean_token_accuracy": 0.501809268121863, "step": 4636 }, { "epoch": 0.8596588802373007, "grad_norm": 5.84375, "learning_rate": 9.1403411197627e-06, "loss": 3.2175, "mean_token_accuracy": 0.4216244865509474, "step": 4637 }, { "epoch": 0.8598442714126807, "grad_norm": 6.0859375, "learning_rate": 9.14015572858732e-06, "loss": 3.7964, "mean_token_accuracy": 0.37756883814640696, "step": 4638 }, { "epoch": 0.8600296625880608, "grad_norm": 5.61328125, "learning_rate": 9.13997033741194e-06, "loss": 3.1182, "mean_token_accuracy": 0.4431397574984046, "step": 4639 }, { "epoch": 0.8602150537634409, "grad_norm": 10.3984375, "learning_rate": 9.13978494623656e-06, "loss": 2.1447, "mean_token_accuracy": 0.4900475150892513, "step": 4640 }, { "epoch": 0.860400444938821, "grad_norm": 6.6953125, "learning_rate": 9.139599555061179e-06, "loss": 2.5984, "mean_token_accuracy": 0.45654008438818566, "step": 4641 }, { "epoch": 0.8605858361142009, "grad_norm": 5.97265625, "learning_rate": 9.1394141638858e-06, "loss": 2.8633, "mean_token_accuracy": 0.4496532237349088, "step": 4642 }, { "epoch": 0.860771227289581, "grad_norm": 6.12890625, "learning_rate": 9.13922877271042e-06, "loss": 2.7367, "mean_token_accuracy": 0.4566436301995045, "step": 4643 }, { "epoch": 0.8609566184649611, "grad_norm": 6.61328125, "learning_rate": 9.13904338153504e-06, "loss": 2.7607, "mean_token_accuracy": 0.4419168941461935, "step": 4644 }, { "epoch": 0.8611420096403412, "grad_norm": 6.37109375, "learning_rate": 9.13885799035966e-06, "loss": 3.4442, "mean_token_accuracy": 0.3995751711116356, "step": 4645 }, { "epoch": 0.8613274008157211, "grad_norm": 9.7109375, "learning_rate": 9.13867259918428e-06, "loss": 2.6569, "mean_token_accuracy": 0.4661331809088311, "step": 4646 }, { "epoch": 0.8615127919911012, "grad_norm": 6.6328125, "learning_rate": 9.1384872080089e-06, "loss": 2.5787, "mean_token_accuracy": 0.4753445305770887, "step": 4647 }, { "epoch": 0.8616981831664813, "grad_norm": 5.84375, "learning_rate": 9.138301816833519e-06, "loss": 2.8818, "mean_token_accuracy": 0.4417092924126172, "step": 4648 }, { "epoch": 0.8618835743418614, "grad_norm": 5.7734375, "learning_rate": 9.138116425658139e-06, "loss": 2.3646, "mean_token_accuracy": 0.5134058361149995, "step": 4649 }, { "epoch": 0.8620689655172413, "grad_norm": 5.73828125, "learning_rate": 9.13793103448276e-06, "loss": 3.6174, "mean_token_accuracy": 0.39386679444178246, "step": 4650 }, { "epoch": 0.8622543566926214, "grad_norm": 8.0234375, "learning_rate": 9.13774564330738e-06, "loss": 2.6811, "mean_token_accuracy": 0.45411140583554377, "step": 4651 }, { "epoch": 0.8624397478680015, "grad_norm": 7.32421875, "learning_rate": 9.137560252131999e-06, "loss": 3.5312, "mean_token_accuracy": 0.37929984779299847, "step": 4652 }, { "epoch": 0.8626251390433816, "grad_norm": 8.6484375, "learning_rate": 9.13737486095662e-06, "loss": 2.8069, "mean_token_accuracy": 0.45248322147651004, "step": 4653 }, { "epoch": 0.8628105302187616, "grad_norm": 6.9140625, "learning_rate": 9.13718946978124e-06, "loss": 2.5251, "mean_token_accuracy": 0.4804425744117189, "step": 4654 }, { "epoch": 0.8629959213941416, "grad_norm": 5.93359375, "learning_rate": 9.137004078605859e-06, "loss": 2.9191, "mean_token_accuracy": 0.4490266393442623, "step": 4655 }, { "epoch": 0.8631813125695217, "grad_norm": 8.140625, "learning_rate": 9.136818687430479e-06, "loss": 3.1519, "mean_token_accuracy": 0.42240145700533366, "step": 4656 }, { "epoch": 0.8633667037449018, "grad_norm": 8.2421875, "learning_rate": 9.136633296255098e-06, "loss": 2.6647, "mean_token_accuracy": 0.4605709973588228, "step": 4657 }, { "epoch": 0.8635520949202818, "grad_norm": 5.3359375, "learning_rate": 9.13644790507972e-06, "loss": 3.4337, "mean_token_accuracy": 0.38745294855708906, "step": 4658 }, { "epoch": 0.8637374860956618, "grad_norm": 6.67578125, "learning_rate": 9.136262513904339e-06, "loss": 2.6868, "mean_token_accuracy": 0.46804326450344147, "step": 4659 }, { "epoch": 0.8639228772710419, "grad_norm": 5.47265625, "learning_rate": 9.13607712272896e-06, "loss": 3.0203, "mean_token_accuracy": 0.44267291910902695, "step": 4660 }, { "epoch": 0.864108268446422, "grad_norm": 6.6015625, "learning_rate": 9.135891731553578e-06, "loss": 2.8542, "mean_token_accuracy": 0.4447283740299072, "step": 4661 }, { "epoch": 0.864293659621802, "grad_norm": 5.65234375, "learning_rate": 9.135706340378199e-06, "loss": 3.0972, "mean_token_accuracy": 0.43463230672533, "step": 4662 }, { "epoch": 0.864479050797182, "grad_norm": 7.8671875, "learning_rate": 9.13552094920282e-06, "loss": 2.6636, "mean_token_accuracy": 0.4828038325769037, "step": 4663 }, { "epoch": 0.8646644419725621, "grad_norm": 6.015625, "learning_rate": 9.135335558027438e-06, "loss": 2.5573, "mean_token_accuracy": 0.49047733847637415, "step": 4664 }, { "epoch": 0.8648498331479422, "grad_norm": 4.83203125, "learning_rate": 9.135150166852058e-06, "loss": 2.1404, "mean_token_accuracy": 0.538717402873869, "step": 4665 }, { "epoch": 0.8650352243233222, "grad_norm": 6.4140625, "learning_rate": 9.134964775676679e-06, "loss": 3.0335, "mean_token_accuracy": 0.4249540246145141, "step": 4666 }, { "epoch": 0.8652206154987022, "grad_norm": 5.859375, "learning_rate": 9.1347793845013e-06, "loss": 2.8254, "mean_token_accuracy": 0.45067228252065444, "step": 4667 }, { "epoch": 0.8654060066740823, "grad_norm": 8.21875, "learning_rate": 9.134593993325918e-06, "loss": 2.8289, "mean_token_accuracy": 0.43745261561789234, "step": 4668 }, { "epoch": 0.8655913978494624, "grad_norm": 5.87109375, "learning_rate": 9.134408602150539e-06, "loss": 3.0943, "mean_token_accuracy": 0.4046817849305048, "step": 4669 }, { "epoch": 0.8657767890248425, "grad_norm": 6.09375, "learning_rate": 9.134223210975158e-06, "loss": 3.3359, "mean_token_accuracy": 0.4003673094582185, "step": 4670 }, { "epoch": 0.8659621802002224, "grad_norm": 6.0078125, "learning_rate": 9.134037819799778e-06, "loss": 2.8823, "mean_token_accuracy": 0.45887265135699373, "step": 4671 }, { "epoch": 0.8661475713756025, "grad_norm": 5.1484375, "learning_rate": 9.133852428624399e-06, "loss": 3.2096, "mean_token_accuracy": 0.43425414364640885, "step": 4672 }, { "epoch": 0.8663329625509826, "grad_norm": 6.12890625, "learning_rate": 9.133667037449017e-06, "loss": 2.94, "mean_token_accuracy": 0.4308415967454869, "step": 4673 }, { "epoch": 0.8665183537263627, "grad_norm": 7.58984375, "learning_rate": 9.133481646273638e-06, "loss": 3.1473, "mean_token_accuracy": 0.41728100607111884, "step": 4674 }, { "epoch": 0.8667037449017426, "grad_norm": 8.8515625, "learning_rate": 9.133296255098258e-06, "loss": 2.2464, "mean_token_accuracy": 0.5138820029747149, "step": 4675 }, { "epoch": 0.8668891360771227, "grad_norm": 6.515625, "learning_rate": 9.133110863922879e-06, "loss": 2.7876, "mean_token_accuracy": 0.436130007558579, "step": 4676 }, { "epoch": 0.8670745272525028, "grad_norm": 5.53125, "learning_rate": 9.132925472747498e-06, "loss": 3.1226, "mean_token_accuracy": 0.4367537915456599, "step": 4677 }, { "epoch": 0.8672599184278829, "grad_norm": 6.6640625, "learning_rate": 9.132740081572118e-06, "loss": 2.5767, "mean_token_accuracy": 0.47606793618116316, "step": 4678 }, { "epoch": 0.8674453096032629, "grad_norm": 7.46875, "learning_rate": 9.132554690396737e-06, "loss": 3.0034, "mean_token_accuracy": 0.4336448598130841, "step": 4679 }, { "epoch": 0.8676307007786429, "grad_norm": 5.7265625, "learning_rate": 9.132369299221357e-06, "loss": 3.0675, "mean_token_accuracy": 0.4206041828040279, "step": 4680 }, { "epoch": 0.867816091954023, "grad_norm": 8.109375, "learning_rate": 9.132183908045978e-06, "loss": 2.9316, "mean_token_accuracy": 0.44249712165792504, "step": 4681 }, { "epoch": 0.8680014831294031, "grad_norm": 10.3203125, "learning_rate": 9.131998516870598e-06, "loss": 2.7924, "mean_token_accuracy": 0.4553583168967784, "step": 4682 }, { "epoch": 0.8681868743047831, "grad_norm": 7.40234375, "learning_rate": 9.131813125695219e-06, "loss": 3.0827, "mean_token_accuracy": 0.42592079756300194, "step": 4683 }, { "epoch": 0.8683722654801631, "grad_norm": 6.91015625, "learning_rate": 9.131627734519838e-06, "loss": 2.8227, "mean_token_accuracy": 0.43492488472408153, "step": 4684 }, { "epoch": 0.8685576566555432, "grad_norm": 6.44140625, "learning_rate": 9.131442343344458e-06, "loss": 3.4536, "mean_token_accuracy": 0.3873053576141462, "step": 4685 }, { "epoch": 0.8687430478309233, "grad_norm": 5.921875, "learning_rate": 9.131256952169077e-06, "loss": 3.3182, "mean_token_accuracy": 0.40918023582257157, "step": 4686 }, { "epoch": 0.8689284390063033, "grad_norm": 6.3359375, "learning_rate": 9.131071560993697e-06, "loss": 2.6813, "mean_token_accuracy": 0.44119131751640583, "step": 4687 }, { "epoch": 0.8691138301816833, "grad_norm": 9.84375, "learning_rate": 9.130886169818318e-06, "loss": 2.9004, "mean_token_accuracy": 0.44009843284548633, "step": 4688 }, { "epoch": 0.8692992213570634, "grad_norm": 7.765625, "learning_rate": 9.130700778642937e-06, "loss": 2.9159, "mean_token_accuracy": 0.44710021839359376, "step": 4689 }, { "epoch": 0.8694846125324435, "grad_norm": 7.5703125, "learning_rate": 9.130515387467557e-06, "loss": 2.8601, "mean_token_accuracy": 0.4358793022159359, "step": 4690 }, { "epoch": 0.8696700037078235, "grad_norm": 6.27734375, "learning_rate": 9.130329996292178e-06, "loss": 3.1576, "mean_token_accuracy": 0.41571259376233716, "step": 4691 }, { "epoch": 0.8698553948832035, "grad_norm": 5.88671875, "learning_rate": 9.130144605116798e-06, "loss": 2.7038, "mean_token_accuracy": 0.44497742663656886, "step": 4692 }, { "epoch": 0.8700407860585836, "grad_norm": 6.6953125, "learning_rate": 9.129959213941417e-06, "loss": 3.104, "mean_token_accuracy": 0.40666564745451766, "step": 4693 }, { "epoch": 0.8702261772339637, "grad_norm": 7.08203125, "learning_rate": 9.129773822766037e-06, "loss": 2.8246, "mean_token_accuracy": 0.45953326713008935, "step": 4694 }, { "epoch": 0.8704115684093437, "grad_norm": 6.234375, "learning_rate": 9.129588431590656e-06, "loss": 2.9804, "mean_token_accuracy": 0.4315159574468085, "step": 4695 }, { "epoch": 0.8705969595847237, "grad_norm": 7.7734375, "learning_rate": 9.129403040415277e-06, "loss": 2.7531, "mean_token_accuracy": 0.4513397517689363, "step": 4696 }, { "epoch": 0.8707823507601038, "grad_norm": 5.27734375, "learning_rate": 9.129217649239897e-06, "loss": 2.3878, "mean_token_accuracy": 0.48323956868260665, "step": 4697 }, { "epoch": 0.8709677419354839, "grad_norm": 7.01171875, "learning_rate": 9.129032258064518e-06, "loss": 2.4063, "mean_token_accuracy": 0.49667752442996743, "step": 4698 }, { "epoch": 0.871153133110864, "grad_norm": 12.46875, "learning_rate": 9.128846866889137e-06, "loss": 2.1499, "mean_token_accuracy": 0.5686990727732509, "step": 4699 }, { "epoch": 0.8713385242862439, "grad_norm": 9.734375, "learning_rate": 9.128661475713757e-06, "loss": 2.7293, "mean_token_accuracy": 0.4741484184914842, "step": 4700 }, { "epoch": 0.871523915461624, "grad_norm": 8.1640625, "learning_rate": 9.128476084538378e-06, "loss": 1.9806, "mean_token_accuracy": 0.5505342188853595, "step": 4701 }, { "epoch": 0.8717093066370041, "grad_norm": 5.50390625, "learning_rate": 9.128290693362996e-06, "loss": 2.6967, "mean_token_accuracy": 0.4743479507022069, "step": 4702 }, { "epoch": 0.8718946978123842, "grad_norm": 6.14453125, "learning_rate": 9.128105302187617e-06, "loss": 2.9239, "mean_token_accuracy": 0.4404992729035385, "step": 4703 }, { "epoch": 0.8720800889877642, "grad_norm": 8.4609375, "learning_rate": 9.127919911012236e-06, "loss": 2.4697, "mean_token_accuracy": 0.48561822936122523, "step": 4704 }, { "epoch": 0.8722654801631442, "grad_norm": 6.2421875, "learning_rate": 9.127734519836856e-06, "loss": 2.4644, "mean_token_accuracy": 0.510533035429301, "step": 4705 }, { "epoch": 0.8724508713385243, "grad_norm": 6.5625, "learning_rate": 9.127549128661477e-06, "loss": 2.5303, "mean_token_accuracy": 0.49799433695139217, "step": 4706 }, { "epoch": 0.8726362625139044, "grad_norm": 5.41796875, "learning_rate": 9.127363737486097e-06, "loss": 4.009, "mean_token_accuracy": 0.35014090177133655, "step": 4707 }, { "epoch": 0.8728216536892844, "grad_norm": 6.5546875, "learning_rate": 9.127178346310716e-06, "loss": 3.5426, "mean_token_accuracy": 0.38655102259342544, "step": 4708 }, { "epoch": 0.8730070448646644, "grad_norm": 7.765625, "learning_rate": 9.126992955135336e-06, "loss": 2.8556, "mean_token_accuracy": 0.42280720681842704, "step": 4709 }, { "epoch": 0.8731924360400445, "grad_norm": 7.40625, "learning_rate": 9.126807563959957e-06, "loss": 3.0304, "mean_token_accuracy": 0.4362083218517498, "step": 4710 }, { "epoch": 0.8733778272154246, "grad_norm": 6.37109375, "learning_rate": 9.126622172784576e-06, "loss": 3.0044, "mean_token_accuracy": 0.4462358923477614, "step": 4711 }, { "epoch": 0.8735632183908046, "grad_norm": 7.22265625, "learning_rate": 9.126436781609196e-06, "loss": 2.7918, "mean_token_accuracy": 0.44680015704750686, "step": 4712 }, { "epoch": 0.8737486095661846, "grad_norm": 6.54296875, "learning_rate": 9.126251390433815e-06, "loss": 2.6877, "mean_token_accuracy": 0.45240715268225584, "step": 4713 }, { "epoch": 0.8739340007415647, "grad_norm": 6.5234375, "learning_rate": 9.126065999258435e-06, "loss": 2.6366, "mean_token_accuracy": 0.47361729179911, "step": 4714 }, { "epoch": 0.8741193919169448, "grad_norm": 5.83203125, "learning_rate": 9.125880608083056e-06, "loss": 2.311, "mean_token_accuracy": 0.5197648489762822, "step": 4715 }, { "epoch": 0.8743047830923248, "grad_norm": 6.9140625, "learning_rate": 9.125695216907676e-06, "loss": 2.6729, "mean_token_accuracy": 0.44745044258820593, "step": 4716 }, { "epoch": 0.8744901742677048, "grad_norm": 7.6953125, "learning_rate": 9.125509825732295e-06, "loss": 3.0622, "mean_token_accuracy": 0.4200859710824541, "step": 4717 }, { "epoch": 0.8746755654430849, "grad_norm": 7.44921875, "learning_rate": 9.125324434556916e-06, "loss": 2.6061, "mean_token_accuracy": 0.4707744249841739, "step": 4718 }, { "epoch": 0.874860956618465, "grad_norm": 6.1328125, "learning_rate": 9.125139043381536e-06, "loss": 2.705, "mean_token_accuracy": 0.44725453070056803, "step": 4719 }, { "epoch": 0.875046347793845, "grad_norm": 6.63671875, "learning_rate": 9.124953652206155e-06, "loss": 3.3261, "mean_token_accuracy": 0.40443250503693756, "step": 4720 }, { "epoch": 0.875231738969225, "grad_norm": 7.99609375, "learning_rate": 9.124768261030775e-06, "loss": 2.7319, "mean_token_accuracy": 0.46303901437371664, "step": 4721 }, { "epoch": 0.8754171301446051, "grad_norm": 8.4765625, "learning_rate": 9.124582869855394e-06, "loss": 2.665, "mean_token_accuracy": 0.46344249809596344, "step": 4722 }, { "epoch": 0.8756025213199852, "grad_norm": 6.80859375, "learning_rate": 9.124397478680016e-06, "loss": 3.0257, "mean_token_accuracy": 0.42859334253719894, "step": 4723 }, { "epoch": 0.8757879124953653, "grad_norm": 7.671875, "learning_rate": 9.124212087504635e-06, "loss": 2.7506, "mean_token_accuracy": 0.44644484958979036, "step": 4724 }, { "epoch": 0.8759733036707452, "grad_norm": 7.0, "learning_rate": 9.124026696329256e-06, "loss": 2.346, "mean_token_accuracy": 0.49849883242521964, "step": 4725 }, { "epoch": 0.8761586948461253, "grad_norm": 7.01953125, "learning_rate": 9.123841305153876e-06, "loss": 2.2492, "mean_token_accuracy": 0.5295158771473191, "step": 4726 }, { "epoch": 0.8763440860215054, "grad_norm": 8.5234375, "learning_rate": 9.123655913978495e-06, "loss": 2.83, "mean_token_accuracy": 0.44706454357603015, "step": 4727 }, { "epoch": 0.8765294771968855, "grad_norm": 6.03515625, "learning_rate": 9.123470522803116e-06, "loss": 2.5547, "mean_token_accuracy": 0.48062110282430615, "step": 4728 }, { "epoch": 0.8767148683722655, "grad_norm": 6.25390625, "learning_rate": 9.123285131627734e-06, "loss": 3.1319, "mean_token_accuracy": 0.4159621578099839, "step": 4729 }, { "epoch": 0.8769002595476455, "grad_norm": 6.1953125, "learning_rate": 9.123099740452355e-06, "loss": 3.3989, "mean_token_accuracy": 0.3944636678200692, "step": 4730 }, { "epoch": 0.8770856507230256, "grad_norm": 4.66015625, "learning_rate": 9.122914349276975e-06, "loss": 2.6745, "mean_token_accuracy": 0.4903914590747331, "step": 4731 }, { "epoch": 0.8772710418984057, "grad_norm": 7.9921875, "learning_rate": 9.122728958101596e-06, "loss": 2.5371, "mean_token_accuracy": 0.4905070618198657, "step": 4732 }, { "epoch": 0.8774564330737857, "grad_norm": 6.49609375, "learning_rate": 9.122543566926215e-06, "loss": 2.2538, "mean_token_accuracy": 0.513311052206984, "step": 4733 }, { "epoch": 0.8776418242491657, "grad_norm": 6.55078125, "learning_rate": 9.122358175750835e-06, "loss": 3.086, "mean_token_accuracy": 0.4284332688588008, "step": 4734 }, { "epoch": 0.8778272154245458, "grad_norm": 8.53125, "learning_rate": 9.122172784575456e-06, "loss": 2.7871, "mean_token_accuracy": 0.45256108148331886, "step": 4735 }, { "epoch": 0.8780126065999259, "grad_norm": 6.60546875, "learning_rate": 9.121987393400074e-06, "loss": 2.8625, "mean_token_accuracy": 0.45506820005349025, "step": 4736 }, { "epoch": 0.8781979977753059, "grad_norm": 7.89453125, "learning_rate": 9.121802002224695e-06, "loss": 3.0716, "mean_token_accuracy": 0.4388914837303441, "step": 4737 }, { "epoch": 0.8783833889506859, "grad_norm": 8.890625, "learning_rate": 9.121616611049314e-06, "loss": 2.8772, "mean_token_accuracy": 0.43386714116251485, "step": 4738 }, { "epoch": 0.878568780126066, "grad_norm": 6.2890625, "learning_rate": 9.121431219873936e-06, "loss": 3.6184, "mean_token_accuracy": 0.3801114140097191, "step": 4739 }, { "epoch": 0.8787541713014461, "grad_norm": 11.1015625, "learning_rate": 9.121245828698555e-06, "loss": 2.7374, "mean_token_accuracy": 0.45813860328481776, "step": 4740 }, { "epoch": 0.8789395624768261, "grad_norm": 11.6171875, "learning_rate": 9.121060437523175e-06, "loss": 2.805, "mean_token_accuracy": 0.4422068386826096, "step": 4741 }, { "epoch": 0.8791249536522061, "grad_norm": 6.84765625, "learning_rate": 9.120875046347794e-06, "loss": 2.8808, "mean_token_accuracy": 0.4439571150097466, "step": 4742 }, { "epoch": 0.8793103448275862, "grad_norm": 6.13671875, "learning_rate": 9.120689655172414e-06, "loss": 3.0456, "mean_token_accuracy": 0.4279114740008595, "step": 4743 }, { "epoch": 0.8794957360029663, "grad_norm": 7.38671875, "learning_rate": 9.120504263997035e-06, "loss": 3.1233, "mean_token_accuracy": 0.4172205438066465, "step": 4744 }, { "epoch": 0.8796811271783463, "grad_norm": 8.1171875, "learning_rate": 9.120318872821654e-06, "loss": 2.2689, "mean_token_accuracy": 0.5117899761336515, "step": 4745 }, { "epoch": 0.8798665183537263, "grad_norm": 7.97265625, "learning_rate": 9.120133481646274e-06, "loss": 2.6436, "mean_token_accuracy": 0.46176279974076473, "step": 4746 }, { "epoch": 0.8800519095291064, "grad_norm": 7.94140625, "learning_rate": 9.119948090470895e-06, "loss": 3.0196, "mean_token_accuracy": 0.4321796071094481, "step": 4747 }, { "epoch": 0.8802373007044865, "grad_norm": 6.90625, "learning_rate": 9.119762699295515e-06, "loss": 3.0322, "mean_token_accuracy": 0.4455611390284757, "step": 4748 }, { "epoch": 0.8804226918798665, "grad_norm": 7.18359375, "learning_rate": 9.119577308120134e-06, "loss": 2.6006, "mean_token_accuracy": 0.4733405875952122, "step": 4749 }, { "epoch": 0.8806080830552465, "grad_norm": 9.125, "learning_rate": 9.119391916944754e-06, "loss": 3.0766, "mean_token_accuracy": 0.41508737386167854, "step": 4750 }, { "epoch": 0.8807934742306266, "grad_norm": 8.4296875, "learning_rate": 9.119206525769373e-06, "loss": 2.5999, "mean_token_accuracy": 0.47165566886622673, "step": 4751 }, { "epoch": 0.8809788654060067, "grad_norm": 5.83984375, "learning_rate": 9.119021134593994e-06, "loss": 2.756, "mean_token_accuracy": 0.46314203189752606, "step": 4752 }, { "epoch": 0.8811642565813868, "grad_norm": 7.2109375, "learning_rate": 9.118835743418614e-06, "loss": 2.9488, "mean_token_accuracy": 0.42040443971415536, "step": 4753 }, { "epoch": 0.8813496477567668, "grad_norm": 9.625, "learning_rate": 9.118650352243233e-06, "loss": 2.9176, "mean_token_accuracy": 0.4297745529930034, "step": 4754 }, { "epoch": 0.8815350389321468, "grad_norm": 10.4921875, "learning_rate": 9.118464961067854e-06, "loss": 2.4174, "mean_token_accuracy": 0.48940269749518306, "step": 4755 }, { "epoch": 0.8817204301075269, "grad_norm": 6.23046875, "learning_rate": 9.118279569892474e-06, "loss": 2.7697, "mean_token_accuracy": 0.44727891156462585, "step": 4756 }, { "epoch": 0.881905821282907, "grad_norm": 6.4609375, "learning_rate": 9.118094178717095e-06, "loss": 2.4987, "mean_token_accuracy": 0.4994364609749225, "step": 4757 }, { "epoch": 0.882091212458287, "grad_norm": 7.0625, "learning_rate": 9.117908787541713e-06, "loss": 3.2226, "mean_token_accuracy": 0.40342243596913097, "step": 4758 }, { "epoch": 0.882276603633667, "grad_norm": 6.296875, "learning_rate": 9.117723396366334e-06, "loss": 2.9266, "mean_token_accuracy": 0.45411003236245956, "step": 4759 }, { "epoch": 0.8824619948090471, "grad_norm": 8.5703125, "learning_rate": 9.117538005190953e-06, "loss": 2.6909, "mean_token_accuracy": 0.46436443791329907, "step": 4760 }, { "epoch": 0.8826473859844272, "grad_norm": 9.6171875, "learning_rate": 9.117352614015573e-06, "loss": 2.5413, "mean_token_accuracy": 0.47476261869065467, "step": 4761 }, { "epoch": 0.8828327771598072, "grad_norm": 10.375, "learning_rate": 9.117167222840194e-06, "loss": 2.6112, "mean_token_accuracy": 0.48375410392845014, "step": 4762 }, { "epoch": 0.8830181683351872, "grad_norm": 6.3671875, "learning_rate": 9.116981831664814e-06, "loss": 2.7955, "mean_token_accuracy": 0.4416274790431405, "step": 4763 }, { "epoch": 0.8832035595105673, "grad_norm": 7.96484375, "learning_rate": 9.116796440489435e-06, "loss": 2.6604, "mean_token_accuracy": 0.4526788142507479, "step": 4764 }, { "epoch": 0.8833889506859474, "grad_norm": 6.90234375, "learning_rate": 9.116611049314053e-06, "loss": 2.7788, "mean_token_accuracy": 0.4520460358056266, "step": 4765 }, { "epoch": 0.8835743418613274, "grad_norm": 5.70703125, "learning_rate": 9.116425658138674e-06, "loss": 3.2422, "mean_token_accuracy": 0.40122341165021547, "step": 4766 }, { "epoch": 0.8837597330367074, "grad_norm": 5.23046875, "learning_rate": 9.116240266963293e-06, "loss": 2.397, "mean_token_accuracy": 0.5078902402251483, "step": 4767 }, { "epoch": 0.8839451242120875, "grad_norm": 7.015625, "learning_rate": 9.116054875787913e-06, "loss": 2.812, "mean_token_accuracy": 0.4486486486486487, "step": 4768 }, { "epoch": 0.8841305153874676, "grad_norm": 7.0703125, "learning_rate": 9.115869484612534e-06, "loss": 2.9554, "mean_token_accuracy": 0.4558604973744179, "step": 4769 }, { "epoch": 0.8843159065628476, "grad_norm": 4.76953125, "learning_rate": 9.115684093437152e-06, "loss": 2.8999, "mean_token_accuracy": 0.44882600842865744, "step": 4770 }, { "epoch": 0.8845012977382276, "grad_norm": 11.4921875, "learning_rate": 9.115498702261773e-06, "loss": 2.7147, "mean_token_accuracy": 0.42719725919093426, "step": 4771 }, { "epoch": 0.8846866889136077, "grad_norm": 8.71875, "learning_rate": 9.115313311086393e-06, "loss": 3.3462, "mean_token_accuracy": 0.4055905727596602, "step": 4772 }, { "epoch": 0.8848720800889878, "grad_norm": 6.21484375, "learning_rate": 9.115127919911014e-06, "loss": 2.5715, "mean_token_accuracy": 0.47835547411818596, "step": 4773 }, { "epoch": 0.8850574712643678, "grad_norm": 5.63671875, "learning_rate": 9.114942528735633e-06, "loss": 2.8258, "mean_token_accuracy": 0.4738380590470623, "step": 4774 }, { "epoch": 0.8852428624397478, "grad_norm": 7.390625, "learning_rate": 9.114757137560253e-06, "loss": 3.3038, "mean_token_accuracy": 0.40875232774674114, "step": 4775 }, { "epoch": 0.8854282536151279, "grad_norm": 11.453125, "learning_rate": 9.114571746384872e-06, "loss": 2.7958, "mean_token_accuracy": 0.43519048163825647, "step": 4776 }, { "epoch": 0.885613644790508, "grad_norm": 6.7578125, "learning_rate": 9.114386355209493e-06, "loss": 2.9668, "mean_token_accuracy": 0.434561791899987, "step": 4777 }, { "epoch": 0.885799035965888, "grad_norm": 8.1484375, "learning_rate": 9.114200964034113e-06, "loss": 2.9992, "mean_token_accuracy": 0.43731629392971244, "step": 4778 }, { "epoch": 0.8859844271412681, "grad_norm": 7.33203125, "learning_rate": 9.114015572858733e-06, "loss": 2.7213, "mean_token_accuracy": 0.4538140643623361, "step": 4779 }, { "epoch": 0.8861698183166481, "grad_norm": 4.703125, "learning_rate": 9.113830181683352e-06, "loss": 2.7574, "mean_token_accuracy": 0.45145468732582766, "step": 4780 }, { "epoch": 0.8863552094920282, "grad_norm": 6.1484375, "learning_rate": 9.113644790507973e-06, "loss": 2.5406, "mean_token_accuracy": 0.46855345911949686, "step": 4781 }, { "epoch": 0.8865406006674083, "grad_norm": 5.68359375, "learning_rate": 9.113459399332593e-06, "loss": 2.9505, "mean_token_accuracy": 0.44532130777903045, "step": 4782 }, { "epoch": 0.8867259918427883, "grad_norm": 10.1640625, "learning_rate": 9.113274008157212e-06, "loss": 2.5814, "mean_token_accuracy": 0.471386040357261, "step": 4783 }, { "epoch": 0.8869113830181683, "grad_norm": 8.9609375, "learning_rate": 9.113088616981833e-06, "loss": 2.58, "mean_token_accuracy": 0.45769427402862983, "step": 4784 }, { "epoch": 0.8870967741935484, "grad_norm": 6.71875, "learning_rate": 9.112903225806451e-06, "loss": 2.7992, "mean_token_accuracy": 0.4433485078401619, "step": 4785 }, { "epoch": 0.8872821653689285, "grad_norm": 6.34375, "learning_rate": 9.112717834631072e-06, "loss": 3.2165, "mean_token_accuracy": 0.4023738872403561, "step": 4786 }, { "epoch": 0.8874675565443085, "grad_norm": 6.484375, "learning_rate": 9.112532443455692e-06, "loss": 2.7502, "mean_token_accuracy": 0.45825486503452606, "step": 4787 }, { "epoch": 0.8876529477196885, "grad_norm": 5.6796875, "learning_rate": 9.112347052280313e-06, "loss": 3.1176, "mean_token_accuracy": 0.4289069171648164, "step": 4788 }, { "epoch": 0.8878383388950686, "grad_norm": 6.32421875, "learning_rate": 9.112161661104932e-06, "loss": 3.4602, "mean_token_accuracy": 0.38835965026751923, "step": 4789 }, { "epoch": 0.8880237300704487, "grad_norm": 10.3125, "learning_rate": 9.111976269929552e-06, "loss": 2.7586, "mean_token_accuracy": 0.4556019070321812, "step": 4790 }, { "epoch": 0.8882091212458287, "grad_norm": 8.6796875, "learning_rate": 9.111790878754173e-06, "loss": 3.0236, "mean_token_accuracy": 0.44246277915632753, "step": 4791 }, { "epoch": 0.8883945124212087, "grad_norm": 8.328125, "learning_rate": 9.111605487578791e-06, "loss": 2.9589, "mean_token_accuracy": 0.433116413593637, "step": 4792 }, { "epoch": 0.8885799035965888, "grad_norm": 9.8828125, "learning_rate": 9.111420096403412e-06, "loss": 2.6919, "mean_token_accuracy": 0.469671603348358, "step": 4793 }, { "epoch": 0.8887652947719689, "grad_norm": 8.734375, "learning_rate": 9.11123470522803e-06, "loss": 3.0708, "mean_token_accuracy": 0.427466483327604, "step": 4794 }, { "epoch": 0.8889506859473489, "grad_norm": 5.65234375, "learning_rate": 9.111049314052653e-06, "loss": 3.2376, "mean_token_accuracy": 0.4072338380197068, "step": 4795 }, { "epoch": 0.8891360771227289, "grad_norm": 6.23046875, "learning_rate": 9.110863922877272e-06, "loss": 3.0328, "mean_token_accuracy": 0.42715812988670815, "step": 4796 }, { "epoch": 0.889321468298109, "grad_norm": 7.4765625, "learning_rate": 9.110678531701892e-06, "loss": 2.4425, "mean_token_accuracy": 0.48037399664814323, "step": 4797 }, { "epoch": 0.8895068594734891, "grad_norm": 6.67578125, "learning_rate": 9.110493140526511e-06, "loss": 2.8499, "mean_token_accuracy": 0.44973821989528795, "step": 4798 }, { "epoch": 0.8896922506488691, "grad_norm": 5.30859375, "learning_rate": 9.110307749351131e-06, "loss": 2.9762, "mean_token_accuracy": 0.4624066126212049, "step": 4799 }, { "epoch": 0.8898776418242491, "grad_norm": 6.1484375, "learning_rate": 9.110122358175752e-06, "loss": 2.4517, "mean_token_accuracy": 0.4829152249134948, "step": 4800 }, { "epoch": 0.8900630329996292, "grad_norm": 7.984375, "learning_rate": 9.10993696700037e-06, "loss": 3.2612, "mean_token_accuracy": 0.4140942073545689, "step": 4801 }, { "epoch": 0.8902484241750093, "grad_norm": 7.46875, "learning_rate": 9.109751575824991e-06, "loss": 3.1539, "mean_token_accuracy": 0.4131321370309951, "step": 4802 }, { "epoch": 0.8904338153503893, "grad_norm": 5.3828125, "learning_rate": 9.109566184649612e-06, "loss": 2.6659, "mean_token_accuracy": 0.48419756464632646, "step": 4803 }, { "epoch": 0.8906192065257694, "grad_norm": 6.41796875, "learning_rate": 9.109380793474232e-06, "loss": 3.3095, "mean_token_accuracy": 0.39717925386715197, "step": 4804 }, { "epoch": 0.8908045977011494, "grad_norm": 16.21875, "learning_rate": 9.109195402298851e-06, "loss": 2.0979, "mean_token_accuracy": 0.5234252174824442, "step": 4805 }, { "epoch": 0.8909899888765295, "grad_norm": 7.30859375, "learning_rate": 9.109010011123472e-06, "loss": 2.5693, "mean_token_accuracy": 0.47193611850480266, "step": 4806 }, { "epoch": 0.8911753800519095, "grad_norm": 6.91796875, "learning_rate": 9.108824619948092e-06, "loss": 2.8091, "mean_token_accuracy": 0.4536549707602339, "step": 4807 }, { "epoch": 0.8913607712272896, "grad_norm": 7.12109375, "learning_rate": 9.10863922877271e-06, "loss": 2.9058, "mean_token_accuracy": 0.43007518796992483, "step": 4808 }, { "epoch": 0.8915461624026696, "grad_norm": 8.875, "learning_rate": 9.108453837597331e-06, "loss": 2.5582, "mean_token_accuracy": 0.4794846134600951, "step": 4809 }, { "epoch": 0.8917315535780497, "grad_norm": 7.4140625, "learning_rate": 9.10826844642195e-06, "loss": 2.4695, "mean_token_accuracy": 0.48445154419595315, "step": 4810 }, { "epoch": 0.8919169447534298, "grad_norm": 8.40625, "learning_rate": 9.108083055246572e-06, "loss": 2.8693, "mean_token_accuracy": 0.4464115969581749, "step": 4811 }, { "epoch": 0.8921023359288098, "grad_norm": 6.73046875, "learning_rate": 9.107897664071191e-06, "loss": 3.1149, "mean_token_accuracy": 0.4265320836337419, "step": 4812 }, { "epoch": 0.8922877271041898, "grad_norm": 9.3828125, "learning_rate": 9.107712272895812e-06, "loss": 2.5036, "mean_token_accuracy": 0.4868949482139083, "step": 4813 }, { "epoch": 0.8924731182795699, "grad_norm": 5.61328125, "learning_rate": 9.10752688172043e-06, "loss": 2.9838, "mean_token_accuracy": 0.44264175680104884, "step": 4814 }, { "epoch": 0.89265850945495, "grad_norm": 5.83984375, "learning_rate": 9.107341490545051e-06, "loss": 2.7201, "mean_token_accuracy": 0.44769403824521936, "step": 4815 }, { "epoch": 0.89284390063033, "grad_norm": 5.75390625, "learning_rate": 9.107156099369671e-06, "loss": 2.5148, "mean_token_accuracy": 0.495776630689817, "step": 4816 }, { "epoch": 0.89302929180571, "grad_norm": 5.8046875, "learning_rate": 9.10697070819429e-06, "loss": 3.2708, "mean_token_accuracy": 0.4203842396613481, "step": 4817 }, { "epoch": 0.8932146829810901, "grad_norm": 5.66796875, "learning_rate": 9.10678531701891e-06, "loss": 2.6114, "mean_token_accuracy": 0.47569060773480665, "step": 4818 }, { "epoch": 0.8934000741564702, "grad_norm": 7.0703125, "learning_rate": 9.106599925843531e-06, "loss": 2.782, "mean_token_accuracy": 0.46246396791577893, "step": 4819 }, { "epoch": 0.8935854653318502, "grad_norm": 5.36328125, "learning_rate": 9.106414534668152e-06, "loss": 3.0879, "mean_token_accuracy": 0.41668705652067534, "step": 4820 }, { "epoch": 0.8937708565072302, "grad_norm": 6.41015625, "learning_rate": 9.10622914349277e-06, "loss": 3.0243, "mean_token_accuracy": 0.42051802945657696, "step": 4821 }, { "epoch": 0.8939562476826103, "grad_norm": 8.953125, "learning_rate": 9.106043752317391e-06, "loss": 3.012, "mean_token_accuracy": 0.4723398044081156, "step": 4822 }, { "epoch": 0.8941416388579904, "grad_norm": 6.5390625, "learning_rate": 9.10585836114201e-06, "loss": 2.4209, "mean_token_accuracy": 0.4869003062266077, "step": 4823 }, { "epoch": 0.8943270300333704, "grad_norm": 5.8203125, "learning_rate": 9.10567296996663e-06, "loss": 2.9491, "mean_token_accuracy": 0.4342634192480492, "step": 4824 }, { "epoch": 0.8945124212087505, "grad_norm": 7.11328125, "learning_rate": 9.10548757879125e-06, "loss": 3.2459, "mean_token_accuracy": 0.41229985443959244, "step": 4825 }, { "epoch": 0.8946978123841305, "grad_norm": 4.9609375, "learning_rate": 9.10530218761587e-06, "loss": 2.8846, "mean_token_accuracy": 0.4488990444536768, "step": 4826 }, { "epoch": 0.8948832035595106, "grad_norm": 5.5078125, "learning_rate": 9.10511679644049e-06, "loss": 2.9612, "mean_token_accuracy": 0.43195975083852417, "step": 4827 }, { "epoch": 0.8950685947348906, "grad_norm": 6.140625, "learning_rate": 9.10493140526511e-06, "loss": 3.4644, "mean_token_accuracy": 0.3847089487402259, "step": 4828 }, { "epoch": 0.8952539859102707, "grad_norm": 6.36328125, "learning_rate": 9.104746014089731e-06, "loss": 2.7245, "mean_token_accuracy": 0.47996424847311187, "step": 4829 }, { "epoch": 0.8954393770856507, "grad_norm": 5.13671875, "learning_rate": 9.10456062291435e-06, "loss": 2.7935, "mean_token_accuracy": 0.46265440965239873, "step": 4830 }, { "epoch": 0.8956247682610308, "grad_norm": 5.953125, "learning_rate": 9.10437523173897e-06, "loss": 3.0615, "mean_token_accuracy": 0.4243281471004243, "step": 4831 }, { "epoch": 0.8958101594364108, "grad_norm": 6.0625, "learning_rate": 9.104189840563589e-06, "loss": 2.9999, "mean_token_accuracy": 0.41274362818590704, "step": 4832 }, { "epoch": 0.8959955506117909, "grad_norm": 5.31640625, "learning_rate": 9.10400444938821e-06, "loss": 3.0061, "mean_token_accuracy": 0.44740400216333154, "step": 4833 }, { "epoch": 0.8961809417871709, "grad_norm": 5.1328125, "learning_rate": 9.10381905821283e-06, "loss": 2.7098, "mean_token_accuracy": 0.47615085967831394, "step": 4834 }, { "epoch": 0.896366332962551, "grad_norm": 5.6640625, "learning_rate": 9.103633667037449e-06, "loss": 2.8705, "mean_token_accuracy": 0.44950625411454903, "step": 4835 }, { "epoch": 0.896551724137931, "grad_norm": 6.09765625, "learning_rate": 9.10344827586207e-06, "loss": 2.4584, "mean_token_accuracy": 0.48600987538208323, "step": 4836 }, { "epoch": 0.8967371153133111, "grad_norm": 5.671875, "learning_rate": 9.10326288468669e-06, "loss": 2.6915, "mean_token_accuracy": 0.4685051389030223, "step": 4837 }, { "epoch": 0.8969225064886911, "grad_norm": 6.7421875, "learning_rate": 9.10307749351131e-06, "loss": 2.7308, "mean_token_accuracy": 0.4481563626282229, "step": 4838 }, { "epoch": 0.8971078976640712, "grad_norm": 6.125, "learning_rate": 9.102892102335929e-06, "loss": 2.7726, "mean_token_accuracy": 0.44919590643274854, "step": 4839 }, { "epoch": 0.8972932888394513, "grad_norm": 7.3515625, "learning_rate": 9.10270671116055e-06, "loss": 3.0127, "mean_token_accuracy": 0.4177123552123552, "step": 4840 }, { "epoch": 0.8974786800148313, "grad_norm": 9.078125, "learning_rate": 9.102521319985168e-06, "loss": 2.6159, "mean_token_accuracy": 0.467239878101872, "step": 4841 }, { "epoch": 0.8976640711902113, "grad_norm": 6.62109375, "learning_rate": 9.102335928809789e-06, "loss": 2.7854, "mean_token_accuracy": 0.4451139638459523, "step": 4842 }, { "epoch": 0.8978494623655914, "grad_norm": 6.59765625, "learning_rate": 9.10215053763441e-06, "loss": 2.6825, "mean_token_accuracy": 0.4688304997424008, "step": 4843 }, { "epoch": 0.8980348535409715, "grad_norm": 5.6640625, "learning_rate": 9.10196514645903e-06, "loss": 2.5821, "mean_token_accuracy": 0.4620289033720601, "step": 4844 }, { "epoch": 0.8982202447163515, "grad_norm": 10.984375, "learning_rate": 9.10177975528365e-06, "loss": 2.5949, "mean_token_accuracy": 0.4669365721997301, "step": 4845 }, { "epoch": 0.8984056358917315, "grad_norm": 6.08984375, "learning_rate": 9.101594364108269e-06, "loss": 3.2982, "mean_token_accuracy": 0.4004474272930649, "step": 4846 }, { "epoch": 0.8985910270671116, "grad_norm": 6.26953125, "learning_rate": 9.10140897293289e-06, "loss": 3.2974, "mean_token_accuracy": 0.456050796812749, "step": 4847 }, { "epoch": 0.8987764182424917, "grad_norm": 6.87890625, "learning_rate": 9.101223581757508e-06, "loss": 2.7279, "mean_token_accuracy": 0.4588701684836472, "step": 4848 }, { "epoch": 0.8989618094178717, "grad_norm": 5.80859375, "learning_rate": 9.101038190582129e-06, "loss": 2.1861, "mean_token_accuracy": 0.5260521042084169, "step": 4849 }, { "epoch": 0.8991472005932518, "grad_norm": 5.15234375, "learning_rate": 9.10085279940675e-06, "loss": 2.8569, "mean_token_accuracy": 0.438132390588809, "step": 4850 }, { "epoch": 0.8993325917686318, "grad_norm": 7.42578125, "learning_rate": 9.100667408231368e-06, "loss": 2.7418, "mean_token_accuracy": 0.46966378977199535, "step": 4851 }, { "epoch": 0.8995179829440119, "grad_norm": 5.69140625, "learning_rate": 9.100482017055989e-06, "loss": 2.8113, "mean_token_accuracy": 0.4586403613025909, "step": 4852 }, { "epoch": 0.8997033741193919, "grad_norm": 7.87890625, "learning_rate": 9.10029662588061e-06, "loss": 3.0107, "mean_token_accuracy": 0.42656436827421285, "step": 4853 }, { "epoch": 0.899888765294772, "grad_norm": 9.734375, "learning_rate": 9.10011123470523e-06, "loss": 2.8243, "mean_token_accuracy": 0.4418943533697632, "step": 4854 }, { "epoch": 0.900074156470152, "grad_norm": 7.9140625, "learning_rate": 9.099925843529848e-06, "loss": 2.5415, "mean_token_accuracy": 0.4780982261373235, "step": 4855 }, { "epoch": 0.9002595476455321, "grad_norm": 6.43359375, "learning_rate": 9.099740452354469e-06, "loss": 2.6513, "mean_token_accuracy": 0.46058631921824106, "step": 4856 }, { "epoch": 0.9004449388209121, "grad_norm": 9.59375, "learning_rate": 9.099555061179088e-06, "loss": 2.3358, "mean_token_accuracy": 0.5088495575221239, "step": 4857 }, { "epoch": 0.9006303299962922, "grad_norm": 7.1015625, "learning_rate": 9.099369670003708e-06, "loss": 2.4836, "mean_token_accuracy": 0.49608114338404796, "step": 4858 }, { "epoch": 0.9008157211716722, "grad_norm": 6.95703125, "learning_rate": 9.099184278828329e-06, "loss": 3.0634, "mean_token_accuracy": 0.42346771550311374, "step": 4859 }, { "epoch": 0.9010011123470523, "grad_norm": 6.63671875, "learning_rate": 9.09899888765295e-06, "loss": 2.7997, "mean_token_accuracy": 0.44312078898545065, "step": 4860 }, { "epoch": 0.9011865035224323, "grad_norm": 6.04296875, "learning_rate": 9.098813496477568e-06, "loss": 2.9203, "mean_token_accuracy": 0.44080480922586185, "step": 4861 }, { "epoch": 0.9013718946978124, "grad_norm": 5.6796875, "learning_rate": 9.098628105302189e-06, "loss": 2.5721, "mean_token_accuracy": 0.4629313738354547, "step": 4862 }, { "epoch": 0.9015572858731924, "grad_norm": 6.578125, "learning_rate": 9.098442714126809e-06, "loss": 3.5373, "mean_token_accuracy": 0.3878078650156484, "step": 4863 }, { "epoch": 0.9017426770485725, "grad_norm": 6.484375, "learning_rate": 9.098257322951428e-06, "loss": 2.6734, "mean_token_accuracy": 0.46557745073584433, "step": 4864 }, { "epoch": 0.9019280682239526, "grad_norm": 5.06640625, "learning_rate": 9.098071931776048e-06, "loss": 3.0073, "mean_token_accuracy": 0.4343629343629344, "step": 4865 }, { "epoch": 0.9021134593993326, "grad_norm": 4.94921875, "learning_rate": 9.097886540600667e-06, "loss": 3.0821, "mean_token_accuracy": 0.4202827289489859, "step": 4866 }, { "epoch": 0.9022988505747126, "grad_norm": 5.8671875, "learning_rate": 9.097701149425288e-06, "loss": 3.0367, "mean_token_accuracy": 0.4293239683933275, "step": 4867 }, { "epoch": 0.9024842417500927, "grad_norm": 6.9921875, "learning_rate": 9.097515758249908e-06, "loss": 2.4986, "mean_token_accuracy": 0.4861438679245283, "step": 4868 }, { "epoch": 0.9026696329254728, "grad_norm": 6.45703125, "learning_rate": 9.097330367074529e-06, "loss": 2.666, "mean_token_accuracy": 0.4762937265105522, "step": 4869 }, { "epoch": 0.9028550241008528, "grad_norm": 6.22265625, "learning_rate": 9.097144975899147e-06, "loss": 3.0184, "mean_token_accuracy": 0.42739095096179897, "step": 4870 }, { "epoch": 0.9030404152762328, "grad_norm": 5.359375, "learning_rate": 9.096959584723768e-06, "loss": 2.9694, "mean_token_accuracy": 0.4375404530744337, "step": 4871 }, { "epoch": 0.9032258064516129, "grad_norm": 8.34375, "learning_rate": 9.096774193548388e-06, "loss": 2.6604, "mean_token_accuracy": 0.47622687662436936, "step": 4872 }, { "epoch": 0.903411197626993, "grad_norm": 5.43359375, "learning_rate": 9.096588802373007e-06, "loss": 3.0543, "mean_token_accuracy": 0.4283646888567294, "step": 4873 }, { "epoch": 0.903596588802373, "grad_norm": 6.94921875, "learning_rate": 9.096403411197628e-06, "loss": 2.6828, "mean_token_accuracy": 0.4630102040816326, "step": 4874 }, { "epoch": 0.9037819799777531, "grad_norm": 8.21875, "learning_rate": 9.096218020022246e-06, "loss": 2.8308, "mean_token_accuracy": 0.46330335028027636, "step": 4875 }, { "epoch": 0.9039673711531331, "grad_norm": 6.2421875, "learning_rate": 9.096032628846869e-06, "loss": 3.0079, "mean_token_accuracy": 0.43385469960532086, "step": 4876 }, { "epoch": 0.9041527623285132, "grad_norm": 8.203125, "learning_rate": 9.095847237671487e-06, "loss": 2.8519, "mean_token_accuracy": 0.44860203088051187, "step": 4877 }, { "epoch": 0.9043381535038932, "grad_norm": 7.59765625, "learning_rate": 9.095661846496108e-06, "loss": 2.811, "mean_token_accuracy": 0.4399512789281364, "step": 4878 }, { "epoch": 0.9045235446792733, "grad_norm": 5.359375, "learning_rate": 9.095476455320727e-06, "loss": 3.3264, "mean_token_accuracy": 0.40315804722584386, "step": 4879 }, { "epoch": 0.9047089358546533, "grad_norm": 8.3671875, "learning_rate": 9.095291064145347e-06, "loss": 2.5031, "mean_token_accuracy": 0.4665497369080431, "step": 4880 }, { "epoch": 0.9048943270300334, "grad_norm": 9.828125, "learning_rate": 9.095105672969968e-06, "loss": 2.8232, "mean_token_accuracy": 0.451505940867643, "step": 4881 }, { "epoch": 0.9050797182054134, "grad_norm": 5.53125, "learning_rate": 9.094920281794587e-06, "loss": 3.142, "mean_token_accuracy": 0.4085560016613595, "step": 4882 }, { "epoch": 0.9052651093807935, "grad_norm": 7.41015625, "learning_rate": 9.094734890619207e-06, "loss": 2.7081, "mean_token_accuracy": 0.4691276058956314, "step": 4883 }, { "epoch": 0.9054505005561735, "grad_norm": 5.84375, "learning_rate": 9.094549499443827e-06, "loss": 2.9185, "mean_token_accuracy": 0.43350785340314135, "step": 4884 }, { "epoch": 0.9056358917315536, "grad_norm": 7.765625, "learning_rate": 9.094364108268448e-06, "loss": 3.0834, "mean_token_accuracy": 0.43157327586206895, "step": 4885 }, { "epoch": 0.9058212829069336, "grad_norm": 5.93359375, "learning_rate": 9.094178717093067e-06, "loss": 3.0475, "mean_token_accuracy": 0.4337931034482759, "step": 4886 }, { "epoch": 0.9060066740823137, "grad_norm": 5.7109375, "learning_rate": 9.093993325917687e-06, "loss": 2.9418, "mean_token_accuracy": 0.4622816032887975, "step": 4887 }, { "epoch": 0.9061920652576937, "grad_norm": 5.96484375, "learning_rate": 9.093807934742308e-06, "loss": 2.3742, "mean_token_accuracy": 0.497049356223176, "step": 4888 }, { "epoch": 0.9063774564330738, "grad_norm": 6.37890625, "learning_rate": 9.093622543566927e-06, "loss": 2.3453, "mean_token_accuracy": 0.4954711087975013, "step": 4889 }, { "epoch": 0.9065628476084538, "grad_norm": 6.46875, "learning_rate": 9.093437152391547e-06, "loss": 3.0197, "mean_token_accuracy": 0.42697674418604653, "step": 4890 }, { "epoch": 0.9067482387838339, "grad_norm": 6.8984375, "learning_rate": 9.093251761216166e-06, "loss": 3.1975, "mean_token_accuracy": 0.41771602257924445, "step": 4891 }, { "epoch": 0.9069336299592139, "grad_norm": 5.7734375, "learning_rate": 9.093066370040788e-06, "loss": 2.5583, "mean_token_accuracy": 0.4734808102345416, "step": 4892 }, { "epoch": 0.907119021134594, "grad_norm": 8.0234375, "learning_rate": 9.092880978865407e-06, "loss": 3.2496, "mean_token_accuracy": 0.42100852074312056, "step": 4893 }, { "epoch": 0.907304412309974, "grad_norm": 6.90625, "learning_rate": 9.092695587690027e-06, "loss": 2.7741, "mean_token_accuracy": 0.4510031986042454, "step": 4894 }, { "epoch": 0.9074898034853541, "grad_norm": 5.75390625, "learning_rate": 9.092510196514646e-06, "loss": 2.8908, "mean_token_accuracy": 0.433028010088942, "step": 4895 }, { "epoch": 0.9076751946607341, "grad_norm": 8.8515625, "learning_rate": 9.092324805339267e-06, "loss": 2.8886, "mean_token_accuracy": 0.43879074396616075, "step": 4896 }, { "epoch": 0.9078605858361142, "grad_norm": 7.41796875, "learning_rate": 9.092139414163887e-06, "loss": 3.0682, "mean_token_accuracy": 0.42445565586829526, "step": 4897 }, { "epoch": 0.9080459770114943, "grad_norm": 6.703125, "learning_rate": 9.091954022988506e-06, "loss": 2.7231, "mean_token_accuracy": 0.45677498467198036, "step": 4898 }, { "epoch": 0.9082313681868743, "grad_norm": 6.51953125, "learning_rate": 9.091768631813126e-06, "loss": 2.887, "mean_token_accuracy": 0.4489100817438692, "step": 4899 }, { "epoch": 0.9084167593622544, "grad_norm": 9.203125, "learning_rate": 9.091583240637747e-06, "loss": 2.5935, "mean_token_accuracy": 0.4736216376876207, "step": 4900 }, { "epoch": 0.9086021505376344, "grad_norm": 6.875, "learning_rate": 9.091397849462367e-06, "loss": 3.0335, "mean_token_accuracy": 0.4358125318390219, "step": 4901 }, { "epoch": 0.9087875417130145, "grad_norm": 6.2578125, "learning_rate": 9.091212458286986e-06, "loss": 3.2273, "mean_token_accuracy": 0.408009286128845, "step": 4902 }, { "epoch": 0.9089729328883945, "grad_norm": 6.3203125, "learning_rate": 9.091027067111607e-06, "loss": 2.5726, "mean_token_accuracy": 0.4705722070844687, "step": 4903 }, { "epoch": 0.9091583240637746, "grad_norm": 6.73828125, "learning_rate": 9.090841675936225e-06, "loss": 2.3397, "mean_token_accuracy": 0.5137127690946623, "step": 4904 }, { "epoch": 0.9093437152391546, "grad_norm": 8.453125, "learning_rate": 9.090656284760846e-06, "loss": 2.9487, "mean_token_accuracy": 0.4239875792469919, "step": 4905 }, { "epoch": 0.9095291064145347, "grad_norm": 5.89453125, "learning_rate": 9.090470893585466e-06, "loss": 3.0251, "mean_token_accuracy": 0.43301913738931735, "step": 4906 }, { "epoch": 0.9097144975899147, "grad_norm": 7.0625, "learning_rate": 9.090285502410085e-06, "loss": 3.1901, "mean_token_accuracy": 0.43487193535132174, "step": 4907 }, { "epoch": 0.9098998887652948, "grad_norm": 6.76953125, "learning_rate": 9.090100111234706e-06, "loss": 2.5914, "mean_token_accuracy": 0.4698652450762624, "step": 4908 }, { "epoch": 0.9100852799406748, "grad_norm": 6.2109375, "learning_rate": 9.089914720059326e-06, "loss": 2.5262, "mean_token_accuracy": 0.49608310626702995, "step": 4909 }, { "epoch": 0.9102706711160549, "grad_norm": 4.90234375, "learning_rate": 9.089729328883947e-06, "loss": 2.761, "mean_token_accuracy": 0.43695479777954005, "step": 4910 }, { "epoch": 0.9104560622914349, "grad_norm": 7.796875, "learning_rate": 9.089543937708566e-06, "loss": 3.1522, "mean_token_accuracy": 0.41713543920380786, "step": 4911 }, { "epoch": 0.910641453466815, "grad_norm": 8.828125, "learning_rate": 9.089358546533186e-06, "loss": 2.8721, "mean_token_accuracy": 0.44594594594594594, "step": 4912 }, { "epoch": 0.910826844642195, "grad_norm": 6.30859375, "learning_rate": 9.089173155357805e-06, "loss": 2.5212, "mean_token_accuracy": 0.4867914849961529, "step": 4913 }, { "epoch": 0.9110122358175751, "grad_norm": 6.24609375, "learning_rate": 9.088987764182425e-06, "loss": 3.2299, "mean_token_accuracy": 0.43695594829860196, "step": 4914 }, { "epoch": 0.9111976269929551, "grad_norm": 6.30078125, "learning_rate": 9.088802373007046e-06, "loss": 3.4052, "mean_token_accuracy": 0.4020790020790021, "step": 4915 }, { "epoch": 0.9113830181683352, "grad_norm": 5.84375, "learning_rate": 9.088616981831666e-06, "loss": 3.1535, "mean_token_accuracy": 0.4408988764044944, "step": 4916 }, { "epoch": 0.9115684093437152, "grad_norm": 9.609375, "learning_rate": 9.088431590656285e-06, "loss": 2.4482, "mean_token_accuracy": 0.4887955182072829, "step": 4917 }, { "epoch": 0.9117538005190953, "grad_norm": 10.8125, "learning_rate": 9.088246199480906e-06, "loss": 2.8664, "mean_token_accuracy": 0.4285477453580902, "step": 4918 }, { "epoch": 0.9119391916944753, "grad_norm": 5.86328125, "learning_rate": 9.088060808305526e-06, "loss": 3.0842, "mean_token_accuracy": 0.4392743550477088, "step": 4919 }, { "epoch": 0.9121245828698554, "grad_norm": 9.0078125, "learning_rate": 9.087875417130145e-06, "loss": 3.1244, "mean_token_accuracy": 0.41763754045307444, "step": 4920 }, { "epoch": 0.9123099740452354, "grad_norm": 6.453125, "learning_rate": 9.087690025954765e-06, "loss": 3.2078, "mean_token_accuracy": 0.40724863600935307, "step": 4921 }, { "epoch": 0.9124953652206155, "grad_norm": 6.296875, "learning_rate": 9.087504634779384e-06, "loss": 3.4093, "mean_token_accuracy": 0.3962237162065513, "step": 4922 }, { "epoch": 0.9126807563959956, "grad_norm": 4.78515625, "learning_rate": 9.087319243604005e-06, "loss": 3.1374, "mean_token_accuracy": 0.41914227071405424, "step": 4923 }, { "epoch": 0.9128661475713756, "grad_norm": 6.56640625, "learning_rate": 9.087133852428625e-06, "loss": 3.7136, "mean_token_accuracy": 0.3911639512696316, "step": 4924 }, { "epoch": 0.9130515387467557, "grad_norm": 8.0703125, "learning_rate": 9.086948461253246e-06, "loss": 2.7809, "mean_token_accuracy": 0.44657503879405896, "step": 4925 }, { "epoch": 0.9132369299221357, "grad_norm": 5.515625, "learning_rate": 9.086763070077866e-06, "loss": 3.0026, "mean_token_accuracy": 0.4470687984496124, "step": 4926 }, { "epoch": 0.9134223210975158, "grad_norm": 7.2265625, "learning_rate": 9.086577678902485e-06, "loss": 2.9416, "mean_token_accuracy": 0.4349683108294296, "step": 4927 }, { "epoch": 0.9136077122728958, "grad_norm": 6.5546875, "learning_rate": 9.086392287727105e-06, "loss": 2.9747, "mean_token_accuracy": 0.4269135480505933, "step": 4928 }, { "epoch": 0.9137931034482759, "grad_norm": 6.421875, "learning_rate": 9.086206896551724e-06, "loss": 2.5886, "mean_token_accuracy": 0.46509768079515595, "step": 4929 }, { "epoch": 0.9139784946236559, "grad_norm": 5.7265625, "learning_rate": 9.086021505376345e-06, "loss": 2.9867, "mean_token_accuracy": 0.45144508670520234, "step": 4930 }, { "epoch": 0.914163885799036, "grad_norm": 5.99609375, "learning_rate": 9.085836114200965e-06, "loss": 2.3185, "mean_token_accuracy": 0.5096470588235295, "step": 4931 }, { "epoch": 0.914349276974416, "grad_norm": 5.9296875, "learning_rate": 9.085650723025586e-06, "loss": 2.8879, "mean_token_accuracy": 0.4369295062135709, "step": 4932 }, { "epoch": 0.9145346681497961, "grad_norm": 4.78125, "learning_rate": 9.085465331850204e-06, "loss": 2.8352, "mean_token_accuracy": 0.4431062601932004, "step": 4933 }, { "epoch": 0.9147200593251761, "grad_norm": 5.80859375, "learning_rate": 9.085279940674825e-06, "loss": 2.7868, "mean_token_accuracy": 0.450063211125158, "step": 4934 }, { "epoch": 0.9149054505005562, "grad_norm": 9.8984375, "learning_rate": 9.085094549499445e-06, "loss": 3.4765, "mean_token_accuracy": 0.4296482412060301, "step": 4935 }, { "epoch": 0.9150908416759362, "grad_norm": 7.19140625, "learning_rate": 9.084909158324064e-06, "loss": 2.9212, "mean_token_accuracy": 0.4395873629916183, "step": 4936 }, { "epoch": 0.9152762328513163, "grad_norm": 5.58984375, "learning_rate": 9.084723767148685e-06, "loss": 2.9659, "mean_token_accuracy": 0.44335497548694847, "step": 4937 }, { "epoch": 0.9154616240266963, "grad_norm": 5.78125, "learning_rate": 9.084538375973304e-06, "loss": 2.6547, "mean_token_accuracy": 0.473407056345445, "step": 4938 }, { "epoch": 0.9156470152020764, "grad_norm": 5.8203125, "learning_rate": 9.084352984797924e-06, "loss": 2.3808, "mean_token_accuracy": 0.5084867320105187, "step": 4939 }, { "epoch": 0.9158324063774564, "grad_norm": 8.0703125, "learning_rate": 9.084167593622545e-06, "loss": 2.943, "mean_token_accuracy": 0.42499644532916253, "step": 4940 }, { "epoch": 0.9160177975528365, "grad_norm": 6.33203125, "learning_rate": 9.083982202447165e-06, "loss": 3.1771, "mean_token_accuracy": 0.41646489104116224, "step": 4941 }, { "epoch": 0.9162031887282165, "grad_norm": 5.0, "learning_rate": 9.083796811271784e-06, "loss": 2.9426, "mean_token_accuracy": 0.4365754679931928, "step": 4942 }, { "epoch": 0.9163885799035966, "grad_norm": 5.6796875, "learning_rate": 9.083611420096404e-06, "loss": 2.5685, "mean_token_accuracy": 0.4848862574010595, "step": 4943 }, { "epoch": 0.9165739710789766, "grad_norm": 7.18359375, "learning_rate": 9.083426028921025e-06, "loss": 2.6273, "mean_token_accuracy": 0.46711746100992474, "step": 4944 }, { "epoch": 0.9167593622543567, "grad_norm": 6.41796875, "learning_rate": 9.083240637745644e-06, "loss": 2.54, "mean_token_accuracy": 0.4940161424993042, "step": 4945 }, { "epoch": 0.9169447534297367, "grad_norm": 6.50390625, "learning_rate": 9.083055246570264e-06, "loss": 2.8788, "mean_token_accuracy": 0.46175089754211546, "step": 4946 }, { "epoch": 0.9171301446051168, "grad_norm": 7.57421875, "learning_rate": 9.082869855394883e-06, "loss": 2.9309, "mean_token_accuracy": 0.45634333378178393, "step": 4947 }, { "epoch": 0.9173155357804968, "grad_norm": 8.21875, "learning_rate": 9.082684464219505e-06, "loss": 2.7246, "mean_token_accuracy": 0.4640896614821592, "step": 4948 }, { "epoch": 0.9175009269558769, "grad_norm": 5.796875, "learning_rate": 9.082499073044124e-06, "loss": 3.4655, "mean_token_accuracy": 0.39344901540261257, "step": 4949 }, { "epoch": 0.917686318131257, "grad_norm": 6.1640625, "learning_rate": 9.082313681868744e-06, "loss": 2.7255, "mean_token_accuracy": 0.4713722290221678, "step": 4950 }, { "epoch": 0.917871709306637, "grad_norm": 7.26171875, "learning_rate": 9.082128290693363e-06, "loss": 2.9082, "mean_token_accuracy": 0.44174940304134724, "step": 4951 }, { "epoch": 0.918057100482017, "grad_norm": 7.3046875, "learning_rate": 9.081942899517984e-06, "loss": 2.5959, "mean_token_accuracy": 0.4730250481695568, "step": 4952 }, { "epoch": 0.9182424916573971, "grad_norm": 5.83984375, "learning_rate": 9.081757508342604e-06, "loss": 2.62, "mean_token_accuracy": 0.493306781723642, "step": 4953 }, { "epoch": 0.9184278828327772, "grad_norm": 6.1171875, "learning_rate": 9.081572117167223e-06, "loss": 3.2458, "mean_token_accuracy": 0.4190392758066871, "step": 4954 }, { "epoch": 0.9186132740081572, "grad_norm": 6.98828125, "learning_rate": 9.081386725991843e-06, "loss": 2.7548, "mean_token_accuracy": 0.4729447282861124, "step": 4955 }, { "epoch": 0.9187986651835373, "grad_norm": 8.796875, "learning_rate": 9.081201334816462e-06, "loss": 2.8006, "mean_token_accuracy": 0.45722061575097805, "step": 4956 }, { "epoch": 0.9189840563589173, "grad_norm": 5.48828125, "learning_rate": 9.081015943641084e-06, "loss": 2.8963, "mean_token_accuracy": 0.4548692128208277, "step": 4957 }, { "epoch": 0.9191694475342974, "grad_norm": 5.92578125, "learning_rate": 9.080830552465703e-06, "loss": 2.6877, "mean_token_accuracy": 0.46788783355947533, "step": 4958 }, { "epoch": 0.9193548387096774, "grad_norm": 6.1796875, "learning_rate": 9.080645161290324e-06, "loss": 2.9887, "mean_token_accuracy": 0.4341346829748348, "step": 4959 }, { "epoch": 0.9195402298850575, "grad_norm": 5.984375, "learning_rate": 9.080459770114942e-06, "loss": 3.1555, "mean_token_accuracy": 0.4420103092783505, "step": 4960 }, { "epoch": 0.9197256210604375, "grad_norm": 7.5546875, "learning_rate": 9.080274378939563e-06, "loss": 3.0522, "mean_token_accuracy": 0.4373956594323873, "step": 4961 }, { "epoch": 0.9199110122358176, "grad_norm": 6.17578125, "learning_rate": 9.080088987764183e-06, "loss": 2.5525, "mean_token_accuracy": 0.4836576610617797, "step": 4962 }, { "epoch": 0.9200964034111976, "grad_norm": 7.390625, "learning_rate": 9.079903596588802e-06, "loss": 3.1188, "mean_token_accuracy": 0.4197799385875128, "step": 4963 }, { "epoch": 0.9202817945865777, "grad_norm": 6.5703125, "learning_rate": 9.079718205413423e-06, "loss": 2.6088, "mean_token_accuracy": 0.4762982689747004, "step": 4964 }, { "epoch": 0.9204671857619577, "grad_norm": 6.67578125, "learning_rate": 9.079532814238043e-06, "loss": 3.117, "mean_token_accuracy": 0.41052767384853567, "step": 4965 }, { "epoch": 0.9206525769373378, "grad_norm": 5.6640625, "learning_rate": 9.079347423062664e-06, "loss": 3.0117, "mean_token_accuracy": 0.4299807815502883, "step": 4966 }, { "epoch": 0.9208379681127178, "grad_norm": 5.94921875, "learning_rate": 9.079162031887283e-06, "loss": 2.7008, "mean_token_accuracy": 0.465907603716791, "step": 4967 }, { "epoch": 0.9210233592880979, "grad_norm": 6.61328125, "learning_rate": 9.078976640711903e-06, "loss": 3.0792, "mean_token_accuracy": 0.43321230651633863, "step": 4968 }, { "epoch": 0.9212087504634779, "grad_norm": 7.1171875, "learning_rate": 9.078791249536524e-06, "loss": 2.2011, "mean_token_accuracy": 0.523759899958316, "step": 4969 }, { "epoch": 0.921394141638858, "grad_norm": 6.703125, "learning_rate": 9.078605858361142e-06, "loss": 2.5621, "mean_token_accuracy": 0.47427154370737756, "step": 4970 }, { "epoch": 0.921579532814238, "grad_norm": 7.41015625, "learning_rate": 9.078420467185763e-06, "loss": 3.3999, "mean_token_accuracy": 0.40147819660014783, "step": 4971 }, { "epoch": 0.9217649239896181, "grad_norm": 7.11328125, "learning_rate": 9.078235076010382e-06, "loss": 2.5382, "mean_token_accuracy": 0.4794092928322728, "step": 4972 }, { "epoch": 0.9219503151649981, "grad_norm": 6.48828125, "learning_rate": 9.078049684835004e-06, "loss": 2.7149, "mean_token_accuracy": 0.45803085299455537, "step": 4973 }, { "epoch": 0.9221357063403782, "grad_norm": 10.0, "learning_rate": 9.077864293659623e-06, "loss": 2.6543, "mean_token_accuracy": 0.47031039136302294, "step": 4974 }, { "epoch": 0.9223210975157583, "grad_norm": 5.96875, "learning_rate": 9.077678902484243e-06, "loss": 2.8497, "mean_token_accuracy": 0.44804183355585225, "step": 4975 }, { "epoch": 0.9225064886911383, "grad_norm": 5.21484375, "learning_rate": 9.077493511308862e-06, "loss": 2.3512, "mean_token_accuracy": 0.48171990768857037, "step": 4976 }, { "epoch": 0.9226918798665183, "grad_norm": 9.46875, "learning_rate": 9.077308120133482e-06, "loss": 2.0914, "mean_token_accuracy": 0.5437234510033689, "step": 4977 }, { "epoch": 0.9228772710418984, "grad_norm": 7.9921875, "learning_rate": 9.077122728958103e-06, "loss": 2.4773, "mean_token_accuracy": 0.47540292847361215, "step": 4978 }, { "epoch": 0.9230626622172785, "grad_norm": 5.47265625, "learning_rate": 9.076937337782722e-06, "loss": 2.8065, "mean_token_accuracy": 0.44319474282825727, "step": 4979 }, { "epoch": 0.9232480533926585, "grad_norm": 6.671875, "learning_rate": 9.076751946607342e-06, "loss": 3.3323, "mean_token_accuracy": 0.4120306933406413, "step": 4980 }, { "epoch": 0.9234334445680386, "grad_norm": 5.078125, "learning_rate": 9.076566555431963e-06, "loss": 2.8433, "mean_token_accuracy": 0.44940607127144744, "step": 4981 }, { "epoch": 0.9236188357434186, "grad_norm": 6.015625, "learning_rate": 9.076381164256583e-06, "loss": 3.0572, "mean_token_accuracy": 0.42929022588587945, "step": 4982 }, { "epoch": 0.9238042269187987, "grad_norm": 5.46875, "learning_rate": 9.076195773081202e-06, "loss": 3.5526, "mean_token_accuracy": 0.384774677053545, "step": 4983 }, { "epoch": 0.9239896180941787, "grad_norm": 9.109375, "learning_rate": 9.076010381905822e-06, "loss": 2.7878, "mean_token_accuracy": 0.4655041698256255, "step": 4984 }, { "epoch": 0.9241750092695588, "grad_norm": 7.05078125, "learning_rate": 9.075824990730441e-06, "loss": 2.8629, "mean_token_accuracy": 0.44124117170313765, "step": 4985 }, { "epoch": 0.9243604004449388, "grad_norm": 6.9453125, "learning_rate": 9.075639599555062e-06, "loss": 3.0968, "mean_token_accuracy": 0.41042691847320123, "step": 4986 }, { "epoch": 0.9245457916203189, "grad_norm": 7.85546875, "learning_rate": 9.075454208379682e-06, "loss": 2.7458, "mean_token_accuracy": 0.4569264752287788, "step": 4987 }, { "epoch": 0.9247311827956989, "grad_norm": 6.44921875, "learning_rate": 9.075268817204301e-06, "loss": 3.2206, "mean_token_accuracy": 0.41347508646086845, "step": 4988 }, { "epoch": 0.924916573971079, "grad_norm": 7.046875, "learning_rate": 9.075083426028921e-06, "loss": 2.8966, "mean_token_accuracy": 0.4460118425635667, "step": 4989 }, { "epoch": 0.925101965146459, "grad_norm": 5.4609375, "learning_rate": 9.074898034853542e-06, "loss": 3.1181, "mean_token_accuracy": 0.4161348585189645, "step": 4990 }, { "epoch": 0.9252873563218391, "grad_norm": 5.609375, "learning_rate": 9.074712643678162e-06, "loss": 2.8617, "mean_token_accuracy": 0.4566467065868263, "step": 4991 }, { "epoch": 0.9254727474972191, "grad_norm": 6.6796875, "learning_rate": 9.074527252502781e-06, "loss": 2.9827, "mean_token_accuracy": 0.4315217391304348, "step": 4992 }, { "epoch": 0.9256581386725992, "grad_norm": 6.24609375, "learning_rate": 9.074341861327402e-06, "loss": 3.56, "mean_token_accuracy": 0.41300056401579244, "step": 4993 }, { "epoch": 0.9258435298479792, "grad_norm": 5.25390625, "learning_rate": 9.07415647015202e-06, "loss": 2.7723, "mean_token_accuracy": 0.447431693989071, "step": 4994 }, { "epoch": 0.9260289210233593, "grad_norm": 6.8046875, "learning_rate": 9.073971078976641e-06, "loss": 2.7114, "mean_token_accuracy": 0.4684607717499644, "step": 4995 }, { "epoch": 0.9262143121987393, "grad_norm": 8.0078125, "learning_rate": 9.073785687801262e-06, "loss": 2.9449, "mean_token_accuracy": 0.46177924217462935, "step": 4996 }, { "epoch": 0.9263997033741194, "grad_norm": 5.671875, "learning_rate": 9.073600296625882e-06, "loss": 2.8734, "mean_token_accuracy": 0.442861504019538, "step": 4997 }, { "epoch": 0.9265850945494994, "grad_norm": 7.87109375, "learning_rate": 9.0734149054505e-06, "loss": 2.3334, "mean_token_accuracy": 0.5043661547726589, "step": 4998 }, { "epoch": 0.9267704857248795, "grad_norm": 6.90625, "learning_rate": 9.073229514275121e-06, "loss": 3.4988, "mean_token_accuracy": 0.39118457300275483, "step": 4999 }, { "epoch": 0.9269558769002596, "grad_norm": 6.0078125, "learning_rate": 9.073044123099742e-06, "loss": 2.6032, "mean_token_accuracy": 0.4696289293311274, "step": 5000 }, { "epoch": 0.9271412680756396, "grad_norm": 6.69140625, "learning_rate": 9.07285873192436e-06, "loss": 3.2134, "mean_token_accuracy": 0.39330346616364065, "step": 5001 }, { "epoch": 0.9273266592510196, "grad_norm": 8.046875, "learning_rate": 9.072673340748981e-06, "loss": 2.857, "mean_token_accuracy": 0.4456237278278569, "step": 5002 }, { "epoch": 0.9275120504263997, "grad_norm": 8.984375, "learning_rate": 9.0724879495736e-06, "loss": 2.9382, "mean_token_accuracy": 0.45203509276571263, "step": 5003 }, { "epoch": 0.9276974416017798, "grad_norm": 6.3203125, "learning_rate": 9.07230255839822e-06, "loss": 2.8767, "mean_token_accuracy": 0.44558051930579845, "step": 5004 }, { "epoch": 0.9278828327771598, "grad_norm": 7.36328125, "learning_rate": 9.072117167222841e-06, "loss": 2.5572, "mean_token_accuracy": 0.4724333063864187, "step": 5005 }, { "epoch": 0.9280682239525399, "grad_norm": 5.84765625, "learning_rate": 9.071931776047461e-06, "loss": 2.6134, "mean_token_accuracy": 0.4877289149121714, "step": 5006 }, { "epoch": 0.9282536151279199, "grad_norm": 5.34765625, "learning_rate": 9.071746384872082e-06, "loss": 3.2212, "mean_token_accuracy": 0.42715141612200436, "step": 5007 }, { "epoch": 0.9284390063033, "grad_norm": 7.6015625, "learning_rate": 9.0715609936967e-06, "loss": 2.988, "mean_token_accuracy": 0.4092832414412104, "step": 5008 }, { "epoch": 0.92862439747868, "grad_norm": 6.1796875, "learning_rate": 9.071375602521321e-06, "loss": 2.8125, "mean_token_accuracy": 0.45897097625329814, "step": 5009 }, { "epoch": 0.92880978865406, "grad_norm": 5.40625, "learning_rate": 9.07119021134594e-06, "loss": 2.6624, "mean_token_accuracy": 0.45803008248423094, "step": 5010 }, { "epoch": 0.9289951798294401, "grad_norm": 5.71875, "learning_rate": 9.07100482017056e-06, "loss": 2.9895, "mean_token_accuracy": 0.4239567621920563, "step": 5011 }, { "epoch": 0.9291805710048202, "grad_norm": 5.1953125, "learning_rate": 9.070819428995181e-06, "loss": 2.6746, "mean_token_accuracy": 0.46896863010607087, "step": 5012 }, { "epoch": 0.9293659621802002, "grad_norm": 5.515625, "learning_rate": 9.070634037819801e-06, "loss": 3.2647, "mean_token_accuracy": 0.41482632338253245, "step": 5013 }, { "epoch": 0.9295513533555803, "grad_norm": 6.11328125, "learning_rate": 9.07044864664442e-06, "loss": 3.3161, "mean_token_accuracy": 0.41375257126065235, "step": 5014 }, { "epoch": 0.9297367445309603, "grad_norm": 6.7890625, "learning_rate": 9.07026325546904e-06, "loss": 2.5915, "mean_token_accuracy": 0.4809070383822519, "step": 5015 }, { "epoch": 0.9299221357063404, "grad_norm": 6.3984375, "learning_rate": 9.070077864293661e-06, "loss": 3.0112, "mean_token_accuracy": 0.4320102432778489, "step": 5016 }, { "epoch": 0.9301075268817204, "grad_norm": 8.6484375, "learning_rate": 9.06989247311828e-06, "loss": 2.255, "mean_token_accuracy": 0.5033981161321092, "step": 5017 }, { "epoch": 0.9302929180571005, "grad_norm": 8.421875, "learning_rate": 9.0697070819429e-06, "loss": 2.5231, "mean_token_accuracy": 0.4760695051315923, "step": 5018 }, { "epoch": 0.9304783092324805, "grad_norm": 8.03125, "learning_rate": 9.06952169076752e-06, "loss": 3.0176, "mean_token_accuracy": 0.43518187239117473, "step": 5019 }, { "epoch": 0.9306637004078606, "grad_norm": 5.50390625, "learning_rate": 9.06933629959214e-06, "loss": 2.5805, "mean_token_accuracy": 0.4695041684949539, "step": 5020 }, { "epoch": 0.9308490915832406, "grad_norm": 7.58203125, "learning_rate": 9.06915090841676e-06, "loss": 2.8542, "mean_token_accuracy": 0.4540878319736157, "step": 5021 }, { "epoch": 0.9310344827586207, "grad_norm": 9.34375, "learning_rate": 9.06896551724138e-06, "loss": 2.9099, "mean_token_accuracy": 0.44472396925227115, "step": 5022 }, { "epoch": 0.9312198739340007, "grad_norm": 6.25390625, "learning_rate": 9.068780126066e-06, "loss": 3.1117, "mean_token_accuracy": 0.4439751000444642, "step": 5023 }, { "epoch": 0.9314052651093808, "grad_norm": 6.21875, "learning_rate": 9.06859473489062e-06, "loss": 2.9356, "mean_token_accuracy": 0.4418349127867051, "step": 5024 }, { "epoch": 0.9315906562847609, "grad_norm": 7.578125, "learning_rate": 9.06840934371524e-06, "loss": 2.809, "mean_token_accuracy": 0.44796851487440675, "step": 5025 }, { "epoch": 0.9317760474601409, "grad_norm": 7.67578125, "learning_rate": 9.06822395253986e-06, "loss": 2.3462, "mean_token_accuracy": 0.49228626526676666, "step": 5026 }, { "epoch": 0.931961438635521, "grad_norm": 7.54296875, "learning_rate": 9.06803856136448e-06, "loss": 2.4471, "mean_token_accuracy": 0.47226298796595245, "step": 5027 }, { "epoch": 0.932146829810901, "grad_norm": 5.9921875, "learning_rate": 9.067853170189099e-06, "loss": 2.7297, "mean_token_accuracy": 0.45342147141633005, "step": 5028 }, { "epoch": 0.9323322209862811, "grad_norm": 5.30078125, "learning_rate": 9.06766777901372e-06, "loss": 2.9192, "mean_token_accuracy": 0.41695114773396114, "step": 5029 }, { "epoch": 0.9325176121616611, "grad_norm": 7.73828125, "learning_rate": 9.06748238783834e-06, "loss": 2.6566, "mean_token_accuracy": 0.4622186495176849, "step": 5030 }, { "epoch": 0.9327030033370411, "grad_norm": 7.70703125, "learning_rate": 9.06729699666296e-06, "loss": 2.9391, "mean_token_accuracy": 0.44421620233662856, "step": 5031 }, { "epoch": 0.9328883945124212, "grad_norm": 6.8046875, "learning_rate": 9.067111605487579e-06, "loss": 3.0469, "mean_token_accuracy": 0.4385182644486366, "step": 5032 }, { "epoch": 0.9330737856878013, "grad_norm": 5.3828125, "learning_rate": 9.0669262143122e-06, "loss": 2.6182, "mean_token_accuracy": 0.4535098960558166, "step": 5033 }, { "epoch": 0.9332591768631813, "grad_norm": 6.78515625, "learning_rate": 9.06674082313682e-06, "loss": 2.5284, "mean_token_accuracy": 0.47643176997407494, "step": 5034 }, { "epoch": 0.9334445680385614, "grad_norm": 6.98828125, "learning_rate": 9.066555431961439e-06, "loss": 2.7124, "mean_token_accuracy": 0.46201592227769533, "step": 5035 }, { "epoch": 0.9336299592139414, "grad_norm": 9.234375, "learning_rate": 9.06637004078606e-06, "loss": 2.5526, "mean_token_accuracy": 0.4859393806762228, "step": 5036 }, { "epoch": 0.9338153503893215, "grad_norm": 7.45703125, "learning_rate": 9.06618464961068e-06, "loss": 2.9983, "mean_token_accuracy": 0.43536977491961415, "step": 5037 }, { "epoch": 0.9340007415647015, "grad_norm": 6.296875, "learning_rate": 9.0659992584353e-06, "loss": 2.4304, "mean_token_accuracy": 0.5121806298276886, "step": 5038 }, { "epoch": 0.9341861327400816, "grad_norm": 5.23828125, "learning_rate": 9.065813867259919e-06, "loss": 3.2227, "mean_token_accuracy": 0.4051336332363059, "step": 5039 }, { "epoch": 0.9343715239154616, "grad_norm": 6.97265625, "learning_rate": 9.06562847608454e-06, "loss": 2.7968, "mean_token_accuracy": 0.44825305852529485, "step": 5040 }, { "epoch": 0.9345569150908417, "grad_norm": 5.73046875, "learning_rate": 9.065443084909158e-06, "loss": 2.524, "mean_token_accuracy": 0.4919157275845174, "step": 5041 }, { "epoch": 0.9347423062662217, "grad_norm": 4.3046875, "learning_rate": 9.065257693733779e-06, "loss": 2.7022, "mean_token_accuracy": 0.4552004648460198, "step": 5042 }, { "epoch": 0.9349276974416018, "grad_norm": 6.30859375, "learning_rate": 9.0650723025584e-06, "loss": 3.0691, "mean_token_accuracy": 0.4241849886277483, "step": 5043 }, { "epoch": 0.9351130886169818, "grad_norm": 5.640625, "learning_rate": 9.064886911383018e-06, "loss": 2.8632, "mean_token_accuracy": 0.43985042735042734, "step": 5044 }, { "epoch": 0.9352984797923619, "grad_norm": 6.23046875, "learning_rate": 9.06470152020764e-06, "loss": 2.615, "mean_token_accuracy": 0.47962529274004684, "step": 5045 }, { "epoch": 0.9354838709677419, "grad_norm": 6.08203125, "learning_rate": 9.064516129032259e-06, "loss": 3.0175, "mean_token_accuracy": 0.43249277646440765, "step": 5046 }, { "epoch": 0.935669262143122, "grad_norm": 8.5078125, "learning_rate": 9.06433073785688e-06, "loss": 2.2219, "mean_token_accuracy": 0.5098530992475815, "step": 5047 }, { "epoch": 0.935854653318502, "grad_norm": 7.078125, "learning_rate": 9.064145346681498e-06, "loss": 2.9151, "mean_token_accuracy": 0.4275294117647059, "step": 5048 }, { "epoch": 0.9360400444938821, "grad_norm": 7.69921875, "learning_rate": 9.063959955506119e-06, "loss": 2.79, "mean_token_accuracy": 0.46177152022812334, "step": 5049 }, { "epoch": 0.9362254356692622, "grad_norm": 5.88671875, "learning_rate": 9.06377456433074e-06, "loss": 3.5509, "mean_token_accuracy": 0.372310570626754, "step": 5050 }, { "epoch": 0.9364108268446422, "grad_norm": 6.59375, "learning_rate": 9.063589173155358e-06, "loss": 2.6417, "mean_token_accuracy": 0.4562129515714126, "step": 5051 }, { "epoch": 0.9365962180200222, "grad_norm": 6.22265625, "learning_rate": 9.063403781979979e-06, "loss": 2.8197, "mean_token_accuracy": 0.44240400667779634, "step": 5052 }, { "epoch": 0.9367816091954023, "grad_norm": 7.74609375, "learning_rate": 9.063218390804599e-06, "loss": 3.8114, "mean_token_accuracy": 0.38948380010982975, "step": 5053 }, { "epoch": 0.9369670003707824, "grad_norm": 6.08203125, "learning_rate": 9.06303299962922e-06, "loss": 2.7711, "mean_token_accuracy": 0.4529924740174412, "step": 5054 }, { "epoch": 0.9371523915461624, "grad_norm": 5.0859375, "learning_rate": 9.062847608453838e-06, "loss": 2.6152, "mean_token_accuracy": 0.4714538765499387, "step": 5055 }, { "epoch": 0.9373377827215424, "grad_norm": 7.8359375, "learning_rate": 9.062662217278459e-06, "loss": 2.826, "mean_token_accuracy": 0.4443468072642062, "step": 5056 }, { "epoch": 0.9375231738969225, "grad_norm": 8.1953125, "learning_rate": 9.062476826103078e-06, "loss": 2.7945, "mean_token_accuracy": 0.4477390659747961, "step": 5057 }, { "epoch": 0.9377085650723026, "grad_norm": 6.69921875, "learning_rate": 9.062291434927698e-06, "loss": 2.7296, "mean_token_accuracy": 0.45717904068386894, "step": 5058 }, { "epoch": 0.9378939562476826, "grad_norm": 5.625, "learning_rate": 9.062106043752319e-06, "loss": 2.6638, "mean_token_accuracy": 0.46437768240343347, "step": 5059 }, { "epoch": 0.9380793474230626, "grad_norm": 5.61328125, "learning_rate": 9.061920652576937e-06, "loss": 2.9125, "mean_token_accuracy": 0.44624912362701563, "step": 5060 }, { "epoch": 0.9382647385984427, "grad_norm": 9.0546875, "learning_rate": 9.061735261401558e-06, "loss": 2.6639, "mean_token_accuracy": 0.46072642691000176, "step": 5061 }, { "epoch": 0.9384501297738228, "grad_norm": 12.9140625, "learning_rate": 9.061549870226178e-06, "loss": 2.7657, "mean_token_accuracy": 0.4651775804661487, "step": 5062 }, { "epoch": 0.9386355209492028, "grad_norm": 5.75390625, "learning_rate": 9.061364479050799e-06, "loss": 2.6755, "mean_token_accuracy": 0.46603598014888337, "step": 5063 }, { "epoch": 0.9388209121245829, "grad_norm": 6.23828125, "learning_rate": 9.061179087875418e-06, "loss": 2.5667, "mean_token_accuracy": 0.47568710359408034, "step": 5064 }, { "epoch": 0.9390063032999629, "grad_norm": 7.4453125, "learning_rate": 9.060993696700038e-06, "loss": 2.9105, "mean_token_accuracy": 0.4552574143022211, "step": 5065 }, { "epoch": 0.939191694475343, "grad_norm": 7.33984375, "learning_rate": 9.060808305524657e-06, "loss": 2.6523, "mean_token_accuracy": 0.4586961375306988, "step": 5066 }, { "epoch": 0.939377085650723, "grad_norm": 6.28125, "learning_rate": 9.060622914349277e-06, "loss": 3.3778, "mean_token_accuracy": 0.43209876543209874, "step": 5067 }, { "epoch": 0.939562476826103, "grad_norm": 7.87109375, "learning_rate": 9.060437523173898e-06, "loss": 2.663, "mean_token_accuracy": 0.46652615918670964, "step": 5068 }, { "epoch": 0.9397478680014831, "grad_norm": 6.6015625, "learning_rate": 9.060252131998518e-06, "loss": 2.9997, "mean_token_accuracy": 0.4517708689215351, "step": 5069 }, { "epoch": 0.9399332591768632, "grad_norm": 7.66796875, "learning_rate": 9.060066740823137e-06, "loss": 2.6694, "mean_token_accuracy": 0.47691123653155465, "step": 5070 }, { "epoch": 0.9401186503522432, "grad_norm": 9.53125, "learning_rate": 9.059881349647758e-06, "loss": 1.9966, "mean_token_accuracy": 0.5260179603887317, "step": 5071 }, { "epoch": 0.9403040415276233, "grad_norm": 9.2265625, "learning_rate": 9.059695958472378e-06, "loss": 2.3724, "mean_token_accuracy": 0.4982536270822139, "step": 5072 }, { "epoch": 0.9404894327030033, "grad_norm": 7.6171875, "learning_rate": 9.059510567296997e-06, "loss": 2.6403, "mean_token_accuracy": 0.46832160059281214, "step": 5073 }, { "epoch": 0.9406748238783834, "grad_norm": 7.484375, "learning_rate": 9.059325176121617e-06, "loss": 3.6392, "mean_token_accuracy": 0.3676419163072619, "step": 5074 }, { "epoch": 0.9408602150537635, "grad_norm": 8.1953125, "learning_rate": 9.059139784946236e-06, "loss": 3.1878, "mean_token_accuracy": 0.43460081013981444, "step": 5075 }, { "epoch": 0.9410456062291435, "grad_norm": 6.23046875, "learning_rate": 9.058954393770857e-06, "loss": 2.8118, "mean_token_accuracy": 0.4408090422367638, "step": 5076 }, { "epoch": 0.9412309974045235, "grad_norm": 5.94921875, "learning_rate": 9.058769002595477e-06, "loss": 2.9962, "mean_token_accuracy": 0.4208365409289197, "step": 5077 }, { "epoch": 0.9414163885799036, "grad_norm": 6.19921875, "learning_rate": 9.058583611420098e-06, "loss": 2.9726, "mean_token_accuracy": 0.4538629965592743, "step": 5078 }, { "epoch": 0.9416017797552837, "grad_norm": 6.15625, "learning_rate": 9.058398220244717e-06, "loss": 2.8894, "mean_token_accuracy": 0.4475616438356164, "step": 5079 }, { "epoch": 0.9417871709306637, "grad_norm": 6.0390625, "learning_rate": 9.058212829069337e-06, "loss": 3.0562, "mean_token_accuracy": 0.43910865434444635, "step": 5080 }, { "epoch": 0.9419725621060437, "grad_norm": 8.8359375, "learning_rate": 9.058027437893958e-06, "loss": 3.125, "mean_token_accuracy": 0.41188603841691374, "step": 5081 }, { "epoch": 0.9421579532814238, "grad_norm": 20.90625, "learning_rate": 9.057842046718576e-06, "loss": 3.1916, "mean_token_accuracy": 0.47096848666743624, "step": 5082 }, { "epoch": 0.9423433444568039, "grad_norm": 7.65625, "learning_rate": 9.057656655543197e-06, "loss": 2.5256, "mean_token_accuracy": 0.4723337406179089, "step": 5083 }, { "epoch": 0.9425287356321839, "grad_norm": 11.109375, "learning_rate": 9.057471264367816e-06, "loss": 3.8565, "mean_token_accuracy": 0.39267910554052127, "step": 5084 }, { "epoch": 0.942714126807564, "grad_norm": 6.87109375, "learning_rate": 9.057285873192436e-06, "loss": 2.5803, "mean_token_accuracy": 0.48431214802896216, "step": 5085 }, { "epoch": 0.942899517982944, "grad_norm": 6.359375, "learning_rate": 9.057100482017057e-06, "loss": 3.1399, "mean_token_accuracy": 0.3987034035656402, "step": 5086 }, { "epoch": 0.9430849091583241, "grad_norm": 6.390625, "learning_rate": 9.056915090841677e-06, "loss": 2.493, "mean_token_accuracy": 0.4829188302814977, "step": 5087 }, { "epoch": 0.9432703003337041, "grad_norm": 5.9140625, "learning_rate": 9.056729699666298e-06, "loss": 2.5376, "mean_token_accuracy": 0.46644388749651905, "step": 5088 }, { "epoch": 0.9434556915090841, "grad_norm": 9.1953125, "learning_rate": 9.056544308490916e-06, "loss": 2.8615, "mean_token_accuracy": 0.4608012568735271, "step": 5089 }, { "epoch": 0.9436410826844642, "grad_norm": 7.5, "learning_rate": 9.056358917315537e-06, "loss": 2.6794, "mean_token_accuracy": 0.44646051622795807, "step": 5090 }, { "epoch": 0.9438264738598443, "grad_norm": 8.0, "learning_rate": 9.056173526140156e-06, "loss": 2.7121, "mean_token_accuracy": 0.46624843161856966, "step": 5091 }, { "epoch": 0.9440118650352243, "grad_norm": 8.359375, "learning_rate": 9.055988134964776e-06, "loss": 3.4026, "mean_token_accuracy": 0.40621135083138654, "step": 5092 }, { "epoch": 0.9441972562106044, "grad_norm": 6.81640625, "learning_rate": 9.055802743789397e-06, "loss": 3.1415, "mean_token_accuracy": 0.4341147018661812, "step": 5093 }, { "epoch": 0.9443826473859844, "grad_norm": 7.1640625, "learning_rate": 9.055617352614017e-06, "loss": 2.8629, "mean_token_accuracy": 0.44316016766162836, "step": 5094 }, { "epoch": 0.9445680385613645, "grad_norm": 6.46484375, "learning_rate": 9.055431961438636e-06, "loss": 2.8548, "mean_token_accuracy": 0.4570001189484953, "step": 5095 }, { "epoch": 0.9447534297367445, "grad_norm": 6.4921875, "learning_rate": 9.055246570263256e-06, "loss": 2.5373, "mean_token_accuracy": 0.48470209339774556, "step": 5096 }, { "epoch": 0.9449388209121246, "grad_norm": 6.9453125, "learning_rate": 9.055061179087877e-06, "loss": 3.6893, "mean_token_accuracy": 0.3631725417439703, "step": 5097 }, { "epoch": 0.9451242120875046, "grad_norm": 8.28125, "learning_rate": 9.054875787912496e-06, "loss": 2.7876, "mean_token_accuracy": 0.47638660076880834, "step": 5098 }, { "epoch": 0.9453096032628847, "grad_norm": 8.5625, "learning_rate": 9.054690396737116e-06, "loss": 3.4998, "mean_token_accuracy": 0.3807572760666855, "step": 5099 }, { "epoch": 0.9454949944382648, "grad_norm": 7.640625, "learning_rate": 9.054505005561735e-06, "loss": 3.4905, "mean_token_accuracy": 0.408623417721519, "step": 5100 }, { "epoch": 0.9456803856136448, "grad_norm": 9.6875, "learning_rate": 9.054319614386356e-06, "loss": 2.83, "mean_token_accuracy": 0.4509908232477723, "step": 5101 }, { "epoch": 0.9458657767890248, "grad_norm": 6.046875, "learning_rate": 9.054134223210976e-06, "loss": 2.5938, "mean_token_accuracy": 0.4827832547506787, "step": 5102 }, { "epoch": 0.9460511679644049, "grad_norm": 5.9453125, "learning_rate": 9.053948832035596e-06, "loss": 2.8142, "mean_token_accuracy": 0.42935288640595903, "step": 5103 }, { "epoch": 0.946236559139785, "grad_norm": 5.59375, "learning_rate": 9.053763440860215e-06, "loss": 3.0578, "mean_token_accuracy": 0.430240669689571, "step": 5104 }, { "epoch": 0.946421950315165, "grad_norm": 5.40234375, "learning_rate": 9.053578049684836e-06, "loss": 2.2834, "mean_token_accuracy": 0.5416318241548203, "step": 5105 }, { "epoch": 0.946607341490545, "grad_norm": 6.34765625, "learning_rate": 9.053392658509456e-06, "loss": 2.7072, "mean_token_accuracy": 0.465514456469068, "step": 5106 }, { "epoch": 0.9467927326659251, "grad_norm": 6.1015625, "learning_rate": 9.053207267334075e-06, "loss": 3.038, "mean_token_accuracy": 0.4057322529931068, "step": 5107 }, { "epoch": 0.9469781238413052, "grad_norm": 7.03125, "learning_rate": 9.053021876158696e-06, "loss": 3.0117, "mean_token_accuracy": 0.4217802570943488, "step": 5108 }, { "epoch": 0.9471635150166852, "grad_norm": 9.828125, "learning_rate": 9.052836484983314e-06, "loss": 2.7111, "mean_token_accuracy": 0.43283582089552236, "step": 5109 }, { "epoch": 0.9473489061920652, "grad_norm": 6.734375, "learning_rate": 9.052651093807937e-06, "loss": 2.7074, "mean_token_accuracy": 0.4521354933726068, "step": 5110 }, { "epoch": 0.9475342973674453, "grad_norm": 5.88671875, "learning_rate": 9.052465702632555e-06, "loss": 3.2006, "mean_token_accuracy": 0.41606337931976506, "step": 5111 }, { "epoch": 0.9477196885428254, "grad_norm": 5.98828125, "learning_rate": 9.052280311457176e-06, "loss": 2.9331, "mean_token_accuracy": 0.4601884114079236, "step": 5112 }, { "epoch": 0.9479050797182054, "grad_norm": 8.1796875, "learning_rate": 9.052094920281795e-06, "loss": 2.8859, "mean_token_accuracy": 0.4550881577120644, "step": 5113 }, { "epoch": 0.9480904708935854, "grad_norm": 5.84765625, "learning_rate": 9.051909529106415e-06, "loss": 2.8655, "mean_token_accuracy": 0.44434675575874805, "step": 5114 }, { "epoch": 0.9482758620689655, "grad_norm": 6.70703125, "learning_rate": 9.051724137931036e-06, "loss": 3.096, "mean_token_accuracy": 0.43569001779647304, "step": 5115 }, { "epoch": 0.9484612532443456, "grad_norm": 5.9609375, "learning_rate": 9.051538746755654e-06, "loss": 2.3927, "mean_token_accuracy": 0.5415167650021586, "step": 5116 }, { "epoch": 0.9486466444197256, "grad_norm": 8.2890625, "learning_rate": 9.051353355580275e-06, "loss": 2.478, "mean_token_accuracy": 0.4662173546756529, "step": 5117 }, { "epoch": 0.9488320355951056, "grad_norm": 5.4765625, "learning_rate": 9.051167964404895e-06, "loss": 2.7463, "mean_token_accuracy": 0.45396536007292615, "step": 5118 }, { "epoch": 0.9490174267704857, "grad_norm": 6.12890625, "learning_rate": 9.050982573229516e-06, "loss": 2.4743, "mean_token_accuracy": 0.48412415269354264, "step": 5119 }, { "epoch": 0.9492028179458658, "grad_norm": 7.57421875, "learning_rate": 9.050797182054135e-06, "loss": 2.8984, "mean_token_accuracy": 0.44429795649307846, "step": 5120 }, { "epoch": 0.9493882091212458, "grad_norm": 7.59375, "learning_rate": 9.050611790878755e-06, "loss": 2.8695, "mean_token_accuracy": 0.4414830452566302, "step": 5121 }, { "epoch": 0.9495736002966259, "grad_norm": 8.578125, "learning_rate": 9.050426399703374e-06, "loss": 2.7572, "mean_token_accuracy": 0.4468139121604468, "step": 5122 }, { "epoch": 0.9497589914720059, "grad_norm": 5.84375, "learning_rate": 9.050241008527994e-06, "loss": 3.2567, "mean_token_accuracy": 0.4127341879137907, "step": 5123 }, { "epoch": 0.949944382647386, "grad_norm": 9.6640625, "learning_rate": 9.050055617352615e-06, "loss": 2.5905, "mean_token_accuracy": 0.46847395674687786, "step": 5124 }, { "epoch": 0.9501297738227661, "grad_norm": 6.796875, "learning_rate": 9.049870226177234e-06, "loss": 3.4738, "mean_token_accuracy": 0.40679611650485437, "step": 5125 }, { "epoch": 0.9503151649981461, "grad_norm": 5.7265625, "learning_rate": 9.049684835001856e-06, "loss": 2.7231, "mean_token_accuracy": 0.44760110160892885, "step": 5126 }, { "epoch": 0.9505005561735261, "grad_norm": 7.3359375, "learning_rate": 9.049499443826475e-06, "loss": 2.9174, "mean_token_accuracy": 0.45037504076530055, "step": 5127 }, { "epoch": 0.9506859473489062, "grad_norm": 6.078125, "learning_rate": 9.049314052651095e-06, "loss": 3.0871, "mean_token_accuracy": 0.43454871488344293, "step": 5128 }, { "epoch": 0.9508713385242863, "grad_norm": 8.2109375, "learning_rate": 9.049128661475714e-06, "loss": 2.7704, "mean_token_accuracy": 0.48210987996306554, "step": 5129 }, { "epoch": 0.9510567296996663, "grad_norm": 8.9921875, "learning_rate": 9.048943270300335e-06, "loss": 2.6709, "mean_token_accuracy": 0.47009653873322343, "step": 5130 }, { "epoch": 0.9512421208750463, "grad_norm": 7.9921875, "learning_rate": 9.048757879124955e-06, "loss": 2.6169, "mean_token_accuracy": 0.47681389942493574, "step": 5131 }, { "epoch": 0.9514275120504264, "grad_norm": 7.40625, "learning_rate": 9.048572487949574e-06, "loss": 3.1055, "mean_token_accuracy": 0.4241910229645094, "step": 5132 }, { "epoch": 0.9516129032258065, "grad_norm": 8.1328125, "learning_rate": 9.048387096774194e-06, "loss": 2.6471, "mean_token_accuracy": 0.482382682888292, "step": 5133 }, { "epoch": 0.9517982944011865, "grad_norm": 7.58984375, "learning_rate": 9.048201705598815e-06, "loss": 2.4732, "mean_token_accuracy": 0.5067913604987754, "step": 5134 }, { "epoch": 0.9519836855765665, "grad_norm": 5.828125, "learning_rate": 9.048016314423435e-06, "loss": 2.9442, "mean_token_accuracy": 0.42762507415463713, "step": 5135 }, { "epoch": 0.9521690767519466, "grad_norm": 5.6484375, "learning_rate": 9.047830923248054e-06, "loss": 3.4461, "mean_token_accuracy": 0.38918106686701726, "step": 5136 }, { "epoch": 0.9523544679273267, "grad_norm": 7.81640625, "learning_rate": 9.047645532072675e-06, "loss": 2.8653, "mean_token_accuracy": 0.43877109476417137, "step": 5137 }, { "epoch": 0.9525398591027067, "grad_norm": 8.125, "learning_rate": 9.047460140897293e-06, "loss": 2.6481, "mean_token_accuracy": 0.4720449323461833, "step": 5138 }, { "epoch": 0.9527252502780867, "grad_norm": 8.71875, "learning_rate": 9.047274749721914e-06, "loss": 2.9996, "mean_token_accuracy": 0.4242208623879323, "step": 5139 }, { "epoch": 0.9529106414534668, "grad_norm": 7.67578125, "learning_rate": 9.047089358546534e-06, "loss": 3.0508, "mean_token_accuracy": 0.43106382978723407, "step": 5140 }, { "epoch": 0.9530960326288469, "grad_norm": 6.15234375, "learning_rate": 9.046903967371153e-06, "loss": 2.5805, "mean_token_accuracy": 0.4807121661721068, "step": 5141 }, { "epoch": 0.9532814238042269, "grad_norm": 8.046875, "learning_rate": 9.046718576195774e-06, "loss": 2.2334, "mean_token_accuracy": 0.5171836787231304, "step": 5142 }, { "epoch": 0.953466814979607, "grad_norm": 8.125, "learning_rate": 9.046533185020394e-06, "loss": 2.8847, "mean_token_accuracy": 0.4715713583483533, "step": 5143 }, { "epoch": 0.953652206154987, "grad_norm": 6.63671875, "learning_rate": 9.046347793845015e-06, "loss": 2.7305, "mean_token_accuracy": 0.44865027899261045, "step": 5144 }, { "epoch": 0.9538375973303671, "grad_norm": 7.9765625, "learning_rate": 9.046162402669633e-06, "loss": 2.8197, "mean_token_accuracy": 0.4576057431121459, "step": 5145 }, { "epoch": 0.9540229885057471, "grad_norm": 6.26953125, "learning_rate": 9.045977011494254e-06, "loss": 2.8074, "mean_token_accuracy": 0.4429158110882957, "step": 5146 }, { "epoch": 0.9542083796811272, "grad_norm": 6.3203125, "learning_rate": 9.045791620318873e-06, "loss": 3.0816, "mean_token_accuracy": 0.4258615238697439, "step": 5147 }, { "epoch": 0.9543937708565072, "grad_norm": 5.49609375, "learning_rate": 9.045606229143493e-06, "loss": 2.5723, "mean_token_accuracy": 0.4476231758416414, "step": 5148 }, { "epoch": 0.9545791620318873, "grad_norm": 8.2265625, "learning_rate": 9.045420837968114e-06, "loss": 2.9124, "mean_token_accuracy": 0.4569065583284968, "step": 5149 }, { "epoch": 0.9547645532072674, "grad_norm": 7.3828125, "learning_rate": 9.045235446792734e-06, "loss": 2.7596, "mean_token_accuracy": 0.4764292878635908, "step": 5150 }, { "epoch": 0.9549499443826474, "grad_norm": 7.19921875, "learning_rate": 9.045050055617353e-06, "loss": 2.3494, "mean_token_accuracy": 0.5188296234075318, "step": 5151 }, { "epoch": 0.9551353355580274, "grad_norm": 6.3828125, "learning_rate": 9.044864664441973e-06, "loss": 2.9067, "mean_token_accuracy": 0.4331896551724138, "step": 5152 }, { "epoch": 0.9553207267334075, "grad_norm": 5.4140625, "learning_rate": 9.044679273266594e-06, "loss": 2.907, "mean_token_accuracy": 0.4327605019094381, "step": 5153 }, { "epoch": 0.9555061179087876, "grad_norm": 5.44140625, "learning_rate": 9.044493882091213e-06, "loss": 3.4317, "mean_token_accuracy": 0.37875710804224205, "step": 5154 }, { "epoch": 0.9556915090841676, "grad_norm": 11.4296875, "learning_rate": 9.044308490915833e-06, "loss": 1.8909, "mean_token_accuracy": 0.5749523204068658, "step": 5155 }, { "epoch": 0.9558769002595476, "grad_norm": 7.46875, "learning_rate": 9.044123099740452e-06, "loss": 2.6433, "mean_token_accuracy": 0.47192420231294413, "step": 5156 }, { "epoch": 0.9560622914349277, "grad_norm": 5.84765625, "learning_rate": 9.043937708565073e-06, "loss": 2.753, "mean_token_accuracy": 0.45921488656640325, "step": 5157 }, { "epoch": 0.9562476826103078, "grad_norm": 6.0625, "learning_rate": 9.043752317389693e-06, "loss": 3.0041, "mean_token_accuracy": 0.4222384784198976, "step": 5158 }, { "epoch": 0.9564330737856878, "grad_norm": 6.05859375, "learning_rate": 9.043566926214314e-06, "loss": 2.7249, "mean_token_accuracy": 0.4567667075755885, "step": 5159 }, { "epoch": 0.9566184649610678, "grad_norm": 5.55078125, "learning_rate": 9.043381535038932e-06, "loss": 2.7053, "mean_token_accuracy": 0.43915997529339096, "step": 5160 }, { "epoch": 0.9568038561364479, "grad_norm": 6.9296875, "learning_rate": 9.043196143863553e-06, "loss": 2.3705, "mean_token_accuracy": 0.495202123315639, "step": 5161 }, { "epoch": 0.956989247311828, "grad_norm": 5.77734375, "learning_rate": 9.043010752688173e-06, "loss": 2.9452, "mean_token_accuracy": 0.4395433316035288, "step": 5162 }, { "epoch": 0.957174638487208, "grad_norm": 5.90625, "learning_rate": 9.042825361512792e-06, "loss": 3.0524, "mean_token_accuracy": 0.4274028629856851, "step": 5163 }, { "epoch": 0.957360029662588, "grad_norm": 6.79296875, "learning_rate": 9.042639970337413e-06, "loss": 2.8446, "mean_token_accuracy": 0.44437215354586856, "step": 5164 }, { "epoch": 0.9575454208379681, "grad_norm": 6.6171875, "learning_rate": 9.042454579162031e-06, "loss": 2.6049, "mean_token_accuracy": 0.4754803996925442, "step": 5165 }, { "epoch": 0.9577308120133482, "grad_norm": 6.33203125, "learning_rate": 9.042269187986654e-06, "loss": 3.2309, "mean_token_accuracy": 0.4251479289940828, "step": 5166 }, { "epoch": 0.9579162031887282, "grad_norm": 7.0234375, "learning_rate": 9.042083796811272e-06, "loss": 2.0495, "mean_token_accuracy": 0.569643058214165, "step": 5167 }, { "epoch": 0.9581015943641082, "grad_norm": 6.46875, "learning_rate": 9.041898405635893e-06, "loss": 2.6429, "mean_token_accuracy": 0.502425799086758, "step": 5168 }, { "epoch": 0.9582869855394883, "grad_norm": 7.1484375, "learning_rate": 9.041713014460513e-06, "loss": 2.6652, "mean_token_accuracy": 0.46747479596735475, "step": 5169 }, { "epoch": 0.9584723767148684, "grad_norm": 7.07421875, "learning_rate": 9.041527623285132e-06, "loss": 3.0166, "mean_token_accuracy": 0.4230959446092977, "step": 5170 }, { "epoch": 0.9586577678902484, "grad_norm": 7.046875, "learning_rate": 9.041342232109753e-06, "loss": 3.1354, "mean_token_accuracy": 0.42696629213483145, "step": 5171 }, { "epoch": 0.9588431590656284, "grad_norm": 4.9921875, "learning_rate": 9.041156840934371e-06, "loss": 2.6096, "mean_token_accuracy": 0.4646479713603819, "step": 5172 }, { "epoch": 0.9590285502410085, "grad_norm": 5.32421875, "learning_rate": 9.040971449758992e-06, "loss": 2.8641, "mean_token_accuracy": 0.44739756367663347, "step": 5173 }, { "epoch": 0.9592139414163886, "grad_norm": 5.88671875, "learning_rate": 9.040786058583612e-06, "loss": 3.0097, "mean_token_accuracy": 0.4460464584048518, "step": 5174 }, { "epoch": 0.9593993325917687, "grad_norm": 6.27734375, "learning_rate": 9.040600667408233e-06, "loss": 2.7316, "mean_token_accuracy": 0.4506109230089798, "step": 5175 }, { "epoch": 0.9595847237671487, "grad_norm": 7.62890625, "learning_rate": 9.040415276232852e-06, "loss": 2.6282, "mean_token_accuracy": 0.49907846234860453, "step": 5176 }, { "epoch": 0.9597701149425287, "grad_norm": 7.25390625, "learning_rate": 9.040229885057472e-06, "loss": 2.2252, "mean_token_accuracy": 0.5144230769230769, "step": 5177 }, { "epoch": 0.9599555061179088, "grad_norm": 7.953125, "learning_rate": 9.040044493882093e-06, "loss": 2.5846, "mean_token_accuracy": 0.4631650750341064, "step": 5178 }, { "epoch": 0.9601408972932889, "grad_norm": 8.546875, "learning_rate": 9.039859102706711e-06, "loss": 2.7318, "mean_token_accuracy": 0.44526850707320426, "step": 5179 }, { "epoch": 0.9603262884686689, "grad_norm": 7.89453125, "learning_rate": 9.039673711531332e-06, "loss": 2.5774, "mean_token_accuracy": 0.4708860759493671, "step": 5180 }, { "epoch": 0.9605116796440489, "grad_norm": 5.578125, "learning_rate": 9.03948832035595e-06, "loss": 2.6955, "mean_token_accuracy": 0.45662750682330605, "step": 5181 }, { "epoch": 0.960697070819429, "grad_norm": 5.72265625, "learning_rate": 9.039302929180573e-06, "loss": 2.8256, "mean_token_accuracy": 0.4638644293071939, "step": 5182 }, { "epoch": 0.9608824619948091, "grad_norm": 7.765625, "learning_rate": 9.039117538005192e-06, "loss": 2.5729, "mean_token_accuracy": 0.4906201146430432, "step": 5183 }, { "epoch": 0.9610678531701891, "grad_norm": 6.18359375, "learning_rate": 9.038932146829812e-06, "loss": 2.5885, "mean_token_accuracy": 0.4794503589997524, "step": 5184 }, { "epoch": 0.9612532443455691, "grad_norm": 5.66015625, "learning_rate": 9.038746755654431e-06, "loss": 2.6422, "mean_token_accuracy": 0.45679554774458114, "step": 5185 }, { "epoch": 0.9614386355209492, "grad_norm": 8.03125, "learning_rate": 9.038561364479052e-06, "loss": 2.6093, "mean_token_accuracy": 0.46732700509010866, "step": 5186 }, { "epoch": 0.9616240266963293, "grad_norm": 7.65625, "learning_rate": 9.038375973303672e-06, "loss": 2.4173, "mean_token_accuracy": 0.506962962962963, "step": 5187 }, { "epoch": 0.9618094178717093, "grad_norm": 5.35546875, "learning_rate": 9.03819058212829e-06, "loss": 2.7887, "mean_token_accuracy": 0.4503105590062112, "step": 5188 }, { "epoch": 0.9619948090470893, "grad_norm": 10.0625, "learning_rate": 9.038005190952911e-06, "loss": 2.238, "mean_token_accuracy": 0.5131551252367923, "step": 5189 }, { "epoch": 0.9621802002224694, "grad_norm": 8.2265625, "learning_rate": 9.037819799777532e-06, "loss": 3.0787, "mean_token_accuracy": 0.429923805125837, "step": 5190 }, { "epoch": 0.9623655913978495, "grad_norm": 10.671875, "learning_rate": 9.037634408602152e-06, "loss": 2.8502, "mean_token_accuracy": 0.45515375854214124, "step": 5191 }, { "epoch": 0.9625509825732295, "grad_norm": 5.3828125, "learning_rate": 9.037449017426771e-06, "loss": 2.9714, "mean_token_accuracy": 0.44384707287933095, "step": 5192 }, { "epoch": 0.9627363737486095, "grad_norm": 8.7421875, "learning_rate": 9.037263626251392e-06, "loss": 2.6406, "mean_token_accuracy": 0.4695277815148661, "step": 5193 }, { "epoch": 0.9629217649239896, "grad_norm": 7.70703125, "learning_rate": 9.03707823507601e-06, "loss": 2.7539, "mean_token_accuracy": 0.4553635194972147, "step": 5194 }, { "epoch": 0.9631071560993697, "grad_norm": 6.41796875, "learning_rate": 9.036892843900631e-06, "loss": 3.5296, "mean_token_accuracy": 0.3985784001697433, "step": 5195 }, { "epoch": 0.9632925472747497, "grad_norm": 6.359375, "learning_rate": 9.036707452725251e-06, "loss": 3.4724, "mean_token_accuracy": 0.40551234820575793, "step": 5196 }, { "epoch": 0.9634779384501297, "grad_norm": 6.82421875, "learning_rate": 9.03652206154987e-06, "loss": 2.9556, "mean_token_accuracy": 0.4259637188208617, "step": 5197 }, { "epoch": 0.9636633296255098, "grad_norm": 8.953125, "learning_rate": 9.03633667037449e-06, "loss": 2.8666, "mean_token_accuracy": 0.4495681063122924, "step": 5198 }, { "epoch": 0.9638487208008899, "grad_norm": 5.5, "learning_rate": 9.036151279199111e-06, "loss": 2.726, "mean_token_accuracy": 0.4518666666666667, "step": 5199 }, { "epoch": 0.96403411197627, "grad_norm": 6.6796875, "learning_rate": 9.035965888023732e-06, "loss": 2.4821, "mean_token_accuracy": 0.4727700379952958, "step": 5200 }, { "epoch": 0.96421950315165, "grad_norm": 7.65625, "learning_rate": 9.03578049684835e-06, "loss": 2.2674, "mean_token_accuracy": 0.5025125628140703, "step": 5201 }, { "epoch": 0.96440489432703, "grad_norm": 6.30078125, "learning_rate": 9.035595105672971e-06, "loss": 3.0469, "mean_token_accuracy": 0.44163424124513617, "step": 5202 }, { "epoch": 0.9645902855024101, "grad_norm": 7.92578125, "learning_rate": 9.03540971449759e-06, "loss": 2.7477, "mean_token_accuracy": 0.44185490617903667, "step": 5203 }, { "epoch": 0.9647756766777902, "grad_norm": 10.9765625, "learning_rate": 9.03522432332221e-06, "loss": 3.2446, "mean_token_accuracy": 0.4154057771664374, "step": 5204 }, { "epoch": 0.9649610678531702, "grad_norm": 8.15625, "learning_rate": 9.03503893214683e-06, "loss": 3.2804, "mean_token_accuracy": 0.404245553643144, "step": 5205 }, { "epoch": 0.9651464590285502, "grad_norm": 6.27734375, "learning_rate": 9.03485354097145e-06, "loss": 3.3929, "mean_token_accuracy": 0.39731653888280394, "step": 5206 }, { "epoch": 0.9653318502039303, "grad_norm": 10.0625, "learning_rate": 9.034668149796072e-06, "loss": 2.5174, "mean_token_accuracy": 0.46927263730826324, "step": 5207 }, { "epoch": 0.9655172413793104, "grad_norm": 13.3515625, "learning_rate": 9.03448275862069e-06, "loss": 3.0429, "mean_token_accuracy": 0.4377210712481051, "step": 5208 }, { "epoch": 0.9657026325546904, "grad_norm": 12.0, "learning_rate": 9.034297367445311e-06, "loss": 2.4607, "mean_token_accuracy": 0.48308556713098444, "step": 5209 }, { "epoch": 0.9658880237300704, "grad_norm": 7.8046875, "learning_rate": 9.03411197626993e-06, "loss": 3.4499, "mean_token_accuracy": 0.408208851040067, "step": 5210 }, { "epoch": 0.9660734149054505, "grad_norm": 6.53125, "learning_rate": 9.03392658509455e-06, "loss": 3.6012, "mean_token_accuracy": 0.38065414057063324, "step": 5211 }, { "epoch": 0.9662588060808306, "grad_norm": 9.9921875, "learning_rate": 9.03374119391917e-06, "loss": 3.7186, "mean_token_accuracy": 0.3613712109189018, "step": 5212 }, { "epoch": 0.9664441972562106, "grad_norm": 10.265625, "learning_rate": 9.03355580274379e-06, "loss": 2.5571, "mean_token_accuracy": 0.47859116022099446, "step": 5213 }, { "epoch": 0.9666295884315906, "grad_norm": 8.75, "learning_rate": 9.03337041156841e-06, "loss": 2.949, "mean_token_accuracy": 0.44313603966532383, "step": 5214 }, { "epoch": 0.9668149796069707, "grad_norm": 6.265625, "learning_rate": 9.03318502039303e-06, "loss": 3.0346, "mean_token_accuracy": 0.42360797761934843, "step": 5215 }, { "epoch": 0.9670003707823508, "grad_norm": 10.1875, "learning_rate": 9.032999629217651e-06, "loss": 3.0643, "mean_token_accuracy": 0.4169190192439191, "step": 5216 }, { "epoch": 0.9671857619577308, "grad_norm": 9.8984375, "learning_rate": 9.03281423804227e-06, "loss": 2.6794, "mean_token_accuracy": 0.46551130494663145, "step": 5217 }, { "epoch": 0.9673711531331108, "grad_norm": 7.22265625, "learning_rate": 9.03262884686689e-06, "loss": 2.8222, "mean_token_accuracy": 0.4433491062039958, "step": 5218 }, { "epoch": 0.9675565443084909, "grad_norm": 8.3984375, "learning_rate": 9.032443455691509e-06, "loss": 2.4979, "mean_token_accuracy": 0.48482303539292143, "step": 5219 }, { "epoch": 0.967741935483871, "grad_norm": 10.1484375, "learning_rate": 9.03225806451613e-06, "loss": 2.6135, "mean_token_accuracy": 0.4622072391767211, "step": 5220 }, { "epoch": 0.9679273266592511, "grad_norm": 8.9296875, "learning_rate": 9.03207267334075e-06, "loss": 3.2953, "mean_token_accuracy": 0.40985163204747777, "step": 5221 }, { "epoch": 0.968112717834631, "grad_norm": 5.25, "learning_rate": 9.031887282165369e-06, "loss": 2.971, "mean_token_accuracy": 0.44638949671772427, "step": 5222 }, { "epoch": 0.9682981090100111, "grad_norm": 6.15625, "learning_rate": 9.03170189098999e-06, "loss": 2.9264, "mean_token_accuracy": 0.43218785796105386, "step": 5223 }, { "epoch": 0.9684835001853912, "grad_norm": 8.1015625, "learning_rate": 9.03151649981461e-06, "loss": 3.3585, "mean_token_accuracy": 0.4038132206872931, "step": 5224 }, { "epoch": 0.9686688913607713, "grad_norm": 6.55859375, "learning_rate": 9.03133110863923e-06, "loss": 2.9971, "mean_token_accuracy": 0.4357034795763994, "step": 5225 }, { "epoch": 0.9688542825361512, "grad_norm": 5.6015625, "learning_rate": 9.03114571746385e-06, "loss": 2.5248, "mean_token_accuracy": 0.4737704918032787, "step": 5226 }, { "epoch": 0.9690396737115313, "grad_norm": 8.375, "learning_rate": 9.03096032628847e-06, "loss": 2.4894, "mean_token_accuracy": 0.48577844311377244, "step": 5227 }, { "epoch": 0.9692250648869114, "grad_norm": 8.75, "learning_rate": 9.030774935113088e-06, "loss": 3.2596, "mean_token_accuracy": 0.40288098776723447, "step": 5228 }, { "epoch": 0.9694104560622915, "grad_norm": 5.42578125, "learning_rate": 9.030589543937709e-06, "loss": 2.495, "mean_token_accuracy": 0.49743062692702983, "step": 5229 }, { "epoch": 0.9695958472376714, "grad_norm": 7.52734375, "learning_rate": 9.03040415276233e-06, "loss": 2.9977, "mean_token_accuracy": 0.43370100915278104, "step": 5230 }, { "epoch": 0.9697812384130515, "grad_norm": 7.4453125, "learning_rate": 9.03021876158695e-06, "loss": 2.98, "mean_token_accuracy": 0.43478260869565216, "step": 5231 }, { "epoch": 0.9699666295884316, "grad_norm": 5.36328125, "learning_rate": 9.030033370411569e-06, "loss": 3.0468, "mean_token_accuracy": 0.4341298371748784, "step": 5232 }, { "epoch": 0.9701520207638117, "grad_norm": 7.01953125, "learning_rate": 9.02984797923619e-06, "loss": 2.5397, "mean_token_accuracy": 0.48105234460196294, "step": 5233 }, { "epoch": 0.9703374119391917, "grad_norm": 6.54296875, "learning_rate": 9.02966258806081e-06, "loss": 2.4322, "mean_token_accuracy": 0.5385243670024503, "step": 5234 }, { "epoch": 0.9705228031145717, "grad_norm": 6.1953125, "learning_rate": 9.029477196885429e-06, "loss": 3.202, "mean_token_accuracy": 0.42265415549597857, "step": 5235 }, { "epoch": 0.9707081942899518, "grad_norm": 5.21875, "learning_rate": 9.029291805710049e-06, "loss": 3.0352, "mean_token_accuracy": 0.4229080263478897, "step": 5236 }, { "epoch": 0.9708935854653319, "grad_norm": 7.6015625, "learning_rate": 9.029106414534668e-06, "loss": 2.8363, "mean_token_accuracy": 0.4495102188898295, "step": 5237 }, { "epoch": 0.9710789766407119, "grad_norm": 7.296875, "learning_rate": 9.028921023359288e-06, "loss": 3.108, "mean_token_accuracy": 0.45034872135503157, "step": 5238 }, { "epoch": 0.9712643678160919, "grad_norm": 8.375, "learning_rate": 9.028735632183909e-06, "loss": 2.9931, "mean_token_accuracy": 0.44179750947482405, "step": 5239 }, { "epoch": 0.971449758991472, "grad_norm": 7.96484375, "learning_rate": 9.02855024100853e-06, "loss": 3.0694, "mean_token_accuracy": 0.44374846475067553, "step": 5240 }, { "epoch": 0.9716351501668521, "grad_norm": 6.16796875, "learning_rate": 9.028364849833148e-06, "loss": 3.0564, "mean_token_accuracy": 0.4228007181328546, "step": 5241 }, { "epoch": 0.9718205413422321, "grad_norm": 6.42578125, "learning_rate": 9.028179458657769e-06, "loss": 2.9868, "mean_token_accuracy": 0.43331603528801244, "step": 5242 }, { "epoch": 0.9720059325176121, "grad_norm": 7.42578125, "learning_rate": 9.027994067482389e-06, "loss": 2.7804, "mean_token_accuracy": 0.4559925093632959, "step": 5243 }, { "epoch": 0.9721913236929922, "grad_norm": 7.38671875, "learning_rate": 9.027808676307008e-06, "loss": 3.0058, "mean_token_accuracy": 0.4321780699133552, "step": 5244 }, { "epoch": 0.9723767148683723, "grad_norm": 5.63671875, "learning_rate": 9.027623285131628e-06, "loss": 2.3579, "mean_token_accuracy": 0.5018023430459597, "step": 5245 }, { "epoch": 0.9725621060437524, "grad_norm": 5.4453125, "learning_rate": 9.027437893956247e-06, "loss": 2.8203, "mean_token_accuracy": 0.44840345347403043, "step": 5246 }, { "epoch": 0.9727474972191323, "grad_norm": 7.16796875, "learning_rate": 9.02725250278087e-06, "loss": 3.0954, "mean_token_accuracy": 0.42288125077553046, "step": 5247 }, { "epoch": 0.9729328883945124, "grad_norm": 9.734375, "learning_rate": 9.027067111605488e-06, "loss": 2.1252, "mean_token_accuracy": 0.5217251367878983, "step": 5248 }, { "epoch": 0.9731182795698925, "grad_norm": 6.890625, "learning_rate": 9.026881720430109e-06, "loss": 2.4225, "mean_token_accuracy": 0.5027578599007171, "step": 5249 }, { "epoch": 0.9733036707452726, "grad_norm": 8.2578125, "learning_rate": 9.026696329254729e-06, "loss": 2.5053, "mean_token_accuracy": 0.49483766637641496, "step": 5250 }, { "epoch": 0.9734890619206525, "grad_norm": 7.03515625, "learning_rate": 9.026510938079348e-06, "loss": 3.095, "mean_token_accuracy": 0.42227135210612055, "step": 5251 }, { "epoch": 0.9736744530960326, "grad_norm": 7.91015625, "learning_rate": 9.026325546903968e-06, "loss": 2.8561, "mean_token_accuracy": 0.4489284085727314, "step": 5252 }, { "epoch": 0.9738598442714127, "grad_norm": 8.953125, "learning_rate": 9.026140155728587e-06, "loss": 3.2796, "mean_token_accuracy": 0.4184190031152648, "step": 5253 }, { "epoch": 0.9740452354467928, "grad_norm": 5.4765625, "learning_rate": 9.025954764553208e-06, "loss": 2.4272, "mean_token_accuracy": 0.5006065016982048, "step": 5254 }, { "epoch": 0.9742306266221727, "grad_norm": 7.41015625, "learning_rate": 9.025769373377828e-06, "loss": 2.9454, "mean_token_accuracy": 0.44598687531549724, "step": 5255 }, { "epoch": 0.9744160177975528, "grad_norm": 7.08203125, "learning_rate": 9.025583982202449e-06, "loss": 2.6943, "mean_token_accuracy": 0.46075353218210363, "step": 5256 }, { "epoch": 0.9746014089729329, "grad_norm": 7.73828125, "learning_rate": 9.025398591027067e-06, "loss": 2.6617, "mean_token_accuracy": 0.45768187104277036, "step": 5257 }, { "epoch": 0.974786800148313, "grad_norm": 6.6640625, "learning_rate": 9.025213199851688e-06, "loss": 3.1012, "mean_token_accuracy": 0.42696936542669583, "step": 5258 }, { "epoch": 0.974972191323693, "grad_norm": 8.1640625, "learning_rate": 9.025027808676308e-06, "loss": 2.8136, "mean_token_accuracy": 0.4522706209453197, "step": 5259 }, { "epoch": 0.975157582499073, "grad_norm": 13.234375, "learning_rate": 9.024842417500927e-06, "loss": 3.001, "mean_token_accuracy": 0.44327301337529507, "step": 5260 }, { "epoch": 0.9753429736744531, "grad_norm": 8.9609375, "learning_rate": 9.024657026325548e-06, "loss": 3.377, "mean_token_accuracy": 0.41501625775938517, "step": 5261 }, { "epoch": 0.9755283648498332, "grad_norm": 5.8046875, "learning_rate": 9.024471635150167e-06, "loss": 2.5436, "mean_token_accuracy": 0.48532094874771936, "step": 5262 }, { "epoch": 0.9757137560252132, "grad_norm": 12.15625, "learning_rate": 9.024286243974789e-06, "loss": 2.7106, "mean_token_accuracy": 0.4552808988764045, "step": 5263 }, { "epoch": 0.9758991472005932, "grad_norm": 10.453125, "learning_rate": 9.024100852799408e-06, "loss": 2.8337, "mean_token_accuracy": 0.4589080459770115, "step": 5264 }, { "epoch": 0.9760845383759733, "grad_norm": 6.9296875, "learning_rate": 9.023915461624028e-06, "loss": 2.8556, "mean_token_accuracy": 0.45927740355174523, "step": 5265 }, { "epoch": 0.9762699295513534, "grad_norm": 10.4296875, "learning_rate": 9.023730070448647e-06, "loss": 2.011, "mean_token_accuracy": 0.5403192227619709, "step": 5266 }, { "epoch": 0.9764553207267334, "grad_norm": 10.1484375, "learning_rate": 9.023544679273267e-06, "loss": 3.1366, "mean_token_accuracy": 0.4303623561737936, "step": 5267 }, { "epoch": 0.9766407119021134, "grad_norm": 12.1875, "learning_rate": 9.023359288097888e-06, "loss": 2.6736, "mean_token_accuracy": 0.4656447492268388, "step": 5268 }, { "epoch": 0.9768261030774935, "grad_norm": 7.3671875, "learning_rate": 9.023173896922507e-06, "loss": 2.7089, "mean_token_accuracy": 0.4565191315821039, "step": 5269 }, { "epoch": 0.9770114942528736, "grad_norm": 9.34375, "learning_rate": 9.022988505747127e-06, "loss": 2.6935, "mean_token_accuracy": 0.46748704663212437, "step": 5270 }, { "epoch": 0.9771968854282537, "grad_norm": 10.2265625, "learning_rate": 9.022803114571748e-06, "loss": 3.1434, "mean_token_accuracy": 0.4215041464112096, "step": 5271 }, { "epoch": 0.9773822766036336, "grad_norm": 7.44140625, "learning_rate": 9.022617723396368e-06, "loss": 3.0097, "mean_token_accuracy": 0.42879256965944273, "step": 5272 }, { "epoch": 0.9775676677790137, "grad_norm": 6.28125, "learning_rate": 9.022432332220987e-06, "loss": 3.2989, "mean_token_accuracy": 0.4105042693732882, "step": 5273 }, { "epoch": 0.9777530589543938, "grad_norm": 7.24609375, "learning_rate": 9.022246941045607e-06, "loss": 2.7939, "mean_token_accuracy": 0.43898201144726084, "step": 5274 }, { "epoch": 0.9779384501297739, "grad_norm": 5.0, "learning_rate": 9.022061549870226e-06, "loss": 2.5596, "mean_token_accuracy": 0.47299554565701557, "step": 5275 }, { "epoch": 0.9781238413051538, "grad_norm": 5.81640625, "learning_rate": 9.021876158694847e-06, "loss": 3.4474, "mean_token_accuracy": 0.3967266775777414, "step": 5276 }, { "epoch": 0.9783092324805339, "grad_norm": 6.2890625, "learning_rate": 9.021690767519467e-06, "loss": 2.5461, "mean_token_accuracy": 0.47848360655737704, "step": 5277 }, { "epoch": 0.978494623655914, "grad_norm": 6.10546875, "learning_rate": 9.021505376344086e-06, "loss": 3.0837, "mean_token_accuracy": 0.4216088651754566, "step": 5278 }, { "epoch": 0.9786800148312941, "grad_norm": 5.77734375, "learning_rate": 9.021319985168706e-06, "loss": 2.8152, "mean_token_accuracy": 0.4521991045562286, "step": 5279 }, { "epoch": 0.978865406006674, "grad_norm": 5.65625, "learning_rate": 9.021134593993327e-06, "loss": 3.1958, "mean_token_accuracy": 0.41112200588317693, "step": 5280 }, { "epoch": 0.9790507971820541, "grad_norm": 7.0, "learning_rate": 9.020949202817947e-06, "loss": 2.479, "mean_token_accuracy": 0.4852670349907919, "step": 5281 }, { "epoch": 0.9792361883574342, "grad_norm": 5.51953125, "learning_rate": 9.020763811642566e-06, "loss": 2.936, "mean_token_accuracy": 0.4313387860525183, "step": 5282 }, { "epoch": 0.9794215795328143, "grad_norm": 6.9375, "learning_rate": 9.020578420467187e-06, "loss": 3.0654, "mean_token_accuracy": 0.43891004980955173, "step": 5283 }, { "epoch": 0.9796069707081942, "grad_norm": 6.4921875, "learning_rate": 9.020393029291805e-06, "loss": 3.3757, "mean_token_accuracy": 0.42258726899383986, "step": 5284 }, { "epoch": 0.9797923618835743, "grad_norm": 7.8515625, "learning_rate": 9.020207638116426e-06, "loss": 3.3513, "mean_token_accuracy": 0.3975588491717524, "step": 5285 }, { "epoch": 0.9799777530589544, "grad_norm": 6.83984375, "learning_rate": 9.020022246941046e-06, "loss": 2.9679, "mean_token_accuracy": 0.4574526489157288, "step": 5286 }, { "epoch": 0.9801631442343345, "grad_norm": 5.26171875, "learning_rate": 9.019836855765667e-06, "loss": 2.8174, "mean_token_accuracy": 0.45067167412971254, "step": 5287 }, { "epoch": 0.9803485354097144, "grad_norm": 5.58203125, "learning_rate": 9.019651464590287e-06, "loss": 3.1026, "mean_token_accuracy": 0.4354686020826759, "step": 5288 }, { "epoch": 0.9805339265850945, "grad_norm": 7.6953125, "learning_rate": 9.019466073414906e-06, "loss": 2.4289, "mean_token_accuracy": 0.49359768690623707, "step": 5289 }, { "epoch": 0.9807193177604746, "grad_norm": 5.41796875, "learning_rate": 9.019280682239527e-06, "loss": 3.0133, "mean_token_accuracy": 0.42977564518220723, "step": 5290 }, { "epoch": 0.9809047089358547, "grad_norm": 5.96875, "learning_rate": 9.019095291064146e-06, "loss": 2.2879, "mean_token_accuracy": 0.5046447729948973, "step": 5291 }, { "epoch": 0.9810901001112347, "grad_norm": 7.29296875, "learning_rate": 9.018909899888766e-06, "loss": 2.6908, "mean_token_accuracy": 0.4632418069087688, "step": 5292 }, { "epoch": 0.9812754912866147, "grad_norm": 8.265625, "learning_rate": 9.018724508713387e-06, "loss": 2.6789, "mean_token_accuracy": 0.4739330746847721, "step": 5293 }, { "epoch": 0.9814608824619948, "grad_norm": 5.78125, "learning_rate": 9.018539117538005e-06, "loss": 2.6383, "mean_token_accuracy": 0.47544361763022325, "step": 5294 }, { "epoch": 0.9816462736373749, "grad_norm": 8.2265625, "learning_rate": 9.018353726362626e-06, "loss": 2.9225, "mean_token_accuracy": 0.42470119521912353, "step": 5295 }, { "epoch": 0.981831664812755, "grad_norm": 7.2421875, "learning_rate": 9.018168335187246e-06, "loss": 3.6038, "mean_token_accuracy": 0.3933632286995516, "step": 5296 }, { "epoch": 0.9820170559881349, "grad_norm": 5.984375, "learning_rate": 9.017982944011867e-06, "loss": 2.706, "mean_token_accuracy": 0.47243090007087174, "step": 5297 }, { "epoch": 0.982202447163515, "grad_norm": 8.3359375, "learning_rate": 9.017797552836486e-06, "loss": 2.9854, "mean_token_accuracy": 0.40631054525325216, "step": 5298 }, { "epoch": 0.9823878383388951, "grad_norm": 6.83203125, "learning_rate": 9.017612161661106e-06, "loss": 3.0442, "mean_token_accuracy": 0.42088297126839525, "step": 5299 }, { "epoch": 0.9825732295142752, "grad_norm": 6.2890625, "learning_rate": 9.017426770485725e-06, "loss": 2.4303, "mean_token_accuracy": 0.4971590909090909, "step": 5300 }, { "epoch": 0.9827586206896551, "grad_norm": 6.01953125, "learning_rate": 9.017241379310345e-06, "loss": 2.6119, "mean_token_accuracy": 0.4545100077982844, "step": 5301 }, { "epoch": 0.9829440118650352, "grad_norm": 5.32421875, "learning_rate": 9.017055988134966e-06, "loss": 2.3698, "mean_token_accuracy": 0.49861395685187415, "step": 5302 }, { "epoch": 0.9831294030404153, "grad_norm": 4.9921875, "learning_rate": 9.016870596959586e-06, "loss": 2.6448, "mean_token_accuracy": 0.4657534246575342, "step": 5303 }, { "epoch": 0.9833147942157954, "grad_norm": 6.24609375, "learning_rate": 9.016685205784205e-06, "loss": 2.7036, "mean_token_accuracy": 0.4474397830594991, "step": 5304 }, { "epoch": 0.9835001853911753, "grad_norm": 7.28125, "learning_rate": 9.016499814608826e-06, "loss": 2.6528, "mean_token_accuracy": 0.4669168751737559, "step": 5305 }, { "epoch": 0.9836855765665554, "grad_norm": 9.09375, "learning_rate": 9.016314423433446e-06, "loss": 2.8119, "mean_token_accuracy": 0.44283623353390794, "step": 5306 }, { "epoch": 0.9838709677419355, "grad_norm": 6.609375, "learning_rate": 9.016129032258065e-06, "loss": 3.7831, "mean_token_accuracy": 0.3781437125748503, "step": 5307 }, { "epoch": 0.9840563589173156, "grad_norm": 7.12890625, "learning_rate": 9.015943641082685e-06, "loss": 2.877, "mean_token_accuracy": 0.4429403005279545, "step": 5308 }, { "epoch": 0.9842417500926955, "grad_norm": 6.03125, "learning_rate": 9.015758249907304e-06, "loss": 2.9062, "mean_token_accuracy": 0.4446273218904303, "step": 5309 }, { "epoch": 0.9844271412680756, "grad_norm": 8.0859375, "learning_rate": 9.015572858731925e-06, "loss": 2.753, "mean_token_accuracy": 0.45505452402004126, "step": 5310 }, { "epoch": 0.9846125324434557, "grad_norm": 6.69140625, "learning_rate": 9.015387467556545e-06, "loss": 2.3676, "mean_token_accuracy": 0.5065679733110926, "step": 5311 }, { "epoch": 0.9847979236188358, "grad_norm": 6.2109375, "learning_rate": 9.015202076381166e-06, "loss": 2.6653, "mean_token_accuracy": 0.47414406176978563, "step": 5312 }, { "epoch": 0.9849833147942157, "grad_norm": 8.7265625, "learning_rate": 9.015016685205784e-06, "loss": 2.7725, "mean_token_accuracy": 0.463492597577389, "step": 5313 }, { "epoch": 0.9851687059695958, "grad_norm": 9.921875, "learning_rate": 9.014831294030405e-06, "loss": 3.0038, "mean_token_accuracy": 0.43437423761893146, "step": 5314 }, { "epoch": 0.9853540971449759, "grad_norm": 6.00390625, "learning_rate": 9.014645902855025e-06, "loss": 3.1276, "mean_token_accuracy": 0.4335689469270397, "step": 5315 }, { "epoch": 0.985539488320356, "grad_norm": 7.3046875, "learning_rate": 9.014460511679644e-06, "loss": 2.5009, "mean_token_accuracy": 0.474618149146451, "step": 5316 }, { "epoch": 0.985724879495736, "grad_norm": 7.64453125, "learning_rate": 9.014275120504265e-06, "loss": 2.5951, "mean_token_accuracy": 0.4977079240340537, "step": 5317 }, { "epoch": 0.985910270671116, "grad_norm": 6.17578125, "learning_rate": 9.014089729328884e-06, "loss": 3.2152, "mean_token_accuracy": 0.41438937779836343, "step": 5318 }, { "epoch": 0.9860956618464961, "grad_norm": 6.1484375, "learning_rate": 9.013904338153506e-06, "loss": 2.0302, "mean_token_accuracy": 0.5498069498069498, "step": 5319 }, { "epoch": 0.9862810530218762, "grad_norm": 8.2109375, "learning_rate": 9.013718946978125e-06, "loss": 2.7719, "mean_token_accuracy": 0.4525238263325097, "step": 5320 }, { "epoch": 0.9864664441972563, "grad_norm": 5.74609375, "learning_rate": 9.013533555802745e-06, "loss": 3.0582, "mean_token_accuracy": 0.44462257849031395, "step": 5321 }, { "epoch": 0.9866518353726362, "grad_norm": 6.390625, "learning_rate": 9.013348164627364e-06, "loss": 2.9786, "mean_token_accuracy": 0.43372126028952596, "step": 5322 }, { "epoch": 0.9868372265480163, "grad_norm": 6.5078125, "learning_rate": 9.013162773451984e-06, "loss": 3.3364, "mean_token_accuracy": 0.4204374057315234, "step": 5323 }, { "epoch": 0.9870226177233964, "grad_norm": 6.59765625, "learning_rate": 9.012977382276605e-06, "loss": 2.6318, "mean_token_accuracy": 0.4917234664070107, "step": 5324 }, { "epoch": 0.9872080088987765, "grad_norm": 4.609375, "learning_rate": 9.012791991101224e-06, "loss": 2.9282, "mean_token_accuracy": 0.4313700051894136, "step": 5325 }, { "epoch": 0.9873934000741564, "grad_norm": 6.9765625, "learning_rate": 9.012606599925844e-06, "loss": 3.0229, "mean_token_accuracy": 0.42448266595001344, "step": 5326 }, { "epoch": 0.9875787912495365, "grad_norm": 6.3984375, "learning_rate": 9.012421208750463e-06, "loss": 2.9413, "mean_token_accuracy": 0.46790766939687267, "step": 5327 }, { "epoch": 0.9877641824249166, "grad_norm": 5.5859375, "learning_rate": 9.012235817575085e-06, "loss": 3.0831, "mean_token_accuracy": 0.4244882675986021, "step": 5328 }, { "epoch": 0.9879495736002967, "grad_norm": 6.73828125, "learning_rate": 9.012050426399704e-06, "loss": 3.2177, "mean_token_accuracy": 0.4176895721004933, "step": 5329 }, { "epoch": 0.9881349647756766, "grad_norm": 5.96484375, "learning_rate": 9.011865035224324e-06, "loss": 3.0277, "mean_token_accuracy": 0.4436307146027519, "step": 5330 }, { "epoch": 0.9883203559510567, "grad_norm": 6.421875, "learning_rate": 9.011679644048945e-06, "loss": 2.4874, "mean_token_accuracy": 0.4778126964173476, "step": 5331 }, { "epoch": 0.9885057471264368, "grad_norm": 5.5390625, "learning_rate": 9.011494252873564e-06, "loss": 2.4293, "mean_token_accuracy": 0.51498561751879, "step": 5332 }, { "epoch": 0.9886911383018169, "grad_norm": 7.08984375, "learning_rate": 9.011308861698184e-06, "loss": 2.8724, "mean_token_accuracy": 0.43947272947152016, "step": 5333 }, { "epoch": 0.9888765294771968, "grad_norm": 6.22265625, "learning_rate": 9.011123470522803e-06, "loss": 2.8004, "mean_token_accuracy": 0.4588452899986788, "step": 5334 }, { "epoch": 0.9890619206525769, "grad_norm": 7.19921875, "learning_rate": 9.010938079347423e-06, "loss": 2.769, "mean_token_accuracy": 0.46843335931410757, "step": 5335 }, { "epoch": 0.989247311827957, "grad_norm": 5.9453125, "learning_rate": 9.010752688172044e-06, "loss": 3.2927, "mean_token_accuracy": 0.4335483870967742, "step": 5336 }, { "epoch": 0.9894327030033371, "grad_norm": 8.765625, "learning_rate": 9.010567296996664e-06, "loss": 2.5567, "mean_token_accuracy": 0.48309302028837564, "step": 5337 }, { "epoch": 0.989618094178717, "grad_norm": 6.609375, "learning_rate": 9.010381905821283e-06, "loss": 3.3472, "mean_token_accuracy": 0.42130872483221476, "step": 5338 }, { "epoch": 0.9898034853540971, "grad_norm": 5.96484375, "learning_rate": 9.010196514645904e-06, "loss": 2.9473, "mean_token_accuracy": 0.4480499695307739, "step": 5339 }, { "epoch": 0.9899888765294772, "grad_norm": 6.3046875, "learning_rate": 9.010011123470524e-06, "loss": 2.755, "mean_token_accuracy": 0.4540897941772422, "step": 5340 }, { "epoch": 0.9901742677048573, "grad_norm": 8.109375, "learning_rate": 9.009825732295143e-06, "loss": 2.6095, "mean_token_accuracy": 0.4766005241482591, "step": 5341 }, { "epoch": 0.9903596588802372, "grad_norm": 6.0703125, "learning_rate": 9.009640341119763e-06, "loss": 3.0822, "mean_token_accuracy": 0.4536385936222404, "step": 5342 }, { "epoch": 0.9905450500556173, "grad_norm": 7.49609375, "learning_rate": 9.009454949944382e-06, "loss": 2.2223, "mean_token_accuracy": 0.5430619053503027, "step": 5343 }, { "epoch": 0.9907304412309974, "grad_norm": 7.7734375, "learning_rate": 9.009269558769004e-06, "loss": 2.3547, "mean_token_accuracy": 0.4967394030599448, "step": 5344 }, { "epoch": 0.9909158324063775, "grad_norm": 7.25390625, "learning_rate": 9.009084167593623e-06, "loss": 2.9127, "mean_token_accuracy": 0.43528753323072616, "step": 5345 }, { "epoch": 0.9911012235817576, "grad_norm": 6.76953125, "learning_rate": 9.008898776418244e-06, "loss": 2.5838, "mean_token_accuracy": 0.46951807228915665, "step": 5346 }, { "epoch": 0.9912866147571375, "grad_norm": 7.23046875, "learning_rate": 9.008713385242863e-06, "loss": 2.4568, "mean_token_accuracy": 0.48092594779733083, "step": 5347 }, { "epoch": 0.9914720059325176, "grad_norm": 7.76953125, "learning_rate": 9.008527994067483e-06, "loss": 2.8586, "mean_token_accuracy": 0.43542882307607617, "step": 5348 }, { "epoch": 0.9916573971078977, "grad_norm": 14.0703125, "learning_rate": 9.008342602892104e-06, "loss": 2.8178, "mean_token_accuracy": 0.4403052064631957, "step": 5349 }, { "epoch": 0.9918427882832778, "grad_norm": 6.68359375, "learning_rate": 9.008157211716722e-06, "loss": 3.051, "mean_token_accuracy": 0.4163214581607291, "step": 5350 }, { "epoch": 0.9920281794586577, "grad_norm": 4.88671875, "learning_rate": 9.007971820541343e-06, "loss": 2.933, "mean_token_accuracy": 0.4471948978488812, "step": 5351 }, { "epoch": 0.9922135706340378, "grad_norm": 11.6875, "learning_rate": 9.007786429365963e-06, "loss": 2.6764, "mean_token_accuracy": 0.45178335535006603, "step": 5352 }, { "epoch": 0.9923989618094179, "grad_norm": 6.3828125, "learning_rate": 9.007601038190584e-06, "loss": 2.6396, "mean_token_accuracy": 0.47504682163710477, "step": 5353 }, { "epoch": 0.992584352984798, "grad_norm": 7.15625, "learning_rate": 9.007415647015203e-06, "loss": 2.969, "mean_token_accuracy": 0.44990892531876137, "step": 5354 }, { "epoch": 0.9927697441601779, "grad_norm": 6.8671875, "learning_rate": 9.007230255839823e-06, "loss": 2.5238, "mean_token_accuracy": 0.46928872430085555, "step": 5355 }, { "epoch": 0.992955135335558, "grad_norm": 6.23046875, "learning_rate": 9.007044864664442e-06, "loss": 2.8182, "mean_token_accuracy": 0.4621351008320406, "step": 5356 }, { "epoch": 0.9931405265109381, "grad_norm": 5.29296875, "learning_rate": 9.006859473489062e-06, "loss": 2.9151, "mean_token_accuracy": 0.45566891794798103, "step": 5357 }, { "epoch": 0.9933259176863182, "grad_norm": 6.32421875, "learning_rate": 9.006674082313683e-06, "loss": 2.6284, "mean_token_accuracy": 0.48252125792954514, "step": 5358 }, { "epoch": 0.9935113088616981, "grad_norm": 7.484375, "learning_rate": 9.006488691138302e-06, "loss": 2.9684, "mean_token_accuracy": 0.43583923087194554, "step": 5359 }, { "epoch": 0.9936967000370782, "grad_norm": 5.828125, "learning_rate": 9.006303299962922e-06, "loss": 2.5793, "mean_token_accuracy": 0.477866188501625, "step": 5360 }, { "epoch": 0.9938820912124583, "grad_norm": 7.1796875, "learning_rate": 9.006117908787543e-06, "loss": 3.138, "mean_token_accuracy": 0.40840760941865106, "step": 5361 }, { "epoch": 0.9940674823878384, "grad_norm": 10.7578125, "learning_rate": 9.005932517612163e-06, "loss": 3.3873, "mean_token_accuracy": 0.38589277241815595, "step": 5362 }, { "epoch": 0.9942528735632183, "grad_norm": 9.640625, "learning_rate": 9.005747126436782e-06, "loss": 2.3354, "mean_token_accuracy": 0.5178206381654964, "step": 5363 }, { "epoch": 0.9944382647385984, "grad_norm": 7.50390625, "learning_rate": 9.005561735261402e-06, "loss": 3.5893, "mean_token_accuracy": 0.3940066592674806, "step": 5364 }, { "epoch": 0.9946236559139785, "grad_norm": 7.5625, "learning_rate": 9.005376344086021e-06, "loss": 2.7911, "mean_token_accuracy": 0.45264483627204033, "step": 5365 }, { "epoch": 0.9948090470893586, "grad_norm": 5.44921875, "learning_rate": 9.005190952910642e-06, "loss": 3.1668, "mean_token_accuracy": 0.41986190686058467, "step": 5366 }, { "epoch": 0.9949944382647385, "grad_norm": 7.14453125, "learning_rate": 9.005005561735262e-06, "loss": 2.952, "mean_token_accuracy": 0.4292821606254442, "step": 5367 }, { "epoch": 0.9951798294401186, "grad_norm": 6.05859375, "learning_rate": 9.004820170559883e-06, "loss": 3.1834, "mean_token_accuracy": 0.43790849673202614, "step": 5368 }, { "epoch": 0.9953652206154987, "grad_norm": 5.01171875, "learning_rate": 9.004634779384503e-06, "loss": 3.0007, "mean_token_accuracy": 0.4277613703484938, "step": 5369 }, { "epoch": 0.9955506117908788, "grad_norm": 5.76171875, "learning_rate": 9.004449388209122e-06, "loss": 2.5067, "mean_token_accuracy": 0.4837253829321663, "step": 5370 }, { "epoch": 0.9957360029662589, "grad_norm": 6.48828125, "learning_rate": 9.004263997033742e-06, "loss": 2.5389, "mean_token_accuracy": 0.4820298658567451, "step": 5371 }, { "epoch": 0.9959213941416388, "grad_norm": 6.578125, "learning_rate": 9.004078605858361e-06, "loss": 2.4616, "mean_token_accuracy": 0.48828031571394404, "step": 5372 }, { "epoch": 0.9961067853170189, "grad_norm": 5.26171875, "learning_rate": 9.003893214682982e-06, "loss": 2.6077, "mean_token_accuracy": 0.4764300847457627, "step": 5373 }, { "epoch": 0.996292176492399, "grad_norm": 6.66796875, "learning_rate": 9.003707823507602e-06, "loss": 3.2044, "mean_token_accuracy": 0.4174381811789523, "step": 5374 }, { "epoch": 0.9964775676677791, "grad_norm": 7.0703125, "learning_rate": 9.003522432332221e-06, "loss": 2.8274, "mean_token_accuracy": 0.45360262008733626, "step": 5375 }, { "epoch": 0.996662958843159, "grad_norm": 9.3125, "learning_rate": 9.003337041156842e-06, "loss": 3.1837, "mean_token_accuracy": 0.4358379715522573, "step": 5376 }, { "epoch": 0.9968483500185391, "grad_norm": 10.5, "learning_rate": 9.003151649981462e-06, "loss": 2.5045, "mean_token_accuracy": 0.48051391862955034, "step": 5377 }, { "epoch": 0.9970337411939192, "grad_norm": 8.3515625, "learning_rate": 9.002966258806083e-06, "loss": 2.7759, "mean_token_accuracy": 0.450172229480936, "step": 5378 }, { "epoch": 0.9972191323692993, "grad_norm": 9.765625, "learning_rate": 9.002780867630701e-06, "loss": 3.0243, "mean_token_accuracy": 0.43997089497938396, "step": 5379 }, { "epoch": 0.9974045235446792, "grad_norm": 8.171875, "learning_rate": 9.002595476455322e-06, "loss": 3.0958, "mean_token_accuracy": 0.4264875239923225, "step": 5380 }, { "epoch": 0.9975899147200593, "grad_norm": 8.203125, "learning_rate": 9.00241008527994e-06, "loss": 2.605, "mean_token_accuracy": 0.47257528913317304, "step": 5381 }, { "epoch": 0.9977753058954394, "grad_norm": 8.1796875, "learning_rate": 9.002224694104561e-06, "loss": 2.7325, "mean_token_accuracy": 0.47313371616078753, "step": 5382 }, { "epoch": 0.9979606970708195, "grad_norm": 7.546875, "learning_rate": 9.002039302929182e-06, "loss": 2.6217, "mean_token_accuracy": 0.4649734859056656, "step": 5383 }, { "epoch": 0.9981460882461994, "grad_norm": 6.98046875, "learning_rate": 9.001853911753802e-06, "loss": 2.4382, "mean_token_accuracy": 0.5214841259722045, "step": 5384 }, { "epoch": 0.9983314794215795, "grad_norm": 8.21875, "learning_rate": 9.001668520578421e-06, "loss": 2.9507, "mean_token_accuracy": 0.4603713768115942, "step": 5385 }, { "epoch": 0.9985168705969596, "grad_norm": 6.6015625, "learning_rate": 9.001483129403041e-06, "loss": 2.5004, "mean_token_accuracy": 0.49199300228771364, "step": 5386 }, { "epoch": 0.9987022617723397, "grad_norm": 7.265625, "learning_rate": 9.001297738227662e-06, "loss": 2.7984, "mean_token_accuracy": 0.4648922076357943, "step": 5387 }, { "epoch": 0.9988876529477196, "grad_norm": 12.0546875, "learning_rate": 9.00111234705228e-06, "loss": 2.629, "mean_token_accuracy": 0.4644888082274652, "step": 5388 }, { "epoch": 0.9990730441230997, "grad_norm": 7.83203125, "learning_rate": 9.000926955876901e-06, "loss": 3.0245, "mean_token_accuracy": 0.4527827116637063, "step": 5389 }, { "epoch": 0.9992584352984798, "grad_norm": 8.3671875, "learning_rate": 9.00074156470152e-06, "loss": 2.0859, "mean_token_accuracy": 0.5297951582867784, "step": 5390 }, { "epoch": 0.9994438264738599, "grad_norm": 11.0078125, "learning_rate": 9.00055617352614e-06, "loss": 3.0128, "mean_token_accuracy": 0.4428139835994821, "step": 5391 }, { "epoch": 0.9996292176492398, "grad_norm": 8.4765625, "learning_rate": 9.000370782350761e-06, "loss": 2.9123, "mean_token_accuracy": 0.44999464036874265, "step": 5392 }, { "epoch": 0.9998146088246199, "grad_norm": 7.53515625, "learning_rate": 9.000185391175381e-06, "loss": 2.8187, "mean_token_accuracy": 0.4531951640759931, "step": 5393 }, { "epoch": 1.0, "grad_norm": 8.6015625, "learning_rate": 9e-06, "loss": 2.6479, "mean_token_accuracy": 0.47234957020057305, "step": 5394 }, { "epoch": 1.00018539117538, "grad_norm": 7.25, "learning_rate": 8.99981460882462e-06, "loss": 2.7427, "mean_token_accuracy": 0.45970028351559333, "step": 5395 }, { "epoch": 1.0003707823507602, "grad_norm": 6.48046875, "learning_rate": 8.999629217649241e-06, "loss": 2.4549, "mean_token_accuracy": 0.4942700548081714, "step": 5396 }, { "epoch": 1.0005561735261401, "grad_norm": 5.8046875, "learning_rate": 8.99944382647386e-06, "loss": 2.8212, "mean_token_accuracy": 0.46564102564102566, "step": 5397 }, { "epoch": 1.0007415647015203, "grad_norm": 5.671875, "learning_rate": 8.99925843529848e-06, "loss": 2.6737, "mean_token_accuracy": 0.4762461059190031, "step": 5398 }, { "epoch": 1.0009269558769003, "grad_norm": 8.0859375, "learning_rate": 8.9990730441231e-06, "loss": 2.7601, "mean_token_accuracy": 0.48086188436830835, "step": 5399 }, { "epoch": 1.0011123470522802, "grad_norm": 6.15625, "learning_rate": 8.998887652947721e-06, "loss": 3.1373, "mean_token_accuracy": 0.4137083141232519, "step": 5400 }, { "epoch": 1.0012977382276604, "grad_norm": 7.9375, "learning_rate": 8.99870226177234e-06, "loss": 2.8595, "mean_token_accuracy": 0.46283362434522324, "step": 5401 }, { "epoch": 1.0014831294030404, "grad_norm": 7.7265625, "learning_rate": 8.99851687059696e-06, "loss": 2.987, "mean_token_accuracy": 0.4263896103896104, "step": 5402 }, { "epoch": 1.0016685205784204, "grad_norm": 9.84375, "learning_rate": 8.99833147942158e-06, "loss": 2.8224, "mean_token_accuracy": 0.440157771477875, "step": 5403 }, { "epoch": 1.0018539117538006, "grad_norm": 11.21875, "learning_rate": 8.9981460882462e-06, "loss": 3.122, "mean_token_accuracy": 0.3997254197908966, "step": 5404 }, { "epoch": 1.0020393029291805, "grad_norm": 7.17578125, "learning_rate": 8.99796069707082e-06, "loss": 2.9079, "mean_token_accuracy": 0.444659193402912, "step": 5405 }, { "epoch": 1.0022246941045607, "grad_norm": 8.3046875, "learning_rate": 8.99777530589544e-06, "loss": 2.8867, "mean_token_accuracy": 0.42133566783391696, "step": 5406 }, { "epoch": 1.0024100852799407, "grad_norm": 6.5703125, "learning_rate": 8.99758991472006e-06, "loss": 2.3106, "mean_token_accuracy": 0.5282411820781697, "step": 5407 }, { "epoch": 1.0025954764553207, "grad_norm": 6.546875, "learning_rate": 8.99740452354468e-06, "loss": 2.8621, "mean_token_accuracy": 0.4527790329868956, "step": 5408 }, { "epoch": 1.0027808676307008, "grad_norm": 8.2734375, "learning_rate": 8.9972191323693e-06, "loss": 2.6703, "mean_token_accuracy": 0.47613252197430694, "step": 5409 }, { "epoch": 1.0029662588060808, "grad_norm": 6.5078125, "learning_rate": 8.99703374119392e-06, "loss": 2.4359, "mean_token_accuracy": 0.48895582329317266, "step": 5410 }, { "epoch": 1.0031516499814608, "grad_norm": 7.328125, "learning_rate": 8.99684835001854e-06, "loss": 2.3395, "mean_token_accuracy": 0.5009856100926473, "step": 5411 }, { "epoch": 1.003337041156841, "grad_norm": 9.5859375, "learning_rate": 8.99666295884316e-06, "loss": 2.7966, "mean_token_accuracy": 0.4670528602461984, "step": 5412 }, { "epoch": 1.003522432332221, "grad_norm": 7.55078125, "learning_rate": 8.99647756766778e-06, "loss": 3.3766, "mean_token_accuracy": 0.39555498458376154, "step": 5413 }, { "epoch": 1.0037078235076011, "grad_norm": 7.18359375, "learning_rate": 8.9962921764924e-06, "loss": 2.6531, "mean_token_accuracy": 0.46013356957884693, "step": 5414 }, { "epoch": 1.003893214682981, "grad_norm": 11.6484375, "learning_rate": 8.996106785317019e-06, "loss": 2.3123, "mean_token_accuracy": 0.5195154777927322, "step": 5415 }, { "epoch": 1.004078605858361, "grad_norm": 7.54296875, "learning_rate": 8.995921394141641e-06, "loss": 2.4731, "mean_token_accuracy": 0.4807906741003548, "step": 5416 }, { "epoch": 1.0042639970337413, "grad_norm": 8.71875, "learning_rate": 8.99573600296626e-06, "loss": 2.9618, "mean_token_accuracy": 0.4386507189893831, "step": 5417 }, { "epoch": 1.0044493882091212, "grad_norm": 7.578125, "learning_rate": 8.99555061179088e-06, "loss": 2.5025, "mean_token_accuracy": 0.5083069118579582, "step": 5418 }, { "epoch": 1.0046347793845012, "grad_norm": 6.7265625, "learning_rate": 8.995365220615499e-06, "loss": 2.4786, "mean_token_accuracy": 0.4991372368572415, "step": 5419 }, { "epoch": 1.0048201705598814, "grad_norm": 7.953125, "learning_rate": 8.99517982944012e-06, "loss": 2.7431, "mean_token_accuracy": 0.4607252612169637, "step": 5420 }, { "epoch": 1.0050055617352613, "grad_norm": 10.3359375, "learning_rate": 8.99499443826474e-06, "loss": 2.7897, "mean_token_accuracy": 0.46403054939640304, "step": 5421 }, { "epoch": 1.0051909529106415, "grad_norm": 5.484375, "learning_rate": 8.994809047089359e-06, "loss": 2.8096, "mean_token_accuracy": 0.47230993441826574, "step": 5422 }, { "epoch": 1.0053763440860215, "grad_norm": 9.1875, "learning_rate": 8.99462365591398e-06, "loss": 2.7145, "mean_token_accuracy": 0.4539916639948702, "step": 5423 }, { "epoch": 1.0055617352614015, "grad_norm": 7.51171875, "learning_rate": 8.9944382647386e-06, "loss": 2.5722, "mean_token_accuracy": 0.4759963768115942, "step": 5424 }, { "epoch": 1.0057471264367817, "grad_norm": 7.15625, "learning_rate": 8.99425287356322e-06, "loss": 2.7145, "mean_token_accuracy": 0.47188441949901255, "step": 5425 }, { "epoch": 1.0059325176121616, "grad_norm": 6.8203125, "learning_rate": 8.994067482387839e-06, "loss": 2.8991, "mean_token_accuracy": 0.4482009176527409, "step": 5426 }, { "epoch": 1.0061179087875418, "grad_norm": 7.62109375, "learning_rate": 8.99388209121246e-06, "loss": 2.5275, "mean_token_accuracy": 0.48938079569249177, "step": 5427 }, { "epoch": 1.0063032999629218, "grad_norm": 6.38671875, "learning_rate": 8.993696700037078e-06, "loss": 3.03, "mean_token_accuracy": 0.44650062613051344, "step": 5428 }, { "epoch": 1.0064886911383017, "grad_norm": 6.26171875, "learning_rate": 8.993511308861699e-06, "loss": 2.8335, "mean_token_accuracy": 0.4438024019941083, "step": 5429 }, { "epoch": 1.006674082313682, "grad_norm": 8.8828125, "learning_rate": 8.99332591768632e-06, "loss": 2.4428, "mean_token_accuracy": 0.4889772727272727, "step": 5430 }, { "epoch": 1.006859473489062, "grad_norm": 9.03125, "learning_rate": 8.993140526510938e-06, "loss": 2.7231, "mean_token_accuracy": 0.4471385738495633, "step": 5431 }, { "epoch": 1.0070448646644419, "grad_norm": 6.23828125, "learning_rate": 8.992955135335559e-06, "loss": 3.0997, "mean_token_accuracy": 0.43875735596003834, "step": 5432 }, { "epoch": 1.007230255839822, "grad_norm": 9.46875, "learning_rate": 8.992769744160179e-06, "loss": 2.8572, "mean_token_accuracy": 0.4532510206113711, "step": 5433 }, { "epoch": 1.007415647015202, "grad_norm": 13.8125, "learning_rate": 8.9925843529848e-06, "loss": 2.4115, "mean_token_accuracy": 0.5002764976958526, "step": 5434 }, { "epoch": 1.0076010381905822, "grad_norm": 8.5546875, "learning_rate": 8.992398961809418e-06, "loss": 2.9994, "mean_token_accuracy": 0.42680180180180183, "step": 5435 }, { "epoch": 1.0077864293659622, "grad_norm": 5.51953125, "learning_rate": 8.992213570634039e-06, "loss": 2.3776, "mean_token_accuracy": 0.49168314528323837, "step": 5436 }, { "epoch": 1.0079718205413422, "grad_norm": 9.046875, "learning_rate": 8.992028179458658e-06, "loss": 3.1377, "mean_token_accuracy": 0.4369191597708466, "step": 5437 }, { "epoch": 1.0081572117167223, "grad_norm": 6.69140625, "learning_rate": 8.991842788283278e-06, "loss": 2.5676, "mean_token_accuracy": 0.4746033862151713, "step": 5438 }, { "epoch": 1.0083426028921023, "grad_norm": 10.0859375, "learning_rate": 8.991657397107899e-06, "loss": 2.5001, "mean_token_accuracy": 0.4820772058823529, "step": 5439 }, { "epoch": 1.0085279940674823, "grad_norm": 7.41796875, "learning_rate": 8.991472005932519e-06, "loss": 2.8444, "mean_token_accuracy": 0.462759881197167, "step": 5440 }, { "epoch": 1.0087133852428625, "grad_norm": 8.9453125, "learning_rate": 8.991286614757138e-06, "loss": 2.201, "mean_token_accuracy": 0.5431593312862504, "step": 5441 }, { "epoch": 1.0088987764182424, "grad_norm": 9.0, "learning_rate": 8.991101223581758e-06, "loss": 2.6431, "mean_token_accuracy": 0.46113400596845244, "step": 5442 }, { "epoch": 1.0090841675936226, "grad_norm": 10.2890625, "learning_rate": 8.990915832406379e-06, "loss": 2.4818, "mean_token_accuracy": 0.49086651053864166, "step": 5443 }, { "epoch": 1.0092695587690026, "grad_norm": 6.24609375, "learning_rate": 8.990730441230998e-06, "loss": 2.5717, "mean_token_accuracy": 0.4621790423317141, "step": 5444 }, { "epoch": 1.0094549499443826, "grad_norm": 7.61328125, "learning_rate": 8.990545050055618e-06, "loss": 3.2259, "mean_token_accuracy": 0.41433189655172414, "step": 5445 }, { "epoch": 1.0096403411197628, "grad_norm": 5.79296875, "learning_rate": 8.990359658880237e-06, "loss": 2.4473, "mean_token_accuracy": 0.5157223992242693, "step": 5446 }, { "epoch": 1.0098257322951427, "grad_norm": 7.03515625, "learning_rate": 8.990174267704857e-06, "loss": 2.4858, "mean_token_accuracy": 0.4995558187740598, "step": 5447 }, { "epoch": 1.010011123470523, "grad_norm": 7.03515625, "learning_rate": 8.989988876529478e-06, "loss": 2.5313, "mean_token_accuracy": 0.47881164228623135, "step": 5448 }, { "epoch": 1.0101965146459029, "grad_norm": 7.484375, "learning_rate": 8.989803485354098e-06, "loss": 3.1031, "mean_token_accuracy": 0.43690065209052553, "step": 5449 }, { "epoch": 1.0103819058212828, "grad_norm": 6.390625, "learning_rate": 8.989618094178719e-06, "loss": 2.9093, "mean_token_accuracy": 0.4510751791965328, "step": 5450 }, { "epoch": 1.010567296996663, "grad_norm": 5.6640625, "learning_rate": 8.989432703003338e-06, "loss": 2.5987, "mean_token_accuracy": 0.48045700541190617, "step": 5451 }, { "epoch": 1.010752688172043, "grad_norm": 4.95703125, "learning_rate": 8.989247311827958e-06, "loss": 2.5317, "mean_token_accuracy": 0.48695762819697164, "step": 5452 }, { "epoch": 1.010938079347423, "grad_norm": 6.5625, "learning_rate": 8.989061920652577e-06, "loss": 2.8766, "mean_token_accuracy": 0.4480246567666013, "step": 5453 }, { "epoch": 1.0111234705228032, "grad_norm": 5.0625, "learning_rate": 8.988876529477198e-06, "loss": 3.3495, "mean_token_accuracy": 0.41807335956339636, "step": 5454 }, { "epoch": 1.0113088616981831, "grad_norm": 6.4453125, "learning_rate": 8.988691138301818e-06, "loss": 3.1902, "mean_token_accuracy": 0.4173813607775872, "step": 5455 }, { "epoch": 1.0114942528735633, "grad_norm": 7.47265625, "learning_rate": 8.988505747126437e-06, "loss": 2.925, "mean_token_accuracy": 0.4321966693100714, "step": 5456 }, { "epoch": 1.0116796440489433, "grad_norm": 8.1328125, "learning_rate": 8.988320355951057e-06, "loss": 2.6556, "mean_token_accuracy": 0.5039230574538092, "step": 5457 }, { "epoch": 1.0118650352243233, "grad_norm": 5.95703125, "learning_rate": 8.988134964775678e-06, "loss": 2.7498, "mean_token_accuracy": 0.46423594983743616, "step": 5458 }, { "epoch": 1.0120504263997034, "grad_norm": 11.3203125, "learning_rate": 8.987949573600298e-06, "loss": 3.0391, "mean_token_accuracy": 0.43049095607235144, "step": 5459 }, { "epoch": 1.0122358175750834, "grad_norm": 12.609375, "learning_rate": 8.987764182424917e-06, "loss": 2.5737, "mean_token_accuracy": 0.47603358486555425, "step": 5460 }, { "epoch": 1.0124212087504634, "grad_norm": 10.1796875, "learning_rate": 8.987578791249538e-06, "loss": 2.8516, "mean_token_accuracy": 0.4436400089106705, "step": 5461 }, { "epoch": 1.0126065999258436, "grad_norm": 6.953125, "learning_rate": 8.987393400074156e-06, "loss": 2.8708, "mean_token_accuracy": 0.4493557331593541, "step": 5462 }, { "epoch": 1.0127919911012235, "grad_norm": 8.734375, "learning_rate": 8.987208008898777e-06, "loss": 2.5812, "mean_token_accuracy": 0.4994443209602134, "step": 5463 }, { "epoch": 1.0129773822766037, "grad_norm": 11.09375, "learning_rate": 8.987022617723397e-06, "loss": 2.8234, "mean_token_accuracy": 0.44006776789495977, "step": 5464 }, { "epoch": 1.0131627734519837, "grad_norm": 8.390625, "learning_rate": 8.986837226548018e-06, "loss": 2.7436, "mean_token_accuracy": 0.45945366898768075, "step": 5465 }, { "epoch": 1.0133481646273637, "grad_norm": 6.59375, "learning_rate": 8.986651835372637e-06, "loss": 3.2904, "mean_token_accuracy": 0.4235717400784998, "step": 5466 }, { "epoch": 1.0135335558027438, "grad_norm": 7.8984375, "learning_rate": 8.986466444197257e-06, "loss": 2.8725, "mean_token_accuracy": 0.43998285224349815, "step": 5467 }, { "epoch": 1.0137189469781238, "grad_norm": 14.7890625, "learning_rate": 8.986281053021878e-06, "loss": 2.4853, "mean_token_accuracy": 0.4687905604719764, "step": 5468 }, { "epoch": 1.013904338153504, "grad_norm": 8.46875, "learning_rate": 8.986095661846496e-06, "loss": 3.0284, "mean_token_accuracy": 0.42964001870032725, "step": 5469 }, { "epoch": 1.014089729328884, "grad_norm": 6.30859375, "learning_rate": 8.985910270671117e-06, "loss": 2.8018, "mean_token_accuracy": 0.4569841484893646, "step": 5470 }, { "epoch": 1.014275120504264, "grad_norm": 10.0625, "learning_rate": 8.985724879495736e-06, "loss": 2.4392, "mean_token_accuracy": 0.4865923113431419, "step": 5471 }, { "epoch": 1.0144605116796441, "grad_norm": 9.09375, "learning_rate": 8.985539488320356e-06, "loss": 2.8995, "mean_token_accuracy": 0.43331284572833434, "step": 5472 }, { "epoch": 1.014645902855024, "grad_norm": 9.4375, "learning_rate": 8.985354097144977e-06, "loss": 2.9138, "mean_token_accuracy": 0.4381491973559962, "step": 5473 }, { "epoch": 1.014831294030404, "grad_norm": 5.12109375, "learning_rate": 8.985168705969597e-06, "loss": 2.9833, "mean_token_accuracy": 0.4370777510592007, "step": 5474 }, { "epoch": 1.0150166852057843, "grad_norm": 8.34375, "learning_rate": 8.984983314794216e-06, "loss": 3.2008, "mean_token_accuracy": 0.44190871369294604, "step": 5475 }, { "epoch": 1.0152020763811642, "grad_norm": 8.3515625, "learning_rate": 8.984797923618836e-06, "loss": 2.7833, "mean_token_accuracy": 0.45383867832847424, "step": 5476 }, { "epoch": 1.0153874675565444, "grad_norm": 5.4375, "learning_rate": 8.984612532443457e-06, "loss": 2.7174, "mean_token_accuracy": 0.45886567164179104, "step": 5477 }, { "epoch": 1.0155728587319244, "grad_norm": 4.92578125, "learning_rate": 8.984427141268076e-06, "loss": 2.5576, "mean_token_accuracy": 0.48444994584558254, "step": 5478 }, { "epoch": 1.0157582499073043, "grad_norm": 7.08984375, "learning_rate": 8.984241750092696e-06, "loss": 2.4713, "mean_token_accuracy": 0.49972572682391664, "step": 5479 }, { "epoch": 1.0159436410826845, "grad_norm": 11.2265625, "learning_rate": 8.984056358917315e-06, "loss": 2.9527, "mean_token_accuracy": 0.44204809930178435, "step": 5480 }, { "epoch": 1.0161290322580645, "grad_norm": 6.44921875, "learning_rate": 8.983870967741937e-06, "loss": 2.9697, "mean_token_accuracy": 0.4401859940820065, "step": 5481 }, { "epoch": 1.0163144234334445, "grad_norm": 5.3046875, "learning_rate": 8.983685576566556e-06, "loss": 2.6541, "mean_token_accuracy": 0.48264556246466933, "step": 5482 }, { "epoch": 1.0164998146088247, "grad_norm": 9.7734375, "learning_rate": 8.983500185391177e-06, "loss": 2.7513, "mean_token_accuracy": 0.445993031358885, "step": 5483 }, { "epoch": 1.0166852057842046, "grad_norm": 9.46875, "learning_rate": 8.983314794215795e-06, "loss": 2.4654, "mean_token_accuracy": 0.5058866813833701, "step": 5484 }, { "epoch": 1.0168705969595848, "grad_norm": 7.53515625, "learning_rate": 8.983129403040416e-06, "loss": 2.8264, "mean_token_accuracy": 0.44897619443982023, "step": 5485 }, { "epoch": 1.0170559881349648, "grad_norm": 7.94921875, "learning_rate": 8.982944011865036e-06, "loss": 2.7339, "mean_token_accuracy": 0.4598187311178248, "step": 5486 }, { "epoch": 1.0172413793103448, "grad_norm": 9.7265625, "learning_rate": 8.982758620689655e-06, "loss": 3.0269, "mean_token_accuracy": 0.4277370590929913, "step": 5487 }, { "epoch": 1.017426770485725, "grad_norm": 7.63671875, "learning_rate": 8.982573229514276e-06, "loss": 3.2608, "mean_token_accuracy": 0.40989825832040006, "step": 5488 }, { "epoch": 1.017612161661105, "grad_norm": 6.21484375, "learning_rate": 8.982387838338896e-06, "loss": 2.3214, "mean_token_accuracy": 0.5200439319055464, "step": 5489 }, { "epoch": 1.0177975528364849, "grad_norm": 8.453125, "learning_rate": 8.982202447163517e-06, "loss": 3.4448, "mean_token_accuracy": 0.4241750726619935, "step": 5490 }, { "epoch": 1.017982944011865, "grad_norm": 12.75, "learning_rate": 8.982017055988135e-06, "loss": 2.7642, "mean_token_accuracy": 0.4415407480802576, "step": 5491 }, { "epoch": 1.018168335187245, "grad_norm": 10.296875, "learning_rate": 8.981831664812756e-06, "loss": 2.7, "mean_token_accuracy": 0.4550252409362093, "step": 5492 }, { "epoch": 1.0183537263626252, "grad_norm": 5.8203125, "learning_rate": 8.981646273637376e-06, "loss": 2.8245, "mean_token_accuracy": 0.4846382556987116, "step": 5493 }, { "epoch": 1.0185391175380052, "grad_norm": 6.34375, "learning_rate": 8.981460882461995e-06, "loss": 3.0094, "mean_token_accuracy": 0.43735117360244924, "step": 5494 }, { "epoch": 1.0187245087133852, "grad_norm": 8.6640625, "learning_rate": 8.981275491286616e-06, "loss": 2.909, "mean_token_accuracy": 0.4720408742548964, "step": 5495 }, { "epoch": 1.0189098998887653, "grad_norm": 9.765625, "learning_rate": 8.981090100111234e-06, "loss": 2.7555, "mean_token_accuracy": 0.4686874081986914, "step": 5496 }, { "epoch": 1.0190952910641453, "grad_norm": 4.96875, "learning_rate": 8.980904708935857e-06, "loss": 2.4565, "mean_token_accuracy": 0.4967860422405877, "step": 5497 }, { "epoch": 1.0192806822395255, "grad_norm": 5.47265625, "learning_rate": 8.980719317760475e-06, "loss": 2.6386, "mean_token_accuracy": 0.46288274831964155, "step": 5498 }, { "epoch": 1.0194660734149055, "grad_norm": 6.61328125, "learning_rate": 8.980533926585096e-06, "loss": 2.3363, "mean_token_accuracy": 0.5300979934671022, "step": 5499 }, { "epoch": 1.0196514645902854, "grad_norm": 6.96875, "learning_rate": 8.980348535409715e-06, "loss": 3.0539, "mean_token_accuracy": 0.42207792207792205, "step": 5500 }, { "epoch": 1.0198368557656656, "grad_norm": 5.1171875, "learning_rate": 8.980163144234335e-06, "loss": 2.0198, "mean_token_accuracy": 0.5498520373321193, "step": 5501 }, { "epoch": 1.0200222469410456, "grad_norm": 6.4765625, "learning_rate": 8.979977753058956e-06, "loss": 2.5637, "mean_token_accuracy": 0.47623842126459925, "step": 5502 }, { "epoch": 1.0202076381164256, "grad_norm": 9.3359375, "learning_rate": 8.979792361883574e-06, "loss": 2.3095, "mean_token_accuracy": 0.4949611287071696, "step": 5503 }, { "epoch": 1.0203930292918058, "grad_norm": 6.01171875, "learning_rate": 8.979606970708195e-06, "loss": 2.6835, "mean_token_accuracy": 0.4731083575006591, "step": 5504 }, { "epoch": 1.0205784204671857, "grad_norm": 9.2734375, "learning_rate": 8.979421579532815e-06, "loss": 2.5066, "mean_token_accuracy": 0.47296322999279017, "step": 5505 }, { "epoch": 1.020763811642566, "grad_norm": 10.765625, "learning_rate": 8.979236188357436e-06, "loss": 2.2761, "mean_token_accuracy": 0.5217729393468118, "step": 5506 }, { "epoch": 1.0209492028179459, "grad_norm": 10.609375, "learning_rate": 8.979050797182055e-06, "loss": 2.862, "mean_token_accuracy": 0.44781923279033103, "step": 5507 }, { "epoch": 1.0211345939933258, "grad_norm": 7.69140625, "learning_rate": 8.978865406006675e-06, "loss": 3.0876, "mean_token_accuracy": 0.4302705223880597, "step": 5508 }, { "epoch": 1.021319985168706, "grad_norm": 7.9140625, "learning_rate": 8.978680014831294e-06, "loss": 2.5951, "mean_token_accuracy": 0.48896648044692737, "step": 5509 }, { "epoch": 1.021505376344086, "grad_norm": 8.4609375, "learning_rate": 8.978494623655915e-06, "loss": 2.8644, "mean_token_accuracy": 0.4558106169296987, "step": 5510 }, { "epoch": 1.021690767519466, "grad_norm": 6.44921875, "learning_rate": 8.978309232480535e-06, "loss": 3.4437, "mean_token_accuracy": 0.4086298042577511, "step": 5511 }, { "epoch": 1.0218761586948462, "grad_norm": 8.9375, "learning_rate": 8.978123841305154e-06, "loss": 2.1186, "mean_token_accuracy": 0.5306296431616124, "step": 5512 }, { "epoch": 1.0220615498702261, "grad_norm": 5.78515625, "learning_rate": 8.977938450129774e-06, "loss": 2.9658, "mean_token_accuracy": 0.43958705072648485, "step": 5513 }, { "epoch": 1.0222469410456063, "grad_norm": 6.1171875, "learning_rate": 8.977753058954395e-06, "loss": 2.9204, "mean_token_accuracy": 0.4574953070528292, "step": 5514 }, { "epoch": 1.0224323322209863, "grad_norm": 5.12109375, "learning_rate": 8.977567667779015e-06, "loss": 3.096, "mean_token_accuracy": 0.45022551546391754, "step": 5515 }, { "epoch": 1.0226177233963663, "grad_norm": 4.8984375, "learning_rate": 8.977382276603634e-06, "loss": 2.3322, "mean_token_accuracy": 0.5193633952254642, "step": 5516 }, { "epoch": 1.0228031145717464, "grad_norm": 8.234375, "learning_rate": 8.977196885428255e-06, "loss": 2.5278, "mean_token_accuracy": 0.4796555639666919, "step": 5517 }, { "epoch": 1.0229885057471264, "grad_norm": 6.3828125, "learning_rate": 8.977011494252873e-06, "loss": 3.1843, "mean_token_accuracy": 0.4291914483202058, "step": 5518 }, { "epoch": 1.0231738969225064, "grad_norm": 8.1484375, "learning_rate": 8.976826103077494e-06, "loss": 2.7885, "mean_token_accuracy": 0.4369158878504673, "step": 5519 }, { "epoch": 1.0233592880978866, "grad_norm": 5.44140625, "learning_rate": 8.976640711902114e-06, "loss": 2.662, "mean_token_accuracy": 0.47323369565217394, "step": 5520 }, { "epoch": 1.0235446792732665, "grad_norm": 6.91015625, "learning_rate": 8.976455320726735e-06, "loss": 2.5861, "mean_token_accuracy": 0.4894597235795058, "step": 5521 }, { "epoch": 1.0237300704486467, "grad_norm": 7.33984375, "learning_rate": 8.976269929551354e-06, "loss": 2.4696, "mean_token_accuracy": 0.48795108500179835, "step": 5522 }, { "epoch": 1.0239154616240267, "grad_norm": 5.7734375, "learning_rate": 8.976084538375974e-06, "loss": 2.5893, "mean_token_accuracy": 0.47368421052631576, "step": 5523 }, { "epoch": 1.0241008527994067, "grad_norm": 5.875, "learning_rate": 8.975899147200595e-06, "loss": 3.0833, "mean_token_accuracy": 0.44462613349189833, "step": 5524 }, { "epoch": 1.0242862439747868, "grad_norm": 7.36328125, "learning_rate": 8.975713756025213e-06, "loss": 2.3263, "mean_token_accuracy": 0.5102139489906926, "step": 5525 }, { "epoch": 1.0244716351501668, "grad_norm": 7.55078125, "learning_rate": 8.975528364849834e-06, "loss": 2.7095, "mean_token_accuracy": 0.4616011961397309, "step": 5526 }, { "epoch": 1.024657026325547, "grad_norm": 5.54296875, "learning_rate": 8.975342973674453e-06, "loss": 3.4361, "mean_token_accuracy": 0.4043360433604336, "step": 5527 }, { "epoch": 1.024842417500927, "grad_norm": 8.6015625, "learning_rate": 8.975157582499073e-06, "loss": 2.9329, "mean_token_accuracy": 0.4526143790849673, "step": 5528 }, { "epoch": 1.025027808676307, "grad_norm": 12.765625, "learning_rate": 8.974972191323694e-06, "loss": 3.8307, "mean_token_accuracy": 0.3935156540032174, "step": 5529 }, { "epoch": 1.0252131998516871, "grad_norm": 7.9375, "learning_rate": 8.974786800148314e-06, "loss": 2.0652, "mean_token_accuracy": 0.5345549738219896, "step": 5530 }, { "epoch": 1.025398591027067, "grad_norm": 7.703125, "learning_rate": 8.974601408972935e-06, "loss": 2.4594, "mean_token_accuracy": 0.4909354604786077, "step": 5531 }, { "epoch": 1.025583982202447, "grad_norm": 5.9921875, "learning_rate": 8.974416017797553e-06, "loss": 2.5402, "mean_token_accuracy": 0.4727535765954027, "step": 5532 }, { "epoch": 1.0257693733778273, "grad_norm": 12.46875, "learning_rate": 8.974230626622174e-06, "loss": 2.8028, "mean_token_accuracy": 0.43747587803936705, "step": 5533 }, { "epoch": 1.0259547645532072, "grad_norm": 6.59375, "learning_rate": 8.974045235446793e-06, "loss": 2.6011, "mean_token_accuracy": 0.48085558969510755, "step": 5534 }, { "epoch": 1.0261401557285874, "grad_norm": 7.1015625, "learning_rate": 8.973859844271413e-06, "loss": 2.6265, "mean_token_accuracy": 0.48827208756841284, "step": 5535 }, { "epoch": 1.0263255469039674, "grad_norm": 8.65625, "learning_rate": 8.973674453096032e-06, "loss": 2.4597, "mean_token_accuracy": 0.47446344542637436, "step": 5536 }, { "epoch": 1.0265109380793473, "grad_norm": 7.9921875, "learning_rate": 8.973489061920654e-06, "loss": 2.743, "mean_token_accuracy": 0.45961566247652075, "step": 5537 }, { "epoch": 1.0266963292547275, "grad_norm": 7.1328125, "learning_rate": 8.973303670745273e-06, "loss": 2.798, "mean_token_accuracy": 0.46198246198246196, "step": 5538 }, { "epoch": 1.0268817204301075, "grad_norm": 7.21484375, "learning_rate": 8.973118279569894e-06, "loss": 2.8432, "mean_token_accuracy": 0.45358649789029537, "step": 5539 }, { "epoch": 1.0270671116054875, "grad_norm": 6.29296875, "learning_rate": 8.972932888394514e-06, "loss": 2.3655, "mean_token_accuracy": 0.5073823651166, "step": 5540 }, { "epoch": 1.0272525027808677, "grad_norm": 7.0625, "learning_rate": 8.972747497219133e-06, "loss": 2.3819, "mean_token_accuracy": 0.5251381942616478, "step": 5541 }, { "epoch": 1.0274378939562476, "grad_norm": 7.81640625, "learning_rate": 8.972562106043753e-06, "loss": 2.521, "mean_token_accuracy": 0.4916988856038208, "step": 5542 }, { "epoch": 1.0276232851316278, "grad_norm": 7.515625, "learning_rate": 8.972376714868372e-06, "loss": 2.759, "mean_token_accuracy": 0.46452815120457874, "step": 5543 }, { "epoch": 1.0278086763070078, "grad_norm": 5.21875, "learning_rate": 8.972191323692993e-06, "loss": 2.1372, "mean_token_accuracy": 0.5526788003076134, "step": 5544 }, { "epoch": 1.0279940674823878, "grad_norm": 6.58203125, "learning_rate": 8.972005932517613e-06, "loss": 2.6057, "mean_token_accuracy": 0.4781368821292776, "step": 5545 }, { "epoch": 1.028179458657768, "grad_norm": 5.796875, "learning_rate": 8.971820541342234e-06, "loss": 3.1485, "mean_token_accuracy": 0.41814420803782504, "step": 5546 }, { "epoch": 1.028364849833148, "grad_norm": 5.76953125, "learning_rate": 8.971635150166852e-06, "loss": 3.0989, "mean_token_accuracy": 0.4389733421458471, "step": 5547 }, { "epoch": 1.028550241008528, "grad_norm": 6.89453125, "learning_rate": 8.971449758991473e-06, "loss": 2.2856, "mean_token_accuracy": 0.4929492587682295, "step": 5548 }, { "epoch": 1.028735632183908, "grad_norm": 6.0390625, "learning_rate": 8.971264367816093e-06, "loss": 2.8145, "mean_token_accuracy": 0.4703251541157984, "step": 5549 }, { "epoch": 1.028921023359288, "grad_norm": 5.796875, "learning_rate": 8.971078976640712e-06, "loss": 3.0712, "mean_token_accuracy": 0.4100673700266938, "step": 5550 }, { "epoch": 1.0291064145346682, "grad_norm": 6.421875, "learning_rate": 8.970893585465333e-06, "loss": 2.6634, "mean_token_accuracy": 0.4408254599701641, "step": 5551 }, { "epoch": 1.0292918057100482, "grad_norm": 8.65625, "learning_rate": 8.970708194289951e-06, "loss": 2.551, "mean_token_accuracy": 0.4813048933500627, "step": 5552 }, { "epoch": 1.0294771968854282, "grad_norm": 5.99609375, "learning_rate": 8.970522803114574e-06, "loss": 2.4918, "mean_token_accuracy": 0.48578924355050285, "step": 5553 }, { "epoch": 1.0296625880608083, "grad_norm": 7.1015625, "learning_rate": 8.970337411939192e-06, "loss": 2.2307, "mean_token_accuracy": 0.5139676322028657, "step": 5554 }, { "epoch": 1.0298479792361883, "grad_norm": 6.171875, "learning_rate": 8.970152020763813e-06, "loss": 2.502, "mean_token_accuracy": 0.4964777947932619, "step": 5555 }, { "epoch": 1.0300333704115685, "grad_norm": 7.49609375, "learning_rate": 8.969966629588432e-06, "loss": 2.998, "mean_token_accuracy": 0.438280725319006, "step": 5556 }, { "epoch": 1.0302187615869485, "grad_norm": 6.109375, "learning_rate": 8.969781238413052e-06, "loss": 2.5377, "mean_token_accuracy": 0.4846938775510204, "step": 5557 }, { "epoch": 1.0304041527623284, "grad_norm": 7.23828125, "learning_rate": 8.969595847237673e-06, "loss": 2.7634, "mean_token_accuracy": 0.4433853264009243, "step": 5558 }, { "epoch": 1.0305895439377086, "grad_norm": 10.0546875, "learning_rate": 8.969410456062292e-06, "loss": 3.5801, "mean_token_accuracy": 0.3630314649825195, "step": 5559 }, { "epoch": 1.0307749351130886, "grad_norm": 6.05859375, "learning_rate": 8.969225064886912e-06, "loss": 2.8917, "mean_token_accuracy": 0.4431622231065128, "step": 5560 }, { "epoch": 1.0309603262884686, "grad_norm": 8.1015625, "learning_rate": 8.969039673711532e-06, "loss": 2.6158, "mean_token_accuracy": 0.455750273822563, "step": 5561 }, { "epoch": 1.0311457174638488, "grad_norm": 14.9921875, "learning_rate": 8.968854282536153e-06, "loss": 2.1778, "mean_token_accuracy": 0.5068895126307732, "step": 5562 }, { "epoch": 1.0313311086392287, "grad_norm": 8.1953125, "learning_rate": 8.968668891360772e-06, "loss": 2.7626, "mean_token_accuracy": 0.44863096680397385, "step": 5563 }, { "epoch": 1.031516499814609, "grad_norm": 5.46484375, "learning_rate": 8.968483500185392e-06, "loss": 2.6416, "mean_token_accuracy": 0.47481092167670813, "step": 5564 }, { "epoch": 1.0317018909899889, "grad_norm": 7.71875, "learning_rate": 8.968298109010011e-06, "loss": 3.14, "mean_token_accuracy": 0.42704449415858814, "step": 5565 }, { "epoch": 1.0318872821653688, "grad_norm": 9.4609375, "learning_rate": 8.968112717834632e-06, "loss": 2.5653, "mean_token_accuracy": 0.476685189659338, "step": 5566 }, { "epoch": 1.032072673340749, "grad_norm": 8.9921875, "learning_rate": 8.967927326659252e-06, "loss": 2.7088, "mean_token_accuracy": 0.47718416090509114, "step": 5567 }, { "epoch": 1.032258064516129, "grad_norm": 7.390625, "learning_rate": 8.967741935483871e-06, "loss": 2.8832, "mean_token_accuracy": 0.45079212674027846, "step": 5568 }, { "epoch": 1.0324434556915092, "grad_norm": 6.828125, "learning_rate": 8.967556544308491e-06, "loss": 2.416, "mean_token_accuracy": 0.5176904176904177, "step": 5569 }, { "epoch": 1.0326288468668892, "grad_norm": 8.671875, "learning_rate": 8.967371153133112e-06, "loss": 2.6533, "mean_token_accuracy": 0.4761355443403028, "step": 5570 }, { "epoch": 1.0328142380422691, "grad_norm": 7.6953125, "learning_rate": 8.967185761957732e-06, "loss": 2.9078, "mean_token_accuracy": 0.4454123112659698, "step": 5571 }, { "epoch": 1.0329996292176493, "grad_norm": 7.77734375, "learning_rate": 8.967000370782351e-06, "loss": 2.7456, "mean_token_accuracy": 0.4697633654688869, "step": 5572 }, { "epoch": 1.0331850203930293, "grad_norm": 11.2109375, "learning_rate": 8.966814979606972e-06, "loss": 2.4755, "mean_token_accuracy": 0.48849441157133466, "step": 5573 }, { "epoch": 1.0333704115684093, "grad_norm": 8.078125, "learning_rate": 8.966629588431592e-06, "loss": 2.6445, "mean_token_accuracy": 0.4667390714091773, "step": 5574 }, { "epoch": 1.0335558027437894, "grad_norm": 8.8359375, "learning_rate": 8.966444197256211e-06, "loss": 2.5863, "mean_token_accuracy": 0.4709214938143138, "step": 5575 }, { "epoch": 1.0337411939191694, "grad_norm": 10.140625, "learning_rate": 8.966258806080831e-06, "loss": 2.7975, "mean_token_accuracy": 0.44244946492271103, "step": 5576 }, { "epoch": 1.0339265850945496, "grad_norm": 10.25, "learning_rate": 8.96607341490545e-06, "loss": 2.6415, "mean_token_accuracy": 0.4659902292371289, "step": 5577 }, { "epoch": 1.0341119762699296, "grad_norm": 6.66015625, "learning_rate": 8.965888023730072e-06, "loss": 2.8957, "mean_token_accuracy": 0.4733075874602453, "step": 5578 }, { "epoch": 1.0342973674453095, "grad_norm": 6.796875, "learning_rate": 8.965702632554691e-06, "loss": 2.4616, "mean_token_accuracy": 0.5248847193512483, "step": 5579 }, { "epoch": 1.0344827586206897, "grad_norm": 5.80078125, "learning_rate": 8.965517241379312e-06, "loss": 2.8631, "mean_token_accuracy": 0.44561495917829863, "step": 5580 }, { "epoch": 1.0346681497960697, "grad_norm": 8.5390625, "learning_rate": 8.96533185020393e-06, "loss": 2.324, "mean_token_accuracy": 0.49885233358837033, "step": 5581 }, { "epoch": 1.0348535409714497, "grad_norm": 7.13671875, "learning_rate": 8.965146459028551e-06, "loss": 2.4044, "mean_token_accuracy": 0.49252316764953663, "step": 5582 }, { "epoch": 1.0350389321468298, "grad_norm": 5.15625, "learning_rate": 8.964961067853171e-06, "loss": 2.5914, "mean_token_accuracy": 0.48448197170241897, "step": 5583 }, { "epoch": 1.0352243233222098, "grad_norm": 7.0625, "learning_rate": 8.96477567667779e-06, "loss": 3.13, "mean_token_accuracy": 0.4157202630005977, "step": 5584 }, { "epoch": 1.03540971449759, "grad_norm": 11.203125, "learning_rate": 8.96459028550241e-06, "loss": 2.39, "mean_token_accuracy": 0.519563090968373, "step": 5585 }, { "epoch": 1.03559510567297, "grad_norm": 7.05078125, "learning_rate": 8.964404894327031e-06, "loss": 2.9875, "mean_token_accuracy": 0.4204845814977974, "step": 5586 }, { "epoch": 1.03578049684835, "grad_norm": 8.375, "learning_rate": 8.964219503151652e-06, "loss": 2.4857, "mean_token_accuracy": 0.47370133391172325, "step": 5587 }, { "epoch": 1.0359658880237301, "grad_norm": 6.05859375, "learning_rate": 8.96403411197627e-06, "loss": 2.5342, "mean_token_accuracy": 0.47273974168476396, "step": 5588 }, { "epoch": 1.03615127919911, "grad_norm": 8.203125, "learning_rate": 8.963848720800891e-06, "loss": 2.5891, "mean_token_accuracy": 0.4813193228254524, "step": 5589 }, { "epoch": 1.03633667037449, "grad_norm": 6.4140625, "learning_rate": 8.96366332962551e-06, "loss": 2.5965, "mean_token_accuracy": 0.46768328445747803, "step": 5590 }, { "epoch": 1.0365220615498703, "grad_norm": 7.83203125, "learning_rate": 8.96347793845013e-06, "loss": 2.7904, "mean_token_accuracy": 0.4707792207792208, "step": 5591 }, { "epoch": 1.0367074527252502, "grad_norm": 7.23828125, "learning_rate": 8.96329254727475e-06, "loss": 2.4898, "mean_token_accuracy": 0.4925648273319405, "step": 5592 }, { "epoch": 1.0368928439006304, "grad_norm": 8.3515625, "learning_rate": 8.96310715609937e-06, "loss": 3.3168, "mean_token_accuracy": 0.4000669194735668, "step": 5593 }, { "epoch": 1.0370782350760104, "grad_norm": 7.8515625, "learning_rate": 8.96292176492399e-06, "loss": 2.842, "mean_token_accuracy": 0.4589775734782003, "step": 5594 }, { "epoch": 1.0372636262513903, "grad_norm": 6.60546875, "learning_rate": 8.96273637374861e-06, "loss": 3.0455, "mean_token_accuracy": 0.4381054897739505, "step": 5595 }, { "epoch": 1.0374490174267705, "grad_norm": 5.984375, "learning_rate": 8.962550982573231e-06, "loss": 2.3983, "mean_token_accuracy": 0.4983991462113127, "step": 5596 }, { "epoch": 1.0376344086021505, "grad_norm": 10.8125, "learning_rate": 8.96236559139785e-06, "loss": 2.5216, "mean_token_accuracy": 0.48427754677754675, "step": 5597 }, { "epoch": 1.0378197997775307, "grad_norm": 7.453125, "learning_rate": 8.96218020022247e-06, "loss": 3.0371, "mean_token_accuracy": 0.4277363729358469, "step": 5598 }, { "epoch": 1.0380051909529107, "grad_norm": 6.44140625, "learning_rate": 8.961994809047089e-06, "loss": 3.1914, "mean_token_accuracy": 0.4148964418481147, "step": 5599 }, { "epoch": 1.0381905821282906, "grad_norm": 8.515625, "learning_rate": 8.96180941787171e-06, "loss": 2.1297, "mean_token_accuracy": 0.5437125748502994, "step": 5600 }, { "epoch": 1.0383759733036708, "grad_norm": 6.203125, "learning_rate": 8.96162402669633e-06, "loss": 2.8923, "mean_token_accuracy": 0.44300737338329504, "step": 5601 }, { "epoch": 1.0385613644790508, "grad_norm": 6.51953125, "learning_rate": 8.96143863552095e-06, "loss": 2.8181, "mean_token_accuracy": 0.4305235903337169, "step": 5602 }, { "epoch": 1.0387467556544308, "grad_norm": 5.9296875, "learning_rate": 8.96125324434557e-06, "loss": 3.0045, "mean_token_accuracy": 0.4406193378356119, "step": 5603 }, { "epoch": 1.038932146829811, "grad_norm": 6.46875, "learning_rate": 8.96106785317019e-06, "loss": 2.9437, "mean_token_accuracy": 0.4326349382544748, "step": 5604 }, { "epoch": 1.039117538005191, "grad_norm": 6.078125, "learning_rate": 8.96088246199481e-06, "loss": 2.7796, "mean_token_accuracy": 0.47644927536231885, "step": 5605 }, { "epoch": 1.039302929180571, "grad_norm": 6.2109375, "learning_rate": 8.96069707081943e-06, "loss": 2.9562, "mean_token_accuracy": 0.4398704902867715, "step": 5606 }, { "epoch": 1.039488320355951, "grad_norm": 6.40625, "learning_rate": 8.96051167964405e-06, "loss": 2.75, "mean_token_accuracy": 0.46301564722617355, "step": 5607 }, { "epoch": 1.039673711531331, "grad_norm": 6.546875, "learning_rate": 8.960326288468668e-06, "loss": 2.8534, "mean_token_accuracy": 0.45798367628463044, "step": 5608 }, { "epoch": 1.0398591027067112, "grad_norm": 8.078125, "learning_rate": 8.960140897293289e-06, "loss": 2.9492, "mean_token_accuracy": 0.42568318151521883, "step": 5609 }, { "epoch": 1.0400444938820912, "grad_norm": 7.7421875, "learning_rate": 8.95995550611791e-06, "loss": 2.7774, "mean_token_accuracy": 0.4832951945080092, "step": 5610 }, { "epoch": 1.0402298850574712, "grad_norm": 8.8515625, "learning_rate": 8.95977011494253e-06, "loss": 2.547, "mean_token_accuracy": 0.49680453394429036, "step": 5611 }, { "epoch": 1.0404152762328513, "grad_norm": 7.1484375, "learning_rate": 8.95958472376715e-06, "loss": 2.7894, "mean_token_accuracy": 0.4746865389164892, "step": 5612 }, { "epoch": 1.0406006674082313, "grad_norm": 5.83984375, "learning_rate": 8.95939933259177e-06, "loss": 3.0426, "mean_token_accuracy": 0.4395138496325608, "step": 5613 }, { "epoch": 1.0407860585836115, "grad_norm": 6.51171875, "learning_rate": 8.95921394141639e-06, "loss": 3.11, "mean_token_accuracy": 0.41501605995717344, "step": 5614 }, { "epoch": 1.0409714497589915, "grad_norm": 5.80078125, "learning_rate": 8.959028550241009e-06, "loss": 2.5915, "mean_token_accuracy": 0.46768828700403897, "step": 5615 }, { "epoch": 1.0411568409343714, "grad_norm": 5.40234375, "learning_rate": 8.958843159065629e-06, "loss": 2.5451, "mean_token_accuracy": 0.47551766138855056, "step": 5616 }, { "epoch": 1.0413422321097516, "grad_norm": 5.0703125, "learning_rate": 8.958657767890248e-06, "loss": 2.9057, "mean_token_accuracy": 0.4242699545749513, "step": 5617 }, { "epoch": 1.0415276232851316, "grad_norm": 6.63671875, "learning_rate": 8.95847237671487e-06, "loss": 2.6428, "mean_token_accuracy": 0.4813992951311839, "step": 5618 }, { "epoch": 1.0417130144605116, "grad_norm": 6.84765625, "learning_rate": 8.958286985539489e-06, "loss": 2.5617, "mean_token_accuracy": 0.4784663156473606, "step": 5619 }, { "epoch": 1.0418984056358918, "grad_norm": 6.1328125, "learning_rate": 8.95810159436411e-06, "loss": 2.5549, "mean_token_accuracy": 0.48664487771892256, "step": 5620 }, { "epoch": 1.0420837968112717, "grad_norm": 5.6875, "learning_rate": 8.95791620318873e-06, "loss": 2.2521, "mean_token_accuracy": 0.5407463006648081, "step": 5621 }, { "epoch": 1.042269187986652, "grad_norm": 7.7265625, "learning_rate": 8.957730812013349e-06, "loss": 2.3945, "mean_token_accuracy": 0.48872472783825816, "step": 5622 }, { "epoch": 1.0424545791620319, "grad_norm": 6.984375, "learning_rate": 8.957545420837969e-06, "loss": 2.6269, "mean_token_accuracy": 0.45908100819417075, "step": 5623 }, { "epoch": 1.0426399703374118, "grad_norm": 6.3828125, "learning_rate": 8.957360029662588e-06, "loss": 2.3866, "mean_token_accuracy": 0.48110140215403374, "step": 5624 }, { "epoch": 1.042825361512792, "grad_norm": 5.6484375, "learning_rate": 8.957174638487208e-06, "loss": 2.9371, "mean_token_accuracy": 0.44050818470559494, "step": 5625 }, { "epoch": 1.043010752688172, "grad_norm": 7.140625, "learning_rate": 8.956989247311829e-06, "loss": 3.4247, "mean_token_accuracy": 0.4192400413270577, "step": 5626 }, { "epoch": 1.0431961438635522, "grad_norm": 6.5, "learning_rate": 8.95680385613645e-06, "loss": 2.5695, "mean_token_accuracy": 0.4887134745880072, "step": 5627 }, { "epoch": 1.0433815350389322, "grad_norm": 6.73046875, "learning_rate": 8.956618464961068e-06, "loss": 2.6947, "mean_token_accuracy": 0.4662992306809048, "step": 5628 }, { "epoch": 1.0435669262143121, "grad_norm": 7.796875, "learning_rate": 8.956433073785689e-06, "loss": 2.4385, "mean_token_accuracy": 0.5011981566820276, "step": 5629 }, { "epoch": 1.0437523173896923, "grad_norm": 9.0859375, "learning_rate": 8.956247682610309e-06, "loss": 2.8683, "mean_token_accuracy": 0.45062549906840565, "step": 5630 }, { "epoch": 1.0439377085650723, "grad_norm": 6.3046875, "learning_rate": 8.956062291434928e-06, "loss": 2.7964, "mean_token_accuracy": 0.4394172853340804, "step": 5631 }, { "epoch": 1.0441230997404523, "grad_norm": 6.48828125, "learning_rate": 8.955876900259548e-06, "loss": 2.7648, "mean_token_accuracy": 0.4543467702768334, "step": 5632 }, { "epoch": 1.0443084909158324, "grad_norm": 6.109375, "learning_rate": 8.955691509084167e-06, "loss": 2.4697, "mean_token_accuracy": 0.4915514592933948, "step": 5633 }, { "epoch": 1.0444938820912124, "grad_norm": 8.3359375, "learning_rate": 8.95550611790879e-06, "loss": 3.0593, "mean_token_accuracy": 0.4274000252302258, "step": 5634 }, { "epoch": 1.0446792732665926, "grad_norm": 6.4375, "learning_rate": 8.955320726733408e-06, "loss": 2.7248, "mean_token_accuracy": 0.4629392656757555, "step": 5635 }, { "epoch": 1.0448646644419726, "grad_norm": 7.14453125, "learning_rate": 8.955135335558029e-06, "loss": 2.6308, "mean_token_accuracy": 0.4556735144078312, "step": 5636 }, { "epoch": 1.0450500556173525, "grad_norm": 8.5078125, "learning_rate": 8.954949944382647e-06, "loss": 2.6641, "mean_token_accuracy": 0.4824143756719398, "step": 5637 }, { "epoch": 1.0452354467927327, "grad_norm": 6.8515625, "learning_rate": 8.954764553207268e-06, "loss": 2.2615, "mean_token_accuracy": 0.5200527414569828, "step": 5638 }, { "epoch": 1.0454208379681127, "grad_norm": 7.0078125, "learning_rate": 8.954579162031888e-06, "loss": 3.0472, "mean_token_accuracy": 0.42877353357275005, "step": 5639 }, { "epoch": 1.0456062291434929, "grad_norm": 7.25, "learning_rate": 8.954393770856507e-06, "loss": 3.158, "mean_token_accuracy": 0.43897291593387266, "step": 5640 }, { "epoch": 1.0457916203188728, "grad_norm": 7.80859375, "learning_rate": 8.954208379681128e-06, "loss": 3.0234, "mean_token_accuracy": 0.43511053315994797, "step": 5641 }, { "epoch": 1.0459770114942528, "grad_norm": 8.1875, "learning_rate": 8.954022988505748e-06, "loss": 2.762, "mean_token_accuracy": 0.46043460434604344, "step": 5642 }, { "epoch": 1.046162402669633, "grad_norm": 6.67578125, "learning_rate": 8.953837597330369e-06, "loss": 3.1795, "mean_token_accuracy": 0.4272343791194305, "step": 5643 }, { "epoch": 1.046347793845013, "grad_norm": 8.46875, "learning_rate": 8.953652206154988e-06, "loss": 2.6132, "mean_token_accuracy": 0.4865735767991407, "step": 5644 }, { "epoch": 1.046533185020393, "grad_norm": 10.921875, "learning_rate": 8.953466814979608e-06, "loss": 1.9001, "mean_token_accuracy": 0.5672187567218757, "step": 5645 }, { "epoch": 1.0467185761957731, "grad_norm": 8.0546875, "learning_rate": 8.953281423804227e-06, "loss": 2.8291, "mean_token_accuracy": 0.4414668547249647, "step": 5646 }, { "epoch": 1.046903967371153, "grad_norm": 8.4453125, "learning_rate": 8.953096032628847e-06, "loss": 2.5627, "mean_token_accuracy": 0.4692361042306549, "step": 5647 }, { "epoch": 1.0470893585465333, "grad_norm": 7.0859375, "learning_rate": 8.952910641453468e-06, "loss": 2.5749, "mean_token_accuracy": 0.49135856486545615, "step": 5648 }, { "epoch": 1.0472747497219133, "grad_norm": 5.1875, "learning_rate": 8.952725250278087e-06, "loss": 2.6549, "mean_token_accuracy": 0.4620816702486951, "step": 5649 }, { "epoch": 1.0474601408972932, "grad_norm": 6.97265625, "learning_rate": 8.952539859102709e-06, "loss": 2.6919, "mean_token_accuracy": 0.4596832776405637, "step": 5650 }, { "epoch": 1.0476455320726734, "grad_norm": 8.53125, "learning_rate": 8.952354467927328e-06, "loss": 3.01, "mean_token_accuracy": 0.44061566735584656, "step": 5651 }, { "epoch": 1.0478309232480534, "grad_norm": 7.37890625, "learning_rate": 8.952169076751948e-06, "loss": 2.2429, "mean_token_accuracy": 0.538016628509459, "step": 5652 }, { "epoch": 1.0480163144234333, "grad_norm": 5.87109375, "learning_rate": 8.951983685576567e-06, "loss": 2.6003, "mean_token_accuracy": 0.457256046705588, "step": 5653 }, { "epoch": 1.0482017055988135, "grad_norm": 8.328125, "learning_rate": 8.951798294401187e-06, "loss": 2.6015, "mean_token_accuracy": 0.48035298035298035, "step": 5654 }, { "epoch": 1.0483870967741935, "grad_norm": 6.3828125, "learning_rate": 8.951612903225806e-06, "loss": 2.9956, "mean_token_accuracy": 0.4323971260613978, "step": 5655 }, { "epoch": 1.0485724879495737, "grad_norm": 7.9609375, "learning_rate": 8.951427512050427e-06, "loss": 2.7272, "mean_token_accuracy": 0.4562591714640479, "step": 5656 }, { "epoch": 1.0487578791249537, "grad_norm": 7.01953125, "learning_rate": 8.951242120875047e-06, "loss": 2.9276, "mean_token_accuracy": 0.44190900236538194, "step": 5657 }, { "epoch": 1.0489432703003336, "grad_norm": 6.30859375, "learning_rate": 8.951056729699668e-06, "loss": 3.2293, "mean_token_accuracy": 0.42861804535813347, "step": 5658 }, { "epoch": 1.0491286614757138, "grad_norm": 8.046875, "learning_rate": 8.950871338524288e-06, "loss": 2.7459, "mean_token_accuracy": 0.47488055595772405, "step": 5659 }, { "epoch": 1.0493140526510938, "grad_norm": 9.9140625, "learning_rate": 8.950685947348907e-06, "loss": 2.8, "mean_token_accuracy": 0.4492506396978189, "step": 5660 }, { "epoch": 1.0494994438264738, "grad_norm": 6.703125, "learning_rate": 8.950500556173527e-06, "loss": 2.8064, "mean_token_accuracy": 0.4726089785296031, "step": 5661 }, { "epoch": 1.049684835001854, "grad_norm": 6.4140625, "learning_rate": 8.950315164998146e-06, "loss": 2.214, "mean_token_accuracy": 0.5288023679417122, "step": 5662 }, { "epoch": 1.049870226177234, "grad_norm": 9.4296875, "learning_rate": 8.950129773822767e-06, "loss": 2.7581, "mean_token_accuracy": 0.4600056529112493, "step": 5663 }, { "epoch": 1.050055617352614, "grad_norm": 15.109375, "learning_rate": 8.949944382647387e-06, "loss": 2.9905, "mean_token_accuracy": 0.42480156684877846, "step": 5664 }, { "epoch": 1.050241008527994, "grad_norm": 5.92578125, "learning_rate": 8.949758991472006e-06, "loss": 2.5476, "mean_token_accuracy": 0.46461967899511514, "step": 5665 }, { "epoch": 1.050426399703374, "grad_norm": 6.90625, "learning_rate": 8.949573600296626e-06, "loss": 2.6666, "mean_token_accuracy": 0.4581861012956419, "step": 5666 }, { "epoch": 1.0506117908787542, "grad_norm": 5.37109375, "learning_rate": 8.949388209121247e-06, "loss": 2.9847, "mean_token_accuracy": 0.4569648189017266, "step": 5667 }, { "epoch": 1.0507971820541342, "grad_norm": 4.98828125, "learning_rate": 8.949202817945867e-06, "loss": 2.6743, "mean_token_accuracy": 0.4740740740740741, "step": 5668 }, { "epoch": 1.0509825732295144, "grad_norm": 5.90625, "learning_rate": 8.949017426770486e-06, "loss": 3.0292, "mean_token_accuracy": 0.4361972547025928, "step": 5669 }, { "epoch": 1.0511679644048944, "grad_norm": 12.640625, "learning_rate": 8.948832035595107e-06, "loss": 2.4846, "mean_token_accuracy": 0.47592931139549055, "step": 5670 }, { "epoch": 1.0513533555802743, "grad_norm": 7.90625, "learning_rate": 8.948646644419726e-06, "loss": 3.2244, "mean_token_accuracy": 0.42754798594214655, "step": 5671 }, { "epoch": 1.0515387467556545, "grad_norm": 6.90234375, "learning_rate": 8.948461253244346e-06, "loss": 2.9034, "mean_token_accuracy": 0.43752549286199865, "step": 5672 }, { "epoch": 1.0517241379310345, "grad_norm": 7.75390625, "learning_rate": 8.948275862068967e-06, "loss": 3.1433, "mean_token_accuracy": 0.41396718552797646, "step": 5673 }, { "epoch": 1.0519095291064144, "grad_norm": 5.48828125, "learning_rate": 8.948090470893587e-06, "loss": 2.8565, "mean_token_accuracy": 0.4496324530356657, "step": 5674 }, { "epoch": 1.0520949202817946, "grad_norm": 5.62890625, "learning_rate": 8.947905079718206e-06, "loss": 3.0286, "mean_token_accuracy": 0.4350290697674419, "step": 5675 }, { "epoch": 1.0522803114571746, "grad_norm": 5.78515625, "learning_rate": 8.947719688542826e-06, "loss": 2.86, "mean_token_accuracy": 0.4517029592406477, "step": 5676 }, { "epoch": 1.0524657026325548, "grad_norm": 5.84765625, "learning_rate": 8.947534297367447e-06, "loss": 2.8077, "mean_token_accuracy": 0.4557408985754292, "step": 5677 }, { "epoch": 1.0526510938079348, "grad_norm": 5.390625, "learning_rate": 8.947348906192066e-06, "loss": 2.9929, "mean_token_accuracy": 0.43848199178148417, "step": 5678 }, { "epoch": 1.0528364849833147, "grad_norm": 5.23046875, "learning_rate": 8.947163515016686e-06, "loss": 2.6964, "mean_token_accuracy": 0.47115795470730665, "step": 5679 }, { "epoch": 1.053021876158695, "grad_norm": 5.96875, "learning_rate": 8.946978123841305e-06, "loss": 3.4283, "mean_token_accuracy": 0.4009882643607165, "step": 5680 }, { "epoch": 1.0532072673340749, "grad_norm": 6.32421875, "learning_rate": 8.946792732665925e-06, "loss": 2.8485, "mean_token_accuracy": 0.45848327335732375, "step": 5681 }, { "epoch": 1.0533926585094548, "grad_norm": 8.75, "learning_rate": 8.946607341490546e-06, "loss": 2.9152, "mean_token_accuracy": 0.43754611573732477, "step": 5682 }, { "epoch": 1.053578049684835, "grad_norm": 6.27734375, "learning_rate": 8.946421950315166e-06, "loss": 2.5218, "mean_token_accuracy": 0.4813002826701457, "step": 5683 }, { "epoch": 1.053763440860215, "grad_norm": 6.6875, "learning_rate": 8.946236559139785e-06, "loss": 3.5092, "mean_token_accuracy": 0.42909504550050553, "step": 5684 }, { "epoch": 1.0539488320355952, "grad_norm": 5.65234375, "learning_rate": 8.946051167964406e-06, "loss": 2.9554, "mean_token_accuracy": 0.4472736306048938, "step": 5685 }, { "epoch": 1.0541342232109752, "grad_norm": 6.9921875, "learning_rate": 8.945865776789026e-06, "loss": 2.8285, "mean_token_accuracy": 0.45864338866628507, "step": 5686 }, { "epoch": 1.0543196143863551, "grad_norm": 6.125, "learning_rate": 8.945680385613645e-06, "loss": 2.3288, "mean_token_accuracy": 0.4971605575632421, "step": 5687 }, { "epoch": 1.0545050055617353, "grad_norm": 5.92578125, "learning_rate": 8.945494994438265e-06, "loss": 2.7666, "mean_token_accuracy": 0.46943231441048033, "step": 5688 }, { "epoch": 1.0546903967371153, "grad_norm": 7.20703125, "learning_rate": 8.945309603262884e-06, "loss": 2.9318, "mean_token_accuracy": 0.42982782113216894, "step": 5689 }, { "epoch": 1.0548757879124953, "grad_norm": 7.2578125, "learning_rate": 8.945124212087506e-06, "loss": 3.0356, "mean_token_accuracy": 0.4259826129545733, "step": 5690 }, { "epoch": 1.0550611790878754, "grad_norm": 6.28515625, "learning_rate": 8.944938820912125e-06, "loss": 2.8971, "mean_token_accuracy": 0.46479469451107663, "step": 5691 }, { "epoch": 1.0552465702632554, "grad_norm": 6.38671875, "learning_rate": 8.944753429736746e-06, "loss": 2.9445, "mean_token_accuracy": 0.4489905067522396, "step": 5692 }, { "epoch": 1.0554319614386356, "grad_norm": 7.78515625, "learning_rate": 8.944568038561366e-06, "loss": 2.696, "mean_token_accuracy": 0.48539928486293205, "step": 5693 }, { "epoch": 1.0556173526140156, "grad_norm": 11.8046875, "learning_rate": 8.944382647385985e-06, "loss": 2.4134, "mean_token_accuracy": 0.500067231410515, "step": 5694 }, { "epoch": 1.0558027437893955, "grad_norm": 6.14453125, "learning_rate": 8.944197256210605e-06, "loss": 3.0006, "mean_token_accuracy": 0.4424843271613938, "step": 5695 }, { "epoch": 1.0559881349647757, "grad_norm": 7.19921875, "learning_rate": 8.944011865035224e-06, "loss": 2.5536, "mean_token_accuracy": 0.4783821478382148, "step": 5696 }, { "epoch": 1.0561735261401557, "grad_norm": 7.3359375, "learning_rate": 8.943826473859845e-06, "loss": 2.7178, "mean_token_accuracy": 0.470316301703163, "step": 5697 }, { "epoch": 1.0563589173155359, "grad_norm": 9.6875, "learning_rate": 8.943641082684464e-06, "loss": 2.9396, "mean_token_accuracy": 0.44092174450974325, "step": 5698 }, { "epoch": 1.0565443084909159, "grad_norm": 6.90625, "learning_rate": 8.943455691509086e-06, "loss": 2.8208, "mean_token_accuracy": 0.4552061145219897, "step": 5699 }, { "epoch": 1.0567296996662958, "grad_norm": 8.65625, "learning_rate": 8.943270300333705e-06, "loss": 2.457, "mean_token_accuracy": 0.47265865004827523, "step": 5700 }, { "epoch": 1.056915090841676, "grad_norm": 7.38671875, "learning_rate": 8.943084909158325e-06, "loss": 2.0218, "mean_token_accuracy": 0.5308861799753458, "step": 5701 }, { "epoch": 1.057100482017056, "grad_norm": 5.6875, "learning_rate": 8.942899517982946e-06, "loss": 2.5764, "mean_token_accuracy": 0.48502994011976047, "step": 5702 }, { "epoch": 1.057285873192436, "grad_norm": 6.24609375, "learning_rate": 8.942714126807564e-06, "loss": 2.5405, "mean_token_accuracy": 0.5013395847287341, "step": 5703 }, { "epoch": 1.0574712643678161, "grad_norm": 7.37109375, "learning_rate": 8.942528735632185e-06, "loss": 2.1832, "mean_token_accuracy": 0.5244963939318578, "step": 5704 }, { "epoch": 1.057656655543196, "grad_norm": 6.7578125, "learning_rate": 8.942343344456804e-06, "loss": 3.0618, "mean_token_accuracy": 0.42559818773892116, "step": 5705 }, { "epoch": 1.0578420467185763, "grad_norm": 5.3515625, "learning_rate": 8.942157953281424e-06, "loss": 2.7036, "mean_token_accuracy": 0.49498619386717047, "step": 5706 }, { "epoch": 1.0580274378939563, "grad_norm": 6.52734375, "learning_rate": 8.941972562106045e-06, "loss": 2.6175, "mean_token_accuracy": 0.47662048490846115, "step": 5707 }, { "epoch": 1.0582128290693362, "grad_norm": 6.234375, "learning_rate": 8.941787170930665e-06, "loss": 2.5017, "mean_token_accuracy": 0.4891633064516129, "step": 5708 }, { "epoch": 1.0583982202447164, "grad_norm": 4.88671875, "learning_rate": 8.941601779755284e-06, "loss": 2.4592, "mean_token_accuracy": 0.4957750380939188, "step": 5709 }, { "epoch": 1.0585836114200964, "grad_norm": 5.4609375, "learning_rate": 8.941416388579904e-06, "loss": 2.7625, "mean_token_accuracy": 0.46655376799322607, "step": 5710 }, { "epoch": 1.0587690025954763, "grad_norm": 5.91796875, "learning_rate": 8.941230997404525e-06, "loss": 3.2089, "mean_token_accuracy": 0.42674842558833276, "step": 5711 }, { "epoch": 1.0589543937708565, "grad_norm": 7.51171875, "learning_rate": 8.941045606229144e-06, "loss": 3.0842, "mean_token_accuracy": 0.42810364464692485, "step": 5712 }, { "epoch": 1.0591397849462365, "grad_norm": 6.08203125, "learning_rate": 8.940860215053764e-06, "loss": 2.9793, "mean_token_accuracy": 0.42973939703628, "step": 5713 }, { "epoch": 1.0593251761216167, "grad_norm": 8.234375, "learning_rate": 8.940674823878383e-06, "loss": 2.5483, "mean_token_accuracy": 0.4722112211221122, "step": 5714 }, { "epoch": 1.0595105672969967, "grad_norm": 7.44921875, "learning_rate": 8.940489432703005e-06, "loss": 3.2442, "mean_token_accuracy": 0.3954865378170037, "step": 5715 }, { "epoch": 1.0596959584723766, "grad_norm": 6.57421875, "learning_rate": 8.940304041527624e-06, "loss": 3.1388, "mean_token_accuracy": 0.41407254952405453, "step": 5716 }, { "epoch": 1.0598813496477568, "grad_norm": 5.80078125, "learning_rate": 8.940118650352244e-06, "loss": 2.9944, "mean_token_accuracy": 0.4496709461646348, "step": 5717 }, { "epoch": 1.0600667408231368, "grad_norm": 6.51953125, "learning_rate": 8.939933259176863e-06, "loss": 2.8126, "mean_token_accuracy": 0.47378570567675066, "step": 5718 }, { "epoch": 1.060252131998517, "grad_norm": 5.578125, "learning_rate": 8.939747868001484e-06, "loss": 2.6748, "mean_token_accuracy": 0.4578462384816394, "step": 5719 }, { "epoch": 1.060437523173897, "grad_norm": 8.1484375, "learning_rate": 8.939562476826104e-06, "loss": 3.3277, "mean_token_accuracy": 0.4234686346863469, "step": 5720 }, { "epoch": 1.060622914349277, "grad_norm": 5.6328125, "learning_rate": 8.939377085650723e-06, "loss": 2.4503, "mean_token_accuracy": 0.5293363714041583, "step": 5721 }, { "epoch": 1.060808305524657, "grad_norm": 6.578125, "learning_rate": 8.939191694475344e-06, "loss": 2.9995, "mean_token_accuracy": 0.45446910617876424, "step": 5722 }, { "epoch": 1.060993696700037, "grad_norm": 7.734375, "learning_rate": 8.939006303299964e-06, "loss": 2.2549, "mean_token_accuracy": 0.5109130385098846, "step": 5723 }, { "epoch": 1.061179087875417, "grad_norm": 8.0234375, "learning_rate": 8.938820912124584e-06, "loss": 2.9035, "mean_token_accuracy": 0.4328173374613003, "step": 5724 }, { "epoch": 1.0613644790507972, "grad_norm": 6.609375, "learning_rate": 8.938635520949203e-06, "loss": 3.1527, "mean_token_accuracy": 0.4398438825725437, "step": 5725 }, { "epoch": 1.0615498702261772, "grad_norm": 7.1484375, "learning_rate": 8.938450129773824e-06, "loss": 2.9403, "mean_token_accuracy": 0.41848958333333336, "step": 5726 }, { "epoch": 1.0617352614015574, "grad_norm": 5.44140625, "learning_rate": 8.938264738598443e-06, "loss": 2.4523, "mean_token_accuracy": 0.49534474081529944, "step": 5727 }, { "epoch": 1.0619206525769374, "grad_norm": 6.5703125, "learning_rate": 8.938079347423063e-06, "loss": 2.2225, "mean_token_accuracy": 0.5306122448979592, "step": 5728 }, { "epoch": 1.0621060437523173, "grad_norm": 8.6015625, "learning_rate": 8.937893956247684e-06, "loss": 3.0911, "mean_token_accuracy": 0.42921137618501926, "step": 5729 }, { "epoch": 1.0622914349276975, "grad_norm": 8.1328125, "learning_rate": 8.937708565072302e-06, "loss": 2.6733, "mean_token_accuracy": 0.47794404684450226, "step": 5730 }, { "epoch": 1.0624768261030775, "grad_norm": 5.69921875, "learning_rate": 8.937523173896925e-06, "loss": 2.6952, "mean_token_accuracy": 0.4754469336954062, "step": 5731 }, { "epoch": 1.0626622172784574, "grad_norm": 6.66796875, "learning_rate": 8.937337782721543e-06, "loss": 2.9965, "mean_token_accuracy": 0.43822497976800645, "step": 5732 }, { "epoch": 1.0628476084538376, "grad_norm": 5.625, "learning_rate": 8.937152391546164e-06, "loss": 2.9898, "mean_token_accuracy": 0.42593153589821264, "step": 5733 }, { "epoch": 1.0630329996292176, "grad_norm": 5.53125, "learning_rate": 8.936967000370783e-06, "loss": 2.594, "mean_token_accuracy": 0.4858052901260213, "step": 5734 }, { "epoch": 1.0632183908045978, "grad_norm": 6.46484375, "learning_rate": 8.936781609195403e-06, "loss": 2.9792, "mean_token_accuracy": 0.45512335401846526, "step": 5735 }, { "epoch": 1.0634037819799778, "grad_norm": 5.34765625, "learning_rate": 8.936596218020022e-06, "loss": 2.331, "mean_token_accuracy": 0.5089388223716582, "step": 5736 }, { "epoch": 1.0635891731553577, "grad_norm": 5.66015625, "learning_rate": 8.936410826844642e-06, "loss": 2.6855, "mean_token_accuracy": 0.4709402693060191, "step": 5737 }, { "epoch": 1.063774564330738, "grad_norm": 7.1171875, "learning_rate": 8.936225435669263e-06, "loss": 3.047, "mean_token_accuracy": 0.4472206625491297, "step": 5738 }, { "epoch": 1.0639599555061179, "grad_norm": 8.8984375, "learning_rate": 8.936040044493883e-06, "loss": 2.2704, "mean_token_accuracy": 0.5379146919431279, "step": 5739 }, { "epoch": 1.064145346681498, "grad_norm": 7.078125, "learning_rate": 8.935854653318504e-06, "loss": 2.6071, "mean_token_accuracy": 0.4864927806241267, "step": 5740 }, { "epoch": 1.064330737856878, "grad_norm": 6.73828125, "learning_rate": 8.935669262143123e-06, "loss": 3.0707, "mean_token_accuracy": 0.4472066292435178, "step": 5741 }, { "epoch": 1.064516129032258, "grad_norm": 5.90625, "learning_rate": 8.935483870967743e-06, "loss": 2.7683, "mean_token_accuracy": 0.47944368026898976, "step": 5742 }, { "epoch": 1.0647015202076382, "grad_norm": 8.5390625, "learning_rate": 8.935298479792362e-06, "loss": 3.0508, "mean_token_accuracy": 0.40850845720143514, "step": 5743 }, { "epoch": 1.0648869113830182, "grad_norm": 9.0078125, "learning_rate": 8.935113088616982e-06, "loss": 2.9536, "mean_token_accuracy": 0.4547327596654951, "step": 5744 }, { "epoch": 1.0650723025583981, "grad_norm": 6.79296875, "learning_rate": 8.934927697441603e-06, "loss": 3.1622, "mean_token_accuracy": 0.4279286035698215, "step": 5745 }, { "epoch": 1.0652576937337783, "grad_norm": 11.53125, "learning_rate": 8.934742306266222e-06, "loss": 2.8398, "mean_token_accuracy": 0.4524207011686144, "step": 5746 }, { "epoch": 1.0654430849091583, "grad_norm": 8.5, "learning_rate": 8.934556915090842e-06, "loss": 2.8321, "mean_token_accuracy": 0.45915655690352397, "step": 5747 }, { "epoch": 1.0656284760845385, "grad_norm": 6.05078125, "learning_rate": 8.934371523915463e-06, "loss": 2.7909, "mean_token_accuracy": 0.46559466898449947, "step": 5748 }, { "epoch": 1.0658138672599184, "grad_norm": 9.9609375, "learning_rate": 8.934186132740083e-06, "loss": 2.1183, "mean_token_accuracy": 0.5217225124219645, "step": 5749 }, { "epoch": 1.0659992584352984, "grad_norm": 7.58984375, "learning_rate": 8.934000741564702e-06, "loss": 2.424, "mean_token_accuracy": 0.5076538920855499, "step": 5750 }, { "epoch": 1.0661846496106786, "grad_norm": 5.05859375, "learning_rate": 8.933815350389323e-06, "loss": 3.3062, "mean_token_accuracy": 0.3998315485501143, "step": 5751 }, { "epoch": 1.0663700407860586, "grad_norm": 6.6328125, "learning_rate": 8.933629959213941e-06, "loss": 2.0356, "mean_token_accuracy": 0.571405596654873, "step": 5752 }, { "epoch": 1.0665554319614385, "grad_norm": 6.58203125, "learning_rate": 8.933444568038562e-06, "loss": 3.2009, "mean_token_accuracy": 0.4553668232743043, "step": 5753 }, { "epoch": 1.0667408231368187, "grad_norm": 7.12890625, "learning_rate": 8.933259176863182e-06, "loss": 2.6738, "mean_token_accuracy": 0.46766917293233085, "step": 5754 }, { "epoch": 1.0669262143121987, "grad_norm": 6.328125, "learning_rate": 8.933073785687803e-06, "loss": 2.7168, "mean_token_accuracy": 0.4710581639803784, "step": 5755 }, { "epoch": 1.0671116054875789, "grad_norm": 6.1640625, "learning_rate": 8.932888394512422e-06, "loss": 3.3006, "mean_token_accuracy": 0.4151372137738158, "step": 5756 }, { "epoch": 1.0672969966629589, "grad_norm": 6.82421875, "learning_rate": 8.932703003337042e-06, "loss": 3.2766, "mean_token_accuracy": 0.42860548271752086, "step": 5757 }, { "epoch": 1.0674823878383388, "grad_norm": 6.32421875, "learning_rate": 8.932517612161663e-06, "loss": 2.222, "mean_token_accuracy": 0.5101957585644372, "step": 5758 }, { "epoch": 1.067667779013719, "grad_norm": 4.94921875, "learning_rate": 8.932332220986281e-06, "loss": 3.0586, "mean_token_accuracy": 0.44321070234113713, "step": 5759 }, { "epoch": 1.067853170189099, "grad_norm": 7.13671875, "learning_rate": 8.932146829810902e-06, "loss": 2.6122, "mean_token_accuracy": 0.48635259834871297, "step": 5760 }, { "epoch": 1.068038561364479, "grad_norm": 4.703125, "learning_rate": 8.93196143863552e-06, "loss": 2.788, "mean_token_accuracy": 0.45470517184583287, "step": 5761 }, { "epoch": 1.0682239525398591, "grad_norm": 5.12890625, "learning_rate": 8.931776047460141e-06, "loss": 2.7694, "mean_token_accuracy": 0.45633971291866027, "step": 5762 }, { "epoch": 1.068409343715239, "grad_norm": 7.80859375, "learning_rate": 8.931590656284762e-06, "loss": 2.4383, "mean_token_accuracy": 0.49179037336932074, "step": 5763 }, { "epoch": 1.0685947348906193, "grad_norm": 6.328125, "learning_rate": 8.931405265109382e-06, "loss": 2.4456, "mean_token_accuracy": 0.5048680682940595, "step": 5764 }, { "epoch": 1.0687801260659993, "grad_norm": 7.25390625, "learning_rate": 8.931219873934001e-06, "loss": 2.8644, "mean_token_accuracy": 0.4505724657916783, "step": 5765 }, { "epoch": 1.0689655172413792, "grad_norm": 6.390625, "learning_rate": 8.931034482758621e-06, "loss": 2.9415, "mean_token_accuracy": 0.44723946422737665, "step": 5766 }, { "epoch": 1.0691509084167594, "grad_norm": 6.75390625, "learning_rate": 8.930849091583242e-06, "loss": 2.4076, "mean_token_accuracy": 0.4897292885321725, "step": 5767 }, { "epoch": 1.0693362995921394, "grad_norm": 5.1015625, "learning_rate": 8.93066370040786e-06, "loss": 2.9646, "mean_token_accuracy": 0.44514887354239785, "step": 5768 }, { "epoch": 1.0695216907675196, "grad_norm": 5.890625, "learning_rate": 8.930478309232481e-06, "loss": 2.4079, "mean_token_accuracy": 0.4857183257918552, "step": 5769 }, { "epoch": 1.0697070819428995, "grad_norm": 5.16015625, "learning_rate": 8.9302929180571e-06, "loss": 2.5148, "mean_token_accuracy": 0.5208443972384493, "step": 5770 }, { "epoch": 1.0698924731182795, "grad_norm": 6.9765625, "learning_rate": 8.930107526881722e-06, "loss": 2.5371, "mean_token_accuracy": 0.47689463955637706, "step": 5771 }, { "epoch": 1.0700778642936597, "grad_norm": 5.34375, "learning_rate": 8.929922135706341e-06, "loss": 2.6064, "mean_token_accuracy": 0.4703448275862069, "step": 5772 }, { "epoch": 1.0702632554690397, "grad_norm": 5.96484375, "learning_rate": 8.929736744530961e-06, "loss": 3.2041, "mean_token_accuracy": 0.4141294005708849, "step": 5773 }, { "epoch": 1.0704486466444196, "grad_norm": 6.56640625, "learning_rate": 8.92955135335558e-06, "loss": 2.5006, "mean_token_accuracy": 0.48738194766991794, "step": 5774 }, { "epoch": 1.0706340378197998, "grad_norm": 8.2109375, "learning_rate": 8.9293659621802e-06, "loss": 3.1463, "mean_token_accuracy": 0.4229600694444444, "step": 5775 }, { "epoch": 1.0708194289951798, "grad_norm": 5.3671875, "learning_rate": 8.929180571004821e-06, "loss": 2.9061, "mean_token_accuracy": 0.45018933877075445, "step": 5776 }, { "epoch": 1.07100482017056, "grad_norm": 6.6015625, "learning_rate": 8.92899517982944e-06, "loss": 2.1618, "mean_token_accuracy": 0.5411734758813797, "step": 5777 }, { "epoch": 1.07119021134594, "grad_norm": 5.6875, "learning_rate": 8.92880978865406e-06, "loss": 2.7697, "mean_token_accuracy": 0.4490987560294491, "step": 5778 }, { "epoch": 1.07137560252132, "grad_norm": 7.66796875, "learning_rate": 8.928624397478681e-06, "loss": 2.953, "mean_token_accuracy": 0.4488862837045721, "step": 5779 }, { "epoch": 1.0715609936967, "grad_norm": 6.4609375, "learning_rate": 8.928439006303302e-06, "loss": 2.9553, "mean_token_accuracy": 0.44451675886755615, "step": 5780 }, { "epoch": 1.07174638487208, "grad_norm": 9.140625, "learning_rate": 8.92825361512792e-06, "loss": 2.3443, "mean_token_accuracy": 0.5105550118831259, "step": 5781 }, { "epoch": 1.07193177604746, "grad_norm": 7.28515625, "learning_rate": 8.92806822395254e-06, "loss": 3.1704, "mean_token_accuracy": 0.4397890418986229, "step": 5782 }, { "epoch": 1.0721171672228402, "grad_norm": 6.578125, "learning_rate": 8.927882832777161e-06, "loss": 3.3804, "mean_token_accuracy": 0.41244555071561917, "step": 5783 }, { "epoch": 1.0723025583982202, "grad_norm": 11.5859375, "learning_rate": 8.92769744160178e-06, "loss": 3.1856, "mean_token_accuracy": 0.4259179882840406, "step": 5784 }, { "epoch": 1.0724879495736004, "grad_norm": 7.44921875, "learning_rate": 8.9275120504264e-06, "loss": 2.1791, "mean_token_accuracy": 0.5471489714206821, "step": 5785 }, { "epoch": 1.0726733407489804, "grad_norm": 7.1796875, "learning_rate": 8.92732665925102e-06, "loss": 2.5774, "mean_token_accuracy": 0.48245363766048505, "step": 5786 }, { "epoch": 1.0728587319243603, "grad_norm": 5.58984375, "learning_rate": 8.927141268075642e-06, "loss": 2.6968, "mean_token_accuracy": 0.46965784377017433, "step": 5787 }, { "epoch": 1.0730441230997405, "grad_norm": 7.5703125, "learning_rate": 8.92695587690026e-06, "loss": 2.2704, "mean_token_accuracy": 0.5076905804423487, "step": 5788 }, { "epoch": 1.0732295142751205, "grad_norm": 6.12890625, "learning_rate": 8.92677048572488e-06, "loss": 2.6718, "mean_token_accuracy": 0.4686438463548471, "step": 5789 }, { "epoch": 1.0734149054505004, "grad_norm": 5.1875, "learning_rate": 8.9265850945495e-06, "loss": 2.7832, "mean_token_accuracy": 0.4414613894270689, "step": 5790 }, { "epoch": 1.0736002966258806, "grad_norm": 6.1171875, "learning_rate": 8.92639970337412e-06, "loss": 2.9742, "mean_token_accuracy": 0.4287641207216321, "step": 5791 }, { "epoch": 1.0737856878012606, "grad_norm": 6.57421875, "learning_rate": 8.92621431219874e-06, "loss": 2.7078, "mean_token_accuracy": 0.4662937062937063, "step": 5792 }, { "epoch": 1.0739710789766408, "grad_norm": 6.84765625, "learning_rate": 8.92602892102336e-06, "loss": 3.0665, "mean_token_accuracy": 0.4523151347615757, "step": 5793 }, { "epoch": 1.0741564701520208, "grad_norm": 7.01171875, "learning_rate": 8.92584352984798e-06, "loss": 3.3212, "mean_token_accuracy": 0.40908463343918267, "step": 5794 }, { "epoch": 1.0743418613274007, "grad_norm": 9.7890625, "learning_rate": 8.9256581386726e-06, "loss": 2.1224, "mean_token_accuracy": 0.5443458980044346, "step": 5795 }, { "epoch": 1.074527252502781, "grad_norm": 6.32421875, "learning_rate": 8.925472747497221e-06, "loss": 2.6615, "mean_token_accuracy": 0.4880177749563561, "step": 5796 }, { "epoch": 1.0747126436781609, "grad_norm": 7.015625, "learning_rate": 8.92528735632184e-06, "loss": 2.8996, "mean_token_accuracy": 0.44215032103739144, "step": 5797 }, { "epoch": 1.074898034853541, "grad_norm": 5.40234375, "learning_rate": 8.92510196514646e-06, "loss": 2.711, "mean_token_accuracy": 0.457280947926411, "step": 5798 }, { "epoch": 1.075083426028921, "grad_norm": 7.26171875, "learning_rate": 8.924916573971079e-06, "loss": 2.6011, "mean_token_accuracy": 0.4884597268016957, "step": 5799 }, { "epoch": 1.075268817204301, "grad_norm": 5.2890625, "learning_rate": 8.9247311827957e-06, "loss": 2.3566, "mean_token_accuracy": 0.49474635634391595, "step": 5800 }, { "epoch": 1.0754542083796812, "grad_norm": 6.140625, "learning_rate": 8.92454579162032e-06, "loss": 3.244, "mean_token_accuracy": 0.41499359248184536, "step": 5801 }, { "epoch": 1.0756395995550612, "grad_norm": 8.4375, "learning_rate": 8.924360400444939e-06, "loss": 3.4775, "mean_token_accuracy": 0.3731082654249127, "step": 5802 }, { "epoch": 1.0758249907304411, "grad_norm": 6.30078125, "learning_rate": 8.92417500926956e-06, "loss": 2.519, "mean_token_accuracy": 0.483072546230441, "step": 5803 }, { "epoch": 1.0760103819058213, "grad_norm": 5.3515625, "learning_rate": 8.92398961809418e-06, "loss": 2.7978, "mean_token_accuracy": 0.47080056722959907, "step": 5804 }, { "epoch": 1.0761957730812013, "grad_norm": 5.89453125, "learning_rate": 8.9238042269188e-06, "loss": 2.279, "mean_token_accuracy": 0.5139495390587093, "step": 5805 }, { "epoch": 1.0763811642565815, "grad_norm": 6.7265625, "learning_rate": 8.923618835743419e-06, "loss": 2.3307, "mean_token_accuracy": 0.5020222446916077, "step": 5806 }, { "epoch": 1.0765665554319614, "grad_norm": 6.0078125, "learning_rate": 8.92343344456804e-06, "loss": 3.3019, "mean_token_accuracy": 0.4198668714797747, "step": 5807 }, { "epoch": 1.0767519466073414, "grad_norm": 5.98046875, "learning_rate": 8.923248053392658e-06, "loss": 2.6598, "mean_token_accuracy": 0.4862234201856843, "step": 5808 }, { "epoch": 1.0769373377827216, "grad_norm": 6.28125, "learning_rate": 8.923062662217279e-06, "loss": 2.7145, "mean_token_accuracy": 0.4743436754176611, "step": 5809 }, { "epoch": 1.0771227289581016, "grad_norm": 6.015625, "learning_rate": 8.9228772710419e-06, "loss": 2.3365, "mean_token_accuracy": 0.5075952995127544, "step": 5810 }, { "epoch": 1.0773081201334818, "grad_norm": 6.4765625, "learning_rate": 8.92269187986652e-06, "loss": 2.6944, "mean_token_accuracy": 0.45133149678604223, "step": 5811 }, { "epoch": 1.0774935113088617, "grad_norm": 7.11328125, "learning_rate": 8.92250648869114e-06, "loss": 2.5046, "mean_token_accuracy": 0.4671948846260773, "step": 5812 }, { "epoch": 1.0776789024842417, "grad_norm": 8.078125, "learning_rate": 8.922321097515759e-06, "loss": 2.9621, "mean_token_accuracy": 0.4457315138051412, "step": 5813 }, { "epoch": 1.0778642936596219, "grad_norm": 5.24609375, "learning_rate": 8.92213570634038e-06, "loss": 2.5833, "mean_token_accuracy": 0.4936175644625989, "step": 5814 }, { "epoch": 1.0780496848350019, "grad_norm": 6.3203125, "learning_rate": 8.921950315164998e-06, "loss": 2.6766, "mean_token_accuracy": 0.4688198757763975, "step": 5815 }, { "epoch": 1.0782350760103818, "grad_norm": 5.90234375, "learning_rate": 8.921764923989619e-06, "loss": 3.1408, "mean_token_accuracy": 0.43957951586459637, "step": 5816 }, { "epoch": 1.078420467185762, "grad_norm": 5.765625, "learning_rate": 8.921579532814238e-06, "loss": 3.0975, "mean_token_accuracy": 0.4502564102564103, "step": 5817 }, { "epoch": 1.078605858361142, "grad_norm": 5.265625, "learning_rate": 8.921394141638858e-06, "loss": 2.7628, "mean_token_accuracy": 0.45309358945722583, "step": 5818 }, { "epoch": 1.078791249536522, "grad_norm": 6.55078125, "learning_rate": 8.921208750463479e-06, "loss": 2.8176, "mean_token_accuracy": 0.4581901489117984, "step": 5819 }, { "epoch": 1.0789766407119021, "grad_norm": 8.7578125, "learning_rate": 8.921023359288099e-06, "loss": 2.4382, "mean_token_accuracy": 0.4910374029640085, "step": 5820 }, { "epoch": 1.079162031887282, "grad_norm": 5.08203125, "learning_rate": 8.92083796811272e-06, "loss": 2.5454, "mean_token_accuracy": 0.4872926858370123, "step": 5821 }, { "epoch": 1.0793474230626623, "grad_norm": 5.43359375, "learning_rate": 8.920652576937338e-06, "loss": 2.5622, "mean_token_accuracy": 0.48317801295214025, "step": 5822 }, { "epoch": 1.0795328142380423, "grad_norm": 7.62109375, "learning_rate": 8.920467185761959e-06, "loss": 2.8763, "mean_token_accuracy": 0.43857493857493857, "step": 5823 }, { "epoch": 1.0797182054134222, "grad_norm": 6.5859375, "learning_rate": 8.920281794586578e-06, "loss": 2.8661, "mean_token_accuracy": 0.45296884185773073, "step": 5824 }, { "epoch": 1.0799035965888024, "grad_norm": 5.296875, "learning_rate": 8.920096403411198e-06, "loss": 2.6759, "mean_token_accuracy": 0.4666357738646895, "step": 5825 }, { "epoch": 1.0800889877641824, "grad_norm": 6.0546875, "learning_rate": 8.919911012235819e-06, "loss": 2.9337, "mean_token_accuracy": 0.46113989637305697, "step": 5826 }, { "epoch": 1.0802743789395626, "grad_norm": 5.97265625, "learning_rate": 8.919725621060437e-06, "loss": 2.5825, "mean_token_accuracy": 0.465184318314804, "step": 5827 }, { "epoch": 1.0804597701149425, "grad_norm": 5.2578125, "learning_rate": 8.919540229885058e-06, "loss": 2.3426, "mean_token_accuracy": 0.49328859060402686, "step": 5828 }, { "epoch": 1.0806451612903225, "grad_norm": 6.93359375, "learning_rate": 8.919354838709678e-06, "loss": 2.5996, "mean_token_accuracy": 0.5106905012267788, "step": 5829 }, { "epoch": 1.0808305524657027, "grad_norm": 7.609375, "learning_rate": 8.919169447534299e-06, "loss": 3.3304, "mean_token_accuracy": 0.43202065848934795, "step": 5830 }, { "epoch": 1.0810159436410827, "grad_norm": 7.78125, "learning_rate": 8.918984056358918e-06, "loss": 2.2893, "mean_token_accuracy": 0.49600798403193613, "step": 5831 }, { "epoch": 1.0812013348164626, "grad_norm": 7.7265625, "learning_rate": 8.918798665183538e-06, "loss": 2.763, "mean_token_accuracy": 0.47485760781122865, "step": 5832 }, { "epoch": 1.0813867259918428, "grad_norm": 6.30859375, "learning_rate": 8.918613274008157e-06, "loss": 2.9272, "mean_token_accuracy": 0.45123287671232876, "step": 5833 }, { "epoch": 1.0815721171672228, "grad_norm": 6.57421875, "learning_rate": 8.918427882832778e-06, "loss": 2.9774, "mean_token_accuracy": 0.4481352560914968, "step": 5834 }, { "epoch": 1.081757508342603, "grad_norm": 5.78515625, "learning_rate": 8.918242491657398e-06, "loss": 2.6549, "mean_token_accuracy": 0.4764795144157815, "step": 5835 }, { "epoch": 1.081942899517983, "grad_norm": 6.625, "learning_rate": 8.918057100482019e-06, "loss": 2.6245, "mean_token_accuracy": 0.46986301369863015, "step": 5836 }, { "epoch": 1.082128290693363, "grad_norm": 6.84375, "learning_rate": 8.917871709306637e-06, "loss": 2.985, "mean_token_accuracy": 0.4320270924044509, "step": 5837 }, { "epoch": 1.082313681868743, "grad_norm": 5.453125, "learning_rate": 8.917686318131258e-06, "loss": 2.6733, "mean_token_accuracy": 0.4623047926763597, "step": 5838 }, { "epoch": 1.082499073044123, "grad_norm": 6.83203125, "learning_rate": 8.917500926955878e-06, "loss": 2.3709, "mean_token_accuracy": 0.48354999197560583, "step": 5839 }, { "epoch": 1.0826844642195033, "grad_norm": 6.0703125, "learning_rate": 8.917315535780497e-06, "loss": 2.9644, "mean_token_accuracy": 0.43162193698949824, "step": 5840 }, { "epoch": 1.0828698553948832, "grad_norm": 7.6875, "learning_rate": 8.917130144605118e-06, "loss": 2.8106, "mean_token_accuracy": 0.45687560738581146, "step": 5841 }, { "epoch": 1.0830552465702632, "grad_norm": 7.765625, "learning_rate": 8.916944753429736e-06, "loss": 2.816, "mean_token_accuracy": 0.4612623392162728, "step": 5842 }, { "epoch": 1.0832406377456434, "grad_norm": 7.6484375, "learning_rate": 8.916759362254357e-06, "loss": 2.1659, "mean_token_accuracy": 0.5413478516774574, "step": 5843 }, { "epoch": 1.0834260289210234, "grad_norm": 5.79296875, "learning_rate": 8.916573971078977e-06, "loss": 3.0518, "mean_token_accuracy": 0.41382941382941385, "step": 5844 }, { "epoch": 1.0836114200964033, "grad_norm": 7.83984375, "learning_rate": 8.916388579903598e-06, "loss": 2.5888, "mean_token_accuracy": 0.46653980336187756, "step": 5845 }, { "epoch": 1.0837968112717835, "grad_norm": 7.71484375, "learning_rate": 8.916203188728217e-06, "loss": 2.769, "mean_token_accuracy": 0.45228777844671886, "step": 5846 }, { "epoch": 1.0839822024471635, "grad_norm": 7.203125, "learning_rate": 8.916017797552837e-06, "loss": 3.1581, "mean_token_accuracy": 0.4230212037402871, "step": 5847 }, { "epoch": 1.0841675936225437, "grad_norm": 6.3671875, "learning_rate": 8.915832406377458e-06, "loss": 2.9627, "mean_token_accuracy": 0.4391309094879354, "step": 5848 }, { "epoch": 1.0843529847979236, "grad_norm": 7.72265625, "learning_rate": 8.915647015202076e-06, "loss": 2.4092, "mean_token_accuracy": 0.5011114639568117, "step": 5849 }, { "epoch": 1.0845383759733036, "grad_norm": 6.1796875, "learning_rate": 8.915461624026697e-06, "loss": 3.1298, "mean_token_accuracy": 0.43235430157261795, "step": 5850 }, { "epoch": 1.0847237671486838, "grad_norm": 10.71875, "learning_rate": 8.915276232851316e-06, "loss": 2.7077, "mean_token_accuracy": 0.4445817882159044, "step": 5851 }, { "epoch": 1.0849091583240638, "grad_norm": 8.8984375, "learning_rate": 8.915090841675938e-06, "loss": 2.9048, "mean_token_accuracy": 0.4332615715823466, "step": 5852 }, { "epoch": 1.0850945494994437, "grad_norm": 5.58203125, "learning_rate": 8.914905450500557e-06, "loss": 2.9546, "mean_token_accuracy": 0.46609006040637013, "step": 5853 }, { "epoch": 1.085279940674824, "grad_norm": 7.640625, "learning_rate": 8.914720059325177e-06, "loss": 2.342, "mean_token_accuracy": 0.5257707129094412, "step": 5854 }, { "epoch": 1.0854653318502039, "grad_norm": 8.28125, "learning_rate": 8.914534668149796e-06, "loss": 2.3807, "mean_token_accuracy": 0.48614756488772237, "step": 5855 }, { "epoch": 1.085650723025584, "grad_norm": 6.62890625, "learning_rate": 8.914349276974416e-06, "loss": 2.8633, "mean_token_accuracy": 0.43610421836228286, "step": 5856 }, { "epoch": 1.085836114200964, "grad_norm": 5.47265625, "learning_rate": 8.914163885799037e-06, "loss": 2.3424, "mean_token_accuracy": 0.5262197902416781, "step": 5857 }, { "epoch": 1.086021505376344, "grad_norm": 10.828125, "learning_rate": 8.913978494623656e-06, "loss": 2.6676, "mean_token_accuracy": 0.47401055408970977, "step": 5858 }, { "epoch": 1.0862068965517242, "grad_norm": 9.4375, "learning_rate": 8.913793103448276e-06, "loss": 2.3748, "mean_token_accuracy": 0.5046898140529866, "step": 5859 }, { "epoch": 1.0863922877271042, "grad_norm": 7.69140625, "learning_rate": 8.913607712272897e-06, "loss": 2.4944, "mean_token_accuracy": 0.5003229974160207, "step": 5860 }, { "epoch": 1.0865776789024841, "grad_norm": 9.7109375, "learning_rate": 8.913422321097517e-06, "loss": 2.8139, "mean_token_accuracy": 0.45675884102677095, "step": 5861 }, { "epoch": 1.0867630700778643, "grad_norm": 6.60546875, "learning_rate": 8.913236929922136e-06, "loss": 2.9569, "mean_token_accuracy": 0.4216105046669831, "step": 5862 }, { "epoch": 1.0869484612532443, "grad_norm": 7.51953125, "learning_rate": 8.913051538746757e-06, "loss": 2.7842, "mean_token_accuracy": 0.46524064171123, "step": 5863 }, { "epoch": 1.0871338524286245, "grad_norm": 8.796875, "learning_rate": 8.912866147571377e-06, "loss": 2.7076, "mean_token_accuracy": 0.44183877415056627, "step": 5864 }, { "epoch": 1.0873192436040044, "grad_norm": 6.66796875, "learning_rate": 8.912680756395996e-06, "loss": 2.893, "mean_token_accuracy": 0.4463802943826975, "step": 5865 }, { "epoch": 1.0875046347793844, "grad_norm": 5.8984375, "learning_rate": 8.912495365220616e-06, "loss": 2.4264, "mean_token_accuracy": 0.5079887218045113, "step": 5866 }, { "epoch": 1.0876900259547646, "grad_norm": 6.2734375, "learning_rate": 8.912309974045235e-06, "loss": 2.9513, "mean_token_accuracy": 0.44932810750279956, "step": 5867 }, { "epoch": 1.0878754171301446, "grad_norm": 10.6640625, "learning_rate": 8.912124582869857e-06, "loss": 2.0251, "mean_token_accuracy": 0.5367423782339366, "step": 5868 }, { "epoch": 1.0880608083055248, "grad_norm": 6.28125, "learning_rate": 8.911939191694476e-06, "loss": 2.7347, "mean_token_accuracy": 0.4708604483007954, "step": 5869 }, { "epoch": 1.0882461994809047, "grad_norm": 5.61328125, "learning_rate": 8.911753800519097e-06, "loss": 3.0234, "mean_token_accuracy": 0.42890700566533096, "step": 5870 }, { "epoch": 1.0884315906562847, "grad_norm": 6.296875, "learning_rate": 8.911568409343715e-06, "loss": 2.6174, "mean_token_accuracy": 0.47026169706582077, "step": 5871 }, { "epoch": 1.0886169818316649, "grad_norm": 9.6875, "learning_rate": 8.911383018168336e-06, "loss": 3.0913, "mean_token_accuracy": 0.4249743062692703, "step": 5872 }, { "epoch": 1.0888023730070449, "grad_norm": 6.96484375, "learning_rate": 8.911197626992956e-06, "loss": 3.2522, "mean_token_accuracy": 0.4463864613677348, "step": 5873 }, { "epoch": 1.0889877641824248, "grad_norm": 6.16796875, "learning_rate": 8.911012235817575e-06, "loss": 2.7667, "mean_token_accuracy": 0.46302981682043054, "step": 5874 }, { "epoch": 1.089173155357805, "grad_norm": 7.078125, "learning_rate": 8.910826844642196e-06, "loss": 2.6224, "mean_token_accuracy": 0.47866018368449487, "step": 5875 }, { "epoch": 1.089358546533185, "grad_norm": 8.546875, "learning_rate": 8.910641453466816e-06, "loss": 3.3146, "mean_token_accuracy": 0.41527415143603136, "step": 5876 }, { "epoch": 1.0895439377085652, "grad_norm": 6.25390625, "learning_rate": 8.910456062291437e-06, "loss": 3.0159, "mean_token_accuracy": 0.46915224145583667, "step": 5877 }, { "epoch": 1.0897293288839451, "grad_norm": 5.81640625, "learning_rate": 8.910270671116055e-06, "loss": 3.417, "mean_token_accuracy": 0.4017118001964361, "step": 5878 }, { "epoch": 1.089914720059325, "grad_norm": 9.3671875, "learning_rate": 8.910085279940676e-06, "loss": 2.7975, "mean_token_accuracy": 0.4427985716175109, "step": 5879 }, { "epoch": 1.0901001112347053, "grad_norm": 10.0, "learning_rate": 8.909899888765295e-06, "loss": 2.654, "mean_token_accuracy": 0.4635085369936391, "step": 5880 }, { "epoch": 1.0902855024100853, "grad_norm": 7.109375, "learning_rate": 8.909714497589915e-06, "loss": 2.7899, "mean_token_accuracy": 0.4829136690647482, "step": 5881 }, { "epoch": 1.0904708935854652, "grad_norm": 6.49609375, "learning_rate": 8.909529106414536e-06, "loss": 2.5318, "mean_token_accuracy": 0.4765746638358103, "step": 5882 }, { "epoch": 1.0906562847608454, "grad_norm": 7.1875, "learning_rate": 8.909343715239155e-06, "loss": 2.5793, "mean_token_accuracy": 0.4867669953295278, "step": 5883 }, { "epoch": 1.0908416759362254, "grad_norm": 6.16796875, "learning_rate": 8.909158324063775e-06, "loss": 2.9176, "mean_token_accuracy": 0.4633587786259542, "step": 5884 }, { "epoch": 1.0910270671116056, "grad_norm": 6.02734375, "learning_rate": 8.908972932888395e-06, "loss": 2.9709, "mean_token_accuracy": 0.4574265505984766, "step": 5885 }, { "epoch": 1.0912124582869855, "grad_norm": 8.546875, "learning_rate": 8.908787541713016e-06, "loss": 2.5235, "mean_token_accuracy": 0.4960962498400102, "step": 5886 }, { "epoch": 1.0913978494623655, "grad_norm": 7.265625, "learning_rate": 8.908602150537635e-06, "loss": 2.5001, "mean_token_accuracy": 0.4853275992916772, "step": 5887 }, { "epoch": 1.0915832406377457, "grad_norm": 5.54296875, "learning_rate": 8.908416759362255e-06, "loss": 2.4464, "mean_token_accuracy": 0.5046728971962616, "step": 5888 }, { "epoch": 1.0917686318131257, "grad_norm": 5.51953125, "learning_rate": 8.908231368186874e-06, "loss": 2.7388, "mean_token_accuracy": 0.45863719234275296, "step": 5889 }, { "epoch": 1.0919540229885056, "grad_norm": 6.40625, "learning_rate": 8.908045977011495e-06, "loss": 2.9976, "mean_token_accuracy": 0.4380753138075314, "step": 5890 }, { "epoch": 1.0921394141638858, "grad_norm": 6.7265625, "learning_rate": 8.907860585836115e-06, "loss": 2.7043, "mean_token_accuracy": 0.45947947524333477, "step": 5891 }, { "epoch": 1.0923248053392658, "grad_norm": 7.9140625, "learning_rate": 8.907675194660736e-06, "loss": 2.6479, "mean_token_accuracy": 0.47155460906601765, "step": 5892 }, { "epoch": 1.092510196514646, "grad_norm": 5.796875, "learning_rate": 8.907489803485356e-06, "loss": 3.1548, "mean_token_accuracy": 0.4369444802267165, "step": 5893 }, { "epoch": 1.092695587690026, "grad_norm": 5.91015625, "learning_rate": 8.907304412309975e-06, "loss": 2.8659, "mean_token_accuracy": 0.455086258179655, "step": 5894 }, { "epoch": 1.092880978865406, "grad_norm": 12.3203125, "learning_rate": 8.907119021134595e-06, "loss": 2.3595, "mean_token_accuracy": 0.49942223249364454, "step": 5895 }, { "epoch": 1.093066370040786, "grad_norm": 8.125, "learning_rate": 8.906933629959214e-06, "loss": 2.9794, "mean_token_accuracy": 0.434631743899709, "step": 5896 }, { "epoch": 1.093251761216166, "grad_norm": 7.1875, "learning_rate": 8.906748238783835e-06, "loss": 2.8822, "mean_token_accuracy": 0.45534901858832705, "step": 5897 }, { "epoch": 1.0934371523915463, "grad_norm": 7.45703125, "learning_rate": 8.906562847608453e-06, "loss": 2.6458, "mean_token_accuracy": 0.4851845434066886, "step": 5898 }, { "epoch": 1.0936225435669262, "grad_norm": 9.171875, "learning_rate": 8.906377456433074e-06, "loss": 2.5894, "mean_token_accuracy": 0.46762200892253614, "step": 5899 }, { "epoch": 1.0938079347423062, "grad_norm": 10.484375, "learning_rate": 8.906192065257694e-06, "loss": 2.6075, "mean_token_accuracy": 0.4638793331569198, "step": 5900 }, { "epoch": 1.0939933259176864, "grad_norm": 7.34375, "learning_rate": 8.906006674082315e-06, "loss": 3.4862, "mean_token_accuracy": 0.41389024987751105, "step": 5901 }, { "epoch": 1.0941787170930664, "grad_norm": 7.01953125, "learning_rate": 8.905821282906935e-06, "loss": 2.5768, "mean_token_accuracy": 0.48210188159706285, "step": 5902 }, { "epoch": 1.0943641082684463, "grad_norm": 5.9921875, "learning_rate": 8.905635891731554e-06, "loss": 2.5851, "mean_token_accuracy": 0.4748700173310225, "step": 5903 }, { "epoch": 1.0945494994438265, "grad_norm": 6.5390625, "learning_rate": 8.905450500556175e-06, "loss": 2.7191, "mean_token_accuracy": 0.4780799524446426, "step": 5904 }, { "epoch": 1.0947348906192065, "grad_norm": 8.8203125, "learning_rate": 8.905265109380793e-06, "loss": 2.8511, "mean_token_accuracy": 0.44391067255258376, "step": 5905 }, { "epoch": 1.0949202817945867, "grad_norm": 8.28125, "learning_rate": 8.905079718205414e-06, "loss": 2.2929, "mean_token_accuracy": 0.5014671361502347, "step": 5906 }, { "epoch": 1.0951056729699666, "grad_norm": 9.3203125, "learning_rate": 8.904894327030034e-06, "loss": 2.8546, "mean_token_accuracy": 0.4511727078891258, "step": 5907 }, { "epoch": 1.0952910641453466, "grad_norm": 8.234375, "learning_rate": 8.904708935854655e-06, "loss": 2.6402, "mean_token_accuracy": 0.48106575963718823, "step": 5908 }, { "epoch": 1.0954764553207268, "grad_norm": 6.171875, "learning_rate": 8.904523544679274e-06, "loss": 2.6611, "mean_token_accuracy": 0.4696384211979023, "step": 5909 }, { "epoch": 1.0956618464961068, "grad_norm": 9.59375, "learning_rate": 8.904338153503894e-06, "loss": 2.4396, "mean_token_accuracy": 0.48324447829398326, "step": 5910 }, { "epoch": 1.095847237671487, "grad_norm": 6.88671875, "learning_rate": 8.904152762328515e-06, "loss": 2.6302, "mean_token_accuracy": 0.4809052333804809, "step": 5911 }, { "epoch": 1.096032628846867, "grad_norm": 5.75, "learning_rate": 8.903967371153134e-06, "loss": 3.3385, "mean_token_accuracy": 0.40424188865042293, "step": 5912 }, { "epoch": 1.0962180200222469, "grad_norm": 5.26953125, "learning_rate": 8.903781979977754e-06, "loss": 2.9783, "mean_token_accuracy": 0.45852593733949665, "step": 5913 }, { "epoch": 1.096403411197627, "grad_norm": 6.2109375, "learning_rate": 8.903596588802373e-06, "loss": 2.4342, "mean_token_accuracy": 0.523036253776435, "step": 5914 }, { "epoch": 1.096588802373007, "grad_norm": 7.2734375, "learning_rate": 8.903411197626993e-06, "loss": 2.6705, "mean_token_accuracy": 0.4719347376111043, "step": 5915 }, { "epoch": 1.096774193548387, "grad_norm": 8.078125, "learning_rate": 8.903225806451614e-06, "loss": 2.782, "mean_token_accuracy": 0.5186085035891772, "step": 5916 }, { "epoch": 1.0969595847237672, "grad_norm": 6.74609375, "learning_rate": 8.903040415276234e-06, "loss": 2.5854, "mean_token_accuracy": 0.4954763171899947, "step": 5917 }, { "epoch": 1.0971449758991472, "grad_norm": 5.7265625, "learning_rate": 8.902855024100853e-06, "loss": 2.7441, "mean_token_accuracy": 0.47062154891689084, "step": 5918 }, { "epoch": 1.0973303670745271, "grad_norm": 6.05078125, "learning_rate": 8.902669632925474e-06, "loss": 3.0181, "mean_token_accuracy": 0.44542124542124545, "step": 5919 }, { "epoch": 1.0975157582499073, "grad_norm": 5.9453125, "learning_rate": 8.902484241750094e-06, "loss": 2.5784, "mean_token_accuracy": 0.4626079320942485, "step": 5920 }, { "epoch": 1.0977011494252873, "grad_norm": 5.0859375, "learning_rate": 8.902298850574713e-06, "loss": 2.6459, "mean_token_accuracy": 0.47695652173913045, "step": 5921 }, { "epoch": 1.0978865406006675, "grad_norm": 5.7265625, "learning_rate": 8.902113459399333e-06, "loss": 2.4642, "mean_token_accuracy": 0.4829562008351947, "step": 5922 }, { "epoch": 1.0980719317760474, "grad_norm": 5.9921875, "learning_rate": 8.901928068223952e-06, "loss": 3.5238, "mean_token_accuracy": 0.4046744982868331, "step": 5923 }, { "epoch": 1.0982573229514274, "grad_norm": 6.61328125, "learning_rate": 8.901742677048574e-06, "loss": 3.3899, "mean_token_accuracy": 0.4277487853158179, "step": 5924 }, { "epoch": 1.0984427141268076, "grad_norm": 6.4453125, "learning_rate": 8.901557285873193e-06, "loss": 3.0662, "mean_token_accuracy": 0.4227543083727314, "step": 5925 }, { "epoch": 1.0986281053021876, "grad_norm": 6.9296875, "learning_rate": 8.901371894697814e-06, "loss": 2.3191, "mean_token_accuracy": 0.5286597358808655, "step": 5926 }, { "epoch": 1.0988134964775678, "grad_norm": 6.078125, "learning_rate": 8.901186503522432e-06, "loss": 2.8647, "mean_token_accuracy": 0.46147596479350034, "step": 5927 }, { "epoch": 1.0989988876529477, "grad_norm": 5.328125, "learning_rate": 8.901001112347053e-06, "loss": 2.5455, "mean_token_accuracy": 0.4766666666666667, "step": 5928 }, { "epoch": 1.0991842788283277, "grad_norm": 9.40625, "learning_rate": 8.900815721171673e-06, "loss": 2.3132, "mean_token_accuracy": 0.5059283551967709, "step": 5929 }, { "epoch": 1.0993696700037079, "grad_norm": 5.5859375, "learning_rate": 8.900630329996292e-06, "loss": 2.6389, "mean_token_accuracy": 0.45506623702112425, "step": 5930 }, { "epoch": 1.0995550611790879, "grad_norm": 5.90625, "learning_rate": 8.900444938820913e-06, "loss": 2.6946, "mean_token_accuracy": 0.47724079797695984, "step": 5931 }, { "epoch": 1.0997404523544678, "grad_norm": 7.05859375, "learning_rate": 8.900259547645533e-06, "loss": 3.09, "mean_token_accuracy": 0.44373325394462637, "step": 5932 }, { "epoch": 1.099925843529848, "grad_norm": 6.4921875, "learning_rate": 8.900074156470154e-06, "loss": 2.6832, "mean_token_accuracy": 0.4835182767624021, "step": 5933 }, { "epoch": 1.100111234705228, "grad_norm": 6.3515625, "learning_rate": 8.899888765294772e-06, "loss": 2.9061, "mean_token_accuracy": 0.4621715732826844, "step": 5934 }, { "epoch": 1.1002966258806082, "grad_norm": 6.6015625, "learning_rate": 8.899703374119393e-06, "loss": 3.5846, "mean_token_accuracy": 0.40544321505786596, "step": 5935 }, { "epoch": 1.1004820170559881, "grad_norm": 6.9765625, "learning_rate": 8.899517982944012e-06, "loss": 2.375, "mean_token_accuracy": 0.5258087968011632, "step": 5936 }, { "epoch": 1.100667408231368, "grad_norm": 6.953125, "learning_rate": 8.899332591768632e-06, "loss": 2.6999, "mean_token_accuracy": 0.47862805928990004, "step": 5937 }, { "epoch": 1.1008527994067483, "grad_norm": 5.98828125, "learning_rate": 8.899147200593253e-06, "loss": 3.1124, "mean_token_accuracy": 0.41685193419297467, "step": 5938 }, { "epoch": 1.1010381905821283, "grad_norm": 6.984375, "learning_rate": 8.898961809417872e-06, "loss": 2.763, "mean_token_accuracy": 0.4514224369296833, "step": 5939 }, { "epoch": 1.1012235817575085, "grad_norm": 5.73046875, "learning_rate": 8.898776418242492e-06, "loss": 2.6891, "mean_token_accuracy": 0.46892265193370164, "step": 5940 }, { "epoch": 1.1014089729328884, "grad_norm": 7.25, "learning_rate": 8.898591027067113e-06, "loss": 2.1948, "mean_token_accuracy": 0.5301125949201362, "step": 5941 }, { "epoch": 1.1015943641082684, "grad_norm": 6.50390625, "learning_rate": 8.898405635891733e-06, "loss": 2.5704, "mean_token_accuracy": 0.4674279499728113, "step": 5942 }, { "epoch": 1.1017797552836486, "grad_norm": 6.5, "learning_rate": 8.898220244716352e-06, "loss": 2.9638, "mean_token_accuracy": 0.44887328728343334, "step": 5943 }, { "epoch": 1.1019651464590285, "grad_norm": 6.89453125, "learning_rate": 8.898034853540972e-06, "loss": 2.497, "mean_token_accuracy": 0.4917267713194739, "step": 5944 }, { "epoch": 1.1021505376344085, "grad_norm": 7.5078125, "learning_rate": 8.897849462365593e-06, "loss": 2.4281, "mean_token_accuracy": 0.4920677601505781, "step": 5945 }, { "epoch": 1.1023359288097887, "grad_norm": 7.0390625, "learning_rate": 8.897664071190212e-06, "loss": 3.2403, "mean_token_accuracy": 0.42653766413268834, "step": 5946 }, { "epoch": 1.1025213199851687, "grad_norm": 6.296875, "learning_rate": 8.897478680014832e-06, "loss": 2.2185, "mean_token_accuracy": 0.5519073569482289, "step": 5947 }, { "epoch": 1.1027067111605489, "grad_norm": 10.53125, "learning_rate": 8.897293288839451e-06, "loss": 2.6036, "mean_token_accuracy": 0.4622032726434662, "step": 5948 }, { "epoch": 1.1028921023359288, "grad_norm": 6.234375, "learning_rate": 8.897107897664073e-06, "loss": 2.2669, "mean_token_accuracy": 0.5373719489060327, "step": 5949 }, { "epoch": 1.1030774935113088, "grad_norm": 7.26171875, "learning_rate": 8.896922506488692e-06, "loss": 3.1001, "mean_token_accuracy": 0.4223356009070295, "step": 5950 }, { "epoch": 1.103262884686689, "grad_norm": 7.4296875, "learning_rate": 8.896737115313312e-06, "loss": 2.808, "mean_token_accuracy": 0.4609227186726823, "step": 5951 }, { "epoch": 1.103448275862069, "grad_norm": 7.42578125, "learning_rate": 8.896551724137931e-06, "loss": 2.7813, "mean_token_accuracy": 0.48213881980556184, "step": 5952 }, { "epoch": 1.103633667037449, "grad_norm": 5.734375, "learning_rate": 8.896366332962552e-06, "loss": 2.9833, "mean_token_accuracy": 0.43242947736715504, "step": 5953 }, { "epoch": 1.103819058212829, "grad_norm": 7.55078125, "learning_rate": 8.896180941787172e-06, "loss": 3.1817, "mean_token_accuracy": 0.4266177921894826, "step": 5954 }, { "epoch": 1.104004449388209, "grad_norm": 8.1875, "learning_rate": 8.895995550611791e-06, "loss": 2.7619, "mean_token_accuracy": 0.4552127225771374, "step": 5955 }, { "epoch": 1.1041898405635893, "grad_norm": 5.50390625, "learning_rate": 8.895810159436411e-06, "loss": 2.7201, "mean_token_accuracy": 0.4820232246538633, "step": 5956 }, { "epoch": 1.1043752317389692, "grad_norm": 6.39453125, "learning_rate": 8.895624768261032e-06, "loss": 2.7391, "mean_token_accuracy": 0.47200864968765016, "step": 5957 }, { "epoch": 1.1045606229143492, "grad_norm": 7.609375, "learning_rate": 8.895439377085652e-06, "loss": 2.8617, "mean_token_accuracy": 0.4474417240873772, "step": 5958 }, { "epoch": 1.1047460140897294, "grad_norm": 7.828125, "learning_rate": 8.895253985910271e-06, "loss": 2.8975, "mean_token_accuracy": 0.4643700513960272, "step": 5959 }, { "epoch": 1.1049314052651094, "grad_norm": 8.2578125, "learning_rate": 8.895068594734892e-06, "loss": 2.8567, "mean_token_accuracy": 0.45591633223884226, "step": 5960 }, { "epoch": 1.1051167964404893, "grad_norm": 10.5078125, "learning_rate": 8.89488320355951e-06, "loss": 2.5254, "mean_token_accuracy": 0.4897573869536486, "step": 5961 }, { "epoch": 1.1053021876158695, "grad_norm": 6.03515625, "learning_rate": 8.894697812384131e-06, "loss": 2.9708, "mean_token_accuracy": 0.46972401482905396, "step": 5962 }, { "epoch": 1.1054875787912495, "grad_norm": 5.31640625, "learning_rate": 8.894512421208751e-06, "loss": 2.9248, "mean_token_accuracy": 0.46721673788124357, "step": 5963 }, { "epoch": 1.1056729699666297, "grad_norm": 6.30078125, "learning_rate": 8.89432703003337e-06, "loss": 3.2314, "mean_token_accuracy": 0.42909737209597565, "step": 5964 }, { "epoch": 1.1058583611420096, "grad_norm": 6.1328125, "learning_rate": 8.89414163885799e-06, "loss": 2.6957, "mean_token_accuracy": 0.4861111111111111, "step": 5965 }, { "epoch": 1.1060437523173896, "grad_norm": 5.7734375, "learning_rate": 8.893956247682611e-06, "loss": 3.1668, "mean_token_accuracy": 0.434232250963126, "step": 5966 }, { "epoch": 1.1062291434927698, "grad_norm": 5.78125, "learning_rate": 8.893770856507232e-06, "loss": 2.7599, "mean_token_accuracy": 0.47367799811142586, "step": 5967 }, { "epoch": 1.1064145346681498, "grad_norm": 6.4765625, "learning_rate": 8.89358546533185e-06, "loss": 3.4905, "mean_token_accuracy": 0.41115510013972983, "step": 5968 }, { "epoch": 1.10659992584353, "grad_norm": 5.94140625, "learning_rate": 8.893400074156471e-06, "loss": 2.672, "mean_token_accuracy": 0.49429657794676807, "step": 5969 }, { "epoch": 1.10678531701891, "grad_norm": 6.30078125, "learning_rate": 8.89321468298109e-06, "loss": 3.0852, "mean_token_accuracy": 0.43815426997245177, "step": 5970 }, { "epoch": 1.1069707081942899, "grad_norm": 6.58203125, "learning_rate": 8.89302929180571e-06, "loss": 2.7507, "mean_token_accuracy": 0.47865714719732644, "step": 5971 }, { "epoch": 1.10715609936967, "grad_norm": 6.0078125, "learning_rate": 8.89284390063033e-06, "loss": 2.8232, "mean_token_accuracy": 0.4657687991021324, "step": 5972 }, { "epoch": 1.10734149054505, "grad_norm": 4.859375, "learning_rate": 8.892658509454951e-06, "loss": 2.579, "mean_token_accuracy": 0.48058201612059037, "step": 5973 }, { "epoch": 1.10752688172043, "grad_norm": 5.08984375, "learning_rate": 8.89247311827957e-06, "loss": 2.5007, "mean_token_accuracy": 0.47933121345779056, "step": 5974 }, { "epoch": 1.1077122728958102, "grad_norm": 5.66015625, "learning_rate": 8.89228772710419e-06, "loss": 2.5891, "mean_token_accuracy": 0.48149417409184375, "step": 5975 }, { "epoch": 1.1078976640711902, "grad_norm": 5.13671875, "learning_rate": 8.892102335928811e-06, "loss": 2.5206, "mean_token_accuracy": 0.49604288996681134, "step": 5976 }, { "epoch": 1.1080830552465704, "grad_norm": 5.98046875, "learning_rate": 8.89191694475343e-06, "loss": 2.2843, "mean_token_accuracy": 0.5371428571428571, "step": 5977 }, { "epoch": 1.1082684464219503, "grad_norm": 6.42578125, "learning_rate": 8.89173155357805e-06, "loss": 2.9641, "mean_token_accuracy": 0.4724199288256228, "step": 5978 }, { "epoch": 1.1084538375973303, "grad_norm": 6.59765625, "learning_rate": 8.89154616240267e-06, "loss": 2.781, "mean_token_accuracy": 0.4551843175933917, "step": 5979 }, { "epoch": 1.1086392287727105, "grad_norm": 8.09375, "learning_rate": 8.89136077122729e-06, "loss": 2.3051, "mean_token_accuracy": 0.5087342935948513, "step": 5980 }, { "epoch": 1.1088246199480905, "grad_norm": 4.9453125, "learning_rate": 8.89117538005191e-06, "loss": 2.9737, "mean_token_accuracy": 0.44810624692572554, "step": 5981 }, { "epoch": 1.1090100111234706, "grad_norm": 7.69140625, "learning_rate": 8.89098998887653e-06, "loss": 3.0245, "mean_token_accuracy": 0.4263418749340926, "step": 5982 }, { "epoch": 1.1091954022988506, "grad_norm": 13.40625, "learning_rate": 8.890804597701151e-06, "loss": 2.6665, "mean_token_accuracy": 0.47416514371158, "step": 5983 }, { "epoch": 1.1093807934742306, "grad_norm": 8.015625, "learning_rate": 8.89061920652577e-06, "loss": 3.2707, "mean_token_accuracy": 0.4064094179202093, "step": 5984 }, { "epoch": 1.1095661846496108, "grad_norm": 5.62109375, "learning_rate": 8.89043381535039e-06, "loss": 2.799, "mean_token_accuracy": 0.4718036352338046, "step": 5985 }, { "epoch": 1.1097515758249907, "grad_norm": 7.15234375, "learning_rate": 8.89024842417501e-06, "loss": 2.4969, "mean_token_accuracy": 0.5173534381905629, "step": 5986 }, { "epoch": 1.1099369670003707, "grad_norm": 8.4453125, "learning_rate": 8.89006303299963e-06, "loss": 2.4693, "mean_token_accuracy": 0.49584043299589053, "step": 5987 }, { "epoch": 1.110122358175751, "grad_norm": 7.83203125, "learning_rate": 8.88987764182425e-06, "loss": 2.585, "mean_token_accuracy": 0.48588030214991285, "step": 5988 }, { "epoch": 1.1103077493511309, "grad_norm": 7.84765625, "learning_rate": 8.88969225064887e-06, "loss": 2.7907, "mean_token_accuracy": 0.4524405506883605, "step": 5989 }, { "epoch": 1.1104931405265108, "grad_norm": 6.3828125, "learning_rate": 8.88950685947349e-06, "loss": 2.8931, "mean_token_accuracy": 0.4735067437379576, "step": 5990 }, { "epoch": 1.110678531701891, "grad_norm": 11.2265625, "learning_rate": 8.88932146829811e-06, "loss": 2.4846, "mean_token_accuracy": 0.49226579520697167, "step": 5991 }, { "epoch": 1.110863922877271, "grad_norm": 7.609375, "learning_rate": 8.88913607712273e-06, "loss": 2.7628, "mean_token_accuracy": 0.48059490084985834, "step": 5992 }, { "epoch": 1.1110493140526512, "grad_norm": 7.23046875, "learning_rate": 8.88895068594735e-06, "loss": 2.9749, "mean_token_accuracy": 0.46493921159277907, "step": 5993 }, { "epoch": 1.1112347052280311, "grad_norm": 6.65625, "learning_rate": 8.88876529477197e-06, "loss": 2.7282, "mean_token_accuracy": 0.46918037170744753, "step": 5994 }, { "epoch": 1.111420096403411, "grad_norm": 6.58984375, "learning_rate": 8.888579903596589e-06, "loss": 2.9356, "mean_token_accuracy": 0.4409030544488712, "step": 5995 }, { "epoch": 1.1116054875787913, "grad_norm": 7.53515625, "learning_rate": 8.888394512421209e-06, "loss": 2.8876, "mean_token_accuracy": 0.45124617402710976, "step": 5996 }, { "epoch": 1.1117908787541713, "grad_norm": 7.71875, "learning_rate": 8.88820912124583e-06, "loss": 2.5171, "mean_token_accuracy": 0.4881751358261425, "step": 5997 }, { "epoch": 1.1119762699295515, "grad_norm": 8.28125, "learning_rate": 8.88802373007045e-06, "loss": 2.7551, "mean_token_accuracy": 0.4795714285714286, "step": 5998 }, { "epoch": 1.1121616611049314, "grad_norm": 6.515625, "learning_rate": 8.887838338895069e-06, "loss": 2.648, "mean_token_accuracy": 0.46601756395570826, "step": 5999 }, { "epoch": 1.1123470522803114, "grad_norm": 6.38671875, "learning_rate": 8.88765294771969e-06, "loss": 2.5822, "mean_token_accuracy": 0.4814588924127836, "step": 6000 }, { "epoch": 1.1125324434556916, "grad_norm": 6.37890625, "learning_rate": 8.88746755654431e-06, "loss": 2.7859, "mean_token_accuracy": 0.46122402634268195, "step": 6001 }, { "epoch": 1.1127178346310715, "grad_norm": 6.60546875, "learning_rate": 8.887282165368929e-06, "loss": 3.0582, "mean_token_accuracy": 0.4555933205260825, "step": 6002 }, { "epoch": 1.1129032258064515, "grad_norm": 7.12890625, "learning_rate": 8.887096774193549e-06, "loss": 2.8978, "mean_token_accuracy": 0.4502908514013749, "step": 6003 }, { "epoch": 1.1130886169818317, "grad_norm": 7.984375, "learning_rate": 8.886911383018168e-06, "loss": 2.6144, "mean_token_accuracy": 0.4776023890784983, "step": 6004 }, { "epoch": 1.1132740081572117, "grad_norm": 6.06640625, "learning_rate": 8.88672599184279e-06, "loss": 2.0241, "mean_token_accuracy": 0.5485391140433553, "step": 6005 }, { "epoch": 1.1134593993325919, "grad_norm": 5.50390625, "learning_rate": 8.886540600667409e-06, "loss": 3.4702, "mean_token_accuracy": 0.401330376940133, "step": 6006 }, { "epoch": 1.1136447905079718, "grad_norm": 5.53515625, "learning_rate": 8.88635520949203e-06, "loss": 3.2897, "mean_token_accuracy": 0.4184006527947776, "step": 6007 }, { "epoch": 1.1138301816833518, "grad_norm": 8.21875, "learning_rate": 8.886169818316648e-06, "loss": 3.4552, "mean_token_accuracy": 0.425670294664189, "step": 6008 }, { "epoch": 1.114015572858732, "grad_norm": 4.9765625, "learning_rate": 8.885984427141269e-06, "loss": 2.622, "mean_token_accuracy": 0.4809813189286518, "step": 6009 }, { "epoch": 1.114200964034112, "grad_norm": 7.359375, "learning_rate": 8.885799035965889e-06, "loss": 2.7759, "mean_token_accuracy": 0.4727088948787062, "step": 6010 }, { "epoch": 1.1143863552094921, "grad_norm": 9.5703125, "learning_rate": 8.885613644790508e-06, "loss": 2.8099, "mean_token_accuracy": 0.4615613382899628, "step": 6011 }, { "epoch": 1.114571746384872, "grad_norm": 5.51953125, "learning_rate": 8.885428253615128e-06, "loss": 2.4475, "mean_token_accuracy": 0.488135593220339, "step": 6012 }, { "epoch": 1.114757137560252, "grad_norm": 6.453125, "learning_rate": 8.885242862439749e-06, "loss": 2.8682, "mean_token_accuracy": 0.45306403488638974, "step": 6013 }, { "epoch": 1.1149425287356323, "grad_norm": 8.34375, "learning_rate": 8.88505747126437e-06, "loss": 2.7924, "mean_token_accuracy": 0.4776268228842079, "step": 6014 }, { "epoch": 1.1151279199110122, "grad_norm": 6.35546875, "learning_rate": 8.884872080088988e-06, "loss": 2.8932, "mean_token_accuracy": 0.45159343312409467, "step": 6015 }, { "epoch": 1.1153133110863922, "grad_norm": 9.234375, "learning_rate": 8.884686688913609e-06, "loss": 3.2073, "mean_token_accuracy": 0.45677083333333335, "step": 6016 }, { "epoch": 1.1154987022617724, "grad_norm": 7.125, "learning_rate": 8.884501297738228e-06, "loss": 2.6975, "mean_token_accuracy": 0.47155049786628733, "step": 6017 }, { "epoch": 1.1156840934371524, "grad_norm": 5.96484375, "learning_rate": 8.884315906562848e-06, "loss": 3.0363, "mean_token_accuracy": 0.44541484716157204, "step": 6018 }, { "epoch": 1.1158694846125325, "grad_norm": 6.61328125, "learning_rate": 8.884130515387468e-06, "loss": 2.7733, "mean_token_accuracy": 0.45345345345345345, "step": 6019 }, { "epoch": 1.1160548757879125, "grad_norm": 8.3515625, "learning_rate": 8.883945124212087e-06, "loss": 2.6205, "mean_token_accuracy": 0.4738770525173022, "step": 6020 }, { "epoch": 1.1162402669632925, "grad_norm": 6.0859375, "learning_rate": 8.88375973303671e-06, "loss": 3.1421, "mean_token_accuracy": 0.43269339997728046, "step": 6021 }, { "epoch": 1.1164256581386727, "grad_norm": 5.21875, "learning_rate": 8.883574341861328e-06, "loss": 2.7327, "mean_token_accuracy": 0.4588117106773823, "step": 6022 }, { "epoch": 1.1166110493140526, "grad_norm": 5.75390625, "learning_rate": 8.883388950685949e-06, "loss": 2.8152, "mean_token_accuracy": 0.4725463591135233, "step": 6023 }, { "epoch": 1.1167964404894326, "grad_norm": 6.03125, "learning_rate": 8.883203559510568e-06, "loss": 2.8547, "mean_token_accuracy": 0.4547422540928156, "step": 6024 }, { "epoch": 1.1169818316648128, "grad_norm": 5.1640625, "learning_rate": 8.883018168335188e-06, "loss": 2.4855, "mean_token_accuracy": 0.5194069431051109, "step": 6025 }, { "epoch": 1.1171672228401928, "grad_norm": 7.43359375, "learning_rate": 8.882832777159809e-06, "loss": 2.4647, "mean_token_accuracy": 0.52, "step": 6026 }, { "epoch": 1.117352614015573, "grad_norm": 5.66796875, "learning_rate": 8.882647385984427e-06, "loss": 2.9139, "mean_token_accuracy": 0.45119425652353995, "step": 6027 }, { "epoch": 1.117538005190953, "grad_norm": 5.43359375, "learning_rate": 8.882461994809048e-06, "loss": 2.7944, "mean_token_accuracy": 0.4552732335537765, "step": 6028 }, { "epoch": 1.1177233963663329, "grad_norm": 6.79296875, "learning_rate": 8.882276603633668e-06, "loss": 2.9487, "mean_token_accuracy": 0.4645888594164456, "step": 6029 }, { "epoch": 1.117908787541713, "grad_norm": 8.2578125, "learning_rate": 8.882091212458289e-06, "loss": 2.2868, "mean_token_accuracy": 0.5099697885196375, "step": 6030 }, { "epoch": 1.118094178717093, "grad_norm": 7.7265625, "learning_rate": 8.881905821282908e-06, "loss": 3.2369, "mean_token_accuracy": 0.42538190364277323, "step": 6031 }, { "epoch": 1.118279569892473, "grad_norm": 7.203125, "learning_rate": 8.881720430107528e-06, "loss": 2.7403, "mean_token_accuracy": 0.4516409098801875, "step": 6032 }, { "epoch": 1.1184649610678532, "grad_norm": 9.046875, "learning_rate": 8.881535038932147e-06, "loss": 2.4627, "mean_token_accuracy": 0.49254933548127267, "step": 6033 }, { "epoch": 1.1186503522432332, "grad_norm": 6.82421875, "learning_rate": 8.881349647756767e-06, "loss": 2.8884, "mean_token_accuracy": 0.4507178354500276, "step": 6034 }, { "epoch": 1.1188357434186134, "grad_norm": 7.21484375, "learning_rate": 8.881164256581388e-06, "loss": 2.9667, "mean_token_accuracy": 0.42517449426239207, "step": 6035 }, { "epoch": 1.1190211345939933, "grad_norm": 5.71875, "learning_rate": 8.880978865406007e-06, "loss": 2.6325, "mean_token_accuracy": 0.4747104530127547, "step": 6036 }, { "epoch": 1.1192065257693733, "grad_norm": 7.53515625, "learning_rate": 8.880793474230627e-06, "loss": 2.6773, "mean_token_accuracy": 0.45834738617200677, "step": 6037 }, { "epoch": 1.1193919169447535, "grad_norm": 6.4609375, "learning_rate": 8.880608083055248e-06, "loss": 2.5723, "mean_token_accuracy": 0.45986009327115257, "step": 6038 }, { "epoch": 1.1195773081201335, "grad_norm": 8.3828125, "learning_rate": 8.880422691879868e-06, "loss": 2.8636, "mean_token_accuracy": 0.44751670816406763, "step": 6039 }, { "epoch": 1.1197626992955136, "grad_norm": 6.6015625, "learning_rate": 8.880237300704487e-06, "loss": 2.5343, "mean_token_accuracy": 0.4836795252225519, "step": 6040 }, { "epoch": 1.1199480904708936, "grad_norm": 8.0390625, "learning_rate": 8.880051909529107e-06, "loss": 3.0737, "mean_token_accuracy": 0.4221553549537501, "step": 6041 }, { "epoch": 1.1201334816462736, "grad_norm": 7.1171875, "learning_rate": 8.879866518353726e-06, "loss": 2.5655, "mean_token_accuracy": 0.4824609482049877, "step": 6042 }, { "epoch": 1.1203188728216538, "grad_norm": 7.76171875, "learning_rate": 8.879681127178347e-06, "loss": 2.1774, "mean_token_accuracy": 0.5496136012364761, "step": 6043 }, { "epoch": 1.1205042639970337, "grad_norm": 6.45703125, "learning_rate": 8.879495736002967e-06, "loss": 2.3754, "mean_token_accuracy": 0.5075975359342916, "step": 6044 }, { "epoch": 1.1206896551724137, "grad_norm": 8.2578125, "learning_rate": 8.879310344827588e-06, "loss": 2.805, "mean_token_accuracy": 0.44574613284804365, "step": 6045 }, { "epoch": 1.120875046347794, "grad_norm": 6.734375, "learning_rate": 8.879124953652207e-06, "loss": 2.7607, "mean_token_accuracy": 0.48055908513341805, "step": 6046 }, { "epoch": 1.1210604375231739, "grad_norm": 6.921875, "learning_rate": 8.878939562476827e-06, "loss": 3.5432, "mean_token_accuracy": 0.41027457927369354, "step": 6047 }, { "epoch": 1.121245828698554, "grad_norm": 6.8203125, "learning_rate": 8.878754171301447e-06, "loss": 2.5396, "mean_token_accuracy": 0.473972602739726, "step": 6048 }, { "epoch": 1.121431219873934, "grad_norm": 6.2890625, "learning_rate": 8.878568780126066e-06, "loss": 2.5447, "mean_token_accuracy": 0.501987434286447, "step": 6049 }, { "epoch": 1.121616611049314, "grad_norm": 6.24609375, "learning_rate": 8.878383388950687e-06, "loss": 2.9563, "mean_token_accuracy": 0.45418759332156916, "step": 6050 }, { "epoch": 1.1218020022246942, "grad_norm": 6.20703125, "learning_rate": 8.878197997775306e-06, "loss": 3.0918, "mean_token_accuracy": 0.4252929097717116, "step": 6051 }, { "epoch": 1.1219873934000741, "grad_norm": 7.38671875, "learning_rate": 8.878012606599926e-06, "loss": 2.6327, "mean_token_accuracy": 0.48363718718151966, "step": 6052 }, { "epoch": 1.122172784575454, "grad_norm": 5.80859375, "learning_rate": 8.877827215424547e-06, "loss": 3.0127, "mean_token_accuracy": 0.4557859107256188, "step": 6053 }, { "epoch": 1.1223581757508343, "grad_norm": 5.828125, "learning_rate": 8.877641824249167e-06, "loss": 3.1603, "mean_token_accuracy": 0.4222945484133442, "step": 6054 }, { "epoch": 1.1225435669262143, "grad_norm": 5.17578125, "learning_rate": 8.877456433073786e-06, "loss": 2.4638, "mean_token_accuracy": 0.49341813341221713, "step": 6055 }, { "epoch": 1.1227289581015945, "grad_norm": 5.3203125, "learning_rate": 8.877271041898406e-06, "loss": 2.4586, "mean_token_accuracy": 0.5209993114979836, "step": 6056 }, { "epoch": 1.1229143492769744, "grad_norm": 5.9453125, "learning_rate": 8.877085650723027e-06, "loss": 2.4775, "mean_token_accuracy": 0.48168031136760164, "step": 6057 }, { "epoch": 1.1230997404523544, "grad_norm": 5.62109375, "learning_rate": 8.876900259547646e-06, "loss": 3.3435, "mean_token_accuracy": 0.4280414620840153, "step": 6058 }, { "epoch": 1.1232851316277346, "grad_norm": 5.0703125, "learning_rate": 8.876714868372266e-06, "loss": 3.1785, "mean_token_accuracy": 0.42826450226784435, "step": 6059 }, { "epoch": 1.1234705228031145, "grad_norm": 5.71875, "learning_rate": 8.876529477196885e-06, "loss": 2.5868, "mean_token_accuracy": 0.4804042059265328, "step": 6060 }, { "epoch": 1.1236559139784945, "grad_norm": 12.9765625, "learning_rate": 8.876344086021507e-06, "loss": 2.7781, "mean_token_accuracy": 0.4370037056643727, "step": 6061 }, { "epoch": 1.1238413051538747, "grad_norm": 6.2265625, "learning_rate": 8.876158694846126e-06, "loss": 3.2056, "mean_token_accuracy": 0.41947608200455583, "step": 6062 }, { "epoch": 1.1240266963292547, "grad_norm": 6.0078125, "learning_rate": 8.875973303670746e-06, "loss": 2.9121, "mean_token_accuracy": 0.4424515975769094, "step": 6063 }, { "epoch": 1.1242120875046349, "grad_norm": 6.15234375, "learning_rate": 8.875787912495367e-06, "loss": 2.8088, "mean_token_accuracy": 0.4528239202657807, "step": 6064 }, { "epoch": 1.1243974786800148, "grad_norm": 5.40234375, "learning_rate": 8.875602521319986e-06, "loss": 2.571, "mean_token_accuracy": 0.4760516451478551, "step": 6065 }, { "epoch": 1.1245828698553948, "grad_norm": 5.4296875, "learning_rate": 8.875417130144606e-06, "loss": 2.659, "mean_token_accuracy": 0.46417629036059393, "step": 6066 }, { "epoch": 1.124768261030775, "grad_norm": 6.10546875, "learning_rate": 8.875231738969225e-06, "loss": 2.6214, "mean_token_accuracy": 0.47344007182403114, "step": 6067 }, { "epoch": 1.124953652206155, "grad_norm": 7.05859375, "learning_rate": 8.875046347793845e-06, "loss": 2.6659, "mean_token_accuracy": 0.4788432267884323, "step": 6068 }, { "epoch": 1.1251390433815351, "grad_norm": 6.66796875, "learning_rate": 8.874860956618466e-06, "loss": 2.6639, "mean_token_accuracy": 0.48020850301352014, "step": 6069 }, { "epoch": 1.125324434556915, "grad_norm": 6.15625, "learning_rate": 8.874675565443086e-06, "loss": 2.5833, "mean_token_accuracy": 0.4834914611005693, "step": 6070 }, { "epoch": 1.125509825732295, "grad_norm": 6.1953125, "learning_rate": 8.874490174267705e-06, "loss": 2.6939, "mean_token_accuracy": 0.47797313314895296, "step": 6071 }, { "epoch": 1.1256952169076753, "grad_norm": 7.5234375, "learning_rate": 8.874304783092326e-06, "loss": 2.8667, "mean_token_accuracy": 0.4502591952604295, "step": 6072 }, { "epoch": 1.1258806080830552, "grad_norm": 8.21875, "learning_rate": 8.874119391916946e-06, "loss": 2.3519, "mean_token_accuracy": 0.5079067274189225, "step": 6073 }, { "epoch": 1.1260659992584352, "grad_norm": 5.41796875, "learning_rate": 8.873934000741565e-06, "loss": 2.8346, "mean_token_accuracy": 0.46578673150399447, "step": 6074 }, { "epoch": 1.1262513904338154, "grad_norm": 8.328125, "learning_rate": 8.873748609566186e-06, "loss": 2.4558, "mean_token_accuracy": 0.4904042988741044, "step": 6075 }, { "epoch": 1.1264367816091954, "grad_norm": 7.4609375, "learning_rate": 8.873563218390804e-06, "loss": 2.8993, "mean_token_accuracy": 0.4347881087919039, "step": 6076 }, { "epoch": 1.1266221727845755, "grad_norm": 9.15625, "learning_rate": 8.873377827215425e-06, "loss": 2.6044, "mean_token_accuracy": 0.473974111814927, "step": 6077 }, { "epoch": 1.1268075639599555, "grad_norm": 6.85546875, "learning_rate": 8.873192436040045e-06, "loss": 2.5192, "mean_token_accuracy": 0.4753384343731607, "step": 6078 }, { "epoch": 1.1269929551353355, "grad_norm": 6.796875, "learning_rate": 8.873007044864666e-06, "loss": 3.0628, "mean_token_accuracy": 0.4322377307519136, "step": 6079 }, { "epoch": 1.1271783463107157, "grad_norm": 8.3203125, "learning_rate": 8.872821653689285e-06, "loss": 2.6804, "mean_token_accuracy": 0.46040575916230364, "step": 6080 }, { "epoch": 1.1273637374860956, "grad_norm": 10.7578125, "learning_rate": 8.872636262513905e-06, "loss": 2.4161, "mean_token_accuracy": 0.4973072780427758, "step": 6081 }, { "epoch": 1.1275491286614758, "grad_norm": 8.7578125, "learning_rate": 8.872450871338526e-06, "loss": 3.3332, "mean_token_accuracy": 0.40064102564102566, "step": 6082 }, { "epoch": 1.1277345198368558, "grad_norm": 10.6484375, "learning_rate": 8.872265480163144e-06, "loss": 2.1574, "mean_token_accuracy": 0.5366781595529664, "step": 6083 }, { "epoch": 1.1279199110122358, "grad_norm": 6.94140625, "learning_rate": 8.872080088987765e-06, "loss": 3.0475, "mean_token_accuracy": 0.4360379628391926, "step": 6084 }, { "epoch": 1.128105302187616, "grad_norm": 7.49609375, "learning_rate": 8.871894697812384e-06, "loss": 2.7365, "mean_token_accuracy": 0.476449515722436, "step": 6085 }, { "epoch": 1.128290693362996, "grad_norm": 6.00390625, "learning_rate": 8.871709306637006e-06, "loss": 2.4432, "mean_token_accuracy": 0.4930013458950202, "step": 6086 }, { "epoch": 1.128476084538376, "grad_norm": 6.11328125, "learning_rate": 8.871523915461625e-06, "loss": 3.1485, "mean_token_accuracy": 0.4186150409530901, "step": 6087 }, { "epoch": 1.128661475713756, "grad_norm": 5.7890625, "learning_rate": 8.871338524286245e-06, "loss": 2.9741, "mean_token_accuracy": 0.4474813089839441, "step": 6088 }, { "epoch": 1.128846866889136, "grad_norm": 9.5859375, "learning_rate": 8.871153133110864e-06, "loss": 2.327, "mean_token_accuracy": 0.5031246014538961, "step": 6089 }, { "epoch": 1.129032258064516, "grad_norm": 7.48046875, "learning_rate": 8.870967741935484e-06, "loss": 2.3146, "mean_token_accuracy": 0.4942584310407349, "step": 6090 }, { "epoch": 1.1292176492398962, "grad_norm": 7.76171875, "learning_rate": 8.870782350760105e-06, "loss": 2.3077, "mean_token_accuracy": 0.5253214379427972, "step": 6091 }, { "epoch": 1.1294030404152762, "grad_norm": 5.453125, "learning_rate": 8.870596959584724e-06, "loss": 2.8183, "mean_token_accuracy": 0.4626738575078059, "step": 6092 }, { "epoch": 1.1295884315906564, "grad_norm": 7.24609375, "learning_rate": 8.870411568409344e-06, "loss": 2.9489, "mean_token_accuracy": 0.4522628642281463, "step": 6093 }, { "epoch": 1.1297738227660363, "grad_norm": 7.52734375, "learning_rate": 8.870226177233965e-06, "loss": 2.6075, "mean_token_accuracy": 0.46719981455725546, "step": 6094 }, { "epoch": 1.1299592139414163, "grad_norm": 5.3984375, "learning_rate": 8.870040786058585e-06, "loss": 2.8817, "mean_token_accuracy": 0.44915349254400716, "step": 6095 }, { "epoch": 1.1301446051167965, "grad_norm": 7.52734375, "learning_rate": 8.869855394883204e-06, "loss": 2.5496, "mean_token_accuracy": 0.484251968503937, "step": 6096 }, { "epoch": 1.1303299962921765, "grad_norm": 8.078125, "learning_rate": 8.869670003707824e-06, "loss": 2.8245, "mean_token_accuracy": 0.4768660667019587, "step": 6097 }, { "epoch": 1.1305153874675566, "grad_norm": 5.87890625, "learning_rate": 8.869484612532443e-06, "loss": 2.5123, "mean_token_accuracy": 0.4964493221433183, "step": 6098 }, { "epoch": 1.1307007786429366, "grad_norm": 5.9375, "learning_rate": 8.869299221357064e-06, "loss": 3.3111, "mean_token_accuracy": 0.4147701918786256, "step": 6099 }, { "epoch": 1.1308861698183166, "grad_norm": 8.703125, "learning_rate": 8.869113830181684e-06, "loss": 2.4049, "mean_token_accuracy": 0.5108347697611426, "step": 6100 }, { "epoch": 1.1310715609936968, "grad_norm": 5.765625, "learning_rate": 8.868928439006303e-06, "loss": 2.6086, "mean_token_accuracy": 0.47177688710754845, "step": 6101 }, { "epoch": 1.1312569521690767, "grad_norm": 5.51953125, "learning_rate": 8.868743047830925e-06, "loss": 3.0713, "mean_token_accuracy": 0.4107723822147247, "step": 6102 }, { "epoch": 1.1314423433444567, "grad_norm": 6.14453125, "learning_rate": 8.868557656655544e-06, "loss": 2.3673, "mean_token_accuracy": 0.5261427162117724, "step": 6103 }, { "epoch": 1.131627734519837, "grad_norm": 7.734375, "learning_rate": 8.868372265480165e-06, "loss": 2.4421, "mean_token_accuracy": 0.47541733363823463, "step": 6104 }, { "epoch": 1.1318131256952169, "grad_norm": 5.47265625, "learning_rate": 8.868186874304783e-06, "loss": 2.6935, "mean_token_accuracy": 0.4624617268830373, "step": 6105 }, { "epoch": 1.131998516870597, "grad_norm": 7.76171875, "learning_rate": 8.868001483129404e-06, "loss": 2.8987, "mean_token_accuracy": 0.4656620021528525, "step": 6106 }, { "epoch": 1.132183908045977, "grad_norm": 7.2578125, "learning_rate": 8.867816091954024e-06, "loss": 2.0636, "mean_token_accuracy": 0.5696166042087056, "step": 6107 }, { "epoch": 1.132369299221357, "grad_norm": 6.375, "learning_rate": 8.867630700778643e-06, "loss": 2.3469, "mean_token_accuracy": 0.5098680227630463, "step": 6108 }, { "epoch": 1.1325546903967372, "grad_norm": 6.703125, "learning_rate": 8.867445309603264e-06, "loss": 3.2382, "mean_token_accuracy": 0.4299039398579981, "step": 6109 }, { "epoch": 1.1327400815721171, "grad_norm": 10.421875, "learning_rate": 8.867259918427884e-06, "loss": 2.6108, "mean_token_accuracy": 0.47201534642292936, "step": 6110 }, { "epoch": 1.1329254727474973, "grad_norm": 6.0, "learning_rate": 8.867074527252505e-06, "loss": 3.4837, "mean_token_accuracy": 0.41144321093082836, "step": 6111 }, { "epoch": 1.1331108639228773, "grad_norm": 5.9609375, "learning_rate": 8.866889136077123e-06, "loss": 2.7884, "mean_token_accuracy": 0.47805190644024353, "step": 6112 }, { "epoch": 1.1332962550982573, "grad_norm": 9.0234375, "learning_rate": 8.866703744901744e-06, "loss": 2.7015, "mean_token_accuracy": 0.47411081538833777, "step": 6113 }, { "epoch": 1.1334816462736375, "grad_norm": 5.828125, "learning_rate": 8.866518353726363e-06, "loss": 2.3777, "mean_token_accuracy": 0.49588839941262847, "step": 6114 }, { "epoch": 1.1336670374490174, "grad_norm": 6.65625, "learning_rate": 8.866332962550983e-06, "loss": 2.912, "mean_token_accuracy": 0.4566843866902934, "step": 6115 }, { "epoch": 1.1338524286243974, "grad_norm": 6.9453125, "learning_rate": 8.866147571375604e-06, "loss": 2.8876, "mean_token_accuracy": 0.44988344988344986, "step": 6116 }, { "epoch": 1.1340378197997776, "grad_norm": 9.53125, "learning_rate": 8.865962180200222e-06, "loss": 2.9036, "mean_token_accuracy": 0.44989830508474576, "step": 6117 }, { "epoch": 1.1342232109751575, "grad_norm": 5.93359375, "learning_rate": 8.865776789024843e-06, "loss": 3.3547, "mean_token_accuracy": 0.42628960460853627, "step": 6118 }, { "epoch": 1.1344086021505375, "grad_norm": 7.00390625, "learning_rate": 8.865591397849463e-06, "loss": 3.6309, "mean_token_accuracy": 0.37717828418230565, "step": 6119 }, { "epoch": 1.1345939933259177, "grad_norm": 8.9140625, "learning_rate": 8.865406006674084e-06, "loss": 2.8457, "mean_token_accuracy": 0.4421704591355864, "step": 6120 }, { "epoch": 1.1347793845012977, "grad_norm": 7.609375, "learning_rate": 8.865220615498703e-06, "loss": 2.8997, "mean_token_accuracy": 0.441622760800843, "step": 6121 }, { "epoch": 1.1349647756766779, "grad_norm": 5.8125, "learning_rate": 8.865035224323323e-06, "loss": 2.4551, "mean_token_accuracy": 0.5312924330329589, "step": 6122 }, { "epoch": 1.1351501668520578, "grad_norm": 7.90234375, "learning_rate": 8.864849833147942e-06, "loss": 2.9831, "mean_token_accuracy": 0.4339234252778921, "step": 6123 }, { "epoch": 1.135335558027438, "grad_norm": 9.0625, "learning_rate": 8.864664441972562e-06, "loss": 2.4746, "mean_token_accuracy": 0.5060065878705677, "step": 6124 }, { "epoch": 1.135520949202818, "grad_norm": 6.33984375, "learning_rate": 8.864479050797183e-06, "loss": 3.0591, "mean_token_accuracy": 0.42217648572163624, "step": 6125 }, { "epoch": 1.135706340378198, "grad_norm": 7.55859375, "learning_rate": 8.864293659621803e-06, "loss": 2.7672, "mean_token_accuracy": 0.4686385844447379, "step": 6126 }, { "epoch": 1.1358917315535781, "grad_norm": 8.921875, "learning_rate": 8.864108268446422e-06, "loss": 2.8735, "mean_token_accuracy": 0.4532710280373832, "step": 6127 }, { "epoch": 1.136077122728958, "grad_norm": 7.5234375, "learning_rate": 8.863922877271043e-06, "loss": 2.7409, "mean_token_accuracy": 0.47589760638297873, "step": 6128 }, { "epoch": 1.136262513904338, "grad_norm": 7.9921875, "learning_rate": 8.863737486095663e-06, "loss": 2.2411, "mean_token_accuracy": 0.5232399179767601, "step": 6129 }, { "epoch": 1.1364479050797183, "grad_norm": 9.2421875, "learning_rate": 8.863552094920282e-06, "loss": 2.3582, "mean_token_accuracy": 0.512446240062557, "step": 6130 }, { "epoch": 1.1366332962550982, "grad_norm": 6.65625, "learning_rate": 8.863366703744903e-06, "loss": 2.819, "mean_token_accuracy": 0.45714647423724525, "step": 6131 }, { "epoch": 1.1368186874304782, "grad_norm": 7.11328125, "learning_rate": 8.863181312569521e-06, "loss": 2.694, "mean_token_accuracy": 0.493714436945878, "step": 6132 }, { "epoch": 1.1370040786058584, "grad_norm": 6.70703125, "learning_rate": 8.862995921394142e-06, "loss": 2.6374, "mean_token_accuracy": 0.4800796812749004, "step": 6133 }, { "epoch": 1.1371894697812384, "grad_norm": 6.12890625, "learning_rate": 8.862810530218762e-06, "loss": 2.7784, "mean_token_accuracy": 0.4684875977222276, "step": 6134 }, { "epoch": 1.1373748609566185, "grad_norm": 6.265625, "learning_rate": 8.862625139043383e-06, "loss": 2.4664, "mean_token_accuracy": 0.4958939348966063, "step": 6135 }, { "epoch": 1.1375602521319985, "grad_norm": 6.14453125, "learning_rate": 8.862439747868002e-06, "loss": 2.564, "mean_token_accuracy": 0.484965922758252, "step": 6136 }, { "epoch": 1.1377456433073785, "grad_norm": 8.046875, "learning_rate": 8.862254356692622e-06, "loss": 3.1018, "mean_token_accuracy": 0.4235631573355997, "step": 6137 }, { "epoch": 1.1379310344827587, "grad_norm": 7.98828125, "learning_rate": 8.862068965517243e-06, "loss": 2.7619, "mean_token_accuracy": 0.4607142857142857, "step": 6138 }, { "epoch": 1.1381164256581386, "grad_norm": 5.58984375, "learning_rate": 8.861883574341861e-06, "loss": 2.6263, "mean_token_accuracy": 0.4780308258569128, "step": 6139 }, { "epoch": 1.1383018168335188, "grad_norm": 7.81640625, "learning_rate": 8.861698183166482e-06, "loss": 2.5254, "mean_token_accuracy": 0.45831752055660974, "step": 6140 }, { "epoch": 1.1384872080088988, "grad_norm": 7.66015625, "learning_rate": 8.8615127919911e-06, "loss": 2.9118, "mean_token_accuracy": 0.4401151956632221, "step": 6141 }, { "epoch": 1.1386725991842788, "grad_norm": 8.4375, "learning_rate": 8.861327400815723e-06, "loss": 2.5593, "mean_token_accuracy": 0.4883792048929664, "step": 6142 }, { "epoch": 1.138857990359659, "grad_norm": 7.8359375, "learning_rate": 8.861142009640342e-06, "loss": 3.0609, "mean_token_accuracy": 0.42585794094173984, "step": 6143 }, { "epoch": 1.139043381535039, "grad_norm": 9.703125, "learning_rate": 8.860956618464962e-06, "loss": 2.5034, "mean_token_accuracy": 0.5018406521167499, "step": 6144 }, { "epoch": 1.139228772710419, "grad_norm": 7.53515625, "learning_rate": 8.860771227289583e-06, "loss": 2.9299, "mean_token_accuracy": 0.4520629266844761, "step": 6145 }, { "epoch": 1.139414163885799, "grad_norm": 6.44921875, "learning_rate": 8.860585836114201e-06, "loss": 2.7818, "mean_token_accuracy": 0.4752210018751674, "step": 6146 }, { "epoch": 1.139599555061179, "grad_norm": 7.875, "learning_rate": 8.860400444938822e-06, "loss": 3.0154, "mean_token_accuracy": 0.4488506519693507, "step": 6147 }, { "epoch": 1.139784946236559, "grad_norm": 7.3359375, "learning_rate": 8.86021505376344e-06, "loss": 3.096, "mean_token_accuracy": 0.42114485981308414, "step": 6148 }, { "epoch": 1.1399703374119392, "grad_norm": 7.24609375, "learning_rate": 8.860029662588061e-06, "loss": 3.2497, "mean_token_accuracy": 0.42070760628554904, "step": 6149 }, { "epoch": 1.1401557285873192, "grad_norm": 8.1484375, "learning_rate": 8.859844271412682e-06, "loss": 2.6299, "mean_token_accuracy": 0.48242530755711777, "step": 6150 }, { "epoch": 1.1403411197626994, "grad_norm": 6.3984375, "learning_rate": 8.859658880237302e-06, "loss": 3.2276, "mean_token_accuracy": 0.4148978246539222, "step": 6151 }, { "epoch": 1.1405265109380793, "grad_norm": 6.25390625, "learning_rate": 8.859473489061921e-06, "loss": 2.6551, "mean_token_accuracy": 0.4745433974462583, "step": 6152 }, { "epoch": 1.1407119021134595, "grad_norm": 7.86328125, "learning_rate": 8.859288097886541e-06, "loss": 2.6942, "mean_token_accuracy": 0.4392332268370607, "step": 6153 }, { "epoch": 1.1408972932888395, "grad_norm": 9.9296875, "learning_rate": 8.859102706711162e-06, "loss": 3.2602, "mean_token_accuracy": 0.4218085717568654, "step": 6154 }, { "epoch": 1.1410826844642195, "grad_norm": 7.20703125, "learning_rate": 8.85891731553578e-06, "loss": 3.003, "mean_token_accuracy": 0.44236327899179095, "step": 6155 }, { "epoch": 1.1412680756395996, "grad_norm": 7.47265625, "learning_rate": 8.858731924360401e-06, "loss": 2.7516, "mean_token_accuracy": 0.4636749389224637, "step": 6156 }, { "epoch": 1.1414534668149796, "grad_norm": 6.35546875, "learning_rate": 8.85854653318502e-06, "loss": 2.7147, "mean_token_accuracy": 0.47580522057701113, "step": 6157 }, { "epoch": 1.1416388579903596, "grad_norm": 6.4296875, "learning_rate": 8.858361142009642e-06, "loss": 2.7357, "mean_token_accuracy": 0.4905345211581292, "step": 6158 }, { "epoch": 1.1418242491657398, "grad_norm": 8.21875, "learning_rate": 8.858175750834261e-06, "loss": 2.5916, "mean_token_accuracy": 0.48572150619155924, "step": 6159 }, { "epoch": 1.1420096403411197, "grad_norm": 6.5859375, "learning_rate": 8.857990359658882e-06, "loss": 2.3482, "mean_token_accuracy": 0.5140864714086472, "step": 6160 }, { "epoch": 1.1421950315164997, "grad_norm": 5.5859375, "learning_rate": 8.8578049684835e-06, "loss": 2.8062, "mean_token_accuracy": 0.46034919365587096, "step": 6161 }, { "epoch": 1.14238042269188, "grad_norm": 6.50390625, "learning_rate": 8.85761957730812e-06, "loss": 2.6138, "mean_token_accuracy": 0.4618403837767117, "step": 6162 }, { "epoch": 1.1425658138672599, "grad_norm": 5.13671875, "learning_rate": 8.857434186132741e-06, "loss": 2.8813, "mean_token_accuracy": 0.4548903488479689, "step": 6163 }, { "epoch": 1.14275120504264, "grad_norm": 6.75, "learning_rate": 8.85724879495736e-06, "loss": 2.1379, "mean_token_accuracy": 0.5262407011107715, "step": 6164 }, { "epoch": 1.14293659621802, "grad_norm": 6.8828125, "learning_rate": 8.85706340378198e-06, "loss": 2.6396, "mean_token_accuracy": 0.47533185840707964, "step": 6165 }, { "epoch": 1.1431219873934, "grad_norm": 7.125, "learning_rate": 8.856878012606601e-06, "loss": 2.3762, "mean_token_accuracy": 0.5093226022803488, "step": 6166 }, { "epoch": 1.1433073785687802, "grad_norm": 6.2265625, "learning_rate": 8.856692621431222e-06, "loss": 2.9756, "mean_token_accuracy": 0.4512119328775637, "step": 6167 }, { "epoch": 1.1434927697441601, "grad_norm": 5.55078125, "learning_rate": 8.85650723025584e-06, "loss": 2.741, "mean_token_accuracy": 0.477198341697578, "step": 6168 }, { "epoch": 1.1436781609195403, "grad_norm": 6.2421875, "learning_rate": 8.856321839080461e-06, "loss": 2.9421, "mean_token_accuracy": 0.4389093588798821, "step": 6169 }, { "epoch": 1.1438635520949203, "grad_norm": 6.46484375, "learning_rate": 8.85613644790508e-06, "loss": 2.8186, "mean_token_accuracy": 0.46213503649635035, "step": 6170 }, { "epoch": 1.1440489432703003, "grad_norm": 7.03515625, "learning_rate": 8.8559510567297e-06, "loss": 2.5902, "mean_token_accuracy": 0.47406340057636887, "step": 6171 }, { "epoch": 1.1442343344456805, "grad_norm": 6.33203125, "learning_rate": 8.85576566555432e-06, "loss": 2.3843, "mean_token_accuracy": 0.5090029561945714, "step": 6172 }, { "epoch": 1.1444197256210604, "grad_norm": 7.03125, "learning_rate": 8.85558027437894e-06, "loss": 2.4378, "mean_token_accuracy": 0.5029572074683984, "step": 6173 }, { "epoch": 1.1446051167964404, "grad_norm": 7.3125, "learning_rate": 8.85539488320356e-06, "loss": 2.41, "mean_token_accuracy": 0.5085627779354717, "step": 6174 }, { "epoch": 1.1447905079718206, "grad_norm": 6.609375, "learning_rate": 8.85520949202818e-06, "loss": 3.1158, "mean_token_accuracy": 0.43976565360673747, "step": 6175 }, { "epoch": 1.1449758991472005, "grad_norm": 6.07421875, "learning_rate": 8.855024100852801e-06, "loss": 2.7264, "mean_token_accuracy": 0.4723886428757489, "step": 6176 }, { "epoch": 1.1451612903225807, "grad_norm": 6.13671875, "learning_rate": 8.85483870967742e-06, "loss": 2.7601, "mean_token_accuracy": 0.47481927710843375, "step": 6177 }, { "epoch": 1.1453466814979607, "grad_norm": 6.359375, "learning_rate": 8.85465331850204e-06, "loss": 3.1006, "mean_token_accuracy": 0.42986490370796204, "step": 6178 }, { "epoch": 1.1455320726733407, "grad_norm": 5.30078125, "learning_rate": 8.854467927326659e-06, "loss": 2.6905, "mean_token_accuracy": 0.48302312464749014, "step": 6179 }, { "epoch": 1.1457174638487209, "grad_norm": 5.7578125, "learning_rate": 8.85428253615128e-06, "loss": 2.8808, "mean_token_accuracy": 0.44185007261758463, "step": 6180 }, { "epoch": 1.1459028550241008, "grad_norm": 6.31640625, "learning_rate": 8.8540971449759e-06, "loss": 2.9105, "mean_token_accuracy": 0.4383685459270139, "step": 6181 }, { "epoch": 1.146088246199481, "grad_norm": 5.88671875, "learning_rate": 8.85391175380052e-06, "loss": 3.1046, "mean_token_accuracy": 0.4342357706246003, "step": 6182 }, { "epoch": 1.146273637374861, "grad_norm": 6.09765625, "learning_rate": 8.853726362625141e-06, "loss": 2.5872, "mean_token_accuracy": 0.4873096446700508, "step": 6183 }, { "epoch": 1.146459028550241, "grad_norm": 7.90625, "learning_rate": 8.85354097144976e-06, "loss": 3.003, "mean_token_accuracy": 0.44263640092985096, "step": 6184 }, { "epoch": 1.1466444197256211, "grad_norm": 7.60546875, "learning_rate": 8.85335558027438e-06, "loss": 2.549, "mean_token_accuracy": 0.47827758554402156, "step": 6185 }, { "epoch": 1.1468298109010011, "grad_norm": 6.21484375, "learning_rate": 8.853170189098999e-06, "loss": 2.8155, "mean_token_accuracy": 0.4563193343898574, "step": 6186 }, { "epoch": 1.147015202076381, "grad_norm": 6.77734375, "learning_rate": 8.85298479792362e-06, "loss": 2.6995, "mean_token_accuracy": 0.4885710637211084, "step": 6187 }, { "epoch": 1.1472005932517613, "grad_norm": 7.890625, "learning_rate": 8.85279940674824e-06, "loss": 2.6052, "mean_token_accuracy": 0.5041039671682627, "step": 6188 }, { "epoch": 1.1473859844271412, "grad_norm": 6.5078125, "learning_rate": 8.852614015572859e-06, "loss": 2.816, "mean_token_accuracy": 0.4597826086956522, "step": 6189 }, { "epoch": 1.1475713756025212, "grad_norm": 10.2421875, "learning_rate": 8.85242862439748e-06, "loss": 2.6783, "mean_token_accuracy": 0.4799685781618225, "step": 6190 }, { "epoch": 1.1477567667779014, "grad_norm": 7.70703125, "learning_rate": 8.8522432332221e-06, "loss": 3.0254, "mean_token_accuracy": 0.4185829345161667, "step": 6191 }, { "epoch": 1.1479421579532814, "grad_norm": 6.5625, "learning_rate": 8.85205784204672e-06, "loss": 2.7407, "mean_token_accuracy": 0.4522696929238985, "step": 6192 }, { "epoch": 1.1481275491286616, "grad_norm": 7.265625, "learning_rate": 8.851872450871339e-06, "loss": 2.8813, "mean_token_accuracy": 0.44809537656295434, "step": 6193 }, { "epoch": 1.1483129403040415, "grad_norm": 6.09765625, "learning_rate": 8.85168705969596e-06, "loss": 2.6448, "mean_token_accuracy": 0.4782293178519594, "step": 6194 }, { "epoch": 1.1484983314794215, "grad_norm": 6.83203125, "learning_rate": 8.851501668520578e-06, "loss": 2.6886, "mean_token_accuracy": 0.4938891679223171, "step": 6195 }, { "epoch": 1.1486837226548017, "grad_norm": 6.3203125, "learning_rate": 8.851316277345199e-06, "loss": 2.8137, "mean_token_accuracy": 0.46339633963396337, "step": 6196 }, { "epoch": 1.1488691138301816, "grad_norm": 5.8984375, "learning_rate": 8.85113088616982e-06, "loss": 3.0294, "mean_token_accuracy": 0.4501965188096575, "step": 6197 }, { "epoch": 1.1490545050055618, "grad_norm": 10.1953125, "learning_rate": 8.850945494994438e-06, "loss": 2.4977, "mean_token_accuracy": 0.48570675800289814, "step": 6198 }, { "epoch": 1.1492398961809418, "grad_norm": 5.88671875, "learning_rate": 8.850760103819059e-06, "loss": 2.6489, "mean_token_accuracy": 0.4861492673992674, "step": 6199 }, { "epoch": 1.1494252873563218, "grad_norm": 6.0703125, "learning_rate": 8.85057471264368e-06, "loss": 2.7254, "mean_token_accuracy": 0.46112024276141106, "step": 6200 }, { "epoch": 1.149610678531702, "grad_norm": 5.0234375, "learning_rate": 8.8503893214683e-06, "loss": 2.6529, "mean_token_accuracy": 0.4836140191169777, "step": 6201 }, { "epoch": 1.149796069707082, "grad_norm": 5.01171875, "learning_rate": 8.850203930292918e-06, "loss": 2.5626, "mean_token_accuracy": 0.4747630028183449, "step": 6202 }, { "epoch": 1.149981460882462, "grad_norm": 6.9921875, "learning_rate": 8.850018539117539e-06, "loss": 2.9312, "mean_token_accuracy": 0.44868968895420036, "step": 6203 }, { "epoch": 1.150166852057842, "grad_norm": 5.78515625, "learning_rate": 8.849833147942158e-06, "loss": 3.0616, "mean_token_accuracy": 0.44425675675675674, "step": 6204 }, { "epoch": 1.150352243233222, "grad_norm": 6.37890625, "learning_rate": 8.849647756766778e-06, "loss": 2.6839, "mean_token_accuracy": 0.490343616754452, "step": 6205 }, { "epoch": 1.1505376344086022, "grad_norm": 6.73828125, "learning_rate": 8.849462365591399e-06, "loss": 2.5792, "mean_token_accuracy": 0.48501362397820164, "step": 6206 }, { "epoch": 1.1507230255839822, "grad_norm": 5.6875, "learning_rate": 8.84927697441602e-06, "loss": 3.2156, "mean_token_accuracy": 0.4222932954276238, "step": 6207 }, { "epoch": 1.1509084167593622, "grad_norm": 6.5234375, "learning_rate": 8.849091583240638e-06, "loss": 2.5741, "mean_token_accuracy": 0.48509933774834435, "step": 6208 }, { "epoch": 1.1510938079347424, "grad_norm": 5.44140625, "learning_rate": 8.848906192065258e-06, "loss": 2.7665, "mean_token_accuracy": 0.4680750619615248, "step": 6209 }, { "epoch": 1.1512791991101223, "grad_norm": 6.4609375, "learning_rate": 8.848720800889879e-06, "loss": 2.9077, "mean_token_accuracy": 0.4458077709611452, "step": 6210 }, { "epoch": 1.1514645902855025, "grad_norm": 5.75390625, "learning_rate": 8.848535409714498e-06, "loss": 2.7499, "mean_token_accuracy": 0.46450079239302694, "step": 6211 }, { "epoch": 1.1516499814608825, "grad_norm": 6.359375, "learning_rate": 8.848350018539118e-06, "loss": 2.8467, "mean_token_accuracy": 0.4379429701664744, "step": 6212 }, { "epoch": 1.1518353726362625, "grad_norm": 6.06640625, "learning_rate": 8.848164627363737e-06, "loss": 2.9641, "mean_token_accuracy": 0.4384881422924901, "step": 6213 }, { "epoch": 1.1520207638116426, "grad_norm": 7.65234375, "learning_rate": 8.847979236188358e-06, "loss": 2.9871, "mean_token_accuracy": 0.48944531776330563, "step": 6214 }, { "epoch": 1.1522061549870226, "grad_norm": 6.0390625, "learning_rate": 8.847793845012978e-06, "loss": 2.4876, "mean_token_accuracy": 0.504351563339422, "step": 6215 }, { "epoch": 1.1523915461624026, "grad_norm": 6.94921875, "learning_rate": 8.847608453837599e-06, "loss": 3.1185, "mean_token_accuracy": 0.4230883224659158, "step": 6216 }, { "epoch": 1.1525769373377828, "grad_norm": 6.9140625, "learning_rate": 8.847423062662217e-06, "loss": 2.5818, "mean_token_accuracy": 0.47446083995459704, "step": 6217 }, { "epoch": 1.1527623285131627, "grad_norm": 5.47265625, "learning_rate": 8.847237671486838e-06, "loss": 2.5684, "mean_token_accuracy": 0.5115008260261786, "step": 6218 }, { "epoch": 1.1529477196885427, "grad_norm": 6.69921875, "learning_rate": 8.847052280311458e-06, "loss": 2.5594, "mean_token_accuracy": 0.4939550949913644, "step": 6219 }, { "epoch": 1.153133110863923, "grad_norm": 10.765625, "learning_rate": 8.846866889136077e-06, "loss": 2.8199, "mean_token_accuracy": 0.4374415341440599, "step": 6220 }, { "epoch": 1.1533185020393029, "grad_norm": 9.625, "learning_rate": 8.846681497960698e-06, "loss": 2.5348, "mean_token_accuracy": 0.48185053380782916, "step": 6221 }, { "epoch": 1.153503893214683, "grad_norm": 7.05078125, "learning_rate": 8.846496106785316e-06, "loss": 3.0819, "mean_token_accuracy": 0.4322967678746327, "step": 6222 }, { "epoch": 1.153689284390063, "grad_norm": 6.15625, "learning_rate": 8.846310715609939e-06, "loss": 3.1256, "mean_token_accuracy": 0.46271421954608616, "step": 6223 }, { "epoch": 1.1538746755654432, "grad_norm": 7.44921875, "learning_rate": 8.846125324434557e-06, "loss": 3.386, "mean_token_accuracy": 0.408676393955185, "step": 6224 }, { "epoch": 1.1540600667408232, "grad_norm": 6.1796875, "learning_rate": 8.845939933259178e-06, "loss": 2.8037, "mean_token_accuracy": 0.47654656696125086, "step": 6225 }, { "epoch": 1.1542454579162031, "grad_norm": 6.53125, "learning_rate": 8.845754542083798e-06, "loss": 2.5673, "mean_token_accuracy": 0.47266717518433765, "step": 6226 }, { "epoch": 1.1544308490915833, "grad_norm": 5.0390625, "learning_rate": 8.845569150908417e-06, "loss": 2.7396, "mean_token_accuracy": 0.47694910475116314, "step": 6227 }, { "epoch": 1.1546162402669633, "grad_norm": 6.28515625, "learning_rate": 8.845383759733038e-06, "loss": 2.8478, "mean_token_accuracy": 0.4412562455389008, "step": 6228 }, { "epoch": 1.1548016314423433, "grad_norm": 5.9765625, "learning_rate": 8.845198368557656e-06, "loss": 2.8053, "mean_token_accuracy": 0.4449157829070493, "step": 6229 }, { "epoch": 1.1549870226177235, "grad_norm": 5.38671875, "learning_rate": 8.845012977382277e-06, "loss": 3.0872, "mean_token_accuracy": 0.43776987553975105, "step": 6230 }, { "epoch": 1.1551724137931034, "grad_norm": 5.1796875, "learning_rate": 8.844827586206897e-06, "loss": 2.8646, "mean_token_accuracy": 0.4440495299243293, "step": 6231 }, { "epoch": 1.1553578049684834, "grad_norm": 6.55859375, "learning_rate": 8.844642195031518e-06, "loss": 3.0487, "mean_token_accuracy": 0.43379766427465877, "step": 6232 }, { "epoch": 1.1555431961438636, "grad_norm": 5.23828125, "learning_rate": 8.844456803856137e-06, "loss": 2.9256, "mean_token_accuracy": 0.45817312688350825, "step": 6233 }, { "epoch": 1.1557285873192435, "grad_norm": 7.55078125, "learning_rate": 8.844271412680757e-06, "loss": 2.6379, "mean_token_accuracy": 0.4900171889461854, "step": 6234 }, { "epoch": 1.1559139784946237, "grad_norm": 6.58203125, "learning_rate": 8.844086021505378e-06, "loss": 2.942, "mean_token_accuracy": 0.4361308238198087, "step": 6235 }, { "epoch": 1.1560993696700037, "grad_norm": 7.13671875, "learning_rate": 8.843900630329997e-06, "loss": 2.7927, "mean_token_accuracy": 0.4555878084179971, "step": 6236 }, { "epoch": 1.1562847608453837, "grad_norm": 6.46484375, "learning_rate": 8.843715239154617e-06, "loss": 2.2697, "mean_token_accuracy": 0.5258587167854828, "step": 6237 }, { "epoch": 1.1564701520207639, "grad_norm": 7.328125, "learning_rate": 8.843529847979236e-06, "loss": 2.3706, "mean_token_accuracy": 0.5077056922694506, "step": 6238 }, { "epoch": 1.1566555431961438, "grad_norm": 5.9453125, "learning_rate": 8.843344456803858e-06, "loss": 3.4275, "mean_token_accuracy": 0.40905416329830235, "step": 6239 }, { "epoch": 1.156840934371524, "grad_norm": 5.27734375, "learning_rate": 8.843159065628477e-06, "loss": 3.287, "mean_token_accuracy": 0.42162162162162165, "step": 6240 }, { "epoch": 1.157026325546904, "grad_norm": 6.25390625, "learning_rate": 8.842973674453097e-06, "loss": 2.8897, "mean_token_accuracy": 0.4485800604229607, "step": 6241 }, { "epoch": 1.157211716722284, "grad_norm": 5.87890625, "learning_rate": 8.842788283277716e-06, "loss": 3.1861, "mean_token_accuracy": 0.407801842766543, "step": 6242 }, { "epoch": 1.1573971078976641, "grad_norm": 6.6640625, "learning_rate": 8.842602892102337e-06, "loss": 2.6684, "mean_token_accuracy": 0.4861517976031957, "step": 6243 }, { "epoch": 1.1575824990730441, "grad_norm": 5.79296875, "learning_rate": 8.842417500926957e-06, "loss": 3.1253, "mean_token_accuracy": 0.4390753862237194, "step": 6244 }, { "epoch": 1.157767890248424, "grad_norm": 7.1875, "learning_rate": 8.842232109751576e-06, "loss": 2.8409, "mean_token_accuracy": 0.458041958041958, "step": 6245 }, { "epoch": 1.1579532814238043, "grad_norm": 6.7109375, "learning_rate": 8.842046718576196e-06, "loss": 3.1423, "mean_token_accuracy": 0.4358725761772853, "step": 6246 }, { "epoch": 1.1581386725991842, "grad_norm": 8.3984375, "learning_rate": 8.841861327400817e-06, "loss": 2.1037, "mean_token_accuracy": 0.5110396869759642, "step": 6247 }, { "epoch": 1.1583240637745644, "grad_norm": 5.796875, "learning_rate": 8.841675936225437e-06, "loss": 3.0579, "mean_token_accuracy": 0.43250063083522583, "step": 6248 }, { "epoch": 1.1585094549499444, "grad_norm": 6.5703125, "learning_rate": 8.841490545050056e-06, "loss": 3.2946, "mean_token_accuracy": 0.4102183106640759, "step": 6249 }, { "epoch": 1.1586948461253244, "grad_norm": 10.1640625, "learning_rate": 8.841305153874677e-06, "loss": 2.6042, "mean_token_accuracy": 0.48033573141486813, "step": 6250 }, { "epoch": 1.1588802373007046, "grad_norm": 5.66015625, "learning_rate": 8.841119762699295e-06, "loss": 2.9208, "mean_token_accuracy": 0.4442389758179232, "step": 6251 }, { "epoch": 1.1590656284760845, "grad_norm": 7.98828125, "learning_rate": 8.840934371523916e-06, "loss": 2.3404, "mean_token_accuracy": 0.5026996305768684, "step": 6252 }, { "epoch": 1.1592510196514647, "grad_norm": 8.3671875, "learning_rate": 8.840748980348536e-06, "loss": 2.5435, "mean_token_accuracy": 0.48536846943465606, "step": 6253 }, { "epoch": 1.1594364108268447, "grad_norm": 6.265625, "learning_rate": 8.840563589173155e-06, "loss": 2.8096, "mean_token_accuracy": 0.45177728063634104, "step": 6254 }, { "epoch": 1.1596218020022246, "grad_norm": 7.08984375, "learning_rate": 8.840378197997776e-06, "loss": 2.7046, "mean_token_accuracy": 0.4654343807763401, "step": 6255 }, { "epoch": 1.1598071931776048, "grad_norm": 7.02734375, "learning_rate": 8.840192806822396e-06, "loss": 2.7457, "mean_token_accuracy": 0.4650558586645882, "step": 6256 }, { "epoch": 1.1599925843529848, "grad_norm": 5.3359375, "learning_rate": 8.840007415647017e-06, "loss": 2.965, "mean_token_accuracy": 0.45515592988845893, "step": 6257 }, { "epoch": 1.1601779755283648, "grad_norm": 8.1640625, "learning_rate": 8.839822024471635e-06, "loss": 3.1483, "mean_token_accuracy": 0.41381544841886986, "step": 6258 }, { "epoch": 1.160363366703745, "grad_norm": 7.18359375, "learning_rate": 8.839636633296256e-06, "loss": 2.761, "mean_token_accuracy": 0.46389185343294204, "step": 6259 }, { "epoch": 1.160548757879125, "grad_norm": 7.296875, "learning_rate": 8.839451242120875e-06, "loss": 1.9678, "mean_token_accuracy": 0.5932790224032587, "step": 6260 }, { "epoch": 1.160734149054505, "grad_norm": 8.8359375, "learning_rate": 8.839265850945495e-06, "loss": 2.7525, "mean_token_accuracy": 0.45690468700849324, "step": 6261 }, { "epoch": 1.160919540229885, "grad_norm": 6.8203125, "learning_rate": 8.839080459770116e-06, "loss": 2.281, "mean_token_accuracy": 0.5114660114660114, "step": 6262 }, { "epoch": 1.161104931405265, "grad_norm": 6.04296875, "learning_rate": 8.838895068594736e-06, "loss": 2.8721, "mean_token_accuracy": 0.47297297297297297, "step": 6263 }, { "epoch": 1.1612903225806452, "grad_norm": 7.47265625, "learning_rate": 8.838709677419357e-06, "loss": 2.3967, "mean_token_accuracy": 0.5031376377074327, "step": 6264 }, { "epoch": 1.1614757137560252, "grad_norm": 7.96484375, "learning_rate": 8.838524286243976e-06, "loss": 2.6904, "mean_token_accuracy": 0.474112623432079, "step": 6265 }, { "epoch": 1.1616611049314052, "grad_norm": 6.62890625, "learning_rate": 8.838338895068596e-06, "loss": 2.6536, "mean_token_accuracy": 0.4803471994759253, "step": 6266 }, { "epoch": 1.1618464961067854, "grad_norm": 9.0, "learning_rate": 8.838153503893215e-06, "loss": 2.9049, "mean_token_accuracy": 0.43220022413149045, "step": 6267 }, { "epoch": 1.1620318872821653, "grad_norm": 7.6171875, "learning_rate": 8.837968112717835e-06, "loss": 2.8885, "mean_token_accuracy": 0.4385288966725044, "step": 6268 }, { "epoch": 1.1622172784575455, "grad_norm": 5.2265625, "learning_rate": 8.837782721542456e-06, "loss": 2.5944, "mean_token_accuracy": 0.48122555410691004, "step": 6269 }, { "epoch": 1.1624026696329255, "grad_norm": 7.8359375, "learning_rate": 8.837597330367075e-06, "loss": 3.3007, "mean_token_accuracy": 0.4160255001099143, "step": 6270 }, { "epoch": 1.1625880608083055, "grad_norm": 9.2265625, "learning_rate": 8.837411939191695e-06, "loss": 2.6935, "mean_token_accuracy": 0.4792046144505161, "step": 6271 }, { "epoch": 1.1627734519836856, "grad_norm": 5.3984375, "learning_rate": 8.837226548016316e-06, "loss": 3.0817, "mean_token_accuracy": 0.44159786304031085, "step": 6272 }, { "epoch": 1.1629588431590656, "grad_norm": 6.30078125, "learning_rate": 8.837041156840936e-06, "loss": 2.4583, "mean_token_accuracy": 0.5039006067610518, "step": 6273 }, { "epoch": 1.1631442343344456, "grad_norm": 8.8046875, "learning_rate": 8.836855765665555e-06, "loss": 3.4902, "mean_token_accuracy": 0.39008394543546693, "step": 6274 }, { "epoch": 1.1633296255098258, "grad_norm": 8.3515625, "learning_rate": 8.836670374490175e-06, "loss": 2.4008, "mean_token_accuracy": 0.49205340114431023, "step": 6275 }, { "epoch": 1.1635150166852057, "grad_norm": 7.72265625, "learning_rate": 8.836484983314794e-06, "loss": 2.2125, "mean_token_accuracy": 0.5218600953895072, "step": 6276 }, { "epoch": 1.163700407860586, "grad_norm": 6.6796875, "learning_rate": 8.836299592139415e-06, "loss": 3.018, "mean_token_accuracy": 0.43903448275862067, "step": 6277 }, { "epoch": 1.163885799035966, "grad_norm": 6.99609375, "learning_rate": 8.836114200964035e-06, "loss": 2.7199, "mean_token_accuracy": 0.4672369270497095, "step": 6278 }, { "epoch": 1.1640711902113459, "grad_norm": 4.890625, "learning_rate": 8.835928809788656e-06, "loss": 3.0414, "mean_token_accuracy": 0.4319129226493747, "step": 6279 }, { "epoch": 1.164256581386726, "grad_norm": 5.29296875, "learning_rate": 8.835743418613274e-06, "loss": 3.7187, "mean_token_accuracy": 0.3777634130575307, "step": 6280 }, { "epoch": 1.164441972562106, "grad_norm": 6.15234375, "learning_rate": 8.835558027437895e-06, "loss": 2.7329, "mean_token_accuracy": 0.46790299572039945, "step": 6281 }, { "epoch": 1.1646273637374862, "grad_norm": 6.90234375, "learning_rate": 8.835372636262515e-06, "loss": 2.6391, "mean_token_accuracy": 0.48060109289617486, "step": 6282 }, { "epoch": 1.1648127549128662, "grad_norm": 5.32421875, "learning_rate": 8.835187245087134e-06, "loss": 2.6662, "mean_token_accuracy": 0.48919472247497725, "step": 6283 }, { "epoch": 1.1649981460882461, "grad_norm": 7.390625, "learning_rate": 8.835001853911755e-06, "loss": 2.6174, "mean_token_accuracy": 0.47289972899729, "step": 6284 }, { "epoch": 1.1651835372636263, "grad_norm": 6.99609375, "learning_rate": 8.834816462736373e-06, "loss": 2.7107, "mean_token_accuracy": 0.4818551028429189, "step": 6285 }, { "epoch": 1.1653689284390063, "grad_norm": 4.90625, "learning_rate": 8.834631071560994e-06, "loss": 2.6862, "mean_token_accuracy": 0.47253634894991925, "step": 6286 }, { "epoch": 1.1655543196143863, "grad_norm": 8.9609375, "learning_rate": 8.834445680385614e-06, "loss": 2.4417, "mean_token_accuracy": 0.4875769318440848, "step": 6287 }, { "epoch": 1.1657397107897665, "grad_norm": 5.94140625, "learning_rate": 8.834260289210235e-06, "loss": 2.894, "mean_token_accuracy": 0.4424284717376134, "step": 6288 }, { "epoch": 1.1659251019651464, "grad_norm": 6.578125, "learning_rate": 8.834074898034854e-06, "loss": 2.9189, "mean_token_accuracy": 0.44844844844844844, "step": 6289 }, { "epoch": 1.1661104931405264, "grad_norm": 7.06640625, "learning_rate": 8.833889506859474e-06, "loss": 2.3077, "mean_token_accuracy": 0.4882315112540193, "step": 6290 }, { "epoch": 1.1662958843159066, "grad_norm": 7.6875, "learning_rate": 8.833704115684095e-06, "loss": 2.8007, "mean_token_accuracy": 0.45584158415841586, "step": 6291 }, { "epoch": 1.1664812754912866, "grad_norm": 6.96875, "learning_rate": 8.833518724508714e-06, "loss": 2.4177, "mean_token_accuracy": 0.485842242126553, "step": 6292 }, { "epoch": 1.1666666666666667, "grad_norm": 9.2890625, "learning_rate": 8.833333333333334e-06, "loss": 2.8094, "mean_token_accuracy": 0.4373887240356083, "step": 6293 }, { "epoch": 1.1668520578420467, "grad_norm": 9.1484375, "learning_rate": 8.833147942157953e-06, "loss": 2.2688, "mean_token_accuracy": 0.5540246555474981, "step": 6294 }, { "epoch": 1.1670374490174267, "grad_norm": 6.2265625, "learning_rate": 8.832962550982575e-06, "loss": 3.2563, "mean_token_accuracy": 0.42158273381294964, "step": 6295 }, { "epoch": 1.1672228401928069, "grad_norm": 7.8828125, "learning_rate": 8.832777159807194e-06, "loss": 2.6791, "mean_token_accuracy": 0.4647398843930636, "step": 6296 }, { "epoch": 1.1674082313681868, "grad_norm": 5.86328125, "learning_rate": 8.832591768631814e-06, "loss": 2.8409, "mean_token_accuracy": 0.48254149971379506, "step": 6297 }, { "epoch": 1.167593622543567, "grad_norm": 8.203125, "learning_rate": 8.832406377456433e-06, "loss": 3.0331, "mean_token_accuracy": 0.4413630011824143, "step": 6298 }, { "epoch": 1.167779013718947, "grad_norm": 6.07421875, "learning_rate": 8.832220986281054e-06, "loss": 2.9287, "mean_token_accuracy": 0.43567292755260556, "step": 6299 }, { "epoch": 1.167964404894327, "grad_norm": 9.328125, "learning_rate": 8.832035595105674e-06, "loss": 3.2963, "mean_token_accuracy": 0.4292892156862745, "step": 6300 }, { "epoch": 1.1681497960697071, "grad_norm": 6.78125, "learning_rate": 8.831850203930293e-06, "loss": 3.5141, "mean_token_accuracy": 0.4124339699309224, "step": 6301 }, { "epoch": 1.1683351872450871, "grad_norm": 6.765625, "learning_rate": 8.831664812754913e-06, "loss": 3.1951, "mean_token_accuracy": 0.4376654481280726, "step": 6302 }, { "epoch": 1.168520578420467, "grad_norm": 8.8125, "learning_rate": 8.831479421579534e-06, "loss": 2.7285, "mean_token_accuracy": 0.45105623326390953, "step": 6303 }, { "epoch": 1.1687059695958473, "grad_norm": 10.703125, "learning_rate": 8.831294030404154e-06, "loss": 2.5187, "mean_token_accuracy": 0.5028266666666666, "step": 6304 }, { "epoch": 1.1688913607712272, "grad_norm": 7.14453125, "learning_rate": 8.831108639228773e-06, "loss": 3.028, "mean_token_accuracy": 0.4359763920964845, "step": 6305 }, { "epoch": 1.1690767519466074, "grad_norm": 6.25, "learning_rate": 8.830923248053394e-06, "loss": 3.4223, "mean_token_accuracy": 0.405736480430236, "step": 6306 }, { "epoch": 1.1692621431219874, "grad_norm": 7.14453125, "learning_rate": 8.830737856878014e-06, "loss": 3.4809, "mean_token_accuracy": 0.3938528491772066, "step": 6307 }, { "epoch": 1.1694475342973674, "grad_norm": 6.58984375, "learning_rate": 8.830552465702633e-06, "loss": 4.1768, "mean_token_accuracy": 0.3549208903191204, "step": 6308 }, { "epoch": 1.1696329254727476, "grad_norm": 6.6953125, "learning_rate": 8.830367074527253e-06, "loss": 2.4775, "mean_token_accuracy": 0.4905234657039711, "step": 6309 }, { "epoch": 1.1698183166481275, "grad_norm": 6.6328125, "learning_rate": 8.830181683351872e-06, "loss": 2.8528, "mean_token_accuracy": 0.45447434292866085, "step": 6310 }, { "epoch": 1.1700037078235077, "grad_norm": 9.140625, "learning_rate": 8.829996292176493e-06, "loss": 2.582, "mean_token_accuracy": 0.4651473154706909, "step": 6311 }, { "epoch": 1.1701890989988877, "grad_norm": 6.1796875, "learning_rate": 8.829810901001113e-06, "loss": 2.67, "mean_token_accuracy": 0.48894570612122124, "step": 6312 }, { "epoch": 1.1703744901742676, "grad_norm": 6.3984375, "learning_rate": 8.829625509825734e-06, "loss": 3.2249, "mean_token_accuracy": 0.42331347299634825, "step": 6313 }, { "epoch": 1.1705598813496478, "grad_norm": 6.41796875, "learning_rate": 8.829440118650352e-06, "loss": 2.3696, "mean_token_accuracy": 0.49776071657069737, "step": 6314 }, { "epoch": 1.1707452725250278, "grad_norm": 9.984375, "learning_rate": 8.829254727474973e-06, "loss": 2.3138, "mean_token_accuracy": 0.492573402417962, "step": 6315 }, { "epoch": 1.1709306637004078, "grad_norm": 6.68359375, "learning_rate": 8.829069336299593e-06, "loss": 2.7926, "mean_token_accuracy": 0.4692871877148751, "step": 6316 }, { "epoch": 1.171116054875788, "grad_norm": 5.48046875, "learning_rate": 8.828883945124212e-06, "loss": 2.607, "mean_token_accuracy": 0.49708565636087176, "step": 6317 }, { "epoch": 1.171301446051168, "grad_norm": 6.27734375, "learning_rate": 8.828698553948833e-06, "loss": 2.3901, "mean_token_accuracy": 0.508356940509915, "step": 6318 }, { "epoch": 1.171486837226548, "grad_norm": 5.9765625, "learning_rate": 8.828513162773452e-06, "loss": 2.3003, "mean_token_accuracy": 0.5313873548968782, "step": 6319 }, { "epoch": 1.171672228401928, "grad_norm": 5.484375, "learning_rate": 8.828327771598074e-06, "loss": 2.9595, "mean_token_accuracy": 0.4595854922279793, "step": 6320 }, { "epoch": 1.171857619577308, "grad_norm": 5.640625, "learning_rate": 8.828142380422693e-06, "loss": 2.9775, "mean_token_accuracy": 0.43765156349712825, "step": 6321 }, { "epoch": 1.1720430107526882, "grad_norm": 5.8046875, "learning_rate": 8.827956989247313e-06, "loss": 2.8408, "mean_token_accuracy": 0.46032745591939545, "step": 6322 }, { "epoch": 1.1722284019280682, "grad_norm": 6.73046875, "learning_rate": 8.827771598071932e-06, "loss": 2.6551, "mean_token_accuracy": 0.4846560846560847, "step": 6323 }, { "epoch": 1.1724137931034484, "grad_norm": 15.2109375, "learning_rate": 8.827586206896552e-06, "loss": 2.3105, "mean_token_accuracy": 0.49884054017187285, "step": 6324 }, { "epoch": 1.1725991842788284, "grad_norm": 10.375, "learning_rate": 8.827400815721173e-06, "loss": 2.9711, "mean_token_accuracy": 0.4347183003899422, "step": 6325 }, { "epoch": 1.1727845754542083, "grad_norm": 10.046875, "learning_rate": 8.827215424545792e-06, "loss": 2.6262, "mean_token_accuracy": 0.4604775890690844, "step": 6326 }, { "epoch": 1.1729699666295885, "grad_norm": 6.92578125, "learning_rate": 8.827030033370412e-06, "loss": 2.6615, "mean_token_accuracy": 0.4831445523193096, "step": 6327 }, { "epoch": 1.1731553578049685, "grad_norm": 5.6640625, "learning_rate": 8.826844642195033e-06, "loss": 2.6929, "mean_token_accuracy": 0.4762435416877723, "step": 6328 }, { "epoch": 1.1733407489803485, "grad_norm": 7.15625, "learning_rate": 8.826659251019653e-06, "loss": 2.4264, "mean_token_accuracy": 0.4958384332925337, "step": 6329 }, { "epoch": 1.1735261401557286, "grad_norm": 9.6875, "learning_rate": 8.826473859844272e-06, "loss": 2.7945, "mean_token_accuracy": 0.44794903666873837, "step": 6330 }, { "epoch": 1.1737115313311086, "grad_norm": 8.7109375, "learning_rate": 8.826288468668892e-06, "loss": 2.5257, "mean_token_accuracy": 0.5103480714957667, "step": 6331 }, { "epoch": 1.1738969225064886, "grad_norm": 7.24609375, "learning_rate": 8.826103077493511e-06, "loss": 2.8784, "mean_token_accuracy": 0.4468599033816425, "step": 6332 }, { "epoch": 1.1740823136818688, "grad_norm": 12.5390625, "learning_rate": 8.825917686318132e-06, "loss": 2.7923, "mean_token_accuracy": 0.4553501180173092, "step": 6333 }, { "epoch": 1.1742677048572487, "grad_norm": 9.5234375, "learning_rate": 8.825732295142752e-06, "loss": 2.3155, "mean_token_accuracy": 0.5107069521853916, "step": 6334 }, { "epoch": 1.174453096032629, "grad_norm": 6.90625, "learning_rate": 8.825546903967371e-06, "loss": 2.4689, "mean_token_accuracy": 0.5278425655976676, "step": 6335 }, { "epoch": 1.174638487208009, "grad_norm": 6.3828125, "learning_rate": 8.825361512791991e-06, "loss": 2.7515, "mean_token_accuracy": 0.4889873582692558, "step": 6336 }, { "epoch": 1.1748238783833889, "grad_norm": 7.4453125, "learning_rate": 8.825176121616612e-06, "loss": 2.7022, "mean_token_accuracy": 0.4718196457326892, "step": 6337 }, { "epoch": 1.175009269558769, "grad_norm": 8.875, "learning_rate": 8.824990730441232e-06, "loss": 2.1254, "mean_token_accuracy": 0.5217060167555218, "step": 6338 }, { "epoch": 1.175194660734149, "grad_norm": 6.37890625, "learning_rate": 8.824805339265851e-06, "loss": 2.4546, "mean_token_accuracy": 0.4835164835164835, "step": 6339 }, { "epoch": 1.1753800519095292, "grad_norm": 8.28125, "learning_rate": 8.824619948090472e-06, "loss": 3.0341, "mean_token_accuracy": 0.43360905528950805, "step": 6340 }, { "epoch": 1.1755654430849092, "grad_norm": 6.94140625, "learning_rate": 8.82443455691509e-06, "loss": 3.1317, "mean_token_accuracy": 0.4315977254989408, "step": 6341 }, { "epoch": 1.1757508342602891, "grad_norm": 11.15625, "learning_rate": 8.824249165739711e-06, "loss": 2.5288, "mean_token_accuracy": 0.48819757129212715, "step": 6342 }, { "epoch": 1.1759362254356693, "grad_norm": 9.421875, "learning_rate": 8.824063774564331e-06, "loss": 2.2493, "mean_token_accuracy": 0.49625508238818744, "step": 6343 }, { "epoch": 1.1761216166110493, "grad_norm": 7.3046875, "learning_rate": 8.823878383388952e-06, "loss": 2.6195, "mean_token_accuracy": 0.4844215659712815, "step": 6344 }, { "epoch": 1.1763070077864293, "grad_norm": 9.171875, "learning_rate": 8.823692992213572e-06, "loss": 2.8151, "mean_token_accuracy": 0.46156693399136334, "step": 6345 }, { "epoch": 1.1764923989618095, "grad_norm": 8.3828125, "learning_rate": 8.823507601038191e-06, "loss": 2.7746, "mean_token_accuracy": 0.4703600436416535, "step": 6346 }, { "epoch": 1.1766777901371894, "grad_norm": 5.70703125, "learning_rate": 8.823322209862812e-06, "loss": 3.7264, "mean_token_accuracy": 0.3795459111914808, "step": 6347 }, { "epoch": 1.1768631813125696, "grad_norm": 5.75, "learning_rate": 8.82313681868743e-06, "loss": 2.7436, "mean_token_accuracy": 0.4621694417238002, "step": 6348 }, { "epoch": 1.1770485724879496, "grad_norm": 7.2109375, "learning_rate": 8.822951427512051e-06, "loss": 2.9344, "mean_token_accuracy": 0.44876997210246006, "step": 6349 }, { "epoch": 1.1772339636633296, "grad_norm": 7.4375, "learning_rate": 8.822766036336672e-06, "loss": 3.5666, "mean_token_accuracy": 0.39516767890150517, "step": 6350 }, { "epoch": 1.1774193548387097, "grad_norm": 5.4296875, "learning_rate": 8.82258064516129e-06, "loss": 2.789, "mean_token_accuracy": 0.454745650737723, "step": 6351 }, { "epoch": 1.1776047460140897, "grad_norm": 9.0703125, "learning_rate": 8.82239525398591e-06, "loss": 2.4672, "mean_token_accuracy": 0.4940248565965583, "step": 6352 }, { "epoch": 1.17779013718947, "grad_norm": 6.15234375, "learning_rate": 8.822209862810531e-06, "loss": 2.6406, "mean_token_accuracy": 0.48978741142142557, "step": 6353 }, { "epoch": 1.1779755283648499, "grad_norm": 5.9609375, "learning_rate": 8.822024471635152e-06, "loss": 3.0807, "mean_token_accuracy": 0.4498792270531401, "step": 6354 }, { "epoch": 1.1781609195402298, "grad_norm": 5.484375, "learning_rate": 8.82183908045977e-06, "loss": 3.0307, "mean_token_accuracy": 0.441527446300716, "step": 6355 }, { "epoch": 1.17834631071561, "grad_norm": 7.9609375, "learning_rate": 8.821653689284391e-06, "loss": 3.1834, "mean_token_accuracy": 0.42505207861606736, "step": 6356 }, { "epoch": 1.17853170189099, "grad_norm": 7.56640625, "learning_rate": 8.82146829810901e-06, "loss": 2.8839, "mean_token_accuracy": 0.47977856373981237, "step": 6357 }, { "epoch": 1.17871709306637, "grad_norm": 6.39453125, "learning_rate": 8.82128290693363e-06, "loss": 2.8867, "mean_token_accuracy": 0.4513958964009418, "step": 6358 }, { "epoch": 1.1789024842417501, "grad_norm": 8.9453125, "learning_rate": 8.821097515758251e-06, "loss": 2.606, "mean_token_accuracy": 0.47661171424741994, "step": 6359 }, { "epoch": 1.1790878754171301, "grad_norm": 6.2734375, "learning_rate": 8.820912124582871e-06, "loss": 2.403, "mean_token_accuracy": 0.5047009663097415, "step": 6360 }, { "epoch": 1.17927326659251, "grad_norm": 5.390625, "learning_rate": 8.82072673340749e-06, "loss": 3.5323, "mean_token_accuracy": 0.4063740228502706, "step": 6361 }, { "epoch": 1.1794586577678903, "grad_norm": 6.6484375, "learning_rate": 8.82054134223211e-06, "loss": 3.0652, "mean_token_accuracy": 0.42973651191969886, "step": 6362 }, { "epoch": 1.1796440489432702, "grad_norm": 6.89453125, "learning_rate": 8.820355951056731e-06, "loss": 2.9167, "mean_token_accuracy": 0.45741690094261617, "step": 6363 }, { "epoch": 1.1798294401186504, "grad_norm": 5.4453125, "learning_rate": 8.82017055988135e-06, "loss": 2.8703, "mean_token_accuracy": 0.4465591397849462, "step": 6364 }, { "epoch": 1.1800148312940304, "grad_norm": 6.56640625, "learning_rate": 8.81998516870597e-06, "loss": 2.8019, "mean_token_accuracy": 0.467167842896594, "step": 6365 }, { "epoch": 1.1802002224694104, "grad_norm": 5.3046875, "learning_rate": 8.81979977753059e-06, "loss": 2.3345, "mean_token_accuracy": 0.5, "step": 6366 }, { "epoch": 1.1803856136447906, "grad_norm": 5.5625, "learning_rate": 8.81961438635521e-06, "loss": 3.0838, "mean_token_accuracy": 0.4562492085602127, "step": 6367 }, { "epoch": 1.1805710048201705, "grad_norm": 7.52734375, "learning_rate": 8.81942899517983e-06, "loss": 2.5619, "mean_token_accuracy": 0.47722018223854207, "step": 6368 }, { "epoch": 1.1807563959955507, "grad_norm": 7.34375, "learning_rate": 8.81924360400445e-06, "loss": 4.0755, "mean_token_accuracy": 0.3647910205287255, "step": 6369 }, { "epoch": 1.1809417871709307, "grad_norm": 6.02734375, "learning_rate": 8.81905821282907e-06, "loss": 2.9377, "mean_token_accuracy": 0.4447466848010881, "step": 6370 }, { "epoch": 1.1811271783463106, "grad_norm": 6.54296875, "learning_rate": 8.81887282165369e-06, "loss": 3.1299, "mean_token_accuracy": 0.4263312274368231, "step": 6371 }, { "epoch": 1.1813125695216908, "grad_norm": 7.546875, "learning_rate": 8.81868743047831e-06, "loss": 2.7367, "mean_token_accuracy": 0.4625487646293888, "step": 6372 }, { "epoch": 1.1814979606970708, "grad_norm": 6.0078125, "learning_rate": 8.81850203930293e-06, "loss": 2.9158, "mean_token_accuracy": 0.43024618991793667, "step": 6373 }, { "epoch": 1.1816833518724508, "grad_norm": 5.5703125, "learning_rate": 8.81831664812755e-06, "loss": 2.2095, "mean_token_accuracy": 0.5179052234787291, "step": 6374 }, { "epoch": 1.181868743047831, "grad_norm": 6.92578125, "learning_rate": 8.818131256952169e-06, "loss": 2.6794, "mean_token_accuracy": 0.46814799714606053, "step": 6375 }, { "epoch": 1.182054134223211, "grad_norm": 6.390625, "learning_rate": 8.81794586577679e-06, "loss": 3.2691, "mean_token_accuracy": 0.44510875167508507, "step": 6376 }, { "epoch": 1.1822395253985911, "grad_norm": 6.1171875, "learning_rate": 8.81776047460141e-06, "loss": 2.2124, "mean_token_accuracy": 0.5571020255996023, "step": 6377 }, { "epoch": 1.182424916573971, "grad_norm": 5.12890625, "learning_rate": 8.81757508342603e-06, "loss": 2.651, "mean_token_accuracy": 0.4798093804865814, "step": 6378 }, { "epoch": 1.182610307749351, "grad_norm": 5.8671875, "learning_rate": 8.817389692250649e-06, "loss": 2.5449, "mean_token_accuracy": 0.49141707114952055, "step": 6379 }, { "epoch": 1.1827956989247312, "grad_norm": 5.9375, "learning_rate": 8.81720430107527e-06, "loss": 2.6384, "mean_token_accuracy": 0.4539434787749793, "step": 6380 }, { "epoch": 1.1829810901001112, "grad_norm": 6.0234375, "learning_rate": 8.81701890989989e-06, "loss": 2.8109, "mean_token_accuracy": 0.46606668633815285, "step": 6381 }, { "epoch": 1.1831664812754914, "grad_norm": 8.2421875, "learning_rate": 8.816833518724509e-06, "loss": 3.044, "mean_token_accuracy": 0.410218387705581, "step": 6382 }, { "epoch": 1.1833518724508714, "grad_norm": 5.33203125, "learning_rate": 8.816648127549129e-06, "loss": 2.8462, "mean_token_accuracy": 0.4774321641297154, "step": 6383 }, { "epoch": 1.1835372636262513, "grad_norm": 5.71875, "learning_rate": 8.81646273637375e-06, "loss": 2.6792, "mean_token_accuracy": 0.4960244648318043, "step": 6384 }, { "epoch": 1.1837226548016315, "grad_norm": 5.6484375, "learning_rate": 8.81627734519837e-06, "loss": 2.9034, "mean_token_accuracy": 0.446986301369863, "step": 6385 }, { "epoch": 1.1839080459770115, "grad_norm": 5.50390625, "learning_rate": 8.816091954022989e-06, "loss": 2.7844, "mean_token_accuracy": 0.46271556958950694, "step": 6386 }, { "epoch": 1.1840934371523915, "grad_norm": 6.48828125, "learning_rate": 8.81590656284761e-06, "loss": 3.5053, "mean_token_accuracy": 0.41990625861593606, "step": 6387 }, { "epoch": 1.1842788283277716, "grad_norm": 5.7265625, "learning_rate": 8.81572117167223e-06, "loss": 2.9776, "mean_token_accuracy": 0.45307725883893496, "step": 6388 }, { "epoch": 1.1844642195031516, "grad_norm": 7.515625, "learning_rate": 8.815535780496849e-06, "loss": 2.6269, "mean_token_accuracy": 0.47132815390307065, "step": 6389 }, { "epoch": 1.1846496106785316, "grad_norm": 6.3046875, "learning_rate": 8.81535038932147e-06, "loss": 2.7769, "mean_token_accuracy": 0.4734870654336884, "step": 6390 }, { "epoch": 1.1848350018539118, "grad_norm": 7.43359375, "learning_rate": 8.815164998146088e-06, "loss": 3.12, "mean_token_accuracy": 0.43461226695487704, "step": 6391 }, { "epoch": 1.1850203930292917, "grad_norm": 7.54296875, "learning_rate": 8.81497960697071e-06, "loss": 2.5001, "mean_token_accuracy": 0.4799328295549958, "step": 6392 }, { "epoch": 1.185205784204672, "grad_norm": 6.58984375, "learning_rate": 8.814794215795329e-06, "loss": 3.0162, "mean_token_accuracy": 0.4430566747246844, "step": 6393 }, { "epoch": 1.185391175380052, "grad_norm": 6.28515625, "learning_rate": 8.81460882461995e-06, "loss": 2.461, "mean_token_accuracy": 0.4929384965831435, "step": 6394 }, { "epoch": 1.1855765665554319, "grad_norm": 7.05078125, "learning_rate": 8.814423433444568e-06, "loss": 3.0958, "mean_token_accuracy": 0.4270176739587196, "step": 6395 }, { "epoch": 1.185761957730812, "grad_norm": 6.90234375, "learning_rate": 8.814238042269189e-06, "loss": 2.8033, "mean_token_accuracy": 0.4609099350046425, "step": 6396 }, { "epoch": 1.185947348906192, "grad_norm": 7.18359375, "learning_rate": 8.81405265109381e-06, "loss": 2.7717, "mean_token_accuracy": 0.46469833119383824, "step": 6397 }, { "epoch": 1.1861327400815722, "grad_norm": 6.8046875, "learning_rate": 8.813867259918428e-06, "loss": 2.4397, "mean_token_accuracy": 0.4994773779304166, "step": 6398 }, { "epoch": 1.1863181312569522, "grad_norm": 7.79296875, "learning_rate": 8.813681868743049e-06, "loss": 2.576, "mean_token_accuracy": 0.473090390351446, "step": 6399 }, { "epoch": 1.1865035224323321, "grad_norm": 8.734375, "learning_rate": 8.813496477567669e-06, "loss": 2.7637, "mean_token_accuracy": 0.47411444141689374, "step": 6400 }, { "epoch": 1.1866889136077123, "grad_norm": 7.51953125, "learning_rate": 8.81331108639229e-06, "loss": 3.0461, "mean_token_accuracy": 0.4305811059409155, "step": 6401 }, { "epoch": 1.1868743047830923, "grad_norm": 6.80859375, "learning_rate": 8.813125695216908e-06, "loss": 2.6728, "mean_token_accuracy": 0.47608083908928117, "step": 6402 }, { "epoch": 1.1870596959584723, "grad_norm": 7.68359375, "learning_rate": 8.812940304041529e-06, "loss": 3.5118, "mean_token_accuracy": 0.40063974410235903, "step": 6403 }, { "epoch": 1.1872450871338525, "grad_norm": 7.6640625, "learning_rate": 8.812754912866148e-06, "loss": 2.8287, "mean_token_accuracy": 0.45515232431120284, "step": 6404 }, { "epoch": 1.1874304783092324, "grad_norm": 5.87890625, "learning_rate": 8.812569521690768e-06, "loss": 3.2195, "mean_token_accuracy": 0.4374731413837559, "step": 6405 }, { "epoch": 1.1876158694846126, "grad_norm": 7.81640625, "learning_rate": 8.812384130515389e-06, "loss": 2.9887, "mean_token_accuracy": 0.4493107104984093, "step": 6406 }, { "epoch": 1.1878012606599926, "grad_norm": 8.2421875, "learning_rate": 8.812198739340007e-06, "loss": 2.8959, "mean_token_accuracy": 0.4631480324797002, "step": 6407 }, { "epoch": 1.1879866518353726, "grad_norm": 15.234375, "learning_rate": 8.812013348164628e-06, "loss": 2.7402, "mean_token_accuracy": 0.444, "step": 6408 }, { "epoch": 1.1881720430107527, "grad_norm": 8.7578125, "learning_rate": 8.811827956989248e-06, "loss": 2.6256, "mean_token_accuracy": 0.46747737556561086, "step": 6409 }, { "epoch": 1.1883574341861327, "grad_norm": 6.53515625, "learning_rate": 8.811642565813869e-06, "loss": 3.052, "mean_token_accuracy": 0.43136599230897843, "step": 6410 }, { "epoch": 1.188542825361513, "grad_norm": 8.3515625, "learning_rate": 8.811457174638488e-06, "loss": 2.3425, "mean_token_accuracy": 0.5219915987150976, "step": 6411 }, { "epoch": 1.1887282165368929, "grad_norm": 6.69140625, "learning_rate": 8.811271783463108e-06, "loss": 2.6473, "mean_token_accuracy": 0.49348154247289694, "step": 6412 }, { "epoch": 1.1889136077122728, "grad_norm": 6.38671875, "learning_rate": 8.811086392287727e-06, "loss": 2.4697, "mean_token_accuracy": 0.4959971322738678, "step": 6413 }, { "epoch": 1.189098998887653, "grad_norm": 8.2109375, "learning_rate": 8.810901001112347e-06, "loss": 3.0116, "mean_token_accuracy": 0.4202618883528601, "step": 6414 }, { "epoch": 1.189284390063033, "grad_norm": 8.1484375, "learning_rate": 8.810715609936968e-06, "loss": 2.9244, "mean_token_accuracy": 0.4568576182552888, "step": 6415 }, { "epoch": 1.189469781238413, "grad_norm": 7.15625, "learning_rate": 8.810530218761588e-06, "loss": 2.8936, "mean_token_accuracy": 0.4493371212121212, "step": 6416 }, { "epoch": 1.1896551724137931, "grad_norm": 6.203125, "learning_rate": 8.810344827586207e-06, "loss": 2.2097, "mean_token_accuracy": 0.5103169251517194, "step": 6417 }, { "epoch": 1.1898405635891731, "grad_norm": 6.28515625, "learning_rate": 8.810159436410828e-06, "loss": 2.9877, "mean_token_accuracy": 0.45045649838882923, "step": 6418 }, { "epoch": 1.190025954764553, "grad_norm": 5.828125, "learning_rate": 8.809974045235448e-06, "loss": 2.8231, "mean_token_accuracy": 0.4565071556350626, "step": 6419 }, { "epoch": 1.1902113459399333, "grad_norm": 7.30859375, "learning_rate": 8.809788654060067e-06, "loss": 2.728, "mean_token_accuracy": 0.46048020765736536, "step": 6420 }, { "epoch": 1.1903967371153132, "grad_norm": 6.52734375, "learning_rate": 8.809603262884687e-06, "loss": 2.771, "mean_token_accuracy": 0.4693314955203308, "step": 6421 }, { "epoch": 1.1905821282906934, "grad_norm": 9.3046875, "learning_rate": 8.809417871709306e-06, "loss": 2.8314, "mean_token_accuracy": 0.4578091439091746, "step": 6422 }, { "epoch": 1.1907675194660734, "grad_norm": 5.96875, "learning_rate": 8.809232480533927e-06, "loss": 3.0668, "mean_token_accuracy": 0.44035437983528825, "step": 6423 }, { "epoch": 1.1909529106414536, "grad_norm": 6.14453125, "learning_rate": 8.809047089358547e-06, "loss": 2.6839, "mean_token_accuracy": 0.4656220451168445, "step": 6424 }, { "epoch": 1.1911383018168336, "grad_norm": 7.015625, "learning_rate": 8.808861698183168e-06, "loss": 3.0735, "mean_token_accuracy": 0.46038573608991257, "step": 6425 }, { "epoch": 1.1913236929922135, "grad_norm": 6.18359375, "learning_rate": 8.808676307007788e-06, "loss": 2.7196, "mean_token_accuracy": 0.45421483309520716, "step": 6426 }, { "epoch": 1.1915090841675937, "grad_norm": 6.640625, "learning_rate": 8.808490915832407e-06, "loss": 3.2395, "mean_token_accuracy": 0.4069446462298416, "step": 6427 }, { "epoch": 1.1916944753429737, "grad_norm": 6.578125, "learning_rate": 8.808305524657028e-06, "loss": 2.5925, "mean_token_accuracy": 0.4931056095267941, "step": 6428 }, { "epoch": 1.1918798665183536, "grad_norm": 6.17578125, "learning_rate": 8.808120133481646e-06, "loss": 2.5755, "mean_token_accuracy": 0.48601186871745183, "step": 6429 }, { "epoch": 1.1920652576937338, "grad_norm": 6.6484375, "learning_rate": 8.807934742306267e-06, "loss": 2.7582, "mean_token_accuracy": 0.4620494391810955, "step": 6430 }, { "epoch": 1.1922506488691138, "grad_norm": 7.69140625, "learning_rate": 8.807749351130887e-06, "loss": 2.8932, "mean_token_accuracy": 0.4397296698426767, "step": 6431 }, { "epoch": 1.1924360400444938, "grad_norm": 6.3359375, "learning_rate": 8.807563959955508e-06, "loss": 2.5842, "mean_token_accuracy": 0.48234683281412255, "step": 6432 }, { "epoch": 1.192621431219874, "grad_norm": 5.84375, "learning_rate": 8.807378568780127e-06, "loss": 2.6112, "mean_token_accuracy": 0.4844789356984479, "step": 6433 }, { "epoch": 1.192806822395254, "grad_norm": 7.33203125, "learning_rate": 8.807193177604747e-06, "loss": 3.2604, "mean_token_accuracy": 0.42578042047143766, "step": 6434 }, { "epoch": 1.1929922135706341, "grad_norm": 6.6484375, "learning_rate": 8.807007786429368e-06, "loss": 2.9666, "mean_token_accuracy": 0.4721104708056975, "step": 6435 }, { "epoch": 1.193177604746014, "grad_norm": 8.4453125, "learning_rate": 8.806822395253986e-06, "loss": 2.792, "mean_token_accuracy": 0.46206896551724136, "step": 6436 }, { "epoch": 1.193362995921394, "grad_norm": 6.0703125, "learning_rate": 8.806637004078607e-06, "loss": 3.0656, "mean_token_accuracy": 0.4203590093160645, "step": 6437 }, { "epoch": 1.1935483870967742, "grad_norm": 5.36328125, "learning_rate": 8.806451612903226e-06, "loss": 2.991, "mean_token_accuracy": 0.4492753623188406, "step": 6438 }, { "epoch": 1.1937337782721542, "grad_norm": 6.82421875, "learning_rate": 8.806266221727846e-06, "loss": 2.687, "mean_token_accuracy": 0.463241322765783, "step": 6439 }, { "epoch": 1.1939191694475344, "grad_norm": 7.21484375, "learning_rate": 8.806080830552467e-06, "loss": 3.1144, "mean_token_accuracy": 0.43771863117870724, "step": 6440 }, { "epoch": 1.1941045606229144, "grad_norm": 8.2109375, "learning_rate": 8.805895439377087e-06, "loss": 3.225, "mean_token_accuracy": 0.4190995907230559, "step": 6441 }, { "epoch": 1.1942899517982943, "grad_norm": 6.39453125, "learning_rate": 8.805710048201706e-06, "loss": 3.0193, "mean_token_accuracy": 0.42626904944633265, "step": 6442 }, { "epoch": 1.1944753429736745, "grad_norm": 6.15234375, "learning_rate": 8.805524657026326e-06, "loss": 3.2261, "mean_token_accuracy": 0.4257966616084977, "step": 6443 }, { "epoch": 1.1946607341490545, "grad_norm": 5.40625, "learning_rate": 8.805339265850947e-06, "loss": 2.6869, "mean_token_accuracy": 0.45699414443721537, "step": 6444 }, { "epoch": 1.1948461253244345, "grad_norm": 7.13671875, "learning_rate": 8.805153874675566e-06, "loss": 2.7952, "mean_token_accuracy": 0.4552152557064432, "step": 6445 }, { "epoch": 1.1950315164998146, "grad_norm": 6.67578125, "learning_rate": 8.804968483500186e-06, "loss": 2.7549, "mean_token_accuracy": 0.46132468735525706, "step": 6446 }, { "epoch": 1.1952169076751946, "grad_norm": 6.2265625, "learning_rate": 8.804783092324805e-06, "loss": 2.8782, "mean_token_accuracy": 0.4632373497344143, "step": 6447 }, { "epoch": 1.1954022988505748, "grad_norm": 5.8125, "learning_rate": 8.804597701149425e-06, "loss": 2.7742, "mean_token_accuracy": 0.4583876858857292, "step": 6448 }, { "epoch": 1.1955876900259548, "grad_norm": 6.89453125, "learning_rate": 8.804412309974046e-06, "loss": 3.2342, "mean_token_accuracy": 0.4338526211671612, "step": 6449 }, { "epoch": 1.1957730812013347, "grad_norm": 6.65625, "learning_rate": 8.804226918798666e-06, "loss": 3.0143, "mean_token_accuracy": 0.43004587155963303, "step": 6450 }, { "epoch": 1.195958472376715, "grad_norm": 6.6328125, "learning_rate": 8.804041527623285e-06, "loss": 2.475, "mean_token_accuracy": 0.49002849002849, "step": 6451 }, { "epoch": 1.196143863552095, "grad_norm": 8.1953125, "learning_rate": 8.803856136447906e-06, "loss": 2.6906, "mean_token_accuracy": 0.4831537307325058, "step": 6452 }, { "epoch": 1.196329254727475, "grad_norm": 8.984375, "learning_rate": 8.803670745272526e-06, "loss": 2.6896, "mean_token_accuracy": 0.48764769065520946, "step": 6453 }, { "epoch": 1.196514645902855, "grad_norm": 8.3125, "learning_rate": 8.803485354097145e-06, "loss": 2.6573, "mean_token_accuracy": 0.45057624113475175, "step": 6454 }, { "epoch": 1.196700037078235, "grad_norm": 8.203125, "learning_rate": 8.803299962921766e-06, "loss": 3.2449, "mean_token_accuracy": 0.4351993332407279, "step": 6455 }, { "epoch": 1.1968854282536152, "grad_norm": 9.0390625, "learning_rate": 8.803114571746384e-06, "loss": 2.7258, "mean_token_accuracy": 0.4681406417933364, "step": 6456 }, { "epoch": 1.1970708194289952, "grad_norm": 7.48046875, "learning_rate": 8.802929180571007e-06, "loss": 2.6919, "mean_token_accuracy": 0.4594152301985599, "step": 6457 }, { "epoch": 1.1972562106043751, "grad_norm": 6.9765625, "learning_rate": 8.802743789395625e-06, "loss": 2.7389, "mean_token_accuracy": 0.48030552643402724, "step": 6458 }, { "epoch": 1.1974416017797553, "grad_norm": 9.7421875, "learning_rate": 8.802558398220246e-06, "loss": 3.2004, "mean_token_accuracy": 0.42887563884156726, "step": 6459 }, { "epoch": 1.1976269929551353, "grad_norm": 13.609375, "learning_rate": 8.802373007044865e-06, "loss": 2.6192, "mean_token_accuracy": 0.45671299247736835, "step": 6460 }, { "epoch": 1.1978123841305153, "grad_norm": 8.6640625, "learning_rate": 8.802187615869485e-06, "loss": 2.9635, "mean_token_accuracy": 0.4364450927276516, "step": 6461 }, { "epoch": 1.1979977753058955, "grad_norm": 7.80859375, "learning_rate": 8.802002224694106e-06, "loss": 2.0386, "mean_token_accuracy": 0.5340122731427079, "step": 6462 }, { "epoch": 1.1981831664812754, "grad_norm": 6.53125, "learning_rate": 8.801816833518724e-06, "loss": 2.875, "mean_token_accuracy": 0.4582284275577935, "step": 6463 }, { "epoch": 1.1983685576566556, "grad_norm": 8.8515625, "learning_rate": 8.801631442343345e-06, "loss": 2.8027, "mean_token_accuracy": 0.46124445803393976, "step": 6464 }, { "epoch": 1.1985539488320356, "grad_norm": 5.93359375, "learning_rate": 8.801446051167965e-06, "loss": 2.4499, "mean_token_accuracy": 0.4884163003445408, "step": 6465 }, { "epoch": 1.1987393400074156, "grad_norm": 10.171875, "learning_rate": 8.801260659992586e-06, "loss": 2.502, "mean_token_accuracy": 0.4791789548189218, "step": 6466 }, { "epoch": 1.1989247311827957, "grad_norm": 9.6796875, "learning_rate": 8.801075268817205e-06, "loss": 3.0017, "mean_token_accuracy": 0.45891022778025903, "step": 6467 }, { "epoch": 1.1991101223581757, "grad_norm": 5.796875, "learning_rate": 8.800889877641825e-06, "loss": 2.7854, "mean_token_accuracy": 0.4601137716629184, "step": 6468 }, { "epoch": 1.199295513533556, "grad_norm": 4.94140625, "learning_rate": 8.800704486466446e-06, "loss": 2.6037, "mean_token_accuracy": 0.48540706605222733, "step": 6469 }, { "epoch": 1.1994809047089359, "grad_norm": 5.17578125, "learning_rate": 8.800519095291064e-06, "loss": 2.9095, "mean_token_accuracy": 0.45133772780147347, "step": 6470 }, { "epoch": 1.1996662958843158, "grad_norm": 6.0625, "learning_rate": 8.800333704115685e-06, "loss": 2.7478, "mean_token_accuracy": 0.4475543854787308, "step": 6471 }, { "epoch": 1.199851687059696, "grad_norm": 6.75, "learning_rate": 8.800148312940304e-06, "loss": 3.0276, "mean_token_accuracy": 0.44599056603773585, "step": 6472 }, { "epoch": 1.200037078235076, "grad_norm": 6.55078125, "learning_rate": 8.799962921764926e-06, "loss": 2.2506, "mean_token_accuracy": 0.5217576187101347, "step": 6473 }, { "epoch": 1.200222469410456, "grad_norm": 7.3125, "learning_rate": 8.799777530589545e-06, "loss": 2.8597, "mean_token_accuracy": 0.4486624203821656, "step": 6474 }, { "epoch": 1.2004078605858362, "grad_norm": 7.53125, "learning_rate": 8.799592139414165e-06, "loss": 2.7053, "mean_token_accuracy": 0.4638033495407888, "step": 6475 }, { "epoch": 1.2005932517612161, "grad_norm": 7.62109375, "learning_rate": 8.799406748238784e-06, "loss": 2.8359, "mean_token_accuracy": 0.4814912050406931, "step": 6476 }, { "epoch": 1.2007786429365963, "grad_norm": 8.3828125, "learning_rate": 8.799221357063404e-06, "loss": 2.6657, "mean_token_accuracy": 0.48215928841631733, "step": 6477 }, { "epoch": 1.2009640341119763, "grad_norm": 5.515625, "learning_rate": 8.799035965888025e-06, "loss": 3.0571, "mean_token_accuracy": 0.4348221388794181, "step": 6478 }, { "epoch": 1.2011494252873562, "grad_norm": 7.25, "learning_rate": 8.798850574712644e-06, "loss": 3.0542, "mean_token_accuracy": 0.44322508398656213, "step": 6479 }, { "epoch": 1.2013348164627364, "grad_norm": 6.1171875, "learning_rate": 8.798665183537264e-06, "loss": 3.3947, "mean_token_accuracy": 0.4144047619047619, "step": 6480 }, { "epoch": 1.2015202076381164, "grad_norm": 6.8125, "learning_rate": 8.798479792361885e-06, "loss": 2.9937, "mean_token_accuracy": 0.4382205513784461, "step": 6481 }, { "epoch": 1.2017055988134966, "grad_norm": 6.55859375, "learning_rate": 8.798294401186505e-06, "loss": 2.5797, "mean_token_accuracy": 0.47920665387076133, "step": 6482 }, { "epoch": 1.2018909899888766, "grad_norm": 7.2109375, "learning_rate": 8.798109010011124e-06, "loss": 3.0179, "mean_token_accuracy": 0.4571956769055745, "step": 6483 }, { "epoch": 1.2020763811642565, "grad_norm": 7.125, "learning_rate": 8.797923618835745e-06, "loss": 1.8922, "mean_token_accuracy": 0.5804347826086956, "step": 6484 }, { "epoch": 1.2022617723396367, "grad_norm": 7.86328125, "learning_rate": 8.797738227660363e-06, "loss": 2.7083, "mean_token_accuracy": 0.4742751801575331, "step": 6485 }, { "epoch": 1.2024471635150167, "grad_norm": 8.90625, "learning_rate": 8.797552836484984e-06, "loss": 2.5284, "mean_token_accuracy": 0.47640086206896554, "step": 6486 }, { "epoch": 1.2026325546903966, "grad_norm": 9.953125, "learning_rate": 8.797367445309604e-06, "loss": 2.6027, "mean_token_accuracy": 0.4840656687590536, "step": 6487 }, { "epoch": 1.2028179458657768, "grad_norm": 8.109375, "learning_rate": 8.797182054134223e-06, "loss": 2.4776, "mean_token_accuracy": 0.5026155652823743, "step": 6488 }, { "epoch": 1.2030033370411568, "grad_norm": 6.625, "learning_rate": 8.796996662958844e-06, "loss": 2.5836, "mean_token_accuracy": 0.46923076923076923, "step": 6489 }, { "epoch": 1.2031887282165368, "grad_norm": 7.3984375, "learning_rate": 8.796811271783464e-06, "loss": 2.493, "mean_token_accuracy": 0.5124506268246608, "step": 6490 }, { "epoch": 1.203374119391917, "grad_norm": 6.41796875, "learning_rate": 8.796625880608085e-06, "loss": 3.4614, "mean_token_accuracy": 0.43246509129967775, "step": 6491 }, { "epoch": 1.203559510567297, "grad_norm": 6.09765625, "learning_rate": 8.796440489432703e-06, "loss": 3.0401, "mean_token_accuracy": 0.42487266553480474, "step": 6492 }, { "epoch": 1.2037449017426771, "grad_norm": 7.421875, "learning_rate": 8.796255098257324e-06, "loss": 2.9117, "mean_token_accuracy": 0.43437945791726107, "step": 6493 }, { "epoch": 1.203930292918057, "grad_norm": 6.47265625, "learning_rate": 8.796069707081943e-06, "loss": 2.4475, "mean_token_accuracy": 0.501031177969186, "step": 6494 }, { "epoch": 1.204115684093437, "grad_norm": 6.48828125, "learning_rate": 8.795884315906563e-06, "loss": 2.3678, "mean_token_accuracy": 0.515285084601166, "step": 6495 }, { "epoch": 1.2043010752688172, "grad_norm": 5.9609375, "learning_rate": 8.795698924731184e-06, "loss": 2.7263, "mean_token_accuracy": 0.46972318339100344, "step": 6496 }, { "epoch": 1.2044864664441972, "grad_norm": 4.91015625, "learning_rate": 8.795513533555804e-06, "loss": 2.8675, "mean_token_accuracy": 0.4575106223444139, "step": 6497 }, { "epoch": 1.2046718576195774, "grad_norm": 5.87109375, "learning_rate": 8.795328142380423e-06, "loss": 2.4637, "mean_token_accuracy": 0.49696519261736655, "step": 6498 }, { "epoch": 1.2048572487949574, "grad_norm": 6.04296875, "learning_rate": 8.795142751205043e-06, "loss": 2.7673, "mean_token_accuracy": 0.4739399829911311, "step": 6499 }, { "epoch": 1.2050426399703373, "grad_norm": 5.25, "learning_rate": 8.794957360029664e-06, "loss": 2.9244, "mean_token_accuracy": 0.4391359863800809, "step": 6500 }, { "epoch": 1.2052280311457175, "grad_norm": 5.25390625, "learning_rate": 8.794771968854283e-06, "loss": 2.013, "mean_token_accuracy": 0.557653922923384, "step": 6501 }, { "epoch": 1.2054134223210975, "grad_norm": 8.1640625, "learning_rate": 8.794586577678903e-06, "loss": 2.8808, "mean_token_accuracy": 0.46299702844205465, "step": 6502 }, { "epoch": 1.2055988134964775, "grad_norm": 8.1875, "learning_rate": 8.794401186503522e-06, "loss": 2.9953, "mean_token_accuracy": 0.4427980702963473, "step": 6503 }, { "epoch": 1.2057842046718577, "grad_norm": 9.75, "learning_rate": 8.794215795328142e-06, "loss": 2.6529, "mean_token_accuracy": 0.463200200954534, "step": 6504 }, { "epoch": 1.2059695958472376, "grad_norm": 7.76953125, "learning_rate": 8.794030404152763e-06, "loss": 2.3556, "mean_token_accuracy": 0.5079012345679013, "step": 6505 }, { "epoch": 1.2061549870226178, "grad_norm": 6.609375, "learning_rate": 8.793845012977383e-06, "loss": 3.0243, "mean_token_accuracy": 0.4335307893649263, "step": 6506 }, { "epoch": 1.2063403781979978, "grad_norm": 8.75, "learning_rate": 8.793659621802004e-06, "loss": 2.5779, "mean_token_accuracy": 0.4926173028546429, "step": 6507 }, { "epoch": 1.2065257693733777, "grad_norm": 18.421875, "learning_rate": 8.793474230626623e-06, "loss": 1.7085, "mean_token_accuracy": 0.5883407126291377, "step": 6508 }, { "epoch": 1.206711160548758, "grad_norm": 6.37890625, "learning_rate": 8.793288839451243e-06, "loss": 2.8361, "mean_token_accuracy": 0.44144144144144143, "step": 6509 }, { "epoch": 1.206896551724138, "grad_norm": 5.95703125, "learning_rate": 8.793103448275862e-06, "loss": 2.3865, "mean_token_accuracy": 0.48699103336763194, "step": 6510 }, { "epoch": 1.207081942899518, "grad_norm": 6.34375, "learning_rate": 8.792918057100483e-06, "loss": 2.7577, "mean_token_accuracy": 0.47492802303262954, "step": 6511 }, { "epoch": 1.207267334074898, "grad_norm": 8.515625, "learning_rate": 8.792732665925103e-06, "loss": 2.3418, "mean_token_accuracy": 0.500956937799043, "step": 6512 }, { "epoch": 1.207452725250278, "grad_norm": 7.57421875, "learning_rate": 8.792547274749724e-06, "loss": 2.484, "mean_token_accuracy": 0.5129479466387654, "step": 6513 }, { "epoch": 1.2076381164256582, "grad_norm": 11.25, "learning_rate": 8.792361883574342e-06, "loss": 2.1724, "mean_token_accuracy": 0.5210348706411698, "step": 6514 }, { "epoch": 1.2078235076010382, "grad_norm": 9.5859375, "learning_rate": 8.792176492398963e-06, "loss": 2.6263, "mean_token_accuracy": 0.4723419540229885, "step": 6515 }, { "epoch": 1.2080088987764181, "grad_norm": 13.28125, "learning_rate": 8.791991101223583e-06, "loss": 2.5701, "mean_token_accuracy": 0.48403783624002367, "step": 6516 }, { "epoch": 1.2081942899517983, "grad_norm": 14.65625, "learning_rate": 8.791805710048202e-06, "loss": 2.1428, "mean_token_accuracy": 0.5228815690218758, "step": 6517 }, { "epoch": 1.2083796811271783, "grad_norm": 14.6796875, "learning_rate": 8.791620318872823e-06, "loss": 3.1227, "mean_token_accuracy": 0.4292944328669247, "step": 6518 }, { "epoch": 1.2085650723025583, "grad_norm": 15.28125, "learning_rate": 8.791434927697441e-06, "loss": 2.6561, "mean_token_accuracy": 0.46710287168302433, "step": 6519 }, { "epoch": 1.2087504634779385, "grad_norm": 13.6015625, "learning_rate": 8.791249536522062e-06, "loss": 3.0709, "mean_token_accuracy": 0.4283391695847924, "step": 6520 }, { "epoch": 1.2089358546533184, "grad_norm": 8.703125, "learning_rate": 8.791064145346682e-06, "loss": 2.9144, "mean_token_accuracy": 0.46511627906976744, "step": 6521 }, { "epoch": 1.2091212458286986, "grad_norm": 7.67578125, "learning_rate": 8.790878754171303e-06, "loss": 3.4172, "mean_token_accuracy": 0.41921470342522976, "step": 6522 }, { "epoch": 1.2093066370040786, "grad_norm": 13.2109375, "learning_rate": 8.790693362995922e-06, "loss": 2.5537, "mean_token_accuracy": 0.4745624270711785, "step": 6523 }, { "epoch": 1.2094920281794588, "grad_norm": 14.296875, "learning_rate": 8.790507971820542e-06, "loss": 2.7245, "mean_token_accuracy": 0.4663027503674155, "step": 6524 }, { "epoch": 1.2096774193548387, "grad_norm": 6.84375, "learning_rate": 8.790322580645163e-06, "loss": 2.4258, "mean_token_accuracy": 0.4893456058504602, "step": 6525 }, { "epoch": 1.2098628105302187, "grad_norm": 5.98828125, "learning_rate": 8.790137189469781e-06, "loss": 2.5397, "mean_token_accuracy": 0.4941347040991169, "step": 6526 }, { "epoch": 1.210048201705599, "grad_norm": 7.453125, "learning_rate": 8.789951798294402e-06, "loss": 2.632, "mean_token_accuracy": 0.4711797890605838, "step": 6527 }, { "epoch": 1.2102335928809789, "grad_norm": 7.09375, "learning_rate": 8.78976640711902e-06, "loss": 2.8446, "mean_token_accuracy": 0.4395064549297384, "step": 6528 }, { "epoch": 1.2104189840563588, "grad_norm": 6.49609375, "learning_rate": 8.789581015943643e-06, "loss": 2.6643, "mean_token_accuracy": 0.47111442415206856, "step": 6529 }, { "epoch": 1.210604375231739, "grad_norm": 7.29296875, "learning_rate": 8.789395624768262e-06, "loss": 2.5416, "mean_token_accuracy": 0.4812166381307282, "step": 6530 }, { "epoch": 1.210789766407119, "grad_norm": 10.921875, "learning_rate": 8.789210233592882e-06, "loss": 2.7768, "mean_token_accuracy": 0.4612835930789054, "step": 6531 }, { "epoch": 1.210975157582499, "grad_norm": 7.7421875, "learning_rate": 8.789024842417501e-06, "loss": 2.8109, "mean_token_accuracy": 0.46142208774583965, "step": 6532 }, { "epoch": 1.2111605487578792, "grad_norm": 6.67578125, "learning_rate": 8.788839451242121e-06, "loss": 2.4654, "mean_token_accuracy": 0.5169617515880525, "step": 6533 }, { "epoch": 1.2113459399332591, "grad_norm": 8.1484375, "learning_rate": 8.788654060066742e-06, "loss": 2.5383, "mean_token_accuracy": 0.4841102867899457, "step": 6534 }, { "epoch": 1.2115313311086393, "grad_norm": 8.484375, "learning_rate": 8.78846866889136e-06, "loss": 2.5999, "mean_token_accuracy": 0.4870810055865922, "step": 6535 }, { "epoch": 1.2117167222840193, "grad_norm": 7.14453125, "learning_rate": 8.788283277715981e-06, "loss": 3.0674, "mean_token_accuracy": 0.45516476552598223, "step": 6536 }, { "epoch": 1.2119021134593992, "grad_norm": 6.7265625, "learning_rate": 8.788097886540602e-06, "loss": 3.1784, "mean_token_accuracy": 0.4386917540093446, "step": 6537 }, { "epoch": 1.2120875046347794, "grad_norm": 10.015625, "learning_rate": 8.787912495365222e-06, "loss": 2.5552, "mean_token_accuracy": 0.4789989755597834, "step": 6538 }, { "epoch": 1.2122728958101594, "grad_norm": 10.34375, "learning_rate": 8.787727104189841e-06, "loss": 2.2256, "mean_token_accuracy": 0.5405751969225133, "step": 6539 }, { "epoch": 1.2124582869855396, "grad_norm": 5.76953125, "learning_rate": 8.787541713014462e-06, "loss": 2.6028, "mean_token_accuracy": 0.47942360119433985, "step": 6540 }, { "epoch": 1.2126436781609196, "grad_norm": 6.6640625, "learning_rate": 8.78735632183908e-06, "loss": 2.9211, "mean_token_accuracy": 0.43910961563349293, "step": 6541 }, { "epoch": 1.2128290693362995, "grad_norm": 7.80859375, "learning_rate": 8.7871709306637e-06, "loss": 2.3351, "mean_token_accuracy": 0.506635004888951, "step": 6542 }, { "epoch": 1.2130144605116797, "grad_norm": 5.37890625, "learning_rate": 8.786985539488321e-06, "loss": 2.3846, "mean_token_accuracy": 0.49985775248933145, "step": 6543 }, { "epoch": 1.2131998516870597, "grad_norm": 5.72265625, "learning_rate": 8.78680014831294e-06, "loss": 2.6279, "mean_token_accuracy": 0.46471115665428464, "step": 6544 }, { "epoch": 1.2133852428624397, "grad_norm": 8.703125, "learning_rate": 8.786614757137562e-06, "loss": 2.6802, "mean_token_accuracy": 0.4800498753117207, "step": 6545 }, { "epoch": 1.2135706340378198, "grad_norm": 7.1015625, "learning_rate": 8.786429365962181e-06, "loss": 2.8154, "mean_token_accuracy": 0.47454431175361406, "step": 6546 }, { "epoch": 1.2137560252131998, "grad_norm": 7.8359375, "learning_rate": 8.786243974786802e-06, "loss": 2.34, "mean_token_accuracy": 0.5004863813229572, "step": 6547 }, { "epoch": 1.21394141638858, "grad_norm": 5.8125, "learning_rate": 8.78605858361142e-06, "loss": 2.5902, "mean_token_accuracy": 0.47715617715617714, "step": 6548 }, { "epoch": 1.21412680756396, "grad_norm": 6.76953125, "learning_rate": 8.785873192436041e-06, "loss": 2.7777, "mean_token_accuracy": 0.4603174603174603, "step": 6549 }, { "epoch": 1.21431219873934, "grad_norm": 6.05859375, "learning_rate": 8.785687801260661e-06, "loss": 3.1634, "mean_token_accuracy": 0.44045009498757853, "step": 6550 }, { "epoch": 1.2144975899147201, "grad_norm": 6.78515625, "learning_rate": 8.78550241008528e-06, "loss": 3.6676, "mean_token_accuracy": 0.4106062556313554, "step": 6551 }, { "epoch": 1.2146829810901, "grad_norm": 5.5703125, "learning_rate": 8.7853170189099e-06, "loss": 2.8238, "mean_token_accuracy": 0.44954240390482003, "step": 6552 }, { "epoch": 1.2148683722654803, "grad_norm": 7.015625, "learning_rate": 8.785131627734521e-06, "loss": 2.7361, "mean_token_accuracy": 0.46141439205955337, "step": 6553 }, { "epoch": 1.2150537634408602, "grad_norm": 5.8671875, "learning_rate": 8.784946236559142e-06, "loss": 3.0113, "mean_token_accuracy": 0.44121753700083777, "step": 6554 }, { "epoch": 1.2152391546162402, "grad_norm": 6.5078125, "learning_rate": 8.78476084538376e-06, "loss": 2.3536, "mean_token_accuracy": 0.5021189161422884, "step": 6555 }, { "epoch": 1.2154245457916204, "grad_norm": 5.73828125, "learning_rate": 8.784575454208381e-06, "loss": 3.0559, "mean_token_accuracy": 0.4519927536231884, "step": 6556 }, { "epoch": 1.2156099369670004, "grad_norm": 5.76171875, "learning_rate": 8.784390063033e-06, "loss": 3.085, "mean_token_accuracy": 0.43568665377176014, "step": 6557 }, { "epoch": 1.2157953281423803, "grad_norm": 5.515625, "learning_rate": 8.78420467185762e-06, "loss": 3.0064, "mean_token_accuracy": 0.43690426854537917, "step": 6558 }, { "epoch": 1.2159807193177605, "grad_norm": 7.015625, "learning_rate": 8.78401928068224e-06, "loss": 2.3874, "mean_token_accuracy": 0.5045945325063175, "step": 6559 }, { "epoch": 1.2161661104931405, "grad_norm": 6.43359375, "learning_rate": 8.78383388950686e-06, "loss": 3.0243, "mean_token_accuracy": 0.4625298329355609, "step": 6560 }, { "epoch": 1.2163515016685205, "grad_norm": 9.890625, "learning_rate": 8.78364849833148e-06, "loss": 2.6999, "mean_token_accuracy": 0.4770965468639887, "step": 6561 }, { "epoch": 1.2165368928439007, "grad_norm": 6.95703125, "learning_rate": 8.7834631071561e-06, "loss": 2.8364, "mean_token_accuracy": 0.46932814021421615, "step": 6562 }, { "epoch": 1.2167222840192806, "grad_norm": 8.125, "learning_rate": 8.783277715980721e-06, "loss": 2.5424, "mean_token_accuracy": 0.48943929493997873, "step": 6563 }, { "epoch": 1.2169076751946608, "grad_norm": 6.29296875, "learning_rate": 8.78309232480534e-06, "loss": 2.7572, "mean_token_accuracy": 0.4604563579973416, "step": 6564 }, { "epoch": 1.2170930663700408, "grad_norm": 5.6328125, "learning_rate": 8.78290693362996e-06, "loss": 2.7735, "mean_token_accuracy": 0.4751958224543081, "step": 6565 }, { "epoch": 1.2172784575454207, "grad_norm": 8.4140625, "learning_rate": 8.782721542454579e-06, "loss": 2.5586, "mean_token_accuracy": 0.47953216374269003, "step": 6566 }, { "epoch": 1.217463848720801, "grad_norm": 5.46875, "learning_rate": 8.7825361512792e-06, "loss": 2.6858, "mean_token_accuracy": 0.4750920245398773, "step": 6567 }, { "epoch": 1.217649239896181, "grad_norm": 8.1875, "learning_rate": 8.78235076010382e-06, "loss": 2.9169, "mean_token_accuracy": 0.454884246188594, "step": 6568 }, { "epoch": 1.217834631071561, "grad_norm": 7.859375, "learning_rate": 8.782165368928439e-06, "loss": 2.6228, "mean_token_accuracy": 0.4932895488292404, "step": 6569 }, { "epoch": 1.218020022246941, "grad_norm": 6.8515625, "learning_rate": 8.78197997775306e-06, "loss": 2.7818, "mean_token_accuracy": 0.45407725321888415, "step": 6570 }, { "epoch": 1.218205413422321, "grad_norm": 8.15625, "learning_rate": 8.78179458657768e-06, "loss": 2.3944, "mean_token_accuracy": 0.4964326484018265, "step": 6571 }, { "epoch": 1.2183908045977012, "grad_norm": 7.73828125, "learning_rate": 8.7816091954023e-06, "loss": 2.7251, "mean_token_accuracy": 0.4636610959968908, "step": 6572 }, { "epoch": 1.2185761957730812, "grad_norm": 8.4140625, "learning_rate": 8.781423804226919e-06, "loss": 3.0118, "mean_token_accuracy": 0.45545545545545546, "step": 6573 }, { "epoch": 1.2187615869484612, "grad_norm": 6.83203125, "learning_rate": 8.78123841305154e-06, "loss": 2.6366, "mean_token_accuracy": 0.49918454397189815, "step": 6574 }, { "epoch": 1.2189469781238413, "grad_norm": 8.0234375, "learning_rate": 8.781053021876158e-06, "loss": 2.7346, "mean_token_accuracy": 0.4712581344902386, "step": 6575 }, { "epoch": 1.2191323692992213, "grad_norm": 6.453125, "learning_rate": 8.780867630700779e-06, "loss": 2.9366, "mean_token_accuracy": 0.44063981042654027, "step": 6576 }, { "epoch": 1.2193177604746015, "grad_norm": 6.76953125, "learning_rate": 8.7806822395254e-06, "loss": 2.4907, "mean_token_accuracy": 0.5099505810070789, "step": 6577 }, { "epoch": 1.2195031516499815, "grad_norm": 7.81640625, "learning_rate": 8.78049684835002e-06, "loss": 2.5438, "mean_token_accuracy": 0.4837504246404711, "step": 6578 }, { "epoch": 1.2196885428253614, "grad_norm": 7.59765625, "learning_rate": 8.780311457174639e-06, "loss": 2.8274, "mean_token_accuracy": 0.4489819662594532, "step": 6579 }, { "epoch": 1.2198739340007416, "grad_norm": 6.8125, "learning_rate": 8.78012606599926e-06, "loss": 2.1961, "mean_token_accuracy": 0.5433338254835376, "step": 6580 }, { "epoch": 1.2200593251761216, "grad_norm": 6.00390625, "learning_rate": 8.77994067482388e-06, "loss": 3.028, "mean_token_accuracy": 0.430343386865126, "step": 6581 }, { "epoch": 1.2202447163515018, "grad_norm": 7.86328125, "learning_rate": 8.779755283648498e-06, "loss": 3.389, "mean_token_accuracy": 0.3994683421942968, "step": 6582 }, { "epoch": 1.2204301075268817, "grad_norm": 6.68359375, "learning_rate": 8.779569892473119e-06, "loss": 2.8746, "mean_token_accuracy": 0.45725699067909453, "step": 6583 }, { "epoch": 1.2206154987022617, "grad_norm": 4.875, "learning_rate": 8.779384501297738e-06, "loss": 2.5537, "mean_token_accuracy": 0.4982276119402985, "step": 6584 }, { "epoch": 1.220800889877642, "grad_norm": 6.70703125, "learning_rate": 8.779199110122358e-06, "loss": 3.3035, "mean_token_accuracy": 0.42238845845578415, "step": 6585 }, { "epoch": 1.2209862810530219, "grad_norm": 7.73828125, "learning_rate": 8.779013718946979e-06, "loss": 2.677, "mean_token_accuracy": 0.4865290914302092, "step": 6586 }, { "epoch": 1.2211716722284018, "grad_norm": 6.203125, "learning_rate": 8.7788283277716e-06, "loss": 2.5919, "mean_token_accuracy": 0.48125437981779956, "step": 6587 }, { "epoch": 1.221357063403782, "grad_norm": 6.375, "learning_rate": 8.77864293659622e-06, "loss": 3.0411, "mean_token_accuracy": 0.45101637492941843, "step": 6588 }, { "epoch": 1.221542454579162, "grad_norm": 8.765625, "learning_rate": 8.778457545420839e-06, "loss": 2.9616, "mean_token_accuracy": 0.43680734355759543, "step": 6589 }, { "epoch": 1.221727845754542, "grad_norm": 6.74609375, "learning_rate": 8.778272154245459e-06, "loss": 2.7919, "mean_token_accuracy": 0.47368421052631576, "step": 6590 }, { "epoch": 1.2219132369299222, "grad_norm": 12.1875, "learning_rate": 8.778086763070078e-06, "loss": 3.2453, "mean_token_accuracy": 0.4563861094761624, "step": 6591 }, { "epoch": 1.2220986281053021, "grad_norm": 7.1640625, "learning_rate": 8.777901371894698e-06, "loss": 2.4058, "mean_token_accuracy": 0.5050644567219152, "step": 6592 }, { "epoch": 1.2222840192806823, "grad_norm": 6.40234375, "learning_rate": 8.777715980719319e-06, "loss": 2.4092, "mean_token_accuracy": 0.5071184510250569, "step": 6593 }, { "epoch": 1.2224694104560623, "grad_norm": 6.1875, "learning_rate": 8.77753058954394e-06, "loss": 2.7646, "mean_token_accuracy": 0.45621805792163544, "step": 6594 }, { "epoch": 1.2226548016314425, "grad_norm": 6.421875, "learning_rate": 8.777345198368558e-06, "loss": 2.8224, "mean_token_accuracy": 0.44771513353115727, "step": 6595 }, { "epoch": 1.2228401928068224, "grad_norm": 5.5859375, "learning_rate": 8.777159807193179e-06, "loss": 2.6155, "mean_token_accuracy": 0.4716468151216986, "step": 6596 }, { "epoch": 1.2230255839822024, "grad_norm": 7.3359375, "learning_rate": 8.776974416017799e-06, "loss": 3.1917, "mean_token_accuracy": 0.43752733634640617, "step": 6597 }, { "epoch": 1.2232109751575826, "grad_norm": 6.3125, "learning_rate": 8.776789024842418e-06, "loss": 2.9273, "mean_token_accuracy": 0.4437559580552908, "step": 6598 }, { "epoch": 1.2233963663329626, "grad_norm": 6.2734375, "learning_rate": 8.776603633667038e-06, "loss": 3.8564, "mean_token_accuracy": 0.3781504731565633, "step": 6599 }, { "epoch": 1.2235817575083425, "grad_norm": 5.63671875, "learning_rate": 8.776418242491657e-06, "loss": 2.6114, "mean_token_accuracy": 0.47034327518289254, "step": 6600 }, { "epoch": 1.2237671486837227, "grad_norm": 5.4296875, "learning_rate": 8.776232851316278e-06, "loss": 2.6162, "mean_token_accuracy": 0.4822253000923361, "step": 6601 }, { "epoch": 1.2239525398591027, "grad_norm": 5.1328125, "learning_rate": 8.776047460140898e-06, "loss": 2.5476, "mean_token_accuracy": 0.47247150133398014, "step": 6602 }, { "epoch": 1.2241379310344827, "grad_norm": 6.37890625, "learning_rate": 8.775862068965519e-06, "loss": 3.1398, "mean_token_accuracy": 0.42875264270613106, "step": 6603 }, { "epoch": 1.2243233222098628, "grad_norm": 6.4765625, "learning_rate": 8.775676677790137e-06, "loss": 2.6803, "mean_token_accuracy": 0.4771668219944082, "step": 6604 }, { "epoch": 1.2245087133852428, "grad_norm": 6.27734375, "learning_rate": 8.775491286614758e-06, "loss": 2.7994, "mean_token_accuracy": 0.45821489482660605, "step": 6605 }, { "epoch": 1.224694104560623, "grad_norm": 7.98828125, "learning_rate": 8.775305895439378e-06, "loss": 2.5261, "mean_token_accuracy": 0.4983674680914218, "step": 6606 }, { "epoch": 1.224879495736003, "grad_norm": 6.69921875, "learning_rate": 8.775120504263997e-06, "loss": 2.6688, "mean_token_accuracy": 0.4730549006399461, "step": 6607 }, { "epoch": 1.225064886911383, "grad_norm": 6.390625, "learning_rate": 8.774935113088618e-06, "loss": 2.8597, "mean_token_accuracy": 0.4580811138014528, "step": 6608 }, { "epoch": 1.2252502780867631, "grad_norm": 5.52734375, "learning_rate": 8.774749721913236e-06, "loss": 2.6141, "mean_token_accuracy": 0.4949182501104728, "step": 6609 }, { "epoch": 1.225435669262143, "grad_norm": 6.4375, "learning_rate": 8.774564330737859e-06, "loss": 2.7782, "mean_token_accuracy": 0.4840261739799846, "step": 6610 }, { "epoch": 1.2256210604375233, "grad_norm": 7.3125, "learning_rate": 8.774378939562477e-06, "loss": 2.7077, "mean_token_accuracy": 0.4884555032503923, "step": 6611 }, { "epoch": 1.2258064516129032, "grad_norm": 5.9140625, "learning_rate": 8.774193548387098e-06, "loss": 3.0008, "mean_token_accuracy": 0.4347759829320329, "step": 6612 }, { "epoch": 1.2259918427882832, "grad_norm": 7.46484375, "learning_rate": 8.774008157211717e-06, "loss": 2.7343, "mean_token_accuracy": 0.44486732212707475, "step": 6613 }, { "epoch": 1.2261772339636634, "grad_norm": 7.29296875, "learning_rate": 8.773822766036337e-06, "loss": 3.0451, "mean_token_accuracy": 0.4677891654465593, "step": 6614 }, { "epoch": 1.2263626251390434, "grad_norm": 8.2421875, "learning_rate": 8.773637374860958e-06, "loss": 2.457, "mean_token_accuracy": 0.5040123891313529, "step": 6615 }, { "epoch": 1.2265480163144233, "grad_norm": 7.54296875, "learning_rate": 8.773451983685577e-06, "loss": 2.923, "mean_token_accuracy": 0.4517538384721009, "step": 6616 }, { "epoch": 1.2267334074898035, "grad_norm": 7.234375, "learning_rate": 8.773266592510197e-06, "loss": 3.0446, "mean_token_accuracy": 0.431665868836764, "step": 6617 }, { "epoch": 1.2269187986651835, "grad_norm": 7.5234375, "learning_rate": 8.773081201334818e-06, "loss": 2.6618, "mean_token_accuracy": 0.48417579121043947, "step": 6618 }, { "epoch": 1.2271041898405637, "grad_norm": 6.765625, "learning_rate": 8.772895810159438e-06, "loss": 3.3239, "mean_token_accuracy": 0.42322560692747796, "step": 6619 }, { "epoch": 1.2272895810159437, "grad_norm": 7.41796875, "learning_rate": 8.772710418984057e-06, "loss": 3.0598, "mean_token_accuracy": 0.4347728295096716, "step": 6620 }, { "epoch": 1.2274749721913236, "grad_norm": 8.390625, "learning_rate": 8.772525027808677e-06, "loss": 2.2123, "mean_token_accuracy": 0.5282861896838602, "step": 6621 }, { "epoch": 1.2276603633667038, "grad_norm": 7.24609375, "learning_rate": 8.772339636633296e-06, "loss": 2.5786, "mean_token_accuracy": 0.48101615352754384, "step": 6622 }, { "epoch": 1.2278457545420838, "grad_norm": 7.859375, "learning_rate": 8.772154245457917e-06, "loss": 2.8736, "mean_token_accuracy": 0.4778467311300332, "step": 6623 }, { "epoch": 1.228031145717464, "grad_norm": 9.25, "learning_rate": 8.771968854282537e-06, "loss": 2.8536, "mean_token_accuracy": 0.45817857673451196, "step": 6624 }, { "epoch": 1.228216536892844, "grad_norm": 5.8046875, "learning_rate": 8.771783463107156e-06, "loss": 2.7602, "mean_token_accuracy": 0.4623800706951692, "step": 6625 }, { "epoch": 1.228401928068224, "grad_norm": 8.1953125, "learning_rate": 8.771598071931778e-06, "loss": 2.6336, "mean_token_accuracy": 0.46781276277679157, "step": 6626 }, { "epoch": 1.228587319243604, "grad_norm": 6.84375, "learning_rate": 8.771412680756397e-06, "loss": 3.1214, "mean_token_accuracy": 0.4359861591695502, "step": 6627 }, { "epoch": 1.228772710418984, "grad_norm": 6.125, "learning_rate": 8.771227289581017e-06, "loss": 2.7447, "mean_token_accuracy": 0.46699779249448126, "step": 6628 }, { "epoch": 1.228958101594364, "grad_norm": 6.25, "learning_rate": 8.771041898405636e-06, "loss": 3.5625, "mean_token_accuracy": 0.39941010200319527, "step": 6629 }, { "epoch": 1.2291434927697442, "grad_norm": 6.859375, "learning_rate": 8.770856507230257e-06, "loss": 2.4065, "mean_token_accuracy": 0.5225885225885226, "step": 6630 }, { "epoch": 1.2293288839451242, "grad_norm": 5.7109375, "learning_rate": 8.770671116054877e-06, "loss": 2.7583, "mean_token_accuracy": 0.4509597686037339, "step": 6631 }, { "epoch": 1.2295142751205042, "grad_norm": 5.640625, "learning_rate": 8.770485724879496e-06, "loss": 3.0195, "mean_token_accuracy": 0.43264913406029504, "step": 6632 }, { "epoch": 1.2296996662958843, "grad_norm": 5.18359375, "learning_rate": 8.770300333704116e-06, "loss": 2.3864, "mean_token_accuracy": 0.5240506329113924, "step": 6633 }, { "epoch": 1.2298850574712643, "grad_norm": 5.015625, "learning_rate": 8.770114942528737e-06, "loss": 2.7436, "mean_token_accuracy": 0.48985855350947427, "step": 6634 }, { "epoch": 1.2300704486466445, "grad_norm": 6.109375, "learning_rate": 8.769929551353357e-06, "loss": 2.7309, "mean_token_accuracy": 0.48079618727221757, "step": 6635 }, { "epoch": 1.2302558398220245, "grad_norm": 6.0703125, "learning_rate": 8.769744160177976e-06, "loss": 2.8647, "mean_token_accuracy": 0.4470198675496689, "step": 6636 }, { "epoch": 1.2304412309974044, "grad_norm": 6.86328125, "learning_rate": 8.769558769002597e-06, "loss": 3.2423, "mean_token_accuracy": 0.42991960852848654, "step": 6637 }, { "epoch": 1.2306266221727846, "grad_norm": 5.50390625, "learning_rate": 8.769373377827215e-06, "loss": 2.9553, "mean_token_accuracy": 0.45041380277738813, "step": 6638 }, { "epoch": 1.2308120133481646, "grad_norm": 7.84765625, "learning_rate": 8.769187986651836e-06, "loss": 2.4645, "mean_token_accuracy": 0.5039218052371184, "step": 6639 }, { "epoch": 1.2309974045235448, "grad_norm": 5.640625, "learning_rate": 8.769002595476456e-06, "loss": 2.7687, "mean_token_accuracy": 0.46917269581629445, "step": 6640 }, { "epoch": 1.2311827956989247, "grad_norm": 5.4375, "learning_rate": 8.768817204301075e-06, "loss": 2.752, "mean_token_accuracy": 0.4710373242850218, "step": 6641 }, { "epoch": 1.2313681868743047, "grad_norm": 7.30078125, "learning_rate": 8.768631813125696e-06, "loss": 2.6614, "mean_token_accuracy": 0.4807282036694638, "step": 6642 }, { "epoch": 1.231553578049685, "grad_norm": 6.59375, "learning_rate": 8.768446421950316e-06, "loss": 3.493, "mean_token_accuracy": 0.4035029742233972, "step": 6643 }, { "epoch": 1.2317389692250649, "grad_norm": 5.8828125, "learning_rate": 8.768261030774937e-06, "loss": 3.1014, "mean_token_accuracy": 0.4492247520603436, "step": 6644 }, { "epoch": 1.2319243604004448, "grad_norm": 5.30859375, "learning_rate": 8.768075639599556e-06, "loss": 2.636, "mean_token_accuracy": 0.4631069198419778, "step": 6645 }, { "epoch": 1.232109751575825, "grad_norm": 7.27734375, "learning_rate": 8.767890248424176e-06, "loss": 2.5508, "mean_token_accuracy": 0.4787527956847783, "step": 6646 }, { "epoch": 1.232295142751205, "grad_norm": 5.3671875, "learning_rate": 8.767704857248795e-06, "loss": 2.0723, "mean_token_accuracy": 0.5542236164015236, "step": 6647 }, { "epoch": 1.2324805339265852, "grad_norm": 5.79296875, "learning_rate": 8.767519466073415e-06, "loss": 2.8559, "mean_token_accuracy": 0.45510485997739547, "step": 6648 }, { "epoch": 1.2326659251019652, "grad_norm": 8.4375, "learning_rate": 8.767334074898036e-06, "loss": 3.1364, "mean_token_accuracy": 0.4938002452650225, "step": 6649 }, { "epoch": 1.2328513162773451, "grad_norm": 5.99609375, "learning_rate": 8.767148683722656e-06, "loss": 2.7389, "mean_token_accuracy": 0.46055698687277635, "step": 6650 }, { "epoch": 1.2330367074527253, "grad_norm": 6.52734375, "learning_rate": 8.766963292547275e-06, "loss": 2.7879, "mean_token_accuracy": 0.4698723656871475, "step": 6651 }, { "epoch": 1.2332220986281053, "grad_norm": 5.5625, "learning_rate": 8.766777901371896e-06, "loss": 2.5989, "mean_token_accuracy": 0.48546429579452444, "step": 6652 }, { "epoch": 1.2334074898034855, "grad_norm": 5.47265625, "learning_rate": 8.766592510196516e-06, "loss": 3.167, "mean_token_accuracy": 0.4293281293560078, "step": 6653 }, { "epoch": 1.2335928809788654, "grad_norm": 7.54296875, "learning_rate": 8.766407119021135e-06, "loss": 2.5692, "mean_token_accuracy": 0.47995434441432444, "step": 6654 }, { "epoch": 1.2337782721542454, "grad_norm": 6.52734375, "learning_rate": 8.766221727845755e-06, "loss": 2.7878, "mean_token_accuracy": 0.44339356295878035, "step": 6655 }, { "epoch": 1.2339636633296256, "grad_norm": 8.3671875, "learning_rate": 8.766036336670374e-06, "loss": 2.5729, "mean_token_accuracy": 0.48121387283236994, "step": 6656 }, { "epoch": 1.2341490545050056, "grad_norm": 7.22265625, "learning_rate": 8.765850945494995e-06, "loss": 2.7465, "mean_token_accuracy": 0.4404864267289031, "step": 6657 }, { "epoch": 1.2343344456803855, "grad_norm": 7.68359375, "learning_rate": 8.765665554319615e-06, "loss": 3.2222, "mean_token_accuracy": 0.4331965344277246, "step": 6658 }, { "epoch": 1.2345198368557657, "grad_norm": 7.11328125, "learning_rate": 8.765480163144236e-06, "loss": 2.2198, "mean_token_accuracy": 0.5459989806320081, "step": 6659 }, { "epoch": 1.2347052280311457, "grad_norm": 7.71875, "learning_rate": 8.765294771968854e-06, "loss": 2.8011, "mean_token_accuracy": 0.4532710280373832, "step": 6660 }, { "epoch": 1.2348906192065257, "grad_norm": 6.79296875, "learning_rate": 8.765109380793475e-06, "loss": 2.9427, "mean_token_accuracy": 0.47419566644780037, "step": 6661 }, { "epoch": 1.2350760103819058, "grad_norm": 6.6015625, "learning_rate": 8.764923989618095e-06, "loss": 2.8941, "mean_token_accuracy": 0.44818387030576645, "step": 6662 }, { "epoch": 1.2352614015572858, "grad_norm": 7.52734375, "learning_rate": 8.764738598442714e-06, "loss": 2.4172, "mean_token_accuracy": 0.4943465684985538, "step": 6663 }, { "epoch": 1.235446792732666, "grad_norm": 8.671875, "learning_rate": 8.764553207267335e-06, "loss": 2.4013, "mean_token_accuracy": 0.49782293178519593, "step": 6664 }, { "epoch": 1.235632183908046, "grad_norm": 5.7890625, "learning_rate": 8.764367816091954e-06, "loss": 2.6393, "mean_token_accuracy": 0.48471678980153554, "step": 6665 }, { "epoch": 1.235817575083426, "grad_norm": 9.421875, "learning_rate": 8.764182424916576e-06, "loss": 2.6939, "mean_token_accuracy": 0.4692993964838625, "step": 6666 }, { "epoch": 1.2360029662588061, "grad_norm": 6.49609375, "learning_rate": 8.763997033741194e-06, "loss": 3.1109, "mean_token_accuracy": 0.4267895109851169, "step": 6667 }, { "epoch": 1.236188357434186, "grad_norm": 6.40234375, "learning_rate": 8.763811642565815e-06, "loss": 2.7938, "mean_token_accuracy": 0.45982812713135995, "step": 6668 }, { "epoch": 1.2363737486095663, "grad_norm": 7.7265625, "learning_rate": 8.763626251390435e-06, "loss": 3.2074, "mean_token_accuracy": 0.43894121468009095, "step": 6669 }, { "epoch": 1.2365591397849462, "grad_norm": 8.6640625, "learning_rate": 8.763440860215054e-06, "loss": 2.623, "mean_token_accuracy": 0.4936941671045717, "step": 6670 }, { "epoch": 1.2367445309603262, "grad_norm": 5.5625, "learning_rate": 8.763255469039675e-06, "loss": 2.8535, "mean_token_accuracy": 0.4628241174632927, "step": 6671 }, { "epoch": 1.2369299221357064, "grad_norm": 7.23828125, "learning_rate": 8.763070077864294e-06, "loss": 2.4929, "mean_token_accuracy": 0.49324001908700493, "step": 6672 }, { "epoch": 1.2371153133110864, "grad_norm": 10.0078125, "learning_rate": 8.762884686688914e-06, "loss": 3.155, "mean_token_accuracy": 0.43948264125255276, "step": 6673 }, { "epoch": 1.2373007044864663, "grad_norm": 5.69140625, "learning_rate": 8.762699295513535e-06, "loss": 2.4574, "mean_token_accuracy": 0.48737953189536487, "step": 6674 }, { "epoch": 1.2374860956618465, "grad_norm": 6.06640625, "learning_rate": 8.762513904338155e-06, "loss": 2.9325, "mean_token_accuracy": 0.4470570837030464, "step": 6675 }, { "epoch": 1.2376714868372265, "grad_norm": 6.08203125, "learning_rate": 8.762328513162774e-06, "loss": 2.8394, "mean_token_accuracy": 0.4507323568575233, "step": 6676 }, { "epoch": 1.2378568780126067, "grad_norm": 6.32421875, "learning_rate": 8.762143121987394e-06, "loss": 2.5263, "mean_token_accuracy": 0.5110150585610709, "step": 6677 }, { "epoch": 1.2380422691879867, "grad_norm": 5.84375, "learning_rate": 8.761957730812015e-06, "loss": 2.4604, "mean_token_accuracy": 0.4934403457323661, "step": 6678 }, { "epoch": 1.2382276603633666, "grad_norm": 6.70703125, "learning_rate": 8.761772339636634e-06, "loss": 2.7038, "mean_token_accuracy": 0.4991596638655462, "step": 6679 }, { "epoch": 1.2384130515387468, "grad_norm": 5.7421875, "learning_rate": 8.761586948461254e-06, "loss": 2.6639, "mean_token_accuracy": 0.4620160288971667, "step": 6680 }, { "epoch": 1.2385984427141268, "grad_norm": 6.33984375, "learning_rate": 8.761401557285873e-06, "loss": 2.5681, "mean_token_accuracy": 0.4895148026315789, "step": 6681 }, { "epoch": 1.238783833889507, "grad_norm": 7.43359375, "learning_rate": 8.761216166110493e-06, "loss": 2.5821, "mean_token_accuracy": 0.4804010938924339, "step": 6682 }, { "epoch": 1.238969225064887, "grad_norm": 5.94921875, "learning_rate": 8.761030774935114e-06, "loss": 2.7824, "mean_token_accuracy": 0.46069761729304837, "step": 6683 }, { "epoch": 1.239154616240267, "grad_norm": 7.59765625, "learning_rate": 8.760845383759734e-06, "loss": 3.1791, "mean_token_accuracy": 0.4210680751173709, "step": 6684 }, { "epoch": 1.239340007415647, "grad_norm": 8.0390625, "learning_rate": 8.760659992584353e-06, "loss": 2.5043, "mean_token_accuracy": 0.48750604546187326, "step": 6685 }, { "epoch": 1.239525398591027, "grad_norm": 6.1796875, "learning_rate": 8.760474601408974e-06, "loss": 2.7138, "mean_token_accuracy": 0.47574497574497576, "step": 6686 }, { "epoch": 1.239710789766407, "grad_norm": 7.41015625, "learning_rate": 8.760289210233594e-06, "loss": 2.4869, "mean_token_accuracy": 0.48620938628158844, "step": 6687 }, { "epoch": 1.2398961809417872, "grad_norm": 6.24609375, "learning_rate": 8.760103819058213e-06, "loss": 2.4615, "mean_token_accuracy": 0.4866682974559687, "step": 6688 }, { "epoch": 1.2400815721171672, "grad_norm": 7.00390625, "learning_rate": 8.759918427882833e-06, "loss": 2.404, "mean_token_accuracy": 0.4994980639609924, "step": 6689 }, { "epoch": 1.2402669632925472, "grad_norm": 6.390625, "learning_rate": 8.759733036707452e-06, "loss": 2.6777, "mean_token_accuracy": 0.46771117166212534, "step": 6690 }, { "epoch": 1.2404523544679273, "grad_norm": 6.78125, "learning_rate": 8.759547645532074e-06, "loss": 2.7794, "mean_token_accuracy": 0.46309006863706403, "step": 6691 }, { "epoch": 1.2406377456433073, "grad_norm": 6.4453125, "learning_rate": 8.759362254356693e-06, "loss": 2.7323, "mean_token_accuracy": 0.44880480634027864, "step": 6692 }, { "epoch": 1.2408231368186875, "grad_norm": 6.3984375, "learning_rate": 8.759176863181314e-06, "loss": 2.8165, "mean_token_accuracy": 0.4481180811808118, "step": 6693 }, { "epoch": 1.2410085279940675, "grad_norm": 6.74609375, "learning_rate": 8.758991472005933e-06, "loss": 2.9653, "mean_token_accuracy": 0.4480534439069339, "step": 6694 }, { "epoch": 1.2411939191694477, "grad_norm": 10.0390625, "learning_rate": 8.758806080830553e-06, "loss": 2.5681, "mean_token_accuracy": 0.4628238341968912, "step": 6695 }, { "epoch": 1.2413793103448276, "grad_norm": 8.5, "learning_rate": 8.758620689655173e-06, "loss": 2.4418, "mean_token_accuracy": 0.4923014586709887, "step": 6696 }, { "epoch": 1.2415647015202076, "grad_norm": 5.5859375, "learning_rate": 8.758435298479792e-06, "loss": 3.1324, "mean_token_accuracy": 0.4509918319719953, "step": 6697 }, { "epoch": 1.2417500926955878, "grad_norm": 6.7109375, "learning_rate": 8.758249907304413e-06, "loss": 2.7712, "mean_token_accuracy": 0.46607237892496006, "step": 6698 }, { "epoch": 1.2419354838709677, "grad_norm": 5.17578125, "learning_rate": 8.758064516129033e-06, "loss": 2.457, "mean_token_accuracy": 0.48798619407938404, "step": 6699 }, { "epoch": 1.2421208750463477, "grad_norm": 7.8984375, "learning_rate": 8.757879124953654e-06, "loss": 2.9329, "mean_token_accuracy": 0.4502302968270215, "step": 6700 }, { "epoch": 1.242306266221728, "grad_norm": 5.88671875, "learning_rate": 8.757693733778273e-06, "loss": 2.7687, "mean_token_accuracy": 0.4925673583152679, "step": 6701 }, { "epoch": 1.2424916573971079, "grad_norm": 6.93359375, "learning_rate": 8.757508342602893e-06, "loss": 3.2166, "mean_token_accuracy": 0.43183420273509093, "step": 6702 }, { "epoch": 1.2426770485724878, "grad_norm": 11.8359375, "learning_rate": 8.757322951427512e-06, "loss": 2.6865, "mean_token_accuracy": 0.4767163384779138, "step": 6703 }, { "epoch": 1.242862439747868, "grad_norm": 5.66015625, "learning_rate": 8.757137560252132e-06, "loss": 3.051, "mean_token_accuracy": 0.4544376358964222, "step": 6704 }, { "epoch": 1.243047830923248, "grad_norm": 6.453125, "learning_rate": 8.756952169076753e-06, "loss": 3.2754, "mean_token_accuracy": 0.40908366533864543, "step": 6705 }, { "epoch": 1.2432332220986282, "grad_norm": 6.93359375, "learning_rate": 8.756766777901372e-06, "loss": 2.1753, "mean_token_accuracy": 0.5378930395055093, "step": 6706 }, { "epoch": 1.2434186132740082, "grad_norm": 6.95703125, "learning_rate": 8.756581386725994e-06, "loss": 3.4061, "mean_token_accuracy": 0.40572597137014316, "step": 6707 }, { "epoch": 1.2436040044493881, "grad_norm": 6.16796875, "learning_rate": 8.756395995550613e-06, "loss": 3.321, "mean_token_accuracy": 0.4239890164752871, "step": 6708 }, { "epoch": 1.2437893956247683, "grad_norm": 6.27734375, "learning_rate": 8.756210604375233e-06, "loss": 2.8756, "mean_token_accuracy": 0.4459138187221397, "step": 6709 }, { "epoch": 1.2439747868001483, "grad_norm": 6.52734375, "learning_rate": 8.756025213199852e-06, "loss": 2.457, "mean_token_accuracy": 0.5065158593557905, "step": 6710 }, { "epoch": 1.2441601779755285, "grad_norm": 6.08203125, "learning_rate": 8.755839822024472e-06, "loss": 2.9262, "mean_token_accuracy": 0.44918552688482055, "step": 6711 }, { "epoch": 1.2443455691509084, "grad_norm": 7.19140625, "learning_rate": 8.755654430849093e-06, "loss": 3.0281, "mean_token_accuracy": 0.44071438163018667, "step": 6712 }, { "epoch": 1.2445309603262884, "grad_norm": 6.19921875, "learning_rate": 8.755469039673712e-06, "loss": 3.1844, "mean_token_accuracy": 0.42174993099641184, "step": 6713 }, { "epoch": 1.2447163515016686, "grad_norm": 6.9453125, "learning_rate": 8.755283648498332e-06, "loss": 2.8651, "mean_token_accuracy": 0.46655934435826135, "step": 6714 }, { "epoch": 1.2449017426770486, "grad_norm": 6.54296875, "learning_rate": 8.755098257322953e-06, "loss": 3.747, "mean_token_accuracy": 0.39570032573289904, "step": 6715 }, { "epoch": 1.2450871338524285, "grad_norm": 6.24609375, "learning_rate": 8.754912866147573e-06, "loss": 2.623, "mean_token_accuracy": 0.47602950215119855, "step": 6716 }, { "epoch": 1.2452725250278087, "grad_norm": 7.22265625, "learning_rate": 8.754727474972192e-06, "loss": 3.2433, "mean_token_accuracy": 0.4242527948893452, "step": 6717 }, { "epoch": 1.2454579162031887, "grad_norm": 6.76953125, "learning_rate": 8.754542083796812e-06, "loss": 2.6421, "mean_token_accuracy": 0.47396316417119383, "step": 6718 }, { "epoch": 1.2456433073785689, "grad_norm": 6.8203125, "learning_rate": 8.754356692621431e-06, "loss": 3.1563, "mean_token_accuracy": 0.4281524926686217, "step": 6719 }, { "epoch": 1.2458286985539488, "grad_norm": 5.765625, "learning_rate": 8.754171301446052e-06, "loss": 2.4215, "mean_token_accuracy": 0.5042219541616405, "step": 6720 }, { "epoch": 1.2460140897293288, "grad_norm": 8.875, "learning_rate": 8.753985910270672e-06, "loss": 2.7341, "mean_token_accuracy": 0.46629791613069205, "step": 6721 }, { "epoch": 1.246199480904709, "grad_norm": 6.81640625, "learning_rate": 8.753800519095291e-06, "loss": 2.2938, "mean_token_accuracy": 0.5015186490098409, "step": 6722 }, { "epoch": 1.246384872080089, "grad_norm": 7.921875, "learning_rate": 8.753615127919912e-06, "loss": 2.4598, "mean_token_accuracy": 0.4870848708487085, "step": 6723 }, { "epoch": 1.2465702632554692, "grad_norm": 6.6953125, "learning_rate": 8.753429736744532e-06, "loss": 2.4729, "mean_token_accuracy": 0.5131323586492431, "step": 6724 }, { "epoch": 1.2467556544308491, "grad_norm": 6.13671875, "learning_rate": 8.753244345569152e-06, "loss": 2.8349, "mean_token_accuracy": 0.4689895470383275, "step": 6725 }, { "epoch": 1.246941045606229, "grad_norm": 4.97265625, "learning_rate": 8.753058954393771e-06, "loss": 2.8458, "mean_token_accuracy": 0.4401634559252773, "step": 6726 }, { "epoch": 1.2471264367816093, "grad_norm": 7.7265625, "learning_rate": 8.752873563218392e-06, "loss": 3.0488, "mean_token_accuracy": 0.4328901154039137, "step": 6727 }, { "epoch": 1.2473118279569892, "grad_norm": 5.28125, "learning_rate": 8.75268817204301e-06, "loss": 2.4577, "mean_token_accuracy": 0.49883025734338443, "step": 6728 }, { "epoch": 1.2474972191323692, "grad_norm": 7.765625, "learning_rate": 8.752502780867631e-06, "loss": 2.727, "mean_token_accuracy": 0.4638447971781305, "step": 6729 }, { "epoch": 1.2476826103077494, "grad_norm": 5.29296875, "learning_rate": 8.752317389692252e-06, "loss": 2.9785, "mean_token_accuracy": 0.4434279564106014, "step": 6730 }, { "epoch": 1.2478680014831294, "grad_norm": 6.9765625, "learning_rate": 8.752131998516872e-06, "loss": 2.0136, "mean_token_accuracy": 0.5574194884539713, "step": 6731 }, { "epoch": 1.2480533926585093, "grad_norm": 6.80859375, "learning_rate": 8.751946607341491e-06, "loss": 2.8212, "mean_token_accuracy": 0.4519568489713999, "step": 6732 }, { "epoch": 1.2482387838338895, "grad_norm": 5.08984375, "learning_rate": 8.751761216166111e-06, "loss": 2.8478, "mean_token_accuracy": 0.45098501828477056, "step": 6733 }, { "epoch": 1.2484241750092695, "grad_norm": 7.015625, "learning_rate": 8.751575824990732e-06, "loss": 2.5982, "mean_token_accuracy": 0.4870630957784839, "step": 6734 }, { "epoch": 1.2486095661846497, "grad_norm": 9.96875, "learning_rate": 8.75139043381535e-06, "loss": 3.0341, "mean_token_accuracy": 0.4554203226719502, "step": 6735 }, { "epoch": 1.2487949573600297, "grad_norm": 7.12109375, "learning_rate": 8.751205042639971e-06, "loss": 2.7293, "mean_token_accuracy": 0.4726410299847535, "step": 6736 }, { "epoch": 1.2489803485354096, "grad_norm": 6.0859375, "learning_rate": 8.75101965146459e-06, "loss": 3.0175, "mean_token_accuracy": 0.4456856079002752, "step": 6737 }, { "epoch": 1.2491657397107898, "grad_norm": 8.078125, "learning_rate": 8.75083426028921e-06, "loss": 2.8711, "mean_token_accuracy": 0.44144736842105264, "step": 6738 }, { "epoch": 1.2493511308861698, "grad_norm": 7.0859375, "learning_rate": 8.750648869113831e-06, "loss": 3.2202, "mean_token_accuracy": 0.4200706001008573, "step": 6739 }, { "epoch": 1.24953652206155, "grad_norm": 6.421875, "learning_rate": 8.750463477938451e-06, "loss": 2.5511, "mean_token_accuracy": 0.4797720005560962, "step": 6740 }, { "epoch": 1.24972191323693, "grad_norm": 6.14453125, "learning_rate": 8.75027808676307e-06, "loss": 2.5444, "mean_token_accuracy": 0.49317124418430136, "step": 6741 }, { "epoch": 1.24990730441231, "grad_norm": 7.4609375, "learning_rate": 8.75009269558769e-06, "loss": 2.9241, "mean_token_accuracy": 0.4326327299234601, "step": 6742 }, { "epoch": 1.25009269558769, "grad_norm": 7.81640625, "learning_rate": 8.749907304412311e-06, "loss": 3.1704, "mean_token_accuracy": 0.43146985841482993, "step": 6743 }, { "epoch": 1.25027808676307, "grad_norm": 6.625, "learning_rate": 8.74972191323693e-06, "loss": 2.9301, "mean_token_accuracy": 0.44171632896305124, "step": 6744 }, { "epoch": 1.25046347793845, "grad_norm": 6.41796875, "learning_rate": 8.74953652206155e-06, "loss": 2.4918, "mean_token_accuracy": 0.4941389728096677, "step": 6745 }, { "epoch": 1.2506488691138302, "grad_norm": 6.63671875, "learning_rate": 8.74935113088617e-06, "loss": 2.6894, "mean_token_accuracy": 0.4709908735332464, "step": 6746 }, { "epoch": 1.2508342602892102, "grad_norm": 8.2109375, "learning_rate": 8.749165739710791e-06, "loss": 2.5561, "mean_token_accuracy": 0.4878086902156924, "step": 6747 }, { "epoch": 1.2510196514645902, "grad_norm": 6.06640625, "learning_rate": 8.74898034853541e-06, "loss": 3.5381, "mean_token_accuracy": 0.40937038858829317, "step": 6748 }, { "epoch": 1.2512050426399703, "grad_norm": 5.5234375, "learning_rate": 8.74879495736003e-06, "loss": 2.1615, "mean_token_accuracy": 0.548824494259158, "step": 6749 }, { "epoch": 1.2513904338153503, "grad_norm": 9.9609375, "learning_rate": 8.748609566184651e-06, "loss": 2.9008, "mean_token_accuracy": 0.45496838840510556, "step": 6750 }, { "epoch": 1.2515758249907305, "grad_norm": 7.6953125, "learning_rate": 8.74842417500927e-06, "loss": 2.8284, "mean_token_accuracy": 0.47239642232959805, "step": 6751 }, { "epoch": 1.2517612161661105, "grad_norm": 9.171875, "learning_rate": 8.74823878383389e-06, "loss": 3.8383, "mean_token_accuracy": 0.4061146181458101, "step": 6752 }, { "epoch": 1.2519466073414907, "grad_norm": 10.0390625, "learning_rate": 8.74805339265851e-06, "loss": 2.7514, "mean_token_accuracy": 0.5150321648943154, "step": 6753 }, { "epoch": 1.2521319985168706, "grad_norm": 5.6484375, "learning_rate": 8.74786800148313e-06, "loss": 2.9846, "mean_token_accuracy": 0.44584708076316165, "step": 6754 }, { "epoch": 1.2523173896922506, "grad_norm": 7.67578125, "learning_rate": 8.74768261030775e-06, "loss": 3.2229, "mean_token_accuracy": 0.4276060388209921, "step": 6755 }, { "epoch": 1.2525027808676308, "grad_norm": 6.2265625, "learning_rate": 8.74749721913237e-06, "loss": 2.4108, "mean_token_accuracy": 0.4786336059087312, "step": 6756 }, { "epoch": 1.2526881720430108, "grad_norm": 6.015625, "learning_rate": 8.74731182795699e-06, "loss": 2.6911, "mean_token_accuracy": 0.46736292428198434, "step": 6757 }, { "epoch": 1.2528735632183907, "grad_norm": 6.24609375, "learning_rate": 8.74712643678161e-06, "loss": 2.5607, "mean_token_accuracy": 0.4932332537245536, "step": 6758 }, { "epoch": 1.253058954393771, "grad_norm": 6.41796875, "learning_rate": 8.74694104560623e-06, "loss": 2.2509, "mean_token_accuracy": 0.5244332493702771, "step": 6759 }, { "epoch": 1.2532443455691509, "grad_norm": 5.7265625, "learning_rate": 8.74675565443085e-06, "loss": 2.5473, "mean_token_accuracy": 0.4777234447161098, "step": 6760 }, { "epoch": 1.2534297367445308, "grad_norm": 8.1953125, "learning_rate": 8.74657026325547e-06, "loss": 2.9633, "mean_token_accuracy": 0.43644767249917465, "step": 6761 }, { "epoch": 1.253615127919911, "grad_norm": 6.63671875, "learning_rate": 8.746384872080089e-06, "loss": 2.5463, "mean_token_accuracy": 0.48415968177297913, "step": 6762 }, { "epoch": 1.253800519095291, "grad_norm": 7.01953125, "learning_rate": 8.74619948090471e-06, "loss": 2.7518, "mean_token_accuracy": 0.4630438055095589, "step": 6763 }, { "epoch": 1.2539859102706712, "grad_norm": 6.52734375, "learning_rate": 8.74601408972933e-06, "loss": 2.2152, "mean_token_accuracy": 0.5293951384963256, "step": 6764 }, { "epoch": 1.2541713014460512, "grad_norm": 6.34765625, "learning_rate": 8.74582869855395e-06, "loss": 2.6572, "mean_token_accuracy": 0.46649544711650714, "step": 6765 }, { "epoch": 1.2543566926214313, "grad_norm": 7.33203125, "learning_rate": 8.745643307378569e-06, "loss": 2.393, "mean_token_accuracy": 0.49072192875524856, "step": 6766 }, { "epoch": 1.2545420837968113, "grad_norm": 6.28125, "learning_rate": 8.74545791620319e-06, "loss": 3.1449, "mean_token_accuracy": 0.43002058218171124, "step": 6767 }, { "epoch": 1.2547274749721913, "grad_norm": 7.2890625, "learning_rate": 8.74527252502781e-06, "loss": 2.9077, "mean_token_accuracy": 0.4720194647201946, "step": 6768 }, { "epoch": 1.2549128661475715, "grad_norm": 7.1796875, "learning_rate": 8.745087133852429e-06, "loss": 2.5595, "mean_token_accuracy": 0.5012501644953283, "step": 6769 }, { "epoch": 1.2550982573229514, "grad_norm": 6.18359375, "learning_rate": 8.74490174267705e-06, "loss": 2.6439, "mean_token_accuracy": 0.48436770881941205, "step": 6770 }, { "epoch": 1.2552836484983314, "grad_norm": 6.83203125, "learning_rate": 8.74471635150167e-06, "loss": 2.7475, "mean_token_accuracy": 0.46584272714925984, "step": 6771 }, { "epoch": 1.2554690396737116, "grad_norm": 9.2421875, "learning_rate": 8.74453096032629e-06, "loss": 2.8991, "mean_token_accuracy": 0.46563918505225377, "step": 6772 }, { "epoch": 1.2556544308490916, "grad_norm": 6.06640625, "learning_rate": 8.744345569150909e-06, "loss": 2.894, "mean_token_accuracy": 0.46795366795366794, "step": 6773 }, { "epoch": 1.2558398220244715, "grad_norm": 6.83984375, "learning_rate": 8.74416017797553e-06, "loss": 2.7069, "mean_token_accuracy": 0.46266094420600856, "step": 6774 }, { "epoch": 1.2560252131998517, "grad_norm": 5.44140625, "learning_rate": 8.743974786800148e-06, "loss": 2.9142, "mean_token_accuracy": 0.45024424284717374, "step": 6775 }, { "epoch": 1.2562106043752317, "grad_norm": 8.9921875, "learning_rate": 8.743789395624769e-06, "loss": 2.6917, "mean_token_accuracy": 0.44411918452692106, "step": 6776 }, { "epoch": 1.2563959955506117, "grad_norm": 7.2421875, "learning_rate": 8.74360400444939e-06, "loss": 2.5661, "mean_token_accuracy": 0.5127145991996902, "step": 6777 }, { "epoch": 1.2565813867259918, "grad_norm": 7.65234375, "learning_rate": 8.743418613274008e-06, "loss": 3.1534, "mean_token_accuracy": 0.45713179794851944, "step": 6778 }, { "epoch": 1.2567667779013718, "grad_norm": 5.56640625, "learning_rate": 8.743233222098629e-06, "loss": 3.115, "mean_token_accuracy": 0.4294346579270791, "step": 6779 }, { "epoch": 1.256952169076752, "grad_norm": 6.02734375, "learning_rate": 8.743047830923249e-06, "loss": 2.6003, "mean_token_accuracy": 0.48210502843126324, "step": 6780 }, { "epoch": 1.257137560252132, "grad_norm": 5.66796875, "learning_rate": 8.74286243974787e-06, "loss": 2.4566, "mean_token_accuracy": 0.5140995260663507, "step": 6781 }, { "epoch": 1.2573229514275122, "grad_norm": 8.7109375, "learning_rate": 8.742677048572488e-06, "loss": 2.4733, "mean_token_accuracy": 0.46385193753614806, "step": 6782 }, { "epoch": 1.2575083426028921, "grad_norm": 6.0390625, "learning_rate": 8.742491657397109e-06, "loss": 2.9813, "mean_token_accuracy": 0.44597417394606914, "step": 6783 }, { "epoch": 1.257693733778272, "grad_norm": 7.46484375, "learning_rate": 8.742306266221728e-06, "loss": 2.7349, "mean_token_accuracy": 0.4406564197388189, "step": 6784 }, { "epoch": 1.2578791249536523, "grad_norm": 6.69921875, "learning_rate": 8.742120875046348e-06, "loss": 2.8005, "mean_token_accuracy": 0.47238934250107434, "step": 6785 }, { "epoch": 1.2580645161290323, "grad_norm": 4.94140625, "learning_rate": 8.741935483870969e-06, "loss": 2.5127, "mean_token_accuracy": 0.5139984866500918, "step": 6786 }, { "epoch": 1.2582499073044122, "grad_norm": 5.62109375, "learning_rate": 8.741750092695589e-06, "loss": 2.4651, "mean_token_accuracy": 0.49170991013795723, "step": 6787 }, { "epoch": 1.2584352984797924, "grad_norm": 6.36328125, "learning_rate": 8.74156470152021e-06, "loss": 2.2244, "mean_token_accuracy": 0.5210989678202793, "step": 6788 }, { "epoch": 1.2586206896551724, "grad_norm": 6.02734375, "learning_rate": 8.741379310344828e-06, "loss": 2.8136, "mean_token_accuracy": 0.4666281421554464, "step": 6789 }, { "epoch": 1.2588060808305523, "grad_norm": 9.03125, "learning_rate": 8.741193919169449e-06, "loss": 2.8093, "mean_token_accuracy": 0.4558382257012394, "step": 6790 }, { "epoch": 1.2589914720059325, "grad_norm": 5.90234375, "learning_rate": 8.741008527994068e-06, "loss": 3.3896, "mean_token_accuracy": 0.4233520694941237, "step": 6791 }, { "epoch": 1.2591768631813125, "grad_norm": 6.796875, "learning_rate": 8.740823136818688e-06, "loss": 2.9796, "mean_token_accuracy": 0.45075519194461927, "step": 6792 }, { "epoch": 1.2593622543566927, "grad_norm": 5.8671875, "learning_rate": 8.740637745643309e-06, "loss": 2.5326, "mean_token_accuracy": 0.4676577394462097, "step": 6793 }, { "epoch": 1.2595476455320727, "grad_norm": 5.6328125, "learning_rate": 8.740452354467927e-06, "loss": 2.7728, "mean_token_accuracy": 0.47548312662244013, "step": 6794 }, { "epoch": 1.2597330367074528, "grad_norm": 7.11328125, "learning_rate": 8.740266963292548e-06, "loss": 3.1244, "mean_token_accuracy": 0.4441281138790036, "step": 6795 }, { "epoch": 1.2599184278828328, "grad_norm": 5.59765625, "learning_rate": 8.740081572117168e-06, "loss": 3.1198, "mean_token_accuracy": 0.434337123578658, "step": 6796 }, { "epoch": 1.2601038190582128, "grad_norm": 6.24609375, "learning_rate": 8.739896180941789e-06, "loss": 2.7971, "mean_token_accuracy": 0.4645439163205661, "step": 6797 }, { "epoch": 1.260289210233593, "grad_norm": 7.46484375, "learning_rate": 8.739710789766408e-06, "loss": 2.4278, "mean_token_accuracy": 0.5083766608896592, "step": 6798 }, { "epoch": 1.260474601408973, "grad_norm": 7.0625, "learning_rate": 8.739525398591028e-06, "loss": 2.5152, "mean_token_accuracy": 0.48158834844737564, "step": 6799 }, { "epoch": 1.260659992584353, "grad_norm": 8.1953125, "learning_rate": 8.739340007415647e-06, "loss": 3.0181, "mean_token_accuracy": 0.4427994616419919, "step": 6800 }, { "epoch": 1.260845383759733, "grad_norm": 6.125, "learning_rate": 8.739154616240267e-06, "loss": 3.1782, "mean_token_accuracy": 0.4527429934406679, "step": 6801 }, { "epoch": 1.261030774935113, "grad_norm": 10.171875, "learning_rate": 8.738969225064888e-06, "loss": 2.9036, "mean_token_accuracy": 0.4509746445844936, "step": 6802 }, { "epoch": 1.261216166110493, "grad_norm": 10.5, "learning_rate": 8.738783833889508e-06, "loss": 2.9682, "mean_token_accuracy": 0.43660326993660326, "step": 6803 }, { "epoch": 1.2614015572858732, "grad_norm": 6.4296875, "learning_rate": 8.738598442714127e-06, "loss": 2.7514, "mean_token_accuracy": 0.46253164556962023, "step": 6804 }, { "epoch": 1.2615869484612532, "grad_norm": 8.5, "learning_rate": 8.738413051538748e-06, "loss": 2.5542, "mean_token_accuracy": 0.5015659436260295, "step": 6805 }, { "epoch": 1.2617723396366334, "grad_norm": 8.921875, "learning_rate": 8.738227660363368e-06, "loss": 2.4894, "mean_token_accuracy": 0.4945043864071796, "step": 6806 }, { "epoch": 1.2619577308120133, "grad_norm": 8.1171875, "learning_rate": 8.738042269187987e-06, "loss": 2.9399, "mean_token_accuracy": 0.44983857467416, "step": 6807 }, { "epoch": 1.2621431219873935, "grad_norm": 5.88671875, "learning_rate": 8.737856878012608e-06, "loss": 2.8639, "mean_token_accuracy": 0.46305631571366845, "step": 6808 }, { "epoch": 1.2623285131627735, "grad_norm": 11.6875, "learning_rate": 8.737671486837226e-06, "loss": 2.7446, "mean_token_accuracy": 0.4537424980183445, "step": 6809 }, { "epoch": 1.2625139043381535, "grad_norm": 7.90234375, "learning_rate": 8.737486095661847e-06, "loss": 2.8229, "mean_token_accuracy": 0.44919143356643354, "step": 6810 }, { "epoch": 1.2626992955135337, "grad_norm": 6.4765625, "learning_rate": 8.737300704486467e-06, "loss": 2.953, "mean_token_accuracy": 0.4464570352122881, "step": 6811 }, { "epoch": 1.2628846866889136, "grad_norm": 8.234375, "learning_rate": 8.737115313311088e-06, "loss": 2.7617, "mean_token_accuracy": 0.478149446992177, "step": 6812 }, { "epoch": 1.2630700778642936, "grad_norm": 7.37109375, "learning_rate": 8.736929922135707e-06, "loss": 2.5801, "mean_token_accuracy": 0.4879990050988683, "step": 6813 }, { "epoch": 1.2632554690396738, "grad_norm": 6.54296875, "learning_rate": 8.736744530960327e-06, "loss": 1.7893, "mean_token_accuracy": 0.5961298377028714, "step": 6814 }, { "epoch": 1.2634408602150538, "grad_norm": 8.59375, "learning_rate": 8.736559139784948e-06, "loss": 2.9098, "mean_token_accuracy": 0.4634653644420329, "step": 6815 }, { "epoch": 1.2636262513904337, "grad_norm": 6.5859375, "learning_rate": 8.736373748609566e-06, "loss": 2.8137, "mean_token_accuracy": 0.46824044779751767, "step": 6816 }, { "epoch": 1.263811642565814, "grad_norm": 9.015625, "learning_rate": 8.736188357434187e-06, "loss": 2.9523, "mean_token_accuracy": 0.4348478646472916, "step": 6817 }, { "epoch": 1.2639970337411939, "grad_norm": 7.1484375, "learning_rate": 8.736002966258806e-06, "loss": 2.5326, "mean_token_accuracy": 0.4890496078231639, "step": 6818 }, { "epoch": 1.2641824249165738, "grad_norm": 7.72265625, "learning_rate": 8.735817575083426e-06, "loss": 2.7792, "mean_token_accuracy": 0.4627399546917849, "step": 6819 }, { "epoch": 1.264367816091954, "grad_norm": 5.72265625, "learning_rate": 8.735632183908047e-06, "loss": 2.2659, "mean_token_accuracy": 0.5080310163389643, "step": 6820 }, { "epoch": 1.264553207267334, "grad_norm": 6.171875, "learning_rate": 8.735446792732667e-06, "loss": 2.9384, "mean_token_accuracy": 0.4402029475718773, "step": 6821 }, { "epoch": 1.2647385984427142, "grad_norm": 5.98828125, "learning_rate": 8.735261401557286e-06, "loss": 2.8103, "mean_token_accuracy": 0.4395251051199604, "step": 6822 }, { "epoch": 1.2649239896180942, "grad_norm": 7.79296875, "learning_rate": 8.735076010381906e-06, "loss": 2.5809, "mean_token_accuracy": 0.4697717104508432, "step": 6823 }, { "epoch": 1.2651093807934743, "grad_norm": 6.6171875, "learning_rate": 8.734890619206527e-06, "loss": 2.8641, "mean_token_accuracy": 0.4699742442727396, "step": 6824 }, { "epoch": 1.2652947719688543, "grad_norm": 6.16796875, "learning_rate": 8.734705228031146e-06, "loss": 2.7156, "mean_token_accuracy": 0.4700846999459362, "step": 6825 }, { "epoch": 1.2654801631442343, "grad_norm": 5.4765625, "learning_rate": 8.734519836855766e-06, "loss": 3.5967, "mean_token_accuracy": 0.40620138960904284, "step": 6826 }, { "epoch": 1.2656655543196145, "grad_norm": 6.92578125, "learning_rate": 8.734334445680385e-06, "loss": 3.8365, "mean_token_accuracy": 0.3940274227283291, "step": 6827 }, { "epoch": 1.2658509454949944, "grad_norm": 6.421875, "learning_rate": 8.734149054505007e-06, "loss": 3.158, "mean_token_accuracy": 0.4304426377597109, "step": 6828 }, { "epoch": 1.2660363366703744, "grad_norm": 5.8984375, "learning_rate": 8.733963663329626e-06, "loss": 2.9354, "mean_token_accuracy": 0.4633601983880967, "step": 6829 }, { "epoch": 1.2662217278457546, "grad_norm": 9.2578125, "learning_rate": 8.733778272154246e-06, "loss": 2.5492, "mean_token_accuracy": 0.4778959149412423, "step": 6830 }, { "epoch": 1.2664071190211346, "grad_norm": 5.73046875, "learning_rate": 8.733592880978867e-06, "loss": 3.2573, "mean_token_accuracy": 0.4330474148201096, "step": 6831 }, { "epoch": 1.2665925101965145, "grad_norm": 6.609375, "learning_rate": 8.733407489803486e-06, "loss": 2.3335, "mean_token_accuracy": 0.532866023012829, "step": 6832 }, { "epoch": 1.2667779013718947, "grad_norm": 5.3515625, "learning_rate": 8.733222098628106e-06, "loss": 2.9168, "mean_token_accuracy": 0.44966666666666666, "step": 6833 }, { "epoch": 1.2669632925472747, "grad_norm": 5.80078125, "learning_rate": 8.733036707452725e-06, "loss": 2.9226, "mean_token_accuracy": 0.46327615366030445, "step": 6834 }, { "epoch": 1.2671486837226549, "grad_norm": 5.2109375, "learning_rate": 8.732851316277346e-06, "loss": 2.6512, "mean_token_accuracy": 0.4747636363636364, "step": 6835 }, { "epoch": 1.2673340748980348, "grad_norm": 5.609375, "learning_rate": 8.732665925101966e-06, "loss": 2.9665, "mean_token_accuracy": 0.4320665797653665, "step": 6836 }, { "epoch": 1.267519466073415, "grad_norm": 5.93359375, "learning_rate": 8.732480533926587e-06, "loss": 2.6722, "mean_token_accuracy": 0.473407977606718, "step": 6837 }, { "epoch": 1.267704857248795, "grad_norm": 5.26171875, "learning_rate": 8.732295142751205e-06, "loss": 3.0403, "mean_token_accuracy": 0.44613809577977476, "step": 6838 }, { "epoch": 1.267890248424175, "grad_norm": 6.58203125, "learning_rate": 8.732109751575826e-06, "loss": 2.8695, "mean_token_accuracy": 0.4500271591526344, "step": 6839 }, { "epoch": 1.2680756395995552, "grad_norm": 8.5703125, "learning_rate": 8.731924360400446e-06, "loss": 2.6207, "mean_token_accuracy": 0.48713550600343053, "step": 6840 }, { "epoch": 1.2682610307749351, "grad_norm": 6.52734375, "learning_rate": 8.731738969225065e-06, "loss": 3.0926, "mean_token_accuracy": 0.4469162995594714, "step": 6841 }, { "epoch": 1.268446421950315, "grad_norm": 5.73828125, "learning_rate": 8.731553578049686e-06, "loss": 2.5039, "mean_token_accuracy": 0.4686949371549395, "step": 6842 }, { "epoch": 1.2686318131256953, "grad_norm": 8.28125, "learning_rate": 8.731368186874304e-06, "loss": 2.8682, "mean_token_accuracy": 0.45308985046976313, "step": 6843 }, { "epoch": 1.2688172043010753, "grad_norm": 7.4453125, "learning_rate": 8.731182795698927e-06, "loss": 2.7532, "mean_token_accuracy": 0.4778502985696431, "step": 6844 }, { "epoch": 1.2690025954764552, "grad_norm": 6.453125, "learning_rate": 8.730997404523545e-06, "loss": 3.0764, "mean_token_accuracy": 0.429648970118944, "step": 6845 }, { "epoch": 1.2691879866518354, "grad_norm": 9.2578125, "learning_rate": 8.730812013348166e-06, "loss": 3.2888, "mean_token_accuracy": 0.39870093974571585, "step": 6846 }, { "epoch": 1.2693733778272154, "grad_norm": 7.83984375, "learning_rate": 8.730626622172785e-06, "loss": 2.9456, "mean_token_accuracy": 0.4395667870036101, "step": 6847 }, { "epoch": 1.2695587690025953, "grad_norm": 7.0546875, "learning_rate": 8.730441230997405e-06, "loss": 2.691, "mean_token_accuracy": 0.4701117318435754, "step": 6848 }, { "epoch": 1.2697441601779755, "grad_norm": 5.6796875, "learning_rate": 8.730255839822026e-06, "loss": 2.5011, "mean_token_accuracy": 0.4988110964332893, "step": 6849 }, { "epoch": 1.2699295513533555, "grad_norm": 6.69921875, "learning_rate": 8.730070448646644e-06, "loss": 2.808, "mean_token_accuracy": 0.4668904839083121, "step": 6850 }, { "epoch": 1.2701149425287357, "grad_norm": 8.7421875, "learning_rate": 8.729885057471265e-06, "loss": 2.4938, "mean_token_accuracy": 0.476169781254527, "step": 6851 }, { "epoch": 1.2703003337041157, "grad_norm": 6.96875, "learning_rate": 8.729699666295885e-06, "loss": 3.0007, "mean_token_accuracy": 0.4197422378441711, "step": 6852 }, { "epoch": 1.2704857248794958, "grad_norm": 13.546875, "learning_rate": 8.729514275120506e-06, "loss": 2.8055, "mean_token_accuracy": 0.44871794871794873, "step": 6853 }, { "epoch": 1.2706711160548758, "grad_norm": 11.6015625, "learning_rate": 8.729328883945125e-06, "loss": 2.9314, "mean_token_accuracy": 0.44271230786002397, "step": 6854 }, { "epoch": 1.2708565072302558, "grad_norm": 7.25, "learning_rate": 8.729143492769745e-06, "loss": 2.6627, "mean_token_accuracy": 0.4730885009030704, "step": 6855 }, { "epoch": 1.271041898405636, "grad_norm": 6.1328125, "learning_rate": 8.728958101594364e-06, "loss": 2.2544, "mean_token_accuracy": 0.5156273822228998, "step": 6856 }, { "epoch": 1.271227289581016, "grad_norm": 12.109375, "learning_rate": 8.728772710418985e-06, "loss": 2.7693, "mean_token_accuracy": 0.4431924882629108, "step": 6857 }, { "epoch": 1.271412680756396, "grad_norm": 11.6328125, "learning_rate": 8.728587319243605e-06, "loss": 2.7558, "mean_token_accuracy": 0.4644632540642214, "step": 6858 }, { "epoch": 1.271598071931776, "grad_norm": 10.8671875, "learning_rate": 8.728401928068224e-06, "loss": 3.2253, "mean_token_accuracy": 0.4214511041009464, "step": 6859 }, { "epoch": 1.271783463107156, "grad_norm": 4.98828125, "learning_rate": 8.728216536892844e-06, "loss": 2.8842, "mean_token_accuracy": 0.4524959742351047, "step": 6860 }, { "epoch": 1.271968854282536, "grad_norm": 10.578125, "learning_rate": 8.728031145717465e-06, "loss": 2.9777, "mean_token_accuracy": 0.43530411786928597, "step": 6861 }, { "epoch": 1.2721542454579162, "grad_norm": 14.5703125, "learning_rate": 8.727845754542085e-06, "loss": 3.0789, "mean_token_accuracy": 0.4350140056022409, "step": 6862 }, { "epoch": 1.2723396366332962, "grad_norm": 7.5078125, "learning_rate": 8.727660363366704e-06, "loss": 2.8479, "mean_token_accuracy": 0.4682926829268293, "step": 6863 }, { "epoch": 1.2725250278086764, "grad_norm": 6.16796875, "learning_rate": 8.727474972191325e-06, "loss": 3.0979, "mean_token_accuracy": 0.4301659988551803, "step": 6864 }, { "epoch": 1.2727104189840563, "grad_norm": 5.52734375, "learning_rate": 8.727289581015943e-06, "loss": 2.9038, "mean_token_accuracy": 0.44987991404373656, "step": 6865 }, { "epoch": 1.2728958101594365, "grad_norm": 6.0859375, "learning_rate": 8.727104189840564e-06, "loss": 2.5616, "mean_token_accuracy": 0.481492873987491, "step": 6866 }, { "epoch": 1.2730812013348165, "grad_norm": 6.4296875, "learning_rate": 8.726918798665184e-06, "loss": 2.8801, "mean_token_accuracy": 0.4825196850393701, "step": 6867 }, { "epoch": 1.2732665925101965, "grad_norm": 6.02734375, "learning_rate": 8.726733407489805e-06, "loss": 3.1763, "mean_token_accuracy": 0.4097202990861257, "step": 6868 }, { "epoch": 1.2734519836855767, "grad_norm": 8.109375, "learning_rate": 8.726548016314425e-06, "loss": 3.0444, "mean_token_accuracy": 0.45615763546798027, "step": 6869 }, { "epoch": 1.2736373748609566, "grad_norm": 5.71875, "learning_rate": 8.726362625139044e-06, "loss": 2.7271, "mean_token_accuracy": 0.4561996779388084, "step": 6870 }, { "epoch": 1.2738227660363366, "grad_norm": 6.86328125, "learning_rate": 8.726177233963665e-06, "loss": 2.4195, "mean_token_accuracy": 0.5001418842224744, "step": 6871 }, { "epoch": 1.2740081572117168, "grad_norm": 6.4609375, "learning_rate": 8.725991842788283e-06, "loss": 3.4894, "mean_token_accuracy": 0.42713973044580117, "step": 6872 }, { "epoch": 1.2741935483870968, "grad_norm": 7.44140625, "learning_rate": 8.725806451612904e-06, "loss": 2.9024, "mean_token_accuracy": 0.4456261234272019, "step": 6873 }, { "epoch": 1.2743789395624767, "grad_norm": 7.47265625, "learning_rate": 8.725621060437524e-06, "loss": 2.5909, "mean_token_accuracy": 0.4893787117405208, "step": 6874 }, { "epoch": 1.274564330737857, "grad_norm": 8.3984375, "learning_rate": 8.725435669262143e-06, "loss": 3.5365, "mean_token_accuracy": 0.41929269299573413, "step": 6875 }, { "epoch": 1.2747497219132369, "grad_norm": 11.578125, "learning_rate": 8.725250278086764e-06, "loss": 2.5456, "mean_token_accuracy": 0.4736973323360758, "step": 6876 }, { "epoch": 1.2749351130886168, "grad_norm": 7.00390625, "learning_rate": 8.725064886911384e-06, "loss": 3.2688, "mean_token_accuracy": 0.452212389380531, "step": 6877 }, { "epoch": 1.275120504263997, "grad_norm": 5.86328125, "learning_rate": 8.724879495736005e-06, "loss": 2.6558, "mean_token_accuracy": 0.4800524934383202, "step": 6878 }, { "epoch": 1.2753058954393772, "grad_norm": 9.7421875, "learning_rate": 8.724694104560623e-06, "loss": 2.6867, "mean_token_accuracy": 0.4744180407371484, "step": 6879 }, { "epoch": 1.2754912866147572, "grad_norm": 6.72265625, "learning_rate": 8.724508713385244e-06, "loss": 2.8819, "mean_token_accuracy": 0.44857142857142857, "step": 6880 }, { "epoch": 1.2756766777901372, "grad_norm": 12.9765625, "learning_rate": 8.724323322209863e-06, "loss": 2.2067, "mean_token_accuracy": 0.5220318960425281, "step": 6881 }, { "epoch": 1.2758620689655173, "grad_norm": 5.57421875, "learning_rate": 8.724137931034483e-06, "loss": 2.518, "mean_token_accuracy": 0.48923331755797445, "step": 6882 }, { "epoch": 1.2760474601408973, "grad_norm": 7.9453125, "learning_rate": 8.723952539859104e-06, "loss": 2.9759, "mean_token_accuracy": 0.442486281131279, "step": 6883 }, { "epoch": 1.2762328513162773, "grad_norm": 9.78125, "learning_rate": 8.723767148683724e-06, "loss": 2.7313, "mean_token_accuracy": 0.4579280531110082, "step": 6884 }, { "epoch": 1.2764182424916575, "grad_norm": 7.2890625, "learning_rate": 8.723581757508343e-06, "loss": 2.9476, "mean_token_accuracy": 0.4561038961038961, "step": 6885 }, { "epoch": 1.2766036336670374, "grad_norm": 8.4609375, "learning_rate": 8.723396366332964e-06, "loss": 3.3309, "mean_token_accuracy": 0.42711244893571276, "step": 6886 }, { "epoch": 1.2767890248424174, "grad_norm": 8.0, "learning_rate": 8.723210975157584e-06, "loss": 2.8451, "mean_token_accuracy": 0.4404369949117031, "step": 6887 }, { "epoch": 1.2769744160177976, "grad_norm": 6.98046875, "learning_rate": 8.723025583982203e-06, "loss": 2.536, "mean_token_accuracy": 0.4799697656840514, "step": 6888 }, { "epoch": 1.2771598071931776, "grad_norm": 6.6875, "learning_rate": 8.722840192806823e-06, "loss": 2.804, "mean_token_accuracy": 0.45852080989876265, "step": 6889 }, { "epoch": 1.2773451983685575, "grad_norm": 15.859375, "learning_rate": 8.722654801631442e-06, "loss": 2.7743, "mean_token_accuracy": 0.466089273817455, "step": 6890 }, { "epoch": 1.2775305895439377, "grad_norm": 6.33203125, "learning_rate": 8.722469410456063e-06, "loss": 3.0711, "mean_token_accuracy": 0.4412729260293939, "step": 6891 }, { "epoch": 1.2777159807193177, "grad_norm": 6.046875, "learning_rate": 8.722284019280683e-06, "loss": 2.3227, "mean_token_accuracy": 0.5013854930725347, "step": 6892 }, { "epoch": 1.2779013718946979, "grad_norm": 5.95703125, "learning_rate": 8.722098628105304e-06, "loss": 2.8942, "mean_token_accuracy": 0.4855994641661085, "step": 6893 }, { "epoch": 1.2780867630700778, "grad_norm": 6.640625, "learning_rate": 8.721913236929922e-06, "loss": 3.1615, "mean_token_accuracy": 0.4101966873706004, "step": 6894 }, { "epoch": 1.278272154245458, "grad_norm": 6.9453125, "learning_rate": 8.721727845754543e-06, "loss": 2.6892, "mean_token_accuracy": 0.48331322878970645, "step": 6895 }, { "epoch": 1.278457545420838, "grad_norm": 7.53515625, "learning_rate": 8.721542454579163e-06, "loss": 2.446, "mean_token_accuracy": 0.4849949135300102, "step": 6896 }, { "epoch": 1.278642936596218, "grad_norm": 9.2265625, "learning_rate": 8.721357063403782e-06, "loss": 3.4468, "mean_token_accuracy": 0.4271095717884131, "step": 6897 }, { "epoch": 1.2788283277715982, "grad_norm": 9.3359375, "learning_rate": 8.721171672228403e-06, "loss": 2.6155, "mean_token_accuracy": 0.4935453186574263, "step": 6898 }, { "epoch": 1.2790137189469781, "grad_norm": 6.90234375, "learning_rate": 8.720986281053021e-06, "loss": 4.2466, "mean_token_accuracy": 0.3687603058012292, "step": 6899 }, { "epoch": 1.279199110122358, "grad_norm": 5.375, "learning_rate": 8.720800889877644e-06, "loss": 2.4405, "mean_token_accuracy": 0.4988418871144703, "step": 6900 }, { "epoch": 1.2793845012977383, "grad_norm": 6.4375, "learning_rate": 8.720615498702262e-06, "loss": 2.568, "mean_token_accuracy": 0.47586206896551725, "step": 6901 }, { "epoch": 1.2795698924731183, "grad_norm": 6.03125, "learning_rate": 8.720430107526883e-06, "loss": 2.9857, "mean_token_accuracy": 0.4620599981424724, "step": 6902 }, { "epoch": 1.2797552836484982, "grad_norm": 6.359375, "learning_rate": 8.720244716351502e-06, "loss": 2.763, "mean_token_accuracy": 0.4634146341463415, "step": 6903 }, { "epoch": 1.2799406748238784, "grad_norm": 6.62109375, "learning_rate": 8.720059325176122e-06, "loss": 2.5361, "mean_token_accuracy": 0.5068493150684932, "step": 6904 }, { "epoch": 1.2801260659992584, "grad_norm": 7.1015625, "learning_rate": 8.719873934000743e-06, "loss": 2.2169, "mean_token_accuracy": 0.5367710676587645, "step": 6905 }, { "epoch": 1.2803114571746386, "grad_norm": 5.53125, "learning_rate": 8.719688542825361e-06, "loss": 2.7148, "mean_token_accuracy": 0.492800622648852, "step": 6906 }, { "epoch": 1.2804968483500185, "grad_norm": 8.7421875, "learning_rate": 8.719503151649982e-06, "loss": 3.4514, "mean_token_accuracy": 0.40084449621432733, "step": 6907 }, { "epoch": 1.2806822395253987, "grad_norm": 13.171875, "learning_rate": 8.719317760474602e-06, "loss": 2.9262, "mean_token_accuracy": 0.4486500794070937, "step": 6908 }, { "epoch": 1.2808676307007787, "grad_norm": 10.7265625, "learning_rate": 8.719132369299223e-06, "loss": 2.3029, "mean_token_accuracy": 0.5029123455036226, "step": 6909 }, { "epoch": 1.2810530218761587, "grad_norm": 6.8671875, "learning_rate": 8.718946978123842e-06, "loss": 3.1173, "mean_token_accuracy": 0.42355889724310775, "step": 6910 }, { "epoch": 1.2812384130515388, "grad_norm": 6.91796875, "learning_rate": 8.718761586948462e-06, "loss": 2.9953, "mean_token_accuracy": 0.4447870778267254, "step": 6911 }, { "epoch": 1.2814238042269188, "grad_norm": 6.58203125, "learning_rate": 8.718576195773083e-06, "loss": 2.6755, "mean_token_accuracy": 0.48339532412327313, "step": 6912 }, { "epoch": 1.2816091954022988, "grad_norm": 5.83203125, "learning_rate": 8.718390804597702e-06, "loss": 2.77, "mean_token_accuracy": 0.4582531742978069, "step": 6913 }, { "epoch": 1.281794586577679, "grad_norm": 7.9453125, "learning_rate": 8.718205413422322e-06, "loss": 3.0574, "mean_token_accuracy": 0.44884807475430966, "step": 6914 }, { "epoch": 1.281979977753059, "grad_norm": 7.17578125, "learning_rate": 8.71802002224694e-06, "loss": 2.502, "mean_token_accuracy": 0.4822074437055186, "step": 6915 }, { "epoch": 1.282165368928439, "grad_norm": 5.71484375, "learning_rate": 8.717834631071563e-06, "loss": 2.6671, "mean_token_accuracy": 0.4868812201111542, "step": 6916 }, { "epoch": 1.282350760103819, "grad_norm": 5.234375, "learning_rate": 8.717649239896182e-06, "loss": 3.0133, "mean_token_accuracy": 0.4406181552689219, "step": 6917 }, { "epoch": 1.282536151279199, "grad_norm": 6.51953125, "learning_rate": 8.717463848720802e-06, "loss": 2.417, "mean_token_accuracy": 0.5251918585251919, "step": 6918 }, { "epoch": 1.282721542454579, "grad_norm": 10.5390625, "learning_rate": 8.717278457545421e-06, "loss": 2.577, "mean_token_accuracy": 0.47660311958405543, "step": 6919 }, { "epoch": 1.2829069336299592, "grad_norm": 7.125, "learning_rate": 8.717093066370042e-06, "loss": 2.8554, "mean_token_accuracy": 0.45697509617331444, "step": 6920 }, { "epoch": 1.2830923248053392, "grad_norm": 6.9921875, "learning_rate": 8.716907675194662e-06, "loss": 3.263, "mean_token_accuracy": 0.422355854262469, "step": 6921 }, { "epoch": 1.2832777159807194, "grad_norm": 5.46484375, "learning_rate": 8.716722284019281e-06, "loss": 2.5434, "mean_token_accuracy": 0.504313205043132, "step": 6922 }, { "epoch": 1.2834631071560993, "grad_norm": 8.515625, "learning_rate": 8.716536892843901e-06, "loss": 2.7635, "mean_token_accuracy": 0.4735520094562648, "step": 6923 }, { "epoch": 1.2836484983314795, "grad_norm": 5.52734375, "learning_rate": 8.716351501668522e-06, "loss": 2.24, "mean_token_accuracy": 0.5216906123587669, "step": 6924 }, { "epoch": 1.2838338895068595, "grad_norm": 6.16796875, "learning_rate": 8.716166110493142e-06, "loss": 2.7268, "mean_token_accuracy": 0.49910946705028086, "step": 6925 }, { "epoch": 1.2840192806822395, "grad_norm": 9.640625, "learning_rate": 8.715980719317761e-06, "loss": 3.7165, "mean_token_accuracy": 0.42424242424242425, "step": 6926 }, { "epoch": 1.2842046718576197, "grad_norm": 6.1328125, "learning_rate": 8.715795328142382e-06, "loss": 2.6352, "mean_token_accuracy": 0.4812910938433951, "step": 6927 }, { "epoch": 1.2843900630329996, "grad_norm": 6.5078125, "learning_rate": 8.715609936967e-06, "loss": 3.0123, "mean_token_accuracy": 0.4396551724137931, "step": 6928 }, { "epoch": 1.2845754542083796, "grad_norm": 8.6015625, "learning_rate": 8.715424545791621e-06, "loss": 2.939, "mean_token_accuracy": 0.4322480248982523, "step": 6929 }, { "epoch": 1.2847608453837598, "grad_norm": 5.24609375, "learning_rate": 8.715239154616241e-06, "loss": 2.6858, "mean_token_accuracy": 0.4668402511870118, "step": 6930 }, { "epoch": 1.2849462365591398, "grad_norm": 5.35546875, "learning_rate": 8.71505376344086e-06, "loss": 2.7304, "mean_token_accuracy": 0.4777634777634778, "step": 6931 }, { "epoch": 1.2851316277345197, "grad_norm": 6.9375, "learning_rate": 8.71486837226548e-06, "loss": 2.3899, "mean_token_accuracy": 0.49371995658241585, "step": 6932 }, { "epoch": 1.2853170189099, "grad_norm": 6.9921875, "learning_rate": 8.714682981090101e-06, "loss": 3.1327, "mean_token_accuracy": 0.4503478052290717, "step": 6933 }, { "epoch": 1.2855024100852799, "grad_norm": 6.08984375, "learning_rate": 8.714497589914722e-06, "loss": 2.3297, "mean_token_accuracy": 0.5160578302615605, "step": 6934 }, { "epoch": 1.28568780126066, "grad_norm": 6.40234375, "learning_rate": 8.71431219873934e-06, "loss": 3.6942, "mean_token_accuracy": 0.3682753164556962, "step": 6935 }, { "epoch": 1.28587319243604, "grad_norm": 10.4296875, "learning_rate": 8.714126807563961e-06, "loss": 3.1291, "mean_token_accuracy": 0.4283894870904249, "step": 6936 }, { "epoch": 1.2860585836114202, "grad_norm": 9.359375, "learning_rate": 8.71394141638858e-06, "loss": 2.7199, "mean_token_accuracy": 0.44988576537200753, "step": 6937 }, { "epoch": 1.2862439747868002, "grad_norm": 6.8046875, "learning_rate": 8.7137560252132e-06, "loss": 2.6832, "mean_token_accuracy": 0.4782758620689655, "step": 6938 }, { "epoch": 1.2864293659621802, "grad_norm": 8.953125, "learning_rate": 8.71357063403782e-06, "loss": 3.1268, "mean_token_accuracy": 0.42401311640812206, "step": 6939 }, { "epoch": 1.2866147571375603, "grad_norm": 8.1484375, "learning_rate": 8.71338524286244e-06, "loss": 2.4169, "mean_token_accuracy": 0.49859550561797755, "step": 6940 }, { "epoch": 1.2868001483129403, "grad_norm": 8.59375, "learning_rate": 8.71319985168706e-06, "loss": 2.7931, "mean_token_accuracy": 0.47214381221215274, "step": 6941 }, { "epoch": 1.2869855394883203, "grad_norm": 7.67578125, "learning_rate": 8.71301446051168e-06, "loss": 2.2485, "mean_token_accuracy": 0.5122319956019791, "step": 6942 }, { "epoch": 1.2871709306637005, "grad_norm": 9.015625, "learning_rate": 8.712829069336301e-06, "loss": 2.8287, "mean_token_accuracy": 0.4582123600165906, "step": 6943 }, { "epoch": 1.2873563218390804, "grad_norm": 6.76953125, "learning_rate": 8.71264367816092e-06, "loss": 2.8152, "mean_token_accuracy": 0.4588443717634233, "step": 6944 }, { "epoch": 1.2875417130144604, "grad_norm": 7.3671875, "learning_rate": 8.71245828698554e-06, "loss": 3.3397, "mean_token_accuracy": 0.4276899924755455, "step": 6945 }, { "epoch": 1.2877271041898406, "grad_norm": 7.671875, "learning_rate": 8.712272895810159e-06, "loss": 3.1164, "mean_token_accuracy": 0.4264766911052704, "step": 6946 }, { "epoch": 1.2879124953652206, "grad_norm": 7.2890625, "learning_rate": 8.71208750463478e-06, "loss": 3.4786, "mean_token_accuracy": 0.40805653710247347, "step": 6947 }, { "epoch": 1.2880978865406005, "grad_norm": 7.8984375, "learning_rate": 8.7119021134594e-06, "loss": 2.4963, "mean_token_accuracy": 0.4802825947334618, "step": 6948 }, { "epoch": 1.2882832777159807, "grad_norm": 6.1328125, "learning_rate": 8.71171672228402e-06, "loss": 2.4055, "mean_token_accuracy": 0.5127755511022044, "step": 6949 }, { "epoch": 1.2884686688913607, "grad_norm": 9.046875, "learning_rate": 8.711531331108641e-06, "loss": 2.6307, "mean_token_accuracy": 0.4718600429113715, "step": 6950 }, { "epoch": 1.2886540600667409, "grad_norm": 6.32421875, "learning_rate": 8.71134593993326e-06, "loss": 2.7671, "mean_token_accuracy": 0.4769659011830202, "step": 6951 }, { "epoch": 1.2888394512421208, "grad_norm": 5.58984375, "learning_rate": 8.71116054875788e-06, "loss": 2.367, "mean_token_accuracy": 0.5385068993985139, "step": 6952 }, { "epoch": 1.289024842417501, "grad_norm": 9.75, "learning_rate": 8.710975157582499e-06, "loss": 2.8695, "mean_token_accuracy": 0.47386875939221906, "step": 6953 }, { "epoch": 1.289210233592881, "grad_norm": 8.9140625, "learning_rate": 8.71078976640712e-06, "loss": 3.0407, "mean_token_accuracy": 0.4362370133576892, "step": 6954 }, { "epoch": 1.289395624768261, "grad_norm": 6.890625, "learning_rate": 8.71060437523174e-06, "loss": 2.8899, "mean_token_accuracy": 0.4619191919191919, "step": 6955 }, { "epoch": 1.2895810159436412, "grad_norm": 5.93359375, "learning_rate": 8.710418984056359e-06, "loss": 3.0542, "mean_token_accuracy": 0.44437704719155646, "step": 6956 }, { "epoch": 1.2897664071190211, "grad_norm": 11.765625, "learning_rate": 8.71023359288098e-06, "loss": 2.2009, "mean_token_accuracy": 0.535036325287087, "step": 6957 }, { "epoch": 1.289951798294401, "grad_norm": 9.765625, "learning_rate": 8.7100482017056e-06, "loss": 2.2476, "mean_token_accuracy": 0.5053329864724245, "step": 6958 }, { "epoch": 1.2901371894697813, "grad_norm": 6.7109375, "learning_rate": 8.70986281053022e-06, "loss": 2.9455, "mean_token_accuracy": 0.45460358056265987, "step": 6959 }, { "epoch": 1.2903225806451613, "grad_norm": 8.5859375, "learning_rate": 8.70967741935484e-06, "loss": 2.9178, "mean_token_accuracy": 0.44893460690668624, "step": 6960 }, { "epoch": 1.2905079718205412, "grad_norm": 7.74609375, "learning_rate": 8.70949202817946e-06, "loss": 2.7003, "mean_token_accuracy": 0.47106662496090085, "step": 6961 }, { "epoch": 1.2906933629959214, "grad_norm": 8.1875, "learning_rate": 8.709306637004078e-06, "loss": 3.283, "mean_token_accuracy": 0.4214817938984957, "step": 6962 }, { "epoch": 1.2908787541713014, "grad_norm": 8.8515625, "learning_rate": 8.709121245828699e-06, "loss": 3.1283, "mean_token_accuracy": 0.4319349826700131, "step": 6963 }, { "epoch": 1.2910641453466816, "grad_norm": 8.328125, "learning_rate": 8.70893585465332e-06, "loss": 3.3758, "mean_token_accuracy": 0.4087332372825003, "step": 6964 }, { "epoch": 1.2912495365220615, "grad_norm": 6.8515625, "learning_rate": 8.70875046347794e-06, "loss": 3.3302, "mean_token_accuracy": 0.4220293325351691, "step": 6965 }, { "epoch": 1.2914349276974417, "grad_norm": 6.69140625, "learning_rate": 8.708565072302559e-06, "loss": 2.9249, "mean_token_accuracy": 0.4680795050677899, "step": 6966 }, { "epoch": 1.2916203188728217, "grad_norm": 10.6171875, "learning_rate": 8.70837968112718e-06, "loss": 2.445, "mean_token_accuracy": 0.4813256180957391, "step": 6967 }, { "epoch": 1.2918057100482017, "grad_norm": 10.46875, "learning_rate": 8.7081942899518e-06, "loss": 3.2887, "mean_token_accuracy": 0.41794963599474877, "step": 6968 }, { "epoch": 1.2919911012235819, "grad_norm": 7.28125, "learning_rate": 8.708008898776419e-06, "loss": 2.5879, "mean_token_accuracy": 0.49240034413535994, "step": 6969 }, { "epoch": 1.2921764923989618, "grad_norm": 7.48046875, "learning_rate": 8.707823507601039e-06, "loss": 2.8653, "mean_token_accuracy": 0.4529799341120096, "step": 6970 }, { "epoch": 1.2923618835743418, "grad_norm": 9.5859375, "learning_rate": 8.707638116425658e-06, "loss": 2.8389, "mean_token_accuracy": 0.4537901060974051, "step": 6971 }, { "epoch": 1.292547274749722, "grad_norm": 10.9609375, "learning_rate": 8.707452725250278e-06, "loss": 2.3883, "mean_token_accuracy": 0.49167410050550103, "step": 6972 }, { "epoch": 1.292732665925102, "grad_norm": 8.0703125, "learning_rate": 8.707267334074899e-06, "loss": 2.0286, "mean_token_accuracy": 0.5577903292464527, "step": 6973 }, { "epoch": 1.292918057100482, "grad_norm": 6.9609375, "learning_rate": 8.70708194289952e-06, "loss": 3.47, "mean_token_accuracy": 0.41597510373443985, "step": 6974 }, { "epoch": 1.293103448275862, "grad_norm": 7.328125, "learning_rate": 8.706896551724138e-06, "loss": 2.6679, "mean_token_accuracy": 0.5031482541499713, "step": 6975 }, { "epoch": 1.293288839451242, "grad_norm": 6.890625, "learning_rate": 8.706711160548759e-06, "loss": 3.0831, "mean_token_accuracy": 0.41405520736098145, "step": 6976 }, { "epoch": 1.293474230626622, "grad_norm": 6.265625, "learning_rate": 8.706525769373379e-06, "loss": 2.4937, "mean_token_accuracy": 0.48798001873243835, "step": 6977 }, { "epoch": 1.2936596218020022, "grad_norm": 5.83984375, "learning_rate": 8.706340378197998e-06, "loss": 3.2133, "mean_token_accuracy": 0.4313488576449912, "step": 6978 }, { "epoch": 1.2938450129773824, "grad_norm": 6.47265625, "learning_rate": 8.706154987022618e-06, "loss": 2.5218, "mean_token_accuracy": 0.5307808841757333, "step": 6979 }, { "epoch": 1.2940304041527624, "grad_norm": 4.875, "learning_rate": 8.705969595847237e-06, "loss": 2.9492, "mean_token_accuracy": 0.44651312957382694, "step": 6980 }, { "epoch": 1.2942157953281423, "grad_norm": 6.21875, "learning_rate": 8.70578420467186e-06, "loss": 2.6896, "mean_token_accuracy": 0.47109670448406266, "step": 6981 }, { "epoch": 1.2944011865035225, "grad_norm": 8.4375, "learning_rate": 8.705598813496478e-06, "loss": 3.2205, "mean_token_accuracy": 0.42215771649733913, "step": 6982 }, { "epoch": 1.2945865776789025, "grad_norm": 6.4765625, "learning_rate": 8.705413422321099e-06, "loss": 3.1936, "mean_token_accuracy": 0.44059925093632957, "step": 6983 }, { "epoch": 1.2947719688542825, "grad_norm": 5.5078125, "learning_rate": 8.705228031145717e-06, "loss": 2.5444, "mean_token_accuracy": 0.4750243495199666, "step": 6984 }, { "epoch": 1.2949573600296627, "grad_norm": 6.12109375, "learning_rate": 8.705042639970338e-06, "loss": 2.5734, "mean_token_accuracy": 0.49076002082248826, "step": 6985 }, { "epoch": 1.2951427512050426, "grad_norm": 6.28125, "learning_rate": 8.704857248794958e-06, "loss": 2.5851, "mean_token_accuracy": 0.4896254378873619, "step": 6986 }, { "epoch": 1.2953281423804226, "grad_norm": 5.19921875, "learning_rate": 8.704671857619577e-06, "loss": 2.3294, "mean_token_accuracy": 0.4987305041712006, "step": 6987 }, { "epoch": 1.2955135335558028, "grad_norm": 6.4140625, "learning_rate": 8.704486466444198e-06, "loss": 3.2613, "mean_token_accuracy": 0.4525993883792049, "step": 6988 }, { "epoch": 1.2956989247311828, "grad_norm": 7.1171875, "learning_rate": 8.704301075268818e-06, "loss": 2.6497, "mean_token_accuracy": 0.47506275237367673, "step": 6989 }, { "epoch": 1.2958843159065627, "grad_norm": 7.703125, "learning_rate": 8.704115684093439e-06, "loss": 3.1065, "mean_token_accuracy": 0.4464530892448513, "step": 6990 }, { "epoch": 1.296069707081943, "grad_norm": 6.3046875, "learning_rate": 8.703930292918057e-06, "loss": 2.5805, "mean_token_accuracy": 0.49241475295755044, "step": 6991 }, { "epoch": 1.2962550982573229, "grad_norm": 6.953125, "learning_rate": 8.703744901742678e-06, "loss": 2.3696, "mean_token_accuracy": 0.5400304028375982, "step": 6992 }, { "epoch": 1.296440489432703, "grad_norm": 7.390625, "learning_rate": 8.703559510567298e-06, "loss": 2.6299, "mean_token_accuracy": 0.4651370299553856, "step": 6993 }, { "epoch": 1.296625880608083, "grad_norm": 5.53515625, "learning_rate": 8.703374119391917e-06, "loss": 3.21, "mean_token_accuracy": 0.4156959813628422, "step": 6994 }, { "epoch": 1.2968112717834632, "grad_norm": 6.05859375, "learning_rate": 8.703188728216538e-06, "loss": 3.4343, "mean_token_accuracy": 0.40844493030882756, "step": 6995 }, { "epoch": 1.2969966629588432, "grad_norm": 5.9921875, "learning_rate": 8.703003337041157e-06, "loss": 3.6598, "mean_token_accuracy": 0.4075333837238599, "step": 6996 }, { "epoch": 1.2971820541342232, "grad_norm": 9.2109375, "learning_rate": 8.702817945865779e-06, "loss": 2.9394, "mean_token_accuracy": 0.4366993217784476, "step": 6997 }, { "epoch": 1.2973674453096034, "grad_norm": 8.6171875, "learning_rate": 8.702632554690398e-06, "loss": 2.2277, "mean_token_accuracy": 0.52773737793476, "step": 6998 }, { "epoch": 1.2975528364849833, "grad_norm": 5.32421875, "learning_rate": 8.702447163515018e-06, "loss": 2.4328, "mean_token_accuracy": 0.5355625748829104, "step": 6999 }, { "epoch": 1.2977382276603633, "grad_norm": 8.5859375, "learning_rate": 8.702261772339637e-06, "loss": 2.6627, "mean_token_accuracy": 0.4696329254727475, "step": 7000 }, { "epoch": 1.2979236188357435, "grad_norm": 6.0625, "learning_rate": 8.702076381164257e-06, "loss": 2.8589, "mean_token_accuracy": 0.46010834769761144, "step": 7001 }, { "epoch": 1.2981090100111234, "grad_norm": 5.4765625, "learning_rate": 8.701890989988878e-06, "loss": 2.743, "mean_token_accuracy": 0.4884253370643602, "step": 7002 }, { "epoch": 1.2982944011865034, "grad_norm": 8.546875, "learning_rate": 8.701705598813497e-06, "loss": 3.1546, "mean_token_accuracy": 0.42584030988379357, "step": 7003 }, { "epoch": 1.2984797923618836, "grad_norm": 6.75390625, "learning_rate": 8.701520207638117e-06, "loss": 2.8375, "mean_token_accuracy": 0.47386231038506416, "step": 7004 }, { "epoch": 1.2986651835372636, "grad_norm": 7.296875, "learning_rate": 8.701334816462738e-06, "loss": 2.3171, "mean_token_accuracy": 0.5175499930079709, "step": 7005 }, { "epoch": 1.2988505747126438, "grad_norm": 5.99609375, "learning_rate": 8.701149425287358e-06, "loss": 3.3306, "mean_token_accuracy": 0.4201019664967225, "step": 7006 }, { "epoch": 1.2990359658880237, "grad_norm": 6.08984375, "learning_rate": 8.700964034111977e-06, "loss": 3.4746, "mean_token_accuracy": 0.41622090501662573, "step": 7007 }, { "epoch": 1.299221357063404, "grad_norm": 7.03515625, "learning_rate": 8.700778642936597e-06, "loss": 3.1027, "mean_token_accuracy": 0.43416370106761565, "step": 7008 }, { "epoch": 1.2994067482387839, "grad_norm": 7.98046875, "learning_rate": 8.700593251761216e-06, "loss": 2.6505, "mean_token_accuracy": 0.4667373844521897, "step": 7009 }, { "epoch": 1.2995921394141638, "grad_norm": 5.77734375, "learning_rate": 8.700407860585837e-06, "loss": 2.618, "mean_token_accuracy": 0.48220387243735763, "step": 7010 }, { "epoch": 1.299777530589544, "grad_norm": 6.2421875, "learning_rate": 8.700222469410457e-06, "loss": 2.9512, "mean_token_accuracy": 0.46461787302871005, "step": 7011 }, { "epoch": 1.299962921764924, "grad_norm": 9.296875, "learning_rate": 8.700037078235076e-06, "loss": 3.1871, "mean_token_accuracy": 0.44493545761135017, "step": 7012 }, { "epoch": 1.300148312940304, "grad_norm": 6.546875, "learning_rate": 8.699851687059696e-06, "loss": 2.4898, "mean_token_accuracy": 0.4873815092945956, "step": 7013 }, { "epoch": 1.3003337041156842, "grad_norm": 5.76171875, "learning_rate": 8.699666295884317e-06, "loss": 2.6573, "mean_token_accuracy": 0.4707173227266493, "step": 7014 }, { "epoch": 1.3005190952910641, "grad_norm": 8.390625, "learning_rate": 8.699480904708937e-06, "loss": 2.7062, "mean_token_accuracy": 0.4790137138107771, "step": 7015 }, { "epoch": 1.300704486466444, "grad_norm": 9.3671875, "learning_rate": 8.699295513533556e-06, "loss": 2.957, "mean_token_accuracy": 0.4494427286155499, "step": 7016 }, { "epoch": 1.3008898776418243, "grad_norm": 8.7109375, "learning_rate": 8.699110122358177e-06, "loss": 2.9677, "mean_token_accuracy": 0.4471116816431322, "step": 7017 }, { "epoch": 1.3010752688172043, "grad_norm": 5.9921875, "learning_rate": 8.698924731182796e-06, "loss": 3.0336, "mean_token_accuracy": 0.4657353426465735, "step": 7018 }, { "epoch": 1.3012606599925842, "grad_norm": 6.91796875, "learning_rate": 8.698739340007416e-06, "loss": 2.6471, "mean_token_accuracy": 0.46966785616250345, "step": 7019 }, { "epoch": 1.3014460511679644, "grad_norm": 7.12109375, "learning_rate": 8.698553948832036e-06, "loss": 2.9234, "mean_token_accuracy": 0.46406968303895474, "step": 7020 }, { "epoch": 1.3016314423433444, "grad_norm": 10.453125, "learning_rate": 8.698368557656657e-06, "loss": 2.5199, "mean_token_accuracy": 0.48504273504273504, "step": 7021 }, { "epoch": 1.3018168335187246, "grad_norm": 6.11328125, "learning_rate": 8.698183166481276e-06, "loss": 2.7721, "mean_token_accuracy": 0.45688172043010755, "step": 7022 }, { "epoch": 1.3020022246941045, "grad_norm": 5.98828125, "learning_rate": 8.697997775305896e-06, "loss": 2.9321, "mean_token_accuracy": 0.45676511355155136, "step": 7023 }, { "epoch": 1.3021876158694847, "grad_norm": 7.24609375, "learning_rate": 8.697812384130517e-06, "loss": 3.0765, "mean_token_accuracy": 0.4544716766758643, "step": 7024 }, { "epoch": 1.3023730070448647, "grad_norm": 6.3203125, "learning_rate": 8.697626992955136e-06, "loss": 2.8539, "mean_token_accuracy": 0.46716216216216216, "step": 7025 }, { "epoch": 1.3025583982202447, "grad_norm": 6.3125, "learning_rate": 8.697441601779756e-06, "loss": 2.5717, "mean_token_accuracy": 0.4861842950777535, "step": 7026 }, { "epoch": 1.3027437893956249, "grad_norm": 6.28125, "learning_rate": 8.697256210604375e-06, "loss": 2.6859, "mean_token_accuracy": 0.4808729139922978, "step": 7027 }, { "epoch": 1.3029291805710048, "grad_norm": 6.32421875, "learning_rate": 8.697070819428995e-06, "loss": 3.0713, "mean_token_accuracy": 0.43382937634897195, "step": 7028 }, { "epoch": 1.3031145717463848, "grad_norm": 6.55859375, "learning_rate": 8.696885428253616e-06, "loss": 2.6334, "mean_token_accuracy": 0.4744678233050326, "step": 7029 }, { "epoch": 1.303299962921765, "grad_norm": 6.1015625, "learning_rate": 8.696700037078236e-06, "loss": 2.6485, "mean_token_accuracy": 0.48314606741573035, "step": 7030 }, { "epoch": 1.303485354097145, "grad_norm": 7.05859375, "learning_rate": 8.696514645902857e-06, "loss": 2.8046, "mean_token_accuracy": 0.4669852848698277, "step": 7031 }, { "epoch": 1.303670745272525, "grad_norm": 6.07421875, "learning_rate": 8.696329254727476e-06, "loss": 3.2299, "mean_token_accuracy": 0.42602970019613334, "step": 7032 }, { "epoch": 1.303856136447905, "grad_norm": 6.76171875, "learning_rate": 8.696143863552096e-06, "loss": 1.9862, "mean_token_accuracy": 0.5550348152115694, "step": 7033 }, { "epoch": 1.304041527623285, "grad_norm": 7.375, "learning_rate": 8.695958472376715e-06, "loss": 3.0631, "mean_token_accuracy": 0.4619227857683573, "step": 7034 }, { "epoch": 1.3042269187986653, "grad_norm": 10.578125, "learning_rate": 8.695773081201335e-06, "loss": 2.9494, "mean_token_accuracy": 0.4373391139412004, "step": 7035 }, { "epoch": 1.3044123099740452, "grad_norm": 6.0234375, "learning_rate": 8.695587690025956e-06, "loss": 2.9487, "mean_token_accuracy": 0.436052854891502, "step": 7036 }, { "epoch": 1.3045977011494254, "grad_norm": 6.8125, "learning_rate": 8.695402298850576e-06, "loss": 3.0095, "mean_token_accuracy": 0.4550379198266522, "step": 7037 }, { "epoch": 1.3047830923248054, "grad_norm": 14.0625, "learning_rate": 8.695216907675195e-06, "loss": 2.7531, "mean_token_accuracy": 0.47096456692913385, "step": 7038 }, { "epoch": 1.3049684835001854, "grad_norm": 9.46875, "learning_rate": 8.695031516499816e-06, "loss": 2.9083, "mean_token_accuracy": 0.4520629266844761, "step": 7039 }, { "epoch": 1.3051538746755655, "grad_norm": 6.63671875, "learning_rate": 8.694846125324436e-06, "loss": 2.4455, "mean_token_accuracy": 0.5008966599417171, "step": 7040 }, { "epoch": 1.3053392658509455, "grad_norm": 6.7421875, "learning_rate": 8.694660734149055e-06, "loss": 2.8823, "mean_token_accuracy": 0.4670502659863207, "step": 7041 }, { "epoch": 1.3055246570263255, "grad_norm": 5.37890625, "learning_rate": 8.694475342973675e-06, "loss": 3.0578, "mean_token_accuracy": 0.4740466101694915, "step": 7042 }, { "epoch": 1.3057100482017057, "grad_norm": 5.84375, "learning_rate": 8.694289951798294e-06, "loss": 2.9404, "mean_token_accuracy": 0.44580670405634537, "step": 7043 }, { "epoch": 1.3058954393770856, "grad_norm": 6.4609375, "learning_rate": 8.694104560622915e-06, "loss": 2.5324, "mean_token_accuracy": 0.5004758883248731, "step": 7044 }, { "epoch": 1.3060808305524656, "grad_norm": 5.73046875, "learning_rate": 8.693919169447535e-06, "loss": 2.6681, "mean_token_accuracy": 0.47325981264081585, "step": 7045 }, { "epoch": 1.3062662217278458, "grad_norm": 7.6484375, "learning_rate": 8.693733778272156e-06, "loss": 2.8457, "mean_token_accuracy": 0.4764309764309764, "step": 7046 }, { "epoch": 1.3064516129032258, "grad_norm": 9.125, "learning_rate": 8.693548387096775e-06, "loss": 2.8976, "mean_token_accuracy": 0.45290970983952017, "step": 7047 }, { "epoch": 1.3066370040786057, "grad_norm": 6.6015625, "learning_rate": 8.693362995921395e-06, "loss": 2.3853, "mean_token_accuracy": 0.49202361512372816, "step": 7048 }, { "epoch": 1.306822395253986, "grad_norm": 5.3515625, "learning_rate": 8.693177604746015e-06, "loss": 2.3149, "mean_token_accuracy": 0.545260663507109, "step": 7049 }, { "epoch": 1.3070077864293659, "grad_norm": 7.44921875, "learning_rate": 8.692992213570634e-06, "loss": 3.0918, "mean_token_accuracy": 0.44785794813979707, "step": 7050 }, { "epoch": 1.307193177604746, "grad_norm": 6.96484375, "learning_rate": 8.692806822395255e-06, "loss": 2.6991, "mean_token_accuracy": 0.45714285714285713, "step": 7051 }, { "epoch": 1.307378568780126, "grad_norm": 6.67578125, "learning_rate": 8.692621431219874e-06, "loss": 2.948, "mean_token_accuracy": 0.46090239079633294, "step": 7052 }, { "epoch": 1.3075639599555062, "grad_norm": 6.36328125, "learning_rate": 8.692436040044494e-06, "loss": 3.2028, "mean_token_accuracy": 0.43376623376623374, "step": 7053 }, { "epoch": 1.3077493511308862, "grad_norm": 5.96484375, "learning_rate": 8.692250648869115e-06, "loss": 2.636, "mean_token_accuracy": 0.4822245168903968, "step": 7054 }, { "epoch": 1.3079347423062662, "grad_norm": 6.953125, "learning_rate": 8.692065257693735e-06, "loss": 2.4221, "mean_token_accuracy": 0.49278438030560273, "step": 7055 }, { "epoch": 1.3081201334816464, "grad_norm": 5.7890625, "learning_rate": 8.691879866518354e-06, "loss": 2.7917, "mean_token_accuracy": 0.4626759730273276, "step": 7056 }, { "epoch": 1.3083055246570263, "grad_norm": 7.3203125, "learning_rate": 8.691694475342974e-06, "loss": 2.5592, "mean_token_accuracy": 0.4687456494500905, "step": 7057 }, { "epoch": 1.3084909158324063, "grad_norm": 5.83203125, "learning_rate": 8.691509084167595e-06, "loss": 2.4699, "mean_token_accuracy": 0.5067580997813556, "step": 7058 }, { "epoch": 1.3086763070077865, "grad_norm": 6.07421875, "learning_rate": 8.691323692992214e-06, "loss": 2.4696, "mean_token_accuracy": 0.5194647201946472, "step": 7059 }, { "epoch": 1.3088616981831664, "grad_norm": 8.515625, "learning_rate": 8.691138301816834e-06, "loss": 2.8756, "mean_token_accuracy": 0.4609012175222794, "step": 7060 }, { "epoch": 1.3090470893585464, "grad_norm": 10.203125, "learning_rate": 8.690952910641453e-06, "loss": 2.7576, "mean_token_accuracy": 0.4730829831932773, "step": 7061 }, { "epoch": 1.3092324805339266, "grad_norm": 6.390625, "learning_rate": 8.690767519466075e-06, "loss": 2.8571, "mean_token_accuracy": 0.45482560582714665, "step": 7062 }, { "epoch": 1.3094178717093066, "grad_norm": 6.96875, "learning_rate": 8.690582128290694e-06, "loss": 2.8342, "mean_token_accuracy": 0.4501955671447197, "step": 7063 }, { "epoch": 1.3096032628846868, "grad_norm": 9.1796875, "learning_rate": 8.690396737115314e-06, "loss": 2.6845, "mean_token_accuracy": 0.4937195590874135, "step": 7064 }, { "epoch": 1.3097886540600667, "grad_norm": 7.8515625, "learning_rate": 8.690211345939933e-06, "loss": 2.8854, "mean_token_accuracy": 0.4691790826760152, "step": 7065 }, { "epoch": 1.309974045235447, "grad_norm": 6.90625, "learning_rate": 8.690025954764554e-06, "loss": 2.1812, "mean_token_accuracy": 0.527256009457507, "step": 7066 }, { "epoch": 1.3101594364108269, "grad_norm": 8.4296875, "learning_rate": 8.689840563589174e-06, "loss": 2.7556, "mean_token_accuracy": 0.4637736938323872, "step": 7067 }, { "epoch": 1.3103448275862069, "grad_norm": 8.53125, "learning_rate": 8.689655172413793e-06, "loss": 2.6931, "mean_token_accuracy": 0.4608634111818825, "step": 7068 }, { "epoch": 1.310530218761587, "grad_norm": 7.7421875, "learning_rate": 8.689469781238413e-06, "loss": 2.6171, "mean_token_accuracy": 0.4862726291811533, "step": 7069 }, { "epoch": 1.310715609936967, "grad_norm": 5.98828125, "learning_rate": 8.689284390063034e-06, "loss": 3.0025, "mean_token_accuracy": 0.4476728174056734, "step": 7070 }, { "epoch": 1.310901001112347, "grad_norm": 8.7421875, "learning_rate": 8.689098998887654e-06, "loss": 2.9269, "mean_token_accuracy": 0.4602696688048174, "step": 7071 }, { "epoch": 1.3110863922877272, "grad_norm": 9.1953125, "learning_rate": 8.688913607712273e-06, "loss": 3.3894, "mean_token_accuracy": 0.4304994954591322, "step": 7072 }, { "epoch": 1.3112717834631071, "grad_norm": 8.5859375, "learning_rate": 8.688728216536894e-06, "loss": 2.5144, "mean_token_accuracy": 0.49756750182437365, "step": 7073 }, { "epoch": 1.311457174638487, "grad_norm": 9.2109375, "learning_rate": 8.688542825361514e-06, "loss": 2.8469, "mean_token_accuracy": 0.46040970507046347, "step": 7074 }, { "epoch": 1.3116425658138673, "grad_norm": 9.03125, "learning_rate": 8.688357434186133e-06, "loss": 2.8453, "mean_token_accuracy": 0.49150704734369355, "step": 7075 }, { "epoch": 1.3118279569892473, "grad_norm": 6.7265625, "learning_rate": 8.688172043010754e-06, "loss": 2.8478, "mean_token_accuracy": 0.44589444508603765, "step": 7076 }, { "epoch": 1.3120133481646274, "grad_norm": 8.6875, "learning_rate": 8.687986651835372e-06, "loss": 2.9035, "mean_token_accuracy": 0.4370629370629371, "step": 7077 }, { "epoch": 1.3121987393400074, "grad_norm": 10.453125, "learning_rate": 8.687801260659994e-06, "loss": 3.0436, "mean_token_accuracy": 0.4145097539206936, "step": 7078 }, { "epoch": 1.3123841305153876, "grad_norm": 7.1015625, "learning_rate": 8.687615869484613e-06, "loss": 2.5394, "mean_token_accuracy": 0.4783068783068783, "step": 7079 }, { "epoch": 1.3125695216907676, "grad_norm": 5.9296875, "learning_rate": 8.687430478309234e-06, "loss": 3.0164, "mean_token_accuracy": 0.44342993251290197, "step": 7080 }, { "epoch": 1.3127549128661475, "grad_norm": 7.6875, "learning_rate": 8.687245087133853e-06, "loss": 2.9231, "mean_token_accuracy": 0.4477571115973742, "step": 7081 }, { "epoch": 1.3129403040415277, "grad_norm": 6.08203125, "learning_rate": 8.687059695958473e-06, "loss": 3.1596, "mean_token_accuracy": 0.4309063893016345, "step": 7082 }, { "epoch": 1.3131256952169077, "grad_norm": 5.80859375, "learning_rate": 8.686874304783094e-06, "loss": 2.92, "mean_token_accuracy": 0.4457390597480016, "step": 7083 }, { "epoch": 1.3133110863922877, "grad_norm": 6.046875, "learning_rate": 8.686688913607712e-06, "loss": 3.0056, "mean_token_accuracy": 0.4255860683188212, "step": 7084 }, { "epoch": 1.3134964775676679, "grad_norm": 6.52734375, "learning_rate": 8.686503522432333e-06, "loss": 3.5776, "mean_token_accuracy": 0.43008314436885864, "step": 7085 }, { "epoch": 1.3136818687430478, "grad_norm": 6.6796875, "learning_rate": 8.686318131256953e-06, "loss": 2.9636, "mean_token_accuracy": 0.45851431117050895, "step": 7086 }, { "epoch": 1.3138672599184278, "grad_norm": 6.14453125, "learning_rate": 8.686132740081574e-06, "loss": 3.172, "mean_token_accuracy": 0.4369565217391304, "step": 7087 }, { "epoch": 1.314052651093808, "grad_norm": 7.0859375, "learning_rate": 8.685947348906193e-06, "loss": 2.3996, "mean_token_accuracy": 0.5025900493916395, "step": 7088 }, { "epoch": 1.314238042269188, "grad_norm": 5.80859375, "learning_rate": 8.685761957730813e-06, "loss": 3.1272, "mean_token_accuracy": 0.43207100591715975, "step": 7089 }, { "epoch": 1.314423433444568, "grad_norm": 7.5625, "learning_rate": 8.685576566555432e-06, "loss": 2.6418, "mean_token_accuracy": 0.45469155643466436, "step": 7090 }, { "epoch": 1.314608824619948, "grad_norm": 7.90234375, "learning_rate": 8.685391175380052e-06, "loss": 2.761, "mean_token_accuracy": 0.4907859078590786, "step": 7091 }, { "epoch": 1.314794215795328, "grad_norm": 5.2265625, "learning_rate": 8.685205784204673e-06, "loss": 2.3342, "mean_token_accuracy": 0.5463866584311303, "step": 7092 }, { "epoch": 1.3149796069707083, "grad_norm": 6.453125, "learning_rate": 8.685020393029292e-06, "loss": 2.6479, "mean_token_accuracy": 0.4678931333489571, "step": 7093 }, { "epoch": 1.3151649981460882, "grad_norm": 7.8828125, "learning_rate": 8.684835001853912e-06, "loss": 2.7444, "mean_token_accuracy": 0.48464007336084364, "step": 7094 }, { "epoch": 1.3153503893214684, "grad_norm": 7.83984375, "learning_rate": 8.684649610678533e-06, "loss": 3.172, "mean_token_accuracy": 0.4223687484448868, "step": 7095 }, { "epoch": 1.3155357804968484, "grad_norm": 6.10546875, "learning_rate": 8.684464219503153e-06, "loss": 3.267, "mean_token_accuracy": 0.4257607926397735, "step": 7096 }, { "epoch": 1.3157211716722284, "grad_norm": 8.6015625, "learning_rate": 8.684278828327772e-06, "loss": 3.1144, "mean_token_accuracy": 0.4262607040913416, "step": 7097 }, { "epoch": 1.3159065628476085, "grad_norm": 8.1484375, "learning_rate": 8.684093437152392e-06, "loss": 3.1164, "mean_token_accuracy": 0.3965156794425087, "step": 7098 }, { "epoch": 1.3160919540229885, "grad_norm": 7.15234375, "learning_rate": 8.683908045977011e-06, "loss": 2.461, "mean_token_accuracy": 0.5323812686815012, "step": 7099 }, { "epoch": 1.3162773451983685, "grad_norm": 6.2734375, "learning_rate": 8.683722654801632e-06, "loss": 2.5135, "mean_token_accuracy": 0.4849141265666099, "step": 7100 }, { "epoch": 1.3164627363737487, "grad_norm": 8.6328125, "learning_rate": 8.683537263626252e-06, "loss": 2.3052, "mean_token_accuracy": 0.5043419267299865, "step": 7101 }, { "epoch": 1.3166481275491286, "grad_norm": 5.0, "learning_rate": 8.683351872450873e-06, "loss": 2.7761, "mean_token_accuracy": 0.46165560752746654, "step": 7102 }, { "epoch": 1.3168335187245086, "grad_norm": 5.16015625, "learning_rate": 8.683166481275492e-06, "loss": 2.8358, "mean_token_accuracy": 0.4650896604349485, "step": 7103 }, { "epoch": 1.3170189098998888, "grad_norm": 5.0859375, "learning_rate": 8.682981090100112e-06, "loss": 2.0835, "mean_token_accuracy": 0.5486594891281402, "step": 7104 }, { "epoch": 1.3172043010752688, "grad_norm": 5.83984375, "learning_rate": 8.682795698924733e-06, "loss": 2.8427, "mean_token_accuracy": 0.44196540486337427, "step": 7105 }, { "epoch": 1.317389692250649, "grad_norm": 5.7421875, "learning_rate": 8.682610307749351e-06, "loss": 2.4071, "mean_token_accuracy": 0.5011117287381879, "step": 7106 }, { "epoch": 1.317575083426029, "grad_norm": 7.87890625, "learning_rate": 8.682424916573972e-06, "loss": 2.7258, "mean_token_accuracy": 0.49034194727225267, "step": 7107 }, { "epoch": 1.317760474601409, "grad_norm": 5.703125, "learning_rate": 8.68223952539859e-06, "loss": 2.9129, "mean_token_accuracy": 0.44031040714361647, "step": 7108 }, { "epoch": 1.317945865776789, "grad_norm": 5.203125, "learning_rate": 8.682054134223211e-06, "loss": 2.7827, "mean_token_accuracy": 0.47100110823790176, "step": 7109 }, { "epoch": 1.318131256952169, "grad_norm": 6.5078125, "learning_rate": 8.681868743047832e-06, "loss": 2.7137, "mean_token_accuracy": 0.4750436735712503, "step": 7110 }, { "epoch": 1.3183166481275492, "grad_norm": 5.38671875, "learning_rate": 8.681683351872452e-06, "loss": 2.8891, "mean_token_accuracy": 0.4566747728145399, "step": 7111 }, { "epoch": 1.3185020393029292, "grad_norm": 6.7734375, "learning_rate": 8.681497960697073e-06, "loss": 3.0353, "mean_token_accuracy": 0.44500917690244246, "step": 7112 }, { "epoch": 1.3186874304783092, "grad_norm": 6.31640625, "learning_rate": 8.681312569521691e-06, "loss": 2.8088, "mean_token_accuracy": 0.4563767000256608, "step": 7113 }, { "epoch": 1.3188728216536894, "grad_norm": 5.7109375, "learning_rate": 8.681127178346312e-06, "loss": 3.1201, "mean_token_accuracy": 0.4438229972034874, "step": 7114 }, { "epoch": 1.3190582128290693, "grad_norm": 6.02734375, "learning_rate": 8.68094178717093e-06, "loss": 2.4667, "mean_token_accuracy": 0.49524475524475525, "step": 7115 }, { "epoch": 1.3192436040044493, "grad_norm": 6.71484375, "learning_rate": 8.680756395995551e-06, "loss": 3.5577, "mean_token_accuracy": 0.4018622886547415, "step": 7116 }, { "epoch": 1.3194289951798295, "grad_norm": 6.484375, "learning_rate": 8.68057100482017e-06, "loss": 3.3033, "mean_token_accuracy": 0.4118141097424412, "step": 7117 }, { "epoch": 1.3196143863552094, "grad_norm": 7.04296875, "learning_rate": 8.680385613644792e-06, "loss": 2.4446, "mean_token_accuracy": 0.4937671667018804, "step": 7118 }, { "epoch": 1.3197997775305894, "grad_norm": 6.00390625, "learning_rate": 8.680200222469411e-06, "loss": 3.0815, "mean_token_accuracy": 0.42424242424242425, "step": 7119 }, { "epoch": 1.3199851687059696, "grad_norm": 6.30078125, "learning_rate": 8.680014831294031e-06, "loss": 3.0184, "mean_token_accuracy": 0.434826883910387, "step": 7120 }, { "epoch": 1.3201705598813496, "grad_norm": 9.3125, "learning_rate": 8.679829440118652e-06, "loss": 2.3845, "mean_token_accuracy": 0.4953023367863166, "step": 7121 }, { "epoch": 1.3203559510567298, "grad_norm": 6.83203125, "learning_rate": 8.67964404894327e-06, "loss": 2.9299, "mean_token_accuracy": 0.45575332163449483, "step": 7122 }, { "epoch": 1.3205413422321097, "grad_norm": 6.53125, "learning_rate": 8.679458657767891e-06, "loss": 3.7051, "mean_token_accuracy": 0.3856749311294766, "step": 7123 }, { "epoch": 1.32072673340749, "grad_norm": 7.6328125, "learning_rate": 8.67927326659251e-06, "loss": 3.2865, "mean_token_accuracy": 0.4438340320948388, "step": 7124 }, { "epoch": 1.3209121245828699, "grad_norm": 6.9765625, "learning_rate": 8.67908787541713e-06, "loss": 2.5585, "mean_token_accuracy": 0.52065999153857, "step": 7125 }, { "epoch": 1.3210975157582499, "grad_norm": 5.98046875, "learning_rate": 8.678902484241751e-06, "loss": 3.0205, "mean_token_accuracy": 0.4617616033755274, "step": 7126 }, { "epoch": 1.32128290693363, "grad_norm": 7.30859375, "learning_rate": 8.678717093066371e-06, "loss": 2.9563, "mean_token_accuracy": 0.4740767913915383, "step": 7127 }, { "epoch": 1.32146829810901, "grad_norm": 7.21875, "learning_rate": 8.67853170189099e-06, "loss": 3.2205, "mean_token_accuracy": 0.4268053148469093, "step": 7128 }, { "epoch": 1.32165368928439, "grad_norm": 7.7890625, "learning_rate": 8.67834631071561e-06, "loss": 3.1278, "mean_token_accuracy": 0.43245812047047644, "step": 7129 }, { "epoch": 1.3218390804597702, "grad_norm": 5.71875, "learning_rate": 8.678160919540231e-06, "loss": 2.4728, "mean_token_accuracy": 0.5037855281122006, "step": 7130 }, { "epoch": 1.3220244716351501, "grad_norm": 6.48828125, "learning_rate": 8.67797552836485e-06, "loss": 2.9364, "mean_token_accuracy": 0.44659546061415223, "step": 7131 }, { "epoch": 1.32220986281053, "grad_norm": 6.9765625, "learning_rate": 8.67779013718947e-06, "loss": 3.1549, "mean_token_accuracy": 0.43215434083601284, "step": 7132 }, { "epoch": 1.3223952539859103, "grad_norm": 5.73046875, "learning_rate": 8.67760474601409e-06, "loss": 3.2268, "mean_token_accuracy": 0.4274521621214404, "step": 7133 }, { "epoch": 1.3225806451612903, "grad_norm": 5.53515625, "learning_rate": 8.677419354838712e-06, "loss": 2.9665, "mean_token_accuracy": 0.4478368756068616, "step": 7134 }, { "epoch": 1.3227660363366704, "grad_norm": 5.921875, "learning_rate": 8.67723396366333e-06, "loss": 2.5995, "mean_token_accuracy": 0.4923810989687548, "step": 7135 }, { "epoch": 1.3229514275120504, "grad_norm": 6.40625, "learning_rate": 8.67704857248795e-06, "loss": 2.2696, "mean_token_accuracy": 0.5179230363732209, "step": 7136 }, { "epoch": 1.3231368186874306, "grad_norm": 8.296875, "learning_rate": 8.67686318131257e-06, "loss": 2.1416, "mean_token_accuracy": 0.5397967161845192, "step": 7137 }, { "epoch": 1.3233222098628106, "grad_norm": 5.91796875, "learning_rate": 8.67667779013719e-06, "loss": 2.6511, "mean_token_accuracy": 0.4809310889005786, "step": 7138 }, { "epoch": 1.3235076010381905, "grad_norm": 7.83203125, "learning_rate": 8.67649239896181e-06, "loss": 2.5325, "mean_token_accuracy": 0.4686435584051292, "step": 7139 }, { "epoch": 1.3236929922135707, "grad_norm": 11.015625, "learning_rate": 8.67630700778643e-06, "loss": 2.3235, "mean_token_accuracy": 0.5308602999210734, "step": 7140 }, { "epoch": 1.3238783833889507, "grad_norm": 9.2421875, "learning_rate": 8.67612161661105e-06, "loss": 2.6769, "mean_token_accuracy": 0.4757016632016632, "step": 7141 }, { "epoch": 1.3240637745643307, "grad_norm": 6.37109375, "learning_rate": 8.67593622543567e-06, "loss": 3.2522, "mean_token_accuracy": 0.4418165807019751, "step": 7142 }, { "epoch": 1.3242491657397109, "grad_norm": 8.0390625, "learning_rate": 8.675750834260291e-06, "loss": 2.8933, "mean_token_accuracy": 0.44660062423666713, "step": 7143 }, { "epoch": 1.3244345569150908, "grad_norm": 7.9140625, "learning_rate": 8.67556544308491e-06, "loss": 2.7649, "mean_token_accuracy": 0.47042177604096486, "step": 7144 }, { "epoch": 1.3246199480904708, "grad_norm": 5.4375, "learning_rate": 8.67538005190953e-06, "loss": 2.3969, "mean_token_accuracy": 0.505421293272371, "step": 7145 }, { "epoch": 1.324805339265851, "grad_norm": 7.80078125, "learning_rate": 8.675194660734149e-06, "loss": 2.3275, "mean_token_accuracy": 0.5078463607831415, "step": 7146 }, { "epoch": 1.324990730441231, "grad_norm": 6.625, "learning_rate": 8.67500926955877e-06, "loss": 2.827, "mean_token_accuracy": 0.45971622455274525, "step": 7147 }, { "epoch": 1.325176121616611, "grad_norm": 6.3125, "learning_rate": 8.67482387838339e-06, "loss": 2.9196, "mean_token_accuracy": 0.4626025791324736, "step": 7148 }, { "epoch": 1.325361512791991, "grad_norm": 6.3515625, "learning_rate": 8.674638487208009e-06, "loss": 3.1549, "mean_token_accuracy": 0.4340051522958024, "step": 7149 }, { "epoch": 1.325546903967371, "grad_norm": 7.88671875, "learning_rate": 8.674453096032631e-06, "loss": 3.1961, "mean_token_accuracy": 0.428740581270183, "step": 7150 }, { "epoch": 1.3257322951427513, "grad_norm": 8.2265625, "learning_rate": 8.67426770485725e-06, "loss": 2.1752, "mean_token_accuracy": 0.5267996260517295, "step": 7151 }, { "epoch": 1.3259176863181312, "grad_norm": 7.5390625, "learning_rate": 8.67408231368187e-06, "loss": 2.3468, "mean_token_accuracy": 0.4941696823482107, "step": 7152 }, { "epoch": 1.3261030774935114, "grad_norm": 8.578125, "learning_rate": 8.673896922506489e-06, "loss": 2.67, "mean_token_accuracy": 0.47132390096008087, "step": 7153 }, { "epoch": 1.3262884686688914, "grad_norm": 14.2890625, "learning_rate": 8.67371153133111e-06, "loss": 3.2165, "mean_token_accuracy": 0.41689866369710465, "step": 7154 }, { "epoch": 1.3264738598442714, "grad_norm": 6.70703125, "learning_rate": 8.67352614015573e-06, "loss": 2.8587, "mean_token_accuracy": 0.44610169491525425, "step": 7155 }, { "epoch": 1.3266592510196515, "grad_norm": 7.52734375, "learning_rate": 8.673340748980349e-06, "loss": 2.96, "mean_token_accuracy": 0.4502058672156459, "step": 7156 }, { "epoch": 1.3268446421950315, "grad_norm": 9.4921875, "learning_rate": 8.67315535780497e-06, "loss": 3.2149, "mean_token_accuracy": 0.4133034379671151, "step": 7157 }, { "epoch": 1.3270300333704115, "grad_norm": 10.34375, "learning_rate": 8.67296996662959e-06, "loss": 2.4036, "mean_token_accuracy": 0.4868845560387151, "step": 7158 }, { "epoch": 1.3272154245457917, "grad_norm": 5.015625, "learning_rate": 8.67278457545421e-06, "loss": 2.4743, "mean_token_accuracy": 0.4812383900928793, "step": 7159 }, { "epoch": 1.3274008157211716, "grad_norm": 7.53125, "learning_rate": 8.672599184278829e-06, "loss": 2.7616, "mean_token_accuracy": 0.47458807147830123, "step": 7160 }, { "epoch": 1.3275862068965516, "grad_norm": 11.96875, "learning_rate": 8.67241379310345e-06, "loss": 2.4532, "mean_token_accuracy": 0.4858880778588808, "step": 7161 }, { "epoch": 1.3277715980719318, "grad_norm": 10.2265625, "learning_rate": 8.672228401928068e-06, "loss": 3.1639, "mean_token_accuracy": 0.4442881700265667, "step": 7162 }, { "epoch": 1.3279569892473118, "grad_norm": 5.87890625, "learning_rate": 8.672043010752689e-06, "loss": 2.5357, "mean_token_accuracy": 0.4957315862616637, "step": 7163 }, { "epoch": 1.328142380422692, "grad_norm": 6.37109375, "learning_rate": 8.67185761957731e-06, "loss": 2.5916, "mean_token_accuracy": 0.5005800464037123, "step": 7164 }, { "epoch": 1.328327771598072, "grad_norm": 8.234375, "learning_rate": 8.671672228401928e-06, "loss": 2.9886, "mean_token_accuracy": 0.4495830174374526, "step": 7165 }, { "epoch": 1.328513162773452, "grad_norm": 8.2734375, "learning_rate": 8.671486837226549e-06, "loss": 3.0486, "mean_token_accuracy": 0.41974248927038627, "step": 7166 }, { "epoch": 1.328698553948832, "grad_norm": 7.390625, "learning_rate": 8.671301446051169e-06, "loss": 3.4328, "mean_token_accuracy": 0.41117850953206236, "step": 7167 }, { "epoch": 1.328883945124212, "grad_norm": 5.984375, "learning_rate": 8.67111605487579e-06, "loss": 2.859, "mean_token_accuracy": 0.44726350126857556, "step": 7168 }, { "epoch": 1.3290693362995922, "grad_norm": 7.38671875, "learning_rate": 8.670930663700408e-06, "loss": 3.1067, "mean_token_accuracy": 0.44147784673113005, "step": 7169 }, { "epoch": 1.3292547274749722, "grad_norm": 6.02734375, "learning_rate": 8.670745272525029e-06, "loss": 2.5338, "mean_token_accuracy": 0.4917005950516755, "step": 7170 }, { "epoch": 1.3294401186503522, "grad_norm": 6.390625, "learning_rate": 8.670559881349648e-06, "loss": 2.4196, "mean_token_accuracy": 0.49652402286420516, "step": 7171 }, { "epoch": 1.3296255098257324, "grad_norm": 6.10546875, "learning_rate": 8.670374490174268e-06, "loss": 2.5324, "mean_token_accuracy": 0.48323496169785257, "step": 7172 }, { "epoch": 1.3298109010011123, "grad_norm": 5.640625, "learning_rate": 8.670189098998889e-06, "loss": 2.7242, "mean_token_accuracy": 0.4735545335085414, "step": 7173 }, { "epoch": 1.3299962921764923, "grad_norm": 7.7734375, "learning_rate": 8.670003707823509e-06, "loss": 3.1069, "mean_token_accuracy": 0.43765508684863524, "step": 7174 }, { "epoch": 1.3301816833518725, "grad_norm": 6.04296875, "learning_rate": 8.669818316648128e-06, "loss": 2.9094, "mean_token_accuracy": 0.456682629516761, "step": 7175 }, { "epoch": 1.3303670745272524, "grad_norm": 6.375, "learning_rate": 8.669632925472748e-06, "loss": 3.0595, "mean_token_accuracy": 0.41973718113888175, "step": 7176 }, { "epoch": 1.3305524657026326, "grad_norm": 6.02734375, "learning_rate": 8.669447534297369e-06, "loss": 2.7826, "mean_token_accuracy": 0.451067615658363, "step": 7177 }, { "epoch": 1.3307378568780126, "grad_norm": 6.5625, "learning_rate": 8.669262143121988e-06, "loss": 3.3065, "mean_token_accuracy": 0.45878861459598, "step": 7178 }, { "epoch": 1.3309232480533928, "grad_norm": 5.8359375, "learning_rate": 8.669076751946608e-06, "loss": 2.473, "mean_token_accuracy": 0.5121262805770437, "step": 7179 }, { "epoch": 1.3311086392287728, "grad_norm": 5.75, "learning_rate": 8.668891360771227e-06, "loss": 2.5927, "mean_token_accuracy": 0.5213785213785214, "step": 7180 }, { "epoch": 1.3312940304041527, "grad_norm": 5.640625, "learning_rate": 8.668705969595848e-06, "loss": 2.5694, "mean_token_accuracy": 0.47198022133270545, "step": 7181 }, { "epoch": 1.331479421579533, "grad_norm": 5.921875, "learning_rate": 8.668520578420468e-06, "loss": 3.4492, "mean_token_accuracy": 0.41316685584562995, "step": 7182 }, { "epoch": 1.3316648127549129, "grad_norm": 8.75, "learning_rate": 8.668335187245088e-06, "loss": 2.1682, "mean_token_accuracy": 0.532287403365166, "step": 7183 }, { "epoch": 1.3318502039302929, "grad_norm": 6.15234375, "learning_rate": 8.668149796069707e-06, "loss": 2.3086, "mean_token_accuracy": 0.5148763793825953, "step": 7184 }, { "epoch": 1.332035595105673, "grad_norm": 5.67578125, "learning_rate": 8.667964404894328e-06, "loss": 2.8262, "mean_token_accuracy": 0.47252481239409344, "step": 7185 }, { "epoch": 1.332220986281053, "grad_norm": 9.046875, "learning_rate": 8.667779013718948e-06, "loss": 3.2013, "mean_token_accuracy": 0.4354686020826759, "step": 7186 }, { "epoch": 1.332406377456433, "grad_norm": 6.72265625, "learning_rate": 8.667593622543567e-06, "loss": 2.7543, "mean_token_accuracy": 0.46911455345190284, "step": 7187 }, { "epoch": 1.3325917686318132, "grad_norm": 6.21875, "learning_rate": 8.667408231368188e-06, "loss": 3.0154, "mean_token_accuracy": 0.4423076923076923, "step": 7188 }, { "epoch": 1.3327771598071931, "grad_norm": 7.01171875, "learning_rate": 8.667222840192806e-06, "loss": 2.873, "mean_token_accuracy": 0.47605692323673005, "step": 7189 }, { "epoch": 1.332962550982573, "grad_norm": 7.5234375, "learning_rate": 8.667037449017427e-06, "loss": 3.7216, "mean_token_accuracy": 0.3854370660494926, "step": 7190 }, { "epoch": 1.3331479421579533, "grad_norm": 6.4296875, "learning_rate": 8.666852057842047e-06, "loss": 3.8675, "mean_token_accuracy": 0.4056393076493579, "step": 7191 }, { "epoch": 1.3333333333333333, "grad_norm": 8.96875, "learning_rate": 8.666666666666668e-06, "loss": 3.2868, "mean_token_accuracy": 0.42080507547582585, "step": 7192 }, { "epoch": 1.3335187245087134, "grad_norm": 7.83984375, "learning_rate": 8.666481275491288e-06, "loss": 2.5227, "mean_token_accuracy": 0.489426907753601, "step": 7193 }, { "epoch": 1.3337041156840934, "grad_norm": 5.8359375, "learning_rate": 8.666295884315907e-06, "loss": 2.5026, "mean_token_accuracy": 0.4859406952965235, "step": 7194 }, { "epoch": 1.3338895068594736, "grad_norm": 7.6171875, "learning_rate": 8.666110493140528e-06, "loss": 2.9627, "mean_token_accuracy": 0.4659868026394721, "step": 7195 }, { "epoch": 1.3340748980348536, "grad_norm": 5.46484375, "learning_rate": 8.665925101965146e-06, "loss": 2.4394, "mean_token_accuracy": 0.5033815143880122, "step": 7196 }, { "epoch": 1.3342602892102335, "grad_norm": 6.40625, "learning_rate": 8.665739710789767e-06, "loss": 3.1604, "mean_token_accuracy": 0.4410924102889806, "step": 7197 }, { "epoch": 1.3344456803856137, "grad_norm": 6.17578125, "learning_rate": 8.665554319614386e-06, "loss": 3.0865, "mean_token_accuracy": 0.4511842105263158, "step": 7198 }, { "epoch": 1.3346310715609937, "grad_norm": 6.13671875, "learning_rate": 8.665368928439008e-06, "loss": 2.675, "mean_token_accuracy": 0.4800531914893617, "step": 7199 }, { "epoch": 1.3348164627363737, "grad_norm": 5.9765625, "learning_rate": 8.665183537263627e-06, "loss": 2.8052, "mean_token_accuracy": 0.44724845327254964, "step": 7200 }, { "epoch": 1.3350018539117539, "grad_norm": 6.69140625, "learning_rate": 8.664998146088247e-06, "loss": 2.8431, "mean_token_accuracy": 0.46218887697987376, "step": 7201 }, { "epoch": 1.3351872450871338, "grad_norm": 8.1953125, "learning_rate": 8.664812754912868e-06, "loss": 2.6405, "mean_token_accuracy": 0.4659643435980551, "step": 7202 }, { "epoch": 1.3353726362625138, "grad_norm": 7.140625, "learning_rate": 8.664627363737486e-06, "loss": 2.608, "mean_token_accuracy": 0.48135311243587964, "step": 7203 }, { "epoch": 1.335558027437894, "grad_norm": 7.0, "learning_rate": 8.664441972562107e-06, "loss": 3.9419, "mean_token_accuracy": 0.36348722756034685, "step": 7204 }, { "epoch": 1.335743418613274, "grad_norm": 8.984375, "learning_rate": 8.664256581386726e-06, "loss": 2.172, "mean_token_accuracy": 0.5345589113047631, "step": 7205 }, { "epoch": 1.3359288097886541, "grad_norm": 5.44921875, "learning_rate": 8.664071190211346e-06, "loss": 3.3444, "mean_token_accuracy": 0.41488381615313746, "step": 7206 }, { "epoch": 1.336114200964034, "grad_norm": 7.0, "learning_rate": 8.663885799035967e-06, "loss": 2.6724, "mean_token_accuracy": 0.4723557692307692, "step": 7207 }, { "epoch": 1.3362995921394143, "grad_norm": 6.16015625, "learning_rate": 8.663700407860587e-06, "loss": 2.2136, "mean_token_accuracy": 0.5317876932050161, "step": 7208 }, { "epoch": 1.3364849833147943, "grad_norm": 5.1875, "learning_rate": 8.663515016685206e-06, "loss": 2.7921, "mean_token_accuracy": 0.4575627798325871, "step": 7209 }, { "epoch": 1.3366703744901742, "grad_norm": 6.66796875, "learning_rate": 8.663329625509827e-06, "loss": 3.0123, "mean_token_accuracy": 0.4531194716358061, "step": 7210 }, { "epoch": 1.3368557656655544, "grad_norm": 8.4140625, "learning_rate": 8.663144234334447e-06, "loss": 3.2027, "mean_token_accuracy": 0.4120079644662276, "step": 7211 }, { "epoch": 1.3370411568409344, "grad_norm": 6.88671875, "learning_rate": 8.662958843159066e-06, "loss": 3.1639, "mean_token_accuracy": 0.4173508907823393, "step": 7212 }, { "epoch": 1.3372265480163144, "grad_norm": 6.2734375, "learning_rate": 8.662773451983686e-06, "loss": 2.8919, "mean_token_accuracy": 0.47506600176004693, "step": 7213 }, { "epoch": 1.3374119391916945, "grad_norm": 5.98046875, "learning_rate": 8.662588060808305e-06, "loss": 3.2255, "mean_token_accuracy": 0.4355846042120552, "step": 7214 }, { "epoch": 1.3375973303670745, "grad_norm": 5.890625, "learning_rate": 8.662402669632927e-06, "loss": 2.7012, "mean_token_accuracy": 0.4690909090909091, "step": 7215 }, { "epoch": 1.3377827215424545, "grad_norm": 6.50390625, "learning_rate": 8.662217278457546e-06, "loss": 2.6946, "mean_token_accuracy": 0.48770069229636176, "step": 7216 }, { "epoch": 1.3379681127178347, "grad_norm": 9.9921875, "learning_rate": 8.662031887282167e-06, "loss": 2.8079, "mean_token_accuracy": 0.4461745482506728, "step": 7217 }, { "epoch": 1.3381535038932146, "grad_norm": 6.9140625, "learning_rate": 8.661846496106785e-06, "loss": 2.6653, "mean_token_accuracy": 0.4593780135004822, "step": 7218 }, { "epoch": 1.3383388950685946, "grad_norm": 5.7421875, "learning_rate": 8.661661104931406e-06, "loss": 2.8977, "mean_token_accuracy": 0.44960538232630354, "step": 7219 }, { "epoch": 1.3385242862439748, "grad_norm": 6.06640625, "learning_rate": 8.661475713756026e-06, "loss": 2.3178, "mean_token_accuracy": 0.5185917721518988, "step": 7220 }, { "epoch": 1.3387096774193548, "grad_norm": 8.796875, "learning_rate": 8.661290322580645e-06, "loss": 3.1137, "mean_token_accuracy": 0.428271744392448, "step": 7221 }, { "epoch": 1.338895068594735, "grad_norm": 6.49609375, "learning_rate": 8.661104931405266e-06, "loss": 2.9439, "mean_token_accuracy": 0.46454767726161367, "step": 7222 }, { "epoch": 1.339080459770115, "grad_norm": 5.9921875, "learning_rate": 8.660919540229886e-06, "loss": 3.6439, "mean_token_accuracy": 0.3886658795749705, "step": 7223 }, { "epoch": 1.339265850945495, "grad_norm": 7.2109375, "learning_rate": 8.660734149054507e-06, "loss": 3.0451, "mean_token_accuracy": 0.4427671460834423, "step": 7224 }, { "epoch": 1.339451242120875, "grad_norm": 5.83984375, "learning_rate": 8.660548757879125e-06, "loss": 3.1548, "mean_token_accuracy": 0.4389814438575105, "step": 7225 }, { "epoch": 1.339636633296255, "grad_norm": 7.9453125, "learning_rate": 8.660363366703746e-06, "loss": 2.7117, "mean_token_accuracy": 0.465438919582566, "step": 7226 }, { "epoch": 1.3398220244716352, "grad_norm": 5.4921875, "learning_rate": 8.660177975528365e-06, "loss": 2.9679, "mean_token_accuracy": 0.46248048985472445, "step": 7227 }, { "epoch": 1.3400074156470152, "grad_norm": 5.109375, "learning_rate": 8.659992584352985e-06, "loss": 2.1292, "mean_token_accuracy": 0.5307682342272315, "step": 7228 }, { "epoch": 1.3401928068223952, "grad_norm": 6.171875, "learning_rate": 8.659807193177606e-06, "loss": 2.857, "mean_token_accuracy": 0.4735756229169973, "step": 7229 }, { "epoch": 1.3403781979977754, "grad_norm": 5.77734375, "learning_rate": 8.659621802002224e-06, "loss": 2.5436, "mean_token_accuracy": 0.47965274009766684, "step": 7230 }, { "epoch": 1.3405635891731553, "grad_norm": 6.515625, "learning_rate": 8.659436410826847e-06, "loss": 2.6775, "mean_token_accuracy": 0.4804823470909995, "step": 7231 }, { "epoch": 1.3407489803485353, "grad_norm": 7.703125, "learning_rate": 8.659251019651465e-06, "loss": 2.7673, "mean_token_accuracy": 0.47003704510786665, "step": 7232 }, { "epoch": 1.3409343715239155, "grad_norm": 6.7578125, "learning_rate": 8.659065628476086e-06, "loss": 2.8615, "mean_token_accuracy": 0.4645161290322581, "step": 7233 }, { "epoch": 1.3411197626992954, "grad_norm": 8.015625, "learning_rate": 8.658880237300705e-06, "loss": 2.6794, "mean_token_accuracy": 0.4670530156685984, "step": 7234 }, { "epoch": 1.3413051538746756, "grad_norm": 6.875, "learning_rate": 8.658694846125325e-06, "loss": 3.2448, "mean_token_accuracy": 0.4317507418397626, "step": 7235 }, { "epoch": 1.3414905450500556, "grad_norm": 6.35546875, "learning_rate": 8.658509454949946e-06, "loss": 2.7127, "mean_token_accuracy": 0.47053872053872053, "step": 7236 }, { "epoch": 1.3416759362254358, "grad_norm": 6.59375, "learning_rate": 8.658324063774565e-06, "loss": 3.463, "mean_token_accuracy": 0.40838757870024545, "step": 7237 }, { "epoch": 1.3418613274008158, "grad_norm": 6.66796875, "learning_rate": 8.658138672599185e-06, "loss": 2.8505, "mean_token_accuracy": 0.47598415449368653, "step": 7238 }, { "epoch": 1.3420467185761957, "grad_norm": 11.3203125, "learning_rate": 8.657953281423806e-06, "loss": 2.7571, "mean_token_accuracy": 0.47897897897897895, "step": 7239 }, { "epoch": 1.342232109751576, "grad_norm": 6.296875, "learning_rate": 8.657767890248426e-06, "loss": 3.0187, "mean_token_accuracy": 0.4676860025220681, "step": 7240 }, { "epoch": 1.3424175009269559, "grad_norm": 6.93359375, "learning_rate": 8.657582499073045e-06, "loss": 3.3356, "mean_token_accuracy": 0.4300136425648022, "step": 7241 }, { "epoch": 1.3426028921023359, "grad_norm": 7.05859375, "learning_rate": 8.657397107897665e-06, "loss": 3.0434, "mean_token_accuracy": 0.4514336917562724, "step": 7242 }, { "epoch": 1.342788283277716, "grad_norm": 6.65234375, "learning_rate": 8.657211716722284e-06, "loss": 2.749, "mean_token_accuracy": 0.46653543307086615, "step": 7243 }, { "epoch": 1.342973674453096, "grad_norm": 6.34375, "learning_rate": 8.657026325546905e-06, "loss": 2.5434, "mean_token_accuracy": 0.49780701754385964, "step": 7244 }, { "epoch": 1.343159065628476, "grad_norm": 7.6875, "learning_rate": 8.656840934371525e-06, "loss": 3.1214, "mean_token_accuracy": 0.4290519496698642, "step": 7245 }, { "epoch": 1.3433444568038562, "grad_norm": 6.44921875, "learning_rate": 8.656655543196144e-06, "loss": 3.3053, "mean_token_accuracy": 0.454360569446106, "step": 7246 }, { "epoch": 1.3435298479792361, "grad_norm": 6.41796875, "learning_rate": 8.656470152020764e-06, "loss": 2.7181, "mean_token_accuracy": 0.48188260462628696, "step": 7247 }, { "epoch": 1.343715239154616, "grad_norm": 5.2421875, "learning_rate": 8.656284760845385e-06, "loss": 2.9339, "mean_token_accuracy": 0.4404277577589213, "step": 7248 }, { "epoch": 1.3439006303299963, "grad_norm": 6.68359375, "learning_rate": 8.656099369670005e-06, "loss": 2.9291, "mean_token_accuracy": 0.4462477120195241, "step": 7249 }, { "epoch": 1.3440860215053765, "grad_norm": 6.22265625, "learning_rate": 8.655913978494624e-06, "loss": 2.8486, "mean_token_accuracy": 0.4581450969970768, "step": 7250 }, { "epoch": 1.3442714126807565, "grad_norm": 4.9375, "learning_rate": 8.655728587319245e-06, "loss": 2.3275, "mean_token_accuracy": 0.5249717514124294, "step": 7251 }, { "epoch": 1.3444568038561364, "grad_norm": 5.33203125, "learning_rate": 8.655543196143863e-06, "loss": 2.4292, "mean_token_accuracy": 0.5123260922626311, "step": 7252 }, { "epoch": 1.3446421950315166, "grad_norm": 5.99609375, "learning_rate": 8.655357804968484e-06, "loss": 2.715, "mean_token_accuracy": 0.47298553033320057, "step": 7253 }, { "epoch": 1.3448275862068966, "grad_norm": 6.07421875, "learning_rate": 8.655172413793104e-06, "loss": 2.6695, "mean_token_accuracy": 0.4816747716759578, "step": 7254 }, { "epoch": 1.3450129773822765, "grad_norm": 5.703125, "learning_rate": 8.654987022617725e-06, "loss": 3.0237, "mean_token_accuracy": 0.437412685107572, "step": 7255 }, { "epoch": 1.3451983685576567, "grad_norm": 6.0, "learning_rate": 8.654801631442344e-06, "loss": 2.8516, "mean_token_accuracy": 0.4623085983510012, "step": 7256 }, { "epoch": 1.3453837597330367, "grad_norm": 9.6953125, "learning_rate": 8.654616240266964e-06, "loss": 2.8629, "mean_token_accuracy": 0.4451311425987338, "step": 7257 }, { "epoch": 1.3455691509084167, "grad_norm": 6.75390625, "learning_rate": 8.654430849091585e-06, "loss": 3.5636, "mean_token_accuracy": 0.3974084958589367, "step": 7258 }, { "epoch": 1.3457545420837969, "grad_norm": 5.88671875, "learning_rate": 8.654245457916203e-06, "loss": 3.1171, "mean_token_accuracy": 0.4337084820203129, "step": 7259 }, { "epoch": 1.3459399332591768, "grad_norm": 6.95703125, "learning_rate": 8.654060066740824e-06, "loss": 2.7768, "mean_token_accuracy": 0.4583804569102013, "step": 7260 }, { "epoch": 1.3461253244345568, "grad_norm": 8.375, "learning_rate": 8.653874675565443e-06, "loss": 2.4179, "mean_token_accuracy": 0.4946236559139785, "step": 7261 }, { "epoch": 1.346310715609937, "grad_norm": 6.05078125, "learning_rate": 8.653689284390063e-06, "loss": 2.7718, "mean_token_accuracy": 0.47497232812692164, "step": 7262 }, { "epoch": 1.346496106785317, "grad_norm": 8.0859375, "learning_rate": 8.653503893214684e-06, "loss": 2.8715, "mean_token_accuracy": 0.44639855166170955, "step": 7263 }, { "epoch": 1.3466814979606971, "grad_norm": 10.2578125, "learning_rate": 8.653318502039304e-06, "loss": 3.3435, "mean_token_accuracy": 0.43461733400912317, "step": 7264 }, { "epoch": 1.346866889136077, "grad_norm": 8.0390625, "learning_rate": 8.653133110863923e-06, "loss": 2.6704, "mean_token_accuracy": 0.47962838594134766, "step": 7265 }, { "epoch": 1.3470522803114573, "grad_norm": 8.8515625, "learning_rate": 8.652947719688544e-06, "loss": 2.1593, "mean_token_accuracy": 0.56312625250501, "step": 7266 }, { "epoch": 1.3472376714868373, "grad_norm": 6.47265625, "learning_rate": 8.652762328513164e-06, "loss": 2.8806, "mean_token_accuracy": 0.45191420664206644, "step": 7267 }, { "epoch": 1.3474230626622172, "grad_norm": 7.18359375, "learning_rate": 8.652576937337783e-06, "loss": 3.2226, "mean_token_accuracy": 0.42981236970118136, "step": 7268 }, { "epoch": 1.3476084538375974, "grad_norm": 7.17578125, "learning_rate": 8.652391546162403e-06, "loss": 2.3328, "mean_token_accuracy": 0.5088454376163873, "step": 7269 }, { "epoch": 1.3477938450129774, "grad_norm": 6.5, "learning_rate": 8.652206154987022e-06, "loss": 2.6998, "mean_token_accuracy": 0.46749393040194226, "step": 7270 }, { "epoch": 1.3479792361883574, "grad_norm": 9.2265625, "learning_rate": 8.652020763811644e-06, "loss": 2.4339, "mean_token_accuracy": 0.490522834245007, "step": 7271 }, { "epoch": 1.3481646273637375, "grad_norm": 5.60546875, "learning_rate": 8.651835372636263e-06, "loss": 2.8724, "mean_token_accuracy": 0.45015882619875963, "step": 7272 }, { "epoch": 1.3483500185391175, "grad_norm": 6.078125, "learning_rate": 8.651649981460884e-06, "loss": 2.7544, "mean_token_accuracy": 0.4889453241708993, "step": 7273 }, { "epoch": 1.3485354097144975, "grad_norm": 7.3828125, "learning_rate": 8.651464590285504e-06, "loss": 3.0883, "mean_token_accuracy": 0.44939172749391726, "step": 7274 }, { "epoch": 1.3487208008898777, "grad_norm": 5.6328125, "learning_rate": 8.651279199110123e-06, "loss": 3.014, "mean_token_accuracy": 0.4529142984509083, "step": 7275 }, { "epoch": 1.3489061920652576, "grad_norm": 5.61328125, "learning_rate": 8.651093807934743e-06, "loss": 2.8706, "mean_token_accuracy": 0.4515597410241318, "step": 7276 }, { "epoch": 1.3490915832406378, "grad_norm": 5.796875, "learning_rate": 8.650908416759362e-06, "loss": 3.6346, "mean_token_accuracy": 0.40131500298864314, "step": 7277 }, { "epoch": 1.3492769744160178, "grad_norm": 5.46875, "learning_rate": 8.650723025583983e-06, "loss": 3.1167, "mean_token_accuracy": 0.42902881536819637, "step": 7278 }, { "epoch": 1.349462365591398, "grad_norm": 6.390625, "learning_rate": 8.650537634408603e-06, "loss": 2.6441, "mean_token_accuracy": 0.4784546805349183, "step": 7279 }, { "epoch": 1.349647756766778, "grad_norm": 5.90625, "learning_rate": 8.650352243233224e-06, "loss": 2.7791, "mean_token_accuracy": 0.4521497919556172, "step": 7280 }, { "epoch": 1.349833147942158, "grad_norm": 6.69921875, "learning_rate": 8.650166852057842e-06, "loss": 3.0139, "mean_token_accuracy": 0.4565560821484992, "step": 7281 }, { "epoch": 1.350018539117538, "grad_norm": 7.48828125, "learning_rate": 8.649981460882463e-06, "loss": 2.9816, "mean_token_accuracy": 0.4662576687116564, "step": 7282 }, { "epoch": 1.350203930292918, "grad_norm": 6.48046875, "learning_rate": 8.649796069707083e-06, "loss": 3.4044, "mean_token_accuracy": 0.4219853602157442, "step": 7283 }, { "epoch": 1.350389321468298, "grad_norm": 9.4140625, "learning_rate": 8.649610678531702e-06, "loss": 2.4514, "mean_token_accuracy": 0.5141999249343175, "step": 7284 }, { "epoch": 1.3505747126436782, "grad_norm": 8.6328125, "learning_rate": 8.649425287356323e-06, "loss": 2.4216, "mean_token_accuracy": 0.5134003350083752, "step": 7285 }, { "epoch": 1.3507601038190582, "grad_norm": 7.0546875, "learning_rate": 8.649239896180941e-06, "loss": 2.7765, "mean_token_accuracy": 0.4772782799902272, "step": 7286 }, { "epoch": 1.3509454949944382, "grad_norm": 9.0703125, "learning_rate": 8.649054505005564e-06, "loss": 3.2421, "mean_token_accuracy": 0.43143556779920417, "step": 7287 }, { "epoch": 1.3511308861698184, "grad_norm": 9.296875, "learning_rate": 8.648869113830182e-06, "loss": 3.1066, "mean_token_accuracy": 0.4344214514718741, "step": 7288 }, { "epoch": 1.3513162773451983, "grad_norm": 8.578125, "learning_rate": 8.648683722654803e-06, "loss": 2.6411, "mean_token_accuracy": 0.4764950756421972, "step": 7289 }, { "epoch": 1.3515016685205783, "grad_norm": 8.3984375, "learning_rate": 8.648498331479422e-06, "loss": 2.2674, "mean_token_accuracy": 0.539740426901212, "step": 7290 }, { "epoch": 1.3516870596959585, "grad_norm": 7.953125, "learning_rate": 8.648312940304042e-06, "loss": 3.2944, "mean_token_accuracy": 0.443556739162598, "step": 7291 }, { "epoch": 1.3518724508713384, "grad_norm": 6.91796875, "learning_rate": 8.648127549128663e-06, "loss": 2.7469, "mean_token_accuracy": 0.4670287044220326, "step": 7292 }, { "epoch": 1.3520578420467186, "grad_norm": 8.3203125, "learning_rate": 8.647942157953282e-06, "loss": 2.4673, "mean_token_accuracy": 0.4789122497796802, "step": 7293 }, { "epoch": 1.3522432332220986, "grad_norm": 8.1875, "learning_rate": 8.647756766777902e-06, "loss": 2.7897, "mean_token_accuracy": 0.45630198336532307, "step": 7294 }, { "epoch": 1.3524286243974788, "grad_norm": 7.10546875, "learning_rate": 8.647571375602523e-06, "loss": 3.0645, "mean_token_accuracy": 0.4429060107509366, "step": 7295 }, { "epoch": 1.3526140155728588, "grad_norm": 7.30078125, "learning_rate": 8.647385984427143e-06, "loss": 2.823, "mean_token_accuracy": 0.4820408681304734, "step": 7296 }, { "epoch": 1.3527994067482387, "grad_norm": 5.3984375, "learning_rate": 8.647200593251762e-06, "loss": 3.198, "mean_token_accuracy": 0.4487571312143439, "step": 7297 }, { "epoch": 1.352984797923619, "grad_norm": 6.59375, "learning_rate": 8.647015202076382e-06, "loss": 3.0485, "mean_token_accuracy": 0.4268653445820094, "step": 7298 }, { "epoch": 1.3531701890989989, "grad_norm": 4.9921875, "learning_rate": 8.646829810901001e-06, "loss": 2.6847, "mean_token_accuracy": 0.4755072463768116, "step": 7299 }, { "epoch": 1.3533555802743789, "grad_norm": 10.421875, "learning_rate": 8.646644419725622e-06, "loss": 2.4317, "mean_token_accuracy": 0.49184747721145206, "step": 7300 }, { "epoch": 1.353540971449759, "grad_norm": 7.453125, "learning_rate": 8.646459028550242e-06, "loss": 2.6152, "mean_token_accuracy": 0.4799290869950614, "step": 7301 }, { "epoch": 1.353726362625139, "grad_norm": 6.69921875, "learning_rate": 8.646273637374861e-06, "loss": 2.5994, "mean_token_accuracy": 0.47489970725360514, "step": 7302 }, { "epoch": 1.353911753800519, "grad_norm": 5.109375, "learning_rate": 8.646088246199481e-06, "loss": 2.9922, "mean_token_accuracy": 0.4417139256458727, "step": 7303 }, { "epoch": 1.3540971449758992, "grad_norm": 8.6484375, "learning_rate": 8.645902855024102e-06, "loss": 2.8287, "mean_token_accuracy": 0.4797136038186158, "step": 7304 }, { "epoch": 1.3542825361512791, "grad_norm": 12.765625, "learning_rate": 8.645717463848722e-06, "loss": 2.393, "mean_token_accuracy": 0.49369722188226656, "step": 7305 }, { "epoch": 1.3544679273266593, "grad_norm": 7.26171875, "learning_rate": 8.645532072673341e-06, "loss": 2.6199, "mean_token_accuracy": 0.47749834546657843, "step": 7306 }, { "epoch": 1.3546533185020393, "grad_norm": 11.5546875, "learning_rate": 8.645346681497962e-06, "loss": 2.5834, "mean_token_accuracy": 0.48481012658227846, "step": 7307 }, { "epoch": 1.3548387096774195, "grad_norm": 9.5703125, "learning_rate": 8.64516129032258e-06, "loss": 2.8783, "mean_token_accuracy": 0.46375587986973826, "step": 7308 }, { "epoch": 1.3550241008527995, "grad_norm": 5.953125, "learning_rate": 8.644975899147201e-06, "loss": 2.5174, "mean_token_accuracy": 0.4992014196983141, "step": 7309 }, { "epoch": 1.3552094920281794, "grad_norm": 6.7109375, "learning_rate": 8.644790507971821e-06, "loss": 3.0934, "mean_token_accuracy": 0.44241815048822514, "step": 7310 }, { "epoch": 1.3553948832035596, "grad_norm": 11.6171875, "learning_rate": 8.64460511679644e-06, "loss": 2.5826, "mean_token_accuracy": 0.4882753403933434, "step": 7311 }, { "epoch": 1.3555802743789396, "grad_norm": 6.890625, "learning_rate": 8.644419725621062e-06, "loss": 2.5368, "mean_token_accuracy": 0.49647795460474825, "step": 7312 }, { "epoch": 1.3557656655543195, "grad_norm": 5.97265625, "learning_rate": 8.644234334445681e-06, "loss": 2.9189, "mean_token_accuracy": 0.4441280173582859, "step": 7313 }, { "epoch": 1.3559510567296997, "grad_norm": 8.859375, "learning_rate": 8.644048943270302e-06, "loss": 2.6071, "mean_token_accuracy": 0.4887050170738114, "step": 7314 }, { "epoch": 1.3561364479050797, "grad_norm": 7.82421875, "learning_rate": 8.64386355209492e-06, "loss": 2.8921, "mean_token_accuracy": 0.4551282051282051, "step": 7315 }, { "epoch": 1.3563218390804597, "grad_norm": 5.8046875, "learning_rate": 8.643678160919541e-06, "loss": 3.3541, "mean_token_accuracy": 0.4180748529227688, "step": 7316 }, { "epoch": 1.3565072302558399, "grad_norm": 7.0234375, "learning_rate": 8.64349276974416e-06, "loss": 2.9752, "mean_token_accuracy": 0.44296316657504126, "step": 7317 }, { "epoch": 1.3566926214312198, "grad_norm": 7.23828125, "learning_rate": 8.64330737856878e-06, "loss": 2.774, "mean_token_accuracy": 0.4824197758329495, "step": 7318 }, { "epoch": 1.3568780126065998, "grad_norm": 8.2578125, "learning_rate": 8.6431219873934e-06, "loss": 2.4836, "mean_token_accuracy": 0.5028967076444821, "step": 7319 }, { "epoch": 1.35706340378198, "grad_norm": 6.23828125, "learning_rate": 8.642936596218021e-06, "loss": 3.2388, "mean_token_accuracy": 0.4162319543855392, "step": 7320 }, { "epoch": 1.35724879495736, "grad_norm": 7.86328125, "learning_rate": 8.642751205042642e-06, "loss": 2.988, "mean_token_accuracy": 0.4471707561342013, "step": 7321 }, { "epoch": 1.3574341861327401, "grad_norm": 11.1015625, "learning_rate": 8.64256581386726e-06, "loss": 2.3031, "mean_token_accuracy": 0.5095163050066674, "step": 7322 }, { "epoch": 1.35761957730812, "grad_norm": 7.86328125, "learning_rate": 8.642380422691881e-06, "loss": 2.6977, "mean_token_accuracy": 0.4797011559063998, "step": 7323 }, { "epoch": 1.3578049684835003, "grad_norm": 6.26171875, "learning_rate": 8.6421950315165e-06, "loss": 2.3932, "mean_token_accuracy": 0.5140370234154714, "step": 7324 }, { "epoch": 1.3579903596588803, "grad_norm": 5.5625, "learning_rate": 8.64200964034112e-06, "loss": 2.587, "mean_token_accuracy": 0.5044184380224505, "step": 7325 }, { "epoch": 1.3581757508342602, "grad_norm": 7.671875, "learning_rate": 8.64182424916574e-06, "loss": 3.2915, "mean_token_accuracy": 0.42289988492520136, "step": 7326 }, { "epoch": 1.3583611420096404, "grad_norm": 6.421875, "learning_rate": 8.64163885799036e-06, "loss": 2.5378, "mean_token_accuracy": 0.47769893563101873, "step": 7327 }, { "epoch": 1.3585465331850204, "grad_norm": 7.48046875, "learning_rate": 8.64145346681498e-06, "loss": 2.721, "mean_token_accuracy": 0.4774829600778968, "step": 7328 }, { "epoch": 1.3587319243604004, "grad_norm": 8.7578125, "learning_rate": 8.6412680756396e-06, "loss": 2.7644, "mean_token_accuracy": 0.4697347207973032, "step": 7329 }, { "epoch": 1.3589173155357805, "grad_norm": 7.63671875, "learning_rate": 8.641082684464221e-06, "loss": 3.0469, "mean_token_accuracy": 0.4492142025611176, "step": 7330 }, { "epoch": 1.3591027067111605, "grad_norm": 7.29296875, "learning_rate": 8.64089729328884e-06, "loss": 2.6576, "mean_token_accuracy": 0.48411464119084646, "step": 7331 }, { "epoch": 1.3592880978865405, "grad_norm": 7.9375, "learning_rate": 8.64071190211346e-06, "loss": 3.0635, "mean_token_accuracy": 0.4640647310804379, "step": 7332 }, { "epoch": 1.3594734890619207, "grad_norm": 6.8125, "learning_rate": 8.64052651093808e-06, "loss": 3.0955, "mean_token_accuracy": 0.42598496925529494, "step": 7333 }, { "epoch": 1.3596588802373006, "grad_norm": 7.7265625, "learning_rate": 8.6403411197627e-06, "loss": 2.3649, "mean_token_accuracy": 0.5146190987124464, "step": 7334 }, { "epoch": 1.3598442714126808, "grad_norm": 5.20703125, "learning_rate": 8.64015572858732e-06, "loss": 2.5601, "mean_token_accuracy": 0.4849726775956284, "step": 7335 }, { "epoch": 1.3600296625880608, "grad_norm": 7.0859375, "learning_rate": 8.63997033741194e-06, "loss": 2.2697, "mean_token_accuracy": 0.5223143130379343, "step": 7336 }, { "epoch": 1.360215053763441, "grad_norm": 6.9921875, "learning_rate": 8.63978494623656e-06, "loss": 2.4996, "mean_token_accuracy": 0.511329196132198, "step": 7337 }, { "epoch": 1.360400444938821, "grad_norm": 9.859375, "learning_rate": 8.63959955506118e-06, "loss": 2.1067, "mean_token_accuracy": 0.5356511490866235, "step": 7338 }, { "epoch": 1.360585836114201, "grad_norm": 8.265625, "learning_rate": 8.6394141638858e-06, "loss": 3.2059, "mean_token_accuracy": 0.4281349812526038, "step": 7339 }, { "epoch": 1.360771227289581, "grad_norm": 8.7890625, "learning_rate": 8.63922877271042e-06, "loss": 2.3977, "mean_token_accuracy": 0.49827143972343035, "step": 7340 }, { "epoch": 1.360956618464961, "grad_norm": 6.6484375, "learning_rate": 8.63904338153504e-06, "loss": 2.6319, "mean_token_accuracy": 0.4815571444784928, "step": 7341 }, { "epoch": 1.361142009640341, "grad_norm": 9.296875, "learning_rate": 8.638857990359659e-06, "loss": 2.6013, "mean_token_accuracy": 0.4659907055344318, "step": 7342 }, { "epoch": 1.3613274008157212, "grad_norm": 8.1484375, "learning_rate": 8.638672599184279e-06, "loss": 2.0642, "mean_token_accuracy": 0.5718364698247891, "step": 7343 }, { "epoch": 1.3615127919911012, "grad_norm": 6.94921875, "learning_rate": 8.6384872080089e-06, "loss": 2.4559, "mean_token_accuracy": 0.5107373479501427, "step": 7344 }, { "epoch": 1.3616981831664812, "grad_norm": 4.875, "learning_rate": 8.63830181683352e-06, "loss": 2.9818, "mean_token_accuracy": 0.44495916128245766, "step": 7345 }, { "epoch": 1.3618835743418614, "grad_norm": 6.8984375, "learning_rate": 8.638116425658139e-06, "loss": 3.0588, "mean_token_accuracy": 0.43302990897269183, "step": 7346 }, { "epoch": 1.3620689655172413, "grad_norm": 6.1484375, "learning_rate": 8.63793103448276e-06, "loss": 2.5581, "mean_token_accuracy": 0.48720431594964725, "step": 7347 }, { "epoch": 1.3622543566926213, "grad_norm": 5.91015625, "learning_rate": 8.63774564330738e-06, "loss": 2.8318, "mean_token_accuracy": 0.45915583373455304, "step": 7348 }, { "epoch": 1.3624397478680015, "grad_norm": 7.41015625, "learning_rate": 8.637560252131999e-06, "loss": 2.9738, "mean_token_accuracy": 0.4688416852482103, "step": 7349 }, { "epoch": 1.3626251390433817, "grad_norm": 14.3046875, "learning_rate": 8.637374860956619e-06, "loss": 2.4147, "mean_token_accuracy": 0.5191066997518611, "step": 7350 }, { "epoch": 1.3628105302187616, "grad_norm": 5.48828125, "learning_rate": 8.637189469781238e-06, "loss": 2.6388, "mean_token_accuracy": 0.48697549082509944, "step": 7351 }, { "epoch": 1.3629959213941416, "grad_norm": 5.6640625, "learning_rate": 8.63700407860586e-06, "loss": 2.3863, "mean_token_accuracy": 0.5041995089804885, "step": 7352 }, { "epoch": 1.3631813125695218, "grad_norm": 6.9609375, "learning_rate": 8.636818687430479e-06, "loss": 2.9356, "mean_token_accuracy": 0.4636036534735677, "step": 7353 }, { "epoch": 1.3633667037449018, "grad_norm": 6.453125, "learning_rate": 8.6366332962551e-06, "loss": 2.8832, "mean_token_accuracy": 0.460431654676259, "step": 7354 }, { "epoch": 1.3635520949202817, "grad_norm": 9.296875, "learning_rate": 8.63644790507972e-06, "loss": 3.1493, "mean_token_accuracy": 0.44529262086513993, "step": 7355 }, { "epoch": 1.363737486095662, "grad_norm": 10.9453125, "learning_rate": 8.636262513904339e-06, "loss": 3.0728, "mean_token_accuracy": 0.4516035827795435, "step": 7356 }, { "epoch": 1.363922877271042, "grad_norm": 8.9296875, "learning_rate": 8.636077122728959e-06, "loss": 2.6297, "mean_token_accuracy": 0.48235671514114625, "step": 7357 }, { "epoch": 1.3641082684464219, "grad_norm": 8.21875, "learning_rate": 8.635891731553578e-06, "loss": 2.4467, "mean_token_accuracy": 0.5075614366729678, "step": 7358 }, { "epoch": 1.364293659621802, "grad_norm": 6.46875, "learning_rate": 8.635706340378198e-06, "loss": 2.9179, "mean_token_accuracy": 0.4692737430167598, "step": 7359 }, { "epoch": 1.364479050797182, "grad_norm": 7.11328125, "learning_rate": 8.635520949202819e-06, "loss": 2.5782, "mean_token_accuracy": 0.4915512465373961, "step": 7360 }, { "epoch": 1.364664441972562, "grad_norm": 6.0859375, "learning_rate": 8.63533555802744e-06, "loss": 3.4511, "mean_token_accuracy": 0.42676121832132063, "step": 7361 }, { "epoch": 1.3648498331479422, "grad_norm": 8.015625, "learning_rate": 8.635150166852058e-06, "loss": 2.6058, "mean_token_accuracy": 0.4903758020164986, "step": 7362 }, { "epoch": 1.3650352243233221, "grad_norm": 5.41796875, "learning_rate": 8.634964775676679e-06, "loss": 2.8949, "mean_token_accuracy": 0.45038855726996285, "step": 7363 }, { "epoch": 1.3652206154987023, "grad_norm": 5.19921875, "learning_rate": 8.6347793845013e-06, "loss": 2.9809, "mean_token_accuracy": 0.43647932131495226, "step": 7364 }, { "epoch": 1.3654060066740823, "grad_norm": 9.7578125, "learning_rate": 8.634593993325918e-06, "loss": 2.6062, "mean_token_accuracy": 0.4904148164193653, "step": 7365 }, { "epoch": 1.3655913978494625, "grad_norm": 6.171875, "learning_rate": 8.634408602150538e-06, "loss": 3.1044, "mean_token_accuracy": 0.4492141064588562, "step": 7366 }, { "epoch": 1.3657767890248425, "grad_norm": 14.203125, "learning_rate": 8.634223210975157e-06, "loss": 3.1804, "mean_token_accuracy": 0.4076360310928212, "step": 7367 }, { "epoch": 1.3659621802002224, "grad_norm": 6.7890625, "learning_rate": 8.63403781979978e-06, "loss": 3.0431, "mean_token_accuracy": 0.42553875844322936, "step": 7368 }, { "epoch": 1.3661475713756026, "grad_norm": 6.984375, "learning_rate": 8.633852428624398e-06, "loss": 2.8844, "mean_token_accuracy": 0.44863852470825527, "step": 7369 }, { "epoch": 1.3663329625509826, "grad_norm": 8.8125, "learning_rate": 8.633667037449019e-06, "loss": 3.1541, "mean_token_accuracy": 0.4231031194660128, "step": 7370 }, { "epoch": 1.3665183537263625, "grad_norm": 6.55078125, "learning_rate": 8.633481646273638e-06, "loss": 2.8224, "mean_token_accuracy": 0.457565011820331, "step": 7371 }, { "epoch": 1.3667037449017427, "grad_norm": 5.77734375, "learning_rate": 8.633296255098258e-06, "loss": 2.5783, "mean_token_accuracy": 0.4784038901601831, "step": 7372 }, { "epoch": 1.3668891360771227, "grad_norm": 5.046875, "learning_rate": 8.633110863922878e-06, "loss": 2.5815, "mean_token_accuracy": 0.5002965599051008, "step": 7373 }, { "epoch": 1.3670745272525027, "grad_norm": 6.390625, "learning_rate": 8.632925472747497e-06, "loss": 2.9484, "mean_token_accuracy": 0.462430426716141, "step": 7374 }, { "epoch": 1.3672599184278829, "grad_norm": 6.703125, "learning_rate": 8.632740081572118e-06, "loss": 2.6047, "mean_token_accuracy": 0.48940710257582687, "step": 7375 }, { "epoch": 1.3674453096032628, "grad_norm": 6.77734375, "learning_rate": 8.632554690396738e-06, "loss": 2.4381, "mean_token_accuracy": 0.5100177184135205, "step": 7376 }, { "epoch": 1.367630700778643, "grad_norm": 6.44140625, "learning_rate": 8.632369299221359e-06, "loss": 2.4171, "mean_token_accuracy": 0.47732893652102226, "step": 7377 }, { "epoch": 1.367816091954023, "grad_norm": 6.32421875, "learning_rate": 8.632183908045978e-06, "loss": 3.2222, "mean_token_accuracy": 0.4439612848926918, "step": 7378 }, { "epoch": 1.3680014831294032, "grad_norm": 8.890625, "learning_rate": 8.631998516870598e-06, "loss": 2.4916, "mean_token_accuracy": 0.4841125737630504, "step": 7379 }, { "epoch": 1.3681868743047831, "grad_norm": 6.078125, "learning_rate": 8.631813125695217e-06, "loss": 2.6611, "mean_token_accuracy": 0.46060154113845386, "step": 7380 }, { "epoch": 1.368372265480163, "grad_norm": 5.72265625, "learning_rate": 8.631627734519837e-06, "loss": 2.6812, "mean_token_accuracy": 0.48614958448753465, "step": 7381 }, { "epoch": 1.3685576566555433, "grad_norm": 6.6640625, "learning_rate": 8.631442343344458e-06, "loss": 2.7439, "mean_token_accuracy": 0.46784100127477113, "step": 7382 }, { "epoch": 1.3687430478309233, "grad_norm": 5.28515625, "learning_rate": 8.631256952169077e-06, "loss": 2.5386, "mean_token_accuracy": 0.47807181016275313, "step": 7383 }, { "epoch": 1.3689284390063032, "grad_norm": 7.203125, "learning_rate": 8.631071560993697e-06, "loss": 1.8222, "mean_token_accuracy": 0.5953983266642415, "step": 7384 }, { "epoch": 1.3691138301816834, "grad_norm": 8.3125, "learning_rate": 8.630886169818318e-06, "loss": 2.6158, "mean_token_accuracy": 0.4783786968304466, "step": 7385 }, { "epoch": 1.3692992213570634, "grad_norm": 7.34765625, "learning_rate": 8.630700778642938e-06, "loss": 2.5117, "mean_token_accuracy": 0.5161290322580645, "step": 7386 }, { "epoch": 1.3694846125324434, "grad_norm": 6.9140625, "learning_rate": 8.630515387467557e-06, "loss": 2.7967, "mean_token_accuracy": 0.4590844062947067, "step": 7387 }, { "epoch": 1.3696700037078235, "grad_norm": 7.72265625, "learning_rate": 8.630329996292177e-06, "loss": 2.8173, "mean_token_accuracy": 0.4557356608478803, "step": 7388 }, { "epoch": 1.3698553948832035, "grad_norm": 5.68359375, "learning_rate": 8.630144605116796e-06, "loss": 2.189, "mean_token_accuracy": 0.538038613987981, "step": 7389 }, { "epoch": 1.3700407860585835, "grad_norm": 7.015625, "learning_rate": 8.629959213941417e-06, "loss": 2.7052, "mean_token_accuracy": 0.4675904180366369, "step": 7390 }, { "epoch": 1.3702261772339637, "grad_norm": 7.15234375, "learning_rate": 8.629773822766037e-06, "loss": 2.8478, "mean_token_accuracy": 0.4726126999515269, "step": 7391 }, { "epoch": 1.3704115684093436, "grad_norm": 6.44921875, "learning_rate": 8.629588431590658e-06, "loss": 2.4109, "mean_token_accuracy": 0.5066270283661588, "step": 7392 }, { "epoch": 1.3705969595847238, "grad_norm": 5.5234375, "learning_rate": 8.629403040415278e-06, "loss": 3.062, "mean_token_accuracy": 0.42673663858583066, "step": 7393 }, { "epoch": 1.3707823507601038, "grad_norm": 7.234375, "learning_rate": 8.629217649239897e-06, "loss": 3.2006, "mean_token_accuracy": 0.39489069649212, "step": 7394 }, { "epoch": 1.370967741935484, "grad_norm": 8.203125, "learning_rate": 8.629032258064517e-06, "loss": 2.7331, "mean_token_accuracy": 0.46336379669713, "step": 7395 }, { "epoch": 1.371153133110864, "grad_norm": 6.921875, "learning_rate": 8.628846866889136e-06, "loss": 2.559, "mean_token_accuracy": 0.48073871014283653, "step": 7396 }, { "epoch": 1.371338524286244, "grad_norm": 5.34375, "learning_rate": 8.628661475713757e-06, "loss": 2.2595, "mean_token_accuracy": 0.5477027027027027, "step": 7397 }, { "epoch": 1.371523915461624, "grad_norm": 5.59765625, "learning_rate": 8.628476084538376e-06, "loss": 2.6096, "mean_token_accuracy": 0.47985592315901815, "step": 7398 }, { "epoch": 1.371709306637004, "grad_norm": 5.8984375, "learning_rate": 8.628290693362996e-06, "loss": 2.6805, "mean_token_accuracy": 0.458012467913458, "step": 7399 }, { "epoch": 1.371894697812384, "grad_norm": 9.0078125, "learning_rate": 8.628105302187617e-06, "loss": 2.3659, "mean_token_accuracy": 0.5038386271262983, "step": 7400 }, { "epoch": 1.3720800889877642, "grad_norm": 5.73828125, "learning_rate": 8.627919911012237e-06, "loss": 3.1338, "mean_token_accuracy": 0.429970876356897, "step": 7401 }, { "epoch": 1.3722654801631442, "grad_norm": 6.35546875, "learning_rate": 8.627734519836858e-06, "loss": 2.8711, "mean_token_accuracy": 0.4637496459926366, "step": 7402 }, { "epoch": 1.3724508713385242, "grad_norm": 7.0, "learning_rate": 8.627549128661476e-06, "loss": 3.1086, "mean_token_accuracy": 0.4351287473566364, "step": 7403 }, { "epoch": 1.3726362625139044, "grad_norm": 10.7265625, "learning_rate": 8.627363737486097e-06, "loss": 2.4281, "mean_token_accuracy": 0.4818607002907344, "step": 7404 }, { "epoch": 1.3728216536892843, "grad_norm": 7.1328125, "learning_rate": 8.627178346310716e-06, "loss": 2.8394, "mean_token_accuracy": 0.4556173820879703, "step": 7405 }, { "epoch": 1.3730070448646645, "grad_norm": 9.6875, "learning_rate": 8.626992955135336e-06, "loss": 2.8406, "mean_token_accuracy": 0.4698564593301435, "step": 7406 }, { "epoch": 1.3731924360400445, "grad_norm": 10.8828125, "learning_rate": 8.626807563959957e-06, "loss": 2.4231, "mean_token_accuracy": 0.5076489096235217, "step": 7407 }, { "epoch": 1.3733778272154247, "grad_norm": 6.17578125, "learning_rate": 8.626622172784577e-06, "loss": 3.2381, "mean_token_accuracy": 0.43894610137755785, "step": 7408 }, { "epoch": 1.3735632183908046, "grad_norm": 6.74609375, "learning_rate": 8.626436781609196e-06, "loss": 2.4361, "mean_token_accuracy": 0.49498187900752716, "step": 7409 }, { "epoch": 1.3737486095661846, "grad_norm": 8.359375, "learning_rate": 8.626251390433816e-06, "loss": 3.1712, "mean_token_accuracy": 0.4538426032771752, "step": 7410 }, { "epoch": 1.3739340007415648, "grad_norm": 9.2265625, "learning_rate": 8.626065999258437e-06, "loss": 2.5257, "mean_token_accuracy": 0.5097571875465514, "step": 7411 }, { "epoch": 1.3741193919169448, "grad_norm": 6.765625, "learning_rate": 8.625880608083056e-06, "loss": 2.6559, "mean_token_accuracy": 0.4908521906596052, "step": 7412 }, { "epoch": 1.3743047830923247, "grad_norm": 7.9453125, "learning_rate": 8.625695216907676e-06, "loss": 2.9155, "mean_token_accuracy": 0.47449547115843, "step": 7413 }, { "epoch": 1.374490174267705, "grad_norm": 8.84375, "learning_rate": 8.625509825732295e-06, "loss": 2.4313, "mean_token_accuracy": 0.4968423942888523, "step": 7414 }, { "epoch": 1.374675565443085, "grad_norm": 5.65234375, "learning_rate": 8.625324434556915e-06, "loss": 2.7591, "mean_token_accuracy": 0.4628863134657837, "step": 7415 }, { "epoch": 1.3748609566184649, "grad_norm": 8.03125, "learning_rate": 8.625139043381536e-06, "loss": 2.9286, "mean_token_accuracy": 0.4363797020954304, "step": 7416 }, { "epoch": 1.375046347793845, "grad_norm": 7.4453125, "learning_rate": 8.624953652206156e-06, "loss": 2.8084, "mean_token_accuracy": 0.48120405049244, "step": 7417 }, { "epoch": 1.375231738969225, "grad_norm": 5.26953125, "learning_rate": 8.624768261030775e-06, "loss": 3.0796, "mean_token_accuracy": 0.425513698630137, "step": 7418 }, { "epoch": 1.375417130144605, "grad_norm": 5.41796875, "learning_rate": 8.624582869855396e-06, "loss": 2.9322, "mean_token_accuracy": 0.4543315804040963, "step": 7419 }, { "epoch": 1.3756025213199852, "grad_norm": 10.0234375, "learning_rate": 8.624397478680016e-06, "loss": 2.7116, "mean_token_accuracy": 0.47493440132578374, "step": 7420 }, { "epoch": 1.3757879124953651, "grad_norm": 5.83984375, "learning_rate": 8.624212087504635e-06, "loss": 2.8615, "mean_token_accuracy": 0.45558596665837275, "step": 7421 }, { "epoch": 1.3759733036707453, "grad_norm": 6.6484375, "learning_rate": 8.624026696329255e-06, "loss": 2.5107, "mean_token_accuracy": 0.5022156573116692, "step": 7422 }, { "epoch": 1.3761586948461253, "grad_norm": 6.78515625, "learning_rate": 8.623841305153874e-06, "loss": 3.082, "mean_token_accuracy": 0.4315381420462412, "step": 7423 }, { "epoch": 1.3763440860215055, "grad_norm": 5.71484375, "learning_rate": 8.623655913978495e-06, "loss": 2.1851, "mean_token_accuracy": 0.5349178403755869, "step": 7424 }, { "epoch": 1.3765294771968855, "grad_norm": 5.203125, "learning_rate": 8.623470522803115e-06, "loss": 2.443, "mean_token_accuracy": 0.502370820668693, "step": 7425 }, { "epoch": 1.3767148683722654, "grad_norm": 6.92578125, "learning_rate": 8.623285131627736e-06, "loss": 2.6219, "mean_token_accuracy": 0.47525664648591737, "step": 7426 }, { "epoch": 1.3769002595476456, "grad_norm": 6.77734375, "learning_rate": 8.623099740452355e-06, "loss": 2.446, "mean_token_accuracy": 0.49406824146981626, "step": 7427 }, { "epoch": 1.3770856507230256, "grad_norm": 9.5703125, "learning_rate": 8.622914349276975e-06, "loss": 2.767, "mean_token_accuracy": 0.4626664671554691, "step": 7428 }, { "epoch": 1.3772710418984055, "grad_norm": 10.0390625, "learning_rate": 8.622728958101596e-06, "loss": 2.8705, "mean_token_accuracy": 0.4491525423728814, "step": 7429 }, { "epoch": 1.3774564330737857, "grad_norm": 6.53515625, "learning_rate": 8.622543566926214e-06, "loss": 2.9715, "mean_token_accuracy": 0.43729694606887587, "step": 7430 }, { "epoch": 1.3776418242491657, "grad_norm": 10.109375, "learning_rate": 8.622358175750835e-06, "loss": 2.9818, "mean_token_accuracy": 0.44787322768974147, "step": 7431 }, { "epoch": 1.3778272154245457, "grad_norm": 11.578125, "learning_rate": 8.622172784575454e-06, "loss": 2.8537, "mean_token_accuracy": 0.4495240480961924, "step": 7432 }, { "epoch": 1.3780126065999259, "grad_norm": 7.61328125, "learning_rate": 8.621987393400076e-06, "loss": 2.7611, "mean_token_accuracy": 0.46386614453542185, "step": 7433 }, { "epoch": 1.3781979977753058, "grad_norm": 7.3671875, "learning_rate": 8.621802002224695e-06, "loss": 2.9143, "mean_token_accuracy": 0.45947578209928736, "step": 7434 }, { "epoch": 1.378383388950686, "grad_norm": 10.8671875, "learning_rate": 8.621616611049315e-06, "loss": 3.3139, "mean_token_accuracy": 0.4023842917251052, "step": 7435 }, { "epoch": 1.378568780126066, "grad_norm": 8.0625, "learning_rate": 8.621431219873934e-06, "loss": 2.951, "mean_token_accuracy": 0.4455558731066019, "step": 7436 }, { "epoch": 1.3787541713014462, "grad_norm": 8.0390625, "learning_rate": 8.621245828698554e-06, "loss": 2.8326, "mean_token_accuracy": 0.45309800049370524, "step": 7437 }, { "epoch": 1.3789395624768261, "grad_norm": 9.71875, "learning_rate": 8.621060437523175e-06, "loss": 3.2739, "mean_token_accuracy": 0.43939622641509435, "step": 7438 }, { "epoch": 1.379124953652206, "grad_norm": 7.2890625, "learning_rate": 8.620875046347794e-06, "loss": 2.5585, "mean_token_accuracy": 0.4921118184334348, "step": 7439 }, { "epoch": 1.3793103448275863, "grad_norm": 10.234375, "learning_rate": 8.620689655172414e-06, "loss": 2.7522, "mean_token_accuracy": 0.46027159780410287, "step": 7440 }, { "epoch": 1.3794957360029663, "grad_norm": 6.27734375, "learning_rate": 8.620504263997035e-06, "loss": 2.4853, "mean_token_accuracy": 0.49250053567602314, "step": 7441 }, { "epoch": 1.3796811271783462, "grad_norm": 8.46875, "learning_rate": 8.620318872821655e-06, "loss": 2.5383, "mean_token_accuracy": 0.5088557445816826, "step": 7442 }, { "epoch": 1.3798665183537264, "grad_norm": 7.46484375, "learning_rate": 8.620133481646274e-06, "loss": 2.7757, "mean_token_accuracy": 0.47903348141727164, "step": 7443 }, { "epoch": 1.3800519095291064, "grad_norm": 10.0390625, "learning_rate": 8.619948090470894e-06, "loss": 2.6176, "mean_token_accuracy": 0.45368620037807184, "step": 7444 }, { "epoch": 1.3802373007044864, "grad_norm": 7.37109375, "learning_rate": 8.619762699295515e-06, "loss": 2.2462, "mean_token_accuracy": 0.5062491946914057, "step": 7445 }, { "epoch": 1.3804226918798665, "grad_norm": 8.25, "learning_rate": 8.619577308120134e-06, "loss": 2.9541, "mean_token_accuracy": 0.4614381216370455, "step": 7446 }, { "epoch": 1.3806080830552465, "grad_norm": 8.703125, "learning_rate": 8.619391916944754e-06, "loss": 2.6027, "mean_token_accuracy": 0.4776796093931644, "step": 7447 }, { "epoch": 1.3807934742306267, "grad_norm": 9.3046875, "learning_rate": 8.619206525769373e-06, "loss": 2.5998, "mean_token_accuracy": 0.4815970056144729, "step": 7448 }, { "epoch": 1.3809788654060067, "grad_norm": 7.671875, "learning_rate": 8.619021134593995e-06, "loss": 2.2996, "mean_token_accuracy": 0.5394704281118535, "step": 7449 }, { "epoch": 1.3811642565813869, "grad_norm": 9.5, "learning_rate": 8.618835743418614e-06, "loss": 3.1821, "mean_token_accuracy": 0.4260977118119975, "step": 7450 }, { "epoch": 1.3813496477567668, "grad_norm": 6.6640625, "learning_rate": 8.618650352243234e-06, "loss": 2.7154, "mean_token_accuracy": 0.4883951843117786, "step": 7451 }, { "epoch": 1.3815350389321468, "grad_norm": 12.390625, "learning_rate": 8.618464961067853e-06, "loss": 3.2936, "mean_token_accuracy": 0.41905168731311404, "step": 7452 }, { "epoch": 1.381720430107527, "grad_norm": 8.59375, "learning_rate": 8.618279569892474e-06, "loss": 3.0574, "mean_token_accuracy": 0.45986779981114256, "step": 7453 }, { "epoch": 1.381905821282907, "grad_norm": 7.3359375, "learning_rate": 8.618094178717094e-06, "loss": 2.6872, "mean_token_accuracy": 0.4658268894986281, "step": 7454 }, { "epoch": 1.382091212458287, "grad_norm": 9.0625, "learning_rate": 8.617908787541713e-06, "loss": 2.8389, "mean_token_accuracy": 0.4682004495570541, "step": 7455 }, { "epoch": 1.3822766036336671, "grad_norm": 11.1015625, "learning_rate": 8.617723396366334e-06, "loss": 2.4567, "mean_token_accuracy": 0.5081764463377766, "step": 7456 }, { "epoch": 1.382461994809047, "grad_norm": 6.41796875, "learning_rate": 8.617538005190954e-06, "loss": 2.5723, "mean_token_accuracy": 0.4796665540159964, "step": 7457 }, { "epoch": 1.382647385984427, "grad_norm": 6.6953125, "learning_rate": 8.617352614015575e-06, "loss": 2.3135, "mean_token_accuracy": 0.5423728813559322, "step": 7458 }, { "epoch": 1.3828327771598072, "grad_norm": 10.9453125, "learning_rate": 8.617167222840193e-06, "loss": 2.6908, "mean_token_accuracy": 0.4795481773061783, "step": 7459 }, { "epoch": 1.3830181683351872, "grad_norm": 5.37109375, "learning_rate": 8.616981831664814e-06, "loss": 2.7969, "mean_token_accuracy": 0.4533803644914756, "step": 7460 }, { "epoch": 1.3832035595105672, "grad_norm": 7.14453125, "learning_rate": 8.616796440489433e-06, "loss": 2.8248, "mean_token_accuracy": 0.45457532446196813, "step": 7461 }, { "epoch": 1.3833889506859474, "grad_norm": 7.7890625, "learning_rate": 8.616611049314053e-06, "loss": 2.4295, "mean_token_accuracy": 0.491754860940192, "step": 7462 }, { "epoch": 1.3835743418613273, "grad_norm": 5.9453125, "learning_rate": 8.616425658138674e-06, "loss": 3.0007, "mean_token_accuracy": 0.4679860302677532, "step": 7463 }, { "epoch": 1.3837597330367075, "grad_norm": 7.30859375, "learning_rate": 8.616240266963292e-06, "loss": 2.3924, "mean_token_accuracy": 0.5044800754539024, "step": 7464 }, { "epoch": 1.3839451242120875, "grad_norm": 6.09375, "learning_rate": 8.616054875787913e-06, "loss": 2.955, "mean_token_accuracy": 0.4501477679265827, "step": 7465 }, { "epoch": 1.3841305153874677, "grad_norm": 8.1640625, "learning_rate": 8.615869484612533e-06, "loss": 2.4401, "mean_token_accuracy": 0.5099355178312935, "step": 7466 }, { "epoch": 1.3843159065628476, "grad_norm": 6.484375, "learning_rate": 8.615684093437154e-06, "loss": 3.801, "mean_token_accuracy": 0.37421760554001865, "step": 7467 }, { "epoch": 1.3845012977382276, "grad_norm": 5.515625, "learning_rate": 8.615498702261773e-06, "loss": 2.4741, "mean_token_accuracy": 0.4952840781495621, "step": 7468 }, { "epoch": 1.3846866889136078, "grad_norm": 7.68359375, "learning_rate": 8.615313311086393e-06, "loss": 2.9287, "mean_token_accuracy": 0.45156908028800435, "step": 7469 }, { "epoch": 1.3848720800889878, "grad_norm": 7.78515625, "learning_rate": 8.615127919911012e-06, "loss": 2.9389, "mean_token_accuracy": 0.4543524416135881, "step": 7470 }, { "epoch": 1.3850574712643677, "grad_norm": 5.8828125, "learning_rate": 8.614942528735632e-06, "loss": 2.8659, "mean_token_accuracy": 0.45898604386214753, "step": 7471 }, { "epoch": 1.385242862439748, "grad_norm": 5.3203125, "learning_rate": 8.614757137560253e-06, "loss": 3.0361, "mean_token_accuracy": 0.4255263157894737, "step": 7472 }, { "epoch": 1.385428253615128, "grad_norm": 5.9375, "learning_rate": 8.614571746384873e-06, "loss": 2.5852, "mean_token_accuracy": 0.4952340902719372, "step": 7473 }, { "epoch": 1.3856136447905079, "grad_norm": 6.17578125, "learning_rate": 8.614386355209494e-06, "loss": 3.2013, "mean_token_accuracy": 0.44165621079046424, "step": 7474 }, { "epoch": 1.385799035965888, "grad_norm": 5.9453125, "learning_rate": 8.614200964034113e-06, "loss": 2.1264, "mean_token_accuracy": 0.5561867382772154, "step": 7475 }, { "epoch": 1.385984427141268, "grad_norm": 5.86328125, "learning_rate": 8.614015572858733e-06, "loss": 2.9316, "mean_token_accuracy": 0.4616190725700812, "step": 7476 }, { "epoch": 1.3861698183166482, "grad_norm": 6.421875, "learning_rate": 8.613830181683352e-06, "loss": 3.4656, "mean_token_accuracy": 0.4162466072120977, "step": 7477 }, { "epoch": 1.3863552094920282, "grad_norm": 7.35546875, "learning_rate": 8.613644790507972e-06, "loss": 2.5155, "mean_token_accuracy": 0.4760656642295489, "step": 7478 }, { "epoch": 1.3865406006674084, "grad_norm": 5.75390625, "learning_rate": 8.613459399332591e-06, "loss": 3.1748, "mean_token_accuracy": 0.4321182815696485, "step": 7479 }, { "epoch": 1.3867259918427883, "grad_norm": 7.375, "learning_rate": 8.613274008157212e-06, "loss": 3.2484, "mean_token_accuracy": 0.430954717510164, "step": 7480 }, { "epoch": 1.3869113830181683, "grad_norm": 5.625, "learning_rate": 8.613088616981832e-06, "loss": 2.8459, "mean_token_accuracy": 0.48796660117878193, "step": 7481 }, { "epoch": 1.3870967741935485, "grad_norm": 5.53125, "learning_rate": 8.612903225806453e-06, "loss": 2.6467, "mean_token_accuracy": 0.503669028340081, "step": 7482 }, { "epoch": 1.3872821653689285, "grad_norm": 6.16796875, "learning_rate": 8.612717834631073e-06, "loss": 2.8613, "mean_token_accuracy": 0.46348019165595417, "step": 7483 }, { "epoch": 1.3874675565443084, "grad_norm": 5.34375, "learning_rate": 8.612532443455692e-06, "loss": 2.6693, "mean_token_accuracy": 0.46148884014965813, "step": 7484 }, { "epoch": 1.3876529477196886, "grad_norm": 5.1484375, "learning_rate": 8.612347052280313e-06, "loss": 3.1279, "mean_token_accuracy": 0.42932645034414946, "step": 7485 }, { "epoch": 1.3878383388950686, "grad_norm": 7.1953125, "learning_rate": 8.612161661104931e-06, "loss": 2.3405, "mean_token_accuracy": 0.49112938122025096, "step": 7486 }, { "epoch": 1.3880237300704485, "grad_norm": 5.21875, "learning_rate": 8.611976269929552e-06, "loss": 3.2291, "mean_token_accuracy": 0.41451823690150885, "step": 7487 }, { "epoch": 1.3882091212458287, "grad_norm": 6.125, "learning_rate": 8.611790878754172e-06, "loss": 2.9358, "mean_token_accuracy": 0.46724470134874757, "step": 7488 }, { "epoch": 1.3883945124212087, "grad_norm": 5.76171875, "learning_rate": 8.611605487578793e-06, "loss": 3.2967, "mean_token_accuracy": 0.4176644931831654, "step": 7489 }, { "epoch": 1.3885799035965887, "grad_norm": 6.74609375, "learning_rate": 8.611420096403412e-06, "loss": 3.3399, "mean_token_accuracy": 0.4237076153418566, "step": 7490 }, { "epoch": 1.3887652947719689, "grad_norm": 5.88671875, "learning_rate": 8.611234705228032e-06, "loss": 2.7049, "mean_token_accuracy": 0.4617714770386526, "step": 7491 }, { "epoch": 1.3889506859473488, "grad_norm": 6.83203125, "learning_rate": 8.611049314052653e-06, "loss": 3.1766, "mean_token_accuracy": 0.4301994301994302, "step": 7492 }, { "epoch": 1.389136077122729, "grad_norm": 5.48046875, "learning_rate": 8.610863922877271e-06, "loss": 2.5951, "mean_token_accuracy": 0.481351689612015, "step": 7493 }, { "epoch": 1.389321468298109, "grad_norm": 6.24609375, "learning_rate": 8.610678531701892e-06, "loss": 2.2081, "mean_token_accuracy": 0.5127291913995515, "step": 7494 }, { "epoch": 1.3895068594734892, "grad_norm": 5.96875, "learning_rate": 8.61049314052651e-06, "loss": 3.5189, "mean_token_accuracy": 0.43254845843632106, "step": 7495 }, { "epoch": 1.3896922506488691, "grad_norm": 5.55859375, "learning_rate": 8.610307749351131e-06, "loss": 2.8908, "mean_token_accuracy": 0.4518822724161533, "step": 7496 }, { "epoch": 1.389877641824249, "grad_norm": 6.19921875, "learning_rate": 8.610122358175752e-06, "loss": 2.6292, "mean_token_accuracy": 0.49087661094806684, "step": 7497 }, { "epoch": 1.3900630329996293, "grad_norm": 7.41796875, "learning_rate": 8.609936967000372e-06, "loss": 3.2729, "mean_token_accuracy": 0.4257347991922818, "step": 7498 }, { "epoch": 1.3902484241750093, "grad_norm": 6.1875, "learning_rate": 8.609751575824991e-06, "loss": 2.9784, "mean_token_accuracy": 0.4564425004556224, "step": 7499 }, { "epoch": 1.3904338153503892, "grad_norm": 5.7578125, "learning_rate": 8.609566184649611e-06, "loss": 2.8498, "mean_token_accuracy": 0.4636938836999096, "step": 7500 }, { "epoch": 1.3906192065257694, "grad_norm": 6.69140625, "learning_rate": 8.609380793474232e-06, "loss": 2.847, "mean_token_accuracy": 0.4925982049189882, "step": 7501 }, { "epoch": 1.3908045977011494, "grad_norm": 4.921875, "learning_rate": 8.60919540229885e-06, "loss": 2.7124, "mean_token_accuracy": 0.49513070220399796, "step": 7502 }, { "epoch": 1.3909899888765294, "grad_norm": 8.328125, "learning_rate": 8.609010011123471e-06, "loss": 3.1974, "mean_token_accuracy": 0.42016179215930305, "step": 7503 }, { "epoch": 1.3911753800519095, "grad_norm": 5.81640625, "learning_rate": 8.60882461994809e-06, "loss": 2.9254, "mean_token_accuracy": 0.4465551839464883, "step": 7504 }, { "epoch": 1.3913607712272895, "grad_norm": 5.41015625, "learning_rate": 8.608639228772712e-06, "loss": 3.1074, "mean_token_accuracy": 0.43101326405032137, "step": 7505 }, { "epoch": 1.3915461624026697, "grad_norm": 6.890625, "learning_rate": 8.608453837597331e-06, "loss": 2.9061, "mean_token_accuracy": 0.4397948164146868, "step": 7506 }, { "epoch": 1.3917315535780497, "grad_norm": 5.53515625, "learning_rate": 8.608268446421951e-06, "loss": 2.845, "mean_token_accuracy": 0.45052463806614423, "step": 7507 }, { "epoch": 1.3919169447534299, "grad_norm": 5.8046875, "learning_rate": 8.60808305524657e-06, "loss": 2.5062, "mean_token_accuracy": 0.48328437543712405, "step": 7508 }, { "epoch": 1.3921023359288098, "grad_norm": 6.2265625, "learning_rate": 8.60789766407119e-06, "loss": 2.8011, "mean_token_accuracy": 0.473595333128646, "step": 7509 }, { "epoch": 1.3922877271041898, "grad_norm": 6.03125, "learning_rate": 8.607712272895811e-06, "loss": 2.6691, "mean_token_accuracy": 0.47118772018117766, "step": 7510 }, { "epoch": 1.39247311827957, "grad_norm": 5.62109375, "learning_rate": 8.60752688172043e-06, "loss": 2.6488, "mean_token_accuracy": 0.49080841638981176, "step": 7511 }, { "epoch": 1.39265850945495, "grad_norm": 5.80859375, "learning_rate": 8.60734149054505e-06, "loss": 2.4396, "mean_token_accuracy": 0.49196624667668476, "step": 7512 }, { "epoch": 1.39284390063033, "grad_norm": 6.52734375, "learning_rate": 8.607156099369671e-06, "loss": 2.724, "mean_token_accuracy": 0.4842219804134929, "step": 7513 }, { "epoch": 1.3930292918057101, "grad_norm": 5.47265625, "learning_rate": 8.606970708194292e-06, "loss": 2.7364, "mean_token_accuracy": 0.47397675593734206, "step": 7514 }, { "epoch": 1.39321468298109, "grad_norm": 9.65625, "learning_rate": 8.60678531701891e-06, "loss": 2.5389, "mean_token_accuracy": 0.4800979791794244, "step": 7515 }, { "epoch": 1.39340007415647, "grad_norm": 6.66796875, "learning_rate": 8.60659992584353e-06, "loss": 2.7499, "mean_token_accuracy": 0.47573786893446723, "step": 7516 }, { "epoch": 1.3935854653318502, "grad_norm": 6.31640625, "learning_rate": 8.60641453466815e-06, "loss": 4.0145, "mean_token_accuracy": 0.37669296026194377, "step": 7517 }, { "epoch": 1.3937708565072302, "grad_norm": 5.80859375, "learning_rate": 8.60622914349277e-06, "loss": 2.9368, "mean_token_accuracy": 0.44560903632848275, "step": 7518 }, { "epoch": 1.3939562476826102, "grad_norm": 6.0703125, "learning_rate": 8.60604375231739e-06, "loss": 3.2694, "mean_token_accuracy": 0.4164484829816726, "step": 7519 }, { "epoch": 1.3941416388579904, "grad_norm": 6.203125, "learning_rate": 8.60585836114201e-06, "loss": 3.2934, "mean_token_accuracy": 0.40725957340651114, "step": 7520 }, { "epoch": 1.3943270300333703, "grad_norm": 6.28515625, "learning_rate": 8.605672969966632e-06, "loss": 3.2264, "mean_token_accuracy": 0.4428301441058351, "step": 7521 }, { "epoch": 1.3945124212087505, "grad_norm": 5.13671875, "learning_rate": 8.60548757879125e-06, "loss": 2.8318, "mean_token_accuracy": 0.4493539796824144, "step": 7522 }, { "epoch": 1.3946978123841305, "grad_norm": 8.234375, "learning_rate": 8.605302187615871e-06, "loss": 2.535, "mean_token_accuracy": 0.47875218844501033, "step": 7523 }, { "epoch": 1.3948832035595107, "grad_norm": 5.8828125, "learning_rate": 8.60511679644049e-06, "loss": 2.3364, "mean_token_accuracy": 0.5174635906689007, "step": 7524 }, { "epoch": 1.3950685947348906, "grad_norm": 6.10546875, "learning_rate": 8.60493140526511e-06, "loss": 3.1571, "mean_token_accuracy": 0.42610981308411217, "step": 7525 }, { "epoch": 1.3952539859102706, "grad_norm": 8.2265625, "learning_rate": 8.60474601408973e-06, "loss": 3.0124, "mean_token_accuracy": 0.44651913324112497, "step": 7526 }, { "epoch": 1.3954393770856508, "grad_norm": 6.39453125, "learning_rate": 8.60456062291435e-06, "loss": 2.6643, "mean_token_accuracy": 0.47797867408437644, "step": 7527 }, { "epoch": 1.3956247682610308, "grad_norm": 11.4921875, "learning_rate": 8.60437523173897e-06, "loss": 2.666, "mean_token_accuracy": 0.46880991004276656, "step": 7528 }, { "epoch": 1.3958101594364107, "grad_norm": 5.53515625, "learning_rate": 8.60418984056359e-06, "loss": 2.6346, "mean_token_accuracy": 0.4814997533300444, "step": 7529 }, { "epoch": 1.395995550611791, "grad_norm": 8.2109375, "learning_rate": 8.604004449388211e-06, "loss": 2.5642, "mean_token_accuracy": 0.4786364916893605, "step": 7530 }, { "epoch": 1.396180941787171, "grad_norm": 5.62109375, "learning_rate": 8.60381905821283e-06, "loss": 2.526, "mean_token_accuracy": 0.5019698410542046, "step": 7531 }, { "epoch": 1.3963663329625509, "grad_norm": 7.4453125, "learning_rate": 8.60363366703745e-06, "loss": 3.074, "mean_token_accuracy": 0.4331819656179046, "step": 7532 }, { "epoch": 1.396551724137931, "grad_norm": 9.8515625, "learning_rate": 8.603448275862069e-06, "loss": 2.8006, "mean_token_accuracy": 0.4627552487776819, "step": 7533 }, { "epoch": 1.396737115313311, "grad_norm": 5.4375, "learning_rate": 8.60326288468669e-06, "loss": 3.133, "mean_token_accuracy": 0.4440777411074441, "step": 7534 }, { "epoch": 1.3969225064886912, "grad_norm": 6.92578125, "learning_rate": 8.60307749351131e-06, "loss": 3.1841, "mean_token_accuracy": 0.44229326129371155, "step": 7535 }, { "epoch": 1.3971078976640712, "grad_norm": 9.5078125, "learning_rate": 8.602892102335929e-06, "loss": 2.8663, "mean_token_accuracy": 0.4626280892103677, "step": 7536 }, { "epoch": 1.3972932888394514, "grad_norm": 8.59375, "learning_rate": 8.60270671116055e-06, "loss": 3.2102, "mean_token_accuracy": 0.4276973761619014, "step": 7537 }, { "epoch": 1.3974786800148313, "grad_norm": 6.859375, "learning_rate": 8.60252131998517e-06, "loss": 2.8966, "mean_token_accuracy": 0.44309151499615484, "step": 7538 }, { "epoch": 1.3976640711902113, "grad_norm": 6.734375, "learning_rate": 8.60233592880979e-06, "loss": 2.4738, "mean_token_accuracy": 0.48750709823963656, "step": 7539 }, { "epoch": 1.3978494623655915, "grad_norm": 7.45703125, "learning_rate": 8.602150537634409e-06, "loss": 2.6338, "mean_token_accuracy": 0.4611682345219864, "step": 7540 }, { "epoch": 1.3980348535409715, "grad_norm": 7.57421875, "learning_rate": 8.60196514645903e-06, "loss": 2.5671, "mean_token_accuracy": 0.48541309144398137, "step": 7541 }, { "epoch": 1.3982202447163514, "grad_norm": 5.99609375, "learning_rate": 8.601779755283648e-06, "loss": 3.1215, "mean_token_accuracy": 0.4295327102803738, "step": 7542 }, { "epoch": 1.3984056358917316, "grad_norm": 7.484375, "learning_rate": 8.601594364108269e-06, "loss": 2.9964, "mean_token_accuracy": 0.456026600166251, "step": 7543 }, { "epoch": 1.3985910270671116, "grad_norm": 6.62890625, "learning_rate": 8.60140897293289e-06, "loss": 2.2819, "mean_token_accuracy": 0.5139420448332422, "step": 7544 }, { "epoch": 1.3987764182424915, "grad_norm": 6.34375, "learning_rate": 8.60122358175751e-06, "loss": 2.7956, "mean_token_accuracy": 0.46491728465487736, "step": 7545 }, { "epoch": 1.3989618094178717, "grad_norm": 6.33984375, "learning_rate": 8.601038190582129e-06, "loss": 2.9488, "mean_token_accuracy": 0.4514762969062463, "step": 7546 }, { "epoch": 1.3991472005932517, "grad_norm": 7.96484375, "learning_rate": 8.600852799406749e-06, "loss": 3.3419, "mean_token_accuracy": 0.4166666666666667, "step": 7547 }, { "epoch": 1.399332591768632, "grad_norm": 10.265625, "learning_rate": 8.60066740823137e-06, "loss": 2.7964, "mean_token_accuracy": 0.4583766137409111, "step": 7548 }, { "epoch": 1.3995179829440119, "grad_norm": 8.9296875, "learning_rate": 8.600482017055988e-06, "loss": 2.5309, "mean_token_accuracy": 0.504236262406197, "step": 7549 }, { "epoch": 1.399703374119392, "grad_norm": 7.70703125, "learning_rate": 8.600296625880609e-06, "loss": 3.3935, "mean_token_accuracy": 0.42084648025242083, "step": 7550 }, { "epoch": 1.399888765294772, "grad_norm": 8.8515625, "learning_rate": 8.600111234705228e-06, "loss": 3.1819, "mean_token_accuracy": 0.4427761094427761, "step": 7551 }, { "epoch": 1.400074156470152, "grad_norm": 5.1953125, "learning_rate": 8.599925843529848e-06, "loss": 2.8434, "mean_token_accuracy": 0.4539869106451182, "step": 7552 }, { "epoch": 1.4002595476455322, "grad_norm": 6.203125, "learning_rate": 8.599740452354469e-06, "loss": 2.6241, "mean_token_accuracy": 0.4700646262650896, "step": 7553 }, { "epoch": 1.4004449388209121, "grad_norm": 6.15625, "learning_rate": 8.59955506117909e-06, "loss": 2.8078, "mean_token_accuracy": 0.4507689021785562, "step": 7554 }, { "epoch": 1.4006303299962921, "grad_norm": 5.07421875, "learning_rate": 8.599369670003708e-06, "loss": 2.5778, "mean_token_accuracy": 0.4873939393939394, "step": 7555 }, { "epoch": 1.4008157211716723, "grad_norm": 6.7265625, "learning_rate": 8.599184278828328e-06, "loss": 2.8147, "mean_token_accuracy": 0.4830166954519286, "step": 7556 }, { "epoch": 1.4010011123470523, "grad_norm": 6.2734375, "learning_rate": 8.598998887652949e-06, "loss": 2.7944, "mean_token_accuracy": 0.4523403217942467, "step": 7557 }, { "epoch": 1.4011865035224322, "grad_norm": 5.6953125, "learning_rate": 8.598813496477568e-06, "loss": 2.3654, "mean_token_accuracy": 0.5004765146358067, "step": 7558 }, { "epoch": 1.4013718946978124, "grad_norm": 5.9140625, "learning_rate": 8.598628105302188e-06, "loss": 1.8951, "mean_token_accuracy": 0.5707808564231738, "step": 7559 }, { "epoch": 1.4015572858731924, "grad_norm": 7.80078125, "learning_rate": 8.598442714126807e-06, "loss": 2.5186, "mean_token_accuracy": 0.4939209726443769, "step": 7560 }, { "epoch": 1.4017426770485724, "grad_norm": 9.171875, "learning_rate": 8.598257322951428e-06, "loss": 2.9644, "mean_token_accuracy": 0.46165113182423434, "step": 7561 }, { "epoch": 1.4019280682239526, "grad_norm": 7.484375, "learning_rate": 8.598071931776048e-06, "loss": 3.4942, "mean_token_accuracy": 0.40168539325842695, "step": 7562 }, { "epoch": 1.4021134593993325, "grad_norm": 6.14453125, "learning_rate": 8.597886540600669e-06, "loss": 3.3395, "mean_token_accuracy": 0.4283195592286501, "step": 7563 }, { "epoch": 1.4022988505747127, "grad_norm": 7.5390625, "learning_rate": 8.597701149425289e-06, "loss": 2.7002, "mean_token_accuracy": 0.47047078604455655, "step": 7564 }, { "epoch": 1.4024842417500927, "grad_norm": 7.9296875, "learning_rate": 8.597515758249908e-06, "loss": 2.8029, "mean_token_accuracy": 0.467110125646711, "step": 7565 }, { "epoch": 1.4026696329254729, "grad_norm": 7.41796875, "learning_rate": 8.597330367074528e-06, "loss": 2.785, "mean_token_accuracy": 0.4522977694168234, "step": 7566 }, { "epoch": 1.4028550241008528, "grad_norm": 6.8515625, "learning_rate": 8.597144975899147e-06, "loss": 2.6134, "mean_token_accuracy": 0.4581900130470881, "step": 7567 }, { "epoch": 1.4030404152762328, "grad_norm": 9.1640625, "learning_rate": 8.596959584723768e-06, "loss": 2.7567, "mean_token_accuracy": 0.46747967479674796, "step": 7568 }, { "epoch": 1.403225806451613, "grad_norm": 7.37109375, "learning_rate": 8.596774193548388e-06, "loss": 2.9283, "mean_token_accuracy": 0.44391622767570343, "step": 7569 }, { "epoch": 1.403411197626993, "grad_norm": 5.80078125, "learning_rate": 8.596588802373009e-06, "loss": 2.9584, "mean_token_accuracy": 0.4473519469137348, "step": 7570 }, { "epoch": 1.403596588802373, "grad_norm": 9.7734375, "learning_rate": 8.596403411197627e-06, "loss": 2.9271, "mean_token_accuracy": 0.4757441210327249, "step": 7571 }, { "epoch": 1.4037819799777531, "grad_norm": 7.17578125, "learning_rate": 8.596218020022248e-06, "loss": 2.5833, "mean_token_accuracy": 0.49915984036967026, "step": 7572 }, { "epoch": 1.403967371153133, "grad_norm": 7.859375, "learning_rate": 8.596032628846868e-06, "loss": 2.6068, "mean_token_accuracy": 0.47024066868424846, "step": 7573 }, { "epoch": 1.404152762328513, "grad_norm": 10.71875, "learning_rate": 8.595847237671487e-06, "loss": 2.3354, "mean_token_accuracy": 0.5339408346539883, "step": 7574 }, { "epoch": 1.4043381535038932, "grad_norm": 10.5390625, "learning_rate": 8.595661846496108e-06, "loss": 2.4981, "mean_token_accuracy": 0.5108205590622182, "step": 7575 }, { "epoch": 1.4045235446792732, "grad_norm": 20.953125, "learning_rate": 8.595476455320726e-06, "loss": 2.1395, "mean_token_accuracy": 0.5107518442944593, "step": 7576 }, { "epoch": 1.4047089358546534, "grad_norm": 16.0, "learning_rate": 8.595291064145347e-06, "loss": 2.1315, "mean_token_accuracy": 0.5292562363490056, "step": 7577 }, { "epoch": 1.4048943270300334, "grad_norm": 9.1640625, "learning_rate": 8.595105672969967e-06, "loss": 2.7965, "mean_token_accuracy": 0.47703960864762507, "step": 7578 }, { "epoch": 1.4050797182054136, "grad_norm": 8.1796875, "learning_rate": 8.594920281794588e-06, "loss": 3.5582, "mean_token_accuracy": 0.4154103852596315, "step": 7579 }, { "epoch": 1.4052651093807935, "grad_norm": 7.69921875, "learning_rate": 8.594734890619207e-06, "loss": 2.5722, "mean_token_accuracy": 0.49080141129032256, "step": 7580 }, { "epoch": 1.4054505005561735, "grad_norm": 12.1953125, "learning_rate": 8.594549499443827e-06, "loss": 2.6134, "mean_token_accuracy": 0.46299425481581613, "step": 7581 }, { "epoch": 1.4056358917315537, "grad_norm": 7.19921875, "learning_rate": 8.594364108268448e-06, "loss": 2.4422, "mean_token_accuracy": 0.4998076676496987, "step": 7582 }, { "epoch": 1.4058212829069336, "grad_norm": 7.4765625, "learning_rate": 8.594178717093066e-06, "loss": 3.078, "mean_token_accuracy": 0.45777866083846613, "step": 7583 }, { "epoch": 1.4060066740823136, "grad_norm": 14.390625, "learning_rate": 8.593993325917687e-06, "loss": 2.6235, "mean_token_accuracy": 0.507523475506279, "step": 7584 }, { "epoch": 1.4061920652576938, "grad_norm": 17.28125, "learning_rate": 8.593807934742306e-06, "loss": 2.5532, "mean_token_accuracy": 0.47417840375586856, "step": 7585 }, { "epoch": 1.4063774564330738, "grad_norm": 8.1484375, "learning_rate": 8.593622543566928e-06, "loss": 3.0859, "mean_token_accuracy": 0.4334071885770556, "step": 7586 }, { "epoch": 1.4065628476084537, "grad_norm": 6.40234375, "learning_rate": 8.593437152391547e-06, "loss": 3.1547, "mean_token_accuracy": 0.42726517040731504, "step": 7587 }, { "epoch": 1.406748238783834, "grad_norm": 11.46875, "learning_rate": 8.593251761216167e-06, "loss": 2.6421, "mean_token_accuracy": 0.47375522871407366, "step": 7588 }, { "epoch": 1.406933629959214, "grad_norm": 8.578125, "learning_rate": 8.593066370040786e-06, "loss": 2.9107, "mean_token_accuracy": 0.4473378669762813, "step": 7589 }, { "epoch": 1.4071190211345939, "grad_norm": 9.2578125, "learning_rate": 8.592880978865407e-06, "loss": 2.5158, "mean_token_accuracy": 0.5033976624082631, "step": 7590 }, { "epoch": 1.407304412309974, "grad_norm": 7.08203125, "learning_rate": 8.592695587690027e-06, "loss": 3.6767, "mean_token_accuracy": 0.418, "step": 7591 }, { "epoch": 1.407489803485354, "grad_norm": 10.2109375, "learning_rate": 8.592510196514646e-06, "loss": 2.6872, "mean_token_accuracy": 0.4579204965978274, "step": 7592 }, { "epoch": 1.4076751946607342, "grad_norm": 12.8984375, "learning_rate": 8.592324805339266e-06, "loss": 2.9897, "mean_token_accuracy": 0.46375739644970415, "step": 7593 }, { "epoch": 1.4078605858361142, "grad_norm": 6.00390625, "learning_rate": 8.592139414163887e-06, "loss": 2.5909, "mean_token_accuracy": 0.46290762634792776, "step": 7594 }, { "epoch": 1.4080459770114944, "grad_norm": 7.51171875, "learning_rate": 8.591954022988507e-06, "loss": 2.8017, "mean_token_accuracy": 0.460446247464503, "step": 7595 }, { "epoch": 1.4082313681868743, "grad_norm": 9.1796875, "learning_rate": 8.591768631813126e-06, "loss": 3.1331, "mean_token_accuracy": 0.4244349419670128, "step": 7596 }, { "epoch": 1.4084167593622543, "grad_norm": 9.328125, "learning_rate": 8.591583240637747e-06, "loss": 2.4731, "mean_token_accuracy": 0.49089798411728774, "step": 7597 }, { "epoch": 1.4086021505376345, "grad_norm": 5.73828125, "learning_rate": 8.591397849462365e-06, "loss": 2.7015, "mean_token_accuracy": 0.4763336674462635, "step": 7598 }, { "epoch": 1.4087875417130145, "grad_norm": 7.59765625, "learning_rate": 8.591212458286986e-06, "loss": 2.4549, "mean_token_accuracy": 0.5109761793554414, "step": 7599 }, { "epoch": 1.4089729328883944, "grad_norm": 6.25390625, "learning_rate": 8.591027067111606e-06, "loss": 2.5481, "mean_token_accuracy": 0.48603504928806135, "step": 7600 }, { "epoch": 1.4091583240637746, "grad_norm": 5.68359375, "learning_rate": 8.590841675936225e-06, "loss": 2.7613, "mean_token_accuracy": 0.4571329799492971, "step": 7601 }, { "epoch": 1.4093437152391546, "grad_norm": 9.15625, "learning_rate": 8.590656284760847e-06, "loss": 2.4423, "mean_token_accuracy": 0.5031454298754163, "step": 7602 }, { "epoch": 1.4095291064145345, "grad_norm": 5.9296875, "learning_rate": 8.590470893585466e-06, "loss": 2.4711, "mean_token_accuracy": 0.4877356347944583, "step": 7603 }, { "epoch": 1.4097144975899147, "grad_norm": 5.40625, "learning_rate": 8.590285502410087e-06, "loss": 2.6203, "mean_token_accuracy": 0.48739495798319327, "step": 7604 }, { "epoch": 1.4098998887652947, "grad_norm": 6.43359375, "learning_rate": 8.590100111234705e-06, "loss": 3.2928, "mean_token_accuracy": 0.422360248447205, "step": 7605 }, { "epoch": 1.410085279940675, "grad_norm": 5.53515625, "learning_rate": 8.589914720059326e-06, "loss": 3.5048, "mean_token_accuracy": 0.40875912408759124, "step": 7606 }, { "epoch": 1.4102706711160549, "grad_norm": 6.82421875, "learning_rate": 8.589729328883946e-06, "loss": 3.0202, "mean_token_accuracy": 0.44676737160120844, "step": 7607 }, { "epoch": 1.410456062291435, "grad_norm": 6.234375, "learning_rate": 8.589543937708565e-06, "loss": 2.7959, "mean_token_accuracy": 0.47561937825469464, "step": 7608 }, { "epoch": 1.410641453466815, "grad_norm": 5.3671875, "learning_rate": 8.589358546533186e-06, "loss": 2.0832, "mean_token_accuracy": 0.5656208033207178, "step": 7609 }, { "epoch": 1.410826844642195, "grad_norm": 7.28515625, "learning_rate": 8.589173155357806e-06, "loss": 2.9394, "mean_token_accuracy": 0.4381611597231292, "step": 7610 }, { "epoch": 1.4110122358175752, "grad_norm": 6.23828125, "learning_rate": 8.588987764182427e-06, "loss": 2.9802, "mean_token_accuracy": 0.4591329068941009, "step": 7611 }, { "epoch": 1.4111976269929551, "grad_norm": 10.15625, "learning_rate": 8.588802373007045e-06, "loss": 2.3685, "mean_token_accuracy": 0.4988829018267841, "step": 7612 }, { "epoch": 1.4113830181683351, "grad_norm": 5.39453125, "learning_rate": 8.588616981831666e-06, "loss": 2.8282, "mean_token_accuracy": 0.4635056525079574, "step": 7613 }, { "epoch": 1.4115684093437153, "grad_norm": 7.0625, "learning_rate": 8.588431590656285e-06, "loss": 2.5143, "mean_token_accuracy": 0.49886694723211394, "step": 7614 }, { "epoch": 1.4117538005190953, "grad_norm": 6.82421875, "learning_rate": 8.588246199480905e-06, "loss": 3.0048, "mean_token_accuracy": 0.4475635593220339, "step": 7615 }, { "epoch": 1.4119391916944752, "grad_norm": 6.6015625, "learning_rate": 8.588060808305526e-06, "loss": 2.7165, "mean_token_accuracy": 0.46893813244524146, "step": 7616 }, { "epoch": 1.4121245828698554, "grad_norm": 7.046875, "learning_rate": 8.587875417130145e-06, "loss": 2.2444, "mean_token_accuracy": 0.5148683092608326, "step": 7617 }, { "epoch": 1.4123099740452354, "grad_norm": 9.3125, "learning_rate": 8.587690025954765e-06, "loss": 2.5737, "mean_token_accuracy": 0.47648514851485146, "step": 7618 }, { "epoch": 1.4124953652206154, "grad_norm": 5.515625, "learning_rate": 8.587504634779386e-06, "loss": 2.8072, "mean_token_accuracy": 0.4528301886792453, "step": 7619 }, { "epoch": 1.4126807563959956, "grad_norm": 6.03125, "learning_rate": 8.587319243604006e-06, "loss": 2.4569, "mean_token_accuracy": 0.4955592740378427, "step": 7620 }, { "epoch": 1.4128661475713757, "grad_norm": 11.484375, "learning_rate": 8.587133852428625e-06, "loss": 2.9092, "mean_token_accuracy": 0.45873055694932263, "step": 7621 }, { "epoch": 1.4130515387467557, "grad_norm": 7.765625, "learning_rate": 8.586948461253245e-06, "loss": 2.272, "mean_token_accuracy": 0.5451114518221488, "step": 7622 }, { "epoch": 1.4132369299221357, "grad_norm": 6.1953125, "learning_rate": 8.586763070077864e-06, "loss": 2.6782, "mean_token_accuracy": 0.4687468545546049, "step": 7623 }, { "epoch": 1.4134223210975159, "grad_norm": 7.01171875, "learning_rate": 8.586577678902485e-06, "loss": 3.4935, "mean_token_accuracy": 0.39215686274509803, "step": 7624 }, { "epoch": 1.4136077122728958, "grad_norm": 7.94140625, "learning_rate": 8.586392287727105e-06, "loss": 2.8194, "mean_token_accuracy": 0.4632405424696645, "step": 7625 }, { "epoch": 1.4137931034482758, "grad_norm": 5.84375, "learning_rate": 8.586206896551726e-06, "loss": 3.0251, "mean_token_accuracy": 0.43719706411854314, "step": 7626 }, { "epoch": 1.413978494623656, "grad_norm": 7.0234375, "learning_rate": 8.586021505376344e-06, "loss": 3.3982, "mean_token_accuracy": 0.39950654773201744, "step": 7627 }, { "epoch": 1.414163885799036, "grad_norm": 6.79296875, "learning_rate": 8.585836114200965e-06, "loss": 2.9332, "mean_token_accuracy": 0.4433802816901408, "step": 7628 }, { "epoch": 1.414349276974416, "grad_norm": 7.85546875, "learning_rate": 8.585650723025585e-06, "loss": 2.771, "mean_token_accuracy": 0.4742998537847261, "step": 7629 }, { "epoch": 1.4145346681497961, "grad_norm": 7.45703125, "learning_rate": 8.585465331850204e-06, "loss": 2.1059, "mean_token_accuracy": 0.5609429689108503, "step": 7630 }, { "epoch": 1.414720059325176, "grad_norm": 7.88671875, "learning_rate": 8.585279940674825e-06, "loss": 2.222, "mean_token_accuracy": 0.5087658989343417, "step": 7631 }, { "epoch": 1.414905450500556, "grad_norm": 8.546875, "learning_rate": 8.585094549499443e-06, "loss": 3.206, "mean_token_accuracy": 0.4407299493942647, "step": 7632 }, { "epoch": 1.4150908416759362, "grad_norm": 7.05078125, "learning_rate": 8.584909158324064e-06, "loss": 2.6048, "mean_token_accuracy": 0.47247013593299464, "step": 7633 }, { "epoch": 1.4152762328513162, "grad_norm": 8.109375, "learning_rate": 8.584723767148684e-06, "loss": 2.5969, "mean_token_accuracy": 0.4695606830921653, "step": 7634 }, { "epoch": 1.4154616240266964, "grad_norm": 6.78515625, "learning_rate": 8.584538375973305e-06, "loss": 3.0512, "mean_token_accuracy": 0.43850703650826023, "step": 7635 }, { "epoch": 1.4156470152020764, "grad_norm": 7.984375, "learning_rate": 8.584352984797924e-06, "loss": 2.8041, "mean_token_accuracy": 0.44735507858712187, "step": 7636 }, { "epoch": 1.4158324063774566, "grad_norm": 7.6953125, "learning_rate": 8.584167593622544e-06, "loss": 2.8911, "mean_token_accuracy": 0.44742818971275883, "step": 7637 }, { "epoch": 1.4160177975528365, "grad_norm": 6.65625, "learning_rate": 8.583982202447165e-06, "loss": 3.0607, "mean_token_accuracy": 0.43715680292861503, "step": 7638 }, { "epoch": 1.4162031887282165, "grad_norm": 7.1328125, "learning_rate": 8.583796811271783e-06, "loss": 3.06, "mean_token_accuracy": 0.44552746471723015, "step": 7639 }, { "epoch": 1.4163885799035967, "grad_norm": 6.91015625, "learning_rate": 8.583611420096404e-06, "loss": 3.1304, "mean_token_accuracy": 0.4266384088686012, "step": 7640 }, { "epoch": 1.4165739710789766, "grad_norm": 5.77734375, "learning_rate": 8.583426028921023e-06, "loss": 3.0355, "mean_token_accuracy": 0.4720872347990992, "step": 7641 }, { "epoch": 1.4167593622543566, "grad_norm": 5.953125, "learning_rate": 8.583240637745645e-06, "loss": 2.5934, "mean_token_accuracy": 0.47260686333534013, "step": 7642 }, { "epoch": 1.4169447534297368, "grad_norm": 6.1875, "learning_rate": 8.583055246570264e-06, "loss": 2.7985, "mean_token_accuracy": 0.46864975211431903, "step": 7643 }, { "epoch": 1.4171301446051168, "grad_norm": 11.3359375, "learning_rate": 8.582869855394884e-06, "loss": 2.4086, "mean_token_accuracy": 0.48792212474462204, "step": 7644 }, { "epoch": 1.4173155357804967, "grad_norm": 6.44921875, "learning_rate": 8.582684464219505e-06, "loss": 3.0392, "mean_token_accuracy": 0.4287531806615776, "step": 7645 }, { "epoch": 1.417500926955877, "grad_norm": 6.11328125, "learning_rate": 8.582499073044124e-06, "loss": 2.5881, "mean_token_accuracy": 0.4728149663840982, "step": 7646 }, { "epoch": 1.417686318131257, "grad_norm": 9.34375, "learning_rate": 8.582313681868744e-06, "loss": 2.4769, "mean_token_accuracy": 0.5116625983184161, "step": 7647 }, { "epoch": 1.417871709306637, "grad_norm": 8.6796875, "learning_rate": 8.582128290693363e-06, "loss": 2.9074, "mean_token_accuracy": 0.4423733263323707, "step": 7648 }, { "epoch": 1.418057100482017, "grad_norm": 6.140625, "learning_rate": 8.581942899517983e-06, "loss": 2.7226, "mean_token_accuracy": 0.4660577971646674, "step": 7649 }, { "epoch": 1.4182424916573972, "grad_norm": 7.34765625, "learning_rate": 8.581757508342604e-06, "loss": 2.1998, "mean_token_accuracy": 0.5422664790561749, "step": 7650 }, { "epoch": 1.4184278828327772, "grad_norm": 9.75, "learning_rate": 8.581572117167224e-06, "loss": 3.0443, "mean_token_accuracy": 0.4627366403067338, "step": 7651 }, { "epoch": 1.4186132740081572, "grad_norm": 15.359375, "learning_rate": 8.581386725991843e-06, "loss": 2.5842, "mean_token_accuracy": 0.4750251907298114, "step": 7652 }, { "epoch": 1.4187986651835374, "grad_norm": 9.6484375, "learning_rate": 8.581201334816464e-06, "loss": 2.6313, "mean_token_accuracy": 0.4674500587544066, "step": 7653 }, { "epoch": 1.4189840563589173, "grad_norm": 6.83203125, "learning_rate": 8.581015943641084e-06, "loss": 2.5113, "mean_token_accuracy": 0.49000799360511593, "step": 7654 }, { "epoch": 1.4191694475342973, "grad_norm": 11.8359375, "learning_rate": 8.580830552465703e-06, "loss": 2.8825, "mean_token_accuracy": 0.4323339406990227, "step": 7655 }, { "epoch": 1.4193548387096775, "grad_norm": 7.83203125, "learning_rate": 8.580645161290323e-06, "loss": 2.8854, "mean_token_accuracy": 0.44866385372714485, "step": 7656 }, { "epoch": 1.4195402298850575, "grad_norm": 6.53515625, "learning_rate": 8.580459770114942e-06, "loss": 2.78, "mean_token_accuracy": 0.46712714249168585, "step": 7657 }, { "epoch": 1.4197256210604374, "grad_norm": 9.1484375, "learning_rate": 8.580274378939564e-06, "loss": 2.6633, "mean_token_accuracy": 0.47552255225522555, "step": 7658 }, { "epoch": 1.4199110122358176, "grad_norm": 8.7734375, "learning_rate": 8.580088987764183e-06, "loss": 3.576, "mean_token_accuracy": 0.413478012564249, "step": 7659 }, { "epoch": 1.4200964034111976, "grad_norm": 7.578125, "learning_rate": 8.579903596588804e-06, "loss": 2.247, "mean_token_accuracy": 0.5579925650557621, "step": 7660 }, { "epoch": 1.4202817945865776, "grad_norm": 5.65234375, "learning_rate": 8.579718205413422e-06, "loss": 2.7061, "mean_token_accuracy": 0.45941807044410415, "step": 7661 }, { "epoch": 1.4204671857619577, "grad_norm": 5.92578125, "learning_rate": 8.579532814238043e-06, "loss": 2.3245, "mean_token_accuracy": 0.530825901512214, "step": 7662 }, { "epoch": 1.4206525769373377, "grad_norm": 5.7109375, "learning_rate": 8.579347423062663e-06, "loss": 2.6495, "mean_token_accuracy": 0.4896580835795694, "step": 7663 }, { "epoch": 1.420837968112718, "grad_norm": 5.30078125, "learning_rate": 8.579162031887282e-06, "loss": 2.5158, "mean_token_accuracy": 0.5123512095225905, "step": 7664 }, { "epoch": 1.4210233592880979, "grad_norm": 7.19140625, "learning_rate": 8.578976640711903e-06, "loss": 2.4797, "mean_token_accuracy": 0.5045794167269222, "step": 7665 }, { "epoch": 1.421208750463478, "grad_norm": 4.89453125, "learning_rate": 8.578791249536523e-06, "loss": 2.4155, "mean_token_accuracy": 0.5364612150049796, "step": 7666 }, { "epoch": 1.421394141638858, "grad_norm": 5.9375, "learning_rate": 8.578605858361144e-06, "loss": 3.0173, "mean_token_accuracy": 0.44624644708242767, "step": 7667 }, { "epoch": 1.421579532814238, "grad_norm": 6.30078125, "learning_rate": 8.578420467185763e-06, "loss": 2.3086, "mean_token_accuracy": 0.5440675657267402, "step": 7668 }, { "epoch": 1.4217649239896182, "grad_norm": 7.11328125, "learning_rate": 8.578235076010383e-06, "loss": 2.6213, "mean_token_accuracy": 0.47236965344989074, "step": 7669 }, { "epoch": 1.4219503151649981, "grad_norm": 7.4921875, "learning_rate": 8.578049684835002e-06, "loss": 3.0035, "mean_token_accuracy": 0.4582432432432432, "step": 7670 }, { "epoch": 1.4221357063403781, "grad_norm": 5.69921875, "learning_rate": 8.577864293659622e-06, "loss": 3.0463, "mean_token_accuracy": 0.44479085476747204, "step": 7671 }, { "epoch": 1.4223210975157583, "grad_norm": 6.640625, "learning_rate": 8.577678902484243e-06, "loss": 2.3926, "mean_token_accuracy": 0.5022312373225152, "step": 7672 }, { "epoch": 1.4225064886911383, "grad_norm": 6.7421875, "learning_rate": 8.577493511308862e-06, "loss": 3.6338, "mean_token_accuracy": 0.39423301424235346, "step": 7673 }, { "epoch": 1.4226918798665182, "grad_norm": 6.0859375, "learning_rate": 8.577308120133482e-06, "loss": 2.859, "mean_token_accuracy": 0.46866325785244706, "step": 7674 }, { "epoch": 1.4228772710418984, "grad_norm": 7.25, "learning_rate": 8.577122728958103e-06, "loss": 3.2182, "mean_token_accuracy": 0.43261490521942353, "step": 7675 }, { "epoch": 1.4230626622172784, "grad_norm": 11.421875, "learning_rate": 8.576937337782723e-06, "loss": 2.7552, "mean_token_accuracy": 0.46250515039142975, "step": 7676 }, { "epoch": 1.4232480533926586, "grad_norm": 9.046875, "learning_rate": 8.576751946607342e-06, "loss": 2.6495, "mean_token_accuracy": 0.48495627686379134, "step": 7677 }, { "epoch": 1.4234334445680386, "grad_norm": 5.83984375, "learning_rate": 8.576566555431962e-06, "loss": 2.8948, "mean_token_accuracy": 0.458525667925584, "step": 7678 }, { "epoch": 1.4236188357434187, "grad_norm": 10.1484375, "learning_rate": 8.576381164256581e-06, "loss": 2.3759, "mean_token_accuracy": 0.47884167114936343, "step": 7679 }, { "epoch": 1.4238042269187987, "grad_norm": 9.78125, "learning_rate": 8.576195773081202e-06, "loss": 2.9394, "mean_token_accuracy": 0.44356464152029945, "step": 7680 }, { "epoch": 1.4239896180941787, "grad_norm": 5.43359375, "learning_rate": 8.576010381905822e-06, "loss": 2.4844, "mean_token_accuracy": 0.5092643051771117, "step": 7681 }, { "epoch": 1.4241750092695589, "grad_norm": 7.9921875, "learning_rate": 8.575824990730441e-06, "loss": 3.3362, "mean_token_accuracy": 0.414251497005988, "step": 7682 }, { "epoch": 1.4243604004449388, "grad_norm": 7.1796875, "learning_rate": 8.575639599555063e-06, "loss": 2.3663, "mean_token_accuracy": 0.5028409090909091, "step": 7683 }, { "epoch": 1.4245457916203188, "grad_norm": 10.84375, "learning_rate": 8.575454208379682e-06, "loss": 2.2392, "mean_token_accuracy": 0.5089828830103268, "step": 7684 }, { "epoch": 1.424731182795699, "grad_norm": 7.91796875, "learning_rate": 8.575268817204302e-06, "loss": 2.4604, "mean_token_accuracy": 0.49593593114989243, "step": 7685 }, { "epoch": 1.424916573971079, "grad_norm": 9.4296875, "learning_rate": 8.575083426028921e-06, "loss": 3.0835, "mean_token_accuracy": 0.44105192779139735, "step": 7686 }, { "epoch": 1.425101965146459, "grad_norm": 9.1328125, "learning_rate": 8.574898034853542e-06, "loss": 3.0851, "mean_token_accuracy": 0.4320540067508439, "step": 7687 }, { "epoch": 1.4252873563218391, "grad_norm": 6.49609375, "learning_rate": 8.574712643678162e-06, "loss": 2.7192, "mean_token_accuracy": 0.47838874680306903, "step": 7688 }, { "epoch": 1.425472747497219, "grad_norm": 7.796875, "learning_rate": 8.574527252502781e-06, "loss": 2.8867, "mean_token_accuracy": 0.4886224281237226, "step": 7689 }, { "epoch": 1.425658138672599, "grad_norm": 7.203125, "learning_rate": 8.574341861327401e-06, "loss": 2.8562, "mean_token_accuracy": 0.46066738947114805, "step": 7690 }, { "epoch": 1.4258435298479792, "grad_norm": 7.32421875, "learning_rate": 8.574156470152022e-06, "loss": 2.9982, "mean_token_accuracy": 0.45857367593078135, "step": 7691 }, { "epoch": 1.4260289210233592, "grad_norm": 5.078125, "learning_rate": 8.573971078976642e-06, "loss": 3.5242, "mean_token_accuracy": 0.4244358331433782, "step": 7692 }, { "epoch": 1.4262143121987394, "grad_norm": 5.90625, "learning_rate": 8.573785687801261e-06, "loss": 2.8703, "mean_token_accuracy": 0.4478164322723908, "step": 7693 }, { "epoch": 1.4263997033741194, "grad_norm": 5.7421875, "learning_rate": 8.573600296625882e-06, "loss": 3.0314, "mean_token_accuracy": 0.4577229503983552, "step": 7694 }, { "epoch": 1.4265850945494996, "grad_norm": 6.90234375, "learning_rate": 8.5734149054505e-06, "loss": 2.6894, "mean_token_accuracy": 0.467238818286846, "step": 7695 }, { "epoch": 1.4267704857248795, "grad_norm": 6.0, "learning_rate": 8.573229514275121e-06, "loss": 3.5127, "mean_token_accuracy": 0.38895816800816235, "step": 7696 }, { "epoch": 1.4269558769002595, "grad_norm": 7.671875, "learning_rate": 8.573044123099742e-06, "loss": 2.2727, "mean_token_accuracy": 0.5161997002190707, "step": 7697 }, { "epoch": 1.4271412680756397, "grad_norm": 7.265625, "learning_rate": 8.57285873192436e-06, "loss": 2.7687, "mean_token_accuracy": 0.4763688760806916, "step": 7698 }, { "epoch": 1.4273266592510196, "grad_norm": 6.859375, "learning_rate": 8.57267334074898e-06, "loss": 2.7991, "mean_token_accuracy": 0.45061523112736945, "step": 7699 }, { "epoch": 1.4275120504263996, "grad_norm": 5.984375, "learning_rate": 8.572487949573601e-06, "loss": 2.8377, "mean_token_accuracy": 0.4703621581670362, "step": 7700 }, { "epoch": 1.4276974416017798, "grad_norm": 6.53515625, "learning_rate": 8.572302558398222e-06, "loss": 2.8186, "mean_token_accuracy": 0.48189030700241464, "step": 7701 }, { "epoch": 1.4278828327771598, "grad_norm": 7.78515625, "learning_rate": 8.57211716722284e-06, "loss": 2.557, "mean_token_accuracy": 0.5141654340653089, "step": 7702 }, { "epoch": 1.4280682239525397, "grad_norm": 7.18359375, "learning_rate": 8.571931776047461e-06, "loss": 3.5275, "mean_token_accuracy": 0.3997610196494955, "step": 7703 }, { "epoch": 1.42825361512792, "grad_norm": 6.96875, "learning_rate": 8.57174638487208e-06, "loss": 2.9627, "mean_token_accuracy": 0.46830685118742527, "step": 7704 }, { "epoch": 1.4284390063033, "grad_norm": 6.1328125, "learning_rate": 8.5715609936967e-06, "loss": 2.5705, "mean_token_accuracy": 0.5050747110234001, "step": 7705 }, { "epoch": 1.42862439747868, "grad_norm": 5.99609375, "learning_rate": 8.57137560252132e-06, "loss": 2.95, "mean_token_accuracy": 0.43867611246402477, "step": 7706 }, { "epoch": 1.42880978865406, "grad_norm": 5.58203125, "learning_rate": 8.571190211345941e-06, "loss": 3.1608, "mean_token_accuracy": 0.4391939665331134, "step": 7707 }, { "epoch": 1.4289951798294402, "grad_norm": 5.609375, "learning_rate": 8.57100482017056e-06, "loss": 2.7587, "mean_token_accuracy": 0.47407319108356183, "step": 7708 }, { "epoch": 1.4291805710048202, "grad_norm": 5.58984375, "learning_rate": 8.57081942899518e-06, "loss": 2.3133, "mean_token_accuracy": 0.5212418300653595, "step": 7709 }, { "epoch": 1.4293659621802002, "grad_norm": 7.63671875, "learning_rate": 8.570634037819801e-06, "loss": 3.1539, "mean_token_accuracy": 0.4487219943199748, "step": 7710 }, { "epoch": 1.4295513533555804, "grad_norm": 6.83203125, "learning_rate": 8.57044864664442e-06, "loss": 2.5381, "mean_token_accuracy": 0.4917435964113899, "step": 7711 }, { "epoch": 1.4297367445309603, "grad_norm": 6.421875, "learning_rate": 8.57026325546904e-06, "loss": 2.8026, "mean_token_accuracy": 0.4737026647966339, "step": 7712 }, { "epoch": 1.4299221357063403, "grad_norm": 5.73828125, "learning_rate": 8.57007786429366e-06, "loss": 2.6224, "mean_token_accuracy": 0.48362175525339923, "step": 7713 }, { "epoch": 1.4301075268817205, "grad_norm": 5.42578125, "learning_rate": 8.56989247311828e-06, "loss": 3.0498, "mean_token_accuracy": 0.4448973649730674, "step": 7714 }, { "epoch": 1.4302929180571005, "grad_norm": 5.84765625, "learning_rate": 8.5697070819429e-06, "loss": 2.5673, "mean_token_accuracy": 0.49084735754354886, "step": 7715 }, { "epoch": 1.4304783092324804, "grad_norm": 5.60546875, "learning_rate": 8.56952169076752e-06, "loss": 2.2223, "mean_token_accuracy": 0.5291132817455284, "step": 7716 }, { "epoch": 1.4306637004078606, "grad_norm": 7.4609375, "learning_rate": 8.56933629959214e-06, "loss": 2.5602, "mean_token_accuracy": 0.4908998988877654, "step": 7717 }, { "epoch": 1.4308490915832406, "grad_norm": 6.6640625, "learning_rate": 8.56915090841676e-06, "loss": 2.8266, "mean_token_accuracy": 0.47608958837772397, "step": 7718 }, { "epoch": 1.4310344827586206, "grad_norm": 6.8515625, "learning_rate": 8.56896551724138e-06, "loss": 3.219, "mean_token_accuracy": 0.45487195502810746, "step": 7719 }, { "epoch": 1.4312198739340007, "grad_norm": 5.32421875, "learning_rate": 8.568780126066e-06, "loss": 2.5192, "mean_token_accuracy": 0.49591309959130997, "step": 7720 }, { "epoch": 1.431405265109381, "grad_norm": 5.36328125, "learning_rate": 8.56859473489062e-06, "loss": 3.1209, "mean_token_accuracy": 0.42112526539278133, "step": 7721 }, { "epoch": 1.431590656284761, "grad_norm": 8.34375, "learning_rate": 8.568409343715239e-06, "loss": 2.957, "mean_token_accuracy": 0.433922145894668, "step": 7722 }, { "epoch": 1.4317760474601409, "grad_norm": 6.45703125, "learning_rate": 8.56822395253986e-06, "loss": 2.8379, "mean_token_accuracy": 0.44290260980267343, "step": 7723 }, { "epoch": 1.431961438635521, "grad_norm": 6.296875, "learning_rate": 8.56803856136448e-06, "loss": 3.0794, "mean_token_accuracy": 0.43896103896103894, "step": 7724 }, { "epoch": 1.432146829810901, "grad_norm": 5.5390625, "learning_rate": 8.5678531701891e-06, "loss": 3.2006, "mean_token_accuracy": 0.4332078411008698, "step": 7725 }, { "epoch": 1.432332220986281, "grad_norm": 7.234375, "learning_rate": 8.56766777901372e-06, "loss": 2.4364, "mean_token_accuracy": 0.5022441651705566, "step": 7726 }, { "epoch": 1.4325176121616612, "grad_norm": 5.7421875, "learning_rate": 8.56748238783834e-06, "loss": 3.2995, "mean_token_accuracy": 0.4223314606741573, "step": 7727 }, { "epoch": 1.4327030033370411, "grad_norm": 5.9921875, "learning_rate": 8.56729699666296e-06, "loss": 2.9298, "mean_token_accuracy": 0.4463683052090976, "step": 7728 }, { "epoch": 1.4328883945124211, "grad_norm": 6.9453125, "learning_rate": 8.567111605487579e-06, "loss": 2.9435, "mean_token_accuracy": 0.4688860435339309, "step": 7729 }, { "epoch": 1.4330737856878013, "grad_norm": 6.765625, "learning_rate": 8.566926214312199e-06, "loss": 2.9749, "mean_token_accuracy": 0.4762388818297332, "step": 7730 }, { "epoch": 1.4332591768631813, "grad_norm": 7.3671875, "learning_rate": 8.56674082313682e-06, "loss": 2.5405, "mean_token_accuracy": 0.48712953944622234, "step": 7731 }, { "epoch": 1.4334445680385612, "grad_norm": 6.49609375, "learning_rate": 8.56655543196144e-06, "loss": 3.418, "mean_token_accuracy": 0.41239316239316237, "step": 7732 }, { "epoch": 1.4336299592139414, "grad_norm": 5.9375, "learning_rate": 8.566370040786059e-06, "loss": 2.8689, "mean_token_accuracy": 0.4678377041068778, "step": 7733 }, { "epoch": 1.4338153503893214, "grad_norm": 4.5546875, "learning_rate": 8.56618464961068e-06, "loss": 3.0953, "mean_token_accuracy": 0.45127488648271047, "step": 7734 }, { "epoch": 1.4340007415647016, "grad_norm": 7.2734375, "learning_rate": 8.5659992584353e-06, "loss": 3.0279, "mean_token_accuracy": 0.44628237259816206, "step": 7735 }, { "epoch": 1.4341861327400816, "grad_norm": 6.1640625, "learning_rate": 8.565813867259919e-06, "loss": 2.5604, "mean_token_accuracy": 0.5005012531328321, "step": 7736 }, { "epoch": 1.4343715239154617, "grad_norm": 5.5859375, "learning_rate": 8.565628476084539e-06, "loss": 2.8696, "mean_token_accuracy": 0.5053003533568905, "step": 7737 }, { "epoch": 1.4345569150908417, "grad_norm": 7.5078125, "learning_rate": 8.565443084909158e-06, "loss": 2.9236, "mean_token_accuracy": 0.45195683266155245, "step": 7738 }, { "epoch": 1.4347423062662217, "grad_norm": 6.63671875, "learning_rate": 8.56525769373378e-06, "loss": 2.8162, "mean_token_accuracy": 0.45701575087170854, "step": 7739 }, { "epoch": 1.4349276974416019, "grad_norm": 5.4609375, "learning_rate": 8.565072302558399e-06, "loss": 2.5311, "mean_token_accuracy": 0.5023166459486947, "step": 7740 }, { "epoch": 1.4351130886169818, "grad_norm": 7.71875, "learning_rate": 8.56488691138302e-06, "loss": 2.6726, "mean_token_accuracy": 0.47072179732313574, "step": 7741 }, { "epoch": 1.4352984797923618, "grad_norm": 10.0078125, "learning_rate": 8.564701520207638e-06, "loss": 3.5682, "mean_token_accuracy": 0.44333649889205445, "step": 7742 }, { "epoch": 1.435483870967742, "grad_norm": 5.19921875, "learning_rate": 8.564516129032259e-06, "loss": 2.9369, "mean_token_accuracy": 0.45014245014245013, "step": 7743 }, { "epoch": 1.435669262143122, "grad_norm": 6.1953125, "learning_rate": 8.56433073785688e-06, "loss": 2.7812, "mean_token_accuracy": 0.47208317289179824, "step": 7744 }, { "epoch": 1.435854653318502, "grad_norm": 7.1328125, "learning_rate": 8.564145346681498e-06, "loss": 2.9734, "mean_token_accuracy": 0.45430031223389156, "step": 7745 }, { "epoch": 1.4360400444938821, "grad_norm": 5.734375, "learning_rate": 8.563959955506118e-06, "loss": 3.4659, "mean_token_accuracy": 0.4102279043913285, "step": 7746 }, { "epoch": 1.436225435669262, "grad_norm": 5.16796875, "learning_rate": 8.563774564330739e-06, "loss": 2.5578, "mean_token_accuracy": 0.49498327759197325, "step": 7747 }, { "epoch": 1.4364108268446423, "grad_norm": 7.73046875, "learning_rate": 8.56358917315536e-06, "loss": 3.353, "mean_token_accuracy": 0.4169259489732421, "step": 7748 }, { "epoch": 1.4365962180200222, "grad_norm": 7.13671875, "learning_rate": 8.563403781979978e-06, "loss": 3.2716, "mean_token_accuracy": 0.4422768572955639, "step": 7749 }, { "epoch": 1.4367816091954024, "grad_norm": 8.984375, "learning_rate": 8.563218390804599e-06, "loss": 3.1125, "mean_token_accuracy": 0.46705597179374175, "step": 7750 }, { "epoch": 1.4369670003707824, "grad_norm": 6.40625, "learning_rate": 8.563032999629218e-06, "loss": 3.058, "mean_token_accuracy": 0.4373939599592806, "step": 7751 }, { "epoch": 1.4371523915461624, "grad_norm": 8.984375, "learning_rate": 8.562847608453838e-06, "loss": 2.3864, "mean_token_accuracy": 0.5244825845532559, "step": 7752 }, { "epoch": 1.4373377827215426, "grad_norm": 8.1796875, "learning_rate": 8.562662217278459e-06, "loss": 3.3342, "mean_token_accuracy": 0.4258579207293909, "step": 7753 }, { "epoch": 1.4375231738969225, "grad_norm": 7.51171875, "learning_rate": 8.562476826103077e-06, "loss": 2.9517, "mean_token_accuracy": 0.49028884462151395, "step": 7754 }, { "epoch": 1.4377085650723025, "grad_norm": 6.41796875, "learning_rate": 8.562291434927698e-06, "loss": 3.0015, "mean_token_accuracy": 0.45316209799266133, "step": 7755 }, { "epoch": 1.4378939562476827, "grad_norm": 8.859375, "learning_rate": 8.562106043752318e-06, "loss": 2.7533, "mean_token_accuracy": 0.47435753563012006, "step": 7756 }, { "epoch": 1.4380793474230626, "grad_norm": 7.296875, "learning_rate": 8.561920652576939e-06, "loss": 3.0408, "mean_token_accuracy": 0.42903930131004364, "step": 7757 }, { "epoch": 1.4382647385984426, "grad_norm": 6.01953125, "learning_rate": 8.561735261401558e-06, "loss": 2.8047, "mean_token_accuracy": 0.48461074230537116, "step": 7758 }, { "epoch": 1.4384501297738228, "grad_norm": 7.0546875, "learning_rate": 8.561549870226178e-06, "loss": 3.2329, "mean_token_accuracy": 0.42741477272727274, "step": 7759 }, { "epoch": 1.4386355209492028, "grad_norm": 9.0703125, "learning_rate": 8.561364479050797e-06, "loss": 2.7614, "mean_token_accuracy": 0.46244945118428654, "step": 7760 }, { "epoch": 1.4388209121245827, "grad_norm": 10.2109375, "learning_rate": 8.561179087875417e-06, "loss": 2.421, "mean_token_accuracy": 0.5106582651830004, "step": 7761 }, { "epoch": 1.439006303299963, "grad_norm": 6.0078125, "learning_rate": 8.560993696700038e-06, "loss": 2.9374, "mean_token_accuracy": 0.45057893250494213, "step": 7762 }, { "epoch": 1.439191694475343, "grad_norm": 5.875, "learning_rate": 8.560808305524658e-06, "loss": 2.5396, "mean_token_accuracy": 0.49355634768302714, "step": 7763 }, { "epoch": 1.439377085650723, "grad_norm": 9.5, "learning_rate": 8.560622914349279e-06, "loss": 2.3521, "mean_token_accuracy": 0.5118898623279099, "step": 7764 }, { "epoch": 1.439562476826103, "grad_norm": 9.0390625, "learning_rate": 8.560437523173898e-06, "loss": 2.5188, "mean_token_accuracy": 0.5155672823218997, "step": 7765 }, { "epoch": 1.4397478680014832, "grad_norm": 7.421875, "learning_rate": 8.560252131998518e-06, "loss": 2.4701, "mean_token_accuracy": 0.47448036951501155, "step": 7766 }, { "epoch": 1.4399332591768632, "grad_norm": 7.43359375, "learning_rate": 8.560066740823137e-06, "loss": 2.911, "mean_token_accuracy": 0.45908538296081547, "step": 7767 }, { "epoch": 1.4401186503522432, "grad_norm": 6.20703125, "learning_rate": 8.559881349647757e-06, "loss": 2.9295, "mean_token_accuracy": 0.43610441346053047, "step": 7768 }, { "epoch": 1.4403040415276234, "grad_norm": 8.3828125, "learning_rate": 8.559695958472378e-06, "loss": 3.2024, "mean_token_accuracy": 0.4308510638297872, "step": 7769 }, { "epoch": 1.4404894327030033, "grad_norm": 6.453125, "learning_rate": 8.559510567296997e-06, "loss": 2.9795, "mean_token_accuracy": 0.4340358143501035, "step": 7770 }, { "epoch": 1.4406748238783833, "grad_norm": 6.83203125, "learning_rate": 8.559325176121617e-06, "loss": 2.9526, "mean_token_accuracy": 0.4266003166704366, "step": 7771 }, { "epoch": 1.4408602150537635, "grad_norm": 6.71875, "learning_rate": 8.559139784946238e-06, "loss": 2.7802, "mean_token_accuracy": 0.4729751149152005, "step": 7772 }, { "epoch": 1.4410456062291435, "grad_norm": 7.84375, "learning_rate": 8.558954393770858e-06, "loss": 2.4028, "mean_token_accuracy": 0.5047740292807129, "step": 7773 }, { "epoch": 1.4412309974045234, "grad_norm": 5.828125, "learning_rate": 8.558769002595477e-06, "loss": 2.6473, "mean_token_accuracy": 0.4792722547108512, "step": 7774 }, { "epoch": 1.4414163885799036, "grad_norm": 6.734375, "learning_rate": 8.558583611420097e-06, "loss": 2.7631, "mean_token_accuracy": 0.4400798934753662, "step": 7775 }, { "epoch": 1.4416017797552836, "grad_norm": 7.5, "learning_rate": 8.558398220244716e-06, "loss": 2.8502, "mean_token_accuracy": 0.43957139297283826, "step": 7776 }, { "epoch": 1.4417871709306638, "grad_norm": 6.4453125, "learning_rate": 8.558212829069337e-06, "loss": 2.6056, "mean_token_accuracy": 0.48202653799758743, "step": 7777 }, { "epoch": 1.4419725621060437, "grad_norm": 6.93359375, "learning_rate": 8.558027437893957e-06, "loss": 2.777, "mean_token_accuracy": 0.4604223344992875, "step": 7778 }, { "epoch": 1.442157953281424, "grad_norm": 5.35546875, "learning_rate": 8.557842046718578e-06, "loss": 2.5362, "mean_token_accuracy": 0.47174610195731503, "step": 7779 }, { "epoch": 1.442343344456804, "grad_norm": 6.5703125, "learning_rate": 8.557656655543197e-06, "loss": 2.3456, "mean_token_accuracy": 0.4970290492957746, "step": 7780 }, { "epoch": 1.4425287356321839, "grad_norm": 5.63671875, "learning_rate": 8.557471264367817e-06, "loss": 2.7937, "mean_token_accuracy": 0.45845697329376855, "step": 7781 }, { "epoch": 1.442714126807564, "grad_norm": 6.2734375, "learning_rate": 8.557285873192438e-06, "loss": 2.7995, "mean_token_accuracy": 0.4560407569141194, "step": 7782 }, { "epoch": 1.442899517982944, "grad_norm": 6.40625, "learning_rate": 8.557100482017056e-06, "loss": 2.1102, "mean_token_accuracy": 0.5638780462074059, "step": 7783 }, { "epoch": 1.443084909158324, "grad_norm": 5.48046875, "learning_rate": 8.556915090841677e-06, "loss": 3.5781, "mean_token_accuracy": 0.40634809905824903, "step": 7784 }, { "epoch": 1.4432703003337042, "grad_norm": 8.7421875, "learning_rate": 8.556729699666296e-06, "loss": 2.9379, "mean_token_accuracy": 0.4715755278830536, "step": 7785 }, { "epoch": 1.4434556915090841, "grad_norm": 9.59375, "learning_rate": 8.556544308490916e-06, "loss": 3.2369, "mean_token_accuracy": 0.43086037430995017, "step": 7786 }, { "epoch": 1.4436410826844641, "grad_norm": 7.2734375, "learning_rate": 8.556358917315537e-06, "loss": 2.5112, "mean_token_accuracy": 0.5290492957746479, "step": 7787 }, { "epoch": 1.4438264738598443, "grad_norm": 5.78125, "learning_rate": 8.556173526140157e-06, "loss": 3.2346, "mean_token_accuracy": 0.4644572526416907, "step": 7788 }, { "epoch": 1.4440118650352243, "grad_norm": 8.375, "learning_rate": 8.555988134964776e-06, "loss": 2.1068, "mean_token_accuracy": 0.5578516243135064, "step": 7789 }, { "epoch": 1.4441972562106042, "grad_norm": 6.44921875, "learning_rate": 8.555802743789396e-06, "loss": 2.8469, "mean_token_accuracy": 0.47569359194696786, "step": 7790 }, { "epoch": 1.4443826473859844, "grad_norm": 6.37890625, "learning_rate": 8.555617352614017e-06, "loss": 3.0, "mean_token_accuracy": 0.45267631518492996, "step": 7791 }, { "epoch": 1.4445680385613644, "grad_norm": 13.1484375, "learning_rate": 8.555431961438636e-06, "loss": 2.745, "mean_token_accuracy": 0.4603275898744948, "step": 7792 }, { "epoch": 1.4447534297367446, "grad_norm": 6.4296875, "learning_rate": 8.555246570263256e-06, "loss": 2.9126, "mean_token_accuracy": 0.46118276953029935, "step": 7793 }, { "epoch": 1.4449388209121246, "grad_norm": 5.90625, "learning_rate": 8.555061179087875e-06, "loss": 2.4989, "mean_token_accuracy": 0.5202341824157765, "step": 7794 }, { "epoch": 1.4451242120875047, "grad_norm": 5.75, "learning_rate": 8.554875787912495e-06, "loss": 3.1943, "mean_token_accuracy": 0.42486805339956535, "step": 7795 }, { "epoch": 1.4453096032628847, "grad_norm": 6.578125, "learning_rate": 8.554690396737116e-06, "loss": 2.7526, "mean_token_accuracy": 0.47821376986037556, "step": 7796 }, { "epoch": 1.4454949944382647, "grad_norm": 5.5625, "learning_rate": 8.554505005561736e-06, "loss": 2.9519, "mean_token_accuracy": 0.4281893554426317, "step": 7797 }, { "epoch": 1.4456803856136449, "grad_norm": 7.45703125, "learning_rate": 8.554319614386355e-06, "loss": 3.0369, "mean_token_accuracy": 0.44507493088898586, "step": 7798 }, { "epoch": 1.4458657767890248, "grad_norm": 5.91796875, "learning_rate": 8.554134223210976e-06, "loss": 3.1418, "mean_token_accuracy": 0.42290061445430605, "step": 7799 }, { "epoch": 1.4460511679644048, "grad_norm": 9.234375, "learning_rate": 8.553948832035596e-06, "loss": 2.6364, "mean_token_accuracy": 0.470231822971549, "step": 7800 }, { "epoch": 1.446236559139785, "grad_norm": 6.26953125, "learning_rate": 8.553763440860215e-06, "loss": 2.8642, "mean_token_accuracy": 0.43511791662009613, "step": 7801 }, { "epoch": 1.446421950315165, "grad_norm": 6.6796875, "learning_rate": 8.553578049684835e-06, "loss": 2.8581, "mean_token_accuracy": 0.45207793670458385, "step": 7802 }, { "epoch": 1.446607341490545, "grad_norm": 6.58203125, "learning_rate": 8.553392658509454e-06, "loss": 2.9414, "mean_token_accuracy": 0.45637514264965245, "step": 7803 }, { "epoch": 1.4467927326659251, "grad_norm": 6.9453125, "learning_rate": 8.553207267334076e-06, "loss": 3.1133, "mean_token_accuracy": 0.4517704517704518, "step": 7804 }, { "epoch": 1.446978123841305, "grad_norm": 5.953125, "learning_rate": 8.553021876158695e-06, "loss": 2.506, "mean_token_accuracy": 0.4781306171360096, "step": 7805 }, { "epoch": 1.4471635150166853, "grad_norm": 6.53515625, "learning_rate": 8.552836484983316e-06, "loss": 2.5493, "mean_token_accuracy": 0.4895051520162829, "step": 7806 }, { "epoch": 1.4473489061920652, "grad_norm": 5.77734375, "learning_rate": 8.552651093807936e-06, "loss": 2.8263, "mean_token_accuracy": 0.44857142857142857, "step": 7807 }, { "epoch": 1.4475342973674454, "grad_norm": 6.0859375, "learning_rate": 8.552465702632555e-06, "loss": 2.7532, "mean_token_accuracy": 0.4802801888229024, "step": 7808 }, { "epoch": 1.4477196885428254, "grad_norm": 6.1484375, "learning_rate": 8.552280311457176e-06, "loss": 3.4146, "mean_token_accuracy": 0.42637285764253235, "step": 7809 }, { "epoch": 1.4479050797182054, "grad_norm": 8.8359375, "learning_rate": 8.552094920281794e-06, "loss": 2.5432, "mean_token_accuracy": 0.4844570044408559, "step": 7810 }, { "epoch": 1.4480904708935856, "grad_norm": 6.5625, "learning_rate": 8.551909529106415e-06, "loss": 2.894, "mean_token_accuracy": 0.46410927056088347, "step": 7811 }, { "epoch": 1.4482758620689655, "grad_norm": 6.43359375, "learning_rate": 8.551724137931035e-06, "loss": 2.7324, "mean_token_accuracy": 0.47217235188509876, "step": 7812 }, { "epoch": 1.4484612532443455, "grad_norm": 11.5, "learning_rate": 8.551538746755656e-06, "loss": 3.1898, "mean_token_accuracy": 0.42460796139927626, "step": 7813 }, { "epoch": 1.4486466444197257, "grad_norm": 6.8828125, "learning_rate": 8.551353355580275e-06, "loss": 2.7847, "mean_token_accuracy": 0.4439958127710483, "step": 7814 }, { "epoch": 1.4488320355951056, "grad_norm": 8.4609375, "learning_rate": 8.551167964404895e-06, "loss": 2.2698, "mean_token_accuracy": 0.5164261168384879, "step": 7815 }, { "epoch": 1.4490174267704856, "grad_norm": 10.09375, "learning_rate": 8.550982573229516e-06, "loss": 2.4291, "mean_token_accuracy": 0.5037783375314862, "step": 7816 }, { "epoch": 1.4492028179458658, "grad_norm": 6.59375, "learning_rate": 8.550797182054134e-06, "loss": 2.6747, "mean_token_accuracy": 0.5052889576883385, "step": 7817 }, { "epoch": 1.4493882091212458, "grad_norm": 5.33984375, "learning_rate": 8.550611790878755e-06, "loss": 2.9248, "mean_token_accuracy": 0.44591346153846156, "step": 7818 }, { "epoch": 1.449573600296626, "grad_norm": 6.86328125, "learning_rate": 8.550426399703374e-06, "loss": 3.2043, "mean_token_accuracy": 0.4389318341531975, "step": 7819 }, { "epoch": 1.449758991472006, "grad_norm": 10.6875, "learning_rate": 8.550241008527996e-06, "loss": 2.5162, "mean_token_accuracy": 0.47845953002610964, "step": 7820 }, { "epoch": 1.4499443826473861, "grad_norm": 6.98828125, "learning_rate": 8.550055617352615e-06, "loss": 2.5938, "mean_token_accuracy": 0.5034168564920274, "step": 7821 }, { "epoch": 1.450129773822766, "grad_norm": 11.21875, "learning_rate": 8.549870226177235e-06, "loss": 3.2477, "mean_token_accuracy": 0.43099809026032765, "step": 7822 }, { "epoch": 1.450315164998146, "grad_norm": 8.1796875, "learning_rate": 8.549684835001854e-06, "loss": 2.6303, "mean_token_accuracy": 0.49120549120549123, "step": 7823 }, { "epoch": 1.4505005561735262, "grad_norm": 7.140625, "learning_rate": 8.549499443826474e-06, "loss": 3.3362, "mean_token_accuracy": 0.40784313725490196, "step": 7824 }, { "epoch": 1.4506859473489062, "grad_norm": 9.59375, "learning_rate": 8.549314052651095e-06, "loss": 3.2848, "mean_token_accuracy": 0.40755444646098005, "step": 7825 }, { "epoch": 1.4508713385242862, "grad_norm": 7.171875, "learning_rate": 8.549128661475714e-06, "loss": 3.2753, "mean_token_accuracy": 0.4197776012708499, "step": 7826 }, { "epoch": 1.4510567296996664, "grad_norm": 8.0859375, "learning_rate": 8.548943270300334e-06, "loss": 2.6093, "mean_token_accuracy": 0.4801697998787144, "step": 7827 }, { "epoch": 1.4512421208750463, "grad_norm": 11.1796875, "learning_rate": 8.548757879124955e-06, "loss": 3.0634, "mean_token_accuracy": 0.4393236978456504, "step": 7828 }, { "epoch": 1.4514275120504263, "grad_norm": 7.0, "learning_rate": 8.548572487949575e-06, "loss": 2.8975, "mean_token_accuracy": 0.46254571192638905, "step": 7829 }, { "epoch": 1.4516129032258065, "grad_norm": 6.4140625, "learning_rate": 8.548387096774194e-06, "loss": 1.963, "mean_token_accuracy": 0.5665490472829923, "step": 7830 }, { "epoch": 1.4517982944011865, "grad_norm": 10.125, "learning_rate": 8.548201705598814e-06, "loss": 3.4484, "mean_token_accuracy": 0.40723367485495615, "step": 7831 }, { "epoch": 1.4519836855765664, "grad_norm": 13.75, "learning_rate": 8.548016314423433e-06, "loss": 2.3999, "mean_token_accuracy": 0.49074759437453735, "step": 7832 }, { "epoch": 1.4521690767519466, "grad_norm": 7.7109375, "learning_rate": 8.547830923248054e-06, "loss": 3.0958, "mean_token_accuracy": 0.43508137432188065, "step": 7833 }, { "epoch": 1.4523544679273266, "grad_norm": 5.40234375, "learning_rate": 8.547645532072674e-06, "loss": 2.7748, "mean_token_accuracy": 0.453654299540886, "step": 7834 }, { "epoch": 1.4525398591027068, "grad_norm": 6.140625, "learning_rate": 8.547460140897293e-06, "loss": 3.1231, "mean_token_accuracy": 0.43324051003957204, "step": 7835 }, { "epoch": 1.4527252502780867, "grad_norm": 10.4296875, "learning_rate": 8.547274749721914e-06, "loss": 3.1024, "mean_token_accuracy": 0.4361518157822882, "step": 7836 }, { "epoch": 1.452910641453467, "grad_norm": 6.3515625, "learning_rate": 8.547089358546534e-06, "loss": 3.0712, "mean_token_accuracy": 0.4516714728185411, "step": 7837 }, { "epoch": 1.453096032628847, "grad_norm": 6.04296875, "learning_rate": 8.546903967371155e-06, "loss": 2.9177, "mean_token_accuracy": 0.45163277880468267, "step": 7838 }, { "epoch": 1.4532814238042269, "grad_norm": 10.8671875, "learning_rate": 8.546718576195773e-06, "loss": 3.0304, "mean_token_accuracy": 0.4455910902217227, "step": 7839 }, { "epoch": 1.453466814979607, "grad_norm": 10.25, "learning_rate": 8.546533185020394e-06, "loss": 3.0305, "mean_token_accuracy": 0.45133891706971807, "step": 7840 }, { "epoch": 1.453652206154987, "grad_norm": 11.4375, "learning_rate": 8.546347793845013e-06, "loss": 2.8, "mean_token_accuracy": 0.4720753256803818, "step": 7841 }, { "epoch": 1.453837597330367, "grad_norm": 6.70703125, "learning_rate": 8.546162402669633e-06, "loss": 3.2404, "mean_token_accuracy": 0.42706708268330734, "step": 7842 }, { "epoch": 1.4540229885057472, "grad_norm": 7.109375, "learning_rate": 8.545977011494254e-06, "loss": 2.9074, "mean_token_accuracy": 0.47761847433512095, "step": 7843 }, { "epoch": 1.4542083796811272, "grad_norm": 10.2421875, "learning_rate": 8.545791620318874e-06, "loss": 3.4967, "mean_token_accuracy": 0.3991161231331911, "step": 7844 }, { "epoch": 1.4543937708565071, "grad_norm": 6.2421875, "learning_rate": 8.545606229143495e-06, "loss": 3.4977, "mean_token_accuracy": 0.42142943817497874, "step": 7845 }, { "epoch": 1.4545791620318873, "grad_norm": 5.80859375, "learning_rate": 8.545420837968113e-06, "loss": 3.1832, "mean_token_accuracy": 0.42845287492590395, "step": 7846 }, { "epoch": 1.4547645532072673, "grad_norm": 10.1484375, "learning_rate": 8.545235446792734e-06, "loss": 2.7948, "mean_token_accuracy": 0.4635144350988861, "step": 7847 }, { "epoch": 1.4549499443826475, "grad_norm": 10.1953125, "learning_rate": 8.545050055617353e-06, "loss": 2.7546, "mean_token_accuracy": 0.4820136553551411, "step": 7848 }, { "epoch": 1.4551353355580274, "grad_norm": 6.49609375, "learning_rate": 8.544864664441973e-06, "loss": 2.6429, "mean_token_accuracy": 0.4621101364522417, "step": 7849 }, { "epoch": 1.4553207267334076, "grad_norm": 6.90625, "learning_rate": 8.544679273266594e-06, "loss": 2.61, "mean_token_accuracy": 0.480463347164592, "step": 7850 }, { "epoch": 1.4555061179087876, "grad_norm": 7.0390625, "learning_rate": 8.544493882091212e-06, "loss": 2.55, "mean_token_accuracy": 0.47000833101916134, "step": 7851 }, { "epoch": 1.4556915090841676, "grad_norm": 6.80859375, "learning_rate": 8.544308490915833e-06, "loss": 3.3924, "mean_token_accuracy": 0.4094776803246498, "step": 7852 }, { "epoch": 1.4558769002595477, "grad_norm": 6.0859375, "learning_rate": 8.544123099740453e-06, "loss": 2.6124, "mean_token_accuracy": 0.4919107391910739, "step": 7853 }, { "epoch": 1.4560622914349277, "grad_norm": 6.15234375, "learning_rate": 8.543937708565074e-06, "loss": 2.5512, "mean_token_accuracy": 0.48447946513849094, "step": 7854 }, { "epoch": 1.4562476826103077, "grad_norm": 7.20703125, "learning_rate": 8.543752317389693e-06, "loss": 3.0899, "mean_token_accuracy": 0.45342533267619517, "step": 7855 }, { "epoch": 1.4564330737856879, "grad_norm": 5.94140625, "learning_rate": 8.543566926214313e-06, "loss": 2.874, "mean_token_accuracy": 0.4457006843201428, "step": 7856 }, { "epoch": 1.4566184649610678, "grad_norm": 9.5234375, "learning_rate": 8.543381535038932e-06, "loss": 2.9142, "mean_token_accuracy": 0.44180091752907286, "step": 7857 }, { "epoch": 1.4568038561364478, "grad_norm": 9.0859375, "learning_rate": 8.543196143863553e-06, "loss": 3.3034, "mean_token_accuracy": 0.4189010460502585, "step": 7858 }, { "epoch": 1.456989247311828, "grad_norm": 6.50390625, "learning_rate": 8.543010752688173e-06, "loss": 2.5361, "mean_token_accuracy": 0.4909827015090173, "step": 7859 }, { "epoch": 1.457174638487208, "grad_norm": 6.8359375, "learning_rate": 8.542825361512793e-06, "loss": 2.8885, "mean_token_accuracy": 0.47942569567791593, "step": 7860 }, { "epoch": 1.457360029662588, "grad_norm": 7.55859375, "learning_rate": 8.542639970337412e-06, "loss": 2.8284, "mean_token_accuracy": 0.46159981768459435, "step": 7861 }, { "epoch": 1.4575454208379681, "grad_norm": 9.71875, "learning_rate": 8.542454579162033e-06, "loss": 2.7749, "mean_token_accuracy": 0.4645190023752969, "step": 7862 }, { "epoch": 1.457730812013348, "grad_norm": 9.7734375, "learning_rate": 8.542269187986653e-06, "loss": 2.5592, "mean_token_accuracy": 0.5148818208345491, "step": 7863 }, { "epoch": 1.4579162031887283, "grad_norm": 5.96875, "learning_rate": 8.542083796811272e-06, "loss": 3.2341, "mean_token_accuracy": 0.42153306026916326, "step": 7864 }, { "epoch": 1.4581015943641082, "grad_norm": 7.265625, "learning_rate": 8.541898405635893e-06, "loss": 2.715, "mean_token_accuracy": 0.5018199082133249, "step": 7865 }, { "epoch": 1.4582869855394884, "grad_norm": 8.8515625, "learning_rate": 8.541713014460511e-06, "loss": 3.0273, "mean_token_accuracy": 0.44452066540902074, "step": 7866 }, { "epoch": 1.4584723767148684, "grad_norm": 6.390625, "learning_rate": 8.541527623285132e-06, "loss": 2.8537, "mean_token_accuracy": 0.4850897510133179, "step": 7867 }, { "epoch": 1.4586577678902484, "grad_norm": 8.328125, "learning_rate": 8.541342232109752e-06, "loss": 2.9865, "mean_token_accuracy": 0.4617848303974378, "step": 7868 }, { "epoch": 1.4588431590656286, "grad_norm": 7.17578125, "learning_rate": 8.541156840934373e-06, "loss": 2.9651, "mean_token_accuracy": 0.46307053941908716, "step": 7869 }, { "epoch": 1.4590285502410085, "grad_norm": 8.5390625, "learning_rate": 8.540971449758992e-06, "loss": 2.9879, "mean_token_accuracy": 0.4407940724171676, "step": 7870 }, { "epoch": 1.4592139414163885, "grad_norm": 6.4140625, "learning_rate": 8.540786058583612e-06, "loss": 2.3967, "mean_token_accuracy": 0.5297192172116203, "step": 7871 }, { "epoch": 1.4593993325917687, "grad_norm": 6.77734375, "learning_rate": 8.540600667408233e-06, "loss": 2.9023, "mean_token_accuracy": 0.45032790867136263, "step": 7872 }, { "epoch": 1.4595847237671487, "grad_norm": 6.421875, "learning_rate": 8.540415276232851e-06, "loss": 2.7547, "mean_token_accuracy": 0.4853306086702671, "step": 7873 }, { "epoch": 1.4597701149425286, "grad_norm": 5.89453125, "learning_rate": 8.540229885057472e-06, "loss": 2.7977, "mean_token_accuracy": 0.48594132029339854, "step": 7874 }, { "epoch": 1.4599555061179088, "grad_norm": 6.6015625, "learning_rate": 8.54004449388209e-06, "loss": 2.6217, "mean_token_accuracy": 0.4970807875084861, "step": 7875 }, { "epoch": 1.4601408972932888, "grad_norm": 6.421875, "learning_rate": 8.539859102706713e-06, "loss": 3.7582, "mean_token_accuracy": 0.3912672450576021, "step": 7876 }, { "epoch": 1.460326288468669, "grad_norm": 7.39453125, "learning_rate": 8.539673711531332e-06, "loss": 3.7371, "mean_token_accuracy": 0.3766009852216749, "step": 7877 }, { "epoch": 1.460511679644049, "grad_norm": 5.34765625, "learning_rate": 8.539488320355952e-06, "loss": 3.2332, "mean_token_accuracy": 0.43877917414721723, "step": 7878 }, { "epoch": 1.4606970708194291, "grad_norm": 7.07421875, "learning_rate": 8.539302929180571e-06, "loss": 3.1175, "mean_token_accuracy": 0.43656716417910446, "step": 7879 }, { "epoch": 1.460882461994809, "grad_norm": 7.71875, "learning_rate": 8.539117538005191e-06, "loss": 2.9536, "mean_token_accuracy": 0.4444853392221813, "step": 7880 }, { "epoch": 1.461067853170189, "grad_norm": 13.203125, "learning_rate": 8.538932146829812e-06, "loss": 2.9463, "mean_token_accuracy": 0.4631434282858571, "step": 7881 }, { "epoch": 1.4612532443455692, "grad_norm": 5.9765625, "learning_rate": 8.53874675565443e-06, "loss": 2.639, "mean_token_accuracy": 0.48976548732547104, "step": 7882 }, { "epoch": 1.4614386355209492, "grad_norm": 7.6953125, "learning_rate": 8.538561364479051e-06, "loss": 2.5873, "mean_token_accuracy": 0.507802711014922, "step": 7883 }, { "epoch": 1.4616240266963292, "grad_norm": 8.25, "learning_rate": 8.538375973303672e-06, "loss": 2.9473, "mean_token_accuracy": 0.44396664060463903, "step": 7884 }, { "epoch": 1.4618094178717094, "grad_norm": 6.0390625, "learning_rate": 8.538190582128292e-06, "loss": 3.0263, "mean_token_accuracy": 0.4480888771326544, "step": 7885 }, { "epoch": 1.4619948090470893, "grad_norm": 6.04296875, "learning_rate": 8.538005190952911e-06, "loss": 2.1345, "mean_token_accuracy": 0.5498054474708172, "step": 7886 }, { "epoch": 1.4621802002224693, "grad_norm": 6.19921875, "learning_rate": 8.537819799777532e-06, "loss": 2.9138, "mean_token_accuracy": 0.4485056976994195, "step": 7887 }, { "epoch": 1.4623655913978495, "grad_norm": 6.90234375, "learning_rate": 8.537634408602152e-06, "loss": 3.6099, "mean_token_accuracy": 0.39652777777777776, "step": 7888 }, { "epoch": 1.4625509825732295, "grad_norm": 6.01953125, "learning_rate": 8.53744901742677e-06, "loss": 3.2111, "mean_token_accuracy": 0.44321608040201005, "step": 7889 }, { "epoch": 1.4627363737486094, "grad_norm": 7.125, "learning_rate": 8.537263626251391e-06, "loss": 2.608, "mean_token_accuracy": 0.4647887323943662, "step": 7890 }, { "epoch": 1.4629217649239896, "grad_norm": 6.80078125, "learning_rate": 8.53707823507601e-06, "loss": 3.7678, "mean_token_accuracy": 0.38359598853868193, "step": 7891 }, { "epoch": 1.4631071560993696, "grad_norm": 8.734375, "learning_rate": 8.536892843900632e-06, "loss": 3.1978, "mean_token_accuracy": 0.43611446997178555, "step": 7892 }, { "epoch": 1.4632925472747498, "grad_norm": 7.66015625, "learning_rate": 8.536707452725251e-06, "loss": 3.1695, "mean_token_accuracy": 0.4296479707180536, "step": 7893 }, { "epoch": 1.4634779384501297, "grad_norm": 6.78515625, "learning_rate": 8.536522061549872e-06, "loss": 2.4814, "mean_token_accuracy": 0.502254850550603, "step": 7894 }, { "epoch": 1.46366332962551, "grad_norm": 9.015625, "learning_rate": 8.53633667037449e-06, "loss": 2.5606, "mean_token_accuracy": 0.5031363088057901, "step": 7895 }, { "epoch": 1.46384872080089, "grad_norm": 9.3515625, "learning_rate": 8.536151279199111e-06, "loss": 3.4927, "mean_token_accuracy": 0.39601040763226364, "step": 7896 }, { "epoch": 1.4640341119762699, "grad_norm": 6.19921875, "learning_rate": 8.535965888023731e-06, "loss": 3.4803, "mean_token_accuracy": 0.4235897435897436, "step": 7897 }, { "epoch": 1.46421950315165, "grad_norm": 7.61328125, "learning_rate": 8.53578049684835e-06, "loss": 2.7534, "mean_token_accuracy": 0.47451330063902514, "step": 7898 }, { "epoch": 1.46440489432703, "grad_norm": 11.1171875, "learning_rate": 8.53559510567297e-06, "loss": 3.651, "mean_token_accuracy": 0.4040595399188092, "step": 7899 }, { "epoch": 1.46459028550241, "grad_norm": 7.64453125, "learning_rate": 8.535409714497591e-06, "loss": 3.2656, "mean_token_accuracy": 0.46500777604976673, "step": 7900 }, { "epoch": 1.4647756766777902, "grad_norm": 6.70703125, "learning_rate": 8.535224323322212e-06, "loss": 2.6647, "mean_token_accuracy": 0.4787762293769021, "step": 7901 }, { "epoch": 1.4649610678531702, "grad_norm": 9.21875, "learning_rate": 8.53503893214683e-06, "loss": 2.7427, "mean_token_accuracy": 0.4627295149355988, "step": 7902 }, { "epoch": 1.4651464590285501, "grad_norm": 5.734375, "learning_rate": 8.534853540971451e-06, "loss": 2.6982, "mean_token_accuracy": 0.4755515417365501, "step": 7903 }, { "epoch": 1.4653318502039303, "grad_norm": 5.375, "learning_rate": 8.53466814979607e-06, "loss": 2.716, "mean_token_accuracy": 0.46497622820919177, "step": 7904 }, { "epoch": 1.4655172413793103, "grad_norm": 7.15234375, "learning_rate": 8.53448275862069e-06, "loss": 3.1299, "mean_token_accuracy": 0.42488561260803254, "step": 7905 }, { "epoch": 1.4657026325546905, "grad_norm": 6.890625, "learning_rate": 8.53429736744531e-06, "loss": 3.622, "mean_token_accuracy": 0.3879200340280732, "step": 7906 }, { "epoch": 1.4658880237300704, "grad_norm": 7.4453125, "learning_rate": 8.53411197626993e-06, "loss": 2.9202, "mean_token_accuracy": 0.4488838153221715, "step": 7907 }, { "epoch": 1.4660734149054506, "grad_norm": 7.8671875, "learning_rate": 8.53392658509455e-06, "loss": 2.7822, "mean_token_accuracy": 0.49189862899875364, "step": 7908 }, { "epoch": 1.4662588060808306, "grad_norm": 6.90234375, "learning_rate": 8.53374119391917e-06, "loss": 3.0192, "mean_token_accuracy": 0.4500153798831129, "step": 7909 }, { "epoch": 1.4664441972562106, "grad_norm": 23.671875, "learning_rate": 8.533555802743791e-06, "loss": 3.4542, "mean_token_accuracy": 0.44963240036115054, "step": 7910 }, { "epoch": 1.4666295884315907, "grad_norm": 5.62890625, "learning_rate": 8.53337041156841e-06, "loss": 2.6252, "mean_token_accuracy": 0.4949092518813634, "step": 7911 }, { "epoch": 1.4668149796069707, "grad_norm": 5.953125, "learning_rate": 8.53318502039303e-06, "loss": 2.5613, "mean_token_accuracy": 0.4935454818372861, "step": 7912 }, { "epoch": 1.4670003707823507, "grad_norm": 5.3359375, "learning_rate": 8.532999629217649e-06, "loss": 2.805, "mean_token_accuracy": 0.4658688690375864, "step": 7913 }, { "epoch": 1.4671857619577309, "grad_norm": 5.75390625, "learning_rate": 8.53281423804227e-06, "loss": 3.1783, "mean_token_accuracy": 0.4179614667495339, "step": 7914 }, { "epoch": 1.4673711531331108, "grad_norm": 6.1484375, "learning_rate": 8.53262884686689e-06, "loss": 3.2537, "mean_token_accuracy": 0.43605616789974333, "step": 7915 }, { "epoch": 1.4675565443084908, "grad_norm": 5.0390625, "learning_rate": 8.53244345569151e-06, "loss": 2.7442, "mean_token_accuracy": 0.4841582712804789, "step": 7916 }, { "epoch": 1.467741935483871, "grad_norm": 5.14453125, "learning_rate": 8.53225806451613e-06, "loss": 2.4418, "mean_token_accuracy": 0.5245980901728514, "step": 7917 }, { "epoch": 1.467927326659251, "grad_norm": 5.83984375, "learning_rate": 8.53207267334075e-06, "loss": 2.8591, "mean_token_accuracy": 0.4605495741603728, "step": 7918 }, { "epoch": 1.4681127178346312, "grad_norm": 7.98828125, "learning_rate": 8.53188728216537e-06, "loss": 2.7281, "mean_token_accuracy": 0.4822843474360891, "step": 7919 }, { "epoch": 1.4682981090100111, "grad_norm": 5.5, "learning_rate": 8.531701890989989e-06, "loss": 2.8218, "mean_token_accuracy": 0.45784794604537093, "step": 7920 }, { "epoch": 1.4684835001853913, "grad_norm": 6.80859375, "learning_rate": 8.53151649981461e-06, "loss": 2.4831, "mean_token_accuracy": 0.49508742714404663, "step": 7921 }, { "epoch": 1.4686688913607713, "grad_norm": 6.8671875, "learning_rate": 8.531331108639228e-06, "loss": 3.4171, "mean_token_accuracy": 0.4472391903221943, "step": 7922 }, { "epoch": 1.4688542825361512, "grad_norm": 7.1171875, "learning_rate": 8.531145717463849e-06, "loss": 2.6813, "mean_token_accuracy": 0.474243399871217, "step": 7923 }, { "epoch": 1.4690396737115314, "grad_norm": 5.515625, "learning_rate": 8.53096032628847e-06, "loss": 2.7112, "mean_token_accuracy": 0.4808935094127564, "step": 7924 }, { "epoch": 1.4692250648869114, "grad_norm": 7.85546875, "learning_rate": 8.53077493511309e-06, "loss": 2.5896, "mean_token_accuracy": 0.467614756406646, "step": 7925 }, { "epoch": 1.4694104560622914, "grad_norm": 6.33984375, "learning_rate": 8.53058954393771e-06, "loss": 2.503, "mean_token_accuracy": 0.5060560985975351, "step": 7926 }, { "epoch": 1.4695958472376716, "grad_norm": 6.58203125, "learning_rate": 8.530404152762329e-06, "loss": 2.8306, "mean_token_accuracy": 0.48042995677065076, "step": 7927 }, { "epoch": 1.4697812384130515, "grad_norm": 5.5546875, "learning_rate": 8.53021876158695e-06, "loss": 2.6565, "mean_token_accuracy": 0.49041165942214265, "step": 7928 }, { "epoch": 1.4699666295884315, "grad_norm": 8.609375, "learning_rate": 8.530033370411568e-06, "loss": 2.8217, "mean_token_accuracy": 0.45671572604009736, "step": 7929 }, { "epoch": 1.4701520207638117, "grad_norm": 10.3203125, "learning_rate": 8.529847979236189e-06, "loss": 2.6475, "mean_token_accuracy": 0.49955357142857143, "step": 7930 }, { "epoch": 1.4703374119391917, "grad_norm": 7.8046875, "learning_rate": 8.52966258806081e-06, "loss": 2.4959, "mean_token_accuracy": 0.48393943681901797, "step": 7931 }, { "epoch": 1.4705228031145716, "grad_norm": 4.90625, "learning_rate": 8.529477196885428e-06, "loss": 2.7817, "mean_token_accuracy": 0.45967643838949035, "step": 7932 }, { "epoch": 1.4707081942899518, "grad_norm": 10.9921875, "learning_rate": 8.529291805710049e-06, "loss": 2.401, "mean_token_accuracy": 0.5024745269286754, "step": 7933 }, { "epoch": 1.4708935854653318, "grad_norm": 6.890625, "learning_rate": 8.52910641453467e-06, "loss": 2.9299, "mean_token_accuracy": 0.4428520243640272, "step": 7934 }, { "epoch": 1.471078976640712, "grad_norm": 6.45703125, "learning_rate": 8.52892102335929e-06, "loss": 2.7991, "mean_token_accuracy": 0.45350223546944857, "step": 7935 }, { "epoch": 1.471264367816092, "grad_norm": 6.1328125, "learning_rate": 8.528735632183908e-06, "loss": 3.2484, "mean_token_accuracy": 0.4583579444772593, "step": 7936 }, { "epoch": 1.4714497589914721, "grad_norm": 13.09375, "learning_rate": 8.528550241008529e-06, "loss": 3.0654, "mean_token_accuracy": 0.4769716088328076, "step": 7937 }, { "epoch": 1.471635150166852, "grad_norm": 7.41015625, "learning_rate": 8.528364849833148e-06, "loss": 2.432, "mean_token_accuracy": 0.5048942598187312, "step": 7938 }, { "epoch": 1.471820541342232, "grad_norm": 7.4375, "learning_rate": 8.528179458657768e-06, "loss": 3.3368, "mean_token_accuracy": 0.42369251577998196, "step": 7939 }, { "epoch": 1.4720059325176122, "grad_norm": 10.765625, "learning_rate": 8.527994067482389e-06, "loss": 3.0787, "mean_token_accuracy": 0.4425763944109963, "step": 7940 }, { "epoch": 1.4721913236929922, "grad_norm": 6.69140625, "learning_rate": 8.52780867630701e-06, "loss": 2.9073, "mean_token_accuracy": 0.4663755458515284, "step": 7941 }, { "epoch": 1.4723767148683722, "grad_norm": 6.13671875, "learning_rate": 8.527623285131628e-06, "loss": 2.9467, "mean_token_accuracy": 0.4519519519519519, "step": 7942 }, { "epoch": 1.4725621060437524, "grad_norm": 6.37109375, "learning_rate": 8.527437893956249e-06, "loss": 3.0013, "mean_token_accuracy": 0.44687219395390604, "step": 7943 }, { "epoch": 1.4727474972191323, "grad_norm": 7.8671875, "learning_rate": 8.527252502780869e-06, "loss": 2.591, "mean_token_accuracy": 0.48645703611457036, "step": 7944 }, { "epoch": 1.4729328883945123, "grad_norm": 5.25, "learning_rate": 8.527067111605488e-06, "loss": 3.1381, "mean_token_accuracy": 0.43822558963705244, "step": 7945 }, { "epoch": 1.4731182795698925, "grad_norm": 7.1015625, "learning_rate": 8.526881720430108e-06, "loss": 2.952, "mean_token_accuracy": 0.4658246656760773, "step": 7946 }, { "epoch": 1.4733036707452725, "grad_norm": 5.34765625, "learning_rate": 8.526696329254727e-06, "loss": 2.6892, "mean_token_accuracy": 0.472513423676809, "step": 7947 }, { "epoch": 1.4734890619206527, "grad_norm": 5.48046875, "learning_rate": 8.526510938079348e-06, "loss": 2.1906, "mean_token_accuracy": 0.5672640080767289, "step": 7948 }, { "epoch": 1.4736744530960326, "grad_norm": 5.93359375, "learning_rate": 8.526325546903968e-06, "loss": 2.855, "mean_token_accuracy": 0.45520361990950226, "step": 7949 }, { "epoch": 1.4738598442714128, "grad_norm": 5.8515625, "learning_rate": 8.526140155728589e-06, "loss": 3.4076, "mean_token_accuracy": 0.42464902472356814, "step": 7950 }, { "epoch": 1.4740452354467928, "grad_norm": 5.22265625, "learning_rate": 8.525954764553207e-06, "loss": 3.3606, "mean_token_accuracy": 0.42683444083133043, "step": 7951 }, { "epoch": 1.4742306266221727, "grad_norm": 8.4765625, "learning_rate": 8.525769373377828e-06, "loss": 2.5967, "mean_token_accuracy": 0.48363431151241537, "step": 7952 }, { "epoch": 1.474416017797553, "grad_norm": 5.72265625, "learning_rate": 8.525583982202448e-06, "loss": 2.9269, "mean_token_accuracy": 0.4478978770639656, "step": 7953 }, { "epoch": 1.474601408972933, "grad_norm": 6.6875, "learning_rate": 8.525398591027067e-06, "loss": 3.2819, "mean_token_accuracy": 0.4549947581249064, "step": 7954 }, { "epoch": 1.4747868001483129, "grad_norm": 10.71875, "learning_rate": 8.525213199851688e-06, "loss": 2.3159, "mean_token_accuracy": 0.5185848634124496, "step": 7955 }, { "epoch": 1.474972191323693, "grad_norm": 6.1640625, "learning_rate": 8.525027808676306e-06, "loss": 3.1898, "mean_token_accuracy": 0.44335325932251635, "step": 7956 }, { "epoch": 1.475157582499073, "grad_norm": 5.89453125, "learning_rate": 8.524842417500929e-06, "loss": 2.8931, "mean_token_accuracy": 0.47636462142760394, "step": 7957 }, { "epoch": 1.475342973674453, "grad_norm": 8.1171875, "learning_rate": 8.524657026325547e-06, "loss": 2.4912, "mean_token_accuracy": 0.4952357683850476, "step": 7958 }, { "epoch": 1.4755283648498332, "grad_norm": 8.7109375, "learning_rate": 8.524471635150168e-06, "loss": 2.802, "mean_token_accuracy": 0.47229072031148606, "step": 7959 }, { "epoch": 1.4757137560252132, "grad_norm": 5.9140625, "learning_rate": 8.524286243974787e-06, "loss": 3.6226, "mean_token_accuracy": 0.39038262668045504, "step": 7960 }, { "epoch": 1.4758991472005931, "grad_norm": 6.8046875, "learning_rate": 8.524100852799407e-06, "loss": 2.6566, "mean_token_accuracy": 0.47561489810260016, "step": 7961 }, { "epoch": 1.4760845383759733, "grad_norm": 6.0234375, "learning_rate": 8.523915461624028e-06, "loss": 2.5882, "mean_token_accuracy": 0.4835987477882129, "step": 7962 }, { "epoch": 1.4762699295513533, "grad_norm": 6.54296875, "learning_rate": 8.523730070448647e-06, "loss": 3.0214, "mean_token_accuracy": 0.4448405826627279, "step": 7963 }, { "epoch": 1.4764553207267335, "grad_norm": 6.16015625, "learning_rate": 8.523544679273267e-06, "loss": 3.0039, "mean_token_accuracy": 0.4574182335282035, "step": 7964 }, { "epoch": 1.4766407119021134, "grad_norm": 6.53125, "learning_rate": 8.523359288097887e-06, "loss": 2.7845, "mean_token_accuracy": 0.4779474130619169, "step": 7965 }, { "epoch": 1.4768261030774936, "grad_norm": 7.57421875, "learning_rate": 8.523173896922508e-06, "loss": 2.4045, "mean_token_accuracy": 0.5024134014764339, "step": 7966 }, { "epoch": 1.4770114942528736, "grad_norm": 5.80859375, "learning_rate": 8.522988505747127e-06, "loss": 2.7409, "mean_token_accuracy": 0.4648303000491884, "step": 7967 }, { "epoch": 1.4771968854282536, "grad_norm": 6.32421875, "learning_rate": 8.522803114571747e-06, "loss": 2.5199, "mean_token_accuracy": 0.4842148421484215, "step": 7968 }, { "epoch": 1.4773822766036337, "grad_norm": 6.125, "learning_rate": 8.522617723396368e-06, "loss": 2.4571, "mean_token_accuracy": 0.5066621499548328, "step": 7969 }, { "epoch": 1.4775676677790137, "grad_norm": 6.2421875, "learning_rate": 8.522432332220987e-06, "loss": 3.0045, "mean_token_accuracy": 0.4302426343154246, "step": 7970 }, { "epoch": 1.4777530589543937, "grad_norm": 5.55078125, "learning_rate": 8.522246941045607e-06, "loss": 2.7451, "mean_token_accuracy": 0.47583892617449663, "step": 7971 }, { "epoch": 1.4779384501297739, "grad_norm": 5.4375, "learning_rate": 8.522061549870226e-06, "loss": 2.5533, "mean_token_accuracy": 0.48322147651006714, "step": 7972 }, { "epoch": 1.4781238413051538, "grad_norm": 5.828125, "learning_rate": 8.521876158694848e-06, "loss": 2.7393, "mean_token_accuracy": 0.46737579452587885, "step": 7973 }, { "epoch": 1.4783092324805338, "grad_norm": 6.05859375, "learning_rate": 8.521690767519467e-06, "loss": 2.5999, "mean_token_accuracy": 0.5125786163522013, "step": 7974 }, { "epoch": 1.478494623655914, "grad_norm": 5.08984375, "learning_rate": 8.521505376344087e-06, "loss": 2.5591, "mean_token_accuracy": 0.484051724137931, "step": 7975 }, { "epoch": 1.478680014831294, "grad_norm": 5.7734375, "learning_rate": 8.521319985168706e-06, "loss": 2.5576, "mean_token_accuracy": 0.4828080229226361, "step": 7976 }, { "epoch": 1.4788654060066742, "grad_norm": 5.7421875, "learning_rate": 8.521134593993327e-06, "loss": 3.3981, "mean_token_accuracy": 0.4406198399593548, "step": 7977 }, { "epoch": 1.4790507971820541, "grad_norm": 5.73046875, "learning_rate": 8.520949202817947e-06, "loss": 3.8009, "mean_token_accuracy": 0.38714918759231903, "step": 7978 }, { "epoch": 1.4792361883574343, "grad_norm": 5.65234375, "learning_rate": 8.520763811642566e-06, "loss": 3.0012, "mean_token_accuracy": 0.46612466124661245, "step": 7979 }, { "epoch": 1.4794215795328143, "grad_norm": 5.94921875, "learning_rate": 8.520578420467186e-06, "loss": 2.8497, "mean_token_accuracy": 0.46706708744782655, "step": 7980 }, { "epoch": 1.4796069707081942, "grad_norm": 5.65625, "learning_rate": 8.520393029291807e-06, "loss": 3.0038, "mean_token_accuracy": 0.4650495877643685, "step": 7981 }, { "epoch": 1.4797923618835744, "grad_norm": 6.25390625, "learning_rate": 8.520207638116427e-06, "loss": 2.6513, "mean_token_accuracy": 0.4771202683277432, "step": 7982 }, { "epoch": 1.4799777530589544, "grad_norm": 5.17578125, "learning_rate": 8.520022246941046e-06, "loss": 1.716, "mean_token_accuracy": 0.6261138613861386, "step": 7983 }, { "epoch": 1.4801631442343344, "grad_norm": 5.8671875, "learning_rate": 8.519836855765667e-06, "loss": 2.8564, "mean_token_accuracy": 0.4692716705824162, "step": 7984 }, { "epoch": 1.4803485354097146, "grad_norm": 5.67578125, "learning_rate": 8.519651464590285e-06, "loss": 3.0229, "mean_token_accuracy": 0.45672031317964334, "step": 7985 }, { "epoch": 1.4805339265850945, "grad_norm": 5.0625, "learning_rate": 8.519466073414906e-06, "loss": 2.4549, "mean_token_accuracy": 0.49318568994889267, "step": 7986 }, { "epoch": 1.4807193177604745, "grad_norm": 5.64453125, "learning_rate": 8.519280682239526e-06, "loss": 3.1252, "mean_token_accuracy": 0.4632725042171446, "step": 7987 }, { "epoch": 1.4809047089358547, "grad_norm": 7.40234375, "learning_rate": 8.519095291064145e-06, "loss": 3.0907, "mean_token_accuracy": 0.4764595103578154, "step": 7988 }, { "epoch": 1.4810901001112347, "grad_norm": 7.48046875, "learning_rate": 8.518909899888766e-06, "loss": 2.7589, "mean_token_accuracy": 0.5023857164845313, "step": 7989 }, { "epoch": 1.4812754912866146, "grad_norm": 7.0859375, "learning_rate": 8.518724508713386e-06, "loss": 2.7447, "mean_token_accuracy": 0.4683005576753742, "step": 7990 }, { "epoch": 1.4814608824619948, "grad_norm": 6.6796875, "learning_rate": 8.518539117538007e-06, "loss": 3.489, "mean_token_accuracy": 0.41300906605266946, "step": 7991 }, { "epoch": 1.4816462736373748, "grad_norm": 6.9765625, "learning_rate": 8.518353726362626e-06, "loss": 3.5141, "mean_token_accuracy": 0.3938763012859767, "step": 7992 }, { "epoch": 1.481831664812755, "grad_norm": 6.41796875, "learning_rate": 8.518168335187246e-06, "loss": 2.7942, "mean_token_accuracy": 0.450038236043844, "step": 7993 }, { "epoch": 1.482017055988135, "grad_norm": 6.03125, "learning_rate": 8.517982944011865e-06, "loss": 3.0007, "mean_token_accuracy": 0.44292003685881026, "step": 7994 }, { "epoch": 1.4822024471635151, "grad_norm": 7.2734375, "learning_rate": 8.517797552836485e-06, "loss": 3.4898, "mean_token_accuracy": 0.4078859060402685, "step": 7995 }, { "epoch": 1.482387838338895, "grad_norm": 7.29296875, "learning_rate": 8.517612161661106e-06, "loss": 2.9845, "mean_token_accuracy": 0.46103575832305793, "step": 7996 }, { "epoch": 1.482573229514275, "grad_norm": 7.359375, "learning_rate": 8.517426770485726e-06, "loss": 2.7333, "mean_token_accuracy": 0.46562138542603776, "step": 7997 }, { "epoch": 1.4827586206896552, "grad_norm": 5.89453125, "learning_rate": 8.517241379310345e-06, "loss": 3.0431, "mean_token_accuracy": 0.4291243853959619, "step": 7998 }, { "epoch": 1.4829440118650352, "grad_norm": 5.328125, "learning_rate": 8.517055988134966e-06, "loss": 2.3431, "mean_token_accuracy": 0.5415029177087055, "step": 7999 }, { "epoch": 1.4831294030404152, "grad_norm": 6.28515625, "learning_rate": 8.516870596959586e-06, "loss": 2.9362, "mean_token_accuracy": 0.4457454050374404, "step": 8000 }, { "epoch": 1.4833147942157954, "grad_norm": 7.9453125, "learning_rate": 8.516685205784205e-06, "loss": 2.6462, "mean_token_accuracy": 0.4828295042321645, "step": 8001 }, { "epoch": 1.4835001853911753, "grad_norm": 6.578125, "learning_rate": 8.516499814608825e-06, "loss": 3.0169, "mean_token_accuracy": 0.46002273588480486, "step": 8002 }, { "epoch": 1.4836855765665553, "grad_norm": 5.62890625, "learning_rate": 8.516314423433444e-06, "loss": 3.276, "mean_token_accuracy": 0.415587219343696, "step": 8003 }, { "epoch": 1.4838709677419355, "grad_norm": 6.2421875, "learning_rate": 8.516129032258065e-06, "loss": 3.5254, "mean_token_accuracy": 0.4, "step": 8004 }, { "epoch": 1.4840563589173155, "grad_norm": 6.09375, "learning_rate": 8.515943641082685e-06, "loss": 2.7737, "mean_token_accuracy": 0.4678609062170706, "step": 8005 }, { "epoch": 1.4842417500926957, "grad_norm": 6.3046875, "learning_rate": 8.515758249907306e-06, "loss": 3.2617, "mean_token_accuracy": 0.4360674643217529, "step": 8006 }, { "epoch": 1.4844271412680756, "grad_norm": 5.49609375, "learning_rate": 8.515572858731926e-06, "loss": 2.9263, "mean_token_accuracy": 0.4251207729468599, "step": 8007 }, { "epoch": 1.4846125324434558, "grad_norm": 5.58984375, "learning_rate": 8.515387467556545e-06, "loss": 3.2865, "mean_token_accuracy": 0.43304046858359957, "step": 8008 }, { "epoch": 1.4847979236188358, "grad_norm": 6.07421875, "learning_rate": 8.515202076381165e-06, "loss": 2.5854, "mean_token_accuracy": 0.4826239224137931, "step": 8009 }, { "epoch": 1.4849833147942157, "grad_norm": 6.43359375, "learning_rate": 8.515016685205784e-06, "loss": 2.9379, "mean_token_accuracy": 0.4546051551469381, "step": 8010 }, { "epoch": 1.485168705969596, "grad_norm": 7.328125, "learning_rate": 8.514831294030405e-06, "loss": 2.9528, "mean_token_accuracy": 0.4526656701544594, "step": 8011 }, { "epoch": 1.485354097144976, "grad_norm": 7.66015625, "learning_rate": 8.514645902855025e-06, "loss": 2.7272, "mean_token_accuracy": 0.48482169171824346, "step": 8012 }, { "epoch": 1.4855394883203559, "grad_norm": 7.56640625, "learning_rate": 8.514460511679646e-06, "loss": 2.8881, "mean_token_accuracy": 0.4569899665551839, "step": 8013 }, { "epoch": 1.485724879495736, "grad_norm": 5.75390625, "learning_rate": 8.514275120504264e-06, "loss": 2.7165, "mean_token_accuracy": 0.4841328413284133, "step": 8014 }, { "epoch": 1.485910270671116, "grad_norm": 5.68359375, "learning_rate": 8.514089729328885e-06, "loss": 2.8969, "mean_token_accuracy": 0.45917482344195265, "step": 8015 }, { "epoch": 1.486095661846496, "grad_norm": 5.265625, "learning_rate": 8.513904338153505e-06, "loss": 2.6781, "mean_token_accuracy": 0.49518510592766957, "step": 8016 }, { "epoch": 1.4862810530218762, "grad_norm": 6.0390625, "learning_rate": 8.513718946978124e-06, "loss": 2.853, "mean_token_accuracy": 0.4585130239779196, "step": 8017 }, { "epoch": 1.4864664441972562, "grad_norm": 7.95703125, "learning_rate": 8.513533555802745e-06, "loss": 2.6231, "mean_token_accuracy": 0.4911820781696854, "step": 8018 }, { "epoch": 1.4866518353726363, "grad_norm": 5.92578125, "learning_rate": 8.513348164627364e-06, "loss": 2.4723, "mean_token_accuracy": 0.5340948425060574, "step": 8019 }, { "epoch": 1.4868372265480163, "grad_norm": 6.37890625, "learning_rate": 8.513162773451984e-06, "loss": 2.8321, "mean_token_accuracy": 0.4672766552293811, "step": 8020 }, { "epoch": 1.4870226177233965, "grad_norm": 9.9609375, "learning_rate": 8.512977382276605e-06, "loss": 3.1043, "mean_token_accuracy": 0.4352419460023477, "step": 8021 }, { "epoch": 1.4872080088987765, "grad_norm": 10.0, "learning_rate": 8.512791991101225e-06, "loss": 2.8343, "mean_token_accuracy": 0.4513677811550152, "step": 8022 }, { "epoch": 1.4873934000741564, "grad_norm": 6.2890625, "learning_rate": 8.512606599925844e-06, "loss": 2.6724, "mean_token_accuracy": 0.5019402985074627, "step": 8023 }, { "epoch": 1.4875787912495366, "grad_norm": 9.71875, "learning_rate": 8.512421208750464e-06, "loss": 2.7667, "mean_token_accuracy": 0.4716202270381837, "step": 8024 }, { "epoch": 1.4877641824249166, "grad_norm": 8.953125, "learning_rate": 8.512235817575085e-06, "loss": 2.9192, "mean_token_accuracy": 0.47719828672209624, "step": 8025 }, { "epoch": 1.4879495736002966, "grad_norm": 6.03515625, "learning_rate": 8.512050426399704e-06, "loss": 2.8504, "mean_token_accuracy": 0.45606117588065687, "step": 8026 }, { "epoch": 1.4881349647756767, "grad_norm": 6.7578125, "learning_rate": 8.511865035224324e-06, "loss": 2.508, "mean_token_accuracy": 0.4936126724578436, "step": 8027 }, { "epoch": 1.4883203559510567, "grad_norm": 6.1328125, "learning_rate": 8.511679644048943e-06, "loss": 3.1028, "mean_token_accuracy": 0.4385822510822511, "step": 8028 }, { "epoch": 1.4885057471264367, "grad_norm": 7.55859375, "learning_rate": 8.511494252873565e-06, "loss": 3.4797, "mean_token_accuracy": 0.44996247185389043, "step": 8029 }, { "epoch": 1.4886911383018169, "grad_norm": 5.375, "learning_rate": 8.511308861698184e-06, "loss": 3.0795, "mean_token_accuracy": 0.44145923877255366, "step": 8030 }, { "epoch": 1.4888765294771968, "grad_norm": 5.5546875, "learning_rate": 8.511123470522804e-06, "loss": 2.8607, "mean_token_accuracy": 0.45576947275701524, "step": 8031 }, { "epoch": 1.4890619206525768, "grad_norm": 10.203125, "learning_rate": 8.510938079347423e-06, "loss": 3.3016, "mean_token_accuracy": 0.42113529279784606, "step": 8032 }, { "epoch": 1.489247311827957, "grad_norm": 5.89453125, "learning_rate": 8.510752688172044e-06, "loss": 3.1071, "mean_token_accuracy": 0.45742395114780054, "step": 8033 }, { "epoch": 1.489432703003337, "grad_norm": 7.05078125, "learning_rate": 8.510567296996664e-06, "loss": 3.1208, "mean_token_accuracy": 0.43698665297741274, "step": 8034 }, { "epoch": 1.4896180941787172, "grad_norm": 5.80078125, "learning_rate": 8.510381905821283e-06, "loss": 3.8054, "mean_token_accuracy": 0.3904093567251462, "step": 8035 }, { "epoch": 1.4898034853540971, "grad_norm": 6.55859375, "learning_rate": 8.510196514645903e-06, "loss": 3.0763, "mean_token_accuracy": 0.42972579149556234, "step": 8036 }, { "epoch": 1.4899888765294773, "grad_norm": 5.64453125, "learning_rate": 8.510011123470524e-06, "loss": 3.0177, "mean_token_accuracy": 0.4467764060356653, "step": 8037 }, { "epoch": 1.4901742677048573, "grad_norm": 6.01171875, "learning_rate": 8.509825732295144e-06, "loss": 2.623, "mean_token_accuracy": 0.47832090582711956, "step": 8038 }, { "epoch": 1.4903596588802372, "grad_norm": 5.6953125, "learning_rate": 8.509640341119763e-06, "loss": 2.3997, "mean_token_accuracy": 0.5045176333430487, "step": 8039 }, { "epoch": 1.4905450500556174, "grad_norm": 5.8671875, "learning_rate": 8.509454949944384e-06, "loss": 3.6561, "mean_token_accuracy": 0.4093258426966292, "step": 8040 }, { "epoch": 1.4907304412309974, "grad_norm": 7.0078125, "learning_rate": 8.509269558769002e-06, "loss": 2.6608, "mean_token_accuracy": 0.46412213740458014, "step": 8041 }, { "epoch": 1.4909158324063774, "grad_norm": 7.79296875, "learning_rate": 8.509084167593623e-06, "loss": 1.7924, "mean_token_accuracy": 0.6134769291571979, "step": 8042 }, { "epoch": 1.4911012235817576, "grad_norm": 7.73046875, "learning_rate": 8.508898776418243e-06, "loss": 2.6075, "mean_token_accuracy": 0.4886890543768442, "step": 8043 }, { "epoch": 1.4912866147571375, "grad_norm": 5.32421875, "learning_rate": 8.508713385242862e-06, "loss": 2.7405, "mean_token_accuracy": 0.47257427799709123, "step": 8044 }, { "epoch": 1.4914720059325175, "grad_norm": 7.70703125, "learning_rate": 8.508527994067483e-06, "loss": 3.1725, "mean_token_accuracy": 0.4302888368462139, "step": 8045 }, { "epoch": 1.4916573971078977, "grad_norm": 7.1953125, "learning_rate": 8.508342602892103e-06, "loss": 3.4588, "mean_token_accuracy": 0.4131944444444444, "step": 8046 }, { "epoch": 1.4918427882832777, "grad_norm": 6.6484375, "learning_rate": 8.508157211716724e-06, "loss": 3.1421, "mean_token_accuracy": 0.4522912361735748, "step": 8047 }, { "epoch": 1.4920281794586578, "grad_norm": 7.17578125, "learning_rate": 8.507971820541343e-06, "loss": 2.4212, "mean_token_accuracy": 0.506042122223501, "step": 8048 }, { "epoch": 1.4922135706340378, "grad_norm": 5.5625, "learning_rate": 8.507786429365963e-06, "loss": 2.6861, "mean_token_accuracy": 0.4791395045632334, "step": 8049 }, { "epoch": 1.492398961809418, "grad_norm": 5.9921875, "learning_rate": 8.507601038190584e-06, "loss": 2.5831, "mean_token_accuracy": 0.499843961302403, "step": 8050 }, { "epoch": 1.492584352984798, "grad_norm": 6.484375, "learning_rate": 8.507415647015202e-06, "loss": 3.1776, "mean_token_accuracy": 0.4453522429474333, "step": 8051 }, { "epoch": 1.492769744160178, "grad_norm": 6.56640625, "learning_rate": 8.507230255839823e-06, "loss": 2.9997, "mean_token_accuracy": 0.4337108594270486, "step": 8052 }, { "epoch": 1.4929551353355581, "grad_norm": 6.1953125, "learning_rate": 8.507044864664442e-06, "loss": 3.1659, "mean_token_accuracy": 0.4317763268039511, "step": 8053 }, { "epoch": 1.493140526510938, "grad_norm": 5.4921875, "learning_rate": 8.506859473489064e-06, "loss": 2.6612, "mean_token_accuracy": 0.4779655900996076, "step": 8054 }, { "epoch": 1.493325917686318, "grad_norm": 7.01953125, "learning_rate": 8.506674082313683e-06, "loss": 2.6773, "mean_token_accuracy": 0.46616732258474036, "step": 8055 }, { "epoch": 1.4935113088616983, "grad_norm": 7.2734375, "learning_rate": 8.506488691138303e-06, "loss": 2.5989, "mean_token_accuracy": 0.5004418634010858, "step": 8056 }, { "epoch": 1.4936967000370782, "grad_norm": 5.48046875, "learning_rate": 8.506303299962922e-06, "loss": 2.6704, "mean_token_accuracy": 0.48788443616029825, "step": 8057 }, { "epoch": 1.4938820912124582, "grad_norm": 7.375, "learning_rate": 8.506117908787542e-06, "loss": 3.2257, "mean_token_accuracy": 0.438489488710096, "step": 8058 }, { "epoch": 1.4940674823878384, "grad_norm": 7.3125, "learning_rate": 8.505932517612163e-06, "loss": 2.5419, "mean_token_accuracy": 0.4870290302655961, "step": 8059 }, { "epoch": 1.4942528735632183, "grad_norm": 6.203125, "learning_rate": 8.505747126436782e-06, "loss": 3.5305, "mean_token_accuracy": 0.4162080352228949, "step": 8060 }, { "epoch": 1.4944382647385983, "grad_norm": 6.73046875, "learning_rate": 8.505561735261402e-06, "loss": 2.8037, "mean_token_accuracy": 0.47136836886465044, "step": 8061 }, { "epoch": 1.4946236559139785, "grad_norm": 7.11328125, "learning_rate": 8.505376344086023e-06, "loss": 2.29, "mean_token_accuracy": 0.5176174496644296, "step": 8062 }, { "epoch": 1.4948090470893585, "grad_norm": 6.34765625, "learning_rate": 8.505190952910643e-06, "loss": 3.1231, "mean_token_accuracy": 0.44406538734896944, "step": 8063 }, { "epoch": 1.4949944382647387, "grad_norm": 5.86328125, "learning_rate": 8.505005561735262e-06, "loss": 2.8737, "mean_token_accuracy": 0.4923916465526288, "step": 8064 }, { "epoch": 1.4951798294401186, "grad_norm": 5.99609375, "learning_rate": 8.504820170559882e-06, "loss": 2.9468, "mean_token_accuracy": 0.454889957732109, "step": 8065 }, { "epoch": 1.4953652206154988, "grad_norm": 7.12109375, "learning_rate": 8.504634779384501e-06, "loss": 2.7109, "mean_token_accuracy": 0.4582555348092825, "step": 8066 }, { "epoch": 1.4955506117908788, "grad_norm": 5.64453125, "learning_rate": 8.504449388209122e-06, "loss": 2.7472, "mean_token_accuracy": 0.4600153295861012, "step": 8067 }, { "epoch": 1.4957360029662587, "grad_norm": 5.890625, "learning_rate": 8.504263997033742e-06, "loss": 3.4505, "mean_token_accuracy": 0.4184286400768861, "step": 8068 }, { "epoch": 1.495921394141639, "grad_norm": 6.515625, "learning_rate": 8.504078605858361e-06, "loss": 2.9889, "mean_token_accuracy": 0.44995278564683666, "step": 8069 }, { "epoch": 1.496106785317019, "grad_norm": 5.734375, "learning_rate": 8.503893214682981e-06, "loss": 2.5889, "mean_token_accuracy": 0.483691431529899, "step": 8070 }, { "epoch": 1.4962921764923989, "grad_norm": 5.55859375, "learning_rate": 8.503707823507602e-06, "loss": 2.8222, "mean_token_accuracy": 0.4794265065858492, "step": 8071 }, { "epoch": 1.496477567667779, "grad_norm": 5.9453125, "learning_rate": 8.503522432332222e-06, "loss": 2.2889, "mean_token_accuracy": 0.5097512554011444, "step": 8072 }, { "epoch": 1.496662958843159, "grad_norm": 5.3828125, "learning_rate": 8.503337041156841e-06, "loss": 2.425, "mean_token_accuracy": 0.5084414678387051, "step": 8073 }, { "epoch": 1.496848350018539, "grad_norm": 6.22265625, "learning_rate": 8.503151649981462e-06, "loss": 2.6927, "mean_token_accuracy": 0.4720052083333333, "step": 8074 }, { "epoch": 1.4970337411939192, "grad_norm": 5.7734375, "learning_rate": 8.50296625880608e-06, "loss": 3.0126, "mean_token_accuracy": 0.44879131145626533, "step": 8075 }, { "epoch": 1.4972191323692992, "grad_norm": 5.40625, "learning_rate": 8.502780867630701e-06, "loss": 2.6459, "mean_token_accuracy": 0.46622291460133125, "step": 8076 }, { "epoch": 1.4974045235446793, "grad_norm": 5.43359375, "learning_rate": 8.502595476455322e-06, "loss": 3.2082, "mean_token_accuracy": 0.43470902226272623, "step": 8077 }, { "epoch": 1.4975899147200593, "grad_norm": 5.23828125, "learning_rate": 8.502410085279942e-06, "loss": 2.9601, "mean_token_accuracy": 0.4596949891067538, "step": 8078 }, { "epoch": 1.4977753058954395, "grad_norm": 6.23046875, "learning_rate": 8.50222469410456e-06, "loss": 3.3594, "mean_token_accuracy": 0.4082827860280276, "step": 8079 }, { "epoch": 1.4979606970708195, "grad_norm": 5.97265625, "learning_rate": 8.502039302929181e-06, "loss": 2.5842, "mean_token_accuracy": 0.4871861924686193, "step": 8080 }, { "epoch": 1.4981460882461994, "grad_norm": 6.67578125, "learning_rate": 8.501853911753802e-06, "loss": 3.2203, "mean_token_accuracy": 0.4296975546975547, "step": 8081 }, { "epoch": 1.4983314794215796, "grad_norm": 6.62890625, "learning_rate": 8.50166852057842e-06, "loss": 2.6627, "mean_token_accuracy": 0.4765916476110242, "step": 8082 }, { "epoch": 1.4985168705969596, "grad_norm": 5.4453125, "learning_rate": 8.501483129403041e-06, "loss": 2.9307, "mean_token_accuracy": 0.4637900874635569, "step": 8083 }, { "epoch": 1.4987022617723396, "grad_norm": 6.19140625, "learning_rate": 8.50129773822766e-06, "loss": 3.6897, "mean_token_accuracy": 0.40058020065272576, "step": 8084 }, { "epoch": 1.4988876529477198, "grad_norm": 6.58984375, "learning_rate": 8.50111234705228e-06, "loss": 2.7901, "mean_token_accuracy": 0.45515558267236117, "step": 8085 }, { "epoch": 1.4990730441230997, "grad_norm": 5.5546875, "learning_rate": 8.500926955876901e-06, "loss": 3.3627, "mean_token_accuracy": 0.42082210242587603, "step": 8086 }, { "epoch": 1.4992584352984797, "grad_norm": 5.125, "learning_rate": 8.500741564701521e-06, "loss": 2.1672, "mean_token_accuracy": 0.552065404475043, "step": 8087 }, { "epoch": 1.4994438264738599, "grad_norm": 6.83984375, "learning_rate": 8.500556173526142e-06, "loss": 2.5806, "mean_token_accuracy": 0.5043033889187736, "step": 8088 }, { "epoch": 1.4996292176492398, "grad_norm": 6.671875, "learning_rate": 8.50037078235076e-06, "loss": 3.0413, "mean_token_accuracy": 0.44064602960969046, "step": 8089 }, { "epoch": 1.4998146088246198, "grad_norm": 5.6484375, "learning_rate": 8.500185391175381e-06, "loss": 3.1397, "mean_token_accuracy": 0.43716448726772195, "step": 8090 }, { "epoch": 1.5, "grad_norm": 5.50390625, "learning_rate": 8.5e-06, "loss": 2.6092, "mean_token_accuracy": 0.47680293982544786, "step": 8091 }, { "epoch": 1.5001853911753802, "grad_norm": 5.4453125, "learning_rate": 8.49981460882462e-06, "loss": 2.9471, "mean_token_accuracy": 0.45460758628545395, "step": 8092 }, { "epoch": 1.5003707823507602, "grad_norm": 7.83203125, "learning_rate": 8.499629217649241e-06, "loss": 2.4695, "mean_token_accuracy": 0.49622411693057245, "step": 8093 }, { "epoch": 1.5005561735261401, "grad_norm": 5.921875, "learning_rate": 8.499443826473861e-06, "loss": 2.3143, "mean_token_accuracy": 0.5380031972097079, "step": 8094 }, { "epoch": 1.5007415647015203, "grad_norm": 6.55859375, "learning_rate": 8.49925843529848e-06, "loss": 2.5854, "mean_token_accuracy": 0.4996929124186218, "step": 8095 }, { "epoch": 1.5009269558769003, "grad_norm": 6.78515625, "learning_rate": 8.4990730441231e-06, "loss": 3.0328, "mean_token_accuracy": 0.463856993736952, "step": 8096 }, { "epoch": 1.5011123470522802, "grad_norm": 5.4765625, "learning_rate": 8.498887652947721e-06, "loss": 2.8831, "mean_token_accuracy": 0.478944820909971, "step": 8097 }, { "epoch": 1.5012977382276604, "grad_norm": 5.328125, "learning_rate": 8.49870226177234e-06, "loss": 2.6734, "mean_token_accuracy": 0.4641961549178044, "step": 8098 }, { "epoch": 1.5014831294030404, "grad_norm": 6.53515625, "learning_rate": 8.49851687059696e-06, "loss": 2.7245, "mean_token_accuracy": 0.4691149909692956, "step": 8099 }, { "epoch": 1.5016685205784204, "grad_norm": 5.25390625, "learning_rate": 8.49833147942158e-06, "loss": 3.2307, "mean_token_accuracy": 0.43688427689478526, "step": 8100 }, { "epoch": 1.5018539117538006, "grad_norm": 5.8671875, "learning_rate": 8.4981460882462e-06, "loss": 2.1917, "mean_token_accuracy": 0.5314391599752933, "step": 8101 }, { "epoch": 1.5020393029291805, "grad_norm": 8.3203125, "learning_rate": 8.49796069707082e-06, "loss": 2.8677, "mean_token_accuracy": 0.46004490820235105, "step": 8102 }, { "epoch": 1.5022246941045605, "grad_norm": 8.359375, "learning_rate": 8.49777530589544e-06, "loss": 2.7993, "mean_token_accuracy": 0.475659924580048, "step": 8103 }, { "epoch": 1.5024100852799407, "grad_norm": 6.42578125, "learning_rate": 8.49758991472006e-06, "loss": 2.9356, "mean_token_accuracy": 0.4481491205040693, "step": 8104 }, { "epoch": 1.5025954764553209, "grad_norm": 5.05078125, "learning_rate": 8.49740452354468e-06, "loss": 3.0479, "mean_token_accuracy": 0.4527973927213471, "step": 8105 }, { "epoch": 1.5027808676307006, "grad_norm": 5.28515625, "learning_rate": 8.4972191323693e-06, "loss": 2.9286, "mean_token_accuracy": 0.43651452282157677, "step": 8106 }, { "epoch": 1.5029662588060808, "grad_norm": 7.1484375, "learning_rate": 8.49703374119392e-06, "loss": 3.1495, "mean_token_accuracy": 0.4466930469191634, "step": 8107 }, { "epoch": 1.503151649981461, "grad_norm": 6.5078125, "learning_rate": 8.49684835001854e-06, "loss": 2.3214, "mean_token_accuracy": 0.5451505016722408, "step": 8108 }, { "epoch": 1.503337041156841, "grad_norm": 5.87109375, "learning_rate": 8.496662958843159e-06, "loss": 3.4007, "mean_token_accuracy": 0.41091424521615877, "step": 8109 }, { "epoch": 1.503522432332221, "grad_norm": 5.81640625, "learning_rate": 8.49647756766778e-06, "loss": 2.9649, "mean_token_accuracy": 0.4525146962769432, "step": 8110 }, { "epoch": 1.5037078235076011, "grad_norm": 5.66015625, "learning_rate": 8.4962921764924e-06, "loss": 2.7966, "mean_token_accuracy": 0.4538901317424807, "step": 8111 }, { "epoch": 1.503893214682981, "grad_norm": 6.4609375, "learning_rate": 8.49610678531702e-06, "loss": 2.2029, "mean_token_accuracy": 0.5537365791431912, "step": 8112 }, { "epoch": 1.504078605858361, "grad_norm": 8.375, "learning_rate": 8.495921394141639e-06, "loss": 3.0267, "mean_token_accuracy": 0.4945741532390661, "step": 8113 }, { "epoch": 1.5042639970337413, "grad_norm": 6.80859375, "learning_rate": 8.49573600296626e-06, "loss": 3.2295, "mean_token_accuracy": 0.43619281959512357, "step": 8114 }, { "epoch": 1.5044493882091212, "grad_norm": 6.46484375, "learning_rate": 8.49555061179088e-06, "loss": 3.7332, "mean_token_accuracy": 0.40746835443037976, "step": 8115 }, { "epoch": 1.5046347793845012, "grad_norm": 9.203125, "learning_rate": 8.495365220615499e-06, "loss": 2.7778, "mean_token_accuracy": 0.46215483234714005, "step": 8116 }, { "epoch": 1.5048201705598814, "grad_norm": 6.77734375, "learning_rate": 8.49517982944012e-06, "loss": 2.6319, "mean_token_accuracy": 0.4542042042042042, "step": 8117 }, { "epoch": 1.5050055617352616, "grad_norm": 8.328125, "learning_rate": 8.49499443826474e-06, "loss": 2.8717, "mean_token_accuracy": 0.45720850086157383, "step": 8118 }, { "epoch": 1.5051909529106413, "grad_norm": 5.77734375, "learning_rate": 8.49480904708936e-06, "loss": 2.9367, "mean_token_accuracy": 0.4533754249635745, "step": 8119 }, { "epoch": 1.5053763440860215, "grad_norm": 7.82421875, "learning_rate": 8.494623655913979e-06, "loss": 2.9803, "mean_token_accuracy": 0.44193078732220853, "step": 8120 }, { "epoch": 1.5055617352614017, "grad_norm": 6.6015625, "learning_rate": 8.4944382647386e-06, "loss": 2.7145, "mean_token_accuracy": 0.4983169516628518, "step": 8121 }, { "epoch": 1.5057471264367817, "grad_norm": 6.953125, "learning_rate": 8.494252873563218e-06, "loss": 3.0248, "mean_token_accuracy": 0.4381896689588997, "step": 8122 }, { "epoch": 1.5059325176121616, "grad_norm": 5.90234375, "learning_rate": 8.494067482387839e-06, "loss": 3.3629, "mean_token_accuracy": 0.4031028487361241, "step": 8123 }, { "epoch": 1.5061179087875418, "grad_norm": 6.18359375, "learning_rate": 8.49388209121246e-06, "loss": 3.0878, "mean_token_accuracy": 0.4521038495971352, "step": 8124 }, { "epoch": 1.5063032999629218, "grad_norm": 6.61328125, "learning_rate": 8.493696700037078e-06, "loss": 2.5116, "mean_token_accuracy": 0.48084748140872735, "step": 8125 }, { "epoch": 1.5064886911383017, "grad_norm": 6.3515625, "learning_rate": 8.4935113088617e-06, "loss": 2.8296, "mean_token_accuracy": 0.4741880983668106, "step": 8126 }, { "epoch": 1.506674082313682, "grad_norm": 6.90234375, "learning_rate": 8.493325917686319e-06, "loss": 3.4793, "mean_token_accuracy": 0.39510011192637734, "step": 8127 }, { "epoch": 1.506859473489062, "grad_norm": 7.0859375, "learning_rate": 8.49314052651094e-06, "loss": 2.7283, "mean_token_accuracy": 0.49664529595944534, "step": 8128 }, { "epoch": 1.5070448646644419, "grad_norm": 9.21875, "learning_rate": 8.492955135335558e-06, "loss": 2.4883, "mean_token_accuracy": 0.5003728560775541, "step": 8129 }, { "epoch": 1.507230255839822, "grad_norm": 5.3046875, "learning_rate": 8.492769744160179e-06, "loss": 2.2301, "mean_token_accuracy": 0.5423567258429643, "step": 8130 }, { "epoch": 1.507415647015202, "grad_norm": 5.3515625, "learning_rate": 8.4925843529848e-06, "loss": 2.6135, "mean_token_accuracy": 0.481780210283739, "step": 8131 }, { "epoch": 1.507601038190582, "grad_norm": 6.21484375, "learning_rate": 8.492398961809418e-06, "loss": 2.6376, "mean_token_accuracy": 0.4885966139639009, "step": 8132 }, { "epoch": 1.5077864293659622, "grad_norm": 6.35546875, "learning_rate": 8.492213570634039e-06, "loss": 2.6117, "mean_token_accuracy": 0.511895722601873, "step": 8133 }, { "epoch": 1.5079718205413424, "grad_norm": 5.859375, "learning_rate": 8.492028179458659e-06, "loss": 3.1044, "mean_token_accuracy": 0.45163014430785675, "step": 8134 }, { "epoch": 1.5081572117167221, "grad_norm": 5.53515625, "learning_rate": 8.49184278828328e-06, "loss": 3.212, "mean_token_accuracy": 0.4069689524234979, "step": 8135 }, { "epoch": 1.5083426028921023, "grad_norm": 9.8359375, "learning_rate": 8.491657397107898e-06, "loss": 2.7327, "mean_token_accuracy": 0.4587066825002699, "step": 8136 }, { "epoch": 1.5085279940674825, "grad_norm": 7.08203125, "learning_rate": 8.491472005932519e-06, "loss": 3.0077, "mean_token_accuracy": 0.4471955533097524, "step": 8137 }, { "epoch": 1.5087133852428625, "grad_norm": 8.4375, "learning_rate": 8.491286614757138e-06, "loss": 3.3161, "mean_token_accuracy": 0.41439710701774213, "step": 8138 }, { "epoch": 1.5088987764182424, "grad_norm": 8.5859375, "learning_rate": 8.491101223581758e-06, "loss": 2.1968, "mean_token_accuracy": 0.5201405152224824, "step": 8139 }, { "epoch": 1.5090841675936226, "grad_norm": 5.515625, "learning_rate": 8.490915832406379e-06, "loss": 2.6471, "mean_token_accuracy": 0.4886032191854813, "step": 8140 }, { "epoch": 1.5092695587690026, "grad_norm": 6.875, "learning_rate": 8.490730441230997e-06, "loss": 2.5118, "mean_token_accuracy": 0.48185401658284627, "step": 8141 }, { "epoch": 1.5094549499443826, "grad_norm": 7.578125, "learning_rate": 8.490545050055618e-06, "loss": 2.7685, "mean_token_accuracy": 0.4690656565656566, "step": 8142 }, { "epoch": 1.5096403411197628, "grad_norm": 6.69140625, "learning_rate": 8.490359658880238e-06, "loss": 2.8193, "mean_token_accuracy": 0.47326596683998823, "step": 8143 }, { "epoch": 1.5098257322951427, "grad_norm": 6.29296875, "learning_rate": 8.490174267704859e-06, "loss": 2.5314, "mean_token_accuracy": 0.4626950354609929, "step": 8144 }, { "epoch": 1.5100111234705227, "grad_norm": 5.6796875, "learning_rate": 8.489988876529478e-06, "loss": 2.8902, "mean_token_accuracy": 0.46473616473616475, "step": 8145 }, { "epoch": 1.5101965146459029, "grad_norm": 8.984375, "learning_rate": 8.489803485354098e-06, "loss": 2.4983, "mean_token_accuracy": 0.49335956714215445, "step": 8146 }, { "epoch": 1.510381905821283, "grad_norm": 7.078125, "learning_rate": 8.489618094178717e-06, "loss": 2.7449, "mean_token_accuracy": 0.49389875558777335, "step": 8147 }, { "epoch": 1.5105672969966628, "grad_norm": 5.21875, "learning_rate": 8.489432703003337e-06, "loss": 2.9781, "mean_token_accuracy": 0.46951478285471776, "step": 8148 }, { "epoch": 1.510752688172043, "grad_norm": 8.421875, "learning_rate": 8.489247311827958e-06, "loss": 2.8853, "mean_token_accuracy": 0.46274311694872444, "step": 8149 }, { "epoch": 1.5109380793474232, "grad_norm": 8.0625, "learning_rate": 8.489061920652578e-06, "loss": 3.4396, "mean_token_accuracy": 0.4183177570093458, "step": 8150 }, { "epoch": 1.5111234705228032, "grad_norm": 6.5546875, "learning_rate": 8.488876529477197e-06, "loss": 2.9917, "mean_token_accuracy": 0.4447632711621234, "step": 8151 }, { "epoch": 1.5113088616981831, "grad_norm": 8.9453125, "learning_rate": 8.488691138301818e-06, "loss": 2.9226, "mean_token_accuracy": 0.4702569517775431, "step": 8152 }, { "epoch": 1.5114942528735633, "grad_norm": 7.609375, "learning_rate": 8.488505747126438e-06, "loss": 2.6935, "mean_token_accuracy": 0.4769170579029734, "step": 8153 }, { "epoch": 1.5116796440489433, "grad_norm": 5.19140625, "learning_rate": 8.488320355951057e-06, "loss": 3.0485, "mean_token_accuracy": 0.45320447609359105, "step": 8154 }, { "epoch": 1.5118650352243233, "grad_norm": 5.04296875, "learning_rate": 8.488134964775677e-06, "loss": 2.66, "mean_token_accuracy": 0.47689503591722954, "step": 8155 }, { "epoch": 1.5120504263997034, "grad_norm": 8.1953125, "learning_rate": 8.487949573600296e-06, "loss": 3.149, "mean_token_accuracy": 0.4368541135418065, "step": 8156 }, { "epoch": 1.5122358175750834, "grad_norm": 6.58984375, "learning_rate": 8.487764182424917e-06, "loss": 2.7056, "mean_token_accuracy": 0.46638005159071366, "step": 8157 }, { "epoch": 1.5124212087504634, "grad_norm": 6.0546875, "learning_rate": 8.487578791249537e-06, "loss": 2.593, "mean_token_accuracy": 0.4851537645811241, "step": 8158 }, { "epoch": 1.5126065999258436, "grad_norm": 6.17578125, "learning_rate": 8.487393400074158e-06, "loss": 2.7058, "mean_token_accuracy": 0.47496871088861076, "step": 8159 }, { "epoch": 1.5127919911012235, "grad_norm": 7.16015625, "learning_rate": 8.487208008898777e-06, "loss": 2.5877, "mean_token_accuracy": 0.48800599700149927, "step": 8160 }, { "epoch": 1.5129773822766035, "grad_norm": 6.6796875, "learning_rate": 8.487022617723397e-06, "loss": 3.1071, "mean_token_accuracy": 0.4526328590538733, "step": 8161 }, { "epoch": 1.5131627734519837, "grad_norm": 6.37890625, "learning_rate": 8.486837226548018e-06, "loss": 2.2745, "mean_token_accuracy": 0.5206062217495347, "step": 8162 }, { "epoch": 1.5133481646273639, "grad_norm": 6.54296875, "learning_rate": 8.486651835372636e-06, "loss": 2.7022, "mean_token_accuracy": 0.48041566746602715, "step": 8163 }, { "epoch": 1.5135335558027436, "grad_norm": 6.19921875, "learning_rate": 8.486466444197257e-06, "loss": 2.9882, "mean_token_accuracy": 0.45608540925266905, "step": 8164 }, { "epoch": 1.5137189469781238, "grad_norm": 6.90625, "learning_rate": 8.486281053021876e-06, "loss": 3.0707, "mean_token_accuracy": 0.4313310069790628, "step": 8165 }, { "epoch": 1.513904338153504, "grad_norm": 6.47265625, "learning_rate": 8.486095661846496e-06, "loss": 3.4558, "mean_token_accuracy": 0.45211658570437196, "step": 8166 }, { "epoch": 1.514089729328884, "grad_norm": 7.09375, "learning_rate": 8.485910270671117e-06, "loss": 3.2208, "mean_token_accuracy": 0.4428692940175787, "step": 8167 }, { "epoch": 1.514275120504264, "grad_norm": 6.1015625, "learning_rate": 8.485724879495737e-06, "loss": 2.9897, "mean_token_accuracy": 0.46046119827313886, "step": 8168 }, { "epoch": 1.5144605116796441, "grad_norm": 5.8515625, "learning_rate": 8.485539488320358e-06, "loss": 2.7912, "mean_token_accuracy": 0.46390822376901264, "step": 8169 }, { "epoch": 1.514645902855024, "grad_norm": 5.8125, "learning_rate": 8.485354097144976e-06, "loss": 3.1433, "mean_token_accuracy": 0.43471267729200175, "step": 8170 }, { "epoch": 1.514831294030404, "grad_norm": 7.94140625, "learning_rate": 8.485168705969597e-06, "loss": 3.0334, "mean_token_accuracy": 0.4529342723004695, "step": 8171 }, { "epoch": 1.5150166852057843, "grad_norm": 7.06640625, "learning_rate": 8.484983314794216e-06, "loss": 2.5931, "mean_token_accuracy": 0.48040504997369804, "step": 8172 }, { "epoch": 1.5152020763811642, "grad_norm": 7.8515625, "learning_rate": 8.484797923618836e-06, "loss": 2.8142, "mean_token_accuracy": 0.4424185547660544, "step": 8173 }, { "epoch": 1.5153874675565442, "grad_norm": 5.59375, "learning_rate": 8.484612532443457e-06, "loss": 3.3718, "mean_token_accuracy": 0.4184770682466535, "step": 8174 }, { "epoch": 1.5155728587319244, "grad_norm": 6.0078125, "learning_rate": 8.484427141268077e-06, "loss": 2.9492, "mean_token_accuracy": 0.4616157267900703, "step": 8175 }, { "epoch": 1.5157582499073046, "grad_norm": 6.73828125, "learning_rate": 8.484241750092696e-06, "loss": 2.7291, "mean_token_accuracy": 0.47106109324758844, "step": 8176 }, { "epoch": 1.5159436410826843, "grad_norm": 7.39453125, "learning_rate": 8.484056358917316e-06, "loss": 3.4673, "mean_token_accuracy": 0.42366412213740456, "step": 8177 }, { "epoch": 1.5161290322580645, "grad_norm": 6.02734375, "learning_rate": 8.483870967741937e-06, "loss": 2.6788, "mean_token_accuracy": 0.4745308310991957, "step": 8178 }, { "epoch": 1.5163144234334447, "grad_norm": 5.45703125, "learning_rate": 8.483685576566556e-06, "loss": 3.0684, "mean_token_accuracy": 0.435375, "step": 8179 }, { "epoch": 1.5164998146088247, "grad_norm": 6.51171875, "learning_rate": 8.483500185391176e-06, "loss": 3.0418, "mean_token_accuracy": 0.4603131749460043, "step": 8180 }, { "epoch": 1.5166852057842046, "grad_norm": 5.71484375, "learning_rate": 8.483314794215795e-06, "loss": 3.086, "mean_token_accuracy": 0.4425393883225209, "step": 8181 }, { "epoch": 1.5168705969595848, "grad_norm": 6.23046875, "learning_rate": 8.483129403040416e-06, "loss": 2.984, "mean_token_accuracy": 0.4871883258646444, "step": 8182 }, { "epoch": 1.5170559881349648, "grad_norm": 6.4921875, "learning_rate": 8.482944011865036e-06, "loss": 2.9324, "mean_token_accuracy": 0.4427722772277228, "step": 8183 }, { "epoch": 1.5172413793103448, "grad_norm": 7.359375, "learning_rate": 8.482758620689656e-06, "loss": 2.8921, "mean_token_accuracy": 0.46385266529398306, "step": 8184 }, { "epoch": 1.517426770485725, "grad_norm": 5.83203125, "learning_rate": 8.482573229514275e-06, "loss": 3.2658, "mean_token_accuracy": 0.4217965653896962, "step": 8185 }, { "epoch": 1.517612161661105, "grad_norm": 11.703125, "learning_rate": 8.482387838338896e-06, "loss": 3.7565, "mean_token_accuracy": 0.4201095809269954, "step": 8186 }, { "epoch": 1.5177975528364849, "grad_norm": 9.4296875, "learning_rate": 8.482202447163516e-06, "loss": 2.9116, "mean_token_accuracy": 0.45659928656361476, "step": 8187 }, { "epoch": 1.517982944011865, "grad_norm": 8.703125, "learning_rate": 8.482017055988135e-06, "loss": 2.8034, "mean_token_accuracy": 0.4729808932129879, "step": 8188 }, { "epoch": 1.518168335187245, "grad_norm": 5.02734375, "learning_rate": 8.481831664812756e-06, "loss": 2.5335, "mean_token_accuracy": 0.5016320820704127, "step": 8189 }, { "epoch": 1.518353726362625, "grad_norm": 7.20703125, "learning_rate": 8.481646273637374e-06, "loss": 2.9734, "mean_token_accuracy": 0.47483431846677415, "step": 8190 }, { "epoch": 1.5185391175380052, "grad_norm": 6.12890625, "learning_rate": 8.481460882461997e-06, "loss": 3.1681, "mean_token_accuracy": 0.47367218732153055, "step": 8191 }, { "epoch": 1.5187245087133854, "grad_norm": 6.1171875, "learning_rate": 8.481275491286615e-06, "loss": 2.5867, "mean_token_accuracy": 0.5146371213661314, "step": 8192 }, { "epoch": 1.5189098998887653, "grad_norm": 5.5703125, "learning_rate": 8.481090100111236e-06, "loss": 2.6973, "mean_token_accuracy": 0.47012665198237885, "step": 8193 }, { "epoch": 1.5190952910641453, "grad_norm": 9.3671875, "learning_rate": 8.480904708935855e-06, "loss": 2.919, "mean_token_accuracy": 0.44632833085118573, "step": 8194 }, { "epoch": 1.5192806822395255, "grad_norm": 6.84375, "learning_rate": 8.480719317760475e-06, "loss": 2.5664, "mean_token_accuracy": 0.48267352185089973, "step": 8195 }, { "epoch": 1.5194660734149055, "grad_norm": 5.50390625, "learning_rate": 8.480533926585096e-06, "loss": 3.0994, "mean_token_accuracy": 0.4313366466126079, "step": 8196 }, { "epoch": 1.5196514645902854, "grad_norm": 5.2578125, "learning_rate": 8.480348535409714e-06, "loss": 2.7937, "mean_token_accuracy": 0.46343849351919786, "step": 8197 }, { "epoch": 1.5198368557656656, "grad_norm": 5.97265625, "learning_rate": 8.480163144234335e-06, "loss": 2.4882, "mean_token_accuracy": 0.4899538106235566, "step": 8198 }, { "epoch": 1.5200222469410456, "grad_norm": 6.90625, "learning_rate": 8.479977753058955e-06, "loss": 2.4123, "mean_token_accuracy": 0.5082036180058898, "step": 8199 }, { "epoch": 1.5202076381164256, "grad_norm": 6.65234375, "learning_rate": 8.479792361883576e-06, "loss": 2.4942, "mean_token_accuracy": 0.4986691717551276, "step": 8200 }, { "epoch": 1.5203930292918058, "grad_norm": 9.9921875, "learning_rate": 8.479606970708195e-06, "loss": 3.6525, "mean_token_accuracy": 0.3978559262598938, "step": 8201 }, { "epoch": 1.5205784204671857, "grad_norm": 8.375, "learning_rate": 8.479421579532815e-06, "loss": 3.102, "mean_token_accuracy": 0.4873556497588072, "step": 8202 }, { "epoch": 1.5207638116425657, "grad_norm": 7.8984375, "learning_rate": 8.479236188357434e-06, "loss": 2.454, "mean_token_accuracy": 0.5201238390092879, "step": 8203 }, { "epoch": 1.5209492028179459, "grad_norm": 6.47265625, "learning_rate": 8.479050797182054e-06, "loss": 2.9256, "mean_token_accuracy": 0.4726725003586286, "step": 8204 }, { "epoch": 1.521134593993326, "grad_norm": 11.140625, "learning_rate": 8.478865406006675e-06, "loss": 4.4489, "mean_token_accuracy": 0.39100684261974583, "step": 8205 }, { "epoch": 1.5213199851687058, "grad_norm": 8.0234375, "learning_rate": 8.478680014831294e-06, "loss": 3.802, "mean_token_accuracy": 0.41375336150595465, "step": 8206 }, { "epoch": 1.521505376344086, "grad_norm": 7.92578125, "learning_rate": 8.478494623655916e-06, "loss": 2.3995, "mean_token_accuracy": 0.5010091416359966, "step": 8207 }, { "epoch": 1.5216907675194662, "grad_norm": 6.01953125, "learning_rate": 8.478309232480535e-06, "loss": 2.7623, "mean_token_accuracy": 0.46704826732673266, "step": 8208 }, { "epoch": 1.5218761586948462, "grad_norm": 11.9453125, "learning_rate": 8.478123841305155e-06, "loss": 2.9639, "mean_token_accuracy": 0.4564993564993565, "step": 8209 }, { "epoch": 1.5220615498702261, "grad_norm": 8.640625, "learning_rate": 8.477938450129774e-06, "loss": 2.3315, "mean_token_accuracy": 0.5261520225218551, "step": 8210 }, { "epoch": 1.5222469410456063, "grad_norm": 6.0859375, "learning_rate": 8.477753058954395e-06, "loss": 2.5042, "mean_token_accuracy": 0.5022047431772136, "step": 8211 }, { "epoch": 1.5224323322209863, "grad_norm": 8.65625, "learning_rate": 8.477567667779015e-06, "loss": 2.6928, "mean_token_accuracy": 0.45893719806763283, "step": 8212 }, { "epoch": 1.5226177233963663, "grad_norm": 8.0078125, "learning_rate": 8.477382276603634e-06, "loss": 3.128, "mean_token_accuracy": 0.44004477403106196, "step": 8213 }, { "epoch": 1.5228031145717464, "grad_norm": 11.3046875, "learning_rate": 8.477196885428254e-06, "loss": 2.6875, "mean_token_accuracy": 0.4664804469273743, "step": 8214 }, { "epoch": 1.5229885057471264, "grad_norm": 6.04296875, "learning_rate": 8.477011494252875e-06, "loss": 2.8386, "mean_token_accuracy": 0.4670073650984518, "step": 8215 }, { "epoch": 1.5231738969225064, "grad_norm": 8.046875, "learning_rate": 8.476826103077495e-06, "loss": 2.7261, "mean_token_accuracy": 0.4602086438152012, "step": 8216 }, { "epoch": 1.5233592880978866, "grad_norm": 9.1640625, "learning_rate": 8.476640711902114e-06, "loss": 2.6466, "mean_token_accuracy": 0.46522430020779854, "step": 8217 }, { "epoch": 1.5235446792732668, "grad_norm": 6.7578125, "learning_rate": 8.476455320726735e-06, "loss": 2.894, "mean_token_accuracy": 0.4687819856704197, "step": 8218 }, { "epoch": 1.5237300704486465, "grad_norm": 5.5, "learning_rate": 8.476269929551353e-06, "loss": 2.8042, "mean_token_accuracy": 0.4664469742360695, "step": 8219 }, { "epoch": 1.5239154616240267, "grad_norm": 8.4296875, "learning_rate": 8.476084538375974e-06, "loss": 2.5862, "mean_token_accuracy": 0.49321941958231624, "step": 8220 }, { "epoch": 1.5241008527994069, "grad_norm": 6.05859375, "learning_rate": 8.475899147200594e-06, "loss": 3.2337, "mean_token_accuracy": 0.4468392692863714, "step": 8221 }, { "epoch": 1.5242862439747868, "grad_norm": 6.3828125, "learning_rate": 8.475713756025213e-06, "loss": 2.5924, "mean_token_accuracy": 0.49598416947968804, "step": 8222 }, { "epoch": 1.5244716351501668, "grad_norm": 6.36328125, "learning_rate": 8.475528364849834e-06, "loss": 2.7448, "mean_token_accuracy": 0.4636542239685658, "step": 8223 }, { "epoch": 1.524657026325547, "grad_norm": 8.671875, "learning_rate": 8.475342973674454e-06, "loss": 2.7723, "mean_token_accuracy": 0.47716150081566067, "step": 8224 }, { "epoch": 1.524842417500927, "grad_norm": 6.21484375, "learning_rate": 8.475157582499075e-06, "loss": 2.6926, "mean_token_accuracy": 0.4734565473849784, "step": 8225 }, { "epoch": 1.525027808676307, "grad_norm": 6.46484375, "learning_rate": 8.474972191323693e-06, "loss": 2.5611, "mean_token_accuracy": 0.48408057179987, "step": 8226 }, { "epoch": 1.5252131998516871, "grad_norm": 7.5703125, "learning_rate": 8.474786800148314e-06, "loss": 2.5052, "mean_token_accuracy": 0.49529395716818986, "step": 8227 }, { "epoch": 1.525398591027067, "grad_norm": 10.7109375, "learning_rate": 8.474601408972933e-06, "loss": 3.8109, "mean_token_accuracy": 0.3927765237020316, "step": 8228 }, { "epoch": 1.525583982202447, "grad_norm": 10.2109375, "learning_rate": 8.474416017797553e-06, "loss": 2.8466, "mean_token_accuracy": 0.44885297936805657, "step": 8229 }, { "epoch": 1.5257693733778273, "grad_norm": 5.04296875, "learning_rate": 8.474230626622174e-06, "loss": 2.5406, "mean_token_accuracy": 0.48607120549656463, "step": 8230 }, { "epoch": 1.5259547645532072, "grad_norm": 8.25, "learning_rate": 8.474045235446794e-06, "loss": 3.0445, "mean_token_accuracy": 0.4446354038792045, "step": 8231 }, { "epoch": 1.5261401557285872, "grad_norm": 8.25, "learning_rate": 8.473859844271413e-06, "loss": 3.0869, "mean_token_accuracy": 0.45315225517069524, "step": 8232 }, { "epoch": 1.5263255469039674, "grad_norm": 8.09375, "learning_rate": 8.473674453096033e-06, "loss": 3.0268, "mean_token_accuracy": 0.46030136192408, "step": 8233 }, { "epoch": 1.5265109380793476, "grad_norm": 6.11328125, "learning_rate": 8.473489061920654e-06, "loss": 3.1729, "mean_token_accuracy": 0.4381590373402804, "step": 8234 }, { "epoch": 1.5266963292547273, "grad_norm": 9.2265625, "learning_rate": 8.473303670745273e-06, "loss": 2.9986, "mean_token_accuracy": 0.44305724725943973, "step": 8235 }, { "epoch": 1.5268817204301075, "grad_norm": 7.5390625, "learning_rate": 8.473118279569893e-06, "loss": 3.2708, "mean_token_accuracy": 0.4195917351257157, "step": 8236 }, { "epoch": 1.5270671116054877, "grad_norm": 7.7890625, "learning_rate": 8.472932888394512e-06, "loss": 2.9537, "mean_token_accuracy": 0.4593348239239799, "step": 8237 }, { "epoch": 1.5272525027808677, "grad_norm": 8.34375, "learning_rate": 8.472747497219133e-06, "loss": 2.5839, "mean_token_accuracy": 0.4740268735281895, "step": 8238 }, { "epoch": 1.5274378939562476, "grad_norm": 8.671875, "learning_rate": 8.472562106043753e-06, "loss": 2.4349, "mean_token_accuracy": 0.5113572787288799, "step": 8239 }, { "epoch": 1.5276232851316278, "grad_norm": 12.2734375, "learning_rate": 8.472376714868374e-06, "loss": 2.6082, "mean_token_accuracy": 0.4818398541579021, "step": 8240 }, { "epoch": 1.5278086763070078, "grad_norm": 5.94921875, "learning_rate": 8.472191323692992e-06, "loss": 2.7817, "mean_token_accuracy": 0.47315096251266464, "step": 8241 }, { "epoch": 1.5279940674823878, "grad_norm": 7.83984375, "learning_rate": 8.472005932517613e-06, "loss": 3.148, "mean_token_accuracy": 0.43857975622681505, "step": 8242 }, { "epoch": 1.528179458657768, "grad_norm": 8.640625, "learning_rate": 8.471820541342233e-06, "loss": 2.7277, "mean_token_accuracy": 0.46743564495709666, "step": 8243 }, { "epoch": 1.528364849833148, "grad_norm": 7.328125, "learning_rate": 8.471635150166852e-06, "loss": 2.6818, "mean_token_accuracy": 0.4806254248810333, "step": 8244 }, { "epoch": 1.5285502410085279, "grad_norm": 6.09375, "learning_rate": 8.471449758991473e-06, "loss": 2.3158, "mean_token_accuracy": 0.5528224534847052, "step": 8245 }, { "epoch": 1.528735632183908, "grad_norm": 7.34765625, "learning_rate": 8.471264367816091e-06, "loss": 2.6678, "mean_token_accuracy": 0.4762285897230671, "step": 8246 }, { "epoch": 1.5289210233592883, "grad_norm": 7.03125, "learning_rate": 8.471078976640714e-06, "loss": 3.0301, "mean_token_accuracy": 0.45836650214933927, "step": 8247 }, { "epoch": 1.529106414534668, "grad_norm": 6.44140625, "learning_rate": 8.470893585465332e-06, "loss": 2.9204, "mean_token_accuracy": 0.4623800959232614, "step": 8248 }, { "epoch": 1.5292918057100482, "grad_norm": 6.7578125, "learning_rate": 8.470708194289953e-06, "loss": 3.1226, "mean_token_accuracy": 0.4064873214055863, "step": 8249 }, { "epoch": 1.5294771968854284, "grad_norm": 6.96875, "learning_rate": 8.470522803114573e-06, "loss": 3.1172, "mean_token_accuracy": 0.45636420919974796, "step": 8250 }, { "epoch": 1.5296625880608083, "grad_norm": 11.5, "learning_rate": 8.470337411939192e-06, "loss": 2.2962, "mean_token_accuracy": 0.5254882489241973, "step": 8251 }, { "epoch": 1.5298479792361883, "grad_norm": 6.4609375, "learning_rate": 8.470152020763813e-06, "loss": 2.5618, "mean_token_accuracy": 0.4758269720101781, "step": 8252 }, { "epoch": 1.5300333704115685, "grad_norm": 5.29296875, "learning_rate": 8.469966629588431e-06, "loss": 3.3506, "mean_token_accuracy": 0.4041125079906243, "step": 8253 }, { "epoch": 1.5302187615869485, "grad_norm": 8.9765625, "learning_rate": 8.469781238413052e-06, "loss": 2.6715, "mean_token_accuracy": 0.5057456254896839, "step": 8254 }, { "epoch": 1.5304041527623284, "grad_norm": 8.7734375, "learning_rate": 8.469595847237672e-06, "loss": 2.688, "mean_token_accuracy": 0.4634875668870003, "step": 8255 }, { "epoch": 1.5305895439377086, "grad_norm": 7.48046875, "learning_rate": 8.469410456062293e-06, "loss": 3.4164, "mean_token_accuracy": 0.4211187055377319, "step": 8256 }, { "epoch": 1.5307749351130886, "grad_norm": 8.0859375, "learning_rate": 8.469225064886912e-06, "loss": 2.6893, "mean_token_accuracy": 0.5069643386347921, "step": 8257 }, { "epoch": 1.5309603262884686, "grad_norm": 6.37109375, "learning_rate": 8.469039673711532e-06, "loss": 2.5801, "mean_token_accuracy": 0.48939929328621906, "step": 8258 }, { "epoch": 1.5311457174638488, "grad_norm": 8.140625, "learning_rate": 8.468854282536153e-06, "loss": 2.9098, "mean_token_accuracy": 0.44330937066464876, "step": 8259 }, { "epoch": 1.5313311086392287, "grad_norm": 8.0390625, "learning_rate": 8.468668891360771e-06, "loss": 3.0435, "mean_token_accuracy": 0.44524761611811753, "step": 8260 }, { "epoch": 1.5315164998146087, "grad_norm": 7.33203125, "learning_rate": 8.468483500185392e-06, "loss": 3.027, "mean_token_accuracy": 0.4457318861836316, "step": 8261 }, { "epoch": 1.5317018909899889, "grad_norm": 6.203125, "learning_rate": 8.46829810901001e-06, "loss": 2.6156, "mean_token_accuracy": 0.48277439024390245, "step": 8262 }, { "epoch": 1.531887282165369, "grad_norm": 7.69921875, "learning_rate": 8.468112717834633e-06, "loss": 3.8167, "mean_token_accuracy": 0.3946355065728323, "step": 8263 }, { "epoch": 1.5320726733407488, "grad_norm": 7.703125, "learning_rate": 8.467927326659252e-06, "loss": 2.8456, "mean_token_accuracy": 0.4896875, "step": 8264 }, { "epoch": 1.532258064516129, "grad_norm": 6.40625, "learning_rate": 8.467741935483872e-06, "loss": 2.7337, "mean_token_accuracy": 0.4832987054498961, "step": 8265 }, { "epoch": 1.5324434556915092, "grad_norm": 5.45703125, "learning_rate": 8.467556544308491e-06, "loss": 2.6817, "mean_token_accuracy": 0.4656932703919201, "step": 8266 }, { "epoch": 1.5326288468668892, "grad_norm": 8.859375, "learning_rate": 8.467371153133112e-06, "loss": 3.1206, "mean_token_accuracy": 0.4492307692307692, "step": 8267 }, { "epoch": 1.5328142380422691, "grad_norm": 7.75390625, "learning_rate": 8.467185761957732e-06, "loss": 3.2165, "mean_token_accuracy": 0.43408328399447405, "step": 8268 }, { "epoch": 1.5329996292176493, "grad_norm": 6.921875, "learning_rate": 8.46700037078235e-06, "loss": 2.4233, "mean_token_accuracy": 0.5070378803560339, "step": 8269 }, { "epoch": 1.5331850203930293, "grad_norm": 7.64453125, "learning_rate": 8.466814979606971e-06, "loss": 2.0224, "mean_token_accuracy": 0.5635934462649264, "step": 8270 }, { "epoch": 1.5333704115684093, "grad_norm": 4.94921875, "learning_rate": 8.466629588431592e-06, "loss": 2.9274, "mean_token_accuracy": 0.4729085392284352, "step": 8271 }, { "epoch": 1.5335558027437894, "grad_norm": 6.6640625, "learning_rate": 8.466444197256212e-06, "loss": 2.5367, "mean_token_accuracy": 0.4700025859839669, "step": 8272 }, { "epoch": 1.5337411939191694, "grad_norm": 6.97265625, "learning_rate": 8.466258806080831e-06, "loss": 3.4702, "mean_token_accuracy": 0.40429136081309996, "step": 8273 }, { "epoch": 1.5339265850945494, "grad_norm": 6.33203125, "learning_rate": 8.466073414905452e-06, "loss": 2.5163, "mean_token_accuracy": 0.5131054475178162, "step": 8274 }, { "epoch": 1.5341119762699296, "grad_norm": 7.80078125, "learning_rate": 8.46588802373007e-06, "loss": 2.9597, "mean_token_accuracy": 0.45412780175414175, "step": 8275 }, { "epoch": 1.5342973674453098, "grad_norm": 7.81640625, "learning_rate": 8.465702632554691e-06, "loss": 2.2468, "mean_token_accuracy": 0.5265990737263737, "step": 8276 }, { "epoch": 1.5344827586206895, "grad_norm": 6.1328125, "learning_rate": 8.465517241379311e-06, "loss": 2.8233, "mean_token_accuracy": 0.45545023696682463, "step": 8277 }, { "epoch": 1.5346681497960697, "grad_norm": 5.6953125, "learning_rate": 8.46533185020393e-06, "loss": 2.7845, "mean_token_accuracy": 0.47294324681038274, "step": 8278 }, { "epoch": 1.5348535409714499, "grad_norm": 7.09765625, "learning_rate": 8.46514645902855e-06, "loss": 2.7992, "mean_token_accuracy": 0.4610792461664521, "step": 8279 }, { "epoch": 1.5350389321468298, "grad_norm": 6.88671875, "learning_rate": 8.464961067853171e-06, "loss": 2.8371, "mean_token_accuracy": 0.46764243377283177, "step": 8280 }, { "epoch": 1.5352243233222098, "grad_norm": 5.50390625, "learning_rate": 8.464775676677792e-06, "loss": 2.905, "mean_token_accuracy": 0.4720666379434152, "step": 8281 }, { "epoch": 1.53540971449759, "grad_norm": 5.1171875, "learning_rate": 8.46459028550241e-06, "loss": 2.572, "mean_token_accuracy": 0.48058017727639, "step": 8282 }, { "epoch": 1.53559510567297, "grad_norm": 5.16796875, "learning_rate": 8.464404894327031e-06, "loss": 3.1354, "mean_token_accuracy": 0.4449255751014885, "step": 8283 }, { "epoch": 1.53578049684835, "grad_norm": 6.1015625, "learning_rate": 8.46421950315165e-06, "loss": 2.8049, "mean_token_accuracy": 0.46330413772274237, "step": 8284 }, { "epoch": 1.5359658880237301, "grad_norm": 5.8984375, "learning_rate": 8.46403411197627e-06, "loss": 2.4932, "mean_token_accuracy": 0.5093723545773371, "step": 8285 }, { "epoch": 1.53615127919911, "grad_norm": 11.7578125, "learning_rate": 8.46384872080089e-06, "loss": 2.5753, "mean_token_accuracy": 0.5040721714070918, "step": 8286 }, { "epoch": 1.53633667037449, "grad_norm": 5.97265625, "learning_rate": 8.463663329625511e-06, "loss": 2.7301, "mean_token_accuracy": 0.5111734859116916, "step": 8287 }, { "epoch": 1.5365220615498703, "grad_norm": 7.0703125, "learning_rate": 8.463477938450132e-06, "loss": 2.8774, "mean_token_accuracy": 0.4550398381181232, "step": 8288 }, { "epoch": 1.5367074527252504, "grad_norm": 5.91015625, "learning_rate": 8.46329254727475e-06, "loss": 3.1831, "mean_token_accuracy": 0.4555691827405508, "step": 8289 }, { "epoch": 1.5368928439006302, "grad_norm": 6.08984375, "learning_rate": 8.463107156099371e-06, "loss": 2.9793, "mean_token_accuracy": 0.474928744853795, "step": 8290 }, { "epoch": 1.5370782350760104, "grad_norm": 5.79296875, "learning_rate": 8.46292176492399e-06, "loss": 2.4181, "mean_token_accuracy": 0.513204765999263, "step": 8291 }, { "epoch": 1.5372636262513906, "grad_norm": 6.01171875, "learning_rate": 8.46273637374861e-06, "loss": 2.4434, "mean_token_accuracy": 0.4942728722633029, "step": 8292 }, { "epoch": 1.5374490174267705, "grad_norm": 6.77734375, "learning_rate": 8.46255098257323e-06, "loss": 2.8601, "mean_token_accuracy": 0.4796938456732271, "step": 8293 }, { "epoch": 1.5376344086021505, "grad_norm": 5.87109375, "learning_rate": 8.46236559139785e-06, "loss": 2.6536, "mean_token_accuracy": 0.4689096158116811, "step": 8294 }, { "epoch": 1.5378197997775307, "grad_norm": 7.00390625, "learning_rate": 8.46218020022247e-06, "loss": 2.818, "mean_token_accuracy": 0.4736763386180078, "step": 8295 }, { "epoch": 1.5380051909529107, "grad_norm": 5.82421875, "learning_rate": 8.46199480904709e-06, "loss": 2.36, "mean_token_accuracy": 0.5431269674711438, "step": 8296 }, { "epoch": 1.5381905821282906, "grad_norm": 6.0859375, "learning_rate": 8.461809417871711e-06, "loss": 2.8016, "mean_token_accuracy": 0.47885878489326766, "step": 8297 }, { "epoch": 1.5383759733036708, "grad_norm": 5.9453125, "learning_rate": 8.46162402669633e-06, "loss": 3.0624, "mean_token_accuracy": 0.4649161845423528, "step": 8298 }, { "epoch": 1.5385613644790508, "grad_norm": 7.6015625, "learning_rate": 8.46143863552095e-06, "loss": 3.2099, "mean_token_accuracy": 0.4384648066846056, "step": 8299 }, { "epoch": 1.5387467556544308, "grad_norm": 6.37109375, "learning_rate": 8.461253244345569e-06, "loss": 2.5515, "mean_token_accuracy": 0.5203161836083761, "step": 8300 }, { "epoch": 1.538932146829811, "grad_norm": 5.51171875, "learning_rate": 8.46106785317019e-06, "loss": 2.7736, "mean_token_accuracy": 0.4962667994026879, "step": 8301 }, { "epoch": 1.539117538005191, "grad_norm": 6.6640625, "learning_rate": 8.46088246199481e-06, "loss": 2.4737, "mean_token_accuracy": 0.47905686546463244, "step": 8302 }, { "epoch": 1.5393029291805709, "grad_norm": 7.1171875, "learning_rate": 8.460697070819429e-06, "loss": 2.9505, "mean_token_accuracy": 0.47694981089645305, "step": 8303 }, { "epoch": 1.539488320355951, "grad_norm": 5.6640625, "learning_rate": 8.46051167964405e-06, "loss": 2.626, "mean_token_accuracy": 0.5056267409470752, "step": 8304 }, { "epoch": 1.5396737115313313, "grad_norm": 8.4765625, "learning_rate": 8.46032628846867e-06, "loss": 2.3865, "mean_token_accuracy": 0.4964625850340136, "step": 8305 }, { "epoch": 1.539859102706711, "grad_norm": 6.46484375, "learning_rate": 8.46014089729329e-06, "loss": 2.8396, "mean_token_accuracy": 0.4595394313649813, "step": 8306 }, { "epoch": 1.5400444938820912, "grad_norm": 5.87109375, "learning_rate": 8.45995550611791e-06, "loss": 3.5105, "mean_token_accuracy": 0.4338784216139353, "step": 8307 }, { "epoch": 1.5402298850574714, "grad_norm": 8.9765625, "learning_rate": 8.45977011494253e-06, "loss": 2.8756, "mean_token_accuracy": 0.45482734319943624, "step": 8308 }, { "epoch": 1.5404152762328513, "grad_norm": 5.9296875, "learning_rate": 8.459584723767148e-06, "loss": 3.0344, "mean_token_accuracy": 0.4293410692084542, "step": 8309 }, { "epoch": 1.5406006674082313, "grad_norm": 6.875, "learning_rate": 8.459399332591769e-06, "loss": 2.5309, "mean_token_accuracy": 0.49761677788369874, "step": 8310 }, { "epoch": 1.5407860585836115, "grad_norm": 11.6953125, "learning_rate": 8.45921394141639e-06, "loss": 2.5846, "mean_token_accuracy": 0.4719591226321037, "step": 8311 }, { "epoch": 1.5409714497589915, "grad_norm": 9.4609375, "learning_rate": 8.45902855024101e-06, "loss": 2.7333, "mean_token_accuracy": 0.46911827346301294, "step": 8312 }, { "epoch": 1.5411568409343714, "grad_norm": 6.08984375, "learning_rate": 8.458843159065629e-06, "loss": 3.1097, "mean_token_accuracy": 0.4497566561694818, "step": 8313 }, { "epoch": 1.5413422321097516, "grad_norm": 10.7109375, "learning_rate": 8.45865776789025e-06, "loss": 2.3564, "mean_token_accuracy": 0.5270201592410638, "step": 8314 }, { "epoch": 1.5415276232851316, "grad_norm": 10.125, "learning_rate": 8.45847237671487e-06, "loss": 2.9697, "mean_token_accuracy": 0.45194841214810705, "step": 8315 }, { "epoch": 1.5417130144605116, "grad_norm": 7.3046875, "learning_rate": 8.458286985539489e-06, "loss": 2.881, "mean_token_accuracy": 0.4797864225781846, "step": 8316 }, { "epoch": 1.5418984056358918, "grad_norm": 7.12109375, "learning_rate": 8.458101594364109e-06, "loss": 3.1344, "mean_token_accuracy": 0.443884620229248, "step": 8317 }, { "epoch": 1.542083796811272, "grad_norm": 11.375, "learning_rate": 8.457916203188728e-06, "loss": 2.8157, "mean_token_accuracy": 0.45394381415451107, "step": 8318 }, { "epoch": 1.5422691879866517, "grad_norm": 5.71484375, "learning_rate": 8.457730812013348e-06, "loss": 2.49, "mean_token_accuracy": 0.5247035573122529, "step": 8319 }, { "epoch": 1.5424545791620319, "grad_norm": 6.125, "learning_rate": 8.457545420837969e-06, "loss": 2.6034, "mean_token_accuracy": 0.5059982862039417, "step": 8320 }, { "epoch": 1.542639970337412, "grad_norm": 13.046875, "learning_rate": 8.45736002966259e-06, "loss": 2.4745, "mean_token_accuracy": 0.48938178386031145, "step": 8321 }, { "epoch": 1.542825361512792, "grad_norm": 10.8203125, "learning_rate": 8.457174638487208e-06, "loss": 2.118, "mean_token_accuracy": 0.5397564849126522, "step": 8322 }, { "epoch": 1.543010752688172, "grad_norm": 6.6953125, "learning_rate": 8.456989247311829e-06, "loss": 2.797, "mean_token_accuracy": 0.46263125386040765, "step": 8323 }, { "epoch": 1.5431961438635522, "grad_norm": 6.171875, "learning_rate": 8.456803856136449e-06, "loss": 2.6753, "mean_token_accuracy": 0.47368421052631576, "step": 8324 }, { "epoch": 1.5433815350389322, "grad_norm": 7.7734375, "learning_rate": 8.456618464961068e-06, "loss": 2.6547, "mean_token_accuracy": 0.5095279976546467, "step": 8325 }, { "epoch": 1.5435669262143121, "grad_norm": 9.1484375, "learning_rate": 8.456433073785688e-06, "loss": 2.4895, "mean_token_accuracy": 0.5024351717783335, "step": 8326 }, { "epoch": 1.5437523173896923, "grad_norm": 6.2265625, "learning_rate": 8.456247682610307e-06, "loss": 3.1105, "mean_token_accuracy": 0.45373891001267425, "step": 8327 }, { "epoch": 1.5439377085650723, "grad_norm": 5.1640625, "learning_rate": 8.45606229143493e-06, "loss": 2.3699, "mean_token_accuracy": 0.5329433497536946, "step": 8328 }, { "epoch": 1.5441230997404523, "grad_norm": 7.8515625, "learning_rate": 8.455876900259548e-06, "loss": 3.1118, "mean_token_accuracy": 0.4311143623506462, "step": 8329 }, { "epoch": 1.5443084909158324, "grad_norm": 6.1640625, "learning_rate": 8.455691509084169e-06, "loss": 2.5738, "mean_token_accuracy": 0.49162011173184356, "step": 8330 }, { "epoch": 1.5444938820912124, "grad_norm": 5.453125, "learning_rate": 8.455506117908789e-06, "loss": 2.4346, "mean_token_accuracy": 0.49577639751552793, "step": 8331 }, { "epoch": 1.5446792732665924, "grad_norm": 7.59765625, "learning_rate": 8.455320726733408e-06, "loss": 2.9831, "mean_token_accuracy": 0.4492063492063492, "step": 8332 }, { "epoch": 1.5448646644419726, "grad_norm": 6.9375, "learning_rate": 8.455135335558028e-06, "loss": 2.2194, "mean_token_accuracy": 0.5542857142857143, "step": 8333 }, { "epoch": 1.5450500556173528, "grad_norm": 6.0390625, "learning_rate": 8.454949944382647e-06, "loss": 2.728, "mean_token_accuracy": 0.46300156331422615, "step": 8334 }, { "epoch": 1.5452354467927325, "grad_norm": 6.359375, "learning_rate": 8.454764553207268e-06, "loss": 3.801, "mean_token_accuracy": 0.3954456415279138, "step": 8335 }, { "epoch": 1.5454208379681127, "grad_norm": 5.83984375, "learning_rate": 8.454579162031888e-06, "loss": 3.0618, "mean_token_accuracy": 0.45435452254270514, "step": 8336 }, { "epoch": 1.5456062291434929, "grad_norm": 5.9765625, "learning_rate": 8.454393770856509e-06, "loss": 3.3953, "mean_token_accuracy": 0.43796526054590573, "step": 8337 }, { "epoch": 1.5457916203188728, "grad_norm": 5.6640625, "learning_rate": 8.454208379681127e-06, "loss": 3.0747, "mean_token_accuracy": 0.4418501715170964, "step": 8338 }, { "epoch": 1.5459770114942528, "grad_norm": 5.25, "learning_rate": 8.454022988505748e-06, "loss": 2.8709, "mean_token_accuracy": 0.4779563246806757, "step": 8339 }, { "epoch": 1.546162402669633, "grad_norm": 6.640625, "learning_rate": 8.453837597330368e-06, "loss": 2.4073, "mean_token_accuracy": 0.48337982333798235, "step": 8340 }, { "epoch": 1.546347793845013, "grad_norm": 5.92578125, "learning_rate": 8.453652206154987e-06, "loss": 3.161, "mean_token_accuracy": 0.42514438027543316, "step": 8341 }, { "epoch": 1.546533185020393, "grad_norm": 6.8046875, "learning_rate": 8.453466814979608e-06, "loss": 2.5927, "mean_token_accuracy": 0.48689019545526774, "step": 8342 }, { "epoch": 1.5467185761957731, "grad_norm": 5.9375, "learning_rate": 8.453281423804227e-06, "loss": 3.4112, "mean_token_accuracy": 0.4292901062045836, "step": 8343 }, { "epoch": 1.546903967371153, "grad_norm": 9.5625, "learning_rate": 8.453096032628849e-06, "loss": 2.5495, "mean_token_accuracy": 0.470580404685836, "step": 8344 }, { "epoch": 1.547089358546533, "grad_norm": 6.57421875, "learning_rate": 8.452910641453468e-06, "loss": 3.2944, "mean_token_accuracy": 0.42242473180297613, "step": 8345 }, { "epoch": 1.5472747497219133, "grad_norm": 5.11328125, "learning_rate": 8.452725250278088e-06, "loss": 2.7521, "mean_token_accuracy": 0.47368421052631576, "step": 8346 }, { "epoch": 1.5474601408972934, "grad_norm": 6.33984375, "learning_rate": 8.452539859102707e-06, "loss": 2.3273, "mean_token_accuracy": 0.5154548130703562, "step": 8347 }, { "epoch": 1.5476455320726732, "grad_norm": 6.88671875, "learning_rate": 8.452354467927327e-06, "loss": 2.5139, "mean_token_accuracy": 0.48480355819125276, "step": 8348 }, { "epoch": 1.5478309232480534, "grad_norm": 7.49609375, "learning_rate": 8.452169076751948e-06, "loss": 2.6049, "mean_token_accuracy": 0.5007653619068445, "step": 8349 }, { "epoch": 1.5480163144234336, "grad_norm": 5.99609375, "learning_rate": 8.451983685576567e-06, "loss": 2.9472, "mean_token_accuracy": 0.4402050792893764, "step": 8350 }, { "epoch": 1.5482017055988135, "grad_norm": 6.24609375, "learning_rate": 8.451798294401187e-06, "loss": 3.3317, "mean_token_accuracy": 0.43426034730382557, "step": 8351 }, { "epoch": 1.5483870967741935, "grad_norm": 6.9921875, "learning_rate": 8.451612903225808e-06, "loss": 3.3378, "mean_token_accuracy": 0.4583196407054442, "step": 8352 }, { "epoch": 1.5485724879495737, "grad_norm": 7.79296875, "learning_rate": 8.451427512050428e-06, "loss": 2.359, "mean_token_accuracy": 0.5041722745625841, "step": 8353 }, { "epoch": 1.5487578791249537, "grad_norm": 5.765625, "learning_rate": 8.451242120875047e-06, "loss": 2.835, "mean_token_accuracy": 0.49053080821552414, "step": 8354 }, { "epoch": 1.5489432703003336, "grad_norm": 6.55078125, "learning_rate": 8.451056729699667e-06, "loss": 2.9094, "mean_token_accuracy": 0.47598152424942264, "step": 8355 }, { "epoch": 1.5491286614757138, "grad_norm": 7.50390625, "learning_rate": 8.450871338524286e-06, "loss": 2.5188, "mean_token_accuracy": 0.5249426259210049, "step": 8356 }, { "epoch": 1.5493140526510938, "grad_norm": 7.2109375, "learning_rate": 8.450685947348907e-06, "loss": 2.431, "mean_token_accuracy": 0.5111341273951321, "step": 8357 }, { "epoch": 1.5494994438264738, "grad_norm": 6.2265625, "learning_rate": 8.450500556173527e-06, "loss": 2.9914, "mean_token_accuracy": 0.4403846153846154, "step": 8358 }, { "epoch": 1.549684835001854, "grad_norm": 6.2890625, "learning_rate": 8.450315164998146e-06, "loss": 2.9276, "mean_token_accuracy": 0.4477098639987494, "step": 8359 }, { "epoch": 1.549870226177234, "grad_norm": 5.80078125, "learning_rate": 8.450129773822766e-06, "loss": 2.8693, "mean_token_accuracy": 0.46636005256241786, "step": 8360 }, { "epoch": 1.5500556173526139, "grad_norm": 5.2578125, "learning_rate": 8.449944382647387e-06, "loss": 3.059, "mean_token_accuracy": 0.44615600056014565, "step": 8361 }, { "epoch": 1.550241008527994, "grad_norm": 6.22265625, "learning_rate": 8.449758991472007e-06, "loss": 2.8033, "mean_token_accuracy": 0.4682950306822284, "step": 8362 }, { "epoch": 1.5504263997033743, "grad_norm": 6.67578125, "learning_rate": 8.449573600296626e-06, "loss": 3.3225, "mean_token_accuracy": 0.41184387617765816, "step": 8363 }, { "epoch": 1.5506117908787542, "grad_norm": 6.30078125, "learning_rate": 8.449388209121247e-06, "loss": 3.3009, "mean_token_accuracy": 0.4326310632043116, "step": 8364 }, { "epoch": 1.5507971820541342, "grad_norm": 5.93359375, "learning_rate": 8.449202817945865e-06, "loss": 3.0693, "mean_token_accuracy": 0.4468957714608966, "step": 8365 }, { "epoch": 1.5509825732295144, "grad_norm": 6.62890625, "learning_rate": 8.449017426770486e-06, "loss": 2.6706, "mean_token_accuracy": 0.4885249968986478, "step": 8366 }, { "epoch": 1.5511679644048944, "grad_norm": 5.234375, "learning_rate": 8.448832035595106e-06, "loss": 2.5995, "mean_token_accuracy": 0.48256075607560756, "step": 8367 }, { "epoch": 1.5513533555802743, "grad_norm": 6.9375, "learning_rate": 8.448646644419727e-06, "loss": 2.998, "mean_token_accuracy": 0.44861731674918487, "step": 8368 }, { "epoch": 1.5515387467556545, "grad_norm": 7.08203125, "learning_rate": 8.448461253244347e-06, "loss": 2.7109, "mean_token_accuracy": 0.4887955182072829, "step": 8369 }, { "epoch": 1.5517241379310345, "grad_norm": 6.23828125, "learning_rate": 8.448275862068966e-06, "loss": 2.9689, "mean_token_accuracy": 0.4404202719406675, "step": 8370 }, { "epoch": 1.5519095291064144, "grad_norm": 6.3203125, "learning_rate": 8.448090470893587e-06, "loss": 3.4496, "mean_token_accuracy": 0.4451539338654504, "step": 8371 }, { "epoch": 1.5520949202817946, "grad_norm": 5.921875, "learning_rate": 8.447905079718206e-06, "loss": 2.7598, "mean_token_accuracy": 0.4789072887706999, "step": 8372 }, { "epoch": 1.5522803114571746, "grad_norm": 8.7109375, "learning_rate": 8.447719688542826e-06, "loss": 3.4347, "mean_token_accuracy": 0.42412993039443153, "step": 8373 }, { "epoch": 1.5524657026325546, "grad_norm": 7.1796875, "learning_rate": 8.447534297367447e-06, "loss": 3.0919, "mean_token_accuracy": 0.46592902442315476, "step": 8374 }, { "epoch": 1.5526510938079348, "grad_norm": 8.015625, "learning_rate": 8.447348906192065e-06, "loss": 2.5119, "mean_token_accuracy": 0.49633016484177594, "step": 8375 }, { "epoch": 1.552836484983315, "grad_norm": 5.65625, "learning_rate": 8.447163515016686e-06, "loss": 2.5972, "mean_token_accuracy": 0.4932650562172993, "step": 8376 }, { "epoch": 1.5530218761586947, "grad_norm": 8.6484375, "learning_rate": 8.446978123841306e-06, "loss": 3.2344, "mean_token_accuracy": 0.45692216280451575, "step": 8377 }, { "epoch": 1.5532072673340749, "grad_norm": 6.953125, "learning_rate": 8.446792732665927e-06, "loss": 3.1933, "mean_token_accuracy": 0.4149122807017544, "step": 8378 }, { "epoch": 1.553392658509455, "grad_norm": 5.66796875, "learning_rate": 8.446607341490546e-06, "loss": 2.6754, "mean_token_accuracy": 0.46707572583058965, "step": 8379 }, { "epoch": 1.553578049684835, "grad_norm": 7.359375, "learning_rate": 8.446421950315166e-06, "loss": 2.881, "mean_token_accuracy": 0.44709997085397846, "step": 8380 }, { "epoch": 1.553763440860215, "grad_norm": 6.703125, "learning_rate": 8.446236559139785e-06, "loss": 2.923, "mean_token_accuracy": 0.43796308770039066, "step": 8381 }, { "epoch": 1.5539488320355952, "grad_norm": 6.30078125, "learning_rate": 8.446051167964405e-06, "loss": 2.7977, "mean_token_accuracy": 0.47877059569074776, "step": 8382 }, { "epoch": 1.5541342232109752, "grad_norm": 5.65625, "learning_rate": 8.445865776789026e-06, "loss": 2.5994, "mean_token_accuracy": 0.48074757937401486, "step": 8383 }, { "epoch": 1.5543196143863551, "grad_norm": 5.48828125, "learning_rate": 8.445680385613646e-06, "loss": 2.5245, "mean_token_accuracy": 0.4875305623471883, "step": 8384 }, { "epoch": 1.5545050055617353, "grad_norm": 6.01953125, "learning_rate": 8.445494994438265e-06, "loss": 3.3098, "mean_token_accuracy": 0.4410510545119281, "step": 8385 }, { "epoch": 1.5546903967371153, "grad_norm": 7.5546875, "learning_rate": 8.445309603262886e-06, "loss": 2.699, "mean_token_accuracy": 0.48598480129562727, "step": 8386 }, { "epoch": 1.5548757879124953, "grad_norm": 7.40625, "learning_rate": 8.445124212087506e-06, "loss": 2.7595, "mean_token_accuracy": 0.4619133983223759, "step": 8387 }, { "epoch": 1.5550611790878754, "grad_norm": 5.515625, "learning_rate": 8.444938820912125e-06, "loss": 2.9571, "mean_token_accuracy": 0.45032802249297094, "step": 8388 }, { "epoch": 1.5552465702632556, "grad_norm": 7.0625, "learning_rate": 8.444753429736745e-06, "loss": 2.6437, "mean_token_accuracy": 0.5005512679162073, "step": 8389 }, { "epoch": 1.5554319614386354, "grad_norm": 6.03125, "learning_rate": 8.444568038561364e-06, "loss": 3.5808, "mean_token_accuracy": 0.39526355996944235, "step": 8390 }, { "epoch": 1.5556173526140156, "grad_norm": 5.95703125, "learning_rate": 8.444382647385985e-06, "loss": 2.916, "mean_token_accuracy": 0.4557610241820768, "step": 8391 }, { "epoch": 1.5558027437893958, "grad_norm": 5.640625, "learning_rate": 8.444197256210605e-06, "loss": 2.8687, "mean_token_accuracy": 0.46607454863133374, "step": 8392 }, { "epoch": 1.5559881349647757, "grad_norm": 6.01953125, "learning_rate": 8.444011865035226e-06, "loss": 3.1959, "mean_token_accuracy": 0.4544605947459661, "step": 8393 }, { "epoch": 1.5561735261401557, "grad_norm": 6.31640625, "learning_rate": 8.443826473859844e-06, "loss": 2.87, "mean_token_accuracy": 0.47581120943952804, "step": 8394 }, { "epoch": 1.5563589173155359, "grad_norm": 6.60546875, "learning_rate": 8.443641082684465e-06, "loss": 2.9675, "mean_token_accuracy": 0.46794577205882354, "step": 8395 }, { "epoch": 1.5565443084909159, "grad_norm": 7.68359375, "learning_rate": 8.443455691509085e-06, "loss": 2.3252, "mean_token_accuracy": 0.5242279020234292, "step": 8396 }, { "epoch": 1.5567296996662958, "grad_norm": 6.13671875, "learning_rate": 8.443270300333704e-06, "loss": 2.2919, "mean_token_accuracy": 0.5244206773618538, "step": 8397 }, { "epoch": 1.556915090841676, "grad_norm": 9.453125, "learning_rate": 8.443084909158325e-06, "loss": 2.5541, "mean_token_accuracy": 0.48144772816039577, "step": 8398 }, { "epoch": 1.557100482017056, "grad_norm": 7.859375, "learning_rate": 8.442899517982944e-06, "loss": 2.8134, "mean_token_accuracy": 0.4628579735426661, "step": 8399 }, { "epoch": 1.557285873192436, "grad_norm": 6.81640625, "learning_rate": 8.442714126807566e-06, "loss": 3.0545, "mean_token_accuracy": 0.45233995930505555, "step": 8400 }, { "epoch": 1.5574712643678161, "grad_norm": 6.1953125, "learning_rate": 8.442528735632185e-06, "loss": 3.377, "mean_token_accuracy": 0.41705213151643955, "step": 8401 }, { "epoch": 1.557656655543196, "grad_norm": 5.56640625, "learning_rate": 8.442343344456805e-06, "loss": 2.7175, "mean_token_accuracy": 0.4952866807142065, "step": 8402 }, { "epoch": 1.557842046718576, "grad_norm": 9.53125, "learning_rate": 8.442157953281424e-06, "loss": 2.1349, "mean_token_accuracy": 0.536872747435542, "step": 8403 }, { "epoch": 1.5580274378939563, "grad_norm": 6.33984375, "learning_rate": 8.441972562106044e-06, "loss": 2.6698, "mean_token_accuracy": 0.4799600449494319, "step": 8404 }, { "epoch": 1.5582128290693364, "grad_norm": 5.80078125, "learning_rate": 8.441787170930665e-06, "loss": 2.9482, "mean_token_accuracy": 0.4440196613872201, "step": 8405 }, { "epoch": 1.5583982202447162, "grad_norm": 6.48828125, "learning_rate": 8.441601779755284e-06, "loss": 2.8323, "mean_token_accuracy": 0.47560813505250565, "step": 8406 }, { "epoch": 1.5585836114200964, "grad_norm": 6.8203125, "learning_rate": 8.441416388579904e-06, "loss": 2.4759, "mean_token_accuracy": 0.5145191703331238, "step": 8407 }, { "epoch": 1.5587690025954766, "grad_norm": 7.38671875, "learning_rate": 8.441230997404525e-06, "loss": 2.8462, "mean_token_accuracy": 0.49516129032258066, "step": 8408 }, { "epoch": 1.5589543937708565, "grad_norm": 7.2109375, "learning_rate": 8.441045606229145e-06, "loss": 2.0692, "mean_token_accuracy": 0.5750350631136045, "step": 8409 }, { "epoch": 1.5591397849462365, "grad_norm": 5.12109375, "learning_rate": 8.440860215053764e-06, "loss": 2.2935, "mean_token_accuracy": 0.5374163879598662, "step": 8410 }, { "epoch": 1.5593251761216167, "grad_norm": 9.703125, "learning_rate": 8.440674823878384e-06, "loss": 2.3745, "mean_token_accuracy": 0.488303307340683, "step": 8411 }, { "epoch": 1.5595105672969967, "grad_norm": 6.1328125, "learning_rate": 8.440489432703005e-06, "loss": 2.7654, "mean_token_accuracy": 0.4771478667445938, "step": 8412 }, { "epoch": 1.5596959584723766, "grad_norm": 5.2578125, "learning_rate": 8.440304041527624e-06, "loss": 3.0409, "mean_token_accuracy": 0.44935428660339194, "step": 8413 }, { "epoch": 1.5598813496477568, "grad_norm": 7.70703125, "learning_rate": 8.440118650352244e-06, "loss": 3.1878, "mean_token_accuracy": 0.42570993914807304, "step": 8414 }, { "epoch": 1.5600667408231368, "grad_norm": 6.1875, "learning_rate": 8.439933259176863e-06, "loss": 2.8927, "mean_token_accuracy": 0.46062805744042923, "step": 8415 }, { "epoch": 1.5602521319985168, "grad_norm": 5.87890625, "learning_rate": 8.439747868001483e-06, "loss": 2.2906, "mean_token_accuracy": 0.529060874885286, "step": 8416 }, { "epoch": 1.560437523173897, "grad_norm": 5.6328125, "learning_rate": 8.439562476826104e-06, "loss": 3.3128, "mean_token_accuracy": 0.4397498085269339, "step": 8417 }, { "epoch": 1.5606229143492771, "grad_norm": 6.21484375, "learning_rate": 8.439377085650724e-06, "loss": 2.6274, "mean_token_accuracy": 0.48127935017134155, "step": 8418 }, { "epoch": 1.5608083055246569, "grad_norm": 9.3046875, "learning_rate": 8.439191694475343e-06, "loss": 3.0029, "mean_token_accuracy": 0.4575427682737169, "step": 8419 }, { "epoch": 1.560993696700037, "grad_norm": 7.48828125, "learning_rate": 8.439006303299964e-06, "loss": 3.0026, "mean_token_accuracy": 0.4358916222650493, "step": 8420 }, { "epoch": 1.5611790878754173, "grad_norm": 8.578125, "learning_rate": 8.438820912124584e-06, "loss": 2.1261, "mean_token_accuracy": 0.5424522168768691, "step": 8421 }, { "epoch": 1.5613644790507972, "grad_norm": 6.203125, "learning_rate": 8.438635520949203e-06, "loss": 2.8001, "mean_token_accuracy": 0.4609452244413684, "step": 8422 }, { "epoch": 1.5615498702261772, "grad_norm": 5.22265625, "learning_rate": 8.438450129773823e-06, "loss": 2.5867, "mean_token_accuracy": 0.502964681618974, "step": 8423 }, { "epoch": 1.5617352614015574, "grad_norm": 8.1015625, "learning_rate": 8.438264738598442e-06, "loss": 3.6961, "mean_token_accuracy": 0.4135758270084491, "step": 8424 }, { "epoch": 1.5619206525769374, "grad_norm": 5.6953125, "learning_rate": 8.438079347423064e-06, "loss": 3.3276, "mean_token_accuracy": 0.4387755102040816, "step": 8425 }, { "epoch": 1.5621060437523173, "grad_norm": 8.78125, "learning_rate": 8.437893956247683e-06, "loss": 2.9403, "mean_token_accuracy": 0.4440535633227417, "step": 8426 }, { "epoch": 1.5622914349276975, "grad_norm": 8.3828125, "learning_rate": 8.437708565072304e-06, "loss": 2.8296, "mean_token_accuracy": 0.4684845775592311, "step": 8427 }, { "epoch": 1.5624768261030775, "grad_norm": 5.86328125, "learning_rate": 8.437523173896923e-06, "loss": 2.8187, "mean_token_accuracy": 0.46486877454019426, "step": 8428 }, { "epoch": 1.5626622172784574, "grad_norm": 5.453125, "learning_rate": 8.437337782721543e-06, "loss": 3.0312, "mean_token_accuracy": 0.4531678797279561, "step": 8429 }, { "epoch": 1.5628476084538376, "grad_norm": 6.41015625, "learning_rate": 8.437152391546164e-06, "loss": 2.5548, "mean_token_accuracy": 0.484992288527702, "step": 8430 }, { "epoch": 1.5630329996292176, "grad_norm": 7.03125, "learning_rate": 8.436967000370782e-06, "loss": 3.0103, "mean_token_accuracy": 0.4444220903329645, "step": 8431 }, { "epoch": 1.5632183908045976, "grad_norm": 6.7890625, "learning_rate": 8.436781609195403e-06, "loss": 2.9194, "mean_token_accuracy": 0.45368916797488223, "step": 8432 }, { "epoch": 1.5634037819799778, "grad_norm": 5.76953125, "learning_rate": 8.436596218020023e-06, "loss": 2.8125, "mean_token_accuracy": 0.4692723697148476, "step": 8433 }, { "epoch": 1.563589173155358, "grad_norm": 5.921875, "learning_rate": 8.436410826844644e-06, "loss": 2.8724, "mean_token_accuracy": 0.45999757193152846, "step": 8434 }, { "epoch": 1.5637745643307377, "grad_norm": 5.828125, "learning_rate": 8.436225435669263e-06, "loss": 2.7728, "mean_token_accuracy": 0.4893233082706767, "step": 8435 }, { "epoch": 1.5639599555061179, "grad_norm": 6.66796875, "learning_rate": 8.436040044493883e-06, "loss": 2.6715, "mean_token_accuracy": 0.4991968793024323, "step": 8436 }, { "epoch": 1.564145346681498, "grad_norm": 5.546875, "learning_rate": 8.435854653318502e-06, "loss": 2.8158, "mean_token_accuracy": 0.464476386036961, "step": 8437 }, { "epoch": 1.564330737856878, "grad_norm": 6.78125, "learning_rate": 8.435669262143122e-06, "loss": 2.4507, "mean_token_accuracy": 0.5100829038763164, "step": 8438 }, { "epoch": 1.564516129032258, "grad_norm": 5.10546875, "learning_rate": 8.435483870967743e-06, "loss": 2.7281, "mean_token_accuracy": 0.4666173022337406, "step": 8439 }, { "epoch": 1.5647015202076382, "grad_norm": 6.41796875, "learning_rate": 8.435298479792362e-06, "loss": 3.0754, "mean_token_accuracy": 0.43266381297332895, "step": 8440 }, { "epoch": 1.5648869113830182, "grad_norm": 6.64453125, "learning_rate": 8.435113088616982e-06, "loss": 2.1925, "mean_token_accuracy": 0.5324232081911263, "step": 8441 }, { "epoch": 1.5650723025583981, "grad_norm": 5.703125, "learning_rate": 8.434927697441603e-06, "loss": 3.0943, "mean_token_accuracy": 0.4386896283827528, "step": 8442 }, { "epoch": 1.5652576937337783, "grad_norm": 8.3828125, "learning_rate": 8.434742306266223e-06, "loss": 2.9414, "mean_token_accuracy": 0.45081549439347607, "step": 8443 }, { "epoch": 1.5654430849091583, "grad_norm": 6.4921875, "learning_rate": 8.434556915090842e-06, "loss": 2.6132, "mean_token_accuracy": 0.48522378908645003, "step": 8444 }, { "epoch": 1.5656284760845383, "grad_norm": 6.5546875, "learning_rate": 8.434371523915462e-06, "loss": 2.4626, "mean_token_accuracy": 0.4992660251182515, "step": 8445 }, { "epoch": 1.5658138672599184, "grad_norm": 6.08203125, "learning_rate": 8.434186132740081e-06, "loss": 2.6058, "mean_token_accuracy": 0.48985383851284237, "step": 8446 }, { "epoch": 1.5659992584352986, "grad_norm": 6.32421875, "learning_rate": 8.434000741564702e-06, "loss": 2.3627, "mean_token_accuracy": 0.5083064419744233, "step": 8447 }, { "epoch": 1.5661846496106784, "grad_norm": 8.0390625, "learning_rate": 8.433815350389322e-06, "loss": 2.8576, "mean_token_accuracy": 0.4631894761135472, "step": 8448 }, { "epoch": 1.5663700407860586, "grad_norm": 6.46875, "learning_rate": 8.433629959213943e-06, "loss": 2.5598, "mean_token_accuracy": 0.5002834467120182, "step": 8449 }, { "epoch": 1.5665554319614388, "grad_norm": 5.83203125, "learning_rate": 8.433444568038563e-06, "loss": 2.6172, "mean_token_accuracy": 0.4694871794871795, "step": 8450 }, { "epoch": 1.5667408231368187, "grad_norm": 5.6875, "learning_rate": 8.433259176863182e-06, "loss": 2.9658, "mean_token_accuracy": 0.44785358632754674, "step": 8451 }, { "epoch": 1.5669262143121987, "grad_norm": 5.31640625, "learning_rate": 8.433073785687802e-06, "loss": 2.3299, "mean_token_accuracy": 0.499515503875969, "step": 8452 }, { "epoch": 1.5671116054875789, "grad_norm": 8.78125, "learning_rate": 8.432888394512421e-06, "loss": 2.3122, "mean_token_accuracy": 0.5362698066579444, "step": 8453 }, { "epoch": 1.5672969966629589, "grad_norm": 7.81640625, "learning_rate": 8.432703003337042e-06, "loss": 2.9139, "mean_token_accuracy": 0.4539308176100629, "step": 8454 }, { "epoch": 1.5674823878383388, "grad_norm": 6.4609375, "learning_rate": 8.432517612161662e-06, "loss": 2.6785, "mean_token_accuracy": 0.5007900677200903, "step": 8455 }, { "epoch": 1.567667779013719, "grad_norm": 5.82421875, "learning_rate": 8.432332220986281e-06, "loss": 2.9801, "mean_token_accuracy": 0.45579742336739226, "step": 8456 }, { "epoch": 1.567853170189099, "grad_norm": 8.0234375, "learning_rate": 8.432146829810902e-06, "loss": 3.1714, "mean_token_accuracy": 0.4217910056378655, "step": 8457 }, { "epoch": 1.568038561364479, "grad_norm": 5.9765625, "learning_rate": 8.431961438635522e-06, "loss": 3.3776, "mean_token_accuracy": 0.4361509539061412, "step": 8458 }, { "epoch": 1.5682239525398591, "grad_norm": 7.49609375, "learning_rate": 8.431776047460143e-06, "loss": 2.8761, "mean_token_accuracy": 0.4672897196261682, "step": 8459 }, { "epoch": 1.568409343715239, "grad_norm": 5.53515625, "learning_rate": 8.431590656284761e-06, "loss": 2.1784, "mean_token_accuracy": 0.5402506372132541, "step": 8460 }, { "epoch": 1.568594734890619, "grad_norm": 5.78125, "learning_rate": 8.431405265109382e-06, "loss": 2.5836, "mean_token_accuracy": 0.4813252392221422, "step": 8461 }, { "epoch": 1.5687801260659993, "grad_norm": 5.765625, "learning_rate": 8.431219873934e-06, "loss": 3.1844, "mean_token_accuracy": 0.4172632158590308, "step": 8462 }, { "epoch": 1.5689655172413794, "grad_norm": 5.7421875, "learning_rate": 8.431034482758621e-06, "loss": 2.8508, "mean_token_accuracy": 0.47329746752394697, "step": 8463 }, { "epoch": 1.5691509084167594, "grad_norm": 7.1640625, "learning_rate": 8.430849091583242e-06, "loss": 3.3646, "mean_token_accuracy": 0.4275109170305677, "step": 8464 }, { "epoch": 1.5693362995921394, "grad_norm": 7.46484375, "learning_rate": 8.430663700407862e-06, "loss": 2.4379, "mean_token_accuracy": 0.5061133753241941, "step": 8465 }, { "epoch": 1.5695216907675196, "grad_norm": 10.0625, "learning_rate": 8.430478309232481e-06, "loss": 3.3212, "mean_token_accuracy": 0.4708789740849586, "step": 8466 }, { "epoch": 1.5697070819428995, "grad_norm": 6.21875, "learning_rate": 8.430292918057101e-06, "loss": 2.7067, "mean_token_accuracy": 0.4847483757295452, "step": 8467 }, { "epoch": 1.5698924731182795, "grad_norm": 7.00390625, "learning_rate": 8.430107526881722e-06, "loss": 1.93, "mean_token_accuracy": 0.5710831721470019, "step": 8468 }, { "epoch": 1.5700778642936597, "grad_norm": 5.9765625, "learning_rate": 8.42992213570634e-06, "loss": 2.4411, "mean_token_accuracy": 0.5247413685653746, "step": 8469 }, { "epoch": 1.5702632554690397, "grad_norm": 7.390625, "learning_rate": 8.429736744530961e-06, "loss": 2.6506, "mean_token_accuracy": 0.4908442715929347, "step": 8470 }, { "epoch": 1.5704486466444196, "grad_norm": 8.8984375, "learning_rate": 8.42955135335558e-06, "loss": 3.061, "mean_token_accuracy": 0.4353319295134656, "step": 8471 }, { "epoch": 1.5706340378197998, "grad_norm": 7.5546875, "learning_rate": 8.4293659621802e-06, "loss": 2.9731, "mean_token_accuracy": 0.4525856368205264, "step": 8472 }, { "epoch": 1.5708194289951798, "grad_norm": 6.37109375, "learning_rate": 8.429180571004821e-06, "loss": 2.6035, "mean_token_accuracy": 0.4880643841222207, "step": 8473 }, { "epoch": 1.5710048201705598, "grad_norm": 9.2734375, "learning_rate": 8.428995179829441e-06, "loss": 2.7968, "mean_token_accuracy": 0.4525010969723563, "step": 8474 }, { "epoch": 1.57119021134594, "grad_norm": 6.79296875, "learning_rate": 8.42880978865406e-06, "loss": 2.7208, "mean_token_accuracy": 0.4759751271905031, "step": 8475 }, { "epoch": 1.5713756025213201, "grad_norm": 7.69921875, "learning_rate": 8.42862439747868e-06, "loss": 3.0084, "mean_token_accuracy": 0.43533946779408106, "step": 8476 }, { "epoch": 1.5715609936966999, "grad_norm": 8.1171875, "learning_rate": 8.428439006303301e-06, "loss": 2.4103, "mean_token_accuracy": 0.5210897359302187, "step": 8477 }, { "epoch": 1.57174638487208, "grad_norm": 6.4921875, "learning_rate": 8.42825361512792e-06, "loss": 2.6997, "mean_token_accuracy": 0.46847339088624196, "step": 8478 }, { "epoch": 1.5719317760474603, "grad_norm": 6.87109375, "learning_rate": 8.42806822395254e-06, "loss": 2.6708, "mean_token_accuracy": 0.4825951510226195, "step": 8479 }, { "epoch": 1.5721171672228402, "grad_norm": 7.1640625, "learning_rate": 8.42788283277716e-06, "loss": 2.1141, "mean_token_accuracy": 0.5741696588868941, "step": 8480 }, { "epoch": 1.5723025583982202, "grad_norm": 9.1328125, "learning_rate": 8.427697441601781e-06, "loss": 2.521, "mean_token_accuracy": 0.49967500812479687, "step": 8481 }, { "epoch": 1.5724879495736004, "grad_norm": 6.109375, "learning_rate": 8.4275120504264e-06, "loss": 2.8349, "mean_token_accuracy": 0.49317523783262096, "step": 8482 }, { "epoch": 1.5726733407489804, "grad_norm": 5.8515625, "learning_rate": 8.42732665925102e-06, "loss": 3.3681, "mean_token_accuracy": 0.428452694278986, "step": 8483 }, { "epoch": 1.5728587319243603, "grad_norm": 9.234375, "learning_rate": 8.42714126807564e-06, "loss": 2.4523, "mean_token_accuracy": 0.4965366067514248, "step": 8484 }, { "epoch": 1.5730441230997405, "grad_norm": 8.1875, "learning_rate": 8.42695587690026e-06, "loss": 3.3614, "mean_token_accuracy": 0.42931078010603385, "step": 8485 }, { "epoch": 1.5732295142751205, "grad_norm": 5.67578125, "learning_rate": 8.42677048572488e-06, "loss": 2.8377, "mean_token_accuracy": 0.44548686549476596, "step": 8486 }, { "epoch": 1.5734149054505004, "grad_norm": 8.453125, "learning_rate": 8.4265850945495e-06, "loss": 3.379, "mean_token_accuracy": 0.42384887839433294, "step": 8487 }, { "epoch": 1.5736002966258806, "grad_norm": 6.4296875, "learning_rate": 8.42639970337412e-06, "loss": 3.1337, "mean_token_accuracy": 0.4355234460196292, "step": 8488 }, { "epoch": 1.5737856878012608, "grad_norm": 5.890625, "learning_rate": 8.42621431219874e-06, "loss": 2.9977, "mean_token_accuracy": 0.4712328767123288, "step": 8489 }, { "epoch": 1.5739710789766406, "grad_norm": 7.203125, "learning_rate": 8.42602892102336e-06, "loss": 3.2388, "mean_token_accuracy": 0.4267581475128645, "step": 8490 }, { "epoch": 1.5741564701520208, "grad_norm": 9.4765625, "learning_rate": 8.42584352984798e-06, "loss": 3.1719, "mean_token_accuracy": 0.45363048166786485, "step": 8491 }, { "epoch": 1.574341861327401, "grad_norm": 5.79296875, "learning_rate": 8.4256581386726e-06, "loss": 2.6594, "mean_token_accuracy": 0.4928061173156253, "step": 8492 }, { "epoch": 1.574527252502781, "grad_norm": 9.7265625, "learning_rate": 8.42547274749722e-06, "loss": 2.5923, "mean_token_accuracy": 0.5029758289809304, "step": 8493 }, { "epoch": 1.5747126436781609, "grad_norm": 12.890625, "learning_rate": 8.42528735632184e-06, "loss": 2.4506, "mean_token_accuracy": 0.5001299038711353, "step": 8494 }, { "epoch": 1.574898034853541, "grad_norm": 5.54296875, "learning_rate": 8.42510196514646e-06, "loss": 2.8877, "mean_token_accuracy": 0.45436199937907484, "step": 8495 }, { "epoch": 1.575083426028921, "grad_norm": 6.98828125, "learning_rate": 8.424916573971079e-06, "loss": 2.8114, "mean_token_accuracy": 0.4441413354117953, "step": 8496 }, { "epoch": 1.575268817204301, "grad_norm": 6.9453125, "learning_rate": 8.424731182795701e-06, "loss": 2.4207, "mean_token_accuracy": 0.4998148833765272, "step": 8497 }, { "epoch": 1.5754542083796812, "grad_norm": 8.9765625, "learning_rate": 8.42454579162032e-06, "loss": 2.4778, "mean_token_accuracy": 0.5161745676979179, "step": 8498 }, { "epoch": 1.5756395995550612, "grad_norm": 6.60546875, "learning_rate": 8.42436040044494e-06, "loss": 2.5788, "mean_token_accuracy": 0.49110049924028654, "step": 8499 }, { "epoch": 1.5758249907304411, "grad_norm": 6.3828125, "learning_rate": 8.424175009269559e-06, "loss": 3.3314, "mean_token_accuracy": 0.42735042735042733, "step": 8500 }, { "epoch": 1.5760103819058213, "grad_norm": 9.640625, "learning_rate": 8.42398961809418e-06, "loss": 2.9186, "mean_token_accuracy": 0.4283121597096189, "step": 8501 }, { "epoch": 1.5761957730812013, "grad_norm": 8.1171875, "learning_rate": 8.4238042269188e-06, "loss": 2.5448, "mean_token_accuracy": 0.48713480266529985, "step": 8502 }, { "epoch": 1.5763811642565813, "grad_norm": 6.21875, "learning_rate": 8.423618835743419e-06, "loss": 3.2833, "mean_token_accuracy": 0.47560679611650486, "step": 8503 }, { "epoch": 1.5765665554319614, "grad_norm": 8.2421875, "learning_rate": 8.42343344456804e-06, "loss": 2.7554, "mean_token_accuracy": 0.4461690885072655, "step": 8504 }, { "epoch": 1.5767519466073416, "grad_norm": 12.3203125, "learning_rate": 8.42324805339266e-06, "loss": 2.826, "mean_token_accuracy": 0.42980365452275565, "step": 8505 }, { "epoch": 1.5769373377827214, "grad_norm": 6.515625, "learning_rate": 8.42306266221728e-06, "loss": 2.3576, "mean_token_accuracy": 0.5284066638188808, "step": 8506 }, { "epoch": 1.5771227289581016, "grad_norm": 6.36328125, "learning_rate": 8.422877271041899e-06, "loss": 2.8022, "mean_token_accuracy": 0.48289812431040824, "step": 8507 }, { "epoch": 1.5773081201334818, "grad_norm": 9.5625, "learning_rate": 8.42269187986652e-06, "loss": 2.5185, "mean_token_accuracy": 0.4984953703703704, "step": 8508 }, { "epoch": 1.5774935113088617, "grad_norm": 7.71875, "learning_rate": 8.422506488691138e-06, "loss": 2.2664, "mean_token_accuracy": 0.5409507923269391, "step": 8509 }, { "epoch": 1.5776789024842417, "grad_norm": 5.6953125, "learning_rate": 8.422321097515759e-06, "loss": 2.9023, "mean_token_accuracy": 0.43927948866937827, "step": 8510 }, { "epoch": 1.5778642936596219, "grad_norm": 7.24609375, "learning_rate": 8.42213570634038e-06, "loss": 2.7093, "mean_token_accuracy": 0.4667966211825861, "step": 8511 }, { "epoch": 1.5780496848350019, "grad_norm": 6.671875, "learning_rate": 8.421950315164998e-06, "loss": 2.7113, "mean_token_accuracy": 0.47303958177744587, "step": 8512 }, { "epoch": 1.5782350760103818, "grad_norm": 6.171875, "learning_rate": 8.421764923989619e-06, "loss": 3.188, "mean_token_accuracy": 0.44219292158223455, "step": 8513 }, { "epoch": 1.578420467185762, "grad_norm": 6.1875, "learning_rate": 8.421579532814239e-06, "loss": 2.7631, "mean_token_accuracy": 0.46224601867105986, "step": 8514 }, { "epoch": 1.578605858361142, "grad_norm": 7.19140625, "learning_rate": 8.42139414163886e-06, "loss": 2.271, "mean_token_accuracy": 0.5368063420158551, "step": 8515 }, { "epoch": 1.578791249536522, "grad_norm": 7.41796875, "learning_rate": 8.421208750463478e-06, "loss": 2.8469, "mean_token_accuracy": 0.48372781065088755, "step": 8516 }, { "epoch": 1.5789766407119021, "grad_norm": 6.40625, "learning_rate": 8.421023359288099e-06, "loss": 2.8499, "mean_token_accuracy": 0.4705999205403258, "step": 8517 }, { "epoch": 1.5791620318872823, "grad_norm": 6.4609375, "learning_rate": 8.420837968112718e-06, "loss": 2.5319, "mean_token_accuracy": 0.4864171621779177, "step": 8518 }, { "epoch": 1.579347423062662, "grad_norm": 6.14453125, "learning_rate": 8.420652576937338e-06, "loss": 3.4918, "mean_token_accuracy": 0.4133238837703756, "step": 8519 }, { "epoch": 1.5795328142380423, "grad_norm": 7.7421875, "learning_rate": 8.420467185761959e-06, "loss": 2.7205, "mean_token_accuracy": 0.4586458333333333, "step": 8520 }, { "epoch": 1.5797182054134224, "grad_norm": 5.6796875, "learning_rate": 8.420281794586579e-06, "loss": 2.677, "mean_token_accuracy": 0.49216171617161714, "step": 8521 }, { "epoch": 1.5799035965888024, "grad_norm": 5.5546875, "learning_rate": 8.420096403411198e-06, "loss": 3.5245, "mean_token_accuracy": 0.40401076057715823, "step": 8522 }, { "epoch": 1.5800889877641824, "grad_norm": 5.9375, "learning_rate": 8.419911012235818e-06, "loss": 2.7, "mean_token_accuracy": 0.47946030598723044, "step": 8523 }, { "epoch": 1.5802743789395626, "grad_norm": 5.88671875, "learning_rate": 8.419725621060439e-06, "loss": 3.1547, "mean_token_accuracy": 0.45251460648413333, "step": 8524 }, { "epoch": 1.5804597701149425, "grad_norm": 6.3203125, "learning_rate": 8.419540229885058e-06, "loss": 2.9989, "mean_token_accuracy": 0.4875985663082437, "step": 8525 }, { "epoch": 1.5806451612903225, "grad_norm": 8.8046875, "learning_rate": 8.419354838709678e-06, "loss": 2.778, "mean_token_accuracy": 0.4647747074931541, "step": 8526 }, { "epoch": 1.5808305524657027, "grad_norm": 7.796875, "learning_rate": 8.419169447534297e-06, "loss": 2.5678, "mean_token_accuracy": 0.49796452457109625, "step": 8527 }, { "epoch": 1.5810159436410827, "grad_norm": 6.23046875, "learning_rate": 8.418984056358917e-06, "loss": 3.0137, "mean_token_accuracy": 0.46067242442936457, "step": 8528 }, { "epoch": 1.5812013348164626, "grad_norm": 6.64453125, "learning_rate": 8.418798665183538e-06, "loss": 2.885, "mean_token_accuracy": 0.441711988649799, "step": 8529 }, { "epoch": 1.5813867259918428, "grad_norm": 6.7421875, "learning_rate": 8.418613274008158e-06, "loss": 3.2051, "mean_token_accuracy": 0.4299418604651163, "step": 8530 }, { "epoch": 1.5815721171672228, "grad_norm": 6.87890625, "learning_rate": 8.418427882832779e-06, "loss": 3.1126, "mean_token_accuracy": 0.44399121430812677, "step": 8531 }, { "epoch": 1.5817575083426028, "grad_norm": 10.6640625, "learning_rate": 8.418242491657398e-06, "loss": 2.64, "mean_token_accuracy": 0.4772877618522602, "step": 8532 }, { "epoch": 1.581942899517983, "grad_norm": 8.6328125, "learning_rate": 8.418057100482018e-06, "loss": 2.431, "mean_token_accuracy": 0.5036755386565273, "step": 8533 }, { "epoch": 1.5821282906933631, "grad_norm": 11.40625, "learning_rate": 8.417871709306637e-06, "loss": 2.3335, "mean_token_accuracy": 0.513957509881423, "step": 8534 }, { "epoch": 1.5823136818687429, "grad_norm": 6.77734375, "learning_rate": 8.417686318131258e-06, "loss": 3.08, "mean_token_accuracy": 0.4370174277520406, "step": 8535 }, { "epoch": 1.582499073044123, "grad_norm": 5.90625, "learning_rate": 8.417500926955878e-06, "loss": 2.7527, "mean_token_accuracy": 0.4726314366806325, "step": 8536 }, { "epoch": 1.5826844642195033, "grad_norm": 6.328125, "learning_rate": 8.417315535780497e-06, "loss": 2.6442, "mean_token_accuracy": 0.46855573225386726, "step": 8537 }, { "epoch": 1.5828698553948832, "grad_norm": 5.83203125, "learning_rate": 8.417130144605117e-06, "loss": 2.9191, "mean_token_accuracy": 0.49198697068403907, "step": 8538 }, { "epoch": 1.5830552465702632, "grad_norm": 6.70703125, "learning_rate": 8.416944753429738e-06, "loss": 2.3842, "mean_token_accuracy": 0.5192307692307693, "step": 8539 }, { "epoch": 1.5832406377456434, "grad_norm": 6.078125, "learning_rate": 8.416759362254358e-06, "loss": 3.7909, "mean_token_accuracy": 0.41010984084286034, "step": 8540 }, { "epoch": 1.5834260289210234, "grad_norm": 6.05859375, "learning_rate": 8.416573971078977e-06, "loss": 2.8041, "mean_token_accuracy": 0.47223523898781633, "step": 8541 }, { "epoch": 1.5836114200964033, "grad_norm": 5.328125, "learning_rate": 8.416388579903598e-06, "loss": 3.3487, "mean_token_accuracy": 0.4473744554001376, "step": 8542 }, { "epoch": 1.5837968112717835, "grad_norm": 5.62890625, "learning_rate": 8.416203188728216e-06, "loss": 2.8092, "mean_token_accuracy": 0.45176110260336905, "step": 8543 }, { "epoch": 1.5839822024471635, "grad_norm": 7.51171875, "learning_rate": 8.416017797552837e-06, "loss": 2.6675, "mean_token_accuracy": 0.4629073260919806, "step": 8544 }, { "epoch": 1.5841675936225434, "grad_norm": 6.1328125, "learning_rate": 8.415832406377457e-06, "loss": 2.8197, "mean_token_accuracy": 0.44554165037152915, "step": 8545 }, { "epoch": 1.5843529847979236, "grad_norm": 7.171875, "learning_rate": 8.415647015202078e-06, "loss": 2.6288, "mean_token_accuracy": 0.45631207715560673, "step": 8546 }, { "epoch": 1.5845383759733038, "grad_norm": 6.875, "learning_rate": 8.415461624026697e-06, "loss": 3.223, "mean_token_accuracy": 0.44672131147540983, "step": 8547 }, { "epoch": 1.5847237671486836, "grad_norm": 7.68359375, "learning_rate": 8.415276232851317e-06, "loss": 2.6178, "mean_token_accuracy": 0.492586778301064, "step": 8548 }, { "epoch": 1.5849091583240638, "grad_norm": 5.53515625, "learning_rate": 8.415090841675938e-06, "loss": 2.8437, "mean_token_accuracy": 0.4559015964407223, "step": 8549 }, { "epoch": 1.585094549499444, "grad_norm": 6.1484375, "learning_rate": 8.414905450500556e-06, "loss": 3.1616, "mean_token_accuracy": 0.44719314938154137, "step": 8550 }, { "epoch": 1.585279940674824, "grad_norm": 7.3359375, "learning_rate": 8.414720059325177e-06, "loss": 2.6281, "mean_token_accuracy": 0.48656798245614036, "step": 8551 }, { "epoch": 1.5854653318502039, "grad_norm": 6.625, "learning_rate": 8.414534668149796e-06, "loss": 2.3305, "mean_token_accuracy": 0.5082903981264637, "step": 8552 }, { "epoch": 1.585650723025584, "grad_norm": 6.109375, "learning_rate": 8.414349276974416e-06, "loss": 2.5596, "mean_token_accuracy": 0.47637318255250405, "step": 8553 }, { "epoch": 1.585836114200964, "grad_norm": 7.57421875, "learning_rate": 8.414163885799037e-06, "loss": 3.4852, "mean_token_accuracy": 0.4205808940322215, "step": 8554 }, { "epoch": 1.586021505376344, "grad_norm": 6.42578125, "learning_rate": 8.413978494623657e-06, "loss": 3.6456, "mean_token_accuracy": 0.4002157497303128, "step": 8555 }, { "epoch": 1.5862068965517242, "grad_norm": 5.22265625, "learning_rate": 8.413793103448276e-06, "loss": 3.2209, "mean_token_accuracy": 0.4351843032669778, "step": 8556 }, { "epoch": 1.5863922877271042, "grad_norm": 5.34375, "learning_rate": 8.413607712272896e-06, "loss": 2.9638, "mean_token_accuracy": 0.4638513964987305, "step": 8557 }, { "epoch": 1.5865776789024841, "grad_norm": 6.703125, "learning_rate": 8.413422321097517e-06, "loss": 3.0011, "mean_token_accuracy": 0.4484749777909387, "step": 8558 }, { "epoch": 1.5867630700778643, "grad_norm": 7.71875, "learning_rate": 8.413236929922136e-06, "loss": 2.981, "mean_token_accuracy": 0.45481770833333335, "step": 8559 }, { "epoch": 1.5869484612532443, "grad_norm": 6.94140625, "learning_rate": 8.413051538746756e-06, "loss": 3.0304, "mean_token_accuracy": 0.44808414725770096, "step": 8560 }, { "epoch": 1.5871338524286243, "grad_norm": 6.32421875, "learning_rate": 8.412866147571375e-06, "loss": 2.7095, "mean_token_accuracy": 0.4721120186697783, "step": 8561 }, { "epoch": 1.5873192436040044, "grad_norm": 6.0234375, "learning_rate": 8.412680756395997e-06, "loss": 2.6582, "mean_token_accuracy": 0.4550159841680621, "step": 8562 }, { "epoch": 1.5875046347793846, "grad_norm": 7.93359375, "learning_rate": 8.412495365220616e-06, "loss": 2.2023, "mean_token_accuracy": 0.550316856780735, "step": 8563 }, { "epoch": 1.5876900259547646, "grad_norm": 5.90234375, "learning_rate": 8.412309974045237e-06, "loss": 3.2143, "mean_token_accuracy": 0.42887776983559683, "step": 8564 }, { "epoch": 1.5878754171301446, "grad_norm": 5.9296875, "learning_rate": 8.412124582869855e-06, "loss": 2.8184, "mean_token_accuracy": 0.44744780982261373, "step": 8565 }, { "epoch": 1.5880608083055248, "grad_norm": 5.46484375, "learning_rate": 8.411939191694476e-06, "loss": 2.381, "mean_token_accuracy": 0.514172335600907, "step": 8566 }, { "epoch": 1.5882461994809047, "grad_norm": 5.30078125, "learning_rate": 8.411753800519096e-06, "loss": 2.5785, "mean_token_accuracy": 0.48451327433628316, "step": 8567 }, { "epoch": 1.5884315906562847, "grad_norm": 6.38671875, "learning_rate": 8.411568409343715e-06, "loss": 3.1453, "mean_token_accuracy": 0.44353758070410526, "step": 8568 }, { "epoch": 1.5886169818316649, "grad_norm": 6.5859375, "learning_rate": 8.411383018168336e-06, "loss": 2.9786, "mean_token_accuracy": 0.44592440801457195, "step": 8569 }, { "epoch": 1.5888023730070449, "grad_norm": 6.5234375, "learning_rate": 8.411197626992956e-06, "loss": 3.0007, "mean_token_accuracy": 0.46088902451429153, "step": 8570 }, { "epoch": 1.5889877641824248, "grad_norm": 6.08984375, "learning_rate": 8.411012235817577e-06, "loss": 3.1898, "mean_token_accuracy": 0.5014020707506471, "step": 8571 }, { "epoch": 1.589173155357805, "grad_norm": 5.359375, "learning_rate": 8.410826844642195e-06, "loss": 2.5308, "mean_token_accuracy": 0.5078087264530672, "step": 8572 }, { "epoch": 1.589358546533185, "grad_norm": 7.15234375, "learning_rate": 8.410641453466816e-06, "loss": 3.9028, "mean_token_accuracy": 0.4039861558658551, "step": 8573 }, { "epoch": 1.589543937708565, "grad_norm": 6.94921875, "learning_rate": 8.410456062291436e-06, "loss": 2.6449, "mean_token_accuracy": 0.503921568627451, "step": 8574 }, { "epoch": 1.5897293288839451, "grad_norm": 6.5546875, "learning_rate": 8.410270671116055e-06, "loss": 3.465, "mean_token_accuracy": 0.41294232225949573, "step": 8575 }, { "epoch": 1.5899147200593253, "grad_norm": 7.60546875, "learning_rate": 8.410085279940676e-06, "loss": 2.8748, "mean_token_accuracy": 0.4710275560133917, "step": 8576 }, { "epoch": 1.590100111234705, "grad_norm": 7.05078125, "learning_rate": 8.409899888765294e-06, "loss": 3.1863, "mean_token_accuracy": 0.4696335742360847, "step": 8577 }, { "epoch": 1.5902855024100853, "grad_norm": 7.93359375, "learning_rate": 8.409714497589917e-06, "loss": 2.3812, "mean_token_accuracy": 0.5145588874402434, "step": 8578 }, { "epoch": 1.5904708935854655, "grad_norm": 8.859375, "learning_rate": 8.409529106414535e-06, "loss": 3.6163, "mean_token_accuracy": 0.42421330771078547, "step": 8579 }, { "epoch": 1.5906562847608454, "grad_norm": 9.1015625, "learning_rate": 8.409343715239156e-06, "loss": 2.8344, "mean_token_accuracy": 0.47660628176723574, "step": 8580 }, { "epoch": 1.5908416759362254, "grad_norm": 9.0859375, "learning_rate": 8.409158324063775e-06, "loss": 3.1078, "mean_token_accuracy": 0.4412874322093846, "step": 8581 }, { "epoch": 1.5910270671116056, "grad_norm": 6.2265625, "learning_rate": 8.408972932888395e-06, "loss": 2.7176, "mean_token_accuracy": 0.4824162184526272, "step": 8582 }, { "epoch": 1.5912124582869855, "grad_norm": 5.875, "learning_rate": 8.408787541713016e-06, "loss": 2.6757, "mean_token_accuracy": 0.4731616059864729, "step": 8583 }, { "epoch": 1.5913978494623655, "grad_norm": 8.765625, "learning_rate": 8.408602150537634e-06, "loss": 2.9229, "mean_token_accuracy": 0.4496810772501772, "step": 8584 }, { "epoch": 1.5915832406377457, "grad_norm": 8.5390625, "learning_rate": 8.408416759362255e-06, "loss": 2.6517, "mean_token_accuracy": 0.4864112639161755, "step": 8585 }, { "epoch": 1.5917686318131257, "grad_norm": 5.62109375, "learning_rate": 8.408231368186875e-06, "loss": 2.4607, "mean_token_accuracy": 0.5052899936265137, "step": 8586 }, { "epoch": 1.5919540229885056, "grad_norm": 7.4765625, "learning_rate": 8.408045977011496e-06, "loss": 3.1103, "mean_token_accuracy": 0.45030020013342226, "step": 8587 }, { "epoch": 1.5921394141638858, "grad_norm": 7.06640625, "learning_rate": 8.407860585836115e-06, "loss": 2.5483, "mean_token_accuracy": 0.48023200757575757, "step": 8588 }, { "epoch": 1.592324805339266, "grad_norm": 6.33203125, "learning_rate": 8.407675194660735e-06, "loss": 3.418, "mean_token_accuracy": 0.41613614103819785, "step": 8589 }, { "epoch": 1.5925101965146458, "grad_norm": 6.00390625, "learning_rate": 8.407489803485354e-06, "loss": 2.8001, "mean_token_accuracy": 0.5009683666881859, "step": 8590 }, { "epoch": 1.592695587690026, "grad_norm": 5.6328125, "learning_rate": 8.407304412309975e-06, "loss": 2.6756, "mean_token_accuracy": 0.4856084656084656, "step": 8591 }, { "epoch": 1.5928809788654061, "grad_norm": 6.73828125, "learning_rate": 8.407119021134595e-06, "loss": 2.8308, "mean_token_accuracy": 0.4684332464523603, "step": 8592 }, { "epoch": 1.593066370040786, "grad_norm": 6.90625, "learning_rate": 8.406933629959214e-06, "loss": 2.9859, "mean_token_accuracy": 0.4724200761967865, "step": 8593 }, { "epoch": 1.593251761216166, "grad_norm": 7.609375, "learning_rate": 8.406748238783834e-06, "loss": 2.638, "mean_token_accuracy": 0.4976752836153989, "step": 8594 }, { "epoch": 1.5934371523915463, "grad_norm": 6.171875, "learning_rate": 8.406562847608455e-06, "loss": 2.7275, "mean_token_accuracy": 0.4715752566992236, "step": 8595 }, { "epoch": 1.5936225435669262, "grad_norm": 5.8359375, "learning_rate": 8.406377456433075e-06, "loss": 2.8831, "mean_token_accuracy": 0.46126126126126127, "step": 8596 }, { "epoch": 1.5938079347423062, "grad_norm": 8.09375, "learning_rate": 8.406192065257694e-06, "loss": 2.6847, "mean_token_accuracy": 0.4756987316197104, "step": 8597 }, { "epoch": 1.5939933259176864, "grad_norm": 6.10546875, "learning_rate": 8.406006674082315e-06, "loss": 3.2149, "mean_token_accuracy": 0.4498239436619718, "step": 8598 }, { "epoch": 1.5941787170930664, "grad_norm": 6.83984375, "learning_rate": 8.405821282906933e-06, "loss": 3.1685, "mean_token_accuracy": 0.4303857566765579, "step": 8599 }, { "epoch": 1.5943641082684463, "grad_norm": 8.2265625, "learning_rate": 8.405635891731554e-06, "loss": 2.7891, "mean_token_accuracy": 0.4682016004492489, "step": 8600 }, { "epoch": 1.5945494994438265, "grad_norm": 5.90234375, "learning_rate": 8.405450500556174e-06, "loss": 3.0223, "mean_token_accuracy": 0.45625, "step": 8601 }, { "epoch": 1.5947348906192065, "grad_norm": 8.15625, "learning_rate": 8.405265109380795e-06, "loss": 1.9168, "mean_token_accuracy": 0.5838676583739421, "step": 8602 }, { "epoch": 1.5949202817945864, "grad_norm": 8.2578125, "learning_rate": 8.405079718205414e-06, "loss": 2.7637, "mean_token_accuracy": 0.4684414864115363, "step": 8603 }, { "epoch": 1.5951056729699666, "grad_norm": 8.6875, "learning_rate": 8.404894327030034e-06, "loss": 2.8355, "mean_token_accuracy": 0.45230197691389146, "step": 8604 }, { "epoch": 1.5952910641453468, "grad_norm": 5.42578125, "learning_rate": 8.404708935854655e-06, "loss": 2.856, "mean_token_accuracy": 0.4762416427889207, "step": 8605 }, { "epoch": 1.5954764553207266, "grad_norm": 5.32421875, "learning_rate": 8.404523544679273e-06, "loss": 2.8356, "mean_token_accuracy": 0.4732682473268247, "step": 8606 }, { "epoch": 1.5956618464961068, "grad_norm": 8.9453125, "learning_rate": 8.404338153503894e-06, "loss": 2.7425, "mean_token_accuracy": 0.47544318512060446, "step": 8607 }, { "epoch": 1.595847237671487, "grad_norm": 7.50390625, "learning_rate": 8.404152762328513e-06, "loss": 2.9613, "mean_token_accuracy": 0.4596247960848287, "step": 8608 }, { "epoch": 1.596032628846867, "grad_norm": 5.5234375, "learning_rate": 8.403967371153133e-06, "loss": 2.5058, "mean_token_accuracy": 0.493202258941644, "step": 8609 }, { "epoch": 1.5962180200222469, "grad_norm": 7.07421875, "learning_rate": 8.403781979977754e-06, "loss": 2.7966, "mean_token_accuracy": 0.4609459251270991, "step": 8610 }, { "epoch": 1.596403411197627, "grad_norm": 10.0390625, "learning_rate": 8.403596588802374e-06, "loss": 3.0163, "mean_token_accuracy": 0.43963520555877245, "step": 8611 }, { "epoch": 1.596588802373007, "grad_norm": 5.94921875, "learning_rate": 8.403411197626995e-06, "loss": 2.8013, "mean_token_accuracy": 0.46428034290870807, "step": 8612 }, { "epoch": 1.596774193548387, "grad_norm": 8.03125, "learning_rate": 8.403225806451613e-06, "loss": 2.3093, "mean_token_accuracy": 0.5013246982631734, "step": 8613 }, { "epoch": 1.5969595847237672, "grad_norm": 6.82421875, "learning_rate": 8.403040415276234e-06, "loss": 3.2666, "mean_token_accuracy": 0.4300713985720286, "step": 8614 }, { "epoch": 1.5971449758991472, "grad_norm": 6.43359375, "learning_rate": 8.402855024100853e-06, "loss": 3.0618, "mean_token_accuracy": 0.441352141314017, "step": 8615 }, { "epoch": 1.5973303670745271, "grad_norm": 6.58203125, "learning_rate": 8.402669632925473e-06, "loss": 2.7172, "mean_token_accuracy": 0.48085542322960684, "step": 8616 }, { "epoch": 1.5975157582499073, "grad_norm": 7.0234375, "learning_rate": 8.402484241750094e-06, "loss": 3.0322, "mean_token_accuracy": 0.4524315705554079, "step": 8617 }, { "epoch": 1.5977011494252875, "grad_norm": 5.703125, "learning_rate": 8.402298850574714e-06, "loss": 2.7527, "mean_token_accuracy": 0.4898522877386814, "step": 8618 }, { "epoch": 1.5978865406006673, "grad_norm": 6.66796875, "learning_rate": 8.402113459399333e-06, "loss": 2.9781, "mean_token_accuracy": 0.44206065421798435, "step": 8619 }, { "epoch": 1.5980719317760474, "grad_norm": 6.5078125, "learning_rate": 8.401928068223954e-06, "loss": 3.499, "mean_token_accuracy": 0.4003276897870016, "step": 8620 }, { "epoch": 1.5982573229514276, "grad_norm": 7.4296875, "learning_rate": 8.401742677048574e-06, "loss": 2.7849, "mean_token_accuracy": 0.47256977863330124, "step": 8621 }, { "epoch": 1.5984427141268076, "grad_norm": 8.2578125, "learning_rate": 8.401557285873193e-06, "loss": 2.9765, "mean_token_accuracy": 0.46650373778033355, "step": 8622 }, { "epoch": 1.5986281053021876, "grad_norm": 8.53125, "learning_rate": 8.401371894697813e-06, "loss": 3.6432, "mean_token_accuracy": 0.4119150080688542, "step": 8623 }, { "epoch": 1.5988134964775678, "grad_norm": 10.1875, "learning_rate": 8.401186503522432e-06, "loss": 2.8262, "mean_token_accuracy": 0.4548213546696096, "step": 8624 }, { "epoch": 1.5989988876529477, "grad_norm": 5.5703125, "learning_rate": 8.401001112347053e-06, "loss": 3.2812, "mean_token_accuracy": 0.42887624466571833, "step": 8625 }, { "epoch": 1.5991842788283277, "grad_norm": 7.80859375, "learning_rate": 8.400815721171673e-06, "loss": 2.3344, "mean_token_accuracy": 0.5284240825137922, "step": 8626 }, { "epoch": 1.5993696700037079, "grad_norm": 6.8515625, "learning_rate": 8.400630329996294e-06, "loss": 3.0744, "mean_token_accuracy": 0.42608089260808923, "step": 8627 }, { "epoch": 1.5995550611790879, "grad_norm": 6.3515625, "learning_rate": 8.400444938820912e-06, "loss": 2.4912, "mean_token_accuracy": 0.5222658808120497, "step": 8628 }, { "epoch": 1.5997404523544678, "grad_norm": 6.5078125, "learning_rate": 8.400259547645533e-06, "loss": 2.8949, "mean_token_accuracy": 0.436965645888388, "step": 8629 }, { "epoch": 1.599925843529848, "grad_norm": 6.05078125, "learning_rate": 8.400074156470153e-06, "loss": 2.8493, "mean_token_accuracy": 0.45618312339157524, "step": 8630 }, { "epoch": 1.600111234705228, "grad_norm": 6.4140625, "learning_rate": 8.399888765294772e-06, "loss": 2.4854, "mean_token_accuracy": 0.5345849802371542, "step": 8631 }, { "epoch": 1.600296625880608, "grad_norm": 5.80078125, "learning_rate": 8.399703374119393e-06, "loss": 2.8014, "mean_token_accuracy": 0.4791158317783838, "step": 8632 }, { "epoch": 1.6004820170559881, "grad_norm": 5.7734375, "learning_rate": 8.399517982944011e-06, "loss": 2.5197, "mean_token_accuracy": 0.5085915699809076, "step": 8633 }, { "epoch": 1.6006674082313683, "grad_norm": 7.2421875, "learning_rate": 8.399332591768634e-06, "loss": 2.7264, "mean_token_accuracy": 0.4726089588377724, "step": 8634 }, { "epoch": 1.600852799406748, "grad_norm": 7.265625, "learning_rate": 8.399147200593252e-06, "loss": 2.6153, "mean_token_accuracy": 0.4830630919890833, "step": 8635 }, { "epoch": 1.6010381905821283, "grad_norm": 6.0234375, "learning_rate": 8.398961809417873e-06, "loss": 2.7931, "mean_token_accuracy": 0.4743985194324491, "step": 8636 }, { "epoch": 1.6012235817575085, "grad_norm": 8.296875, "learning_rate": 8.398776418242492e-06, "loss": 3.1076, "mean_token_accuracy": 0.42414355628058725, "step": 8637 }, { "epoch": 1.6014089729328884, "grad_norm": 5.2734375, "learning_rate": 8.398591027067112e-06, "loss": 2.9813, "mean_token_accuracy": 0.4670192906036092, "step": 8638 }, { "epoch": 1.6015943641082684, "grad_norm": 5.3359375, "learning_rate": 8.398405635891733e-06, "loss": 3.1805, "mean_token_accuracy": 0.4517241379310345, "step": 8639 }, { "epoch": 1.6017797552836486, "grad_norm": 6.87890625, "learning_rate": 8.398220244716352e-06, "loss": 3.3873, "mean_token_accuracy": 0.4171504596032898, "step": 8640 }, { "epoch": 1.6019651464590285, "grad_norm": 9.1953125, "learning_rate": 8.398034853540972e-06, "loss": 2.7982, "mean_token_accuracy": 0.44830297470842617, "step": 8641 }, { "epoch": 1.6021505376344085, "grad_norm": 6.109375, "learning_rate": 8.397849462365592e-06, "loss": 2.7956, "mean_token_accuracy": 0.4911080711354309, "step": 8642 }, { "epoch": 1.6023359288097887, "grad_norm": 6.6484375, "learning_rate": 8.397664071190213e-06, "loss": 2.8701, "mean_token_accuracy": 0.47009860417467025, "step": 8643 }, { "epoch": 1.6025213199851687, "grad_norm": 5.2734375, "learning_rate": 8.397478680014832e-06, "loss": 2.985, "mean_token_accuracy": 0.4557747727776546, "step": 8644 }, { "epoch": 1.6027067111605486, "grad_norm": 6.02734375, "learning_rate": 8.397293288839452e-06, "loss": 2.6442, "mean_token_accuracy": 0.478779375657664, "step": 8645 }, { "epoch": 1.6028921023359288, "grad_norm": 6.3828125, "learning_rate": 8.397107897664071e-06, "loss": 3.9453, "mean_token_accuracy": 0.3714471968709257, "step": 8646 }, { "epoch": 1.603077493511309, "grad_norm": 7.19921875, "learning_rate": 8.396922506488692e-06, "loss": 2.8062, "mean_token_accuracy": 0.4599919039265956, "step": 8647 }, { "epoch": 1.6032628846866888, "grad_norm": 8.4921875, "learning_rate": 8.396737115313312e-06, "loss": 2.2855, "mean_token_accuracy": 0.5318315377081293, "step": 8648 }, { "epoch": 1.603448275862069, "grad_norm": 6.734375, "learning_rate": 8.396551724137931e-06, "loss": 2.8772, "mean_token_accuracy": 0.4736248590402205, "step": 8649 }, { "epoch": 1.6036336670374491, "grad_norm": 6.16796875, "learning_rate": 8.396366332962553e-06, "loss": 2.7806, "mean_token_accuracy": 0.4805728871242201, "step": 8650 }, { "epoch": 1.603819058212829, "grad_norm": 6.96875, "learning_rate": 8.396180941787172e-06, "loss": 2.799, "mean_token_accuracy": 0.49238253744718985, "step": 8651 }, { "epoch": 1.604004449388209, "grad_norm": 5.7578125, "learning_rate": 8.395995550611792e-06, "loss": 2.5007, "mean_token_accuracy": 0.5115071403281011, "step": 8652 }, { "epoch": 1.6041898405635893, "grad_norm": 6.11328125, "learning_rate": 8.395810159436411e-06, "loss": 2.8007, "mean_token_accuracy": 0.4666385135135135, "step": 8653 }, { "epoch": 1.6043752317389692, "grad_norm": 5.76953125, "learning_rate": 8.395624768261032e-06, "loss": 2.6047, "mean_token_accuracy": 0.47375565610859727, "step": 8654 }, { "epoch": 1.6045606229143492, "grad_norm": 5.6796875, "learning_rate": 8.395439377085652e-06, "loss": 2.4953, "mean_token_accuracy": 0.5057455350962204, "step": 8655 }, { "epoch": 1.6047460140897294, "grad_norm": 7.8515625, "learning_rate": 8.395253985910271e-06, "loss": 3.532, "mean_token_accuracy": 0.42178601720586456, "step": 8656 }, { "epoch": 1.6049314052651094, "grad_norm": 10.2265625, "learning_rate": 8.395068594734891e-06, "loss": 3.0786, "mean_token_accuracy": 0.4359581360578121, "step": 8657 }, { "epoch": 1.6051167964404893, "grad_norm": 7.1015625, "learning_rate": 8.394883203559512e-06, "loss": 2.9055, "mean_token_accuracy": 0.46379027853631893, "step": 8658 }, { "epoch": 1.6053021876158695, "grad_norm": 8.84375, "learning_rate": 8.394697812384132e-06, "loss": 2.4032, "mean_token_accuracy": 0.5116225546605293, "step": 8659 }, { "epoch": 1.6054875787912495, "grad_norm": 7.60546875, "learning_rate": 8.394512421208751e-06, "loss": 3.6699, "mean_token_accuracy": 0.40378951502061655, "step": 8660 }, { "epoch": 1.6056729699666294, "grad_norm": 6.7890625, "learning_rate": 8.394327030033372e-06, "loss": 2.4313, "mean_token_accuracy": 0.5136150234741784, "step": 8661 }, { "epoch": 1.6058583611420096, "grad_norm": 7.60546875, "learning_rate": 8.39414163885799e-06, "loss": 3.4075, "mean_token_accuracy": 0.4122950819672131, "step": 8662 }, { "epoch": 1.6060437523173898, "grad_norm": 7.0390625, "learning_rate": 8.393956247682611e-06, "loss": 2.5689, "mean_token_accuracy": 0.5111719763586565, "step": 8663 }, { "epoch": 1.6062291434927698, "grad_norm": 7.70703125, "learning_rate": 8.393770856507231e-06, "loss": 2.6302, "mean_token_accuracy": 0.48648273949812837, "step": 8664 }, { "epoch": 1.6064145346681498, "grad_norm": 8.578125, "learning_rate": 8.39358546533185e-06, "loss": 2.7151, "mean_token_accuracy": 0.4719166184134337, "step": 8665 }, { "epoch": 1.60659992584353, "grad_norm": 5.765625, "learning_rate": 8.39340007415647e-06, "loss": 2.8652, "mean_token_accuracy": 0.4518396649715824, "step": 8666 }, { "epoch": 1.60678531701891, "grad_norm": 7.63671875, "learning_rate": 8.393214682981091e-06, "loss": 2.7526, "mean_token_accuracy": 0.47163486333161425, "step": 8667 }, { "epoch": 1.6069707081942899, "grad_norm": 10.890625, "learning_rate": 8.393029291805712e-06, "loss": 3.1586, "mean_token_accuracy": 0.4612741210679355, "step": 8668 }, { "epoch": 1.60715609936967, "grad_norm": 7.453125, "learning_rate": 8.39284390063033e-06, "loss": 2.7412, "mean_token_accuracy": 0.49185611009493946, "step": 8669 }, { "epoch": 1.60734149054505, "grad_norm": 9.6484375, "learning_rate": 8.392658509454951e-06, "loss": 2.5046, "mean_token_accuracy": 0.49555254032865975, "step": 8670 }, { "epoch": 1.60752688172043, "grad_norm": 6.19140625, "learning_rate": 8.39247311827957e-06, "loss": 3.1556, "mean_token_accuracy": 0.4467073998642227, "step": 8671 }, { "epoch": 1.6077122728958102, "grad_norm": 8.6171875, "learning_rate": 8.39228772710419e-06, "loss": 2.7868, "mean_token_accuracy": 0.479288076862498, "step": 8672 }, { "epoch": 1.6078976640711902, "grad_norm": 7.59375, "learning_rate": 8.39210233592881e-06, "loss": 3.2884, "mean_token_accuracy": 0.44635845471817603, "step": 8673 }, { "epoch": 1.6080830552465701, "grad_norm": 6.671875, "learning_rate": 8.39191694475343e-06, "loss": 2.6022, "mean_token_accuracy": 0.48223615464994773, "step": 8674 }, { "epoch": 1.6082684464219503, "grad_norm": 6.00390625, "learning_rate": 8.39173155357805e-06, "loss": 3.1312, "mean_token_accuracy": 0.4388614580307871, "step": 8675 }, { "epoch": 1.6084538375973305, "grad_norm": 6.68359375, "learning_rate": 8.39154616240267e-06, "loss": 3.131, "mean_token_accuracy": 0.44995152690256907, "step": 8676 }, { "epoch": 1.6086392287727103, "grad_norm": 6.55859375, "learning_rate": 8.391360771227291e-06, "loss": 3.7159, "mean_token_accuracy": 0.39223663954255555, "step": 8677 }, { "epoch": 1.6088246199480905, "grad_norm": 7.14453125, "learning_rate": 8.39117538005191e-06, "loss": 3.0674, "mean_token_accuracy": 0.45244186046511625, "step": 8678 }, { "epoch": 1.6090100111234706, "grad_norm": 6.9609375, "learning_rate": 8.39098998887653e-06, "loss": 2.9394, "mean_token_accuracy": 0.47921419518377695, "step": 8679 }, { "epoch": 1.6091954022988506, "grad_norm": 5.76171875, "learning_rate": 8.390804597701149e-06, "loss": 2.4442, "mean_token_accuracy": 0.49082626850563077, "step": 8680 }, { "epoch": 1.6093807934742306, "grad_norm": 5.94140625, "learning_rate": 8.39061920652577e-06, "loss": 3.21, "mean_token_accuracy": 0.42822662601626016, "step": 8681 }, { "epoch": 1.6095661846496108, "grad_norm": 6.2734375, "learning_rate": 8.39043381535039e-06, "loss": 2.7947, "mean_token_accuracy": 0.46371158392434986, "step": 8682 }, { "epoch": 1.6097515758249907, "grad_norm": 6.0, "learning_rate": 8.39024842417501e-06, "loss": 2.8765, "mean_token_accuracy": 0.4587135293454996, "step": 8683 }, { "epoch": 1.6099369670003707, "grad_norm": 8.234375, "learning_rate": 8.39006303299963e-06, "loss": 2.4366, "mean_token_accuracy": 0.531859410430839, "step": 8684 }, { "epoch": 1.610122358175751, "grad_norm": 6.0703125, "learning_rate": 8.38987764182425e-06, "loss": 3.1128, "mean_token_accuracy": 0.4520777948789702, "step": 8685 }, { "epoch": 1.6103077493511309, "grad_norm": 7.953125, "learning_rate": 8.38969225064887e-06, "loss": 2.8506, "mean_token_accuracy": 0.5088055588834312, "step": 8686 }, { "epoch": 1.6104931405265108, "grad_norm": 6.4609375, "learning_rate": 8.38950685947349e-06, "loss": 3.0663, "mean_token_accuracy": 0.4762446297700278, "step": 8687 }, { "epoch": 1.610678531701891, "grad_norm": 7.11328125, "learning_rate": 8.38932146829811e-06, "loss": 3.1893, "mean_token_accuracy": 0.4541069459757442, "step": 8688 }, { "epoch": 1.6108639228772712, "grad_norm": 6.16015625, "learning_rate": 8.389136077122728e-06, "loss": 2.7016, "mean_token_accuracy": 0.48244810744810745, "step": 8689 }, { "epoch": 1.611049314052651, "grad_norm": 8.1875, "learning_rate": 8.388950685947349e-06, "loss": 3.0439, "mean_token_accuracy": 0.4649674500717202, "step": 8690 }, { "epoch": 1.6112347052280311, "grad_norm": 6.16796875, "learning_rate": 8.38876529477197e-06, "loss": 2.8254, "mean_token_accuracy": 0.4778393351800554, "step": 8691 }, { "epoch": 1.6114200964034113, "grad_norm": 6.69140625, "learning_rate": 8.38857990359659e-06, "loss": 3.0551, "mean_token_accuracy": 0.46653543307086615, "step": 8692 }, { "epoch": 1.6116054875787913, "grad_norm": 5.71484375, "learning_rate": 8.38839451242121e-06, "loss": 3.0346, "mean_token_accuracy": 0.45984102503262547, "step": 8693 }, { "epoch": 1.6117908787541713, "grad_norm": 7.88671875, "learning_rate": 8.38820912124583e-06, "loss": 2.7222, "mean_token_accuracy": 0.45812518366147514, "step": 8694 }, { "epoch": 1.6119762699295515, "grad_norm": 7.54296875, "learning_rate": 8.38802373007045e-06, "loss": 3.2794, "mean_token_accuracy": 0.42385001932740624, "step": 8695 }, { "epoch": 1.6121616611049314, "grad_norm": 6.4609375, "learning_rate": 8.387838338895069e-06, "loss": 2.9902, "mean_token_accuracy": 0.46371769383697814, "step": 8696 }, { "epoch": 1.6123470522803114, "grad_norm": 6.08203125, "learning_rate": 8.387652947719689e-06, "loss": 3.1507, "mean_token_accuracy": 0.43814016172506737, "step": 8697 }, { "epoch": 1.6125324434556916, "grad_norm": 6.73046875, "learning_rate": 8.38746755654431e-06, "loss": 2.5095, "mean_token_accuracy": 0.4918279569892473, "step": 8698 }, { "epoch": 1.6127178346310715, "grad_norm": 8.21875, "learning_rate": 8.38728216536893e-06, "loss": 2.7553, "mean_token_accuracy": 0.46977150978462884, "step": 8699 }, { "epoch": 1.6129032258064515, "grad_norm": 6.3671875, "learning_rate": 8.387096774193549e-06, "loss": 2.8277, "mean_token_accuracy": 0.4714035964035964, "step": 8700 }, { "epoch": 1.6130886169818317, "grad_norm": 6.125, "learning_rate": 8.38691138301817e-06, "loss": 2.6838, "mean_token_accuracy": 0.47739955357142855, "step": 8701 }, { "epoch": 1.6132740081572117, "grad_norm": 6.51953125, "learning_rate": 8.38672599184279e-06, "loss": 2.8808, "mean_token_accuracy": 0.46365584308554475, "step": 8702 }, { "epoch": 1.6134593993325916, "grad_norm": 10.5625, "learning_rate": 8.386540600667409e-06, "loss": 2.836, "mean_token_accuracy": 0.46285093842097097, "step": 8703 }, { "epoch": 1.6136447905079718, "grad_norm": 9.5234375, "learning_rate": 8.386355209492029e-06, "loss": 3.6062, "mean_token_accuracy": 0.4108641975308642, "step": 8704 }, { "epoch": 1.613830181683352, "grad_norm": 7.9140625, "learning_rate": 8.386169818316648e-06, "loss": 2.6586, "mean_token_accuracy": 0.48470106260401996, "step": 8705 }, { "epoch": 1.6140155728587318, "grad_norm": 8.484375, "learning_rate": 8.385984427141268e-06, "loss": 2.3907, "mean_token_accuracy": 0.5153913808267371, "step": 8706 }, { "epoch": 1.614200964034112, "grad_norm": 11.515625, "learning_rate": 8.385799035965889e-06, "loss": 2.8371, "mean_token_accuracy": 0.46099205393691306, "step": 8707 }, { "epoch": 1.6143863552094921, "grad_norm": 8.1015625, "learning_rate": 8.38561364479051e-06, "loss": 2.6495, "mean_token_accuracy": 0.4885974914481186, "step": 8708 }, { "epoch": 1.614571746384872, "grad_norm": 9.6015625, "learning_rate": 8.385428253615128e-06, "loss": 2.6431, "mean_token_accuracy": 0.48355736917906866, "step": 8709 }, { "epoch": 1.614757137560252, "grad_norm": 6.125, "learning_rate": 8.385242862439749e-06, "loss": 3.2684, "mean_token_accuracy": 0.43847361429779014, "step": 8710 }, { "epoch": 1.6149425287356323, "grad_norm": 8.5, "learning_rate": 8.385057471264369e-06, "loss": 3.1125, "mean_token_accuracy": 0.445178521225752, "step": 8711 }, { "epoch": 1.6151279199110122, "grad_norm": 10.0546875, "learning_rate": 8.384872080088988e-06, "loss": 3.1396, "mean_token_accuracy": 0.4551998774697503, "step": 8712 }, { "epoch": 1.6153133110863922, "grad_norm": 8.828125, "learning_rate": 8.384686688913608e-06, "loss": 3.1585, "mean_token_accuracy": 0.435092180546726, "step": 8713 }, { "epoch": 1.6154987022617724, "grad_norm": 7.0625, "learning_rate": 8.384501297738227e-06, "loss": 2.3264, "mean_token_accuracy": 0.5415637860082304, "step": 8714 }, { "epoch": 1.6156840934371524, "grad_norm": 8.359375, "learning_rate": 8.38431590656285e-06, "loss": 3.3887, "mean_token_accuracy": 0.4264190154565528, "step": 8715 }, { "epoch": 1.6158694846125323, "grad_norm": 7.10546875, "learning_rate": 8.384130515387468e-06, "loss": 3.2293, "mean_token_accuracy": 0.43281121187139326, "step": 8716 }, { "epoch": 1.6160548757879125, "grad_norm": 6.85546875, "learning_rate": 8.383945124212089e-06, "loss": 2.7756, "mean_token_accuracy": 0.44758016092230096, "step": 8717 }, { "epoch": 1.6162402669632927, "grad_norm": 6.76953125, "learning_rate": 8.383759733036707e-06, "loss": 2.3221, "mean_token_accuracy": 0.5683209341745927, "step": 8718 }, { "epoch": 1.6164256581386724, "grad_norm": 6.68359375, "learning_rate": 8.383574341861328e-06, "loss": 2.8779, "mean_token_accuracy": 0.46162458017166313, "step": 8719 }, { "epoch": 1.6166110493140526, "grad_norm": 6.10546875, "learning_rate": 8.383388950685948e-06, "loss": 2.7845, "mean_token_accuracy": 0.47326709250059146, "step": 8720 }, { "epoch": 1.6167964404894328, "grad_norm": 7.09375, "learning_rate": 8.383203559510567e-06, "loss": 2.6524, "mean_token_accuracy": 0.4699955548970218, "step": 8721 }, { "epoch": 1.6169818316648128, "grad_norm": 5.2265625, "learning_rate": 8.383018168335188e-06, "loss": 2.3545, "mean_token_accuracy": 0.5278416347381865, "step": 8722 }, { "epoch": 1.6171672228401928, "grad_norm": 8.4921875, "learning_rate": 8.382832777159808e-06, "loss": 3.2611, "mean_token_accuracy": 0.44083384426732064, "step": 8723 }, { "epoch": 1.617352614015573, "grad_norm": 5.8359375, "learning_rate": 8.382647385984429e-06, "loss": 3.1706, "mean_token_accuracy": 0.4355317884729649, "step": 8724 }, { "epoch": 1.617538005190953, "grad_norm": 5.6640625, "learning_rate": 8.382461994809048e-06, "loss": 2.5865, "mean_token_accuracy": 0.4916753381893861, "step": 8725 }, { "epoch": 1.6177233963663329, "grad_norm": 6.6015625, "learning_rate": 8.382276603633668e-06, "loss": 3.1512, "mean_token_accuracy": 0.4276888959290353, "step": 8726 }, { "epoch": 1.617908787541713, "grad_norm": 5.3671875, "learning_rate": 8.382091212458287e-06, "loss": 3.2208, "mean_token_accuracy": 0.4450572177030194, "step": 8727 }, { "epoch": 1.618094178717093, "grad_norm": 6.67578125, "learning_rate": 8.381905821282907e-06, "loss": 3.0537, "mean_token_accuracy": 0.44220616838010557, "step": 8728 }, { "epoch": 1.618279569892473, "grad_norm": 4.9375, "learning_rate": 8.381720430107528e-06, "loss": 2.736, "mean_token_accuracy": 0.4629345904537419, "step": 8729 }, { "epoch": 1.6184649610678532, "grad_norm": 6.19140625, "learning_rate": 8.381535038932147e-06, "loss": 3.5557, "mean_token_accuracy": 0.40139073827489274, "step": 8730 }, { "epoch": 1.6186503522432332, "grad_norm": 9.59375, "learning_rate": 8.381349647756769e-06, "loss": 3.157, "mean_token_accuracy": 0.4577866954776972, "step": 8731 }, { "epoch": 1.6188357434186131, "grad_norm": 5.96484375, "learning_rate": 8.381164256581388e-06, "loss": 3.4271, "mean_token_accuracy": 0.4199318568994889, "step": 8732 }, { "epoch": 1.6190211345939933, "grad_norm": 6.2578125, "learning_rate": 8.380978865406008e-06, "loss": 2.7057, "mean_token_accuracy": 0.49382030273573113, "step": 8733 }, { "epoch": 1.6192065257693735, "grad_norm": 5.3515625, "learning_rate": 8.380793474230627e-06, "loss": 3.178, "mean_token_accuracy": 0.4324363636363636, "step": 8734 }, { "epoch": 1.6193919169447535, "grad_norm": 6.03515625, "learning_rate": 8.380608083055247e-06, "loss": 2.7252, "mean_token_accuracy": 0.4905340122731427, "step": 8735 }, { "epoch": 1.6195773081201335, "grad_norm": 6.12109375, "learning_rate": 8.380422691879868e-06, "loss": 2.5136, "mean_token_accuracy": 0.4964589235127479, "step": 8736 }, { "epoch": 1.6197626992955136, "grad_norm": 6.15234375, "learning_rate": 8.380237300704487e-06, "loss": 2.1226, "mean_token_accuracy": 0.519751327548245, "step": 8737 }, { "epoch": 1.6199480904708936, "grad_norm": 5.91015625, "learning_rate": 8.380051909529107e-06, "loss": 3.3436, "mean_token_accuracy": 0.42542306178669814, "step": 8738 }, { "epoch": 1.6201334816462736, "grad_norm": 5.9296875, "learning_rate": 8.379866518353728e-06, "loss": 2.8995, "mean_token_accuracy": 0.4425174825174825, "step": 8739 }, { "epoch": 1.6203188728216538, "grad_norm": 6.43359375, "learning_rate": 8.379681127178348e-06, "loss": 3.3522, "mean_token_accuracy": 0.42960832648589425, "step": 8740 }, { "epoch": 1.6205042639970337, "grad_norm": 6.71484375, "learning_rate": 8.379495736002967e-06, "loss": 3.2208, "mean_token_accuracy": 0.43608297153883263, "step": 8741 }, { "epoch": 1.6206896551724137, "grad_norm": 6.06640625, "learning_rate": 8.379310344827587e-06, "loss": 2.7103, "mean_token_accuracy": 0.47791103689084563, "step": 8742 }, { "epoch": 1.620875046347794, "grad_norm": 8.6640625, "learning_rate": 8.379124953652206e-06, "loss": 2.8738, "mean_token_accuracy": 0.47170081673592207, "step": 8743 }, { "epoch": 1.6210604375231739, "grad_norm": 8.7421875, "learning_rate": 8.378939562476827e-06, "loss": 2.364, "mean_token_accuracy": 0.5155636143850105, "step": 8744 }, { "epoch": 1.6212458286985538, "grad_norm": 6.3203125, "learning_rate": 8.378754171301447e-06, "loss": 2.8848, "mean_token_accuracy": 0.4480376304571513, "step": 8745 }, { "epoch": 1.621431219873934, "grad_norm": 8.25, "learning_rate": 8.378568780126066e-06, "loss": 3.3542, "mean_token_accuracy": 0.41583214115402956, "step": 8746 }, { "epoch": 1.6216166110493142, "grad_norm": 14.15625, "learning_rate": 8.378383388950686e-06, "loss": 2.6231, "mean_token_accuracy": 0.4831294030404153, "step": 8747 }, { "epoch": 1.621802002224694, "grad_norm": 6.4765625, "learning_rate": 8.378197997775307e-06, "loss": 3.0147, "mean_token_accuracy": 0.45090361445783134, "step": 8748 }, { "epoch": 1.6219873934000741, "grad_norm": 6.609375, "learning_rate": 8.378012606599927e-06, "loss": 2.9997, "mean_token_accuracy": 0.4414651002073255, "step": 8749 }, { "epoch": 1.6221727845754543, "grad_norm": 7.109375, "learning_rate": 8.377827215424546e-06, "loss": 3.2786, "mean_token_accuracy": 0.41841941505911634, "step": 8750 }, { "epoch": 1.6223581757508343, "grad_norm": 6.85546875, "learning_rate": 8.377641824249167e-06, "loss": 2.424, "mean_token_accuracy": 0.49204898556022664, "step": 8751 }, { "epoch": 1.6225435669262143, "grad_norm": 5.43359375, "learning_rate": 8.377456433073786e-06, "loss": 3.1259, "mean_token_accuracy": 0.43714220288527594, "step": 8752 }, { "epoch": 1.6227289581015945, "grad_norm": 6.56640625, "learning_rate": 8.377271041898406e-06, "loss": 2.7814, "mean_token_accuracy": 0.4806212596181248, "step": 8753 }, { "epoch": 1.6229143492769744, "grad_norm": 6.921875, "learning_rate": 8.377085650723027e-06, "loss": 2.8349, "mean_token_accuracy": 0.4693958141371594, "step": 8754 }, { "epoch": 1.6230997404523544, "grad_norm": 7.81640625, "learning_rate": 8.376900259547647e-06, "loss": 3.27, "mean_token_accuracy": 0.4076861058855146, "step": 8755 }, { "epoch": 1.6232851316277346, "grad_norm": 6.58984375, "learning_rate": 8.376714868372266e-06, "loss": 3.212, "mean_token_accuracy": 0.43562650740783276, "step": 8756 }, { "epoch": 1.6234705228031145, "grad_norm": 6.60546875, "learning_rate": 8.376529477196886e-06, "loss": 2.5984, "mean_token_accuracy": 0.4968465311843027, "step": 8757 }, { "epoch": 1.6236559139784945, "grad_norm": 6.484375, "learning_rate": 8.376344086021507e-06, "loss": 2.6933, "mean_token_accuracy": 0.49328897556498796, "step": 8758 }, { "epoch": 1.6238413051538747, "grad_norm": 10.6484375, "learning_rate": 8.376158694846126e-06, "loss": 3.0886, "mean_token_accuracy": 0.44981729598051157, "step": 8759 }, { "epoch": 1.624026696329255, "grad_norm": 11.0390625, "learning_rate": 8.375973303670746e-06, "loss": 2.8943, "mean_token_accuracy": 0.45137236236712674, "step": 8760 }, { "epoch": 1.6242120875046346, "grad_norm": 9.7578125, "learning_rate": 8.375787912495365e-06, "loss": 2.6662, "mean_token_accuracy": 0.4861612515042118, "step": 8761 }, { "epoch": 1.6243974786800148, "grad_norm": 6.58984375, "learning_rate": 8.375602521319985e-06, "loss": 2.6576, "mean_token_accuracy": 0.5018670649738611, "step": 8762 }, { "epoch": 1.624582869855395, "grad_norm": 8.7109375, "learning_rate": 8.375417130144606e-06, "loss": 2.7496, "mean_token_accuracy": 0.47197558268590456, "step": 8763 }, { "epoch": 1.624768261030775, "grad_norm": 8.265625, "learning_rate": 8.375231738969226e-06, "loss": 2.6224, "mean_token_accuracy": 0.4936582442824328, "step": 8764 }, { "epoch": 1.624953652206155, "grad_norm": 13.171875, "learning_rate": 8.375046347793845e-06, "loss": 2.835, "mean_token_accuracy": 0.44433552723708164, "step": 8765 }, { "epoch": 1.6251390433815351, "grad_norm": 9.03125, "learning_rate": 8.374860956618466e-06, "loss": 3.2026, "mean_token_accuracy": 0.4111567821491486, "step": 8766 }, { "epoch": 1.625324434556915, "grad_norm": 5.8359375, "learning_rate": 8.374675565443086e-06, "loss": 2.7225, "mean_token_accuracy": 0.45979899497487436, "step": 8767 }, { "epoch": 1.625509825732295, "grad_norm": 8.3671875, "learning_rate": 8.374490174267705e-06, "loss": 3.1076, "mean_token_accuracy": 0.4501593740944654, "step": 8768 }, { "epoch": 1.6256952169076753, "grad_norm": 10.90625, "learning_rate": 8.374304783092325e-06, "loss": 2.8233, "mean_token_accuracy": 0.5003565825131936, "step": 8769 }, { "epoch": 1.6258806080830552, "grad_norm": 8.6875, "learning_rate": 8.374119391916944e-06, "loss": 2.4493, "mean_token_accuracy": 0.5243741765480896, "step": 8770 }, { "epoch": 1.6260659992584352, "grad_norm": 5.45703125, "learning_rate": 8.373934000741566e-06, "loss": 3.3252, "mean_token_accuracy": 0.446087786259542, "step": 8771 }, { "epoch": 1.6262513904338154, "grad_norm": 8.2109375, "learning_rate": 8.373748609566185e-06, "loss": 3.1779, "mean_token_accuracy": 0.43308832108272083, "step": 8772 }, { "epoch": 1.6264367816091954, "grad_norm": 7.32421875, "learning_rate": 8.373563218390806e-06, "loss": 2.9791, "mean_token_accuracy": 0.44981810400171196, "step": 8773 }, { "epoch": 1.6266221727845753, "grad_norm": 6.26953125, "learning_rate": 8.373377827215426e-06, "loss": 3.0773, "mean_token_accuracy": 0.4402861860209136, "step": 8774 }, { "epoch": 1.6268075639599555, "grad_norm": 6.31640625, "learning_rate": 8.373192436040045e-06, "loss": 2.91, "mean_token_accuracy": 0.48240880638894884, "step": 8775 }, { "epoch": 1.6269929551353357, "grad_norm": 6.21875, "learning_rate": 8.373007044864665e-06, "loss": 2.7576, "mean_token_accuracy": 0.4987667854206632, "step": 8776 }, { "epoch": 1.6271783463107155, "grad_norm": 6.1875, "learning_rate": 8.372821653689284e-06, "loss": 2.7194, "mean_token_accuracy": 0.48885793871866295, "step": 8777 }, { "epoch": 1.6273637374860956, "grad_norm": 7.65234375, "learning_rate": 8.372636262513905e-06, "loss": 3.0453, "mean_token_accuracy": 0.4468552240733566, "step": 8778 }, { "epoch": 1.6275491286614758, "grad_norm": 6.76953125, "learning_rate": 8.372450871338525e-06, "loss": 3.1302, "mean_token_accuracy": 0.4250386398763524, "step": 8779 }, { "epoch": 1.6277345198368558, "grad_norm": 8.671875, "learning_rate": 8.372265480163146e-06, "loss": 2.8536, "mean_token_accuracy": 0.4679523539421441, "step": 8780 }, { "epoch": 1.6279199110122358, "grad_norm": 7.38671875, "learning_rate": 8.372080088987765e-06, "loss": 3.6165, "mean_token_accuracy": 0.41606929510155316, "step": 8781 }, { "epoch": 1.628105302187616, "grad_norm": 6.7109375, "learning_rate": 8.371894697812385e-06, "loss": 2.4377, "mean_token_accuracy": 0.500449121006031, "step": 8782 }, { "epoch": 1.628290693362996, "grad_norm": 8.25, "learning_rate": 8.371709306637006e-06, "loss": 2.4435, "mean_token_accuracy": 0.48975055569276366, "step": 8783 }, { "epoch": 1.628476084538376, "grad_norm": 5.265625, "learning_rate": 8.371523915461624e-06, "loss": 2.4908, "mean_token_accuracy": 0.5074250490333426, "step": 8784 }, { "epoch": 1.628661475713756, "grad_norm": 6.76171875, "learning_rate": 8.371338524286245e-06, "loss": 2.6093, "mean_token_accuracy": 0.48430634023854363, "step": 8785 }, { "epoch": 1.628846866889136, "grad_norm": 8.234375, "learning_rate": 8.371153133110864e-06, "loss": 2.659, "mean_token_accuracy": 0.4823463478423314, "step": 8786 }, { "epoch": 1.629032258064516, "grad_norm": 6.19140625, "learning_rate": 8.370967741935484e-06, "loss": 2.8055, "mean_token_accuracy": 0.4695269526952695, "step": 8787 }, { "epoch": 1.6292176492398962, "grad_norm": 6.6640625, "learning_rate": 8.370782350760105e-06, "loss": 2.8646, "mean_token_accuracy": 0.4707775489186406, "step": 8788 }, { "epoch": 1.6294030404152764, "grad_norm": 8.5234375, "learning_rate": 8.370596959584725e-06, "loss": 3.1852, "mean_token_accuracy": 0.4490778970547757, "step": 8789 }, { "epoch": 1.6295884315906561, "grad_norm": 6.04296875, "learning_rate": 8.370411568409344e-06, "loss": 2.8226, "mean_token_accuracy": 0.4623541887592789, "step": 8790 }, { "epoch": 1.6297738227660363, "grad_norm": 5.87109375, "learning_rate": 8.370226177233964e-06, "loss": 3.2292, "mean_token_accuracy": 0.4213668499607227, "step": 8791 }, { "epoch": 1.6299592139414165, "grad_norm": 6.69921875, "learning_rate": 8.370040786058585e-06, "loss": 3.3587, "mean_token_accuracy": 0.4348685022842326, "step": 8792 }, { "epoch": 1.6301446051167965, "grad_norm": 6.5234375, "learning_rate": 8.369855394883204e-06, "loss": 3.2033, "mean_token_accuracy": 0.44258752341161217, "step": 8793 }, { "epoch": 1.6303299962921765, "grad_norm": 8.8515625, "learning_rate": 8.369670003707824e-06, "loss": 2.2662, "mean_token_accuracy": 0.5220538030861669, "step": 8794 }, { "epoch": 1.6305153874675566, "grad_norm": 9.2421875, "learning_rate": 8.369484612532443e-06, "loss": 2.6571, "mean_token_accuracy": 0.4830764581444545, "step": 8795 }, { "epoch": 1.6307007786429366, "grad_norm": 9.625, "learning_rate": 8.369299221357065e-06, "loss": 2.9988, "mean_token_accuracy": 0.4165642286416718, "step": 8796 }, { "epoch": 1.6308861698183166, "grad_norm": 6.46484375, "learning_rate": 8.369113830181684e-06, "loss": 2.7927, "mean_token_accuracy": 0.4953470959460905, "step": 8797 }, { "epoch": 1.6310715609936968, "grad_norm": 8.6640625, "learning_rate": 8.368928439006304e-06, "loss": 2.7785, "mean_token_accuracy": 0.49072418417523467, "step": 8798 }, { "epoch": 1.6312569521690767, "grad_norm": 5.83984375, "learning_rate": 8.368743047830923e-06, "loss": 3.5161, "mean_token_accuracy": 0.39652618823212604, "step": 8799 }, { "epoch": 1.6314423433444567, "grad_norm": 5.61328125, "learning_rate": 8.368557656655544e-06, "loss": 2.7975, "mean_token_accuracy": 0.46272054638588506, "step": 8800 }, { "epoch": 1.631627734519837, "grad_norm": 5.81640625, "learning_rate": 8.368372265480164e-06, "loss": 2.7163, "mean_token_accuracy": 0.5060637820032939, "step": 8801 }, { "epoch": 1.6318131256952169, "grad_norm": 5.83203125, "learning_rate": 8.368186874304783e-06, "loss": 3.2057, "mean_token_accuracy": 0.4193423597678917, "step": 8802 }, { "epoch": 1.6319985168705968, "grad_norm": 5.89453125, "learning_rate": 8.368001483129404e-06, "loss": 3.3967, "mean_token_accuracy": 0.4403158853903007, "step": 8803 }, { "epoch": 1.632183908045977, "grad_norm": 6.31640625, "learning_rate": 8.367816091954024e-06, "loss": 2.6812, "mean_token_accuracy": 0.49637571730594987, "step": 8804 }, { "epoch": 1.6323692992213572, "grad_norm": 6.69140625, "learning_rate": 8.367630700778644e-06, "loss": 3.6396, "mean_token_accuracy": 0.4328578455484506, "step": 8805 }, { "epoch": 1.632554690396737, "grad_norm": 7.94921875, "learning_rate": 8.367445309603263e-06, "loss": 2.6067, "mean_token_accuracy": 0.49641611778380473, "step": 8806 }, { "epoch": 1.6327400815721171, "grad_norm": 6.7734375, "learning_rate": 8.367259918427884e-06, "loss": 2.9741, "mean_token_accuracy": 0.46098868374032165, "step": 8807 }, { "epoch": 1.6329254727474973, "grad_norm": 6.93359375, "learning_rate": 8.367074527252503e-06, "loss": 2.9466, "mean_token_accuracy": 0.44734325911760686, "step": 8808 }, { "epoch": 1.6331108639228773, "grad_norm": 6.5859375, "learning_rate": 8.366889136077123e-06, "loss": 3.0028, "mean_token_accuracy": 0.4577215878194671, "step": 8809 }, { "epoch": 1.6332962550982573, "grad_norm": 6.29296875, "learning_rate": 8.366703744901744e-06, "loss": 2.8746, "mean_token_accuracy": 0.4679935449166218, "step": 8810 }, { "epoch": 1.6334816462736375, "grad_norm": 6.55859375, "learning_rate": 8.366518353726362e-06, "loss": 2.4088, "mean_token_accuracy": 0.5062162162162163, "step": 8811 }, { "epoch": 1.6336670374490174, "grad_norm": 6.4296875, "learning_rate": 8.366332962550985e-06, "loss": 3.1861, "mean_token_accuracy": 0.42810539523212043, "step": 8812 }, { "epoch": 1.6338524286243974, "grad_norm": 10.3125, "learning_rate": 8.366147571375603e-06, "loss": 2.6235, "mean_token_accuracy": 0.5053533190578159, "step": 8813 }, { "epoch": 1.6340378197997776, "grad_norm": 6.34375, "learning_rate": 8.365962180200224e-06, "loss": 2.6298, "mean_token_accuracy": 0.505017629509086, "step": 8814 }, { "epoch": 1.6342232109751575, "grad_norm": 6.93359375, "learning_rate": 8.365776789024843e-06, "loss": 3.0334, "mean_token_accuracy": 0.4579090291921249, "step": 8815 }, { "epoch": 1.6344086021505375, "grad_norm": 8.234375, "learning_rate": 8.365591397849463e-06, "loss": 3.137, "mean_token_accuracy": 0.44664466446644663, "step": 8816 }, { "epoch": 1.6345939933259177, "grad_norm": 8.2890625, "learning_rate": 8.365406006674084e-06, "loss": 2.7034, "mean_token_accuracy": 0.4574607329842932, "step": 8817 }, { "epoch": 1.634779384501298, "grad_norm": 5.84375, "learning_rate": 8.365220615498702e-06, "loss": 2.7892, "mean_token_accuracy": 0.49075081610446136, "step": 8818 }, { "epoch": 1.6349647756766776, "grad_norm": 8.28125, "learning_rate": 8.365035224323323e-06, "loss": 2.8311, "mean_token_accuracy": 0.4548637159289822, "step": 8819 }, { "epoch": 1.6351501668520578, "grad_norm": 8.3515625, "learning_rate": 8.364849833147943e-06, "loss": 2.2074, "mean_token_accuracy": 0.5258861439312568, "step": 8820 }, { "epoch": 1.635335558027438, "grad_norm": 6.03125, "learning_rate": 8.364664441972564e-06, "loss": 2.2845, "mean_token_accuracy": 0.5308416100365917, "step": 8821 }, { "epoch": 1.635520949202818, "grad_norm": 6.88671875, "learning_rate": 8.364479050797183e-06, "loss": 3.169, "mean_token_accuracy": 0.43691473632331196, "step": 8822 }, { "epoch": 1.635706340378198, "grad_norm": 6.11328125, "learning_rate": 8.364293659621803e-06, "loss": 2.4554, "mean_token_accuracy": 0.5282762938230384, "step": 8823 }, { "epoch": 1.6358917315535781, "grad_norm": 7.40625, "learning_rate": 8.364108268446422e-06, "loss": 2.809, "mean_token_accuracy": 0.4753790839455995, "step": 8824 }, { "epoch": 1.636077122728958, "grad_norm": 5.84375, "learning_rate": 8.363922877271042e-06, "loss": 2.6758, "mean_token_accuracy": 0.4891379708805177, "step": 8825 }, { "epoch": 1.636262513904338, "grad_norm": 6.34375, "learning_rate": 8.363737486095663e-06, "loss": 2.3031, "mean_token_accuracy": 0.5506055363321799, "step": 8826 }, { "epoch": 1.6364479050797183, "grad_norm": 8.828125, "learning_rate": 8.363552094920282e-06, "loss": 2.4782, "mean_token_accuracy": 0.4932441654155862, "step": 8827 }, { "epoch": 1.6366332962550982, "grad_norm": 6.453125, "learning_rate": 8.363366703744902e-06, "loss": 2.1661, "mean_token_accuracy": 0.563860103626943, "step": 8828 }, { "epoch": 1.6368186874304782, "grad_norm": 6.62890625, "learning_rate": 8.363181312569523e-06, "loss": 3.1639, "mean_token_accuracy": 0.44660886090984675, "step": 8829 }, { "epoch": 1.6370040786058584, "grad_norm": 8.0234375, "learning_rate": 8.362995921394143e-06, "loss": 3.4875, "mean_token_accuracy": 0.41378768844221103, "step": 8830 }, { "epoch": 1.6371894697812384, "grad_norm": 7.8203125, "learning_rate": 8.362810530218762e-06, "loss": 3.1557, "mean_token_accuracy": 0.4383342840844267, "step": 8831 }, { "epoch": 1.6373748609566183, "grad_norm": 7.421875, "learning_rate": 8.362625139043383e-06, "loss": 2.7168, "mean_token_accuracy": 0.49100609756097563, "step": 8832 }, { "epoch": 1.6375602521319985, "grad_norm": 10.0703125, "learning_rate": 8.362439747868001e-06, "loss": 2.4154, "mean_token_accuracy": 0.5020499933871181, "step": 8833 }, { "epoch": 1.6377456433073787, "grad_norm": 8.9453125, "learning_rate": 8.362254356692622e-06, "loss": 2.6216, "mean_token_accuracy": 0.4719281790164813, "step": 8834 }, { "epoch": 1.6379310344827587, "grad_norm": 5.97265625, "learning_rate": 8.362068965517242e-06, "loss": 2.6687, "mean_token_accuracy": 0.48147727272727275, "step": 8835 }, { "epoch": 1.6381164256581386, "grad_norm": 8.375, "learning_rate": 8.361883574341863e-06, "loss": 4.605, "mean_token_accuracy": 0.38232301206570724, "step": 8836 }, { "epoch": 1.6383018168335188, "grad_norm": 7.80859375, "learning_rate": 8.361698183166482e-06, "loss": 2.8881, "mean_token_accuracy": 0.4531287461040518, "step": 8837 }, { "epoch": 1.6384872080088988, "grad_norm": 7.25, "learning_rate": 8.361512791991102e-06, "loss": 3.9553, "mean_token_accuracy": 0.3836397486618571, "step": 8838 }, { "epoch": 1.6386725991842788, "grad_norm": 6.23046875, "learning_rate": 8.361327400815723e-06, "loss": 2.6187, "mean_token_accuracy": 0.48896969696969694, "step": 8839 }, { "epoch": 1.638857990359659, "grad_norm": 7.0546875, "learning_rate": 8.361142009640341e-06, "loss": 3.4368, "mean_token_accuracy": 0.4477919402600747, "step": 8840 }, { "epoch": 1.639043381535039, "grad_norm": 9.390625, "learning_rate": 8.360956618464962e-06, "loss": 2.6954, "mean_token_accuracy": 0.5128427441205159, "step": 8841 }, { "epoch": 1.639228772710419, "grad_norm": 6.6796875, "learning_rate": 8.36077122728958e-06, "loss": 3.0075, "mean_token_accuracy": 0.45757471659223636, "step": 8842 }, { "epoch": 1.639414163885799, "grad_norm": 6.75390625, "learning_rate": 8.360585836114201e-06, "loss": 3.1961, "mean_token_accuracy": 0.438194723449846, "step": 8843 }, { "epoch": 1.639599555061179, "grad_norm": 6.49609375, "learning_rate": 8.360400444938822e-06, "loss": 2.701, "mean_token_accuracy": 0.47089678510998306, "step": 8844 }, { "epoch": 1.639784946236559, "grad_norm": 6.171875, "learning_rate": 8.360215053763442e-06, "loss": 2.8426, "mean_token_accuracy": 0.47523838818824976, "step": 8845 }, { "epoch": 1.6399703374119392, "grad_norm": 5.82421875, "learning_rate": 8.360029662588061e-06, "loss": 2.545, "mean_token_accuracy": 0.49304148088096206, "step": 8846 }, { "epoch": 1.6401557285873194, "grad_norm": 7.23828125, "learning_rate": 8.359844271412681e-06, "loss": 3.3549, "mean_token_accuracy": 0.45261522527187986, "step": 8847 }, { "epoch": 1.6403411197626991, "grad_norm": 8.921875, "learning_rate": 8.359658880237302e-06, "loss": 2.5924, "mean_token_accuracy": 0.47549407114624503, "step": 8848 }, { "epoch": 1.6405265109380793, "grad_norm": 6.54296875, "learning_rate": 8.35947348906192e-06, "loss": 2.6524, "mean_token_accuracy": 0.5071950662402924, "step": 8849 }, { "epoch": 1.6407119021134595, "grad_norm": 7.1953125, "learning_rate": 8.359288097886541e-06, "loss": 2.6618, "mean_token_accuracy": 0.47151931688014165, "step": 8850 }, { "epoch": 1.6408972932888395, "grad_norm": 10.3125, "learning_rate": 8.35910270671116e-06, "loss": 2.7937, "mean_token_accuracy": 0.4871468567065411, "step": 8851 }, { "epoch": 1.6410826844642195, "grad_norm": 7.00390625, "learning_rate": 8.358917315535782e-06, "loss": 2.6599, "mean_token_accuracy": 0.4843529743445165, "step": 8852 }, { "epoch": 1.6412680756395996, "grad_norm": 6.8828125, "learning_rate": 8.358731924360401e-06, "loss": 2.6812, "mean_token_accuracy": 0.48415596654494053, "step": 8853 }, { "epoch": 1.6414534668149796, "grad_norm": 8.15625, "learning_rate": 8.358546533185021e-06, "loss": 1.9835, "mean_token_accuracy": 0.546678870292887, "step": 8854 }, { "epoch": 1.6416388579903596, "grad_norm": 9.6640625, "learning_rate": 8.358361142009642e-06, "loss": 2.8561, "mean_token_accuracy": 0.4505708039834831, "step": 8855 }, { "epoch": 1.6418242491657398, "grad_norm": 7.38671875, "learning_rate": 8.35817575083426e-06, "loss": 2.7305, "mean_token_accuracy": 0.493120470538809, "step": 8856 }, { "epoch": 1.6420096403411197, "grad_norm": 6.00390625, "learning_rate": 8.357990359658881e-06, "loss": 2.9431, "mean_token_accuracy": 0.4471526195899772, "step": 8857 }, { "epoch": 1.6421950315164997, "grad_norm": 7.5078125, "learning_rate": 8.3578049684835e-06, "loss": 2.9658, "mean_token_accuracy": 0.44033176528843776, "step": 8858 }, { "epoch": 1.64238042269188, "grad_norm": 9.3359375, "learning_rate": 8.35761957730812e-06, "loss": 2.7248, "mean_token_accuracy": 0.47452819897458276, "step": 8859 }, { "epoch": 1.64256581386726, "grad_norm": 5.60546875, "learning_rate": 8.357434186132741e-06, "loss": 2.8646, "mean_token_accuracy": 0.4497469269703543, "step": 8860 }, { "epoch": 1.6427512050426398, "grad_norm": 6.24609375, "learning_rate": 8.357248794957362e-06, "loss": 3.2009, "mean_token_accuracy": 0.4492247520603436, "step": 8861 }, { "epoch": 1.64293659621802, "grad_norm": 5.7265625, "learning_rate": 8.35706340378198e-06, "loss": 3.0196, "mean_token_accuracy": 0.4486628793127338, "step": 8862 }, { "epoch": 1.6431219873934002, "grad_norm": 7.84765625, "learning_rate": 8.3568780126066e-06, "loss": 2.9143, "mean_token_accuracy": 0.4354253112033195, "step": 8863 }, { "epoch": 1.6433073785687802, "grad_norm": 10.421875, "learning_rate": 8.356692621431221e-06, "loss": 3.671, "mean_token_accuracy": 0.4350613154960981, "step": 8864 }, { "epoch": 1.6434927697441601, "grad_norm": 7.0, "learning_rate": 8.35650723025584e-06, "loss": 2.8428, "mean_token_accuracy": 0.4612221749610095, "step": 8865 }, { "epoch": 1.6436781609195403, "grad_norm": 5.98828125, "learning_rate": 8.35632183908046e-06, "loss": 3.1397, "mean_token_accuracy": 0.4563353445538266, "step": 8866 }, { "epoch": 1.6438635520949203, "grad_norm": 6.97265625, "learning_rate": 8.35613644790508e-06, "loss": 2.9386, "mean_token_accuracy": 0.4553683969290707, "step": 8867 }, { "epoch": 1.6440489432703003, "grad_norm": 7.48046875, "learning_rate": 8.355951056729702e-06, "loss": 3.1005, "mean_token_accuracy": 0.42735949098621423, "step": 8868 }, { "epoch": 1.6442343344456805, "grad_norm": 6.12890625, "learning_rate": 8.35576566555432e-06, "loss": 3.0006, "mean_token_accuracy": 0.4480252764612954, "step": 8869 }, { "epoch": 1.6444197256210604, "grad_norm": 5.89453125, "learning_rate": 8.35558027437894e-06, "loss": 2.7202, "mean_token_accuracy": 0.47690671528410816, "step": 8870 }, { "epoch": 1.6446051167964404, "grad_norm": 6.3671875, "learning_rate": 8.35539488320356e-06, "loss": 3.161, "mean_token_accuracy": 0.4351408028759736, "step": 8871 }, { "epoch": 1.6447905079718206, "grad_norm": 6.4140625, "learning_rate": 8.35520949202818e-06, "loss": 2.6233, "mean_token_accuracy": 0.4792492422476102, "step": 8872 }, { "epoch": 1.6449758991472005, "grad_norm": 5.6640625, "learning_rate": 8.3550241008528e-06, "loss": 2.8209, "mean_token_accuracy": 0.45934959349593496, "step": 8873 }, { "epoch": 1.6451612903225805, "grad_norm": 6.2578125, "learning_rate": 8.35483870967742e-06, "loss": 2.5556, "mean_token_accuracy": 0.5086761824797089, "step": 8874 }, { "epoch": 1.6453466814979607, "grad_norm": 6.7734375, "learning_rate": 8.35465331850204e-06, "loss": 2.7282, "mean_token_accuracy": 0.4775630873191849, "step": 8875 }, { "epoch": 1.645532072673341, "grad_norm": 6.18359375, "learning_rate": 8.35446792732666e-06, "loss": 2.8468, "mean_token_accuracy": 0.46067730198712564, "step": 8876 }, { "epoch": 1.6457174638487206, "grad_norm": 6.14453125, "learning_rate": 8.354282536151281e-06, "loss": 2.8684, "mean_token_accuracy": 0.4735873850197109, "step": 8877 }, { "epoch": 1.6459028550241008, "grad_norm": 7.1796875, "learning_rate": 8.3540971449759e-06, "loss": 3.9834, "mean_token_accuracy": 0.41897696212417024, "step": 8878 }, { "epoch": 1.646088246199481, "grad_norm": 5.03125, "learning_rate": 8.35391175380052e-06, "loss": 2.3121, "mean_token_accuracy": 0.5300807043286867, "step": 8879 }, { "epoch": 1.646273637374861, "grad_norm": 9.3359375, "learning_rate": 8.353726362625139e-06, "loss": 3.1224, "mean_token_accuracy": 0.4431006848317062, "step": 8880 }, { "epoch": 1.646459028550241, "grad_norm": 7.2265625, "learning_rate": 8.35354097144976e-06, "loss": 3.2936, "mean_token_accuracy": 0.4360096589194084, "step": 8881 }, { "epoch": 1.6466444197256211, "grad_norm": 5.8671875, "learning_rate": 8.35335558027438e-06, "loss": 2.9265, "mean_token_accuracy": 0.4557272858587318, "step": 8882 }, { "epoch": 1.6468298109010011, "grad_norm": 5.76171875, "learning_rate": 8.353170189098999e-06, "loss": 2.9484, "mean_token_accuracy": 0.4702726218097448, "step": 8883 }, { "epoch": 1.647015202076381, "grad_norm": 6.01171875, "learning_rate": 8.35298479792362e-06, "loss": 2.6761, "mean_token_accuracy": 0.4667458432304038, "step": 8884 }, { "epoch": 1.6472005932517613, "grad_norm": 6.3984375, "learning_rate": 8.35279940674824e-06, "loss": 2.9143, "mean_token_accuracy": 0.47463468431210365, "step": 8885 }, { "epoch": 1.6473859844271412, "grad_norm": 6.83203125, "learning_rate": 8.35261401557286e-06, "loss": 3.4555, "mean_token_accuracy": 0.4168646080760095, "step": 8886 }, { "epoch": 1.6475713756025212, "grad_norm": 7.1953125, "learning_rate": 8.352428624397479e-06, "loss": 2.8409, "mean_token_accuracy": 0.4543462381300219, "step": 8887 }, { "epoch": 1.6477567667779014, "grad_norm": 7.02734375, "learning_rate": 8.3522432332221e-06, "loss": 2.6208, "mean_token_accuracy": 0.4878709677419355, "step": 8888 }, { "epoch": 1.6479421579532816, "grad_norm": 6.08984375, "learning_rate": 8.352057842046718e-06, "loss": 2.5263, "mean_token_accuracy": 0.4846723044397463, "step": 8889 }, { "epoch": 1.6481275491286613, "grad_norm": 6.796875, "learning_rate": 8.351872450871339e-06, "loss": 2.5631, "mean_token_accuracy": 0.499277858015776, "step": 8890 }, { "epoch": 1.6483129403040415, "grad_norm": 6.1484375, "learning_rate": 8.35168705969596e-06, "loss": 2.4274, "mean_token_accuracy": 0.5145564405383136, "step": 8891 }, { "epoch": 1.6484983314794217, "grad_norm": 5.8125, "learning_rate": 8.35150166852058e-06, "loss": 3.0021, "mean_token_accuracy": 0.43482805477161074, "step": 8892 }, { "epoch": 1.6486837226548017, "grad_norm": 6.26171875, "learning_rate": 8.3513162773452e-06, "loss": 2.6814, "mean_token_accuracy": 0.466122574684324, "step": 8893 }, { "epoch": 1.6488691138301816, "grad_norm": 6.46875, "learning_rate": 8.351130886169819e-06, "loss": 2.1462, "mean_token_accuracy": 0.5381827271591051, "step": 8894 }, { "epoch": 1.6490545050055618, "grad_norm": 7.2890625, "learning_rate": 8.35094549499444e-06, "loss": 2.4187, "mean_token_accuracy": 0.5052848985542462, "step": 8895 }, { "epoch": 1.6492398961809418, "grad_norm": 6.3203125, "learning_rate": 8.350760103819058e-06, "loss": 2.5877, "mean_token_accuracy": 0.4992602708546717, "step": 8896 }, { "epoch": 1.6494252873563218, "grad_norm": 5.58984375, "learning_rate": 8.350574712643679e-06, "loss": 3.3992, "mean_token_accuracy": 0.40399581832965503, "step": 8897 }, { "epoch": 1.649610678531702, "grad_norm": 6.70703125, "learning_rate": 8.350389321468298e-06, "loss": 2.7946, "mean_token_accuracy": 0.4770566349731294, "step": 8898 }, { "epoch": 1.649796069707082, "grad_norm": 5.67578125, "learning_rate": 8.350203930292918e-06, "loss": 3.0669, "mean_token_accuracy": 0.4374927854092116, "step": 8899 }, { "epoch": 1.649981460882462, "grad_norm": 5.87109375, "learning_rate": 8.350018539117539e-06, "loss": 3.1234, "mean_token_accuracy": 0.43770384866275275, "step": 8900 }, { "epoch": 1.650166852057842, "grad_norm": 6.88671875, "learning_rate": 8.349833147942159e-06, "loss": 2.8359, "mean_token_accuracy": 0.4533399429916966, "step": 8901 }, { "epoch": 1.650352243233222, "grad_norm": 5.57421875, "learning_rate": 8.34964775676678e-06, "loss": 2.5231, "mean_token_accuracy": 0.4983351831298557, "step": 8902 }, { "epoch": 1.650537634408602, "grad_norm": 5.875, "learning_rate": 8.349462365591398e-06, "loss": 2.9503, "mean_token_accuracy": 0.4602080461210678, "step": 8903 }, { "epoch": 1.6507230255839822, "grad_norm": 5.453125, "learning_rate": 8.349276974416019e-06, "loss": 3.3628, "mean_token_accuracy": 0.4294787781026313, "step": 8904 }, { "epoch": 1.6509084167593624, "grad_norm": 5.87890625, "learning_rate": 8.349091583240638e-06, "loss": 2.7305, "mean_token_accuracy": 0.46581945661700264, "step": 8905 }, { "epoch": 1.6510938079347421, "grad_norm": 5.86328125, "learning_rate": 8.348906192065258e-06, "loss": 2.5961, "mean_token_accuracy": 0.4800856021876115, "step": 8906 }, { "epoch": 1.6512791991101223, "grad_norm": 6.18359375, "learning_rate": 8.348720800889879e-06, "loss": 2.88, "mean_token_accuracy": 0.48093496996604856, "step": 8907 }, { "epoch": 1.6514645902855025, "grad_norm": 6.64453125, "learning_rate": 8.348535409714497e-06, "loss": 2.6665, "mean_token_accuracy": 0.4768935479108224, "step": 8908 }, { "epoch": 1.6516499814608825, "grad_norm": 6.375, "learning_rate": 8.348350018539118e-06, "loss": 2.722, "mean_token_accuracy": 0.4585215379530682, "step": 8909 }, { "epoch": 1.6518353726362625, "grad_norm": 6.84765625, "learning_rate": 8.348164627363738e-06, "loss": 2.9283, "mean_token_accuracy": 0.44984288131496253, "step": 8910 }, { "epoch": 1.6520207638116426, "grad_norm": 9.5078125, "learning_rate": 8.347979236188359e-06, "loss": 2.975, "mean_token_accuracy": 0.46616541353383456, "step": 8911 }, { "epoch": 1.6522061549870226, "grad_norm": 6.5234375, "learning_rate": 8.347793845012978e-06, "loss": 2.5324, "mean_token_accuracy": 0.49645704162976084, "step": 8912 }, { "epoch": 1.6523915461624026, "grad_norm": 7.359375, "learning_rate": 8.347608453837598e-06, "loss": 3.6046, "mean_token_accuracy": 0.43010752688172044, "step": 8913 }, { "epoch": 1.6525769373377828, "grad_norm": 7.46875, "learning_rate": 8.347423062662217e-06, "loss": 2.685, "mean_token_accuracy": 0.4922027290448343, "step": 8914 }, { "epoch": 1.6527623285131627, "grad_norm": 7.33984375, "learning_rate": 8.347237671486838e-06, "loss": 2.5033, "mean_token_accuracy": 0.48224400871459694, "step": 8915 }, { "epoch": 1.6529477196885427, "grad_norm": 5.8828125, "learning_rate": 8.347052280311458e-06, "loss": 2.8199, "mean_token_accuracy": 0.46099290780141844, "step": 8916 }, { "epoch": 1.653133110863923, "grad_norm": 7.75390625, "learning_rate": 8.346866889136079e-06, "loss": 2.7777, "mean_token_accuracy": 0.47224797986488276, "step": 8917 }, { "epoch": 1.653318502039303, "grad_norm": 7.1953125, "learning_rate": 8.346681497960697e-06, "loss": 2.423, "mean_token_accuracy": 0.5005189413596264, "step": 8918 }, { "epoch": 1.6535038932146828, "grad_norm": 5.5234375, "learning_rate": 8.346496106785318e-06, "loss": 2.9997, "mean_token_accuracy": 0.4807347670250896, "step": 8919 }, { "epoch": 1.653689284390063, "grad_norm": 10.0234375, "learning_rate": 8.346310715609938e-06, "loss": 2.5239, "mean_token_accuracy": 0.4829603627321925, "step": 8920 }, { "epoch": 1.6538746755654432, "grad_norm": 6.28515625, "learning_rate": 8.346125324434557e-06, "loss": 3.1977, "mean_token_accuracy": 0.42775712515489467, "step": 8921 }, { "epoch": 1.6540600667408232, "grad_norm": 6.15234375, "learning_rate": 8.345939933259178e-06, "loss": 2.9899, "mean_token_accuracy": 0.4540427439844567, "step": 8922 }, { "epoch": 1.6542454579162031, "grad_norm": 5.72265625, "learning_rate": 8.345754542083796e-06, "loss": 2.8329, "mean_token_accuracy": 0.4738286969253294, "step": 8923 }, { "epoch": 1.6544308490915833, "grad_norm": 8.03125, "learning_rate": 8.345569150908417e-06, "loss": 2.926, "mean_token_accuracy": 0.45076361978573054, "step": 8924 }, { "epoch": 1.6546162402669633, "grad_norm": 9.671875, "learning_rate": 8.345383759733037e-06, "loss": 2.7679, "mean_token_accuracy": 0.48451507742461286, "step": 8925 }, { "epoch": 1.6548016314423433, "grad_norm": 6.05078125, "learning_rate": 8.345198368557658e-06, "loss": 3.097, "mean_token_accuracy": 0.4546755104695019, "step": 8926 }, { "epoch": 1.6549870226177235, "grad_norm": 7.4296875, "learning_rate": 8.345012977382277e-06, "loss": 3.1438, "mean_token_accuracy": 0.4406298093997869, "step": 8927 }, { "epoch": 1.6551724137931034, "grad_norm": 7.4609375, "learning_rate": 8.344827586206897e-06, "loss": 2.8248, "mean_token_accuracy": 0.47302540993866565, "step": 8928 }, { "epoch": 1.6553578049684834, "grad_norm": 6.40625, "learning_rate": 8.344642195031518e-06, "loss": 2.7979, "mean_token_accuracy": 0.4591714399893433, "step": 8929 }, { "epoch": 1.6555431961438636, "grad_norm": 5.80859375, "learning_rate": 8.344456803856136e-06, "loss": 3.6356, "mean_token_accuracy": 0.38245083207261726, "step": 8930 }, { "epoch": 1.6557285873192435, "grad_norm": 6.7890625, "learning_rate": 8.344271412680757e-06, "loss": 2.658, "mean_token_accuracy": 0.4933811362382791, "step": 8931 }, { "epoch": 1.6559139784946235, "grad_norm": 5.90625, "learning_rate": 8.344086021505376e-06, "loss": 2.7383, "mean_token_accuracy": 0.4720262096774194, "step": 8932 }, { "epoch": 1.6560993696700037, "grad_norm": 5.81640625, "learning_rate": 8.343900630329998e-06, "loss": 2.4528, "mean_token_accuracy": 0.5091585181104531, "step": 8933 }, { "epoch": 1.656284760845384, "grad_norm": 5.40234375, "learning_rate": 8.343715239154617e-06, "loss": 2.7788, "mean_token_accuracy": 0.47177594442032134, "step": 8934 }, { "epoch": 1.6564701520207639, "grad_norm": 6.84375, "learning_rate": 8.343529847979237e-06, "loss": 2.4675, "mean_token_accuracy": 0.5143762183235867, "step": 8935 }, { "epoch": 1.6566555431961438, "grad_norm": 5.3515625, "learning_rate": 8.343344456803858e-06, "loss": 3.0416, "mean_token_accuracy": 0.4616797900262467, "step": 8936 }, { "epoch": 1.656840934371524, "grad_norm": 7.28515625, "learning_rate": 8.343159065628476e-06, "loss": 2.7382, "mean_token_accuracy": 0.48330229671011793, "step": 8937 }, { "epoch": 1.657026325546904, "grad_norm": 5.96875, "learning_rate": 8.342973674453097e-06, "loss": 3.2798, "mean_token_accuracy": 0.4383697813121272, "step": 8938 }, { "epoch": 1.657211716722284, "grad_norm": 5.04296875, "learning_rate": 8.342788283277716e-06, "loss": 2.706, "mean_token_accuracy": 0.4714981729598051, "step": 8939 }, { "epoch": 1.6573971078976641, "grad_norm": 8.8203125, "learning_rate": 8.342602892102336e-06, "loss": 2.3911, "mean_token_accuracy": 0.5070240825688074, "step": 8940 }, { "epoch": 1.6575824990730441, "grad_norm": 7.84765625, "learning_rate": 8.342417500926957e-06, "loss": 2.5925, "mean_token_accuracy": 0.48901715568382237, "step": 8941 }, { "epoch": 1.657767890248424, "grad_norm": 6.11328125, "learning_rate": 8.342232109751577e-06, "loss": 3.574, "mean_token_accuracy": 0.40052164840897236, "step": 8942 }, { "epoch": 1.6579532814238043, "grad_norm": 8.546875, "learning_rate": 8.342046718576196e-06, "loss": 2.3831, "mean_token_accuracy": 0.5049430609435616, "step": 8943 }, { "epoch": 1.6581386725991842, "grad_norm": 6.82421875, "learning_rate": 8.341861327400817e-06, "loss": 3.0246, "mean_token_accuracy": 0.4502415458937198, "step": 8944 }, { "epoch": 1.6583240637745642, "grad_norm": 5.7109375, "learning_rate": 8.341675936225437e-06, "loss": 2.9924, "mean_token_accuracy": 0.46675567423230974, "step": 8945 }, { "epoch": 1.6585094549499444, "grad_norm": 7.1953125, "learning_rate": 8.341490545050056e-06, "loss": 3.1273, "mean_token_accuracy": 0.4332704797821077, "step": 8946 }, { "epoch": 1.6586948461253246, "grad_norm": 10.328125, "learning_rate": 8.341305153874676e-06, "loss": 2.506, "mean_token_accuracy": 0.4873920945024989, "step": 8947 }, { "epoch": 1.6588802373007043, "grad_norm": 6.18359375, "learning_rate": 8.341119762699295e-06, "loss": 3.1364, "mean_token_accuracy": 0.4446677384780279, "step": 8948 }, { "epoch": 1.6590656284760845, "grad_norm": 6.7421875, "learning_rate": 8.340934371523917e-06, "loss": 2.5963, "mean_token_accuracy": 0.47680511182108626, "step": 8949 }, { "epoch": 1.6592510196514647, "grad_norm": 8.2109375, "learning_rate": 8.340748980348536e-06, "loss": 2.869, "mean_token_accuracy": 0.4663911510878814, "step": 8950 }, { "epoch": 1.6594364108268447, "grad_norm": 7.65625, "learning_rate": 8.340563589173157e-06, "loss": 2.7554, "mean_token_accuracy": 0.47129489124936774, "step": 8951 }, { "epoch": 1.6596218020022246, "grad_norm": 5.58203125, "learning_rate": 8.340378197997775e-06, "loss": 2.6674, "mean_token_accuracy": 0.464941112024103, "step": 8952 }, { "epoch": 1.6598071931776048, "grad_norm": 9.9140625, "learning_rate": 8.340192806822396e-06, "loss": 3.1767, "mean_token_accuracy": 0.43936731107205623, "step": 8953 }, { "epoch": 1.6599925843529848, "grad_norm": 9.7734375, "learning_rate": 8.340007415647016e-06, "loss": 3.0985, "mean_token_accuracy": 0.42906415267877673, "step": 8954 }, { "epoch": 1.6601779755283648, "grad_norm": 7.01953125, "learning_rate": 8.339822024471635e-06, "loss": 3.0179, "mean_token_accuracy": 0.4436495983935743, "step": 8955 }, { "epoch": 1.660363366703745, "grad_norm": 10.234375, "learning_rate": 8.339636633296256e-06, "loss": 2.9426, "mean_token_accuracy": 0.46073227167551384, "step": 8956 }, { "epoch": 1.660548757879125, "grad_norm": 6.9609375, "learning_rate": 8.339451242120876e-06, "loss": 2.3345, "mean_token_accuracy": 0.5381624983801996, "step": 8957 }, { "epoch": 1.660734149054505, "grad_norm": 5.80078125, "learning_rate": 8.339265850945497e-06, "loss": 2.6572, "mean_token_accuracy": 0.4807121661721068, "step": 8958 }, { "epoch": 1.660919540229885, "grad_norm": 7.74609375, "learning_rate": 8.339080459770115e-06, "loss": 3.3757, "mean_token_accuracy": 0.42642746248059343, "step": 8959 }, { "epoch": 1.6611049314052653, "grad_norm": 9.078125, "learning_rate": 8.338895068594736e-06, "loss": 3.4042, "mean_token_accuracy": 0.42110596409959467, "step": 8960 }, { "epoch": 1.661290322580645, "grad_norm": 9.578125, "learning_rate": 8.338709677419355e-06, "loss": 2.6259, "mean_token_accuracy": 0.47948521916411824, "step": 8961 }, { "epoch": 1.6614757137560252, "grad_norm": 6.2578125, "learning_rate": 8.338524286243975e-06, "loss": 3.0731, "mean_token_accuracy": 0.4330493000608643, "step": 8962 }, { "epoch": 1.6616611049314054, "grad_norm": 8.09375, "learning_rate": 8.338338895068596e-06, "loss": 3.3641, "mean_token_accuracy": 0.42841091492776884, "step": 8963 }, { "epoch": 1.6618464961067854, "grad_norm": 7.44140625, "learning_rate": 8.338153503893215e-06, "loss": 3.119, "mean_token_accuracy": 0.4395997140814868, "step": 8964 }, { "epoch": 1.6620318872821653, "grad_norm": 5.9140625, "learning_rate": 8.337968112717835e-06, "loss": 3.0123, "mean_token_accuracy": 0.4436574372182872, "step": 8965 }, { "epoch": 1.6622172784575455, "grad_norm": 7.50390625, "learning_rate": 8.337782721542455e-06, "loss": 3.1252, "mean_token_accuracy": 0.41665666626665065, "step": 8966 }, { "epoch": 1.6624026696329255, "grad_norm": 10.03125, "learning_rate": 8.337597330367076e-06, "loss": 2.9077, "mean_token_accuracy": 0.4581852641554134, "step": 8967 }, { "epoch": 1.6625880608083055, "grad_norm": 6.3984375, "learning_rate": 8.337411939191695e-06, "loss": 2.8411, "mean_token_accuracy": 0.4494369494926971, "step": 8968 }, { "epoch": 1.6627734519836856, "grad_norm": 6.703125, "learning_rate": 8.337226548016315e-06, "loss": 2.8383, "mean_token_accuracy": 0.4523720582741875, "step": 8969 }, { "epoch": 1.6629588431590656, "grad_norm": 9.6640625, "learning_rate": 8.337041156840934e-06, "loss": 2.2965, "mean_token_accuracy": 0.5086042065009561, "step": 8970 }, { "epoch": 1.6631442343344456, "grad_norm": 6.0703125, "learning_rate": 8.336855765665555e-06, "loss": 3.091, "mean_token_accuracy": 0.4368794326241135, "step": 8971 }, { "epoch": 1.6633296255098258, "grad_norm": 6.46875, "learning_rate": 8.336670374490175e-06, "loss": 3.115, "mean_token_accuracy": 0.44277175116227585, "step": 8972 }, { "epoch": 1.6635150166852057, "grad_norm": 6.19921875, "learning_rate": 8.336484983314796e-06, "loss": 3.1499, "mean_token_accuracy": 0.4318181818181818, "step": 8973 }, { "epoch": 1.6637004078605857, "grad_norm": 7.09765625, "learning_rate": 8.336299592139416e-06, "loss": 2.9243, "mean_token_accuracy": 0.4716981132075472, "step": 8974 }, { "epoch": 1.663885799035966, "grad_norm": 6.0390625, "learning_rate": 8.336114200964035e-06, "loss": 2.7871, "mean_token_accuracy": 0.4651128701260627, "step": 8975 }, { "epoch": 1.664071190211346, "grad_norm": 6.40625, "learning_rate": 8.335928809788655e-06, "loss": 3.1742, "mean_token_accuracy": 0.4586917929810187, "step": 8976 }, { "epoch": 1.6642565813867258, "grad_norm": 8.03125, "learning_rate": 8.335743418613274e-06, "loss": 3.0432, "mean_token_accuracy": 0.44231963243511735, "step": 8977 }, { "epoch": 1.664441972562106, "grad_norm": 6.12890625, "learning_rate": 8.335558027437895e-06, "loss": 3.0977, "mean_token_accuracy": 0.44981949458483755, "step": 8978 }, { "epoch": 1.6646273637374862, "grad_norm": 6.32421875, "learning_rate": 8.335372636262513e-06, "loss": 2.0904, "mean_token_accuracy": 0.5561176098640531, "step": 8979 }, { "epoch": 1.6648127549128662, "grad_norm": 7.96484375, "learning_rate": 8.335187245087134e-06, "loss": 2.8054, "mean_token_accuracy": 0.48493057907213005, "step": 8980 }, { "epoch": 1.6649981460882461, "grad_norm": 9.0625, "learning_rate": 8.335001853911754e-06, "loss": 2.3613, "mean_token_accuracy": 0.5271335542099852, "step": 8981 }, { "epoch": 1.6651835372636263, "grad_norm": 6.28515625, "learning_rate": 8.334816462736375e-06, "loss": 3.0985, "mean_token_accuracy": 0.45264373716632444, "step": 8982 }, { "epoch": 1.6653689284390063, "grad_norm": 6.18359375, "learning_rate": 8.334631071560995e-06, "loss": 2.9048, "mean_token_accuracy": 0.4543681747269891, "step": 8983 }, { "epoch": 1.6655543196143863, "grad_norm": 5.24609375, "learning_rate": 8.334445680385614e-06, "loss": 2.6863, "mean_token_accuracy": 0.47003154574132494, "step": 8984 }, { "epoch": 1.6657397107897665, "grad_norm": 5.4921875, "learning_rate": 8.334260289210235e-06, "loss": 2.8223, "mean_token_accuracy": 0.47709074733096085, "step": 8985 }, { "epoch": 1.6659251019651464, "grad_norm": 7.75, "learning_rate": 8.334074898034853e-06, "loss": 3.0837, "mean_token_accuracy": 0.44212198997677543, "step": 8986 }, { "epoch": 1.6661104931405264, "grad_norm": 6.62890625, "learning_rate": 8.333889506859474e-06, "loss": 2.7917, "mean_token_accuracy": 0.46207624323967816, "step": 8987 }, { "epoch": 1.6662958843159066, "grad_norm": 7.41796875, "learning_rate": 8.333704115684094e-06, "loss": 2.9036, "mean_token_accuracy": 0.46514527996009475, "step": 8988 }, { "epoch": 1.6664812754912868, "grad_norm": 8.15625, "learning_rate": 8.333518724508715e-06, "loss": 3.101, "mean_token_accuracy": 0.45799164474702153, "step": 8989 }, { "epoch": 1.6666666666666665, "grad_norm": 6.37890625, "learning_rate": 8.333333333333334e-06, "loss": 3.3409, "mean_token_accuracy": 0.43858688210324526, "step": 8990 }, { "epoch": 1.6668520578420467, "grad_norm": 7.76171875, "learning_rate": 8.333147942157954e-06, "loss": 3.3431, "mean_token_accuracy": 0.41430886202168765, "step": 8991 }, { "epoch": 1.667037449017427, "grad_norm": 6.8125, "learning_rate": 8.332962550982575e-06, "loss": 3.332, "mean_token_accuracy": 0.4458502024291498, "step": 8992 }, { "epoch": 1.6672228401928069, "grad_norm": 5.96484375, "learning_rate": 8.332777159807194e-06, "loss": 2.6627, "mean_token_accuracy": 0.4687168610816543, "step": 8993 }, { "epoch": 1.6674082313681868, "grad_norm": 7.71484375, "learning_rate": 8.332591768631814e-06, "loss": 3.0384, "mean_token_accuracy": 0.447800741918389, "step": 8994 }, { "epoch": 1.667593622543567, "grad_norm": 7.61328125, "learning_rate": 8.332406377456433e-06, "loss": 2.6316, "mean_token_accuracy": 0.4780055136042191, "step": 8995 }, { "epoch": 1.667779013718947, "grad_norm": 5.83203125, "learning_rate": 8.332220986281053e-06, "loss": 2.6068, "mean_token_accuracy": 0.498416354736539, "step": 8996 }, { "epoch": 1.667964404894327, "grad_norm": 8.484375, "learning_rate": 8.332035595105674e-06, "loss": 2.5613, "mean_token_accuracy": 0.4997301672962763, "step": 8997 }, { "epoch": 1.6681497960697071, "grad_norm": 7.53515625, "learning_rate": 8.331850203930294e-06, "loss": 3.7567, "mean_token_accuracy": 0.3870479857005362, "step": 8998 }, { "epoch": 1.6683351872450871, "grad_norm": 5.7265625, "learning_rate": 8.331664812754913e-06, "loss": 2.6293, "mean_token_accuracy": 0.4918627666593358, "step": 8999 }, { "epoch": 1.668520578420467, "grad_norm": 7.2890625, "learning_rate": 8.331479421579534e-06, "loss": 2.5366, "mean_token_accuracy": 0.4916379734382686, "step": 9000 }, { "epoch": 1.6687059695958473, "grad_norm": 6.71484375, "learning_rate": 8.331294030404154e-06, "loss": 2.8305, "mean_token_accuracy": 0.49238445378151263, "step": 9001 }, { "epoch": 1.6688913607712272, "grad_norm": 5.51953125, "learning_rate": 8.331108639228773e-06, "loss": 3.6064, "mean_token_accuracy": 0.40842560773832254, "step": 9002 }, { "epoch": 1.6690767519466072, "grad_norm": 7.12109375, "learning_rate": 8.330923248053393e-06, "loss": 2.5349, "mean_token_accuracy": 0.510231120270867, "step": 9003 }, { "epoch": 1.6692621431219874, "grad_norm": 8.2734375, "learning_rate": 8.330737856878012e-06, "loss": 3.027, "mean_token_accuracy": 0.46778668310727495, "step": 9004 }, { "epoch": 1.6694475342973676, "grad_norm": 6.01171875, "learning_rate": 8.330552465702634e-06, "loss": 3.4727, "mean_token_accuracy": 0.4431224608241439, "step": 9005 }, { "epoch": 1.6696329254727473, "grad_norm": 7.98828125, "learning_rate": 8.330367074527253e-06, "loss": 2.7563, "mean_token_accuracy": 0.4786685419596812, "step": 9006 }, { "epoch": 1.6698183166481275, "grad_norm": 7.05078125, "learning_rate": 8.330181683351874e-06, "loss": 3.0117, "mean_token_accuracy": 0.44002541296060993, "step": 9007 }, { "epoch": 1.6700037078235077, "grad_norm": 6.9609375, "learning_rate": 8.329996292176492e-06, "loss": 3.0331, "mean_token_accuracy": 0.4196854816062904, "step": 9008 }, { "epoch": 1.6701890989988877, "grad_norm": 7.7265625, "learning_rate": 8.329810901001113e-06, "loss": 2.7892, "mean_token_accuracy": 0.4558925356598809, "step": 9009 }, { "epoch": 1.6703744901742676, "grad_norm": 5.61328125, "learning_rate": 8.329625509825733e-06, "loss": 2.3497, "mean_token_accuracy": 0.5170417160152652, "step": 9010 }, { "epoch": 1.6705598813496478, "grad_norm": 5.80859375, "learning_rate": 8.329440118650352e-06, "loss": 3.2756, "mean_token_accuracy": 0.44947275922671354, "step": 9011 }, { "epoch": 1.6707452725250278, "grad_norm": 7.1953125, "learning_rate": 8.329254727474973e-06, "loss": 2.9402, "mean_token_accuracy": 0.46925512414597564, "step": 9012 }, { "epoch": 1.6709306637004078, "grad_norm": 7.40234375, "learning_rate": 8.329069336299593e-06, "loss": 3.0045, "mean_token_accuracy": 0.45862776384773196, "step": 9013 }, { "epoch": 1.671116054875788, "grad_norm": 8.1484375, "learning_rate": 8.328883945124214e-06, "loss": 2.5807, "mean_token_accuracy": 0.47954999474292925, "step": 9014 }, { "epoch": 1.671301446051168, "grad_norm": 6.44921875, "learning_rate": 8.328698553948832e-06, "loss": 2.528, "mean_token_accuracy": 0.5122330646301796, "step": 9015 }, { "epoch": 1.671486837226548, "grad_norm": 6.60546875, "learning_rate": 8.328513162773453e-06, "loss": 2.9889, "mean_token_accuracy": 0.4682550246239851, "step": 9016 }, { "epoch": 1.671672228401928, "grad_norm": 6.16015625, "learning_rate": 8.328327771598072e-06, "loss": 2.8057, "mean_token_accuracy": 0.4726643598615917, "step": 9017 }, { "epoch": 1.6718576195773083, "grad_norm": 7.25, "learning_rate": 8.328142380422692e-06, "loss": 2.8836, "mean_token_accuracy": 0.4497442855651811, "step": 9018 }, { "epoch": 1.672043010752688, "grad_norm": 9.0, "learning_rate": 8.327956989247313e-06, "loss": 2.4302, "mean_token_accuracy": 0.48620917917034423, "step": 9019 }, { "epoch": 1.6722284019280682, "grad_norm": 5.08203125, "learning_rate": 8.327771598071932e-06, "loss": 2.803, "mean_token_accuracy": 0.45986580516898606, "step": 9020 }, { "epoch": 1.6724137931034484, "grad_norm": 5.703125, "learning_rate": 8.327586206896554e-06, "loss": 3.3039, "mean_token_accuracy": 0.43088975937325125, "step": 9021 }, { "epoch": 1.6725991842788284, "grad_norm": 6.8359375, "learning_rate": 8.327400815721173e-06, "loss": 2.8829, "mean_token_accuracy": 0.4466959215281363, "step": 9022 }, { "epoch": 1.6727845754542083, "grad_norm": 7.06640625, "learning_rate": 8.327215424545793e-06, "loss": 3.3077, "mean_token_accuracy": 0.4373962138824311, "step": 9023 }, { "epoch": 1.6729699666295885, "grad_norm": 5.78125, "learning_rate": 8.327030033370412e-06, "loss": 2.7014, "mean_token_accuracy": 0.4916142557651992, "step": 9024 }, { "epoch": 1.6731553578049685, "grad_norm": 6.7578125, "learning_rate": 8.326844642195032e-06, "loss": 2.2777, "mean_token_accuracy": 0.5315116279069767, "step": 9025 }, { "epoch": 1.6733407489803485, "grad_norm": 5.46875, "learning_rate": 8.326659251019653e-06, "loss": 2.6853, "mean_token_accuracy": 0.48560923037910486, "step": 9026 }, { "epoch": 1.6735261401557286, "grad_norm": 5.671875, "learning_rate": 8.326473859844272e-06, "loss": 2.8309, "mean_token_accuracy": 0.4642772088592522, "step": 9027 }, { "epoch": 1.6737115313311086, "grad_norm": 5.359375, "learning_rate": 8.326288468668892e-06, "loss": 2.4247, "mean_token_accuracy": 0.49465942917177375, "step": 9028 }, { "epoch": 1.6738969225064886, "grad_norm": 6.1640625, "learning_rate": 8.326103077493513e-06, "loss": 3.0541, "mean_token_accuracy": 0.443935119887165, "step": 9029 }, { "epoch": 1.6740823136818688, "grad_norm": 6.2265625, "learning_rate": 8.325917686318133e-06, "loss": 2.9046, "mean_token_accuracy": 0.46596022644796053, "step": 9030 }, { "epoch": 1.6742677048572487, "grad_norm": 5.2265625, "learning_rate": 8.325732295142752e-06, "loss": 3.2694, "mean_token_accuracy": 0.4195157561747171, "step": 9031 }, { "epoch": 1.6744530960326287, "grad_norm": 5.16796875, "learning_rate": 8.325546903967372e-06, "loss": 2.7101, "mean_token_accuracy": 0.48514851485148514, "step": 9032 }, { "epoch": 1.674638487208009, "grad_norm": 5.72265625, "learning_rate": 8.325361512791991e-06, "loss": 2.3203, "mean_token_accuracy": 0.5085653104925053, "step": 9033 }, { "epoch": 1.674823878383389, "grad_norm": 5.66015625, "learning_rate": 8.325176121616612e-06, "loss": 3.1346, "mean_token_accuracy": 0.4413498208713741, "step": 9034 }, { "epoch": 1.675009269558769, "grad_norm": 5.7578125, "learning_rate": 8.324990730441232e-06, "loss": 2.7465, "mean_token_accuracy": 0.46360804304729536, "step": 9035 }, { "epoch": 1.675194660734149, "grad_norm": 4.90625, "learning_rate": 8.324805339265851e-06, "loss": 2.7933, "mean_token_accuracy": 0.48440279919345275, "step": 9036 }, { "epoch": 1.6753800519095292, "grad_norm": 7.2578125, "learning_rate": 8.324619948090471e-06, "loss": 2.3204, "mean_token_accuracy": 0.5199916483975363, "step": 9037 }, { "epoch": 1.6755654430849092, "grad_norm": 7.26953125, "learning_rate": 8.324434556915092e-06, "loss": 2.8912, "mean_token_accuracy": 0.46266094420600856, "step": 9038 }, { "epoch": 1.6757508342602891, "grad_norm": 6.515625, "learning_rate": 8.324249165739712e-06, "loss": 2.9263, "mean_token_accuracy": 0.45702005730659023, "step": 9039 }, { "epoch": 1.6759362254356693, "grad_norm": 7.6640625, "learning_rate": 8.324063774564331e-06, "loss": 3.4648, "mean_token_accuracy": 0.43855534709193245, "step": 9040 }, { "epoch": 1.6761216166110493, "grad_norm": 5.6953125, "learning_rate": 8.323878383388952e-06, "loss": 2.9746, "mean_token_accuracy": 0.44357976653696496, "step": 9041 }, { "epoch": 1.6763070077864293, "grad_norm": 6.3828125, "learning_rate": 8.32369299221357e-06, "loss": 2.2308, "mean_token_accuracy": 0.5568052832525023, "step": 9042 }, { "epoch": 1.6764923989618095, "grad_norm": 6.01171875, "learning_rate": 8.323507601038191e-06, "loss": 2.7737, "mean_token_accuracy": 0.48327016783974014, "step": 9043 }, { "epoch": 1.6766777901371894, "grad_norm": 7.5859375, "learning_rate": 8.323322209862811e-06, "loss": 2.5998, "mean_token_accuracy": 0.5013791374122367, "step": 9044 }, { "epoch": 1.6768631813125694, "grad_norm": 5.625, "learning_rate": 8.32313681868743e-06, "loss": 3.1244, "mean_token_accuracy": 0.4639464257903753, "step": 9045 }, { "epoch": 1.6770485724879496, "grad_norm": 8.9609375, "learning_rate": 8.32295142751205e-06, "loss": 2.7473, "mean_token_accuracy": 0.46039453717754175, "step": 9046 }, { "epoch": 1.6772339636633298, "grad_norm": 5.7890625, "learning_rate": 8.322766036336671e-06, "loss": 2.586, "mean_token_accuracy": 0.49952278692436175, "step": 9047 }, { "epoch": 1.6774193548387095, "grad_norm": 5.49609375, "learning_rate": 8.322580645161292e-06, "loss": 3.265, "mean_token_accuracy": 0.4274952919020716, "step": 9048 }, { "epoch": 1.6776047460140897, "grad_norm": 6.6953125, "learning_rate": 8.32239525398591e-06, "loss": 3.0674, "mean_token_accuracy": 0.4355092129695758, "step": 9049 }, { "epoch": 1.67779013718947, "grad_norm": 6.84375, "learning_rate": 8.322209862810531e-06, "loss": 2.5567, "mean_token_accuracy": 0.4922271037512673, "step": 9050 }, { "epoch": 1.6779755283648499, "grad_norm": 5.41796875, "learning_rate": 8.32202447163515e-06, "loss": 2.5311, "mean_token_accuracy": 0.49187087653157396, "step": 9051 }, { "epoch": 1.6781609195402298, "grad_norm": 5.51953125, "learning_rate": 8.32183908045977e-06, "loss": 2.6122, "mean_token_accuracy": 0.4788783685360524, "step": 9052 }, { "epoch": 1.67834631071561, "grad_norm": 6.140625, "learning_rate": 8.32165368928439e-06, "loss": 3.0602, "mean_token_accuracy": 0.4343867166577397, "step": 9053 }, { "epoch": 1.67853170189099, "grad_norm": 7.078125, "learning_rate": 8.321468298109011e-06, "loss": 2.9969, "mean_token_accuracy": 0.443505212735982, "step": 9054 }, { "epoch": 1.67871709306637, "grad_norm": 6.5859375, "learning_rate": 8.321282906933632e-06, "loss": 3.214, "mean_token_accuracy": 0.42210753720595295, "step": 9055 }, { "epoch": 1.6789024842417501, "grad_norm": 6.32421875, "learning_rate": 8.32109751575825e-06, "loss": 2.9148, "mean_token_accuracy": 0.4652930001461347, "step": 9056 }, { "epoch": 1.6790878754171301, "grad_norm": 6.00390625, "learning_rate": 8.320912124582871e-06, "loss": 2.6057, "mean_token_accuracy": 0.48931855056787454, "step": 9057 }, { "epoch": 1.67927326659251, "grad_norm": 6.03515625, "learning_rate": 8.32072673340749e-06, "loss": 2.8581, "mean_token_accuracy": 0.4650215620507906, "step": 9058 }, { "epoch": 1.6794586577678903, "grad_norm": 8.2109375, "learning_rate": 8.32054134223211e-06, "loss": 2.5866, "mean_token_accuracy": 0.5024516129032258, "step": 9059 }, { "epoch": 1.6796440489432705, "grad_norm": 6.140625, "learning_rate": 8.32035595105673e-06, "loss": 3.1098, "mean_token_accuracy": 0.43600791894724583, "step": 9060 }, { "epoch": 1.6798294401186502, "grad_norm": 6.9140625, "learning_rate": 8.32017055988135e-06, "loss": 3.1584, "mean_token_accuracy": 0.46320730778990105, "step": 9061 }, { "epoch": 1.6800148312940304, "grad_norm": 6.390625, "learning_rate": 8.31998516870597e-06, "loss": 2.5249, "mean_token_accuracy": 0.527389903329753, "step": 9062 }, { "epoch": 1.6802002224694106, "grad_norm": 6.23828125, "learning_rate": 8.31979977753059e-06, "loss": 2.4759, "mean_token_accuracy": 0.49195205479452053, "step": 9063 }, { "epoch": 1.6803856136447906, "grad_norm": 6.6796875, "learning_rate": 8.319614386355211e-06, "loss": 3.4126, "mean_token_accuracy": 0.4267648864333947, "step": 9064 }, { "epoch": 1.6805710048201705, "grad_norm": 6.14453125, "learning_rate": 8.31942899517983e-06, "loss": 2.9261, "mean_token_accuracy": 0.46221216296004725, "step": 9065 }, { "epoch": 1.6807563959955507, "grad_norm": 6.14453125, "learning_rate": 8.31924360400445e-06, "loss": 3.2519, "mean_token_accuracy": 0.43577087933497854, "step": 9066 }, { "epoch": 1.6809417871709307, "grad_norm": 6.17578125, "learning_rate": 8.31905821282907e-06, "loss": 2.9324, "mean_token_accuracy": 0.4508689474259482, "step": 9067 }, { "epoch": 1.6811271783463106, "grad_norm": 6.3828125, "learning_rate": 8.31887282165369e-06, "loss": 2.5088, "mean_token_accuracy": 0.5060381201804162, "step": 9068 }, { "epoch": 1.6813125695216908, "grad_norm": 6.03515625, "learning_rate": 8.31868743047831e-06, "loss": 2.7209, "mean_token_accuracy": 0.47542828560710265, "step": 9069 }, { "epoch": 1.6814979606970708, "grad_norm": 6.5625, "learning_rate": 8.31850203930293e-06, "loss": 2.7264, "mean_token_accuracy": 0.46617715981877833, "step": 9070 }, { "epoch": 1.6816833518724508, "grad_norm": 6.42578125, "learning_rate": 8.31831664812755e-06, "loss": 3.1624, "mean_token_accuracy": 0.4611142533936652, "step": 9071 }, { "epoch": 1.681868743047831, "grad_norm": 33.53125, "learning_rate": 8.31813125695217e-06, "loss": 3.0743, "mean_token_accuracy": 0.5067851567191065, "step": 9072 }, { "epoch": 1.682054134223211, "grad_norm": 7.07421875, "learning_rate": 8.31794586577679e-06, "loss": 3.0683, "mean_token_accuracy": 0.4524196573715143, "step": 9073 }, { "epoch": 1.682239525398591, "grad_norm": 9.8046875, "learning_rate": 8.31776047460141e-06, "loss": 3.0395, "mean_token_accuracy": 0.4491747296528173, "step": 9074 }, { "epoch": 1.682424916573971, "grad_norm": 6.96484375, "learning_rate": 8.31757508342603e-06, "loss": 2.7205, "mean_token_accuracy": 0.45052359101070716, "step": 9075 }, { "epoch": 1.6826103077493513, "grad_norm": 7.3828125, "learning_rate": 8.317389692250649e-06, "loss": 2.6822, "mean_token_accuracy": 0.4628480509148767, "step": 9076 }, { "epoch": 1.682795698924731, "grad_norm": 8.5546875, "learning_rate": 8.317204301075269e-06, "loss": 3.2102, "mean_token_accuracy": 0.4362955774296409, "step": 9077 }, { "epoch": 1.6829810901001112, "grad_norm": 9.5234375, "learning_rate": 8.31701890989989e-06, "loss": 2.946, "mean_token_accuracy": 0.44569816643159377, "step": 9078 }, { "epoch": 1.6831664812754914, "grad_norm": 10.9140625, "learning_rate": 8.31683351872451e-06, "loss": 3.0545, "mean_token_accuracy": 0.45707070707070707, "step": 9079 }, { "epoch": 1.6833518724508714, "grad_norm": 8.4140625, "learning_rate": 8.316648127549129e-06, "loss": 2.4792, "mean_token_accuracy": 0.5179000801496126, "step": 9080 }, { "epoch": 1.6835372636262513, "grad_norm": 5.81640625, "learning_rate": 8.31646273637375e-06, "loss": 3.2532, "mean_token_accuracy": 0.43907156673114117, "step": 9081 }, { "epoch": 1.6837226548016315, "grad_norm": 5.59375, "learning_rate": 8.31627734519837e-06, "loss": 2.8948, "mean_token_accuracy": 0.47361461714148334, "step": 9082 }, { "epoch": 1.6839080459770115, "grad_norm": 6.81640625, "learning_rate": 8.316091954022989e-06, "loss": 3.1187, "mean_token_accuracy": 0.45247904531893735, "step": 9083 }, { "epoch": 1.6840934371523915, "grad_norm": 5.84375, "learning_rate": 8.315906562847609e-06, "loss": 2.9655, "mean_token_accuracy": 0.4411298000634719, "step": 9084 }, { "epoch": 1.6842788283277716, "grad_norm": 8.6640625, "learning_rate": 8.315721171672228e-06, "loss": 2.769, "mean_token_accuracy": 0.47344643186428176, "step": 9085 }, { "epoch": 1.6844642195031516, "grad_norm": 6.0390625, "learning_rate": 8.31553578049685e-06, "loss": 3.2741, "mean_token_accuracy": 0.4599254426840634, "step": 9086 }, { "epoch": 1.6846496106785316, "grad_norm": 6.88671875, "learning_rate": 8.315350389321469e-06, "loss": 2.901, "mean_token_accuracy": 0.47023411371237456, "step": 9087 }, { "epoch": 1.6848350018539118, "grad_norm": 6.39453125, "learning_rate": 8.31516499814609e-06, "loss": 3.1977, "mean_token_accuracy": 0.44329896907216493, "step": 9088 }, { "epoch": 1.685020393029292, "grad_norm": 6.015625, "learning_rate": 8.314979606970708e-06, "loss": 3.1098, "mean_token_accuracy": 0.42452305982589367, "step": 9089 }, { "epoch": 1.6852057842046717, "grad_norm": 5.984375, "learning_rate": 8.314794215795329e-06, "loss": 3.1597, "mean_token_accuracy": 0.4833976833976834, "step": 9090 }, { "epoch": 1.685391175380052, "grad_norm": 6.9921875, "learning_rate": 8.314608824619949e-06, "loss": 3.1161, "mean_token_accuracy": 0.45440792188144685, "step": 9091 }, { "epoch": 1.685576566555432, "grad_norm": 6.19140625, "learning_rate": 8.314423433444568e-06, "loss": 2.4878, "mean_token_accuracy": 0.51136638452237, "step": 9092 }, { "epoch": 1.685761957730812, "grad_norm": 6.40234375, "learning_rate": 8.314238042269188e-06, "loss": 2.6714, "mean_token_accuracy": 0.5005433306166802, "step": 9093 }, { "epoch": 1.685947348906192, "grad_norm": 6.7734375, "learning_rate": 8.314052651093809e-06, "loss": 3.189, "mean_token_accuracy": 0.428958417987268, "step": 9094 }, { "epoch": 1.6861327400815722, "grad_norm": 7.9921875, "learning_rate": 8.31386725991843e-06, "loss": 2.5998, "mean_token_accuracy": 0.4912023460410557, "step": 9095 }, { "epoch": 1.6863181312569522, "grad_norm": 6.62109375, "learning_rate": 8.313681868743048e-06, "loss": 2.0204, "mean_token_accuracy": 0.5506028993361333, "step": 9096 }, { "epoch": 1.6865035224323321, "grad_norm": 7.9921875, "learning_rate": 8.313496477567669e-06, "loss": 2.5934, "mean_token_accuracy": 0.4632554945054945, "step": 9097 }, { "epoch": 1.6866889136077123, "grad_norm": 6.44140625, "learning_rate": 8.313311086392288e-06, "loss": 2.6643, "mean_token_accuracy": 0.49024628139478177, "step": 9098 }, { "epoch": 1.6868743047830923, "grad_norm": 6.6328125, "learning_rate": 8.313125695216908e-06, "loss": 3.2855, "mean_token_accuracy": 0.4303356554781507, "step": 9099 }, { "epoch": 1.6870596959584723, "grad_norm": 5.83984375, "learning_rate": 8.312940304041528e-06, "loss": 2.2972, "mean_token_accuracy": 0.5406933333333334, "step": 9100 }, { "epoch": 1.6872450871338525, "grad_norm": 6.07421875, "learning_rate": 8.312754912866147e-06, "loss": 2.7214, "mean_token_accuracy": 0.48404255319148937, "step": 9101 }, { "epoch": 1.6874304783092324, "grad_norm": 8.2109375, "learning_rate": 8.31256952169077e-06, "loss": 2.5909, "mean_token_accuracy": 0.48796112215595316, "step": 9102 }, { "epoch": 1.6876158694846124, "grad_norm": 6.20703125, "learning_rate": 8.312384130515388e-06, "loss": 3.1369, "mean_token_accuracy": 0.42562893081761005, "step": 9103 }, { "epoch": 1.6878012606599926, "grad_norm": 7.109375, "learning_rate": 8.312198739340009e-06, "loss": 3.0262, "mean_token_accuracy": 0.4610630407911001, "step": 9104 }, { "epoch": 1.6879866518353728, "grad_norm": 6.6484375, "learning_rate": 8.312013348164628e-06, "loss": 2.7802, "mean_token_accuracy": 0.47671840354767187, "step": 9105 }, { "epoch": 1.6881720430107527, "grad_norm": 10.421875, "learning_rate": 8.311827956989248e-06, "loss": 2.5358, "mean_token_accuracy": 0.48253968253968255, "step": 9106 }, { "epoch": 1.6883574341861327, "grad_norm": 5.515625, "learning_rate": 8.311642565813869e-06, "loss": 2.5276, "mean_token_accuracy": 0.5057920648711266, "step": 9107 }, { "epoch": 1.688542825361513, "grad_norm": 5.66796875, "learning_rate": 8.311457174638487e-06, "loss": 2.7112, "mean_token_accuracy": 0.47992403689636465, "step": 9108 }, { "epoch": 1.6887282165368929, "grad_norm": 6.9609375, "learning_rate": 8.311271783463108e-06, "loss": 3.8169, "mean_token_accuracy": 0.448292164046803, "step": 9109 }, { "epoch": 1.6889136077122728, "grad_norm": 6.42578125, "learning_rate": 8.311086392287728e-06, "loss": 2.9506, "mean_token_accuracy": 0.5024707725683982, "step": 9110 }, { "epoch": 1.689098998887653, "grad_norm": 7.9609375, "learning_rate": 8.310901001112349e-06, "loss": 2.8987, "mean_token_accuracy": 0.45084897229669346, "step": 9111 }, { "epoch": 1.689284390063033, "grad_norm": 6.29296875, "learning_rate": 8.310715609936968e-06, "loss": 2.9296, "mean_token_accuracy": 0.448702269330072, "step": 9112 }, { "epoch": 1.689469781238413, "grad_norm": 5.46875, "learning_rate": 8.310530218761588e-06, "loss": 3.2255, "mean_token_accuracy": 0.4287630402384501, "step": 9113 }, { "epoch": 1.6896551724137931, "grad_norm": 5.8671875, "learning_rate": 8.310344827586207e-06, "loss": 3.1278, "mean_token_accuracy": 0.44597279535381323, "step": 9114 }, { "epoch": 1.6898405635891731, "grad_norm": 6.640625, "learning_rate": 8.310159436410827e-06, "loss": 3.0455, "mean_token_accuracy": 0.4509682224428997, "step": 9115 }, { "epoch": 1.690025954764553, "grad_norm": 6.19921875, "learning_rate": 8.309974045235448e-06, "loss": 2.872, "mean_token_accuracy": 0.4431968295904888, "step": 9116 }, { "epoch": 1.6902113459399333, "grad_norm": 6.76171875, "learning_rate": 8.309788654060067e-06, "loss": 2.564, "mean_token_accuracy": 0.4997751798561151, "step": 9117 }, { "epoch": 1.6903967371153135, "grad_norm": 7.09765625, "learning_rate": 8.309603262884687e-06, "loss": 2.4654, "mean_token_accuracy": 0.4890738813735692, "step": 9118 }, { "epoch": 1.6905821282906932, "grad_norm": 6.6484375, "learning_rate": 8.309417871709308e-06, "loss": 3.405, "mean_token_accuracy": 0.424188467333242, "step": 9119 }, { "epoch": 1.6907675194660734, "grad_norm": 5.5234375, "learning_rate": 8.309232480533928e-06, "loss": 3.1678, "mean_token_accuracy": 0.44377059986816086, "step": 9120 }, { "epoch": 1.6909529106414536, "grad_norm": 6.12109375, "learning_rate": 8.309047089358547e-06, "loss": 2.6648, "mean_token_accuracy": 0.4827944230198754, "step": 9121 }, { "epoch": 1.6911383018168336, "grad_norm": 5.69921875, "learning_rate": 8.308861698183167e-06, "loss": 3.0312, "mean_token_accuracy": 0.43607245996324495, "step": 9122 }, { "epoch": 1.6913236929922135, "grad_norm": 6.01171875, "learning_rate": 8.308676307007786e-06, "loss": 3.0037, "mean_token_accuracy": 0.4627159745377387, "step": 9123 }, { "epoch": 1.6915090841675937, "grad_norm": 5.9140625, "learning_rate": 8.308490915832407e-06, "loss": 3.2002, "mean_token_accuracy": 0.44009475127789555, "step": 9124 }, { "epoch": 1.6916944753429737, "grad_norm": 5.5234375, "learning_rate": 8.308305524657027e-06, "loss": 2.7782, "mean_token_accuracy": 0.47016314464031533, "step": 9125 }, { "epoch": 1.6918798665183536, "grad_norm": 5.7578125, "learning_rate": 8.308120133481648e-06, "loss": 3.0414, "mean_token_accuracy": 0.4603316326530612, "step": 9126 }, { "epoch": 1.6920652576937338, "grad_norm": 7.6796875, "learning_rate": 8.307934742306267e-06, "loss": 2.6886, "mean_token_accuracy": 0.47412060301507536, "step": 9127 }, { "epoch": 1.6922506488691138, "grad_norm": 5.9140625, "learning_rate": 8.307749351130887e-06, "loss": 2.418, "mean_token_accuracy": 0.49349069229833314, "step": 9128 }, { "epoch": 1.6924360400444938, "grad_norm": 5.54296875, "learning_rate": 8.307563959955507e-06, "loss": 3.019, "mean_token_accuracy": 0.44623799359658484, "step": 9129 }, { "epoch": 1.692621431219874, "grad_norm": 6.51171875, "learning_rate": 8.307378568780126e-06, "loss": 2.6168, "mean_token_accuracy": 0.4892263759086189, "step": 9130 }, { "epoch": 1.6928068223952542, "grad_norm": 5.9921875, "learning_rate": 8.307193177604747e-06, "loss": 3.3726, "mean_token_accuracy": 0.41840161182001345, "step": 9131 }, { "epoch": 1.692992213570634, "grad_norm": 6.66015625, "learning_rate": 8.307007786429366e-06, "loss": 2.371, "mean_token_accuracy": 0.5277275467148885, "step": 9132 }, { "epoch": 1.693177604746014, "grad_norm": 7.6796875, "learning_rate": 8.306822395253986e-06, "loss": 2.9492, "mean_token_accuracy": 0.4437097321125805, "step": 9133 }, { "epoch": 1.6933629959213943, "grad_norm": 7.59765625, "learning_rate": 8.306637004078607e-06, "loss": 2.9861, "mean_token_accuracy": 0.45683563748079875, "step": 9134 }, { "epoch": 1.6935483870967742, "grad_norm": 5.83984375, "learning_rate": 8.306451612903227e-06, "loss": 2.6193, "mean_token_accuracy": 0.4721680420105026, "step": 9135 }, { "epoch": 1.6937337782721542, "grad_norm": 6.609375, "learning_rate": 8.306266221727846e-06, "loss": 3.0677, "mean_token_accuracy": 0.44235895157707683, "step": 9136 }, { "epoch": 1.6939191694475344, "grad_norm": 7.86328125, "learning_rate": 8.306080830552466e-06, "loss": 3.1931, "mean_token_accuracy": 0.44338819523269013, "step": 9137 }, { "epoch": 1.6941045606229144, "grad_norm": 7.09375, "learning_rate": 8.305895439377087e-06, "loss": 2.1719, "mean_token_accuracy": 0.531744975376015, "step": 9138 }, { "epoch": 1.6942899517982943, "grad_norm": 10.1796875, "learning_rate": 8.305710048201706e-06, "loss": 3.0565, "mean_token_accuracy": 0.4555105461656656, "step": 9139 }, { "epoch": 1.6944753429736745, "grad_norm": 7.66015625, "learning_rate": 8.305524657026326e-06, "loss": 2.9333, "mean_token_accuracy": 0.4652434956637759, "step": 9140 }, { "epoch": 1.6946607341490545, "grad_norm": 7.47265625, "learning_rate": 8.305339265850945e-06, "loss": 3.2352, "mean_token_accuracy": 0.4527670074021854, "step": 9141 }, { "epoch": 1.6948461253244345, "grad_norm": 6.17578125, "learning_rate": 8.305153874675567e-06, "loss": 2.5131, "mean_token_accuracy": 0.4810828440965427, "step": 9142 }, { "epoch": 1.6950315164998146, "grad_norm": 5.85546875, "learning_rate": 8.304968483500186e-06, "loss": 2.4858, "mean_token_accuracy": 0.5054072553045859, "step": 9143 }, { "epoch": 1.6952169076751946, "grad_norm": 6.0625, "learning_rate": 8.304783092324806e-06, "loss": 2.6147, "mean_token_accuracy": 0.49845616861323666, "step": 9144 }, { "epoch": 1.6954022988505746, "grad_norm": 6.94921875, "learning_rate": 8.304597701149427e-06, "loss": 2.9895, "mean_token_accuracy": 0.4867490106727425, "step": 9145 }, { "epoch": 1.6955876900259548, "grad_norm": 6.90625, "learning_rate": 8.304412309974046e-06, "loss": 2.7804, "mean_token_accuracy": 0.4589828373484491, "step": 9146 }, { "epoch": 1.695773081201335, "grad_norm": 7.94921875, "learning_rate": 8.304226918798666e-06, "loss": 2.3435, "mean_token_accuracy": 0.5245294204831777, "step": 9147 }, { "epoch": 1.6959584723767147, "grad_norm": 6.80859375, "learning_rate": 8.304041527623285e-06, "loss": 3.4427, "mean_token_accuracy": 0.4133880290881969, "step": 9148 }, { "epoch": 1.696143863552095, "grad_norm": 8.8046875, "learning_rate": 8.303856136447905e-06, "loss": 3.057, "mean_token_accuracy": 0.44923903312444047, "step": 9149 }, { "epoch": 1.696329254727475, "grad_norm": 7.5, "learning_rate": 8.303670745272526e-06, "loss": 3.2175, "mean_token_accuracy": 0.44012042818911684, "step": 9150 }, { "epoch": 1.696514645902855, "grad_norm": 6.1875, "learning_rate": 8.303485354097146e-06, "loss": 2.9516, "mean_token_accuracy": 0.4599203187250996, "step": 9151 }, { "epoch": 1.696700037078235, "grad_norm": 5.84375, "learning_rate": 8.303299962921765e-06, "loss": 3.0816, "mean_token_accuracy": 0.42380614417520024, "step": 9152 }, { "epoch": 1.6968854282536152, "grad_norm": 7.4140625, "learning_rate": 8.303114571746386e-06, "loss": 2.6362, "mean_token_accuracy": 0.5126780482877058, "step": 9153 }, { "epoch": 1.6970708194289952, "grad_norm": 7.640625, "learning_rate": 8.302929180571006e-06, "loss": 3.095, "mean_token_accuracy": 0.4669619785898856, "step": 9154 }, { "epoch": 1.6972562106043751, "grad_norm": 6.19921875, "learning_rate": 8.302743789395625e-06, "loss": 3.6208, "mean_token_accuracy": 0.4143248306672431, "step": 9155 }, { "epoch": 1.6974416017797553, "grad_norm": 14.515625, "learning_rate": 8.302558398220246e-06, "loss": 3.1389, "mean_token_accuracy": 0.47212905286695417, "step": 9156 }, { "epoch": 1.6976269929551353, "grad_norm": 4.984375, "learning_rate": 8.302373007044864e-06, "loss": 2.618, "mean_token_accuracy": 0.4993939393939394, "step": 9157 }, { "epoch": 1.6978123841305153, "grad_norm": 6.0625, "learning_rate": 8.302187615869485e-06, "loss": 3.0672, "mean_token_accuracy": 0.4473547717842324, "step": 9158 }, { "epoch": 1.6979977753058955, "grad_norm": 7.78125, "learning_rate": 8.302002224694105e-06, "loss": 2.5315, "mean_token_accuracy": 0.5020033741037537, "step": 9159 }, { "epoch": 1.6981831664812757, "grad_norm": 6.65625, "learning_rate": 8.301816833518726e-06, "loss": 2.8877, "mean_token_accuracy": 0.4588102833552261, "step": 9160 }, { "epoch": 1.6983685576566554, "grad_norm": 5.9140625, "learning_rate": 8.301631442343345e-06, "loss": 2.7359, "mean_token_accuracy": 0.4795289855072464, "step": 9161 }, { "epoch": 1.6985539488320356, "grad_norm": 8.1640625, "learning_rate": 8.301446051167965e-06, "loss": 3.0187, "mean_token_accuracy": 0.4553030303030303, "step": 9162 }, { "epoch": 1.6987393400074158, "grad_norm": 10.9921875, "learning_rate": 8.301260659992586e-06, "loss": 2.7128, "mean_token_accuracy": 0.465080778526905, "step": 9163 }, { "epoch": 1.6989247311827957, "grad_norm": 6.1171875, "learning_rate": 8.301075268817204e-06, "loss": 3.1521, "mean_token_accuracy": 0.44034786869181775, "step": 9164 }, { "epoch": 1.6991101223581757, "grad_norm": 8.265625, "learning_rate": 8.300889877641825e-06, "loss": 3.1169, "mean_token_accuracy": 0.4561344537815126, "step": 9165 }, { "epoch": 1.699295513533556, "grad_norm": 6.06640625, "learning_rate": 8.300704486466444e-06, "loss": 3.2531, "mean_token_accuracy": 0.4515744906059804, "step": 9166 }, { "epoch": 1.6994809047089359, "grad_norm": 5.53515625, "learning_rate": 8.300519095291066e-06, "loss": 3.0218, "mean_token_accuracy": 0.46347125495015706, "step": 9167 }, { "epoch": 1.6996662958843158, "grad_norm": 8.65625, "learning_rate": 8.300333704115685e-06, "loss": 2.7546, "mean_token_accuracy": 0.45079400581525386, "step": 9168 }, { "epoch": 1.699851687059696, "grad_norm": 9.2265625, "learning_rate": 8.300148312940305e-06, "loss": 2.6805, "mean_token_accuracy": 0.49092607419268747, "step": 9169 }, { "epoch": 1.700037078235076, "grad_norm": 5.4296875, "learning_rate": 8.299962921764924e-06, "loss": 2.7725, "mean_token_accuracy": 0.4744058500914077, "step": 9170 }, { "epoch": 1.700222469410456, "grad_norm": 6.875, "learning_rate": 8.299777530589544e-06, "loss": 2.8927, "mean_token_accuracy": 0.4659549228944247, "step": 9171 }, { "epoch": 1.7004078605858362, "grad_norm": 5.8828125, "learning_rate": 8.299592139414165e-06, "loss": 2.6225, "mean_token_accuracy": 0.49113960524420114, "step": 9172 }, { "epoch": 1.7005932517612161, "grad_norm": 8.4140625, "learning_rate": 8.299406748238784e-06, "loss": 3.0099, "mean_token_accuracy": 0.4428220326731242, "step": 9173 }, { "epoch": 1.700778642936596, "grad_norm": 6.63671875, "learning_rate": 8.299221357063404e-06, "loss": 3.1017, "mean_token_accuracy": 0.4497628288055196, "step": 9174 }, { "epoch": 1.7009640341119763, "grad_norm": 6.0, "learning_rate": 8.299035965888025e-06, "loss": 3.1422, "mean_token_accuracy": 0.46157643574131973, "step": 9175 }, { "epoch": 1.7011494252873565, "grad_norm": 6.65625, "learning_rate": 8.298850574712645e-06, "loss": 2.8155, "mean_token_accuracy": 0.502017238217495, "step": 9176 }, { "epoch": 1.7013348164627362, "grad_norm": 6.51171875, "learning_rate": 8.298665183537264e-06, "loss": 2.9695, "mean_token_accuracy": 0.4541035967226774, "step": 9177 }, { "epoch": 1.7015202076381164, "grad_norm": 6.44140625, "learning_rate": 8.298479792361884e-06, "loss": 2.8345, "mean_token_accuracy": 0.4619680338917774, "step": 9178 }, { "epoch": 1.7017055988134966, "grad_norm": 5.23828125, "learning_rate": 8.298294401186503e-06, "loss": 2.4934, "mean_token_accuracy": 0.5127464523420822, "step": 9179 }, { "epoch": 1.7018909899888766, "grad_norm": 5.625, "learning_rate": 8.298109010011124e-06, "loss": 3.3165, "mean_token_accuracy": 0.4198581560283688, "step": 9180 }, { "epoch": 1.7020763811642565, "grad_norm": 6.44921875, "learning_rate": 8.297923618835744e-06, "loss": 3.1553, "mean_token_accuracy": 0.42332230623818523, "step": 9181 }, { "epoch": 1.7022617723396367, "grad_norm": 9.1171875, "learning_rate": 8.297738227660363e-06, "loss": 2.7647, "mean_token_accuracy": 0.4835945427894915, "step": 9182 }, { "epoch": 1.7024471635150167, "grad_norm": 4.9140625, "learning_rate": 8.297552836484985e-06, "loss": 2.4592, "mean_token_accuracy": 0.4954534283607766, "step": 9183 }, { "epoch": 1.7026325546903966, "grad_norm": 6.125, "learning_rate": 8.297367445309604e-06, "loss": 3.2922, "mean_token_accuracy": 0.4340136054421769, "step": 9184 }, { "epoch": 1.7028179458657768, "grad_norm": 6.26171875, "learning_rate": 8.297182054134225e-06, "loss": 3.2195, "mean_token_accuracy": 0.4219881500987492, "step": 9185 }, { "epoch": 1.7030033370411568, "grad_norm": 7.85546875, "learning_rate": 8.296996662958843e-06, "loss": 3.0717, "mean_token_accuracy": 0.45837912087912086, "step": 9186 }, { "epoch": 1.7031887282165368, "grad_norm": 7.41015625, "learning_rate": 8.296811271783464e-06, "loss": 2.836, "mean_token_accuracy": 0.45192192551790206, "step": 9187 }, { "epoch": 1.703374119391917, "grad_norm": 8.1953125, "learning_rate": 8.296625880608084e-06, "loss": 2.6632, "mean_token_accuracy": 0.47739846174605316, "step": 9188 }, { "epoch": 1.7035595105672972, "grad_norm": 6.4140625, "learning_rate": 8.296440489432703e-06, "loss": 2.998, "mean_token_accuracy": 0.45372960372960375, "step": 9189 }, { "epoch": 1.703744901742677, "grad_norm": 6.96484375, "learning_rate": 8.296255098257324e-06, "loss": 2.5713, "mean_token_accuracy": 0.5227439471753484, "step": 9190 }, { "epoch": 1.703930292918057, "grad_norm": 7.65625, "learning_rate": 8.296069707081944e-06, "loss": 2.9373, "mean_token_accuracy": 0.44803242253889397, "step": 9191 }, { "epoch": 1.7041156840934373, "grad_norm": 6.484375, "learning_rate": 8.295884315906565e-06, "loss": 2.6226, "mean_token_accuracy": 0.505586592178771, "step": 9192 }, { "epoch": 1.7043010752688172, "grad_norm": 7.01953125, "learning_rate": 8.295698924731183e-06, "loss": 3.0122, "mean_token_accuracy": 0.43734910671173344, "step": 9193 }, { "epoch": 1.7044864664441972, "grad_norm": 5.5234375, "learning_rate": 8.295513533555804e-06, "loss": 3.536, "mean_token_accuracy": 0.4166666666666667, "step": 9194 }, { "epoch": 1.7046718576195774, "grad_norm": 7.4765625, "learning_rate": 8.295328142380423e-06, "loss": 2.5758, "mean_token_accuracy": 0.4848899390623057, "step": 9195 }, { "epoch": 1.7048572487949574, "grad_norm": 7.26953125, "learning_rate": 8.295142751205043e-06, "loss": 2.6302, "mean_token_accuracy": 0.5172917681441792, "step": 9196 }, { "epoch": 1.7050426399703373, "grad_norm": 6.80078125, "learning_rate": 8.294957360029664e-06, "loss": 2.5561, "mean_token_accuracy": 0.49020798542583877, "step": 9197 }, { "epoch": 1.7052280311457175, "grad_norm": 6.0703125, "learning_rate": 8.294771968854282e-06, "loss": 3.5002, "mean_token_accuracy": 0.43086719223771985, "step": 9198 }, { "epoch": 1.7054134223210975, "grad_norm": 5.21484375, "learning_rate": 8.294586577678903e-06, "loss": 2.8274, "mean_token_accuracy": 0.46041412911084045, "step": 9199 }, { "epoch": 1.7055988134964775, "grad_norm": 6.8046875, "learning_rate": 8.294401186503523e-06, "loss": 2.5256, "mean_token_accuracy": 0.5048094484354073, "step": 9200 }, { "epoch": 1.7057842046718577, "grad_norm": 5.96484375, "learning_rate": 8.294215795328144e-06, "loss": 2.9088, "mean_token_accuracy": 0.476592478894858, "step": 9201 }, { "epoch": 1.7059695958472376, "grad_norm": 7.859375, "learning_rate": 8.294030404152763e-06, "loss": 2.65, "mean_token_accuracy": 0.4851347596554598, "step": 9202 }, { "epoch": 1.7061549870226176, "grad_norm": 5.8046875, "learning_rate": 8.293845012977383e-06, "loss": 3.079, "mean_token_accuracy": 0.4226062687848862, "step": 9203 }, { "epoch": 1.7063403781979978, "grad_norm": 7.08984375, "learning_rate": 8.293659621802002e-06, "loss": 3.1121, "mean_token_accuracy": 0.44983563445101904, "step": 9204 }, { "epoch": 1.706525769373378, "grad_norm": 6.48046875, "learning_rate": 8.293474230626622e-06, "loss": 3.2747, "mean_token_accuracy": 0.45783464025312975, "step": 9205 }, { "epoch": 1.706711160548758, "grad_norm": 5.3125, "learning_rate": 8.293288839451243e-06, "loss": 3.0289, "mean_token_accuracy": 0.4499727371864776, "step": 9206 }, { "epoch": 1.706896551724138, "grad_norm": 5.73046875, "learning_rate": 8.293103448275863e-06, "loss": 2.8939, "mean_token_accuracy": 0.4688015393073117, "step": 9207 }, { "epoch": 1.707081942899518, "grad_norm": 7.26953125, "learning_rate": 8.292918057100482e-06, "loss": 3.2186, "mean_token_accuracy": 0.4258528660942214, "step": 9208 }, { "epoch": 1.707267334074898, "grad_norm": 5.3671875, "learning_rate": 8.292732665925103e-06, "loss": 2.9231, "mean_token_accuracy": 0.4463087248322148, "step": 9209 }, { "epoch": 1.707452725250278, "grad_norm": 7.109375, "learning_rate": 8.292547274749723e-06, "loss": 2.6988, "mean_token_accuracy": 0.4710302766281856, "step": 9210 }, { "epoch": 1.7076381164256582, "grad_norm": 6.7890625, "learning_rate": 8.292361883574342e-06, "loss": 2.9817, "mean_token_accuracy": 0.45598740767647417, "step": 9211 }, { "epoch": 1.7078235076010382, "grad_norm": 8.3515625, "learning_rate": 8.292176492398963e-06, "loss": 3.4509, "mean_token_accuracy": 0.4278626452539496, "step": 9212 }, { "epoch": 1.7080088987764181, "grad_norm": 9.9921875, "learning_rate": 8.291991101223581e-06, "loss": 2.4456, "mean_token_accuracy": 0.48627092050209203, "step": 9213 }, { "epoch": 1.7081942899517983, "grad_norm": 5.94140625, "learning_rate": 8.291805710048202e-06, "loss": 2.8301, "mean_token_accuracy": 0.46034129692832765, "step": 9214 }, { "epoch": 1.7083796811271783, "grad_norm": 6.36328125, "learning_rate": 8.291620318872822e-06, "loss": 2.6948, "mean_token_accuracy": 0.4872531867033242, "step": 9215 }, { "epoch": 1.7085650723025583, "grad_norm": 6.84765625, "learning_rate": 8.291434927697443e-06, "loss": 3.2476, "mean_token_accuracy": 0.4539614561027837, "step": 9216 }, { "epoch": 1.7087504634779385, "grad_norm": 6.4375, "learning_rate": 8.291249536522062e-06, "loss": 3.4262, "mean_token_accuracy": 0.42397806580259223, "step": 9217 }, { "epoch": 1.7089358546533187, "grad_norm": 7.6328125, "learning_rate": 8.291064145346682e-06, "loss": 3.2596, "mean_token_accuracy": 0.4411130284728214, "step": 9218 }, { "epoch": 1.7091212458286984, "grad_norm": 7.6796875, "learning_rate": 8.290878754171303e-06, "loss": 3.3181, "mean_token_accuracy": 0.44378204363896506, "step": 9219 }, { "epoch": 1.7093066370040786, "grad_norm": 7.40234375, "learning_rate": 8.290693362995921e-06, "loss": 3.0276, "mean_token_accuracy": 0.4325952914798206, "step": 9220 }, { "epoch": 1.7094920281794588, "grad_norm": 6.6640625, "learning_rate": 8.290507971820542e-06, "loss": 2.9692, "mean_token_accuracy": 0.4624371487725525, "step": 9221 }, { "epoch": 1.7096774193548387, "grad_norm": 8.3203125, "learning_rate": 8.29032258064516e-06, "loss": 2.4669, "mean_token_accuracy": 0.4934699103713188, "step": 9222 }, { "epoch": 1.7098628105302187, "grad_norm": 6.18359375, "learning_rate": 8.290137189469783e-06, "loss": 2.8584, "mean_token_accuracy": 0.46683811586356255, "step": 9223 }, { "epoch": 1.710048201705599, "grad_norm": 6.41796875, "learning_rate": 8.289951798294402e-06, "loss": 2.0392, "mean_token_accuracy": 0.568015602145295, "step": 9224 }, { "epoch": 1.7102335928809789, "grad_norm": 6.89453125, "learning_rate": 8.289766407119022e-06, "loss": 3.2076, "mean_token_accuracy": 0.4288857006217121, "step": 9225 }, { "epoch": 1.7104189840563588, "grad_norm": 6.3828125, "learning_rate": 8.289581015943643e-06, "loss": 2.5082, "mean_token_accuracy": 0.5105743209381578, "step": 9226 }, { "epoch": 1.710604375231739, "grad_norm": 6.36328125, "learning_rate": 8.289395624768261e-06, "loss": 2.7186, "mean_token_accuracy": 0.4736082722135385, "step": 9227 }, { "epoch": 1.710789766407119, "grad_norm": 6.50390625, "learning_rate": 8.289210233592882e-06, "loss": 2.754, "mean_token_accuracy": 0.4829787234042553, "step": 9228 }, { "epoch": 1.710975157582499, "grad_norm": 7.1875, "learning_rate": 8.2890248424175e-06, "loss": 2.7404, "mean_token_accuracy": 0.4737308520065831, "step": 9229 }, { "epoch": 1.7111605487578792, "grad_norm": 5.14453125, "learning_rate": 8.288839451242121e-06, "loss": 3.2076, "mean_token_accuracy": 0.4420117732107818, "step": 9230 }, { "epoch": 1.7113459399332593, "grad_norm": 6.62109375, "learning_rate": 8.288654060066742e-06, "loss": 2.8795, "mean_token_accuracy": 0.4672639558924879, "step": 9231 }, { "epoch": 1.711531331108639, "grad_norm": 6.3984375, "learning_rate": 8.288468668891362e-06, "loss": 3.4243, "mean_token_accuracy": 0.4143802190651773, "step": 9232 }, { "epoch": 1.7117167222840193, "grad_norm": 6.47265625, "learning_rate": 8.288283277715981e-06, "loss": 3.3869, "mean_token_accuracy": 0.4306097927305497, "step": 9233 }, { "epoch": 1.7119021134593995, "grad_norm": 7.28515625, "learning_rate": 8.288097886540601e-06, "loss": 2.7603, "mean_token_accuracy": 0.4820444295974612, "step": 9234 }, { "epoch": 1.7120875046347794, "grad_norm": 5.51171875, "learning_rate": 8.287912495365222e-06, "loss": 3.3483, "mean_token_accuracy": 0.42633504023408925, "step": 9235 }, { "epoch": 1.7122728958101594, "grad_norm": 6.96484375, "learning_rate": 8.28772710418984e-06, "loss": 2.4689, "mean_token_accuracy": 0.4989429175475687, "step": 9236 }, { "epoch": 1.7124582869855396, "grad_norm": 5.9296875, "learning_rate": 8.287541713014461e-06, "loss": 3.0406, "mean_token_accuracy": 0.43970588235294117, "step": 9237 }, { "epoch": 1.7126436781609196, "grad_norm": 5.6015625, "learning_rate": 8.28735632183908e-06, "loss": 2.9407, "mean_token_accuracy": 0.4594490216271885, "step": 9238 }, { "epoch": 1.7128290693362995, "grad_norm": 5.8515625, "learning_rate": 8.287170930663702e-06, "loss": 3.2096, "mean_token_accuracy": 0.4372937293729373, "step": 9239 }, { "epoch": 1.7130144605116797, "grad_norm": 5.421875, "learning_rate": 8.286985539488321e-06, "loss": 2.894, "mean_token_accuracy": 0.4685230024213075, "step": 9240 }, { "epoch": 1.7131998516870597, "grad_norm": 6.15625, "learning_rate": 8.286800148312942e-06, "loss": 3.0391, "mean_token_accuracy": 0.4578050443081118, "step": 9241 }, { "epoch": 1.7133852428624397, "grad_norm": 5.78515625, "learning_rate": 8.28661475713756e-06, "loss": 2.9948, "mean_token_accuracy": 0.447605561277034, "step": 9242 }, { "epoch": 1.7135706340378198, "grad_norm": 5.9921875, "learning_rate": 8.28642936596218e-06, "loss": 2.7272, "mean_token_accuracy": 0.4799179442884906, "step": 9243 }, { "epoch": 1.7137560252131998, "grad_norm": 7.09375, "learning_rate": 8.286243974786801e-06, "loss": 3.0862, "mean_token_accuracy": 0.44486732212707475, "step": 9244 }, { "epoch": 1.7139414163885798, "grad_norm": 5.9375, "learning_rate": 8.28605858361142e-06, "loss": 2.9846, "mean_token_accuracy": 0.4597345132743363, "step": 9245 }, { "epoch": 1.71412680756396, "grad_norm": 6.125, "learning_rate": 8.28587319243604e-06, "loss": 2.7016, "mean_token_accuracy": 0.4590254706533776, "step": 9246 }, { "epoch": 1.7143121987393402, "grad_norm": 6.0625, "learning_rate": 8.285687801260661e-06, "loss": 3.0713, "mean_token_accuracy": 0.43330821401657876, "step": 9247 }, { "epoch": 1.71449758991472, "grad_norm": 5.92578125, "learning_rate": 8.285502410085282e-06, "loss": 2.8442, "mean_token_accuracy": 0.48251509283517485, "step": 9248 }, { "epoch": 1.7146829810901, "grad_norm": 6.6015625, "learning_rate": 8.2853170189099e-06, "loss": 2.642, "mean_token_accuracy": 0.495844414893617, "step": 9249 }, { "epoch": 1.7148683722654803, "grad_norm": 6.19140625, "learning_rate": 8.285131627734521e-06, "loss": 2.5013, "mean_token_accuracy": 0.4824617756649107, "step": 9250 }, { "epoch": 1.7150537634408602, "grad_norm": 6.13671875, "learning_rate": 8.28494623655914e-06, "loss": 2.8992, "mean_token_accuracy": 0.4868729488982654, "step": 9251 }, { "epoch": 1.7152391546162402, "grad_norm": 5.8203125, "learning_rate": 8.28476084538376e-06, "loss": 2.4433, "mean_token_accuracy": 0.49912810194500334, "step": 9252 }, { "epoch": 1.7154245457916204, "grad_norm": 8.203125, "learning_rate": 8.28457545420838e-06, "loss": 2.8871, "mean_token_accuracy": 0.4303702913242984, "step": 9253 }, { "epoch": 1.7156099369670004, "grad_norm": 6.82421875, "learning_rate": 8.284390063033e-06, "loss": 2.9166, "mean_token_accuracy": 0.5072049279507205, "step": 9254 }, { "epoch": 1.7157953281423803, "grad_norm": 9.1953125, "learning_rate": 8.284204671857622e-06, "loss": 2.9052, "mean_token_accuracy": 0.4379371828281443, "step": 9255 }, { "epoch": 1.7159807193177605, "grad_norm": 5.58984375, "learning_rate": 8.28401928068224e-06, "loss": 2.8858, "mean_token_accuracy": 0.44904204364023415, "step": 9256 }, { "epoch": 1.7161661104931405, "grad_norm": 7.6328125, "learning_rate": 8.283833889506861e-06, "loss": 2.713, "mean_token_accuracy": 0.46743111960503264, "step": 9257 }, { "epoch": 1.7163515016685205, "grad_norm": 5.37890625, "learning_rate": 8.28364849833148e-06, "loss": 3.3529, "mean_token_accuracy": 0.41551064991807757, "step": 9258 }, { "epoch": 1.7165368928439007, "grad_norm": 5.9140625, "learning_rate": 8.2834631071561e-06, "loss": 2.9271, "mean_token_accuracy": 0.4464420607342499, "step": 9259 }, { "epoch": 1.7167222840192808, "grad_norm": 7.15625, "learning_rate": 8.283277715980719e-06, "loss": 2.8367, "mean_token_accuracy": 0.46855072463768116, "step": 9260 }, { "epoch": 1.7169076751946606, "grad_norm": 5.828125, "learning_rate": 8.28309232480534e-06, "loss": 2.402, "mean_token_accuracy": 0.5226076184577269, "step": 9261 }, { "epoch": 1.7170930663700408, "grad_norm": 6.359375, "learning_rate": 8.28290693362996e-06, "loss": 2.9922, "mean_token_accuracy": 0.44770979221673746, "step": 9262 }, { "epoch": 1.717278457545421, "grad_norm": 6.09765625, "learning_rate": 8.28272154245458e-06, "loss": 3.0814, "mean_token_accuracy": 0.4475608294378521, "step": 9263 }, { "epoch": 1.717463848720801, "grad_norm": 7.18359375, "learning_rate": 8.282536151279201e-06, "loss": 2.1824, "mean_token_accuracy": 0.5180355452728994, "step": 9264 }, { "epoch": 1.717649239896181, "grad_norm": 7.51953125, "learning_rate": 8.28235076010382e-06, "loss": 2.67, "mean_token_accuracy": 0.48107900792636155, "step": 9265 }, { "epoch": 1.717834631071561, "grad_norm": 6.828125, "learning_rate": 8.28216536892844e-06, "loss": 2.8987, "mean_token_accuracy": 0.47889740363719546, "step": 9266 }, { "epoch": 1.718020022246941, "grad_norm": 6.5625, "learning_rate": 8.281979977753059e-06, "loss": 2.8734, "mean_token_accuracy": 0.46486657192483816, "step": 9267 }, { "epoch": 1.718205413422321, "grad_norm": 6.4453125, "learning_rate": 8.28179458657768e-06, "loss": 2.7657, "mean_token_accuracy": 0.4826663097309597, "step": 9268 }, { "epoch": 1.7183908045977012, "grad_norm": 6.125, "learning_rate": 8.2816091954023e-06, "loss": 3.0378, "mean_token_accuracy": 0.4746412666996536, "step": 9269 }, { "epoch": 1.7185761957730812, "grad_norm": 4.81640625, "learning_rate": 8.281423804226919e-06, "loss": 2.6302, "mean_token_accuracy": 0.47150153217568946, "step": 9270 }, { "epoch": 1.7187615869484612, "grad_norm": 5.65234375, "learning_rate": 8.28123841305154e-06, "loss": 3.159, "mean_token_accuracy": 0.44659365666334805, "step": 9271 }, { "epoch": 1.7189469781238413, "grad_norm": 6.3671875, "learning_rate": 8.28105302187616e-06, "loss": 3.0113, "mean_token_accuracy": 0.46462303231151614, "step": 9272 }, { "epoch": 1.7191323692992213, "grad_norm": 5.55859375, "learning_rate": 8.28086763070078e-06, "loss": 3.053, "mean_token_accuracy": 0.44271364099577815, "step": 9273 }, { "epoch": 1.7193177604746013, "grad_norm": 6.65625, "learning_rate": 8.280682239525399e-06, "loss": 2.2959, "mean_token_accuracy": 0.5494971715901948, "step": 9274 }, { "epoch": 1.7195031516499815, "grad_norm": 6.12109375, "learning_rate": 8.28049684835002e-06, "loss": 3.0604, "mean_token_accuracy": 0.4432760663507109, "step": 9275 }, { "epoch": 1.7196885428253617, "grad_norm": 5.67578125, "learning_rate": 8.280311457174638e-06, "loss": 2.6921, "mean_token_accuracy": 0.46728845633955124, "step": 9276 }, { "epoch": 1.7198739340007414, "grad_norm": 5.70703125, "learning_rate": 8.280126065999259e-06, "loss": 2.4873, "mean_token_accuracy": 0.49040011725047633, "step": 9277 }, { "epoch": 1.7200593251761216, "grad_norm": 6.65625, "learning_rate": 8.27994067482388e-06, "loss": 3.1706, "mean_token_accuracy": 0.4393828067597355, "step": 9278 }, { "epoch": 1.7202447163515018, "grad_norm": 5.50390625, "learning_rate": 8.279755283648498e-06, "loss": 3.0636, "mean_token_accuracy": 0.43363711681855843, "step": 9279 }, { "epoch": 1.7204301075268817, "grad_norm": 8.3359375, "learning_rate": 8.279569892473119e-06, "loss": 2.8496, "mean_token_accuracy": 0.4640439932318105, "step": 9280 }, { "epoch": 1.7206154987022617, "grad_norm": 5.3671875, "learning_rate": 8.27938450129774e-06, "loss": 2.7917, "mean_token_accuracy": 0.48745724059293044, "step": 9281 }, { "epoch": 1.720800889877642, "grad_norm": 5.9453125, "learning_rate": 8.27919911012236e-06, "loss": 3.078, "mean_token_accuracy": 0.43871800892228224, "step": 9282 }, { "epoch": 1.7209862810530219, "grad_norm": 7.66015625, "learning_rate": 8.279013718946978e-06, "loss": 2.5141, "mean_token_accuracy": 0.49268094334507995, "step": 9283 }, { "epoch": 1.7211716722284018, "grad_norm": 6.5078125, "learning_rate": 8.278828327771599e-06, "loss": 3.1947, "mean_token_accuracy": 0.404654823476501, "step": 9284 }, { "epoch": 1.721357063403782, "grad_norm": 5.54296875, "learning_rate": 8.278642936596218e-06, "loss": 2.763, "mean_token_accuracy": 0.4711985688729875, "step": 9285 }, { "epoch": 1.721542454579162, "grad_norm": 6.33203125, "learning_rate": 8.278457545420838e-06, "loss": 3.0974, "mean_token_accuracy": 0.4415665147119768, "step": 9286 }, { "epoch": 1.721727845754542, "grad_norm": 7.06640625, "learning_rate": 8.278272154245459e-06, "loss": 3.5288, "mean_token_accuracy": 0.42699036058550516, "step": 9287 }, { "epoch": 1.7219132369299222, "grad_norm": 5.703125, "learning_rate": 8.27808676307008e-06, "loss": 2.5371, "mean_token_accuracy": 0.47725887086940616, "step": 9288 }, { "epoch": 1.7220986281053023, "grad_norm": 5.546875, "learning_rate": 8.277901371894698e-06, "loss": 2.8843, "mean_token_accuracy": 0.46002300834052345, "step": 9289 }, { "epoch": 1.722284019280682, "grad_norm": 6.1953125, "learning_rate": 8.277715980719318e-06, "loss": 3.4144, "mean_token_accuracy": 0.43328566714332856, "step": 9290 }, { "epoch": 1.7224694104560623, "grad_norm": 6.87109375, "learning_rate": 8.277530589543939e-06, "loss": 2.6404, "mean_token_accuracy": 0.5106971328773104, "step": 9291 }, { "epoch": 1.7226548016314425, "grad_norm": 5.671875, "learning_rate": 8.277345198368558e-06, "loss": 3.1405, "mean_token_accuracy": 0.44143749315218583, "step": 9292 }, { "epoch": 1.7228401928068224, "grad_norm": 8.3828125, "learning_rate": 8.277159807193178e-06, "loss": 3.0832, "mean_token_accuracy": 0.4482724870314182, "step": 9293 }, { "epoch": 1.7230255839822024, "grad_norm": 7.25390625, "learning_rate": 8.276974416017797e-06, "loss": 3.4287, "mean_token_accuracy": 0.4282844990548204, "step": 9294 }, { "epoch": 1.7232109751575826, "grad_norm": 5.85546875, "learning_rate": 8.276789024842418e-06, "loss": 3.2222, "mean_token_accuracy": 0.43010176754151047, "step": 9295 }, { "epoch": 1.7233963663329626, "grad_norm": 5.75390625, "learning_rate": 8.276603633667038e-06, "loss": 2.6772, "mean_token_accuracy": 0.48214686728048506, "step": 9296 }, { "epoch": 1.7235817575083425, "grad_norm": 6.85546875, "learning_rate": 8.276418242491659e-06, "loss": 3.1786, "mean_token_accuracy": 0.4303312027890761, "step": 9297 }, { "epoch": 1.7237671486837227, "grad_norm": 5.8515625, "learning_rate": 8.276232851316277e-06, "loss": 3.4945, "mean_token_accuracy": 0.43135917030567683, "step": 9298 }, { "epoch": 1.7239525398591027, "grad_norm": 6.91796875, "learning_rate": 8.276047460140898e-06, "loss": 3.0121, "mean_token_accuracy": 0.4461756373937677, "step": 9299 }, { "epoch": 1.7241379310344827, "grad_norm": 5.82421875, "learning_rate": 8.275862068965518e-06, "loss": 3.5069, "mean_token_accuracy": 0.392887383573243, "step": 9300 }, { "epoch": 1.7243233222098628, "grad_norm": 6.4296875, "learning_rate": 8.275676677790137e-06, "loss": 3.561, "mean_token_accuracy": 0.42029174264131286, "step": 9301 }, { "epoch": 1.7245087133852428, "grad_norm": 6.6640625, "learning_rate": 8.275491286614758e-06, "loss": 2.8963, "mean_token_accuracy": 0.4639070606812542, "step": 9302 }, { "epoch": 1.7246941045606228, "grad_norm": 8.21875, "learning_rate": 8.275305895439376e-06, "loss": 2.147, "mean_token_accuracy": 0.5434427260153402, "step": 9303 }, { "epoch": 1.724879495736003, "grad_norm": 7.56640625, "learning_rate": 8.275120504263999e-06, "loss": 2.5683, "mean_token_accuracy": 0.504697631335186, "step": 9304 }, { "epoch": 1.7250648869113832, "grad_norm": 5.6875, "learning_rate": 8.274935113088617e-06, "loss": 2.6573, "mean_token_accuracy": 0.5122749590834698, "step": 9305 }, { "epoch": 1.7252502780867631, "grad_norm": 8.0234375, "learning_rate": 8.274749721913238e-06, "loss": 2.573, "mean_token_accuracy": 0.5083012352238013, "step": 9306 }, { "epoch": 1.725435669262143, "grad_norm": 7.0, "learning_rate": 8.274564330737858e-06, "loss": 2.6803, "mean_token_accuracy": 0.4829891546575546, "step": 9307 }, { "epoch": 1.7256210604375233, "grad_norm": 6.39453125, "learning_rate": 8.274378939562477e-06, "loss": 3.4856, "mean_token_accuracy": 0.40768775872264934, "step": 9308 }, { "epoch": 1.7258064516129032, "grad_norm": 7.34765625, "learning_rate": 8.274193548387098e-06, "loss": 2.4347, "mean_token_accuracy": 0.47008642945336176, "step": 9309 }, { "epoch": 1.7259918427882832, "grad_norm": 6.15234375, "learning_rate": 8.274008157211716e-06, "loss": 3.0024, "mean_token_accuracy": 0.4743477718137561, "step": 9310 }, { "epoch": 1.7261772339636634, "grad_norm": 6.35546875, "learning_rate": 8.273822766036337e-06, "loss": 3.0678, "mean_token_accuracy": 0.46008492569002124, "step": 9311 }, { "epoch": 1.7263626251390434, "grad_norm": 7.49609375, "learning_rate": 8.273637374860957e-06, "loss": 2.3818, "mean_token_accuracy": 0.5042965627498002, "step": 9312 }, { "epoch": 1.7265480163144233, "grad_norm": 5.57421875, "learning_rate": 8.273451983685578e-06, "loss": 3.0811, "mean_token_accuracy": 0.44785276073619634, "step": 9313 }, { "epoch": 1.7267334074898035, "grad_norm": 5.72265625, "learning_rate": 8.273266592510197e-06, "loss": 2.7298, "mean_token_accuracy": 0.4797364495275982, "step": 9314 }, { "epoch": 1.7269187986651835, "grad_norm": 6.66796875, "learning_rate": 8.273081201334817e-06, "loss": 3.0762, "mean_token_accuracy": 0.4327558696147709, "step": 9315 }, { "epoch": 1.7271041898405635, "grad_norm": 5.45703125, "learning_rate": 8.272895810159438e-06, "loss": 3.0138, "mean_token_accuracy": 0.4311095827602017, "step": 9316 }, { "epoch": 1.7272895810159437, "grad_norm": 5.61328125, "learning_rate": 8.272710418984057e-06, "loss": 3.0153, "mean_token_accuracy": 0.4666947132239518, "step": 9317 }, { "epoch": 1.7274749721913238, "grad_norm": 6.2421875, "learning_rate": 8.272525027808677e-06, "loss": 2.9025, "mean_token_accuracy": 0.4673913043478261, "step": 9318 }, { "epoch": 1.7276603633667036, "grad_norm": 6.2421875, "learning_rate": 8.272339636633296e-06, "loss": 3.3398, "mean_token_accuracy": 0.43284201785343573, "step": 9319 }, { "epoch": 1.7278457545420838, "grad_norm": 6.40625, "learning_rate": 8.272154245457918e-06, "loss": 2.6005, "mean_token_accuracy": 0.4953247409653778, "step": 9320 }, { "epoch": 1.728031145717464, "grad_norm": 5.98828125, "learning_rate": 8.271968854282537e-06, "loss": 2.806, "mean_token_accuracy": 0.48205128205128206, "step": 9321 }, { "epoch": 1.728216536892844, "grad_norm": 7.98046875, "learning_rate": 8.271783463107157e-06, "loss": 2.6799, "mean_token_accuracy": 0.4909946082367787, "step": 9322 }, { "epoch": 1.728401928068224, "grad_norm": 7.6953125, "learning_rate": 8.271598071931776e-06, "loss": 2.8031, "mean_token_accuracy": 0.47549501474512007, "step": 9323 }, { "epoch": 1.728587319243604, "grad_norm": 5.1015625, "learning_rate": 8.271412680756397e-06, "loss": 2.6223, "mean_token_accuracy": 0.4805940871514642, "step": 9324 }, { "epoch": 1.728772710418984, "grad_norm": 5.11328125, "learning_rate": 8.271227289581017e-06, "loss": 3.0015, "mean_token_accuracy": 0.4448269267546376, "step": 9325 }, { "epoch": 1.728958101594364, "grad_norm": 7.4140625, "learning_rate": 8.271041898405636e-06, "loss": 2.7509, "mean_token_accuracy": 0.4804869913275517, "step": 9326 }, { "epoch": 1.7291434927697442, "grad_norm": 8.875, "learning_rate": 8.270856507230256e-06, "loss": 3.0281, "mean_token_accuracy": 0.45755128764731556, "step": 9327 }, { "epoch": 1.7293288839451242, "grad_norm": 6.49609375, "learning_rate": 8.270671116054877e-06, "loss": 3.6008, "mean_token_accuracy": 0.41527034030057003, "step": 9328 }, { "epoch": 1.7295142751205042, "grad_norm": 5.87890625, "learning_rate": 8.270485724879497e-06, "loss": 3.1132, "mean_token_accuracy": 0.450685117351784, "step": 9329 }, { "epoch": 1.7296996662958843, "grad_norm": 7.5703125, "learning_rate": 8.270300333704116e-06, "loss": 2.4219, "mean_token_accuracy": 0.5050994390617032, "step": 9330 }, { "epoch": 1.7298850574712645, "grad_norm": 6.40234375, "learning_rate": 8.270114942528737e-06, "loss": 3.5794, "mean_token_accuracy": 0.42763636363636365, "step": 9331 }, { "epoch": 1.7300704486466443, "grad_norm": 6.27734375, "learning_rate": 8.269929551353355e-06, "loss": 2.9938, "mean_token_accuracy": 0.46228981206726016, "step": 9332 }, { "epoch": 1.7302558398220245, "grad_norm": 6.421875, "learning_rate": 8.269744160177976e-06, "loss": 2.9216, "mean_token_accuracy": 0.47494587070832045, "step": 9333 }, { "epoch": 1.7304412309974047, "grad_norm": 6.19140625, "learning_rate": 8.269558769002596e-06, "loss": 2.6744, "mean_token_accuracy": 0.49249482401656314, "step": 9334 }, { "epoch": 1.7306266221727846, "grad_norm": 6.2109375, "learning_rate": 8.269373377827215e-06, "loss": 2.5632, "mean_token_accuracy": 0.5022277227722772, "step": 9335 }, { "epoch": 1.7308120133481646, "grad_norm": 5.390625, "learning_rate": 8.269187986651836e-06, "loss": 2.5703, "mean_token_accuracy": 0.49089861751152075, "step": 9336 }, { "epoch": 1.7309974045235448, "grad_norm": 6.234375, "learning_rate": 8.269002595476456e-06, "loss": 2.9509, "mean_token_accuracy": 0.46065318818040435, "step": 9337 }, { "epoch": 1.7311827956989247, "grad_norm": 5.83203125, "learning_rate": 8.268817204301077e-06, "loss": 2.6467, "mean_token_accuracy": 0.49814183957881697, "step": 9338 }, { "epoch": 1.7313681868743047, "grad_norm": 5.98046875, "learning_rate": 8.268631813125695e-06, "loss": 2.6043, "mean_token_accuracy": 0.48254378746914306, "step": 9339 }, { "epoch": 1.731553578049685, "grad_norm": 6.2734375, "learning_rate": 8.268446421950316e-06, "loss": 3.3792, "mean_token_accuracy": 0.42272832907273994, "step": 9340 }, { "epoch": 1.7317389692250649, "grad_norm": 6.421875, "learning_rate": 8.268261030774935e-06, "loss": 2.4625, "mean_token_accuracy": 0.5006131959774344, "step": 9341 }, { "epoch": 1.7319243604004448, "grad_norm": 6.9375, "learning_rate": 8.268075639599555e-06, "loss": 3.8282, "mean_token_accuracy": 0.41473178542834266, "step": 9342 }, { "epoch": 1.732109751575825, "grad_norm": 7.76953125, "learning_rate": 8.267890248424176e-06, "loss": 3.033, "mean_token_accuracy": 0.45601064616294545, "step": 9343 }, { "epoch": 1.732295142751205, "grad_norm": 5.22265625, "learning_rate": 8.267704857248796e-06, "loss": 2.7972, "mean_token_accuracy": 0.48488476504040706, "step": 9344 }, { "epoch": 1.732480533926585, "grad_norm": 8.3203125, "learning_rate": 8.267519466073417e-06, "loss": 2.5188, "mean_token_accuracy": 0.48247223563495895, "step": 9345 }, { "epoch": 1.7326659251019652, "grad_norm": 6.68359375, "learning_rate": 8.267334074898036e-06, "loss": 2.8236, "mean_token_accuracy": 0.48553459119496856, "step": 9346 }, { "epoch": 1.7328513162773453, "grad_norm": 7.9140625, "learning_rate": 8.267148683722656e-06, "loss": 2.8859, "mean_token_accuracy": 0.466206616602395, "step": 9347 }, { "epoch": 1.733036707452725, "grad_norm": 7.421875, "learning_rate": 8.266963292547275e-06, "loss": 3.2492, "mean_token_accuracy": 0.43426025917926564, "step": 9348 }, { "epoch": 1.7332220986281053, "grad_norm": 10.828125, "learning_rate": 8.266777901371895e-06, "loss": 2.4428, "mean_token_accuracy": 0.5040750251200179, "step": 9349 }, { "epoch": 1.7334074898034855, "grad_norm": 6.24609375, "learning_rate": 8.266592510196516e-06, "loss": 3.376, "mean_token_accuracy": 0.42036027494666983, "step": 9350 }, { "epoch": 1.7335928809788654, "grad_norm": 9.1953125, "learning_rate": 8.266407119021135e-06, "loss": 2.4273, "mean_token_accuracy": 0.5161923454367027, "step": 9351 }, { "epoch": 1.7337782721542454, "grad_norm": 7.5625, "learning_rate": 8.266221727845755e-06, "loss": 2.684, "mean_token_accuracy": 0.47750346100599905, "step": 9352 }, { "epoch": 1.7339636633296256, "grad_norm": 6.01953125, "learning_rate": 8.266036336670376e-06, "loss": 3.2636, "mean_token_accuracy": 0.4480085538626036, "step": 9353 }, { "epoch": 1.7341490545050056, "grad_norm": 8.234375, "learning_rate": 8.265850945494996e-06, "loss": 2.5755, "mean_token_accuracy": 0.47633136094674555, "step": 9354 }, { "epoch": 1.7343344456803855, "grad_norm": 7.0703125, "learning_rate": 8.265665554319615e-06, "loss": 2.6987, "mean_token_accuracy": 0.5024768195097168, "step": 9355 }, { "epoch": 1.7345198368557657, "grad_norm": 7.78515625, "learning_rate": 8.265480163144235e-06, "loss": 3.0672, "mean_token_accuracy": 0.4562201753412496, "step": 9356 }, { "epoch": 1.7347052280311457, "grad_norm": 6.421875, "learning_rate": 8.265294771968854e-06, "loss": 3.6915, "mean_token_accuracy": 0.41029923451635353, "step": 9357 }, { "epoch": 1.7348906192065257, "grad_norm": 8.703125, "learning_rate": 8.265109380793475e-06, "loss": 2.9138, "mean_token_accuracy": 0.46609868654597086, "step": 9358 }, { "epoch": 1.7350760103819058, "grad_norm": 10.8046875, "learning_rate": 8.264923989618095e-06, "loss": 4.3301, "mean_token_accuracy": 0.4281911353653667, "step": 9359 }, { "epoch": 1.735261401557286, "grad_norm": 12.5703125, "learning_rate": 8.264738598442716e-06, "loss": 2.875, "mean_token_accuracy": 0.45106996874248617, "step": 9360 }, { "epoch": 1.7354467927326658, "grad_norm": 9.328125, "learning_rate": 8.264553207267334e-06, "loss": 3.1992, "mean_token_accuracy": 0.44613396938978334, "step": 9361 }, { "epoch": 1.735632183908046, "grad_norm": 9.03125, "learning_rate": 8.264367816091955e-06, "loss": 3.6802, "mean_token_accuracy": 0.4004074596458235, "step": 9362 }, { "epoch": 1.7358175750834262, "grad_norm": 10.546875, "learning_rate": 8.264182424916575e-06, "loss": 3.0055, "mean_token_accuracy": 0.4527725962353739, "step": 9363 }, { "epoch": 1.7360029662588061, "grad_norm": 10.3984375, "learning_rate": 8.263997033741194e-06, "loss": 3.1479, "mean_token_accuracy": 0.4545003309066843, "step": 9364 }, { "epoch": 1.736188357434186, "grad_norm": 9.8515625, "learning_rate": 8.263811642565815e-06, "loss": 2.4416, "mean_token_accuracy": 0.501778093883357, "step": 9365 }, { "epoch": 1.7363737486095663, "grad_norm": 11.125, "learning_rate": 8.263626251390433e-06, "loss": 2.6524, "mean_token_accuracy": 0.47113478691774036, "step": 9366 }, { "epoch": 1.7365591397849462, "grad_norm": 9.3046875, "learning_rate": 8.263440860215054e-06, "loss": 2.723, "mean_token_accuracy": 0.45169043422528443, "step": 9367 }, { "epoch": 1.7367445309603262, "grad_norm": 7.16015625, "learning_rate": 8.263255469039674e-06, "loss": 2.5012, "mean_token_accuracy": 0.5, "step": 9368 }, { "epoch": 1.7369299221357064, "grad_norm": 6.4609375, "learning_rate": 8.263070077864295e-06, "loss": 3.229, "mean_token_accuracy": 0.4164645239539115, "step": 9369 }, { "epoch": 1.7371153133110864, "grad_norm": 7.7421875, "learning_rate": 8.262884686688914e-06, "loss": 2.4474, "mean_token_accuracy": 0.5226224783861672, "step": 9370 }, { "epoch": 1.7373007044864663, "grad_norm": 6.73046875, "learning_rate": 8.262699295513534e-06, "loss": 3.4926, "mean_token_accuracy": 0.42429696287964, "step": 9371 }, { "epoch": 1.7374860956618465, "grad_norm": 5.8125, "learning_rate": 8.262513904338155e-06, "loss": 3.2431, "mean_token_accuracy": 0.4444765342960289, "step": 9372 }, { "epoch": 1.7376714868372265, "grad_norm": 5.34765625, "learning_rate": 8.262328513162774e-06, "loss": 2.8934, "mean_token_accuracy": 0.45482411326813604, "step": 9373 }, { "epoch": 1.7378568780126065, "grad_norm": 6.046875, "learning_rate": 8.262143121987394e-06, "loss": 2.8084, "mean_token_accuracy": 0.47649424490361947, "step": 9374 }, { "epoch": 1.7380422691879867, "grad_norm": 5.515625, "learning_rate": 8.261957730812013e-06, "loss": 3.1988, "mean_token_accuracy": 0.4539656771799629, "step": 9375 }, { "epoch": 1.7382276603633668, "grad_norm": 5.3515625, "learning_rate": 8.261772339636635e-06, "loss": 2.2626, "mean_token_accuracy": 0.5365316362746297, "step": 9376 }, { "epoch": 1.7384130515387466, "grad_norm": 6.15234375, "learning_rate": 8.261586948461254e-06, "loss": 3.3794, "mean_token_accuracy": 0.4443375040102663, "step": 9377 }, { "epoch": 1.7385984427141268, "grad_norm": 5.23828125, "learning_rate": 8.261401557285874e-06, "loss": 2.6585, "mean_token_accuracy": 0.48484848484848486, "step": 9378 }, { "epoch": 1.738783833889507, "grad_norm": 7.20703125, "learning_rate": 8.261216166110493e-06, "loss": 2.7338, "mean_token_accuracy": 0.48241559485530544, "step": 9379 }, { "epoch": 1.738969225064887, "grad_norm": 7.41796875, "learning_rate": 8.261030774935114e-06, "loss": 2.9851, "mean_token_accuracy": 0.47561144120491916, "step": 9380 }, { "epoch": 1.739154616240267, "grad_norm": 6.56640625, "learning_rate": 8.260845383759734e-06, "loss": 3.702, "mean_token_accuracy": 0.410084985835694, "step": 9381 }, { "epoch": 1.739340007415647, "grad_norm": 6.36328125, "learning_rate": 8.260659992584353e-06, "loss": 2.4, "mean_token_accuracy": 0.5186509355261589, "step": 9382 }, { "epoch": 1.739525398591027, "grad_norm": 5.2578125, "learning_rate": 8.260474601408973e-06, "loss": 2.4216, "mean_token_accuracy": 0.49484978540772534, "step": 9383 }, { "epoch": 1.739710789766407, "grad_norm": 6.88671875, "learning_rate": 8.260289210233594e-06, "loss": 2.8913, "mean_token_accuracy": 0.45968475295544103, "step": 9384 }, { "epoch": 1.7398961809417872, "grad_norm": 6.82421875, "learning_rate": 8.260103819058214e-06, "loss": 3.2902, "mean_token_accuracy": 0.4588020674977197, "step": 9385 }, { "epoch": 1.7400815721171672, "grad_norm": 6.99609375, "learning_rate": 8.259918427882833e-06, "loss": 2.6656, "mean_token_accuracy": 0.4816723940435281, "step": 9386 }, { "epoch": 1.7402669632925472, "grad_norm": 6.5625, "learning_rate": 8.259733036707454e-06, "loss": 3.0478, "mean_token_accuracy": 0.44549707602339184, "step": 9387 }, { "epoch": 1.7404523544679273, "grad_norm": 8.0, "learning_rate": 8.259547645532074e-06, "loss": 2.6733, "mean_token_accuracy": 0.4737997256515775, "step": 9388 }, { "epoch": 1.7406377456433075, "grad_norm": 6.671875, "learning_rate": 8.259362254356693e-06, "loss": 3.4276, "mean_token_accuracy": 0.4253090909090909, "step": 9389 }, { "epoch": 1.7408231368186873, "grad_norm": 5.7109375, "learning_rate": 8.259176863181313e-06, "loss": 2.8682, "mean_token_accuracy": 0.47598979705001665, "step": 9390 }, { "epoch": 1.7410085279940675, "grad_norm": 5.90625, "learning_rate": 8.258991472005932e-06, "loss": 3.0569, "mean_token_accuracy": 0.43924107815088087, "step": 9391 }, { "epoch": 1.7411939191694477, "grad_norm": 6.19921875, "learning_rate": 8.258806080830554e-06, "loss": 2.3682, "mean_token_accuracy": 0.518824027072758, "step": 9392 }, { "epoch": 1.7413793103448276, "grad_norm": 5.8515625, "learning_rate": 8.258620689655173e-06, "loss": 3.0434, "mean_token_accuracy": 0.4694537762846833, "step": 9393 }, { "epoch": 1.7415647015202076, "grad_norm": 6.68359375, "learning_rate": 8.258435298479794e-06, "loss": 3.3234, "mean_token_accuracy": 0.4292787217283673, "step": 9394 }, { "epoch": 1.7417500926955878, "grad_norm": 6.98828125, "learning_rate": 8.258249907304412e-06, "loss": 3.0662, "mean_token_accuracy": 0.44380979580179925, "step": 9395 }, { "epoch": 1.7419354838709677, "grad_norm": 5.86328125, "learning_rate": 8.258064516129033e-06, "loss": 3.1963, "mean_token_accuracy": 0.45581773799837266, "step": 9396 }, { "epoch": 1.7421208750463477, "grad_norm": 5.85546875, "learning_rate": 8.257879124953653e-06, "loss": 2.7508, "mean_token_accuracy": 0.48428422463876064, "step": 9397 }, { "epoch": 1.742306266221728, "grad_norm": 6.953125, "learning_rate": 8.257693733778272e-06, "loss": 2.4479, "mean_token_accuracy": 0.51224381155177, "step": 9398 }, { "epoch": 1.7424916573971079, "grad_norm": 6.70703125, "learning_rate": 8.257508342602893e-06, "loss": 2.9268, "mean_token_accuracy": 0.4686219262295082, "step": 9399 }, { "epoch": 1.7426770485724878, "grad_norm": 5.45703125, "learning_rate": 8.257322951427513e-06, "loss": 2.4966, "mean_token_accuracy": 0.5131662638341178, "step": 9400 }, { "epoch": 1.742862439747868, "grad_norm": 5.56640625, "learning_rate": 8.257137560252134e-06, "loss": 2.8623, "mean_token_accuracy": 0.46940592799898234, "step": 9401 }, { "epoch": 1.743047830923248, "grad_norm": 6.46484375, "learning_rate": 8.256952169076753e-06, "loss": 2.8002, "mean_token_accuracy": 0.4510169228380686, "step": 9402 }, { "epoch": 1.743233222098628, "grad_norm": 6.3203125, "learning_rate": 8.256766777901373e-06, "loss": 2.9303, "mean_token_accuracy": 0.4623583378305451, "step": 9403 }, { "epoch": 1.7434186132740082, "grad_norm": 6.703125, "learning_rate": 8.256581386725992e-06, "loss": 2.7939, "mean_token_accuracy": 0.4607034899697719, "step": 9404 }, { "epoch": 1.7436040044493883, "grad_norm": 6.3984375, "learning_rate": 8.256395995550612e-06, "loss": 3.2591, "mean_token_accuracy": 0.4199567333693889, "step": 9405 }, { "epoch": 1.7437893956247683, "grad_norm": 7.890625, "learning_rate": 8.256210604375233e-06, "loss": 2.5303, "mean_token_accuracy": 0.4744815596672048, "step": 9406 }, { "epoch": 1.7439747868001483, "grad_norm": 6.953125, "learning_rate": 8.256025213199852e-06, "loss": 3.1461, "mean_token_accuracy": 0.4511627906976744, "step": 9407 }, { "epoch": 1.7441601779755285, "grad_norm": 5.5703125, "learning_rate": 8.255839822024472e-06, "loss": 2.624, "mean_token_accuracy": 0.5181224004753416, "step": 9408 }, { "epoch": 1.7443455691509084, "grad_norm": 5.43359375, "learning_rate": 8.255654430849093e-06, "loss": 2.9124, "mean_token_accuracy": 0.45454545454545453, "step": 9409 }, { "epoch": 1.7445309603262884, "grad_norm": 7.14453125, "learning_rate": 8.255469039673713e-06, "loss": 2.8364, "mean_token_accuracy": 0.45881030253475064, "step": 9410 }, { "epoch": 1.7447163515016686, "grad_norm": 6.3671875, "learning_rate": 8.255283648498332e-06, "loss": 3.0072, "mean_token_accuracy": 0.44368697628177817, "step": 9411 }, { "epoch": 1.7449017426770486, "grad_norm": 5.43359375, "learning_rate": 8.255098257322952e-06, "loss": 2.7304, "mean_token_accuracy": 0.4976258309591643, "step": 9412 }, { "epoch": 1.7450871338524285, "grad_norm": 6.12109375, "learning_rate": 8.254912866147571e-06, "loss": 3.0609, "mean_token_accuracy": 0.4456648009293484, "step": 9413 }, { "epoch": 1.7452725250278087, "grad_norm": 6.1015625, "learning_rate": 8.254727474972192e-06, "loss": 2.3035, "mean_token_accuracy": 0.5139121689574254, "step": 9414 }, { "epoch": 1.7454579162031887, "grad_norm": 9.28125, "learning_rate": 8.254542083796812e-06, "loss": 2.9757, "mean_token_accuracy": 0.445869907720009, "step": 9415 }, { "epoch": 1.7456433073785687, "grad_norm": 8.8203125, "learning_rate": 8.254356692621431e-06, "loss": 2.032, "mean_token_accuracy": 0.589329268292683, "step": 9416 }, { "epoch": 1.7458286985539488, "grad_norm": 5.59375, "learning_rate": 8.254171301446051e-06, "loss": 2.6346, "mean_token_accuracy": 0.4829240756421112, "step": 9417 }, { "epoch": 1.746014089729329, "grad_norm": 6.07421875, "learning_rate": 8.253985910270672e-06, "loss": 2.9825, "mean_token_accuracy": 0.46016959273856445, "step": 9418 }, { "epoch": 1.7461994809047088, "grad_norm": 10.15625, "learning_rate": 8.253800519095292e-06, "loss": 2.9424, "mean_token_accuracy": 0.4825133372851215, "step": 9419 }, { "epoch": 1.746384872080089, "grad_norm": 9.0625, "learning_rate": 8.253615127919911e-06, "loss": 2.7412, "mean_token_accuracy": 0.46885107579805935, "step": 9420 }, { "epoch": 1.7465702632554692, "grad_norm": 5.51953125, "learning_rate": 8.253429736744532e-06, "loss": 3.4558, "mean_token_accuracy": 0.4116436342252279, "step": 9421 }, { "epoch": 1.7467556544308491, "grad_norm": 9.6796875, "learning_rate": 8.25324434556915e-06, "loss": 3.088, "mean_token_accuracy": 0.44594594594594594, "step": 9422 }, { "epoch": 1.746941045606229, "grad_norm": 15.5703125, "learning_rate": 8.253058954393771e-06, "loss": 2.7433, "mean_token_accuracy": 0.4756082188327241, "step": 9423 }, { "epoch": 1.7471264367816093, "grad_norm": 8.8515625, "learning_rate": 8.252873563218391e-06, "loss": 2.6124, "mean_token_accuracy": 0.4821522034962839, "step": 9424 }, { "epoch": 1.7473118279569892, "grad_norm": 7.359375, "learning_rate": 8.252688172043012e-06, "loss": 2.5557, "mean_token_accuracy": 0.527177089421391, "step": 9425 }, { "epoch": 1.7474972191323692, "grad_norm": 13.25, "learning_rate": 8.252502780867632e-06, "loss": 2.844, "mean_token_accuracy": 0.4844280860702152, "step": 9426 }, { "epoch": 1.7476826103077494, "grad_norm": 9.578125, "learning_rate": 8.252317389692251e-06, "loss": 2.8621, "mean_token_accuracy": 0.48211169284467714, "step": 9427 }, { "epoch": 1.7478680014831294, "grad_norm": 5.64453125, "learning_rate": 8.252131998516872e-06, "loss": 2.8808, "mean_token_accuracy": 0.48101124846864907, "step": 9428 }, { "epoch": 1.7480533926585093, "grad_norm": 5.56640625, "learning_rate": 8.25194660734149e-06, "loss": 2.4273, "mean_token_accuracy": 0.5336857280153772, "step": 9429 }, { "epoch": 1.7482387838338895, "grad_norm": 7.3046875, "learning_rate": 8.251761216166111e-06, "loss": 2.8881, "mean_token_accuracy": 0.4724942570426792, "step": 9430 }, { "epoch": 1.7484241750092697, "grad_norm": 7.5234375, "learning_rate": 8.251575824990732e-06, "loss": 2.225, "mean_token_accuracy": 0.5436857107448259, "step": 9431 }, { "epoch": 1.7486095661846495, "grad_norm": 7.1875, "learning_rate": 8.25139043381535e-06, "loss": 3.5422, "mean_token_accuracy": 0.39936367409046897, "step": 9432 }, { "epoch": 1.7487949573600297, "grad_norm": 7.359375, "learning_rate": 8.25120504263997e-06, "loss": 2.9385, "mean_token_accuracy": 0.4784204088764634, "step": 9433 }, { "epoch": 1.7489803485354098, "grad_norm": 12.6171875, "learning_rate": 8.251019651464591e-06, "loss": 2.5904, "mean_token_accuracy": 0.50630068621335, "step": 9434 }, { "epoch": 1.7491657397107898, "grad_norm": 7.8125, "learning_rate": 8.250834260289212e-06, "loss": 3.3426, "mean_token_accuracy": 0.41955427749820273, "step": 9435 }, { "epoch": 1.7493511308861698, "grad_norm": 8.3828125, "learning_rate": 8.25064886911383e-06, "loss": 3.2227, "mean_token_accuracy": 0.4277071051815079, "step": 9436 }, { "epoch": 1.74953652206155, "grad_norm": 7.85546875, "learning_rate": 8.250463477938451e-06, "loss": 3.2273, "mean_token_accuracy": 0.4475897192940898, "step": 9437 }, { "epoch": 1.74972191323693, "grad_norm": 8.375, "learning_rate": 8.25027808676307e-06, "loss": 2.6869, "mean_token_accuracy": 0.45848771785371206, "step": 9438 }, { "epoch": 1.74990730441231, "grad_norm": 7.9375, "learning_rate": 8.25009269558769e-06, "loss": 2.5631, "mean_token_accuracy": 0.49118671858987495, "step": 9439 }, { "epoch": 1.75009269558769, "grad_norm": 8.0234375, "learning_rate": 8.249907304412311e-06, "loss": 2.2374, "mean_token_accuracy": 0.55092655214392, "step": 9440 }, { "epoch": 1.75027808676307, "grad_norm": 8.515625, "learning_rate": 8.249721913236931e-06, "loss": 3.1711, "mean_token_accuracy": 0.43751735628991945, "step": 9441 }, { "epoch": 1.75046347793845, "grad_norm": 7.52734375, "learning_rate": 8.24953652206155e-06, "loss": 2.5655, "mean_token_accuracy": 0.4840611091062192, "step": 9442 }, { "epoch": 1.7506488691138302, "grad_norm": 6.43359375, "learning_rate": 8.24935113088617e-06, "loss": 3.2653, "mean_token_accuracy": 0.42957835116425425, "step": 9443 }, { "epoch": 1.7508342602892102, "grad_norm": 7.1796875, "learning_rate": 8.249165739710791e-06, "loss": 2.6631, "mean_token_accuracy": 0.495279307631786, "step": 9444 }, { "epoch": 1.7510196514645902, "grad_norm": 8.4609375, "learning_rate": 8.24898034853541e-06, "loss": 2.1668, "mean_token_accuracy": 0.5481319880010909, "step": 9445 }, { "epoch": 1.7512050426399703, "grad_norm": 6.0859375, "learning_rate": 8.24879495736003e-06, "loss": 2.7202, "mean_token_accuracy": 0.4739385065885798, "step": 9446 }, { "epoch": 1.7513904338153505, "grad_norm": 7.90234375, "learning_rate": 8.24860956618465e-06, "loss": 2.8585, "mean_token_accuracy": 0.4541351549158331, "step": 9447 }, { "epoch": 1.7515758249907303, "grad_norm": 11.0703125, "learning_rate": 8.24842417500927e-06, "loss": 2.5817, "mean_token_accuracy": 0.490241891148983, "step": 9448 }, { "epoch": 1.7517612161661105, "grad_norm": 7.12890625, "learning_rate": 8.24823878383389e-06, "loss": 2.66, "mean_token_accuracy": 0.4823131344201784, "step": 9449 }, { "epoch": 1.7519466073414907, "grad_norm": 8.1953125, "learning_rate": 8.24805339265851e-06, "loss": 3.0404, "mean_token_accuracy": 0.4599828803766317, "step": 9450 }, { "epoch": 1.7521319985168706, "grad_norm": 8.390625, "learning_rate": 8.24786800148313e-06, "loss": 2.2881, "mean_token_accuracy": 0.5435881238155401, "step": 9451 }, { "epoch": 1.7523173896922506, "grad_norm": 6.16015625, "learning_rate": 8.24768261030775e-06, "loss": 2.6605, "mean_token_accuracy": 0.47557807222655235, "step": 9452 }, { "epoch": 1.7525027808676308, "grad_norm": 9.1796875, "learning_rate": 8.24749721913237e-06, "loss": 2.1359, "mean_token_accuracy": 0.5668958223162348, "step": 9453 }, { "epoch": 1.7526881720430108, "grad_norm": 7.67578125, "learning_rate": 8.24731182795699e-06, "loss": 2.1717, "mean_token_accuracy": 0.5342292089249493, "step": 9454 }, { "epoch": 1.7528735632183907, "grad_norm": 6.82421875, "learning_rate": 8.24712643678161e-06, "loss": 3.2054, "mean_token_accuracy": 0.45174696013544713, "step": 9455 }, { "epoch": 1.753058954393771, "grad_norm": 6.44140625, "learning_rate": 8.246941045606229e-06, "loss": 3.1688, "mean_token_accuracy": 0.44142550233792494, "step": 9456 }, { "epoch": 1.7532443455691509, "grad_norm": 7.41015625, "learning_rate": 8.24675565443085e-06, "loss": 2.6262, "mean_token_accuracy": 0.5370440970898462, "step": 9457 }, { "epoch": 1.7534297367445308, "grad_norm": 5.8828125, "learning_rate": 8.24657026325547e-06, "loss": 2.5084, "mean_token_accuracy": 0.485688629475811, "step": 9458 }, { "epoch": 1.753615127919911, "grad_norm": 7.0859375, "learning_rate": 8.24638487208009e-06, "loss": 2.6368, "mean_token_accuracy": 0.4763805721889554, "step": 9459 }, { "epoch": 1.7538005190952912, "grad_norm": 6.09375, "learning_rate": 8.246199480904709e-06, "loss": 3.6168, "mean_token_accuracy": 0.4252432506509524, "step": 9460 }, { "epoch": 1.753985910270671, "grad_norm": 9.9296875, "learning_rate": 8.24601408972933e-06, "loss": 2.8573, "mean_token_accuracy": 0.4588820906356943, "step": 9461 }, { "epoch": 1.7541713014460512, "grad_norm": 5.8515625, "learning_rate": 8.24582869855395e-06, "loss": 2.7066, "mean_token_accuracy": 0.504407680231856, "step": 9462 }, { "epoch": 1.7543566926214313, "grad_norm": 6.125, "learning_rate": 8.245643307378569e-06, "loss": 3.3165, "mean_token_accuracy": 0.44382801664355065, "step": 9463 }, { "epoch": 1.7545420837968113, "grad_norm": 9.1171875, "learning_rate": 8.245457916203189e-06, "loss": 2.201, "mean_token_accuracy": 0.5501640085963126, "step": 9464 }, { "epoch": 1.7547274749721913, "grad_norm": 6.59765625, "learning_rate": 8.24527252502781e-06, "loss": 2.3556, "mean_token_accuracy": 0.5039231881065456, "step": 9465 }, { "epoch": 1.7549128661475715, "grad_norm": 6.44921875, "learning_rate": 8.24508713385243e-06, "loss": 2.7625, "mean_token_accuracy": 0.47192533493903355, "step": 9466 }, { "epoch": 1.7550982573229514, "grad_norm": 5.96875, "learning_rate": 8.244901742677049e-06, "loss": 2.5207, "mean_token_accuracy": 0.49774947853770996, "step": 9467 }, { "epoch": 1.7552836484983314, "grad_norm": 7.03125, "learning_rate": 8.24471635150167e-06, "loss": 2.39, "mean_token_accuracy": 0.5091093117408907, "step": 9468 }, { "epoch": 1.7554690396737116, "grad_norm": 6.89453125, "learning_rate": 8.24453096032629e-06, "loss": 2.6462, "mean_token_accuracy": 0.47351524879614765, "step": 9469 }, { "epoch": 1.7556544308490916, "grad_norm": 7.01953125, "learning_rate": 8.244345569150909e-06, "loss": 2.6854, "mean_token_accuracy": 0.46433941997851774, "step": 9470 }, { "epoch": 1.7558398220244715, "grad_norm": 8.984375, "learning_rate": 8.24416017797553e-06, "loss": 2.6181, "mean_token_accuracy": 0.4770395074397127, "step": 9471 }, { "epoch": 1.7560252131998517, "grad_norm": 6.39453125, "learning_rate": 8.243974786800148e-06, "loss": 3.6557, "mean_token_accuracy": 0.4180726800778715, "step": 9472 }, { "epoch": 1.7562106043752317, "grad_norm": 5.83984375, "learning_rate": 8.24378939562477e-06, "loss": 2.5307, "mean_token_accuracy": 0.5004932587964486, "step": 9473 }, { "epoch": 1.7563959955506117, "grad_norm": 6.84375, "learning_rate": 8.243604004449389e-06, "loss": 3.007, "mean_token_accuracy": 0.4706298655343241, "step": 9474 }, { "epoch": 1.7565813867259918, "grad_norm": 5.47265625, "learning_rate": 8.24341861327401e-06, "loss": 2.6142, "mean_token_accuracy": 0.49333715432010317, "step": 9475 }, { "epoch": 1.756766777901372, "grad_norm": 7.234375, "learning_rate": 8.243233222098628e-06, "loss": 2.9798, "mean_token_accuracy": 0.4787966252220249, "step": 9476 }, { "epoch": 1.756952169076752, "grad_norm": 6.96875, "learning_rate": 8.243047830923249e-06, "loss": 2.8049, "mean_token_accuracy": 0.46520982822352686, "step": 9477 }, { "epoch": 1.757137560252132, "grad_norm": 6.046875, "learning_rate": 8.24286243974787e-06, "loss": 2.6237, "mean_token_accuracy": 0.4904397705544933, "step": 9478 }, { "epoch": 1.7573229514275122, "grad_norm": 7.296875, "learning_rate": 8.242677048572488e-06, "loss": 3.0178, "mean_token_accuracy": 0.4608788853161844, "step": 9479 }, { "epoch": 1.7575083426028921, "grad_norm": 6.8203125, "learning_rate": 8.242491657397109e-06, "loss": 2.973, "mean_token_accuracy": 0.4524380495603517, "step": 9480 }, { "epoch": 1.757693733778272, "grad_norm": 6.0546875, "learning_rate": 8.242306266221729e-06, "loss": 2.5458, "mean_token_accuracy": 0.49889494622071606, "step": 9481 }, { "epoch": 1.7578791249536523, "grad_norm": 7.26953125, "learning_rate": 8.24212087504635e-06, "loss": 2.9402, "mean_token_accuracy": 0.4562556663644606, "step": 9482 }, { "epoch": 1.7580645161290323, "grad_norm": 6.66796875, "learning_rate": 8.241935483870968e-06, "loss": 2.5337, "mean_token_accuracy": 0.49869420702754036, "step": 9483 }, { "epoch": 1.7582499073044122, "grad_norm": 5.35546875, "learning_rate": 8.241750092695589e-06, "loss": 3.3672, "mean_token_accuracy": 0.4471954940154893, "step": 9484 }, { "epoch": 1.7584352984797924, "grad_norm": 5.9296875, "learning_rate": 8.241564701520208e-06, "loss": 3.193, "mean_token_accuracy": 0.4397254952738573, "step": 9485 }, { "epoch": 1.7586206896551724, "grad_norm": 23.21875, "learning_rate": 8.241379310344828e-06, "loss": 3.94, "mean_token_accuracy": 0.47316730971588916, "step": 9486 }, { "epoch": 1.7588060808305523, "grad_norm": 7.296875, "learning_rate": 8.241193919169449e-06, "loss": 3.1978, "mean_token_accuracy": 0.43987014903349564, "step": 9487 }, { "epoch": 1.7589914720059325, "grad_norm": 6.109375, "learning_rate": 8.241008527994067e-06, "loss": 2.9465, "mean_token_accuracy": 0.45274467013597447, "step": 9488 }, { "epoch": 1.7591768631813127, "grad_norm": 5.69921875, "learning_rate": 8.240823136818688e-06, "loss": 2.9341, "mean_token_accuracy": 0.48116646415552855, "step": 9489 }, { "epoch": 1.7593622543566925, "grad_norm": 9.1875, "learning_rate": 8.240637745643308e-06, "loss": 3.4204, "mean_token_accuracy": 0.42246575342465753, "step": 9490 }, { "epoch": 1.7595476455320727, "grad_norm": 5.99609375, "learning_rate": 8.240452354467929e-06, "loss": 3.1232, "mean_token_accuracy": 0.4532803180914513, "step": 9491 }, { "epoch": 1.7597330367074528, "grad_norm": 6.5859375, "learning_rate": 8.240266963292548e-06, "loss": 2.7409, "mean_token_accuracy": 0.47150370290866156, "step": 9492 }, { "epoch": 1.7599184278828328, "grad_norm": 7.81640625, "learning_rate": 8.240081572117168e-06, "loss": 4.1064, "mean_token_accuracy": 0.3837837837837838, "step": 9493 }, { "epoch": 1.7601038190582128, "grad_norm": 6.7265625, "learning_rate": 8.239896180941787e-06, "loss": 3.4168, "mean_token_accuracy": 0.4287435974691172, "step": 9494 }, { "epoch": 1.760289210233593, "grad_norm": 7.7734375, "learning_rate": 8.239710789766407e-06, "loss": 2.7165, "mean_token_accuracy": 0.5046208530805687, "step": 9495 }, { "epoch": 1.760474601408973, "grad_norm": 6.0, "learning_rate": 8.239525398591028e-06, "loss": 3.088, "mean_token_accuracy": 0.4526572290554653, "step": 9496 }, { "epoch": 1.760659992584353, "grad_norm": 9.1796875, "learning_rate": 8.239340007415648e-06, "loss": 3.5986, "mean_token_accuracy": 0.3952473326867119, "step": 9497 }, { "epoch": 1.760845383759733, "grad_norm": 8.328125, "learning_rate": 8.239154616240267e-06, "loss": 3.5032, "mean_token_accuracy": 0.4232600025730091, "step": 9498 }, { "epoch": 1.761030774935113, "grad_norm": 11.4765625, "learning_rate": 8.238969225064888e-06, "loss": 2.8789, "mean_token_accuracy": 0.46553122465531227, "step": 9499 }, { "epoch": 1.761216166110493, "grad_norm": 6.93359375, "learning_rate": 8.238783833889508e-06, "loss": 2.964, "mean_token_accuracy": 0.46408839779005523, "step": 9500 }, { "epoch": 1.7614015572858732, "grad_norm": 8.1875, "learning_rate": 8.238598442714127e-06, "loss": 3.3812, "mean_token_accuracy": 0.4151216305062459, "step": 9501 }, { "epoch": 1.7615869484612534, "grad_norm": 15.6015625, "learning_rate": 8.238413051538747e-06, "loss": 2.384, "mean_token_accuracy": 0.4917832405269591, "step": 9502 }, { "epoch": 1.7617723396366332, "grad_norm": 15.53125, "learning_rate": 8.238227660363366e-06, "loss": 2.6349, "mean_token_accuracy": 0.45594405594405596, "step": 9503 }, { "epoch": 1.7619577308120133, "grad_norm": 6.875, "learning_rate": 8.238042269187987e-06, "loss": 2.9421, "mean_token_accuracy": 0.49014925373134327, "step": 9504 }, { "epoch": 1.7621431219873935, "grad_norm": 9.0859375, "learning_rate": 8.237856878012607e-06, "loss": 2.8547, "mean_token_accuracy": 0.48193787981093855, "step": 9505 }, { "epoch": 1.7623285131627735, "grad_norm": 13.609375, "learning_rate": 8.237671486837228e-06, "loss": 2.4167, "mean_token_accuracy": 0.5197938627313188, "step": 9506 }, { "epoch": 1.7625139043381535, "grad_norm": 7.91015625, "learning_rate": 8.237486095661848e-06, "loss": 3.4779, "mean_token_accuracy": 0.4281540989592843, "step": 9507 }, { "epoch": 1.7626992955135337, "grad_norm": 8.96875, "learning_rate": 8.237300704486467e-06, "loss": 2.8377, "mean_token_accuracy": 0.49267217630853993, "step": 9508 }, { "epoch": 1.7628846866889136, "grad_norm": 11.1328125, "learning_rate": 8.237115313311088e-06, "loss": 2.9774, "mean_token_accuracy": 0.4681141439205955, "step": 9509 }, { "epoch": 1.7630700778642936, "grad_norm": 7.9375, "learning_rate": 8.236929922135706e-06, "loss": 2.6403, "mean_token_accuracy": 0.48046875, "step": 9510 }, { "epoch": 1.7632554690396738, "grad_norm": 6.5859375, "learning_rate": 8.236744530960327e-06, "loss": 2.598, "mean_token_accuracy": 0.472168410176934, "step": 9511 }, { "epoch": 1.7634408602150538, "grad_norm": 8.96875, "learning_rate": 8.236559139784947e-06, "loss": 2.3104, "mean_token_accuracy": 0.5360123647604328, "step": 9512 }, { "epoch": 1.7636262513904337, "grad_norm": 7.19921875, "learning_rate": 8.236373748609568e-06, "loss": 2.7339, "mean_token_accuracy": 0.46906127376999696, "step": 9513 }, { "epoch": 1.763811642565814, "grad_norm": 8.8671875, "learning_rate": 8.236188357434187e-06, "loss": 2.9717, "mean_token_accuracy": 0.4657777777777778, "step": 9514 }, { "epoch": 1.7639970337411939, "grad_norm": 6.45703125, "learning_rate": 8.236002966258807e-06, "loss": 3.4706, "mean_token_accuracy": 0.4333295049959802, "step": 9515 }, { "epoch": 1.7641824249165738, "grad_norm": 4.7890625, "learning_rate": 8.235817575083428e-06, "loss": 2.8686, "mean_token_accuracy": 0.47574626865671643, "step": 9516 }, { "epoch": 1.764367816091954, "grad_norm": 8.21875, "learning_rate": 8.235632183908046e-06, "loss": 2.9334, "mean_token_accuracy": 0.48323119777158774, "step": 9517 }, { "epoch": 1.7645532072673342, "grad_norm": 11.4140625, "learning_rate": 8.235446792732667e-06, "loss": 2.8059, "mean_token_accuracy": 0.4714636987187783, "step": 9518 }, { "epoch": 1.764738598442714, "grad_norm": 7.55859375, "learning_rate": 8.235261401557286e-06, "loss": 2.6959, "mean_token_accuracy": 0.5063379300738264, "step": 9519 }, { "epoch": 1.7649239896180942, "grad_norm": 7.25390625, "learning_rate": 8.235076010381906e-06, "loss": 2.9252, "mean_token_accuracy": 0.4820542412002308, "step": 9520 }, { "epoch": 1.7651093807934743, "grad_norm": 7.46875, "learning_rate": 8.234890619206527e-06, "loss": 2.8497, "mean_token_accuracy": 0.46867454568560174, "step": 9521 }, { "epoch": 1.7652947719688543, "grad_norm": 8.5390625, "learning_rate": 8.234705228031147e-06, "loss": 2.7107, "mean_token_accuracy": 0.5033872377622378, "step": 9522 }, { "epoch": 1.7654801631442343, "grad_norm": 6.109375, "learning_rate": 8.234519836855766e-06, "loss": 2.8375, "mean_token_accuracy": 0.4396047328045768, "step": 9523 }, { "epoch": 1.7656655543196145, "grad_norm": 6.58203125, "learning_rate": 8.234334445680386e-06, "loss": 3.5387, "mean_token_accuracy": 0.4346833130328867, "step": 9524 }, { "epoch": 1.7658509454949944, "grad_norm": 7.046875, "learning_rate": 8.234149054505007e-06, "loss": 3.9087, "mean_token_accuracy": 0.3810373012334671, "step": 9525 }, { "epoch": 1.7660363366703744, "grad_norm": 10.9375, "learning_rate": 8.233963663329626e-06, "loss": 3.6271, "mean_token_accuracy": 0.42084251101321585, "step": 9526 }, { "epoch": 1.7662217278457546, "grad_norm": 7.734375, "learning_rate": 8.233778272154246e-06, "loss": 3.0169, "mean_token_accuracy": 0.46674462797836574, "step": 9527 }, { "epoch": 1.7664071190211346, "grad_norm": 11.796875, "learning_rate": 8.233592880978865e-06, "loss": 2.2579, "mean_token_accuracy": 0.5234688536032106, "step": 9528 }, { "epoch": 1.7665925101965145, "grad_norm": 7.7421875, "learning_rate": 8.233407489803485e-06, "loss": 3.0129, "mean_token_accuracy": 0.46468535675610634, "step": 9529 }, { "epoch": 1.7667779013718947, "grad_norm": 9.0390625, "learning_rate": 8.233222098628106e-06, "loss": 2.8426, "mean_token_accuracy": 0.48530549110595517, "step": 9530 }, { "epoch": 1.766963292547275, "grad_norm": 10.0078125, "learning_rate": 8.233036707452726e-06, "loss": 2.7707, "mean_token_accuracy": 0.47758152173913043, "step": 9531 }, { "epoch": 1.7671486837226547, "grad_norm": 7.3359375, "learning_rate": 8.232851316277345e-06, "loss": 2.7291, "mean_token_accuracy": 0.4583999016836672, "step": 9532 }, { "epoch": 1.7673340748980348, "grad_norm": 6.9609375, "learning_rate": 8.232665925101966e-06, "loss": 2.7015, "mean_token_accuracy": 0.5235050770966528, "step": 9533 }, { "epoch": 1.767519466073415, "grad_norm": 7.26171875, "learning_rate": 8.232480533926586e-06, "loss": 2.7186, "mean_token_accuracy": 0.47739754964089565, "step": 9534 }, { "epoch": 1.767704857248795, "grad_norm": 6.3125, "learning_rate": 8.232295142751205e-06, "loss": 2.5374, "mean_token_accuracy": 0.5085859340002986, "step": 9535 }, { "epoch": 1.767890248424175, "grad_norm": 5.98046875, "learning_rate": 8.232109751575826e-06, "loss": 2.5203, "mean_token_accuracy": 0.5136948781155848, "step": 9536 }, { "epoch": 1.7680756395995552, "grad_norm": 6.2890625, "learning_rate": 8.231924360400444e-06, "loss": 2.2888, "mean_token_accuracy": 0.5246317927882174, "step": 9537 }, { "epoch": 1.7682610307749351, "grad_norm": 8.796875, "learning_rate": 8.231738969225067e-06, "loss": 2.433, "mean_token_accuracy": 0.510969387755102, "step": 9538 }, { "epoch": 1.768446421950315, "grad_norm": 10.8125, "learning_rate": 8.231553578049685e-06, "loss": 3.1167, "mean_token_accuracy": 0.4309111880046136, "step": 9539 }, { "epoch": 1.7686318131256953, "grad_norm": 8.046875, "learning_rate": 8.231368186874306e-06, "loss": 2.2262, "mean_token_accuracy": 0.5443245778611632, "step": 9540 }, { "epoch": 1.7688172043010753, "grad_norm": 12.8203125, "learning_rate": 8.231182795698925e-06, "loss": 2.8502, "mean_token_accuracy": 0.4690436039508552, "step": 9541 }, { "epoch": 1.7690025954764552, "grad_norm": 6.69921875, "learning_rate": 8.230997404523545e-06, "loss": 2.8884, "mean_token_accuracy": 0.46958153914099593, "step": 9542 }, { "epoch": 1.7691879866518354, "grad_norm": 5.73828125, "learning_rate": 8.230812013348166e-06, "loss": 3.0683, "mean_token_accuracy": 0.4351640427833193, "step": 9543 }, { "epoch": 1.7693733778272154, "grad_norm": 7.00390625, "learning_rate": 8.230626622172784e-06, "loss": 2.9766, "mean_token_accuracy": 0.4453111457791645, "step": 9544 }, { "epoch": 1.7695587690025953, "grad_norm": 6.12890625, "learning_rate": 8.230441230997405e-06, "loss": 3.0401, "mean_token_accuracy": 0.462758219377364, "step": 9545 }, { "epoch": 1.7697441601779755, "grad_norm": 6.2109375, "learning_rate": 8.230255839822025e-06, "loss": 2.9325, "mean_token_accuracy": 0.47351970828268797, "step": 9546 }, { "epoch": 1.7699295513533557, "grad_norm": 6.5625, "learning_rate": 8.230070448646646e-06, "loss": 2.8599, "mean_token_accuracy": 0.47406434668417596, "step": 9547 }, { "epoch": 1.7701149425287355, "grad_norm": 6.01953125, "learning_rate": 8.229885057471265e-06, "loss": 2.5419, "mean_token_accuracy": 0.48299136069114473, "step": 9548 }, { "epoch": 1.7703003337041157, "grad_norm": 7.16015625, "learning_rate": 8.229699666295885e-06, "loss": 2.8444, "mean_token_accuracy": 0.45300772936036743, "step": 9549 }, { "epoch": 1.7704857248794958, "grad_norm": 6.58984375, "learning_rate": 8.229514275120506e-06, "loss": 2.3771, "mean_token_accuracy": 0.502724358974359, "step": 9550 }, { "epoch": 1.7706711160548758, "grad_norm": 6.1953125, "learning_rate": 8.229328883945124e-06, "loss": 3.0421, "mean_token_accuracy": 0.4472931075602194, "step": 9551 }, { "epoch": 1.7708565072302558, "grad_norm": 6.1796875, "learning_rate": 8.229143492769745e-06, "loss": 2.7132, "mean_token_accuracy": 0.4877537511032657, "step": 9552 }, { "epoch": 1.771041898405636, "grad_norm": 6.22265625, "learning_rate": 8.228958101594364e-06, "loss": 2.8283, "mean_token_accuracy": 0.469858857670492, "step": 9553 }, { "epoch": 1.771227289581016, "grad_norm": 7.6875, "learning_rate": 8.228772710418986e-06, "loss": 2.8835, "mean_token_accuracy": 0.45665992487720314, "step": 9554 }, { "epoch": 1.771412680756396, "grad_norm": 9.5703125, "learning_rate": 8.228587319243605e-06, "loss": 2.4835, "mean_token_accuracy": 0.495383767396996, "step": 9555 }, { "epoch": 1.771598071931776, "grad_norm": 5.0625, "learning_rate": 8.228401928068225e-06, "loss": 2.6586, "mean_token_accuracy": 0.48697999364877737, "step": 9556 }, { "epoch": 1.771783463107156, "grad_norm": 7.97265625, "learning_rate": 8.228216536892844e-06, "loss": 2.7101, "mean_token_accuracy": 0.48128165602290246, "step": 9557 }, { "epoch": 1.771968854282536, "grad_norm": 6.1796875, "learning_rate": 8.228031145717464e-06, "loss": 3.0213, "mean_token_accuracy": 0.43493975903614457, "step": 9558 }, { "epoch": 1.7721542454579162, "grad_norm": 5.421875, "learning_rate": 8.227845754542085e-06, "loss": 3.0755, "mean_token_accuracy": 0.4640784212562879, "step": 9559 }, { "epoch": 1.7723396366332964, "grad_norm": 6.7890625, "learning_rate": 8.227660363366704e-06, "loss": 2.4714, "mean_token_accuracy": 0.5230096640589047, "step": 9560 }, { "epoch": 1.7725250278086762, "grad_norm": 6.3125, "learning_rate": 8.227474972191324e-06, "loss": 2.8238, "mean_token_accuracy": 0.4774542327067207, "step": 9561 }, { "epoch": 1.7727104189840563, "grad_norm": 6.16796875, "learning_rate": 8.227289581015945e-06, "loss": 2.2713, "mean_token_accuracy": 0.519268451992162, "step": 9562 }, { "epoch": 1.7728958101594365, "grad_norm": 7.1171875, "learning_rate": 8.227104189840565e-06, "loss": 2.9995, "mean_token_accuracy": 0.44350226779727864, "step": 9563 }, { "epoch": 1.7730812013348165, "grad_norm": 6.18359375, "learning_rate": 8.226918798665184e-06, "loss": 3.3824, "mean_token_accuracy": 0.457194303486659, "step": 9564 }, { "epoch": 1.7732665925101965, "grad_norm": 6.21484375, "learning_rate": 8.226733407489805e-06, "loss": 2.9569, "mean_token_accuracy": 0.457089552238806, "step": 9565 }, { "epoch": 1.7734519836855767, "grad_norm": 5.484375, "learning_rate": 8.226548016314423e-06, "loss": 2.7289, "mean_token_accuracy": 0.4761834319526627, "step": 9566 }, { "epoch": 1.7736373748609566, "grad_norm": 5.5703125, "learning_rate": 8.226362625139044e-06, "loss": 2.903, "mean_token_accuracy": 0.4675893235418726, "step": 9567 }, { "epoch": 1.7738227660363366, "grad_norm": 6.5703125, "learning_rate": 8.226177233963664e-06, "loss": 3.0034, "mean_token_accuracy": 0.4648783814503296, "step": 9568 }, { "epoch": 1.7740081572117168, "grad_norm": 5.6875, "learning_rate": 8.225991842788283e-06, "loss": 3.351, "mean_token_accuracy": 0.4181791569086651, "step": 9569 }, { "epoch": 1.7741935483870968, "grad_norm": 5.5, "learning_rate": 8.225806451612904e-06, "loss": 2.8396, "mean_token_accuracy": 0.465905383360522, "step": 9570 }, { "epoch": 1.7743789395624767, "grad_norm": 5.83203125, "learning_rate": 8.225621060437524e-06, "loss": 2.415, "mean_token_accuracy": 0.5131720844245878, "step": 9571 }, { "epoch": 1.774564330737857, "grad_norm": 7.4140625, "learning_rate": 8.225435669262145e-06, "loss": 3.2642, "mean_token_accuracy": 0.4519869413486435, "step": 9572 }, { "epoch": 1.7747497219132369, "grad_norm": 6.9921875, "learning_rate": 8.225250278086763e-06, "loss": 3.4175, "mean_token_accuracy": 0.4090725535065285, "step": 9573 }, { "epoch": 1.7749351130886168, "grad_norm": 6.75390625, "learning_rate": 8.225064886911384e-06, "loss": 2.4711, "mean_token_accuracy": 0.5050037341299477, "step": 9574 }, { "epoch": 1.775120504263997, "grad_norm": 6.890625, "learning_rate": 8.224879495736003e-06, "loss": 3.3154, "mean_token_accuracy": 0.44387755102040816, "step": 9575 }, { "epoch": 1.7753058954393772, "grad_norm": 5.3203125, "learning_rate": 8.224694104560623e-06, "loss": 2.7816, "mean_token_accuracy": 0.4677017723743958, "step": 9576 }, { "epoch": 1.7754912866147572, "grad_norm": 6.27734375, "learning_rate": 8.224508713385244e-06, "loss": 3.3314, "mean_token_accuracy": 0.4208096590909091, "step": 9577 }, { "epoch": 1.7756766777901372, "grad_norm": 5.44140625, "learning_rate": 8.224323322209864e-06, "loss": 3.0081, "mean_token_accuracy": 0.44363310274148093, "step": 9578 }, { "epoch": 1.7758620689655173, "grad_norm": 7.30859375, "learning_rate": 8.224137931034483e-06, "loss": 2.7279, "mean_token_accuracy": 0.47324169224609636, "step": 9579 }, { "epoch": 1.7760474601408973, "grad_norm": 5.89453125, "learning_rate": 8.223952539859103e-06, "loss": 4.0705, "mean_token_accuracy": 0.39968445963712856, "step": 9580 }, { "epoch": 1.7762328513162773, "grad_norm": 7.27734375, "learning_rate": 8.223767148683724e-06, "loss": 2.4423, "mean_token_accuracy": 0.5002010993430754, "step": 9581 }, { "epoch": 1.7764182424916575, "grad_norm": 6.3515625, "learning_rate": 8.223581757508343e-06, "loss": 2.4231, "mean_token_accuracy": 0.5058267716535433, "step": 9582 }, { "epoch": 1.7766036336670374, "grad_norm": 6.09765625, "learning_rate": 8.223396366332963e-06, "loss": 2.9354, "mean_token_accuracy": 0.47269279393173197, "step": 9583 }, { "epoch": 1.7767890248424174, "grad_norm": 5.265625, "learning_rate": 8.223210975157582e-06, "loss": 2.7054, "mean_token_accuracy": 0.4783329648463409, "step": 9584 }, { "epoch": 1.7769744160177976, "grad_norm": 5.5859375, "learning_rate": 8.223025583982202e-06, "loss": 3.2465, "mean_token_accuracy": 0.4434366339128244, "step": 9585 }, { "epoch": 1.7771598071931776, "grad_norm": 7.39453125, "learning_rate": 8.222840192806823e-06, "loss": 3.0398, "mean_token_accuracy": 0.44512443900448795, "step": 9586 }, { "epoch": 1.7773451983685575, "grad_norm": 6.125, "learning_rate": 8.222654801631443e-06, "loss": 2.4514, "mean_token_accuracy": 0.47677419354838707, "step": 9587 }, { "epoch": 1.7775305895439377, "grad_norm": 7.1328125, "learning_rate": 8.222469410456064e-06, "loss": 2.4678, "mean_token_accuracy": 0.5025551371705218, "step": 9588 }, { "epoch": 1.777715980719318, "grad_norm": 7.953125, "learning_rate": 8.222284019280683e-06, "loss": 3.0658, "mean_token_accuracy": 0.45177728063634104, "step": 9589 }, { "epoch": 1.7779013718946977, "grad_norm": 7.08203125, "learning_rate": 8.222098628105303e-06, "loss": 3.5725, "mean_token_accuracy": 0.42514670995130477, "step": 9590 }, { "epoch": 1.7780867630700778, "grad_norm": 7.76171875, "learning_rate": 8.221913236929922e-06, "loss": 2.6558, "mean_token_accuracy": 0.4767092829349639, "step": 9591 }, { "epoch": 1.778272154245458, "grad_norm": 7.91015625, "learning_rate": 8.221727845754543e-06, "loss": 3.0104, "mean_token_accuracy": 0.47475157168931253, "step": 9592 }, { "epoch": 1.778457545420838, "grad_norm": 7.23828125, "learning_rate": 8.221542454579163e-06, "loss": 2.2635, "mean_token_accuracy": 0.5650228576986674, "step": 9593 }, { "epoch": 1.778642936596218, "grad_norm": 7.6640625, "learning_rate": 8.221357063403784e-06, "loss": 2.8007, "mean_token_accuracy": 0.4931309904153355, "step": 9594 }, { "epoch": 1.7788283277715982, "grad_norm": 6.625, "learning_rate": 8.221171672228402e-06, "loss": 2.4612, "mean_token_accuracy": 0.4998455677957377, "step": 9595 }, { "epoch": 1.7790137189469781, "grad_norm": 7.24609375, "learning_rate": 8.220986281053023e-06, "loss": 2.1666, "mean_token_accuracy": 0.5443766937669376, "step": 9596 }, { "epoch": 1.779199110122358, "grad_norm": 5.3359375, "learning_rate": 8.220800889877643e-06, "loss": 2.626, "mean_token_accuracy": 0.4946322521699406, "step": 9597 }, { "epoch": 1.7793845012977383, "grad_norm": 6.02734375, "learning_rate": 8.220615498702262e-06, "loss": 2.9786, "mean_token_accuracy": 0.4478200618270973, "step": 9598 }, { "epoch": 1.7795698924731183, "grad_norm": 5.5859375, "learning_rate": 8.220430107526883e-06, "loss": 2.8726, "mean_token_accuracy": 0.4552467609534171, "step": 9599 }, { "epoch": 1.7797552836484982, "grad_norm": 5.484375, "learning_rate": 8.220244716351501e-06, "loss": 3.4032, "mean_token_accuracy": 0.43150248502612465, "step": 9600 }, { "epoch": 1.7799406748238784, "grad_norm": 5.06640625, "learning_rate": 8.220059325176122e-06, "loss": 3.0623, "mean_token_accuracy": 0.4571986811576505, "step": 9601 }, { "epoch": 1.7801260659992586, "grad_norm": 9.0234375, "learning_rate": 8.219873934000742e-06, "loss": 2.7846, "mean_token_accuracy": 0.46526131969564677, "step": 9602 }, { "epoch": 1.7803114571746383, "grad_norm": 6.58984375, "learning_rate": 8.219688542825363e-06, "loss": 2.8147, "mean_token_accuracy": 0.4742242242242242, "step": 9603 }, { "epoch": 1.7804968483500185, "grad_norm": 9.328125, "learning_rate": 8.219503151649982e-06, "loss": 2.6249, "mean_token_accuracy": 0.49377299745258985, "step": 9604 }, { "epoch": 1.7806822395253987, "grad_norm": 7.05859375, "learning_rate": 8.219317760474602e-06, "loss": 2.7874, "mean_token_accuracy": 0.4858474858474858, "step": 9605 }, { "epoch": 1.7808676307007787, "grad_norm": 7.86328125, "learning_rate": 8.219132369299223e-06, "loss": 2.3155, "mean_token_accuracy": 0.5153146100401214, "step": 9606 }, { "epoch": 1.7810530218761587, "grad_norm": 8.7578125, "learning_rate": 8.218946978123841e-06, "loss": 3.0138, "mean_token_accuracy": 0.4533134259790973, "step": 9607 }, { "epoch": 1.7812384130515388, "grad_norm": 6.40234375, "learning_rate": 8.218761586948462e-06, "loss": 3.2122, "mean_token_accuracy": 0.43982494529540483, "step": 9608 }, { "epoch": 1.7814238042269188, "grad_norm": 8.03125, "learning_rate": 8.21857619577308e-06, "loss": 2.9155, "mean_token_accuracy": 0.45991892245324967, "step": 9609 }, { "epoch": 1.7816091954022988, "grad_norm": 6.79296875, "learning_rate": 8.218390804597703e-06, "loss": 2.9605, "mean_token_accuracy": 0.4465611083621969, "step": 9610 }, { "epoch": 1.781794586577679, "grad_norm": 6.73828125, "learning_rate": 8.218205413422322e-06, "loss": 2.9695, "mean_token_accuracy": 0.45112521638776687, "step": 9611 }, { "epoch": 1.781979977753059, "grad_norm": 6.8671875, "learning_rate": 8.218020022246942e-06, "loss": 2.9868, "mean_token_accuracy": 0.4624202162302983, "step": 9612 }, { "epoch": 1.782165368928439, "grad_norm": 8.0703125, "learning_rate": 8.217834631071561e-06, "loss": 2.6783, "mean_token_accuracy": 0.4740061162079511, "step": 9613 }, { "epoch": 1.782350760103819, "grad_norm": 9.1171875, "learning_rate": 8.217649239896181e-06, "loss": 2.6579, "mean_token_accuracy": 0.47734587769249814, "step": 9614 }, { "epoch": 1.782536151279199, "grad_norm": 6.51953125, "learning_rate": 8.217463848720802e-06, "loss": 3.014, "mean_token_accuracy": 0.45768025078369906, "step": 9615 }, { "epoch": 1.782721542454579, "grad_norm": 7.5234375, "learning_rate": 8.21727845754542e-06, "loss": 2.3631, "mean_token_accuracy": 0.5174494455316373, "step": 9616 }, { "epoch": 1.7829069336299592, "grad_norm": 6.375, "learning_rate": 8.217093066370041e-06, "loss": 3.3574, "mean_token_accuracy": 0.43539923415118426, "step": 9617 }, { "epoch": 1.7830923248053394, "grad_norm": 6.34375, "learning_rate": 8.216907675194662e-06, "loss": 2.7432, "mean_token_accuracy": 0.5021739130434782, "step": 9618 }, { "epoch": 1.7832777159807192, "grad_norm": 7.01171875, "learning_rate": 8.216722284019282e-06, "loss": 3.1228, "mean_token_accuracy": 0.4425440940673437, "step": 9619 }, { "epoch": 1.7834631071560993, "grad_norm": 5.9765625, "learning_rate": 8.216536892843901e-06, "loss": 2.9486, "mean_token_accuracy": 0.47592620368981553, "step": 9620 }, { "epoch": 1.7836484983314795, "grad_norm": 5.93359375, "learning_rate": 8.216351501668522e-06, "loss": 3.1292, "mean_token_accuracy": 0.4308355345474767, "step": 9621 }, { "epoch": 1.7838338895068595, "grad_norm": 6.48828125, "learning_rate": 8.21616611049314e-06, "loss": 2.4951, "mean_token_accuracy": 0.493866424352567, "step": 9622 }, { "epoch": 1.7840192806822395, "grad_norm": 6.7109375, "learning_rate": 8.21598071931776e-06, "loss": 2.5534, "mean_token_accuracy": 0.4870761204647854, "step": 9623 }, { "epoch": 1.7842046718576197, "grad_norm": 5.5859375, "learning_rate": 8.215795328142381e-06, "loss": 2.7453, "mean_token_accuracy": 0.4646808510638298, "step": 9624 }, { "epoch": 1.7843900630329996, "grad_norm": 6.4921875, "learning_rate": 8.215609936967e-06, "loss": 2.8947, "mean_token_accuracy": 0.45857359635811834, "step": 9625 }, { "epoch": 1.7845754542083796, "grad_norm": 9.5078125, "learning_rate": 8.215424545791622e-06, "loss": 3.2034, "mean_token_accuracy": 0.4329777365491651, "step": 9626 }, { "epoch": 1.7847608453837598, "grad_norm": 7.734375, "learning_rate": 8.215239154616241e-06, "loss": 2.5492, "mean_token_accuracy": 0.49579045837231056, "step": 9627 }, { "epoch": 1.7849462365591398, "grad_norm": 6.27734375, "learning_rate": 8.215053763440862e-06, "loss": 2.6523, "mean_token_accuracy": 0.4826128460384563, "step": 9628 }, { "epoch": 1.7851316277345197, "grad_norm": 9.65625, "learning_rate": 8.21486837226548e-06, "loss": 2.8742, "mean_token_accuracy": 0.4262508122157245, "step": 9629 }, { "epoch": 1.7853170189099, "grad_norm": 5.48828125, "learning_rate": 8.214682981090101e-06, "loss": 2.7872, "mean_token_accuracy": 0.4583948793697686, "step": 9630 }, { "epoch": 1.78550241008528, "grad_norm": 5.7109375, "learning_rate": 8.214497589914721e-06, "loss": 3.2122, "mean_token_accuracy": 0.43611793611793614, "step": 9631 }, { "epoch": 1.7856878012606598, "grad_norm": 6.4765625, "learning_rate": 8.21431219873934e-06, "loss": 2.197, "mean_token_accuracy": 0.5417288641606731, "step": 9632 }, { "epoch": 1.78587319243604, "grad_norm": 6.23046875, "learning_rate": 8.21412680756396e-06, "loss": 2.8691, "mean_token_accuracy": 0.46356519120746764, "step": 9633 }, { "epoch": 1.7860585836114202, "grad_norm": 7.52734375, "learning_rate": 8.213941416388581e-06, "loss": 2.8759, "mean_token_accuracy": 0.48735163649442514, "step": 9634 }, { "epoch": 1.7862439747868002, "grad_norm": 5.2578125, "learning_rate": 8.213756025213202e-06, "loss": 3.1746, "mean_token_accuracy": 0.4302978515625, "step": 9635 }, { "epoch": 1.7864293659621802, "grad_norm": 6.18359375, "learning_rate": 8.21357063403782e-06, "loss": 4.1098, "mean_token_accuracy": 0.3805104408352668, "step": 9636 }, { "epoch": 1.7866147571375603, "grad_norm": 6.921875, "learning_rate": 8.213385242862441e-06, "loss": 2.913, "mean_token_accuracy": 0.4610784837159637, "step": 9637 }, { "epoch": 1.7868001483129403, "grad_norm": 7.44140625, "learning_rate": 8.21319985168706e-06, "loss": 2.5299, "mean_token_accuracy": 0.4878655880522713, "step": 9638 }, { "epoch": 1.7869855394883203, "grad_norm": 5.7890625, "learning_rate": 8.21301446051168e-06, "loss": 3.5456, "mean_token_accuracy": 0.41764783701547825, "step": 9639 }, { "epoch": 1.7871709306637005, "grad_norm": 7.68359375, "learning_rate": 8.2128290693363e-06, "loss": 3.226, "mean_token_accuracy": 0.4437374413931681, "step": 9640 }, { "epoch": 1.7873563218390804, "grad_norm": 7.58203125, "learning_rate": 8.21264367816092e-06, "loss": 2.8886, "mean_token_accuracy": 0.4631212053069485, "step": 9641 }, { "epoch": 1.7875417130144604, "grad_norm": 7.4921875, "learning_rate": 8.21245828698554e-06, "loss": 3.1694, "mean_token_accuracy": 0.4325104662913238, "step": 9642 }, { "epoch": 1.7877271041898406, "grad_norm": 6.7734375, "learning_rate": 8.21227289581016e-06, "loss": 3.1785, "mean_token_accuracy": 0.44219549946422193, "step": 9643 }, { "epoch": 1.7879124953652206, "grad_norm": 6.328125, "learning_rate": 8.212087504634781e-06, "loss": 3.1936, "mean_token_accuracy": 0.4452657897597565, "step": 9644 }, { "epoch": 1.7880978865406005, "grad_norm": 8.1328125, "learning_rate": 8.2119021134594e-06, "loss": 2.6013, "mean_token_accuracy": 0.48523636363636363, "step": 9645 }, { "epoch": 1.7882832777159807, "grad_norm": 8.921875, "learning_rate": 8.21171672228402e-06, "loss": 3.0429, "mean_token_accuracy": 0.462721110927426, "step": 9646 }, { "epoch": 1.788468668891361, "grad_norm": 6.37890625, "learning_rate": 8.211531331108639e-06, "loss": 2.6957, "mean_token_accuracy": 0.4734561213434453, "step": 9647 }, { "epoch": 1.7886540600667407, "grad_norm": 6.1328125, "learning_rate": 8.21134593993326e-06, "loss": 3.2295, "mean_token_accuracy": 0.42920110192837463, "step": 9648 }, { "epoch": 1.7888394512421208, "grad_norm": 11.2109375, "learning_rate": 8.21116054875788e-06, "loss": 2.49, "mean_token_accuracy": 0.4917073170731707, "step": 9649 }, { "epoch": 1.789024842417501, "grad_norm": 6.3984375, "learning_rate": 8.210975157582499e-06, "loss": 2.9582, "mean_token_accuracy": 0.44768289128533956, "step": 9650 }, { "epoch": 1.789210233592881, "grad_norm": 7.12109375, "learning_rate": 8.21078976640712e-06, "loss": 3.4092, "mean_token_accuracy": 0.44888435175732827, "step": 9651 }, { "epoch": 1.789395624768261, "grad_norm": 6.74609375, "learning_rate": 8.21060437523174e-06, "loss": 3.0891, "mean_token_accuracy": 0.44674428633031477, "step": 9652 }, { "epoch": 1.7895810159436412, "grad_norm": 8.6171875, "learning_rate": 8.21041898405636e-06, "loss": 2.785, "mean_token_accuracy": 0.48672566371681414, "step": 9653 }, { "epoch": 1.7897664071190211, "grad_norm": 8.1328125, "learning_rate": 8.210233592880979e-06, "loss": 2.7493, "mean_token_accuracy": 0.4682337139019476, "step": 9654 }, { "epoch": 1.789951798294401, "grad_norm": 7.171875, "learning_rate": 8.2100482017056e-06, "loss": 3.4636, "mean_token_accuracy": 0.40964840556009813, "step": 9655 }, { "epoch": 1.7901371894697813, "grad_norm": 5.58984375, "learning_rate": 8.209862810530218e-06, "loss": 3.2928, "mean_token_accuracy": 0.42924393723252496, "step": 9656 }, { "epoch": 1.7903225806451613, "grad_norm": 5.51171875, "learning_rate": 8.209677419354839e-06, "loss": 2.8689, "mean_token_accuracy": 0.46484708175947365, "step": 9657 }, { "epoch": 1.7905079718205412, "grad_norm": 9.1484375, "learning_rate": 8.20949202817946e-06, "loss": 3.4795, "mean_token_accuracy": 0.40644187216909916, "step": 9658 }, { "epoch": 1.7906933629959214, "grad_norm": 7.0234375, "learning_rate": 8.20930663700408e-06, "loss": 2.6107, "mean_token_accuracy": 0.48478071810162754, "step": 9659 }, { "epoch": 1.7908787541713016, "grad_norm": 8.546875, "learning_rate": 8.209121245828699e-06, "loss": 2.3993, "mean_token_accuracy": 0.5128870157237801, "step": 9660 }, { "epoch": 1.7910641453466813, "grad_norm": 7.68359375, "learning_rate": 8.20893585465332e-06, "loss": 2.9357, "mean_token_accuracy": 0.48224023581429626, "step": 9661 }, { "epoch": 1.7912495365220615, "grad_norm": 6.59765625, "learning_rate": 8.20875046347794e-06, "loss": 2.6467, "mean_token_accuracy": 0.5277408437084492, "step": 9662 }, { "epoch": 1.7914349276974417, "grad_norm": 6.9765625, "learning_rate": 8.208565072302558e-06, "loss": 2.6732, "mean_token_accuracy": 0.47769355039145023, "step": 9663 }, { "epoch": 1.7916203188728217, "grad_norm": 5.86328125, "learning_rate": 8.208379681127179e-06, "loss": 3.4767, "mean_token_accuracy": 0.427953689496443, "step": 9664 }, { "epoch": 1.7918057100482017, "grad_norm": 6.1484375, "learning_rate": 8.208194289951798e-06, "loss": 2.6623, "mean_token_accuracy": 0.4913562895291637, "step": 9665 }, { "epoch": 1.7919911012235819, "grad_norm": 6.0625, "learning_rate": 8.208008898776418e-06, "loss": 3.2077, "mean_token_accuracy": 0.4350403034613561, "step": 9666 }, { "epoch": 1.7921764923989618, "grad_norm": 5.66796875, "learning_rate": 8.207823507601039e-06, "loss": 2.9873, "mean_token_accuracy": 0.4515014615997874, "step": 9667 }, { "epoch": 1.7923618835743418, "grad_norm": 8.0390625, "learning_rate": 8.20763811642566e-06, "loss": 2.9095, "mean_token_accuracy": 0.4779992478375329, "step": 9668 }, { "epoch": 1.792547274749722, "grad_norm": 8.109375, "learning_rate": 8.20745272525028e-06, "loss": 2.78, "mean_token_accuracy": 0.4652019650655022, "step": 9669 }, { "epoch": 1.792732665925102, "grad_norm": 5.765625, "learning_rate": 8.207267334074899e-06, "loss": 2.6805, "mean_token_accuracy": 0.46454802259887007, "step": 9670 }, { "epoch": 1.792918057100482, "grad_norm": 6.828125, "learning_rate": 8.207081942899519e-06, "loss": 3.4574, "mean_token_accuracy": 0.4247166756610901, "step": 9671 }, { "epoch": 1.793103448275862, "grad_norm": 10.3359375, "learning_rate": 8.206896551724138e-06, "loss": 3.2071, "mean_token_accuracy": 0.4386710239651416, "step": 9672 }, { "epoch": 1.793288839451242, "grad_norm": 7.546875, "learning_rate": 8.206711160548758e-06, "loss": 2.956, "mean_token_accuracy": 0.4792671166827387, "step": 9673 }, { "epoch": 1.793474230626622, "grad_norm": 5.8671875, "learning_rate": 8.206525769373379e-06, "loss": 2.8832, "mean_token_accuracy": 0.4693033261625021, "step": 9674 }, { "epoch": 1.7936596218020022, "grad_norm": 9.078125, "learning_rate": 8.206340378198e-06, "loss": 2.4613, "mean_token_accuracy": 0.48221767514372016, "step": 9675 }, { "epoch": 1.7938450129773824, "grad_norm": 9.9375, "learning_rate": 8.206154987022618e-06, "loss": 2.3304, "mean_token_accuracy": 0.517806111233238, "step": 9676 }, { "epoch": 1.7940304041527624, "grad_norm": 7.90234375, "learning_rate": 8.205969595847239e-06, "loss": 2.8567, "mean_token_accuracy": 0.4871621621621622, "step": 9677 }, { "epoch": 1.7942157953281423, "grad_norm": 9.390625, "learning_rate": 8.205784204671859e-06, "loss": 3.6323, "mean_token_accuracy": 0.41097456416118894, "step": 9678 }, { "epoch": 1.7944011865035225, "grad_norm": 8.4921875, "learning_rate": 8.205598813496478e-06, "loss": 3.6311, "mean_token_accuracy": 0.3952914798206278, "step": 9679 }, { "epoch": 1.7945865776789025, "grad_norm": 10.09375, "learning_rate": 8.205413422321098e-06, "loss": 2.6286, "mean_token_accuracy": 0.46046301864101025, "step": 9680 }, { "epoch": 1.7947719688542825, "grad_norm": 6.74609375, "learning_rate": 8.205228031145717e-06, "loss": 2.9353, "mean_token_accuracy": 0.47051133062173156, "step": 9681 }, { "epoch": 1.7949573600296627, "grad_norm": 7.703125, "learning_rate": 8.205042639970338e-06, "loss": 3.289, "mean_token_accuracy": 0.436469824789098, "step": 9682 }, { "epoch": 1.7951427512050426, "grad_norm": 12.5, "learning_rate": 8.204857248794958e-06, "loss": 2.3779, "mean_token_accuracy": 0.4870514820592824, "step": 9683 }, { "epoch": 1.7953281423804226, "grad_norm": 9.46875, "learning_rate": 8.204671857619579e-06, "loss": 2.7959, "mean_token_accuracy": 0.4654761904761905, "step": 9684 }, { "epoch": 1.7955135335558028, "grad_norm": 6.4609375, "learning_rate": 8.204486466444197e-06, "loss": 3.0577, "mean_token_accuracy": 0.4372146118721461, "step": 9685 }, { "epoch": 1.7956989247311828, "grad_norm": 7.03515625, "learning_rate": 8.204301075268818e-06, "loss": 3.3634, "mean_token_accuracy": 0.4482315112540193, "step": 9686 }, { "epoch": 1.7958843159065627, "grad_norm": 9.21875, "learning_rate": 8.204115684093438e-06, "loss": 2.9286, "mean_token_accuracy": 0.4506856540084388, "step": 9687 }, { "epoch": 1.796069707081943, "grad_norm": 6.52734375, "learning_rate": 8.203930292918057e-06, "loss": 2.6642, "mean_token_accuracy": 0.47392182487822265, "step": 9688 }, { "epoch": 1.796255098257323, "grad_norm": 7.3046875, "learning_rate": 8.203744901742678e-06, "loss": 2.7706, "mean_token_accuracy": 0.4916674339379431, "step": 9689 }, { "epoch": 1.7964404894327028, "grad_norm": 7.10546875, "learning_rate": 8.203559510567296e-06, "loss": 3.2834, "mean_token_accuracy": 0.44178847807394667, "step": 9690 }, { "epoch": 1.796625880608083, "grad_norm": 5.35546875, "learning_rate": 8.203374119391919e-06, "loss": 2.6904, "mean_token_accuracy": 0.4775310740465314, "step": 9691 }, { "epoch": 1.7968112717834632, "grad_norm": 5.125, "learning_rate": 8.203188728216537e-06, "loss": 2.6956, "mean_token_accuracy": 0.46671750735374223, "step": 9692 }, { "epoch": 1.7969966629588432, "grad_norm": 6.16796875, "learning_rate": 8.203003337041158e-06, "loss": 3.2733, "mean_token_accuracy": 0.4330246913580247, "step": 9693 }, { "epoch": 1.7971820541342232, "grad_norm": 8.9140625, "learning_rate": 8.202817945865777e-06, "loss": 3.0777, "mean_token_accuracy": 0.4467065868263473, "step": 9694 }, { "epoch": 1.7973674453096034, "grad_norm": 6.38671875, "learning_rate": 8.202632554690397e-06, "loss": 3.2288, "mean_token_accuracy": 0.43284913353720694, "step": 9695 }, { "epoch": 1.7975528364849833, "grad_norm": 6.1171875, "learning_rate": 8.202447163515018e-06, "loss": 3.0861, "mean_token_accuracy": 0.44957768108523166, "step": 9696 }, { "epoch": 1.7977382276603633, "grad_norm": 5.78515625, "learning_rate": 8.202261772339637e-06, "loss": 2.5436, "mean_token_accuracy": 0.4953366813569745, "step": 9697 }, { "epoch": 1.7979236188357435, "grad_norm": 7.46484375, "learning_rate": 8.202076381164257e-06, "loss": 2.9292, "mean_token_accuracy": 0.4704201680672269, "step": 9698 }, { "epoch": 1.7981090100111234, "grad_norm": 5.6015625, "learning_rate": 8.201890989988878e-06, "loss": 2.5926, "mean_token_accuracy": 0.4812778214192197, "step": 9699 }, { "epoch": 1.7982944011865034, "grad_norm": 5.421875, "learning_rate": 8.201705598813498e-06, "loss": 2.7337, "mean_token_accuracy": 0.4800646551724138, "step": 9700 }, { "epoch": 1.7984797923618836, "grad_norm": 7.63671875, "learning_rate": 8.201520207638117e-06, "loss": 2.6219, "mean_token_accuracy": 0.5316345736209642, "step": 9701 }, { "epoch": 1.7986651835372638, "grad_norm": 6.9375, "learning_rate": 8.201334816462737e-06, "loss": 2.0324, "mean_token_accuracy": 0.568499660556687, "step": 9702 }, { "epoch": 1.7988505747126435, "grad_norm": 7.09765625, "learning_rate": 8.201149425287356e-06, "loss": 2.771, "mean_token_accuracy": 0.4827756513149772, "step": 9703 }, { "epoch": 1.7990359658880237, "grad_norm": 7.47265625, "learning_rate": 8.200964034111977e-06, "loss": 2.7877, "mean_token_accuracy": 0.49731077003804275, "step": 9704 }, { "epoch": 1.799221357063404, "grad_norm": 7.1875, "learning_rate": 8.200778642936597e-06, "loss": 2.8058, "mean_token_accuracy": 0.47399684810280035, "step": 9705 }, { "epoch": 1.7994067482387839, "grad_norm": 6.18359375, "learning_rate": 8.200593251761216e-06, "loss": 2.9046, "mean_token_accuracy": 0.45935937789298276, "step": 9706 }, { "epoch": 1.7995921394141638, "grad_norm": 6.23828125, "learning_rate": 8.200407860585838e-06, "loss": 2.9626, "mean_token_accuracy": 0.44463559587112916, "step": 9707 }, { "epoch": 1.799777530589544, "grad_norm": 5.828125, "learning_rate": 8.200222469410457e-06, "loss": 2.2819, "mean_token_accuracy": 0.5537817028985508, "step": 9708 }, { "epoch": 1.799962921764924, "grad_norm": 5.4375, "learning_rate": 8.200037078235077e-06, "loss": 2.642, "mean_token_accuracy": 0.5057347670250896, "step": 9709 }, { "epoch": 1.800148312940304, "grad_norm": 6.09375, "learning_rate": 8.199851687059696e-06, "loss": 3.0603, "mean_token_accuracy": 0.4719835876394409, "step": 9710 }, { "epoch": 1.8003337041156842, "grad_norm": 6.38671875, "learning_rate": 8.199666295884317e-06, "loss": 2.6213, "mean_token_accuracy": 0.4742669993761697, "step": 9711 }, { "epoch": 1.8005190952910641, "grad_norm": 6.43359375, "learning_rate": 8.199480904708937e-06, "loss": 3.066, "mean_token_accuracy": 0.43764013452914796, "step": 9712 }, { "epoch": 1.800704486466444, "grad_norm": 8.265625, "learning_rate": 8.199295513533556e-06, "loss": 3.1842, "mean_token_accuracy": 0.44929006085192696, "step": 9713 }, { "epoch": 1.8008898776418243, "grad_norm": 6.28125, "learning_rate": 8.199110122358176e-06, "loss": 3.8345, "mean_token_accuracy": 0.4025541365907829, "step": 9714 }, { "epoch": 1.8010752688172043, "grad_norm": 6.25, "learning_rate": 8.198924731182797e-06, "loss": 3.0789, "mean_token_accuracy": 0.4502623773671002, "step": 9715 }, { "epoch": 1.8012606599925842, "grad_norm": 7.56640625, "learning_rate": 8.198739340007417e-06, "loss": 2.8256, "mean_token_accuracy": 0.4991372368572415, "step": 9716 }, { "epoch": 1.8014460511679644, "grad_norm": 6.33984375, "learning_rate": 8.198553948832036e-06, "loss": 2.7627, "mean_token_accuracy": 0.47356095155922595, "step": 9717 }, { "epoch": 1.8016314423433446, "grad_norm": 6.7578125, "learning_rate": 8.198368557656657e-06, "loss": 2.8067, "mean_token_accuracy": 0.4675972083748754, "step": 9718 }, { "epoch": 1.8018168335187243, "grad_norm": 6.515625, "learning_rate": 8.198183166481275e-06, "loss": 3.1668, "mean_token_accuracy": 0.46179354094579006, "step": 9719 }, { "epoch": 1.8020022246941045, "grad_norm": 6.91796875, "learning_rate": 8.197997775305896e-06, "loss": 2.7347, "mean_token_accuracy": 0.46636971046770603, "step": 9720 }, { "epoch": 1.8021876158694847, "grad_norm": 7.0859375, "learning_rate": 8.197812384130516e-06, "loss": 4.1433, "mean_token_accuracy": 0.3972676856485865, "step": 9721 }, { "epoch": 1.8023730070448647, "grad_norm": 6.19140625, "learning_rate": 8.197626992955135e-06, "loss": 3.0488, "mean_token_accuracy": 0.46537741734248284, "step": 9722 }, { "epoch": 1.8025583982202447, "grad_norm": 5.9453125, "learning_rate": 8.197441601779756e-06, "loss": 3.0955, "mean_token_accuracy": 0.4326722338204593, "step": 9723 }, { "epoch": 1.8027437893956249, "grad_norm": 5.84765625, "learning_rate": 8.197256210604376e-06, "loss": 2.6003, "mean_token_accuracy": 0.48848, "step": 9724 }, { "epoch": 1.8029291805710048, "grad_norm": 5.82421875, "learning_rate": 8.197070819428997e-06, "loss": 3.3642, "mean_token_accuracy": 0.42700548081714, "step": 9725 }, { "epoch": 1.8031145717463848, "grad_norm": 5.984375, "learning_rate": 8.196885428253616e-06, "loss": 2.1562, "mean_token_accuracy": 0.5632199485025071, "step": 9726 }, { "epoch": 1.803299962921765, "grad_norm": 6.5078125, "learning_rate": 8.196700037078236e-06, "loss": 3.561, "mean_token_accuracy": 0.4193629929221436, "step": 9727 }, { "epoch": 1.803485354097145, "grad_norm": 7.296875, "learning_rate": 8.196514645902855e-06, "loss": 2.4221, "mean_token_accuracy": 0.48274898728872745, "step": 9728 }, { "epoch": 1.803670745272525, "grad_norm": 6.9765625, "learning_rate": 8.196329254727475e-06, "loss": 2.7572, "mean_token_accuracy": 0.4813626642224259, "step": 9729 }, { "epoch": 1.803856136447905, "grad_norm": 5.28515625, "learning_rate": 8.196143863552096e-06, "loss": 2.7202, "mean_token_accuracy": 0.45993413830954993, "step": 9730 }, { "epoch": 1.8040415276232853, "grad_norm": 8.5859375, "learning_rate": 8.195958472376716e-06, "loss": 2.8889, "mean_token_accuracy": 0.4806515085264539, "step": 9731 }, { "epoch": 1.804226918798665, "grad_norm": 7.5859375, "learning_rate": 8.195773081201335e-06, "loss": 3.3232, "mean_token_accuracy": 0.4307084391758397, "step": 9732 }, { "epoch": 1.8044123099740452, "grad_norm": 7.33203125, "learning_rate": 8.195587690025956e-06, "loss": 3.5033, "mean_token_accuracy": 0.41774845711853587, "step": 9733 }, { "epoch": 1.8045977011494254, "grad_norm": 8.171875, "learning_rate": 8.195402298850576e-06, "loss": 2.5851, "mean_token_accuracy": 0.48422800221361373, "step": 9734 }, { "epoch": 1.8047830923248054, "grad_norm": 7.9765625, "learning_rate": 8.195216907675195e-06, "loss": 2.727, "mean_token_accuracy": 0.4723446452181578, "step": 9735 }, { "epoch": 1.8049684835001854, "grad_norm": 6.91015625, "learning_rate": 8.195031516499815e-06, "loss": 2.8107, "mean_token_accuracy": 0.45479302832244006, "step": 9736 }, { "epoch": 1.8051538746755655, "grad_norm": 6.1953125, "learning_rate": 8.194846125324434e-06, "loss": 2.7632, "mean_token_accuracy": 0.46462513199577615, "step": 9737 }, { "epoch": 1.8053392658509455, "grad_norm": 16.609375, "learning_rate": 8.194660734149055e-06, "loss": 2.5031, "mean_token_accuracy": 0.5150624540778839, "step": 9738 }, { "epoch": 1.8055246570263255, "grad_norm": 5.68359375, "learning_rate": 8.194475342973675e-06, "loss": 2.5784, "mean_token_accuracy": 0.4961229946524064, "step": 9739 }, { "epoch": 1.8057100482017057, "grad_norm": 5.0859375, "learning_rate": 8.194289951798296e-06, "loss": 2.6473, "mean_token_accuracy": 0.48227894036530783, "step": 9740 }, { "epoch": 1.8058954393770856, "grad_norm": 5.58984375, "learning_rate": 8.194104560622914e-06, "loss": 2.9946, "mean_token_accuracy": 0.45447750037374796, "step": 9741 }, { "epoch": 1.8060808305524656, "grad_norm": 6.65234375, "learning_rate": 8.193919169447535e-06, "loss": 2.7202, "mean_token_accuracy": 0.530852224512062, "step": 9742 }, { "epoch": 1.8062662217278458, "grad_norm": 5.60546875, "learning_rate": 8.193733778272155e-06, "loss": 2.9559, "mean_token_accuracy": 0.4725196288365453, "step": 9743 }, { "epoch": 1.8064516129032258, "grad_norm": 5.71875, "learning_rate": 8.193548387096774e-06, "loss": 2.3046, "mean_token_accuracy": 0.51862689926843, "step": 9744 }, { "epoch": 1.8066370040786057, "grad_norm": 5.953125, "learning_rate": 8.193362995921395e-06, "loss": 2.8691, "mean_token_accuracy": 0.4575927472265299, "step": 9745 }, { "epoch": 1.806822395253986, "grad_norm": 5.77734375, "learning_rate": 8.193177604746014e-06, "loss": 3.112, "mean_token_accuracy": 0.47305924412665984, "step": 9746 }, { "epoch": 1.807007786429366, "grad_norm": 6.3671875, "learning_rate": 8.192992213570636e-06, "loss": 3.0639, "mean_token_accuracy": 0.4385299503008109, "step": 9747 }, { "epoch": 1.8071931776047458, "grad_norm": 6.15234375, "learning_rate": 8.192806822395254e-06, "loss": 2.7636, "mean_token_accuracy": 0.49243172096084237, "step": 9748 }, { "epoch": 1.807378568780126, "grad_norm": 8.84375, "learning_rate": 8.192621431219875e-06, "loss": 2.265, "mean_token_accuracy": 0.5377920293174531, "step": 9749 }, { "epoch": 1.8075639599555062, "grad_norm": 6.265625, "learning_rate": 8.192436040044495e-06, "loss": 2.3752, "mean_token_accuracy": 0.5179594689028651, "step": 9750 }, { "epoch": 1.8077493511308862, "grad_norm": 5.62890625, "learning_rate": 8.192250648869114e-06, "loss": 2.874, "mean_token_accuracy": 0.4565040650406504, "step": 9751 }, { "epoch": 1.8079347423062662, "grad_norm": 6.25390625, "learning_rate": 8.192065257693735e-06, "loss": 2.7652, "mean_token_accuracy": 0.4917075759586042, "step": 9752 }, { "epoch": 1.8081201334816464, "grad_norm": 6.828125, "learning_rate": 8.191879866518354e-06, "loss": 2.113, "mean_token_accuracy": 0.5598546387345019, "step": 9753 }, { "epoch": 1.8083055246570263, "grad_norm": 10.515625, "learning_rate": 8.191694475342974e-06, "loss": 3.0867, "mean_token_accuracy": 0.4830652543481104, "step": 9754 }, { "epoch": 1.8084909158324063, "grad_norm": 8.4765625, "learning_rate": 8.191509084167595e-06, "loss": 2.483, "mean_token_accuracy": 0.4932237600922722, "step": 9755 }, { "epoch": 1.8086763070077865, "grad_norm": 6.796875, "learning_rate": 8.191323692992215e-06, "loss": 3.2837, "mean_token_accuracy": 0.4286278121299829, "step": 9756 }, { "epoch": 1.8088616981831664, "grad_norm": 7.51953125, "learning_rate": 8.191138301816834e-06, "loss": 2.9066, "mean_token_accuracy": 0.4557165861513688, "step": 9757 }, { "epoch": 1.8090470893585464, "grad_norm": 7.5, "learning_rate": 8.190952910641454e-06, "loss": 2.9211, "mean_token_accuracy": 0.4515810276679842, "step": 9758 }, { "epoch": 1.8092324805339266, "grad_norm": 6.078125, "learning_rate": 8.190767519466075e-06, "loss": 3.6522, "mean_token_accuracy": 0.4365850123206262, "step": 9759 }, { "epoch": 1.8094178717093068, "grad_norm": 5.734375, "learning_rate": 8.190582128290694e-06, "loss": 2.6315, "mean_token_accuracy": 0.4782507015902713, "step": 9760 }, { "epoch": 1.8096032628846865, "grad_norm": 8.7265625, "learning_rate": 8.190396737115314e-06, "loss": 3.0339, "mean_token_accuracy": 0.46263858093126387, "step": 9761 }, { "epoch": 1.8097886540600667, "grad_norm": 6.03125, "learning_rate": 8.190211345939933e-06, "loss": 3.0578, "mean_token_accuracy": 0.4490238611713666, "step": 9762 }, { "epoch": 1.809974045235447, "grad_norm": 6.17578125, "learning_rate": 8.190025954764555e-06, "loss": 2.5821, "mean_token_accuracy": 0.5032177703965124, "step": 9763 }, { "epoch": 1.8101594364108269, "grad_norm": 7.890625, "learning_rate": 8.189840563589174e-06, "loss": 2.7666, "mean_token_accuracy": 0.47734420500873614, "step": 9764 }, { "epoch": 1.8103448275862069, "grad_norm": 8.28125, "learning_rate": 8.189655172413794e-06, "loss": 2.8335, "mean_token_accuracy": 0.4757357545397621, "step": 9765 }, { "epoch": 1.810530218761587, "grad_norm": 5.890625, "learning_rate": 8.189469781238413e-06, "loss": 2.9625, "mean_token_accuracy": 0.46898620275944813, "step": 9766 }, { "epoch": 1.810715609936967, "grad_norm": 10.0546875, "learning_rate": 8.189284390063034e-06, "loss": 2.2506, "mean_token_accuracy": 0.5117647058823529, "step": 9767 }, { "epoch": 1.810901001112347, "grad_norm": 8.015625, "learning_rate": 8.189098998887654e-06, "loss": 3.1272, "mean_token_accuracy": 0.4377541650268923, "step": 9768 }, { "epoch": 1.8110863922877272, "grad_norm": 5.75390625, "learning_rate": 8.188913607712273e-06, "loss": 2.5252, "mean_token_accuracy": 0.49521503516660903, "step": 9769 }, { "epoch": 1.8112717834631071, "grad_norm": 6.03125, "learning_rate": 8.188728216536893e-06, "loss": 3.3686, "mean_token_accuracy": 0.4259910860896083, "step": 9770 }, { "epoch": 1.811457174638487, "grad_norm": 5.75, "learning_rate": 8.188542825361514e-06, "loss": 3.2681, "mean_token_accuracy": 0.4480208786428882, "step": 9771 }, { "epoch": 1.8116425658138673, "grad_norm": 8.6953125, "learning_rate": 8.188357434186134e-06, "loss": 2.7263, "mean_token_accuracy": 0.48052115583075333, "step": 9772 }, { "epoch": 1.8118279569892473, "grad_norm": 6.87890625, "learning_rate": 8.188172043010753e-06, "loss": 2.9371, "mean_token_accuracy": 0.4658648744251857, "step": 9773 }, { "epoch": 1.8120133481646272, "grad_norm": 5.88671875, "learning_rate": 8.187986651835374e-06, "loss": 2.6383, "mean_token_accuracy": 0.48497613030047737, "step": 9774 }, { "epoch": 1.8121987393400074, "grad_norm": 8.5234375, "learning_rate": 8.187801260659993e-06, "loss": 2.6084, "mean_token_accuracy": 0.491070110701107, "step": 9775 }, { "epoch": 1.8123841305153876, "grad_norm": 10.8671875, "learning_rate": 8.187615869484613e-06, "loss": 3.2205, "mean_token_accuracy": 0.4423391494002181, "step": 9776 }, { "epoch": 1.8125695216907676, "grad_norm": 6.08203125, "learning_rate": 8.187430478309233e-06, "loss": 3.0314, "mean_token_accuracy": 0.443636925931653, "step": 9777 }, { "epoch": 1.8127549128661475, "grad_norm": 8.25, "learning_rate": 8.187245087133852e-06, "loss": 3.0045, "mean_token_accuracy": 0.4608355091383812, "step": 9778 }, { "epoch": 1.8129403040415277, "grad_norm": 9.46875, "learning_rate": 8.187059695958473e-06, "loss": 2.4985, "mean_token_accuracy": 0.5064224282363453, "step": 9779 }, { "epoch": 1.8131256952169077, "grad_norm": 6.80078125, "learning_rate": 8.186874304783093e-06, "loss": 3.3964, "mean_token_accuracy": 0.4038800705467372, "step": 9780 }, { "epoch": 1.8133110863922877, "grad_norm": 5.40625, "learning_rate": 8.186688913607714e-06, "loss": 2.6853, "mean_token_accuracy": 0.48298959985898116, "step": 9781 }, { "epoch": 1.8134964775676679, "grad_norm": 8.3203125, "learning_rate": 8.186503522432333e-06, "loss": 2.9409, "mean_token_accuracy": 0.45241417806307566, "step": 9782 }, { "epoch": 1.8136818687430478, "grad_norm": 8.4375, "learning_rate": 8.186318131256953e-06, "loss": 2.323, "mean_token_accuracy": 0.5307000886188125, "step": 9783 }, { "epoch": 1.8138672599184278, "grad_norm": 4.9921875, "learning_rate": 8.186132740081572e-06, "loss": 2.9498, "mean_token_accuracy": 0.4759418653873289, "step": 9784 }, { "epoch": 1.814052651093808, "grad_norm": 7.09375, "learning_rate": 8.185947348906192e-06, "loss": 2.3819, "mean_token_accuracy": 0.5059413027916965, "step": 9785 }, { "epoch": 1.814238042269188, "grad_norm": 7.8203125, "learning_rate": 8.185761957730813e-06, "loss": 2.5279, "mean_token_accuracy": 0.5114387391967463, "step": 9786 }, { "epoch": 1.814423433444568, "grad_norm": 4.609375, "learning_rate": 8.185576566555432e-06, "loss": 2.9901, "mean_token_accuracy": 0.4519632414369256, "step": 9787 }, { "epoch": 1.814608824619948, "grad_norm": 5.671875, "learning_rate": 8.185391175380054e-06, "loss": 3.4827, "mean_token_accuracy": 0.43407159412825136, "step": 9788 }, { "epoch": 1.8147942157953283, "grad_norm": 6.828125, "learning_rate": 8.185205784204673e-06, "loss": 2.583, "mean_token_accuracy": 0.48542349491116094, "step": 9789 }, { "epoch": 1.814979606970708, "grad_norm": 5.359375, "learning_rate": 8.185020393029293e-06, "loss": 2.9935, "mean_token_accuracy": 0.455950991831972, "step": 9790 }, { "epoch": 1.8151649981460882, "grad_norm": 6.98828125, "learning_rate": 8.184835001853912e-06, "loss": 2.8281, "mean_token_accuracy": 0.5339027595269382, "step": 9791 }, { "epoch": 1.8153503893214684, "grad_norm": 6.41015625, "learning_rate": 8.184649610678532e-06, "loss": 3.0733, "mean_token_accuracy": 0.452537865649208, "step": 9792 }, { "epoch": 1.8155357804968484, "grad_norm": 5.82421875, "learning_rate": 8.184464219503153e-06, "loss": 3.1223, "mean_token_accuracy": 0.45197670095426945, "step": 9793 }, { "epoch": 1.8157211716722284, "grad_norm": 5.64453125, "learning_rate": 8.184278828327772e-06, "loss": 3.5586, "mean_token_accuracy": 0.42010217417131995, "step": 9794 }, { "epoch": 1.8159065628476085, "grad_norm": 5.44921875, "learning_rate": 8.184093437152392e-06, "loss": 2.2773, "mean_token_accuracy": 0.5234433408095696, "step": 9795 }, { "epoch": 1.8160919540229885, "grad_norm": 6.65625, "learning_rate": 8.183908045977013e-06, "loss": 3.0704, "mean_token_accuracy": 0.4691806564770734, "step": 9796 }, { "epoch": 1.8162773451983685, "grad_norm": 6.20703125, "learning_rate": 8.183722654801633e-06, "loss": 2.6987, "mean_token_accuracy": 0.4841087056655919, "step": 9797 }, { "epoch": 1.8164627363737487, "grad_norm": 6.03125, "learning_rate": 8.183537263626252e-06, "loss": 3.2751, "mean_token_accuracy": 0.44124732334047106, "step": 9798 }, { "epoch": 1.8166481275491286, "grad_norm": 6.08203125, "learning_rate": 8.183351872450872e-06, "loss": 3.1651, "mean_token_accuracy": 0.4688142563399589, "step": 9799 }, { "epoch": 1.8168335187245086, "grad_norm": 5.28515625, "learning_rate": 8.183166481275491e-06, "loss": 2.075, "mean_token_accuracy": 0.5517497034400949, "step": 9800 }, { "epoch": 1.8170189098998888, "grad_norm": 5.9765625, "learning_rate": 8.182981090100112e-06, "loss": 2.9874, "mean_token_accuracy": 0.45034224460803307, "step": 9801 }, { "epoch": 1.817204301075269, "grad_norm": 5.50390625, "learning_rate": 8.182795698924732e-06, "loss": 3.0302, "mean_token_accuracy": 0.46223129578479644, "step": 9802 }, { "epoch": 1.8173896922506487, "grad_norm": 6.05859375, "learning_rate": 8.182610307749351e-06, "loss": 3.0606, "mean_token_accuracy": 0.4563717778046714, "step": 9803 }, { "epoch": 1.817575083426029, "grad_norm": 5.08984375, "learning_rate": 8.182424916573972e-06, "loss": 2.6675, "mean_token_accuracy": 0.5217276099629041, "step": 9804 }, { "epoch": 1.817760474601409, "grad_norm": 6.1796875, "learning_rate": 8.182239525398592e-06, "loss": 3.0867, "mean_token_accuracy": 0.4517724649629019, "step": 9805 }, { "epoch": 1.817945865776789, "grad_norm": 6.0546875, "learning_rate": 8.182054134223212e-06, "loss": 2.9183, "mean_token_accuracy": 0.4608718837488335, "step": 9806 }, { "epoch": 1.818131256952169, "grad_norm": 5.82421875, "learning_rate": 8.181868743047831e-06, "loss": 2.7907, "mean_token_accuracy": 0.5035245335176227, "step": 9807 }, { "epoch": 1.8183166481275492, "grad_norm": 7.70703125, "learning_rate": 8.181683351872452e-06, "loss": 3.377, "mean_token_accuracy": 0.43843416370106764, "step": 9808 }, { "epoch": 1.8185020393029292, "grad_norm": 7.59375, "learning_rate": 8.18149796069707e-06, "loss": 2.7963, "mean_token_accuracy": 0.46919967663702505, "step": 9809 }, { "epoch": 1.8186874304783092, "grad_norm": 8.109375, "learning_rate": 8.181312569521691e-06, "loss": 2.2368, "mean_token_accuracy": 0.5364663585002568, "step": 9810 }, { "epoch": 1.8188728216536894, "grad_norm": 5.9921875, "learning_rate": 8.181127178346312e-06, "loss": 2.4569, "mean_token_accuracy": 0.5049197307094769, "step": 9811 }, { "epoch": 1.8190582128290693, "grad_norm": 5.25390625, "learning_rate": 8.180941787170932e-06, "loss": 2.1116, "mean_token_accuracy": 0.5403482018045495, "step": 9812 }, { "epoch": 1.8192436040044493, "grad_norm": 5.875, "learning_rate": 8.180756395995551e-06, "loss": 2.9597, "mean_token_accuracy": 0.45045857765749514, "step": 9813 }, { "epoch": 1.8194289951798295, "grad_norm": 6.91015625, "learning_rate": 8.180571004820171e-06, "loss": 2.0494, "mean_token_accuracy": 0.5486033519553073, "step": 9814 }, { "epoch": 1.8196143863552094, "grad_norm": 6.56640625, "learning_rate": 8.180385613644792e-06, "loss": 3.0762, "mean_token_accuracy": 0.4500611995104039, "step": 9815 }, { "epoch": 1.8197997775305894, "grad_norm": 6.078125, "learning_rate": 8.18020022246941e-06, "loss": 2.7934, "mean_token_accuracy": 0.47223734349482854, "step": 9816 }, { "epoch": 1.8199851687059696, "grad_norm": 6.08984375, "learning_rate": 8.180014831294031e-06, "loss": 3.1129, "mean_token_accuracy": 0.4422683923705722, "step": 9817 }, { "epoch": 1.8201705598813498, "grad_norm": 5.31640625, "learning_rate": 8.17982944011865e-06, "loss": 3.3861, "mean_token_accuracy": 0.39957466918714557, "step": 9818 }, { "epoch": 1.8203559510567295, "grad_norm": 5.62109375, "learning_rate": 8.17964404894327e-06, "loss": 2.6603, "mean_token_accuracy": 0.4887010242237035, "step": 9819 }, { "epoch": 1.8205413422321097, "grad_norm": 5.58984375, "learning_rate": 8.179458657767891e-06, "loss": 2.6692, "mean_token_accuracy": 0.4914947520810713, "step": 9820 }, { "epoch": 1.82072673340749, "grad_norm": 7.00390625, "learning_rate": 8.179273266592511e-06, "loss": 2.4118, "mean_token_accuracy": 0.5110850286906625, "step": 9821 }, { "epoch": 1.8209121245828699, "grad_norm": 8.109375, "learning_rate": 8.17908787541713e-06, "loss": 2.8918, "mean_token_accuracy": 0.4678201599161096, "step": 9822 }, { "epoch": 1.8210975157582499, "grad_norm": 8.28125, "learning_rate": 8.17890248424175e-06, "loss": 2.5796, "mean_token_accuracy": 0.48856664807585054, "step": 9823 }, { "epoch": 1.82128290693363, "grad_norm": 6.7890625, "learning_rate": 8.178717093066371e-06, "loss": 2.877, "mean_token_accuracy": 0.47789790611742167, "step": 9824 }, { "epoch": 1.82146829810901, "grad_norm": 6.28125, "learning_rate": 8.17853170189099e-06, "loss": 3.0604, "mean_token_accuracy": 0.4393972483074907, "step": 9825 }, { "epoch": 1.82165368928439, "grad_norm": 6.5703125, "learning_rate": 8.17834631071561e-06, "loss": 2.4296, "mean_token_accuracy": 0.48900714185688277, "step": 9826 }, { "epoch": 1.8218390804597702, "grad_norm": 5.2578125, "learning_rate": 8.17816091954023e-06, "loss": 2.6679, "mean_token_accuracy": 0.5003408316291752, "step": 9827 }, { "epoch": 1.8220244716351501, "grad_norm": 7.54296875, "learning_rate": 8.177975528364851e-06, "loss": 3.0166, "mean_token_accuracy": 0.44657050338534, "step": 9828 }, { "epoch": 1.82220986281053, "grad_norm": 5.9296875, "learning_rate": 8.17779013718947e-06, "loss": 3.3837, "mean_token_accuracy": 0.4261609259517501, "step": 9829 }, { "epoch": 1.8223952539859103, "grad_norm": 5.69140625, "learning_rate": 8.17760474601409e-06, "loss": 3.2641, "mean_token_accuracy": 0.4339457567804024, "step": 9830 }, { "epoch": 1.8225806451612905, "grad_norm": 6.73828125, "learning_rate": 8.177419354838711e-06, "loss": 2.8617, "mean_token_accuracy": 0.45494755465689374, "step": 9831 }, { "epoch": 1.8227660363366702, "grad_norm": 5.5546875, "learning_rate": 8.17723396366333e-06, "loss": 3.3221, "mean_token_accuracy": 0.4438561930558976, "step": 9832 }, { "epoch": 1.8229514275120504, "grad_norm": 5.91796875, "learning_rate": 8.17704857248795e-06, "loss": 3.6643, "mean_token_accuracy": 0.4020573108008817, "step": 9833 }, { "epoch": 1.8231368186874306, "grad_norm": 7.65234375, "learning_rate": 8.17686318131257e-06, "loss": 2.4775, "mean_token_accuracy": 0.5040042712226375, "step": 9834 }, { "epoch": 1.8233222098628106, "grad_norm": 5.91796875, "learning_rate": 8.17667779013719e-06, "loss": 2.8397, "mean_token_accuracy": 0.46711769973137, "step": 9835 }, { "epoch": 1.8235076010381905, "grad_norm": 8.0859375, "learning_rate": 8.17649239896181e-06, "loss": 2.5356, "mean_token_accuracy": 0.48488252363568285, "step": 9836 }, { "epoch": 1.8236929922135707, "grad_norm": 6.9375, "learning_rate": 8.17630700778643e-06, "loss": 3.7763, "mean_token_accuracy": 0.41598842466992225, "step": 9837 }, { "epoch": 1.8238783833889507, "grad_norm": 6.45703125, "learning_rate": 8.17612161661105e-06, "loss": 2.8534, "mean_token_accuracy": 0.48314432188907874, "step": 9838 }, { "epoch": 1.8240637745643307, "grad_norm": 6.34765625, "learning_rate": 8.17593622543567e-06, "loss": 3.2312, "mean_token_accuracy": 0.42416596579758903, "step": 9839 }, { "epoch": 1.8242491657397109, "grad_norm": 6.17578125, "learning_rate": 8.17575083426029e-06, "loss": 2.8948, "mean_token_accuracy": 0.46728221597751907, "step": 9840 }, { "epoch": 1.8244345569150908, "grad_norm": 6.21484375, "learning_rate": 8.17556544308491e-06, "loss": 2.8339, "mean_token_accuracy": 0.4582139446036294, "step": 9841 }, { "epoch": 1.8246199480904708, "grad_norm": 5.92578125, "learning_rate": 8.17538005190953e-06, "loss": 3.296, "mean_token_accuracy": 0.43596189468113256, "step": 9842 }, { "epoch": 1.824805339265851, "grad_norm": 7.359375, "learning_rate": 8.175194660734149e-06, "loss": 2.0158, "mean_token_accuracy": 0.5690138027605521, "step": 9843 }, { "epoch": 1.824990730441231, "grad_norm": 8.2421875, "learning_rate": 8.17500926955877e-06, "loss": 2.5757, "mean_token_accuracy": 0.4889845530514054, "step": 9844 }, { "epoch": 1.825176121616611, "grad_norm": 7.71875, "learning_rate": 8.17482387838339e-06, "loss": 2.8355, "mean_token_accuracy": 0.4741042345276873, "step": 9845 }, { "epoch": 1.825361512791991, "grad_norm": 6.4140625, "learning_rate": 8.17463848720801e-06, "loss": 3.1864, "mean_token_accuracy": 0.4386013597890939, "step": 9846 }, { "epoch": 1.8255469039673713, "grad_norm": 8.375, "learning_rate": 8.174453096032629e-06, "loss": 2.5643, "mean_token_accuracy": 0.496025198740063, "step": 9847 }, { "epoch": 1.8257322951427513, "grad_norm": 7.75390625, "learning_rate": 8.17426770485725e-06, "loss": 3.2, "mean_token_accuracy": 0.4153522607781283, "step": 9848 }, { "epoch": 1.8259176863181312, "grad_norm": 6.97265625, "learning_rate": 8.17408231368187e-06, "loss": 3.3362, "mean_token_accuracy": 0.4476205434270457, "step": 9849 }, { "epoch": 1.8261030774935114, "grad_norm": 6.3984375, "learning_rate": 8.173896922506489e-06, "loss": 3.1812, "mean_token_accuracy": 0.4113464447806354, "step": 9850 }, { "epoch": 1.8262884686688914, "grad_norm": 8.0703125, "learning_rate": 8.17371153133111e-06, "loss": 3.4171, "mean_token_accuracy": 0.4261029411764706, "step": 9851 }, { "epoch": 1.8264738598442714, "grad_norm": 9.2578125, "learning_rate": 8.17352614015573e-06, "loss": 3.2803, "mean_token_accuracy": 0.45393871028388794, "step": 9852 }, { "epoch": 1.8266592510196515, "grad_norm": 7.2734375, "learning_rate": 8.17334074898035e-06, "loss": 2.8363, "mean_token_accuracy": 0.46935180836073276, "step": 9853 }, { "epoch": 1.8268446421950315, "grad_norm": 7.7109375, "learning_rate": 8.173155357804969e-06, "loss": 2.9806, "mean_token_accuracy": 0.46115317414094353, "step": 9854 }, { "epoch": 1.8270300333704115, "grad_norm": 10.5546875, "learning_rate": 8.17296996662959e-06, "loss": 2.6667, "mean_token_accuracy": 0.5090268604139145, "step": 9855 }, { "epoch": 1.8272154245457917, "grad_norm": 6.67578125, "learning_rate": 8.172784575454208e-06, "loss": 3.0599, "mean_token_accuracy": 0.45188536953242836, "step": 9856 }, { "epoch": 1.8274008157211716, "grad_norm": 9.03125, "learning_rate": 8.172599184278829e-06, "loss": 2.1617, "mean_token_accuracy": 0.5504184934236748, "step": 9857 }, { "epoch": 1.8275862068965516, "grad_norm": 8.625, "learning_rate": 8.17241379310345e-06, "loss": 3.0517, "mean_token_accuracy": 0.4832857382832185, "step": 9858 }, { "epoch": 1.8277715980719318, "grad_norm": 5.453125, "learning_rate": 8.172228401928068e-06, "loss": 2.856, "mean_token_accuracy": 0.47587392550143265, "step": 9859 }, { "epoch": 1.827956989247312, "grad_norm": 5.8515625, "learning_rate": 8.172043010752689e-06, "loss": 2.6661, "mean_token_accuracy": 0.47032229838121625, "step": 9860 }, { "epoch": 1.8281423804226917, "grad_norm": 10.1875, "learning_rate": 8.171857619577309e-06, "loss": 2.2526, "mean_token_accuracy": 0.5315723840834372, "step": 9861 }, { "epoch": 1.828327771598072, "grad_norm": 7.11328125, "learning_rate": 8.17167222840193e-06, "loss": 2.9516, "mean_token_accuracy": 0.4582560296846011, "step": 9862 }, { "epoch": 1.828513162773452, "grad_norm": 6.26953125, "learning_rate": 8.171486837226548e-06, "loss": 2.347, "mean_token_accuracy": 0.5147873058744091, "step": 9863 }, { "epoch": 1.828698553948832, "grad_norm": 7.96875, "learning_rate": 8.171301446051169e-06, "loss": 2.9877, "mean_token_accuracy": 0.4392204960479695, "step": 9864 }, { "epoch": 1.828883945124212, "grad_norm": 7.3984375, "learning_rate": 8.171116054875788e-06, "loss": 3.3785, "mean_token_accuracy": 0.45092262366258334, "step": 9865 }, { "epoch": 1.8290693362995922, "grad_norm": 5.5546875, "learning_rate": 8.170930663700408e-06, "loss": 3.3754, "mean_token_accuracy": 0.4195064629847238, "step": 9866 }, { "epoch": 1.8292547274749722, "grad_norm": 5.67578125, "learning_rate": 8.170745272525029e-06, "loss": 2.5538, "mean_token_accuracy": 0.5019165727170237, "step": 9867 }, { "epoch": 1.8294401186503522, "grad_norm": 6.2578125, "learning_rate": 8.170559881349649e-06, "loss": 3.2508, "mean_token_accuracy": 0.44874164652353193, "step": 9868 }, { "epoch": 1.8296255098257324, "grad_norm": 6.1796875, "learning_rate": 8.17037449017427e-06, "loss": 3.2008, "mean_token_accuracy": 0.44672607516466484, "step": 9869 }, { "epoch": 1.8298109010011123, "grad_norm": 8.0390625, "learning_rate": 8.170189098998888e-06, "loss": 3.1967, "mean_token_accuracy": 0.43868548742831937, "step": 9870 }, { "epoch": 1.8299962921764923, "grad_norm": 6.1015625, "learning_rate": 8.170003707823509e-06, "loss": 2.8199, "mean_token_accuracy": 0.4828819068255688, "step": 9871 }, { "epoch": 1.8301816833518725, "grad_norm": 7.12109375, "learning_rate": 8.169818316648128e-06, "loss": 2.9534, "mean_token_accuracy": 0.46561147802322933, "step": 9872 }, { "epoch": 1.8303670745272527, "grad_norm": 5.73046875, "learning_rate": 8.169632925472748e-06, "loss": 2.6069, "mean_token_accuracy": 0.5093678598629093, "step": 9873 }, { "epoch": 1.8305524657026324, "grad_norm": 7.52734375, "learning_rate": 8.169447534297369e-06, "loss": 2.6319, "mean_token_accuracy": 0.49981709547616143, "step": 9874 }, { "epoch": 1.8307378568780126, "grad_norm": 6.234375, "learning_rate": 8.169262143121987e-06, "loss": 3.2729, "mean_token_accuracy": 0.4259123552689433, "step": 9875 }, { "epoch": 1.8309232480533928, "grad_norm": 6.40234375, "learning_rate": 8.169076751946608e-06, "loss": 2.8166, "mean_token_accuracy": 0.4746349913387775, "step": 9876 }, { "epoch": 1.8311086392287728, "grad_norm": 6.6953125, "learning_rate": 8.168891360771228e-06, "loss": 2.5001, "mean_token_accuracy": 0.5214254797287549, "step": 9877 }, { "epoch": 1.8312940304041527, "grad_norm": 6.171875, "learning_rate": 8.168705969595849e-06, "loss": 3.8367, "mean_token_accuracy": 0.3778600714191245, "step": 9878 }, { "epoch": 1.831479421579533, "grad_norm": 5.65625, "learning_rate": 8.168520578420468e-06, "loss": 3.215, "mean_token_accuracy": 0.44952089987501737, "step": 9879 }, { "epoch": 1.8316648127549129, "grad_norm": 10.25, "learning_rate": 8.168335187245088e-06, "loss": 2.0783, "mean_token_accuracy": 0.5462147230103032, "step": 9880 }, { "epoch": 1.8318502039302929, "grad_norm": 10.8125, "learning_rate": 8.168149796069707e-06, "loss": 2.4704, "mean_token_accuracy": 0.5054429996976111, "step": 9881 }, { "epoch": 1.832035595105673, "grad_norm": 8.265625, "learning_rate": 8.167964404894327e-06, "loss": 2.9189, "mean_token_accuracy": 0.44876946258161726, "step": 9882 }, { "epoch": 1.832220986281053, "grad_norm": 7.171875, "learning_rate": 8.167779013718948e-06, "loss": 2.9213, "mean_token_accuracy": 0.4702416028285209, "step": 9883 }, { "epoch": 1.832406377456433, "grad_norm": 9.6484375, "learning_rate": 8.167593622543568e-06, "loss": 2.562, "mean_token_accuracy": 0.4769986601161233, "step": 9884 }, { "epoch": 1.8325917686318132, "grad_norm": 5.140625, "learning_rate": 8.167408231368187e-06, "loss": 2.7043, "mean_token_accuracy": 0.48070460076486266, "step": 9885 }, { "epoch": 1.8327771598071931, "grad_norm": 5.62890625, "learning_rate": 8.167222840192808e-06, "loss": 2.9202, "mean_token_accuracy": 0.4669733427695211, "step": 9886 }, { "epoch": 1.832962550982573, "grad_norm": 6.19140625, "learning_rate": 8.167037449017428e-06, "loss": 2.6217, "mean_token_accuracy": 0.47896484915582616, "step": 9887 }, { "epoch": 1.8331479421579533, "grad_norm": 6.24609375, "learning_rate": 8.166852057842047e-06, "loss": 2.7821, "mean_token_accuracy": 0.4508919623170976, "step": 9888 }, { "epoch": 1.8333333333333335, "grad_norm": 5.5625, "learning_rate": 8.166666666666668e-06, "loss": 2.7016, "mean_token_accuracy": 0.47558284562985703, "step": 9889 }, { "epoch": 1.8335187245087132, "grad_norm": 6.60546875, "learning_rate": 8.166481275491286e-06, "loss": 2.9647, "mean_token_accuracy": 0.45293618825524695, "step": 9890 }, { "epoch": 1.8337041156840934, "grad_norm": 5.0234375, "learning_rate": 8.166295884315907e-06, "loss": 2.4817, "mean_token_accuracy": 0.49432700346351366, "step": 9891 }, { "epoch": 1.8338895068594736, "grad_norm": 7.81640625, "learning_rate": 8.166110493140527e-06, "loss": 3.2523, "mean_token_accuracy": 0.42291438409766785, "step": 9892 }, { "epoch": 1.8340748980348536, "grad_norm": 4.9765625, "learning_rate": 8.165925101965148e-06, "loss": 2.9318, "mean_token_accuracy": 0.43706777316735823, "step": 9893 }, { "epoch": 1.8342602892102335, "grad_norm": 5.52734375, "learning_rate": 8.165739710789767e-06, "loss": 2.9282, "mean_token_accuracy": 0.45150794643974473, "step": 9894 }, { "epoch": 1.8344456803856137, "grad_norm": 6.20703125, "learning_rate": 8.165554319614387e-06, "loss": 3.4705, "mean_token_accuracy": 0.4322184138990663, "step": 9895 }, { "epoch": 1.8346310715609937, "grad_norm": 8.1015625, "learning_rate": 8.165368928439008e-06, "loss": 2.375, "mean_token_accuracy": 0.508030303030303, "step": 9896 }, { "epoch": 1.8348164627363737, "grad_norm": 6.8828125, "learning_rate": 8.165183537263626e-06, "loss": 2.878, "mean_token_accuracy": 0.44756671899529044, "step": 9897 }, { "epoch": 1.8350018539117539, "grad_norm": 5.48828125, "learning_rate": 8.164998146088247e-06, "loss": 3.0532, "mean_token_accuracy": 0.4302161954714085, "step": 9898 }, { "epoch": 1.8351872450871338, "grad_norm": 5.96484375, "learning_rate": 8.164812754912866e-06, "loss": 2.7995, "mean_token_accuracy": 0.4733405875952122, "step": 9899 }, { "epoch": 1.8353726362625138, "grad_norm": 6.57421875, "learning_rate": 8.164627363737486e-06, "loss": 2.905, "mean_token_accuracy": 0.47527472527472525, "step": 9900 }, { "epoch": 1.835558027437894, "grad_norm": 9.0859375, "learning_rate": 8.164441972562107e-06, "loss": 2.8332, "mean_token_accuracy": 0.4676726511730367, "step": 9901 }, { "epoch": 1.8357434186132742, "grad_norm": 5.31640625, "learning_rate": 8.164256581386727e-06, "loss": 2.7991, "mean_token_accuracy": 0.49108402822322, "step": 9902 }, { "epoch": 1.835928809788654, "grad_norm": 6.9921875, "learning_rate": 8.164071190211346e-06, "loss": 2.4431, "mean_token_accuracy": 0.5142450142450142, "step": 9903 }, { "epoch": 1.836114200964034, "grad_norm": 9.8515625, "learning_rate": 8.163885799035966e-06, "loss": 1.9152, "mean_token_accuracy": 0.5636960087479497, "step": 9904 }, { "epoch": 1.8362995921394143, "grad_norm": 5.875, "learning_rate": 8.163700407860587e-06, "loss": 3.1739, "mean_token_accuracy": 0.44848621780388614, "step": 9905 }, { "epoch": 1.8364849833147943, "grad_norm": 8.7578125, "learning_rate": 8.163515016685206e-06, "loss": 3.2362, "mean_token_accuracy": 0.4287764153404644, "step": 9906 }, { "epoch": 1.8366703744901742, "grad_norm": 8.625, "learning_rate": 8.163329625509826e-06, "loss": 3.5704, "mean_token_accuracy": 0.4211150652431791, "step": 9907 }, { "epoch": 1.8368557656655544, "grad_norm": 6.859375, "learning_rate": 8.163144234334445e-06, "loss": 2.6843, "mean_token_accuracy": 0.4796943540399038, "step": 9908 }, { "epoch": 1.8370411568409344, "grad_norm": 10.609375, "learning_rate": 8.162958843159067e-06, "loss": 2.5302, "mean_token_accuracy": 0.495017015070491, "step": 9909 }, { "epoch": 1.8372265480163144, "grad_norm": 8.0390625, "learning_rate": 8.162773451983686e-06, "loss": 2.4259, "mean_token_accuracy": 0.500880503144654, "step": 9910 }, { "epoch": 1.8374119391916945, "grad_norm": 7.1171875, "learning_rate": 8.162588060808306e-06, "loss": 2.2755, "mean_token_accuracy": 0.5408628081457664, "step": 9911 }, { "epoch": 1.8375973303670745, "grad_norm": 6.67578125, "learning_rate": 8.162402669632927e-06, "loss": 2.8986, "mean_token_accuracy": 0.48922825197709296, "step": 9912 }, { "epoch": 1.8377827215424545, "grad_norm": 10.78125, "learning_rate": 8.162217278457546e-06, "loss": 2.7058, "mean_token_accuracy": 0.46162452450212577, "step": 9913 }, { "epoch": 1.8379681127178347, "grad_norm": 6.453125, "learning_rate": 8.162031887282166e-06, "loss": 2.7801, "mean_token_accuracy": 0.4758316747227751, "step": 9914 }, { "epoch": 1.8381535038932146, "grad_norm": 6.05078125, "learning_rate": 8.161846496106785e-06, "loss": 3.0257, "mean_token_accuracy": 0.47233718144195164, "step": 9915 }, { "epoch": 1.8383388950685946, "grad_norm": 7.8984375, "learning_rate": 8.161661104931406e-06, "loss": 2.867, "mean_token_accuracy": 0.48376201610808, "step": 9916 }, { "epoch": 1.8385242862439748, "grad_norm": 6.4609375, "learning_rate": 8.161475713756026e-06, "loss": 3.3436, "mean_token_accuracy": 0.44700139470013944, "step": 9917 }, { "epoch": 1.838709677419355, "grad_norm": 9.765625, "learning_rate": 8.161290322580647e-06, "loss": 2.5837, "mean_token_accuracy": 0.48219241443108235, "step": 9918 }, { "epoch": 1.8388950685947347, "grad_norm": 7.8203125, "learning_rate": 8.161104931405265e-06, "loss": 2.1196, "mean_token_accuracy": 0.5406290956749672, "step": 9919 }, { "epoch": 1.839080459770115, "grad_norm": 5.12109375, "learning_rate": 8.160919540229886e-06, "loss": 2.3421, "mean_token_accuracy": 0.5113786875376279, "step": 9920 }, { "epoch": 1.839265850945495, "grad_norm": 7.99609375, "learning_rate": 8.160734149054506e-06, "loss": 3.2918, "mean_token_accuracy": 0.4302621995630007, "step": 9921 }, { "epoch": 1.839451242120875, "grad_norm": 5.60546875, "learning_rate": 8.160548757879125e-06, "loss": 2.7983, "mean_token_accuracy": 0.49405656510452245, "step": 9922 }, { "epoch": 1.839636633296255, "grad_norm": 5.8984375, "learning_rate": 8.160363366703746e-06, "loss": 2.9442, "mean_token_accuracy": 0.44370782526340713, "step": 9923 }, { "epoch": 1.8398220244716352, "grad_norm": 7.53125, "learning_rate": 8.160177975528364e-06, "loss": 3.295, "mean_token_accuracy": 0.44047467143039426, "step": 9924 }, { "epoch": 1.8400074156470152, "grad_norm": 7.2265625, "learning_rate": 8.159992584352987e-06, "loss": 2.2591, "mean_token_accuracy": 0.5414471562197614, "step": 9925 }, { "epoch": 1.8401928068223952, "grad_norm": 6.75, "learning_rate": 8.159807193177605e-06, "loss": 2.8694, "mean_token_accuracy": 0.46465249582039647, "step": 9926 }, { "epoch": 1.8403781979977754, "grad_norm": 10.1328125, "learning_rate": 8.159621802002226e-06, "loss": 2.1149, "mean_token_accuracy": 0.5379284274193549, "step": 9927 }, { "epoch": 1.8405635891731553, "grad_norm": 6.55859375, "learning_rate": 8.159436410826845e-06, "loss": 2.5451, "mean_token_accuracy": 0.4833965299322073, "step": 9928 }, { "epoch": 1.8407489803485353, "grad_norm": 7.41015625, "learning_rate": 8.159251019651465e-06, "loss": 2.9423, "mean_token_accuracy": 0.49673103238558614, "step": 9929 }, { "epoch": 1.8409343715239155, "grad_norm": 6.9296875, "learning_rate": 8.159065628476086e-06, "loss": 2.8812, "mean_token_accuracy": 0.46237878973855406, "step": 9930 }, { "epoch": 1.8411197626992957, "grad_norm": 7.16015625, "learning_rate": 8.158880237300704e-06, "loss": 2.3804, "mean_token_accuracy": 0.516580310880829, "step": 9931 }, { "epoch": 1.8413051538746754, "grad_norm": 7.76171875, "learning_rate": 8.158694846125325e-06, "loss": 3.0806, "mean_token_accuracy": 0.4563608434576176, "step": 9932 }, { "epoch": 1.8414905450500556, "grad_norm": 7.32421875, "learning_rate": 8.158509454949945e-06, "loss": 2.784, "mean_token_accuracy": 0.4812752331894751, "step": 9933 }, { "epoch": 1.8416759362254358, "grad_norm": 5.73828125, "learning_rate": 8.158324063774566e-06, "loss": 2.9815, "mean_token_accuracy": 0.4635841644001209, "step": 9934 }, { "epoch": 1.8418613274008158, "grad_norm": 6.3046875, "learning_rate": 8.158138672599185e-06, "loss": 3.1736, "mean_token_accuracy": 0.4578762863281692, "step": 9935 }, { "epoch": 1.8420467185761957, "grad_norm": 7.2734375, "learning_rate": 8.157953281423805e-06, "loss": 2.8627, "mean_token_accuracy": 0.46904982977406373, "step": 9936 }, { "epoch": 1.842232109751576, "grad_norm": 5.5078125, "learning_rate": 8.157767890248424e-06, "loss": 3.0206, "mean_token_accuracy": 0.46620475113122173, "step": 9937 }, { "epoch": 1.8424175009269559, "grad_norm": 5.57421875, "learning_rate": 8.157582499073045e-06, "loss": 3.1131, "mean_token_accuracy": 0.44411447084233263, "step": 9938 }, { "epoch": 1.8426028921023359, "grad_norm": 10.75, "learning_rate": 8.157397107897665e-06, "loss": 3.0653, "mean_token_accuracy": 0.4419742729306488, "step": 9939 }, { "epoch": 1.842788283277716, "grad_norm": 10.375, "learning_rate": 8.157211716722284e-06, "loss": 2.3796, "mean_token_accuracy": 0.5429635541367385, "step": 9940 }, { "epoch": 1.842973674453096, "grad_norm": 6.00390625, "learning_rate": 8.157026325546904e-06, "loss": 2.7982, "mean_token_accuracy": 0.48338398196255267, "step": 9941 }, { "epoch": 1.843159065628476, "grad_norm": 6.7421875, "learning_rate": 8.156840934371525e-06, "loss": 3.0193, "mean_token_accuracy": 0.433843085106383, "step": 9942 }, { "epoch": 1.8433444568038562, "grad_norm": 6.4453125, "learning_rate": 8.156655543196145e-06, "loss": 2.8091, "mean_token_accuracy": 0.4510950891035233, "step": 9943 }, { "epoch": 1.8435298479792361, "grad_norm": 5.171875, "learning_rate": 8.156470152020764e-06, "loss": 2.4815, "mean_token_accuracy": 0.49093214965123655, "step": 9944 }, { "epoch": 1.843715239154616, "grad_norm": 6.5859375, "learning_rate": 8.156284760845385e-06, "loss": 3.5451, "mean_token_accuracy": 0.4009827448291624, "step": 9945 }, { "epoch": 1.8439006303299963, "grad_norm": 7.53125, "learning_rate": 8.156099369670003e-06, "loss": 2.6191, "mean_token_accuracy": 0.49115838031778575, "step": 9946 }, { "epoch": 1.8440860215053765, "grad_norm": 5.375, "learning_rate": 8.155913978494624e-06, "loss": 3.359, "mean_token_accuracy": 0.4231683596282689, "step": 9947 }, { "epoch": 1.8442714126807565, "grad_norm": 5.640625, "learning_rate": 8.155728587319244e-06, "loss": 2.8164, "mean_token_accuracy": 0.4651907952228372, "step": 9948 }, { "epoch": 1.8444568038561364, "grad_norm": 6.7109375, "learning_rate": 8.155543196143865e-06, "loss": 2.9187, "mean_token_accuracy": 0.46469888636609363, "step": 9949 }, { "epoch": 1.8446421950315166, "grad_norm": 6.45703125, "learning_rate": 8.155357804968485e-06, "loss": 3.7768, "mean_token_accuracy": 0.40051306717973384, "step": 9950 }, { "epoch": 1.8448275862068966, "grad_norm": 6.046875, "learning_rate": 8.155172413793104e-06, "loss": 3.0859, "mean_token_accuracy": 0.45669553630912724, "step": 9951 }, { "epoch": 1.8450129773822765, "grad_norm": 8.1875, "learning_rate": 8.154987022617725e-06, "loss": 2.538, "mean_token_accuracy": 0.4975929978118162, "step": 9952 }, { "epoch": 1.8451983685576567, "grad_norm": 9.7578125, "learning_rate": 8.154801631442343e-06, "loss": 3.7664, "mean_token_accuracy": 0.4131888710540396, "step": 9953 }, { "epoch": 1.8453837597330367, "grad_norm": 8.609375, "learning_rate": 8.154616240266964e-06, "loss": 2.6419, "mean_token_accuracy": 0.4950580146110872, "step": 9954 }, { "epoch": 1.8455691509084167, "grad_norm": 6.1484375, "learning_rate": 8.154430849091584e-06, "loss": 2.6192, "mean_token_accuracy": 0.47983014861995754, "step": 9955 }, { "epoch": 1.8457545420837969, "grad_norm": 7.16015625, "learning_rate": 8.154245457916203e-06, "loss": 2.8454, "mean_token_accuracy": 0.44467787114845936, "step": 9956 }, { "epoch": 1.8459399332591768, "grad_norm": 9.9296875, "learning_rate": 8.154060066740824e-06, "loss": 2.5857, "mean_token_accuracy": 0.48365145228215767, "step": 9957 }, { "epoch": 1.8461253244345568, "grad_norm": 6.61328125, "learning_rate": 8.153874675565444e-06, "loss": 3.0831, "mean_token_accuracy": 0.4460812356979405, "step": 9958 }, { "epoch": 1.846310715609937, "grad_norm": 8.1875, "learning_rate": 8.153689284390065e-06, "loss": 2.9066, "mean_token_accuracy": 0.4807849550286182, "step": 9959 }, { "epoch": 1.8464961067853172, "grad_norm": 9.1875, "learning_rate": 8.153503893214683e-06, "loss": 2.8255, "mean_token_accuracy": 0.4528210704361484, "step": 9960 }, { "epoch": 1.846681497960697, "grad_norm": 6.13671875, "learning_rate": 8.153318502039304e-06, "loss": 3.0219, "mean_token_accuracy": 0.4478114478114478, "step": 9961 }, { "epoch": 1.846866889136077, "grad_norm": 6.8984375, "learning_rate": 8.153133110863923e-06, "loss": 2.6925, "mean_token_accuracy": 0.48782456612404146, "step": 9962 }, { "epoch": 1.8470522803114573, "grad_norm": 8.109375, "learning_rate": 8.152947719688543e-06, "loss": 3.1296, "mean_token_accuracy": 0.4542721122525683, "step": 9963 }, { "epoch": 1.8472376714868373, "grad_norm": 7.91015625, "learning_rate": 8.152762328513164e-06, "loss": 2.9401, "mean_token_accuracy": 0.46347497089639117, "step": 9964 }, { "epoch": 1.8474230626622172, "grad_norm": 5.71484375, "learning_rate": 8.152576937337784e-06, "loss": 2.5118, "mean_token_accuracy": 0.5021300766827606, "step": 9965 }, { "epoch": 1.8476084538375974, "grad_norm": 13.7109375, "learning_rate": 8.152391546162403e-06, "loss": 2.5042, "mean_token_accuracy": 0.4783227643328588, "step": 9966 }, { "epoch": 1.8477938450129774, "grad_norm": 17.9375, "learning_rate": 8.152206154987024e-06, "loss": 3.0861, "mean_token_accuracy": 0.4556627461345315, "step": 9967 }, { "epoch": 1.8479792361883574, "grad_norm": 12.78125, "learning_rate": 8.152020763811644e-06, "loss": 2.1583, "mean_token_accuracy": 0.5589119916307048, "step": 9968 }, { "epoch": 1.8481646273637375, "grad_norm": 5.56640625, "learning_rate": 8.151835372636263e-06, "loss": 3.0719, "mean_token_accuracy": 0.44946528091580057, "step": 9969 }, { "epoch": 1.8483500185391175, "grad_norm": 10.2890625, "learning_rate": 8.151649981460883e-06, "loss": 3.1589, "mean_token_accuracy": 0.445649400357234, "step": 9970 }, { "epoch": 1.8485354097144975, "grad_norm": 10.5546875, "learning_rate": 8.151464590285502e-06, "loss": 3.0036, "mean_token_accuracy": 0.44089834515366433, "step": 9971 }, { "epoch": 1.8487208008898777, "grad_norm": 10.96875, "learning_rate": 8.151279199110123e-06, "loss": 2.5958, "mean_token_accuracy": 0.46606855302507477, "step": 9972 }, { "epoch": 1.8489061920652579, "grad_norm": 5.62890625, "learning_rate": 8.151093807934743e-06, "loss": 3.1624, "mean_token_accuracy": 0.44402218570254726, "step": 9973 }, { "epoch": 1.8490915832406376, "grad_norm": 8.2734375, "learning_rate": 8.150908416759364e-06, "loss": 3.128, "mean_token_accuracy": 0.4478021978021978, "step": 9974 }, { "epoch": 1.8492769744160178, "grad_norm": 10.9453125, "learning_rate": 8.150723025583982e-06, "loss": 2.8193, "mean_token_accuracy": 0.48189280540801543, "step": 9975 }, { "epoch": 1.849462365591398, "grad_norm": 7.31640625, "learning_rate": 8.150537634408603e-06, "loss": 2.7037, "mean_token_accuracy": 0.48055000587613117, "step": 9976 }, { "epoch": 1.849647756766778, "grad_norm": 5.7890625, "learning_rate": 8.150352243233223e-06, "loss": 2.8575, "mean_token_accuracy": 0.46745087555139686, "step": 9977 }, { "epoch": 1.849833147942158, "grad_norm": 7.515625, "learning_rate": 8.150166852057842e-06, "loss": 3.2801, "mean_token_accuracy": 0.44025, "step": 9978 }, { "epoch": 1.850018539117538, "grad_norm": 9.109375, "learning_rate": 8.149981460882463e-06, "loss": 2.5739, "mean_token_accuracy": 0.5059568530642911, "step": 9979 }, { "epoch": 1.850203930292918, "grad_norm": 6.05078125, "learning_rate": 8.149796069707081e-06, "loss": 2.9637, "mean_token_accuracy": 0.43939051918735894, "step": 9980 }, { "epoch": 1.850389321468298, "grad_norm": 5.859375, "learning_rate": 8.149610678531704e-06, "loss": 3.0351, "mean_token_accuracy": 0.4342086980686397, "step": 9981 }, { "epoch": 1.8505747126436782, "grad_norm": 8.796875, "learning_rate": 8.149425287356322e-06, "loss": 2.9432, "mean_token_accuracy": 0.4728248192209004, "step": 9982 }, { "epoch": 1.8507601038190582, "grad_norm": 9.375, "learning_rate": 8.149239896180943e-06, "loss": 2.816, "mean_token_accuracy": 0.4644736842105263, "step": 9983 }, { "epoch": 1.8509454949944382, "grad_norm": 7.109375, "learning_rate": 8.149054505005562e-06, "loss": 3.3016, "mean_token_accuracy": 0.4351917866818765, "step": 9984 }, { "epoch": 1.8511308861698184, "grad_norm": 6.61328125, "learning_rate": 8.148869113830182e-06, "loss": 3.4519, "mean_token_accuracy": 0.4630037783375315, "step": 9985 }, { "epoch": 1.8513162773451983, "grad_norm": 7.5078125, "learning_rate": 8.148683722654803e-06, "loss": 3.4055, "mean_token_accuracy": 0.4355115026921194, "step": 9986 }, { "epoch": 1.8515016685205783, "grad_norm": 7.3671875, "learning_rate": 8.148498331479421e-06, "loss": 3.4726, "mean_token_accuracy": 0.42811416377700723, "step": 9987 }, { "epoch": 1.8516870596959585, "grad_norm": 5.87109375, "learning_rate": 8.148312940304042e-06, "loss": 2.8772, "mean_token_accuracy": 0.45348113090048575, "step": 9988 }, { "epoch": 1.8518724508713387, "grad_norm": 7.51953125, "learning_rate": 8.148127549128662e-06, "loss": 2.5587, "mean_token_accuracy": 0.4889494833524684, "step": 9989 }, { "epoch": 1.8520578420467184, "grad_norm": 9.046875, "learning_rate": 8.147942157953283e-06, "loss": 3.0855, "mean_token_accuracy": 0.4576296517710357, "step": 9990 }, { "epoch": 1.8522432332220986, "grad_norm": 6.50390625, "learning_rate": 8.147756766777902e-06, "loss": 2.8655, "mean_token_accuracy": 0.4732809430255403, "step": 9991 }, { "epoch": 1.8524286243974788, "grad_norm": 6.80078125, "learning_rate": 8.147571375602522e-06, "loss": 2.6175, "mean_token_accuracy": 0.4849949135300102, "step": 9992 }, { "epoch": 1.8526140155728588, "grad_norm": 7.0234375, "learning_rate": 8.147385984427143e-06, "loss": 3.2387, "mean_token_accuracy": 0.48017743276961466, "step": 9993 }, { "epoch": 1.8527994067482387, "grad_norm": 6.1015625, "learning_rate": 8.147200593251762e-06, "loss": 3.4517, "mean_token_accuracy": 0.40099812850904554, "step": 9994 }, { "epoch": 1.852984797923619, "grad_norm": 5.51171875, "learning_rate": 8.147015202076382e-06, "loss": 2.9321, "mean_token_accuracy": 0.4466759002770083, "step": 9995 }, { "epoch": 1.8531701890989989, "grad_norm": 6.44140625, "learning_rate": 8.146829810901e-06, "loss": 2.7863, "mean_token_accuracy": 0.4731091244501206, "step": 9996 }, { "epoch": 1.8533555802743789, "grad_norm": 6.546875, "learning_rate": 8.146644419725623e-06, "loss": 3.5497, "mean_token_accuracy": 0.3927816369676835, "step": 9997 }, { "epoch": 1.853540971449759, "grad_norm": 6.39453125, "learning_rate": 8.146459028550242e-06, "loss": 2.8374, "mean_token_accuracy": 0.46509671993271656, "step": 9998 }, { "epoch": 1.853726362625139, "grad_norm": 5.515625, "learning_rate": 8.146273637374862e-06, "loss": 2.8204, "mean_token_accuracy": 0.4816753926701571, "step": 9999 }, { "epoch": 1.853911753800519, "grad_norm": 7.078125, "learning_rate": 8.146088246199481e-06, "loss": 3.8214, "mean_token_accuracy": 0.41460016717748677, "step": 10000 }, { "epoch": 1.8540971449758992, "grad_norm": 6.625, "learning_rate": 8.145902855024102e-06, "loss": 3.7568, "mean_token_accuracy": 0.3967456329265375, "step": 10001 }, { "epoch": 1.8542825361512794, "grad_norm": 5.37109375, "learning_rate": 8.145717463848722e-06, "loss": 2.8804, "mean_token_accuracy": 0.4695168502562984, "step": 10002 }, { "epoch": 1.854467927326659, "grad_norm": 6.1875, "learning_rate": 8.145532072673341e-06, "loss": 2.7417, "mean_token_accuracy": 0.479466271312083, "step": 10003 }, { "epoch": 1.8546533185020393, "grad_norm": 7.54296875, "learning_rate": 8.145346681497961e-06, "loss": 2.8913, "mean_token_accuracy": 0.4465077273271834, "step": 10004 }, { "epoch": 1.8548387096774195, "grad_norm": 8.2578125, "learning_rate": 8.145161290322582e-06, "loss": 3.2007, "mean_token_accuracy": 0.44229973803943196, "step": 10005 }, { "epoch": 1.8550241008527995, "grad_norm": 6.5625, "learning_rate": 8.144975899147202e-06, "loss": 2.7217, "mean_token_accuracy": 0.4804333407030732, "step": 10006 }, { "epoch": 1.8552094920281794, "grad_norm": 7.5546875, "learning_rate": 8.144790507971821e-06, "loss": 2.6358, "mean_token_accuracy": 0.4964329643296433, "step": 10007 }, { "epoch": 1.8553948832035596, "grad_norm": 6.36328125, "learning_rate": 8.144605116796442e-06, "loss": 2.9489, "mean_token_accuracy": 0.47580756966588106, "step": 10008 }, { "epoch": 1.8555802743789396, "grad_norm": 5.5234375, "learning_rate": 8.14441972562106e-06, "loss": 3.0713, "mean_token_accuracy": 0.451479052335698, "step": 10009 }, { "epoch": 1.8557656655543195, "grad_norm": 7.5, "learning_rate": 8.144234334445681e-06, "loss": 2.7688, "mean_token_accuracy": 0.45563689604685215, "step": 10010 }, { "epoch": 1.8559510567296997, "grad_norm": 5.6328125, "learning_rate": 8.144048943270301e-06, "loss": 2.9745, "mean_token_accuracy": 0.4631933265037319, "step": 10011 }, { "epoch": 1.8561364479050797, "grad_norm": 5.6640625, "learning_rate": 8.14386355209492e-06, "loss": 3.2617, "mean_token_accuracy": 0.41563731931668857, "step": 10012 }, { "epoch": 1.8563218390804597, "grad_norm": 6.30078125, "learning_rate": 8.14367816091954e-06, "loss": 2.5986, "mean_token_accuracy": 0.5061658398299078, "step": 10013 }, { "epoch": 1.8565072302558399, "grad_norm": 6.57421875, "learning_rate": 8.143492769744161e-06, "loss": 2.89, "mean_token_accuracy": 0.45018300122000815, "step": 10014 }, { "epoch": 1.8566926214312198, "grad_norm": 5.9375, "learning_rate": 8.143307378568782e-06, "loss": 2.899, "mean_token_accuracy": 0.46830193150847255, "step": 10015 }, { "epoch": 1.8568780126065998, "grad_norm": 7.09375, "learning_rate": 8.1431219873934e-06, "loss": 2.2012, "mean_token_accuracy": 0.5535831689677844, "step": 10016 }, { "epoch": 1.85706340378198, "grad_norm": 6.95703125, "learning_rate": 8.142936596218021e-06, "loss": 3.0441, "mean_token_accuracy": 0.4692197958959728, "step": 10017 }, { "epoch": 1.8572487949573602, "grad_norm": 8.9375, "learning_rate": 8.14275120504264e-06, "loss": 2.4698, "mean_token_accuracy": 0.5164878823996821, "step": 10018 }, { "epoch": 1.85743418613274, "grad_norm": 6.4453125, "learning_rate": 8.14256581386726e-06, "loss": 2.9722, "mean_token_accuracy": 0.4692706609290522, "step": 10019 }, { "epoch": 1.85761957730812, "grad_norm": 10.453125, "learning_rate": 8.14238042269188e-06, "loss": 2.5891, "mean_token_accuracy": 0.5196545946642609, "step": 10020 }, { "epoch": 1.8578049684835003, "grad_norm": 7.15625, "learning_rate": 8.1421950315165e-06, "loss": 2.7676, "mean_token_accuracy": 0.4744458692068144, "step": 10021 }, { "epoch": 1.8579903596588803, "grad_norm": 6.25390625, "learning_rate": 8.14200964034112e-06, "loss": 2.9565, "mean_token_accuracy": 0.45191733365189346, "step": 10022 }, { "epoch": 1.8581757508342602, "grad_norm": 9.25, "learning_rate": 8.14182424916574e-06, "loss": 3.2358, "mean_token_accuracy": 0.5224670688169396, "step": 10023 }, { "epoch": 1.8583611420096404, "grad_norm": 6.2421875, "learning_rate": 8.141638857990361e-06, "loss": 3.3779, "mean_token_accuracy": 0.4293432584944171, "step": 10024 }, { "epoch": 1.8585465331850204, "grad_norm": 6.3671875, "learning_rate": 8.14145346681498e-06, "loss": 2.6535, "mean_token_accuracy": 0.4979274611398964, "step": 10025 }, { "epoch": 1.8587319243604004, "grad_norm": 6.7421875, "learning_rate": 8.1412680756396e-06, "loss": 3.1015, "mean_token_accuracy": 0.4486022131624927, "step": 10026 }, { "epoch": 1.8589173155357805, "grad_norm": 6.49609375, "learning_rate": 8.141082684464219e-06, "loss": 3.4688, "mean_token_accuracy": 0.4399370307580528, "step": 10027 }, { "epoch": 1.8591027067111605, "grad_norm": 10.6796875, "learning_rate": 8.14089729328884e-06, "loss": 2.6873, "mean_token_accuracy": 0.5049084959398861, "step": 10028 }, { "epoch": 1.8592880978865405, "grad_norm": 6.21484375, "learning_rate": 8.14071190211346e-06, "loss": 3.0716, "mean_token_accuracy": 0.44325648735939416, "step": 10029 }, { "epoch": 1.8594734890619207, "grad_norm": 7.546875, "learning_rate": 8.14052651093808e-06, "loss": 3.1817, "mean_token_accuracy": 0.46569014084507043, "step": 10030 }, { "epoch": 1.8596588802373009, "grad_norm": 8.0546875, "learning_rate": 8.140341119762701e-06, "loss": 2.7788, "mean_token_accuracy": 0.47570750858825456, "step": 10031 }, { "epoch": 1.8598442714126806, "grad_norm": 7.8984375, "learning_rate": 8.14015572858732e-06, "loss": 2.3437, "mean_token_accuracy": 0.520845231296402, "step": 10032 }, { "epoch": 1.8600296625880608, "grad_norm": 6.70703125, "learning_rate": 8.13997033741194e-06, "loss": 2.2979, "mean_token_accuracy": 0.5494066762781413, "step": 10033 }, { "epoch": 1.860215053763441, "grad_norm": 6.015625, "learning_rate": 8.139784946236559e-06, "loss": 3.5025, "mean_token_accuracy": 0.4078773460216759, "step": 10034 }, { "epoch": 1.860400444938821, "grad_norm": 9.734375, "learning_rate": 8.13959955506118e-06, "loss": 2.4133, "mean_token_accuracy": 0.5071715433161216, "step": 10035 }, { "epoch": 1.860585836114201, "grad_norm": 5.91015625, "learning_rate": 8.1394141638858e-06, "loss": 3.044, "mean_token_accuracy": 0.4548951048951049, "step": 10036 }, { "epoch": 1.860771227289581, "grad_norm": 7.08984375, "learning_rate": 8.139228772710419e-06, "loss": 2.7719, "mean_token_accuracy": 0.4637427687340239, "step": 10037 }, { "epoch": 1.860956618464961, "grad_norm": 6.0234375, "learning_rate": 8.13904338153504e-06, "loss": 3.3162, "mean_token_accuracy": 0.45489655172413795, "step": 10038 }, { "epoch": 1.861142009640341, "grad_norm": 6.25390625, "learning_rate": 8.13885799035966e-06, "loss": 2.6496, "mean_token_accuracy": 0.5134001636661211, "step": 10039 }, { "epoch": 1.8613274008157212, "grad_norm": 6.86328125, "learning_rate": 8.13867259918428e-06, "loss": 2.8543, "mean_token_accuracy": 0.45669789820207646, "step": 10040 }, { "epoch": 1.8615127919911012, "grad_norm": 5.81640625, "learning_rate": 8.1384872080089e-06, "loss": 3.231, "mean_token_accuracy": 0.4535752401280683, "step": 10041 }, { "epoch": 1.8616981831664812, "grad_norm": 7.66015625, "learning_rate": 8.13830181683352e-06, "loss": 2.7361, "mean_token_accuracy": 0.4631645569620253, "step": 10042 }, { "epoch": 1.8618835743418614, "grad_norm": 6.41796875, "learning_rate": 8.138116425658138e-06, "loss": 2.4694, "mean_token_accuracy": 0.5153933865450399, "step": 10043 }, { "epoch": 1.8620689655172413, "grad_norm": 6.765625, "learning_rate": 8.137931034482759e-06, "loss": 3.0429, "mean_token_accuracy": 0.4396620015843676, "step": 10044 }, { "epoch": 1.8622543566926213, "grad_norm": 5.98828125, "learning_rate": 8.13774564330738e-06, "loss": 3.3068, "mean_token_accuracy": 0.4488695652173913, "step": 10045 }, { "epoch": 1.8624397478680015, "grad_norm": 6.58984375, "learning_rate": 8.137560252132e-06, "loss": 2.6463, "mean_token_accuracy": 0.48038302965819923, "step": 10046 }, { "epoch": 1.8626251390433817, "grad_norm": 7.453125, "learning_rate": 8.137374860956619e-06, "loss": 3.2315, "mean_token_accuracy": 0.4383149448345035, "step": 10047 }, { "epoch": 1.8628105302187616, "grad_norm": 7.11328125, "learning_rate": 8.13718946978124e-06, "loss": 2.549, "mean_token_accuracy": 0.46773914547656115, "step": 10048 }, { "epoch": 1.8629959213941416, "grad_norm": 5.90234375, "learning_rate": 8.13700407860586e-06, "loss": 3.1311, "mean_token_accuracy": 0.441596161131456, "step": 10049 }, { "epoch": 1.8631813125695218, "grad_norm": 6.0, "learning_rate": 8.136818687430479e-06, "loss": 2.9813, "mean_token_accuracy": 0.4544702842377261, "step": 10050 }, { "epoch": 1.8633667037449018, "grad_norm": 6.828125, "learning_rate": 8.136633296255099e-06, "loss": 2.7688, "mean_token_accuracy": 0.4627846912420756, "step": 10051 }, { "epoch": 1.8635520949202817, "grad_norm": 7.32421875, "learning_rate": 8.136447905079718e-06, "loss": 3.1986, "mean_token_accuracy": 0.4503259857187209, "step": 10052 }, { "epoch": 1.863737486095662, "grad_norm": 5.73046875, "learning_rate": 8.136262513904338e-06, "loss": 2.7262, "mean_token_accuracy": 0.4857604775525432, "step": 10053 }, { "epoch": 1.863922877271042, "grad_norm": 6.90234375, "learning_rate": 8.136077122728959e-06, "loss": 3.2629, "mean_token_accuracy": 0.43776667073022063, "step": 10054 }, { "epoch": 1.8641082684464219, "grad_norm": 7.30859375, "learning_rate": 8.13589173155358e-06, "loss": 2.7272, "mean_token_accuracy": 0.4948280682135868, "step": 10055 }, { "epoch": 1.864293659621802, "grad_norm": 6.30859375, "learning_rate": 8.135706340378198e-06, "loss": 2.95, "mean_token_accuracy": 0.48943862987630826, "step": 10056 }, { "epoch": 1.864479050797182, "grad_norm": 6.92578125, "learning_rate": 8.135520949202819e-06, "loss": 3.4681, "mean_token_accuracy": 0.5044055849261218, "step": 10057 }, { "epoch": 1.864664441972562, "grad_norm": 10.1484375, "learning_rate": 8.135335558027439e-06, "loss": 2.588, "mean_token_accuracy": 0.5019032849288031, "step": 10058 }, { "epoch": 1.8648498331479422, "grad_norm": 6.81640625, "learning_rate": 8.135150166852058e-06, "loss": 2.6152, "mean_token_accuracy": 0.4832443653618031, "step": 10059 }, { "epoch": 1.8650352243233224, "grad_norm": 5.90625, "learning_rate": 8.134964775676678e-06, "loss": 2.9631, "mean_token_accuracy": 0.4725109409190372, "step": 10060 }, { "epoch": 1.865220615498702, "grad_norm": 6.92578125, "learning_rate": 8.134779384501297e-06, "loss": 2.7595, "mean_token_accuracy": 0.4825163908835467, "step": 10061 }, { "epoch": 1.8654060066740823, "grad_norm": 7.8046875, "learning_rate": 8.13459399332592e-06, "loss": 2.371, "mean_token_accuracy": 0.5030015356694123, "step": 10062 }, { "epoch": 1.8655913978494625, "grad_norm": 6.09375, "learning_rate": 8.134408602150538e-06, "loss": 3.0158, "mean_token_accuracy": 0.4490874764002517, "step": 10063 }, { "epoch": 1.8657767890248425, "grad_norm": 9.734375, "learning_rate": 8.134223210975159e-06, "loss": 2.8755, "mean_token_accuracy": 0.4687551798441903, "step": 10064 }, { "epoch": 1.8659621802002224, "grad_norm": 6.1328125, "learning_rate": 8.134037819799777e-06, "loss": 2.9597, "mean_token_accuracy": 0.46167471819645733, "step": 10065 }, { "epoch": 1.8661475713756026, "grad_norm": 7.34765625, "learning_rate": 8.133852428624398e-06, "loss": 2.518, "mean_token_accuracy": 0.49220294311443, "step": 10066 }, { "epoch": 1.8663329625509826, "grad_norm": 6.9140625, "learning_rate": 8.133667037449018e-06, "loss": 3.0132, "mean_token_accuracy": 0.45056614194973765, "step": 10067 }, { "epoch": 1.8665183537263625, "grad_norm": 5.65625, "learning_rate": 8.133481646273637e-06, "loss": 2.8332, "mean_token_accuracy": 0.46616219303255285, "step": 10068 }, { "epoch": 1.8667037449017427, "grad_norm": 7.2421875, "learning_rate": 8.133296255098258e-06, "loss": 3.1782, "mean_token_accuracy": 0.4431973720229948, "step": 10069 }, { "epoch": 1.8668891360771227, "grad_norm": 6.8984375, "learning_rate": 8.133110863922878e-06, "loss": 2.3584, "mean_token_accuracy": 0.5256545887331394, "step": 10070 }, { "epoch": 1.8670745272525027, "grad_norm": 6.1328125, "learning_rate": 8.132925472747499e-06, "loss": 3.3312, "mean_token_accuracy": 0.43197603159471604, "step": 10071 }, { "epoch": 1.8672599184278829, "grad_norm": 7.90234375, "learning_rate": 8.132740081572117e-06, "loss": 2.3161, "mean_token_accuracy": 0.5472336824288586, "step": 10072 }, { "epoch": 1.867445309603263, "grad_norm": 9.4375, "learning_rate": 8.132554690396738e-06, "loss": 2.8848, "mean_token_accuracy": 0.46260650047333546, "step": 10073 }, { "epoch": 1.8676307007786428, "grad_norm": 6.09765625, "learning_rate": 8.132369299221358e-06, "loss": 3.3155, "mean_token_accuracy": 0.4501431918771153, "step": 10074 }, { "epoch": 1.867816091954023, "grad_norm": 5.640625, "learning_rate": 8.132183908045977e-06, "loss": 2.6753, "mean_token_accuracy": 0.4748073503260225, "step": 10075 }, { "epoch": 1.8680014831294032, "grad_norm": 6.62890625, "learning_rate": 8.131998516870598e-06, "loss": 3.3371, "mean_token_accuracy": 0.4504396482813749, "step": 10076 }, { "epoch": 1.8681868743047831, "grad_norm": 10.078125, "learning_rate": 8.131813125695217e-06, "loss": 3.5119, "mean_token_accuracy": 0.4208205265267941, "step": 10077 }, { "epoch": 1.868372265480163, "grad_norm": 6.53515625, "learning_rate": 8.131627734519839e-06, "loss": 3.2437, "mean_token_accuracy": 0.4361607933415973, "step": 10078 }, { "epoch": 1.8685576566555433, "grad_norm": 8.5703125, "learning_rate": 8.131442343344458e-06, "loss": 2.022, "mean_token_accuracy": 0.5736419280795715, "step": 10079 }, { "epoch": 1.8687430478309233, "grad_norm": 9.1328125, "learning_rate": 8.131256952169078e-06, "loss": 3.0501, "mean_token_accuracy": 0.4652041438147471, "step": 10080 }, { "epoch": 1.8689284390063032, "grad_norm": 8.328125, "learning_rate": 8.131071560993697e-06, "loss": 3.0278, "mean_token_accuracy": 0.44563279857397503, "step": 10081 }, { "epoch": 1.8691138301816834, "grad_norm": 6.546875, "learning_rate": 8.130886169818317e-06, "loss": 3.2802, "mean_token_accuracy": 0.4242276224203199, "step": 10082 }, { "epoch": 1.8692992213570634, "grad_norm": 7.91796875, "learning_rate": 8.130700778642938e-06, "loss": 2.6393, "mean_token_accuracy": 0.4713003845606039, "step": 10083 }, { "epoch": 1.8694846125324434, "grad_norm": 6.08203125, "learning_rate": 8.130515387467557e-06, "loss": 3.0436, "mean_token_accuracy": 0.44912968864917874, "step": 10084 }, { "epoch": 1.8696700037078235, "grad_norm": 7.609375, "learning_rate": 8.130329996292177e-06, "loss": 2.7074, "mean_token_accuracy": 0.48401475561020596, "step": 10085 }, { "epoch": 1.8698553948832035, "grad_norm": 7.578125, "learning_rate": 8.130144605116798e-06, "loss": 2.275, "mean_token_accuracy": 0.5303368398682393, "step": 10086 }, { "epoch": 1.8700407860585835, "grad_norm": 6.58984375, "learning_rate": 8.129959213941418e-06, "loss": 3.4136, "mean_token_accuracy": 0.4453793703757435, "step": 10087 }, { "epoch": 1.8702261772339637, "grad_norm": 5.75, "learning_rate": 8.129773822766037e-06, "loss": 2.6269, "mean_token_accuracy": 0.47276036926277465, "step": 10088 }, { "epoch": 1.8704115684093439, "grad_norm": 6.6015625, "learning_rate": 8.129588431590657e-06, "loss": 2.9661, "mean_token_accuracy": 0.4627847604084839, "step": 10089 }, { "epoch": 1.8705969595847236, "grad_norm": 7.31640625, "learning_rate": 8.129403040415276e-06, "loss": 2.8455, "mean_token_accuracy": 0.4611319868482856, "step": 10090 }, { "epoch": 1.8707823507601038, "grad_norm": 5.515625, "learning_rate": 8.129217649239897e-06, "loss": 3.1213, "mean_token_accuracy": 0.43246854470548934, "step": 10091 }, { "epoch": 1.870967741935484, "grad_norm": 6.58203125, "learning_rate": 8.129032258064517e-06, "loss": 2.3151, "mean_token_accuracy": 0.536333231985406, "step": 10092 }, { "epoch": 1.871153133110864, "grad_norm": 6.26171875, "learning_rate": 8.128846866889136e-06, "loss": 2.9521, "mean_token_accuracy": 0.4721120475954388, "step": 10093 }, { "epoch": 1.871338524286244, "grad_norm": 10.15625, "learning_rate": 8.128661475713756e-06, "loss": 2.7966, "mean_token_accuracy": 0.4749476622470342, "step": 10094 }, { "epoch": 1.871523915461624, "grad_norm": 6.27734375, "learning_rate": 8.128476084538377e-06, "loss": 3.0571, "mean_token_accuracy": 0.4620541700187718, "step": 10095 }, { "epoch": 1.871709306637004, "grad_norm": 7.13671875, "learning_rate": 8.128290693362997e-06, "loss": 3.5073, "mean_token_accuracy": 0.41513429176418143, "step": 10096 }, { "epoch": 1.871894697812384, "grad_norm": 6.8828125, "learning_rate": 8.128105302187616e-06, "loss": 2.9056, "mean_token_accuracy": 0.4556067083196317, "step": 10097 }, { "epoch": 1.8720800889877642, "grad_norm": 5.71875, "learning_rate": 8.127919911012237e-06, "loss": 2.8175, "mean_token_accuracy": 0.4741745283018868, "step": 10098 }, { "epoch": 1.8722654801631442, "grad_norm": 7.9609375, "learning_rate": 8.127734519836856e-06, "loss": 2.2557, "mean_token_accuracy": 0.5196486361534906, "step": 10099 }, { "epoch": 1.8724508713385242, "grad_norm": 8.890625, "learning_rate": 8.127549128661476e-06, "loss": 3.0002, "mean_token_accuracy": 0.4482452243447357, "step": 10100 }, { "epoch": 1.8726362625139044, "grad_norm": 13.4296875, "learning_rate": 8.127363737486096e-06, "loss": 2.6813, "mean_token_accuracy": 0.47229085774797036, "step": 10101 }, { "epoch": 1.8728216536892845, "grad_norm": 6.83984375, "learning_rate": 8.127178346310717e-06, "loss": 2.9567, "mean_token_accuracy": 0.47224435590969455, "step": 10102 }, { "epoch": 1.8730070448646643, "grad_norm": 9.3125, "learning_rate": 8.126992955135336e-06, "loss": 3.1554, "mean_token_accuracy": 0.4297907488986784, "step": 10103 }, { "epoch": 1.8731924360400445, "grad_norm": 9.8515625, "learning_rate": 8.126807563959956e-06, "loss": 2.5324, "mean_token_accuracy": 0.5000681477443096, "step": 10104 }, { "epoch": 1.8733778272154247, "grad_norm": 10.28125, "learning_rate": 8.126622172784577e-06, "loss": 2.9469, "mean_token_accuracy": 0.4587481019065294, "step": 10105 }, { "epoch": 1.8735632183908046, "grad_norm": 5.6171875, "learning_rate": 8.126436781609196e-06, "loss": 2.9074, "mean_token_accuracy": 0.46707726763717805, "step": 10106 }, { "epoch": 1.8737486095661846, "grad_norm": 9.9453125, "learning_rate": 8.126251390433816e-06, "loss": 2.9564, "mean_token_accuracy": 0.4639951792708647, "step": 10107 }, { "epoch": 1.8739340007415648, "grad_norm": 6.44140625, "learning_rate": 8.126065999258435e-06, "loss": 3.4303, "mean_token_accuracy": 0.42714465937762824, "step": 10108 }, { "epoch": 1.8741193919169448, "grad_norm": 7.59375, "learning_rate": 8.125880608083055e-06, "loss": 2.7195, "mean_token_accuracy": 0.4618293547624675, "step": 10109 }, { "epoch": 1.8743047830923247, "grad_norm": 10.4140625, "learning_rate": 8.125695216907676e-06, "loss": 2.6316, "mean_token_accuracy": 0.5033467202141901, "step": 10110 }, { "epoch": 1.874490174267705, "grad_norm": 6.03125, "learning_rate": 8.125509825732296e-06, "loss": 3.3669, "mean_token_accuracy": 0.4336404896162839, "step": 10111 }, { "epoch": 1.874675565443085, "grad_norm": 6.53125, "learning_rate": 8.125324434556917e-06, "loss": 2.6287, "mean_token_accuracy": 0.4826904055390702, "step": 10112 }, { "epoch": 1.8748609566184649, "grad_norm": 7.76171875, "learning_rate": 8.125139043381536e-06, "loss": 3.0431, "mean_token_accuracy": 0.46282229592382584, "step": 10113 }, { "epoch": 1.875046347793845, "grad_norm": 6.64453125, "learning_rate": 8.124953652206156e-06, "loss": 3.5989, "mean_token_accuracy": 0.41878830740216877, "step": 10114 }, { "epoch": 1.875231738969225, "grad_norm": 5.984375, "learning_rate": 8.124768261030775e-06, "loss": 2.3945, "mean_token_accuracy": 0.5489812363301485, "step": 10115 }, { "epoch": 1.875417130144605, "grad_norm": 6.453125, "learning_rate": 8.124582869855395e-06, "loss": 2.7672, "mean_token_accuracy": 0.47782772445632365, "step": 10116 }, { "epoch": 1.8756025213199852, "grad_norm": 8.3515625, "learning_rate": 8.124397478680016e-06, "loss": 3.28, "mean_token_accuracy": 0.43952451708766715, "step": 10117 }, { "epoch": 1.8757879124953654, "grad_norm": 4.98046875, "learning_rate": 8.124212087504636e-06, "loss": 3.1113, "mean_token_accuracy": 0.46519708980345315, "step": 10118 }, { "epoch": 1.875973303670745, "grad_norm": 5.46484375, "learning_rate": 8.124026696329255e-06, "loss": 2.7496, "mean_token_accuracy": 0.48156620021528523, "step": 10119 }, { "epoch": 1.8761586948461253, "grad_norm": 6.48046875, "learning_rate": 8.123841305153876e-06, "loss": 2.9656, "mean_token_accuracy": 0.44422546314544736, "step": 10120 }, { "epoch": 1.8763440860215055, "grad_norm": 8.421875, "learning_rate": 8.123655913978496e-06, "loss": 2.5989, "mean_token_accuracy": 0.48301366428481884, "step": 10121 }, { "epoch": 1.8765294771968855, "grad_norm": 7.0703125, "learning_rate": 8.123470522803115e-06, "loss": 3.3154, "mean_token_accuracy": 0.4249201277955272, "step": 10122 }, { "epoch": 1.8767148683722654, "grad_norm": 7.765625, "learning_rate": 8.123285131627735e-06, "loss": 2.725, "mean_token_accuracy": 0.49347568208778175, "step": 10123 }, { "epoch": 1.8769002595476456, "grad_norm": 6.01953125, "learning_rate": 8.123099740452354e-06, "loss": 2.8754, "mean_token_accuracy": 0.45204143814747105, "step": 10124 }, { "epoch": 1.8770856507230256, "grad_norm": 7.3203125, "learning_rate": 8.122914349276975e-06, "loss": 2.4927, "mean_token_accuracy": 0.5107468838943959, "step": 10125 }, { "epoch": 1.8772710418984055, "grad_norm": 9.6171875, "learning_rate": 8.122728958101595e-06, "loss": 3.3556, "mean_token_accuracy": 0.4277137757585591, "step": 10126 }, { "epoch": 1.8774564330737857, "grad_norm": 9.953125, "learning_rate": 8.122543566926216e-06, "loss": 2.8704, "mean_token_accuracy": 0.4784792784564448, "step": 10127 }, { "epoch": 1.8776418242491657, "grad_norm": 6.70703125, "learning_rate": 8.122358175750835e-06, "loss": 2.3334, "mean_token_accuracy": 0.5217520415738679, "step": 10128 }, { "epoch": 1.8778272154245457, "grad_norm": 5.7421875, "learning_rate": 8.122172784575455e-06, "loss": 3.0312, "mean_token_accuracy": 0.4983666061705989, "step": 10129 }, { "epoch": 1.8780126065999259, "grad_norm": 11.40625, "learning_rate": 8.121987393400075e-06, "loss": 2.5276, "mean_token_accuracy": 0.49204720369420213, "step": 10130 }, { "epoch": 1.878197997775306, "grad_norm": 9.8515625, "learning_rate": 8.121802002224694e-06, "loss": 2.5466, "mean_token_accuracy": 0.49810574864108054, "step": 10131 }, { "epoch": 1.8783833889506858, "grad_norm": 6.7578125, "learning_rate": 8.121616611049315e-06, "loss": 2.622, "mean_token_accuracy": 0.48539847161572053, "step": 10132 }, { "epoch": 1.878568780126066, "grad_norm": 9.5234375, "learning_rate": 8.121431219873934e-06, "loss": 2.7148, "mean_token_accuracy": 0.4747528720277852, "step": 10133 }, { "epoch": 1.8787541713014462, "grad_norm": 9.421875, "learning_rate": 8.121245828698556e-06, "loss": 2.8769, "mean_token_accuracy": 0.4526807737245229, "step": 10134 }, { "epoch": 1.8789395624768261, "grad_norm": 7.88671875, "learning_rate": 8.121060437523175e-06, "loss": 3.5884, "mean_token_accuracy": 0.4243703199455412, "step": 10135 }, { "epoch": 1.879124953652206, "grad_norm": 5.921875, "learning_rate": 8.120875046347795e-06, "loss": 2.7137, "mean_token_accuracy": 0.48685774666490556, "step": 10136 }, { "epoch": 1.8793103448275863, "grad_norm": 6.82421875, "learning_rate": 8.120689655172414e-06, "loss": 2.399, "mean_token_accuracy": 0.5368325173630878, "step": 10137 }, { "epoch": 1.8794957360029663, "grad_norm": 6.6015625, "learning_rate": 8.120504263997034e-06, "loss": 2.0357, "mean_token_accuracy": 0.5621627023785729, "step": 10138 }, { "epoch": 1.8796811271783462, "grad_norm": 6.7265625, "learning_rate": 8.120318872821655e-06, "loss": 2.7578, "mean_token_accuracy": 0.46712586098935505, "step": 10139 }, { "epoch": 1.8798665183537264, "grad_norm": 6.33203125, "learning_rate": 8.120133481646274e-06, "loss": 2.7437, "mean_token_accuracy": 0.49986958789775693, "step": 10140 }, { "epoch": 1.8800519095291064, "grad_norm": 6.65625, "learning_rate": 8.119948090470894e-06, "loss": 2.8836, "mean_token_accuracy": 0.47645561565383393, "step": 10141 }, { "epoch": 1.8802373007044864, "grad_norm": 6.54296875, "learning_rate": 8.119762699295515e-06, "loss": 2.7803, "mean_token_accuracy": 0.4742000576535024, "step": 10142 }, { "epoch": 1.8804226918798665, "grad_norm": 7.60546875, "learning_rate": 8.119577308120135e-06, "loss": 2.5352, "mean_token_accuracy": 0.4858186506231199, "step": 10143 }, { "epoch": 1.8806080830552465, "grad_norm": 6.7734375, "learning_rate": 8.119391916944754e-06, "loss": 3.0937, "mean_token_accuracy": 0.44979203802733214, "step": 10144 }, { "epoch": 1.8807934742306265, "grad_norm": 6.48046875, "learning_rate": 8.119206525769374e-06, "loss": 3.2022, "mean_token_accuracy": 0.4332659251769464, "step": 10145 }, { "epoch": 1.8809788654060067, "grad_norm": 8.1484375, "learning_rate": 8.119021134593993e-06, "loss": 2.9449, "mean_token_accuracy": 0.4693407100046104, "step": 10146 }, { "epoch": 1.8811642565813869, "grad_norm": 6.12109375, "learning_rate": 8.118835743418614e-06, "loss": 3.2414, "mean_token_accuracy": 0.44424007744433686, "step": 10147 }, { "epoch": 1.8813496477567668, "grad_norm": 6.25390625, "learning_rate": 8.118650352243234e-06, "loss": 3.5111, "mean_token_accuracy": 0.41730474732006123, "step": 10148 }, { "epoch": 1.8815350389321468, "grad_norm": 6.90625, "learning_rate": 8.118464961067853e-06, "loss": 3.1791, "mean_token_accuracy": 0.4261992619926199, "step": 10149 }, { "epoch": 1.881720430107527, "grad_norm": 6.609375, "learning_rate": 8.118279569892473e-06, "loss": 2.8779, "mean_token_accuracy": 0.4701067615658363, "step": 10150 }, { "epoch": 1.881905821282907, "grad_norm": 6.36328125, "learning_rate": 8.118094178717094e-06, "loss": 3.5744, "mean_token_accuracy": 0.40881337934696044, "step": 10151 }, { "epoch": 1.882091212458287, "grad_norm": 6.55078125, "learning_rate": 8.117908787541714e-06, "loss": 3.0412, "mean_token_accuracy": 0.4582822085889571, "step": 10152 }, { "epoch": 1.8822766036336671, "grad_norm": 6.6875, "learning_rate": 8.117723396366333e-06, "loss": 2.565, "mean_token_accuracy": 0.4875977653631285, "step": 10153 }, { "epoch": 1.882461994809047, "grad_norm": 6.421875, "learning_rate": 8.117538005190954e-06, "loss": 2.9909, "mean_token_accuracy": 0.47181910226122176, "step": 10154 }, { "epoch": 1.882647385984427, "grad_norm": 8.9296875, "learning_rate": 8.117352614015574e-06, "loss": 2.7223, "mean_token_accuracy": 0.46560920830993824, "step": 10155 }, { "epoch": 1.8828327771598072, "grad_norm": 10.46875, "learning_rate": 8.117167222840193e-06, "loss": 2.7738, "mean_token_accuracy": 0.46884038916644755, "step": 10156 }, { "epoch": 1.8830181683351872, "grad_norm": 5.6015625, "learning_rate": 8.116981831664814e-06, "loss": 2.7535, "mean_token_accuracy": 0.48962117280747275, "step": 10157 }, { "epoch": 1.8832035595105672, "grad_norm": 6.9296875, "learning_rate": 8.116796440489432e-06, "loss": 2.5823, "mean_token_accuracy": 0.48230888429752067, "step": 10158 }, { "epoch": 1.8833889506859474, "grad_norm": 11.03125, "learning_rate": 8.116611049314054e-06, "loss": 3.3281, "mean_token_accuracy": 0.4120691683331373, "step": 10159 }, { "epoch": 1.8835743418613276, "grad_norm": 6.1875, "learning_rate": 8.116425658138673e-06, "loss": 2.7301, "mean_token_accuracy": 0.4890616004605642, "step": 10160 }, { "epoch": 1.8837597330367073, "grad_norm": 6.9296875, "learning_rate": 8.116240266963294e-06, "loss": 2.8867, "mean_token_accuracy": 0.4628535903015232, "step": 10161 }, { "epoch": 1.8839451242120875, "grad_norm": 7.45703125, "learning_rate": 8.116054875787913e-06, "loss": 2.2121, "mean_token_accuracy": 0.5138355620283331, "step": 10162 }, { "epoch": 1.8841305153874677, "grad_norm": 6.12109375, "learning_rate": 8.115869484612533e-06, "loss": 2.8463, "mean_token_accuracy": 0.47619047619047616, "step": 10163 }, { "epoch": 1.8843159065628476, "grad_norm": 6.17578125, "learning_rate": 8.115684093437154e-06, "loss": 2.9441, "mean_token_accuracy": 0.436, "step": 10164 }, { "epoch": 1.8845012977382276, "grad_norm": 6.421875, "learning_rate": 8.115498702261772e-06, "loss": 3.2196, "mean_token_accuracy": 0.42308148499584863, "step": 10165 }, { "epoch": 1.8846866889136078, "grad_norm": 8.53125, "learning_rate": 8.115313311086393e-06, "loss": 2.2704, "mean_token_accuracy": 0.5262193725669796, "step": 10166 }, { "epoch": 1.8848720800889878, "grad_norm": 5.62890625, "learning_rate": 8.115127919911013e-06, "loss": 3.1664, "mean_token_accuracy": 0.4422132976349844, "step": 10167 }, { "epoch": 1.8850574712643677, "grad_norm": 8.8671875, "learning_rate": 8.114942528735634e-06, "loss": 2.7669, "mean_token_accuracy": 0.4784598214285714, "step": 10168 }, { "epoch": 1.885242862439748, "grad_norm": 8.25, "learning_rate": 8.114757137560253e-06, "loss": 3.097, "mean_token_accuracy": 0.4444283646888567, "step": 10169 }, { "epoch": 1.885428253615128, "grad_norm": 5.8359375, "learning_rate": 8.114571746384873e-06, "loss": 2.7841, "mean_token_accuracy": 0.4897499436810092, "step": 10170 }, { "epoch": 1.8856136447905079, "grad_norm": 6.13671875, "learning_rate": 8.114386355209492e-06, "loss": 3.0454, "mean_token_accuracy": 0.4658125609246623, "step": 10171 }, { "epoch": 1.885799035965888, "grad_norm": 5.96875, "learning_rate": 8.114200964034112e-06, "loss": 3.1056, "mean_token_accuracy": 0.4548320983761627, "step": 10172 }, { "epoch": 1.8859844271412682, "grad_norm": 5.91015625, "learning_rate": 8.114015572858733e-06, "loss": 2.8177, "mean_token_accuracy": 0.4608807182556648, "step": 10173 }, { "epoch": 1.886169818316648, "grad_norm": 5.03515625, "learning_rate": 8.113830181683352e-06, "loss": 2.718, "mean_token_accuracy": 0.4823258196721312, "step": 10174 }, { "epoch": 1.8863552094920282, "grad_norm": 5.984375, "learning_rate": 8.113644790507972e-06, "loss": 2.838, "mean_token_accuracy": 0.49235833529273454, "step": 10175 }, { "epoch": 1.8865406006674084, "grad_norm": 8.3984375, "learning_rate": 8.113459399332593e-06, "loss": 2.6655, "mean_token_accuracy": 0.4907505596795098, "step": 10176 }, { "epoch": 1.8867259918427883, "grad_norm": 6.9453125, "learning_rate": 8.113274008157213e-06, "loss": 2.5754, "mean_token_accuracy": 0.5122862642962291, "step": 10177 }, { "epoch": 1.8869113830181683, "grad_norm": 6.48828125, "learning_rate": 8.113088616981832e-06, "loss": 3.4611, "mean_token_accuracy": 0.4354364540931705, "step": 10178 }, { "epoch": 1.8870967741935485, "grad_norm": 6.7578125, "learning_rate": 8.112903225806452e-06, "loss": 2.8304, "mean_token_accuracy": 0.4632697155203457, "step": 10179 }, { "epoch": 1.8872821653689285, "grad_norm": 6.7421875, "learning_rate": 8.112717834631071e-06, "loss": 3.308, "mean_token_accuracy": 0.42641209228321403, "step": 10180 }, { "epoch": 1.8874675565443084, "grad_norm": 6.484375, "learning_rate": 8.112532443455692e-06, "loss": 3.098, "mean_token_accuracy": 0.46327047565437657, "step": 10181 }, { "epoch": 1.8876529477196886, "grad_norm": 4.94140625, "learning_rate": 8.112347052280312e-06, "loss": 2.707, "mean_token_accuracy": 0.48258885686839575, "step": 10182 }, { "epoch": 1.8878383388950686, "grad_norm": 7.453125, "learning_rate": 8.112161661104933e-06, "loss": 2.1207, "mean_token_accuracy": 0.5361473347060169, "step": 10183 }, { "epoch": 1.8880237300704485, "grad_norm": 6.0859375, "learning_rate": 8.111976269929552e-06, "loss": 3.6855, "mean_token_accuracy": 0.40639486508974204, "step": 10184 }, { "epoch": 1.8882091212458287, "grad_norm": 6.98828125, "learning_rate": 8.111790878754172e-06, "loss": 3.0511, "mean_token_accuracy": 0.4448014394036756, "step": 10185 }, { "epoch": 1.8883945124212087, "grad_norm": 7.4453125, "learning_rate": 8.111605487578793e-06, "loss": 2.46, "mean_token_accuracy": 0.49883213161368944, "step": 10186 }, { "epoch": 1.8885799035965887, "grad_norm": 6.20703125, "learning_rate": 8.111420096403411e-06, "loss": 3.4081, "mean_token_accuracy": 0.41220387652548457, "step": 10187 }, { "epoch": 1.8887652947719689, "grad_norm": 6.3203125, "learning_rate": 8.111234705228032e-06, "loss": 3.4229, "mean_token_accuracy": 0.4361374553516443, "step": 10188 }, { "epoch": 1.888950685947349, "grad_norm": 9.5078125, "learning_rate": 8.11104931405265e-06, "loss": 2.6671, "mean_token_accuracy": 0.4957492703971577, "step": 10189 }, { "epoch": 1.8891360771227288, "grad_norm": 6.25, "learning_rate": 8.110863922877271e-06, "loss": 2.3141, "mean_token_accuracy": 0.4990070921985816, "step": 10190 }, { "epoch": 1.889321468298109, "grad_norm": 6.43359375, "learning_rate": 8.110678531701892e-06, "loss": 3.0731, "mean_token_accuracy": 0.4689890710382514, "step": 10191 }, { "epoch": 1.8895068594734892, "grad_norm": 6.3828125, "learning_rate": 8.110493140526512e-06, "loss": 2.8349, "mean_token_accuracy": 0.4778192855475134, "step": 10192 }, { "epoch": 1.8896922506488691, "grad_norm": 6.94140625, "learning_rate": 8.110307749351133e-06, "loss": 2.9536, "mean_token_accuracy": 0.47776084245531386, "step": 10193 }, { "epoch": 1.889877641824249, "grad_norm": 6.0546875, "learning_rate": 8.110122358175751e-06, "loss": 3.0804, "mean_token_accuracy": 0.4528101802757158, "step": 10194 }, { "epoch": 1.8900630329996293, "grad_norm": 6.65625, "learning_rate": 8.109936967000372e-06, "loss": 3.7011, "mean_token_accuracy": 0.3937895026106073, "step": 10195 }, { "epoch": 1.8902484241750093, "grad_norm": 6.2734375, "learning_rate": 8.10975157582499e-06, "loss": 2.7686, "mean_token_accuracy": 0.4699491129143171, "step": 10196 }, { "epoch": 1.8904338153503892, "grad_norm": 7.3046875, "learning_rate": 8.109566184649611e-06, "loss": 3.0357, "mean_token_accuracy": 0.45566166439290584, "step": 10197 }, { "epoch": 1.8906192065257694, "grad_norm": 5.796875, "learning_rate": 8.109380793474232e-06, "loss": 3.9371, "mean_token_accuracy": 0.38888207141980613, "step": 10198 }, { "epoch": 1.8908045977011494, "grad_norm": 5.8671875, "learning_rate": 8.109195402298852e-06, "loss": 2.8585, "mean_token_accuracy": 0.4585698070374574, "step": 10199 }, { "epoch": 1.8909899888765294, "grad_norm": 7.375, "learning_rate": 8.109010011123471e-06, "loss": 3.3664, "mean_token_accuracy": 0.43427437141596825, "step": 10200 }, { "epoch": 1.8911753800519095, "grad_norm": 6.01953125, "learning_rate": 8.108824619948091e-06, "loss": 2.836, "mean_token_accuracy": 0.4595010779180782, "step": 10201 }, { "epoch": 1.8913607712272897, "grad_norm": 6.671875, "learning_rate": 8.108639228772712e-06, "loss": 3.0316, "mean_token_accuracy": 0.46031488394741116, "step": 10202 }, { "epoch": 1.8915461624026695, "grad_norm": 5.6171875, "learning_rate": 8.10845383759733e-06, "loss": 2.6315, "mean_token_accuracy": 0.47295864262990456, "step": 10203 }, { "epoch": 1.8917315535780497, "grad_norm": 5.859375, "learning_rate": 8.108268446421951e-06, "loss": 2.5142, "mean_token_accuracy": 0.49154471544715445, "step": 10204 }, { "epoch": 1.8919169447534299, "grad_norm": 6.11328125, "learning_rate": 8.10808305524657e-06, "loss": 2.682, "mean_token_accuracy": 0.4906141603754336, "step": 10205 }, { "epoch": 1.8921023359288098, "grad_norm": 7.28125, "learning_rate": 8.10789766407119e-06, "loss": 3.5345, "mean_token_accuracy": 0.4322392414296134, "step": 10206 }, { "epoch": 1.8922877271041898, "grad_norm": 5.94921875, "learning_rate": 8.107712272895811e-06, "loss": 4.0479, "mean_token_accuracy": 0.3861627906976744, "step": 10207 }, { "epoch": 1.89247311827957, "grad_norm": 7.0234375, "learning_rate": 8.107526881720431e-06, "loss": 3.6692, "mean_token_accuracy": 0.42349911912183225, "step": 10208 }, { "epoch": 1.89265850945495, "grad_norm": 7.46875, "learning_rate": 8.10734149054505e-06, "loss": 3.2852, "mean_token_accuracy": 0.45218821148494814, "step": 10209 }, { "epoch": 1.89284390063033, "grad_norm": 5.80859375, "learning_rate": 8.10715609936967e-06, "loss": 3.2614, "mean_token_accuracy": 0.4279962103268593, "step": 10210 }, { "epoch": 1.8930292918057101, "grad_norm": 5.921875, "learning_rate": 8.106970708194291e-06, "loss": 3.1745, "mean_token_accuracy": 0.4562043795620438, "step": 10211 }, { "epoch": 1.89321468298109, "grad_norm": 6.11328125, "learning_rate": 8.10678531701891e-06, "loss": 2.6065, "mean_token_accuracy": 0.4901935316274841, "step": 10212 }, { "epoch": 1.89340007415647, "grad_norm": 7.015625, "learning_rate": 8.10659992584353e-06, "loss": 2.9656, "mean_token_accuracy": 0.4572724612232953, "step": 10213 }, { "epoch": 1.8935854653318502, "grad_norm": 5.3125, "learning_rate": 8.10641453466815e-06, "loss": 2.4477, "mean_token_accuracy": 0.5052414231257941, "step": 10214 }, { "epoch": 1.8937708565072302, "grad_norm": 5.796875, "learning_rate": 8.106229143492772e-06, "loss": 3.4245, "mean_token_accuracy": 0.41787858572381587, "step": 10215 }, { "epoch": 1.8939562476826102, "grad_norm": 5.20703125, "learning_rate": 8.10604375231739e-06, "loss": 2.6053, "mean_token_accuracy": 0.4931606343057741, "step": 10216 }, { "epoch": 1.8941416388579904, "grad_norm": 5.02734375, "learning_rate": 8.10585836114201e-06, "loss": 2.7455, "mean_token_accuracy": 0.46941544885177455, "step": 10217 }, { "epoch": 1.8943270300333706, "grad_norm": 5.7578125, "learning_rate": 8.10567296996663e-06, "loss": 3.325, "mean_token_accuracy": 0.42923850574712646, "step": 10218 }, { "epoch": 1.8945124212087505, "grad_norm": 5.6484375, "learning_rate": 8.10548757879125e-06, "loss": 2.8436, "mean_token_accuracy": 0.48911245865490627, "step": 10219 }, { "epoch": 1.8946978123841305, "grad_norm": 5.375, "learning_rate": 8.10530218761587e-06, "loss": 3.3293, "mean_token_accuracy": 0.43183415319747015, "step": 10220 }, { "epoch": 1.8948832035595107, "grad_norm": 5.296875, "learning_rate": 8.10511679644049e-06, "loss": 3.2274, "mean_token_accuracy": 0.4331697977743209, "step": 10221 }, { "epoch": 1.8950685947348906, "grad_norm": 6.359375, "learning_rate": 8.10493140526511e-06, "loss": 3.0228, "mean_token_accuracy": 0.4664996043260353, "step": 10222 }, { "epoch": 1.8952539859102706, "grad_norm": 6.21875, "learning_rate": 8.10474601408973e-06, "loss": 2.8443, "mean_token_accuracy": 0.4662805662805663, "step": 10223 }, { "epoch": 1.8954393770856508, "grad_norm": 5.859375, "learning_rate": 8.104560622914351e-06, "loss": 3.2023, "mean_token_accuracy": 0.4566687154271666, "step": 10224 }, { "epoch": 1.8956247682610308, "grad_norm": 6.26953125, "learning_rate": 8.10437523173897e-06, "loss": 2.6744, "mean_token_accuracy": 0.49914129586260736, "step": 10225 }, { "epoch": 1.8958101594364107, "grad_norm": 7.0703125, "learning_rate": 8.10418984056359e-06, "loss": 2.3553, "mean_token_accuracy": 0.5187844595460956, "step": 10226 }, { "epoch": 1.895995550611791, "grad_norm": 5.7734375, "learning_rate": 8.104004449388209e-06, "loss": 2.9762, "mean_token_accuracy": 0.46474358974358976, "step": 10227 }, { "epoch": 1.896180941787171, "grad_norm": 11.4140625, "learning_rate": 8.10381905821283e-06, "loss": 2.8779, "mean_token_accuracy": 0.43937694704049846, "step": 10228 }, { "epoch": 1.8963663329625509, "grad_norm": 6.36328125, "learning_rate": 8.10363366703745e-06, "loss": 4.1701, "mean_token_accuracy": 0.3883248730964467, "step": 10229 }, { "epoch": 1.896551724137931, "grad_norm": 10.15625, "learning_rate": 8.103448275862069e-06, "loss": 2.7331, "mean_token_accuracy": 0.4723637789102313, "step": 10230 }, { "epoch": 1.8967371153133112, "grad_norm": 6.6796875, "learning_rate": 8.103262884686691e-06, "loss": 3.0446, "mean_token_accuracy": 0.4592933947772657, "step": 10231 }, { "epoch": 1.896922506488691, "grad_norm": 7.41796875, "learning_rate": 8.10307749351131e-06, "loss": 3.2741, "mean_token_accuracy": 0.43571988502743664, "step": 10232 }, { "epoch": 1.8971078976640712, "grad_norm": 5.05078125, "learning_rate": 8.10289210233593e-06, "loss": 3.0238, "mean_token_accuracy": 0.46801980198019805, "step": 10233 }, { "epoch": 1.8972932888394514, "grad_norm": 5.5703125, "learning_rate": 8.102706711160549e-06, "loss": 3.1857, "mean_token_accuracy": 0.4452408256880734, "step": 10234 }, { "epoch": 1.8974786800148313, "grad_norm": 5.5625, "learning_rate": 8.10252131998517e-06, "loss": 2.776, "mean_token_accuracy": 0.4755217220107626, "step": 10235 }, { "epoch": 1.8976640711902113, "grad_norm": 6.79296875, "learning_rate": 8.10233592880979e-06, "loss": 2.7818, "mean_token_accuracy": 0.48226846424384523, "step": 10236 }, { "epoch": 1.8978494623655915, "grad_norm": 7.09375, "learning_rate": 8.102150537634409e-06, "loss": 2.9528, "mean_token_accuracy": 0.4675055309734513, "step": 10237 }, { "epoch": 1.8980348535409715, "grad_norm": 6.4765625, "learning_rate": 8.10196514645903e-06, "loss": 2.9773, "mean_token_accuracy": 0.44704870444038275, "step": 10238 }, { "epoch": 1.8982202447163514, "grad_norm": 5.7265625, "learning_rate": 8.10177975528365e-06, "loss": 2.5908, "mean_token_accuracy": 0.5, "step": 10239 }, { "epoch": 1.8984056358917316, "grad_norm": 7.66796875, "learning_rate": 8.10159436410827e-06, "loss": 3.0848, "mean_token_accuracy": 0.4497307001795332, "step": 10240 }, { "epoch": 1.8985910270671116, "grad_norm": 7.5703125, "learning_rate": 8.101408972932889e-06, "loss": 3.0003, "mean_token_accuracy": 0.47577142857142857, "step": 10241 }, { "epoch": 1.8987764182424915, "grad_norm": 5.84765625, "learning_rate": 8.10122358175751e-06, "loss": 2.5143, "mean_token_accuracy": 0.4969033558980268, "step": 10242 }, { "epoch": 1.8989618094178717, "grad_norm": 9.3671875, "learning_rate": 8.101038190582128e-06, "loss": 3.0333, "mean_token_accuracy": 0.46844919786096256, "step": 10243 }, { "epoch": 1.899147200593252, "grad_norm": 8.6953125, "learning_rate": 8.100852799406749e-06, "loss": 3.0233, "mean_token_accuracy": 0.4796336796063423, "step": 10244 }, { "epoch": 1.8993325917686317, "grad_norm": 6.50390625, "learning_rate": 8.10066740823137e-06, "loss": 2.7202, "mean_token_accuracy": 0.49641932043272896, "step": 10245 }, { "epoch": 1.8995179829440119, "grad_norm": 10.484375, "learning_rate": 8.100482017055988e-06, "loss": 2.6624, "mean_token_accuracy": 0.5025544202576633, "step": 10246 }, { "epoch": 1.899703374119392, "grad_norm": 6.5234375, "learning_rate": 8.100296625880609e-06, "loss": 2.9508, "mean_token_accuracy": 0.4321632454755253, "step": 10247 }, { "epoch": 1.899888765294772, "grad_norm": 11.546875, "learning_rate": 8.100111234705229e-06, "loss": 2.406, "mean_token_accuracy": 0.5115220901218259, "step": 10248 }, { "epoch": 1.900074156470152, "grad_norm": 6.73828125, "learning_rate": 8.09992584352985e-06, "loss": 3.5183, "mean_token_accuracy": 0.41292829388908514, "step": 10249 }, { "epoch": 1.9002595476455322, "grad_norm": 8.15625, "learning_rate": 8.099740452354468e-06, "loss": 3.1894, "mean_token_accuracy": 0.4647098065376918, "step": 10250 }, { "epoch": 1.9004449388209121, "grad_norm": 6.640625, "learning_rate": 8.099555061179089e-06, "loss": 3.3746, "mean_token_accuracy": 0.4244885045349083, "step": 10251 }, { "epoch": 1.9006303299962921, "grad_norm": 9.0390625, "learning_rate": 8.099369670003708e-06, "loss": 2.7523, "mean_token_accuracy": 0.4865411348414002, "step": 10252 }, { "epoch": 1.9008157211716723, "grad_norm": 6.78515625, "learning_rate": 8.099184278828328e-06, "loss": 2.3637, "mean_token_accuracy": 0.5105720586327669, "step": 10253 }, { "epoch": 1.9010011123470523, "grad_norm": 7.76171875, "learning_rate": 8.098998887652949e-06, "loss": 2.9138, "mean_token_accuracy": 0.45197978096040436, "step": 10254 }, { "epoch": 1.9011865035224322, "grad_norm": 8.3671875, "learning_rate": 8.098813496477569e-06, "loss": 2.8814, "mean_token_accuracy": 0.45235294117647057, "step": 10255 }, { "epoch": 1.9013718946978124, "grad_norm": 7.87109375, "learning_rate": 8.098628105302188e-06, "loss": 3.1355, "mean_token_accuracy": 0.4411764705882353, "step": 10256 }, { "epoch": 1.9015572858731924, "grad_norm": 6.16796875, "learning_rate": 8.098442714126808e-06, "loss": 2.8745, "mean_token_accuracy": 0.47480106100795755, "step": 10257 }, { "epoch": 1.9017426770485724, "grad_norm": 7.9140625, "learning_rate": 8.098257322951429e-06, "loss": 2.9194, "mean_token_accuracy": 0.4636769248369752, "step": 10258 }, { "epoch": 1.9019280682239526, "grad_norm": 7.90234375, "learning_rate": 8.098071931776048e-06, "loss": 2.6303, "mean_token_accuracy": 0.5020851762921774, "step": 10259 }, { "epoch": 1.9021134593993327, "grad_norm": 6.05078125, "learning_rate": 8.097886540600668e-06, "loss": 2.4992, "mean_token_accuracy": 0.5027665068240502, "step": 10260 }, { "epoch": 1.9022988505747125, "grad_norm": 5.68359375, "learning_rate": 8.097701149425287e-06, "loss": 2.5759, "mean_token_accuracy": 0.5078587992785365, "step": 10261 }, { "epoch": 1.9024842417500927, "grad_norm": 8.6484375, "learning_rate": 8.097515758249908e-06, "loss": 2.6642, "mean_token_accuracy": 0.47744116499804967, "step": 10262 }, { "epoch": 1.9026696329254729, "grad_norm": 6.89453125, "learning_rate": 8.097330367074528e-06, "loss": 3.0275, "mean_token_accuracy": 0.4750856258976909, "step": 10263 }, { "epoch": 1.9028550241008528, "grad_norm": 8.03125, "learning_rate": 8.097144975899148e-06, "loss": 2.2295, "mean_token_accuracy": 0.5027366172740622, "step": 10264 }, { "epoch": 1.9030404152762328, "grad_norm": 5.75390625, "learning_rate": 8.096959584723767e-06, "loss": 3.2299, "mean_token_accuracy": 0.4493662943907642, "step": 10265 }, { "epoch": 1.903225806451613, "grad_norm": 7.70703125, "learning_rate": 8.096774193548388e-06, "loss": 3.1596, "mean_token_accuracy": 0.4524808332127875, "step": 10266 }, { "epoch": 1.903411197626993, "grad_norm": 8.1875, "learning_rate": 8.096588802373008e-06, "loss": 2.4824, "mean_token_accuracy": 0.5213193885760258, "step": 10267 }, { "epoch": 1.903596588802373, "grad_norm": 8.078125, "learning_rate": 8.096403411197627e-06, "loss": 1.9392, "mean_token_accuracy": 0.5655058043117744, "step": 10268 }, { "epoch": 1.9037819799777531, "grad_norm": 6.98828125, "learning_rate": 8.096218020022248e-06, "loss": 2.8976, "mean_token_accuracy": 0.4680600033709759, "step": 10269 }, { "epoch": 1.903967371153133, "grad_norm": 8.234375, "learning_rate": 8.096032628846866e-06, "loss": 3.0843, "mean_token_accuracy": 0.4384149046281236, "step": 10270 }, { "epoch": 1.904152762328513, "grad_norm": 10.203125, "learning_rate": 8.095847237671487e-06, "loss": 2.8745, "mean_token_accuracy": 0.441351888667992, "step": 10271 }, { "epoch": 1.9043381535038932, "grad_norm": 9.7265625, "learning_rate": 8.095661846496107e-06, "loss": 2.5442, "mean_token_accuracy": 0.5048254079663099, "step": 10272 }, { "epoch": 1.9045235446792734, "grad_norm": 6.72265625, "learning_rate": 8.095476455320728e-06, "loss": 3.1269, "mean_token_accuracy": 0.43475572047000616, "step": 10273 }, { "epoch": 1.9047089358546532, "grad_norm": 10.2265625, "learning_rate": 8.095291064145348e-06, "loss": 2.5302, "mean_token_accuracy": 0.5091968117719191, "step": 10274 }, { "epoch": 1.9048943270300334, "grad_norm": 7.47265625, "learning_rate": 8.095105672969967e-06, "loss": 3.5961, "mean_token_accuracy": 0.40953098106712565, "step": 10275 }, { "epoch": 1.9050797182054136, "grad_norm": 7.27734375, "learning_rate": 8.094920281794588e-06, "loss": 2.9623, "mean_token_accuracy": 0.4692132269099202, "step": 10276 }, { "epoch": 1.9052651093807935, "grad_norm": 7.32421875, "learning_rate": 8.094734890619206e-06, "loss": 3.2354, "mean_token_accuracy": 0.439851445180042, "step": 10277 }, { "epoch": 1.9054505005561735, "grad_norm": 5.89453125, "learning_rate": 8.094549499443827e-06, "loss": 3.0347, "mean_token_accuracy": 0.4633295978084921, "step": 10278 }, { "epoch": 1.9056358917315537, "grad_norm": 6.83984375, "learning_rate": 8.094364108268447e-06, "loss": 2.9089, "mean_token_accuracy": 0.4681907250163292, "step": 10279 }, { "epoch": 1.9058212829069336, "grad_norm": 6.67578125, "learning_rate": 8.094178717093068e-06, "loss": 3.2072, "mean_token_accuracy": 0.45765587445214195, "step": 10280 }, { "epoch": 1.9060066740823136, "grad_norm": 5.55078125, "learning_rate": 8.093993325917687e-06, "loss": 2.7036, "mean_token_accuracy": 0.47793589190743135, "step": 10281 }, { "epoch": 1.9061920652576938, "grad_norm": 6.5859375, "learning_rate": 8.093807934742307e-06, "loss": 2.8824, "mean_token_accuracy": 0.4811320754716981, "step": 10282 }, { "epoch": 1.9063774564330738, "grad_norm": 7.06640625, "learning_rate": 8.093622543566928e-06, "loss": 2.8615, "mean_token_accuracy": 0.47151898734177217, "step": 10283 }, { "epoch": 1.9065628476084537, "grad_norm": 6.80859375, "learning_rate": 8.093437152391546e-06, "loss": 2.6127, "mean_token_accuracy": 0.49407343162763806, "step": 10284 }, { "epoch": 1.906748238783834, "grad_norm": 7.34765625, "learning_rate": 8.093251761216167e-06, "loss": 2.861, "mean_token_accuracy": 0.4880141307090588, "step": 10285 }, { "epoch": 1.906933629959214, "grad_norm": 6.515625, "learning_rate": 8.093066370040786e-06, "loss": 2.8208, "mean_token_accuracy": 0.49135864765989656, "step": 10286 }, { "epoch": 1.9071190211345939, "grad_norm": 6.43359375, "learning_rate": 8.092880978865406e-06, "loss": 2.7112, "mean_token_accuracy": 0.488235982675875, "step": 10287 }, { "epoch": 1.907304412309974, "grad_norm": 7.92578125, "learning_rate": 8.092695587690027e-06, "loss": 3.3615, "mean_token_accuracy": 0.4293885332970176, "step": 10288 }, { "epoch": 1.9074898034853542, "grad_norm": 6.91796875, "learning_rate": 8.092510196514647e-06, "loss": 2.9566, "mean_token_accuracy": 0.4629401625215464, "step": 10289 }, { "epoch": 1.907675194660734, "grad_norm": 6.5625, "learning_rate": 8.092324805339266e-06, "loss": 2.8477, "mean_token_accuracy": 0.46130141045395284, "step": 10290 }, { "epoch": 1.9078605858361142, "grad_norm": 7.1875, "learning_rate": 8.092139414163887e-06, "loss": 2.9925, "mean_token_accuracy": 0.48203479576399394, "step": 10291 }, { "epoch": 1.9080459770114944, "grad_norm": 8.234375, "learning_rate": 8.091954022988507e-06, "loss": 2.7513, "mean_token_accuracy": 0.4864825791528119, "step": 10292 }, { "epoch": 1.9082313681868743, "grad_norm": 9.1953125, "learning_rate": 8.091768631813126e-06, "loss": 2.8324, "mean_token_accuracy": 0.4657139692035006, "step": 10293 }, { "epoch": 1.9084167593622543, "grad_norm": 9.671875, "learning_rate": 8.091583240637746e-06, "loss": 2.9701, "mean_token_accuracy": 0.4721758128775222, "step": 10294 }, { "epoch": 1.9086021505376345, "grad_norm": 10.4921875, "learning_rate": 8.091397849462365e-06, "loss": 2.7357, "mean_token_accuracy": 0.4798964968152866, "step": 10295 }, { "epoch": 1.9087875417130145, "grad_norm": 9.765625, "learning_rate": 8.091212458286987e-06, "loss": 2.5448, "mean_token_accuracy": 0.48547666335650447, "step": 10296 }, { "epoch": 1.9089729328883944, "grad_norm": 5.48828125, "learning_rate": 8.091027067111606e-06, "loss": 4.1756, "mean_token_accuracy": 0.37635776494764656, "step": 10297 }, { "epoch": 1.9091583240637746, "grad_norm": 9.2265625, "learning_rate": 8.090841675936227e-06, "loss": 2.7633, "mean_token_accuracy": 0.4745273950656841, "step": 10298 }, { "epoch": 1.9093437152391546, "grad_norm": 7.12890625, "learning_rate": 8.090656284760845e-06, "loss": 3.1784, "mean_token_accuracy": 0.4439259604915228, "step": 10299 }, { "epoch": 1.9095291064145345, "grad_norm": 8.71875, "learning_rate": 8.090470893585466e-06, "loss": 2.5782, "mean_token_accuracy": 0.48118939883645767, "step": 10300 }, { "epoch": 1.9097144975899147, "grad_norm": 6.0859375, "learning_rate": 8.090285502410086e-06, "loss": 2.1835, "mean_token_accuracy": 0.5498420720151611, "step": 10301 }, { "epoch": 1.909899888765295, "grad_norm": 9.515625, "learning_rate": 8.090100111234705e-06, "loss": 2.4372, "mean_token_accuracy": 0.5194482415807133, "step": 10302 }, { "epoch": 1.9100852799406747, "grad_norm": 10.984375, "learning_rate": 8.089914720059326e-06, "loss": 2.9623, "mean_token_accuracy": 0.451738006320023, "step": 10303 }, { "epoch": 1.9102706711160549, "grad_norm": 7.359375, "learning_rate": 8.089729328883946e-06, "loss": 3.4499, "mean_token_accuracy": 0.44393766628658304, "step": 10304 }, { "epoch": 1.910456062291435, "grad_norm": 8.8984375, "learning_rate": 8.089543937708567e-06, "loss": 2.9217, "mean_token_accuracy": 0.4655773158716809, "step": 10305 }, { "epoch": 1.910641453466815, "grad_norm": 10.921875, "learning_rate": 8.089358546533185e-06, "loss": 3.0264, "mean_token_accuracy": 0.46019988242210463, "step": 10306 }, { "epoch": 1.910826844642195, "grad_norm": 6.46875, "learning_rate": 8.089173155357806e-06, "loss": 3.2135, "mean_token_accuracy": 0.4380597014925373, "step": 10307 }, { "epoch": 1.9110122358175752, "grad_norm": 8.4375, "learning_rate": 8.088987764182425e-06, "loss": 3.6797, "mean_token_accuracy": 0.3981517636339971, "step": 10308 }, { "epoch": 1.9111976269929551, "grad_norm": 9.6015625, "learning_rate": 8.088802373007045e-06, "loss": 3.0903, "mean_token_accuracy": 0.46560477849676457, "step": 10309 }, { "epoch": 1.9113830181683351, "grad_norm": 10.5, "learning_rate": 8.088616981831666e-06, "loss": 3.372, "mean_token_accuracy": 0.43158002876515306, "step": 10310 }, { "epoch": 1.9115684093437153, "grad_norm": 7.09765625, "learning_rate": 8.088431590656284e-06, "loss": 2.5558, "mean_token_accuracy": 0.4810109100952907, "step": 10311 }, { "epoch": 1.9117538005190953, "grad_norm": 12.109375, "learning_rate": 8.088246199480907e-06, "loss": 2.385, "mean_token_accuracy": 0.5205712342740565, "step": 10312 }, { "epoch": 1.9119391916944752, "grad_norm": 14.5390625, "learning_rate": 8.088060808305525e-06, "loss": 2.6275, "mean_token_accuracy": 0.47574039067422813, "step": 10313 }, { "epoch": 1.9121245828698554, "grad_norm": 8.2421875, "learning_rate": 8.087875417130146e-06, "loss": 2.5504, "mean_token_accuracy": 0.5081716637272193, "step": 10314 }, { "epoch": 1.9123099740452354, "grad_norm": 7.81640625, "learning_rate": 8.087690025954765e-06, "loss": 2.3461, "mean_token_accuracy": 0.5279630123588512, "step": 10315 }, { "epoch": 1.9124953652206154, "grad_norm": 8.2578125, "learning_rate": 8.087504634779385e-06, "loss": 3.0193, "mean_token_accuracy": 0.4689250102445021, "step": 10316 }, { "epoch": 1.9126807563959956, "grad_norm": 8.125, "learning_rate": 8.087319243604006e-06, "loss": 2.9256, "mean_token_accuracy": 0.448177591120444, "step": 10317 }, { "epoch": 1.9128661475713757, "grad_norm": 7.875, "learning_rate": 8.087133852428625e-06, "loss": 2.9542, "mean_token_accuracy": 0.4679600235155791, "step": 10318 }, { "epoch": 1.9130515387467557, "grad_norm": 14.765625, "learning_rate": 8.086948461253245e-06, "loss": 2.9623, "mean_token_accuracy": 0.4629825489159175, "step": 10319 }, { "epoch": 1.9132369299221357, "grad_norm": 6.55078125, "learning_rate": 8.086763070077866e-06, "loss": 2.7826, "mean_token_accuracy": 0.4946314567614626, "step": 10320 }, { "epoch": 1.9134223210975159, "grad_norm": 6.3203125, "learning_rate": 8.086577678902486e-06, "loss": 3.3196, "mean_token_accuracy": 0.44623282919383933, "step": 10321 }, { "epoch": 1.9136077122728958, "grad_norm": 7.87109375, "learning_rate": 8.086392287727105e-06, "loss": 2.6543, "mean_token_accuracy": 0.49813200498132004, "step": 10322 }, { "epoch": 1.9137931034482758, "grad_norm": 7.8515625, "learning_rate": 8.086206896551725e-06, "loss": 2.7293, "mean_token_accuracy": 0.4696587438587173, "step": 10323 }, { "epoch": 1.913978494623656, "grad_norm": 7.09765625, "learning_rate": 8.086021505376344e-06, "loss": 2.5154, "mean_token_accuracy": 0.5053430220132507, "step": 10324 }, { "epoch": 1.914163885799036, "grad_norm": 10.984375, "learning_rate": 8.085836114200965e-06, "loss": 2.7474, "mean_token_accuracy": 0.5030978934324659, "step": 10325 }, { "epoch": 1.914349276974416, "grad_norm": 6.8203125, "learning_rate": 8.085650723025585e-06, "loss": 3.2159, "mean_token_accuracy": 0.4538786396977106, "step": 10326 }, { "epoch": 1.9145346681497961, "grad_norm": 7.81640625, "learning_rate": 8.085465331850204e-06, "loss": 2.9475, "mean_token_accuracy": 0.46917450365726227, "step": 10327 }, { "epoch": 1.914720059325176, "grad_norm": 6.09375, "learning_rate": 8.085279940674824e-06, "loss": 2.8872, "mean_token_accuracy": 0.48288075560802834, "step": 10328 }, { "epoch": 1.914905450500556, "grad_norm": 10.9296875, "learning_rate": 8.085094549499445e-06, "loss": 3.072, "mean_token_accuracy": 0.4654073991281994, "step": 10329 }, { "epoch": 1.9150908416759362, "grad_norm": 8.6015625, "learning_rate": 8.084909158324065e-06, "loss": 3.0468, "mean_token_accuracy": 0.44217391304347825, "step": 10330 }, { "epoch": 1.9152762328513164, "grad_norm": 6.32421875, "learning_rate": 8.084723767148684e-06, "loss": 2.6005, "mean_token_accuracy": 0.48831677626976355, "step": 10331 }, { "epoch": 1.9154616240266962, "grad_norm": 6.13671875, "learning_rate": 8.084538375973305e-06, "loss": 2.7467, "mean_token_accuracy": 0.467689620758483, "step": 10332 }, { "epoch": 1.9156470152020764, "grad_norm": 7.578125, "learning_rate": 8.084352984797923e-06, "loss": 2.4015, "mean_token_accuracy": 0.5086884075220186, "step": 10333 }, { "epoch": 1.9158324063774566, "grad_norm": 6.43359375, "learning_rate": 8.084167593622544e-06, "loss": 2.8891, "mean_token_accuracy": 0.4600989979476035, "step": 10334 }, { "epoch": 1.9160177975528365, "grad_norm": 5.88671875, "learning_rate": 8.083982202447164e-06, "loss": 3.3017, "mean_token_accuracy": 0.46669391091132, "step": 10335 }, { "epoch": 1.9162031887282165, "grad_norm": 5.78125, "learning_rate": 8.083796811271785e-06, "loss": 3.0785, "mean_token_accuracy": 0.45549230955552866, "step": 10336 }, { "epoch": 1.9163885799035967, "grad_norm": 7.0859375, "learning_rate": 8.083611420096404e-06, "loss": 2.1819, "mean_token_accuracy": 0.555714968914395, "step": 10337 }, { "epoch": 1.9165739710789766, "grad_norm": 6.22265625, "learning_rate": 8.083426028921024e-06, "loss": 2.4666, "mean_token_accuracy": 0.5310273351470398, "step": 10338 }, { "epoch": 1.9167593622543566, "grad_norm": 5.66015625, "learning_rate": 8.083240637745645e-06, "loss": 3.443, "mean_token_accuracy": 0.434717880005955, "step": 10339 }, { "epoch": 1.9169447534297368, "grad_norm": 5.9609375, "learning_rate": 8.083055246570263e-06, "loss": 2.8922, "mean_token_accuracy": 0.4551706571574121, "step": 10340 }, { "epoch": 1.9171301446051168, "grad_norm": 6.5625, "learning_rate": 8.082869855394884e-06, "loss": 2.1332, "mean_token_accuracy": 0.5711869781207061, "step": 10341 }, { "epoch": 1.9173155357804967, "grad_norm": 7.94140625, "learning_rate": 8.082684464219503e-06, "loss": 1.8187, "mean_token_accuracy": 0.6020258367586612, "step": 10342 }, { "epoch": 1.917500926955877, "grad_norm": 8.875, "learning_rate": 8.082499073044123e-06, "loss": 2.6121, "mean_token_accuracy": 0.4914769340803433, "step": 10343 }, { "epoch": 1.9176863181312571, "grad_norm": 5.1953125, "learning_rate": 8.082313681868744e-06, "loss": 3.0569, "mean_token_accuracy": 0.4718068018167719, "step": 10344 }, { "epoch": 1.9178717093066369, "grad_norm": 7.04296875, "learning_rate": 8.082128290693364e-06, "loss": 3.9936, "mean_token_accuracy": 0.4157936351057521, "step": 10345 }, { "epoch": 1.918057100482017, "grad_norm": 6.9609375, "learning_rate": 8.081942899517983e-06, "loss": 2.5488, "mean_token_accuracy": 0.5059743954480797, "step": 10346 }, { "epoch": 1.9182424916573972, "grad_norm": 8.0234375, "learning_rate": 8.081757508342604e-06, "loss": 3.1018, "mean_token_accuracy": 0.46181242580134546, "step": 10347 }, { "epoch": 1.9184278828327772, "grad_norm": 5.16796875, "learning_rate": 8.081572117167224e-06, "loss": 2.9348, "mean_token_accuracy": 0.4611624834874505, "step": 10348 }, { "epoch": 1.9186132740081572, "grad_norm": 7.24609375, "learning_rate": 8.081386725991843e-06, "loss": 3.824, "mean_token_accuracy": 0.404228280961183, "step": 10349 }, { "epoch": 1.9187986651835374, "grad_norm": 8.7890625, "learning_rate": 8.081201334816463e-06, "loss": 2.3782, "mean_token_accuracy": 0.5364560318161732, "step": 10350 }, { "epoch": 1.9189840563589173, "grad_norm": 5.96875, "learning_rate": 8.081015943641082e-06, "loss": 3.5373, "mean_token_accuracy": 0.4282586483725004, "step": 10351 }, { "epoch": 1.9191694475342973, "grad_norm": 5.68359375, "learning_rate": 8.080830552465704e-06, "loss": 2.7985, "mean_token_accuracy": 0.4622412562455389, "step": 10352 }, { "epoch": 1.9193548387096775, "grad_norm": 10.234375, "learning_rate": 8.080645161290323e-06, "loss": 2.9669, "mean_token_accuracy": 0.45303446172613604, "step": 10353 }, { "epoch": 1.9195402298850575, "grad_norm": 6.47265625, "learning_rate": 8.080459770114944e-06, "loss": 3.0813, "mean_token_accuracy": 0.4490177736202058, "step": 10354 }, { "epoch": 1.9197256210604374, "grad_norm": 6.60546875, "learning_rate": 8.080274378939564e-06, "loss": 3.3202, "mean_token_accuracy": 0.4305711786026747, "step": 10355 }, { "epoch": 1.9199110122358176, "grad_norm": 9.09375, "learning_rate": 8.080088987764183e-06, "loss": 3.046, "mean_token_accuracy": 0.44542228530872957, "step": 10356 }, { "epoch": 1.9200964034111976, "grad_norm": 6.35546875, "learning_rate": 8.079903596588803e-06, "loss": 3.2808, "mean_token_accuracy": 0.4639618138424821, "step": 10357 }, { "epoch": 1.9202817945865776, "grad_norm": 7.02734375, "learning_rate": 8.079718205413422e-06, "loss": 2.6198, "mean_token_accuracy": 0.4948529411764706, "step": 10358 }, { "epoch": 1.9204671857619577, "grad_norm": 6.1875, "learning_rate": 8.079532814238043e-06, "loss": 2.846, "mean_token_accuracy": 0.4687459389213775, "step": 10359 }, { "epoch": 1.920652576937338, "grad_norm": 6.48046875, "learning_rate": 8.079347423062663e-06, "loss": 2.364, "mean_token_accuracy": 0.5096587250482936, "step": 10360 }, { "epoch": 1.9208379681127177, "grad_norm": 5.328125, "learning_rate": 8.079162031887284e-06, "loss": 3.7651, "mean_token_accuracy": 0.41081081081081083, "step": 10361 }, { "epoch": 1.9210233592880979, "grad_norm": 6.08203125, "learning_rate": 8.078976640711902e-06, "loss": 2.8441, "mean_token_accuracy": 0.4825749167591565, "step": 10362 }, { "epoch": 1.921208750463478, "grad_norm": 6.55859375, "learning_rate": 8.078791249536523e-06, "loss": 2.6774, "mean_token_accuracy": 0.4939595660749507, "step": 10363 }, { "epoch": 1.921394141638858, "grad_norm": 6.38671875, "learning_rate": 8.078605858361143e-06, "loss": 2.8022, "mean_token_accuracy": 0.46705619094977624, "step": 10364 }, { "epoch": 1.921579532814238, "grad_norm": 6.93359375, "learning_rate": 8.078420467185762e-06, "loss": 3.0185, "mean_token_accuracy": 0.4496509129967777, "step": 10365 }, { "epoch": 1.9217649239896182, "grad_norm": 5.21875, "learning_rate": 8.078235076010383e-06, "loss": 2.5587, "mean_token_accuracy": 0.5148629148629149, "step": 10366 }, { "epoch": 1.9219503151649981, "grad_norm": 5.7421875, "learning_rate": 8.078049684835001e-06, "loss": 2.9501, "mean_token_accuracy": 0.47554617676266137, "step": 10367 }, { "epoch": 1.9221357063403781, "grad_norm": 6.28515625, "learning_rate": 8.077864293659624e-06, "loss": 2.6439, "mean_token_accuracy": 0.48576688854269934, "step": 10368 }, { "epoch": 1.9223210975157583, "grad_norm": 5.78515625, "learning_rate": 8.077678902484242e-06, "loss": 3.3892, "mean_token_accuracy": 0.4363131079203335, "step": 10369 }, { "epoch": 1.9225064886911383, "grad_norm": 6.953125, "learning_rate": 8.077493511308863e-06, "loss": 3.2938, "mean_token_accuracy": 0.45559766465225854, "step": 10370 }, { "epoch": 1.9226918798665182, "grad_norm": 7.58203125, "learning_rate": 8.077308120133482e-06, "loss": 2.7635, "mean_token_accuracy": 0.48502276539659717, "step": 10371 }, { "epoch": 1.9228772710418984, "grad_norm": 5.96484375, "learning_rate": 8.077122728958102e-06, "loss": 2.1215, "mean_token_accuracy": 0.5575916230366492, "step": 10372 }, { "epoch": 1.9230626622172786, "grad_norm": 6.77734375, "learning_rate": 8.076937337782723e-06, "loss": 2.7117, "mean_token_accuracy": 0.45497197966896913, "step": 10373 }, { "epoch": 1.9232480533926584, "grad_norm": 6.54296875, "learning_rate": 8.076751946607342e-06, "loss": 3.0793, "mean_token_accuracy": 0.43742203742203745, "step": 10374 }, { "epoch": 1.9234334445680386, "grad_norm": 6.67578125, "learning_rate": 8.076566555431962e-06, "loss": 2.7692, "mean_token_accuracy": 0.46850344395742016, "step": 10375 }, { "epoch": 1.9236188357434187, "grad_norm": 8.203125, "learning_rate": 8.076381164256583e-06, "loss": 2.7727, "mean_token_accuracy": 0.468324960176449, "step": 10376 }, { "epoch": 1.9238042269187987, "grad_norm": 6.140625, "learning_rate": 8.076195773081203e-06, "loss": 2.9475, "mean_token_accuracy": 0.46644676979071886, "step": 10377 }, { "epoch": 1.9239896180941787, "grad_norm": 8.640625, "learning_rate": 8.076010381905822e-06, "loss": 3.1439, "mean_token_accuracy": 0.46116504854368934, "step": 10378 }, { "epoch": 1.9241750092695589, "grad_norm": 6.578125, "learning_rate": 8.075824990730442e-06, "loss": 2.471, "mean_token_accuracy": 0.5143592522351667, "step": 10379 }, { "epoch": 1.9243604004449388, "grad_norm": 9.046875, "learning_rate": 8.075639599555061e-06, "loss": 2.2921, "mean_token_accuracy": 0.5141073982577038, "step": 10380 }, { "epoch": 1.9245457916203188, "grad_norm": 7.1796875, "learning_rate": 8.075454208379682e-06, "loss": 2.9535, "mean_token_accuracy": 0.4465894465894466, "step": 10381 }, { "epoch": 1.924731182795699, "grad_norm": 6.5859375, "learning_rate": 8.075268817204302e-06, "loss": 2.8682, "mean_token_accuracy": 0.46201329534662866, "step": 10382 }, { "epoch": 1.924916573971079, "grad_norm": 6.65625, "learning_rate": 8.075083426028921e-06, "loss": 2.8094, "mean_token_accuracy": 0.46615656268393174, "step": 10383 }, { "epoch": 1.925101965146459, "grad_norm": 10.2578125, "learning_rate": 8.074898034853541e-06, "loss": 2.0601, "mean_token_accuracy": 0.575619448340346, "step": 10384 }, { "epoch": 1.9252873563218391, "grad_norm": 6.4921875, "learning_rate": 8.074712643678162e-06, "loss": 3.3101, "mean_token_accuracy": 0.4569380549909062, "step": 10385 }, { "epoch": 1.925472747497219, "grad_norm": 6.09375, "learning_rate": 8.074527252502782e-06, "loss": 3.1516, "mean_token_accuracy": 0.43795798729848556, "step": 10386 }, { "epoch": 1.925658138672599, "grad_norm": 6.73046875, "learning_rate": 8.074341861327401e-06, "loss": 2.7083, "mean_token_accuracy": 0.4701336338753824, "step": 10387 }, { "epoch": 1.9258435298479792, "grad_norm": 5.203125, "learning_rate": 8.074156470152022e-06, "loss": 2.9933, "mean_token_accuracy": 0.4438324727481354, "step": 10388 }, { "epoch": 1.9260289210233594, "grad_norm": 6.15234375, "learning_rate": 8.07397107897664e-06, "loss": 3.0362, "mean_token_accuracy": 0.4450777202072539, "step": 10389 }, { "epoch": 1.9262143121987392, "grad_norm": 5.48046875, "learning_rate": 8.073785687801261e-06, "loss": 3.0521, "mean_token_accuracy": 0.4445106687328645, "step": 10390 }, { "epoch": 1.9263997033741194, "grad_norm": 8.296875, "learning_rate": 8.073600296625881e-06, "loss": 2.3927, "mean_token_accuracy": 0.4922572043892934, "step": 10391 }, { "epoch": 1.9265850945494996, "grad_norm": 6.8046875, "learning_rate": 8.073414905450502e-06, "loss": 3.6478, "mean_token_accuracy": 0.43342911877394635, "step": 10392 }, { "epoch": 1.9267704857248795, "grad_norm": 6.76171875, "learning_rate": 8.073229514275122e-06, "loss": 2.445, "mean_token_accuracy": 0.5221228923842933, "step": 10393 }, { "epoch": 1.9269558769002595, "grad_norm": 7.4375, "learning_rate": 8.073044123099741e-06, "loss": 3.6617, "mean_token_accuracy": 0.4127818508784064, "step": 10394 }, { "epoch": 1.9271412680756397, "grad_norm": 7.9453125, "learning_rate": 8.072858731924362e-06, "loss": 2.593, "mean_token_accuracy": 0.5104820403258112, "step": 10395 }, { "epoch": 1.9273266592510196, "grad_norm": 6.125, "learning_rate": 8.07267334074898e-06, "loss": 3.2392, "mean_token_accuracy": 0.44300991131977047, "step": 10396 }, { "epoch": 1.9275120504263996, "grad_norm": 14.2109375, "learning_rate": 8.072487949573601e-06, "loss": 3.0613, "mean_token_accuracy": 0.45460835870467753, "step": 10397 }, { "epoch": 1.9276974416017798, "grad_norm": 10.4765625, "learning_rate": 8.072302558398221e-06, "loss": 3.2837, "mean_token_accuracy": 0.42866733803289697, "step": 10398 }, { "epoch": 1.9278828327771598, "grad_norm": 11.25, "learning_rate": 8.07211716722284e-06, "loss": 3.3254, "mean_token_accuracy": 0.42421848549825075, "step": 10399 }, { "epoch": 1.9280682239525397, "grad_norm": 7.78125, "learning_rate": 8.07193177604746e-06, "loss": 3.5773, "mean_token_accuracy": 0.42247658688865763, "step": 10400 }, { "epoch": 1.92825361512792, "grad_norm": 18.515625, "learning_rate": 8.071746384872081e-06, "loss": 2.0805, "mean_token_accuracy": 0.5460359760159893, "step": 10401 }, { "epoch": 1.9284390063033001, "grad_norm": 11.8359375, "learning_rate": 8.071560993696702e-06, "loss": 2.8686, "mean_token_accuracy": 0.45147820277267414, "step": 10402 }, { "epoch": 1.9286243974786799, "grad_norm": 13.34375, "learning_rate": 8.07137560252132e-06, "loss": 2.4216, "mean_token_accuracy": 0.5168491397696573, "step": 10403 }, { "epoch": 1.92880978865406, "grad_norm": 5.61328125, "learning_rate": 8.071190211345941e-06, "loss": 2.9512, "mean_token_accuracy": 0.4650275540483256, "step": 10404 }, { "epoch": 1.9289951798294402, "grad_norm": 7.24609375, "learning_rate": 8.07100482017056e-06, "loss": 3.0431, "mean_token_accuracy": 0.4736293516781322, "step": 10405 }, { "epoch": 1.9291805710048202, "grad_norm": 10.0, "learning_rate": 8.07081942899518e-06, "loss": 3.2925, "mean_token_accuracy": 0.4232345707928329, "step": 10406 }, { "epoch": 1.9293659621802002, "grad_norm": 8.5859375, "learning_rate": 8.0706340378198e-06, "loss": 2.8336, "mean_token_accuracy": 0.463197803634462, "step": 10407 }, { "epoch": 1.9295513533555804, "grad_norm": 5.9609375, "learning_rate": 8.07044864664442e-06, "loss": 3.046, "mean_token_accuracy": 0.4464165658195509, "step": 10408 }, { "epoch": 1.9297367445309603, "grad_norm": 8.46875, "learning_rate": 8.07026325546904e-06, "loss": 2.7676, "mean_token_accuracy": 0.5019873399087296, "step": 10409 }, { "epoch": 1.9299221357063403, "grad_norm": 6.6953125, "learning_rate": 8.07007786429366e-06, "loss": 2.5103, "mean_token_accuracy": 0.502814845704754, "step": 10410 }, { "epoch": 1.9301075268817205, "grad_norm": 12.1796875, "learning_rate": 8.069892473118281e-06, "loss": 2.8528, "mean_token_accuracy": 0.45404696626534746, "step": 10411 }, { "epoch": 1.9302929180571005, "grad_norm": 6.75, "learning_rate": 8.0697070819429e-06, "loss": 2.9298, "mean_token_accuracy": 0.4493141877411059, "step": 10412 }, { "epoch": 1.9304783092324804, "grad_norm": 10.609375, "learning_rate": 8.06952169076752e-06, "loss": 4.0295, "mean_token_accuracy": 0.41983695652173914, "step": 10413 }, { "epoch": 1.9306637004078606, "grad_norm": 8.265625, "learning_rate": 8.06933629959214e-06, "loss": 3.1219, "mean_token_accuracy": 0.42972315181016124, "step": 10414 }, { "epoch": 1.9308490915832406, "grad_norm": 8.8515625, "learning_rate": 8.06915090841676e-06, "loss": 2.5215, "mean_token_accuracy": 0.49205461056401073, "step": 10415 }, { "epoch": 1.9310344827586206, "grad_norm": 6.15625, "learning_rate": 8.06896551724138e-06, "loss": 2.9309, "mean_token_accuracy": 0.4779299847792998, "step": 10416 }, { "epoch": 1.9312198739340007, "grad_norm": 6.640625, "learning_rate": 8.068780126066e-06, "loss": 2.6323, "mean_token_accuracy": 0.5032974661575842, "step": 10417 }, { "epoch": 1.931405265109381, "grad_norm": 8.15625, "learning_rate": 8.06859473489062e-06, "loss": 2.5085, "mean_token_accuracy": 0.49014014634751335, "step": 10418 }, { "epoch": 1.931590656284761, "grad_norm": 6.390625, "learning_rate": 8.06840934371524e-06, "loss": 3.2134, "mean_token_accuracy": 0.4583525080533824, "step": 10419 }, { "epoch": 1.9317760474601409, "grad_norm": 5.9296875, "learning_rate": 8.06822395253986e-06, "loss": 2.9791, "mean_token_accuracy": 0.44430596666147376, "step": 10420 }, { "epoch": 1.931961438635521, "grad_norm": 7.0390625, "learning_rate": 8.06803856136448e-06, "loss": 3.1352, "mean_token_accuracy": 0.43994928538497, "step": 10421 }, { "epoch": 1.932146829810901, "grad_norm": 5.17578125, "learning_rate": 8.0678531701891e-06, "loss": 2.8088, "mean_token_accuracy": 0.488679476196304, "step": 10422 }, { "epoch": 1.932332220986281, "grad_norm": 7.86328125, "learning_rate": 8.067667779013719e-06, "loss": 3.2063, "mean_token_accuracy": 0.4427150886294855, "step": 10423 }, { "epoch": 1.9325176121616612, "grad_norm": 7.1328125, "learning_rate": 8.067482387838339e-06, "loss": 3.2191, "mean_token_accuracy": 0.45500750837472564, "step": 10424 }, { "epoch": 1.9327030033370411, "grad_norm": 6.20703125, "learning_rate": 8.06729699666296e-06, "loss": 2.7669, "mean_token_accuracy": 0.48097795041699987, "step": 10425 }, { "epoch": 1.9328883945124211, "grad_norm": 6.09765625, "learning_rate": 8.06711160548758e-06, "loss": 3.2506, "mean_token_accuracy": 0.45327954190525765, "step": 10426 }, { "epoch": 1.9330737856878013, "grad_norm": 5.47265625, "learning_rate": 8.066926214312199e-06, "loss": 2.6359, "mean_token_accuracy": 0.4958767067730161, "step": 10427 }, { "epoch": 1.9332591768631813, "grad_norm": 6.15625, "learning_rate": 8.06674082313682e-06, "loss": 3.4798, "mean_token_accuracy": 0.43817427385892116, "step": 10428 }, { "epoch": 1.9334445680385612, "grad_norm": 7.2890625, "learning_rate": 8.06655543196144e-06, "loss": 2.6007, "mean_token_accuracy": 0.4999341151666886, "step": 10429 }, { "epoch": 1.9336299592139414, "grad_norm": 5.921875, "learning_rate": 8.066370040786059e-06, "loss": 2.9893, "mean_token_accuracy": 0.4700162074554295, "step": 10430 }, { "epoch": 1.9338153503893216, "grad_norm": 6.70703125, "learning_rate": 8.066184649610679e-06, "loss": 3.1471, "mean_token_accuracy": 0.4241455347298787, "step": 10431 }, { "epoch": 1.9340007415647014, "grad_norm": 7.80859375, "learning_rate": 8.065999258435298e-06, "loss": 2.6235, "mean_token_accuracy": 0.472627953924712, "step": 10432 }, { "epoch": 1.9341861327400816, "grad_norm": 5.07421875, "learning_rate": 8.06581386725992e-06, "loss": 2.9562, "mean_token_accuracy": 0.4500165690931183, "step": 10433 }, { "epoch": 1.9343715239154617, "grad_norm": 8.515625, "learning_rate": 8.065628476084539e-06, "loss": 3.4685, "mean_token_accuracy": 0.4381852111033357, "step": 10434 }, { "epoch": 1.9345569150908417, "grad_norm": 6.765625, "learning_rate": 8.06544308490916e-06, "loss": 2.1582, "mean_token_accuracy": 0.5354643984085609, "step": 10435 }, { "epoch": 1.9347423062662217, "grad_norm": 6.125, "learning_rate": 8.06525769373378e-06, "loss": 3.1293, "mean_token_accuracy": 0.4457030660021145, "step": 10436 }, { "epoch": 1.9349276974416019, "grad_norm": 6.50390625, "learning_rate": 8.065072302558399e-06, "loss": 3.2835, "mean_token_accuracy": 0.45025536261491317, "step": 10437 }, { "epoch": 1.9351130886169818, "grad_norm": 6.37109375, "learning_rate": 8.064886911383019e-06, "loss": 3.1872, "mean_token_accuracy": 0.4247378931602596, "step": 10438 }, { "epoch": 1.9352984797923618, "grad_norm": 8.1640625, "learning_rate": 8.064701520207638e-06, "loss": 2.3702, "mean_token_accuracy": 0.5336713325250744, "step": 10439 }, { "epoch": 1.935483870967742, "grad_norm": 6.32421875, "learning_rate": 8.064516129032258e-06, "loss": 2.5949, "mean_token_accuracy": 0.48480614739783445, "step": 10440 }, { "epoch": 1.935669262143122, "grad_norm": 9.453125, "learning_rate": 8.064330737856879e-06, "loss": 2.622, "mean_token_accuracy": 0.47119110111236095, "step": 10441 }, { "epoch": 1.935854653318502, "grad_norm": 6.109375, "learning_rate": 8.0641453466815e-06, "loss": 2.7047, "mean_token_accuracy": 0.4800064236389915, "step": 10442 }, { "epoch": 1.9360400444938821, "grad_norm": 6.05078125, "learning_rate": 8.063959955506118e-06, "loss": 3.3167, "mean_token_accuracy": 0.42436431547191494, "step": 10443 }, { "epoch": 1.9362254356692623, "grad_norm": 6.08203125, "learning_rate": 8.063774564330739e-06, "loss": 2.7186, "mean_token_accuracy": 0.5052604234316145, "step": 10444 }, { "epoch": 1.936410826844642, "grad_norm": 7.0, "learning_rate": 8.06358917315536e-06, "loss": 2.6626, "mean_token_accuracy": 0.4682219557396708, "step": 10445 }, { "epoch": 1.9365962180200222, "grad_norm": 7.9609375, "learning_rate": 8.063403781979978e-06, "loss": 3.2389, "mean_token_accuracy": 0.45312157721796276, "step": 10446 }, { "epoch": 1.9367816091954024, "grad_norm": 6.30859375, "learning_rate": 8.063218390804598e-06, "loss": 3.0901, "mean_token_accuracy": 0.4410851666891618, "step": 10447 }, { "epoch": 1.9369670003707824, "grad_norm": 5.26953125, "learning_rate": 8.063032999629217e-06, "loss": 3.1376, "mean_token_accuracy": 0.45340022296544036, "step": 10448 }, { "epoch": 1.9371523915461624, "grad_norm": 5.48046875, "learning_rate": 8.06284760845384e-06, "loss": 2.5845, "mean_token_accuracy": 0.5070149423976275, "step": 10449 }, { "epoch": 1.9373377827215426, "grad_norm": 8.9453125, "learning_rate": 8.062662217278458e-06, "loss": 2.4323, "mean_token_accuracy": 0.5157000369412634, "step": 10450 }, { "epoch": 1.9375231738969225, "grad_norm": 6.1953125, "learning_rate": 8.062476826103079e-06, "loss": 2.8212, "mean_token_accuracy": 0.45074483421432004, "step": 10451 }, { "epoch": 1.9377085650723025, "grad_norm": 5.91015625, "learning_rate": 8.062291434927698e-06, "loss": 3.0841, "mean_token_accuracy": 0.44957632175677287, "step": 10452 }, { "epoch": 1.9378939562476827, "grad_norm": 6.625, "learning_rate": 8.062106043752318e-06, "loss": 2.6862, "mean_token_accuracy": 0.48737435645991667, "step": 10453 }, { "epoch": 1.9380793474230626, "grad_norm": 6.59375, "learning_rate": 8.061920652576938e-06, "loss": 2.7423, "mean_token_accuracy": 0.4707379134860051, "step": 10454 }, { "epoch": 1.9382647385984426, "grad_norm": 5.45703125, "learning_rate": 8.061735261401557e-06, "loss": 2.9807, "mean_token_accuracy": 0.4604999331640155, "step": 10455 }, { "epoch": 1.9384501297738228, "grad_norm": 5.84375, "learning_rate": 8.061549870226178e-06, "loss": 2.5858, "mean_token_accuracy": 0.5099452291726723, "step": 10456 }, { "epoch": 1.9386355209492028, "grad_norm": 5.875, "learning_rate": 8.061364479050798e-06, "loss": 2.9005, "mean_token_accuracy": 0.4639815215822145, "step": 10457 }, { "epoch": 1.9388209121245827, "grad_norm": 6.2890625, "learning_rate": 8.061179087875419e-06, "loss": 2.7863, "mean_token_accuracy": 0.4829837328767123, "step": 10458 }, { "epoch": 1.939006303299963, "grad_norm": 6.47265625, "learning_rate": 8.060993696700038e-06, "loss": 3.2447, "mean_token_accuracy": 0.4280918913248715, "step": 10459 }, { "epoch": 1.9391916944753431, "grad_norm": 5.515625, "learning_rate": 8.060808305524658e-06, "loss": 2.981, "mean_token_accuracy": 0.4626738228836302, "step": 10460 }, { "epoch": 1.9393770856507229, "grad_norm": 6.046875, "learning_rate": 8.060622914349277e-06, "loss": 3.1321, "mean_token_accuracy": 0.46447459186019774, "step": 10461 }, { "epoch": 1.939562476826103, "grad_norm": 6.65625, "learning_rate": 8.060437523173897e-06, "loss": 2.5942, "mean_token_accuracy": 0.47986822840409954, "step": 10462 }, { "epoch": 1.9397478680014832, "grad_norm": 9.015625, "learning_rate": 8.060252131998518e-06, "loss": 3.3177, "mean_token_accuracy": 0.4402570122327938, "step": 10463 }, { "epoch": 1.9399332591768632, "grad_norm": 6.50390625, "learning_rate": 8.060066740823137e-06, "loss": 2.5836, "mean_token_accuracy": 0.49836717307965356, "step": 10464 }, { "epoch": 1.9401186503522432, "grad_norm": 6.38671875, "learning_rate": 8.059881349647757e-06, "loss": 3.0579, "mean_token_accuracy": 0.4377117539744592, "step": 10465 }, { "epoch": 1.9403040415276234, "grad_norm": 9.40625, "learning_rate": 8.059695958472378e-06, "loss": 2.9538, "mean_token_accuracy": 0.4717967201297531, "step": 10466 }, { "epoch": 1.9404894327030033, "grad_norm": 6.390625, "learning_rate": 8.059510567296998e-06, "loss": 2.8775, "mean_token_accuracy": 0.47337278106508873, "step": 10467 }, { "epoch": 1.9406748238783833, "grad_norm": 8.546875, "learning_rate": 8.059325176121617e-06, "loss": 2.4804, "mean_token_accuracy": 0.4950964884530212, "step": 10468 }, { "epoch": 1.9408602150537635, "grad_norm": 6.49609375, "learning_rate": 8.059139784946237e-06, "loss": 2.7158, "mean_token_accuracy": 0.49169741697416974, "step": 10469 }, { "epoch": 1.9410456062291435, "grad_norm": 8.1640625, "learning_rate": 8.058954393770856e-06, "loss": 2.8394, "mean_token_accuracy": 0.4626769626769627, "step": 10470 }, { "epoch": 1.9412309974045234, "grad_norm": 8.2109375, "learning_rate": 8.058769002595477e-06, "loss": 3.7991, "mean_token_accuracy": 0.40977550440466043, "step": 10471 }, { "epoch": 1.9414163885799036, "grad_norm": 5.78515625, "learning_rate": 8.058583611420097e-06, "loss": 2.5155, "mean_token_accuracy": 0.4830421377183967, "step": 10472 }, { "epoch": 1.9416017797552838, "grad_norm": 8.4140625, "learning_rate": 8.058398220244718e-06, "loss": 3.1837, "mean_token_accuracy": 0.4492995330220147, "step": 10473 }, { "epoch": 1.9417871709306636, "grad_norm": 6.68359375, "learning_rate": 8.058212829069338e-06, "loss": 2.9098, "mean_token_accuracy": 0.45946335833814195, "step": 10474 }, { "epoch": 1.9419725621060437, "grad_norm": 8.984375, "learning_rate": 8.058027437893957e-06, "loss": 3.5126, "mean_token_accuracy": 0.39788499669530736, "step": 10475 }, { "epoch": 1.942157953281424, "grad_norm": 8.90625, "learning_rate": 8.057842046718577e-06, "loss": 2.9853, "mean_token_accuracy": 0.46144523557036815, "step": 10476 }, { "epoch": 1.942343344456804, "grad_norm": 6.3671875, "learning_rate": 8.057656655543196e-06, "loss": 2.9652, "mean_token_accuracy": 0.45551203133743706, "step": 10477 }, { "epoch": 1.9425287356321839, "grad_norm": 8.1796875, "learning_rate": 8.057471264367817e-06, "loss": 2.9007, "mean_token_accuracy": 0.46941827426349636, "step": 10478 }, { "epoch": 1.942714126807564, "grad_norm": 5.99609375, "learning_rate": 8.057285873192436e-06, "loss": 3.3413, "mean_token_accuracy": 0.44204420442044207, "step": 10479 }, { "epoch": 1.942899517982944, "grad_norm": 7.2109375, "learning_rate": 8.057100482017056e-06, "loss": 2.6892, "mean_token_accuracy": 0.47316807409425227, "step": 10480 }, { "epoch": 1.943084909158324, "grad_norm": 5.98828125, "learning_rate": 8.056915090841677e-06, "loss": 3.3456, "mean_token_accuracy": 0.43257097791798105, "step": 10481 }, { "epoch": 1.9432703003337042, "grad_norm": 5.5390625, "learning_rate": 8.056729699666297e-06, "loss": 2.9839, "mean_token_accuracy": 0.44241182496707154, "step": 10482 }, { "epoch": 1.9434556915090841, "grad_norm": 5.546875, "learning_rate": 8.056544308490917e-06, "loss": 2.9067, "mean_token_accuracy": 0.47664562669071237, "step": 10483 }, { "epoch": 1.9436410826844641, "grad_norm": 6.15234375, "learning_rate": 8.056358917315536e-06, "loss": 2.6108, "mean_token_accuracy": 0.49892008639308855, "step": 10484 }, { "epoch": 1.9438264738598443, "grad_norm": 8.109375, "learning_rate": 8.056173526140157e-06, "loss": 2.6796, "mean_token_accuracy": 0.4784561233159364, "step": 10485 }, { "epoch": 1.9440118650352243, "grad_norm": 11.09375, "learning_rate": 8.055988134964776e-06, "loss": 1.846, "mean_token_accuracy": 0.5865133917990045, "step": 10486 }, { "epoch": 1.9441972562106042, "grad_norm": 8.140625, "learning_rate": 8.055802743789396e-06, "loss": 2.8276, "mean_token_accuracy": 0.47274061648144267, "step": 10487 }, { "epoch": 1.9443826473859844, "grad_norm": 7.8515625, "learning_rate": 8.055617352614017e-06, "loss": 2.5337, "mean_token_accuracy": 0.48117994100294986, "step": 10488 }, { "epoch": 1.9445680385613646, "grad_norm": 6.8359375, "learning_rate": 8.055431961438637e-06, "loss": 2.7193, "mean_token_accuracy": 0.4626111560226354, "step": 10489 }, { "epoch": 1.9447534297367444, "grad_norm": 6.44140625, "learning_rate": 8.055246570263256e-06, "loss": 2.9138, "mean_token_accuracy": 0.4703710899553138, "step": 10490 }, { "epoch": 1.9449388209121246, "grad_norm": 13.0546875, "learning_rate": 8.055061179087876e-06, "loss": 2.6443, "mean_token_accuracy": 0.5158277744087326, "step": 10491 }, { "epoch": 1.9451242120875047, "grad_norm": 7.8203125, "learning_rate": 8.054875787912497e-06, "loss": 2.9872, "mean_token_accuracy": 0.46629213483146065, "step": 10492 }, { "epoch": 1.9453096032628847, "grad_norm": 6.703125, "learning_rate": 8.054690396737116e-06, "loss": 2.2749, "mean_token_accuracy": 0.5698972755694507, "step": 10493 }, { "epoch": 1.9454949944382647, "grad_norm": 8.0234375, "learning_rate": 8.054505005561736e-06, "loss": 2.8601, "mean_token_accuracy": 0.4625447535114294, "step": 10494 }, { "epoch": 1.9456803856136449, "grad_norm": 6.25, "learning_rate": 8.054319614386355e-06, "loss": 3.442, "mean_token_accuracy": 0.4168589821450261, "step": 10495 }, { "epoch": 1.9458657767890248, "grad_norm": 6.26171875, "learning_rate": 8.054134223210975e-06, "loss": 2.8337, "mean_token_accuracy": 0.4772261443862446, "step": 10496 }, { "epoch": 1.9460511679644048, "grad_norm": 7.1484375, "learning_rate": 8.053948832035596e-06, "loss": 3.0354, "mean_token_accuracy": 0.4549929676511955, "step": 10497 }, { "epoch": 1.946236559139785, "grad_norm": 6.30078125, "learning_rate": 8.053763440860216e-06, "loss": 2.2133, "mean_token_accuracy": 0.5409836065573771, "step": 10498 }, { "epoch": 1.946421950315165, "grad_norm": 6.015625, "learning_rate": 8.053578049684835e-06, "loss": 2.7547, "mean_token_accuracy": 0.47962541128828146, "step": 10499 }, { "epoch": 1.946607341490545, "grad_norm": 6.40625, "learning_rate": 8.053392658509456e-06, "loss": 3.0949, "mean_token_accuracy": 0.449935723468076, "step": 10500 }, { "epoch": 1.9467927326659251, "grad_norm": 6.390625, "learning_rate": 8.053207267334076e-06, "loss": 3.4475, "mean_token_accuracy": 0.4132399457749661, "step": 10501 }, { "epoch": 1.9469781238413053, "grad_norm": 7.72265625, "learning_rate": 8.053021876158695e-06, "loss": 3.345, "mean_token_accuracy": 0.4551383747576238, "step": 10502 }, { "epoch": 1.947163515016685, "grad_norm": 7.01953125, "learning_rate": 8.052836484983315e-06, "loss": 2.6486, "mean_token_accuracy": 0.4955171952361836, "step": 10503 }, { "epoch": 1.9473489061920652, "grad_norm": 7.80078125, "learning_rate": 8.052651093807934e-06, "loss": 2.4398, "mean_token_accuracy": 0.5034168564920274, "step": 10504 }, { "epoch": 1.9475342973674454, "grad_norm": 6.01953125, "learning_rate": 8.052465702632556e-06, "loss": 3.0308, "mean_token_accuracy": 0.46256598082516426, "step": 10505 }, { "epoch": 1.9477196885428254, "grad_norm": 5.8515625, "learning_rate": 8.052280311457175e-06, "loss": 3.2298, "mean_token_accuracy": 0.44019581899973537, "step": 10506 }, { "epoch": 1.9479050797182054, "grad_norm": 5.48046875, "learning_rate": 8.052094920281796e-06, "loss": 3.6104, "mean_token_accuracy": 0.416658023026657, "step": 10507 }, { "epoch": 1.9480904708935856, "grad_norm": 5.61328125, "learning_rate": 8.051909529106415e-06, "loss": 2.9139, "mean_token_accuracy": 0.45685087055261164, "step": 10508 }, { "epoch": 1.9482758620689655, "grad_norm": 6.85546875, "learning_rate": 8.051724137931035e-06, "loss": 3.2905, "mean_token_accuracy": 0.4073987081620669, "step": 10509 }, { "epoch": 1.9484612532443455, "grad_norm": 7.85546875, "learning_rate": 8.051538746755656e-06, "loss": 3.2761, "mean_token_accuracy": 0.43172454384932313, "step": 10510 }, { "epoch": 1.9486466444197257, "grad_norm": 5.734375, "learning_rate": 8.051353355580274e-06, "loss": 2.4804, "mean_token_accuracy": 0.48996188055908513, "step": 10511 }, { "epoch": 1.9488320355951056, "grad_norm": 6.65625, "learning_rate": 8.051167964404895e-06, "loss": 2.3101, "mean_token_accuracy": 0.515110448570267, "step": 10512 }, { "epoch": 1.9490174267704856, "grad_norm": 5.859375, "learning_rate": 8.050982573229515e-06, "loss": 2.7689, "mean_token_accuracy": 0.47511061946902655, "step": 10513 }, { "epoch": 1.9492028179458658, "grad_norm": 6.79296875, "learning_rate": 8.050797182054136e-06, "loss": 2.699, "mean_token_accuracy": 0.48139880952380953, "step": 10514 }, { "epoch": 1.9493882091212458, "grad_norm": 5.23828125, "learning_rate": 8.050611790878755e-06, "loss": 2.9543, "mean_token_accuracy": 0.4460211767274315, "step": 10515 }, { "epoch": 1.9495736002966257, "grad_norm": 6.34375, "learning_rate": 8.050426399703375e-06, "loss": 3.6145, "mean_token_accuracy": 0.4283913659996849, "step": 10516 }, { "epoch": 1.949758991472006, "grad_norm": 6.3046875, "learning_rate": 8.050241008527996e-06, "loss": 2.8108, "mean_token_accuracy": 0.45940005171967935, "step": 10517 }, { "epoch": 1.9499443826473861, "grad_norm": 5.4296875, "learning_rate": 8.050055617352614e-06, "loss": 2.9966, "mean_token_accuracy": 0.4540406427221172, "step": 10518 }, { "epoch": 1.950129773822766, "grad_norm": 5.921875, "learning_rate": 8.049870226177235e-06, "loss": 3.8425, "mean_token_accuracy": 0.3959035843636818, "step": 10519 }, { "epoch": 1.950315164998146, "grad_norm": 6.89453125, "learning_rate": 8.049684835001854e-06, "loss": 2.6853, "mean_token_accuracy": 0.4767041031913711, "step": 10520 }, { "epoch": 1.9505005561735262, "grad_norm": 7.07421875, "learning_rate": 8.049499443826474e-06, "loss": 2.9303, "mean_token_accuracy": 0.4880968762900784, "step": 10521 }, { "epoch": 1.9506859473489062, "grad_norm": 5.7890625, "learning_rate": 8.049314052651095e-06, "loss": 2.5162, "mean_token_accuracy": 0.4987246312520794, "step": 10522 }, { "epoch": 1.9508713385242862, "grad_norm": 5.80859375, "learning_rate": 8.049128661475715e-06, "loss": 3.0992, "mean_token_accuracy": 0.46462370437133843, "step": 10523 }, { "epoch": 1.9510567296996664, "grad_norm": 6.51953125, "learning_rate": 8.048943270300334e-06, "loss": 2.9218, "mean_token_accuracy": 0.4610543979862956, "step": 10524 }, { "epoch": 1.9512421208750463, "grad_norm": 5.93359375, "learning_rate": 8.048757879124954e-06, "loss": 3.123, "mean_token_accuracy": 0.45618141916605703, "step": 10525 }, { "epoch": 1.9514275120504263, "grad_norm": 6.97265625, "learning_rate": 8.048572487949575e-06, "loss": 2.453, "mean_token_accuracy": 0.5111425539441103, "step": 10526 }, { "epoch": 1.9516129032258065, "grad_norm": 7.50390625, "learning_rate": 8.048387096774194e-06, "loss": 2.5441, "mean_token_accuracy": 0.4904710046283692, "step": 10527 }, { "epoch": 1.9517982944011865, "grad_norm": 6.9609375, "learning_rate": 8.048201705598814e-06, "loss": 3.277, "mean_token_accuracy": 0.4361100144755889, "step": 10528 }, { "epoch": 1.9519836855765664, "grad_norm": 7.58203125, "learning_rate": 8.048016314423433e-06, "loss": 3.6208, "mean_token_accuracy": 0.40683760683760684, "step": 10529 }, { "epoch": 1.9521690767519466, "grad_norm": 7.03125, "learning_rate": 8.047830923248055e-06, "loss": 3.0153, "mean_token_accuracy": 0.44150139757753226, "step": 10530 }, { "epoch": 1.9523544679273268, "grad_norm": 7.75390625, "learning_rate": 8.047645532072674e-06, "loss": 3.5224, "mean_token_accuracy": 0.4205135400589308, "step": 10531 }, { "epoch": 1.9525398591027066, "grad_norm": 13.1484375, "learning_rate": 8.047460140897294e-06, "loss": 3.4522, "mean_token_accuracy": 0.4147567103811742, "step": 10532 }, { "epoch": 1.9527252502780867, "grad_norm": 13.8125, "learning_rate": 8.047274749721913e-06, "loss": 2.7297, "mean_token_accuracy": 0.4707655213984328, "step": 10533 }, { "epoch": 1.952910641453467, "grad_norm": 6.96875, "learning_rate": 8.047089358546534e-06, "loss": 2.7687, "mean_token_accuracy": 0.4621772414687946, "step": 10534 }, { "epoch": 1.953096032628847, "grad_norm": 6.50390625, "learning_rate": 8.046903967371154e-06, "loss": 3.0116, "mean_token_accuracy": 0.448531226857566, "step": 10535 }, { "epoch": 1.9532814238042269, "grad_norm": 7.36328125, "learning_rate": 8.046718576195773e-06, "loss": 2.8412, "mean_token_accuracy": 0.4760946149974836, "step": 10536 }, { "epoch": 1.953466814979607, "grad_norm": 5.38671875, "learning_rate": 8.046533185020394e-06, "loss": 2.77, "mean_token_accuracy": 0.49051817852397633, "step": 10537 }, { "epoch": 1.953652206154987, "grad_norm": 7.4375, "learning_rate": 8.046347793845014e-06, "loss": 2.8354, "mean_token_accuracy": 0.4973399691093187, "step": 10538 }, { "epoch": 1.953837597330367, "grad_norm": 7.72265625, "learning_rate": 8.046162402669635e-06, "loss": 2.4162, "mean_token_accuracy": 0.523253980766199, "step": 10539 }, { "epoch": 1.9540229885057472, "grad_norm": 7.43359375, "learning_rate": 8.045977011494253e-06, "loss": 2.8244, "mean_token_accuracy": 0.47996089931573804, "step": 10540 }, { "epoch": 1.9542083796811272, "grad_norm": 6.25, "learning_rate": 8.045791620318874e-06, "loss": 3.141, "mean_token_accuracy": 0.4533728429492766, "step": 10541 }, { "epoch": 1.9543937708565071, "grad_norm": 6.43359375, "learning_rate": 8.045606229143493e-06, "loss": 2.8538, "mean_token_accuracy": 0.48956617243272926, "step": 10542 }, { "epoch": 1.9545791620318873, "grad_norm": 6.56640625, "learning_rate": 8.045420837968113e-06, "loss": 2.8376, "mean_token_accuracy": 0.4777652687869406, "step": 10543 }, { "epoch": 1.9547645532072675, "grad_norm": 6.33203125, "learning_rate": 8.045235446792734e-06, "loss": 2.9892, "mean_token_accuracy": 0.4557752341311134, "step": 10544 }, { "epoch": 1.9549499443826472, "grad_norm": 9.09375, "learning_rate": 8.045050055617352e-06, "loss": 2.6309, "mean_token_accuracy": 0.4774703557312253, "step": 10545 }, { "epoch": 1.9551353355580274, "grad_norm": 7.41015625, "learning_rate": 8.044864664441973e-06, "loss": 2.5797, "mean_token_accuracy": 0.5078280318091452, "step": 10546 }, { "epoch": 1.9553207267334076, "grad_norm": 5.6953125, "learning_rate": 8.044679273266593e-06, "loss": 2.924, "mean_token_accuracy": 0.447505498321565, "step": 10547 }, { "epoch": 1.9555061179087876, "grad_norm": 6.2578125, "learning_rate": 8.044493882091214e-06, "loss": 3.5033, "mean_token_accuracy": 0.4209031909140076, "step": 10548 }, { "epoch": 1.9556915090841676, "grad_norm": 6.76953125, "learning_rate": 8.044308490915833e-06, "loss": 2.7625, "mean_token_accuracy": 0.4569965870307167, "step": 10549 }, { "epoch": 1.9558769002595477, "grad_norm": 7.49609375, "learning_rate": 8.044123099740453e-06, "loss": 2.6469, "mean_token_accuracy": 0.47849355797819626, "step": 10550 }, { "epoch": 1.9560622914349277, "grad_norm": 10.1953125, "learning_rate": 8.043937708565072e-06, "loss": 3.3117, "mean_token_accuracy": 0.4523127919240621, "step": 10551 }, { "epoch": 1.9562476826103077, "grad_norm": 11.7265625, "learning_rate": 8.043752317389692e-06, "loss": 2.5625, "mean_token_accuracy": 0.48314902580305424, "step": 10552 }, { "epoch": 1.9564330737856879, "grad_norm": 9.2890625, "learning_rate": 8.043566926214313e-06, "loss": 3.4114, "mean_token_accuracy": 0.4431753283837807, "step": 10553 }, { "epoch": 1.9566184649610678, "grad_norm": 6.17578125, "learning_rate": 8.043381535038933e-06, "loss": 2.3534, "mean_token_accuracy": 0.5067531779661016, "step": 10554 }, { "epoch": 1.9568038561364478, "grad_norm": 5.953125, "learning_rate": 8.043196143863554e-06, "loss": 2.627, "mean_token_accuracy": 0.46782255545142143, "step": 10555 }, { "epoch": 1.956989247311828, "grad_norm": 6.23046875, "learning_rate": 8.043010752688173e-06, "loss": 2.5411, "mean_token_accuracy": 0.5180926096289482, "step": 10556 }, { "epoch": 1.957174638487208, "grad_norm": 5.51953125, "learning_rate": 8.042825361512793e-06, "loss": 2.6001, "mean_token_accuracy": 0.5063050998772458, "step": 10557 }, { "epoch": 1.957360029662588, "grad_norm": 5.9140625, "learning_rate": 8.042639970337412e-06, "loss": 3.2724, "mean_token_accuracy": 0.4320300997873385, "step": 10558 }, { "epoch": 1.9575454208379681, "grad_norm": 6.3203125, "learning_rate": 8.042454579162032e-06, "loss": 3.0506, "mean_token_accuracy": 0.45068825910931176, "step": 10559 }, { "epoch": 1.9577308120133483, "grad_norm": 6.83203125, "learning_rate": 8.042269187986651e-06, "loss": 2.6215, "mean_token_accuracy": 0.4752245113576334, "step": 10560 }, { "epoch": 1.957916203188728, "grad_norm": 6.765625, "learning_rate": 8.042083796811272e-06, "loss": 2.9711, "mean_token_accuracy": 0.45222849968612683, "step": 10561 }, { "epoch": 1.9581015943641082, "grad_norm": 6.94140625, "learning_rate": 8.041898405635892e-06, "loss": 2.9325, "mean_token_accuracy": 0.46841404730213987, "step": 10562 }, { "epoch": 1.9582869855394884, "grad_norm": 6.72265625, "learning_rate": 8.041713014460513e-06, "loss": 2.8092, "mean_token_accuracy": 0.46603131381892443, "step": 10563 }, { "epoch": 1.9584723767148684, "grad_norm": 6.19921875, "learning_rate": 8.041527623285133e-06, "loss": 2.587, "mean_token_accuracy": 0.4912476129853596, "step": 10564 }, { "epoch": 1.9586577678902484, "grad_norm": 6.32421875, "learning_rate": 8.041342232109752e-06, "loss": 3.3039, "mean_token_accuracy": 0.4194163646767118, "step": 10565 }, { "epoch": 1.9588431590656286, "grad_norm": 5.7890625, "learning_rate": 8.041156840934373e-06, "loss": 2.6794, "mean_token_accuracy": 0.48881193393713374, "step": 10566 }, { "epoch": 1.9590285502410085, "grad_norm": 6.27734375, "learning_rate": 8.040971449758991e-06, "loss": 2.518, "mean_token_accuracy": 0.4898589519201071, "step": 10567 }, { "epoch": 1.9592139414163885, "grad_norm": 5.92578125, "learning_rate": 8.040786058583612e-06, "loss": 3.2332, "mean_token_accuracy": 0.42391304347826086, "step": 10568 }, { "epoch": 1.9593993325917687, "grad_norm": 5.6640625, "learning_rate": 8.040600667408232e-06, "loss": 2.915, "mean_token_accuracy": 0.4814674016346702, "step": 10569 }, { "epoch": 1.9595847237671487, "grad_norm": 5.3203125, "learning_rate": 8.040415276232853e-06, "loss": 2.5864, "mean_token_accuracy": 0.4891011840688913, "step": 10570 }, { "epoch": 1.9597701149425286, "grad_norm": 6.2109375, "learning_rate": 8.040229885057472e-06, "loss": 2.8914, "mean_token_accuracy": 0.4819497587755781, "step": 10571 }, { "epoch": 1.9599555061179088, "grad_norm": 6.85546875, "learning_rate": 8.040044493882092e-06, "loss": 2.562, "mean_token_accuracy": 0.4825344212136665, "step": 10572 }, { "epoch": 1.960140897293289, "grad_norm": 5.63671875, "learning_rate": 8.039859102706713e-06, "loss": 3.1942, "mean_token_accuracy": 0.4466324200913242, "step": 10573 }, { "epoch": 1.9603262884686687, "grad_norm": 5.66015625, "learning_rate": 8.039673711531331e-06, "loss": 3.0959, "mean_token_accuracy": 0.4499563754206656, "step": 10574 }, { "epoch": 1.960511679644049, "grad_norm": 6.86328125, "learning_rate": 8.039488320355952e-06, "loss": 2.6456, "mean_token_accuracy": 0.4760304004676995, "step": 10575 }, { "epoch": 1.9606970708194291, "grad_norm": 5.88671875, "learning_rate": 8.03930292918057e-06, "loss": 3.2979, "mean_token_accuracy": 0.4250236518448439, "step": 10576 }, { "epoch": 1.960882461994809, "grad_norm": 6.875, "learning_rate": 8.039117538005191e-06, "loss": 3.0132, "mean_token_accuracy": 0.4568432374294165, "step": 10577 }, { "epoch": 1.961067853170189, "grad_norm": 6.01171875, "learning_rate": 8.038932146829812e-06, "loss": 2.6318, "mean_token_accuracy": 0.5182692307692308, "step": 10578 }, { "epoch": 1.9612532443455692, "grad_norm": 6.6796875, "learning_rate": 8.038746755654432e-06, "loss": 2.7356, "mean_token_accuracy": 0.47750575434191256, "step": 10579 }, { "epoch": 1.9614386355209492, "grad_norm": 5.796875, "learning_rate": 8.038561364479051e-06, "loss": 3.27, "mean_token_accuracy": 0.43345823764051733, "step": 10580 }, { "epoch": 1.9616240266963292, "grad_norm": 5.80078125, "learning_rate": 8.038375973303671e-06, "loss": 3.5975, "mean_token_accuracy": 0.4071534078919603, "step": 10581 }, { "epoch": 1.9618094178717094, "grad_norm": 6.53125, "learning_rate": 8.038190582128292e-06, "loss": 3.0283, "mean_token_accuracy": 0.46247987117552336, "step": 10582 }, { "epoch": 1.9619948090470893, "grad_norm": 6.34375, "learning_rate": 8.03800519095291e-06, "loss": 3.6128, "mean_token_accuracy": 0.4159510988211323, "step": 10583 }, { "epoch": 1.9621802002224693, "grad_norm": 6.0078125, "learning_rate": 8.037819799777531e-06, "loss": 3.3801, "mean_token_accuracy": 0.41379310344827586, "step": 10584 }, { "epoch": 1.9623655913978495, "grad_norm": 7.98046875, "learning_rate": 8.03763440860215e-06, "loss": 2.5335, "mean_token_accuracy": 0.49079858179976366, "step": 10585 }, { "epoch": 1.9625509825732295, "grad_norm": 5.578125, "learning_rate": 8.037449017426772e-06, "loss": 3.2632, "mean_token_accuracy": 0.4355491706934831, "step": 10586 }, { "epoch": 1.9627363737486094, "grad_norm": 6.19921875, "learning_rate": 8.037263626251391e-06, "loss": 2.9972, "mean_token_accuracy": 0.44512482336316533, "step": 10587 }, { "epoch": 1.9629217649239896, "grad_norm": 6.31640625, "learning_rate": 8.037078235076011e-06, "loss": 2.5911, "mean_token_accuracy": 0.4903589021815623, "step": 10588 }, { "epoch": 1.9631071560993698, "grad_norm": 5.8359375, "learning_rate": 8.03689284390063e-06, "loss": 2.7078, "mean_token_accuracy": 0.4768100413286392, "step": 10589 }, { "epoch": 1.9632925472747496, "grad_norm": 5.8671875, "learning_rate": 8.03670745272525e-06, "loss": 2.8359, "mean_token_accuracy": 0.45407239819004525, "step": 10590 }, { "epoch": 1.9634779384501297, "grad_norm": 7.90234375, "learning_rate": 8.036522061549871e-06, "loss": 2.6195, "mean_token_accuracy": 0.48159708674730134, "step": 10591 }, { "epoch": 1.96366332962551, "grad_norm": 8.03125, "learning_rate": 8.03633667037449e-06, "loss": 2.2452, "mean_token_accuracy": 0.5224158573899161, "step": 10592 }, { "epoch": 1.96384872080089, "grad_norm": 6.3046875, "learning_rate": 8.03615127919911e-06, "loss": 3.3375, "mean_token_accuracy": 0.44288025889967636, "step": 10593 }, { "epoch": 1.9640341119762699, "grad_norm": 8.6484375, "learning_rate": 8.035965888023731e-06, "loss": 2.5639, "mean_token_accuracy": 0.5264267826545407, "step": 10594 }, { "epoch": 1.96421950315165, "grad_norm": 8.34375, "learning_rate": 8.035780496848352e-06, "loss": 2.557, "mean_token_accuracy": 0.47866391995291346, "step": 10595 }, { "epoch": 1.96440489432703, "grad_norm": 5.8515625, "learning_rate": 8.03559510567297e-06, "loss": 3.4585, "mean_token_accuracy": 0.4276387377584331, "step": 10596 }, { "epoch": 1.96459028550241, "grad_norm": 7.46484375, "learning_rate": 8.03540971449759e-06, "loss": 2.1089, "mean_token_accuracy": 0.5613308223477715, "step": 10597 }, { "epoch": 1.9647756766777902, "grad_norm": 6.3203125, "learning_rate": 8.035224323322211e-06, "loss": 2.9303, "mean_token_accuracy": 0.4532082711575357, "step": 10598 }, { "epoch": 1.9649610678531702, "grad_norm": 5.96484375, "learning_rate": 8.03503893214683e-06, "loss": 3.375, "mean_token_accuracy": 0.4267900758452243, "step": 10599 }, { "epoch": 1.9651464590285501, "grad_norm": 5.79296875, "learning_rate": 8.03485354097145e-06, "loss": 3.2035, "mean_token_accuracy": 0.4290803536086555, "step": 10600 }, { "epoch": 1.9653318502039303, "grad_norm": 5.56640625, "learning_rate": 8.03466814979607e-06, "loss": 2.6259, "mean_token_accuracy": 0.49467238211879977, "step": 10601 }, { "epoch": 1.9655172413793105, "grad_norm": 5.30859375, "learning_rate": 8.034482758620692e-06, "loss": 2.8318, "mean_token_accuracy": 0.4607976653696498, "step": 10602 }, { "epoch": 1.9657026325546902, "grad_norm": 5.953125, "learning_rate": 8.03429736744531e-06, "loss": 3.0009, "mean_token_accuracy": 0.4699017199017199, "step": 10603 }, { "epoch": 1.9658880237300704, "grad_norm": 7.2265625, "learning_rate": 8.034111976269931e-06, "loss": 2.525, "mean_token_accuracy": 0.4879690949227373, "step": 10604 }, { "epoch": 1.9660734149054506, "grad_norm": 5.38671875, "learning_rate": 8.03392658509455e-06, "loss": 3.2323, "mean_token_accuracy": 0.4411607809666062, "step": 10605 }, { "epoch": 1.9662588060808306, "grad_norm": 10.5, "learning_rate": 8.03374119391917e-06, "loss": 2.4949, "mean_token_accuracy": 0.4899717324000538, "step": 10606 }, { "epoch": 1.9664441972562106, "grad_norm": 6.7578125, "learning_rate": 8.03355580274379e-06, "loss": 2.9911, "mean_token_accuracy": 0.4533273110508883, "step": 10607 }, { "epoch": 1.9666295884315907, "grad_norm": 7.02734375, "learning_rate": 8.03337041156841e-06, "loss": 3.3464, "mean_token_accuracy": 0.4457762557077626, "step": 10608 }, { "epoch": 1.9668149796069707, "grad_norm": 6.734375, "learning_rate": 8.03318502039303e-06, "loss": 3.1309, "mean_token_accuracy": 0.45518788558609086, "step": 10609 }, { "epoch": 1.9670003707823507, "grad_norm": 7.71875, "learning_rate": 8.03299962921765e-06, "loss": 2.7598, "mean_token_accuracy": 0.471722621902478, "step": 10610 }, { "epoch": 1.9671857619577309, "grad_norm": 6.11328125, "learning_rate": 8.032814238042271e-06, "loss": 2.8744, "mean_token_accuracy": 0.4774156660949114, "step": 10611 }, { "epoch": 1.9673711531331108, "grad_norm": 16.203125, "learning_rate": 8.03262884686689e-06, "loss": 2.9014, "mean_token_accuracy": 0.4712213383247543, "step": 10612 }, { "epoch": 1.9675565443084908, "grad_norm": 9.3359375, "learning_rate": 8.03244345569151e-06, "loss": 3.072, "mean_token_accuracy": 0.4389261744966443, "step": 10613 }, { "epoch": 1.967741935483871, "grad_norm": 7.53515625, "learning_rate": 8.032258064516129e-06, "loss": 3.0121, "mean_token_accuracy": 0.4968789013732834, "step": 10614 }, { "epoch": 1.9679273266592512, "grad_norm": 8.9765625, "learning_rate": 8.03207267334075e-06, "loss": 2.2504, "mean_token_accuracy": 0.5257638967189086, "step": 10615 }, { "epoch": 1.968112717834631, "grad_norm": 5.92578125, "learning_rate": 8.03188728216537e-06, "loss": 3.0954, "mean_token_accuracy": 0.45932499663842946, "step": 10616 }, { "epoch": 1.9682981090100111, "grad_norm": 6.51171875, "learning_rate": 8.031701890989989e-06, "loss": 2.9062, "mean_token_accuracy": 0.47510937684758187, "step": 10617 }, { "epoch": 1.9684835001853913, "grad_norm": 6.625, "learning_rate": 8.03151649981461e-06, "loss": 3.3537, "mean_token_accuracy": 0.4477894986369563, "step": 10618 }, { "epoch": 1.9686688913607713, "grad_norm": 7.62109375, "learning_rate": 8.03133110863923e-06, "loss": 2.7757, "mean_token_accuracy": 0.4804917827123322, "step": 10619 }, { "epoch": 1.9688542825361512, "grad_norm": 6.60546875, "learning_rate": 8.03114571746385e-06, "loss": 3.1903, "mean_token_accuracy": 0.4413394066231688, "step": 10620 }, { "epoch": 1.9690396737115314, "grad_norm": 7.12109375, "learning_rate": 8.030960326288469e-06, "loss": 2.8262, "mean_token_accuracy": 0.4662219699120068, "step": 10621 }, { "epoch": 1.9692250648869114, "grad_norm": 7.6015625, "learning_rate": 8.03077493511309e-06, "loss": 3.1873, "mean_token_accuracy": 0.4777592768791627, "step": 10622 }, { "epoch": 1.9694104560622914, "grad_norm": 8.796875, "learning_rate": 8.030589543937708e-06, "loss": 3.19, "mean_token_accuracy": 0.4783060535203949, "step": 10623 }, { "epoch": 1.9695958472376716, "grad_norm": 6.03515625, "learning_rate": 8.030404152762329e-06, "loss": 3.1569, "mean_token_accuracy": 0.4679006169751542, "step": 10624 }, { "epoch": 1.9697812384130515, "grad_norm": 6.96484375, "learning_rate": 8.03021876158695e-06, "loss": 3.2334, "mean_token_accuracy": 0.44752066115702477, "step": 10625 }, { "epoch": 1.9699666295884315, "grad_norm": 6.48046875, "learning_rate": 8.03003337041157e-06, "loss": 3.5067, "mean_token_accuracy": 0.3862623762376238, "step": 10626 }, { "epoch": 1.9701520207638117, "grad_norm": 5.42578125, "learning_rate": 8.029847979236189e-06, "loss": 3.3476, "mean_token_accuracy": 0.4401382374722291, "step": 10627 }, { "epoch": 1.9703374119391917, "grad_norm": 6.84375, "learning_rate": 8.029662588060809e-06, "loss": 2.4076, "mean_token_accuracy": 0.5136352568785001, "step": 10628 }, { "epoch": 1.9705228031145716, "grad_norm": 7.25390625, "learning_rate": 8.02947719688543e-06, "loss": 3.0193, "mean_token_accuracy": 0.46475981935130695, "step": 10629 }, { "epoch": 1.9707081942899518, "grad_norm": 6.203125, "learning_rate": 8.029291805710048e-06, "loss": 2.7658, "mean_token_accuracy": 0.46382003152314083, "step": 10630 }, { "epoch": 1.970893585465332, "grad_norm": 5.89453125, "learning_rate": 8.029106414534669e-06, "loss": 2.6314, "mean_token_accuracy": 0.4692511504671594, "step": 10631 }, { "epoch": 1.9710789766407117, "grad_norm": 6.8359375, "learning_rate": 8.028921023359288e-06, "loss": 3.0423, "mean_token_accuracy": 0.4620600115406809, "step": 10632 }, { "epoch": 1.971264367816092, "grad_norm": 6.7265625, "learning_rate": 8.028735632183908e-06, "loss": 3.2103, "mean_token_accuracy": 0.4374867331776693, "step": 10633 }, { "epoch": 1.9714497589914721, "grad_norm": 6.60546875, "learning_rate": 8.028550241008529e-06, "loss": 2.8704, "mean_token_accuracy": 0.4605589362181654, "step": 10634 }, { "epoch": 1.971635150166852, "grad_norm": 5.50390625, "learning_rate": 8.02836484983315e-06, "loss": 2.7121, "mean_token_accuracy": 0.49283596837944665, "step": 10635 }, { "epoch": 1.971820541342232, "grad_norm": 5.921875, "learning_rate": 8.02817945865777e-06, "loss": 2.261, "mean_token_accuracy": 0.5492651757188498, "step": 10636 }, { "epoch": 1.9720059325176122, "grad_norm": 6.703125, "learning_rate": 8.027994067482388e-06, "loss": 2.6194, "mean_token_accuracy": 0.5007116424708227, "step": 10637 }, { "epoch": 1.9721913236929922, "grad_norm": 7.43359375, "learning_rate": 8.027808676307009e-06, "loss": 2.9138, "mean_token_accuracy": 0.4536758677234217, "step": 10638 }, { "epoch": 1.9723767148683722, "grad_norm": 7.54296875, "learning_rate": 8.027623285131628e-06, "loss": 2.7717, "mean_token_accuracy": 0.48630338733431516, "step": 10639 }, { "epoch": 1.9725621060437524, "grad_norm": 8.15625, "learning_rate": 8.027437893956248e-06, "loss": 3.6419, "mean_token_accuracy": 0.4408547871445398, "step": 10640 }, { "epoch": 1.9727474972191323, "grad_norm": 5.95703125, "learning_rate": 8.027252502780867e-06, "loss": 3.021, "mean_token_accuracy": 0.44087272727272725, "step": 10641 }, { "epoch": 1.9729328883945123, "grad_norm": 10.734375, "learning_rate": 8.027067111605488e-06, "loss": 3.1236, "mean_token_accuracy": 0.4875593269092478, "step": 10642 }, { "epoch": 1.9731182795698925, "grad_norm": 6.1640625, "learning_rate": 8.026881720430108e-06, "loss": 3.0518, "mean_token_accuracy": 0.46080198242847487, "step": 10643 }, { "epoch": 1.9733036707452727, "grad_norm": 6.4375, "learning_rate": 8.026696329254729e-06, "loss": 2.6385, "mean_token_accuracy": 0.4874191229331416, "step": 10644 }, { "epoch": 1.9734890619206524, "grad_norm": 5.48046875, "learning_rate": 8.026510938079349e-06, "loss": 3.0651, "mean_token_accuracy": 0.4495970405601797, "step": 10645 }, { "epoch": 1.9736744530960326, "grad_norm": 5.75390625, "learning_rate": 8.026325546903968e-06, "loss": 2.8533, "mean_token_accuracy": 0.4759027266028003, "step": 10646 }, { "epoch": 1.9738598442714128, "grad_norm": 5.8515625, "learning_rate": 8.026140155728588e-06, "loss": 2.4104, "mean_token_accuracy": 0.5054945054945055, "step": 10647 }, { "epoch": 1.9740452354467928, "grad_norm": 5.56640625, "learning_rate": 8.025954764553207e-06, "loss": 2.9778, "mean_token_accuracy": 0.46188449848024316, "step": 10648 }, { "epoch": 1.9742306266221727, "grad_norm": 7.95703125, "learning_rate": 8.025769373377828e-06, "loss": 3.6646, "mean_token_accuracy": 0.4354908461871126, "step": 10649 }, { "epoch": 1.974416017797553, "grad_norm": 7.14453125, "learning_rate": 8.025583982202448e-06, "loss": 2.5355, "mean_token_accuracy": 0.5058131939908557, "step": 10650 }, { "epoch": 1.974601408972933, "grad_norm": 8.6640625, "learning_rate": 8.025398591027069e-06, "loss": 2.8217, "mean_token_accuracy": 0.48036253776435045, "step": 10651 }, { "epoch": 1.9747868001483129, "grad_norm": 7.5546875, "learning_rate": 8.025213199851687e-06, "loss": 2.4395, "mean_token_accuracy": 0.529796511627907, "step": 10652 }, { "epoch": 1.974972191323693, "grad_norm": 6.9140625, "learning_rate": 8.025027808676308e-06, "loss": 2.6031, "mean_token_accuracy": 0.5236822568671121, "step": 10653 }, { "epoch": 1.975157582499073, "grad_norm": 8.734375, "learning_rate": 8.024842417500928e-06, "loss": 3.1856, "mean_token_accuracy": 0.44300265103948655, "step": 10654 }, { "epoch": 1.975342973674453, "grad_norm": 7.15234375, "learning_rate": 8.024657026325547e-06, "loss": 2.8198, "mean_token_accuracy": 0.47390515089630136, "step": 10655 }, { "epoch": 1.9755283648498332, "grad_norm": 10.1015625, "learning_rate": 8.024471635150168e-06, "loss": 2.3306, "mean_token_accuracy": 0.512817290776577, "step": 10656 }, { "epoch": 1.9757137560252132, "grad_norm": 8.3515625, "learning_rate": 8.024286243974786e-06, "loss": 2.0155, "mean_token_accuracy": 0.598697539797395, "step": 10657 }, { "epoch": 1.9758991472005931, "grad_norm": 10.4765625, "learning_rate": 8.024100852799407e-06, "loss": 2.659, "mean_token_accuracy": 0.4819435325016415, "step": 10658 }, { "epoch": 1.9760845383759733, "grad_norm": 9.71875, "learning_rate": 8.023915461624027e-06, "loss": 2.8298, "mean_token_accuracy": 0.4708695652173913, "step": 10659 }, { "epoch": 1.9762699295513535, "grad_norm": 6.75390625, "learning_rate": 8.023730070448648e-06, "loss": 3.3916, "mean_token_accuracy": 0.4241001564945227, "step": 10660 }, { "epoch": 1.9764553207267332, "grad_norm": 7.96875, "learning_rate": 8.023544679273267e-06, "loss": 3.2713, "mean_token_accuracy": 0.43509385937002865, "step": 10661 }, { "epoch": 1.9766407119021134, "grad_norm": 7.51953125, "learning_rate": 8.023359288097887e-06, "loss": 3.5524, "mean_token_accuracy": 0.4191852825229961, "step": 10662 }, { "epoch": 1.9768261030774936, "grad_norm": 8.15625, "learning_rate": 8.023173896922508e-06, "loss": 3.0429, "mean_token_accuracy": 0.46165843514426824, "step": 10663 }, { "epoch": 1.9770114942528736, "grad_norm": 7.62109375, "learning_rate": 8.022988505747126e-06, "loss": 3.1729, "mean_token_accuracy": 0.44058744993324434, "step": 10664 }, { "epoch": 1.9771968854282536, "grad_norm": 5.734375, "learning_rate": 8.022803114571747e-06, "loss": 3.2362, "mean_token_accuracy": 0.44572984008166044, "step": 10665 }, { "epoch": 1.9773822766036337, "grad_norm": 7.31640625, "learning_rate": 8.022617723396366e-06, "loss": 2.0301, "mean_token_accuracy": 0.5658025372311086, "step": 10666 }, { "epoch": 1.9775676677790137, "grad_norm": 7.3984375, "learning_rate": 8.022432332220988e-06, "loss": 3.2387, "mean_token_accuracy": 0.4350767629456154, "step": 10667 }, { "epoch": 1.9777530589543937, "grad_norm": 6.98828125, "learning_rate": 8.022246941045607e-06, "loss": 3.1533, "mean_token_accuracy": 0.47295946696279845, "step": 10668 }, { "epoch": 1.9779384501297739, "grad_norm": 6.73046875, "learning_rate": 8.022061549870227e-06, "loss": 3.1954, "mean_token_accuracy": 0.44366197183098594, "step": 10669 }, { "epoch": 1.9781238413051538, "grad_norm": 6.28125, "learning_rate": 8.021876158694846e-06, "loss": 3.0928, "mean_token_accuracy": 0.4833887043189369, "step": 10670 }, { "epoch": 1.9783092324805338, "grad_norm": 6.0078125, "learning_rate": 8.021690767519467e-06, "loss": 3.1318, "mean_token_accuracy": 0.4527487821851079, "step": 10671 }, { "epoch": 1.978494623655914, "grad_norm": 5.44140625, "learning_rate": 8.021505376344087e-06, "loss": 2.7924, "mean_token_accuracy": 0.46773120425815035, "step": 10672 }, { "epoch": 1.9786800148312942, "grad_norm": 6.19921875, "learning_rate": 8.021319985168706e-06, "loss": 2.8581, "mean_token_accuracy": 0.4737927687454501, "step": 10673 }, { "epoch": 1.978865406006674, "grad_norm": 6.23046875, "learning_rate": 8.021134593993326e-06, "loss": 2.5478, "mean_token_accuracy": 0.4771878184082418, "step": 10674 }, { "epoch": 1.9790507971820541, "grad_norm": 6.19921875, "learning_rate": 8.020949202817947e-06, "loss": 3.783, "mean_token_accuracy": 0.4208070271754049, "step": 10675 }, { "epoch": 1.9792361883574343, "grad_norm": 7.328125, "learning_rate": 8.020763811642567e-06, "loss": 2.2361, "mean_token_accuracy": 0.5335989661856558, "step": 10676 }, { "epoch": 1.9794215795328143, "grad_norm": 7.48046875, "learning_rate": 8.020578420467186e-06, "loss": 3.131, "mean_token_accuracy": 0.43209486166007904, "step": 10677 }, { "epoch": 1.9796069707081942, "grad_norm": 8.984375, "learning_rate": 8.020393029291807e-06, "loss": 2.8919, "mean_token_accuracy": 0.44433208615633535, "step": 10678 }, { "epoch": 1.9797923618835744, "grad_norm": 10.1953125, "learning_rate": 8.020207638116425e-06, "loss": 2.5593, "mean_token_accuracy": 0.4896719319562576, "step": 10679 }, { "epoch": 1.9799777530589544, "grad_norm": 7.453125, "learning_rate": 8.020022246941046e-06, "loss": 3.285, "mean_token_accuracy": 0.4269027882441598, "step": 10680 }, { "epoch": 1.9801631442343344, "grad_norm": 10.953125, "learning_rate": 8.019836855765666e-06, "loss": 2.6711, "mean_token_accuracy": 0.4742864025051186, "step": 10681 }, { "epoch": 1.9803485354097146, "grad_norm": 8.9375, "learning_rate": 8.019651464590285e-06, "loss": 3.0672, "mean_token_accuracy": 0.4702925634746746, "step": 10682 }, { "epoch": 1.9805339265850945, "grad_norm": 8.09375, "learning_rate": 8.019466073414907e-06, "loss": 2.431, "mean_token_accuracy": 0.5255681818181818, "step": 10683 }, { "epoch": 1.9807193177604745, "grad_norm": 7.1796875, "learning_rate": 8.019280682239526e-06, "loss": 3.1742, "mean_token_accuracy": 0.4424148974024434, "step": 10684 }, { "epoch": 1.9809047089358547, "grad_norm": 6.94921875, "learning_rate": 8.019095291064147e-06, "loss": 2.6353, "mean_token_accuracy": 0.4828155981493721, "step": 10685 }, { "epoch": 1.9810901001112347, "grad_norm": 6.09375, "learning_rate": 8.018909899888765e-06, "loss": 2.3101, "mean_token_accuracy": 0.5647118947233933, "step": 10686 }, { "epoch": 1.9812754912866146, "grad_norm": 6.30078125, "learning_rate": 8.018724508713386e-06, "loss": 2.6596, "mean_token_accuracy": 0.4853117107536887, "step": 10687 }, { "epoch": 1.9814608824619948, "grad_norm": 5.4140625, "learning_rate": 8.018539117538006e-06, "loss": 2.662, "mean_token_accuracy": 0.4828291684933509, "step": 10688 }, { "epoch": 1.981646273637375, "grad_norm": 5.921875, "learning_rate": 8.018353726362625e-06, "loss": 3.189, "mean_token_accuracy": 0.44811028500619576, "step": 10689 }, { "epoch": 1.981831664812755, "grad_norm": 8.2109375, "learning_rate": 8.018168335187246e-06, "loss": 3.0401, "mean_token_accuracy": 0.48121718055742563, "step": 10690 }, { "epoch": 1.982017055988135, "grad_norm": 6.7421875, "learning_rate": 8.017982944011866e-06, "loss": 2.8874, "mean_token_accuracy": 0.4606582278481013, "step": 10691 }, { "epoch": 1.9822024471635151, "grad_norm": 7.69140625, "learning_rate": 8.017797552836487e-06, "loss": 3.1193, "mean_token_accuracy": 0.4639175257731959, "step": 10692 }, { "epoch": 1.982387838338895, "grad_norm": 6.5, "learning_rate": 8.017612161661105e-06, "loss": 3.0379, "mean_token_accuracy": 0.4620075046904315, "step": 10693 }, { "epoch": 1.982573229514275, "grad_norm": 7.578125, "learning_rate": 8.017426770485726e-06, "loss": 2.769, "mean_token_accuracy": 0.46250282358256156, "step": 10694 }, { "epoch": 1.9827586206896552, "grad_norm": 7.9609375, "learning_rate": 8.017241379310345e-06, "loss": 2.7203, "mean_token_accuracy": 0.4851439890294073, "step": 10695 }, { "epoch": 1.9829440118650352, "grad_norm": 8.4921875, "learning_rate": 8.017055988134965e-06, "loss": 3.1994, "mean_token_accuracy": 0.4620359281437126, "step": 10696 }, { "epoch": 1.9831294030404152, "grad_norm": 5.2421875, "learning_rate": 8.016870596959586e-06, "loss": 3.025, "mean_token_accuracy": 0.4575773064145157, "step": 10697 }, { "epoch": 1.9833147942157954, "grad_norm": 8.3984375, "learning_rate": 8.016685205784205e-06, "loss": 3.2198, "mean_token_accuracy": 0.4391107853914743, "step": 10698 }, { "epoch": 1.9835001853911753, "grad_norm": 9.3125, "learning_rate": 8.016499814608825e-06, "loss": 2.5917, "mean_token_accuracy": 0.4902886431076342, "step": 10699 }, { "epoch": 1.9836855765665553, "grad_norm": 7.69140625, "learning_rate": 8.016314423433446e-06, "loss": 2.8955, "mean_token_accuracy": 0.4513635861764381, "step": 10700 }, { "epoch": 1.9838709677419355, "grad_norm": 7.59375, "learning_rate": 8.016129032258066e-06, "loss": 2.5019, "mean_token_accuracy": 0.4971257185703574, "step": 10701 }, { "epoch": 1.9840563589173157, "grad_norm": 11.9765625, "learning_rate": 8.015943641082685e-06, "loss": 2.8997, "mean_token_accuracy": 0.4768067922965417, "step": 10702 }, { "epoch": 1.9842417500926954, "grad_norm": 7.1796875, "learning_rate": 8.015758249907305e-06, "loss": 3.556, "mean_token_accuracy": 0.41295595949737707, "step": 10703 }, { "epoch": 1.9844271412680756, "grad_norm": 8.2734375, "learning_rate": 8.015572858731924e-06, "loss": 2.6868, "mean_token_accuracy": 0.4712054965646471, "step": 10704 }, { "epoch": 1.9846125324434558, "grad_norm": 11.5390625, "learning_rate": 8.015387467556545e-06, "loss": 2.9923, "mean_token_accuracy": 0.4474330487662262, "step": 10705 }, { "epoch": 1.9847979236188358, "grad_norm": 8.46875, "learning_rate": 8.015202076381165e-06, "loss": 2.9238, "mean_token_accuracy": 0.46416510318949344, "step": 10706 }, { "epoch": 1.9849833147942157, "grad_norm": 6.62109375, "learning_rate": 8.015016685205786e-06, "loss": 2.8922, "mean_token_accuracy": 0.44942424926620006, "step": 10707 }, { "epoch": 1.985168705969596, "grad_norm": 11.5, "learning_rate": 8.014831294030404e-06, "loss": 2.7848, "mean_token_accuracy": 0.45894102726696256, "step": 10708 }, { "epoch": 1.985354097144976, "grad_norm": 9.7109375, "learning_rate": 8.014645902855025e-06, "loss": 3.0297, "mean_token_accuracy": 0.4752231455832564, "step": 10709 }, { "epoch": 1.9855394883203559, "grad_norm": 6.44140625, "learning_rate": 8.014460511679645e-06, "loss": 3.0374, "mean_token_accuracy": 0.4555663385553861, "step": 10710 }, { "epoch": 1.985724879495736, "grad_norm": 8.9453125, "learning_rate": 8.014275120504264e-06, "loss": 2.6101, "mean_token_accuracy": 0.5046206225680934, "step": 10711 }, { "epoch": 1.985910270671116, "grad_norm": 6.30859375, "learning_rate": 8.014089729328885e-06, "loss": 2.4811, "mean_token_accuracy": 0.5099959200326397, "step": 10712 }, { "epoch": 1.986095661846496, "grad_norm": 6.5234375, "learning_rate": 8.013904338153503e-06, "loss": 3.4257, "mean_token_accuracy": 0.4121285627653123, "step": 10713 }, { "epoch": 1.9862810530218762, "grad_norm": 9.421875, "learning_rate": 8.013718946978124e-06, "loss": 2.7287, "mean_token_accuracy": 0.49371657754010695, "step": 10714 }, { "epoch": 1.9864664441972564, "grad_norm": 6.74609375, "learning_rate": 8.013533555802744e-06, "loss": 3.4397, "mean_token_accuracy": 0.43787477339283226, "step": 10715 }, { "epoch": 1.9866518353726361, "grad_norm": 7.39453125, "learning_rate": 8.013348164627365e-06, "loss": 2.644, "mean_token_accuracy": 0.5053722179585571, "step": 10716 }, { "epoch": 1.9868372265480163, "grad_norm": 6.9296875, "learning_rate": 8.013162773451985e-06, "loss": 3.1884, "mean_token_accuracy": 0.45357142857142857, "step": 10717 }, { "epoch": 1.9870226177233965, "grad_norm": 5.9921875, "learning_rate": 8.012977382276604e-06, "loss": 2.7344, "mean_token_accuracy": 0.49273447820343463, "step": 10718 }, { "epoch": 1.9872080088987765, "grad_norm": 6.12109375, "learning_rate": 8.012791991101225e-06, "loss": 2.6594, "mean_token_accuracy": 0.48302425106990016, "step": 10719 }, { "epoch": 1.9873934000741564, "grad_norm": 6.55859375, "learning_rate": 8.012606599925843e-06, "loss": 3.5412, "mean_token_accuracy": 0.417531556802244, "step": 10720 }, { "epoch": 1.9875787912495366, "grad_norm": 10.84375, "learning_rate": 8.012421208750464e-06, "loss": 2.5985, "mean_token_accuracy": 0.49404919333509656, "step": 10721 }, { "epoch": 1.9877641824249166, "grad_norm": 5.96875, "learning_rate": 8.012235817575083e-06, "loss": 3.3425, "mean_token_accuracy": 0.4218241042345277, "step": 10722 }, { "epoch": 1.9879495736002966, "grad_norm": 8.4296875, "learning_rate": 8.012050426399705e-06, "loss": 3.8381, "mean_token_accuracy": 0.39969079665332846, "step": 10723 }, { "epoch": 1.9881349647756767, "grad_norm": 6.84375, "learning_rate": 8.011865035224324e-06, "loss": 2.8277, "mean_token_accuracy": 0.4730851758559567, "step": 10724 }, { "epoch": 1.9883203559510567, "grad_norm": 8.5, "learning_rate": 8.011679644048944e-06, "loss": 2.3781, "mean_token_accuracy": 0.5369665397715888, "step": 10725 }, { "epoch": 1.9885057471264367, "grad_norm": 6.09375, "learning_rate": 8.011494252873565e-06, "loss": 2.7855, "mean_token_accuracy": 0.4789915966386555, "step": 10726 }, { "epoch": 1.9886911383018169, "grad_norm": 8.3359375, "learning_rate": 8.011308861698184e-06, "loss": 2.7558, "mean_token_accuracy": 0.4632361760825237, "step": 10727 }, { "epoch": 1.9888765294771968, "grad_norm": 7.7890625, "learning_rate": 8.011123470522804e-06, "loss": 2.6262, "mean_token_accuracy": 0.5144725557461407, "step": 10728 }, { "epoch": 1.9890619206525768, "grad_norm": 7.1875, "learning_rate": 8.010938079347423e-06, "loss": 3.3445, "mean_token_accuracy": 0.4565354928508161, "step": 10729 }, { "epoch": 1.989247311827957, "grad_norm": 8.4140625, "learning_rate": 8.010752688172043e-06, "loss": 3.0721, "mean_token_accuracy": 0.47480077536075815, "step": 10730 }, { "epoch": 1.9894327030033372, "grad_norm": 9.875, "learning_rate": 8.010567296996664e-06, "loss": 2.4567, "mean_token_accuracy": 0.5297147882899862, "step": 10731 }, { "epoch": 1.989618094178717, "grad_norm": 10.8203125, "learning_rate": 8.010381905821284e-06, "loss": 3.8611, "mean_token_accuracy": 0.42245534448539834, "step": 10732 }, { "epoch": 1.9898034853540971, "grad_norm": 7.80859375, "learning_rate": 8.010196514645903e-06, "loss": 3.1811, "mean_token_accuracy": 0.45581480801028845, "step": 10733 }, { "epoch": 1.9899888765294773, "grad_norm": 7.98046875, "learning_rate": 8.010011123470524e-06, "loss": 3.1298, "mean_token_accuracy": 0.45045385779122543, "step": 10734 }, { "epoch": 1.9901742677048573, "grad_norm": 8.265625, "learning_rate": 8.009825732295144e-06, "loss": 2.9986, "mean_token_accuracy": 0.4547320878669678, "step": 10735 }, { "epoch": 1.9903596588802372, "grad_norm": 10.8359375, "learning_rate": 8.009640341119763e-06, "loss": 3.1925, "mean_token_accuracy": 0.4679950866657568, "step": 10736 }, { "epoch": 1.9905450500556174, "grad_norm": 9.3828125, "learning_rate": 8.009454949944383e-06, "loss": 3.1173, "mean_token_accuracy": 0.44630973986690864, "step": 10737 }, { "epoch": 1.9907304412309974, "grad_norm": 14.9921875, "learning_rate": 8.009269558769002e-06, "loss": 2.8305, "mean_token_accuracy": 0.43517074016293983, "step": 10738 }, { "epoch": 1.9909158324063774, "grad_norm": 8.21875, "learning_rate": 8.009084167593624e-06, "loss": 3.5514, "mean_token_accuracy": 0.42876693581497194, "step": 10739 }, { "epoch": 1.9911012235817576, "grad_norm": 10.0, "learning_rate": 8.008898776418243e-06, "loss": 2.8148, "mean_token_accuracy": 0.4732837627528964, "step": 10740 }, { "epoch": 1.9912866147571375, "grad_norm": 8.0234375, "learning_rate": 8.008713385242864e-06, "loss": 3.0343, "mean_token_accuracy": 0.4490351872871737, "step": 10741 }, { "epoch": 1.9914720059325175, "grad_norm": 8.8671875, "learning_rate": 8.008527994067482e-06, "loss": 2.7333, "mean_token_accuracy": 0.4609048978695366, "step": 10742 }, { "epoch": 1.9916573971078977, "grad_norm": 8.703125, "learning_rate": 8.008342602892103e-06, "loss": 2.6406, "mean_token_accuracy": 0.4728003444811253, "step": 10743 }, { "epoch": 1.9918427882832779, "grad_norm": 8.28125, "learning_rate": 8.008157211716723e-06, "loss": 2.7926, "mean_token_accuracy": 0.4745317496573778, "step": 10744 }, { "epoch": 1.9920281794586576, "grad_norm": 9.5234375, "learning_rate": 8.007971820541342e-06, "loss": 2.5409, "mean_token_accuracy": 0.49304461942257216, "step": 10745 }, { "epoch": 1.9922135706340378, "grad_norm": 6.87109375, "learning_rate": 8.007786429365963e-06, "loss": 2.5533, "mean_token_accuracy": 0.4864109728219456, "step": 10746 }, { "epoch": 1.992398961809418, "grad_norm": 6.7578125, "learning_rate": 8.007601038190583e-06, "loss": 3.0521, "mean_token_accuracy": 0.46139574711003284, "step": 10747 }, { "epoch": 1.992584352984798, "grad_norm": 7.59765625, "learning_rate": 8.007415647015204e-06, "loss": 3.3525, "mean_token_accuracy": 0.45082726671078754, "step": 10748 }, { "epoch": 1.992769744160178, "grad_norm": 7.84375, "learning_rate": 8.007230255839822e-06, "loss": 2.2662, "mean_token_accuracy": 0.5143433437920215, "step": 10749 }, { "epoch": 1.9929551353355581, "grad_norm": 6.51953125, "learning_rate": 8.007044864664443e-06, "loss": 2.9754, "mean_token_accuracy": 0.48411758371917735, "step": 10750 }, { "epoch": 1.993140526510938, "grad_norm": 6.7265625, "learning_rate": 8.006859473489062e-06, "loss": 2.2903, "mean_token_accuracy": 0.5112095583299772, "step": 10751 }, { "epoch": 1.993325917686318, "grad_norm": 5.67578125, "learning_rate": 8.006674082313682e-06, "loss": 3.0573, "mean_token_accuracy": 0.45158144071339, "step": 10752 }, { "epoch": 1.9935113088616983, "grad_norm": 5.88671875, "learning_rate": 8.006488691138303e-06, "loss": 3.0611, "mean_token_accuracy": 0.4632192279679534, "step": 10753 }, { "epoch": 1.9936967000370782, "grad_norm": 7.3515625, "learning_rate": 8.006303299962922e-06, "loss": 2.6784, "mean_token_accuracy": 0.47740986019131715, "step": 10754 }, { "epoch": 1.9938820912124582, "grad_norm": 7.171875, "learning_rate": 8.006117908787544e-06, "loss": 3.0903, "mean_token_accuracy": 0.4571873378308251, "step": 10755 }, { "epoch": 1.9940674823878384, "grad_norm": 8.5625, "learning_rate": 8.005932517612163e-06, "loss": 2.4697, "mean_token_accuracy": 0.5211170724996561, "step": 10756 }, { "epoch": 1.9942528735632183, "grad_norm": 8.0, "learning_rate": 8.005747126436783e-06, "loss": 3.4159, "mean_token_accuracy": 0.4333435021354484, "step": 10757 }, { "epoch": 1.9944382647385983, "grad_norm": 8.0546875, "learning_rate": 8.005561735261402e-06, "loss": 2.8394, "mean_token_accuracy": 0.4744318181818182, "step": 10758 }, { "epoch": 1.9946236559139785, "grad_norm": 6.9296875, "learning_rate": 8.005376344086022e-06, "loss": 2.4562, "mean_token_accuracy": 0.539283171136957, "step": 10759 }, { "epoch": 1.9948090470893587, "grad_norm": 6.30078125, "learning_rate": 8.005190952910641e-06, "loss": 2.6911, "mean_token_accuracy": 0.4833207547169811, "step": 10760 }, { "epoch": 1.9949944382647384, "grad_norm": 6.80859375, "learning_rate": 8.005005561735262e-06, "loss": 2.9502, "mean_token_accuracy": 0.4690017513134851, "step": 10761 }, { "epoch": 1.9951798294401186, "grad_norm": 8.3046875, "learning_rate": 8.004820170559882e-06, "loss": 3.6189, "mean_token_accuracy": 0.45224908820748344, "step": 10762 }, { "epoch": 1.9953652206154988, "grad_norm": 6.11328125, "learning_rate": 8.004634779384503e-06, "loss": 2.8786, "mean_token_accuracy": 0.4625249358791679, "step": 10763 }, { "epoch": 1.9955506117908788, "grad_norm": 6.0859375, "learning_rate": 8.004449388209123e-06, "loss": 3.063, "mean_token_accuracy": 0.4746670328161472, "step": 10764 }, { "epoch": 1.9957360029662587, "grad_norm": 7.2421875, "learning_rate": 8.004263997033742e-06, "loss": 3.327, "mean_token_accuracy": 0.438682652029467, "step": 10765 }, { "epoch": 1.995921394141639, "grad_norm": 7.02734375, "learning_rate": 8.004078605858362e-06, "loss": 2.6294, "mean_token_accuracy": 0.49117381780698366, "step": 10766 }, { "epoch": 1.996106785317019, "grad_norm": 7.2109375, "learning_rate": 8.003893214682981e-06, "loss": 3.8101, "mean_token_accuracy": 0.4112128657583203, "step": 10767 }, { "epoch": 1.9962921764923989, "grad_norm": 5.37890625, "learning_rate": 8.003707823507602e-06, "loss": 2.8985, "mean_token_accuracy": 0.47346706501636077, "step": 10768 }, { "epoch": 1.996477567667779, "grad_norm": 6.8046875, "learning_rate": 8.003522432332222e-06, "loss": 2.8025, "mean_token_accuracy": 0.47616702777230646, "step": 10769 }, { "epoch": 1.996662958843159, "grad_norm": 7.0078125, "learning_rate": 8.003337041156841e-06, "loss": 2.8019, "mean_token_accuracy": 0.475470697427738, "step": 10770 }, { "epoch": 1.996848350018539, "grad_norm": 7.9296875, "learning_rate": 8.003151649981461e-06, "loss": 2.7185, "mean_token_accuracy": 0.48474033965050456, "step": 10771 }, { "epoch": 1.9970337411939192, "grad_norm": 7.10546875, "learning_rate": 8.002966258806082e-06, "loss": 2.4837, "mean_token_accuracy": 0.49939702532493635, "step": 10772 }, { "epoch": 1.9972191323692994, "grad_norm": 6.26171875, "learning_rate": 8.002780867630702e-06, "loss": 3.3297, "mean_token_accuracy": 0.4303306116985678, "step": 10773 }, { "epoch": 1.9974045235446791, "grad_norm": 6.1171875, "learning_rate": 8.002595476455321e-06, "loss": 2.4726, "mean_token_accuracy": 0.5159839552692355, "step": 10774 }, { "epoch": 1.9975899147200593, "grad_norm": 9.3828125, "learning_rate": 8.002410085279942e-06, "loss": 2.7426, "mean_token_accuracy": 0.47046390032261654, "step": 10775 }, { "epoch": 1.9977753058954395, "grad_norm": 6.45703125, "learning_rate": 8.00222469410456e-06, "loss": 2.3168, "mean_token_accuracy": 0.5241369632332304, "step": 10776 }, { "epoch": 1.9979606970708195, "grad_norm": 5.6953125, "learning_rate": 8.002039302929181e-06, "loss": 3.0936, "mean_token_accuracy": 0.4397991211550534, "step": 10777 }, { "epoch": 1.9981460882461994, "grad_norm": 7.296875, "learning_rate": 8.001853911753802e-06, "loss": 2.5426, "mean_token_accuracy": 0.4850966262692434, "step": 10778 }, { "epoch": 1.9983314794215796, "grad_norm": 6.328125, "learning_rate": 8.00166852057842e-06, "loss": 3.1363, "mean_token_accuracy": 0.43676767676767675, "step": 10779 }, { "epoch": 1.9985168705969596, "grad_norm": 7.78515625, "learning_rate": 8.00148312940304e-06, "loss": 1.9923, "mean_token_accuracy": 0.5631584437258788, "step": 10780 }, { "epoch": 1.9987022617723396, "grad_norm": 6.421875, "learning_rate": 8.001297738227661e-06, "loss": 3.1193, "mean_token_accuracy": 0.43542234332425067, "step": 10781 }, { "epoch": 1.9988876529477198, "grad_norm": 7.203125, "learning_rate": 8.001112347052282e-06, "loss": 3.7044, "mean_token_accuracy": 0.41835147744945567, "step": 10782 }, { "epoch": 1.9990730441230997, "grad_norm": 7.21875, "learning_rate": 8.0009269558769e-06, "loss": 4.3436, "mean_token_accuracy": 0.4155860349127182, "step": 10783 }, { "epoch": 1.9992584352984797, "grad_norm": 5.484375, "learning_rate": 8.000741564701521e-06, "loss": 3.3995, "mean_token_accuracy": 0.4310155735906997, "step": 10784 }, { "epoch": 1.9994438264738599, "grad_norm": 6.65234375, "learning_rate": 8.00055617352614e-06, "loss": 2.9603, "mean_token_accuracy": 0.465089065141074, "step": 10785 }, { "epoch": 1.9996292176492398, "grad_norm": 8.0546875, "learning_rate": 8.00037078235076e-06, "loss": 3.042, "mean_token_accuracy": 0.4503063308373043, "step": 10786 }, { "epoch": 1.9998146088246198, "grad_norm": 6.80859375, "learning_rate": 8.00018539117538e-06, "loss": 2.5963, "mean_token_accuracy": 0.5153462749213011, "step": 10787 }, { "epoch": 2.0, "grad_norm": 7.71875, "learning_rate": 8.000000000000001e-06, "loss": 3.6965, "mean_token_accuracy": 0.4419183538001991, "step": 10788 }, { "epoch": 2.00018539117538, "grad_norm": 8.40625, "learning_rate": 7.99981460882462e-06, "loss": 2.4972, "mean_token_accuracy": 0.48521563536708495, "step": 10789 }, { "epoch": 2.00037078235076, "grad_norm": 8.75, "learning_rate": 7.99962921764924e-06, "loss": 2.8787, "mean_token_accuracy": 0.47733677782996714, "step": 10790 }, { "epoch": 2.00055617352614, "grad_norm": 8.9296875, "learning_rate": 7.999443826473861e-06, "loss": 2.4858, "mean_token_accuracy": 0.5247357293868922, "step": 10791 }, { "epoch": 2.0007415647015203, "grad_norm": 10.0390625, "learning_rate": 7.99925843529848e-06, "loss": 2.4654, "mean_token_accuracy": 0.5227120908483633, "step": 10792 }, { "epoch": 2.0009269558769, "grad_norm": 8.6015625, "learning_rate": 7.9990730441231e-06, "loss": 2.7942, "mean_token_accuracy": 0.47761897282864774, "step": 10793 }, { "epoch": 2.0011123470522802, "grad_norm": 7.37109375, "learning_rate": 7.99888765294772e-06, "loss": 3.0089, "mean_token_accuracy": 0.47504078303425773, "step": 10794 }, { "epoch": 2.0012977382276604, "grad_norm": 8.421875, "learning_rate": 7.99870226177234e-06, "loss": 2.2437, "mean_token_accuracy": 0.5568345323741007, "step": 10795 }, { "epoch": 2.0014831294030406, "grad_norm": 8.671875, "learning_rate": 7.99851687059696e-06, "loss": 2.5008, "mean_token_accuracy": 0.5194006915097964, "step": 10796 }, { "epoch": 2.0016685205784204, "grad_norm": 8.6875, "learning_rate": 7.99833147942158e-06, "loss": 2.4547, "mean_token_accuracy": 0.5127133009379591, "step": 10797 }, { "epoch": 2.0018539117538006, "grad_norm": 6.265625, "learning_rate": 7.9981460882462e-06, "loss": 2.5066, "mean_token_accuracy": 0.5023052959501557, "step": 10798 }, { "epoch": 2.0020393029291808, "grad_norm": 7.83203125, "learning_rate": 7.99796069707082e-06, "loss": 2.964, "mean_token_accuracy": 0.4700488269620102, "step": 10799 }, { "epoch": 2.0022246941045605, "grad_norm": 7.9453125, "learning_rate": 7.99777530589544e-06, "loss": 2.7573, "mean_token_accuracy": 0.4727040090986636, "step": 10800 }, { "epoch": 2.0024100852799407, "grad_norm": 7.94140625, "learning_rate": 7.99758991472006e-06, "loss": 3.0897, "mean_token_accuracy": 0.427367055771725, "step": 10801 }, { "epoch": 2.002595476455321, "grad_norm": 9.75, "learning_rate": 7.99740452354468e-06, "loss": 2.7898, "mean_token_accuracy": 0.47820593809222994, "step": 10802 }, { "epoch": 2.0027808676307006, "grad_norm": 6.2734375, "learning_rate": 7.997219132369299e-06, "loss": 3.0672, "mean_token_accuracy": 0.4681102828067684, "step": 10803 }, { "epoch": 2.002966258806081, "grad_norm": 6.328125, "learning_rate": 7.99703374119392e-06, "loss": 2.9532, "mean_token_accuracy": 0.4748270102600811, "step": 10804 }, { "epoch": 2.003151649981461, "grad_norm": 7.41015625, "learning_rate": 7.99684835001854e-06, "loss": 2.4855, "mean_token_accuracy": 0.5426364467933733, "step": 10805 }, { "epoch": 2.0033370411568407, "grad_norm": 6.87109375, "learning_rate": 7.99666295884316e-06, "loss": 3.4007, "mean_token_accuracy": 0.4587834664161578, "step": 10806 }, { "epoch": 2.003522432332221, "grad_norm": 5.671875, "learning_rate": 7.99647756766778e-06, "loss": 3.0086, "mean_token_accuracy": 0.4724717473196175, "step": 10807 }, { "epoch": 2.003707823507601, "grad_norm": 7.90625, "learning_rate": 7.9962921764924e-06, "loss": 3.0367, "mean_token_accuracy": 0.4762157148535842, "step": 10808 }, { "epoch": 2.003893214682981, "grad_norm": 5.8984375, "learning_rate": 7.99610678531702e-06, "loss": 2.8093, "mean_token_accuracy": 0.4808464064210142, "step": 10809 }, { "epoch": 2.004078605858361, "grad_norm": 6.99609375, "learning_rate": 7.995921394141639e-06, "loss": 3.4001, "mean_token_accuracy": 0.4218236928517302, "step": 10810 }, { "epoch": 2.0042639970337413, "grad_norm": 7.48046875, "learning_rate": 7.995736002966259e-06, "loss": 2.5082, "mean_token_accuracy": 0.5175164216452924, "step": 10811 }, { "epoch": 2.0044493882091214, "grad_norm": 5.81640625, "learning_rate": 7.99555061179088e-06, "loss": 2.2787, "mean_token_accuracy": 0.5540441176470589, "step": 10812 }, { "epoch": 2.004634779384501, "grad_norm": 6.32421875, "learning_rate": 7.9953652206155e-06, "loss": 2.8187, "mean_token_accuracy": 0.4836655592469546, "step": 10813 }, { "epoch": 2.0048201705598814, "grad_norm": 5.23828125, "learning_rate": 7.995179829440119e-06, "loss": 2.4098, "mean_token_accuracy": 0.5099636545933074, "step": 10814 }, { "epoch": 2.0050055617352616, "grad_norm": 6.05859375, "learning_rate": 7.99499443826474e-06, "loss": 3.0315, "mean_token_accuracy": 0.46397273612463485, "step": 10815 }, { "epoch": 2.0051909529106413, "grad_norm": 8.984375, "learning_rate": 7.99480904708936e-06, "loss": 3.1042, "mean_token_accuracy": 0.4725745629070963, "step": 10816 }, { "epoch": 2.0053763440860215, "grad_norm": 7.39453125, "learning_rate": 7.994623655913979e-06, "loss": 2.541, "mean_token_accuracy": 0.5356822932218572, "step": 10817 }, { "epoch": 2.0055617352614017, "grad_norm": 6.66015625, "learning_rate": 7.994438264738599e-06, "loss": 2.3002, "mean_token_accuracy": 0.5514898065865134, "step": 10818 }, { "epoch": 2.0057471264367814, "grad_norm": 6.171875, "learning_rate": 7.994252873563218e-06, "loss": 2.7205, "mean_token_accuracy": 0.49355376914909754, "step": 10819 }, { "epoch": 2.0059325176121616, "grad_norm": 7.01953125, "learning_rate": 7.99406748238784e-06, "loss": 2.774, "mean_token_accuracy": 0.4949189450762158, "step": 10820 }, { "epoch": 2.006117908787542, "grad_norm": 7.54296875, "learning_rate": 7.993882091212459e-06, "loss": 2.7606, "mean_token_accuracy": 0.47726341663252764, "step": 10821 }, { "epoch": 2.0063032999629216, "grad_norm": 5.703125, "learning_rate": 7.99369670003708e-06, "loss": 3.1624, "mean_token_accuracy": 0.4446133796698523, "step": 10822 }, { "epoch": 2.0064886911383017, "grad_norm": 6.84375, "learning_rate": 7.993511308861698e-06, "loss": 2.5116, "mean_token_accuracy": 0.5036338859868271, "step": 10823 }, { "epoch": 2.006674082313682, "grad_norm": 7.3359375, "learning_rate": 7.993325917686319e-06, "loss": 2.6711, "mean_token_accuracy": 0.4898547244567175, "step": 10824 }, { "epoch": 2.006859473489062, "grad_norm": 6.58984375, "learning_rate": 7.99314052651094e-06, "loss": 2.7956, "mean_token_accuracy": 0.47031766566048583, "step": 10825 }, { "epoch": 2.007044864664442, "grad_norm": 8.0546875, "learning_rate": 7.992955135335558e-06, "loss": 2.5852, "mean_token_accuracy": 0.500428134556575, "step": 10826 }, { "epoch": 2.007230255839822, "grad_norm": 6.90234375, "learning_rate": 7.992769744160178e-06, "loss": 3.3116, "mean_token_accuracy": 0.4528529563868299, "step": 10827 }, { "epoch": 2.0074156470152023, "grad_norm": 8.0625, "learning_rate": 7.992584352984799e-06, "loss": 2.4038, "mean_token_accuracy": 0.522578360191252, "step": 10828 }, { "epoch": 2.007601038190582, "grad_norm": 7.75, "learning_rate": 7.99239896180942e-06, "loss": 2.6199, "mean_token_accuracy": 0.5260590500641849, "step": 10829 }, { "epoch": 2.007786429365962, "grad_norm": 9.234375, "learning_rate": 7.992213570634038e-06, "loss": 2.4868, "mean_token_accuracy": 0.5149768399382398, "step": 10830 }, { "epoch": 2.0079718205413424, "grad_norm": 6.234375, "learning_rate": 7.992028179458659e-06, "loss": 2.7919, "mean_token_accuracy": 0.4741869181477451, "step": 10831 }, { "epoch": 2.008157211716722, "grad_norm": 8.046875, "learning_rate": 7.991842788283278e-06, "loss": 2.6223, "mean_token_accuracy": 0.4823549664313996, "step": 10832 }, { "epoch": 2.0083426028921023, "grad_norm": 7.75390625, "learning_rate": 7.991657397107898e-06, "loss": 3.3037, "mean_token_accuracy": 0.46093133385951063, "step": 10833 }, { "epoch": 2.0085279940674825, "grad_norm": 7.69921875, "learning_rate": 7.991472005932519e-06, "loss": 2.6483, "mean_token_accuracy": 0.4977668258124134, "step": 10834 }, { "epoch": 2.0087133852428622, "grad_norm": 6.88671875, "learning_rate": 7.991286614757137e-06, "loss": 3.1207, "mean_token_accuracy": 0.446443172526574, "step": 10835 }, { "epoch": 2.0088987764182424, "grad_norm": 8.3359375, "learning_rate": 7.99110122358176e-06, "loss": 3.3604, "mean_token_accuracy": 0.4459804658151766, "step": 10836 }, { "epoch": 2.0090841675936226, "grad_norm": 9.8984375, "learning_rate": 7.990915832406378e-06, "loss": 3.2352, "mean_token_accuracy": 0.4567315055904426, "step": 10837 }, { "epoch": 2.0092695587690024, "grad_norm": 7.83984375, "learning_rate": 7.990730441230999e-06, "loss": 3.4465, "mean_token_accuracy": 0.4493016037247801, "step": 10838 }, { "epoch": 2.0094549499443826, "grad_norm": 7.85546875, "learning_rate": 7.990545050055618e-06, "loss": 2.1356, "mean_token_accuracy": 0.5392230711288066, "step": 10839 }, { "epoch": 2.0096403411197628, "grad_norm": 8.9921875, "learning_rate": 7.990359658880238e-06, "loss": 3.1533, "mean_token_accuracy": 0.45023466580352806, "step": 10840 }, { "epoch": 2.009825732295143, "grad_norm": 10.8203125, "learning_rate": 7.990174267704857e-06, "loss": 2.6224, "mean_token_accuracy": 0.4899822815864795, "step": 10841 }, { "epoch": 2.0100111234705227, "grad_norm": 8.6875, "learning_rate": 7.989988876529477e-06, "loss": 2.3736, "mean_token_accuracy": 0.5177955005356505, "step": 10842 }, { "epoch": 2.010196514645903, "grad_norm": 9.3359375, "learning_rate": 7.989803485354098e-06, "loss": 2.6667, "mean_token_accuracy": 0.49387755102040815, "step": 10843 }, { "epoch": 2.010381905821283, "grad_norm": 8.3671875, "learning_rate": 7.989618094178718e-06, "loss": 3.2224, "mean_token_accuracy": 0.45077978789769185, "step": 10844 }, { "epoch": 2.010567296996663, "grad_norm": 6.59375, "learning_rate": 7.989432703003339e-06, "loss": 2.4606, "mean_token_accuracy": 0.5144292557541769, "step": 10845 }, { "epoch": 2.010752688172043, "grad_norm": 8.375, "learning_rate": 7.989247311827958e-06, "loss": 2.7893, "mean_token_accuracy": 0.4937189599766287, "step": 10846 }, { "epoch": 2.010938079347423, "grad_norm": 7.49609375, "learning_rate": 7.989061920652578e-06, "loss": 3.2547, "mean_token_accuracy": 0.43530444964871196, "step": 10847 }, { "epoch": 2.011123470522803, "grad_norm": 7.94921875, "learning_rate": 7.988876529477197e-06, "loss": 2.7162, "mean_token_accuracy": 0.474468085106383, "step": 10848 }, { "epoch": 2.011308861698183, "grad_norm": 7.671875, "learning_rate": 7.988691138301817e-06, "loss": 2.9402, "mean_token_accuracy": 0.46807917479788125, "step": 10849 }, { "epoch": 2.0114942528735633, "grad_norm": 9.140625, "learning_rate": 7.988505747126438e-06, "loss": 3.1294, "mean_token_accuracy": 0.44867807153965783, "step": 10850 }, { "epoch": 2.011679644048943, "grad_norm": 7.3359375, "learning_rate": 7.988320355951057e-06, "loss": 2.8828, "mean_token_accuracy": 0.4852829037669666, "step": 10851 }, { "epoch": 2.0118650352243233, "grad_norm": 6.9921875, "learning_rate": 7.988134964775677e-06, "loss": 2.8591, "mean_token_accuracy": 0.46142046326123637, "step": 10852 }, { "epoch": 2.0120504263997034, "grad_norm": 10.5859375, "learning_rate": 7.987949573600298e-06, "loss": 2.0983, "mean_token_accuracy": 0.5826745718050066, "step": 10853 }, { "epoch": 2.0122358175750836, "grad_norm": 6.8671875, "learning_rate": 7.987764182424918e-06, "loss": 2.4871, "mean_token_accuracy": 0.5379694593479158, "step": 10854 }, { "epoch": 2.0124212087504634, "grad_norm": 6.29296875, "learning_rate": 7.987578791249537e-06, "loss": 2.9109, "mean_token_accuracy": 0.4916935283907761, "step": 10855 }, { "epoch": 2.0126065999258436, "grad_norm": 6.31640625, "learning_rate": 7.987393400074157e-06, "loss": 3.7756, "mean_token_accuracy": 0.41575239635061784, "step": 10856 }, { "epoch": 2.0127919911012238, "grad_norm": 6.875, "learning_rate": 7.987208008898776e-06, "loss": 2.7283, "mean_token_accuracy": 0.4952919020715631, "step": 10857 }, { "epoch": 2.0129773822766035, "grad_norm": 8.890625, "learning_rate": 7.987022617723397e-06, "loss": 2.7272, "mean_token_accuracy": 0.48917618761274806, "step": 10858 }, { "epoch": 2.0131627734519837, "grad_norm": 7.08203125, "learning_rate": 7.986837226548017e-06, "loss": 2.4486, "mean_token_accuracy": 0.5209363186972957, "step": 10859 }, { "epoch": 2.013348164627364, "grad_norm": 6.30859375, "learning_rate": 7.986651835372638e-06, "loss": 2.1878, "mean_token_accuracy": 0.5513748191027497, "step": 10860 }, { "epoch": 2.0135335558027436, "grad_norm": 9.453125, "learning_rate": 7.986466444197257e-06, "loss": 2.906, "mean_token_accuracy": 0.4704703649019194, "step": 10861 }, { "epoch": 2.013718946978124, "grad_norm": 6.24609375, "learning_rate": 7.986281053021877e-06, "loss": 2.4332, "mean_token_accuracy": 0.5122103944896681, "step": 10862 }, { "epoch": 2.013904338153504, "grad_norm": 8.65625, "learning_rate": 7.986095661846498e-06, "loss": 2.7561, "mean_token_accuracy": 0.4879302103250478, "step": 10863 }, { "epoch": 2.0140897293288837, "grad_norm": 8.03125, "learning_rate": 7.985910270671116e-06, "loss": 2.5752, "mean_token_accuracy": 0.5098098400241473, "step": 10864 }, { "epoch": 2.014275120504264, "grad_norm": 7.53515625, "learning_rate": 7.985724879495737e-06, "loss": 2.524, "mean_token_accuracy": 0.5269403075432861, "step": 10865 }, { "epoch": 2.014460511679644, "grad_norm": 7.4453125, "learning_rate": 7.985539488320356e-06, "loss": 2.5592, "mean_token_accuracy": 0.5008982035928143, "step": 10866 }, { "epoch": 2.0146459028550243, "grad_norm": 6.07421875, "learning_rate": 7.985354097144976e-06, "loss": 2.8083, "mean_token_accuracy": 0.48344170573113565, "step": 10867 }, { "epoch": 2.014831294030404, "grad_norm": 6.2578125, "learning_rate": 7.985168705969597e-06, "loss": 3.3403, "mean_token_accuracy": 0.4456066945606695, "step": 10868 }, { "epoch": 2.0150166852057843, "grad_norm": 7.8984375, "learning_rate": 7.984983314794217e-06, "loss": 2.6673, "mean_token_accuracy": 0.5100611309220581, "step": 10869 }, { "epoch": 2.0152020763811644, "grad_norm": 6.765625, "learning_rate": 7.984797923618836e-06, "loss": 3.2952, "mean_token_accuracy": 0.45961779885618637, "step": 10870 }, { "epoch": 2.015387467556544, "grad_norm": 6.1328125, "learning_rate": 7.984612532443456e-06, "loss": 2.6104, "mean_token_accuracy": 0.48666331152491193, "step": 10871 }, { "epoch": 2.0155728587319244, "grad_norm": 7.64453125, "learning_rate": 7.984427141268077e-06, "loss": 2.4744, "mean_token_accuracy": 0.511060507482108, "step": 10872 }, { "epoch": 2.0157582499073046, "grad_norm": 6.2421875, "learning_rate": 7.984241750092696e-06, "loss": 2.5579, "mean_token_accuracy": 0.520836536982931, "step": 10873 }, { "epoch": 2.0159436410826843, "grad_norm": 6.69921875, "learning_rate": 7.984056358917316e-06, "loss": 2.7044, "mean_token_accuracy": 0.4944746825004123, "step": 10874 }, { "epoch": 2.0161290322580645, "grad_norm": 9.71875, "learning_rate": 7.983870967741935e-06, "loss": 2.6038, "mean_token_accuracy": 0.4991414608374059, "step": 10875 }, { "epoch": 2.0163144234334447, "grad_norm": 6.125, "learning_rate": 7.983685576566557e-06, "loss": 2.6446, "mean_token_accuracy": 0.5006514657980456, "step": 10876 }, { "epoch": 2.0164998146088244, "grad_norm": 7.1796875, "learning_rate": 7.983500185391176e-06, "loss": 2.4562, "mean_token_accuracy": 0.5112548861032484, "step": 10877 }, { "epoch": 2.0166852057842046, "grad_norm": 5.72265625, "learning_rate": 7.983314794215796e-06, "loss": 2.5686, "mean_token_accuracy": 0.5081474296799224, "step": 10878 }, { "epoch": 2.016870596959585, "grad_norm": 6.77734375, "learning_rate": 7.983129403040415e-06, "loss": 3.7187, "mean_token_accuracy": 0.4262147570485903, "step": 10879 }, { "epoch": 2.0170559881349646, "grad_norm": 7.109375, "learning_rate": 7.982944011865036e-06, "loss": 3.4231, "mean_token_accuracy": 0.45064438464806683, "step": 10880 }, { "epoch": 2.0172413793103448, "grad_norm": 6.54296875, "learning_rate": 7.982758620689656e-06, "loss": 2.7103, "mean_token_accuracy": 0.4659292497130335, "step": 10881 }, { "epoch": 2.017426770485725, "grad_norm": 6.5625, "learning_rate": 7.982573229514275e-06, "loss": 2.7305, "mean_token_accuracy": 0.5007112375533428, "step": 10882 }, { "epoch": 2.017612161661105, "grad_norm": 6.05078125, "learning_rate": 7.982387838338895e-06, "loss": 3.1448, "mean_token_accuracy": 0.4521201185719809, "step": 10883 }, { "epoch": 2.017797552836485, "grad_norm": 6.19921875, "learning_rate": 7.982202447163516e-06, "loss": 2.7618, "mean_token_accuracy": 0.5175892738616753, "step": 10884 }, { "epoch": 2.017982944011865, "grad_norm": 7.63671875, "learning_rate": 7.982017055988136e-06, "loss": 3.4577, "mean_token_accuracy": 0.4248587570621469, "step": 10885 }, { "epoch": 2.0181683351872453, "grad_norm": 7.63671875, "learning_rate": 7.981831664812755e-06, "loss": 3.2489, "mean_token_accuracy": 0.47715404699738906, "step": 10886 }, { "epoch": 2.018353726362625, "grad_norm": 7.15234375, "learning_rate": 7.981646273637376e-06, "loss": 3.5504, "mean_token_accuracy": 0.46905537459283386, "step": 10887 }, { "epoch": 2.018539117538005, "grad_norm": 5.9375, "learning_rate": 7.981460882461996e-06, "loss": 2.5756, "mean_token_accuracy": 0.49847581179589134, "step": 10888 }, { "epoch": 2.0187245087133854, "grad_norm": 7.21484375, "learning_rate": 7.981275491286615e-06, "loss": 3.0198, "mean_token_accuracy": 0.47399483585392843, "step": 10889 }, { "epoch": 2.018909899888765, "grad_norm": 7.6015625, "learning_rate": 7.981090100111236e-06, "loss": 2.9589, "mean_token_accuracy": 0.47300245432233434, "step": 10890 }, { "epoch": 2.0190952910641453, "grad_norm": 6.3359375, "learning_rate": 7.980904708935854e-06, "loss": 3.0154, "mean_token_accuracy": 0.4870176890452053, "step": 10891 }, { "epoch": 2.0192806822395255, "grad_norm": 8.9453125, "learning_rate": 7.980719317760475e-06, "loss": 2.6939, "mean_token_accuracy": 0.5089450956199877, "step": 10892 }, { "epoch": 2.0194660734149052, "grad_norm": 7.66796875, "learning_rate": 7.980533926585095e-06, "loss": 2.7276, "mean_token_accuracy": 0.47743113176236307, "step": 10893 }, { "epoch": 2.0196514645902854, "grad_norm": 6.5859375, "learning_rate": 7.980348535409716e-06, "loss": 3.1401, "mean_token_accuracy": 0.44131646946824427, "step": 10894 }, { "epoch": 2.0198368557656656, "grad_norm": 7.03125, "learning_rate": 7.980163144234335e-06, "loss": 2.5318, "mean_token_accuracy": 0.48655256723716384, "step": 10895 }, { "epoch": 2.020022246941046, "grad_norm": 7.0703125, "learning_rate": 7.979977753058955e-06, "loss": 3.2773, "mean_token_accuracy": 0.4398320895522388, "step": 10896 }, { "epoch": 2.0202076381164256, "grad_norm": 7.7890625, "learning_rate": 7.979792361883576e-06, "loss": 2.7659, "mean_token_accuracy": 0.47199885123492247, "step": 10897 }, { "epoch": 2.0203930292918058, "grad_norm": 7.06640625, "learning_rate": 7.979606970708194e-06, "loss": 2.8227, "mean_token_accuracy": 0.458602931467054, "step": 10898 }, { "epoch": 2.020578420467186, "grad_norm": 6.484375, "learning_rate": 7.979421579532815e-06, "loss": 3.4512, "mean_token_accuracy": 0.437013082463619, "step": 10899 }, { "epoch": 2.0207638116425657, "grad_norm": 7.05859375, "learning_rate": 7.979236188357434e-06, "loss": 2.8602, "mean_token_accuracy": 0.47945372515546886, "step": 10900 }, { "epoch": 2.020949202817946, "grad_norm": 6.02734375, "learning_rate": 7.979050797182056e-06, "loss": 2.8821, "mean_token_accuracy": 0.505519093078759, "step": 10901 }, { "epoch": 2.021134593993326, "grad_norm": 9.3359375, "learning_rate": 7.978865406006675e-06, "loss": 2.4815, "mean_token_accuracy": 0.5040837455423904, "step": 10902 }, { "epoch": 2.021319985168706, "grad_norm": 8.3359375, "learning_rate": 7.978680014831295e-06, "loss": 2.8917, "mean_token_accuracy": 0.4811552907654449, "step": 10903 }, { "epoch": 2.021505376344086, "grad_norm": 7.52734375, "learning_rate": 7.978494623655914e-06, "loss": 3.5858, "mean_token_accuracy": 0.42793682132280353, "step": 10904 }, { "epoch": 2.021690767519466, "grad_norm": 6.71875, "learning_rate": 7.978309232480534e-06, "loss": 2.4233, "mean_token_accuracy": 0.5106638229805385, "step": 10905 }, { "epoch": 2.021876158694846, "grad_norm": 7.12890625, "learning_rate": 7.978123841305155e-06, "loss": 3.006, "mean_token_accuracy": 0.475947622329428, "step": 10906 }, { "epoch": 2.022061549870226, "grad_norm": 7.296875, "learning_rate": 7.977938450129774e-06, "loss": 2.7761, "mean_token_accuracy": 0.48358348968105064, "step": 10907 }, { "epoch": 2.0222469410456063, "grad_norm": 7.78125, "learning_rate": 7.977753058954394e-06, "loss": 2.9711, "mean_token_accuracy": 0.4762996019780485, "step": 10908 }, { "epoch": 2.022432332220986, "grad_norm": 6.7890625, "learning_rate": 7.977567667779015e-06, "loss": 3.3472, "mean_token_accuracy": 0.44415797492089537, "step": 10909 }, { "epoch": 2.0226177233963663, "grad_norm": 10.59375, "learning_rate": 7.977382276603635e-06, "loss": 2.7198, "mean_token_accuracy": 0.4860913993755322, "step": 10910 }, { "epoch": 2.0228031145717464, "grad_norm": 7.125, "learning_rate": 7.977196885428254e-06, "loss": 2.6829, "mean_token_accuracy": 0.4999343228687771, "step": 10911 }, { "epoch": 2.0229885057471266, "grad_norm": 6.87890625, "learning_rate": 7.977011494252874e-06, "loss": 2.8713, "mean_token_accuracy": 0.4770951226084613, "step": 10912 }, { "epoch": 2.0231738969225064, "grad_norm": 8.0234375, "learning_rate": 7.976826103077493e-06, "loss": 2.5937, "mean_token_accuracy": 0.5087930257026905, "step": 10913 }, { "epoch": 2.0233592880978866, "grad_norm": 6.671875, "learning_rate": 7.976640711902114e-06, "loss": 2.8037, "mean_token_accuracy": 0.5107650903498654, "step": 10914 }, { "epoch": 2.0235446792732668, "grad_norm": 5.94921875, "learning_rate": 7.976455320726734e-06, "loss": 2.7132, "mean_token_accuracy": 0.48962177121771217, "step": 10915 }, { "epoch": 2.0237300704486465, "grad_norm": 14.46875, "learning_rate": 7.976269929551353e-06, "loss": 2.3827, "mean_token_accuracy": 0.5002660281989891, "step": 10916 }, { "epoch": 2.0239154616240267, "grad_norm": 6.30859375, "learning_rate": 7.976084538375974e-06, "loss": 2.8899, "mean_token_accuracy": 0.4838774485183325, "step": 10917 }, { "epoch": 2.024100852799407, "grad_norm": 7.23828125, "learning_rate": 7.975899147200594e-06, "loss": 2.8815, "mean_token_accuracy": 0.4639410868461148, "step": 10918 }, { "epoch": 2.0242862439747866, "grad_norm": 6.92578125, "learning_rate": 7.975713756025215e-06, "loss": 2.7172, "mean_token_accuracy": 0.4928514694201747, "step": 10919 }, { "epoch": 2.024471635150167, "grad_norm": 5.5546875, "learning_rate": 7.975528364849833e-06, "loss": 2.6342, "mean_token_accuracy": 0.4960535117056856, "step": 10920 }, { "epoch": 2.024657026325547, "grad_norm": 6.09375, "learning_rate": 7.975342973674454e-06, "loss": 2.5127, "mean_token_accuracy": 0.5042016806722689, "step": 10921 }, { "epoch": 2.0248424175009268, "grad_norm": 11.515625, "learning_rate": 7.975157582499073e-06, "loss": 3.2478, "mean_token_accuracy": 0.4659694288012872, "step": 10922 }, { "epoch": 2.025027808676307, "grad_norm": 8.8984375, "learning_rate": 7.974972191323693e-06, "loss": 3.0458, "mean_token_accuracy": 0.481604820805582, "step": 10923 }, { "epoch": 2.025213199851687, "grad_norm": 6.3515625, "learning_rate": 7.974786800148314e-06, "loss": 2.9322, "mean_token_accuracy": 0.4709795824730443, "step": 10924 }, { "epoch": 2.0253985910270673, "grad_norm": 7.6015625, "learning_rate": 7.974601408972934e-06, "loss": 2.2006, "mean_token_accuracy": 0.523441126411883, "step": 10925 }, { "epoch": 2.025583982202447, "grad_norm": 6.86328125, "learning_rate": 7.974416017797555e-06, "loss": 2.7883, "mean_token_accuracy": 0.47608105285147273, "step": 10926 }, { "epoch": 2.0257693733778273, "grad_norm": 7.8984375, "learning_rate": 7.974230626622173e-06, "loss": 2.3551, "mean_token_accuracy": 0.51952770208901, "step": 10927 }, { "epoch": 2.0259547645532074, "grad_norm": 6.23046875, "learning_rate": 7.974045235446794e-06, "loss": 2.6286, "mean_token_accuracy": 0.5008053887831307, "step": 10928 }, { "epoch": 2.026140155728587, "grad_norm": 5.890625, "learning_rate": 7.973859844271413e-06, "loss": 2.8286, "mean_token_accuracy": 0.47745149449396956, "step": 10929 }, { "epoch": 2.0263255469039674, "grad_norm": 7.53515625, "learning_rate": 7.973674453096033e-06, "loss": 2.5628, "mean_token_accuracy": 0.5028285751874754, "step": 10930 }, { "epoch": 2.0265109380793476, "grad_norm": 5.84375, "learning_rate": 7.973489061920654e-06, "loss": 2.2795, "mean_token_accuracy": 0.5443146500910943, "step": 10931 }, { "epoch": 2.0266963292547273, "grad_norm": 6.78125, "learning_rate": 7.973303670745272e-06, "loss": 2.8108, "mean_token_accuracy": 0.5187316868982838, "step": 10932 }, { "epoch": 2.0268817204301075, "grad_norm": 6.7109375, "learning_rate": 7.973118279569893e-06, "loss": 2.0401, "mean_token_accuracy": 0.5779006699989365, "step": 10933 }, { "epoch": 2.0270671116054877, "grad_norm": 6.078125, "learning_rate": 7.972932888394513e-06, "loss": 2.4783, "mean_token_accuracy": 0.5137121854679106, "step": 10934 }, { "epoch": 2.0272525027808674, "grad_norm": 6.26171875, "learning_rate": 7.972747497219134e-06, "loss": 2.735, "mean_token_accuracy": 0.49124012366884234, "step": 10935 }, { "epoch": 2.0274378939562476, "grad_norm": 6.2578125, "learning_rate": 7.972562106043753e-06, "loss": 3.2162, "mean_token_accuracy": 0.4650735294117647, "step": 10936 }, { "epoch": 2.027623285131628, "grad_norm": 6.796875, "learning_rate": 7.972376714868373e-06, "loss": 3.0123, "mean_token_accuracy": 0.45938183807439825, "step": 10937 }, { "epoch": 2.027808676307008, "grad_norm": 6.6640625, "learning_rate": 7.972191323692992e-06, "loss": 2.9601, "mean_token_accuracy": 0.4889024950252564, "step": 10938 }, { "epoch": 2.0279940674823878, "grad_norm": 6.828125, "learning_rate": 7.972005932517613e-06, "loss": 2.5415, "mean_token_accuracy": 0.509349593495935, "step": 10939 }, { "epoch": 2.028179458657768, "grad_norm": 7.79296875, "learning_rate": 7.971820541342233e-06, "loss": 3.7182, "mean_token_accuracy": 0.4177583697234352, "step": 10940 }, { "epoch": 2.028364849833148, "grad_norm": 9.2265625, "learning_rate": 7.971635150166853e-06, "loss": 3.4607, "mean_token_accuracy": 0.4235700197238659, "step": 10941 }, { "epoch": 2.028550241008528, "grad_norm": 6.9765625, "learning_rate": 7.971449758991472e-06, "loss": 3.1936, "mean_token_accuracy": 0.46866051543111675, "step": 10942 }, { "epoch": 2.028735632183908, "grad_norm": 8.7265625, "learning_rate": 7.971264367816093e-06, "loss": 2.5056, "mean_token_accuracy": 0.4971355080088858, "step": 10943 }, { "epoch": 2.0289210233592883, "grad_norm": 6.27734375, "learning_rate": 7.971078976640713e-06, "loss": 2.8824, "mean_token_accuracy": 0.4587579834216606, "step": 10944 }, { "epoch": 2.029106414534668, "grad_norm": 6.51953125, "learning_rate": 7.970893585465332e-06, "loss": 2.5519, "mean_token_accuracy": 0.5053327150475307, "step": 10945 }, { "epoch": 2.029291805710048, "grad_norm": 6.2109375, "learning_rate": 7.970708194289953e-06, "loss": 2.6741, "mean_token_accuracy": 0.4846927374301676, "step": 10946 }, { "epoch": 2.0294771968854284, "grad_norm": 6.8828125, "learning_rate": 7.970522803114571e-06, "loss": 2.5006, "mean_token_accuracy": 0.5080246913580246, "step": 10947 }, { "epoch": 2.029662588060808, "grad_norm": 6.328125, "learning_rate": 7.970337411939192e-06, "loss": 2.5913, "mean_token_accuracy": 0.4989371752479924, "step": 10948 }, { "epoch": 2.0298479792361883, "grad_norm": 6.84375, "learning_rate": 7.970152020763812e-06, "loss": 3.1748, "mean_token_accuracy": 0.47072497457504, "step": 10949 }, { "epoch": 2.0300333704115685, "grad_norm": 6.3359375, "learning_rate": 7.969966629588433e-06, "loss": 2.6583, "mean_token_accuracy": 0.4880167451596023, "step": 10950 }, { "epoch": 2.0302187615869483, "grad_norm": 6.73828125, "learning_rate": 7.969781238413052e-06, "loss": 2.8343, "mean_token_accuracy": 0.47580174927113705, "step": 10951 }, { "epoch": 2.0304041527623284, "grad_norm": 6.9140625, "learning_rate": 7.969595847237672e-06, "loss": 2.7853, "mean_token_accuracy": 0.4761566678499586, "step": 10952 }, { "epoch": 2.0305895439377086, "grad_norm": 7.32421875, "learning_rate": 7.969410456062293e-06, "loss": 2.9687, "mean_token_accuracy": 0.4613072877535687, "step": 10953 }, { "epoch": 2.030774935113089, "grad_norm": 8.0859375, "learning_rate": 7.969225064886911e-06, "loss": 2.3049, "mean_token_accuracy": 0.5537666174298376, "step": 10954 }, { "epoch": 2.0309603262884686, "grad_norm": 6.578125, "learning_rate": 7.969039673711532e-06, "loss": 2.5755, "mean_token_accuracy": 0.5130607941899965, "step": 10955 }, { "epoch": 2.0311457174638488, "grad_norm": 12.515625, "learning_rate": 7.96885428253615e-06, "loss": 2.4719, "mean_token_accuracy": 0.4801307590152212, "step": 10956 }, { "epoch": 2.031331108639229, "grad_norm": 7.94140625, "learning_rate": 7.968668891360773e-06, "loss": 2.6705, "mean_token_accuracy": 0.4752743337608665, "step": 10957 }, { "epoch": 2.0315164998146087, "grad_norm": 5.72265625, "learning_rate": 7.968483500185392e-06, "loss": 2.5384, "mean_token_accuracy": 0.5143737166324436, "step": 10958 }, { "epoch": 2.031701890989989, "grad_norm": 7.2265625, "learning_rate": 7.968298109010012e-06, "loss": 2.3619, "mean_token_accuracy": 0.5525588738323904, "step": 10959 }, { "epoch": 2.031887282165369, "grad_norm": 9.6171875, "learning_rate": 7.968112717834631e-06, "loss": 2.6587, "mean_token_accuracy": 0.48940269749518306, "step": 10960 }, { "epoch": 2.032072673340749, "grad_norm": 6.453125, "learning_rate": 7.967927326659251e-06, "loss": 2.7894, "mean_token_accuracy": 0.4845300642148278, "step": 10961 }, { "epoch": 2.032258064516129, "grad_norm": 6.1015625, "learning_rate": 7.967741935483872e-06, "loss": 3.1091, "mean_token_accuracy": 0.4636527485731451, "step": 10962 }, { "epoch": 2.032443455691509, "grad_norm": 6.53125, "learning_rate": 7.96755654430849e-06, "loss": 2.6311, "mean_token_accuracy": 0.48192019950124687, "step": 10963 }, { "epoch": 2.032628846866889, "grad_norm": 6.96875, "learning_rate": 7.967371153133111e-06, "loss": 3.2455, "mean_token_accuracy": 0.4506332757628094, "step": 10964 }, { "epoch": 2.032814238042269, "grad_norm": 7.109375, "learning_rate": 7.967185761957732e-06, "loss": 3.3215, "mean_token_accuracy": 0.46040593122305185, "step": 10965 }, { "epoch": 2.0329996292176493, "grad_norm": 6.8359375, "learning_rate": 7.967000370782352e-06, "loss": 2.4834, "mean_token_accuracy": 0.5414207898320472, "step": 10966 }, { "epoch": 2.0331850203930295, "grad_norm": 6.18359375, "learning_rate": 7.966814979606971e-06, "loss": 1.9284, "mean_token_accuracy": 0.5845596558750283, "step": 10967 }, { "epoch": 2.0333704115684093, "grad_norm": 6.328125, "learning_rate": 7.966629588431592e-06, "loss": 2.6183, "mean_token_accuracy": 0.5165553502913173, "step": 10968 }, { "epoch": 2.0335558027437894, "grad_norm": 7.43359375, "learning_rate": 7.966444197256212e-06, "loss": 2.8584, "mean_token_accuracy": 0.4656469408224674, "step": 10969 }, { "epoch": 2.0337411939191696, "grad_norm": 7.2890625, "learning_rate": 7.96625880608083e-06, "loss": 3.08, "mean_token_accuracy": 0.4680874316939891, "step": 10970 }, { "epoch": 2.0339265850945494, "grad_norm": 8.5625, "learning_rate": 7.966073414905451e-06, "loss": 3.034, "mean_token_accuracy": 0.4369439344725321, "step": 10971 }, { "epoch": 2.0341119762699296, "grad_norm": 7.45703125, "learning_rate": 7.96588802373007e-06, "loss": 2.5981, "mean_token_accuracy": 0.49218932152016787, "step": 10972 }, { "epoch": 2.0342973674453098, "grad_norm": 6.54296875, "learning_rate": 7.965702632554692e-06, "loss": 3.5325, "mean_token_accuracy": 0.4231917010089527, "step": 10973 }, { "epoch": 2.0344827586206895, "grad_norm": 8.359375, "learning_rate": 7.965517241379311e-06, "loss": 2.7116, "mean_token_accuracy": 0.4628733697938578, "step": 10974 }, { "epoch": 2.0346681497960697, "grad_norm": 6.2421875, "learning_rate": 7.965331850203932e-06, "loss": 2.4349, "mean_token_accuracy": 0.5247459416655669, "step": 10975 }, { "epoch": 2.03485354097145, "grad_norm": 7.83984375, "learning_rate": 7.96514645902855e-06, "loss": 2.6804, "mean_token_accuracy": 0.5245264691597863, "step": 10976 }, { "epoch": 2.0350389321468296, "grad_norm": 8.5390625, "learning_rate": 7.964961067853171e-06, "loss": 2.9453, "mean_token_accuracy": 0.45475113122171945, "step": 10977 }, { "epoch": 2.03522432332221, "grad_norm": 6.2421875, "learning_rate": 7.964775676677791e-06, "loss": 2.4727, "mean_token_accuracy": 0.5122006841505131, "step": 10978 }, { "epoch": 2.03540971449759, "grad_norm": 6.15234375, "learning_rate": 7.96459028550241e-06, "loss": 2.7964, "mean_token_accuracy": 0.4700374531835206, "step": 10979 }, { "epoch": 2.0355951056729698, "grad_norm": 7.4296875, "learning_rate": 7.96440489432703e-06, "loss": 2.8148, "mean_token_accuracy": 0.4783099864437415, "step": 10980 }, { "epoch": 2.03578049684835, "grad_norm": 6.9609375, "learning_rate": 7.964219503151651e-06, "loss": 2.8028, "mean_token_accuracy": 0.5144400352733686, "step": 10981 }, { "epoch": 2.03596588802373, "grad_norm": 6.57421875, "learning_rate": 7.964034111976272e-06, "loss": 3.2683, "mean_token_accuracy": 0.4507530321204851, "step": 10982 }, { "epoch": 2.0361512791991103, "grad_norm": 6.875, "learning_rate": 7.96384872080089e-06, "loss": 2.8861, "mean_token_accuracy": 0.5095316545069428, "step": 10983 }, { "epoch": 2.03633667037449, "grad_norm": 6.65625, "learning_rate": 7.963663329625511e-06, "loss": 2.8372, "mean_token_accuracy": 0.47297894271830365, "step": 10984 }, { "epoch": 2.0365220615498703, "grad_norm": 6.359375, "learning_rate": 7.96347793845013e-06, "loss": 2.4382, "mean_token_accuracy": 0.5084179104477612, "step": 10985 }, { "epoch": 2.0367074527252504, "grad_norm": 6.29296875, "learning_rate": 7.96329254727475e-06, "loss": 2.5758, "mean_token_accuracy": 0.49111656796186626, "step": 10986 }, { "epoch": 2.03689284390063, "grad_norm": 6.9375, "learning_rate": 7.96310715609937e-06, "loss": 3.2392, "mean_token_accuracy": 0.4381979695431472, "step": 10987 }, { "epoch": 2.0370782350760104, "grad_norm": 7.87890625, "learning_rate": 7.96292176492399e-06, "loss": 3.4449, "mean_token_accuracy": 0.4191159181754056, "step": 10988 }, { "epoch": 2.0372636262513906, "grad_norm": 9.7890625, "learning_rate": 7.96273637374861e-06, "loss": 2.4293, "mean_token_accuracy": 0.5186659346692286, "step": 10989 }, { "epoch": 2.0374490174267703, "grad_norm": 6.27734375, "learning_rate": 7.96255098257323e-06, "loss": 2.8573, "mean_token_accuracy": 0.4880298704151109, "step": 10990 }, { "epoch": 2.0376344086021505, "grad_norm": 8.125, "learning_rate": 7.962365591397851e-06, "loss": 2.7946, "mean_token_accuracy": 0.5091701936642299, "step": 10991 }, { "epoch": 2.0378197997775307, "grad_norm": 6.30078125, "learning_rate": 7.96218020022247e-06, "loss": 3.0101, "mean_token_accuracy": 0.4514452744397532, "step": 10992 }, { "epoch": 2.0380051909529104, "grad_norm": 6.17578125, "learning_rate": 7.96199480904709e-06, "loss": 2.5807, "mean_token_accuracy": 0.5050998263888888, "step": 10993 }, { "epoch": 2.0381905821282906, "grad_norm": 6.91015625, "learning_rate": 7.961809417871709e-06, "loss": 3.1641, "mean_token_accuracy": 0.42320695484182563, "step": 10994 }, { "epoch": 2.038375973303671, "grad_norm": 6.33203125, "learning_rate": 7.96162402669633e-06, "loss": 2.3743, "mean_token_accuracy": 0.5270337922403003, "step": 10995 }, { "epoch": 2.038561364479051, "grad_norm": 6.5078125, "learning_rate": 7.96143863552095e-06, "loss": 2.6811, "mean_token_accuracy": 0.5005936319481922, "step": 10996 }, { "epoch": 2.0387467556544308, "grad_norm": 6.99609375, "learning_rate": 7.96125324434557e-06, "loss": 3.3177, "mean_token_accuracy": 0.4190463540974897, "step": 10997 }, { "epoch": 2.038932146829811, "grad_norm": 7.55859375, "learning_rate": 7.96106785317019e-06, "loss": 2.0541, "mean_token_accuracy": 0.5514201762977473, "step": 10998 }, { "epoch": 2.039117538005191, "grad_norm": 6.078125, "learning_rate": 7.96088246199481e-06, "loss": 2.2914, "mean_token_accuracy": 0.5371554831957205, "step": 10999 }, { "epoch": 2.039302929180571, "grad_norm": 7.234375, "learning_rate": 7.96069707081943e-06, "loss": 3.0013, "mean_token_accuracy": 0.49369544131910764, "step": 11000 }, { "epoch": 2.039488320355951, "grad_norm": 7.16015625, "learning_rate": 7.960511679644049e-06, "loss": 2.9114, "mean_token_accuracy": 0.47163695299837927, "step": 11001 }, { "epoch": 2.0396737115313313, "grad_norm": 6.92578125, "learning_rate": 7.96032628846867e-06, "loss": 2.8633, "mean_token_accuracy": 0.47253585596582237, "step": 11002 }, { "epoch": 2.039859102706711, "grad_norm": 6.09765625, "learning_rate": 7.960140897293288e-06, "loss": 2.4309, "mean_token_accuracy": 0.5130370370370371, "step": 11003 }, { "epoch": 2.040044493882091, "grad_norm": 7.69140625, "learning_rate": 7.959955506117909e-06, "loss": 3.302, "mean_token_accuracy": 0.4816934767591039, "step": 11004 }, { "epoch": 2.0402298850574714, "grad_norm": 7.81640625, "learning_rate": 7.95977011494253e-06, "loss": 3.1727, "mean_token_accuracy": 0.44715447154471544, "step": 11005 }, { "epoch": 2.040415276232851, "grad_norm": 8.6875, "learning_rate": 7.95958472376715e-06, "loss": 2.8182, "mean_token_accuracy": 0.46822870240672354, "step": 11006 }, { "epoch": 2.0406006674082313, "grad_norm": 9.28125, "learning_rate": 7.95939933259177e-06, "loss": 2.7062, "mean_token_accuracy": 0.4748201438848921, "step": 11007 }, { "epoch": 2.0407860585836115, "grad_norm": 9.75, "learning_rate": 7.959213941416389e-06, "loss": 2.7599, "mean_token_accuracy": 0.5070969469737547, "step": 11008 }, { "epoch": 2.0409714497589917, "grad_norm": 7.8046875, "learning_rate": 7.95902855024101e-06, "loss": 3.1223, "mean_token_accuracy": 0.45673779596609176, "step": 11009 }, { "epoch": 2.0411568409343714, "grad_norm": 7.484375, "learning_rate": 7.958843159065628e-06, "loss": 3.3623, "mean_token_accuracy": 0.43852813852813854, "step": 11010 }, { "epoch": 2.0413422321097516, "grad_norm": 8.0, "learning_rate": 7.958657767890249e-06, "loss": 3.0495, "mean_token_accuracy": 0.4620418848167539, "step": 11011 }, { "epoch": 2.041527623285132, "grad_norm": 7.11328125, "learning_rate": 7.95847237671487e-06, "loss": 2.6402, "mean_token_accuracy": 0.48354555978170993, "step": 11012 }, { "epoch": 2.0417130144605116, "grad_norm": 5.7890625, "learning_rate": 7.958286985539488e-06, "loss": 2.2659, "mean_token_accuracy": 0.5388254940161425, "step": 11013 }, { "epoch": 2.0418984056358918, "grad_norm": 6.01953125, "learning_rate": 7.958101594364109e-06, "loss": 2.9277, "mean_token_accuracy": 0.4634539014704304, "step": 11014 }, { "epoch": 2.042083796811272, "grad_norm": 8.2109375, "learning_rate": 7.95791620318873e-06, "loss": 3.0561, "mean_token_accuracy": 0.4542488990376774, "step": 11015 }, { "epoch": 2.0422691879866517, "grad_norm": 6.66796875, "learning_rate": 7.95773081201335e-06, "loss": 2.7688, "mean_token_accuracy": 0.47597290058295255, "step": 11016 }, { "epoch": 2.042454579162032, "grad_norm": 6.453125, "learning_rate": 7.957545420837968e-06, "loss": 2.6242, "mean_token_accuracy": 0.4846203763789747, "step": 11017 }, { "epoch": 2.042639970337412, "grad_norm": 6.67578125, "learning_rate": 7.957360029662589e-06, "loss": 3.1075, "mean_token_accuracy": 0.4505978602894902, "step": 11018 }, { "epoch": 2.042825361512792, "grad_norm": 7.25, "learning_rate": 7.957174638487208e-06, "loss": 3.3198, "mean_token_accuracy": 0.4637937124690922, "step": 11019 }, { "epoch": 2.043010752688172, "grad_norm": 8.359375, "learning_rate": 7.956989247311828e-06, "loss": 3.2756, "mean_token_accuracy": 0.44936776113059185, "step": 11020 }, { "epoch": 2.043196143863552, "grad_norm": 8.1015625, "learning_rate": 7.956803856136449e-06, "loss": 2.7053, "mean_token_accuracy": 0.4486138336600392, "step": 11021 }, { "epoch": 2.043381535038932, "grad_norm": 7.15625, "learning_rate": 7.95661846496107e-06, "loss": 2.4052, "mean_token_accuracy": 0.5379181660669066, "step": 11022 }, { "epoch": 2.043566926214312, "grad_norm": 8.9609375, "learning_rate": 7.956433073785688e-06, "loss": 2.74, "mean_token_accuracy": 0.49881201956673654, "step": 11023 }, { "epoch": 2.0437523173896923, "grad_norm": 8.984375, "learning_rate": 7.956247682610309e-06, "loss": 3.0969, "mean_token_accuracy": 0.4695234577022534, "step": 11024 }, { "epoch": 2.0439377085650725, "grad_norm": 9.2734375, "learning_rate": 7.956062291434929e-06, "loss": 2.9824, "mean_token_accuracy": 0.4573502722323049, "step": 11025 }, { "epoch": 2.0441230997404523, "grad_norm": 9.09375, "learning_rate": 7.955876900259548e-06, "loss": 2.4877, "mean_token_accuracy": 0.4992274412855377, "step": 11026 }, { "epoch": 2.0443084909158324, "grad_norm": 9.234375, "learning_rate": 7.955691509084168e-06, "loss": 2.4069, "mean_token_accuracy": 0.518968980138362, "step": 11027 }, { "epoch": 2.0444938820912126, "grad_norm": 10.90625, "learning_rate": 7.955506117908787e-06, "loss": 2.4174, "mean_token_accuracy": 0.4935080694090523, "step": 11028 }, { "epoch": 2.0446792732665924, "grad_norm": 8.9921875, "learning_rate": 7.955320726733408e-06, "loss": 2.7527, "mean_token_accuracy": 0.47843905915894513, "step": 11029 }, { "epoch": 2.0448646644419726, "grad_norm": 9.8125, "learning_rate": 7.955135335558028e-06, "loss": 3.7395, "mean_token_accuracy": 0.42388059701492536, "step": 11030 }, { "epoch": 2.0450500556173528, "grad_norm": 8.515625, "learning_rate": 7.954949944382649e-06, "loss": 3.3123, "mean_token_accuracy": 0.45037438266687907, "step": 11031 }, { "epoch": 2.0452354467927325, "grad_norm": 5.85546875, "learning_rate": 7.954764553207267e-06, "loss": 3.1424, "mean_token_accuracy": 0.46837708830548924, "step": 11032 }, { "epoch": 2.0454208379681127, "grad_norm": 10.734375, "learning_rate": 7.954579162031888e-06, "loss": 2.6922, "mean_token_accuracy": 0.46987485485743774, "step": 11033 }, { "epoch": 2.045606229143493, "grad_norm": 9.9453125, "learning_rate": 7.954393770856508e-06, "loss": 2.437, "mean_token_accuracy": 0.5067294014662436, "step": 11034 }, { "epoch": 2.0457916203188726, "grad_norm": 13.71875, "learning_rate": 7.954208379681127e-06, "loss": 2.5411, "mean_token_accuracy": 0.512496711391739, "step": 11035 }, { "epoch": 2.045977011494253, "grad_norm": 9.2109375, "learning_rate": 7.954022988505748e-06, "loss": 3.4063, "mean_token_accuracy": 0.42857142857142855, "step": 11036 }, { "epoch": 2.046162402669633, "grad_norm": 8.1328125, "learning_rate": 7.953837597330366e-06, "loss": 2.8277, "mean_token_accuracy": 0.47345903977182696, "step": 11037 }, { "epoch": 2.0463477938450128, "grad_norm": 6.4609375, "learning_rate": 7.953652206154989e-06, "loss": 3.0639, "mean_token_accuracy": 0.46919431279620855, "step": 11038 }, { "epoch": 2.046533185020393, "grad_norm": 7.484375, "learning_rate": 7.953466814979607e-06, "loss": 3.0062, "mean_token_accuracy": 0.4790504451038576, "step": 11039 }, { "epoch": 2.046718576195773, "grad_norm": 7.3125, "learning_rate": 7.953281423804228e-06, "loss": 2.9183, "mean_token_accuracy": 0.45355721634475393, "step": 11040 }, { "epoch": 2.0469039673711533, "grad_norm": 8.265625, "learning_rate": 7.953096032628847e-06, "loss": 3.0324, "mean_token_accuracy": 0.4486007995431182, "step": 11041 }, { "epoch": 2.047089358546533, "grad_norm": 5.76953125, "learning_rate": 7.952910641453467e-06, "loss": 2.2781, "mean_token_accuracy": 0.5317887931034483, "step": 11042 }, { "epoch": 2.0472747497219133, "grad_norm": 8.5078125, "learning_rate": 7.952725250278088e-06, "loss": 2.9737, "mean_token_accuracy": 0.45422832980972516, "step": 11043 }, { "epoch": 2.0474601408972934, "grad_norm": 10.2265625, "learning_rate": 7.952539859102707e-06, "loss": 2.2307, "mean_token_accuracy": 0.5644883920894239, "step": 11044 }, { "epoch": 2.047645532072673, "grad_norm": 7.01953125, "learning_rate": 7.952354467927327e-06, "loss": 2.811, "mean_token_accuracy": 0.5130250529727035, "step": 11045 }, { "epoch": 2.0478309232480534, "grad_norm": 9.59375, "learning_rate": 7.952169076751947e-06, "loss": 2.1242, "mean_token_accuracy": 0.5831739961759083, "step": 11046 }, { "epoch": 2.0480163144234336, "grad_norm": 11.1953125, "learning_rate": 7.951983685576568e-06, "loss": 2.6954, "mean_token_accuracy": 0.48087178298168326, "step": 11047 }, { "epoch": 2.0482017055988133, "grad_norm": 11.4921875, "learning_rate": 7.951798294401187e-06, "loss": 2.4739, "mean_token_accuracy": 0.51024655779699, "step": 11048 }, { "epoch": 2.0483870967741935, "grad_norm": 6.4375, "learning_rate": 7.951612903225807e-06, "loss": 3.0471, "mean_token_accuracy": 0.4578625235404896, "step": 11049 }, { "epoch": 2.0485724879495737, "grad_norm": 11.890625, "learning_rate": 7.951427512050428e-06, "loss": 2.5466, "mean_token_accuracy": 0.4874804381846635, "step": 11050 }, { "epoch": 2.0487578791249534, "grad_norm": 9.6640625, "learning_rate": 7.951242120875047e-06, "loss": 2.8876, "mean_token_accuracy": 0.4689655172413793, "step": 11051 }, { "epoch": 2.0489432703003336, "grad_norm": 6.61328125, "learning_rate": 7.951056729699667e-06, "loss": 2.5437, "mean_token_accuracy": 0.518523795953263, "step": 11052 }, { "epoch": 2.049128661475714, "grad_norm": 8.0, "learning_rate": 7.950871338524286e-06, "loss": 3.5081, "mean_token_accuracy": 0.447575115322279, "step": 11053 }, { "epoch": 2.049314052651094, "grad_norm": 9.3359375, "learning_rate": 7.950685947348908e-06, "loss": 2.3483, "mean_token_accuracy": 0.5288818987703746, "step": 11054 }, { "epoch": 2.0494994438264738, "grad_norm": 6.71484375, "learning_rate": 7.950500556173527e-06, "loss": 3.3799, "mean_token_accuracy": 0.4470314318975553, "step": 11055 }, { "epoch": 2.049684835001854, "grad_norm": 8.59375, "learning_rate": 7.950315164998147e-06, "loss": 2.9384, "mean_token_accuracy": 0.4537899773356837, "step": 11056 }, { "epoch": 2.049870226177234, "grad_norm": 9.015625, "learning_rate": 7.950129773822766e-06, "loss": 2.9968, "mean_token_accuracy": 0.4688079061148857, "step": 11057 }, { "epoch": 2.050055617352614, "grad_norm": 5.69140625, "learning_rate": 7.949944382647387e-06, "loss": 2.9191, "mean_token_accuracy": 0.4786099460754943, "step": 11058 }, { "epoch": 2.050241008527994, "grad_norm": 5.59375, "learning_rate": 7.949758991472007e-06, "loss": 2.5432, "mean_token_accuracy": 0.5020955574182733, "step": 11059 }, { "epoch": 2.0504263997033743, "grad_norm": 6.41796875, "learning_rate": 7.949573600296626e-06, "loss": 3.1121, "mean_token_accuracy": 0.4605304212168487, "step": 11060 }, { "epoch": 2.050611790878754, "grad_norm": 6.90625, "learning_rate": 7.949388209121246e-06, "loss": 3.0774, "mean_token_accuracy": 0.464281214564697, "step": 11061 }, { "epoch": 2.050797182054134, "grad_norm": 5.921875, "learning_rate": 7.949202817945867e-06, "loss": 3.0524, "mean_token_accuracy": 0.47358024691358025, "step": 11062 }, { "epoch": 2.0509825732295144, "grad_norm": 7.1875, "learning_rate": 7.949017426770487e-06, "loss": 2.6627, "mean_token_accuracy": 0.5082650567773466, "step": 11063 }, { "epoch": 2.051167964404894, "grad_norm": 7.7734375, "learning_rate": 7.948832035595106e-06, "loss": 2.2627, "mean_token_accuracy": 0.5312225153913809, "step": 11064 }, { "epoch": 2.0513533555802743, "grad_norm": 6.0234375, "learning_rate": 7.948646644419727e-06, "loss": 2.9321, "mean_token_accuracy": 0.5119151833479234, "step": 11065 }, { "epoch": 2.0515387467556545, "grad_norm": 6.41015625, "learning_rate": 7.948461253244345e-06, "loss": 2.2864, "mean_token_accuracy": 0.553906904391528, "step": 11066 }, { "epoch": 2.0517241379310347, "grad_norm": 7.75, "learning_rate": 7.948275862068966e-06, "loss": 2.5996, "mean_token_accuracy": 0.5200213561131874, "step": 11067 }, { "epoch": 2.0519095291064144, "grad_norm": 5.9140625, "learning_rate": 7.948090470893586e-06, "loss": 2.631, "mean_token_accuracy": 0.5, "step": 11068 }, { "epoch": 2.0520949202817946, "grad_norm": 6.3828125, "learning_rate": 7.947905079718205e-06, "loss": 3.0364, "mean_token_accuracy": 0.4728038367060475, "step": 11069 }, { "epoch": 2.052280311457175, "grad_norm": 6.83984375, "learning_rate": 7.947719688542826e-06, "loss": 2.7977, "mean_token_accuracy": 0.46464539383017067, "step": 11070 }, { "epoch": 2.0524657026325546, "grad_norm": 7.359375, "learning_rate": 7.947534297367446e-06, "loss": 2.7611, "mean_token_accuracy": 0.4973718791064389, "step": 11071 }, { "epoch": 2.0526510938079348, "grad_norm": 10.0859375, "learning_rate": 7.947348906192067e-06, "loss": 2.6969, "mean_token_accuracy": 0.47619625941219335, "step": 11072 }, { "epoch": 2.052836484983315, "grad_norm": 6.5625, "learning_rate": 7.947163515016686e-06, "loss": 2.6009, "mean_token_accuracy": 0.482989403234802, "step": 11073 }, { "epoch": 2.0530218761586947, "grad_norm": 8.0859375, "learning_rate": 7.946978123841306e-06, "loss": 2.8616, "mean_token_accuracy": 0.47792734114922564, "step": 11074 }, { "epoch": 2.053207267334075, "grad_norm": 7.2734375, "learning_rate": 7.946792732665925e-06, "loss": 3.3049, "mean_token_accuracy": 0.4371144403877628, "step": 11075 }, { "epoch": 2.053392658509455, "grad_norm": 8.0234375, "learning_rate": 7.946607341490545e-06, "loss": 3.1219, "mean_token_accuracy": 0.456532877882152, "step": 11076 }, { "epoch": 2.053578049684835, "grad_norm": 8.375, "learning_rate": 7.946421950315166e-06, "loss": 3.1793, "mean_token_accuracy": 0.4536959786417487, "step": 11077 }, { "epoch": 2.053763440860215, "grad_norm": 7.82421875, "learning_rate": 7.946236559139786e-06, "loss": 2.9693, "mean_token_accuracy": 0.48003237992444686, "step": 11078 }, { "epoch": 2.053948832035595, "grad_norm": 11.9921875, "learning_rate": 7.946051167964405e-06, "loss": 2.3218, "mean_token_accuracy": 0.499479979199168, "step": 11079 }, { "epoch": 2.054134223210975, "grad_norm": 6.72265625, "learning_rate": 7.945865776789026e-06, "loss": 2.3435, "mean_token_accuracy": 0.5111767186840995, "step": 11080 }, { "epoch": 2.054319614386355, "grad_norm": 5.72265625, "learning_rate": 7.945680385613646e-06, "loss": 2.9086, "mean_token_accuracy": 0.47548048922539315, "step": 11081 }, { "epoch": 2.0545050055617353, "grad_norm": 6.84765625, "learning_rate": 7.945494994438265e-06, "loss": 2.3728, "mean_token_accuracy": 0.5271374379924132, "step": 11082 }, { "epoch": 2.0546903967371155, "grad_norm": 7.76953125, "learning_rate": 7.945309603262885e-06, "loss": 3.1417, "mean_token_accuracy": 0.47062262496346097, "step": 11083 }, { "epoch": 2.0548757879124953, "grad_norm": 7.0859375, "learning_rate": 7.945124212087504e-06, "loss": 2.8168, "mean_token_accuracy": 0.4705746329055382, "step": 11084 }, { "epoch": 2.0550611790878754, "grad_norm": 9.6796875, "learning_rate": 7.944938820912125e-06, "loss": 2.2831, "mean_token_accuracy": 0.5317373461012312, "step": 11085 }, { "epoch": 2.0552465702632556, "grad_norm": 7.19140625, "learning_rate": 7.944753429736745e-06, "loss": 2.5842, "mean_token_accuracy": 0.5192584075883875, "step": 11086 }, { "epoch": 2.0554319614386354, "grad_norm": 6.58203125, "learning_rate": 7.944568038561366e-06, "loss": 2.9399, "mean_token_accuracy": 0.4646300237886566, "step": 11087 }, { "epoch": 2.0556173526140156, "grad_norm": 6.73828125, "learning_rate": 7.944382647385986e-06, "loss": 2.6766, "mean_token_accuracy": 0.5211722179189259, "step": 11088 }, { "epoch": 2.0558027437893958, "grad_norm": 9.140625, "learning_rate": 7.944197256210605e-06, "loss": 2.5845, "mean_token_accuracy": 0.5061963775023832, "step": 11089 }, { "epoch": 2.0559881349647755, "grad_norm": 9.0703125, "learning_rate": 7.944011865035225e-06, "loss": 3.0319, "mean_token_accuracy": 0.46777003484320556, "step": 11090 }, { "epoch": 2.0561735261401557, "grad_norm": 9.9609375, "learning_rate": 7.943826473859844e-06, "loss": 3.0523, "mean_token_accuracy": 0.4605847237103934, "step": 11091 }, { "epoch": 2.056358917315536, "grad_norm": 6.6640625, "learning_rate": 7.943641082684465e-06, "loss": 2.4852, "mean_token_accuracy": 0.49620599577061825, "step": 11092 }, { "epoch": 2.0565443084909156, "grad_norm": 7.9453125, "learning_rate": 7.943455691509085e-06, "loss": 3.8453, "mean_token_accuracy": 0.41403508771929826, "step": 11093 }, { "epoch": 2.056729699666296, "grad_norm": 10.078125, "learning_rate": 7.943270300333706e-06, "loss": 3.087, "mean_token_accuracy": 0.4543714866642746, "step": 11094 }, { "epoch": 2.056915090841676, "grad_norm": 7.703125, "learning_rate": 7.943084909158324e-06, "loss": 2.9585, "mean_token_accuracy": 0.47253653936822254, "step": 11095 }, { "epoch": 2.057100482017056, "grad_norm": 5.7109375, "learning_rate": 7.942899517982945e-06, "loss": 2.9384, "mean_token_accuracy": 0.46965339791130617, "step": 11096 }, { "epoch": 2.057285873192436, "grad_norm": 11.4609375, "learning_rate": 7.942714126807565e-06, "loss": 3.8138, "mean_token_accuracy": 0.4392918483287092, "step": 11097 }, { "epoch": 2.057471264367816, "grad_norm": 12.578125, "learning_rate": 7.942528735632184e-06, "loss": 2.7812, "mean_token_accuracy": 0.47592030610898534, "step": 11098 }, { "epoch": 2.0576566555431963, "grad_norm": 10.4375, "learning_rate": 7.942343344456805e-06, "loss": 2.5163, "mean_token_accuracy": 0.5046496398166339, "step": 11099 }, { "epoch": 2.057842046718576, "grad_norm": 6.25, "learning_rate": 7.942157953281424e-06, "loss": 2.691, "mean_token_accuracy": 0.48757763975155277, "step": 11100 }, { "epoch": 2.0580274378939563, "grad_norm": 10.0234375, "learning_rate": 7.941972562106044e-06, "loss": 3.0978, "mean_token_accuracy": 0.44033816425120775, "step": 11101 }, { "epoch": 2.0582128290693364, "grad_norm": 10.8359375, "learning_rate": 7.941787170930665e-06, "loss": 3.6431, "mean_token_accuracy": 0.4451305575158786, "step": 11102 }, { "epoch": 2.058398220244716, "grad_norm": 7.55078125, "learning_rate": 7.941601779755285e-06, "loss": 2.9076, "mean_token_accuracy": 0.45856862971072954, "step": 11103 }, { "epoch": 2.0585836114200964, "grad_norm": 6.44140625, "learning_rate": 7.941416388579904e-06, "loss": 3.3467, "mean_token_accuracy": 0.4203998073217726, "step": 11104 }, { "epoch": 2.0587690025954766, "grad_norm": 5.80859375, "learning_rate": 7.941230997404524e-06, "loss": 3.0493, "mean_token_accuracy": 0.4525930445393533, "step": 11105 }, { "epoch": 2.0589543937708563, "grad_norm": 12.96875, "learning_rate": 7.941045606229145e-06, "loss": 2.66, "mean_token_accuracy": 0.4881414980570816, "step": 11106 }, { "epoch": 2.0591397849462365, "grad_norm": 11.5, "learning_rate": 7.940860215053764e-06, "loss": 2.7268, "mean_token_accuracy": 0.4617248062015504, "step": 11107 }, { "epoch": 2.0593251761216167, "grad_norm": 7.86328125, "learning_rate": 7.940674823878384e-06, "loss": 2.5614, "mean_token_accuracy": 0.4952796956460476, "step": 11108 }, { "epoch": 2.0595105672969964, "grad_norm": 7.828125, "learning_rate": 7.940489432703003e-06, "loss": 2.8869, "mean_token_accuracy": 0.465930800254723, "step": 11109 }, { "epoch": 2.0596959584723766, "grad_norm": 7.98046875, "learning_rate": 7.940304041527625e-06, "loss": 2.9661, "mean_token_accuracy": 0.47250658087159986, "step": 11110 }, { "epoch": 2.059881349647757, "grad_norm": 8.6640625, "learning_rate": 7.940118650352244e-06, "loss": 2.4165, "mean_token_accuracy": 0.5272820644498651, "step": 11111 }, { "epoch": 2.060066740823137, "grad_norm": 7.5234375, "learning_rate": 7.939933259176864e-06, "loss": 3.0407, "mean_token_accuracy": 0.4624349119761964, "step": 11112 }, { "epoch": 2.0602521319985168, "grad_norm": 8.8515625, "learning_rate": 7.939747868001483e-06, "loss": 2.8667, "mean_token_accuracy": 0.48247232472324725, "step": 11113 }, { "epoch": 2.060437523173897, "grad_norm": 7.921875, "learning_rate": 7.939562476826104e-06, "loss": 3.0871, "mean_token_accuracy": 0.46703444564047364, "step": 11114 }, { "epoch": 2.060622914349277, "grad_norm": 7.0625, "learning_rate": 7.939377085650724e-06, "loss": 3.4347, "mean_token_accuracy": 0.45660749506903353, "step": 11115 }, { "epoch": 2.060808305524657, "grad_norm": 7.21875, "learning_rate": 7.939191694475343e-06, "loss": 2.4902, "mean_token_accuracy": 0.5036619718309859, "step": 11116 }, { "epoch": 2.060993696700037, "grad_norm": 7.515625, "learning_rate": 7.939006303299963e-06, "loss": 2.0226, "mean_token_accuracy": 0.556198347107438, "step": 11117 }, { "epoch": 2.0611790878754173, "grad_norm": 5.75390625, "learning_rate": 7.938820912124584e-06, "loss": 3.2231, "mean_token_accuracy": 0.4441551679250195, "step": 11118 }, { "epoch": 2.061364479050797, "grad_norm": 7.55078125, "learning_rate": 7.938635520949204e-06, "loss": 3.2792, "mean_token_accuracy": 0.4496652465003043, "step": 11119 }, { "epoch": 2.061549870226177, "grad_norm": 6.578125, "learning_rate": 7.938450129773823e-06, "loss": 2.7822, "mean_token_accuracy": 0.4740213523131673, "step": 11120 }, { "epoch": 2.0617352614015574, "grad_norm": 6.1875, "learning_rate": 7.938264738598444e-06, "loss": 2.5927, "mean_token_accuracy": 0.5084284232365145, "step": 11121 }, { "epoch": 2.061920652576937, "grad_norm": 8.3671875, "learning_rate": 7.938079347423062e-06, "loss": 3.0243, "mean_token_accuracy": 0.4475457170356112, "step": 11122 }, { "epoch": 2.0621060437523173, "grad_norm": 7.71875, "learning_rate": 7.937893956247683e-06, "loss": 2.595, "mean_token_accuracy": 0.507460126907906, "step": 11123 }, { "epoch": 2.0622914349276975, "grad_norm": 6.87890625, "learning_rate": 7.937708565072303e-06, "loss": 3.0396, "mean_token_accuracy": 0.4526813880126183, "step": 11124 }, { "epoch": 2.0624768261030777, "grad_norm": 6.77734375, "learning_rate": 7.937523173896922e-06, "loss": 2.7191, "mean_token_accuracy": 0.5110604638691514, "step": 11125 }, { "epoch": 2.0626622172784574, "grad_norm": 8.1015625, "learning_rate": 7.937337782721544e-06, "loss": 2.9408, "mean_token_accuracy": 0.4760226882090065, "step": 11126 }, { "epoch": 2.0628476084538376, "grad_norm": 5.93359375, "learning_rate": 7.937152391546163e-06, "loss": 2.7256, "mean_token_accuracy": 0.49726231956197114, "step": 11127 }, { "epoch": 2.063032999629218, "grad_norm": 7.56640625, "learning_rate": 7.936967000370784e-06, "loss": 3.4549, "mean_token_accuracy": 0.4217378141083863, "step": 11128 }, { "epoch": 2.0632183908045976, "grad_norm": 7.51171875, "learning_rate": 7.936781609195403e-06, "loss": 3.0561, "mean_token_accuracy": 0.4519373279695109, "step": 11129 }, { "epoch": 2.0634037819799778, "grad_norm": 6.0234375, "learning_rate": 7.936596218020023e-06, "loss": 2.4912, "mean_token_accuracy": 0.5134871628209294, "step": 11130 }, { "epoch": 2.063589173155358, "grad_norm": 6.94140625, "learning_rate": 7.936410826844644e-06, "loss": 1.9724, "mean_token_accuracy": 0.5705767984445884, "step": 11131 }, { "epoch": 2.0637745643307377, "grad_norm": 5.2734375, "learning_rate": 7.936225435669262e-06, "loss": 2.3906, "mean_token_accuracy": 0.5038128068526063, "step": 11132 }, { "epoch": 2.063959955506118, "grad_norm": 8.2578125, "learning_rate": 7.936040044493883e-06, "loss": 3.2315, "mean_token_accuracy": 0.45629232950070875, "step": 11133 }, { "epoch": 2.064145346681498, "grad_norm": 6.609375, "learning_rate": 7.935854653318503e-06, "loss": 2.7083, "mean_token_accuracy": 0.4942213233169604, "step": 11134 }, { "epoch": 2.064330737856878, "grad_norm": 6.5546875, "learning_rate": 7.935669262143124e-06, "loss": 1.9959, "mean_token_accuracy": 0.5652116576552816, "step": 11135 }, { "epoch": 2.064516129032258, "grad_norm": 7.484375, "learning_rate": 7.935483870967743e-06, "loss": 2.6744, "mean_token_accuracy": 0.5312447078746825, "step": 11136 }, { "epoch": 2.064701520207638, "grad_norm": 9.140625, "learning_rate": 7.935298479792363e-06, "loss": 2.6932, "mean_token_accuracy": 0.47133838383838383, "step": 11137 }, { "epoch": 2.0648869113830184, "grad_norm": 7.484375, "learning_rate": 7.935113088616982e-06, "loss": 2.9929, "mean_token_accuracy": 0.45969983324068925, "step": 11138 }, { "epoch": 2.065072302558398, "grad_norm": 7.10546875, "learning_rate": 7.934927697441602e-06, "loss": 3.8653, "mean_token_accuracy": 0.4068852073786864, "step": 11139 }, { "epoch": 2.0652576937337783, "grad_norm": 7.00390625, "learning_rate": 7.934742306266223e-06, "loss": 2.9088, "mean_token_accuracy": 0.48401761613865607, "step": 11140 }, { "epoch": 2.0654430849091585, "grad_norm": 8.5546875, "learning_rate": 7.934556915090842e-06, "loss": 2.5674, "mean_token_accuracy": 0.49857217030114226, "step": 11141 }, { "epoch": 2.0656284760845383, "grad_norm": 8.6328125, "learning_rate": 7.934371523915462e-06, "loss": 3.1848, "mean_token_accuracy": 0.4727343547130615, "step": 11142 }, { "epoch": 2.0658138672599184, "grad_norm": 6.6796875, "learning_rate": 7.934186132740083e-06, "loss": 2.7549, "mean_token_accuracy": 0.49181698721777567, "step": 11143 }, { "epoch": 2.0659992584352986, "grad_norm": 9.65625, "learning_rate": 7.934000741564703e-06, "loss": 2.7603, "mean_token_accuracy": 0.47869609856262835, "step": 11144 }, { "epoch": 2.0661846496106784, "grad_norm": 6.921875, "learning_rate": 7.933815350389322e-06, "loss": 3.2121, "mean_token_accuracy": 0.4722010662604722, "step": 11145 }, { "epoch": 2.0663700407860586, "grad_norm": 6.55078125, "learning_rate": 7.933629959213942e-06, "loss": 2.6413, "mean_token_accuracy": 0.5013069997095556, "step": 11146 }, { "epoch": 2.0665554319614388, "grad_norm": 5.91796875, "learning_rate": 7.933444568038561e-06, "loss": 2.9234, "mean_token_accuracy": 0.4948948948948949, "step": 11147 }, { "epoch": 2.0667408231368185, "grad_norm": 5.79296875, "learning_rate": 7.933259176863182e-06, "loss": 3.265, "mean_token_accuracy": 0.44822739340933937, "step": 11148 }, { "epoch": 2.0669262143121987, "grad_norm": 5.36328125, "learning_rate": 7.933073785687802e-06, "loss": 2.6778, "mean_token_accuracy": 0.5074057939446743, "step": 11149 }, { "epoch": 2.067111605487579, "grad_norm": 6.609375, "learning_rate": 7.932888394512421e-06, "loss": 2.8743, "mean_token_accuracy": 0.48020571701949527, "step": 11150 }, { "epoch": 2.0672969966629586, "grad_norm": 6.2890625, "learning_rate": 7.932703003337041e-06, "loss": 2.6268, "mean_token_accuracy": 0.502021018593371, "step": 11151 }, { "epoch": 2.067482387838339, "grad_norm": 6.046875, "learning_rate": 7.932517612161662e-06, "loss": 2.8148, "mean_token_accuracy": 0.49466728495246926, "step": 11152 }, { "epoch": 2.067667779013719, "grad_norm": 6.6171875, "learning_rate": 7.932332220986282e-06, "loss": 2.6459, "mean_token_accuracy": 0.504446871586831, "step": 11153 }, { "epoch": 2.067853170189099, "grad_norm": 6.234375, "learning_rate": 7.932146829810901e-06, "loss": 2.7008, "mean_token_accuracy": 0.47444108761329307, "step": 11154 }, { "epoch": 2.068038561364479, "grad_norm": 6.41796875, "learning_rate": 7.931961438635522e-06, "loss": 3.479, "mean_token_accuracy": 0.40685191032522894, "step": 11155 }, { "epoch": 2.068223952539859, "grad_norm": 6.44921875, "learning_rate": 7.93177604746014e-06, "loss": 2.2386, "mean_token_accuracy": 0.5574893791969302, "step": 11156 }, { "epoch": 2.0684093437152393, "grad_norm": 6.07421875, "learning_rate": 7.931590656284761e-06, "loss": 2.4362, "mean_token_accuracy": 0.5046847888953152, "step": 11157 }, { "epoch": 2.068594734890619, "grad_norm": 9.1640625, "learning_rate": 7.931405265109382e-06, "loss": 3.2901, "mean_token_accuracy": 0.44636927963043166, "step": 11158 }, { "epoch": 2.0687801260659993, "grad_norm": 7.5859375, "learning_rate": 7.931219873934002e-06, "loss": 2.1656, "mean_token_accuracy": 0.5534294234592445, "step": 11159 }, { "epoch": 2.0689655172413794, "grad_norm": 7.265625, "learning_rate": 7.93103448275862e-06, "loss": 2.6748, "mean_token_accuracy": 0.47854077253218885, "step": 11160 }, { "epoch": 2.069150908416759, "grad_norm": 6.68359375, "learning_rate": 7.930849091583241e-06, "loss": 2.7176, "mean_token_accuracy": 0.4877137519758586, "step": 11161 }, { "epoch": 2.0693362995921394, "grad_norm": 7.328125, "learning_rate": 7.930663700407862e-06, "loss": 2.7989, "mean_token_accuracy": 0.4841511072514112, "step": 11162 }, { "epoch": 2.0695216907675196, "grad_norm": 7.1796875, "learning_rate": 7.93047830923248e-06, "loss": 2.3613, "mean_token_accuracy": 0.5160154284755996, "step": 11163 }, { "epoch": 2.0697070819428993, "grad_norm": 6.21875, "learning_rate": 7.930292918057101e-06, "loss": 2.7173, "mean_token_accuracy": 0.502951149064423, "step": 11164 }, { "epoch": 2.0698924731182795, "grad_norm": 6.64453125, "learning_rate": 7.93010752688172e-06, "loss": 2.7911, "mean_token_accuracy": 0.49530878028287356, "step": 11165 }, { "epoch": 2.0700778642936597, "grad_norm": 6.46875, "learning_rate": 7.92992213570634e-06, "loss": 2.3694, "mean_token_accuracy": 0.5239942369500167, "step": 11166 }, { "epoch": 2.07026325546904, "grad_norm": 10.5859375, "learning_rate": 7.929736744530961e-06, "loss": 2.4885, "mean_token_accuracy": 0.5001112594570538, "step": 11167 }, { "epoch": 2.0704486466444196, "grad_norm": 11.4375, "learning_rate": 7.929551353355581e-06, "loss": 2.3009, "mean_token_accuracy": 0.5181575224021381, "step": 11168 }, { "epoch": 2.0706340378198, "grad_norm": 5.9609375, "learning_rate": 7.929365962180202e-06, "loss": 2.7442, "mean_token_accuracy": 0.5027252324462969, "step": 11169 }, { "epoch": 2.07081942899518, "grad_norm": 7.8046875, "learning_rate": 7.92918057100482e-06, "loss": 2.4878, "mean_token_accuracy": 0.49542374895994296, "step": 11170 }, { "epoch": 2.0710048201705598, "grad_norm": 8.390625, "learning_rate": 7.928995179829441e-06, "loss": 3.2973, "mean_token_accuracy": 0.4512610088070456, "step": 11171 }, { "epoch": 2.07119021134594, "grad_norm": 6.125, "learning_rate": 7.92880978865406e-06, "loss": 2.8435, "mean_token_accuracy": 0.4988403963736032, "step": 11172 }, { "epoch": 2.07137560252132, "grad_norm": 5.90625, "learning_rate": 7.92862439747868e-06, "loss": 2.7373, "mean_token_accuracy": 0.4900874635568513, "step": 11173 }, { "epoch": 2.0715609936967, "grad_norm": 8.28125, "learning_rate": 7.928439006303301e-06, "loss": 3.2347, "mean_token_accuracy": 0.45322410147991543, "step": 11174 }, { "epoch": 2.07174638487208, "grad_norm": 7.19140625, "learning_rate": 7.928253615127921e-06, "loss": 3.3862, "mean_token_accuracy": 0.4428425804572594, "step": 11175 }, { "epoch": 2.0719317760474603, "grad_norm": 8.453125, "learning_rate": 7.92806822395254e-06, "loss": 2.1702, "mean_token_accuracy": 0.5416078984485191, "step": 11176 }, { "epoch": 2.07211716722284, "grad_norm": 9.796875, "learning_rate": 7.92788283277716e-06, "loss": 3.212, "mean_token_accuracy": 0.45143620574482296, "step": 11177 }, { "epoch": 2.07230255839822, "grad_norm": 6.73046875, "learning_rate": 7.927697441601781e-06, "loss": 2.714, "mean_token_accuracy": 0.48797385620915035, "step": 11178 }, { "epoch": 2.0724879495736004, "grad_norm": 8.4140625, "learning_rate": 7.9275120504264e-06, "loss": 2.9505, "mean_token_accuracy": 0.46327052060044716, "step": 11179 }, { "epoch": 2.07267334074898, "grad_norm": 10.2421875, "learning_rate": 7.92732665925102e-06, "loss": 2.5954, "mean_token_accuracy": 0.4898520361398455, "step": 11180 }, { "epoch": 2.0728587319243603, "grad_norm": 8.5234375, "learning_rate": 7.92714126807564e-06, "loss": 2.7867, "mean_token_accuracy": 0.4818530539982296, "step": 11181 }, { "epoch": 2.0730441230997405, "grad_norm": 6.046875, "learning_rate": 7.92695587690026e-06, "loss": 2.9596, "mean_token_accuracy": 0.47026022304832715, "step": 11182 }, { "epoch": 2.0732295142751207, "grad_norm": 10.1953125, "learning_rate": 7.92677048572488e-06, "loss": 2.9316, "mean_token_accuracy": 0.45340201245807377, "step": 11183 }, { "epoch": 2.0734149054505004, "grad_norm": 6.33203125, "learning_rate": 7.9265850945495e-06, "loss": 2.7745, "mean_token_accuracy": 0.478374672815655, "step": 11184 }, { "epoch": 2.0736002966258806, "grad_norm": 5.5859375, "learning_rate": 7.92639970337412e-06, "loss": 2.1262, "mean_token_accuracy": 0.5523270440251572, "step": 11185 }, { "epoch": 2.073785687801261, "grad_norm": 6.9609375, "learning_rate": 7.92621431219874e-06, "loss": 2.8001, "mean_token_accuracy": 0.49698228950232515, "step": 11186 }, { "epoch": 2.0739710789766406, "grad_norm": 7.1328125, "learning_rate": 7.92602892102336e-06, "loss": 3.0336, "mean_token_accuracy": 0.4376704111680663, "step": 11187 }, { "epoch": 2.0741564701520208, "grad_norm": 6.12109375, "learning_rate": 7.92584352984798e-06, "loss": 3.0557, "mean_token_accuracy": 0.4730881494454174, "step": 11188 }, { "epoch": 2.074341861327401, "grad_norm": 6.48046875, "learning_rate": 7.9256581386726e-06, "loss": 3.4424, "mean_token_accuracy": 0.4170359428852782, "step": 11189 }, { "epoch": 2.0745272525027807, "grad_norm": 5.6875, "learning_rate": 7.925472747497219e-06, "loss": 2.6061, "mean_token_accuracy": 0.49539447336804165, "step": 11190 }, { "epoch": 2.074712643678161, "grad_norm": 7.390625, "learning_rate": 7.92528735632184e-06, "loss": 2.8948, "mean_token_accuracy": 0.5060207224866984, "step": 11191 }, { "epoch": 2.074898034853541, "grad_norm": 7.875, "learning_rate": 7.92510196514646e-06, "loss": 2.8509, "mean_token_accuracy": 0.47306075659151703, "step": 11192 }, { "epoch": 2.075083426028921, "grad_norm": 9.9765625, "learning_rate": 7.92491657397108e-06, "loss": 2.7732, "mean_token_accuracy": 0.4957157784743992, "step": 11193 }, { "epoch": 2.075268817204301, "grad_norm": 9.6796875, "learning_rate": 7.924731182795699e-06, "loss": 3.0266, "mean_token_accuracy": 0.49566587864460204, "step": 11194 }, { "epoch": 2.075454208379681, "grad_norm": 7.484375, "learning_rate": 7.92454579162032e-06, "loss": 2.7491, "mean_token_accuracy": 0.480374464245432, "step": 11195 }, { "epoch": 2.0756395995550614, "grad_norm": 6.92578125, "learning_rate": 7.92436040044494e-06, "loss": 2.4174, "mean_token_accuracy": 0.5198670254867817, "step": 11196 }, { "epoch": 2.075824990730441, "grad_norm": 6.46875, "learning_rate": 7.924175009269559e-06, "loss": 2.4569, "mean_token_accuracy": 0.5028197198471894, "step": 11197 }, { "epoch": 2.0760103819058213, "grad_norm": 5.87109375, "learning_rate": 7.92398961809418e-06, "loss": 2.3831, "mean_token_accuracy": 0.5193777292576419, "step": 11198 }, { "epoch": 2.0761957730812015, "grad_norm": 7.3828125, "learning_rate": 7.9238042269188e-06, "loss": 2.8468, "mean_token_accuracy": 0.45149497323410365, "step": 11199 }, { "epoch": 2.0763811642565813, "grad_norm": 6.328125, "learning_rate": 7.92361883574342e-06, "loss": 2.4394, "mean_token_accuracy": 0.5022538897775193, "step": 11200 }, { "epoch": 2.0765665554319614, "grad_norm": 6.96484375, "learning_rate": 7.923433444568039e-06, "loss": 3.0919, "mean_token_accuracy": 0.46627349797272394, "step": 11201 }, { "epoch": 2.0767519466073416, "grad_norm": 6.72265625, "learning_rate": 7.92324805339266e-06, "loss": 3.1912, "mean_token_accuracy": 0.44732724902216425, "step": 11202 }, { "epoch": 2.0769373377827214, "grad_norm": 6.0703125, "learning_rate": 7.923062662217278e-06, "loss": 2.7846, "mean_token_accuracy": 0.4866810655147588, "step": 11203 }, { "epoch": 2.0771227289581016, "grad_norm": 5.61328125, "learning_rate": 7.922877271041899e-06, "loss": 2.7956, "mean_token_accuracy": 0.4658833522083805, "step": 11204 }, { "epoch": 2.0773081201334818, "grad_norm": 12.671875, "learning_rate": 7.92269187986652e-06, "loss": 3.1031, "mean_token_accuracy": 0.4868867633446467, "step": 11205 }, { "epoch": 2.0774935113088615, "grad_norm": 7.34375, "learning_rate": 7.922506488691138e-06, "loss": 3.2396, "mean_token_accuracy": 0.4571019398515818, "step": 11206 }, { "epoch": 2.0776789024842417, "grad_norm": 7.5390625, "learning_rate": 7.92232109751576e-06, "loss": 2.8835, "mean_token_accuracy": 0.48490749756572543, "step": 11207 }, { "epoch": 2.077864293659622, "grad_norm": 7.2578125, "learning_rate": 7.922135706340379e-06, "loss": 3.9074, "mean_token_accuracy": 0.41481151677055506, "step": 11208 }, { "epoch": 2.078049684835002, "grad_norm": 5.72265625, "learning_rate": 7.921950315165e-06, "loss": 2.8236, "mean_token_accuracy": 0.47912110798708735, "step": 11209 }, { "epoch": 2.078235076010382, "grad_norm": 7.48828125, "learning_rate": 7.921764923989618e-06, "loss": 4.0121, "mean_token_accuracy": 0.41961356179365655, "step": 11210 }, { "epoch": 2.078420467185762, "grad_norm": 8.125, "learning_rate": 7.921579532814239e-06, "loss": 2.7882, "mean_token_accuracy": 0.4696940542620743, "step": 11211 }, { "epoch": 2.078605858361142, "grad_norm": 8.7578125, "learning_rate": 7.92139414163886e-06, "loss": 2.6548, "mean_token_accuracy": 0.5006422982599557, "step": 11212 }, { "epoch": 2.078791249536522, "grad_norm": 8.90625, "learning_rate": 7.921208750463478e-06, "loss": 2.8819, "mean_token_accuracy": 0.4953345774460144, "step": 11213 }, { "epoch": 2.078976640711902, "grad_norm": 7.8203125, "learning_rate": 7.921023359288099e-06, "loss": 3.2911, "mean_token_accuracy": 0.44637173774665817, "step": 11214 }, { "epoch": 2.0791620318872823, "grad_norm": 7.39453125, "learning_rate": 7.920837968112719e-06, "loss": 2.6889, "mean_token_accuracy": 0.4697923956146489, "step": 11215 }, { "epoch": 2.079347423062662, "grad_norm": 8.234375, "learning_rate": 7.92065257693734e-06, "loss": 2.763, "mean_token_accuracy": 0.48798328108672934, "step": 11216 }, { "epoch": 2.0795328142380423, "grad_norm": 7.11328125, "learning_rate": 7.920467185761958e-06, "loss": 2.6089, "mean_token_accuracy": 0.5180096051227321, "step": 11217 }, { "epoch": 2.0797182054134224, "grad_norm": 9.3984375, "learning_rate": 7.920281794586579e-06, "loss": 2.8342, "mean_token_accuracy": 0.46156077748767044, "step": 11218 }, { "epoch": 2.079903596588802, "grad_norm": 7.01171875, "learning_rate": 7.920096403411198e-06, "loss": 3.4359, "mean_token_accuracy": 0.45224513675588707, "step": 11219 }, { "epoch": 2.0800889877641824, "grad_norm": 8.1640625, "learning_rate": 7.919911012235818e-06, "loss": 2.6304, "mean_token_accuracy": 0.5109052883178966, "step": 11220 }, { "epoch": 2.0802743789395626, "grad_norm": 10.7421875, "learning_rate": 7.919725621060439e-06, "loss": 2.4993, "mean_token_accuracy": 0.5180995475113123, "step": 11221 }, { "epoch": 2.0804597701149423, "grad_norm": 7.23046875, "learning_rate": 7.919540229885057e-06, "loss": 2.5783, "mean_token_accuracy": 0.4964309614097702, "step": 11222 }, { "epoch": 2.0806451612903225, "grad_norm": 7.44140625, "learning_rate": 7.919354838709678e-06, "loss": 3.1658, "mean_token_accuracy": 0.4681544028950543, "step": 11223 }, { "epoch": 2.0808305524657027, "grad_norm": 10.671875, "learning_rate": 7.919169447534298e-06, "loss": 2.4823, "mean_token_accuracy": 0.526896551724138, "step": 11224 }, { "epoch": 2.081015943641083, "grad_norm": 9.40625, "learning_rate": 7.918984056358919e-06, "loss": 2.181, "mean_token_accuracy": 0.5634290662433251, "step": 11225 }, { "epoch": 2.0812013348164626, "grad_norm": 6.34375, "learning_rate": 7.918798665183538e-06, "loss": 2.4768, "mean_token_accuracy": 0.5597649918962723, "step": 11226 }, { "epoch": 2.081386725991843, "grad_norm": 10.390625, "learning_rate": 7.918613274008158e-06, "loss": 2.8365, "mean_token_accuracy": 0.46663706006439437, "step": 11227 }, { "epoch": 2.081572117167223, "grad_norm": 14.5234375, "learning_rate": 7.918427882832777e-06, "loss": 2.4348, "mean_token_accuracy": 0.5061224489795918, "step": 11228 }, { "epoch": 2.0817575083426028, "grad_norm": 12.9296875, "learning_rate": 7.918242491657397e-06, "loss": 3.4438, "mean_token_accuracy": 0.4461184588844163, "step": 11229 }, { "epoch": 2.081942899517983, "grad_norm": 7.12890625, "learning_rate": 7.918057100482018e-06, "loss": 2.5604, "mean_token_accuracy": 0.4986726281961716, "step": 11230 }, { "epoch": 2.082128290693363, "grad_norm": 11.8828125, "learning_rate": 7.917871709306638e-06, "loss": 3.311, "mean_token_accuracy": 0.4283367556468172, "step": 11231 }, { "epoch": 2.082313681868743, "grad_norm": 21.453125, "learning_rate": 7.917686318131257e-06, "loss": 2.5196, "mean_token_accuracy": 0.492292600896861, "step": 11232 }, { "epoch": 2.082499073044123, "grad_norm": 15.640625, "learning_rate": 7.917500926955878e-06, "loss": 2.764, "mean_token_accuracy": 0.48109460055907016, "step": 11233 }, { "epoch": 2.0826844642195033, "grad_norm": 7.92578125, "learning_rate": 7.917315535780498e-06, "loss": 3.0109, "mean_token_accuracy": 0.4821953609931395, "step": 11234 }, { "epoch": 2.082869855394883, "grad_norm": 7.90234375, "learning_rate": 7.917130144605117e-06, "loss": 2.7545, "mean_token_accuracy": 0.47784679089026916, "step": 11235 }, { "epoch": 2.083055246570263, "grad_norm": 11.640625, "learning_rate": 7.916944753429737e-06, "loss": 3.0109, "mean_token_accuracy": 0.4734542565867867, "step": 11236 }, { "epoch": 2.0832406377456434, "grad_norm": 13.6875, "learning_rate": 7.916759362254356e-06, "loss": 2.5166, "mean_token_accuracy": 0.5026559604694256, "step": 11237 }, { "epoch": 2.083426028921023, "grad_norm": 6.29296875, "learning_rate": 7.916573971078977e-06, "loss": 2.1478, "mean_token_accuracy": 0.5757575757575758, "step": 11238 }, { "epoch": 2.0836114200964033, "grad_norm": 6.36328125, "learning_rate": 7.916388579903597e-06, "loss": 2.5818, "mean_token_accuracy": 0.4994792441600952, "step": 11239 }, { "epoch": 2.0837968112717835, "grad_norm": 8.640625, "learning_rate": 7.916203188728218e-06, "loss": 3.3312, "mean_token_accuracy": 0.4643041237113402, "step": 11240 }, { "epoch": 2.0839822024471637, "grad_norm": 6.6328125, "learning_rate": 7.916017797552837e-06, "loss": 2.5959, "mean_token_accuracy": 0.5050780282387912, "step": 11241 }, { "epoch": 2.0841675936225434, "grad_norm": 6.625, "learning_rate": 7.915832406377457e-06, "loss": 2.9855, "mean_token_accuracy": 0.4720715835140998, "step": 11242 }, { "epoch": 2.0843529847979236, "grad_norm": 6.625, "learning_rate": 7.915647015202078e-06, "loss": 2.9797, "mean_token_accuracy": 0.4456854141736471, "step": 11243 }, { "epoch": 2.084538375973304, "grad_norm": 7.15625, "learning_rate": 7.915461624026696e-06, "loss": 3.0983, "mean_token_accuracy": 0.4613572101790763, "step": 11244 }, { "epoch": 2.0847237671486836, "grad_norm": 6.328125, "learning_rate": 7.915276232851317e-06, "loss": 3.2398, "mean_token_accuracy": 0.4519333096842852, "step": 11245 }, { "epoch": 2.0849091583240638, "grad_norm": 7.32421875, "learning_rate": 7.915090841675936e-06, "loss": 2.4483, "mean_token_accuracy": 0.5056314761976273, "step": 11246 }, { "epoch": 2.085094549499444, "grad_norm": 6.5703125, "learning_rate": 7.914905450500558e-06, "loss": 2.7771, "mean_token_accuracy": 0.4961982540129541, "step": 11247 }, { "epoch": 2.0852799406748237, "grad_norm": 6.61328125, "learning_rate": 7.914720059325177e-06, "loss": 2.4718, "mean_token_accuracy": 0.5181505540695452, "step": 11248 }, { "epoch": 2.085465331850204, "grad_norm": 6.1328125, "learning_rate": 7.914534668149797e-06, "loss": 1.9649, "mean_token_accuracy": 0.5939055481364575, "step": 11249 }, { "epoch": 2.085650723025584, "grad_norm": 6.12109375, "learning_rate": 7.914349276974418e-06, "loss": 3.2338, "mean_token_accuracy": 0.45025088728429813, "step": 11250 }, { "epoch": 2.085836114200964, "grad_norm": 6.359375, "learning_rate": 7.914163885799036e-06, "loss": 3.312, "mean_token_accuracy": 0.4452756996397894, "step": 11251 }, { "epoch": 2.086021505376344, "grad_norm": 8.234375, "learning_rate": 7.913978494623657e-06, "loss": 3.0548, "mean_token_accuracy": 0.48682232688646154, "step": 11252 }, { "epoch": 2.086206896551724, "grad_norm": 6.4375, "learning_rate": 7.913793103448276e-06, "loss": 2.8616, "mean_token_accuracy": 0.46505700871898054, "step": 11253 }, { "epoch": 2.0863922877271044, "grad_norm": 6.1640625, "learning_rate": 7.913607712272896e-06, "loss": 2.7472, "mean_token_accuracy": 0.48307846578089747, "step": 11254 }, { "epoch": 2.086577678902484, "grad_norm": 7.11328125, "learning_rate": 7.913422321097517e-06, "loss": 2.7216, "mean_token_accuracy": 0.4607990012484395, "step": 11255 }, { "epoch": 2.0867630700778643, "grad_norm": 6.79296875, "learning_rate": 7.913236929922137e-06, "loss": 2.6188, "mean_token_accuracy": 0.4904751232631107, "step": 11256 }, { "epoch": 2.0869484612532445, "grad_norm": 7.4609375, "learning_rate": 7.913051538746756e-06, "loss": 2.614, "mean_token_accuracy": 0.505157201441531, "step": 11257 }, { "epoch": 2.0871338524286243, "grad_norm": 8.484375, "learning_rate": 7.912866147571376e-06, "loss": 3.0615, "mean_token_accuracy": 0.47309197651663404, "step": 11258 }, { "epoch": 2.0873192436040044, "grad_norm": 8.5078125, "learning_rate": 7.912680756395997e-06, "loss": 3.0495, "mean_token_accuracy": 0.4587781731909846, "step": 11259 }, { "epoch": 2.0875046347793846, "grad_norm": 8.6953125, "learning_rate": 7.912495365220616e-06, "loss": 3.632, "mean_token_accuracy": 0.43979416809605487, "step": 11260 }, { "epoch": 2.0876900259547644, "grad_norm": 8.1171875, "learning_rate": 7.912309974045236e-06, "loss": 2.7262, "mean_token_accuracy": 0.48568235787121106, "step": 11261 }, { "epoch": 2.0878754171301446, "grad_norm": 9.921875, "learning_rate": 7.912124582869855e-06, "loss": 2.9473, "mean_token_accuracy": 0.4910102186004398, "step": 11262 }, { "epoch": 2.0880608083055248, "grad_norm": 7.30859375, "learning_rate": 7.911939191694476e-06, "loss": 2.6281, "mean_token_accuracy": 0.48457099849473156, "step": 11263 }, { "epoch": 2.0882461994809045, "grad_norm": 7.328125, "learning_rate": 7.911753800519096e-06, "loss": 3.1874, "mean_token_accuracy": 0.43985952589991223, "step": 11264 }, { "epoch": 2.0884315906562847, "grad_norm": 8.53125, "learning_rate": 7.911568409343716e-06, "loss": 2.6595, "mean_token_accuracy": 0.48931855056787454, "step": 11265 }, { "epoch": 2.088616981831665, "grad_norm": 10.4140625, "learning_rate": 7.911383018168335e-06, "loss": 2.9954, "mean_token_accuracy": 0.46571508536244055, "step": 11266 }, { "epoch": 2.088802373007045, "grad_norm": 6.71875, "learning_rate": 7.911197626992956e-06, "loss": 2.8687, "mean_token_accuracy": 0.47425258610890075, "step": 11267 }, { "epoch": 2.088987764182425, "grad_norm": 6.921875, "learning_rate": 7.911012235817576e-06, "loss": 2.9902, "mean_token_accuracy": 0.4499319727891156, "step": 11268 }, { "epoch": 2.089173155357805, "grad_norm": 8.15625, "learning_rate": 7.910826844642195e-06, "loss": 2.8762, "mean_token_accuracy": 0.5055301755229622, "step": 11269 }, { "epoch": 2.089358546533185, "grad_norm": 6.67578125, "learning_rate": 7.910641453466816e-06, "loss": 3.0175, "mean_token_accuracy": 0.4822316986496091, "step": 11270 }, { "epoch": 2.089543937708565, "grad_norm": 6.35546875, "learning_rate": 7.910456062291434e-06, "loss": 2.8184, "mean_token_accuracy": 0.4807247494217425, "step": 11271 }, { "epoch": 2.089729328883945, "grad_norm": 6.97265625, "learning_rate": 7.910270671116057e-06, "loss": 3.1166, "mean_token_accuracy": 0.4668491105858938, "step": 11272 }, { "epoch": 2.0899147200593253, "grad_norm": 6.7265625, "learning_rate": 7.910085279940675e-06, "loss": 2.888, "mean_token_accuracy": 0.4923469387755102, "step": 11273 }, { "epoch": 2.090100111234705, "grad_norm": 5.83203125, "learning_rate": 7.909899888765296e-06, "loss": 2.6623, "mean_token_accuracy": 0.48323605266730013, "step": 11274 }, { "epoch": 2.0902855024100853, "grad_norm": 6.81640625, "learning_rate": 7.909714497589915e-06, "loss": 3.0822, "mean_token_accuracy": 0.45037868895272914, "step": 11275 }, { "epoch": 2.0904708935854655, "grad_norm": 6.12109375, "learning_rate": 7.909529106414535e-06, "loss": 2.3874, "mean_token_accuracy": 0.5391040242976461, "step": 11276 }, { "epoch": 2.090656284760845, "grad_norm": 9.6484375, "learning_rate": 7.909343715239156e-06, "loss": 2.1114, "mean_token_accuracy": 0.5447983681154872, "step": 11277 }, { "epoch": 2.0908416759362254, "grad_norm": 6.05078125, "learning_rate": 7.909158324063774e-06, "loss": 3.0152, "mean_token_accuracy": 0.4966131907308378, "step": 11278 }, { "epoch": 2.0910270671116056, "grad_norm": 6.08203125, "learning_rate": 7.908972932888395e-06, "loss": 2.7518, "mean_token_accuracy": 0.48224919835089325, "step": 11279 }, { "epoch": 2.0912124582869858, "grad_norm": 8.71875, "learning_rate": 7.908787541713015e-06, "loss": 3.0807, "mean_token_accuracy": 0.45693035835023665, "step": 11280 }, { "epoch": 2.0913978494623655, "grad_norm": 7.48046875, "learning_rate": 7.908602150537636e-06, "loss": 3.3614, "mean_token_accuracy": 0.44888832098134424, "step": 11281 }, { "epoch": 2.0915832406377457, "grad_norm": 6.24609375, "learning_rate": 7.908416759362255e-06, "loss": 2.8361, "mean_token_accuracy": 0.4682956627978009, "step": 11282 }, { "epoch": 2.091768631813126, "grad_norm": 7.47265625, "learning_rate": 7.908231368186875e-06, "loss": 2.6449, "mean_token_accuracy": 0.5144083384426732, "step": 11283 }, { "epoch": 2.0919540229885056, "grad_norm": 8.40625, "learning_rate": 7.908045977011494e-06, "loss": 2.349, "mean_token_accuracy": 0.5066478961363992, "step": 11284 }, { "epoch": 2.092139414163886, "grad_norm": 5.94921875, "learning_rate": 7.907860585836114e-06, "loss": 2.344, "mean_token_accuracy": 0.5207397622192866, "step": 11285 }, { "epoch": 2.092324805339266, "grad_norm": 10.078125, "learning_rate": 7.907675194660735e-06, "loss": 2.6016, "mean_token_accuracy": 0.49842857142857144, "step": 11286 }, { "epoch": 2.0925101965146458, "grad_norm": 7.15625, "learning_rate": 7.907489803485354e-06, "loss": 2.706, "mean_token_accuracy": 0.49681616832779624, "step": 11287 }, { "epoch": 2.092695587690026, "grad_norm": 7.3671875, "learning_rate": 7.907304412309976e-06, "loss": 2.9226, "mean_token_accuracy": 0.4780289560579121, "step": 11288 }, { "epoch": 2.092880978865406, "grad_norm": 7.75, "learning_rate": 7.907119021134595e-06, "loss": 2.2471, "mean_token_accuracy": 0.5176368123094789, "step": 11289 }, { "epoch": 2.093066370040786, "grad_norm": 8.0390625, "learning_rate": 7.906933629959215e-06, "loss": 3.1462, "mean_token_accuracy": 0.4358316221765914, "step": 11290 }, { "epoch": 2.093251761216166, "grad_norm": 7.26953125, "learning_rate": 7.906748238783834e-06, "loss": 2.5687, "mean_token_accuracy": 0.5039747807017544, "step": 11291 }, { "epoch": 2.0934371523915463, "grad_norm": 7.296875, "learning_rate": 7.906562847608455e-06, "loss": 2.8499, "mean_token_accuracy": 0.46308954203691044, "step": 11292 }, { "epoch": 2.093622543566926, "grad_norm": 7.54296875, "learning_rate": 7.906377456433075e-06, "loss": 2.9991, "mean_token_accuracy": 0.45910687405920725, "step": 11293 }, { "epoch": 2.093807934742306, "grad_norm": 9.046875, "learning_rate": 7.906192065257694e-06, "loss": 2.4361, "mean_token_accuracy": 0.5240981240981241, "step": 11294 }, { "epoch": 2.0939933259176864, "grad_norm": 7.55859375, "learning_rate": 7.906006674082314e-06, "loss": 2.8269, "mean_token_accuracy": 0.47691472026072784, "step": 11295 }, { "epoch": 2.0941787170930666, "grad_norm": 7.73828125, "learning_rate": 7.905821282906935e-06, "loss": 2.9652, "mean_token_accuracy": 0.48208077025942764, "step": 11296 }, { "epoch": 2.0943641082684463, "grad_norm": 6.171875, "learning_rate": 7.905635891731555e-06, "loss": 2.9948, "mean_token_accuracy": 0.4394129024917511, "step": 11297 }, { "epoch": 2.0945494994438265, "grad_norm": 6.87109375, "learning_rate": 7.905450500556174e-06, "loss": 2.7124, "mean_token_accuracy": 0.468534253850239, "step": 11298 }, { "epoch": 2.0947348906192067, "grad_norm": 7.05078125, "learning_rate": 7.905265109380795e-06, "loss": 2.0024, "mean_token_accuracy": 0.5802606661468197, "step": 11299 }, { "epoch": 2.0949202817945864, "grad_norm": 6.359375, "learning_rate": 7.905079718205413e-06, "loss": 2.781, "mean_token_accuracy": 0.4672542166843945, "step": 11300 }, { "epoch": 2.0951056729699666, "grad_norm": 6.58984375, "learning_rate": 7.904894327030034e-06, "loss": 2.8979, "mean_token_accuracy": 0.4804295116037409, "step": 11301 }, { "epoch": 2.095291064145347, "grad_norm": 6.3515625, "learning_rate": 7.904708935854654e-06, "loss": 2.7591, "mean_token_accuracy": 0.48442426535502586, "step": 11302 }, { "epoch": 2.0954764553207266, "grad_norm": 9.0390625, "learning_rate": 7.904523544679273e-06, "loss": 3.4266, "mean_token_accuracy": 0.4580855281789861, "step": 11303 }, { "epoch": 2.0956618464961068, "grad_norm": 6.2109375, "learning_rate": 7.904338153503894e-06, "loss": 3.0724, "mean_token_accuracy": 0.4642050737149688, "step": 11304 }, { "epoch": 2.095847237671487, "grad_norm": 8.9296875, "learning_rate": 7.904152762328514e-06, "loss": 2.6057, "mean_token_accuracy": 0.4832904884318766, "step": 11305 }, { "epoch": 2.0960326288468667, "grad_norm": 8.140625, "learning_rate": 7.903967371153135e-06, "loss": 2.154, "mean_token_accuracy": 0.5661466650288662, "step": 11306 }, { "epoch": 2.096218020022247, "grad_norm": 7.24609375, "learning_rate": 7.903781979977753e-06, "loss": 2.2587, "mean_token_accuracy": 0.5490127758420441, "step": 11307 }, { "epoch": 2.096403411197627, "grad_norm": 6.66796875, "learning_rate": 7.903596588802374e-06, "loss": 2.4575, "mean_token_accuracy": 0.5357698289269052, "step": 11308 }, { "epoch": 2.096588802373007, "grad_norm": 7.19140625, "learning_rate": 7.903411197626993e-06, "loss": 2.6227, "mean_token_accuracy": 0.4932681759250025, "step": 11309 }, { "epoch": 2.096774193548387, "grad_norm": 6.81640625, "learning_rate": 7.903225806451613e-06, "loss": 2.9474, "mean_token_accuracy": 0.48655913978494625, "step": 11310 }, { "epoch": 2.096959584723767, "grad_norm": 6.578125, "learning_rate": 7.903040415276234e-06, "loss": 2.6751, "mean_token_accuracy": 0.48142593447161974, "step": 11311 }, { "epoch": 2.0971449758991474, "grad_norm": 7.69140625, "learning_rate": 7.902855024100854e-06, "loss": 3.1818, "mean_token_accuracy": 0.45396007726980037, "step": 11312 }, { "epoch": 2.097330367074527, "grad_norm": 6.12109375, "learning_rate": 7.902669632925473e-06, "loss": 2.6713, "mean_token_accuracy": 0.48611753817677, "step": 11313 }, { "epoch": 2.0975157582499073, "grad_norm": 7.9375, "learning_rate": 7.902484241750093e-06, "loss": 3.3448, "mean_token_accuracy": 0.4425367362722351, "step": 11314 }, { "epoch": 2.0977011494252875, "grad_norm": 8.84375, "learning_rate": 7.902298850574714e-06, "loss": 2.6868, "mean_token_accuracy": 0.48989597034556975, "step": 11315 }, { "epoch": 2.0978865406006673, "grad_norm": 6.6015625, "learning_rate": 7.902113459399333e-06, "loss": 3.1244, "mean_token_accuracy": 0.45454545454545453, "step": 11316 }, { "epoch": 2.0980719317760474, "grad_norm": 6.55859375, "learning_rate": 7.901928068223953e-06, "loss": 2.3703, "mean_token_accuracy": 0.5241027181258447, "step": 11317 }, { "epoch": 2.0982573229514276, "grad_norm": 6.3046875, "learning_rate": 7.901742677048572e-06, "loss": 2.622, "mean_token_accuracy": 0.5051435590357746, "step": 11318 }, { "epoch": 2.0984427141268074, "grad_norm": 10.2734375, "learning_rate": 7.901557285873193e-06, "loss": 3.1607, "mean_token_accuracy": 0.47352386479036707, "step": 11319 }, { "epoch": 2.0986281053021876, "grad_norm": 7.3515625, "learning_rate": 7.901371894697813e-06, "loss": 2.4999, "mean_token_accuracy": 0.5051663128096249, "step": 11320 }, { "epoch": 2.0988134964775678, "grad_norm": 6.90625, "learning_rate": 7.901186503522434e-06, "loss": 3.1715, "mean_token_accuracy": 0.45490633916387546, "step": 11321 }, { "epoch": 2.0989988876529475, "grad_norm": 7.72265625, "learning_rate": 7.901001112347052e-06, "loss": 3.1255, "mean_token_accuracy": 0.44624479964966063, "step": 11322 }, { "epoch": 2.0991842788283277, "grad_norm": 6.390625, "learning_rate": 7.900815721171673e-06, "loss": 2.823, "mean_token_accuracy": 0.4885814954978468, "step": 11323 }, { "epoch": 2.099369670003708, "grad_norm": 8.3984375, "learning_rate": 7.900630329996293e-06, "loss": 2.5891, "mean_token_accuracy": 0.5083705357142857, "step": 11324 }, { "epoch": 2.099555061179088, "grad_norm": 6.41015625, "learning_rate": 7.900444938820912e-06, "loss": 3.1322, "mean_token_accuracy": 0.4505723204994797, "step": 11325 }, { "epoch": 2.099740452354468, "grad_norm": 7.19140625, "learning_rate": 7.900259547645533e-06, "loss": 2.7176, "mean_token_accuracy": 0.47370431682159686, "step": 11326 }, { "epoch": 2.099925843529848, "grad_norm": 6.77734375, "learning_rate": 7.900074156470151e-06, "loss": 2.9175, "mean_token_accuracy": 0.49468831429144117, "step": 11327 }, { "epoch": 2.100111234705228, "grad_norm": 6.22265625, "learning_rate": 7.899888765294774e-06, "loss": 2.79, "mean_token_accuracy": 0.4715087803129853, "step": 11328 }, { "epoch": 2.100296625880608, "grad_norm": 6.296875, "learning_rate": 7.899703374119392e-06, "loss": 2.0651, "mean_token_accuracy": 0.5835576217274284, "step": 11329 }, { "epoch": 2.100482017055988, "grad_norm": 10.0546875, "learning_rate": 7.899517982944013e-06, "loss": 2.3867, "mean_token_accuracy": 0.5006275275414865, "step": 11330 }, { "epoch": 2.1006674082313683, "grad_norm": 6.22265625, "learning_rate": 7.899332591768633e-06, "loss": 2.4466, "mean_token_accuracy": 0.545332257196664, "step": 11331 }, { "epoch": 2.100852799406748, "grad_norm": 9.53125, "learning_rate": 7.899147200593252e-06, "loss": 2.662, "mean_token_accuracy": 0.5035637515556058, "step": 11332 }, { "epoch": 2.1010381905821283, "grad_norm": 9.875, "learning_rate": 7.898961809417873e-06, "loss": 2.5317, "mean_token_accuracy": 0.49613633957870223, "step": 11333 }, { "epoch": 2.1012235817575085, "grad_norm": 9.84375, "learning_rate": 7.898776418242491e-06, "loss": 3.1199, "mean_token_accuracy": 0.4750234155479238, "step": 11334 }, { "epoch": 2.101408972932888, "grad_norm": 11.6484375, "learning_rate": 7.898591027067112e-06, "loss": 2.6282, "mean_token_accuracy": 0.49972943722943725, "step": 11335 }, { "epoch": 2.1015943641082684, "grad_norm": 7.421875, "learning_rate": 7.898405635891732e-06, "loss": 2.7258, "mean_token_accuracy": 0.49649517259621745, "step": 11336 }, { "epoch": 2.1017797552836486, "grad_norm": 10.921875, "learning_rate": 7.898220244716353e-06, "loss": 2.5406, "mean_token_accuracy": 0.48103607770582796, "step": 11337 }, { "epoch": 2.1019651464590288, "grad_norm": 7.23828125, "learning_rate": 7.898034853540972e-06, "loss": 2.6286, "mean_token_accuracy": 0.5033387742988574, "step": 11338 }, { "epoch": 2.1021505376344085, "grad_norm": 8.25, "learning_rate": 7.897849462365592e-06, "loss": 3.1336, "mean_token_accuracy": 0.4638655462184874, "step": 11339 }, { "epoch": 2.1023359288097887, "grad_norm": 6.3984375, "learning_rate": 7.897664071190213e-06, "loss": 3.159, "mean_token_accuracy": 0.45870275314979003, "step": 11340 }, { "epoch": 2.102521319985169, "grad_norm": 10.296875, "learning_rate": 7.897478680014831e-06, "loss": 2.5997, "mean_token_accuracy": 0.48745082078415414, "step": 11341 }, { "epoch": 2.1027067111605486, "grad_norm": 8.0859375, "learning_rate": 7.897293288839452e-06, "loss": 2.3333, "mean_token_accuracy": 0.5429379371930346, "step": 11342 }, { "epoch": 2.102892102335929, "grad_norm": 6.39453125, "learning_rate": 7.89710789766407e-06, "loss": 3.4395, "mean_token_accuracy": 0.4435817157169693, "step": 11343 }, { "epoch": 2.103077493511309, "grad_norm": 8.21875, "learning_rate": 7.896922506488693e-06, "loss": 2.633, "mean_token_accuracy": 0.48471741637831606, "step": 11344 }, { "epoch": 2.1032628846866888, "grad_norm": 7.02734375, "learning_rate": 7.896737115313312e-06, "loss": 3.0752, "mean_token_accuracy": 0.4471607103705328, "step": 11345 }, { "epoch": 2.103448275862069, "grad_norm": 6.44921875, "learning_rate": 7.896551724137932e-06, "loss": 2.9675, "mean_token_accuracy": 0.47425742574257423, "step": 11346 }, { "epoch": 2.103633667037449, "grad_norm": 8.734375, "learning_rate": 7.896366332962551e-06, "loss": 2.471, "mean_token_accuracy": 0.5067905646890636, "step": 11347 }, { "epoch": 2.103819058212829, "grad_norm": 8.734375, "learning_rate": 7.896180941787172e-06, "loss": 2.6899, "mean_token_accuracy": 0.47939346811819594, "step": 11348 }, { "epoch": 2.104004449388209, "grad_norm": 6.51953125, "learning_rate": 7.895995550611792e-06, "loss": 3.0898, "mean_token_accuracy": 0.4760630389533155, "step": 11349 }, { "epoch": 2.1041898405635893, "grad_norm": 6.19140625, "learning_rate": 7.89581015943641e-06, "loss": 2.8013, "mean_token_accuracy": 0.5156864830373286, "step": 11350 }, { "epoch": 2.1043752317389695, "grad_norm": 8.4140625, "learning_rate": 7.895624768261031e-06, "loss": 2.1929, "mean_token_accuracy": 0.5333640128854119, "step": 11351 }, { "epoch": 2.104560622914349, "grad_norm": 6.4453125, "learning_rate": 7.895439377085652e-06, "loss": 2.9384, "mean_token_accuracy": 0.4659201033703026, "step": 11352 }, { "epoch": 2.1047460140897294, "grad_norm": 5.91796875, "learning_rate": 7.895253985910272e-06, "loss": 2.6375, "mean_token_accuracy": 0.496870925684485, "step": 11353 }, { "epoch": 2.1049314052651096, "grad_norm": 7.1328125, "learning_rate": 7.895068594734891e-06, "loss": 2.519, "mean_token_accuracy": 0.5207536865101038, "step": 11354 }, { "epoch": 2.1051167964404893, "grad_norm": 6.2421875, "learning_rate": 7.894883203559512e-06, "loss": 2.3526, "mean_token_accuracy": 0.5189421015010722, "step": 11355 }, { "epoch": 2.1053021876158695, "grad_norm": 5.8671875, "learning_rate": 7.89469781238413e-06, "loss": 2.5606, "mean_token_accuracy": 0.4931017691933128, "step": 11356 }, { "epoch": 2.1054875787912497, "grad_norm": 6.5078125, "learning_rate": 7.894512421208751e-06, "loss": 2.7606, "mean_token_accuracy": 0.485230352303523, "step": 11357 }, { "epoch": 2.1056729699666294, "grad_norm": 6.21875, "learning_rate": 7.894327030033371e-06, "loss": 2.8605, "mean_token_accuracy": 0.4949232585596222, "step": 11358 }, { "epoch": 2.1058583611420096, "grad_norm": 6.4296875, "learning_rate": 7.89414163885799e-06, "loss": 2.5093, "mean_token_accuracy": 0.5299879243257749, "step": 11359 }, { "epoch": 2.10604375231739, "grad_norm": 7.296875, "learning_rate": 7.89395624768261e-06, "loss": 3.3161, "mean_token_accuracy": 0.44537576360278447, "step": 11360 }, { "epoch": 2.1062291434927696, "grad_norm": 6.3125, "learning_rate": 7.893770856507231e-06, "loss": 2.8225, "mean_token_accuracy": 0.47462941847206386, "step": 11361 }, { "epoch": 2.1064145346681498, "grad_norm": 6.5, "learning_rate": 7.893585465331852e-06, "loss": 2.684, "mean_token_accuracy": 0.48043505438179773, "step": 11362 }, { "epoch": 2.10659992584353, "grad_norm": 6.34765625, "learning_rate": 7.89340007415647e-06, "loss": 3.0781, "mean_token_accuracy": 0.45559336919530136, "step": 11363 }, { "epoch": 2.1067853170189097, "grad_norm": 5.71484375, "learning_rate": 7.893214682981091e-06, "loss": 2.1556, "mean_token_accuracy": 0.580490335970943, "step": 11364 }, { "epoch": 2.10697070819429, "grad_norm": 6.55859375, "learning_rate": 7.89302929180571e-06, "loss": 2.7745, "mean_token_accuracy": 0.4796402289452167, "step": 11365 }, { "epoch": 2.10715609936967, "grad_norm": 6.69921875, "learning_rate": 7.89284390063033e-06, "loss": 2.7886, "mean_token_accuracy": 0.4724464236014473, "step": 11366 }, { "epoch": 2.1073414905450503, "grad_norm": 6.8046875, "learning_rate": 7.89265850945495e-06, "loss": 2.4865, "mean_token_accuracy": 0.49732620320855614, "step": 11367 }, { "epoch": 2.10752688172043, "grad_norm": 10.078125, "learning_rate": 7.892473118279571e-06, "loss": 2.6186, "mean_token_accuracy": 0.4750963126031921, "step": 11368 }, { "epoch": 2.10771227289581, "grad_norm": 8.8046875, "learning_rate": 7.892287727104192e-06, "loss": 2.6249, "mean_token_accuracy": 0.49086936163571093, "step": 11369 }, { "epoch": 2.1078976640711904, "grad_norm": 7.39453125, "learning_rate": 7.89210233592881e-06, "loss": 2.9338, "mean_token_accuracy": 0.4702110606465402, "step": 11370 }, { "epoch": 2.10808305524657, "grad_norm": 6.54296875, "learning_rate": 7.891916944753431e-06, "loss": 2.7356, "mean_token_accuracy": 0.5041186161449753, "step": 11371 }, { "epoch": 2.1082684464219503, "grad_norm": 6.0859375, "learning_rate": 7.89173155357805e-06, "loss": 2.3997, "mean_token_accuracy": 0.5230616082291782, "step": 11372 }, { "epoch": 2.1084538375973305, "grad_norm": 8.0390625, "learning_rate": 7.89154616240267e-06, "loss": 2.5174, "mean_token_accuracy": 0.49872053872053873, "step": 11373 }, { "epoch": 2.1086392287727103, "grad_norm": 6.953125, "learning_rate": 7.89136077122729e-06, "loss": 2.2907, "mean_token_accuracy": 0.5358354058331449, "step": 11374 }, { "epoch": 2.1088246199480905, "grad_norm": 6.33203125, "learning_rate": 7.89117538005191e-06, "loss": 3.5177, "mean_token_accuracy": 0.4452301719356628, "step": 11375 }, { "epoch": 2.1090100111234706, "grad_norm": 6.19921875, "learning_rate": 7.89098998887653e-06, "loss": 2.4695, "mean_token_accuracy": 0.49985101311084623, "step": 11376 }, { "epoch": 2.1091954022988504, "grad_norm": 5.921875, "learning_rate": 7.89080459770115e-06, "loss": 3.2945, "mean_token_accuracy": 0.4594631236442516, "step": 11377 }, { "epoch": 2.1093807934742306, "grad_norm": 7.5625, "learning_rate": 7.890619206525771e-06, "loss": 2.4619, "mean_token_accuracy": 0.531015157304802, "step": 11378 }, { "epoch": 2.1095661846496108, "grad_norm": 6.25, "learning_rate": 7.89043381535039e-06, "loss": 2.6581, "mean_token_accuracy": 0.5039646579066607, "step": 11379 }, { "epoch": 2.1097515758249905, "grad_norm": 6.6015625, "learning_rate": 7.89024842417501e-06, "loss": 3.5985, "mean_token_accuracy": 0.432475884244373, "step": 11380 }, { "epoch": 2.1099369670003707, "grad_norm": 6.40625, "learning_rate": 7.890063032999629e-06, "loss": 2.0106, "mean_token_accuracy": 0.5544057698338037, "step": 11381 }, { "epoch": 2.110122358175751, "grad_norm": 7.46875, "learning_rate": 7.88987764182425e-06, "loss": 3.1674, "mean_token_accuracy": 0.42293994842195753, "step": 11382 }, { "epoch": 2.110307749351131, "grad_norm": 6.70703125, "learning_rate": 7.88969225064887e-06, "loss": 2.7461, "mean_token_accuracy": 0.5075306479859895, "step": 11383 }, { "epoch": 2.110493140526511, "grad_norm": 7.0390625, "learning_rate": 7.889506859473489e-06, "loss": 3.0897, "mean_token_accuracy": 0.46999668471654327, "step": 11384 }, { "epoch": 2.110678531701891, "grad_norm": 6.734375, "learning_rate": 7.88932146829811e-06, "loss": 2.588, "mean_token_accuracy": 0.48945849977807365, "step": 11385 }, { "epoch": 2.110863922877271, "grad_norm": 10.1640625, "learning_rate": 7.88913607712273e-06, "loss": 2.2732, "mean_token_accuracy": 0.5444887118193891, "step": 11386 }, { "epoch": 2.111049314052651, "grad_norm": 6.83984375, "learning_rate": 7.88895068594735e-06, "loss": 2.6596, "mean_token_accuracy": 0.4612432847275518, "step": 11387 }, { "epoch": 2.111234705228031, "grad_norm": 7.515625, "learning_rate": 7.88876529477197e-06, "loss": 3.1292, "mean_token_accuracy": 0.4538564422648239, "step": 11388 }, { "epoch": 2.1114200964034113, "grad_norm": 6.79296875, "learning_rate": 7.88857990359659e-06, "loss": 2.6225, "mean_token_accuracy": 0.4931452675982072, "step": 11389 }, { "epoch": 2.111605487578791, "grad_norm": 8.734375, "learning_rate": 7.888394512421208e-06, "loss": 2.4968, "mean_token_accuracy": 0.4959771606540358, "step": 11390 }, { "epoch": 2.1117908787541713, "grad_norm": 7.1953125, "learning_rate": 7.888209121245829e-06, "loss": 2.8489, "mean_token_accuracy": 0.4687676493843895, "step": 11391 }, { "epoch": 2.1119762699295515, "grad_norm": 8.859375, "learning_rate": 7.88802373007045e-06, "loss": 3.431, "mean_token_accuracy": 0.4613022898810285, "step": 11392 }, { "epoch": 2.112161661104931, "grad_norm": 7.65625, "learning_rate": 7.88783833889507e-06, "loss": 3.2983, "mean_token_accuracy": 0.460904044409199, "step": 11393 }, { "epoch": 2.1123470522803114, "grad_norm": 8.7109375, "learning_rate": 7.887652947719689e-06, "loss": 2.5795, "mean_token_accuracy": 0.4711596842744384, "step": 11394 }, { "epoch": 2.1125324434556916, "grad_norm": 8.4140625, "learning_rate": 7.88746755654431e-06, "loss": 2.8672, "mean_token_accuracy": 0.48810188805720034, "step": 11395 }, { "epoch": 2.1127178346310718, "grad_norm": 11.46875, "learning_rate": 7.88728216536893e-06, "loss": 2.4195, "mean_token_accuracy": 0.5025879917184265, "step": 11396 }, { "epoch": 2.1129032258064515, "grad_norm": 7.046875, "learning_rate": 7.887096774193549e-06, "loss": 3.221, "mean_token_accuracy": 0.47037980290111836, "step": 11397 }, { "epoch": 2.1130886169818317, "grad_norm": 7.5234375, "learning_rate": 7.886911383018169e-06, "loss": 3.2469, "mean_token_accuracy": 0.43959190979058527, "step": 11398 }, { "epoch": 2.113274008157212, "grad_norm": 6.8828125, "learning_rate": 7.886725991842788e-06, "loss": 2.9992, "mean_token_accuracy": 0.458060587608464, "step": 11399 }, { "epoch": 2.1134593993325916, "grad_norm": 7.515625, "learning_rate": 7.886540600667408e-06, "loss": 3.0594, "mean_token_accuracy": 0.4699634337454292, "step": 11400 }, { "epoch": 2.113644790507972, "grad_norm": 5.6875, "learning_rate": 7.886355209492029e-06, "loss": 2.1893, "mean_token_accuracy": 0.536655069582505, "step": 11401 }, { "epoch": 2.113830181683352, "grad_norm": 8.765625, "learning_rate": 7.88616981831665e-06, "loss": 3.9438, "mean_token_accuracy": 0.40982028241335045, "step": 11402 }, { "epoch": 2.1140155728587318, "grad_norm": 7.27734375, "learning_rate": 7.885984427141268e-06, "loss": 2.6758, "mean_token_accuracy": 0.5003026268006294, "step": 11403 }, { "epoch": 2.114200964034112, "grad_norm": 6.19921875, "learning_rate": 7.885799035965889e-06, "loss": 2.7982, "mean_token_accuracy": 0.4918112930823759, "step": 11404 }, { "epoch": 2.114386355209492, "grad_norm": 6.2734375, "learning_rate": 7.885613644790509e-06, "loss": 2.3918, "mean_token_accuracy": 0.5169584914901433, "step": 11405 }, { "epoch": 2.114571746384872, "grad_norm": 7.36328125, "learning_rate": 7.885428253615128e-06, "loss": 2.8001, "mean_token_accuracy": 0.46820603907637653, "step": 11406 }, { "epoch": 2.114757137560252, "grad_norm": 6.6015625, "learning_rate": 7.885242862439748e-06, "loss": 2.5317, "mean_token_accuracy": 0.49456390432471614, "step": 11407 }, { "epoch": 2.1149425287356323, "grad_norm": 9.4375, "learning_rate": 7.885057471264367e-06, "loss": 2.8676, "mean_token_accuracy": 0.47792508688376883, "step": 11408 }, { "epoch": 2.1151279199110125, "grad_norm": 6.1171875, "learning_rate": 7.88487208008899e-06, "loss": 2.3359, "mean_token_accuracy": 0.5318199711323973, "step": 11409 }, { "epoch": 2.115313311086392, "grad_norm": 6.33984375, "learning_rate": 7.884686688913608e-06, "loss": 2.6462, "mean_token_accuracy": 0.4847010826926094, "step": 11410 }, { "epoch": 2.1154987022617724, "grad_norm": 6.35546875, "learning_rate": 7.884501297738229e-06, "loss": 2.8685, "mean_token_accuracy": 0.467244564445076, "step": 11411 }, { "epoch": 2.1156840934371526, "grad_norm": 6.26171875, "learning_rate": 7.884315906562849e-06, "loss": 3.1337, "mean_token_accuracy": 0.46775871034841393, "step": 11412 }, { "epoch": 2.1158694846125323, "grad_norm": 6.82421875, "learning_rate": 7.884130515387468e-06, "loss": 2.5842, "mean_token_accuracy": 0.48522130532633156, "step": 11413 }, { "epoch": 2.1160548757879125, "grad_norm": 8.7734375, "learning_rate": 7.883945124212088e-06, "loss": 3.4982, "mean_token_accuracy": 0.43869038249645836, "step": 11414 }, { "epoch": 2.1162402669632927, "grad_norm": 7.19140625, "learning_rate": 7.883759733036707e-06, "loss": 3.0445, "mean_token_accuracy": 0.46871648194494103, "step": 11415 }, { "epoch": 2.1164256581386724, "grad_norm": 6.01953125, "learning_rate": 7.883574341861328e-06, "loss": 2.7898, "mean_token_accuracy": 0.4855997083485235, "step": 11416 }, { "epoch": 2.1166110493140526, "grad_norm": 9.4765625, "learning_rate": 7.883388950685948e-06, "loss": 2.5163, "mean_token_accuracy": 0.48711477151965993, "step": 11417 }, { "epoch": 2.116796440489433, "grad_norm": 10.0, "learning_rate": 7.883203559510569e-06, "loss": 2.5753, "mean_token_accuracy": 0.48977751052315094, "step": 11418 }, { "epoch": 2.1169818316648126, "grad_norm": 8.3515625, "learning_rate": 7.883018168335187e-06, "loss": 2.0136, "mean_token_accuracy": 0.5458218158622156, "step": 11419 }, { "epoch": 2.1171672228401928, "grad_norm": 6.0625, "learning_rate": 7.882832777159808e-06, "loss": 2.2165, "mean_token_accuracy": 0.5333425759046166, "step": 11420 }, { "epoch": 2.117352614015573, "grad_norm": 7.6640625, "learning_rate": 7.882647385984428e-06, "loss": 3.1282, "mean_token_accuracy": 0.4411401776900296, "step": 11421 }, { "epoch": 2.1175380051909527, "grad_norm": 6.6640625, "learning_rate": 7.882461994809047e-06, "loss": 2.7005, "mean_token_accuracy": 0.4928825622775801, "step": 11422 }, { "epoch": 2.117723396366333, "grad_norm": 7.61328125, "learning_rate": 7.882276603633668e-06, "loss": 2.4166, "mean_token_accuracy": 0.515887621150779, "step": 11423 }, { "epoch": 2.117908787541713, "grad_norm": 8.6015625, "learning_rate": 7.882091212458287e-06, "loss": 2.4793, "mean_token_accuracy": 0.5161048689138577, "step": 11424 }, { "epoch": 2.1180941787170933, "grad_norm": 7.87109375, "learning_rate": 7.881905821282909e-06, "loss": 2.6848, "mean_token_accuracy": 0.5023094688221709, "step": 11425 }, { "epoch": 2.118279569892473, "grad_norm": 8.34375, "learning_rate": 7.881720430107528e-06, "loss": 3.2211, "mean_token_accuracy": 0.4461174713787954, "step": 11426 }, { "epoch": 2.118464961067853, "grad_norm": 6.68359375, "learning_rate": 7.881535038932148e-06, "loss": 2.5061, "mean_token_accuracy": 0.5205206378986866, "step": 11427 }, { "epoch": 2.1186503522432334, "grad_norm": 5.6796875, "learning_rate": 7.881349647756767e-06, "loss": 2.2642, "mean_token_accuracy": 0.569510778365819, "step": 11428 }, { "epoch": 2.118835743418613, "grad_norm": 10.7265625, "learning_rate": 7.881164256581387e-06, "loss": 3.0339, "mean_token_accuracy": 0.4728633811603244, "step": 11429 }, { "epoch": 2.1190211345939933, "grad_norm": 6.28125, "learning_rate": 7.880978865406008e-06, "loss": 2.8041, "mean_token_accuracy": 0.4718823044135845, "step": 11430 }, { "epoch": 2.1192065257693735, "grad_norm": 6.859375, "learning_rate": 7.880793474230627e-06, "loss": 3.4319, "mean_token_accuracy": 0.4161579892280072, "step": 11431 }, { "epoch": 2.1193919169447533, "grad_norm": 9.3359375, "learning_rate": 7.880608083055247e-06, "loss": 2.4395, "mean_token_accuracy": 0.5122081387591728, "step": 11432 }, { "epoch": 2.1195773081201335, "grad_norm": 6.484375, "learning_rate": 7.880422691879868e-06, "loss": 2.8505, "mean_token_accuracy": 0.48547844695811676, "step": 11433 }, { "epoch": 2.1197626992955136, "grad_norm": 6.109375, "learning_rate": 7.880237300704488e-06, "loss": 2.7771, "mean_token_accuracy": 0.4701402805611222, "step": 11434 }, { "epoch": 2.1199480904708934, "grad_norm": 9.9609375, "learning_rate": 7.880051909529107e-06, "loss": 2.7191, "mean_token_accuracy": 0.49423829157416216, "step": 11435 }, { "epoch": 2.1201334816462736, "grad_norm": 9.515625, "learning_rate": 7.879866518353727e-06, "loss": 2.9555, "mean_token_accuracy": 0.4778365667254556, "step": 11436 }, { "epoch": 2.1203188728216538, "grad_norm": 7.8671875, "learning_rate": 7.879681127178346e-06, "loss": 3.7456, "mean_token_accuracy": 0.39080612924716857, "step": 11437 }, { "epoch": 2.120504263997034, "grad_norm": 7.37109375, "learning_rate": 7.879495736002967e-06, "loss": 2.7234, "mean_token_accuracy": 0.49799548985216735, "step": 11438 }, { "epoch": 2.1206896551724137, "grad_norm": 7.53515625, "learning_rate": 7.879310344827587e-06, "loss": 3.2699, "mean_token_accuracy": 0.4485481065167938, "step": 11439 }, { "epoch": 2.120875046347794, "grad_norm": 6.2421875, "learning_rate": 7.879124953652206e-06, "loss": 3.2053, "mean_token_accuracy": 0.46002331002331004, "step": 11440 }, { "epoch": 2.121060437523174, "grad_norm": 6.265625, "learning_rate": 7.878939562476826e-06, "loss": 2.7628, "mean_token_accuracy": 0.47265625, "step": 11441 }, { "epoch": 2.121245828698554, "grad_norm": 7.70703125, "learning_rate": 7.878754171301447e-06, "loss": 2.9482, "mean_token_accuracy": 0.4832099418297197, "step": 11442 }, { "epoch": 2.121431219873934, "grad_norm": 8.4375, "learning_rate": 7.878568780126067e-06, "loss": 3.6468, "mean_token_accuracy": 0.41553094832481546, "step": 11443 }, { "epoch": 2.121616611049314, "grad_norm": 7.234375, "learning_rate": 7.878383388950686e-06, "loss": 3.3835, "mean_token_accuracy": 0.4437705592105263, "step": 11444 }, { "epoch": 2.121802002224694, "grad_norm": 6.8828125, "learning_rate": 7.878197997775307e-06, "loss": 2.7104, "mean_token_accuracy": 0.5244548286604361, "step": 11445 }, { "epoch": 2.121987393400074, "grad_norm": 6.11328125, "learning_rate": 7.878012606599925e-06, "loss": 2.4811, "mean_token_accuracy": 0.5014702278853223, "step": 11446 }, { "epoch": 2.1221727845754543, "grad_norm": 8.8984375, "learning_rate": 7.877827215424546e-06, "loss": 3.1323, "mean_token_accuracy": 0.4387086712414223, "step": 11447 }, { "epoch": 2.122358175750834, "grad_norm": 6.05859375, "learning_rate": 7.877641824249166e-06, "loss": 2.5822, "mean_token_accuracy": 0.5008764241893077, "step": 11448 }, { "epoch": 2.1225435669262143, "grad_norm": 5.93359375, "learning_rate": 7.877456433073787e-06, "loss": 2.6828, "mean_token_accuracy": 0.48578811369509045, "step": 11449 }, { "epoch": 2.1227289581015945, "grad_norm": 8.53125, "learning_rate": 7.877271041898407e-06, "loss": 2.8419, "mean_token_accuracy": 0.46367357380404195, "step": 11450 }, { "epoch": 2.122914349276974, "grad_norm": 5.59765625, "learning_rate": 7.877085650723026e-06, "loss": 2.9522, "mean_token_accuracy": 0.4520653007559987, "step": 11451 }, { "epoch": 2.1230997404523544, "grad_norm": 6.21484375, "learning_rate": 7.876900259547647e-06, "loss": 2.2091, "mean_token_accuracy": 0.5495506586236613, "step": 11452 }, { "epoch": 2.1232851316277346, "grad_norm": 8.6328125, "learning_rate": 7.876714868372266e-06, "loss": 2.7831, "mean_token_accuracy": 0.48254051917248647, "step": 11453 }, { "epoch": 2.1234705228031148, "grad_norm": 7.81640625, "learning_rate": 7.876529477196886e-06, "loss": 2.2761, "mean_token_accuracy": 0.5523625310859354, "step": 11454 }, { "epoch": 2.1236559139784945, "grad_norm": 6.58203125, "learning_rate": 7.876344086021507e-06, "loss": 2.4132, "mean_token_accuracy": 0.5098591549295775, "step": 11455 }, { "epoch": 2.1238413051538747, "grad_norm": 9.140625, "learning_rate": 7.876158694846125e-06, "loss": 2.2637, "mean_token_accuracy": 0.525768852689229, "step": 11456 }, { "epoch": 2.124026696329255, "grad_norm": 7.06640625, "learning_rate": 7.875973303670746e-06, "loss": 3.0759, "mean_token_accuracy": 0.45397960535028475, "step": 11457 }, { "epoch": 2.1242120875046346, "grad_norm": 6.9140625, "learning_rate": 7.875787912495366e-06, "loss": 2.8676, "mean_token_accuracy": 0.4756224066390041, "step": 11458 }, { "epoch": 2.124397478680015, "grad_norm": 8.7109375, "learning_rate": 7.875602521319987e-06, "loss": 2.1941, "mean_token_accuracy": 0.5396975425330813, "step": 11459 }, { "epoch": 2.124582869855395, "grad_norm": 7.54296875, "learning_rate": 7.875417130144606e-06, "loss": 2.58, "mean_token_accuracy": 0.487146937480165, "step": 11460 }, { "epoch": 2.1247682610307748, "grad_norm": 7.859375, "learning_rate": 7.875231738969226e-06, "loss": 2.6915, "mean_token_accuracy": 0.48961073119410836, "step": 11461 }, { "epoch": 2.124953652206155, "grad_norm": 7.03515625, "learning_rate": 7.875046347793845e-06, "loss": 3.4177, "mean_token_accuracy": 0.4389347113186922, "step": 11462 }, { "epoch": 2.125139043381535, "grad_norm": 6.58984375, "learning_rate": 7.874860956618465e-06, "loss": 2.6228, "mean_token_accuracy": 0.4976711690731253, "step": 11463 }, { "epoch": 2.125324434556915, "grad_norm": 7.7890625, "learning_rate": 7.874675565443086e-06, "loss": 3.474, "mean_token_accuracy": 0.44262761268306555, "step": 11464 }, { "epoch": 2.125509825732295, "grad_norm": 8.203125, "learning_rate": 7.874490174267706e-06, "loss": 2.8924, "mean_token_accuracy": 0.4666278053045759, "step": 11465 }, { "epoch": 2.1256952169076753, "grad_norm": 6.70703125, "learning_rate": 7.874304783092325e-06, "loss": 2.564, "mean_token_accuracy": 0.5220028208744711, "step": 11466 }, { "epoch": 2.1258806080830555, "grad_norm": 6.43359375, "learning_rate": 7.874119391916946e-06, "loss": 3.0559, "mean_token_accuracy": 0.45654872749844816, "step": 11467 }, { "epoch": 2.126065999258435, "grad_norm": 6.79296875, "learning_rate": 7.873934000741566e-06, "loss": 2.5733, "mean_token_accuracy": 0.5139307683029791, "step": 11468 }, { "epoch": 2.1262513904338154, "grad_norm": 7.734375, "learning_rate": 7.873748609566185e-06, "loss": 3.0504, "mean_token_accuracy": 0.4625754527162978, "step": 11469 }, { "epoch": 2.1264367816091956, "grad_norm": 6.44921875, "learning_rate": 7.873563218390805e-06, "loss": 3.4013, "mean_token_accuracy": 0.4378836238644734, "step": 11470 }, { "epoch": 2.1266221727845753, "grad_norm": 6.421875, "learning_rate": 7.873377827215424e-06, "loss": 3.2089, "mean_token_accuracy": 0.4607442041691019, "step": 11471 }, { "epoch": 2.1268075639599555, "grad_norm": 6.26171875, "learning_rate": 7.873192436040045e-06, "loss": 2.9077, "mean_token_accuracy": 0.4668155315717653, "step": 11472 }, { "epoch": 2.1269929551353357, "grad_norm": 6.1171875, "learning_rate": 7.873007044864665e-06, "loss": 2.8879, "mean_token_accuracy": 0.4642435375934917, "step": 11473 }, { "epoch": 2.1271783463107155, "grad_norm": 7.4375, "learning_rate": 7.872821653689286e-06, "loss": 2.7947, "mean_token_accuracy": 0.4775175980462577, "step": 11474 }, { "epoch": 2.1273637374860956, "grad_norm": 6.20703125, "learning_rate": 7.872636262513904e-06, "loss": 3.0391, "mean_token_accuracy": 0.46111805121798877, "step": 11475 }, { "epoch": 2.127549128661476, "grad_norm": 6.80078125, "learning_rate": 7.872450871338525e-06, "loss": 1.9665, "mean_token_accuracy": 0.5618881587769881, "step": 11476 }, { "epoch": 2.1277345198368556, "grad_norm": 5.95703125, "learning_rate": 7.872265480163145e-06, "loss": 2.6681, "mean_token_accuracy": 0.4909418571564071, "step": 11477 }, { "epoch": 2.1279199110122358, "grad_norm": 7.7734375, "learning_rate": 7.872080088987764e-06, "loss": 2.9948, "mean_token_accuracy": 0.49434333497294636, "step": 11478 }, { "epoch": 2.128105302187616, "grad_norm": 6.640625, "learning_rate": 7.871894697812385e-06, "loss": 3.383, "mean_token_accuracy": 0.4333373771685066, "step": 11479 }, { "epoch": 2.128290693362996, "grad_norm": 7.1875, "learning_rate": 7.871709306637004e-06, "loss": 2.6971, "mean_token_accuracy": 0.4794791494972804, "step": 11480 }, { "epoch": 2.128476084538376, "grad_norm": 6.84765625, "learning_rate": 7.871523915461626e-06, "loss": 2.5837, "mean_token_accuracy": 0.5145454545454545, "step": 11481 }, { "epoch": 2.128661475713756, "grad_norm": 6.75390625, "learning_rate": 7.871338524286245e-06, "loss": 2.4486, "mean_token_accuracy": 0.5107474691443628, "step": 11482 }, { "epoch": 2.1288468668891363, "grad_norm": 7.23046875, "learning_rate": 7.871153133110865e-06, "loss": 2.7773, "mean_token_accuracy": 0.5265191897654584, "step": 11483 }, { "epoch": 2.129032258064516, "grad_norm": 7.3203125, "learning_rate": 7.870967741935484e-06, "loss": 2.5754, "mean_token_accuracy": 0.47774436090225564, "step": 11484 }, { "epoch": 2.129217649239896, "grad_norm": 7.16796875, "learning_rate": 7.870782350760104e-06, "loss": 2.8913, "mean_token_accuracy": 0.46409540725704135, "step": 11485 }, { "epoch": 2.1294030404152764, "grad_norm": 7.26171875, "learning_rate": 7.870596959584725e-06, "loss": 2.9117, "mean_token_accuracy": 0.4724526066350711, "step": 11486 }, { "epoch": 2.129588431590656, "grad_norm": 7.00390625, "learning_rate": 7.870411568409344e-06, "loss": 2.7702, "mean_token_accuracy": 0.49097027481772293, "step": 11487 }, { "epoch": 2.1297738227660363, "grad_norm": 6.4453125, "learning_rate": 7.870226177233964e-06, "loss": 2.9172, "mean_token_accuracy": 0.4742967992240543, "step": 11488 }, { "epoch": 2.1299592139414165, "grad_norm": 8.84375, "learning_rate": 7.870040786058585e-06, "loss": 4.5595, "mean_token_accuracy": 0.39080459770114945, "step": 11489 }, { "epoch": 2.1301446051167963, "grad_norm": 7.640625, "learning_rate": 7.869855394883205e-06, "loss": 2.9024, "mean_token_accuracy": 0.4945005273466928, "step": 11490 }, { "epoch": 2.1303299962921765, "grad_norm": 6.2578125, "learning_rate": 7.869670003707824e-06, "loss": 3.2204, "mean_token_accuracy": 0.4407552083333333, "step": 11491 }, { "epoch": 2.1305153874675566, "grad_norm": 6.51171875, "learning_rate": 7.869484612532444e-06, "loss": 2.4181, "mean_token_accuracy": 0.5043558606124604, "step": 11492 }, { "epoch": 2.1307007786429364, "grad_norm": 6.5625, "learning_rate": 7.869299221357065e-06, "loss": 2.9503, "mean_token_accuracy": 0.461010922021844, "step": 11493 }, { "epoch": 2.1308861698183166, "grad_norm": 6.11328125, "learning_rate": 7.869113830181684e-06, "loss": 3.0311, "mean_token_accuracy": 0.45435349646006706, "step": 11494 }, { "epoch": 2.1310715609936968, "grad_norm": 7.890625, "learning_rate": 7.868928439006304e-06, "loss": 3.2539, "mean_token_accuracy": 0.455690013445789, "step": 11495 }, { "epoch": 2.131256952169077, "grad_norm": 6.26171875, "learning_rate": 7.868743047830923e-06, "loss": 2.8262, "mean_token_accuracy": 0.47705779334500875, "step": 11496 }, { "epoch": 2.1314423433444567, "grad_norm": 6.54296875, "learning_rate": 7.868557656655545e-06, "loss": 3.0703, "mean_token_accuracy": 0.4620554808749847, "step": 11497 }, { "epoch": 2.131627734519837, "grad_norm": 7.1328125, "learning_rate": 7.868372265480164e-06, "loss": 2.9176, "mean_token_accuracy": 0.47026963381608383, "step": 11498 }, { "epoch": 2.131813125695217, "grad_norm": 7.09375, "learning_rate": 7.868186874304784e-06, "loss": 3.1349, "mean_token_accuracy": 0.44756856418529617, "step": 11499 }, { "epoch": 2.131998516870597, "grad_norm": 7.125, "learning_rate": 7.868001483129403e-06, "loss": 3.3496, "mean_token_accuracy": 0.4487854843429909, "step": 11500 }, { "epoch": 2.132183908045977, "grad_norm": 6.14453125, "learning_rate": 7.867816091954024e-06, "loss": 3.4223, "mean_token_accuracy": 0.4303899082568807, "step": 11501 }, { "epoch": 2.132369299221357, "grad_norm": 6.89453125, "learning_rate": 7.867630700778644e-06, "loss": 2.755, "mean_token_accuracy": 0.4735441452723857, "step": 11502 }, { "epoch": 2.132554690396737, "grad_norm": 6.03125, "learning_rate": 7.867445309603263e-06, "loss": 2.5196, "mean_token_accuracy": 0.5060477923099617, "step": 11503 }, { "epoch": 2.132740081572117, "grad_norm": 7.0625, "learning_rate": 7.867259918427883e-06, "loss": 2.2935, "mean_token_accuracy": 0.5233087427405431, "step": 11504 }, { "epoch": 2.1329254727474973, "grad_norm": 10.53125, "learning_rate": 7.867074527252504e-06, "loss": 3.4776, "mean_token_accuracy": 0.4454939000393546, "step": 11505 }, { "epoch": 2.133110863922877, "grad_norm": 6.37109375, "learning_rate": 7.866889136077124e-06, "loss": 2.978, "mean_token_accuracy": 0.48537578674564974, "step": 11506 }, { "epoch": 2.1332962550982573, "grad_norm": 6.7265625, "learning_rate": 7.866703744901743e-06, "loss": 3.1822, "mean_token_accuracy": 0.4754569190600522, "step": 11507 }, { "epoch": 2.1334816462736375, "grad_norm": 6.76171875, "learning_rate": 7.866518353726364e-06, "loss": 3.5361, "mean_token_accuracy": 0.4441272861824717, "step": 11508 }, { "epoch": 2.133667037449017, "grad_norm": 6.640625, "learning_rate": 7.866332962550983e-06, "loss": 2.6576, "mean_token_accuracy": 0.5000692041522491, "step": 11509 }, { "epoch": 2.1338524286243974, "grad_norm": 7.62109375, "learning_rate": 7.866147571375603e-06, "loss": 2.7419, "mean_token_accuracy": 0.4789335088874259, "step": 11510 }, { "epoch": 2.1340378197997776, "grad_norm": 6.33203125, "learning_rate": 7.865962180200224e-06, "loss": 2.4501, "mean_token_accuracy": 0.49930715935334874, "step": 11511 }, { "epoch": 2.1342232109751578, "grad_norm": 11.0625, "learning_rate": 7.865776789024842e-06, "loss": 3.669, "mean_token_accuracy": 0.4354520817935452, "step": 11512 }, { "epoch": 2.1344086021505375, "grad_norm": 10.9140625, "learning_rate": 7.865591397849463e-06, "loss": 2.5974, "mean_token_accuracy": 0.5125889726973335, "step": 11513 }, { "epoch": 2.1345939933259177, "grad_norm": 7.80078125, "learning_rate": 7.865406006674083e-06, "loss": 2.6478, "mean_token_accuracy": 0.4849318658280922, "step": 11514 }, { "epoch": 2.134779384501298, "grad_norm": 8.7109375, "learning_rate": 7.865220615498704e-06, "loss": 2.9767, "mean_token_accuracy": 0.47253797939020503, "step": 11515 }, { "epoch": 2.1349647756766776, "grad_norm": 13.8515625, "learning_rate": 7.865035224323323e-06, "loss": 2.4862, "mean_token_accuracy": 0.5020785838809173, "step": 11516 }, { "epoch": 2.135150166852058, "grad_norm": 10.703125, "learning_rate": 7.864849833147943e-06, "loss": 2.4018, "mean_token_accuracy": 0.547474528506395, "step": 11517 }, { "epoch": 2.135335558027438, "grad_norm": 7.37109375, "learning_rate": 7.864664441972562e-06, "loss": 3.1074, "mean_token_accuracy": 0.4631336405529954, "step": 11518 }, { "epoch": 2.1355209492028178, "grad_norm": 9.328125, "learning_rate": 7.864479050797182e-06, "loss": 3.0804, "mean_token_accuracy": 0.46291208791208793, "step": 11519 }, { "epoch": 2.135706340378198, "grad_norm": 13.125, "learning_rate": 7.864293659621803e-06, "loss": 3.0889, "mean_token_accuracy": 0.45961571161367115, "step": 11520 }, { "epoch": 2.135891731553578, "grad_norm": 9.546875, "learning_rate": 7.864108268446422e-06, "loss": 2.6312, "mean_token_accuracy": 0.4962029161603888, "step": 11521 }, { "epoch": 2.136077122728958, "grad_norm": 8.578125, "learning_rate": 7.863922877271042e-06, "loss": 2.2144, "mean_token_accuracy": 0.5345740413925878, "step": 11522 }, { "epoch": 2.136262513904338, "grad_norm": 6.4296875, "learning_rate": 7.863737486095663e-06, "loss": 3.0574, "mean_token_accuracy": 0.4842914438502674, "step": 11523 }, { "epoch": 2.1364479050797183, "grad_norm": 7.7265625, "learning_rate": 7.863552094920283e-06, "loss": 2.9194, "mean_token_accuracy": 0.4962624584717608, "step": 11524 }, { "epoch": 2.1366332962550985, "grad_norm": 7.0, "learning_rate": 7.863366703744902e-06, "loss": 2.3082, "mean_token_accuracy": 0.5306609130138542, "step": 11525 }, { "epoch": 2.136818687430478, "grad_norm": 6.578125, "learning_rate": 7.863181312569522e-06, "loss": 2.602, "mean_token_accuracy": 0.5008792965627498, "step": 11526 }, { "epoch": 2.1370040786058584, "grad_norm": 8.3515625, "learning_rate": 7.862995921394141e-06, "loss": 2.7841, "mean_token_accuracy": 0.48615635179153094, "step": 11527 }, { "epoch": 2.1371894697812386, "grad_norm": 10.078125, "learning_rate": 7.862810530218762e-06, "loss": 2.8097, "mean_token_accuracy": 0.475115379817887, "step": 11528 }, { "epoch": 2.1373748609566183, "grad_norm": 7.82421875, "learning_rate": 7.862625139043382e-06, "loss": 2.5396, "mean_token_accuracy": 0.49343419925777904, "step": 11529 }, { "epoch": 2.1375602521319985, "grad_norm": 6.3984375, "learning_rate": 7.862439747868003e-06, "loss": 2.9166, "mean_token_accuracy": 0.48377642872095455, "step": 11530 }, { "epoch": 2.1377456433073787, "grad_norm": 8.7265625, "learning_rate": 7.862254356692623e-06, "loss": 3.2184, "mean_token_accuracy": 0.4796818510484454, "step": 11531 }, { "epoch": 2.1379310344827585, "grad_norm": 9.265625, "learning_rate": 7.862068965517242e-06, "loss": 2.1981, "mean_token_accuracy": 0.5180988353792887, "step": 11532 }, { "epoch": 2.1381164256581386, "grad_norm": 6.7265625, "learning_rate": 7.861883574341862e-06, "loss": 2.9197, "mean_token_accuracy": 0.4667146455559554, "step": 11533 }, { "epoch": 2.138301816833519, "grad_norm": 6.87109375, "learning_rate": 7.861698183166481e-06, "loss": 2.5734, "mean_token_accuracy": 0.5165238678090576, "step": 11534 }, { "epoch": 2.1384872080088986, "grad_norm": 6.99609375, "learning_rate": 7.861512791991102e-06, "loss": 2.8794, "mean_token_accuracy": 0.49425915800984144, "step": 11535 }, { "epoch": 2.1386725991842788, "grad_norm": 7.60546875, "learning_rate": 7.861327400815722e-06, "loss": 3.0902, "mean_token_accuracy": 0.48586500743946975, "step": 11536 }, { "epoch": 2.138857990359659, "grad_norm": 6.80859375, "learning_rate": 7.861142009640341e-06, "loss": 2.3077, "mean_token_accuracy": 0.5285754112071369, "step": 11537 }, { "epoch": 2.139043381535039, "grad_norm": 6.73046875, "learning_rate": 7.860956618464962e-06, "loss": 2.3313, "mean_token_accuracy": 0.5224636497304362, "step": 11538 }, { "epoch": 2.139228772710419, "grad_norm": 7.015625, "learning_rate": 7.860771227289582e-06, "loss": 3.2249, "mean_token_accuracy": 0.44451047392660825, "step": 11539 }, { "epoch": 2.139414163885799, "grad_norm": 7.64453125, "learning_rate": 7.860585836114203e-06, "loss": 2.7683, "mean_token_accuracy": 0.4900969812867095, "step": 11540 }, { "epoch": 2.1395995550611793, "grad_norm": 6.78125, "learning_rate": 7.860400444938821e-06, "loss": 2.9554, "mean_token_accuracy": 0.4647802528597231, "step": 11541 }, { "epoch": 2.139784946236559, "grad_norm": 8.3203125, "learning_rate": 7.860215053763442e-06, "loss": 3.3257, "mean_token_accuracy": 0.43535040082751486, "step": 11542 }, { "epoch": 2.139970337411939, "grad_norm": 6.265625, "learning_rate": 7.86002966258806e-06, "loss": 2.7118, "mean_token_accuracy": 0.4671033478893741, "step": 11543 }, { "epoch": 2.1401557285873194, "grad_norm": 6.42578125, "learning_rate": 7.859844271412681e-06, "loss": 3.5109, "mean_token_accuracy": 0.43784639746634996, "step": 11544 }, { "epoch": 2.140341119762699, "grad_norm": 7.28515625, "learning_rate": 7.859658880237302e-06, "loss": 2.4091, "mean_token_accuracy": 0.4960323185687491, "step": 11545 }, { "epoch": 2.1405265109380793, "grad_norm": 6.6328125, "learning_rate": 7.859473489061922e-06, "loss": 2.4744, "mean_token_accuracy": 0.5087961000423908, "step": 11546 }, { "epoch": 2.1407119021134595, "grad_norm": 6.0078125, "learning_rate": 7.859288097886541e-06, "loss": 2.9778, "mean_token_accuracy": 0.4682507169192954, "step": 11547 }, { "epoch": 2.1408972932888393, "grad_norm": 6.61328125, "learning_rate": 7.859102706711161e-06, "loss": 3.2734, "mean_token_accuracy": 0.4403899721448468, "step": 11548 }, { "epoch": 2.1410826844642195, "grad_norm": 6.63671875, "learning_rate": 7.858917315535782e-06, "loss": 2.6088, "mean_token_accuracy": 0.48622167789344767, "step": 11549 }, { "epoch": 2.1412680756395996, "grad_norm": 7.36328125, "learning_rate": 7.8587319243604e-06, "loss": 2.793, "mean_token_accuracy": 0.4679232232405326, "step": 11550 }, { "epoch": 2.14145346681498, "grad_norm": 7.390625, "learning_rate": 7.858546533185021e-06, "loss": 3.0518, "mean_token_accuracy": 0.4737612887478643, "step": 11551 }, { "epoch": 2.1416388579903596, "grad_norm": 6.18359375, "learning_rate": 7.85836114200964e-06, "loss": 2.8566, "mean_token_accuracy": 0.4680456112437019, "step": 11552 }, { "epoch": 2.1418242491657398, "grad_norm": 6.59375, "learning_rate": 7.85817575083426e-06, "loss": 2.6595, "mean_token_accuracy": 0.49789076376554176, "step": 11553 }, { "epoch": 2.14200964034112, "grad_norm": 9.6171875, "learning_rate": 7.857990359658881e-06, "loss": 2.9908, "mean_token_accuracy": 0.47617804464169167, "step": 11554 }, { "epoch": 2.1421950315164997, "grad_norm": 8.4296875, "learning_rate": 7.857804968483501e-06, "loss": 3.6647, "mean_token_accuracy": 0.43835192069392814, "step": 11555 }, { "epoch": 2.14238042269188, "grad_norm": 6.34375, "learning_rate": 7.85761957730812e-06, "loss": 2.85, "mean_token_accuracy": 0.4720938943688351, "step": 11556 }, { "epoch": 2.14256581386726, "grad_norm": 7.8046875, "learning_rate": 7.85743418613274e-06, "loss": 2.5237, "mean_token_accuracy": 0.5241442144995444, "step": 11557 }, { "epoch": 2.14275120504264, "grad_norm": 7.51171875, "learning_rate": 7.857248794957361e-06, "loss": 3.225, "mean_token_accuracy": 0.4528099910793934, "step": 11558 }, { "epoch": 2.14293659621802, "grad_norm": 7.328125, "learning_rate": 7.85706340378198e-06, "loss": 3.0033, "mean_token_accuracy": 0.4754738015607581, "step": 11559 }, { "epoch": 2.1431219873934, "grad_norm": 6.16015625, "learning_rate": 7.8568780126066e-06, "loss": 3.0342, "mean_token_accuracy": 0.44600651996740015, "step": 11560 }, { "epoch": 2.14330737856878, "grad_norm": 6.5234375, "learning_rate": 7.85669262143122e-06, "loss": 2.8363, "mean_token_accuracy": 0.47174122174122174, "step": 11561 }, { "epoch": 2.14349276974416, "grad_norm": 7.50390625, "learning_rate": 7.856507230255841e-06, "loss": 3.0203, "mean_token_accuracy": 0.47149087384913585, "step": 11562 }, { "epoch": 2.1436781609195403, "grad_norm": 7.96484375, "learning_rate": 7.85632183908046e-06, "loss": 2.7691, "mean_token_accuracy": 0.4774445564516129, "step": 11563 }, { "epoch": 2.14386355209492, "grad_norm": 7.31640625, "learning_rate": 7.85613644790508e-06, "loss": 2.5543, "mean_token_accuracy": 0.5138274336283186, "step": 11564 }, { "epoch": 2.1440489432703003, "grad_norm": 6.1328125, "learning_rate": 7.8559510567297e-06, "loss": 3.075, "mean_token_accuracy": 0.4583333333333333, "step": 11565 }, { "epoch": 2.1442343344456805, "grad_norm": 7.25, "learning_rate": 7.85576566555432e-06, "loss": 2.7653, "mean_token_accuracy": 0.499865627519484, "step": 11566 }, { "epoch": 2.14441972562106, "grad_norm": 7.80078125, "learning_rate": 7.85558027437894e-06, "loss": 2.4609, "mean_token_accuracy": 0.5140958517921869, "step": 11567 }, { "epoch": 2.1446051167964404, "grad_norm": 6.05859375, "learning_rate": 7.85539488320356e-06, "loss": 2.5637, "mean_token_accuracy": 0.48964745383324004, "step": 11568 }, { "epoch": 2.1447905079718206, "grad_norm": 6.1171875, "learning_rate": 7.85520949202818e-06, "loss": 2.9872, "mean_token_accuracy": 0.4796568308852203, "step": 11569 }, { "epoch": 2.1449758991472008, "grad_norm": 6.48828125, "learning_rate": 7.8550241008528e-06, "loss": 2.9157, "mean_token_accuracy": 0.4640637450199203, "step": 11570 }, { "epoch": 2.1451612903225805, "grad_norm": 7.03125, "learning_rate": 7.85483870967742e-06, "loss": 3.1482, "mean_token_accuracy": 0.46536532465771996, "step": 11571 }, { "epoch": 2.1453466814979607, "grad_norm": 5.859375, "learning_rate": 7.85465331850204e-06, "loss": 2.725, "mean_token_accuracy": 0.48176881303335917, "step": 11572 }, { "epoch": 2.145532072673341, "grad_norm": 6.25, "learning_rate": 7.85446792732666e-06, "loss": 2.8241, "mean_token_accuracy": 0.4754338792471278, "step": 11573 }, { "epoch": 2.1457174638487206, "grad_norm": 5.51953125, "learning_rate": 7.85428253615128e-06, "loss": 2.2883, "mean_token_accuracy": 0.5590506472859414, "step": 11574 }, { "epoch": 2.145902855024101, "grad_norm": 6.62890625, "learning_rate": 7.8540971449759e-06, "loss": 2.6107, "mean_token_accuracy": 0.49267139479905436, "step": 11575 }, { "epoch": 2.146088246199481, "grad_norm": 8.984375, "learning_rate": 7.85391175380052e-06, "loss": 2.9945, "mean_token_accuracy": 0.4733044733044733, "step": 11576 }, { "epoch": 2.1462736373748608, "grad_norm": 6.828125, "learning_rate": 7.853726362625139e-06, "loss": 2.4319, "mean_token_accuracy": 0.529519033508092, "step": 11577 }, { "epoch": 2.146459028550241, "grad_norm": 8.90625, "learning_rate": 7.853540971449761e-06, "loss": 3.1455, "mean_token_accuracy": 0.4513576204120617, "step": 11578 }, { "epoch": 2.146644419725621, "grad_norm": 6.5390625, "learning_rate": 7.85335558027438e-06, "loss": 2.9674, "mean_token_accuracy": 0.46641969407265776, "step": 11579 }, { "epoch": 2.146829810901001, "grad_norm": 6.7421875, "learning_rate": 7.853170189099e-06, "loss": 2.9822, "mean_token_accuracy": 0.4791574605980262, "step": 11580 }, { "epoch": 2.147015202076381, "grad_norm": 8.2734375, "learning_rate": 7.852984797923619e-06, "loss": 2.6795, "mean_token_accuracy": 0.4926612305411416, "step": 11581 }, { "epoch": 2.1472005932517613, "grad_norm": 7.0859375, "learning_rate": 7.85279940674824e-06, "loss": 2.8516, "mean_token_accuracy": 0.5040310540459839, "step": 11582 }, { "epoch": 2.1473859844271415, "grad_norm": 7.32421875, "learning_rate": 7.85261401557286e-06, "loss": 3.3562, "mean_token_accuracy": 0.4627863953322897, "step": 11583 }, { "epoch": 2.147571375602521, "grad_norm": 8.7109375, "learning_rate": 7.852428624397479e-06, "loss": 3.2026, "mean_token_accuracy": 0.4621901083842033, "step": 11584 }, { "epoch": 2.1477567667779014, "grad_norm": 7.77734375, "learning_rate": 7.8522432332221e-06, "loss": 2.5969, "mean_token_accuracy": 0.501910132799709, "step": 11585 }, { "epoch": 2.1479421579532816, "grad_norm": 6.82421875, "learning_rate": 7.85205784204672e-06, "loss": 2.7833, "mean_token_accuracy": 0.5053075241960662, "step": 11586 }, { "epoch": 2.1481275491286613, "grad_norm": 7.08984375, "learning_rate": 7.85187245087134e-06, "loss": 2.661, "mean_token_accuracy": 0.4914048606994665, "step": 11587 }, { "epoch": 2.1483129403040415, "grad_norm": 7.21484375, "learning_rate": 7.851687059695959e-06, "loss": 2.8188, "mean_token_accuracy": 0.5149067585223899, "step": 11588 }, { "epoch": 2.1484983314794217, "grad_norm": 6.109375, "learning_rate": 7.85150166852058e-06, "loss": 2.901, "mean_token_accuracy": 0.4795124481327801, "step": 11589 }, { "epoch": 2.1486837226548015, "grad_norm": 7.609375, "learning_rate": 7.851316277345198e-06, "loss": 2.6326, "mean_token_accuracy": 0.5131058720164007, "step": 11590 }, { "epoch": 2.1488691138301816, "grad_norm": 8.859375, "learning_rate": 7.851130886169819e-06, "loss": 2.3959, "mean_token_accuracy": 0.510192329839917, "step": 11591 }, { "epoch": 2.149054505005562, "grad_norm": 5.71484375, "learning_rate": 7.85094549499444e-06, "loss": 2.941, "mean_token_accuracy": 0.47459714666940367, "step": 11592 }, { "epoch": 2.1492398961809416, "grad_norm": 8.3515625, "learning_rate": 7.850760103819058e-06, "loss": 3.0336, "mean_token_accuracy": 0.4475625632865616, "step": 11593 }, { "epoch": 2.1494252873563218, "grad_norm": 6.36328125, "learning_rate": 7.850574712643679e-06, "loss": 2.5556, "mean_token_accuracy": 0.5026570803376055, "step": 11594 }, { "epoch": 2.149610678531702, "grad_norm": 7.37890625, "learning_rate": 7.850389321468299e-06, "loss": 3.1539, "mean_token_accuracy": 0.45358133463860334, "step": 11595 }, { "epoch": 2.149796069707082, "grad_norm": 7.68359375, "learning_rate": 7.85020393029292e-06, "loss": 3.7394, "mean_token_accuracy": 0.42516483516483516, "step": 11596 }, { "epoch": 2.149981460882462, "grad_norm": 6.90234375, "learning_rate": 7.850018539117538e-06, "loss": 3.2947, "mean_token_accuracy": 0.44200138504155123, "step": 11597 }, { "epoch": 2.150166852057842, "grad_norm": 8.203125, "learning_rate": 7.849833147942159e-06, "loss": 2.2175, "mean_token_accuracy": 0.5135293428604897, "step": 11598 }, { "epoch": 2.1503522432332223, "grad_norm": 7.76953125, "learning_rate": 7.849647756766778e-06, "loss": 3.227, "mean_token_accuracy": 0.45668196356556806, "step": 11599 }, { "epoch": 2.150537634408602, "grad_norm": 5.609375, "learning_rate": 7.849462365591398e-06, "loss": 2.5068, "mean_token_accuracy": 0.4908872639965077, "step": 11600 }, { "epoch": 2.150723025583982, "grad_norm": 6.125, "learning_rate": 7.849276974416019e-06, "loss": 2.4503, "mean_token_accuracy": 0.525016160310278, "step": 11601 }, { "epoch": 2.1509084167593624, "grad_norm": 7.32421875, "learning_rate": 7.849091583240639e-06, "loss": 3.8412, "mean_token_accuracy": 0.41142638036809814, "step": 11602 }, { "epoch": 2.151093807934742, "grad_norm": 6.671875, "learning_rate": 7.848906192065258e-06, "loss": 3.0111, "mean_token_accuracy": 0.44451400845202693, "step": 11603 }, { "epoch": 2.1512791991101223, "grad_norm": 5.78125, "learning_rate": 7.848720800889878e-06, "loss": 2.4512, "mean_token_accuracy": 0.4895552992438806, "step": 11604 }, { "epoch": 2.1514645902855025, "grad_norm": 6.86328125, "learning_rate": 7.848535409714499e-06, "loss": 3.2933, "mean_token_accuracy": 0.454488971730351, "step": 11605 }, { "epoch": 2.1516499814608823, "grad_norm": 6.1875, "learning_rate": 7.848350018539118e-06, "loss": 3.2659, "mean_token_accuracy": 0.4210592527361932, "step": 11606 }, { "epoch": 2.1518353726362625, "grad_norm": 5.52734375, "learning_rate": 7.848164627363738e-06, "loss": 2.4509, "mean_token_accuracy": 0.5072610178052784, "step": 11607 }, { "epoch": 2.1520207638116426, "grad_norm": 7.328125, "learning_rate": 7.847979236188357e-06, "loss": 2.7195, "mean_token_accuracy": 0.4990375978442192, "step": 11608 }, { "epoch": 2.152206154987023, "grad_norm": 6.73046875, "learning_rate": 7.847793845012977e-06, "loss": 2.3363, "mean_token_accuracy": 0.5520441434662654, "step": 11609 }, { "epoch": 2.1523915461624026, "grad_norm": 7.796875, "learning_rate": 7.847608453837598e-06, "loss": 3.5386, "mean_token_accuracy": 0.44560185185185186, "step": 11610 }, { "epoch": 2.1525769373377828, "grad_norm": 7.28515625, "learning_rate": 7.847423062662218e-06, "loss": 2.657, "mean_token_accuracy": 0.4910896476669841, "step": 11611 }, { "epoch": 2.152762328513163, "grad_norm": 7.62890625, "learning_rate": 7.847237671486839e-06, "loss": 3.4563, "mean_token_accuracy": 0.46261859582542697, "step": 11612 }, { "epoch": 2.1529477196885427, "grad_norm": 7.27734375, "learning_rate": 7.847052280311458e-06, "loss": 2.8229, "mean_token_accuracy": 0.4602754237288136, "step": 11613 }, { "epoch": 2.153133110863923, "grad_norm": 13.046875, "learning_rate": 7.846866889136078e-06, "loss": 2.1644, "mean_token_accuracy": 0.5269957840337277, "step": 11614 }, { "epoch": 2.153318502039303, "grad_norm": 6.328125, "learning_rate": 7.846681497960697e-06, "loss": 2.8322, "mean_token_accuracy": 0.46472053126729385, "step": 11615 }, { "epoch": 2.153503893214683, "grad_norm": 6.953125, "learning_rate": 7.846496106785318e-06, "loss": 3.0569, "mean_token_accuracy": 0.44762897342365815, "step": 11616 }, { "epoch": 2.153689284390063, "grad_norm": 10.4140625, "learning_rate": 7.846310715609938e-06, "loss": 2.5276, "mean_token_accuracy": 0.500125156445557, "step": 11617 }, { "epoch": 2.153874675565443, "grad_norm": 7.875, "learning_rate": 7.846125324434559e-06, "loss": 2.7325, "mean_token_accuracy": 0.5011786892975012, "step": 11618 }, { "epoch": 2.154060066740823, "grad_norm": 6.52734375, "learning_rate": 7.845939933259177e-06, "loss": 2.8773, "mean_token_accuracy": 0.46140238018655516, "step": 11619 }, { "epoch": 2.154245457916203, "grad_norm": 8.1015625, "learning_rate": 7.845754542083798e-06, "loss": 3.1528, "mean_token_accuracy": 0.46240820608462524, "step": 11620 }, { "epoch": 2.1544308490915833, "grad_norm": 8.0234375, "learning_rate": 7.845569150908418e-06, "loss": 2.8611, "mean_token_accuracy": 0.47739291380222104, "step": 11621 }, { "epoch": 2.1546162402669635, "grad_norm": 6.44140625, "learning_rate": 7.845383759733037e-06, "loss": 2.6951, "mean_token_accuracy": 0.5041375743470391, "step": 11622 }, { "epoch": 2.1548016314423433, "grad_norm": 8.3828125, "learning_rate": 7.845198368557658e-06, "loss": 2.868, "mean_token_accuracy": 0.4819632560010728, "step": 11623 }, { "epoch": 2.1549870226177235, "grad_norm": 11.6015625, "learning_rate": 7.845012977382276e-06, "loss": 3.8543, "mean_token_accuracy": 0.42042755344418054, "step": 11624 }, { "epoch": 2.1551724137931036, "grad_norm": 9.3046875, "learning_rate": 7.844827586206897e-06, "loss": 1.9827, "mean_token_accuracy": 0.5706386014983946, "step": 11625 }, { "epoch": 2.1553578049684834, "grad_norm": 7.03515625, "learning_rate": 7.844642195031517e-06, "loss": 2.6455, "mean_token_accuracy": 0.49177153920619554, "step": 11626 }, { "epoch": 2.1555431961438636, "grad_norm": 11.109375, "learning_rate": 7.844456803856138e-06, "loss": 3.7379, "mean_token_accuracy": 0.41831873367143074, "step": 11627 }, { "epoch": 2.1557285873192438, "grad_norm": 7.6015625, "learning_rate": 7.844271412680757e-06, "loss": 2.9217, "mean_token_accuracy": 0.46221966647498564, "step": 11628 }, { "epoch": 2.1559139784946235, "grad_norm": 11.9296875, "learning_rate": 7.844086021505377e-06, "loss": 2.9899, "mean_token_accuracy": 0.4551961823966066, "step": 11629 }, { "epoch": 2.1560993696700037, "grad_norm": 8.9296875, "learning_rate": 7.843900630329998e-06, "loss": 2.6481, "mean_token_accuracy": 0.5051657050852006, "step": 11630 }, { "epoch": 2.156284760845384, "grad_norm": 5.890625, "learning_rate": 7.843715239154616e-06, "loss": 2.8358, "mean_token_accuracy": 0.4611421842802473, "step": 11631 }, { "epoch": 2.1564701520207636, "grad_norm": 7.7421875, "learning_rate": 7.843529847979237e-06, "loss": 2.838, "mean_token_accuracy": 0.48794505186431175, "step": 11632 }, { "epoch": 2.156655543196144, "grad_norm": 12.3359375, "learning_rate": 7.843344456803856e-06, "loss": 2.4521, "mean_token_accuracy": 0.5150042869391255, "step": 11633 }, { "epoch": 2.156840934371524, "grad_norm": 9.5078125, "learning_rate": 7.843159065628476e-06, "loss": 3.0226, "mean_token_accuracy": 0.44533737680060653, "step": 11634 }, { "epoch": 2.1570263255469038, "grad_norm": 7.546875, "learning_rate": 7.842973674453097e-06, "loss": 2.4126, "mean_token_accuracy": 0.513152514651765, "step": 11635 }, { "epoch": 2.157211716722284, "grad_norm": 10.265625, "learning_rate": 7.842788283277717e-06, "loss": 2.7197, "mean_token_accuracy": 0.4929230769230769, "step": 11636 }, { "epoch": 2.157397107897664, "grad_norm": 12.0546875, "learning_rate": 7.842602892102336e-06, "loss": 2.9867, "mean_token_accuracy": 0.4549065033923548, "step": 11637 }, { "epoch": 2.157582499073044, "grad_norm": 7.42578125, "learning_rate": 7.842417500926956e-06, "loss": 2.8258, "mean_token_accuracy": 0.46657034020753974, "step": 11638 }, { "epoch": 2.157767890248424, "grad_norm": 9.125, "learning_rate": 7.842232109751577e-06, "loss": 2.9667, "mean_token_accuracy": 0.4741003547896604, "step": 11639 }, { "epoch": 2.1579532814238043, "grad_norm": 8.140625, "learning_rate": 7.842046718576196e-06, "loss": 3.1668, "mean_token_accuracy": 0.4607828089025326, "step": 11640 }, { "epoch": 2.1581386725991845, "grad_norm": 8.7890625, "learning_rate": 7.841861327400816e-06, "loss": 2.8076, "mean_token_accuracy": 0.4656987698104843, "step": 11641 }, { "epoch": 2.158324063774564, "grad_norm": 6.38671875, "learning_rate": 7.841675936225435e-06, "loss": 2.3557, "mean_token_accuracy": 0.547244567174931, "step": 11642 }, { "epoch": 2.1585094549499444, "grad_norm": 9.09375, "learning_rate": 7.841490545050057e-06, "loss": 3.2148, "mean_token_accuracy": 0.43332416827055265, "step": 11643 }, { "epoch": 2.1586948461253246, "grad_norm": 8.0390625, "learning_rate": 7.841305153874676e-06, "loss": 2.7146, "mean_token_accuracy": 0.5033197437390798, "step": 11644 }, { "epoch": 2.1588802373007043, "grad_norm": 7.55078125, "learning_rate": 7.841119762699297e-06, "loss": 2.8969, "mean_token_accuracy": 0.4794195250659631, "step": 11645 }, { "epoch": 2.1590656284760845, "grad_norm": 6.65234375, "learning_rate": 7.840934371523915e-06, "loss": 3.2974, "mean_token_accuracy": 0.442995372318048, "step": 11646 }, { "epoch": 2.1592510196514647, "grad_norm": 8.1484375, "learning_rate": 7.840748980348536e-06, "loss": 2.9653, "mean_token_accuracy": 0.5250687863038827, "step": 11647 }, { "epoch": 2.1594364108268445, "grad_norm": 7.6015625, "learning_rate": 7.840563589173156e-06, "loss": 2.8016, "mean_token_accuracy": 0.525, "step": 11648 }, { "epoch": 2.1596218020022246, "grad_norm": 8.1015625, "learning_rate": 7.840378197997775e-06, "loss": 3.5144, "mean_token_accuracy": 0.44856039325842695, "step": 11649 }, { "epoch": 2.159807193177605, "grad_norm": 12.953125, "learning_rate": 7.840192806822396e-06, "loss": 2.9875, "mean_token_accuracy": 0.45776199804113615, "step": 11650 }, { "epoch": 2.1599925843529846, "grad_norm": 6.66015625, "learning_rate": 7.840007415647016e-06, "loss": 2.5638, "mean_token_accuracy": 0.521012069216228, "step": 11651 }, { "epoch": 2.1601779755283648, "grad_norm": 6.33984375, "learning_rate": 7.839822024471637e-06, "loss": 2.6743, "mean_token_accuracy": 0.47246945240609445, "step": 11652 }, { "epoch": 2.160363366703745, "grad_norm": 6.11328125, "learning_rate": 7.839636633296255e-06, "loss": 2.2056, "mean_token_accuracy": 0.5294992563212693, "step": 11653 }, { "epoch": 2.160548757879125, "grad_norm": 7.375, "learning_rate": 7.839451242120876e-06, "loss": 3.0935, "mean_token_accuracy": 0.4359602892511337, "step": 11654 }, { "epoch": 2.160734149054505, "grad_norm": 5.96875, "learning_rate": 7.839265850945496e-06, "loss": 2.3627, "mean_token_accuracy": 0.5240197351337315, "step": 11655 }, { "epoch": 2.160919540229885, "grad_norm": 8.0546875, "learning_rate": 7.839080459770115e-06, "loss": 2.4664, "mean_token_accuracy": 0.5241483724451174, "step": 11656 }, { "epoch": 2.1611049314052653, "grad_norm": 5.91015625, "learning_rate": 7.838895068594736e-06, "loss": 2.8979, "mean_token_accuracy": 0.46977950713359273, "step": 11657 }, { "epoch": 2.161290322580645, "grad_norm": 6.44921875, "learning_rate": 7.838709677419354e-06, "loss": 2.3012, "mean_token_accuracy": 0.5335451545887816, "step": 11658 }, { "epoch": 2.161475713756025, "grad_norm": 6.30078125, "learning_rate": 7.838524286243977e-06, "loss": 3.052, "mean_token_accuracy": 0.4672131147540984, "step": 11659 }, { "epoch": 2.1616611049314054, "grad_norm": 7.05078125, "learning_rate": 7.838338895068595e-06, "loss": 3.0092, "mean_token_accuracy": 0.475129111171514, "step": 11660 }, { "epoch": 2.161846496106785, "grad_norm": 6.26171875, "learning_rate": 7.838153503893216e-06, "loss": 2.3227, "mean_token_accuracy": 0.5131421092812739, "step": 11661 }, { "epoch": 2.1620318872821653, "grad_norm": 7.19140625, "learning_rate": 7.837968112717835e-06, "loss": 3.5782, "mean_token_accuracy": 0.4424010592908636, "step": 11662 }, { "epoch": 2.1622172784575455, "grad_norm": 7.50390625, "learning_rate": 7.837782721542455e-06, "loss": 2.7696, "mean_token_accuracy": 0.4787002487562189, "step": 11663 }, { "epoch": 2.1624026696329253, "grad_norm": 5.30078125, "learning_rate": 7.837597330367076e-06, "loss": 3.0263, "mean_token_accuracy": 0.4685141255121846, "step": 11664 }, { "epoch": 2.1625880608083055, "grad_norm": 6.3828125, "learning_rate": 7.837411939191694e-06, "loss": 2.854, "mean_token_accuracy": 0.453438673746924, "step": 11665 }, { "epoch": 2.1627734519836856, "grad_norm": 7.4140625, "learning_rate": 7.837226548016315e-06, "loss": 2.6018, "mean_token_accuracy": 0.4949720670391061, "step": 11666 }, { "epoch": 2.162958843159066, "grad_norm": 6.67578125, "learning_rate": 7.837041156840935e-06, "loss": 2.4184, "mean_token_accuracy": 0.5136090491339697, "step": 11667 }, { "epoch": 2.1631442343344456, "grad_norm": 6.37109375, "learning_rate": 7.836855765665556e-06, "loss": 2.8056, "mean_token_accuracy": 0.4586860137041516, "step": 11668 }, { "epoch": 2.1633296255098258, "grad_norm": 7.3515625, "learning_rate": 7.836670374490175e-06, "loss": 3.0698, "mean_token_accuracy": 0.46689723320158105, "step": 11669 }, { "epoch": 2.163515016685206, "grad_norm": 5.72265625, "learning_rate": 7.836484983314795e-06, "loss": 2.8279, "mean_token_accuracy": 0.5243033636937505, "step": 11670 }, { "epoch": 2.1637004078605857, "grad_norm": 7.83203125, "learning_rate": 7.836299592139414e-06, "loss": 2.2005, "mean_token_accuracy": 0.5512626262626262, "step": 11671 }, { "epoch": 2.163885799035966, "grad_norm": 6.64453125, "learning_rate": 7.836114200964035e-06, "loss": 2.6436, "mean_token_accuracy": 0.49793032080027594, "step": 11672 }, { "epoch": 2.164071190211346, "grad_norm": 6.10546875, "learning_rate": 7.835928809788655e-06, "loss": 2.7201, "mean_token_accuracy": 0.5073295870999267, "step": 11673 }, { "epoch": 2.164256581386726, "grad_norm": 7.01171875, "learning_rate": 7.835743418613274e-06, "loss": 2.403, "mean_token_accuracy": 0.5456586826347305, "step": 11674 }, { "epoch": 2.164441972562106, "grad_norm": 7.08203125, "learning_rate": 7.835558027437894e-06, "loss": 3.5827, "mean_token_accuracy": 0.41216539196940727, "step": 11675 }, { "epoch": 2.164627363737486, "grad_norm": 6.96875, "learning_rate": 7.835372636262515e-06, "loss": 3.4009, "mean_token_accuracy": 0.43710335448776066, "step": 11676 }, { "epoch": 2.164812754912866, "grad_norm": 6.77734375, "learning_rate": 7.835187245087135e-06, "loss": 2.5886, "mean_token_accuracy": 0.5185280826339574, "step": 11677 }, { "epoch": 2.164998146088246, "grad_norm": 6.359375, "learning_rate": 7.835001853911754e-06, "loss": 2.6307, "mean_token_accuracy": 0.4783954438529157, "step": 11678 }, { "epoch": 2.1651835372636263, "grad_norm": 6.73828125, "learning_rate": 7.834816462736375e-06, "loss": 2.9812, "mean_token_accuracy": 0.4775578204625637, "step": 11679 }, { "epoch": 2.1653689284390065, "grad_norm": 7.8515625, "learning_rate": 7.834631071560993e-06, "loss": 2.8219, "mean_token_accuracy": 0.48304628632938645, "step": 11680 }, { "epoch": 2.1655543196143863, "grad_norm": 8.2421875, "learning_rate": 7.834445680385614e-06, "loss": 3.319, "mean_token_accuracy": 0.43412620831975673, "step": 11681 }, { "epoch": 2.1657397107897665, "grad_norm": 6.76953125, "learning_rate": 7.834260289210234e-06, "loss": 2.9016, "mean_token_accuracy": 0.48965198288819517, "step": 11682 }, { "epoch": 2.1659251019651466, "grad_norm": 5.921875, "learning_rate": 7.834074898034855e-06, "loss": 2.6194, "mean_token_accuracy": 0.5012262811410869, "step": 11683 }, { "epoch": 2.1661104931405264, "grad_norm": 6.63671875, "learning_rate": 7.833889506859474e-06, "loss": 2.8585, "mean_token_accuracy": 0.4804632662611137, "step": 11684 }, { "epoch": 2.1662958843159066, "grad_norm": 7.109375, "learning_rate": 7.833704115684094e-06, "loss": 2.5131, "mean_token_accuracy": 0.5015402843601896, "step": 11685 }, { "epoch": 2.1664812754912868, "grad_norm": 6.2109375, "learning_rate": 7.833518724508715e-06, "loss": 2.5351, "mean_token_accuracy": 0.4964262508122157, "step": 11686 }, { "epoch": 2.1666666666666665, "grad_norm": 7.00390625, "learning_rate": 7.833333333333333e-06, "loss": 2.6604, "mean_token_accuracy": 0.5026633024950939, "step": 11687 }, { "epoch": 2.1668520578420467, "grad_norm": 7.46875, "learning_rate": 7.833147942157954e-06, "loss": 2.3652, "mean_token_accuracy": 0.5031928480204342, "step": 11688 }, { "epoch": 2.167037449017427, "grad_norm": 7.08203125, "learning_rate": 7.832962550982573e-06, "loss": 2.9314, "mean_token_accuracy": 0.4708355228586443, "step": 11689 }, { "epoch": 2.1672228401928066, "grad_norm": 7.50390625, "learning_rate": 7.832777159807193e-06, "loss": 2.8314, "mean_token_accuracy": 0.48037341131481276, "step": 11690 }, { "epoch": 2.167408231368187, "grad_norm": 8.296875, "learning_rate": 7.832591768631814e-06, "loss": 3.1353, "mean_token_accuracy": 0.44041778762462414, "step": 11691 }, { "epoch": 2.167593622543567, "grad_norm": 7.92578125, "learning_rate": 7.832406377456434e-06, "loss": 2.9751, "mean_token_accuracy": 0.45702247191011236, "step": 11692 }, { "epoch": 2.167779013718947, "grad_norm": 9.0078125, "learning_rate": 7.832220986281055e-06, "loss": 2.4721, "mean_token_accuracy": 0.5291273313377849, "step": 11693 }, { "epoch": 2.167964404894327, "grad_norm": 8.5546875, "learning_rate": 7.832035595105673e-06, "loss": 3.2931, "mean_token_accuracy": 0.46428090945782324, "step": 11694 }, { "epoch": 2.168149796069707, "grad_norm": 6.890625, "learning_rate": 7.831850203930294e-06, "loss": 2.8872, "mean_token_accuracy": 0.47872797593467986, "step": 11695 }, { "epoch": 2.1683351872450873, "grad_norm": 7.95703125, "learning_rate": 7.831664812754913e-06, "loss": 2.8209, "mean_token_accuracy": 0.4648729446935725, "step": 11696 }, { "epoch": 2.168520578420467, "grad_norm": 7.90234375, "learning_rate": 7.831479421579533e-06, "loss": 2.8322, "mean_token_accuracy": 0.46124210201880106, "step": 11697 }, { "epoch": 2.1687059695958473, "grad_norm": 8.6171875, "learning_rate": 7.831294030404154e-06, "loss": 2.7701, "mean_token_accuracy": 0.502021018593371, "step": 11698 }, { "epoch": 2.1688913607712275, "grad_norm": 7.359375, "learning_rate": 7.831108639228774e-06, "loss": 2.5262, "mean_token_accuracy": 0.4878946623247912, "step": 11699 }, { "epoch": 2.169076751946607, "grad_norm": 6.625, "learning_rate": 7.830923248053393e-06, "loss": 2.1812, "mean_token_accuracy": 0.5614816700610998, "step": 11700 }, { "epoch": 2.1692621431219874, "grad_norm": 6.61328125, "learning_rate": 7.830737856878014e-06, "loss": 2.9918, "mean_token_accuracy": 0.470474879559532, "step": 11701 }, { "epoch": 2.1694475342973676, "grad_norm": 8.3125, "learning_rate": 7.830552465702634e-06, "loss": 3.0012, "mean_token_accuracy": 0.4483498153222924, "step": 11702 }, { "epoch": 2.1696329254727473, "grad_norm": 8.3046875, "learning_rate": 7.830367074527253e-06, "loss": 3.1714, "mean_token_accuracy": 0.4405934262693439, "step": 11703 }, { "epoch": 2.1698183166481275, "grad_norm": 7.20703125, "learning_rate": 7.830181683351873e-06, "loss": 2.8371, "mean_token_accuracy": 0.4741596917762721, "step": 11704 }, { "epoch": 2.1700037078235077, "grad_norm": 6.35546875, "learning_rate": 7.829996292176492e-06, "loss": 3.0208, "mean_token_accuracy": 0.4547750781813808, "step": 11705 }, { "epoch": 2.1701890989988875, "grad_norm": 6.13671875, "learning_rate": 7.829810901001113e-06, "loss": 2.5817, "mean_token_accuracy": 0.5038339502908514, "step": 11706 }, { "epoch": 2.1703744901742676, "grad_norm": 8.203125, "learning_rate": 7.829625509825733e-06, "loss": 2.8177, "mean_token_accuracy": 0.46447778092272857, "step": 11707 }, { "epoch": 2.170559881349648, "grad_norm": 7.29296875, "learning_rate": 7.829440118650354e-06, "loss": 3.2234, "mean_token_accuracy": 0.46598984771573604, "step": 11708 }, { "epoch": 2.1707452725250276, "grad_norm": 6.75390625, "learning_rate": 7.829254727474972e-06, "loss": 3.0202, "mean_token_accuracy": 0.448267295988076, "step": 11709 }, { "epoch": 2.1709306637004078, "grad_norm": 7.23046875, "learning_rate": 7.829069336299593e-06, "loss": 4.1314, "mean_token_accuracy": 0.40699084794354395, "step": 11710 }, { "epoch": 2.171116054875788, "grad_norm": 6.06640625, "learning_rate": 7.828883945124213e-06, "loss": 2.8943, "mean_token_accuracy": 0.46569259418421427, "step": 11711 }, { "epoch": 2.171301446051168, "grad_norm": 7.0546875, "learning_rate": 7.828698553948832e-06, "loss": 2.5443, "mean_token_accuracy": 0.4901391224583397, "step": 11712 }, { "epoch": 2.171486837226548, "grad_norm": 7.21875, "learning_rate": 7.828513162773453e-06, "loss": 2.5347, "mean_token_accuracy": 0.49584002335425487, "step": 11713 }, { "epoch": 2.171672228401928, "grad_norm": 6.38671875, "learning_rate": 7.828327771598071e-06, "loss": 3.1344, "mean_token_accuracy": 0.4525451950523311, "step": 11714 }, { "epoch": 2.1718576195773083, "grad_norm": 7.71875, "learning_rate": 7.828142380422694e-06, "loss": 2.6198, "mean_token_accuracy": 0.49417249417249415, "step": 11715 }, { "epoch": 2.172043010752688, "grad_norm": 7.875, "learning_rate": 7.827956989247312e-06, "loss": 2.5994, "mean_token_accuracy": 0.4996309184857113, "step": 11716 }, { "epoch": 2.172228401928068, "grad_norm": 7.61328125, "learning_rate": 7.827771598071933e-06, "loss": 2.7492, "mean_token_accuracy": 0.48932423508694695, "step": 11717 }, { "epoch": 2.1724137931034484, "grad_norm": 7.82421875, "learning_rate": 7.827586206896552e-06, "loss": 2.458, "mean_token_accuracy": 0.5064543889845095, "step": 11718 }, { "epoch": 2.172599184278828, "grad_norm": 9.4453125, "learning_rate": 7.827400815721172e-06, "loss": 2.2809, "mean_token_accuracy": 0.4801214798453893, "step": 11719 }, { "epoch": 2.1727845754542083, "grad_norm": 9.3125, "learning_rate": 7.827215424545793e-06, "loss": 2.7367, "mean_token_accuracy": 0.46733860678744577, "step": 11720 }, { "epoch": 2.1729699666295885, "grad_norm": 7.2109375, "learning_rate": 7.827030033370412e-06, "loss": 2.9765, "mean_token_accuracy": 0.4969896004378763, "step": 11721 }, { "epoch": 2.1731553578049683, "grad_norm": 6.6796875, "learning_rate": 7.826844642195032e-06, "loss": 3.167, "mean_token_accuracy": 0.4637145033184637, "step": 11722 }, { "epoch": 2.1733407489803485, "grad_norm": 11.0859375, "learning_rate": 7.826659251019652e-06, "loss": 2.497, "mean_token_accuracy": 0.49565323864332067, "step": 11723 }, { "epoch": 2.1735261401557286, "grad_norm": 8.75, "learning_rate": 7.826473859844273e-06, "loss": 2.3306, "mean_token_accuracy": 0.5187713310580204, "step": 11724 }, { "epoch": 2.173711531331109, "grad_norm": 6.73046875, "learning_rate": 7.826288468668892e-06, "loss": 2.7449, "mean_token_accuracy": 0.4832349785407725, "step": 11725 }, { "epoch": 2.1738969225064886, "grad_norm": 7.78515625, "learning_rate": 7.826103077493512e-06, "loss": 3.0016, "mean_token_accuracy": 0.4648970955953068, "step": 11726 }, { "epoch": 2.1740823136818688, "grad_norm": 10.734375, "learning_rate": 7.825917686318131e-06, "loss": 2.4921, "mean_token_accuracy": 0.5087548638132295, "step": 11727 }, { "epoch": 2.174267704857249, "grad_norm": 9.453125, "learning_rate": 7.825732295142752e-06, "loss": 2.9936, "mean_token_accuracy": 0.4735768903993203, "step": 11728 }, { "epoch": 2.1744530960326287, "grad_norm": 7.11328125, "learning_rate": 7.825546903967372e-06, "loss": 2.8808, "mean_token_accuracy": 0.4661866440773684, "step": 11729 }, { "epoch": 2.174638487208009, "grad_norm": 6.34375, "learning_rate": 7.825361512791991e-06, "loss": 3.0877, "mean_token_accuracy": 0.46234033181617973, "step": 11730 }, { "epoch": 2.174823878383389, "grad_norm": 7.98828125, "learning_rate": 7.825176121616613e-06, "loss": 2.8563, "mean_token_accuracy": 0.4757229320780094, "step": 11731 }, { "epoch": 2.175009269558769, "grad_norm": 6.828125, "learning_rate": 7.824990730441232e-06, "loss": 2.7179, "mean_token_accuracy": 0.49141055949566587, "step": 11732 }, { "epoch": 2.175194660734149, "grad_norm": 6.80078125, "learning_rate": 7.824805339265852e-06, "loss": 3.2425, "mean_token_accuracy": 0.4406758130081301, "step": 11733 }, { "epoch": 2.175380051909529, "grad_norm": 8.9765625, "learning_rate": 7.824619948090471e-06, "loss": 2.7108, "mean_token_accuracy": 0.4924299772899319, "step": 11734 }, { "epoch": 2.175565443084909, "grad_norm": 14.0078125, "learning_rate": 7.824434556915092e-06, "loss": 2.6569, "mean_token_accuracy": 0.4727356040934779, "step": 11735 }, { "epoch": 2.175750834260289, "grad_norm": 5.82421875, "learning_rate": 7.824249165739712e-06, "loss": 3.0185, "mean_token_accuracy": 0.46004206098843325, "step": 11736 }, { "epoch": 2.1759362254356693, "grad_norm": 7.671875, "learning_rate": 7.824063774564331e-06, "loss": 2.5249, "mean_token_accuracy": 0.5070440573770492, "step": 11737 }, { "epoch": 2.1761216166110495, "grad_norm": 11.390625, "learning_rate": 7.823878383388951e-06, "loss": 2.5691, "mean_token_accuracy": 0.5041547649961049, "step": 11738 }, { "epoch": 2.1763070077864293, "grad_norm": 9.84375, "learning_rate": 7.823692992213572e-06, "loss": 2.3748, "mean_token_accuracy": 0.5261975162625665, "step": 11739 }, { "epoch": 2.1764923989618095, "grad_norm": 6.96875, "learning_rate": 7.823507601038192e-06, "loss": 2.6315, "mean_token_accuracy": 0.5006216972334473, "step": 11740 }, { "epoch": 2.1766777901371896, "grad_norm": 12.203125, "learning_rate": 7.823322209862811e-06, "loss": 3.162, "mean_token_accuracy": 0.4213425570074302, "step": 11741 }, { "epoch": 2.1768631813125694, "grad_norm": 7.67578125, "learning_rate": 7.823136818687432e-06, "loss": 2.7865, "mean_token_accuracy": 0.48819875776397514, "step": 11742 }, { "epoch": 2.1770485724879496, "grad_norm": 9.015625, "learning_rate": 7.82295142751205e-06, "loss": 2.7514, "mean_token_accuracy": 0.474950971652701, "step": 11743 }, { "epoch": 2.1772339636633298, "grad_norm": 7.75, "learning_rate": 7.822766036336671e-06, "loss": 2.9702, "mean_token_accuracy": 0.4678945589726259, "step": 11744 }, { "epoch": 2.1774193548387095, "grad_norm": 8.4375, "learning_rate": 7.822580645161291e-06, "loss": 2.9056, "mean_token_accuracy": 0.4845917920257047, "step": 11745 }, { "epoch": 2.1776047460140897, "grad_norm": 8.3984375, "learning_rate": 7.82239525398591e-06, "loss": 2.7136, "mean_token_accuracy": 0.4955364134690681, "step": 11746 }, { "epoch": 2.17779013718947, "grad_norm": 6.0703125, "learning_rate": 7.82220986281053e-06, "loss": 3.0423, "mean_token_accuracy": 0.44992481203007517, "step": 11747 }, { "epoch": 2.1779755283648496, "grad_norm": 10.734375, "learning_rate": 7.822024471635151e-06, "loss": 2.5889, "mean_token_accuracy": 0.4938467645891227, "step": 11748 }, { "epoch": 2.17816091954023, "grad_norm": 8.5546875, "learning_rate": 7.821839080459772e-06, "loss": 3.4367, "mean_token_accuracy": 0.4332432816721172, "step": 11749 }, { "epoch": 2.17834631071561, "grad_norm": 7.26171875, "learning_rate": 7.82165368928439e-06, "loss": 2.9641, "mean_token_accuracy": 0.46042889966811335, "step": 11750 }, { "epoch": 2.17853170189099, "grad_norm": 9.2734375, "learning_rate": 7.821468298109011e-06, "loss": 2.3124, "mean_token_accuracy": 0.5506470325747435, "step": 11751 }, { "epoch": 2.17871709306637, "grad_norm": 12.578125, "learning_rate": 7.82128290693363e-06, "loss": 2.6574, "mean_token_accuracy": 0.4768280123583934, "step": 11752 }, { "epoch": 2.17890248424175, "grad_norm": 7.6796875, "learning_rate": 7.82109751575825e-06, "loss": 2.6732, "mean_token_accuracy": 0.4917677642980936, "step": 11753 }, { "epoch": 2.1790878754171303, "grad_norm": 6.46484375, "learning_rate": 7.82091212458287e-06, "loss": 2.9914, "mean_token_accuracy": 0.466643051127642, "step": 11754 }, { "epoch": 2.17927326659251, "grad_norm": 7.3671875, "learning_rate": 7.82072673340749e-06, "loss": 2.4799, "mean_token_accuracy": 0.5151087263484891, "step": 11755 }, { "epoch": 2.1794586577678903, "grad_norm": 8.25, "learning_rate": 7.82054134223211e-06, "loss": 2.5673, "mean_token_accuracy": 0.4981121115306419, "step": 11756 }, { "epoch": 2.1796440489432705, "grad_norm": 6.33203125, "learning_rate": 7.82035595105673e-06, "loss": 3.095, "mean_token_accuracy": 0.45077220077220076, "step": 11757 }, { "epoch": 2.17982944011865, "grad_norm": 11.0703125, "learning_rate": 7.820170559881351e-06, "loss": 2.2856, "mean_token_accuracy": 0.531239252550728, "step": 11758 }, { "epoch": 2.1800148312940304, "grad_norm": 9.96875, "learning_rate": 7.81998516870597e-06, "loss": 2.5912, "mean_token_accuracy": 0.49404580152671757, "step": 11759 }, { "epoch": 2.1802002224694106, "grad_norm": 6.2890625, "learning_rate": 7.81979977753059e-06, "loss": 2.7074, "mean_token_accuracy": 0.48426383697093933, "step": 11760 }, { "epoch": 2.1803856136447903, "grad_norm": 8.328125, "learning_rate": 7.819614386355209e-06, "loss": 3.1886, "mean_token_accuracy": 0.47280163599182007, "step": 11761 }, { "epoch": 2.1805710048201705, "grad_norm": 12.1640625, "learning_rate": 7.81942899517983e-06, "loss": 2.7125, "mean_token_accuracy": 0.478110599078341, "step": 11762 }, { "epoch": 2.1807563959955507, "grad_norm": 12.1484375, "learning_rate": 7.81924360400445e-06, "loss": 2.5696, "mean_token_accuracy": 0.5125634517766497, "step": 11763 }, { "epoch": 2.1809417871709305, "grad_norm": 7.24609375, "learning_rate": 7.81905821282907e-06, "loss": 2.382, "mean_token_accuracy": 0.502837947411097, "step": 11764 }, { "epoch": 2.1811271783463106, "grad_norm": 8.3125, "learning_rate": 7.81887282165369e-06, "loss": 3.1348, "mean_token_accuracy": 0.44934402332361517, "step": 11765 }, { "epoch": 2.181312569521691, "grad_norm": 8.3828125, "learning_rate": 7.81868743047831e-06, "loss": 3.2821, "mean_token_accuracy": 0.44374209860935526, "step": 11766 }, { "epoch": 2.181497960697071, "grad_norm": 9.6640625, "learning_rate": 7.81850203930293e-06, "loss": 2.6517, "mean_token_accuracy": 0.47556294779938585, "step": 11767 }, { "epoch": 2.1816833518724508, "grad_norm": 7.24609375, "learning_rate": 7.81831664812755e-06, "loss": 2.335, "mean_token_accuracy": 0.5643899895724713, "step": 11768 }, { "epoch": 2.181868743047831, "grad_norm": 6.50390625, "learning_rate": 7.81813125695217e-06, "loss": 2.9825, "mean_token_accuracy": 0.44958245045494205, "step": 11769 }, { "epoch": 2.182054134223211, "grad_norm": 5.47265625, "learning_rate": 7.817945865776788e-06, "loss": 2.2381, "mean_token_accuracy": 0.5518856875146404, "step": 11770 }, { "epoch": 2.182239525398591, "grad_norm": 8.2578125, "learning_rate": 7.817760474601409e-06, "loss": 2.8548, "mean_token_accuracy": 0.4906176700547303, "step": 11771 }, { "epoch": 2.182424916573971, "grad_norm": 6.62109375, "learning_rate": 7.81757508342603e-06, "loss": 3.2429, "mean_token_accuracy": 0.45691747572815533, "step": 11772 }, { "epoch": 2.1826103077493513, "grad_norm": 6.453125, "learning_rate": 7.81738969225065e-06, "loss": 2.9636, "mean_token_accuracy": 0.4780254376023171, "step": 11773 }, { "epoch": 2.182795698924731, "grad_norm": 6.12890625, "learning_rate": 7.81720430107527e-06, "loss": 3.2501, "mean_token_accuracy": 0.4278056170609486, "step": 11774 }, { "epoch": 2.182981090100111, "grad_norm": 6.27734375, "learning_rate": 7.81701890989989e-06, "loss": 2.8703, "mean_token_accuracy": 0.4702523240371846, "step": 11775 }, { "epoch": 2.1831664812754914, "grad_norm": 6.77734375, "learning_rate": 7.81683351872451e-06, "loss": 2.631, "mean_token_accuracy": 0.5062853985357093, "step": 11776 }, { "epoch": 2.183351872450871, "grad_norm": 6.3125, "learning_rate": 7.816648127549129e-06, "loss": 3.031, "mean_token_accuracy": 0.4707396369547548, "step": 11777 }, { "epoch": 2.1835372636262513, "grad_norm": 6.53125, "learning_rate": 7.816462736373749e-06, "loss": 2.915, "mean_token_accuracy": 0.4781431334622824, "step": 11778 }, { "epoch": 2.1837226548016315, "grad_norm": 6.45703125, "learning_rate": 7.81627734519837e-06, "loss": 2.5358, "mean_token_accuracy": 0.5004770342101676, "step": 11779 }, { "epoch": 2.1839080459770113, "grad_norm": 6.1171875, "learning_rate": 7.81609195402299e-06, "loss": 2.9821, "mean_token_accuracy": 0.46633872101194657, "step": 11780 }, { "epoch": 2.1840934371523915, "grad_norm": 6.04296875, "learning_rate": 7.815906562847609e-06, "loss": 2.7786, "mean_token_accuracy": 0.5052825552825553, "step": 11781 }, { "epoch": 2.1842788283277716, "grad_norm": 6.14453125, "learning_rate": 7.81572117167223e-06, "loss": 2.9774, "mean_token_accuracy": 0.4675678622042952, "step": 11782 }, { "epoch": 2.184464219503152, "grad_norm": 6.2265625, "learning_rate": 7.81553578049685e-06, "loss": 2.7744, "mean_token_accuracy": 0.5113224071062179, "step": 11783 }, { "epoch": 2.1846496106785316, "grad_norm": 7.6015625, "learning_rate": 7.815350389321469e-06, "loss": 2.8205, "mean_token_accuracy": 0.5114470842332614, "step": 11784 }, { "epoch": 2.1848350018539118, "grad_norm": 6.61328125, "learning_rate": 7.815164998146089e-06, "loss": 2.5907, "mean_token_accuracy": 0.4918096220817748, "step": 11785 }, { "epoch": 2.185020393029292, "grad_norm": 8.0703125, "learning_rate": 7.814979606970708e-06, "loss": 2.5211, "mean_token_accuracy": 0.489434139469359, "step": 11786 }, { "epoch": 2.1852057842046717, "grad_norm": 6.21484375, "learning_rate": 7.814794215795328e-06, "loss": 2.1906, "mean_token_accuracy": 0.5390372512332029, "step": 11787 }, { "epoch": 2.185391175380052, "grad_norm": 7.07421875, "learning_rate": 7.814608824619949e-06, "loss": 2.7153, "mean_token_accuracy": 0.5018808397039194, "step": 11788 }, { "epoch": 2.185576566555432, "grad_norm": 9.3125, "learning_rate": 7.81442343344457e-06, "loss": 3.8291, "mean_token_accuracy": 0.42101374316536133, "step": 11789 }, { "epoch": 2.185761957730812, "grad_norm": 8.4765625, "learning_rate": 7.814238042269188e-06, "loss": 2.8814, "mean_token_accuracy": 0.48073022312373226, "step": 11790 }, { "epoch": 2.185947348906192, "grad_norm": 7.37109375, "learning_rate": 7.814052651093809e-06, "loss": 2.668, "mean_token_accuracy": 0.4969786656801085, "step": 11791 }, { "epoch": 2.186132740081572, "grad_norm": 6.7734375, "learning_rate": 7.813867259918429e-06, "loss": 2.7012, "mean_token_accuracy": 0.4903770102820986, "step": 11792 }, { "epoch": 2.186318131256952, "grad_norm": 7.32421875, "learning_rate": 7.813681868743048e-06, "loss": 2.257, "mean_token_accuracy": 0.5540407762220585, "step": 11793 }, { "epoch": 2.186503522432332, "grad_norm": 7.41015625, "learning_rate": 7.813496477567668e-06, "loss": 3.0187, "mean_token_accuracy": 0.48068838675775094, "step": 11794 }, { "epoch": 2.1866889136077123, "grad_norm": 7.7265625, "learning_rate": 7.813311086392287e-06, "loss": 2.6653, "mean_token_accuracy": 0.4883298755186722, "step": 11795 }, { "epoch": 2.1868743047830925, "grad_norm": 7.51171875, "learning_rate": 7.81312569521691e-06, "loss": 3.8253, "mean_token_accuracy": 0.39764606977721734, "step": 11796 }, { "epoch": 2.1870596959584723, "grad_norm": 7.1796875, "learning_rate": 7.812940304041528e-06, "loss": 2.6718, "mean_token_accuracy": 0.5177797051170858, "step": 11797 }, { "epoch": 2.1872450871338525, "grad_norm": 10.0, "learning_rate": 7.812754912866149e-06, "loss": 2.9222, "mean_token_accuracy": 0.46614304415089847, "step": 11798 }, { "epoch": 2.1874304783092327, "grad_norm": 7.73828125, "learning_rate": 7.812569521690767e-06, "loss": 2.5826, "mean_token_accuracy": 0.5208474806464756, "step": 11799 }, { "epoch": 2.1876158694846124, "grad_norm": 14.828125, "learning_rate": 7.812384130515388e-06, "loss": 2.683, "mean_token_accuracy": 0.4780304087041428, "step": 11800 }, { "epoch": 2.1878012606599926, "grad_norm": 10.0078125, "learning_rate": 7.812198739340008e-06, "loss": 2.3743, "mean_token_accuracy": 0.5165443928731847, "step": 11801 }, { "epoch": 2.1879866518353728, "grad_norm": 8.671875, "learning_rate": 7.812013348164627e-06, "loss": 3.0875, "mean_token_accuracy": 0.4553081147040879, "step": 11802 }, { "epoch": 2.1881720430107525, "grad_norm": 8.2421875, "learning_rate": 7.811827956989248e-06, "loss": 4.0577, "mean_token_accuracy": 0.4005803684077719, "step": 11803 }, { "epoch": 2.1883574341861327, "grad_norm": 9.2109375, "learning_rate": 7.811642565813868e-06, "loss": 3.1167, "mean_token_accuracy": 0.4564408041697692, "step": 11804 }, { "epoch": 2.188542825361513, "grad_norm": 8.4765625, "learning_rate": 7.811457174638489e-06, "loss": 2.8267, "mean_token_accuracy": 0.47233991042246704, "step": 11805 }, { "epoch": 2.1887282165368926, "grad_norm": 7.01953125, "learning_rate": 7.811271783463108e-06, "loss": 2.8637, "mean_token_accuracy": 0.47626301091647627, "step": 11806 }, { "epoch": 2.188913607712273, "grad_norm": 8.703125, "learning_rate": 7.811086392287728e-06, "loss": 3.8213, "mean_token_accuracy": 0.4155307091438199, "step": 11807 }, { "epoch": 2.189098998887653, "grad_norm": 7.7734375, "learning_rate": 7.810901001112347e-06, "loss": 2.8084, "mean_token_accuracy": 0.46352300415651426, "step": 11808 }, { "epoch": 2.189284390063033, "grad_norm": 7.05078125, "learning_rate": 7.810715609936967e-06, "loss": 2.9067, "mean_token_accuracy": 0.4750364190173487, "step": 11809 }, { "epoch": 2.189469781238413, "grad_norm": 7.25, "learning_rate": 7.810530218761588e-06, "loss": 2.7101, "mean_token_accuracy": 0.49149010072941995, "step": 11810 }, { "epoch": 2.189655172413793, "grad_norm": 7.890625, "learning_rate": 7.810344827586207e-06, "loss": 2.7017, "mean_token_accuracy": 0.4917340837143862, "step": 11811 }, { "epoch": 2.1898405635891733, "grad_norm": 6.66796875, "learning_rate": 7.810159436410829e-06, "loss": 2.7602, "mean_token_accuracy": 0.4845577211394303, "step": 11812 }, { "epoch": 2.190025954764553, "grad_norm": 6.5078125, "learning_rate": 7.809974045235448e-06, "loss": 2.5966, "mean_token_accuracy": 0.5026645542272793, "step": 11813 }, { "epoch": 2.1902113459399333, "grad_norm": 8.8515625, "learning_rate": 7.809788654060068e-06, "loss": 2.6584, "mean_token_accuracy": 0.48828504089837793, "step": 11814 }, { "epoch": 2.1903967371153135, "grad_norm": 7.2734375, "learning_rate": 7.809603262884687e-06, "loss": 2.9241, "mean_token_accuracy": 0.47361214836942994, "step": 11815 }, { "epoch": 2.190582128290693, "grad_norm": 7.9296875, "learning_rate": 7.809417871709307e-06, "loss": 2.6258, "mean_token_accuracy": 0.4866608353378526, "step": 11816 }, { "epoch": 2.1907675194660734, "grad_norm": 7.37890625, "learning_rate": 7.809232480533928e-06, "loss": 2.9412, "mean_token_accuracy": 0.4495462549099282, "step": 11817 }, { "epoch": 2.1909529106414536, "grad_norm": 6.59375, "learning_rate": 7.809047089358547e-06, "loss": 2.5933, "mean_token_accuracy": 0.5104875283446711, "step": 11818 }, { "epoch": 2.1911383018168333, "grad_norm": 7.36328125, "learning_rate": 7.808861698183167e-06, "loss": 1.9793, "mean_token_accuracy": 0.5658318014705882, "step": 11819 }, { "epoch": 2.1913236929922135, "grad_norm": 6.6171875, "learning_rate": 7.808676307007788e-06, "loss": 3.1333, "mean_token_accuracy": 0.45281007751937985, "step": 11820 }, { "epoch": 2.1915090841675937, "grad_norm": 7.45703125, "learning_rate": 7.808490915832408e-06, "loss": 3.0255, "mean_token_accuracy": 0.4729045796815208, "step": 11821 }, { "epoch": 2.191694475342974, "grad_norm": 6.890625, "learning_rate": 7.808305524657027e-06, "loss": 3.1095, "mean_token_accuracy": 0.46998956158663885, "step": 11822 }, { "epoch": 2.1918798665183536, "grad_norm": 7.1015625, "learning_rate": 7.808120133481647e-06, "loss": 2.7337, "mean_token_accuracy": 0.4671521035598705, "step": 11823 }, { "epoch": 2.192065257693734, "grad_norm": 7.80859375, "learning_rate": 7.807934742306266e-06, "loss": 3.3008, "mean_token_accuracy": 0.4612629344653754, "step": 11824 }, { "epoch": 2.192250648869114, "grad_norm": 7.25, "learning_rate": 7.807749351130887e-06, "loss": 2.156, "mean_token_accuracy": 0.540268456375839, "step": 11825 }, { "epoch": 2.1924360400444938, "grad_norm": 6.1328125, "learning_rate": 7.807563959955507e-06, "loss": 2.8329, "mean_token_accuracy": 0.4742003789145213, "step": 11826 }, { "epoch": 2.192621431219874, "grad_norm": 8.5234375, "learning_rate": 7.807378568780126e-06, "loss": 2.7903, "mean_token_accuracy": 0.4961832061068702, "step": 11827 }, { "epoch": 2.192806822395254, "grad_norm": 8.0859375, "learning_rate": 7.807193177604746e-06, "loss": 2.8301, "mean_token_accuracy": 0.4761796733212341, "step": 11828 }, { "epoch": 2.192992213570634, "grad_norm": 6.6953125, "learning_rate": 7.807007786429367e-06, "loss": 2.8573, "mean_token_accuracy": 0.4635564745412251, "step": 11829 }, { "epoch": 2.193177604746014, "grad_norm": 6.74609375, "learning_rate": 7.806822395253987e-06, "loss": 2.6224, "mean_token_accuracy": 0.5138790035587188, "step": 11830 }, { "epoch": 2.1933629959213943, "grad_norm": 7.125, "learning_rate": 7.806637004078606e-06, "loss": 2.568, "mean_token_accuracy": 0.49701884092535176, "step": 11831 }, { "epoch": 2.193548387096774, "grad_norm": 6.171875, "learning_rate": 7.806451612903227e-06, "loss": 2.8077, "mean_token_accuracy": 0.4831932773109244, "step": 11832 }, { "epoch": 2.193733778272154, "grad_norm": 6.10546875, "learning_rate": 7.806266221727846e-06, "loss": 3.084, "mean_token_accuracy": 0.4701397712833545, "step": 11833 }, { "epoch": 2.1939191694475344, "grad_norm": 5.9921875, "learning_rate": 7.806080830552466e-06, "loss": 2.3446, "mean_token_accuracy": 0.5230318729586665, "step": 11834 }, { "epoch": 2.194104560622914, "grad_norm": 11.1171875, "learning_rate": 7.805895439377087e-06, "loss": 3.557, "mean_token_accuracy": 0.44720678560982746, "step": 11835 }, { "epoch": 2.1942899517982943, "grad_norm": 7.6015625, "learning_rate": 7.805710048201707e-06, "loss": 2.681, "mean_token_accuracy": 0.49961113703530874, "step": 11836 }, { "epoch": 2.1944753429736745, "grad_norm": 6.76953125, "learning_rate": 7.805524657026326e-06, "loss": 3.0562, "mean_token_accuracy": 0.4476699770817418, "step": 11837 }, { "epoch": 2.1946607341490543, "grad_norm": 6.734375, "learning_rate": 7.805339265850946e-06, "loss": 3.0839, "mean_token_accuracy": 0.4598327420172326, "step": 11838 }, { "epoch": 2.1948461253244345, "grad_norm": 6.62109375, "learning_rate": 7.805153874675567e-06, "loss": 3.035, "mean_token_accuracy": 0.4723774053382992, "step": 11839 }, { "epoch": 2.1950315164998146, "grad_norm": 7.765625, "learning_rate": 7.804968483500186e-06, "loss": 3.2996, "mean_token_accuracy": 0.4732905982905983, "step": 11840 }, { "epoch": 2.195216907675195, "grad_norm": 7.5078125, "learning_rate": 7.804783092324806e-06, "loss": 2.8382, "mean_token_accuracy": 0.4895900064061499, "step": 11841 }, { "epoch": 2.1954022988505746, "grad_norm": 7.0546875, "learning_rate": 7.804597701149425e-06, "loss": 2.4008, "mean_token_accuracy": 0.5292988929889298, "step": 11842 }, { "epoch": 2.1955876900259548, "grad_norm": 6.66015625, "learning_rate": 7.804412309974045e-06, "loss": 2.3661, "mean_token_accuracy": 0.5214861827317202, "step": 11843 }, { "epoch": 2.195773081201335, "grad_norm": 8.296875, "learning_rate": 7.804226918798666e-06, "loss": 3.6342, "mean_token_accuracy": 0.42572440015733576, "step": 11844 }, { "epoch": 2.1959584723767147, "grad_norm": 7.31640625, "learning_rate": 7.804041527623286e-06, "loss": 2.7076, "mean_token_accuracy": 0.4934247499157019, "step": 11845 }, { "epoch": 2.196143863552095, "grad_norm": 6.92578125, "learning_rate": 7.803856136447905e-06, "loss": 2.9512, "mean_token_accuracy": 0.4423737774253238, "step": 11846 }, { "epoch": 2.196329254727475, "grad_norm": 5.765625, "learning_rate": 7.803670745272526e-06, "loss": 2.5791, "mean_token_accuracy": 0.5262521968365553, "step": 11847 }, { "epoch": 2.196514645902855, "grad_norm": 8.34375, "learning_rate": 7.803485354097146e-06, "loss": 2.7669, "mean_token_accuracy": 0.4960148021633931, "step": 11848 }, { "epoch": 2.196700037078235, "grad_norm": 8.4765625, "learning_rate": 7.803299962921765e-06, "loss": 2.8539, "mean_token_accuracy": 0.48688790902761664, "step": 11849 }, { "epoch": 2.196885428253615, "grad_norm": 6.59375, "learning_rate": 7.803114571746385e-06, "loss": 2.7752, "mean_token_accuracy": 0.4676324587174552, "step": 11850 }, { "epoch": 2.197070819428995, "grad_norm": 9.3828125, "learning_rate": 7.802929180571004e-06, "loss": 3.0113, "mean_token_accuracy": 0.48169418521177315, "step": 11851 }, { "epoch": 2.197256210604375, "grad_norm": 8.5625, "learning_rate": 7.802743789395626e-06, "loss": 2.9911, "mean_token_accuracy": 0.4735769922109047, "step": 11852 }, { "epoch": 2.1974416017797553, "grad_norm": 6.78125, "learning_rate": 7.802558398220245e-06, "loss": 3.0147, "mean_token_accuracy": 0.4595535285645703, "step": 11853 }, { "epoch": 2.1976269929551355, "grad_norm": 6.68359375, "learning_rate": 7.802373007044866e-06, "loss": 2.4068, "mean_token_accuracy": 0.5611698655176917, "step": 11854 }, { "epoch": 2.1978123841305153, "grad_norm": 7.56640625, "learning_rate": 7.802187615869486e-06, "loss": 2.7999, "mean_token_accuracy": 0.46388399512129014, "step": 11855 }, { "epoch": 2.1979977753058955, "grad_norm": 7.87109375, "learning_rate": 7.802002224694105e-06, "loss": 3.1556, "mean_token_accuracy": 0.4858278955954323, "step": 11856 }, { "epoch": 2.1981831664812757, "grad_norm": 8.0390625, "learning_rate": 7.801816833518725e-06, "loss": 2.743, "mean_token_accuracy": 0.47861070135443207, "step": 11857 }, { "epoch": 2.1983685576566554, "grad_norm": 6.8203125, "learning_rate": 7.801631442343344e-06, "loss": 2.7668, "mean_token_accuracy": 0.48966613672496023, "step": 11858 }, { "epoch": 2.1985539488320356, "grad_norm": 7.4609375, "learning_rate": 7.801446051167965e-06, "loss": 2.7806, "mean_token_accuracy": 0.4899024591290012, "step": 11859 }, { "epoch": 2.1987393400074158, "grad_norm": 6.1328125, "learning_rate": 7.801260659992585e-06, "loss": 2.4354, "mean_token_accuracy": 0.5388636143645339, "step": 11860 }, { "epoch": 2.1989247311827955, "grad_norm": 6.5390625, "learning_rate": 7.801075268817206e-06, "loss": 3.4223, "mean_token_accuracy": 0.44740853658536583, "step": 11861 }, { "epoch": 2.1991101223581757, "grad_norm": 6.10546875, "learning_rate": 7.800889877641825e-06, "loss": 2.8677, "mean_token_accuracy": 0.47115384615384615, "step": 11862 }, { "epoch": 2.199295513533556, "grad_norm": 7.1953125, "learning_rate": 7.800704486466445e-06, "loss": 3.1164, "mean_token_accuracy": 0.45913392601089764, "step": 11863 }, { "epoch": 2.1994809047089356, "grad_norm": 5.55859375, "learning_rate": 7.800519095291066e-06, "loss": 3.1536, "mean_token_accuracy": 0.44969199178644764, "step": 11864 }, { "epoch": 2.199666295884316, "grad_norm": 6.28125, "learning_rate": 7.800333704115684e-06, "loss": 2.6105, "mean_token_accuracy": 0.5046666666666667, "step": 11865 }, { "epoch": 2.199851687059696, "grad_norm": 7.50390625, "learning_rate": 7.800148312940305e-06, "loss": 2.6588, "mean_token_accuracy": 0.508198727361723, "step": 11866 }, { "epoch": 2.200037078235076, "grad_norm": 6.76171875, "learning_rate": 7.799962921764924e-06, "loss": 3.3561, "mean_token_accuracy": 0.4353909465020576, "step": 11867 }, { "epoch": 2.200222469410456, "grad_norm": 9.6484375, "learning_rate": 7.799777530589546e-06, "loss": 2.5715, "mean_token_accuracy": 0.4887495981999357, "step": 11868 }, { "epoch": 2.200407860585836, "grad_norm": 6.53515625, "learning_rate": 7.799592139414165e-06, "loss": 2.4468, "mean_token_accuracy": 0.49375866851595007, "step": 11869 }, { "epoch": 2.2005932517612163, "grad_norm": 7.11328125, "learning_rate": 7.799406748238785e-06, "loss": 3.0639, "mean_token_accuracy": 0.4786485218207414, "step": 11870 }, { "epoch": 2.200778642936596, "grad_norm": 7.9296875, "learning_rate": 7.799221357063404e-06, "loss": 2.628, "mean_token_accuracy": 0.48329276388581544, "step": 11871 }, { "epoch": 2.2009640341119763, "grad_norm": 7.8203125, "learning_rate": 7.799035965888024e-06, "loss": 2.787, "mean_token_accuracy": 0.48743582815455283, "step": 11872 }, { "epoch": 2.2011494252873565, "grad_norm": 6.27734375, "learning_rate": 7.798850574712645e-06, "loss": 2.8709, "mean_token_accuracy": 0.47448912326961107, "step": 11873 }, { "epoch": 2.201334816462736, "grad_norm": 7.9921875, "learning_rate": 7.798665183537264e-06, "loss": 3.1996, "mean_token_accuracy": 0.45327510917030567, "step": 11874 }, { "epoch": 2.2015202076381164, "grad_norm": 8.125, "learning_rate": 7.798479792361884e-06, "loss": 2.6945, "mean_token_accuracy": 0.5060163022383232, "step": 11875 }, { "epoch": 2.2017055988134966, "grad_norm": 6.4453125, "learning_rate": 7.798294401186505e-06, "loss": 3.0816, "mean_token_accuracy": 0.4609872611464968, "step": 11876 }, { "epoch": 2.2018909899888763, "grad_norm": 8.765625, "learning_rate": 7.798109010011125e-06, "loss": 2.3322, "mean_token_accuracy": 0.5349624457844071, "step": 11877 }, { "epoch": 2.2020763811642565, "grad_norm": 8.1015625, "learning_rate": 7.797923618835744e-06, "loss": 3.1153, "mean_token_accuracy": 0.44488809498791637, "step": 11878 }, { "epoch": 2.2022617723396367, "grad_norm": 9.609375, "learning_rate": 7.797738227660364e-06, "loss": 3.4308, "mean_token_accuracy": 0.41430073606729756, "step": 11879 }, { "epoch": 2.202447163515017, "grad_norm": 8.0234375, "learning_rate": 7.797552836484983e-06, "loss": 2.4783, "mean_token_accuracy": 0.5010259040779688, "step": 11880 }, { "epoch": 2.2026325546903966, "grad_norm": 6.859375, "learning_rate": 7.797367445309604e-06, "loss": 3.26, "mean_token_accuracy": 0.4545644018340975, "step": 11881 }, { "epoch": 2.202817945865777, "grad_norm": 7.83984375, "learning_rate": 7.797182054134224e-06, "loss": 2.4844, "mean_token_accuracy": 0.510914142082286, "step": 11882 }, { "epoch": 2.203003337041157, "grad_norm": 8.21875, "learning_rate": 7.796996662958843e-06, "loss": 2.1678, "mean_token_accuracy": 0.5746996996996997, "step": 11883 }, { "epoch": 2.2031887282165368, "grad_norm": 7.11328125, "learning_rate": 7.796811271783464e-06, "loss": 2.6749, "mean_token_accuracy": 0.48767403244347934, "step": 11884 }, { "epoch": 2.203374119391917, "grad_norm": 5.84765625, "learning_rate": 7.796625880608084e-06, "loss": 2.3036, "mean_token_accuracy": 0.5433215202248564, "step": 11885 }, { "epoch": 2.203559510567297, "grad_norm": 7.44921875, "learning_rate": 7.796440489432704e-06, "loss": 3.3311, "mean_token_accuracy": 0.42900302114803623, "step": 11886 }, { "epoch": 2.203744901742677, "grad_norm": 7.515625, "learning_rate": 7.796255098257323e-06, "loss": 2.6592, "mean_token_accuracy": 0.4876387487386478, "step": 11887 }, { "epoch": 2.203930292918057, "grad_norm": 6.73828125, "learning_rate": 7.796069707081944e-06, "loss": 3.0023, "mean_token_accuracy": 0.47071207430340556, "step": 11888 }, { "epoch": 2.2041156840934373, "grad_norm": 6.5078125, "learning_rate": 7.795884315906563e-06, "loss": 2.165, "mean_token_accuracy": 0.5381306218224482, "step": 11889 }, { "epoch": 2.204301075268817, "grad_norm": 6.23046875, "learning_rate": 7.795698924731183e-06, "loss": 2.3951, "mean_token_accuracy": 0.5587950808345999, "step": 11890 }, { "epoch": 2.204486466444197, "grad_norm": 7.12109375, "learning_rate": 7.795513533555804e-06, "loss": 2.5515, "mean_token_accuracy": 0.5019286403085824, "step": 11891 }, { "epoch": 2.2046718576195774, "grad_norm": 9.0078125, "learning_rate": 7.795328142380422e-06, "loss": 2.997, "mean_token_accuracy": 0.4725957878710987, "step": 11892 }, { "epoch": 2.2048572487949576, "grad_norm": 6.7265625, "learning_rate": 7.795142751205045e-06, "loss": 2.4579, "mean_token_accuracy": 0.4974031162604874, "step": 11893 }, { "epoch": 2.2050426399703373, "grad_norm": 7.2265625, "learning_rate": 7.794957360029663e-06, "loss": 3.901, "mean_token_accuracy": 0.4089327953248922, "step": 11894 }, { "epoch": 2.2052280311457175, "grad_norm": 9.03125, "learning_rate": 7.794771968854284e-06, "loss": 2.9855, "mean_token_accuracy": 0.45355361596009974, "step": 11895 }, { "epoch": 2.2054134223210977, "grad_norm": 9.6328125, "learning_rate": 7.794586577678903e-06, "loss": 2.7867, "mean_token_accuracy": 0.47638205930561767, "step": 11896 }, { "epoch": 2.2055988134964775, "grad_norm": 8.8515625, "learning_rate": 7.794401186503523e-06, "loss": 2.5441, "mean_token_accuracy": 0.5295192578015181, "step": 11897 }, { "epoch": 2.2057842046718577, "grad_norm": 9.3359375, "learning_rate": 7.794215795328144e-06, "loss": 3.3979, "mean_token_accuracy": 0.4167782987273945, "step": 11898 }, { "epoch": 2.205969595847238, "grad_norm": 9.453125, "learning_rate": 7.794030404152762e-06, "loss": 3.787, "mean_token_accuracy": 0.4149960357911428, "step": 11899 }, { "epoch": 2.2061549870226176, "grad_norm": 10.9453125, "learning_rate": 7.793845012977383e-06, "loss": 3.1969, "mean_token_accuracy": 0.4264071786505069, "step": 11900 }, { "epoch": 2.2063403781979978, "grad_norm": 9.6484375, "learning_rate": 7.793659621802003e-06, "loss": 2.7032, "mean_token_accuracy": 0.4776641550053821, "step": 11901 }, { "epoch": 2.206525769373378, "grad_norm": 9.875, "learning_rate": 7.793474230626624e-06, "loss": 2.4513, "mean_token_accuracy": 0.5256026244472971, "step": 11902 }, { "epoch": 2.2067111605487577, "grad_norm": 7.33984375, "learning_rate": 7.793288839451243e-06, "loss": 3.3082, "mean_token_accuracy": 0.4279402985074627, "step": 11903 }, { "epoch": 2.206896551724138, "grad_norm": 7.078125, "learning_rate": 7.793103448275863e-06, "loss": 2.855, "mean_token_accuracy": 0.4734380621177599, "step": 11904 }, { "epoch": 2.207081942899518, "grad_norm": 10.4609375, "learning_rate": 7.792918057100482e-06, "loss": 2.2214, "mean_token_accuracy": 0.553426844566085, "step": 11905 }, { "epoch": 2.207267334074898, "grad_norm": 10.09375, "learning_rate": 7.792732665925102e-06, "loss": 3.3526, "mean_token_accuracy": 0.4753329297820823, "step": 11906 }, { "epoch": 2.207452725250278, "grad_norm": 7.43359375, "learning_rate": 7.792547274749723e-06, "loss": 2.4005, "mean_token_accuracy": 0.5142055419151175, "step": 11907 }, { "epoch": 2.207638116425658, "grad_norm": 6.484375, "learning_rate": 7.792361883574342e-06, "loss": 2.7077, "mean_token_accuracy": 0.49850924269528923, "step": 11908 }, { "epoch": 2.207823507601038, "grad_norm": 7.71484375, "learning_rate": 7.792176492398962e-06, "loss": 2.6095, "mean_token_accuracy": 0.4817401425869344, "step": 11909 }, { "epoch": 2.208008898776418, "grad_norm": 9.3515625, "learning_rate": 7.791991101223583e-06, "loss": 2.7584, "mean_token_accuracy": 0.48442584844258485, "step": 11910 }, { "epoch": 2.2081942899517983, "grad_norm": 8.0859375, "learning_rate": 7.791805710048203e-06, "loss": 2.6275, "mean_token_accuracy": 0.5121604828688088, "step": 11911 }, { "epoch": 2.2083796811271785, "grad_norm": 7.40234375, "learning_rate": 7.791620318872822e-06, "loss": 2.6232, "mean_token_accuracy": 0.5022554452893414, "step": 11912 }, { "epoch": 2.2085650723025583, "grad_norm": 7.859375, "learning_rate": 7.791434927697443e-06, "loss": 2.7867, "mean_token_accuracy": 0.47111319868482854, "step": 11913 }, { "epoch": 2.2087504634779385, "grad_norm": 7.390625, "learning_rate": 7.791249536522061e-06, "loss": 2.758, "mean_token_accuracy": 0.48204590913420325, "step": 11914 }, { "epoch": 2.2089358546533187, "grad_norm": 8.9453125, "learning_rate": 7.791064145346682e-06, "loss": 2.9664, "mean_token_accuracy": 0.47520530638029057, "step": 11915 }, { "epoch": 2.2091212458286984, "grad_norm": 9.0703125, "learning_rate": 7.790878754171302e-06, "loss": 3.0299, "mean_token_accuracy": 0.4846272098385857, "step": 11916 }, { "epoch": 2.2093066370040786, "grad_norm": 7.40625, "learning_rate": 7.790693362995923e-06, "loss": 2.5467, "mean_token_accuracy": 0.5323657847418444, "step": 11917 }, { "epoch": 2.209492028179459, "grad_norm": 7.08203125, "learning_rate": 7.790507971820542e-06, "loss": 2.8027, "mean_token_accuracy": 0.49663692518874397, "step": 11918 }, { "epoch": 2.2096774193548385, "grad_norm": 7.65234375, "learning_rate": 7.790322580645162e-06, "loss": 2.2149, "mean_token_accuracy": 0.5773037167363096, "step": 11919 }, { "epoch": 2.2098628105302187, "grad_norm": 6.23828125, "learning_rate": 7.790137189469783e-06, "loss": 2.4778, "mean_token_accuracy": 0.4977643504531722, "step": 11920 }, { "epoch": 2.210048201705599, "grad_norm": 7.65625, "learning_rate": 7.789951798294401e-06, "loss": 2.9266, "mean_token_accuracy": 0.48638132295719844, "step": 11921 }, { "epoch": 2.2102335928809786, "grad_norm": 7.21484375, "learning_rate": 7.789766407119022e-06, "loss": 2.6366, "mean_token_accuracy": 0.48789075002667237, "step": 11922 }, { "epoch": 2.210418984056359, "grad_norm": 6.43359375, "learning_rate": 7.78958101594364e-06, "loss": 2.3747, "mean_token_accuracy": 0.5431301652892562, "step": 11923 }, { "epoch": 2.210604375231739, "grad_norm": 7.35546875, "learning_rate": 7.789395624768261e-06, "loss": 2.3608, "mean_token_accuracy": 0.5604260089686098, "step": 11924 }, { "epoch": 2.210789766407119, "grad_norm": 8.7265625, "learning_rate": 7.789210233592882e-06, "loss": 2.5968, "mean_token_accuracy": 0.5228236061297685, "step": 11925 }, { "epoch": 2.210975157582499, "grad_norm": 8.1328125, "learning_rate": 7.789024842417502e-06, "loss": 2.7785, "mean_token_accuracy": 0.48058124174372524, "step": 11926 }, { "epoch": 2.211160548757879, "grad_norm": 8.484375, "learning_rate": 7.788839451242121e-06, "loss": 3.1148, "mean_token_accuracy": 0.46057975073519114, "step": 11927 }, { "epoch": 2.2113459399332593, "grad_norm": 6.94921875, "learning_rate": 7.788654060066741e-06, "loss": 3.5398, "mean_token_accuracy": 0.4428125812321289, "step": 11928 }, { "epoch": 2.211531331108639, "grad_norm": 8.671875, "learning_rate": 7.788468668891362e-06, "loss": 2.9515, "mean_token_accuracy": 0.465014164305949, "step": 11929 }, { "epoch": 2.2117167222840193, "grad_norm": 6.390625, "learning_rate": 7.78828327771598e-06, "loss": 3.289, "mean_token_accuracy": 0.4286797220467467, "step": 11930 }, { "epoch": 2.2119021134593995, "grad_norm": 7.671875, "learning_rate": 7.788097886540601e-06, "loss": 2.4063, "mean_token_accuracy": 0.551678445229682, "step": 11931 }, { "epoch": 2.212087504634779, "grad_norm": 8.2578125, "learning_rate": 7.78791249536522e-06, "loss": 2.6868, "mean_token_accuracy": 0.48629984406326576, "step": 11932 }, { "epoch": 2.2122728958101594, "grad_norm": 8.890625, "learning_rate": 7.787727104189842e-06, "loss": 2.8754, "mean_token_accuracy": 0.4559852670349908, "step": 11933 }, { "epoch": 2.2124582869855396, "grad_norm": 7.4921875, "learning_rate": 7.787541713014461e-06, "loss": 3.2408, "mean_token_accuracy": 0.4418604651162791, "step": 11934 }, { "epoch": 2.2126436781609193, "grad_norm": 8.3984375, "learning_rate": 7.787356321839081e-06, "loss": 2.2244, "mean_token_accuracy": 0.5321828358208955, "step": 11935 }, { "epoch": 2.2128290693362995, "grad_norm": 8.71875, "learning_rate": 7.787170930663702e-06, "loss": 2.395, "mean_token_accuracy": 0.5322836567801782, "step": 11936 }, { "epoch": 2.2130144605116797, "grad_norm": 8.1328125, "learning_rate": 7.78698553948832e-06, "loss": 3.1417, "mean_token_accuracy": 0.4822123401889939, "step": 11937 }, { "epoch": 2.21319985168706, "grad_norm": 7.76953125, "learning_rate": 7.786800148312941e-06, "loss": 2.3425, "mean_token_accuracy": 0.5300977533870691, "step": 11938 }, { "epoch": 2.2133852428624397, "grad_norm": 6.9609375, "learning_rate": 7.78661475713756e-06, "loss": 2.5862, "mean_token_accuracy": 0.5133358614587283, "step": 11939 }, { "epoch": 2.21357063403782, "grad_norm": 8.65625, "learning_rate": 7.78642936596218e-06, "loss": 2.913, "mean_token_accuracy": 0.47900575761831204, "step": 11940 }, { "epoch": 2.2137560252132, "grad_norm": 10.984375, "learning_rate": 7.786243974786801e-06, "loss": 2.4861, "mean_token_accuracy": 0.5132455231611662, "step": 11941 }, { "epoch": 2.2139414163885798, "grad_norm": 6.87890625, "learning_rate": 7.786058583611422e-06, "loss": 3.0099, "mean_token_accuracy": 0.445326278659612, "step": 11942 }, { "epoch": 2.21412680756396, "grad_norm": 10.359375, "learning_rate": 7.78587319243604e-06, "loss": 2.8093, "mean_token_accuracy": 0.4887249443207127, "step": 11943 }, { "epoch": 2.21431219873934, "grad_norm": 9.578125, "learning_rate": 7.78568780126066e-06, "loss": 2.9031, "mean_token_accuracy": 0.47985651214128033, "step": 11944 }, { "epoch": 2.21449758991472, "grad_norm": 10.859375, "learning_rate": 7.785502410085281e-06, "loss": 2.2878, "mean_token_accuracy": 0.5188163884673748, "step": 11945 }, { "epoch": 2.2146829810901, "grad_norm": 6.5703125, "learning_rate": 7.7853170189099e-06, "loss": 2.0436, "mean_token_accuracy": 0.57378754388246, "step": 11946 }, { "epoch": 2.2148683722654803, "grad_norm": 9.96875, "learning_rate": 7.78513162773452e-06, "loss": 3.6276, "mean_token_accuracy": 0.44884527940648555, "step": 11947 }, { "epoch": 2.21505376344086, "grad_norm": 9.796875, "learning_rate": 7.78494623655914e-06, "loss": 3.3843, "mean_token_accuracy": 0.43529411764705883, "step": 11948 }, { "epoch": 2.21523915461624, "grad_norm": 10.5078125, "learning_rate": 7.784760845383762e-06, "loss": 2.8625, "mean_token_accuracy": 0.4746483001172333, "step": 11949 }, { "epoch": 2.2154245457916204, "grad_norm": 6.82421875, "learning_rate": 7.78457545420838e-06, "loss": 2.6808, "mean_token_accuracy": 0.49445041174364485, "step": 11950 }, { "epoch": 2.2156099369670006, "grad_norm": 8.46875, "learning_rate": 7.784390063033e-06, "loss": 2.8473, "mean_token_accuracy": 0.4697036063658928, "step": 11951 }, { "epoch": 2.2157953281423803, "grad_norm": 8.375, "learning_rate": 7.78420467185762e-06, "loss": 2.182, "mean_token_accuracy": 0.5804651162790697, "step": 11952 }, { "epoch": 2.2159807193177605, "grad_norm": 7.1015625, "learning_rate": 7.78401928068224e-06, "loss": 2.727, "mean_token_accuracy": 0.4769946157611356, "step": 11953 }, { "epoch": 2.2161661104931407, "grad_norm": 8.40625, "learning_rate": 7.78383388950686e-06, "loss": 2.3702, "mean_token_accuracy": 0.5205972536995067, "step": 11954 }, { "epoch": 2.2163515016685205, "grad_norm": 8.0078125, "learning_rate": 7.78364849833148e-06, "loss": 3.0988, "mean_token_accuracy": 0.463436928702011, "step": 11955 }, { "epoch": 2.2165368928439007, "grad_norm": 8.65625, "learning_rate": 7.7834631071561e-06, "loss": 2.4038, "mean_token_accuracy": 0.5274940678840401, "step": 11956 }, { "epoch": 2.216722284019281, "grad_norm": 7.8984375, "learning_rate": 7.78327771598072e-06, "loss": 3.1949, "mean_token_accuracy": 0.4634555475676971, "step": 11957 }, { "epoch": 2.2169076751946606, "grad_norm": 10.2421875, "learning_rate": 7.783092324805341e-06, "loss": 2.6592, "mean_token_accuracy": 0.5072027239392352, "step": 11958 }, { "epoch": 2.2170930663700408, "grad_norm": 8.2890625, "learning_rate": 7.78290693362996e-06, "loss": 3.6864, "mean_token_accuracy": 0.4215857928964482, "step": 11959 }, { "epoch": 2.217278457545421, "grad_norm": 5.9375, "learning_rate": 7.78272154245458e-06, "loss": 2.8368, "mean_token_accuracy": 0.48934356351236147, "step": 11960 }, { "epoch": 2.2174638487208007, "grad_norm": 6.73828125, "learning_rate": 7.782536151279199e-06, "loss": 2.4766, "mean_token_accuracy": 0.5211141060197664, "step": 11961 }, { "epoch": 2.217649239896181, "grad_norm": 7.0078125, "learning_rate": 7.78235076010382e-06, "loss": 3.3719, "mean_token_accuracy": 0.4455611390284757, "step": 11962 }, { "epoch": 2.217834631071561, "grad_norm": 7.0, "learning_rate": 7.78216536892844e-06, "loss": 2.3562, "mean_token_accuracy": 0.5208003617454217, "step": 11963 }, { "epoch": 2.2180200222469413, "grad_norm": 8.03125, "learning_rate": 7.781979977753059e-06, "loss": 3.0717, "mean_token_accuracy": 0.47812149382913927, "step": 11964 }, { "epoch": 2.218205413422321, "grad_norm": 7.6484375, "learning_rate": 7.78179458657768e-06, "loss": 2.3392, "mean_token_accuracy": 0.5286779992662346, "step": 11965 }, { "epoch": 2.218390804597701, "grad_norm": 7.77734375, "learning_rate": 7.7816091954023e-06, "loss": 2.8786, "mean_token_accuracy": 0.4892343120322395, "step": 11966 }, { "epoch": 2.2185761957730814, "grad_norm": 6.71484375, "learning_rate": 7.78142380422692e-06, "loss": 3.0756, "mean_token_accuracy": 0.45655024436762426, "step": 11967 }, { "epoch": 2.218761586948461, "grad_norm": 7.85546875, "learning_rate": 7.781238413051539e-06, "loss": 2.8121, "mean_token_accuracy": 0.5020055913455694, "step": 11968 }, { "epoch": 2.2189469781238413, "grad_norm": 6.671875, "learning_rate": 7.78105302187616e-06, "loss": 2.8269, "mean_token_accuracy": 0.48883224083515414, "step": 11969 }, { "epoch": 2.2191323692992215, "grad_norm": 7.76171875, "learning_rate": 7.780867630700778e-06, "loss": 2.665, "mean_token_accuracy": 0.5027013443899987, "step": 11970 }, { "epoch": 2.2193177604746013, "grad_norm": 9.1484375, "learning_rate": 7.780682239525399e-06, "loss": 2.7637, "mean_token_accuracy": 0.4811472369885029, "step": 11971 }, { "epoch": 2.2195031516499815, "grad_norm": 9.828125, "learning_rate": 7.78049684835002e-06, "loss": 2.4618, "mean_token_accuracy": 0.5366212734633787, "step": 11972 }, { "epoch": 2.2196885428253617, "grad_norm": 7.578125, "learning_rate": 7.78031145717464e-06, "loss": 2.25, "mean_token_accuracy": 0.531975217893521, "step": 11973 }, { "epoch": 2.2198739340007414, "grad_norm": 7.7890625, "learning_rate": 7.78012606599926e-06, "loss": 3.1534, "mean_token_accuracy": 0.48495807989478873, "step": 11974 }, { "epoch": 2.2200593251761216, "grad_norm": 6.8203125, "learning_rate": 7.779940674823879e-06, "loss": 2.5977, "mean_token_accuracy": 0.4901147959183674, "step": 11975 }, { "epoch": 2.220244716351502, "grad_norm": 7.28125, "learning_rate": 7.7797552836485e-06, "loss": 2.5241, "mean_token_accuracy": 0.49740932642487046, "step": 11976 }, { "epoch": 2.2204301075268815, "grad_norm": 7.671875, "learning_rate": 7.779569892473118e-06, "loss": 2.6926, "mean_token_accuracy": 0.4974965862539827, "step": 11977 }, { "epoch": 2.2206154987022617, "grad_norm": 7.23828125, "learning_rate": 7.779384501297739e-06, "loss": 2.7638, "mean_token_accuracy": 0.47655088525476413, "step": 11978 }, { "epoch": 2.220800889877642, "grad_norm": 7.7890625, "learning_rate": 7.77919911012236e-06, "loss": 3.4153, "mean_token_accuracy": 0.4239278885550396, "step": 11979 }, { "epoch": 2.2209862810530216, "grad_norm": 8.9375, "learning_rate": 7.779013718946978e-06, "loss": 2.6558, "mean_token_accuracy": 0.4710344827586207, "step": 11980 }, { "epoch": 2.221171672228402, "grad_norm": 6.18359375, "learning_rate": 7.778828327771599e-06, "loss": 3.0493, "mean_token_accuracy": 0.46605482997999764, "step": 11981 }, { "epoch": 2.221357063403782, "grad_norm": 6.828125, "learning_rate": 7.778642936596219e-06, "loss": 2.2945, "mean_token_accuracy": 0.5442914585110272, "step": 11982 }, { "epoch": 2.221542454579162, "grad_norm": 7.3125, "learning_rate": 7.77845754542084e-06, "loss": 3.2996, "mean_token_accuracy": 0.4552517555655162, "step": 11983 }, { "epoch": 2.221727845754542, "grad_norm": 6.66796875, "learning_rate": 7.778272154245458e-06, "loss": 3.1781, "mean_token_accuracy": 0.4547076060644668, "step": 11984 }, { "epoch": 2.221913236929922, "grad_norm": 8.9375, "learning_rate": 7.778086763070079e-06, "loss": 2.6273, "mean_token_accuracy": 0.5001198753296572, "step": 11985 }, { "epoch": 2.2220986281053023, "grad_norm": 9.21875, "learning_rate": 7.777901371894698e-06, "loss": 3.7597, "mean_token_accuracy": 0.4258783204798629, "step": 11986 }, { "epoch": 2.222284019280682, "grad_norm": 7.4609375, "learning_rate": 7.777715980719318e-06, "loss": 2.4037, "mean_token_accuracy": 0.5353858456617353, "step": 11987 }, { "epoch": 2.2224694104560623, "grad_norm": 7.828125, "learning_rate": 7.777530589543939e-06, "loss": 2.5603, "mean_token_accuracy": 0.49210084033613444, "step": 11988 }, { "epoch": 2.2226548016314425, "grad_norm": 8.234375, "learning_rate": 7.77734519836856e-06, "loss": 2.2506, "mean_token_accuracy": 0.5515804098645363, "step": 11989 }, { "epoch": 2.222840192806822, "grad_norm": 8.796875, "learning_rate": 7.777159807193178e-06, "loss": 2.2632, "mean_token_accuracy": 0.5265197060788244, "step": 11990 }, { "epoch": 2.2230255839822024, "grad_norm": 6.7578125, "learning_rate": 7.776974416017798e-06, "loss": 2.2753, "mean_token_accuracy": 0.5423159913689762, "step": 11991 }, { "epoch": 2.2232109751575826, "grad_norm": 7.55859375, "learning_rate": 7.776789024842419e-06, "loss": 2.8279, "mean_token_accuracy": 0.4670143783478996, "step": 11992 }, { "epoch": 2.2233963663329623, "grad_norm": 7.265625, "learning_rate": 7.776603633667038e-06, "loss": 3.4709, "mean_token_accuracy": 0.4638262716865337, "step": 11993 }, { "epoch": 2.2235817575083425, "grad_norm": 6.46875, "learning_rate": 7.776418242491658e-06, "loss": 2.8486, "mean_token_accuracy": 0.4538021259198692, "step": 11994 }, { "epoch": 2.2237671486837227, "grad_norm": 6.4140625, "learning_rate": 7.776232851316277e-06, "loss": 2.6459, "mean_token_accuracy": 0.4869117869571796, "step": 11995 }, { "epoch": 2.223952539859103, "grad_norm": 7.23046875, "learning_rate": 7.776047460140898e-06, "loss": 2.3697, "mean_token_accuracy": 0.5180772960243801, "step": 11996 }, { "epoch": 2.2241379310344827, "grad_norm": 8.7578125, "learning_rate": 7.775862068965518e-06, "loss": 3.4246, "mean_token_accuracy": 0.5109034267912772, "step": 11997 }, { "epoch": 2.224323322209863, "grad_norm": 9.59375, "learning_rate": 7.775676677790139e-06, "loss": 2.6548, "mean_token_accuracy": 0.47973238882329794, "step": 11998 }, { "epoch": 2.224508713385243, "grad_norm": 8.46875, "learning_rate": 7.775491286614757e-06, "loss": 2.9374, "mean_token_accuracy": 0.4498797113071371, "step": 11999 }, { "epoch": 2.2246941045606228, "grad_norm": 6.48046875, "learning_rate": 7.775305895439378e-06, "loss": 2.2535, "mean_token_accuracy": 0.5514945829526171, "step": 12000 }, { "epoch": 2.224879495736003, "grad_norm": 9.5, "learning_rate": 7.775120504263998e-06, "loss": 2.5706, "mean_token_accuracy": 0.49743975903614457, "step": 12001 }, { "epoch": 2.225064886911383, "grad_norm": 8.1015625, "learning_rate": 7.774935113088617e-06, "loss": 2.3838, "mean_token_accuracy": 0.5169693361119381, "step": 12002 }, { "epoch": 2.225250278086763, "grad_norm": 11.171875, "learning_rate": 7.774749721913238e-06, "loss": 2.1912, "mean_token_accuracy": 0.5551322277005827, "step": 12003 }, { "epoch": 2.225435669262143, "grad_norm": 9.7109375, "learning_rate": 7.774564330737856e-06, "loss": 3.0355, "mean_token_accuracy": 0.45807275047862156, "step": 12004 }, { "epoch": 2.2256210604375233, "grad_norm": 8.671875, "learning_rate": 7.774378939562477e-06, "loss": 3.3282, "mean_token_accuracy": 0.4469732890785522, "step": 12005 }, { "epoch": 2.225806451612903, "grad_norm": 7.28515625, "learning_rate": 7.774193548387097e-06, "loss": 2.9265, "mean_token_accuracy": 0.46012781328146707, "step": 12006 }, { "epoch": 2.225991842788283, "grad_norm": 10.4765625, "learning_rate": 7.774008157211718e-06, "loss": 1.8749, "mean_token_accuracy": 0.5751047973917094, "step": 12007 }, { "epoch": 2.2261772339636634, "grad_norm": 7.91015625, "learning_rate": 7.773822766036337e-06, "loss": 2.7655, "mean_token_accuracy": 0.4764431928950504, "step": 12008 }, { "epoch": 2.2263626251390436, "grad_norm": 8.171875, "learning_rate": 7.773637374860957e-06, "loss": 2.9449, "mean_token_accuracy": 0.4619849100406268, "step": 12009 }, { "epoch": 2.2265480163144233, "grad_norm": 6.953125, "learning_rate": 7.773451983685578e-06, "loss": 2.7073, "mean_token_accuracy": 0.46598157335223245, "step": 12010 }, { "epoch": 2.2267334074898035, "grad_norm": 7.734375, "learning_rate": 7.773266592510196e-06, "loss": 2.5881, "mean_token_accuracy": 0.5057515337423313, "step": 12011 }, { "epoch": 2.2269187986651837, "grad_norm": 7.46875, "learning_rate": 7.773081201334817e-06, "loss": 2.5666, "mean_token_accuracy": 0.47752207653197754, "step": 12012 }, { "epoch": 2.2271041898405635, "grad_norm": 5.50390625, "learning_rate": 7.772895810159436e-06, "loss": 1.8873, "mean_token_accuracy": 0.6180646700871018, "step": 12013 }, { "epoch": 2.2272895810159437, "grad_norm": 9.953125, "learning_rate": 7.772710418984058e-06, "loss": 2.5095, "mean_token_accuracy": 0.5322902796271638, "step": 12014 }, { "epoch": 2.227474972191324, "grad_norm": 9.7890625, "learning_rate": 7.772525027808677e-06, "loss": 3.0417, "mean_token_accuracy": 0.45217391304347826, "step": 12015 }, { "epoch": 2.2276603633667036, "grad_norm": 8.0078125, "learning_rate": 7.772339636633297e-06, "loss": 2.579, "mean_token_accuracy": 0.4932673811486672, "step": 12016 }, { "epoch": 2.227845754542084, "grad_norm": 7.11328125, "learning_rate": 7.772154245457918e-06, "loss": 2.8692, "mean_token_accuracy": 0.4742093373493976, "step": 12017 }, { "epoch": 2.228031145717464, "grad_norm": 8.3359375, "learning_rate": 7.771968854282536e-06, "loss": 3.4776, "mean_token_accuracy": 0.4562784915043514, "step": 12018 }, { "epoch": 2.2282165368928437, "grad_norm": 7.9453125, "learning_rate": 7.771783463107157e-06, "loss": 3.1038, "mean_token_accuracy": 0.45587382581468167, "step": 12019 }, { "epoch": 2.228401928068224, "grad_norm": 8.046875, "learning_rate": 7.771598071931776e-06, "loss": 3.0998, "mean_token_accuracy": 0.45685005393743255, "step": 12020 }, { "epoch": 2.228587319243604, "grad_norm": 7.30859375, "learning_rate": 7.771412680756396e-06, "loss": 2.3569, "mean_token_accuracy": 0.5152247462542291, "step": 12021 }, { "epoch": 2.2287727104189843, "grad_norm": 10.203125, "learning_rate": 7.771227289581017e-06, "loss": 3.1015, "mean_token_accuracy": 0.47266265718972117, "step": 12022 }, { "epoch": 2.228958101594364, "grad_norm": 11.0078125, "learning_rate": 7.771041898405637e-06, "loss": 3.0635, "mean_token_accuracy": 0.4423125473365312, "step": 12023 }, { "epoch": 2.229143492769744, "grad_norm": 10.2890625, "learning_rate": 7.770856507230256e-06, "loss": 3.4326, "mean_token_accuracy": 0.41913499344692007, "step": 12024 }, { "epoch": 2.2293288839451244, "grad_norm": 8.46875, "learning_rate": 7.770671116054877e-06, "loss": 3.1609, "mean_token_accuracy": 0.4506513026052104, "step": 12025 }, { "epoch": 2.229514275120504, "grad_norm": 6.640625, "learning_rate": 7.770485724879497e-06, "loss": 2.8656, "mean_token_accuracy": 0.4934078212290503, "step": 12026 }, { "epoch": 2.2296996662958843, "grad_norm": 18.140625, "learning_rate": 7.770300333704116e-06, "loss": 2.5152, "mean_token_accuracy": 0.5103658536585366, "step": 12027 }, { "epoch": 2.2298850574712645, "grad_norm": 11.875, "learning_rate": 7.770114942528736e-06, "loss": 2.8123, "mean_token_accuracy": 0.4768059744363062, "step": 12028 }, { "epoch": 2.2300704486466443, "grad_norm": 9.28125, "learning_rate": 7.769929551353355e-06, "loss": 2.5078, "mean_token_accuracy": 0.5463003343509231, "step": 12029 }, { "epoch": 2.2302558398220245, "grad_norm": 6.39453125, "learning_rate": 7.769744160177977e-06, "loss": 2.1212, "mean_token_accuracy": 0.5544589047078348, "step": 12030 }, { "epoch": 2.2304412309974047, "grad_norm": 8.2421875, "learning_rate": 7.769558769002596e-06, "loss": 3.4837, "mean_token_accuracy": 0.44278825076720735, "step": 12031 }, { "epoch": 2.2306266221727844, "grad_norm": 9.6484375, "learning_rate": 7.769373377827217e-06, "loss": 2.9991, "mean_token_accuracy": 0.48059701492537316, "step": 12032 }, { "epoch": 2.2308120133481646, "grad_norm": 13.90625, "learning_rate": 7.769187986651835e-06, "loss": 2.549, "mean_token_accuracy": 0.5010088138472975, "step": 12033 }, { "epoch": 2.230997404523545, "grad_norm": 8.4140625, "learning_rate": 7.769002595476456e-06, "loss": 3.4452, "mean_token_accuracy": 0.42181244692466335, "step": 12034 }, { "epoch": 2.2311827956989245, "grad_norm": 10.8984375, "learning_rate": 7.768817204301076e-06, "loss": 3.2512, "mean_token_accuracy": 0.448995877111318, "step": 12035 }, { "epoch": 2.2313681868743047, "grad_norm": 17.640625, "learning_rate": 7.768631813125695e-06, "loss": 2.7173, "mean_token_accuracy": 0.47703885731838425, "step": 12036 }, { "epoch": 2.231553578049685, "grad_norm": 9.109375, "learning_rate": 7.768446421950316e-06, "loss": 3.429, "mean_token_accuracy": 0.43478260869565216, "step": 12037 }, { "epoch": 2.231738969225065, "grad_norm": 17.046875, "learning_rate": 7.768261030774936e-06, "loss": 2.5474, "mean_token_accuracy": 0.5048292749285811, "step": 12038 }, { "epoch": 2.231924360400445, "grad_norm": 10.8125, "learning_rate": 7.768075639599557e-06, "loss": 2.8851, "mean_token_accuracy": 0.4807533681605719, "step": 12039 }, { "epoch": 2.232109751575825, "grad_norm": 9.375, "learning_rate": 7.767890248424175e-06, "loss": 3.0266, "mean_token_accuracy": 0.4669542229024628, "step": 12040 }, { "epoch": 2.232295142751205, "grad_norm": 8.421875, "learning_rate": 7.767704857248796e-06, "loss": 3.1411, "mean_token_accuracy": 0.4681952662721893, "step": 12041 }, { "epoch": 2.232480533926585, "grad_norm": 7.171875, "learning_rate": 7.767519466073415e-06, "loss": 2.8008, "mean_token_accuracy": 0.48014359434597265, "step": 12042 }, { "epoch": 2.232665925101965, "grad_norm": 6.9296875, "learning_rate": 7.767334074898035e-06, "loss": 2.5464, "mean_token_accuracy": 0.5203527815468114, "step": 12043 }, { "epoch": 2.2328513162773453, "grad_norm": 6.921875, "learning_rate": 7.767148683722656e-06, "loss": 2.9109, "mean_token_accuracy": 0.4917730683164407, "step": 12044 }, { "epoch": 2.233036707452725, "grad_norm": 9.15625, "learning_rate": 7.766963292547275e-06, "loss": 3.1433, "mean_token_accuracy": 0.45906286476571617, "step": 12045 }, { "epoch": 2.2332220986281053, "grad_norm": 7.26171875, "learning_rate": 7.766777901371895e-06, "loss": 2.3254, "mean_token_accuracy": 0.5052090380192126, "step": 12046 }, { "epoch": 2.2334074898034855, "grad_norm": 8.515625, "learning_rate": 7.766592510196515e-06, "loss": 2.9677, "mean_token_accuracy": 0.47182398864442865, "step": 12047 }, { "epoch": 2.233592880978865, "grad_norm": 6.26953125, "learning_rate": 7.766407119021136e-06, "loss": 2.7347, "mean_token_accuracy": 0.4868785295094674, "step": 12048 }, { "epoch": 2.2337782721542454, "grad_norm": 8.9296875, "learning_rate": 7.766221727845755e-06, "loss": 2.5619, "mean_token_accuracy": 0.47712814298697365, "step": 12049 }, { "epoch": 2.2339636633296256, "grad_norm": 7.28515625, "learning_rate": 7.766036336670375e-06, "loss": 3.0943, "mean_token_accuracy": 0.4607336139506915, "step": 12050 }, { "epoch": 2.2341490545050053, "grad_norm": 6.921875, "learning_rate": 7.765850945494994e-06, "loss": 2.6011, "mean_token_accuracy": 0.49840425531914895, "step": 12051 }, { "epoch": 2.2343344456803855, "grad_norm": 7.171875, "learning_rate": 7.765665554319615e-06, "loss": 2.7287, "mean_token_accuracy": 0.49435566632458583, "step": 12052 }, { "epoch": 2.2345198368557657, "grad_norm": 6.5234375, "learning_rate": 7.765480163144235e-06, "loss": 3.3745, "mean_token_accuracy": 0.4345413764608665, "step": 12053 }, { "epoch": 2.234705228031146, "grad_norm": 7.18359375, "learning_rate": 7.765294771968856e-06, "loss": 2.552, "mean_token_accuracy": 0.50757977313686, "step": 12054 }, { "epoch": 2.2348906192065257, "grad_norm": 10.3203125, "learning_rate": 7.765109380793476e-06, "loss": 2.6136, "mean_token_accuracy": 0.514277245331489, "step": 12055 }, { "epoch": 2.235076010381906, "grad_norm": 6.7578125, "learning_rate": 7.764923989618095e-06, "loss": 2.9183, "mean_token_accuracy": 0.4629731970157502, "step": 12056 }, { "epoch": 2.235261401557286, "grad_norm": 7.12890625, "learning_rate": 7.764738598442715e-06, "loss": 2.9702, "mean_token_accuracy": 0.4732010660349423, "step": 12057 }, { "epoch": 2.2354467927326658, "grad_norm": 10.5, "learning_rate": 7.764553207267334e-06, "loss": 1.9867, "mean_token_accuracy": 0.562273276904474, "step": 12058 }, { "epoch": 2.235632183908046, "grad_norm": 9.25, "learning_rate": 7.764367816091955e-06, "loss": 2.9177, "mean_token_accuracy": 0.4767565919829294, "step": 12059 }, { "epoch": 2.235817575083426, "grad_norm": 7.30078125, "learning_rate": 7.764182424916575e-06, "loss": 2.6563, "mean_token_accuracy": 0.5096551724137931, "step": 12060 }, { "epoch": 2.236002966258806, "grad_norm": 7.02734375, "learning_rate": 7.763997033741194e-06, "loss": 2.6784, "mean_token_accuracy": 0.4756410256410256, "step": 12061 }, { "epoch": 2.236188357434186, "grad_norm": 7.34765625, "learning_rate": 7.763811642565814e-06, "loss": 2.4739, "mean_token_accuracy": 0.5207944548120501, "step": 12062 }, { "epoch": 2.2363737486095663, "grad_norm": 9.4453125, "learning_rate": 7.763626251390435e-06, "loss": 2.8722, "mean_token_accuracy": 0.47250621718706826, "step": 12063 }, { "epoch": 2.236559139784946, "grad_norm": 6.89453125, "learning_rate": 7.763440860215055e-06, "loss": 2.2982, "mean_token_accuracy": 0.5625094711319897, "step": 12064 }, { "epoch": 2.236744530960326, "grad_norm": 10.359375, "learning_rate": 7.763255469039674e-06, "loss": 3.4173, "mean_token_accuracy": 0.4757344237598192, "step": 12065 }, { "epoch": 2.2369299221357064, "grad_norm": 7.4375, "learning_rate": 7.763070077864295e-06, "loss": 2.9677, "mean_token_accuracy": 0.47505197505197505, "step": 12066 }, { "epoch": 2.2371153133110866, "grad_norm": 7.0703125, "learning_rate": 7.762884686688913e-06, "loss": 2.7488, "mean_token_accuracy": 0.4832523315381584, "step": 12067 }, { "epoch": 2.2373007044864663, "grad_norm": 6.10546875, "learning_rate": 7.762699295513534e-06, "loss": 3.1267, "mean_token_accuracy": 0.4605229428351705, "step": 12068 }, { "epoch": 2.2374860956618465, "grad_norm": 7.4765625, "learning_rate": 7.762513904338154e-06, "loss": 2.6699, "mean_token_accuracy": 0.4881410654414504, "step": 12069 }, { "epoch": 2.2376714868372267, "grad_norm": 8.5859375, "learning_rate": 7.762328513162775e-06, "loss": 2.474, "mean_token_accuracy": 0.5048532335399368, "step": 12070 }, { "epoch": 2.2378568780126065, "grad_norm": 6.453125, "learning_rate": 7.762143121987394e-06, "loss": 2.7844, "mean_token_accuracy": 0.48481262327416175, "step": 12071 }, { "epoch": 2.2380422691879867, "grad_norm": 7.59765625, "learning_rate": 7.761957730812014e-06, "loss": 2.5888, "mean_token_accuracy": 0.4884855879213055, "step": 12072 }, { "epoch": 2.238227660363367, "grad_norm": 8.1484375, "learning_rate": 7.761772339636635e-06, "loss": 2.2001, "mean_token_accuracy": 0.5277821501652763, "step": 12073 }, { "epoch": 2.2384130515387466, "grad_norm": 7.98828125, "learning_rate": 7.761586948461254e-06, "loss": 1.9494, "mean_token_accuracy": 0.5598180970149254, "step": 12074 }, { "epoch": 2.238598442714127, "grad_norm": 9.765625, "learning_rate": 7.761401557285874e-06, "loss": 2.6822, "mean_token_accuracy": 0.49797160243407707, "step": 12075 }, { "epoch": 2.238783833889507, "grad_norm": 8.4453125, "learning_rate": 7.761216166110493e-06, "loss": 3.0987, "mean_token_accuracy": 0.4543436025534277, "step": 12076 }, { "epoch": 2.2389692250648867, "grad_norm": 10.6953125, "learning_rate": 7.761030774935113e-06, "loss": 2.4638, "mean_token_accuracy": 0.5197465369879163, "step": 12077 }, { "epoch": 2.239154616240267, "grad_norm": 9.6328125, "learning_rate": 7.760845383759734e-06, "loss": 3.1135, "mean_token_accuracy": 0.4840090357864701, "step": 12078 }, { "epoch": 2.239340007415647, "grad_norm": 6.7734375, "learning_rate": 7.760659992584354e-06, "loss": 2.6245, "mean_token_accuracy": 0.5267494773090641, "step": 12079 }, { "epoch": 2.2395253985910273, "grad_norm": 9.4296875, "learning_rate": 7.760474601408973e-06, "loss": 2.1503, "mean_token_accuracy": 0.5302083333333333, "step": 12080 }, { "epoch": 2.239710789766407, "grad_norm": 9.734375, "learning_rate": 7.760289210233594e-06, "loss": 2.6242, "mean_token_accuracy": 0.4913735899137359, "step": 12081 }, { "epoch": 2.239896180941787, "grad_norm": 6.51953125, "learning_rate": 7.760103819058214e-06, "loss": 2.6773, "mean_token_accuracy": 0.49447317716255956, "step": 12082 }, { "epoch": 2.2400815721171674, "grad_norm": 6.20703125, "learning_rate": 7.759918427882833e-06, "loss": 3.5276, "mean_token_accuracy": 0.43000658761528326, "step": 12083 }, { "epoch": 2.240266963292547, "grad_norm": 7.87109375, "learning_rate": 7.759733036707453e-06, "loss": 2.6657, "mean_token_accuracy": 0.5113230035756854, "step": 12084 }, { "epoch": 2.2404523544679273, "grad_norm": 8.640625, "learning_rate": 7.759547645532072e-06, "loss": 2.6708, "mean_token_accuracy": 0.4780566366930613, "step": 12085 }, { "epoch": 2.2406377456433075, "grad_norm": 7.67578125, "learning_rate": 7.759362254356694e-06, "loss": 2.9049, "mean_token_accuracy": 0.465109964702688, "step": 12086 }, { "epoch": 2.2408231368186873, "grad_norm": 8.890625, "learning_rate": 7.759176863181313e-06, "loss": 2.6978, "mean_token_accuracy": 0.4842185128983308, "step": 12087 }, { "epoch": 2.2410085279940675, "grad_norm": 9.6015625, "learning_rate": 7.758991472005934e-06, "loss": 2.4653, "mean_token_accuracy": 0.5004153316720067, "step": 12088 }, { "epoch": 2.2411939191694477, "grad_norm": 7.58984375, "learning_rate": 7.758806080830552e-06, "loss": 2.8256, "mean_token_accuracy": 0.46210643633324044, "step": 12089 }, { "epoch": 2.2413793103448274, "grad_norm": 7.09765625, "learning_rate": 7.758620689655173e-06, "loss": 3.3098, "mean_token_accuracy": 0.45077978789769185, "step": 12090 }, { "epoch": 2.2415647015202076, "grad_norm": 6.796875, "learning_rate": 7.758435298479793e-06, "loss": 2.9374, "mean_token_accuracy": 0.4592112371690978, "step": 12091 }, { "epoch": 2.241750092695588, "grad_norm": 10.4921875, "learning_rate": 7.758249907304412e-06, "loss": 2.8221, "mean_token_accuracy": 0.49542786952367956, "step": 12092 }, { "epoch": 2.241935483870968, "grad_norm": 8.4375, "learning_rate": 7.758064516129033e-06, "loss": 2.9644, "mean_token_accuracy": 0.46271664651350264, "step": 12093 }, { "epoch": 2.2421208750463477, "grad_norm": 8.5, "learning_rate": 7.757879124953653e-06, "loss": 2.6776, "mean_token_accuracy": 0.49653808110781406, "step": 12094 }, { "epoch": 2.242306266221728, "grad_norm": 9.109375, "learning_rate": 7.757693733778274e-06, "loss": 2.4345, "mean_token_accuracy": 0.5036297640653358, "step": 12095 }, { "epoch": 2.242491657397108, "grad_norm": 7.43359375, "learning_rate": 7.757508342602892e-06, "loss": 3.0578, "mean_token_accuracy": 0.46436015006252607, "step": 12096 }, { "epoch": 2.242677048572488, "grad_norm": 7.640625, "learning_rate": 7.757322951427513e-06, "loss": 2.4669, "mean_token_accuracy": 0.497488138431482, "step": 12097 }, { "epoch": 2.242862439747868, "grad_norm": 7.984375, "learning_rate": 7.757137560252133e-06, "loss": 2.1927, "mean_token_accuracy": 0.5394279604383855, "step": 12098 }, { "epoch": 2.243047830923248, "grad_norm": 8.0, "learning_rate": 7.756952169076752e-06, "loss": 2.6437, "mean_token_accuracy": 0.47671607274256284, "step": 12099 }, { "epoch": 2.243233222098628, "grad_norm": 7.51171875, "learning_rate": 7.756766777901373e-06, "loss": 3.3141, "mean_token_accuracy": 0.44250816866345105, "step": 12100 }, { "epoch": 2.243418613274008, "grad_norm": 6.8671875, "learning_rate": 7.756581386725992e-06, "loss": 2.9843, "mean_token_accuracy": 0.4984126984126984, "step": 12101 }, { "epoch": 2.2436040044493883, "grad_norm": 8.296875, "learning_rate": 7.756395995550614e-06, "loss": 2.9832, "mean_token_accuracy": 0.46637533381888807, "step": 12102 }, { "epoch": 2.243789395624768, "grad_norm": 8.4921875, "learning_rate": 7.756210604375233e-06, "loss": 2.8825, "mean_token_accuracy": 0.48408910103420844, "step": 12103 }, { "epoch": 2.2439747868001483, "grad_norm": 7.58984375, "learning_rate": 7.756025213199853e-06, "loss": 2.5594, "mean_token_accuracy": 0.48502304147465436, "step": 12104 }, { "epoch": 2.2441601779755285, "grad_norm": 6.08984375, "learning_rate": 7.755839822024472e-06, "loss": 2.5902, "mean_token_accuracy": 0.49101961867919314, "step": 12105 }, { "epoch": 2.244345569150908, "grad_norm": 7.35546875, "learning_rate": 7.755654430849092e-06, "loss": 2.868, "mean_token_accuracy": 0.5196820590461771, "step": 12106 }, { "epoch": 2.2445309603262884, "grad_norm": 11.3671875, "learning_rate": 7.755469039673713e-06, "loss": 2.7644, "mean_token_accuracy": 0.4795240302889816, "step": 12107 }, { "epoch": 2.2447163515016686, "grad_norm": 7.23828125, "learning_rate": 7.755283648498332e-06, "loss": 3.1238, "mean_token_accuracy": 0.4791483757682177, "step": 12108 }, { "epoch": 2.2449017426770483, "grad_norm": 6.6484375, "learning_rate": 7.755098257322952e-06, "loss": 2.3348, "mean_token_accuracy": 0.5593173685659235, "step": 12109 }, { "epoch": 2.2450871338524285, "grad_norm": 8.8046875, "learning_rate": 7.754912866147573e-06, "loss": 2.3053, "mean_token_accuracy": 0.5271523178807948, "step": 12110 }, { "epoch": 2.2452725250278087, "grad_norm": 8.4921875, "learning_rate": 7.754727474972193e-06, "loss": 2.5885, "mean_token_accuracy": 0.49887402304941053, "step": 12111 }, { "epoch": 2.245457916203189, "grad_norm": 8.390625, "learning_rate": 7.754542083796812e-06, "loss": 2.5485, "mean_token_accuracy": 0.5562073669849932, "step": 12112 }, { "epoch": 2.2456433073785687, "grad_norm": 10.6328125, "learning_rate": 7.754356692621432e-06, "loss": 2.9785, "mean_token_accuracy": 0.4877943088256536, "step": 12113 }, { "epoch": 2.245828698553949, "grad_norm": 10.1171875, "learning_rate": 7.754171301446051e-06, "loss": 2.7546, "mean_token_accuracy": 0.47619665326242056, "step": 12114 }, { "epoch": 2.246014089729329, "grad_norm": 8.796875, "learning_rate": 7.753985910270672e-06, "loss": 2.7775, "mean_token_accuracy": 0.49025300705101615, "step": 12115 }, { "epoch": 2.246199480904709, "grad_norm": 8.6640625, "learning_rate": 7.753800519095292e-06, "loss": 2.21, "mean_token_accuracy": 0.5727558230829626, "step": 12116 }, { "epoch": 2.246384872080089, "grad_norm": 9.078125, "learning_rate": 7.753615127919911e-06, "loss": 3.8653, "mean_token_accuracy": 0.4176839703365659, "step": 12117 }, { "epoch": 2.246570263255469, "grad_norm": 6.50390625, "learning_rate": 7.753429736744531e-06, "loss": 2.8779, "mean_token_accuracy": 0.466182478438493, "step": 12118 }, { "epoch": 2.246755654430849, "grad_norm": 7.23046875, "learning_rate": 7.753244345569152e-06, "loss": 3.0161, "mean_token_accuracy": 0.4658457550226831, "step": 12119 }, { "epoch": 2.246941045606229, "grad_norm": 8.4765625, "learning_rate": 7.753058954393772e-06, "loss": 2.5899, "mean_token_accuracy": 0.4954117215220849, "step": 12120 }, { "epoch": 2.2471264367816093, "grad_norm": 8.46875, "learning_rate": 7.752873563218391e-06, "loss": 2.7308, "mean_token_accuracy": 0.49852625937834943, "step": 12121 }, { "epoch": 2.247311827956989, "grad_norm": 7.86328125, "learning_rate": 7.752688172043012e-06, "loss": 4.0007, "mean_token_accuracy": 0.411214953271028, "step": 12122 }, { "epoch": 2.247497219132369, "grad_norm": 7.8984375, "learning_rate": 7.75250278086763e-06, "loss": 2.3514, "mean_token_accuracy": 0.5447880870561282, "step": 12123 }, { "epoch": 2.2476826103077494, "grad_norm": 8.8046875, "learning_rate": 7.752317389692251e-06, "loss": 2.1949, "mean_token_accuracy": 0.5611441037450006, "step": 12124 }, { "epoch": 2.2478680014831296, "grad_norm": 6.515625, "learning_rate": 7.752131998516871e-06, "loss": 2.9224, "mean_token_accuracy": 0.487448588852645, "step": 12125 }, { "epoch": 2.2480533926585093, "grad_norm": 8.1015625, "learning_rate": 7.75194660734149e-06, "loss": 2.6046, "mean_token_accuracy": 0.5054762694988384, "step": 12126 }, { "epoch": 2.2482387838338895, "grad_norm": 8.7578125, "learning_rate": 7.75176121616611e-06, "loss": 2.9216, "mean_token_accuracy": 0.4445641923750505, "step": 12127 }, { "epoch": 2.2484241750092697, "grad_norm": 6.9921875, "learning_rate": 7.751575824990731e-06, "loss": 3.2858, "mean_token_accuracy": 0.44481054365733114, "step": 12128 }, { "epoch": 2.2486095661846495, "grad_norm": 7.34375, "learning_rate": 7.751390433815352e-06, "loss": 3.1713, "mean_token_accuracy": 0.46034725480994837, "step": 12129 }, { "epoch": 2.2487949573600297, "grad_norm": 7.89453125, "learning_rate": 7.75120504263997e-06, "loss": 2.4901, "mean_token_accuracy": 0.5337311251826595, "step": 12130 }, { "epoch": 2.24898034853541, "grad_norm": 9.6875, "learning_rate": 7.751019651464591e-06, "loss": 2.5727, "mean_token_accuracy": 0.5099528548978522, "step": 12131 }, { "epoch": 2.2491657397107896, "grad_norm": 10.421875, "learning_rate": 7.75083426028921e-06, "loss": 3.3868, "mean_token_accuracy": 0.4244176706827309, "step": 12132 }, { "epoch": 2.24935113088617, "grad_norm": 8.125, "learning_rate": 7.75064886911383e-06, "loss": 2.752, "mean_token_accuracy": 0.46797180892717305, "step": 12133 }, { "epoch": 2.24953652206155, "grad_norm": 8.359375, "learning_rate": 7.75046347793845e-06, "loss": 3.1624, "mean_token_accuracy": 0.4776416378678396, "step": 12134 }, { "epoch": 2.2497219132369297, "grad_norm": 7.3984375, "learning_rate": 7.750278086763071e-06, "loss": 2.8279, "mean_token_accuracy": 0.48582839474362277, "step": 12135 }, { "epoch": 2.24990730441231, "grad_norm": 8.9296875, "learning_rate": 7.750092695587692e-06, "loss": 2.7376, "mean_token_accuracy": 0.48172189955585926, "step": 12136 }, { "epoch": 2.25009269558769, "grad_norm": 8.609375, "learning_rate": 7.74990730441231e-06, "loss": 2.4682, "mean_token_accuracy": 0.5165595650024716, "step": 12137 }, { "epoch": 2.2502780867630703, "grad_norm": 6.04296875, "learning_rate": 7.749721913236931e-06, "loss": 2.1244, "mean_token_accuracy": 0.5573440643863179, "step": 12138 }, { "epoch": 2.25046347793845, "grad_norm": 7.4609375, "learning_rate": 7.74953652206155e-06, "loss": 2.424, "mean_token_accuracy": 0.5202831472910427, "step": 12139 }, { "epoch": 2.25064886911383, "grad_norm": 6.15234375, "learning_rate": 7.74935113088617e-06, "loss": 2.0929, "mean_token_accuracy": 0.5698474827911999, "step": 12140 }, { "epoch": 2.2508342602892104, "grad_norm": 8.984375, "learning_rate": 7.74916573971079e-06, "loss": 3.034, "mean_token_accuracy": 0.47167344567112146, "step": 12141 }, { "epoch": 2.25101965146459, "grad_norm": 6.28125, "learning_rate": 7.74898034853541e-06, "loss": 2.2375, "mean_token_accuracy": 0.5411836485661989, "step": 12142 }, { "epoch": 2.2512050426399703, "grad_norm": 6.4921875, "learning_rate": 7.74879495736003e-06, "loss": 2.4503, "mean_token_accuracy": 0.5004016870857602, "step": 12143 }, { "epoch": 2.2513904338153505, "grad_norm": 8.1328125, "learning_rate": 7.74860956618465e-06, "loss": 3.0079, "mean_token_accuracy": 0.4508568917533324, "step": 12144 }, { "epoch": 2.2515758249907303, "grad_norm": 7.4375, "learning_rate": 7.748424175009271e-06, "loss": 2.8543, "mean_token_accuracy": 0.4930673457838144, "step": 12145 }, { "epoch": 2.2517612161661105, "grad_norm": 6.7421875, "learning_rate": 7.74823878383389e-06, "loss": 2.0357, "mean_token_accuracy": 0.5958986731001207, "step": 12146 }, { "epoch": 2.2519466073414907, "grad_norm": 11.21875, "learning_rate": 7.74805339265851e-06, "loss": 2.5374, "mean_token_accuracy": 0.5060950554718532, "step": 12147 }, { "epoch": 2.2521319985168704, "grad_norm": 7.23828125, "learning_rate": 7.74786800148313e-06, "loss": 2.9639, "mean_token_accuracy": 0.45950351053159477, "step": 12148 }, { "epoch": 2.2523173896922506, "grad_norm": 8.2421875, "learning_rate": 7.74768261030775e-06, "loss": 2.5864, "mean_token_accuracy": 0.5086425443650611, "step": 12149 }, { "epoch": 2.252502780867631, "grad_norm": 9.9140625, "learning_rate": 7.74749721913237e-06, "loss": 2.9266, "mean_token_accuracy": 0.46787792423702645, "step": 12150 }, { "epoch": 2.252688172043011, "grad_norm": 6.40625, "learning_rate": 7.74731182795699e-06, "loss": 2.6967, "mean_token_accuracy": 0.48873373532211994, "step": 12151 }, { "epoch": 2.2528735632183907, "grad_norm": 17.203125, "learning_rate": 7.74712643678161e-06, "loss": 3.5208, "mean_token_accuracy": 0.43896781675848073, "step": 12152 }, { "epoch": 2.253058954393771, "grad_norm": 8.3046875, "learning_rate": 7.74694104560623e-06, "loss": 3.5203, "mean_token_accuracy": 0.4177996115617503, "step": 12153 }, { "epoch": 2.253244345569151, "grad_norm": 11.734375, "learning_rate": 7.74675565443085e-06, "loss": 3.2452, "mean_token_accuracy": 0.4503130335799658, "step": 12154 }, { "epoch": 2.253429736744531, "grad_norm": 10.2421875, "learning_rate": 7.74657026325547e-06, "loss": 2.8815, "mean_token_accuracy": 0.4617801047120419, "step": 12155 }, { "epoch": 2.253615127919911, "grad_norm": 6.41796875, "learning_rate": 7.74638487208009e-06, "loss": 2.6623, "mean_token_accuracy": 0.4972477064220184, "step": 12156 }, { "epoch": 2.253800519095291, "grad_norm": 9.6796875, "learning_rate": 7.746199480904709e-06, "loss": 2.7382, "mean_token_accuracy": 0.4926342467923333, "step": 12157 }, { "epoch": 2.253985910270671, "grad_norm": 8.859375, "learning_rate": 7.746014089729329e-06, "loss": 3.59, "mean_token_accuracy": 0.424284346067894, "step": 12158 }, { "epoch": 2.254171301446051, "grad_norm": 6.8984375, "learning_rate": 7.74582869855395e-06, "loss": 3.1383, "mean_token_accuracy": 0.43444165621079045, "step": 12159 }, { "epoch": 2.2543566926214313, "grad_norm": 9.875, "learning_rate": 7.74564330737857e-06, "loss": 2.2417, "mean_token_accuracy": 0.5447269910933265, "step": 12160 }, { "epoch": 2.254542083796811, "grad_norm": 11.8984375, "learning_rate": 7.745457916203189e-06, "loss": 2.4905, "mean_token_accuracy": 0.5002277904328019, "step": 12161 }, { "epoch": 2.2547274749721913, "grad_norm": 7.6171875, "learning_rate": 7.74527252502781e-06, "loss": 2.9058, "mean_token_accuracy": 0.462475442043222, "step": 12162 }, { "epoch": 2.2549128661475715, "grad_norm": 8.1953125, "learning_rate": 7.74508713385243e-06, "loss": 3.0764, "mean_token_accuracy": 0.45381335859782096, "step": 12163 }, { "epoch": 2.2550982573229517, "grad_norm": 7.484375, "learning_rate": 7.744901742677049e-06, "loss": 2.5664, "mean_token_accuracy": 0.5121342708097074, "step": 12164 }, { "epoch": 2.2552836484983314, "grad_norm": 7.671875, "learning_rate": 7.744716351501669e-06, "loss": 3.2198, "mean_token_accuracy": 0.45454545454545453, "step": 12165 }, { "epoch": 2.2554690396737116, "grad_norm": 7.60546875, "learning_rate": 7.744530960326288e-06, "loss": 2.3133, "mean_token_accuracy": 0.5286697247706422, "step": 12166 }, { "epoch": 2.2556544308490913, "grad_norm": 7.078125, "learning_rate": 7.74434556915091e-06, "loss": 2.6853, "mean_token_accuracy": 0.49432081594807603, "step": 12167 }, { "epoch": 2.2558398220244715, "grad_norm": 9.71875, "learning_rate": 7.744160177975529e-06, "loss": 2.9387, "mean_token_accuracy": 0.4880048587913757, "step": 12168 }, { "epoch": 2.2560252131998517, "grad_norm": 8.2421875, "learning_rate": 7.74397478680015e-06, "loss": 2.2322, "mean_token_accuracy": 0.5385499253131104, "step": 12169 }, { "epoch": 2.256210604375232, "grad_norm": 7.57421875, "learning_rate": 7.743789395624768e-06, "loss": 2.5122, "mean_token_accuracy": 0.5107234314980794, "step": 12170 }, { "epoch": 2.2563959955506117, "grad_norm": 7.1875, "learning_rate": 7.743604004449389e-06, "loss": 3.2273, "mean_token_accuracy": 0.4438337801608579, "step": 12171 }, { "epoch": 2.256581386725992, "grad_norm": 7.125, "learning_rate": 7.743418613274009e-06, "loss": 3.1777, "mean_token_accuracy": 0.46206237304337433, "step": 12172 }, { "epoch": 2.256766777901372, "grad_norm": 6.50390625, "learning_rate": 7.743233222098628e-06, "loss": 3.0609, "mean_token_accuracy": 0.4763869286722397, "step": 12173 }, { "epoch": 2.256952169076752, "grad_norm": 7.70703125, "learning_rate": 7.743047830923248e-06, "loss": 2.8586, "mean_token_accuracy": 0.4877212237618329, "step": 12174 }, { "epoch": 2.257137560252132, "grad_norm": 11.015625, "learning_rate": 7.742862439747869e-06, "loss": 2.8551, "mean_token_accuracy": 0.47454989997777286, "step": 12175 }, { "epoch": 2.257322951427512, "grad_norm": 7.48046875, "learning_rate": 7.74267704857249e-06, "loss": 3.2161, "mean_token_accuracy": 0.43216928469427496, "step": 12176 }, { "epoch": 2.2575083426028923, "grad_norm": 7.57421875, "learning_rate": 7.742491657397108e-06, "loss": 2.2127, "mean_token_accuracy": 0.5366089965397924, "step": 12177 }, { "epoch": 2.257693733778272, "grad_norm": 6.8828125, "learning_rate": 7.742306266221729e-06, "loss": 3.2162, "mean_token_accuracy": 0.4553827751196172, "step": 12178 }, { "epoch": 2.2578791249536523, "grad_norm": 6.36328125, "learning_rate": 7.74212087504635e-06, "loss": 2.559, "mean_token_accuracy": 0.4988824318283415, "step": 12179 }, { "epoch": 2.258064516129032, "grad_norm": 7.27734375, "learning_rate": 7.741935483870968e-06, "loss": 2.1841, "mean_token_accuracy": 0.587200846225043, "step": 12180 }, { "epoch": 2.258249907304412, "grad_norm": 7.546875, "learning_rate": 7.741750092695588e-06, "loss": 2.4472, "mean_token_accuracy": 0.5087586505190311, "step": 12181 }, { "epoch": 2.2584352984797924, "grad_norm": 6.33203125, "learning_rate": 7.741564701520207e-06, "loss": 2.6842, "mean_token_accuracy": 0.4960493559909081, "step": 12182 }, { "epoch": 2.2586206896551726, "grad_norm": 8.21875, "learning_rate": 7.74137931034483e-06, "loss": 3.0518, "mean_token_accuracy": 0.45153933865450396, "step": 12183 }, { "epoch": 2.2588060808305523, "grad_norm": 6.0859375, "learning_rate": 7.741193919169448e-06, "loss": 2.6314, "mean_token_accuracy": 0.49850707516551995, "step": 12184 }, { "epoch": 2.2589914720059325, "grad_norm": 6.54296875, "learning_rate": 7.741008527994069e-06, "loss": 3.2296, "mean_token_accuracy": 0.4461427243622844, "step": 12185 }, { "epoch": 2.2591768631813127, "grad_norm": 7.80078125, "learning_rate": 7.740823136818688e-06, "loss": 2.5161, "mean_token_accuracy": 0.5158302063789869, "step": 12186 }, { "epoch": 2.2593622543566925, "grad_norm": 6.5546875, "learning_rate": 7.740637745643308e-06, "loss": 2.8974, "mean_token_accuracy": 0.4920493575880931, "step": 12187 }, { "epoch": 2.2595476455320727, "grad_norm": 8.1171875, "learning_rate": 7.740452354467929e-06, "loss": 2.799, "mean_token_accuracy": 0.5207956600361664, "step": 12188 }, { "epoch": 2.259733036707453, "grad_norm": 8.140625, "learning_rate": 7.740266963292547e-06, "loss": 3.625, "mean_token_accuracy": 0.4387947269303202, "step": 12189 }, { "epoch": 2.2599184278828326, "grad_norm": 6.8125, "learning_rate": 7.740081572117168e-06, "loss": 3.3853, "mean_token_accuracy": 0.4233378561736771, "step": 12190 }, { "epoch": 2.260103819058213, "grad_norm": 8.6015625, "learning_rate": 7.739896180941788e-06, "loss": 2.965, "mean_token_accuracy": 0.48697132381825325, "step": 12191 }, { "epoch": 2.260289210233593, "grad_norm": 6.79296875, "learning_rate": 7.739710789766409e-06, "loss": 2.9977, "mean_token_accuracy": 0.46733481811432814, "step": 12192 }, { "epoch": 2.2604746014089727, "grad_norm": 6.27734375, "learning_rate": 7.739525398591028e-06, "loss": 2.9364, "mean_token_accuracy": 0.45676728334956185, "step": 12193 }, { "epoch": 2.260659992584353, "grad_norm": 8.046875, "learning_rate": 7.739340007415648e-06, "loss": 2.4516, "mean_token_accuracy": 0.5364423717521652, "step": 12194 }, { "epoch": 2.260845383759733, "grad_norm": 7.47265625, "learning_rate": 7.739154616240267e-06, "loss": 2.6279, "mean_token_accuracy": 0.4958997369642581, "step": 12195 }, { "epoch": 2.2610307749351133, "grad_norm": 6.3359375, "learning_rate": 7.738969225064887e-06, "loss": 3.505, "mean_token_accuracy": 0.4298963447899618, "step": 12196 }, { "epoch": 2.261216166110493, "grad_norm": 7.1484375, "learning_rate": 7.738783833889508e-06, "loss": 2.7582, "mean_token_accuracy": 0.5571134791549822, "step": 12197 }, { "epoch": 2.261401557285873, "grad_norm": 6.9296875, "learning_rate": 7.738598442714127e-06, "loss": 2.4797, "mean_token_accuracy": 0.5063159265529366, "step": 12198 }, { "epoch": 2.2615869484612534, "grad_norm": 6.3671875, "learning_rate": 7.738413051538747e-06, "loss": 3.014, "mean_token_accuracy": 0.4521922219469903, "step": 12199 }, { "epoch": 2.261772339636633, "grad_norm": 6.265625, "learning_rate": 7.738227660363368e-06, "loss": 2.4482, "mean_token_accuracy": 0.5417820286936823, "step": 12200 }, { "epoch": 2.2619577308120133, "grad_norm": 6.5625, "learning_rate": 7.738042269187988e-06, "loss": 3.1296, "mean_token_accuracy": 0.4521158129175947, "step": 12201 }, { "epoch": 2.2621431219873935, "grad_norm": 7.515625, "learning_rate": 7.737856878012607e-06, "loss": 2.262, "mean_token_accuracy": 0.5319122413363251, "step": 12202 }, { "epoch": 2.2623285131627733, "grad_norm": 6.67578125, "learning_rate": 7.737671486837227e-06, "loss": 2.3516, "mean_token_accuracy": 0.5136397889977392, "step": 12203 }, { "epoch": 2.2625139043381535, "grad_norm": 6.53125, "learning_rate": 7.737486095661846e-06, "loss": 2.9853, "mean_token_accuracy": 0.47758647432914536, "step": 12204 }, { "epoch": 2.2626992955135337, "grad_norm": 6.67578125, "learning_rate": 7.737300704486467e-06, "loss": 3.0608, "mean_token_accuracy": 0.4665823984831727, "step": 12205 }, { "epoch": 2.2628846866889134, "grad_norm": 8.390625, "learning_rate": 7.737115313311087e-06, "loss": 2.5472, "mean_token_accuracy": 0.5084219858156028, "step": 12206 }, { "epoch": 2.2630700778642936, "grad_norm": 7.57421875, "learning_rate": 7.736929922135708e-06, "loss": 2.1493, "mean_token_accuracy": 0.5377178476020863, "step": 12207 }, { "epoch": 2.263255469039674, "grad_norm": 8.8046875, "learning_rate": 7.736744530960327e-06, "loss": 2.9281, "mean_token_accuracy": 0.45609968125181105, "step": 12208 }, { "epoch": 2.263440860215054, "grad_norm": 7.890625, "learning_rate": 7.736559139784947e-06, "loss": 2.4428, "mean_token_accuracy": 0.5407435701553348, "step": 12209 }, { "epoch": 2.2636262513904337, "grad_norm": 6.21875, "learning_rate": 7.736373748609567e-06, "loss": 2.4876, "mean_token_accuracy": 0.5139507620164127, "step": 12210 }, { "epoch": 2.263811642565814, "grad_norm": 6.84765625, "learning_rate": 7.736188357434186e-06, "loss": 2.7385, "mean_token_accuracy": 0.48383852894699036, "step": 12211 }, { "epoch": 2.263997033741194, "grad_norm": 6.61328125, "learning_rate": 7.736002966258807e-06, "loss": 2.9654, "mean_token_accuracy": 0.4527909395585346, "step": 12212 }, { "epoch": 2.264182424916574, "grad_norm": 6.5, "learning_rate": 7.735817575083426e-06, "loss": 2.6396, "mean_token_accuracy": 0.4938284198714679, "step": 12213 }, { "epoch": 2.264367816091954, "grad_norm": 6.37109375, "learning_rate": 7.735632183908046e-06, "loss": 2.6389, "mean_token_accuracy": 0.5134359509522567, "step": 12214 }, { "epoch": 2.2645532072673342, "grad_norm": 6.39453125, "learning_rate": 7.735446792732667e-06, "loss": 2.9048, "mean_token_accuracy": 0.4741388174807198, "step": 12215 }, { "epoch": 2.264738598442714, "grad_norm": 7.0703125, "learning_rate": 7.735261401557287e-06, "loss": 2.4109, "mean_token_accuracy": 0.5444931648181583, "step": 12216 }, { "epoch": 2.264923989618094, "grad_norm": 7.2265625, "learning_rate": 7.735076010381908e-06, "loss": 3.0247, "mean_token_accuracy": 0.4494396664060464, "step": 12217 }, { "epoch": 2.2651093807934743, "grad_norm": 6.734375, "learning_rate": 7.734890619206526e-06, "loss": 2.8215, "mean_token_accuracy": 0.47888421380707497, "step": 12218 }, { "epoch": 2.265294771968854, "grad_norm": 6.8515625, "learning_rate": 7.734705228031147e-06, "loss": 3.0242, "mean_token_accuracy": 0.4396551724137931, "step": 12219 }, { "epoch": 2.2654801631442343, "grad_norm": 7.3671875, "learning_rate": 7.734519836855766e-06, "loss": 2.7896, "mean_token_accuracy": 0.48385964912280705, "step": 12220 }, { "epoch": 2.2656655543196145, "grad_norm": 7.79296875, "learning_rate": 7.734334445680386e-06, "loss": 2.6054, "mean_token_accuracy": 0.5064251432110234, "step": 12221 }, { "epoch": 2.2658509454949947, "grad_norm": 6.3125, "learning_rate": 7.734149054505005e-06, "loss": 2.5042, "mean_token_accuracy": 0.49104436483879854, "step": 12222 }, { "epoch": 2.2660363366703744, "grad_norm": 8.484375, "learning_rate": 7.733963663329627e-06, "loss": 2.6719, "mean_token_accuracy": 0.4715068493150685, "step": 12223 }, { "epoch": 2.2662217278457546, "grad_norm": 8.4609375, "learning_rate": 7.733778272154246e-06, "loss": 2.9348, "mean_token_accuracy": 0.4837801306600586, "step": 12224 }, { "epoch": 2.266407119021135, "grad_norm": 8.15625, "learning_rate": 7.733592880978866e-06, "loss": 2.9207, "mean_token_accuracy": 0.47441664323868427, "step": 12225 }, { "epoch": 2.2665925101965145, "grad_norm": 11.5078125, "learning_rate": 7.733407489803487e-06, "loss": 3.1518, "mean_token_accuracy": 0.43690165361183636, "step": 12226 }, { "epoch": 2.2667779013718947, "grad_norm": 7.28125, "learning_rate": 7.733222098628106e-06, "loss": 2.6889, "mean_token_accuracy": 0.4771084337349398, "step": 12227 }, { "epoch": 2.266963292547275, "grad_norm": 7.98046875, "learning_rate": 7.733036707452726e-06, "loss": 2.456, "mean_token_accuracy": 0.5015974440894568, "step": 12228 }, { "epoch": 2.2671486837226547, "grad_norm": 7.9375, "learning_rate": 7.732851316277345e-06, "loss": 3.1607, "mean_token_accuracy": 0.47321071571371454, "step": 12229 }, { "epoch": 2.267334074898035, "grad_norm": 9.9296875, "learning_rate": 7.732665925101965e-06, "loss": 2.781, "mean_token_accuracy": 0.4948721117014704, "step": 12230 }, { "epoch": 2.267519466073415, "grad_norm": 7.42578125, "learning_rate": 7.732480533926586e-06, "loss": 2.3298, "mean_token_accuracy": 0.5564197373454035, "step": 12231 }, { "epoch": 2.267704857248795, "grad_norm": 9.046875, "learning_rate": 7.732295142751206e-06, "loss": 3.0407, "mean_token_accuracy": 0.4636433710174717, "step": 12232 }, { "epoch": 2.267890248424175, "grad_norm": 7.94921875, "learning_rate": 7.732109751575825e-06, "loss": 2.1698, "mean_token_accuracy": 0.5451970126191089, "step": 12233 }, { "epoch": 2.268075639599555, "grad_norm": 10.1875, "learning_rate": 7.731924360400446e-06, "loss": 2.6631, "mean_token_accuracy": 0.47915849993466614, "step": 12234 }, { "epoch": 2.2682610307749353, "grad_norm": 8.375, "learning_rate": 7.731738969225066e-06, "loss": 2.3374, "mean_token_accuracy": 0.5316393240409523, "step": 12235 }, { "epoch": 2.268446421950315, "grad_norm": 7.34375, "learning_rate": 7.731553578049685e-06, "loss": 2.4193, "mean_token_accuracy": 0.5041509433962265, "step": 12236 }, { "epoch": 2.2686318131256953, "grad_norm": 5.81640625, "learning_rate": 7.731368186874306e-06, "loss": 2.3568, "mean_token_accuracy": 0.5251770766258854, "step": 12237 }, { "epoch": 2.268817204301075, "grad_norm": 6.83203125, "learning_rate": 7.731182795698924e-06, "loss": 2.3761, "mean_token_accuracy": 0.5247570569180935, "step": 12238 }, { "epoch": 2.269002595476455, "grad_norm": 9.328125, "learning_rate": 7.730997404523546e-06, "loss": 2.932, "mean_token_accuracy": 0.5094170403587444, "step": 12239 }, { "epoch": 2.2691879866518354, "grad_norm": 11.3515625, "learning_rate": 7.730812013348165e-06, "loss": 3.0937, "mean_token_accuracy": 0.45418745077448147, "step": 12240 }, { "epoch": 2.2693733778272156, "grad_norm": 6.75, "learning_rate": 7.730626622172786e-06, "loss": 2.5064, "mean_token_accuracy": 0.5114265095020447, "step": 12241 }, { "epoch": 2.2695587690025953, "grad_norm": 9.546875, "learning_rate": 7.730441230997405e-06, "loss": 3.231, "mean_token_accuracy": 0.4383599339570721, "step": 12242 }, { "epoch": 2.2697441601779755, "grad_norm": 10.1875, "learning_rate": 7.730255839822025e-06, "loss": 2.0396, "mean_token_accuracy": 0.556033920417482, "step": 12243 }, { "epoch": 2.2699295513533557, "grad_norm": 6.046875, "learning_rate": 7.730070448646646e-06, "loss": 2.5544, "mean_token_accuracy": 0.5052040696994503, "step": 12244 }, { "epoch": 2.2701149425287355, "grad_norm": 7.7421875, "learning_rate": 7.729885057471264e-06, "loss": 2.7289, "mean_token_accuracy": 0.5369396922488769, "step": 12245 }, { "epoch": 2.2703003337041157, "grad_norm": 7.44921875, "learning_rate": 7.729699666295885e-06, "loss": 3.0965, "mean_token_accuracy": 0.47558981546367673, "step": 12246 }, { "epoch": 2.270485724879496, "grad_norm": 6.4296875, "learning_rate": 7.729514275120505e-06, "loss": 2.8389, "mean_token_accuracy": 0.48094277524153306, "step": 12247 }, { "epoch": 2.270671116054876, "grad_norm": 6.046875, "learning_rate": 7.729328883945126e-06, "loss": 2.4416, "mean_token_accuracy": 0.5253339752076062, "step": 12248 }, { "epoch": 2.270856507230256, "grad_norm": 6.83203125, "learning_rate": 7.729143492769745e-06, "loss": 2.7874, "mean_token_accuracy": 0.47637238256932657, "step": 12249 }, { "epoch": 2.271041898405636, "grad_norm": 7.62109375, "learning_rate": 7.728958101594365e-06, "loss": 3.348, "mean_token_accuracy": 0.468896080546566, "step": 12250 }, { "epoch": 2.2712272895810157, "grad_norm": 6.85546875, "learning_rate": 7.728772710418984e-06, "loss": 2.5723, "mean_token_accuracy": 0.5007606490872211, "step": 12251 }, { "epoch": 2.271412680756396, "grad_norm": 7.2109375, "learning_rate": 7.728587319243604e-06, "loss": 2.6499, "mean_token_accuracy": 0.514693416219271, "step": 12252 }, { "epoch": 2.271598071931776, "grad_norm": 7.75390625, "learning_rate": 7.728401928068225e-06, "loss": 2.5572, "mean_token_accuracy": 0.5251751024183957, "step": 12253 }, { "epoch": 2.2717834631071563, "grad_norm": 8.0546875, "learning_rate": 7.728216536892844e-06, "loss": 2.6036, "mean_token_accuracy": 0.5134105232302506, "step": 12254 }, { "epoch": 2.271968854282536, "grad_norm": 8.78125, "learning_rate": 7.728031145717464e-06, "loss": 2.6388, "mean_token_accuracy": 0.5277387091733083, "step": 12255 }, { "epoch": 2.272154245457916, "grad_norm": 6.81640625, "learning_rate": 7.727845754542085e-06, "loss": 2.3917, "mean_token_accuracy": 0.5258649093904448, "step": 12256 }, { "epoch": 2.2723396366332964, "grad_norm": 6.87890625, "learning_rate": 7.727660363366705e-06, "loss": 2.8243, "mean_token_accuracy": 0.47924406396381847, "step": 12257 }, { "epoch": 2.272525027808676, "grad_norm": 6.0234375, "learning_rate": 7.727474972191324e-06, "loss": 2.6253, "mean_token_accuracy": 0.4980694980694981, "step": 12258 }, { "epoch": 2.2727104189840563, "grad_norm": 6.30859375, "learning_rate": 7.727289581015944e-06, "loss": 2.8235, "mean_token_accuracy": 0.4723259389413573, "step": 12259 }, { "epoch": 2.2728958101594365, "grad_norm": 6.7421875, "learning_rate": 7.727104189840563e-06, "loss": 2.5097, "mean_token_accuracy": 0.49756888168557534, "step": 12260 }, { "epoch": 2.2730812013348163, "grad_norm": 25.984375, "learning_rate": 7.726918798665184e-06, "loss": 2.6903, "mean_token_accuracy": 0.49481020166073547, "step": 12261 }, { "epoch": 2.2732665925101965, "grad_norm": 7.4609375, "learning_rate": 7.726733407489804e-06, "loss": 2.8883, "mean_token_accuracy": 0.4928325167846126, "step": 12262 }, { "epoch": 2.2734519836855767, "grad_norm": 6.8515625, "learning_rate": 7.726548016314423e-06, "loss": 3.4679, "mean_token_accuracy": 0.4385512584407612, "step": 12263 }, { "epoch": 2.2736373748609564, "grad_norm": 6.8515625, "learning_rate": 7.726362625139045e-06, "loss": 2.7812, "mean_token_accuracy": 0.4682325109834404, "step": 12264 }, { "epoch": 2.2738227660363366, "grad_norm": 7.72265625, "learning_rate": 7.726177233963664e-06, "loss": 3.4717, "mean_token_accuracy": 0.4509090909090909, "step": 12265 }, { "epoch": 2.274008157211717, "grad_norm": 6.859375, "learning_rate": 7.725991842788285e-06, "loss": 3.0445, "mean_token_accuracy": 0.4805668016194332, "step": 12266 }, { "epoch": 2.274193548387097, "grad_norm": 6.99609375, "learning_rate": 7.725806451612903e-06, "loss": 2.9308, "mean_token_accuracy": 0.4836852207293666, "step": 12267 }, { "epoch": 2.2743789395624767, "grad_norm": 6.8046875, "learning_rate": 7.725621060437524e-06, "loss": 2.9904, "mean_token_accuracy": 0.4756466241182398, "step": 12268 }, { "epoch": 2.274564330737857, "grad_norm": 6.66796875, "learning_rate": 7.725435669262144e-06, "loss": 2.9606, "mean_token_accuracy": 0.4692737430167598, "step": 12269 }, { "epoch": 2.274749721913237, "grad_norm": 6.3984375, "learning_rate": 7.725250278086763e-06, "loss": 2.5514, "mean_token_accuracy": 0.5100564317754305, "step": 12270 }, { "epoch": 2.274935113088617, "grad_norm": 6.05078125, "learning_rate": 7.725064886911384e-06, "loss": 2.969, "mean_token_accuracy": 0.46182231053079764, "step": 12271 }, { "epoch": 2.275120504263997, "grad_norm": 6.59375, "learning_rate": 7.724879495736004e-06, "loss": 3.2892, "mean_token_accuracy": 0.43890625, "step": 12272 }, { "epoch": 2.2753058954393772, "grad_norm": 6.33984375, "learning_rate": 7.724694104560625e-06, "loss": 2.4695, "mean_token_accuracy": 0.530638852672751, "step": 12273 }, { "epoch": 2.275491286614757, "grad_norm": 6.640625, "learning_rate": 7.724508713385243e-06, "loss": 3.0003, "mean_token_accuracy": 0.46051246051246053, "step": 12274 }, { "epoch": 2.275676677790137, "grad_norm": 6.296875, "learning_rate": 7.724323322209864e-06, "loss": 2.2532, "mean_token_accuracy": 0.5320970042796006, "step": 12275 }, { "epoch": 2.2758620689655173, "grad_norm": 6.73828125, "learning_rate": 7.724137931034483e-06, "loss": 2.2557, "mean_token_accuracy": 0.5378576854899975, "step": 12276 }, { "epoch": 2.276047460140897, "grad_norm": 7.92578125, "learning_rate": 7.723952539859103e-06, "loss": 2.788, "mean_token_accuracy": 0.48034527003382715, "step": 12277 }, { "epoch": 2.2762328513162773, "grad_norm": 7.03515625, "learning_rate": 7.723767148683724e-06, "loss": 2.9047, "mean_token_accuracy": 0.4802168815943728, "step": 12278 }, { "epoch": 2.2764182424916575, "grad_norm": 8.0078125, "learning_rate": 7.723581757508342e-06, "loss": 3.4169, "mean_token_accuracy": 0.4406285436578649, "step": 12279 }, { "epoch": 2.2766036336670377, "grad_norm": 6.578125, "learning_rate": 7.723396366332963e-06, "loss": 2.9437, "mean_token_accuracy": 0.46548428207306713, "step": 12280 }, { "epoch": 2.2767890248424174, "grad_norm": 6.44921875, "learning_rate": 7.723210975157583e-06, "loss": 2.5239, "mean_token_accuracy": 0.5050145157033518, "step": 12281 }, { "epoch": 2.2769744160177976, "grad_norm": 6.4609375, "learning_rate": 7.723025583982204e-06, "loss": 2.4356, "mean_token_accuracy": 0.5245378374783881, "step": 12282 }, { "epoch": 2.277159807193178, "grad_norm": 7.140625, "learning_rate": 7.722840192806823e-06, "loss": 2.4411, "mean_token_accuracy": 0.5283252929014473, "step": 12283 }, { "epoch": 2.2773451983685575, "grad_norm": 7.69921875, "learning_rate": 7.722654801631443e-06, "loss": 2.706, "mean_token_accuracy": 0.4868383809793377, "step": 12284 }, { "epoch": 2.2775305895439377, "grad_norm": 6.76171875, "learning_rate": 7.722469410456062e-06, "loss": 2.8171, "mean_token_accuracy": 0.48856891624532867, "step": 12285 }, { "epoch": 2.277715980719318, "grad_norm": 6.80859375, "learning_rate": 7.722284019280682e-06, "loss": 3.0878, "mean_token_accuracy": 0.45571536714610145, "step": 12286 }, { "epoch": 2.2779013718946977, "grad_norm": 7.23828125, "learning_rate": 7.722098628105303e-06, "loss": 3.3486, "mean_token_accuracy": 0.45006388415672915, "step": 12287 }, { "epoch": 2.278086763070078, "grad_norm": 6.09375, "learning_rate": 7.721913236929923e-06, "loss": 2.5485, "mean_token_accuracy": 0.5066442645486482, "step": 12288 }, { "epoch": 2.278272154245458, "grad_norm": 7.33203125, "learning_rate": 7.721727845754542e-06, "loss": 2.842, "mean_token_accuracy": 0.46078190312269485, "step": 12289 }, { "epoch": 2.278457545420838, "grad_norm": 6.47265625, "learning_rate": 7.721542454579163e-06, "loss": 3.3868, "mean_token_accuracy": 0.45248868778280543, "step": 12290 }, { "epoch": 2.278642936596218, "grad_norm": 6.671875, "learning_rate": 7.721357063403783e-06, "loss": 2.901, "mean_token_accuracy": 0.45994269340974214, "step": 12291 }, { "epoch": 2.278828327771598, "grad_norm": 8.5703125, "learning_rate": 7.721171672228402e-06, "loss": 2.54, "mean_token_accuracy": 0.5339021615472127, "step": 12292 }, { "epoch": 2.2790137189469784, "grad_norm": 6.83984375, "learning_rate": 7.720986281053023e-06, "loss": 3.0206, "mean_token_accuracy": 0.47869071476285907, "step": 12293 }, { "epoch": 2.279199110122358, "grad_norm": 6.46875, "learning_rate": 7.720800889877641e-06, "loss": 2.9749, "mean_token_accuracy": 0.4593609865470852, "step": 12294 }, { "epoch": 2.2793845012977383, "grad_norm": 6.1796875, "learning_rate": 7.720615498702262e-06, "loss": 2.7716, "mean_token_accuracy": 0.4835558001926517, "step": 12295 }, { "epoch": 2.279569892473118, "grad_norm": 6.671875, "learning_rate": 7.720430107526882e-06, "loss": 3.4171, "mean_token_accuracy": 0.42994241842610365, "step": 12296 }, { "epoch": 2.279755283648498, "grad_norm": 7.41796875, "learning_rate": 7.720244716351503e-06, "loss": 2.4573, "mean_token_accuracy": 0.5135351914978945, "step": 12297 }, { "epoch": 2.2799406748238784, "grad_norm": 6.95703125, "learning_rate": 7.720059325176123e-06, "loss": 2.8525, "mean_token_accuracy": 0.4672969966629588, "step": 12298 }, { "epoch": 2.2801260659992586, "grad_norm": 6.59765625, "learning_rate": 7.719873934000742e-06, "loss": 2.9846, "mean_token_accuracy": 0.4550363676438175, "step": 12299 }, { "epoch": 2.2803114571746383, "grad_norm": 8.6484375, "learning_rate": 7.719688542825363e-06, "loss": 2.7551, "mean_token_accuracy": 0.4882758620689655, "step": 12300 }, { "epoch": 2.2804968483500185, "grad_norm": 6.74609375, "learning_rate": 7.719503151649981e-06, "loss": 2.7783, "mean_token_accuracy": 0.47149098241290466, "step": 12301 }, { "epoch": 2.2806822395253987, "grad_norm": 7.921875, "learning_rate": 7.719317760474602e-06, "loss": 2.4022, "mean_token_accuracy": 0.5127513995438524, "step": 12302 }, { "epoch": 2.2808676307007785, "grad_norm": 7.9921875, "learning_rate": 7.71913236929922e-06, "loss": 3.5181, "mean_token_accuracy": 0.41800152555301295, "step": 12303 }, { "epoch": 2.2810530218761587, "grad_norm": 8.125, "learning_rate": 7.718946978123843e-06, "loss": 2.9149, "mean_token_accuracy": 0.49525101763907736, "step": 12304 }, { "epoch": 2.281238413051539, "grad_norm": 7.75390625, "learning_rate": 7.718761586948462e-06, "loss": 3.2494, "mean_token_accuracy": 0.4395968322534197, "step": 12305 }, { "epoch": 2.281423804226919, "grad_norm": 7.015625, "learning_rate": 7.718576195773082e-06, "loss": 2.9502, "mean_token_accuracy": 0.43871039964177766, "step": 12306 }, { "epoch": 2.281609195402299, "grad_norm": 6.875, "learning_rate": 7.718390804597703e-06, "loss": 2.6265, "mean_token_accuracy": 0.48924022837066317, "step": 12307 }, { "epoch": 2.281794586577679, "grad_norm": 7.75, "learning_rate": 7.718205413422321e-06, "loss": 2.7879, "mean_token_accuracy": 0.47962555066079293, "step": 12308 }, { "epoch": 2.2819799777530587, "grad_norm": 7.98828125, "learning_rate": 7.718020022246942e-06, "loss": 2.6771, "mean_token_accuracy": 0.5035128805620609, "step": 12309 }, { "epoch": 2.282165368928439, "grad_norm": 7.5625, "learning_rate": 7.71783463107156e-06, "loss": 2.6573, "mean_token_accuracy": 0.47474747474747475, "step": 12310 }, { "epoch": 2.282350760103819, "grad_norm": 7.1328125, "learning_rate": 7.717649239896181e-06, "loss": 3.0201, "mean_token_accuracy": 0.4600777511961722, "step": 12311 }, { "epoch": 2.2825361512791993, "grad_norm": 6.78125, "learning_rate": 7.717463848720802e-06, "loss": 3.0153, "mean_token_accuracy": 0.4663976624460832, "step": 12312 }, { "epoch": 2.282721542454579, "grad_norm": 7.4453125, "learning_rate": 7.717278457545422e-06, "loss": 3.0663, "mean_token_accuracy": 0.46137506987143656, "step": 12313 }, { "epoch": 2.2829069336299592, "grad_norm": 9.234375, "learning_rate": 7.717093066370041e-06, "loss": 2.5378, "mean_token_accuracy": 0.48985204855842185, "step": 12314 }, { "epoch": 2.2830923248053394, "grad_norm": 9.2578125, "learning_rate": 7.716907675194661e-06, "loss": 3.11, "mean_token_accuracy": 0.452896512935883, "step": 12315 }, { "epoch": 2.283277715980719, "grad_norm": 9.2890625, "learning_rate": 7.716722284019282e-06, "loss": 3.2963, "mean_token_accuracy": 0.46542280041081824, "step": 12316 }, { "epoch": 2.2834631071560993, "grad_norm": 11.84375, "learning_rate": 7.7165368928439e-06, "loss": 2.8924, "mean_token_accuracy": 0.46913073237508557, "step": 12317 }, { "epoch": 2.2836484983314795, "grad_norm": 8.921875, "learning_rate": 7.716351501668521e-06, "loss": 2.5914, "mean_token_accuracy": 0.5074664964901084, "step": 12318 }, { "epoch": 2.2838338895068593, "grad_norm": 6.6328125, "learning_rate": 7.71616611049314e-06, "loss": 2.9147, "mean_token_accuracy": 0.459067211497816, "step": 12319 }, { "epoch": 2.2840192806822395, "grad_norm": 8.7421875, "learning_rate": 7.715980719317762e-06, "loss": 2.4882, "mean_token_accuracy": 0.5079697986577181, "step": 12320 }, { "epoch": 2.2842046718576197, "grad_norm": 12.046875, "learning_rate": 7.715795328142381e-06, "loss": 3.0884, "mean_token_accuracy": 0.4631737529293606, "step": 12321 }, { "epoch": 2.2843900630329994, "grad_norm": 12.21875, "learning_rate": 7.715609936967002e-06, "loss": 2.566, "mean_token_accuracy": 0.4939542063287883, "step": 12322 }, { "epoch": 2.2845754542083796, "grad_norm": 10.828125, "learning_rate": 7.71542454579162e-06, "loss": 2.4587, "mean_token_accuracy": 0.5019508057675997, "step": 12323 }, { "epoch": 2.28476084538376, "grad_norm": 12.6875, "learning_rate": 7.71523915461624e-06, "loss": 2.8035, "mean_token_accuracy": 0.47122692725298587, "step": 12324 }, { "epoch": 2.28494623655914, "grad_norm": 13.6484375, "learning_rate": 7.715053763440861e-06, "loss": 2.815, "mean_token_accuracy": 0.481635301752109, "step": 12325 }, { "epoch": 2.2851316277345197, "grad_norm": 10.7890625, "learning_rate": 7.71486837226548e-06, "loss": 2.8068, "mean_token_accuracy": 0.45285664213109006, "step": 12326 }, { "epoch": 2.2853170189099, "grad_norm": 9.75, "learning_rate": 7.7146829810901e-06, "loss": 3.2683, "mean_token_accuracy": 0.42665635473060065, "step": 12327 }, { "epoch": 2.28550241008528, "grad_norm": 9.4921875, "learning_rate": 7.714497589914721e-06, "loss": 2.9497, "mean_token_accuracy": 0.4868401705631525, "step": 12328 }, { "epoch": 2.28568780126066, "grad_norm": 6.76171875, "learning_rate": 7.714312198739342e-06, "loss": 2.6697, "mean_token_accuracy": 0.5071115973741794, "step": 12329 }, { "epoch": 2.28587319243604, "grad_norm": 8.3125, "learning_rate": 7.71412680756396e-06, "loss": 2.6884, "mean_token_accuracy": 0.504228178300129, "step": 12330 }, { "epoch": 2.2860585836114202, "grad_norm": 6.7890625, "learning_rate": 7.713941416388581e-06, "loss": 3.3915, "mean_token_accuracy": 0.4232496986961762, "step": 12331 }, { "epoch": 2.2862439747868, "grad_norm": 7.94140625, "learning_rate": 7.7137560252132e-06, "loss": 2.9875, "mean_token_accuracy": 0.47459710461622506, "step": 12332 }, { "epoch": 2.28642936596218, "grad_norm": 7.8125, "learning_rate": 7.71357063403782e-06, "loss": 2.7082, "mean_token_accuracy": 0.4925227568270481, "step": 12333 }, { "epoch": 2.2866147571375603, "grad_norm": 10.8828125, "learning_rate": 7.71338524286244e-06, "loss": 3.1996, "mean_token_accuracy": 0.46999735659529474, "step": 12334 }, { "epoch": 2.28680014831294, "grad_norm": 10.0390625, "learning_rate": 7.71319985168706e-06, "loss": 2.9266, "mean_token_accuracy": 0.4394951744617669, "step": 12335 }, { "epoch": 2.2869855394883203, "grad_norm": 8.015625, "learning_rate": 7.713014460511682e-06, "loss": 2.9532, "mean_token_accuracy": 0.4674034695019586, "step": 12336 }, { "epoch": 2.2871709306637005, "grad_norm": 7.73828125, "learning_rate": 7.7128290693363e-06, "loss": 2.8164, "mean_token_accuracy": 0.4815611995982207, "step": 12337 }, { "epoch": 2.2873563218390807, "grad_norm": 6.37890625, "learning_rate": 7.712643678160921e-06, "loss": 2.5643, "mean_token_accuracy": 0.517648536616003, "step": 12338 }, { "epoch": 2.2875417130144604, "grad_norm": 7.19921875, "learning_rate": 7.71245828698554e-06, "loss": 2.4472, "mean_token_accuracy": 0.5068360556563823, "step": 12339 }, { "epoch": 2.2877271041898406, "grad_norm": 8.421875, "learning_rate": 7.71227289581016e-06, "loss": 2.874, "mean_token_accuracy": 0.5015486725663717, "step": 12340 }, { "epoch": 2.287912495365221, "grad_norm": 8.671875, "learning_rate": 7.712087504634779e-06, "loss": 3.2409, "mean_token_accuracy": 0.4880908757786735, "step": 12341 }, { "epoch": 2.2880978865406005, "grad_norm": 6.09765625, "learning_rate": 7.7119021134594e-06, "loss": 2.4699, "mean_token_accuracy": 0.5313754732132614, "step": 12342 }, { "epoch": 2.2882832777159807, "grad_norm": 6.4765625, "learning_rate": 7.71171672228402e-06, "loss": 2.899, "mean_token_accuracy": 0.47677703695332807, "step": 12343 }, { "epoch": 2.288468668891361, "grad_norm": 7.734375, "learning_rate": 7.71153133110864e-06, "loss": 2.5331, "mean_token_accuracy": 0.5130286202477574, "step": 12344 }, { "epoch": 2.2886540600667407, "grad_norm": 7.30078125, "learning_rate": 7.711345939933261e-06, "loss": 2.3787, "mean_token_accuracy": 0.5046035454170674, "step": 12345 }, { "epoch": 2.288839451242121, "grad_norm": 6.9453125, "learning_rate": 7.71116054875788e-06, "loss": 2.679, "mean_token_accuracy": 0.5092690278824415, "step": 12346 }, { "epoch": 2.289024842417501, "grad_norm": 7.25, "learning_rate": 7.7109751575825e-06, "loss": 3.7638, "mean_token_accuracy": 0.4088009614495701, "step": 12347 }, { "epoch": 2.289210233592881, "grad_norm": 6.5703125, "learning_rate": 7.710789766407119e-06, "loss": 2.987, "mean_token_accuracy": 0.45755060433959516, "step": 12348 }, { "epoch": 2.289395624768261, "grad_norm": 6.9453125, "learning_rate": 7.71060437523174e-06, "loss": 2.9332, "mean_token_accuracy": 0.4879344186583535, "step": 12349 }, { "epoch": 2.289581015943641, "grad_norm": 7.45703125, "learning_rate": 7.71041898405636e-06, "loss": 2.8706, "mean_token_accuracy": 0.47400346620450606, "step": 12350 }, { "epoch": 2.2897664071190214, "grad_norm": 5.93359375, "learning_rate": 7.710233592880979e-06, "loss": 2.5068, "mean_token_accuracy": 0.5312593478911157, "step": 12351 }, { "epoch": 2.289951798294401, "grad_norm": 7.3125, "learning_rate": 7.7100482017056e-06, "loss": 2.8078, "mean_token_accuracy": 0.4725554343874955, "step": 12352 }, { "epoch": 2.2901371894697813, "grad_norm": 7.765625, "learning_rate": 7.70986281053022e-06, "loss": 3.297, "mean_token_accuracy": 0.4491356891540462, "step": 12353 }, { "epoch": 2.2903225806451615, "grad_norm": 8.203125, "learning_rate": 7.70967741935484e-06, "loss": 3.1961, "mean_token_accuracy": 0.4396215257244234, "step": 12354 }, { "epoch": 2.290507971820541, "grad_norm": 8.171875, "learning_rate": 7.709492028179459e-06, "loss": 2.4419, "mean_token_accuracy": 0.5054302422723476, "step": 12355 }, { "epoch": 2.2906933629959214, "grad_norm": 7.02734375, "learning_rate": 7.70930663700408e-06, "loss": 3.4039, "mean_token_accuracy": 0.4561869357408391, "step": 12356 }, { "epoch": 2.2908787541713016, "grad_norm": 8.0703125, "learning_rate": 7.709121245828698e-06, "loss": 3.0542, "mean_token_accuracy": 0.46413071227980596, "step": 12357 }, { "epoch": 2.2910641453466813, "grad_norm": 10.59375, "learning_rate": 7.708935854653319e-06, "loss": 2.664, "mean_token_accuracy": 0.4823845076994867, "step": 12358 }, { "epoch": 2.2912495365220615, "grad_norm": 9.578125, "learning_rate": 7.70875046347794e-06, "loss": 2.5334, "mean_token_accuracy": 0.48986402966625464, "step": 12359 }, { "epoch": 2.2914349276974417, "grad_norm": 6.8125, "learning_rate": 7.70856507230256e-06, "loss": 2.8376, "mean_token_accuracy": 0.4892575793745524, "step": 12360 }, { "epoch": 2.2916203188728215, "grad_norm": 10.5546875, "learning_rate": 7.708379681127179e-06, "loss": 2.9273, "mean_token_accuracy": 0.4968144232072658, "step": 12361 }, { "epoch": 2.2918057100482017, "grad_norm": 9.6640625, "learning_rate": 7.7081942899518e-06, "loss": 2.6689, "mean_token_accuracy": 0.5133662081494939, "step": 12362 }, { "epoch": 2.291991101223582, "grad_norm": 8.03125, "learning_rate": 7.70800889877642e-06, "loss": 3.2168, "mean_token_accuracy": 0.4609375, "step": 12363 }, { "epoch": 2.292176492398962, "grad_norm": 6.9765625, "learning_rate": 7.707823507601038e-06, "loss": 2.669, "mean_token_accuracy": 0.4822877753445833, "step": 12364 }, { "epoch": 2.292361883574342, "grad_norm": 6.25, "learning_rate": 7.707638116425659e-06, "loss": 2.4671, "mean_token_accuracy": 0.5294596165020337, "step": 12365 }, { "epoch": 2.292547274749722, "grad_norm": 11.7421875, "learning_rate": 7.707452725250278e-06, "loss": 3.6039, "mean_token_accuracy": 0.4441521429354379, "step": 12366 }, { "epoch": 2.2927326659251017, "grad_norm": 9.03125, "learning_rate": 7.707267334074898e-06, "loss": 2.6187, "mean_token_accuracy": 0.5063834240267895, "step": 12367 }, { "epoch": 2.292918057100482, "grad_norm": 7.2421875, "learning_rate": 7.707081942899519e-06, "loss": 2.5171, "mean_token_accuracy": 0.5306551135617698, "step": 12368 }, { "epoch": 2.293103448275862, "grad_norm": 7.140625, "learning_rate": 7.70689655172414e-06, "loss": 2.5132, "mean_token_accuracy": 0.5106628982528263, "step": 12369 }, { "epoch": 2.2932888394512423, "grad_norm": 10.515625, "learning_rate": 7.706711160548758e-06, "loss": 3.0694, "mean_token_accuracy": 0.4666370106761566, "step": 12370 }, { "epoch": 2.293474230626622, "grad_norm": 8.4375, "learning_rate": 7.706525769373378e-06, "loss": 2.7084, "mean_token_accuracy": 0.4901458634895891, "step": 12371 }, { "epoch": 2.2936596218020022, "grad_norm": 8.4765625, "learning_rate": 7.706340378197999e-06, "loss": 2.7464, "mean_token_accuracy": 0.4696879819670186, "step": 12372 }, { "epoch": 2.2938450129773824, "grad_norm": 6.68359375, "learning_rate": 7.706154987022618e-06, "loss": 3.0206, "mean_token_accuracy": 0.4787629413326254, "step": 12373 }, { "epoch": 2.294030404152762, "grad_norm": 9.8984375, "learning_rate": 7.705969595847238e-06, "loss": 2.2587, "mean_token_accuracy": 0.540482284790516, "step": 12374 }, { "epoch": 2.2942157953281423, "grad_norm": 7.16015625, "learning_rate": 7.705784204671857e-06, "loss": 2.7156, "mean_token_accuracy": 0.498600265212907, "step": 12375 }, { "epoch": 2.2944011865035225, "grad_norm": 8.375, "learning_rate": 7.705598813496478e-06, "loss": 2.4011, "mean_token_accuracy": 0.5066717988196048, "step": 12376 }, { "epoch": 2.2945865776789027, "grad_norm": 6.5546875, "learning_rate": 7.705413422321098e-06, "loss": 2.4112, "mean_token_accuracy": 0.5198913941683391, "step": 12377 }, { "epoch": 2.2947719688542825, "grad_norm": 6.96484375, "learning_rate": 7.705228031145719e-06, "loss": 3.0232, "mean_token_accuracy": 0.4693517565779803, "step": 12378 }, { "epoch": 2.2949573600296627, "grad_norm": 7.41015625, "learning_rate": 7.705042639970337e-06, "loss": 3.0818, "mean_token_accuracy": 0.4465294807548779, "step": 12379 }, { "epoch": 2.2951427512050424, "grad_norm": 6.7421875, "learning_rate": 7.704857248794958e-06, "loss": 2.7467, "mean_token_accuracy": 0.47884393063583813, "step": 12380 }, { "epoch": 2.2953281423804226, "grad_norm": 6.05078125, "learning_rate": 7.704671857619578e-06, "loss": 3.0658, "mean_token_accuracy": 0.45109809663250366, "step": 12381 }, { "epoch": 2.295513533555803, "grad_norm": 6.6171875, "learning_rate": 7.704486466444197e-06, "loss": 2.7133, "mean_token_accuracy": 0.505515587529976, "step": 12382 }, { "epoch": 2.295698924731183, "grad_norm": 7.1796875, "learning_rate": 7.704301075268818e-06, "loss": 3.0049, "mean_token_accuracy": 0.4654887504766747, "step": 12383 }, { "epoch": 2.2958843159065627, "grad_norm": 7.04296875, "learning_rate": 7.704115684093436e-06, "loss": 2.846, "mean_token_accuracy": 0.5020045101478326, "step": 12384 }, { "epoch": 2.296069707081943, "grad_norm": 6.13671875, "learning_rate": 7.703930292918059e-06, "loss": 2.8673, "mean_token_accuracy": 0.4724012328383301, "step": 12385 }, { "epoch": 2.296255098257323, "grad_norm": 7.25390625, "learning_rate": 7.703744901742677e-06, "loss": 3.2847, "mean_token_accuracy": 0.45753517545346667, "step": 12386 }, { "epoch": 2.296440489432703, "grad_norm": 7.37890625, "learning_rate": 7.703559510567298e-06, "loss": 2.7863, "mean_token_accuracy": 0.4740223773125069, "step": 12387 }, { "epoch": 2.296625880608083, "grad_norm": 6.3515625, "learning_rate": 7.703374119391918e-06, "loss": 2.4772, "mean_token_accuracy": 0.544108008913357, "step": 12388 }, { "epoch": 2.2968112717834632, "grad_norm": 6.17578125, "learning_rate": 7.703188728216537e-06, "loss": 2.9356, "mean_token_accuracy": 0.4666732109551389, "step": 12389 }, { "epoch": 2.296996662958843, "grad_norm": 7.25, "learning_rate": 7.703003337041158e-06, "loss": 3.061, "mean_token_accuracy": 0.4674210839785587, "step": 12390 }, { "epoch": 2.297182054134223, "grad_norm": 6.46484375, "learning_rate": 7.702817945865776e-06, "loss": 2.6022, "mean_token_accuracy": 0.5003592814371257, "step": 12391 }, { "epoch": 2.2973674453096034, "grad_norm": 6.2109375, "learning_rate": 7.702632554690397e-06, "loss": 2.6589, "mean_token_accuracy": 0.4874165983846424, "step": 12392 }, { "epoch": 2.297552836484983, "grad_norm": 7.09765625, "learning_rate": 7.702447163515017e-06, "loss": 3.0223, "mean_token_accuracy": 0.44184630373888684, "step": 12393 }, { "epoch": 2.2977382276603633, "grad_norm": 7.38671875, "learning_rate": 7.702261772339638e-06, "loss": 3.0053, "mean_token_accuracy": 0.4431543299467828, "step": 12394 }, { "epoch": 2.2979236188357435, "grad_norm": 6.5078125, "learning_rate": 7.702076381164257e-06, "loss": 2.6815, "mean_token_accuracy": 0.4946334089191232, "step": 12395 }, { "epoch": 2.2981090100111237, "grad_norm": 9.78125, "learning_rate": 7.701890989988877e-06, "loss": 3.3622, "mean_token_accuracy": 0.44126250962278674, "step": 12396 }, { "epoch": 2.2982944011865034, "grad_norm": 7.74609375, "learning_rate": 7.701705598813498e-06, "loss": 3.1534, "mean_token_accuracy": 0.4761852741970723, "step": 12397 }, { "epoch": 2.2984797923618836, "grad_norm": 7.9296875, "learning_rate": 7.701520207638117e-06, "loss": 2.992, "mean_token_accuracy": 0.4489993098688751, "step": 12398 }, { "epoch": 2.298665183537264, "grad_norm": 7.0234375, "learning_rate": 7.701334816462737e-06, "loss": 3.1628, "mean_token_accuracy": 0.4569832402234637, "step": 12399 }, { "epoch": 2.2988505747126435, "grad_norm": 7.27734375, "learning_rate": 7.701149425287356e-06, "loss": 3.0383, "mean_token_accuracy": 0.4636502886767998, "step": 12400 }, { "epoch": 2.2990359658880237, "grad_norm": 6.8203125, "learning_rate": 7.700964034111978e-06, "loss": 2.8046, "mean_token_accuracy": 0.48128646648640955, "step": 12401 }, { "epoch": 2.299221357063404, "grad_norm": 8.453125, "learning_rate": 7.700778642936597e-06, "loss": 2.5885, "mean_token_accuracy": 0.4975621358068736, "step": 12402 }, { "epoch": 2.2994067482387837, "grad_norm": 6.515625, "learning_rate": 7.700593251761217e-06, "loss": 2.5951, "mean_token_accuracy": 0.5140799794265141, "step": 12403 }, { "epoch": 2.299592139414164, "grad_norm": 6.2890625, "learning_rate": 7.700407860585836e-06, "loss": 2.6908, "mean_token_accuracy": 0.4959918564702888, "step": 12404 }, { "epoch": 2.299777530589544, "grad_norm": 9.8125, "learning_rate": 7.700222469410457e-06, "loss": 2.0845, "mean_token_accuracy": 0.5581501309944185, "step": 12405 }, { "epoch": 2.299962921764924, "grad_norm": 7.9609375, "learning_rate": 7.700037078235077e-06, "loss": 2.6887, "mean_token_accuracy": 0.4795968777788756, "step": 12406 }, { "epoch": 2.300148312940304, "grad_norm": 7.53125, "learning_rate": 7.699851687059696e-06, "loss": 2.2549, "mean_token_accuracy": 0.5320754716981132, "step": 12407 }, { "epoch": 2.300333704115684, "grad_norm": 7.72265625, "learning_rate": 7.699666295884316e-06, "loss": 2.4278, "mean_token_accuracy": 0.5091470134450077, "step": 12408 }, { "epoch": 2.3005190952910644, "grad_norm": 8.078125, "learning_rate": 7.699480904708937e-06, "loss": 2.8942, "mean_token_accuracy": 0.45304645083450634, "step": 12409 }, { "epoch": 2.300704486466444, "grad_norm": 9.421875, "learning_rate": 7.699295513533557e-06, "loss": 3.5095, "mean_token_accuracy": 0.44678310195551574, "step": 12410 }, { "epoch": 2.3008898776418243, "grad_norm": 8.953125, "learning_rate": 7.699110122358176e-06, "loss": 2.3591, "mean_token_accuracy": 0.5284805844946556, "step": 12411 }, { "epoch": 2.3010752688172045, "grad_norm": 7.46875, "learning_rate": 7.698924731182797e-06, "loss": 2.5706, "mean_token_accuracy": 0.5013755158184319, "step": 12412 }, { "epoch": 2.3012606599925842, "grad_norm": 7.171875, "learning_rate": 7.698739340007415e-06, "loss": 3.2727, "mean_token_accuracy": 0.4400807202732071, "step": 12413 }, { "epoch": 2.3014460511679644, "grad_norm": 8.0703125, "learning_rate": 7.698553948832036e-06, "loss": 3.4123, "mean_token_accuracy": 0.45824777549623547, "step": 12414 }, { "epoch": 2.3016314423433446, "grad_norm": 6.7890625, "learning_rate": 7.698368557656656e-06, "loss": 2.8796, "mean_token_accuracy": 0.47637213881368445, "step": 12415 }, { "epoch": 2.3018168335187243, "grad_norm": 6.87890625, "learning_rate": 7.698183166481275e-06, "loss": 2.5544, "mean_token_accuracy": 0.5032796660703638, "step": 12416 }, { "epoch": 2.3020022246941045, "grad_norm": 6.34375, "learning_rate": 7.697997775305897e-06, "loss": 2.4755, "mean_token_accuracy": 0.5306316289377869, "step": 12417 }, { "epoch": 2.3021876158694847, "grad_norm": 6.76171875, "learning_rate": 7.697812384130516e-06, "loss": 2.6128, "mean_token_accuracy": 0.49884022376859055, "step": 12418 }, { "epoch": 2.3023730070448645, "grad_norm": 6.6796875, "learning_rate": 7.697626992955137e-06, "loss": 2.2853, "mean_token_accuracy": 0.525100516944285, "step": 12419 }, { "epoch": 2.3025583982202447, "grad_norm": 6.21875, "learning_rate": 7.697441601779755e-06, "loss": 2.4777, "mean_token_accuracy": 0.5133424351495594, "step": 12420 }, { "epoch": 2.302743789395625, "grad_norm": 7.6875, "learning_rate": 7.697256210604376e-06, "loss": 2.7159, "mean_token_accuracy": 0.46685748719493825, "step": 12421 }, { "epoch": 2.302929180571005, "grad_norm": 9.46875, "learning_rate": 7.697070819428995e-06, "loss": 2.1954, "mean_token_accuracy": 0.5409767780901802, "step": 12422 }, { "epoch": 2.303114571746385, "grad_norm": 7.19921875, "learning_rate": 7.696885428253615e-06, "loss": 3.0459, "mean_token_accuracy": 0.48126254740129376, "step": 12423 }, { "epoch": 2.303299962921765, "grad_norm": 7.3984375, "learning_rate": 7.696700037078236e-06, "loss": 2.5737, "mean_token_accuracy": 0.49570786210655404, "step": 12424 }, { "epoch": 2.303485354097145, "grad_norm": 7.5625, "learning_rate": 7.696514645902856e-06, "loss": 2.8621, "mean_token_accuracy": 0.507607593102449, "step": 12425 }, { "epoch": 2.303670745272525, "grad_norm": 6.94140625, "learning_rate": 7.696329254727477e-06, "loss": 3.2183, "mean_token_accuracy": 0.45683183183183185, "step": 12426 }, { "epoch": 2.303856136447905, "grad_norm": 8.53125, "learning_rate": 7.696143863552096e-06, "loss": 2.1204, "mean_token_accuracy": 0.5615902964959568, "step": 12427 }, { "epoch": 2.3040415276232853, "grad_norm": 7.26171875, "learning_rate": 7.695958472376716e-06, "loss": 2.5169, "mean_token_accuracy": 0.5221266133988937, "step": 12428 }, { "epoch": 2.304226918798665, "grad_norm": 6.859375, "learning_rate": 7.695773081201335e-06, "loss": 2.4897, "mean_token_accuracy": 0.5156316916488223, "step": 12429 }, { "epoch": 2.3044123099740452, "grad_norm": 6.78125, "learning_rate": 7.695587690025955e-06, "loss": 2.4151, "mean_token_accuracy": 0.5531581485053038, "step": 12430 }, { "epoch": 2.3045977011494254, "grad_norm": 6.6171875, "learning_rate": 7.695402298850576e-06, "loss": 2.9494, "mean_token_accuracy": 0.4794857768052516, "step": 12431 }, { "epoch": 2.304783092324805, "grad_norm": 8.640625, "learning_rate": 7.695216907675195e-06, "loss": 2.5851, "mean_token_accuracy": 0.48889201349831274, "step": 12432 }, { "epoch": 2.3049684835001854, "grad_norm": 6.7265625, "learning_rate": 7.695031516499815e-06, "loss": 2.3769, "mean_token_accuracy": 0.5613221657194137, "step": 12433 }, { "epoch": 2.3051538746755655, "grad_norm": 6.6328125, "learning_rate": 7.694846125324436e-06, "loss": 3.0874, "mean_token_accuracy": 0.46653279785809904, "step": 12434 }, { "epoch": 2.3053392658509457, "grad_norm": 7.42578125, "learning_rate": 7.694660734149056e-06, "loss": 2.9401, "mean_token_accuracy": 0.46952686447473935, "step": 12435 }, { "epoch": 2.3055246570263255, "grad_norm": 6.87109375, "learning_rate": 7.694475342973675e-06, "loss": 2.7312, "mean_token_accuracy": 0.5100373805897827, "step": 12436 }, { "epoch": 2.3057100482017057, "grad_norm": 7.44140625, "learning_rate": 7.694289951798295e-06, "loss": 2.6576, "mean_token_accuracy": 0.4916907018266722, "step": 12437 }, { "epoch": 2.3058954393770854, "grad_norm": 6.2578125, "learning_rate": 7.694104560622914e-06, "loss": 2.9314, "mean_token_accuracy": 0.4694353070175439, "step": 12438 }, { "epoch": 2.3060808305524656, "grad_norm": 7.0390625, "learning_rate": 7.693919169447535e-06, "loss": 3.2699, "mean_token_accuracy": 0.44308614923307577, "step": 12439 }, { "epoch": 2.306266221727846, "grad_norm": 7.671875, "learning_rate": 7.693733778272155e-06, "loss": 3.2509, "mean_token_accuracy": 0.4649919828968466, "step": 12440 }, { "epoch": 2.306451612903226, "grad_norm": 7.21875, "learning_rate": 7.693548387096776e-06, "loss": 2.4587, "mean_token_accuracy": 0.5065188253929572, "step": 12441 }, { "epoch": 2.3066370040786057, "grad_norm": 6.3046875, "learning_rate": 7.693362995921394e-06, "loss": 2.7282, "mean_token_accuracy": 0.4948906844106464, "step": 12442 }, { "epoch": 2.306822395253986, "grad_norm": 6.7890625, "learning_rate": 7.693177604746015e-06, "loss": 3.6955, "mean_token_accuracy": 0.4130308318789994, "step": 12443 }, { "epoch": 2.307007786429366, "grad_norm": 6.88671875, "learning_rate": 7.692992213570635e-06, "loss": 2.7446, "mean_token_accuracy": 0.4637839147286822, "step": 12444 }, { "epoch": 2.307193177604746, "grad_norm": 7.09765625, "learning_rate": 7.692806822395254e-06, "loss": 2.7511, "mean_token_accuracy": 0.47876220731850805, "step": 12445 }, { "epoch": 2.307378568780126, "grad_norm": 8.15625, "learning_rate": 7.692621431219875e-06, "loss": 3.2737, "mean_token_accuracy": 0.4591760299625468, "step": 12446 }, { "epoch": 2.3075639599555062, "grad_norm": 8.265625, "learning_rate": 7.692436040044493e-06, "loss": 3.5588, "mean_token_accuracy": 0.43906757155503096, "step": 12447 }, { "epoch": 2.3077493511308864, "grad_norm": 8.734375, "learning_rate": 7.692250648869114e-06, "loss": 2.9043, "mean_token_accuracy": 0.49074074074074076, "step": 12448 }, { "epoch": 2.307934742306266, "grad_norm": 7.6328125, "learning_rate": 7.692065257693734e-06, "loss": 2.8362, "mean_token_accuracy": 0.48956903650837225, "step": 12449 }, { "epoch": 2.3081201334816464, "grad_norm": 7.24609375, "learning_rate": 7.691879866518355e-06, "loss": 2.743, "mean_token_accuracy": 0.4937799043062201, "step": 12450 }, { "epoch": 2.308305524657026, "grad_norm": 7.15625, "learning_rate": 7.691694475342974e-06, "loss": 2.9813, "mean_token_accuracy": 0.4714104193138501, "step": 12451 }, { "epoch": 2.3084909158324063, "grad_norm": 7.953125, "learning_rate": 7.691509084167594e-06, "loss": 3.2915, "mean_token_accuracy": 0.4258417958311064, "step": 12452 }, { "epoch": 2.3086763070077865, "grad_norm": 9.796875, "learning_rate": 7.691323692992215e-06, "loss": 3.5705, "mean_token_accuracy": 0.4273372415921012, "step": 12453 }, { "epoch": 2.3088616981831667, "grad_norm": 7.51953125, "learning_rate": 7.691138301816834e-06, "loss": 2.7226, "mean_token_accuracy": 0.5039849297203304, "step": 12454 }, { "epoch": 2.3090470893585464, "grad_norm": 7.55078125, "learning_rate": 7.690952910641454e-06, "loss": 3.657, "mean_token_accuracy": 0.4143646408839779, "step": 12455 }, { "epoch": 2.3092324805339266, "grad_norm": 8.625, "learning_rate": 7.690767519466073e-06, "loss": 2.4227, "mean_token_accuracy": 0.5124690097710369, "step": 12456 }, { "epoch": 2.309417871709307, "grad_norm": 8.640625, "learning_rate": 7.690582128290695e-06, "loss": 2.8058, "mean_token_accuracy": 0.4929073130422807, "step": 12457 }, { "epoch": 2.3096032628846865, "grad_norm": 7.55078125, "learning_rate": 7.690396737115314e-06, "loss": 2.8245, "mean_token_accuracy": 0.4721133901820696, "step": 12458 }, { "epoch": 2.3097886540600667, "grad_norm": 9.140625, "learning_rate": 7.690211345939934e-06, "loss": 3.1665, "mean_token_accuracy": 0.46516779266806957, "step": 12459 }, { "epoch": 2.309974045235447, "grad_norm": 11.5546875, "learning_rate": 7.690025954764553e-06, "loss": 2.6003, "mean_token_accuracy": 0.4995551035973052, "step": 12460 }, { "epoch": 2.3101594364108267, "grad_norm": 7.56640625, "learning_rate": 7.689840563589174e-06, "loss": 2.752, "mean_token_accuracy": 0.4958196476560167, "step": 12461 }, { "epoch": 2.310344827586207, "grad_norm": 7.69921875, "learning_rate": 7.689655172413794e-06, "loss": 3.3258, "mean_token_accuracy": 0.4518234709805591, "step": 12462 }, { "epoch": 2.310530218761587, "grad_norm": 8.6171875, "learning_rate": 7.689469781238413e-06, "loss": 4.0492, "mean_token_accuracy": 0.4021437078205637, "step": 12463 }, { "epoch": 2.310715609936967, "grad_norm": 8.484375, "learning_rate": 7.689284390063033e-06, "loss": 2.775, "mean_token_accuracy": 0.4768550504352632, "step": 12464 }, { "epoch": 2.310901001112347, "grad_norm": 8.8125, "learning_rate": 7.689098998887654e-06, "loss": 3.1076, "mean_token_accuracy": 0.46593319194061505, "step": 12465 }, { "epoch": 2.311086392287727, "grad_norm": 8.2734375, "learning_rate": 7.688913607712274e-06, "loss": 2.8516, "mean_token_accuracy": 0.4827098598791619, "step": 12466 }, { "epoch": 2.3112717834631074, "grad_norm": 8.65625, "learning_rate": 7.688728216536893e-06, "loss": 3.4446, "mean_token_accuracy": 0.42568306010928963, "step": 12467 }, { "epoch": 2.311457174638487, "grad_norm": 7.1875, "learning_rate": 7.688542825361514e-06, "loss": 2.9673, "mean_token_accuracy": 0.4704364652762847, "step": 12468 }, { "epoch": 2.3116425658138673, "grad_norm": 10.71875, "learning_rate": 7.688357434186134e-06, "loss": 1.8509, "mean_token_accuracy": 0.5848529411764706, "step": 12469 }, { "epoch": 2.3118279569892475, "grad_norm": 8.2734375, "learning_rate": 7.688172043010753e-06, "loss": 2.7619, "mean_token_accuracy": 0.49602203182374544, "step": 12470 }, { "epoch": 2.3120133481646272, "grad_norm": 13.6875, "learning_rate": 7.687986651835373e-06, "loss": 2.8185, "mean_token_accuracy": 0.4682190056639396, "step": 12471 }, { "epoch": 2.3121987393400074, "grad_norm": 9.21875, "learning_rate": 7.687801260659992e-06, "loss": 3.6923, "mean_token_accuracy": 0.41774255400741406, "step": 12472 }, { "epoch": 2.3123841305153876, "grad_norm": 10.671875, "learning_rate": 7.687615869484614e-06, "loss": 3.0597, "mean_token_accuracy": 0.4607232968881413, "step": 12473 }, { "epoch": 2.3125695216907673, "grad_norm": 9.890625, "learning_rate": 7.687430478309233e-06, "loss": 2.7601, "mean_token_accuracy": 0.507159186620142, "step": 12474 }, { "epoch": 2.3127549128661475, "grad_norm": 11.8984375, "learning_rate": 7.687245087133854e-06, "loss": 3.2629, "mean_token_accuracy": 0.4358403797656134, "step": 12475 }, { "epoch": 2.3129403040415277, "grad_norm": 7.26953125, "learning_rate": 7.687059695958472e-06, "loss": 2.6676, "mean_token_accuracy": 0.4924833276817, "step": 12476 }, { "epoch": 2.3131256952169075, "grad_norm": 12.3984375, "learning_rate": 7.686874304783093e-06, "loss": 2.9411, "mean_token_accuracy": 0.4782487838776928, "step": 12477 }, { "epoch": 2.3133110863922877, "grad_norm": 6.9453125, "learning_rate": 7.686688913607713e-06, "loss": 3.0105, "mean_token_accuracy": 0.4765661252900232, "step": 12478 }, { "epoch": 2.313496477567668, "grad_norm": 9.1171875, "learning_rate": 7.686503522432332e-06, "loss": 2.8064, "mean_token_accuracy": 0.46832373446498937, "step": 12479 }, { "epoch": 2.313681868743048, "grad_norm": 7.87890625, "learning_rate": 7.686318131256953e-06, "loss": 2.8239, "mean_token_accuracy": 0.4911072862880092, "step": 12480 }, { "epoch": 2.313867259918428, "grad_norm": 5.8125, "learning_rate": 7.686132740081573e-06, "loss": 2.6431, "mean_token_accuracy": 0.48807495741056217, "step": 12481 }, { "epoch": 2.314052651093808, "grad_norm": 7.93359375, "learning_rate": 7.685947348906194e-06, "loss": 2.9797, "mean_token_accuracy": 0.4630954192664739, "step": 12482 }, { "epoch": 2.314238042269188, "grad_norm": 7.44140625, "learning_rate": 7.685761957730813e-06, "loss": 2.7672, "mean_token_accuracy": 0.47533126585847196, "step": 12483 }, { "epoch": 2.314423433444568, "grad_norm": 6.7578125, "learning_rate": 7.685576566555433e-06, "loss": 3.0652, "mean_token_accuracy": 0.4541126686061888, "step": 12484 }, { "epoch": 2.314608824619948, "grad_norm": 6.8046875, "learning_rate": 7.685391175380052e-06, "loss": 3.1711, "mean_token_accuracy": 0.46094637223974766, "step": 12485 }, { "epoch": 2.3147942157953283, "grad_norm": 7.453125, "learning_rate": 7.685205784204672e-06, "loss": 3.4849, "mean_token_accuracy": 0.4294614147909968, "step": 12486 }, { "epoch": 2.314979606970708, "grad_norm": 6.9765625, "learning_rate": 7.685020393029293e-06, "loss": 2.1792, "mean_token_accuracy": 0.5681881051175657, "step": 12487 }, { "epoch": 2.3151649981460882, "grad_norm": 6.16015625, "learning_rate": 7.684835001853912e-06, "loss": 2.5951, "mean_token_accuracy": 0.5031096563011457, "step": 12488 }, { "epoch": 2.3153503893214684, "grad_norm": 8.171875, "learning_rate": 7.684649610678532e-06, "loss": 2.7706, "mean_token_accuracy": 0.4960662525879917, "step": 12489 }, { "epoch": 2.315535780496848, "grad_norm": 7.1171875, "learning_rate": 7.684464219503153e-06, "loss": 3.5488, "mean_token_accuracy": 0.43252944475602917, "step": 12490 }, { "epoch": 2.3157211716722284, "grad_norm": 7.14453125, "learning_rate": 7.684278828327773e-06, "loss": 2.762, "mean_token_accuracy": 0.47824870611274306, "step": 12491 }, { "epoch": 2.3159065628476085, "grad_norm": 6.57421875, "learning_rate": 7.684093437152392e-06, "loss": 2.6737, "mean_token_accuracy": 0.48744892002335083, "step": 12492 }, { "epoch": 2.3160919540229887, "grad_norm": 7.34375, "learning_rate": 7.683908045977012e-06, "loss": 2.661, "mean_token_accuracy": 0.4969766772243018, "step": 12493 }, { "epoch": 2.3162773451983685, "grad_norm": 8.0859375, "learning_rate": 7.683722654801631e-06, "loss": 2.9318, "mean_token_accuracy": 0.4909963985594238, "step": 12494 }, { "epoch": 2.3164627363737487, "grad_norm": 7.125, "learning_rate": 7.683537263626252e-06, "loss": 3.1949, "mean_token_accuracy": 0.46098149637972646, "step": 12495 }, { "epoch": 2.316648127549129, "grad_norm": 6.81640625, "learning_rate": 7.683351872450872e-06, "loss": 2.8188, "mean_token_accuracy": 0.49283739633073637, "step": 12496 }, { "epoch": 2.3168335187245086, "grad_norm": 7.1875, "learning_rate": 7.683166481275491e-06, "loss": 3.1061, "mean_token_accuracy": 0.44430217669654287, "step": 12497 }, { "epoch": 2.317018909899889, "grad_norm": 6.57421875, "learning_rate": 7.682981090100111e-06, "loss": 2.732, "mean_token_accuracy": 0.5001197318007663, "step": 12498 }, { "epoch": 2.317204301075269, "grad_norm": 6.99609375, "learning_rate": 7.682795698924732e-06, "loss": 2.7438, "mean_token_accuracy": 0.4805482486501454, "step": 12499 }, { "epoch": 2.3173896922506487, "grad_norm": 6.21484375, "learning_rate": 7.682610307749352e-06, "loss": 2.55, "mean_token_accuracy": 0.4961343641695548, "step": 12500 }, { "epoch": 2.317575083426029, "grad_norm": 8.5703125, "learning_rate": 7.682424916573971e-06, "loss": 2.5058, "mean_token_accuracy": 0.5336538461538461, "step": 12501 }, { "epoch": 2.317760474601409, "grad_norm": 6.68359375, "learning_rate": 7.682239525398592e-06, "loss": 3.3744, "mean_token_accuracy": 0.4255942689677629, "step": 12502 }, { "epoch": 2.317945865776789, "grad_norm": 7.1640625, "learning_rate": 7.68205413422321e-06, "loss": 3.7901, "mean_token_accuracy": 0.42271019936434556, "step": 12503 }, { "epoch": 2.318131256952169, "grad_norm": 8.4296875, "learning_rate": 7.681868743047831e-06, "loss": 3.4621, "mean_token_accuracy": 0.4473953013278856, "step": 12504 }, { "epoch": 2.3183166481275492, "grad_norm": 7.65625, "learning_rate": 7.681683351872451e-06, "loss": 3.3619, "mean_token_accuracy": 0.42087095061072755, "step": 12505 }, { "epoch": 2.3185020393029294, "grad_norm": 7.34375, "learning_rate": 7.681497960697072e-06, "loss": 2.6962, "mean_token_accuracy": 0.5012060647829083, "step": 12506 }, { "epoch": 2.318687430478309, "grad_norm": 7.53515625, "learning_rate": 7.681312569521692e-06, "loss": 2.1573, "mean_token_accuracy": 0.5718550685303297, "step": 12507 }, { "epoch": 2.3188728216536894, "grad_norm": 6.44140625, "learning_rate": 7.681127178346311e-06, "loss": 2.462, "mean_token_accuracy": 0.497235219055721, "step": 12508 }, { "epoch": 2.319058212829069, "grad_norm": 20.390625, "learning_rate": 7.680941787170932e-06, "loss": 3.3533, "mean_token_accuracy": 0.42165206508135167, "step": 12509 }, { "epoch": 2.3192436040044493, "grad_norm": 7.21484375, "learning_rate": 7.68075639599555e-06, "loss": 2.9096, "mean_token_accuracy": 0.502943396226415, "step": 12510 }, { "epoch": 2.3194289951798295, "grad_norm": 7.3515625, "learning_rate": 7.680571004820171e-06, "loss": 2.9026, "mean_token_accuracy": 0.4769267083677564, "step": 12511 }, { "epoch": 2.3196143863552097, "grad_norm": 7.0703125, "learning_rate": 7.680385613644792e-06, "loss": 2.6556, "mean_token_accuracy": 0.4879154078549849, "step": 12512 }, { "epoch": 2.3197997775305894, "grad_norm": 6.6015625, "learning_rate": 7.68020022246941e-06, "loss": 3.2014, "mean_token_accuracy": 0.4446961620469083, "step": 12513 }, { "epoch": 2.3199851687059696, "grad_norm": 6.0078125, "learning_rate": 7.68001483129403e-06, "loss": 2.6664, "mean_token_accuracy": 0.48041336217255465, "step": 12514 }, { "epoch": 2.32017055988135, "grad_norm": 6.46875, "learning_rate": 7.679829440118651e-06, "loss": 2.7995, "mean_token_accuracy": 0.48399706816516, "step": 12515 }, { "epoch": 2.3203559510567295, "grad_norm": 6.359375, "learning_rate": 7.679644048943272e-06, "loss": 2.9888, "mean_token_accuracy": 0.4654921020656136, "step": 12516 }, { "epoch": 2.3205413422321097, "grad_norm": 7.33984375, "learning_rate": 7.67945865776789e-06, "loss": 2.6512, "mean_token_accuracy": 0.5023070803500398, "step": 12517 }, { "epoch": 2.32072673340749, "grad_norm": 6.55859375, "learning_rate": 7.679273266592511e-06, "loss": 2.5576, "mean_token_accuracy": 0.4957111234089651, "step": 12518 }, { "epoch": 2.3209121245828697, "grad_norm": 6.98828125, "learning_rate": 7.67908787541713e-06, "loss": 3.1854, "mean_token_accuracy": 0.4471495539089672, "step": 12519 }, { "epoch": 2.32109751575825, "grad_norm": 5.79296875, "learning_rate": 7.67890248424175e-06, "loss": 2.8661, "mean_token_accuracy": 0.4761671363803455, "step": 12520 }, { "epoch": 2.32128290693363, "grad_norm": 6.7578125, "learning_rate": 7.678717093066371e-06, "loss": 2.7791, "mean_token_accuracy": 0.5106717984604618, "step": 12521 }, { "epoch": 2.32146829810901, "grad_norm": 7.32421875, "learning_rate": 7.678531701890991e-06, "loss": 3.1174, "mean_token_accuracy": 0.48091497864131183, "step": 12522 }, { "epoch": 2.32165368928439, "grad_norm": 6.5859375, "learning_rate": 7.67834631071561e-06, "loss": 2.8908, "mean_token_accuracy": 0.4673913043478261, "step": 12523 }, { "epoch": 2.32183908045977, "grad_norm": 9.1953125, "learning_rate": 7.67816091954023e-06, "loss": 2.5657, "mean_token_accuracy": 0.5145728643216081, "step": 12524 }, { "epoch": 2.3220244716351504, "grad_norm": 7.9140625, "learning_rate": 7.677975528364851e-06, "loss": 2.9351, "mean_token_accuracy": 0.47720778337188735, "step": 12525 }, { "epoch": 2.32220986281053, "grad_norm": 7.234375, "learning_rate": 7.67779013718947e-06, "loss": 2.899, "mean_token_accuracy": 0.48561593312023604, "step": 12526 }, { "epoch": 2.3223952539859103, "grad_norm": 7.33984375, "learning_rate": 7.67760474601409e-06, "loss": 3.185, "mean_token_accuracy": 0.45740905057675246, "step": 12527 }, { "epoch": 2.3225806451612905, "grad_norm": 10.9296875, "learning_rate": 7.67741935483871e-06, "loss": 2.6792, "mean_token_accuracy": 0.48737972388788176, "step": 12528 }, { "epoch": 2.3227660363366702, "grad_norm": 9.171875, "learning_rate": 7.67723396366333e-06, "loss": 3.3524, "mean_token_accuracy": 0.43610665889552674, "step": 12529 }, { "epoch": 2.3229514275120504, "grad_norm": 6.8359375, "learning_rate": 7.67704857248795e-06, "loss": 2.6895, "mean_token_accuracy": 0.5094869992972593, "step": 12530 }, { "epoch": 2.3231368186874306, "grad_norm": 7.29296875, "learning_rate": 7.67686318131257e-06, "loss": 3.3391, "mean_token_accuracy": 0.44766523798781865, "step": 12531 }, { "epoch": 2.3233222098628104, "grad_norm": 7.36328125, "learning_rate": 7.67667779013719e-06, "loss": 2.5552, "mean_token_accuracy": 0.5299270072992701, "step": 12532 }, { "epoch": 2.3235076010381905, "grad_norm": 7.29296875, "learning_rate": 7.67649239896181e-06, "loss": 3.0744, "mean_token_accuracy": 0.4640101347460555, "step": 12533 }, { "epoch": 2.3236929922135707, "grad_norm": 6.37890625, "learning_rate": 7.67630700778643e-06, "loss": 2.6446, "mean_token_accuracy": 0.4767110977984125, "step": 12534 }, { "epoch": 2.3238783833889505, "grad_norm": 6.57421875, "learning_rate": 7.67612161661105e-06, "loss": 2.8585, "mean_token_accuracy": 0.5069143735158542, "step": 12535 }, { "epoch": 2.3240637745643307, "grad_norm": 7.890625, "learning_rate": 7.67593622543567e-06, "loss": 3.0067, "mean_token_accuracy": 0.45326887661141807, "step": 12536 }, { "epoch": 2.324249165739711, "grad_norm": 7.4921875, "learning_rate": 7.675750834260289e-06, "loss": 3.4811, "mean_token_accuracy": 0.4217931447388116, "step": 12537 }, { "epoch": 2.324434556915091, "grad_norm": 8.3125, "learning_rate": 7.67556544308491e-06, "loss": 3.4629, "mean_token_accuracy": 0.4188308100342579, "step": 12538 }, { "epoch": 2.324619948090471, "grad_norm": 6.125, "learning_rate": 7.67538005190953e-06, "loss": 2.9007, "mean_token_accuracy": 0.4603814128391083, "step": 12539 }, { "epoch": 2.324805339265851, "grad_norm": 7.77734375, "learning_rate": 7.67519466073415e-06, "loss": 3.219, "mean_token_accuracy": 0.46049469964664314, "step": 12540 }, { "epoch": 2.324990730441231, "grad_norm": 7.21484375, "learning_rate": 7.675009269558769e-06, "loss": 1.9265, "mean_token_accuracy": 0.6070070300795206, "step": 12541 }, { "epoch": 2.325176121616611, "grad_norm": 8.6171875, "learning_rate": 7.67482387838339e-06, "loss": 2.6098, "mean_token_accuracy": 0.4927643236857649, "step": 12542 }, { "epoch": 2.325361512791991, "grad_norm": 6.44921875, "learning_rate": 7.67463848720801e-06, "loss": 2.6683, "mean_token_accuracy": 0.4868654311039484, "step": 12543 }, { "epoch": 2.3255469039673713, "grad_norm": 9.265625, "learning_rate": 7.674453096032629e-06, "loss": 2.7381, "mean_token_accuracy": 0.4894758998105662, "step": 12544 }, { "epoch": 2.325732295142751, "grad_norm": 6.4375, "learning_rate": 7.674267704857249e-06, "loss": 3.148, "mean_token_accuracy": 0.4580379602103819, "step": 12545 }, { "epoch": 2.3259176863181312, "grad_norm": 9.125, "learning_rate": 7.67408231368187e-06, "loss": 2.1669, "mean_token_accuracy": 0.5620783405497936, "step": 12546 }, { "epoch": 2.3261030774935114, "grad_norm": 6.91015625, "learning_rate": 7.67389692250649e-06, "loss": 2.8972, "mean_token_accuracy": 0.4968562446413261, "step": 12547 }, { "epoch": 2.326288468668891, "grad_norm": 7.37890625, "learning_rate": 7.673711531331109e-06, "loss": 3.0508, "mean_token_accuracy": 0.4783404514948139, "step": 12548 }, { "epoch": 2.3264738598442714, "grad_norm": 8.71875, "learning_rate": 7.67352614015573e-06, "loss": 2.3728, "mean_token_accuracy": 0.528854961832061, "step": 12549 }, { "epoch": 2.3266592510196515, "grad_norm": 6.48046875, "learning_rate": 7.67334074898035e-06, "loss": 2.5083, "mean_token_accuracy": 0.5103196131619294, "step": 12550 }, { "epoch": 2.3268446421950317, "grad_norm": 7.77734375, "learning_rate": 7.673155357804969e-06, "loss": 2.7138, "mean_token_accuracy": 0.5061946902654867, "step": 12551 }, { "epoch": 2.3270300333704115, "grad_norm": 8.828125, "learning_rate": 7.67296996662959e-06, "loss": 3.0901, "mean_token_accuracy": 0.48267008985879334, "step": 12552 }, { "epoch": 2.3272154245457917, "grad_norm": 6.4609375, "learning_rate": 7.672784575454208e-06, "loss": 3.0396, "mean_token_accuracy": 0.47396828966602944, "step": 12553 }, { "epoch": 2.327400815721172, "grad_norm": 6.6171875, "learning_rate": 7.67259918427883e-06, "loss": 2.7073, "mean_token_accuracy": 0.5045335242185636, "step": 12554 }, { "epoch": 2.3275862068965516, "grad_norm": 7.41796875, "learning_rate": 7.672413793103449e-06, "loss": 2.7639, "mean_token_accuracy": 0.47121389539422326, "step": 12555 }, { "epoch": 2.327771598071932, "grad_norm": 7.109375, "learning_rate": 7.67222840192807e-06, "loss": 3.0459, "mean_token_accuracy": 0.4727802981205444, "step": 12556 }, { "epoch": 2.327956989247312, "grad_norm": 8.1015625, "learning_rate": 7.672043010752688e-06, "loss": 3.6025, "mean_token_accuracy": 0.4172959553118858, "step": 12557 }, { "epoch": 2.3281423804226917, "grad_norm": 10.2734375, "learning_rate": 7.671857619577309e-06, "loss": 3.4901, "mean_token_accuracy": 0.4558363731109953, "step": 12558 }, { "epoch": 2.328327771598072, "grad_norm": 7.3984375, "learning_rate": 7.67167222840193e-06, "loss": 2.97, "mean_token_accuracy": 0.4536532170119956, "step": 12559 }, { "epoch": 2.328513162773452, "grad_norm": 7.51953125, "learning_rate": 7.671486837226548e-06, "loss": 3.004, "mean_token_accuracy": 0.48123154787009703, "step": 12560 }, { "epoch": 2.328698553948832, "grad_norm": 10.1171875, "learning_rate": 7.671301446051169e-06, "loss": 2.6856, "mean_token_accuracy": 0.49578303290888043, "step": 12561 }, { "epoch": 2.328883945124212, "grad_norm": 7.5078125, "learning_rate": 7.671116054875789e-06, "loss": 2.197, "mean_token_accuracy": 0.5448673904081353, "step": 12562 }, { "epoch": 2.3290693362995922, "grad_norm": 7.6640625, "learning_rate": 7.67093066370041e-06, "loss": 3.6846, "mean_token_accuracy": 0.4306611490713117, "step": 12563 }, { "epoch": 2.3292547274749724, "grad_norm": 8.546875, "learning_rate": 7.670745272525028e-06, "loss": 2.9073, "mean_token_accuracy": 0.47362066883979237, "step": 12564 }, { "epoch": 2.329440118650352, "grad_norm": 9.765625, "learning_rate": 7.670559881349649e-06, "loss": 2.6718, "mean_token_accuracy": 0.4786224821312541, "step": 12565 }, { "epoch": 2.3296255098257324, "grad_norm": 6.89453125, "learning_rate": 7.670374490174268e-06, "loss": 2.7653, "mean_token_accuracy": 0.5083461583994318, "step": 12566 }, { "epoch": 2.329810901001112, "grad_norm": 8.0, "learning_rate": 7.670189098998888e-06, "loss": 2.4453, "mean_token_accuracy": 0.5380765589070743, "step": 12567 }, { "epoch": 2.3299962921764923, "grad_norm": 7.109375, "learning_rate": 7.670003707823509e-06, "loss": 2.5508, "mean_token_accuracy": 0.5171128898702733, "step": 12568 }, { "epoch": 2.3301816833518725, "grad_norm": 9.3125, "learning_rate": 7.669818316648127e-06, "loss": 2.7306, "mean_token_accuracy": 0.48519458544839256, "step": 12569 }, { "epoch": 2.3303670745272527, "grad_norm": 7.6171875, "learning_rate": 7.669632925472748e-06, "loss": 2.0762, "mean_token_accuracy": 0.5322864491674564, "step": 12570 }, { "epoch": 2.3305524657026324, "grad_norm": 8.4375, "learning_rate": 7.669447534297368e-06, "loss": 2.9514, "mean_token_accuracy": 0.4639956686518679, "step": 12571 }, { "epoch": 2.3307378568780126, "grad_norm": 10.046875, "learning_rate": 7.669262143121989e-06, "loss": 2.9068, "mean_token_accuracy": 0.4735836330935252, "step": 12572 }, { "epoch": 2.330923248053393, "grad_norm": 8.53125, "learning_rate": 7.669076751946608e-06, "loss": 2.9762, "mean_token_accuracy": 0.4944358360332441, "step": 12573 }, { "epoch": 2.3311086392287725, "grad_norm": 7.12109375, "learning_rate": 7.668891360771228e-06, "loss": 2.6447, "mean_token_accuracy": 0.4820792520035619, "step": 12574 }, { "epoch": 2.3312940304041527, "grad_norm": 9.4609375, "learning_rate": 7.668705969595847e-06, "loss": 2.5006, "mean_token_accuracy": 0.49803149606299213, "step": 12575 }, { "epoch": 2.331479421579533, "grad_norm": 7.421875, "learning_rate": 7.668520578420467e-06, "loss": 3.3792, "mean_token_accuracy": 0.4250453214335518, "step": 12576 }, { "epoch": 2.331664812754913, "grad_norm": 6.94921875, "learning_rate": 7.668335187245088e-06, "loss": 3.6605, "mean_token_accuracy": 0.4052940442002747, "step": 12577 }, { "epoch": 2.331850203930293, "grad_norm": 7.81640625, "learning_rate": 7.668149796069708e-06, "loss": 2.8406, "mean_token_accuracy": 0.49838237559698045, "step": 12578 }, { "epoch": 2.332035595105673, "grad_norm": 7.71875, "learning_rate": 7.667964404894327e-06, "loss": 3.1928, "mean_token_accuracy": 0.455985095482068, "step": 12579 }, { "epoch": 2.332220986281053, "grad_norm": 7.14453125, "learning_rate": 7.667779013718948e-06, "loss": 2.7584, "mean_token_accuracy": 0.48723949626657753, "step": 12580 }, { "epoch": 2.332406377456433, "grad_norm": 8.015625, "learning_rate": 7.667593622543568e-06, "loss": 3.3051, "mean_token_accuracy": 0.46434549715433543, "step": 12581 }, { "epoch": 2.332591768631813, "grad_norm": 7.74609375, "learning_rate": 7.667408231368187e-06, "loss": 2.6728, "mean_token_accuracy": 0.4903015892879489, "step": 12582 }, { "epoch": 2.3327771598071934, "grad_norm": 7.42578125, "learning_rate": 7.667222840192807e-06, "loss": 3.0481, "mean_token_accuracy": 0.4771587008185899, "step": 12583 }, { "epoch": 2.332962550982573, "grad_norm": 12.5078125, "learning_rate": 7.667037449017426e-06, "loss": 3.3073, "mean_token_accuracy": 0.4351243547630221, "step": 12584 }, { "epoch": 2.3331479421579533, "grad_norm": 7.6015625, "learning_rate": 7.666852057842047e-06, "loss": 2.7259, "mean_token_accuracy": 0.47663682148852826, "step": 12585 }, { "epoch": 2.3333333333333335, "grad_norm": 9.2890625, "learning_rate": 7.666666666666667e-06, "loss": 2.5987, "mean_token_accuracy": 0.5103199174406605, "step": 12586 }, { "epoch": 2.3335187245087132, "grad_norm": 9.8125, "learning_rate": 7.666481275491288e-06, "loss": 3.2439, "mean_token_accuracy": 0.45442207645329113, "step": 12587 }, { "epoch": 2.3337041156840934, "grad_norm": 9.1875, "learning_rate": 7.666295884315908e-06, "loss": 2.8019, "mean_token_accuracy": 0.4652027419542233, "step": 12588 }, { "epoch": 2.3338895068594736, "grad_norm": 8.5859375, "learning_rate": 7.666110493140527e-06, "loss": 2.5149, "mean_token_accuracy": 0.500669164882227, "step": 12589 }, { "epoch": 2.3340748980348534, "grad_norm": 7.83984375, "learning_rate": 7.665925101965148e-06, "loss": 2.5843, "mean_token_accuracy": 0.4925481643038895, "step": 12590 }, { "epoch": 2.3342602892102335, "grad_norm": 6.82421875, "learning_rate": 7.665739710789766e-06, "loss": 2.1813, "mean_token_accuracy": 0.5510982778178818, "step": 12591 }, { "epoch": 2.3344456803856137, "grad_norm": 9.2578125, "learning_rate": 7.665554319614387e-06, "loss": 2.5247, "mean_token_accuracy": 0.505558093497792, "step": 12592 }, { "epoch": 2.3346310715609935, "grad_norm": 7.328125, "learning_rate": 7.665368928439007e-06, "loss": 3.126, "mean_token_accuracy": 0.46557759626604434, "step": 12593 }, { "epoch": 2.3348164627363737, "grad_norm": 9.5546875, "learning_rate": 7.665183537263628e-06, "loss": 2.3803, "mean_token_accuracy": 0.5134812060034533, "step": 12594 }, { "epoch": 2.335001853911754, "grad_norm": 8.6640625, "learning_rate": 7.664998146088247e-06, "loss": 2.9333, "mean_token_accuracy": 0.48565719994291423, "step": 12595 }, { "epoch": 2.335187245087134, "grad_norm": 6.80078125, "learning_rate": 7.664812754912867e-06, "loss": 2.6298, "mean_token_accuracy": 0.5078457446808511, "step": 12596 }, { "epoch": 2.335372636262514, "grad_norm": 7.03515625, "learning_rate": 7.664627363737488e-06, "loss": 2.7339, "mean_token_accuracy": 0.4856924254016832, "step": 12597 }, { "epoch": 2.335558027437894, "grad_norm": 6.8671875, "learning_rate": 7.664441972562106e-06, "loss": 3.4625, "mean_token_accuracy": 0.42072538860103625, "step": 12598 }, { "epoch": 2.335743418613274, "grad_norm": 7.4609375, "learning_rate": 7.664256581386727e-06, "loss": 2.8357, "mean_token_accuracy": 0.48549596717135984, "step": 12599 }, { "epoch": 2.335928809788654, "grad_norm": 7.171875, "learning_rate": 7.664071190211346e-06, "loss": 3.169, "mean_token_accuracy": 0.4597300166843622, "step": 12600 }, { "epoch": 2.336114200964034, "grad_norm": 6.48828125, "learning_rate": 7.663885799035966e-06, "loss": 2.7641, "mean_token_accuracy": 0.47121653488694687, "step": 12601 }, { "epoch": 2.3362995921394143, "grad_norm": 8.0390625, "learning_rate": 7.663700407860587e-06, "loss": 2.7544, "mean_token_accuracy": 0.4953324716357892, "step": 12602 }, { "epoch": 2.336484983314794, "grad_norm": 10.109375, "learning_rate": 7.663515016685207e-06, "loss": 2.5253, "mean_token_accuracy": 0.5197780993728895, "step": 12603 }, { "epoch": 2.3366703744901742, "grad_norm": 6.03125, "learning_rate": 7.663329625509826e-06, "loss": 2.1609, "mean_token_accuracy": 0.5681818181818182, "step": 12604 }, { "epoch": 2.3368557656655544, "grad_norm": 7.84765625, "learning_rate": 7.663144234334446e-06, "loss": 2.8367, "mean_token_accuracy": 0.48933809934672745, "step": 12605 }, { "epoch": 2.337041156840934, "grad_norm": 7.390625, "learning_rate": 7.662958843159067e-06, "loss": 2.4827, "mean_token_accuracy": 0.5052050671014674, "step": 12606 }, { "epoch": 2.3372265480163144, "grad_norm": 6.34375, "learning_rate": 7.662773451983686e-06, "loss": 2.7794, "mean_token_accuracy": 0.4874662553027381, "step": 12607 }, { "epoch": 2.3374119391916945, "grad_norm": 7.453125, "learning_rate": 7.662588060808306e-06, "loss": 3.0296, "mean_token_accuracy": 0.45680839612486546, "step": 12608 }, { "epoch": 2.3375973303670747, "grad_norm": 6.796875, "learning_rate": 7.662402669632925e-06, "loss": 2.8897, "mean_token_accuracy": 0.4877407008731013, "step": 12609 }, { "epoch": 2.3377827215424545, "grad_norm": 7.96484375, "learning_rate": 7.662217278457547e-06, "loss": 3.7091, "mean_token_accuracy": 0.442681025931699, "step": 12610 }, { "epoch": 2.3379681127178347, "grad_norm": 6.05859375, "learning_rate": 7.662031887282166e-06, "loss": 2.6445, "mean_token_accuracy": 0.5008590197069227, "step": 12611 }, { "epoch": 2.338153503893215, "grad_norm": 7.4921875, "learning_rate": 7.661846496106786e-06, "loss": 2.7118, "mean_token_accuracy": 0.48599857853589196, "step": 12612 }, { "epoch": 2.3383388950685946, "grad_norm": 6.140625, "learning_rate": 7.661661104931405e-06, "loss": 2.6083, "mean_token_accuracy": 0.525648690661246, "step": 12613 }, { "epoch": 2.338524286243975, "grad_norm": 8.3359375, "learning_rate": 7.661475713756026e-06, "loss": 2.3451, "mean_token_accuracy": 0.5182229316307528, "step": 12614 }, { "epoch": 2.338709677419355, "grad_norm": 6.55078125, "learning_rate": 7.661290322580646e-06, "loss": 3.0661, "mean_token_accuracy": 0.4537600389389146, "step": 12615 }, { "epoch": 2.3388950685947347, "grad_norm": 6.31640625, "learning_rate": 7.661104931405265e-06, "loss": 2.8766, "mean_token_accuracy": 0.48340306834030683, "step": 12616 }, { "epoch": 2.339080459770115, "grad_norm": 7.1015625, "learning_rate": 7.660919540229886e-06, "loss": 2.7529, "mean_token_accuracy": 0.4805543770329515, "step": 12617 }, { "epoch": 2.339265850945495, "grad_norm": 6.79296875, "learning_rate": 7.660734149054506e-06, "loss": 2.8647, "mean_token_accuracy": 0.4725394235997825, "step": 12618 }, { "epoch": 2.339451242120875, "grad_norm": 7.69140625, "learning_rate": 7.660548757879127e-06, "loss": 3.1733, "mean_token_accuracy": 0.45658706310295744, "step": 12619 }, { "epoch": 2.339636633296255, "grad_norm": 6.75390625, "learning_rate": 7.660363366703745e-06, "loss": 2.6518, "mean_token_accuracy": 0.49564243027888444, "step": 12620 }, { "epoch": 2.3398220244716352, "grad_norm": 7.296875, "learning_rate": 7.660177975528366e-06, "loss": 2.9179, "mean_token_accuracy": 0.49985915492957744, "step": 12621 }, { "epoch": 2.3400074156470154, "grad_norm": 7.4375, "learning_rate": 7.659992584352985e-06, "loss": 3.2764, "mean_token_accuracy": 0.44620341832446064, "step": 12622 }, { "epoch": 2.340192806822395, "grad_norm": 6.01171875, "learning_rate": 7.659807193177605e-06, "loss": 2.7113, "mean_token_accuracy": 0.519356343283582, "step": 12623 }, { "epoch": 2.3403781979977754, "grad_norm": 6.98828125, "learning_rate": 7.659621802002226e-06, "loss": 2.7203, "mean_token_accuracy": 0.48994749647842234, "step": 12624 }, { "epoch": 2.3405635891731555, "grad_norm": 6.60546875, "learning_rate": 7.659436410826844e-06, "loss": 2.2241, "mean_token_accuracy": 0.569476637794315, "step": 12625 }, { "epoch": 2.3407489803485353, "grad_norm": 8.5625, "learning_rate": 7.659251019651465e-06, "loss": 2.8383, "mean_token_accuracy": 0.4978023216499493, "step": 12626 }, { "epoch": 2.3409343715239155, "grad_norm": 7.109375, "learning_rate": 7.659065628476085e-06, "loss": 3.2374, "mean_token_accuracy": 0.46051611474138887, "step": 12627 }, { "epoch": 2.3411197626992957, "grad_norm": 7.18359375, "learning_rate": 7.658880237300706e-06, "loss": 2.7629, "mean_token_accuracy": 0.4919365767719203, "step": 12628 }, { "epoch": 2.3413051538746754, "grad_norm": 7.3203125, "learning_rate": 7.658694846125325e-06, "loss": 2.66, "mean_token_accuracy": 0.500991817505579, "step": 12629 }, { "epoch": 2.3414905450500556, "grad_norm": 6.98828125, "learning_rate": 7.658509454949945e-06, "loss": 2.6718, "mean_token_accuracy": 0.5397245762711864, "step": 12630 }, { "epoch": 2.341675936225436, "grad_norm": 8.828125, "learning_rate": 7.658324063774566e-06, "loss": 3.2498, "mean_token_accuracy": 0.45516627078384797, "step": 12631 }, { "epoch": 2.3418613274008155, "grad_norm": 7.0, "learning_rate": 7.658138672599184e-06, "loss": 2.8458, "mean_token_accuracy": 0.5068824686129179, "step": 12632 }, { "epoch": 2.3420467185761957, "grad_norm": 7.69921875, "learning_rate": 7.657953281423805e-06, "loss": 3.0113, "mean_token_accuracy": 0.48765586034912717, "step": 12633 }, { "epoch": 2.342232109751576, "grad_norm": 11.0078125, "learning_rate": 7.657767890248424e-06, "loss": 2.6734, "mean_token_accuracy": 0.5218462549277266, "step": 12634 }, { "epoch": 2.342417500926956, "grad_norm": 8.453125, "learning_rate": 7.657582499073046e-06, "loss": 2.969, "mean_token_accuracy": 0.47216664705190065, "step": 12635 }, { "epoch": 2.342602892102336, "grad_norm": 9.2734375, "learning_rate": 7.657397107897665e-06, "loss": 3.241, "mean_token_accuracy": 0.44176239639377635, "step": 12636 }, { "epoch": 2.342788283277716, "grad_norm": 6.9140625, "learning_rate": 7.657211716722285e-06, "loss": 2.8486, "mean_token_accuracy": 0.47477781950181497, "step": 12637 }, { "epoch": 2.342973674453096, "grad_norm": 6.92578125, "learning_rate": 7.657026325546904e-06, "loss": 2.9696, "mean_token_accuracy": 0.4737189646064448, "step": 12638 }, { "epoch": 2.343159065628476, "grad_norm": 7.828125, "learning_rate": 7.656840934371524e-06, "loss": 2.774, "mean_token_accuracy": 0.5122652681441381, "step": 12639 }, { "epoch": 2.343344456803856, "grad_norm": 7.68359375, "learning_rate": 7.656655543196145e-06, "loss": 2.7355, "mean_token_accuracy": 0.4725025513096723, "step": 12640 }, { "epoch": 2.3435298479792364, "grad_norm": 6.7109375, "learning_rate": 7.656470152020764e-06, "loss": 3.5417, "mean_token_accuracy": 0.4424864864864865, "step": 12641 }, { "epoch": 2.343715239154616, "grad_norm": 7.296875, "learning_rate": 7.656284760845384e-06, "loss": 2.7497, "mean_token_accuracy": 0.491364168618267, "step": 12642 }, { "epoch": 2.3439006303299963, "grad_norm": 6.37890625, "learning_rate": 7.656099369670005e-06, "loss": 3.2085, "mean_token_accuracy": 0.4650660264105642, "step": 12643 }, { "epoch": 2.3440860215053765, "grad_norm": 8.390625, "learning_rate": 7.655913978494625e-06, "loss": 3.5763, "mean_token_accuracy": 0.4244721169463996, "step": 12644 }, { "epoch": 2.3442714126807562, "grad_norm": 8.1875, "learning_rate": 7.655728587319244e-06, "loss": 2.9791, "mean_token_accuracy": 0.4724703589410427, "step": 12645 }, { "epoch": 2.3444568038561364, "grad_norm": 7.26171875, "learning_rate": 7.655543196143865e-06, "loss": 3.647, "mean_token_accuracy": 0.41794538361508454, "step": 12646 }, { "epoch": 2.3446421950315166, "grad_norm": 6.6328125, "learning_rate": 7.655357804968483e-06, "loss": 2.5335, "mean_token_accuracy": 0.5119386014781125, "step": 12647 }, { "epoch": 2.344827586206897, "grad_norm": 7.10546875, "learning_rate": 7.655172413793104e-06, "loss": 3.0111, "mean_token_accuracy": 0.46762087624367255, "step": 12648 }, { "epoch": 2.3450129773822765, "grad_norm": 7.55859375, "learning_rate": 7.654987022617724e-06, "loss": 3.3048, "mean_token_accuracy": 0.46143922018348627, "step": 12649 }, { "epoch": 2.3451983685576567, "grad_norm": 6.69921875, "learning_rate": 7.654801631442343e-06, "loss": 2.7533, "mean_token_accuracy": 0.49252955697024814, "step": 12650 }, { "epoch": 2.3453837597330365, "grad_norm": 7.79296875, "learning_rate": 7.654616240266964e-06, "loss": 2.5228, "mean_token_accuracy": 0.49957032368948723, "step": 12651 }, { "epoch": 2.3455691509084167, "grad_norm": 8.6328125, "learning_rate": 7.654430849091584e-06, "loss": 3.1885, "mean_token_accuracy": 0.47089823164304956, "step": 12652 }, { "epoch": 2.345754542083797, "grad_norm": 5.93359375, "learning_rate": 7.654245457916205e-06, "loss": 3.0121, "mean_token_accuracy": 0.4635432511915627, "step": 12653 }, { "epoch": 2.345939933259177, "grad_norm": 12.546875, "learning_rate": 7.654060066740823e-06, "loss": 3.2519, "mean_token_accuracy": 0.45813253012048194, "step": 12654 }, { "epoch": 2.346125324434557, "grad_norm": 8.265625, "learning_rate": 7.653874675565444e-06, "loss": 2.6091, "mean_token_accuracy": 0.5333966874830302, "step": 12655 }, { "epoch": 2.346310715609937, "grad_norm": 8.328125, "learning_rate": 7.653689284390063e-06, "loss": 3.9857, "mean_token_accuracy": 0.39912562684839914, "step": 12656 }, { "epoch": 2.346496106785317, "grad_norm": 8.1640625, "learning_rate": 7.653503893214683e-06, "loss": 2.7098, "mean_token_accuracy": 0.4752280009859502, "step": 12657 }, { "epoch": 2.346681497960697, "grad_norm": 6.76171875, "learning_rate": 7.653318502039304e-06, "loss": 2.4499, "mean_token_accuracy": 0.5183239962651728, "step": 12658 }, { "epoch": 2.346866889136077, "grad_norm": 8.4453125, "learning_rate": 7.653133110863924e-06, "loss": 2.5942, "mean_token_accuracy": 0.5066455696202532, "step": 12659 }, { "epoch": 2.3470522803114573, "grad_norm": 7.19140625, "learning_rate": 7.652947719688543e-06, "loss": 3.0368, "mean_token_accuracy": 0.4773418071290412, "step": 12660 }, { "epoch": 2.347237671486837, "grad_norm": 7.98046875, "learning_rate": 7.652762328513163e-06, "loss": 3.4078, "mean_token_accuracy": 0.43305414651409696, "step": 12661 }, { "epoch": 2.3474230626622172, "grad_norm": 6.78515625, "learning_rate": 7.652576937337784e-06, "loss": 2.7389, "mean_token_accuracy": 0.48809055118110234, "step": 12662 }, { "epoch": 2.3476084538375974, "grad_norm": 8.0390625, "learning_rate": 7.652391546162403e-06, "loss": 3.5226, "mean_token_accuracy": 0.42998760842627015, "step": 12663 }, { "epoch": 2.347793845012977, "grad_norm": 7.5, "learning_rate": 7.652206154987023e-06, "loss": 2.9686, "mean_token_accuracy": 0.4614777806994478, "step": 12664 }, { "epoch": 2.3479792361883574, "grad_norm": 6.38671875, "learning_rate": 7.652020763811642e-06, "loss": 3.1098, "mean_token_accuracy": 0.44243851386708527, "step": 12665 }, { "epoch": 2.3481646273637375, "grad_norm": 10.046875, "learning_rate": 7.651835372636262e-06, "loss": 2.9901, "mean_token_accuracy": 0.4679245283018868, "step": 12666 }, { "epoch": 2.3483500185391177, "grad_norm": 10.28125, "learning_rate": 7.651649981460883e-06, "loss": 2.6448, "mean_token_accuracy": 0.5109712536145603, "step": 12667 }, { "epoch": 2.3485354097144975, "grad_norm": 8.90625, "learning_rate": 7.651464590285503e-06, "loss": 4.7752, "mean_token_accuracy": 0.40296992004061427, "step": 12668 }, { "epoch": 2.3487208008898777, "grad_norm": 7.546875, "learning_rate": 7.651279199110124e-06, "loss": 2.6924, "mean_token_accuracy": 0.5085440074906367, "step": 12669 }, { "epoch": 2.348906192065258, "grad_norm": 7.19921875, "learning_rate": 7.651093807934743e-06, "loss": 2.8625, "mean_token_accuracy": 0.48123502794783074, "step": 12670 }, { "epoch": 2.3490915832406376, "grad_norm": 6.75, "learning_rate": 7.650908416759363e-06, "loss": 3.0149, "mean_token_accuracy": 0.4653578214059531, "step": 12671 }, { "epoch": 2.349276974416018, "grad_norm": 6.4765625, "learning_rate": 7.650723025583982e-06, "loss": 2.5441, "mean_token_accuracy": 0.5249773960216998, "step": 12672 }, { "epoch": 2.349462365591398, "grad_norm": 6.6328125, "learning_rate": 7.650537634408603e-06, "loss": 2.7856, "mean_token_accuracy": 0.4722145804676754, "step": 12673 }, { "epoch": 2.3496477567667777, "grad_norm": 7.14453125, "learning_rate": 7.650352243233223e-06, "loss": 2.729, "mean_token_accuracy": 0.494606521206178, "step": 12674 }, { "epoch": 2.349833147942158, "grad_norm": 6.49609375, "learning_rate": 7.650166852057844e-06, "loss": 3.0673, "mean_token_accuracy": 0.4678887641850605, "step": 12675 }, { "epoch": 2.350018539117538, "grad_norm": 7.58203125, "learning_rate": 7.649981460882462e-06, "loss": 2.4124, "mean_token_accuracy": 0.5157462345960748, "step": 12676 }, { "epoch": 2.350203930292918, "grad_norm": 9.828125, "learning_rate": 7.649796069707083e-06, "loss": 3.1404, "mean_token_accuracy": 0.47819036808821147, "step": 12677 }, { "epoch": 2.350389321468298, "grad_norm": 9.2734375, "learning_rate": 7.649610678531703e-06, "loss": 2.3369, "mean_token_accuracy": 0.5272592409444925, "step": 12678 }, { "epoch": 2.3505747126436782, "grad_norm": 6.09375, "learning_rate": 7.649425287356322e-06, "loss": 2.8798, "mean_token_accuracy": 0.4819022722702594, "step": 12679 }, { "epoch": 2.3507601038190584, "grad_norm": 10.7265625, "learning_rate": 7.649239896180943e-06, "loss": 2.4492, "mean_token_accuracy": 0.5162975344755537, "step": 12680 }, { "epoch": 2.350945494994438, "grad_norm": 10.0078125, "learning_rate": 7.649054505005561e-06, "loss": 2.9156, "mean_token_accuracy": 0.49132009491694767, "step": 12681 }, { "epoch": 2.3511308861698184, "grad_norm": 7.421875, "learning_rate": 7.648869113830182e-06, "loss": 2.5295, "mean_token_accuracy": 0.49915824915824913, "step": 12682 }, { "epoch": 2.3513162773451985, "grad_norm": 12.0859375, "learning_rate": 7.648683722654802e-06, "loss": 2.5398, "mean_token_accuracy": 0.5206725786018657, "step": 12683 }, { "epoch": 2.3515016685205783, "grad_norm": 12.3515625, "learning_rate": 7.648498331479423e-06, "loss": 2.5708, "mean_token_accuracy": 0.5052802599512591, "step": 12684 }, { "epoch": 2.3516870596959585, "grad_norm": 8.8046875, "learning_rate": 7.648312940304042e-06, "loss": 2.5489, "mean_token_accuracy": 0.5174840085287846, "step": 12685 }, { "epoch": 2.3518724508713387, "grad_norm": 10.171875, "learning_rate": 7.648127549128662e-06, "loss": 2.6896, "mean_token_accuracy": 0.5274261603375527, "step": 12686 }, { "epoch": 2.3520578420467184, "grad_norm": 11.6953125, "learning_rate": 7.647942157953283e-06, "loss": 2.8721, "mean_token_accuracy": 0.4750089466778003, "step": 12687 }, { "epoch": 2.3522432332220986, "grad_norm": 9.6015625, "learning_rate": 7.647756766777901e-06, "loss": 2.8832, "mean_token_accuracy": 0.4763661922333509, "step": 12688 }, { "epoch": 2.352428624397479, "grad_norm": 10.5234375, "learning_rate": 7.647571375602522e-06, "loss": 2.5286, "mean_token_accuracy": 0.502900160434407, "step": 12689 }, { "epoch": 2.3526140155728585, "grad_norm": 9.390625, "learning_rate": 7.64738598442714e-06, "loss": 2.9186, "mean_token_accuracy": 0.45679699457353556, "step": 12690 }, { "epoch": 2.3527994067482387, "grad_norm": 7.015625, "learning_rate": 7.647200593251763e-06, "loss": 2.9403, "mean_token_accuracy": 0.4965166908563135, "step": 12691 }, { "epoch": 2.352984797923619, "grad_norm": 6.78125, "learning_rate": 7.647015202076382e-06, "loss": 2.8664, "mean_token_accuracy": 0.4742129119196374, "step": 12692 }, { "epoch": 2.353170189098999, "grad_norm": 9.6640625, "learning_rate": 7.646829810901002e-06, "loss": 3.4814, "mean_token_accuracy": 0.4728668941979522, "step": 12693 }, { "epoch": 2.353355580274379, "grad_norm": 14.7890625, "learning_rate": 7.646644419725621e-06, "loss": 2.7433, "mean_token_accuracy": 0.49991957535788967, "step": 12694 }, { "epoch": 2.353540971449759, "grad_norm": 8.328125, "learning_rate": 7.646459028550241e-06, "loss": 3.249, "mean_token_accuracy": 0.47268810586244037, "step": 12695 }, { "epoch": 2.3537263626251392, "grad_norm": 8.4765625, "learning_rate": 7.646273637374862e-06, "loss": 2.2821, "mean_token_accuracy": 0.5386757817445246, "step": 12696 }, { "epoch": 2.353911753800519, "grad_norm": 9.2109375, "learning_rate": 7.64608824619948e-06, "loss": 3.786, "mean_token_accuracy": 0.46812543073742247, "step": 12697 }, { "epoch": 2.354097144975899, "grad_norm": 12.046875, "learning_rate": 7.645902855024101e-06, "loss": 2.8505, "mean_token_accuracy": 0.477293620150763, "step": 12698 }, { "epoch": 2.3542825361512794, "grad_norm": 11.765625, "learning_rate": 7.645717463848722e-06, "loss": 3.3823, "mean_token_accuracy": 0.44162793972679903, "step": 12699 }, { "epoch": 2.354467927326659, "grad_norm": 10.6875, "learning_rate": 7.645532072673342e-06, "loss": 2.5182, "mean_token_accuracy": 0.5151852883811646, "step": 12700 }, { "epoch": 2.3546533185020393, "grad_norm": 7.52734375, "learning_rate": 7.645346681497961e-06, "loss": 3.1233, "mean_token_accuracy": 0.44495412844036697, "step": 12701 }, { "epoch": 2.3548387096774195, "grad_norm": 13.359375, "learning_rate": 7.645161290322582e-06, "loss": 2.7183, "mean_token_accuracy": 0.4849968612680477, "step": 12702 }, { "epoch": 2.3550241008527992, "grad_norm": 14.6640625, "learning_rate": 7.6449758991472e-06, "loss": 3.1737, "mean_token_accuracy": 0.43594009983361065, "step": 12703 }, { "epoch": 2.3552094920281794, "grad_norm": 12.0859375, "learning_rate": 7.64479050797182e-06, "loss": 2.6796, "mean_token_accuracy": 0.4673304293714997, "step": 12704 }, { "epoch": 2.3553948832035596, "grad_norm": 8.9765625, "learning_rate": 7.644605116796441e-06, "loss": 3.1492, "mean_token_accuracy": 0.4340004765308554, "step": 12705 }, { "epoch": 2.35558027437894, "grad_norm": 11.046875, "learning_rate": 7.64441972562106e-06, "loss": 2.3386, "mean_token_accuracy": 0.5374172185430464, "step": 12706 }, { "epoch": 2.3557656655543195, "grad_norm": 11.0234375, "learning_rate": 7.644234334445682e-06, "loss": 2.5372, "mean_token_accuracy": 0.5053523114927656, "step": 12707 }, { "epoch": 2.3559510567296997, "grad_norm": 7.25, "learning_rate": 7.644048943270301e-06, "loss": 3.5891, "mean_token_accuracy": 0.40726500050591924, "step": 12708 }, { "epoch": 2.3561364479050795, "grad_norm": 8.4765625, "learning_rate": 7.643863552094922e-06, "loss": 3.3423, "mean_token_accuracy": 0.45510756150972564, "step": 12709 }, { "epoch": 2.3563218390804597, "grad_norm": 19.171875, "learning_rate": 7.64367816091954e-06, "loss": 2.6173, "mean_token_accuracy": 0.47494902417710455, "step": 12710 }, { "epoch": 2.35650723025584, "grad_norm": 11.6796875, "learning_rate": 7.643492769744161e-06, "loss": 2.7371, "mean_token_accuracy": 0.4911774141803016, "step": 12711 }, { "epoch": 2.35669262143122, "grad_norm": 6.29296875, "learning_rate": 7.643307378568781e-06, "loss": 2.3398, "mean_token_accuracy": 0.5554495587884569, "step": 12712 }, { "epoch": 2.3568780126066, "grad_norm": 9.7734375, "learning_rate": 7.6431219873934e-06, "loss": 3.3931, "mean_token_accuracy": 0.4663665594855306, "step": 12713 }, { "epoch": 2.35706340378198, "grad_norm": 9.3125, "learning_rate": 7.64293659621802e-06, "loss": 3.5506, "mean_token_accuracy": 0.42844503428445035, "step": 12714 }, { "epoch": 2.35724879495736, "grad_norm": 10.6953125, "learning_rate": 7.642751205042641e-06, "loss": 2.5488, "mean_token_accuracy": 0.5016935904116727, "step": 12715 }, { "epoch": 2.35743418613274, "grad_norm": 9.8203125, "learning_rate": 7.642565813867262e-06, "loss": 3.867, "mean_token_accuracy": 0.4346470087893394, "step": 12716 }, { "epoch": 2.35761957730812, "grad_norm": 11.9453125, "learning_rate": 7.64238042269188e-06, "loss": 2.7364, "mean_token_accuracy": 0.46881720430107526, "step": 12717 }, { "epoch": 2.3578049684835003, "grad_norm": 8.2265625, "learning_rate": 7.642195031516501e-06, "loss": 2.8633, "mean_token_accuracy": 0.4728379464798438, "step": 12718 }, { "epoch": 2.3579903596588805, "grad_norm": 6.046875, "learning_rate": 7.64200964034112e-06, "loss": 2.5168, "mean_token_accuracy": 0.5271246551950611, "step": 12719 }, { "epoch": 2.3581757508342602, "grad_norm": 10.0703125, "learning_rate": 7.64182424916574e-06, "loss": 3.2942, "mean_token_accuracy": 0.4385458047429878, "step": 12720 }, { "epoch": 2.3583611420096404, "grad_norm": 12.9375, "learning_rate": 7.64163885799036e-06, "loss": 2.4964, "mean_token_accuracy": 0.5030032127392093, "step": 12721 }, { "epoch": 2.35854653318502, "grad_norm": 6.7109375, "learning_rate": 7.64145346681498e-06, "loss": 2.6264, "mean_token_accuracy": 0.49041132125380243, "step": 12722 }, { "epoch": 2.3587319243604004, "grad_norm": 9.4609375, "learning_rate": 7.6412680756396e-06, "loss": 2.4269, "mean_token_accuracy": 0.52453653217012, "step": 12723 }, { "epoch": 2.3589173155357805, "grad_norm": 9.484375, "learning_rate": 7.64108268446422e-06, "loss": 2.5169, "mean_token_accuracy": 0.5238573581115018, "step": 12724 }, { "epoch": 2.3591027067111607, "grad_norm": 6.640625, "learning_rate": 7.640897293288841e-06, "loss": 2.8032, "mean_token_accuracy": 0.4929658340511054, "step": 12725 }, { "epoch": 2.3592880978865405, "grad_norm": 11.6875, "learning_rate": 7.64071190211346e-06, "loss": 2.2529, "mean_token_accuracy": 0.5287253141831239, "step": 12726 }, { "epoch": 2.3594734890619207, "grad_norm": 6.734375, "learning_rate": 7.64052651093808e-06, "loss": 3.1237, "mean_token_accuracy": 0.4534263438654083, "step": 12727 }, { "epoch": 2.359658880237301, "grad_norm": 8.3046875, "learning_rate": 7.640341119762699e-06, "loss": 3.2458, "mean_token_accuracy": 0.4346762355243843, "step": 12728 }, { "epoch": 2.3598442714126806, "grad_norm": 8.796875, "learning_rate": 7.64015572858732e-06, "loss": 3.3514, "mean_token_accuracy": 0.43756637839478074, "step": 12729 }, { "epoch": 2.360029662588061, "grad_norm": 6.9140625, "learning_rate": 7.63997033741194e-06, "loss": 2.5698, "mean_token_accuracy": 0.5009693237541338, "step": 12730 }, { "epoch": 2.360215053763441, "grad_norm": 5.97265625, "learning_rate": 7.63978494623656e-06, "loss": 2.5455, "mean_token_accuracy": 0.5137276328370441, "step": 12731 }, { "epoch": 2.3604004449388207, "grad_norm": 7.109375, "learning_rate": 7.63959955506118e-06, "loss": 2.7781, "mean_token_accuracy": 0.4802336028751123, "step": 12732 }, { "epoch": 2.360585836114201, "grad_norm": 7.70703125, "learning_rate": 7.6394141638858e-06, "loss": 3.3729, "mean_token_accuracy": 0.45111414279217826, "step": 12733 }, { "epoch": 2.360771227289581, "grad_norm": 6.36328125, "learning_rate": 7.63922877271042e-06, "loss": 2.9118, "mean_token_accuracy": 0.48656029897944514, "step": 12734 }, { "epoch": 2.360956618464961, "grad_norm": 6.5703125, "learning_rate": 7.639043381535039e-06, "loss": 2.3134, "mean_token_accuracy": 0.525911708253359, "step": 12735 }, { "epoch": 2.361142009640341, "grad_norm": 9.4140625, "learning_rate": 7.63885799035966e-06, "loss": 3.6539, "mean_token_accuracy": 0.46134969325153374, "step": 12736 }, { "epoch": 2.3613274008157212, "grad_norm": 6.94140625, "learning_rate": 7.638672599184278e-06, "loss": 2.5971, "mean_token_accuracy": 0.47412312975226883, "step": 12737 }, { "epoch": 2.3615127919911014, "grad_norm": 12.4609375, "learning_rate": 7.638487208008899e-06, "loss": 4.3308, "mean_token_accuracy": 0.44086021505376344, "step": 12738 }, { "epoch": 2.361698183166481, "grad_norm": 6.43359375, "learning_rate": 7.63830181683352e-06, "loss": 3.1351, "mean_token_accuracy": 0.4503222341568206, "step": 12739 }, { "epoch": 2.3618835743418614, "grad_norm": 6.48046875, "learning_rate": 7.63811642565814e-06, "loss": 2.9824, "mean_token_accuracy": 0.4892031822199773, "step": 12740 }, { "epoch": 2.3620689655172415, "grad_norm": 7.1484375, "learning_rate": 7.637931034482759e-06, "loss": 3.1858, "mean_token_accuracy": 0.4563197026022305, "step": 12741 }, { "epoch": 2.3622543566926213, "grad_norm": 6.44921875, "learning_rate": 7.63774564330738e-06, "loss": 2.7439, "mean_token_accuracy": 0.49961695607763024, "step": 12742 }, { "epoch": 2.3624397478680015, "grad_norm": 6.94921875, "learning_rate": 7.637560252132e-06, "loss": 2.6781, "mean_token_accuracy": 0.5184993531694696, "step": 12743 }, { "epoch": 2.3626251390433817, "grad_norm": 7.796875, "learning_rate": 7.637374860956618e-06, "loss": 2.5922, "mean_token_accuracy": 0.4960051134547779, "step": 12744 }, { "epoch": 2.3628105302187614, "grad_norm": 6.97265625, "learning_rate": 7.637189469781239e-06, "loss": 2.827, "mean_token_accuracy": 0.4854996243425995, "step": 12745 }, { "epoch": 2.3629959213941416, "grad_norm": 8.25, "learning_rate": 7.637004078605858e-06, "loss": 2.8077, "mean_token_accuracy": 0.48614019958112603, "step": 12746 }, { "epoch": 2.363181312569522, "grad_norm": 7.03515625, "learning_rate": 7.636818687430478e-06, "loss": 2.9392, "mean_token_accuracy": 0.5073002754820937, "step": 12747 }, { "epoch": 2.3633667037449015, "grad_norm": 6.7734375, "learning_rate": 7.636633296255099e-06, "loss": 3.2193, "mean_token_accuracy": 0.4584670689433817, "step": 12748 }, { "epoch": 2.3635520949202817, "grad_norm": 6.5546875, "learning_rate": 7.63644790507972e-06, "loss": 2.8194, "mean_token_accuracy": 0.46354625550660794, "step": 12749 }, { "epoch": 2.363737486095662, "grad_norm": 6.86328125, "learning_rate": 7.63626251390434e-06, "loss": 2.9762, "mean_token_accuracy": 0.4776251661497563, "step": 12750 }, { "epoch": 2.363922877271042, "grad_norm": 6.6953125, "learning_rate": 7.636077122728959e-06, "loss": 2.9847, "mean_token_accuracy": 0.4688009313154831, "step": 12751 }, { "epoch": 2.364108268446422, "grad_norm": 8.375, "learning_rate": 7.635891731553579e-06, "loss": 2.5015, "mean_token_accuracy": 0.5233605989186192, "step": 12752 }, { "epoch": 2.364293659621802, "grad_norm": 8.0, "learning_rate": 7.635706340378198e-06, "loss": 2.5795, "mean_token_accuracy": 0.48991488865570715, "step": 12753 }, { "epoch": 2.3644790507971822, "grad_norm": 7.1953125, "learning_rate": 7.635520949202818e-06, "loss": 3.5159, "mean_token_accuracy": 0.4509702189478053, "step": 12754 }, { "epoch": 2.364664441972562, "grad_norm": 6.859375, "learning_rate": 7.635335558027439e-06, "loss": 2.4924, "mean_token_accuracy": 0.5078646029948408, "step": 12755 }, { "epoch": 2.364849833147942, "grad_norm": 8.1953125, "learning_rate": 7.63515016685206e-06, "loss": 3.449, "mean_token_accuracy": 0.44929415154134256, "step": 12756 }, { "epoch": 2.3650352243233224, "grad_norm": 7.09765625, "learning_rate": 7.634964775676678e-06, "loss": 3.0195, "mean_token_accuracy": 0.4636363636363636, "step": 12757 }, { "epoch": 2.365220615498702, "grad_norm": 9.71875, "learning_rate": 7.634779384501299e-06, "loss": 2.7043, "mean_token_accuracy": 0.48497495826377296, "step": 12758 }, { "epoch": 2.3654060066740823, "grad_norm": 6.61328125, "learning_rate": 7.634593993325919e-06, "loss": 2.9491, "mean_token_accuracy": 0.4749941023826374, "step": 12759 }, { "epoch": 2.3655913978494625, "grad_norm": 6.0234375, "learning_rate": 7.634408602150538e-06, "loss": 2.8202, "mean_token_accuracy": 0.4921353970051592, "step": 12760 }, { "epoch": 2.3657767890248422, "grad_norm": 9.96875, "learning_rate": 7.634223210975158e-06, "loss": 2.6908, "mean_token_accuracy": 0.49056603773584906, "step": 12761 }, { "epoch": 2.3659621802002224, "grad_norm": 11.125, "learning_rate": 7.634037819799777e-06, "loss": 2.8311, "mean_token_accuracy": 0.4874585896829153, "step": 12762 }, { "epoch": 2.3661475713756026, "grad_norm": 9.4140625, "learning_rate": 7.633852428624398e-06, "loss": 3.456, "mean_token_accuracy": 0.44687375016664443, "step": 12763 }, { "epoch": 2.366332962550983, "grad_norm": 6.12890625, "learning_rate": 7.633667037449018e-06, "loss": 3.2571, "mean_token_accuracy": 0.43760896637608965, "step": 12764 }, { "epoch": 2.3665183537263625, "grad_norm": 7.86328125, "learning_rate": 7.633481646273639e-06, "loss": 3.0385, "mean_token_accuracy": 0.4922884012539185, "step": 12765 }, { "epoch": 2.3667037449017427, "grad_norm": 8.2578125, "learning_rate": 7.633296255098257e-06, "loss": 3.0516, "mean_token_accuracy": 0.45058300943920043, "step": 12766 }, { "epoch": 2.3668891360771225, "grad_norm": 7.69921875, "learning_rate": 7.633110863922878e-06, "loss": 2.814, "mean_token_accuracy": 0.4796451766196737, "step": 12767 }, { "epoch": 2.3670745272525027, "grad_norm": 7.53125, "learning_rate": 7.632925472747498e-06, "loss": 2.9657, "mean_token_accuracy": 0.4828198301001648, "step": 12768 }, { "epoch": 2.367259918427883, "grad_norm": 7.1328125, "learning_rate": 7.632740081572117e-06, "loss": 2.4662, "mean_token_accuracy": 0.5026178010471204, "step": 12769 }, { "epoch": 2.367445309603263, "grad_norm": 6.91015625, "learning_rate": 7.632554690396738e-06, "loss": 2.3976, "mean_token_accuracy": 0.5290185676392573, "step": 12770 }, { "epoch": 2.367630700778643, "grad_norm": 6.7109375, "learning_rate": 7.632369299221356e-06, "loss": 2.9553, "mean_token_accuracy": 0.47660661468136595, "step": 12771 }, { "epoch": 2.367816091954023, "grad_norm": 9.96875, "learning_rate": 7.632183908045979e-06, "loss": 2.6112, "mean_token_accuracy": 0.5271551075673236, "step": 12772 }, { "epoch": 2.368001483129403, "grad_norm": 6.4609375, "learning_rate": 7.631998516870597e-06, "loss": 2.513, "mean_token_accuracy": 0.5075061944322985, "step": 12773 }, { "epoch": 2.368186874304783, "grad_norm": 7.6875, "learning_rate": 7.631813125695218e-06, "loss": 2.5828, "mean_token_accuracy": 0.5034639409426462, "step": 12774 }, { "epoch": 2.368372265480163, "grad_norm": 8.890625, "learning_rate": 7.631627734519837e-06, "loss": 2.9286, "mean_token_accuracy": 0.5000714387769681, "step": 12775 }, { "epoch": 2.3685576566555433, "grad_norm": 7.15234375, "learning_rate": 7.631442343344457e-06, "loss": 3.4681, "mean_token_accuracy": 0.42647552917459963, "step": 12776 }, { "epoch": 2.3687430478309235, "grad_norm": 9.90625, "learning_rate": 7.631256952169078e-06, "loss": 2.1824, "mean_token_accuracy": 0.5376392995224016, "step": 12777 }, { "epoch": 2.3689284390063032, "grad_norm": 6.87890625, "learning_rate": 7.631071560993697e-06, "loss": 3.3282, "mean_token_accuracy": 0.45767272028442396, "step": 12778 }, { "epoch": 2.3691138301816834, "grad_norm": 9.296875, "learning_rate": 7.630886169818317e-06, "loss": 2.4789, "mean_token_accuracy": 0.5095939933259177, "step": 12779 }, { "epoch": 2.369299221357063, "grad_norm": 6.55859375, "learning_rate": 7.630700778642938e-06, "loss": 2.8441, "mean_token_accuracy": 0.47527472527472525, "step": 12780 }, { "epoch": 2.3694846125324434, "grad_norm": 9.4140625, "learning_rate": 7.630515387467558e-06, "loss": 2.7123, "mean_token_accuracy": 0.4901937335565654, "step": 12781 }, { "epoch": 2.3696700037078235, "grad_norm": 7.55859375, "learning_rate": 7.630329996292177e-06, "loss": 3.7741, "mean_token_accuracy": 0.4423229912490056, "step": 12782 }, { "epoch": 2.3698553948832037, "grad_norm": 7.38671875, "learning_rate": 7.630144605116797e-06, "loss": 3.007, "mean_token_accuracy": 0.47838696312664, "step": 12783 }, { "epoch": 2.3700407860585835, "grad_norm": 8.0859375, "learning_rate": 7.629959213941416e-06, "loss": 3.2904, "mean_token_accuracy": 0.45183863885839737, "step": 12784 }, { "epoch": 2.3702261772339637, "grad_norm": 9.9609375, "learning_rate": 7.629773822766037e-06, "loss": 2.3593, "mean_token_accuracy": 0.5288651084201633, "step": 12785 }, { "epoch": 2.370411568409344, "grad_norm": 6.515625, "learning_rate": 7.629588431590657e-06, "loss": 2.935, "mean_token_accuracy": 0.45790634323517965, "step": 12786 }, { "epoch": 2.3705969595847236, "grad_norm": 8.2265625, "learning_rate": 7.629403040415276e-06, "loss": 2.9545, "mean_token_accuracy": 0.475384349628606, "step": 12787 }, { "epoch": 2.370782350760104, "grad_norm": 6.53125, "learning_rate": 7.629217649239897e-06, "loss": 2.971, "mean_token_accuracy": 0.46498467342607874, "step": 12788 }, { "epoch": 2.370967741935484, "grad_norm": 6.67578125, "learning_rate": 7.629032258064517e-06, "loss": 2.6182, "mean_token_accuracy": 0.5018462032428961, "step": 12789 }, { "epoch": 2.3711531331108637, "grad_norm": 6.2734375, "learning_rate": 7.6288468668891365e-06, "loss": 3.0286, "mean_token_accuracy": 0.4704869265695945, "step": 12790 }, { "epoch": 2.371338524286244, "grad_norm": 7.84375, "learning_rate": 7.628661475713757e-06, "loss": 2.747, "mean_token_accuracy": 0.4905637840420449, "step": 12791 }, { "epoch": 2.371523915461624, "grad_norm": 6.19140625, "learning_rate": 7.628476084538377e-06, "loss": 2.7038, "mean_token_accuracy": 0.48619224641529474, "step": 12792 }, { "epoch": 2.371709306637004, "grad_norm": 7.65234375, "learning_rate": 7.628290693362996e-06, "loss": 2.4417, "mean_token_accuracy": 0.5021608810818347, "step": 12793 }, { "epoch": 2.371894697812384, "grad_norm": 7.1875, "learning_rate": 7.628105302187616e-06, "loss": 2.465, "mean_token_accuracy": 0.490539916935856, "step": 12794 }, { "epoch": 2.3720800889877642, "grad_norm": 9.1796875, "learning_rate": 7.6279199110122356e-06, "loss": 1.8646, "mean_token_accuracy": 0.577714012434242, "step": 12795 }, { "epoch": 2.3722654801631444, "grad_norm": 9.03125, "learning_rate": 7.627734519836857e-06, "loss": 3.0283, "mean_token_accuracy": 0.459721146398141, "step": 12796 }, { "epoch": 2.372450871338524, "grad_norm": 8.28125, "learning_rate": 7.6275491286614766e-06, "loss": 3.0281, "mean_token_accuracy": 0.4505971769815418, "step": 12797 }, { "epoch": 2.3726362625139044, "grad_norm": 7.25, "learning_rate": 7.627363737486096e-06, "loss": 2.6155, "mean_token_accuracy": 0.5033410278216499, "step": 12798 }, { "epoch": 2.3728216536892845, "grad_norm": 7.04296875, "learning_rate": 7.627178346310717e-06, "loss": 2.5541, "mean_token_accuracy": 0.5141664558563117, "step": 12799 }, { "epoch": 2.3730070448646643, "grad_norm": 8.9140625, "learning_rate": 7.626992955135336e-06, "loss": 3.4132, "mean_token_accuracy": 0.45859133126934987, "step": 12800 }, { "epoch": 2.3731924360400445, "grad_norm": 7.10546875, "learning_rate": 7.626807563959956e-06, "loss": 2.4952, "mean_token_accuracy": 0.5132189489586706, "step": 12801 }, { "epoch": 2.3733778272154247, "grad_norm": 6.3515625, "learning_rate": 7.626622172784576e-06, "loss": 2.9273, "mean_token_accuracy": 0.47131593257205, "step": 12802 }, { "epoch": 2.3735632183908044, "grad_norm": 6.890625, "learning_rate": 7.626436781609195e-06, "loss": 3.5297, "mean_token_accuracy": 0.43121172353455817, "step": 12803 }, { "epoch": 2.3737486095661846, "grad_norm": 5.94921875, "learning_rate": 7.626251390433817e-06, "loss": 2.8368, "mean_token_accuracy": 0.490608284612514, "step": 12804 }, { "epoch": 2.373934000741565, "grad_norm": 7.62890625, "learning_rate": 7.626065999258436e-06, "loss": 3.4998, "mean_token_accuracy": 0.4524793388429752, "step": 12805 }, { "epoch": 2.3741193919169445, "grad_norm": 6.06640625, "learning_rate": 7.625880608083056e-06, "loss": 2.3121, "mean_token_accuracy": 0.5234814398200225, "step": 12806 }, { "epoch": 2.3743047830923247, "grad_norm": 7.0703125, "learning_rate": 7.6256952169076755e-06, "loss": 3.0902, "mean_token_accuracy": 0.43080900243309, "step": 12807 }, { "epoch": 2.374490174267705, "grad_norm": 6.6015625, "learning_rate": 7.625509825732296e-06, "loss": 3.2591, "mean_token_accuracy": 0.446748053041465, "step": 12808 }, { "epoch": 2.374675565443085, "grad_norm": 9.34375, "learning_rate": 7.625324434556916e-06, "loss": 2.8188, "mean_token_accuracy": 0.47324743393754093, "step": 12809 }, { "epoch": 2.374860956618465, "grad_norm": 6.33203125, "learning_rate": 7.625139043381535e-06, "loss": 2.7375, "mean_token_accuracy": 0.4955296667569764, "step": 12810 }, { "epoch": 2.375046347793845, "grad_norm": 8.1640625, "learning_rate": 7.624953652206155e-06, "loss": 2.8593, "mean_token_accuracy": 0.4711111111111111, "step": 12811 }, { "epoch": 2.3752317389692252, "grad_norm": 8.640625, "learning_rate": 7.624768261030776e-06, "loss": 3.0338, "mean_token_accuracy": 0.45329315540249676, "step": 12812 }, { "epoch": 2.375417130144605, "grad_norm": 8.453125, "learning_rate": 7.624582869855396e-06, "loss": 2.7287, "mean_token_accuracy": 0.49201791661881245, "step": 12813 }, { "epoch": 2.375602521319985, "grad_norm": 6.98046875, "learning_rate": 7.624397478680016e-06, "loss": 2.9999, "mean_token_accuracy": 0.46235619630115043, "step": 12814 }, { "epoch": 2.3757879124953654, "grad_norm": 8.0078125, "learning_rate": 7.624212087504635e-06, "loss": 3.1702, "mean_token_accuracy": 0.45966207899742645, "step": 12815 }, { "epoch": 2.375973303670745, "grad_norm": 7.17578125, "learning_rate": 7.624026696329255e-06, "loss": 2.641, "mean_token_accuracy": 0.49703703703703705, "step": 12816 }, { "epoch": 2.3761586948461253, "grad_norm": 6.90625, "learning_rate": 7.623841305153875e-06, "loss": 2.6376, "mean_token_accuracy": 0.5033916117321104, "step": 12817 }, { "epoch": 2.3763440860215055, "grad_norm": 8.6171875, "learning_rate": 7.623655913978495e-06, "loss": 2.9636, "mean_token_accuracy": 0.4859513930053349, "step": 12818 }, { "epoch": 2.3765294771968852, "grad_norm": 6.7890625, "learning_rate": 7.623470522803115e-06, "loss": 2.6822, "mean_token_accuracy": 0.4948914431673052, "step": 12819 }, { "epoch": 2.3767148683722654, "grad_norm": 8.484375, "learning_rate": 7.623285131627736e-06, "loss": 2.4426, "mean_token_accuracy": 0.5233412322274882, "step": 12820 }, { "epoch": 2.3769002595476456, "grad_norm": 7.6875, "learning_rate": 7.623099740452356e-06, "loss": 2.562, "mean_token_accuracy": 0.5269776440240757, "step": 12821 }, { "epoch": 2.377085650723026, "grad_norm": 7.609375, "learning_rate": 7.622914349276975e-06, "loss": 2.6695, "mean_token_accuracy": 0.47727876694127025, "step": 12822 }, { "epoch": 2.3772710418984055, "grad_norm": 7.578125, "learning_rate": 7.622728958101595e-06, "loss": 2.6549, "mean_token_accuracy": 0.5326473577235772, "step": 12823 }, { "epoch": 2.3774564330737857, "grad_norm": 7.83984375, "learning_rate": 7.6225435669262146e-06, "loss": 2.8208, "mean_token_accuracy": 0.48042574015170053, "step": 12824 }, { "epoch": 2.377641824249166, "grad_norm": 8.1328125, "learning_rate": 7.622358175750835e-06, "loss": 3.0595, "mean_token_accuracy": 0.45216606498194944, "step": 12825 }, { "epoch": 2.3778272154245457, "grad_norm": 7.1953125, "learning_rate": 7.622172784575455e-06, "loss": 2.216, "mean_token_accuracy": 0.5622107629247654, "step": 12826 }, { "epoch": 2.378012606599926, "grad_norm": 8.59375, "learning_rate": 7.621987393400074e-06, "loss": 3.2191, "mean_token_accuracy": 0.45646265898467225, "step": 12827 }, { "epoch": 2.378197997775306, "grad_norm": 7.56640625, "learning_rate": 7.621802002224695e-06, "loss": 2.6798, "mean_token_accuracy": 0.5193181818181818, "step": 12828 }, { "epoch": 2.378383388950686, "grad_norm": 7.24609375, "learning_rate": 7.621616611049315e-06, "loss": 2.6441, "mean_token_accuracy": 0.49170687575392036, "step": 12829 }, { "epoch": 2.378568780126066, "grad_norm": 11.1484375, "learning_rate": 7.621431219873935e-06, "loss": 4.4324, "mean_token_accuracy": 0.4349605378544285, "step": 12830 }, { "epoch": 2.378754171301446, "grad_norm": 8.6171875, "learning_rate": 7.621245828698555e-06, "loss": 2.534, "mean_token_accuracy": 0.48733862959285007, "step": 12831 }, { "epoch": 2.378939562476826, "grad_norm": 8.203125, "learning_rate": 7.621060437523174e-06, "loss": 4.0336, "mean_token_accuracy": 0.4362327358213341, "step": 12832 }, { "epoch": 2.379124953652206, "grad_norm": 8.34375, "learning_rate": 7.620875046347794e-06, "loss": 2.8846, "mean_token_accuracy": 0.47443216324547555, "step": 12833 }, { "epoch": 2.3793103448275863, "grad_norm": 7.42578125, "learning_rate": 7.620689655172414e-06, "loss": 3.2789, "mean_token_accuracy": 0.4586063132817153, "step": 12834 }, { "epoch": 2.3794957360029665, "grad_norm": 6.84765625, "learning_rate": 7.620504263997034e-06, "loss": 2.6632, "mean_token_accuracy": 0.48108907231735865, "step": 12835 }, { "epoch": 2.3796811271783462, "grad_norm": 7.9453125, "learning_rate": 7.6203188728216545e-06, "loss": 2.7342, "mean_token_accuracy": 0.5299502487562189, "step": 12836 }, { "epoch": 2.3798665183537264, "grad_norm": 7.3984375, "learning_rate": 7.620133481646275e-06, "loss": 2.6479, "mean_token_accuracy": 0.48641941265683164, "step": 12837 }, { "epoch": 2.380051909529106, "grad_norm": 12.4375, "learning_rate": 7.619948090470895e-06, "loss": 3.1233, "mean_token_accuracy": 0.4884450784593438, "step": 12838 }, { "epoch": 2.3802373007044864, "grad_norm": 9.484375, "learning_rate": 7.619762699295514e-06, "loss": 3.0892, "mean_token_accuracy": 0.4685679239497087, "step": 12839 }, { "epoch": 2.3804226918798665, "grad_norm": 7.12109375, "learning_rate": 7.619577308120134e-06, "loss": 2.8422, "mean_token_accuracy": 0.47885491216655823, "step": 12840 }, { "epoch": 2.3806080830552467, "grad_norm": 6.81640625, "learning_rate": 7.619391916944754e-06, "loss": 2.6232, "mean_token_accuracy": 0.4951985093879891, "step": 12841 }, { "epoch": 2.3807934742306265, "grad_norm": 7.10546875, "learning_rate": 7.619206525769374e-06, "loss": 2.7345, "mean_token_accuracy": 0.484733756717147, "step": 12842 }, { "epoch": 2.3809788654060067, "grad_norm": 7.8046875, "learning_rate": 7.619021134593994e-06, "loss": 3.1829, "mean_token_accuracy": 0.46738333740532617, "step": 12843 }, { "epoch": 2.381164256581387, "grad_norm": 6.37109375, "learning_rate": 7.618835743418614e-06, "loss": 2.319, "mean_token_accuracy": 0.5451876436047995, "step": 12844 }, { "epoch": 2.3813496477567666, "grad_norm": 6.390625, "learning_rate": 7.618650352243234e-06, "loss": 2.8467, "mean_token_accuracy": 0.49559533958510943, "step": 12845 }, { "epoch": 2.381535038932147, "grad_norm": 5.94921875, "learning_rate": 7.618464961067854e-06, "loss": 2.7228, "mean_token_accuracy": 0.4895330112721417, "step": 12846 }, { "epoch": 2.381720430107527, "grad_norm": 7.09375, "learning_rate": 7.618279569892474e-06, "loss": 2.5239, "mean_token_accuracy": 0.49325100516944287, "step": 12847 }, { "epoch": 2.381905821282907, "grad_norm": 6.47265625, "learning_rate": 7.618094178717094e-06, "loss": 3.1132, "mean_token_accuracy": 0.4339475733194913, "step": 12848 }, { "epoch": 2.382091212458287, "grad_norm": 8.1171875, "learning_rate": 7.617908787541713e-06, "loss": 2.6658, "mean_token_accuracy": 0.4815748031496063, "step": 12849 }, { "epoch": 2.382276603633667, "grad_norm": 7.58984375, "learning_rate": 7.617723396366333e-06, "loss": 2.9789, "mean_token_accuracy": 0.465514839069249, "step": 12850 }, { "epoch": 2.382461994809047, "grad_norm": 8.8203125, "learning_rate": 7.6175380051909534e-06, "loss": 3.4512, "mean_token_accuracy": 0.4434701877449969, "step": 12851 }, { "epoch": 2.382647385984427, "grad_norm": 6.8046875, "learning_rate": 7.617352614015574e-06, "loss": 3.4752, "mean_token_accuracy": 0.45186781609195403, "step": 12852 }, { "epoch": 2.3828327771598072, "grad_norm": 7.76171875, "learning_rate": 7.6171672228401936e-06, "loss": 2.8936, "mean_token_accuracy": 0.47450587224291035, "step": 12853 }, { "epoch": 2.3830181683351874, "grad_norm": 7.13671875, "learning_rate": 7.616981831664813e-06, "loss": 3.2169, "mean_token_accuracy": 0.45313782991202345, "step": 12854 }, { "epoch": 2.383203559510567, "grad_norm": 6.23046875, "learning_rate": 7.616796440489434e-06, "loss": 2.547, "mean_token_accuracy": 0.5054102177783866, "step": 12855 }, { "epoch": 2.3833889506859474, "grad_norm": 8.0234375, "learning_rate": 7.616611049314053e-06, "loss": 3.2302, "mean_token_accuracy": 0.46348019165595417, "step": 12856 }, { "epoch": 2.3835743418613276, "grad_norm": 8.71875, "learning_rate": 7.616425658138673e-06, "loss": 3.3054, "mean_token_accuracy": 0.4574215330749031, "step": 12857 }, { "epoch": 2.3837597330367073, "grad_norm": 5.88671875, "learning_rate": 7.616240266963293e-06, "loss": 2.4971, "mean_token_accuracy": 0.5349489103897278, "step": 12858 }, { "epoch": 2.3839451242120875, "grad_norm": 7.33203125, "learning_rate": 7.616054875787912e-06, "loss": 3.0072, "mean_token_accuracy": 0.4363283775048481, "step": 12859 }, { "epoch": 2.3841305153874677, "grad_norm": 6.35546875, "learning_rate": 7.615869484612534e-06, "loss": 2.9194, "mean_token_accuracy": 0.4763799104922924, "step": 12860 }, { "epoch": 2.3843159065628474, "grad_norm": 7.11328125, "learning_rate": 7.615684093437153e-06, "loss": 2.6495, "mean_token_accuracy": 0.5037143541816439, "step": 12861 }, { "epoch": 2.3845012977382276, "grad_norm": 7.609375, "learning_rate": 7.615498702261773e-06, "loss": 2.6164, "mean_token_accuracy": 0.5080191184280404, "step": 12862 }, { "epoch": 2.384686688913608, "grad_norm": 7.1953125, "learning_rate": 7.615313311086393e-06, "loss": 3.049, "mean_token_accuracy": 0.46779038718291055, "step": 12863 }, { "epoch": 2.3848720800889875, "grad_norm": 7.828125, "learning_rate": 7.615127919911013e-06, "loss": 2.7287, "mean_token_accuracy": 0.506372396642835, "step": 12864 }, { "epoch": 2.3850574712643677, "grad_norm": 7.046875, "learning_rate": 7.614942528735633e-06, "loss": 3.0195, "mean_token_accuracy": 0.46064209274673007, "step": 12865 }, { "epoch": 2.385242862439748, "grad_norm": 10.875, "learning_rate": 7.614757137560252e-06, "loss": 2.9555, "mean_token_accuracy": 0.4691485694588073, "step": 12866 }, { "epoch": 2.385428253615128, "grad_norm": 8.328125, "learning_rate": 7.614571746384872e-06, "loss": 2.7083, "mean_token_accuracy": 0.48321678321678324, "step": 12867 }, { "epoch": 2.385613644790508, "grad_norm": 8.8125, "learning_rate": 7.6143863552094925e-06, "loss": 2.2453, "mean_token_accuracy": 0.5366689513365319, "step": 12868 }, { "epoch": 2.385799035965888, "grad_norm": 10.8125, "learning_rate": 7.614200964034113e-06, "loss": 2.9106, "mean_token_accuracy": 0.49093581577658013, "step": 12869 }, { "epoch": 2.3859844271412682, "grad_norm": 7.85546875, "learning_rate": 7.614015572858733e-06, "loss": 3.0141, "mean_token_accuracy": 0.4716036228023442, "step": 12870 }, { "epoch": 2.386169818316648, "grad_norm": 8.15625, "learning_rate": 7.613830181683352e-06, "loss": 2.4281, "mean_token_accuracy": 0.5031372966074658, "step": 12871 }, { "epoch": 2.386355209492028, "grad_norm": 8.0703125, "learning_rate": 7.613644790507973e-06, "loss": 3.343, "mean_token_accuracy": 0.41998318385650224, "step": 12872 }, { "epoch": 2.3865406006674084, "grad_norm": 7.6015625, "learning_rate": 7.613459399332592e-06, "loss": 2.7884, "mean_token_accuracy": 0.4732039397450753, "step": 12873 }, { "epoch": 2.386725991842788, "grad_norm": 7.98828125, "learning_rate": 7.613274008157212e-06, "loss": 2.8451, "mean_token_accuracy": 0.48303055907618647, "step": 12874 }, { "epoch": 2.3869113830181683, "grad_norm": 8.109375, "learning_rate": 7.613088616981832e-06, "loss": 2.248, "mean_token_accuracy": 0.5320586214792764, "step": 12875 }, { "epoch": 2.3870967741935485, "grad_norm": 6.4375, "learning_rate": 7.612903225806451e-06, "loss": 3.1429, "mean_token_accuracy": 0.4531781754308925, "step": 12876 }, { "epoch": 2.3872821653689282, "grad_norm": 7.55078125, "learning_rate": 7.612717834631073e-06, "loss": 2.7299, "mean_token_accuracy": 0.4880334161210205, "step": 12877 }, { "epoch": 2.3874675565443084, "grad_norm": 7.1328125, "learning_rate": 7.612532443455692e-06, "loss": 2.5029, "mean_token_accuracy": 0.4981395348837209, "step": 12878 }, { "epoch": 2.3876529477196886, "grad_norm": 6.88671875, "learning_rate": 7.612347052280312e-06, "loss": 2.3593, "mean_token_accuracy": 0.5202710207708542, "step": 12879 }, { "epoch": 2.387838338895069, "grad_norm": 6.671875, "learning_rate": 7.6121616611049324e-06, "loss": 2.7799, "mean_token_accuracy": 0.47780496329954564, "step": 12880 }, { "epoch": 2.3880237300704485, "grad_norm": 7.578125, "learning_rate": 7.611976269929552e-06, "loss": 2.6593, "mean_token_accuracy": 0.48833795465666285, "step": 12881 }, { "epoch": 2.3882091212458287, "grad_norm": 8.03125, "learning_rate": 7.611790878754172e-06, "loss": 2.0249, "mean_token_accuracy": 0.601905239408373, "step": 12882 }, { "epoch": 2.388394512421209, "grad_norm": 6.79296875, "learning_rate": 7.611605487578791e-06, "loss": 2.6771, "mean_token_accuracy": 0.5175071857852104, "step": 12883 }, { "epoch": 2.3885799035965887, "grad_norm": 7.52734375, "learning_rate": 7.611420096403411e-06, "loss": 2.5734, "mean_token_accuracy": 0.4999361348831268, "step": 12884 }, { "epoch": 2.388765294771969, "grad_norm": 7.36328125, "learning_rate": 7.611234705228032e-06, "loss": 2.7689, "mean_token_accuracy": 0.46973754686663094, "step": 12885 }, { "epoch": 2.388950685947349, "grad_norm": 6.9453125, "learning_rate": 7.611049314052652e-06, "loss": 2.588, "mean_token_accuracy": 0.5028231797919762, "step": 12886 }, { "epoch": 2.389136077122729, "grad_norm": 5.75390625, "learning_rate": 7.610863922877272e-06, "loss": 2.4097, "mean_token_accuracy": 0.5393246067539325, "step": 12887 }, { "epoch": 2.389321468298109, "grad_norm": 6.296875, "learning_rate": 7.610678531701891e-06, "loss": 2.4244, "mean_token_accuracy": 0.4878986332574032, "step": 12888 }, { "epoch": 2.389506859473489, "grad_norm": 6.97265625, "learning_rate": 7.610493140526512e-06, "loss": 3.2427, "mean_token_accuracy": 0.4654140570633264, "step": 12889 }, { "epoch": 2.389692250648869, "grad_norm": 7.37890625, "learning_rate": 7.6103077493511314e-06, "loss": 2.3137, "mean_token_accuracy": 0.5266415827163491, "step": 12890 }, { "epoch": 2.389877641824249, "grad_norm": 6.921875, "learning_rate": 7.610122358175751e-06, "loss": 2.5531, "mean_token_accuracy": 0.5078793895024196, "step": 12891 }, { "epoch": 2.3900630329996293, "grad_norm": 6.4765625, "learning_rate": 7.609936967000371e-06, "loss": 3.0244, "mean_token_accuracy": 0.4844454463480613, "step": 12892 }, { "epoch": 2.3902484241750095, "grad_norm": 7.140625, "learning_rate": 7.609751575824992e-06, "loss": 3.117, "mean_token_accuracy": 0.4550881577120644, "step": 12893 }, { "epoch": 2.3904338153503892, "grad_norm": 6.34765625, "learning_rate": 7.609566184649612e-06, "loss": 2.7601, "mean_token_accuracy": 0.47757542810546344, "step": 12894 }, { "epoch": 2.3906192065257694, "grad_norm": 7.42578125, "learning_rate": 7.609380793474231e-06, "loss": 2.8455, "mean_token_accuracy": 0.4616951276829077, "step": 12895 }, { "epoch": 2.3908045977011496, "grad_norm": 7.90625, "learning_rate": 7.609195402298851e-06, "loss": 3.1748, "mean_token_accuracy": 0.45281594443605616, "step": 12896 }, { "epoch": 2.3909899888765294, "grad_norm": 6.4765625, "learning_rate": 7.609010011123471e-06, "loss": 2.6226, "mean_token_accuracy": 0.49319498719849075, "step": 12897 }, { "epoch": 2.3911753800519095, "grad_norm": 7.83203125, "learning_rate": 7.608824619948091e-06, "loss": 2.7255, "mean_token_accuracy": 0.4897150887951961, "step": 12898 }, { "epoch": 2.3913607712272897, "grad_norm": 7.84765625, "learning_rate": 7.608639228772711e-06, "loss": 2.9013, "mean_token_accuracy": 0.4596296296296296, "step": 12899 }, { "epoch": 2.3915461624026695, "grad_norm": 7.375, "learning_rate": 7.60845383759733e-06, "loss": 2.4629, "mean_token_accuracy": 0.5579504865131174, "step": 12900 }, { "epoch": 2.3917315535780497, "grad_norm": 10.234375, "learning_rate": 7.608268446421952e-06, "loss": 2.7144, "mean_token_accuracy": 0.47274696185777887, "step": 12901 }, { "epoch": 2.39191694475343, "grad_norm": 7.5859375, "learning_rate": 7.608083055246571e-06, "loss": 2.435, "mean_token_accuracy": 0.5075818036711891, "step": 12902 }, { "epoch": 2.3921023359288096, "grad_norm": 8.796875, "learning_rate": 7.607897664071191e-06, "loss": 2.8358, "mean_token_accuracy": 0.4858757062146893, "step": 12903 }, { "epoch": 2.39228772710419, "grad_norm": 8.3203125, "learning_rate": 7.607712272895811e-06, "loss": 2.5283, "mean_token_accuracy": 0.5091582424181763, "step": 12904 }, { "epoch": 2.39247311827957, "grad_norm": 9.3359375, "learning_rate": 7.60752688172043e-06, "loss": 3.6304, "mean_token_accuracy": 0.43239208856822986, "step": 12905 }, { "epoch": 2.39265850945495, "grad_norm": 8.328125, "learning_rate": 7.607341490545051e-06, "loss": 3.168, "mean_token_accuracy": 0.4575138678769541, "step": 12906 }, { "epoch": 2.39284390063033, "grad_norm": 7.484375, "learning_rate": 7.6071560993696705e-06, "loss": 2.5914, "mean_token_accuracy": 0.5168791312559018, "step": 12907 }, { "epoch": 2.39302929180571, "grad_norm": 8.3515625, "learning_rate": 7.60697070819429e-06, "loss": 2.0653, "mean_token_accuracy": 0.5919213973799127, "step": 12908 }, { "epoch": 2.39321468298109, "grad_norm": 7.20703125, "learning_rate": 7.606785317018911e-06, "loss": 2.1766, "mean_token_accuracy": 0.5245180926614812, "step": 12909 }, { "epoch": 2.39340007415647, "grad_norm": 7.328125, "learning_rate": 7.606599925843531e-06, "loss": 2.9521, "mean_token_accuracy": 0.4816683070866142, "step": 12910 }, { "epoch": 2.3935854653318502, "grad_norm": 9.0859375, "learning_rate": 7.606414534668151e-06, "loss": 2.5194, "mean_token_accuracy": 0.5206298828125, "step": 12911 }, { "epoch": 2.3937708565072304, "grad_norm": 7.4375, "learning_rate": 7.60622914349277e-06, "loss": 2.9665, "mean_token_accuracy": 0.45284872298624756, "step": 12912 }, { "epoch": 2.39395624768261, "grad_norm": 6.859375, "learning_rate": 7.60604375231739e-06, "loss": 3.1054, "mean_token_accuracy": 0.45099549004509953, "step": 12913 }, { "epoch": 2.3941416388579904, "grad_norm": 8.015625, "learning_rate": 7.60585836114201e-06, "loss": 2.5806, "mean_token_accuracy": 0.490507794412718, "step": 12914 }, { "epoch": 2.3943270300333706, "grad_norm": 10.3828125, "learning_rate": 7.60567296996663e-06, "loss": 2.4481, "mean_token_accuracy": 0.5089881288863765, "step": 12915 }, { "epoch": 2.3945124212087503, "grad_norm": 8.0703125, "learning_rate": 7.60548757879125e-06, "loss": 2.5725, "mean_token_accuracy": 0.5217889908256881, "step": 12916 }, { "epoch": 2.3946978123841305, "grad_norm": 9.5625, "learning_rate": 7.60530218761587e-06, "loss": 3.5452, "mean_token_accuracy": 0.4156528282954973, "step": 12917 }, { "epoch": 2.3948832035595107, "grad_norm": 8.34375, "learning_rate": 7.605116796440491e-06, "loss": 3.6632, "mean_token_accuracy": 0.437847866419295, "step": 12918 }, { "epoch": 2.395068594734891, "grad_norm": 6.53515625, "learning_rate": 7.6049314052651104e-06, "loss": 2.99, "mean_token_accuracy": 0.47394296951819076, "step": 12919 }, { "epoch": 2.3952539859102706, "grad_norm": 7.1171875, "learning_rate": 7.60474601408973e-06, "loss": 3.126, "mean_token_accuracy": 0.4464689265536723, "step": 12920 }, { "epoch": 2.395439377085651, "grad_norm": 8.4375, "learning_rate": 7.60456062291435e-06, "loss": 2.533, "mean_token_accuracy": 0.4963629638762175, "step": 12921 }, { "epoch": 2.3956247682610305, "grad_norm": 7.63671875, "learning_rate": 7.604375231738969e-06, "loss": 2.3776, "mean_token_accuracy": 0.5556081400851869, "step": 12922 }, { "epoch": 2.3958101594364107, "grad_norm": 8.0234375, "learning_rate": 7.60418984056359e-06, "loss": 2.6208, "mean_token_accuracy": 0.48827264481015936, "step": 12923 }, { "epoch": 2.395995550611791, "grad_norm": 7.02734375, "learning_rate": 7.6040044493882095e-06, "loss": 2.8721, "mean_token_accuracy": 0.48974189278623426, "step": 12924 }, { "epoch": 2.396180941787171, "grad_norm": 7.4140625, "learning_rate": 7.60381905821283e-06, "loss": 2.7974, "mean_token_accuracy": 0.46070460704607047, "step": 12925 }, { "epoch": 2.396366332962551, "grad_norm": 8.8359375, "learning_rate": 7.60363366703745e-06, "loss": 3.4779, "mean_token_accuracy": 0.4477413640389725, "step": 12926 }, { "epoch": 2.396551724137931, "grad_norm": 6.96484375, "learning_rate": 7.60344827586207e-06, "loss": 2.7552, "mean_token_accuracy": 0.4899527983816588, "step": 12927 }, { "epoch": 2.3967371153133112, "grad_norm": 7.703125, "learning_rate": 7.60326288468669e-06, "loss": 3.2684, "mean_token_accuracy": 0.4344903278913431, "step": 12928 }, { "epoch": 2.396922506488691, "grad_norm": 9.59375, "learning_rate": 7.603077493511309e-06, "loss": 2.831, "mean_token_accuracy": 0.4612857813233224, "step": 12929 }, { "epoch": 2.397107897664071, "grad_norm": 6.94921875, "learning_rate": 7.602892102335929e-06, "loss": 2.6646, "mean_token_accuracy": 0.48606581678031424, "step": 12930 }, { "epoch": 2.3972932888394514, "grad_norm": 8.15625, "learning_rate": 7.602706711160549e-06, "loss": 2.9582, "mean_token_accuracy": 0.4779817883266159, "step": 12931 }, { "epoch": 2.397478680014831, "grad_norm": 9.09375, "learning_rate": 7.602521319985169e-06, "loss": 3.0835, "mean_token_accuracy": 0.472478094810155, "step": 12932 }, { "epoch": 2.3976640711902113, "grad_norm": 14.3046875, "learning_rate": 7.60233592880979e-06, "loss": 2.5612, "mean_token_accuracy": 0.494176752683261, "step": 12933 }, { "epoch": 2.3978494623655915, "grad_norm": 9.0546875, "learning_rate": 7.602150537634409e-06, "loss": 2.8401, "mean_token_accuracy": 0.49625520110957005, "step": 12934 }, { "epoch": 2.3980348535409712, "grad_norm": 7.4296875, "learning_rate": 7.601965146459029e-06, "loss": 2.7248, "mean_token_accuracy": 0.5034584980237155, "step": 12935 }, { "epoch": 2.3982202447163514, "grad_norm": 7.203125, "learning_rate": 7.6017797552836495e-06, "loss": 2.7904, "mean_token_accuracy": 0.47932085328689594, "step": 12936 }, { "epoch": 2.3984056358917316, "grad_norm": 8.328125, "learning_rate": 7.601594364108269e-06, "loss": 2.6128, "mean_token_accuracy": 0.5543087020961254, "step": 12937 }, { "epoch": 2.398591027067112, "grad_norm": 6.55859375, "learning_rate": 7.601408972932889e-06, "loss": 2.6674, "mean_token_accuracy": 0.5333937354387781, "step": 12938 }, { "epoch": 2.3987764182424915, "grad_norm": 6.65234375, "learning_rate": 7.601223581757508e-06, "loss": 2.1034, "mean_token_accuracy": 0.5353776715783884, "step": 12939 }, { "epoch": 2.3989618094178717, "grad_norm": 9.09375, "learning_rate": 7.601038190582128e-06, "loss": 2.7531, "mean_token_accuracy": 0.474679682733374, "step": 12940 }, { "epoch": 2.399147200593252, "grad_norm": 9.9921875, "learning_rate": 7.600852799406749e-06, "loss": 3.1425, "mean_token_accuracy": 0.4515592515592516, "step": 12941 }, { "epoch": 2.3993325917686317, "grad_norm": 8.15625, "learning_rate": 7.600667408231369e-06, "loss": 3.2147, "mean_token_accuracy": 0.4527454817543456, "step": 12942 }, { "epoch": 2.399517982944012, "grad_norm": 9.3359375, "learning_rate": 7.600482017055989e-06, "loss": 2.9758, "mean_token_accuracy": 0.4651004426285325, "step": 12943 }, { "epoch": 2.399703374119392, "grad_norm": 15.671875, "learning_rate": 7.600296625880609e-06, "loss": 3.8154, "mean_token_accuracy": 0.4373692468619247, "step": 12944 }, { "epoch": 2.399888765294772, "grad_norm": 7.83203125, "learning_rate": 7.600111234705229e-06, "loss": 3.4938, "mean_token_accuracy": 0.4389093419236079, "step": 12945 }, { "epoch": 2.400074156470152, "grad_norm": 7.83203125, "learning_rate": 7.5999258435298484e-06, "loss": 3.1224, "mean_token_accuracy": 0.4634377967711301, "step": 12946 }, { "epoch": 2.400259547645532, "grad_norm": 7.13671875, "learning_rate": 7.599740452354468e-06, "loss": 2.4604, "mean_token_accuracy": 0.5184146341463415, "step": 12947 }, { "epoch": 2.400444938820912, "grad_norm": 7.40234375, "learning_rate": 7.599555061179088e-06, "loss": 3.3107, "mean_token_accuracy": 0.44049247606019154, "step": 12948 }, { "epoch": 2.400630329996292, "grad_norm": 8.1171875, "learning_rate": 7.599369670003709e-06, "loss": 3.1559, "mean_token_accuracy": 0.44666666666666666, "step": 12949 }, { "epoch": 2.4008157211716723, "grad_norm": 8.3515625, "learning_rate": 7.599184278828329e-06, "loss": 2.5909, "mean_token_accuracy": 0.4935864545920985, "step": 12950 }, { "epoch": 2.4010011123470525, "grad_norm": 7.984375, "learning_rate": 7.598998887652948e-06, "loss": 3.0329, "mean_token_accuracy": 0.45072402643047943, "step": 12951 }, { "epoch": 2.4011865035224322, "grad_norm": 12.671875, "learning_rate": 7.598813496477568e-06, "loss": 2.7612, "mean_token_accuracy": 0.4830166954519286, "step": 12952 }, { "epoch": 2.4013718946978124, "grad_norm": 11.2578125, "learning_rate": 7.5986281053021885e-06, "loss": 3.1774, "mean_token_accuracy": 0.5025528811086798, "step": 12953 }, { "epoch": 2.4015572858731926, "grad_norm": 10.1171875, "learning_rate": 7.598442714126808e-06, "loss": 2.9938, "mean_token_accuracy": 0.4864432109308284, "step": 12954 }, { "epoch": 2.4017426770485724, "grad_norm": 9.9765625, "learning_rate": 7.598257322951428e-06, "loss": 2.7588, "mean_token_accuracy": 0.49134014764338446, "step": 12955 }, { "epoch": 2.4019280682239526, "grad_norm": 10.4140625, "learning_rate": 7.5980719317760474e-06, "loss": 2.8426, "mean_token_accuracy": 0.46921965317919073, "step": 12956 }, { "epoch": 2.4021134593993327, "grad_norm": 6.42578125, "learning_rate": 7.597886540600669e-06, "loss": 2.4938, "mean_token_accuracy": 0.49924318869828455, "step": 12957 }, { "epoch": 2.4022988505747125, "grad_norm": 7.1015625, "learning_rate": 7.597701149425288e-06, "loss": 3.2672, "mean_token_accuracy": 0.46181935345290515, "step": 12958 }, { "epoch": 2.4024842417500927, "grad_norm": 9.7578125, "learning_rate": 7.597515758249908e-06, "loss": 2.746, "mean_token_accuracy": 0.4760833708202129, "step": 12959 }, { "epoch": 2.402669632925473, "grad_norm": 9.0625, "learning_rate": 7.597330367074528e-06, "loss": 3.4525, "mean_token_accuracy": 0.45295353724025217, "step": 12960 }, { "epoch": 2.4028550241008526, "grad_norm": 6.9609375, "learning_rate": 7.597144975899148e-06, "loss": 3.0148, "mean_token_accuracy": 0.4643744821872411, "step": 12961 }, { "epoch": 2.403040415276233, "grad_norm": 9.796875, "learning_rate": 7.596959584723768e-06, "loss": 2.6426, "mean_token_accuracy": 0.4912237330037083, "step": 12962 }, { "epoch": 2.403225806451613, "grad_norm": 7.04296875, "learning_rate": 7.5967741935483875e-06, "loss": 2.7152, "mean_token_accuracy": 0.4900288818594416, "step": 12963 }, { "epoch": 2.403411197626993, "grad_norm": 6.73046875, "learning_rate": 7.596588802373007e-06, "loss": 2.7653, "mean_token_accuracy": 0.49639159083777845, "step": 12964 }, { "epoch": 2.403596588802373, "grad_norm": 9.3515625, "learning_rate": 7.5964034111976285e-06, "loss": 2.9292, "mean_token_accuracy": 0.4719946272666219, "step": 12965 }, { "epoch": 2.403781979977753, "grad_norm": 10.203125, "learning_rate": 7.596218020022248e-06, "loss": 2.996, "mean_token_accuracy": 0.43944887599709936, "step": 12966 }, { "epoch": 2.4039673711531333, "grad_norm": 7.55078125, "learning_rate": 7.596032628846868e-06, "loss": 2.9697, "mean_token_accuracy": 0.46889101599890604, "step": 12967 }, { "epoch": 2.404152762328513, "grad_norm": 8.6171875, "learning_rate": 7.595847237671487e-06, "loss": 2.9917, "mean_token_accuracy": 0.46231865451277443, "step": 12968 }, { "epoch": 2.4043381535038932, "grad_norm": 9.2890625, "learning_rate": 7.595661846496107e-06, "loss": 2.7733, "mean_token_accuracy": 0.4686333084391337, "step": 12969 }, { "epoch": 2.4045235446792734, "grad_norm": 8.0703125, "learning_rate": 7.5954764553207275e-06, "loss": 2.6628, "mean_token_accuracy": 0.4991941982272361, "step": 12970 }, { "epoch": 2.404708935854653, "grad_norm": 6.10546875, "learning_rate": 7.595291064145347e-06, "loss": 2.7914, "mean_token_accuracy": 0.48505258182677535, "step": 12971 }, { "epoch": 2.4048943270300334, "grad_norm": 6.92578125, "learning_rate": 7.595105672969967e-06, "loss": 2.4565, "mean_token_accuracy": 0.5098551412966041, "step": 12972 }, { "epoch": 2.4050797182054136, "grad_norm": 9.671875, "learning_rate": 7.594920281794587e-06, "loss": 2.869, "mean_token_accuracy": 0.48788057936742535, "step": 12973 }, { "epoch": 2.4052651093807933, "grad_norm": 6.85546875, "learning_rate": 7.594734890619208e-06, "loss": 3.1107, "mean_token_accuracy": 0.45315990014452767, "step": 12974 }, { "epoch": 2.4054505005561735, "grad_norm": 6.4921875, "learning_rate": 7.5945494994438274e-06, "loss": 2.8511, "mean_token_accuracy": 0.4883617823099091, "step": 12975 }, { "epoch": 2.4056358917315537, "grad_norm": 6.58203125, "learning_rate": 7.594364108268447e-06, "loss": 3.1219, "mean_token_accuracy": 0.46508844261204946, "step": 12976 }, { "epoch": 2.405821282906934, "grad_norm": 6.4921875, "learning_rate": 7.594178717093067e-06, "loss": 3.2758, "mean_token_accuracy": 0.4416915293812427, "step": 12977 }, { "epoch": 2.4060066740823136, "grad_norm": 8.4296875, "learning_rate": 7.593993325917686e-06, "loss": 2.6859, "mean_token_accuracy": 0.5001744388882429, "step": 12978 }, { "epoch": 2.406192065257694, "grad_norm": 11.09375, "learning_rate": 7.593807934742307e-06, "loss": 2.4943, "mean_token_accuracy": 0.4905067434856619, "step": 12979 }, { "epoch": 2.4063774564330735, "grad_norm": 7.64453125, "learning_rate": 7.5936225435669265e-06, "loss": 2.7061, "mean_token_accuracy": 0.47316276537833424, "step": 12980 }, { "epoch": 2.4065628476084537, "grad_norm": 7.7109375, "learning_rate": 7.593437152391547e-06, "loss": 2.385, "mean_token_accuracy": 0.5164886205294937, "step": 12981 }, { "epoch": 2.406748238783834, "grad_norm": 9.609375, "learning_rate": 7.5932517612161675e-06, "loss": 2.9225, "mean_token_accuracy": 0.4590711660541263, "step": 12982 }, { "epoch": 2.406933629959214, "grad_norm": 9.5625, "learning_rate": 7.593066370040787e-06, "loss": 3.3258, "mean_token_accuracy": 0.45997267759562843, "step": 12983 }, { "epoch": 2.407119021134594, "grad_norm": 8.6171875, "learning_rate": 7.592880978865407e-06, "loss": 2.4454, "mean_token_accuracy": 0.5645561249793354, "step": 12984 }, { "epoch": 2.407304412309974, "grad_norm": 7.2421875, "learning_rate": 7.5926955876900264e-06, "loss": 2.9038, "mean_token_accuracy": 0.4656141868512111, "step": 12985 }, { "epoch": 2.4074898034853542, "grad_norm": 8.671875, "learning_rate": 7.592510196514646e-06, "loss": 2.3729, "mean_token_accuracy": 0.516890924706556, "step": 12986 }, { "epoch": 2.407675194660734, "grad_norm": 7.046875, "learning_rate": 7.5923248053392666e-06, "loss": 2.8087, "mean_token_accuracy": 0.4920895347474511, "step": 12987 }, { "epoch": 2.407860585836114, "grad_norm": 7.6171875, "learning_rate": 7.592139414163886e-06, "loss": 2.3066, "mean_token_accuracy": 0.5226023213194869, "step": 12988 }, { "epoch": 2.4080459770114944, "grad_norm": 9.4453125, "learning_rate": 7.591954022988507e-06, "loss": 2.8436, "mean_token_accuracy": 0.4660321237358715, "step": 12989 }, { "epoch": 2.408231368186874, "grad_norm": 8.5859375, "learning_rate": 7.591768631813126e-06, "loss": 3.7846, "mean_token_accuracy": 0.43788691704498556, "step": 12990 }, { "epoch": 2.4084167593622543, "grad_norm": 8.5234375, "learning_rate": 7.591583240637747e-06, "loss": 2.9494, "mean_token_accuracy": 0.4871457785568215, "step": 12991 }, { "epoch": 2.4086021505376345, "grad_norm": 8.4375, "learning_rate": 7.5913978494623665e-06, "loss": 2.3989, "mean_token_accuracy": 0.5166487647690655, "step": 12992 }, { "epoch": 2.4087875417130142, "grad_norm": 6.4921875, "learning_rate": 7.591212458286986e-06, "loss": 2.4155, "mean_token_accuracy": 0.511744738628649, "step": 12993 }, { "epoch": 2.4089729328883944, "grad_norm": 7.58203125, "learning_rate": 7.591027067111606e-06, "loss": 2.3309, "mean_token_accuracy": 0.513776102088167, "step": 12994 }, { "epoch": 2.4091583240637746, "grad_norm": 8.3125, "learning_rate": 7.590841675936225e-06, "loss": 2.9935, "mean_token_accuracy": 0.48462948980692483, "step": 12995 }, { "epoch": 2.409343715239155, "grad_norm": 7.96484375, "learning_rate": 7.590656284760846e-06, "loss": 3.0165, "mean_token_accuracy": 0.47132904608788856, "step": 12996 }, { "epoch": 2.4095291064145345, "grad_norm": 7.171875, "learning_rate": 7.5904708935854656e-06, "loss": 3.0407, "mean_token_accuracy": 0.4814032121724429, "step": 12997 }, { "epoch": 2.4097144975899147, "grad_norm": 7.6796875, "learning_rate": 7.590285502410086e-06, "loss": 2.7852, "mean_token_accuracy": 0.5058531394111387, "step": 12998 }, { "epoch": 2.409899888765295, "grad_norm": 8.546875, "learning_rate": 7.5901001112347065e-06, "loss": 3.1236, "mean_token_accuracy": 0.517587373167982, "step": 12999 }, { "epoch": 2.4100852799406747, "grad_norm": 7.1640625, "learning_rate": 7.589914720059326e-06, "loss": 2.7384, "mean_token_accuracy": 0.5039335015585572, "step": 13000 }, { "epoch": 2.410270671116055, "grad_norm": 8.34375, "learning_rate": 7.589729328883946e-06, "loss": 3.5377, "mean_token_accuracy": 0.4237006767973439, "step": 13001 }, { "epoch": 2.410456062291435, "grad_norm": 7.6328125, "learning_rate": 7.5895439377085655e-06, "loss": 3.0899, "mean_token_accuracy": 0.4553504747679505, "step": 13002 }, { "epoch": 2.410641453466815, "grad_norm": 7.078125, "learning_rate": 7.589358546533185e-06, "loss": 2.0258, "mean_token_accuracy": 0.5856121814791796, "step": 13003 }, { "epoch": 2.410826844642195, "grad_norm": 9.5703125, "learning_rate": 7.589173155357806e-06, "loss": 2.7222, "mean_token_accuracy": 0.5356884494815529, "step": 13004 }, { "epoch": 2.411012235817575, "grad_norm": 6.5078125, "learning_rate": 7.588987764182425e-06, "loss": 3.0677, "mean_token_accuracy": 0.45830202854996244, "step": 13005 }, { "epoch": 2.411197626992955, "grad_norm": 7.078125, "learning_rate": 7.588802373007046e-06, "loss": 2.6529, "mean_token_accuracy": 0.5003707548568886, "step": 13006 }, { "epoch": 2.411383018168335, "grad_norm": 7.96875, "learning_rate": 7.588616981831665e-06, "loss": 2.8894, "mean_token_accuracy": 0.47344759763978644, "step": 13007 }, { "epoch": 2.4115684093437153, "grad_norm": 7.7890625, "learning_rate": 7.588431590656286e-06, "loss": 3.5992, "mean_token_accuracy": 0.44351851851851853, "step": 13008 }, { "epoch": 2.4117538005190955, "grad_norm": 8.53125, "learning_rate": 7.5882461994809055e-06, "loss": 2.5435, "mean_token_accuracy": 0.5276387896116274, "step": 13009 }, { "epoch": 2.4119391916944752, "grad_norm": 9.4453125, "learning_rate": 7.588060808305525e-06, "loss": 2.523, "mean_token_accuracy": 0.4912124977809338, "step": 13010 }, { "epoch": 2.4121245828698554, "grad_norm": 9.359375, "learning_rate": 7.587875417130145e-06, "loss": 2.4542, "mean_token_accuracy": 0.5363849765258216, "step": 13011 }, { "epoch": 2.4123099740452356, "grad_norm": 8.1796875, "learning_rate": 7.5876900259547644e-06, "loss": 2.6371, "mean_token_accuracy": 0.5310610932475884, "step": 13012 }, { "epoch": 2.4124953652206154, "grad_norm": 6.9140625, "learning_rate": 7.587504634779385e-06, "loss": 2.9038, "mean_token_accuracy": 0.46286797502230154, "step": 13013 }, { "epoch": 2.4126807563959956, "grad_norm": 7.26171875, "learning_rate": 7.5873192436040054e-06, "loss": 2.6516, "mean_token_accuracy": 0.4775266948100323, "step": 13014 }, { "epoch": 2.4128661475713757, "grad_norm": 9.0, "learning_rate": 7.587133852428625e-06, "loss": 3.1117, "mean_token_accuracy": 0.442954627080091, "step": 13015 }, { "epoch": 2.4130515387467555, "grad_norm": 9.4453125, "learning_rate": 7.586948461253245e-06, "loss": 2.4102, "mean_token_accuracy": 0.5238639611317519, "step": 13016 }, { "epoch": 2.4132369299221357, "grad_norm": 6.84765625, "learning_rate": 7.586763070077865e-06, "loss": 2.1421, "mean_token_accuracy": 0.5573435156143461, "step": 13017 }, { "epoch": 2.413422321097516, "grad_norm": 7.5234375, "learning_rate": 7.586577678902485e-06, "loss": 3.0548, "mean_token_accuracy": 0.484260391198044, "step": 13018 }, { "epoch": 2.4136077122728956, "grad_norm": 13.6484375, "learning_rate": 7.5863922877271045e-06, "loss": 2.4858, "mean_token_accuracy": 0.5031133250311333, "step": 13019 }, { "epoch": 2.413793103448276, "grad_norm": 9.5390625, "learning_rate": 7.586206896551724e-06, "loss": 2.9742, "mean_token_accuracy": 0.4608003744441844, "step": 13020 }, { "epoch": 2.413978494623656, "grad_norm": 8.453125, "learning_rate": 7.586021505376344e-06, "loss": 2.9751, "mean_token_accuracy": 0.44908202351076476, "step": 13021 }, { "epoch": 2.414163885799036, "grad_norm": 13.40625, "learning_rate": 7.585836114200965e-06, "loss": 3.6725, "mean_token_accuracy": 0.40941603501654744, "step": 13022 }, { "epoch": 2.414349276974416, "grad_norm": 9.265625, "learning_rate": 7.585650723025585e-06, "loss": 2.4709, "mean_token_accuracy": 0.5056165593840717, "step": 13023 }, { "epoch": 2.414534668149796, "grad_norm": 8.6171875, "learning_rate": 7.585465331850204e-06, "loss": 2.9383, "mean_token_accuracy": 0.4823837049270575, "step": 13024 }, { "epoch": 2.4147200593251763, "grad_norm": 7.46875, "learning_rate": 7.585279940674825e-06, "loss": 2.7085, "mean_token_accuracy": 0.532874139010645, "step": 13025 }, { "epoch": 2.414905450500556, "grad_norm": 8.5078125, "learning_rate": 7.5850945494994446e-06, "loss": 2.7346, "mean_token_accuracy": 0.4952848469190027, "step": 13026 }, { "epoch": 2.4150908416759362, "grad_norm": 7.59765625, "learning_rate": 7.584909158324064e-06, "loss": 2.8501, "mean_token_accuracy": 0.49902504874756265, "step": 13027 }, { "epoch": 2.4152762328513164, "grad_norm": 8.609375, "learning_rate": 7.584723767148684e-06, "loss": 2.7343, "mean_token_accuracy": 0.4787338192102687, "step": 13028 }, { "epoch": 2.415461624026696, "grad_norm": 8.1484375, "learning_rate": 7.5845383759733035e-06, "loss": 2.5705, "mean_token_accuracy": 0.5290199809705043, "step": 13029 }, { "epoch": 2.4156470152020764, "grad_norm": 7.1484375, "learning_rate": 7.584352984797925e-06, "loss": 3.3533, "mean_token_accuracy": 0.44161073825503355, "step": 13030 }, { "epoch": 2.4158324063774566, "grad_norm": 7.01171875, "learning_rate": 7.5841675936225445e-06, "loss": 2.3582, "mean_token_accuracy": 0.5567706658089417, "step": 13031 }, { "epoch": 2.4160177975528363, "grad_norm": 12.0390625, "learning_rate": 7.583982202447164e-06, "loss": 2.4232, "mean_token_accuracy": 0.5548558170367915, "step": 13032 }, { "epoch": 2.4162031887282165, "grad_norm": 14.765625, "learning_rate": 7.583796811271784e-06, "loss": 2.5361, "mean_token_accuracy": 0.5014637319744117, "step": 13033 }, { "epoch": 2.4163885799035967, "grad_norm": 7.78515625, "learning_rate": 7.583611420096404e-06, "loss": 2.9985, "mean_token_accuracy": 0.45525068237322225, "step": 13034 }, { "epoch": 2.416573971078977, "grad_norm": 8.703125, "learning_rate": 7.583426028921024e-06, "loss": 2.9385, "mean_token_accuracy": 0.45631805520384905, "step": 13035 }, { "epoch": 2.4167593622543566, "grad_norm": 13.3828125, "learning_rate": 7.5832406377456435e-06, "loss": 2.9345, "mean_token_accuracy": 0.48874874587931777, "step": 13036 }, { "epoch": 2.416944753429737, "grad_norm": 7.3515625, "learning_rate": 7.583055246570263e-06, "loss": 2.2558, "mean_token_accuracy": 0.5548808538043978, "step": 13037 }, { "epoch": 2.4171301446051165, "grad_norm": 6.8125, "learning_rate": 7.5828698553948845e-06, "loss": 2.9617, "mean_token_accuracy": 0.446090655713657, "step": 13038 }, { "epoch": 2.4173155357804967, "grad_norm": 10.1484375, "learning_rate": 7.582684464219504e-06, "loss": 2.6885, "mean_token_accuracy": 0.5009214273747696, "step": 13039 }, { "epoch": 2.417500926955877, "grad_norm": 7.39453125, "learning_rate": 7.582499073044124e-06, "loss": 3.102, "mean_token_accuracy": 0.4498532942117898, "step": 13040 }, { "epoch": 2.417686318131257, "grad_norm": 9.296875, "learning_rate": 7.5823136818687435e-06, "loss": 2.6843, "mean_token_accuracy": 0.5035778175313059, "step": 13041 }, { "epoch": 2.417871709306637, "grad_norm": 9.8515625, "learning_rate": 7.582128290693364e-06, "loss": 3.3557, "mean_token_accuracy": 0.45001960015680126, "step": 13042 }, { "epoch": 2.418057100482017, "grad_norm": 7.734375, "learning_rate": 7.581942899517984e-06, "loss": 3.454, "mean_token_accuracy": 0.4577763286599283, "step": 13043 }, { "epoch": 2.4182424916573972, "grad_norm": 8.1875, "learning_rate": 7.581757508342603e-06, "loss": 2.7699, "mean_token_accuracy": 0.48417108153508204, "step": 13044 }, { "epoch": 2.418427882832777, "grad_norm": 7.9140625, "learning_rate": 7.581572117167223e-06, "loss": 2.6982, "mean_token_accuracy": 0.5237360092628329, "step": 13045 }, { "epoch": 2.418613274008157, "grad_norm": 8.1328125, "learning_rate": 7.581386725991844e-06, "loss": 3.332, "mean_token_accuracy": 0.43684527393136663, "step": 13046 }, { "epoch": 2.4187986651835374, "grad_norm": 7.3359375, "learning_rate": 7.581201334816464e-06, "loss": 2.7519, "mean_token_accuracy": 0.4999185800358248, "step": 13047 }, { "epoch": 2.4189840563589176, "grad_norm": 7.01171875, "learning_rate": 7.5810159436410835e-06, "loss": 2.9073, "mean_token_accuracy": 0.47575360419397117, "step": 13048 }, { "epoch": 2.4191694475342973, "grad_norm": 7.390625, "learning_rate": 7.580830552465703e-06, "loss": 3.1208, "mean_token_accuracy": 0.4566551377886868, "step": 13049 }, { "epoch": 2.4193548387096775, "grad_norm": 6.8515625, "learning_rate": 7.580645161290323e-06, "loss": 2.5623, "mean_token_accuracy": 0.5099206349206349, "step": 13050 }, { "epoch": 2.4195402298850572, "grad_norm": 8.265625, "learning_rate": 7.580459770114943e-06, "loss": 2.7889, "mean_token_accuracy": 0.49156305506216696, "step": 13051 }, { "epoch": 2.4197256210604374, "grad_norm": 7.05078125, "learning_rate": 7.580274378939563e-06, "loss": 3.2181, "mean_token_accuracy": 0.44080923028291447, "step": 13052 }, { "epoch": 2.4199110122358176, "grad_norm": 8.375, "learning_rate": 7.5800889877641826e-06, "loss": 2.9596, "mean_token_accuracy": 0.47565402428338965, "step": 13053 }, { "epoch": 2.420096403411198, "grad_norm": 7.16015625, "learning_rate": 7.579903596588803e-06, "loss": 2.6662, "mean_token_accuracy": 0.4963212370619778, "step": 13054 }, { "epoch": 2.4202817945865776, "grad_norm": 5.890625, "learning_rate": 7.5797182054134236e-06, "loss": 2.2758, "mean_token_accuracy": 0.5249330442545593, "step": 13055 }, { "epoch": 2.4204671857619577, "grad_norm": 6.73828125, "learning_rate": 7.579532814238043e-06, "loss": 2.911, "mean_token_accuracy": 0.46710080043413377, "step": 13056 }, { "epoch": 2.420652576937338, "grad_norm": 6.921875, "learning_rate": 7.579347423062663e-06, "loss": 2.1998, "mean_token_accuracy": 0.5659726499938401, "step": 13057 }, { "epoch": 2.4208379681127177, "grad_norm": 7.47265625, "learning_rate": 7.5791620318872825e-06, "loss": 2.9534, "mean_token_accuracy": 0.47557328015952144, "step": 13058 }, { "epoch": 2.421023359288098, "grad_norm": 9.0390625, "learning_rate": 7.578976640711902e-06, "loss": 2.5074, "mean_token_accuracy": 0.5118297872340426, "step": 13059 }, { "epoch": 2.421208750463478, "grad_norm": 6.98828125, "learning_rate": 7.578791249536523e-06, "loss": 3.4705, "mean_token_accuracy": 0.4349939246658566, "step": 13060 }, { "epoch": 2.421394141638858, "grad_norm": 9.15625, "learning_rate": 7.578605858361142e-06, "loss": 3.0099, "mean_token_accuracy": 0.46039787470653654, "step": 13061 }, { "epoch": 2.421579532814238, "grad_norm": 7.35546875, "learning_rate": 7.578420467185763e-06, "loss": 3.0953, "mean_token_accuracy": 0.4615287428932407, "step": 13062 }, { "epoch": 2.421764923989618, "grad_norm": 6.15234375, "learning_rate": 7.578235076010383e-06, "loss": 3.1408, "mean_token_accuracy": 0.4482036972445064, "step": 13063 }, { "epoch": 2.421950315164998, "grad_norm": 7.1953125, "learning_rate": 7.578049684835003e-06, "loss": 3.2398, "mean_token_accuracy": 0.45486547648981646, "step": 13064 }, { "epoch": 2.422135706340378, "grad_norm": 8.6484375, "learning_rate": 7.5778642936596225e-06, "loss": 3.1131, "mean_token_accuracy": 0.4656204282484373, "step": 13065 }, { "epoch": 2.4223210975157583, "grad_norm": 8.09375, "learning_rate": 7.577678902484242e-06, "loss": 2.2597, "mean_token_accuracy": 0.5251735154786824, "step": 13066 }, { "epoch": 2.4225064886911385, "grad_norm": 9.5703125, "learning_rate": 7.577493511308862e-06, "loss": 2.5484, "mean_token_accuracy": 0.49126186872161826, "step": 13067 }, { "epoch": 2.4226918798665182, "grad_norm": 8.5, "learning_rate": 7.577308120133482e-06, "loss": 3.1984, "mean_token_accuracy": 0.4782842399483982, "step": 13068 }, { "epoch": 2.4228772710418984, "grad_norm": 8.6484375, "learning_rate": 7.577122728958102e-06, "loss": 2.6909, "mean_token_accuracy": 0.48757894736842106, "step": 13069 }, { "epoch": 2.4230626622172786, "grad_norm": 7.35546875, "learning_rate": 7.5769373377827225e-06, "loss": 2.9344, "mean_token_accuracy": 0.4784245583550536, "step": 13070 }, { "epoch": 2.4232480533926584, "grad_norm": 6.71484375, "learning_rate": 7.576751946607342e-06, "loss": 2.5882, "mean_token_accuracy": 0.5107466063348416, "step": 13071 }, { "epoch": 2.4234334445680386, "grad_norm": 8.390625, "learning_rate": 7.576566555431963e-06, "loss": 2.8166, "mean_token_accuracy": 0.476245924545878, "step": 13072 }, { "epoch": 2.4236188357434187, "grad_norm": 7.96875, "learning_rate": 7.576381164256582e-06, "loss": 3.8705, "mean_token_accuracy": 0.43874099132225325, "step": 13073 }, { "epoch": 2.4238042269187985, "grad_norm": 7.625, "learning_rate": 7.576195773081202e-06, "loss": 2.9153, "mean_token_accuracy": 0.45453053184504266, "step": 13074 }, { "epoch": 2.4239896180941787, "grad_norm": 6.67578125, "learning_rate": 7.5760103819058215e-06, "loss": 2.7217, "mean_token_accuracy": 0.49364567526555386, "step": 13075 }, { "epoch": 2.424175009269559, "grad_norm": 6.09765625, "learning_rate": 7.575824990730441e-06, "loss": 2.6107, "mean_token_accuracy": 0.4810864442515193, "step": 13076 }, { "epoch": 2.4243604004449386, "grad_norm": 7.30078125, "learning_rate": 7.575639599555062e-06, "loss": 2.9492, "mean_token_accuracy": 0.4635499207606973, "step": 13077 }, { "epoch": 2.424545791620319, "grad_norm": 7.91796875, "learning_rate": 7.575454208379682e-06, "loss": 2.4701, "mean_token_accuracy": 0.5179214049229883, "step": 13078 }, { "epoch": 2.424731182795699, "grad_norm": 7.9375, "learning_rate": 7.575268817204302e-06, "loss": 3.4104, "mean_token_accuracy": 0.4479741553371921, "step": 13079 }, { "epoch": 2.424916573971079, "grad_norm": 9.03125, "learning_rate": 7.575083426028922e-06, "loss": 3.0889, "mean_token_accuracy": 0.4466310439202398, "step": 13080 }, { "epoch": 2.425101965146459, "grad_norm": 6.51953125, "learning_rate": 7.574898034853542e-06, "loss": 2.6558, "mean_token_accuracy": 0.5078426640926641, "step": 13081 }, { "epoch": 2.425287356321839, "grad_norm": 7.1484375, "learning_rate": 7.5747126436781616e-06, "loss": 2.8147, "mean_token_accuracy": 0.47686557546075525, "step": 13082 }, { "epoch": 2.4254727474972193, "grad_norm": 7.3125, "learning_rate": 7.574527252502781e-06, "loss": 3.0744, "mean_token_accuracy": 0.4490125332320547, "step": 13083 }, { "epoch": 2.425658138672599, "grad_norm": 7.859375, "learning_rate": 7.574341861327401e-06, "loss": 2.7617, "mean_token_accuracy": 0.5034332799267567, "step": 13084 }, { "epoch": 2.4258435298479792, "grad_norm": 6.921875, "learning_rate": 7.574156470152021e-06, "loss": 2.744, "mean_token_accuracy": 0.48842372343799556, "step": 13085 }, { "epoch": 2.4260289210233594, "grad_norm": 7.390625, "learning_rate": 7.573971078976642e-06, "loss": 3.0766, "mean_token_accuracy": 0.44488603156049095, "step": 13086 }, { "epoch": 2.426214312198739, "grad_norm": 8.3671875, "learning_rate": 7.5737856878012615e-06, "loss": 2.8811, "mean_token_accuracy": 0.48511511951899106, "step": 13087 }, { "epoch": 2.4263997033741194, "grad_norm": 6.96484375, "learning_rate": 7.573600296625881e-06, "loss": 2.7444, "mean_token_accuracy": 0.48018333782690753, "step": 13088 }, { "epoch": 2.4265850945494996, "grad_norm": 7.16796875, "learning_rate": 7.573414905450502e-06, "loss": 3.3839, "mean_token_accuracy": 0.42384823848238484, "step": 13089 }, { "epoch": 2.4267704857248793, "grad_norm": 7.94140625, "learning_rate": 7.573229514275121e-06, "loss": 2.871, "mean_token_accuracy": 0.47692557686057846, "step": 13090 }, { "epoch": 2.4269558769002595, "grad_norm": 7.9921875, "learning_rate": 7.573044123099741e-06, "loss": 2.8172, "mean_token_accuracy": 0.4931452149607347, "step": 13091 }, { "epoch": 2.4271412680756397, "grad_norm": 7.38671875, "learning_rate": 7.5728587319243606e-06, "loss": 2.9111, "mean_token_accuracy": 0.4990161949447556, "step": 13092 }, { "epoch": 2.42732665925102, "grad_norm": 6.9765625, "learning_rate": 7.57267334074898e-06, "loss": 3.4719, "mean_token_accuracy": 0.437948865268601, "step": 13093 }, { "epoch": 2.4275120504263996, "grad_norm": 7.58984375, "learning_rate": 7.5724879495736015e-06, "loss": 3.1812, "mean_token_accuracy": 0.45845523698069046, "step": 13094 }, { "epoch": 2.42769744160178, "grad_norm": 7.68359375, "learning_rate": 7.572302558398221e-06, "loss": 2.9154, "mean_token_accuracy": 0.4712913053667682, "step": 13095 }, { "epoch": 2.42788283277716, "grad_norm": 6.484375, "learning_rate": 7.572117167222841e-06, "loss": 2.404, "mean_token_accuracy": 0.5299557176348008, "step": 13096 }, { "epoch": 2.4280682239525397, "grad_norm": 7.4921875, "learning_rate": 7.5719317760474605e-06, "loss": 2.5105, "mean_token_accuracy": 0.5284144059869037, "step": 13097 }, { "epoch": 2.42825361512792, "grad_norm": 8.3125, "learning_rate": 7.571746384872081e-06, "loss": 2.716, "mean_token_accuracy": 0.4972862957937585, "step": 13098 }, { "epoch": 2.4284390063033, "grad_norm": 8.671875, "learning_rate": 7.571560993696701e-06, "loss": 2.8334, "mean_token_accuracy": 0.4999349720379763, "step": 13099 }, { "epoch": 2.42862439747868, "grad_norm": 8.9609375, "learning_rate": 7.57137560252132e-06, "loss": 2.4092, "mean_token_accuracy": 0.5411827526364972, "step": 13100 }, { "epoch": 2.42880978865406, "grad_norm": 7.7109375, "learning_rate": 7.57119021134594e-06, "loss": 3.3386, "mean_token_accuracy": 0.43820224719101125, "step": 13101 }, { "epoch": 2.4289951798294402, "grad_norm": 7.03515625, "learning_rate": 7.571004820170561e-06, "loss": 2.6384, "mean_token_accuracy": 0.49078040603464335, "step": 13102 }, { "epoch": 2.42918057100482, "grad_norm": 6.97265625, "learning_rate": 7.570819428995181e-06, "loss": 3.2631, "mean_token_accuracy": 0.4605446485117163, "step": 13103 }, { "epoch": 2.4293659621802, "grad_norm": 8.34375, "learning_rate": 7.5706340378198005e-06, "loss": 2.6038, "mean_token_accuracy": 0.4954159026504417, "step": 13104 }, { "epoch": 2.4295513533555804, "grad_norm": 8.015625, "learning_rate": 7.57044864664442e-06, "loss": 3.1927, "mean_token_accuracy": 0.4530847540782534, "step": 13105 }, { "epoch": 2.4297367445309606, "grad_norm": 10.0390625, "learning_rate": 7.570263255469041e-06, "loss": 3.1332, "mean_token_accuracy": 0.483695652173913, "step": 13106 }, { "epoch": 2.4299221357063403, "grad_norm": 7.8125, "learning_rate": 7.57007786429366e-06, "loss": 2.3271, "mean_token_accuracy": 0.5253504672897197, "step": 13107 }, { "epoch": 2.4301075268817205, "grad_norm": 7.7421875, "learning_rate": 7.56989247311828e-06, "loss": 2.9811, "mean_token_accuracy": 0.46444565811032223, "step": 13108 }, { "epoch": 2.4302929180571002, "grad_norm": 8.1953125, "learning_rate": 7.5697070819429e-06, "loss": 3.1979, "mean_token_accuracy": 0.4508400292184076, "step": 13109 }, { "epoch": 2.4304783092324804, "grad_norm": 8.2109375, "learning_rate": 7.569521690767521e-06, "loss": 2.8162, "mean_token_accuracy": 0.46913405848283657, "step": 13110 }, { "epoch": 2.4306637004078606, "grad_norm": 8.3515625, "learning_rate": 7.5693362995921406e-06, "loss": 2.5909, "mean_token_accuracy": 0.4937243852459016, "step": 13111 }, { "epoch": 2.430849091583241, "grad_norm": 7.42578125, "learning_rate": 7.56915090841676e-06, "loss": 2.9309, "mean_token_accuracy": 0.49145299145299143, "step": 13112 }, { "epoch": 2.4310344827586206, "grad_norm": 8.421875, "learning_rate": 7.56896551724138e-06, "loss": 3.6261, "mean_token_accuracy": 0.41516421401422865, "step": 13113 }, { "epoch": 2.4312198739340007, "grad_norm": 9.1953125, "learning_rate": 7.5687801260659995e-06, "loss": 2.8108, "mean_token_accuracy": 0.46534925209276795, "step": 13114 }, { "epoch": 2.431405265109381, "grad_norm": 7.8203125, "learning_rate": 7.56859473489062e-06, "loss": 2.8305, "mean_token_accuracy": 0.4826048171275647, "step": 13115 }, { "epoch": 2.4315906562847607, "grad_norm": 8.3203125, "learning_rate": 7.56840934371524e-06, "loss": 3.3202, "mean_token_accuracy": 0.4589609255621043, "step": 13116 }, { "epoch": 2.431776047460141, "grad_norm": 9.046875, "learning_rate": 7.568223952539859e-06, "loss": 3.3454, "mean_token_accuracy": 0.4475333257377236, "step": 13117 }, { "epoch": 2.431961438635521, "grad_norm": 10.5, "learning_rate": 7.568038561364479e-06, "loss": 3.0209, "mean_token_accuracy": 0.45747316267547483, "step": 13118 }, { "epoch": 2.4321468298109012, "grad_norm": 7.3515625, "learning_rate": 7.5678531701891e-06, "loss": 2.7378, "mean_token_accuracy": 0.49765896390273373, "step": 13119 }, { "epoch": 2.432332220986281, "grad_norm": 9.40625, "learning_rate": 7.56766777901372e-06, "loss": 2.7954, "mean_token_accuracy": 0.5068163592622293, "step": 13120 }, { "epoch": 2.432517612161661, "grad_norm": 10.8046875, "learning_rate": 7.5674823878383396e-06, "loss": 2.966, "mean_token_accuracy": 0.4535175879396985, "step": 13121 }, { "epoch": 2.432703003337041, "grad_norm": 8.515625, "learning_rate": 7.567296996662959e-06, "loss": 2.1596, "mean_token_accuracy": 0.5152173913043478, "step": 13122 }, { "epoch": 2.432888394512421, "grad_norm": 6.38671875, "learning_rate": 7.56711160548758e-06, "loss": 2.462, "mean_token_accuracy": 0.5202634245187436, "step": 13123 }, { "epoch": 2.4330737856878013, "grad_norm": 8.40625, "learning_rate": 7.566926214312199e-06, "loss": 2.0985, "mean_token_accuracy": 0.5561284274666042, "step": 13124 }, { "epoch": 2.4332591768631815, "grad_norm": 6.43359375, "learning_rate": 7.566740823136819e-06, "loss": 2.7968, "mean_token_accuracy": 0.45245036790226295, "step": 13125 }, { "epoch": 2.4334445680385612, "grad_norm": 7.78125, "learning_rate": 7.566555431961439e-06, "loss": 2.8199, "mean_token_accuracy": 0.4965419901199718, "step": 13126 }, { "epoch": 2.4336299592139414, "grad_norm": 6.6328125, "learning_rate": 7.56637004078606e-06, "loss": 3.0973, "mean_token_accuracy": 0.45958254269449716, "step": 13127 }, { "epoch": 2.4338153503893216, "grad_norm": 6.9375, "learning_rate": 7.56618464961068e-06, "loss": 2.3314, "mean_token_accuracy": 0.531118747613593, "step": 13128 }, { "epoch": 2.4340007415647014, "grad_norm": 7.95703125, "learning_rate": 7.565999258435299e-06, "loss": 3.4013, "mean_token_accuracy": 0.4623850835167681, "step": 13129 }, { "epoch": 2.4341861327400816, "grad_norm": 7.734375, "learning_rate": 7.565813867259919e-06, "loss": 3.0837, "mean_token_accuracy": 0.4850006437491953, "step": 13130 }, { "epoch": 2.4343715239154617, "grad_norm": 7.06640625, "learning_rate": 7.5656284760845385e-06, "loss": 3.041, "mean_token_accuracy": 0.4571510461450272, "step": 13131 }, { "epoch": 2.4345569150908415, "grad_norm": 8.6796875, "learning_rate": 7.565443084909159e-06, "loss": 2.8106, "mean_token_accuracy": 0.4821132075471698, "step": 13132 }, { "epoch": 2.4347423062662217, "grad_norm": 7.1640625, "learning_rate": 7.565257693733779e-06, "loss": 3.0489, "mean_token_accuracy": 0.4772200772200772, "step": 13133 }, { "epoch": 2.434927697441602, "grad_norm": 7.46484375, "learning_rate": 7.565072302558398e-06, "loss": 2.8196, "mean_token_accuracy": 0.48959795788130184, "step": 13134 }, { "epoch": 2.4351130886169816, "grad_norm": 8.6328125, "learning_rate": 7.564886911383019e-06, "loss": 2.9486, "mean_token_accuracy": 0.49554773244978256, "step": 13135 }, { "epoch": 2.435298479792362, "grad_norm": 6.37890625, "learning_rate": 7.564701520207639e-06, "loss": 2.8255, "mean_token_accuracy": 0.4834971025447216, "step": 13136 }, { "epoch": 2.435483870967742, "grad_norm": 7.671875, "learning_rate": 7.564516129032259e-06, "loss": 3.2447, "mean_token_accuracy": 0.44636429085673146, "step": 13137 }, { "epoch": 2.435669262143122, "grad_norm": 11.5859375, "learning_rate": 7.564330737856879e-06, "loss": 2.5231, "mean_token_accuracy": 0.5014048890137679, "step": 13138 }, { "epoch": 2.435854653318502, "grad_norm": 6.16015625, "learning_rate": 7.564145346681498e-06, "loss": 2.6765, "mean_token_accuracy": 0.49656235182550973, "step": 13139 }, { "epoch": 2.436040044493882, "grad_norm": 7.7421875, "learning_rate": 7.563959955506118e-06, "loss": 3.1053, "mean_token_accuracy": 0.46008073559093965, "step": 13140 }, { "epoch": 2.4362254356692623, "grad_norm": 7.04296875, "learning_rate": 7.563774564330738e-06, "loss": 2.924, "mean_token_accuracy": 0.4785336449747816, "step": 13141 }, { "epoch": 2.436410826844642, "grad_norm": 8.2734375, "learning_rate": 7.563589173155358e-06, "loss": 3.3711, "mean_token_accuracy": 0.4209643605870021, "step": 13142 }, { "epoch": 2.4365962180200222, "grad_norm": 9.8515625, "learning_rate": 7.5634037819799785e-06, "loss": 3.2916, "mean_token_accuracy": 0.4442758781654802, "step": 13143 }, { "epoch": 2.4367816091954024, "grad_norm": 10.609375, "learning_rate": 7.563218390804599e-06, "loss": 2.7056, "mean_token_accuracy": 0.4814107461166285, "step": 13144 }, { "epoch": 2.436967000370782, "grad_norm": 7.7734375, "learning_rate": 7.563032999629219e-06, "loss": 3.1668, "mean_token_accuracy": 0.5013934633899164, "step": 13145 }, { "epoch": 2.4371523915461624, "grad_norm": 9.8828125, "learning_rate": 7.562847608453838e-06, "loss": 3.2758, "mean_token_accuracy": 0.44922308546059936, "step": 13146 }, { "epoch": 2.4373377827215426, "grad_norm": 12.6015625, "learning_rate": 7.562662217278458e-06, "loss": 2.7055, "mean_token_accuracy": 0.4899139149428241, "step": 13147 }, { "epoch": 2.4375231738969223, "grad_norm": 7.80859375, "learning_rate": 7.562476826103078e-06, "loss": 3.1387, "mean_token_accuracy": 0.4623699363572078, "step": 13148 }, { "epoch": 2.4377085650723025, "grad_norm": 7.6171875, "learning_rate": 7.562291434927698e-06, "loss": 3.2907, "mean_token_accuracy": 0.4459366391184573, "step": 13149 }, { "epoch": 2.4378939562476827, "grad_norm": 8.328125, "learning_rate": 7.562106043752318e-06, "loss": 3.6717, "mean_token_accuracy": 0.45035987661373245, "step": 13150 }, { "epoch": 2.438079347423063, "grad_norm": 7.78125, "learning_rate": 7.561920652576938e-06, "loss": 3.189, "mean_token_accuracy": 0.462581905966203, "step": 13151 }, { "epoch": 2.4382647385984426, "grad_norm": 7.66796875, "learning_rate": 7.561735261401558e-06, "loss": 2.9878, "mean_token_accuracy": 0.47948260481712757, "step": 13152 }, { "epoch": 2.438450129773823, "grad_norm": 10.3984375, "learning_rate": 7.561549870226178e-06, "loss": 2.6643, "mean_token_accuracy": 0.5160837375542507, "step": 13153 }, { "epoch": 2.438635520949203, "grad_norm": 7.42578125, "learning_rate": 7.561364479050798e-06, "loss": 3.1829, "mean_token_accuracy": 0.45798155604623975, "step": 13154 }, { "epoch": 2.4388209121245827, "grad_norm": 7.98046875, "learning_rate": 7.561179087875418e-06, "loss": 2.7662, "mean_token_accuracy": 0.5057188669160877, "step": 13155 }, { "epoch": 2.439006303299963, "grad_norm": 11.1640625, "learning_rate": 7.560993696700037e-06, "loss": 2.9334, "mean_token_accuracy": 0.4752008382815229, "step": 13156 }, { "epoch": 2.439191694475343, "grad_norm": 6.625, "learning_rate": 7.560808305524657e-06, "loss": 2.4537, "mean_token_accuracy": 0.526532185532571, "step": 13157 }, { "epoch": 2.439377085650723, "grad_norm": 7.29296875, "learning_rate": 7.560622914349277e-06, "loss": 3.5148, "mean_token_accuracy": 0.43953318745441283, "step": 13158 }, { "epoch": 2.439562476826103, "grad_norm": 11.21875, "learning_rate": 7.560437523173898e-06, "loss": 2.8551, "mean_token_accuracy": 0.49088207483452656, "step": 13159 }, { "epoch": 2.4397478680014832, "grad_norm": 7.64453125, "learning_rate": 7.5602521319985175e-06, "loss": 3.7279, "mean_token_accuracy": 0.4224299065420561, "step": 13160 }, { "epoch": 2.439933259176863, "grad_norm": 6.9921875, "learning_rate": 7.560066740823138e-06, "loss": 3.1398, "mean_token_accuracy": 0.44332998996990974, "step": 13161 }, { "epoch": 2.440118650352243, "grad_norm": 8.1328125, "learning_rate": 7.559881349647758e-06, "loss": 2.8059, "mean_token_accuracy": 0.48989350141273635, "step": 13162 }, { "epoch": 2.4403040415276234, "grad_norm": 7.1484375, "learning_rate": 7.559695958472377e-06, "loss": 2.8894, "mean_token_accuracy": 0.4716376944190302, "step": 13163 }, { "epoch": 2.4404894327030036, "grad_norm": 9.6640625, "learning_rate": 7.559510567296997e-06, "loss": 4.1782, "mean_token_accuracy": 0.4034749034749035, "step": 13164 }, { "epoch": 2.4406748238783833, "grad_norm": 7.12109375, "learning_rate": 7.559325176121617e-06, "loss": 2.4011, "mean_token_accuracy": 0.5283325012481278, "step": 13165 }, { "epoch": 2.4408602150537635, "grad_norm": 7.625, "learning_rate": 7.559139784946237e-06, "loss": 3.3419, "mean_token_accuracy": 0.45691980127750176, "step": 13166 }, { "epoch": 2.4410456062291437, "grad_norm": 6.4140625, "learning_rate": 7.558954393770858e-06, "loss": 2.8391, "mean_token_accuracy": 0.47615885243724265, "step": 13167 }, { "epoch": 2.4412309974045234, "grad_norm": 6.578125, "learning_rate": 7.558769002595477e-06, "loss": 3.0291, "mean_token_accuracy": 0.47982690594374444, "step": 13168 }, { "epoch": 2.4414163885799036, "grad_norm": 7.484375, "learning_rate": 7.558583611420097e-06, "loss": 3.0209, "mean_token_accuracy": 0.49640490624559425, "step": 13169 }, { "epoch": 2.441601779755284, "grad_norm": 6.8515625, "learning_rate": 7.558398220244717e-06, "loss": 2.6576, "mean_token_accuracy": 0.512249443207127, "step": 13170 }, { "epoch": 2.4417871709306636, "grad_norm": 6.765625, "learning_rate": 7.558212829069337e-06, "loss": 2.3558, "mean_token_accuracy": 0.5250787224471435, "step": 13171 }, { "epoch": 2.4419725621060437, "grad_norm": 13.5390625, "learning_rate": 7.558027437893957e-06, "loss": 3.4024, "mean_token_accuracy": 0.4574479559918024, "step": 13172 }, { "epoch": 2.442157953281424, "grad_norm": 9.2734375, "learning_rate": 7.557842046718576e-06, "loss": 2.6378, "mean_token_accuracy": 0.5078770862579941, "step": 13173 }, { "epoch": 2.4423433444568037, "grad_norm": 8.3515625, "learning_rate": 7.557656655543196e-06, "loss": 2.8637, "mean_token_accuracy": 0.4762852793067104, "step": 13174 }, { "epoch": 2.442528735632184, "grad_norm": 7.46875, "learning_rate": 7.557471264367817e-06, "loss": 3.0389, "mean_token_accuracy": 0.44976974245266926, "step": 13175 }, { "epoch": 2.442714126807564, "grad_norm": 8.828125, "learning_rate": 7.557285873192437e-06, "loss": 3.3449, "mean_token_accuracy": 0.47508600599267564, "step": 13176 }, { "epoch": 2.4428995179829442, "grad_norm": 8.2890625, "learning_rate": 7.557100482017057e-06, "loss": 2.9713, "mean_token_accuracy": 0.45991253644314867, "step": 13177 }, { "epoch": 2.443084909158324, "grad_norm": 6.78125, "learning_rate": 7.556915090841676e-06, "loss": 2.4336, "mean_token_accuracy": 0.5197181720324885, "step": 13178 }, { "epoch": 2.443270300333704, "grad_norm": 7.3828125, "learning_rate": 7.556729699666297e-06, "loss": 2.8244, "mean_token_accuracy": 0.5099741844637409, "step": 13179 }, { "epoch": 2.443455691509084, "grad_norm": 7.16796875, "learning_rate": 7.556544308490916e-06, "loss": 3.0526, "mean_token_accuracy": 0.4602693602693603, "step": 13180 }, { "epoch": 2.443641082684464, "grad_norm": 6.52734375, "learning_rate": 7.556358917315536e-06, "loss": 3.0262, "mean_token_accuracy": 0.46614613681389117, "step": 13181 }, { "epoch": 2.4438264738598443, "grad_norm": 9.7109375, "learning_rate": 7.556173526140156e-06, "loss": 2.6327, "mean_token_accuracy": 0.4994246260069045, "step": 13182 }, { "epoch": 2.4440118650352245, "grad_norm": 7.92578125, "learning_rate": 7.555988134964777e-06, "loss": 2.8908, "mean_token_accuracy": 0.4602808865481677, "step": 13183 }, { "epoch": 2.4441972562106042, "grad_norm": 9.109375, "learning_rate": 7.555802743789397e-06, "loss": 2.9989, "mean_token_accuracy": 0.4976043805612594, "step": 13184 }, { "epoch": 2.4443826473859844, "grad_norm": 6.984375, "learning_rate": 7.555617352614016e-06, "loss": 3.2671, "mean_token_accuracy": 0.44524959742351045, "step": 13185 }, { "epoch": 2.4445680385613646, "grad_norm": 7.4609375, "learning_rate": 7.555431961438636e-06, "loss": 3.231, "mean_token_accuracy": 0.4870578669196368, "step": 13186 }, { "epoch": 2.4447534297367444, "grad_norm": 7.73828125, "learning_rate": 7.555246570263256e-06, "loss": 4.489, "mean_token_accuracy": 0.3473892274748806, "step": 13187 }, { "epoch": 2.4449388209121246, "grad_norm": 6.5078125, "learning_rate": 7.555061179087876e-06, "loss": 2.8399, "mean_token_accuracy": 0.46882640586797064, "step": 13188 }, { "epoch": 2.4451242120875047, "grad_norm": 6.4453125, "learning_rate": 7.554875787912496e-06, "loss": 2.8271, "mean_token_accuracy": 0.5100322991093276, "step": 13189 }, { "epoch": 2.445309603262885, "grad_norm": 8.21875, "learning_rate": 7.554690396737115e-06, "loss": 4.06, "mean_token_accuracy": 0.41890744409849984, "step": 13190 }, { "epoch": 2.4454949944382647, "grad_norm": 7.86328125, "learning_rate": 7.554505005561737e-06, "loss": 3.1707, "mean_token_accuracy": 0.4500276090557703, "step": 13191 }, { "epoch": 2.445680385613645, "grad_norm": 6.96484375, "learning_rate": 7.554319614386356e-06, "loss": 2.8706, "mean_token_accuracy": 0.488702201622248, "step": 13192 }, { "epoch": 2.4458657767890246, "grad_norm": 7.03515625, "learning_rate": 7.554134223210976e-06, "loss": 2.7088, "mean_token_accuracy": 0.4857250660294303, "step": 13193 }, { "epoch": 2.446051167964405, "grad_norm": 6.5859375, "learning_rate": 7.553948832035596e-06, "loss": 2.7485, "mean_token_accuracy": 0.49779179810725555, "step": 13194 }, { "epoch": 2.446236559139785, "grad_norm": 7.859375, "learning_rate": 7.553763440860215e-06, "loss": 3.3263, "mean_token_accuracy": 0.4362745098039216, "step": 13195 }, { "epoch": 2.446421950315165, "grad_norm": 7.12109375, "learning_rate": 7.553578049684836e-06, "loss": 2.5866, "mean_token_accuracy": 0.49486747357131916, "step": 13196 }, { "epoch": 2.446607341490545, "grad_norm": 6.63671875, "learning_rate": 7.553392658509455e-06, "loss": 2.9705, "mean_token_accuracy": 0.4706743080112181, "step": 13197 }, { "epoch": 2.446792732665925, "grad_norm": 6.0859375, "learning_rate": 7.553207267334075e-06, "loss": 3.3024, "mean_token_accuracy": 0.44313536907910556, "step": 13198 }, { "epoch": 2.4469781238413053, "grad_norm": 7.0390625, "learning_rate": 7.553021876158696e-06, "loss": 3.6083, "mean_token_accuracy": 0.43163824604141293, "step": 13199 }, { "epoch": 2.447163515016685, "grad_norm": 7.64453125, "learning_rate": 7.552836484983316e-06, "loss": 2.9616, "mean_token_accuracy": 0.47360084477296727, "step": 13200 }, { "epoch": 2.4473489061920652, "grad_norm": 6.11328125, "learning_rate": 7.552651093807936e-06, "loss": 2.6068, "mean_token_accuracy": 0.47780517879161527, "step": 13201 }, { "epoch": 2.4475342973674454, "grad_norm": 6.859375, "learning_rate": 7.552465702632555e-06, "loss": 2.6885, "mean_token_accuracy": 0.4832381788903528, "step": 13202 }, { "epoch": 2.447719688542825, "grad_norm": 6.27734375, "learning_rate": 7.552280311457175e-06, "loss": 3.0123, "mean_token_accuracy": 0.4588390501319261, "step": 13203 }, { "epoch": 2.4479050797182054, "grad_norm": 7.07421875, "learning_rate": 7.5520949202817954e-06, "loss": 3.011, "mean_token_accuracy": 0.46469651389134664, "step": 13204 }, { "epoch": 2.4480904708935856, "grad_norm": 8.265625, "learning_rate": 7.551909529106415e-06, "loss": 2.5261, "mean_token_accuracy": 0.4989381282741045, "step": 13205 }, { "epoch": 2.4482758620689653, "grad_norm": 6.48828125, "learning_rate": 7.551724137931035e-06, "loss": 2.9973, "mean_token_accuracy": 0.47969483568075116, "step": 13206 }, { "epoch": 2.4484612532443455, "grad_norm": 7.26171875, "learning_rate": 7.551538746755655e-06, "loss": 2.6401, "mean_token_accuracy": 0.4899672131147541, "step": 13207 }, { "epoch": 2.4486466444197257, "grad_norm": 10.7109375, "learning_rate": 7.551353355580276e-06, "loss": 2.3393, "mean_token_accuracy": 0.5124252491694352, "step": 13208 }, { "epoch": 2.448832035595106, "grad_norm": 7.265625, "learning_rate": 7.551167964404895e-06, "loss": 3.3117, "mean_token_accuracy": 0.47304881923117653, "step": 13209 }, { "epoch": 2.4490174267704856, "grad_norm": 7.19140625, "learning_rate": 7.550982573229515e-06, "loss": 3.0107, "mean_token_accuracy": 0.4800339847068819, "step": 13210 }, { "epoch": 2.449202817945866, "grad_norm": 6.875, "learning_rate": 7.550797182054135e-06, "loss": 2.704, "mean_token_accuracy": 0.48489405331510593, "step": 13211 }, { "epoch": 2.449388209121246, "grad_norm": 7.734375, "learning_rate": 7.550611790878754e-06, "loss": 2.9996, "mean_token_accuracy": 0.4691937276206939, "step": 13212 }, { "epoch": 2.4495736002966257, "grad_norm": 7.3046875, "learning_rate": 7.550426399703375e-06, "loss": 2.7962, "mean_token_accuracy": 0.5191304347826087, "step": 13213 }, { "epoch": 2.449758991472006, "grad_norm": 7.56640625, "learning_rate": 7.5502410085279944e-06, "loss": 2.6208, "mean_token_accuracy": 0.5085951393005335, "step": 13214 }, { "epoch": 2.449944382647386, "grad_norm": 6.8359375, "learning_rate": 7.550055617352615e-06, "loss": 2.6724, "mean_token_accuracy": 0.4931972789115646, "step": 13215 }, { "epoch": 2.450129773822766, "grad_norm": 6.69921875, "learning_rate": 7.5498702261772346e-06, "loss": 2.8974, "mean_token_accuracy": 0.4801796221740477, "step": 13216 }, { "epoch": 2.450315164998146, "grad_norm": 6.66015625, "learning_rate": 7.549684835001855e-06, "loss": 3.2697, "mean_token_accuracy": 0.4374600468783294, "step": 13217 }, { "epoch": 2.4505005561735262, "grad_norm": 7.7578125, "learning_rate": 7.549499443826475e-06, "loss": 2.5174, "mean_token_accuracy": 0.50613774389456, "step": 13218 }, { "epoch": 2.450685947348906, "grad_norm": 7.3671875, "learning_rate": 7.549314052651094e-06, "loss": 2.6881, "mean_token_accuracy": 0.5040331873703618, "step": 13219 }, { "epoch": 2.450871338524286, "grad_norm": 7.0703125, "learning_rate": 7.549128661475714e-06, "loss": 3.4583, "mean_token_accuracy": 0.42150170648464164, "step": 13220 }, { "epoch": 2.4510567296996664, "grad_norm": 7.37109375, "learning_rate": 7.548943270300334e-06, "loss": 3.051, "mean_token_accuracy": 0.451077246452969, "step": 13221 }, { "epoch": 2.4512421208750466, "grad_norm": 8.2578125, "learning_rate": 7.548757879124954e-06, "loss": 2.6532, "mean_token_accuracy": 0.5150265957446809, "step": 13222 }, { "epoch": 2.4514275120504263, "grad_norm": 6.9296875, "learning_rate": 7.548572487949575e-06, "loss": 2.6949, "mean_token_accuracy": 0.5034596375617792, "step": 13223 }, { "epoch": 2.4516129032258065, "grad_norm": 8.484375, "learning_rate": 7.548387096774194e-06, "loss": 2.9165, "mean_token_accuracy": 0.48109264119762807, "step": 13224 }, { "epoch": 2.4517982944011867, "grad_norm": 8.078125, "learning_rate": 7.548201705598815e-06, "loss": 2.8765, "mean_token_accuracy": 0.4824469478648153, "step": 13225 }, { "epoch": 2.4519836855765664, "grad_norm": 8.4453125, "learning_rate": 7.548016314423434e-06, "loss": 3.0404, "mean_token_accuracy": 0.46179775280898877, "step": 13226 }, { "epoch": 2.4521690767519466, "grad_norm": 7.828125, "learning_rate": 7.547830923248054e-06, "loss": 2.717, "mean_token_accuracy": 0.4817970565453137, "step": 13227 }, { "epoch": 2.452354467927327, "grad_norm": 7.87109375, "learning_rate": 7.547645532072674e-06, "loss": 2.9725, "mean_token_accuracy": 0.4736559139784946, "step": 13228 }, { "epoch": 2.4525398591027066, "grad_norm": 7.078125, "learning_rate": 7.547460140897293e-06, "loss": 3.0439, "mean_token_accuracy": 0.449438202247191, "step": 13229 }, { "epoch": 2.4527252502780867, "grad_norm": 7.34375, "learning_rate": 7.547274749721914e-06, "loss": 2.3932, "mean_token_accuracy": 0.5071455672189199, "step": 13230 }, { "epoch": 2.452910641453467, "grad_norm": 6.60546875, "learning_rate": 7.547089358546534e-06, "loss": 2.5717, "mean_token_accuracy": 0.506631299734748, "step": 13231 }, { "epoch": 2.4530960326288467, "grad_norm": 7.05859375, "learning_rate": 7.546903967371154e-06, "loss": 3.4247, "mean_token_accuracy": 0.44687610307094955, "step": 13232 }, { "epoch": 2.453281423804227, "grad_norm": 8.203125, "learning_rate": 7.546718576195774e-06, "loss": 2.7161, "mean_token_accuracy": 0.4849804092294297, "step": 13233 }, { "epoch": 2.453466814979607, "grad_norm": 16.609375, "learning_rate": 7.546533185020394e-06, "loss": 2.7526, "mean_token_accuracy": 0.5026485490557346, "step": 13234 }, { "epoch": 2.4536522061549872, "grad_norm": 8.4296875, "learning_rate": 7.546347793845014e-06, "loss": 2.9868, "mean_token_accuracy": 0.453990453990454, "step": 13235 }, { "epoch": 2.453837597330367, "grad_norm": 9.5234375, "learning_rate": 7.546162402669633e-06, "loss": 3.3655, "mean_token_accuracy": 0.4521408497272276, "step": 13236 }, { "epoch": 2.454022988505747, "grad_norm": 8.4921875, "learning_rate": 7.545977011494253e-06, "loss": 2.8036, "mean_token_accuracy": 0.4907233127184727, "step": 13237 }, { "epoch": 2.4542083796811274, "grad_norm": 9.65625, "learning_rate": 7.545791620318873e-06, "loss": 2.5723, "mean_token_accuracy": 0.5023023286409682, "step": 13238 }, { "epoch": 2.454393770856507, "grad_norm": 8.40625, "learning_rate": 7.545606229143493e-06, "loss": 2.7374, "mean_token_accuracy": 0.5142514251425142, "step": 13239 }, { "epoch": 2.4545791620318873, "grad_norm": 7.21484375, "learning_rate": 7.545420837968114e-06, "loss": 2.9025, "mean_token_accuracy": 0.47911007631612984, "step": 13240 }, { "epoch": 2.4547645532072675, "grad_norm": 10.3046875, "learning_rate": 7.545235446792733e-06, "loss": 2.744, "mean_token_accuracy": 0.4700448308653716, "step": 13241 }, { "epoch": 2.4549499443826472, "grad_norm": 7.32421875, "learning_rate": 7.545050055617354e-06, "loss": 3.4312, "mean_token_accuracy": 0.4547646237787083, "step": 13242 }, { "epoch": 2.4551353355580274, "grad_norm": 8.578125, "learning_rate": 7.5448646644419734e-06, "loss": 2.874, "mean_token_accuracy": 0.4737789781679046, "step": 13243 }, { "epoch": 2.4553207267334076, "grad_norm": 10.2265625, "learning_rate": 7.544679273266593e-06, "loss": 3.0093, "mean_token_accuracy": 0.4699094180419407, "step": 13244 }, { "epoch": 2.4555061179087874, "grad_norm": 8.8515625, "learning_rate": 7.544493882091213e-06, "loss": 3.0177, "mean_token_accuracy": 0.46644363341443634, "step": 13245 }, { "epoch": 2.4556915090841676, "grad_norm": 9.4609375, "learning_rate": 7.544308490915832e-06, "loss": 2.2757, "mean_token_accuracy": 0.5451754971306553, "step": 13246 }, { "epoch": 2.4558769002595477, "grad_norm": 9.078125, "learning_rate": 7.544123099740452e-06, "loss": 3.0499, "mean_token_accuracy": 0.46343107516009435, "step": 13247 }, { "epoch": 2.456062291434928, "grad_norm": 6.71484375, "learning_rate": 7.543937708565073e-06, "loss": 3.0525, "mean_token_accuracy": 0.4864864864864865, "step": 13248 }, { "epoch": 2.4562476826103077, "grad_norm": 11.4921875, "learning_rate": 7.543752317389693e-06, "loss": 3.4827, "mean_token_accuracy": 0.5011206328279499, "step": 13249 }, { "epoch": 2.456433073785688, "grad_norm": 7.56640625, "learning_rate": 7.543566926214313e-06, "loss": 3.0248, "mean_token_accuracy": 0.4866333725029377, "step": 13250 }, { "epoch": 2.4566184649610676, "grad_norm": 8.6640625, "learning_rate": 7.543381535038933e-06, "loss": 2.2933, "mean_token_accuracy": 0.5190633443410387, "step": 13251 }, { "epoch": 2.456803856136448, "grad_norm": 6.83984375, "learning_rate": 7.543196143863553e-06, "loss": 2.2886, "mean_token_accuracy": 0.5615132558832291, "step": 13252 }, { "epoch": 2.456989247311828, "grad_norm": 7.12109375, "learning_rate": 7.543010752688172e-06, "loss": 2.9018, "mean_token_accuracy": 0.4667919799498747, "step": 13253 }, { "epoch": 2.457174638487208, "grad_norm": 6.97265625, "learning_rate": 7.542825361512792e-06, "loss": 3.039, "mean_token_accuracy": 0.4797348011008256, "step": 13254 }, { "epoch": 2.457360029662588, "grad_norm": 6.46875, "learning_rate": 7.542639970337412e-06, "loss": 2.7739, "mean_token_accuracy": 0.4895278665246716, "step": 13255 }, { "epoch": 2.457545420837968, "grad_norm": 9.1328125, "learning_rate": 7.542454579162033e-06, "loss": 2.5269, "mean_token_accuracy": 0.5278604849000426, "step": 13256 }, { "epoch": 2.4577308120133483, "grad_norm": 7.72265625, "learning_rate": 7.542269187986653e-06, "loss": 2.6555, "mean_token_accuracy": 0.5005268703898841, "step": 13257 }, { "epoch": 2.457916203188728, "grad_norm": 7.8515625, "learning_rate": 7.542083796811272e-06, "loss": 3.2479, "mean_token_accuracy": 0.45065410779696496, "step": 13258 }, { "epoch": 2.4581015943641082, "grad_norm": 7.44921875, "learning_rate": 7.541898405635892e-06, "loss": 3.301, "mean_token_accuracy": 0.4671036659383345, "step": 13259 }, { "epoch": 2.4582869855394884, "grad_norm": 7.74609375, "learning_rate": 7.5417130144605125e-06, "loss": 2.5827, "mean_token_accuracy": 0.5135805130416038, "step": 13260 }, { "epoch": 2.458472376714868, "grad_norm": 7.62109375, "learning_rate": 7.541527623285132e-06, "loss": 2.6281, "mean_token_accuracy": 0.4961340206185567, "step": 13261 }, { "epoch": 2.4586577678902484, "grad_norm": 9.578125, "learning_rate": 7.541342232109752e-06, "loss": 3.1316, "mean_token_accuracy": 0.45230078563411896, "step": 13262 }, { "epoch": 2.4588431590656286, "grad_norm": 8.671875, "learning_rate": 7.541156840934371e-06, "loss": 2.694, "mean_token_accuracy": 0.47699468085106383, "step": 13263 }, { "epoch": 2.4590285502410083, "grad_norm": 7.93359375, "learning_rate": 7.540971449758993e-06, "loss": 2.7149, "mean_token_accuracy": 0.5159562077801072, "step": 13264 }, { "epoch": 2.4592139414163885, "grad_norm": 10.34375, "learning_rate": 7.540786058583612e-06, "loss": 2.7619, "mean_token_accuracy": 0.5025322283609577, "step": 13265 }, { "epoch": 2.4593993325917687, "grad_norm": 8.9375, "learning_rate": 7.540600667408232e-06, "loss": 2.7486, "mean_token_accuracy": 0.4464885188666577, "step": 13266 }, { "epoch": 2.459584723767149, "grad_norm": 6.17578125, "learning_rate": 7.540415276232852e-06, "loss": 2.6317, "mean_token_accuracy": 0.5076169029010466, "step": 13267 }, { "epoch": 2.4597701149425286, "grad_norm": 6.5625, "learning_rate": 7.540229885057472e-06, "loss": 2.2256, "mean_token_accuracy": 0.5350691619202603, "step": 13268 }, { "epoch": 2.459955506117909, "grad_norm": 9.3125, "learning_rate": 7.540044493882092e-06, "loss": 2.823, "mean_token_accuracy": 0.49676519091716353, "step": 13269 }, { "epoch": 2.460140897293289, "grad_norm": 15.5078125, "learning_rate": 7.5398591027067114e-06, "loss": 3.2457, "mean_token_accuracy": 0.443209574987787, "step": 13270 }, { "epoch": 2.4603262884686687, "grad_norm": 8.890625, "learning_rate": 7.539673711531331e-06, "loss": 2.8765, "mean_token_accuracy": 0.46920492721164614, "step": 13271 }, { "epoch": 2.460511679644049, "grad_norm": 8.828125, "learning_rate": 7.5394883203559524e-06, "loss": 2.5411, "mean_token_accuracy": 0.5048463356973996, "step": 13272 }, { "epoch": 2.460697070819429, "grad_norm": 10.8359375, "learning_rate": 7.539302929180572e-06, "loss": 2.3473, "mean_token_accuracy": 0.5099614395886889, "step": 13273 }, { "epoch": 2.460882461994809, "grad_norm": 7.328125, "learning_rate": 7.539117538005192e-06, "loss": 3.1377, "mean_token_accuracy": 0.46026069904256545, "step": 13274 }, { "epoch": 2.461067853170189, "grad_norm": 8.09375, "learning_rate": 7.538932146829811e-06, "loss": 2.9261, "mean_token_accuracy": 0.4772105742935278, "step": 13275 }, { "epoch": 2.4612532443455692, "grad_norm": 6.44921875, "learning_rate": 7.538746755654431e-06, "loss": 3.537, "mean_token_accuracy": 0.4269978401727862, "step": 13276 }, { "epoch": 2.461438635520949, "grad_norm": 7.44140625, "learning_rate": 7.5385613644790515e-06, "loss": 2.6834, "mean_token_accuracy": 0.5136465324384788, "step": 13277 }, { "epoch": 2.461624026696329, "grad_norm": 7.81640625, "learning_rate": 7.538375973303671e-06, "loss": 2.7812, "mean_token_accuracy": 0.4897326045206087, "step": 13278 }, { "epoch": 2.4618094178717094, "grad_norm": 9.6875, "learning_rate": 7.538190582128291e-06, "loss": 3.0179, "mean_token_accuracy": 0.45230582524271845, "step": 13279 }, { "epoch": 2.4619948090470896, "grad_norm": 9.171875, "learning_rate": 7.538005190952912e-06, "loss": 2.5547, "mean_token_accuracy": 0.5022950819672131, "step": 13280 }, { "epoch": 2.4621802002224693, "grad_norm": 7.6171875, "learning_rate": 7.537819799777532e-06, "loss": 3.5734, "mean_token_accuracy": 0.4427555773473891, "step": 13281 }, { "epoch": 2.4623655913978495, "grad_norm": 10.8515625, "learning_rate": 7.537634408602151e-06, "loss": 3.0597, "mean_token_accuracy": 0.4607632356893308, "step": 13282 }, { "epoch": 2.4625509825732297, "grad_norm": 8.0859375, "learning_rate": 7.537449017426771e-06, "loss": 2.7501, "mean_token_accuracy": 0.4774280273328106, "step": 13283 }, { "epoch": 2.4627363737486094, "grad_norm": 7.625, "learning_rate": 7.537263626251391e-06, "loss": 2.8907, "mean_token_accuracy": 0.4921441774491682, "step": 13284 }, { "epoch": 2.4629217649239896, "grad_norm": 7.16796875, "learning_rate": 7.537078235076011e-06, "loss": 3.3606, "mean_token_accuracy": 0.43273841236014915, "step": 13285 }, { "epoch": 2.46310715609937, "grad_norm": 8.09375, "learning_rate": 7.536892843900631e-06, "loss": 2.9947, "mean_token_accuracy": 0.47450684488519995, "step": 13286 }, { "epoch": 2.4632925472747496, "grad_norm": 8.140625, "learning_rate": 7.5367074527252505e-06, "loss": 2.5571, "mean_token_accuracy": 0.5103296193129062, "step": 13287 }, { "epoch": 2.4634779384501297, "grad_norm": 6.9296875, "learning_rate": 7.536522061549871e-06, "loss": 3.0481, "mean_token_accuracy": 0.4744993742177722, "step": 13288 }, { "epoch": 2.46366332962551, "grad_norm": 8.2109375, "learning_rate": 7.5363366703744915e-06, "loss": 2.9223, "mean_token_accuracy": 0.48070626753975676, "step": 13289 }, { "epoch": 2.4638487208008897, "grad_norm": 7.265625, "learning_rate": 7.536151279199111e-06, "loss": 2.5837, "mean_token_accuracy": 0.5, "step": 13290 }, { "epoch": 2.46403411197627, "grad_norm": 7.51171875, "learning_rate": 7.535965888023731e-06, "loss": 2.6596, "mean_token_accuracy": 0.49928140270192584, "step": 13291 }, { "epoch": 2.46421950315165, "grad_norm": 6.70703125, "learning_rate": 7.53578049684835e-06, "loss": 2.4983, "mean_token_accuracy": 0.4989667049368542, "step": 13292 }, { "epoch": 2.4644048943270302, "grad_norm": 7.58984375, "learning_rate": 7.53559510567297e-06, "loss": 2.5259, "mean_token_accuracy": 0.5089542892924233, "step": 13293 }, { "epoch": 2.46459028550241, "grad_norm": 6.796875, "learning_rate": 7.5354097144975905e-06, "loss": 3.3845, "mean_token_accuracy": 0.45827458018202794, "step": 13294 }, { "epoch": 2.46477567667779, "grad_norm": 12.265625, "learning_rate": 7.53522432332221e-06, "loss": 3.9096, "mean_token_accuracy": 0.4415058696532182, "step": 13295 }, { "epoch": 2.4649610678531704, "grad_norm": 6.83203125, "learning_rate": 7.535038932146831e-06, "loss": 2.4868, "mean_token_accuracy": 0.5420560747663551, "step": 13296 }, { "epoch": 2.46514645902855, "grad_norm": 7.65234375, "learning_rate": 7.53485354097145e-06, "loss": 2.7698, "mean_token_accuracy": 0.4740678145135735, "step": 13297 }, { "epoch": 2.4653318502039303, "grad_norm": 8.046875, "learning_rate": 7.534668149796071e-06, "loss": 3.0664, "mean_token_accuracy": 0.4593239943451998, "step": 13298 }, { "epoch": 2.4655172413793105, "grad_norm": 11.7890625, "learning_rate": 7.5344827586206904e-06, "loss": 2.5011, "mean_token_accuracy": 0.4934348449740704, "step": 13299 }, { "epoch": 2.4657026325546902, "grad_norm": 12.5078125, "learning_rate": 7.53429736744531e-06, "loss": 2.2675, "mean_token_accuracy": 0.5410469107551488, "step": 13300 }, { "epoch": 2.4658880237300704, "grad_norm": 9.7890625, "learning_rate": 7.53411197626993e-06, "loss": 2.9737, "mean_token_accuracy": 0.4888496056567854, "step": 13301 }, { "epoch": 2.4660734149054506, "grad_norm": 6.9140625, "learning_rate": 7.533926585094549e-06, "loss": 2.9123, "mean_token_accuracy": 0.46083532555196177, "step": 13302 }, { "epoch": 2.4662588060808304, "grad_norm": 9.4453125, "learning_rate": 7.53374119391917e-06, "loss": 3.3758, "mean_token_accuracy": 0.45414146401678807, "step": 13303 }, { "epoch": 2.4664441972562106, "grad_norm": 11.234375, "learning_rate": 7.53355580274379e-06, "loss": 2.471, "mean_token_accuracy": 0.5417844622865543, "step": 13304 }, { "epoch": 2.4666295884315907, "grad_norm": 14.1953125, "learning_rate": 7.53337041156841e-06, "loss": 2.791, "mean_token_accuracy": 0.4993684210526316, "step": 13305 }, { "epoch": 2.466814979606971, "grad_norm": 7.63671875, "learning_rate": 7.5331850203930305e-06, "loss": 3.2095, "mean_token_accuracy": 0.4437604924454393, "step": 13306 }, { "epoch": 2.4670003707823507, "grad_norm": 12.0390625, "learning_rate": 7.53299962921765e-06, "loss": 2.7843, "mean_token_accuracy": 0.4885530619149656, "step": 13307 }, { "epoch": 2.467185761957731, "grad_norm": 9.25, "learning_rate": 7.53281423804227e-06, "loss": 2.6579, "mean_token_accuracy": 0.5365567574612277, "step": 13308 }, { "epoch": 2.4673711531331106, "grad_norm": 8.171875, "learning_rate": 7.5326288468668894e-06, "loss": 2.7935, "mean_token_accuracy": 0.5261004514672686, "step": 13309 }, { "epoch": 2.467556544308491, "grad_norm": 6.8046875, "learning_rate": 7.532443455691509e-06, "loss": 2.7979, "mean_token_accuracy": 0.5002209944751381, "step": 13310 }, { "epoch": 2.467741935483871, "grad_norm": 9.0078125, "learning_rate": 7.5322580645161296e-06, "loss": 2.6403, "mean_token_accuracy": 0.49795261772448085, "step": 13311 }, { "epoch": 2.467927326659251, "grad_norm": 7.921875, "learning_rate": 7.53207267334075e-06, "loss": 3.096, "mean_token_accuracy": 0.4681624467285034, "step": 13312 }, { "epoch": 2.468112717834631, "grad_norm": 7.27734375, "learning_rate": 7.53188728216537e-06, "loss": 2.1734, "mean_token_accuracy": 0.5699150520187076, "step": 13313 }, { "epoch": 2.468298109010011, "grad_norm": 7.68359375, "learning_rate": 7.531701890989989e-06, "loss": 3.0781, "mean_token_accuracy": 0.45792450652199007, "step": 13314 }, { "epoch": 2.4684835001853913, "grad_norm": 8.4609375, "learning_rate": 7.53151649981461e-06, "loss": 3.2745, "mean_token_accuracy": 0.4541628304616599, "step": 13315 }, { "epoch": 2.468668891360771, "grad_norm": 8.96875, "learning_rate": 7.5313311086392295e-06, "loss": 3.1627, "mean_token_accuracy": 0.42383209207853756, "step": 13316 }, { "epoch": 2.4688542825361512, "grad_norm": 9.2265625, "learning_rate": 7.531145717463849e-06, "loss": 2.4012, "mean_token_accuracy": 0.5350802008081302, "step": 13317 }, { "epoch": 2.4690396737115314, "grad_norm": 6.921875, "learning_rate": 7.530960326288469e-06, "loss": 2.5358, "mean_token_accuracy": 0.503161159489443, "step": 13318 }, { "epoch": 2.4692250648869116, "grad_norm": 7.78125, "learning_rate": 7.530774935113088e-06, "loss": 2.4126, "mean_token_accuracy": 0.5125684420109508, "step": 13319 }, { "epoch": 2.4694104560622914, "grad_norm": 7.74609375, "learning_rate": 7.53058954393771e-06, "loss": 3.2211, "mean_token_accuracy": 0.4494884158532283, "step": 13320 }, { "epoch": 2.4695958472376716, "grad_norm": 6.546875, "learning_rate": 7.530404152762329e-06, "loss": 2.8311, "mean_token_accuracy": 0.48358938547486036, "step": 13321 }, { "epoch": 2.4697812384130513, "grad_norm": 8.3671875, "learning_rate": 7.530218761586949e-06, "loss": 3.6416, "mean_token_accuracy": 0.45073919422187114, "step": 13322 }, { "epoch": 2.4699666295884315, "grad_norm": 8.796875, "learning_rate": 7.5300333704115695e-06, "loss": 2.8854, "mean_token_accuracy": 0.46971153846153846, "step": 13323 }, { "epoch": 2.4701520207638117, "grad_norm": 8.390625, "learning_rate": 7.529847979236189e-06, "loss": 2.5262, "mean_token_accuracy": 0.5237226277372263, "step": 13324 }, { "epoch": 2.470337411939192, "grad_norm": 10.6640625, "learning_rate": 7.529662588060809e-06, "loss": 2.5144, "mean_token_accuracy": 0.5209748302037555, "step": 13325 }, { "epoch": 2.4705228031145716, "grad_norm": 7.5546875, "learning_rate": 7.5294771968854285e-06, "loss": 3.1682, "mean_token_accuracy": 0.46429780033840945, "step": 13326 }, { "epoch": 2.470708194289952, "grad_norm": 7.890625, "learning_rate": 7.529291805710048e-06, "loss": 3.5393, "mean_token_accuracy": 0.4307639445653419, "step": 13327 }, { "epoch": 2.470893585465332, "grad_norm": 9.1953125, "learning_rate": 7.5291064145346695e-06, "loss": 2.8436, "mean_token_accuracy": 0.46603908484270734, "step": 13328 }, { "epoch": 2.4710789766407117, "grad_norm": 7.890625, "learning_rate": 7.528921023359289e-06, "loss": 2.5791, "mean_token_accuracy": 0.5097365406643757, "step": 13329 }, { "epoch": 2.471264367816092, "grad_norm": 6.765625, "learning_rate": 7.528735632183909e-06, "loss": 2.7928, "mean_token_accuracy": 0.49418257756563244, "step": 13330 }, { "epoch": 2.471449758991472, "grad_norm": 9.2265625, "learning_rate": 7.528550241008528e-06, "loss": 3.1077, "mean_token_accuracy": 0.4617847138855542, "step": 13331 }, { "epoch": 2.471635150166852, "grad_norm": 9.0859375, "learning_rate": 7.528364849833149e-06, "loss": 3.0796, "mean_token_accuracy": 0.4658896658896659, "step": 13332 }, { "epoch": 2.471820541342232, "grad_norm": 7.2265625, "learning_rate": 7.5281794586577685e-06, "loss": 2.9862, "mean_token_accuracy": 0.4403870967741936, "step": 13333 }, { "epoch": 2.4720059325176122, "grad_norm": 7.3515625, "learning_rate": 7.527994067482388e-06, "loss": 2.9313, "mean_token_accuracy": 0.4601187403200826, "step": 13334 }, { "epoch": 2.472191323692992, "grad_norm": 7.8515625, "learning_rate": 7.527808676307008e-06, "loss": 2.9619, "mean_token_accuracy": 0.4835637480798771, "step": 13335 }, { "epoch": 2.472376714868372, "grad_norm": 7.18359375, "learning_rate": 7.527623285131629e-06, "loss": 3.5066, "mean_token_accuracy": 0.4232268121590023, "step": 13336 }, { "epoch": 2.4725621060437524, "grad_norm": 8.1796875, "learning_rate": 7.527437893956249e-06, "loss": 3.3012, "mean_token_accuracy": 0.46864857847591845, "step": 13337 }, { "epoch": 2.4727474972191326, "grad_norm": 6.80078125, "learning_rate": 7.5272525027808684e-06, "loss": 2.7245, "mean_token_accuracy": 0.4970202622169249, "step": 13338 }, { "epoch": 2.4729328883945123, "grad_norm": 6.9453125, "learning_rate": 7.527067111605488e-06, "loss": 2.4637, "mean_token_accuracy": 0.5198421164924258, "step": 13339 }, { "epoch": 2.4731182795698925, "grad_norm": 9.1796875, "learning_rate": 7.526881720430108e-06, "loss": 2.4712, "mean_token_accuracy": 0.5152079453755432, "step": 13340 }, { "epoch": 2.4733036707452727, "grad_norm": 7.62890625, "learning_rate": 7.526696329254728e-06, "loss": 2.4651, "mean_token_accuracy": 0.5261992619926199, "step": 13341 }, { "epoch": 2.4734890619206524, "grad_norm": 7.48046875, "learning_rate": 7.526510938079348e-06, "loss": 2.9323, "mean_token_accuracy": 0.4613366907984472, "step": 13342 }, { "epoch": 2.4736744530960326, "grad_norm": 7.7578125, "learning_rate": 7.5263255469039675e-06, "loss": 2.7491, "mean_token_accuracy": 0.4887230514096186, "step": 13343 }, { "epoch": 2.473859844271413, "grad_norm": 8.8125, "learning_rate": 7.526140155728589e-06, "loss": 2.8553, "mean_token_accuracy": 0.466578073089701, "step": 13344 }, { "epoch": 2.4740452354467926, "grad_norm": 8.78125, "learning_rate": 7.5259547645532085e-06, "loss": 2.8389, "mean_token_accuracy": 0.4698604060913706, "step": 13345 }, { "epoch": 2.4742306266221727, "grad_norm": 10.0234375, "learning_rate": 7.525769373377828e-06, "loss": 2.4711, "mean_token_accuracy": 0.5102293862368258, "step": 13346 }, { "epoch": 2.474416017797553, "grad_norm": 6.609375, "learning_rate": 7.525583982202448e-06, "loss": 3.2931, "mean_token_accuracy": 0.4564947172156619, "step": 13347 }, { "epoch": 2.4746014089729327, "grad_norm": 6.77734375, "learning_rate": 7.525398591027067e-06, "loss": 2.0569, "mean_token_accuracy": 0.572173124858373, "step": 13348 }, { "epoch": 2.474786800148313, "grad_norm": 7.37109375, "learning_rate": 7.525213199851688e-06, "loss": 3.2619, "mean_token_accuracy": 0.46810165975103735, "step": 13349 }, { "epoch": 2.474972191323693, "grad_norm": 7.640625, "learning_rate": 7.5250278086763076e-06, "loss": 3.283, "mean_token_accuracy": 0.4435272536687631, "step": 13350 }, { "epoch": 2.4751575824990732, "grad_norm": 7.67578125, "learning_rate": 7.524842417500927e-06, "loss": 3.7357, "mean_token_accuracy": 0.4129402671966435, "step": 13351 }, { "epoch": 2.475342973674453, "grad_norm": 8.2578125, "learning_rate": 7.524657026325548e-06, "loss": 2.8798, "mean_token_accuracy": 0.49326347305389223, "step": 13352 }, { "epoch": 2.475528364849833, "grad_norm": 11.15625, "learning_rate": 7.524471635150168e-06, "loss": 3.0943, "mean_token_accuracy": 0.4886431919240476, "step": 13353 }, { "epoch": 2.4757137560252134, "grad_norm": 7.83984375, "learning_rate": 7.524286243974788e-06, "loss": 2.8252, "mean_token_accuracy": 0.48654353562005276, "step": 13354 }, { "epoch": 2.475899147200593, "grad_norm": 7.0703125, "learning_rate": 7.5241008527994075e-06, "loss": 3.3741, "mean_token_accuracy": 0.4473328324567994, "step": 13355 }, { "epoch": 2.4760845383759733, "grad_norm": 8.640625, "learning_rate": 7.523915461624027e-06, "loss": 2.7767, "mean_token_accuracy": 0.48945416004239534, "step": 13356 }, { "epoch": 2.4762699295513535, "grad_norm": 8.671875, "learning_rate": 7.523730070448647e-06, "loss": 2.9909, "mean_token_accuracy": 0.4589222107319315, "step": 13357 }, { "epoch": 2.4764553207267332, "grad_norm": 7.32421875, "learning_rate": 7.523544679273267e-06, "loss": 2.4897, "mean_token_accuracy": 0.5608597553734711, "step": 13358 }, { "epoch": 2.4766407119021134, "grad_norm": 7.8203125, "learning_rate": 7.523359288097887e-06, "loss": 3.1675, "mean_token_accuracy": 0.46964476654389925, "step": 13359 }, { "epoch": 2.4768261030774936, "grad_norm": 13.1015625, "learning_rate": 7.523173896922507e-06, "loss": 3.1675, "mean_token_accuracy": 0.4482343499197432, "step": 13360 }, { "epoch": 2.4770114942528734, "grad_norm": 6.80078125, "learning_rate": 7.522988505747128e-06, "loss": 2.694, "mean_token_accuracy": 0.48408050260888086, "step": 13361 }, { "epoch": 2.4771968854282536, "grad_norm": 7.1796875, "learning_rate": 7.5228031145717475e-06, "loss": 2.6752, "mean_token_accuracy": 0.504384799362211, "step": 13362 }, { "epoch": 2.4773822766036337, "grad_norm": 8.515625, "learning_rate": 7.522617723396367e-06, "loss": 3.0982, "mean_token_accuracy": 0.4556381464494702, "step": 13363 }, { "epoch": 2.477567667779014, "grad_norm": 8.359375, "learning_rate": 7.522432332220987e-06, "loss": 2.6557, "mean_token_accuracy": 0.4727175938248603, "step": 13364 }, { "epoch": 2.4777530589543937, "grad_norm": 6.66796875, "learning_rate": 7.5222469410456065e-06, "loss": 2.7096, "mean_token_accuracy": 0.4813586574213647, "step": 13365 }, { "epoch": 2.477938450129774, "grad_norm": 8.203125, "learning_rate": 7.522061549870226e-06, "loss": 2.7456, "mean_token_accuracy": 0.4972761558876938, "step": 13366 }, { "epoch": 2.478123841305154, "grad_norm": 7.52734375, "learning_rate": 7.521876158694847e-06, "loss": 3.4023, "mean_token_accuracy": 0.4281520827509086, "step": 13367 }, { "epoch": 2.478309232480534, "grad_norm": 10.3828125, "learning_rate": 7.521690767519466e-06, "loss": 2.8668, "mean_token_accuracy": 0.5126627218934912, "step": 13368 }, { "epoch": 2.478494623655914, "grad_norm": 8.7109375, "learning_rate": 7.521505376344087e-06, "loss": 2.2928, "mean_token_accuracy": 0.5527035556490637, "step": 13369 }, { "epoch": 2.478680014831294, "grad_norm": 7.66015625, "learning_rate": 7.521319985168707e-06, "loss": 2.7185, "mean_token_accuracy": 0.5340132364109798, "step": 13370 }, { "epoch": 2.478865406006674, "grad_norm": 7.015625, "learning_rate": 7.521134593993327e-06, "loss": 2.7987, "mean_token_accuracy": 0.493445833961127, "step": 13371 }, { "epoch": 2.479050797182054, "grad_norm": 8.34375, "learning_rate": 7.5209492028179465e-06, "loss": 2.3068, "mean_token_accuracy": 0.5200969758836289, "step": 13372 }, { "epoch": 2.4792361883574343, "grad_norm": 7.953125, "learning_rate": 7.520763811642566e-06, "loss": 3.0508, "mean_token_accuracy": 0.46788263283108644, "step": 13373 }, { "epoch": 2.479421579532814, "grad_norm": 8.1796875, "learning_rate": 7.520578420467186e-06, "loss": 3.2592, "mean_token_accuracy": 0.43542822858857055, "step": 13374 }, { "epoch": 2.4796069707081942, "grad_norm": 6.921875, "learning_rate": 7.520393029291806e-06, "loss": 2.8971, "mean_token_accuracy": 0.4828077959118999, "step": 13375 }, { "epoch": 2.4797923618835744, "grad_norm": 7.34375, "learning_rate": 7.520207638116426e-06, "loss": 2.9779, "mean_token_accuracy": 0.45186615186615187, "step": 13376 }, { "epoch": 2.4799777530589546, "grad_norm": 8.578125, "learning_rate": 7.520022246941046e-06, "loss": 2.9864, "mean_token_accuracy": 0.4780915287244401, "step": 13377 }, { "epoch": 2.4801631442343344, "grad_norm": 8.3359375, "learning_rate": 7.519836855765666e-06, "loss": 2.8084, "mean_token_accuracy": 0.49474615639862846, "step": 13378 }, { "epoch": 2.4803485354097146, "grad_norm": 7.640625, "learning_rate": 7.5196514645902866e-06, "loss": 2.9678, "mean_token_accuracy": 0.4765432098765432, "step": 13379 }, { "epoch": 2.4805339265850943, "grad_norm": 7.59765625, "learning_rate": 7.519466073414906e-06, "loss": 2.6507, "mean_token_accuracy": 0.493480150680962, "step": 13380 }, { "epoch": 2.4807193177604745, "grad_norm": 7.4140625, "learning_rate": 7.519280682239526e-06, "loss": 2.6835, "mean_token_accuracy": 0.48588987833939545, "step": 13381 }, { "epoch": 2.4809047089358547, "grad_norm": 6.97265625, "learning_rate": 7.5190952910641455e-06, "loss": 2.4351, "mean_token_accuracy": 0.5155837295298468, "step": 13382 }, { "epoch": 2.481090100111235, "grad_norm": 6.4453125, "learning_rate": 7.518909899888765e-06, "loss": 2.6197, "mean_token_accuracy": 0.5044617643736586, "step": 13383 }, { "epoch": 2.4812754912866146, "grad_norm": 7.70703125, "learning_rate": 7.518724508713386e-06, "loss": 2.7679, "mean_token_accuracy": 0.4906679764243615, "step": 13384 }, { "epoch": 2.481460882461995, "grad_norm": 9.9296875, "learning_rate": 7.518539117538006e-06, "loss": 3.82, "mean_token_accuracy": 0.4376544596719838, "step": 13385 }, { "epoch": 2.481646273637375, "grad_norm": 7.109375, "learning_rate": 7.518353726362626e-06, "loss": 2.8567, "mean_token_accuracy": 0.4698809961967857, "step": 13386 }, { "epoch": 2.4818316648127547, "grad_norm": 7.4921875, "learning_rate": 7.518168335187246e-06, "loss": 3.1748, "mean_token_accuracy": 0.46855796804588284, "step": 13387 }, { "epoch": 2.482017055988135, "grad_norm": 6.16015625, "learning_rate": 7.517982944011866e-06, "loss": 2.6274, "mean_token_accuracy": 0.515479049906592, "step": 13388 }, { "epoch": 2.482202447163515, "grad_norm": 7.67578125, "learning_rate": 7.5177975528364855e-06, "loss": 3.2758, "mean_token_accuracy": 0.47297999480384517, "step": 13389 }, { "epoch": 2.4823878383388953, "grad_norm": 7.75390625, "learning_rate": 7.517612161661105e-06, "loss": 2.3158, "mean_token_accuracy": 0.5441895449417075, "step": 13390 }, { "epoch": 2.482573229514275, "grad_norm": 7.0078125, "learning_rate": 7.517426770485725e-06, "loss": 3.1104, "mean_token_accuracy": 0.46416107382550337, "step": 13391 }, { "epoch": 2.4827586206896552, "grad_norm": 8.25, "learning_rate": 7.517241379310345e-06, "loss": 2.5368, "mean_token_accuracy": 0.5137787337007662, "step": 13392 }, { "epoch": 2.482944011865035, "grad_norm": 8.34375, "learning_rate": 7.517055988134966e-06, "loss": 3.7509, "mean_token_accuracy": 0.4327950605556875, "step": 13393 }, { "epoch": 2.483129403040415, "grad_norm": 8.75, "learning_rate": 7.5168705969595855e-06, "loss": 2.3938, "mean_token_accuracy": 0.5130890052356021, "step": 13394 }, { "epoch": 2.4833147942157954, "grad_norm": 6.50390625, "learning_rate": 7.516685205784205e-06, "loss": 3.1548, "mean_token_accuracy": 0.44699950811608463, "step": 13395 }, { "epoch": 2.4835001853911756, "grad_norm": 6.7265625, "learning_rate": 7.516499814608826e-06, "loss": 2.8864, "mean_token_accuracy": 0.47256312596981237, "step": 13396 }, { "epoch": 2.4836855765665553, "grad_norm": 7.33984375, "learning_rate": 7.516314423433445e-06, "loss": 2.8077, "mean_token_accuracy": 0.523390643742503, "step": 13397 }, { "epoch": 2.4838709677419355, "grad_norm": 8.40625, "learning_rate": 7.516129032258065e-06, "loss": 2.6146, "mean_token_accuracy": 0.49390937144713337, "step": 13398 }, { "epoch": 2.4840563589173157, "grad_norm": 7.3515625, "learning_rate": 7.5159436410826845e-06, "loss": 3.4517, "mean_token_accuracy": 0.4265389082462253, "step": 13399 }, { "epoch": 2.4842417500926954, "grad_norm": 9.7265625, "learning_rate": 7.515758249907304e-06, "loss": 2.4666, "mean_token_accuracy": 0.4980188211986132, "step": 13400 }, { "epoch": 2.4844271412680756, "grad_norm": 8.578125, "learning_rate": 7.5155728587319255e-06, "loss": 2.8473, "mean_token_accuracy": 0.4804493103938575, "step": 13401 }, { "epoch": 2.484612532443456, "grad_norm": 8.1875, "learning_rate": 7.515387467556545e-06, "loss": 2.9203, "mean_token_accuracy": 0.4864145502938893, "step": 13402 }, { "epoch": 2.4847979236188356, "grad_norm": 11.3359375, "learning_rate": 7.515202076381165e-06, "loss": 2.925, "mean_token_accuracy": 0.46713742531233027, "step": 13403 }, { "epoch": 2.4849833147942157, "grad_norm": 9.859375, "learning_rate": 7.515016685205785e-06, "loss": 3.007, "mean_token_accuracy": 0.49192928516525747, "step": 13404 }, { "epoch": 2.485168705969596, "grad_norm": 6.8984375, "learning_rate": 7.514831294030405e-06, "loss": 3.1074, "mean_token_accuracy": 0.46310925622298404, "step": 13405 }, { "epoch": 2.4853540971449757, "grad_norm": 8.8515625, "learning_rate": 7.5146459028550246e-06, "loss": 2.9993, "mean_token_accuracy": 0.48634661968729354, "step": 13406 }, { "epoch": 2.485539488320356, "grad_norm": 9.28125, "learning_rate": 7.514460511679644e-06, "loss": 3.0121, "mean_token_accuracy": 0.4662534435261708, "step": 13407 }, { "epoch": 2.485724879495736, "grad_norm": 6.15625, "learning_rate": 7.514275120504264e-06, "loss": 2.7469, "mean_token_accuracy": 0.4930687686871432, "step": 13408 }, { "epoch": 2.4859102706711163, "grad_norm": 7.125, "learning_rate": 7.514089729328885e-06, "loss": 3.0081, "mean_token_accuracy": 0.4764559190259645, "step": 13409 }, { "epoch": 2.486095661846496, "grad_norm": 6.91796875, "learning_rate": 7.513904338153505e-06, "loss": 2.8525, "mean_token_accuracy": 0.4921892434724392, "step": 13410 }, { "epoch": 2.486281053021876, "grad_norm": 9.0234375, "learning_rate": 7.5137189469781245e-06, "loss": 2.6243, "mean_token_accuracy": 0.484802661424467, "step": 13411 }, { "epoch": 2.4864664441972564, "grad_norm": 7.37890625, "learning_rate": 7.513533555802744e-06, "loss": 3.3506, "mean_token_accuracy": 0.44198342383538153, "step": 13412 }, { "epoch": 2.486651835372636, "grad_norm": 7.69140625, "learning_rate": 7.513348164627365e-06, "loss": 2.4477, "mean_token_accuracy": 0.5536558242980262, "step": 13413 }, { "epoch": 2.4868372265480163, "grad_norm": 7.6796875, "learning_rate": 7.513162773451984e-06, "loss": 2.3984, "mean_token_accuracy": 0.5429239505622981, "step": 13414 }, { "epoch": 2.4870226177233965, "grad_norm": 7.9765625, "learning_rate": 7.512977382276604e-06, "loss": 3.301, "mean_token_accuracy": 0.4442921236291127, "step": 13415 }, { "epoch": 2.4872080088987762, "grad_norm": 7.75390625, "learning_rate": 7.5127919911012236e-06, "loss": 3.0548, "mean_token_accuracy": 0.4658942983598021, "step": 13416 }, { "epoch": 2.4873934000741564, "grad_norm": 8.1015625, "learning_rate": 7.512606599925845e-06, "loss": 2.9517, "mean_token_accuracy": 0.5045510455104552, "step": 13417 }, { "epoch": 2.4875787912495366, "grad_norm": 7.25, "learning_rate": 7.5124212087504645e-06, "loss": 3.0516, "mean_token_accuracy": 0.45899091343854614, "step": 13418 }, { "epoch": 2.4877641824249164, "grad_norm": 8.4375, "learning_rate": 7.512235817575084e-06, "loss": 2.8354, "mean_token_accuracy": 0.48569033648664833, "step": 13419 }, { "epoch": 2.4879495736002966, "grad_norm": 7.06640625, "learning_rate": 7.512050426399704e-06, "loss": 2.4433, "mean_token_accuracy": 0.5433648286560429, "step": 13420 }, { "epoch": 2.4881349647756767, "grad_norm": 8.1484375, "learning_rate": 7.5118650352243235e-06, "loss": 2.8987, "mean_token_accuracy": 0.5172338806132435, "step": 13421 }, { "epoch": 2.488320355951057, "grad_norm": 7.01953125, "learning_rate": 7.511679644048944e-06, "loss": 2.8077, "mean_token_accuracy": 0.4860454115421003, "step": 13422 }, { "epoch": 2.4885057471264367, "grad_norm": 6.37109375, "learning_rate": 7.511494252873564e-06, "loss": 2.9741, "mean_token_accuracy": 0.4604071067184375, "step": 13423 }, { "epoch": 2.488691138301817, "grad_norm": 7.3828125, "learning_rate": 7.511308861698183e-06, "loss": 3.0482, "mean_token_accuracy": 0.47602070155261644, "step": 13424 }, { "epoch": 2.488876529477197, "grad_norm": 6.68359375, "learning_rate": 7.511123470522805e-06, "loss": 2.6006, "mean_token_accuracy": 0.5005136106831022, "step": 13425 }, { "epoch": 2.489061920652577, "grad_norm": 6.94140625, "learning_rate": 7.510938079347424e-06, "loss": 2.8546, "mean_token_accuracy": 0.47346611415586154, "step": 13426 }, { "epoch": 2.489247311827957, "grad_norm": 8.109375, "learning_rate": 7.510752688172044e-06, "loss": 3.4283, "mean_token_accuracy": 0.4365817495619356, "step": 13427 }, { "epoch": 2.489432703003337, "grad_norm": 7.7109375, "learning_rate": 7.5105672969966635e-06, "loss": 3.2388, "mean_token_accuracy": 0.4547099400655886, "step": 13428 }, { "epoch": 2.489618094178717, "grad_norm": 7.70703125, "learning_rate": 7.510381905821283e-06, "loss": 3.4889, "mean_token_accuracy": 0.43380668804397615, "step": 13429 }, { "epoch": 2.489803485354097, "grad_norm": 9.484375, "learning_rate": 7.510196514645904e-06, "loss": 2.4831, "mean_token_accuracy": 0.4905131166474179, "step": 13430 }, { "epoch": 2.4899888765294773, "grad_norm": 9.75, "learning_rate": 7.510011123470523e-06, "loss": 2.6115, "mean_token_accuracy": 0.4971209213051823, "step": 13431 }, { "epoch": 2.490174267704857, "grad_norm": 10.953125, "learning_rate": 7.509825732295143e-06, "loss": 3.3875, "mean_token_accuracy": 0.45209918286841366, "step": 13432 }, { "epoch": 2.4903596588802372, "grad_norm": 6.05859375, "learning_rate": 7.5096403411197634e-06, "loss": 2.3436, "mean_token_accuracy": 0.5541093343250278, "step": 13433 }, { "epoch": 2.4905450500556174, "grad_norm": 8.359375, "learning_rate": 7.509454949944384e-06, "loss": 2.8157, "mean_token_accuracy": 0.49122030396930794, "step": 13434 }, { "epoch": 2.4907304412309976, "grad_norm": 9.578125, "learning_rate": 7.5092695587690036e-06, "loss": 3.1524, "mean_token_accuracy": 0.4540645879732739, "step": 13435 }, { "epoch": 2.4909158324063774, "grad_norm": 7.84765625, "learning_rate": 7.509084167593623e-06, "loss": 2.2787, "mean_token_accuracy": 0.5460733644270768, "step": 13436 }, { "epoch": 2.4911012235817576, "grad_norm": 6.36328125, "learning_rate": 7.508898776418243e-06, "loss": 2.8285, "mean_token_accuracy": 0.48437681496108725, "step": 13437 }, { "epoch": 2.4912866147571378, "grad_norm": 7.26171875, "learning_rate": 7.5087133852428625e-06, "loss": 2.4766, "mean_token_accuracy": 0.4917558886509636, "step": 13438 }, { "epoch": 2.4914720059325175, "grad_norm": 6.91015625, "learning_rate": 7.508527994067483e-06, "loss": 3.4809, "mean_token_accuracy": 0.43765529704088546, "step": 13439 }, { "epoch": 2.4916573971078977, "grad_norm": 11.96875, "learning_rate": 7.508342602892103e-06, "loss": 2.8066, "mean_token_accuracy": 0.46888639481232075, "step": 13440 }, { "epoch": 2.491842788283278, "grad_norm": 12.6796875, "learning_rate": 7.508157211716723e-06, "loss": 2.9643, "mean_token_accuracy": 0.4918774074694356, "step": 13441 }, { "epoch": 2.4920281794586576, "grad_norm": 11.3515625, "learning_rate": 7.507971820541344e-06, "loss": 2.6283, "mean_token_accuracy": 0.49481441048034935, "step": 13442 }, { "epoch": 2.492213570634038, "grad_norm": 7.2109375, "learning_rate": 7.507786429365963e-06, "loss": 2.6463, "mean_token_accuracy": 0.48945425784339575, "step": 13443 }, { "epoch": 2.492398961809418, "grad_norm": 7.68359375, "learning_rate": 7.507601038190583e-06, "loss": 2.6996, "mean_token_accuracy": 0.5244840145690004, "step": 13444 }, { "epoch": 2.4925843529847977, "grad_norm": 8.921875, "learning_rate": 7.5074156470152026e-06, "loss": 3.3345, "mean_token_accuracy": 0.4383633732403631, "step": 13445 }, { "epoch": 2.492769744160178, "grad_norm": 12.5390625, "learning_rate": 7.507230255839822e-06, "loss": 2.6583, "mean_token_accuracy": 0.48631717771183647, "step": 13446 }, { "epoch": 2.492955135335558, "grad_norm": 7.08203125, "learning_rate": 7.507044864664442e-06, "loss": 2.8589, "mean_token_accuracy": 0.48365101503176816, "step": 13447 }, { "epoch": 2.4931405265109383, "grad_norm": 8.703125, "learning_rate": 7.506859473489062e-06, "loss": 2.7982, "mean_token_accuracy": 0.5099128867527786, "step": 13448 }, { "epoch": 2.493325917686318, "grad_norm": 7.84765625, "learning_rate": 7.506674082313683e-06, "loss": 3.0923, "mean_token_accuracy": 0.45694704049844237, "step": 13449 }, { "epoch": 2.4935113088616983, "grad_norm": 7.53125, "learning_rate": 7.5064886911383025e-06, "loss": 2.6751, "mean_token_accuracy": 0.5052465143021417, "step": 13450 }, { "epoch": 2.493696700037078, "grad_norm": 8.1484375, "learning_rate": 7.506303299962923e-06, "loss": 3.4327, "mean_token_accuracy": 0.4412366691802219, "step": 13451 }, { "epoch": 2.493882091212458, "grad_norm": 10.5234375, "learning_rate": 7.506117908787543e-06, "loss": 3.5384, "mean_token_accuracy": 0.45236631837738167, "step": 13452 }, { "epoch": 2.4940674823878384, "grad_norm": 9.2421875, "learning_rate": 7.505932517612162e-06, "loss": 3.0827, "mean_token_accuracy": 0.4602706832569308, "step": 13453 }, { "epoch": 2.4942528735632186, "grad_norm": 7.7578125, "learning_rate": 7.505747126436782e-06, "loss": 2.8087, "mean_token_accuracy": 0.47281921618204803, "step": 13454 }, { "epoch": 2.4944382647385983, "grad_norm": 7.90234375, "learning_rate": 7.5055617352614015e-06, "loss": 2.3976, "mean_token_accuracy": 0.508042416299297, "step": 13455 }, { "epoch": 2.4946236559139785, "grad_norm": 8.015625, "learning_rate": 7.505376344086022e-06, "loss": 2.6759, "mean_token_accuracy": 0.49007849978194507, "step": 13456 }, { "epoch": 2.4948090470893587, "grad_norm": 7.890625, "learning_rate": 7.5051909529106425e-06, "loss": 2.8726, "mean_token_accuracy": 0.47721644378405154, "step": 13457 }, { "epoch": 2.4949944382647384, "grad_norm": 8.9609375, "learning_rate": 7.505005561735262e-06, "loss": 3.6097, "mean_token_accuracy": 0.4575740318906606, "step": 13458 }, { "epoch": 2.4951798294401186, "grad_norm": 9.4453125, "learning_rate": 7.504820170559882e-06, "loss": 3.3394, "mean_token_accuracy": 0.46460490849765923, "step": 13459 }, { "epoch": 2.495365220615499, "grad_norm": 9.359375, "learning_rate": 7.504634779384502e-06, "loss": 3.1626, "mean_token_accuracy": 0.4410997866793079, "step": 13460 }, { "epoch": 2.495550611790879, "grad_norm": 6.77734375, "learning_rate": 7.504449388209122e-06, "loss": 2.5744, "mean_token_accuracy": 0.4923963698798136, "step": 13461 }, { "epoch": 2.4957360029662587, "grad_norm": 7.26953125, "learning_rate": 7.504263997033742e-06, "loss": 3.1093, "mean_token_accuracy": 0.47325968788357003, "step": 13462 }, { "epoch": 2.495921394141639, "grad_norm": 9.1640625, "learning_rate": 7.504078605858361e-06, "loss": 3.992, "mean_token_accuracy": 0.40954167783436074, "step": 13463 }, { "epoch": 2.4961067853170187, "grad_norm": 10.2734375, "learning_rate": 7.503893214682981e-06, "loss": 2.299, "mean_token_accuracy": 0.5656646473882729, "step": 13464 }, { "epoch": 2.496292176492399, "grad_norm": 7.4609375, "learning_rate": 7.503707823507602e-06, "loss": 3.6524, "mean_token_accuracy": 0.4281974080377013, "step": 13465 }, { "epoch": 2.496477567667779, "grad_norm": 6.921875, "learning_rate": 7.503522432332222e-06, "loss": 2.5368, "mean_token_accuracy": 0.4874261371444277, "step": 13466 }, { "epoch": 2.4966629588431593, "grad_norm": 7.0859375, "learning_rate": 7.5033370411568415e-06, "loss": 3.6824, "mean_token_accuracy": 0.4521659468655935, "step": 13467 }, { "epoch": 2.496848350018539, "grad_norm": 7.91015625, "learning_rate": 7.503151649981462e-06, "loss": 2.5876, "mean_token_accuracy": 0.5202537119792418, "step": 13468 }, { "epoch": 2.497033741193919, "grad_norm": 7.6328125, "learning_rate": 7.502966258806082e-06, "loss": 2.855, "mean_token_accuracy": 0.48551521818848553, "step": 13469 }, { "epoch": 2.4972191323692994, "grad_norm": 6.703125, "learning_rate": 7.502780867630701e-06, "loss": 3.0999, "mean_token_accuracy": 0.4549696027145483, "step": 13470 }, { "epoch": 2.497404523544679, "grad_norm": 7.15234375, "learning_rate": 7.502595476455321e-06, "loss": 3.0055, "mean_token_accuracy": 0.4769097437167265, "step": 13471 }, { "epoch": 2.4975899147200593, "grad_norm": 6.15234375, "learning_rate": 7.502410085279941e-06, "loss": 2.8982, "mean_token_accuracy": 0.4797608668576411, "step": 13472 }, { "epoch": 2.4977753058954395, "grad_norm": 6.90234375, "learning_rate": 7.502224694104562e-06, "loss": 2.7075, "mean_token_accuracy": 0.5038260658326248, "step": 13473 }, { "epoch": 2.4979606970708192, "grad_norm": 7.04296875, "learning_rate": 7.5020393029291816e-06, "loss": 2.1819, "mean_token_accuracy": 0.5451341518872215, "step": 13474 }, { "epoch": 2.4981460882461994, "grad_norm": 7.79296875, "learning_rate": 7.501853911753801e-06, "loss": 2.7286, "mean_token_accuracy": 0.5014287226161499, "step": 13475 }, { "epoch": 2.4983314794215796, "grad_norm": 7.00390625, "learning_rate": 7.501668520578421e-06, "loss": 2.8358, "mean_token_accuracy": 0.4750593824228028, "step": 13476 }, { "epoch": 2.4985168705969594, "grad_norm": 7.88671875, "learning_rate": 7.501483129403041e-06, "loss": 2.7956, "mean_token_accuracy": 0.5177061634603602, "step": 13477 }, { "epoch": 2.4987022617723396, "grad_norm": 7.48046875, "learning_rate": 7.501297738227661e-06, "loss": 3.5313, "mean_token_accuracy": 0.4282934131736527, "step": 13478 }, { "epoch": 2.4988876529477198, "grad_norm": 5.89453125, "learning_rate": 7.501112347052281e-06, "loss": 2.8049, "mean_token_accuracy": 0.48816316620128686, "step": 13479 }, { "epoch": 2.4990730441231, "grad_norm": 7.796875, "learning_rate": 7.5009269558769e-06, "loss": 2.7458, "mean_token_accuracy": 0.4665671323450997, "step": 13480 }, { "epoch": 2.4992584352984797, "grad_norm": 10.125, "learning_rate": 7.500741564701522e-06, "loss": 2.8412, "mean_token_accuracy": 0.4630314232902033, "step": 13481 }, { "epoch": 2.49944382647386, "grad_norm": 8.7421875, "learning_rate": 7.500556173526141e-06, "loss": 2.8524, "mean_token_accuracy": 0.4822684194435952, "step": 13482 }, { "epoch": 2.49962921764924, "grad_norm": 7.3671875, "learning_rate": 7.500370782350761e-06, "loss": 2.9411, "mean_token_accuracy": 0.5023119605425401, "step": 13483 }, { "epoch": 2.49981460882462, "grad_norm": 8.2890625, "learning_rate": 7.5001853911753805e-06, "loss": 2.4033, "mean_token_accuracy": 0.5398520953163517, "step": 13484 }, { "epoch": 2.5, "grad_norm": 6.35546875, "learning_rate": 7.500000000000001e-06, "loss": 2.9542, "mean_token_accuracy": 0.46430566417160096, "step": 13485 }, { "epoch": 2.50018539117538, "grad_norm": 7.25, "learning_rate": 7.499814608824621e-06, "loss": 2.5042, "mean_token_accuracy": 0.5147773545251544, "step": 13486 }, { "epoch": 2.5003707823507604, "grad_norm": 7.203125, "learning_rate": 7.49962921764924e-06, "loss": 3.3609, "mean_token_accuracy": 0.4247676463200201, "step": 13487 }, { "epoch": 2.50055617352614, "grad_norm": 7.76171875, "learning_rate": 7.49944382647386e-06, "loss": 2.2649, "mean_token_accuracy": 0.5556672891825887, "step": 13488 }, { "epoch": 2.5007415647015203, "grad_norm": 8.7265625, "learning_rate": 7.49925843529848e-06, "loss": 2.7262, "mean_token_accuracy": 0.5074726745482936, "step": 13489 }, { "epoch": 2.5009269558769, "grad_norm": 7.9375, "learning_rate": 7.499073044123101e-06, "loss": 2.9391, "mean_token_accuracy": 0.46142414860681114, "step": 13490 }, { "epoch": 2.5011123470522802, "grad_norm": 7.125, "learning_rate": 7.498887652947721e-06, "loss": 2.7601, "mean_token_accuracy": 0.4780564263322884, "step": 13491 }, { "epoch": 2.5012977382276604, "grad_norm": 7.078125, "learning_rate": 7.49870226177234e-06, "loss": 2.6352, "mean_token_accuracy": 0.5016646223935518, "step": 13492 }, { "epoch": 2.5014831294030406, "grad_norm": 8.5234375, "learning_rate": 7.49851687059696e-06, "loss": 3.3239, "mean_token_accuracy": 0.4602649006622517, "step": 13493 }, { "epoch": 2.5016685205784204, "grad_norm": 8.3359375, "learning_rate": 7.49833147942158e-06, "loss": 2.7484, "mean_token_accuracy": 0.47357818069115143, "step": 13494 }, { "epoch": 2.5018539117538006, "grad_norm": 7.1484375, "learning_rate": 7.4981460882462e-06, "loss": 2.6622, "mean_token_accuracy": 0.4862629660779366, "step": 13495 }, { "epoch": 2.5020393029291803, "grad_norm": 8.234375, "learning_rate": 7.49796069707082e-06, "loss": 3.0296, "mean_token_accuracy": 0.48395527806011324, "step": 13496 }, { "epoch": 2.5022246941045605, "grad_norm": 12.6484375, "learning_rate": 7.497775305895439e-06, "loss": 3.598, "mean_token_accuracy": 0.43443620670576516, "step": 13497 }, { "epoch": 2.5024100852799407, "grad_norm": 10.5625, "learning_rate": 7.497589914720061e-06, "loss": 2.4522, "mean_token_accuracy": 0.502421457841798, "step": 13498 }, { "epoch": 2.502595476455321, "grad_norm": 7.3828125, "learning_rate": 7.49740452354468e-06, "loss": 2.7098, "mean_token_accuracy": 0.4844728956687551, "step": 13499 }, { "epoch": 2.5027808676307006, "grad_norm": 9.28125, "learning_rate": 7.4972191323693e-06, "loss": 3.0257, "mean_token_accuracy": 0.4826392894220787, "step": 13500 }, { "epoch": 2.502966258806081, "grad_norm": 10.6328125, "learning_rate": 7.49703374119392e-06, "loss": 2.7512, "mean_token_accuracy": 0.49948418156808805, "step": 13501 }, { "epoch": 2.503151649981461, "grad_norm": 8.0625, "learning_rate": 7.496848350018539e-06, "loss": 3.6229, "mean_token_accuracy": 0.4290468986384266, "step": 13502 }, { "epoch": 2.5033370411568407, "grad_norm": 6.20703125, "learning_rate": 7.49666295884316e-06, "loss": 2.3191, "mean_token_accuracy": 0.5459044790893344, "step": 13503 }, { "epoch": 2.503522432332221, "grad_norm": 11.2109375, "learning_rate": 7.496477567667779e-06, "loss": 2.6314, "mean_token_accuracy": 0.47930372807017546, "step": 13504 }, { "epoch": 2.503707823507601, "grad_norm": 10.328125, "learning_rate": 7.496292176492399e-06, "loss": 2.5754, "mean_token_accuracy": 0.5025178592341023, "step": 13505 }, { "epoch": 2.5038932146829813, "grad_norm": 6.59765625, "learning_rate": 7.49610678531702e-06, "loss": 2.5662, "mean_token_accuracy": 0.5095532831001076, "step": 13506 }, { "epoch": 2.504078605858361, "grad_norm": 9.84375, "learning_rate": 7.49592139414164e-06, "loss": 2.5524, "mean_token_accuracy": 0.5043478260869565, "step": 13507 }, { "epoch": 2.5042639970337413, "grad_norm": 7.81640625, "learning_rate": 7.49573600296626e-06, "loss": 3.1508, "mean_token_accuracy": 0.4525327878714956, "step": 13508 }, { "epoch": 2.504449388209121, "grad_norm": 7.30078125, "learning_rate": 7.495550611790879e-06, "loss": 3.0043, "mean_token_accuracy": 0.45413533834586467, "step": 13509 }, { "epoch": 2.504634779384501, "grad_norm": 10.921875, "learning_rate": 7.495365220615499e-06, "loss": 2.8416, "mean_token_accuracy": 0.47270471464019853, "step": 13510 }, { "epoch": 2.5048201705598814, "grad_norm": 8.90625, "learning_rate": 7.495179829440119e-06, "loss": 2.9219, "mean_token_accuracy": 0.49427853753837564, "step": 13511 }, { "epoch": 2.5050055617352616, "grad_norm": 8.8671875, "learning_rate": 7.494994438264739e-06, "loss": 3.2972, "mean_token_accuracy": 0.47516422726748875, "step": 13512 }, { "epoch": 2.5051909529106413, "grad_norm": 8.4453125, "learning_rate": 7.494809047089359e-06, "loss": 3.1952, "mean_token_accuracy": 0.46358519529251235, "step": 13513 }, { "epoch": 2.5053763440860215, "grad_norm": 7.76171875, "learning_rate": 7.494623655913979e-06, "loss": 2.9123, "mean_token_accuracy": 0.46158714086824454, "step": 13514 }, { "epoch": 2.5055617352614017, "grad_norm": 9.1015625, "learning_rate": 7.4944382647386e-06, "loss": 3.9804, "mean_token_accuracy": 0.40692307692307694, "step": 13515 }, { "epoch": 2.5057471264367814, "grad_norm": 7.390625, "learning_rate": 7.494252873563219e-06, "loss": 2.8283, "mean_token_accuracy": 0.5033697768458889, "step": 13516 }, { "epoch": 2.5059325176121616, "grad_norm": 10.3984375, "learning_rate": 7.494067482387839e-06, "loss": 2.7655, "mean_token_accuracy": 0.4876608808596056, "step": 13517 }, { "epoch": 2.506117908787542, "grad_norm": 11.4375, "learning_rate": 7.493882091212459e-06, "loss": 4.025, "mean_token_accuracy": 0.41453926574276684, "step": 13518 }, { "epoch": 2.506303299962922, "grad_norm": 10.0234375, "learning_rate": 7.493696700037078e-06, "loss": 2.8974, "mean_token_accuracy": 0.47974772249474423, "step": 13519 }, { "epoch": 2.5064886911383017, "grad_norm": 6.4921875, "learning_rate": 7.493511308861699e-06, "loss": 2.9625, "mean_token_accuracy": 0.4745103686635945, "step": 13520 }, { "epoch": 2.506674082313682, "grad_norm": 7.87890625, "learning_rate": 7.493325917686318e-06, "loss": 2.9649, "mean_token_accuracy": 0.5141062376019396, "step": 13521 }, { "epoch": 2.5068594734890617, "grad_norm": 8.234375, "learning_rate": 7.493140526510939e-06, "loss": 3.3066, "mean_token_accuracy": 0.44746600741656367, "step": 13522 }, { "epoch": 2.507044864664442, "grad_norm": 6.9375, "learning_rate": 7.492955135335559e-06, "loss": 2.3291, "mean_token_accuracy": 0.5433839479392625, "step": 13523 }, { "epoch": 2.507230255839822, "grad_norm": 7.04296875, "learning_rate": 7.492769744160179e-06, "loss": 2.8499, "mean_token_accuracy": 0.45614495798319327, "step": 13524 }, { "epoch": 2.5074156470152023, "grad_norm": 8.3671875, "learning_rate": 7.492584352984799e-06, "loss": 2.9179, "mean_token_accuracy": 0.46381773108251684, "step": 13525 }, { "epoch": 2.507601038190582, "grad_norm": 6.2578125, "learning_rate": 7.492398961809418e-06, "loss": 2.4729, "mean_token_accuracy": 0.518233259981138, "step": 13526 }, { "epoch": 2.507786429365962, "grad_norm": 9.1640625, "learning_rate": 7.492213570634038e-06, "loss": 3.3009, "mean_token_accuracy": 0.47492447129909365, "step": 13527 }, { "epoch": 2.5079718205413424, "grad_norm": 7.10546875, "learning_rate": 7.492028179458658e-06, "loss": 2.9424, "mean_token_accuracy": 0.47959183673469385, "step": 13528 }, { "epoch": 2.508157211716722, "grad_norm": 6.6875, "learning_rate": 7.491842788283278e-06, "loss": 2.7751, "mean_token_accuracy": 0.5188515081206496, "step": 13529 }, { "epoch": 2.5083426028921023, "grad_norm": 6.9375, "learning_rate": 7.491657397107899e-06, "loss": 2.7138, "mean_token_accuracy": 0.49895148888578217, "step": 13530 }, { "epoch": 2.5085279940674825, "grad_norm": 7.18359375, "learning_rate": 7.491472005932518e-06, "loss": 2.7392, "mean_token_accuracy": 0.47947650208209397, "step": 13531 }, { "epoch": 2.5087133852428627, "grad_norm": 7.09765625, "learning_rate": 7.491286614757139e-06, "loss": 2.7898, "mean_token_accuracy": 0.471240755957272, "step": 13532 }, { "epoch": 2.5088987764182424, "grad_norm": 8.1953125, "learning_rate": 7.491101223581758e-06, "loss": 2.9585, "mean_token_accuracy": 0.46684894053315107, "step": 13533 }, { "epoch": 2.5090841675936226, "grad_norm": 6.56640625, "learning_rate": 7.490915832406378e-06, "loss": 2.7227, "mean_token_accuracy": 0.4837103428652724, "step": 13534 }, { "epoch": 2.5092695587690024, "grad_norm": 8.375, "learning_rate": 7.490730441230998e-06, "loss": 3.0762, "mean_token_accuracy": 0.473107228502912, "step": 13535 }, { "epoch": 2.5094549499443826, "grad_norm": 11.4296875, "learning_rate": 7.490545050055617e-06, "loss": 3.2919, "mean_token_accuracy": 0.4486021505376344, "step": 13536 }, { "epoch": 2.5096403411197628, "grad_norm": 11.40625, "learning_rate": 7.490359658880238e-06, "loss": 2.6599, "mean_token_accuracy": 0.49650880388585306, "step": 13537 }, { "epoch": 2.509825732295143, "grad_norm": 8.6796875, "learning_rate": 7.490174267704858e-06, "loss": 2.8254, "mean_token_accuracy": 0.4617489737529544, "step": 13538 }, { "epoch": 2.5100111234705227, "grad_norm": 8.6328125, "learning_rate": 7.489988876529478e-06, "loss": 3.3309, "mean_token_accuracy": 0.46528035647976235, "step": 13539 }, { "epoch": 2.510196514645903, "grad_norm": 10.4921875, "learning_rate": 7.4898034853540976e-06, "loss": 2.5423, "mean_token_accuracy": 0.5006151574803149, "step": 13540 }, { "epoch": 2.510381905821283, "grad_norm": 7.609375, "learning_rate": 7.489618094178718e-06, "loss": 2.7209, "mean_token_accuracy": 0.5164339116941741, "step": 13541 }, { "epoch": 2.510567296996663, "grad_norm": 15.1875, "learning_rate": 7.489432703003338e-06, "loss": 2.6962, "mean_token_accuracy": 0.48496379035227694, "step": 13542 }, { "epoch": 2.510752688172043, "grad_norm": 12.921875, "learning_rate": 7.489247311827957e-06, "loss": 2.7829, "mean_token_accuracy": 0.4890592334494773, "step": 13543 }, { "epoch": 2.510938079347423, "grad_norm": 10.4453125, "learning_rate": 7.489061920652577e-06, "loss": 3.534, "mean_token_accuracy": 0.4190061763054464, "step": 13544 }, { "epoch": 2.5111234705228034, "grad_norm": 7.71484375, "learning_rate": 7.488876529477197e-06, "loss": 2.2997, "mean_token_accuracy": 0.507940640458214, "step": 13545 }, { "epoch": 2.511308861698183, "grad_norm": 12.6875, "learning_rate": 7.488691138301818e-06, "loss": 2.7636, "mean_token_accuracy": 0.5020363236103468, "step": 13546 }, { "epoch": 2.5114942528735633, "grad_norm": 10.1171875, "learning_rate": 7.488505747126438e-06, "loss": 2.8945, "mean_token_accuracy": 0.5111307031077805, "step": 13547 }, { "epoch": 2.511679644048943, "grad_norm": 8.7109375, "learning_rate": 7.488320355951057e-06, "loss": 3.3234, "mean_token_accuracy": 0.4332164058283864, "step": 13548 }, { "epoch": 2.5118650352243233, "grad_norm": 8.8203125, "learning_rate": 7.488134964775678e-06, "loss": 2.6633, "mean_token_accuracy": 0.4996640236527348, "step": 13549 }, { "epoch": 2.5120504263997034, "grad_norm": 8.875, "learning_rate": 7.487949573600297e-06, "loss": 3.1899, "mean_token_accuracy": 0.464041095890411, "step": 13550 }, { "epoch": 2.5122358175750836, "grad_norm": 6.98828125, "learning_rate": 7.487764182424917e-06, "loss": 3.1297, "mean_token_accuracy": 0.4735901509134233, "step": 13551 }, { "epoch": 2.5124212087504634, "grad_norm": 9.4921875, "learning_rate": 7.487578791249537e-06, "loss": 2.8181, "mean_token_accuracy": 0.46585160202360876, "step": 13552 }, { "epoch": 2.5126065999258436, "grad_norm": 7.85546875, "learning_rate": 7.487393400074156e-06, "loss": 3.0745, "mean_token_accuracy": 0.47018909899888767, "step": 13553 }, { "epoch": 2.5127919911012233, "grad_norm": 9.109375, "learning_rate": 7.487208008898778e-06, "loss": 2.9643, "mean_token_accuracy": 0.4624892703862661, "step": 13554 }, { "epoch": 2.5129773822766035, "grad_norm": 8.796875, "learning_rate": 7.487022617723397e-06, "loss": 2.6436, "mean_token_accuracy": 0.506547300908605, "step": 13555 }, { "epoch": 2.5131627734519837, "grad_norm": 11.1796875, "learning_rate": 7.486837226548017e-06, "loss": 2.9021, "mean_token_accuracy": 0.4730668983492615, "step": 13556 }, { "epoch": 2.513348164627364, "grad_norm": 8.890625, "learning_rate": 7.486651835372637e-06, "loss": 3.1534, "mean_token_accuracy": 0.4651646871556263, "step": 13557 }, { "epoch": 2.5135335558027436, "grad_norm": 7.82421875, "learning_rate": 7.486466444197257e-06, "loss": 3.5317, "mean_token_accuracy": 0.431175805539853, "step": 13558 }, { "epoch": 2.513718946978124, "grad_norm": 9.4609375, "learning_rate": 7.486281053021877e-06, "loss": 2.5818, "mean_token_accuracy": 0.4853761393007754, "step": 13559 }, { "epoch": 2.513904338153504, "grad_norm": 7.0859375, "learning_rate": 7.486095661846496e-06, "loss": 3.1095, "mean_token_accuracy": 0.46477641768673494, "step": 13560 }, { "epoch": 2.5140897293288837, "grad_norm": 7.61328125, "learning_rate": 7.485910270671116e-06, "loss": 2.6238, "mean_token_accuracy": 0.504364694471387, "step": 13561 }, { "epoch": 2.514275120504264, "grad_norm": 8.28125, "learning_rate": 7.485724879495737e-06, "loss": 3.1256, "mean_token_accuracy": 0.43881317433276545, "step": 13562 }, { "epoch": 2.514460511679644, "grad_norm": 6.703125, "learning_rate": 7.485539488320357e-06, "loss": 2.6103, "mean_token_accuracy": 0.5021589450344264, "step": 13563 }, { "epoch": 2.5146459028550243, "grad_norm": 12.484375, "learning_rate": 7.485354097144977e-06, "loss": 2.314, "mean_token_accuracy": 0.5506278386321133, "step": 13564 }, { "epoch": 2.514831294030404, "grad_norm": 9.109375, "learning_rate": 7.485168705969596e-06, "loss": 3.0727, "mean_token_accuracy": 0.43622363575493805, "step": 13565 }, { "epoch": 2.5150166852057843, "grad_norm": 7.66015625, "learning_rate": 7.484983314794216e-06, "loss": 3.4117, "mean_token_accuracy": 0.44035006909258406, "step": 13566 }, { "epoch": 2.515202076381164, "grad_norm": 9.4375, "learning_rate": 7.4847979236188364e-06, "loss": 2.9408, "mean_token_accuracy": 0.4542483660130719, "step": 13567 }, { "epoch": 2.515387467556544, "grad_norm": 8.640625, "learning_rate": 7.484612532443456e-06, "loss": 3.6595, "mean_token_accuracy": 0.43819530284301605, "step": 13568 }, { "epoch": 2.5155728587319244, "grad_norm": 7.28515625, "learning_rate": 7.484427141268076e-06, "loss": 3.0287, "mean_token_accuracy": 0.4590078328981723, "step": 13569 }, { "epoch": 2.5157582499073046, "grad_norm": 8.78125, "learning_rate": 7.484241750092697e-06, "loss": 2.7276, "mean_token_accuracy": 0.5053715308863026, "step": 13570 }, { "epoch": 2.5159436410826843, "grad_norm": 8.0625, "learning_rate": 7.484056358917317e-06, "loss": 2.8367, "mean_token_accuracy": 0.5202349869451697, "step": 13571 }, { "epoch": 2.5161290322580645, "grad_norm": 8.46875, "learning_rate": 7.483870967741936e-06, "loss": 3.4563, "mean_token_accuracy": 0.484456322970882, "step": 13572 }, { "epoch": 2.5163144234334447, "grad_norm": 12.453125, "learning_rate": 7.483685576566556e-06, "loss": 3.3911, "mean_token_accuracy": 0.4842529296875, "step": 13573 }, { "epoch": 2.5164998146088244, "grad_norm": 9.921875, "learning_rate": 7.483500185391176e-06, "loss": 2.2968, "mean_token_accuracy": 0.5326054883446444, "step": 13574 }, { "epoch": 2.5166852057842046, "grad_norm": 7.03515625, "learning_rate": 7.483314794215796e-06, "loss": 2.2568, "mean_token_accuracy": 0.5827769605069594, "step": 13575 }, { "epoch": 2.516870596959585, "grad_norm": 8.828125, "learning_rate": 7.483129403040416e-06, "loss": 2.6532, "mean_token_accuracy": 0.4973130951258603, "step": 13576 }, { "epoch": 2.517055988134965, "grad_norm": 9.4296875, "learning_rate": 7.482944011865035e-06, "loss": 2.4652, "mean_token_accuracy": 0.5212249208025344, "step": 13577 }, { "epoch": 2.5172413793103448, "grad_norm": 13.390625, "learning_rate": 7.482758620689656e-06, "loss": 2.8524, "mean_token_accuracy": 0.4833174451858913, "step": 13578 }, { "epoch": 2.517426770485725, "grad_norm": 7.0078125, "learning_rate": 7.482573229514276e-06, "loss": 2.688, "mean_token_accuracy": 0.4966638359820454, "step": 13579 }, { "epoch": 2.5176121616611047, "grad_norm": 10.7421875, "learning_rate": 7.482387838338896e-06, "loss": 2.8541, "mean_token_accuracy": 0.4936293182713779, "step": 13580 }, { "epoch": 2.517797552836485, "grad_norm": 12.7109375, "learning_rate": 7.482202447163516e-06, "loss": 3.1062, "mean_token_accuracy": 0.4555104589447393, "step": 13581 }, { "epoch": 2.517982944011865, "grad_norm": 8.828125, "learning_rate": 7.482017055988135e-06, "loss": 4.0784, "mean_token_accuracy": 0.3941176470588235, "step": 13582 }, { "epoch": 2.5181683351872453, "grad_norm": 7.14453125, "learning_rate": 7.481831664812755e-06, "loss": 2.1591, "mean_token_accuracy": 0.5567731713757592, "step": 13583 }, { "epoch": 2.518353726362625, "grad_norm": 7.5703125, "learning_rate": 7.4816462736373755e-06, "loss": 2.9052, "mean_token_accuracy": 0.47465498938428874, "step": 13584 }, { "epoch": 2.518539117538005, "grad_norm": 7.0703125, "learning_rate": 7.481460882461995e-06, "loss": 2.642, "mean_token_accuracy": 0.4950006328312872, "step": 13585 }, { "epoch": 2.5187245087133854, "grad_norm": 10.109375, "learning_rate": 7.481275491286616e-06, "loss": 2.6972, "mean_token_accuracy": 0.4760025597269625, "step": 13586 }, { "epoch": 2.518909899888765, "grad_norm": 7.82421875, "learning_rate": 7.481090100111236e-06, "loss": 3.2981, "mean_token_accuracy": 0.4650695258837326, "step": 13587 }, { "epoch": 2.5190952910641453, "grad_norm": 8.078125, "learning_rate": 7.480904708935856e-06, "loss": 2.8837, "mean_token_accuracy": 0.48855258051998446, "step": 13588 }, { "epoch": 2.5192806822395255, "grad_norm": 8.0625, "learning_rate": 7.480719317760475e-06, "loss": 2.3827, "mean_token_accuracy": 0.5568291505791506, "step": 13589 }, { "epoch": 2.5194660734149057, "grad_norm": 6.84375, "learning_rate": 7.480533926585095e-06, "loss": 2.1944, "mean_token_accuracy": 0.533155210104445, "step": 13590 }, { "epoch": 2.5196514645902854, "grad_norm": 6.6328125, "learning_rate": 7.480348535409715e-06, "loss": 2.4606, "mean_token_accuracy": 0.5226680562793121, "step": 13591 }, { "epoch": 2.5198368557656656, "grad_norm": 6.53125, "learning_rate": 7.480163144234335e-06, "loss": 2.8942, "mean_token_accuracy": 0.4800244910454615, "step": 13592 }, { "epoch": 2.5200222469410454, "grad_norm": 6.52734375, "learning_rate": 7.479977753058955e-06, "loss": 2.2138, "mean_token_accuracy": 0.5512655512655512, "step": 13593 }, { "epoch": 2.5202076381164256, "grad_norm": 8.484375, "learning_rate": 7.479792361883575e-06, "loss": 3.1283, "mean_token_accuracy": 0.4785069235400361, "step": 13594 }, { "epoch": 2.5203930292918058, "grad_norm": 6.54296875, "learning_rate": 7.479606970708195e-06, "loss": 3.3479, "mean_token_accuracy": 0.46871671991480296, "step": 13595 }, { "epoch": 2.520578420467186, "grad_norm": 6.89453125, "learning_rate": 7.4794215795328154e-06, "loss": 3.3745, "mean_token_accuracy": 0.4316820276497696, "step": 13596 }, { "epoch": 2.5207638116425657, "grad_norm": 6.68359375, "learning_rate": 7.479236188357435e-06, "loss": 2.654, "mean_token_accuracy": 0.4759564293304995, "step": 13597 }, { "epoch": 2.520949202817946, "grad_norm": 7.44140625, "learning_rate": 7.479050797182055e-06, "loss": 2.4964, "mean_token_accuracy": 0.4986636657757772, "step": 13598 }, { "epoch": 2.521134593993326, "grad_norm": 8.2265625, "learning_rate": 7.478865406006674e-06, "loss": 3.299, "mean_token_accuracy": 0.44449463579898363, "step": 13599 }, { "epoch": 2.521319985168706, "grad_norm": 6.3515625, "learning_rate": 7.478680014831294e-06, "loss": 2.7694, "mean_token_accuracy": 0.5060827250608273, "step": 13600 }, { "epoch": 2.521505376344086, "grad_norm": 7.2421875, "learning_rate": 7.4784946236559145e-06, "loss": 2.6156, "mean_token_accuracy": 0.48412103025756437, "step": 13601 }, { "epoch": 2.521690767519466, "grad_norm": 7.66796875, "learning_rate": 7.478309232480535e-06, "loss": 3.1582, "mean_token_accuracy": 0.4842410655413425, "step": 13602 }, { "epoch": 2.5218761586948464, "grad_norm": 9.375, "learning_rate": 7.478123841305155e-06, "loss": 3.2825, "mean_token_accuracy": 0.4431831904359377, "step": 13603 }, { "epoch": 2.522061549870226, "grad_norm": 7.08984375, "learning_rate": 7.477938450129775e-06, "loss": 2.9868, "mean_token_accuracy": 0.4723636363636364, "step": 13604 }, { "epoch": 2.5222469410456063, "grad_norm": 8.015625, "learning_rate": 7.477753058954395e-06, "loss": 3.2567, "mean_token_accuracy": 0.4584993002476047, "step": 13605 }, { "epoch": 2.522432332220986, "grad_norm": 9.8984375, "learning_rate": 7.477567667779014e-06, "loss": 3.3552, "mean_token_accuracy": 0.44371822803195354, "step": 13606 }, { "epoch": 2.5226177233963663, "grad_norm": 10.2890625, "learning_rate": 7.477382276603634e-06, "loss": 2.6186, "mean_token_accuracy": 0.48052095130237826, "step": 13607 }, { "epoch": 2.5228031145717464, "grad_norm": 7.89453125, "learning_rate": 7.477196885428254e-06, "loss": 3.0229, "mean_token_accuracy": 0.45211667527103766, "step": 13608 }, { "epoch": 2.5229885057471266, "grad_norm": 10.4921875, "learning_rate": 7.477011494252873e-06, "loss": 3.7733, "mean_token_accuracy": 0.4479822719850711, "step": 13609 }, { "epoch": 2.5231738969225064, "grad_norm": 9.046875, "learning_rate": 7.476826103077494e-06, "loss": 2.4991, "mean_token_accuracy": 0.49696969696969695, "step": 13610 }, { "epoch": 2.5233592880978866, "grad_norm": 7.53515625, "learning_rate": 7.476640711902114e-06, "loss": 3.2552, "mean_token_accuracy": 0.4511055052678439, "step": 13611 }, { "epoch": 2.5235446792732668, "grad_norm": 7.04296875, "learning_rate": 7.476455320726734e-06, "loss": 2.6726, "mean_token_accuracy": 0.5106801573917932, "step": 13612 }, { "epoch": 2.5237300704486465, "grad_norm": 8.6640625, "learning_rate": 7.4762699295513545e-06, "loss": 2.9615, "mean_token_accuracy": 0.48330182734719596, "step": 13613 }, { "epoch": 2.5239154616240267, "grad_norm": 6.16796875, "learning_rate": 7.476084538375974e-06, "loss": 2.8263, "mean_token_accuracy": 0.484375, "step": 13614 }, { "epoch": 2.524100852799407, "grad_norm": 8.1328125, "learning_rate": 7.475899147200594e-06, "loss": 2.8008, "mean_token_accuracy": 0.4865752642102257, "step": 13615 }, { "epoch": 2.524286243974787, "grad_norm": 8.03125, "learning_rate": 7.475713756025213e-06, "loss": 2.7067, "mean_token_accuracy": 0.48707671043538353, "step": 13616 }, { "epoch": 2.524471635150167, "grad_norm": 8.515625, "learning_rate": 7.475528364849833e-06, "loss": 3.4345, "mean_token_accuracy": 0.4487037719621575, "step": 13617 }, { "epoch": 2.524657026325547, "grad_norm": 8.1171875, "learning_rate": 7.4753429736744535e-06, "loss": 2.8192, "mean_token_accuracy": 0.4895811184350415, "step": 13618 }, { "epoch": 2.5248424175009268, "grad_norm": 8.296875, "learning_rate": 7.475157582499074e-06, "loss": 2.9471, "mean_token_accuracy": 0.4761296503273192, "step": 13619 }, { "epoch": 2.525027808676307, "grad_norm": 7.828125, "learning_rate": 7.474972191323694e-06, "loss": 2.7374, "mean_token_accuracy": 0.47462259239979177, "step": 13620 }, { "epoch": 2.525213199851687, "grad_norm": 7.05078125, "learning_rate": 7.474786800148313e-06, "loss": 2.5386, "mean_token_accuracy": 0.5518618734699137, "step": 13621 }, { "epoch": 2.5253985910270673, "grad_norm": 7.52734375, "learning_rate": 7.474601408972934e-06, "loss": 2.7704, "mean_token_accuracy": 0.5014114326040932, "step": 13622 }, { "epoch": 2.525583982202447, "grad_norm": 8.2265625, "learning_rate": 7.4744160177975535e-06, "loss": 2.902, "mean_token_accuracy": 0.47486755394753843, "step": 13623 }, { "epoch": 2.5257693733778273, "grad_norm": 7.421875, "learning_rate": 7.474230626622173e-06, "loss": 2.9414, "mean_token_accuracy": 0.5028134143596669, "step": 13624 }, { "epoch": 2.525954764553207, "grad_norm": 6.61328125, "learning_rate": 7.474045235446793e-06, "loss": 2.8455, "mean_token_accuracy": 0.483059624204764, "step": 13625 }, { "epoch": 2.526140155728587, "grad_norm": 7.234375, "learning_rate": 7.473859844271412e-06, "loss": 2.813, "mean_token_accuracy": 0.5222148978246539, "step": 13626 }, { "epoch": 2.5263255469039674, "grad_norm": 6.35546875, "learning_rate": 7.473674453096034e-06, "loss": 2.5901, "mean_token_accuracy": 0.4913294797687861, "step": 13627 }, { "epoch": 2.5265109380793476, "grad_norm": 6.2578125, "learning_rate": 7.473489061920653e-06, "loss": 2.4609, "mean_token_accuracy": 0.521511017838405, "step": 13628 }, { "epoch": 2.5266963292547273, "grad_norm": 6.33984375, "learning_rate": 7.473303670745273e-06, "loss": 2.7149, "mean_token_accuracy": 0.5002018570851837, "step": 13629 }, { "epoch": 2.5268817204301075, "grad_norm": 8.171875, "learning_rate": 7.4731182795698935e-06, "loss": 3.0819, "mean_token_accuracy": 0.4727933356965615, "step": 13630 }, { "epoch": 2.5270671116054877, "grad_norm": 7.50390625, "learning_rate": 7.472932888394513e-06, "loss": 3.085, "mean_token_accuracy": 0.4551306323362363, "step": 13631 }, { "epoch": 2.5272525027808674, "grad_norm": 7.0703125, "learning_rate": 7.472747497219133e-06, "loss": 2.9557, "mean_token_accuracy": 0.49679724069967973, "step": 13632 }, { "epoch": 2.5274378939562476, "grad_norm": 8.3203125, "learning_rate": 7.4725621060437524e-06, "loss": 2.9449, "mean_token_accuracy": 0.48145001416029454, "step": 13633 }, { "epoch": 2.527623285131628, "grad_norm": 7.49609375, "learning_rate": 7.472376714868372e-06, "loss": 2.3127, "mean_token_accuracy": 0.5291616846020991, "step": 13634 }, { "epoch": 2.527808676307008, "grad_norm": 6.54296875, "learning_rate": 7.472191323692993e-06, "loss": 3.3442, "mean_token_accuracy": 0.41216901408450707, "step": 13635 }, { "epoch": 2.5279940674823878, "grad_norm": 7.44140625, "learning_rate": 7.472005932517613e-06, "loss": 3.656, "mean_token_accuracy": 0.4393581291205374, "step": 13636 }, { "epoch": 2.528179458657768, "grad_norm": 8.875, "learning_rate": 7.471820541342233e-06, "loss": 2.8291, "mean_token_accuracy": 0.4743619489559165, "step": 13637 }, { "epoch": 2.5283648498331477, "grad_norm": 7.35546875, "learning_rate": 7.471635150166852e-06, "loss": 3.6574, "mean_token_accuracy": 0.44239860661856184, "step": 13638 }, { "epoch": 2.528550241008528, "grad_norm": 7.453125, "learning_rate": 7.471449758991473e-06, "loss": 2.515, "mean_token_accuracy": 0.5135363790186125, "step": 13639 }, { "epoch": 2.528735632183908, "grad_norm": 9.484375, "learning_rate": 7.4712643678160925e-06, "loss": 2.6836, "mean_token_accuracy": 0.494946201499837, "step": 13640 }, { "epoch": 2.5289210233592883, "grad_norm": 8.6875, "learning_rate": 7.471078976640712e-06, "loss": 3.1493, "mean_token_accuracy": 0.46058264592252274, "step": 13641 }, { "epoch": 2.529106414534668, "grad_norm": 7.421875, "learning_rate": 7.470893585465332e-06, "loss": 3.1181, "mean_token_accuracy": 0.46413338503295853, "step": 13642 }, { "epoch": 2.529291805710048, "grad_norm": 6.921875, "learning_rate": 7.470708194289953e-06, "loss": 2.945, "mean_token_accuracy": 0.47437689261588634, "step": 13643 }, { "epoch": 2.5294771968854284, "grad_norm": 6.89453125, "learning_rate": 7.470522803114573e-06, "loss": 2.9656, "mean_token_accuracy": 0.46724300959070647, "step": 13644 }, { "epoch": 2.529662588060808, "grad_norm": 7.42578125, "learning_rate": 7.470337411939192e-06, "loss": 2.8079, "mean_token_accuracy": 0.47449862567443757, "step": 13645 }, { "epoch": 2.5298479792361883, "grad_norm": 5.74609375, "learning_rate": 7.470152020763812e-06, "loss": 2.6052, "mean_token_accuracy": 0.5024642681123707, "step": 13646 }, { "epoch": 2.5300333704115685, "grad_norm": 7.03125, "learning_rate": 7.469966629588432e-06, "loss": 3.2737, "mean_token_accuracy": 0.4512279888785913, "step": 13647 }, { "epoch": 2.5302187615869487, "grad_norm": 6.00390625, "learning_rate": 7.469781238413052e-06, "loss": 2.9769, "mean_token_accuracy": 0.45360384410037374, "step": 13648 }, { "epoch": 2.5304041527623284, "grad_norm": 7.1953125, "learning_rate": 7.469595847237672e-06, "loss": 3.7534, "mean_token_accuracy": 0.42343559940532505, "step": 13649 }, { "epoch": 2.5305895439377086, "grad_norm": 7.171875, "learning_rate": 7.4694104560622915e-06, "loss": 3.1812, "mean_token_accuracy": 0.46565193965517243, "step": 13650 }, { "epoch": 2.5307749351130884, "grad_norm": 8.9375, "learning_rate": 7.469225064886913e-06, "loss": 3.4261, "mean_token_accuracy": 0.41765270227054685, "step": 13651 }, { "epoch": 2.5309603262884686, "grad_norm": 8.5546875, "learning_rate": 7.4690396737115325e-06, "loss": 2.8297, "mean_token_accuracy": 0.4800173761946134, "step": 13652 }, { "epoch": 2.5311457174638488, "grad_norm": 7.29296875, "learning_rate": 7.468854282536152e-06, "loss": 3.5447, "mean_token_accuracy": 0.4408883826879271, "step": 13653 }, { "epoch": 2.531331108639229, "grad_norm": 7.03515625, "learning_rate": 7.468668891360772e-06, "loss": 2.5639, "mean_token_accuracy": 0.5211361366914457, "step": 13654 }, { "epoch": 2.5315164998146087, "grad_norm": 6.5390625, "learning_rate": 7.468483500185391e-06, "loss": 2.6471, "mean_token_accuracy": 0.4964811963932263, "step": 13655 }, { "epoch": 2.531701890989989, "grad_norm": 6.33984375, "learning_rate": 7.468298109010012e-06, "loss": 3.1685, "mean_token_accuracy": 0.4529842680437132, "step": 13656 }, { "epoch": 2.531887282165369, "grad_norm": 7.8046875, "learning_rate": 7.4681127178346315e-06, "loss": 3.0455, "mean_token_accuracy": 0.46415525114155254, "step": 13657 }, { "epoch": 2.532072673340749, "grad_norm": 6.80859375, "learning_rate": 7.467927326659251e-06, "loss": 2.5499, "mean_token_accuracy": 0.5102534890344631, "step": 13658 }, { "epoch": 2.532258064516129, "grad_norm": 6.6796875, "learning_rate": 7.467741935483872e-06, "loss": 2.954, "mean_token_accuracy": 0.4601291485207989, "step": 13659 }, { "epoch": 2.532443455691509, "grad_norm": 8.7109375, "learning_rate": 7.467556544308492e-06, "loss": 3.0062, "mean_token_accuracy": 0.5111967966780365, "step": 13660 }, { "epoch": 2.5326288468668894, "grad_norm": 12.3515625, "learning_rate": 7.467371153133112e-06, "loss": 2.6907, "mean_token_accuracy": 0.49343683839096686, "step": 13661 }, { "epoch": 2.532814238042269, "grad_norm": 9.40625, "learning_rate": 7.4671857619577314e-06, "loss": 2.8912, "mean_token_accuracy": 0.485891035380942, "step": 13662 }, { "epoch": 2.5329996292176493, "grad_norm": 6.72265625, "learning_rate": 7.467000370782351e-06, "loss": 3.0471, "mean_token_accuracy": 0.4590987019348518, "step": 13663 }, { "epoch": 2.533185020393029, "grad_norm": 9.6796875, "learning_rate": 7.466814979606971e-06, "loss": 3.8774, "mean_token_accuracy": 0.41136576239476147, "step": 13664 }, { "epoch": 2.5333704115684093, "grad_norm": 8.2578125, "learning_rate": 7.466629588431591e-06, "loss": 2.7784, "mean_token_accuracy": 0.48436179205409974, "step": 13665 }, { "epoch": 2.5335558027437894, "grad_norm": 7.5703125, "learning_rate": 7.466444197256211e-06, "loss": 2.8374, "mean_token_accuracy": 0.47025557368134857, "step": 13666 }, { "epoch": 2.5337411939191696, "grad_norm": 7.90625, "learning_rate": 7.466258806080831e-06, "loss": 2.655, "mean_token_accuracy": 0.5195878758835509, "step": 13667 }, { "epoch": 2.5339265850945494, "grad_norm": 9.7109375, "learning_rate": 7.466073414905452e-06, "loss": 2.2084, "mean_token_accuracy": 0.5285680695979309, "step": 13668 }, { "epoch": 2.5341119762699296, "grad_norm": 6.13671875, "learning_rate": 7.4658880237300715e-06, "loss": 2.512, "mean_token_accuracy": 0.51995825723976, "step": 13669 }, { "epoch": 2.5342973674453098, "grad_norm": 11.015625, "learning_rate": 7.465702632554691e-06, "loss": 2.9147, "mean_token_accuracy": 0.5300930937432344, "step": 13670 }, { "epoch": 2.5344827586206895, "grad_norm": 12.4296875, "learning_rate": 7.465517241379311e-06, "loss": 3.0565, "mean_token_accuracy": 0.46942656524283205, "step": 13671 }, { "epoch": 2.5346681497960697, "grad_norm": 7.69921875, "learning_rate": 7.46533185020393e-06, "loss": 3.3901, "mean_token_accuracy": 0.44393837910247824, "step": 13672 }, { "epoch": 2.53485354097145, "grad_norm": 8.1796875, "learning_rate": 7.465146459028551e-06, "loss": 2.4282, "mean_token_accuracy": 0.5174624226348364, "step": 13673 }, { "epoch": 2.53503893214683, "grad_norm": 9.765625, "learning_rate": 7.4649610678531706e-06, "loss": 3.6076, "mean_token_accuracy": 0.4631484334874165, "step": 13674 }, { "epoch": 2.53522432332221, "grad_norm": 8.328125, "learning_rate": 7.464775676677791e-06, "loss": 2.512, "mean_token_accuracy": 0.5079872204472844, "step": 13675 }, { "epoch": 2.53540971449759, "grad_norm": 9.765625, "learning_rate": 7.464590285502411e-06, "loss": 2.0045, "mean_token_accuracy": 0.5813310652774825, "step": 13676 }, { "epoch": 2.5355951056729698, "grad_norm": 7.859375, "learning_rate": 7.464404894327031e-06, "loss": 3.3312, "mean_token_accuracy": 0.447840211767537, "step": 13677 }, { "epoch": 2.53578049684835, "grad_norm": 7.890625, "learning_rate": 7.464219503151651e-06, "loss": 2.5785, "mean_token_accuracy": 0.514599555061179, "step": 13678 }, { "epoch": 2.53596588802373, "grad_norm": 6.67578125, "learning_rate": 7.4640341119762705e-06, "loss": 2.5208, "mean_token_accuracy": 0.5176873928627085, "step": 13679 }, { "epoch": 2.5361512791991103, "grad_norm": 8.140625, "learning_rate": 7.46384872080089e-06, "loss": 3.1449, "mean_token_accuracy": 0.45553869499241273, "step": 13680 }, { "epoch": 2.53633667037449, "grad_norm": 7.98828125, "learning_rate": 7.46366332962551e-06, "loss": 2.4454, "mean_token_accuracy": 0.5124571428571428, "step": 13681 }, { "epoch": 2.5365220615498703, "grad_norm": 6.78515625, "learning_rate": 7.46347793845013e-06, "loss": 2.5707, "mean_token_accuracy": 0.5067470864853814, "step": 13682 }, { "epoch": 2.5367074527252504, "grad_norm": 9.3515625, "learning_rate": 7.463292547274751e-06, "loss": 2.5917, "mean_token_accuracy": 0.5055731549923735, "step": 13683 }, { "epoch": 2.53689284390063, "grad_norm": 7.33984375, "learning_rate": 7.46310715609937e-06, "loss": 2.4356, "mean_token_accuracy": 0.5091906721536351, "step": 13684 }, { "epoch": 2.5370782350760104, "grad_norm": 6.87109375, "learning_rate": 7.46292176492399e-06, "loss": 3.259, "mean_token_accuracy": 0.45609537238815534, "step": 13685 }, { "epoch": 2.5372636262513906, "grad_norm": 7.93359375, "learning_rate": 7.4627363737486105e-06, "loss": 2.8888, "mean_token_accuracy": 0.48690899847483476, "step": 13686 }, { "epoch": 2.5374490174267708, "grad_norm": 7.78515625, "learning_rate": 7.46255098257323e-06, "loss": 2.447, "mean_token_accuracy": 0.5214928694244968, "step": 13687 }, { "epoch": 2.5376344086021505, "grad_norm": 8.3671875, "learning_rate": 7.46236559139785e-06, "loss": 2.5526, "mean_token_accuracy": 0.531605158983527, "step": 13688 }, { "epoch": 2.5378197997775307, "grad_norm": 9.3203125, "learning_rate": 7.4621802002224695e-06, "loss": 2.6413, "mean_token_accuracy": 0.5223688821399369, "step": 13689 }, { "epoch": 2.5380051909529104, "grad_norm": 8.3046875, "learning_rate": 7.461994809047089e-06, "loss": 3.0175, "mean_token_accuracy": 0.4665192284064381, "step": 13690 }, { "epoch": 2.5381905821282906, "grad_norm": 10.921875, "learning_rate": 7.4618094178717104e-06, "loss": 2.9656, "mean_token_accuracy": 0.4708090075062552, "step": 13691 }, { "epoch": 2.538375973303671, "grad_norm": 7.92578125, "learning_rate": 7.46162402669633e-06, "loss": 2.6859, "mean_token_accuracy": 0.5093271581114113, "step": 13692 }, { "epoch": 2.538561364479051, "grad_norm": 12.140625, "learning_rate": 7.46143863552095e-06, "loss": 2.5423, "mean_token_accuracy": 0.5205352411234764, "step": 13693 }, { "epoch": 2.5387467556544308, "grad_norm": 7.12109375, "learning_rate": 7.46125324434557e-06, "loss": 2.463, "mean_token_accuracy": 0.5362103843560351, "step": 13694 }, { "epoch": 2.538932146829811, "grad_norm": 9.21875, "learning_rate": 7.46106785317019e-06, "loss": 3.0837, "mean_token_accuracy": 0.44954128440366975, "step": 13695 }, { "epoch": 2.5391175380051907, "grad_norm": 9.4609375, "learning_rate": 7.4608824619948095e-06, "loss": 2.931, "mean_token_accuracy": 0.4732313575525813, "step": 13696 }, { "epoch": 2.539302929180571, "grad_norm": 7.25390625, "learning_rate": 7.460697070819429e-06, "loss": 2.8577, "mean_token_accuracy": 0.4890343322999196, "step": 13697 }, { "epoch": 2.539488320355951, "grad_norm": 9.0234375, "learning_rate": 7.460511679644049e-06, "loss": 2.846, "mean_token_accuracy": 0.4860530773574252, "step": 13698 }, { "epoch": 2.5396737115313313, "grad_norm": 9.71875, "learning_rate": 7.46032628846867e-06, "loss": 3.0379, "mean_token_accuracy": 0.47923632833567037, "step": 13699 }, { "epoch": 2.539859102706711, "grad_norm": 8.390625, "learning_rate": 7.46014089729329e-06, "loss": 2.5781, "mean_token_accuracy": 0.5247789669613774, "step": 13700 }, { "epoch": 2.540044493882091, "grad_norm": 7.58984375, "learning_rate": 7.459955506117909e-06, "loss": 3.3282, "mean_token_accuracy": 0.44519653247832797, "step": 13701 }, { "epoch": 2.5402298850574714, "grad_norm": 12.8046875, "learning_rate": 7.459770114942529e-06, "loss": 2.6228, "mean_token_accuracy": 0.49311328949001115, "step": 13702 }, { "epoch": 2.540415276232851, "grad_norm": 10.8046875, "learning_rate": 7.4595847237671496e-06, "loss": 3.8102, "mean_token_accuracy": 0.4461622210125204, "step": 13703 }, { "epoch": 2.5406006674082313, "grad_norm": 8.9921875, "learning_rate": 7.459399332591769e-06, "loss": 3.1903, "mean_token_accuracy": 0.46397941680960547, "step": 13704 }, { "epoch": 2.5407860585836115, "grad_norm": 7.90625, "learning_rate": 7.459213941416389e-06, "loss": 2.7562, "mean_token_accuracy": 0.5047322253000923, "step": 13705 }, { "epoch": 2.5409714497589917, "grad_norm": 9.3671875, "learning_rate": 7.4590285502410085e-06, "loss": 2.8957, "mean_token_accuracy": 0.48219380746063617, "step": 13706 }, { "epoch": 2.5411568409343714, "grad_norm": 8.0234375, "learning_rate": 7.45884315906563e-06, "loss": 2.7646, "mean_token_accuracy": 0.5115737570195864, "step": 13707 }, { "epoch": 2.5413422321097516, "grad_norm": 8.140625, "learning_rate": 7.4586577678902495e-06, "loss": 3.1256, "mean_token_accuracy": 0.47494598209692357, "step": 13708 }, { "epoch": 2.5415276232851314, "grad_norm": 10.6015625, "learning_rate": 7.458472376714869e-06, "loss": 2.5526, "mean_token_accuracy": 0.5268019776216497, "step": 13709 }, { "epoch": 2.5417130144605116, "grad_norm": 9.2109375, "learning_rate": 7.458286985539489e-06, "loss": 2.1826, "mean_token_accuracy": 0.5614336917562724, "step": 13710 }, { "epoch": 2.5418984056358918, "grad_norm": 7.84375, "learning_rate": 7.458101594364109e-06, "loss": 2.5114, "mean_token_accuracy": 0.5068179075660674, "step": 13711 }, { "epoch": 2.542083796811272, "grad_norm": 9.9609375, "learning_rate": 7.457916203188729e-06, "loss": 2.7617, "mean_token_accuracy": 0.49123359580052495, "step": 13712 }, { "epoch": 2.5422691879866517, "grad_norm": 9.609375, "learning_rate": 7.4577308120133485e-06, "loss": 3.2789, "mean_token_accuracy": 0.47387677447680376, "step": 13713 }, { "epoch": 2.542454579162032, "grad_norm": 6.703125, "learning_rate": 7.457545420837968e-06, "loss": 2.9395, "mean_token_accuracy": 0.4611215834118756, "step": 13714 }, { "epoch": 2.542639970337412, "grad_norm": 10.2109375, "learning_rate": 7.4573600296625895e-06, "loss": 2.8379, "mean_token_accuracy": 0.4648496900222248, "step": 13715 }, { "epoch": 2.542825361512792, "grad_norm": 13.375, "learning_rate": 7.457174638487209e-06, "loss": 2.7651, "mean_token_accuracy": 0.47644554319461613, "step": 13716 }, { "epoch": 2.543010752688172, "grad_norm": 8.8671875, "learning_rate": 7.456989247311829e-06, "loss": 3.3587, "mean_token_accuracy": 0.4694264069264069, "step": 13717 }, { "epoch": 2.543196143863552, "grad_norm": 10.828125, "learning_rate": 7.4568038561364485e-06, "loss": 2.8851, "mean_token_accuracy": 0.4812841174941513, "step": 13718 }, { "epoch": 2.5433815350389324, "grad_norm": 9.140625, "learning_rate": 7.456618464961068e-06, "loss": 3.4093, "mean_token_accuracy": 0.42862394314422253, "step": 13719 }, { "epoch": 2.543566926214312, "grad_norm": 7.16796875, "learning_rate": 7.456433073785689e-06, "loss": 2.2668, "mean_token_accuracy": 0.5445887445887446, "step": 13720 }, { "epoch": 2.5437523173896923, "grad_norm": 9.4375, "learning_rate": 7.456247682610308e-06, "loss": 3.5722, "mean_token_accuracy": 0.45403949730700177, "step": 13721 }, { "epoch": 2.543937708565072, "grad_norm": 11.546875, "learning_rate": 7.456062291434928e-06, "loss": 3.1793, "mean_token_accuracy": 0.46894803548795944, "step": 13722 }, { "epoch": 2.5441230997404523, "grad_norm": 11.6875, "learning_rate": 7.455876900259549e-06, "loss": 2.5427, "mean_token_accuracy": 0.5072082500263074, "step": 13723 }, { "epoch": 2.5443084909158324, "grad_norm": 7.44140625, "learning_rate": 7.455691509084169e-06, "loss": 3.3973, "mean_token_accuracy": 0.43305582093626316, "step": 13724 }, { "epoch": 2.5444938820912126, "grad_norm": 8.6328125, "learning_rate": 7.4555061179087885e-06, "loss": 2.9154, "mean_token_accuracy": 0.46700143472022954, "step": 13725 }, { "epoch": 2.5446792732665924, "grad_norm": 8.375, "learning_rate": 7.455320726733408e-06, "loss": 2.6136, "mean_token_accuracy": 0.49379395945386845, "step": 13726 }, { "epoch": 2.5448646644419726, "grad_norm": 7.60546875, "learning_rate": 7.455135335558028e-06, "loss": 2.5721, "mean_token_accuracy": 0.508656103286385, "step": 13727 }, { "epoch": 2.5450500556173528, "grad_norm": 9.9921875, "learning_rate": 7.4549499443826474e-06, "loss": 3.7884, "mean_token_accuracy": 0.4765258215962441, "step": 13728 }, { "epoch": 2.5452354467927325, "grad_norm": 13.46875, "learning_rate": 7.454764553207268e-06, "loss": 2.6743, "mean_token_accuracy": 0.5086196786677613, "step": 13729 }, { "epoch": 2.5454208379681127, "grad_norm": 9.0390625, "learning_rate": 7.454579162031888e-06, "loss": 3.4162, "mean_token_accuracy": 0.4493441881501583, "step": 13730 }, { "epoch": 2.545606229143493, "grad_norm": 8.6328125, "learning_rate": 7.454393770856508e-06, "loss": 3.1033, "mean_token_accuracy": 0.46269173105754813, "step": 13731 }, { "epoch": 2.545791620318873, "grad_norm": 7.94921875, "learning_rate": 7.4542083796811286e-06, "loss": 3.1873, "mean_token_accuracy": 0.45701777659856724, "step": 13732 }, { "epoch": 2.545977011494253, "grad_norm": 7.5546875, "learning_rate": 7.454022988505748e-06, "loss": 3.1608, "mean_token_accuracy": 0.45866209262435675, "step": 13733 }, { "epoch": 2.546162402669633, "grad_norm": 8.625, "learning_rate": 7.453837597330368e-06, "loss": 3.1048, "mean_token_accuracy": 0.4539690925901466, "step": 13734 }, { "epoch": 2.5463477938450128, "grad_norm": 7.3515625, "learning_rate": 7.4536522061549875e-06, "loss": 2.7198, "mean_token_accuracy": 0.4927085823571599, "step": 13735 }, { "epoch": 2.546533185020393, "grad_norm": 10.515625, "learning_rate": 7.453466814979607e-06, "loss": 2.9261, "mean_token_accuracy": 0.46141845673826953, "step": 13736 }, { "epoch": 2.546718576195773, "grad_norm": 10.7109375, "learning_rate": 7.453281423804228e-06, "loss": 2.7738, "mean_token_accuracy": 0.4698316183348924, "step": 13737 }, { "epoch": 2.5469039673711533, "grad_norm": 7.3515625, "learning_rate": 7.453096032628847e-06, "loss": 3.1993, "mean_token_accuracy": 0.4743239552034963, "step": 13738 }, { "epoch": 2.547089358546533, "grad_norm": 7.79296875, "learning_rate": 7.452910641453467e-06, "loss": 2.7701, "mean_token_accuracy": 0.48983050847457626, "step": 13739 }, { "epoch": 2.5472747497219133, "grad_norm": 8.1875, "learning_rate": 7.452725250278087e-06, "loss": 2.6402, "mean_token_accuracy": 0.518609865470852, "step": 13740 }, { "epoch": 2.5474601408972934, "grad_norm": 7.92578125, "learning_rate": 7.452539859102708e-06, "loss": 2.8256, "mean_token_accuracy": 0.5, "step": 13741 }, { "epoch": 2.547645532072673, "grad_norm": 7.0234375, "learning_rate": 7.4523544679273275e-06, "loss": 2.8078, "mean_token_accuracy": 0.47328538402260467, "step": 13742 }, { "epoch": 2.5478309232480534, "grad_norm": 8.7265625, "learning_rate": 7.452169076751947e-06, "loss": 2.5951, "mean_token_accuracy": 0.5126547455295736, "step": 13743 }, { "epoch": 2.5480163144234336, "grad_norm": 8.2421875, "learning_rate": 7.451983685576567e-06, "loss": 2.696, "mean_token_accuracy": 0.4946119516708392, "step": 13744 }, { "epoch": 2.5482017055988138, "grad_norm": 6.62890625, "learning_rate": 7.4517982944011865e-06, "loss": 2.2215, "mean_token_accuracy": 0.5694678545629666, "step": 13745 }, { "epoch": 2.5483870967741935, "grad_norm": 6.83203125, "learning_rate": 7.451612903225807e-06, "loss": 3.3826, "mean_token_accuracy": 0.44591859680683604, "step": 13746 }, { "epoch": 2.5485724879495737, "grad_norm": 8.53125, "learning_rate": 7.451427512050427e-06, "loss": 3.0809, "mean_token_accuracy": 0.4765109705209315, "step": 13747 }, { "epoch": 2.5487578791249534, "grad_norm": 6.8984375, "learning_rate": 7.451242120875047e-06, "loss": 2.8689, "mean_token_accuracy": 0.4675194660734149, "step": 13748 }, { "epoch": 2.5489432703003336, "grad_norm": 6.4140625, "learning_rate": 7.451056729699668e-06, "loss": 2.9036, "mean_token_accuracy": 0.47838555496548063, "step": 13749 }, { "epoch": 2.549128661475714, "grad_norm": 10.875, "learning_rate": 7.450871338524287e-06, "loss": 3.0931, "mean_token_accuracy": 0.48893285519970325, "step": 13750 }, { "epoch": 2.549314052651094, "grad_norm": 9.515625, "learning_rate": 7.450685947348907e-06, "loss": 2.9861, "mean_token_accuracy": 0.47596675099385616, "step": 13751 }, { "epoch": 2.5494994438264738, "grad_norm": 8.90625, "learning_rate": 7.4505005561735265e-06, "loss": 2.6401, "mean_token_accuracy": 0.4944962784358947, "step": 13752 }, { "epoch": 2.549684835001854, "grad_norm": 9.390625, "learning_rate": 7.450315164998146e-06, "loss": 3.3395, "mean_token_accuracy": 0.46005370929842226, "step": 13753 }, { "epoch": 2.5498702261772337, "grad_norm": 7.82421875, "learning_rate": 7.450129773822767e-06, "loss": 3.1013, "mean_token_accuracy": 0.48082088750402924, "step": 13754 }, { "epoch": 2.550055617352614, "grad_norm": 10.2890625, "learning_rate": 7.449944382647386e-06, "loss": 2.8467, "mean_token_accuracy": 0.48082454458293383, "step": 13755 }, { "epoch": 2.550241008527994, "grad_norm": 7.73828125, "learning_rate": 7.449758991472007e-06, "loss": 3.2417, "mean_token_accuracy": 0.4435667135591386, "step": 13756 }, { "epoch": 2.5504263997033743, "grad_norm": 9.515625, "learning_rate": 7.4495736002966264e-06, "loss": 3.027, "mean_token_accuracy": 0.44071192833569073, "step": 13757 }, { "epoch": 2.5506117908787544, "grad_norm": 9.78125, "learning_rate": 7.449388209121247e-06, "loss": 2.8907, "mean_token_accuracy": 0.47465727581436296, "step": 13758 }, { "epoch": 2.550797182054134, "grad_norm": 8.9765625, "learning_rate": 7.449202817945867e-06, "loss": 3.2299, "mean_token_accuracy": 0.45731788520442135, "step": 13759 }, { "epoch": 2.5509825732295144, "grad_norm": 8.703125, "learning_rate": 7.449017426770486e-06, "loss": 2.8205, "mean_token_accuracy": 0.48021828103683495, "step": 13760 }, { "epoch": 2.551167964404894, "grad_norm": 11.125, "learning_rate": 7.448832035595106e-06, "loss": 3.0661, "mean_token_accuracy": 0.49720341477774505, "step": 13761 }, { "epoch": 2.5513533555802743, "grad_norm": 9.171875, "learning_rate": 7.4486466444197255e-06, "loss": 2.885, "mean_token_accuracy": 0.47912399558846697, "step": 13762 }, { "epoch": 2.5515387467556545, "grad_norm": 6.625, "learning_rate": 7.448461253244346e-06, "loss": 3.1454, "mean_token_accuracy": 0.47504720798489347, "step": 13763 }, { "epoch": 2.5517241379310347, "grad_norm": 7.3671875, "learning_rate": 7.4482758620689665e-06, "loss": 3.2068, "mean_token_accuracy": 0.4516082711085583, "step": 13764 }, { "epoch": 2.5519095291064144, "grad_norm": 7.671875, "learning_rate": 7.448090470893586e-06, "loss": 2.9544, "mean_token_accuracy": 0.4733085501858736, "step": 13765 }, { "epoch": 2.5520949202817946, "grad_norm": 7.73828125, "learning_rate": 7.447905079718206e-06, "loss": 2.461, "mean_token_accuracy": 0.5116017437772465, "step": 13766 }, { "epoch": 2.5522803114571744, "grad_norm": 7.08984375, "learning_rate": 7.447719688542826e-06, "loss": 2.9461, "mean_token_accuracy": 0.4699897107579742, "step": 13767 }, { "epoch": 2.5524657026325546, "grad_norm": 8.078125, "learning_rate": 7.447534297367446e-06, "loss": 2.9221, "mean_token_accuracy": 0.45645771981489275, "step": 13768 }, { "epoch": 2.5526510938079348, "grad_norm": 7.7578125, "learning_rate": 7.4473489061920656e-06, "loss": 2.6815, "mean_token_accuracy": 0.4994958951461904, "step": 13769 }, { "epoch": 2.552836484983315, "grad_norm": 6.7109375, "learning_rate": 7.447163515016685e-06, "loss": 3.0948, "mean_token_accuracy": 0.46786175710594313, "step": 13770 }, { "epoch": 2.5530218761586947, "grad_norm": 8.6640625, "learning_rate": 7.446978123841305e-06, "loss": 2.414, "mean_token_accuracy": 0.5302649930264993, "step": 13771 }, { "epoch": 2.553207267334075, "grad_norm": 6.41015625, "learning_rate": 7.446792732665926e-06, "loss": 2.2122, "mean_token_accuracy": 0.56563687749732, "step": 13772 }, { "epoch": 2.553392658509455, "grad_norm": 6.51171875, "learning_rate": 7.446607341490546e-06, "loss": 3.0249, "mean_token_accuracy": 0.4741901501185146, "step": 13773 }, { "epoch": 2.553578049684835, "grad_norm": 7.96484375, "learning_rate": 7.4464219503151655e-06, "loss": 3.2153, "mean_token_accuracy": 0.4674100623921412, "step": 13774 }, { "epoch": 2.553763440860215, "grad_norm": 8.234375, "learning_rate": 7.446236559139786e-06, "loss": 2.7232, "mean_token_accuracy": 0.4859452736318408, "step": 13775 }, { "epoch": 2.553948832035595, "grad_norm": 7.4296875, "learning_rate": 7.446051167964406e-06, "loss": 2.9467, "mean_token_accuracy": 0.4530834848272969, "step": 13776 }, { "epoch": 2.5541342232109754, "grad_norm": 8.8984375, "learning_rate": 7.445865776789025e-06, "loss": 3.0162, "mean_token_accuracy": 0.4883168316831683, "step": 13777 }, { "epoch": 2.554319614386355, "grad_norm": 29.234375, "learning_rate": 7.445680385613645e-06, "loss": 4.0019, "mean_token_accuracy": 0.49345417925478346, "step": 13778 }, { "epoch": 2.5545050055617353, "grad_norm": 13.8984375, "learning_rate": 7.4454949944382645e-06, "loss": 3.038, "mean_token_accuracy": 0.457682698313554, "step": 13779 }, { "epoch": 2.554690396737115, "grad_norm": 7.28515625, "learning_rate": 7.445309603262886e-06, "loss": 3.2163, "mean_token_accuracy": 0.45729213993639256, "step": 13780 }, { "epoch": 2.5548757879124953, "grad_norm": 8.6484375, "learning_rate": 7.4451242120875055e-06, "loss": 3.2263, "mean_token_accuracy": 0.47430710095083956, "step": 13781 }, { "epoch": 2.5550611790878754, "grad_norm": 12.6875, "learning_rate": 7.444938820912125e-06, "loss": 2.3114, "mean_token_accuracy": 0.5545127282077655, "step": 13782 }, { "epoch": 2.5552465702632556, "grad_norm": 6.64453125, "learning_rate": 7.444753429736745e-06, "loss": 3.192, "mean_token_accuracy": 0.46390516782099095, "step": 13783 }, { "epoch": 2.5554319614386354, "grad_norm": 7.89453125, "learning_rate": 7.444568038561365e-06, "loss": 2.977, "mean_token_accuracy": 0.4658491561181435, "step": 13784 }, { "epoch": 2.5556173526140156, "grad_norm": 10.40625, "learning_rate": 7.444382647385985e-06, "loss": 2.4909, "mean_token_accuracy": 0.5201331114808653, "step": 13785 }, { "epoch": 2.5558027437893958, "grad_norm": 8.3828125, "learning_rate": 7.444197256210605e-06, "loss": 3.0655, "mean_token_accuracy": 0.4624685929648241, "step": 13786 }, { "epoch": 2.5559881349647755, "grad_norm": 7.94140625, "learning_rate": 7.444011865035224e-06, "loss": 2.3223, "mean_token_accuracy": 0.5297484072994277, "step": 13787 }, { "epoch": 2.5561735261401557, "grad_norm": 9.875, "learning_rate": 7.443826473859846e-06, "loss": 3.0607, "mean_token_accuracy": 0.4629455909943715, "step": 13788 }, { "epoch": 2.556358917315536, "grad_norm": 8.9140625, "learning_rate": 7.443641082684465e-06, "loss": 2.8328, "mean_token_accuracy": 0.4606174433618969, "step": 13789 }, { "epoch": 2.556544308490916, "grad_norm": 6.359375, "learning_rate": 7.443455691509085e-06, "loss": 2.5463, "mean_token_accuracy": 0.5047575199508901, "step": 13790 }, { "epoch": 2.556729699666296, "grad_norm": 12.8203125, "learning_rate": 7.4432703003337045e-06, "loss": 3.1607, "mean_token_accuracy": 0.4719748559455212, "step": 13791 }, { "epoch": 2.556915090841676, "grad_norm": 12.6640625, "learning_rate": 7.443084909158325e-06, "loss": 2.937, "mean_token_accuracy": 0.46943550745832113, "step": 13792 }, { "epoch": 2.5571004820170558, "grad_norm": 14.53125, "learning_rate": 7.442899517982945e-06, "loss": 4.2626, "mean_token_accuracy": 0.41929133858267714, "step": 13793 }, { "epoch": 2.557285873192436, "grad_norm": 8.6484375, "learning_rate": 7.442714126807564e-06, "loss": 2.541, "mean_token_accuracy": 0.5419734904270986, "step": 13794 }, { "epoch": 2.557471264367816, "grad_norm": 7.58203125, "learning_rate": 7.442528735632184e-06, "loss": 2.4456, "mean_token_accuracy": 0.5135135135135135, "step": 13795 }, { "epoch": 2.5576566555431963, "grad_norm": 7.4921875, "learning_rate": 7.442343344456805e-06, "loss": 2.781, "mean_token_accuracy": 0.48956328645447816, "step": 13796 }, { "epoch": 2.557842046718576, "grad_norm": 7.5703125, "learning_rate": 7.442157953281425e-06, "loss": 3.19, "mean_token_accuracy": 0.46706859287908087, "step": 13797 }, { "epoch": 2.5580274378939563, "grad_norm": 7.48828125, "learning_rate": 7.4419725621060446e-06, "loss": 3.2036, "mean_token_accuracy": 0.4515965534718702, "step": 13798 }, { "epoch": 2.5582128290693364, "grad_norm": 6.4765625, "learning_rate": 7.441787170930664e-06, "loss": 2.3134, "mean_token_accuracy": 0.5537430167597766, "step": 13799 }, { "epoch": 2.558398220244716, "grad_norm": 9.0390625, "learning_rate": 7.441601779755284e-06, "loss": 2.9859, "mean_token_accuracy": 0.514644993099218, "step": 13800 }, { "epoch": 2.5585836114200964, "grad_norm": 6.84765625, "learning_rate": 7.441416388579904e-06, "loss": 2.6691, "mean_token_accuracy": 0.495105554900342, "step": 13801 }, { "epoch": 2.5587690025954766, "grad_norm": 7.33984375, "learning_rate": 7.441230997404524e-06, "loss": 2.5071, "mean_token_accuracy": 0.502056202878684, "step": 13802 }, { "epoch": 2.5589543937708568, "grad_norm": 8.0703125, "learning_rate": 7.441045606229144e-06, "loss": 2.8847, "mean_token_accuracy": 0.478494623655914, "step": 13803 }, { "epoch": 2.5591397849462365, "grad_norm": 6.8671875, "learning_rate": 7.440860215053764e-06, "loss": 2.6381, "mean_token_accuracy": 0.4953638662545659, "step": 13804 }, { "epoch": 2.5593251761216167, "grad_norm": 6.68359375, "learning_rate": 7.440674823878385e-06, "loss": 2.4831, "mean_token_accuracy": 0.5285391353489687, "step": 13805 }, { "epoch": 2.5595105672969964, "grad_norm": 9.265625, "learning_rate": 7.440489432703004e-06, "loss": 2.7182, "mean_token_accuracy": 0.4893338568250229, "step": 13806 }, { "epoch": 2.5596959584723766, "grad_norm": 9.6015625, "learning_rate": 7.440304041527624e-06, "loss": 3.1842, "mean_token_accuracy": 0.5038948393378773, "step": 13807 }, { "epoch": 2.559881349647757, "grad_norm": 7.02734375, "learning_rate": 7.4401186503522435e-06, "loss": 2.8508, "mean_token_accuracy": 0.4800404975412207, "step": 13808 }, { "epoch": 2.560066740823137, "grad_norm": 6.74609375, "learning_rate": 7.439933259176863e-06, "loss": 2.3926, "mean_token_accuracy": 0.5151197800759262, "step": 13809 }, { "epoch": 2.5602521319985168, "grad_norm": 7.65625, "learning_rate": 7.439747868001484e-06, "loss": 2.9048, "mean_token_accuracy": 0.48632218844984804, "step": 13810 }, { "epoch": 2.560437523173897, "grad_norm": 9.8203125, "learning_rate": 7.439562476826103e-06, "loss": 3.113, "mean_token_accuracy": 0.47706809229037705, "step": 13811 }, { "epoch": 2.560622914349277, "grad_norm": 7.71875, "learning_rate": 7.439377085650724e-06, "loss": 2.8864, "mean_token_accuracy": 0.49459920409323477, "step": 13812 }, { "epoch": 2.560808305524657, "grad_norm": 8.6328125, "learning_rate": 7.439191694475344e-06, "loss": 3.495, "mean_token_accuracy": 0.43187289359653347, "step": 13813 }, { "epoch": 2.560993696700037, "grad_norm": 6.6015625, "learning_rate": 7.439006303299964e-06, "loss": 2.5041, "mean_token_accuracy": 0.5275664130289681, "step": 13814 }, { "epoch": 2.5611790878754173, "grad_norm": 6.57421875, "learning_rate": 7.438820912124584e-06, "loss": 3.0163, "mean_token_accuracy": 0.4601349662584354, "step": 13815 }, { "epoch": 2.5613644790507974, "grad_norm": 7.7109375, "learning_rate": 7.438635520949203e-06, "loss": 2.5587, "mean_token_accuracy": 0.5119920354783238, "step": 13816 }, { "epoch": 2.561549870226177, "grad_norm": 7.2421875, "learning_rate": 7.438450129773823e-06, "loss": 3.2185, "mean_token_accuracy": 0.460569295380308, "step": 13817 }, { "epoch": 2.5617352614015574, "grad_norm": 6.9609375, "learning_rate": 7.438264738598443e-06, "loss": 2.5738, "mean_token_accuracy": 0.4888156345655757, "step": 13818 }, { "epoch": 2.561920652576937, "grad_norm": 9.1640625, "learning_rate": 7.438079347423063e-06, "loss": 3.7603, "mean_token_accuracy": 0.41190405685519693, "step": 13819 }, { "epoch": 2.5621060437523173, "grad_norm": 6.34375, "learning_rate": 7.4378939562476835e-06, "loss": 2.8523, "mean_token_accuracy": 0.4710240172063568, "step": 13820 }, { "epoch": 2.5622914349276975, "grad_norm": 6.5, "learning_rate": 7.437708565072303e-06, "loss": 2.5775, "mean_token_accuracy": 0.5075620767494357, "step": 13821 }, { "epoch": 2.5624768261030777, "grad_norm": 6.62109375, "learning_rate": 7.437523173896924e-06, "loss": 2.7058, "mean_token_accuracy": 0.5084475613643609, "step": 13822 }, { "epoch": 2.5626622172784574, "grad_norm": 8.6484375, "learning_rate": 7.437337782721543e-06, "loss": 2.1548, "mean_token_accuracy": 0.563770325203252, "step": 13823 }, { "epoch": 2.5628476084538376, "grad_norm": 8.8203125, "learning_rate": 7.437152391546163e-06, "loss": 3.722, "mean_token_accuracy": 0.42838829973196596, "step": 13824 }, { "epoch": 2.5630329996292174, "grad_norm": 9.1171875, "learning_rate": 7.436967000370783e-06, "loss": 2.4887, "mean_token_accuracy": 0.5232360097323601, "step": 13825 }, { "epoch": 2.5632183908045976, "grad_norm": 8.1015625, "learning_rate": 7.436781609195402e-06, "loss": 2.449, "mean_token_accuracy": 0.5107878391631252, "step": 13826 }, { "epoch": 2.5634037819799778, "grad_norm": 7.4765625, "learning_rate": 7.436596218020023e-06, "loss": 3.0364, "mean_token_accuracy": 0.4947598651234849, "step": 13827 }, { "epoch": 2.563589173155358, "grad_norm": 8.15625, "learning_rate": 7.436410826844643e-06, "loss": 2.3949, "mean_token_accuracy": 0.5049645390070922, "step": 13828 }, { "epoch": 2.5637745643307377, "grad_norm": 7.8984375, "learning_rate": 7.436225435669263e-06, "loss": 2.7149, "mean_token_accuracy": 0.4957854406130268, "step": 13829 }, { "epoch": 2.563959955506118, "grad_norm": 9.140625, "learning_rate": 7.436040044493883e-06, "loss": 3.4269, "mean_token_accuracy": 0.4505919587387176, "step": 13830 }, { "epoch": 2.564145346681498, "grad_norm": 10.390625, "learning_rate": 7.435854653318503e-06, "loss": 2.9345, "mean_token_accuracy": 0.4883227176220807, "step": 13831 }, { "epoch": 2.564330737856878, "grad_norm": 7.44140625, "learning_rate": 7.435669262143123e-06, "loss": 2.7499, "mean_token_accuracy": 0.49706933523945673, "step": 13832 }, { "epoch": 2.564516129032258, "grad_norm": 7.69140625, "learning_rate": 7.435483870967742e-06, "loss": 3.1706, "mean_token_accuracy": 0.4594022745305475, "step": 13833 }, { "epoch": 2.564701520207638, "grad_norm": 9.015625, "learning_rate": 7.435298479792362e-06, "loss": 2.4651, "mean_token_accuracy": 0.5189968652037618, "step": 13834 }, { "epoch": 2.5648869113830184, "grad_norm": 9.40625, "learning_rate": 7.435113088616982e-06, "loss": 2.3811, "mean_token_accuracy": 0.519605077574048, "step": 13835 }, { "epoch": 2.565072302558398, "grad_norm": 7.73046875, "learning_rate": 7.434927697441603e-06, "loss": 2.987, "mean_token_accuracy": 0.46234522942461764, "step": 13836 }, { "epoch": 2.5652576937337783, "grad_norm": 7.8984375, "learning_rate": 7.4347423062662225e-06, "loss": 3.0258, "mean_token_accuracy": 0.48636689787132265, "step": 13837 }, { "epoch": 2.565443084909158, "grad_norm": 9.890625, "learning_rate": 7.434556915090842e-06, "loss": 3.7939, "mean_token_accuracy": 0.4362322515212982, "step": 13838 }, { "epoch": 2.5656284760845383, "grad_norm": 9.1796875, "learning_rate": 7.434371523915463e-06, "loss": 2.9287, "mean_token_accuracy": 0.46857410881801126, "step": 13839 }, { "epoch": 2.5658138672599184, "grad_norm": 7.11328125, "learning_rate": 7.434186132740082e-06, "loss": 3.0496, "mean_token_accuracy": 0.4505854211255193, "step": 13840 }, { "epoch": 2.5659992584352986, "grad_norm": 8.484375, "learning_rate": 7.434000741564702e-06, "loss": 2.9985, "mean_token_accuracy": 0.4707586933614331, "step": 13841 }, { "epoch": 2.5661846496106784, "grad_norm": 6.13671875, "learning_rate": 7.433815350389322e-06, "loss": 2.6593, "mean_token_accuracy": 0.4863102998696219, "step": 13842 }, { "epoch": 2.5663700407860586, "grad_norm": 5.8828125, "learning_rate": 7.433629959213941e-06, "loss": 3.045, "mean_token_accuracy": 0.46725471242145966, "step": 13843 }, { "epoch": 2.5665554319614388, "grad_norm": 7.1484375, "learning_rate": 7.433444568038563e-06, "loss": 3.2909, "mean_token_accuracy": 0.473590828442746, "step": 13844 }, { "epoch": 2.5667408231368185, "grad_norm": 6.87890625, "learning_rate": 7.433259176863182e-06, "loss": 2.794, "mean_token_accuracy": 0.4644133412745682, "step": 13845 }, { "epoch": 2.5669262143121987, "grad_norm": 13.609375, "learning_rate": 7.433073785687802e-06, "loss": 3.4907, "mean_token_accuracy": 0.4208729216152019, "step": 13846 }, { "epoch": 2.567111605487579, "grad_norm": 7.953125, "learning_rate": 7.4328883945124215e-06, "loss": 3.1017, "mean_token_accuracy": 0.47587791270101737, "step": 13847 }, { "epoch": 2.567296996662959, "grad_norm": 7.10546875, "learning_rate": 7.432703003337042e-06, "loss": 2.3971, "mean_token_accuracy": 0.53634483855575, "step": 13848 }, { "epoch": 2.567482387838339, "grad_norm": 7.28515625, "learning_rate": 7.432517612161662e-06, "loss": 2.5727, "mean_token_accuracy": 0.5276028158577251, "step": 13849 }, { "epoch": 2.567667779013719, "grad_norm": 7.32421875, "learning_rate": 7.432332220986281e-06, "loss": 2.7462, "mean_token_accuracy": 0.4621802612800527, "step": 13850 }, { "epoch": 2.5678531701890988, "grad_norm": 8.015625, "learning_rate": 7.432146829810901e-06, "loss": 2.8527, "mean_token_accuracy": 0.4865229110512129, "step": 13851 }, { "epoch": 2.568038561364479, "grad_norm": 8.34375, "learning_rate": 7.431961438635522e-06, "loss": 2.739, "mean_token_accuracy": 0.4807002439374372, "step": 13852 }, { "epoch": 2.568223952539859, "grad_norm": 7.19140625, "learning_rate": 7.431776047460142e-06, "loss": 2.6978, "mean_token_accuracy": 0.5185643564356436, "step": 13853 }, { "epoch": 2.5684093437152393, "grad_norm": 7.46484375, "learning_rate": 7.431590656284762e-06, "loss": 2.6608, "mean_token_accuracy": 0.47228995788073597, "step": 13854 }, { "epoch": 2.568594734890619, "grad_norm": 8.0546875, "learning_rate": 7.431405265109381e-06, "loss": 3.0315, "mean_token_accuracy": 0.4684123025768911, "step": 13855 }, { "epoch": 2.5687801260659993, "grad_norm": 7.74609375, "learning_rate": 7.431219873934002e-06, "loss": 3.5044, "mean_token_accuracy": 0.4502816556453588, "step": 13856 }, { "epoch": 2.5689655172413794, "grad_norm": 11.7265625, "learning_rate": 7.431034482758621e-06, "loss": 2.8327, "mean_token_accuracy": 0.46732922092757595, "step": 13857 }, { "epoch": 2.569150908416759, "grad_norm": 11.7421875, "learning_rate": 7.430849091583241e-06, "loss": 3.2985, "mean_token_accuracy": 0.45017128620367486, "step": 13858 }, { "epoch": 2.5693362995921394, "grad_norm": 7.75, "learning_rate": 7.430663700407861e-06, "loss": 3.0737, "mean_token_accuracy": 0.42348837209302326, "step": 13859 }, { "epoch": 2.5695216907675196, "grad_norm": 8.4140625, "learning_rate": 7.43047830923248e-06, "loss": 4.3692, "mean_token_accuracy": 0.40856465336867065, "step": 13860 }, { "epoch": 2.5697070819428998, "grad_norm": 6.265625, "learning_rate": 7.430292918057102e-06, "loss": 2.6703, "mean_token_accuracy": 0.4928308070462925, "step": 13861 }, { "epoch": 2.5698924731182795, "grad_norm": 7.921875, "learning_rate": 7.430107526881721e-06, "loss": 2.4017, "mean_token_accuracy": 0.5153974695631416, "step": 13862 }, { "epoch": 2.5700778642936597, "grad_norm": 10.4296875, "learning_rate": 7.429922135706341e-06, "loss": 2.8629, "mean_token_accuracy": 0.47024504084014, "step": 13863 }, { "epoch": 2.5702632554690394, "grad_norm": 6.29296875, "learning_rate": 7.4297367445309606e-06, "loss": 2.7099, "mean_token_accuracy": 0.495089614534741, "step": 13864 }, { "epoch": 2.5704486466444196, "grad_norm": 10.5078125, "learning_rate": 7.429551353355581e-06, "loss": 3.4971, "mean_token_accuracy": 0.43264825238905613, "step": 13865 }, { "epoch": 2.5706340378198, "grad_norm": 9.8671875, "learning_rate": 7.429365962180201e-06, "loss": 2.6106, "mean_token_accuracy": 0.5007216742843397, "step": 13866 }, { "epoch": 2.57081942899518, "grad_norm": 6.40625, "learning_rate": 7.42918057100482e-06, "loss": 2.9911, "mean_token_accuracy": 0.4704772475027747, "step": 13867 }, { "epoch": 2.5710048201705598, "grad_norm": 9.1328125, "learning_rate": 7.42899517982944e-06, "loss": 2.4868, "mean_token_accuracy": 0.5120310183601323, "step": 13868 }, { "epoch": 2.57119021134594, "grad_norm": 7.05078125, "learning_rate": 7.428809788654061e-06, "loss": 2.8921, "mean_token_accuracy": 0.4736093327642623, "step": 13869 }, { "epoch": 2.57137560252132, "grad_norm": 8.2265625, "learning_rate": 7.428624397478681e-06, "loss": 2.7207, "mean_token_accuracy": 0.4735373268214891, "step": 13870 }, { "epoch": 2.5715609936967, "grad_norm": 9.453125, "learning_rate": 7.428439006303301e-06, "loss": 3.2834, "mean_token_accuracy": 0.4847742922723795, "step": 13871 }, { "epoch": 2.57174638487208, "grad_norm": 6.8359375, "learning_rate": 7.42825361512792e-06, "loss": 2.4725, "mean_token_accuracy": 0.5376717281272596, "step": 13872 }, { "epoch": 2.5719317760474603, "grad_norm": 6.85546875, "learning_rate": 7.428068223952541e-06, "loss": 3.1751, "mean_token_accuracy": 0.4665497707040734, "step": 13873 }, { "epoch": 2.5721171672228405, "grad_norm": 6.796875, "learning_rate": 7.42788283277716e-06, "loss": 3.0615, "mean_token_accuracy": 0.44154147615937295, "step": 13874 }, { "epoch": 2.57230255839822, "grad_norm": 9.3984375, "learning_rate": 7.42769744160178e-06, "loss": 3.0767, "mean_token_accuracy": 0.481562099871959, "step": 13875 }, { "epoch": 2.5724879495736004, "grad_norm": 7.0859375, "learning_rate": 7.4275120504264e-06, "loss": 3.4327, "mean_token_accuracy": 0.4141820212171971, "step": 13876 }, { "epoch": 2.57267334074898, "grad_norm": 8.4453125, "learning_rate": 7.427326659251021e-06, "loss": 3.6529, "mean_token_accuracy": 0.45190895741556536, "step": 13877 }, { "epoch": 2.5728587319243603, "grad_norm": 8.765625, "learning_rate": 7.427141268075641e-06, "loss": 2.7644, "mean_token_accuracy": 0.4810219874708267, "step": 13878 }, { "epoch": 2.5730441230997405, "grad_norm": 7.65234375, "learning_rate": 7.42695587690026e-06, "loss": 2.5066, "mean_token_accuracy": 0.5161910394795209, "step": 13879 }, { "epoch": 2.5732295142751207, "grad_norm": 7.64453125, "learning_rate": 7.42677048572488e-06, "loss": 2.7141, "mean_token_accuracy": 0.4954978467962939, "step": 13880 }, { "epoch": 2.5734149054505004, "grad_norm": 9.7265625, "learning_rate": 7.4265850945495e-06, "loss": 3.3356, "mean_token_accuracy": 0.4327260571619589, "step": 13881 }, { "epoch": 2.5736002966258806, "grad_norm": 8.7265625, "learning_rate": 7.42639970337412e-06, "loss": 3.0133, "mean_token_accuracy": 0.4539687703318152, "step": 13882 }, { "epoch": 2.573785687801261, "grad_norm": 7.47265625, "learning_rate": 7.42621431219874e-06, "loss": 2.8698, "mean_token_accuracy": 0.4681172911333488, "step": 13883 }, { "epoch": 2.5739710789766406, "grad_norm": 7.625, "learning_rate": 7.426028921023359e-06, "loss": 2.883, "mean_token_accuracy": 0.5178223336118073, "step": 13884 }, { "epoch": 2.5741564701520208, "grad_norm": 8.25, "learning_rate": 7.42584352984798e-06, "loss": 2.9943, "mean_token_accuracy": 0.5128030044383749, "step": 13885 }, { "epoch": 2.574341861327401, "grad_norm": 8.359375, "learning_rate": 7.4256581386726e-06, "loss": 2.9865, "mean_token_accuracy": 0.47829816435066047, "step": 13886 }, { "epoch": 2.574527252502781, "grad_norm": 7.46484375, "learning_rate": 7.42547274749722e-06, "loss": 3.2705, "mean_token_accuracy": 0.4473460463986396, "step": 13887 }, { "epoch": 2.574712643678161, "grad_norm": 7.21875, "learning_rate": 7.42528735632184e-06, "loss": 2.7705, "mean_token_accuracy": 0.4973309608540925, "step": 13888 }, { "epoch": 2.574898034853541, "grad_norm": 6.9453125, "learning_rate": 7.425101965146459e-06, "loss": 2.9913, "mean_token_accuracy": 0.4772191185599007, "step": 13889 }, { "epoch": 2.575083426028921, "grad_norm": 7.84765625, "learning_rate": 7.424916573971079e-06, "loss": 3.0265, "mean_token_accuracy": 0.4748159057437408, "step": 13890 }, { "epoch": 2.575268817204301, "grad_norm": 7.66796875, "learning_rate": 7.4247311827956994e-06, "loss": 2.9358, "mean_token_accuracy": 0.48886378308586187, "step": 13891 }, { "epoch": 2.575454208379681, "grad_norm": 8.53125, "learning_rate": 7.424545791620319e-06, "loss": 2.9368, "mean_token_accuracy": 0.48583494633116314, "step": 13892 }, { "epoch": 2.5756395995550614, "grad_norm": 7.5390625, "learning_rate": 7.4243604004449396e-06, "loss": 2.9243, "mean_token_accuracy": 0.4774090853434554, "step": 13893 }, { "epoch": 2.575824990730441, "grad_norm": 9.2734375, "learning_rate": 7.42417500926956e-06, "loss": 1.7282, "mean_token_accuracy": 0.6059786110037366, "step": 13894 }, { "epoch": 2.5760103819058213, "grad_norm": 9.1484375, "learning_rate": 7.42398961809418e-06, "loss": 2.8561, "mean_token_accuracy": 0.4573792111257643, "step": 13895 }, { "epoch": 2.576195773081201, "grad_norm": 7.16796875, "learning_rate": 7.423804226918799e-06, "loss": 2.9652, "mean_token_accuracy": 0.47495727619298017, "step": 13896 }, { "epoch": 2.5763811642565813, "grad_norm": 7.296875, "learning_rate": 7.423618835743419e-06, "loss": 3.5156, "mean_token_accuracy": 0.4018526687251875, "step": 13897 }, { "epoch": 2.5765665554319614, "grad_norm": 7.56640625, "learning_rate": 7.423433444568039e-06, "loss": 3.0222, "mean_token_accuracy": 0.46008907159986295, "step": 13898 }, { "epoch": 2.5767519466073416, "grad_norm": 6.81640625, "learning_rate": 7.423248053392659e-06, "loss": 3.1452, "mean_token_accuracy": 0.45694078491194007, "step": 13899 }, { "epoch": 2.5769373377827214, "grad_norm": 10.6640625, "learning_rate": 7.423062662217279e-06, "loss": 2.6061, "mean_token_accuracy": 0.4940143655227454, "step": 13900 }, { "epoch": 2.5771227289581016, "grad_norm": 7.5625, "learning_rate": 7.422877271041899e-06, "loss": 3.0893, "mean_token_accuracy": 0.47484358706986446, "step": 13901 }, { "epoch": 2.5773081201334818, "grad_norm": 8.4140625, "learning_rate": 7.422691879866519e-06, "loss": 2.9624, "mean_token_accuracy": 0.47853681052388647, "step": 13902 }, { "epoch": 2.5774935113088615, "grad_norm": 7.3515625, "learning_rate": 7.422506488691139e-06, "loss": 3.4662, "mean_token_accuracy": 0.4620510921880785, "step": 13903 }, { "epoch": 2.5776789024842417, "grad_norm": 6.73046875, "learning_rate": 7.422321097515759e-06, "loss": 3.3028, "mean_token_accuracy": 0.44192997438087106, "step": 13904 }, { "epoch": 2.577864293659622, "grad_norm": 11.703125, "learning_rate": 7.422135706340379e-06, "loss": 2.8506, "mean_token_accuracy": 0.4718213420415043, "step": 13905 }, { "epoch": 2.578049684835002, "grad_norm": 8.3828125, "learning_rate": 7.421950315164998e-06, "loss": 2.9878, "mean_token_accuracy": 0.46676036542515814, "step": 13906 }, { "epoch": 2.578235076010382, "grad_norm": 10.390625, "learning_rate": 7.421764923989618e-06, "loss": 2.4272, "mean_token_accuracy": 0.5113495200451722, "step": 13907 }, { "epoch": 2.578420467185762, "grad_norm": 9.8671875, "learning_rate": 7.4215795328142385e-06, "loss": 2.5407, "mean_token_accuracy": 0.5048792388387412, "step": 13908 }, { "epoch": 2.5786058583611418, "grad_norm": 11.8828125, "learning_rate": 7.421394141638859e-06, "loss": 2.7853, "mean_token_accuracy": 0.4998896490840874, "step": 13909 }, { "epoch": 2.578791249536522, "grad_norm": 8.453125, "learning_rate": 7.421208750463479e-06, "loss": 2.8793, "mean_token_accuracy": 0.47957017776598565, "step": 13910 }, { "epoch": 2.578976640711902, "grad_norm": 12.171875, "learning_rate": 7.421023359288099e-06, "loss": 2.9128, "mean_token_accuracy": 0.5126591541050783, "step": 13911 }, { "epoch": 2.5791620318872823, "grad_norm": 12.1015625, "learning_rate": 7.420837968112719e-06, "loss": 2.7874, "mean_token_accuracy": 0.46401485838105555, "step": 13912 }, { "epoch": 2.579347423062662, "grad_norm": 7.76171875, "learning_rate": 7.420652576937338e-06, "loss": 2.5907, "mean_token_accuracy": 0.4966266437964551, "step": 13913 }, { "epoch": 2.5795328142380423, "grad_norm": 10.390625, "learning_rate": 7.420467185761958e-06, "loss": 3.5395, "mean_token_accuracy": 0.441124212566629, "step": 13914 }, { "epoch": 2.5797182054134224, "grad_norm": 11.359375, "learning_rate": 7.420281794586578e-06, "loss": 3.0276, "mean_token_accuracy": 0.4985744390727656, "step": 13915 }, { "epoch": 2.579903596588802, "grad_norm": 9.25, "learning_rate": 7.420096403411198e-06, "loss": 2.2037, "mean_token_accuracy": 0.5579320299173337, "step": 13916 }, { "epoch": 2.5800889877641824, "grad_norm": 8.875, "learning_rate": 7.419911012235819e-06, "loss": 3.328, "mean_token_accuracy": 0.4683205217796413, "step": 13917 }, { "epoch": 2.5802743789395626, "grad_norm": 14.078125, "learning_rate": 7.419725621060438e-06, "loss": 3.0099, "mean_token_accuracy": 0.48199910554561715, "step": 13918 }, { "epoch": 2.5804597701149428, "grad_norm": 16.5, "learning_rate": 7.419540229885058e-06, "loss": 2.81, "mean_token_accuracy": 0.4898307148946535, "step": 13919 }, { "epoch": 2.5806451612903225, "grad_norm": 8.703125, "learning_rate": 7.4193548387096784e-06, "loss": 2.3953, "mean_token_accuracy": 0.5180497468633062, "step": 13920 }, { "epoch": 2.5808305524657027, "grad_norm": 10.828125, "learning_rate": 7.419169447534298e-06, "loss": 2.4448, "mean_token_accuracy": 0.5023769907297362, "step": 13921 }, { "epoch": 2.5810159436410824, "grad_norm": 15.8828125, "learning_rate": 7.418984056358918e-06, "loss": 2.6862, "mean_token_accuracy": 0.48659694674840726, "step": 13922 }, { "epoch": 2.5812013348164626, "grad_norm": 10.7421875, "learning_rate": 7.418798665183537e-06, "loss": 3.193, "mean_token_accuracy": 0.4764187757832534, "step": 13923 }, { "epoch": 2.581386725991843, "grad_norm": 6.53125, "learning_rate": 7.418613274008157e-06, "loss": 2.8806, "mean_token_accuracy": 0.5219277108433735, "step": 13924 }, { "epoch": 2.581572117167223, "grad_norm": 12.9921875, "learning_rate": 7.418427882832778e-06, "loss": 3.1153, "mean_token_accuracy": 0.4549132947976879, "step": 13925 }, { "epoch": 2.5817575083426028, "grad_norm": 12.8984375, "learning_rate": 7.418242491657398e-06, "loss": 3.5043, "mean_token_accuracy": 0.43431442928930364, "step": 13926 }, { "epoch": 2.581942899517983, "grad_norm": 12.96875, "learning_rate": 7.418057100482018e-06, "loss": 2.4754, "mean_token_accuracy": 0.5062525211778943, "step": 13927 }, { "epoch": 2.582128290693363, "grad_norm": 8.7890625, "learning_rate": 7.417871709306637e-06, "loss": 2.3015, "mean_token_accuracy": 0.5627637130801688, "step": 13928 }, { "epoch": 2.582313681868743, "grad_norm": 14.3515625, "learning_rate": 7.417686318131258e-06, "loss": 2.551, "mean_token_accuracy": 0.5056388483481491, "step": 13929 }, { "epoch": 2.582499073044123, "grad_norm": 12.796875, "learning_rate": 7.417500926955877e-06, "loss": 2.8666, "mean_token_accuracy": 0.4835680751173709, "step": 13930 }, { "epoch": 2.5826844642195033, "grad_norm": 7.375, "learning_rate": 7.417315535780497e-06, "loss": 3.738, "mean_token_accuracy": 0.41615109084988605, "step": 13931 }, { "epoch": 2.5828698553948835, "grad_norm": 11.9609375, "learning_rate": 7.417130144605117e-06, "loss": 2.7118, "mean_token_accuracy": 0.49779601763185893, "step": 13932 }, { "epoch": 2.583055246570263, "grad_norm": 16.25, "learning_rate": 7.416944753429738e-06, "loss": 2.5849, "mean_token_accuracy": 0.5102974828375286, "step": 13933 }, { "epoch": 2.5832406377456434, "grad_norm": 7.75390625, "learning_rate": 7.416759362254358e-06, "loss": 2.7829, "mean_token_accuracy": 0.4915697674418605, "step": 13934 }, { "epoch": 2.583426028921023, "grad_norm": 7.75, "learning_rate": 7.416573971078977e-06, "loss": 3.2192, "mean_token_accuracy": 0.4704918032786885, "step": 13935 }, { "epoch": 2.5836114200964033, "grad_norm": 7.7265625, "learning_rate": 7.416388579903597e-06, "loss": 2.1098, "mean_token_accuracy": 0.5558608058608059, "step": 13936 }, { "epoch": 2.5837968112717835, "grad_norm": 8.703125, "learning_rate": 7.4162031887282175e-06, "loss": 3.5302, "mean_token_accuracy": 0.42971204188481676, "step": 13937 }, { "epoch": 2.5839822024471637, "grad_norm": 8.515625, "learning_rate": 7.416017797552837e-06, "loss": 3.2424, "mean_token_accuracy": 0.4374258600237248, "step": 13938 }, { "epoch": 2.5841675936225434, "grad_norm": 7.21875, "learning_rate": 7.415832406377457e-06, "loss": 2.4694, "mean_token_accuracy": 0.5230387868183144, "step": 13939 }, { "epoch": 2.5843529847979236, "grad_norm": 7.1875, "learning_rate": 7.415647015202076e-06, "loss": 2.5194, "mean_token_accuracy": 0.5006126616746086, "step": 13940 }, { "epoch": 2.584538375973304, "grad_norm": 7.5625, "learning_rate": 7.415461624026698e-06, "loss": 3.7236, "mean_token_accuracy": 0.432829992189534, "step": 13941 }, { "epoch": 2.5847237671486836, "grad_norm": 7.828125, "learning_rate": 7.415276232851317e-06, "loss": 2.9918, "mean_token_accuracy": 0.5079330342488237, "step": 13942 }, { "epoch": 2.5849091583240638, "grad_norm": 8.953125, "learning_rate": 7.415090841675937e-06, "loss": 2.8803, "mean_token_accuracy": 0.47976111479761113, "step": 13943 }, { "epoch": 2.585094549499444, "grad_norm": 6.5390625, "learning_rate": 7.414905450500557e-06, "loss": 2.3202, "mean_token_accuracy": 0.5530612244897959, "step": 13944 }, { "epoch": 2.585279940674824, "grad_norm": 8.4140625, "learning_rate": 7.414720059325176e-06, "loss": 2.7998, "mean_token_accuracy": 0.5053997923156802, "step": 13945 }, { "epoch": 2.585465331850204, "grad_norm": 6.9453125, "learning_rate": 7.414534668149797e-06, "loss": 3.0802, "mean_token_accuracy": 0.4533551554828151, "step": 13946 }, { "epoch": 2.585650723025584, "grad_norm": 7.74609375, "learning_rate": 7.4143492769744165e-06, "loss": 2.8331, "mean_token_accuracy": 0.48632218844984804, "step": 13947 }, { "epoch": 2.585836114200964, "grad_norm": 6.69140625, "learning_rate": 7.414163885799036e-06, "loss": 2.6185, "mean_token_accuracy": 0.4991015274034142, "step": 13948 }, { "epoch": 2.586021505376344, "grad_norm": 6.7421875, "learning_rate": 7.4139784946236574e-06, "loss": 2.8337, "mean_token_accuracy": 0.4729305363647783, "step": 13949 }, { "epoch": 2.586206896551724, "grad_norm": 8.5078125, "learning_rate": 7.413793103448277e-06, "loss": 2.951, "mean_token_accuracy": 0.45045170257123, "step": 13950 }, { "epoch": 2.5863922877271044, "grad_norm": 7.421875, "learning_rate": 7.413607712272897e-06, "loss": 2.8925, "mean_token_accuracy": 0.46608803471791693, "step": 13951 }, { "epoch": 2.586577678902484, "grad_norm": 7.984375, "learning_rate": 7.413422321097516e-06, "loss": 2.986, "mean_token_accuracy": 0.48396024657189585, "step": 13952 }, { "epoch": 2.5867630700778643, "grad_norm": 7.328125, "learning_rate": 7.413236929922136e-06, "loss": 2.6442, "mean_token_accuracy": 0.5099665551839465, "step": 13953 }, { "epoch": 2.586948461253244, "grad_norm": 6.9765625, "learning_rate": 7.4130515387467565e-06, "loss": 2.658, "mean_token_accuracy": 0.4752606321363561, "step": 13954 }, { "epoch": 2.5871338524286243, "grad_norm": 7.8125, "learning_rate": 7.412866147571376e-06, "loss": 2.597, "mean_token_accuracy": 0.49253030160668987, "step": 13955 }, { "epoch": 2.5873192436040044, "grad_norm": 7.35546875, "learning_rate": 7.412680756395996e-06, "loss": 2.4636, "mean_token_accuracy": 0.5244667503136763, "step": 13956 }, { "epoch": 2.5875046347793846, "grad_norm": 9.0234375, "learning_rate": 7.412495365220616e-06, "loss": 3.1874, "mean_token_accuracy": 0.45848168140576545, "step": 13957 }, { "epoch": 2.587690025954765, "grad_norm": 8.5234375, "learning_rate": 7.412309974045237e-06, "loss": 3.0442, "mean_token_accuracy": 0.47606863335340155, "step": 13958 }, { "epoch": 2.5878754171301446, "grad_norm": 8.984375, "learning_rate": 7.412124582869856e-06, "loss": 3.7709, "mean_token_accuracy": 0.4407072587077905, "step": 13959 }, { "epoch": 2.5880608083055248, "grad_norm": 9.9140625, "learning_rate": 7.411939191694476e-06, "loss": 2.5997, "mean_token_accuracy": 0.5048709847288047, "step": 13960 }, { "epoch": 2.5882461994809045, "grad_norm": 8.21875, "learning_rate": 7.411753800519096e-06, "loss": 3.2985, "mean_token_accuracy": 0.47121732241880115, "step": 13961 }, { "epoch": 2.5884315906562847, "grad_norm": 6.59375, "learning_rate": 7.411568409343715e-06, "loss": 2.8897, "mean_token_accuracy": 0.4830540746382331, "step": 13962 }, { "epoch": 2.588616981831665, "grad_norm": 7.4453125, "learning_rate": 7.411383018168336e-06, "loss": 3.1365, "mean_token_accuracy": 0.47453310696095075, "step": 13963 }, { "epoch": 2.588802373007045, "grad_norm": 7.4921875, "learning_rate": 7.4111976269929555e-06, "loss": 2.2326, "mean_token_accuracy": 0.5432654141307224, "step": 13964 }, { "epoch": 2.588987764182425, "grad_norm": 6.8515625, "learning_rate": 7.411012235817576e-06, "loss": 3.1662, "mean_token_accuracy": 0.44380853277835586, "step": 13965 }, { "epoch": 2.589173155357805, "grad_norm": 9.234375, "learning_rate": 7.410826844642196e-06, "loss": 2.869, "mean_token_accuracy": 0.48666568439664065, "step": 13966 }, { "epoch": 2.5893585465331848, "grad_norm": 7.44921875, "learning_rate": 7.410641453466816e-06, "loss": 2.7601, "mean_token_accuracy": 0.5128174697365298, "step": 13967 }, { "epoch": 2.589543937708565, "grad_norm": 7.42578125, "learning_rate": 7.410456062291436e-06, "loss": 2.3941, "mean_token_accuracy": 0.5070015879890284, "step": 13968 }, { "epoch": 2.589729328883945, "grad_norm": 7.734375, "learning_rate": 7.410270671116055e-06, "loss": 2.5153, "mean_token_accuracy": 0.520937813440321, "step": 13969 }, { "epoch": 2.5899147200593253, "grad_norm": 8.3203125, "learning_rate": 7.410085279940675e-06, "loss": 2.6845, "mean_token_accuracy": 0.49912556838055266, "step": 13970 }, { "epoch": 2.590100111234705, "grad_norm": 8.671875, "learning_rate": 7.409899888765295e-06, "loss": 2.7338, "mean_token_accuracy": 0.5214756967820116, "step": 13971 }, { "epoch": 2.5902855024100853, "grad_norm": 9.453125, "learning_rate": 7.409714497589915e-06, "loss": 3.0871, "mean_token_accuracy": 0.46781163434903045, "step": 13972 }, { "epoch": 2.5904708935854655, "grad_norm": 10.265625, "learning_rate": 7.409529106414536e-06, "loss": 2.7718, "mean_token_accuracy": 0.511468204916907, "step": 13973 }, { "epoch": 2.590656284760845, "grad_norm": 8.4765625, "learning_rate": 7.409343715239155e-06, "loss": 2.5169, "mean_token_accuracy": 0.5114715189873418, "step": 13974 }, { "epoch": 2.5908416759362254, "grad_norm": 7.74609375, "learning_rate": 7.409158324063776e-06, "loss": 3.3568, "mean_token_accuracy": 0.444529262086514, "step": 13975 }, { "epoch": 2.5910270671116056, "grad_norm": 10.8828125, "learning_rate": 7.4089729328883955e-06, "loss": 2.368, "mean_token_accuracy": 0.5223726627981947, "step": 13976 }, { "epoch": 2.5912124582869858, "grad_norm": 7.5234375, "learning_rate": 7.408787541713015e-06, "loss": 2.3371, "mean_token_accuracy": 0.5459699833240689, "step": 13977 }, { "epoch": 2.5913978494623655, "grad_norm": 7.89453125, "learning_rate": 7.408602150537635e-06, "loss": 2.256, "mean_token_accuracy": 0.5705033058966401, "step": 13978 }, { "epoch": 2.5915832406377457, "grad_norm": 10.109375, "learning_rate": 7.408416759362254e-06, "loss": 3.1308, "mean_token_accuracy": 0.43333333333333335, "step": 13979 }, { "epoch": 2.5917686318131254, "grad_norm": 11.203125, "learning_rate": 7.408231368186875e-06, "loss": 2.8321, "mean_token_accuracy": 0.4740951029098652, "step": 13980 }, { "epoch": 2.5919540229885056, "grad_norm": 8.0625, "learning_rate": 7.4080459770114945e-06, "loss": 2.9113, "mean_token_accuracy": 0.46781901372648704, "step": 13981 }, { "epoch": 2.592139414163886, "grad_norm": 8.984375, "learning_rate": 7.407860585836115e-06, "loss": 2.8399, "mean_token_accuracy": 0.48662827895073574, "step": 13982 }, { "epoch": 2.592324805339266, "grad_norm": 12.453125, "learning_rate": 7.407675194660735e-06, "loss": 2.8819, "mean_token_accuracy": 0.4713804713804714, "step": 13983 }, { "epoch": 2.5925101965146458, "grad_norm": 8.9765625, "learning_rate": 7.407489803485355e-06, "loss": 2.6188, "mean_token_accuracy": 0.4950351174618552, "step": 13984 }, { "epoch": 2.592695587690026, "grad_norm": 9.6875, "learning_rate": 7.407304412309975e-06, "loss": 2.9166, "mean_token_accuracy": 0.4851043865822191, "step": 13985 }, { "epoch": 2.592880978865406, "grad_norm": 10.671875, "learning_rate": 7.4071190211345944e-06, "loss": 2.4716, "mean_token_accuracy": 0.5270096463022508, "step": 13986 }, { "epoch": 2.593066370040786, "grad_norm": 7.70703125, "learning_rate": 7.406933629959214e-06, "loss": 3.4771, "mean_token_accuracy": 0.4183262209577999, "step": 13987 }, { "epoch": 2.593251761216166, "grad_norm": 7.5703125, "learning_rate": 7.406748238783834e-06, "loss": 2.8169, "mean_token_accuracy": 0.4788005803643398, "step": 13988 }, { "epoch": 2.5934371523915463, "grad_norm": 13.78125, "learning_rate": 7.406562847608454e-06, "loss": 3.0382, "mean_token_accuracy": 0.4695856137607506, "step": 13989 }, { "epoch": 2.5936225435669265, "grad_norm": 14.40625, "learning_rate": 7.406377456433075e-06, "loss": 2.449, "mean_token_accuracy": 0.5082574031890661, "step": 13990 }, { "epoch": 2.593807934742306, "grad_norm": 11.8828125, "learning_rate": 7.406192065257694e-06, "loss": 2.6672, "mean_token_accuracy": 0.4993322451402285, "step": 13991 }, { "epoch": 2.5939933259176864, "grad_norm": 8.2265625, "learning_rate": 7.406006674082315e-06, "loss": 2.9562, "mean_token_accuracy": 0.4866153846153846, "step": 13992 }, { "epoch": 2.594178717093066, "grad_norm": 9.7421875, "learning_rate": 7.4058212829069345e-06, "loss": 3.907, "mean_token_accuracy": 0.39424420229114276, "step": 13993 }, { "epoch": 2.5943641082684463, "grad_norm": 12.3046875, "learning_rate": 7.405635891731554e-06, "loss": 2.9289, "mean_token_accuracy": 0.47823428711176325, "step": 13994 }, { "epoch": 2.5945494994438265, "grad_norm": 7.328125, "learning_rate": 7.405450500556174e-06, "loss": 2.8235, "mean_token_accuracy": 0.46825521241573914, "step": 13995 }, { "epoch": 2.5947348906192067, "grad_norm": 9.90625, "learning_rate": 7.4052651093807934e-06, "loss": 2.6526, "mean_token_accuracy": 0.5003327787021631, "step": 13996 }, { "epoch": 2.5949202817945864, "grad_norm": 11.5, "learning_rate": 7.405079718205414e-06, "loss": 3.2845, "mean_token_accuracy": 0.4846711614298435, "step": 13997 }, { "epoch": 2.5951056729699666, "grad_norm": 13.0703125, "learning_rate": 7.404894327030034e-06, "loss": 2.832, "mean_token_accuracy": 0.485972850678733, "step": 13998 }, { "epoch": 2.595291064145347, "grad_norm": 7.4609375, "learning_rate": 7.404708935854654e-06, "loss": 2.9016, "mean_token_accuracy": 0.4855574812247256, "step": 13999 }, { "epoch": 2.5954764553207266, "grad_norm": 10.265625, "learning_rate": 7.404523544679274e-06, "loss": 2.9818, "mean_token_accuracy": 0.47647538754419366, "step": 14000 }, { "epoch": 2.5956618464961068, "grad_norm": 10.796875, "learning_rate": 7.404338153503894e-06, "loss": 3.0895, "mean_token_accuracy": 0.47282847517211074, "step": 14001 }, { "epoch": 2.595847237671487, "grad_norm": 8.6015625, "learning_rate": 7.404152762328514e-06, "loss": 2.5182, "mean_token_accuracy": 0.5162206607863656, "step": 14002 }, { "epoch": 2.596032628846867, "grad_norm": 6.34765625, "learning_rate": 7.4039673711531335e-06, "loss": 2.4964, "mean_token_accuracy": 0.5136678906568748, "step": 14003 }, { "epoch": 2.596218020022247, "grad_norm": 12.3515625, "learning_rate": 7.403781979977753e-06, "loss": 3.1507, "mean_token_accuracy": 0.48750215480089637, "step": 14004 }, { "epoch": 2.596403411197627, "grad_norm": 10.3515625, "learning_rate": 7.403596588802373e-06, "loss": 3.2799, "mean_token_accuracy": 0.45151093690732075, "step": 14005 }, { "epoch": 2.596588802373007, "grad_norm": 10.5390625, "learning_rate": 7.403411197626994e-06, "loss": 3.4013, "mean_token_accuracy": 0.4314648409641991, "step": 14006 }, { "epoch": 2.596774193548387, "grad_norm": 8.890625, "learning_rate": 7.403225806451614e-06, "loss": 3.4462, "mean_token_accuracy": 0.4637015781922525, "step": 14007 }, { "epoch": 2.596959584723767, "grad_norm": 7.609375, "learning_rate": 7.403040415276233e-06, "loss": 3.0131, "mean_token_accuracy": 0.5031438935912939, "step": 14008 }, { "epoch": 2.5971449758991474, "grad_norm": 12.3046875, "learning_rate": 7.402855024100853e-06, "loss": 2.436, "mean_token_accuracy": 0.5123674911660777, "step": 14009 }, { "epoch": 2.597330367074527, "grad_norm": 12.1171875, "learning_rate": 7.4026696329254735e-06, "loss": 2.6155, "mean_token_accuracy": 0.4810971089696071, "step": 14010 }, { "epoch": 2.5975157582499073, "grad_norm": 7.21484375, "learning_rate": 7.402484241750093e-06, "loss": 2.4449, "mean_token_accuracy": 0.51972401379931, "step": 14011 }, { "epoch": 2.5977011494252875, "grad_norm": 10.1796875, "learning_rate": 7.402298850574713e-06, "loss": 2.8226, "mean_token_accuracy": 0.48535980148883373, "step": 14012 }, { "epoch": 2.5978865406006673, "grad_norm": 13.8203125, "learning_rate": 7.4021134593993325e-06, "loss": 3.3392, "mean_token_accuracy": 0.4419678036699313, "step": 14013 }, { "epoch": 2.5980719317760474, "grad_norm": 11.96875, "learning_rate": 7.401928068223954e-06, "loss": 2.7862, "mean_token_accuracy": 0.4786916557240062, "step": 14014 }, { "epoch": 2.5982573229514276, "grad_norm": 8.6484375, "learning_rate": 7.4017426770485734e-06, "loss": 3.0507, "mean_token_accuracy": 0.4789346802892894, "step": 14015 }, { "epoch": 2.598442714126808, "grad_norm": 10.203125, "learning_rate": 7.401557285873193e-06, "loss": 3.0694, "mean_token_accuracy": 0.465343347639485, "step": 14016 }, { "epoch": 2.5986281053021876, "grad_norm": 13.859375, "learning_rate": 7.401371894697813e-06, "loss": 3.0966, "mean_token_accuracy": 0.47842153886762573, "step": 14017 }, { "epoch": 2.5988134964775678, "grad_norm": 9.21875, "learning_rate": 7.401186503522433e-06, "loss": 3.2729, "mean_token_accuracy": 0.47618315918869086, "step": 14018 }, { "epoch": 2.5989988876529475, "grad_norm": 7.78125, "learning_rate": 7.401001112347053e-06, "loss": 3.1098, "mean_token_accuracy": 0.44264356573983515, "step": 14019 }, { "epoch": 2.5991842788283277, "grad_norm": 13.6171875, "learning_rate": 7.4008157211716725e-06, "loss": 2.397, "mean_token_accuracy": 0.5088184590903063, "step": 14020 }, { "epoch": 2.599369670003708, "grad_norm": 16.046875, "learning_rate": 7.400630329996292e-06, "loss": 2.4798, "mean_token_accuracy": 0.510983629494893, "step": 14021 }, { "epoch": 2.599555061179088, "grad_norm": 7.57421875, "learning_rate": 7.4004449388209135e-06, "loss": 3.3235, "mean_token_accuracy": 0.4503038619878455, "step": 14022 }, { "epoch": 2.599740452354468, "grad_norm": 7.359375, "learning_rate": 7.400259547645533e-06, "loss": 2.757, "mean_token_accuracy": 0.48173076923076924, "step": 14023 }, { "epoch": 2.599925843529848, "grad_norm": 10.515625, "learning_rate": 7.400074156470153e-06, "loss": 2.8751, "mean_token_accuracy": 0.4902627107567638, "step": 14024 }, { "epoch": 2.6001112347052278, "grad_norm": 8.2109375, "learning_rate": 7.3998887652947724e-06, "loss": 2.72, "mean_token_accuracy": 0.5054747801112907, "step": 14025 }, { "epoch": 2.600296625880608, "grad_norm": 7.0234375, "learning_rate": 7.399703374119392e-06, "loss": 3.0959, "mean_token_accuracy": 0.4595917225950783, "step": 14026 }, { "epoch": 2.600482017055988, "grad_norm": 7.29296875, "learning_rate": 7.3995179829440126e-06, "loss": 2.6463, "mean_token_accuracy": 0.49626181407814923, "step": 14027 }, { "epoch": 2.6006674082313683, "grad_norm": 8.53125, "learning_rate": 7.399332591768632e-06, "loss": 3.1928, "mean_token_accuracy": 0.4684243565599498, "step": 14028 }, { "epoch": 2.600852799406748, "grad_norm": 9.34375, "learning_rate": 7.399147200593252e-06, "loss": 2.3868, "mean_token_accuracy": 0.5161152917790715, "step": 14029 }, { "epoch": 2.6010381905821283, "grad_norm": 8.7265625, "learning_rate": 7.398961809417873e-06, "loss": 3.1542, "mean_token_accuracy": 0.4465753424657534, "step": 14030 }, { "epoch": 2.6012235817575085, "grad_norm": 8.9765625, "learning_rate": 7.398776418242493e-06, "loss": 3.0323, "mean_token_accuracy": 0.48227725176013597, "step": 14031 }, { "epoch": 2.601408972932888, "grad_norm": 9.0390625, "learning_rate": 7.3985910270671125e-06, "loss": 2.2046, "mean_token_accuracy": 0.546595715650157, "step": 14032 }, { "epoch": 2.6015943641082684, "grad_norm": 9.09375, "learning_rate": 7.398405635891732e-06, "loss": 2.7814, "mean_token_accuracy": 0.4890779489537825, "step": 14033 }, { "epoch": 2.6017797552836486, "grad_norm": 7.03125, "learning_rate": 7.398220244716352e-06, "loss": 3.1435, "mean_token_accuracy": 0.4759066572381049, "step": 14034 }, { "epoch": 2.6019651464590288, "grad_norm": 7.26171875, "learning_rate": 7.398034853540972e-06, "loss": 3.458, "mean_token_accuracy": 0.41332504403688736, "step": 14035 }, { "epoch": 2.6021505376344085, "grad_norm": 8.5390625, "learning_rate": 7.397849462365592e-06, "loss": 3.0594, "mean_token_accuracy": 0.4686639497742042, "step": 14036 }, { "epoch": 2.6023359288097887, "grad_norm": 7.28515625, "learning_rate": 7.3976640711902115e-06, "loss": 3.0607, "mean_token_accuracy": 0.4781514830508475, "step": 14037 }, { "epoch": 2.6025213199851684, "grad_norm": 9.8828125, "learning_rate": 7.397478680014832e-06, "loss": 3.0082, "mean_token_accuracy": 0.48933431408606104, "step": 14038 }, { "epoch": 2.6027067111605486, "grad_norm": 7.30859375, "learning_rate": 7.3972932888394525e-06, "loss": 3.2307, "mean_token_accuracy": 0.4556137437731511, "step": 14039 }, { "epoch": 2.602892102335929, "grad_norm": 8.0859375, "learning_rate": 7.397107897664072e-06, "loss": 2.8999, "mean_token_accuracy": 0.48298091799896853, "step": 14040 }, { "epoch": 2.603077493511309, "grad_norm": 9.0, "learning_rate": 7.396922506488692e-06, "loss": 3.0363, "mean_token_accuracy": 0.48129145288449504, "step": 14041 }, { "epoch": 2.6032628846866888, "grad_norm": 9.59375, "learning_rate": 7.3967371153133115e-06, "loss": 2.9399, "mean_token_accuracy": 0.49319517476028457, "step": 14042 }, { "epoch": 2.603448275862069, "grad_norm": 7.625, "learning_rate": 7.396551724137931e-06, "loss": 2.8305, "mean_token_accuracy": 0.47251992305578455, "step": 14043 }, { "epoch": 2.603633667037449, "grad_norm": 9.328125, "learning_rate": 7.396366332962552e-06, "loss": 3.2587, "mean_token_accuracy": 0.45565980629539954, "step": 14044 }, { "epoch": 2.603819058212829, "grad_norm": 8.1796875, "learning_rate": 7.396180941787171e-06, "loss": 3.3015, "mean_token_accuracy": 0.4554052567909381, "step": 14045 }, { "epoch": 2.604004449388209, "grad_norm": 9.25, "learning_rate": 7.395995550611792e-06, "loss": 3.0196, "mean_token_accuracy": 0.4747941643796042, "step": 14046 }, { "epoch": 2.6041898405635893, "grad_norm": 7.23828125, "learning_rate": 7.395810159436411e-06, "loss": 3.1163, "mean_token_accuracy": 0.4877472322428163, "step": 14047 }, { "epoch": 2.6043752317389695, "grad_norm": 6.984375, "learning_rate": 7.395624768261032e-06, "loss": 2.8238, "mean_token_accuracy": 0.472082329756952, "step": 14048 }, { "epoch": 2.604560622914349, "grad_norm": 8.4453125, "learning_rate": 7.3954393770856515e-06, "loss": 2.5248, "mean_token_accuracy": 0.5030164092664092, "step": 14049 }, { "epoch": 2.6047460140897294, "grad_norm": 10.6875, "learning_rate": 7.395253985910271e-06, "loss": 3.261, "mean_token_accuracy": 0.4514126047811239, "step": 14050 }, { "epoch": 2.604931405265109, "grad_norm": 10.1328125, "learning_rate": 7.395068594734891e-06, "loss": 3.1654, "mean_token_accuracy": 0.4627264988447039, "step": 14051 }, { "epoch": 2.6051167964404893, "grad_norm": 8.0234375, "learning_rate": 7.3948832035595104e-06, "loss": 2.7106, "mean_token_accuracy": 0.5234604105571847, "step": 14052 }, { "epoch": 2.6053021876158695, "grad_norm": 8.2109375, "learning_rate": 7.394697812384131e-06, "loss": 2.7293, "mean_token_accuracy": 0.5092974056341165, "step": 14053 }, { "epoch": 2.6054875787912497, "grad_norm": 6.84375, "learning_rate": 7.3945124212087514e-06, "loss": 3.124, "mean_token_accuracy": 0.44587706146926537, "step": 14054 }, { "epoch": 2.6056729699666294, "grad_norm": 9.890625, "learning_rate": 7.394327030033371e-06, "loss": 3.0979, "mean_token_accuracy": 0.4720161834120027, "step": 14055 }, { "epoch": 2.6058583611420096, "grad_norm": 9.3671875, "learning_rate": 7.3941416388579916e-06, "loss": 2.7948, "mean_token_accuracy": 0.4816664476278092, "step": 14056 }, { "epoch": 2.60604375231739, "grad_norm": 7.7890625, "learning_rate": 7.393956247682611e-06, "loss": 2.7531, "mean_token_accuracy": 0.5115388491383654, "step": 14057 }, { "epoch": 2.6062291434927696, "grad_norm": 9.109375, "learning_rate": 7.393770856507231e-06, "loss": 2.9072, "mean_token_accuracy": 0.48580668343514194, "step": 14058 }, { "epoch": 2.6064145346681498, "grad_norm": 7.24609375, "learning_rate": 7.3935854653318505e-06, "loss": 2.9306, "mean_token_accuracy": 0.47118816930137686, "step": 14059 }, { "epoch": 2.60659992584353, "grad_norm": 8.859375, "learning_rate": 7.39340007415647e-06, "loss": 2.8998, "mean_token_accuracy": 0.4931021389697507, "step": 14060 }, { "epoch": 2.60678531701891, "grad_norm": 10.1796875, "learning_rate": 7.393214682981091e-06, "loss": 2.8502, "mean_token_accuracy": 0.4877284030706022, "step": 14061 }, { "epoch": 2.60697070819429, "grad_norm": 8.7890625, "learning_rate": 7.393029291805711e-06, "loss": 2.749, "mean_token_accuracy": 0.4812680115273775, "step": 14062 }, { "epoch": 2.60715609936967, "grad_norm": 6.31640625, "learning_rate": 7.392843900630331e-06, "loss": 3.3214, "mean_token_accuracy": 0.44415243101182655, "step": 14063 }, { "epoch": 2.60734149054505, "grad_norm": 13.8359375, "learning_rate": 7.39265850945495e-06, "loss": 2.5121, "mean_token_accuracy": 0.4973715651135006, "step": 14064 }, { "epoch": 2.60752688172043, "grad_norm": 9.2109375, "learning_rate": 7.392473118279571e-06, "loss": 3.4335, "mean_token_accuracy": 0.43648365802608274, "step": 14065 }, { "epoch": 2.60771227289581, "grad_norm": 7.5546875, "learning_rate": 7.3922877271041905e-06, "loss": 2.9448, "mean_token_accuracy": 0.5109794353433252, "step": 14066 }, { "epoch": 2.6078976640711904, "grad_norm": 8.2421875, "learning_rate": 7.39210233592881e-06, "loss": 2.8468, "mean_token_accuracy": 0.4609544468546638, "step": 14067 }, { "epoch": 2.60808305524657, "grad_norm": 7.1171875, "learning_rate": 7.39191694475343e-06, "loss": 3.2203, "mean_token_accuracy": 0.4631093544137022, "step": 14068 }, { "epoch": 2.6082684464219503, "grad_norm": 7.50390625, "learning_rate": 7.3917315535780495e-06, "loss": 2.4313, "mean_token_accuracy": 0.540004638218924, "step": 14069 }, { "epoch": 2.6084538375973305, "grad_norm": 7.33203125, "learning_rate": 7.391546162402671e-06, "loss": 2.9706, "mean_token_accuracy": 0.4758477677482313, "step": 14070 }, { "epoch": 2.6086392287727103, "grad_norm": 8.0, "learning_rate": 7.3913607712272905e-06, "loss": 2.6613, "mean_token_accuracy": 0.515305490147478, "step": 14071 }, { "epoch": 2.6088246199480905, "grad_norm": 6.21484375, "learning_rate": 7.39117538005191e-06, "loss": 2.756, "mean_token_accuracy": 0.4924379915305505, "step": 14072 }, { "epoch": 2.6090100111234706, "grad_norm": 7.37109375, "learning_rate": 7.390989988876531e-06, "loss": 2.9131, "mean_token_accuracy": 0.5037259615384615, "step": 14073 }, { "epoch": 2.609195402298851, "grad_norm": 8.921875, "learning_rate": 7.39080459770115e-06, "loss": 3.4159, "mean_token_accuracy": 0.43231334149326806, "step": 14074 }, { "epoch": 2.6093807934742306, "grad_norm": 7.32421875, "learning_rate": 7.39061920652577e-06, "loss": 3.4893, "mean_token_accuracy": 0.43875636558563386, "step": 14075 }, { "epoch": 2.6095661846496108, "grad_norm": 7.17578125, "learning_rate": 7.3904338153503895e-06, "loss": 3.0812, "mean_token_accuracy": 0.45950238221281103, "step": 14076 }, { "epoch": 2.6097515758249905, "grad_norm": 11.4296875, "learning_rate": 7.390248424175009e-06, "loss": 2.2841, "mean_token_accuracy": 0.5572435050638486, "step": 14077 }, { "epoch": 2.6099369670003707, "grad_norm": 9.390625, "learning_rate": 7.3900630329996305e-06, "loss": 2.4703, "mean_token_accuracy": 0.4871969955616251, "step": 14078 }, { "epoch": 2.610122358175751, "grad_norm": 9.4375, "learning_rate": 7.38987764182425e-06, "loss": 3.2738, "mean_token_accuracy": 0.4497816593886463, "step": 14079 }, { "epoch": 2.610307749351131, "grad_norm": 8.5, "learning_rate": 7.38969225064887e-06, "loss": 2.8264, "mean_token_accuracy": 0.4925914726338071, "step": 14080 }, { "epoch": 2.610493140526511, "grad_norm": 10.421875, "learning_rate": 7.3895068594734894e-06, "loss": 3.2832, "mean_token_accuracy": 0.4251590289497598, "step": 14081 }, { "epoch": 2.610678531701891, "grad_norm": 11.03125, "learning_rate": 7.38932146829811e-06, "loss": 2.1693, "mean_token_accuracy": 0.5547879689658838, "step": 14082 }, { "epoch": 2.610863922877271, "grad_norm": 11.1953125, "learning_rate": 7.38913607712273e-06, "loss": 2.8699, "mean_token_accuracy": 0.48881880733944955, "step": 14083 }, { "epoch": 2.611049314052651, "grad_norm": 8.75, "learning_rate": 7.388950685947349e-06, "loss": 2.6956, "mean_token_accuracy": 0.48306852832611136, "step": 14084 }, { "epoch": 2.611234705228031, "grad_norm": 9.3203125, "learning_rate": 7.388765294771969e-06, "loss": 2.465, "mean_token_accuracy": 0.5628022172425994, "step": 14085 }, { "epoch": 2.6114200964034113, "grad_norm": 9.28125, "learning_rate": 7.38857990359659e-06, "loss": 3.0678, "mean_token_accuracy": 0.4595360824742268, "step": 14086 }, { "epoch": 2.6116054875787915, "grad_norm": 9.21875, "learning_rate": 7.38839451242121e-06, "loss": 3.0063, "mean_token_accuracy": 0.47842862153630306, "step": 14087 }, { "epoch": 2.6117908787541713, "grad_norm": 8.9765625, "learning_rate": 7.3882091212458295e-06, "loss": 2.9347, "mean_token_accuracy": 0.4759678597516435, "step": 14088 }, { "epoch": 2.6119762699295515, "grad_norm": 10.640625, "learning_rate": 7.388023730070449e-06, "loss": 3.6015, "mean_token_accuracy": 0.4211740579338631, "step": 14089 }, { "epoch": 2.612161661104931, "grad_norm": 8.7109375, "learning_rate": 7.387838338895069e-06, "loss": 3.1343, "mean_token_accuracy": 0.4515026517383618, "step": 14090 }, { "epoch": 2.6123470522803114, "grad_norm": 10.09375, "learning_rate": 7.387652947719689e-06, "loss": 2.8525, "mean_token_accuracy": 0.4834376630151278, "step": 14091 }, { "epoch": 2.6125324434556916, "grad_norm": 9.6328125, "learning_rate": 7.387467556544309e-06, "loss": 2.4331, "mean_token_accuracy": 0.5052425555710891, "step": 14092 }, { "epoch": 2.6127178346310718, "grad_norm": 8.75, "learning_rate": 7.3872821653689286e-06, "loss": 2.7591, "mean_token_accuracy": 0.5040108975329196, "step": 14093 }, { "epoch": 2.6129032258064515, "grad_norm": 7.13671875, "learning_rate": 7.38709677419355e-06, "loss": 2.5322, "mean_token_accuracy": 0.5232524552281917, "step": 14094 }, { "epoch": 2.6130886169818317, "grad_norm": 8.953125, "learning_rate": 7.3869113830181695e-06, "loss": 2.4001, "mean_token_accuracy": 0.5013999066728885, "step": 14095 }, { "epoch": 2.6132740081572114, "grad_norm": 6.9921875, "learning_rate": 7.386725991842789e-06, "loss": 3.086, "mean_token_accuracy": 0.4639605462822458, "step": 14096 }, { "epoch": 2.6134593993325916, "grad_norm": 9.1171875, "learning_rate": 7.386540600667409e-06, "loss": 2.9732, "mean_token_accuracy": 0.47910330066463896, "step": 14097 }, { "epoch": 2.613644790507972, "grad_norm": 9.609375, "learning_rate": 7.3863552094920285e-06, "loss": 3.4957, "mean_token_accuracy": 0.4449487744780184, "step": 14098 }, { "epoch": 2.613830181683352, "grad_norm": 7.9921875, "learning_rate": 7.386169818316649e-06, "loss": 3.2253, "mean_token_accuracy": 0.46657160963244615, "step": 14099 }, { "epoch": 2.6140155728587318, "grad_norm": 12.3671875, "learning_rate": 7.385984427141269e-06, "loss": 2.1501, "mean_token_accuracy": 0.5361332707648991, "step": 14100 }, { "epoch": 2.614200964034112, "grad_norm": 9.9765625, "learning_rate": 7.385799035965888e-06, "loss": 2.635, "mean_token_accuracy": 0.5020650490449148, "step": 14101 }, { "epoch": 2.614386355209492, "grad_norm": 7.86328125, "learning_rate": 7.385613644790509e-06, "loss": 2.2437, "mean_token_accuracy": 0.5814469596094097, "step": 14102 }, { "epoch": 2.614571746384872, "grad_norm": 6.90625, "learning_rate": 7.385428253615129e-06, "loss": 3.1955, "mean_token_accuracy": 0.457201646090535, "step": 14103 }, { "epoch": 2.614757137560252, "grad_norm": 10.546875, "learning_rate": 7.385242862439749e-06, "loss": 3.0441, "mean_token_accuracy": 0.46887694928580786, "step": 14104 }, { "epoch": 2.6149425287356323, "grad_norm": 10.1953125, "learning_rate": 7.3850574712643685e-06, "loss": 2.9211, "mean_token_accuracy": 0.4647692143694702, "step": 14105 }, { "epoch": 2.6151279199110125, "grad_norm": 7.98828125, "learning_rate": 7.384872080088988e-06, "loss": 2.8921, "mean_token_accuracy": 0.4710591941199633, "step": 14106 }, { "epoch": 2.615313311086392, "grad_norm": 6.73046875, "learning_rate": 7.384686688913608e-06, "loss": 3.0889, "mean_token_accuracy": 0.45594202898550723, "step": 14107 }, { "epoch": 2.6154987022617724, "grad_norm": 10.453125, "learning_rate": 7.384501297738228e-06, "loss": 3.0201, "mean_token_accuracy": 0.4977744807121662, "step": 14108 }, { "epoch": 2.615684093437152, "grad_norm": 8.5390625, "learning_rate": 7.384315906562848e-06, "loss": 3.3037, "mean_token_accuracy": 0.4407453416149068, "step": 14109 }, { "epoch": 2.6158694846125323, "grad_norm": 14.578125, "learning_rate": 7.384130515387468e-06, "loss": 2.791, "mean_token_accuracy": 0.47880299251870323, "step": 14110 }, { "epoch": 2.6160548757879125, "grad_norm": 7.4765625, "learning_rate": 7.383945124212089e-06, "loss": 2.7112, "mean_token_accuracy": 0.5003955249180698, "step": 14111 }, { "epoch": 2.6162402669632927, "grad_norm": 9.3125, "learning_rate": 7.383759733036709e-06, "loss": 3.2263, "mean_token_accuracy": 0.46655400647796086, "step": 14112 }, { "epoch": 2.6164256581386724, "grad_norm": 6.57421875, "learning_rate": 7.383574341861328e-06, "loss": 2.8133, "mean_token_accuracy": 0.4804353816478369, "step": 14113 }, { "epoch": 2.6166110493140526, "grad_norm": 7.140625, "learning_rate": 7.383388950685948e-06, "loss": 2.8674, "mean_token_accuracy": 0.47397744258730806, "step": 14114 }, { "epoch": 2.616796440489433, "grad_norm": 8.25, "learning_rate": 7.3832035595105675e-06, "loss": 3.6098, "mean_token_accuracy": 0.4327720207253886, "step": 14115 }, { "epoch": 2.6169818316648126, "grad_norm": 13.125, "learning_rate": 7.383018168335188e-06, "loss": 2.9063, "mean_token_accuracy": 0.5001217730150999, "step": 14116 }, { "epoch": 2.6171672228401928, "grad_norm": 7.3203125, "learning_rate": 7.382832777159808e-06, "loss": 2.3687, "mean_token_accuracy": 0.5429168532616808, "step": 14117 }, { "epoch": 2.617352614015573, "grad_norm": 8.5078125, "learning_rate": 7.382647385984427e-06, "loss": 3.1127, "mean_token_accuracy": 0.46412920996619506, "step": 14118 }, { "epoch": 2.617538005190953, "grad_norm": 9.546875, "learning_rate": 7.382461994809048e-06, "loss": 2.7468, "mean_token_accuracy": 0.4847380982288657, "step": 14119 }, { "epoch": 2.617723396366333, "grad_norm": 7.12890625, "learning_rate": 7.382276603633668e-06, "loss": 2.9983, "mean_token_accuracy": 0.46207232436381346, "step": 14120 }, { "epoch": 2.617908787541713, "grad_norm": 6.93359375, "learning_rate": 7.382091212458288e-06, "loss": 3.0181, "mean_token_accuracy": 0.47529831604773054, "step": 14121 }, { "epoch": 2.618094178717093, "grad_norm": 14.5390625, "learning_rate": 7.3819058212829076e-06, "loss": 2.5823, "mean_token_accuracy": 0.5047941342357586, "step": 14122 }, { "epoch": 2.618279569892473, "grad_norm": 9.6171875, "learning_rate": 7.381720430107527e-06, "loss": 3.4154, "mean_token_accuracy": 0.4461909132629647, "step": 14123 }, { "epoch": 2.618464961067853, "grad_norm": 7.53125, "learning_rate": 7.381535038932147e-06, "loss": 2.7958, "mean_token_accuracy": 0.4945054945054945, "step": 14124 }, { "epoch": 2.6186503522432334, "grad_norm": 8.6640625, "learning_rate": 7.381349647756767e-06, "loss": 2.6396, "mean_token_accuracy": 0.49286304117022534, "step": 14125 }, { "epoch": 2.618835743418613, "grad_norm": 6.7578125, "learning_rate": 7.381164256581387e-06, "loss": 3.0076, "mean_token_accuracy": 0.4779585410653372, "step": 14126 }, { "epoch": 2.6190211345939933, "grad_norm": 7.390625, "learning_rate": 7.3809788654060075e-06, "loss": 3.3571, "mean_token_accuracy": 0.44035304166271233, "step": 14127 }, { "epoch": 2.6192065257693735, "grad_norm": 7.13671875, "learning_rate": 7.380793474230627e-06, "loss": 2.7956, "mean_token_accuracy": 0.4814137023904838, "step": 14128 }, { "epoch": 2.6193919169447533, "grad_norm": 7.36328125, "learning_rate": 7.380608083055248e-06, "loss": 2.483, "mean_token_accuracy": 0.5256426491072559, "step": 14129 }, { "epoch": 2.6195773081201335, "grad_norm": 6.87109375, "learning_rate": 7.380422691879867e-06, "loss": 2.672, "mean_token_accuracy": 0.5210253456221198, "step": 14130 }, { "epoch": 2.6197626992955136, "grad_norm": 8.8359375, "learning_rate": 7.380237300704487e-06, "loss": 2.2239, "mean_token_accuracy": 0.5228364156534178, "step": 14131 }, { "epoch": 2.619948090470894, "grad_norm": 9.5078125, "learning_rate": 7.3800519095291066e-06, "loss": 2.4832, "mean_token_accuracy": 0.5028207724576884, "step": 14132 }, { "epoch": 2.6201334816462736, "grad_norm": 7.453125, "learning_rate": 7.379866518353726e-06, "loss": 2.8541, "mean_token_accuracy": 0.4826792963464141, "step": 14133 }, { "epoch": 2.6203188728216538, "grad_norm": 10.4296875, "learning_rate": 7.379681127178347e-06, "loss": 2.5243, "mean_token_accuracy": 0.5446159007044616, "step": 14134 }, { "epoch": 2.6205042639970335, "grad_norm": 7.94140625, "learning_rate": 7.379495736002967e-06, "loss": 3.0413, "mean_token_accuracy": 0.4830739992873263, "step": 14135 }, { "epoch": 2.6206896551724137, "grad_norm": 7.69921875, "learning_rate": 7.379310344827587e-06, "loss": 2.5266, "mean_token_accuracy": 0.5210634472129463, "step": 14136 }, { "epoch": 2.620875046347794, "grad_norm": 9.78125, "learning_rate": 7.379124953652207e-06, "loss": 3.4824, "mean_token_accuracy": 0.4475969889982629, "step": 14137 }, { "epoch": 2.621060437523174, "grad_norm": 7.97265625, "learning_rate": 7.378939562476827e-06, "loss": 2.5992, "mean_token_accuracy": 0.5103920907763173, "step": 14138 }, { "epoch": 2.621245828698554, "grad_norm": 8.3046875, "learning_rate": 7.378754171301447e-06, "loss": 3.7747, "mean_token_accuracy": 0.42805100182149364, "step": 14139 }, { "epoch": 2.621431219873934, "grad_norm": 7.296875, "learning_rate": 7.378568780126066e-06, "loss": 3.4326, "mean_token_accuracy": 0.4532384548358989, "step": 14140 }, { "epoch": 2.621616611049314, "grad_norm": 7.5234375, "learning_rate": 7.378383388950686e-06, "loss": 3.032, "mean_token_accuracy": 0.4861598988337783, "step": 14141 }, { "epoch": 2.621802002224694, "grad_norm": 8.9296875, "learning_rate": 7.378197997775306e-06, "loss": 2.2753, "mean_token_accuracy": 0.5338488994646045, "step": 14142 }, { "epoch": 2.621987393400074, "grad_norm": 7.21875, "learning_rate": 7.378012606599927e-06, "loss": 2.4876, "mean_token_accuracy": 0.5226213326021387, "step": 14143 }, { "epoch": 2.6221727845754543, "grad_norm": 8.359375, "learning_rate": 7.3778272154245465e-06, "loss": 3.0206, "mean_token_accuracy": 0.45583472920156337, "step": 14144 }, { "epoch": 2.6223581757508345, "grad_norm": 8.0234375, "learning_rate": 7.377641824249166e-06, "loss": 3.1258, "mean_token_accuracy": 0.4807268007787151, "step": 14145 }, { "epoch": 2.6225435669262143, "grad_norm": 7.26171875, "learning_rate": 7.377456433073787e-06, "loss": 3.2833, "mean_token_accuracy": 0.4493281712593942, "step": 14146 }, { "epoch": 2.6227289581015945, "grad_norm": 7.6328125, "learning_rate": 7.377271041898406e-06, "loss": 3.1788, "mean_token_accuracy": 0.49473420260782347, "step": 14147 }, { "epoch": 2.622914349276974, "grad_norm": 7.03125, "learning_rate": 7.377085650723026e-06, "loss": 2.8422, "mean_token_accuracy": 0.4773832326506557, "step": 14148 }, { "epoch": 2.6230997404523544, "grad_norm": 8.3203125, "learning_rate": 7.376900259547646e-06, "loss": 3.5837, "mean_token_accuracy": 0.443391757460919, "step": 14149 }, { "epoch": 2.6232851316277346, "grad_norm": 6.95703125, "learning_rate": 7.376714868372265e-06, "loss": 3.4642, "mean_token_accuracy": 0.4074117785908179, "step": 14150 }, { "epoch": 2.6234705228031148, "grad_norm": 10.9609375, "learning_rate": 7.3765294771968866e-06, "loss": 2.7694, "mean_token_accuracy": 0.4804005112910098, "step": 14151 }, { "epoch": 2.6236559139784945, "grad_norm": 7.53515625, "learning_rate": 7.376344086021506e-06, "loss": 3.2535, "mean_token_accuracy": 0.4327240416349327, "step": 14152 }, { "epoch": 2.6238413051538747, "grad_norm": 7.03515625, "learning_rate": 7.376158694846126e-06, "loss": 2.7118, "mean_token_accuracy": 0.485350897115603, "step": 14153 }, { "epoch": 2.624026696329255, "grad_norm": 7.578125, "learning_rate": 7.375973303670746e-06, "loss": 3.4201, "mean_token_accuracy": 0.44834893166166345, "step": 14154 }, { "epoch": 2.6242120875046346, "grad_norm": 11.5703125, "learning_rate": 7.375787912495366e-06, "loss": 2.9165, "mean_token_accuracy": 0.48405880106911037, "step": 14155 }, { "epoch": 2.624397478680015, "grad_norm": 7.60546875, "learning_rate": 7.375602521319986e-06, "loss": 2.5367, "mean_token_accuracy": 0.5033240997229916, "step": 14156 }, { "epoch": 2.624582869855395, "grad_norm": 9.5546875, "learning_rate": 7.375417130144605e-06, "loss": 4.0943, "mean_token_accuracy": 0.4101567920077702, "step": 14157 }, { "epoch": 2.624768261030775, "grad_norm": 9.1640625, "learning_rate": 7.375231738969225e-06, "loss": 2.7376, "mean_token_accuracy": 0.5143303064699205, "step": 14158 }, { "epoch": 2.624953652206155, "grad_norm": 12.484375, "learning_rate": 7.375046347793846e-06, "loss": 2.9806, "mean_token_accuracy": 0.4590370955011839, "step": 14159 }, { "epoch": 2.625139043381535, "grad_norm": 10.953125, "learning_rate": 7.374860956618466e-06, "loss": 2.5853, "mean_token_accuracy": 0.508390918065153, "step": 14160 }, { "epoch": 2.625324434556915, "grad_norm": 8.625, "learning_rate": 7.3746755654430856e-06, "loss": 3.1424, "mean_token_accuracy": 0.4803276487176044, "step": 14161 }, { "epoch": 2.625509825732295, "grad_norm": 13.46875, "learning_rate": 7.374490174267705e-06, "loss": 3.0, "mean_token_accuracy": 0.47415692255549907, "step": 14162 }, { "epoch": 2.6256952169076753, "grad_norm": 10.234375, "learning_rate": 7.374304783092326e-06, "loss": 2.809, "mean_token_accuracy": 0.501952819871895, "step": 14163 }, { "epoch": 2.6258806080830555, "grad_norm": 8.3828125, "learning_rate": 7.374119391916945e-06, "loss": 3.6695, "mean_token_accuracy": 0.44614612101431084, "step": 14164 }, { "epoch": 2.626065999258435, "grad_norm": 10.3359375, "learning_rate": 7.373934000741565e-06, "loss": 3.1726, "mean_token_accuracy": 0.4530751977334435, "step": 14165 }, { "epoch": 2.6262513904338154, "grad_norm": 10.40625, "learning_rate": 7.373748609566185e-06, "loss": 2.3665, "mean_token_accuracy": 0.5176578856551248, "step": 14166 }, { "epoch": 2.626436781609195, "grad_norm": 7.8125, "learning_rate": 7.373563218390806e-06, "loss": 2.9746, "mean_token_accuracy": 0.4461044401005306, "step": 14167 }, { "epoch": 2.6266221727845753, "grad_norm": 11.2421875, "learning_rate": 7.373377827215426e-06, "loss": 2.6354, "mean_token_accuracy": 0.5139511981617245, "step": 14168 }, { "epoch": 2.6268075639599555, "grad_norm": 10.1796875, "learning_rate": 7.373192436040045e-06, "loss": 2.5387, "mean_token_accuracy": 0.5271895285729488, "step": 14169 }, { "epoch": 2.6269929551353357, "grad_norm": 7.05078125, "learning_rate": 7.373007044864665e-06, "loss": 2.6464, "mean_token_accuracy": 0.49543446244477174, "step": 14170 }, { "epoch": 2.6271783463107155, "grad_norm": 7.015625, "learning_rate": 7.3728216536892845e-06, "loss": 3.1725, "mean_token_accuracy": 0.45266818824040866, "step": 14171 }, { "epoch": 2.6273637374860956, "grad_norm": 8.9296875, "learning_rate": 7.372636262513905e-06, "loss": 3.0672, "mean_token_accuracy": 0.4473927392739274, "step": 14172 }, { "epoch": 2.627549128661476, "grad_norm": 8.5859375, "learning_rate": 7.372450871338525e-06, "loss": 3.082, "mean_token_accuracy": 0.49716171617161714, "step": 14173 }, { "epoch": 2.6277345198368556, "grad_norm": 6.37890625, "learning_rate": 7.372265480163144e-06, "loss": 2.8405, "mean_token_accuracy": 0.5161555875694588, "step": 14174 }, { "epoch": 2.6279199110122358, "grad_norm": 7.0078125, "learning_rate": 7.372080088987766e-06, "loss": 2.6976, "mean_token_accuracy": 0.5046668812359189, "step": 14175 }, { "epoch": 2.628105302187616, "grad_norm": 8.3828125, "learning_rate": 7.371894697812385e-06, "loss": 3.4402, "mean_token_accuracy": 0.4577408103031929, "step": 14176 }, { "epoch": 2.628290693362996, "grad_norm": 12.5078125, "learning_rate": 7.371709306637005e-06, "loss": 3.0743, "mean_token_accuracy": 0.5206190343805023, "step": 14177 }, { "epoch": 2.628476084538376, "grad_norm": 6.7265625, "learning_rate": 7.371523915461625e-06, "loss": 3.1043, "mean_token_accuracy": 0.4465770953294946, "step": 14178 }, { "epoch": 2.628661475713756, "grad_norm": 6.87890625, "learning_rate": 7.371338524286244e-06, "loss": 2.5344, "mean_token_accuracy": 0.5078745281790967, "step": 14179 }, { "epoch": 2.628846866889136, "grad_norm": 7.4453125, "learning_rate": 7.371153133110865e-06, "loss": 3.0331, "mean_token_accuracy": 0.44755434782608694, "step": 14180 }, { "epoch": 2.629032258064516, "grad_norm": 7.66796875, "learning_rate": 7.370967741935484e-06, "loss": 3.2128, "mean_token_accuracy": 0.4474716202270382, "step": 14181 }, { "epoch": 2.629217649239896, "grad_norm": 7.6328125, "learning_rate": 7.370782350760104e-06, "loss": 3.011, "mean_token_accuracy": 0.4665823076038192, "step": 14182 }, { "epoch": 2.6294030404152764, "grad_norm": 6.76171875, "learning_rate": 7.3705969595847245e-06, "loss": 2.8592, "mean_token_accuracy": 0.4823046134400674, "step": 14183 }, { "epoch": 2.629588431590656, "grad_norm": 7.72265625, "learning_rate": 7.370411568409345e-06, "loss": 3.0971, "mean_token_accuracy": 0.44139886578449905, "step": 14184 }, { "epoch": 2.6297738227660363, "grad_norm": 12.2890625, "learning_rate": 7.370226177233965e-06, "loss": 2.4161, "mean_token_accuracy": 0.5474780701754386, "step": 14185 }, { "epoch": 2.6299592139414165, "grad_norm": 8.046875, "learning_rate": 7.370040786058584e-06, "loss": 2.9119, "mean_token_accuracy": 0.4832852536908196, "step": 14186 }, { "epoch": 2.6301446051167963, "grad_norm": 10.8671875, "learning_rate": 7.369855394883204e-06, "loss": 3.7396, "mean_token_accuracy": 0.4427446903312045, "step": 14187 }, { "epoch": 2.6303299962921765, "grad_norm": 11.3671875, "learning_rate": 7.3696700037078236e-06, "loss": 2.7694, "mean_token_accuracy": 0.478134110787172, "step": 14188 }, { "epoch": 2.6305153874675566, "grad_norm": 7.8828125, "learning_rate": 7.369484612532444e-06, "loss": 2.5305, "mean_token_accuracy": 0.5245682315738264, "step": 14189 }, { "epoch": 2.630700778642937, "grad_norm": 9.640625, "learning_rate": 7.369299221357064e-06, "loss": 3.1076, "mean_token_accuracy": 0.4753491703473797, "step": 14190 }, { "epoch": 2.6308861698183166, "grad_norm": 10.2734375, "learning_rate": 7.369113830181684e-06, "loss": 3.0144, "mean_token_accuracy": 0.4603174603174603, "step": 14191 }, { "epoch": 2.6310715609936968, "grad_norm": 9.0, "learning_rate": 7.368928439006305e-06, "loss": 2.7946, "mean_token_accuracy": 0.48688729316266, "step": 14192 }, { "epoch": 2.6312569521690765, "grad_norm": 10.6015625, "learning_rate": 7.368743047830924e-06, "loss": 2.7617, "mean_token_accuracy": 0.48682688600436963, "step": 14193 }, { "epoch": 2.6314423433444567, "grad_norm": 8.578125, "learning_rate": 7.368557656655544e-06, "loss": 2.6715, "mean_token_accuracy": 0.48141957052848144, "step": 14194 }, { "epoch": 2.631627734519837, "grad_norm": 7.48046875, "learning_rate": 7.368372265480164e-06, "loss": 2.9129, "mean_token_accuracy": 0.46754134208008746, "step": 14195 }, { "epoch": 2.631813125695217, "grad_norm": 7.58984375, "learning_rate": 7.368186874304783e-06, "loss": 3.2122, "mean_token_accuracy": 0.4572068707991038, "step": 14196 }, { "epoch": 2.631998516870597, "grad_norm": 7.046875, "learning_rate": 7.368001483129403e-06, "loss": 2.9541, "mean_token_accuracy": 0.47368421052631576, "step": 14197 }, { "epoch": 2.632183908045977, "grad_norm": 9.3125, "learning_rate": 7.367816091954023e-06, "loss": 4.3136, "mean_token_accuracy": 0.4010071561091969, "step": 14198 }, { "epoch": 2.632369299221357, "grad_norm": 9.1796875, "learning_rate": 7.367630700778644e-06, "loss": 2.7876, "mean_token_accuracy": 0.503665200586432, "step": 14199 }, { "epoch": 2.632554690396737, "grad_norm": 8.359375, "learning_rate": 7.3674453096032635e-06, "loss": 3.2459, "mean_token_accuracy": 0.4617397998460354, "step": 14200 }, { "epoch": 2.632740081572117, "grad_norm": 7.1015625, "learning_rate": 7.367259918427884e-06, "loss": 3.0194, "mean_token_accuracy": 0.463554667998003, "step": 14201 }, { "epoch": 2.6329254727474973, "grad_norm": 8.640625, "learning_rate": 7.367074527252504e-06, "loss": 2.708, "mean_token_accuracy": 0.5133768135232264, "step": 14202 }, { "epoch": 2.6331108639228775, "grad_norm": 7.33203125, "learning_rate": 7.366889136077123e-06, "loss": 2.788, "mean_token_accuracy": 0.48757342973339357, "step": 14203 }, { "epoch": 2.6332962550982573, "grad_norm": 9.3046875, "learning_rate": 7.366703744901743e-06, "loss": 3.731, "mean_token_accuracy": 0.428777166797591, "step": 14204 }, { "epoch": 2.6334816462736375, "grad_norm": 8.0234375, "learning_rate": 7.366518353726363e-06, "loss": 3.1002, "mean_token_accuracy": 0.4542433891734736, "step": 14205 }, { "epoch": 2.633667037449017, "grad_norm": 6.9140625, "learning_rate": 7.366332962550983e-06, "loss": 3.0576, "mean_token_accuracy": 0.4557739557739558, "step": 14206 }, { "epoch": 2.6338524286243974, "grad_norm": 6.69921875, "learning_rate": 7.366147571375604e-06, "loss": 2.7178, "mean_token_accuracy": 0.48120394654677157, "step": 14207 }, { "epoch": 2.6340378197997776, "grad_norm": 6.73046875, "learning_rate": 7.365962180200223e-06, "loss": 2.5909, "mean_token_accuracy": 0.5094905094905094, "step": 14208 }, { "epoch": 2.6342232109751578, "grad_norm": 7.08984375, "learning_rate": 7.365776789024843e-06, "loss": 3.205, "mean_token_accuracy": 0.4548702811868883, "step": 14209 }, { "epoch": 2.6344086021505375, "grad_norm": 7.65625, "learning_rate": 7.365591397849463e-06, "loss": 2.9416, "mean_token_accuracy": 0.4600360576923077, "step": 14210 }, { "epoch": 2.6345939933259177, "grad_norm": 6.58984375, "learning_rate": 7.365406006674083e-06, "loss": 2.8407, "mean_token_accuracy": 0.47643459566727303, "step": 14211 }, { "epoch": 2.634779384501298, "grad_norm": 8.2890625, "learning_rate": 7.365220615498703e-06, "loss": 2.7153, "mean_token_accuracy": 0.5127450980392156, "step": 14212 }, { "epoch": 2.6349647756766776, "grad_norm": 8.3671875, "learning_rate": 7.365035224323322e-06, "loss": 2.1452, "mean_token_accuracy": 0.5376921928646067, "step": 14213 }, { "epoch": 2.635150166852058, "grad_norm": 6.765625, "learning_rate": 7.364849833147942e-06, "loss": 2.4932, "mean_token_accuracy": 0.5292682926829269, "step": 14214 }, { "epoch": 2.635335558027438, "grad_norm": 6.91796875, "learning_rate": 7.364664441972563e-06, "loss": 3.4865, "mean_token_accuracy": 0.4329971181556196, "step": 14215 }, { "epoch": 2.635520949202818, "grad_norm": 7.69140625, "learning_rate": 7.364479050797183e-06, "loss": 3.0733, "mean_token_accuracy": 0.49531893646236425, "step": 14216 }, { "epoch": 2.635706340378198, "grad_norm": 7.93359375, "learning_rate": 7.3642936596218026e-06, "loss": 2.7227, "mean_token_accuracy": 0.49695307739183425, "step": 14217 }, { "epoch": 2.635891731553578, "grad_norm": 6.63671875, "learning_rate": 7.364108268446423e-06, "loss": 2.395, "mean_token_accuracy": 0.5192061459667093, "step": 14218 }, { "epoch": 2.636077122728958, "grad_norm": 7.32421875, "learning_rate": 7.363922877271043e-06, "loss": 2.3654, "mean_token_accuracy": 0.5334760012228676, "step": 14219 }, { "epoch": 2.636262513904338, "grad_norm": 9.125, "learning_rate": 7.363737486095662e-06, "loss": 2.8788, "mean_token_accuracy": 0.47878128400435255, "step": 14220 }, { "epoch": 2.6364479050797183, "grad_norm": 7.75390625, "learning_rate": 7.363552094920282e-06, "loss": 2.9765, "mean_token_accuracy": 0.4730473047304731, "step": 14221 }, { "epoch": 2.6366332962550985, "grad_norm": 9.390625, "learning_rate": 7.363366703744902e-06, "loss": 3.1158, "mean_token_accuracy": 0.4769910243624448, "step": 14222 }, { "epoch": 2.636818687430478, "grad_norm": 8.796875, "learning_rate": 7.363181312569523e-06, "loss": 2.7525, "mean_token_accuracy": 0.49052229646802126, "step": 14223 }, { "epoch": 2.6370040786058584, "grad_norm": 7.0703125, "learning_rate": 7.362995921394143e-06, "loss": 2.7399, "mean_token_accuracy": 0.507840038896317, "step": 14224 }, { "epoch": 2.637189469781238, "grad_norm": 6.20703125, "learning_rate": 7.362810530218762e-06, "loss": 2.2014, "mean_token_accuracy": 0.5724515166583789, "step": 14225 }, { "epoch": 2.6373748609566183, "grad_norm": 8.5, "learning_rate": 7.362625139043382e-06, "loss": 3.4631, "mean_token_accuracy": 0.4371327849588719, "step": 14226 }, { "epoch": 2.6375602521319985, "grad_norm": 7.25390625, "learning_rate": 7.362439747868002e-06, "loss": 2.9858, "mean_token_accuracy": 0.4936321134934709, "step": 14227 }, { "epoch": 2.6377456433073787, "grad_norm": 7.41796875, "learning_rate": 7.362254356692622e-06, "loss": 2.8484, "mean_token_accuracy": 0.4996793638578941, "step": 14228 }, { "epoch": 2.637931034482759, "grad_norm": 8.796875, "learning_rate": 7.362068965517242e-06, "loss": 2.6857, "mean_token_accuracy": 0.4891135303265941, "step": 14229 }, { "epoch": 2.6381164256581386, "grad_norm": 6.4453125, "learning_rate": 7.361883574341861e-06, "loss": 2.8567, "mean_token_accuracy": 0.4705810397553517, "step": 14230 }, { "epoch": 2.638301816833519, "grad_norm": 6.64453125, "learning_rate": 7.361698183166481e-06, "loss": 2.8533, "mean_token_accuracy": 0.49150458305387884, "step": 14231 }, { "epoch": 2.6384872080088986, "grad_norm": 7.90625, "learning_rate": 7.361512791991102e-06, "loss": 2.9313, "mean_token_accuracy": 0.4723267060720043, "step": 14232 }, { "epoch": 2.6386725991842788, "grad_norm": 6.97265625, "learning_rate": 7.361327400815722e-06, "loss": 2.5938, "mean_token_accuracy": 0.5021057557323351, "step": 14233 }, { "epoch": 2.638857990359659, "grad_norm": 8.671875, "learning_rate": 7.361142009640342e-06, "loss": 2.8646, "mean_token_accuracy": 0.48496748067247514, "step": 14234 }, { "epoch": 2.639043381535039, "grad_norm": 7.29296875, "learning_rate": 7.360956618464962e-06, "loss": 3.2904, "mean_token_accuracy": 0.4434846088331914, "step": 14235 }, { "epoch": 2.639228772710419, "grad_norm": 8.3671875, "learning_rate": 7.360771227289582e-06, "loss": 2.7182, "mean_token_accuracy": 0.4736575481256332, "step": 14236 }, { "epoch": 2.639414163885799, "grad_norm": 7.68359375, "learning_rate": 7.360585836114201e-06, "loss": 3.3077, "mean_token_accuracy": 0.458537845517954, "step": 14237 }, { "epoch": 2.639599555061179, "grad_norm": 8.234375, "learning_rate": 7.360400444938821e-06, "loss": 3.8085, "mean_token_accuracy": 0.43091693335188536, "step": 14238 }, { "epoch": 2.639784946236559, "grad_norm": 7.7734375, "learning_rate": 7.360215053763441e-06, "loss": 3.3621, "mean_token_accuracy": 0.4654111275233875, "step": 14239 }, { "epoch": 2.639970337411939, "grad_norm": 7.140625, "learning_rate": 7.360029662588062e-06, "loss": 2.7482, "mean_token_accuracy": 0.4964915016372992, "step": 14240 }, { "epoch": 2.6401557285873194, "grad_norm": 7.4375, "learning_rate": 7.359844271412682e-06, "loss": 3.9045, "mean_token_accuracy": 0.4398918918918919, "step": 14241 }, { "epoch": 2.640341119762699, "grad_norm": 7.8046875, "learning_rate": 7.359658880237301e-06, "loss": 3.1542, "mean_token_accuracy": 0.47436245252306025, "step": 14242 }, { "epoch": 2.6405265109380793, "grad_norm": 7.09375, "learning_rate": 7.359473489061921e-06, "loss": 2.998, "mean_token_accuracy": 0.4581550802139037, "step": 14243 }, { "epoch": 2.6407119021134595, "grad_norm": 7.24609375, "learning_rate": 7.3592880978865414e-06, "loss": 3.3017, "mean_token_accuracy": 0.4505801761498672, "step": 14244 }, { "epoch": 2.6408972932888393, "grad_norm": 6.734375, "learning_rate": 7.359102706711161e-06, "loss": 3.1089, "mean_token_accuracy": 0.45920985120574653, "step": 14245 }, { "epoch": 2.6410826844642195, "grad_norm": 8.828125, "learning_rate": 7.358917315535781e-06, "loss": 3.0563, "mean_token_accuracy": 0.459630001434103, "step": 14246 }, { "epoch": 2.6412680756395996, "grad_norm": 6.7578125, "learning_rate": 7.3587319243604e-06, "loss": 3.0398, "mean_token_accuracy": 0.5004841208365608, "step": 14247 }, { "epoch": 2.64145346681498, "grad_norm": 8.2421875, "learning_rate": 7.358546533185022e-06, "loss": 3.1707, "mean_token_accuracy": 0.4669477488330071, "step": 14248 }, { "epoch": 2.6416388579903596, "grad_norm": 9.171875, "learning_rate": 7.358361142009641e-06, "loss": 3.4026, "mean_token_accuracy": 0.4428493746601414, "step": 14249 }, { "epoch": 2.6418242491657398, "grad_norm": 6.859375, "learning_rate": 7.358175750834261e-06, "loss": 2.7039, "mean_token_accuracy": 0.4921121151784347, "step": 14250 }, { "epoch": 2.6420096403411195, "grad_norm": 8.328125, "learning_rate": 7.357990359658881e-06, "loss": 2.615, "mean_token_accuracy": 0.5124664365170695, "step": 14251 }, { "epoch": 2.6421950315164997, "grad_norm": 10.90625, "learning_rate": 7.3578049684835e-06, "loss": 3.6903, "mean_token_accuracy": 0.4239174697977467, "step": 14252 }, { "epoch": 2.64238042269188, "grad_norm": 11.40625, "learning_rate": 7.357619577308121e-06, "loss": 2.4858, "mean_token_accuracy": 0.5097087378640777, "step": 14253 }, { "epoch": 2.64256581386726, "grad_norm": 7.890625, "learning_rate": 7.35743418613274e-06, "loss": 3.1588, "mean_token_accuracy": 0.46310979707028627, "step": 14254 }, { "epoch": 2.64275120504264, "grad_norm": 10.734375, "learning_rate": 7.35724879495736e-06, "loss": 2.6914, "mean_token_accuracy": 0.4879037209873511, "step": 14255 }, { "epoch": 2.64293659621802, "grad_norm": 7.16015625, "learning_rate": 7.357063403781981e-06, "loss": 2.3388, "mean_token_accuracy": 0.5301734976112648, "step": 14256 }, { "epoch": 2.6431219873934, "grad_norm": 6.62890625, "learning_rate": 7.356878012606601e-06, "loss": 3.2898, "mean_token_accuracy": 0.446204259967231, "step": 14257 }, { "epoch": 2.64330737856878, "grad_norm": 6.67578125, "learning_rate": 7.356692621431221e-06, "loss": 2.6662, "mean_token_accuracy": 0.4984029484029484, "step": 14258 }, { "epoch": 2.64349276974416, "grad_norm": 14.09375, "learning_rate": 7.35650723025584e-06, "loss": 2.6272, "mean_token_accuracy": 0.48154381240884103, "step": 14259 }, { "epoch": 2.6436781609195403, "grad_norm": 7.1875, "learning_rate": 7.35632183908046e-06, "loss": 3.0848, "mean_token_accuracy": 0.463245492371706, "step": 14260 }, { "epoch": 2.6438635520949205, "grad_norm": 7.7265625, "learning_rate": 7.3561364479050805e-06, "loss": 3.1732, "mean_token_accuracy": 0.4572441293752769, "step": 14261 }, { "epoch": 2.6440489432703003, "grad_norm": 8.3046875, "learning_rate": 7.3559510567297e-06, "loss": 3.6731, "mean_token_accuracy": 0.43567378979502835, "step": 14262 }, { "epoch": 2.6442343344456805, "grad_norm": 8.3671875, "learning_rate": 7.35576566555432e-06, "loss": 2.9744, "mean_token_accuracy": 0.4995707101680363, "step": 14263 }, { "epoch": 2.64441972562106, "grad_norm": 7.1328125, "learning_rate": 7.35558027437894e-06, "loss": 3.1067, "mean_token_accuracy": 0.46995086304649114, "step": 14264 }, { "epoch": 2.6446051167964404, "grad_norm": 8.28125, "learning_rate": 7.355394883203561e-06, "loss": 2.5237, "mean_token_accuracy": 0.5141571553994733, "step": 14265 }, { "epoch": 2.6447905079718206, "grad_norm": 7.27734375, "learning_rate": 7.35520949202818e-06, "loss": 2.478, "mean_token_accuracy": 0.5173444976076556, "step": 14266 }, { "epoch": 2.6449758991472008, "grad_norm": 9.1171875, "learning_rate": 7.3550241008528e-06, "loss": 3.1787, "mean_token_accuracy": 0.47163073667504296, "step": 14267 }, { "epoch": 2.6451612903225805, "grad_norm": 8.4140625, "learning_rate": 7.35483870967742e-06, "loss": 2.9707, "mean_token_accuracy": 0.46900429057552895, "step": 14268 }, { "epoch": 2.6453466814979607, "grad_norm": 6.4921875, "learning_rate": 7.354653318502039e-06, "loss": 3.2017, "mean_token_accuracy": 0.45097402597402597, "step": 14269 }, { "epoch": 2.645532072673341, "grad_norm": 7.96875, "learning_rate": 7.35446792732666e-06, "loss": 3.0517, "mean_token_accuracy": 0.4843972754633296, "step": 14270 }, { "epoch": 2.6457174638487206, "grad_norm": 7.484375, "learning_rate": 7.3542825361512795e-06, "loss": 2.7003, "mean_token_accuracy": 0.4737973662993819, "step": 14271 }, { "epoch": 2.645902855024101, "grad_norm": 7.81640625, "learning_rate": 7.3540971449759e-06, "loss": 3.5894, "mean_token_accuracy": 0.43482466747279325, "step": 14272 }, { "epoch": 2.646088246199481, "grad_norm": 6.19140625, "learning_rate": 7.3539117538005204e-06, "loss": 2.5726, "mean_token_accuracy": 0.5312038794012228, "step": 14273 }, { "epoch": 2.646273637374861, "grad_norm": 7.37890625, "learning_rate": 7.35372636262514e-06, "loss": 2.8285, "mean_token_accuracy": 0.47429798987264077, "step": 14274 }, { "epoch": 2.646459028550241, "grad_norm": 6.234375, "learning_rate": 7.35354097144976e-06, "loss": 2.3178, "mean_token_accuracy": 0.5242883101150818, "step": 14275 }, { "epoch": 2.646644419725621, "grad_norm": 10.0859375, "learning_rate": 7.353355580274379e-06, "loss": 2.7029, "mean_token_accuracy": 0.47975475795120703, "step": 14276 }, { "epoch": 2.646829810901001, "grad_norm": 7.57421875, "learning_rate": 7.353170189098999e-06, "loss": 2.9633, "mean_token_accuracy": 0.4553066037735849, "step": 14277 }, { "epoch": 2.647015202076381, "grad_norm": 7.02734375, "learning_rate": 7.352984797923619e-06, "loss": 3.2534, "mean_token_accuracy": 0.4459183673469388, "step": 14278 }, { "epoch": 2.6472005932517613, "grad_norm": 7.71875, "learning_rate": 7.352799406748239e-06, "loss": 2.6739, "mean_token_accuracy": 0.5099650083675643, "step": 14279 }, { "epoch": 2.6473859844271415, "grad_norm": 9.1015625, "learning_rate": 7.35261401557286e-06, "loss": 3.0928, "mean_token_accuracy": 0.47110187110187113, "step": 14280 }, { "epoch": 2.647571375602521, "grad_norm": 10.125, "learning_rate": 7.352428624397479e-06, "loss": 2.7022, "mean_token_accuracy": 0.49726303658887927, "step": 14281 }, { "epoch": 2.6477567667779014, "grad_norm": 7.9921875, "learning_rate": 7.3522432332221e-06, "loss": 3.222, "mean_token_accuracy": 0.434966953485472, "step": 14282 }, { "epoch": 2.6479421579532816, "grad_norm": 8.9140625, "learning_rate": 7.3520578420467194e-06, "loss": 2.5268, "mean_token_accuracy": 0.5171227969623156, "step": 14283 }, { "epoch": 2.6481275491286613, "grad_norm": 8.8671875, "learning_rate": 7.351872450871339e-06, "loss": 3.3809, "mean_token_accuracy": 0.4357548509228585, "step": 14284 }, { "epoch": 2.6483129403040415, "grad_norm": 11.4609375, "learning_rate": 7.351687059695959e-06, "loss": 3.2992, "mean_token_accuracy": 0.46227278671374394, "step": 14285 }, { "epoch": 2.6484983314794217, "grad_norm": 8.3515625, "learning_rate": 7.351501668520578e-06, "loss": 3.4116, "mean_token_accuracy": 0.4252127506130102, "step": 14286 }, { "epoch": 2.648683722654802, "grad_norm": 7.453125, "learning_rate": 7.351316277345199e-06, "loss": 2.658, "mean_token_accuracy": 0.5203423304805793, "step": 14287 }, { "epoch": 2.6488691138301816, "grad_norm": 7.6484375, "learning_rate": 7.351130886169819e-06, "loss": 2.8526, "mean_token_accuracy": 0.4992313604919293, "step": 14288 }, { "epoch": 2.649054505005562, "grad_norm": 6.49609375, "learning_rate": 7.350945494994439e-06, "loss": 2.9531, "mean_token_accuracy": 0.47182353829883666, "step": 14289 }, { "epoch": 2.6492398961809416, "grad_norm": 8.796875, "learning_rate": 7.350760103819059e-06, "loss": 3.0691, "mean_token_accuracy": 0.4632891860614633, "step": 14290 }, { "epoch": 2.6494252873563218, "grad_norm": 8.8203125, "learning_rate": 7.350574712643679e-06, "loss": 2.9039, "mean_token_accuracy": 0.49181969949916526, "step": 14291 }, { "epoch": 2.649610678531702, "grad_norm": 6.859375, "learning_rate": 7.350389321468299e-06, "loss": 2.7767, "mean_token_accuracy": 0.47903316469926926, "step": 14292 }, { "epoch": 2.649796069707082, "grad_norm": 9.5546875, "learning_rate": 7.350203930292918e-06, "loss": 2.4519, "mean_token_accuracy": 0.5420047732696898, "step": 14293 }, { "epoch": 2.649981460882462, "grad_norm": 10.3203125, "learning_rate": 7.350018539117538e-06, "loss": 2.6788, "mean_token_accuracy": 0.486514657980456, "step": 14294 }, { "epoch": 2.650166852057842, "grad_norm": 9.2109375, "learning_rate": 7.349833147942158e-06, "loss": 2.5203, "mean_token_accuracy": 0.5370532915360502, "step": 14295 }, { "epoch": 2.650352243233222, "grad_norm": 6.67578125, "learning_rate": 7.349647756766779e-06, "loss": 2.4733, "mean_token_accuracy": 0.5176431424766977, "step": 14296 }, { "epoch": 2.650537634408602, "grad_norm": 11.859375, "learning_rate": 7.349462365591399e-06, "loss": 2.5689, "mean_token_accuracy": 0.5104194496393267, "step": 14297 }, { "epoch": 2.650723025583982, "grad_norm": 9.0546875, "learning_rate": 7.349276974416018e-06, "loss": 2.7057, "mean_token_accuracy": 0.4876954627018713, "step": 14298 }, { "epoch": 2.6509084167593624, "grad_norm": 7.328125, "learning_rate": 7.349091583240639e-06, "loss": 2.8154, "mean_token_accuracy": 0.48210227272727274, "step": 14299 }, { "epoch": 2.651093807934742, "grad_norm": 7.04296875, "learning_rate": 7.3489061920652585e-06, "loss": 2.7913, "mean_token_accuracy": 0.4715729304889252, "step": 14300 }, { "epoch": 2.6512791991101223, "grad_norm": 7.65625, "learning_rate": 7.348720800889878e-06, "loss": 3.1335, "mean_token_accuracy": 0.4390834763298678, "step": 14301 }, { "epoch": 2.6514645902855025, "grad_norm": 8.1640625, "learning_rate": 7.348535409714498e-06, "loss": 2.7062, "mean_token_accuracy": 0.5124170413187754, "step": 14302 }, { "epoch": 2.6516499814608823, "grad_norm": 9.5859375, "learning_rate": 7.348350018539117e-06, "loss": 2.7289, "mean_token_accuracy": 0.4888211382113821, "step": 14303 }, { "epoch": 2.6518353726362625, "grad_norm": 8.890625, "learning_rate": 7.348164627363739e-06, "loss": 2.6641, "mean_token_accuracy": 0.5122745490981964, "step": 14304 }, { "epoch": 2.6520207638116426, "grad_norm": 8.0859375, "learning_rate": 7.347979236188358e-06, "loss": 3.1834, "mean_token_accuracy": 0.4539279385705848, "step": 14305 }, { "epoch": 2.652206154987023, "grad_norm": 7.73046875, "learning_rate": 7.347793845012978e-06, "loss": 3.3326, "mean_token_accuracy": 0.45745159837910854, "step": 14306 }, { "epoch": 2.6523915461624026, "grad_norm": 9.8203125, "learning_rate": 7.347608453837598e-06, "loss": 2.7702, "mean_token_accuracy": 0.4783176214648296, "step": 14307 }, { "epoch": 2.6525769373377828, "grad_norm": 7.45703125, "learning_rate": 7.347423062662218e-06, "loss": 2.6301, "mean_token_accuracy": 0.515926493108729, "step": 14308 }, { "epoch": 2.6527623285131625, "grad_norm": 8.015625, "learning_rate": 7.347237671486838e-06, "loss": 2.8802, "mean_token_accuracy": 0.5093541564991052, "step": 14309 }, { "epoch": 2.6529477196885427, "grad_norm": 10.171875, "learning_rate": 7.3470522803114574e-06, "loss": 2.8799, "mean_token_accuracy": 0.48279608837377763, "step": 14310 }, { "epoch": 2.653133110863923, "grad_norm": 7.69921875, "learning_rate": 7.346866889136077e-06, "loss": 2.5841, "mean_token_accuracy": 0.5251419864247125, "step": 14311 }, { "epoch": 2.653318502039303, "grad_norm": 7.859375, "learning_rate": 7.3466814979606984e-06, "loss": 2.7497, "mean_token_accuracy": 0.4897693209956992, "step": 14312 }, { "epoch": 2.653503893214683, "grad_norm": 9.3515625, "learning_rate": 7.346496106785318e-06, "loss": 3.2413, "mean_token_accuracy": 0.42471207166216407, "step": 14313 }, { "epoch": 2.653689284390063, "grad_norm": 8.2421875, "learning_rate": 7.346310715609938e-06, "loss": 2.6391, "mean_token_accuracy": 0.5070757670632435, "step": 14314 }, { "epoch": 2.653874675565443, "grad_norm": 8.453125, "learning_rate": 7.346125324434557e-06, "loss": 3.6969, "mean_token_accuracy": 0.43725247524752475, "step": 14315 }, { "epoch": 2.654060066740823, "grad_norm": 9.5703125, "learning_rate": 7.345939933259177e-06, "loss": 2.6861, "mean_token_accuracy": 0.4831869130566495, "step": 14316 }, { "epoch": 2.654245457916203, "grad_norm": 9.25, "learning_rate": 7.3457545420837975e-06, "loss": 2.2485, "mean_token_accuracy": 0.5464669229749437, "step": 14317 }, { "epoch": 2.6544308490915833, "grad_norm": 7.98828125, "learning_rate": 7.345569150908417e-06, "loss": 2.649, "mean_token_accuracy": 0.49554896142433236, "step": 14318 }, { "epoch": 2.6546162402669635, "grad_norm": 8.4609375, "learning_rate": 7.345383759733037e-06, "loss": 2.3793, "mean_token_accuracy": 0.5355987055016181, "step": 14319 }, { "epoch": 2.6548016314423433, "grad_norm": 8.6328125, "learning_rate": 7.345198368557658e-06, "loss": 3.2562, "mean_token_accuracy": 0.4733650931139021, "step": 14320 }, { "epoch": 2.6549870226177235, "grad_norm": 8.2421875, "learning_rate": 7.345012977382278e-06, "loss": 3.1708, "mean_token_accuracy": 0.46873025900189513, "step": 14321 }, { "epoch": 2.655172413793103, "grad_norm": 8.03125, "learning_rate": 7.344827586206897e-06, "loss": 2.8323, "mean_token_accuracy": 0.4583420776495278, "step": 14322 }, { "epoch": 2.6553578049684834, "grad_norm": 7.25390625, "learning_rate": 7.344642195031517e-06, "loss": 3.0198, "mean_token_accuracy": 0.4683162341581171, "step": 14323 }, { "epoch": 2.6555431961438636, "grad_norm": 9.828125, "learning_rate": 7.344456803856137e-06, "loss": 2.8613, "mean_token_accuracy": 0.4884003400947407, "step": 14324 }, { "epoch": 2.6557285873192438, "grad_norm": 8.515625, "learning_rate": 7.344271412680757e-06, "loss": 2.43, "mean_token_accuracy": 0.49431311329170385, "step": 14325 }, { "epoch": 2.6559139784946235, "grad_norm": 7.8515625, "learning_rate": 7.344086021505377e-06, "loss": 3.3819, "mean_token_accuracy": 0.44514693829762325, "step": 14326 }, { "epoch": 2.6560993696700037, "grad_norm": 7.88671875, "learning_rate": 7.3439006303299965e-06, "loss": 3.8092, "mean_token_accuracy": 0.4363689011148111, "step": 14327 }, { "epoch": 2.656284760845384, "grad_norm": 9.390625, "learning_rate": 7.343715239154617e-06, "loss": 2.7322, "mean_token_accuracy": 0.5014848812095032, "step": 14328 }, { "epoch": 2.6564701520207636, "grad_norm": 6.6484375, "learning_rate": 7.3435298479792375e-06, "loss": 2.7605, "mean_token_accuracy": 0.48818137964302943, "step": 14329 }, { "epoch": 2.656655543196144, "grad_norm": 9.171875, "learning_rate": 7.343344456803857e-06, "loss": 3.4169, "mean_token_accuracy": 0.42405816259087903, "step": 14330 }, { "epoch": 2.656840934371524, "grad_norm": 9.609375, "learning_rate": 7.343159065628477e-06, "loss": 2.5572, "mean_token_accuracy": 0.527061556329849, "step": 14331 }, { "epoch": 2.657026325546904, "grad_norm": 7.12109375, "learning_rate": 7.342973674453096e-06, "loss": 2.4386, "mean_token_accuracy": 0.520631810946492, "step": 14332 }, { "epoch": 2.657211716722284, "grad_norm": 8.0625, "learning_rate": 7.342788283277716e-06, "loss": 3.3779, "mean_token_accuracy": 0.43190661478599224, "step": 14333 }, { "epoch": 2.657397107897664, "grad_norm": 8.8515625, "learning_rate": 7.3426028921023365e-06, "loss": 3.3593, "mean_token_accuracy": 0.4447821681864235, "step": 14334 }, { "epoch": 2.657582499073044, "grad_norm": 8.3125, "learning_rate": 7.342417500926956e-06, "loss": 2.72, "mean_token_accuracy": 0.49237855027963595, "step": 14335 }, { "epoch": 2.657767890248424, "grad_norm": 7.30078125, "learning_rate": 7.342232109751577e-06, "loss": 2.8139, "mean_token_accuracy": 0.5040805916857944, "step": 14336 }, { "epoch": 2.6579532814238043, "grad_norm": 6.9375, "learning_rate": 7.342046718576197e-06, "loss": 3.1177, "mean_token_accuracy": 0.4613195615514334, "step": 14337 }, { "epoch": 2.6581386725991845, "grad_norm": 9.7109375, "learning_rate": 7.341861327400817e-06, "loss": 2.4282, "mean_token_accuracy": 0.5263311878291399, "step": 14338 }, { "epoch": 2.658324063774564, "grad_norm": 9.9921875, "learning_rate": 7.3416759362254364e-06, "loss": 2.5198, "mean_token_accuracy": 0.5300751879699248, "step": 14339 }, { "epoch": 2.6585094549499444, "grad_norm": 7.44140625, "learning_rate": 7.341490545050056e-06, "loss": 2.7297, "mean_token_accuracy": 0.49147596375364766, "step": 14340 }, { "epoch": 2.6586948461253246, "grad_norm": 9.078125, "learning_rate": 7.341305153874676e-06, "loss": 3.0315, "mean_token_accuracy": 0.4886835409133521, "step": 14341 }, { "epoch": 2.6588802373007043, "grad_norm": 7.5703125, "learning_rate": 7.341119762699296e-06, "loss": 2.7941, "mean_token_accuracy": 0.46582185938832704, "step": 14342 }, { "epoch": 2.6590656284760845, "grad_norm": 8.40625, "learning_rate": 7.340934371523916e-06, "loss": 3.0662, "mean_token_accuracy": 0.4630030783263026, "step": 14343 }, { "epoch": 2.6592510196514647, "grad_norm": 7.328125, "learning_rate": 7.340748980348536e-06, "loss": 2.8566, "mean_token_accuracy": 0.4864437689969605, "step": 14344 }, { "epoch": 2.659436410826845, "grad_norm": 6.8125, "learning_rate": 7.340563589173156e-06, "loss": 3.1324, "mean_token_accuracy": 0.4594193946880791, "step": 14345 }, { "epoch": 2.6596218020022246, "grad_norm": 9.875, "learning_rate": 7.3403781979977765e-06, "loss": 2.1975, "mean_token_accuracy": 0.5395151515151515, "step": 14346 }, { "epoch": 2.659807193177605, "grad_norm": 8.171875, "learning_rate": 7.340192806822396e-06, "loss": 2.3029, "mean_token_accuracy": 0.5442312816413536, "step": 14347 }, { "epoch": 2.6599925843529846, "grad_norm": 8.515625, "learning_rate": 7.340007415647016e-06, "loss": 2.9849, "mean_token_accuracy": 0.45294768890119014, "step": 14348 }, { "epoch": 2.6601779755283648, "grad_norm": 8.5234375, "learning_rate": 7.3398220244716354e-06, "loss": 3.3485, "mean_token_accuracy": 0.4499722838137472, "step": 14349 }, { "epoch": 2.660363366703745, "grad_norm": 8.046875, "learning_rate": 7.339636633296255e-06, "loss": 3.5782, "mean_token_accuracy": 0.4275343531241898, "step": 14350 }, { "epoch": 2.660548757879125, "grad_norm": 8.703125, "learning_rate": 7.3394512421208756e-06, "loss": 2.9482, "mean_token_accuracy": 0.5027027027027027, "step": 14351 }, { "epoch": 2.660734149054505, "grad_norm": 9.0, "learning_rate": 7.339265850945495e-06, "loss": 2.6088, "mean_token_accuracy": 0.4948390267879086, "step": 14352 }, { "epoch": 2.660919540229885, "grad_norm": 9.203125, "learning_rate": 7.339080459770116e-06, "loss": 4.6201, "mean_token_accuracy": 0.368457459073092, "step": 14353 }, { "epoch": 2.6611049314052653, "grad_norm": 8.3515625, "learning_rate": 7.338895068594736e-06, "loss": 3.0571, "mean_token_accuracy": 0.47402360384474995, "step": 14354 }, { "epoch": 2.661290322580645, "grad_norm": 7.359375, "learning_rate": 7.338709677419356e-06, "loss": 2.7064, "mean_token_accuracy": 0.486412672402021, "step": 14355 }, { "epoch": 2.661475713756025, "grad_norm": 9.6015625, "learning_rate": 7.3385242862439755e-06, "loss": 2.84, "mean_token_accuracy": 0.4868532654792197, "step": 14356 }, { "epoch": 2.6616611049314054, "grad_norm": 7.74609375, "learning_rate": 7.338338895068595e-06, "loss": 2.5029, "mean_token_accuracy": 0.5128824476650563, "step": 14357 }, { "epoch": 2.6618464961067856, "grad_norm": 7.59375, "learning_rate": 7.338153503893215e-06, "loss": 3.0158, "mean_token_accuracy": 0.48630306021717673, "step": 14358 }, { "epoch": 2.6620318872821653, "grad_norm": 7.4296875, "learning_rate": 7.337968112717834e-06, "loss": 2.9526, "mean_token_accuracy": 0.4682871249525256, "step": 14359 }, { "epoch": 2.6622172784575455, "grad_norm": 9.515625, "learning_rate": 7.337782721542455e-06, "loss": 2.4451, "mean_token_accuracy": 0.5371870842907244, "step": 14360 }, { "epoch": 2.6624026696329253, "grad_norm": 8.359375, "learning_rate": 7.337597330367075e-06, "loss": 2.6715, "mean_token_accuracy": 0.5096519847743338, "step": 14361 }, { "epoch": 2.6625880608083055, "grad_norm": 7.578125, "learning_rate": 7.337411939191695e-06, "loss": 3.0092, "mean_token_accuracy": 0.4981901941428101, "step": 14362 }, { "epoch": 2.6627734519836856, "grad_norm": 9.046875, "learning_rate": 7.3372265480163155e-06, "loss": 2.7157, "mean_token_accuracy": 0.503098429168468, "step": 14363 }, { "epoch": 2.662958843159066, "grad_norm": 13.7890625, "learning_rate": 7.337041156840935e-06, "loss": 3.1592, "mean_token_accuracy": 0.4386603567528213, "step": 14364 }, { "epoch": 2.6631442343344456, "grad_norm": 7.44921875, "learning_rate": 7.336855765665555e-06, "loss": 2.7982, "mean_token_accuracy": 0.47955550760599847, "step": 14365 }, { "epoch": 2.6633296255098258, "grad_norm": 7.33203125, "learning_rate": 7.3366703744901745e-06, "loss": 2.9848, "mean_token_accuracy": 0.4629433250854248, "step": 14366 }, { "epoch": 2.6635150166852055, "grad_norm": 7.80078125, "learning_rate": 7.336484983314794e-06, "loss": 2.6045, "mean_token_accuracy": 0.5093696763202725, "step": 14367 }, { "epoch": 2.6637004078605857, "grad_norm": 7.52734375, "learning_rate": 7.336299592139415e-06, "loss": 2.2891, "mean_token_accuracy": 0.5421074904782057, "step": 14368 }, { "epoch": 2.663885799035966, "grad_norm": 8.7890625, "learning_rate": 7.336114200964035e-06, "loss": 2.8799, "mean_token_accuracy": 0.4768488326594496, "step": 14369 }, { "epoch": 2.664071190211346, "grad_norm": 7.5390625, "learning_rate": 7.335928809788655e-06, "loss": 2.9566, "mean_token_accuracy": 0.4744389607970947, "step": 14370 }, { "epoch": 2.664256581386726, "grad_norm": 8.8671875, "learning_rate": 7.335743418613274e-06, "loss": 2.8167, "mean_token_accuracy": 0.4772811230144071, "step": 14371 }, { "epoch": 2.664441972562106, "grad_norm": 8.703125, "learning_rate": 7.335558027437895e-06, "loss": 2.7402, "mean_token_accuracy": 0.5307187593200119, "step": 14372 }, { "epoch": 2.664627363737486, "grad_norm": 7.8984375, "learning_rate": 7.3353726362625145e-06, "loss": 3.0046, "mean_token_accuracy": 0.47235216504737915, "step": 14373 }, { "epoch": 2.664812754912866, "grad_norm": 8.3671875, "learning_rate": 7.335187245087134e-06, "loss": 2.6115, "mean_token_accuracy": 0.5032178217821782, "step": 14374 }, { "epoch": 2.664998146088246, "grad_norm": 8.328125, "learning_rate": 7.335001853911754e-06, "loss": 2.8469, "mean_token_accuracy": 0.5011686554311724, "step": 14375 }, { "epoch": 2.6651835372636263, "grad_norm": 7.03515625, "learning_rate": 7.3348164627363734e-06, "loss": 2.9636, "mean_token_accuracy": 0.46422018348623856, "step": 14376 }, { "epoch": 2.6653689284390065, "grad_norm": 14.09375, "learning_rate": 7.334631071560995e-06, "loss": 2.856, "mean_token_accuracy": 0.48548812664907653, "step": 14377 }, { "epoch": 2.6655543196143863, "grad_norm": 10.5078125, "learning_rate": 7.3344456803856144e-06, "loss": 2.6905, "mean_token_accuracy": 0.4989408099688474, "step": 14378 }, { "epoch": 2.6657397107897665, "grad_norm": 8.4765625, "learning_rate": 7.334260289210234e-06, "loss": 2.6666, "mean_token_accuracy": 0.5112605400789839, "step": 14379 }, { "epoch": 2.665925101965146, "grad_norm": 7.63671875, "learning_rate": 7.3340748980348546e-06, "loss": 3.339, "mean_token_accuracy": 0.4401919763721388, "step": 14380 }, { "epoch": 2.6661104931405264, "grad_norm": 7.3984375, "learning_rate": 7.333889506859474e-06, "loss": 2.4887, "mean_token_accuracy": 0.5197666882696047, "step": 14381 }, { "epoch": 2.6662958843159066, "grad_norm": 7.80859375, "learning_rate": 7.333704115684094e-06, "loss": 2.4098, "mean_token_accuracy": 0.5115528591712921, "step": 14382 }, { "epoch": 2.6664812754912868, "grad_norm": 8.2109375, "learning_rate": 7.3335187245087135e-06, "loss": 2.9373, "mean_token_accuracy": 0.4772420009013069, "step": 14383 }, { "epoch": 2.6666666666666665, "grad_norm": 8.5703125, "learning_rate": 7.333333333333333e-06, "loss": 3.1502, "mean_token_accuracy": 0.47878463391781556, "step": 14384 }, { "epoch": 2.6668520578420467, "grad_norm": 8.4375, "learning_rate": 7.3331479421579545e-06, "loss": 2.3126, "mean_token_accuracy": 0.5543495610534717, "step": 14385 }, { "epoch": 2.667037449017427, "grad_norm": 11.140625, "learning_rate": 7.332962550982574e-06, "loss": 3.3788, "mean_token_accuracy": 0.4505475404136972, "step": 14386 }, { "epoch": 2.6672228401928066, "grad_norm": 10.046875, "learning_rate": 7.332777159807194e-06, "loss": 3.0358, "mean_token_accuracy": 0.4616791101630352, "step": 14387 }, { "epoch": 2.667408231368187, "grad_norm": 8.4140625, "learning_rate": 7.332591768631813e-06, "loss": 2.7218, "mean_token_accuracy": 0.47732958098811756, "step": 14388 }, { "epoch": 2.667593622543567, "grad_norm": 9.1171875, "learning_rate": 7.332406377456434e-06, "loss": 3.0542, "mean_token_accuracy": 0.460020464844321, "step": 14389 }, { "epoch": 2.667779013718947, "grad_norm": 9.1640625, "learning_rate": 7.3322209862810535e-06, "loss": 2.533, "mean_token_accuracy": 0.5123526745240253, "step": 14390 }, { "epoch": 2.667964404894327, "grad_norm": 7.09375, "learning_rate": 7.332035595105673e-06, "loss": 2.6494, "mean_token_accuracy": 0.48421185054723864, "step": 14391 }, { "epoch": 2.668149796069707, "grad_norm": 10.453125, "learning_rate": 7.331850203930293e-06, "loss": 3.4936, "mean_token_accuracy": 0.4476819290943017, "step": 14392 }, { "epoch": 2.668335187245087, "grad_norm": 9.5078125, "learning_rate": 7.331664812754914e-06, "loss": 2.6458, "mean_token_accuracy": 0.500997150997151, "step": 14393 }, { "epoch": 2.668520578420467, "grad_norm": 7.17578125, "learning_rate": 7.331479421579534e-06, "loss": 2.9601, "mean_token_accuracy": 0.47411679884643115, "step": 14394 }, { "epoch": 2.6687059695958473, "grad_norm": 9.953125, "learning_rate": 7.3312940304041535e-06, "loss": 3.2614, "mean_token_accuracy": 0.4641962944416625, "step": 14395 }, { "epoch": 2.6688913607712275, "grad_norm": 8.328125, "learning_rate": 7.331108639228773e-06, "loss": 3.2525, "mean_token_accuracy": 0.4509118161378966, "step": 14396 }, { "epoch": 2.669076751946607, "grad_norm": 11.703125, "learning_rate": 7.330923248053393e-06, "loss": 3.4304, "mean_token_accuracy": 0.4450656611623976, "step": 14397 }, { "epoch": 2.6692621431219874, "grad_norm": 7.65234375, "learning_rate": 7.330737856878013e-06, "loss": 3.2061, "mean_token_accuracy": 0.4449357468564322, "step": 14398 }, { "epoch": 2.6694475342973676, "grad_norm": 7.9765625, "learning_rate": 7.330552465702633e-06, "loss": 2.6651, "mean_token_accuracy": 0.5024531668153435, "step": 14399 }, { "epoch": 2.6696329254727473, "grad_norm": 9.3359375, "learning_rate": 7.3303670745272525e-06, "loss": 2.891, "mean_token_accuracy": 0.4740767472633225, "step": 14400 }, { "epoch": 2.6698183166481275, "grad_norm": 8.515625, "learning_rate": 7.330181683351874e-06, "loss": 2.4907, "mean_token_accuracy": 0.5204359673024523, "step": 14401 }, { "epoch": 2.6700037078235077, "grad_norm": 8.859375, "learning_rate": 7.3299962921764935e-06, "loss": 2.1434, "mean_token_accuracy": 0.5657636435021594, "step": 14402 }, { "epoch": 2.670189098998888, "grad_norm": 8.40625, "learning_rate": 7.329810901001113e-06, "loss": 3.3618, "mean_token_accuracy": 0.4505754544476713, "step": 14403 }, { "epoch": 2.6703744901742676, "grad_norm": 8.0, "learning_rate": 7.329625509825733e-06, "loss": 3.0273, "mean_token_accuracy": 0.48359414437152953, "step": 14404 }, { "epoch": 2.670559881349648, "grad_norm": 7.58203125, "learning_rate": 7.3294401186503524e-06, "loss": 2.9479, "mean_token_accuracy": 0.4811188811188811, "step": 14405 }, { "epoch": 2.6707452725250276, "grad_norm": 7.96875, "learning_rate": 7.329254727474973e-06, "loss": 2.9317, "mean_token_accuracy": 0.4608429545138406, "step": 14406 }, { "epoch": 2.6709306637004078, "grad_norm": 9.9296875, "learning_rate": 7.329069336299593e-06, "loss": 3.3822, "mean_token_accuracy": 0.4625935162094763, "step": 14407 }, { "epoch": 2.671116054875788, "grad_norm": 9.8203125, "learning_rate": 7.328883945124212e-06, "loss": 2.7646, "mean_token_accuracy": 0.5155367231638418, "step": 14408 }, { "epoch": 2.671301446051168, "grad_norm": 6.75390625, "learning_rate": 7.328698553948833e-06, "loss": 3.0527, "mean_token_accuracy": 0.4783256570035221, "step": 14409 }, { "epoch": 2.671486837226548, "grad_norm": 9.921875, "learning_rate": 7.328513162773453e-06, "loss": 2.2954, "mean_token_accuracy": 0.5635332252836305, "step": 14410 }, { "epoch": 2.671672228401928, "grad_norm": 9.9296875, "learning_rate": 7.328327771598073e-06, "loss": 3.0179, "mean_token_accuracy": 0.5013857727332718, "step": 14411 }, { "epoch": 2.6718576195773083, "grad_norm": 7.89453125, "learning_rate": 7.3281423804226925e-06, "loss": 3.0008, "mean_token_accuracy": 0.47876546358882793, "step": 14412 }, { "epoch": 2.672043010752688, "grad_norm": 7.6484375, "learning_rate": 7.327956989247312e-06, "loss": 2.8653, "mean_token_accuracy": 0.4782608695652174, "step": 14413 }, { "epoch": 2.672228401928068, "grad_norm": 9.2421875, "learning_rate": 7.327771598071932e-06, "loss": 2.3436, "mean_token_accuracy": 0.527347036903474, "step": 14414 }, { "epoch": 2.6724137931034484, "grad_norm": 7.359375, "learning_rate": 7.327586206896552e-06, "loss": 2.2808, "mean_token_accuracy": 0.5082590456213949, "step": 14415 }, { "epoch": 2.6725991842788286, "grad_norm": 7.73046875, "learning_rate": 7.327400815721172e-06, "loss": 3.5812, "mean_token_accuracy": 0.42135817469960846, "step": 14416 }, { "epoch": 2.6727845754542083, "grad_norm": 9.5703125, "learning_rate": 7.327215424545792e-06, "loss": 2.4771, "mean_token_accuracy": 0.5110599078341014, "step": 14417 }, { "epoch": 2.6729699666295885, "grad_norm": 8.5390625, "learning_rate": 7.327030033370413e-06, "loss": 3.0331, "mean_token_accuracy": 0.48859375, "step": 14418 }, { "epoch": 2.6731553578049683, "grad_norm": 8.2578125, "learning_rate": 7.3268446421950326e-06, "loss": 2.2458, "mean_token_accuracy": 0.5618089027502127, "step": 14419 }, { "epoch": 2.6733407489803485, "grad_norm": 11.6328125, "learning_rate": 7.326659251019652e-06, "loss": 3.7466, "mean_token_accuracy": 0.44139727286655434, "step": 14420 }, { "epoch": 2.6735261401557286, "grad_norm": 8.984375, "learning_rate": 7.326473859844272e-06, "loss": 3.6291, "mean_token_accuracy": 0.4485969387755102, "step": 14421 }, { "epoch": 2.673711531331109, "grad_norm": 7.42578125, "learning_rate": 7.3262884686688915e-06, "loss": 3.3756, "mean_token_accuracy": 0.4565706570657066, "step": 14422 }, { "epoch": 2.6738969225064886, "grad_norm": 7.91796875, "learning_rate": 7.326103077493512e-06, "loss": 2.4003, "mean_token_accuracy": 0.5232634338138925, "step": 14423 }, { "epoch": 2.6740823136818688, "grad_norm": 8.953125, "learning_rate": 7.325917686318132e-06, "loss": 2.391, "mean_token_accuracy": 0.5338824821526633, "step": 14424 }, { "epoch": 2.6742677048572485, "grad_norm": 6.7578125, "learning_rate": 7.325732295142752e-06, "loss": 3.005, "mean_token_accuracy": 0.47583148558758315, "step": 14425 }, { "epoch": 2.6744530960326287, "grad_norm": 7.09765625, "learning_rate": 7.325546903967372e-06, "loss": 2.9228, "mean_token_accuracy": 0.47908163265306125, "step": 14426 }, { "epoch": 2.674638487208009, "grad_norm": 6.41796875, "learning_rate": 7.325361512791992e-06, "loss": 2.4937, "mean_token_accuracy": 0.4925915948275862, "step": 14427 }, { "epoch": 2.674823878383389, "grad_norm": 8.4140625, "learning_rate": 7.325176121616612e-06, "loss": 2.5499, "mean_token_accuracy": 0.5092957746478873, "step": 14428 }, { "epoch": 2.6750092695587693, "grad_norm": 7.15234375, "learning_rate": 7.3249907304412315e-06, "loss": 3.3315, "mean_token_accuracy": 0.4595533498759305, "step": 14429 }, { "epoch": 2.675194660734149, "grad_norm": 8.2109375, "learning_rate": 7.324805339265851e-06, "loss": 3.4268, "mean_token_accuracy": 0.4422024088847177, "step": 14430 }, { "epoch": 2.675380051909529, "grad_norm": 9.25, "learning_rate": 7.324619948090471e-06, "loss": 3.3492, "mean_token_accuracy": 0.440857762062279, "step": 14431 }, { "epoch": 2.675565443084909, "grad_norm": 7.61328125, "learning_rate": 7.324434556915091e-06, "loss": 2.8591, "mean_token_accuracy": 0.4747097844112769, "step": 14432 }, { "epoch": 2.675750834260289, "grad_norm": 7.72265625, "learning_rate": 7.324249165739712e-06, "loss": 3.3891, "mean_token_accuracy": 0.43711640486840736, "step": 14433 }, { "epoch": 2.6759362254356693, "grad_norm": 10.7578125, "learning_rate": 7.3240637745643315e-06, "loss": 2.8709, "mean_token_accuracy": 0.46384910932588197, "step": 14434 }, { "epoch": 2.6761216166110495, "grad_norm": 8.640625, "learning_rate": 7.323878383388951e-06, "loss": 3.2718, "mean_token_accuracy": 0.4524421593830334, "step": 14435 }, { "epoch": 2.6763070077864293, "grad_norm": 8.0703125, "learning_rate": 7.323692992213572e-06, "loss": 2.4309, "mean_token_accuracy": 0.5226392612451594, "step": 14436 }, { "epoch": 2.6764923989618095, "grad_norm": 8.046875, "learning_rate": 7.323507601038191e-06, "loss": 2.9711, "mean_token_accuracy": 0.47438370846730976, "step": 14437 }, { "epoch": 2.676677790137189, "grad_norm": 9.21875, "learning_rate": 7.323322209862811e-06, "loss": 3.8425, "mean_token_accuracy": 0.42394822006472493, "step": 14438 }, { "epoch": 2.6768631813125694, "grad_norm": 8.625, "learning_rate": 7.3231368186874305e-06, "loss": 2.8295, "mean_token_accuracy": 0.48242540087076563, "step": 14439 }, { "epoch": 2.6770485724879496, "grad_norm": 8.9296875, "learning_rate": 7.32295142751205e-06, "loss": 2.6654, "mean_token_accuracy": 0.4991403026134801, "step": 14440 }, { "epoch": 2.6772339636633298, "grad_norm": 13.3359375, "learning_rate": 7.3227660363366715e-06, "loss": 2.7126, "mean_token_accuracy": 0.4706781055353155, "step": 14441 }, { "epoch": 2.6774193548387095, "grad_norm": 10.5859375, "learning_rate": 7.322580645161291e-06, "loss": 2.9525, "mean_token_accuracy": 0.48564834847829225, "step": 14442 }, { "epoch": 2.6776047460140897, "grad_norm": 8.9453125, "learning_rate": 7.322395253985911e-06, "loss": 3.0842, "mean_token_accuracy": 0.4722810514513517, "step": 14443 }, { "epoch": 2.67779013718947, "grad_norm": 8.7421875, "learning_rate": 7.322209862810531e-06, "loss": 2.6741, "mean_token_accuracy": 0.5108436921449352, "step": 14444 }, { "epoch": 2.6779755283648496, "grad_norm": 13.3046875, "learning_rate": 7.322024471635151e-06, "loss": 3.0802, "mean_token_accuracy": 0.47243675099866844, "step": 14445 }, { "epoch": 2.67816091954023, "grad_norm": 8.109375, "learning_rate": 7.3218390804597706e-06, "loss": 2.1807, "mean_token_accuracy": 0.5350964737192282, "step": 14446 }, { "epoch": 2.67834631071561, "grad_norm": 9.0625, "learning_rate": 7.32165368928439e-06, "loss": 3.0076, "mean_token_accuracy": 0.46537008535049673, "step": 14447 }, { "epoch": 2.67853170189099, "grad_norm": 17.09375, "learning_rate": 7.32146829810901e-06, "loss": 2.6424, "mean_token_accuracy": 0.5080044865912103, "step": 14448 }, { "epoch": 2.67871709306637, "grad_norm": 12.5703125, "learning_rate": 7.321282906933631e-06, "loss": 2.9962, "mean_token_accuracy": 0.47645473283653084, "step": 14449 }, { "epoch": 2.67890248424175, "grad_norm": 10.8046875, "learning_rate": 7.321097515758251e-06, "loss": 2.6283, "mean_token_accuracy": 0.5406218655967904, "step": 14450 }, { "epoch": 2.67908787541713, "grad_norm": 17.796875, "learning_rate": 7.3209121245828705e-06, "loss": 2.4369, "mean_token_accuracy": 0.5225100215849522, "step": 14451 }, { "epoch": 2.67927326659251, "grad_norm": 18.4375, "learning_rate": 7.32072673340749e-06, "loss": 3.3878, "mean_token_accuracy": 0.4425444596443228, "step": 14452 }, { "epoch": 2.6794586577678903, "grad_norm": 14.59375, "learning_rate": 7.320541342232111e-06, "loss": 2.8386, "mean_token_accuracy": 0.47734843437708196, "step": 14453 }, { "epoch": 2.6796440489432705, "grad_norm": 6.55859375, "learning_rate": 7.32035595105673e-06, "loss": 3.1307, "mean_token_accuracy": 0.4610519658480025, "step": 14454 }, { "epoch": 2.67982944011865, "grad_norm": 11.453125, "learning_rate": 7.32017055988135e-06, "loss": 2.8486, "mean_token_accuracy": 0.4752519933804724, "step": 14455 }, { "epoch": 2.6800148312940304, "grad_norm": 13.171875, "learning_rate": 7.3199851687059696e-06, "loss": 2.9795, "mean_token_accuracy": 0.46078304690528493, "step": 14456 }, { "epoch": 2.6802002224694106, "grad_norm": 8.8671875, "learning_rate": 7.319799777530591e-06, "loss": 1.547, "mean_token_accuracy": 0.649205078348393, "step": 14457 }, { "epoch": 2.6803856136447903, "grad_norm": 7.14453125, "learning_rate": 7.3196143863552105e-06, "loss": 2.7919, "mean_token_accuracy": 0.4840520540954325, "step": 14458 }, { "epoch": 2.6805710048201705, "grad_norm": 10.53125, "learning_rate": 7.31942899517983e-06, "loss": 2.1802, "mean_token_accuracy": 0.5456793336803748, "step": 14459 }, { "epoch": 2.6807563959955507, "grad_norm": 12.5625, "learning_rate": 7.31924360400445e-06, "loss": 3.0239, "mean_token_accuracy": 0.4828891342059612, "step": 14460 }, { "epoch": 2.680941787170931, "grad_norm": 8.3125, "learning_rate": 7.31905821282907e-06, "loss": 3.1144, "mean_token_accuracy": 0.46226415094339623, "step": 14461 }, { "epoch": 2.6811271783463106, "grad_norm": 6.75, "learning_rate": 7.31887282165369e-06, "loss": 3.073, "mean_token_accuracy": 0.4622485207100592, "step": 14462 }, { "epoch": 2.681312569521691, "grad_norm": 9.4609375, "learning_rate": 7.31868743047831e-06, "loss": 2.9195, "mean_token_accuracy": 0.5194730813287515, "step": 14463 }, { "epoch": 2.6814979606970706, "grad_norm": 15.7265625, "learning_rate": 7.318502039302929e-06, "loss": 3.233, "mean_token_accuracy": 0.45287089279787585, "step": 14464 }, { "epoch": 2.6816833518724508, "grad_norm": 7.00390625, "learning_rate": 7.318316648127551e-06, "loss": 3.1165, "mean_token_accuracy": 0.44779639975170704, "step": 14465 }, { "epoch": 2.681868743047831, "grad_norm": 8.421875, "learning_rate": 7.31813125695217e-06, "loss": 2.7542, "mean_token_accuracy": 0.5258884319308236, "step": 14466 }, { "epoch": 2.682054134223211, "grad_norm": 9.953125, "learning_rate": 7.31794586577679e-06, "loss": 3.4616, "mean_token_accuracy": 0.44388789505068577, "step": 14467 }, { "epoch": 2.682239525398591, "grad_norm": 9.0703125, "learning_rate": 7.3177604746014095e-06, "loss": 3.0452, "mean_token_accuracy": 0.4639529883904257, "step": 14468 }, { "epoch": 2.682424916573971, "grad_norm": 6.9453125, "learning_rate": 7.317575083426029e-06, "loss": 2.5903, "mean_token_accuracy": 0.5081680280046674, "step": 14469 }, { "epoch": 2.6826103077493513, "grad_norm": 7.04296875, "learning_rate": 7.31738969225065e-06, "loss": 2.6123, "mean_token_accuracy": 0.4862068965517241, "step": 14470 }, { "epoch": 2.682795698924731, "grad_norm": 8.0078125, "learning_rate": 7.317204301075269e-06, "loss": 2.6064, "mean_token_accuracy": 0.49676354867368955, "step": 14471 }, { "epoch": 2.682981090100111, "grad_norm": 5.95703125, "learning_rate": 7.317018909899889e-06, "loss": 2.5964, "mean_token_accuracy": 0.4882771491893153, "step": 14472 }, { "epoch": 2.6831664812754914, "grad_norm": 7.296875, "learning_rate": 7.31683351872451e-06, "loss": 2.9497, "mean_token_accuracy": 0.47953774385072095, "step": 14473 }, { "epoch": 2.6833518724508716, "grad_norm": 10.109375, "learning_rate": 7.31664812754913e-06, "loss": 3.2987, "mean_token_accuracy": 0.46673490276356194, "step": 14474 }, { "epoch": 2.6835372636262513, "grad_norm": 7.7265625, "learning_rate": 7.3164627363737496e-06, "loss": 3.3764, "mean_token_accuracy": 0.434087307304423, "step": 14475 }, { "epoch": 2.6837226548016315, "grad_norm": 7.4140625, "learning_rate": 7.316277345198369e-06, "loss": 2.4983, "mean_token_accuracy": 0.5052967638949354, "step": 14476 }, { "epoch": 2.6839080459770113, "grad_norm": 10.015625, "learning_rate": 7.316091954022989e-06, "loss": 2.8413, "mean_token_accuracy": 0.4908882192567083, "step": 14477 }, { "epoch": 2.6840934371523915, "grad_norm": 8.8359375, "learning_rate": 7.3159065628476085e-06, "loss": 2.6331, "mean_token_accuracy": 0.4841803687095167, "step": 14478 }, { "epoch": 2.6842788283277716, "grad_norm": 7.09375, "learning_rate": 7.315721171672229e-06, "loss": 2.1008, "mean_token_accuracy": 0.5543035243346919, "step": 14479 }, { "epoch": 2.684464219503152, "grad_norm": 7.12109375, "learning_rate": 7.315535780496849e-06, "loss": 3.3076, "mean_token_accuracy": 0.4471839214769806, "step": 14480 }, { "epoch": 2.6846496106785316, "grad_norm": 7.18359375, "learning_rate": 7.315350389321468e-06, "loss": 2.6224, "mean_token_accuracy": 0.5089998646636893, "step": 14481 }, { "epoch": 2.6848350018539118, "grad_norm": 7.921875, "learning_rate": 7.31516499814609e-06, "loss": 3.2929, "mean_token_accuracy": 0.4492753623188406, "step": 14482 }, { "epoch": 2.685020393029292, "grad_norm": 8.6796875, "learning_rate": 7.314979606970709e-06, "loss": 2.7632, "mean_token_accuracy": 0.4961704497980783, "step": 14483 }, { "epoch": 2.6852057842046717, "grad_norm": 7.45703125, "learning_rate": 7.314794215795329e-06, "loss": 3.1422, "mean_token_accuracy": 0.46682431492558074, "step": 14484 }, { "epoch": 2.685391175380052, "grad_norm": 7.31640625, "learning_rate": 7.3146088246199486e-06, "loss": 2.8186, "mean_token_accuracy": 0.4923326452373931, "step": 14485 }, { "epoch": 2.685576566555432, "grad_norm": 7.66015625, "learning_rate": 7.314423433444568e-06, "loss": 2.5064, "mean_token_accuracy": 0.5172867096095701, "step": 14486 }, { "epoch": 2.6857619577308123, "grad_norm": 10.5390625, "learning_rate": 7.314238042269189e-06, "loss": 3.4663, "mean_token_accuracy": 0.46333514394350894, "step": 14487 }, { "epoch": 2.685947348906192, "grad_norm": 6.11328125, "learning_rate": 7.314052651093808e-06, "loss": 2.6942, "mean_token_accuracy": 0.4924263674614306, "step": 14488 }, { "epoch": 2.686132740081572, "grad_norm": 6.625, "learning_rate": 7.313867259918428e-06, "loss": 2.9867, "mean_token_accuracy": 0.4907253599114064, "step": 14489 }, { "epoch": 2.686318131256952, "grad_norm": 8.671875, "learning_rate": 7.3136818687430485e-06, "loss": 3.0863, "mean_token_accuracy": 0.47035148913335123, "step": 14490 }, { "epoch": 2.686503522432332, "grad_norm": 7.265625, "learning_rate": 7.313496477567669e-06, "loss": 2.8924, "mean_token_accuracy": 0.4836174339083393, "step": 14491 }, { "epoch": 2.6866889136077123, "grad_norm": 7.71875, "learning_rate": 7.313311086392289e-06, "loss": 3.2416, "mean_token_accuracy": 0.45787965616045845, "step": 14492 }, { "epoch": 2.6868743047830925, "grad_norm": 7.37109375, "learning_rate": 7.313125695216908e-06, "loss": 3.0507, "mean_token_accuracy": 0.4570300637716368, "step": 14493 }, { "epoch": 2.6870596959584723, "grad_norm": 7.71875, "learning_rate": 7.312940304041528e-06, "loss": 3.0931, "mean_token_accuracy": 0.4741368565374548, "step": 14494 }, { "epoch": 2.6872450871338525, "grad_norm": 6.6875, "learning_rate": 7.3127549128661475e-06, "loss": 3.0696, "mean_token_accuracy": 0.46319873893537045, "step": 14495 }, { "epoch": 2.687430478309232, "grad_norm": 6.546875, "learning_rate": 7.312569521690768e-06, "loss": 2.8635, "mean_token_accuracy": 0.4871698113207547, "step": 14496 }, { "epoch": 2.6876158694846124, "grad_norm": 7.1640625, "learning_rate": 7.312384130515388e-06, "loss": 2.0691, "mean_token_accuracy": 0.5727239421139403, "step": 14497 }, { "epoch": 2.6878012606599926, "grad_norm": 7.8515625, "learning_rate": 7.312198739340008e-06, "loss": 3.037, "mean_token_accuracy": 0.48949839387200395, "step": 14498 }, { "epoch": 2.6879866518353728, "grad_norm": 7.17578125, "learning_rate": 7.312013348164629e-06, "loss": 3.1276, "mean_token_accuracy": 0.4701355807988274, "step": 14499 }, { "epoch": 2.688172043010753, "grad_norm": 7.06640625, "learning_rate": 7.311827956989248e-06, "loss": 2.86, "mean_token_accuracy": 0.4744139508290452, "step": 14500 }, { "epoch": 2.6883574341861327, "grad_norm": 8.234375, "learning_rate": 7.311642565813868e-06, "loss": 2.8777, "mean_token_accuracy": 0.4667176740627391, "step": 14501 }, { "epoch": 2.688542825361513, "grad_norm": 8.2734375, "learning_rate": 7.311457174638488e-06, "loss": 2.6723, "mean_token_accuracy": 0.4946164199192463, "step": 14502 }, { "epoch": 2.6887282165368926, "grad_norm": 7.23828125, "learning_rate": 7.311271783463107e-06, "loss": 3.2436, "mean_token_accuracy": 0.4434731071269818, "step": 14503 }, { "epoch": 2.688913607712273, "grad_norm": 7.359375, "learning_rate": 7.311086392287728e-06, "loss": 3.1714, "mean_token_accuracy": 0.45684102035557844, "step": 14504 }, { "epoch": 2.689098998887653, "grad_norm": 7.66015625, "learning_rate": 7.310901001112347e-06, "loss": 3.357, "mean_token_accuracy": 0.4477184711230847, "step": 14505 }, { "epoch": 2.689284390063033, "grad_norm": 7.59375, "learning_rate": 7.310715609936968e-06, "loss": 2.5964, "mean_token_accuracy": 0.510383488342464, "step": 14506 }, { "epoch": 2.689469781238413, "grad_norm": 10.796875, "learning_rate": 7.3105302187615875e-06, "loss": 3.3372, "mean_token_accuracy": 0.4530110141610642, "step": 14507 }, { "epoch": 2.689655172413793, "grad_norm": 6.90625, "learning_rate": 7.310344827586208e-06, "loss": 2.7863, "mean_token_accuracy": 0.4843126827971284, "step": 14508 }, { "epoch": 2.689840563589173, "grad_norm": 7.41015625, "learning_rate": 7.310159436410828e-06, "loss": 2.8269, "mean_token_accuracy": 0.47622201616108095, "step": 14509 }, { "epoch": 2.690025954764553, "grad_norm": 7.98828125, "learning_rate": 7.309974045235447e-06, "loss": 3.2778, "mean_token_accuracy": 0.4499257736667809, "step": 14510 }, { "epoch": 2.6902113459399333, "grad_norm": 8.1171875, "learning_rate": 7.309788654060067e-06, "loss": 2.885, "mean_token_accuracy": 0.46481620405101276, "step": 14511 }, { "epoch": 2.6903967371153135, "grad_norm": 7.06640625, "learning_rate": 7.3096032628846866e-06, "loss": 2.4226, "mean_token_accuracy": 0.5253798342541437, "step": 14512 }, { "epoch": 2.690582128290693, "grad_norm": 7.2578125, "learning_rate": 7.309417871709307e-06, "loss": 2.795, "mean_token_accuracy": 0.4846288427893027, "step": 14513 }, { "epoch": 2.6907675194660734, "grad_norm": 8.140625, "learning_rate": 7.3092324805339276e-06, "loss": 3.2857, "mean_token_accuracy": 0.45914654316903736, "step": 14514 }, { "epoch": 2.6909529106414536, "grad_norm": 13.6328125, "learning_rate": 7.309047089358547e-06, "loss": 3.4974, "mean_token_accuracy": 0.4483457123565159, "step": 14515 }, { "epoch": 2.6911383018168333, "grad_norm": 7.01171875, "learning_rate": 7.308861698183167e-06, "loss": 3.2945, "mean_token_accuracy": 0.4274798927613941, "step": 14516 }, { "epoch": 2.6913236929922135, "grad_norm": 7.453125, "learning_rate": 7.308676307007787e-06, "loss": 2.8742, "mean_token_accuracy": 0.4873663751214772, "step": 14517 }, { "epoch": 2.6915090841675937, "grad_norm": 10.0546875, "learning_rate": 7.308490915832407e-06, "loss": 3.6484, "mean_token_accuracy": 0.45082944811883335, "step": 14518 }, { "epoch": 2.691694475342974, "grad_norm": 8.0078125, "learning_rate": 7.308305524657027e-06, "loss": 3.3543, "mean_token_accuracy": 0.452659311707558, "step": 14519 }, { "epoch": 2.6918798665183536, "grad_norm": 6.97265625, "learning_rate": 7.308120133481646e-06, "loss": 2.6265, "mean_token_accuracy": 0.5147899577429779, "step": 14520 }, { "epoch": 2.692065257693734, "grad_norm": 7.48828125, "learning_rate": 7.307934742306266e-06, "loss": 2.569, "mean_token_accuracy": 0.49503245462975226, "step": 14521 }, { "epoch": 2.6922506488691136, "grad_norm": 10.3984375, "learning_rate": 7.307749351130887e-06, "loss": 3.2732, "mean_token_accuracy": 0.4837240681396526, "step": 14522 }, { "epoch": 2.6924360400444938, "grad_norm": 8.015625, "learning_rate": 7.307563959955507e-06, "loss": 2.3497, "mean_token_accuracy": 0.5142639206712434, "step": 14523 }, { "epoch": 2.692621431219874, "grad_norm": 6.87109375, "learning_rate": 7.3073785687801265e-06, "loss": 3.1149, "mean_token_accuracy": 0.4634115884115884, "step": 14524 }, { "epoch": 2.692806822395254, "grad_norm": 8.6328125, "learning_rate": 7.307193177604747e-06, "loss": 2.8724, "mean_token_accuracy": 0.4808992065824273, "step": 14525 }, { "epoch": 2.692992213570634, "grad_norm": 7.8671875, "learning_rate": 7.307007786429367e-06, "loss": 2.6963, "mean_token_accuracy": 0.5126658624849216, "step": 14526 }, { "epoch": 2.693177604746014, "grad_norm": 7.58203125, "learning_rate": 7.306822395253986e-06, "loss": 2.9362, "mean_token_accuracy": 0.4609221253865617, "step": 14527 }, { "epoch": 2.6933629959213943, "grad_norm": 6.94140625, "learning_rate": 7.306637004078606e-06, "loss": 2.9932, "mean_token_accuracy": 0.45277481323372465, "step": 14528 }, { "epoch": 2.693548387096774, "grad_norm": 8.75, "learning_rate": 7.306451612903226e-06, "loss": 3.0467, "mean_token_accuracy": 0.4671849988566202, "step": 14529 }, { "epoch": 2.693733778272154, "grad_norm": 7.38671875, "learning_rate": 7.306266221727847e-06, "loss": 2.0626, "mean_token_accuracy": 0.5645017352503718, "step": 14530 }, { "epoch": 2.6939191694475344, "grad_norm": 8.734375, "learning_rate": 7.306080830552467e-06, "loss": 2.828, "mean_token_accuracy": 0.4762704104990081, "step": 14531 }, { "epoch": 2.6941045606229146, "grad_norm": 16.609375, "learning_rate": 7.305895439377086e-06, "loss": 2.7623, "mean_token_accuracy": 0.49904099736274277, "step": 14532 }, { "epoch": 2.6942899517982943, "grad_norm": 7.7890625, "learning_rate": 7.305710048201706e-06, "loss": 3.1047, "mean_token_accuracy": 0.474646623289208, "step": 14533 }, { "epoch": 2.6944753429736745, "grad_norm": 7.98828125, "learning_rate": 7.305524657026326e-06, "loss": 3.0994, "mean_token_accuracy": 0.4784189004997728, "step": 14534 }, { "epoch": 2.6946607341490543, "grad_norm": 9.7421875, "learning_rate": 7.305339265850946e-06, "loss": 2.804, "mean_token_accuracy": 0.4751961883408072, "step": 14535 }, { "epoch": 2.6948461253244345, "grad_norm": 10.578125, "learning_rate": 7.305153874675566e-06, "loss": 2.4495, "mean_token_accuracy": 0.543204252088079, "step": 14536 }, { "epoch": 2.6950315164998146, "grad_norm": 6.98828125, "learning_rate": 7.304968483500185e-06, "loss": 3.0707, "mean_token_accuracy": 0.4661075367647059, "step": 14537 }, { "epoch": 2.695216907675195, "grad_norm": 8.0546875, "learning_rate": 7.304783092324807e-06, "loss": 3.2969, "mean_token_accuracy": 0.4661928193912885, "step": 14538 }, { "epoch": 2.6954022988505746, "grad_norm": 7.1171875, "learning_rate": 7.304597701149426e-06, "loss": 3.2735, "mean_token_accuracy": 0.4479295809571039, "step": 14539 }, { "epoch": 2.6955876900259548, "grad_norm": 7.3125, "learning_rate": 7.304412309974046e-06, "loss": 2.9544, "mean_token_accuracy": 0.473592317765168, "step": 14540 }, { "epoch": 2.695773081201335, "grad_norm": 11.03125, "learning_rate": 7.3042269187986656e-06, "loss": 1.9727, "mean_token_accuracy": 0.5680222841225627, "step": 14541 }, { "epoch": 2.6959584723767147, "grad_norm": 7.25, "learning_rate": 7.304041527623286e-06, "loss": 3.0139, "mean_token_accuracy": 0.47965353815983003, "step": 14542 }, { "epoch": 2.696143863552095, "grad_norm": 7.3828125, "learning_rate": 7.303856136447906e-06, "loss": 3.1985, "mean_token_accuracy": 0.4551226551226551, "step": 14543 }, { "epoch": 2.696329254727475, "grad_norm": 7.9375, "learning_rate": 7.303670745272525e-06, "loss": 3.5007, "mean_token_accuracy": 0.44407412345020664, "step": 14544 }, { "epoch": 2.6965146459028553, "grad_norm": 8.953125, "learning_rate": 7.303485354097145e-06, "loss": 2.8548, "mean_token_accuracy": 0.4833788016570678, "step": 14545 }, { "epoch": 2.696700037078235, "grad_norm": 14.1640625, "learning_rate": 7.303299962921766e-06, "loss": 2.5259, "mean_token_accuracy": 0.4767274472168906, "step": 14546 }, { "epoch": 2.696885428253615, "grad_norm": 8.8046875, "learning_rate": 7.303114571746386e-06, "loss": 2.7058, "mean_token_accuracy": 0.48175182481751827, "step": 14547 }, { "epoch": 2.697070819428995, "grad_norm": 7.21484375, "learning_rate": 7.302929180571006e-06, "loss": 3.2638, "mean_token_accuracy": 0.4495754506182035, "step": 14548 }, { "epoch": 2.697256210604375, "grad_norm": 9.109375, "learning_rate": 7.302743789395625e-06, "loss": 3.5182, "mean_token_accuracy": 0.4602321319486866, "step": 14549 }, { "epoch": 2.6974416017797553, "grad_norm": 12.609375, "learning_rate": 7.302558398220245e-06, "loss": 2.6387, "mean_token_accuracy": 0.4958382877526754, "step": 14550 }, { "epoch": 2.6976269929551355, "grad_norm": 7.7109375, "learning_rate": 7.302373007044865e-06, "loss": 3.6533, "mean_token_accuracy": 0.44113778362380796, "step": 14551 }, { "epoch": 2.6978123841305153, "grad_norm": 8.25, "learning_rate": 7.302187615869485e-06, "loss": 2.5149, "mean_token_accuracy": 0.5139035550862372, "step": 14552 }, { "epoch": 2.6979977753058955, "grad_norm": 8.921875, "learning_rate": 7.302002224694105e-06, "loss": 2.8418, "mean_token_accuracy": 0.4843209591883791, "step": 14553 }, { "epoch": 2.6981831664812757, "grad_norm": 7.828125, "learning_rate": 7.301816833518726e-06, "loss": 3.0839, "mean_token_accuracy": 0.4679184425757401, "step": 14554 }, { "epoch": 2.6983685576566554, "grad_norm": 10.2734375, "learning_rate": 7.301631442343346e-06, "loss": 2.8046, "mean_token_accuracy": 0.47708489857250186, "step": 14555 }, { "epoch": 2.6985539488320356, "grad_norm": 7.578125, "learning_rate": 7.301446051167965e-06, "loss": 2.7482, "mean_token_accuracy": 0.4867027535890798, "step": 14556 }, { "epoch": 2.6987393400074158, "grad_norm": 7.76171875, "learning_rate": 7.301260659992585e-06, "loss": 2.604, "mean_token_accuracy": 0.49788484136310224, "step": 14557 }, { "epoch": 2.698924731182796, "grad_norm": 6.88671875, "learning_rate": 7.301075268817205e-06, "loss": 3.3564, "mean_token_accuracy": 0.45613128311151313, "step": 14558 }, { "epoch": 2.6991101223581757, "grad_norm": 7.62109375, "learning_rate": 7.300889877641824e-06, "loss": 2.4932, "mean_token_accuracy": 0.5212969489477394, "step": 14559 }, { "epoch": 2.699295513533556, "grad_norm": 8.7421875, "learning_rate": 7.300704486466445e-06, "loss": 2.261, "mean_token_accuracy": 0.5565659528698543, "step": 14560 }, { "epoch": 2.6994809047089356, "grad_norm": 8.953125, "learning_rate": 7.300519095291064e-06, "loss": 2.8922, "mean_token_accuracy": 0.483424047501237, "step": 14561 }, { "epoch": 2.699666295884316, "grad_norm": 7.55078125, "learning_rate": 7.300333704115685e-06, "loss": 3.364, "mean_token_accuracy": 0.4601726263871763, "step": 14562 }, { "epoch": 2.699851687059696, "grad_norm": 7.0625, "learning_rate": 7.300148312940305e-06, "loss": 2.76, "mean_token_accuracy": 0.49669635162309683, "step": 14563 }, { "epoch": 2.700037078235076, "grad_norm": 9.3828125, "learning_rate": 7.299962921764925e-06, "loss": 2.578, "mean_token_accuracy": 0.49458353394318727, "step": 14564 }, { "epoch": 2.700222469410456, "grad_norm": 11.15625, "learning_rate": 7.299777530589545e-06, "loss": 3.2782, "mean_token_accuracy": 0.4922945205479452, "step": 14565 }, { "epoch": 2.700407860585836, "grad_norm": 6.73046875, "learning_rate": 7.299592139414164e-06, "loss": 2.3026, "mean_token_accuracy": 0.5326549210206561, "step": 14566 }, { "epoch": 2.700593251761216, "grad_norm": 10.2578125, "learning_rate": 7.299406748238784e-06, "loss": 2.8587, "mean_token_accuracy": 0.514873417721519, "step": 14567 }, { "epoch": 2.700778642936596, "grad_norm": 10.4765625, "learning_rate": 7.2992213570634044e-06, "loss": 2.652, "mean_token_accuracy": 0.4986449864498645, "step": 14568 }, { "epoch": 2.7009640341119763, "grad_norm": 9.453125, "learning_rate": 7.299035965888024e-06, "loss": 2.9433, "mean_token_accuracy": 0.4885746929448729, "step": 14569 }, { "epoch": 2.7011494252873565, "grad_norm": 11.53125, "learning_rate": 7.298850574712645e-06, "loss": 3.2667, "mean_token_accuracy": 0.4507815800591466, "step": 14570 }, { "epoch": 2.701334816462736, "grad_norm": 10.0546875, "learning_rate": 7.298665183537264e-06, "loss": 2.7379, "mean_token_accuracy": 0.500070751379652, "step": 14571 }, { "epoch": 2.7015202076381164, "grad_norm": 7.91796875, "learning_rate": 7.298479792361885e-06, "loss": 2.6368, "mean_token_accuracy": 0.4911937377690802, "step": 14572 }, { "epoch": 2.7017055988134966, "grad_norm": 10.8515625, "learning_rate": 7.298294401186504e-06, "loss": 3.1396, "mean_token_accuracy": 0.4870702179176755, "step": 14573 }, { "epoch": 2.7018909899888763, "grad_norm": 7.828125, "learning_rate": 7.298109010011124e-06, "loss": 2.6299, "mean_token_accuracy": 0.4980030721966206, "step": 14574 }, { "epoch": 2.7020763811642565, "grad_norm": 7.9609375, "learning_rate": 7.297923618835744e-06, "loss": 3.8773, "mean_token_accuracy": 0.42565633943013564, "step": 14575 }, { "epoch": 2.7022617723396367, "grad_norm": 10.546875, "learning_rate": 7.297738227660363e-06, "loss": 2.6699, "mean_token_accuracy": 0.49074930619796486, "step": 14576 }, { "epoch": 2.702447163515017, "grad_norm": 8.4765625, "learning_rate": 7.297552836484984e-06, "loss": 3.2974, "mean_token_accuracy": 0.44717683222779486, "step": 14577 }, { "epoch": 2.7026325546903966, "grad_norm": 8.484375, "learning_rate": 7.297367445309604e-06, "loss": 3.5365, "mean_token_accuracy": 0.4262587256794891, "step": 14578 }, { "epoch": 2.702817945865777, "grad_norm": 9.71875, "learning_rate": 7.297182054134224e-06, "loss": 3.2692, "mean_token_accuracy": 0.45686705767350927, "step": 14579 }, { "epoch": 2.7030033370411566, "grad_norm": 9.7890625, "learning_rate": 7.296996662958844e-06, "loss": 2.9154, "mean_token_accuracy": 0.48645320197044334, "step": 14580 }, { "epoch": 2.7031887282165368, "grad_norm": 10.6015625, "learning_rate": 7.296811271783464e-06, "loss": 2.7109, "mean_token_accuracy": 0.5182587986239746, "step": 14581 }, { "epoch": 2.703374119391917, "grad_norm": 7.90234375, "learning_rate": 7.296625880608084e-06, "loss": 3.5476, "mean_token_accuracy": 0.4400372222868442, "step": 14582 }, { "epoch": 2.703559510567297, "grad_norm": 10.53125, "learning_rate": 7.296440489432703e-06, "loss": 2.7235, "mean_token_accuracy": 0.504010923365762, "step": 14583 }, { "epoch": 2.703744901742677, "grad_norm": 10.1796875, "learning_rate": 7.296255098257323e-06, "loss": 2.37, "mean_token_accuracy": 0.5096791661026127, "step": 14584 }, { "epoch": 2.703930292918057, "grad_norm": 7.33203125, "learning_rate": 7.2960697070819435e-06, "loss": 3.3059, "mean_token_accuracy": 0.45863488009081876, "step": 14585 }, { "epoch": 2.7041156840934373, "grad_norm": 7.83203125, "learning_rate": 7.295884315906564e-06, "loss": 2.7381, "mean_token_accuracy": 0.47505981962083565, "step": 14586 }, { "epoch": 2.704301075268817, "grad_norm": 9.3984375, "learning_rate": 7.295698924731184e-06, "loss": 2.8303, "mean_token_accuracy": 0.4859629421673217, "step": 14587 }, { "epoch": 2.704486466444197, "grad_norm": 8.15625, "learning_rate": 7.295513533555803e-06, "loss": 3.9349, "mean_token_accuracy": 0.4351234809878479, "step": 14588 }, { "epoch": 2.7046718576195774, "grad_norm": 11.1875, "learning_rate": 7.295328142380424e-06, "loss": 2.5918, "mean_token_accuracy": 0.4888063776345965, "step": 14589 }, { "epoch": 2.7048572487949576, "grad_norm": 9.3203125, "learning_rate": 7.295142751205043e-06, "loss": 3.1253, "mean_token_accuracy": 0.46685800191335247, "step": 14590 }, { "epoch": 2.7050426399703373, "grad_norm": 6.56640625, "learning_rate": 7.294957360029663e-06, "loss": 2.4609, "mean_token_accuracy": 0.5107930015905476, "step": 14591 }, { "epoch": 2.7052280311457175, "grad_norm": 7.5078125, "learning_rate": 7.294771968854283e-06, "loss": 3.0263, "mean_token_accuracy": 0.4997263273125342, "step": 14592 }, { "epoch": 2.7054134223210973, "grad_norm": 8.875, "learning_rate": 7.294586577678902e-06, "loss": 3.4129, "mean_token_accuracy": 0.44300961918566345, "step": 14593 }, { "epoch": 2.7055988134964775, "grad_norm": 7.1015625, "learning_rate": 7.294401186503524e-06, "loss": 2.6876, "mean_token_accuracy": 0.5092054263565892, "step": 14594 }, { "epoch": 2.7057842046718577, "grad_norm": 9.578125, "learning_rate": 7.294215795328143e-06, "loss": 2.9596, "mean_token_accuracy": 0.4739084132055378, "step": 14595 }, { "epoch": 2.705969595847238, "grad_norm": 9.484375, "learning_rate": 7.294030404152763e-06, "loss": 2.5546, "mean_token_accuracy": 0.5499793757734085, "step": 14596 }, { "epoch": 2.7061549870226176, "grad_norm": 9.609375, "learning_rate": 7.293845012977383e-06, "loss": 3.0804, "mean_token_accuracy": 0.4592448098091124, "step": 14597 }, { "epoch": 2.7063403781979978, "grad_norm": 8.34375, "learning_rate": 7.293659621802003e-06, "loss": 3.1929, "mean_token_accuracy": 0.46561021239132827, "step": 14598 }, { "epoch": 2.706525769373378, "grad_norm": 8.8671875, "learning_rate": 7.293474230626623e-06, "loss": 2.5953, "mean_token_accuracy": 0.4842642939528753, "step": 14599 }, { "epoch": 2.7067111605487577, "grad_norm": 10.7265625, "learning_rate": 7.293288839451242e-06, "loss": 3.7388, "mean_token_accuracy": 0.4416994706121895, "step": 14600 }, { "epoch": 2.706896551724138, "grad_norm": 7.4921875, "learning_rate": 7.293103448275862e-06, "loss": 2.7389, "mean_token_accuracy": 0.48456397199236156, "step": 14601 }, { "epoch": 2.707081942899518, "grad_norm": 8.2578125, "learning_rate": 7.292918057100482e-06, "loss": 2.6578, "mean_token_accuracy": 0.4995408631772268, "step": 14602 }, { "epoch": 2.7072673340748983, "grad_norm": 7.23046875, "learning_rate": 7.292732665925103e-06, "loss": 2.919, "mean_token_accuracy": 0.4654965135849964, "step": 14603 }, { "epoch": 2.707452725250278, "grad_norm": 9.3125, "learning_rate": 7.292547274749723e-06, "loss": 3.2368, "mean_token_accuracy": 0.45878524945770066, "step": 14604 }, { "epoch": 2.707638116425658, "grad_norm": 7.4921875, "learning_rate": 7.292361883574342e-06, "loss": 2.9561, "mean_token_accuracy": 0.4618917576961271, "step": 14605 }, { "epoch": 2.707823507601038, "grad_norm": 7.1796875, "learning_rate": 7.292176492398963e-06, "loss": 2.7712, "mean_token_accuracy": 0.49917970031718256, "step": 14606 }, { "epoch": 2.708008898776418, "grad_norm": 7.4765625, "learning_rate": 7.2919911012235824e-06, "loss": 3.0656, "mean_token_accuracy": 0.470503300330033, "step": 14607 }, { "epoch": 2.7081942899517983, "grad_norm": 6.85546875, "learning_rate": 7.291805710048202e-06, "loss": 3.1917, "mean_token_accuracy": 0.45124674358020095, "step": 14608 }, { "epoch": 2.7083796811271785, "grad_norm": 7.39453125, "learning_rate": 7.291620318872822e-06, "loss": 2.8625, "mean_token_accuracy": 0.4962579942849367, "step": 14609 }, { "epoch": 2.7085650723025583, "grad_norm": 7.53515625, "learning_rate": 7.291434927697441e-06, "loss": 2.8933, "mean_token_accuracy": 0.4896751101321586, "step": 14610 }, { "epoch": 2.7087504634779385, "grad_norm": 7.6171875, "learning_rate": 7.291249536522063e-06, "loss": 2.9236, "mean_token_accuracy": 0.4631633178773828, "step": 14611 }, { "epoch": 2.7089358546533187, "grad_norm": 6.25390625, "learning_rate": 7.291064145346682e-06, "loss": 3.0096, "mean_token_accuracy": 0.46203534430225474, "step": 14612 }, { "epoch": 2.7091212458286984, "grad_norm": 7.09765625, "learning_rate": 7.290878754171302e-06, "loss": 2.9686, "mean_token_accuracy": 0.4742857142857143, "step": 14613 }, { "epoch": 2.7093066370040786, "grad_norm": 8.34375, "learning_rate": 7.290693362995922e-06, "loss": 2.8423, "mean_token_accuracy": 0.4909592822636301, "step": 14614 }, { "epoch": 2.709492028179459, "grad_norm": 10.1328125, "learning_rate": 7.290507971820542e-06, "loss": 2.7285, "mean_token_accuracy": 0.52606043803715, "step": 14615 }, { "epoch": 2.709677419354839, "grad_norm": 7.13671875, "learning_rate": 7.290322580645162e-06, "loss": 2.7659, "mean_token_accuracy": 0.4726767520839765, "step": 14616 }, { "epoch": 2.7098628105302187, "grad_norm": 6.71875, "learning_rate": 7.290137189469781e-06, "loss": 2.8133, "mean_token_accuracy": 0.4822954822954823, "step": 14617 }, { "epoch": 2.710048201705599, "grad_norm": 7.66796875, "learning_rate": 7.289951798294401e-06, "loss": 2.1624, "mean_token_accuracy": 0.5574008171872941, "step": 14618 }, { "epoch": 2.7102335928809786, "grad_norm": 7.5390625, "learning_rate": 7.289766407119022e-06, "loss": 3.0709, "mean_token_accuracy": 0.43725652225239053, "step": 14619 }, { "epoch": 2.710418984056359, "grad_norm": 7.75, "learning_rate": 7.289581015943642e-06, "loss": 2.9743, "mean_token_accuracy": 0.45851306774930223, "step": 14620 }, { "epoch": 2.710604375231739, "grad_norm": 10.2578125, "learning_rate": 7.289395624768262e-06, "loss": 2.6218, "mean_token_accuracy": 0.4846275752773376, "step": 14621 }, { "epoch": 2.710789766407119, "grad_norm": 8.296875, "learning_rate": 7.289210233592881e-06, "loss": 2.8298, "mean_token_accuracy": 0.45516707521610117, "step": 14622 }, { "epoch": 2.710975157582499, "grad_norm": 7.84375, "learning_rate": 7.289024842417502e-06, "loss": 2.5224, "mean_token_accuracy": 0.5372015226669743, "step": 14623 }, { "epoch": 2.711160548757879, "grad_norm": 6.62109375, "learning_rate": 7.2888394512421215e-06, "loss": 2.8167, "mean_token_accuracy": 0.4714076246334311, "step": 14624 }, { "epoch": 2.7113459399332593, "grad_norm": 7.65234375, "learning_rate": 7.288654060066741e-06, "loss": 2.8195, "mean_token_accuracy": 0.5288387025351489, "step": 14625 }, { "epoch": 2.711531331108639, "grad_norm": 9.3359375, "learning_rate": 7.288468668891361e-06, "loss": 2.7815, "mean_token_accuracy": 0.49747715805263876, "step": 14626 }, { "epoch": 2.7117167222840193, "grad_norm": 7.67578125, "learning_rate": 7.288283277715982e-06, "loss": 2.5575, "mean_token_accuracy": 0.5338971674050208, "step": 14627 }, { "epoch": 2.7119021134593995, "grad_norm": 7.13671875, "learning_rate": 7.288097886540602e-06, "loss": 2.3913, "mean_token_accuracy": 0.5031122349737405, "step": 14628 }, { "epoch": 2.7120875046347797, "grad_norm": 7.30859375, "learning_rate": 7.287912495365221e-06, "loss": 2.8039, "mean_token_accuracy": 0.48686197523406827, "step": 14629 }, { "epoch": 2.7122728958101594, "grad_norm": 7.85546875, "learning_rate": 7.287727104189841e-06, "loss": 2.1497, "mean_token_accuracy": 0.574089874857793, "step": 14630 }, { "epoch": 2.7124582869855396, "grad_norm": 7.546875, "learning_rate": 7.287541713014461e-06, "loss": 3.4947, "mean_token_accuracy": 0.44298444343754717, "step": 14631 }, { "epoch": 2.7126436781609193, "grad_norm": 7.51171875, "learning_rate": 7.287356321839081e-06, "loss": 2.5436, "mean_token_accuracy": 0.5137345867415456, "step": 14632 }, { "epoch": 2.7128290693362995, "grad_norm": 7.171875, "learning_rate": 7.287170930663701e-06, "loss": 2.6861, "mean_token_accuracy": 0.5251538183134274, "step": 14633 }, { "epoch": 2.7130144605116797, "grad_norm": 7.25390625, "learning_rate": 7.2869855394883204e-06, "loss": 3.1023, "mean_token_accuracy": 0.45741056218057924, "step": 14634 }, { "epoch": 2.71319985168706, "grad_norm": 6.5078125, "learning_rate": 7.286800148312941e-06, "loss": 2.5777, "mean_token_accuracy": 0.4917541229385307, "step": 14635 }, { "epoch": 2.7133852428624397, "grad_norm": 6.5, "learning_rate": 7.2866147571375614e-06, "loss": 2.8021, "mean_token_accuracy": 0.48037331869338457, "step": 14636 }, { "epoch": 2.71357063403782, "grad_norm": 9.1328125, "learning_rate": 7.286429365962181e-06, "loss": 3.5122, "mean_token_accuracy": 0.4454772160507821, "step": 14637 }, { "epoch": 2.7137560252131996, "grad_norm": 7.6171875, "learning_rate": 7.286243974786801e-06, "loss": 3.0706, "mean_token_accuracy": 0.4664378860672615, "step": 14638 }, { "epoch": 2.7139414163885798, "grad_norm": 7.0078125, "learning_rate": 7.28605858361142e-06, "loss": 2.9132, "mean_token_accuracy": 0.4592250400678797, "step": 14639 }, { "epoch": 2.71412680756396, "grad_norm": 7.40625, "learning_rate": 7.28587319243604e-06, "loss": 2.7205, "mean_token_accuracy": 0.480814408770556, "step": 14640 }, { "epoch": 2.71431219873934, "grad_norm": 10.3125, "learning_rate": 7.2856878012606605e-06, "loss": 2.9284, "mean_token_accuracy": 0.5020358306188925, "step": 14641 }, { "epoch": 2.71449758991472, "grad_norm": 8.8046875, "learning_rate": 7.28550241008528e-06, "loss": 2.8961, "mean_token_accuracy": 0.4637138429752066, "step": 14642 }, { "epoch": 2.7146829810901, "grad_norm": 7.33203125, "learning_rate": 7.285317018909901e-06, "loss": 2.858, "mean_token_accuracy": 0.4799503927242662, "step": 14643 }, { "epoch": 2.7148683722654803, "grad_norm": 9.5234375, "learning_rate": 7.285131627734521e-06, "loss": 3.4392, "mean_token_accuracy": 0.4286858974358974, "step": 14644 }, { "epoch": 2.71505376344086, "grad_norm": 14.4609375, "learning_rate": 7.284946236559141e-06, "loss": 3.4729, "mean_token_accuracy": 0.44673003262874167, "step": 14645 }, { "epoch": 2.71523915461624, "grad_norm": 7.703125, "learning_rate": 7.28476084538376e-06, "loss": 3.2568, "mean_token_accuracy": 0.4333284435968901, "step": 14646 }, { "epoch": 2.7154245457916204, "grad_norm": 9.1796875, "learning_rate": 7.28457545420838e-06, "loss": 2.7446, "mean_token_accuracy": 0.498164733519307, "step": 14647 }, { "epoch": 2.7156099369670006, "grad_norm": 8.15625, "learning_rate": 7.284390063033e-06, "loss": 2.4834, "mean_token_accuracy": 0.5175438596491229, "step": 14648 }, { "epoch": 2.7157953281423803, "grad_norm": 7.81640625, "learning_rate": 7.28420467185762e-06, "loss": 2.7912, "mean_token_accuracy": 0.4874969355234126, "step": 14649 }, { "epoch": 2.7159807193177605, "grad_norm": 6.80859375, "learning_rate": 7.28401928068224e-06, "loss": 2.8824, "mean_token_accuracy": 0.4808070221407048, "step": 14650 }, { "epoch": 2.7161661104931403, "grad_norm": 9.671875, "learning_rate": 7.28383388950686e-06, "loss": 2.4154, "mean_token_accuracy": 0.5250737463126843, "step": 14651 }, { "epoch": 2.7163515016685205, "grad_norm": 7.59375, "learning_rate": 7.28364849833148e-06, "loss": 3.0658, "mean_token_accuracy": 0.4571004085675375, "step": 14652 }, { "epoch": 2.7165368928439007, "grad_norm": 7.203125, "learning_rate": 7.2834631071561005e-06, "loss": 3.3442, "mean_token_accuracy": 0.41854838709677417, "step": 14653 }, { "epoch": 2.716722284019281, "grad_norm": 7.70703125, "learning_rate": 7.28327771598072e-06, "loss": 3.0118, "mean_token_accuracy": 0.4775469585769374, "step": 14654 }, { "epoch": 2.7169076751946606, "grad_norm": 12.0390625, "learning_rate": 7.28309232480534e-06, "loss": 3.2518, "mean_token_accuracy": 0.49902912621359224, "step": 14655 }, { "epoch": 2.7170930663700408, "grad_norm": 8.625, "learning_rate": 7.282906933629959e-06, "loss": 2.8118, "mean_token_accuracy": 0.4954706577392674, "step": 14656 }, { "epoch": 2.717278457545421, "grad_norm": 7.1171875, "learning_rate": 7.282721542454579e-06, "loss": 2.8463, "mean_token_accuracy": 0.48488960157888245, "step": 14657 }, { "epoch": 2.7174638487208007, "grad_norm": 7.8984375, "learning_rate": 7.2825361512791995e-06, "loss": 2.8991, "mean_token_accuracy": 0.4812283100220843, "step": 14658 }, { "epoch": 2.717649239896181, "grad_norm": 8.6328125, "learning_rate": 7.28235076010382e-06, "loss": 2.5967, "mean_token_accuracy": 0.5195960567444097, "step": 14659 }, { "epoch": 2.717834631071561, "grad_norm": 6.78125, "learning_rate": 7.28216536892844e-06, "loss": 2.8318, "mean_token_accuracy": 0.47472289808056234, "step": 14660 }, { "epoch": 2.7180200222469413, "grad_norm": 8.4296875, "learning_rate": 7.28197997775306e-06, "loss": 2.8769, "mean_token_accuracy": 0.5005065856129686, "step": 14661 }, { "epoch": 2.718205413422321, "grad_norm": 8.9375, "learning_rate": 7.28179458657768e-06, "loss": 2.5784, "mean_token_accuracy": 0.5031138200609514, "step": 14662 }, { "epoch": 2.718390804597701, "grad_norm": 7.640625, "learning_rate": 7.2816091954022994e-06, "loss": 2.543, "mean_token_accuracy": 0.5456287935257161, "step": 14663 }, { "epoch": 2.718576195773081, "grad_norm": 7.9140625, "learning_rate": 7.281423804226919e-06, "loss": 2.6385, "mean_token_accuracy": 0.5221164613661814, "step": 14664 }, { "epoch": 2.718761586948461, "grad_norm": 7.15234375, "learning_rate": 7.281238413051539e-06, "loss": 2.5316, "mean_token_accuracy": 0.5045362903225806, "step": 14665 }, { "epoch": 2.7189469781238413, "grad_norm": 8.2109375, "learning_rate": 7.281053021876159e-06, "loss": 2.5829, "mean_token_accuracy": 0.508139023317202, "step": 14666 }, { "epoch": 2.7191323692992215, "grad_norm": 8.25, "learning_rate": 7.28086763070078e-06, "loss": 3.2634, "mean_token_accuracy": 0.45217391304347826, "step": 14667 }, { "epoch": 2.7193177604746013, "grad_norm": 7.9140625, "learning_rate": 7.280682239525399e-06, "loss": 3.0041, "mean_token_accuracy": 0.47413134784268807, "step": 14668 }, { "epoch": 2.7195031516499815, "grad_norm": 8.71875, "learning_rate": 7.280496848350019e-06, "loss": 3.6179, "mean_token_accuracy": 0.431859649122807, "step": 14669 }, { "epoch": 2.7196885428253617, "grad_norm": 7.1171875, "learning_rate": 7.2803114571746395e-06, "loss": 3.5975, "mean_token_accuracy": 0.4420352346926204, "step": 14670 }, { "epoch": 2.7198739340007414, "grad_norm": 7.3125, "learning_rate": 7.280126065999259e-06, "loss": 2.4539, "mean_token_accuracy": 0.5286009648518263, "step": 14671 }, { "epoch": 2.7200593251761216, "grad_norm": 9.265625, "learning_rate": 7.279940674823879e-06, "loss": 3.4695, "mean_token_accuracy": 0.45538802047309446, "step": 14672 }, { "epoch": 2.720244716351502, "grad_norm": 6.93359375, "learning_rate": 7.2797552836484984e-06, "loss": 2.8769, "mean_token_accuracy": 0.4605739760378936, "step": 14673 }, { "epoch": 2.720430107526882, "grad_norm": 8.3515625, "learning_rate": 7.279569892473118e-06, "loss": 2.5538, "mean_token_accuracy": 0.522322890514136, "step": 14674 }, { "epoch": 2.7206154987022617, "grad_norm": 7.42578125, "learning_rate": 7.279384501297739e-06, "loss": 2.6921, "mean_token_accuracy": 0.4814420803782506, "step": 14675 }, { "epoch": 2.720800889877642, "grad_norm": 7.68359375, "learning_rate": 7.279199110122359e-06, "loss": 3.1958, "mean_token_accuracy": 0.47414820109602096, "step": 14676 }, { "epoch": 2.7209862810530216, "grad_norm": 7.32421875, "learning_rate": 7.279013718946979e-06, "loss": 2.8463, "mean_token_accuracy": 0.47734487734487735, "step": 14677 }, { "epoch": 2.721171672228402, "grad_norm": 8.9609375, "learning_rate": 7.278828327771598e-06, "loss": 3.6245, "mean_token_accuracy": 0.45037121644774414, "step": 14678 }, { "epoch": 2.721357063403782, "grad_norm": 9.6171875, "learning_rate": 7.278642936596219e-06, "loss": 2.5197, "mean_token_accuracy": 0.5088744102448888, "step": 14679 }, { "epoch": 2.721542454579162, "grad_norm": 7.4609375, "learning_rate": 7.2784575454208385e-06, "loss": 3.2373, "mean_token_accuracy": 0.4440150176678445, "step": 14680 }, { "epoch": 2.721727845754542, "grad_norm": 6.5703125, "learning_rate": 7.278272154245458e-06, "loss": 2.962, "mean_token_accuracy": 0.4866598255515649, "step": 14681 }, { "epoch": 2.721913236929922, "grad_norm": 10.7265625, "learning_rate": 7.278086763070078e-06, "loss": 3.6383, "mean_token_accuracy": 0.4645633971291866, "step": 14682 }, { "epoch": 2.7220986281053023, "grad_norm": 6.5546875, "learning_rate": 7.277901371894699e-06, "loss": 2.775, "mean_token_accuracy": 0.5201890034364262, "step": 14683 }, { "epoch": 2.722284019280682, "grad_norm": 7.4140625, "learning_rate": 7.277715980719319e-06, "loss": 2.7628, "mean_token_accuracy": 0.48069586448954676, "step": 14684 }, { "epoch": 2.7224694104560623, "grad_norm": 6.921875, "learning_rate": 7.277530589543938e-06, "loss": 2.2859, "mean_token_accuracy": 0.5444877029721774, "step": 14685 }, { "epoch": 2.7226548016314425, "grad_norm": 6.984375, "learning_rate": 7.277345198368558e-06, "loss": 2.2994, "mean_token_accuracy": 0.5664910702813825, "step": 14686 }, { "epoch": 2.7228401928068227, "grad_norm": 6.16796875, "learning_rate": 7.2771598071931785e-06, "loss": 2.7646, "mean_token_accuracy": 0.4840130916414904, "step": 14687 }, { "epoch": 2.7230255839822024, "grad_norm": 8.625, "learning_rate": 7.276974416017798e-06, "loss": 2.2574, "mean_token_accuracy": 0.5229303156640858, "step": 14688 }, { "epoch": 2.7232109751575826, "grad_norm": 8.953125, "learning_rate": 7.276789024842418e-06, "loss": 1.648, "mean_token_accuracy": 0.6362763915547025, "step": 14689 }, { "epoch": 2.7233963663329623, "grad_norm": 7.3828125, "learning_rate": 7.2766036336670375e-06, "loss": 2.8386, "mean_token_accuracy": 0.4729913137893594, "step": 14690 }, { "epoch": 2.7235817575083425, "grad_norm": 11.4296875, "learning_rate": 7.276418242491659e-06, "loss": 3.1823, "mean_token_accuracy": 0.4911887215636014, "step": 14691 }, { "epoch": 2.7237671486837227, "grad_norm": 8.421875, "learning_rate": 7.2762328513162784e-06, "loss": 2.8446, "mean_token_accuracy": 0.4702199144777031, "step": 14692 }, { "epoch": 2.723952539859103, "grad_norm": 7.98046875, "learning_rate": 7.276047460140898e-06, "loss": 2.9023, "mean_token_accuracy": 0.4807436918990704, "step": 14693 }, { "epoch": 2.7241379310344827, "grad_norm": 7.26953125, "learning_rate": 7.275862068965518e-06, "loss": 2.8204, "mean_token_accuracy": 0.4970434782608696, "step": 14694 }, { "epoch": 2.724323322209863, "grad_norm": 6.1171875, "learning_rate": 7.275676677790137e-06, "loss": 2.316, "mean_token_accuracy": 0.5496489468405216, "step": 14695 }, { "epoch": 2.7245087133852426, "grad_norm": 11.828125, "learning_rate": 7.275491286614758e-06, "loss": 3.4062, "mean_token_accuracy": 0.48497221277078373, "step": 14696 }, { "epoch": 2.7246941045606228, "grad_norm": 8.203125, "learning_rate": 7.2753058954393775e-06, "loss": 3.3733, "mean_token_accuracy": 0.45524672462348476, "step": 14697 }, { "epoch": 2.724879495736003, "grad_norm": 9.28125, "learning_rate": 7.275120504263997e-06, "loss": 2.7203, "mean_token_accuracy": 0.47891963109354413, "step": 14698 }, { "epoch": 2.725064886911383, "grad_norm": 8.4453125, "learning_rate": 7.2749351130886185e-06, "loss": 2.431, "mean_token_accuracy": 0.5080235357047339, "step": 14699 }, { "epoch": 2.7252502780867633, "grad_norm": 11.0, "learning_rate": 7.274749721913238e-06, "loss": 2.6897, "mean_token_accuracy": 0.4839404822986147, "step": 14700 }, { "epoch": 2.725435669262143, "grad_norm": 9.0390625, "learning_rate": 7.274564330737858e-06, "loss": 2.9027, "mean_token_accuracy": 0.496449951273841, "step": 14701 }, { "epoch": 2.7256210604375233, "grad_norm": 8.359375, "learning_rate": 7.2743789395624774e-06, "loss": 3.2311, "mean_token_accuracy": 0.47159173051901115, "step": 14702 }, { "epoch": 2.725806451612903, "grad_norm": 9.140625, "learning_rate": 7.274193548387097e-06, "loss": 3.228, "mean_token_accuracy": 0.45765587445214195, "step": 14703 }, { "epoch": 2.725991842788283, "grad_norm": 10.6640625, "learning_rate": 7.2740081572117176e-06, "loss": 2.5513, "mean_token_accuracy": 0.5542971352431713, "step": 14704 }, { "epoch": 2.7261772339636634, "grad_norm": 9.859375, "learning_rate": 7.273822766036337e-06, "loss": 3.0244, "mean_token_accuracy": 0.45500848896434637, "step": 14705 }, { "epoch": 2.7263626251390436, "grad_norm": 7.19921875, "learning_rate": 7.273637374860957e-06, "loss": 2.2374, "mean_token_accuracy": 0.5652978600347022, "step": 14706 }, { "epoch": 2.7265480163144233, "grad_norm": 10.734375, "learning_rate": 7.273451983685577e-06, "loss": 2.8502, "mean_token_accuracy": 0.4843830665978317, "step": 14707 }, { "epoch": 2.7267334074898035, "grad_norm": 7.54296875, "learning_rate": 7.273266592510198e-06, "loss": 2.7886, "mean_token_accuracy": 0.5159301130524152, "step": 14708 }, { "epoch": 2.7269187986651833, "grad_norm": 8.59375, "learning_rate": 7.2730812013348175e-06, "loss": 2.4436, "mean_token_accuracy": 0.5071707953063885, "step": 14709 }, { "epoch": 2.7271041898405635, "grad_norm": 9.5390625, "learning_rate": 7.272895810159437e-06, "loss": 2.8527, "mean_token_accuracy": 0.4840880160029096, "step": 14710 }, { "epoch": 2.7272895810159437, "grad_norm": 7.015625, "learning_rate": 7.272710418984057e-06, "loss": 3.0903, "mean_token_accuracy": 0.4669728011303426, "step": 14711 }, { "epoch": 2.727474972191324, "grad_norm": 9.546875, "learning_rate": 7.272525027808676e-06, "loss": 2.7678, "mean_token_accuracy": 0.5144085521729026, "step": 14712 }, { "epoch": 2.7276603633667036, "grad_norm": 7.34765625, "learning_rate": 7.272339636633297e-06, "loss": 2.3313, "mean_token_accuracy": 0.5395047903446559, "step": 14713 }, { "epoch": 2.727845754542084, "grad_norm": 7.1328125, "learning_rate": 7.2721542454579166e-06, "loss": 2.9809, "mean_token_accuracy": 0.5058353144586125, "step": 14714 }, { "epoch": 2.728031145717464, "grad_norm": 8.7890625, "learning_rate": 7.271968854282537e-06, "loss": 2.9453, "mean_token_accuracy": 0.4981899887654475, "step": 14715 }, { "epoch": 2.7282165368928437, "grad_norm": 8.578125, "learning_rate": 7.271783463107157e-06, "loss": 2.8812, "mean_token_accuracy": 0.48714262897300753, "step": 14716 }, { "epoch": 2.728401928068224, "grad_norm": 6.4375, "learning_rate": 7.271598071931777e-06, "loss": 2.7284, "mean_token_accuracy": 0.48509410633328753, "step": 14717 }, { "epoch": 2.728587319243604, "grad_norm": 7.18359375, "learning_rate": 7.271412680756397e-06, "loss": 2.6055, "mean_token_accuracy": 0.5002770083102493, "step": 14718 }, { "epoch": 2.7287727104189843, "grad_norm": 8.8046875, "learning_rate": 7.2712272895810165e-06, "loss": 3.0788, "mean_token_accuracy": 0.47523519645821805, "step": 14719 }, { "epoch": 2.728958101594364, "grad_norm": 10.140625, "learning_rate": 7.271041898405636e-06, "loss": 3.2674, "mean_token_accuracy": 0.45637788862872847, "step": 14720 }, { "epoch": 2.729143492769744, "grad_norm": 7.05859375, "learning_rate": 7.270856507230256e-06, "loss": 2.9036, "mean_token_accuracy": 0.49731267783749605, "step": 14721 }, { "epoch": 2.729328883945124, "grad_norm": 9.1875, "learning_rate": 7.270671116054876e-06, "loss": 3.5934, "mean_token_accuracy": 0.4600160470714095, "step": 14722 }, { "epoch": 2.729514275120504, "grad_norm": 7.34375, "learning_rate": 7.270485724879496e-06, "loss": 3.0136, "mean_token_accuracy": 0.4784653200812226, "step": 14723 }, { "epoch": 2.7296996662958843, "grad_norm": 7.57421875, "learning_rate": 7.270300333704116e-06, "loss": 3.2421, "mean_token_accuracy": 0.4791891309036015, "step": 14724 }, { "epoch": 2.7298850574712645, "grad_norm": 11.7265625, "learning_rate": 7.270114942528737e-06, "loss": 3.0774, "mean_token_accuracy": 0.46306978848546543, "step": 14725 }, { "epoch": 2.7300704486466443, "grad_norm": 7.1484375, "learning_rate": 7.2699295513533565e-06, "loss": 3.1081, "mean_token_accuracy": 0.46954387990762125, "step": 14726 }, { "epoch": 2.7302558398220245, "grad_norm": 8.3984375, "learning_rate": 7.269744160177976e-06, "loss": 2.6353, "mean_token_accuracy": 0.5100682039623254, "step": 14727 }, { "epoch": 2.7304412309974047, "grad_norm": 9.515625, "learning_rate": 7.269558769002596e-06, "loss": 2.7495, "mean_token_accuracy": 0.5179581447963801, "step": 14728 }, { "epoch": 2.7306266221727844, "grad_norm": 12.75, "learning_rate": 7.2693733778272155e-06, "loss": 2.5371, "mean_token_accuracy": 0.509911599249933, "step": 14729 }, { "epoch": 2.7308120133481646, "grad_norm": 6.8125, "learning_rate": 7.269187986651836e-06, "loss": 3.2643, "mean_token_accuracy": 0.4514231197328669, "step": 14730 }, { "epoch": 2.730997404523545, "grad_norm": 9.1640625, "learning_rate": 7.269002595476456e-06, "loss": 3.1893, "mean_token_accuracy": 0.4823446327683616, "step": 14731 }, { "epoch": 2.731182795698925, "grad_norm": 17.78125, "learning_rate": 7.268817204301076e-06, "loss": 2.7713, "mean_token_accuracy": 0.47913500876680304, "step": 14732 }, { "epoch": 2.7313681868743047, "grad_norm": 9.34375, "learning_rate": 7.268631813125696e-06, "loss": 2.3493, "mean_token_accuracy": 0.5169420330439652, "step": 14733 }, { "epoch": 2.731553578049685, "grad_norm": 6.8828125, "learning_rate": 7.268446421950316e-06, "loss": 3.2546, "mean_token_accuracy": 0.43972257250945773, "step": 14734 }, { "epoch": 2.7317389692250647, "grad_norm": 10.984375, "learning_rate": 7.268261030774936e-06, "loss": 3.12, "mean_token_accuracy": 0.46378269617706236, "step": 14735 }, { "epoch": 2.731924360400445, "grad_norm": 13.4375, "learning_rate": 7.2680756395995555e-06, "loss": 2.588, "mean_token_accuracy": 0.5037073052145375, "step": 14736 }, { "epoch": 2.732109751575825, "grad_norm": 8.0859375, "learning_rate": 7.267890248424175e-06, "loss": 2.748, "mean_token_accuracy": 0.49677196218584274, "step": 14737 }, { "epoch": 2.732295142751205, "grad_norm": 10.046875, "learning_rate": 7.267704857248795e-06, "loss": 2.7569, "mean_token_accuracy": 0.5062130177514793, "step": 14738 }, { "epoch": 2.732480533926585, "grad_norm": 10.9453125, "learning_rate": 7.267519466073415e-06, "loss": 2.6763, "mean_token_accuracy": 0.4783178304729464, "step": 14739 }, { "epoch": 2.732665925101965, "grad_norm": 12.203125, "learning_rate": 7.267334074898036e-06, "loss": 2.6665, "mean_token_accuracy": 0.4830818109610802, "step": 14740 }, { "epoch": 2.7328513162773453, "grad_norm": 9.03125, "learning_rate": 7.267148683722655e-06, "loss": 2.6469, "mean_token_accuracy": 0.48718300205620285, "step": 14741 }, { "epoch": 2.733036707452725, "grad_norm": 8.5, "learning_rate": 7.266963292547276e-06, "loss": 2.7229, "mean_token_accuracy": 0.4663755458515284, "step": 14742 }, { "epoch": 2.7332220986281053, "grad_norm": 9.4375, "learning_rate": 7.2667779013718956e-06, "loss": 3.566, "mean_token_accuracy": 0.4352910602910603, "step": 14743 }, { "epoch": 2.7334074898034855, "grad_norm": 8.0625, "learning_rate": 7.266592510196515e-06, "loss": 2.3909, "mean_token_accuracy": 0.507703777335984, "step": 14744 }, { "epoch": 2.7335928809788657, "grad_norm": 7.18359375, "learning_rate": 7.266407119021135e-06, "loss": 2.9623, "mean_token_accuracy": 0.4589978509218414, "step": 14745 }, { "epoch": 2.7337782721542454, "grad_norm": 6.5546875, "learning_rate": 7.2662217278457545e-06, "loss": 3.3152, "mean_token_accuracy": 0.446102091560626, "step": 14746 }, { "epoch": 2.7339636633296256, "grad_norm": 7.61328125, "learning_rate": 7.266036336670375e-06, "loss": 3.0274, "mean_token_accuracy": 0.4671400903808909, "step": 14747 }, { "epoch": 2.7341490545050053, "grad_norm": 7.25390625, "learning_rate": 7.2658509454949955e-06, "loss": 2.6932, "mean_token_accuracy": 0.4874629280129415, "step": 14748 }, { "epoch": 2.7343344456803855, "grad_norm": 8.8359375, "learning_rate": 7.265665554319615e-06, "loss": 2.7605, "mean_token_accuracy": 0.48218430034129695, "step": 14749 }, { "epoch": 2.7345198368557657, "grad_norm": 8.734375, "learning_rate": 7.265480163144235e-06, "loss": 3.3027, "mean_token_accuracy": 0.44083671557914456, "step": 14750 }, { "epoch": 2.734705228031146, "grad_norm": 8.0078125, "learning_rate": 7.265294771968855e-06, "loss": 3.1385, "mean_token_accuracy": 0.4635405227322644, "step": 14751 }, { "epoch": 2.7348906192065257, "grad_norm": 9.46875, "learning_rate": 7.265109380793475e-06, "loss": 2.5733, "mean_token_accuracy": 0.4963627135848418, "step": 14752 }, { "epoch": 2.735076010381906, "grad_norm": 9.390625, "learning_rate": 7.2649239896180945e-06, "loss": 2.8095, "mean_token_accuracy": 0.48098192608578366, "step": 14753 }, { "epoch": 2.735261401557286, "grad_norm": 8.125, "learning_rate": 7.264738598442714e-06, "loss": 2.5466, "mean_token_accuracy": 0.5452485672397708, "step": 14754 }, { "epoch": 2.7354467927326658, "grad_norm": 9.84375, "learning_rate": 7.264553207267334e-06, "loss": 2.694, "mean_token_accuracy": 0.4850581073602656, "step": 14755 }, { "epoch": 2.735632183908046, "grad_norm": 11.84375, "learning_rate": 7.264367816091955e-06, "loss": 2.6604, "mean_token_accuracy": 0.515748031496063, "step": 14756 }, { "epoch": 2.735817575083426, "grad_norm": 7.72265625, "learning_rate": 7.264182424916575e-06, "loss": 2.4028, "mean_token_accuracy": 0.5078856263623541, "step": 14757 }, { "epoch": 2.7360029662588063, "grad_norm": 8.5234375, "learning_rate": 7.2639970337411945e-06, "loss": 2.5033, "mean_token_accuracy": 0.5126753313601853, "step": 14758 }, { "epoch": 2.736188357434186, "grad_norm": 9.4609375, "learning_rate": 7.263811642565814e-06, "loss": 3.3074, "mean_token_accuracy": 0.47218892351953967, "step": 14759 }, { "epoch": 2.7363737486095663, "grad_norm": 10.109375, "learning_rate": 7.263626251390435e-06, "loss": 3.1522, "mean_token_accuracy": 0.4745358755644757, "step": 14760 }, { "epoch": 2.736559139784946, "grad_norm": 10.2421875, "learning_rate": 7.263440860215054e-06, "loss": 3.6617, "mean_token_accuracy": 0.4317027532745255, "step": 14761 }, { "epoch": 2.736744530960326, "grad_norm": 7.96484375, "learning_rate": 7.263255469039674e-06, "loss": 2.3974, "mean_token_accuracy": 0.5107644305772231, "step": 14762 }, { "epoch": 2.7369299221357064, "grad_norm": 11.5, "learning_rate": 7.2630700778642935e-06, "loss": 2.4602, "mean_token_accuracy": 0.5172079495879787, "step": 14763 }, { "epoch": 2.7371153133110866, "grad_norm": 11.1484375, "learning_rate": 7.262884686688915e-06, "loss": 3.3814, "mean_token_accuracy": 0.45423040152963673, "step": 14764 }, { "epoch": 2.7373007044864663, "grad_norm": 8.421875, "learning_rate": 7.2626992955135345e-06, "loss": 2.6465, "mean_token_accuracy": 0.4916848807374323, "step": 14765 }, { "epoch": 2.7374860956618465, "grad_norm": 11.0546875, "learning_rate": 7.262513904338154e-06, "loss": 3.1681, "mean_token_accuracy": 0.48580697485806973, "step": 14766 }, { "epoch": 2.7376714868372263, "grad_norm": 12.53125, "learning_rate": 7.262328513162774e-06, "loss": 3.4805, "mean_token_accuracy": 0.4295480880648899, "step": 14767 }, { "epoch": 2.7378568780126065, "grad_norm": 7.82421875, "learning_rate": 7.262143121987394e-06, "loss": 3.5222, "mean_token_accuracy": 0.477874967841523, "step": 14768 }, { "epoch": 2.7380422691879867, "grad_norm": 6.77734375, "learning_rate": 7.261957730812014e-06, "loss": 2.7751, "mean_token_accuracy": 0.49986468200270634, "step": 14769 }, { "epoch": 2.738227660363367, "grad_norm": 8.796875, "learning_rate": 7.2617723396366336e-06, "loss": 3.0339, "mean_token_accuracy": 0.4964576226712149, "step": 14770 }, { "epoch": 2.7384130515387466, "grad_norm": 7.39453125, "learning_rate": 7.261586948461253e-06, "loss": 3.0945, "mean_token_accuracy": 0.4671249252839211, "step": 14771 }, { "epoch": 2.738598442714127, "grad_norm": 7.84375, "learning_rate": 7.2614015572858746e-06, "loss": 2.7335, "mean_token_accuracy": 0.4873688875757128, "step": 14772 }, { "epoch": 2.738783833889507, "grad_norm": 7.5625, "learning_rate": 7.261216166110494e-06, "loss": 3.3132, "mean_token_accuracy": 0.45591965100943493, "step": 14773 }, { "epoch": 2.7389692250648867, "grad_norm": 15.203125, "learning_rate": 7.261030774935114e-06, "loss": 3.0972, "mean_token_accuracy": 0.4681783045606802, "step": 14774 }, { "epoch": 2.739154616240267, "grad_norm": 8.1484375, "learning_rate": 7.2608453837597335e-06, "loss": 2.8424, "mean_token_accuracy": 0.48164067297064966, "step": 14775 }, { "epoch": 2.739340007415647, "grad_norm": 8.7734375, "learning_rate": 7.260659992584353e-06, "loss": 2.5422, "mean_token_accuracy": 0.48308065494238933, "step": 14776 }, { "epoch": 2.7395253985910273, "grad_norm": 9.875, "learning_rate": 7.260474601408974e-06, "loss": 2.6006, "mean_token_accuracy": 0.5117640033440821, "step": 14777 }, { "epoch": 2.739710789766407, "grad_norm": 11.671875, "learning_rate": 7.260289210233593e-06, "loss": 3.1703, "mean_token_accuracy": 0.47640414668110787, "step": 14778 }, { "epoch": 2.739896180941787, "grad_norm": 8.9140625, "learning_rate": 7.260103819058213e-06, "loss": 3.6669, "mean_token_accuracy": 0.4358468219502566, "step": 14779 }, { "epoch": 2.740081572117167, "grad_norm": 9.8828125, "learning_rate": 7.259918427882834e-06, "loss": 2.4907, "mean_token_accuracy": 0.5083477259643063, "step": 14780 }, { "epoch": 2.740266963292547, "grad_norm": 8.34375, "learning_rate": 7.259733036707454e-06, "loss": 3.2599, "mean_token_accuracy": 0.45718654434250766, "step": 14781 }, { "epoch": 2.7404523544679273, "grad_norm": 7.25390625, "learning_rate": 7.2595476455320735e-06, "loss": 2.6959, "mean_token_accuracy": 0.4856534090909091, "step": 14782 }, { "epoch": 2.7406377456433075, "grad_norm": 7.53515625, "learning_rate": 7.259362254356693e-06, "loss": 2.5506, "mean_token_accuracy": 0.5103173771865508, "step": 14783 }, { "epoch": 2.7408231368186873, "grad_norm": 12.9375, "learning_rate": 7.259176863181313e-06, "loss": 2.3265, "mean_token_accuracy": 0.5409335288367546, "step": 14784 }, { "epoch": 2.7410085279940675, "grad_norm": 8.2890625, "learning_rate": 7.258991472005933e-06, "loss": 2.8334, "mean_token_accuracy": 0.4832948608917507, "step": 14785 }, { "epoch": 2.7411939191694477, "grad_norm": 11.671875, "learning_rate": 7.258806080830553e-06, "loss": 2.9943, "mean_token_accuracy": 0.45862703034017777, "step": 14786 }, { "epoch": 2.7413793103448274, "grad_norm": 13.3046875, "learning_rate": 7.258620689655173e-06, "loss": 2.7407, "mean_token_accuracy": 0.5071673085151904, "step": 14787 }, { "epoch": 2.7415647015202076, "grad_norm": 7.2578125, "learning_rate": 7.258435298479793e-06, "loss": 2.8574, "mean_token_accuracy": 0.47388610209501325, "step": 14788 }, { "epoch": 2.741750092695588, "grad_norm": 9.21875, "learning_rate": 7.258249907304414e-06, "loss": 2.8127, "mean_token_accuracy": 0.4882510013351135, "step": 14789 }, { "epoch": 2.741935483870968, "grad_norm": 9.53125, "learning_rate": 7.258064516129033e-06, "loss": 3.0661, "mean_token_accuracy": 0.484226271416916, "step": 14790 }, { "epoch": 2.7421208750463477, "grad_norm": 7.30859375, "learning_rate": 7.257879124953653e-06, "loss": 2.7312, "mean_token_accuracy": 0.5017103762827823, "step": 14791 }, { "epoch": 2.742306266221728, "grad_norm": 8.1484375, "learning_rate": 7.2576937337782725e-06, "loss": 4.276, "mean_token_accuracy": 0.3920744138634047, "step": 14792 }, { "epoch": 2.7424916573971077, "grad_norm": 7.08984375, "learning_rate": 7.257508342602892e-06, "loss": 2.9164, "mean_token_accuracy": 0.49093075599099695, "step": 14793 }, { "epoch": 2.742677048572488, "grad_norm": 7.5859375, "learning_rate": 7.257322951427513e-06, "loss": 2.6669, "mean_token_accuracy": 0.5044818931516672, "step": 14794 }, { "epoch": 2.742862439747868, "grad_norm": 6.75390625, "learning_rate": 7.257137560252132e-06, "loss": 2.5004, "mean_token_accuracy": 0.5346334902488231, "step": 14795 }, { "epoch": 2.743047830923248, "grad_norm": 6.8671875, "learning_rate": 7.256952169076753e-06, "loss": 2.7715, "mean_token_accuracy": 0.4831610044313146, "step": 14796 }, { "epoch": 2.743233222098628, "grad_norm": 9.6484375, "learning_rate": 7.2567667779013724e-06, "loss": 3.2188, "mean_token_accuracy": 0.47224797986488276, "step": 14797 }, { "epoch": 2.743418613274008, "grad_norm": 8.0703125, "learning_rate": 7.256581386725993e-06, "loss": 2.8451, "mean_token_accuracy": 0.48976248976248976, "step": 14798 }, { "epoch": 2.7436040044493883, "grad_norm": 7.5859375, "learning_rate": 7.2563959955506126e-06, "loss": 2.5137, "mean_token_accuracy": 0.5175574204946997, "step": 14799 }, { "epoch": 2.743789395624768, "grad_norm": 10.46875, "learning_rate": 7.256210604375232e-06, "loss": 2.8148, "mean_token_accuracy": 0.46623959000640613, "step": 14800 }, { "epoch": 2.7439747868001483, "grad_norm": 7.8671875, "learning_rate": 7.256025213199852e-06, "loss": 2.5828, "mean_token_accuracy": 0.5147101660355374, "step": 14801 }, { "epoch": 2.7441601779755285, "grad_norm": 7.65625, "learning_rate": 7.2558398220244715e-06, "loss": 3.693, "mean_token_accuracy": 0.42519863791146423, "step": 14802 }, { "epoch": 2.7443455691509087, "grad_norm": 7.61328125, "learning_rate": 7.255654430849092e-06, "loss": 2.6108, "mean_token_accuracy": 0.50011460004584, "step": 14803 }, { "epoch": 2.7445309603262884, "grad_norm": 6.46875, "learning_rate": 7.2554690396737125e-06, "loss": 2.4222, "mean_token_accuracy": 0.5353299017313992, "step": 14804 }, { "epoch": 2.7447163515016686, "grad_norm": 8.0390625, "learning_rate": 7.255283648498332e-06, "loss": 3.2055, "mean_token_accuracy": 0.45304029304029303, "step": 14805 }, { "epoch": 2.7449017426770483, "grad_norm": 10.65625, "learning_rate": 7.255098257322953e-06, "loss": 3.7001, "mean_token_accuracy": 0.44077576534701074, "step": 14806 }, { "epoch": 2.7450871338524285, "grad_norm": 7.890625, "learning_rate": 7.254912866147572e-06, "loss": 3.3108, "mean_token_accuracy": 0.45675807236735355, "step": 14807 }, { "epoch": 2.7452725250278087, "grad_norm": 8.1484375, "learning_rate": 7.254727474972192e-06, "loss": 2.2498, "mean_token_accuracy": 0.5223503395987995, "step": 14808 }, { "epoch": 2.745457916203189, "grad_norm": 7.2578125, "learning_rate": 7.2545420837968116e-06, "loss": 2.5858, "mean_token_accuracy": 0.4963005780346821, "step": 14809 }, { "epoch": 2.7456433073785687, "grad_norm": 8.1015625, "learning_rate": 7.254356692621431e-06, "loss": 2.8728, "mean_token_accuracy": 0.4782804919327541, "step": 14810 }, { "epoch": 2.745828698553949, "grad_norm": 10.1875, "learning_rate": 7.254171301446052e-06, "loss": 3.0346, "mean_token_accuracy": 0.4612899503865236, "step": 14811 }, { "epoch": 2.746014089729329, "grad_norm": 7.015625, "learning_rate": 7.253985910270672e-06, "loss": 2.7668, "mean_token_accuracy": 0.47231681170387185, "step": 14812 }, { "epoch": 2.746199480904709, "grad_norm": 8.296875, "learning_rate": 7.253800519095292e-06, "loss": 3.2158, "mean_token_accuracy": 0.4533333333333333, "step": 14813 }, { "epoch": 2.746384872080089, "grad_norm": 9.25, "learning_rate": 7.2536151279199115e-06, "loss": 3.1318, "mean_token_accuracy": 0.4723242022027676, "step": 14814 }, { "epoch": 2.746570263255469, "grad_norm": 6.52734375, "learning_rate": 7.253429736744532e-06, "loss": 2.7032, "mean_token_accuracy": 0.4800173761946134, "step": 14815 }, { "epoch": 2.7467556544308493, "grad_norm": 7.22265625, "learning_rate": 7.253244345569152e-06, "loss": 3.2994, "mean_token_accuracy": 0.4453227931488801, "step": 14816 }, { "epoch": 2.746941045606229, "grad_norm": 7.75, "learning_rate": 7.253058954393771e-06, "loss": 3.2332, "mean_token_accuracy": 0.4534954407294833, "step": 14817 }, { "epoch": 2.7471264367816093, "grad_norm": 7.5, "learning_rate": 7.252873563218391e-06, "loss": 3.1735, "mean_token_accuracy": 0.44948849104859334, "step": 14818 }, { "epoch": 2.747311827956989, "grad_norm": 8.7421875, "learning_rate": 7.2526881720430105e-06, "loss": 3.9921, "mean_token_accuracy": 0.4187058183795541, "step": 14819 }, { "epoch": 2.747497219132369, "grad_norm": 9.2265625, "learning_rate": 7.252502780867632e-06, "loss": 2.1735, "mean_token_accuracy": 0.5685851926977687, "step": 14820 }, { "epoch": 2.7476826103077494, "grad_norm": 7.0546875, "learning_rate": 7.2523173896922515e-06, "loss": 2.5724, "mean_token_accuracy": 0.5310828025477707, "step": 14821 }, { "epoch": 2.7478680014831296, "grad_norm": 7.2421875, "learning_rate": 7.252131998516871e-06, "loss": 3.1205, "mean_token_accuracy": 0.4641886751440061, "step": 14822 }, { "epoch": 2.7480533926585093, "grad_norm": 8.7578125, "learning_rate": 7.251946607341492e-06, "loss": 2.945, "mean_token_accuracy": 0.46978321424246094, "step": 14823 }, { "epoch": 2.7482387838338895, "grad_norm": 9.515625, "learning_rate": 7.251761216166111e-06, "loss": 2.8226, "mean_token_accuracy": 0.48175074183976263, "step": 14824 }, { "epoch": 2.7484241750092697, "grad_norm": 6.51953125, "learning_rate": 7.251575824990731e-06, "loss": 3.1383, "mean_token_accuracy": 0.45470548408937034, "step": 14825 }, { "epoch": 2.7486095661846495, "grad_norm": 9.1640625, "learning_rate": 7.251390433815351e-06, "loss": 3.2372, "mean_token_accuracy": 0.457713557161071, "step": 14826 }, { "epoch": 2.7487949573600297, "grad_norm": 9.4765625, "learning_rate": 7.25120504263997e-06, "loss": 2.791, "mean_token_accuracy": 0.5096081856750686, "step": 14827 }, { "epoch": 2.74898034853541, "grad_norm": 13.59375, "learning_rate": 7.2510196514645916e-06, "loss": 2.4653, "mean_token_accuracy": 0.5443718228031954, "step": 14828 }, { "epoch": 2.74916573971079, "grad_norm": 7.99609375, "learning_rate": 7.250834260289211e-06, "loss": 2.8527, "mean_token_accuracy": 0.4794957715015159, "step": 14829 }, { "epoch": 2.74935113088617, "grad_norm": 8.8984375, "learning_rate": 7.250648869113831e-06, "loss": 3.1235, "mean_token_accuracy": 0.5080596291358623, "step": 14830 }, { "epoch": 2.74953652206155, "grad_norm": 7.21875, "learning_rate": 7.2504634779384505e-06, "loss": 2.4544, "mean_token_accuracy": 0.5145631067961165, "step": 14831 }, { "epoch": 2.7497219132369297, "grad_norm": 8.0859375, "learning_rate": 7.250278086763071e-06, "loss": 2.5942, "mean_token_accuracy": 0.48945147679324896, "step": 14832 }, { "epoch": 2.74990730441231, "grad_norm": 7.1015625, "learning_rate": 7.250092695587691e-06, "loss": 3.0181, "mean_token_accuracy": 0.45010551703346396, "step": 14833 }, { "epoch": 2.75009269558769, "grad_norm": 7.43359375, "learning_rate": 7.24990730441231e-06, "loss": 2.7281, "mean_token_accuracy": 0.49951899951899953, "step": 14834 }, { "epoch": 2.7502780867630703, "grad_norm": 8.125, "learning_rate": 7.24972191323693e-06, "loss": 2.994, "mean_token_accuracy": 0.4570645875039511, "step": 14835 }, { "epoch": 2.75046347793845, "grad_norm": 7.86328125, "learning_rate": 7.249536522061551e-06, "loss": 2.7826, "mean_token_accuracy": 0.493452380952381, "step": 14836 }, { "epoch": 2.75064886911383, "grad_norm": 7.359375, "learning_rate": 7.249351130886171e-06, "loss": 2.9018, "mean_token_accuracy": 0.48751512484875154, "step": 14837 }, { "epoch": 2.75083426028921, "grad_norm": 7.0234375, "learning_rate": 7.2491657397107906e-06, "loss": 2.6354, "mean_token_accuracy": 0.5045500505561172, "step": 14838 }, { "epoch": 2.75101965146459, "grad_norm": 8.03125, "learning_rate": 7.24898034853541e-06, "loss": 2.7796, "mean_token_accuracy": 0.4876443728176202, "step": 14839 }, { "epoch": 2.7512050426399703, "grad_norm": 7.1953125, "learning_rate": 7.24879495736003e-06, "loss": 2.4115, "mean_token_accuracy": 0.5340458748648973, "step": 14840 }, { "epoch": 2.7513904338153505, "grad_norm": 7.93359375, "learning_rate": 7.24860956618465e-06, "loss": 2.9755, "mean_token_accuracy": 0.47453299287730144, "step": 14841 }, { "epoch": 2.7515758249907303, "grad_norm": 9.0859375, "learning_rate": 7.24842417500927e-06, "loss": 2.9474, "mean_token_accuracy": 0.4656970362239298, "step": 14842 }, { "epoch": 2.7517612161661105, "grad_norm": 7.375, "learning_rate": 7.24823878383389e-06, "loss": 3.5333, "mean_token_accuracy": 0.43989248308462325, "step": 14843 }, { "epoch": 2.7519466073414907, "grad_norm": 6.91015625, "learning_rate": 7.248053392658511e-06, "loss": 2.6833, "mean_token_accuracy": 0.4842875099988573, "step": 14844 }, { "epoch": 2.7521319985168704, "grad_norm": 8.171875, "learning_rate": 7.247868001483131e-06, "loss": 3.3693, "mean_token_accuracy": 0.4667900948854432, "step": 14845 }, { "epoch": 2.7523173896922506, "grad_norm": 7.66796875, "learning_rate": 7.24768261030775e-06, "loss": 2.9951, "mean_token_accuracy": 0.47549958937859294, "step": 14846 }, { "epoch": 2.752502780867631, "grad_norm": 7.46875, "learning_rate": 7.24749721913237e-06, "loss": 2.4631, "mean_token_accuracy": 0.5106794032413867, "step": 14847 }, { "epoch": 2.752688172043011, "grad_norm": 8.3515625, "learning_rate": 7.2473118279569895e-06, "loss": 3.2675, "mean_token_accuracy": 0.4621815286624204, "step": 14848 }, { "epoch": 2.7528735632183907, "grad_norm": 6.2421875, "learning_rate": 7.24712643678161e-06, "loss": 2.6296, "mean_token_accuracy": 0.4986001866417811, "step": 14849 }, { "epoch": 2.753058954393771, "grad_norm": 7.3984375, "learning_rate": 7.24694104560623e-06, "loss": 3.5314, "mean_token_accuracy": 0.41505766489568485, "step": 14850 }, { "epoch": 2.7532443455691507, "grad_norm": 10.2265625, "learning_rate": 7.246755654430849e-06, "loss": 2.9439, "mean_token_accuracy": 0.4622986327331799, "step": 14851 }, { "epoch": 2.753429736744531, "grad_norm": 9.5625, "learning_rate": 7.246570263255469e-06, "loss": 3.1751, "mean_token_accuracy": 0.44319263725801333, "step": 14852 }, { "epoch": 2.753615127919911, "grad_norm": 6.68359375, "learning_rate": 7.24638487208009e-06, "loss": 2.4508, "mean_token_accuracy": 0.5114231014677728, "step": 14853 }, { "epoch": 2.753800519095291, "grad_norm": 10.859375, "learning_rate": 7.24619948090471e-06, "loss": 3.0964, "mean_token_accuracy": 0.4699074074074074, "step": 14854 }, { "epoch": 2.753985910270671, "grad_norm": 7.234375, "learning_rate": 7.24601408972933e-06, "loss": 2.9494, "mean_token_accuracy": 0.48335419274092617, "step": 14855 }, { "epoch": 2.754171301446051, "grad_norm": 8.0546875, "learning_rate": 7.245828698553949e-06, "loss": 2.2639, "mean_token_accuracy": 0.5391383495145631, "step": 14856 }, { "epoch": 2.7543566926214313, "grad_norm": 7.44140625, "learning_rate": 7.245643307378569e-06, "loss": 3.084, "mean_token_accuracy": 0.48740916004732127, "step": 14857 }, { "epoch": 2.754542083796811, "grad_norm": 9.2265625, "learning_rate": 7.245457916203189e-06, "loss": 3.0937, "mean_token_accuracy": 0.4755747126436782, "step": 14858 }, { "epoch": 2.7547274749721913, "grad_norm": 7.27734375, "learning_rate": 7.245272525027809e-06, "loss": 2.705, "mean_token_accuracy": 0.49926650366748165, "step": 14859 }, { "epoch": 2.7549128661475715, "grad_norm": 7.17578125, "learning_rate": 7.245087133852429e-06, "loss": 3.0543, "mean_token_accuracy": 0.4727251624883937, "step": 14860 }, { "epoch": 2.7550982573229517, "grad_norm": 10.203125, "learning_rate": 7.24490174267705e-06, "loss": 2.6734, "mean_token_accuracy": 0.5080932784636488, "step": 14861 }, { "epoch": 2.7552836484983314, "grad_norm": 9.359375, "learning_rate": 7.24471635150167e-06, "loss": 3.613, "mean_token_accuracy": 0.426006426006426, "step": 14862 }, { "epoch": 2.7554690396737116, "grad_norm": 9.1171875, "learning_rate": 7.244530960326289e-06, "loss": 3.2086, "mean_token_accuracy": 0.46356502242152464, "step": 14863 }, { "epoch": 2.7556544308490913, "grad_norm": 7.42578125, "learning_rate": 7.244345569150909e-06, "loss": 2.6528, "mean_token_accuracy": 0.4897891963109354, "step": 14864 }, { "epoch": 2.7558398220244715, "grad_norm": 10.2421875, "learning_rate": 7.244160177975529e-06, "loss": 2.8749, "mean_token_accuracy": 0.47155704843428325, "step": 14865 }, { "epoch": 2.7560252131998517, "grad_norm": 11.2265625, "learning_rate": 7.243974786800149e-06, "loss": 2.3936, "mean_token_accuracy": 0.5176266137040715, "step": 14866 }, { "epoch": 2.756210604375232, "grad_norm": 7.03125, "learning_rate": 7.243789395624769e-06, "loss": 2.823, "mean_token_accuracy": 0.5138888888888888, "step": 14867 }, { "epoch": 2.7563959955506117, "grad_norm": 9.9921875, "learning_rate": 7.243604004449388e-06, "loss": 3.1354, "mean_token_accuracy": 0.466977985323549, "step": 14868 }, { "epoch": 2.756581386725992, "grad_norm": 9.703125, "learning_rate": 7.243418613274009e-06, "loss": 3.0735, "mean_token_accuracy": 0.45927173226689116, "step": 14869 }, { "epoch": 2.756766777901372, "grad_norm": 8.359375, "learning_rate": 7.243233222098629e-06, "loss": 2.9045, "mean_token_accuracy": 0.46311213399820994, "step": 14870 }, { "epoch": 2.756952169076752, "grad_norm": 7.6953125, "learning_rate": 7.243047830923249e-06, "loss": 2.5329, "mean_token_accuracy": 0.5031289111389237, "step": 14871 }, { "epoch": 2.757137560252132, "grad_norm": 13.0234375, "learning_rate": 7.242862439747869e-06, "loss": 2.5275, "mean_token_accuracy": 0.4943509338252248, "step": 14872 }, { "epoch": 2.757322951427512, "grad_norm": 8.796875, "learning_rate": 7.242677048572488e-06, "loss": 2.9005, "mean_token_accuracy": 0.47593167701863354, "step": 14873 }, { "epoch": 2.7575083426028923, "grad_norm": 8.8046875, "learning_rate": 7.242491657397108e-06, "loss": 3.3039, "mean_token_accuracy": 0.46223820605034904, "step": 14874 }, { "epoch": 2.757693733778272, "grad_norm": 11.2890625, "learning_rate": 7.242306266221728e-06, "loss": 3.0149, "mean_token_accuracy": 0.49165275459098495, "step": 14875 }, { "epoch": 2.7578791249536523, "grad_norm": 15.984375, "learning_rate": 7.242120875046348e-06, "loss": 2.3571, "mean_token_accuracy": 0.5181973272675576, "step": 14876 }, { "epoch": 2.758064516129032, "grad_norm": 19.0, "learning_rate": 7.2419354838709685e-06, "loss": 2.9314, "mean_token_accuracy": 0.4584313183012865, "step": 14877 }, { "epoch": 2.758249907304412, "grad_norm": 9.765625, "learning_rate": 7.241750092695588e-06, "loss": 3.0267, "mean_token_accuracy": 0.4695033592412067, "step": 14878 }, { "epoch": 2.7584352984797924, "grad_norm": 8.546875, "learning_rate": 7.241564701520209e-06, "loss": 2.9736, "mean_token_accuracy": 0.45637839208724323, "step": 14879 }, { "epoch": 2.7586206896551726, "grad_norm": 7.64453125, "learning_rate": 7.241379310344828e-06, "loss": 2.9132, "mean_token_accuracy": 0.49442896935933145, "step": 14880 }, { "epoch": 2.7588060808305523, "grad_norm": 8.5234375, "learning_rate": 7.241193919169448e-06, "loss": 2.7925, "mean_token_accuracy": 0.49201499791695597, "step": 14881 }, { "epoch": 2.7589914720059325, "grad_norm": 8.2109375, "learning_rate": 7.241008527994068e-06, "loss": 2.9784, "mean_token_accuracy": 0.48815228966986157, "step": 14882 }, { "epoch": 2.7591768631813127, "grad_norm": 9.6015625, "learning_rate": 7.240823136818687e-06, "loss": 3.2139, "mean_token_accuracy": 0.4637998436278342, "step": 14883 }, { "epoch": 2.7593622543566925, "grad_norm": 11.015625, "learning_rate": 7.240637745643308e-06, "loss": 2.8003, "mean_token_accuracy": 0.48838090426875475, "step": 14884 }, { "epoch": 2.7595476455320727, "grad_norm": 7.90234375, "learning_rate": 7.240452354467928e-06, "loss": 3.3024, "mean_token_accuracy": 0.44165565830030823, "step": 14885 }, { "epoch": 2.759733036707453, "grad_norm": 10.546875, "learning_rate": 7.240266963292548e-06, "loss": 3.3158, "mean_token_accuracy": 0.4415365132967497, "step": 14886 }, { "epoch": 2.759918427882833, "grad_norm": 8.1796875, "learning_rate": 7.240081572117168e-06, "loss": 2.2504, "mean_token_accuracy": 0.5347212894560107, "step": 14887 }, { "epoch": 2.760103819058213, "grad_norm": 8.7578125, "learning_rate": 7.239896180941788e-06, "loss": 2.7059, "mean_token_accuracy": 0.4975833736104398, "step": 14888 }, { "epoch": 2.760289210233593, "grad_norm": 8.328125, "learning_rate": 7.239710789766408e-06, "loss": 2.737, "mean_token_accuracy": 0.4924965893587995, "step": 14889 }, { "epoch": 2.7604746014089727, "grad_norm": 7.09375, "learning_rate": 7.239525398591027e-06, "loss": 2.3621, "mean_token_accuracy": 0.5105291576673866, "step": 14890 }, { "epoch": 2.760659992584353, "grad_norm": 9.7734375, "learning_rate": 7.239340007415647e-06, "loss": 3.343, "mean_token_accuracy": 0.47875419705916405, "step": 14891 }, { "epoch": 2.760845383759733, "grad_norm": 7.859375, "learning_rate": 7.2391546162402674e-06, "loss": 2.8276, "mean_token_accuracy": 0.49747590637907296, "step": 14892 }, { "epoch": 2.7610307749351133, "grad_norm": 7.7421875, "learning_rate": 7.238969225064888e-06, "loss": 2.9518, "mean_token_accuracy": 0.46955974842767295, "step": 14893 }, { "epoch": 2.761216166110493, "grad_norm": 9.625, "learning_rate": 7.238783833889508e-06, "loss": 3.1409, "mean_token_accuracy": 0.4560214778828944, "step": 14894 }, { "epoch": 2.761401557285873, "grad_norm": 6.86328125, "learning_rate": 7.238598442714127e-06, "loss": 2.4125, "mean_token_accuracy": 0.5088172043010752, "step": 14895 }, { "epoch": 2.7615869484612534, "grad_norm": 10.1484375, "learning_rate": 7.238413051538748e-06, "loss": 3.8387, "mean_token_accuracy": 0.4669040084388186, "step": 14896 }, { "epoch": 2.761772339636633, "grad_norm": 10.1328125, "learning_rate": 7.238227660363367e-06, "loss": 2.5197, "mean_token_accuracy": 0.5096200485044462, "step": 14897 }, { "epoch": 2.7619577308120133, "grad_norm": 7.1953125, "learning_rate": 7.238042269187987e-06, "loss": 3.0126, "mean_token_accuracy": 0.4675385333956095, "step": 14898 }, { "epoch": 2.7621431219873935, "grad_norm": 8.3984375, "learning_rate": 7.237856878012607e-06, "loss": 2.7853, "mean_token_accuracy": 0.47955647955647956, "step": 14899 }, { "epoch": 2.7623285131627737, "grad_norm": 6.53125, "learning_rate": 7.237671486837226e-06, "loss": 2.7794, "mean_token_accuracy": 0.4776364382437423, "step": 14900 }, { "epoch": 2.7625139043381535, "grad_norm": 7.3828125, "learning_rate": 7.237486095661848e-06, "loss": 3.0698, "mean_token_accuracy": 0.4510502864417568, "step": 14901 }, { "epoch": 2.7626992955135337, "grad_norm": 9.6015625, "learning_rate": 7.237300704486467e-06, "loss": 2.3116, "mean_token_accuracy": 0.5308845958312151, "step": 14902 }, { "epoch": 2.7628846866889134, "grad_norm": 9.515625, "learning_rate": 7.237115313311087e-06, "loss": 2.9323, "mean_token_accuracy": 0.46164349553128103, "step": 14903 }, { "epoch": 2.7630700778642936, "grad_norm": 7.2421875, "learning_rate": 7.236929922135707e-06, "loss": 2.7643, "mean_token_accuracy": 0.4996724747805581, "step": 14904 }, { "epoch": 2.763255469039674, "grad_norm": 8.1015625, "learning_rate": 7.236744530960327e-06, "loss": 2.7777, "mean_token_accuracy": 0.4927242819182589, "step": 14905 }, { "epoch": 2.763440860215054, "grad_norm": 8.7734375, "learning_rate": 7.236559139784947e-06, "loss": 2.4191, "mean_token_accuracy": 0.5222454135999122, "step": 14906 }, { "epoch": 2.7636262513904337, "grad_norm": 7.5703125, "learning_rate": 7.236373748609566e-06, "loss": 3.0185, "mean_token_accuracy": 0.4634915366744109, "step": 14907 }, { "epoch": 2.763811642565814, "grad_norm": 7.3046875, "learning_rate": 7.236188357434186e-06, "loss": 3.4494, "mean_token_accuracy": 0.4301492537313433, "step": 14908 }, { "epoch": 2.7639970337411937, "grad_norm": 8.171875, "learning_rate": 7.236002966258807e-06, "loss": 3.8328, "mean_token_accuracy": 0.4139804418688881, "step": 14909 }, { "epoch": 2.764182424916574, "grad_norm": 8.34375, "learning_rate": 7.235817575083427e-06, "loss": 3.0634, "mean_token_accuracy": 0.45400452219445436, "step": 14910 }, { "epoch": 2.764367816091954, "grad_norm": 7.1484375, "learning_rate": 7.235632183908047e-06, "loss": 3.3591, "mean_token_accuracy": 0.44966905681191394, "step": 14911 }, { "epoch": 2.7645532072673342, "grad_norm": 7.7578125, "learning_rate": 7.235446792732666e-06, "loss": 2.7236, "mean_token_accuracy": 0.49956445993031356, "step": 14912 }, { "epoch": 2.764738598442714, "grad_norm": 7.69921875, "learning_rate": 7.235261401557287e-06, "loss": 3.0516, "mean_token_accuracy": 0.4497530599098132, "step": 14913 }, { "epoch": 2.764923989618094, "grad_norm": 7.375, "learning_rate": 7.235076010381906e-06, "loss": 2.9983, "mean_token_accuracy": 0.4583655581335396, "step": 14914 }, { "epoch": 2.7651093807934743, "grad_norm": 8.9765625, "learning_rate": 7.234890619206526e-06, "loss": 3.0604, "mean_token_accuracy": 0.44219292158223455, "step": 14915 }, { "epoch": 2.765294771968854, "grad_norm": 7.39453125, "learning_rate": 7.234705228031146e-06, "loss": 2.8731, "mean_token_accuracy": 0.4873995368478409, "step": 14916 }, { "epoch": 2.7654801631442343, "grad_norm": 8.3984375, "learning_rate": 7.234519836855767e-06, "loss": 2.4908, "mean_token_accuracy": 0.5011469437322899, "step": 14917 }, { "epoch": 2.7656655543196145, "grad_norm": 9.9140625, "learning_rate": 7.234334445680387e-06, "loss": 3.6922, "mean_token_accuracy": 0.42028316405047705, "step": 14918 }, { "epoch": 2.7658509454949947, "grad_norm": 7.26171875, "learning_rate": 7.234149054505006e-06, "loss": 2.8883, "mean_token_accuracy": 0.47289101268149236, "step": 14919 }, { "epoch": 2.7660363366703744, "grad_norm": 6.9296875, "learning_rate": 7.233963663329626e-06, "loss": 3.3742, "mean_token_accuracy": 0.4421929528312507, "step": 14920 }, { "epoch": 2.7662217278457546, "grad_norm": 7.4296875, "learning_rate": 7.233778272154246e-06, "loss": 3.2222, "mean_token_accuracy": 0.457037695138977, "step": 14921 }, { "epoch": 2.7664071190211343, "grad_norm": 7.7890625, "learning_rate": 7.233592880978866e-06, "loss": 3.1198, "mean_token_accuracy": 0.4516829533116178, "step": 14922 }, { "epoch": 2.7665925101965145, "grad_norm": 8.453125, "learning_rate": 7.233407489803486e-06, "loss": 2.4589, "mean_token_accuracy": 0.5199101973295521, "step": 14923 }, { "epoch": 2.7667779013718947, "grad_norm": 7.375, "learning_rate": 7.233222098628105e-06, "loss": 2.9488, "mean_token_accuracy": 0.47092020692898573, "step": 14924 }, { "epoch": 2.766963292547275, "grad_norm": 9.7265625, "learning_rate": 7.233036707452727e-06, "loss": 3.2661, "mean_token_accuracy": 0.4494038520330174, "step": 14925 }, { "epoch": 2.7671486837226547, "grad_norm": 7.5625, "learning_rate": 7.232851316277346e-06, "loss": 2.6689, "mean_token_accuracy": 0.49907749077490776, "step": 14926 }, { "epoch": 2.767334074898035, "grad_norm": 7.2421875, "learning_rate": 7.232665925101966e-06, "loss": 3.589, "mean_token_accuracy": 0.4372999709048589, "step": 14927 }, { "epoch": 2.767519466073415, "grad_norm": 8.75, "learning_rate": 7.232480533926586e-06, "loss": 4.0577, "mean_token_accuracy": 0.4266649720386375, "step": 14928 }, { "epoch": 2.767704857248795, "grad_norm": 8.4921875, "learning_rate": 7.232295142751205e-06, "loss": 2.5097, "mean_token_accuracy": 0.508110992529349, "step": 14929 }, { "epoch": 2.767890248424175, "grad_norm": 15.328125, "learning_rate": 7.232109751575826e-06, "loss": 4.2298, "mean_token_accuracy": 0.42609871534820826, "step": 14930 }, { "epoch": 2.768075639599555, "grad_norm": 7.78125, "learning_rate": 7.2319243604004454e-06, "loss": 2.4759, "mean_token_accuracy": 0.5060144346431436, "step": 14931 }, { "epoch": 2.7682610307749353, "grad_norm": 7.1953125, "learning_rate": 7.231738969225065e-06, "loss": 3.1937, "mean_token_accuracy": 0.46236828901154037, "step": 14932 }, { "epoch": 2.768446421950315, "grad_norm": 8.9453125, "learning_rate": 7.2315535780496856e-06, "loss": 2.981, "mean_token_accuracy": 0.5010895744924877, "step": 14933 }, { "epoch": 2.7686318131256953, "grad_norm": 9.1875, "learning_rate": 7.231368186874306e-06, "loss": 2.5565, "mean_token_accuracy": 0.5140819964349376, "step": 14934 }, { "epoch": 2.768817204301075, "grad_norm": 7.1328125, "learning_rate": 7.231182795698926e-06, "loss": 2.5259, "mean_token_accuracy": 0.5347545413601561, "step": 14935 }, { "epoch": 2.769002595476455, "grad_norm": 7.40625, "learning_rate": 7.230997404523545e-06, "loss": 2.8142, "mean_token_accuracy": 0.48696648524778, "step": 14936 }, { "epoch": 2.7691879866518354, "grad_norm": 7.80859375, "learning_rate": 7.230812013348165e-06, "loss": 4.1087, "mean_token_accuracy": 0.41726440761810535, "step": 14937 }, { "epoch": 2.7693733778272156, "grad_norm": 8.4921875, "learning_rate": 7.230626622172785e-06, "loss": 3.0888, "mean_token_accuracy": 0.46557295550584143, "step": 14938 }, { "epoch": 2.7695587690025953, "grad_norm": 9.6015625, "learning_rate": 7.230441230997405e-06, "loss": 3.2259, "mean_token_accuracy": 0.4753146176185866, "step": 14939 }, { "epoch": 2.7697441601779755, "grad_norm": 7.85546875, "learning_rate": 7.230255839822025e-06, "loss": 2.6818, "mean_token_accuracy": 0.496486151302191, "step": 14940 }, { "epoch": 2.7699295513533557, "grad_norm": 10.984375, "learning_rate": 7.230070448646645e-06, "loss": 2.9597, "mean_token_accuracy": 0.4701913640824338, "step": 14941 }, { "epoch": 2.7701149425287355, "grad_norm": 7.75, "learning_rate": 7.229885057471266e-06, "loss": 2.2951, "mean_token_accuracy": 0.5373205173845971, "step": 14942 }, { "epoch": 2.7703003337041157, "grad_norm": 14.3359375, "learning_rate": 7.229699666295885e-06, "loss": 3.617, "mean_token_accuracy": 0.4546084546084546, "step": 14943 }, { "epoch": 2.770485724879496, "grad_norm": 8.2578125, "learning_rate": 7.229514275120505e-06, "loss": 3.4586, "mean_token_accuracy": 0.4414523783246882, "step": 14944 }, { "epoch": 2.770671116054876, "grad_norm": 6.8125, "learning_rate": 7.229328883945125e-06, "loss": 2.9606, "mean_token_accuracy": 0.47267719959414406, "step": 14945 }, { "epoch": 2.770856507230256, "grad_norm": 7.70703125, "learning_rate": 7.229143492769744e-06, "loss": 2.6706, "mean_token_accuracy": 0.4928949568124826, "step": 14946 }, { "epoch": 2.771041898405636, "grad_norm": 9.296875, "learning_rate": 7.228958101594365e-06, "loss": 3.4236, "mean_token_accuracy": 0.45313527014591826, "step": 14947 }, { "epoch": 2.7712272895810157, "grad_norm": 8.625, "learning_rate": 7.2287727104189845e-06, "loss": 2.8268, "mean_token_accuracy": 0.49534219351633335, "step": 14948 }, { "epoch": 2.771412680756396, "grad_norm": 7.09375, "learning_rate": 7.228587319243605e-06, "loss": 2.6657, "mean_token_accuracy": 0.5211981566820276, "step": 14949 }, { "epoch": 2.771598071931776, "grad_norm": 7.4140625, "learning_rate": 7.228401928068225e-06, "loss": 3.3849, "mean_token_accuracy": 0.43249872253449156, "step": 14950 }, { "epoch": 2.7717834631071563, "grad_norm": 8.0625, "learning_rate": 7.228216536892845e-06, "loss": 3.6337, "mean_token_accuracy": 0.4524598521539638, "step": 14951 }, { "epoch": 2.771968854282536, "grad_norm": 7.90234375, "learning_rate": 7.228031145717465e-06, "loss": 3.4224, "mean_token_accuracy": 0.4568937150378414, "step": 14952 }, { "epoch": 2.772154245457916, "grad_norm": 8.1171875, "learning_rate": 7.227845754542084e-06, "loss": 2.9356, "mean_token_accuracy": 0.4798491165376216, "step": 14953 }, { "epoch": 2.7723396366332964, "grad_norm": 8.7265625, "learning_rate": 7.227660363366704e-06, "loss": 2.6862, "mean_token_accuracy": 0.47983392645314354, "step": 14954 }, { "epoch": 2.772525027808676, "grad_norm": 7.49609375, "learning_rate": 7.227474972191324e-06, "loss": 2.8088, "mean_token_accuracy": 0.47935085382100034, "step": 14955 }, { "epoch": 2.7727104189840563, "grad_norm": 6.8984375, "learning_rate": 7.227289581015944e-06, "loss": 2.2604, "mean_token_accuracy": 0.5435386168601716, "step": 14956 }, { "epoch": 2.7728958101594365, "grad_norm": 7.26953125, "learning_rate": 7.227104189840565e-06, "loss": 2.6062, "mean_token_accuracy": 0.49673008323424495, "step": 14957 }, { "epoch": 2.7730812013348167, "grad_norm": 9.34375, "learning_rate": 7.226918798665184e-06, "loss": 3.0451, "mean_token_accuracy": 0.48356360171510243, "step": 14958 }, { "epoch": 2.7732665925101965, "grad_norm": 7.86328125, "learning_rate": 7.226733407489804e-06, "loss": 3.22, "mean_token_accuracy": 0.4558114035087719, "step": 14959 }, { "epoch": 2.7734519836855767, "grad_norm": 6.8515625, "learning_rate": 7.2265480163144244e-06, "loss": 2.8417, "mean_token_accuracy": 0.4682766504715633, "step": 14960 }, { "epoch": 2.7736373748609564, "grad_norm": 8.4765625, "learning_rate": 7.226362625139044e-06, "loss": 2.6572, "mean_token_accuracy": 0.48473325131417067, "step": 14961 }, { "epoch": 2.7738227660363366, "grad_norm": 6.10546875, "learning_rate": 7.226177233963664e-06, "loss": 3.2596, "mean_token_accuracy": 0.42630846623140317, "step": 14962 }, { "epoch": 2.774008157211717, "grad_norm": 8.171875, "learning_rate": 7.225991842788283e-06, "loss": 3.3752, "mean_token_accuracy": 0.4343101343101343, "step": 14963 }, { "epoch": 2.774193548387097, "grad_norm": 8.5, "learning_rate": 7.225806451612903e-06, "loss": 2.4757, "mean_token_accuracy": 0.513355592654424, "step": 14964 }, { "epoch": 2.7743789395624767, "grad_norm": 8.53125, "learning_rate": 7.225621060437524e-06, "loss": 3.7204, "mean_token_accuracy": 0.4127665876777251, "step": 14965 }, { "epoch": 2.774564330737857, "grad_norm": 7.04296875, "learning_rate": 7.225435669262144e-06, "loss": 3.33, "mean_token_accuracy": 0.4666427117020002, "step": 14966 }, { "epoch": 2.7747497219132367, "grad_norm": 8.4296875, "learning_rate": 7.225250278086764e-06, "loss": 3.875, "mean_token_accuracy": 0.4513528041569611, "step": 14967 }, { "epoch": 2.774935113088617, "grad_norm": 6.87109375, "learning_rate": 7.225064886911384e-06, "loss": 2.7644, "mean_token_accuracy": 0.4735774647887324, "step": 14968 }, { "epoch": 2.775120504263997, "grad_norm": 7.125, "learning_rate": 7.224879495736004e-06, "loss": 2.4641, "mean_token_accuracy": 0.5253505933117584, "step": 14969 }, { "epoch": 2.7753058954393772, "grad_norm": 8.5703125, "learning_rate": 7.224694104560623e-06, "loss": 3.3967, "mean_token_accuracy": 0.45251084067701774, "step": 14970 }, { "epoch": 2.7754912866147574, "grad_norm": 9.765625, "learning_rate": 7.224508713385243e-06, "loss": 2.9796, "mean_token_accuracy": 0.4374850870913863, "step": 14971 }, { "epoch": 2.775676677790137, "grad_norm": 8.6015625, "learning_rate": 7.224323322209863e-06, "loss": 3.2966, "mean_token_accuracy": 0.5060849598163031, "step": 14972 }, { "epoch": 2.7758620689655173, "grad_norm": 8.328125, "learning_rate": 7.224137931034483e-06, "loss": 3.4799, "mean_token_accuracy": 0.42666493910339465, "step": 14973 }, { "epoch": 2.776047460140897, "grad_norm": 7.8046875, "learning_rate": 7.223952539859104e-06, "loss": 2.4665, "mean_token_accuracy": 0.5753994025198078, "step": 14974 }, { "epoch": 2.7762328513162773, "grad_norm": 11.046875, "learning_rate": 7.223767148683723e-06, "loss": 2.1144, "mean_token_accuracy": 0.5366124774211477, "step": 14975 }, { "epoch": 2.7764182424916575, "grad_norm": 7.14453125, "learning_rate": 7.223581757508343e-06, "loss": 2.5525, "mean_token_accuracy": 0.4911203683402763, "step": 14976 }, { "epoch": 2.7766036336670377, "grad_norm": 9.1171875, "learning_rate": 7.2233963663329635e-06, "loss": 2.7829, "mean_token_accuracy": 0.5451481696687972, "step": 14977 }, { "epoch": 2.7767890248424174, "grad_norm": 9.3671875, "learning_rate": 7.223210975157583e-06, "loss": 3.4986, "mean_token_accuracy": 0.4373851404568128, "step": 14978 }, { "epoch": 2.7769744160177976, "grad_norm": 8.265625, "learning_rate": 7.223025583982203e-06, "loss": 2.8453, "mean_token_accuracy": 0.46710905943450665, "step": 14979 }, { "epoch": 2.7771598071931773, "grad_norm": 13.125, "learning_rate": 7.222840192806822e-06, "loss": 3.1908, "mean_token_accuracy": 0.45592124595944755, "step": 14980 }, { "epoch": 2.7773451983685575, "grad_norm": 10.25, "learning_rate": 7.222654801631442e-06, "loss": 2.9848, "mean_token_accuracy": 0.4584593232541397, "step": 14981 }, { "epoch": 2.7775305895439377, "grad_norm": 8.75, "learning_rate": 7.222469410456063e-06, "loss": 2.7519, "mean_token_accuracy": 0.473582224518796, "step": 14982 }, { "epoch": 2.777715980719318, "grad_norm": 11.4296875, "learning_rate": 7.222284019280683e-06, "loss": 2.0863, "mean_token_accuracy": 0.5655685977533762, "step": 14983 }, { "epoch": 2.7779013718946977, "grad_norm": 11.1484375, "learning_rate": 7.222098628105303e-06, "loss": 2.4308, "mean_token_accuracy": 0.5125912408759125, "step": 14984 }, { "epoch": 2.778086763070078, "grad_norm": 9.6015625, "learning_rate": 7.221913236929923e-06, "loss": 3.4935, "mean_token_accuracy": 0.4396197327852004, "step": 14985 }, { "epoch": 2.778272154245458, "grad_norm": 10.90625, "learning_rate": 7.221727845754543e-06, "loss": 3.1788, "mean_token_accuracy": 0.46296296296296297, "step": 14986 }, { "epoch": 2.778457545420838, "grad_norm": 7.67578125, "learning_rate": 7.2215424545791624e-06, "loss": 3.0737, "mean_token_accuracy": 0.47532496144525227, "step": 14987 }, { "epoch": 2.778642936596218, "grad_norm": 9.765625, "learning_rate": 7.221357063403782e-06, "loss": 2.587, "mean_token_accuracy": 0.5180847399242163, "step": 14988 }, { "epoch": 2.778828327771598, "grad_norm": 12.984375, "learning_rate": 7.221171672228402e-06, "loss": 2.7294, "mean_token_accuracy": 0.4927049559981473, "step": 14989 }, { "epoch": 2.7790137189469784, "grad_norm": 8.6640625, "learning_rate": 7.220986281053023e-06, "loss": 3.0906, "mean_token_accuracy": 0.49673396674584325, "step": 14990 }, { "epoch": 2.779199110122358, "grad_norm": 14.5703125, "learning_rate": 7.220800889877643e-06, "loss": 2.7881, "mean_token_accuracy": 0.48788325076984873, "step": 14991 }, { "epoch": 2.7793845012977383, "grad_norm": 13.0546875, "learning_rate": 7.220615498702262e-06, "loss": 5.5876, "mean_token_accuracy": 0.4015338263489455, "step": 14992 }, { "epoch": 2.779569892473118, "grad_norm": 8.4296875, "learning_rate": 7.220430107526882e-06, "loss": 3.1843, "mean_token_accuracy": 0.48014077425842133, "step": 14993 }, { "epoch": 2.779755283648498, "grad_norm": 8.9765625, "learning_rate": 7.2202447163515025e-06, "loss": 3.1532, "mean_token_accuracy": 0.4792698457619924, "step": 14994 }, { "epoch": 2.7799406748238784, "grad_norm": 11.609375, "learning_rate": 7.220059325176122e-06, "loss": 2.5985, "mean_token_accuracy": 0.5000560726701806, "step": 14995 }, { "epoch": 2.7801260659992586, "grad_norm": 8.21875, "learning_rate": 7.219873934000742e-06, "loss": 2.6073, "mean_token_accuracy": 0.5141521043563869, "step": 14996 }, { "epoch": 2.7803114571746383, "grad_norm": 6.44921875, "learning_rate": 7.2196885428253614e-06, "loss": 2.7888, "mean_token_accuracy": 0.4883241758241758, "step": 14997 }, { "epoch": 2.7804968483500185, "grad_norm": 9.015625, "learning_rate": 7.219503151649983e-06, "loss": 2.935, "mean_token_accuracy": 0.4764511018291805, "step": 14998 }, { "epoch": 2.7806822395253987, "grad_norm": 10.015625, "learning_rate": 7.219317760474602e-06, "loss": 3.5296, "mean_token_accuracy": 0.4371382957493514, "step": 14999 }, { "epoch": 2.7808676307007785, "grad_norm": 8.8671875, "learning_rate": 7.219132369299222e-06, "loss": 3.1964, "mean_token_accuracy": 0.4735980352026197, "step": 15000 }, { "epoch": 2.7810530218761587, "grad_norm": 8.578125, "learning_rate": 7.218946978123842e-06, "loss": 2.538, "mean_token_accuracy": 0.49921996879875197, "step": 15001 }, { "epoch": 2.781238413051539, "grad_norm": 10.828125, "learning_rate": 7.218761586948461e-06, "loss": 2.5354, "mean_token_accuracy": 0.5108680210257138, "step": 15002 }, { "epoch": 2.781423804226919, "grad_norm": 7.828125, "learning_rate": 7.218576195773082e-06, "loss": 3.3247, "mean_token_accuracy": 0.4697816150330117, "step": 15003 }, { "epoch": 2.781609195402299, "grad_norm": 6.66015625, "learning_rate": 7.2183908045977015e-06, "loss": 2.1294, "mean_token_accuracy": 0.5935689797848446, "step": 15004 }, { "epoch": 2.781794586577679, "grad_norm": 7.54296875, "learning_rate": 7.218205413422321e-06, "loss": 2.7886, "mean_token_accuracy": 0.49103987884906614, "step": 15005 }, { "epoch": 2.7819799777530587, "grad_norm": 7.3828125, "learning_rate": 7.2180200222469425e-06, "loss": 2.3306, "mean_token_accuracy": 0.5389274634362535, "step": 15006 }, { "epoch": 2.782165368928439, "grad_norm": 8.3203125, "learning_rate": 7.217834631071562e-06, "loss": 3.4698, "mean_token_accuracy": 0.43117959342224527, "step": 15007 }, { "epoch": 2.782350760103819, "grad_norm": 7.4609375, "learning_rate": 7.217649239896182e-06, "loss": 2.113, "mean_token_accuracy": 0.6056830601092896, "step": 15008 }, { "epoch": 2.7825361512791993, "grad_norm": 8.34375, "learning_rate": 7.217463848720801e-06, "loss": 3.5166, "mean_token_accuracy": 0.45571004530542103, "step": 15009 }, { "epoch": 2.782721542454579, "grad_norm": 7.1328125, "learning_rate": 7.217278457545421e-06, "loss": 2.8189, "mean_token_accuracy": 0.46806822689408967, "step": 15010 }, { "epoch": 2.7829069336299592, "grad_norm": 7.7265625, "learning_rate": 7.2170930663700415e-06, "loss": 2.3085, "mean_token_accuracy": 0.5416575071444274, "step": 15011 }, { "epoch": 2.7830923248053394, "grad_norm": 7.5078125, "learning_rate": 7.216907675194661e-06, "loss": 3.4242, "mean_token_accuracy": 0.4415089546551505, "step": 15012 }, { "epoch": 2.783277715980719, "grad_norm": 7.62890625, "learning_rate": 7.216722284019281e-06, "loss": 3.0251, "mean_token_accuracy": 0.4640779751611382, "step": 15013 }, { "epoch": 2.7834631071560993, "grad_norm": 9.9921875, "learning_rate": 7.216536892843901e-06, "loss": 3.2228, "mean_token_accuracy": 0.46495726495726497, "step": 15014 }, { "epoch": 2.7836484983314795, "grad_norm": 7.71484375, "learning_rate": 7.216351501668522e-06, "loss": 3.4503, "mean_token_accuracy": 0.42964705882352944, "step": 15015 }, { "epoch": 2.7838338895068597, "grad_norm": 7.24609375, "learning_rate": 7.2161661104931415e-06, "loss": 3.4894, "mean_token_accuracy": 0.42843716433942, "step": 15016 }, { "epoch": 2.7840192806822395, "grad_norm": 9.5859375, "learning_rate": 7.215980719317761e-06, "loss": 3.2611, "mean_token_accuracy": 0.4607128113389294, "step": 15017 }, { "epoch": 2.7842046718576197, "grad_norm": 9.4140625, "learning_rate": 7.215795328142381e-06, "loss": 2.9679, "mean_token_accuracy": 0.5068640324698579, "step": 15018 }, { "epoch": 2.7843900630329994, "grad_norm": 6.953125, "learning_rate": 7.215609936967e-06, "loss": 3.0607, "mean_token_accuracy": 0.4737818451644818, "step": 15019 }, { "epoch": 2.7845754542083796, "grad_norm": 7.83984375, "learning_rate": 7.215424545791621e-06, "loss": 2.5687, "mean_token_accuracy": 0.5053626149131767, "step": 15020 }, { "epoch": 2.78476084538376, "grad_norm": 11.4296875, "learning_rate": 7.2152391546162405e-06, "loss": 3.3697, "mean_token_accuracy": 0.44071709900855627, "step": 15021 }, { "epoch": 2.78494623655914, "grad_norm": 17.609375, "learning_rate": 7.215053763440861e-06, "loss": 3.9178, "mean_token_accuracy": 0.4407461186194572, "step": 15022 }, { "epoch": 2.7851316277345197, "grad_norm": 8.7109375, "learning_rate": 7.2148683722654815e-06, "loss": 2.8927, "mean_token_accuracy": 0.46709792549631945, "step": 15023 }, { "epoch": 2.7853170189099, "grad_norm": 9.9765625, "learning_rate": 7.214682981090101e-06, "loss": 2.5958, "mean_token_accuracy": 0.5057632836660106, "step": 15024 }, { "epoch": 2.78550241008528, "grad_norm": 9.1640625, "learning_rate": 7.214497589914721e-06, "loss": 3.4265, "mean_token_accuracy": 0.4288164665523156, "step": 15025 }, { "epoch": 2.78568780126066, "grad_norm": 8.796875, "learning_rate": 7.2143121987393404e-06, "loss": 2.9072, "mean_token_accuracy": 0.5011644671176666, "step": 15026 }, { "epoch": 2.78587319243604, "grad_norm": 7.63671875, "learning_rate": 7.21412680756396e-06, "loss": 2.8445, "mean_token_accuracy": 0.5015437816475238, "step": 15027 }, { "epoch": 2.7860585836114202, "grad_norm": 7.10546875, "learning_rate": 7.21394141638858e-06, "loss": 3.2668, "mean_token_accuracy": 0.4474148459763876, "step": 15028 }, { "epoch": 2.7862439747868004, "grad_norm": 9.5234375, "learning_rate": 7.2137560252132e-06, "loss": 2.1866, "mean_token_accuracy": 0.5628367975365666, "step": 15029 }, { "epoch": 2.78642936596218, "grad_norm": 6.87109375, "learning_rate": 7.213570634037821e-06, "loss": 2.8371, "mean_token_accuracy": 0.47868999551368324, "step": 15030 }, { "epoch": 2.7866147571375603, "grad_norm": 8.578125, "learning_rate": 7.21338524286244e-06, "loss": 2.3633, "mean_token_accuracy": 0.5554833468724614, "step": 15031 }, { "epoch": 2.78680014831294, "grad_norm": 8.1640625, "learning_rate": 7.213199851687061e-06, "loss": 2.6255, "mean_token_accuracy": 0.5090479937057435, "step": 15032 }, { "epoch": 2.7869855394883203, "grad_norm": 6.68359375, "learning_rate": 7.2130144605116805e-06, "loss": 2.8819, "mean_token_accuracy": 0.47889930898321814, "step": 15033 }, { "epoch": 2.7871709306637005, "grad_norm": 7.31640625, "learning_rate": 7.2128290693363e-06, "loss": 2.6332, "mean_token_accuracy": 0.4926062846580407, "step": 15034 }, { "epoch": 2.7873563218390807, "grad_norm": 8.4296875, "learning_rate": 7.21264367816092e-06, "loss": 2.9633, "mean_token_accuracy": 0.47737989040397605, "step": 15035 }, { "epoch": 2.7875417130144604, "grad_norm": 9.8203125, "learning_rate": 7.212458286985539e-06, "loss": 2.7091, "mean_token_accuracy": 0.5327774214239898, "step": 15036 }, { "epoch": 2.7877271041898406, "grad_norm": 8.421875, "learning_rate": 7.21227289581016e-06, "loss": 2.9695, "mean_token_accuracy": 0.48814877841254406, "step": 15037 }, { "epoch": 2.7879124953652203, "grad_norm": 7.53125, "learning_rate": 7.21208750463478e-06, "loss": 2.9865, "mean_token_accuracy": 0.47276817428368456, "step": 15038 }, { "epoch": 2.7880978865406005, "grad_norm": 9.25, "learning_rate": 7.2119021134594e-06, "loss": 2.7567, "mean_token_accuracy": 0.48334174659262996, "step": 15039 }, { "epoch": 2.7882832777159807, "grad_norm": 8.796875, "learning_rate": 7.21171672228402e-06, "loss": 3.1252, "mean_token_accuracy": 0.46791775907271976, "step": 15040 }, { "epoch": 2.788468668891361, "grad_norm": 9.0546875, "learning_rate": 7.21153133110864e-06, "loss": 2.6008, "mean_token_accuracy": 0.4856509422342653, "step": 15041 }, { "epoch": 2.7886540600667407, "grad_norm": 7.0625, "learning_rate": 7.21134593993326e-06, "loss": 2.8621, "mean_token_accuracy": 0.4546548956661316, "step": 15042 }, { "epoch": 2.788839451242121, "grad_norm": 8.71875, "learning_rate": 7.2111605487578795e-06, "loss": 2.4514, "mean_token_accuracy": 0.5345345345345346, "step": 15043 }, { "epoch": 2.789024842417501, "grad_norm": 10.9921875, "learning_rate": 7.210975157582499e-06, "loss": 3.3255, "mean_token_accuracy": 0.4586641756188697, "step": 15044 }, { "epoch": 2.789210233592881, "grad_norm": 8.7734375, "learning_rate": 7.210789766407119e-06, "loss": 2.3458, "mean_token_accuracy": 0.5252192982456141, "step": 15045 }, { "epoch": 2.789395624768261, "grad_norm": 8.2421875, "learning_rate": 7.21060437523174e-06, "loss": 2.5305, "mean_token_accuracy": 0.5393713722483846, "step": 15046 }, { "epoch": 2.789581015943641, "grad_norm": 10.1484375, "learning_rate": 7.21041898405636e-06, "loss": 3.818, "mean_token_accuracy": 0.4524929444967074, "step": 15047 }, { "epoch": 2.7897664071190214, "grad_norm": 8.4609375, "learning_rate": 7.210233592880979e-06, "loss": 3.044, "mean_token_accuracy": 0.4785492671445384, "step": 15048 }, { "epoch": 2.789951798294401, "grad_norm": 7.265625, "learning_rate": 7.2100482017056e-06, "loss": 2.505, "mean_token_accuracy": 0.485740153915799, "step": 15049 }, { "epoch": 2.7901371894697813, "grad_norm": 10.0703125, "learning_rate": 7.2098628105302195e-06, "loss": 3.4323, "mean_token_accuracy": 0.45092804474955506, "step": 15050 }, { "epoch": 2.790322580645161, "grad_norm": 10.609375, "learning_rate": 7.209677419354839e-06, "loss": 2.5447, "mean_token_accuracy": 0.4824062095730918, "step": 15051 }, { "epoch": 2.790507971820541, "grad_norm": 10.390625, "learning_rate": 7.209492028179459e-06, "loss": 2.843, "mean_token_accuracy": 0.4775125832115603, "step": 15052 }, { "epoch": 2.7906933629959214, "grad_norm": 11.9609375, "learning_rate": 7.2093066370040785e-06, "loss": 3.2672, "mean_token_accuracy": 0.4845197142101085, "step": 15053 }, { "epoch": 2.7908787541713016, "grad_norm": 17.46875, "learning_rate": 7.2091212458287e-06, "loss": 2.9087, "mean_token_accuracy": 0.463458974669737, "step": 15054 }, { "epoch": 2.7910641453466813, "grad_norm": 12.7578125, "learning_rate": 7.2089358546533194e-06, "loss": 2.923, "mean_token_accuracy": 0.48713186521623775, "step": 15055 }, { "epoch": 2.7912495365220615, "grad_norm": 9.4453125, "learning_rate": 7.208750463477939e-06, "loss": 3.3041, "mean_token_accuracy": 0.45218340611353713, "step": 15056 }, { "epoch": 2.7914349276974417, "grad_norm": 13.03125, "learning_rate": 7.208565072302559e-06, "loss": 2.4398, "mean_token_accuracy": 0.5108866736621196, "step": 15057 }, { "epoch": 2.7916203188728215, "grad_norm": 8.8515625, "learning_rate": 7.208379681127179e-06, "loss": 2.3937, "mean_token_accuracy": 0.5191654536632703, "step": 15058 }, { "epoch": 2.7918057100482017, "grad_norm": 7.109375, "learning_rate": 7.208194289951799e-06, "loss": 3.0377, "mean_token_accuracy": 0.43545699269324445, "step": 15059 }, { "epoch": 2.791991101223582, "grad_norm": 6.77734375, "learning_rate": 7.2080088987764185e-06, "loss": 3.2756, "mean_token_accuracy": 0.461275925439748, "step": 15060 }, { "epoch": 2.792176492398962, "grad_norm": 10.203125, "learning_rate": 7.207823507601038e-06, "loss": 2.8231, "mean_token_accuracy": 0.4801780354000621, "step": 15061 }, { "epoch": 2.792361883574342, "grad_norm": 7.87890625, "learning_rate": 7.2076381164256595e-06, "loss": 2.8007, "mean_token_accuracy": 0.4960419341035516, "step": 15062 }, { "epoch": 2.792547274749722, "grad_norm": 7.36328125, "learning_rate": 7.207452725250279e-06, "loss": 2.7081, "mean_token_accuracy": 0.5051770207080828, "step": 15063 }, { "epoch": 2.7927326659251017, "grad_norm": 7.9453125, "learning_rate": 7.207267334074899e-06, "loss": 3.7494, "mean_token_accuracy": 0.4190836485918453, "step": 15064 }, { "epoch": 2.792918057100482, "grad_norm": 10.7578125, "learning_rate": 7.207081942899518e-06, "loss": 2.8866, "mean_token_accuracy": 0.5111214087117701, "step": 15065 }, { "epoch": 2.793103448275862, "grad_norm": 6.9375, "learning_rate": 7.206896551724139e-06, "loss": 3.004, "mean_token_accuracy": 0.4711782989545069, "step": 15066 }, { "epoch": 2.7932888394512423, "grad_norm": 8.3359375, "learning_rate": 7.2067111605487586e-06, "loss": 3.2717, "mean_token_accuracy": 0.46502000827928797, "step": 15067 }, { "epoch": 2.793474230626622, "grad_norm": 6.734375, "learning_rate": 7.206525769373378e-06, "loss": 3.067, "mean_token_accuracy": 0.47590435315757207, "step": 15068 }, { "epoch": 2.7936596218020022, "grad_norm": 7.33203125, "learning_rate": 7.206340378197998e-06, "loss": 2.8713, "mean_token_accuracy": 0.4911985648615316, "step": 15069 }, { "epoch": 2.7938450129773824, "grad_norm": 6.31640625, "learning_rate": 7.206154987022619e-06, "loss": 2.8797, "mean_token_accuracy": 0.47644764476447643, "step": 15070 }, { "epoch": 2.794030404152762, "grad_norm": 8.3125, "learning_rate": 7.205969595847239e-06, "loss": 3.1379, "mean_token_accuracy": 0.47233887164506116, "step": 15071 }, { "epoch": 2.7942157953281423, "grad_norm": 8.875, "learning_rate": 7.2057842046718585e-06, "loss": 2.9323, "mean_token_accuracy": 0.45787384208204673, "step": 15072 }, { "epoch": 2.7944011865035225, "grad_norm": 6.67578125, "learning_rate": 7.205598813496478e-06, "loss": 2.9699, "mean_token_accuracy": 0.4838482155452109, "step": 15073 }, { "epoch": 2.7945865776789027, "grad_norm": 7.71484375, "learning_rate": 7.205413422321098e-06, "loss": 2.8495, "mean_token_accuracy": 0.4518233717814284, "step": 15074 }, { "epoch": 2.7947719688542825, "grad_norm": 8.296875, "learning_rate": 7.205228031145718e-06, "loss": 2.7054, "mean_token_accuracy": 0.4866317725279389, "step": 15075 }, { "epoch": 2.7949573600296627, "grad_norm": 9.2890625, "learning_rate": 7.205042639970338e-06, "loss": 2.8962, "mean_token_accuracy": 0.45772241208246056, "step": 15076 }, { "epoch": 2.7951427512050424, "grad_norm": 7.94921875, "learning_rate": 7.2048572487949575e-06, "loss": 2.7764, "mean_token_accuracy": 0.48885761201032135, "step": 15077 }, { "epoch": 2.7953281423804226, "grad_norm": 6.84375, "learning_rate": 7.204671857619578e-06, "loss": 3.0983, "mean_token_accuracy": 0.4557510148849797, "step": 15078 }, { "epoch": 2.795513533555803, "grad_norm": 10.078125, "learning_rate": 7.2044864664441985e-06, "loss": 3.5627, "mean_token_accuracy": 0.424390243902439, "step": 15079 }, { "epoch": 2.795698924731183, "grad_norm": 9.296875, "learning_rate": 7.204301075268818e-06, "loss": 3.1225, "mean_token_accuracy": 0.45311916324243556, "step": 15080 }, { "epoch": 2.7958843159065627, "grad_norm": 9.078125, "learning_rate": 7.204115684093438e-06, "loss": 3.2977, "mean_token_accuracy": 0.44464896134630555, "step": 15081 }, { "epoch": 2.796069707081943, "grad_norm": 9.1484375, "learning_rate": 7.2039302929180575e-06, "loss": 3.0301, "mean_token_accuracy": 0.4696875, "step": 15082 }, { "epoch": 2.796255098257323, "grad_norm": 8.8515625, "learning_rate": 7.203744901742677e-06, "loss": 3.1958, "mean_token_accuracy": 0.4553934105611554, "step": 15083 }, { "epoch": 2.796440489432703, "grad_norm": 10.5859375, "learning_rate": 7.203559510567298e-06, "loss": 3.6017, "mean_token_accuracy": 0.42387399834208345, "step": 15084 }, { "epoch": 2.796625880608083, "grad_norm": 8.953125, "learning_rate": 7.203374119391917e-06, "loss": 2.7305, "mean_token_accuracy": 0.49435787211176785, "step": 15085 }, { "epoch": 2.7968112717834632, "grad_norm": 11.265625, "learning_rate": 7.203188728216538e-06, "loss": 2.6826, "mean_token_accuracy": 0.49415053318148877, "step": 15086 }, { "epoch": 2.7969966629588434, "grad_norm": 8.8828125, "learning_rate": 7.203003337041158e-06, "loss": 3.0542, "mean_token_accuracy": 0.4573622524585747, "step": 15087 }, { "epoch": 2.797182054134223, "grad_norm": 6.73828125, "learning_rate": 7.202817945865778e-06, "loss": 3.0894, "mean_token_accuracy": 0.4755719557195572, "step": 15088 }, { "epoch": 2.7973674453096034, "grad_norm": 9.296875, "learning_rate": 7.2026325546903975e-06, "loss": 2.3044, "mean_token_accuracy": 0.5953451043338683, "step": 15089 }, { "epoch": 2.797552836484983, "grad_norm": 9.546875, "learning_rate": 7.202447163515017e-06, "loss": 3.764, "mean_token_accuracy": 0.4209486166007905, "step": 15090 }, { "epoch": 2.7977382276603633, "grad_norm": 8.765625, "learning_rate": 7.202261772339637e-06, "loss": 2.9009, "mean_token_accuracy": 0.4816078877512325, "step": 15091 }, { "epoch": 2.7979236188357435, "grad_norm": 9.3671875, "learning_rate": 7.202076381164257e-06, "loss": 3.3386, "mean_token_accuracy": 0.43972761518400466, "step": 15092 }, { "epoch": 2.7981090100111237, "grad_norm": 8.0078125, "learning_rate": 7.201890989988877e-06, "loss": 3.0692, "mean_token_accuracy": 0.4766125732987863, "step": 15093 }, { "epoch": 2.7982944011865034, "grad_norm": 9.4609375, "learning_rate": 7.2017055988134966e-06, "loss": 2.8689, "mean_token_accuracy": 0.46838276440962506, "step": 15094 }, { "epoch": 2.7984797923618836, "grad_norm": 6.984375, "learning_rate": 7.201520207638117e-06, "loss": 2.8181, "mean_token_accuracy": 0.4955522609340252, "step": 15095 }, { "epoch": 2.798665183537264, "grad_norm": 8.609375, "learning_rate": 7.2013348164627376e-06, "loss": 2.7688, "mean_token_accuracy": 0.5122669283611384, "step": 15096 }, { "epoch": 2.7988505747126435, "grad_norm": 7.69140625, "learning_rate": 7.201149425287357e-06, "loss": 2.5302, "mean_token_accuracy": 0.5167961895211832, "step": 15097 }, { "epoch": 2.7990359658880237, "grad_norm": 7.640625, "learning_rate": 7.200964034111977e-06, "loss": 2.8601, "mean_token_accuracy": 0.4876747873720629, "step": 15098 }, { "epoch": 2.799221357063404, "grad_norm": 7.51171875, "learning_rate": 7.2007786429365965e-06, "loss": 3.0752, "mean_token_accuracy": 0.46550598476605004, "step": 15099 }, { "epoch": 2.799406748238784, "grad_norm": 7.12890625, "learning_rate": 7.200593251761216e-06, "loss": 2.7282, "mean_token_accuracy": 0.4918970448045758, "step": 15100 }, { "epoch": 2.799592139414164, "grad_norm": 6.84375, "learning_rate": 7.200407860585837e-06, "loss": 2.718, "mean_token_accuracy": 0.4981715229444379, "step": 15101 }, { "epoch": 2.799777530589544, "grad_norm": 7.33203125, "learning_rate": 7.200222469410456e-06, "loss": 2.9067, "mean_token_accuracy": 0.5020752826678117, "step": 15102 }, { "epoch": 2.799962921764924, "grad_norm": 8.609375, "learning_rate": 7.200037078235077e-06, "loss": 3.3783, "mean_token_accuracy": 0.4369772137294491, "step": 15103 }, { "epoch": 2.800148312940304, "grad_norm": 10.0078125, "learning_rate": 7.199851687059697e-06, "loss": 2.4701, "mean_token_accuracy": 0.5457623161335512, "step": 15104 }, { "epoch": 2.800333704115684, "grad_norm": 7.625, "learning_rate": 7.199666295884317e-06, "loss": 3.1045, "mean_token_accuracy": 0.47562008469449485, "step": 15105 }, { "epoch": 2.8005190952910644, "grad_norm": 7.46484375, "learning_rate": 7.1994809047089365e-06, "loss": 3.0907, "mean_token_accuracy": 0.4813863928112965, "step": 15106 }, { "epoch": 2.800704486466444, "grad_norm": 7.828125, "learning_rate": 7.199295513533556e-06, "loss": 3.2976, "mean_token_accuracy": 0.4475277497477296, "step": 15107 }, { "epoch": 2.8008898776418243, "grad_norm": 8.1875, "learning_rate": 7.199110122358176e-06, "loss": 2.9235, "mean_token_accuracy": 0.49774236387782206, "step": 15108 }, { "epoch": 2.801075268817204, "grad_norm": 8.4140625, "learning_rate": 7.1989247311827955e-06, "loss": 2.9451, "mean_token_accuracy": 0.4848860257680872, "step": 15109 }, { "epoch": 2.8012606599925842, "grad_norm": 7.921875, "learning_rate": 7.198739340007416e-06, "loss": 2.8936, "mean_token_accuracy": 0.4786989795918367, "step": 15110 }, { "epoch": 2.8014460511679644, "grad_norm": 8.1875, "learning_rate": 7.1985539488320365e-06, "loss": 3.1427, "mean_token_accuracy": 0.46887136636459925, "step": 15111 }, { "epoch": 2.8016314423433446, "grad_norm": 7.64453125, "learning_rate": 7.198368557656656e-06, "loss": 2.6006, "mean_token_accuracy": 0.5046799516908212, "step": 15112 }, { "epoch": 2.8018168335187243, "grad_norm": 7.05859375, "learning_rate": 7.198183166481277e-06, "loss": 3.0696, "mean_token_accuracy": 0.47130061814053237, "step": 15113 }, { "epoch": 2.8020022246941045, "grad_norm": 6.87109375, "learning_rate": 7.197997775305896e-06, "loss": 2.9914, "mean_token_accuracy": 0.4650147492625369, "step": 15114 }, { "epoch": 2.8021876158694847, "grad_norm": 7.3125, "learning_rate": 7.197812384130516e-06, "loss": 2.788, "mean_token_accuracy": 0.4853542234332425, "step": 15115 }, { "epoch": 2.8023730070448645, "grad_norm": 7.62109375, "learning_rate": 7.1976269929551355e-06, "loss": 2.5486, "mean_token_accuracy": 0.4947674418604651, "step": 15116 }, { "epoch": 2.8025583982202447, "grad_norm": 7.9765625, "learning_rate": 7.197441601779755e-06, "loss": 2.9243, "mean_token_accuracy": 0.48503801815060094, "step": 15117 }, { "epoch": 2.802743789395625, "grad_norm": 7.1796875, "learning_rate": 7.197256210604376e-06, "loss": 2.8176, "mean_token_accuracy": 0.48723868154418365, "step": 15118 }, { "epoch": 2.802929180571005, "grad_norm": 7.1171875, "learning_rate": 7.197070819428996e-06, "loss": 2.9805, "mean_token_accuracy": 0.4750103263114416, "step": 15119 }, { "epoch": 2.803114571746385, "grad_norm": 8.3828125, "learning_rate": 7.196885428253616e-06, "loss": 3.0855, "mean_token_accuracy": 0.4650530675640694, "step": 15120 }, { "epoch": 2.803299962921765, "grad_norm": 8.0390625, "learning_rate": 7.1967000370782354e-06, "loss": 3.2994, "mean_token_accuracy": 0.4635239777204184, "step": 15121 }, { "epoch": 2.8034853540971447, "grad_norm": 8.2890625, "learning_rate": 7.196514645902856e-06, "loss": 3.2627, "mean_token_accuracy": 0.4201581872595126, "step": 15122 }, { "epoch": 2.803670745272525, "grad_norm": 6.65625, "learning_rate": 7.196329254727476e-06, "loss": 2.4691, "mean_token_accuracy": 0.5175804032661223, "step": 15123 }, { "epoch": 2.803856136447905, "grad_norm": 8.6015625, "learning_rate": 7.196143863552095e-06, "loss": 2.3659, "mean_token_accuracy": 0.5158619080942384, "step": 15124 }, { "epoch": 2.8040415276232853, "grad_norm": 7.83203125, "learning_rate": 7.195958472376715e-06, "loss": 3.032, "mean_token_accuracy": 0.45323002240477966, "step": 15125 }, { "epoch": 2.804226918798665, "grad_norm": 7.14453125, "learning_rate": 7.1957730812013345e-06, "loss": 2.7285, "mean_token_accuracy": 0.5055790108564535, "step": 15126 }, { "epoch": 2.8044123099740452, "grad_norm": 9.1015625, "learning_rate": 7.195587690025956e-06, "loss": 2.7358, "mean_token_accuracy": 0.4887051700046577, "step": 15127 }, { "epoch": 2.8045977011494254, "grad_norm": 7.56640625, "learning_rate": 7.1954022988505755e-06, "loss": 3.0218, "mean_token_accuracy": 0.4885626493683851, "step": 15128 }, { "epoch": 2.804783092324805, "grad_norm": 6.83984375, "learning_rate": 7.195216907675195e-06, "loss": 2.511, "mean_token_accuracy": 0.4998406628425749, "step": 15129 }, { "epoch": 2.8049684835001854, "grad_norm": 8.5703125, "learning_rate": 7.195031516499816e-06, "loss": 2.6137, "mean_token_accuracy": 0.481037367540435, "step": 15130 }, { "epoch": 2.8051538746755655, "grad_norm": 7.9609375, "learning_rate": 7.194846125324435e-06, "loss": 3.7774, "mean_token_accuracy": 0.45529253035692385, "step": 15131 }, { "epoch": 2.8053392658509457, "grad_norm": 8.5546875, "learning_rate": 7.194660734149055e-06, "loss": 3.5504, "mean_token_accuracy": 0.44033302497687327, "step": 15132 }, { "epoch": 2.8055246570263255, "grad_norm": 8.140625, "learning_rate": 7.1944753429736746e-06, "loss": 3.2215, "mean_token_accuracy": 0.4429657794676806, "step": 15133 }, { "epoch": 2.8057100482017057, "grad_norm": 8.2734375, "learning_rate": 7.194289951798294e-06, "loss": 3.0829, "mean_token_accuracy": 0.48085306940025924, "step": 15134 }, { "epoch": 2.8058954393770854, "grad_norm": 9.6875, "learning_rate": 7.1941045606229155e-06, "loss": 2.5775, "mean_token_accuracy": 0.5050999592003264, "step": 15135 }, { "epoch": 2.8060808305524656, "grad_norm": 9.6953125, "learning_rate": 7.193919169447535e-06, "loss": 3.6068, "mean_token_accuracy": 0.4398965478466209, "step": 15136 }, { "epoch": 2.806266221727846, "grad_norm": 7.83984375, "learning_rate": 7.193733778272155e-06, "loss": 3.025, "mean_token_accuracy": 0.4621919119406557, "step": 15137 }, { "epoch": 2.806451612903226, "grad_norm": 7.921875, "learning_rate": 7.1935483870967745e-06, "loss": 2.8591, "mean_token_accuracy": 0.5216637781629117, "step": 15138 }, { "epoch": 2.8066370040786057, "grad_norm": 9.328125, "learning_rate": 7.193362995921395e-06, "loss": 3.0351, "mean_token_accuracy": 0.46036892118501954, "step": 15139 }, { "epoch": 2.806822395253986, "grad_norm": 8.2578125, "learning_rate": 7.193177604746015e-06, "loss": 2.9143, "mean_token_accuracy": 0.4710926694329184, "step": 15140 }, { "epoch": 2.807007786429366, "grad_norm": 7.640625, "learning_rate": 7.192992213570634e-06, "loss": 3.4673, "mean_token_accuracy": 0.44758862011258177, "step": 15141 }, { "epoch": 2.807193177604746, "grad_norm": 7.21484375, "learning_rate": 7.192806822395254e-06, "loss": 2.5494, "mean_token_accuracy": 0.492918961447679, "step": 15142 }, { "epoch": 2.807378568780126, "grad_norm": 8.84375, "learning_rate": 7.192621431219875e-06, "loss": 2.9263, "mean_token_accuracy": 0.4803836094158675, "step": 15143 }, { "epoch": 2.8075639599555062, "grad_norm": 7.59765625, "learning_rate": 7.192436040044495e-06, "loss": 2.5769, "mean_token_accuracy": 0.49435417240429635, "step": 15144 }, { "epoch": 2.8077493511308864, "grad_norm": 8.9765625, "learning_rate": 7.1922506488691145e-06, "loss": 2.3778, "mean_token_accuracy": 0.5434726411029729, "step": 15145 }, { "epoch": 2.807934742306266, "grad_norm": 8.25, "learning_rate": 7.192065257693734e-06, "loss": 3.5127, "mean_token_accuracy": 0.43467960288808666, "step": 15146 }, { "epoch": 2.8081201334816464, "grad_norm": 11.3984375, "learning_rate": 7.191879866518354e-06, "loss": 3.3858, "mean_token_accuracy": 0.47497232812692164, "step": 15147 }, { "epoch": 2.808305524657026, "grad_norm": 8.4765625, "learning_rate": 7.191694475342974e-06, "loss": 2.9619, "mean_token_accuracy": 0.47221255366051745, "step": 15148 }, { "epoch": 2.8084909158324063, "grad_norm": 6.83984375, "learning_rate": 7.191509084167594e-06, "loss": 2.5992, "mean_token_accuracy": 0.5071653768988249, "step": 15149 }, { "epoch": 2.8086763070077865, "grad_norm": 8.734375, "learning_rate": 7.191323692992214e-06, "loss": 2.4153, "mean_token_accuracy": 0.5380348652931854, "step": 15150 }, { "epoch": 2.8088616981831667, "grad_norm": 8.515625, "learning_rate": 7.191138301816835e-06, "loss": 2.4626, "mean_token_accuracy": 0.50625, "step": 15151 }, { "epoch": 2.8090470893585464, "grad_norm": 7.6328125, "learning_rate": 7.190952910641455e-06, "loss": 2.422, "mean_token_accuracy": 0.5230891719745223, "step": 15152 }, { "epoch": 2.8092324805339266, "grad_norm": 7.58984375, "learning_rate": 7.190767519466074e-06, "loss": 3.325, "mean_token_accuracy": 0.4875606796116505, "step": 15153 }, { "epoch": 2.809417871709307, "grad_norm": 8.90625, "learning_rate": 7.190582128290694e-06, "loss": 2.7585, "mean_token_accuracy": 0.5080741626794258, "step": 15154 }, { "epoch": 2.8096032628846865, "grad_norm": 6.6953125, "learning_rate": 7.1903967371153135e-06, "loss": 2.7595, "mean_token_accuracy": 0.49667908634375507, "step": 15155 }, { "epoch": 2.8097886540600667, "grad_norm": 6.79296875, "learning_rate": 7.190211345939934e-06, "loss": 3.0433, "mean_token_accuracy": 0.4691623197299785, "step": 15156 }, { "epoch": 2.809974045235447, "grad_norm": 6.74609375, "learning_rate": 7.190025954764554e-06, "loss": 3.007, "mean_token_accuracy": 0.4715125017078836, "step": 15157 }, { "epoch": 2.810159436410827, "grad_norm": 9.203125, "learning_rate": 7.189840563589173e-06, "loss": 3.114, "mean_token_accuracy": 0.456773766147731, "step": 15158 }, { "epoch": 2.810344827586207, "grad_norm": 10.65625, "learning_rate": 7.189655172413794e-06, "loss": 3.6596, "mean_token_accuracy": 0.43829617834394907, "step": 15159 }, { "epoch": 2.810530218761587, "grad_norm": 8.5234375, "learning_rate": 7.189469781238414e-06, "loss": 3.2839, "mean_token_accuracy": 0.44046218037873114, "step": 15160 }, { "epoch": 2.810715609936967, "grad_norm": 7.04296875, "learning_rate": 7.189284390063034e-06, "loss": 3.0344, "mean_token_accuracy": 0.4515198579986687, "step": 15161 }, { "epoch": 2.810901001112347, "grad_norm": 6.77734375, "learning_rate": 7.1890989988876536e-06, "loss": 2.4698, "mean_token_accuracy": 0.5353876306620209, "step": 15162 }, { "epoch": 2.811086392287727, "grad_norm": 8.890625, "learning_rate": 7.188913607712273e-06, "loss": 3.6128, "mean_token_accuracy": 0.4655195772930497, "step": 15163 }, { "epoch": 2.8112717834631074, "grad_norm": 9.3515625, "learning_rate": 7.188728216536893e-06, "loss": 2.8803, "mean_token_accuracy": 0.4988600740951838, "step": 15164 }, { "epoch": 2.811457174638487, "grad_norm": 6.73046875, "learning_rate": 7.188542825361513e-06, "loss": 3.0146, "mean_token_accuracy": 0.4668587896253602, "step": 15165 }, { "epoch": 2.8116425658138673, "grad_norm": 9.171875, "learning_rate": 7.188357434186133e-06, "loss": 3.4488, "mean_token_accuracy": 0.45190453230472516, "step": 15166 }, { "epoch": 2.811827956989247, "grad_norm": 7.8125, "learning_rate": 7.1881720430107535e-06, "loss": 2.5176, "mean_token_accuracy": 0.5082615306639635, "step": 15167 }, { "epoch": 2.8120133481646272, "grad_norm": 8.15625, "learning_rate": 7.187986651835374e-06, "loss": 2.1421, "mean_token_accuracy": 0.5490694717373714, "step": 15168 }, { "epoch": 2.8121987393400074, "grad_norm": 7.51953125, "learning_rate": 7.187801260659994e-06, "loss": 3.1388, "mean_token_accuracy": 0.46170742422023725, "step": 15169 }, { "epoch": 2.8123841305153876, "grad_norm": 10.828125, "learning_rate": 7.187615869484613e-06, "loss": 3.336, "mean_token_accuracy": 0.45003417634996584, "step": 15170 }, { "epoch": 2.812569521690768, "grad_norm": 6.4375, "learning_rate": 7.187430478309233e-06, "loss": 2.8683, "mean_token_accuracy": 0.46730370711489755, "step": 15171 }, { "epoch": 2.8127549128661475, "grad_norm": 9.234375, "learning_rate": 7.1872450871338525e-06, "loss": 3.2277, "mean_token_accuracy": 0.465564738292011, "step": 15172 }, { "epoch": 2.8129403040415277, "grad_norm": 10.3046875, "learning_rate": 7.187059695958473e-06, "loss": 2.6012, "mean_token_accuracy": 0.4836394948335247, "step": 15173 }, { "epoch": 2.8131256952169075, "grad_norm": 9.03125, "learning_rate": 7.186874304783093e-06, "loss": 3.6846, "mean_token_accuracy": 0.44680851063829785, "step": 15174 }, { "epoch": 2.8133110863922877, "grad_norm": 8.2734375, "learning_rate": 7.186688913607713e-06, "loss": 2.9356, "mean_token_accuracy": 0.48729216152019, "step": 15175 }, { "epoch": 2.813496477567668, "grad_norm": 10.5390625, "learning_rate": 7.186503522432333e-06, "loss": 2.5329, "mean_token_accuracy": 0.5306863434059499, "step": 15176 }, { "epoch": 2.813681868743048, "grad_norm": 6.7578125, "learning_rate": 7.186318131256953e-06, "loss": 2.7566, "mean_token_accuracy": 0.4843462246777164, "step": 15177 }, { "epoch": 2.813867259918428, "grad_norm": 8.6171875, "learning_rate": 7.186132740081573e-06, "loss": 3.4107, "mean_token_accuracy": 0.46724744513318817, "step": 15178 }, { "epoch": 2.814052651093808, "grad_norm": 9.0390625, "learning_rate": 7.185947348906193e-06, "loss": 2.5768, "mean_token_accuracy": 0.5283747886983821, "step": 15179 }, { "epoch": 2.8142380422691877, "grad_norm": 7.515625, "learning_rate": 7.185761957730812e-06, "loss": 3.4027, "mean_token_accuracy": 0.4648182665424045, "step": 15180 }, { "epoch": 2.814423433444568, "grad_norm": 7.15625, "learning_rate": 7.185576566555432e-06, "loss": 2.3864, "mean_token_accuracy": 0.5327638295655798, "step": 15181 }, { "epoch": 2.814608824619948, "grad_norm": 9.140625, "learning_rate": 7.185391175380052e-06, "loss": 3.1763, "mean_token_accuracy": 0.46565888925402565, "step": 15182 }, { "epoch": 2.8147942157953283, "grad_norm": 8.84375, "learning_rate": 7.185205784204673e-06, "loss": 2.7451, "mean_token_accuracy": 0.5076026355803345, "step": 15183 }, { "epoch": 2.814979606970708, "grad_norm": 7.37890625, "learning_rate": 7.1850203930292925e-06, "loss": 2.8083, "mean_token_accuracy": 0.49742853725630903, "step": 15184 }, { "epoch": 2.8151649981460882, "grad_norm": 9.28125, "learning_rate": 7.184835001853913e-06, "loss": 2.9839, "mean_token_accuracy": 0.46620011911852294, "step": 15185 }, { "epoch": 2.8153503893214684, "grad_norm": 10.765625, "learning_rate": 7.184649610678533e-06, "loss": 3.2862, "mean_token_accuracy": 0.46138691883372734, "step": 15186 }, { "epoch": 2.815535780496848, "grad_norm": 7.80078125, "learning_rate": 7.184464219503152e-06, "loss": 2.8552, "mean_token_accuracy": 0.47386581469648564, "step": 15187 }, { "epoch": 2.8157211716722284, "grad_norm": 8.3125, "learning_rate": 7.184278828327772e-06, "loss": 3.495, "mean_token_accuracy": 0.42829525483304043, "step": 15188 }, { "epoch": 2.8159065628476085, "grad_norm": 16.546875, "learning_rate": 7.184093437152392e-06, "loss": 3.1005, "mean_token_accuracy": 0.4606645839250213, "step": 15189 }, { "epoch": 2.8160919540229887, "grad_norm": 11.5625, "learning_rate": 7.183908045977011e-06, "loss": 3.435, "mean_token_accuracy": 0.44716657126502574, "step": 15190 }, { "epoch": 2.8162773451983685, "grad_norm": 7.79296875, "learning_rate": 7.1837226548016326e-06, "loss": 2.4775, "mean_token_accuracy": 0.5566850035876585, "step": 15191 }, { "epoch": 2.8164627363737487, "grad_norm": 12.234375, "learning_rate": 7.183537263626252e-06, "loss": 2.7201, "mean_token_accuracy": 0.5015587978550942, "step": 15192 }, { "epoch": 2.8166481275491284, "grad_norm": 12.3203125, "learning_rate": 7.183351872450872e-06, "loss": 3.244, "mean_token_accuracy": 0.45494887131005207, "step": 15193 }, { "epoch": 2.8168335187245086, "grad_norm": 10.0546875, "learning_rate": 7.183166481275492e-06, "loss": 2.8202, "mean_token_accuracy": 0.48370031455533313, "step": 15194 }, { "epoch": 2.817018909899889, "grad_norm": 8.7578125, "learning_rate": 7.182981090100112e-06, "loss": 2.5972, "mean_token_accuracy": 0.5006048038707448, "step": 15195 }, { "epoch": 2.817204301075269, "grad_norm": 8.90625, "learning_rate": 7.182795698924732e-06, "loss": 2.8367, "mean_token_accuracy": 0.5017137454201631, "step": 15196 }, { "epoch": 2.8173896922506487, "grad_norm": 7.4453125, "learning_rate": 7.182610307749351e-06, "loss": 2.0616, "mean_token_accuracy": 0.5739686228936665, "step": 15197 }, { "epoch": 2.817575083426029, "grad_norm": 7.31640625, "learning_rate": 7.182424916573971e-06, "loss": 3.1897, "mean_token_accuracy": 0.4597519455252918, "step": 15198 }, { "epoch": 2.817760474601409, "grad_norm": 9.9140625, "learning_rate": 7.182239525398592e-06, "loss": 3.3992, "mean_token_accuracy": 0.4260727318484963, "step": 15199 }, { "epoch": 2.817945865776789, "grad_norm": 10.1171875, "learning_rate": 7.182054134223212e-06, "loss": 2.8729, "mean_token_accuracy": 0.46733264439086003, "step": 15200 }, { "epoch": 2.818131256952169, "grad_norm": 8.4765625, "learning_rate": 7.1818687430478315e-06, "loss": 3.1042, "mean_token_accuracy": 0.49048072346501664, "step": 15201 }, { "epoch": 2.8183166481275492, "grad_norm": 11.34375, "learning_rate": 7.181683351872451e-06, "loss": 3.5828, "mean_token_accuracy": 0.4350017582932833, "step": 15202 }, { "epoch": 2.8185020393029294, "grad_norm": 12.5390625, "learning_rate": 7.181497960697072e-06, "loss": 2.6933, "mean_token_accuracy": 0.49958088851634536, "step": 15203 }, { "epoch": 2.818687430478309, "grad_norm": 9.1953125, "learning_rate": 7.181312569521691e-06, "loss": 3.4056, "mean_token_accuracy": 0.44741044946383757, "step": 15204 }, { "epoch": 2.8188728216536894, "grad_norm": 7.796875, "learning_rate": 7.181127178346311e-06, "loss": 3.1323, "mean_token_accuracy": 0.43470016591609384, "step": 15205 }, { "epoch": 2.819058212829069, "grad_norm": 8.265625, "learning_rate": 7.180941787170931e-06, "loss": 2.7191, "mean_token_accuracy": 0.5104555638536221, "step": 15206 }, { "epoch": 2.8192436040044493, "grad_norm": 11.78125, "learning_rate": 7.180756395995552e-06, "loss": 3.0656, "mean_token_accuracy": 0.4659610610218215, "step": 15207 }, { "epoch": 2.8194289951798295, "grad_norm": 6.95703125, "learning_rate": 7.180571004820172e-06, "loss": 2.7586, "mean_token_accuracy": 0.49056603773584906, "step": 15208 }, { "epoch": 2.8196143863552097, "grad_norm": 7.78125, "learning_rate": 7.180385613644791e-06, "loss": 3.4233, "mean_token_accuracy": 0.42899891186071815, "step": 15209 }, { "epoch": 2.8197997775305894, "grad_norm": 9.390625, "learning_rate": 7.180200222469411e-06, "loss": 3.0736, "mean_token_accuracy": 0.515790787666958, "step": 15210 }, { "epoch": 2.8199851687059696, "grad_norm": 8.7734375, "learning_rate": 7.180014831294031e-06, "loss": 2.537, "mean_token_accuracy": 0.494885598923284, "step": 15211 }, { "epoch": 2.82017055988135, "grad_norm": 10.4609375, "learning_rate": 7.179829440118651e-06, "loss": 3.3401, "mean_token_accuracy": 0.4673852957435047, "step": 15212 }, { "epoch": 2.8203559510567295, "grad_norm": 6.921875, "learning_rate": 7.179644048943271e-06, "loss": 3.0698, "mean_token_accuracy": 0.45839320705421294, "step": 15213 }, { "epoch": 2.8205413422321097, "grad_norm": 6.72265625, "learning_rate": 7.17945865776789e-06, "loss": 2.4871, "mean_token_accuracy": 0.5359680284191829, "step": 15214 }, { "epoch": 2.82072673340749, "grad_norm": 12.2265625, "learning_rate": 7.179273266592512e-06, "loss": 4.7525, "mean_token_accuracy": 0.42224480906054884, "step": 15215 }, { "epoch": 2.82091212458287, "grad_norm": 8.6875, "learning_rate": 7.179087875417131e-06, "loss": 2.4929, "mean_token_accuracy": 0.516763145200193, "step": 15216 }, { "epoch": 2.82109751575825, "grad_norm": 7.203125, "learning_rate": 7.178902484241751e-06, "loss": 2.8199, "mean_token_accuracy": 0.493015873015873, "step": 15217 }, { "epoch": 2.82128290693363, "grad_norm": 7.25, "learning_rate": 7.178717093066371e-06, "loss": 3.0702, "mean_token_accuracy": 0.45937961595273263, "step": 15218 }, { "epoch": 2.82146829810901, "grad_norm": 9.1953125, "learning_rate": 7.17853170189099e-06, "loss": 2.6695, "mean_token_accuracy": 0.4725123378859176, "step": 15219 }, { "epoch": 2.82165368928439, "grad_norm": 7.17578125, "learning_rate": 7.178346310715611e-06, "loss": 2.2863, "mean_token_accuracy": 0.5600731570061902, "step": 15220 }, { "epoch": 2.82183908045977, "grad_norm": 6.8671875, "learning_rate": 7.17816091954023e-06, "loss": 3.1208, "mean_token_accuracy": 0.47054392044598464, "step": 15221 }, { "epoch": 2.8220244716351504, "grad_norm": 8.296875, "learning_rate": 7.17797552836485e-06, "loss": 3.0415, "mean_token_accuracy": 0.45921186563908684, "step": 15222 }, { "epoch": 2.82220986281053, "grad_norm": 11.0703125, "learning_rate": 7.17779013718947e-06, "loss": 3.9782, "mean_token_accuracy": 0.4315453863465866, "step": 15223 }, { "epoch": 2.8223952539859103, "grad_norm": 7.89453125, "learning_rate": 7.177604746014091e-06, "loss": 3.3349, "mean_token_accuracy": 0.4629338493167251, "step": 15224 }, { "epoch": 2.8225806451612905, "grad_norm": 8.265625, "learning_rate": 7.177419354838711e-06, "loss": 3.0677, "mean_token_accuracy": 0.44997828290140435, "step": 15225 }, { "epoch": 2.8227660363366702, "grad_norm": 7.6015625, "learning_rate": 7.17723396366333e-06, "loss": 2.7996, "mean_token_accuracy": 0.4922202274087373, "step": 15226 }, { "epoch": 2.8229514275120504, "grad_norm": 7.484375, "learning_rate": 7.17704857248795e-06, "loss": 2.6462, "mean_token_accuracy": 0.5251582448235168, "step": 15227 }, { "epoch": 2.8231368186874306, "grad_norm": 9.703125, "learning_rate": 7.1768631813125696e-06, "loss": 2.9348, "mean_token_accuracy": 0.469034749034749, "step": 15228 }, { "epoch": 2.823322209862811, "grad_norm": 9.25, "learning_rate": 7.17667779013719e-06, "loss": 2.9124, "mean_token_accuracy": 0.4720428123269976, "step": 15229 }, { "epoch": 2.8235076010381905, "grad_norm": 11.078125, "learning_rate": 7.17649239896181e-06, "loss": 3.0566, "mean_token_accuracy": 0.47708138447146864, "step": 15230 }, { "epoch": 2.8236929922135707, "grad_norm": 9.921875, "learning_rate": 7.176307007786429e-06, "loss": 2.7654, "mean_token_accuracy": 0.4688361831218974, "step": 15231 }, { "epoch": 2.8238783833889505, "grad_norm": 7.5625, "learning_rate": 7.176121616611051e-06, "loss": 2.9236, "mean_token_accuracy": 0.47847180109157067, "step": 15232 }, { "epoch": 2.8240637745643307, "grad_norm": 6.7421875, "learning_rate": 7.17593622543567e-06, "loss": 2.5024, "mean_token_accuracy": 0.49814601713335893, "step": 15233 }, { "epoch": 2.824249165739711, "grad_norm": 9.3671875, "learning_rate": 7.17575083426029e-06, "loss": 3.0272, "mean_token_accuracy": 0.4809512254307207, "step": 15234 }, { "epoch": 2.824434556915091, "grad_norm": 12.9453125, "learning_rate": 7.17556544308491e-06, "loss": 2.4487, "mean_token_accuracy": 0.5411372096008215, "step": 15235 }, { "epoch": 2.824619948090471, "grad_norm": 9.8515625, "learning_rate": 7.175380051909529e-06, "loss": 2.9255, "mean_token_accuracy": 0.47164298047549474, "step": 15236 }, { "epoch": 2.824805339265851, "grad_norm": 7.9296875, "learning_rate": 7.17519466073415e-06, "loss": 2.6418, "mean_token_accuracy": 0.4936708860759494, "step": 15237 }, { "epoch": 2.8249907304412307, "grad_norm": 9.109375, "learning_rate": 7.175009269558769e-06, "loss": 2.9923, "mean_token_accuracy": 0.4757526323594839, "step": 15238 }, { "epoch": 2.825176121616611, "grad_norm": 8.6796875, "learning_rate": 7.174823878383389e-06, "loss": 2.9743, "mean_token_accuracy": 0.4585518102372035, "step": 15239 }, { "epoch": 2.825361512791991, "grad_norm": 7.9296875, "learning_rate": 7.1746384872080095e-06, "loss": 2.7221, "mean_token_accuracy": 0.4936163667128134, "step": 15240 }, { "epoch": 2.8255469039673713, "grad_norm": 8.4609375, "learning_rate": 7.17445309603263e-06, "loss": 2.935, "mean_token_accuracy": 0.4834771068347711, "step": 15241 }, { "epoch": 2.8257322951427515, "grad_norm": 8.921875, "learning_rate": 7.17426770485725e-06, "loss": 2.6768, "mean_token_accuracy": 0.5093951849677041, "step": 15242 }, { "epoch": 2.8259176863181312, "grad_norm": 8.3359375, "learning_rate": 7.174082313681869e-06, "loss": 2.8721, "mean_token_accuracy": 0.5156293608707787, "step": 15243 }, { "epoch": 2.8261030774935114, "grad_norm": 7.9453125, "learning_rate": 7.173896922506489e-06, "loss": 2.7361, "mean_token_accuracy": 0.504950495049505, "step": 15244 }, { "epoch": 2.826288468668891, "grad_norm": 7.9609375, "learning_rate": 7.173711531331109e-06, "loss": 2.6629, "mean_token_accuracy": 0.5042558679391282, "step": 15245 }, { "epoch": 2.8264738598442714, "grad_norm": 7.47265625, "learning_rate": 7.173526140155729e-06, "loss": 3.2311, "mean_token_accuracy": 0.4636731777036684, "step": 15246 }, { "epoch": 2.8266592510196515, "grad_norm": 14.2265625, "learning_rate": 7.173340748980349e-06, "loss": 2.87, "mean_token_accuracy": 0.4608982412060301, "step": 15247 }, { "epoch": 2.8268446421950317, "grad_norm": 12.1875, "learning_rate": 7.173155357804969e-06, "loss": 2.8229, "mean_token_accuracy": 0.4696688331785825, "step": 15248 }, { "epoch": 2.8270300333704115, "grad_norm": 8.4140625, "learning_rate": 7.17296996662959e-06, "loss": 2.975, "mean_token_accuracy": 0.4731856601573885, "step": 15249 }, { "epoch": 2.8272154245457917, "grad_norm": 10.8046875, "learning_rate": 7.172784575454209e-06, "loss": 2.1435, "mean_token_accuracy": 0.5584784254352763, "step": 15250 }, { "epoch": 2.8274008157211714, "grad_norm": 7.921875, "learning_rate": 7.172599184278829e-06, "loss": 3.0244, "mean_token_accuracy": 0.4827233694540994, "step": 15251 }, { "epoch": 2.8275862068965516, "grad_norm": 8.0546875, "learning_rate": 7.172413793103449e-06, "loss": 2.8611, "mean_token_accuracy": 0.4657322875144657, "step": 15252 }, { "epoch": 2.827771598071932, "grad_norm": 7.16796875, "learning_rate": 7.172228401928068e-06, "loss": 3.2189, "mean_token_accuracy": 0.4601990049751244, "step": 15253 }, { "epoch": 2.827956989247312, "grad_norm": 8.8515625, "learning_rate": 7.172043010752689e-06, "loss": 2.8872, "mean_token_accuracy": 0.4682074768345937, "step": 15254 }, { "epoch": 2.8281423804226917, "grad_norm": 7.48046875, "learning_rate": 7.1718576195773084e-06, "loss": 3.0905, "mean_token_accuracy": 0.4719029374201788, "step": 15255 }, { "epoch": 2.828327771598072, "grad_norm": 7.57421875, "learning_rate": 7.171672228401929e-06, "loss": 2.532, "mean_token_accuracy": 0.5068777575914871, "step": 15256 }, { "epoch": 2.828513162773452, "grad_norm": 6.890625, "learning_rate": 7.1714868372265486e-06, "loss": 1.9936, "mean_token_accuracy": 0.5862289218191109, "step": 15257 }, { "epoch": 2.828698553948832, "grad_norm": 7.30859375, "learning_rate": 7.171301446051169e-06, "loss": 2.7233, "mean_token_accuracy": 0.4974804736709499, "step": 15258 }, { "epoch": 2.828883945124212, "grad_norm": 8.8515625, "learning_rate": 7.171116054875789e-06, "loss": 2.4227, "mean_token_accuracy": 0.6034171035979027, "step": 15259 }, { "epoch": 2.8290693362995922, "grad_norm": 8.2421875, "learning_rate": 7.170930663700408e-06, "loss": 3.2699, "mean_token_accuracy": 0.47303353422963995, "step": 15260 }, { "epoch": 2.8292547274749724, "grad_norm": 7.04296875, "learning_rate": 7.170745272525028e-06, "loss": 3.2004, "mean_token_accuracy": 0.45066193530776577, "step": 15261 }, { "epoch": 2.829440118650352, "grad_norm": 7.28515625, "learning_rate": 7.170559881349648e-06, "loss": 3.3746, "mean_token_accuracy": 0.45788712011577426, "step": 15262 }, { "epoch": 2.8296255098257324, "grad_norm": 6.8203125, "learning_rate": 7.170374490174268e-06, "loss": 3.0295, "mean_token_accuracy": 0.4820002801512817, "step": 15263 }, { "epoch": 2.829810901001112, "grad_norm": 7.3671875, "learning_rate": 7.170189098998889e-06, "loss": 3.0949, "mean_token_accuracy": 0.46167056986729116, "step": 15264 }, { "epoch": 2.8299962921764923, "grad_norm": 7.48828125, "learning_rate": 7.170003707823508e-06, "loss": 2.5157, "mean_token_accuracy": 0.4998430469812703, "step": 15265 }, { "epoch": 2.8301816833518725, "grad_norm": 13.5, "learning_rate": 7.169818316648128e-06, "loss": 3.2212, "mean_token_accuracy": 0.43549835378629154, "step": 15266 }, { "epoch": 2.8303670745272527, "grad_norm": 12.1015625, "learning_rate": 7.169632925472748e-06, "loss": 2.8169, "mean_token_accuracy": 0.4647041735257093, "step": 15267 }, { "epoch": 2.8305524657026324, "grad_norm": 9.8671875, "learning_rate": 7.169447534297368e-06, "loss": 3.6591, "mean_token_accuracy": 0.440552016985138, "step": 15268 }, { "epoch": 2.8307378568780126, "grad_norm": 10.21875, "learning_rate": 7.169262143121988e-06, "loss": 3.1163, "mean_token_accuracy": 0.4661560106299828, "step": 15269 }, { "epoch": 2.830923248053393, "grad_norm": 7.78125, "learning_rate": 7.169076751946607e-06, "loss": 2.594, "mean_token_accuracy": 0.539269406392694, "step": 15270 }, { "epoch": 2.8311086392287725, "grad_norm": 8.40625, "learning_rate": 7.168891360771227e-06, "loss": 2.8756, "mean_token_accuracy": 0.521639306623809, "step": 15271 }, { "epoch": 2.8312940304041527, "grad_norm": 13.375, "learning_rate": 7.168705969595848e-06, "loss": 2.2763, "mean_token_accuracy": 0.5494162524563634, "step": 15272 }, { "epoch": 2.831479421579533, "grad_norm": 7.05859375, "learning_rate": 7.168520578420468e-06, "loss": 3.4559, "mean_token_accuracy": 0.435024154589372, "step": 15273 }, { "epoch": 2.831664812754913, "grad_norm": 8.2109375, "learning_rate": 7.168335187245088e-06, "loss": 2.6001, "mean_token_accuracy": 0.48295849756549963, "step": 15274 }, { "epoch": 2.831850203930293, "grad_norm": 11.25, "learning_rate": 7.168149796069708e-06, "loss": 3.1598, "mean_token_accuracy": 0.46556436184749933, "step": 15275 }, { "epoch": 2.832035595105673, "grad_norm": 10.1953125, "learning_rate": 7.167964404894328e-06, "loss": 3.0354, "mean_token_accuracy": 0.4671600370027752, "step": 15276 }, { "epoch": 2.832220986281053, "grad_norm": 6.38671875, "learning_rate": 7.167779013718947e-06, "loss": 2.5175, "mean_token_accuracy": 0.5075727643882524, "step": 15277 }, { "epoch": 2.832406377456433, "grad_norm": 10.1640625, "learning_rate": 7.167593622543567e-06, "loss": 2.594, "mean_token_accuracy": 0.5274872620005363, "step": 15278 }, { "epoch": 2.832591768631813, "grad_norm": 13.3984375, "learning_rate": 7.167408231368187e-06, "loss": 2.8739, "mean_token_accuracy": 0.45, "step": 15279 }, { "epoch": 2.8327771598071934, "grad_norm": 8.5546875, "learning_rate": 7.167222840192808e-06, "loss": 3.0544, "mean_token_accuracy": 0.4652767361631918, "step": 15280 }, { "epoch": 2.832962550982573, "grad_norm": 10.4609375, "learning_rate": 7.167037449017428e-06, "loss": 2.741, "mean_token_accuracy": 0.466893039049236, "step": 15281 }, { "epoch": 2.8331479421579533, "grad_norm": 8.25, "learning_rate": 7.166852057842047e-06, "loss": 3.3713, "mean_token_accuracy": 0.45990428388000465, "step": 15282 }, { "epoch": 2.8333333333333335, "grad_norm": 9.21875, "learning_rate": 7.166666666666667e-06, "loss": 3.4191, "mean_token_accuracy": 0.46594666078906766, "step": 15283 }, { "epoch": 2.8335187245087132, "grad_norm": 7.48046875, "learning_rate": 7.1664812754912874e-06, "loss": 3.1798, "mean_token_accuracy": 0.463968253968254, "step": 15284 }, { "epoch": 2.8337041156840934, "grad_norm": 8.140625, "learning_rate": 7.166295884315907e-06, "loss": 3.5657, "mean_token_accuracy": 0.4385147891755821, "step": 15285 }, { "epoch": 2.8338895068594736, "grad_norm": 7.66796875, "learning_rate": 7.166110493140527e-06, "loss": 3.0722, "mean_token_accuracy": 0.46045751633986925, "step": 15286 }, { "epoch": 2.834074898034854, "grad_norm": 8.046875, "learning_rate": 7.165925101965146e-06, "loss": 3.1527, "mean_token_accuracy": 0.46708041014570967, "step": 15287 }, { "epoch": 2.8342602892102335, "grad_norm": 8.1171875, "learning_rate": 7.165739710789768e-06, "loss": 3.5991, "mean_token_accuracy": 0.44224655598728363, "step": 15288 }, { "epoch": 2.8344456803856137, "grad_norm": 9.1484375, "learning_rate": 7.165554319614387e-06, "loss": 3.0266, "mean_token_accuracy": 0.4754125412541254, "step": 15289 }, { "epoch": 2.8346310715609935, "grad_norm": 7.69921875, "learning_rate": 7.165368928439007e-06, "loss": 2.5108, "mean_token_accuracy": 0.4887545684565645, "step": 15290 }, { "epoch": 2.8348164627363737, "grad_norm": 7.02734375, "learning_rate": 7.165183537263627e-06, "loss": 2.2721, "mean_token_accuracy": 0.5655388978930308, "step": 15291 }, { "epoch": 2.835001853911754, "grad_norm": 6.69140625, "learning_rate": 7.164998146088247e-06, "loss": 2.4699, "mean_token_accuracy": 0.5299379770992366, "step": 15292 }, { "epoch": 2.835187245087134, "grad_norm": 7.00390625, "learning_rate": 7.164812754912867e-06, "loss": 2.4868, "mean_token_accuracy": 0.5182687591956842, "step": 15293 }, { "epoch": 2.835372636262514, "grad_norm": 9.2109375, "learning_rate": 7.164627363737486e-06, "loss": 3.2646, "mean_token_accuracy": 0.4600687810470004, "step": 15294 }, { "epoch": 2.835558027437894, "grad_norm": 12.8671875, "learning_rate": 7.164441972562106e-06, "loss": 2.74, "mean_token_accuracy": 0.49096420659904394, "step": 15295 }, { "epoch": 2.835743418613274, "grad_norm": 8.046875, "learning_rate": 7.164256581386727e-06, "loss": 2.8127, "mean_token_accuracy": 0.49566236811254394, "step": 15296 }, { "epoch": 2.835928809788654, "grad_norm": 11.9921875, "learning_rate": 7.164071190211347e-06, "loss": 3.2175, "mean_token_accuracy": 0.45983338292174947, "step": 15297 }, { "epoch": 2.836114200964034, "grad_norm": 11.3828125, "learning_rate": 7.163885799035967e-06, "loss": 2.8594, "mean_token_accuracy": 0.46378795420607266, "step": 15298 }, { "epoch": 2.8362995921394143, "grad_norm": 7.7734375, "learning_rate": 7.163700407860586e-06, "loss": 2.8611, "mean_token_accuracy": 0.47993499187398425, "step": 15299 }, { "epoch": 2.8364849833147945, "grad_norm": 9.125, "learning_rate": 7.163515016685206e-06, "loss": 2.474, "mean_token_accuracy": 0.5115901715345387, "step": 15300 }, { "epoch": 2.8366703744901742, "grad_norm": 10.9296875, "learning_rate": 7.1633296255098265e-06, "loss": 2.5771, "mean_token_accuracy": 0.49374796129172555, "step": 15301 }, { "epoch": 2.8368557656655544, "grad_norm": 8.296875, "learning_rate": 7.163144234334446e-06, "loss": 3.0117, "mean_token_accuracy": 0.4800275482093664, "step": 15302 }, { "epoch": 2.837041156840934, "grad_norm": 9.3203125, "learning_rate": 7.162958843159066e-06, "loss": 3.4338, "mean_token_accuracy": 0.45917001338688085, "step": 15303 }, { "epoch": 2.8372265480163144, "grad_norm": 8.6171875, "learning_rate": 7.162773451983687e-06, "loss": 3.3489, "mean_token_accuracy": 0.4583008573655495, "step": 15304 }, { "epoch": 2.8374119391916945, "grad_norm": 7.16796875, "learning_rate": 7.162588060808307e-06, "loss": 3.2071, "mean_token_accuracy": 0.45114539504441326, "step": 15305 }, { "epoch": 2.8375973303670747, "grad_norm": 8.1171875, "learning_rate": 7.162402669632926e-06, "loss": 2.8511, "mean_token_accuracy": 0.5066331517944421, "step": 15306 }, { "epoch": 2.8377827215424545, "grad_norm": 8.265625, "learning_rate": 7.162217278457546e-06, "loss": 3.1439, "mean_token_accuracy": 0.46727748691099474, "step": 15307 }, { "epoch": 2.8379681127178347, "grad_norm": 9.65625, "learning_rate": 7.162031887282166e-06, "loss": 2.8564, "mean_token_accuracy": 0.46910202767943354, "step": 15308 }, { "epoch": 2.8381535038932144, "grad_norm": 7.94921875, "learning_rate": 7.161846496106785e-06, "loss": 2.7972, "mean_token_accuracy": 0.4878367773999477, "step": 15309 }, { "epoch": 2.8383388950685946, "grad_norm": 7.86328125, "learning_rate": 7.161661104931406e-06, "loss": 3.2065, "mean_token_accuracy": 0.45118570988604867, "step": 15310 }, { "epoch": 2.838524286243975, "grad_norm": 8.6875, "learning_rate": 7.1614757137560255e-06, "loss": 3.5121, "mean_token_accuracy": 0.44878317063110135, "step": 15311 }, { "epoch": 2.838709677419355, "grad_norm": 8.59375, "learning_rate": 7.161290322580646e-06, "loss": 3.4376, "mean_token_accuracy": 0.4389626818469323, "step": 15312 }, { "epoch": 2.8388950685947347, "grad_norm": 8.0859375, "learning_rate": 7.1611049314052664e-06, "loss": 2.7883, "mean_token_accuracy": 0.47029945999018163, "step": 15313 }, { "epoch": 2.839080459770115, "grad_norm": 7.51171875, "learning_rate": 7.160919540229886e-06, "loss": 3.0305, "mean_token_accuracy": 0.5053506375227687, "step": 15314 }, { "epoch": 2.839265850945495, "grad_norm": 9.0859375, "learning_rate": 7.160734149054506e-06, "loss": 3.4296, "mean_token_accuracy": 0.4365284974093264, "step": 15315 }, { "epoch": 2.839451242120875, "grad_norm": 8.015625, "learning_rate": 7.160548757879125e-06, "loss": 3.4203, "mean_token_accuracy": 0.4649099576271186, "step": 15316 }, { "epoch": 2.839636633296255, "grad_norm": 7.390625, "learning_rate": 7.160363366703745e-06, "loss": 2.552, "mean_token_accuracy": 0.5275647518448849, "step": 15317 }, { "epoch": 2.8398220244716352, "grad_norm": 6.99609375, "learning_rate": 7.1601779755283655e-06, "loss": 3.0433, "mean_token_accuracy": 0.47981444332998996, "step": 15318 }, { "epoch": 2.8400074156470154, "grad_norm": 7.203125, "learning_rate": 7.159992584352985e-06, "loss": 2.7172, "mean_token_accuracy": 0.49413362973940966, "step": 15319 }, { "epoch": 2.840192806822395, "grad_norm": 7.9765625, "learning_rate": 7.159807193177606e-06, "loss": 2.697, "mean_token_accuracy": 0.46564087242306545, "step": 15320 }, { "epoch": 2.8403781979977754, "grad_norm": 9.8203125, "learning_rate": 7.159621802002225e-06, "loss": 2.5349, "mean_token_accuracy": 0.5241062156950007, "step": 15321 }, { "epoch": 2.840563589173155, "grad_norm": 6.8359375, "learning_rate": 7.159436410826846e-06, "loss": 2.8102, "mean_token_accuracy": 0.4800826541154862, "step": 15322 }, { "epoch": 2.8407489803485353, "grad_norm": 7.05859375, "learning_rate": 7.159251019651465e-06, "loss": 2.7281, "mean_token_accuracy": 0.49166200335758253, "step": 15323 }, { "epoch": 2.8409343715239155, "grad_norm": 7.859375, "learning_rate": 7.159065628476085e-06, "loss": 2.7788, "mean_token_accuracy": 0.4970980392156863, "step": 15324 }, { "epoch": 2.8411197626992957, "grad_norm": 7.23828125, "learning_rate": 7.158880237300705e-06, "loss": 3.1547, "mean_token_accuracy": 0.4501797268152408, "step": 15325 }, { "epoch": 2.8413051538746754, "grad_norm": 8.609375, "learning_rate": 7.158694846125324e-06, "loss": 2.4184, "mean_token_accuracy": 0.5225821707259399, "step": 15326 }, { "epoch": 2.8414905450500556, "grad_norm": 7.66796875, "learning_rate": 7.158509454949945e-06, "loss": 3.1083, "mean_token_accuracy": 0.4604329311568488, "step": 15327 }, { "epoch": 2.841675936225436, "grad_norm": 9.5078125, "learning_rate": 7.158324063774565e-06, "loss": 2.8636, "mean_token_accuracy": 0.4778147131088308, "step": 15328 }, { "epoch": 2.8418613274008155, "grad_norm": 7.13671875, "learning_rate": 7.158138672599185e-06, "loss": 3.3273, "mean_token_accuracy": 0.43343996062992124, "step": 15329 }, { "epoch": 2.8420467185761957, "grad_norm": 7.05078125, "learning_rate": 7.1579532814238055e-06, "loss": 2.55, "mean_token_accuracy": 0.48985549554809515, "step": 15330 }, { "epoch": 2.842232109751576, "grad_norm": 8.6875, "learning_rate": 7.157767890248425e-06, "loss": 2.8157, "mean_token_accuracy": 0.5078813785733369, "step": 15331 }, { "epoch": 2.842417500926956, "grad_norm": 7.89453125, "learning_rate": 7.157582499073045e-06, "loss": 2.8427, "mean_token_accuracy": 0.5075866475003993, "step": 15332 }, { "epoch": 2.842602892102336, "grad_norm": 7.42578125, "learning_rate": 7.157397107897664e-06, "loss": 2.9308, "mean_token_accuracy": 0.4898137732783402, "step": 15333 }, { "epoch": 2.842788283277716, "grad_norm": 7.97265625, "learning_rate": 7.157211716722284e-06, "loss": 2.7237, "mean_token_accuracy": 0.5005841121495327, "step": 15334 }, { "epoch": 2.842973674453096, "grad_norm": 7.76171875, "learning_rate": 7.1570263255469045e-06, "loss": 3.3254, "mean_token_accuracy": 0.44100856327307325, "step": 15335 }, { "epoch": 2.843159065628476, "grad_norm": 9.6171875, "learning_rate": 7.156840934371525e-06, "loss": 3.2085, "mean_token_accuracy": 0.4795158286778399, "step": 15336 }, { "epoch": 2.843344456803856, "grad_norm": 7.99609375, "learning_rate": 7.156655543196145e-06, "loss": 3.2065, "mean_token_accuracy": 0.47353625436215585, "step": 15337 }, { "epoch": 2.8435298479792364, "grad_norm": 22.171875, "learning_rate": 7.156470152020764e-06, "loss": 3.4936, "mean_token_accuracy": 0.44387910857840646, "step": 15338 }, { "epoch": 2.843715239154616, "grad_norm": 7.95703125, "learning_rate": 7.156284760845385e-06, "loss": 2.5127, "mean_token_accuracy": 0.5081242688158065, "step": 15339 }, { "epoch": 2.8439006303299963, "grad_norm": 8.1953125, "learning_rate": 7.1560993696700045e-06, "loss": 3.1522, "mean_token_accuracy": 0.44326562297839306, "step": 15340 }, { "epoch": 2.8440860215053765, "grad_norm": 11.7578125, "learning_rate": 7.155913978494624e-06, "loss": 2.5732, "mean_token_accuracy": 0.5439847231063017, "step": 15341 }, { "epoch": 2.8442714126807562, "grad_norm": 8.875, "learning_rate": 7.155728587319244e-06, "loss": 2.858, "mean_token_accuracy": 0.49902152641878667, "step": 15342 }, { "epoch": 2.8444568038561364, "grad_norm": 7.9921875, "learning_rate": 7.155543196143863e-06, "loss": 2.6091, "mean_token_accuracy": 0.49125218695326167, "step": 15343 }, { "epoch": 2.8446421950315166, "grad_norm": 6.73828125, "learning_rate": 7.155357804968484e-06, "loss": 2.5139, "mean_token_accuracy": 0.514889943892965, "step": 15344 }, { "epoch": 2.844827586206897, "grad_norm": 8.8359375, "learning_rate": 7.155172413793104e-06, "loss": 2.8087, "mean_token_accuracy": 0.5017280240420736, "step": 15345 }, { "epoch": 2.8450129773822765, "grad_norm": 7.96484375, "learning_rate": 7.154987022617724e-06, "loss": 2.5906, "mean_token_accuracy": 0.5117145899893504, "step": 15346 }, { "epoch": 2.8451983685576567, "grad_norm": 11.0234375, "learning_rate": 7.154801631442344e-06, "loss": 2.1383, "mean_token_accuracy": 0.5921406180026869, "step": 15347 }, { "epoch": 2.8453837597330365, "grad_norm": 7.84765625, "learning_rate": 7.154616240266964e-06, "loss": 2.6228, "mean_token_accuracy": 0.514711137581894, "step": 15348 }, { "epoch": 2.8455691509084167, "grad_norm": 7.53515625, "learning_rate": 7.154430849091584e-06, "loss": 2.8816, "mean_token_accuracy": 0.4826195278206193, "step": 15349 }, { "epoch": 2.845754542083797, "grad_norm": 9.3515625, "learning_rate": 7.1542454579162034e-06, "loss": 2.9497, "mean_token_accuracy": 0.4782608695652174, "step": 15350 }, { "epoch": 2.845939933259177, "grad_norm": 8.453125, "learning_rate": 7.154060066740823e-06, "loss": 3.5573, "mean_token_accuracy": 0.4413212293557913, "step": 15351 }, { "epoch": 2.846125324434557, "grad_norm": 8.4140625, "learning_rate": 7.153874675565443e-06, "loss": 2.5936, "mean_token_accuracy": 0.49481193255512324, "step": 15352 }, { "epoch": 2.846310715609937, "grad_norm": 10.2734375, "learning_rate": 7.153689284390064e-06, "loss": 2.8037, "mean_token_accuracy": 0.49286669638876507, "step": 15353 }, { "epoch": 2.846496106785317, "grad_norm": 9.09375, "learning_rate": 7.153503893214684e-06, "loss": 3.4853, "mean_token_accuracy": 0.4278494248867201, "step": 15354 }, { "epoch": 2.846681497960697, "grad_norm": 8.15625, "learning_rate": 7.153318502039303e-06, "loss": 3.448, "mean_token_accuracy": 0.4424838362068966, "step": 15355 }, { "epoch": 2.846866889136077, "grad_norm": 9.5703125, "learning_rate": 7.153133110863924e-06, "loss": 3.1107, "mean_token_accuracy": 0.46409389297791925, "step": 15356 }, { "epoch": 2.8470522803114573, "grad_norm": 9.546875, "learning_rate": 7.1529477196885435e-06, "loss": 2.8841, "mean_token_accuracy": 0.5011965811965812, "step": 15357 }, { "epoch": 2.8472376714868375, "grad_norm": 7.73828125, "learning_rate": 7.152762328513163e-06, "loss": 2.7977, "mean_token_accuracy": 0.4729277142059726, "step": 15358 }, { "epoch": 2.8474230626622172, "grad_norm": 7.66015625, "learning_rate": 7.152576937337783e-06, "loss": 2.2581, "mean_token_accuracy": 0.5403341976375684, "step": 15359 }, { "epoch": 2.8476084538375974, "grad_norm": 7.6484375, "learning_rate": 7.152391546162402e-06, "loss": 3.3615, "mean_token_accuracy": 0.4239086087311302, "step": 15360 }, { "epoch": 2.847793845012977, "grad_norm": 7.91796875, "learning_rate": 7.152206154987024e-06, "loss": 2.8529, "mean_token_accuracy": 0.4927097661623109, "step": 15361 }, { "epoch": 2.8479792361883574, "grad_norm": 7.38671875, "learning_rate": 7.152020763811643e-06, "loss": 2.7488, "mean_token_accuracy": 0.4978601997146933, "step": 15362 }, { "epoch": 2.8481646273637375, "grad_norm": 9.1484375, "learning_rate": 7.151835372636263e-06, "loss": 3.1621, "mean_token_accuracy": 0.4585887384176764, "step": 15363 }, { "epoch": 2.8483500185391177, "grad_norm": 9.109375, "learning_rate": 7.151649981460883e-06, "loss": 2.8604, "mean_token_accuracy": 0.48059561496072223, "step": 15364 }, { "epoch": 2.8485354097144975, "grad_norm": 7.25390625, "learning_rate": 7.151464590285503e-06, "loss": 3.4205, "mean_token_accuracy": 0.4360484134772653, "step": 15365 }, { "epoch": 2.8487208008898777, "grad_norm": 36.1875, "learning_rate": 7.151279199110123e-06, "loss": 4.6103, "mean_token_accuracy": 0.4674168418449363, "step": 15366 }, { "epoch": 2.848906192065258, "grad_norm": 8.9921875, "learning_rate": 7.1510938079347425e-06, "loss": 2.7873, "mean_token_accuracy": 0.48792585824304924, "step": 15367 }, { "epoch": 2.8490915832406376, "grad_norm": 6.63671875, "learning_rate": 7.150908416759362e-06, "loss": 3.3537, "mean_token_accuracy": 0.4453447050461976, "step": 15368 }, { "epoch": 2.849276974416018, "grad_norm": 7.51953125, "learning_rate": 7.1507230255839835e-06, "loss": 2.3749, "mean_token_accuracy": 0.5031650706054211, "step": 15369 }, { "epoch": 2.849462365591398, "grad_norm": 8.4765625, "learning_rate": 7.150537634408603e-06, "loss": 2.8218, "mean_token_accuracy": 0.48682196905288216, "step": 15370 }, { "epoch": 2.849647756766778, "grad_norm": 7.09765625, "learning_rate": 7.150352243233223e-06, "loss": 2.0373, "mean_token_accuracy": 0.5493087557603686, "step": 15371 }, { "epoch": 2.849833147942158, "grad_norm": 7.31640625, "learning_rate": 7.150166852057842e-06, "loss": 2.7381, "mean_token_accuracy": 0.5044893378226711, "step": 15372 }, { "epoch": 2.850018539117538, "grad_norm": 7.96875, "learning_rate": 7.149981460882463e-06, "loss": 2.6069, "mean_token_accuracy": 0.4919678714859438, "step": 15373 }, { "epoch": 2.850203930292918, "grad_norm": 9.921875, "learning_rate": 7.1497960697070825e-06, "loss": 3.1815, "mean_token_accuracy": 0.4425312568545734, "step": 15374 }, { "epoch": 2.850389321468298, "grad_norm": 7.7734375, "learning_rate": 7.149610678531702e-06, "loss": 3.3214, "mean_token_accuracy": 0.46538311940879035, "step": 15375 }, { "epoch": 2.8505747126436782, "grad_norm": 8.96875, "learning_rate": 7.149425287356322e-06, "loss": 2.8184, "mean_token_accuracy": 0.5161367154936287, "step": 15376 }, { "epoch": 2.8507601038190584, "grad_norm": 8.640625, "learning_rate": 7.149239896180943e-06, "loss": 2.7493, "mean_token_accuracy": 0.4799309948246118, "step": 15377 }, { "epoch": 2.850945494994438, "grad_norm": 7.13671875, "learning_rate": 7.149054505005563e-06, "loss": 3.0697, "mean_token_accuracy": 0.4699612403100775, "step": 15378 }, { "epoch": 2.8511308861698184, "grad_norm": 7.296875, "learning_rate": 7.1488691138301824e-06, "loss": 3.37, "mean_token_accuracy": 0.46646380119183467, "step": 15379 }, { "epoch": 2.851316277345198, "grad_norm": 6.91015625, "learning_rate": 7.148683722654802e-06, "loss": 3.1996, "mean_token_accuracy": 0.452587274077742, "step": 15380 }, { "epoch": 2.8515016685205783, "grad_norm": 6.8828125, "learning_rate": 7.148498331479422e-06, "loss": 3.1867, "mean_token_accuracy": 0.4593460008837826, "step": 15381 }, { "epoch": 2.8516870596959585, "grad_norm": 7.421875, "learning_rate": 7.148312940304042e-06, "loss": 3.5187, "mean_token_accuracy": 0.4282899366643209, "step": 15382 }, { "epoch": 2.8518724508713387, "grad_norm": 7.3203125, "learning_rate": 7.148127549128662e-06, "loss": 2.4259, "mean_token_accuracy": 0.547911547911548, "step": 15383 }, { "epoch": 2.8520578420467184, "grad_norm": 7.2578125, "learning_rate": 7.1479421579532815e-06, "loss": 3.5647, "mean_token_accuracy": 0.4451204672669749, "step": 15384 }, { "epoch": 2.8522432332220986, "grad_norm": 8.0546875, "learning_rate": 7.147756766777902e-06, "loss": 3.2734, "mean_token_accuracy": 0.46436176875861207, "step": 15385 }, { "epoch": 2.852428624397479, "grad_norm": 8.5625, "learning_rate": 7.1475713756025225e-06, "loss": 2.7901, "mean_token_accuracy": 0.4898785425101215, "step": 15386 }, { "epoch": 2.8526140155728585, "grad_norm": 11.109375, "learning_rate": 7.147385984427142e-06, "loss": 2.3004, "mean_token_accuracy": 0.5293980672953779, "step": 15387 }, { "epoch": 2.8527994067482387, "grad_norm": 9.7890625, "learning_rate": 7.147200593251762e-06, "loss": 2.6833, "mean_token_accuracy": 0.49607843137254903, "step": 15388 }, { "epoch": 2.852984797923619, "grad_norm": 9.0703125, "learning_rate": 7.147015202076381e-06, "loss": 2.774, "mean_token_accuracy": 0.46868801360158685, "step": 15389 }, { "epoch": 2.853170189098999, "grad_norm": 7.109375, "learning_rate": 7.146829810901001e-06, "loss": 2.7155, "mean_token_accuracy": 0.4909630428918263, "step": 15390 }, { "epoch": 2.853355580274379, "grad_norm": 8.46875, "learning_rate": 7.1466444197256216e-06, "loss": 2.7526, "mean_token_accuracy": 0.5096236713588049, "step": 15391 }, { "epoch": 2.853540971449759, "grad_norm": 11.5390625, "learning_rate": 7.146459028550241e-06, "loss": 2.9017, "mean_token_accuracy": 0.4709328526363227, "step": 15392 }, { "epoch": 2.853726362625139, "grad_norm": 9.484375, "learning_rate": 7.146273637374862e-06, "loss": 2.1755, "mean_token_accuracy": 0.5425426663368785, "step": 15393 }, { "epoch": 2.853911753800519, "grad_norm": 12.640625, "learning_rate": 7.146088246199482e-06, "loss": 3.0755, "mean_token_accuracy": 0.463776465504825, "step": 15394 }, { "epoch": 2.854097144975899, "grad_norm": 8.671875, "learning_rate": 7.145902855024102e-06, "loss": 3.12, "mean_token_accuracy": 0.4854199912955172, "step": 15395 }, { "epoch": 2.8542825361512794, "grad_norm": 7.7109375, "learning_rate": 7.1457174638487215e-06, "loss": 3.2935, "mean_token_accuracy": 0.4643468339988591, "step": 15396 }, { "epoch": 2.854467927326659, "grad_norm": 8.0, "learning_rate": 7.145532072673341e-06, "loss": 2.9534, "mean_token_accuracy": 0.48430351790503234, "step": 15397 }, { "epoch": 2.8546533185020393, "grad_norm": 9.1640625, "learning_rate": 7.145346681497961e-06, "loss": 2.866, "mean_token_accuracy": 0.4780555235571939, "step": 15398 }, { "epoch": 2.8548387096774195, "grad_norm": 7.04296875, "learning_rate": 7.145161290322581e-06, "loss": 3.2361, "mean_token_accuracy": 0.44921232071478956, "step": 15399 }, { "epoch": 2.8550241008527992, "grad_norm": 7.55078125, "learning_rate": 7.144975899147201e-06, "loss": 2.8261, "mean_token_accuracy": 0.4678362573099415, "step": 15400 }, { "epoch": 2.8552094920281794, "grad_norm": 7.5546875, "learning_rate": 7.144790507971821e-06, "loss": 2.6117, "mean_token_accuracy": 0.4860059269015476, "step": 15401 }, { "epoch": 2.8553948832035596, "grad_norm": 7.85546875, "learning_rate": 7.144605116796441e-06, "loss": 2.4388, "mean_token_accuracy": 0.5091268968550693, "step": 15402 }, { "epoch": 2.85558027437894, "grad_norm": 7.63671875, "learning_rate": 7.1444197256210615e-06, "loss": 2.5938, "mean_token_accuracy": 0.5183447612173704, "step": 15403 }, { "epoch": 2.8557656655543195, "grad_norm": 9.5546875, "learning_rate": 7.144234334445681e-06, "loss": 2.504, "mean_token_accuracy": 0.5094386181369525, "step": 15404 }, { "epoch": 2.8559510567296997, "grad_norm": 8.8984375, "learning_rate": 7.144048943270301e-06, "loss": 3.4258, "mean_token_accuracy": 0.4336482231219073, "step": 15405 }, { "epoch": 2.8561364479050795, "grad_norm": 9.125, "learning_rate": 7.1438635520949205e-06, "loss": 2.941, "mean_token_accuracy": 0.47102803738317756, "step": 15406 }, { "epoch": 2.8563218390804597, "grad_norm": 7.7421875, "learning_rate": 7.14367816091954e-06, "loss": 2.7892, "mean_token_accuracy": 0.4851625727672867, "step": 15407 }, { "epoch": 2.85650723025584, "grad_norm": 10.0703125, "learning_rate": 7.143492769744161e-06, "loss": 3.4092, "mean_token_accuracy": 0.44240498882221435, "step": 15408 }, { "epoch": 2.85669262143122, "grad_norm": 11.9140625, "learning_rate": 7.143307378568781e-06, "loss": 2.1659, "mean_token_accuracy": 0.5501371913195311, "step": 15409 }, { "epoch": 2.8568780126066, "grad_norm": 7.9296875, "learning_rate": 7.143121987393401e-06, "loss": 2.6286, "mean_token_accuracy": 0.49921618925466726, "step": 15410 }, { "epoch": 2.85706340378198, "grad_norm": 8.6484375, "learning_rate": 7.142936596218021e-06, "loss": 3.133, "mean_token_accuracy": 0.46520107238605896, "step": 15411 }, { "epoch": 2.85724879495736, "grad_norm": 8.75, "learning_rate": 7.142751205042641e-06, "loss": 2.5357, "mean_token_accuracy": 0.5255925798694606, "step": 15412 }, { "epoch": 2.85743418613274, "grad_norm": 10.515625, "learning_rate": 7.1425658138672605e-06, "loss": 2.5536, "mean_token_accuracy": 0.4854693241287162, "step": 15413 }, { "epoch": 2.85761957730812, "grad_norm": 7.890625, "learning_rate": 7.14238042269188e-06, "loss": 3.3162, "mean_token_accuracy": 0.4305973356252686, "step": 15414 }, { "epoch": 2.8578049684835003, "grad_norm": 7.953125, "learning_rate": 7.1421950315165e-06, "loss": 2.9712, "mean_token_accuracy": 0.47223894977718894, "step": 15415 }, { "epoch": 2.8579903596588805, "grad_norm": 9.9609375, "learning_rate": 7.14200964034112e-06, "loss": 3.318, "mean_token_accuracy": 0.4513636895704085, "step": 15416 }, { "epoch": 2.8581757508342602, "grad_norm": 7.3828125, "learning_rate": 7.141824249165741e-06, "loss": 2.8601, "mean_token_accuracy": 0.49901356350184956, "step": 15417 }, { "epoch": 2.8583611420096404, "grad_norm": 10.03125, "learning_rate": 7.1416388579903604e-06, "loss": 3.9528, "mean_token_accuracy": 0.41701212958697986, "step": 15418 }, { "epoch": 2.85854653318502, "grad_norm": 8.6171875, "learning_rate": 7.14145346681498e-06, "loss": 3.1183, "mean_token_accuracy": 0.5009868421052631, "step": 15419 }, { "epoch": 2.8587319243604004, "grad_norm": 8.296875, "learning_rate": 7.1412680756396006e-06, "loss": 3.5853, "mean_token_accuracy": 0.44292721728755136, "step": 15420 }, { "epoch": 2.8589173155357805, "grad_norm": 9.8203125, "learning_rate": 7.14108268446422e-06, "loss": 3.1178, "mean_token_accuracy": 0.4773278835100268, "step": 15421 }, { "epoch": 2.8591027067111607, "grad_norm": 8.8359375, "learning_rate": 7.14089729328884e-06, "loss": 3.6471, "mean_token_accuracy": 0.46944724744926564, "step": 15422 }, { "epoch": 2.8592880978865405, "grad_norm": 8.4296875, "learning_rate": 7.1407119021134595e-06, "loss": 2.7932, "mean_token_accuracy": 0.48786789485508875, "step": 15423 }, { "epoch": 2.8594734890619207, "grad_norm": 9.3515625, "learning_rate": 7.140526510938079e-06, "loss": 3.7751, "mean_token_accuracy": 0.44979699330626577, "step": 15424 }, { "epoch": 2.859658880237301, "grad_norm": 9.15625, "learning_rate": 7.1403411197627005e-06, "loss": 2.8293, "mean_token_accuracy": 0.47691318327974275, "step": 15425 }, { "epoch": 2.8598442714126806, "grad_norm": 10.40625, "learning_rate": 7.14015572858732e-06, "loss": 2.6012, "mean_token_accuracy": 0.5078836046660684, "step": 15426 }, { "epoch": 2.860029662588061, "grad_norm": 8.515625, "learning_rate": 7.13997033741194e-06, "loss": 2.8402, "mean_token_accuracy": 0.47957662492546216, "step": 15427 }, { "epoch": 2.860215053763441, "grad_norm": 9.9609375, "learning_rate": 7.139784946236559e-06, "loss": 2.4463, "mean_token_accuracy": 0.5068536482496837, "step": 15428 }, { "epoch": 2.860400444938821, "grad_norm": 8.1328125, "learning_rate": 7.13959955506118e-06, "loss": 2.6924, "mean_token_accuracy": 0.5178592204770215, "step": 15429 }, { "epoch": 2.860585836114201, "grad_norm": 8.3203125, "learning_rate": 7.1394141638857995e-06, "loss": 2.2011, "mean_token_accuracy": 0.5719931497826374, "step": 15430 }, { "epoch": 2.860771227289581, "grad_norm": 8.1015625, "learning_rate": 7.139228772710419e-06, "loss": 2.9747, "mean_token_accuracy": 0.48650025471217523, "step": 15431 }, { "epoch": 2.860956618464961, "grad_norm": 8.671875, "learning_rate": 7.139043381535039e-06, "loss": 2.926, "mean_token_accuracy": 0.49029714978775013, "step": 15432 }, { "epoch": 2.861142009640341, "grad_norm": 7.16796875, "learning_rate": 7.13885799035966e-06, "loss": 2.6574, "mean_token_accuracy": 0.4761255115961801, "step": 15433 }, { "epoch": 2.8613274008157212, "grad_norm": 7.8125, "learning_rate": 7.13867259918428e-06, "loss": 3.2222, "mean_token_accuracy": 0.4931987098583649, "step": 15434 }, { "epoch": 2.8615127919911014, "grad_norm": 9.359375, "learning_rate": 7.1384872080088995e-06, "loss": 3.3731, "mean_token_accuracy": 0.43633231747714324, "step": 15435 }, { "epoch": 2.861698183166481, "grad_norm": 9.8046875, "learning_rate": 7.138301816833519e-06, "loss": 2.7059, "mean_token_accuracy": 0.499622641509434, "step": 15436 }, { "epoch": 2.8618835743418614, "grad_norm": 8.21875, "learning_rate": 7.13811642565814e-06, "loss": 3.4409, "mean_token_accuracy": 0.45554445554445555, "step": 15437 }, { "epoch": 2.862068965517241, "grad_norm": 8.0703125, "learning_rate": 7.137931034482759e-06, "loss": 3.0493, "mean_token_accuracy": 0.4828467644288982, "step": 15438 }, { "epoch": 2.8622543566926213, "grad_norm": 6.75, "learning_rate": 7.137745643307379e-06, "loss": 2.4812, "mean_token_accuracy": 0.5218516826557028, "step": 15439 }, { "epoch": 2.8624397478680015, "grad_norm": 8.2421875, "learning_rate": 7.1375602521319985e-06, "loss": 2.7305, "mean_token_accuracy": 0.49899611832418683, "step": 15440 }, { "epoch": 2.8626251390433817, "grad_norm": 6.28125, "learning_rate": 7.13737486095662e-06, "loss": 2.7991, "mean_token_accuracy": 0.476644245142003, "step": 15441 }, { "epoch": 2.862810530218762, "grad_norm": 8.1328125, "learning_rate": 7.1371894697812395e-06, "loss": 3.0243, "mean_token_accuracy": 0.49973168768446474, "step": 15442 }, { "epoch": 2.8629959213941416, "grad_norm": 7.09375, "learning_rate": 7.137004078605859e-06, "loss": 3.2036, "mean_token_accuracy": 0.465005931198102, "step": 15443 }, { "epoch": 2.863181312569522, "grad_norm": 7.7109375, "learning_rate": 7.136818687430479e-06, "loss": 3.0596, "mean_token_accuracy": 0.4708381171067738, "step": 15444 }, { "epoch": 2.8633667037449015, "grad_norm": 6.64453125, "learning_rate": 7.1366332962550984e-06, "loss": 2.6432, "mean_token_accuracy": 0.4973298644392715, "step": 15445 }, { "epoch": 2.8635520949202817, "grad_norm": 7.4921875, "learning_rate": 7.136447905079719e-06, "loss": 2.9686, "mean_token_accuracy": 0.4724025974025974, "step": 15446 }, { "epoch": 2.863737486095662, "grad_norm": 7.80859375, "learning_rate": 7.136262513904339e-06, "loss": 2.6856, "mean_token_accuracy": 0.4875851714978635, "step": 15447 }, { "epoch": 2.863922877271042, "grad_norm": 8.9375, "learning_rate": 7.136077122728958e-06, "loss": 3.8066, "mean_token_accuracy": 0.44271698113207547, "step": 15448 }, { "epoch": 2.864108268446422, "grad_norm": 7.625, "learning_rate": 7.1358917315535796e-06, "loss": 2.8372, "mean_token_accuracy": 0.49585234342596435, "step": 15449 }, { "epoch": 2.864293659621802, "grad_norm": 11.1015625, "learning_rate": 7.135706340378199e-06, "loss": 3.1271, "mean_token_accuracy": 0.5034405504880781, "step": 15450 }, { "epoch": 2.864479050797182, "grad_norm": 7.53125, "learning_rate": 7.135520949202819e-06, "loss": 3.1014, "mean_token_accuracy": 0.46648192135128, "step": 15451 }, { "epoch": 2.864664441972562, "grad_norm": 6.63671875, "learning_rate": 7.1353355580274385e-06, "loss": 2.894, "mean_token_accuracy": 0.45373525557011796, "step": 15452 }, { "epoch": 2.864849833147942, "grad_norm": 7.92578125, "learning_rate": 7.135150166852058e-06, "loss": 3.4305, "mean_token_accuracy": 0.42160560344827586, "step": 15453 }, { "epoch": 2.8650352243233224, "grad_norm": 7.140625, "learning_rate": 7.134964775676679e-06, "loss": 3.0192, "mean_token_accuracy": 0.4666581827437007, "step": 15454 }, { "epoch": 2.865220615498702, "grad_norm": 7.05078125, "learning_rate": 7.134779384501298e-06, "loss": 2.7483, "mean_token_accuracy": 0.482048901268957, "step": 15455 }, { "epoch": 2.8654060066740823, "grad_norm": 7.49609375, "learning_rate": 7.134593993325918e-06, "loss": 2.423, "mean_token_accuracy": 0.5359148684599468, "step": 15456 }, { "epoch": 2.8655913978494625, "grad_norm": 7.46875, "learning_rate": 7.134408602150538e-06, "loss": 2.9269, "mean_token_accuracy": 0.47923363764947924, "step": 15457 }, { "epoch": 2.8657767890248422, "grad_norm": 8.90625, "learning_rate": 7.134223210975159e-06, "loss": 2.078, "mean_token_accuracy": 0.572247237003684, "step": 15458 }, { "epoch": 2.8659621802002224, "grad_norm": 7.15234375, "learning_rate": 7.1340378197997785e-06, "loss": 2.8335, "mean_token_accuracy": 0.4735194992778045, "step": 15459 }, { "epoch": 2.8661475713756026, "grad_norm": 7.06640625, "learning_rate": 7.133852428624398e-06, "loss": 2.8685, "mean_token_accuracy": 0.4880315762668704, "step": 15460 }, { "epoch": 2.866332962550983, "grad_norm": 9.4140625, "learning_rate": 7.133667037449018e-06, "loss": 2.8043, "mean_token_accuracy": 0.47434715821812595, "step": 15461 }, { "epoch": 2.8665183537263625, "grad_norm": 7.0703125, "learning_rate": 7.1334816462736375e-06, "loss": 2.6912, "mean_token_accuracy": 0.47393193338160755, "step": 15462 }, { "epoch": 2.8667037449017427, "grad_norm": 7.57421875, "learning_rate": 7.133296255098258e-06, "loss": 2.6968, "mean_token_accuracy": 0.4786856127886323, "step": 15463 }, { "epoch": 2.8668891360771225, "grad_norm": 7.12109375, "learning_rate": 7.133110863922878e-06, "loss": 2.8232, "mean_token_accuracy": 0.46379897785349233, "step": 15464 }, { "epoch": 2.8670745272525027, "grad_norm": 8.1796875, "learning_rate": 7.132925472747497e-06, "loss": 3.0142, "mean_token_accuracy": 0.4540380047505938, "step": 15465 }, { "epoch": 2.867259918427883, "grad_norm": 9.5859375, "learning_rate": 7.132740081572118e-06, "loss": 3.1617, "mean_token_accuracy": 0.45264891572879495, "step": 15466 }, { "epoch": 2.867445309603263, "grad_norm": 8.9609375, "learning_rate": 7.132554690396738e-06, "loss": 3.4215, "mean_token_accuracy": 0.4589661604050093, "step": 15467 }, { "epoch": 2.867630700778643, "grad_norm": 6.796875, "learning_rate": 7.132369299221358e-06, "loss": 2.649, "mean_token_accuracy": 0.504007694773966, "step": 15468 }, { "epoch": 2.867816091954023, "grad_norm": 7.63671875, "learning_rate": 7.1321839080459775e-06, "loss": 2.9383, "mean_token_accuracy": 0.4481786133960047, "step": 15469 }, { "epoch": 2.868001483129403, "grad_norm": 6.8828125, "learning_rate": 7.131998516870597e-06, "loss": 2.8922, "mean_token_accuracy": 0.4793646291306198, "step": 15470 }, { "epoch": 2.868186874304783, "grad_norm": 8.0078125, "learning_rate": 7.131813125695217e-06, "loss": 2.7249, "mean_token_accuracy": 0.5054680664916885, "step": 15471 }, { "epoch": 2.868372265480163, "grad_norm": 8.84375, "learning_rate": 7.131627734519837e-06, "loss": 2.662, "mean_token_accuracy": 0.5258299836236415, "step": 15472 }, { "epoch": 2.8685576566555433, "grad_norm": 11.71875, "learning_rate": 7.131442343344457e-06, "loss": 2.0144, "mean_token_accuracy": 0.5530789573101624, "step": 15473 }, { "epoch": 2.8687430478309235, "grad_norm": 8.171875, "learning_rate": 7.1312569521690774e-06, "loss": 3.3912, "mean_token_accuracy": 0.43987751737133435, "step": 15474 }, { "epoch": 2.8689284390063032, "grad_norm": 9.9140625, "learning_rate": 7.131071560993698e-06, "loss": 2.7687, "mean_token_accuracy": 0.4986634589681903, "step": 15475 }, { "epoch": 2.8691138301816834, "grad_norm": 9.4609375, "learning_rate": 7.130886169818318e-06, "loss": 4.1266, "mean_token_accuracy": 0.416520979020979, "step": 15476 }, { "epoch": 2.869299221357063, "grad_norm": 8.1015625, "learning_rate": 7.130700778642937e-06, "loss": 2.6428, "mean_token_accuracy": 0.4990616202690022, "step": 15477 }, { "epoch": 2.8694846125324434, "grad_norm": 10.34375, "learning_rate": 7.130515387467557e-06, "loss": 3.0755, "mean_token_accuracy": 0.4734674667607954, "step": 15478 }, { "epoch": 2.8696700037078235, "grad_norm": 8.4453125, "learning_rate": 7.1303299962921765e-06, "loss": 3.1005, "mean_token_accuracy": 0.4685714285714286, "step": 15479 }, { "epoch": 2.8698553948832037, "grad_norm": 8.84375, "learning_rate": 7.130144605116797e-06, "loss": 2.6627, "mean_token_accuracy": 0.5176203966005666, "step": 15480 }, { "epoch": 2.8700407860585835, "grad_norm": 10.3671875, "learning_rate": 7.129959213941417e-06, "loss": 3.112, "mean_token_accuracy": 0.47888161808447355, "step": 15481 }, { "epoch": 2.8702261772339637, "grad_norm": 8.578125, "learning_rate": 7.129773822766037e-06, "loss": 2.9368, "mean_token_accuracy": 0.4525049603174603, "step": 15482 }, { "epoch": 2.870411568409344, "grad_norm": 7.203125, "learning_rate": 7.129588431590657e-06, "loss": 3.0739, "mean_token_accuracy": 0.4537876042044219, "step": 15483 }, { "epoch": 2.8705969595847236, "grad_norm": 7.06640625, "learning_rate": 7.129403040415277e-06, "loss": 2.594, "mean_token_accuracy": 0.5207286432160804, "step": 15484 }, { "epoch": 2.870782350760104, "grad_norm": 8.5234375, "learning_rate": 7.129217649239897e-06, "loss": 3.1862, "mean_token_accuracy": 0.4503126395712372, "step": 15485 }, { "epoch": 2.870967741935484, "grad_norm": 8.1875, "learning_rate": 7.1290322580645166e-06, "loss": 2.7443, "mean_token_accuracy": 0.4899571505088377, "step": 15486 }, { "epoch": 2.871153133110864, "grad_norm": 7.67578125, "learning_rate": 7.128846866889136e-06, "loss": 3.5087, "mean_token_accuracy": 0.4472661311220523, "step": 15487 }, { "epoch": 2.871338524286244, "grad_norm": 7.8203125, "learning_rate": 7.128661475713756e-06, "loss": 2.9899, "mean_token_accuracy": 0.4697558268590455, "step": 15488 }, { "epoch": 2.871523915461624, "grad_norm": 9.90625, "learning_rate": 7.128476084538376e-06, "loss": 2.9245, "mean_token_accuracy": 0.48659003831417624, "step": 15489 }, { "epoch": 2.871709306637004, "grad_norm": 8.453125, "learning_rate": 7.128290693362997e-06, "loss": 2.9171, "mean_token_accuracy": 0.47273430449916204, "step": 15490 }, { "epoch": 2.871894697812384, "grad_norm": 8.5546875, "learning_rate": 7.1281053021876165e-06, "loss": 3.2799, "mean_token_accuracy": 0.4607336956521739, "step": 15491 }, { "epoch": 2.8720800889877642, "grad_norm": 8.71875, "learning_rate": 7.127919911012237e-06, "loss": 2.6558, "mean_token_accuracy": 0.48501152959262106, "step": 15492 }, { "epoch": 2.8722654801631444, "grad_norm": 10.015625, "learning_rate": 7.127734519836857e-06, "loss": 2.48, "mean_token_accuracy": 0.5230312035661219, "step": 15493 }, { "epoch": 2.872450871338524, "grad_norm": 8.6015625, "learning_rate": 7.127549128661476e-06, "loss": 3.0757, "mean_token_accuracy": 0.47432550043516103, "step": 15494 }, { "epoch": 2.8726362625139044, "grad_norm": 7.80078125, "learning_rate": 7.127363737486096e-06, "loss": 2.2756, "mean_token_accuracy": 0.5087077418410654, "step": 15495 }, { "epoch": 2.8728216536892845, "grad_norm": 8.5390625, "learning_rate": 7.1271783463107155e-06, "loss": 3.1052, "mean_token_accuracy": 0.45910547582555383, "step": 15496 }, { "epoch": 2.8730070448646643, "grad_norm": 6.5234375, "learning_rate": 7.126992955135336e-06, "loss": 3.311, "mean_token_accuracy": 0.4493084786530367, "step": 15497 }, { "epoch": 2.8731924360400445, "grad_norm": 12.40625, "learning_rate": 7.1268075639599565e-06, "loss": 3.5757, "mean_token_accuracy": 0.4381711118808452, "step": 15498 }, { "epoch": 2.8733778272154247, "grad_norm": 8.6796875, "learning_rate": 7.126622172784576e-06, "loss": 3.0308, "mean_token_accuracy": 0.44908086162174354, "step": 15499 }, { "epoch": 2.873563218390805, "grad_norm": 8.8671875, "learning_rate": 7.126436781609196e-06, "loss": 2.6993, "mean_token_accuracy": 0.511962447001817, "step": 15500 }, { "epoch": 2.8737486095661846, "grad_norm": 8.3984375, "learning_rate": 7.126251390433816e-06, "loss": 2.7436, "mean_token_accuracy": 0.4775628871192011, "step": 15501 }, { "epoch": 2.873934000741565, "grad_norm": 9.3984375, "learning_rate": 7.126065999258436e-06, "loss": 2.5437, "mean_token_accuracy": 0.4955933069357517, "step": 15502 }, { "epoch": 2.8741193919169445, "grad_norm": 12.5, "learning_rate": 7.125880608083056e-06, "loss": 3.8848, "mean_token_accuracy": 0.45644955300127715, "step": 15503 }, { "epoch": 2.8743047830923247, "grad_norm": 9.90625, "learning_rate": 7.125695216907675e-06, "loss": 2.44, "mean_token_accuracy": 0.5202205882352942, "step": 15504 }, { "epoch": 2.874490174267705, "grad_norm": 7.37109375, "learning_rate": 7.125509825732295e-06, "loss": 2.5429, "mean_token_accuracy": 0.5170721297481861, "step": 15505 }, { "epoch": 2.874675565443085, "grad_norm": 8.5234375, "learning_rate": 7.125324434556916e-06, "loss": 3.0047, "mean_token_accuracy": 0.4664682993820096, "step": 15506 }, { "epoch": 2.874860956618465, "grad_norm": 10.1640625, "learning_rate": 7.125139043381536e-06, "loss": 3.8155, "mean_token_accuracy": 0.4349201328483315, "step": 15507 }, { "epoch": 2.875046347793845, "grad_norm": 8.359375, "learning_rate": 7.1249536522061555e-06, "loss": 3.2474, "mean_token_accuracy": 0.43636363636363634, "step": 15508 }, { "epoch": 2.875231738969225, "grad_norm": 8.21875, "learning_rate": 7.124768261030775e-06, "loss": 3.1538, "mean_token_accuracy": 0.4579958342976163, "step": 15509 }, { "epoch": 2.875417130144605, "grad_norm": 7.39453125, "learning_rate": 7.124582869855396e-06, "loss": 2.7179, "mean_token_accuracy": 0.4927060060454725, "step": 15510 }, { "epoch": 2.875602521319985, "grad_norm": 7.28515625, "learning_rate": 7.124397478680015e-06, "loss": 2.7276, "mean_token_accuracy": 0.500557880055788, "step": 15511 }, { "epoch": 2.8757879124953654, "grad_norm": 11.2734375, "learning_rate": 7.124212087504635e-06, "loss": 3.0326, "mean_token_accuracy": 0.4778938906752412, "step": 15512 }, { "epoch": 2.875973303670745, "grad_norm": 9.296875, "learning_rate": 7.124026696329255e-06, "loss": 3.39, "mean_token_accuracy": 0.4228552051542896, "step": 15513 }, { "epoch": 2.8761586948461253, "grad_norm": 9.1796875, "learning_rate": 7.123841305153876e-06, "loss": 3.0318, "mean_token_accuracy": 0.4806849315068493, "step": 15514 }, { "epoch": 2.8763440860215055, "grad_norm": 6.8671875, "learning_rate": 7.1236559139784956e-06, "loss": 2.9514, "mean_token_accuracy": 0.4666772201994618, "step": 15515 }, { "epoch": 2.8765294771968852, "grad_norm": 7.8671875, "learning_rate": 7.123470522803115e-06, "loss": 2.623, "mean_token_accuracy": 0.48609916881627974, "step": 15516 }, { "epoch": 2.8767148683722654, "grad_norm": 8.2109375, "learning_rate": 7.123285131627735e-06, "loss": 3.0368, "mean_token_accuracy": 0.5015454113171659, "step": 15517 }, { "epoch": 2.8769002595476456, "grad_norm": 14.4609375, "learning_rate": 7.123099740452355e-06, "loss": 2.6978, "mean_token_accuracy": 0.47697893972403776, "step": 15518 }, { "epoch": 2.877085650723026, "grad_norm": 11.328125, "learning_rate": 7.122914349276975e-06, "loss": 4.5036, "mean_token_accuracy": 0.3924077402981918, "step": 15519 }, { "epoch": 2.8772710418984055, "grad_norm": 8.8046875, "learning_rate": 7.122728958101595e-06, "loss": 3.4734, "mean_token_accuracy": 0.4639618138424821, "step": 15520 }, { "epoch": 2.8774564330737857, "grad_norm": 7.7421875, "learning_rate": 7.122543566926214e-06, "loss": 2.8755, "mean_token_accuracy": 0.47150180940892644, "step": 15521 }, { "epoch": 2.8776418242491655, "grad_norm": 8.203125, "learning_rate": 7.122358175750836e-06, "loss": 2.768, "mean_token_accuracy": 0.48633668471255476, "step": 15522 }, { "epoch": 2.8778272154245457, "grad_norm": 10.4921875, "learning_rate": 7.122172784575455e-06, "loss": 3.1767, "mean_token_accuracy": 0.5161994485294118, "step": 15523 }, { "epoch": 2.878012606599926, "grad_norm": 7.83984375, "learning_rate": 7.121987393400075e-06, "loss": 2.6629, "mean_token_accuracy": 0.5141992551210428, "step": 15524 }, { "epoch": 2.878197997775306, "grad_norm": 8.296875, "learning_rate": 7.1218020022246946e-06, "loss": 3.6966, "mean_token_accuracy": 0.42244372270131036, "step": 15525 }, { "epoch": 2.878383388950686, "grad_norm": 8.3359375, "learning_rate": 7.121616611049314e-06, "loss": 2.6353, "mean_token_accuracy": 0.5074689669682306, "step": 15526 }, { "epoch": 2.878568780126066, "grad_norm": 7.33203125, "learning_rate": 7.121431219873935e-06, "loss": 2.738, "mean_token_accuracy": 0.4708940883915452, "step": 15527 }, { "epoch": 2.878754171301446, "grad_norm": 6.40234375, "learning_rate": 7.121245828698554e-06, "loss": 2.7045, "mean_token_accuracy": 0.4862919808087731, "step": 15528 }, { "epoch": 2.878939562476826, "grad_norm": 6.5703125, "learning_rate": 7.121060437523174e-06, "loss": 2.6032, "mean_token_accuracy": 0.4879054425508521, "step": 15529 }, { "epoch": 2.879124953652206, "grad_norm": 6.24609375, "learning_rate": 7.120875046347795e-06, "loss": 2.8447, "mean_token_accuracy": 0.4871415566681839, "step": 15530 }, { "epoch": 2.8793103448275863, "grad_norm": 9.3984375, "learning_rate": 7.120689655172415e-06, "loss": 3.0904, "mean_token_accuracy": 0.49290973547859285, "step": 15531 }, { "epoch": 2.8794957360029665, "grad_norm": 7.9296875, "learning_rate": 7.120504263997035e-06, "loss": 3.0391, "mean_token_accuracy": 0.4795808704997313, "step": 15532 }, { "epoch": 2.8796811271783462, "grad_norm": 7.62109375, "learning_rate": 7.120318872821654e-06, "loss": 2.8077, "mean_token_accuracy": 0.4823159549384333, "step": 15533 }, { "epoch": 2.8798665183537264, "grad_norm": 7.62890625, "learning_rate": 7.120133481646274e-06, "loss": 2.5638, "mean_token_accuracy": 0.49315897808451387, "step": 15534 }, { "epoch": 2.880051909529106, "grad_norm": 8.2109375, "learning_rate": 7.119948090470894e-06, "loss": 2.8838, "mean_token_accuracy": 0.4901139799953478, "step": 15535 }, { "epoch": 2.8802373007044864, "grad_norm": 7.515625, "learning_rate": 7.119762699295514e-06, "loss": 2.8844, "mean_token_accuracy": 0.5025048705816866, "step": 15536 }, { "epoch": 2.8804226918798665, "grad_norm": 8.171875, "learning_rate": 7.119577308120134e-06, "loss": 2.8269, "mean_token_accuracy": 0.4758729388942774, "step": 15537 }, { "epoch": 2.8806080830552467, "grad_norm": 8.2734375, "learning_rate": 7.119391916944754e-06, "loss": 3.271, "mean_token_accuracy": 0.4321691613209789, "step": 15538 }, { "epoch": 2.8807934742306265, "grad_norm": 12.3515625, "learning_rate": 7.119206525769375e-06, "loss": 3.1054, "mean_token_accuracy": 0.44327573253193087, "step": 15539 }, { "epoch": 2.8809788654060067, "grad_norm": 8.875, "learning_rate": 7.119021134593994e-06, "loss": 2.5713, "mean_token_accuracy": 0.5065471275215289, "step": 15540 }, { "epoch": 2.881164256581387, "grad_norm": 6.953125, "learning_rate": 7.118835743418614e-06, "loss": 2.4565, "mean_token_accuracy": 0.5129848229342328, "step": 15541 }, { "epoch": 2.8813496477567666, "grad_norm": 6.82421875, "learning_rate": 7.118650352243234e-06, "loss": 2.525, "mean_token_accuracy": 0.5035698010025824, "step": 15542 }, { "epoch": 2.881535038932147, "grad_norm": 8.2734375, "learning_rate": 7.118464961067853e-06, "loss": 2.6406, "mean_token_accuracy": 0.504462388440289, "step": 15543 }, { "epoch": 2.881720430107527, "grad_norm": 8.4296875, "learning_rate": 7.118279569892474e-06, "loss": 2.5626, "mean_token_accuracy": 0.5145477545857052, "step": 15544 }, { "epoch": 2.881905821282907, "grad_norm": 7.390625, "learning_rate": 7.118094178717093e-06, "loss": 3.4419, "mean_token_accuracy": 0.4398822869955157, "step": 15545 }, { "epoch": 2.882091212458287, "grad_norm": 7.87109375, "learning_rate": 7.117908787541714e-06, "loss": 2.9308, "mean_token_accuracy": 0.4707492302429011, "step": 15546 }, { "epoch": 2.882276603633667, "grad_norm": 10.53125, "learning_rate": 7.1177233963663335e-06, "loss": 2.997, "mean_token_accuracy": 0.469044558697515, "step": 15547 }, { "epoch": 2.882461994809047, "grad_norm": 9.6171875, "learning_rate": 7.117538005190954e-06, "loss": 3.95, "mean_token_accuracy": 0.44687875150060025, "step": 15548 }, { "epoch": 2.882647385984427, "grad_norm": 8.4609375, "learning_rate": 7.117352614015574e-06, "loss": 2.6059, "mean_token_accuracy": 0.5539921465968587, "step": 15549 }, { "epoch": 2.8828327771598072, "grad_norm": 8.1484375, "learning_rate": 7.117167222840193e-06, "loss": 3.1906, "mean_token_accuracy": 0.45764576457645767, "step": 15550 }, { "epoch": 2.8830181683351874, "grad_norm": 7.4453125, "learning_rate": 7.116981831664813e-06, "loss": 3.1178, "mean_token_accuracy": 0.4699074074074074, "step": 15551 }, { "epoch": 2.883203559510567, "grad_norm": 8.4765625, "learning_rate": 7.1167964404894326e-06, "loss": 2.4148, "mean_token_accuracy": 0.5143184421534936, "step": 15552 }, { "epoch": 2.8833889506859474, "grad_norm": 7.0078125, "learning_rate": 7.116611049314053e-06, "loss": 2.2574, "mean_token_accuracy": 0.5437966685812752, "step": 15553 }, { "epoch": 2.8835743418613276, "grad_norm": 7.41796875, "learning_rate": 7.1164256581386736e-06, "loss": 2.7018, "mean_token_accuracy": 0.5048031389527804, "step": 15554 }, { "epoch": 2.8837597330367073, "grad_norm": 9.875, "learning_rate": 7.116240266963293e-06, "loss": 3.5147, "mean_token_accuracy": 0.4680436477007015, "step": 15555 }, { "epoch": 2.8839451242120875, "grad_norm": 8.109375, "learning_rate": 7.116054875787914e-06, "loss": 3.1235, "mean_token_accuracy": 0.49077076577700174, "step": 15556 }, { "epoch": 2.8841305153874677, "grad_norm": 8.2265625, "learning_rate": 7.115869484612533e-06, "loss": 2.95, "mean_token_accuracy": 0.4889652438218581, "step": 15557 }, { "epoch": 2.884315906562848, "grad_norm": 7.9375, "learning_rate": 7.115684093437153e-06, "loss": 2.3968, "mean_token_accuracy": 0.5134742951907131, "step": 15558 }, { "epoch": 2.8845012977382276, "grad_norm": 7.3984375, "learning_rate": 7.115498702261773e-06, "loss": 2.2958, "mean_token_accuracy": 0.5257232916807385, "step": 15559 }, { "epoch": 2.884686688913608, "grad_norm": 10.421875, "learning_rate": 7.115313311086392e-06, "loss": 2.6557, "mean_token_accuracy": 0.5358150786184652, "step": 15560 }, { "epoch": 2.8848720800889875, "grad_norm": 9.21875, "learning_rate": 7.115127919911013e-06, "loss": 2.734, "mean_token_accuracy": 0.5150399017802333, "step": 15561 }, { "epoch": 2.8850574712643677, "grad_norm": 7.80859375, "learning_rate": 7.114942528735633e-06, "loss": 3.1243, "mean_token_accuracy": 0.4711073754990544, "step": 15562 }, { "epoch": 2.885242862439748, "grad_norm": 12.7421875, "learning_rate": 7.114757137560253e-06, "loss": 2.5693, "mean_token_accuracy": 0.5060805675196351, "step": 15563 }, { "epoch": 2.885428253615128, "grad_norm": 10.203125, "learning_rate": 7.1145717463848725e-06, "loss": 3.1214, "mean_token_accuracy": 0.44798500468603564, "step": 15564 }, { "epoch": 2.885613644790508, "grad_norm": 10.59375, "learning_rate": 7.114386355209493e-06, "loss": 2.3188, "mean_token_accuracy": 0.5499074469599886, "step": 15565 }, { "epoch": 2.885799035965888, "grad_norm": 11.0, "learning_rate": 7.114200964034113e-06, "loss": 2.4669, "mean_token_accuracy": 0.5096315691041705, "step": 15566 }, { "epoch": 2.8859844271412682, "grad_norm": 7.140625, "learning_rate": 7.114015572858732e-06, "loss": 2.8679, "mean_token_accuracy": 0.46134146341463417, "step": 15567 }, { "epoch": 2.886169818316648, "grad_norm": 9.2265625, "learning_rate": 7.113830181683352e-06, "loss": 2.8338, "mean_token_accuracy": 0.4872831616016695, "step": 15568 }, { "epoch": 2.886355209492028, "grad_norm": 8.1796875, "learning_rate": 7.113644790507972e-06, "loss": 3.3915, "mean_token_accuracy": 0.437449815320379, "step": 15569 }, { "epoch": 2.8865406006674084, "grad_norm": 13.7421875, "learning_rate": 7.113459399332593e-06, "loss": 3.0637, "mean_token_accuracy": 0.4847715736040609, "step": 15570 }, { "epoch": 2.8867259918427886, "grad_norm": 8.0625, "learning_rate": 7.113274008157213e-06, "loss": 2.8374, "mean_token_accuracy": 0.510624387054593, "step": 15571 }, { "epoch": 2.8869113830181683, "grad_norm": 7.64453125, "learning_rate": 7.113088616981832e-06, "loss": 3.1731, "mean_token_accuracy": 0.44519846350832265, "step": 15572 }, { "epoch": 2.8870967741935485, "grad_norm": 10.6875, "learning_rate": 7.112903225806453e-06, "loss": 2.9545, "mean_token_accuracy": 0.4576030317385126, "step": 15573 }, { "epoch": 2.8872821653689282, "grad_norm": 11.2421875, "learning_rate": 7.112717834631072e-06, "loss": 2.7228, "mean_token_accuracy": 0.48887403304364435, "step": 15574 }, { "epoch": 2.8874675565443084, "grad_norm": 10.3515625, "learning_rate": 7.112532443455692e-06, "loss": 2.6822, "mean_token_accuracy": 0.49574151683114775, "step": 15575 }, { "epoch": 2.8876529477196886, "grad_norm": 11.203125, "learning_rate": 7.112347052280312e-06, "loss": 2.9155, "mean_token_accuracy": 0.4805950117436528, "step": 15576 }, { "epoch": 2.887838338895069, "grad_norm": 10.09375, "learning_rate": 7.112161661104931e-06, "loss": 3.1315, "mean_token_accuracy": 0.4517818367607613, "step": 15577 }, { "epoch": 2.8880237300704485, "grad_norm": 12.0078125, "learning_rate": 7.111976269929553e-06, "loss": 3.4284, "mean_token_accuracy": 0.437152133580705, "step": 15578 }, { "epoch": 2.8882091212458287, "grad_norm": 11.7578125, "learning_rate": 7.111790878754172e-06, "loss": 2.8019, "mean_token_accuracy": 0.5130685920577618, "step": 15579 }, { "epoch": 2.8883945124212085, "grad_norm": 6.62890625, "learning_rate": 7.111605487578792e-06, "loss": 2.6654, "mean_token_accuracy": 0.49685455460493205, "step": 15580 }, { "epoch": 2.8885799035965887, "grad_norm": 8.3828125, "learning_rate": 7.1114200964034116e-06, "loss": 2.8834, "mean_token_accuracy": 0.4901065449010654, "step": 15581 }, { "epoch": 2.888765294771969, "grad_norm": 7.18359375, "learning_rate": 7.111234705228032e-06, "loss": 3.39, "mean_token_accuracy": 0.4440852490421456, "step": 15582 }, { "epoch": 2.888950685947349, "grad_norm": 8.5625, "learning_rate": 7.111049314052652e-06, "loss": 3.0451, "mean_token_accuracy": 0.48722960646338287, "step": 15583 }, { "epoch": 2.889136077122729, "grad_norm": 8.15625, "learning_rate": 7.110863922877271e-06, "loss": 3.6271, "mean_token_accuracy": 0.4175592095338663, "step": 15584 }, { "epoch": 2.889321468298109, "grad_norm": 8.6015625, "learning_rate": 7.110678531701891e-06, "loss": 3.1728, "mean_token_accuracy": 0.44114963503649635, "step": 15585 }, { "epoch": 2.889506859473489, "grad_norm": 6.8125, "learning_rate": 7.110493140526512e-06, "loss": 2.6439, "mean_token_accuracy": 0.4919593464556799, "step": 15586 }, { "epoch": 2.889692250648869, "grad_norm": 8.375, "learning_rate": 7.110307749351132e-06, "loss": 3.044, "mean_token_accuracy": 0.47741574604950354, "step": 15587 }, { "epoch": 2.889877641824249, "grad_norm": 7.33984375, "learning_rate": 7.110122358175752e-06, "loss": 3.1407, "mean_token_accuracy": 0.46085710564022403, "step": 15588 }, { "epoch": 2.8900630329996293, "grad_norm": 7.91015625, "learning_rate": 7.109936967000371e-06, "loss": 3.1418, "mean_token_accuracy": 0.4704569481968754, "step": 15589 }, { "epoch": 2.8902484241750095, "grad_norm": 8.046875, "learning_rate": 7.109751575824991e-06, "loss": 3.1488, "mean_token_accuracy": 0.48166926677067085, "step": 15590 }, { "epoch": 2.8904338153503892, "grad_norm": 9.390625, "learning_rate": 7.109566184649611e-06, "loss": 2.4945, "mean_token_accuracy": 0.5378996527361992, "step": 15591 }, { "epoch": 2.8906192065257694, "grad_norm": 7.96875, "learning_rate": 7.109380793474231e-06, "loss": 2.6331, "mean_token_accuracy": 0.4932278396137857, "step": 15592 }, { "epoch": 2.890804597701149, "grad_norm": 7.96484375, "learning_rate": 7.109195402298851e-06, "loss": 3.2704, "mean_token_accuracy": 0.4530014261636199, "step": 15593 }, { "epoch": 2.8909899888765294, "grad_norm": 10.1328125, "learning_rate": 7.10901001112347e-06, "loss": 3.0939, "mean_token_accuracy": 0.45751850624240414, "step": 15594 }, { "epoch": 2.8911753800519095, "grad_norm": 13.0703125, "learning_rate": 7.108824619948092e-06, "loss": 1.9098, "mean_token_accuracy": 0.5603662321539417, "step": 15595 }, { "epoch": 2.8913607712272897, "grad_norm": 10.75, "learning_rate": 7.108639228772711e-06, "loss": 2.6984, "mean_token_accuracy": 0.48870776023093904, "step": 15596 }, { "epoch": 2.8915461624026695, "grad_norm": 9.5546875, "learning_rate": 7.108453837597331e-06, "loss": 3.3815, "mean_token_accuracy": 0.45513654096228867, "step": 15597 }, { "epoch": 2.8917315535780497, "grad_norm": 10.8515625, "learning_rate": 7.108268446421951e-06, "loss": 2.442, "mean_token_accuracy": 0.5482262430545661, "step": 15598 }, { "epoch": 2.89191694475343, "grad_norm": 7.921875, "learning_rate": 7.108083055246571e-06, "loss": 3.6025, "mean_token_accuracy": 0.3982974332516445, "step": 15599 }, { "epoch": 2.8921023359288096, "grad_norm": 14.1796875, "learning_rate": 7.107897664071191e-06, "loss": 2.8858, "mean_token_accuracy": 0.4919614147909968, "step": 15600 }, { "epoch": 2.89228772710419, "grad_norm": 20.625, "learning_rate": 7.10771227289581e-06, "loss": 2.9847, "mean_token_accuracy": 0.48926967226750956, "step": 15601 }, { "epoch": 2.89247311827957, "grad_norm": 8.3515625, "learning_rate": 7.10752688172043e-06, "loss": 2.6511, "mean_token_accuracy": 0.49352799518362434, "step": 15602 }, { "epoch": 2.89265850945495, "grad_norm": 7.625, "learning_rate": 7.107341490545051e-06, "loss": 2.8708, "mean_token_accuracy": 0.48746010031919745, "step": 15603 }, { "epoch": 2.89284390063033, "grad_norm": 9.046875, "learning_rate": 7.107156099369671e-06, "loss": 3.1153, "mean_token_accuracy": 0.4703951225231563, "step": 15604 }, { "epoch": 2.89302929180571, "grad_norm": 10.765625, "learning_rate": 7.106970708194291e-06, "loss": 2.6005, "mean_token_accuracy": 0.48886954358910584, "step": 15605 }, { "epoch": 2.89321468298109, "grad_norm": 7.421875, "learning_rate": 7.10678531701891e-06, "loss": 2.7559, "mean_token_accuracy": 0.4886822464211428, "step": 15606 }, { "epoch": 2.89340007415647, "grad_norm": 7.34375, "learning_rate": 7.10659992584353e-06, "loss": 2.6573, "mean_token_accuracy": 0.5344180225281602, "step": 15607 }, { "epoch": 2.8935854653318502, "grad_norm": 13.25, "learning_rate": 7.1064145346681504e-06, "loss": 2.6019, "mean_token_accuracy": 0.5141738821017673, "step": 15608 }, { "epoch": 2.8937708565072304, "grad_norm": 8.1015625, "learning_rate": 7.10622914349277e-06, "loss": 3.2425, "mean_token_accuracy": 0.4597875569044006, "step": 15609 }, { "epoch": 2.89395624768261, "grad_norm": 7.68359375, "learning_rate": 7.10604375231739e-06, "loss": 3.1906, "mean_token_accuracy": 0.45671794871794874, "step": 15610 }, { "epoch": 2.8941416388579904, "grad_norm": 8.609375, "learning_rate": 7.105858361142011e-06, "loss": 3.5021, "mean_token_accuracy": 0.4381243063263041, "step": 15611 }, { "epoch": 2.8943270300333706, "grad_norm": 7.05078125, "learning_rate": 7.105672969966631e-06, "loss": 2.8265, "mean_token_accuracy": 0.49204372492043724, "step": 15612 }, { "epoch": 2.8945124212087503, "grad_norm": 7.84375, "learning_rate": 7.10548757879125e-06, "loss": 2.8022, "mean_token_accuracy": 0.4990352146647371, "step": 15613 }, { "epoch": 2.8946978123841305, "grad_norm": 9.5078125, "learning_rate": 7.10530218761587e-06, "loss": 2.844, "mean_token_accuracy": 0.49016799062377914, "step": 15614 }, { "epoch": 2.8948832035595107, "grad_norm": 7.72265625, "learning_rate": 7.10511679644049e-06, "loss": 2.8177, "mean_token_accuracy": 0.48145810320340854, "step": 15615 }, { "epoch": 2.895068594734891, "grad_norm": 7.50390625, "learning_rate": 7.10493140526511e-06, "loss": 2.7389, "mean_token_accuracy": 0.5150162337662337, "step": 15616 }, { "epoch": 2.8952539859102706, "grad_norm": 15.7578125, "learning_rate": 7.10474601408973e-06, "loss": 3.1561, "mean_token_accuracy": 0.47682439791843156, "step": 15617 }, { "epoch": 2.895439377085651, "grad_norm": 9.4296875, "learning_rate": 7.104560622914349e-06, "loss": 2.738, "mean_token_accuracy": 0.5141163625019498, "step": 15618 }, { "epoch": 2.8956247682610305, "grad_norm": 7.75390625, "learning_rate": 7.10437523173897e-06, "loss": 2.9768, "mean_token_accuracy": 0.4658847089114576, "step": 15619 }, { "epoch": 2.8958101594364107, "grad_norm": 7.05859375, "learning_rate": 7.10418984056359e-06, "loss": 2.4236, "mean_token_accuracy": 0.5262858443331816, "step": 15620 }, { "epoch": 2.895995550611791, "grad_norm": 7.7109375, "learning_rate": 7.10400444938821e-06, "loss": 3.1566, "mean_token_accuracy": 0.4694946974422957, "step": 15621 }, { "epoch": 2.896180941787171, "grad_norm": 12.8671875, "learning_rate": 7.10381905821283e-06, "loss": 2.6538, "mean_token_accuracy": 0.5302201297875048, "step": 15622 }, { "epoch": 2.896366332962551, "grad_norm": 8.0390625, "learning_rate": 7.103633667037449e-06, "loss": 3.3227, "mean_token_accuracy": 0.46773971660475994, "step": 15623 }, { "epoch": 2.896551724137931, "grad_norm": 7.67578125, "learning_rate": 7.103448275862069e-06, "loss": 3.6951, "mean_token_accuracy": 0.4410604192355117, "step": 15624 }, { "epoch": 2.8967371153133112, "grad_norm": 8.078125, "learning_rate": 7.1032628846866895e-06, "loss": 3.2513, "mean_token_accuracy": 0.45279451731064635, "step": 15625 }, { "epoch": 2.896922506488691, "grad_norm": 8.5390625, "learning_rate": 7.103077493511309e-06, "loss": 3.6513, "mean_token_accuracy": 0.4430601092896175, "step": 15626 }, { "epoch": 2.897107897664071, "grad_norm": 7.69140625, "learning_rate": 7.10289210233593e-06, "loss": 2.8634, "mean_token_accuracy": 0.4756035578144854, "step": 15627 }, { "epoch": 2.8972932888394514, "grad_norm": 7.5703125, "learning_rate": 7.102706711160549e-06, "loss": 3.2001, "mean_token_accuracy": 0.44053064958828914, "step": 15628 }, { "epoch": 2.8974786800148316, "grad_norm": 12.953125, "learning_rate": 7.10252131998517e-06, "loss": 2.7398, "mean_token_accuracy": 0.5194516053433326, "step": 15629 }, { "epoch": 2.8976640711902113, "grad_norm": 9.4296875, "learning_rate": 7.102335928809789e-06, "loss": 3.0298, "mean_token_accuracy": 0.46626865671641793, "step": 15630 }, { "epoch": 2.8978494623655915, "grad_norm": 6.73828125, "learning_rate": 7.102150537634409e-06, "loss": 3.107, "mean_token_accuracy": 0.47746071133167906, "step": 15631 }, { "epoch": 2.8980348535409712, "grad_norm": 9.1640625, "learning_rate": 7.101965146459029e-06, "loss": 3.3361, "mean_token_accuracy": 0.4469902912621359, "step": 15632 }, { "epoch": 2.8982202447163514, "grad_norm": 8.4921875, "learning_rate": 7.101779755283648e-06, "loss": 3.7476, "mean_token_accuracy": 0.4521452145214521, "step": 15633 }, { "epoch": 2.8984056358917316, "grad_norm": 7.63671875, "learning_rate": 7.101594364108269e-06, "loss": 3.2099, "mean_token_accuracy": 0.4664274730907227, "step": 15634 }, { "epoch": 2.898591027067112, "grad_norm": 8.40625, "learning_rate": 7.101408972932889e-06, "loss": 2.4616, "mean_token_accuracy": 0.5061302225925485, "step": 15635 }, { "epoch": 2.8987764182424915, "grad_norm": 7.703125, "learning_rate": 7.101223581757509e-06, "loss": 3.4479, "mean_token_accuracy": 0.4773887673231218, "step": 15636 }, { "epoch": 2.8989618094178717, "grad_norm": 7.94140625, "learning_rate": 7.1010381905821294e-06, "loss": 2.3262, "mean_token_accuracy": 0.5262515262515263, "step": 15637 }, { "epoch": 2.899147200593252, "grad_norm": 6.88671875, "learning_rate": 7.100852799406749e-06, "loss": 3.1037, "mean_token_accuracy": 0.4700877785280216, "step": 15638 }, { "epoch": 2.8993325917686317, "grad_norm": 7.61328125, "learning_rate": 7.100667408231369e-06, "loss": 2.6047, "mean_token_accuracy": 0.4934540164861807, "step": 15639 }, { "epoch": 2.899517982944012, "grad_norm": 7.15625, "learning_rate": 7.100482017055988e-06, "loss": 2.4837, "mean_token_accuracy": 0.5245454545454545, "step": 15640 }, { "epoch": 2.899703374119392, "grad_norm": 7.08984375, "learning_rate": 7.100296625880608e-06, "loss": 3.8267, "mean_token_accuracy": 0.4403852033017426, "step": 15641 }, { "epoch": 2.8998887652947722, "grad_norm": 8.203125, "learning_rate": 7.1001112347052285e-06, "loss": 3.2386, "mean_token_accuracy": 0.4633674315731269, "step": 15642 }, { "epoch": 2.900074156470152, "grad_norm": 6.78125, "learning_rate": 7.099925843529849e-06, "loss": 2.8229, "mean_token_accuracy": 0.5052127359819667, "step": 15643 }, { "epoch": 2.900259547645532, "grad_norm": 7.15625, "learning_rate": 7.099740452354469e-06, "loss": 2.8255, "mean_token_accuracy": 0.46617697300990774, "step": 15644 }, { "epoch": 2.900444938820912, "grad_norm": 7.859375, "learning_rate": 7.099555061179088e-06, "loss": 3.5256, "mean_token_accuracy": 0.43240248226950356, "step": 15645 }, { "epoch": 2.900630329996292, "grad_norm": 8.0, "learning_rate": 7.099369670003709e-06, "loss": 3.9688, "mean_token_accuracy": 0.400925466864981, "step": 15646 }, { "epoch": 2.9008157211716723, "grad_norm": 7.2109375, "learning_rate": 7.099184278828328e-06, "loss": 3.0766, "mean_token_accuracy": 0.47669652345529073, "step": 15647 }, { "epoch": 2.9010011123470525, "grad_norm": 6.6015625, "learning_rate": 7.098998887652948e-06, "loss": 2.5819, "mean_token_accuracy": 0.5340175642789123, "step": 15648 }, { "epoch": 2.9011865035224322, "grad_norm": 7.58984375, "learning_rate": 7.098813496477568e-06, "loss": 3.1715, "mean_token_accuracy": 0.4451233059320151, "step": 15649 }, { "epoch": 2.9013718946978124, "grad_norm": 7.19140625, "learning_rate": 7.098628105302187e-06, "loss": 2.5984, "mean_token_accuracy": 0.5097444781290602, "step": 15650 }, { "epoch": 2.901557285873192, "grad_norm": 8.390625, "learning_rate": 7.098442714126809e-06, "loss": 2.9536, "mean_token_accuracy": 0.46746226030191756, "step": 15651 }, { "epoch": 2.9017426770485724, "grad_norm": 7.3828125, "learning_rate": 7.098257322951428e-06, "loss": 2.9524, "mean_token_accuracy": 0.4646727351538795, "step": 15652 }, { "epoch": 2.9019280682239526, "grad_norm": 8.1796875, "learning_rate": 7.098071931776048e-06, "loss": 3.007, "mean_token_accuracy": 0.45636172450052576, "step": 15653 }, { "epoch": 2.9021134593993327, "grad_norm": 8.5390625, "learning_rate": 7.0978865406006685e-06, "loss": 3.7691, "mean_token_accuracy": 0.42691256830601093, "step": 15654 }, { "epoch": 2.9022988505747125, "grad_norm": 10.2890625, "learning_rate": 7.097701149425288e-06, "loss": 2.8142, "mean_token_accuracy": 0.47863888472492877, "step": 15655 }, { "epoch": 2.9024842417500927, "grad_norm": 6.9765625, "learning_rate": 7.097515758249908e-06, "loss": 2.9332, "mean_token_accuracy": 0.4742959856201318, "step": 15656 }, { "epoch": 2.902669632925473, "grad_norm": 9.2109375, "learning_rate": 7.097330367074527e-06, "loss": 3.0579, "mean_token_accuracy": 0.4539943419102789, "step": 15657 }, { "epoch": 2.9028550241008526, "grad_norm": 8.9296875, "learning_rate": 7.097144975899147e-06, "loss": 3.8363, "mean_token_accuracy": 0.44075321494182484, "step": 15658 }, { "epoch": 2.903040415276233, "grad_norm": 8.078125, "learning_rate": 7.096959584723768e-06, "loss": 3.1466, "mean_token_accuracy": 0.4855249891977531, "step": 15659 }, { "epoch": 2.903225806451613, "grad_norm": 8.1328125, "learning_rate": 7.096774193548388e-06, "loss": 3.2449, "mean_token_accuracy": 0.4701403404001194, "step": 15660 }, { "epoch": 2.903411197626993, "grad_norm": 8.3984375, "learning_rate": 7.096588802373008e-06, "loss": 2.1974, "mean_token_accuracy": 0.5418590335487843, "step": 15661 }, { "epoch": 2.903596588802373, "grad_norm": 8.8984375, "learning_rate": 7.096403411197627e-06, "loss": 2.7801, "mean_token_accuracy": 0.47370671227020095, "step": 15662 }, { "epoch": 2.903781979977753, "grad_norm": 10.8125, "learning_rate": 7.096218020022248e-06, "loss": 2.5378, "mean_token_accuracy": 0.5436564309911485, "step": 15663 }, { "epoch": 2.903967371153133, "grad_norm": 7.65234375, "learning_rate": 7.0960326288468675e-06, "loss": 2.6963, "mean_token_accuracy": 0.5021338724168913, "step": 15664 }, { "epoch": 2.904152762328513, "grad_norm": 8.140625, "learning_rate": 7.095847237671487e-06, "loss": 2.8906, "mean_token_accuracy": 0.49257278669043375, "step": 15665 }, { "epoch": 2.9043381535038932, "grad_norm": 7.953125, "learning_rate": 7.095661846496107e-06, "loss": 2.6803, "mean_token_accuracy": 0.5120320855614974, "step": 15666 }, { "epoch": 2.9045235446792734, "grad_norm": 9.828125, "learning_rate": 7.095476455320728e-06, "loss": 3.0065, "mean_token_accuracy": 0.48968481375358164, "step": 15667 }, { "epoch": 2.904708935854653, "grad_norm": 8.234375, "learning_rate": 7.095291064145348e-06, "loss": 2.8933, "mean_token_accuracy": 0.5061124694376528, "step": 15668 }, { "epoch": 2.9048943270300334, "grad_norm": 8.0859375, "learning_rate": 7.095105672969967e-06, "loss": 2.711, "mean_token_accuracy": 0.5133070772168516, "step": 15669 }, { "epoch": 2.9050797182054136, "grad_norm": 8.953125, "learning_rate": 7.094920281794587e-06, "loss": 2.6645, "mean_token_accuracy": 0.5021128511061397, "step": 15670 }, { "epoch": 2.9052651093807933, "grad_norm": 8.7109375, "learning_rate": 7.094734890619207e-06, "loss": 2.7712, "mean_token_accuracy": 0.511797325214744, "step": 15671 }, { "epoch": 2.9054505005561735, "grad_norm": 8.9375, "learning_rate": 7.094549499443827e-06, "loss": 2.28, "mean_token_accuracy": 0.5417822661191877, "step": 15672 }, { "epoch": 2.9056358917315537, "grad_norm": 9.2734375, "learning_rate": 7.094364108268447e-06, "loss": 3.3647, "mean_token_accuracy": 0.4733620949132136, "step": 15673 }, { "epoch": 2.905821282906934, "grad_norm": 13.4375, "learning_rate": 7.0941787170930664e-06, "loss": 2.5466, "mean_token_accuracy": 0.514533258803801, "step": 15674 }, { "epoch": 2.9060066740823136, "grad_norm": 10.3359375, "learning_rate": 7.093993325917688e-06, "loss": 2.9538, "mean_token_accuracy": 0.46054003940201643, "step": 15675 }, { "epoch": 2.906192065257694, "grad_norm": 7.4296875, "learning_rate": 7.093807934742307e-06, "loss": 2.5765, "mean_token_accuracy": 0.4891860465116279, "step": 15676 }, { "epoch": 2.9063774564330735, "grad_norm": 10.6796875, "learning_rate": 7.093622543566927e-06, "loss": 2.5058, "mean_token_accuracy": 0.5269845802398629, "step": 15677 }, { "epoch": 2.9065628476084537, "grad_norm": 9.53125, "learning_rate": 7.093437152391547e-06, "loss": 3.4802, "mean_token_accuracy": 0.44841562269712604, "step": 15678 }, { "epoch": 2.906748238783834, "grad_norm": 7.94921875, "learning_rate": 7.093251761216166e-06, "loss": 3.0488, "mean_token_accuracy": 0.461505376344086, "step": 15679 }, { "epoch": 2.906933629959214, "grad_norm": 8.0, "learning_rate": 7.093066370040787e-06, "loss": 3.2182, "mean_token_accuracy": 0.46346220029826773, "step": 15680 }, { "epoch": 2.907119021134594, "grad_norm": 11.7265625, "learning_rate": 7.0928809788654065e-06, "loss": 3.146, "mean_token_accuracy": 0.5066336200612335, "step": 15681 }, { "epoch": 2.907304412309974, "grad_norm": 8.0078125, "learning_rate": 7.092695587690026e-06, "loss": 2.6054, "mean_token_accuracy": 0.5064624222115844, "step": 15682 }, { "epoch": 2.9074898034853542, "grad_norm": 9.390625, "learning_rate": 7.092510196514647e-06, "loss": 3.2638, "mean_token_accuracy": 0.47739478566546234, "step": 15683 }, { "epoch": 2.907675194660734, "grad_norm": 10.59375, "learning_rate": 7.092324805339267e-06, "loss": 3.5701, "mean_token_accuracy": 0.4445576655859194, "step": 15684 }, { "epoch": 2.907860585836114, "grad_norm": 9.28125, "learning_rate": 7.092139414163887e-06, "loss": 3.027, "mean_token_accuracy": 0.4532773564463705, "step": 15685 }, { "epoch": 2.9080459770114944, "grad_norm": 9.1796875, "learning_rate": 7.091954022988506e-06, "loss": 2.8795, "mean_token_accuracy": 0.5132016086728449, "step": 15686 }, { "epoch": 2.9082313681868746, "grad_norm": 9.7109375, "learning_rate": 7.091768631813126e-06, "loss": 2.5916, "mean_token_accuracy": 0.49929758838679467, "step": 15687 }, { "epoch": 2.9084167593622543, "grad_norm": 10.015625, "learning_rate": 7.091583240637746e-06, "loss": 2.8945, "mean_token_accuracy": 0.46384009691096306, "step": 15688 }, { "epoch": 2.9086021505376345, "grad_norm": 7.76953125, "learning_rate": 7.091397849462366e-06, "loss": 2.6458, "mean_token_accuracy": 0.4937413073713491, "step": 15689 }, { "epoch": 2.9087875417130142, "grad_norm": 11.5, "learning_rate": 7.091212458286986e-06, "loss": 3.2189, "mean_token_accuracy": 0.49583484244838827, "step": 15690 }, { "epoch": 2.9089729328883944, "grad_norm": 10.0703125, "learning_rate": 7.091027067111606e-06, "loss": 2.7098, "mean_token_accuracy": 0.4876215165262476, "step": 15691 }, { "epoch": 2.9091583240637746, "grad_norm": 7.28125, "learning_rate": 7.090841675936227e-06, "loss": 2.4582, "mean_token_accuracy": 0.5132585000813404, "step": 15692 }, { "epoch": 2.909343715239155, "grad_norm": 8.359375, "learning_rate": 7.0906562847608465e-06, "loss": 2.6627, "mean_token_accuracy": 0.5062416406598306, "step": 15693 }, { "epoch": 2.9095291064145345, "grad_norm": 8.65625, "learning_rate": 7.090470893585466e-06, "loss": 2.3882, "mean_token_accuracy": 0.5252609603340292, "step": 15694 }, { "epoch": 2.9097144975899147, "grad_norm": 9.7265625, "learning_rate": 7.090285502410086e-06, "loss": 3.2907, "mean_token_accuracy": 0.500881390078066, "step": 15695 }, { "epoch": 2.909899888765295, "grad_norm": 8.0078125, "learning_rate": 7.090100111234705e-06, "loss": 3.5808, "mean_token_accuracy": 0.4350093109869646, "step": 15696 }, { "epoch": 2.9100852799406747, "grad_norm": 8.0703125, "learning_rate": 7.089914720059326e-06, "loss": 2.6988, "mean_token_accuracy": 0.5089573398633844, "step": 15697 }, { "epoch": 2.910270671116055, "grad_norm": 8.1328125, "learning_rate": 7.0897293288839455e-06, "loss": 3.0188, "mean_token_accuracy": 0.47078154022003144, "step": 15698 }, { "epoch": 2.910456062291435, "grad_norm": 9.078125, "learning_rate": 7.089543937708566e-06, "loss": 3.1678, "mean_token_accuracy": 0.44476553264153423, "step": 15699 }, { "epoch": 2.9106414534668152, "grad_norm": 7.98046875, "learning_rate": 7.089358546533186e-06, "loss": 3.8545, "mean_token_accuracy": 0.4150114990591679, "step": 15700 }, { "epoch": 2.910826844642195, "grad_norm": 9.1796875, "learning_rate": 7.089173155357806e-06, "loss": 3.9198, "mean_token_accuracy": 0.4080142764438676, "step": 15701 }, { "epoch": 2.911012235817575, "grad_norm": 9.9453125, "learning_rate": 7.088987764182426e-06, "loss": 3.1579, "mean_token_accuracy": 0.4416674352116573, "step": 15702 }, { "epoch": 2.911197626992955, "grad_norm": 10.3359375, "learning_rate": 7.0888023730070454e-06, "loss": 2.7957, "mean_token_accuracy": 0.49638802889576883, "step": 15703 }, { "epoch": 2.911383018168335, "grad_norm": 7.796875, "learning_rate": 7.088616981831665e-06, "loss": 3.6298, "mean_token_accuracy": 0.4284026775167345, "step": 15704 }, { "epoch": 2.9115684093437153, "grad_norm": 11.484375, "learning_rate": 7.088431590656285e-06, "loss": 3.1702, "mean_token_accuracy": 0.44798785117691725, "step": 15705 }, { "epoch": 2.9117538005190955, "grad_norm": 9.890625, "learning_rate": 7.088246199480905e-06, "loss": 3.2304, "mean_token_accuracy": 0.45840130505709625, "step": 15706 }, { "epoch": 2.9119391916944752, "grad_norm": 8.4453125, "learning_rate": 7.088060808305526e-06, "loss": 3.2358, "mean_token_accuracy": 0.47026413871333655, "step": 15707 }, { "epoch": 2.9121245828698554, "grad_norm": 10.25, "learning_rate": 7.087875417130145e-06, "loss": 2.7427, "mean_token_accuracy": 0.49605878423513694, "step": 15708 }, { "epoch": 2.912309974045235, "grad_norm": 10.21875, "learning_rate": 7.087690025954765e-06, "loss": 3.0675, "mean_token_accuracy": 0.4659731252709146, "step": 15709 }, { "epoch": 2.9124953652206154, "grad_norm": 7.0859375, "learning_rate": 7.0875046347793855e-06, "loss": 2.4612, "mean_token_accuracy": 0.5276872964169381, "step": 15710 }, { "epoch": 2.9126807563959956, "grad_norm": 7.93359375, "learning_rate": 7.087319243604005e-06, "loss": 3.0221, "mean_token_accuracy": 0.4718160229971254, "step": 15711 }, { "epoch": 2.9128661475713757, "grad_norm": 11.5703125, "learning_rate": 7.087133852428625e-06, "loss": 3.1443, "mean_token_accuracy": 0.451984126984127, "step": 15712 }, { "epoch": 2.913051538746756, "grad_norm": 10.0625, "learning_rate": 7.0869484612532444e-06, "loss": 3.3893, "mean_token_accuracy": 0.43479289940828403, "step": 15713 }, { "epoch": 2.9132369299221357, "grad_norm": 7.71484375, "learning_rate": 7.086763070077864e-06, "loss": 3.2152, "mean_token_accuracy": 0.45918114143920596, "step": 15714 }, { "epoch": 2.913422321097516, "grad_norm": 9.8984375, "learning_rate": 7.0865776789024846e-06, "loss": 2.4842, "mean_token_accuracy": 0.5282472686525352, "step": 15715 }, { "epoch": 2.9136077122728956, "grad_norm": 10.0859375, "learning_rate": 7.086392287727105e-06, "loss": 2.663, "mean_token_accuracy": 0.484525748653051, "step": 15716 }, { "epoch": 2.913793103448276, "grad_norm": 10.25, "learning_rate": 7.086206896551725e-06, "loss": 3.3332, "mean_token_accuracy": 0.4789722785665991, "step": 15717 }, { "epoch": 2.913978494623656, "grad_norm": 12.5234375, "learning_rate": 7.086021505376345e-06, "loss": 2.9367, "mean_token_accuracy": 0.4922727917198355, "step": 15718 }, { "epoch": 2.914163885799036, "grad_norm": 12.7734375, "learning_rate": 7.085836114200965e-06, "loss": 3.3091, "mean_token_accuracy": 0.4472979552093476, "step": 15719 }, { "epoch": 2.914349276974416, "grad_norm": 9.1875, "learning_rate": 7.0856507230255845e-06, "loss": 3.1505, "mean_token_accuracy": 0.4708854944297691, "step": 15720 }, { "epoch": 2.914534668149796, "grad_norm": 7.90625, "learning_rate": 7.085465331850204e-06, "loss": 2.6392, "mean_token_accuracy": 0.5240837116598384, "step": 15721 }, { "epoch": 2.914720059325176, "grad_norm": 13.84375, "learning_rate": 7.085279940674824e-06, "loss": 3.478, "mean_token_accuracy": 0.4285520423395305, "step": 15722 }, { "epoch": 2.914905450500556, "grad_norm": 10.625, "learning_rate": 7.085094549499444e-06, "loss": 2.8776, "mean_token_accuracy": 0.48238644880827464, "step": 15723 }, { "epoch": 2.9150908416759362, "grad_norm": 8.328125, "learning_rate": 7.084909158324065e-06, "loss": 3.1688, "mean_token_accuracy": 0.46766917293233085, "step": 15724 }, { "epoch": 2.9152762328513164, "grad_norm": 7.66796875, "learning_rate": 7.084723767148684e-06, "loss": 2.5762, "mean_token_accuracy": 0.5095231275955893, "step": 15725 }, { "epoch": 2.915461624026696, "grad_norm": 8.6640625, "learning_rate": 7.084538375973304e-06, "loss": 3.3095, "mean_token_accuracy": 0.46163793103448275, "step": 15726 }, { "epoch": 2.9156470152020764, "grad_norm": 10.125, "learning_rate": 7.0843529847979245e-06, "loss": 2.7697, "mean_token_accuracy": 0.4928736752344987, "step": 15727 }, { "epoch": 2.9158324063774566, "grad_norm": 11.6875, "learning_rate": 7.084167593622544e-06, "loss": 1.9251, "mean_token_accuracy": 0.5755448296036273, "step": 15728 }, { "epoch": 2.9160177975528363, "grad_norm": 7.30078125, "learning_rate": 7.083982202447164e-06, "loss": 2.6348, "mean_token_accuracy": 0.5233333333333333, "step": 15729 }, { "epoch": 2.9162031887282165, "grad_norm": 10.90625, "learning_rate": 7.0837968112717835e-06, "loss": 3.4017, "mean_token_accuracy": 0.4537117903930131, "step": 15730 }, { "epoch": 2.9163885799035967, "grad_norm": 11.8671875, "learning_rate": 7.083611420096403e-06, "loss": 2.9499, "mean_token_accuracy": 0.4781433794309331, "step": 15731 }, { "epoch": 2.916573971078977, "grad_norm": 7.734375, "learning_rate": 7.0834260289210244e-06, "loss": 2.9926, "mean_token_accuracy": 0.48730538922155686, "step": 15732 }, { "epoch": 2.9167593622543566, "grad_norm": 10.703125, "learning_rate": 7.083240637745644e-06, "loss": 2.9316, "mean_token_accuracy": 0.4698461878942127, "step": 15733 }, { "epoch": 2.916944753429737, "grad_norm": 10.90625, "learning_rate": 7.083055246570264e-06, "loss": 3.2342, "mean_token_accuracy": 0.43428154631655724, "step": 15734 }, { "epoch": 2.9171301446051165, "grad_norm": 10.1328125, "learning_rate": 7.082869855394884e-06, "loss": 2.8833, "mean_token_accuracy": 0.48706624605678234, "step": 15735 }, { "epoch": 2.9173155357804967, "grad_norm": 8.453125, "learning_rate": 7.082684464219504e-06, "loss": 3.3451, "mean_token_accuracy": 0.47602996254681645, "step": 15736 }, { "epoch": 2.917500926955877, "grad_norm": 9.625, "learning_rate": 7.0824990730441235e-06, "loss": 3.0043, "mean_token_accuracy": 0.47522236340533675, "step": 15737 }, { "epoch": 2.917686318131257, "grad_norm": 12.9765625, "learning_rate": 7.082313681868743e-06, "loss": 3.549, "mean_token_accuracy": 0.44315967259971417, "step": 15738 }, { "epoch": 2.917871709306637, "grad_norm": 10.3984375, "learning_rate": 7.082128290693363e-06, "loss": 2.6227, "mean_token_accuracy": 0.4967019290603609, "step": 15739 }, { "epoch": 2.918057100482017, "grad_norm": 7.73828125, "learning_rate": 7.081942899517984e-06, "loss": 3.1772, "mean_token_accuracy": 0.4609550894655765, "step": 15740 }, { "epoch": 2.9182424916573972, "grad_norm": 10.0, "learning_rate": 7.081757508342604e-06, "loss": 3.4737, "mean_token_accuracy": 0.4441242395132885, "step": 15741 }, { "epoch": 2.918427882832777, "grad_norm": 8.21875, "learning_rate": 7.0815721171672234e-06, "loss": 3.0331, "mean_token_accuracy": 0.4755825087775295, "step": 15742 }, { "epoch": 2.918613274008157, "grad_norm": 7.56640625, "learning_rate": 7.081386725991843e-06, "loss": 3.0874, "mean_token_accuracy": 0.46127678875155914, "step": 15743 }, { "epoch": 2.9187986651835374, "grad_norm": 9.203125, "learning_rate": 7.0812013348164636e-06, "loss": 3.0048, "mean_token_accuracy": 0.49566947565543074, "step": 15744 }, { "epoch": 2.9189840563589176, "grad_norm": 8.9140625, "learning_rate": 7.081015943641083e-06, "loss": 3.1536, "mean_token_accuracy": 0.45977179637214743, "step": 15745 }, { "epoch": 2.9191694475342973, "grad_norm": 9.6875, "learning_rate": 7.080830552465703e-06, "loss": 2.6859, "mean_token_accuracy": 0.4985284708893154, "step": 15746 }, { "epoch": 2.9193548387096775, "grad_norm": 7.4453125, "learning_rate": 7.0806451612903225e-06, "loss": 2.4681, "mean_token_accuracy": 0.49156400642742365, "step": 15747 }, { "epoch": 2.9195402298850572, "grad_norm": 7.6015625, "learning_rate": 7.080459770114944e-06, "loss": 2.5623, "mean_token_accuracy": 0.49905587026546705, "step": 15748 }, { "epoch": 2.9197256210604374, "grad_norm": 8.1875, "learning_rate": 7.0802743789395635e-06, "loss": 2.7275, "mean_token_accuracy": 0.503462204270052, "step": 15749 }, { "epoch": 2.9199110122358176, "grad_norm": 6.78515625, "learning_rate": 7.080088987764183e-06, "loss": 2.9298, "mean_token_accuracy": 0.47989093387866394, "step": 15750 }, { "epoch": 2.920096403411198, "grad_norm": 6.671875, "learning_rate": 7.079903596588803e-06, "loss": 2.3778, "mean_token_accuracy": 0.530525372465554, "step": 15751 }, { "epoch": 2.9202817945865776, "grad_norm": 8.6328125, "learning_rate": 7.079718205413422e-06, "loss": 2.6491, "mean_token_accuracy": 0.5050431401142301, "step": 15752 }, { "epoch": 2.9204671857619577, "grad_norm": 7.203125, "learning_rate": 7.079532814238043e-06, "loss": 2.4091, "mean_token_accuracy": 0.5245714285714286, "step": 15753 }, { "epoch": 2.920652576937338, "grad_norm": 7.01171875, "learning_rate": 7.0793474230626625e-06, "loss": 2.8524, "mean_token_accuracy": 0.49049276914836637, "step": 15754 }, { "epoch": 2.9208379681127177, "grad_norm": 7.390625, "learning_rate": 7.079162031887282e-06, "loss": 2.6931, "mean_token_accuracy": 0.48771390960947786, "step": 15755 }, { "epoch": 2.921023359288098, "grad_norm": 7.1484375, "learning_rate": 7.0789766407119035e-06, "loss": 2.5714, "mean_token_accuracy": 0.4988766702140239, "step": 15756 }, { "epoch": 2.921208750463478, "grad_norm": 7.265625, "learning_rate": 7.078791249536523e-06, "loss": 3.511, "mean_token_accuracy": 0.48579676674364897, "step": 15757 }, { "epoch": 2.9213941416388582, "grad_norm": 9.53125, "learning_rate": 7.078605858361143e-06, "loss": 2.9755, "mean_token_accuracy": 0.4838300570703868, "step": 15758 }, { "epoch": 2.921579532814238, "grad_norm": 7.125, "learning_rate": 7.0784204671857625e-06, "loss": 2.4836, "mean_token_accuracy": 0.5111083059833376, "step": 15759 }, { "epoch": 2.921764923989618, "grad_norm": 8.140625, "learning_rate": 7.078235076010382e-06, "loss": 2.8941, "mean_token_accuracy": 0.46477570525666717, "step": 15760 }, { "epoch": 2.921950315164998, "grad_norm": 11.2578125, "learning_rate": 7.078049684835003e-06, "loss": 2.9741, "mean_token_accuracy": 0.49598163030998854, "step": 15761 }, { "epoch": 2.922135706340378, "grad_norm": 9.515625, "learning_rate": 7.077864293659622e-06, "loss": 3.3953, "mean_token_accuracy": 0.4868884540117417, "step": 15762 }, { "epoch": 2.9223210975157583, "grad_norm": 7.75, "learning_rate": 7.077678902484242e-06, "loss": 3.0091, "mean_token_accuracy": 0.4638176010573852, "step": 15763 }, { "epoch": 2.9225064886911385, "grad_norm": 11.171875, "learning_rate": 7.077493511308862e-06, "loss": 2.6826, "mean_token_accuracy": 0.5095168374816984, "step": 15764 }, { "epoch": 2.9226918798665182, "grad_norm": 8.5625, "learning_rate": 7.077308120133483e-06, "loss": 3.1169, "mean_token_accuracy": 0.4585965315263581, "step": 15765 }, { "epoch": 2.9228772710418984, "grad_norm": 7.328125, "learning_rate": 7.0771227289581025e-06, "loss": 3.1019, "mean_token_accuracy": 0.4423810735181888, "step": 15766 }, { "epoch": 2.9230626622172786, "grad_norm": 7.21484375, "learning_rate": 7.076937337782722e-06, "loss": 2.6243, "mean_token_accuracy": 0.5051173991571343, "step": 15767 }, { "epoch": 2.9232480533926584, "grad_norm": 10.34375, "learning_rate": 7.076751946607342e-06, "loss": 3.1089, "mean_token_accuracy": 0.4505407544183593, "step": 15768 }, { "epoch": 2.9234334445680386, "grad_norm": 7.56640625, "learning_rate": 7.0765665554319614e-06, "loss": 2.5792, "mean_token_accuracy": 0.506789413118527, "step": 15769 }, { "epoch": 2.9236188357434187, "grad_norm": 7.75390625, "learning_rate": 7.076381164256582e-06, "loss": 3.2366, "mean_token_accuracy": 0.45126353790613716, "step": 15770 }, { "epoch": 2.923804226918799, "grad_norm": 9.3203125, "learning_rate": 7.076195773081202e-06, "loss": 2.9261, "mean_token_accuracy": 0.4659159159159159, "step": 15771 }, { "epoch": 2.9239896180941787, "grad_norm": 9.7578125, "learning_rate": 7.076010381905822e-06, "loss": 2.7783, "mean_token_accuracy": 0.4992810353091548, "step": 15772 }, { "epoch": 2.924175009269559, "grad_norm": 7.5234375, "learning_rate": 7.0758249907304426e-06, "loss": 2.9573, "mean_token_accuracy": 0.4654565920923034, "step": 15773 }, { "epoch": 2.9243604004449386, "grad_norm": 8.21875, "learning_rate": 7.075639599555062e-06, "loss": 2.8393, "mean_token_accuracy": 0.49641010913268235, "step": 15774 }, { "epoch": 2.924545791620319, "grad_norm": 7.43359375, "learning_rate": 7.075454208379682e-06, "loss": 2.5363, "mean_token_accuracy": 0.4935323383084577, "step": 15775 }, { "epoch": 2.924731182795699, "grad_norm": 9.28125, "learning_rate": 7.0752688172043015e-06, "loss": 3.3421, "mean_token_accuracy": 0.4594017094017094, "step": 15776 }, { "epoch": 2.924916573971079, "grad_norm": 8.828125, "learning_rate": 7.075083426028921e-06, "loss": 3.0977, "mean_token_accuracy": 0.47050788526015147, "step": 15777 }, { "epoch": 2.925101965146459, "grad_norm": 7.57421875, "learning_rate": 7.074898034853541e-06, "loss": 2.7739, "mean_token_accuracy": 0.5073227885178676, "step": 15778 }, { "epoch": 2.925287356321839, "grad_norm": 8.1875, "learning_rate": 7.074712643678161e-06, "loss": 3.2481, "mean_token_accuracy": 0.46366013071895423, "step": 15779 }, { "epoch": 2.925472747497219, "grad_norm": 8.4140625, "learning_rate": 7.074527252502782e-06, "loss": 3.1468, "mean_token_accuracy": 0.4652077278167208, "step": 15780 }, { "epoch": 2.925658138672599, "grad_norm": 7.2109375, "learning_rate": 7.074341861327401e-06, "loss": 3.0484, "mean_token_accuracy": 0.4479315263908702, "step": 15781 }, { "epoch": 2.9258435298479792, "grad_norm": 6.8125, "learning_rate": 7.074156470152022e-06, "loss": 2.6744, "mean_token_accuracy": 0.484525748653051, "step": 15782 }, { "epoch": 2.9260289210233594, "grad_norm": 8.75, "learning_rate": 7.0739710789766415e-06, "loss": 2.7703, "mean_token_accuracy": 0.48855258051998446, "step": 15783 }, { "epoch": 2.926214312198739, "grad_norm": 7.75, "learning_rate": 7.073785687801261e-06, "loss": 3.5682, "mean_token_accuracy": 0.44030163385002097, "step": 15784 }, { "epoch": 2.9263997033741194, "grad_norm": 7.2421875, "learning_rate": 7.073600296625881e-06, "loss": 2.7126, "mean_token_accuracy": 0.5106864186362094, "step": 15785 }, { "epoch": 2.9265850945494996, "grad_norm": 7.3984375, "learning_rate": 7.0734149054505005e-06, "loss": 3.0821, "mean_token_accuracy": 0.46185714285714285, "step": 15786 }, { "epoch": 2.9267704857248793, "grad_norm": 8.21875, "learning_rate": 7.073229514275121e-06, "loss": 2.8343, "mean_token_accuracy": 0.49259478672985785, "step": 15787 }, { "epoch": 2.9269558769002595, "grad_norm": 9.015625, "learning_rate": 7.0730441230997415e-06, "loss": 2.835, "mean_token_accuracy": 0.4790157004830918, "step": 15788 }, { "epoch": 2.9271412680756397, "grad_norm": 7.18359375, "learning_rate": 7.072858731924361e-06, "loss": 3.2031, "mean_token_accuracy": 0.44110830172777077, "step": 15789 }, { "epoch": 2.92732665925102, "grad_norm": 8.3125, "learning_rate": 7.072673340748981e-06, "loss": 3.371, "mean_token_accuracy": 0.4430113556708257, "step": 15790 }, { "epoch": 2.9275120504263996, "grad_norm": 7.82421875, "learning_rate": 7.072487949573601e-06, "loss": 2.3814, "mean_token_accuracy": 0.5577318472026837, "step": 15791 }, { "epoch": 2.92769744160178, "grad_norm": 10.8984375, "learning_rate": 7.072302558398221e-06, "loss": 3.3322, "mean_token_accuracy": 0.4778652906029332, "step": 15792 }, { "epoch": 2.9278828327771595, "grad_norm": 8.5703125, "learning_rate": 7.0721171672228405e-06, "loss": 3.1129, "mean_token_accuracy": 0.514813073124853, "step": 15793 }, { "epoch": 2.9280682239525397, "grad_norm": 9.1640625, "learning_rate": 7.07193177604746e-06, "loss": 3.5527, "mean_token_accuracy": 0.4112914811090361, "step": 15794 }, { "epoch": 2.92825361512792, "grad_norm": 15.234375, "learning_rate": 7.07174638487208e-06, "loss": 2.8418, "mean_token_accuracy": 0.4838897798512903, "step": 15795 }, { "epoch": 2.9284390063033, "grad_norm": 8.3984375, "learning_rate": 7.071560993696701e-06, "loss": 3.201, "mean_token_accuracy": 0.4392664238161035, "step": 15796 }, { "epoch": 2.92862439747868, "grad_norm": 7.6640625, "learning_rate": 7.071375602521321e-06, "loss": 3.0161, "mean_token_accuracy": 0.4551588762115078, "step": 15797 }, { "epoch": 2.92880978865406, "grad_norm": 7.78125, "learning_rate": 7.0711902113459404e-06, "loss": 2.9503, "mean_token_accuracy": 0.4650073206442167, "step": 15798 }, { "epoch": 2.9289951798294402, "grad_norm": 10.1796875, "learning_rate": 7.071004820170561e-06, "loss": 2.9974, "mean_token_accuracy": 0.47019471986620476, "step": 15799 }, { "epoch": 2.92918057100482, "grad_norm": 7.43359375, "learning_rate": 7.070819428995181e-06, "loss": 2.9785, "mean_token_accuracy": 0.4455039227519614, "step": 15800 }, { "epoch": 2.9293659621802, "grad_norm": 7.81640625, "learning_rate": 7.0706340378198e-06, "loss": 3.6277, "mean_token_accuracy": 0.433665008291874, "step": 15801 }, { "epoch": 2.9295513533555804, "grad_norm": 7.5234375, "learning_rate": 7.07044864664442e-06, "loss": 3.4402, "mean_token_accuracy": 0.46136631330977623, "step": 15802 }, { "epoch": 2.9297367445309606, "grad_norm": 7.984375, "learning_rate": 7.0702632554690395e-06, "loss": 2.8516, "mean_token_accuracy": 0.5092699884125145, "step": 15803 }, { "epoch": 2.9299221357063403, "grad_norm": 7.171875, "learning_rate": 7.070077864293661e-06, "loss": 2.3194, "mean_token_accuracy": 0.5531094527363184, "step": 15804 }, { "epoch": 2.9301075268817205, "grad_norm": 9.2265625, "learning_rate": 7.0698924731182805e-06, "loss": 3.6322, "mean_token_accuracy": 0.4616822429906542, "step": 15805 }, { "epoch": 2.9302929180571002, "grad_norm": 7.98828125, "learning_rate": 7.0697070819429e-06, "loss": 2.599, "mean_token_accuracy": 0.5279367011564212, "step": 15806 }, { "epoch": 2.9304783092324804, "grad_norm": 6.75390625, "learning_rate": 7.06952169076752e-06, "loss": 2.3356, "mean_token_accuracy": 0.5168043292509257, "step": 15807 }, { "epoch": 2.9306637004078606, "grad_norm": 9.90625, "learning_rate": 7.06933629959214e-06, "loss": 3.0674, "mean_token_accuracy": 0.46885035324341684, "step": 15808 }, { "epoch": 2.930849091583241, "grad_norm": 8.0546875, "learning_rate": 7.06915090841676e-06, "loss": 2.6481, "mean_token_accuracy": 0.505949603359776, "step": 15809 }, { "epoch": 2.9310344827586206, "grad_norm": 6.93359375, "learning_rate": 7.0689655172413796e-06, "loss": 2.8904, "mean_token_accuracy": 0.48043995917904525, "step": 15810 }, { "epoch": 2.9312198739340007, "grad_norm": 7.03125, "learning_rate": 7.068780126065999e-06, "loss": 2.7723, "mean_token_accuracy": 0.48166877370417194, "step": 15811 }, { "epoch": 2.931405265109381, "grad_norm": 11.8359375, "learning_rate": 7.0685947348906205e-06, "loss": 3.0394, "mean_token_accuracy": 0.4785571142284569, "step": 15812 }, { "epoch": 2.9315906562847607, "grad_norm": 18.90625, "learning_rate": 7.06840934371524e-06, "loss": 3.0292, "mean_token_accuracy": 0.4750179985601152, "step": 15813 }, { "epoch": 2.931776047460141, "grad_norm": 7.43359375, "learning_rate": 7.06822395253986e-06, "loss": 2.3933, "mean_token_accuracy": 0.5435318998942545, "step": 15814 }, { "epoch": 2.931961438635521, "grad_norm": 8.3125, "learning_rate": 7.0680385613644795e-06, "loss": 2.3059, "mean_token_accuracy": 0.5369670100404225, "step": 15815 }, { "epoch": 2.9321468298109012, "grad_norm": 9.6875, "learning_rate": 7.0678531701891e-06, "loss": 3.2994, "mean_token_accuracy": 0.45899870522227015, "step": 15816 }, { "epoch": 2.932332220986281, "grad_norm": 9.8046875, "learning_rate": 7.06766777901372e-06, "loss": 2.7043, "mean_token_accuracy": 0.5157147174372922, "step": 15817 }, { "epoch": 2.932517612161661, "grad_norm": 8.921875, "learning_rate": 7.067482387838339e-06, "loss": 2.612, "mean_token_accuracy": 0.5244029075804777, "step": 15818 }, { "epoch": 2.932703003337041, "grad_norm": 9.671875, "learning_rate": 7.067296996662959e-06, "loss": 3.0663, "mean_token_accuracy": 0.4700064850843061, "step": 15819 }, { "epoch": 2.932888394512421, "grad_norm": 7.5, "learning_rate": 7.06711160548758e-06, "loss": 2.752, "mean_token_accuracy": 0.4924466593988475, "step": 15820 }, { "epoch": 2.9330737856878013, "grad_norm": 9.2890625, "learning_rate": 7.0669262143122e-06, "loss": 3.1458, "mean_token_accuracy": 0.46797608881298036, "step": 15821 }, { "epoch": 2.9332591768631815, "grad_norm": 9.5859375, "learning_rate": 7.0667408231368195e-06, "loss": 3.012, "mean_token_accuracy": 0.4552975713904457, "step": 15822 }, { "epoch": 2.9334445680385612, "grad_norm": 7.7109375, "learning_rate": 7.066555431961439e-06, "loss": 2.8264, "mean_token_accuracy": 0.4715383758493929, "step": 15823 }, { "epoch": 2.9336299592139414, "grad_norm": 7.39453125, "learning_rate": 7.066370040786059e-06, "loss": 3.6664, "mean_token_accuracy": 0.41659017898118406, "step": 15824 }, { "epoch": 2.9338153503893216, "grad_norm": 7.19921875, "learning_rate": 7.066184649610679e-06, "loss": 3.2621, "mean_token_accuracy": 0.4592394215318693, "step": 15825 }, { "epoch": 2.9340007415647014, "grad_norm": 11.2265625, "learning_rate": 7.065999258435299e-06, "loss": 2.7286, "mean_token_accuracy": 0.5185491493383743, "step": 15826 }, { "epoch": 2.9341861327400816, "grad_norm": 8.1171875, "learning_rate": 7.065813867259919e-06, "loss": 2.1867, "mean_token_accuracy": 0.5652795838751625, "step": 15827 }, { "epoch": 2.9343715239154617, "grad_norm": 6.421875, "learning_rate": 7.065628476084539e-06, "loss": 2.8827, "mean_token_accuracy": 0.48452183526810394, "step": 15828 }, { "epoch": 2.934556915090842, "grad_norm": 8.7421875, "learning_rate": 7.06544308490916e-06, "loss": 2.4925, "mean_token_accuracy": 0.556489413285922, "step": 15829 }, { "epoch": 2.9347423062662217, "grad_norm": 8.671875, "learning_rate": 7.065257693733779e-06, "loss": 3.2094, "mean_token_accuracy": 0.4568430070678946, "step": 15830 }, { "epoch": 2.934927697441602, "grad_norm": 7.5859375, "learning_rate": 7.065072302558399e-06, "loss": 2.8042, "mean_token_accuracy": 0.49071117561683597, "step": 15831 }, { "epoch": 2.9351130886169816, "grad_norm": 8.90625, "learning_rate": 7.0648869113830185e-06, "loss": 2.8898, "mean_token_accuracy": 0.462671905697446, "step": 15832 }, { "epoch": 2.935298479792362, "grad_norm": 9.4921875, "learning_rate": 7.064701520207638e-06, "loss": 2.8209, "mean_token_accuracy": 0.5168785021183545, "step": 15833 }, { "epoch": 2.935483870967742, "grad_norm": 7.66796875, "learning_rate": 7.064516129032259e-06, "loss": 3.3073, "mean_token_accuracy": 0.45827372436814495, "step": 15834 }, { "epoch": 2.935669262143122, "grad_norm": 7.28515625, "learning_rate": 7.064330737856878e-06, "loss": 3.3815, "mean_token_accuracy": 0.44579454000315605, "step": 15835 }, { "epoch": 2.935854653318502, "grad_norm": 8.9609375, "learning_rate": 7.064145346681498e-06, "loss": 3.3173, "mean_token_accuracy": 0.46916299559471364, "step": 15836 }, { "epoch": 2.936040044493882, "grad_norm": 11.1796875, "learning_rate": 7.063959955506119e-06, "loss": 2.8192, "mean_token_accuracy": 0.494113763890417, "step": 15837 }, { "epoch": 2.9362254356692623, "grad_norm": 10.3515625, "learning_rate": 7.063774564330739e-06, "loss": 2.6047, "mean_token_accuracy": 0.5044428434197886, "step": 15838 }, { "epoch": 2.936410826844642, "grad_norm": 9.4140625, "learning_rate": 7.0635891731553586e-06, "loss": 2.9186, "mean_token_accuracy": 0.49605939463163906, "step": 15839 }, { "epoch": 2.9365962180200222, "grad_norm": 7.3359375, "learning_rate": 7.063403781979978e-06, "loss": 2.8512, "mean_token_accuracy": 0.4851138353765324, "step": 15840 }, { "epoch": 2.9367816091954024, "grad_norm": 10.8046875, "learning_rate": 7.063218390804598e-06, "loss": 2.8166, "mean_token_accuracy": 0.4740200546946217, "step": 15841 }, { "epoch": 2.9369670003707826, "grad_norm": 9.78125, "learning_rate": 7.063032999629218e-06, "loss": 2.9014, "mean_token_accuracy": 0.49051735259966456, "step": 15842 }, { "epoch": 2.9371523915461624, "grad_norm": 8.6953125, "learning_rate": 7.062847608453838e-06, "loss": 3.6371, "mean_token_accuracy": 0.41403087992804677, "step": 15843 }, { "epoch": 2.9373377827215426, "grad_norm": 8.1484375, "learning_rate": 7.062662217278458e-06, "loss": 3.2407, "mean_token_accuracy": 0.4611777685836436, "step": 15844 }, { "epoch": 2.9375231738969223, "grad_norm": 8.578125, "learning_rate": 7.062476826103078e-06, "loss": 3.1328, "mean_token_accuracy": 0.48473991507430997, "step": 15845 }, { "epoch": 2.9377085650723025, "grad_norm": 9.484375, "learning_rate": 7.062291434927699e-06, "loss": 2.9125, "mean_token_accuracy": 0.47316830038212326, "step": 15846 }, { "epoch": 2.9378939562476827, "grad_norm": 9.4609375, "learning_rate": 7.062106043752318e-06, "loss": 2.6308, "mean_token_accuracy": 0.513713230069632, "step": 15847 }, { "epoch": 2.938079347423063, "grad_norm": 10.859375, "learning_rate": 7.061920652576938e-06, "loss": 3.2246, "mean_token_accuracy": 0.48979914597501184, "step": 15848 }, { "epoch": 2.9382647385984426, "grad_norm": 7.23046875, "learning_rate": 7.0617352614015576e-06, "loss": 2.7237, "mean_token_accuracy": 0.5023864721123688, "step": 15849 }, { "epoch": 2.938450129773823, "grad_norm": 10.328125, "learning_rate": 7.061549870226177e-06, "loss": 4.1498, "mean_token_accuracy": 0.4085137480588289, "step": 15850 }, { "epoch": 2.9386355209492026, "grad_norm": 9.9453125, "learning_rate": 7.061364479050798e-06, "loss": 2.7981, "mean_token_accuracy": 0.4875759978852762, "step": 15851 }, { "epoch": 2.9388209121245827, "grad_norm": 9.015625, "learning_rate": 7.061179087875417e-06, "loss": 2.7035, "mean_token_accuracy": 0.4703505799556888, "step": 15852 }, { "epoch": 2.939006303299963, "grad_norm": 14.4765625, "learning_rate": 7.060993696700038e-06, "loss": 2.3924, "mean_token_accuracy": 0.5200155965687548, "step": 15853 }, { "epoch": 2.939191694475343, "grad_norm": 8.8828125, "learning_rate": 7.060808305524658e-06, "loss": 2.5691, "mean_token_accuracy": 0.49460473844710295, "step": 15854 }, { "epoch": 2.939377085650723, "grad_norm": 11.2734375, "learning_rate": 7.060622914349278e-06, "loss": 2.2658, "mean_token_accuracy": 0.5290097988653946, "step": 15855 }, { "epoch": 2.939562476826103, "grad_norm": 8.3125, "learning_rate": 7.060437523173898e-06, "loss": 2.8359, "mean_token_accuracy": 0.5164335664335664, "step": 15856 }, { "epoch": 2.9397478680014832, "grad_norm": 7.65234375, "learning_rate": 7.060252131998517e-06, "loss": 2.7946, "mean_token_accuracy": 0.49559408754974416, "step": 15857 }, { "epoch": 2.939933259176863, "grad_norm": 10.015625, "learning_rate": 7.060066740823137e-06, "loss": 2.8507, "mean_token_accuracy": 0.4987984379693602, "step": 15858 }, { "epoch": 2.940118650352243, "grad_norm": 8.6640625, "learning_rate": 7.0598813496477565e-06, "loss": 3.165, "mean_token_accuracy": 0.4562431842966194, "step": 15859 }, { "epoch": 2.9403040415276234, "grad_norm": 7.69921875, "learning_rate": 7.059695958472377e-06, "loss": 2.73, "mean_token_accuracy": 0.5006079555323953, "step": 15860 }, { "epoch": 2.9404894327030036, "grad_norm": 10.15625, "learning_rate": 7.0595105672969975e-06, "loss": 2.9164, "mean_token_accuracy": 0.5049894163894769, "step": 15861 }, { "epoch": 2.9406748238783833, "grad_norm": 8.625, "learning_rate": 7.059325176121617e-06, "loss": 3.0053, "mean_token_accuracy": 0.48793893129770993, "step": 15862 }, { "epoch": 2.9408602150537635, "grad_norm": 7.90234375, "learning_rate": 7.059139784946238e-06, "loss": 2.1494, "mean_token_accuracy": 0.6004613018964634, "step": 15863 }, { "epoch": 2.9410456062291432, "grad_norm": 7.61328125, "learning_rate": 7.058954393770857e-06, "loss": 2.721, "mean_token_accuracy": 0.507703777335984, "step": 15864 }, { "epoch": 2.9412309974045234, "grad_norm": 8.0234375, "learning_rate": 7.058769002595477e-06, "loss": 2.6721, "mean_token_accuracy": 0.48899499243041805, "step": 15865 }, { "epoch": 2.9414163885799036, "grad_norm": 8.1796875, "learning_rate": 7.058583611420097e-06, "loss": 3.2691, "mean_token_accuracy": 0.4306041239599662, "step": 15866 }, { "epoch": 2.941601779755284, "grad_norm": 7.875, "learning_rate": 7.058398220244716e-06, "loss": 2.4522, "mean_token_accuracy": 0.5192002925758868, "step": 15867 }, { "epoch": 2.9417871709306636, "grad_norm": 7.2109375, "learning_rate": 7.058212829069337e-06, "loss": 2.9984, "mean_token_accuracy": 0.47703984819734346, "step": 15868 }, { "epoch": 2.9419725621060437, "grad_norm": 7.9296875, "learning_rate": 7.058027437893957e-06, "loss": 3.3077, "mean_token_accuracy": 0.4520570948782536, "step": 15869 }, { "epoch": 2.942157953281424, "grad_norm": 8.7421875, "learning_rate": 7.057842046718577e-06, "loss": 3.0002, "mean_token_accuracy": 0.46713540002130605, "step": 15870 }, { "epoch": 2.9423433444568037, "grad_norm": 9.2578125, "learning_rate": 7.0576566555431965e-06, "loss": 3.7599, "mean_token_accuracy": 0.43513257575757575, "step": 15871 }, { "epoch": 2.942528735632184, "grad_norm": 7.84765625, "learning_rate": 7.057471264367817e-06, "loss": 2.444, "mean_token_accuracy": 0.5206198450387404, "step": 15872 }, { "epoch": 2.942714126807564, "grad_norm": 6.86328125, "learning_rate": 7.057285873192437e-06, "loss": 3.1746, "mean_token_accuracy": 0.44261096605744127, "step": 15873 }, { "epoch": 2.9428995179829442, "grad_norm": 7.90234375, "learning_rate": 7.057100482017056e-06, "loss": 3.2638, "mean_token_accuracy": 0.4214473950942631, "step": 15874 }, { "epoch": 2.943084909158324, "grad_norm": 8.421875, "learning_rate": 7.056915090841676e-06, "loss": 2.8237, "mean_token_accuracy": 0.5019907100199071, "step": 15875 }, { "epoch": 2.943270300333704, "grad_norm": 8.765625, "learning_rate": 7.0567296996662956e-06, "loss": 3.1895, "mean_token_accuracy": 0.4807560608135872, "step": 15876 }, { "epoch": 2.943455691509084, "grad_norm": 8.0, "learning_rate": 7.056544308490917e-06, "loss": 2.7218, "mean_token_accuracy": 0.5040379438533521, "step": 15877 }, { "epoch": 2.943641082684464, "grad_norm": 9.796875, "learning_rate": 7.0563589173155366e-06, "loss": 3.5964, "mean_token_accuracy": 0.46086576300830784, "step": 15878 }, { "epoch": 2.9438264738598443, "grad_norm": 6.8046875, "learning_rate": 7.056173526140156e-06, "loss": 2.2858, "mean_token_accuracy": 0.5082347200195193, "step": 15879 }, { "epoch": 2.9440118650352245, "grad_norm": 7.51171875, "learning_rate": 7.055988134964777e-06, "loss": 2.7866, "mean_token_accuracy": 0.5028873917228104, "step": 15880 }, { "epoch": 2.9441972562106042, "grad_norm": 9.1953125, "learning_rate": 7.055802743789396e-06, "loss": 3.2958, "mean_token_accuracy": 0.44286459692147145, "step": 15881 }, { "epoch": 2.9443826473859844, "grad_norm": 7.10546875, "learning_rate": 7.055617352614016e-06, "loss": 2.868, "mean_token_accuracy": 0.4928187835643812, "step": 15882 }, { "epoch": 2.9445680385613646, "grad_norm": 8.2265625, "learning_rate": 7.055431961438636e-06, "loss": 2.8836, "mean_token_accuracy": 0.4991249562478124, "step": 15883 }, { "epoch": 2.9447534297367444, "grad_norm": 11.84375, "learning_rate": 7.055246570263255e-06, "loss": 3.754, "mean_token_accuracy": 0.4304707150556953, "step": 15884 }, { "epoch": 2.9449388209121246, "grad_norm": 10.6015625, "learning_rate": 7.055061179087877e-06, "loss": 3.0254, "mean_token_accuracy": 0.47985663082437274, "step": 15885 }, { "epoch": 2.9451242120875047, "grad_norm": 9.8046875, "learning_rate": 7.054875787912496e-06, "loss": 3.5139, "mean_token_accuracy": 0.4581447963800905, "step": 15886 }, { "epoch": 2.945309603262885, "grad_norm": 8.5, "learning_rate": 7.054690396737116e-06, "loss": 3.1766, "mean_token_accuracy": 0.48623853211009177, "step": 15887 }, { "epoch": 2.9454949944382647, "grad_norm": 8.2734375, "learning_rate": 7.0545050055617355e-06, "loss": 3.1931, "mean_token_accuracy": 0.4423548650858545, "step": 15888 }, { "epoch": 2.945680385613645, "grad_norm": 8.6171875, "learning_rate": 7.054319614386356e-06, "loss": 2.6668, "mean_token_accuracy": 0.4884244775372294, "step": 15889 }, { "epoch": 2.9458657767890246, "grad_norm": 7.171875, "learning_rate": 7.054134223210976e-06, "loss": 3.1518, "mean_token_accuracy": 0.46432889963724305, "step": 15890 }, { "epoch": 2.946051167964405, "grad_norm": 7.859375, "learning_rate": 7.053948832035595e-06, "loss": 2.8087, "mean_token_accuracy": 0.4903186768858411, "step": 15891 }, { "epoch": 2.946236559139785, "grad_norm": 7.75390625, "learning_rate": 7.053763440860215e-06, "loss": 3.668, "mean_token_accuracy": 0.430779392338177, "step": 15892 }, { "epoch": 2.946421950315165, "grad_norm": 7.83203125, "learning_rate": 7.053578049684836e-06, "loss": 3.0425, "mean_token_accuracy": 0.5017862459065198, "step": 15893 }, { "epoch": 2.946607341490545, "grad_norm": 8.34375, "learning_rate": 7.053392658509456e-06, "loss": 2.96, "mean_token_accuracy": 0.48180636777128005, "step": 15894 }, { "epoch": 2.946792732665925, "grad_norm": 7.87890625, "learning_rate": 7.053207267334076e-06, "loss": 3.4177, "mean_token_accuracy": 0.4449760765550239, "step": 15895 }, { "epoch": 2.9469781238413053, "grad_norm": 8.7265625, "learning_rate": 7.053021876158695e-06, "loss": 3.4909, "mean_token_accuracy": 0.4472532814778804, "step": 15896 }, { "epoch": 2.947163515016685, "grad_norm": 9.421875, "learning_rate": 7.052836484983316e-06, "loss": 3.6805, "mean_token_accuracy": 0.4512670565302144, "step": 15897 }, { "epoch": 2.9473489061920652, "grad_norm": 14.3046875, "learning_rate": 7.052651093807935e-06, "loss": 3.5733, "mean_token_accuracy": 0.4441244239631336, "step": 15898 }, { "epoch": 2.9475342973674454, "grad_norm": 13.2890625, "learning_rate": 7.052465702632555e-06, "loss": 2.3244, "mean_token_accuracy": 0.5438373570520966, "step": 15899 }, { "epoch": 2.9477196885428256, "grad_norm": 8.2109375, "learning_rate": 7.052280311457175e-06, "loss": 2.7091, "mean_token_accuracy": 0.5077808901338313, "step": 15900 }, { "epoch": 2.9479050797182054, "grad_norm": 7.94140625, "learning_rate": 7.052094920281796e-06, "loss": 2.9722, "mean_token_accuracy": 0.4889607650119533, "step": 15901 }, { "epoch": 2.9480904708935856, "grad_norm": 7.6015625, "learning_rate": 7.051909529106416e-06, "loss": 2.5472, "mean_token_accuracy": 0.5253927658019729, "step": 15902 }, { "epoch": 2.9482758620689653, "grad_norm": 9.1875, "learning_rate": 7.051724137931035e-06, "loss": 2.5659, "mean_token_accuracy": 0.5105979473449352, "step": 15903 }, { "epoch": 2.9484612532443455, "grad_norm": 7.0234375, "learning_rate": 7.051538746755655e-06, "loss": 2.8341, "mean_token_accuracy": 0.4900739587388089, "step": 15904 }, { "epoch": 2.9486466444197257, "grad_norm": 7.5234375, "learning_rate": 7.0513533555802746e-06, "loss": 3.3262, "mean_token_accuracy": 0.44781718963165074, "step": 15905 }, { "epoch": 2.948832035595106, "grad_norm": 6.35546875, "learning_rate": 7.051167964404895e-06, "loss": 2.6839, "mean_token_accuracy": 0.5138317329675355, "step": 15906 }, { "epoch": 2.9490174267704856, "grad_norm": 8.515625, "learning_rate": 7.050982573229515e-06, "loss": 3.434, "mean_token_accuracy": 0.4387510008006405, "step": 15907 }, { "epoch": 2.949202817945866, "grad_norm": 7.4140625, "learning_rate": 7.050797182054134e-06, "loss": 3.2484, "mean_token_accuracy": 0.47089724732086574, "step": 15908 }, { "epoch": 2.9493882091212456, "grad_norm": 8.1328125, "learning_rate": 7.050611790878755e-06, "loss": 4.0605, "mean_token_accuracy": 0.4290044864799321, "step": 15909 }, { "epoch": 2.9495736002966257, "grad_norm": 7.4140625, "learning_rate": 7.050426399703375e-06, "loss": 2.8843, "mean_token_accuracy": 0.4799949193445954, "step": 15910 }, { "epoch": 2.949758991472006, "grad_norm": 8.484375, "learning_rate": 7.050241008527995e-06, "loss": 2.939, "mean_token_accuracy": 0.4675903018307768, "step": 15911 }, { "epoch": 2.949944382647386, "grad_norm": 7.98046875, "learning_rate": 7.050055617352615e-06, "loss": 3.5833, "mean_token_accuracy": 0.44323971260613976, "step": 15912 }, { "epoch": 2.9501297738227663, "grad_norm": 7.421875, "learning_rate": 7.049870226177234e-06, "loss": 2.9019, "mean_token_accuracy": 0.4617356749134726, "step": 15913 }, { "epoch": 2.950315164998146, "grad_norm": 7.1015625, "learning_rate": 7.049684835001854e-06, "loss": 3.5659, "mean_token_accuracy": 0.42948324536639254, "step": 15914 }, { "epoch": 2.9505005561735262, "grad_norm": 10.75, "learning_rate": 7.049499443826474e-06, "loss": 2.5778, "mean_token_accuracy": 0.4866270430906389, "step": 15915 }, { "epoch": 2.950685947348906, "grad_norm": 8.2734375, "learning_rate": 7.049314052651094e-06, "loss": 3.5153, "mean_token_accuracy": 0.4488924472911663, "step": 15916 }, { "epoch": 2.950871338524286, "grad_norm": 10.125, "learning_rate": 7.0491286614757145e-06, "loss": 3.4706, "mean_token_accuracy": 0.45141514253601717, "step": 15917 }, { "epoch": 2.9510567296996664, "grad_norm": 7.8515625, "learning_rate": 7.048943270300335e-06, "loss": 2.5834, "mean_token_accuracy": 0.5095163806552262, "step": 15918 }, { "epoch": 2.9512421208750466, "grad_norm": 7.68359375, "learning_rate": 7.048757879124955e-06, "loss": 3.3423, "mean_token_accuracy": 0.45326460481099656, "step": 15919 }, { "epoch": 2.9514275120504263, "grad_norm": 11.2890625, "learning_rate": 7.048572487949574e-06, "loss": 3.1328, "mean_token_accuracy": 0.4556204974853881, "step": 15920 }, { "epoch": 2.9516129032258065, "grad_norm": 12.3125, "learning_rate": 7.048387096774194e-06, "loss": 2.4693, "mean_token_accuracy": 0.5223765432098766, "step": 15921 }, { "epoch": 2.9517982944011862, "grad_norm": 7.76953125, "learning_rate": 7.048201705598814e-06, "loss": 2.8882, "mean_token_accuracy": 0.5012445181936708, "step": 15922 }, { "epoch": 2.9519836855765664, "grad_norm": 13.0546875, "learning_rate": 7.048016314423434e-06, "loss": 2.9771, "mean_token_accuracy": 0.46348675752121365, "step": 15923 }, { "epoch": 2.9521690767519466, "grad_norm": 11.1171875, "learning_rate": 7.047830923248054e-06, "loss": 2.6287, "mean_token_accuracy": 0.5175762693972342, "step": 15924 }, { "epoch": 2.952354467927327, "grad_norm": 8.7265625, "learning_rate": 7.047645532072674e-06, "loss": 3.4737, "mean_token_accuracy": 0.4410942956926659, "step": 15925 }, { "epoch": 2.9525398591027066, "grad_norm": 14.3125, "learning_rate": 7.047460140897294e-06, "loss": 3.2648, "mean_token_accuracy": 0.43913376353494477, "step": 15926 }, { "epoch": 2.9527252502780867, "grad_norm": 10.828125, "learning_rate": 7.047274749721914e-06, "loss": 3.4699, "mean_token_accuracy": 0.4301675977653631, "step": 15927 }, { "epoch": 2.952910641453467, "grad_norm": 10.7265625, "learning_rate": 7.047089358546534e-06, "loss": 3.2113, "mean_token_accuracy": 0.47706422018348627, "step": 15928 }, { "epoch": 2.9530960326288467, "grad_norm": 6.6953125, "learning_rate": 7.046903967371154e-06, "loss": 2.7962, "mean_token_accuracy": 0.5187223352075607, "step": 15929 }, { "epoch": 2.953281423804227, "grad_norm": 10.984375, "learning_rate": 7.046718576195773e-06, "loss": 2.9269, "mean_token_accuracy": 0.45759463344513657, "step": 15930 }, { "epoch": 2.953466814979607, "grad_norm": 8.4296875, "learning_rate": 7.046533185020393e-06, "loss": 3.4795, "mean_token_accuracy": 0.43736001194564733, "step": 15931 }, { "epoch": 2.9536522061549872, "grad_norm": 7.35546875, "learning_rate": 7.0463477938450134e-06, "loss": 2.5584, "mean_token_accuracy": 0.49147588894301025, "step": 15932 }, { "epoch": 2.953837597330367, "grad_norm": 8.3203125, "learning_rate": 7.046162402669634e-06, "loss": 2.9198, "mean_token_accuracy": 0.45978517160073357, "step": 15933 }, { "epoch": 2.954022988505747, "grad_norm": 7.67578125, "learning_rate": 7.0459770114942536e-06, "loss": 3.1011, "mean_token_accuracy": 0.4479772521649218, "step": 15934 }, { "epoch": 2.954208379681127, "grad_norm": 10.6484375, "learning_rate": 7.045791620318874e-06, "loss": 2.8662, "mean_token_accuracy": 0.47896498688138966, "step": 15935 }, { "epoch": 2.954393770856507, "grad_norm": 9.125, "learning_rate": 7.045606229143494e-06, "loss": 2.7003, "mean_token_accuracy": 0.47024734982332156, "step": 15936 }, { "epoch": 2.9545791620318873, "grad_norm": 8.53125, "learning_rate": 7.045420837968113e-06, "loss": 2.7965, "mean_token_accuracy": 0.4832859656245771, "step": 15937 }, { "epoch": 2.9547645532072675, "grad_norm": 9.734375, "learning_rate": 7.045235446792733e-06, "loss": 2.9187, "mean_token_accuracy": 0.47946319642130947, "step": 15938 }, { "epoch": 2.9549499443826472, "grad_norm": 7.44921875, "learning_rate": 7.045050055617353e-06, "loss": 3.5564, "mean_token_accuracy": 0.4570008643042351, "step": 15939 }, { "epoch": 2.9551353355580274, "grad_norm": 8.2421875, "learning_rate": 7.044864664441972e-06, "loss": 2.6324, "mean_token_accuracy": 0.5231325601695972, "step": 15940 }, { "epoch": 2.9553207267334076, "grad_norm": 9.328125, "learning_rate": 7.044679273266594e-06, "loss": 3.3131, "mean_token_accuracy": 0.4732387923147301, "step": 15941 }, { "epoch": 2.9555061179087874, "grad_norm": 9.953125, "learning_rate": 7.044493882091213e-06, "loss": 3.0333, "mean_token_accuracy": 0.49283449587824985, "step": 15942 }, { "epoch": 2.9556915090841676, "grad_norm": 9.8203125, "learning_rate": 7.044308490915833e-06, "loss": 3.4776, "mean_token_accuracy": 0.43864057127743983, "step": 15943 }, { "epoch": 2.9558769002595477, "grad_norm": 8.8125, "learning_rate": 7.044123099740453e-06, "loss": 2.5535, "mean_token_accuracy": 0.5170561765132821, "step": 15944 }, { "epoch": 2.956062291434928, "grad_norm": 8.0078125, "learning_rate": 7.043937708565073e-06, "loss": 3.1252, "mean_token_accuracy": 0.472607349575986, "step": 15945 }, { "epoch": 2.9562476826103077, "grad_norm": 10.5546875, "learning_rate": 7.043752317389693e-06, "loss": 3.488, "mean_token_accuracy": 0.4619138922781942, "step": 15946 }, { "epoch": 2.956433073785688, "grad_norm": 8.4375, "learning_rate": 7.043566926214312e-06, "loss": 2.9627, "mean_token_accuracy": 0.46504001049455596, "step": 15947 }, { "epoch": 2.9566184649610676, "grad_norm": 8.203125, "learning_rate": 7.043381535038932e-06, "loss": 2.8816, "mean_token_accuracy": 0.5276369481884506, "step": 15948 }, { "epoch": 2.956803856136448, "grad_norm": 8.8984375, "learning_rate": 7.043196143863553e-06, "loss": 3.1179, "mean_token_accuracy": 0.43981431501673324, "step": 15949 }, { "epoch": 2.956989247311828, "grad_norm": 9.6953125, "learning_rate": 7.043010752688173e-06, "loss": 2.6075, "mean_token_accuracy": 0.4959016393442623, "step": 15950 }, { "epoch": 2.957174638487208, "grad_norm": 8.1875, "learning_rate": 7.042825361512793e-06, "loss": 2.9207, "mean_token_accuracy": 0.4968211767425731, "step": 15951 }, { "epoch": 2.957360029662588, "grad_norm": 7.27734375, "learning_rate": 7.042639970337412e-06, "loss": 3.4572, "mean_token_accuracy": 0.4350783432527633, "step": 15952 }, { "epoch": 2.957545420837968, "grad_norm": 8.7421875, "learning_rate": 7.042454579162033e-06, "loss": 3.4821, "mean_token_accuracy": 0.44428301612668897, "step": 15953 }, { "epoch": 2.9577308120133483, "grad_norm": 8.8046875, "learning_rate": 7.042269187986652e-06, "loss": 3.2571, "mean_token_accuracy": 0.4592705167173252, "step": 15954 }, { "epoch": 2.957916203188728, "grad_norm": 11.2734375, "learning_rate": 7.042083796811272e-06, "loss": 3.7642, "mean_token_accuracy": 0.43663609352796795, "step": 15955 }, { "epoch": 2.9581015943641082, "grad_norm": 9.1328125, "learning_rate": 7.041898405635892e-06, "loss": 2.7363, "mean_token_accuracy": 0.5093948000873935, "step": 15956 }, { "epoch": 2.9582869855394884, "grad_norm": 12.09375, "learning_rate": 7.041713014460513e-06, "loss": 2.2971, "mean_token_accuracy": 0.5192258721670486, "step": 15957 }, { "epoch": 2.9584723767148686, "grad_norm": 6.8984375, "learning_rate": 7.041527623285133e-06, "loss": 2.7576, "mean_token_accuracy": 0.5030736240171552, "step": 15958 }, { "epoch": 2.9586577678902484, "grad_norm": 8.46875, "learning_rate": 7.041342232109752e-06, "loss": 2.5126, "mean_token_accuracy": 0.5742867084203201, "step": 15959 }, { "epoch": 2.9588431590656286, "grad_norm": 9.890625, "learning_rate": 7.041156840934372e-06, "loss": 3.125, "mean_token_accuracy": 0.49262536873156343, "step": 15960 }, { "epoch": 2.9590285502410083, "grad_norm": 7.26953125, "learning_rate": 7.0409714497589924e-06, "loss": 2.881, "mean_token_accuracy": 0.4847213900539245, "step": 15961 }, { "epoch": 2.9592139414163885, "grad_norm": 9.1328125, "learning_rate": 7.040786058583612e-06, "loss": 3.8667, "mean_token_accuracy": 0.4388256777637309, "step": 15962 }, { "epoch": 2.9593993325917687, "grad_norm": 12.109375, "learning_rate": 7.040600667408232e-06, "loss": 3.0998, "mean_token_accuracy": 0.46522265572447197, "step": 15963 }, { "epoch": 2.959584723767149, "grad_norm": 9.96875, "learning_rate": 7.040415276232851e-06, "loss": 2.9649, "mean_token_accuracy": 0.47561106923656726, "step": 15964 }, { "epoch": 2.9597701149425286, "grad_norm": 12.71875, "learning_rate": 7.040229885057471e-06, "loss": 3.1712, "mean_token_accuracy": 0.44451434323100153, "step": 15965 }, { "epoch": 2.959955506117909, "grad_norm": 11.9453125, "learning_rate": 7.040044493882092e-06, "loss": 3.1579, "mean_token_accuracy": 0.44703165592225536, "step": 15966 }, { "epoch": 2.960140897293289, "grad_norm": 11.96875, "learning_rate": 7.039859102706712e-06, "loss": 2.86, "mean_token_accuracy": 0.47367840194239047, "step": 15967 }, { "epoch": 2.9603262884686687, "grad_norm": 7.578125, "learning_rate": 7.039673711531332e-06, "loss": 3.3039, "mean_token_accuracy": 0.4448326383410346, "step": 15968 }, { "epoch": 2.960511679644049, "grad_norm": 8.625, "learning_rate": 7.039488320355951e-06, "loss": 2.9844, "mean_token_accuracy": 0.4704473850031506, "step": 15969 }, { "epoch": 2.960697070819429, "grad_norm": 13.1484375, "learning_rate": 7.039302929180572e-06, "loss": 2.9881, "mean_token_accuracy": 0.47422940879231934, "step": 15970 }, { "epoch": 2.9608824619948093, "grad_norm": 9.390625, "learning_rate": 7.0391175380051914e-06, "loss": 3.1128, "mean_token_accuracy": 0.46662584200857316, "step": 15971 }, { "epoch": 2.961067853170189, "grad_norm": 8.625, "learning_rate": 7.038932146829811e-06, "loss": 2.8655, "mean_token_accuracy": 0.48585209003215435, "step": 15972 }, { "epoch": 2.9612532443455692, "grad_norm": 10.234375, "learning_rate": 7.038746755654431e-06, "loss": 4.0741, "mean_token_accuracy": 0.41977800201816345, "step": 15973 }, { "epoch": 2.961438635520949, "grad_norm": 12.8125, "learning_rate": 7.038561364479052e-06, "loss": 2.6558, "mean_token_accuracy": 0.4894084215964867, "step": 15974 }, { "epoch": 2.961624026696329, "grad_norm": 8.4296875, "learning_rate": 7.038375973303672e-06, "loss": 2.656, "mean_token_accuracy": 0.5058947368421053, "step": 15975 }, { "epoch": 2.9618094178717094, "grad_norm": 7.4453125, "learning_rate": 7.038190582128291e-06, "loss": 2.7776, "mean_token_accuracy": 0.47499155119972963, "step": 15976 }, { "epoch": 2.9619948090470896, "grad_norm": 10.703125, "learning_rate": 7.038005190952911e-06, "loss": 3.1101, "mean_token_accuracy": 0.4675461125379407, "step": 15977 }, { "epoch": 2.9621802002224693, "grad_norm": 10.2421875, "learning_rate": 7.037819799777531e-06, "loss": 3.0675, "mean_token_accuracy": 0.46943353897924844, "step": 15978 }, { "epoch": 2.9623655913978495, "grad_norm": 8.828125, "learning_rate": 7.037634408602151e-06, "loss": 3.3477, "mean_token_accuracy": 0.443212016175621, "step": 15979 }, { "epoch": 2.9625509825732292, "grad_norm": 10.625, "learning_rate": 7.037449017426771e-06, "loss": 3.7612, "mean_token_accuracy": 0.4231354642313546, "step": 15980 }, { "epoch": 2.9627363737486094, "grad_norm": 7.42578125, "learning_rate": 7.03726362625139e-06, "loss": 2.4887, "mean_token_accuracy": 0.525781910397295, "step": 15981 }, { "epoch": 2.9629217649239896, "grad_norm": 7.6640625, "learning_rate": 7.037078235076012e-06, "loss": 2.5001, "mean_token_accuracy": 0.5222390700025271, "step": 15982 }, { "epoch": 2.96310715609937, "grad_norm": 9.1015625, "learning_rate": 7.036892843900631e-06, "loss": 3.262, "mean_token_accuracy": 0.44072321320228125, "step": 15983 }, { "epoch": 2.9632925472747496, "grad_norm": 8.3125, "learning_rate": 7.036707452725251e-06, "loss": 2.8988, "mean_token_accuracy": 0.4717436250861475, "step": 15984 }, { "epoch": 2.9634779384501297, "grad_norm": 10.9140625, "learning_rate": 7.036522061549871e-06, "loss": 2.8953, "mean_token_accuracy": 0.48127544097693353, "step": 15985 }, { "epoch": 2.96366332962551, "grad_norm": 8.6875, "learning_rate": 7.03633667037449e-06, "loss": 2.9986, "mean_token_accuracy": 0.5012654223347042, "step": 15986 }, { "epoch": 2.9638487208008897, "grad_norm": 9.0546875, "learning_rate": 7.036151279199111e-06, "loss": 3.3258, "mean_token_accuracy": 0.45209257704760764, "step": 15987 }, { "epoch": 2.96403411197627, "grad_norm": 9.1484375, "learning_rate": 7.0359658880237305e-06, "loss": 2.5993, "mean_token_accuracy": 0.4924396090724691, "step": 15988 }, { "epoch": 2.96421950315165, "grad_norm": 7.6171875, "learning_rate": 7.03578049684835e-06, "loss": 2.9731, "mean_token_accuracy": 0.46687299403943144, "step": 15989 }, { "epoch": 2.9644048943270302, "grad_norm": 8.2265625, "learning_rate": 7.035595105672971e-06, "loss": 3.6356, "mean_token_accuracy": 0.430504884076733, "step": 15990 }, { "epoch": 2.96459028550241, "grad_norm": 10.5625, "learning_rate": 7.035409714497591e-06, "loss": 2.7833, "mean_token_accuracy": 0.48601190476190476, "step": 15991 }, { "epoch": 2.96477567667779, "grad_norm": 9.265625, "learning_rate": 7.035224323322211e-06, "loss": 2.7849, "mean_token_accuracy": 0.48442224267122386, "step": 15992 }, { "epoch": 2.96496106785317, "grad_norm": 7.3125, "learning_rate": 7.03503893214683e-06, "loss": 2.9016, "mean_token_accuracy": 0.46602518412924687, "step": 15993 }, { "epoch": 2.96514645902855, "grad_norm": 8.1328125, "learning_rate": 7.03485354097145e-06, "loss": 3.4631, "mean_token_accuracy": 0.45498185224212623, "step": 15994 }, { "epoch": 2.9653318502039303, "grad_norm": 9.0859375, "learning_rate": 7.03466814979607e-06, "loss": 2.3814, "mean_token_accuracy": 0.5729805854541796, "step": 15995 }, { "epoch": 2.9655172413793105, "grad_norm": 10.7578125, "learning_rate": 7.03448275862069e-06, "loss": 3.2046, "mean_token_accuracy": 0.45206151832460734, "step": 15996 }, { "epoch": 2.9657026325546902, "grad_norm": 8.9140625, "learning_rate": 7.03429736744531e-06, "loss": 2.9143, "mean_token_accuracy": 0.4868483412322275, "step": 15997 }, { "epoch": 2.9658880237300704, "grad_norm": 9.6640625, "learning_rate": 7.03411197626993e-06, "loss": 3.0945, "mean_token_accuracy": 0.4782448377581121, "step": 15998 }, { "epoch": 2.9660734149054506, "grad_norm": 9.7421875, "learning_rate": 7.033926585094551e-06, "loss": 2.4094, "mean_token_accuracy": 0.5456736286435621, "step": 15999 }, { "epoch": 2.9662588060808304, "grad_norm": 10.5234375, "learning_rate": 7.0337411939191704e-06, "loss": 3.2685, "mean_token_accuracy": 0.46460071513706797, "step": 16000 }, { "epoch": 2.9664441972562106, "grad_norm": 7.34375, "learning_rate": 7.03355580274379e-06, "loss": 2.6325, "mean_token_accuracy": 0.5374576097009557, "step": 16001 }, { "epoch": 2.9666295884315907, "grad_norm": 11.1171875, "learning_rate": 7.03337041156841e-06, "loss": 2.57, "mean_token_accuracy": 0.5259748197041927, "step": 16002 }, { "epoch": 2.966814979606971, "grad_norm": 9.703125, "learning_rate": 7.033185020393029e-06, "loss": 3.2965, "mean_token_accuracy": 0.462882096069869, "step": 16003 }, { "epoch": 2.9670003707823507, "grad_norm": 8.8828125, "learning_rate": 7.03299962921765e-06, "loss": 2.7615, "mean_token_accuracy": 0.4979187071498531, "step": 16004 }, { "epoch": 2.967185761957731, "grad_norm": 9.265625, "learning_rate": 7.0328142380422695e-06, "loss": 2.9219, "mean_token_accuracy": 0.47659334461364916, "step": 16005 }, { "epoch": 2.9673711531331106, "grad_norm": 14.2265625, "learning_rate": 7.03262884686689e-06, "loss": 2.4375, "mean_token_accuracy": 0.5098587610124458, "step": 16006 }, { "epoch": 2.967556544308491, "grad_norm": 15.375, "learning_rate": 7.03244345569151e-06, "loss": 2.981, "mean_token_accuracy": 0.4940902280672549, "step": 16007 }, { "epoch": 2.967741935483871, "grad_norm": 9.890625, "learning_rate": 7.03225806451613e-06, "loss": 3.5057, "mean_token_accuracy": 0.4452264381884945, "step": 16008 }, { "epoch": 2.967927326659251, "grad_norm": 7.5703125, "learning_rate": 7.03207267334075e-06, "loss": 2.325, "mean_token_accuracy": 0.5525699324789858, "step": 16009 }, { "epoch": 2.968112717834631, "grad_norm": 10.046875, "learning_rate": 7.031887282165369e-06, "loss": 2.5036, "mean_token_accuracy": 0.5273972602739726, "step": 16010 }, { "epoch": 2.968298109010011, "grad_norm": 13.3671875, "learning_rate": 7.031701890989989e-06, "loss": 2.7802, "mean_token_accuracy": 0.4963719099741729, "step": 16011 }, { "epoch": 2.9684835001853913, "grad_norm": 8.7578125, "learning_rate": 7.031516499814609e-06, "loss": 4.1978, "mean_token_accuracy": 0.4329847337420363, "step": 16012 }, { "epoch": 2.968668891360771, "grad_norm": 8.046875, "learning_rate": 7.031331108639229e-06, "loss": 2.6592, "mean_token_accuracy": 0.523026851098454, "step": 16013 }, { "epoch": 2.9688542825361512, "grad_norm": 8.7890625, "learning_rate": 7.03114571746385e-06, "loss": 2.4782, "mean_token_accuracy": 0.5320798047940289, "step": 16014 }, { "epoch": 2.9690396737115314, "grad_norm": 6.7109375, "learning_rate": 7.030960326288469e-06, "loss": 2.9751, "mean_token_accuracy": 0.48104300914131576, "step": 16015 }, { "epoch": 2.9692250648869116, "grad_norm": 10.2734375, "learning_rate": 7.03077493511309e-06, "loss": 3.2134, "mean_token_accuracy": 0.4644207066557108, "step": 16016 }, { "epoch": 2.9694104560622914, "grad_norm": 7.9140625, "learning_rate": 7.0305895439377095e-06, "loss": 3.2282, "mean_token_accuracy": 0.45654853620955316, "step": 16017 }, { "epoch": 2.9695958472376716, "grad_norm": 6.90234375, "learning_rate": 7.030404152762329e-06, "loss": 2.8448, "mean_token_accuracy": 0.500245941957698, "step": 16018 }, { "epoch": 2.9697812384130513, "grad_norm": 7.83203125, "learning_rate": 7.030218761586949e-06, "loss": 2.1174, "mean_token_accuracy": 0.5502103946320938, "step": 16019 }, { "epoch": 2.9699666295884315, "grad_norm": 8.6640625, "learning_rate": 7.030033370411568e-06, "loss": 3.7031, "mean_token_accuracy": 0.42154901400400113, "step": 16020 }, { "epoch": 2.9701520207638117, "grad_norm": 7.4296875, "learning_rate": 7.029847979236188e-06, "loss": 3.3452, "mean_token_accuracy": 0.4250132485426603, "step": 16021 }, { "epoch": 2.970337411939192, "grad_norm": 9.1875, "learning_rate": 7.029662588060809e-06, "loss": 3.2503, "mean_token_accuracy": 0.44337160751565763, "step": 16022 }, { "epoch": 2.9705228031145716, "grad_norm": 9.921875, "learning_rate": 7.029477196885429e-06, "loss": 4.0854, "mean_token_accuracy": 0.43558636626227615, "step": 16023 }, { "epoch": 2.970708194289952, "grad_norm": 9.2109375, "learning_rate": 7.029291805710049e-06, "loss": 2.7537, "mean_token_accuracy": 0.4854524844133762, "step": 16024 }, { "epoch": 2.970893585465332, "grad_norm": 8.375, "learning_rate": 7.029106414534669e-06, "loss": 2.9821, "mean_token_accuracy": 0.46059799055804385, "step": 16025 }, { "epoch": 2.9710789766407117, "grad_norm": 13.171875, "learning_rate": 7.028921023359289e-06, "loss": 2.8012, "mean_token_accuracy": 0.46776532630191164, "step": 16026 }, { "epoch": 2.971264367816092, "grad_norm": 11.9375, "learning_rate": 7.0287356321839084e-06, "loss": 3.0087, "mean_token_accuracy": 0.46303849095080296, "step": 16027 }, { "epoch": 2.971449758991472, "grad_norm": 8.9765625, "learning_rate": 7.028550241008528e-06, "loss": 2.7989, "mean_token_accuracy": 0.4931907852870052, "step": 16028 }, { "epoch": 2.9716351501668523, "grad_norm": 8.5, "learning_rate": 7.028364849833148e-06, "loss": 2.8601, "mean_token_accuracy": 0.49517241379310345, "step": 16029 }, { "epoch": 2.971820541342232, "grad_norm": 11.4453125, "learning_rate": 7.028179458657769e-06, "loss": 3.0025, "mean_token_accuracy": 0.4830301095861262, "step": 16030 }, { "epoch": 2.9720059325176122, "grad_norm": 9.9609375, "learning_rate": 7.027994067482389e-06, "loss": 2.751, "mean_token_accuracy": 0.47950068712780575, "step": 16031 }, { "epoch": 2.972191323692992, "grad_norm": 7.42578125, "learning_rate": 7.027808676307008e-06, "loss": 3.3981, "mean_token_accuracy": 0.4720408405556215, "step": 16032 }, { "epoch": 2.972376714868372, "grad_norm": 8.0234375, "learning_rate": 7.027623285131628e-06, "loss": 3.1487, "mean_token_accuracy": 0.4673048091338946, "step": 16033 }, { "epoch": 2.9725621060437524, "grad_norm": 9.234375, "learning_rate": 7.0274378939562485e-06, "loss": 2.7589, "mean_token_accuracy": 0.5017758046614872, "step": 16034 }, { "epoch": 2.9727474972191326, "grad_norm": 9.421875, "learning_rate": 7.027252502780868e-06, "loss": 3.4721, "mean_token_accuracy": 0.42004200420042004, "step": 16035 }, { "epoch": 2.9729328883945123, "grad_norm": 10.4453125, "learning_rate": 7.027067111605488e-06, "loss": 3.3667, "mean_token_accuracy": 0.475139146567718, "step": 16036 }, { "epoch": 2.9731182795698925, "grad_norm": 7.30078125, "learning_rate": 7.0268817204301074e-06, "loss": 2.9765, "mean_token_accuracy": 0.47670563601465715, "step": 16037 }, { "epoch": 2.9733036707452727, "grad_norm": 8.3984375, "learning_rate": 7.026696329254729e-06, "loss": 2.813, "mean_token_accuracy": 0.5112575507962658, "step": 16038 }, { "epoch": 2.9734890619206524, "grad_norm": 13.109375, "learning_rate": 7.026510938079348e-06, "loss": 2.6759, "mean_token_accuracy": 0.5167730322369498, "step": 16039 }, { "epoch": 2.9736744530960326, "grad_norm": 8.3046875, "learning_rate": 7.026325546903968e-06, "loss": 2.5048, "mean_token_accuracy": 0.5065089933419458, "step": 16040 }, { "epoch": 2.973859844271413, "grad_norm": 13.75, "learning_rate": 7.026140155728588e-06, "loss": 3.1133, "mean_token_accuracy": 0.4612871146469669, "step": 16041 }, { "epoch": 2.974045235446793, "grad_norm": 9.7734375, "learning_rate": 7.025954764553208e-06, "loss": 3.5767, "mean_token_accuracy": 0.4438731790916881, "step": 16042 }, { "epoch": 2.9742306266221727, "grad_norm": 13.0625, "learning_rate": 7.025769373377828e-06, "loss": 3.3915, "mean_token_accuracy": 0.43561134356113435, "step": 16043 }, { "epoch": 2.974416017797553, "grad_norm": 9.796875, "learning_rate": 7.0255839822024475e-06, "loss": 2.8212, "mean_token_accuracy": 0.48490329762070405, "step": 16044 }, { "epoch": 2.9746014089729327, "grad_norm": 8.3984375, "learning_rate": 7.025398591027067e-06, "loss": 3.1151, "mean_token_accuracy": 0.46595946801773275, "step": 16045 }, { "epoch": 2.974786800148313, "grad_norm": 13.484375, "learning_rate": 7.0252131998516885e-06, "loss": 3.0944, "mean_token_accuracy": 0.46706963858456113, "step": 16046 }, { "epoch": 2.974972191323693, "grad_norm": 9.8515625, "learning_rate": 7.025027808676308e-06, "loss": 2.24, "mean_token_accuracy": 0.5503073070794445, "step": 16047 }, { "epoch": 2.9751575824990732, "grad_norm": 11.9609375, "learning_rate": 7.024842417500928e-06, "loss": 2.5877, "mean_token_accuracy": 0.4979764209044519, "step": 16048 }, { "epoch": 2.975342973674453, "grad_norm": 10.2421875, "learning_rate": 7.024657026325547e-06, "loss": 3.1334, "mean_token_accuracy": 0.4633265628526292, "step": 16049 }, { "epoch": 2.975528364849833, "grad_norm": 12.5390625, "learning_rate": 7.024471635150167e-06, "loss": 2.5981, "mean_token_accuracy": 0.5037894446741078, "step": 16050 }, { "epoch": 2.975713756025213, "grad_norm": 9.9453125, "learning_rate": 7.0242862439747875e-06, "loss": 2.7976, "mean_token_accuracy": 0.493721524613846, "step": 16051 }, { "epoch": 2.975899147200593, "grad_norm": 9.1640625, "learning_rate": 7.024100852799407e-06, "loss": 2.9564, "mean_token_accuracy": 0.47714681440443213, "step": 16052 }, { "epoch": 2.9760845383759733, "grad_norm": 10.6796875, "learning_rate": 7.023915461624027e-06, "loss": 3.7299, "mean_token_accuracy": 0.44403393541324576, "step": 16053 }, { "epoch": 2.9762699295513535, "grad_norm": 10.3125, "learning_rate": 7.023730070448648e-06, "loss": 2.6719, "mean_token_accuracy": 0.4824371791407728, "step": 16054 }, { "epoch": 2.9764553207267332, "grad_norm": 8.8984375, "learning_rate": 7.023544679273268e-06, "loss": 2.8284, "mean_token_accuracy": 0.4823960880195599, "step": 16055 }, { "epoch": 2.9766407119021134, "grad_norm": 7.52734375, "learning_rate": 7.0233592880978874e-06, "loss": 2.8217, "mean_token_accuracy": 0.4842640669704355, "step": 16056 }, { "epoch": 2.9768261030774936, "grad_norm": 10.1015625, "learning_rate": 7.023173896922507e-06, "loss": 2.6674, "mean_token_accuracy": 0.5064112467171327, "step": 16057 }, { "epoch": 2.9770114942528734, "grad_norm": 8.8359375, "learning_rate": 7.022988505747127e-06, "loss": 2.3997, "mean_token_accuracy": 0.5429324629676123, "step": 16058 }, { "epoch": 2.9771968854282536, "grad_norm": 9.4296875, "learning_rate": 7.022803114571746e-06, "loss": 3.2473, "mean_token_accuracy": 0.4431498079385403, "step": 16059 }, { "epoch": 2.9773822766036337, "grad_norm": 8.25, "learning_rate": 7.022617723396367e-06, "loss": 3.1666, "mean_token_accuracy": 0.4750362271110526, "step": 16060 }, { "epoch": 2.977567667779014, "grad_norm": 7.26953125, "learning_rate": 7.0224323322209865e-06, "loss": 2.9514, "mean_token_accuracy": 0.4817637452367991, "step": 16061 }, { "epoch": 2.9777530589543937, "grad_norm": 8.21875, "learning_rate": 7.022246941045607e-06, "loss": 2.9749, "mean_token_accuracy": 0.477124183006536, "step": 16062 }, { "epoch": 2.977938450129774, "grad_norm": 9.1875, "learning_rate": 7.0220615498702275e-06, "loss": 3.0178, "mean_token_accuracy": 0.4651747881355932, "step": 16063 }, { "epoch": 2.9781238413051536, "grad_norm": 8.5234375, "learning_rate": 7.021876158694847e-06, "loss": 3.2032, "mean_token_accuracy": 0.46174981923355024, "step": 16064 }, { "epoch": 2.978309232480534, "grad_norm": 7.734375, "learning_rate": 7.021690767519467e-06, "loss": 3.4314, "mean_token_accuracy": 0.4638534147695725, "step": 16065 }, { "epoch": 2.978494623655914, "grad_norm": 8.078125, "learning_rate": 7.0215053763440864e-06, "loss": 2.4013, "mean_token_accuracy": 0.5224535734383793, "step": 16066 }, { "epoch": 2.978680014831294, "grad_norm": 9.640625, "learning_rate": 7.021319985168706e-06, "loss": 2.583, "mean_token_accuracy": 0.49778237729154345, "step": 16067 }, { "epoch": 2.978865406006674, "grad_norm": 7.625, "learning_rate": 7.0211345939933266e-06, "loss": 2.7554, "mean_token_accuracy": 0.4992889463477699, "step": 16068 }, { "epoch": 2.979050797182054, "grad_norm": 8.1015625, "learning_rate": 7.020949202817946e-06, "loss": 2.3226, "mean_token_accuracy": 0.5322555812163202, "step": 16069 }, { "epoch": 2.9792361883574343, "grad_norm": 7.14453125, "learning_rate": 7.020763811642567e-06, "loss": 2.5209, "mean_token_accuracy": 0.5455287872601061, "step": 16070 }, { "epoch": 2.979421579532814, "grad_norm": 10.3203125, "learning_rate": 7.020578420467186e-06, "loss": 3.1868, "mean_token_accuracy": 0.48586883029073696, "step": 16071 }, { "epoch": 2.9796069707081942, "grad_norm": 7.80859375, "learning_rate": 7.020393029291807e-06, "loss": 2.8225, "mean_token_accuracy": 0.4572822065253952, "step": 16072 }, { "epoch": 2.9797923618835744, "grad_norm": 9.875, "learning_rate": 7.0202076381164265e-06, "loss": 2.9148, "mean_token_accuracy": 0.47517433751743376, "step": 16073 }, { "epoch": 2.9799777530589546, "grad_norm": 8.5859375, "learning_rate": 7.020022246941046e-06, "loss": 2.6503, "mean_token_accuracy": 0.48443962970258025, "step": 16074 }, { "epoch": 2.9801631442343344, "grad_norm": 7.15234375, "learning_rate": 7.019836855765666e-06, "loss": 2.3383, "mean_token_accuracy": 0.5538851827492248, "step": 16075 }, { "epoch": 2.9803485354097146, "grad_norm": 8.0078125, "learning_rate": 7.019651464590285e-06, "loss": 3.6542, "mean_token_accuracy": 0.419425763062597, "step": 16076 }, { "epoch": 2.9805339265850943, "grad_norm": 13.625, "learning_rate": 7.019466073414906e-06, "loss": 3.0616, "mean_token_accuracy": 0.44610814022578726, "step": 16077 }, { "epoch": 2.9807193177604745, "grad_norm": 9.1640625, "learning_rate": 7.019280682239526e-06, "loss": 2.9022, "mean_token_accuracy": 0.4836117041316087, "step": 16078 }, { "epoch": 2.9809047089358547, "grad_norm": 9.1875, "learning_rate": 7.019095291064146e-06, "loss": 3.2806, "mean_token_accuracy": 0.47354441138971526, "step": 16079 }, { "epoch": 2.981090100111235, "grad_norm": 12.640625, "learning_rate": 7.0189098998887665e-06, "loss": 2.7462, "mean_token_accuracy": 0.5073055153350007, "step": 16080 }, { "epoch": 2.9812754912866146, "grad_norm": 10.0859375, "learning_rate": 7.018724508713386e-06, "loss": 3.2699, "mean_token_accuracy": 0.47749820273184757, "step": 16081 }, { "epoch": 2.981460882461995, "grad_norm": 10.1171875, "learning_rate": 7.018539117538006e-06, "loss": 3.2011, "mean_token_accuracy": 0.4343350686228112, "step": 16082 }, { "epoch": 2.981646273637375, "grad_norm": 7.828125, "learning_rate": 7.0183537263626255e-06, "loss": 2.7871, "mean_token_accuracy": 0.47734790737811017, "step": 16083 }, { "epoch": 2.9818316648127547, "grad_norm": 9.6015625, "learning_rate": 7.018168335187245e-06, "loss": 3.187, "mean_token_accuracy": 0.4684385382059801, "step": 16084 }, { "epoch": 2.982017055988135, "grad_norm": 8.4921875, "learning_rate": 7.017982944011866e-06, "loss": 3.9207, "mean_token_accuracy": 0.4404518453598208, "step": 16085 }, { "epoch": 2.982202447163515, "grad_norm": 7.53125, "learning_rate": 7.017797552836485e-06, "loss": 3.185, "mean_token_accuracy": 0.4580178940123882, "step": 16086 }, { "epoch": 2.9823878383388953, "grad_norm": 8.4921875, "learning_rate": 7.017612161661106e-06, "loss": 3.0809, "mean_token_accuracy": 0.4618580060422961, "step": 16087 }, { "epoch": 2.982573229514275, "grad_norm": 9.1328125, "learning_rate": 7.017426770485725e-06, "loss": 3.334, "mean_token_accuracy": 0.47300271915058917, "step": 16088 }, { "epoch": 2.9827586206896552, "grad_norm": 7.6015625, "learning_rate": 7.017241379310346e-06, "loss": 2.8521, "mean_token_accuracy": 0.4849163937252198, "step": 16089 }, { "epoch": 2.982944011865035, "grad_norm": 7.09765625, "learning_rate": 7.0170559881349655e-06, "loss": 2.6224, "mean_token_accuracy": 0.5033280507131537, "step": 16090 }, { "epoch": 2.983129403040415, "grad_norm": 7.75, "learning_rate": 7.016870596959585e-06, "loss": 3.0301, "mean_token_accuracy": 0.465818759936407, "step": 16091 }, { "epoch": 2.9833147942157954, "grad_norm": 9.7734375, "learning_rate": 7.016685205784205e-06, "loss": 2.6424, "mean_token_accuracy": 0.5103589145024803, "step": 16092 }, { "epoch": 2.9835001853911756, "grad_norm": 10.15625, "learning_rate": 7.0164998146088244e-06, "loss": 2.9568, "mean_token_accuracy": 0.4986690328305235, "step": 16093 }, { "epoch": 2.9836855765665553, "grad_norm": 7.8203125, "learning_rate": 7.016314423433445e-06, "loss": 2.7042, "mean_token_accuracy": 0.5057142857142857, "step": 16094 }, { "epoch": 2.9838709677419355, "grad_norm": 6.52734375, "learning_rate": 7.0161290322580654e-06, "loss": 3.4039, "mean_token_accuracy": 0.4137034849379799, "step": 16095 }, { "epoch": 2.9840563589173157, "grad_norm": 8.765625, "learning_rate": 7.015943641082685e-06, "loss": 2.7366, "mean_token_accuracy": 0.4836190062920373, "step": 16096 }, { "epoch": 2.9842417500926954, "grad_norm": 8.515625, "learning_rate": 7.015758249907305e-06, "loss": 2.5543, "mean_token_accuracy": 0.5060856498873028, "step": 16097 }, { "epoch": 2.9844271412680756, "grad_norm": 7.66796875, "learning_rate": 7.015572858731925e-06, "loss": 3.3866, "mean_token_accuracy": 0.45870462942838514, "step": 16098 }, { "epoch": 2.984612532443456, "grad_norm": 8.984375, "learning_rate": 7.015387467556545e-06, "loss": 2.5897, "mean_token_accuracy": 0.5014084507042254, "step": 16099 }, { "epoch": 2.984797923618836, "grad_norm": 10.359375, "learning_rate": 7.0152020763811645e-06, "loss": 3.033, "mean_token_accuracy": 0.5031285914953391, "step": 16100 }, { "epoch": 2.9849833147942157, "grad_norm": 8.0234375, "learning_rate": 7.015016685205784e-06, "loss": 2.8215, "mean_token_accuracy": 0.4845119091694434, "step": 16101 }, { "epoch": 2.985168705969596, "grad_norm": 8.171875, "learning_rate": 7.014831294030404e-06, "loss": 2.7788, "mean_token_accuracy": 0.4604266067920292, "step": 16102 }, { "epoch": 2.9853540971449757, "grad_norm": 7.1015625, "learning_rate": 7.014645902855025e-06, "loss": 3.1886, "mean_token_accuracy": 0.4381831085876508, "step": 16103 }, { "epoch": 2.985539488320356, "grad_norm": 8.65625, "learning_rate": 7.014460511679645e-06, "loss": 2.8462, "mean_token_accuracy": 0.48796972154636387, "step": 16104 }, { "epoch": 2.985724879495736, "grad_norm": 9.4921875, "learning_rate": 7.014275120504264e-06, "loss": 3.5178, "mean_token_accuracy": 0.4276771443793287, "step": 16105 }, { "epoch": 2.9859102706711163, "grad_norm": 7.375, "learning_rate": 7.014089729328885e-06, "loss": 2.2633, "mean_token_accuracy": 0.5501248662147699, "step": 16106 }, { "epoch": 2.986095661846496, "grad_norm": 8.5625, "learning_rate": 7.0139043381535046e-06, "loss": 3.3164, "mean_token_accuracy": 0.44238038665082624, "step": 16107 }, { "epoch": 2.986281053021876, "grad_norm": 7.73046875, "learning_rate": 7.013718946978124e-06, "loss": 2.2746, "mean_token_accuracy": 0.5262465465070385, "step": 16108 }, { "epoch": 2.9864664441972564, "grad_norm": 8.2265625, "learning_rate": 7.013533555802744e-06, "loss": 3.2038, "mean_token_accuracy": 0.46635200974421437, "step": 16109 }, { "epoch": 2.986651835372636, "grad_norm": 10.2890625, "learning_rate": 7.0133481646273635e-06, "loss": 4.4025, "mean_token_accuracy": 0.40478026214340784, "step": 16110 }, { "epoch": 2.9868372265480163, "grad_norm": 8.2109375, "learning_rate": 7.013162773451985e-06, "loss": 2.2861, "mean_token_accuracy": 0.5616968357054027, "step": 16111 }, { "epoch": 2.9870226177233965, "grad_norm": 7.76171875, "learning_rate": 7.0129773822766045e-06, "loss": 2.6689, "mean_token_accuracy": 0.4896465174649655, "step": 16112 }, { "epoch": 2.9872080088987767, "grad_norm": 9.03125, "learning_rate": 7.012791991101224e-06, "loss": 3.5308, "mean_token_accuracy": 0.44295400943396224, "step": 16113 }, { "epoch": 2.9873934000741564, "grad_norm": 8.2890625, "learning_rate": 7.012606599925844e-06, "loss": 2.9336, "mean_token_accuracy": 0.49085754783841246, "step": 16114 }, { "epoch": 2.9875787912495366, "grad_norm": 12.0546875, "learning_rate": 7.012421208750464e-06, "loss": 3.447, "mean_token_accuracy": 0.48001122177023425, "step": 16115 }, { "epoch": 2.9877641824249164, "grad_norm": 7.43359375, "learning_rate": 7.012235817575084e-06, "loss": 3.0891, "mean_token_accuracy": 0.46674182638105977, "step": 16116 }, { "epoch": 2.9879495736002966, "grad_norm": 9.8984375, "learning_rate": 7.0120504263997035e-06, "loss": 4.0403, "mean_token_accuracy": 0.4278874056877539, "step": 16117 }, { "epoch": 2.9881349647756767, "grad_norm": 7.203125, "learning_rate": 7.011865035224323e-06, "loss": 3.0249, "mean_token_accuracy": 0.46439222235476557, "step": 16118 }, { "epoch": 2.988320355951057, "grad_norm": 8.203125, "learning_rate": 7.0116796440489445e-06, "loss": 2.3397, "mean_token_accuracy": 0.546819438956934, "step": 16119 }, { "epoch": 2.9885057471264367, "grad_norm": 7.88671875, "learning_rate": 7.011494252873564e-06, "loss": 2.8827, "mean_token_accuracy": 0.492470174066106, "step": 16120 }, { "epoch": 2.988691138301817, "grad_norm": 7.0390625, "learning_rate": 7.011308861698184e-06, "loss": 3.044, "mean_token_accuracy": 0.47456170505328293, "step": 16121 }, { "epoch": 2.9888765294771966, "grad_norm": 7.046875, "learning_rate": 7.0111234705228035e-06, "loss": 2.3503, "mean_token_accuracy": 0.5177489177489177, "step": 16122 }, { "epoch": 2.989061920652577, "grad_norm": 7.078125, "learning_rate": 7.010938079347424e-06, "loss": 2.7325, "mean_token_accuracy": 0.49824663939216834, "step": 16123 }, { "epoch": 2.989247311827957, "grad_norm": 9.4453125, "learning_rate": 7.010752688172044e-06, "loss": 3.3835, "mean_token_accuracy": 0.4614837976122797, "step": 16124 }, { "epoch": 2.989432703003337, "grad_norm": 9.515625, "learning_rate": 7.010567296996663e-06, "loss": 3.5263, "mean_token_accuracy": 0.43215997355808955, "step": 16125 }, { "epoch": 2.989618094178717, "grad_norm": 7.3671875, "learning_rate": 7.010381905821283e-06, "loss": 2.7007, "mean_token_accuracy": 0.47962962962962963, "step": 16126 }, { "epoch": 2.989803485354097, "grad_norm": 7.6015625, "learning_rate": 7.010196514645904e-06, "loss": 2.6938, "mean_token_accuracy": 0.4968358517370528, "step": 16127 }, { "epoch": 2.9899888765294773, "grad_norm": 7.21875, "learning_rate": 7.010011123470524e-06, "loss": 3.0763, "mean_token_accuracy": 0.4418569020989735, "step": 16128 }, { "epoch": 2.990174267704857, "grad_norm": 9.4296875, "learning_rate": 7.0098257322951435e-06, "loss": 3.4117, "mean_token_accuracy": 0.45829665492957744, "step": 16129 }, { "epoch": 2.9903596588802372, "grad_norm": 6.51953125, "learning_rate": 7.009640341119763e-06, "loss": 2.99, "mean_token_accuracy": 0.4742086752637749, "step": 16130 }, { "epoch": 2.9905450500556174, "grad_norm": 6.234375, "learning_rate": 7.009454949944383e-06, "loss": 2.4049, "mean_token_accuracy": 0.5227242612958791, "step": 16131 }, { "epoch": 2.9907304412309976, "grad_norm": 8.359375, "learning_rate": 7.009269558769003e-06, "loss": 3.0595, "mean_token_accuracy": 0.4732666015625, "step": 16132 }, { "epoch": 2.9909158324063774, "grad_norm": 8.03125, "learning_rate": 7.009084167593623e-06, "loss": 2.8865, "mean_token_accuracy": 0.47548377709879236, "step": 16133 }, { "epoch": 2.9911012235817576, "grad_norm": 7.1640625, "learning_rate": 7.0088987764182426e-06, "loss": 2.655, "mean_token_accuracy": 0.4969901629716635, "step": 16134 }, { "epoch": 2.9912866147571373, "grad_norm": 8.3359375, "learning_rate": 7.008713385242864e-06, "loss": 2.2886, "mean_token_accuracy": 0.554019014693172, "step": 16135 }, { "epoch": 2.9914720059325175, "grad_norm": 7.46875, "learning_rate": 7.0085279940674836e-06, "loss": 3.3715, "mean_token_accuracy": 0.4410275879524171, "step": 16136 }, { "epoch": 2.9916573971078977, "grad_norm": 7.3203125, "learning_rate": 7.008342602892103e-06, "loss": 3.336, "mean_token_accuracy": 0.450425608440235, "step": 16137 }, { "epoch": 2.991842788283278, "grad_norm": 8.1640625, "learning_rate": 7.008157211716723e-06, "loss": 2.7438, "mean_token_accuracy": 0.5210192730565256, "step": 16138 }, { "epoch": 2.9920281794586576, "grad_norm": 7.45703125, "learning_rate": 7.0079718205413425e-06, "loss": 3.3697, "mean_token_accuracy": 0.45369592608147835, "step": 16139 }, { "epoch": 2.992213570634038, "grad_norm": 16.71875, "learning_rate": 7.007786429365962e-06, "loss": 2.3761, "mean_token_accuracy": 0.5435341136168998, "step": 16140 }, { "epoch": 2.992398961809418, "grad_norm": 7.9296875, "learning_rate": 7.007601038190583e-06, "loss": 2.9515, "mean_token_accuracy": 0.46755921730175076, "step": 16141 }, { "epoch": 2.9925843529847977, "grad_norm": 7.87109375, "learning_rate": 7.007415647015202e-06, "loss": 2.7486, "mean_token_accuracy": 0.48514357053682894, "step": 16142 }, { "epoch": 2.992769744160178, "grad_norm": 8.28125, "learning_rate": 7.007230255839823e-06, "loss": 2.6572, "mean_token_accuracy": 0.5198687568355814, "step": 16143 }, { "epoch": 2.992955135335558, "grad_norm": 7.9296875, "learning_rate": 7.007044864664443e-06, "loss": 3.6763, "mean_token_accuracy": 0.42642803558032366, "step": 16144 }, { "epoch": 2.9931405265109383, "grad_norm": 9.59375, "learning_rate": 7.006859473489063e-06, "loss": 3.1856, "mean_token_accuracy": 0.4782673414706758, "step": 16145 }, { "epoch": 2.993325917686318, "grad_norm": 8.2734375, "learning_rate": 7.0066740823136825e-06, "loss": 3.5655, "mean_token_accuracy": 0.44153577661431065, "step": 16146 }, { "epoch": 2.9935113088616983, "grad_norm": 8.7578125, "learning_rate": 7.006488691138302e-06, "loss": 3.0041, "mean_token_accuracy": 0.4998449612403101, "step": 16147 }, { "epoch": 2.993696700037078, "grad_norm": 8.46875, "learning_rate": 7.006303299962922e-06, "loss": 3.0778, "mean_token_accuracy": 0.4717532899109398, "step": 16148 }, { "epoch": 2.993882091212458, "grad_norm": 9.5078125, "learning_rate": 7.006117908787542e-06, "loss": 2.6611, "mean_token_accuracy": 0.5286360698125404, "step": 16149 }, { "epoch": 2.9940674823878384, "grad_norm": 9.109375, "learning_rate": 7.005932517612162e-06, "loss": 3.1986, "mean_token_accuracy": 0.47848875300283905, "step": 16150 }, { "epoch": 2.9942528735632186, "grad_norm": 9.140625, "learning_rate": 7.0057471264367825e-06, "loss": 2.7963, "mean_token_accuracy": 0.4821073558648111, "step": 16151 }, { "epoch": 2.9944382647385983, "grad_norm": 10.375, "learning_rate": 7.005561735261402e-06, "loss": 3.4231, "mean_token_accuracy": 0.4320261437908497, "step": 16152 }, { "epoch": 2.9946236559139785, "grad_norm": 9.5703125, "learning_rate": 7.005376344086023e-06, "loss": 2.2001, "mean_token_accuracy": 0.5798851248388232, "step": 16153 }, { "epoch": 2.9948090470893587, "grad_norm": 8.0625, "learning_rate": 7.005190952910642e-06, "loss": 2.6064, "mean_token_accuracy": 0.5100308641975309, "step": 16154 }, { "epoch": 2.9949944382647384, "grad_norm": 8.328125, "learning_rate": 7.005005561735262e-06, "loss": 3.4784, "mean_token_accuracy": 0.43902439024390244, "step": 16155 }, { "epoch": 2.9951798294401186, "grad_norm": 10.21875, "learning_rate": 7.0048201705598815e-06, "loss": 3.7055, "mean_token_accuracy": 0.4459493041749503, "step": 16156 }, { "epoch": 2.995365220615499, "grad_norm": 9.625, "learning_rate": 7.004634779384501e-06, "loss": 2.9989, "mean_token_accuracy": 0.4818711967545639, "step": 16157 }, { "epoch": 2.995550611790879, "grad_norm": 8.7265625, "learning_rate": 7.004449388209122e-06, "loss": 3.4792, "mean_token_accuracy": 0.4525516055045872, "step": 16158 }, { "epoch": 2.9957360029662587, "grad_norm": 11.421875, "learning_rate": 7.004263997033742e-06, "loss": 3.0171, "mean_token_accuracy": 0.46459918080748974, "step": 16159 }, { "epoch": 2.995921394141639, "grad_norm": 10.6484375, "learning_rate": 7.004078605858362e-06, "loss": 2.5996, "mean_token_accuracy": 0.49562644713146387, "step": 16160 }, { "epoch": 2.9961067853170187, "grad_norm": 7.34375, "learning_rate": 7.003893214682982e-06, "loss": 2.9389, "mean_token_accuracy": 0.46618106139438087, "step": 16161 }, { "epoch": 2.996292176492399, "grad_norm": 9.578125, "learning_rate": 7.003707823507602e-06, "loss": 2.7988, "mean_token_accuracy": 0.504957127545552, "step": 16162 }, { "epoch": 2.996477567667779, "grad_norm": 7.62890625, "learning_rate": 7.0035224323322216e-06, "loss": 2.5946, "mean_token_accuracy": 0.5185609157808667, "step": 16163 }, { "epoch": 2.9966629588431593, "grad_norm": 8.046875, "learning_rate": 7.003337041156841e-06, "loss": 3.0388, "mean_token_accuracy": 0.4623420170234718, "step": 16164 }, { "epoch": 2.996848350018539, "grad_norm": 9.5859375, "learning_rate": 7.003151649981461e-06, "loss": 3.5104, "mean_token_accuracy": 0.4433003658040916, "step": 16165 }, { "epoch": 2.997033741193919, "grad_norm": 9.59375, "learning_rate": 7.002966258806081e-06, "loss": 2.7606, "mean_token_accuracy": 0.48795928500496527, "step": 16166 }, { "epoch": 2.9972191323692994, "grad_norm": 7.9140625, "learning_rate": 7.002780867630702e-06, "loss": 3.117, "mean_token_accuracy": 0.4677132701421801, "step": 16167 }, { "epoch": 2.997404523544679, "grad_norm": 8.296875, "learning_rate": 7.0025954764553215e-06, "loss": 3.6018, "mean_token_accuracy": 0.45206800916435547, "step": 16168 }, { "epoch": 2.9975899147200593, "grad_norm": 11.625, "learning_rate": 7.002410085279941e-06, "loss": 2.9749, "mean_token_accuracy": 0.466768193190471, "step": 16169 }, { "epoch": 2.9977753058954395, "grad_norm": 9.2421875, "learning_rate": 7.002224694104562e-06, "loss": 3.797, "mean_token_accuracy": 0.45506514224940176, "step": 16170 }, { "epoch": 2.9979606970708197, "grad_norm": 9.1953125, "learning_rate": 7.002039302929181e-06, "loss": 2.7444, "mean_token_accuracy": 0.4881911262798635, "step": 16171 }, { "epoch": 2.9981460882461994, "grad_norm": 9.0625, "learning_rate": 7.001853911753801e-06, "loss": 2.7995, "mean_token_accuracy": 0.5065345989819783, "step": 16172 }, { "epoch": 2.9983314794215796, "grad_norm": 9.0, "learning_rate": 7.0016685205784206e-06, "loss": 2.9254, "mean_token_accuracy": 0.4580208081296879, "step": 16173 }, { "epoch": 2.9985168705969594, "grad_norm": 8.296875, "learning_rate": 7.00148312940304e-06, "loss": 3.1217, "mean_token_accuracy": 0.4549600912200684, "step": 16174 }, { "epoch": 2.9987022617723396, "grad_norm": 8.0859375, "learning_rate": 7.0012977382276615e-06, "loss": 2.8073, "mean_token_accuracy": 0.48674785100286533, "step": 16175 }, { "epoch": 2.9988876529477198, "grad_norm": 8.5234375, "learning_rate": 7.001112347052281e-06, "loss": 3.1527, "mean_token_accuracy": 0.4547996272134203, "step": 16176 }, { "epoch": 2.9990730441231, "grad_norm": 9.1640625, "learning_rate": 7.000926955876901e-06, "loss": 3.1718, "mean_token_accuracy": 0.46279128038085693, "step": 16177 }, { "epoch": 2.9992584352984797, "grad_norm": 8.1328125, "learning_rate": 7.0007415647015205e-06, "loss": 2.3162, "mean_token_accuracy": 0.555876100819921, "step": 16178 }, { "epoch": 2.99944382647386, "grad_norm": 9.8203125, "learning_rate": 7.000556173526141e-06, "loss": 3.4382, "mean_token_accuracy": 0.4462552873849216, "step": 16179 }, { "epoch": 2.9996292176492396, "grad_norm": 9.6640625, "learning_rate": 7.000370782350761e-06, "loss": 3.5468, "mean_token_accuracy": 0.439365975464119, "step": 16180 }, { "epoch": 2.99981460882462, "grad_norm": 9.1953125, "learning_rate": 7.00018539117538e-06, "loss": 3.1937, "mean_token_accuracy": 0.457538789529788, "step": 16181 }, { "epoch": 3.0, "grad_norm": 7.328125, "learning_rate": 7e-06, "loss": 2.8511, "mean_token_accuracy": 0.47609608208955223, "step": 16182 } ], "logging_steps": 1, "max_steps": 53940, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.6379480929583923e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }