diff --git "a/checkpoint-1753/trainer_state.json" "b/checkpoint-1753/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1753/trainer_state.json" @@ -0,0 +1,12434 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.3000171144959781, + "eval_steps": 877, + "global_step": 1753, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00017114495978093444, + "grad_norm": NaN, + "learning_rate": 0.0, + "loss": 9.5502, + "step": 1 + }, + { + "epoch": 0.0003422899195618689, + "grad_norm": NaN, + "learning_rate": 0.0, + "loss": 17.5546, + "step": 2 + }, + { + "epoch": 0.0005134348793428033, + "grad_norm": Infinity, + "learning_rate": 0.0, + "loss": 25.9988, + "step": 3 + }, + { + "epoch": 0.0006845798391237378, + "grad_norm": 60.18782043457031, + "learning_rate": 5.704506560182544e-09, + "loss": 9.6042, + "step": 4 + }, + { + "epoch": 0.0008557247989046722, + "grad_norm": Infinity, + "learning_rate": 5.704506560182544e-09, + "loss": 17.9169, + "step": 5 + }, + { + "epoch": 0.0010268697586856067, + "grad_norm": 84.36212158203125, + "learning_rate": 1.1409013120365088e-08, + "loss": 10.1633, + "step": 6 + }, + { + "epoch": 0.001198014718466541, + "grad_norm": 82.20382690429688, + "learning_rate": 1.711351968054763e-08, + "loss": 8.4392, + "step": 7 + }, + { + "epoch": 0.0013691596782474755, + "grad_norm": 16.687606811523438, + "learning_rate": 2.2818026240730176e-08, + "loss": 6.4113, + "step": 8 + }, + { + "epoch": 0.00154030463802841, + "grad_norm": NaN, + "learning_rate": 2.2818026240730176e-08, + "loss": 17.185, + "step": 9 + }, + { + "epoch": 0.0017114495978093444, + "grad_norm": 48.527183532714844, + "learning_rate": 2.852253280091272e-08, + "loss": 7.868, + "step": 10 + }, + { + "epoch": 0.0018825945575902789, + "grad_norm": 76.19969177246094, + "learning_rate": 3.422703936109526e-08, + "loss": 25.5097, + "step": 11 + }, + { + "epoch": 0.0020537395173712133, + "grad_norm": 44.624080657958984, + "learning_rate": 3.9931545921277814e-08, + "loss": 8.533, + "step": 12 + }, + { + "epoch": 0.002224884477152148, + "grad_norm": 68.30242156982422, + "learning_rate": 4.563605248146035e-08, + "loss": 12.1618, + "step": 13 + }, + { + "epoch": 0.002396029436933082, + "grad_norm": 77.02323913574219, + "learning_rate": 5.1340559041642904e-08, + "loss": 10.1531, + "step": 14 + }, + { + "epoch": 0.002567174396714017, + "grad_norm": 37.27943420410156, + "learning_rate": 5.704506560182544e-08, + "loss": 8.2236, + "step": 15 + }, + { + "epoch": 0.002738319356494951, + "grad_norm": 47.14276123046875, + "learning_rate": 6.274957216200798e-08, + "loss": 6.6764, + "step": 16 + }, + { + "epoch": 0.0029094643162758858, + "grad_norm": 68.288818359375, + "learning_rate": 6.845407872219053e-08, + "loss": 9.6404, + "step": 17 + }, + { + "epoch": 0.00308060927605682, + "grad_norm": 72.5563735961914, + "learning_rate": 7.415858528237308e-08, + "loss": 7.2553, + "step": 18 + }, + { + "epoch": 0.0032517542358377546, + "grad_norm": 133.2047576904297, + "learning_rate": 7.986309184255563e-08, + "loss": 17.6761, + "step": 19 + }, + { + "epoch": 0.003422899195618689, + "grad_norm": 96.22279357910156, + "learning_rate": 8.556759840273816e-08, + "loss": 23.6993, + "step": 20 + }, + { + "epoch": 0.0035940441553996235, + "grad_norm": 72.40164947509766, + "learning_rate": 9.12721049629207e-08, + "loss": 12.5069, + "step": 21 + }, + { + "epoch": 0.0037651891151805577, + "grad_norm": 58.689125061035156, + "learning_rate": 9.697661152310325e-08, + "loss": 10.4915, + "step": 22 + }, + { + "epoch": 0.003936334074961492, + "grad_norm": 41.391170501708984, + "learning_rate": 1.0268111808328581e-07, + "loss": 8.2323, + "step": 23 + }, + { + "epoch": 0.004107479034742427, + "grad_norm": 60.9368896484375, + "learning_rate": 1.0838562464346835e-07, + "loss": 9.4007, + "step": 24 + }, + { + "epoch": 0.004278623994523361, + "grad_norm": 132.59597778320312, + "learning_rate": 1.1409013120365088e-07, + "loss": 16.9119, + "step": 25 + }, + { + "epoch": 0.004449768954304296, + "grad_norm": 72.2205581665039, + "learning_rate": 1.1979463776383346e-07, + "loss": 12.5137, + "step": 26 + }, + { + "epoch": 0.00462091391408523, + "grad_norm": 69.2486572265625, + "learning_rate": 1.2549914432401596e-07, + "loss": 10.2477, + "step": 27 + }, + { + "epoch": 0.004792058873866164, + "grad_norm": 30.01091194152832, + "learning_rate": 1.3120365088419852e-07, + "loss": 6.6456, + "step": 28 + }, + { + "epoch": 0.0049632038336470995, + "grad_norm": 75.28530883789062, + "learning_rate": 1.3690815744438105e-07, + "loss": 9.7946, + "step": 29 + }, + { + "epoch": 0.005134348793428034, + "grad_norm": 70.37921142578125, + "learning_rate": 1.426126640045636e-07, + "loss": 12.4969, + "step": 30 + }, + { + "epoch": 0.005305493753208968, + "grad_norm": 90.83671569824219, + "learning_rate": 1.4831717056474617e-07, + "loss": 7.6589, + "step": 31 + }, + { + "epoch": 0.005476638712989902, + "grad_norm": 65.92588806152344, + "learning_rate": 1.540216771249287e-07, + "loss": 9.7764, + "step": 32 + }, + { + "epoch": 0.005647783672770837, + "grad_norm": 86.14967346191406, + "learning_rate": 1.5972618368511126e-07, + "loss": 8.2129, + "step": 33 + }, + { + "epoch": 0.0058189286325517715, + "grad_norm": 145.2432098388672, + "learning_rate": 1.654306902452938e-07, + "loss": 17.7377, + "step": 34 + }, + { + "epoch": 0.005990073592332706, + "grad_norm": 64.69364929199219, + "learning_rate": 1.7113519680547632e-07, + "loss": 9.9994, + "step": 35 + }, + { + "epoch": 0.00616121855211364, + "grad_norm": 58.662803649902344, + "learning_rate": 1.7683970336565888e-07, + "loss": 11.459, + "step": 36 + }, + { + "epoch": 0.006332363511894575, + "grad_norm": 123.47699737548828, + "learning_rate": 1.825442099258414e-07, + "loss": 16.278, + "step": 37 + }, + { + "epoch": 0.006503508471675509, + "grad_norm": 40.24553680419922, + "learning_rate": 1.8824871648602397e-07, + "loss": 8.1959, + "step": 38 + }, + { + "epoch": 0.0066746534314564435, + "grad_norm": 71.55098724365234, + "learning_rate": 1.939532230462065e-07, + "loss": 12.4429, + "step": 39 + }, + { + "epoch": 0.006845798391237378, + "grad_norm": 73.94329833984375, + "learning_rate": 1.9965772960638906e-07, + "loss": 12.4672, + "step": 40 + }, + { + "epoch": 0.007016943351018313, + "grad_norm": 78.10585021972656, + "learning_rate": 2.0536223616657162e-07, + "loss": 12.6372, + "step": 41 + }, + { + "epoch": 0.007188088310799247, + "grad_norm": 154.09982299804688, + "learning_rate": 2.1106674272675415e-07, + "loss": 18.2102, + "step": 42 + }, + { + "epoch": 0.007359233270580181, + "grad_norm": 69.93523406982422, + "learning_rate": 2.167712492869367e-07, + "loss": 10.2355, + "step": 43 + }, + { + "epoch": 0.0075303782303611155, + "grad_norm": 53.971500396728516, + "learning_rate": 2.224757558471192e-07, + "loss": 11.0821, + "step": 44 + }, + { + "epoch": 0.007701523190142051, + "grad_norm": 79.0840835571289, + "learning_rate": 2.2818026240730177e-07, + "loss": 12.838, + "step": 45 + }, + { + "epoch": 0.007872668149922985, + "grad_norm": 68.8064956665039, + "learning_rate": 2.3388476896748433e-07, + "loss": 11.9172, + "step": 46 + }, + { + "epoch": 0.00804381310970392, + "grad_norm": 114.59835815429688, + "learning_rate": 2.395892755276669e-07, + "loss": 16.4603, + "step": 47 + }, + { + "epoch": 0.008214958069484853, + "grad_norm": 76.0896987915039, + "learning_rate": 2.452937820878494e-07, + "loss": 12.7007, + "step": 48 + }, + { + "epoch": 0.008386103029265788, + "grad_norm": 45.47982406616211, + "learning_rate": 2.509982886480319e-07, + "loss": 10.2213, + "step": 49 + }, + { + "epoch": 0.008557247989046722, + "grad_norm": 136.47836303710938, + "learning_rate": 2.567027952082145e-07, + "loss": 17.2777, + "step": 50 + }, + { + "epoch": 0.008728392948827657, + "grad_norm": 57.07033920288086, + "learning_rate": 2.6240730176839704e-07, + "loss": 10.8704, + "step": 51 + }, + { + "epoch": 0.008899537908608592, + "grad_norm": 50.97236633300781, + "learning_rate": 2.681118083285796e-07, + "loss": 10.8266, + "step": 52 + }, + { + "epoch": 0.009070682868389525, + "grad_norm": 166.14840698242188, + "learning_rate": 2.738163148887621e-07, + "loss": 17.2345, + "step": 53 + }, + { + "epoch": 0.00924182782817046, + "grad_norm": 154.5965576171875, + "learning_rate": 2.795208214489447e-07, + "loss": 18.5922, + "step": 54 + }, + { + "epoch": 0.009412972787951395, + "grad_norm": 61.19700622558594, + "learning_rate": 2.852253280091272e-07, + "loss": 9.3514, + "step": 55 + }, + { + "epoch": 0.009584117747732329, + "grad_norm": 64.54351806640625, + "learning_rate": 2.909298345693098e-07, + "loss": 7.0867, + "step": 56 + }, + { + "epoch": 0.009755262707513264, + "grad_norm": 41.97494888305664, + "learning_rate": 2.9663434112949233e-07, + "loss": 8.319, + "step": 57 + }, + { + "epoch": 0.009926407667294199, + "grad_norm": 158.86936950683594, + "learning_rate": 3.023388476896748e-07, + "loss": 18.7347, + "step": 58 + }, + { + "epoch": 0.010097552627075132, + "grad_norm": 93.42990112304688, + "learning_rate": 3.080433542498574e-07, + "loss": 24.4121, + "step": 59 + }, + { + "epoch": 0.010268697586856067, + "grad_norm": 40.078529357910156, + "learning_rate": 3.1374786081003993e-07, + "loss": 8.4254, + "step": 60 + }, + { + "epoch": 0.010439842546637, + "grad_norm": 172.25357055664062, + "learning_rate": 3.194523673702225e-07, + "loss": 17.8112, + "step": 61 + }, + { + "epoch": 0.010610987506417936, + "grad_norm": 64.05378723144531, + "learning_rate": 3.2515687393040504e-07, + "loss": 11.648, + "step": 62 + }, + { + "epoch": 0.010782132466198871, + "grad_norm": 31.915542602539062, + "learning_rate": 3.308613804905876e-07, + "loss": 6.576, + "step": 63 + }, + { + "epoch": 0.010953277425979804, + "grad_norm": 97.90230560302734, + "learning_rate": 3.365658870507701e-07, + "loss": 24.4094, + "step": 64 + }, + { + "epoch": 0.01112442238576074, + "grad_norm": 49.965126037597656, + "learning_rate": 3.4227039361095264e-07, + "loss": 10.7296, + "step": 65 + }, + { + "epoch": 0.011295567345541675, + "grad_norm": 103.74539947509766, + "learning_rate": 3.479749001711352e-07, + "loss": 24.3446, + "step": 66 + }, + { + "epoch": 0.011466712305322608, + "grad_norm": 100.20292663574219, + "learning_rate": 3.5367940673131776e-07, + "loss": 24.4799, + "step": 67 + }, + { + "epoch": 0.011637857265103543, + "grad_norm": 87.23709869384766, + "learning_rate": 3.593839132915003e-07, + "loss": 7.8589, + "step": 68 + }, + { + "epoch": 0.011809002224884476, + "grad_norm": 105.97203063964844, + "learning_rate": 3.650884198516828e-07, + "loss": 24.5711, + "step": 69 + }, + { + "epoch": 0.011980147184665411, + "grad_norm": 139.98709106445312, + "learning_rate": 3.707929264118654e-07, + "loss": 18.0228, + "step": 70 + }, + { + "epoch": 0.012151292144446347, + "grad_norm": 32.724159240722656, + "learning_rate": 3.7649743297204793e-07, + "loss": 8.2503, + "step": 71 + }, + { + "epoch": 0.01232243710422728, + "grad_norm": 55.843509674072266, + "learning_rate": 3.822019395322305e-07, + "loss": 7.1578, + "step": 72 + }, + { + "epoch": 0.012493582064008215, + "grad_norm": 183.3024444580078, + "learning_rate": 3.87906446092413e-07, + "loss": 18.0044, + "step": 73 + }, + { + "epoch": 0.01266472702378915, + "grad_norm": 118.88136291503906, + "learning_rate": 3.9361095265259553e-07, + "loss": 15.586, + "step": 74 + }, + { + "epoch": 0.012835871983570083, + "grad_norm": 64.66754150390625, + "learning_rate": 3.993154592127781e-07, + "loss": 12.3289, + "step": 75 + }, + { + "epoch": 0.013007016943351019, + "grad_norm": 62.58869552612305, + "learning_rate": 4.0501996577296065e-07, + "loss": 8.9949, + "step": 76 + }, + { + "epoch": 0.013178161903131952, + "grad_norm": 66.82809448242188, + "learning_rate": 4.1072447233314323e-07, + "loss": 12.1411, + "step": 77 + }, + { + "epoch": 0.013349306862912887, + "grad_norm": 144.4269256591797, + "learning_rate": 4.164289788933257e-07, + "loss": 18.5364, + "step": 78 + }, + { + "epoch": 0.013520451822693822, + "grad_norm": 60.626834869384766, + "learning_rate": 4.221334854535083e-07, + "loss": 11.9262, + "step": 79 + }, + { + "epoch": 0.013691596782474755, + "grad_norm": 139.23158264160156, + "learning_rate": 4.278379920136908e-07, + "loss": 17.5537, + "step": 80 + }, + { + "epoch": 0.01386274174225569, + "grad_norm": 74.82394409179688, + "learning_rate": 4.335424985738734e-07, + "loss": 12.6047, + "step": 81 + }, + { + "epoch": 0.014033886702036626, + "grad_norm": 128.57716369628906, + "learning_rate": 4.3924700513405594e-07, + "loss": 17.2671, + "step": 82 + }, + { + "epoch": 0.014205031661817559, + "grad_norm": 50.15757369995117, + "learning_rate": 4.449515116942384e-07, + "loss": 9.8308, + "step": 83 + }, + { + "epoch": 0.014376176621598494, + "grad_norm": 46.99615478515625, + "learning_rate": 4.50656018254421e-07, + "loss": 8.6575, + "step": 84 + }, + { + "epoch": 0.01454732158137943, + "grad_norm": 61.31080627441406, + "learning_rate": 4.5636052481460354e-07, + "loss": 6.8446, + "step": 85 + }, + { + "epoch": 0.014718466541160363, + "grad_norm": 60.068443298339844, + "learning_rate": 4.620650313747861e-07, + "loss": 11.2968, + "step": 86 + }, + { + "epoch": 0.014889611500941298, + "grad_norm": 67.9156265258789, + "learning_rate": 4.6776953793496865e-07, + "loss": 12.672, + "step": 87 + }, + { + "epoch": 0.015060756460722231, + "grad_norm": 49.77825164794922, + "learning_rate": 4.734740444951512e-07, + "loss": 9.5329, + "step": 88 + }, + { + "epoch": 0.015231901420503166, + "grad_norm": 57.99795913696289, + "learning_rate": 4.791785510553338e-07, + "loss": 11.3883, + "step": 89 + }, + { + "epoch": 0.015403046380284101, + "grad_norm": 54.070491790771484, + "learning_rate": 4.848830576155162e-07, + "loss": 11.282, + "step": 90 + }, + { + "epoch": 0.015574191340065035, + "grad_norm": 46.44948959350586, + "learning_rate": 4.905875641756988e-07, + "loss": 10.4933, + "step": 91 + }, + { + "epoch": 0.01574533629984597, + "grad_norm": 91.90750885009766, + "learning_rate": 4.962920707358814e-07, + "loss": 14.1568, + "step": 92 + }, + { + "epoch": 0.015916481259626903, + "grad_norm": 45.950286865234375, + "learning_rate": 5.019965772960638e-07, + "loss": 8.6589, + "step": 93 + }, + { + "epoch": 0.01608762621940784, + "grad_norm": 172.71951293945312, + "learning_rate": 5.077010838562465e-07, + "loss": 16.7382, + "step": 94 + }, + { + "epoch": 0.016258771179188773, + "grad_norm": 54.993247985839844, + "learning_rate": 5.13405590416429e-07, + "loss": 11.4407, + "step": 95 + }, + { + "epoch": 0.016429916138969707, + "grad_norm": 110.25296020507812, + "learning_rate": 5.191100969766115e-07, + "loss": 23.8689, + "step": 96 + }, + { + "epoch": 0.016601061098750643, + "grad_norm": 43.9764289855957, + "learning_rate": 5.248146035367941e-07, + "loss": 7.3984, + "step": 97 + }, + { + "epoch": 0.016772206058531577, + "grad_norm": 72.5784912109375, + "learning_rate": 5.305191100969766e-07, + "loss": 7.2256, + "step": 98 + }, + { + "epoch": 0.01694335101831251, + "grad_norm": 35.58343505859375, + "learning_rate": 5.362236166571592e-07, + "loss": 7.8513, + "step": 99 + }, + { + "epoch": 0.017114495978093443, + "grad_norm": 69.7222900390625, + "learning_rate": 5.419281232173417e-07, + "loss": 11.8726, + "step": 100 + }, + { + "epoch": 0.01728564093787438, + "grad_norm": 54.240943908691406, + "learning_rate": 5.476326297775242e-07, + "loss": 11.6433, + "step": 101 + }, + { + "epoch": 0.017456785897655314, + "grad_norm": 40.061763763427734, + "learning_rate": 5.533371363377068e-07, + "loss": 6.452, + "step": 102 + }, + { + "epoch": 0.017627930857436247, + "grad_norm": 43.3102912902832, + "learning_rate": 5.590416428978894e-07, + "loss": 10.9229, + "step": 103 + }, + { + "epoch": 0.017799075817217184, + "grad_norm": 48.96671676635742, + "learning_rate": 5.647461494580719e-07, + "loss": 10.9523, + "step": 104 + }, + { + "epoch": 0.017970220776998117, + "grad_norm": 107.66687774658203, + "learning_rate": 5.704506560182544e-07, + "loss": 15.756, + "step": 105 + }, + { + "epoch": 0.01814136573677905, + "grad_norm": 50.87533950805664, + "learning_rate": 5.76155162578437e-07, + "loss": 9.8941, + "step": 106 + }, + { + "epoch": 0.018312510696559987, + "grad_norm": 142.70115661621094, + "learning_rate": 5.818596691386196e-07, + "loss": 16.205, + "step": 107 + }, + { + "epoch": 0.01848365565634092, + "grad_norm": 62.69704818725586, + "learning_rate": 5.87564175698802e-07, + "loss": 9.7933, + "step": 108 + }, + { + "epoch": 0.018654800616121854, + "grad_norm": 52.710227966308594, + "learning_rate": 5.932686822589847e-07, + "loss": 10.7189, + "step": 109 + }, + { + "epoch": 0.01882594557590279, + "grad_norm": 131.87474060058594, + "learning_rate": 5.989731888191672e-07, + "loss": 24.6335, + "step": 110 + }, + { + "epoch": 0.018997090535683724, + "grad_norm": 105.79902648925781, + "learning_rate": 6.046776953793496e-07, + "loss": 16.133, + "step": 111 + }, + { + "epoch": 0.019168235495464658, + "grad_norm": 56.011474609375, + "learning_rate": 6.103822019395323e-07, + "loss": 11.9402, + "step": 112 + }, + { + "epoch": 0.019339380455245594, + "grad_norm": 97.4761962890625, + "learning_rate": 6.160867084997148e-07, + "loss": 14.1356, + "step": 113 + }, + { + "epoch": 0.019510525415026528, + "grad_norm": 52.7200813293457, + "learning_rate": 6.217912150598974e-07, + "loss": 11.4511, + "step": 114 + }, + { + "epoch": 0.01968167037480746, + "grad_norm": 42.6909065246582, + "learning_rate": 6.274957216200799e-07, + "loss": 8.5194, + "step": 115 + }, + { + "epoch": 0.019852815334588398, + "grad_norm": 44.77908706665039, + "learning_rate": 6.332002281802624e-07, + "loss": 11.437, + "step": 116 + }, + { + "epoch": 0.02002396029436933, + "grad_norm": 136.7108612060547, + "learning_rate": 6.38904734740445e-07, + "loss": 23.712, + "step": 117 + }, + { + "epoch": 0.020195105254150265, + "grad_norm": 44.484893798828125, + "learning_rate": 6.446092413006275e-07, + "loss": 10.4753, + "step": 118 + }, + { + "epoch": 0.020366250213931198, + "grad_norm": 57.66374206542969, + "learning_rate": 6.503137478608101e-07, + "loss": 11.2803, + "step": 119 + }, + { + "epoch": 0.020537395173712135, + "grad_norm": 110.59872436523438, + "learning_rate": 6.560182544209926e-07, + "loss": 15.7956, + "step": 120 + }, + { + "epoch": 0.02070854013349307, + "grad_norm": 33.806732177734375, + "learning_rate": 6.617227609811752e-07, + "loss": 10.327, + "step": 121 + }, + { + "epoch": 0.020879685093274, + "grad_norm": 52.41442108154297, + "learning_rate": 6.674272675413577e-07, + "loss": 10.5716, + "step": 122 + }, + { + "epoch": 0.02105083005305494, + "grad_norm": 40.95213317871094, + "learning_rate": 6.731317741015402e-07, + "loss": 11.6353, + "step": 123 + }, + { + "epoch": 0.021221975012835872, + "grad_norm": 43.268775939941406, + "learning_rate": 6.788362806617229e-07, + "loss": 11.2876, + "step": 124 + }, + { + "epoch": 0.021393119972616805, + "grad_norm": 102.84829711914062, + "learning_rate": 6.845407872219053e-07, + "loss": 15.5444, + "step": 125 + }, + { + "epoch": 0.021564264932397742, + "grad_norm": 56.55605697631836, + "learning_rate": 6.902452937820878e-07, + "loss": 10.2011, + "step": 126 + }, + { + "epoch": 0.021735409892178675, + "grad_norm": 37.294795989990234, + "learning_rate": 6.959498003422704e-07, + "loss": 8.1014, + "step": 127 + }, + { + "epoch": 0.02190655485195961, + "grad_norm": 55.67061233520508, + "learning_rate": 7.01654306902453e-07, + "loss": 11.638, + "step": 128 + }, + { + "epoch": 0.022077699811740546, + "grad_norm": 67.4786605834961, + "learning_rate": 7.073588134626355e-07, + "loss": 6.8779, + "step": 129 + }, + { + "epoch": 0.02224884477152148, + "grad_norm": 30.9260196685791, + "learning_rate": 7.13063320022818e-07, + "loss": 9.4357, + "step": 130 + }, + { + "epoch": 0.022419989731302412, + "grad_norm": 100.22219848632812, + "learning_rate": 7.187678265830006e-07, + "loss": 14.6476, + "step": 131 + }, + { + "epoch": 0.02259113469108335, + "grad_norm": 29.24936294555664, + "learning_rate": 7.244723331431832e-07, + "loss": 6.5599, + "step": 132 + }, + { + "epoch": 0.022762279650864282, + "grad_norm": 31.59239959716797, + "learning_rate": 7.301768397033656e-07, + "loss": 6.6734, + "step": 133 + }, + { + "epoch": 0.022933424610645216, + "grad_norm": 42.93860626220703, + "learning_rate": 7.358813462635483e-07, + "loss": 9.125, + "step": 134 + }, + { + "epoch": 0.023104569570426153, + "grad_norm": 46.8751335144043, + "learning_rate": 7.415858528237308e-07, + "loss": 10.3878, + "step": 135 + }, + { + "epoch": 0.023275714530207086, + "grad_norm": 106.5069351196289, + "learning_rate": 7.472903593839132e-07, + "loss": 14.3693, + "step": 136 + }, + { + "epoch": 0.02344685948998802, + "grad_norm": 126.05512237548828, + "learning_rate": 7.529948659440959e-07, + "loss": 22.8488, + "step": 137 + }, + { + "epoch": 0.023618004449768953, + "grad_norm": 89.99185180664062, + "learning_rate": 7.586993725042784e-07, + "loss": 13.3307, + "step": 138 + }, + { + "epoch": 0.02378914940954989, + "grad_norm": 35.95622253417969, + "learning_rate": 7.64403879064461e-07, + "loss": 9.7673, + "step": 139 + }, + { + "epoch": 0.023960294369330823, + "grad_norm": 31.504724502563477, + "learning_rate": 7.701083856246435e-07, + "loss": 9.0915, + "step": 140 + }, + { + "epoch": 0.024131439329111756, + "grad_norm": 74.6131591796875, + "learning_rate": 7.75812892184826e-07, + "loss": 12.7378, + "step": 141 + }, + { + "epoch": 0.024302584288892693, + "grad_norm": 40.63880920410156, + "learning_rate": 7.815173987450086e-07, + "loss": 8.9323, + "step": 142 + }, + { + "epoch": 0.024473729248673626, + "grad_norm": 101.69804382324219, + "learning_rate": 7.872219053051911e-07, + "loss": 14.547, + "step": 143 + }, + { + "epoch": 0.02464487420845456, + "grad_norm": 131.046142578125, + "learning_rate": 7.929264118653737e-07, + "loss": 23.0012, + "step": 144 + }, + { + "epoch": 0.024816019168235497, + "grad_norm": 74.94658660888672, + "learning_rate": 7.986309184255562e-07, + "loss": 12.8286, + "step": 145 + }, + { + "epoch": 0.02498716412801643, + "grad_norm": 50.227718353271484, + "learning_rate": 8.043354249857388e-07, + "loss": 10.9684, + "step": 146 + }, + { + "epoch": 0.025158309087797363, + "grad_norm": 162.12603759765625, + "learning_rate": 8.100399315459213e-07, + "loss": 23.0181, + "step": 147 + }, + { + "epoch": 0.0253294540475783, + "grad_norm": 34.11660385131836, + "learning_rate": 8.157444381061038e-07, + "loss": 9.2808, + "step": 148 + }, + { + "epoch": 0.025500599007359234, + "grad_norm": 34.07155990600586, + "learning_rate": 8.214489446662865e-07, + "loss": 11.3059, + "step": 149 + }, + { + "epoch": 0.025671743967140167, + "grad_norm": 42.31085968017578, + "learning_rate": 8.271534512264689e-07, + "loss": 8.7746, + "step": 150 + }, + { + "epoch": 0.025842888926921104, + "grad_norm": 132.6522216796875, + "learning_rate": 8.328579577866514e-07, + "loss": 16.6408, + "step": 151 + }, + { + "epoch": 0.026014033886702037, + "grad_norm": 21.736328125, + "learning_rate": 8.385624643468341e-07, + "loss": 9.6907, + "step": 152 + }, + { + "epoch": 0.02618517884648297, + "grad_norm": 64.54568481445312, + "learning_rate": 8.442669709070166e-07, + "loss": 12.8573, + "step": 153 + }, + { + "epoch": 0.026356323806263904, + "grad_norm": 32.484703063964844, + "learning_rate": 8.499714774671991e-07, + "loss": 7.4299, + "step": 154 + }, + { + "epoch": 0.02652746876604484, + "grad_norm": 86.55378723144531, + "learning_rate": 8.556759840273817e-07, + "loss": 13.347, + "step": 155 + }, + { + "epoch": 0.026698613725825774, + "grad_norm": 32.97962188720703, + "learning_rate": 8.613804905875642e-07, + "loss": 9.2004, + "step": 156 + }, + { + "epoch": 0.026869758685606707, + "grad_norm": 66.87654113769531, + "learning_rate": 8.670849971477468e-07, + "loss": 6.6107, + "step": 157 + }, + { + "epoch": 0.027040903645387644, + "grad_norm": 56.53002166748047, + "learning_rate": 8.727895037079292e-07, + "loss": 6.0957, + "step": 158 + }, + { + "epoch": 0.027212048605168578, + "grad_norm": 37.223453521728516, + "learning_rate": 8.784940102681119e-07, + "loss": 8.9968, + "step": 159 + }, + { + "epoch": 0.02738319356494951, + "grad_norm": 30.637619018554688, + "learning_rate": 8.841985168282944e-07, + "loss": 8.9597, + "step": 160 + }, + { + "epoch": 0.027554338524730448, + "grad_norm": 22.8154354095459, + "learning_rate": 8.899030233884768e-07, + "loss": 9.2794, + "step": 161 + }, + { + "epoch": 0.02772548348451138, + "grad_norm": 64.24419403076172, + "learning_rate": 8.956075299486595e-07, + "loss": 12.9209, + "step": 162 + }, + { + "epoch": 0.027896628444292314, + "grad_norm": 27.159826278686523, + "learning_rate": 9.01312036508842e-07, + "loss": 10.7092, + "step": 163 + }, + { + "epoch": 0.02806777340407325, + "grad_norm": 29.741992950439453, + "learning_rate": 9.070165430690246e-07, + "loss": 10.1098, + "step": 164 + }, + { + "epoch": 0.028238918363854185, + "grad_norm": 61.4916877746582, + "learning_rate": 9.127210496292071e-07, + "loss": 12.5023, + "step": 165 + }, + { + "epoch": 0.028410063323635118, + "grad_norm": 21.36608123779297, + "learning_rate": 9.184255561893896e-07, + "loss": 7.2161, + "step": 166 + }, + { + "epoch": 0.028581208283416055, + "grad_norm": 51.13070297241211, + "learning_rate": 9.241300627495722e-07, + "loss": 5.5324, + "step": 167 + }, + { + "epoch": 0.028752353243196988, + "grad_norm": 27.232070922851562, + "learning_rate": 9.298345693097547e-07, + "loss": 9.3162, + "step": 168 + }, + { + "epoch": 0.02892349820297792, + "grad_norm": 51.84492111206055, + "learning_rate": 9.355390758699373e-07, + "loss": 6.0306, + "step": 169 + }, + { + "epoch": 0.02909464316275886, + "grad_norm": 24.21738052368164, + "learning_rate": 9.412435824301197e-07, + "loss": 6.6994, + "step": 170 + }, + { + "epoch": 0.02926578812253979, + "grad_norm": 27.428897857666016, + "learning_rate": 9.469480889903024e-07, + "loss": 10.5412, + "step": 171 + }, + { + "epoch": 0.029436933082320725, + "grad_norm": 123.71875762939453, + "learning_rate": 9.526525955504849e-07, + "loss": 15.9849, + "step": 172 + }, + { + "epoch": 0.02960807804210166, + "grad_norm": 34.90501403808594, + "learning_rate": 9.583571021106676e-07, + "loss": 9.2574, + "step": 173 + }, + { + "epoch": 0.029779223001882595, + "grad_norm": 26.623390197753906, + "learning_rate": 9.6406160867085e-07, + "loss": 8.7904, + "step": 174 + }, + { + "epoch": 0.02995036796166353, + "grad_norm": 21.868566513061523, + "learning_rate": 9.697661152310325e-07, + "loss": 9.2638, + "step": 175 + }, + { + "epoch": 0.030121512921444462, + "grad_norm": 28.389110565185547, + "learning_rate": 9.754706217912152e-07, + "loss": 5.9086, + "step": 176 + }, + { + "epoch": 0.0302926578812254, + "grad_norm": 51.29762649536133, + "learning_rate": 9.811751283513976e-07, + "loss": 5.9646, + "step": 177 + }, + { + "epoch": 0.030463802841006332, + "grad_norm": 28.91325569152832, + "learning_rate": 9.8687963491158e-07, + "loss": 6.0877, + "step": 178 + }, + { + "epoch": 0.030634947800787266, + "grad_norm": 66.74105834960938, + "learning_rate": 9.925841414717628e-07, + "loss": 12.4348, + "step": 179 + }, + { + "epoch": 0.030806092760568202, + "grad_norm": 19.138124465942383, + "learning_rate": 9.982886480319452e-07, + "loss": 9.5496, + "step": 180 + }, + { + "epoch": 0.030977237720349136, + "grad_norm": 43.17308044433594, + "learning_rate": 1.0039931545921277e-06, + "loss": 5.5641, + "step": 181 + }, + { + "epoch": 0.03114838268013007, + "grad_norm": 32.97599411010742, + "learning_rate": 1.0096976611523104e-06, + "loss": 9.0529, + "step": 182 + }, + { + "epoch": 0.031319527639911006, + "grad_norm": 56.315521240234375, + "learning_rate": 1.015402167712493e-06, + "loss": 12.0747, + "step": 183 + }, + { + "epoch": 0.03149067259969194, + "grad_norm": 76.77662658691406, + "learning_rate": 1.0211066742726755e-06, + "loss": 13.0892, + "step": 184 + }, + { + "epoch": 0.03166181755947287, + "grad_norm": 25.544397354125977, + "learning_rate": 1.026811180832858e-06, + "loss": 7.7117, + "step": 185 + }, + { + "epoch": 0.031832962519253806, + "grad_norm": 24.205764770507812, + "learning_rate": 1.0325156873930406e-06, + "loss": 6.6426, + "step": 186 + }, + { + "epoch": 0.03200410747903474, + "grad_norm": 25.586280822753906, + "learning_rate": 1.038220193953223e-06, + "loss": 10.4785, + "step": 187 + }, + { + "epoch": 0.03217525243881568, + "grad_norm": 68.83911895751953, + "learning_rate": 1.0439247005134056e-06, + "loss": 12.2132, + "step": 188 + }, + { + "epoch": 0.03234639739859661, + "grad_norm": 24.825489044189453, + "learning_rate": 1.0496292070735881e-06, + "loss": 6.3336, + "step": 189 + }, + { + "epoch": 0.032517542358377546, + "grad_norm": 28.293699264526367, + "learning_rate": 1.0553337136337707e-06, + "loss": 8.5374, + "step": 190 + }, + { + "epoch": 0.03268868731815848, + "grad_norm": 28.26664924621582, + "learning_rate": 1.0610382201939532e-06, + "loss": 9.7218, + "step": 191 + }, + { + "epoch": 0.03285983227793941, + "grad_norm": 84.32862854003906, + "learning_rate": 1.0667427267541357e-06, + "loss": 12.782, + "step": 192 + }, + { + "epoch": 0.033030977237720346, + "grad_norm": 26.818071365356445, + "learning_rate": 1.0724472333143185e-06, + "loss": 7.3125, + "step": 193 + }, + { + "epoch": 0.03320212219750129, + "grad_norm": 16.650196075439453, + "learning_rate": 1.0781517398745008e-06, + "loss": 9.0232, + "step": 194 + }, + { + "epoch": 0.03337326715728222, + "grad_norm": 22.659135818481445, + "learning_rate": 1.0838562464346833e-06, + "loss": 6.2787, + "step": 195 + }, + { + "epoch": 0.03354441211706315, + "grad_norm": 24.644168853759766, + "learning_rate": 1.089560752994866e-06, + "loss": 6.0047, + "step": 196 + }, + { + "epoch": 0.03371555707684409, + "grad_norm": 32.078712463378906, + "learning_rate": 1.0952652595550484e-06, + "loss": 7.5748, + "step": 197 + }, + { + "epoch": 0.03388670203662502, + "grad_norm": 55.345855712890625, + "learning_rate": 1.1009697661152311e-06, + "loss": 11.8703, + "step": 198 + }, + { + "epoch": 0.034057846996405954, + "grad_norm": 70.49486541748047, + "learning_rate": 1.1066742726754137e-06, + "loss": 11.7983, + "step": 199 + }, + { + "epoch": 0.03422899195618689, + "grad_norm": 29.946758270263672, + "learning_rate": 1.112378779235596e-06, + "loss": 7.6286, + "step": 200 + }, + { + "epoch": 0.03440013691596783, + "grad_norm": 278.3395080566406, + "learning_rate": 1.1180832857957787e-06, + "loss": 17.6192, + "step": 201 + }, + { + "epoch": 0.03457128187574876, + "grad_norm": 310.682861328125, + "learning_rate": 1.1237877923559613e-06, + "loss": 17.6315, + "step": 202 + }, + { + "epoch": 0.034742426835529694, + "grad_norm": 46.159568786621094, + "learning_rate": 1.1294922989161438e-06, + "loss": 11.6001, + "step": 203 + }, + { + "epoch": 0.03491357179531063, + "grad_norm": 20.635892868041992, + "learning_rate": 1.1351968054763263e-06, + "loss": 9.4128, + "step": 204 + }, + { + "epoch": 0.03508471675509156, + "grad_norm": 143.4097137451172, + "learning_rate": 1.1409013120365089e-06, + "loss": 16.3943, + "step": 205 + }, + { + "epoch": 0.035255861714872494, + "grad_norm": 265.5577087402344, + "learning_rate": 1.1466058185966914e-06, + "loss": 18.6869, + "step": 206 + }, + { + "epoch": 0.035427006674653434, + "grad_norm": 19.766063690185547, + "learning_rate": 1.152310325156874e-06, + "loss": 8.6515, + "step": 207 + }, + { + "epoch": 0.03559815163443437, + "grad_norm": 43.8801383972168, + "learning_rate": 1.1580148317170565e-06, + "loss": 11.424, + "step": 208 + }, + { + "epoch": 0.0357692965942153, + "grad_norm": 12.928386688232422, + "learning_rate": 1.1637193382772392e-06, + "loss": 5.5902, + "step": 209 + }, + { + "epoch": 0.035940441553996234, + "grad_norm": 123.55076599121094, + "learning_rate": 1.1694238448374215e-06, + "loss": 15.6958, + "step": 210 + }, + { + "epoch": 0.03611158651377717, + "grad_norm": 44.79010772705078, + "learning_rate": 1.175128351397604e-06, + "loss": 11.1894, + "step": 211 + }, + { + "epoch": 0.0362827314735581, + "grad_norm": 26.461137771606445, + "learning_rate": 1.1808328579577868e-06, + "loss": 7.3237, + "step": 212 + }, + { + "epoch": 0.03645387643333904, + "grad_norm": 24.63947296142578, + "learning_rate": 1.1865373645179693e-06, + "loss": 5.7252, + "step": 213 + }, + { + "epoch": 0.036625021393119975, + "grad_norm": 17.151113510131836, + "learning_rate": 1.1922418710781517e-06, + "loss": 9.0419, + "step": 214 + }, + { + "epoch": 0.03679616635290091, + "grad_norm": 26.69593620300293, + "learning_rate": 1.1979463776383344e-06, + "loss": 9.4836, + "step": 215 + }, + { + "epoch": 0.03696731131268184, + "grad_norm": 50.901573181152344, + "learning_rate": 1.203650884198517e-06, + "loss": 11.2858, + "step": 216 + }, + { + "epoch": 0.037138456272462775, + "grad_norm": 48.110328674316406, + "learning_rate": 1.2093553907586992e-06, + "loss": 11.5594, + "step": 217 + }, + { + "epoch": 0.03730960123224371, + "grad_norm": 51.77389907836914, + "learning_rate": 1.215059897318882e-06, + "loss": 11.6974, + "step": 218 + }, + { + "epoch": 0.03748074619202464, + "grad_norm": 23.52347183227539, + "learning_rate": 1.2207644038790645e-06, + "loss": 9.5737, + "step": 219 + }, + { + "epoch": 0.03765189115180558, + "grad_norm": 20.402074813842773, + "learning_rate": 1.2264689104392468e-06, + "loss": 6.1995, + "step": 220 + }, + { + "epoch": 0.037823036111586515, + "grad_norm": 18.76962661743164, + "learning_rate": 1.2321734169994296e-06, + "loss": 7.1013, + "step": 221 + }, + { + "epoch": 0.03799418107136745, + "grad_norm": 21.817501068115234, + "learning_rate": 1.2378779235596121e-06, + "loss": 9.3332, + "step": 222 + }, + { + "epoch": 0.03816532603114838, + "grad_norm": 11.452000617980957, + "learning_rate": 1.2435824301197949e-06, + "loss": 6.2887, + "step": 223 + }, + { + "epoch": 0.038336470990929315, + "grad_norm": 22.69776153564453, + "learning_rate": 1.2492869366799772e-06, + "loss": 7.9947, + "step": 224 + }, + { + "epoch": 0.03850761595071025, + "grad_norm": 25.39488410949707, + "learning_rate": 1.2549914432401597e-06, + "loss": 5.1894, + "step": 225 + }, + { + "epoch": 0.03867876091049119, + "grad_norm": 17.65719223022461, + "learning_rate": 1.2606959498003425e-06, + "loss": 7.4931, + "step": 226 + }, + { + "epoch": 0.03884990587027212, + "grad_norm": 23.45711898803711, + "learning_rate": 1.2664004563605248e-06, + "loss": 9.6157, + "step": 227 + }, + { + "epoch": 0.039021050830053056, + "grad_norm": 29.114194869995117, + "learning_rate": 1.2721049629207073e-06, + "loss": 10.4857, + "step": 228 + }, + { + "epoch": 0.03919219578983399, + "grad_norm": 46.365013122558594, + "learning_rate": 1.27780946948089e-06, + "loss": 11.9216, + "step": 229 + }, + { + "epoch": 0.03936334074961492, + "grad_norm": 23.066879272460938, + "learning_rate": 1.2835139760410724e-06, + "loss": 9.4344, + "step": 230 + }, + { + "epoch": 0.039534485709395856, + "grad_norm": 15.414644241333008, + "learning_rate": 1.289218482601255e-06, + "loss": 6.4409, + "step": 231 + }, + { + "epoch": 0.039705630669176796, + "grad_norm": 16.58795166015625, + "learning_rate": 1.2949229891614376e-06, + "loss": 7.3307, + "step": 232 + }, + { + "epoch": 0.03987677562895773, + "grad_norm": 36.44779968261719, + "learning_rate": 1.3006274957216202e-06, + "loss": 11.1388, + "step": 233 + }, + { + "epoch": 0.04004792058873866, + "grad_norm": 20.902912139892578, + "learning_rate": 1.3063320022818027e-06, + "loss": 7.378, + "step": 234 + }, + { + "epoch": 0.040219065548519596, + "grad_norm": 20.50259017944336, + "learning_rate": 1.3120365088419852e-06, + "loss": 6.156, + "step": 235 + }, + { + "epoch": 0.04039021050830053, + "grad_norm": 22.57229995727539, + "learning_rate": 1.3177410154021678e-06, + "loss": 7.0029, + "step": 236 + }, + { + "epoch": 0.04056135546808146, + "grad_norm": 25.610868453979492, + "learning_rate": 1.3234455219623503e-06, + "loss": 8.7721, + "step": 237 + }, + { + "epoch": 0.040732500427862396, + "grad_norm": 278.795654296875, + "learning_rate": 1.3291500285225328e-06, + "loss": 15.9633, + "step": 238 + }, + { + "epoch": 0.040903645387643336, + "grad_norm": 11.644048690795898, + "learning_rate": 1.3348545350827154e-06, + "loss": 6.3707, + "step": 239 + }, + { + "epoch": 0.04107479034742427, + "grad_norm": 36.32057189941406, + "learning_rate": 1.340559041642898e-06, + "loss": 10.7901, + "step": 240 + }, + { + "epoch": 0.0412459353072052, + "grad_norm": 22.911476135253906, + "learning_rate": 1.3462635482030804e-06, + "loss": 9.4097, + "step": 241 + }, + { + "epoch": 0.04141708026698614, + "grad_norm": 24.35552406311035, + "learning_rate": 1.351968054763263e-06, + "loss": 9.081, + "step": 242 + }, + { + "epoch": 0.04158822522676707, + "grad_norm": 18.466432571411133, + "learning_rate": 1.3576725613234457e-06, + "loss": 7.4805, + "step": 243 + }, + { + "epoch": 0.041759370186548, + "grad_norm": 44.41029357910156, + "learning_rate": 1.363377067883628e-06, + "loss": 11.2131, + "step": 244 + }, + { + "epoch": 0.041930515146328944, + "grad_norm": 15.328824043273926, + "learning_rate": 1.3690815744438106e-06, + "loss": 8.2706, + "step": 245 + }, + { + "epoch": 0.04210166010610988, + "grad_norm": 274.3642578125, + "learning_rate": 1.3747860810039933e-06, + "loss": 15.8791, + "step": 246 + }, + { + "epoch": 0.04227280506589081, + "grad_norm": 18.105318069458008, + "learning_rate": 1.3804905875641756e-06, + "loss": 8.9079, + "step": 247 + }, + { + "epoch": 0.042443950025671744, + "grad_norm": 22.90168571472168, + "learning_rate": 1.3861950941243584e-06, + "loss": 6.6905, + "step": 248 + }, + { + "epoch": 0.04261509498545268, + "grad_norm": 16.96687126159668, + "learning_rate": 1.391899600684541e-06, + "loss": 8.5567, + "step": 249 + }, + { + "epoch": 0.04278623994523361, + "grad_norm": 283.76409912109375, + "learning_rate": 1.3976041072447232e-06, + "loss": 14.4204, + "step": 250 + }, + { + "epoch": 0.04295738490501455, + "grad_norm": 22.41378402709961, + "learning_rate": 1.403308613804906e-06, + "loss": 9.6063, + "step": 251 + }, + { + "epoch": 0.043128529864795484, + "grad_norm": 23.26137924194336, + "learning_rate": 1.4090131203650885e-06, + "loss": 9.9569, + "step": 252 + }, + { + "epoch": 0.04329967482457642, + "grad_norm": 19.40400505065918, + "learning_rate": 1.414717626925271e-06, + "loss": 6.4322, + "step": 253 + }, + { + "epoch": 0.04347081978435735, + "grad_norm": 21.541933059692383, + "learning_rate": 1.4204221334854536e-06, + "loss": 4.5325, + "step": 254 + }, + { + "epoch": 0.043641964744138284, + "grad_norm": 17.52275276184082, + "learning_rate": 1.426126640045636e-06, + "loss": 8.3479, + "step": 255 + }, + { + "epoch": 0.04381310970391922, + "grad_norm": 125.6756591796875, + "learning_rate": 1.4318311466058186e-06, + "loss": 15.4145, + "step": 256 + }, + { + "epoch": 0.04398425466370015, + "grad_norm": 18.166152954101562, + "learning_rate": 1.4375356531660011e-06, + "loss": 4.2531, + "step": 257 + }, + { + "epoch": 0.04415539962348109, + "grad_norm": 25.4247989654541, + "learning_rate": 1.4432401597261837e-06, + "loss": 10.4856, + "step": 258 + }, + { + "epoch": 0.044326544583262024, + "grad_norm": 17.259897232055664, + "learning_rate": 1.4489446662863664e-06, + "loss": 8.6032, + "step": 259 + }, + { + "epoch": 0.04449768954304296, + "grad_norm": 23.197059631347656, + "learning_rate": 1.4546491728465487e-06, + "loss": 7.8062, + "step": 260 + }, + { + "epoch": 0.04466883450282389, + "grad_norm": 43.4500617980957, + "learning_rate": 1.4603536794067313e-06, + "loss": 11.1986, + "step": 261 + }, + { + "epoch": 0.044839979462604825, + "grad_norm": 122.06368255615234, + "learning_rate": 1.466058185966914e-06, + "loss": 15.5832, + "step": 262 + }, + { + "epoch": 0.04501112442238576, + "grad_norm": 16.506317138671875, + "learning_rate": 1.4717626925270965e-06, + "loss": 8.7747, + "step": 263 + }, + { + "epoch": 0.0451822693821667, + "grad_norm": 19.03982162475586, + "learning_rate": 1.4774671990872789e-06, + "loss": 7.6134, + "step": 264 + }, + { + "epoch": 0.04535341434194763, + "grad_norm": 33.20307540893555, + "learning_rate": 1.4831717056474616e-06, + "loss": 10.3325, + "step": 265 + }, + { + "epoch": 0.045524559301728565, + "grad_norm": 16.946876525878906, + "learning_rate": 1.4888762122076441e-06, + "loss": 8.2866, + "step": 266 + }, + { + "epoch": 0.0456957042615095, + "grad_norm": 25.170318603515625, + "learning_rate": 1.4945807187678265e-06, + "loss": 10.2958, + "step": 267 + }, + { + "epoch": 0.04586684922129043, + "grad_norm": 16.860721588134766, + "learning_rate": 1.5002852253280092e-06, + "loss": 8.5198, + "step": 268 + }, + { + "epoch": 0.046037994181071365, + "grad_norm": 18.003284454345703, + "learning_rate": 1.5059897318881917e-06, + "loss": 8.8484, + "step": 269 + }, + { + "epoch": 0.046209139140852305, + "grad_norm": 17.796016693115234, + "learning_rate": 1.511694238448374e-06, + "loss": 6.2495, + "step": 270 + }, + { + "epoch": 0.04638028410063324, + "grad_norm": 23.97182846069336, + "learning_rate": 1.5173987450085568e-06, + "loss": 7.0879, + "step": 271 + }, + { + "epoch": 0.04655142906041417, + "grad_norm": 213.1482696533203, + "learning_rate": 1.5231032515687393e-06, + "loss": 12.8754, + "step": 272 + }, + { + "epoch": 0.046722574020195105, + "grad_norm": 25.503662109375, + "learning_rate": 1.528807758128922e-06, + "loss": 7.9125, + "step": 273 + }, + { + "epoch": 0.04689371897997604, + "grad_norm": 19.832860946655273, + "learning_rate": 1.5345122646891044e-06, + "loss": 9.0794, + "step": 274 + }, + { + "epoch": 0.04706486393975697, + "grad_norm": 32.311920166015625, + "learning_rate": 1.540216771249287e-06, + "loss": 10.648, + "step": 275 + }, + { + "epoch": 0.047236008899537905, + "grad_norm": 39.916603088378906, + "learning_rate": 1.5459212778094697e-06, + "loss": 10.9246, + "step": 276 + }, + { + "epoch": 0.047407153859318846, + "grad_norm": 21.337602615356445, + "learning_rate": 1.551625784369652e-06, + "loss": 9.2191, + "step": 277 + }, + { + "epoch": 0.04757829881909978, + "grad_norm": 25.114675521850586, + "learning_rate": 1.5573302909298345e-06, + "loss": 10.2576, + "step": 278 + }, + { + "epoch": 0.04774944377888071, + "grad_norm": 14.945568084716797, + "learning_rate": 1.5630347974900173e-06, + "loss": 8.4857, + "step": 279 + }, + { + "epoch": 0.047920588738661646, + "grad_norm": 33.542449951171875, + "learning_rate": 1.5687393040501996e-06, + "loss": 10.9193, + "step": 280 + }, + { + "epoch": 0.04809173369844258, + "grad_norm": 27.331628799438477, + "learning_rate": 1.5744438106103821e-06, + "loss": 9.9441, + "step": 281 + }, + { + "epoch": 0.04826287865822351, + "grad_norm": 17.784677505493164, + "learning_rate": 1.5801483171705649e-06, + "loss": 6.4105, + "step": 282 + }, + { + "epoch": 0.04843402361800445, + "grad_norm": 46.38033676147461, + "learning_rate": 1.5858528237307474e-06, + "loss": 10.5075, + "step": 283 + }, + { + "epoch": 0.048605168577785386, + "grad_norm": 13.535309791564941, + "learning_rate": 1.59155733029093e-06, + "loss": 4.4568, + "step": 284 + }, + { + "epoch": 0.04877631353756632, + "grad_norm": 27.45166015625, + "learning_rate": 1.5972618368511125e-06, + "loss": 10.2344, + "step": 285 + }, + { + "epoch": 0.04894745849734725, + "grad_norm": 16.50087547302246, + "learning_rate": 1.602966343411295e-06, + "loss": 8.5428, + "step": 286 + }, + { + "epoch": 0.049118603457128186, + "grad_norm": 42.31341552734375, + "learning_rate": 1.6086708499714775e-06, + "loss": 10.0868, + "step": 287 + }, + { + "epoch": 0.04928974841690912, + "grad_norm": 17.977153778076172, + "learning_rate": 1.61437535653166e-06, + "loss": 9.012, + "step": 288 + }, + { + "epoch": 0.04946089337669006, + "grad_norm": 104.7464828491211, + "learning_rate": 1.6200798630918426e-06, + "loss": 14.6671, + "step": 289 + }, + { + "epoch": 0.04963203833647099, + "grad_norm": 17.432056427001953, + "learning_rate": 1.6257843696520251e-06, + "loss": 6.8872, + "step": 290 + }, + { + "epoch": 0.04980318329625193, + "grad_norm": 242.7275390625, + "learning_rate": 1.6314888762122076e-06, + "loss": 11.2526, + "step": 291 + }, + { + "epoch": 0.04997432825603286, + "grad_norm": 15.779862403869629, + "learning_rate": 1.6371933827723902e-06, + "loss": 8.7887, + "step": 292 + }, + { + "epoch": 0.05014547321581379, + "grad_norm": 13.621806144714355, + "learning_rate": 1.642897889332573e-06, + "loss": 7.0578, + "step": 293 + }, + { + "epoch": 0.05031661817559473, + "grad_norm": 14.4631986618042, + "learning_rate": 1.6486023958927552e-06, + "loss": 8.2147, + "step": 294 + }, + { + "epoch": 0.05048776313537566, + "grad_norm": 18.11038589477539, + "learning_rate": 1.6543069024529378e-06, + "loss": 6.4308, + "step": 295 + }, + { + "epoch": 0.0506589080951566, + "grad_norm": 16.797258377075195, + "learning_rate": 1.6600114090131205e-06, + "loss": 6.3738, + "step": 296 + }, + { + "epoch": 0.050830053054937534, + "grad_norm": 17.457462310791016, + "learning_rate": 1.6657159155733028e-06, + "loss": 6.3681, + "step": 297 + }, + { + "epoch": 0.05100119801471847, + "grad_norm": 14.502140045166016, + "learning_rate": 1.6714204221334856e-06, + "loss": 7.1297, + "step": 298 + }, + { + "epoch": 0.0511723429744994, + "grad_norm": 14.4544677734375, + "learning_rate": 1.6771249286936681e-06, + "loss": 8.5584, + "step": 299 + }, + { + "epoch": 0.051343487934280334, + "grad_norm": 13.313618659973145, + "learning_rate": 1.6828294352538504e-06, + "loss": 8.1348, + "step": 300 + }, + { + "epoch": 0.05151463289406127, + "grad_norm": 91.8434829711914, + "learning_rate": 1.6885339418140332e-06, + "loss": 13.9421, + "step": 301 + }, + { + "epoch": 0.05168577785384221, + "grad_norm": 39.31818389892578, + "learning_rate": 1.6942384483742157e-06, + "loss": 10.3291, + "step": 302 + }, + { + "epoch": 0.05185692281362314, + "grad_norm": 16.320667266845703, + "learning_rate": 1.6999429549343982e-06, + "loss": 4.6866, + "step": 303 + }, + { + "epoch": 0.052028067773404074, + "grad_norm": 13.367071151733398, + "learning_rate": 1.7056474614945808e-06, + "loss": 8.1535, + "step": 304 + }, + { + "epoch": 0.05219921273318501, + "grad_norm": 186.96824645996094, + "learning_rate": 1.7113519680547633e-06, + "loss": 10.6341, + "step": 305 + }, + { + "epoch": 0.05237035769296594, + "grad_norm": 28.400169372558594, + "learning_rate": 1.7170564746149458e-06, + "loss": 9.7369, + "step": 306 + }, + { + "epoch": 0.052541502652746874, + "grad_norm": 15.559652328491211, + "learning_rate": 1.7227609811751284e-06, + "loss": 7.1427, + "step": 307 + }, + { + "epoch": 0.05271264761252781, + "grad_norm": 5.730342864990234, + "learning_rate": 1.728465487735311e-06, + "loss": 5.4861, + "step": 308 + }, + { + "epoch": 0.05288379257230875, + "grad_norm": 19.06242561340332, + "learning_rate": 1.7341699942954936e-06, + "loss": 9.0657, + "step": 309 + }, + { + "epoch": 0.05305493753208968, + "grad_norm": 18.580720901489258, + "learning_rate": 1.739874500855676e-06, + "loss": 5.9947, + "step": 310 + }, + { + "epoch": 0.053226082491870615, + "grad_norm": 13.939530372619629, + "learning_rate": 1.7455790074158585e-06, + "loss": 7.1715, + "step": 311 + }, + { + "epoch": 0.05339722745165155, + "grad_norm": 12.347646713256836, + "learning_rate": 1.7512835139760412e-06, + "loss": 4.5087, + "step": 312 + }, + { + "epoch": 0.05356837241143248, + "grad_norm": 16.251863479614258, + "learning_rate": 1.7569880205362238e-06, + "loss": 8.7544, + "step": 313 + }, + { + "epoch": 0.053739517371213415, + "grad_norm": 18.887571334838867, + "learning_rate": 1.762692527096406e-06, + "loss": 7.1006, + "step": 314 + }, + { + "epoch": 0.053910662330994355, + "grad_norm": 29.57771873474121, + "learning_rate": 1.7683970336565888e-06, + "loss": 10.2554, + "step": 315 + }, + { + "epoch": 0.05408180729077529, + "grad_norm": 215.26080322265625, + "learning_rate": 1.7741015402167714e-06, + "loss": 10.6589, + "step": 316 + }, + { + "epoch": 0.05425295225055622, + "grad_norm": 6.18715763092041, + "learning_rate": 1.7798060467769537e-06, + "loss": 5.3794, + "step": 317 + }, + { + "epoch": 0.054424097210337155, + "grad_norm": 30.351348876953125, + "learning_rate": 1.7855105533371364e-06, + "loss": 10.3749, + "step": 318 + }, + { + "epoch": 0.05459524217011809, + "grad_norm": 16.978347778320312, + "learning_rate": 1.791215059897319e-06, + "loss": 6.2012, + "step": 319 + }, + { + "epoch": 0.05476638712989902, + "grad_norm": 19.239072799682617, + "learning_rate": 1.7969195664575015e-06, + "loss": 9.1925, + "step": 320 + }, + { + "epoch": 0.05493753208967996, + "grad_norm": 20.378984451293945, + "learning_rate": 1.802624073017684e-06, + "loss": 8.7484, + "step": 321 + }, + { + "epoch": 0.055108677049460895, + "grad_norm": 11.863981246948242, + "learning_rate": 1.8083285795778666e-06, + "loss": 6.308, + "step": 322 + }, + { + "epoch": 0.05527982200924183, + "grad_norm": 15.815791130065918, + "learning_rate": 1.8140330861380493e-06, + "loss": 8.9935, + "step": 323 + }, + { + "epoch": 0.05545096696902276, + "grad_norm": 31.865665435791016, + "learning_rate": 1.8197375926982316e-06, + "loss": 10.1397, + "step": 324 + }, + { + "epoch": 0.055622111928803696, + "grad_norm": 160.87301635742188, + "learning_rate": 1.8254420992584141e-06, + "loss": 9.2965, + "step": 325 + }, + { + "epoch": 0.05579325688858463, + "grad_norm": 16.763856887817383, + "learning_rate": 1.8311466058185969e-06, + "loss": 6.6638, + "step": 326 + }, + { + "epoch": 0.05596440184836556, + "grad_norm": 12.291769981384277, + "learning_rate": 1.8368511123787792e-06, + "loss": 8.2182, + "step": 327 + }, + { + "epoch": 0.0561355468081465, + "grad_norm": 20.839473724365234, + "learning_rate": 1.8425556189389617e-06, + "loss": 5.9446, + "step": 328 + }, + { + "epoch": 0.056306691767927436, + "grad_norm": 41.371337890625, + "learning_rate": 1.8482601254991445e-06, + "loss": 10.0738, + "step": 329 + }, + { + "epoch": 0.05647783672770837, + "grad_norm": 12.416519165039062, + "learning_rate": 1.8539646320593268e-06, + "loss": 7.9372, + "step": 330 + }, + { + "epoch": 0.0566489816874893, + "grad_norm": 12.856998443603516, + "learning_rate": 1.8596691386195093e-06, + "loss": 8.5894, + "step": 331 + }, + { + "epoch": 0.056820126647270236, + "grad_norm": 28.67165184020996, + "learning_rate": 1.865373645179692e-06, + "loss": 9.856, + "step": 332 + }, + { + "epoch": 0.05699127160705117, + "grad_norm": 17.425006866455078, + "learning_rate": 1.8710781517398746e-06, + "loss": 7.6487, + "step": 333 + }, + { + "epoch": 0.05716241656683211, + "grad_norm": 29.102951049804688, + "learning_rate": 1.8767826583000571e-06, + "loss": 9.7985, + "step": 334 + }, + { + "epoch": 0.05733356152661304, + "grad_norm": 15.120597839355469, + "learning_rate": 1.8824871648602395e-06, + "loss": 8.856, + "step": 335 + }, + { + "epoch": 0.057504706486393976, + "grad_norm": 188.02642822265625, + "learning_rate": 1.8881916714204222e-06, + "loss": 9.3274, + "step": 336 + }, + { + "epoch": 0.05767585144617491, + "grad_norm": 14.4713134765625, + "learning_rate": 1.8938961779806047e-06, + "loss": 8.8408, + "step": 337 + }, + { + "epoch": 0.05784699640595584, + "grad_norm": 27.848546981811523, + "learning_rate": 1.8996006845407875e-06, + "loss": 10.1598, + "step": 338 + }, + { + "epoch": 0.058018141365736776, + "grad_norm": 12.024163246154785, + "learning_rate": 1.9053051911009698e-06, + "loss": 6.2088, + "step": 339 + }, + { + "epoch": 0.05818928632551772, + "grad_norm": 11.968954086303711, + "learning_rate": 1.9110096976611523e-06, + "loss": 7.3791, + "step": 340 + }, + { + "epoch": 0.05836043128529865, + "grad_norm": 27.01519775390625, + "learning_rate": 1.9167142042213353e-06, + "loss": 10.5111, + "step": 341 + }, + { + "epoch": 0.05853157624507958, + "grad_norm": 13.136455535888672, + "learning_rate": 1.9224187107815174e-06, + "loss": 4.512, + "step": 342 + }, + { + "epoch": 0.05870272120486052, + "grad_norm": 16.26902198791504, + "learning_rate": 1.9281232173417e-06, + "loss": 6.8285, + "step": 343 + }, + { + "epoch": 0.05887386616464145, + "grad_norm": 16.47487449645996, + "learning_rate": 1.933827723901883e-06, + "loss": 8.8793, + "step": 344 + }, + { + "epoch": 0.059045011124422384, + "grad_norm": 32.750850677490234, + "learning_rate": 1.939532230462065e-06, + "loss": 10.0536, + "step": 345 + }, + { + "epoch": 0.05921615608420332, + "grad_norm": 18.996196746826172, + "learning_rate": 1.9452367370222475e-06, + "loss": 6.1966, + "step": 346 + }, + { + "epoch": 0.05938730104398426, + "grad_norm": 24.546964645385742, + "learning_rate": 1.9509412435824305e-06, + "loss": 9.4677, + "step": 347 + }, + { + "epoch": 0.05955844600376519, + "grad_norm": 84.20301055908203, + "learning_rate": 1.9566457501426126e-06, + "loss": 14.0007, + "step": 348 + }, + { + "epoch": 0.059729590963546124, + "grad_norm": 18.845518112182617, + "learning_rate": 1.962350256702795e-06, + "loss": 6.1715, + "step": 349 + }, + { + "epoch": 0.05990073592332706, + "grad_norm": 32.177085876464844, + "learning_rate": 1.968054763262978e-06, + "loss": 10.5934, + "step": 350 + }, + { + "epoch": 0.06007188088310799, + "grad_norm": 24.051923751831055, + "learning_rate": 1.97375926982316e-06, + "loss": 9.6501, + "step": 351 + }, + { + "epoch": 0.060243025842888924, + "grad_norm": 13.522736549377441, + "learning_rate": 1.9794637763833427e-06, + "loss": 8.8967, + "step": 352 + }, + { + "epoch": 0.060414170802669864, + "grad_norm": 21.437868118286133, + "learning_rate": 1.9851682829435257e-06, + "loss": 9.8243, + "step": 353 + }, + { + "epoch": 0.0605853157624508, + "grad_norm": 30.177589416503906, + "learning_rate": 1.9908727895037078e-06, + "loss": 9.5401, + "step": 354 + }, + { + "epoch": 0.06075646072223173, + "grad_norm": 12.939532279968262, + "learning_rate": 1.9965772960638903e-06, + "loss": 6.4143, + "step": 355 + }, + { + "epoch": 0.060927605682012664, + "grad_norm": 18.022136688232422, + "learning_rate": 2.0022818026240733e-06, + "loss": 9.6522, + "step": 356 + }, + { + "epoch": 0.0610987506417936, + "grad_norm": 12.483067512512207, + "learning_rate": 2.0079863091842554e-06, + "loss": 9.2254, + "step": 357 + }, + { + "epoch": 0.06126989560157453, + "grad_norm": 19.432615280151367, + "learning_rate": 2.0136908157444383e-06, + "loss": 6.2483, + "step": 358 + }, + { + "epoch": 0.06144104056135547, + "grad_norm": 177.3258819580078, + "learning_rate": 2.019395322304621e-06, + "loss": 9.2975, + "step": 359 + }, + { + "epoch": 0.061612185521136405, + "grad_norm": 14.458636283874512, + "learning_rate": 2.025099828864803e-06, + "loss": 8.5874, + "step": 360 + }, + { + "epoch": 0.06178333048091734, + "grad_norm": 21.112350463867188, + "learning_rate": 2.030804335424986e-06, + "loss": 9.4896, + "step": 361 + }, + { + "epoch": 0.06195447544069827, + "grad_norm": 15.956084251403809, + "learning_rate": 2.0365088419851685e-06, + "loss": 9.3311, + "step": 362 + }, + { + "epoch": 0.062125620400479205, + "grad_norm": 11.96216869354248, + "learning_rate": 2.042213348545351e-06, + "loss": 8.2885, + "step": 363 + }, + { + "epoch": 0.06229676536026014, + "grad_norm": 16.588687896728516, + "learning_rate": 2.0479178551055335e-06, + "loss": 8.5745, + "step": 364 + }, + { + "epoch": 0.06246791032004107, + "grad_norm": 20.95501708984375, + "learning_rate": 2.053622361665716e-06, + "loss": 9.5327, + "step": 365 + }, + { + "epoch": 0.06263905527982201, + "grad_norm": 14.255351066589355, + "learning_rate": 2.0593268682258986e-06, + "loss": 9.1372, + "step": 366 + }, + { + "epoch": 0.06281020023960295, + "grad_norm": 17.529571533203125, + "learning_rate": 2.065031374786081e-06, + "loss": 6.9098, + "step": 367 + }, + { + "epoch": 0.06298134519938388, + "grad_norm": 23.381641387939453, + "learning_rate": 2.0707358813462636e-06, + "loss": 9.4994, + "step": 368 + }, + { + "epoch": 0.06315249015916481, + "grad_norm": 152.30535888671875, + "learning_rate": 2.076440387906446e-06, + "loss": 8.5952, + "step": 369 + }, + { + "epoch": 0.06332363511894575, + "grad_norm": 15.447931289672852, + "learning_rate": 2.0821448944666287e-06, + "loss": 7.1287, + "step": 370 + }, + { + "epoch": 0.06349478007872668, + "grad_norm": 13.553053855895996, + "learning_rate": 2.0878494010268112e-06, + "loss": 8.0622, + "step": 371 + }, + { + "epoch": 0.06366592503850761, + "grad_norm": 13.198517799377441, + "learning_rate": 2.0935539075869938e-06, + "loss": 8.3527, + "step": 372 + }, + { + "epoch": 0.06383706999828855, + "grad_norm": 21.851369857788086, + "learning_rate": 2.0992584141471763e-06, + "loss": 6.7771, + "step": 373 + }, + { + "epoch": 0.06400821495806948, + "grad_norm": 30.56134605407715, + "learning_rate": 2.104962920707359e-06, + "loss": 9.666, + "step": 374 + }, + { + "epoch": 0.06417935991785043, + "grad_norm": 18.76494026184082, + "learning_rate": 2.1106674272675414e-06, + "loss": 9.3941, + "step": 375 + }, + { + "epoch": 0.06435050487763136, + "grad_norm": 19.92658805847168, + "learning_rate": 2.116371933827724e-06, + "loss": 9.3741, + "step": 376 + }, + { + "epoch": 0.06452164983741229, + "grad_norm": 10.430363655090332, + "learning_rate": 2.1220764403879064e-06, + "loss": 7.8113, + "step": 377 + }, + { + "epoch": 0.06469279479719323, + "grad_norm": 18.093847274780273, + "learning_rate": 2.1277809469480894e-06, + "loss": 6.1706, + "step": 378 + }, + { + "epoch": 0.06486393975697416, + "grad_norm": 21.807714462280273, + "learning_rate": 2.1334854535082715e-06, + "loss": 9.471, + "step": 379 + }, + { + "epoch": 0.06503508471675509, + "grad_norm": 10.38511848449707, + "learning_rate": 2.139189960068454e-06, + "loss": 4.2784, + "step": 380 + }, + { + "epoch": 0.06520622967653603, + "grad_norm": 18.564613342285156, + "learning_rate": 2.144894466628637e-06, + "loss": 9.548, + "step": 381 + }, + { + "epoch": 0.06537737463631696, + "grad_norm": 13.890935897827148, + "learning_rate": 2.150598973188819e-06, + "loss": 7.9354, + "step": 382 + }, + { + "epoch": 0.06554851959609789, + "grad_norm": 18.593252182006836, + "learning_rate": 2.1563034797490016e-06, + "loss": 6.34, + "step": 383 + }, + { + "epoch": 0.06571966455587883, + "grad_norm": 10.455931663513184, + "learning_rate": 2.1620079863091846e-06, + "loss": 6.2716, + "step": 384 + }, + { + "epoch": 0.06589080951565976, + "grad_norm": 21.231943130493164, + "learning_rate": 2.1677124928693667e-06, + "loss": 5.9761, + "step": 385 + }, + { + "epoch": 0.06606195447544069, + "grad_norm": 11.568195343017578, + "learning_rate": 2.173416999429549e-06, + "loss": 4.4776, + "step": 386 + }, + { + "epoch": 0.06623309943522163, + "grad_norm": 23.829204559326172, + "learning_rate": 2.179121505989732e-06, + "loss": 9.648, + "step": 387 + }, + { + "epoch": 0.06640424439500257, + "grad_norm": 10.398987770080566, + "learning_rate": 2.1848260125499147e-06, + "loss": 4.7062, + "step": 388 + }, + { + "epoch": 0.0665753893547835, + "grad_norm": 11.396307945251465, + "learning_rate": 2.190530519110097e-06, + "loss": 8.1087, + "step": 389 + }, + { + "epoch": 0.06674653431456444, + "grad_norm": 18.780866622924805, + "learning_rate": 2.1962350256702798e-06, + "loss": 6.196, + "step": 390 + }, + { + "epoch": 0.06691767927434537, + "grad_norm": 18.36736488342285, + "learning_rate": 2.2019395322304623e-06, + "loss": 6.2459, + "step": 391 + }, + { + "epoch": 0.0670888242341263, + "grad_norm": 18.681446075439453, + "learning_rate": 2.2076440387906444e-06, + "loss": 9.4161, + "step": 392 + }, + { + "epoch": 0.06725996919390724, + "grad_norm": 15.113629341125488, + "learning_rate": 2.2133485453508274e-06, + "loss": 6.3517, + "step": 393 + }, + { + "epoch": 0.06743111415368817, + "grad_norm": 11.273137092590332, + "learning_rate": 2.21905305191101e-06, + "loss": 8.0886, + "step": 394 + }, + { + "epoch": 0.06760225911346911, + "grad_norm": 17.580646514892578, + "learning_rate": 2.224757558471192e-06, + "loss": 6.5059, + "step": 395 + }, + { + "epoch": 0.06777340407325004, + "grad_norm": 15.864416122436523, + "learning_rate": 2.230462065031375e-06, + "loss": 8.5624, + "step": 396 + }, + { + "epoch": 0.06794454903303097, + "grad_norm": 11.407431602478027, + "learning_rate": 2.2361665715915575e-06, + "loss": 7.853, + "step": 397 + }, + { + "epoch": 0.06811569399281191, + "grad_norm": 28.192079544067383, + "learning_rate": 2.24187107815174e-06, + "loss": 9.4467, + "step": 398 + }, + { + "epoch": 0.06828683895259284, + "grad_norm": 19.4180965423584, + "learning_rate": 2.2475755847119225e-06, + "loss": 5.6605, + "step": 399 + }, + { + "epoch": 0.06845798391237377, + "grad_norm": 19.75929069519043, + "learning_rate": 2.253280091272105e-06, + "loss": 9.4512, + "step": 400 + }, + { + "epoch": 0.06862912887215472, + "grad_norm": 10.311906814575195, + "learning_rate": 2.2589845978322876e-06, + "loss": 7.9644, + "step": 401 + }, + { + "epoch": 0.06880027383193565, + "grad_norm": 20.4741268157959, + "learning_rate": 2.26468910439247e-06, + "loss": 9.4577, + "step": 402 + }, + { + "epoch": 0.06897141879171659, + "grad_norm": 25.65606117248535, + "learning_rate": 2.2703936109526527e-06, + "loss": 9.6371, + "step": 403 + }, + { + "epoch": 0.06914256375149752, + "grad_norm": 26.26441192626953, + "learning_rate": 2.276098117512835e-06, + "loss": 9.5365, + "step": 404 + }, + { + "epoch": 0.06931370871127845, + "grad_norm": 14.249612808227539, + "learning_rate": 2.2818026240730177e-06, + "loss": 8.5897, + "step": 405 + }, + { + "epoch": 0.06948485367105939, + "grad_norm": 17.306989669799805, + "learning_rate": 2.2875071306332003e-06, + "loss": 6.9065, + "step": 406 + }, + { + "epoch": 0.06965599863084032, + "grad_norm": 10.925597190856934, + "learning_rate": 2.293211637193383e-06, + "loss": 4.4132, + "step": 407 + }, + { + "epoch": 0.06982714359062125, + "grad_norm": 20.995426177978516, + "learning_rate": 2.2989161437535653e-06, + "loss": 9.6018, + "step": 408 + }, + { + "epoch": 0.06999828855040219, + "grad_norm": 13.343510627746582, + "learning_rate": 2.304620650313748e-06, + "loss": 8.3354, + "step": 409 + }, + { + "epoch": 0.07016943351018312, + "grad_norm": 21.461809158325195, + "learning_rate": 2.3103251568739304e-06, + "loss": 9.3101, + "step": 410 + }, + { + "epoch": 0.07034057846996405, + "grad_norm": 25.428903579711914, + "learning_rate": 2.316029663434113e-06, + "loss": 9.4155, + "step": 411 + }, + { + "epoch": 0.07051172342974499, + "grad_norm": 22.469390869140625, + "learning_rate": 2.3217341699942955e-06, + "loss": 6.4331, + "step": 412 + }, + { + "epoch": 0.07068286838952594, + "grad_norm": 157.02752685546875, + "learning_rate": 2.3274386765544784e-06, + "loss": 7.6313, + "step": 413 + }, + { + "epoch": 0.07085401334930687, + "grad_norm": 12.20741081237793, + "learning_rate": 2.3331431831146605e-06, + "loss": 4.2273, + "step": 414 + }, + { + "epoch": 0.0710251583090878, + "grad_norm": 19.81876564025879, + "learning_rate": 2.338847689674843e-06, + "loss": 9.5364, + "step": 415 + }, + { + "epoch": 0.07119630326886874, + "grad_norm": 17.362276077270508, + "learning_rate": 2.344552196235026e-06, + "loss": 9.4605, + "step": 416 + }, + { + "epoch": 0.07136744822864967, + "grad_norm": 22.898147583007812, + "learning_rate": 2.350256702795208e-06, + "loss": 9.5846, + "step": 417 + }, + { + "epoch": 0.0715385931884306, + "grad_norm": 17.685535430908203, + "learning_rate": 2.3559612093553906e-06, + "loss": 8.0604, + "step": 418 + }, + { + "epoch": 0.07170973814821154, + "grad_norm": 16.97225570678711, + "learning_rate": 2.3616657159155736e-06, + "loss": 9.0822, + "step": 419 + }, + { + "epoch": 0.07188088310799247, + "grad_norm": 21.690431594848633, + "learning_rate": 2.3673702224757557e-06, + "loss": 5.9587, + "step": 420 + }, + { + "epoch": 0.0720520280677734, + "grad_norm": 20.209810256958008, + "learning_rate": 2.3730747290359387e-06, + "loss": 6.1507, + "step": 421 + }, + { + "epoch": 0.07222317302755434, + "grad_norm": 19.15233039855957, + "learning_rate": 2.378779235596121e-06, + "loss": 9.5222, + "step": 422 + }, + { + "epoch": 0.07239431798733527, + "grad_norm": 15.19393539428711, + "learning_rate": 2.3844837421563033e-06, + "loss": 8.3063, + "step": 423 + }, + { + "epoch": 0.0725654629471162, + "grad_norm": 14.138923645019531, + "learning_rate": 2.3901882487164863e-06, + "loss": 8.3802, + "step": 424 + }, + { + "epoch": 0.07273660790689714, + "grad_norm": 23.83425521850586, + "learning_rate": 2.395892755276669e-06, + "loss": 9.554, + "step": 425 + }, + { + "epoch": 0.07290775286667808, + "grad_norm": 19.778850555419922, + "learning_rate": 2.401597261836851e-06, + "loss": 6.0866, + "step": 426 + }, + { + "epoch": 0.07307889782645902, + "grad_norm": 12.418360710144043, + "learning_rate": 2.407301768397034e-06, + "loss": 7.7723, + "step": 427 + }, + { + "epoch": 0.07325004278623995, + "grad_norm": 21.105587005615234, + "learning_rate": 2.4130062749572164e-06, + "loss": 6.011, + "step": 428 + }, + { + "epoch": 0.07342118774602088, + "grad_norm": 18.78055763244629, + "learning_rate": 2.4187107815173985e-06, + "loss": 6.4389, + "step": 429 + }, + { + "epoch": 0.07359233270580182, + "grad_norm": 17.227916717529297, + "learning_rate": 2.4244152880775814e-06, + "loss": 6.6973, + "step": 430 + }, + { + "epoch": 0.07376347766558275, + "grad_norm": 21.845876693725586, + "learning_rate": 2.430119794637764e-06, + "loss": 5.9158, + "step": 431 + }, + { + "epoch": 0.07393462262536368, + "grad_norm": 14.355096817016602, + "learning_rate": 2.435824301197946e-06, + "loss": 8.6576, + "step": 432 + }, + { + "epoch": 0.07410576758514462, + "grad_norm": 149.28054809570312, + "learning_rate": 2.441528807758129e-06, + "loss": 7.7649, + "step": 433 + }, + { + "epoch": 0.07427691254492555, + "grad_norm": 18.152389526367188, + "learning_rate": 2.4472333143183116e-06, + "loss": 6.6434, + "step": 434 + }, + { + "epoch": 0.07444805750470648, + "grad_norm": 17.05584716796875, + "learning_rate": 2.4529378208784937e-06, + "loss": 9.1462, + "step": 435 + }, + { + "epoch": 0.07461920246448742, + "grad_norm": 11.82278060913086, + "learning_rate": 2.4586423274386766e-06, + "loss": 8.2832, + "step": 436 + }, + { + "epoch": 0.07479034742426835, + "grad_norm": 17.951648712158203, + "learning_rate": 2.464346833998859e-06, + "loss": 8.4052, + "step": 437 + }, + { + "epoch": 0.07496149238404928, + "grad_norm": 31.258188247680664, + "learning_rate": 2.4700513405590417e-06, + "loss": 9.4477, + "step": 438 + }, + { + "epoch": 0.07513263734383023, + "grad_norm": 138.91761779785156, + "learning_rate": 2.4757558471192242e-06, + "loss": 8.3869, + "step": 439 + }, + { + "epoch": 0.07530378230361116, + "grad_norm": 17.930551528930664, + "learning_rate": 2.4814603536794068e-06, + "loss": 9.1768, + "step": 440 + }, + { + "epoch": 0.0754749272633921, + "grad_norm": 10.999883651733398, + "learning_rate": 2.4871648602395897e-06, + "loss": 4.1341, + "step": 441 + }, + { + "epoch": 0.07564607222317303, + "grad_norm": 19.707490921020508, + "learning_rate": 2.492869366799772e-06, + "loss": 6.0241, + "step": 442 + }, + { + "epoch": 0.07581721718295396, + "grad_norm": 19.63069725036621, + "learning_rate": 2.4985738733599544e-06, + "loss": 9.5659, + "step": 443 + }, + { + "epoch": 0.0759883621427349, + "grad_norm": 19.783658981323242, + "learning_rate": 2.5042783799201373e-06, + "loss": 6.632, + "step": 444 + }, + { + "epoch": 0.07615950710251583, + "grad_norm": 11.193924903869629, + "learning_rate": 2.5099828864803194e-06, + "loss": 4.213, + "step": 445 + }, + { + "epoch": 0.07633065206229676, + "grad_norm": 65.09992218017578, + "learning_rate": 2.515687393040502e-06, + "loss": 13.1721, + "step": 446 + }, + { + "epoch": 0.0765017970220777, + "grad_norm": 19.081214904785156, + "learning_rate": 2.521391899600685e-06, + "loss": 8.7605, + "step": 447 + }, + { + "epoch": 0.07667294198185863, + "grad_norm": 17.08602523803711, + "learning_rate": 2.527096406160867e-06, + "loss": 8.4352, + "step": 448 + }, + { + "epoch": 0.07684408694163956, + "grad_norm": 11.796391487121582, + "learning_rate": 2.5328009127210495e-06, + "loss": 7.9838, + "step": 449 + }, + { + "epoch": 0.0770152319014205, + "grad_norm": 17.306316375732422, + "learning_rate": 2.5385054192812325e-06, + "loss": 7.9123, + "step": 450 + }, + { + "epoch": 0.07718637686120144, + "grad_norm": 11.991724014282227, + "learning_rate": 2.5442099258414146e-06, + "loss": 7.904, + "step": 451 + }, + { + "epoch": 0.07735752182098238, + "grad_norm": 18.394563674926758, + "learning_rate": 2.549914432401597e-06, + "loss": 6.7541, + "step": 452 + }, + { + "epoch": 0.07752866678076331, + "grad_norm": 21.436811447143555, + "learning_rate": 2.55561893896178e-06, + "loss": 5.5488, + "step": 453 + }, + { + "epoch": 0.07769981174054424, + "grad_norm": 15.822162628173828, + "learning_rate": 2.561323445521962e-06, + "loss": 7.7392, + "step": 454 + }, + { + "epoch": 0.07787095670032518, + "grad_norm": 19.68645668029785, + "learning_rate": 2.5670279520821447e-06, + "loss": 6.6529, + "step": 455 + }, + { + "epoch": 0.07804210166010611, + "grad_norm": 18.808198928833008, + "learning_rate": 2.5727324586423277e-06, + "loss": 8.784, + "step": 456 + }, + { + "epoch": 0.07821324661988704, + "grad_norm": 131.1753692626953, + "learning_rate": 2.57843696520251e-06, + "loss": 7.8706, + "step": 457 + }, + { + "epoch": 0.07838439157966798, + "grad_norm": 11.708639144897461, + "learning_rate": 2.5841414717626923e-06, + "loss": 7.7402, + "step": 458 + }, + { + "epoch": 0.07855553653944891, + "grad_norm": 15.965631484985352, + "learning_rate": 2.5898459783228753e-06, + "loss": 8.301, + "step": 459 + }, + { + "epoch": 0.07872668149922984, + "grad_norm": 14.710309982299805, + "learning_rate": 2.5955504848830574e-06, + "loss": 7.9566, + "step": 460 + }, + { + "epoch": 0.07889782645901078, + "grad_norm": 15.00783634185791, + "learning_rate": 2.6012549914432404e-06, + "loss": 8.488, + "step": 461 + }, + { + "epoch": 0.07906897141879171, + "grad_norm": 13.231627464294434, + "learning_rate": 2.606959498003423e-06, + "loss": 8.1184, + "step": 462 + }, + { + "epoch": 0.07924011637857264, + "grad_norm": 170.4566192626953, + "learning_rate": 2.6126640045636054e-06, + "loss": 8.1805, + "step": 463 + }, + { + "epoch": 0.07941126133835359, + "grad_norm": 23.66990089416504, + "learning_rate": 2.618368511123788e-06, + "loss": 9.2852, + "step": 464 + }, + { + "epoch": 0.07958240629813453, + "grad_norm": 20.218496322631836, + "learning_rate": 2.6240730176839705e-06, + "loss": 6.264, + "step": 465 + }, + { + "epoch": 0.07975355125791546, + "grad_norm": 27.905323028564453, + "learning_rate": 2.629777524244153e-06, + "loss": 10.0002, + "step": 466 + }, + { + "epoch": 0.07992469621769639, + "grad_norm": 22.043649673461914, + "learning_rate": 2.6354820308043355e-06, + "loss": 8.6303, + "step": 467 + }, + { + "epoch": 0.08009584117747733, + "grad_norm": 20.095890045166016, + "learning_rate": 2.641186537364518e-06, + "loss": 8.7857, + "step": 468 + }, + { + "epoch": 0.08026698613725826, + "grad_norm": 30.715435028076172, + "learning_rate": 2.6468910439247006e-06, + "loss": 9.6486, + "step": 469 + }, + { + "epoch": 0.08043813109703919, + "grad_norm": 18.83611488342285, + "learning_rate": 2.652595550484883e-06, + "loss": 7.9544, + "step": 470 + }, + { + "epoch": 0.08060927605682013, + "grad_norm": 20.929931640625, + "learning_rate": 2.6583000570450657e-06, + "loss": 6.2772, + "step": 471 + }, + { + "epoch": 0.08078042101660106, + "grad_norm": 18.414594650268555, + "learning_rate": 2.664004563605248e-06, + "loss": 6.1477, + "step": 472 + }, + { + "epoch": 0.08095156597638199, + "grad_norm": 18.188846588134766, + "learning_rate": 2.6697090701654307e-06, + "loss": 7.099, + "step": 473 + }, + { + "epoch": 0.08112271093616293, + "grad_norm": 8.666217803955078, + "learning_rate": 2.6754135767256133e-06, + "loss": 5.0929, + "step": 474 + }, + { + "epoch": 0.08129385589594386, + "grad_norm": 15.457167625427246, + "learning_rate": 2.681118083285796e-06, + "loss": 7.8706, + "step": 475 + }, + { + "epoch": 0.08146500085572479, + "grad_norm": 17.11892318725586, + "learning_rate": 2.6868225898459783e-06, + "loss": 8.5293, + "step": 476 + }, + { + "epoch": 0.08163614581550574, + "grad_norm": 28.18759536743164, + "learning_rate": 2.692527096406161e-06, + "loss": 5.7448, + "step": 477 + }, + { + "epoch": 0.08180729077528667, + "grad_norm": 19.842830657958984, + "learning_rate": 2.6982316029663434e-06, + "loss": 8.5854, + "step": 478 + }, + { + "epoch": 0.0819784357350676, + "grad_norm": 59.76820373535156, + "learning_rate": 2.703936109526526e-06, + "loss": 12.4879, + "step": 479 + }, + { + "epoch": 0.08214958069484854, + "grad_norm": 15.530830383300781, + "learning_rate": 2.7096406160867085e-06, + "loss": 8.191, + "step": 480 + }, + { + "epoch": 0.08232072565462947, + "grad_norm": 21.211435317993164, + "learning_rate": 2.7153451226468914e-06, + "loss": 9.4326, + "step": 481 + }, + { + "epoch": 0.0824918706144104, + "grad_norm": 16.38536834716797, + "learning_rate": 2.7210496292070735e-06, + "loss": 6.3342, + "step": 482 + }, + { + "epoch": 0.08266301557419134, + "grad_norm": 30.17742919921875, + "learning_rate": 2.726754135767256e-06, + "loss": 5.5068, + "step": 483 + }, + { + "epoch": 0.08283416053397227, + "grad_norm": 27.44713020324707, + "learning_rate": 2.732458642327439e-06, + "loss": 9.4586, + "step": 484 + }, + { + "epoch": 0.0830053054937532, + "grad_norm": 59.46120071411133, + "learning_rate": 2.738163148887621e-06, + "loss": 12.3889, + "step": 485 + }, + { + "epoch": 0.08317645045353414, + "grad_norm": 26.801589965820312, + "learning_rate": 2.7438676554478036e-06, + "loss": 5.3141, + "step": 486 + }, + { + "epoch": 0.08334759541331507, + "grad_norm": 32.20411682128906, + "learning_rate": 2.7495721620079866e-06, + "loss": 5.4274, + "step": 487 + }, + { + "epoch": 0.083518740373096, + "grad_norm": 16.14412498474121, + "learning_rate": 2.755276668568169e-06, + "loss": 8.3447, + "step": 488 + }, + { + "epoch": 0.08368988533287694, + "grad_norm": 16.79600715637207, + "learning_rate": 2.7609811751283512e-06, + "loss": 7.7737, + "step": 489 + }, + { + "epoch": 0.08386103029265789, + "grad_norm": 171.59872436523438, + "learning_rate": 2.766685681688534e-06, + "loss": 8.277, + "step": 490 + }, + { + "epoch": 0.08403217525243882, + "grad_norm": 29.80289649963379, + "learning_rate": 2.7723901882487167e-06, + "loss": 5.273, + "step": 491 + }, + { + "epoch": 0.08420332021221975, + "grad_norm": 15.38176155090332, + "learning_rate": 2.778094694808899e-06, + "loss": 7.8611, + "step": 492 + }, + { + "epoch": 0.08437446517200069, + "grad_norm": 19.766082763671875, + "learning_rate": 2.783799201369082e-06, + "loss": 7.7926, + "step": 493 + }, + { + "epoch": 0.08454561013178162, + "grad_norm": 13.274962425231934, + "learning_rate": 2.7895037079292643e-06, + "loss": 4.1215, + "step": 494 + }, + { + "epoch": 0.08471675509156255, + "grad_norm": 29.015403747558594, + "learning_rate": 2.7952082144894464e-06, + "loss": 5.4146, + "step": 495 + }, + { + "epoch": 0.08488790005134349, + "grad_norm": 22.243703842163086, + "learning_rate": 2.8009127210496294e-06, + "loss": 5.753, + "step": 496 + }, + { + "epoch": 0.08505904501112442, + "grad_norm": 23.75475311279297, + "learning_rate": 2.806617227609812e-06, + "loss": 5.7119, + "step": 497 + }, + { + "epoch": 0.08523018997090535, + "grad_norm": 19.524032592773438, + "learning_rate": 2.812321734169994e-06, + "loss": 8.9719, + "step": 498 + }, + { + "epoch": 0.08540133493068629, + "grad_norm": 22.207155227661133, + "learning_rate": 2.818026240730177e-06, + "loss": 8.5433, + "step": 499 + }, + { + "epoch": 0.08557247989046722, + "grad_norm": 20.369564056396484, + "learning_rate": 2.8237307472903595e-06, + "loss": 9.2212, + "step": 500 + }, + { + "epoch": 0.08574362485024815, + "grad_norm": 12.617632865905762, + "learning_rate": 2.829435253850542e-06, + "loss": 7.5878, + "step": 501 + }, + { + "epoch": 0.0859147698100291, + "grad_norm": 16.92389678955078, + "learning_rate": 2.8351397604107246e-06, + "loss": 8.1394, + "step": 502 + }, + { + "epoch": 0.08608591476981003, + "grad_norm": 52.22781753540039, + "learning_rate": 2.840844266970907e-06, + "loss": 11.9304, + "step": 503 + }, + { + "epoch": 0.08625705972959097, + "grad_norm": 19.299196243286133, + "learning_rate": 2.8465487735310896e-06, + "loss": 7.5487, + "step": 504 + }, + { + "epoch": 0.0864282046893719, + "grad_norm": 25.007366180419922, + "learning_rate": 2.852253280091272e-06, + "loss": 6.4953, + "step": 505 + }, + { + "epoch": 0.08659934964915283, + "grad_norm": 44.58477020263672, + "learning_rate": 2.8579577866514547e-06, + "loss": 11.543, + "step": 506 + }, + { + "epoch": 0.08677049460893377, + "grad_norm": 18.95302963256836, + "learning_rate": 2.8636622932116372e-06, + "loss": 7.7713, + "step": 507 + }, + { + "epoch": 0.0869416395687147, + "grad_norm": 15.56648063659668, + "learning_rate": 2.8693667997718198e-06, + "loss": 8.5567, + "step": 508 + }, + { + "epoch": 0.08711278452849563, + "grad_norm": 20.78284454345703, + "learning_rate": 2.8750713063320023e-06, + "loss": 7.7135, + "step": 509 + }, + { + "epoch": 0.08728392948827657, + "grad_norm": 23.176607131958008, + "learning_rate": 2.880775812892185e-06, + "loss": 8.2685, + "step": 510 + }, + { + "epoch": 0.0874550744480575, + "grad_norm": 25.212718963623047, + "learning_rate": 2.8864803194523674e-06, + "loss": 8.9983, + "step": 511 + }, + { + "epoch": 0.08762621940783843, + "grad_norm": 27.220836639404297, + "learning_rate": 2.89218482601255e-06, + "loss": 6.6334, + "step": 512 + }, + { + "epoch": 0.08779736436761937, + "grad_norm": 13.128168106079102, + "learning_rate": 2.897889332572733e-06, + "loss": 3.847, + "step": 513 + }, + { + "epoch": 0.0879685093274003, + "grad_norm": 19.84160614013672, + "learning_rate": 2.903593839132915e-06, + "loss": 8.0045, + "step": 514 + }, + { + "epoch": 0.08813965428718125, + "grad_norm": 15.77076530456543, + "learning_rate": 2.9092983456930975e-06, + "loss": 7.8019, + "step": 515 + }, + { + "epoch": 0.08831079924696218, + "grad_norm": 158.41465759277344, + "learning_rate": 2.9150028522532804e-06, + "loss": 8.6448, + "step": 516 + }, + { + "epoch": 0.08848194420674312, + "grad_norm": 23.563339233398438, + "learning_rate": 2.9207073588134625e-06, + "loss": 8.8163, + "step": 517 + }, + { + "epoch": 0.08865308916652405, + "grad_norm": 30.82549476623535, + "learning_rate": 2.926411865373645e-06, + "loss": 8.627, + "step": 518 + }, + { + "epoch": 0.08882423412630498, + "grad_norm": 24.138612747192383, + "learning_rate": 2.932116371933828e-06, + "loss": 6.2112, + "step": 519 + }, + { + "epoch": 0.08899537908608592, + "grad_norm": 42.6961784362793, + "learning_rate": 2.93782087849401e-06, + "loss": 11.5101, + "step": 520 + }, + { + "epoch": 0.08916652404586685, + "grad_norm": 16.58330726623535, + "learning_rate": 2.943525385054193e-06, + "loss": 7.885, + "step": 521 + }, + { + "epoch": 0.08933766900564778, + "grad_norm": 17.490467071533203, + "learning_rate": 2.9492298916143756e-06, + "loss": 7.6631, + "step": 522 + }, + { + "epoch": 0.08950881396542872, + "grad_norm": 24.303665161132812, + "learning_rate": 2.9549343981745577e-06, + "loss": 8.5512, + "step": 523 + }, + { + "epoch": 0.08967995892520965, + "grad_norm": 14.5447416305542, + "learning_rate": 2.9606389047347407e-06, + "loss": 3.9844, + "step": 524 + }, + { + "epoch": 0.08985110388499058, + "grad_norm": 28.421756744384766, + "learning_rate": 2.9663434112949232e-06, + "loss": 9.2179, + "step": 525 + }, + { + "epoch": 0.09002224884477152, + "grad_norm": 20.097034454345703, + "learning_rate": 2.9720479178551053e-06, + "loss": 9.2422, + "step": 526 + }, + { + "epoch": 0.09019339380455245, + "grad_norm": 20.862869262695312, + "learning_rate": 2.9777524244152883e-06, + "loss": 8.2303, + "step": 527 + }, + { + "epoch": 0.0903645387643334, + "grad_norm": 30.980390548706055, + "learning_rate": 2.983456930975471e-06, + "loss": 9.1253, + "step": 528 + }, + { + "epoch": 0.09053568372411433, + "grad_norm": 29.973567962646484, + "learning_rate": 2.989161437535653e-06, + "loss": 4.8928, + "step": 529 + }, + { + "epoch": 0.09070682868389526, + "grad_norm": 35.399349212646484, + "learning_rate": 2.994865944095836e-06, + "loss": 4.5559, + "step": 530 + }, + { + "epoch": 0.0908779736436762, + "grad_norm": 21.178098678588867, + "learning_rate": 3.0005704506560184e-06, + "loss": 8.8876, + "step": 531 + }, + { + "epoch": 0.09104911860345713, + "grad_norm": 24.755205154418945, + "learning_rate": 3.0062749572162005e-06, + "loss": 7.5809, + "step": 532 + }, + { + "epoch": 0.09122026356323806, + "grad_norm": 23.76934051513672, + "learning_rate": 3.0119794637763835e-06, + "loss": 8.5706, + "step": 533 + }, + { + "epoch": 0.091391408523019, + "grad_norm": 40.431190490722656, + "learning_rate": 3.017683970336566e-06, + "loss": 11.3342, + "step": 534 + }, + { + "epoch": 0.09156255348279993, + "grad_norm": 22.674354553222656, + "learning_rate": 3.023388476896748e-06, + "loss": 8.8756, + "step": 535 + }, + { + "epoch": 0.09173369844258086, + "grad_norm": 33.92606735229492, + "learning_rate": 3.029092983456931e-06, + "loss": 4.738, + "step": 536 + }, + { + "epoch": 0.0919048434023618, + "grad_norm": 27.1170711517334, + "learning_rate": 3.0347974900171136e-06, + "loss": 6.4223, + "step": 537 + }, + { + "epoch": 0.09207598836214273, + "grad_norm": 25.11066246032715, + "learning_rate": 3.040501996577296e-06, + "loss": 9.0946, + "step": 538 + }, + { + "epoch": 0.09224713332192366, + "grad_norm": 23.894901275634766, + "learning_rate": 3.0462065031374787e-06, + "loss": 6.3395, + "step": 539 + }, + { + "epoch": 0.09241827828170461, + "grad_norm": 20.199861526489258, + "learning_rate": 3.051911009697661e-06, + "loss": 6.0524, + "step": 540 + }, + { + "epoch": 0.09258942324148554, + "grad_norm": 22.757362365722656, + "learning_rate": 3.057615516257844e-06, + "loss": 7.1293, + "step": 541 + }, + { + "epoch": 0.09276056820126648, + "grad_norm": 22.62543487548828, + "learning_rate": 3.0633200228180263e-06, + "loss": 7.5053, + "step": 542 + }, + { + "epoch": 0.09293171316104741, + "grad_norm": 16.598411560058594, + "learning_rate": 3.069024529378209e-06, + "loss": 7.8322, + "step": 543 + }, + { + "epoch": 0.09310285812082834, + "grad_norm": 20.656627655029297, + "learning_rate": 3.0747290359383917e-06, + "loss": 8.8013, + "step": 544 + }, + { + "epoch": 0.09327400308060928, + "grad_norm": 20.95423126220703, + "learning_rate": 3.080433542498574e-06, + "loss": 6.1923, + "step": 545 + }, + { + "epoch": 0.09344514804039021, + "grad_norm": 175.26722717285156, + "learning_rate": 3.0861380490587564e-06, + "loss": 10.2252, + "step": 546 + }, + { + "epoch": 0.09361629300017114, + "grad_norm": 21.737558364868164, + "learning_rate": 3.0918425556189393e-06, + "loss": 7.7486, + "step": 547 + }, + { + "epoch": 0.09378743795995208, + "grad_norm": 41.67558288574219, + "learning_rate": 3.0975470621791215e-06, + "loss": 11.1347, + "step": 548 + }, + { + "epoch": 0.09395858291973301, + "grad_norm": 24.20724868774414, + "learning_rate": 3.103251568739304e-06, + "loss": 8.1228, + "step": 549 + }, + { + "epoch": 0.09412972787951394, + "grad_norm": 23.995750427246094, + "learning_rate": 3.108956075299487e-06, + "loss": 8.1871, + "step": 550 + }, + { + "epoch": 0.09430087283929488, + "grad_norm": 18.58646583557129, + "learning_rate": 3.114660581859669e-06, + "loss": 7.4311, + "step": 551 + }, + { + "epoch": 0.09447201779907581, + "grad_norm": 26.01420021057129, + "learning_rate": 3.1203650884198516e-06, + "loss": 9.3426, + "step": 552 + }, + { + "epoch": 0.09464316275885676, + "grad_norm": 18.335588455200195, + "learning_rate": 3.1260695949800345e-06, + "loss": 8.9575, + "step": 553 + }, + { + "epoch": 0.09481430771863769, + "grad_norm": 21.414621353149414, + "learning_rate": 3.1317741015402166e-06, + "loss": 7.744, + "step": 554 + }, + { + "epoch": 0.09498545267841862, + "grad_norm": 15.28297233581543, + "learning_rate": 3.137478608100399e-06, + "loss": 8.0683, + "step": 555 + }, + { + "epoch": 0.09515659763819956, + "grad_norm": 20.182992935180664, + "learning_rate": 3.143183114660582e-06, + "loss": 7.5161, + "step": 556 + }, + { + "epoch": 0.09532774259798049, + "grad_norm": 22.94892120361328, + "learning_rate": 3.1488876212207642e-06, + "loss": 7.728, + "step": 557 + }, + { + "epoch": 0.09549888755776142, + "grad_norm": 16.93927764892578, + "learning_rate": 3.1545921277809468e-06, + "loss": 7.7731, + "step": 558 + }, + { + "epoch": 0.09567003251754236, + "grad_norm": 21.27535629272461, + "learning_rate": 3.1602966343411297e-06, + "loss": 8.7629, + "step": 559 + }, + { + "epoch": 0.09584117747732329, + "grad_norm": 20.056377410888672, + "learning_rate": 3.166001140901312e-06, + "loss": 7.3943, + "step": 560 + }, + { + "epoch": 0.09601232243710422, + "grad_norm": 37.84750747680664, + "learning_rate": 3.1717056474614948e-06, + "loss": 10.5613, + "step": 561 + }, + { + "epoch": 0.09618346739688516, + "grad_norm": 19.577177047729492, + "learning_rate": 3.1774101540216773e-06, + "loss": 7.6761, + "step": 562 + }, + { + "epoch": 0.09635461235666609, + "grad_norm": 22.209712982177734, + "learning_rate": 3.18311466058186e-06, + "loss": 6.9584, + "step": 563 + }, + { + "epoch": 0.09652575731644703, + "grad_norm": 25.258302688598633, + "learning_rate": 3.1888191671420424e-06, + "loss": 8.0565, + "step": 564 + }, + { + "epoch": 0.09669690227622796, + "grad_norm": 15.993329048156738, + "learning_rate": 3.194523673702225e-06, + "loss": 4.2519, + "step": 565 + }, + { + "epoch": 0.0968680472360089, + "grad_norm": 18.609046936035156, + "learning_rate": 3.2002281802624074e-06, + "loss": 8.0901, + "step": 566 + }, + { + "epoch": 0.09703919219578984, + "grad_norm": 39.24065017700195, + "learning_rate": 3.20593268682259e-06, + "loss": 4.2716, + "step": 567 + }, + { + "epoch": 0.09721033715557077, + "grad_norm": 158.3350067138672, + "learning_rate": 3.2116371933827725e-06, + "loss": 9.5332, + "step": 568 + }, + { + "epoch": 0.0973814821153517, + "grad_norm": 59.29450607299805, + "learning_rate": 3.217341699942955e-06, + "loss": 10.343, + "step": 569 + }, + { + "epoch": 0.09755262707513264, + "grad_norm": 16.113664627075195, + "learning_rate": 3.2230462065031376e-06, + "loss": 3.4944, + "step": 570 + }, + { + "epoch": 0.09772377203491357, + "grad_norm": 23.105350494384766, + "learning_rate": 3.22875071306332e-06, + "loss": 8.2848, + "step": 571 + }, + { + "epoch": 0.0978949169946945, + "grad_norm": 21.425796508789062, + "learning_rate": 3.2344552196235026e-06, + "loss": 8.7655, + "step": 572 + }, + { + "epoch": 0.09806606195447544, + "grad_norm": 22.587278366088867, + "learning_rate": 3.240159726183685e-06, + "loss": 5.394, + "step": 573 + }, + { + "epoch": 0.09823720691425637, + "grad_norm": 37.69017028808594, + "learning_rate": 3.2458642327438677e-06, + "loss": 9.9902, + "step": 574 + }, + { + "epoch": 0.0984083518740373, + "grad_norm": 25.255393981933594, + "learning_rate": 3.2515687393040502e-06, + "loss": 8.3951, + "step": 575 + }, + { + "epoch": 0.09857949683381824, + "grad_norm": 18.790040969848633, + "learning_rate": 3.2572732458642328e-06, + "loss": 8.2647, + "step": 576 + }, + { + "epoch": 0.09875064179359917, + "grad_norm": 18.215757369995117, + "learning_rate": 3.2629777524244153e-06, + "loss": 7.8027, + "step": 577 + }, + { + "epoch": 0.09892178675338012, + "grad_norm": 17.263294219970703, + "learning_rate": 3.268682258984598e-06, + "loss": 7.7728, + "step": 578 + }, + { + "epoch": 0.09909293171316105, + "grad_norm": 18.384496688842773, + "learning_rate": 3.2743867655447804e-06, + "loss": 7.9191, + "step": 579 + }, + { + "epoch": 0.09926407667294199, + "grad_norm": 32.68930435180664, + "learning_rate": 3.280091272104963e-06, + "loss": 7.8591, + "step": 580 + }, + { + "epoch": 0.09943522163272292, + "grad_norm": 31.514266967773438, + "learning_rate": 3.285795778665146e-06, + "loss": 5.5621, + "step": 581 + }, + { + "epoch": 0.09960636659250385, + "grad_norm": 18.912736892700195, + "learning_rate": 3.291500285225328e-06, + "loss": 7.6952, + "step": 582 + }, + { + "epoch": 0.09977751155228479, + "grad_norm": 37.68309783935547, + "learning_rate": 3.2972047917855105e-06, + "loss": 4.1392, + "step": 583 + }, + { + "epoch": 0.09994865651206572, + "grad_norm": 31.56082534790039, + "learning_rate": 3.3029092983456934e-06, + "loss": 4.3801, + "step": 584 + }, + { + "epoch": 0.10011980147184665, + "grad_norm": 24.57911491394043, + "learning_rate": 3.3086138049058755e-06, + "loss": 8.6316, + "step": 585 + }, + { + "epoch": 0.10029094643162759, + "grad_norm": 23.80208396911621, + "learning_rate": 3.314318311466058e-06, + "loss": 7.9077, + "step": 586 + }, + { + "epoch": 0.10046209139140852, + "grad_norm": 16.849803924560547, + "learning_rate": 3.320022818026241e-06, + "loss": 7.6992, + "step": 587 + }, + { + "epoch": 0.10063323635118945, + "grad_norm": 18.981300354003906, + "learning_rate": 3.3257273245864236e-06, + "loss": 8.6023, + "step": 588 + }, + { + "epoch": 0.10080438131097039, + "grad_norm": 24.398378372192383, + "learning_rate": 3.3314318311466057e-06, + "loss": 7.0319, + "step": 589 + }, + { + "epoch": 0.10097552627075132, + "grad_norm": 14.639533996582031, + "learning_rate": 3.3371363377067886e-06, + "loss": 4.7698, + "step": 590 + }, + { + "epoch": 0.10114667123053227, + "grad_norm": 25.046255111694336, + "learning_rate": 3.342840844266971e-06, + "loss": 7.1782, + "step": 591 + }, + { + "epoch": 0.1013178161903132, + "grad_norm": 20.012542724609375, + "learning_rate": 3.3485453508271533e-06, + "loss": 7.3167, + "step": 592 + }, + { + "epoch": 0.10148896115009413, + "grad_norm": 27.766891479492188, + "learning_rate": 3.3542498573873362e-06, + "loss": 4.6521, + "step": 593 + }, + { + "epoch": 0.10166010610987507, + "grad_norm": 30.79694175720215, + "learning_rate": 3.3599543639475188e-06, + "loss": 5.1196, + "step": 594 + }, + { + "epoch": 0.101831251069656, + "grad_norm": 24.9854736328125, + "learning_rate": 3.365658870507701e-06, + "loss": 7.8056, + "step": 595 + }, + { + "epoch": 0.10200239602943693, + "grad_norm": 30.63117218017578, + "learning_rate": 3.371363377067884e-06, + "loss": 4.7903, + "step": 596 + }, + { + "epoch": 0.10217354098921787, + "grad_norm": 34.852256774902344, + "learning_rate": 3.3770678836280663e-06, + "loss": 4.1994, + "step": 597 + }, + { + "epoch": 0.1023446859489988, + "grad_norm": 26.979557037353516, + "learning_rate": 3.3827723901882485e-06, + "loss": 8.5955, + "step": 598 + }, + { + "epoch": 0.10251583090877973, + "grad_norm": 21.797626495361328, + "learning_rate": 3.3884768967484314e-06, + "loss": 7.5743, + "step": 599 + }, + { + "epoch": 0.10268697586856067, + "grad_norm": 37.774139404296875, + "learning_rate": 3.394181403308614e-06, + "loss": 6.7935, + "step": 600 + }, + { + "epoch": 0.1028581208283416, + "grad_norm": 27.917823791503906, + "learning_rate": 3.3998859098687965e-06, + "loss": 7.9018, + "step": 601 + }, + { + "epoch": 0.10302926578812253, + "grad_norm": 28.479934692382812, + "learning_rate": 3.405590416428979e-06, + "loss": 4.566, + "step": 602 + }, + { + "epoch": 0.10320041074790347, + "grad_norm": 33.35675811767578, + "learning_rate": 3.4112949229891615e-06, + "loss": 4.1986, + "step": 603 + }, + { + "epoch": 0.10337155570768441, + "grad_norm": 17.17736053466797, + "learning_rate": 3.416999429549344e-06, + "loss": 4.4831, + "step": 604 + }, + { + "epoch": 0.10354270066746535, + "grad_norm": 33.52507781982422, + "learning_rate": 3.4227039361095266e-06, + "loss": 5.2033, + "step": 605 + }, + { + "epoch": 0.10371384562724628, + "grad_norm": 38.001678466796875, + "learning_rate": 3.428408442669709e-06, + "loss": 4.2796, + "step": 606 + }, + { + "epoch": 0.10388499058702722, + "grad_norm": 27.487375259399414, + "learning_rate": 3.4341129492298917e-06, + "loss": 4.3677, + "step": 607 + }, + { + "epoch": 0.10405613554680815, + "grad_norm": 43.33926010131836, + "learning_rate": 3.439817455790074e-06, + "loss": 8.1044, + "step": 608 + }, + { + "epoch": 0.10422728050658908, + "grad_norm": 19.231143951416016, + "learning_rate": 3.4455219623502567e-06, + "loss": 3.8195, + "step": 609 + }, + { + "epoch": 0.10439842546637002, + "grad_norm": 51.54021453857422, + "learning_rate": 3.4512264689104393e-06, + "loss": 8.4844, + "step": 610 + }, + { + "epoch": 0.10456957042615095, + "grad_norm": 18.752532958984375, + "learning_rate": 3.456930975470622e-06, + "loss": 4.7959, + "step": 611 + }, + { + "epoch": 0.10474071538593188, + "grad_norm": 31.644916534423828, + "learning_rate": 3.4626354820308043e-06, + "loss": 3.5838, + "step": 612 + }, + { + "epoch": 0.10491186034571282, + "grad_norm": 29.887203216552734, + "learning_rate": 3.4683399885909873e-06, + "loss": 8.1693, + "step": 613 + }, + { + "epoch": 0.10508300530549375, + "grad_norm": 26.583890914916992, + "learning_rate": 3.4740444951511694e-06, + "loss": 6.6237, + "step": 614 + }, + { + "epoch": 0.10525415026527468, + "grad_norm": 30.845338821411133, + "learning_rate": 3.479749001711352e-06, + "loss": 3.4824, + "step": 615 + }, + { + "epoch": 0.10542529522505562, + "grad_norm": 38.40910339355469, + "learning_rate": 3.485453508271535e-06, + "loss": 3.9889, + "step": 616 + }, + { + "epoch": 0.10559644018483656, + "grad_norm": 39.16193389892578, + "learning_rate": 3.491158014831717e-06, + "loss": 8.4244, + "step": 617 + }, + { + "epoch": 0.1057675851446175, + "grad_norm": 17.97920036315918, + "learning_rate": 3.4968625213918995e-06, + "loss": 3.2418, + "step": 618 + }, + { + "epoch": 0.10593873010439843, + "grad_norm": 176.26966857910156, + "learning_rate": 3.5025670279520825e-06, + "loss": 7.2639, + "step": 619 + }, + { + "epoch": 0.10610987506417936, + "grad_norm": 31.25491714477539, + "learning_rate": 3.5082715345122646e-06, + "loss": 7.428, + "step": 620 + }, + { + "epoch": 0.1062810200239603, + "grad_norm": 291.8013916015625, + "learning_rate": 3.5139760410724475e-06, + "loss": 12.9756, + "step": 621 + }, + { + "epoch": 0.10645216498374123, + "grad_norm": 34.713497161865234, + "learning_rate": 3.51968054763263e-06, + "loss": 4.2338, + "step": 622 + }, + { + "epoch": 0.10662330994352216, + "grad_norm": 30.151296615600586, + "learning_rate": 3.525385054192812e-06, + "loss": 8.7988, + "step": 623 + }, + { + "epoch": 0.1067944549033031, + "grad_norm": 36.128414154052734, + "learning_rate": 3.531089560752995e-06, + "loss": 8.0574, + "step": 624 + }, + { + "epoch": 0.10696559986308403, + "grad_norm": 15.85501480102539, + "learning_rate": 3.5367940673131777e-06, + "loss": 3.4627, + "step": 625 + }, + { + "epoch": 0.10713674482286496, + "grad_norm": 296.1280212402344, + "learning_rate": 3.5424985738733598e-06, + "loss": 11.8753, + "step": 626 + }, + { + "epoch": 0.1073078897826459, + "grad_norm": 30.480113983154297, + "learning_rate": 3.5482030804335427e-06, + "loss": 6.966, + "step": 627 + }, + { + "epoch": 0.10747903474242683, + "grad_norm": 34.42314529418945, + "learning_rate": 3.5539075869937253e-06, + "loss": 6.8004, + "step": 628 + }, + { + "epoch": 0.10765017970220778, + "grad_norm": 239.69007873535156, + "learning_rate": 3.5596120935539074e-06, + "loss": 10.727, + "step": 629 + }, + { + "epoch": 0.10782132466198871, + "grad_norm": 42.74559783935547, + "learning_rate": 3.5653166001140903e-06, + "loss": 7.948, + "step": 630 + }, + { + "epoch": 0.10799246962176964, + "grad_norm": 24.176240921020508, + "learning_rate": 3.571021106674273e-06, + "loss": 5.4348, + "step": 631 + }, + { + "epoch": 0.10816361458155058, + "grad_norm": 32.64130783081055, + "learning_rate": 3.576725613234455e-06, + "loss": 8.4, + "step": 632 + }, + { + "epoch": 0.10833475954133151, + "grad_norm": 32.354248046875, + "learning_rate": 3.582430119794638e-06, + "loss": 6.4397, + "step": 633 + }, + { + "epoch": 0.10850590450111244, + "grad_norm": 25.767475128173828, + "learning_rate": 3.5881346263548204e-06, + "loss": 5.7136, + "step": 634 + }, + { + "epoch": 0.10867704946089338, + "grad_norm": 28.90591812133789, + "learning_rate": 3.593839132915003e-06, + "loss": 5.9131, + "step": 635 + }, + { + "epoch": 0.10884819442067431, + "grad_norm": 32.62278747558594, + "learning_rate": 3.5995436394751855e-06, + "loss": 7.6547, + "step": 636 + }, + { + "epoch": 0.10901933938045524, + "grad_norm": 30.387760162353516, + "learning_rate": 3.605248146035368e-06, + "loss": 4.7063, + "step": 637 + }, + { + "epoch": 0.10919048434023618, + "grad_norm": 33.034420013427734, + "learning_rate": 3.6109526525955506e-06, + "loss": 6.8851, + "step": 638 + }, + { + "epoch": 0.10936162930001711, + "grad_norm": 31.42691421508789, + "learning_rate": 3.616657159155733e-06, + "loss": 4.1348, + "step": 639 + }, + { + "epoch": 0.10953277425979804, + "grad_norm": 32.439395904541016, + "learning_rate": 3.6223616657159156e-06, + "loss": 6.3162, + "step": 640 + }, + { + "epoch": 0.10970391921957898, + "grad_norm": 26.49324607849121, + "learning_rate": 3.6280661722760986e-06, + "loss": 5.1818, + "step": 641 + }, + { + "epoch": 0.10987506417935992, + "grad_norm": 25.558427810668945, + "learning_rate": 3.6337706788362807e-06, + "loss": 6.7631, + "step": 642 + }, + { + "epoch": 0.11004620913914086, + "grad_norm": 24.655729293823242, + "learning_rate": 3.6394751853964632e-06, + "loss": 7.4925, + "step": 643 + }, + { + "epoch": 0.11021735409892179, + "grad_norm": 28.129770278930664, + "learning_rate": 3.645179691956646e-06, + "loss": 7.4969, + "step": 644 + }, + { + "epoch": 0.11038849905870272, + "grad_norm": 14.367050170898438, + "learning_rate": 3.6508841985168283e-06, + "loss": 2.9942, + "step": 645 + }, + { + "epoch": 0.11055964401848366, + "grad_norm": 17.681976318359375, + "learning_rate": 3.656588705077011e-06, + "loss": 3.4156, + "step": 646 + }, + { + "epoch": 0.11073078897826459, + "grad_norm": 16.25703239440918, + "learning_rate": 3.6622932116371938e-06, + "loss": 4.095, + "step": 647 + }, + { + "epoch": 0.11090193393804552, + "grad_norm": 26.604623794555664, + "learning_rate": 3.667997718197376e-06, + "loss": 7.7521, + "step": 648 + }, + { + "epoch": 0.11107307889782646, + "grad_norm": 24.250492095947266, + "learning_rate": 3.6737022247575584e-06, + "loss": 5.5501, + "step": 649 + }, + { + "epoch": 0.11124422385760739, + "grad_norm": 31.94316864013672, + "learning_rate": 3.6794067313177414e-06, + "loss": 6.24, + "step": 650 + }, + { + "epoch": 0.11141536881738832, + "grad_norm": 18.14836883544922, + "learning_rate": 3.6851112378779235e-06, + "loss": 3.024, + "step": 651 + }, + { + "epoch": 0.11158651377716926, + "grad_norm": 25.239274978637695, + "learning_rate": 3.690815744438106e-06, + "loss": 7.4699, + "step": 652 + }, + { + "epoch": 0.11175765873695019, + "grad_norm": 66.97354125976562, + "learning_rate": 3.696520250998289e-06, + "loss": 11.3565, + "step": 653 + }, + { + "epoch": 0.11192880369673112, + "grad_norm": 30.029356002807617, + "learning_rate": 3.702224757558471e-06, + "loss": 7.0383, + "step": 654 + }, + { + "epoch": 0.11209994865651207, + "grad_norm": 22.021820068359375, + "learning_rate": 3.7079292641186536e-06, + "loss": 5.1655, + "step": 655 + }, + { + "epoch": 0.112271093616293, + "grad_norm": 40.79402160644531, + "learning_rate": 3.7136337706788366e-06, + "loss": 7.3738, + "step": 656 + }, + { + "epoch": 0.11244223857607394, + "grad_norm": 49.726810455322266, + "learning_rate": 3.7193382772390187e-06, + "loss": 10.1751, + "step": 657 + }, + { + "epoch": 0.11261338353585487, + "grad_norm": 34.322078704833984, + "learning_rate": 3.725042783799201e-06, + "loss": 8.127, + "step": 658 + }, + { + "epoch": 0.1127845284956358, + "grad_norm": 31.094890594482422, + "learning_rate": 3.730747290359384e-06, + "loss": 7.2578, + "step": 659 + }, + { + "epoch": 0.11295567345541674, + "grad_norm": 17.61489486694336, + "learning_rate": 3.7364517969195667e-06, + "loss": 2.9985, + "step": 660 + }, + { + "epoch": 0.11312681841519767, + "grad_norm": 31.467206954956055, + "learning_rate": 3.7421563034797492e-06, + "loss": 3.7049, + "step": 661 + }, + { + "epoch": 0.1132979633749786, + "grad_norm": 24.94162368774414, + "learning_rate": 3.7478608100399318e-06, + "loss": 5.2144, + "step": 662 + }, + { + "epoch": 0.11346910833475954, + "grad_norm": 61.16570281982422, + "learning_rate": 3.7535653166001143e-06, + "loss": 10.6124, + "step": 663 + }, + { + "epoch": 0.11364025329454047, + "grad_norm": 30.18357276916504, + "learning_rate": 3.7592698231602964e-06, + "loss": 7.2241, + "step": 664 + }, + { + "epoch": 0.1138113982543214, + "grad_norm": 40.68777847290039, + "learning_rate": 3.764974329720479e-06, + "loss": 7.604, + "step": 665 + }, + { + "epoch": 0.11398254321410234, + "grad_norm": 24.30128288269043, + "learning_rate": 3.7706788362806623e-06, + "loss": 4.6842, + "step": 666 + }, + { + "epoch": 0.11415368817388329, + "grad_norm": 33.77325439453125, + "learning_rate": 3.7763833428408444e-06, + "loss": 7.3903, + "step": 667 + }, + { + "epoch": 0.11432483313366422, + "grad_norm": 30.10031509399414, + "learning_rate": 3.782087849401027e-06, + "loss": 7.0533, + "step": 668 + }, + { + "epoch": 0.11449597809344515, + "grad_norm": 34.8586540222168, + "learning_rate": 3.7877923559612095e-06, + "loss": 7.5631, + "step": 669 + }, + { + "epoch": 0.11466712305322609, + "grad_norm": 33.20988082885742, + "learning_rate": 3.7934968625213916e-06, + "loss": 3.7029, + "step": 670 + }, + { + "epoch": 0.11483826801300702, + "grad_norm": 31.075176239013672, + "learning_rate": 3.799201369081575e-06, + "loss": 8.2182, + "step": 671 + }, + { + "epoch": 0.11500941297278795, + "grad_norm": 30.962139129638672, + "learning_rate": 3.8049058756417575e-06, + "loss": 6.5288, + "step": 672 + }, + { + "epoch": 0.11518055793256889, + "grad_norm": 37.01807403564453, + "learning_rate": 3.8106103822019396e-06, + "loss": 7.8449, + "step": 673 + }, + { + "epoch": 0.11535170289234982, + "grad_norm": 35.002742767333984, + "learning_rate": 3.816314888762122e-06, + "loss": 6.4509, + "step": 674 + }, + { + "epoch": 0.11552284785213075, + "grad_norm": 51.06761169433594, + "learning_rate": 3.822019395322305e-06, + "loss": 10.6236, + "step": 675 + }, + { + "epoch": 0.11569399281191169, + "grad_norm": 37.48448181152344, + "learning_rate": 3.827723901882487e-06, + "loss": 6.7785, + "step": 676 + }, + { + "epoch": 0.11586513777169262, + "grad_norm": 35.638832092285156, + "learning_rate": 3.8334284084426706e-06, + "loss": 7.6172, + "step": 677 + }, + { + "epoch": 0.11603628273147355, + "grad_norm": 35.00564956665039, + "learning_rate": 3.839132915002852e-06, + "loss": 7.1866, + "step": 678 + }, + { + "epoch": 0.11620742769125449, + "grad_norm": 31.42662811279297, + "learning_rate": 3.844837421563035e-06, + "loss": 3.2075, + "step": 679 + }, + { + "epoch": 0.11637857265103543, + "grad_norm": 16.111412048339844, + "learning_rate": 3.850541928123217e-06, + "loss": 4.0132, + "step": 680 + }, + { + "epoch": 0.11654971761081637, + "grad_norm": 29.7305850982666, + "learning_rate": 3.8562464346834e-06, + "loss": 7.1253, + "step": 681 + }, + { + "epoch": 0.1167208625705973, + "grad_norm": 28.033987045288086, + "learning_rate": 3.861950941243582e-06, + "loss": 3.1947, + "step": 682 + }, + { + "epoch": 0.11689200753037823, + "grad_norm": 31.460405349731445, + "learning_rate": 3.867655447803766e-06, + "loss": 7.1427, + "step": 683 + }, + { + "epoch": 0.11706315249015917, + "grad_norm": 269.6858825683594, + "learning_rate": 3.8733599543639474e-06, + "loss": 11.9947, + "step": 684 + }, + { + "epoch": 0.1172342974499401, + "grad_norm": 35.384727478027344, + "learning_rate": 3.87906446092413e-06, + "loss": 6.8334, + "step": 685 + }, + { + "epoch": 0.11740544240972103, + "grad_norm": 25.98334312438965, + "learning_rate": 3.8847689674843125e-06, + "loss": 7.2384, + "step": 686 + }, + { + "epoch": 0.11757658736950197, + "grad_norm": 33.84842300415039, + "learning_rate": 3.890473474044495e-06, + "loss": 5.9906, + "step": 687 + }, + { + "epoch": 0.1177477323292829, + "grad_norm": 41.04487609863281, + "learning_rate": 3.8961779806046776e-06, + "loss": 6.2759, + "step": 688 + }, + { + "epoch": 0.11791887728906383, + "grad_norm": 16.468915939331055, + "learning_rate": 3.901882487164861e-06, + "loss": 3.8545, + "step": 689 + }, + { + "epoch": 0.11809002224884477, + "grad_norm": 27.10782241821289, + "learning_rate": 3.907586993725043e-06, + "loss": 7.596, + "step": 690 + }, + { + "epoch": 0.1182611672086257, + "grad_norm": 25.684919357299805, + "learning_rate": 3.913291500285225e-06, + "loss": 4.2235, + "step": 691 + }, + { + "epoch": 0.11843231216840663, + "grad_norm": 27.2288761138916, + "learning_rate": 3.918996006845408e-06, + "loss": 6.9975, + "step": 692 + }, + { + "epoch": 0.11860345712818758, + "grad_norm": 33.26142120361328, + "learning_rate": 3.92470051340559e-06, + "loss": 6.5592, + "step": 693 + }, + { + "epoch": 0.11877460208796851, + "grad_norm": 38.89694595336914, + "learning_rate": 3.930405019965774e-06, + "loss": 7.2757, + "step": 694 + }, + { + "epoch": 0.11894574704774945, + "grad_norm": 27.99795150756836, + "learning_rate": 3.936109526525956e-06, + "loss": 6.7422, + "step": 695 + }, + { + "epoch": 0.11911689200753038, + "grad_norm": 24.109289169311523, + "learning_rate": 3.941814033086138e-06, + "loss": 4.9175, + "step": 696 + }, + { + "epoch": 0.11928803696731131, + "grad_norm": 15.462040901184082, + "learning_rate": 3.94751853964632e-06, + "loss": 2.7684, + "step": 697 + }, + { + "epoch": 0.11945918192709225, + "grad_norm": 39.17838668823242, + "learning_rate": 3.953223046206503e-06, + "loss": 7.1518, + "step": 698 + }, + { + "epoch": 0.11963032688687318, + "grad_norm": 30.83951759338379, + "learning_rate": 3.958927552766685e-06, + "loss": 3.8832, + "step": 699 + }, + { + "epoch": 0.11980147184665411, + "grad_norm": 26.964744567871094, + "learning_rate": 3.964632059326869e-06, + "loss": 7.224, + "step": 700 + }, + { + "epoch": 0.11997261680643505, + "grad_norm": 36.607975006103516, + "learning_rate": 3.970336565887051e-06, + "loss": 7.3389, + "step": 701 + }, + { + "epoch": 0.12014376176621598, + "grad_norm": 37.18532180786133, + "learning_rate": 3.976041072447234e-06, + "loss": 6.1083, + "step": 702 + }, + { + "epoch": 0.12031490672599691, + "grad_norm": 29.550649642944336, + "learning_rate": 3.9817455790074155e-06, + "loss": 5.1898, + "step": 703 + }, + { + "epoch": 0.12048605168577785, + "grad_norm": 24.146198272705078, + "learning_rate": 3.987450085567598e-06, + "loss": 4.6196, + "step": 704 + }, + { + "epoch": 0.1206571966455588, + "grad_norm": 25.126737594604492, + "learning_rate": 3.993154592127781e-06, + "loss": 2.7422, + "step": 705 + }, + { + "epoch": 0.12082834160533973, + "grad_norm": 18.79334259033203, + "learning_rate": 3.998859098687964e-06, + "loss": 2.6716, + "step": 706 + }, + { + "epoch": 0.12099948656512066, + "grad_norm": 33.249168395996094, + "learning_rate": 4.0045636052481465e-06, + "loss": 5.3053, + "step": 707 + }, + { + "epoch": 0.1211706315249016, + "grad_norm": 26.934682846069336, + "learning_rate": 4.010268111808329e-06, + "loss": 4.514, + "step": 708 + }, + { + "epoch": 0.12134177648468253, + "grad_norm": 44.88846206665039, + "learning_rate": 4.015972618368511e-06, + "loss": 6.4733, + "step": 709 + }, + { + "epoch": 0.12151292144446346, + "grad_norm": 41.93711471557617, + "learning_rate": 4.021677124928693e-06, + "loss": 7.4558, + "step": 710 + }, + { + "epoch": 0.1216840664042444, + "grad_norm": 41.59209060668945, + "learning_rate": 4.027381631488877e-06, + "loss": 7.0372, + "step": 711 + }, + { + "epoch": 0.12185521136402533, + "grad_norm": 41.47358703613281, + "learning_rate": 4.033086138049059e-06, + "loss": 7.2868, + "step": 712 + }, + { + "epoch": 0.12202635632380626, + "grad_norm": 41.380741119384766, + "learning_rate": 4.038790644609242e-06, + "loss": 7.6225, + "step": 713 + }, + { + "epoch": 0.1221975012835872, + "grad_norm": 40.343788146972656, + "learning_rate": 4.044495151169424e-06, + "loss": 7.2916, + "step": 714 + }, + { + "epoch": 0.12236864624336813, + "grad_norm": 30.69339370727539, + "learning_rate": 4.050199657729606e-06, + "loss": 5.8809, + "step": 715 + }, + { + "epoch": 0.12253979120314906, + "grad_norm": 25.84669303894043, + "learning_rate": 4.0559041642897885e-06, + "loss": 2.8596, + "step": 716 + }, + { + "epoch": 0.12271093616293, + "grad_norm": 37.5709114074707, + "learning_rate": 4.061608670849972e-06, + "loss": 6.5536, + "step": 717 + }, + { + "epoch": 0.12288208112271094, + "grad_norm": 44.87430191040039, + "learning_rate": 4.067313177410154e-06, + "loss": 7.8952, + "step": 718 + }, + { + "epoch": 0.12305322608249188, + "grad_norm": 29.630413055419922, + "learning_rate": 4.073017683970337e-06, + "loss": 7.2852, + "step": 719 + }, + { + "epoch": 0.12322437104227281, + "grad_norm": 38.17768096923828, + "learning_rate": 4.0787221905305194e-06, + "loss": 7.2025, + "step": 720 + }, + { + "epoch": 0.12339551600205374, + "grad_norm": 31.9378719329834, + "learning_rate": 4.084426697090702e-06, + "loss": 3.3231, + "step": 721 + }, + { + "epoch": 0.12356666096183468, + "grad_norm": 15.323390007019043, + "learning_rate": 4.090131203650884e-06, + "loss": 3.5441, + "step": 722 + }, + { + "epoch": 0.12373780592161561, + "grad_norm": 32.09744644165039, + "learning_rate": 4.095835710211067e-06, + "loss": 5.9737, + "step": 723 + }, + { + "epoch": 0.12390895088139654, + "grad_norm": 32.49777603149414, + "learning_rate": 4.1015402167712496e-06, + "loss": 5.6116, + "step": 724 + }, + { + "epoch": 0.12408009584117748, + "grad_norm": 32.568031311035156, + "learning_rate": 4.107244723331432e-06, + "loss": 6.8512, + "step": 725 + }, + { + "epoch": 0.12425124080095841, + "grad_norm": 27.68449592590332, + "learning_rate": 4.112949229891615e-06, + "loss": 3.8807, + "step": 726 + }, + { + "epoch": 0.12442238576073934, + "grad_norm": 28.595746994018555, + "learning_rate": 4.118653736451797e-06, + "loss": 2.7513, + "step": 727 + }, + { + "epoch": 0.12459353072052028, + "grad_norm": 40.44917678833008, + "learning_rate": 4.124358243011979e-06, + "loss": 6.9441, + "step": 728 + }, + { + "epoch": 0.12476467568030121, + "grad_norm": 34.75537872314453, + "learning_rate": 4.130062749572162e-06, + "loss": 6.2836, + "step": 729 + }, + { + "epoch": 0.12493582064008214, + "grad_norm": 32.49576950073242, + "learning_rate": 4.135767256132345e-06, + "loss": 3.6985, + "step": 730 + }, + { + "epoch": 0.1251069655998631, + "grad_norm": 33.09941482543945, + "learning_rate": 4.141471762692527e-06, + "loss": 6.414, + "step": 731 + }, + { + "epoch": 0.12527811055964402, + "grad_norm": 33.988101959228516, + "learning_rate": 4.14717626925271e-06, + "loss": 6.8846, + "step": 732 + }, + { + "epoch": 0.12544925551942496, + "grad_norm": 34.69337844848633, + "learning_rate": 4.152880775812892e-06, + "loss": 5.9908, + "step": 733 + }, + { + "epoch": 0.1256204004792059, + "grad_norm": 42.33815383911133, + "learning_rate": 4.158585282373075e-06, + "loss": 7.2186, + "step": 734 + }, + { + "epoch": 0.12579154543898682, + "grad_norm": 21.35869598388672, + "learning_rate": 4.164289788933257e-06, + "loss": 3.2239, + "step": 735 + }, + { + "epoch": 0.12596269039876776, + "grad_norm": 34.62517166137695, + "learning_rate": 4.16999429549344e-06, + "loss": 6.2926, + "step": 736 + }, + { + "epoch": 0.1261338353585487, + "grad_norm": 32.758544921875, + "learning_rate": 4.1756988020536225e-06, + "loss": 6.2203, + "step": 737 + }, + { + "epoch": 0.12630498031832962, + "grad_norm": 17.39285659790039, + "learning_rate": 4.181403308613805e-06, + "loss": 3.2563, + "step": 738 + }, + { + "epoch": 0.12647612527811056, + "grad_norm": 32.22175598144531, + "learning_rate": 4.1871078151739875e-06, + "loss": 5.3068, + "step": 739 + }, + { + "epoch": 0.1266472702378915, + "grad_norm": 38.13700485229492, + "learning_rate": 4.19281232173417e-06, + "loss": 7.6128, + "step": 740 + }, + { + "epoch": 0.12681841519767242, + "grad_norm": 35.74038314819336, + "learning_rate": 4.198516828294353e-06, + "loss": 6.6528, + "step": 741 + }, + { + "epoch": 0.12698956015745336, + "grad_norm": 12.027849197387695, + "learning_rate": 4.204221334854535e-06, + "loss": 2.255, + "step": 742 + }, + { + "epoch": 0.1271607051172343, + "grad_norm": 36.75061798095703, + "learning_rate": 4.209925841414718e-06, + "loss": 6.2444, + "step": 743 + }, + { + "epoch": 0.12733185007701522, + "grad_norm": 43.853187561035156, + "learning_rate": 4.2156303479749e-06, + "loss": 6.3509, + "step": 744 + }, + { + "epoch": 0.12750299503679616, + "grad_norm": 31.670143127441406, + "learning_rate": 4.221334854535083e-06, + "loss": 7.3242, + "step": 745 + }, + { + "epoch": 0.1276741399965771, + "grad_norm": 24.049455642700195, + "learning_rate": 4.227039361095265e-06, + "loss": 4.5557, + "step": 746 + }, + { + "epoch": 0.12784528495635802, + "grad_norm": 22.603431701660156, + "learning_rate": 4.232743867655448e-06, + "loss": 4.3686, + "step": 747 + }, + { + "epoch": 0.12801642991613896, + "grad_norm": 33.28196716308594, + "learning_rate": 4.23844837421563e-06, + "loss": 7.897, + "step": 748 + }, + { + "epoch": 0.1281875748759199, + "grad_norm": 14.154582023620605, + "learning_rate": 4.244152880775813e-06, + "loss": 2.5184, + "step": 749 + }, + { + "epoch": 0.12835871983570085, + "grad_norm": 34.31758117675781, + "learning_rate": 4.249857387335995e-06, + "loss": 6.4663, + "step": 750 + }, + { + "epoch": 0.12852986479548179, + "grad_norm": 29.4487361907959, + "learning_rate": 4.255561893896179e-06, + "loss": 6.4908, + "step": 751 + }, + { + "epoch": 0.12870100975526272, + "grad_norm": 26.144145965576172, + "learning_rate": 4.261266400456361e-06, + "loss": 2.7209, + "step": 752 + }, + { + "epoch": 0.12887215471504365, + "grad_norm": 32.20002746582031, + "learning_rate": 4.266970907016543e-06, + "loss": 5.3474, + "step": 753 + }, + { + "epoch": 0.12904329967482459, + "grad_norm": 22.889114379882812, + "learning_rate": 4.2726754135767255e-06, + "loss": 4.439, + "step": 754 + }, + { + "epoch": 0.12921444463460552, + "grad_norm": 29.033794403076172, + "learning_rate": 4.278379920136908e-06, + "loss": 3.1768, + "step": 755 + }, + { + "epoch": 0.12938558959438645, + "grad_norm": 36.977718353271484, + "learning_rate": 4.2840844266970906e-06, + "loss": 6.725, + "step": 756 + }, + { + "epoch": 0.12955673455416739, + "grad_norm": 24.76682472229004, + "learning_rate": 4.289788933257274e-06, + "loss": 2.3437, + "step": 757 + }, + { + "epoch": 0.12972787951394832, + "grad_norm": 16.016826629638672, + "learning_rate": 4.2954934398174565e-06, + "loss": 2.8201, + "step": 758 + }, + { + "epoch": 0.12989902447372925, + "grad_norm": 14.587915420532227, + "learning_rate": 4.301197946377638e-06, + "loss": 3.5146, + "step": 759 + }, + { + "epoch": 0.13007016943351019, + "grad_norm": 26.081321716308594, + "learning_rate": 4.306902452937821e-06, + "loss": 2.6582, + "step": 760 + }, + { + "epoch": 0.13024131439329112, + "grad_norm": 16.497404098510742, + "learning_rate": 4.312606959498003e-06, + "loss": 2.6039, + "step": 761 + }, + { + "epoch": 0.13041245935307205, + "grad_norm": 30.642013549804688, + "learning_rate": 4.318311466058186e-06, + "loss": 5.6791, + "step": 762 + }, + { + "epoch": 0.13058360431285299, + "grad_norm": 78.80982971191406, + "learning_rate": 4.324015972618369e-06, + "loss": 7.0474, + "step": 763 + }, + { + "epoch": 0.13075474927263392, + "grad_norm": 31.678878784179688, + "learning_rate": 4.329720479178552e-06, + "loss": 7.2185, + "step": 764 + }, + { + "epoch": 0.13092589423241485, + "grad_norm": 67.14193725585938, + "learning_rate": 4.335424985738733e-06, + "loss": 11.1752, + "step": 765 + }, + { + "epoch": 0.13109703919219579, + "grad_norm": 30.7507381439209, + "learning_rate": 4.341129492298916e-06, + "loss": 5.3445, + "step": 766 + }, + { + "epoch": 0.13126818415197672, + "grad_norm": 295.94195556640625, + "learning_rate": 4.346833998859098e-06, + "loss": 14.8463, + "step": 767 + }, + { + "epoch": 0.13143932911175765, + "grad_norm": 31.96709442138672, + "learning_rate": 4.352538505419281e-06, + "loss": 5.9573, + "step": 768 + }, + { + "epoch": 0.13161047407153859, + "grad_norm": 21.086137771606445, + "learning_rate": 4.358243011979464e-06, + "loss": 2.51, + "step": 769 + }, + { + "epoch": 0.13178161903131952, + "grad_norm": 23.69211196899414, + "learning_rate": 4.363947518539647e-06, + "loss": 3.9384, + "step": 770 + }, + { + "epoch": 0.13195276399110045, + "grad_norm": 29.09503173828125, + "learning_rate": 4.369652025099829e-06, + "loss": 2.6477, + "step": 771 + }, + { + "epoch": 0.13212390895088139, + "grad_norm": 34.086483001708984, + "learning_rate": 4.375356531660011e-06, + "loss": 6.8362, + "step": 772 + }, + { + "epoch": 0.13229505391066232, + "grad_norm": 22.358131408691406, + "learning_rate": 4.381061038220194e-06, + "loss": 2.5553, + "step": 773 + }, + { + "epoch": 0.13246619887044325, + "grad_norm": 32.83020782470703, + "learning_rate": 4.386765544780377e-06, + "loss": 5.6526, + "step": 774 + }, + { + "epoch": 0.1326373438302242, + "grad_norm": 32.111629486083984, + "learning_rate": 4.3924700513405595e-06, + "loss": 7.0077, + "step": 775 + }, + { + "epoch": 0.13280848879000515, + "grad_norm": 28.587032318115234, + "learning_rate": 4.398174557900742e-06, + "loss": 3.9449, + "step": 776 + }, + { + "epoch": 0.13297963374978608, + "grad_norm": 28.547178268432617, + "learning_rate": 4.403879064460925e-06, + "loss": 5.1286, + "step": 777 + }, + { + "epoch": 0.133150778709567, + "grad_norm": 31.409543991088867, + "learning_rate": 4.409583571021106e-06, + "loss": 5.3343, + "step": 778 + }, + { + "epoch": 0.13332192366934795, + "grad_norm": 33.33236312866211, + "learning_rate": 4.415288077581289e-06, + "loss": 6.5061, + "step": 779 + }, + { + "epoch": 0.13349306862912888, + "grad_norm": 226.51580810546875, + "learning_rate": 4.420992584141472e-06, + "loss": 13.5725, + "step": 780 + }, + { + "epoch": 0.1336642135889098, + "grad_norm": 29.707599639892578, + "learning_rate": 4.426697090701655e-06, + "loss": 3.5054, + "step": 781 + }, + { + "epoch": 0.13383535854869075, + "grad_norm": 29.84592628479004, + "learning_rate": 4.432401597261837e-06, + "loss": 5.5461, + "step": 782 + }, + { + "epoch": 0.13400650350847168, + "grad_norm": 23.87710189819336, + "learning_rate": 4.43810610382202e-06, + "loss": 2.7581, + "step": 783 + }, + { + "epoch": 0.1341776484682526, + "grad_norm": 27.90047264099121, + "learning_rate": 4.4438106103822015e-06, + "loss": 5.0602, + "step": 784 + }, + { + "epoch": 0.13434879342803355, + "grad_norm": 229.59202575683594, + "learning_rate": 4.449515116942384e-06, + "loss": 8.4299, + "step": 785 + }, + { + "epoch": 0.13451993838781448, + "grad_norm": 35.904483795166016, + "learning_rate": 4.455219623502567e-06, + "loss": 6.6261, + "step": 786 + }, + { + "epoch": 0.13469108334759541, + "grad_norm": 13.451172828674316, + "learning_rate": 4.46092413006275e-06, + "loss": 3.6844, + "step": 787 + }, + { + "epoch": 0.13486222830737635, + "grad_norm": 220.5408172607422, + "learning_rate": 4.4666286366229324e-06, + "loss": 12.0786, + "step": 788 + }, + { + "epoch": 0.13503337326715728, + "grad_norm": 30.378768920898438, + "learning_rate": 4.472333143183115e-06, + "loss": 6.4301, + "step": 789 + }, + { + "epoch": 0.13520451822693821, + "grad_norm": 24.894784927368164, + "learning_rate": 4.478037649743297e-06, + "loss": 4.0456, + "step": 790 + }, + { + "epoch": 0.13537566318671915, + "grad_norm": 63.11225509643555, + "learning_rate": 4.48374215630348e-06, + "loss": 10.8823, + "step": 791 + }, + { + "epoch": 0.13554680814650008, + "grad_norm": 30.484046936035156, + "learning_rate": 4.4894466628636626e-06, + "loss": 4.5215, + "step": 792 + }, + { + "epoch": 0.13571795310628101, + "grad_norm": 33.15967559814453, + "learning_rate": 4.495151169423845e-06, + "loss": 6.1466, + "step": 793 + }, + { + "epoch": 0.13588909806606195, + "grad_norm": 31.415679931640625, + "learning_rate": 4.500855675984028e-06, + "loss": 5.0997, + "step": 794 + }, + { + "epoch": 0.13606024302584288, + "grad_norm": 29.878276824951172, + "learning_rate": 4.50656018254421e-06, + "loss": 6.9375, + "step": 795 + }, + { + "epoch": 0.13623138798562381, + "grad_norm": 33.10092544555664, + "learning_rate": 4.512264689104393e-06, + "loss": 6.2231, + "step": 796 + }, + { + "epoch": 0.13640253294540475, + "grad_norm": 21.412826538085938, + "learning_rate": 4.517969195664575e-06, + "loss": 1.8474, + "step": 797 + }, + { + "epoch": 0.13657367790518568, + "grad_norm": 31.297100067138672, + "learning_rate": 4.523673702224758e-06, + "loss": 5.4762, + "step": 798 + }, + { + "epoch": 0.13674482286496661, + "grad_norm": 234.58111572265625, + "learning_rate": 4.52937820878494e-06, + "loss": 11.739, + "step": 799 + }, + { + "epoch": 0.13691596782474755, + "grad_norm": 204.88748168945312, + "learning_rate": 4.535082715345123e-06, + "loss": 13.5482, + "step": 800 + }, + { + "epoch": 0.1370871127845285, + "grad_norm": 33.66855239868164, + "learning_rate": 4.540787221905305e-06, + "loss": 5.9551, + "step": 801 + }, + { + "epoch": 0.13725825774430944, + "grad_norm": 30.423555374145508, + "learning_rate": 4.546491728465488e-06, + "loss": 6.3931, + "step": 802 + }, + { + "epoch": 0.13742940270409038, + "grad_norm": 30.737445831298828, + "learning_rate": 4.55219623502567e-06, + "loss": 4.7871, + "step": 803 + }, + { + "epoch": 0.1376005476638713, + "grad_norm": 95.86985778808594, + "learning_rate": 4.557900741585853e-06, + "loss": 6.8129, + "step": 804 + }, + { + "epoch": 0.13777169262365224, + "grad_norm": 36.5138053894043, + "learning_rate": 4.5636052481460355e-06, + "loss": 6.4333, + "step": 805 + }, + { + "epoch": 0.13794283758343318, + "grad_norm": 31.310596466064453, + "learning_rate": 4.569309754706218e-06, + "loss": 6.1982, + "step": 806 + }, + { + "epoch": 0.1381139825432141, + "grad_norm": 32.40011978149414, + "learning_rate": 4.5750142612664005e-06, + "loss": 6.5281, + "step": 807 + }, + { + "epoch": 0.13828512750299504, + "grad_norm": 33.58089828491211, + "learning_rate": 4.580718767826583e-06, + "loss": 5.0059, + "step": 808 + }, + { + "epoch": 0.13845627246277598, + "grad_norm": 46.53955841064453, + "learning_rate": 4.586423274386766e-06, + "loss": 10.3345, + "step": 809 + }, + { + "epoch": 0.1386274174225569, + "grad_norm": 23.006080627441406, + "learning_rate": 4.592127780946948e-06, + "loss": 2.1468, + "step": 810 + }, + { + "epoch": 0.13879856238233784, + "grad_norm": 21.113685607910156, + "learning_rate": 4.597832287507131e-06, + "loss": 2.0972, + "step": 811 + }, + { + "epoch": 0.13896970734211878, + "grad_norm": 29.228193283081055, + "learning_rate": 4.603536794067313e-06, + "loss": 2.9408, + "step": 812 + }, + { + "epoch": 0.1391408523018997, + "grad_norm": 39.542686462402344, + "learning_rate": 4.609241300627496e-06, + "loss": 6.4624, + "step": 813 + }, + { + "epoch": 0.13931199726168064, + "grad_norm": 42.17389678955078, + "learning_rate": 4.614945807187679e-06, + "loss": 7.4244, + "step": 814 + }, + { + "epoch": 0.13948314222146158, + "grad_norm": 31.26105308532715, + "learning_rate": 4.620650313747861e-06, + "loss": 6.5606, + "step": 815 + }, + { + "epoch": 0.1396542871812425, + "grad_norm": 40.22693634033203, + "learning_rate": 4.626354820308043e-06, + "loss": 6.2725, + "step": 816 + }, + { + "epoch": 0.13982543214102344, + "grad_norm": 25.14350700378418, + "learning_rate": 4.632059326868226e-06, + "loss": 4.0754, + "step": 817 + }, + { + "epoch": 0.13999657710080438, + "grad_norm": 23.578937530517578, + "learning_rate": 4.637763833428408e-06, + "loss": 4.1309, + "step": 818 + }, + { + "epoch": 0.1401677220605853, + "grad_norm": 37.57481002807617, + "learning_rate": 4.643468339988591e-06, + "loss": 5.8135, + "step": 819 + }, + { + "epoch": 0.14033886702036624, + "grad_norm": 35.21710205078125, + "learning_rate": 4.649172846548774e-06, + "loss": 6.6982, + "step": 820 + }, + { + "epoch": 0.14051001198014718, + "grad_norm": 14.915112495422363, + "learning_rate": 4.654877353108957e-06, + "loss": 2.1068, + "step": 821 + }, + { + "epoch": 0.1406811569399281, + "grad_norm": 27.366252899169922, + "learning_rate": 4.6605818596691385e-06, + "loss": 3.2475, + "step": 822 + }, + { + "epoch": 0.14085230189970904, + "grad_norm": 36.40489196777344, + "learning_rate": 4.666286366229321e-06, + "loss": 6.7448, + "step": 823 + }, + { + "epoch": 0.14102344685948998, + "grad_norm": 37.40996551513672, + "learning_rate": 4.6719908727895036e-06, + "loss": 6.8328, + "step": 824 + }, + { + "epoch": 0.1411945918192709, + "grad_norm": 255.09320068359375, + "learning_rate": 4.677695379349686e-06, + "loss": 12.0992, + "step": 825 + }, + { + "epoch": 0.14136573677905187, + "grad_norm": 41.39365768432617, + "learning_rate": 4.6833998859098695e-06, + "loss": 6.1908, + "step": 826 + }, + { + "epoch": 0.1415368817388328, + "grad_norm": 14.086997032165527, + "learning_rate": 4.689104392470052e-06, + "loss": 3.3856, + "step": 827 + }, + { + "epoch": 0.14170802669861374, + "grad_norm": 33.170352935791016, + "learning_rate": 4.694808899030234e-06, + "loss": 6.9479, + "step": 828 + }, + { + "epoch": 0.14187917165839467, + "grad_norm": 37.625064849853516, + "learning_rate": 4.700513405590416e-06, + "loss": 7.6713, + "step": 829 + }, + { + "epoch": 0.1420503166181756, + "grad_norm": 25.476303100585938, + "learning_rate": 4.706217912150599e-06, + "loss": 4.2481, + "step": 830 + }, + { + "epoch": 0.14222146157795654, + "grad_norm": 27.399072647094727, + "learning_rate": 4.711922418710781e-06, + "loss": 5.508, + "step": 831 + }, + { + "epoch": 0.14239260653773747, + "grad_norm": 31.020893096923828, + "learning_rate": 4.717626925270965e-06, + "loss": 5.8831, + "step": 832 + }, + { + "epoch": 0.1425637514975184, + "grad_norm": 26.108135223388672, + "learning_rate": 4.723331431831147e-06, + "loss": 3.5932, + "step": 833 + }, + { + "epoch": 0.14273489645729934, + "grad_norm": 35.8662109375, + "learning_rate": 4.729035938391329e-06, + "loss": 5.1499, + "step": 834 + }, + { + "epoch": 0.14290604141708027, + "grad_norm": 34.714324951171875, + "learning_rate": 4.734740444951511e-06, + "loss": 5.9969, + "step": 835 + }, + { + "epoch": 0.1430771863768612, + "grad_norm": 34.023067474365234, + "learning_rate": 4.740444951511694e-06, + "loss": 6.4575, + "step": 836 + }, + { + "epoch": 0.14324833133664214, + "grad_norm": 17.601118087768555, + "learning_rate": 4.746149458071877e-06, + "loss": 2.5208, + "step": 837 + }, + { + "epoch": 0.14341947629642307, + "grad_norm": 19.672815322875977, + "learning_rate": 4.75185396463206e-06, + "loss": 2.1216, + "step": 838 + }, + { + "epoch": 0.143590621256204, + "grad_norm": 25.771137237548828, + "learning_rate": 4.757558471192242e-06, + "loss": 2.6146, + "step": 839 + }, + { + "epoch": 0.14376176621598494, + "grad_norm": 32.17550277709961, + "learning_rate": 4.763262977752424e-06, + "loss": 5.8516, + "step": 840 + }, + { + "epoch": 0.14393291117576587, + "grad_norm": 72.34523010253906, + "learning_rate": 4.768967484312607e-06, + "loss": 11.0212, + "step": 841 + }, + { + "epoch": 0.1441040561355468, + "grad_norm": 22.756717681884766, + "learning_rate": 4.774671990872789e-06, + "loss": 2.3166, + "step": 842 + }, + { + "epoch": 0.14427520109532774, + "grad_norm": 22.13291358947754, + "learning_rate": 4.7803764974329725e-06, + "loss": 2.2079, + "step": 843 + }, + { + "epoch": 0.14444634605510867, + "grad_norm": 65.32748413085938, + "learning_rate": 4.786081003993155e-06, + "loss": 6.3309, + "step": 844 + }, + { + "epoch": 0.1446174910148896, + "grad_norm": 242.9714813232422, + "learning_rate": 4.791785510553338e-06, + "loss": 11.9379, + "step": 845 + }, + { + "epoch": 0.14478863597467054, + "grad_norm": 21.737802505493164, + "learning_rate": 4.79749001711352e-06, + "loss": 3.806, + "step": 846 + }, + { + "epoch": 0.14495978093445147, + "grad_norm": 29.438758850097656, + "learning_rate": 4.803194523673702e-06, + "loss": 5.7729, + "step": 847 + }, + { + "epoch": 0.1451309258942324, + "grad_norm": 25.701087951660156, + "learning_rate": 4.808899030233884e-06, + "loss": 2.8536, + "step": 848 + }, + { + "epoch": 0.14530207085401334, + "grad_norm": 130.01524353027344, + "learning_rate": 4.814603536794068e-06, + "loss": 7.5391, + "step": 849 + }, + { + "epoch": 0.14547321581379427, + "grad_norm": 30.284828186035156, + "learning_rate": 4.82030804335425e-06, + "loss": 3.806, + "step": 850 + }, + { + "epoch": 0.1456443607735752, + "grad_norm": 23.351642608642578, + "learning_rate": 4.826012549914433e-06, + "loss": 4.2263, + "step": 851 + }, + { + "epoch": 0.14581550573335617, + "grad_norm": 216.2431182861328, + "learning_rate": 4.831717056474615e-06, + "loss": 9.159, + "step": 852 + }, + { + "epoch": 0.1459866506931371, + "grad_norm": 35.071754455566406, + "learning_rate": 4.837421563034797e-06, + "loss": 6.503, + "step": 853 + }, + { + "epoch": 0.14615779565291803, + "grad_norm": 34.0211296081543, + "learning_rate": 4.84312606959498e-06, + "loss": 6.4636, + "step": 854 + }, + { + "epoch": 0.14632894061269897, + "grad_norm": 17.20896339416504, + "learning_rate": 4.848830576155163e-06, + "loss": 2.7218, + "step": 855 + }, + { + "epoch": 0.1465000855724799, + "grad_norm": 136.72647094726562, + "learning_rate": 4.8545350827153454e-06, + "loss": 7.7082, + "step": 856 + }, + { + "epoch": 0.14667123053226083, + "grad_norm": 53.50956344604492, + "learning_rate": 4.860239589275528e-06, + "loss": 10.0171, + "step": 857 + }, + { + "epoch": 0.14684237549204177, + "grad_norm": 21.030473709106445, + "learning_rate": 4.8659440958357105e-06, + "loss": 4.1916, + "step": 858 + }, + { + "epoch": 0.1470135204518227, + "grad_norm": 34.38727569580078, + "learning_rate": 4.871648602395892e-06, + "loss": 5.969, + "step": 859 + }, + { + "epoch": 0.14718466541160363, + "grad_norm": 22.703882217407227, + "learning_rate": 4.8773531089560756e-06, + "loss": 2.4073, + "step": 860 + }, + { + "epoch": 0.14735581037138457, + "grad_norm": 33.388858795166016, + "learning_rate": 4.883057615516258e-06, + "loss": 5.7571, + "step": 861 + }, + { + "epoch": 0.1475269553311655, + "grad_norm": 35.79853820800781, + "learning_rate": 4.888762122076441e-06, + "loss": 5.9363, + "step": 862 + }, + { + "epoch": 0.14769810029094643, + "grad_norm": 20.656721115112305, + "learning_rate": 4.894466628636623e-06, + "loss": 2.0406, + "step": 863 + }, + { + "epoch": 0.14786924525072737, + "grad_norm": 35.20976638793945, + "learning_rate": 4.900171135196806e-06, + "loss": 5.8613, + "step": 864 + }, + { + "epoch": 0.1480403902105083, + "grad_norm": 22.342880249023438, + "learning_rate": 4.905875641756987e-06, + "loss": 4.0119, + "step": 865 + }, + { + "epoch": 0.14821153517028923, + "grad_norm": 33.253292083740234, + "learning_rate": 4.911580148317171e-06, + "loss": 4.62, + "step": 866 + }, + { + "epoch": 0.14838268013007017, + "grad_norm": 186.65093994140625, + "learning_rate": 4.917284654877353e-06, + "loss": 11.2662, + "step": 867 + }, + { + "epoch": 0.1485538250898511, + "grad_norm": 15.842426300048828, + "learning_rate": 4.922989161437536e-06, + "loss": 2.0607, + "step": 868 + }, + { + "epoch": 0.14872497004963203, + "grad_norm": 26.70699119567871, + "learning_rate": 4.928693667997718e-06, + "loss": 3.1737, + "step": 869 + }, + { + "epoch": 0.14889611500941297, + "grad_norm": 33.37158966064453, + "learning_rate": 4.934398174557901e-06, + "loss": 4.7352, + "step": 870 + }, + { + "epoch": 0.1490672599691939, + "grad_norm": 26.4490966796875, + "learning_rate": 4.940102681118083e-06, + "loss": 4.2178, + "step": 871 + }, + { + "epoch": 0.14923840492897483, + "grad_norm": 33.25678634643555, + "learning_rate": 4.945807187678266e-06, + "loss": 5.0764, + "step": 872 + }, + { + "epoch": 0.14940954988875577, + "grad_norm": 38.204769134521484, + "learning_rate": 4.9515116942384485e-06, + "loss": 5.8078, + "step": 873 + }, + { + "epoch": 0.1495806948485367, + "grad_norm": 27.79875946044922, + "learning_rate": 4.957216200798631e-06, + "loss": 5.6432, + "step": 874 + }, + { + "epoch": 0.14975183980831763, + "grad_norm": 32.442115783691406, + "learning_rate": 4.9629207073588135e-06, + "loss": 5.7378, + "step": 875 + }, + { + "epoch": 0.14992298476809857, + "grad_norm": 57.06877517700195, + "learning_rate": 4.968625213918996e-06, + "loss": 10.3136, + "step": 876 + }, + { + "epoch": 0.15009412972787953, + "grad_norm": 32.131187438964844, + "learning_rate": 4.9743297204791794e-06, + "loss": 4.6921, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_nli-pairs_loss": 5.535374164581299, + "eval_nli-pairs_runtime": 4.3709, + "eval_nli-pairs_samples_per_second": 45.757, + "eval_nli-pairs_steps_per_second": 1.601, + "eval_sts-test_pearson_cosine": 0.6147169012893178, + "eval_sts-test_pearson_dot": 0.4334302941897573, + "eval_sts-test_pearson_euclidean": 0.6082490673246602, + "eval_sts-test_pearson_manhattan": 0.616700428941834, + "eval_sts-test_pearson_max": 0.616700428941834, + "eval_sts-test_spearman_cosine": 0.5972327557562241, + "eval_sts-test_spearman_dot": 0.41946207508864325, + "eval_sts-test_spearman_euclidean": 0.5959187544369754, + "eval_sts-test_spearman_manhattan": 0.6029031731511296, + "eval_sts-test_spearman_max": 0.6029031731511296, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_vitaminc-pairs_loss": 3.619838237762451, + "eval_vitaminc-pairs_runtime": 2.7372, + "eval_vitaminc-pairs_samples_per_second": 73.068, + "eval_vitaminc-pairs_steps_per_second": 2.557, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_qnli-contrastive_loss": 12.3779878616333, + "eval_qnli-contrastive_runtime": 0.6382, + "eval_qnli-contrastive_samples_per_second": 313.373, + "eval_qnli-contrastive_steps_per_second": 10.968, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_scitail-pairs-qa_loss": 1.6706750392913818, + "eval_scitail-pairs-qa_runtime": 1.6279, + "eval_scitail-pairs-qa_samples_per_second": 122.855, + "eval_scitail-pairs-qa_steps_per_second": 4.3, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_scitail-pairs-pos_loss": 3.0242857933044434, + "eval_scitail-pairs-pos_runtime": 2.6188, + "eval_scitail-pairs-pos_samples_per_second": 76.369, + "eval_scitail-pairs-pos_steps_per_second": 2.673, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_xsum-pairs_loss": 3.0581634044647217, + "eval_xsum-pairs_runtime": 2.6458, + "eval_xsum-pairs_samples_per_second": 66.142, + "eval_xsum-pairs_steps_per_second": 2.268, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_compression-pairs_loss": 1.9685934782028198, + "eval_compression-pairs_runtime": 0.5084, + "eval_compression-pairs_samples_per_second": 393.398, + "eval_compression-pairs_steps_per_second": 13.769, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_sciq_pairs_loss": 6.824851989746094, + "eval_sciq_pairs_runtime": 9.1685, + "eval_sciq_pairs_samples_per_second": 21.814, + "eval_sciq_pairs_steps_per_second": 0.763, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_qasc_pairs_loss": 10.253314018249512, + "eval_qasc_pairs_runtime": 2.6538, + "eval_qasc_pairs_samples_per_second": 75.363, + "eval_qasc_pairs_steps_per_second": 2.638, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_openbookqa_pairs_loss": 5.933743953704834, + "eval_openbookqa_pairs_runtime": 0.6418, + "eval_openbookqa_pairs_samples_per_second": 107.513, + "eval_openbookqa_pairs_steps_per_second": 4.674, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_msmarco_pairs_loss": 5.185385704040527, + "eval_msmarco_pairs_runtime": 3.9947, + "eval_msmarco_pairs_samples_per_second": 50.067, + "eval_msmarco_pairs_steps_per_second": 1.752, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_nq_pairs_loss": 6.44993782043457, + "eval_nq_pairs_runtime": 8.638, + "eval_nq_pairs_samples_per_second": 23.153, + "eval_nq_pairs_steps_per_second": 0.81, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_trivia_pairs_loss": 6.129721641540527, + "eval_trivia_pairs_runtime": 12.8296, + "eval_trivia_pairs_samples_per_second": 15.589, + "eval_trivia_pairs_steps_per_second": 0.546, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_quora_pairs_loss": 1.7218067646026611, + "eval_quora_pairs_runtime": 1.5931, + "eval_quora_pairs_samples_per_second": 125.544, + "eval_quora_pairs_steps_per_second": 4.394, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_gooaq_pairs_loss": 4.168159008026123, + "eval_gooaq_pairs_runtime": 2.6679, + "eval_gooaq_pairs_samples_per_second": 74.966, + "eval_gooaq_pairs_steps_per_second": 2.624, + "step": 877 + }, + { + "epoch": 0.15026527468766046, + "grad_norm": 29.085119247436523, + "learning_rate": 4.980034227039361e-06, + "loss": 5.8249, + "step": 878 + }, + { + "epoch": 0.1504364196474414, + "grad_norm": 35.45232009887695, + "learning_rate": 4.985738733599544e-06, + "loss": 6.378, + "step": 879 + }, + { + "epoch": 0.15060756460722233, + "grad_norm": 34.018470764160156, + "learning_rate": 4.991443240159726e-06, + "loss": 5.326, + "step": 880 + }, + { + "epoch": 0.15077870956700326, + "grad_norm": 22.30814552307129, + "learning_rate": 4.997147746719909e-06, + "loss": 2.6674, + "step": 881 + }, + { + "epoch": 0.1509498545267842, + "grad_norm": 36.679046630859375, + "learning_rate": 5.002852253280091e-06, + "loss": 6.6655, + "step": 882 + }, + { + "epoch": 0.15112099948656513, + "grad_norm": 36.78900146484375, + "learning_rate": 5.008556759840275e-06, + "loss": 4.5851, + "step": 883 + }, + { + "epoch": 0.15129214444634606, + "grad_norm": 46.770057678222656, + "learning_rate": 5.014261266400456e-06, + "loss": 9.9308, + "step": 884 + }, + { + "epoch": 0.151463289406127, + "grad_norm": 27.262338638305664, + "learning_rate": 5.019965772960639e-06, + "loss": 2.2515, + "step": 885 + }, + { + "epoch": 0.15163443436590793, + "grad_norm": 193.24122619628906, + "learning_rate": 5.025670279520821e-06, + "loss": 10.7631, + "step": 886 + }, + { + "epoch": 0.15180557932568886, + "grad_norm": 30.53336524963379, + "learning_rate": 5.031374786081004e-06, + "loss": 3.9297, + "step": 887 + }, + { + "epoch": 0.1519767242854698, + "grad_norm": 13.035544395446777, + "learning_rate": 5.0370792926411864e-06, + "loss": 3.16, + "step": 888 + }, + { + "epoch": 0.15214786924525073, + "grad_norm": 27.65202522277832, + "learning_rate": 5.04278379920137e-06, + "loss": 3.1012, + "step": 889 + }, + { + "epoch": 0.15231901420503166, + "grad_norm": 28.412954330444336, + "learning_rate": 5.0484883057615515e-06, + "loss": 2.4251, + "step": 890 + }, + { + "epoch": 0.1524901591648126, + "grad_norm": 35.567386627197266, + "learning_rate": 5.054192812321734e-06, + "loss": 5.1793, + "step": 891 + }, + { + "epoch": 0.15266130412459353, + "grad_norm": 31.945302963256836, + "learning_rate": 5.0598973188819166e-06, + "loss": 4.9138, + "step": 892 + }, + { + "epoch": 0.15283244908437446, + "grad_norm": 30.31682014465332, + "learning_rate": 5.065601825442099e-06, + "loss": 4.8582, + "step": 893 + }, + { + "epoch": 0.1530035940441554, + "grad_norm": 22.3225040435791, + "learning_rate": 5.0713063320022825e-06, + "loss": 2.003, + "step": 894 + }, + { + "epoch": 0.15317473900393633, + "grad_norm": 23.375139236450195, + "learning_rate": 5.077010838562465e-06, + "loss": 2.3547, + "step": 895 + }, + { + "epoch": 0.15334588396371726, + "grad_norm": 32.41263198852539, + "learning_rate": 5.0827153451226475e-06, + "loss": 6.2287, + "step": 896 + }, + { + "epoch": 0.1535170289234982, + "grad_norm": 20.43022346496582, + "learning_rate": 5.088419851682829e-06, + "loss": 2.1189, + "step": 897 + }, + { + "epoch": 0.15368817388327913, + "grad_norm": 37.203250885009766, + "learning_rate": 5.094124358243012e-06, + "loss": 6.3629, + "step": 898 + }, + { + "epoch": 0.15385931884306006, + "grad_norm": 19.725624084472656, + "learning_rate": 5.099828864803194e-06, + "loss": 2.2277, + "step": 899 + }, + { + "epoch": 0.154030463802841, + "grad_norm": 27.29782485961914, + "learning_rate": 5.105533371363378e-06, + "loss": 2.8851, + "step": 900 + }, + { + "epoch": 0.15420160876262193, + "grad_norm": 172.8111572265625, + "learning_rate": 5.11123787792356e-06, + "loss": 9.9783, + "step": 901 + }, + { + "epoch": 0.1543727537224029, + "grad_norm": 56.5546875, + "learning_rate": 5.116942384483743e-06, + "loss": 10.3301, + "step": 902 + }, + { + "epoch": 0.15454389868218382, + "grad_norm": 32.12007522583008, + "learning_rate": 5.122646891043924e-06, + "loss": 3.3146, + "step": 903 + }, + { + "epoch": 0.15471504364196476, + "grad_norm": 197.39170837402344, + "learning_rate": 5.128351397604107e-06, + "loss": 11.016, + "step": 904 + }, + { + "epoch": 0.1548861886017457, + "grad_norm": 36.48847579956055, + "learning_rate": 5.1340559041642895e-06, + "loss": 4.8215, + "step": 905 + }, + { + "epoch": 0.15505733356152662, + "grad_norm": 31.014644622802734, + "learning_rate": 5.139760410724473e-06, + "loss": 4.7237, + "step": 906 + }, + { + "epoch": 0.15522847852130756, + "grad_norm": 31.436952590942383, + "learning_rate": 5.145464917284655e-06, + "loss": 4.6175, + "step": 907 + }, + { + "epoch": 0.1553996234810885, + "grad_norm": 27.38591194152832, + "learning_rate": 5.151169423844838e-06, + "loss": 4.0958, + "step": 908 + }, + { + "epoch": 0.15557076844086942, + "grad_norm": 31.732324600219727, + "learning_rate": 5.15687393040502e-06, + "loss": 4.4682, + "step": 909 + }, + { + "epoch": 0.15574191340065036, + "grad_norm": 15.360635757446289, + "learning_rate": 5.162578436965202e-06, + "loss": 2.4148, + "step": 910 + }, + { + "epoch": 0.1559130583604313, + "grad_norm": 172.3378448486328, + "learning_rate": 5.168282943525385e-06, + "loss": 9.8466, + "step": 911 + }, + { + "epoch": 0.15608420332021222, + "grad_norm": 31.59737777709961, + "learning_rate": 5.173987450085568e-06, + "loss": 6.1221, + "step": 912 + }, + { + "epoch": 0.15625534827999316, + "grad_norm": 20.06523323059082, + "learning_rate": 5.179691956645751e-06, + "loss": 2.0035, + "step": 913 + }, + { + "epoch": 0.1564264932397741, + "grad_norm": 25.82581329345703, + "learning_rate": 5.185396463205933e-06, + "loss": 4.7388, + "step": 914 + }, + { + "epoch": 0.15659763819955502, + "grad_norm": 13.644715309143066, + "learning_rate": 5.191100969766115e-06, + "loss": 2.1442, + "step": 915 + }, + { + "epoch": 0.15676878315933596, + "grad_norm": 36.4990119934082, + "learning_rate": 5.196805476326297e-06, + "loss": 6.2552, + "step": 916 + }, + { + "epoch": 0.1569399281191169, + "grad_norm": 35.6190185546875, + "learning_rate": 5.202509982886481e-06, + "loss": 6.3529, + "step": 917 + }, + { + "epoch": 0.15711107307889782, + "grad_norm": 13.495047569274902, + "learning_rate": 5.208214489446663e-06, + "loss": 3.5731, + "step": 918 + }, + { + "epoch": 0.15728221803867876, + "grad_norm": 236.7681121826172, + "learning_rate": 5.213918996006846e-06, + "loss": 10.5726, + "step": 919 + }, + { + "epoch": 0.1574533629984597, + "grad_norm": 34.39946746826172, + "learning_rate": 5.219623502567028e-06, + "loss": 6.0673, + "step": 920 + }, + { + "epoch": 0.15762450795824062, + "grad_norm": 12.590995788574219, + "learning_rate": 5.225328009127211e-06, + "loss": 2.77, + "step": 921 + }, + { + "epoch": 0.15779565291802156, + "grad_norm": 31.968891143798828, + "learning_rate": 5.2310325156873925e-06, + "loss": 4.1677, + "step": 922 + }, + { + "epoch": 0.1579667978778025, + "grad_norm": 31.067489624023438, + "learning_rate": 5.236737022247576e-06, + "loss": 4.716, + "step": 923 + }, + { + "epoch": 0.15813794283758342, + "grad_norm": 36.08390808105469, + "learning_rate": 5.2424415288077584e-06, + "loss": 6.528, + "step": 924 + }, + { + "epoch": 0.15830908779736436, + "grad_norm": 34.2723274230957, + "learning_rate": 5.248146035367941e-06, + "loss": 6.4655, + "step": 925 + }, + { + "epoch": 0.1584802327571453, + "grad_norm": 43.43145751953125, + "learning_rate": 5.2538505419281235e-06, + "loss": 5.6795, + "step": 926 + }, + { + "epoch": 0.15865137771692622, + "grad_norm": 32.78499221801758, + "learning_rate": 5.259555048488306e-06, + "loss": 5.6396, + "step": 927 + }, + { + "epoch": 0.15882252267670718, + "grad_norm": 35.156925201416016, + "learning_rate": 5.265259555048488e-06, + "loss": 4.7143, + "step": 928 + }, + { + "epoch": 0.15899366763648812, + "grad_norm": 34.6341552734375, + "learning_rate": 5.270964061608671e-06, + "loss": 5.6931, + "step": 929 + }, + { + "epoch": 0.15916481259626905, + "grad_norm": 35.668331146240234, + "learning_rate": 5.276668568168854e-06, + "loss": 5.6404, + "step": 930 + }, + { + "epoch": 0.15933595755604998, + "grad_norm": 34.62514877319336, + "learning_rate": 5.282373074729036e-06, + "loss": 5.0469, + "step": 931 + }, + { + "epoch": 0.15950710251583092, + "grad_norm": 37.79499435424805, + "learning_rate": 5.288077581289219e-06, + "loss": 5.3761, + "step": 932 + }, + { + "epoch": 0.15967824747561185, + "grad_norm": 40.4017333984375, + "learning_rate": 5.293782087849401e-06, + "loss": 5.6738, + "step": 933 + }, + { + "epoch": 0.15984939243539278, + "grad_norm": 35.31856155395508, + "learning_rate": 5.299486594409584e-06, + "loss": 6.4936, + "step": 934 + }, + { + "epoch": 0.16002053739517372, + "grad_norm": 126.11963653564453, + "learning_rate": 5.305191100969766e-06, + "loss": 9.9326, + "step": 935 + }, + { + "epoch": 0.16019168235495465, + "grad_norm": 34.740753173828125, + "learning_rate": 5.310895607529949e-06, + "loss": 2.0987, + "step": 936 + }, + { + "epoch": 0.16036282731473558, + "grad_norm": 34.9671745300293, + "learning_rate": 5.316600114090131e-06, + "loss": 6.2338, + "step": 937 + }, + { + "epoch": 0.16053397227451652, + "grad_norm": 21.198925018310547, + "learning_rate": 5.322304620650314e-06, + "loss": 3.5463, + "step": 938 + }, + { + "epoch": 0.16070511723429745, + "grad_norm": 30.98229217529297, + "learning_rate": 5.328009127210496e-06, + "loss": 4.7342, + "step": 939 + }, + { + "epoch": 0.16087626219407838, + "grad_norm": 41.88993835449219, + "learning_rate": 5.333713633770679e-06, + "loss": 6.5058, + "step": 940 + }, + { + "epoch": 0.16104740715385932, + "grad_norm": 24.218576431274414, + "learning_rate": 5.3394181403308615e-06, + "loss": 2.0172, + "step": 941 + }, + { + "epoch": 0.16121855211364025, + "grad_norm": 32.891719818115234, + "learning_rate": 5.345122646891044e-06, + "loss": 5.893, + "step": 942 + }, + { + "epoch": 0.16138969707342118, + "grad_norm": 38.93867874145508, + "learning_rate": 5.3508271534512265e-06, + "loss": 5.8157, + "step": 943 + }, + { + "epoch": 0.16156084203320212, + "grad_norm": 31.02938461303711, + "learning_rate": 5.356531660011409e-06, + "loss": 5.529, + "step": 944 + }, + { + "epoch": 0.16173198699298305, + "grad_norm": 36.240440368652344, + "learning_rate": 5.362236166571592e-06, + "loss": 4.7931, + "step": 945 + }, + { + "epoch": 0.16190313195276398, + "grad_norm": 23.227556228637695, + "learning_rate": 5.367940673131775e-06, + "loss": 2.1265, + "step": 946 + }, + { + "epoch": 0.16207427691254492, + "grad_norm": 40.07374954223633, + "learning_rate": 5.373645179691957e-06, + "loss": 5.8823, + "step": 947 + }, + { + "epoch": 0.16224542187232585, + "grad_norm": 29.960735321044922, + "learning_rate": 5.379349686252139e-06, + "loss": 4.6281, + "step": 948 + }, + { + "epoch": 0.16241656683210678, + "grad_norm": 173.5910186767578, + "learning_rate": 5.385054192812322e-06, + "loss": 10.3282, + "step": 949 + }, + { + "epoch": 0.16258771179188772, + "grad_norm": 37.48442840576172, + "learning_rate": 5.390758699372504e-06, + "loss": 6.1584, + "step": 950 + }, + { + "epoch": 0.16275885675166865, + "grad_norm": 39.48939514160156, + "learning_rate": 5.396463205932687e-06, + "loss": 5.655, + "step": 951 + }, + { + "epoch": 0.16293000171144958, + "grad_norm": 34.57015609741211, + "learning_rate": 5.40216771249287e-06, + "loss": 5.4251, + "step": 952 + }, + { + "epoch": 0.16310114667123055, + "grad_norm": 51.02991485595703, + "learning_rate": 5.407872219053052e-06, + "loss": 10.2283, + "step": 953 + }, + { + "epoch": 0.16327229163101148, + "grad_norm": 31.77302360534668, + "learning_rate": 5.413576725613234e-06, + "loss": 4.0174, + "step": 954 + }, + { + "epoch": 0.1634434365907924, + "grad_norm": 31.242929458618164, + "learning_rate": 5.419281232173417e-06, + "loss": 5.5883, + "step": 955 + }, + { + "epoch": 0.16361458155057335, + "grad_norm": 31.789701461791992, + "learning_rate": 5.4249857387335994e-06, + "loss": 4.5646, + "step": 956 + }, + { + "epoch": 0.16378572651035428, + "grad_norm": 34.09980392456055, + "learning_rate": 5.430690245293783e-06, + "loss": 4.9872, + "step": 957 + }, + { + "epoch": 0.1639568714701352, + "grad_norm": 31.57735252380371, + "learning_rate": 5.436394751853965e-06, + "loss": 5.158, + "step": 958 + }, + { + "epoch": 0.16412801642991615, + "grad_norm": 32.941917419433594, + "learning_rate": 5.442099258414147e-06, + "loss": 5.4497, + "step": 959 + }, + { + "epoch": 0.16429916138969708, + "grad_norm": 200.919921875, + "learning_rate": 5.4478037649743296e-06, + "loss": 9.7888, + "step": 960 + }, + { + "epoch": 0.164470306349478, + "grad_norm": 28.78856658935547, + "learning_rate": 5.453508271534512e-06, + "loss": 5.0757, + "step": 961 + }, + { + "epoch": 0.16464145130925895, + "grad_norm": 22.877927780151367, + "learning_rate": 5.459212778094695e-06, + "loss": 3.6177, + "step": 962 + }, + { + "epoch": 0.16481259626903988, + "grad_norm": 24.904977798461914, + "learning_rate": 5.464917284654878e-06, + "loss": 4.2287, + "step": 963 + }, + { + "epoch": 0.1649837412288208, + "grad_norm": 35.849124908447266, + "learning_rate": 5.4706217912150605e-06, + "loss": 5.1121, + "step": 964 + }, + { + "epoch": 0.16515488618860175, + "grad_norm": 31.580976486206055, + "learning_rate": 5.476326297775242e-06, + "loss": 4.4859, + "step": 965 + }, + { + "epoch": 0.16532603114838268, + "grad_norm": 30.3056697845459, + "learning_rate": 5.482030804335425e-06, + "loss": 4.5076, + "step": 966 + }, + { + "epoch": 0.1654971761081636, + "grad_norm": 34.674468994140625, + "learning_rate": 5.487735310895607e-06, + "loss": 5.7789, + "step": 967 + }, + { + "epoch": 0.16566832106794455, + "grad_norm": 28.0445556640625, + "learning_rate": 5.49343981745579e-06, + "loss": 2.7613, + "step": 968 + }, + { + "epoch": 0.16583946602772548, + "grad_norm": 33.28575134277344, + "learning_rate": 5.499144324015973e-06, + "loss": 5.1032, + "step": 969 + }, + { + "epoch": 0.1660106109875064, + "grad_norm": 35.53700637817383, + "learning_rate": 5.504848830576156e-06, + "loss": 5.2129, + "step": 970 + }, + { + "epoch": 0.16618175594728735, + "grad_norm": 33.2183952331543, + "learning_rate": 5.510553337136338e-06, + "loss": 5.6908, + "step": 971 + }, + { + "epoch": 0.16635290090706828, + "grad_norm": 30.640926361083984, + "learning_rate": 5.51625784369652e-06, + "loss": 4.4325, + "step": 972 + }, + { + "epoch": 0.1665240458668492, + "grad_norm": 24.672338485717773, + "learning_rate": 5.5219623502567025e-06, + "loss": 3.9552, + "step": 973 + }, + { + "epoch": 0.16669519082663015, + "grad_norm": 33.66337585449219, + "learning_rate": 5.527666856816886e-06, + "loss": 5.4014, + "step": 974 + }, + { + "epoch": 0.16686633578641108, + "grad_norm": 32.082942962646484, + "learning_rate": 5.533371363377068e-06, + "loss": 5.9258, + "step": 975 + }, + { + "epoch": 0.167037480746192, + "grad_norm": 37.91094970703125, + "learning_rate": 5.539075869937251e-06, + "loss": 5.717, + "step": 976 + }, + { + "epoch": 0.16720862570597295, + "grad_norm": 20.26280975341797, + "learning_rate": 5.5447803764974335e-06, + "loss": 2.2263, + "step": 977 + }, + { + "epoch": 0.16737977066575388, + "grad_norm": 48.14308547973633, + "learning_rate": 5.550484883057615e-06, + "loss": 9.6938, + "step": 978 + }, + { + "epoch": 0.16755091562553484, + "grad_norm": 22.81192970275879, + "learning_rate": 5.556189389617798e-06, + "loss": 3.7015, + "step": 979 + }, + { + "epoch": 0.16772206058531577, + "grad_norm": 27.474571228027344, + "learning_rate": 5.561893896177981e-06, + "loss": 2.9404, + "step": 980 + }, + { + "epoch": 0.1678932055450967, + "grad_norm": 25.376007080078125, + "learning_rate": 5.567598402738164e-06, + "loss": 2.3926, + "step": 981 + }, + { + "epoch": 0.16806435050487764, + "grad_norm": 31.575468063354492, + "learning_rate": 5.573302909298346e-06, + "loss": 4.7349, + "step": 982 + }, + { + "epoch": 0.16823549546465857, + "grad_norm": 194.93817138671875, + "learning_rate": 5.579007415858529e-06, + "loss": 9.7172, + "step": 983 + }, + { + "epoch": 0.1684066404244395, + "grad_norm": 31.26558494567871, + "learning_rate": 5.58471192241871e-06, + "loss": 3.9837, + "step": 984 + }, + { + "epoch": 0.16857778538422044, + "grad_norm": 32.1373405456543, + "learning_rate": 5.590416428978893e-06, + "loss": 5.0026, + "step": 985 + }, + { + "epoch": 0.16874893034400137, + "grad_norm": 37.07416915893555, + "learning_rate": 5.596120935539076e-06, + "loss": 5.8572, + "step": 986 + }, + { + "epoch": 0.1689200753037823, + "grad_norm": 35.09983825683594, + "learning_rate": 5.601825442099259e-06, + "loss": 5.6302, + "step": 987 + }, + { + "epoch": 0.16909122026356324, + "grad_norm": 46.96855926513672, + "learning_rate": 5.607529948659441e-06, + "loss": 9.6255, + "step": 988 + }, + { + "epoch": 0.16926236522334417, + "grad_norm": 36.15262985229492, + "learning_rate": 5.613234455219624e-06, + "loss": 5.5484, + "step": 989 + }, + { + "epoch": 0.1694335101831251, + "grad_norm": 33.642967224121094, + "learning_rate": 5.6189389617798055e-06, + "loss": 5.5827, + "step": 990 + }, + { + "epoch": 0.16960465514290604, + "grad_norm": 27.581716537475586, + "learning_rate": 5.624643468339988e-06, + "loss": 2.9652, + "step": 991 + }, + { + "epoch": 0.16977580010268697, + "grad_norm": 19.107044219970703, + "learning_rate": 5.6303479749001714e-06, + "loss": 1.7442, + "step": 992 + }, + { + "epoch": 0.1699469450624679, + "grad_norm": 165.6937255859375, + "learning_rate": 5.636052481460354e-06, + "loss": 10.2439, + "step": 993 + }, + { + "epoch": 0.17011809002224884, + "grad_norm": 171.38658142089844, + "learning_rate": 5.6417569880205365e-06, + "loss": 10.7544, + "step": 994 + }, + { + "epoch": 0.17028923498202977, + "grad_norm": 29.20503807067871, + "learning_rate": 5.647461494580719e-06, + "loss": 4.176, + "step": 995 + }, + { + "epoch": 0.1704603799418107, + "grad_norm": 29.09612274169922, + "learning_rate": 5.6531660011409016e-06, + "loss": 4.1945, + "step": 996 + }, + { + "epoch": 0.17063152490159164, + "grad_norm": 39.78682327270508, + "learning_rate": 5.658870507701084e-06, + "loss": 6.4205, + "step": 997 + }, + { + "epoch": 0.17080266986137257, + "grad_norm": 13.687639236450195, + "learning_rate": 5.664575014261267e-06, + "loss": 3.468, + "step": 998 + }, + { + "epoch": 0.1709738148211535, + "grad_norm": 41.89799118041992, + "learning_rate": 5.670279520821449e-06, + "loss": 7.13, + "step": 999 + }, + { + "epoch": 0.17114495978093444, + "grad_norm": 22.78835678100586, + "learning_rate": 5.675984027381632e-06, + "loss": 2.7249, + "step": 1000 + }, + { + "epoch": 0.17131610474071537, + "grad_norm": 26.538780212402344, + "learning_rate": 5.681688533941814e-06, + "loss": 3.2385, + "step": 1001 + }, + { + "epoch": 0.1714872497004963, + "grad_norm": 24.171205520629883, + "learning_rate": 5.687393040501997e-06, + "loss": 3.7183, + "step": 1002 + }, + { + "epoch": 0.17165839466027724, + "grad_norm": 35.46499252319336, + "learning_rate": 5.693097547062179e-06, + "loss": 5.4996, + "step": 1003 + }, + { + "epoch": 0.1718295396200582, + "grad_norm": 15.119646072387695, + "learning_rate": 5.698802053622362e-06, + "loss": 2.4476, + "step": 1004 + }, + { + "epoch": 0.17200068457983914, + "grad_norm": 43.560546875, + "learning_rate": 5.704506560182544e-06, + "loss": 9.1856, + "step": 1005 + }, + { + "epoch": 0.17217182953962007, + "grad_norm": 42.41808319091797, + "learning_rate": 5.710211066742727e-06, + "loss": 5.6756, + "step": 1006 + }, + { + "epoch": 0.172342974499401, + "grad_norm": 34.344207763671875, + "learning_rate": 5.715915573302909e-06, + "loss": 5.2383, + "step": 1007 + }, + { + "epoch": 0.17251411945918194, + "grad_norm": 19.511310577392578, + "learning_rate": 5.721620079863092e-06, + "loss": 3.3214, + "step": 1008 + }, + { + "epoch": 0.17268526441896287, + "grad_norm": 33.06563949584961, + "learning_rate": 5.7273245864232745e-06, + "loss": 5.6944, + "step": 1009 + }, + { + "epoch": 0.1728564093787438, + "grad_norm": 38.382041931152344, + "learning_rate": 5.733029092983457e-06, + "loss": 5.9898, + "step": 1010 + }, + { + "epoch": 0.17302755433852474, + "grad_norm": 28.5861759185791, + "learning_rate": 5.7387335995436395e-06, + "loss": 5.2048, + "step": 1011 + }, + { + "epoch": 0.17319869929830567, + "grad_norm": 31.76646614074707, + "learning_rate": 5.744438106103822e-06, + "loss": 6.0811, + "step": 1012 + }, + { + "epoch": 0.1733698442580866, + "grad_norm": 37.81482696533203, + "learning_rate": 5.750142612664005e-06, + "loss": 4.8642, + "step": 1013 + }, + { + "epoch": 0.17354098921786754, + "grad_norm": 45.32394790649414, + "learning_rate": 5.755847119224188e-06, + "loss": 9.5803, + "step": 1014 + }, + { + "epoch": 0.17371213417764847, + "grad_norm": 35.39071273803711, + "learning_rate": 5.76155162578437e-06, + "loss": 4.3758, + "step": 1015 + }, + { + "epoch": 0.1738832791374294, + "grad_norm": 31.971323013305664, + "learning_rate": 5.767256132344552e-06, + "loss": 4.2616, + "step": 1016 + }, + { + "epoch": 0.17405442409721034, + "grad_norm": 29.855161666870117, + "learning_rate": 5.772960638904735e-06, + "loss": 5.5371, + "step": 1017 + }, + { + "epoch": 0.17422556905699127, + "grad_norm": 21.00974464416504, + "learning_rate": 5.778665145464917e-06, + "loss": 1.9809, + "step": 1018 + }, + { + "epoch": 0.1743967140167722, + "grad_norm": 23.60835075378418, + "learning_rate": 5.7843696520251e-06, + "loss": 2.5916, + "step": 1019 + }, + { + "epoch": 0.17456785897655314, + "grad_norm": 36.11520767211914, + "learning_rate": 5.790074158585283e-06, + "loss": 4.9198, + "step": 1020 + }, + { + "epoch": 0.17473900393633407, + "grad_norm": 21.838703155517578, + "learning_rate": 5.795778665145466e-06, + "loss": 2.1235, + "step": 1021 + }, + { + "epoch": 0.174910148896115, + "grad_norm": 28.41387367248535, + "learning_rate": 5.801483171705647e-06, + "loss": 5.0401, + "step": 1022 + }, + { + "epoch": 0.17508129385589594, + "grad_norm": 28.482187271118164, + "learning_rate": 5.80718767826583e-06, + "loss": 4.7167, + "step": 1023 + }, + { + "epoch": 0.17525243881567687, + "grad_norm": 33.954307556152344, + "learning_rate": 5.8128921848260124e-06, + "loss": 4.9666, + "step": 1024 + }, + { + "epoch": 0.1754235837754578, + "grad_norm": 33.401920318603516, + "learning_rate": 5.818596691386195e-06, + "loss": 6.3783, + "step": 1025 + }, + { + "epoch": 0.17559472873523874, + "grad_norm": 37.047691345214844, + "learning_rate": 5.824301197946378e-06, + "loss": 5.5925, + "step": 1026 + }, + { + "epoch": 0.17576587369501967, + "grad_norm": 30.060083389282227, + "learning_rate": 5.830005704506561e-06, + "loss": 3.8415, + "step": 1027 + }, + { + "epoch": 0.1759370186548006, + "grad_norm": 30.832544326782227, + "learning_rate": 5.8357102110667426e-06, + "loss": 4.9379, + "step": 1028 + }, + { + "epoch": 0.17610816361458156, + "grad_norm": 30.651966094970703, + "learning_rate": 5.841414717626925e-06, + "loss": 3.9393, + "step": 1029 + }, + { + "epoch": 0.1762793085743625, + "grad_norm": 12.284616470336914, + "learning_rate": 5.847119224187108e-06, + "loss": 2.7979, + "step": 1030 + }, + { + "epoch": 0.17645045353414343, + "grad_norm": 25.138864517211914, + "learning_rate": 5.85282373074729e-06, + "loss": 3.6294, + "step": 1031 + }, + { + "epoch": 0.17662159849392436, + "grad_norm": 19.136524200439453, + "learning_rate": 5.8585282373074735e-06, + "loss": 1.5926, + "step": 1032 + }, + { + "epoch": 0.1767927434537053, + "grad_norm": 36.646968841552734, + "learning_rate": 5.864232743867656e-06, + "loss": 5.8265, + "step": 1033 + }, + { + "epoch": 0.17696388841348623, + "grad_norm": 17.363170623779297, + "learning_rate": 5.869937250427838e-06, + "loss": 1.7465, + "step": 1034 + }, + { + "epoch": 0.17713503337326716, + "grad_norm": 29.55439567565918, + "learning_rate": 5.87564175698802e-06, + "loss": 3.617, + "step": 1035 + }, + { + "epoch": 0.1773061783330481, + "grad_norm": 203.16549682617188, + "learning_rate": 5.881346263548203e-06, + "loss": 7.9826, + "step": 1036 + }, + { + "epoch": 0.17747732329282903, + "grad_norm": 17.790836334228516, + "learning_rate": 5.887050770108386e-06, + "loss": 2.1574, + "step": 1037 + }, + { + "epoch": 0.17764846825260996, + "grad_norm": 40.40040969848633, + "learning_rate": 5.892755276668569e-06, + "loss": 5.5116, + "step": 1038 + }, + { + "epoch": 0.1778196132123909, + "grad_norm": 30.316959381103516, + "learning_rate": 5.898459783228751e-06, + "loss": 4.4268, + "step": 1039 + }, + { + "epoch": 0.17799075817217183, + "grad_norm": 34.86418151855469, + "learning_rate": 5.904164289788933e-06, + "loss": 4.9673, + "step": 1040 + }, + { + "epoch": 0.17816190313195276, + "grad_norm": 198.34268188476562, + "learning_rate": 5.9098687963491155e-06, + "loss": 10.3881, + "step": 1041 + }, + { + "epoch": 0.1783330480917337, + "grad_norm": 29.608211517333984, + "learning_rate": 5.915573302909298e-06, + "loss": 3.9641, + "step": 1042 + }, + { + "epoch": 0.17850419305151463, + "grad_norm": 28.76857566833496, + "learning_rate": 5.921277809469481e-06, + "loss": 4.0211, + "step": 1043 + }, + { + "epoch": 0.17867533801129556, + "grad_norm": 26.37080955505371, + "learning_rate": 5.926982316029664e-06, + "loss": 4.6642, + "step": 1044 + }, + { + "epoch": 0.1788464829710765, + "grad_norm": 32.01490020751953, + "learning_rate": 5.9326868225898464e-06, + "loss": 5.5217, + "step": 1045 + }, + { + "epoch": 0.17901762793085743, + "grad_norm": 22.62516212463379, + "learning_rate": 5.938391329150029e-06, + "loss": 1.9563, + "step": 1046 + }, + { + "epoch": 0.17918877289063836, + "grad_norm": 40.089229583740234, + "learning_rate": 5.944095835710211e-06, + "loss": 5.9567, + "step": 1047 + }, + { + "epoch": 0.1793599178504193, + "grad_norm": 22.854562759399414, + "learning_rate": 5.949800342270393e-06, + "loss": 1.9063, + "step": 1048 + }, + { + "epoch": 0.17953106281020023, + "grad_norm": 99.86076354980469, + "learning_rate": 5.9555048488305766e-06, + "loss": 6.6872, + "step": 1049 + }, + { + "epoch": 0.17970220776998116, + "grad_norm": 42.04011154174805, + "learning_rate": 5.961209355390759e-06, + "loss": 6.4974, + "step": 1050 + }, + { + "epoch": 0.1798733527297621, + "grad_norm": 26.85508155822754, + "learning_rate": 5.966913861950942e-06, + "loss": 4.3443, + "step": 1051 + }, + { + "epoch": 0.18004449768954303, + "grad_norm": 29.8301944732666, + "learning_rate": 5.972618368511124e-06, + "loss": 5.0599, + "step": 1052 + }, + { + "epoch": 0.18021564264932396, + "grad_norm": 50.89991760253906, + "learning_rate": 5.978322875071306e-06, + "loss": 9.764, + "step": 1053 + }, + { + "epoch": 0.1803867876091049, + "grad_norm": 32.19784927368164, + "learning_rate": 5.984027381631489e-06, + "loss": 4.1811, + "step": 1054 + }, + { + "epoch": 0.18055793256888586, + "grad_norm": 46.780487060546875, + "learning_rate": 5.989731888191672e-06, + "loss": 9.4505, + "step": 1055 + }, + { + "epoch": 0.1807290775286668, + "grad_norm": 17.571828842163086, + "learning_rate": 5.995436394751854e-06, + "loss": 1.8957, + "step": 1056 + }, + { + "epoch": 0.18090022248844773, + "grad_norm": 30.740095138549805, + "learning_rate": 6.001140901312037e-06, + "loss": 4.0522, + "step": 1057 + }, + { + "epoch": 0.18107136744822866, + "grad_norm": 36.38762283325195, + "learning_rate": 6.006845407872219e-06, + "loss": 5.546, + "step": 1058 + }, + { + "epoch": 0.1812425124080096, + "grad_norm": 37.66824722290039, + "learning_rate": 6.012549914432401e-06, + "loss": 4.7406, + "step": 1059 + }, + { + "epoch": 0.18141365736779053, + "grad_norm": 33.9829216003418, + "learning_rate": 6.018254420992584e-06, + "loss": 4.8123, + "step": 1060 + }, + { + "epoch": 0.18158480232757146, + "grad_norm": 25.99117088317871, + "learning_rate": 6.023958927552767e-06, + "loss": 4.6063, + "step": 1061 + }, + { + "epoch": 0.1817559472873524, + "grad_norm": 29.198394775390625, + "learning_rate": 6.0296634341129495e-06, + "loss": 5.0514, + "step": 1062 + }, + { + "epoch": 0.18192709224713333, + "grad_norm": 14.127655982971191, + "learning_rate": 6.035367940673132e-06, + "loss": 1.3962, + "step": 1063 + }, + { + "epoch": 0.18209823720691426, + "grad_norm": 12.10257339477539, + "learning_rate": 6.0410724472333145e-06, + "loss": 2.0181, + "step": 1064 + }, + { + "epoch": 0.1822693821666952, + "grad_norm": 19.635854721069336, + "learning_rate": 6.046776953793496e-06, + "loss": 1.7151, + "step": 1065 + }, + { + "epoch": 0.18244052712647613, + "grad_norm": 189.35772705078125, + "learning_rate": 6.05248146035368e-06, + "loss": 9.8327, + "step": 1066 + }, + { + "epoch": 0.18261167208625706, + "grad_norm": 34.833229064941406, + "learning_rate": 6.058185966913862e-06, + "loss": 5.6448, + "step": 1067 + }, + { + "epoch": 0.182782817046038, + "grad_norm": 24.17336654663086, + "learning_rate": 6.063890473474045e-06, + "loss": 3.8977, + "step": 1068 + }, + { + "epoch": 0.18295396200581893, + "grad_norm": 32.84638214111328, + "learning_rate": 6.069594980034227e-06, + "loss": 5.7649, + "step": 1069 + }, + { + "epoch": 0.18312510696559986, + "grad_norm": 46.32835388183594, + "learning_rate": 6.07529948659441e-06, + "loss": 9.2569, + "step": 1070 + }, + { + "epoch": 0.1832962519253808, + "grad_norm": 15.697673797607422, + "learning_rate": 6.081003993154592e-06, + "loss": 1.6445, + "step": 1071 + }, + { + "epoch": 0.18346739688516173, + "grad_norm": 31.891868591308594, + "learning_rate": 6.086708499714775e-06, + "loss": 5.4669, + "step": 1072 + }, + { + "epoch": 0.18363854184494266, + "grad_norm": 29.735248565673828, + "learning_rate": 6.092413006274957e-06, + "loss": 5.0552, + "step": 1073 + }, + { + "epoch": 0.1838096868047236, + "grad_norm": 15.486328125, + "learning_rate": 6.09811751283514e-06, + "loss": 2.2292, + "step": 1074 + }, + { + "epoch": 0.18398083176450453, + "grad_norm": 24.518693923950195, + "learning_rate": 6.103822019395322e-06, + "loss": 3.5355, + "step": 1075 + }, + { + "epoch": 0.18415197672428546, + "grad_norm": 27.474645614624023, + "learning_rate": 6.109526525955505e-06, + "loss": 2.0704, + "step": 1076 + }, + { + "epoch": 0.1843231216840664, + "grad_norm": 21.003856658935547, + "learning_rate": 6.115231032515688e-06, + "loss": 2.0773, + "step": 1077 + }, + { + "epoch": 0.18449426664384733, + "grad_norm": 12.948555946350098, + "learning_rate": 6.12093553907587e-06, + "loss": 1.9105, + "step": 1078 + }, + { + "epoch": 0.18466541160362826, + "grad_norm": 28.35967254638672, + "learning_rate": 6.1266400456360525e-06, + "loss": 5.1778, + "step": 1079 + }, + { + "epoch": 0.18483655656340922, + "grad_norm": 28.59235954284668, + "learning_rate": 6.132344552196235e-06, + "loss": 3.9724, + "step": 1080 + }, + { + "epoch": 0.18500770152319015, + "grad_norm": 32.077518463134766, + "learning_rate": 6.138049058756418e-06, + "loss": 4.2397, + "step": 1081 + }, + { + "epoch": 0.1851788464829711, + "grad_norm": 34.8428955078125, + "learning_rate": 6.1437535653166e-06, + "loss": 4.3906, + "step": 1082 + }, + { + "epoch": 0.18534999144275202, + "grad_norm": 36.8244743347168, + "learning_rate": 6.1494580718767835e-06, + "loss": 4.6433, + "step": 1083 + }, + { + "epoch": 0.18552113640253295, + "grad_norm": 34.37318420410156, + "learning_rate": 6.155162578436965e-06, + "loss": 4.7285, + "step": 1084 + }, + { + "epoch": 0.1856922813623139, + "grad_norm": 34.02301025390625, + "learning_rate": 6.160867084997148e-06, + "loss": 5.1995, + "step": 1085 + }, + { + "epoch": 0.18586342632209482, + "grad_norm": 15.779897689819336, + "learning_rate": 6.16657159155733e-06, + "loss": 1.5138, + "step": 1086 + }, + { + "epoch": 0.18603457128187575, + "grad_norm": 45.183841705322266, + "learning_rate": 6.172276098117513e-06, + "loss": 6.6194, + "step": 1087 + }, + { + "epoch": 0.1862057162416567, + "grad_norm": 15.437774658203125, + "learning_rate": 6.177980604677695e-06, + "loss": 1.4242, + "step": 1088 + }, + { + "epoch": 0.18637686120143762, + "grad_norm": 246.0555419921875, + "learning_rate": 6.183685111237879e-06, + "loss": 10.7677, + "step": 1089 + }, + { + "epoch": 0.18654800616121855, + "grad_norm": 8.7081937789917, + "learning_rate": 6.18938961779806e-06, + "loss": 2.3527, + "step": 1090 + }, + { + "epoch": 0.1867191511209995, + "grad_norm": 35.0928840637207, + "learning_rate": 6.195094124358243e-06, + "loss": 5.4856, + "step": 1091 + }, + { + "epoch": 0.18689029608078042, + "grad_norm": 36.24078369140625, + "learning_rate": 6.2007986309184254e-06, + "loss": 5.1105, + "step": 1092 + }, + { + "epoch": 0.18706144104056135, + "grad_norm": 41.07029724121094, + "learning_rate": 6.206503137478608e-06, + "loss": 5.543, + "step": 1093 + }, + { + "epoch": 0.1872325860003423, + "grad_norm": 36.27534484863281, + "learning_rate": 6.212207644038791e-06, + "loss": 4.4058, + "step": 1094 + }, + { + "epoch": 0.18740373096012322, + "grad_norm": 34.61309814453125, + "learning_rate": 6.217912150598974e-06, + "loss": 4.9065, + "step": 1095 + }, + { + "epoch": 0.18757487591990415, + "grad_norm": 36.856388092041016, + "learning_rate": 6.223616657159156e-06, + "loss": 4.8059, + "step": 1096 + }, + { + "epoch": 0.1877460208796851, + "grad_norm": 39.40951156616211, + "learning_rate": 6.229321163719338e-06, + "loss": 5.8853, + "step": 1097 + }, + { + "epoch": 0.18791716583946602, + "grad_norm": 30.013790130615234, + "learning_rate": 6.235025670279521e-06, + "loss": 4.1051, + "step": 1098 + }, + { + "epoch": 0.18808831079924695, + "grad_norm": 27.43667984008789, + "learning_rate": 6.240730176839703e-06, + "loss": 3.661, + "step": 1099 + }, + { + "epoch": 0.1882594557590279, + "grad_norm": 22.01202964782715, + "learning_rate": 6.2464346833998865e-06, + "loss": 2.0165, + "step": 1100 + }, + { + "epoch": 0.18843060071880882, + "grad_norm": 23.981887817382812, + "learning_rate": 6.252139189960069e-06, + "loss": 1.8586, + "step": 1101 + }, + { + "epoch": 0.18860174567858976, + "grad_norm": 221.93540954589844, + "learning_rate": 6.257843696520252e-06, + "loss": 8.0869, + "step": 1102 + }, + { + "epoch": 0.1887728906383707, + "grad_norm": 32.2524299621582, + "learning_rate": 6.263548203080433e-06, + "loss": 4.6553, + "step": 1103 + }, + { + "epoch": 0.18894403559815162, + "grad_norm": 14.555329322814941, + "learning_rate": 6.269252709640616e-06, + "loss": 2.0657, + "step": 1104 + }, + { + "epoch": 0.18911518055793256, + "grad_norm": 27.233903884887695, + "learning_rate": 6.274957216200798e-06, + "loss": 3.7143, + "step": 1105 + }, + { + "epoch": 0.18928632551771352, + "grad_norm": 15.294402122497559, + "learning_rate": 6.280661722760982e-06, + "loss": 1.4409, + "step": 1106 + }, + { + "epoch": 0.18945747047749445, + "grad_norm": 223.1316375732422, + "learning_rate": 6.286366229321164e-06, + "loss": 9.676, + "step": 1107 + }, + { + "epoch": 0.18962861543727538, + "grad_norm": 36.643463134765625, + "learning_rate": 6.292070735881347e-06, + "loss": 4.7202, + "step": 1108 + }, + { + "epoch": 0.18979976039705632, + "grad_norm": 37.47721481323242, + "learning_rate": 6.2977752424415285e-06, + "loss": 4.8366, + "step": 1109 + }, + { + "epoch": 0.18997090535683725, + "grad_norm": 34.74982833862305, + "learning_rate": 6.303479749001711e-06, + "loss": 4.6667, + "step": 1110 + }, + { + "epoch": 0.19014205031661818, + "grad_norm": 38.055728912353516, + "learning_rate": 6.3091842555618935e-06, + "loss": 5.3396, + "step": 1111 + }, + { + "epoch": 0.19031319527639912, + "grad_norm": 33.44966506958008, + "learning_rate": 6.314888762122077e-06, + "loss": 5.0909, + "step": 1112 + }, + { + "epoch": 0.19048434023618005, + "grad_norm": 34.397132873535156, + "learning_rate": 6.3205932686822594e-06, + "loss": 5.3514, + "step": 1113 + }, + { + "epoch": 0.19065548519596098, + "grad_norm": 39.06338119506836, + "learning_rate": 6.326297775242442e-06, + "loss": 6.3797, + "step": 1114 + }, + { + "epoch": 0.19082663015574192, + "grad_norm": 40.017799377441406, + "learning_rate": 6.332002281802624e-06, + "loss": 5.5943, + "step": 1115 + }, + { + "epoch": 0.19099777511552285, + "grad_norm": 11.964347839355469, + "learning_rate": 6.337706788362806e-06, + "loss": 1.8095, + "step": 1116 + }, + { + "epoch": 0.19116892007530378, + "grad_norm": 12.956400871276855, + "learning_rate": 6.3434112949229896e-06, + "loss": 1.3529, + "step": 1117 + }, + { + "epoch": 0.19134006503508472, + "grad_norm": 36.93289566040039, + "learning_rate": 6.349115801483172e-06, + "loss": 6.0492, + "step": 1118 + }, + { + "epoch": 0.19151120999486565, + "grad_norm": 33.92202377319336, + "learning_rate": 6.354820308043355e-06, + "loss": 5.9093, + "step": 1119 + }, + { + "epoch": 0.19168235495464658, + "grad_norm": 37.51108169555664, + "learning_rate": 6.360524814603537e-06, + "loss": 5.5156, + "step": 1120 + }, + { + "epoch": 0.19185349991442752, + "grad_norm": 23.369075775146484, + "learning_rate": 6.36622932116372e-06, + "loss": 3.9585, + "step": 1121 + }, + { + "epoch": 0.19202464487420845, + "grad_norm": 27.76898765563965, + "learning_rate": 6.371933827723901e-06, + "loss": 4.0578, + "step": 1122 + }, + { + "epoch": 0.19219578983398938, + "grad_norm": 21.719980239868164, + "learning_rate": 6.377638334284085e-06, + "loss": 1.6746, + "step": 1123 + }, + { + "epoch": 0.19236693479377032, + "grad_norm": 32.65765380859375, + "learning_rate": 6.383342840844267e-06, + "loss": 4.4355, + "step": 1124 + }, + { + "epoch": 0.19253807975355125, + "grad_norm": 31.302228927612305, + "learning_rate": 6.38904734740445e-06, + "loss": 4.3111, + "step": 1125 + }, + { + "epoch": 0.19270922471333218, + "grad_norm": 36.785396575927734, + "learning_rate": 6.394751853964632e-06, + "loss": 5.3737, + "step": 1126 + }, + { + "epoch": 0.19288036967311312, + "grad_norm": 32.185787200927734, + "learning_rate": 6.400456360524815e-06, + "loss": 4.2842, + "step": 1127 + }, + { + "epoch": 0.19305151463289405, + "grad_norm": 49.154666900634766, + "learning_rate": 6.4061608670849966e-06, + "loss": 8.8989, + "step": 1128 + }, + { + "epoch": 0.19322265959267498, + "grad_norm": 31.552207946777344, + "learning_rate": 6.41186537364518e-06, + "loss": 4.2685, + "step": 1129 + }, + { + "epoch": 0.19339380455245592, + "grad_norm": 21.41136932373047, + "learning_rate": 6.4175698802053625e-06, + "loss": 2.3051, + "step": 1130 + }, + { + "epoch": 0.19356494951223688, + "grad_norm": 13.525940895080566, + "learning_rate": 6.423274386765545e-06, + "loss": 2.1123, + "step": 1131 + }, + { + "epoch": 0.1937360944720178, + "grad_norm": 37.48530960083008, + "learning_rate": 6.4289788933257275e-06, + "loss": 4.8037, + "step": 1132 + }, + { + "epoch": 0.19390723943179874, + "grad_norm": 38.14132308959961, + "learning_rate": 6.43468339988591e-06, + "loss": 6.2294, + "step": 1133 + }, + { + "epoch": 0.19407838439157968, + "grad_norm": 33.01750183105469, + "learning_rate": 6.440387906446093e-06, + "loss": 4.9204, + "step": 1134 + }, + { + "epoch": 0.1942495293513606, + "grad_norm": 36.364158630371094, + "learning_rate": 6.446092413006275e-06, + "loss": 4.5797, + "step": 1135 + }, + { + "epoch": 0.19442067431114154, + "grad_norm": 46.81378173828125, + "learning_rate": 6.451796919566458e-06, + "loss": 6.538, + "step": 1136 + }, + { + "epoch": 0.19459181927092248, + "grad_norm": 23.135957717895508, + "learning_rate": 6.45750142612664e-06, + "loss": 4.3991, + "step": 1137 + }, + { + "epoch": 0.1947629642307034, + "grad_norm": 25.031917572021484, + "learning_rate": 6.463205932686823e-06, + "loss": 2.3886, + "step": 1138 + }, + { + "epoch": 0.19493410919048434, + "grad_norm": 35.31920623779297, + "learning_rate": 6.468910439247005e-06, + "loss": 6.0172, + "step": 1139 + }, + { + "epoch": 0.19510525415026528, + "grad_norm": 36.97047424316406, + "learning_rate": 6.474614945807188e-06, + "loss": 5.4822, + "step": 1140 + }, + { + "epoch": 0.1952763991100462, + "grad_norm": 31.77883529663086, + "learning_rate": 6.48031945236737e-06, + "loss": 4.7072, + "step": 1141 + }, + { + "epoch": 0.19544754406982714, + "grad_norm": 28.897930145263672, + "learning_rate": 6.486023958927553e-06, + "loss": 3.7105, + "step": 1142 + }, + { + "epoch": 0.19561868902960808, + "grad_norm": 29.99696922302246, + "learning_rate": 6.491728465487735e-06, + "loss": 4.5102, + "step": 1143 + }, + { + "epoch": 0.195789833989389, + "grad_norm": 25.783557891845703, + "learning_rate": 6.497432972047918e-06, + "loss": 3.6023, + "step": 1144 + }, + { + "epoch": 0.19596097894916994, + "grad_norm": 35.004642486572266, + "learning_rate": 6.5031374786081005e-06, + "loss": 4.1587, + "step": 1145 + }, + { + "epoch": 0.19613212390895088, + "grad_norm": 173.46754455566406, + "learning_rate": 6.508841985168284e-06, + "loss": 7.5547, + "step": 1146 + }, + { + "epoch": 0.1963032688687318, + "grad_norm": 18.749853134155273, + "learning_rate": 6.5145464917284655e-06, + "loss": 1.7298, + "step": 1147 + }, + { + "epoch": 0.19647441382851275, + "grad_norm": 31.15353012084961, + "learning_rate": 6.520250998288648e-06, + "loss": 5.4053, + "step": 1148 + }, + { + "epoch": 0.19664555878829368, + "grad_norm": 21.659912109375, + "learning_rate": 6.525955504848831e-06, + "loss": 1.8891, + "step": 1149 + }, + { + "epoch": 0.1968167037480746, + "grad_norm": 23.412139892578125, + "learning_rate": 6.531660011409013e-06, + "loss": 3.8619, + "step": 1150 + }, + { + "epoch": 0.19698784870785555, + "grad_norm": 22.16069221496582, + "learning_rate": 6.537364517969196e-06, + "loss": 2.0106, + "step": 1151 + }, + { + "epoch": 0.19715899366763648, + "grad_norm": 33.494136810302734, + "learning_rate": 6.543069024529379e-06, + "loss": 5.4958, + "step": 1152 + }, + { + "epoch": 0.1973301386274174, + "grad_norm": 32.96882629394531, + "learning_rate": 6.548773531089561e-06, + "loss": 4.5927, + "step": 1153 + }, + { + "epoch": 0.19750128358719835, + "grad_norm": 36.14384078979492, + "learning_rate": 6.554478037649743e-06, + "loss": 5.6357, + "step": 1154 + }, + { + "epoch": 0.19767242854697928, + "grad_norm": 23.875118255615234, + "learning_rate": 6.560182544209926e-06, + "loss": 3.158, + "step": 1155 + }, + { + "epoch": 0.19784357350676024, + "grad_norm": 23.001026153564453, + "learning_rate": 6.565887050770108e-06, + "loss": 1.8949, + "step": 1156 + }, + { + "epoch": 0.19801471846654117, + "grad_norm": 46.26600646972656, + "learning_rate": 6.571591557330292e-06, + "loss": 9.1329, + "step": 1157 + }, + { + "epoch": 0.1981858634263221, + "grad_norm": 16.32296371459961, + "learning_rate": 6.577296063890474e-06, + "loss": 1.5302, + "step": 1158 + }, + { + "epoch": 0.19835700838610304, + "grad_norm": 26.114614486694336, + "learning_rate": 6.583000570450656e-06, + "loss": 2.3763, + "step": 1159 + }, + { + "epoch": 0.19852815334588397, + "grad_norm": 37.42622756958008, + "learning_rate": 6.5887050770108384e-06, + "loss": 5.5999, + "step": 1160 + }, + { + "epoch": 0.1986992983056649, + "grad_norm": 21.48786735534668, + "learning_rate": 6.594409583571021e-06, + "loss": 3.4369, + "step": 1161 + }, + { + "epoch": 0.19887044326544584, + "grad_norm": 24.472808837890625, + "learning_rate": 6.6001140901312035e-06, + "loss": 2.0175, + "step": 1162 + }, + { + "epoch": 0.19904158822522677, + "grad_norm": 25.275909423828125, + "learning_rate": 6.605818596691387e-06, + "loss": 2.6992, + "step": 1163 + }, + { + "epoch": 0.1992127331850077, + "grad_norm": 29.439197540283203, + "learning_rate": 6.611523103251569e-06, + "loss": 4.4373, + "step": 1164 + }, + { + "epoch": 0.19938387814478864, + "grad_norm": 224.64663696289062, + "learning_rate": 6.617227609811751e-06, + "loss": 10.3737, + "step": 1165 + }, + { + "epoch": 0.19955502310456957, + "grad_norm": 34.043575286865234, + "learning_rate": 6.622932116371934e-06, + "loss": 5.0921, + "step": 1166 + }, + { + "epoch": 0.1997261680643505, + "grad_norm": 11.060107231140137, + "learning_rate": 6.628636622932116e-06, + "loss": 1.2996, + "step": 1167 + }, + { + "epoch": 0.19989731302413144, + "grad_norm": 32.19368362426758, + "learning_rate": 6.634341129492299e-06, + "loss": 4.2537, + "step": 1168 + }, + { + "epoch": 0.20006845798391237, + "grad_norm": 48.267578125, + "learning_rate": 6.640045636052482e-06, + "loss": 9.335, + "step": 1169 + }, + { + "epoch": 0.2002396029436933, + "grad_norm": 19.327762603759766, + "learning_rate": 6.645750142612665e-06, + "loss": 1.8859, + "step": 1170 + }, + { + "epoch": 0.20041074790347424, + "grad_norm": 28.81614875793457, + "learning_rate": 6.651454649172847e-06, + "loss": 3.8125, + "step": 1171 + }, + { + "epoch": 0.20058189286325517, + "grad_norm": 24.971960067749023, + "learning_rate": 6.657159155733029e-06, + "loss": 3.0816, + "step": 1172 + }, + { + "epoch": 0.2007530378230361, + "grad_norm": 154.4432373046875, + "learning_rate": 6.662863662293211e-06, + "loss": 8.568, + "step": 1173 + }, + { + "epoch": 0.20092418278281704, + "grad_norm": 47.04978942871094, + "learning_rate": 6.668568168853395e-06, + "loss": 5.1816, + "step": 1174 + }, + { + "epoch": 0.20109532774259797, + "grad_norm": 24.374345779418945, + "learning_rate": 6.674272675413577e-06, + "loss": 2.6078, + "step": 1175 + }, + { + "epoch": 0.2012664727023789, + "grad_norm": 36.597232818603516, + "learning_rate": 6.67997718197376e-06, + "loss": 5.5402, + "step": 1176 + }, + { + "epoch": 0.20143761766215984, + "grad_norm": 36.612060546875, + "learning_rate": 6.685681688533942e-06, + "loss": 5.17, + "step": 1177 + }, + { + "epoch": 0.20160876262194077, + "grad_norm": 39.452117919921875, + "learning_rate": 6.691386195094124e-06, + "loss": 6.2861, + "step": 1178 + }, + { + "epoch": 0.2017799075817217, + "grad_norm": 35.985816955566406, + "learning_rate": 6.6970907016543065e-06, + "loss": 5.7763, + "step": 1179 + }, + { + "epoch": 0.20195105254150264, + "grad_norm": 11.960805892944336, + "learning_rate": 6.70279520821449e-06, + "loss": 2.7312, + "step": 1180 + }, + { + "epoch": 0.20212219750128357, + "grad_norm": 154.7554168701172, + "learning_rate": 6.7084997147746724e-06, + "loss": 9.5806, + "step": 1181 + }, + { + "epoch": 0.20229334246106453, + "grad_norm": 31.713943481445312, + "learning_rate": 6.714204221334855e-06, + "loss": 4.9006, + "step": 1182 + }, + { + "epoch": 0.20246448742084547, + "grad_norm": 11.431591987609863, + "learning_rate": 6.7199087278950375e-06, + "loss": 3.1028, + "step": 1183 + }, + { + "epoch": 0.2026356323806264, + "grad_norm": 208.2880859375, + "learning_rate": 6.725613234455219e-06, + "loss": 8.5447, + "step": 1184 + }, + { + "epoch": 0.20280677734040733, + "grad_norm": 32.78763198852539, + "learning_rate": 6.731317741015402e-06, + "loss": 5.0437, + "step": 1185 + }, + { + "epoch": 0.20297792230018827, + "grad_norm": 31.15655517578125, + "learning_rate": 6.737022247575585e-06, + "loss": 4.1921, + "step": 1186 + }, + { + "epoch": 0.2031490672599692, + "grad_norm": 12.072607040405273, + "learning_rate": 6.742726754135768e-06, + "loss": 1.9291, + "step": 1187 + }, + { + "epoch": 0.20332021221975013, + "grad_norm": 46.76679992675781, + "learning_rate": 6.74843126069595e-06, + "loss": 9.0577, + "step": 1188 + }, + { + "epoch": 0.20349135717953107, + "grad_norm": 28.912738800048828, + "learning_rate": 6.754135767256133e-06, + "loss": 4.3274, + "step": 1189 + }, + { + "epoch": 0.203662502139312, + "grad_norm": 151.7112579345703, + "learning_rate": 6.759840273816315e-06, + "loss": 8.1049, + "step": 1190 + }, + { + "epoch": 0.20383364709909293, + "grad_norm": 19.557729721069336, + "learning_rate": 6.765544780376497e-06, + "loss": 1.6717, + "step": 1191 + }, + { + "epoch": 0.20400479205887387, + "grad_norm": 37.28075408935547, + "learning_rate": 6.77124928693668e-06, + "loss": 5.6393, + "step": 1192 + }, + { + "epoch": 0.2041759370186548, + "grad_norm": 33.639183044433594, + "learning_rate": 6.776953793496863e-06, + "loss": 4.9937, + "step": 1193 + }, + { + "epoch": 0.20434708197843574, + "grad_norm": 16.514705657958984, + "learning_rate": 6.782658300057045e-06, + "loss": 2.2396, + "step": 1194 + }, + { + "epoch": 0.20451822693821667, + "grad_norm": 29.29157066345215, + "learning_rate": 6.788362806617228e-06, + "loss": 4.5062, + "step": 1195 + }, + { + "epoch": 0.2046893718979976, + "grad_norm": 24.25420570373535, + "learning_rate": 6.79406731317741e-06, + "loss": 2.5282, + "step": 1196 + }, + { + "epoch": 0.20486051685777854, + "grad_norm": 21.87625503540039, + "learning_rate": 6.799771819737593e-06, + "loss": 2.2101, + "step": 1197 + }, + { + "epoch": 0.20503166181755947, + "grad_norm": 29.727163314819336, + "learning_rate": 6.8054763262977755e-06, + "loss": 3.5679, + "step": 1198 + }, + { + "epoch": 0.2052028067773404, + "grad_norm": 23.502267837524414, + "learning_rate": 6.811180832857958e-06, + "loss": 3.9821, + "step": 1199 + }, + { + "epoch": 0.20537395173712134, + "grad_norm": 31.961931228637695, + "learning_rate": 6.8168853394181405e-06, + "loss": 4.6, + "step": 1200 + }, + { + "epoch": 0.20554509669690227, + "grad_norm": 27.584300994873047, + "learning_rate": 6.822589845978323e-06, + "loss": 3.389, + "step": 1201 + }, + { + "epoch": 0.2057162416566832, + "grad_norm": 34.41096115112305, + "learning_rate": 6.828294352538506e-06, + "loss": 4.722, + "step": 1202 + }, + { + "epoch": 0.20588738661646414, + "grad_norm": 41.341312408447266, + "learning_rate": 6.833998859098688e-06, + "loss": 6.7225, + "step": 1203 + }, + { + "epoch": 0.20605853157624507, + "grad_norm": 160.5906982421875, + "learning_rate": 6.839703365658871e-06, + "loss": 9.8412, + "step": 1204 + }, + { + "epoch": 0.206229676536026, + "grad_norm": 23.49472999572754, + "learning_rate": 6.845407872219053e-06, + "loss": 3.6378, + "step": 1205 + }, + { + "epoch": 0.20640082149580694, + "grad_norm": 31.307947158813477, + "learning_rate": 6.851112378779236e-06, + "loss": 3.6813, + "step": 1206 + }, + { + "epoch": 0.2065719664555879, + "grad_norm": 27.893850326538086, + "learning_rate": 6.856816885339418e-06, + "loss": 4.5216, + "step": 1207 + }, + { + "epoch": 0.20674311141536883, + "grad_norm": 32.200157165527344, + "learning_rate": 6.862521391899601e-06, + "loss": 4.5525, + "step": 1208 + }, + { + "epoch": 0.20691425637514976, + "grad_norm": 31.765216827392578, + "learning_rate": 6.868225898459783e-06, + "loss": 5.2865, + "step": 1209 + }, + { + "epoch": 0.2070854013349307, + "grad_norm": 35.562294006347656, + "learning_rate": 6.873930405019966e-06, + "loss": 5.0758, + "step": 1210 + }, + { + "epoch": 0.20725654629471163, + "grad_norm": 44.582786560058594, + "learning_rate": 6.879634911580148e-06, + "loss": 8.7973, + "step": 1211 + }, + { + "epoch": 0.20742769125449256, + "grad_norm": 29.667964935302734, + "learning_rate": 6.885339418140331e-06, + "loss": 3.7483, + "step": 1212 + }, + { + "epoch": 0.2075988362142735, + "grad_norm": 33.826454162597656, + "learning_rate": 6.8910439247005135e-06, + "loss": 5.321, + "step": 1213 + }, + { + "epoch": 0.20776998117405443, + "grad_norm": 36.56757354736328, + "learning_rate": 6.896748431260697e-06, + "loss": 4.6366, + "step": 1214 + }, + { + "epoch": 0.20794112613383536, + "grad_norm": 21.483030319213867, + "learning_rate": 6.9024529378208785e-06, + "loss": 1.7844, + "step": 1215 + }, + { + "epoch": 0.2081122710936163, + "grad_norm": 22.398630142211914, + "learning_rate": 6.908157444381061e-06, + "loss": 2.9002, + "step": 1216 + }, + { + "epoch": 0.20828341605339723, + "grad_norm": 16.41680145263672, + "learning_rate": 6.913861950941244e-06, + "loss": 1.5466, + "step": 1217 + }, + { + "epoch": 0.20845456101317816, + "grad_norm": 22.448949813842773, + "learning_rate": 6.919566457501426e-06, + "loss": 3.4011, + "step": 1218 + }, + { + "epoch": 0.2086257059729591, + "grad_norm": 35.074989318847656, + "learning_rate": 6.925270964061609e-06, + "loss": 4.4769, + "step": 1219 + }, + { + "epoch": 0.20879685093274003, + "grad_norm": 29.737442016601562, + "learning_rate": 6.930975470621792e-06, + "loss": 4.6152, + "step": 1220 + }, + { + "epoch": 0.20896799589252096, + "grad_norm": 29.097299575805664, + "learning_rate": 6.9366799771819746e-06, + "loss": 3.8591, + "step": 1221 + }, + { + "epoch": 0.2091391408523019, + "grad_norm": 22.356008529663086, + "learning_rate": 6.942384483742156e-06, + "loss": 3.6379, + "step": 1222 + }, + { + "epoch": 0.20931028581208283, + "grad_norm": 29.412656784057617, + "learning_rate": 6.948088990302339e-06, + "loss": 3.5976, + "step": 1223 + }, + { + "epoch": 0.20948143077186376, + "grad_norm": 19.5412654876709, + "learning_rate": 6.953793496862521e-06, + "loss": 2.0718, + "step": 1224 + }, + { + "epoch": 0.2096525757316447, + "grad_norm": 17.43561363220215, + "learning_rate": 6.959498003422704e-06, + "loss": 1.5389, + "step": 1225 + }, + { + "epoch": 0.20982372069142563, + "grad_norm": 34.85890579223633, + "learning_rate": 6.965202509982887e-06, + "loss": 4.4105, + "step": 1226 + }, + { + "epoch": 0.20999486565120656, + "grad_norm": 33.83147430419922, + "learning_rate": 6.97090701654307e-06, + "loss": 4.108, + "step": 1227 + }, + { + "epoch": 0.2101660106109875, + "grad_norm": 33.77149963378906, + "learning_rate": 6.9766115231032514e-06, + "loss": 4.4198, + "step": 1228 + }, + { + "epoch": 0.21033715557076843, + "grad_norm": 12.30455207824707, + "learning_rate": 6.982316029663434e-06, + "loss": 1.7759, + "step": 1229 + }, + { + "epoch": 0.21050830053054936, + "grad_norm": 34.55380630493164, + "learning_rate": 6.9880205362236165e-06, + "loss": 4.4813, + "step": 1230 + }, + { + "epoch": 0.2106794454903303, + "grad_norm": 23.975025177001953, + "learning_rate": 6.993725042783799e-06, + "loss": 3.728, + "step": 1231 + }, + { + "epoch": 0.21085059045011123, + "grad_norm": 190.6012725830078, + "learning_rate": 6.999429549343982e-06, + "loss": 10.1602, + "step": 1232 + }, + { + "epoch": 0.2110217354098922, + "grad_norm": 34.527076721191406, + "learning_rate": 7.005134055904165e-06, + "loss": 4.7483, + "step": 1233 + }, + { + "epoch": 0.21119288036967312, + "grad_norm": 35.65943908691406, + "learning_rate": 7.010838562464347e-06, + "loss": 5.5499, + "step": 1234 + }, + { + "epoch": 0.21136402532945406, + "grad_norm": 34.03565216064453, + "learning_rate": 7.016543069024529e-06, + "loss": 4.7829, + "step": 1235 + }, + { + "epoch": 0.211535170289235, + "grad_norm": 20.10201072692871, + "learning_rate": 7.022247575584712e-06, + "loss": 2.9853, + "step": 1236 + }, + { + "epoch": 0.21170631524901593, + "grad_norm": 72.77118682861328, + "learning_rate": 7.027952082144895e-06, + "loss": 6.8184, + "step": 1237 + }, + { + "epoch": 0.21187746020879686, + "grad_norm": 32.084381103515625, + "learning_rate": 7.033656588705078e-06, + "loss": 5.0572, + "step": 1238 + }, + { + "epoch": 0.2120486051685778, + "grad_norm": 28.180423736572266, + "learning_rate": 7.03936109526526e-06, + "loss": 3.8185, + "step": 1239 + }, + { + "epoch": 0.21221975012835873, + "grad_norm": 20.687843322753906, + "learning_rate": 7.045065601825443e-06, + "loss": 2.1643, + "step": 1240 + }, + { + "epoch": 0.21239089508813966, + "grad_norm": 15.380537033081055, + "learning_rate": 7.050770108385624e-06, + "loss": 1.6453, + "step": 1241 + }, + { + "epoch": 0.2125620400479206, + "grad_norm": 38.16814422607422, + "learning_rate": 7.056474614945807e-06, + "loss": 5.8775, + "step": 1242 + }, + { + "epoch": 0.21273318500770153, + "grad_norm": 43.55405807495117, + "learning_rate": 7.06217912150599e-06, + "loss": 5.1528, + "step": 1243 + }, + { + "epoch": 0.21290432996748246, + "grad_norm": 30.40400505065918, + "learning_rate": 7.067883628066173e-06, + "loss": 4.155, + "step": 1244 + }, + { + "epoch": 0.2130754749272634, + "grad_norm": 39.55487823486328, + "learning_rate": 7.073588134626355e-06, + "loss": 6.8649, + "step": 1245 + }, + { + "epoch": 0.21324661988704433, + "grad_norm": 46.886600494384766, + "learning_rate": 7.079292641186538e-06, + "loss": 4.8251, + "step": 1246 + }, + { + "epoch": 0.21341776484682526, + "grad_norm": 35.842594146728516, + "learning_rate": 7.0849971477467195e-06, + "loss": 5.3382, + "step": 1247 + }, + { + "epoch": 0.2135889098066062, + "grad_norm": 10.459444999694824, + "learning_rate": 7.090701654306902e-06, + "loss": 1.1781, + "step": 1248 + }, + { + "epoch": 0.21376005476638713, + "grad_norm": 31.134531021118164, + "learning_rate": 7.0964061608670854e-06, + "loss": 3.3419, + "step": 1249 + }, + { + "epoch": 0.21393119972616806, + "grad_norm": 32.50645065307617, + "learning_rate": 7.102110667427268e-06, + "loss": 4.1592, + "step": 1250 + }, + { + "epoch": 0.214102344685949, + "grad_norm": 38.065643310546875, + "learning_rate": 7.1078151739874505e-06, + "loss": 6.1903, + "step": 1251 + }, + { + "epoch": 0.21427348964572993, + "grad_norm": 32.13066482543945, + "learning_rate": 7.113519680547633e-06, + "loss": 3.8917, + "step": 1252 + }, + { + "epoch": 0.21444463460551086, + "grad_norm": 22.333932876586914, + "learning_rate": 7.119224187107815e-06, + "loss": 3.308, + "step": 1253 + }, + { + "epoch": 0.2146157795652918, + "grad_norm": 8.437789916992188, + "learning_rate": 7.124928693667997e-06, + "loss": 2.2375, + "step": 1254 + }, + { + "epoch": 0.21478692452507273, + "grad_norm": 32.72603225708008, + "learning_rate": 7.130633200228181e-06, + "loss": 4.8237, + "step": 1255 + }, + { + "epoch": 0.21495806948485366, + "grad_norm": 34.640647888183594, + "learning_rate": 7.136337706788363e-06, + "loss": 5.2757, + "step": 1256 + }, + { + "epoch": 0.2151292144446346, + "grad_norm": 20.100618362426758, + "learning_rate": 7.142042213348546e-06, + "loss": 2.961, + "step": 1257 + }, + { + "epoch": 0.21530035940441555, + "grad_norm": 43.29427719116211, + "learning_rate": 7.147746719908728e-06, + "loss": 8.933, + "step": 1258 + }, + { + "epoch": 0.2154715043641965, + "grad_norm": 33.56546401977539, + "learning_rate": 7.15345122646891e-06, + "loss": 4.6558, + "step": 1259 + }, + { + "epoch": 0.21564264932397742, + "grad_norm": 33.7791633605957, + "learning_rate": 7.159155733029093e-06, + "loss": 4.183, + "step": 1260 + }, + { + "epoch": 0.21581379428375835, + "grad_norm": 33.235233306884766, + "learning_rate": 7.164860239589276e-06, + "loss": 3.7487, + "step": 1261 + }, + { + "epoch": 0.2159849392435393, + "grad_norm": 140.30621337890625, + "learning_rate": 7.170564746149458e-06, + "loss": 9.0381, + "step": 1262 + }, + { + "epoch": 0.21615608420332022, + "grad_norm": 20.70719337463379, + "learning_rate": 7.176269252709641e-06, + "loss": 1.7769, + "step": 1263 + }, + { + "epoch": 0.21632722916310115, + "grad_norm": 36.93478012084961, + "learning_rate": 7.181973759269823e-06, + "loss": 4.5665, + "step": 1264 + }, + { + "epoch": 0.2164983741228821, + "grad_norm": 81.26618957519531, + "learning_rate": 7.187678265830006e-06, + "loss": 7.0141, + "step": 1265 + }, + { + "epoch": 0.21666951908266302, + "grad_norm": 33.15439224243164, + "learning_rate": 7.1933827723901885e-06, + "loss": 4.5814, + "step": 1266 + }, + { + "epoch": 0.21684066404244395, + "grad_norm": 26.268171310424805, + "learning_rate": 7.199087278950371e-06, + "loss": 3.0891, + "step": 1267 + }, + { + "epoch": 0.2170118090022249, + "grad_norm": 35.35780715942383, + "learning_rate": 7.2047917855105535e-06, + "loss": 4.8355, + "step": 1268 + }, + { + "epoch": 0.21718295396200582, + "grad_norm": 21.87150764465332, + "learning_rate": 7.210496292070736e-06, + "loss": 1.7614, + "step": 1269 + }, + { + "epoch": 0.21735409892178675, + "grad_norm": 36.49989318847656, + "learning_rate": 7.216200798630919e-06, + "loss": 5.8824, + "step": 1270 + }, + { + "epoch": 0.2175252438815677, + "grad_norm": 11.613662719726562, + "learning_rate": 7.221905305191101e-06, + "loss": 1.7057, + "step": 1271 + }, + { + "epoch": 0.21769638884134862, + "grad_norm": 28.447458267211914, + "learning_rate": 7.227609811751284e-06, + "loss": 4.3815, + "step": 1272 + }, + { + "epoch": 0.21786753380112955, + "grad_norm": 34.95615005493164, + "learning_rate": 7.233314318311466e-06, + "loss": 4.7223, + "step": 1273 + }, + { + "epoch": 0.2180386787609105, + "grad_norm": 36.12034606933594, + "learning_rate": 7.239018824871649e-06, + "loss": 5.4639, + "step": 1274 + }, + { + "epoch": 0.21820982372069142, + "grad_norm": 29.200042724609375, + "learning_rate": 7.244723331431831e-06, + "loss": 3.9203, + "step": 1275 + }, + { + "epoch": 0.21838096868047235, + "grad_norm": 173.54055786132812, + "learning_rate": 7.250427837992014e-06, + "loss": 9.2819, + "step": 1276 + }, + { + "epoch": 0.2185521136402533, + "grad_norm": 30.67865562438965, + "learning_rate": 7.256132344552197e-06, + "loss": 4.7412, + "step": 1277 + }, + { + "epoch": 0.21872325860003422, + "grad_norm": 35.703468322753906, + "learning_rate": 7.261836851112379e-06, + "loss": 5.3418, + "step": 1278 + }, + { + "epoch": 0.21889440355981515, + "grad_norm": 35.29546356201172, + "learning_rate": 7.267541357672561e-06, + "loss": 5.1735, + "step": 1279 + }, + { + "epoch": 0.2190655485195961, + "grad_norm": 20.382551193237305, + "learning_rate": 7.273245864232744e-06, + "loss": 1.8851, + "step": 1280 + }, + { + "epoch": 0.21923669347937702, + "grad_norm": 20.68045997619629, + "learning_rate": 7.2789503707929265e-06, + "loss": 2.681, + "step": 1281 + }, + { + "epoch": 0.21940783843915795, + "grad_norm": 37.52497482299805, + "learning_rate": 7.284654877353109e-06, + "loss": 5.9113, + "step": 1282 + }, + { + "epoch": 0.21957898339893892, + "grad_norm": 154.6285858154297, + "learning_rate": 7.290359383913292e-06, + "loss": 8.0077, + "step": 1283 + }, + { + "epoch": 0.21975012835871985, + "grad_norm": 28.380836486816406, + "learning_rate": 7.296063890473474e-06, + "loss": 3.5758, + "step": 1284 + }, + { + "epoch": 0.21992127331850078, + "grad_norm": 13.987469673156738, + "learning_rate": 7.301768397033657e-06, + "loss": 1.4051, + "step": 1285 + }, + { + "epoch": 0.22009241827828172, + "grad_norm": 21.18030548095703, + "learning_rate": 7.307472903593839e-06, + "loss": 3.1844, + "step": 1286 + }, + { + "epoch": 0.22026356323806265, + "grad_norm": 13.61611270904541, + "learning_rate": 7.313177410154022e-06, + "loss": 1.4008, + "step": 1287 + }, + { + "epoch": 0.22043470819784358, + "grad_norm": 32.63056182861328, + "learning_rate": 7.318881916714204e-06, + "loss": 5.485, + "step": 1288 + }, + { + "epoch": 0.22060585315762452, + "grad_norm": 12.39704704284668, + "learning_rate": 7.3245864232743876e-06, + "loss": 2.8362, + "step": 1289 + }, + { + "epoch": 0.22077699811740545, + "grad_norm": 160.39300537109375, + "learning_rate": 7.33029092983457e-06, + "loss": 9.3207, + "step": 1290 + }, + { + "epoch": 0.22094814307718638, + "grad_norm": 35.63487243652344, + "learning_rate": 7.335995436394752e-06, + "loss": 4.3364, + "step": 1291 + }, + { + "epoch": 0.22111928803696732, + "grad_norm": 18.865745544433594, + "learning_rate": 7.341699942954934e-06, + "loss": 1.9152, + "step": 1292 + }, + { + "epoch": 0.22129043299674825, + "grad_norm": 34.95203399658203, + "learning_rate": 7.347404449515117e-06, + "loss": 4.2394, + "step": 1293 + }, + { + "epoch": 0.22146157795652918, + "grad_norm": 32.99889373779297, + "learning_rate": 7.353108956075299e-06, + "loss": 5.7603, + "step": 1294 + }, + { + "epoch": 0.22163272291631012, + "grad_norm": 31.541820526123047, + "learning_rate": 7.358813462635483e-06, + "loss": 4.7464, + "step": 1295 + }, + { + "epoch": 0.22180386787609105, + "grad_norm": 22.86473274230957, + "learning_rate": 7.364517969195665e-06, + "loss": 3.2885, + "step": 1296 + }, + { + "epoch": 0.22197501283587198, + "grad_norm": 34.75326919555664, + "learning_rate": 7.370222475755847e-06, + "loss": 4.4337, + "step": 1297 + }, + { + "epoch": 0.22214615779565292, + "grad_norm": 33.42300796508789, + "learning_rate": 7.3759269823160295e-06, + "loss": 4.8641, + "step": 1298 + }, + { + "epoch": 0.22231730275543385, + "grad_norm": 40.14048385620117, + "learning_rate": 7.381631488876212e-06, + "loss": 5.3092, + "step": 1299 + }, + { + "epoch": 0.22248844771521478, + "grad_norm": 33.59206008911133, + "learning_rate": 7.387335995436395e-06, + "loss": 4.6114, + "step": 1300 + }, + { + "epoch": 0.22265959267499572, + "grad_norm": 32.96902084350586, + "learning_rate": 7.393040501996578e-06, + "loss": 4.9559, + "step": 1301 + }, + { + "epoch": 0.22283073763477665, + "grad_norm": 76.84076690673828, + "learning_rate": 7.3987450085567605e-06, + "loss": 7.2409, + "step": 1302 + }, + { + "epoch": 0.22300188259455758, + "grad_norm": 29.227497100830078, + "learning_rate": 7.404449515116942e-06, + "loss": 3.4494, + "step": 1303 + }, + { + "epoch": 0.22317302755433852, + "grad_norm": 34.10039520263672, + "learning_rate": 7.410154021677125e-06, + "loss": 4.6513, + "step": 1304 + }, + { + "epoch": 0.22334417251411945, + "grad_norm": 43.62645721435547, + "learning_rate": 7.415858528237307e-06, + "loss": 6.1141, + "step": 1305 + }, + { + "epoch": 0.22351531747390038, + "grad_norm": 29.59916877746582, + "learning_rate": 7.421563034797491e-06, + "loss": 4.5189, + "step": 1306 + }, + { + "epoch": 0.22368646243368132, + "grad_norm": 32.00434494018555, + "learning_rate": 7.427267541357673e-06, + "loss": 3.7625, + "step": 1307 + }, + { + "epoch": 0.22385760739346225, + "grad_norm": 12.214600563049316, + "learning_rate": 7.432972047917856e-06, + "loss": 1.6093, + "step": 1308 + }, + { + "epoch": 0.2240287523532432, + "grad_norm": 13.289321899414062, + "learning_rate": 7.438676554478037e-06, + "loss": 1.8433, + "step": 1309 + }, + { + "epoch": 0.22419989731302414, + "grad_norm": 12.391509056091309, + "learning_rate": 7.44438106103822e-06, + "loss": 1.8211, + "step": 1310 + }, + { + "epoch": 0.22437104227280508, + "grad_norm": 31.827852249145508, + "learning_rate": 7.450085567598402e-06, + "loss": 3.5607, + "step": 1311 + }, + { + "epoch": 0.224542187232586, + "grad_norm": 172.93185424804688, + "learning_rate": 7.455790074158586e-06, + "loss": 9.5445, + "step": 1312 + }, + { + "epoch": 0.22471333219236694, + "grad_norm": 18.688396453857422, + "learning_rate": 7.461494580718768e-06, + "loss": 1.9759, + "step": 1313 + }, + { + "epoch": 0.22488447715214788, + "grad_norm": 26.364185333251953, + "learning_rate": 7.467199087278951e-06, + "loss": 3.2682, + "step": 1314 + }, + { + "epoch": 0.2250556221119288, + "grad_norm": 8.573413848876953, + "learning_rate": 7.472903593839133e-06, + "loss": 1.3051, + "step": 1315 + }, + { + "epoch": 0.22522676707170974, + "grad_norm": 24.913686752319336, + "learning_rate": 7.478608100399315e-06, + "loss": 2.4598, + "step": 1316 + }, + { + "epoch": 0.22539791203149068, + "grad_norm": 30.283504486083984, + "learning_rate": 7.4843126069594984e-06, + "loss": 4.1503, + "step": 1317 + }, + { + "epoch": 0.2255690569912716, + "grad_norm": 18.146724700927734, + "learning_rate": 7.490017113519681e-06, + "loss": 1.8957, + "step": 1318 + }, + { + "epoch": 0.22574020195105254, + "grad_norm": 11.016623497009277, + "learning_rate": 7.4957216200798635e-06, + "loss": 2.636, + "step": 1319 + }, + { + "epoch": 0.22591134691083348, + "grad_norm": 35.766883850097656, + "learning_rate": 7.501426126640046e-06, + "loss": 4.3588, + "step": 1320 + }, + { + "epoch": 0.2260824918706144, + "grad_norm": 24.76753807067871, + "learning_rate": 7.5071306332002286e-06, + "loss": 3.2106, + "step": 1321 + }, + { + "epoch": 0.22625363683039534, + "grad_norm": 35.969505310058594, + "learning_rate": 7.51283513976041e-06, + "loss": 4.5488, + "step": 1322 + }, + { + "epoch": 0.22642478179017628, + "grad_norm": 13.215656280517578, + "learning_rate": 7.518539646320593e-06, + "loss": 1.7273, + "step": 1323 + }, + { + "epoch": 0.2265959267499572, + "grad_norm": 32.75537872314453, + "learning_rate": 7.524244152880775e-06, + "loss": 4.442, + "step": 1324 + }, + { + "epoch": 0.22676707170973814, + "grad_norm": 13.069498062133789, + "learning_rate": 7.529948659440958e-06, + "loss": 1.2864, + "step": 1325 + }, + { + "epoch": 0.22693821666951908, + "grad_norm": 29.5541934967041, + "learning_rate": 7.535653166001142e-06, + "loss": 3.5993, + "step": 1326 + }, + { + "epoch": 0.2271093616293, + "grad_norm": 36.506736755371094, + "learning_rate": 7.541357672561325e-06, + "loss": 4.7108, + "step": 1327 + }, + { + "epoch": 0.22728050658908094, + "grad_norm": 30.510953903198242, + "learning_rate": 7.547062179121506e-06, + "loss": 4.168, + "step": 1328 + }, + { + "epoch": 0.22745165154886188, + "grad_norm": 11.754740715026855, + "learning_rate": 7.552766685681689e-06, + "loss": 2.7865, + "step": 1329 + }, + { + "epoch": 0.2276227965086428, + "grad_norm": 31.793643951416016, + "learning_rate": 7.558471192241871e-06, + "loss": 3.4931, + "step": 1330 + }, + { + "epoch": 0.22779394146842374, + "grad_norm": 23.95293426513672, + "learning_rate": 7.564175698802054e-06, + "loss": 3.0252, + "step": 1331 + }, + { + "epoch": 0.22796508642820468, + "grad_norm": 28.809511184692383, + "learning_rate": 7.569880205362236e-06, + "loss": 4.2144, + "step": 1332 + }, + { + "epoch": 0.2281362313879856, + "grad_norm": 34.645267486572266, + "learning_rate": 7.575584711922419e-06, + "loss": 4.5155, + "step": 1333 + }, + { + "epoch": 0.22830737634776657, + "grad_norm": 31.90658950805664, + "learning_rate": 7.581289218482601e-06, + "loss": 3.6445, + "step": 1334 + }, + { + "epoch": 0.2284785213075475, + "grad_norm": 26.37479591369629, + "learning_rate": 7.586993725042783e-06, + "loss": 2.6728, + "step": 1335 + }, + { + "epoch": 0.22864966626732844, + "grad_norm": 29.64954376220703, + "learning_rate": 7.592698231602966e-06, + "loss": 4.0421, + "step": 1336 + }, + { + "epoch": 0.22882081122710937, + "grad_norm": 28.596891403198242, + "learning_rate": 7.59840273816315e-06, + "loss": 3.3059, + "step": 1337 + }, + { + "epoch": 0.2289919561868903, + "grad_norm": 36.07052993774414, + "learning_rate": 7.6041072447233325e-06, + "loss": 4.2618, + "step": 1338 + }, + { + "epoch": 0.22916310114667124, + "grad_norm": 50.589454650878906, + "learning_rate": 7.609811751283515e-06, + "loss": 9.3326, + "step": 1339 + }, + { + "epoch": 0.22933424610645217, + "grad_norm": 31.4276180267334, + "learning_rate": 7.6155162578436975e-06, + "loss": 4.6035, + "step": 1340 + }, + { + "epoch": 0.2295053910662331, + "grad_norm": 32.5452766418457, + "learning_rate": 7.621220764403879e-06, + "loss": 3.9264, + "step": 1341 + }, + { + "epoch": 0.22967653602601404, + "grad_norm": 32.74778747558594, + "learning_rate": 7.626925270964062e-06, + "loss": 4.6618, + "step": 1342 + }, + { + "epoch": 0.22984768098579497, + "grad_norm": 11.447990417480469, + "learning_rate": 7.632629777524244e-06, + "loss": 1.2467, + "step": 1343 + }, + { + "epoch": 0.2300188259455759, + "grad_norm": 19.261301040649414, + "learning_rate": 7.638334284084426e-06, + "loss": 1.4108, + "step": 1344 + }, + { + "epoch": 0.23018997090535684, + "grad_norm": 17.838138580322266, + "learning_rate": 7.64403879064461e-06, + "loss": 1.4796, + "step": 1345 + }, + { + "epoch": 0.23036111586513777, + "grad_norm": 36.09761047363281, + "learning_rate": 7.649743297204791e-06, + "loss": 4.8769, + "step": 1346 + }, + { + "epoch": 0.2305322608249187, + "grad_norm": 17.18463706970215, + "learning_rate": 7.655447803764974e-06, + "loss": 1.6009, + "step": 1347 + }, + { + "epoch": 0.23070340578469964, + "grad_norm": 20.603784561157227, + "learning_rate": 7.661152310325156e-06, + "loss": 3.0856, + "step": 1348 + }, + { + "epoch": 0.23087455074448057, + "grad_norm": 41.716590881347656, + "learning_rate": 7.666856816885341e-06, + "loss": 5.4047, + "step": 1349 + }, + { + "epoch": 0.2310456957042615, + "grad_norm": 181.26748657226562, + "learning_rate": 7.672561323445523e-06, + "loss": 8.5903, + "step": 1350 + }, + { + "epoch": 0.23121684066404244, + "grad_norm": 41.98673629760742, + "learning_rate": 7.678265830005705e-06, + "loss": 5.2161, + "step": 1351 + }, + { + "epoch": 0.23138798562382337, + "grad_norm": 35.29446792602539, + "learning_rate": 7.683970336565888e-06, + "loss": 4.2135, + "step": 1352 + }, + { + "epoch": 0.2315591305836043, + "grad_norm": 164.35977172851562, + "learning_rate": 7.68967484312607e-06, + "loss": 7.3687, + "step": 1353 + }, + { + "epoch": 0.23173027554338524, + "grad_norm": 20.39377784729004, + "learning_rate": 7.695379349686253e-06, + "loss": 1.6669, + "step": 1354 + }, + { + "epoch": 0.23190142050316617, + "grad_norm": 33.71407699584961, + "learning_rate": 7.701083856246435e-06, + "loss": 4.5662, + "step": 1355 + }, + { + "epoch": 0.2320725654629471, + "grad_norm": 9.964597702026367, + "learning_rate": 7.706788362806616e-06, + "loss": 2.2199, + "step": 1356 + }, + { + "epoch": 0.23224371042272804, + "grad_norm": 41.83567810058594, + "learning_rate": 7.7124928693668e-06, + "loss": 5.3555, + "step": 1357 + }, + { + "epoch": 0.23241485538250897, + "grad_norm": 19.700429916381836, + "learning_rate": 7.718197375926981e-06, + "loss": 1.6864, + "step": 1358 + }, + { + "epoch": 0.2325860003422899, + "grad_norm": 32.94630432128906, + "learning_rate": 7.723901882487165e-06, + "loss": 3.5872, + "step": 1359 + }, + { + "epoch": 0.23275714530207087, + "grad_norm": 26.41133689880371, + "learning_rate": 7.729606389047348e-06, + "loss": 3.5806, + "step": 1360 + }, + { + "epoch": 0.2329282902618518, + "grad_norm": 17.184593200683594, + "learning_rate": 7.735310895607532e-06, + "loss": 1.6527, + "step": 1361 + }, + { + "epoch": 0.23309943522163273, + "grad_norm": 11.024751663208008, + "learning_rate": 7.741015402167713e-06, + "loss": 1.2203, + "step": 1362 + }, + { + "epoch": 0.23327058018141367, + "grad_norm": 35.2708625793457, + "learning_rate": 7.746719908727895e-06, + "loss": 4.5728, + "step": 1363 + }, + { + "epoch": 0.2334417251411946, + "grad_norm": 35.836387634277344, + "learning_rate": 7.752424415288078e-06, + "loss": 4.9165, + "step": 1364 + }, + { + "epoch": 0.23361287010097553, + "grad_norm": 24.741012573242188, + "learning_rate": 7.75812892184826e-06, + "loss": 2.2991, + "step": 1365 + }, + { + "epoch": 0.23378401506075647, + "grad_norm": 41.604007720947266, + "learning_rate": 7.763833428408443e-06, + "loss": 4.7384, + "step": 1366 + }, + { + "epoch": 0.2339551600205374, + "grad_norm": 37.068485260009766, + "learning_rate": 7.769537934968625e-06, + "loss": 4.1609, + "step": 1367 + }, + { + "epoch": 0.23412630498031833, + "grad_norm": 31.635995864868164, + "learning_rate": 7.775242441528808e-06, + "loss": 3.6394, + "step": 1368 + }, + { + "epoch": 0.23429744994009927, + "grad_norm": 36.181602478027344, + "learning_rate": 7.78094694808899e-06, + "loss": 3.9604, + "step": 1369 + }, + { + "epoch": 0.2344685948998802, + "grad_norm": 34.47708511352539, + "learning_rate": 7.786651454649172e-06, + "loss": 4.4621, + "step": 1370 + }, + { + "epoch": 0.23463973985966113, + "grad_norm": 36.583919525146484, + "learning_rate": 7.792355961209355e-06, + "loss": 5.4214, + "step": 1371 + }, + { + "epoch": 0.23481088481944207, + "grad_norm": 139.80113220214844, + "learning_rate": 7.798060467769539e-06, + "loss": 7.582, + "step": 1372 + }, + { + "epoch": 0.234982029779223, + "grad_norm": 10.627038955688477, + "learning_rate": 7.803764974329722e-06, + "loss": 1.1265, + "step": 1373 + }, + { + "epoch": 0.23515317473900393, + "grad_norm": 56.01224899291992, + "learning_rate": 7.809469480889904e-06, + "loss": 9.2401, + "step": 1374 + }, + { + "epoch": 0.23532431969878487, + "grad_norm": 13.42536449432373, + "learning_rate": 7.815173987450085e-06, + "loss": 1.3022, + "step": 1375 + }, + { + "epoch": 0.2354954646585658, + "grad_norm": 34.816341400146484, + "learning_rate": 7.820878494010269e-06, + "loss": 4.6249, + "step": 1376 + }, + { + "epoch": 0.23566660961834673, + "grad_norm": 13.037670135498047, + "learning_rate": 7.82658300057045e-06, + "loss": 1.5747, + "step": 1377 + }, + { + "epoch": 0.23583775457812767, + "grad_norm": 38.446537017822266, + "learning_rate": 7.832287507130634e-06, + "loss": 4.9983, + "step": 1378 + }, + { + "epoch": 0.2360088995379086, + "grad_norm": 32.81908416748047, + "learning_rate": 7.837992013690815e-06, + "loss": 3.4363, + "step": 1379 + }, + { + "epoch": 0.23618004449768953, + "grad_norm": 12.17697525024414, + "learning_rate": 7.843696520250999e-06, + "loss": 1.6211, + "step": 1380 + }, + { + "epoch": 0.23635118945747047, + "grad_norm": 35.46131896972656, + "learning_rate": 7.84940102681118e-06, + "loss": 4.8981, + "step": 1381 + }, + { + "epoch": 0.2365223344172514, + "grad_norm": 29.793787002563477, + "learning_rate": 7.855105533371362e-06, + "loss": 3.5648, + "step": 1382 + }, + { + "epoch": 0.23669347937703233, + "grad_norm": 14.550475120544434, + "learning_rate": 7.860810039931547e-06, + "loss": 1.6714, + "step": 1383 + }, + { + "epoch": 0.23686462433681327, + "grad_norm": 36.01753234863281, + "learning_rate": 7.866514546491729e-06, + "loss": 4.936, + "step": 1384 + }, + { + "epoch": 0.23703576929659423, + "grad_norm": 21.261749267578125, + "learning_rate": 7.872219053051912e-06, + "loss": 2.3239, + "step": 1385 + }, + { + "epoch": 0.23720691425637516, + "grad_norm": 160.96620178222656, + "learning_rate": 7.877923559612094e-06, + "loss": 7.9267, + "step": 1386 + }, + { + "epoch": 0.2373780592161561, + "grad_norm": 34.994293212890625, + "learning_rate": 7.883628066172276e-06, + "loss": 4.6021, + "step": 1387 + }, + { + "epoch": 0.23754920417593703, + "grad_norm": 32.08713912963867, + "learning_rate": 7.889332572732459e-06, + "loss": 4.0803, + "step": 1388 + }, + { + "epoch": 0.23772034913571796, + "grad_norm": 36.49545669555664, + "learning_rate": 7.89503707929264e-06, + "loss": 4.4858, + "step": 1389 + }, + { + "epoch": 0.2378914940954989, + "grad_norm": 146.2379608154297, + "learning_rate": 7.900741585852824e-06, + "loss": 8.1082, + "step": 1390 + }, + { + "epoch": 0.23806263905527983, + "grad_norm": 31.705169677734375, + "learning_rate": 7.906446092413006e-06, + "loss": 4.1572, + "step": 1391 + }, + { + "epoch": 0.23823378401506076, + "grad_norm": 13.439140319824219, + "learning_rate": 7.91215059897319e-06, + "loss": 1.1091, + "step": 1392 + }, + { + "epoch": 0.2384049289748417, + "grad_norm": 51.37181854248047, + "learning_rate": 7.91785510553337e-06, + "loss": 9.8544, + "step": 1393 + }, + { + "epoch": 0.23857607393462263, + "grad_norm": 16.763200759887695, + "learning_rate": 7.923559612093553e-06, + "loss": 1.4605, + "step": 1394 + }, + { + "epoch": 0.23874721889440356, + "grad_norm": 32.19613265991211, + "learning_rate": 7.929264118653738e-06, + "loss": 4.0605, + "step": 1395 + }, + { + "epoch": 0.2389183638541845, + "grad_norm": 36.1611442565918, + "learning_rate": 7.93496862521392e-06, + "loss": 4.1027, + "step": 1396 + }, + { + "epoch": 0.23908950881396543, + "grad_norm": 36.234344482421875, + "learning_rate": 7.940673131774103e-06, + "loss": 5.0933, + "step": 1397 + }, + { + "epoch": 0.23926065377374636, + "grad_norm": 39.589111328125, + "learning_rate": 7.946377638334284e-06, + "loss": 5.4176, + "step": 1398 + }, + { + "epoch": 0.2394317987335273, + "grad_norm": 13.162062644958496, + "learning_rate": 7.952082144894468e-06, + "loss": 1.3262, + "step": 1399 + }, + { + "epoch": 0.23960294369330823, + "grad_norm": 11.512036323547363, + "learning_rate": 7.95778665145465e-06, + "loss": 2.8916, + "step": 1400 + }, + { + "epoch": 0.23977408865308916, + "grad_norm": 30.82523536682129, + "learning_rate": 7.963491158014831e-06, + "loss": 3.7983, + "step": 1401 + }, + { + "epoch": 0.2399452336128701, + "grad_norm": 9.881488800048828, + "learning_rate": 7.969195664575014e-06, + "loss": 1.6009, + "step": 1402 + }, + { + "epoch": 0.24011637857265103, + "grad_norm": 26.221534729003906, + "learning_rate": 7.974900171135196e-06, + "loss": 3.2459, + "step": 1403 + }, + { + "epoch": 0.24028752353243196, + "grad_norm": 34.7869987487793, + "learning_rate": 7.98060467769538e-06, + "loss": 4.2736, + "step": 1404 + }, + { + "epoch": 0.2404586684922129, + "grad_norm": 42.81889343261719, + "learning_rate": 7.986309184255561e-06, + "loss": 6.0254, + "step": 1405 + }, + { + "epoch": 0.24062981345199383, + "grad_norm": 35.25808334350586, + "learning_rate": 7.992013690815745e-06, + "loss": 3.8331, + "step": 1406 + }, + { + "epoch": 0.24080095841177476, + "grad_norm": 29.81654167175293, + "learning_rate": 7.997718197375928e-06, + "loss": 3.3841, + "step": 1407 + }, + { + "epoch": 0.2409721033715557, + "grad_norm": 34.251243591308594, + "learning_rate": 8.00342270393611e-06, + "loss": 4.8157, + "step": 1408 + }, + { + "epoch": 0.24114324833133663, + "grad_norm": 31.04636573791504, + "learning_rate": 8.009127210496293e-06, + "loss": 3.4431, + "step": 1409 + }, + { + "epoch": 0.2413143932911176, + "grad_norm": 33.0612678527832, + "learning_rate": 8.014831717056475e-06, + "loss": 3.8054, + "step": 1410 + }, + { + "epoch": 0.24148553825089852, + "grad_norm": 25.215789794921875, + "learning_rate": 8.020536223616658e-06, + "loss": 3.2052, + "step": 1411 + }, + { + "epoch": 0.24165668321067946, + "grad_norm": 22.657257080078125, + "learning_rate": 8.02624073017684e-06, + "loss": 2.5621, + "step": 1412 + }, + { + "epoch": 0.2418278281704604, + "grad_norm": 32.54667282104492, + "learning_rate": 8.031945236737021e-06, + "loss": 4.1257, + "step": 1413 + }, + { + "epoch": 0.24199897313024132, + "grad_norm": 14.109042167663574, + "learning_rate": 8.037649743297205e-06, + "loss": 1.2616, + "step": 1414 + }, + { + "epoch": 0.24217011809002226, + "grad_norm": 35.718116760253906, + "learning_rate": 8.043354249857387e-06, + "loss": 5.263, + "step": 1415 + }, + { + "epoch": 0.2423412630498032, + "grad_norm": 10.830004692077637, + "learning_rate": 8.04905875641757e-06, + "loss": 1.6628, + "step": 1416 + }, + { + "epoch": 0.24251240800958412, + "grad_norm": 21.519893646240234, + "learning_rate": 8.054763262977753e-06, + "loss": 2.2681, + "step": 1417 + }, + { + "epoch": 0.24268355296936506, + "grad_norm": 16.527233123779297, + "learning_rate": 8.060467769537937e-06, + "loss": 1.7274, + "step": 1418 + }, + { + "epoch": 0.242854697929146, + "grad_norm": 17.97334098815918, + "learning_rate": 8.066172276098118e-06, + "loss": 1.4341, + "step": 1419 + }, + { + "epoch": 0.24302584288892692, + "grad_norm": 38.63325500488281, + "learning_rate": 8.0718767826583e-06, + "loss": 5.4521, + "step": 1420 + }, + { + "epoch": 0.24319698784870786, + "grad_norm": 37.572818756103516, + "learning_rate": 8.077581289218483e-06, + "loss": 4.057, + "step": 1421 + }, + { + "epoch": 0.2433681328084888, + "grad_norm": 36.495025634765625, + "learning_rate": 8.083285795778665e-06, + "loss": 5.3841, + "step": 1422 + }, + { + "epoch": 0.24353927776826972, + "grad_norm": 46.322486877441406, + "learning_rate": 8.088990302338848e-06, + "loss": 9.2447, + "step": 1423 + }, + { + "epoch": 0.24371042272805066, + "grad_norm": 32.26517868041992, + "learning_rate": 8.09469480889903e-06, + "loss": 3.4902, + "step": 1424 + }, + { + "epoch": 0.2438815676878316, + "grad_norm": 29.286020278930664, + "learning_rate": 8.100399315459212e-06, + "loss": 3.6562, + "step": 1425 + }, + { + "epoch": 0.24405271264761252, + "grad_norm": 9.768603324890137, + "learning_rate": 8.106103822019395e-06, + "loss": 1.0808, + "step": 1426 + }, + { + "epoch": 0.24422385760739346, + "grad_norm": 40.53557205200195, + "learning_rate": 8.111808328579577e-06, + "loss": 5.3038, + "step": 1427 + }, + { + "epoch": 0.2443950025671744, + "grad_norm": 36.29978561401367, + "learning_rate": 8.11751283513976e-06, + "loss": 4.9487, + "step": 1428 + }, + { + "epoch": 0.24456614752695532, + "grad_norm": 50.365440368652344, + "learning_rate": 8.123217341699944e-06, + "loss": 9.1753, + "step": 1429 + }, + { + "epoch": 0.24473729248673626, + "grad_norm": 25.204608917236328, + "learning_rate": 8.128921848260127e-06, + "loss": 3.06, + "step": 1430 + }, + { + "epoch": 0.2449084374465172, + "grad_norm": 36.821929931640625, + "learning_rate": 8.134626354820309e-06, + "loss": 4.2367, + "step": 1431 + }, + { + "epoch": 0.24507958240629812, + "grad_norm": 9.532563209533691, + "learning_rate": 8.14033086138049e-06, + "loss": 1.0511, + "step": 1432 + }, + { + "epoch": 0.24525072736607906, + "grad_norm": 31.35403060913086, + "learning_rate": 8.146035367940674e-06, + "loss": 4.1655, + "step": 1433 + }, + { + "epoch": 0.24542187232586, + "grad_norm": 29.057531356811523, + "learning_rate": 8.151739874500855e-06, + "loss": 3.6622, + "step": 1434 + }, + { + "epoch": 0.24559301728564092, + "grad_norm": 18.69387435913086, + "learning_rate": 8.157444381061039e-06, + "loss": 1.6006, + "step": 1435 + }, + { + "epoch": 0.24576416224542189, + "grad_norm": 27.337491989135742, + "learning_rate": 8.16314888762122e-06, + "loss": 2.1133, + "step": 1436 + }, + { + "epoch": 0.24593530720520282, + "grad_norm": 59.810035705566406, + "learning_rate": 8.168853394181404e-06, + "loss": 9.2893, + "step": 1437 + }, + { + "epoch": 0.24610645216498375, + "grad_norm": 34.85076141357422, + "learning_rate": 8.174557900741586e-06, + "loss": 4.76, + "step": 1438 + }, + { + "epoch": 0.24627759712476469, + "grad_norm": 16.229951858520508, + "learning_rate": 8.180262407301767e-06, + "loss": 1.111, + "step": 1439 + }, + { + "epoch": 0.24644874208454562, + "grad_norm": 191.14859008789062, + "learning_rate": 8.185966913861952e-06, + "loss": 8.6606, + "step": 1440 + }, + { + "epoch": 0.24661988704432655, + "grad_norm": 25.192026138305664, + "learning_rate": 8.191671420422134e-06, + "loss": 2.2213, + "step": 1441 + }, + { + "epoch": 0.24679103200410749, + "grad_norm": 16.577152252197266, + "learning_rate": 8.197375926982317e-06, + "loss": 1.4564, + "step": 1442 + }, + { + "epoch": 0.24696217696388842, + "grad_norm": 37.47216796875, + "learning_rate": 8.203080433542499e-06, + "loss": 4.9652, + "step": 1443 + }, + { + "epoch": 0.24713332192366935, + "grad_norm": 33.50614547729492, + "learning_rate": 8.20878494010268e-06, + "loss": 3.8217, + "step": 1444 + }, + { + "epoch": 0.24730446688345029, + "grad_norm": 35.54981994628906, + "learning_rate": 8.214489446662864e-06, + "loss": 5.0781, + "step": 1445 + }, + { + "epoch": 0.24747561184323122, + "grad_norm": 29.486570358276367, + "learning_rate": 8.220193953223046e-06, + "loss": 3.4324, + "step": 1446 + }, + { + "epoch": 0.24764675680301215, + "grad_norm": 23.952808380126953, + "learning_rate": 8.22589845978323e-06, + "loss": 2.9791, + "step": 1447 + }, + { + "epoch": 0.24781790176279309, + "grad_norm": 22.885963439941406, + "learning_rate": 8.231602966343411e-06, + "loss": 2.1029, + "step": 1448 + }, + { + "epoch": 0.24798904672257402, + "grad_norm": 38.23826217651367, + "learning_rate": 8.237307472903594e-06, + "loss": 5.1107, + "step": 1449 + }, + { + "epoch": 0.24816019168235495, + "grad_norm": 21.183773040771484, + "learning_rate": 8.243011979463776e-06, + "loss": 2.6462, + "step": 1450 + }, + { + "epoch": 0.24833133664213589, + "grad_norm": 11.436287879943848, + "learning_rate": 8.248716486023958e-06, + "loss": 1.139, + "step": 1451 + }, + { + "epoch": 0.24850248160191682, + "grad_norm": 21.1058349609375, + "learning_rate": 8.254420992584143e-06, + "loss": 2.6237, + "step": 1452 + }, + { + "epoch": 0.24867362656169775, + "grad_norm": 29.661510467529297, + "learning_rate": 8.260125499144324e-06, + "loss": 3.9416, + "step": 1453 + }, + { + "epoch": 0.24884477152147869, + "grad_norm": 25.654918670654297, + "learning_rate": 8.265830005704508e-06, + "loss": 2.9109, + "step": 1454 + }, + { + "epoch": 0.24901591648125962, + "grad_norm": 29.254196166992188, + "learning_rate": 8.27153451226469e-06, + "loss": 3.9703, + "step": 1455 + }, + { + "epoch": 0.24918706144104055, + "grad_norm": 15.34985065460205, + "learning_rate": 8.277239018824871e-06, + "loss": 1.277, + "step": 1456 + }, + { + "epoch": 0.24935820640082149, + "grad_norm": 20.940813064575195, + "learning_rate": 8.282943525385055e-06, + "loss": 2.8225, + "step": 1457 + }, + { + "epoch": 0.24952935136060242, + "grad_norm": 156.33163452148438, + "learning_rate": 8.288648031945236e-06, + "loss": 6.8667, + "step": 1458 + }, + { + "epoch": 0.24970049632038335, + "grad_norm": 142.04833984375, + "learning_rate": 8.29435253850542e-06, + "loss": 7.7845, + "step": 1459 + }, + { + "epoch": 0.24987164128016429, + "grad_norm": 52.80269241333008, + "learning_rate": 8.300057045065601e-06, + "loss": 9.2945, + "step": 1460 + }, + { + "epoch": 0.25004278623994525, + "grad_norm": 36.25229263305664, + "learning_rate": 8.305761551625785e-06, + "loss": 4.1385, + "step": 1461 + }, + { + "epoch": 0.2502139311997262, + "grad_norm": 32.63280487060547, + "learning_rate": 8.311466058185966e-06, + "loss": 4.9526, + "step": 1462 + }, + { + "epoch": 0.2503850761595071, + "grad_norm": 36.09181213378906, + "learning_rate": 8.31717056474615e-06, + "loss": 4.9655, + "step": 1463 + }, + { + "epoch": 0.25055622111928805, + "grad_norm": 13.666475296020508, + "learning_rate": 8.322875071306333e-06, + "loss": 1.2171, + "step": 1464 + }, + { + "epoch": 0.250727366079069, + "grad_norm": 21.431262969970703, + "learning_rate": 8.328579577866515e-06, + "loss": 2.0253, + "step": 1465 + }, + { + "epoch": 0.2508985110388499, + "grad_norm": 34.866493225097656, + "learning_rate": 8.334284084426698e-06, + "loss": 4.7963, + "step": 1466 + }, + { + "epoch": 0.25106965599863085, + "grad_norm": 28.299697875976562, + "learning_rate": 8.33998859098688e-06, + "loss": 3.2393, + "step": 1467 + }, + { + "epoch": 0.2512408009584118, + "grad_norm": 30.702220916748047, + "learning_rate": 8.345693097547063e-06, + "loss": 4.459, + "step": 1468 + }, + { + "epoch": 0.2514119459181927, + "grad_norm": 35.572662353515625, + "learning_rate": 8.351397604107245e-06, + "loss": 4.0362, + "step": 1469 + }, + { + "epoch": 0.25158309087797365, + "grad_norm": 31.228361129760742, + "learning_rate": 8.357102110667427e-06, + "loss": 3.7291, + "step": 1470 + }, + { + "epoch": 0.2517542358377546, + "grad_norm": 158.43309020996094, + "learning_rate": 8.36280661722761e-06, + "loss": 7.5395, + "step": 1471 + }, + { + "epoch": 0.2519253807975355, + "grad_norm": 26.111873626708984, + "learning_rate": 8.368511123787792e-06, + "loss": 3.2816, + "step": 1472 + }, + { + "epoch": 0.25209652575731645, + "grad_norm": 152.1773681640625, + "learning_rate": 8.374215630347975e-06, + "loss": 9.2757, + "step": 1473 + }, + { + "epoch": 0.2522676707170974, + "grad_norm": 28.91309928894043, + "learning_rate": 8.379920136908157e-06, + "loss": 3.8, + "step": 1474 + }, + { + "epoch": 0.2524388156768783, + "grad_norm": 138.71820068359375, + "learning_rate": 8.38562464346834e-06, + "loss": 8.3701, + "step": 1475 + }, + { + "epoch": 0.25260996063665925, + "grad_norm": 10.94738483428955, + "learning_rate": 8.391329150028524e-06, + "loss": 1.0987, + "step": 1476 + }, + { + "epoch": 0.2527811055964402, + "grad_norm": 33.45675277709961, + "learning_rate": 8.397033656588705e-06, + "loss": 3.8679, + "step": 1477 + }, + { + "epoch": 0.2529522505562211, + "grad_norm": 30.219728469848633, + "learning_rate": 8.402738163148889e-06, + "loss": 3.7668, + "step": 1478 + }, + { + "epoch": 0.25312339551600205, + "grad_norm": 153.4755859375, + "learning_rate": 8.40844266970907e-06, + "loss": 8.493, + "step": 1479 + }, + { + "epoch": 0.253294540475783, + "grad_norm": 27.030277252197266, + "learning_rate": 8.414147176269254e-06, + "loss": 3.6373, + "step": 1480 + }, + { + "epoch": 0.2534656854355639, + "grad_norm": 26.931581497192383, + "learning_rate": 8.419851682829435e-06, + "loss": 2.4114, + "step": 1481 + }, + { + "epoch": 0.25363683039534485, + "grad_norm": 33.86345672607422, + "learning_rate": 8.425556189389617e-06, + "loss": 4.18, + "step": 1482 + }, + { + "epoch": 0.2538079753551258, + "grad_norm": 40.67789840698242, + "learning_rate": 8.4312606959498e-06, + "loss": 5.2501, + "step": 1483 + }, + { + "epoch": 0.2539791203149067, + "grad_norm": 11.627734184265137, + "learning_rate": 8.436965202509982e-06, + "loss": 1.2352, + "step": 1484 + }, + { + "epoch": 0.25415026527468765, + "grad_norm": 27.1390438079834, + "learning_rate": 8.442669709070165e-06, + "loss": 2.4447, + "step": 1485 + }, + { + "epoch": 0.2543214102344686, + "grad_norm": 33.907615661621094, + "learning_rate": 8.448374215630349e-06, + "loss": 5.7028, + "step": 1486 + }, + { + "epoch": 0.2544925551942495, + "grad_norm": 34.770687103271484, + "learning_rate": 8.45407872219053e-06, + "loss": 5.4022, + "step": 1487 + }, + { + "epoch": 0.25466370015403045, + "grad_norm": 87.67970275878906, + "learning_rate": 8.459783228750714e-06, + "loss": 7.2429, + "step": 1488 + }, + { + "epoch": 0.2548348451138114, + "grad_norm": 36.1263313293457, + "learning_rate": 8.465487735310896e-06, + "loss": 4.7788, + "step": 1489 + }, + { + "epoch": 0.2550059900735923, + "grad_norm": 35.22165298461914, + "learning_rate": 8.471192241871079e-06, + "loss": 4.132, + "step": 1490 + }, + { + "epoch": 0.25517713503337325, + "grad_norm": 28.420682907104492, + "learning_rate": 8.47689674843126e-06, + "loss": 3.6288, + "step": 1491 + }, + { + "epoch": 0.2553482799931542, + "grad_norm": 36.37025451660156, + "learning_rate": 8.482601254991444e-06, + "loss": 5.1911, + "step": 1492 + }, + { + "epoch": 0.2555194249529351, + "grad_norm": 40.647789001464844, + "learning_rate": 8.488305761551626e-06, + "loss": 5.5946, + "step": 1493 + }, + { + "epoch": 0.25569056991271605, + "grad_norm": 19.504039764404297, + "learning_rate": 8.494010268111807e-06, + "loss": 1.7075, + "step": 1494 + }, + { + "epoch": 0.255861714872497, + "grad_norm": 32.866695404052734, + "learning_rate": 8.49971477467199e-06, + "loss": 4.4763, + "step": 1495 + }, + { + "epoch": 0.2560328598322779, + "grad_norm": 33.1104736328125, + "learning_rate": 8.505419281232172e-06, + "loss": 4.4053, + "step": 1496 + }, + { + "epoch": 0.25620400479205885, + "grad_norm": 22.860944747924805, + "learning_rate": 8.511123787792358e-06, + "loss": 2.5604, + "step": 1497 + }, + { + "epoch": 0.2563751497518398, + "grad_norm": 34.79046630859375, + "learning_rate": 8.51682829435254e-06, + "loss": 4.993, + "step": 1498 + }, + { + "epoch": 0.25654629471162077, + "grad_norm": 28.405912399291992, + "learning_rate": 8.522532800912723e-06, + "loss": 3.3138, + "step": 1499 + }, + { + "epoch": 0.2567174396714017, + "grad_norm": 32.89986038208008, + "learning_rate": 8.528237307472904e-06, + "loss": 3.1908, + "step": 1500 + }, + { + "epoch": 0.25688858463118264, + "grad_norm": 20.201610565185547, + "learning_rate": 8.533941814033086e-06, + "loss": 1.974, + "step": 1501 + }, + { + "epoch": 0.25705972959096357, + "grad_norm": 32.933231353759766, + "learning_rate": 8.53964632059327e-06, + "loss": 4.8342, + "step": 1502 + }, + { + "epoch": 0.2572308745507445, + "grad_norm": 25.67669105529785, + "learning_rate": 8.545350827153451e-06, + "loss": 2.8345, + "step": 1503 + }, + { + "epoch": 0.25740201951052544, + "grad_norm": 50.461097717285156, + "learning_rate": 8.551055333713634e-06, + "loss": 6.9385, + "step": 1504 + }, + { + "epoch": 0.25757316447030637, + "grad_norm": 32.42000198364258, + "learning_rate": 8.556759840273816e-06, + "loss": 3.4542, + "step": 1505 + }, + { + "epoch": 0.2577443094300873, + "grad_norm": 29.946523666381836, + "learning_rate": 8.562464346833998e-06, + "loss": 3.2486, + "step": 1506 + }, + { + "epoch": 0.25791545438986824, + "grad_norm": 17.451496124267578, + "learning_rate": 8.568168853394181e-06, + "loss": 1.4946, + "step": 1507 + }, + { + "epoch": 0.25808659934964917, + "grad_norm": 30.164350509643555, + "learning_rate": 8.573873359954363e-06, + "loss": 3.8272, + "step": 1508 + }, + { + "epoch": 0.2582577443094301, + "grad_norm": 26.747682571411133, + "learning_rate": 8.579577866514548e-06, + "loss": 3.0653, + "step": 1509 + }, + { + "epoch": 0.25842888926921104, + "grad_norm": 20.9317626953125, + "learning_rate": 8.58528237307473e-06, + "loss": 1.8431, + "step": 1510 + }, + { + "epoch": 0.25860003422899197, + "grad_norm": 36.90618896484375, + "learning_rate": 8.590986879634913e-06, + "loss": 3.7371, + "step": 1511 + }, + { + "epoch": 0.2587711791887729, + "grad_norm": 19.612281799316406, + "learning_rate": 8.596691386195095e-06, + "loss": 1.4799, + "step": 1512 + }, + { + "epoch": 0.25894232414855384, + "grad_norm": 35.63535690307617, + "learning_rate": 8.602395892755276e-06, + "loss": 4.2458, + "step": 1513 + }, + { + "epoch": 0.25911346910833477, + "grad_norm": 37.25559997558594, + "learning_rate": 8.60810039931546e-06, + "loss": 3.7735, + "step": 1514 + }, + { + "epoch": 0.2592846140681157, + "grad_norm": 26.81685447692871, + "learning_rate": 8.613804905875641e-06, + "loss": 2.621, + "step": 1515 + }, + { + "epoch": 0.25945575902789664, + "grad_norm": 22.918485641479492, + "learning_rate": 8.619509412435825e-06, + "loss": 1.6105, + "step": 1516 + }, + { + "epoch": 0.25962690398767757, + "grad_norm": 12.06033992767334, + "learning_rate": 8.625213918996006e-06, + "loss": 1.1731, + "step": 1517 + }, + { + "epoch": 0.2597980489474585, + "grad_norm": 35.15945053100586, + "learning_rate": 8.63091842555619e-06, + "loss": 3.8198, + "step": 1518 + }, + { + "epoch": 0.25996919390723944, + "grad_norm": 13.90102767944336, + "learning_rate": 8.636622932116372e-06, + "loss": 2.736, + "step": 1519 + }, + { + "epoch": 0.26014033886702037, + "grad_norm": 35.0964469909668, + "learning_rate": 8.642327438676555e-06, + "loss": 4.3737, + "step": 1520 + }, + { + "epoch": 0.2603114838268013, + "grad_norm": 33.16070556640625, + "learning_rate": 8.648031945236738e-06, + "loss": 3.8065, + "step": 1521 + }, + { + "epoch": 0.26048262878658224, + "grad_norm": 16.28618621826172, + "learning_rate": 8.65373645179692e-06, + "loss": 1.257, + "step": 1522 + }, + { + "epoch": 0.26065377374636317, + "grad_norm": 28.174516677856445, + "learning_rate": 8.659440958357103e-06, + "loss": 3.7114, + "step": 1523 + }, + { + "epoch": 0.2608249187061441, + "grad_norm": 26.44544792175293, + "learning_rate": 8.665145464917285e-06, + "loss": 2.829, + "step": 1524 + }, + { + "epoch": 0.26099606366592504, + "grad_norm": 38.186378479003906, + "learning_rate": 8.670849971477467e-06, + "loss": 4.3011, + "step": 1525 + }, + { + "epoch": 0.26116720862570597, + "grad_norm": 206.24801635742188, + "learning_rate": 8.67655447803765e-06, + "loss": 9.2851, + "step": 1526 + }, + { + "epoch": 0.2613383535854869, + "grad_norm": 33.12008285522461, + "learning_rate": 8.682258984597832e-06, + "loss": 4.3036, + "step": 1527 + }, + { + "epoch": 0.26150949854526784, + "grad_norm": 136.57029724121094, + "learning_rate": 8.687963491158015e-06, + "loss": 8.4189, + "step": 1528 + }, + { + "epoch": 0.26168064350504877, + "grad_norm": 40.36309051513672, + "learning_rate": 8.693667997718197e-06, + "loss": 5.4948, + "step": 1529 + }, + { + "epoch": 0.2618517884648297, + "grad_norm": 19.74286651611328, + "learning_rate": 8.69937250427838e-06, + "loss": 2.0893, + "step": 1530 + }, + { + "epoch": 0.26202293342461064, + "grad_norm": 33.62118148803711, + "learning_rate": 8.705077010838562e-06, + "loss": 3.796, + "step": 1531 + }, + { + "epoch": 0.26219407838439157, + "grad_norm": 36.64006805419922, + "learning_rate": 8.710781517398745e-06, + "loss": 3.9848, + "step": 1532 + }, + { + "epoch": 0.2623652233441725, + "grad_norm": 12.980084419250488, + "learning_rate": 8.716486023958929e-06, + "loss": 1.1166, + "step": 1533 + }, + { + "epoch": 0.26253636830395344, + "grad_norm": 35.808021545410156, + "learning_rate": 8.72219053051911e-06, + "loss": 4.3018, + "step": 1534 + }, + { + "epoch": 0.26270751326373437, + "grad_norm": 51.2911491394043, + "learning_rate": 8.727895037079294e-06, + "loss": 9.237, + "step": 1535 + }, + { + "epoch": 0.2628786582235153, + "grad_norm": 26.75223731994629, + "learning_rate": 8.733599543639475e-06, + "loss": 3.3625, + "step": 1536 + }, + { + "epoch": 0.26304980318329624, + "grad_norm": 81.07520294189453, + "learning_rate": 8.739304050199659e-06, + "loss": 7.5686, + "step": 1537 + }, + { + "epoch": 0.26322094814307717, + "grad_norm": 37.027191162109375, + "learning_rate": 8.74500855675984e-06, + "loss": 3.7701, + "step": 1538 + }, + { + "epoch": 0.2633920931028581, + "grad_norm": 47.393333435058594, + "learning_rate": 8.750713063320022e-06, + "loss": 9.0139, + "step": 1539 + }, + { + "epoch": 0.26356323806263904, + "grad_norm": 34.1210823059082, + "learning_rate": 8.756417569880206e-06, + "loss": 4.4995, + "step": 1540 + }, + { + "epoch": 0.26373438302241997, + "grad_norm": 14.312548637390137, + "learning_rate": 8.762122076440387e-06, + "loss": 2.1827, + "step": 1541 + }, + { + "epoch": 0.2639055279822009, + "grad_norm": 30.19961166381836, + "learning_rate": 8.76782658300057e-06, + "loss": 3.9737, + "step": 1542 + }, + { + "epoch": 0.26407667294198184, + "grad_norm": 10.720991134643555, + "learning_rate": 8.773531089560754e-06, + "loss": 1.1108, + "step": 1543 + }, + { + "epoch": 0.26424781790176277, + "grad_norm": 26.29660987854004, + "learning_rate": 8.779235596120936e-06, + "loss": 2.9509, + "step": 1544 + }, + { + "epoch": 0.2644189628615437, + "grad_norm": 7.651371479034424, + "learning_rate": 8.784940102681119e-06, + "loss": 0.8929, + "step": 1545 + }, + { + "epoch": 0.26459010782132464, + "grad_norm": 32.411407470703125, + "learning_rate": 8.7906446092413e-06, + "loss": 3.9279, + "step": 1546 + }, + { + "epoch": 0.26476125278110557, + "grad_norm": 43.62602233886719, + "learning_rate": 8.796349115801484e-06, + "loss": 8.7932, + "step": 1547 + }, + { + "epoch": 0.2649323977408865, + "grad_norm": 28.391075134277344, + "learning_rate": 8.802053622361666e-06, + "loss": 3.3049, + "step": 1548 + }, + { + "epoch": 0.26510354270066744, + "grad_norm": 35.11864471435547, + "learning_rate": 8.80775812892185e-06, + "loss": 4.0323, + "step": 1549 + }, + { + "epoch": 0.2652746876604484, + "grad_norm": 10.911874771118164, + "learning_rate": 8.813462635482031e-06, + "loss": 1.3744, + "step": 1550 + }, + { + "epoch": 0.26544583262022936, + "grad_norm": 22.232980728149414, + "learning_rate": 8.819167142042213e-06, + "loss": 1.972, + "step": 1551 + }, + { + "epoch": 0.2656169775800103, + "grad_norm": 171.640625, + "learning_rate": 8.824871648602396e-06, + "loss": 8.4712, + "step": 1552 + }, + { + "epoch": 0.2657881225397912, + "grad_norm": 30.831897735595703, + "learning_rate": 8.830576155162578e-06, + "loss": 3.5869, + "step": 1553 + }, + { + "epoch": 0.26595926749957216, + "grad_norm": 36.305782318115234, + "learning_rate": 8.836280661722761e-06, + "loss": 4.9009, + "step": 1554 + }, + { + "epoch": 0.2661304124593531, + "grad_norm": 44.463626861572266, + "learning_rate": 8.841985168282944e-06, + "loss": 4.6015, + "step": 1555 + }, + { + "epoch": 0.266301557419134, + "grad_norm": 22.66800308227539, + "learning_rate": 8.847689674843126e-06, + "loss": 2.1498, + "step": 1556 + }, + { + "epoch": 0.26647270237891496, + "grad_norm": 30.886274337768555, + "learning_rate": 8.85339418140331e-06, + "loss": 4.3322, + "step": 1557 + }, + { + "epoch": 0.2666438473386959, + "grad_norm": 34.30126190185547, + "learning_rate": 8.859098687963491e-06, + "loss": 4.5378, + "step": 1558 + }, + { + "epoch": 0.2668149922984768, + "grad_norm": 36.92926025390625, + "learning_rate": 8.864803194523674e-06, + "loss": 4.2903, + "step": 1559 + }, + { + "epoch": 0.26698613725825776, + "grad_norm": 34.588077545166016, + "learning_rate": 8.870507701083856e-06, + "loss": 4.9088, + "step": 1560 + }, + { + "epoch": 0.2671572822180387, + "grad_norm": 30.621044158935547, + "learning_rate": 8.87621220764404e-06, + "loss": 3.6051, + "step": 1561 + }, + { + "epoch": 0.2673284271778196, + "grad_norm": 30.107677459716797, + "learning_rate": 8.881916714204221e-06, + "loss": 3.4027, + "step": 1562 + }, + { + "epoch": 0.26749957213760056, + "grad_norm": 16.614532470703125, + "learning_rate": 8.887621220764403e-06, + "loss": 1.5846, + "step": 1563 + }, + { + "epoch": 0.2676707170973815, + "grad_norm": 35.577842712402344, + "learning_rate": 8.893325727324586e-06, + "loss": 4.2335, + "step": 1564 + }, + { + "epoch": 0.2678418620571624, + "grad_norm": 33.13545227050781, + "learning_rate": 8.899030233884768e-06, + "loss": 4.6539, + "step": 1565 + }, + { + "epoch": 0.26801300701694336, + "grad_norm": 170.64297485351562, + "learning_rate": 8.904734740444953e-06, + "loss": 9.3362, + "step": 1566 + }, + { + "epoch": 0.2681841519767243, + "grad_norm": 12.3065185546875, + "learning_rate": 8.910439247005135e-06, + "loss": 1.573, + "step": 1567 + }, + { + "epoch": 0.2683552969365052, + "grad_norm": 38.08529281616211, + "learning_rate": 8.916143753565318e-06, + "loss": 3.7314, + "step": 1568 + }, + { + "epoch": 0.26852644189628616, + "grad_norm": 169.76089477539062, + "learning_rate": 8.9218482601255e-06, + "loss": 9.6942, + "step": 1569 + }, + { + "epoch": 0.2686975868560671, + "grad_norm": 38.42169952392578, + "learning_rate": 8.927552766685681e-06, + "loss": 5.3158, + "step": 1570 + }, + { + "epoch": 0.268868731815848, + "grad_norm": 14.410723686218262, + "learning_rate": 8.933257273245865e-06, + "loss": 1.2377, + "step": 1571 + }, + { + "epoch": 0.26903987677562896, + "grad_norm": 52.682533264160156, + "learning_rate": 8.938961779806047e-06, + "loss": 6.516, + "step": 1572 + }, + { + "epoch": 0.2692110217354099, + "grad_norm": 34.07759094238281, + "learning_rate": 8.94466628636623e-06, + "loss": 4.013, + "step": 1573 + }, + { + "epoch": 0.26938216669519083, + "grad_norm": 29.74109649658203, + "learning_rate": 8.950370792926412e-06, + "loss": 3.4177, + "step": 1574 + }, + { + "epoch": 0.26955331165497176, + "grad_norm": 35.098876953125, + "learning_rate": 8.956075299486593e-06, + "loss": 4.1055, + "step": 1575 + }, + { + "epoch": 0.2697244566147527, + "grad_norm": 50.082366943359375, + "learning_rate": 8.961779806046777e-06, + "loss": 8.4876, + "step": 1576 + }, + { + "epoch": 0.26989560157453363, + "grad_norm": 116.58244323730469, + "learning_rate": 8.96748431260696e-06, + "loss": 7.8558, + "step": 1577 + }, + { + "epoch": 0.27006674653431456, + "grad_norm": 32.75837326049805, + "learning_rate": 8.973188819167143e-06, + "loss": 3.8977, + "step": 1578 + }, + { + "epoch": 0.2702378914940955, + "grad_norm": 13.686226844787598, + "learning_rate": 8.978893325727325e-06, + "loss": 1.5984, + "step": 1579 + }, + { + "epoch": 0.27040903645387643, + "grad_norm": 31.057418823242188, + "learning_rate": 8.984597832287508e-06, + "loss": 4.2033, + "step": 1580 + }, + { + "epoch": 0.27058018141365736, + "grad_norm": 31.405447006225586, + "learning_rate": 8.99030233884769e-06, + "loss": 3.2895, + "step": 1581 + }, + { + "epoch": 0.2707513263734383, + "grad_norm": 29.978918075561523, + "learning_rate": 8.996006845407872e-06, + "loss": 4.0648, + "step": 1582 + }, + { + "epoch": 0.27092247133321923, + "grad_norm": 11.317312240600586, + "learning_rate": 9.001711351968055e-06, + "loss": 0.9835, + "step": 1583 + }, + { + "epoch": 0.27109361629300016, + "grad_norm": 17.877771377563477, + "learning_rate": 9.007415858528237e-06, + "loss": 1.4293, + "step": 1584 + }, + { + "epoch": 0.2712647612527811, + "grad_norm": 26.353673934936523, + "learning_rate": 9.01312036508842e-06, + "loss": 2.6549, + "step": 1585 + }, + { + "epoch": 0.27143590621256203, + "grad_norm": 31.735876083374023, + "learning_rate": 9.018824871648602e-06, + "loss": 3.9997, + "step": 1586 + }, + { + "epoch": 0.27160705117234296, + "grad_norm": 35.91917037963867, + "learning_rate": 9.024529378208785e-06, + "loss": 4.2824, + "step": 1587 + }, + { + "epoch": 0.2717781961321239, + "grad_norm": 32.27674865722656, + "learning_rate": 9.030233884768967e-06, + "loss": 4.0964, + "step": 1588 + }, + { + "epoch": 0.27194934109190483, + "grad_norm": 37.242549896240234, + "learning_rate": 9.03593839132915e-06, + "loss": 4.4567, + "step": 1589 + }, + { + "epoch": 0.27212048605168576, + "grad_norm": 15.34211540222168, + "learning_rate": 9.041642897889334e-06, + "loss": 1.1567, + "step": 1590 + }, + { + "epoch": 0.2722916310114667, + "grad_norm": 35.38195037841797, + "learning_rate": 9.047347404449515e-06, + "loss": 4.7975, + "step": 1591 + }, + { + "epoch": 0.27246277597124763, + "grad_norm": 29.104900360107422, + "learning_rate": 9.053051911009699e-06, + "loss": 3.1354, + "step": 1592 + }, + { + "epoch": 0.27263392093102856, + "grad_norm": 15.004528999328613, + "learning_rate": 9.05875641756988e-06, + "loss": 1.1248, + "step": 1593 + }, + { + "epoch": 0.2728050658908095, + "grad_norm": 26.269655227661133, + "learning_rate": 9.064460924130062e-06, + "loss": 2.0743, + "step": 1594 + }, + { + "epoch": 0.27297621085059043, + "grad_norm": 19.79959487915039, + "learning_rate": 9.070165430690246e-06, + "loss": 1.3031, + "step": 1595 + }, + { + "epoch": 0.27314735581037136, + "grad_norm": 43.51731491088867, + "learning_rate": 9.075869937250427e-06, + "loss": 4.4293, + "step": 1596 + }, + { + "epoch": 0.2733185007701523, + "grad_norm": 7.138434410095215, + "learning_rate": 9.08157444381061e-06, + "loss": 0.8485, + "step": 1597 + }, + { + "epoch": 0.27348964572993323, + "grad_norm": 32.309593200683594, + "learning_rate": 9.087278950370792e-06, + "loss": 3.4497, + "step": 1598 + }, + { + "epoch": 0.27366079068971416, + "grad_norm": 24.805715560913086, + "learning_rate": 9.092983456930976e-06, + "loss": 2.9256, + "step": 1599 + }, + { + "epoch": 0.2738319356494951, + "grad_norm": 61.22898483276367, + "learning_rate": 9.098687963491159e-06, + "loss": 5.9283, + "step": 1600 + }, + { + "epoch": 0.2740030806092761, + "grad_norm": 29.417680740356445, + "learning_rate": 9.10439247005134e-06, + "loss": 3.8084, + "step": 1601 + }, + { + "epoch": 0.274174225569057, + "grad_norm": 34.00372314453125, + "learning_rate": 9.110096976611524e-06, + "loss": 3.4933, + "step": 1602 + }, + { + "epoch": 0.27434537052883795, + "grad_norm": 14.374422073364258, + "learning_rate": 9.115801483171706e-06, + "loss": 1.4626, + "step": 1603 + }, + { + "epoch": 0.2745165154886189, + "grad_norm": 12.729880332946777, + "learning_rate": 9.12150598973189e-06, + "loss": 1.1151, + "step": 1604 + }, + { + "epoch": 0.2746876604483998, + "grad_norm": 17.94257164001465, + "learning_rate": 9.127210496292071e-06, + "loss": 1.3846, + "step": 1605 + }, + { + "epoch": 0.27485880540818075, + "grad_norm": 38.29545974731445, + "learning_rate": 9.132915002852253e-06, + "loss": 4.5905, + "step": 1606 + }, + { + "epoch": 0.2750299503679617, + "grad_norm": 35.37318420410156, + "learning_rate": 9.138619509412436e-06, + "loss": 4.3784, + "step": 1607 + }, + { + "epoch": 0.2752010953277426, + "grad_norm": 35.77292251586914, + "learning_rate": 9.144324015972618e-06, + "loss": 3.315, + "step": 1608 + }, + { + "epoch": 0.27537224028752355, + "grad_norm": 38.70093536376953, + "learning_rate": 9.150028522532801e-06, + "loss": 5.4718, + "step": 1609 + }, + { + "epoch": 0.2755433852473045, + "grad_norm": 185.0310516357422, + "learning_rate": 9.155733029092983e-06, + "loss": 7.5009, + "step": 1610 + }, + { + "epoch": 0.2757145302070854, + "grad_norm": 28.145288467407227, + "learning_rate": 9.161437535653166e-06, + "loss": 2.8764, + "step": 1611 + }, + { + "epoch": 0.27588567516686635, + "grad_norm": 7.594282150268555, + "learning_rate": 9.16714204221335e-06, + "loss": 0.8713, + "step": 1612 + }, + { + "epoch": 0.2760568201266473, + "grad_norm": 32.899845123291016, + "learning_rate": 9.172846548773531e-06, + "loss": 4.6094, + "step": 1613 + }, + { + "epoch": 0.2762279650864282, + "grad_norm": 39.75630569458008, + "learning_rate": 9.178551055333715e-06, + "loss": 4.5632, + "step": 1614 + }, + { + "epoch": 0.27639911004620915, + "grad_norm": 29.607851028442383, + "learning_rate": 9.184255561893896e-06, + "loss": 2.9606, + "step": 1615 + }, + { + "epoch": 0.2765702550059901, + "grad_norm": 76.37677001953125, + "learning_rate": 9.18996006845408e-06, + "loss": 7.355, + "step": 1616 + }, + { + "epoch": 0.276741399965771, + "grad_norm": 22.215526580810547, + "learning_rate": 9.195664575014261e-06, + "loss": 2.8241, + "step": 1617 + }, + { + "epoch": 0.27691254492555195, + "grad_norm": 9.465276718139648, + "learning_rate": 9.201369081574445e-06, + "loss": 0.9882, + "step": 1618 + }, + { + "epoch": 0.2770836898853329, + "grad_norm": 27.726600646972656, + "learning_rate": 9.207073588134626e-06, + "loss": 3.238, + "step": 1619 + }, + { + "epoch": 0.2772548348451138, + "grad_norm": 35.69710922241211, + "learning_rate": 9.212778094694808e-06, + "loss": 4.4113, + "step": 1620 + }, + { + "epoch": 0.27742597980489475, + "grad_norm": 34.97329330444336, + "learning_rate": 9.218482601254991e-06, + "loss": 5.005, + "step": 1621 + }, + { + "epoch": 0.2775971247646757, + "grad_norm": 18.749282836914062, + "learning_rate": 9.224187107815173e-06, + "loss": 1.7009, + "step": 1622 + }, + { + "epoch": 0.2777682697244566, + "grad_norm": 130.61004638671875, + "learning_rate": 9.229891614375358e-06, + "loss": 7.8661, + "step": 1623 + }, + { + "epoch": 0.27793941468423755, + "grad_norm": 12.980770111083984, + "learning_rate": 9.23559612093554e-06, + "loss": 1.1125, + "step": 1624 + }, + { + "epoch": 0.2781105596440185, + "grad_norm": 46.32781219482422, + "learning_rate": 9.241300627495722e-06, + "loss": 8.7552, + "step": 1625 + }, + { + "epoch": 0.2782817046037994, + "grad_norm": 101.5696029663086, + "learning_rate": 9.247005134055905e-06, + "loss": 7.1054, + "step": 1626 + }, + { + "epoch": 0.27845284956358035, + "grad_norm": 22.125795364379883, + "learning_rate": 9.252709640616087e-06, + "loss": 1.7911, + "step": 1627 + }, + { + "epoch": 0.2786239945233613, + "grad_norm": 34.277095794677734, + "learning_rate": 9.25841414717627e-06, + "loss": 4.438, + "step": 1628 + }, + { + "epoch": 0.2787951394831422, + "grad_norm": 22.72269058227539, + "learning_rate": 9.264118653736452e-06, + "loss": 1.7455, + "step": 1629 + }, + { + "epoch": 0.27896628444292315, + "grad_norm": 30.11455726623535, + "learning_rate": 9.269823160296635e-06, + "loss": 3.3549, + "step": 1630 + }, + { + "epoch": 0.2791374294027041, + "grad_norm": 34.13120651245117, + "learning_rate": 9.275527666856817e-06, + "loss": 3.7081, + "step": 1631 + }, + { + "epoch": 0.279308574362485, + "grad_norm": 8.457001686096191, + "learning_rate": 9.281232173416998e-06, + "loss": 0.9564, + "step": 1632 + }, + { + "epoch": 0.27947971932226595, + "grad_norm": 38.574615478515625, + "learning_rate": 9.286936679977182e-06, + "loss": 4.3973, + "step": 1633 + }, + { + "epoch": 0.2796508642820469, + "grad_norm": 11.158347129821777, + "learning_rate": 9.292641186537364e-06, + "loss": 0.9696, + "step": 1634 + }, + { + "epoch": 0.2798220092418278, + "grad_norm": 11.847931861877441, + "learning_rate": 9.298345693097549e-06, + "loss": 1.5377, + "step": 1635 + }, + { + "epoch": 0.27999315420160875, + "grad_norm": 11.096319198608398, + "learning_rate": 9.30405019965773e-06, + "loss": 1.136, + "step": 1636 + }, + { + "epoch": 0.2801642991613897, + "grad_norm": 36.63529586791992, + "learning_rate": 9.309754706217914e-06, + "loss": 5.2998, + "step": 1637 + }, + { + "epoch": 0.2803354441211706, + "grad_norm": 30.421175003051758, + "learning_rate": 9.315459212778095e-06, + "loss": 3.5562, + "step": 1638 + }, + { + "epoch": 0.28050658908095155, + "grad_norm": 34.89402770996094, + "learning_rate": 9.321163719338277e-06, + "loss": 4.9255, + "step": 1639 + }, + { + "epoch": 0.2806777340407325, + "grad_norm": 28.486478805541992, + "learning_rate": 9.32686822589846e-06, + "loss": 3.4583, + "step": 1640 + }, + { + "epoch": 0.2808488790005134, + "grad_norm": 7.498641490936279, + "learning_rate": 9.332572732458642e-06, + "loss": 0.8123, + "step": 1641 + }, + { + "epoch": 0.28102002396029435, + "grad_norm": 47.50094223022461, + "learning_rate": 9.338277239018825e-06, + "loss": 7.894, + "step": 1642 + }, + { + "epoch": 0.2811911689200753, + "grad_norm": 62.95503616333008, + "learning_rate": 9.343981745579007e-06, + "loss": 6.7316, + "step": 1643 + }, + { + "epoch": 0.2813623138798562, + "grad_norm": 26.29498291015625, + "learning_rate": 9.349686252139189e-06, + "loss": 2.9299, + "step": 1644 + }, + { + "epoch": 0.28153345883963715, + "grad_norm": 13.663917541503906, + "learning_rate": 9.355390758699372e-06, + "loss": 1.6658, + "step": 1645 + }, + { + "epoch": 0.2817046037994181, + "grad_norm": 31.745132446289062, + "learning_rate": 9.361095265259556e-06, + "loss": 4.9097, + "step": 1646 + }, + { + "epoch": 0.281875748759199, + "grad_norm": 16.757953643798828, + "learning_rate": 9.366799771819739e-06, + "loss": 1.4769, + "step": 1647 + }, + { + "epoch": 0.28204689371897995, + "grad_norm": 21.601877212524414, + "learning_rate": 9.37250427837992e-06, + "loss": 1.7352, + "step": 1648 + }, + { + "epoch": 0.2822180386787609, + "grad_norm": 38.61962127685547, + "learning_rate": 9.378208784940104e-06, + "loss": 4.4803, + "step": 1649 + }, + { + "epoch": 0.2823891836385418, + "grad_norm": 31.342639923095703, + "learning_rate": 9.383913291500286e-06, + "loss": 3.8044, + "step": 1650 + }, + { + "epoch": 0.28256032859832275, + "grad_norm": 9.416754722595215, + "learning_rate": 9.389617798060467e-06, + "loss": 0.804, + "step": 1651 + }, + { + "epoch": 0.28273147355810374, + "grad_norm": 31.227413177490234, + "learning_rate": 9.39532230462065e-06, + "loss": 4.1229, + "step": 1652 + }, + { + "epoch": 0.2829026185178847, + "grad_norm": 13.257563591003418, + "learning_rate": 9.401026811180832e-06, + "loss": 1.1089, + "step": 1653 + }, + { + "epoch": 0.2830737634776656, + "grad_norm": 33.36773681640625, + "learning_rate": 9.406731317741016e-06, + "loss": 4.6453, + "step": 1654 + }, + { + "epoch": 0.28324490843744654, + "grad_norm": 30.116289138793945, + "learning_rate": 9.412435824301198e-06, + "loss": 3.3475, + "step": 1655 + }, + { + "epoch": 0.2834160533972275, + "grad_norm": 9.72807502746582, + "learning_rate": 9.41814033086138e-06, + "loss": 1.3987, + "step": 1656 + }, + { + "epoch": 0.2835871983570084, + "grad_norm": 35.53730392456055, + "learning_rate": 9.423844837421563e-06, + "loss": 4.4274, + "step": 1657 + }, + { + "epoch": 0.28375834331678934, + "grad_norm": 25.7310733795166, + "learning_rate": 9.429549343981746e-06, + "loss": 3.1681, + "step": 1658 + }, + { + "epoch": 0.2839294882765703, + "grad_norm": 41.159175872802734, + "learning_rate": 9.43525385054193e-06, + "loss": 5.0249, + "step": 1659 + }, + { + "epoch": 0.2841006332363512, + "grad_norm": 44.80512619018555, + "learning_rate": 9.440958357102111e-06, + "loss": 8.5485, + "step": 1660 + }, + { + "epoch": 0.28427177819613214, + "grad_norm": 30.980173110961914, + "learning_rate": 9.446662863662294e-06, + "loss": 3.4326, + "step": 1661 + }, + { + "epoch": 0.2844429231559131, + "grad_norm": 34.32295608520508, + "learning_rate": 9.452367370222476e-06, + "loss": 3.1846, + "step": 1662 + }, + { + "epoch": 0.284614068115694, + "grad_norm": 31.66938591003418, + "learning_rate": 9.458071876782658e-06, + "loss": 3.5118, + "step": 1663 + }, + { + "epoch": 0.28478521307547494, + "grad_norm": 32.4676513671875, + "learning_rate": 9.463776383342841e-06, + "loss": 4.7146, + "step": 1664 + }, + { + "epoch": 0.2849563580352559, + "grad_norm": 10.913191795349121, + "learning_rate": 9.469480889903023e-06, + "loss": 0.9731, + "step": 1665 + }, + { + "epoch": 0.2851275029950368, + "grad_norm": 35.5974006652832, + "learning_rate": 9.475185396463206e-06, + "loss": 4.3522, + "step": 1666 + }, + { + "epoch": 0.28529864795481774, + "grad_norm": 33.59803771972656, + "learning_rate": 9.480889903023388e-06, + "loss": 3.3641, + "step": 1667 + }, + { + "epoch": 0.2854697929145987, + "grad_norm": 35.429466247558594, + "learning_rate": 9.486594409583571e-06, + "loss": 3.9219, + "step": 1668 + }, + { + "epoch": 0.2856409378743796, + "grad_norm": 13.85142707824707, + "learning_rate": 9.492298916143755e-06, + "loss": 1.3341, + "step": 1669 + }, + { + "epoch": 0.28581208283416054, + "grad_norm": 9.107728004455566, + "learning_rate": 9.498003422703936e-06, + "loss": 0.8871, + "step": 1670 + }, + { + "epoch": 0.2859832277939415, + "grad_norm": 35.564979553222656, + "learning_rate": 9.50370792926412e-06, + "loss": 4.1451, + "step": 1671 + }, + { + "epoch": 0.2861543727537224, + "grad_norm": 27.561506271362305, + "learning_rate": 9.509412435824301e-06, + "loss": 3.3942, + "step": 1672 + }, + { + "epoch": 0.28632551771350334, + "grad_norm": 35.57343292236328, + "learning_rate": 9.515116942384485e-06, + "loss": 3.7111, + "step": 1673 + }, + { + "epoch": 0.2864966626732843, + "grad_norm": 39.25908279418945, + "learning_rate": 9.520821448944666e-06, + "loss": 4.3091, + "step": 1674 + }, + { + "epoch": 0.2866678076330652, + "grad_norm": 41.76926803588867, + "learning_rate": 9.526525955504848e-06, + "loss": 4.7086, + "step": 1675 + }, + { + "epoch": 0.28683895259284614, + "grad_norm": 30.626611709594727, + "learning_rate": 9.532230462065032e-06, + "loss": 3.1129, + "step": 1676 + }, + { + "epoch": 0.2870100975526271, + "grad_norm": 15.441875457763672, + "learning_rate": 9.537934968625213e-06, + "loss": 1.2888, + "step": 1677 + }, + { + "epoch": 0.287181242512408, + "grad_norm": 28.600982666015625, + "learning_rate": 9.543639475185397e-06, + "loss": 2.9279, + "step": 1678 + }, + { + "epoch": 0.28735238747218894, + "grad_norm": 31.3085994720459, + "learning_rate": 9.549343981745578e-06, + "loss": 3.8741, + "step": 1679 + }, + { + "epoch": 0.2875235324319699, + "grad_norm": 207.9216766357422, + "learning_rate": 9.555048488305763e-06, + "loss": 8.5201, + "step": 1680 + }, + { + "epoch": 0.2876946773917508, + "grad_norm": 38.76487731933594, + "learning_rate": 9.560752994865945e-06, + "loss": 4.8258, + "step": 1681 + }, + { + "epoch": 0.28786582235153174, + "grad_norm": 35.18633270263672, + "learning_rate": 9.566457501426127e-06, + "loss": 3.9555, + "step": 1682 + }, + { + "epoch": 0.2880369673113127, + "grad_norm": 153.19830322265625, + "learning_rate": 9.57216200798631e-06, + "loss": 8.1017, + "step": 1683 + }, + { + "epoch": 0.2882081122710936, + "grad_norm": 8.444355010986328, + "learning_rate": 9.577866514546492e-06, + "loss": 0.8761, + "step": 1684 + }, + { + "epoch": 0.28837925723087454, + "grad_norm": 44.78715515136719, + "learning_rate": 9.583571021106675e-06, + "loss": 8.4681, + "step": 1685 + }, + { + "epoch": 0.2885504021906555, + "grad_norm": 25.710901260375977, + "learning_rate": 9.589275527666857e-06, + "loss": 3.2682, + "step": 1686 + }, + { + "epoch": 0.2887215471504364, + "grad_norm": 161.20376586914062, + "learning_rate": 9.59498003422704e-06, + "loss": 8.3231, + "step": 1687 + }, + { + "epoch": 0.28889269211021734, + "grad_norm": 36.88936996459961, + "learning_rate": 9.600684540787222e-06, + "loss": 4.4629, + "step": 1688 + }, + { + "epoch": 0.2890638370699983, + "grad_norm": 33.05325698852539, + "learning_rate": 9.606389047347404e-06, + "loss": 4.2398, + "step": 1689 + }, + { + "epoch": 0.2892349820297792, + "grad_norm": 31.297021865844727, + "learning_rate": 9.612093553907587e-06, + "loss": 3.9676, + "step": 1690 + }, + { + "epoch": 0.28940612698956014, + "grad_norm": 33.626365661621094, + "learning_rate": 9.617798060467769e-06, + "loss": 4.2342, + "step": 1691 + }, + { + "epoch": 0.2895772719493411, + "grad_norm": 32.812740325927734, + "learning_rate": 9.623502567027954e-06, + "loss": 3.9633, + "step": 1692 + }, + { + "epoch": 0.289748416909122, + "grad_norm": 16.281417846679688, + "learning_rate": 9.629207073588135e-06, + "loss": 1.3504, + "step": 1693 + }, + { + "epoch": 0.28991956186890294, + "grad_norm": 36.80635070800781, + "learning_rate": 9.634911580148317e-06, + "loss": 4.8133, + "step": 1694 + }, + { + "epoch": 0.2900907068286839, + "grad_norm": 36.548397064208984, + "learning_rate": 9.6406160867085e-06, + "loss": 4.0484, + "step": 1695 + }, + { + "epoch": 0.2902618517884648, + "grad_norm": 35.513729095458984, + "learning_rate": 9.646320593268682e-06, + "loss": 4.3281, + "step": 1696 + }, + { + "epoch": 0.29043299674824574, + "grad_norm": 33.258995056152344, + "learning_rate": 9.652025099828866e-06, + "loss": 4.3749, + "step": 1697 + }, + { + "epoch": 0.2906041417080267, + "grad_norm": 30.854419708251953, + "learning_rate": 9.657729606389047e-06, + "loss": 3.4016, + "step": 1698 + }, + { + "epoch": 0.2907752866678076, + "grad_norm": 8.308602333068848, + "learning_rate": 9.66343411294923e-06, + "loss": 1.221, + "step": 1699 + }, + { + "epoch": 0.29094643162758854, + "grad_norm": 10.515448570251465, + "learning_rate": 9.669138619509412e-06, + "loss": 0.9434, + "step": 1700 + }, + { + "epoch": 0.2911175765873695, + "grad_norm": 20.784067153930664, + "learning_rate": 9.674843126069594e-06, + "loss": 2.2873, + "step": 1701 + }, + { + "epoch": 0.2912887215471504, + "grad_norm": 44.417884826660156, + "learning_rate": 9.680547632629777e-06, + "loss": 8.2452, + "step": 1702 + }, + { + "epoch": 0.2914598665069314, + "grad_norm": 28.057279586791992, + "learning_rate": 9.68625213918996e-06, + "loss": 2.7355, + "step": 1703 + }, + { + "epoch": 0.29163101146671233, + "grad_norm": 50.71089553833008, + "learning_rate": 9.691956645750144e-06, + "loss": 8.431, + "step": 1704 + }, + { + "epoch": 0.29180215642649326, + "grad_norm": 44.918087005615234, + "learning_rate": 9.697661152310326e-06, + "loss": 7.8944, + "step": 1705 + }, + { + "epoch": 0.2919733013862742, + "grad_norm": 143.09837341308594, + "learning_rate": 9.703365658870507e-06, + "loss": 7.8241, + "step": 1706 + }, + { + "epoch": 0.29214444634605513, + "grad_norm": 22.652225494384766, + "learning_rate": 9.709070165430691e-06, + "loss": 2.888, + "step": 1707 + }, + { + "epoch": 0.29231559130583606, + "grad_norm": 32.992774963378906, + "learning_rate": 9.714774671990873e-06, + "loss": 3.7048, + "step": 1708 + }, + { + "epoch": 0.292486736265617, + "grad_norm": 30.531761169433594, + "learning_rate": 9.720479178551056e-06, + "loss": 3.8219, + "step": 1709 + }, + { + "epoch": 0.29265788122539793, + "grad_norm": 39.57463073730469, + "learning_rate": 9.726183685111238e-06, + "loss": 4.5677, + "step": 1710 + }, + { + "epoch": 0.29282902618517886, + "grad_norm": 32.177650451660156, + "learning_rate": 9.731888191671421e-06, + "loss": 3.9562, + "step": 1711 + }, + { + "epoch": 0.2930001711449598, + "grad_norm": 11.071613311767578, + "learning_rate": 9.737592698231603e-06, + "loss": 1.3818, + "step": 1712 + }, + { + "epoch": 0.29317131610474073, + "grad_norm": 183.07089233398438, + "learning_rate": 9.743297204791784e-06, + "loss": 8.202, + "step": 1713 + }, + { + "epoch": 0.29334246106452166, + "grad_norm": 127.64228057861328, + "learning_rate": 9.749001711351968e-06, + "loss": 7.7497, + "step": 1714 + }, + { + "epoch": 0.2935136060243026, + "grad_norm": 30.094449996948242, + "learning_rate": 9.754706217912151e-06, + "loss": 3.133, + "step": 1715 + }, + { + "epoch": 0.29368475098408353, + "grad_norm": 33.56199264526367, + "learning_rate": 9.760410724472334e-06, + "loss": 3.9969, + "step": 1716 + }, + { + "epoch": 0.29385589594386446, + "grad_norm": 30.969953536987305, + "learning_rate": 9.766115231032516e-06, + "loss": 3.2142, + "step": 1717 + }, + { + "epoch": 0.2940270409036454, + "grad_norm": 26.745988845825195, + "learning_rate": 9.7718197375927e-06, + "loss": 2.5008, + "step": 1718 + }, + { + "epoch": 0.29419818586342633, + "grad_norm": 19.184772491455078, + "learning_rate": 9.777524244152881e-06, + "loss": 1.7379, + "step": 1719 + }, + { + "epoch": 0.29436933082320726, + "grad_norm": 30.3228759765625, + "learning_rate": 9.783228750713063e-06, + "loss": 3.6131, + "step": 1720 + }, + { + "epoch": 0.2945404757829882, + "grad_norm": 30.700254440307617, + "learning_rate": 9.788933257273246e-06, + "loss": 3.4613, + "step": 1721 + }, + { + "epoch": 0.29471162074276913, + "grad_norm": 35.11033248901367, + "learning_rate": 9.794637763833428e-06, + "loss": 4.2554, + "step": 1722 + }, + { + "epoch": 0.29488276570255006, + "grad_norm": 47.508277893066406, + "learning_rate": 9.800342270393611e-06, + "loss": 8.2195, + "step": 1723 + }, + { + "epoch": 0.295053910662331, + "grad_norm": 35.247528076171875, + "learning_rate": 9.806046776953793e-06, + "loss": 4.3316, + "step": 1724 + }, + { + "epoch": 0.29522505562211193, + "grad_norm": 27.610280990600586, + "learning_rate": 9.811751283513975e-06, + "loss": 3.7587, + "step": 1725 + }, + { + "epoch": 0.29539620058189286, + "grad_norm": 34.314918518066406, + "learning_rate": 9.81745579007416e-06, + "loss": 4.4546, + "step": 1726 + }, + { + "epoch": 0.2955673455416738, + "grad_norm": 31.43994140625, + "learning_rate": 9.823160296634341e-06, + "loss": 3.8653, + "step": 1727 + }, + { + "epoch": 0.29573849050145473, + "grad_norm": 15.655001640319824, + "learning_rate": 9.828864803194525e-06, + "loss": 1.2015, + "step": 1728 + }, + { + "epoch": 0.29590963546123566, + "grad_norm": 13.799985885620117, + "learning_rate": 9.834569309754707e-06, + "loss": 1.6485, + "step": 1729 + }, + { + "epoch": 0.2960807804210166, + "grad_norm": 35.408145904541016, + "learning_rate": 9.84027381631489e-06, + "loss": 4.6912, + "step": 1730 + }, + { + "epoch": 0.29625192538079753, + "grad_norm": 33.258941650390625, + "learning_rate": 9.845978322875072e-06, + "loss": 3.8243, + "step": 1731 + }, + { + "epoch": 0.29642307034057847, + "grad_norm": 34.537960052490234, + "learning_rate": 9.851682829435253e-06, + "loss": 3.6863, + "step": 1732 + }, + { + "epoch": 0.2965942153003594, + "grad_norm": 25.667997360229492, + "learning_rate": 9.857387335995437e-06, + "loss": 3.21, + "step": 1733 + }, + { + "epoch": 0.29676536026014033, + "grad_norm": 146.46380615234375, + "learning_rate": 9.863091842555618e-06, + "loss": 7.1136, + "step": 1734 + }, + { + "epoch": 0.29693650521992127, + "grad_norm": 20.732595443725586, + "learning_rate": 9.868796349115802e-06, + "loss": 2.1932, + "step": 1735 + }, + { + "epoch": 0.2971076501797022, + "grad_norm": 37.78299331665039, + "learning_rate": 9.874500855675983e-06, + "loss": 4.4365, + "step": 1736 + }, + { + "epoch": 0.29727879513948313, + "grad_norm": 30.049827575683594, + "learning_rate": 9.880205362236167e-06, + "loss": 3.2983, + "step": 1737 + }, + { + "epoch": 0.29744994009926407, + "grad_norm": 12.33377742767334, + "learning_rate": 9.88590986879635e-06, + "loss": 1.4362, + "step": 1738 + }, + { + "epoch": 0.297621085059045, + "grad_norm": 24.165996551513672, + "learning_rate": 9.891614375356532e-06, + "loss": 2.7512, + "step": 1739 + }, + { + "epoch": 0.29779223001882593, + "grad_norm": 34.980438232421875, + "learning_rate": 9.897318881916715e-06, + "loss": 3.4089, + "step": 1740 + }, + { + "epoch": 0.29796337497860687, + "grad_norm": 52.22333526611328, + "learning_rate": 9.903023388476897e-06, + "loss": 8.55, + "step": 1741 + }, + { + "epoch": 0.2981345199383878, + "grad_norm": 30.178720474243164, + "learning_rate": 9.90872789503708e-06, + "loss": 3.7629, + "step": 1742 + }, + { + "epoch": 0.29830566489816873, + "grad_norm": 12.83564281463623, + "learning_rate": 9.914432401597262e-06, + "loss": 1.5206, + "step": 1743 + }, + { + "epoch": 0.29847680985794967, + "grad_norm": 116.20635223388672, + "learning_rate": 9.920136908157444e-06, + "loss": 7.1701, + "step": 1744 + }, + { + "epoch": 0.2986479548177306, + "grad_norm": 28.332143783569336, + "learning_rate": 9.925841414717627e-06, + "loss": 4.0808, + "step": 1745 + }, + { + "epoch": 0.29881909977751153, + "grad_norm": 17.009302139282227, + "learning_rate": 9.931545921277809e-06, + "loss": 1.1237, + "step": 1746 + }, + { + "epoch": 0.29899024473729247, + "grad_norm": 22.102079391479492, + "learning_rate": 9.937250427837992e-06, + "loss": 2.1591, + "step": 1747 + }, + { + "epoch": 0.2991613896970734, + "grad_norm": 31.704936981201172, + "learning_rate": 9.942954934398174e-06, + "loss": 3.3555, + "step": 1748 + }, + { + "epoch": 0.29933253465685433, + "grad_norm": 7.139681816101074, + "learning_rate": 9.948659440958359e-06, + "loss": 0.8492, + "step": 1749 + }, + { + "epoch": 0.29950367961663527, + "grad_norm": 37.93485641479492, + "learning_rate": 9.95436394751854e-06, + "loss": 4.3445, + "step": 1750 + }, + { + "epoch": 0.2996748245764162, + "grad_norm": 23.79175567626953, + "learning_rate": 9.960068454078722e-06, + "loss": 2.6891, + "step": 1751 + }, + { + "epoch": 0.29984596953619713, + "grad_norm": 26.583223342895508, + "learning_rate": 9.965772960638906e-06, + "loss": 3.0936, + "step": 1752 + }, + { + "epoch": 0.3000171144959781, + "grad_norm": 16.86503791809082, + "learning_rate": 9.971477467199087e-06, + "loss": 1.5367, + "step": 1753 + } + ], + "logging_steps": 1, + "max_steps": 17529, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1753, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}