diff --git "a/checkpoint-8765/trainer_state.json" "b/checkpoint-8765/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-8765/trainer_state.json" @@ -0,0 +1,62558 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.5000855724798905, + "eval_steps": 877, + "global_step": 8765, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00017114495978093444, + "grad_norm": NaN, + "learning_rate": 0.0, + "loss": 9.5502, + "step": 1 + }, + { + "epoch": 0.0003422899195618689, + "grad_norm": NaN, + "learning_rate": 0.0, + "loss": 17.5546, + "step": 2 + }, + { + "epoch": 0.0005134348793428033, + "grad_norm": Infinity, + "learning_rate": 0.0, + "loss": 25.9988, + "step": 3 + }, + { + "epoch": 0.0006845798391237378, + "grad_norm": 60.18782043457031, + "learning_rate": 5.704506560182544e-09, + "loss": 9.6042, + "step": 4 + }, + { + "epoch": 0.0008557247989046722, + "grad_norm": Infinity, + "learning_rate": 5.704506560182544e-09, + "loss": 17.9169, + "step": 5 + }, + { + "epoch": 0.0010268697586856067, + "grad_norm": 84.36212158203125, + "learning_rate": 1.1409013120365088e-08, + "loss": 10.1633, + "step": 6 + }, + { + "epoch": 0.001198014718466541, + "grad_norm": 82.20382690429688, + "learning_rate": 1.711351968054763e-08, + "loss": 8.4392, + "step": 7 + }, + { + "epoch": 0.0013691596782474755, + "grad_norm": 16.687606811523438, + "learning_rate": 2.2818026240730176e-08, + "loss": 6.4113, + "step": 8 + }, + { + "epoch": 0.00154030463802841, + "grad_norm": NaN, + "learning_rate": 2.2818026240730176e-08, + "loss": 17.185, + "step": 9 + }, + { + "epoch": 0.0017114495978093444, + "grad_norm": 48.527183532714844, + "learning_rate": 2.852253280091272e-08, + "loss": 7.868, + "step": 10 + }, + { + "epoch": 0.0018825945575902789, + "grad_norm": 76.19969177246094, + "learning_rate": 3.422703936109526e-08, + "loss": 25.5097, + "step": 11 + }, + { + "epoch": 0.0020537395173712133, + "grad_norm": 44.624080657958984, + "learning_rate": 3.9931545921277814e-08, + "loss": 8.533, + "step": 12 + }, + { + "epoch": 0.002224884477152148, + "grad_norm": 68.30242156982422, + "learning_rate": 4.563605248146035e-08, + "loss": 12.1618, + "step": 13 + }, + { + "epoch": 0.002396029436933082, + "grad_norm": 77.02323913574219, + "learning_rate": 5.1340559041642904e-08, + "loss": 10.1531, + "step": 14 + }, + { + "epoch": 0.002567174396714017, + "grad_norm": 37.27943420410156, + "learning_rate": 5.704506560182544e-08, + "loss": 8.2236, + "step": 15 + }, + { + "epoch": 0.002738319356494951, + "grad_norm": 47.14276123046875, + "learning_rate": 6.274957216200798e-08, + "loss": 6.6764, + "step": 16 + }, + { + "epoch": 0.0029094643162758858, + "grad_norm": 68.288818359375, + "learning_rate": 6.845407872219053e-08, + "loss": 9.6404, + "step": 17 + }, + { + "epoch": 0.00308060927605682, + "grad_norm": 72.5563735961914, + "learning_rate": 7.415858528237308e-08, + "loss": 7.2553, + "step": 18 + }, + { + "epoch": 0.0032517542358377546, + "grad_norm": 133.2047576904297, + "learning_rate": 7.986309184255563e-08, + "loss": 17.6761, + "step": 19 + }, + { + "epoch": 0.003422899195618689, + "grad_norm": 96.22279357910156, + "learning_rate": 8.556759840273816e-08, + "loss": 23.6993, + "step": 20 + }, + { + "epoch": 0.0035940441553996235, + "grad_norm": 72.40164947509766, + "learning_rate": 9.12721049629207e-08, + "loss": 12.5069, + "step": 21 + }, + { + "epoch": 0.0037651891151805577, + "grad_norm": 58.689125061035156, + 
"learning_rate": 9.697661152310325e-08, + "loss": 10.4915, + "step": 22 + }, + { + "epoch": 0.003936334074961492, + "grad_norm": 41.391170501708984, + "learning_rate": 1.0268111808328581e-07, + "loss": 8.2323, + "step": 23 + }, + { + "epoch": 0.004107479034742427, + "grad_norm": 60.9368896484375, + "learning_rate": 1.0838562464346835e-07, + "loss": 9.4007, + "step": 24 + }, + { + "epoch": 0.004278623994523361, + "grad_norm": 132.59597778320312, + "learning_rate": 1.1409013120365088e-07, + "loss": 16.9119, + "step": 25 + }, + { + "epoch": 0.004449768954304296, + "grad_norm": 72.2205581665039, + "learning_rate": 1.1979463776383346e-07, + "loss": 12.5137, + "step": 26 + }, + { + "epoch": 0.00462091391408523, + "grad_norm": 69.2486572265625, + "learning_rate": 1.2549914432401596e-07, + "loss": 10.2477, + "step": 27 + }, + { + "epoch": 0.004792058873866164, + "grad_norm": 30.01091194152832, + "learning_rate": 1.3120365088419852e-07, + "loss": 6.6456, + "step": 28 + }, + { + "epoch": 0.0049632038336470995, + "grad_norm": 75.28530883789062, + "learning_rate": 1.3690815744438105e-07, + "loss": 9.7946, + "step": 29 + }, + { + "epoch": 0.005134348793428034, + "grad_norm": 70.37921142578125, + "learning_rate": 1.426126640045636e-07, + "loss": 12.4969, + "step": 30 + }, + { + "epoch": 0.005305493753208968, + "grad_norm": 90.83671569824219, + "learning_rate": 1.4831717056474617e-07, + "loss": 7.6589, + "step": 31 + }, + { + "epoch": 0.005476638712989902, + "grad_norm": 65.92588806152344, + "learning_rate": 1.540216771249287e-07, + "loss": 9.7764, + "step": 32 + }, + { + "epoch": 0.005647783672770837, + "grad_norm": 86.14967346191406, + "learning_rate": 1.5972618368511126e-07, + "loss": 8.2129, + "step": 33 + }, + { + "epoch": 0.0058189286325517715, + "grad_norm": 145.2432098388672, + "learning_rate": 1.654306902452938e-07, + "loss": 17.7377, + "step": 34 + }, + { + "epoch": 0.005990073592332706, + "grad_norm": 64.69364929199219, + "learning_rate": 1.7113519680547632e-07, + "loss": 9.9994, + "step": 35 + }, + { + "epoch": 0.00616121855211364, + "grad_norm": 58.662803649902344, + "learning_rate": 1.7683970336565888e-07, + "loss": 11.459, + "step": 36 + }, + { + "epoch": 0.006332363511894575, + "grad_norm": 123.47699737548828, + "learning_rate": 1.825442099258414e-07, + "loss": 16.278, + "step": 37 + }, + { + "epoch": 0.006503508471675509, + "grad_norm": 40.24553680419922, + "learning_rate": 1.8824871648602397e-07, + "loss": 8.1959, + "step": 38 + }, + { + "epoch": 0.0066746534314564435, + "grad_norm": 71.55098724365234, + "learning_rate": 1.939532230462065e-07, + "loss": 12.4429, + "step": 39 + }, + { + "epoch": 0.006845798391237378, + "grad_norm": 73.94329833984375, + "learning_rate": 1.9965772960638906e-07, + "loss": 12.4672, + "step": 40 + }, + { + "epoch": 0.007016943351018313, + "grad_norm": 78.10585021972656, + "learning_rate": 2.0536223616657162e-07, + "loss": 12.6372, + "step": 41 + }, + { + "epoch": 0.007188088310799247, + "grad_norm": 154.09982299804688, + "learning_rate": 2.1106674272675415e-07, + "loss": 18.2102, + "step": 42 + }, + { + "epoch": 0.007359233270580181, + "grad_norm": 69.93523406982422, + "learning_rate": 2.167712492869367e-07, + "loss": 10.2355, + "step": 43 + }, + { + "epoch": 0.0075303782303611155, + "grad_norm": 53.971500396728516, + "learning_rate": 2.224757558471192e-07, + "loss": 11.0821, + "step": 44 + }, + { + "epoch": 0.007701523190142051, + "grad_norm": 79.0840835571289, + "learning_rate": 2.2818026240730177e-07, + "loss": 12.838, + "step": 45 + }, + { + "epoch": 
0.007872668149922985, + "grad_norm": 68.8064956665039, + "learning_rate": 2.3388476896748433e-07, + "loss": 11.9172, + "step": 46 + }, + { + "epoch": 0.00804381310970392, + "grad_norm": 114.59835815429688, + "learning_rate": 2.395892755276669e-07, + "loss": 16.4603, + "step": 47 + }, + { + "epoch": 0.008214958069484853, + "grad_norm": 76.0896987915039, + "learning_rate": 2.452937820878494e-07, + "loss": 12.7007, + "step": 48 + }, + { + "epoch": 0.008386103029265788, + "grad_norm": 45.47982406616211, + "learning_rate": 2.509982886480319e-07, + "loss": 10.2213, + "step": 49 + }, + { + "epoch": 0.008557247989046722, + "grad_norm": 136.47836303710938, + "learning_rate": 2.567027952082145e-07, + "loss": 17.2777, + "step": 50 + }, + { + "epoch": 0.008728392948827657, + "grad_norm": 57.07033920288086, + "learning_rate": 2.6240730176839704e-07, + "loss": 10.8704, + "step": 51 + }, + { + "epoch": 0.008899537908608592, + "grad_norm": 50.97236633300781, + "learning_rate": 2.681118083285796e-07, + "loss": 10.8266, + "step": 52 + }, + { + "epoch": 0.009070682868389525, + "grad_norm": 166.14840698242188, + "learning_rate": 2.738163148887621e-07, + "loss": 17.2345, + "step": 53 + }, + { + "epoch": 0.00924182782817046, + "grad_norm": 154.5965576171875, + "learning_rate": 2.795208214489447e-07, + "loss": 18.5922, + "step": 54 + }, + { + "epoch": 0.009412972787951395, + "grad_norm": 61.19700622558594, + "learning_rate": 2.852253280091272e-07, + "loss": 9.3514, + "step": 55 + }, + { + "epoch": 0.009584117747732329, + "grad_norm": 64.54351806640625, + "learning_rate": 2.909298345693098e-07, + "loss": 7.0867, + "step": 56 + }, + { + "epoch": 0.009755262707513264, + "grad_norm": 41.97494888305664, + "learning_rate": 2.9663434112949233e-07, + "loss": 8.319, + "step": 57 + }, + { + "epoch": 0.009926407667294199, + "grad_norm": 158.86936950683594, + "learning_rate": 3.023388476896748e-07, + "loss": 18.7347, + "step": 58 + }, + { + "epoch": 0.010097552627075132, + "grad_norm": 93.42990112304688, + "learning_rate": 3.080433542498574e-07, + "loss": 24.4121, + "step": 59 + }, + { + "epoch": 0.010268697586856067, + "grad_norm": 40.078529357910156, + "learning_rate": 3.1374786081003993e-07, + "loss": 8.4254, + "step": 60 + }, + { + "epoch": 0.010439842546637, + "grad_norm": 172.25357055664062, + "learning_rate": 3.194523673702225e-07, + "loss": 17.8112, + "step": 61 + }, + { + "epoch": 0.010610987506417936, + "grad_norm": 64.05378723144531, + "learning_rate": 3.2515687393040504e-07, + "loss": 11.648, + "step": 62 + }, + { + "epoch": 0.010782132466198871, + "grad_norm": 31.915542602539062, + "learning_rate": 3.308613804905876e-07, + "loss": 6.576, + "step": 63 + }, + { + "epoch": 0.010953277425979804, + "grad_norm": 97.90230560302734, + "learning_rate": 3.365658870507701e-07, + "loss": 24.4094, + "step": 64 + }, + { + "epoch": 0.01112442238576074, + "grad_norm": 49.965126037597656, + "learning_rate": 3.4227039361095264e-07, + "loss": 10.7296, + "step": 65 + }, + { + "epoch": 0.011295567345541675, + "grad_norm": 103.74539947509766, + "learning_rate": 3.479749001711352e-07, + "loss": 24.3446, + "step": 66 + }, + { + "epoch": 0.011466712305322608, + "grad_norm": 100.20292663574219, + "learning_rate": 3.5367940673131776e-07, + "loss": 24.4799, + "step": 67 + }, + { + "epoch": 0.011637857265103543, + "grad_norm": 87.23709869384766, + "learning_rate": 3.593839132915003e-07, + "loss": 7.8589, + "step": 68 + }, + { + "epoch": 0.011809002224884476, + "grad_norm": 105.97203063964844, + "learning_rate": 3.650884198516828e-07, + 
"loss": 24.5711, + "step": 69 + }, + { + "epoch": 0.011980147184665411, + "grad_norm": 139.98709106445312, + "learning_rate": 3.707929264118654e-07, + "loss": 18.0228, + "step": 70 + }, + { + "epoch": 0.012151292144446347, + "grad_norm": 32.724159240722656, + "learning_rate": 3.7649743297204793e-07, + "loss": 8.2503, + "step": 71 + }, + { + "epoch": 0.01232243710422728, + "grad_norm": 55.843509674072266, + "learning_rate": 3.822019395322305e-07, + "loss": 7.1578, + "step": 72 + }, + { + "epoch": 0.012493582064008215, + "grad_norm": 183.3024444580078, + "learning_rate": 3.87906446092413e-07, + "loss": 18.0044, + "step": 73 + }, + { + "epoch": 0.01266472702378915, + "grad_norm": 118.88136291503906, + "learning_rate": 3.9361095265259553e-07, + "loss": 15.586, + "step": 74 + }, + { + "epoch": 0.012835871983570083, + "grad_norm": 64.66754150390625, + "learning_rate": 3.993154592127781e-07, + "loss": 12.3289, + "step": 75 + }, + { + "epoch": 0.013007016943351019, + "grad_norm": 62.58869552612305, + "learning_rate": 4.0501996577296065e-07, + "loss": 8.9949, + "step": 76 + }, + { + "epoch": 0.013178161903131952, + "grad_norm": 66.82809448242188, + "learning_rate": 4.1072447233314323e-07, + "loss": 12.1411, + "step": 77 + }, + { + "epoch": 0.013349306862912887, + "grad_norm": 144.4269256591797, + "learning_rate": 4.164289788933257e-07, + "loss": 18.5364, + "step": 78 + }, + { + "epoch": 0.013520451822693822, + "grad_norm": 60.626834869384766, + "learning_rate": 4.221334854535083e-07, + "loss": 11.9262, + "step": 79 + }, + { + "epoch": 0.013691596782474755, + "grad_norm": 139.23158264160156, + "learning_rate": 4.278379920136908e-07, + "loss": 17.5537, + "step": 80 + }, + { + "epoch": 0.01386274174225569, + "grad_norm": 74.82394409179688, + "learning_rate": 4.335424985738734e-07, + "loss": 12.6047, + "step": 81 + }, + { + "epoch": 0.014033886702036626, + "grad_norm": 128.57716369628906, + "learning_rate": 4.3924700513405594e-07, + "loss": 17.2671, + "step": 82 + }, + { + "epoch": 0.014205031661817559, + "grad_norm": 50.15757369995117, + "learning_rate": 4.449515116942384e-07, + "loss": 9.8308, + "step": 83 + }, + { + "epoch": 0.014376176621598494, + "grad_norm": 46.99615478515625, + "learning_rate": 4.50656018254421e-07, + "loss": 8.6575, + "step": 84 + }, + { + "epoch": 0.01454732158137943, + "grad_norm": 61.31080627441406, + "learning_rate": 4.5636052481460354e-07, + "loss": 6.8446, + "step": 85 + }, + { + "epoch": 0.014718466541160363, + "grad_norm": 60.068443298339844, + "learning_rate": 4.620650313747861e-07, + "loss": 11.2968, + "step": 86 + }, + { + "epoch": 0.014889611500941298, + "grad_norm": 67.9156265258789, + "learning_rate": 4.6776953793496865e-07, + "loss": 12.672, + "step": 87 + }, + { + "epoch": 0.015060756460722231, + "grad_norm": 49.77825164794922, + "learning_rate": 4.734740444951512e-07, + "loss": 9.5329, + "step": 88 + }, + { + "epoch": 0.015231901420503166, + "grad_norm": 57.99795913696289, + "learning_rate": 4.791785510553338e-07, + "loss": 11.3883, + "step": 89 + }, + { + "epoch": 0.015403046380284101, + "grad_norm": 54.070491790771484, + "learning_rate": 4.848830576155162e-07, + "loss": 11.282, + "step": 90 + }, + { + "epoch": 0.015574191340065035, + "grad_norm": 46.44948959350586, + "learning_rate": 4.905875641756988e-07, + "loss": 10.4933, + "step": 91 + }, + { + "epoch": 0.01574533629984597, + "grad_norm": 91.90750885009766, + "learning_rate": 4.962920707358814e-07, + "loss": 14.1568, + "step": 92 + }, + { + "epoch": 0.015916481259626903, + "grad_norm": 45.950286865234375, 
+ "learning_rate": 5.019965772960638e-07, + "loss": 8.6589, + "step": 93 + }, + { + "epoch": 0.01608762621940784, + "grad_norm": 172.71951293945312, + "learning_rate": 5.077010838562465e-07, + "loss": 16.7382, + "step": 94 + }, + { + "epoch": 0.016258771179188773, + "grad_norm": 54.993247985839844, + "learning_rate": 5.13405590416429e-07, + "loss": 11.4407, + "step": 95 + }, + { + "epoch": 0.016429916138969707, + "grad_norm": 110.25296020507812, + "learning_rate": 5.191100969766115e-07, + "loss": 23.8689, + "step": 96 + }, + { + "epoch": 0.016601061098750643, + "grad_norm": 43.9764289855957, + "learning_rate": 5.248146035367941e-07, + "loss": 7.3984, + "step": 97 + }, + { + "epoch": 0.016772206058531577, + "grad_norm": 72.5784912109375, + "learning_rate": 5.305191100969766e-07, + "loss": 7.2256, + "step": 98 + }, + { + "epoch": 0.01694335101831251, + "grad_norm": 35.58343505859375, + "learning_rate": 5.362236166571592e-07, + "loss": 7.8513, + "step": 99 + }, + { + "epoch": 0.017114495978093443, + "grad_norm": 69.7222900390625, + "learning_rate": 5.419281232173417e-07, + "loss": 11.8726, + "step": 100 + }, + { + "epoch": 0.01728564093787438, + "grad_norm": 54.240943908691406, + "learning_rate": 5.476326297775242e-07, + "loss": 11.6433, + "step": 101 + }, + { + "epoch": 0.017456785897655314, + "grad_norm": 40.061763763427734, + "learning_rate": 5.533371363377068e-07, + "loss": 6.452, + "step": 102 + }, + { + "epoch": 0.017627930857436247, + "grad_norm": 43.3102912902832, + "learning_rate": 5.590416428978894e-07, + "loss": 10.9229, + "step": 103 + }, + { + "epoch": 0.017799075817217184, + "grad_norm": 48.96671676635742, + "learning_rate": 5.647461494580719e-07, + "loss": 10.9523, + "step": 104 + }, + { + "epoch": 0.017970220776998117, + "grad_norm": 107.66687774658203, + "learning_rate": 5.704506560182544e-07, + "loss": 15.756, + "step": 105 + }, + { + "epoch": 0.01814136573677905, + "grad_norm": 50.87533950805664, + "learning_rate": 5.76155162578437e-07, + "loss": 9.8941, + "step": 106 + }, + { + "epoch": 0.018312510696559987, + "grad_norm": 142.70115661621094, + "learning_rate": 5.818596691386196e-07, + "loss": 16.205, + "step": 107 + }, + { + "epoch": 0.01848365565634092, + "grad_norm": 62.69704818725586, + "learning_rate": 5.87564175698802e-07, + "loss": 9.7933, + "step": 108 + }, + { + "epoch": 0.018654800616121854, + "grad_norm": 52.710227966308594, + "learning_rate": 5.932686822589847e-07, + "loss": 10.7189, + "step": 109 + }, + { + "epoch": 0.01882594557590279, + "grad_norm": 131.87474060058594, + "learning_rate": 5.989731888191672e-07, + "loss": 24.6335, + "step": 110 + }, + { + "epoch": 0.018997090535683724, + "grad_norm": 105.79902648925781, + "learning_rate": 6.046776953793496e-07, + "loss": 16.133, + "step": 111 + }, + { + "epoch": 0.019168235495464658, + "grad_norm": 56.011474609375, + "learning_rate": 6.103822019395323e-07, + "loss": 11.9402, + "step": 112 + }, + { + "epoch": 0.019339380455245594, + "grad_norm": 97.4761962890625, + "learning_rate": 6.160867084997148e-07, + "loss": 14.1356, + "step": 113 + }, + { + "epoch": 0.019510525415026528, + "grad_norm": 52.7200813293457, + "learning_rate": 6.217912150598974e-07, + "loss": 11.4511, + "step": 114 + }, + { + "epoch": 0.01968167037480746, + "grad_norm": 42.6909065246582, + "learning_rate": 6.274957216200799e-07, + "loss": 8.5194, + "step": 115 + }, + { + "epoch": 0.019852815334588398, + "grad_norm": 44.77908706665039, + "learning_rate": 6.332002281802624e-07, + "loss": 11.437, + "step": 116 + }, + { + "epoch": 
0.02002396029436933, + "grad_norm": 136.7108612060547, + "learning_rate": 6.38904734740445e-07, + "loss": 23.712, + "step": 117 + }, + { + "epoch": 0.020195105254150265, + "grad_norm": 44.484893798828125, + "learning_rate": 6.446092413006275e-07, + "loss": 10.4753, + "step": 118 + }, + { + "epoch": 0.020366250213931198, + "grad_norm": 57.66374206542969, + "learning_rate": 6.503137478608101e-07, + "loss": 11.2803, + "step": 119 + }, + { + "epoch": 0.020537395173712135, + "grad_norm": 110.59872436523438, + "learning_rate": 6.560182544209926e-07, + "loss": 15.7956, + "step": 120 + }, + { + "epoch": 0.02070854013349307, + "grad_norm": 33.806732177734375, + "learning_rate": 6.617227609811752e-07, + "loss": 10.327, + "step": 121 + }, + { + "epoch": 0.020879685093274, + "grad_norm": 52.41442108154297, + "learning_rate": 6.674272675413577e-07, + "loss": 10.5716, + "step": 122 + }, + { + "epoch": 0.02105083005305494, + "grad_norm": 40.95213317871094, + "learning_rate": 6.731317741015402e-07, + "loss": 11.6353, + "step": 123 + }, + { + "epoch": 0.021221975012835872, + "grad_norm": 43.268775939941406, + "learning_rate": 6.788362806617229e-07, + "loss": 11.2876, + "step": 124 + }, + { + "epoch": 0.021393119972616805, + "grad_norm": 102.84829711914062, + "learning_rate": 6.845407872219053e-07, + "loss": 15.5444, + "step": 125 + }, + { + "epoch": 0.021564264932397742, + "grad_norm": 56.55605697631836, + "learning_rate": 6.902452937820878e-07, + "loss": 10.2011, + "step": 126 + }, + { + "epoch": 0.021735409892178675, + "grad_norm": 37.294795989990234, + "learning_rate": 6.959498003422704e-07, + "loss": 8.1014, + "step": 127 + }, + { + "epoch": 0.02190655485195961, + "grad_norm": 55.67061233520508, + "learning_rate": 7.01654306902453e-07, + "loss": 11.638, + "step": 128 + }, + { + "epoch": 0.022077699811740546, + "grad_norm": 67.4786605834961, + "learning_rate": 7.073588134626355e-07, + "loss": 6.8779, + "step": 129 + }, + { + "epoch": 0.02224884477152148, + "grad_norm": 30.9260196685791, + "learning_rate": 7.13063320022818e-07, + "loss": 9.4357, + "step": 130 + }, + { + "epoch": 0.022419989731302412, + "grad_norm": 100.22219848632812, + "learning_rate": 7.187678265830006e-07, + "loss": 14.6476, + "step": 131 + }, + { + "epoch": 0.02259113469108335, + "grad_norm": 29.24936294555664, + "learning_rate": 7.244723331431832e-07, + "loss": 6.5599, + "step": 132 + }, + { + "epoch": 0.022762279650864282, + "grad_norm": 31.59239959716797, + "learning_rate": 7.301768397033656e-07, + "loss": 6.6734, + "step": 133 + }, + { + "epoch": 0.022933424610645216, + "grad_norm": 42.93860626220703, + "learning_rate": 7.358813462635483e-07, + "loss": 9.125, + "step": 134 + }, + { + "epoch": 0.023104569570426153, + "grad_norm": 46.8751335144043, + "learning_rate": 7.415858528237308e-07, + "loss": 10.3878, + "step": 135 + }, + { + "epoch": 0.023275714530207086, + "grad_norm": 106.5069351196289, + "learning_rate": 7.472903593839132e-07, + "loss": 14.3693, + "step": 136 + }, + { + "epoch": 0.02344685948998802, + "grad_norm": 126.05512237548828, + "learning_rate": 7.529948659440959e-07, + "loss": 22.8488, + "step": 137 + }, + { + "epoch": 0.023618004449768953, + "grad_norm": 89.99185180664062, + "learning_rate": 7.586993725042784e-07, + "loss": 13.3307, + "step": 138 + }, + { + "epoch": 0.02378914940954989, + "grad_norm": 35.95622253417969, + "learning_rate": 7.64403879064461e-07, + "loss": 9.7673, + "step": 139 + }, + { + "epoch": 0.023960294369330823, + "grad_norm": 31.504724502563477, + "learning_rate": 7.701083856246435e-07, + 
"loss": 9.0915, + "step": 140 + }, + { + "epoch": 0.024131439329111756, + "grad_norm": 74.6131591796875, + "learning_rate": 7.75812892184826e-07, + "loss": 12.7378, + "step": 141 + }, + { + "epoch": 0.024302584288892693, + "grad_norm": 40.63880920410156, + "learning_rate": 7.815173987450086e-07, + "loss": 8.9323, + "step": 142 + }, + { + "epoch": 0.024473729248673626, + "grad_norm": 101.69804382324219, + "learning_rate": 7.872219053051911e-07, + "loss": 14.547, + "step": 143 + }, + { + "epoch": 0.02464487420845456, + "grad_norm": 131.046142578125, + "learning_rate": 7.929264118653737e-07, + "loss": 23.0012, + "step": 144 + }, + { + "epoch": 0.024816019168235497, + "grad_norm": 74.94658660888672, + "learning_rate": 7.986309184255562e-07, + "loss": 12.8286, + "step": 145 + }, + { + "epoch": 0.02498716412801643, + "grad_norm": 50.227718353271484, + "learning_rate": 8.043354249857388e-07, + "loss": 10.9684, + "step": 146 + }, + { + "epoch": 0.025158309087797363, + "grad_norm": 162.12603759765625, + "learning_rate": 8.100399315459213e-07, + "loss": 23.0181, + "step": 147 + }, + { + "epoch": 0.0253294540475783, + "grad_norm": 34.11660385131836, + "learning_rate": 8.157444381061038e-07, + "loss": 9.2808, + "step": 148 + }, + { + "epoch": 0.025500599007359234, + "grad_norm": 34.07155990600586, + "learning_rate": 8.214489446662865e-07, + "loss": 11.3059, + "step": 149 + }, + { + "epoch": 0.025671743967140167, + "grad_norm": 42.31085968017578, + "learning_rate": 8.271534512264689e-07, + "loss": 8.7746, + "step": 150 + }, + { + "epoch": 0.025842888926921104, + "grad_norm": 132.6522216796875, + "learning_rate": 8.328579577866514e-07, + "loss": 16.6408, + "step": 151 + }, + { + "epoch": 0.026014033886702037, + "grad_norm": 21.736328125, + "learning_rate": 8.385624643468341e-07, + "loss": 9.6907, + "step": 152 + }, + { + "epoch": 0.02618517884648297, + "grad_norm": 64.54568481445312, + "learning_rate": 8.442669709070166e-07, + "loss": 12.8573, + "step": 153 + }, + { + "epoch": 0.026356323806263904, + "grad_norm": 32.484703063964844, + "learning_rate": 8.499714774671991e-07, + "loss": 7.4299, + "step": 154 + }, + { + "epoch": 0.02652746876604484, + "grad_norm": 86.55378723144531, + "learning_rate": 8.556759840273817e-07, + "loss": 13.347, + "step": 155 + }, + { + "epoch": 0.026698613725825774, + "grad_norm": 32.97962188720703, + "learning_rate": 8.613804905875642e-07, + "loss": 9.2004, + "step": 156 + }, + { + "epoch": 0.026869758685606707, + "grad_norm": 66.87654113769531, + "learning_rate": 8.670849971477468e-07, + "loss": 6.6107, + "step": 157 + }, + { + "epoch": 0.027040903645387644, + "grad_norm": 56.53002166748047, + "learning_rate": 8.727895037079292e-07, + "loss": 6.0957, + "step": 158 + }, + { + "epoch": 0.027212048605168578, + "grad_norm": 37.223453521728516, + "learning_rate": 8.784940102681119e-07, + "loss": 8.9968, + "step": 159 + }, + { + "epoch": 0.02738319356494951, + "grad_norm": 30.637619018554688, + "learning_rate": 8.841985168282944e-07, + "loss": 8.9597, + "step": 160 + }, + { + "epoch": 0.027554338524730448, + "grad_norm": 22.8154354095459, + "learning_rate": 8.899030233884768e-07, + "loss": 9.2794, + "step": 161 + }, + { + "epoch": 0.02772548348451138, + "grad_norm": 64.24419403076172, + "learning_rate": 8.956075299486595e-07, + "loss": 12.9209, + "step": 162 + }, + { + "epoch": 0.027896628444292314, + "grad_norm": 27.159826278686523, + "learning_rate": 9.01312036508842e-07, + "loss": 10.7092, + "step": 163 + }, + { + "epoch": 0.02806777340407325, + "grad_norm": 29.741992950439453, 
+ "learning_rate": 9.070165430690246e-07, + "loss": 10.1098, + "step": 164 + }, + { + "epoch": 0.028238918363854185, + "grad_norm": 61.4916877746582, + "learning_rate": 9.127210496292071e-07, + "loss": 12.5023, + "step": 165 + }, + { + "epoch": 0.028410063323635118, + "grad_norm": 21.36608123779297, + "learning_rate": 9.184255561893896e-07, + "loss": 7.2161, + "step": 166 + }, + { + "epoch": 0.028581208283416055, + "grad_norm": 51.13070297241211, + "learning_rate": 9.241300627495722e-07, + "loss": 5.5324, + "step": 167 + }, + { + "epoch": 0.028752353243196988, + "grad_norm": 27.232070922851562, + "learning_rate": 9.298345693097547e-07, + "loss": 9.3162, + "step": 168 + }, + { + "epoch": 0.02892349820297792, + "grad_norm": 51.84492111206055, + "learning_rate": 9.355390758699373e-07, + "loss": 6.0306, + "step": 169 + }, + { + "epoch": 0.02909464316275886, + "grad_norm": 24.21738052368164, + "learning_rate": 9.412435824301197e-07, + "loss": 6.6994, + "step": 170 + }, + { + "epoch": 0.02926578812253979, + "grad_norm": 27.428897857666016, + "learning_rate": 9.469480889903024e-07, + "loss": 10.5412, + "step": 171 + }, + { + "epoch": 0.029436933082320725, + "grad_norm": 123.71875762939453, + "learning_rate": 9.526525955504849e-07, + "loss": 15.9849, + "step": 172 + }, + { + "epoch": 0.02960807804210166, + "grad_norm": 34.90501403808594, + "learning_rate": 9.583571021106676e-07, + "loss": 9.2574, + "step": 173 + }, + { + "epoch": 0.029779223001882595, + "grad_norm": 26.623390197753906, + "learning_rate": 9.6406160867085e-07, + "loss": 8.7904, + "step": 174 + }, + { + "epoch": 0.02995036796166353, + "grad_norm": 21.868566513061523, + "learning_rate": 9.697661152310325e-07, + "loss": 9.2638, + "step": 175 + }, + { + "epoch": 0.030121512921444462, + "grad_norm": 28.389110565185547, + "learning_rate": 9.754706217912152e-07, + "loss": 5.9086, + "step": 176 + }, + { + "epoch": 0.0302926578812254, + "grad_norm": 51.29762649536133, + "learning_rate": 9.811751283513976e-07, + "loss": 5.9646, + "step": 177 + }, + { + "epoch": 0.030463802841006332, + "grad_norm": 28.91325569152832, + "learning_rate": 9.8687963491158e-07, + "loss": 6.0877, + "step": 178 + }, + { + "epoch": 0.030634947800787266, + "grad_norm": 66.74105834960938, + "learning_rate": 9.925841414717628e-07, + "loss": 12.4348, + "step": 179 + }, + { + "epoch": 0.030806092760568202, + "grad_norm": 19.138124465942383, + "learning_rate": 9.982886480319452e-07, + "loss": 9.5496, + "step": 180 + }, + { + "epoch": 0.030977237720349136, + "grad_norm": 43.17308044433594, + "learning_rate": 1.0039931545921277e-06, + "loss": 5.5641, + "step": 181 + }, + { + "epoch": 0.03114838268013007, + "grad_norm": 32.97599411010742, + "learning_rate": 1.0096976611523104e-06, + "loss": 9.0529, + "step": 182 + }, + { + "epoch": 0.031319527639911006, + "grad_norm": 56.315521240234375, + "learning_rate": 1.015402167712493e-06, + "loss": 12.0747, + "step": 183 + }, + { + "epoch": 0.03149067259969194, + "grad_norm": 76.77662658691406, + "learning_rate": 1.0211066742726755e-06, + "loss": 13.0892, + "step": 184 + }, + { + "epoch": 0.03166181755947287, + "grad_norm": 25.544397354125977, + "learning_rate": 1.026811180832858e-06, + "loss": 7.7117, + "step": 185 + }, + { + "epoch": 0.031832962519253806, + "grad_norm": 24.205764770507812, + "learning_rate": 1.0325156873930406e-06, + "loss": 6.6426, + "step": 186 + }, + { + "epoch": 0.03200410747903474, + "grad_norm": 25.586280822753906, + "learning_rate": 1.038220193953223e-06, + "loss": 10.4785, + "step": 187 + }, + { + "epoch": 
0.03217525243881568, + "grad_norm": 68.83911895751953, + "learning_rate": 1.0439247005134056e-06, + "loss": 12.2132, + "step": 188 + }, + { + "epoch": 0.03234639739859661, + "grad_norm": 24.825489044189453, + "learning_rate": 1.0496292070735881e-06, + "loss": 6.3336, + "step": 189 + }, + { + "epoch": 0.032517542358377546, + "grad_norm": 28.293699264526367, + "learning_rate": 1.0553337136337707e-06, + "loss": 8.5374, + "step": 190 + }, + { + "epoch": 0.03268868731815848, + "grad_norm": 28.26664924621582, + "learning_rate": 1.0610382201939532e-06, + "loss": 9.7218, + "step": 191 + }, + { + "epoch": 0.03285983227793941, + "grad_norm": 84.32862854003906, + "learning_rate": 1.0667427267541357e-06, + "loss": 12.782, + "step": 192 + }, + { + "epoch": 0.033030977237720346, + "grad_norm": 26.818071365356445, + "learning_rate": 1.0724472333143185e-06, + "loss": 7.3125, + "step": 193 + }, + { + "epoch": 0.03320212219750129, + "grad_norm": 16.650196075439453, + "learning_rate": 1.0781517398745008e-06, + "loss": 9.0232, + "step": 194 + }, + { + "epoch": 0.03337326715728222, + "grad_norm": 22.659135818481445, + "learning_rate": 1.0838562464346833e-06, + "loss": 6.2787, + "step": 195 + }, + { + "epoch": 0.03354441211706315, + "grad_norm": 24.644168853759766, + "learning_rate": 1.089560752994866e-06, + "loss": 6.0047, + "step": 196 + }, + { + "epoch": 0.03371555707684409, + "grad_norm": 32.078712463378906, + "learning_rate": 1.0952652595550484e-06, + "loss": 7.5748, + "step": 197 + }, + { + "epoch": 0.03388670203662502, + "grad_norm": 55.345855712890625, + "learning_rate": 1.1009697661152311e-06, + "loss": 11.8703, + "step": 198 + }, + { + "epoch": 0.034057846996405954, + "grad_norm": 70.49486541748047, + "learning_rate": 1.1066742726754137e-06, + "loss": 11.7983, + "step": 199 + }, + { + "epoch": 0.03422899195618689, + "grad_norm": 29.946758270263672, + "learning_rate": 1.112378779235596e-06, + "loss": 7.6286, + "step": 200 + }, + { + "epoch": 0.03440013691596783, + "grad_norm": 278.3395080566406, + "learning_rate": 1.1180832857957787e-06, + "loss": 17.6192, + "step": 201 + }, + { + "epoch": 0.03457128187574876, + "grad_norm": 310.682861328125, + "learning_rate": 1.1237877923559613e-06, + "loss": 17.6315, + "step": 202 + }, + { + "epoch": 0.034742426835529694, + "grad_norm": 46.159568786621094, + "learning_rate": 1.1294922989161438e-06, + "loss": 11.6001, + "step": 203 + }, + { + "epoch": 0.03491357179531063, + "grad_norm": 20.635892868041992, + "learning_rate": 1.1351968054763263e-06, + "loss": 9.4128, + "step": 204 + }, + { + "epoch": 0.03508471675509156, + "grad_norm": 143.4097137451172, + "learning_rate": 1.1409013120365089e-06, + "loss": 16.3943, + "step": 205 + }, + { + "epoch": 0.035255861714872494, + "grad_norm": 265.5577087402344, + "learning_rate": 1.1466058185966914e-06, + "loss": 18.6869, + "step": 206 + }, + { + "epoch": 0.035427006674653434, + "grad_norm": 19.766063690185547, + "learning_rate": 1.152310325156874e-06, + "loss": 8.6515, + "step": 207 + }, + { + "epoch": 0.03559815163443437, + "grad_norm": 43.8801383972168, + "learning_rate": 1.1580148317170565e-06, + "loss": 11.424, + "step": 208 + }, + { + "epoch": 0.0357692965942153, + "grad_norm": 12.928386688232422, + "learning_rate": 1.1637193382772392e-06, + "loss": 5.5902, + "step": 209 + }, + { + "epoch": 0.035940441553996234, + "grad_norm": 123.55076599121094, + "learning_rate": 1.1694238448374215e-06, + "loss": 15.6958, + "step": 210 + }, + { + "epoch": 0.03611158651377717, + "grad_norm": 44.79010772705078, + "learning_rate": 
1.175128351397604e-06, + "loss": 11.1894, + "step": 211 + }, + { + "epoch": 0.0362827314735581, + "grad_norm": 26.461137771606445, + "learning_rate": 1.1808328579577868e-06, + "loss": 7.3237, + "step": 212 + }, + { + "epoch": 0.03645387643333904, + "grad_norm": 24.63947296142578, + "learning_rate": 1.1865373645179693e-06, + "loss": 5.7252, + "step": 213 + }, + { + "epoch": 0.036625021393119975, + "grad_norm": 17.151113510131836, + "learning_rate": 1.1922418710781517e-06, + "loss": 9.0419, + "step": 214 + }, + { + "epoch": 0.03679616635290091, + "grad_norm": 26.69593620300293, + "learning_rate": 1.1979463776383344e-06, + "loss": 9.4836, + "step": 215 + }, + { + "epoch": 0.03696731131268184, + "grad_norm": 50.901573181152344, + "learning_rate": 1.203650884198517e-06, + "loss": 11.2858, + "step": 216 + }, + { + "epoch": 0.037138456272462775, + "grad_norm": 48.110328674316406, + "learning_rate": 1.2093553907586992e-06, + "loss": 11.5594, + "step": 217 + }, + { + "epoch": 0.03730960123224371, + "grad_norm": 51.77389907836914, + "learning_rate": 1.215059897318882e-06, + "loss": 11.6974, + "step": 218 + }, + { + "epoch": 0.03748074619202464, + "grad_norm": 23.52347183227539, + "learning_rate": 1.2207644038790645e-06, + "loss": 9.5737, + "step": 219 + }, + { + "epoch": 0.03765189115180558, + "grad_norm": 20.402074813842773, + "learning_rate": 1.2264689104392468e-06, + "loss": 6.1995, + "step": 220 + }, + { + "epoch": 0.037823036111586515, + "grad_norm": 18.76962661743164, + "learning_rate": 1.2321734169994296e-06, + "loss": 7.1013, + "step": 221 + }, + { + "epoch": 0.03799418107136745, + "grad_norm": 21.817501068115234, + "learning_rate": 1.2378779235596121e-06, + "loss": 9.3332, + "step": 222 + }, + { + "epoch": 0.03816532603114838, + "grad_norm": 11.452000617980957, + "learning_rate": 1.2435824301197949e-06, + "loss": 6.2887, + "step": 223 + }, + { + "epoch": 0.038336470990929315, + "grad_norm": 22.69776153564453, + "learning_rate": 1.2492869366799772e-06, + "loss": 7.9947, + "step": 224 + }, + { + "epoch": 0.03850761595071025, + "grad_norm": 25.39488410949707, + "learning_rate": 1.2549914432401597e-06, + "loss": 5.1894, + "step": 225 + }, + { + "epoch": 0.03867876091049119, + "grad_norm": 17.65719223022461, + "learning_rate": 1.2606959498003425e-06, + "loss": 7.4931, + "step": 226 + }, + { + "epoch": 0.03884990587027212, + "grad_norm": 23.45711898803711, + "learning_rate": 1.2664004563605248e-06, + "loss": 9.6157, + "step": 227 + }, + { + "epoch": 0.039021050830053056, + "grad_norm": 29.114194869995117, + "learning_rate": 1.2721049629207073e-06, + "loss": 10.4857, + "step": 228 + }, + { + "epoch": 0.03919219578983399, + "grad_norm": 46.365013122558594, + "learning_rate": 1.27780946948089e-06, + "loss": 11.9216, + "step": 229 + }, + { + "epoch": 0.03936334074961492, + "grad_norm": 23.066879272460938, + "learning_rate": 1.2835139760410724e-06, + "loss": 9.4344, + "step": 230 + }, + { + "epoch": 0.039534485709395856, + "grad_norm": 15.414644241333008, + "learning_rate": 1.289218482601255e-06, + "loss": 6.4409, + "step": 231 + }, + { + "epoch": 0.039705630669176796, + "grad_norm": 16.58795166015625, + "learning_rate": 1.2949229891614376e-06, + "loss": 7.3307, + "step": 232 + }, + { + "epoch": 0.03987677562895773, + "grad_norm": 36.44779968261719, + "learning_rate": 1.3006274957216202e-06, + "loss": 11.1388, + "step": 233 + }, + { + "epoch": 0.04004792058873866, + "grad_norm": 20.902912139892578, + "learning_rate": 1.3063320022818027e-06, + "loss": 7.378, + "step": 234 + }, + { + "epoch": 
0.040219065548519596, + "grad_norm": 20.50259017944336, + "learning_rate": 1.3120365088419852e-06, + "loss": 6.156, + "step": 235 + }, + { + "epoch": 0.04039021050830053, + "grad_norm": 22.57229995727539, + "learning_rate": 1.3177410154021678e-06, + "loss": 7.0029, + "step": 236 + }, + { + "epoch": 0.04056135546808146, + "grad_norm": 25.610868453979492, + "learning_rate": 1.3234455219623503e-06, + "loss": 8.7721, + "step": 237 + }, + { + "epoch": 0.040732500427862396, + "grad_norm": 278.795654296875, + "learning_rate": 1.3291500285225328e-06, + "loss": 15.9633, + "step": 238 + }, + { + "epoch": 0.040903645387643336, + "grad_norm": 11.644048690795898, + "learning_rate": 1.3348545350827154e-06, + "loss": 6.3707, + "step": 239 + }, + { + "epoch": 0.04107479034742427, + "grad_norm": 36.32057189941406, + "learning_rate": 1.340559041642898e-06, + "loss": 10.7901, + "step": 240 + }, + { + "epoch": 0.0412459353072052, + "grad_norm": 22.911476135253906, + "learning_rate": 1.3462635482030804e-06, + "loss": 9.4097, + "step": 241 + }, + { + "epoch": 0.04141708026698614, + "grad_norm": 24.35552406311035, + "learning_rate": 1.351968054763263e-06, + "loss": 9.081, + "step": 242 + }, + { + "epoch": 0.04158822522676707, + "grad_norm": 18.466432571411133, + "learning_rate": 1.3576725613234457e-06, + "loss": 7.4805, + "step": 243 + }, + { + "epoch": 0.041759370186548, + "grad_norm": 44.41029357910156, + "learning_rate": 1.363377067883628e-06, + "loss": 11.2131, + "step": 244 + }, + { + "epoch": 0.041930515146328944, + "grad_norm": 15.328824043273926, + "learning_rate": 1.3690815744438106e-06, + "loss": 8.2706, + "step": 245 + }, + { + "epoch": 0.04210166010610988, + "grad_norm": 274.3642578125, + "learning_rate": 1.3747860810039933e-06, + "loss": 15.8791, + "step": 246 + }, + { + "epoch": 0.04227280506589081, + "grad_norm": 18.105318069458008, + "learning_rate": 1.3804905875641756e-06, + "loss": 8.9079, + "step": 247 + }, + { + "epoch": 0.042443950025671744, + "grad_norm": 22.90168571472168, + "learning_rate": 1.3861950941243584e-06, + "loss": 6.6905, + "step": 248 + }, + { + "epoch": 0.04261509498545268, + "grad_norm": 16.96687126159668, + "learning_rate": 1.391899600684541e-06, + "loss": 8.5567, + "step": 249 + }, + { + "epoch": 0.04278623994523361, + "grad_norm": 283.76409912109375, + "learning_rate": 1.3976041072447232e-06, + "loss": 14.4204, + "step": 250 + }, + { + "epoch": 0.04295738490501455, + "grad_norm": 22.41378402709961, + "learning_rate": 1.403308613804906e-06, + "loss": 9.6063, + "step": 251 + }, + { + "epoch": 0.043128529864795484, + "grad_norm": 23.26137924194336, + "learning_rate": 1.4090131203650885e-06, + "loss": 9.9569, + "step": 252 + }, + { + "epoch": 0.04329967482457642, + "grad_norm": 19.40400505065918, + "learning_rate": 1.414717626925271e-06, + "loss": 6.4322, + "step": 253 + }, + { + "epoch": 0.04347081978435735, + "grad_norm": 21.541933059692383, + "learning_rate": 1.4204221334854536e-06, + "loss": 4.5325, + "step": 254 + }, + { + "epoch": 0.043641964744138284, + "grad_norm": 17.52275276184082, + "learning_rate": 1.426126640045636e-06, + "loss": 8.3479, + "step": 255 + }, + { + "epoch": 0.04381310970391922, + "grad_norm": 125.6756591796875, + "learning_rate": 1.4318311466058186e-06, + "loss": 15.4145, + "step": 256 + }, + { + "epoch": 0.04398425466370015, + "grad_norm": 18.166152954101562, + "learning_rate": 1.4375356531660011e-06, + "loss": 4.2531, + "step": 257 + }, + { + "epoch": 0.04415539962348109, + "grad_norm": 25.4247989654541, + "learning_rate": 1.4432401597261837e-06, + 
"loss": 10.4856, + "step": 258 + }, + { + "epoch": 0.044326544583262024, + "grad_norm": 17.259897232055664, + "learning_rate": 1.4489446662863664e-06, + "loss": 8.6032, + "step": 259 + }, + { + "epoch": 0.04449768954304296, + "grad_norm": 23.197059631347656, + "learning_rate": 1.4546491728465487e-06, + "loss": 7.8062, + "step": 260 + }, + { + "epoch": 0.04466883450282389, + "grad_norm": 43.4500617980957, + "learning_rate": 1.4603536794067313e-06, + "loss": 11.1986, + "step": 261 + }, + { + "epoch": 0.044839979462604825, + "grad_norm": 122.06368255615234, + "learning_rate": 1.466058185966914e-06, + "loss": 15.5832, + "step": 262 + }, + { + "epoch": 0.04501112442238576, + "grad_norm": 16.506317138671875, + "learning_rate": 1.4717626925270965e-06, + "loss": 8.7747, + "step": 263 + }, + { + "epoch": 0.0451822693821667, + "grad_norm": 19.03982162475586, + "learning_rate": 1.4774671990872789e-06, + "loss": 7.6134, + "step": 264 + }, + { + "epoch": 0.04535341434194763, + "grad_norm": 33.20307540893555, + "learning_rate": 1.4831717056474616e-06, + "loss": 10.3325, + "step": 265 + }, + { + "epoch": 0.045524559301728565, + "grad_norm": 16.946876525878906, + "learning_rate": 1.4888762122076441e-06, + "loss": 8.2866, + "step": 266 + }, + { + "epoch": 0.0456957042615095, + "grad_norm": 25.170318603515625, + "learning_rate": 1.4945807187678265e-06, + "loss": 10.2958, + "step": 267 + }, + { + "epoch": 0.04586684922129043, + "grad_norm": 16.860721588134766, + "learning_rate": 1.5002852253280092e-06, + "loss": 8.5198, + "step": 268 + }, + { + "epoch": 0.046037994181071365, + "grad_norm": 18.003284454345703, + "learning_rate": 1.5059897318881917e-06, + "loss": 8.8484, + "step": 269 + }, + { + "epoch": 0.046209139140852305, + "grad_norm": 17.796016693115234, + "learning_rate": 1.511694238448374e-06, + "loss": 6.2495, + "step": 270 + }, + { + "epoch": 0.04638028410063324, + "grad_norm": 23.97182846069336, + "learning_rate": 1.5173987450085568e-06, + "loss": 7.0879, + "step": 271 + }, + { + "epoch": 0.04655142906041417, + "grad_norm": 213.1482696533203, + "learning_rate": 1.5231032515687393e-06, + "loss": 12.8754, + "step": 272 + }, + { + "epoch": 0.046722574020195105, + "grad_norm": 25.503662109375, + "learning_rate": 1.528807758128922e-06, + "loss": 7.9125, + "step": 273 + }, + { + "epoch": 0.04689371897997604, + "grad_norm": 19.832860946655273, + "learning_rate": 1.5345122646891044e-06, + "loss": 9.0794, + "step": 274 + }, + { + "epoch": 0.04706486393975697, + "grad_norm": 32.311920166015625, + "learning_rate": 1.540216771249287e-06, + "loss": 10.648, + "step": 275 + }, + { + "epoch": 0.047236008899537905, + "grad_norm": 39.916603088378906, + "learning_rate": 1.5459212778094697e-06, + "loss": 10.9246, + "step": 276 + }, + { + "epoch": 0.047407153859318846, + "grad_norm": 21.337602615356445, + "learning_rate": 1.551625784369652e-06, + "loss": 9.2191, + "step": 277 + }, + { + "epoch": 0.04757829881909978, + "grad_norm": 25.114675521850586, + "learning_rate": 1.5573302909298345e-06, + "loss": 10.2576, + "step": 278 + }, + { + "epoch": 0.04774944377888071, + "grad_norm": 14.945568084716797, + "learning_rate": 1.5630347974900173e-06, + "loss": 8.4857, + "step": 279 + }, + { + "epoch": 0.047920588738661646, + "grad_norm": 33.542449951171875, + "learning_rate": 1.5687393040501996e-06, + "loss": 10.9193, + "step": 280 + }, + { + "epoch": 0.04809173369844258, + "grad_norm": 27.331628799438477, + "learning_rate": 1.5744438106103821e-06, + "loss": 9.9441, + "step": 281 + }, + { + "epoch": 0.04826287865822351, + 
"grad_norm": 17.784677505493164, + "learning_rate": 1.5801483171705649e-06, + "loss": 6.4105, + "step": 282 + }, + { + "epoch": 0.04843402361800445, + "grad_norm": 46.38033676147461, + "learning_rate": 1.5858528237307474e-06, + "loss": 10.5075, + "step": 283 + }, + { + "epoch": 0.048605168577785386, + "grad_norm": 13.535309791564941, + "learning_rate": 1.59155733029093e-06, + "loss": 4.4568, + "step": 284 + }, + { + "epoch": 0.04877631353756632, + "grad_norm": 27.45166015625, + "learning_rate": 1.5972618368511125e-06, + "loss": 10.2344, + "step": 285 + }, + { + "epoch": 0.04894745849734725, + "grad_norm": 16.50087547302246, + "learning_rate": 1.602966343411295e-06, + "loss": 8.5428, + "step": 286 + }, + { + "epoch": 0.049118603457128186, + "grad_norm": 42.31341552734375, + "learning_rate": 1.6086708499714775e-06, + "loss": 10.0868, + "step": 287 + }, + { + "epoch": 0.04928974841690912, + "grad_norm": 17.977153778076172, + "learning_rate": 1.61437535653166e-06, + "loss": 9.012, + "step": 288 + }, + { + "epoch": 0.04946089337669006, + "grad_norm": 104.7464828491211, + "learning_rate": 1.6200798630918426e-06, + "loss": 14.6671, + "step": 289 + }, + { + "epoch": 0.04963203833647099, + "grad_norm": 17.432056427001953, + "learning_rate": 1.6257843696520251e-06, + "loss": 6.8872, + "step": 290 + }, + { + "epoch": 0.04980318329625193, + "grad_norm": 242.7275390625, + "learning_rate": 1.6314888762122076e-06, + "loss": 11.2526, + "step": 291 + }, + { + "epoch": 0.04997432825603286, + "grad_norm": 15.779862403869629, + "learning_rate": 1.6371933827723902e-06, + "loss": 8.7887, + "step": 292 + }, + { + "epoch": 0.05014547321581379, + "grad_norm": 13.621806144714355, + "learning_rate": 1.642897889332573e-06, + "loss": 7.0578, + "step": 293 + }, + { + "epoch": 0.05031661817559473, + "grad_norm": 14.4631986618042, + "learning_rate": 1.6486023958927552e-06, + "loss": 8.2147, + "step": 294 + }, + { + "epoch": 0.05048776313537566, + "grad_norm": 18.11038589477539, + "learning_rate": 1.6543069024529378e-06, + "loss": 6.4308, + "step": 295 + }, + { + "epoch": 0.0506589080951566, + "grad_norm": 16.797258377075195, + "learning_rate": 1.6600114090131205e-06, + "loss": 6.3738, + "step": 296 + }, + { + "epoch": 0.050830053054937534, + "grad_norm": 17.457462310791016, + "learning_rate": 1.6657159155733028e-06, + "loss": 6.3681, + "step": 297 + }, + { + "epoch": 0.05100119801471847, + "grad_norm": 14.502140045166016, + "learning_rate": 1.6714204221334856e-06, + "loss": 7.1297, + "step": 298 + }, + { + "epoch": 0.0511723429744994, + "grad_norm": 14.4544677734375, + "learning_rate": 1.6771249286936681e-06, + "loss": 8.5584, + "step": 299 + }, + { + "epoch": 0.051343487934280334, + "grad_norm": 13.313618659973145, + "learning_rate": 1.6828294352538504e-06, + "loss": 8.1348, + "step": 300 + }, + { + "epoch": 0.05151463289406127, + "grad_norm": 91.8434829711914, + "learning_rate": 1.6885339418140332e-06, + "loss": 13.9421, + "step": 301 + }, + { + "epoch": 0.05168577785384221, + "grad_norm": 39.31818389892578, + "learning_rate": 1.6942384483742157e-06, + "loss": 10.3291, + "step": 302 + }, + { + "epoch": 0.05185692281362314, + "grad_norm": 16.320667266845703, + "learning_rate": 1.6999429549343982e-06, + "loss": 4.6866, + "step": 303 + }, + { + "epoch": 0.052028067773404074, + "grad_norm": 13.367071151733398, + "learning_rate": 1.7056474614945808e-06, + "loss": 8.1535, + "step": 304 + }, + { + "epoch": 0.05219921273318501, + "grad_norm": 186.96824645996094, + "learning_rate": 1.7113519680547633e-06, + "loss": 10.6341, + 
"step": 305 + }, + { + "epoch": 0.05237035769296594, + "grad_norm": 28.400169372558594, + "learning_rate": 1.7170564746149458e-06, + "loss": 9.7369, + "step": 306 + }, + { + "epoch": 0.052541502652746874, + "grad_norm": 15.559652328491211, + "learning_rate": 1.7227609811751284e-06, + "loss": 7.1427, + "step": 307 + }, + { + "epoch": 0.05271264761252781, + "grad_norm": 5.730342864990234, + "learning_rate": 1.728465487735311e-06, + "loss": 5.4861, + "step": 308 + }, + { + "epoch": 0.05288379257230875, + "grad_norm": 19.06242561340332, + "learning_rate": 1.7341699942954936e-06, + "loss": 9.0657, + "step": 309 + }, + { + "epoch": 0.05305493753208968, + "grad_norm": 18.580720901489258, + "learning_rate": 1.739874500855676e-06, + "loss": 5.9947, + "step": 310 + }, + { + "epoch": 0.053226082491870615, + "grad_norm": 13.939530372619629, + "learning_rate": 1.7455790074158585e-06, + "loss": 7.1715, + "step": 311 + }, + { + "epoch": 0.05339722745165155, + "grad_norm": 12.347646713256836, + "learning_rate": 1.7512835139760412e-06, + "loss": 4.5087, + "step": 312 + }, + { + "epoch": 0.05356837241143248, + "grad_norm": 16.251863479614258, + "learning_rate": 1.7569880205362238e-06, + "loss": 8.7544, + "step": 313 + }, + { + "epoch": 0.053739517371213415, + "grad_norm": 18.887571334838867, + "learning_rate": 1.762692527096406e-06, + "loss": 7.1006, + "step": 314 + }, + { + "epoch": 0.053910662330994355, + "grad_norm": 29.57771873474121, + "learning_rate": 1.7683970336565888e-06, + "loss": 10.2554, + "step": 315 + }, + { + "epoch": 0.05408180729077529, + "grad_norm": 215.26080322265625, + "learning_rate": 1.7741015402167714e-06, + "loss": 10.6589, + "step": 316 + }, + { + "epoch": 0.05425295225055622, + "grad_norm": 6.18715763092041, + "learning_rate": 1.7798060467769537e-06, + "loss": 5.3794, + "step": 317 + }, + { + "epoch": 0.054424097210337155, + "grad_norm": 30.351348876953125, + "learning_rate": 1.7855105533371364e-06, + "loss": 10.3749, + "step": 318 + }, + { + "epoch": 0.05459524217011809, + "grad_norm": 16.978347778320312, + "learning_rate": 1.791215059897319e-06, + "loss": 6.2012, + "step": 319 + }, + { + "epoch": 0.05476638712989902, + "grad_norm": 19.239072799682617, + "learning_rate": 1.7969195664575015e-06, + "loss": 9.1925, + "step": 320 + }, + { + "epoch": 0.05493753208967996, + "grad_norm": 20.378984451293945, + "learning_rate": 1.802624073017684e-06, + "loss": 8.7484, + "step": 321 + }, + { + "epoch": 0.055108677049460895, + "grad_norm": 11.863981246948242, + "learning_rate": 1.8083285795778666e-06, + "loss": 6.308, + "step": 322 + }, + { + "epoch": 0.05527982200924183, + "grad_norm": 15.815791130065918, + "learning_rate": 1.8140330861380493e-06, + "loss": 8.9935, + "step": 323 + }, + { + "epoch": 0.05545096696902276, + "grad_norm": 31.865665435791016, + "learning_rate": 1.8197375926982316e-06, + "loss": 10.1397, + "step": 324 + }, + { + "epoch": 0.055622111928803696, + "grad_norm": 160.87301635742188, + "learning_rate": 1.8254420992584141e-06, + "loss": 9.2965, + "step": 325 + }, + { + "epoch": 0.05579325688858463, + "grad_norm": 16.763856887817383, + "learning_rate": 1.8311466058185969e-06, + "loss": 6.6638, + "step": 326 + }, + { + "epoch": 0.05596440184836556, + "grad_norm": 12.291769981384277, + "learning_rate": 1.8368511123787792e-06, + "loss": 8.2182, + "step": 327 + }, + { + "epoch": 0.0561355468081465, + "grad_norm": 20.839473724365234, + "learning_rate": 1.8425556189389617e-06, + "loss": 5.9446, + "step": 328 + }, + { + "epoch": 0.056306691767927436, + "grad_norm": 
41.371337890625, + "learning_rate": 1.8482601254991445e-06, + "loss": 10.0738, + "step": 329 + }, + { + "epoch": 0.05647783672770837, + "grad_norm": 12.416519165039062, + "learning_rate": 1.8539646320593268e-06, + "loss": 7.9372, + "step": 330 + }, + { + "epoch": 0.0566489816874893, + "grad_norm": 12.856998443603516, + "learning_rate": 1.8596691386195093e-06, + "loss": 8.5894, + "step": 331 + }, + { + "epoch": 0.056820126647270236, + "grad_norm": 28.67165184020996, + "learning_rate": 1.865373645179692e-06, + "loss": 9.856, + "step": 332 + }, + { + "epoch": 0.05699127160705117, + "grad_norm": 17.425006866455078, + "learning_rate": 1.8710781517398746e-06, + "loss": 7.6487, + "step": 333 + }, + { + "epoch": 0.05716241656683211, + "grad_norm": 29.102951049804688, + "learning_rate": 1.8767826583000571e-06, + "loss": 9.7985, + "step": 334 + }, + { + "epoch": 0.05733356152661304, + "grad_norm": 15.120597839355469, + "learning_rate": 1.8824871648602395e-06, + "loss": 8.856, + "step": 335 + }, + { + "epoch": 0.057504706486393976, + "grad_norm": 188.02642822265625, + "learning_rate": 1.8881916714204222e-06, + "loss": 9.3274, + "step": 336 + }, + { + "epoch": 0.05767585144617491, + "grad_norm": 14.4713134765625, + "learning_rate": 1.8938961779806047e-06, + "loss": 8.8408, + "step": 337 + }, + { + "epoch": 0.05784699640595584, + "grad_norm": 27.848546981811523, + "learning_rate": 1.8996006845407875e-06, + "loss": 10.1598, + "step": 338 + }, + { + "epoch": 0.058018141365736776, + "grad_norm": 12.024163246154785, + "learning_rate": 1.9053051911009698e-06, + "loss": 6.2088, + "step": 339 + }, + { + "epoch": 0.05818928632551772, + "grad_norm": 11.968954086303711, + "learning_rate": 1.9110096976611523e-06, + "loss": 7.3791, + "step": 340 + }, + { + "epoch": 0.05836043128529865, + "grad_norm": 27.01519775390625, + "learning_rate": 1.9167142042213353e-06, + "loss": 10.5111, + "step": 341 + }, + { + "epoch": 0.05853157624507958, + "grad_norm": 13.136455535888672, + "learning_rate": 1.9224187107815174e-06, + "loss": 4.512, + "step": 342 + }, + { + "epoch": 0.05870272120486052, + "grad_norm": 16.26902198791504, + "learning_rate": 1.9281232173417e-06, + "loss": 6.8285, + "step": 343 + }, + { + "epoch": 0.05887386616464145, + "grad_norm": 16.47487449645996, + "learning_rate": 1.933827723901883e-06, + "loss": 8.8793, + "step": 344 + }, + { + "epoch": 0.059045011124422384, + "grad_norm": 32.750850677490234, + "learning_rate": 1.939532230462065e-06, + "loss": 10.0536, + "step": 345 + }, + { + "epoch": 0.05921615608420332, + "grad_norm": 18.996196746826172, + "learning_rate": 1.9452367370222475e-06, + "loss": 6.1966, + "step": 346 + }, + { + "epoch": 0.05938730104398426, + "grad_norm": 24.546964645385742, + "learning_rate": 1.9509412435824305e-06, + "loss": 9.4677, + "step": 347 + }, + { + "epoch": 0.05955844600376519, + "grad_norm": 84.20301055908203, + "learning_rate": 1.9566457501426126e-06, + "loss": 14.0007, + "step": 348 + }, + { + "epoch": 0.059729590963546124, + "grad_norm": 18.845518112182617, + "learning_rate": 1.962350256702795e-06, + "loss": 6.1715, + "step": 349 + }, + { + "epoch": 0.05990073592332706, + "grad_norm": 32.177085876464844, + "learning_rate": 1.968054763262978e-06, + "loss": 10.5934, + "step": 350 + }, + { + "epoch": 0.06007188088310799, + "grad_norm": 24.051923751831055, + "learning_rate": 1.97375926982316e-06, + "loss": 9.6501, + "step": 351 + }, + { + "epoch": 0.060243025842888924, + "grad_norm": 13.522736549377441, + "learning_rate": 1.9794637763833427e-06, + "loss": 8.8967, + "step": 
352 + }, + { + "epoch": 0.060414170802669864, + "grad_norm": 21.437868118286133, + "learning_rate": 1.9851682829435257e-06, + "loss": 9.8243, + "step": 353 + }, + { + "epoch": 0.0605853157624508, + "grad_norm": 30.177589416503906, + "learning_rate": 1.9908727895037078e-06, + "loss": 9.5401, + "step": 354 + }, + { + "epoch": 0.06075646072223173, + "grad_norm": 12.939532279968262, + "learning_rate": 1.9965772960638903e-06, + "loss": 6.4143, + "step": 355 + }, + { + "epoch": 0.060927605682012664, + "grad_norm": 18.022136688232422, + "learning_rate": 2.0022818026240733e-06, + "loss": 9.6522, + "step": 356 + }, + { + "epoch": 0.0610987506417936, + "grad_norm": 12.483067512512207, + "learning_rate": 2.0079863091842554e-06, + "loss": 9.2254, + "step": 357 + }, + { + "epoch": 0.06126989560157453, + "grad_norm": 19.432615280151367, + "learning_rate": 2.0136908157444383e-06, + "loss": 6.2483, + "step": 358 + }, + { + "epoch": 0.06144104056135547, + "grad_norm": 177.3258819580078, + "learning_rate": 2.019395322304621e-06, + "loss": 9.2975, + "step": 359 + }, + { + "epoch": 0.061612185521136405, + "grad_norm": 14.458636283874512, + "learning_rate": 2.025099828864803e-06, + "loss": 8.5874, + "step": 360 + }, + { + "epoch": 0.06178333048091734, + "grad_norm": 21.112350463867188, + "learning_rate": 2.030804335424986e-06, + "loss": 9.4896, + "step": 361 + }, + { + "epoch": 0.06195447544069827, + "grad_norm": 15.956084251403809, + "learning_rate": 2.0365088419851685e-06, + "loss": 9.3311, + "step": 362 + }, + { + "epoch": 0.062125620400479205, + "grad_norm": 11.96216869354248, + "learning_rate": 2.042213348545351e-06, + "loss": 8.2885, + "step": 363 + }, + { + "epoch": 0.06229676536026014, + "grad_norm": 16.588687896728516, + "learning_rate": 2.0479178551055335e-06, + "loss": 8.5745, + "step": 364 + }, + { + "epoch": 0.06246791032004107, + "grad_norm": 20.95501708984375, + "learning_rate": 2.053622361665716e-06, + "loss": 9.5327, + "step": 365 + }, + { + "epoch": 0.06263905527982201, + "grad_norm": 14.255351066589355, + "learning_rate": 2.0593268682258986e-06, + "loss": 9.1372, + "step": 366 + }, + { + "epoch": 0.06281020023960295, + "grad_norm": 17.529571533203125, + "learning_rate": 2.065031374786081e-06, + "loss": 6.9098, + "step": 367 + }, + { + "epoch": 0.06298134519938388, + "grad_norm": 23.381641387939453, + "learning_rate": 2.0707358813462636e-06, + "loss": 9.4994, + "step": 368 + }, + { + "epoch": 0.06315249015916481, + "grad_norm": 152.30535888671875, + "learning_rate": 2.076440387906446e-06, + "loss": 8.5952, + "step": 369 + }, + { + "epoch": 0.06332363511894575, + "grad_norm": 15.447931289672852, + "learning_rate": 2.0821448944666287e-06, + "loss": 7.1287, + "step": 370 + }, + { + "epoch": 0.06349478007872668, + "grad_norm": 13.553053855895996, + "learning_rate": 2.0878494010268112e-06, + "loss": 8.0622, + "step": 371 + }, + { + "epoch": 0.06366592503850761, + "grad_norm": 13.198517799377441, + "learning_rate": 2.0935539075869938e-06, + "loss": 8.3527, + "step": 372 + }, + { + "epoch": 0.06383706999828855, + "grad_norm": 21.851369857788086, + "learning_rate": 2.0992584141471763e-06, + "loss": 6.7771, + "step": 373 + }, + { + "epoch": 0.06400821495806948, + "grad_norm": 30.56134605407715, + "learning_rate": 2.104962920707359e-06, + "loss": 9.666, + "step": 374 + }, + { + "epoch": 0.06417935991785043, + "grad_norm": 18.76494026184082, + "learning_rate": 2.1106674272675414e-06, + "loss": 9.3941, + "step": 375 + }, + { + "epoch": 0.06435050487763136, + "grad_norm": 19.92658805847168, + 
"learning_rate": 2.116371933827724e-06, + "loss": 9.3741, + "step": 376 + }, + { + "epoch": 0.06452164983741229, + "grad_norm": 10.430363655090332, + "learning_rate": 2.1220764403879064e-06, + "loss": 7.8113, + "step": 377 + }, + { + "epoch": 0.06469279479719323, + "grad_norm": 18.093847274780273, + "learning_rate": 2.1277809469480894e-06, + "loss": 6.1706, + "step": 378 + }, + { + "epoch": 0.06486393975697416, + "grad_norm": 21.807714462280273, + "learning_rate": 2.1334854535082715e-06, + "loss": 9.471, + "step": 379 + }, + { + "epoch": 0.06503508471675509, + "grad_norm": 10.38511848449707, + "learning_rate": 2.139189960068454e-06, + "loss": 4.2784, + "step": 380 + }, + { + "epoch": 0.06520622967653603, + "grad_norm": 18.564613342285156, + "learning_rate": 2.144894466628637e-06, + "loss": 9.548, + "step": 381 + }, + { + "epoch": 0.06537737463631696, + "grad_norm": 13.890935897827148, + "learning_rate": 2.150598973188819e-06, + "loss": 7.9354, + "step": 382 + }, + { + "epoch": 0.06554851959609789, + "grad_norm": 18.593252182006836, + "learning_rate": 2.1563034797490016e-06, + "loss": 6.34, + "step": 383 + }, + { + "epoch": 0.06571966455587883, + "grad_norm": 10.455931663513184, + "learning_rate": 2.1620079863091846e-06, + "loss": 6.2716, + "step": 384 + }, + { + "epoch": 0.06589080951565976, + "grad_norm": 21.231943130493164, + "learning_rate": 2.1677124928693667e-06, + "loss": 5.9761, + "step": 385 + }, + { + "epoch": 0.06606195447544069, + "grad_norm": 11.568195343017578, + "learning_rate": 2.173416999429549e-06, + "loss": 4.4776, + "step": 386 + }, + { + "epoch": 0.06623309943522163, + "grad_norm": 23.829204559326172, + "learning_rate": 2.179121505989732e-06, + "loss": 9.648, + "step": 387 + }, + { + "epoch": 0.06640424439500257, + "grad_norm": 10.398987770080566, + "learning_rate": 2.1848260125499147e-06, + "loss": 4.7062, + "step": 388 + }, + { + "epoch": 0.0665753893547835, + "grad_norm": 11.396307945251465, + "learning_rate": 2.190530519110097e-06, + "loss": 8.1087, + "step": 389 + }, + { + "epoch": 0.06674653431456444, + "grad_norm": 18.780866622924805, + "learning_rate": 2.1962350256702798e-06, + "loss": 6.196, + "step": 390 + }, + { + "epoch": 0.06691767927434537, + "grad_norm": 18.36736488342285, + "learning_rate": 2.2019395322304623e-06, + "loss": 6.2459, + "step": 391 + }, + { + "epoch": 0.0670888242341263, + "grad_norm": 18.681446075439453, + "learning_rate": 2.2076440387906444e-06, + "loss": 9.4161, + "step": 392 + }, + { + "epoch": 0.06725996919390724, + "grad_norm": 15.113629341125488, + "learning_rate": 2.2133485453508274e-06, + "loss": 6.3517, + "step": 393 + }, + { + "epoch": 0.06743111415368817, + "grad_norm": 11.273137092590332, + "learning_rate": 2.21905305191101e-06, + "loss": 8.0886, + "step": 394 + }, + { + "epoch": 0.06760225911346911, + "grad_norm": 17.580646514892578, + "learning_rate": 2.224757558471192e-06, + "loss": 6.5059, + "step": 395 + }, + { + "epoch": 0.06777340407325004, + "grad_norm": 15.864416122436523, + "learning_rate": 2.230462065031375e-06, + "loss": 8.5624, + "step": 396 + }, + { + "epoch": 0.06794454903303097, + "grad_norm": 11.407431602478027, + "learning_rate": 2.2361665715915575e-06, + "loss": 7.853, + "step": 397 + }, + { + "epoch": 0.06811569399281191, + "grad_norm": 28.192079544067383, + "learning_rate": 2.24187107815174e-06, + "loss": 9.4467, + "step": 398 + }, + { + "epoch": 0.06828683895259284, + "grad_norm": 19.4180965423584, + "learning_rate": 2.2475755847119225e-06, + "loss": 5.6605, + "step": 399 + }, + { + "epoch": 
0.06845798391237377, + "grad_norm": 19.75929069519043, + "learning_rate": 2.253280091272105e-06, + "loss": 9.4512, + "step": 400 + }, + { + "epoch": 0.06862912887215472, + "grad_norm": 10.311906814575195, + "learning_rate": 2.2589845978322876e-06, + "loss": 7.9644, + "step": 401 + }, + { + "epoch": 0.06880027383193565, + "grad_norm": 20.4741268157959, + "learning_rate": 2.26468910439247e-06, + "loss": 9.4577, + "step": 402 + }, + { + "epoch": 0.06897141879171659, + "grad_norm": 25.65606117248535, + "learning_rate": 2.2703936109526527e-06, + "loss": 9.6371, + "step": 403 + }, + { + "epoch": 0.06914256375149752, + "grad_norm": 26.26441192626953, + "learning_rate": 2.276098117512835e-06, + "loss": 9.5365, + "step": 404 + }, + { + "epoch": 0.06931370871127845, + "grad_norm": 14.249612808227539, + "learning_rate": 2.2818026240730177e-06, + "loss": 8.5897, + "step": 405 + }, + { + "epoch": 0.06948485367105939, + "grad_norm": 17.306989669799805, + "learning_rate": 2.2875071306332003e-06, + "loss": 6.9065, + "step": 406 + }, + { + "epoch": 0.06965599863084032, + "grad_norm": 10.925597190856934, + "learning_rate": 2.293211637193383e-06, + "loss": 4.4132, + "step": 407 + }, + { + "epoch": 0.06982714359062125, + "grad_norm": 20.995426177978516, + "learning_rate": 2.2989161437535653e-06, + "loss": 9.6018, + "step": 408 + }, + { + "epoch": 0.06999828855040219, + "grad_norm": 13.343510627746582, + "learning_rate": 2.304620650313748e-06, + "loss": 8.3354, + "step": 409 + }, + { + "epoch": 0.07016943351018312, + "grad_norm": 21.461809158325195, + "learning_rate": 2.3103251568739304e-06, + "loss": 9.3101, + "step": 410 + }, + { + "epoch": 0.07034057846996405, + "grad_norm": 25.428903579711914, + "learning_rate": 2.316029663434113e-06, + "loss": 9.4155, + "step": 411 + }, + { + "epoch": 0.07051172342974499, + "grad_norm": 22.469390869140625, + "learning_rate": 2.3217341699942955e-06, + "loss": 6.4331, + "step": 412 + }, + { + "epoch": 0.07068286838952594, + "grad_norm": 157.02752685546875, + "learning_rate": 2.3274386765544784e-06, + "loss": 7.6313, + "step": 413 + }, + { + "epoch": 0.07085401334930687, + "grad_norm": 12.20741081237793, + "learning_rate": 2.3331431831146605e-06, + "loss": 4.2273, + "step": 414 + }, + { + "epoch": 0.0710251583090878, + "grad_norm": 19.81876564025879, + "learning_rate": 2.338847689674843e-06, + "loss": 9.5364, + "step": 415 + }, + { + "epoch": 0.07119630326886874, + "grad_norm": 17.362276077270508, + "learning_rate": 2.344552196235026e-06, + "loss": 9.4605, + "step": 416 + }, + { + "epoch": 0.07136744822864967, + "grad_norm": 22.898147583007812, + "learning_rate": 2.350256702795208e-06, + "loss": 9.5846, + "step": 417 + }, + { + "epoch": 0.0715385931884306, + "grad_norm": 17.685535430908203, + "learning_rate": 2.3559612093553906e-06, + "loss": 8.0604, + "step": 418 + }, + { + "epoch": 0.07170973814821154, + "grad_norm": 16.97225570678711, + "learning_rate": 2.3616657159155736e-06, + "loss": 9.0822, + "step": 419 + }, + { + "epoch": 0.07188088310799247, + "grad_norm": 21.690431594848633, + "learning_rate": 2.3673702224757557e-06, + "loss": 5.9587, + "step": 420 + }, + { + "epoch": 0.0720520280677734, + "grad_norm": 20.209810256958008, + "learning_rate": 2.3730747290359387e-06, + "loss": 6.1507, + "step": 421 + }, + { + "epoch": 0.07222317302755434, + "grad_norm": 19.15233039855957, + "learning_rate": 2.378779235596121e-06, + "loss": 9.5222, + "step": 422 + }, + { + "epoch": 0.07239431798733527, + "grad_norm": 15.19393539428711, + "learning_rate": 2.3844837421563033e-06, + 
"loss": 8.3063, + "step": 423 + }, + { + "epoch": 0.0725654629471162, + "grad_norm": 14.138923645019531, + "learning_rate": 2.3901882487164863e-06, + "loss": 8.3802, + "step": 424 + }, + { + "epoch": 0.07273660790689714, + "grad_norm": 23.83425521850586, + "learning_rate": 2.395892755276669e-06, + "loss": 9.554, + "step": 425 + }, + { + "epoch": 0.07290775286667808, + "grad_norm": 19.778850555419922, + "learning_rate": 2.401597261836851e-06, + "loss": 6.0866, + "step": 426 + }, + { + "epoch": 0.07307889782645902, + "grad_norm": 12.418360710144043, + "learning_rate": 2.407301768397034e-06, + "loss": 7.7723, + "step": 427 + }, + { + "epoch": 0.07325004278623995, + "grad_norm": 21.105587005615234, + "learning_rate": 2.4130062749572164e-06, + "loss": 6.011, + "step": 428 + }, + { + "epoch": 0.07342118774602088, + "grad_norm": 18.78055763244629, + "learning_rate": 2.4187107815173985e-06, + "loss": 6.4389, + "step": 429 + }, + { + "epoch": 0.07359233270580182, + "grad_norm": 17.227916717529297, + "learning_rate": 2.4244152880775814e-06, + "loss": 6.6973, + "step": 430 + }, + { + "epoch": 0.07376347766558275, + "grad_norm": 21.845876693725586, + "learning_rate": 2.430119794637764e-06, + "loss": 5.9158, + "step": 431 + }, + { + "epoch": 0.07393462262536368, + "grad_norm": 14.355096817016602, + "learning_rate": 2.435824301197946e-06, + "loss": 8.6576, + "step": 432 + }, + { + "epoch": 0.07410576758514462, + "grad_norm": 149.28054809570312, + "learning_rate": 2.441528807758129e-06, + "loss": 7.7649, + "step": 433 + }, + { + "epoch": 0.07427691254492555, + "grad_norm": 18.152389526367188, + "learning_rate": 2.4472333143183116e-06, + "loss": 6.6434, + "step": 434 + }, + { + "epoch": 0.07444805750470648, + "grad_norm": 17.05584716796875, + "learning_rate": 2.4529378208784937e-06, + "loss": 9.1462, + "step": 435 + }, + { + "epoch": 0.07461920246448742, + "grad_norm": 11.82278060913086, + "learning_rate": 2.4586423274386766e-06, + "loss": 8.2832, + "step": 436 + }, + { + "epoch": 0.07479034742426835, + "grad_norm": 17.951648712158203, + "learning_rate": 2.464346833998859e-06, + "loss": 8.4052, + "step": 437 + }, + { + "epoch": 0.07496149238404928, + "grad_norm": 31.258188247680664, + "learning_rate": 2.4700513405590417e-06, + "loss": 9.4477, + "step": 438 + }, + { + "epoch": 0.07513263734383023, + "grad_norm": 138.91761779785156, + "learning_rate": 2.4757558471192242e-06, + "loss": 8.3869, + "step": 439 + }, + { + "epoch": 0.07530378230361116, + "grad_norm": 17.930551528930664, + "learning_rate": 2.4814603536794068e-06, + "loss": 9.1768, + "step": 440 + }, + { + "epoch": 0.0754749272633921, + "grad_norm": 10.999883651733398, + "learning_rate": 2.4871648602395897e-06, + "loss": 4.1341, + "step": 441 + }, + { + "epoch": 0.07564607222317303, + "grad_norm": 19.707490921020508, + "learning_rate": 2.492869366799772e-06, + "loss": 6.0241, + "step": 442 + }, + { + "epoch": 0.07581721718295396, + "grad_norm": 19.63069725036621, + "learning_rate": 2.4985738733599544e-06, + "loss": 9.5659, + "step": 443 + }, + { + "epoch": 0.0759883621427349, + "grad_norm": 19.783658981323242, + "learning_rate": 2.5042783799201373e-06, + "loss": 6.632, + "step": 444 + }, + { + "epoch": 0.07615950710251583, + "grad_norm": 11.193924903869629, + "learning_rate": 2.5099828864803194e-06, + "loss": 4.213, + "step": 445 + }, + { + "epoch": 0.07633065206229676, + "grad_norm": 65.09992218017578, + "learning_rate": 2.515687393040502e-06, + "loss": 13.1721, + "step": 446 + }, + { + "epoch": 0.0765017970220777, + "grad_norm": 
19.081214904785156, + "learning_rate": 2.521391899600685e-06, + "loss": 8.7605, + "step": 447 + }, + { + "epoch": 0.07667294198185863, + "grad_norm": 17.08602523803711, + "learning_rate": 2.527096406160867e-06, + "loss": 8.4352, + "step": 448 + }, + { + "epoch": 0.07684408694163956, + "grad_norm": 11.796391487121582, + "learning_rate": 2.5328009127210495e-06, + "loss": 7.9838, + "step": 449 + }, + { + "epoch": 0.0770152319014205, + "grad_norm": 17.306316375732422, + "learning_rate": 2.5385054192812325e-06, + "loss": 7.9123, + "step": 450 + }, + { + "epoch": 0.07718637686120144, + "grad_norm": 11.991724014282227, + "learning_rate": 2.5442099258414146e-06, + "loss": 7.904, + "step": 451 + }, + { + "epoch": 0.07735752182098238, + "grad_norm": 18.394563674926758, + "learning_rate": 2.549914432401597e-06, + "loss": 6.7541, + "step": 452 + }, + { + "epoch": 0.07752866678076331, + "grad_norm": 21.436811447143555, + "learning_rate": 2.55561893896178e-06, + "loss": 5.5488, + "step": 453 + }, + { + "epoch": 0.07769981174054424, + "grad_norm": 15.822162628173828, + "learning_rate": 2.561323445521962e-06, + "loss": 7.7392, + "step": 454 + }, + { + "epoch": 0.07787095670032518, + "grad_norm": 19.68645668029785, + "learning_rate": 2.5670279520821447e-06, + "loss": 6.6529, + "step": 455 + }, + { + "epoch": 0.07804210166010611, + "grad_norm": 18.808198928833008, + "learning_rate": 2.5727324586423277e-06, + "loss": 8.784, + "step": 456 + }, + { + "epoch": 0.07821324661988704, + "grad_norm": 131.1753692626953, + "learning_rate": 2.57843696520251e-06, + "loss": 7.8706, + "step": 457 + }, + { + "epoch": 0.07838439157966798, + "grad_norm": 11.708639144897461, + "learning_rate": 2.5841414717626923e-06, + "loss": 7.7402, + "step": 458 + }, + { + "epoch": 0.07855553653944891, + "grad_norm": 15.965631484985352, + "learning_rate": 2.5898459783228753e-06, + "loss": 8.301, + "step": 459 + }, + { + "epoch": 0.07872668149922984, + "grad_norm": 14.710309982299805, + "learning_rate": 2.5955504848830574e-06, + "loss": 7.9566, + "step": 460 + }, + { + "epoch": 0.07889782645901078, + "grad_norm": 15.00783634185791, + "learning_rate": 2.6012549914432404e-06, + "loss": 8.488, + "step": 461 + }, + { + "epoch": 0.07906897141879171, + "grad_norm": 13.231627464294434, + "learning_rate": 2.606959498003423e-06, + "loss": 8.1184, + "step": 462 + }, + { + "epoch": 0.07924011637857264, + "grad_norm": 170.4566192626953, + "learning_rate": 2.6126640045636054e-06, + "loss": 8.1805, + "step": 463 + }, + { + "epoch": 0.07941126133835359, + "grad_norm": 23.66990089416504, + "learning_rate": 2.618368511123788e-06, + "loss": 9.2852, + "step": 464 + }, + { + "epoch": 0.07958240629813453, + "grad_norm": 20.218496322631836, + "learning_rate": 2.6240730176839705e-06, + "loss": 6.264, + "step": 465 + }, + { + "epoch": 0.07975355125791546, + "grad_norm": 27.905323028564453, + "learning_rate": 2.629777524244153e-06, + "loss": 10.0002, + "step": 466 + }, + { + "epoch": 0.07992469621769639, + "grad_norm": 22.043649673461914, + "learning_rate": 2.6354820308043355e-06, + "loss": 8.6303, + "step": 467 + }, + { + "epoch": 0.08009584117747733, + "grad_norm": 20.095890045166016, + "learning_rate": 2.641186537364518e-06, + "loss": 8.7857, + "step": 468 + }, + { + "epoch": 0.08026698613725826, + "grad_norm": 30.715435028076172, + "learning_rate": 2.6468910439247006e-06, + "loss": 9.6486, + "step": 469 + }, + { + "epoch": 0.08043813109703919, + "grad_norm": 18.83611488342285, + "learning_rate": 2.652595550484883e-06, + "loss": 7.9544, + "step": 470 + }, + { + 
"epoch": 0.08060927605682013, + "grad_norm": 20.929931640625, + "learning_rate": 2.6583000570450657e-06, + "loss": 6.2772, + "step": 471 + }, + { + "epoch": 0.08078042101660106, + "grad_norm": 18.414594650268555, + "learning_rate": 2.664004563605248e-06, + "loss": 6.1477, + "step": 472 + }, + { + "epoch": 0.08095156597638199, + "grad_norm": 18.188846588134766, + "learning_rate": 2.6697090701654307e-06, + "loss": 7.099, + "step": 473 + }, + { + "epoch": 0.08112271093616293, + "grad_norm": 8.666217803955078, + "learning_rate": 2.6754135767256133e-06, + "loss": 5.0929, + "step": 474 + }, + { + "epoch": 0.08129385589594386, + "grad_norm": 15.457167625427246, + "learning_rate": 2.681118083285796e-06, + "loss": 7.8706, + "step": 475 + }, + { + "epoch": 0.08146500085572479, + "grad_norm": 17.11892318725586, + "learning_rate": 2.6868225898459783e-06, + "loss": 8.5293, + "step": 476 + }, + { + "epoch": 0.08163614581550574, + "grad_norm": 28.18759536743164, + "learning_rate": 2.692527096406161e-06, + "loss": 5.7448, + "step": 477 + }, + { + "epoch": 0.08180729077528667, + "grad_norm": 19.842830657958984, + "learning_rate": 2.6982316029663434e-06, + "loss": 8.5854, + "step": 478 + }, + { + "epoch": 0.0819784357350676, + "grad_norm": 59.76820373535156, + "learning_rate": 2.703936109526526e-06, + "loss": 12.4879, + "step": 479 + }, + { + "epoch": 0.08214958069484854, + "grad_norm": 15.530830383300781, + "learning_rate": 2.7096406160867085e-06, + "loss": 8.191, + "step": 480 + }, + { + "epoch": 0.08232072565462947, + "grad_norm": 21.211435317993164, + "learning_rate": 2.7153451226468914e-06, + "loss": 9.4326, + "step": 481 + }, + { + "epoch": 0.0824918706144104, + "grad_norm": 16.38536834716797, + "learning_rate": 2.7210496292070735e-06, + "loss": 6.3342, + "step": 482 + }, + { + "epoch": 0.08266301557419134, + "grad_norm": 30.17742919921875, + "learning_rate": 2.726754135767256e-06, + "loss": 5.5068, + "step": 483 + }, + { + "epoch": 0.08283416053397227, + "grad_norm": 27.44713020324707, + "learning_rate": 2.732458642327439e-06, + "loss": 9.4586, + "step": 484 + }, + { + "epoch": 0.0830053054937532, + "grad_norm": 59.46120071411133, + "learning_rate": 2.738163148887621e-06, + "loss": 12.3889, + "step": 485 + }, + { + "epoch": 0.08317645045353414, + "grad_norm": 26.801589965820312, + "learning_rate": 2.7438676554478036e-06, + "loss": 5.3141, + "step": 486 + }, + { + "epoch": 0.08334759541331507, + "grad_norm": 32.20411682128906, + "learning_rate": 2.7495721620079866e-06, + "loss": 5.4274, + "step": 487 + }, + { + "epoch": 0.083518740373096, + "grad_norm": 16.14412498474121, + "learning_rate": 2.755276668568169e-06, + "loss": 8.3447, + "step": 488 + }, + { + "epoch": 0.08368988533287694, + "grad_norm": 16.79600715637207, + "learning_rate": 2.7609811751283512e-06, + "loss": 7.7737, + "step": 489 + }, + { + "epoch": 0.08386103029265789, + "grad_norm": 171.59872436523438, + "learning_rate": 2.766685681688534e-06, + "loss": 8.277, + "step": 490 + }, + { + "epoch": 0.08403217525243882, + "grad_norm": 29.80289649963379, + "learning_rate": 2.7723901882487167e-06, + "loss": 5.273, + "step": 491 + }, + { + "epoch": 0.08420332021221975, + "grad_norm": 15.38176155090332, + "learning_rate": 2.778094694808899e-06, + "loss": 7.8611, + "step": 492 + }, + { + "epoch": 0.08437446517200069, + "grad_norm": 19.766082763671875, + "learning_rate": 2.783799201369082e-06, + "loss": 7.7926, + "step": 493 + }, + { + "epoch": 0.08454561013178162, + "grad_norm": 13.274962425231934, + "learning_rate": 2.7895037079292643e-06, + 
"loss": 4.1215, + "step": 494 + }, + { + "epoch": 0.08471675509156255, + "grad_norm": 29.015403747558594, + "learning_rate": 2.7952082144894464e-06, + "loss": 5.4146, + "step": 495 + }, + { + "epoch": 0.08488790005134349, + "grad_norm": 22.243703842163086, + "learning_rate": 2.8009127210496294e-06, + "loss": 5.753, + "step": 496 + }, + { + "epoch": 0.08505904501112442, + "grad_norm": 23.75475311279297, + "learning_rate": 2.806617227609812e-06, + "loss": 5.7119, + "step": 497 + }, + { + "epoch": 0.08523018997090535, + "grad_norm": 19.524032592773438, + "learning_rate": 2.812321734169994e-06, + "loss": 8.9719, + "step": 498 + }, + { + "epoch": 0.08540133493068629, + "grad_norm": 22.207155227661133, + "learning_rate": 2.818026240730177e-06, + "loss": 8.5433, + "step": 499 + }, + { + "epoch": 0.08557247989046722, + "grad_norm": 20.369564056396484, + "learning_rate": 2.8237307472903595e-06, + "loss": 9.2212, + "step": 500 + }, + { + "epoch": 0.08574362485024815, + "grad_norm": 12.617632865905762, + "learning_rate": 2.829435253850542e-06, + "loss": 7.5878, + "step": 501 + }, + { + "epoch": 0.0859147698100291, + "grad_norm": 16.92389678955078, + "learning_rate": 2.8351397604107246e-06, + "loss": 8.1394, + "step": 502 + }, + { + "epoch": 0.08608591476981003, + "grad_norm": 52.22781753540039, + "learning_rate": 2.840844266970907e-06, + "loss": 11.9304, + "step": 503 + }, + { + "epoch": 0.08625705972959097, + "grad_norm": 19.299196243286133, + "learning_rate": 2.8465487735310896e-06, + "loss": 7.5487, + "step": 504 + }, + { + "epoch": 0.0864282046893719, + "grad_norm": 25.007366180419922, + "learning_rate": 2.852253280091272e-06, + "loss": 6.4953, + "step": 505 + }, + { + "epoch": 0.08659934964915283, + "grad_norm": 44.58477020263672, + "learning_rate": 2.8579577866514547e-06, + "loss": 11.543, + "step": 506 + }, + { + "epoch": 0.08677049460893377, + "grad_norm": 18.95302963256836, + "learning_rate": 2.8636622932116372e-06, + "loss": 7.7713, + "step": 507 + }, + { + "epoch": 0.0869416395687147, + "grad_norm": 15.56648063659668, + "learning_rate": 2.8693667997718198e-06, + "loss": 8.5567, + "step": 508 + }, + { + "epoch": 0.08711278452849563, + "grad_norm": 20.78284454345703, + "learning_rate": 2.8750713063320023e-06, + "loss": 7.7135, + "step": 509 + }, + { + "epoch": 0.08728392948827657, + "grad_norm": 23.176607131958008, + "learning_rate": 2.880775812892185e-06, + "loss": 8.2685, + "step": 510 + }, + { + "epoch": 0.0874550744480575, + "grad_norm": 25.212718963623047, + "learning_rate": 2.8864803194523674e-06, + "loss": 8.9983, + "step": 511 + }, + { + "epoch": 0.08762621940783843, + "grad_norm": 27.220836639404297, + "learning_rate": 2.89218482601255e-06, + "loss": 6.6334, + "step": 512 + }, + { + "epoch": 0.08779736436761937, + "grad_norm": 13.128168106079102, + "learning_rate": 2.897889332572733e-06, + "loss": 3.847, + "step": 513 + }, + { + "epoch": 0.0879685093274003, + "grad_norm": 19.84160614013672, + "learning_rate": 2.903593839132915e-06, + "loss": 8.0045, + "step": 514 + }, + { + "epoch": 0.08813965428718125, + "grad_norm": 15.77076530456543, + "learning_rate": 2.9092983456930975e-06, + "loss": 7.8019, + "step": 515 + }, + { + "epoch": 0.08831079924696218, + "grad_norm": 158.41465759277344, + "learning_rate": 2.9150028522532804e-06, + "loss": 8.6448, + "step": 516 + }, + { + "epoch": 0.08848194420674312, + "grad_norm": 23.563339233398438, + "learning_rate": 2.9207073588134625e-06, + "loss": 8.8163, + "step": 517 + }, + { + "epoch": 0.08865308916652405, + "grad_norm": 30.82549476623535, + 
"learning_rate": 2.926411865373645e-06, + "loss": 8.627, + "step": 518 + }, + { + "epoch": 0.08882423412630498, + "grad_norm": 24.138612747192383, + "learning_rate": 2.932116371933828e-06, + "loss": 6.2112, + "step": 519 + }, + { + "epoch": 0.08899537908608592, + "grad_norm": 42.6961784362793, + "learning_rate": 2.93782087849401e-06, + "loss": 11.5101, + "step": 520 + }, + { + "epoch": 0.08916652404586685, + "grad_norm": 16.58330726623535, + "learning_rate": 2.943525385054193e-06, + "loss": 7.885, + "step": 521 + }, + { + "epoch": 0.08933766900564778, + "grad_norm": 17.490467071533203, + "learning_rate": 2.9492298916143756e-06, + "loss": 7.6631, + "step": 522 + }, + { + "epoch": 0.08950881396542872, + "grad_norm": 24.303665161132812, + "learning_rate": 2.9549343981745577e-06, + "loss": 8.5512, + "step": 523 + }, + { + "epoch": 0.08967995892520965, + "grad_norm": 14.5447416305542, + "learning_rate": 2.9606389047347407e-06, + "loss": 3.9844, + "step": 524 + }, + { + "epoch": 0.08985110388499058, + "grad_norm": 28.421756744384766, + "learning_rate": 2.9663434112949232e-06, + "loss": 9.2179, + "step": 525 + }, + { + "epoch": 0.09002224884477152, + "grad_norm": 20.097034454345703, + "learning_rate": 2.9720479178551053e-06, + "loss": 9.2422, + "step": 526 + }, + { + "epoch": 0.09019339380455245, + "grad_norm": 20.862869262695312, + "learning_rate": 2.9777524244152883e-06, + "loss": 8.2303, + "step": 527 + }, + { + "epoch": 0.0903645387643334, + "grad_norm": 30.980390548706055, + "learning_rate": 2.983456930975471e-06, + "loss": 9.1253, + "step": 528 + }, + { + "epoch": 0.09053568372411433, + "grad_norm": 29.973567962646484, + "learning_rate": 2.989161437535653e-06, + "loss": 4.8928, + "step": 529 + }, + { + "epoch": 0.09070682868389526, + "grad_norm": 35.399349212646484, + "learning_rate": 2.994865944095836e-06, + "loss": 4.5559, + "step": 530 + }, + { + "epoch": 0.0908779736436762, + "grad_norm": 21.178098678588867, + "learning_rate": 3.0005704506560184e-06, + "loss": 8.8876, + "step": 531 + }, + { + "epoch": 0.09104911860345713, + "grad_norm": 24.755205154418945, + "learning_rate": 3.0062749572162005e-06, + "loss": 7.5809, + "step": 532 + }, + { + "epoch": 0.09122026356323806, + "grad_norm": 23.76934051513672, + "learning_rate": 3.0119794637763835e-06, + "loss": 8.5706, + "step": 533 + }, + { + "epoch": 0.091391408523019, + "grad_norm": 40.431190490722656, + "learning_rate": 3.017683970336566e-06, + "loss": 11.3342, + "step": 534 + }, + { + "epoch": 0.09156255348279993, + "grad_norm": 22.674354553222656, + "learning_rate": 3.023388476896748e-06, + "loss": 8.8756, + "step": 535 + }, + { + "epoch": 0.09173369844258086, + "grad_norm": 33.92606735229492, + "learning_rate": 3.029092983456931e-06, + "loss": 4.738, + "step": 536 + }, + { + "epoch": 0.0919048434023618, + "grad_norm": 27.1170711517334, + "learning_rate": 3.0347974900171136e-06, + "loss": 6.4223, + "step": 537 + }, + { + "epoch": 0.09207598836214273, + "grad_norm": 25.11066246032715, + "learning_rate": 3.040501996577296e-06, + "loss": 9.0946, + "step": 538 + }, + { + "epoch": 0.09224713332192366, + "grad_norm": 23.894901275634766, + "learning_rate": 3.0462065031374787e-06, + "loss": 6.3395, + "step": 539 + }, + { + "epoch": 0.09241827828170461, + "grad_norm": 20.199861526489258, + "learning_rate": 3.051911009697661e-06, + "loss": 6.0524, + "step": 540 + }, + { + "epoch": 0.09258942324148554, + "grad_norm": 22.757362365722656, + "learning_rate": 3.057615516257844e-06, + "loss": 7.1293, + "step": 541 + }, + { + "epoch": 
0.09276056820126648, + "grad_norm": 22.62543487548828, + "learning_rate": 3.0633200228180263e-06, + "loss": 7.5053, + "step": 542 + }, + { + "epoch": 0.09293171316104741, + "grad_norm": 16.598411560058594, + "learning_rate": 3.069024529378209e-06, + "loss": 7.8322, + "step": 543 + }, + { + "epoch": 0.09310285812082834, + "grad_norm": 20.656627655029297, + "learning_rate": 3.0747290359383917e-06, + "loss": 8.8013, + "step": 544 + }, + { + "epoch": 0.09327400308060928, + "grad_norm": 20.95423126220703, + "learning_rate": 3.080433542498574e-06, + "loss": 6.1923, + "step": 545 + }, + { + "epoch": 0.09344514804039021, + "grad_norm": 175.26722717285156, + "learning_rate": 3.0861380490587564e-06, + "loss": 10.2252, + "step": 546 + }, + { + "epoch": 0.09361629300017114, + "grad_norm": 21.737558364868164, + "learning_rate": 3.0918425556189393e-06, + "loss": 7.7486, + "step": 547 + }, + { + "epoch": 0.09378743795995208, + "grad_norm": 41.67558288574219, + "learning_rate": 3.0975470621791215e-06, + "loss": 11.1347, + "step": 548 + }, + { + "epoch": 0.09395858291973301, + "grad_norm": 24.20724868774414, + "learning_rate": 3.103251568739304e-06, + "loss": 8.1228, + "step": 549 + }, + { + "epoch": 0.09412972787951394, + "grad_norm": 23.995750427246094, + "learning_rate": 3.108956075299487e-06, + "loss": 8.1871, + "step": 550 + }, + { + "epoch": 0.09430087283929488, + "grad_norm": 18.58646583557129, + "learning_rate": 3.114660581859669e-06, + "loss": 7.4311, + "step": 551 + }, + { + "epoch": 0.09447201779907581, + "grad_norm": 26.01420021057129, + "learning_rate": 3.1203650884198516e-06, + "loss": 9.3426, + "step": 552 + }, + { + "epoch": 0.09464316275885676, + "grad_norm": 18.335588455200195, + "learning_rate": 3.1260695949800345e-06, + "loss": 8.9575, + "step": 553 + }, + { + "epoch": 0.09481430771863769, + "grad_norm": 21.414621353149414, + "learning_rate": 3.1317741015402166e-06, + "loss": 7.744, + "step": 554 + }, + { + "epoch": 0.09498545267841862, + "grad_norm": 15.28297233581543, + "learning_rate": 3.137478608100399e-06, + "loss": 8.0683, + "step": 555 + }, + { + "epoch": 0.09515659763819956, + "grad_norm": 20.182992935180664, + "learning_rate": 3.143183114660582e-06, + "loss": 7.5161, + "step": 556 + }, + { + "epoch": 0.09532774259798049, + "grad_norm": 22.94892120361328, + "learning_rate": 3.1488876212207642e-06, + "loss": 7.728, + "step": 557 + }, + { + "epoch": 0.09549888755776142, + "grad_norm": 16.93927764892578, + "learning_rate": 3.1545921277809468e-06, + "loss": 7.7731, + "step": 558 + }, + { + "epoch": 0.09567003251754236, + "grad_norm": 21.27535629272461, + "learning_rate": 3.1602966343411297e-06, + "loss": 8.7629, + "step": 559 + }, + { + "epoch": 0.09584117747732329, + "grad_norm": 20.056377410888672, + "learning_rate": 3.166001140901312e-06, + "loss": 7.3943, + "step": 560 + }, + { + "epoch": 0.09601232243710422, + "grad_norm": 37.84750747680664, + "learning_rate": 3.1717056474614948e-06, + "loss": 10.5613, + "step": 561 + }, + { + "epoch": 0.09618346739688516, + "grad_norm": 19.577177047729492, + "learning_rate": 3.1774101540216773e-06, + "loss": 7.6761, + "step": 562 + }, + { + "epoch": 0.09635461235666609, + "grad_norm": 22.209712982177734, + "learning_rate": 3.18311466058186e-06, + "loss": 6.9584, + "step": 563 + }, + { + "epoch": 0.09652575731644703, + "grad_norm": 25.258302688598633, + "learning_rate": 3.1888191671420424e-06, + "loss": 8.0565, + "step": 564 + }, + { + "epoch": 0.09669690227622796, + "grad_norm": 15.993329048156738, + "learning_rate": 3.194523673702225e-06, + 
"loss": 4.2519, + "step": 565 + }, + { + "epoch": 0.0968680472360089, + "grad_norm": 18.609046936035156, + "learning_rate": 3.2002281802624074e-06, + "loss": 8.0901, + "step": 566 + }, + { + "epoch": 0.09703919219578984, + "grad_norm": 39.24065017700195, + "learning_rate": 3.20593268682259e-06, + "loss": 4.2716, + "step": 567 + }, + { + "epoch": 0.09721033715557077, + "grad_norm": 158.3350067138672, + "learning_rate": 3.2116371933827725e-06, + "loss": 9.5332, + "step": 568 + }, + { + "epoch": 0.0973814821153517, + "grad_norm": 59.29450607299805, + "learning_rate": 3.217341699942955e-06, + "loss": 10.343, + "step": 569 + }, + { + "epoch": 0.09755262707513264, + "grad_norm": 16.113664627075195, + "learning_rate": 3.2230462065031376e-06, + "loss": 3.4944, + "step": 570 + }, + { + "epoch": 0.09772377203491357, + "grad_norm": 23.105350494384766, + "learning_rate": 3.22875071306332e-06, + "loss": 8.2848, + "step": 571 + }, + { + "epoch": 0.0978949169946945, + "grad_norm": 21.425796508789062, + "learning_rate": 3.2344552196235026e-06, + "loss": 8.7655, + "step": 572 + }, + { + "epoch": 0.09806606195447544, + "grad_norm": 22.587278366088867, + "learning_rate": 3.240159726183685e-06, + "loss": 5.394, + "step": 573 + }, + { + "epoch": 0.09823720691425637, + "grad_norm": 37.69017028808594, + "learning_rate": 3.2458642327438677e-06, + "loss": 9.9902, + "step": 574 + }, + { + "epoch": 0.0984083518740373, + "grad_norm": 25.255393981933594, + "learning_rate": 3.2515687393040502e-06, + "loss": 8.3951, + "step": 575 + }, + { + "epoch": 0.09857949683381824, + "grad_norm": 18.790040969848633, + "learning_rate": 3.2572732458642328e-06, + "loss": 8.2647, + "step": 576 + }, + { + "epoch": 0.09875064179359917, + "grad_norm": 18.215757369995117, + "learning_rate": 3.2629777524244153e-06, + "loss": 7.8027, + "step": 577 + }, + { + "epoch": 0.09892178675338012, + "grad_norm": 17.263294219970703, + "learning_rate": 3.268682258984598e-06, + "loss": 7.7728, + "step": 578 + }, + { + "epoch": 0.09909293171316105, + "grad_norm": 18.384496688842773, + "learning_rate": 3.2743867655447804e-06, + "loss": 7.9191, + "step": 579 + }, + { + "epoch": 0.09926407667294199, + "grad_norm": 32.68930435180664, + "learning_rate": 3.280091272104963e-06, + "loss": 7.8591, + "step": 580 + }, + { + "epoch": 0.09943522163272292, + "grad_norm": 31.514266967773438, + "learning_rate": 3.285795778665146e-06, + "loss": 5.5621, + "step": 581 + }, + { + "epoch": 0.09960636659250385, + "grad_norm": 18.912736892700195, + "learning_rate": 3.291500285225328e-06, + "loss": 7.6952, + "step": 582 + }, + { + "epoch": 0.09977751155228479, + "grad_norm": 37.68309783935547, + "learning_rate": 3.2972047917855105e-06, + "loss": 4.1392, + "step": 583 + }, + { + "epoch": 0.09994865651206572, + "grad_norm": 31.56082534790039, + "learning_rate": 3.3029092983456934e-06, + "loss": 4.3801, + "step": 584 + }, + { + "epoch": 0.10011980147184665, + "grad_norm": 24.57911491394043, + "learning_rate": 3.3086138049058755e-06, + "loss": 8.6316, + "step": 585 + }, + { + "epoch": 0.10029094643162759, + "grad_norm": 23.80208396911621, + "learning_rate": 3.314318311466058e-06, + "loss": 7.9077, + "step": 586 + }, + { + "epoch": 0.10046209139140852, + "grad_norm": 16.849803924560547, + "learning_rate": 3.320022818026241e-06, + "loss": 7.6992, + "step": 587 + }, + { + "epoch": 0.10063323635118945, + "grad_norm": 18.981300354003906, + "learning_rate": 3.3257273245864236e-06, + "loss": 8.6023, + "step": 588 + }, + { + "epoch": 0.10080438131097039, + "grad_norm": 24.398378372192383, + 
"learning_rate": 3.3314318311466057e-06, + "loss": 7.0319, + "step": 589 + }, + { + "epoch": 0.10097552627075132, + "grad_norm": 14.639533996582031, + "learning_rate": 3.3371363377067886e-06, + "loss": 4.7698, + "step": 590 + }, + { + "epoch": 0.10114667123053227, + "grad_norm": 25.046255111694336, + "learning_rate": 3.342840844266971e-06, + "loss": 7.1782, + "step": 591 + }, + { + "epoch": 0.1013178161903132, + "grad_norm": 20.012542724609375, + "learning_rate": 3.3485453508271533e-06, + "loss": 7.3167, + "step": 592 + }, + { + "epoch": 0.10148896115009413, + "grad_norm": 27.766891479492188, + "learning_rate": 3.3542498573873362e-06, + "loss": 4.6521, + "step": 593 + }, + { + "epoch": 0.10166010610987507, + "grad_norm": 30.79694175720215, + "learning_rate": 3.3599543639475188e-06, + "loss": 5.1196, + "step": 594 + }, + { + "epoch": 0.101831251069656, + "grad_norm": 24.9854736328125, + "learning_rate": 3.365658870507701e-06, + "loss": 7.8056, + "step": 595 + }, + { + "epoch": 0.10200239602943693, + "grad_norm": 30.63117218017578, + "learning_rate": 3.371363377067884e-06, + "loss": 4.7903, + "step": 596 + }, + { + "epoch": 0.10217354098921787, + "grad_norm": 34.852256774902344, + "learning_rate": 3.3770678836280663e-06, + "loss": 4.1994, + "step": 597 + }, + { + "epoch": 0.1023446859489988, + "grad_norm": 26.979557037353516, + "learning_rate": 3.3827723901882485e-06, + "loss": 8.5955, + "step": 598 + }, + { + "epoch": 0.10251583090877973, + "grad_norm": 21.797626495361328, + "learning_rate": 3.3884768967484314e-06, + "loss": 7.5743, + "step": 599 + }, + { + "epoch": 0.10268697586856067, + "grad_norm": 37.774139404296875, + "learning_rate": 3.394181403308614e-06, + "loss": 6.7935, + "step": 600 + }, + { + "epoch": 0.1028581208283416, + "grad_norm": 27.917823791503906, + "learning_rate": 3.3998859098687965e-06, + "loss": 7.9018, + "step": 601 + }, + { + "epoch": 0.10302926578812253, + "grad_norm": 28.479934692382812, + "learning_rate": 3.405590416428979e-06, + "loss": 4.566, + "step": 602 + }, + { + "epoch": 0.10320041074790347, + "grad_norm": 33.35675811767578, + "learning_rate": 3.4112949229891615e-06, + "loss": 4.1986, + "step": 603 + }, + { + "epoch": 0.10337155570768441, + "grad_norm": 17.17736053466797, + "learning_rate": 3.416999429549344e-06, + "loss": 4.4831, + "step": 604 + }, + { + "epoch": 0.10354270066746535, + "grad_norm": 33.52507781982422, + "learning_rate": 3.4227039361095266e-06, + "loss": 5.2033, + "step": 605 + }, + { + "epoch": 0.10371384562724628, + "grad_norm": 38.001678466796875, + "learning_rate": 3.428408442669709e-06, + "loss": 4.2796, + "step": 606 + }, + { + "epoch": 0.10388499058702722, + "grad_norm": 27.487375259399414, + "learning_rate": 3.4341129492298917e-06, + "loss": 4.3677, + "step": 607 + }, + { + "epoch": 0.10405613554680815, + "grad_norm": 43.33926010131836, + "learning_rate": 3.439817455790074e-06, + "loss": 8.1044, + "step": 608 + }, + { + "epoch": 0.10422728050658908, + "grad_norm": 19.231143951416016, + "learning_rate": 3.4455219623502567e-06, + "loss": 3.8195, + "step": 609 + }, + { + "epoch": 0.10439842546637002, + "grad_norm": 51.54021453857422, + "learning_rate": 3.4512264689104393e-06, + "loss": 8.4844, + "step": 610 + }, + { + "epoch": 0.10456957042615095, + "grad_norm": 18.752532958984375, + "learning_rate": 3.456930975470622e-06, + "loss": 4.7959, + "step": 611 + }, + { + "epoch": 0.10474071538593188, + "grad_norm": 31.644916534423828, + "learning_rate": 3.4626354820308043e-06, + "loss": 3.5838, + "step": 612 + }, + { + "epoch": 
0.10491186034571282, + "grad_norm": 29.887203216552734, + "learning_rate": 3.4683399885909873e-06, + "loss": 8.1693, + "step": 613 + }, + { + "epoch": 0.10508300530549375, + "grad_norm": 26.583890914916992, + "learning_rate": 3.4740444951511694e-06, + "loss": 6.6237, + "step": 614 + }, + { + "epoch": 0.10525415026527468, + "grad_norm": 30.845338821411133, + "learning_rate": 3.479749001711352e-06, + "loss": 3.4824, + "step": 615 + }, + { + "epoch": 0.10542529522505562, + "grad_norm": 38.40910339355469, + "learning_rate": 3.485453508271535e-06, + "loss": 3.9889, + "step": 616 + }, + { + "epoch": 0.10559644018483656, + "grad_norm": 39.16193389892578, + "learning_rate": 3.491158014831717e-06, + "loss": 8.4244, + "step": 617 + }, + { + "epoch": 0.1057675851446175, + "grad_norm": 17.97920036315918, + "learning_rate": 3.4968625213918995e-06, + "loss": 3.2418, + "step": 618 + }, + { + "epoch": 0.10593873010439843, + "grad_norm": 176.26966857910156, + "learning_rate": 3.5025670279520825e-06, + "loss": 7.2639, + "step": 619 + }, + { + "epoch": 0.10610987506417936, + "grad_norm": 31.25491714477539, + "learning_rate": 3.5082715345122646e-06, + "loss": 7.428, + "step": 620 + }, + { + "epoch": 0.1062810200239603, + "grad_norm": 291.8013916015625, + "learning_rate": 3.5139760410724475e-06, + "loss": 12.9756, + "step": 621 + }, + { + "epoch": 0.10645216498374123, + "grad_norm": 34.713497161865234, + "learning_rate": 3.51968054763263e-06, + "loss": 4.2338, + "step": 622 + }, + { + "epoch": 0.10662330994352216, + "grad_norm": 30.151296615600586, + "learning_rate": 3.525385054192812e-06, + "loss": 8.7988, + "step": 623 + }, + { + "epoch": 0.1067944549033031, + "grad_norm": 36.128414154052734, + "learning_rate": 3.531089560752995e-06, + "loss": 8.0574, + "step": 624 + }, + { + "epoch": 0.10696559986308403, + "grad_norm": 15.85501480102539, + "learning_rate": 3.5367940673131777e-06, + "loss": 3.4627, + "step": 625 + }, + { + "epoch": 0.10713674482286496, + "grad_norm": 296.1280212402344, + "learning_rate": 3.5424985738733598e-06, + "loss": 11.8753, + "step": 626 + }, + { + "epoch": 0.1073078897826459, + "grad_norm": 30.480113983154297, + "learning_rate": 3.5482030804335427e-06, + "loss": 6.966, + "step": 627 + }, + { + "epoch": 0.10747903474242683, + "grad_norm": 34.42314529418945, + "learning_rate": 3.5539075869937253e-06, + "loss": 6.8004, + "step": 628 + }, + { + "epoch": 0.10765017970220778, + "grad_norm": 239.69007873535156, + "learning_rate": 3.5596120935539074e-06, + "loss": 10.727, + "step": 629 + }, + { + "epoch": 0.10782132466198871, + "grad_norm": 42.74559783935547, + "learning_rate": 3.5653166001140903e-06, + "loss": 7.948, + "step": 630 + }, + { + "epoch": 0.10799246962176964, + "grad_norm": 24.176240921020508, + "learning_rate": 3.571021106674273e-06, + "loss": 5.4348, + "step": 631 + }, + { + "epoch": 0.10816361458155058, + "grad_norm": 32.64130783081055, + "learning_rate": 3.576725613234455e-06, + "loss": 8.4, + "step": 632 + }, + { + "epoch": 0.10833475954133151, + "grad_norm": 32.354248046875, + "learning_rate": 3.582430119794638e-06, + "loss": 6.4397, + "step": 633 + }, + { + "epoch": 0.10850590450111244, + "grad_norm": 25.767475128173828, + "learning_rate": 3.5881346263548204e-06, + "loss": 5.7136, + "step": 634 + }, + { + "epoch": 0.10867704946089338, + "grad_norm": 28.90591812133789, + "learning_rate": 3.593839132915003e-06, + "loss": 5.9131, + "step": 635 + }, + { + "epoch": 0.10884819442067431, + "grad_norm": 32.62278747558594, + "learning_rate": 3.5995436394751855e-06, + "loss": 
7.6547, + "step": 636 + }, + { + "epoch": 0.10901933938045524, + "grad_norm": 30.387760162353516, + "learning_rate": 3.605248146035368e-06, + "loss": 4.7063, + "step": 637 + }, + { + "epoch": 0.10919048434023618, + "grad_norm": 33.034420013427734, + "learning_rate": 3.6109526525955506e-06, + "loss": 6.8851, + "step": 638 + }, + { + "epoch": 0.10936162930001711, + "grad_norm": 31.42691421508789, + "learning_rate": 3.616657159155733e-06, + "loss": 4.1348, + "step": 639 + }, + { + "epoch": 0.10953277425979804, + "grad_norm": 32.439395904541016, + "learning_rate": 3.6223616657159156e-06, + "loss": 6.3162, + "step": 640 + }, + { + "epoch": 0.10970391921957898, + "grad_norm": 26.49324607849121, + "learning_rate": 3.6280661722760986e-06, + "loss": 5.1818, + "step": 641 + }, + { + "epoch": 0.10987506417935992, + "grad_norm": 25.558427810668945, + "learning_rate": 3.6337706788362807e-06, + "loss": 6.7631, + "step": 642 + }, + { + "epoch": 0.11004620913914086, + "grad_norm": 24.655729293823242, + "learning_rate": 3.6394751853964632e-06, + "loss": 7.4925, + "step": 643 + }, + { + "epoch": 0.11021735409892179, + "grad_norm": 28.129770278930664, + "learning_rate": 3.645179691956646e-06, + "loss": 7.4969, + "step": 644 + }, + { + "epoch": 0.11038849905870272, + "grad_norm": 14.367050170898438, + "learning_rate": 3.6508841985168283e-06, + "loss": 2.9942, + "step": 645 + }, + { + "epoch": 0.11055964401848366, + "grad_norm": 17.681976318359375, + "learning_rate": 3.656588705077011e-06, + "loss": 3.4156, + "step": 646 + }, + { + "epoch": 0.11073078897826459, + "grad_norm": 16.25703239440918, + "learning_rate": 3.6622932116371938e-06, + "loss": 4.095, + "step": 647 + }, + { + "epoch": 0.11090193393804552, + "grad_norm": 26.604623794555664, + "learning_rate": 3.667997718197376e-06, + "loss": 7.7521, + "step": 648 + }, + { + "epoch": 0.11107307889782646, + "grad_norm": 24.250492095947266, + "learning_rate": 3.6737022247575584e-06, + "loss": 5.5501, + "step": 649 + }, + { + "epoch": 0.11124422385760739, + "grad_norm": 31.94316864013672, + "learning_rate": 3.6794067313177414e-06, + "loss": 6.24, + "step": 650 + }, + { + "epoch": 0.11141536881738832, + "grad_norm": 18.14836883544922, + "learning_rate": 3.6851112378779235e-06, + "loss": 3.024, + "step": 651 + }, + { + "epoch": 0.11158651377716926, + "grad_norm": 25.239274978637695, + "learning_rate": 3.690815744438106e-06, + "loss": 7.4699, + "step": 652 + }, + { + "epoch": 0.11175765873695019, + "grad_norm": 66.97354125976562, + "learning_rate": 3.696520250998289e-06, + "loss": 11.3565, + "step": 653 + }, + { + "epoch": 0.11192880369673112, + "grad_norm": 30.029356002807617, + "learning_rate": 3.702224757558471e-06, + "loss": 7.0383, + "step": 654 + }, + { + "epoch": 0.11209994865651207, + "grad_norm": 22.021820068359375, + "learning_rate": 3.7079292641186536e-06, + "loss": 5.1655, + "step": 655 + }, + { + "epoch": 0.112271093616293, + "grad_norm": 40.79402160644531, + "learning_rate": 3.7136337706788366e-06, + "loss": 7.3738, + "step": 656 + }, + { + "epoch": 0.11244223857607394, + "grad_norm": 49.726810455322266, + "learning_rate": 3.7193382772390187e-06, + "loss": 10.1751, + "step": 657 + }, + { + "epoch": 0.11261338353585487, + "grad_norm": 34.322078704833984, + "learning_rate": 3.725042783799201e-06, + "loss": 8.127, + "step": 658 + }, + { + "epoch": 0.1127845284956358, + "grad_norm": 31.094890594482422, + "learning_rate": 3.730747290359384e-06, + "loss": 7.2578, + "step": 659 + }, + { + "epoch": 0.11295567345541674, + "grad_norm": 17.61489486694336, + 
"learning_rate": 3.7364517969195667e-06, + "loss": 2.9985, + "step": 660 + }, + { + "epoch": 0.11312681841519767, + "grad_norm": 31.467206954956055, + "learning_rate": 3.7421563034797492e-06, + "loss": 3.7049, + "step": 661 + }, + { + "epoch": 0.1132979633749786, + "grad_norm": 24.94162368774414, + "learning_rate": 3.7478608100399318e-06, + "loss": 5.2144, + "step": 662 + }, + { + "epoch": 0.11346910833475954, + "grad_norm": 61.16570281982422, + "learning_rate": 3.7535653166001143e-06, + "loss": 10.6124, + "step": 663 + }, + { + "epoch": 0.11364025329454047, + "grad_norm": 30.18357276916504, + "learning_rate": 3.7592698231602964e-06, + "loss": 7.2241, + "step": 664 + }, + { + "epoch": 0.1138113982543214, + "grad_norm": 40.68777847290039, + "learning_rate": 3.764974329720479e-06, + "loss": 7.604, + "step": 665 + }, + { + "epoch": 0.11398254321410234, + "grad_norm": 24.30128288269043, + "learning_rate": 3.7706788362806623e-06, + "loss": 4.6842, + "step": 666 + }, + { + "epoch": 0.11415368817388329, + "grad_norm": 33.77325439453125, + "learning_rate": 3.7763833428408444e-06, + "loss": 7.3903, + "step": 667 + }, + { + "epoch": 0.11432483313366422, + "grad_norm": 30.10031509399414, + "learning_rate": 3.782087849401027e-06, + "loss": 7.0533, + "step": 668 + }, + { + "epoch": 0.11449597809344515, + "grad_norm": 34.8586540222168, + "learning_rate": 3.7877923559612095e-06, + "loss": 7.5631, + "step": 669 + }, + { + "epoch": 0.11466712305322609, + "grad_norm": 33.20988082885742, + "learning_rate": 3.7934968625213916e-06, + "loss": 3.7029, + "step": 670 + }, + { + "epoch": 0.11483826801300702, + "grad_norm": 31.075176239013672, + "learning_rate": 3.799201369081575e-06, + "loss": 8.2182, + "step": 671 + }, + { + "epoch": 0.11500941297278795, + "grad_norm": 30.962139129638672, + "learning_rate": 3.8049058756417575e-06, + "loss": 6.5288, + "step": 672 + }, + { + "epoch": 0.11518055793256889, + "grad_norm": 37.01807403564453, + "learning_rate": 3.8106103822019396e-06, + "loss": 7.8449, + "step": 673 + }, + { + "epoch": 0.11535170289234982, + "grad_norm": 35.002742767333984, + "learning_rate": 3.816314888762122e-06, + "loss": 6.4509, + "step": 674 + }, + { + "epoch": 0.11552284785213075, + "grad_norm": 51.06761169433594, + "learning_rate": 3.822019395322305e-06, + "loss": 10.6236, + "step": 675 + }, + { + "epoch": 0.11569399281191169, + "grad_norm": 37.48448181152344, + "learning_rate": 3.827723901882487e-06, + "loss": 6.7785, + "step": 676 + }, + { + "epoch": 0.11586513777169262, + "grad_norm": 35.638832092285156, + "learning_rate": 3.8334284084426706e-06, + "loss": 7.6172, + "step": 677 + }, + { + "epoch": 0.11603628273147355, + "grad_norm": 35.00564956665039, + "learning_rate": 3.839132915002852e-06, + "loss": 7.1866, + "step": 678 + }, + { + "epoch": 0.11620742769125449, + "grad_norm": 31.42662811279297, + "learning_rate": 3.844837421563035e-06, + "loss": 3.2075, + "step": 679 + }, + { + "epoch": 0.11637857265103543, + "grad_norm": 16.111412048339844, + "learning_rate": 3.850541928123217e-06, + "loss": 4.0132, + "step": 680 + }, + { + "epoch": 0.11654971761081637, + "grad_norm": 29.7305850982666, + "learning_rate": 3.8562464346834e-06, + "loss": 7.1253, + "step": 681 + }, + { + "epoch": 0.1167208625705973, + "grad_norm": 28.033987045288086, + "learning_rate": 3.861950941243582e-06, + "loss": 3.1947, + "step": 682 + }, + { + "epoch": 0.11689200753037823, + "grad_norm": 31.460405349731445, + "learning_rate": 3.867655447803766e-06, + "loss": 7.1427, + "step": 683 + }, + { + "epoch": 0.11706315249015917, 
+ "grad_norm": 269.6858825683594, + "learning_rate": 3.8733599543639474e-06, + "loss": 11.9947, + "step": 684 + }, + { + "epoch": 0.1172342974499401, + "grad_norm": 35.384727478027344, + "learning_rate": 3.87906446092413e-06, + "loss": 6.8334, + "step": 685 + }, + { + "epoch": 0.11740544240972103, + "grad_norm": 25.98334312438965, + "learning_rate": 3.8847689674843125e-06, + "loss": 7.2384, + "step": 686 + }, + { + "epoch": 0.11757658736950197, + "grad_norm": 33.84842300415039, + "learning_rate": 3.890473474044495e-06, + "loss": 5.9906, + "step": 687 + }, + { + "epoch": 0.1177477323292829, + "grad_norm": 41.04487609863281, + "learning_rate": 3.8961779806046776e-06, + "loss": 6.2759, + "step": 688 + }, + { + "epoch": 0.11791887728906383, + "grad_norm": 16.468915939331055, + "learning_rate": 3.901882487164861e-06, + "loss": 3.8545, + "step": 689 + }, + { + "epoch": 0.11809002224884477, + "grad_norm": 27.10782241821289, + "learning_rate": 3.907586993725043e-06, + "loss": 7.596, + "step": 690 + }, + { + "epoch": 0.1182611672086257, + "grad_norm": 25.684919357299805, + "learning_rate": 3.913291500285225e-06, + "loss": 4.2235, + "step": 691 + }, + { + "epoch": 0.11843231216840663, + "grad_norm": 27.2288761138916, + "learning_rate": 3.918996006845408e-06, + "loss": 6.9975, + "step": 692 + }, + { + "epoch": 0.11860345712818758, + "grad_norm": 33.26142120361328, + "learning_rate": 3.92470051340559e-06, + "loss": 6.5592, + "step": 693 + }, + { + "epoch": 0.11877460208796851, + "grad_norm": 38.89694595336914, + "learning_rate": 3.930405019965774e-06, + "loss": 7.2757, + "step": 694 + }, + { + "epoch": 0.11894574704774945, + "grad_norm": 27.99795150756836, + "learning_rate": 3.936109526525956e-06, + "loss": 6.7422, + "step": 695 + }, + { + "epoch": 0.11911689200753038, + "grad_norm": 24.109289169311523, + "learning_rate": 3.941814033086138e-06, + "loss": 4.9175, + "step": 696 + }, + { + "epoch": 0.11928803696731131, + "grad_norm": 15.462040901184082, + "learning_rate": 3.94751853964632e-06, + "loss": 2.7684, + "step": 697 + }, + { + "epoch": 0.11945918192709225, + "grad_norm": 39.17838668823242, + "learning_rate": 3.953223046206503e-06, + "loss": 7.1518, + "step": 698 + }, + { + "epoch": 0.11963032688687318, + "grad_norm": 30.83951759338379, + "learning_rate": 3.958927552766685e-06, + "loss": 3.8832, + "step": 699 + }, + { + "epoch": 0.11980147184665411, + "grad_norm": 26.964744567871094, + "learning_rate": 3.964632059326869e-06, + "loss": 7.224, + "step": 700 + }, + { + "epoch": 0.11997261680643505, + "grad_norm": 36.607975006103516, + "learning_rate": 3.970336565887051e-06, + "loss": 7.3389, + "step": 701 + }, + { + "epoch": 0.12014376176621598, + "grad_norm": 37.18532180786133, + "learning_rate": 3.976041072447234e-06, + "loss": 6.1083, + "step": 702 + }, + { + "epoch": 0.12031490672599691, + "grad_norm": 29.550649642944336, + "learning_rate": 3.9817455790074155e-06, + "loss": 5.1898, + "step": 703 + }, + { + "epoch": 0.12048605168577785, + "grad_norm": 24.146198272705078, + "learning_rate": 3.987450085567598e-06, + "loss": 4.6196, + "step": 704 + }, + { + "epoch": 0.1206571966455588, + "grad_norm": 25.126737594604492, + "learning_rate": 3.993154592127781e-06, + "loss": 2.7422, + "step": 705 + }, + { + "epoch": 0.12082834160533973, + "grad_norm": 18.79334259033203, + "learning_rate": 3.998859098687964e-06, + "loss": 2.6716, + "step": 706 + }, + { + "epoch": 0.12099948656512066, + "grad_norm": 33.249168395996094, + "learning_rate": 4.0045636052481465e-06, + "loss": 5.3053, + "step": 707 + }, + { + 
"epoch": 0.1211706315249016, + "grad_norm": 26.934682846069336, + "learning_rate": 4.010268111808329e-06, + "loss": 4.514, + "step": 708 + }, + { + "epoch": 0.12134177648468253, + "grad_norm": 44.88846206665039, + "learning_rate": 4.015972618368511e-06, + "loss": 6.4733, + "step": 709 + }, + { + "epoch": 0.12151292144446346, + "grad_norm": 41.93711471557617, + "learning_rate": 4.021677124928693e-06, + "loss": 7.4558, + "step": 710 + }, + { + "epoch": 0.1216840664042444, + "grad_norm": 41.59209060668945, + "learning_rate": 4.027381631488877e-06, + "loss": 7.0372, + "step": 711 + }, + { + "epoch": 0.12185521136402533, + "grad_norm": 41.47358703613281, + "learning_rate": 4.033086138049059e-06, + "loss": 7.2868, + "step": 712 + }, + { + "epoch": 0.12202635632380626, + "grad_norm": 41.380741119384766, + "learning_rate": 4.038790644609242e-06, + "loss": 7.6225, + "step": 713 + }, + { + "epoch": 0.1221975012835872, + "grad_norm": 40.343788146972656, + "learning_rate": 4.044495151169424e-06, + "loss": 7.2916, + "step": 714 + }, + { + "epoch": 0.12236864624336813, + "grad_norm": 30.69339370727539, + "learning_rate": 4.050199657729606e-06, + "loss": 5.8809, + "step": 715 + }, + { + "epoch": 0.12253979120314906, + "grad_norm": 25.84669303894043, + "learning_rate": 4.0559041642897885e-06, + "loss": 2.8596, + "step": 716 + }, + { + "epoch": 0.12271093616293, + "grad_norm": 37.5709114074707, + "learning_rate": 4.061608670849972e-06, + "loss": 6.5536, + "step": 717 + }, + { + "epoch": 0.12288208112271094, + "grad_norm": 44.87430191040039, + "learning_rate": 4.067313177410154e-06, + "loss": 7.8952, + "step": 718 + }, + { + "epoch": 0.12305322608249188, + "grad_norm": 29.630413055419922, + "learning_rate": 4.073017683970337e-06, + "loss": 7.2852, + "step": 719 + }, + { + "epoch": 0.12322437104227281, + "grad_norm": 38.17768096923828, + "learning_rate": 4.0787221905305194e-06, + "loss": 7.2025, + "step": 720 + }, + { + "epoch": 0.12339551600205374, + "grad_norm": 31.9378719329834, + "learning_rate": 4.084426697090702e-06, + "loss": 3.3231, + "step": 721 + }, + { + "epoch": 0.12356666096183468, + "grad_norm": 15.323390007019043, + "learning_rate": 4.090131203650884e-06, + "loss": 3.5441, + "step": 722 + }, + { + "epoch": 0.12373780592161561, + "grad_norm": 32.09744644165039, + "learning_rate": 4.095835710211067e-06, + "loss": 5.9737, + "step": 723 + }, + { + "epoch": 0.12390895088139654, + "grad_norm": 32.49777603149414, + "learning_rate": 4.1015402167712496e-06, + "loss": 5.6116, + "step": 724 + }, + { + "epoch": 0.12408009584117748, + "grad_norm": 32.568031311035156, + "learning_rate": 4.107244723331432e-06, + "loss": 6.8512, + "step": 725 + }, + { + "epoch": 0.12425124080095841, + "grad_norm": 27.68449592590332, + "learning_rate": 4.112949229891615e-06, + "loss": 3.8807, + "step": 726 + }, + { + "epoch": 0.12442238576073934, + "grad_norm": 28.595746994018555, + "learning_rate": 4.118653736451797e-06, + "loss": 2.7513, + "step": 727 + }, + { + "epoch": 0.12459353072052028, + "grad_norm": 40.44917678833008, + "learning_rate": 4.124358243011979e-06, + "loss": 6.9441, + "step": 728 + }, + { + "epoch": 0.12476467568030121, + "grad_norm": 34.75537872314453, + "learning_rate": 4.130062749572162e-06, + "loss": 6.2836, + "step": 729 + }, + { + "epoch": 0.12493582064008214, + "grad_norm": 32.49576950073242, + "learning_rate": 4.135767256132345e-06, + "loss": 3.6985, + "step": 730 + }, + { + "epoch": 0.1251069655998631, + "grad_norm": 33.09941482543945, + "learning_rate": 4.141471762692527e-06, + "loss": 6.414, + 
"step": 731 + }, + { + "epoch": 0.12527811055964402, + "grad_norm": 33.988101959228516, + "learning_rate": 4.14717626925271e-06, + "loss": 6.8846, + "step": 732 + }, + { + "epoch": 0.12544925551942496, + "grad_norm": 34.69337844848633, + "learning_rate": 4.152880775812892e-06, + "loss": 5.9908, + "step": 733 + }, + { + "epoch": 0.1256204004792059, + "grad_norm": 42.33815383911133, + "learning_rate": 4.158585282373075e-06, + "loss": 7.2186, + "step": 734 + }, + { + "epoch": 0.12579154543898682, + "grad_norm": 21.35869598388672, + "learning_rate": 4.164289788933257e-06, + "loss": 3.2239, + "step": 735 + }, + { + "epoch": 0.12596269039876776, + "grad_norm": 34.62517166137695, + "learning_rate": 4.16999429549344e-06, + "loss": 6.2926, + "step": 736 + }, + { + "epoch": 0.1261338353585487, + "grad_norm": 32.758544921875, + "learning_rate": 4.1756988020536225e-06, + "loss": 6.2203, + "step": 737 + }, + { + "epoch": 0.12630498031832962, + "grad_norm": 17.39285659790039, + "learning_rate": 4.181403308613805e-06, + "loss": 3.2563, + "step": 738 + }, + { + "epoch": 0.12647612527811056, + "grad_norm": 32.22175598144531, + "learning_rate": 4.1871078151739875e-06, + "loss": 5.3068, + "step": 739 + }, + { + "epoch": 0.1266472702378915, + "grad_norm": 38.13700485229492, + "learning_rate": 4.19281232173417e-06, + "loss": 7.6128, + "step": 740 + }, + { + "epoch": 0.12681841519767242, + "grad_norm": 35.74038314819336, + "learning_rate": 4.198516828294353e-06, + "loss": 6.6528, + "step": 741 + }, + { + "epoch": 0.12698956015745336, + "grad_norm": 12.027849197387695, + "learning_rate": 4.204221334854535e-06, + "loss": 2.255, + "step": 742 + }, + { + "epoch": 0.1271607051172343, + "grad_norm": 36.75061798095703, + "learning_rate": 4.209925841414718e-06, + "loss": 6.2444, + "step": 743 + }, + { + "epoch": 0.12733185007701522, + "grad_norm": 43.853187561035156, + "learning_rate": 4.2156303479749e-06, + "loss": 6.3509, + "step": 744 + }, + { + "epoch": 0.12750299503679616, + "grad_norm": 31.670143127441406, + "learning_rate": 4.221334854535083e-06, + "loss": 7.3242, + "step": 745 + }, + { + "epoch": 0.1276741399965771, + "grad_norm": 24.049455642700195, + "learning_rate": 4.227039361095265e-06, + "loss": 4.5557, + "step": 746 + }, + { + "epoch": 0.12784528495635802, + "grad_norm": 22.603431701660156, + "learning_rate": 4.232743867655448e-06, + "loss": 4.3686, + "step": 747 + }, + { + "epoch": 0.12801642991613896, + "grad_norm": 33.28196716308594, + "learning_rate": 4.23844837421563e-06, + "loss": 7.897, + "step": 748 + }, + { + "epoch": 0.1281875748759199, + "grad_norm": 14.154582023620605, + "learning_rate": 4.244152880775813e-06, + "loss": 2.5184, + "step": 749 + }, + { + "epoch": 0.12835871983570085, + "grad_norm": 34.31758117675781, + "learning_rate": 4.249857387335995e-06, + "loss": 6.4663, + "step": 750 + }, + { + "epoch": 0.12852986479548179, + "grad_norm": 29.4487361907959, + "learning_rate": 4.255561893896179e-06, + "loss": 6.4908, + "step": 751 + }, + { + "epoch": 0.12870100975526272, + "grad_norm": 26.144145965576172, + "learning_rate": 4.261266400456361e-06, + "loss": 2.7209, + "step": 752 + }, + { + "epoch": 0.12887215471504365, + "grad_norm": 32.20002746582031, + "learning_rate": 4.266970907016543e-06, + "loss": 5.3474, + "step": 753 + }, + { + "epoch": 0.12904329967482459, + "grad_norm": 22.889114379882812, + "learning_rate": 4.2726754135767255e-06, + "loss": 4.439, + "step": 754 + }, + { + "epoch": 0.12921444463460552, + "grad_norm": 29.033794403076172, + "learning_rate": 4.278379920136908e-06, + 
"loss": 3.1768, + "step": 755 + }, + { + "epoch": 0.12938558959438645, + "grad_norm": 36.977718353271484, + "learning_rate": 4.2840844266970906e-06, + "loss": 6.725, + "step": 756 + }, + { + "epoch": 0.12955673455416739, + "grad_norm": 24.76682472229004, + "learning_rate": 4.289788933257274e-06, + "loss": 2.3437, + "step": 757 + }, + { + "epoch": 0.12972787951394832, + "grad_norm": 16.016826629638672, + "learning_rate": 4.2954934398174565e-06, + "loss": 2.8201, + "step": 758 + }, + { + "epoch": 0.12989902447372925, + "grad_norm": 14.587915420532227, + "learning_rate": 4.301197946377638e-06, + "loss": 3.5146, + "step": 759 + }, + { + "epoch": 0.13007016943351019, + "grad_norm": 26.081321716308594, + "learning_rate": 4.306902452937821e-06, + "loss": 2.6582, + "step": 760 + }, + { + "epoch": 0.13024131439329112, + "grad_norm": 16.497404098510742, + "learning_rate": 4.312606959498003e-06, + "loss": 2.6039, + "step": 761 + }, + { + "epoch": 0.13041245935307205, + "grad_norm": 30.642013549804688, + "learning_rate": 4.318311466058186e-06, + "loss": 5.6791, + "step": 762 + }, + { + "epoch": 0.13058360431285299, + "grad_norm": 78.80982971191406, + "learning_rate": 4.324015972618369e-06, + "loss": 7.0474, + "step": 763 + }, + { + "epoch": 0.13075474927263392, + "grad_norm": 31.678878784179688, + "learning_rate": 4.329720479178552e-06, + "loss": 7.2185, + "step": 764 + }, + { + "epoch": 0.13092589423241485, + "grad_norm": 67.14193725585938, + "learning_rate": 4.335424985738733e-06, + "loss": 11.1752, + "step": 765 + }, + { + "epoch": 0.13109703919219579, + "grad_norm": 30.7507381439209, + "learning_rate": 4.341129492298916e-06, + "loss": 5.3445, + "step": 766 + }, + { + "epoch": 0.13126818415197672, + "grad_norm": 295.94195556640625, + "learning_rate": 4.346833998859098e-06, + "loss": 14.8463, + "step": 767 + }, + { + "epoch": 0.13143932911175765, + "grad_norm": 31.96709442138672, + "learning_rate": 4.352538505419281e-06, + "loss": 5.9573, + "step": 768 + }, + { + "epoch": 0.13161047407153859, + "grad_norm": 21.086137771606445, + "learning_rate": 4.358243011979464e-06, + "loss": 2.51, + "step": 769 + }, + { + "epoch": 0.13178161903131952, + "grad_norm": 23.69211196899414, + "learning_rate": 4.363947518539647e-06, + "loss": 3.9384, + "step": 770 + }, + { + "epoch": 0.13195276399110045, + "grad_norm": 29.09503173828125, + "learning_rate": 4.369652025099829e-06, + "loss": 2.6477, + "step": 771 + }, + { + "epoch": 0.13212390895088139, + "grad_norm": 34.086483001708984, + "learning_rate": 4.375356531660011e-06, + "loss": 6.8362, + "step": 772 + }, + { + "epoch": 0.13229505391066232, + "grad_norm": 22.358131408691406, + "learning_rate": 4.381061038220194e-06, + "loss": 2.5553, + "step": 773 + }, + { + "epoch": 0.13246619887044325, + "grad_norm": 32.83020782470703, + "learning_rate": 4.386765544780377e-06, + "loss": 5.6526, + "step": 774 + }, + { + "epoch": 0.1326373438302242, + "grad_norm": 32.111629486083984, + "learning_rate": 4.3924700513405595e-06, + "loss": 7.0077, + "step": 775 + }, + { + "epoch": 0.13280848879000515, + "grad_norm": 28.587032318115234, + "learning_rate": 4.398174557900742e-06, + "loss": 3.9449, + "step": 776 + }, + { + "epoch": 0.13297963374978608, + "grad_norm": 28.547178268432617, + "learning_rate": 4.403879064460925e-06, + "loss": 5.1286, + "step": 777 + }, + { + "epoch": 0.133150778709567, + "grad_norm": 31.409543991088867, + "learning_rate": 4.409583571021106e-06, + "loss": 5.3343, + "step": 778 + }, + { + "epoch": 0.13332192366934795, + "grad_norm": 33.33236312866211, + 
"learning_rate": 4.415288077581289e-06, + "loss": 6.5061, + "step": 779 + }, + { + "epoch": 0.13349306862912888, + "grad_norm": 226.51580810546875, + "learning_rate": 4.420992584141472e-06, + "loss": 13.5725, + "step": 780 + }, + { + "epoch": 0.1336642135889098, + "grad_norm": 29.707599639892578, + "learning_rate": 4.426697090701655e-06, + "loss": 3.5054, + "step": 781 + }, + { + "epoch": 0.13383535854869075, + "grad_norm": 29.84592628479004, + "learning_rate": 4.432401597261837e-06, + "loss": 5.5461, + "step": 782 + }, + { + "epoch": 0.13400650350847168, + "grad_norm": 23.87710189819336, + "learning_rate": 4.43810610382202e-06, + "loss": 2.7581, + "step": 783 + }, + { + "epoch": 0.1341776484682526, + "grad_norm": 27.90047264099121, + "learning_rate": 4.4438106103822015e-06, + "loss": 5.0602, + "step": 784 + }, + { + "epoch": 0.13434879342803355, + "grad_norm": 229.59202575683594, + "learning_rate": 4.449515116942384e-06, + "loss": 8.4299, + "step": 785 + }, + { + "epoch": 0.13451993838781448, + "grad_norm": 35.904483795166016, + "learning_rate": 4.455219623502567e-06, + "loss": 6.6261, + "step": 786 + }, + { + "epoch": 0.13469108334759541, + "grad_norm": 13.451172828674316, + "learning_rate": 4.46092413006275e-06, + "loss": 3.6844, + "step": 787 + }, + { + "epoch": 0.13486222830737635, + "grad_norm": 220.5408172607422, + "learning_rate": 4.4666286366229324e-06, + "loss": 12.0786, + "step": 788 + }, + { + "epoch": 0.13503337326715728, + "grad_norm": 30.378768920898438, + "learning_rate": 4.472333143183115e-06, + "loss": 6.4301, + "step": 789 + }, + { + "epoch": 0.13520451822693821, + "grad_norm": 24.894784927368164, + "learning_rate": 4.478037649743297e-06, + "loss": 4.0456, + "step": 790 + }, + { + "epoch": 0.13537566318671915, + "grad_norm": 63.11225509643555, + "learning_rate": 4.48374215630348e-06, + "loss": 10.8823, + "step": 791 + }, + { + "epoch": 0.13554680814650008, + "grad_norm": 30.484046936035156, + "learning_rate": 4.4894466628636626e-06, + "loss": 4.5215, + "step": 792 + }, + { + "epoch": 0.13571795310628101, + "grad_norm": 33.15967559814453, + "learning_rate": 4.495151169423845e-06, + "loss": 6.1466, + "step": 793 + }, + { + "epoch": 0.13588909806606195, + "grad_norm": 31.415679931640625, + "learning_rate": 4.500855675984028e-06, + "loss": 5.0997, + "step": 794 + }, + { + "epoch": 0.13606024302584288, + "grad_norm": 29.878276824951172, + "learning_rate": 4.50656018254421e-06, + "loss": 6.9375, + "step": 795 + }, + { + "epoch": 0.13623138798562381, + "grad_norm": 33.10092544555664, + "learning_rate": 4.512264689104393e-06, + "loss": 6.2231, + "step": 796 + }, + { + "epoch": 0.13640253294540475, + "grad_norm": 21.412826538085938, + "learning_rate": 4.517969195664575e-06, + "loss": 1.8474, + "step": 797 + }, + { + "epoch": 0.13657367790518568, + "grad_norm": 31.297100067138672, + "learning_rate": 4.523673702224758e-06, + "loss": 5.4762, + "step": 798 + }, + { + "epoch": 0.13674482286496661, + "grad_norm": 234.58111572265625, + "learning_rate": 4.52937820878494e-06, + "loss": 11.739, + "step": 799 + }, + { + "epoch": 0.13691596782474755, + "grad_norm": 204.88748168945312, + "learning_rate": 4.535082715345123e-06, + "loss": 13.5482, + "step": 800 + }, + { + "epoch": 0.1370871127845285, + "grad_norm": 33.66855239868164, + "learning_rate": 4.540787221905305e-06, + "loss": 5.9551, + "step": 801 + }, + { + "epoch": 0.13725825774430944, + "grad_norm": 30.423555374145508, + "learning_rate": 4.546491728465488e-06, + "loss": 6.3931, + "step": 802 + }, + { + "epoch": 0.13742940270409038, 
+ "grad_norm": 30.737445831298828, + "learning_rate": 4.55219623502567e-06, + "loss": 4.7871, + "step": 803 + }, + { + "epoch": 0.1376005476638713, + "grad_norm": 95.86985778808594, + "learning_rate": 4.557900741585853e-06, + "loss": 6.8129, + "step": 804 + }, + { + "epoch": 0.13777169262365224, + "grad_norm": 36.5138053894043, + "learning_rate": 4.5636052481460355e-06, + "loss": 6.4333, + "step": 805 + }, + { + "epoch": 0.13794283758343318, + "grad_norm": 31.310596466064453, + "learning_rate": 4.569309754706218e-06, + "loss": 6.1982, + "step": 806 + }, + { + "epoch": 0.1381139825432141, + "grad_norm": 32.40011978149414, + "learning_rate": 4.5750142612664005e-06, + "loss": 6.5281, + "step": 807 + }, + { + "epoch": 0.13828512750299504, + "grad_norm": 33.58089828491211, + "learning_rate": 4.580718767826583e-06, + "loss": 5.0059, + "step": 808 + }, + { + "epoch": 0.13845627246277598, + "grad_norm": 46.53955841064453, + "learning_rate": 4.586423274386766e-06, + "loss": 10.3345, + "step": 809 + }, + { + "epoch": 0.1386274174225569, + "grad_norm": 23.006080627441406, + "learning_rate": 4.592127780946948e-06, + "loss": 2.1468, + "step": 810 + }, + { + "epoch": 0.13879856238233784, + "grad_norm": 21.113685607910156, + "learning_rate": 4.597832287507131e-06, + "loss": 2.0972, + "step": 811 + }, + { + "epoch": 0.13896970734211878, + "grad_norm": 29.228193283081055, + "learning_rate": 4.603536794067313e-06, + "loss": 2.9408, + "step": 812 + }, + { + "epoch": 0.1391408523018997, + "grad_norm": 39.542686462402344, + "learning_rate": 4.609241300627496e-06, + "loss": 6.4624, + "step": 813 + }, + { + "epoch": 0.13931199726168064, + "grad_norm": 42.17389678955078, + "learning_rate": 4.614945807187679e-06, + "loss": 7.4244, + "step": 814 + }, + { + "epoch": 0.13948314222146158, + "grad_norm": 31.26105308532715, + "learning_rate": 4.620650313747861e-06, + "loss": 6.5606, + "step": 815 + }, + { + "epoch": 0.1396542871812425, + "grad_norm": 40.22693634033203, + "learning_rate": 4.626354820308043e-06, + "loss": 6.2725, + "step": 816 + }, + { + "epoch": 0.13982543214102344, + "grad_norm": 25.14350700378418, + "learning_rate": 4.632059326868226e-06, + "loss": 4.0754, + "step": 817 + }, + { + "epoch": 0.13999657710080438, + "grad_norm": 23.578937530517578, + "learning_rate": 4.637763833428408e-06, + "loss": 4.1309, + "step": 818 + }, + { + "epoch": 0.1401677220605853, + "grad_norm": 37.57481002807617, + "learning_rate": 4.643468339988591e-06, + "loss": 5.8135, + "step": 819 + }, + { + "epoch": 0.14033886702036624, + "grad_norm": 35.21710205078125, + "learning_rate": 4.649172846548774e-06, + "loss": 6.6982, + "step": 820 + }, + { + "epoch": 0.14051001198014718, + "grad_norm": 14.915112495422363, + "learning_rate": 4.654877353108957e-06, + "loss": 2.1068, + "step": 821 + }, + { + "epoch": 0.1406811569399281, + "grad_norm": 27.366252899169922, + "learning_rate": 4.6605818596691385e-06, + "loss": 3.2475, + "step": 822 + }, + { + "epoch": 0.14085230189970904, + "grad_norm": 36.40489196777344, + "learning_rate": 4.666286366229321e-06, + "loss": 6.7448, + "step": 823 + }, + { + "epoch": 0.14102344685948998, + "grad_norm": 37.40996551513672, + "learning_rate": 4.6719908727895036e-06, + "loss": 6.8328, + "step": 824 + }, + { + "epoch": 0.1411945918192709, + "grad_norm": 255.09320068359375, + "learning_rate": 4.677695379349686e-06, + "loss": 12.0992, + "step": 825 + }, + { + "epoch": 0.14136573677905187, + "grad_norm": 41.39365768432617, + "learning_rate": 4.6833998859098695e-06, + "loss": 6.1908, + "step": 826 + }, + { + 
"epoch": 0.1415368817388328, + "grad_norm": 14.086997032165527, + "learning_rate": 4.689104392470052e-06, + "loss": 3.3856, + "step": 827 + }, + { + "epoch": 0.14170802669861374, + "grad_norm": 33.170352935791016, + "learning_rate": 4.694808899030234e-06, + "loss": 6.9479, + "step": 828 + }, + { + "epoch": 0.14187917165839467, + "grad_norm": 37.625064849853516, + "learning_rate": 4.700513405590416e-06, + "loss": 7.6713, + "step": 829 + }, + { + "epoch": 0.1420503166181756, + "grad_norm": 25.476303100585938, + "learning_rate": 4.706217912150599e-06, + "loss": 4.2481, + "step": 830 + }, + { + "epoch": 0.14222146157795654, + "grad_norm": 27.399072647094727, + "learning_rate": 4.711922418710781e-06, + "loss": 5.508, + "step": 831 + }, + { + "epoch": 0.14239260653773747, + "grad_norm": 31.020893096923828, + "learning_rate": 4.717626925270965e-06, + "loss": 5.8831, + "step": 832 + }, + { + "epoch": 0.1425637514975184, + "grad_norm": 26.108135223388672, + "learning_rate": 4.723331431831147e-06, + "loss": 3.5932, + "step": 833 + }, + { + "epoch": 0.14273489645729934, + "grad_norm": 35.8662109375, + "learning_rate": 4.729035938391329e-06, + "loss": 5.1499, + "step": 834 + }, + { + "epoch": 0.14290604141708027, + "grad_norm": 34.714324951171875, + "learning_rate": 4.734740444951511e-06, + "loss": 5.9969, + "step": 835 + }, + { + "epoch": 0.1430771863768612, + "grad_norm": 34.023067474365234, + "learning_rate": 4.740444951511694e-06, + "loss": 6.4575, + "step": 836 + }, + { + "epoch": 0.14324833133664214, + "grad_norm": 17.601118087768555, + "learning_rate": 4.746149458071877e-06, + "loss": 2.5208, + "step": 837 + }, + { + "epoch": 0.14341947629642307, + "grad_norm": 19.672815322875977, + "learning_rate": 4.75185396463206e-06, + "loss": 2.1216, + "step": 838 + }, + { + "epoch": 0.143590621256204, + "grad_norm": 25.771137237548828, + "learning_rate": 4.757558471192242e-06, + "loss": 2.6146, + "step": 839 + }, + { + "epoch": 0.14376176621598494, + "grad_norm": 32.17550277709961, + "learning_rate": 4.763262977752424e-06, + "loss": 5.8516, + "step": 840 + }, + { + "epoch": 0.14393291117576587, + "grad_norm": 72.34523010253906, + "learning_rate": 4.768967484312607e-06, + "loss": 11.0212, + "step": 841 + }, + { + "epoch": 0.1441040561355468, + "grad_norm": 22.756717681884766, + "learning_rate": 4.774671990872789e-06, + "loss": 2.3166, + "step": 842 + }, + { + "epoch": 0.14427520109532774, + "grad_norm": 22.13291358947754, + "learning_rate": 4.7803764974329725e-06, + "loss": 2.2079, + "step": 843 + }, + { + "epoch": 0.14444634605510867, + "grad_norm": 65.32748413085938, + "learning_rate": 4.786081003993155e-06, + "loss": 6.3309, + "step": 844 + }, + { + "epoch": 0.1446174910148896, + "grad_norm": 242.9714813232422, + "learning_rate": 4.791785510553338e-06, + "loss": 11.9379, + "step": 845 + }, + { + "epoch": 0.14478863597467054, + "grad_norm": 21.737802505493164, + "learning_rate": 4.79749001711352e-06, + "loss": 3.806, + "step": 846 + }, + { + "epoch": 0.14495978093445147, + "grad_norm": 29.438758850097656, + "learning_rate": 4.803194523673702e-06, + "loss": 5.7729, + "step": 847 + }, + { + "epoch": 0.1451309258942324, + "grad_norm": 25.701087951660156, + "learning_rate": 4.808899030233884e-06, + "loss": 2.8536, + "step": 848 + }, + { + "epoch": 0.14530207085401334, + "grad_norm": 130.01524353027344, + "learning_rate": 4.814603536794068e-06, + "loss": 7.5391, + "step": 849 + }, + { + "epoch": 0.14547321581379427, + "grad_norm": 30.284828186035156, + "learning_rate": 4.82030804335425e-06, + "loss": 3.806, + 
"step": 850 + }, + { + "epoch": 0.1456443607735752, + "grad_norm": 23.351642608642578, + "learning_rate": 4.826012549914433e-06, + "loss": 4.2263, + "step": 851 + }, + { + "epoch": 0.14581550573335617, + "grad_norm": 216.2431182861328, + "learning_rate": 4.831717056474615e-06, + "loss": 9.159, + "step": 852 + }, + { + "epoch": 0.1459866506931371, + "grad_norm": 35.071754455566406, + "learning_rate": 4.837421563034797e-06, + "loss": 6.503, + "step": 853 + }, + { + "epoch": 0.14615779565291803, + "grad_norm": 34.0211296081543, + "learning_rate": 4.84312606959498e-06, + "loss": 6.4636, + "step": 854 + }, + { + "epoch": 0.14632894061269897, + "grad_norm": 17.20896339416504, + "learning_rate": 4.848830576155163e-06, + "loss": 2.7218, + "step": 855 + }, + { + "epoch": 0.1465000855724799, + "grad_norm": 136.72647094726562, + "learning_rate": 4.8545350827153454e-06, + "loss": 7.7082, + "step": 856 + }, + { + "epoch": 0.14667123053226083, + "grad_norm": 53.50956344604492, + "learning_rate": 4.860239589275528e-06, + "loss": 10.0171, + "step": 857 + }, + { + "epoch": 0.14684237549204177, + "grad_norm": 21.030473709106445, + "learning_rate": 4.8659440958357105e-06, + "loss": 4.1916, + "step": 858 + }, + { + "epoch": 0.1470135204518227, + "grad_norm": 34.38727569580078, + "learning_rate": 4.871648602395892e-06, + "loss": 5.969, + "step": 859 + }, + { + "epoch": 0.14718466541160363, + "grad_norm": 22.703882217407227, + "learning_rate": 4.8773531089560756e-06, + "loss": 2.4073, + "step": 860 + }, + { + "epoch": 0.14735581037138457, + "grad_norm": 33.388858795166016, + "learning_rate": 4.883057615516258e-06, + "loss": 5.7571, + "step": 861 + }, + { + "epoch": 0.1475269553311655, + "grad_norm": 35.79853820800781, + "learning_rate": 4.888762122076441e-06, + "loss": 5.9363, + "step": 862 + }, + { + "epoch": 0.14769810029094643, + "grad_norm": 20.656721115112305, + "learning_rate": 4.894466628636623e-06, + "loss": 2.0406, + "step": 863 + }, + { + "epoch": 0.14786924525072737, + "grad_norm": 35.20976638793945, + "learning_rate": 4.900171135196806e-06, + "loss": 5.8613, + "step": 864 + }, + { + "epoch": 0.1480403902105083, + "grad_norm": 22.342880249023438, + "learning_rate": 4.905875641756987e-06, + "loss": 4.0119, + "step": 865 + }, + { + "epoch": 0.14821153517028923, + "grad_norm": 33.253292083740234, + "learning_rate": 4.911580148317171e-06, + "loss": 4.62, + "step": 866 + }, + { + "epoch": 0.14838268013007017, + "grad_norm": 186.65093994140625, + "learning_rate": 4.917284654877353e-06, + "loss": 11.2662, + "step": 867 + }, + { + "epoch": 0.1485538250898511, + "grad_norm": 15.842426300048828, + "learning_rate": 4.922989161437536e-06, + "loss": 2.0607, + "step": 868 + }, + { + "epoch": 0.14872497004963203, + "grad_norm": 26.70699119567871, + "learning_rate": 4.928693667997718e-06, + "loss": 3.1737, + "step": 869 + }, + { + "epoch": 0.14889611500941297, + "grad_norm": 33.37158966064453, + "learning_rate": 4.934398174557901e-06, + "loss": 4.7352, + "step": 870 + }, + { + "epoch": 0.1490672599691939, + "grad_norm": 26.4490966796875, + "learning_rate": 4.940102681118083e-06, + "loss": 4.2178, + "step": 871 + }, + { + "epoch": 0.14923840492897483, + "grad_norm": 33.25678634643555, + "learning_rate": 4.945807187678266e-06, + "loss": 5.0764, + "step": 872 + }, + { + "epoch": 0.14940954988875577, + "grad_norm": 38.204769134521484, + "learning_rate": 4.9515116942384485e-06, + "loss": 5.8078, + "step": 873 + }, + { + "epoch": 0.1495806948485367, + "grad_norm": 27.79875946044922, + "learning_rate": 
4.957216200798631e-06, + "loss": 5.6432, + "step": 874 + }, + { + "epoch": 0.14975183980831763, + "grad_norm": 32.442115783691406, + "learning_rate": 4.9629207073588135e-06, + "loss": 5.7378, + "step": 875 + }, + { + "epoch": 0.14992298476809857, + "grad_norm": 57.06877517700195, + "learning_rate": 4.968625213918996e-06, + "loss": 10.3136, + "step": 876 + }, + { + "epoch": 0.15009412972787953, + "grad_norm": 32.131187438964844, + "learning_rate": 4.9743297204791794e-06, + "loss": 4.6921, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_nli-pairs_loss": 5.535374164581299, + "eval_nli-pairs_runtime": 4.3709, + "eval_nli-pairs_samples_per_second": 45.757, + "eval_nli-pairs_steps_per_second": 1.601, + "eval_sts-test_pearson_cosine": 0.6147169012893178, + "eval_sts-test_pearson_dot": 0.4334302941897573, + "eval_sts-test_pearson_euclidean": 0.6082490673246602, + "eval_sts-test_pearson_manhattan": 0.616700428941834, + "eval_sts-test_pearson_max": 0.616700428941834, + "eval_sts-test_spearman_cosine": 0.5972327557562241, + "eval_sts-test_spearman_dot": 0.41946207508864325, + "eval_sts-test_spearman_euclidean": 0.5959187544369754, + "eval_sts-test_spearman_manhattan": 0.6029031731511296, + "eval_sts-test_spearman_max": 0.6029031731511296, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_vitaminc-pairs_loss": 3.619838237762451, + "eval_vitaminc-pairs_runtime": 2.7372, + "eval_vitaminc-pairs_samples_per_second": 73.068, + "eval_vitaminc-pairs_steps_per_second": 2.557, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_qnli-contrastive_loss": 12.3779878616333, + "eval_qnli-contrastive_runtime": 0.6382, + "eval_qnli-contrastive_samples_per_second": 313.373, + "eval_qnli-contrastive_steps_per_second": 10.968, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_scitail-pairs-qa_loss": 1.6706750392913818, + "eval_scitail-pairs-qa_runtime": 1.6279, + "eval_scitail-pairs-qa_samples_per_second": 122.855, + "eval_scitail-pairs-qa_steps_per_second": 4.3, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_scitail-pairs-pos_loss": 3.0242857933044434, + "eval_scitail-pairs-pos_runtime": 2.6188, + "eval_scitail-pairs-pos_samples_per_second": 76.369, + "eval_scitail-pairs-pos_steps_per_second": 2.673, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_xsum-pairs_loss": 3.0581634044647217, + "eval_xsum-pairs_runtime": 2.6458, + "eval_xsum-pairs_samples_per_second": 66.142, + "eval_xsum-pairs_steps_per_second": 2.268, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_compression-pairs_loss": 1.9685934782028198, + "eval_compression-pairs_runtime": 0.5084, + "eval_compression-pairs_samples_per_second": 393.398, + "eval_compression-pairs_steps_per_second": 13.769, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_sciq_pairs_loss": 6.824851989746094, + "eval_sciq_pairs_runtime": 9.1685, + "eval_sciq_pairs_samples_per_second": 21.814, + "eval_sciq_pairs_steps_per_second": 0.763, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_qasc_pairs_loss": 10.253314018249512, + "eval_qasc_pairs_runtime": 2.6538, + "eval_qasc_pairs_samples_per_second": 75.363, + "eval_qasc_pairs_steps_per_second": 2.638, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_openbookqa_pairs_loss": 5.933743953704834, + "eval_openbookqa_pairs_runtime": 0.6418, + "eval_openbookqa_pairs_samples_per_second": 107.513, + "eval_openbookqa_pairs_steps_per_second": 4.674, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + 
"eval_msmarco_pairs_loss": 5.185385704040527, + "eval_msmarco_pairs_runtime": 3.9947, + "eval_msmarco_pairs_samples_per_second": 50.067, + "eval_msmarco_pairs_steps_per_second": 1.752, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_nq_pairs_loss": 6.44993782043457, + "eval_nq_pairs_runtime": 8.638, + "eval_nq_pairs_samples_per_second": 23.153, + "eval_nq_pairs_steps_per_second": 0.81, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_trivia_pairs_loss": 6.129721641540527, + "eval_trivia_pairs_runtime": 12.8296, + "eval_trivia_pairs_samples_per_second": 15.589, + "eval_trivia_pairs_steps_per_second": 0.546, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_quora_pairs_loss": 1.7218067646026611, + "eval_quora_pairs_runtime": 1.5931, + "eval_quora_pairs_samples_per_second": 125.544, + "eval_quora_pairs_steps_per_second": 4.394, + "step": 877 + }, + { + "epoch": 0.15009412972787953, + "eval_gooaq_pairs_loss": 4.168159008026123, + "eval_gooaq_pairs_runtime": 2.6679, + "eval_gooaq_pairs_samples_per_second": 74.966, + "eval_gooaq_pairs_steps_per_second": 2.624, + "step": 877 + }, + { + "epoch": 0.15026527468766046, + "grad_norm": 29.085119247436523, + "learning_rate": 4.980034227039361e-06, + "loss": 5.8249, + "step": 878 + }, + { + "epoch": 0.1504364196474414, + "grad_norm": 35.45232009887695, + "learning_rate": 4.985738733599544e-06, + "loss": 6.378, + "step": 879 + }, + { + "epoch": 0.15060756460722233, + "grad_norm": 34.018470764160156, + "learning_rate": 4.991443240159726e-06, + "loss": 5.326, + "step": 880 + }, + { + "epoch": 0.15077870956700326, + "grad_norm": 22.30814552307129, + "learning_rate": 4.997147746719909e-06, + "loss": 2.6674, + "step": 881 + }, + { + "epoch": 0.1509498545267842, + "grad_norm": 36.679046630859375, + "learning_rate": 5.002852253280091e-06, + "loss": 6.6655, + "step": 882 + }, + { + "epoch": 0.15112099948656513, + "grad_norm": 36.78900146484375, + "learning_rate": 5.008556759840275e-06, + "loss": 4.5851, + "step": 883 + }, + { + "epoch": 0.15129214444634606, + "grad_norm": 46.770057678222656, + "learning_rate": 5.014261266400456e-06, + "loss": 9.9308, + "step": 884 + }, + { + "epoch": 0.151463289406127, + "grad_norm": 27.262338638305664, + "learning_rate": 5.019965772960639e-06, + "loss": 2.2515, + "step": 885 + }, + { + "epoch": 0.15163443436590793, + "grad_norm": 193.24122619628906, + "learning_rate": 5.025670279520821e-06, + "loss": 10.7631, + "step": 886 + }, + { + "epoch": 0.15180557932568886, + "grad_norm": 30.53336524963379, + "learning_rate": 5.031374786081004e-06, + "loss": 3.9297, + "step": 887 + }, + { + "epoch": 0.1519767242854698, + "grad_norm": 13.035544395446777, + "learning_rate": 5.0370792926411864e-06, + "loss": 3.16, + "step": 888 + }, + { + "epoch": 0.15214786924525073, + "grad_norm": 27.65202522277832, + "learning_rate": 5.04278379920137e-06, + "loss": 3.1012, + "step": 889 + }, + { + "epoch": 0.15231901420503166, + "grad_norm": 28.412954330444336, + "learning_rate": 5.0484883057615515e-06, + "loss": 2.4251, + "step": 890 + }, + { + "epoch": 0.1524901591648126, + "grad_norm": 35.567386627197266, + "learning_rate": 5.054192812321734e-06, + "loss": 5.1793, + "step": 891 + }, + { + "epoch": 0.15266130412459353, + "grad_norm": 31.945302963256836, + "learning_rate": 5.0598973188819166e-06, + "loss": 4.9138, + "step": 892 + }, + { + "epoch": 0.15283244908437446, + "grad_norm": 30.31682014465332, + "learning_rate": 5.065601825442099e-06, + "loss": 4.8582, + "step": 893 + }, + { + "epoch": 0.1530035940441554, 
+ "grad_norm": 22.3225040435791, + "learning_rate": 5.0713063320022825e-06, + "loss": 2.003, + "step": 894 + }, + { + "epoch": 0.15317473900393633, + "grad_norm": 23.375139236450195, + "learning_rate": 5.077010838562465e-06, + "loss": 2.3547, + "step": 895 + }, + { + "epoch": 0.15334588396371726, + "grad_norm": 32.41263198852539, + "learning_rate": 5.0827153451226475e-06, + "loss": 6.2287, + "step": 896 + }, + { + "epoch": 0.1535170289234982, + "grad_norm": 20.43022346496582, + "learning_rate": 5.088419851682829e-06, + "loss": 2.1189, + "step": 897 + }, + { + "epoch": 0.15368817388327913, + "grad_norm": 37.203250885009766, + "learning_rate": 5.094124358243012e-06, + "loss": 6.3629, + "step": 898 + }, + { + "epoch": 0.15385931884306006, + "grad_norm": 19.725624084472656, + "learning_rate": 5.099828864803194e-06, + "loss": 2.2277, + "step": 899 + }, + { + "epoch": 0.154030463802841, + "grad_norm": 27.29782485961914, + "learning_rate": 5.105533371363378e-06, + "loss": 2.8851, + "step": 900 + }, + { + "epoch": 0.15420160876262193, + "grad_norm": 172.8111572265625, + "learning_rate": 5.11123787792356e-06, + "loss": 9.9783, + "step": 901 + }, + { + "epoch": 0.1543727537224029, + "grad_norm": 56.5546875, + "learning_rate": 5.116942384483743e-06, + "loss": 10.3301, + "step": 902 + }, + { + "epoch": 0.15454389868218382, + "grad_norm": 32.12007522583008, + "learning_rate": 5.122646891043924e-06, + "loss": 3.3146, + "step": 903 + }, + { + "epoch": 0.15471504364196476, + "grad_norm": 197.39170837402344, + "learning_rate": 5.128351397604107e-06, + "loss": 11.016, + "step": 904 + }, + { + "epoch": 0.1548861886017457, + "grad_norm": 36.48847579956055, + "learning_rate": 5.1340559041642895e-06, + "loss": 4.8215, + "step": 905 + }, + { + "epoch": 0.15505733356152662, + "grad_norm": 31.014644622802734, + "learning_rate": 5.139760410724473e-06, + "loss": 4.7237, + "step": 906 + }, + { + "epoch": 0.15522847852130756, + "grad_norm": 31.436952590942383, + "learning_rate": 5.145464917284655e-06, + "loss": 4.6175, + "step": 907 + }, + { + "epoch": 0.1553996234810885, + "grad_norm": 27.38591194152832, + "learning_rate": 5.151169423844838e-06, + "loss": 4.0958, + "step": 908 + }, + { + "epoch": 0.15557076844086942, + "grad_norm": 31.732324600219727, + "learning_rate": 5.15687393040502e-06, + "loss": 4.4682, + "step": 909 + }, + { + "epoch": 0.15574191340065036, + "grad_norm": 15.360635757446289, + "learning_rate": 5.162578436965202e-06, + "loss": 2.4148, + "step": 910 + }, + { + "epoch": 0.1559130583604313, + "grad_norm": 172.3378448486328, + "learning_rate": 5.168282943525385e-06, + "loss": 9.8466, + "step": 911 + }, + { + "epoch": 0.15608420332021222, + "grad_norm": 31.59737777709961, + "learning_rate": 5.173987450085568e-06, + "loss": 6.1221, + "step": 912 + }, + { + "epoch": 0.15625534827999316, + "grad_norm": 20.06523323059082, + "learning_rate": 5.179691956645751e-06, + "loss": 2.0035, + "step": 913 + }, + { + "epoch": 0.1564264932397741, + "grad_norm": 25.82581329345703, + "learning_rate": 5.185396463205933e-06, + "loss": 4.7388, + "step": 914 + }, + { + "epoch": 0.15659763819955502, + "grad_norm": 13.644715309143066, + "learning_rate": 5.191100969766115e-06, + "loss": 2.1442, + "step": 915 + }, + { + "epoch": 0.15676878315933596, + "grad_norm": 36.4990119934082, + "learning_rate": 5.196805476326297e-06, + "loss": 6.2552, + "step": 916 + }, + { + "epoch": 0.1569399281191169, + "grad_norm": 35.6190185546875, + "learning_rate": 5.202509982886481e-06, + "loss": 6.3529, + "step": 917 + }, + { + "epoch": 
0.15711107307889782, + "grad_norm": 13.495047569274902, + "learning_rate": 5.208214489446663e-06, + "loss": 3.5731, + "step": 918 + }, + { + "epoch": 0.15728221803867876, + "grad_norm": 236.7681121826172, + "learning_rate": 5.213918996006846e-06, + "loss": 10.5726, + "step": 919 + }, + { + "epoch": 0.1574533629984597, + "grad_norm": 34.39946746826172, + "learning_rate": 5.219623502567028e-06, + "loss": 6.0673, + "step": 920 + }, + { + "epoch": 0.15762450795824062, + "grad_norm": 12.590995788574219, + "learning_rate": 5.225328009127211e-06, + "loss": 2.77, + "step": 921 + }, + { + "epoch": 0.15779565291802156, + "grad_norm": 31.968891143798828, + "learning_rate": 5.2310325156873925e-06, + "loss": 4.1677, + "step": 922 + }, + { + "epoch": 0.1579667978778025, + "grad_norm": 31.067489624023438, + "learning_rate": 5.236737022247576e-06, + "loss": 4.716, + "step": 923 + }, + { + "epoch": 0.15813794283758342, + "grad_norm": 36.08390808105469, + "learning_rate": 5.2424415288077584e-06, + "loss": 6.528, + "step": 924 + }, + { + "epoch": 0.15830908779736436, + "grad_norm": 34.2723274230957, + "learning_rate": 5.248146035367941e-06, + "loss": 6.4655, + "step": 925 + }, + { + "epoch": 0.1584802327571453, + "grad_norm": 43.43145751953125, + "learning_rate": 5.2538505419281235e-06, + "loss": 5.6795, + "step": 926 + }, + { + "epoch": 0.15865137771692622, + "grad_norm": 32.78499221801758, + "learning_rate": 5.259555048488306e-06, + "loss": 5.6396, + "step": 927 + }, + { + "epoch": 0.15882252267670718, + "grad_norm": 35.156925201416016, + "learning_rate": 5.265259555048488e-06, + "loss": 4.7143, + "step": 928 + }, + { + "epoch": 0.15899366763648812, + "grad_norm": 34.6341552734375, + "learning_rate": 5.270964061608671e-06, + "loss": 5.6931, + "step": 929 + }, + { + "epoch": 0.15916481259626905, + "grad_norm": 35.668331146240234, + "learning_rate": 5.276668568168854e-06, + "loss": 5.6404, + "step": 930 + }, + { + "epoch": 0.15933595755604998, + "grad_norm": 34.62514877319336, + "learning_rate": 5.282373074729036e-06, + "loss": 5.0469, + "step": 931 + }, + { + "epoch": 0.15950710251583092, + "grad_norm": 37.79499435424805, + "learning_rate": 5.288077581289219e-06, + "loss": 5.3761, + "step": 932 + }, + { + "epoch": 0.15967824747561185, + "grad_norm": 40.4017333984375, + "learning_rate": 5.293782087849401e-06, + "loss": 5.6738, + "step": 933 + }, + { + "epoch": 0.15984939243539278, + "grad_norm": 35.31856155395508, + "learning_rate": 5.299486594409584e-06, + "loss": 6.4936, + "step": 934 + }, + { + "epoch": 0.16002053739517372, + "grad_norm": 126.11963653564453, + "learning_rate": 5.305191100969766e-06, + "loss": 9.9326, + "step": 935 + }, + { + "epoch": 0.16019168235495465, + "grad_norm": 34.740753173828125, + "learning_rate": 5.310895607529949e-06, + "loss": 2.0987, + "step": 936 + }, + { + "epoch": 0.16036282731473558, + "grad_norm": 34.9671745300293, + "learning_rate": 5.316600114090131e-06, + "loss": 6.2338, + "step": 937 + }, + { + "epoch": 0.16053397227451652, + "grad_norm": 21.198925018310547, + "learning_rate": 5.322304620650314e-06, + "loss": 3.5463, + "step": 938 + }, + { + "epoch": 0.16070511723429745, + "grad_norm": 30.98229217529297, + "learning_rate": 5.328009127210496e-06, + "loss": 4.7342, + "step": 939 + }, + { + "epoch": 0.16087626219407838, + "grad_norm": 41.88993835449219, + "learning_rate": 5.333713633770679e-06, + "loss": 6.5058, + "step": 940 + }, + { + "epoch": 0.16104740715385932, + "grad_norm": 24.218576431274414, + "learning_rate": 5.3394181403308615e-06, + "loss": 2.0172, + 
"step": 941 + }, + { + "epoch": 0.16121855211364025, + "grad_norm": 32.891719818115234, + "learning_rate": 5.345122646891044e-06, + "loss": 5.893, + "step": 942 + }, + { + "epoch": 0.16138969707342118, + "grad_norm": 38.93867874145508, + "learning_rate": 5.3508271534512265e-06, + "loss": 5.8157, + "step": 943 + }, + { + "epoch": 0.16156084203320212, + "grad_norm": 31.02938461303711, + "learning_rate": 5.356531660011409e-06, + "loss": 5.529, + "step": 944 + }, + { + "epoch": 0.16173198699298305, + "grad_norm": 36.240440368652344, + "learning_rate": 5.362236166571592e-06, + "loss": 4.7931, + "step": 945 + }, + { + "epoch": 0.16190313195276398, + "grad_norm": 23.227556228637695, + "learning_rate": 5.367940673131775e-06, + "loss": 2.1265, + "step": 946 + }, + { + "epoch": 0.16207427691254492, + "grad_norm": 40.07374954223633, + "learning_rate": 5.373645179691957e-06, + "loss": 5.8823, + "step": 947 + }, + { + "epoch": 0.16224542187232585, + "grad_norm": 29.960735321044922, + "learning_rate": 5.379349686252139e-06, + "loss": 4.6281, + "step": 948 + }, + { + "epoch": 0.16241656683210678, + "grad_norm": 173.5910186767578, + "learning_rate": 5.385054192812322e-06, + "loss": 10.3282, + "step": 949 + }, + { + "epoch": 0.16258771179188772, + "grad_norm": 37.48442840576172, + "learning_rate": 5.390758699372504e-06, + "loss": 6.1584, + "step": 950 + }, + { + "epoch": 0.16275885675166865, + "grad_norm": 39.48939514160156, + "learning_rate": 5.396463205932687e-06, + "loss": 5.655, + "step": 951 + }, + { + "epoch": 0.16293000171144958, + "grad_norm": 34.57015609741211, + "learning_rate": 5.40216771249287e-06, + "loss": 5.4251, + "step": 952 + }, + { + "epoch": 0.16310114667123055, + "grad_norm": 51.02991485595703, + "learning_rate": 5.407872219053052e-06, + "loss": 10.2283, + "step": 953 + }, + { + "epoch": 0.16327229163101148, + "grad_norm": 31.77302360534668, + "learning_rate": 5.413576725613234e-06, + "loss": 4.0174, + "step": 954 + }, + { + "epoch": 0.1634434365907924, + "grad_norm": 31.242929458618164, + "learning_rate": 5.419281232173417e-06, + "loss": 5.5883, + "step": 955 + }, + { + "epoch": 0.16361458155057335, + "grad_norm": 31.789701461791992, + "learning_rate": 5.4249857387335994e-06, + "loss": 4.5646, + "step": 956 + }, + { + "epoch": 0.16378572651035428, + "grad_norm": 34.09980392456055, + "learning_rate": 5.430690245293783e-06, + "loss": 4.9872, + "step": 957 + }, + { + "epoch": 0.1639568714701352, + "grad_norm": 31.57735252380371, + "learning_rate": 5.436394751853965e-06, + "loss": 5.158, + "step": 958 + }, + { + "epoch": 0.16412801642991615, + "grad_norm": 32.941917419433594, + "learning_rate": 5.442099258414147e-06, + "loss": 5.4497, + "step": 959 + }, + { + "epoch": 0.16429916138969708, + "grad_norm": 200.919921875, + "learning_rate": 5.4478037649743296e-06, + "loss": 9.7888, + "step": 960 + }, + { + "epoch": 0.164470306349478, + "grad_norm": 28.78856658935547, + "learning_rate": 5.453508271534512e-06, + "loss": 5.0757, + "step": 961 + }, + { + "epoch": 0.16464145130925895, + "grad_norm": 22.877927780151367, + "learning_rate": 5.459212778094695e-06, + "loss": 3.6177, + "step": 962 + }, + { + "epoch": 0.16481259626903988, + "grad_norm": 24.904977798461914, + "learning_rate": 5.464917284654878e-06, + "loss": 4.2287, + "step": 963 + }, + { + "epoch": 0.1649837412288208, + "grad_norm": 35.849124908447266, + "learning_rate": 5.4706217912150605e-06, + "loss": 5.1121, + "step": 964 + }, + { + "epoch": 0.16515488618860175, + "grad_norm": 31.580976486206055, + "learning_rate": 
5.476326297775242e-06, + "loss": 4.4859, + "step": 965 + }, + { + "epoch": 0.16532603114838268, + "grad_norm": 30.3056697845459, + "learning_rate": 5.482030804335425e-06, + "loss": 4.5076, + "step": 966 + }, + { + "epoch": 0.1654971761081636, + "grad_norm": 34.674468994140625, + "learning_rate": 5.487735310895607e-06, + "loss": 5.7789, + "step": 967 + }, + { + "epoch": 0.16566832106794455, + "grad_norm": 28.0445556640625, + "learning_rate": 5.49343981745579e-06, + "loss": 2.7613, + "step": 968 + }, + { + "epoch": 0.16583946602772548, + "grad_norm": 33.28575134277344, + "learning_rate": 5.499144324015973e-06, + "loss": 5.1032, + "step": 969 + }, + { + "epoch": 0.1660106109875064, + "grad_norm": 35.53700637817383, + "learning_rate": 5.504848830576156e-06, + "loss": 5.2129, + "step": 970 + }, + { + "epoch": 0.16618175594728735, + "grad_norm": 33.2183952331543, + "learning_rate": 5.510553337136338e-06, + "loss": 5.6908, + "step": 971 + }, + { + "epoch": 0.16635290090706828, + "grad_norm": 30.640926361083984, + "learning_rate": 5.51625784369652e-06, + "loss": 4.4325, + "step": 972 + }, + { + "epoch": 0.1665240458668492, + "grad_norm": 24.672338485717773, + "learning_rate": 5.5219623502567025e-06, + "loss": 3.9552, + "step": 973 + }, + { + "epoch": 0.16669519082663015, + "grad_norm": 33.66337585449219, + "learning_rate": 5.527666856816886e-06, + "loss": 5.4014, + "step": 974 + }, + { + "epoch": 0.16686633578641108, + "grad_norm": 32.082942962646484, + "learning_rate": 5.533371363377068e-06, + "loss": 5.9258, + "step": 975 + }, + { + "epoch": 0.167037480746192, + "grad_norm": 37.91094970703125, + "learning_rate": 5.539075869937251e-06, + "loss": 5.717, + "step": 976 + }, + { + "epoch": 0.16720862570597295, + "grad_norm": 20.26280975341797, + "learning_rate": 5.5447803764974335e-06, + "loss": 2.2263, + "step": 977 + }, + { + "epoch": 0.16737977066575388, + "grad_norm": 48.14308547973633, + "learning_rate": 5.550484883057615e-06, + "loss": 9.6938, + "step": 978 + }, + { + "epoch": 0.16755091562553484, + "grad_norm": 22.81192970275879, + "learning_rate": 5.556189389617798e-06, + "loss": 3.7015, + "step": 979 + }, + { + "epoch": 0.16772206058531577, + "grad_norm": 27.474571228027344, + "learning_rate": 5.561893896177981e-06, + "loss": 2.9404, + "step": 980 + }, + { + "epoch": 0.1678932055450967, + "grad_norm": 25.376007080078125, + "learning_rate": 5.567598402738164e-06, + "loss": 2.3926, + "step": 981 + }, + { + "epoch": 0.16806435050487764, + "grad_norm": 31.575468063354492, + "learning_rate": 5.573302909298346e-06, + "loss": 4.7349, + "step": 982 + }, + { + "epoch": 0.16823549546465857, + "grad_norm": 194.93817138671875, + "learning_rate": 5.579007415858529e-06, + "loss": 9.7172, + "step": 983 + }, + { + "epoch": 0.1684066404244395, + "grad_norm": 31.26558494567871, + "learning_rate": 5.58471192241871e-06, + "loss": 3.9837, + "step": 984 + }, + { + "epoch": 0.16857778538422044, + "grad_norm": 32.1373405456543, + "learning_rate": 5.590416428978893e-06, + "loss": 5.0026, + "step": 985 + }, + { + "epoch": 0.16874893034400137, + "grad_norm": 37.07416915893555, + "learning_rate": 5.596120935539076e-06, + "loss": 5.8572, + "step": 986 + }, + { + "epoch": 0.1689200753037823, + "grad_norm": 35.09983825683594, + "learning_rate": 5.601825442099259e-06, + "loss": 5.6302, + "step": 987 + }, + { + "epoch": 0.16909122026356324, + "grad_norm": 46.96855926513672, + "learning_rate": 5.607529948659441e-06, + "loss": 9.6255, + "step": 988 + }, + { + "epoch": 0.16926236522334417, + "grad_norm": 36.15262985229492, + 
"learning_rate": 5.613234455219624e-06, + "loss": 5.5484, + "step": 989 + }, + { + "epoch": 0.1694335101831251, + "grad_norm": 33.642967224121094, + "learning_rate": 5.6189389617798055e-06, + "loss": 5.5827, + "step": 990 + }, + { + "epoch": 0.16960465514290604, + "grad_norm": 27.581716537475586, + "learning_rate": 5.624643468339988e-06, + "loss": 2.9652, + "step": 991 + }, + { + "epoch": 0.16977580010268697, + "grad_norm": 19.107044219970703, + "learning_rate": 5.6303479749001714e-06, + "loss": 1.7442, + "step": 992 + }, + { + "epoch": 0.1699469450624679, + "grad_norm": 165.6937255859375, + "learning_rate": 5.636052481460354e-06, + "loss": 10.2439, + "step": 993 + }, + { + "epoch": 0.17011809002224884, + "grad_norm": 171.38658142089844, + "learning_rate": 5.6417569880205365e-06, + "loss": 10.7544, + "step": 994 + }, + { + "epoch": 0.17028923498202977, + "grad_norm": 29.20503807067871, + "learning_rate": 5.647461494580719e-06, + "loss": 4.176, + "step": 995 + }, + { + "epoch": 0.1704603799418107, + "grad_norm": 29.09612274169922, + "learning_rate": 5.6531660011409016e-06, + "loss": 4.1945, + "step": 996 + }, + { + "epoch": 0.17063152490159164, + "grad_norm": 39.78682327270508, + "learning_rate": 5.658870507701084e-06, + "loss": 6.4205, + "step": 997 + }, + { + "epoch": 0.17080266986137257, + "grad_norm": 13.687639236450195, + "learning_rate": 5.664575014261267e-06, + "loss": 3.468, + "step": 998 + }, + { + "epoch": 0.1709738148211535, + "grad_norm": 41.89799118041992, + "learning_rate": 5.670279520821449e-06, + "loss": 7.13, + "step": 999 + }, + { + "epoch": 0.17114495978093444, + "grad_norm": 22.78835678100586, + "learning_rate": 5.675984027381632e-06, + "loss": 2.7249, + "step": 1000 + }, + { + "epoch": 0.17131610474071537, + "grad_norm": 26.538780212402344, + "learning_rate": 5.681688533941814e-06, + "loss": 3.2385, + "step": 1001 + }, + { + "epoch": 0.1714872497004963, + "grad_norm": 24.171205520629883, + "learning_rate": 5.687393040501997e-06, + "loss": 3.7183, + "step": 1002 + }, + { + "epoch": 0.17165839466027724, + "grad_norm": 35.46499252319336, + "learning_rate": 5.693097547062179e-06, + "loss": 5.4996, + "step": 1003 + }, + { + "epoch": 0.1718295396200582, + "grad_norm": 15.119646072387695, + "learning_rate": 5.698802053622362e-06, + "loss": 2.4476, + "step": 1004 + }, + { + "epoch": 0.17200068457983914, + "grad_norm": 43.560546875, + "learning_rate": 5.704506560182544e-06, + "loss": 9.1856, + "step": 1005 + }, + { + "epoch": 0.17217182953962007, + "grad_norm": 42.41808319091797, + "learning_rate": 5.710211066742727e-06, + "loss": 5.6756, + "step": 1006 + }, + { + "epoch": 0.172342974499401, + "grad_norm": 34.344207763671875, + "learning_rate": 5.715915573302909e-06, + "loss": 5.2383, + "step": 1007 + }, + { + "epoch": 0.17251411945918194, + "grad_norm": 19.511310577392578, + "learning_rate": 5.721620079863092e-06, + "loss": 3.3214, + "step": 1008 + }, + { + "epoch": 0.17268526441896287, + "grad_norm": 33.06563949584961, + "learning_rate": 5.7273245864232745e-06, + "loss": 5.6944, + "step": 1009 + }, + { + "epoch": 0.1728564093787438, + "grad_norm": 38.382041931152344, + "learning_rate": 5.733029092983457e-06, + "loss": 5.9898, + "step": 1010 + }, + { + "epoch": 0.17302755433852474, + "grad_norm": 28.5861759185791, + "learning_rate": 5.7387335995436395e-06, + "loss": 5.2048, + "step": 1011 + }, + { + "epoch": 0.17319869929830567, + "grad_norm": 31.76646614074707, + "learning_rate": 5.744438106103822e-06, + "loss": 6.0811, + "step": 1012 + }, + { + "epoch": 0.1733698442580866, + 
"grad_norm": 37.81482696533203, + "learning_rate": 5.750142612664005e-06, + "loss": 4.8642, + "step": 1013 + }, + { + "epoch": 0.17354098921786754, + "grad_norm": 45.32394790649414, + "learning_rate": 5.755847119224188e-06, + "loss": 9.5803, + "step": 1014 + }, + { + "epoch": 0.17371213417764847, + "grad_norm": 35.39071273803711, + "learning_rate": 5.76155162578437e-06, + "loss": 4.3758, + "step": 1015 + }, + { + "epoch": 0.1738832791374294, + "grad_norm": 31.971323013305664, + "learning_rate": 5.767256132344552e-06, + "loss": 4.2616, + "step": 1016 + }, + { + "epoch": 0.17405442409721034, + "grad_norm": 29.855161666870117, + "learning_rate": 5.772960638904735e-06, + "loss": 5.5371, + "step": 1017 + }, + { + "epoch": 0.17422556905699127, + "grad_norm": 21.00974464416504, + "learning_rate": 5.778665145464917e-06, + "loss": 1.9809, + "step": 1018 + }, + { + "epoch": 0.1743967140167722, + "grad_norm": 23.60835075378418, + "learning_rate": 5.7843696520251e-06, + "loss": 2.5916, + "step": 1019 + }, + { + "epoch": 0.17456785897655314, + "grad_norm": 36.11520767211914, + "learning_rate": 5.790074158585283e-06, + "loss": 4.9198, + "step": 1020 + }, + { + "epoch": 0.17473900393633407, + "grad_norm": 21.838703155517578, + "learning_rate": 5.795778665145466e-06, + "loss": 2.1235, + "step": 1021 + }, + { + "epoch": 0.174910148896115, + "grad_norm": 28.41387367248535, + "learning_rate": 5.801483171705647e-06, + "loss": 5.0401, + "step": 1022 + }, + { + "epoch": 0.17508129385589594, + "grad_norm": 28.482187271118164, + "learning_rate": 5.80718767826583e-06, + "loss": 4.7167, + "step": 1023 + }, + { + "epoch": 0.17525243881567687, + "grad_norm": 33.954307556152344, + "learning_rate": 5.8128921848260124e-06, + "loss": 4.9666, + "step": 1024 + }, + { + "epoch": 0.1754235837754578, + "grad_norm": 33.401920318603516, + "learning_rate": 5.818596691386195e-06, + "loss": 6.3783, + "step": 1025 + }, + { + "epoch": 0.17559472873523874, + "grad_norm": 37.047691345214844, + "learning_rate": 5.824301197946378e-06, + "loss": 5.5925, + "step": 1026 + }, + { + "epoch": 0.17576587369501967, + "grad_norm": 30.060083389282227, + "learning_rate": 5.830005704506561e-06, + "loss": 3.8415, + "step": 1027 + }, + { + "epoch": 0.1759370186548006, + "grad_norm": 30.832544326782227, + "learning_rate": 5.8357102110667426e-06, + "loss": 4.9379, + "step": 1028 + }, + { + "epoch": 0.17610816361458156, + "grad_norm": 30.651966094970703, + "learning_rate": 5.841414717626925e-06, + "loss": 3.9393, + "step": 1029 + }, + { + "epoch": 0.1762793085743625, + "grad_norm": 12.284616470336914, + "learning_rate": 5.847119224187108e-06, + "loss": 2.7979, + "step": 1030 + }, + { + "epoch": 0.17645045353414343, + "grad_norm": 25.138864517211914, + "learning_rate": 5.85282373074729e-06, + "loss": 3.6294, + "step": 1031 + }, + { + "epoch": 0.17662159849392436, + "grad_norm": 19.136524200439453, + "learning_rate": 5.8585282373074735e-06, + "loss": 1.5926, + "step": 1032 + }, + { + "epoch": 0.1767927434537053, + "grad_norm": 36.646968841552734, + "learning_rate": 5.864232743867656e-06, + "loss": 5.8265, + "step": 1033 + }, + { + "epoch": 0.17696388841348623, + "grad_norm": 17.363170623779297, + "learning_rate": 5.869937250427838e-06, + "loss": 1.7465, + "step": 1034 + }, + { + "epoch": 0.17713503337326716, + "grad_norm": 29.55439567565918, + "learning_rate": 5.87564175698802e-06, + "loss": 3.617, + "step": 1035 + }, + { + "epoch": 0.1773061783330481, + "grad_norm": 203.16549682617188, + "learning_rate": 5.881346263548203e-06, + "loss": 7.9826, + "step": 
1036 + }, + { + "epoch": 0.17747732329282903, + "grad_norm": 17.790836334228516, + "learning_rate": 5.887050770108386e-06, + "loss": 2.1574, + "step": 1037 + }, + { + "epoch": 0.17764846825260996, + "grad_norm": 40.40040969848633, + "learning_rate": 5.892755276668569e-06, + "loss": 5.5116, + "step": 1038 + }, + { + "epoch": 0.1778196132123909, + "grad_norm": 30.316959381103516, + "learning_rate": 5.898459783228751e-06, + "loss": 4.4268, + "step": 1039 + }, + { + "epoch": 0.17799075817217183, + "grad_norm": 34.86418151855469, + "learning_rate": 5.904164289788933e-06, + "loss": 4.9673, + "step": 1040 + }, + { + "epoch": 0.17816190313195276, + "grad_norm": 198.34268188476562, + "learning_rate": 5.9098687963491155e-06, + "loss": 10.3881, + "step": 1041 + }, + { + "epoch": 0.1783330480917337, + "grad_norm": 29.608211517333984, + "learning_rate": 5.915573302909298e-06, + "loss": 3.9641, + "step": 1042 + }, + { + "epoch": 0.17850419305151463, + "grad_norm": 28.76857566833496, + "learning_rate": 5.921277809469481e-06, + "loss": 4.0211, + "step": 1043 + }, + { + "epoch": 0.17867533801129556, + "grad_norm": 26.37080955505371, + "learning_rate": 5.926982316029664e-06, + "loss": 4.6642, + "step": 1044 + }, + { + "epoch": 0.1788464829710765, + "grad_norm": 32.01490020751953, + "learning_rate": 5.9326868225898464e-06, + "loss": 5.5217, + "step": 1045 + }, + { + "epoch": 0.17901762793085743, + "grad_norm": 22.62516212463379, + "learning_rate": 5.938391329150029e-06, + "loss": 1.9563, + "step": 1046 + }, + { + "epoch": 0.17918877289063836, + "grad_norm": 40.089229583740234, + "learning_rate": 5.944095835710211e-06, + "loss": 5.9567, + "step": 1047 + }, + { + "epoch": 0.1793599178504193, + "grad_norm": 22.854562759399414, + "learning_rate": 5.949800342270393e-06, + "loss": 1.9063, + "step": 1048 + }, + { + "epoch": 0.17953106281020023, + "grad_norm": 99.86076354980469, + "learning_rate": 5.9555048488305766e-06, + "loss": 6.6872, + "step": 1049 + }, + { + "epoch": 0.17970220776998116, + "grad_norm": 42.04011154174805, + "learning_rate": 5.961209355390759e-06, + "loss": 6.4974, + "step": 1050 + }, + { + "epoch": 0.1798733527297621, + "grad_norm": 26.85508155822754, + "learning_rate": 5.966913861950942e-06, + "loss": 4.3443, + "step": 1051 + }, + { + "epoch": 0.18004449768954303, + "grad_norm": 29.8301944732666, + "learning_rate": 5.972618368511124e-06, + "loss": 5.0599, + "step": 1052 + }, + { + "epoch": 0.18021564264932396, + "grad_norm": 50.89991760253906, + "learning_rate": 5.978322875071306e-06, + "loss": 9.764, + "step": 1053 + }, + { + "epoch": 0.1803867876091049, + "grad_norm": 32.19784927368164, + "learning_rate": 5.984027381631489e-06, + "loss": 4.1811, + "step": 1054 + }, + { + "epoch": 0.18055793256888586, + "grad_norm": 46.780487060546875, + "learning_rate": 5.989731888191672e-06, + "loss": 9.4505, + "step": 1055 + }, + { + "epoch": 0.1807290775286668, + "grad_norm": 17.571828842163086, + "learning_rate": 5.995436394751854e-06, + "loss": 1.8957, + "step": 1056 + }, + { + "epoch": 0.18090022248844773, + "grad_norm": 30.740095138549805, + "learning_rate": 6.001140901312037e-06, + "loss": 4.0522, + "step": 1057 + }, + { + "epoch": 0.18107136744822866, + "grad_norm": 36.38762283325195, + "learning_rate": 6.006845407872219e-06, + "loss": 5.546, + "step": 1058 + }, + { + "epoch": 0.1812425124080096, + "grad_norm": 37.66824722290039, + "learning_rate": 6.012549914432401e-06, + "loss": 4.7406, + "step": 1059 + }, + { + "epoch": 0.18141365736779053, + "grad_norm": 33.9829216003418, + "learning_rate": 
6.018254420992584e-06, + "loss": 4.8123, + "step": 1060 + }, + { + "epoch": 0.18158480232757146, + "grad_norm": 25.99117088317871, + "learning_rate": 6.023958927552767e-06, + "loss": 4.6063, + "step": 1061 + }, + { + "epoch": 0.1817559472873524, + "grad_norm": 29.198394775390625, + "learning_rate": 6.0296634341129495e-06, + "loss": 5.0514, + "step": 1062 + }, + { + "epoch": 0.18192709224713333, + "grad_norm": 14.127655982971191, + "learning_rate": 6.035367940673132e-06, + "loss": 1.3962, + "step": 1063 + }, + { + "epoch": 0.18209823720691426, + "grad_norm": 12.10257339477539, + "learning_rate": 6.0410724472333145e-06, + "loss": 2.0181, + "step": 1064 + }, + { + "epoch": 0.1822693821666952, + "grad_norm": 19.635854721069336, + "learning_rate": 6.046776953793496e-06, + "loss": 1.7151, + "step": 1065 + }, + { + "epoch": 0.18244052712647613, + "grad_norm": 189.35772705078125, + "learning_rate": 6.05248146035368e-06, + "loss": 9.8327, + "step": 1066 + }, + { + "epoch": 0.18261167208625706, + "grad_norm": 34.833229064941406, + "learning_rate": 6.058185966913862e-06, + "loss": 5.6448, + "step": 1067 + }, + { + "epoch": 0.182782817046038, + "grad_norm": 24.17336654663086, + "learning_rate": 6.063890473474045e-06, + "loss": 3.8977, + "step": 1068 + }, + { + "epoch": 0.18295396200581893, + "grad_norm": 32.84638214111328, + "learning_rate": 6.069594980034227e-06, + "loss": 5.7649, + "step": 1069 + }, + { + "epoch": 0.18312510696559986, + "grad_norm": 46.32835388183594, + "learning_rate": 6.07529948659441e-06, + "loss": 9.2569, + "step": 1070 + }, + { + "epoch": 0.1832962519253808, + "grad_norm": 15.697673797607422, + "learning_rate": 6.081003993154592e-06, + "loss": 1.6445, + "step": 1071 + }, + { + "epoch": 0.18346739688516173, + "grad_norm": 31.891868591308594, + "learning_rate": 6.086708499714775e-06, + "loss": 5.4669, + "step": 1072 + }, + { + "epoch": 0.18363854184494266, + "grad_norm": 29.735248565673828, + "learning_rate": 6.092413006274957e-06, + "loss": 5.0552, + "step": 1073 + }, + { + "epoch": 0.1838096868047236, + "grad_norm": 15.486328125, + "learning_rate": 6.09811751283514e-06, + "loss": 2.2292, + "step": 1074 + }, + { + "epoch": 0.18398083176450453, + "grad_norm": 24.518693923950195, + "learning_rate": 6.103822019395322e-06, + "loss": 3.5355, + "step": 1075 + }, + { + "epoch": 0.18415197672428546, + "grad_norm": 27.474645614624023, + "learning_rate": 6.109526525955505e-06, + "loss": 2.0704, + "step": 1076 + }, + { + "epoch": 0.1843231216840664, + "grad_norm": 21.003856658935547, + "learning_rate": 6.115231032515688e-06, + "loss": 2.0773, + "step": 1077 + }, + { + "epoch": 0.18449426664384733, + "grad_norm": 12.948555946350098, + "learning_rate": 6.12093553907587e-06, + "loss": 1.9105, + "step": 1078 + }, + { + "epoch": 0.18466541160362826, + "grad_norm": 28.35967254638672, + "learning_rate": 6.1266400456360525e-06, + "loss": 5.1778, + "step": 1079 + }, + { + "epoch": 0.18483655656340922, + "grad_norm": 28.59235954284668, + "learning_rate": 6.132344552196235e-06, + "loss": 3.9724, + "step": 1080 + }, + { + "epoch": 0.18500770152319015, + "grad_norm": 32.077518463134766, + "learning_rate": 6.138049058756418e-06, + "loss": 4.2397, + "step": 1081 + }, + { + "epoch": 0.1851788464829711, + "grad_norm": 34.8428955078125, + "learning_rate": 6.1437535653166e-06, + "loss": 4.3906, + "step": 1082 + }, + { + "epoch": 0.18534999144275202, + "grad_norm": 36.8244743347168, + "learning_rate": 6.1494580718767835e-06, + "loss": 4.6433, + "step": 1083 + }, + { + "epoch": 0.18552113640253295, + 
"grad_norm": 34.37318420410156, + "learning_rate": 6.155162578436965e-06, + "loss": 4.7285, + "step": 1084 + }, + { + "epoch": 0.1856922813623139, + "grad_norm": 34.02301025390625, + "learning_rate": 6.160867084997148e-06, + "loss": 5.1995, + "step": 1085 + }, + { + "epoch": 0.18586342632209482, + "grad_norm": 15.779897689819336, + "learning_rate": 6.16657159155733e-06, + "loss": 1.5138, + "step": 1086 + }, + { + "epoch": 0.18603457128187575, + "grad_norm": 45.183841705322266, + "learning_rate": 6.172276098117513e-06, + "loss": 6.6194, + "step": 1087 + }, + { + "epoch": 0.1862057162416567, + "grad_norm": 15.437774658203125, + "learning_rate": 6.177980604677695e-06, + "loss": 1.4242, + "step": 1088 + }, + { + "epoch": 0.18637686120143762, + "grad_norm": 246.0555419921875, + "learning_rate": 6.183685111237879e-06, + "loss": 10.7677, + "step": 1089 + }, + { + "epoch": 0.18654800616121855, + "grad_norm": 8.7081937789917, + "learning_rate": 6.18938961779806e-06, + "loss": 2.3527, + "step": 1090 + }, + { + "epoch": 0.1867191511209995, + "grad_norm": 35.0928840637207, + "learning_rate": 6.195094124358243e-06, + "loss": 5.4856, + "step": 1091 + }, + { + "epoch": 0.18689029608078042, + "grad_norm": 36.24078369140625, + "learning_rate": 6.2007986309184254e-06, + "loss": 5.1105, + "step": 1092 + }, + { + "epoch": 0.18706144104056135, + "grad_norm": 41.07029724121094, + "learning_rate": 6.206503137478608e-06, + "loss": 5.543, + "step": 1093 + }, + { + "epoch": 0.1872325860003423, + "grad_norm": 36.27534484863281, + "learning_rate": 6.212207644038791e-06, + "loss": 4.4058, + "step": 1094 + }, + { + "epoch": 0.18740373096012322, + "grad_norm": 34.61309814453125, + "learning_rate": 6.217912150598974e-06, + "loss": 4.9065, + "step": 1095 + }, + { + "epoch": 0.18757487591990415, + "grad_norm": 36.856388092041016, + "learning_rate": 6.223616657159156e-06, + "loss": 4.8059, + "step": 1096 + }, + { + "epoch": 0.1877460208796851, + "grad_norm": 39.40951156616211, + "learning_rate": 6.229321163719338e-06, + "loss": 5.8853, + "step": 1097 + }, + { + "epoch": 0.18791716583946602, + "grad_norm": 30.013790130615234, + "learning_rate": 6.235025670279521e-06, + "loss": 4.1051, + "step": 1098 + }, + { + "epoch": 0.18808831079924695, + "grad_norm": 27.43667984008789, + "learning_rate": 6.240730176839703e-06, + "loss": 3.661, + "step": 1099 + }, + { + "epoch": 0.1882594557590279, + "grad_norm": 22.01202964782715, + "learning_rate": 6.2464346833998865e-06, + "loss": 2.0165, + "step": 1100 + }, + { + "epoch": 0.18843060071880882, + "grad_norm": 23.981887817382812, + "learning_rate": 6.252139189960069e-06, + "loss": 1.8586, + "step": 1101 + }, + { + "epoch": 0.18860174567858976, + "grad_norm": 221.93540954589844, + "learning_rate": 6.257843696520252e-06, + "loss": 8.0869, + "step": 1102 + }, + { + "epoch": 0.1887728906383707, + "grad_norm": 32.2524299621582, + "learning_rate": 6.263548203080433e-06, + "loss": 4.6553, + "step": 1103 + }, + { + "epoch": 0.18894403559815162, + "grad_norm": 14.555329322814941, + "learning_rate": 6.269252709640616e-06, + "loss": 2.0657, + "step": 1104 + }, + { + "epoch": 0.18911518055793256, + "grad_norm": 27.233903884887695, + "learning_rate": 6.274957216200798e-06, + "loss": 3.7143, + "step": 1105 + }, + { + "epoch": 0.18928632551771352, + "grad_norm": 15.294402122497559, + "learning_rate": 6.280661722760982e-06, + "loss": 1.4409, + "step": 1106 + }, + { + "epoch": 0.18945747047749445, + "grad_norm": 223.1316375732422, + "learning_rate": 6.286366229321164e-06, + "loss": 9.676, + "step": 1107 
+ }, + { + "epoch": 0.18962861543727538, + "grad_norm": 36.643463134765625, + "learning_rate": 6.292070735881347e-06, + "loss": 4.7202, + "step": 1108 + }, + { + "epoch": 0.18979976039705632, + "grad_norm": 37.47721481323242, + "learning_rate": 6.2977752424415285e-06, + "loss": 4.8366, + "step": 1109 + }, + { + "epoch": 0.18997090535683725, + "grad_norm": 34.74982833862305, + "learning_rate": 6.303479749001711e-06, + "loss": 4.6667, + "step": 1110 + }, + { + "epoch": 0.19014205031661818, + "grad_norm": 38.055728912353516, + "learning_rate": 6.3091842555618935e-06, + "loss": 5.3396, + "step": 1111 + }, + { + "epoch": 0.19031319527639912, + "grad_norm": 33.44966506958008, + "learning_rate": 6.314888762122077e-06, + "loss": 5.0909, + "step": 1112 + }, + { + "epoch": 0.19048434023618005, + "grad_norm": 34.397132873535156, + "learning_rate": 6.3205932686822594e-06, + "loss": 5.3514, + "step": 1113 + }, + { + "epoch": 0.19065548519596098, + "grad_norm": 39.06338119506836, + "learning_rate": 6.326297775242442e-06, + "loss": 6.3797, + "step": 1114 + }, + { + "epoch": 0.19082663015574192, + "grad_norm": 40.017799377441406, + "learning_rate": 6.332002281802624e-06, + "loss": 5.5943, + "step": 1115 + }, + { + "epoch": 0.19099777511552285, + "grad_norm": 11.964347839355469, + "learning_rate": 6.337706788362806e-06, + "loss": 1.8095, + "step": 1116 + }, + { + "epoch": 0.19116892007530378, + "grad_norm": 12.956400871276855, + "learning_rate": 6.3434112949229896e-06, + "loss": 1.3529, + "step": 1117 + }, + { + "epoch": 0.19134006503508472, + "grad_norm": 36.93289566040039, + "learning_rate": 6.349115801483172e-06, + "loss": 6.0492, + "step": 1118 + }, + { + "epoch": 0.19151120999486565, + "grad_norm": 33.92202377319336, + "learning_rate": 6.354820308043355e-06, + "loss": 5.9093, + "step": 1119 + }, + { + "epoch": 0.19168235495464658, + "grad_norm": 37.51108169555664, + "learning_rate": 6.360524814603537e-06, + "loss": 5.5156, + "step": 1120 + }, + { + "epoch": 0.19185349991442752, + "grad_norm": 23.369075775146484, + "learning_rate": 6.36622932116372e-06, + "loss": 3.9585, + "step": 1121 + }, + { + "epoch": 0.19202464487420845, + "grad_norm": 27.76898765563965, + "learning_rate": 6.371933827723901e-06, + "loss": 4.0578, + "step": 1122 + }, + { + "epoch": 0.19219578983398938, + "grad_norm": 21.719980239868164, + "learning_rate": 6.377638334284085e-06, + "loss": 1.6746, + "step": 1123 + }, + { + "epoch": 0.19236693479377032, + "grad_norm": 32.65765380859375, + "learning_rate": 6.383342840844267e-06, + "loss": 4.4355, + "step": 1124 + }, + { + "epoch": 0.19253807975355125, + "grad_norm": 31.302228927612305, + "learning_rate": 6.38904734740445e-06, + "loss": 4.3111, + "step": 1125 + }, + { + "epoch": 0.19270922471333218, + "grad_norm": 36.785396575927734, + "learning_rate": 6.394751853964632e-06, + "loss": 5.3737, + "step": 1126 + }, + { + "epoch": 0.19288036967311312, + "grad_norm": 32.185787200927734, + "learning_rate": 6.400456360524815e-06, + "loss": 4.2842, + "step": 1127 + }, + { + "epoch": 0.19305151463289405, + "grad_norm": 49.154666900634766, + "learning_rate": 6.4061608670849966e-06, + "loss": 8.8989, + "step": 1128 + }, + { + "epoch": 0.19322265959267498, + "grad_norm": 31.552207946777344, + "learning_rate": 6.41186537364518e-06, + "loss": 4.2685, + "step": 1129 + }, + { + "epoch": 0.19339380455245592, + "grad_norm": 21.41136932373047, + "learning_rate": 6.4175698802053625e-06, + "loss": 2.3051, + "step": 1130 + }, + { + "epoch": 0.19356494951223688, + "grad_norm": 13.525940895080566, + 
"learning_rate": 6.423274386765545e-06, + "loss": 2.1123, + "step": 1131 + }, + { + "epoch": 0.1937360944720178, + "grad_norm": 37.48530960083008, + "learning_rate": 6.4289788933257275e-06, + "loss": 4.8037, + "step": 1132 + }, + { + "epoch": 0.19390723943179874, + "grad_norm": 38.14132308959961, + "learning_rate": 6.43468339988591e-06, + "loss": 6.2294, + "step": 1133 + }, + { + "epoch": 0.19407838439157968, + "grad_norm": 33.01750183105469, + "learning_rate": 6.440387906446093e-06, + "loss": 4.9204, + "step": 1134 + }, + { + "epoch": 0.1942495293513606, + "grad_norm": 36.364158630371094, + "learning_rate": 6.446092413006275e-06, + "loss": 4.5797, + "step": 1135 + }, + { + "epoch": 0.19442067431114154, + "grad_norm": 46.81378173828125, + "learning_rate": 6.451796919566458e-06, + "loss": 6.538, + "step": 1136 + }, + { + "epoch": 0.19459181927092248, + "grad_norm": 23.135957717895508, + "learning_rate": 6.45750142612664e-06, + "loss": 4.3991, + "step": 1137 + }, + { + "epoch": 0.1947629642307034, + "grad_norm": 25.031917572021484, + "learning_rate": 6.463205932686823e-06, + "loss": 2.3886, + "step": 1138 + }, + { + "epoch": 0.19493410919048434, + "grad_norm": 35.31920623779297, + "learning_rate": 6.468910439247005e-06, + "loss": 6.0172, + "step": 1139 + }, + { + "epoch": 0.19510525415026528, + "grad_norm": 36.97047424316406, + "learning_rate": 6.474614945807188e-06, + "loss": 5.4822, + "step": 1140 + }, + { + "epoch": 0.1952763991100462, + "grad_norm": 31.77883529663086, + "learning_rate": 6.48031945236737e-06, + "loss": 4.7072, + "step": 1141 + }, + { + "epoch": 0.19544754406982714, + "grad_norm": 28.897930145263672, + "learning_rate": 6.486023958927553e-06, + "loss": 3.7105, + "step": 1142 + }, + { + "epoch": 0.19561868902960808, + "grad_norm": 29.99696922302246, + "learning_rate": 6.491728465487735e-06, + "loss": 4.5102, + "step": 1143 + }, + { + "epoch": 0.195789833989389, + "grad_norm": 25.783557891845703, + "learning_rate": 6.497432972047918e-06, + "loss": 3.6023, + "step": 1144 + }, + { + "epoch": 0.19596097894916994, + "grad_norm": 35.004642486572266, + "learning_rate": 6.5031374786081005e-06, + "loss": 4.1587, + "step": 1145 + }, + { + "epoch": 0.19613212390895088, + "grad_norm": 173.46754455566406, + "learning_rate": 6.508841985168284e-06, + "loss": 7.5547, + "step": 1146 + }, + { + "epoch": 0.1963032688687318, + "grad_norm": 18.749853134155273, + "learning_rate": 6.5145464917284655e-06, + "loss": 1.7298, + "step": 1147 + }, + { + "epoch": 0.19647441382851275, + "grad_norm": 31.15353012084961, + "learning_rate": 6.520250998288648e-06, + "loss": 5.4053, + "step": 1148 + }, + { + "epoch": 0.19664555878829368, + "grad_norm": 21.659912109375, + "learning_rate": 6.525955504848831e-06, + "loss": 1.8891, + "step": 1149 + }, + { + "epoch": 0.1968167037480746, + "grad_norm": 23.412139892578125, + "learning_rate": 6.531660011409013e-06, + "loss": 3.8619, + "step": 1150 + }, + { + "epoch": 0.19698784870785555, + "grad_norm": 22.16069221496582, + "learning_rate": 6.537364517969196e-06, + "loss": 2.0106, + "step": 1151 + }, + { + "epoch": 0.19715899366763648, + "grad_norm": 33.494136810302734, + "learning_rate": 6.543069024529379e-06, + "loss": 5.4958, + "step": 1152 + }, + { + "epoch": 0.1973301386274174, + "grad_norm": 32.96882629394531, + "learning_rate": 6.548773531089561e-06, + "loss": 4.5927, + "step": 1153 + }, + { + "epoch": 0.19750128358719835, + "grad_norm": 36.14384078979492, + "learning_rate": 6.554478037649743e-06, + "loss": 5.6357, + "step": 1154 + }, + { + "epoch": 
0.19767242854697928, + "grad_norm": 23.875118255615234, + "learning_rate": 6.560182544209926e-06, + "loss": 3.158, + "step": 1155 + }, + { + "epoch": 0.19784357350676024, + "grad_norm": 23.001026153564453, + "learning_rate": 6.565887050770108e-06, + "loss": 1.8949, + "step": 1156 + }, + { + "epoch": 0.19801471846654117, + "grad_norm": 46.26600646972656, + "learning_rate": 6.571591557330292e-06, + "loss": 9.1329, + "step": 1157 + }, + { + "epoch": 0.1981858634263221, + "grad_norm": 16.32296371459961, + "learning_rate": 6.577296063890474e-06, + "loss": 1.5302, + "step": 1158 + }, + { + "epoch": 0.19835700838610304, + "grad_norm": 26.114614486694336, + "learning_rate": 6.583000570450656e-06, + "loss": 2.3763, + "step": 1159 + }, + { + "epoch": 0.19852815334588397, + "grad_norm": 37.42622756958008, + "learning_rate": 6.5887050770108384e-06, + "loss": 5.5999, + "step": 1160 + }, + { + "epoch": 0.1986992983056649, + "grad_norm": 21.48786735534668, + "learning_rate": 6.594409583571021e-06, + "loss": 3.4369, + "step": 1161 + }, + { + "epoch": 0.19887044326544584, + "grad_norm": 24.472808837890625, + "learning_rate": 6.6001140901312035e-06, + "loss": 2.0175, + "step": 1162 + }, + { + "epoch": 0.19904158822522677, + "grad_norm": 25.275909423828125, + "learning_rate": 6.605818596691387e-06, + "loss": 2.6992, + "step": 1163 + }, + { + "epoch": 0.1992127331850077, + "grad_norm": 29.439197540283203, + "learning_rate": 6.611523103251569e-06, + "loss": 4.4373, + "step": 1164 + }, + { + "epoch": 0.19938387814478864, + "grad_norm": 224.64663696289062, + "learning_rate": 6.617227609811751e-06, + "loss": 10.3737, + "step": 1165 + }, + { + "epoch": 0.19955502310456957, + "grad_norm": 34.043575286865234, + "learning_rate": 6.622932116371934e-06, + "loss": 5.0921, + "step": 1166 + }, + { + "epoch": 0.1997261680643505, + "grad_norm": 11.060107231140137, + "learning_rate": 6.628636622932116e-06, + "loss": 1.2996, + "step": 1167 + }, + { + "epoch": 0.19989731302413144, + "grad_norm": 32.19368362426758, + "learning_rate": 6.634341129492299e-06, + "loss": 4.2537, + "step": 1168 + }, + { + "epoch": 0.20006845798391237, + "grad_norm": 48.267578125, + "learning_rate": 6.640045636052482e-06, + "loss": 9.335, + "step": 1169 + }, + { + "epoch": 0.2002396029436933, + "grad_norm": 19.327762603759766, + "learning_rate": 6.645750142612665e-06, + "loss": 1.8859, + "step": 1170 + }, + { + "epoch": 0.20041074790347424, + "grad_norm": 28.81614875793457, + "learning_rate": 6.651454649172847e-06, + "loss": 3.8125, + "step": 1171 + }, + { + "epoch": 0.20058189286325517, + "grad_norm": 24.971960067749023, + "learning_rate": 6.657159155733029e-06, + "loss": 3.0816, + "step": 1172 + }, + { + "epoch": 0.2007530378230361, + "grad_norm": 154.4432373046875, + "learning_rate": 6.662863662293211e-06, + "loss": 8.568, + "step": 1173 + }, + { + "epoch": 0.20092418278281704, + "grad_norm": 47.04978942871094, + "learning_rate": 6.668568168853395e-06, + "loss": 5.1816, + "step": 1174 + }, + { + "epoch": 0.20109532774259797, + "grad_norm": 24.374345779418945, + "learning_rate": 6.674272675413577e-06, + "loss": 2.6078, + "step": 1175 + }, + { + "epoch": 0.2012664727023789, + "grad_norm": 36.597232818603516, + "learning_rate": 6.67997718197376e-06, + "loss": 5.5402, + "step": 1176 + }, + { + "epoch": 0.20143761766215984, + "grad_norm": 36.612060546875, + "learning_rate": 6.685681688533942e-06, + "loss": 5.17, + "step": 1177 + }, + { + "epoch": 0.20160876262194077, + "grad_norm": 39.452117919921875, + "learning_rate": 6.691386195094124e-06, + "loss": 
6.2861, + "step": 1178 + }, + { + "epoch": 0.2017799075817217, + "grad_norm": 35.985816955566406, + "learning_rate": 6.6970907016543065e-06, + "loss": 5.7763, + "step": 1179 + }, + { + "epoch": 0.20195105254150264, + "grad_norm": 11.960805892944336, + "learning_rate": 6.70279520821449e-06, + "loss": 2.7312, + "step": 1180 + }, + { + "epoch": 0.20212219750128357, + "grad_norm": 154.7554168701172, + "learning_rate": 6.7084997147746724e-06, + "loss": 9.5806, + "step": 1181 + }, + { + "epoch": 0.20229334246106453, + "grad_norm": 31.713943481445312, + "learning_rate": 6.714204221334855e-06, + "loss": 4.9006, + "step": 1182 + }, + { + "epoch": 0.20246448742084547, + "grad_norm": 11.431591987609863, + "learning_rate": 6.7199087278950375e-06, + "loss": 3.1028, + "step": 1183 + }, + { + "epoch": 0.2026356323806264, + "grad_norm": 208.2880859375, + "learning_rate": 6.725613234455219e-06, + "loss": 8.5447, + "step": 1184 + }, + { + "epoch": 0.20280677734040733, + "grad_norm": 32.78763198852539, + "learning_rate": 6.731317741015402e-06, + "loss": 5.0437, + "step": 1185 + }, + { + "epoch": 0.20297792230018827, + "grad_norm": 31.15655517578125, + "learning_rate": 6.737022247575585e-06, + "loss": 4.1921, + "step": 1186 + }, + { + "epoch": 0.2031490672599692, + "grad_norm": 12.072607040405273, + "learning_rate": 6.742726754135768e-06, + "loss": 1.9291, + "step": 1187 + }, + { + "epoch": 0.20332021221975013, + "grad_norm": 46.76679992675781, + "learning_rate": 6.74843126069595e-06, + "loss": 9.0577, + "step": 1188 + }, + { + "epoch": 0.20349135717953107, + "grad_norm": 28.912738800048828, + "learning_rate": 6.754135767256133e-06, + "loss": 4.3274, + "step": 1189 + }, + { + "epoch": 0.203662502139312, + "grad_norm": 151.7112579345703, + "learning_rate": 6.759840273816315e-06, + "loss": 8.1049, + "step": 1190 + }, + { + "epoch": 0.20383364709909293, + "grad_norm": 19.557729721069336, + "learning_rate": 6.765544780376497e-06, + "loss": 1.6717, + "step": 1191 + }, + { + "epoch": 0.20400479205887387, + "grad_norm": 37.28075408935547, + "learning_rate": 6.77124928693668e-06, + "loss": 5.6393, + "step": 1192 + }, + { + "epoch": 0.2041759370186548, + "grad_norm": 33.639183044433594, + "learning_rate": 6.776953793496863e-06, + "loss": 4.9937, + "step": 1193 + }, + { + "epoch": 0.20434708197843574, + "grad_norm": 16.514705657958984, + "learning_rate": 6.782658300057045e-06, + "loss": 2.2396, + "step": 1194 + }, + { + "epoch": 0.20451822693821667, + "grad_norm": 29.29157066345215, + "learning_rate": 6.788362806617228e-06, + "loss": 4.5062, + "step": 1195 + }, + { + "epoch": 0.2046893718979976, + "grad_norm": 24.25420570373535, + "learning_rate": 6.79406731317741e-06, + "loss": 2.5282, + "step": 1196 + }, + { + "epoch": 0.20486051685777854, + "grad_norm": 21.87625503540039, + "learning_rate": 6.799771819737593e-06, + "loss": 2.2101, + "step": 1197 + }, + { + "epoch": 0.20503166181755947, + "grad_norm": 29.727163314819336, + "learning_rate": 6.8054763262977755e-06, + "loss": 3.5679, + "step": 1198 + }, + { + "epoch": 0.2052028067773404, + "grad_norm": 23.502267837524414, + "learning_rate": 6.811180832857958e-06, + "loss": 3.9821, + "step": 1199 + }, + { + "epoch": 0.20537395173712134, + "grad_norm": 31.961931228637695, + "learning_rate": 6.8168853394181405e-06, + "loss": 4.6, + "step": 1200 + }, + { + "epoch": 0.20554509669690227, + "grad_norm": 27.584300994873047, + "learning_rate": 6.822589845978323e-06, + "loss": 3.389, + "step": 1201 + }, + { + "epoch": 0.2057162416566832, + "grad_norm": 34.41096115112305, + 
"learning_rate": 6.828294352538506e-06, + "loss": 4.722, + "step": 1202 + }, + { + "epoch": 0.20588738661646414, + "grad_norm": 41.341312408447266, + "learning_rate": 6.833998859098688e-06, + "loss": 6.7225, + "step": 1203 + }, + { + "epoch": 0.20605853157624507, + "grad_norm": 160.5906982421875, + "learning_rate": 6.839703365658871e-06, + "loss": 9.8412, + "step": 1204 + }, + { + "epoch": 0.206229676536026, + "grad_norm": 23.49472999572754, + "learning_rate": 6.845407872219053e-06, + "loss": 3.6378, + "step": 1205 + }, + { + "epoch": 0.20640082149580694, + "grad_norm": 31.307947158813477, + "learning_rate": 6.851112378779236e-06, + "loss": 3.6813, + "step": 1206 + }, + { + "epoch": 0.2065719664555879, + "grad_norm": 27.893850326538086, + "learning_rate": 6.856816885339418e-06, + "loss": 4.5216, + "step": 1207 + }, + { + "epoch": 0.20674311141536883, + "grad_norm": 32.200157165527344, + "learning_rate": 6.862521391899601e-06, + "loss": 4.5525, + "step": 1208 + }, + { + "epoch": 0.20691425637514976, + "grad_norm": 31.765216827392578, + "learning_rate": 6.868225898459783e-06, + "loss": 5.2865, + "step": 1209 + }, + { + "epoch": 0.2070854013349307, + "grad_norm": 35.562294006347656, + "learning_rate": 6.873930405019966e-06, + "loss": 5.0758, + "step": 1210 + }, + { + "epoch": 0.20725654629471163, + "grad_norm": 44.582786560058594, + "learning_rate": 6.879634911580148e-06, + "loss": 8.7973, + "step": 1211 + }, + { + "epoch": 0.20742769125449256, + "grad_norm": 29.667964935302734, + "learning_rate": 6.885339418140331e-06, + "loss": 3.7483, + "step": 1212 + }, + { + "epoch": 0.2075988362142735, + "grad_norm": 33.826454162597656, + "learning_rate": 6.8910439247005135e-06, + "loss": 5.321, + "step": 1213 + }, + { + "epoch": 0.20776998117405443, + "grad_norm": 36.56757354736328, + "learning_rate": 6.896748431260697e-06, + "loss": 4.6366, + "step": 1214 + }, + { + "epoch": 0.20794112613383536, + "grad_norm": 21.483030319213867, + "learning_rate": 6.9024529378208785e-06, + "loss": 1.7844, + "step": 1215 + }, + { + "epoch": 0.2081122710936163, + "grad_norm": 22.398630142211914, + "learning_rate": 6.908157444381061e-06, + "loss": 2.9002, + "step": 1216 + }, + { + "epoch": 0.20828341605339723, + "grad_norm": 16.41680145263672, + "learning_rate": 6.913861950941244e-06, + "loss": 1.5466, + "step": 1217 + }, + { + "epoch": 0.20845456101317816, + "grad_norm": 22.448949813842773, + "learning_rate": 6.919566457501426e-06, + "loss": 3.4011, + "step": 1218 + }, + { + "epoch": 0.2086257059729591, + "grad_norm": 35.074989318847656, + "learning_rate": 6.925270964061609e-06, + "loss": 4.4769, + "step": 1219 + }, + { + "epoch": 0.20879685093274003, + "grad_norm": 29.737442016601562, + "learning_rate": 6.930975470621792e-06, + "loss": 4.6152, + "step": 1220 + }, + { + "epoch": 0.20896799589252096, + "grad_norm": 29.097299575805664, + "learning_rate": 6.9366799771819746e-06, + "loss": 3.8591, + "step": 1221 + }, + { + "epoch": 0.2091391408523019, + "grad_norm": 22.356008529663086, + "learning_rate": 6.942384483742156e-06, + "loss": 3.6379, + "step": 1222 + }, + { + "epoch": 0.20931028581208283, + "grad_norm": 29.412656784057617, + "learning_rate": 6.948088990302339e-06, + "loss": 3.5976, + "step": 1223 + }, + { + "epoch": 0.20948143077186376, + "grad_norm": 19.5412654876709, + "learning_rate": 6.953793496862521e-06, + "loss": 2.0718, + "step": 1224 + }, + { + "epoch": 0.2096525757316447, + "grad_norm": 17.43561363220215, + "learning_rate": 6.959498003422704e-06, + "loss": 1.5389, + "step": 1225 + }, + { + "epoch": 
0.20982372069142563, + "grad_norm": 34.85890579223633, + "learning_rate": 6.965202509982887e-06, + "loss": 4.4105, + "step": 1226 + }, + { + "epoch": 0.20999486565120656, + "grad_norm": 33.83147430419922, + "learning_rate": 6.97090701654307e-06, + "loss": 4.108, + "step": 1227 + }, + { + "epoch": 0.2101660106109875, + "grad_norm": 33.77149963378906, + "learning_rate": 6.9766115231032514e-06, + "loss": 4.4198, + "step": 1228 + }, + { + "epoch": 0.21033715557076843, + "grad_norm": 12.30455207824707, + "learning_rate": 6.982316029663434e-06, + "loss": 1.7759, + "step": 1229 + }, + { + "epoch": 0.21050830053054936, + "grad_norm": 34.55380630493164, + "learning_rate": 6.9880205362236165e-06, + "loss": 4.4813, + "step": 1230 + }, + { + "epoch": 0.2106794454903303, + "grad_norm": 23.975025177001953, + "learning_rate": 6.993725042783799e-06, + "loss": 3.728, + "step": 1231 + }, + { + "epoch": 0.21085059045011123, + "grad_norm": 190.6012725830078, + "learning_rate": 6.999429549343982e-06, + "loss": 10.1602, + "step": 1232 + }, + { + "epoch": 0.2110217354098922, + "grad_norm": 34.527076721191406, + "learning_rate": 7.005134055904165e-06, + "loss": 4.7483, + "step": 1233 + }, + { + "epoch": 0.21119288036967312, + "grad_norm": 35.65943908691406, + "learning_rate": 7.010838562464347e-06, + "loss": 5.5499, + "step": 1234 + }, + { + "epoch": 0.21136402532945406, + "grad_norm": 34.03565216064453, + "learning_rate": 7.016543069024529e-06, + "loss": 4.7829, + "step": 1235 + }, + { + "epoch": 0.211535170289235, + "grad_norm": 20.10201072692871, + "learning_rate": 7.022247575584712e-06, + "loss": 2.9853, + "step": 1236 + }, + { + "epoch": 0.21170631524901593, + "grad_norm": 72.77118682861328, + "learning_rate": 7.027952082144895e-06, + "loss": 6.8184, + "step": 1237 + }, + { + "epoch": 0.21187746020879686, + "grad_norm": 32.084381103515625, + "learning_rate": 7.033656588705078e-06, + "loss": 5.0572, + "step": 1238 + }, + { + "epoch": 0.2120486051685778, + "grad_norm": 28.180423736572266, + "learning_rate": 7.03936109526526e-06, + "loss": 3.8185, + "step": 1239 + }, + { + "epoch": 0.21221975012835873, + "grad_norm": 20.687843322753906, + "learning_rate": 7.045065601825443e-06, + "loss": 2.1643, + "step": 1240 + }, + { + "epoch": 0.21239089508813966, + "grad_norm": 15.380537033081055, + "learning_rate": 7.050770108385624e-06, + "loss": 1.6453, + "step": 1241 + }, + { + "epoch": 0.2125620400479206, + "grad_norm": 38.16814422607422, + "learning_rate": 7.056474614945807e-06, + "loss": 5.8775, + "step": 1242 + }, + { + "epoch": 0.21273318500770153, + "grad_norm": 43.55405807495117, + "learning_rate": 7.06217912150599e-06, + "loss": 5.1528, + "step": 1243 + }, + { + "epoch": 0.21290432996748246, + "grad_norm": 30.40400505065918, + "learning_rate": 7.067883628066173e-06, + "loss": 4.155, + "step": 1244 + }, + { + "epoch": 0.2130754749272634, + "grad_norm": 39.55487823486328, + "learning_rate": 7.073588134626355e-06, + "loss": 6.8649, + "step": 1245 + }, + { + "epoch": 0.21324661988704433, + "grad_norm": 46.886600494384766, + "learning_rate": 7.079292641186538e-06, + "loss": 4.8251, + "step": 1246 + }, + { + "epoch": 0.21341776484682526, + "grad_norm": 35.842594146728516, + "learning_rate": 7.0849971477467195e-06, + "loss": 5.3382, + "step": 1247 + }, + { + "epoch": 0.2135889098066062, + "grad_norm": 10.459444999694824, + "learning_rate": 7.090701654306902e-06, + "loss": 1.1781, + "step": 1248 + }, + { + "epoch": 0.21376005476638713, + "grad_norm": 31.134531021118164, + "learning_rate": 7.0964061608670854e-06, + 
"loss": 3.3419, + "step": 1249 + }, + { + "epoch": 0.21393119972616806, + "grad_norm": 32.50645065307617, + "learning_rate": 7.102110667427268e-06, + "loss": 4.1592, + "step": 1250 + }, + { + "epoch": 0.214102344685949, + "grad_norm": 38.065643310546875, + "learning_rate": 7.1078151739874505e-06, + "loss": 6.1903, + "step": 1251 + }, + { + "epoch": 0.21427348964572993, + "grad_norm": 32.13066482543945, + "learning_rate": 7.113519680547633e-06, + "loss": 3.8917, + "step": 1252 + }, + { + "epoch": 0.21444463460551086, + "grad_norm": 22.333932876586914, + "learning_rate": 7.119224187107815e-06, + "loss": 3.308, + "step": 1253 + }, + { + "epoch": 0.2146157795652918, + "grad_norm": 8.437789916992188, + "learning_rate": 7.124928693667997e-06, + "loss": 2.2375, + "step": 1254 + }, + { + "epoch": 0.21478692452507273, + "grad_norm": 32.72603225708008, + "learning_rate": 7.130633200228181e-06, + "loss": 4.8237, + "step": 1255 + }, + { + "epoch": 0.21495806948485366, + "grad_norm": 34.640647888183594, + "learning_rate": 7.136337706788363e-06, + "loss": 5.2757, + "step": 1256 + }, + { + "epoch": 0.2151292144446346, + "grad_norm": 20.100618362426758, + "learning_rate": 7.142042213348546e-06, + "loss": 2.961, + "step": 1257 + }, + { + "epoch": 0.21530035940441555, + "grad_norm": 43.29427719116211, + "learning_rate": 7.147746719908728e-06, + "loss": 8.933, + "step": 1258 + }, + { + "epoch": 0.2154715043641965, + "grad_norm": 33.56546401977539, + "learning_rate": 7.15345122646891e-06, + "loss": 4.6558, + "step": 1259 + }, + { + "epoch": 0.21564264932397742, + "grad_norm": 33.7791633605957, + "learning_rate": 7.159155733029093e-06, + "loss": 4.183, + "step": 1260 + }, + { + "epoch": 0.21581379428375835, + "grad_norm": 33.235233306884766, + "learning_rate": 7.164860239589276e-06, + "loss": 3.7487, + "step": 1261 + }, + { + "epoch": 0.2159849392435393, + "grad_norm": 140.30621337890625, + "learning_rate": 7.170564746149458e-06, + "loss": 9.0381, + "step": 1262 + }, + { + "epoch": 0.21615608420332022, + "grad_norm": 20.70719337463379, + "learning_rate": 7.176269252709641e-06, + "loss": 1.7769, + "step": 1263 + }, + { + "epoch": 0.21632722916310115, + "grad_norm": 36.93478012084961, + "learning_rate": 7.181973759269823e-06, + "loss": 4.5665, + "step": 1264 + }, + { + "epoch": 0.2164983741228821, + "grad_norm": 81.26618957519531, + "learning_rate": 7.187678265830006e-06, + "loss": 7.0141, + "step": 1265 + }, + { + "epoch": 0.21666951908266302, + "grad_norm": 33.15439224243164, + "learning_rate": 7.1933827723901885e-06, + "loss": 4.5814, + "step": 1266 + }, + { + "epoch": 0.21684066404244395, + "grad_norm": 26.268171310424805, + "learning_rate": 7.199087278950371e-06, + "loss": 3.0891, + "step": 1267 + }, + { + "epoch": 0.2170118090022249, + "grad_norm": 35.35780715942383, + "learning_rate": 7.2047917855105535e-06, + "loss": 4.8355, + "step": 1268 + }, + { + "epoch": 0.21718295396200582, + "grad_norm": 21.87150764465332, + "learning_rate": 7.210496292070736e-06, + "loss": 1.7614, + "step": 1269 + }, + { + "epoch": 0.21735409892178675, + "grad_norm": 36.49989318847656, + "learning_rate": 7.216200798630919e-06, + "loss": 5.8824, + "step": 1270 + }, + { + "epoch": 0.2175252438815677, + "grad_norm": 11.613662719726562, + "learning_rate": 7.221905305191101e-06, + "loss": 1.7057, + "step": 1271 + }, + { + "epoch": 0.21769638884134862, + "grad_norm": 28.447458267211914, + "learning_rate": 7.227609811751284e-06, + "loss": 4.3815, + "step": 1272 + }, + { + "epoch": 0.21786753380112955, + "grad_norm": 34.95615005493164, + 
"learning_rate": 7.233314318311466e-06, + "loss": 4.7223, + "step": 1273 + }, + { + "epoch": 0.2180386787609105, + "grad_norm": 36.12034606933594, + "learning_rate": 7.239018824871649e-06, + "loss": 5.4639, + "step": 1274 + }, + { + "epoch": 0.21820982372069142, + "grad_norm": 29.200042724609375, + "learning_rate": 7.244723331431831e-06, + "loss": 3.9203, + "step": 1275 + }, + { + "epoch": 0.21838096868047235, + "grad_norm": 173.54055786132812, + "learning_rate": 7.250427837992014e-06, + "loss": 9.2819, + "step": 1276 + }, + { + "epoch": 0.2185521136402533, + "grad_norm": 30.67865562438965, + "learning_rate": 7.256132344552197e-06, + "loss": 4.7412, + "step": 1277 + }, + { + "epoch": 0.21872325860003422, + "grad_norm": 35.703468322753906, + "learning_rate": 7.261836851112379e-06, + "loss": 5.3418, + "step": 1278 + }, + { + "epoch": 0.21889440355981515, + "grad_norm": 35.29546356201172, + "learning_rate": 7.267541357672561e-06, + "loss": 5.1735, + "step": 1279 + }, + { + "epoch": 0.2190655485195961, + "grad_norm": 20.382551193237305, + "learning_rate": 7.273245864232744e-06, + "loss": 1.8851, + "step": 1280 + }, + { + "epoch": 0.21923669347937702, + "grad_norm": 20.68045997619629, + "learning_rate": 7.2789503707929265e-06, + "loss": 2.681, + "step": 1281 + }, + { + "epoch": 0.21940783843915795, + "grad_norm": 37.52497482299805, + "learning_rate": 7.284654877353109e-06, + "loss": 5.9113, + "step": 1282 + }, + { + "epoch": 0.21957898339893892, + "grad_norm": 154.6285858154297, + "learning_rate": 7.290359383913292e-06, + "loss": 8.0077, + "step": 1283 + }, + { + "epoch": 0.21975012835871985, + "grad_norm": 28.380836486816406, + "learning_rate": 7.296063890473474e-06, + "loss": 3.5758, + "step": 1284 + }, + { + "epoch": 0.21992127331850078, + "grad_norm": 13.987469673156738, + "learning_rate": 7.301768397033657e-06, + "loss": 1.4051, + "step": 1285 + }, + { + "epoch": 0.22009241827828172, + "grad_norm": 21.18030548095703, + "learning_rate": 7.307472903593839e-06, + "loss": 3.1844, + "step": 1286 + }, + { + "epoch": 0.22026356323806265, + "grad_norm": 13.61611270904541, + "learning_rate": 7.313177410154022e-06, + "loss": 1.4008, + "step": 1287 + }, + { + "epoch": 0.22043470819784358, + "grad_norm": 32.63056182861328, + "learning_rate": 7.318881916714204e-06, + "loss": 5.485, + "step": 1288 + }, + { + "epoch": 0.22060585315762452, + "grad_norm": 12.39704704284668, + "learning_rate": 7.3245864232743876e-06, + "loss": 2.8362, + "step": 1289 + }, + { + "epoch": 0.22077699811740545, + "grad_norm": 160.39300537109375, + "learning_rate": 7.33029092983457e-06, + "loss": 9.3207, + "step": 1290 + }, + { + "epoch": 0.22094814307718638, + "grad_norm": 35.63487243652344, + "learning_rate": 7.335995436394752e-06, + "loss": 4.3364, + "step": 1291 + }, + { + "epoch": 0.22111928803696732, + "grad_norm": 18.865745544433594, + "learning_rate": 7.341699942954934e-06, + "loss": 1.9152, + "step": 1292 + }, + { + "epoch": 0.22129043299674825, + "grad_norm": 34.95203399658203, + "learning_rate": 7.347404449515117e-06, + "loss": 4.2394, + "step": 1293 + }, + { + "epoch": 0.22146157795652918, + "grad_norm": 32.99889373779297, + "learning_rate": 7.353108956075299e-06, + "loss": 5.7603, + "step": 1294 + }, + { + "epoch": 0.22163272291631012, + "grad_norm": 31.541820526123047, + "learning_rate": 7.358813462635483e-06, + "loss": 4.7464, + "step": 1295 + }, + { + "epoch": 0.22180386787609105, + "grad_norm": 22.86473274230957, + "learning_rate": 7.364517969195665e-06, + "loss": 3.2885, + "step": 1296 + }, + { + "epoch": 
0.22197501283587198, + "grad_norm": 34.75326919555664, + "learning_rate": 7.370222475755847e-06, + "loss": 4.4337, + "step": 1297 + }, + { + "epoch": 0.22214615779565292, + "grad_norm": 33.42300796508789, + "learning_rate": 7.3759269823160295e-06, + "loss": 4.8641, + "step": 1298 + }, + { + "epoch": 0.22231730275543385, + "grad_norm": 40.14048385620117, + "learning_rate": 7.381631488876212e-06, + "loss": 5.3092, + "step": 1299 + }, + { + "epoch": 0.22248844771521478, + "grad_norm": 33.59206008911133, + "learning_rate": 7.387335995436395e-06, + "loss": 4.6114, + "step": 1300 + }, + { + "epoch": 0.22265959267499572, + "grad_norm": 32.96902084350586, + "learning_rate": 7.393040501996578e-06, + "loss": 4.9559, + "step": 1301 + }, + { + "epoch": 0.22283073763477665, + "grad_norm": 76.84076690673828, + "learning_rate": 7.3987450085567605e-06, + "loss": 7.2409, + "step": 1302 + }, + { + "epoch": 0.22300188259455758, + "grad_norm": 29.227497100830078, + "learning_rate": 7.404449515116942e-06, + "loss": 3.4494, + "step": 1303 + }, + { + "epoch": 0.22317302755433852, + "grad_norm": 34.10039520263672, + "learning_rate": 7.410154021677125e-06, + "loss": 4.6513, + "step": 1304 + }, + { + "epoch": 0.22334417251411945, + "grad_norm": 43.62645721435547, + "learning_rate": 7.415858528237307e-06, + "loss": 6.1141, + "step": 1305 + }, + { + "epoch": 0.22351531747390038, + "grad_norm": 29.59916877746582, + "learning_rate": 7.421563034797491e-06, + "loss": 4.5189, + "step": 1306 + }, + { + "epoch": 0.22368646243368132, + "grad_norm": 32.00434494018555, + "learning_rate": 7.427267541357673e-06, + "loss": 3.7625, + "step": 1307 + }, + { + "epoch": 0.22385760739346225, + "grad_norm": 12.214600563049316, + "learning_rate": 7.432972047917856e-06, + "loss": 1.6093, + "step": 1308 + }, + { + "epoch": 0.2240287523532432, + "grad_norm": 13.289321899414062, + "learning_rate": 7.438676554478037e-06, + "loss": 1.8433, + "step": 1309 + }, + { + "epoch": 0.22419989731302414, + "grad_norm": 12.391509056091309, + "learning_rate": 7.44438106103822e-06, + "loss": 1.8211, + "step": 1310 + }, + { + "epoch": 0.22437104227280508, + "grad_norm": 31.827852249145508, + "learning_rate": 7.450085567598402e-06, + "loss": 3.5607, + "step": 1311 + }, + { + "epoch": 0.224542187232586, + "grad_norm": 172.93185424804688, + "learning_rate": 7.455790074158586e-06, + "loss": 9.5445, + "step": 1312 + }, + { + "epoch": 0.22471333219236694, + "grad_norm": 18.688396453857422, + "learning_rate": 7.461494580718768e-06, + "loss": 1.9759, + "step": 1313 + }, + { + "epoch": 0.22488447715214788, + "grad_norm": 26.364185333251953, + "learning_rate": 7.467199087278951e-06, + "loss": 3.2682, + "step": 1314 + }, + { + "epoch": 0.2250556221119288, + "grad_norm": 8.573413848876953, + "learning_rate": 7.472903593839133e-06, + "loss": 1.3051, + "step": 1315 + }, + { + "epoch": 0.22522676707170974, + "grad_norm": 24.913686752319336, + "learning_rate": 7.478608100399315e-06, + "loss": 2.4598, + "step": 1316 + }, + { + "epoch": 0.22539791203149068, + "grad_norm": 30.283504486083984, + "learning_rate": 7.4843126069594984e-06, + "loss": 4.1503, + "step": 1317 + }, + { + "epoch": 0.2255690569912716, + "grad_norm": 18.146724700927734, + "learning_rate": 7.490017113519681e-06, + "loss": 1.8957, + "step": 1318 + }, + { + "epoch": 0.22574020195105254, + "grad_norm": 11.016623497009277, + "learning_rate": 7.4957216200798635e-06, + "loss": 2.636, + "step": 1319 + }, + { + "epoch": 0.22591134691083348, + "grad_norm": 35.766883850097656, + "learning_rate": 
7.501426126640046e-06, + "loss": 4.3588, + "step": 1320 + }, + { + "epoch": 0.2260824918706144, + "grad_norm": 24.76753807067871, + "learning_rate": 7.5071306332002286e-06, + "loss": 3.2106, + "step": 1321 + }, + { + "epoch": 0.22625363683039534, + "grad_norm": 35.969505310058594, + "learning_rate": 7.51283513976041e-06, + "loss": 4.5488, + "step": 1322 + }, + { + "epoch": 0.22642478179017628, + "grad_norm": 13.215656280517578, + "learning_rate": 7.518539646320593e-06, + "loss": 1.7273, + "step": 1323 + }, + { + "epoch": 0.2265959267499572, + "grad_norm": 32.75537872314453, + "learning_rate": 7.524244152880775e-06, + "loss": 4.442, + "step": 1324 + }, + { + "epoch": 0.22676707170973814, + "grad_norm": 13.069498062133789, + "learning_rate": 7.529948659440958e-06, + "loss": 1.2864, + "step": 1325 + }, + { + "epoch": 0.22693821666951908, + "grad_norm": 29.5541934967041, + "learning_rate": 7.535653166001142e-06, + "loss": 3.5993, + "step": 1326 + }, + { + "epoch": 0.2271093616293, + "grad_norm": 36.506736755371094, + "learning_rate": 7.541357672561325e-06, + "loss": 4.7108, + "step": 1327 + }, + { + "epoch": 0.22728050658908094, + "grad_norm": 30.510953903198242, + "learning_rate": 7.547062179121506e-06, + "loss": 4.168, + "step": 1328 + }, + { + "epoch": 0.22745165154886188, + "grad_norm": 11.754740715026855, + "learning_rate": 7.552766685681689e-06, + "loss": 2.7865, + "step": 1329 + }, + { + "epoch": 0.2276227965086428, + "grad_norm": 31.793643951416016, + "learning_rate": 7.558471192241871e-06, + "loss": 3.4931, + "step": 1330 + }, + { + "epoch": 0.22779394146842374, + "grad_norm": 23.95293426513672, + "learning_rate": 7.564175698802054e-06, + "loss": 3.0252, + "step": 1331 + }, + { + "epoch": 0.22796508642820468, + "grad_norm": 28.809511184692383, + "learning_rate": 7.569880205362236e-06, + "loss": 4.2144, + "step": 1332 + }, + { + "epoch": 0.2281362313879856, + "grad_norm": 34.645267486572266, + "learning_rate": 7.575584711922419e-06, + "loss": 4.5155, + "step": 1333 + }, + { + "epoch": 0.22830737634776657, + "grad_norm": 31.90658950805664, + "learning_rate": 7.581289218482601e-06, + "loss": 3.6445, + "step": 1334 + }, + { + "epoch": 0.2284785213075475, + "grad_norm": 26.37479591369629, + "learning_rate": 7.586993725042783e-06, + "loss": 2.6728, + "step": 1335 + }, + { + "epoch": 0.22864966626732844, + "grad_norm": 29.64954376220703, + "learning_rate": 7.592698231602966e-06, + "loss": 4.0421, + "step": 1336 + }, + { + "epoch": 0.22882081122710937, + "grad_norm": 28.596891403198242, + "learning_rate": 7.59840273816315e-06, + "loss": 3.3059, + "step": 1337 + }, + { + "epoch": 0.2289919561868903, + "grad_norm": 36.07052993774414, + "learning_rate": 7.6041072447233325e-06, + "loss": 4.2618, + "step": 1338 + }, + { + "epoch": 0.22916310114667124, + "grad_norm": 50.589454650878906, + "learning_rate": 7.609811751283515e-06, + "loss": 9.3326, + "step": 1339 + }, + { + "epoch": 0.22933424610645217, + "grad_norm": 31.4276180267334, + "learning_rate": 7.6155162578436975e-06, + "loss": 4.6035, + "step": 1340 + }, + { + "epoch": 0.2295053910662331, + "grad_norm": 32.5452766418457, + "learning_rate": 7.621220764403879e-06, + "loss": 3.9264, + "step": 1341 + }, + { + "epoch": 0.22967653602601404, + "grad_norm": 32.74778747558594, + "learning_rate": 7.626925270964062e-06, + "loss": 4.6618, + "step": 1342 + }, + { + "epoch": 0.22984768098579497, + "grad_norm": 11.447990417480469, + "learning_rate": 7.632629777524244e-06, + "loss": 1.2467, + "step": 1343 + }, + { + "epoch": 0.2300188259455759, + 
"grad_norm": 19.261301040649414, + "learning_rate": 7.638334284084426e-06, + "loss": 1.4108, + "step": 1344 + }, + { + "epoch": 0.23018997090535684, + "grad_norm": 17.838138580322266, + "learning_rate": 7.64403879064461e-06, + "loss": 1.4796, + "step": 1345 + }, + { + "epoch": 0.23036111586513777, + "grad_norm": 36.09761047363281, + "learning_rate": 7.649743297204791e-06, + "loss": 4.8769, + "step": 1346 + }, + { + "epoch": 0.2305322608249187, + "grad_norm": 17.18463706970215, + "learning_rate": 7.655447803764974e-06, + "loss": 1.6009, + "step": 1347 + }, + { + "epoch": 0.23070340578469964, + "grad_norm": 20.603784561157227, + "learning_rate": 7.661152310325156e-06, + "loss": 3.0856, + "step": 1348 + }, + { + "epoch": 0.23087455074448057, + "grad_norm": 41.716590881347656, + "learning_rate": 7.666856816885341e-06, + "loss": 5.4047, + "step": 1349 + }, + { + "epoch": 0.2310456957042615, + "grad_norm": 181.26748657226562, + "learning_rate": 7.672561323445523e-06, + "loss": 8.5903, + "step": 1350 + }, + { + "epoch": 0.23121684066404244, + "grad_norm": 41.98673629760742, + "learning_rate": 7.678265830005705e-06, + "loss": 5.2161, + "step": 1351 + }, + { + "epoch": 0.23138798562382337, + "grad_norm": 35.29446792602539, + "learning_rate": 7.683970336565888e-06, + "loss": 4.2135, + "step": 1352 + }, + { + "epoch": 0.2315591305836043, + "grad_norm": 164.35977172851562, + "learning_rate": 7.68967484312607e-06, + "loss": 7.3687, + "step": 1353 + }, + { + "epoch": 0.23173027554338524, + "grad_norm": 20.39377784729004, + "learning_rate": 7.695379349686253e-06, + "loss": 1.6669, + "step": 1354 + }, + { + "epoch": 0.23190142050316617, + "grad_norm": 33.71407699584961, + "learning_rate": 7.701083856246435e-06, + "loss": 4.5662, + "step": 1355 + }, + { + "epoch": 0.2320725654629471, + "grad_norm": 9.964597702026367, + "learning_rate": 7.706788362806616e-06, + "loss": 2.2199, + "step": 1356 + }, + { + "epoch": 0.23224371042272804, + "grad_norm": 41.83567810058594, + "learning_rate": 7.7124928693668e-06, + "loss": 5.3555, + "step": 1357 + }, + { + "epoch": 0.23241485538250897, + "grad_norm": 19.700429916381836, + "learning_rate": 7.718197375926981e-06, + "loss": 1.6864, + "step": 1358 + }, + { + "epoch": 0.2325860003422899, + "grad_norm": 32.94630432128906, + "learning_rate": 7.723901882487165e-06, + "loss": 3.5872, + "step": 1359 + }, + { + "epoch": 0.23275714530207087, + "grad_norm": 26.41133689880371, + "learning_rate": 7.729606389047348e-06, + "loss": 3.5806, + "step": 1360 + }, + { + "epoch": 0.2329282902618518, + "grad_norm": 17.184593200683594, + "learning_rate": 7.735310895607532e-06, + "loss": 1.6527, + "step": 1361 + }, + { + "epoch": 0.23309943522163273, + "grad_norm": 11.024751663208008, + "learning_rate": 7.741015402167713e-06, + "loss": 1.2203, + "step": 1362 + }, + { + "epoch": 0.23327058018141367, + "grad_norm": 35.2708625793457, + "learning_rate": 7.746719908727895e-06, + "loss": 4.5728, + "step": 1363 + }, + { + "epoch": 0.2334417251411946, + "grad_norm": 35.836387634277344, + "learning_rate": 7.752424415288078e-06, + "loss": 4.9165, + "step": 1364 + }, + { + "epoch": 0.23361287010097553, + "grad_norm": 24.741012573242188, + "learning_rate": 7.75812892184826e-06, + "loss": 2.2991, + "step": 1365 + }, + { + "epoch": 0.23378401506075647, + "grad_norm": 41.604007720947266, + "learning_rate": 7.763833428408443e-06, + "loss": 4.7384, + "step": 1366 + }, + { + "epoch": 0.2339551600205374, + "grad_norm": 37.068485260009766, + "learning_rate": 7.769537934968625e-06, + "loss": 4.1609, + "step": 
1367 + }, + { + "epoch": 0.23412630498031833, + "grad_norm": 31.635995864868164, + "learning_rate": 7.775242441528808e-06, + "loss": 3.6394, + "step": 1368 + }, + { + "epoch": 0.23429744994009927, + "grad_norm": 36.181602478027344, + "learning_rate": 7.78094694808899e-06, + "loss": 3.9604, + "step": 1369 + }, + { + "epoch": 0.2344685948998802, + "grad_norm": 34.47708511352539, + "learning_rate": 7.786651454649172e-06, + "loss": 4.4621, + "step": 1370 + }, + { + "epoch": 0.23463973985966113, + "grad_norm": 36.583919525146484, + "learning_rate": 7.792355961209355e-06, + "loss": 5.4214, + "step": 1371 + }, + { + "epoch": 0.23481088481944207, + "grad_norm": 139.80113220214844, + "learning_rate": 7.798060467769539e-06, + "loss": 7.582, + "step": 1372 + }, + { + "epoch": 0.234982029779223, + "grad_norm": 10.627038955688477, + "learning_rate": 7.803764974329722e-06, + "loss": 1.1265, + "step": 1373 + }, + { + "epoch": 0.23515317473900393, + "grad_norm": 56.01224899291992, + "learning_rate": 7.809469480889904e-06, + "loss": 9.2401, + "step": 1374 + }, + { + "epoch": 0.23532431969878487, + "grad_norm": 13.42536449432373, + "learning_rate": 7.815173987450085e-06, + "loss": 1.3022, + "step": 1375 + }, + { + "epoch": 0.2354954646585658, + "grad_norm": 34.816341400146484, + "learning_rate": 7.820878494010269e-06, + "loss": 4.6249, + "step": 1376 + }, + { + "epoch": 0.23566660961834673, + "grad_norm": 13.037670135498047, + "learning_rate": 7.82658300057045e-06, + "loss": 1.5747, + "step": 1377 + }, + { + "epoch": 0.23583775457812767, + "grad_norm": 38.446537017822266, + "learning_rate": 7.832287507130634e-06, + "loss": 4.9983, + "step": 1378 + }, + { + "epoch": 0.2360088995379086, + "grad_norm": 32.81908416748047, + "learning_rate": 7.837992013690815e-06, + "loss": 3.4363, + "step": 1379 + }, + { + "epoch": 0.23618004449768953, + "grad_norm": 12.17697525024414, + "learning_rate": 7.843696520250999e-06, + "loss": 1.6211, + "step": 1380 + }, + { + "epoch": 0.23635118945747047, + "grad_norm": 35.46131896972656, + "learning_rate": 7.84940102681118e-06, + "loss": 4.8981, + "step": 1381 + }, + { + "epoch": 0.2365223344172514, + "grad_norm": 29.793787002563477, + "learning_rate": 7.855105533371362e-06, + "loss": 3.5648, + "step": 1382 + }, + { + "epoch": 0.23669347937703233, + "grad_norm": 14.550475120544434, + "learning_rate": 7.860810039931547e-06, + "loss": 1.6714, + "step": 1383 + }, + { + "epoch": 0.23686462433681327, + "grad_norm": 36.01753234863281, + "learning_rate": 7.866514546491729e-06, + "loss": 4.936, + "step": 1384 + }, + { + "epoch": 0.23703576929659423, + "grad_norm": 21.261749267578125, + "learning_rate": 7.872219053051912e-06, + "loss": 2.3239, + "step": 1385 + }, + { + "epoch": 0.23720691425637516, + "grad_norm": 160.96620178222656, + "learning_rate": 7.877923559612094e-06, + "loss": 7.9267, + "step": 1386 + }, + { + "epoch": 0.2373780592161561, + "grad_norm": 34.994293212890625, + "learning_rate": 7.883628066172276e-06, + "loss": 4.6021, + "step": 1387 + }, + { + "epoch": 0.23754920417593703, + "grad_norm": 32.08713912963867, + "learning_rate": 7.889332572732459e-06, + "loss": 4.0803, + "step": 1388 + }, + { + "epoch": 0.23772034913571796, + "grad_norm": 36.49545669555664, + "learning_rate": 7.89503707929264e-06, + "loss": 4.4858, + "step": 1389 + }, + { + "epoch": 0.2378914940954989, + "grad_norm": 146.2379608154297, + "learning_rate": 7.900741585852824e-06, + "loss": 8.1082, + "step": 1390 + }, + { + "epoch": 0.23806263905527983, + "grad_norm": 31.705169677734375, + "learning_rate": 
7.906446092413006e-06, + "loss": 4.1572, + "step": 1391 + }, + { + "epoch": 0.23823378401506076, + "grad_norm": 13.439140319824219, + "learning_rate": 7.91215059897319e-06, + "loss": 1.1091, + "step": 1392 + }, + { + "epoch": 0.2384049289748417, + "grad_norm": 51.37181854248047, + "learning_rate": 7.91785510553337e-06, + "loss": 9.8544, + "step": 1393 + }, + { + "epoch": 0.23857607393462263, + "grad_norm": 16.763200759887695, + "learning_rate": 7.923559612093553e-06, + "loss": 1.4605, + "step": 1394 + }, + { + "epoch": 0.23874721889440356, + "grad_norm": 32.19613265991211, + "learning_rate": 7.929264118653738e-06, + "loss": 4.0605, + "step": 1395 + }, + { + "epoch": 0.2389183638541845, + "grad_norm": 36.1611442565918, + "learning_rate": 7.93496862521392e-06, + "loss": 4.1027, + "step": 1396 + }, + { + "epoch": 0.23908950881396543, + "grad_norm": 36.234344482421875, + "learning_rate": 7.940673131774103e-06, + "loss": 5.0933, + "step": 1397 + }, + { + "epoch": 0.23926065377374636, + "grad_norm": 39.589111328125, + "learning_rate": 7.946377638334284e-06, + "loss": 5.4176, + "step": 1398 + }, + { + "epoch": 0.2394317987335273, + "grad_norm": 13.162062644958496, + "learning_rate": 7.952082144894468e-06, + "loss": 1.3262, + "step": 1399 + }, + { + "epoch": 0.23960294369330823, + "grad_norm": 11.512036323547363, + "learning_rate": 7.95778665145465e-06, + "loss": 2.8916, + "step": 1400 + }, + { + "epoch": 0.23977408865308916, + "grad_norm": 30.82523536682129, + "learning_rate": 7.963491158014831e-06, + "loss": 3.7983, + "step": 1401 + }, + { + "epoch": 0.2399452336128701, + "grad_norm": 9.881488800048828, + "learning_rate": 7.969195664575014e-06, + "loss": 1.6009, + "step": 1402 + }, + { + "epoch": 0.24011637857265103, + "grad_norm": 26.221534729003906, + "learning_rate": 7.974900171135196e-06, + "loss": 3.2459, + "step": 1403 + }, + { + "epoch": 0.24028752353243196, + "grad_norm": 34.7869987487793, + "learning_rate": 7.98060467769538e-06, + "loss": 4.2736, + "step": 1404 + }, + { + "epoch": 0.2404586684922129, + "grad_norm": 42.81889343261719, + "learning_rate": 7.986309184255561e-06, + "loss": 6.0254, + "step": 1405 + }, + { + "epoch": 0.24062981345199383, + "grad_norm": 35.25808334350586, + "learning_rate": 7.992013690815745e-06, + "loss": 3.8331, + "step": 1406 + }, + { + "epoch": 0.24080095841177476, + "grad_norm": 29.81654167175293, + "learning_rate": 7.997718197375928e-06, + "loss": 3.3841, + "step": 1407 + }, + { + "epoch": 0.2409721033715557, + "grad_norm": 34.251243591308594, + "learning_rate": 8.00342270393611e-06, + "loss": 4.8157, + "step": 1408 + }, + { + "epoch": 0.24114324833133663, + "grad_norm": 31.04636573791504, + "learning_rate": 8.009127210496293e-06, + "loss": 3.4431, + "step": 1409 + }, + { + "epoch": 0.2413143932911176, + "grad_norm": 33.0612678527832, + "learning_rate": 8.014831717056475e-06, + "loss": 3.8054, + "step": 1410 + }, + { + "epoch": 0.24148553825089852, + "grad_norm": 25.215789794921875, + "learning_rate": 8.020536223616658e-06, + "loss": 3.2052, + "step": 1411 + }, + { + "epoch": 0.24165668321067946, + "grad_norm": 22.657257080078125, + "learning_rate": 8.02624073017684e-06, + "loss": 2.5621, + "step": 1412 + }, + { + "epoch": 0.2418278281704604, + "grad_norm": 32.54667282104492, + "learning_rate": 8.031945236737021e-06, + "loss": 4.1257, + "step": 1413 + }, + { + "epoch": 0.24199897313024132, + "grad_norm": 14.109042167663574, + "learning_rate": 8.037649743297205e-06, + "loss": 1.2616, + "step": 1414 + }, + { + "epoch": 0.24217011809002226, + "grad_norm": 
35.718116760253906, + "learning_rate": 8.043354249857387e-06, + "loss": 5.263, + "step": 1415 + }, + { + "epoch": 0.2423412630498032, + "grad_norm": 10.830004692077637, + "learning_rate": 8.04905875641757e-06, + "loss": 1.6628, + "step": 1416 + }, + { + "epoch": 0.24251240800958412, + "grad_norm": 21.519893646240234, + "learning_rate": 8.054763262977753e-06, + "loss": 2.2681, + "step": 1417 + }, + { + "epoch": 0.24268355296936506, + "grad_norm": 16.527233123779297, + "learning_rate": 8.060467769537937e-06, + "loss": 1.7274, + "step": 1418 + }, + { + "epoch": 0.242854697929146, + "grad_norm": 17.97334098815918, + "learning_rate": 8.066172276098118e-06, + "loss": 1.4341, + "step": 1419 + }, + { + "epoch": 0.24302584288892692, + "grad_norm": 38.63325500488281, + "learning_rate": 8.0718767826583e-06, + "loss": 5.4521, + "step": 1420 + }, + { + "epoch": 0.24319698784870786, + "grad_norm": 37.572818756103516, + "learning_rate": 8.077581289218483e-06, + "loss": 4.057, + "step": 1421 + }, + { + "epoch": 0.2433681328084888, + "grad_norm": 36.495025634765625, + "learning_rate": 8.083285795778665e-06, + "loss": 5.3841, + "step": 1422 + }, + { + "epoch": 0.24353927776826972, + "grad_norm": 46.322486877441406, + "learning_rate": 8.088990302338848e-06, + "loss": 9.2447, + "step": 1423 + }, + { + "epoch": 0.24371042272805066, + "grad_norm": 32.26517868041992, + "learning_rate": 8.09469480889903e-06, + "loss": 3.4902, + "step": 1424 + }, + { + "epoch": 0.2438815676878316, + "grad_norm": 29.286020278930664, + "learning_rate": 8.100399315459212e-06, + "loss": 3.6562, + "step": 1425 + }, + { + "epoch": 0.24405271264761252, + "grad_norm": 9.768603324890137, + "learning_rate": 8.106103822019395e-06, + "loss": 1.0808, + "step": 1426 + }, + { + "epoch": 0.24422385760739346, + "grad_norm": 40.53557205200195, + "learning_rate": 8.111808328579577e-06, + "loss": 5.3038, + "step": 1427 + }, + { + "epoch": 0.2443950025671744, + "grad_norm": 36.29978561401367, + "learning_rate": 8.11751283513976e-06, + "loss": 4.9487, + "step": 1428 + }, + { + "epoch": 0.24456614752695532, + "grad_norm": 50.365440368652344, + "learning_rate": 8.123217341699944e-06, + "loss": 9.1753, + "step": 1429 + }, + { + "epoch": 0.24473729248673626, + "grad_norm": 25.204608917236328, + "learning_rate": 8.128921848260127e-06, + "loss": 3.06, + "step": 1430 + }, + { + "epoch": 0.2449084374465172, + "grad_norm": 36.821929931640625, + "learning_rate": 8.134626354820309e-06, + "loss": 4.2367, + "step": 1431 + }, + { + "epoch": 0.24507958240629812, + "grad_norm": 9.532563209533691, + "learning_rate": 8.14033086138049e-06, + "loss": 1.0511, + "step": 1432 + }, + { + "epoch": 0.24525072736607906, + "grad_norm": 31.35403060913086, + "learning_rate": 8.146035367940674e-06, + "loss": 4.1655, + "step": 1433 + }, + { + "epoch": 0.24542187232586, + "grad_norm": 29.057531356811523, + "learning_rate": 8.151739874500855e-06, + "loss": 3.6622, + "step": 1434 + }, + { + "epoch": 0.24559301728564092, + "grad_norm": 18.69387435913086, + "learning_rate": 8.157444381061039e-06, + "loss": 1.6006, + "step": 1435 + }, + { + "epoch": 0.24576416224542189, + "grad_norm": 27.337491989135742, + "learning_rate": 8.16314888762122e-06, + "loss": 2.1133, + "step": 1436 + }, + { + "epoch": 0.24593530720520282, + "grad_norm": 59.810035705566406, + "learning_rate": 8.168853394181404e-06, + "loss": 9.2893, + "step": 1437 + }, + { + "epoch": 0.24610645216498375, + "grad_norm": 34.85076141357422, + "learning_rate": 8.174557900741586e-06, + "loss": 4.76, + "step": 1438 + }, + { + "epoch": 
0.24627759712476469, + "grad_norm": 16.229951858520508, + "learning_rate": 8.180262407301767e-06, + "loss": 1.111, + "step": 1439 + }, + { + "epoch": 0.24644874208454562, + "grad_norm": 191.14859008789062, + "learning_rate": 8.185966913861952e-06, + "loss": 8.6606, + "step": 1440 + }, + { + "epoch": 0.24661988704432655, + "grad_norm": 25.192026138305664, + "learning_rate": 8.191671420422134e-06, + "loss": 2.2213, + "step": 1441 + }, + { + "epoch": 0.24679103200410749, + "grad_norm": 16.577152252197266, + "learning_rate": 8.197375926982317e-06, + "loss": 1.4564, + "step": 1442 + }, + { + "epoch": 0.24696217696388842, + "grad_norm": 37.47216796875, + "learning_rate": 8.203080433542499e-06, + "loss": 4.9652, + "step": 1443 + }, + { + "epoch": 0.24713332192366935, + "grad_norm": 33.50614547729492, + "learning_rate": 8.20878494010268e-06, + "loss": 3.8217, + "step": 1444 + }, + { + "epoch": 0.24730446688345029, + "grad_norm": 35.54981994628906, + "learning_rate": 8.214489446662864e-06, + "loss": 5.0781, + "step": 1445 + }, + { + "epoch": 0.24747561184323122, + "grad_norm": 29.486570358276367, + "learning_rate": 8.220193953223046e-06, + "loss": 3.4324, + "step": 1446 + }, + { + "epoch": 0.24764675680301215, + "grad_norm": 23.952808380126953, + "learning_rate": 8.22589845978323e-06, + "loss": 2.9791, + "step": 1447 + }, + { + "epoch": 0.24781790176279309, + "grad_norm": 22.885963439941406, + "learning_rate": 8.231602966343411e-06, + "loss": 2.1029, + "step": 1448 + }, + { + "epoch": 0.24798904672257402, + "grad_norm": 38.23826217651367, + "learning_rate": 8.237307472903594e-06, + "loss": 5.1107, + "step": 1449 + }, + { + "epoch": 0.24816019168235495, + "grad_norm": 21.183773040771484, + "learning_rate": 8.243011979463776e-06, + "loss": 2.6462, + "step": 1450 + }, + { + "epoch": 0.24833133664213589, + "grad_norm": 11.436287879943848, + "learning_rate": 8.248716486023958e-06, + "loss": 1.139, + "step": 1451 + }, + { + "epoch": 0.24850248160191682, + "grad_norm": 21.1058349609375, + "learning_rate": 8.254420992584143e-06, + "loss": 2.6237, + "step": 1452 + }, + { + "epoch": 0.24867362656169775, + "grad_norm": 29.661510467529297, + "learning_rate": 8.260125499144324e-06, + "loss": 3.9416, + "step": 1453 + }, + { + "epoch": 0.24884477152147869, + "grad_norm": 25.654918670654297, + "learning_rate": 8.265830005704508e-06, + "loss": 2.9109, + "step": 1454 + }, + { + "epoch": 0.24901591648125962, + "grad_norm": 29.254196166992188, + "learning_rate": 8.27153451226469e-06, + "loss": 3.9703, + "step": 1455 + }, + { + "epoch": 0.24918706144104055, + "grad_norm": 15.34985065460205, + "learning_rate": 8.277239018824871e-06, + "loss": 1.277, + "step": 1456 + }, + { + "epoch": 0.24935820640082149, + "grad_norm": 20.940813064575195, + "learning_rate": 8.282943525385055e-06, + "loss": 2.8225, + "step": 1457 + }, + { + "epoch": 0.24952935136060242, + "grad_norm": 156.33163452148438, + "learning_rate": 8.288648031945236e-06, + "loss": 6.8667, + "step": 1458 + }, + { + "epoch": 0.24970049632038335, + "grad_norm": 142.04833984375, + "learning_rate": 8.29435253850542e-06, + "loss": 7.7845, + "step": 1459 + }, + { + "epoch": 0.24987164128016429, + "grad_norm": 52.80269241333008, + "learning_rate": 8.300057045065601e-06, + "loss": 9.2945, + "step": 1460 + }, + { + "epoch": 0.25004278623994525, + "grad_norm": 36.25229263305664, + "learning_rate": 8.305761551625785e-06, + "loss": 4.1385, + "step": 1461 + }, + { + "epoch": 0.2502139311997262, + "grad_norm": 32.63280487060547, + "learning_rate": 8.311466058185966e-06, + 
"loss": 4.9526, + "step": 1462 + }, + { + "epoch": 0.2503850761595071, + "grad_norm": 36.09181213378906, + "learning_rate": 8.31717056474615e-06, + "loss": 4.9655, + "step": 1463 + }, + { + "epoch": 0.25055622111928805, + "grad_norm": 13.666475296020508, + "learning_rate": 8.322875071306333e-06, + "loss": 1.2171, + "step": 1464 + }, + { + "epoch": 0.250727366079069, + "grad_norm": 21.431262969970703, + "learning_rate": 8.328579577866515e-06, + "loss": 2.0253, + "step": 1465 + }, + { + "epoch": 0.2508985110388499, + "grad_norm": 34.866493225097656, + "learning_rate": 8.334284084426698e-06, + "loss": 4.7963, + "step": 1466 + }, + { + "epoch": 0.25106965599863085, + "grad_norm": 28.299697875976562, + "learning_rate": 8.33998859098688e-06, + "loss": 3.2393, + "step": 1467 + }, + { + "epoch": 0.2512408009584118, + "grad_norm": 30.702220916748047, + "learning_rate": 8.345693097547063e-06, + "loss": 4.459, + "step": 1468 + }, + { + "epoch": 0.2514119459181927, + "grad_norm": 35.572662353515625, + "learning_rate": 8.351397604107245e-06, + "loss": 4.0362, + "step": 1469 + }, + { + "epoch": 0.25158309087797365, + "grad_norm": 31.228361129760742, + "learning_rate": 8.357102110667427e-06, + "loss": 3.7291, + "step": 1470 + }, + { + "epoch": 0.2517542358377546, + "grad_norm": 158.43309020996094, + "learning_rate": 8.36280661722761e-06, + "loss": 7.5395, + "step": 1471 + }, + { + "epoch": 0.2519253807975355, + "grad_norm": 26.111873626708984, + "learning_rate": 8.368511123787792e-06, + "loss": 3.2816, + "step": 1472 + }, + { + "epoch": 0.25209652575731645, + "grad_norm": 152.1773681640625, + "learning_rate": 8.374215630347975e-06, + "loss": 9.2757, + "step": 1473 + }, + { + "epoch": 0.2522676707170974, + "grad_norm": 28.91309928894043, + "learning_rate": 8.379920136908157e-06, + "loss": 3.8, + "step": 1474 + }, + { + "epoch": 0.2524388156768783, + "grad_norm": 138.71820068359375, + "learning_rate": 8.38562464346834e-06, + "loss": 8.3701, + "step": 1475 + }, + { + "epoch": 0.25260996063665925, + "grad_norm": 10.94738483428955, + "learning_rate": 8.391329150028524e-06, + "loss": 1.0987, + "step": 1476 + }, + { + "epoch": 0.2527811055964402, + "grad_norm": 33.45675277709961, + "learning_rate": 8.397033656588705e-06, + "loss": 3.8679, + "step": 1477 + }, + { + "epoch": 0.2529522505562211, + "grad_norm": 30.219728469848633, + "learning_rate": 8.402738163148889e-06, + "loss": 3.7668, + "step": 1478 + }, + { + "epoch": 0.25312339551600205, + "grad_norm": 153.4755859375, + "learning_rate": 8.40844266970907e-06, + "loss": 8.493, + "step": 1479 + }, + { + "epoch": 0.253294540475783, + "grad_norm": 27.030277252197266, + "learning_rate": 8.414147176269254e-06, + "loss": 3.6373, + "step": 1480 + }, + { + "epoch": 0.2534656854355639, + "grad_norm": 26.931581497192383, + "learning_rate": 8.419851682829435e-06, + "loss": 2.4114, + "step": 1481 + }, + { + "epoch": 0.25363683039534485, + "grad_norm": 33.86345672607422, + "learning_rate": 8.425556189389617e-06, + "loss": 4.18, + "step": 1482 + }, + { + "epoch": 0.2538079753551258, + "grad_norm": 40.67789840698242, + "learning_rate": 8.4312606959498e-06, + "loss": 5.2501, + "step": 1483 + }, + { + "epoch": 0.2539791203149067, + "grad_norm": 11.627734184265137, + "learning_rate": 8.436965202509982e-06, + "loss": 1.2352, + "step": 1484 + }, + { + "epoch": 0.25415026527468765, + "grad_norm": 27.1390438079834, + "learning_rate": 8.442669709070165e-06, + "loss": 2.4447, + "step": 1485 + }, + { + "epoch": 0.2543214102344686, + "grad_norm": 33.907615661621094, + "learning_rate": 
8.448374215630349e-06, + "loss": 5.7028, + "step": 1486 + }, + { + "epoch": 0.2544925551942495, + "grad_norm": 34.770687103271484, + "learning_rate": 8.45407872219053e-06, + "loss": 5.4022, + "step": 1487 + }, + { + "epoch": 0.25466370015403045, + "grad_norm": 87.67970275878906, + "learning_rate": 8.459783228750714e-06, + "loss": 7.2429, + "step": 1488 + }, + { + "epoch": 0.2548348451138114, + "grad_norm": 36.1263313293457, + "learning_rate": 8.465487735310896e-06, + "loss": 4.7788, + "step": 1489 + }, + { + "epoch": 0.2550059900735923, + "grad_norm": 35.22165298461914, + "learning_rate": 8.471192241871079e-06, + "loss": 4.132, + "step": 1490 + }, + { + "epoch": 0.25517713503337325, + "grad_norm": 28.420682907104492, + "learning_rate": 8.47689674843126e-06, + "loss": 3.6288, + "step": 1491 + }, + { + "epoch": 0.2553482799931542, + "grad_norm": 36.37025451660156, + "learning_rate": 8.482601254991444e-06, + "loss": 5.1911, + "step": 1492 + }, + { + "epoch": 0.2555194249529351, + "grad_norm": 40.647789001464844, + "learning_rate": 8.488305761551626e-06, + "loss": 5.5946, + "step": 1493 + }, + { + "epoch": 0.25569056991271605, + "grad_norm": 19.504039764404297, + "learning_rate": 8.494010268111807e-06, + "loss": 1.7075, + "step": 1494 + }, + { + "epoch": 0.255861714872497, + "grad_norm": 32.866695404052734, + "learning_rate": 8.49971477467199e-06, + "loss": 4.4763, + "step": 1495 + }, + { + "epoch": 0.2560328598322779, + "grad_norm": 33.1104736328125, + "learning_rate": 8.505419281232172e-06, + "loss": 4.4053, + "step": 1496 + }, + { + "epoch": 0.25620400479205885, + "grad_norm": 22.860944747924805, + "learning_rate": 8.511123787792358e-06, + "loss": 2.5604, + "step": 1497 + }, + { + "epoch": 0.2563751497518398, + "grad_norm": 34.79046630859375, + "learning_rate": 8.51682829435254e-06, + "loss": 4.993, + "step": 1498 + }, + { + "epoch": 0.25654629471162077, + "grad_norm": 28.405912399291992, + "learning_rate": 8.522532800912723e-06, + "loss": 3.3138, + "step": 1499 + }, + { + "epoch": 0.2567174396714017, + "grad_norm": 32.89986038208008, + "learning_rate": 8.528237307472904e-06, + "loss": 3.1908, + "step": 1500 + }, + { + "epoch": 0.25688858463118264, + "grad_norm": 20.201610565185547, + "learning_rate": 8.533941814033086e-06, + "loss": 1.974, + "step": 1501 + }, + { + "epoch": 0.25705972959096357, + "grad_norm": 32.933231353759766, + "learning_rate": 8.53964632059327e-06, + "loss": 4.8342, + "step": 1502 + }, + { + "epoch": 0.2572308745507445, + "grad_norm": 25.67669105529785, + "learning_rate": 8.545350827153451e-06, + "loss": 2.8345, + "step": 1503 + }, + { + "epoch": 0.25740201951052544, + "grad_norm": 50.461097717285156, + "learning_rate": 8.551055333713634e-06, + "loss": 6.9385, + "step": 1504 + }, + { + "epoch": 0.25757316447030637, + "grad_norm": 32.42000198364258, + "learning_rate": 8.556759840273816e-06, + "loss": 3.4542, + "step": 1505 + }, + { + "epoch": 0.2577443094300873, + "grad_norm": 29.946523666381836, + "learning_rate": 8.562464346833998e-06, + "loss": 3.2486, + "step": 1506 + }, + { + "epoch": 0.25791545438986824, + "grad_norm": 17.451496124267578, + "learning_rate": 8.568168853394181e-06, + "loss": 1.4946, + "step": 1507 + }, + { + "epoch": 0.25808659934964917, + "grad_norm": 30.164350509643555, + "learning_rate": 8.573873359954363e-06, + "loss": 3.8272, + "step": 1508 + }, + { + "epoch": 0.2582577443094301, + "grad_norm": 26.747682571411133, + "learning_rate": 8.579577866514548e-06, + "loss": 3.0653, + "step": 1509 + }, + { + "epoch": 0.25842888926921104, + "grad_norm": 
20.9317626953125, + "learning_rate": 8.58528237307473e-06, + "loss": 1.8431, + "step": 1510 + }, + { + "epoch": 0.25860003422899197, + "grad_norm": 36.90618896484375, + "learning_rate": 8.590986879634913e-06, + "loss": 3.7371, + "step": 1511 + }, + { + "epoch": 0.2587711791887729, + "grad_norm": 19.612281799316406, + "learning_rate": 8.596691386195095e-06, + "loss": 1.4799, + "step": 1512 + }, + { + "epoch": 0.25894232414855384, + "grad_norm": 35.63535690307617, + "learning_rate": 8.602395892755276e-06, + "loss": 4.2458, + "step": 1513 + }, + { + "epoch": 0.25911346910833477, + "grad_norm": 37.25559997558594, + "learning_rate": 8.60810039931546e-06, + "loss": 3.7735, + "step": 1514 + }, + { + "epoch": 0.2592846140681157, + "grad_norm": 26.81685447692871, + "learning_rate": 8.613804905875641e-06, + "loss": 2.621, + "step": 1515 + }, + { + "epoch": 0.25945575902789664, + "grad_norm": 22.918485641479492, + "learning_rate": 8.619509412435825e-06, + "loss": 1.6105, + "step": 1516 + }, + { + "epoch": 0.25962690398767757, + "grad_norm": 12.06033992767334, + "learning_rate": 8.625213918996006e-06, + "loss": 1.1731, + "step": 1517 + }, + { + "epoch": 0.2597980489474585, + "grad_norm": 35.15945053100586, + "learning_rate": 8.63091842555619e-06, + "loss": 3.8198, + "step": 1518 + }, + { + "epoch": 0.25996919390723944, + "grad_norm": 13.90102767944336, + "learning_rate": 8.636622932116372e-06, + "loss": 2.736, + "step": 1519 + }, + { + "epoch": 0.26014033886702037, + "grad_norm": 35.0964469909668, + "learning_rate": 8.642327438676555e-06, + "loss": 4.3737, + "step": 1520 + }, + { + "epoch": 0.2603114838268013, + "grad_norm": 33.16070556640625, + "learning_rate": 8.648031945236738e-06, + "loss": 3.8065, + "step": 1521 + }, + { + "epoch": 0.26048262878658224, + "grad_norm": 16.28618621826172, + "learning_rate": 8.65373645179692e-06, + "loss": 1.257, + "step": 1522 + }, + { + "epoch": 0.26065377374636317, + "grad_norm": 28.174516677856445, + "learning_rate": 8.659440958357103e-06, + "loss": 3.7114, + "step": 1523 + }, + { + "epoch": 0.2608249187061441, + "grad_norm": 26.44544792175293, + "learning_rate": 8.665145464917285e-06, + "loss": 2.829, + "step": 1524 + }, + { + "epoch": 0.26099606366592504, + "grad_norm": 38.186378479003906, + "learning_rate": 8.670849971477467e-06, + "loss": 4.3011, + "step": 1525 + }, + { + "epoch": 0.26116720862570597, + "grad_norm": 206.24801635742188, + "learning_rate": 8.67655447803765e-06, + "loss": 9.2851, + "step": 1526 + }, + { + "epoch": 0.2613383535854869, + "grad_norm": 33.12008285522461, + "learning_rate": 8.682258984597832e-06, + "loss": 4.3036, + "step": 1527 + }, + { + "epoch": 0.26150949854526784, + "grad_norm": 136.57029724121094, + "learning_rate": 8.687963491158015e-06, + "loss": 8.4189, + "step": 1528 + }, + { + "epoch": 0.26168064350504877, + "grad_norm": 40.36309051513672, + "learning_rate": 8.693667997718197e-06, + "loss": 5.4948, + "step": 1529 + }, + { + "epoch": 0.2618517884648297, + "grad_norm": 19.74286651611328, + "learning_rate": 8.69937250427838e-06, + "loss": 2.0893, + "step": 1530 + }, + { + "epoch": 0.26202293342461064, + "grad_norm": 33.62118148803711, + "learning_rate": 8.705077010838562e-06, + "loss": 3.796, + "step": 1531 + }, + { + "epoch": 0.26219407838439157, + "grad_norm": 36.64006805419922, + "learning_rate": 8.710781517398745e-06, + "loss": 3.9848, + "step": 1532 + }, + { + "epoch": 0.2623652233441725, + "grad_norm": 12.980084419250488, + "learning_rate": 8.716486023958929e-06, + "loss": 1.1166, + "step": 1533 + }, + { + "epoch": 
0.26253636830395344, + "grad_norm": 35.808021545410156, + "learning_rate": 8.72219053051911e-06, + "loss": 4.3018, + "step": 1534 + }, + { + "epoch": 0.26270751326373437, + "grad_norm": 51.2911491394043, + "learning_rate": 8.727895037079294e-06, + "loss": 9.237, + "step": 1535 + }, + { + "epoch": 0.2628786582235153, + "grad_norm": 26.75223731994629, + "learning_rate": 8.733599543639475e-06, + "loss": 3.3625, + "step": 1536 + }, + { + "epoch": 0.26304980318329624, + "grad_norm": 81.07520294189453, + "learning_rate": 8.739304050199659e-06, + "loss": 7.5686, + "step": 1537 + }, + { + "epoch": 0.26322094814307717, + "grad_norm": 37.027191162109375, + "learning_rate": 8.74500855675984e-06, + "loss": 3.7701, + "step": 1538 + }, + { + "epoch": 0.2633920931028581, + "grad_norm": 47.393333435058594, + "learning_rate": 8.750713063320022e-06, + "loss": 9.0139, + "step": 1539 + }, + { + "epoch": 0.26356323806263904, + "grad_norm": 34.1210823059082, + "learning_rate": 8.756417569880206e-06, + "loss": 4.4995, + "step": 1540 + }, + { + "epoch": 0.26373438302241997, + "grad_norm": 14.312548637390137, + "learning_rate": 8.762122076440387e-06, + "loss": 2.1827, + "step": 1541 + }, + { + "epoch": 0.2639055279822009, + "grad_norm": 30.19961166381836, + "learning_rate": 8.76782658300057e-06, + "loss": 3.9737, + "step": 1542 + }, + { + "epoch": 0.26407667294198184, + "grad_norm": 10.720991134643555, + "learning_rate": 8.773531089560754e-06, + "loss": 1.1108, + "step": 1543 + }, + { + "epoch": 0.26424781790176277, + "grad_norm": 26.29660987854004, + "learning_rate": 8.779235596120936e-06, + "loss": 2.9509, + "step": 1544 + }, + { + "epoch": 0.2644189628615437, + "grad_norm": 7.651371479034424, + "learning_rate": 8.784940102681119e-06, + "loss": 0.8929, + "step": 1545 + }, + { + "epoch": 0.26459010782132464, + "grad_norm": 32.411407470703125, + "learning_rate": 8.7906446092413e-06, + "loss": 3.9279, + "step": 1546 + }, + { + "epoch": 0.26476125278110557, + "grad_norm": 43.62602233886719, + "learning_rate": 8.796349115801484e-06, + "loss": 8.7932, + "step": 1547 + }, + { + "epoch": 0.2649323977408865, + "grad_norm": 28.391075134277344, + "learning_rate": 8.802053622361666e-06, + "loss": 3.3049, + "step": 1548 + }, + { + "epoch": 0.26510354270066744, + "grad_norm": 35.11864471435547, + "learning_rate": 8.80775812892185e-06, + "loss": 4.0323, + "step": 1549 + }, + { + "epoch": 0.2652746876604484, + "grad_norm": 10.911874771118164, + "learning_rate": 8.813462635482031e-06, + "loss": 1.3744, + "step": 1550 + }, + { + "epoch": 0.26544583262022936, + "grad_norm": 22.232980728149414, + "learning_rate": 8.819167142042213e-06, + "loss": 1.972, + "step": 1551 + }, + { + "epoch": 0.2656169775800103, + "grad_norm": 171.640625, + "learning_rate": 8.824871648602396e-06, + "loss": 8.4712, + "step": 1552 + }, + { + "epoch": 0.2657881225397912, + "grad_norm": 30.831897735595703, + "learning_rate": 8.830576155162578e-06, + "loss": 3.5869, + "step": 1553 + }, + { + "epoch": 0.26595926749957216, + "grad_norm": 36.305782318115234, + "learning_rate": 8.836280661722761e-06, + "loss": 4.9009, + "step": 1554 + }, + { + "epoch": 0.2661304124593531, + "grad_norm": 44.463626861572266, + "learning_rate": 8.841985168282944e-06, + "loss": 4.6015, + "step": 1555 + }, + { + "epoch": 0.266301557419134, + "grad_norm": 22.66800308227539, + "learning_rate": 8.847689674843126e-06, + "loss": 2.1498, + "step": 1556 + }, + { + "epoch": 0.26647270237891496, + "grad_norm": 30.886274337768555, + "learning_rate": 8.85339418140331e-06, + "loss": 4.3322, + 
"step": 1557 + }, + { + "epoch": 0.2666438473386959, + "grad_norm": 34.30126190185547, + "learning_rate": 8.859098687963491e-06, + "loss": 4.5378, + "step": 1558 + }, + { + "epoch": 0.2668149922984768, + "grad_norm": 36.92926025390625, + "learning_rate": 8.864803194523674e-06, + "loss": 4.2903, + "step": 1559 + }, + { + "epoch": 0.26698613725825776, + "grad_norm": 34.588077545166016, + "learning_rate": 8.870507701083856e-06, + "loss": 4.9088, + "step": 1560 + }, + { + "epoch": 0.2671572822180387, + "grad_norm": 30.621044158935547, + "learning_rate": 8.87621220764404e-06, + "loss": 3.6051, + "step": 1561 + }, + { + "epoch": 0.2673284271778196, + "grad_norm": 30.107677459716797, + "learning_rate": 8.881916714204221e-06, + "loss": 3.4027, + "step": 1562 + }, + { + "epoch": 0.26749957213760056, + "grad_norm": 16.614532470703125, + "learning_rate": 8.887621220764403e-06, + "loss": 1.5846, + "step": 1563 + }, + { + "epoch": 0.2676707170973815, + "grad_norm": 35.577842712402344, + "learning_rate": 8.893325727324586e-06, + "loss": 4.2335, + "step": 1564 + }, + { + "epoch": 0.2678418620571624, + "grad_norm": 33.13545227050781, + "learning_rate": 8.899030233884768e-06, + "loss": 4.6539, + "step": 1565 + }, + { + "epoch": 0.26801300701694336, + "grad_norm": 170.64297485351562, + "learning_rate": 8.904734740444953e-06, + "loss": 9.3362, + "step": 1566 + }, + { + "epoch": 0.2681841519767243, + "grad_norm": 12.3065185546875, + "learning_rate": 8.910439247005135e-06, + "loss": 1.573, + "step": 1567 + }, + { + "epoch": 0.2683552969365052, + "grad_norm": 38.08529281616211, + "learning_rate": 8.916143753565318e-06, + "loss": 3.7314, + "step": 1568 + }, + { + "epoch": 0.26852644189628616, + "grad_norm": 169.76089477539062, + "learning_rate": 8.9218482601255e-06, + "loss": 9.6942, + "step": 1569 + }, + { + "epoch": 0.2686975868560671, + "grad_norm": 38.42169952392578, + "learning_rate": 8.927552766685681e-06, + "loss": 5.3158, + "step": 1570 + }, + { + "epoch": 0.268868731815848, + "grad_norm": 14.410723686218262, + "learning_rate": 8.933257273245865e-06, + "loss": 1.2377, + "step": 1571 + }, + { + "epoch": 0.26903987677562896, + "grad_norm": 52.682533264160156, + "learning_rate": 8.938961779806047e-06, + "loss": 6.516, + "step": 1572 + }, + { + "epoch": 0.2692110217354099, + "grad_norm": 34.07759094238281, + "learning_rate": 8.94466628636623e-06, + "loss": 4.013, + "step": 1573 + }, + { + "epoch": 0.26938216669519083, + "grad_norm": 29.74109649658203, + "learning_rate": 8.950370792926412e-06, + "loss": 3.4177, + "step": 1574 + }, + { + "epoch": 0.26955331165497176, + "grad_norm": 35.098876953125, + "learning_rate": 8.956075299486593e-06, + "loss": 4.1055, + "step": 1575 + }, + { + "epoch": 0.2697244566147527, + "grad_norm": 50.082366943359375, + "learning_rate": 8.961779806046777e-06, + "loss": 8.4876, + "step": 1576 + }, + { + "epoch": 0.26989560157453363, + "grad_norm": 116.58244323730469, + "learning_rate": 8.96748431260696e-06, + "loss": 7.8558, + "step": 1577 + }, + { + "epoch": 0.27006674653431456, + "grad_norm": 32.75837326049805, + "learning_rate": 8.973188819167143e-06, + "loss": 3.8977, + "step": 1578 + }, + { + "epoch": 0.2702378914940955, + "grad_norm": 13.686226844787598, + "learning_rate": 8.978893325727325e-06, + "loss": 1.5984, + "step": 1579 + }, + { + "epoch": 0.27040903645387643, + "grad_norm": 31.057418823242188, + "learning_rate": 8.984597832287508e-06, + "loss": 4.2033, + "step": 1580 + }, + { + "epoch": 0.27058018141365736, + "grad_norm": 31.405447006225586, + "learning_rate": 
8.99030233884769e-06, + "loss": 3.2895, + "step": 1581 + }, + { + "epoch": 0.2707513263734383, + "grad_norm": 29.978918075561523, + "learning_rate": 8.996006845407872e-06, + "loss": 4.0648, + "step": 1582 + }, + { + "epoch": 0.27092247133321923, + "grad_norm": 11.317312240600586, + "learning_rate": 9.001711351968055e-06, + "loss": 0.9835, + "step": 1583 + }, + { + "epoch": 0.27109361629300016, + "grad_norm": 17.877771377563477, + "learning_rate": 9.007415858528237e-06, + "loss": 1.4293, + "step": 1584 + }, + { + "epoch": 0.2712647612527811, + "grad_norm": 26.353673934936523, + "learning_rate": 9.01312036508842e-06, + "loss": 2.6549, + "step": 1585 + }, + { + "epoch": 0.27143590621256203, + "grad_norm": 31.735876083374023, + "learning_rate": 9.018824871648602e-06, + "loss": 3.9997, + "step": 1586 + }, + { + "epoch": 0.27160705117234296, + "grad_norm": 35.91917037963867, + "learning_rate": 9.024529378208785e-06, + "loss": 4.2824, + "step": 1587 + }, + { + "epoch": 0.2717781961321239, + "grad_norm": 32.27674865722656, + "learning_rate": 9.030233884768967e-06, + "loss": 4.0964, + "step": 1588 + }, + { + "epoch": 0.27194934109190483, + "grad_norm": 37.242549896240234, + "learning_rate": 9.03593839132915e-06, + "loss": 4.4567, + "step": 1589 + }, + { + "epoch": 0.27212048605168576, + "grad_norm": 15.34211540222168, + "learning_rate": 9.041642897889334e-06, + "loss": 1.1567, + "step": 1590 + }, + { + "epoch": 0.2722916310114667, + "grad_norm": 35.38195037841797, + "learning_rate": 9.047347404449515e-06, + "loss": 4.7975, + "step": 1591 + }, + { + "epoch": 0.27246277597124763, + "grad_norm": 29.104900360107422, + "learning_rate": 9.053051911009699e-06, + "loss": 3.1354, + "step": 1592 + }, + { + "epoch": 0.27263392093102856, + "grad_norm": 15.004528999328613, + "learning_rate": 9.05875641756988e-06, + "loss": 1.1248, + "step": 1593 + }, + { + "epoch": 0.2728050658908095, + "grad_norm": 26.269655227661133, + "learning_rate": 9.064460924130062e-06, + "loss": 2.0743, + "step": 1594 + }, + { + "epoch": 0.27297621085059043, + "grad_norm": 19.79959487915039, + "learning_rate": 9.070165430690246e-06, + "loss": 1.3031, + "step": 1595 + }, + { + "epoch": 0.27314735581037136, + "grad_norm": 43.51731491088867, + "learning_rate": 9.075869937250427e-06, + "loss": 4.4293, + "step": 1596 + }, + { + "epoch": 0.2733185007701523, + "grad_norm": 7.138434410095215, + "learning_rate": 9.08157444381061e-06, + "loss": 0.8485, + "step": 1597 + }, + { + "epoch": 0.27348964572993323, + "grad_norm": 32.309593200683594, + "learning_rate": 9.087278950370792e-06, + "loss": 3.4497, + "step": 1598 + }, + { + "epoch": 0.27366079068971416, + "grad_norm": 24.805715560913086, + "learning_rate": 9.092983456930976e-06, + "loss": 2.9256, + "step": 1599 + }, + { + "epoch": 0.2738319356494951, + "grad_norm": 61.22898483276367, + "learning_rate": 9.098687963491159e-06, + "loss": 5.9283, + "step": 1600 + }, + { + "epoch": 0.2740030806092761, + "grad_norm": 29.417680740356445, + "learning_rate": 9.10439247005134e-06, + "loss": 3.8084, + "step": 1601 + }, + { + "epoch": 0.274174225569057, + "grad_norm": 34.00372314453125, + "learning_rate": 9.110096976611524e-06, + "loss": 3.4933, + "step": 1602 + }, + { + "epoch": 0.27434537052883795, + "grad_norm": 14.374422073364258, + "learning_rate": 9.115801483171706e-06, + "loss": 1.4626, + "step": 1603 + }, + { + "epoch": 0.2745165154886189, + "grad_norm": 12.729880332946777, + "learning_rate": 9.12150598973189e-06, + "loss": 1.1151, + "step": 1604 + }, + { + "epoch": 0.2746876604483998, + 
"grad_norm": 17.94257164001465, + "learning_rate": 9.127210496292071e-06, + "loss": 1.3846, + "step": 1605 + }, + { + "epoch": 0.27485880540818075, + "grad_norm": 38.29545974731445, + "learning_rate": 9.132915002852253e-06, + "loss": 4.5905, + "step": 1606 + }, + { + "epoch": 0.2750299503679617, + "grad_norm": 35.37318420410156, + "learning_rate": 9.138619509412436e-06, + "loss": 4.3784, + "step": 1607 + }, + { + "epoch": 0.2752010953277426, + "grad_norm": 35.77292251586914, + "learning_rate": 9.144324015972618e-06, + "loss": 3.315, + "step": 1608 + }, + { + "epoch": 0.27537224028752355, + "grad_norm": 38.70093536376953, + "learning_rate": 9.150028522532801e-06, + "loss": 5.4718, + "step": 1609 + }, + { + "epoch": 0.2755433852473045, + "grad_norm": 185.0310516357422, + "learning_rate": 9.155733029092983e-06, + "loss": 7.5009, + "step": 1610 + }, + { + "epoch": 0.2757145302070854, + "grad_norm": 28.145288467407227, + "learning_rate": 9.161437535653166e-06, + "loss": 2.8764, + "step": 1611 + }, + { + "epoch": 0.27588567516686635, + "grad_norm": 7.594282150268555, + "learning_rate": 9.16714204221335e-06, + "loss": 0.8713, + "step": 1612 + }, + { + "epoch": 0.2760568201266473, + "grad_norm": 32.899845123291016, + "learning_rate": 9.172846548773531e-06, + "loss": 4.6094, + "step": 1613 + }, + { + "epoch": 0.2762279650864282, + "grad_norm": 39.75630569458008, + "learning_rate": 9.178551055333715e-06, + "loss": 4.5632, + "step": 1614 + }, + { + "epoch": 0.27639911004620915, + "grad_norm": 29.607851028442383, + "learning_rate": 9.184255561893896e-06, + "loss": 2.9606, + "step": 1615 + }, + { + "epoch": 0.2765702550059901, + "grad_norm": 76.37677001953125, + "learning_rate": 9.18996006845408e-06, + "loss": 7.355, + "step": 1616 + }, + { + "epoch": 0.276741399965771, + "grad_norm": 22.215526580810547, + "learning_rate": 9.195664575014261e-06, + "loss": 2.8241, + "step": 1617 + }, + { + "epoch": 0.27691254492555195, + "grad_norm": 9.465276718139648, + "learning_rate": 9.201369081574445e-06, + "loss": 0.9882, + "step": 1618 + }, + { + "epoch": 0.2770836898853329, + "grad_norm": 27.726600646972656, + "learning_rate": 9.207073588134626e-06, + "loss": 3.238, + "step": 1619 + }, + { + "epoch": 0.2772548348451138, + "grad_norm": 35.69710922241211, + "learning_rate": 9.212778094694808e-06, + "loss": 4.4113, + "step": 1620 + }, + { + "epoch": 0.27742597980489475, + "grad_norm": 34.97329330444336, + "learning_rate": 9.218482601254991e-06, + "loss": 5.005, + "step": 1621 + }, + { + "epoch": 0.2775971247646757, + "grad_norm": 18.749282836914062, + "learning_rate": 9.224187107815173e-06, + "loss": 1.7009, + "step": 1622 + }, + { + "epoch": 0.2777682697244566, + "grad_norm": 130.61004638671875, + "learning_rate": 9.229891614375358e-06, + "loss": 7.8661, + "step": 1623 + }, + { + "epoch": 0.27793941468423755, + "grad_norm": 12.980770111083984, + "learning_rate": 9.23559612093554e-06, + "loss": 1.1125, + "step": 1624 + }, + { + "epoch": 0.2781105596440185, + "grad_norm": 46.32781219482422, + "learning_rate": 9.241300627495722e-06, + "loss": 8.7552, + "step": 1625 + }, + { + "epoch": 0.2782817046037994, + "grad_norm": 101.5696029663086, + "learning_rate": 9.247005134055905e-06, + "loss": 7.1054, + "step": 1626 + }, + { + "epoch": 0.27845284956358035, + "grad_norm": 22.125795364379883, + "learning_rate": 9.252709640616087e-06, + "loss": 1.7911, + "step": 1627 + }, + { + "epoch": 0.2786239945233613, + "grad_norm": 34.277095794677734, + "learning_rate": 9.25841414717627e-06, + "loss": 4.438, + "step": 1628 + }, + { + 
"epoch": 0.2787951394831422, + "grad_norm": 22.72269058227539, + "learning_rate": 9.264118653736452e-06, + "loss": 1.7455, + "step": 1629 + }, + { + "epoch": 0.27896628444292315, + "grad_norm": 30.11455726623535, + "learning_rate": 9.269823160296635e-06, + "loss": 3.3549, + "step": 1630 + }, + { + "epoch": 0.2791374294027041, + "grad_norm": 34.13120651245117, + "learning_rate": 9.275527666856817e-06, + "loss": 3.7081, + "step": 1631 + }, + { + "epoch": 0.279308574362485, + "grad_norm": 8.457001686096191, + "learning_rate": 9.281232173416998e-06, + "loss": 0.9564, + "step": 1632 + }, + { + "epoch": 0.27947971932226595, + "grad_norm": 38.574615478515625, + "learning_rate": 9.286936679977182e-06, + "loss": 4.3973, + "step": 1633 + }, + { + "epoch": 0.2796508642820469, + "grad_norm": 11.158347129821777, + "learning_rate": 9.292641186537364e-06, + "loss": 0.9696, + "step": 1634 + }, + { + "epoch": 0.2798220092418278, + "grad_norm": 11.847931861877441, + "learning_rate": 9.298345693097549e-06, + "loss": 1.5377, + "step": 1635 + }, + { + "epoch": 0.27999315420160875, + "grad_norm": 11.096319198608398, + "learning_rate": 9.30405019965773e-06, + "loss": 1.136, + "step": 1636 + }, + { + "epoch": 0.2801642991613897, + "grad_norm": 36.63529586791992, + "learning_rate": 9.309754706217914e-06, + "loss": 5.2998, + "step": 1637 + }, + { + "epoch": 0.2803354441211706, + "grad_norm": 30.421175003051758, + "learning_rate": 9.315459212778095e-06, + "loss": 3.5562, + "step": 1638 + }, + { + "epoch": 0.28050658908095155, + "grad_norm": 34.89402770996094, + "learning_rate": 9.321163719338277e-06, + "loss": 4.9255, + "step": 1639 + }, + { + "epoch": 0.2806777340407325, + "grad_norm": 28.486478805541992, + "learning_rate": 9.32686822589846e-06, + "loss": 3.4583, + "step": 1640 + }, + { + "epoch": 0.2808488790005134, + "grad_norm": 7.498641490936279, + "learning_rate": 9.332572732458642e-06, + "loss": 0.8123, + "step": 1641 + }, + { + "epoch": 0.28102002396029435, + "grad_norm": 47.50094223022461, + "learning_rate": 9.338277239018825e-06, + "loss": 7.894, + "step": 1642 + }, + { + "epoch": 0.2811911689200753, + "grad_norm": 62.95503616333008, + "learning_rate": 9.343981745579007e-06, + "loss": 6.7316, + "step": 1643 + }, + { + "epoch": 0.2813623138798562, + "grad_norm": 26.29498291015625, + "learning_rate": 9.349686252139189e-06, + "loss": 2.9299, + "step": 1644 + }, + { + "epoch": 0.28153345883963715, + "grad_norm": 13.663917541503906, + "learning_rate": 9.355390758699372e-06, + "loss": 1.6658, + "step": 1645 + }, + { + "epoch": 0.2817046037994181, + "grad_norm": 31.745132446289062, + "learning_rate": 9.361095265259556e-06, + "loss": 4.9097, + "step": 1646 + }, + { + "epoch": 0.281875748759199, + "grad_norm": 16.757953643798828, + "learning_rate": 9.366799771819739e-06, + "loss": 1.4769, + "step": 1647 + }, + { + "epoch": 0.28204689371897995, + "grad_norm": 21.601877212524414, + "learning_rate": 9.37250427837992e-06, + "loss": 1.7352, + "step": 1648 + }, + { + "epoch": 0.2822180386787609, + "grad_norm": 38.61962127685547, + "learning_rate": 9.378208784940104e-06, + "loss": 4.4803, + "step": 1649 + }, + { + "epoch": 0.2823891836385418, + "grad_norm": 31.342639923095703, + "learning_rate": 9.383913291500286e-06, + "loss": 3.8044, + "step": 1650 + }, + { + "epoch": 0.28256032859832275, + "grad_norm": 9.416754722595215, + "learning_rate": 9.389617798060467e-06, + "loss": 0.804, + "step": 1651 + }, + { + "epoch": 0.28273147355810374, + "grad_norm": 31.227413177490234, + "learning_rate": 9.39532230462065e-06, + "loss": 
4.1229, + "step": 1652 + }, + { + "epoch": 0.2829026185178847, + "grad_norm": 13.257563591003418, + "learning_rate": 9.401026811180832e-06, + "loss": 1.1089, + "step": 1653 + }, + { + "epoch": 0.2830737634776656, + "grad_norm": 33.36773681640625, + "learning_rate": 9.406731317741016e-06, + "loss": 4.6453, + "step": 1654 + }, + { + "epoch": 0.28324490843744654, + "grad_norm": 30.116289138793945, + "learning_rate": 9.412435824301198e-06, + "loss": 3.3475, + "step": 1655 + }, + { + "epoch": 0.2834160533972275, + "grad_norm": 9.72807502746582, + "learning_rate": 9.41814033086138e-06, + "loss": 1.3987, + "step": 1656 + }, + { + "epoch": 0.2835871983570084, + "grad_norm": 35.53730392456055, + "learning_rate": 9.423844837421563e-06, + "loss": 4.4274, + "step": 1657 + }, + { + "epoch": 0.28375834331678934, + "grad_norm": 25.7310733795166, + "learning_rate": 9.429549343981746e-06, + "loss": 3.1681, + "step": 1658 + }, + { + "epoch": 0.2839294882765703, + "grad_norm": 41.159175872802734, + "learning_rate": 9.43525385054193e-06, + "loss": 5.0249, + "step": 1659 + }, + { + "epoch": 0.2841006332363512, + "grad_norm": 44.80512619018555, + "learning_rate": 9.440958357102111e-06, + "loss": 8.5485, + "step": 1660 + }, + { + "epoch": 0.28427177819613214, + "grad_norm": 30.980173110961914, + "learning_rate": 9.446662863662294e-06, + "loss": 3.4326, + "step": 1661 + }, + { + "epoch": 0.2844429231559131, + "grad_norm": 34.32295608520508, + "learning_rate": 9.452367370222476e-06, + "loss": 3.1846, + "step": 1662 + }, + { + "epoch": 0.284614068115694, + "grad_norm": 31.66938591003418, + "learning_rate": 9.458071876782658e-06, + "loss": 3.5118, + "step": 1663 + }, + { + "epoch": 0.28478521307547494, + "grad_norm": 32.4676513671875, + "learning_rate": 9.463776383342841e-06, + "loss": 4.7146, + "step": 1664 + }, + { + "epoch": 0.2849563580352559, + "grad_norm": 10.913191795349121, + "learning_rate": 9.469480889903023e-06, + "loss": 0.9731, + "step": 1665 + }, + { + "epoch": 0.2851275029950368, + "grad_norm": 35.5974006652832, + "learning_rate": 9.475185396463206e-06, + "loss": 4.3522, + "step": 1666 + }, + { + "epoch": 0.28529864795481774, + "grad_norm": 33.59803771972656, + "learning_rate": 9.480889903023388e-06, + "loss": 3.3641, + "step": 1667 + }, + { + "epoch": 0.2854697929145987, + "grad_norm": 35.429466247558594, + "learning_rate": 9.486594409583571e-06, + "loss": 3.9219, + "step": 1668 + }, + { + "epoch": 0.2856409378743796, + "grad_norm": 13.85142707824707, + "learning_rate": 9.492298916143755e-06, + "loss": 1.3341, + "step": 1669 + }, + { + "epoch": 0.28581208283416054, + "grad_norm": 9.107728004455566, + "learning_rate": 9.498003422703936e-06, + "loss": 0.8871, + "step": 1670 + }, + { + "epoch": 0.2859832277939415, + "grad_norm": 35.564979553222656, + "learning_rate": 9.50370792926412e-06, + "loss": 4.1451, + "step": 1671 + }, + { + "epoch": 0.2861543727537224, + "grad_norm": 27.561506271362305, + "learning_rate": 9.509412435824301e-06, + "loss": 3.3942, + "step": 1672 + }, + { + "epoch": 0.28632551771350334, + "grad_norm": 35.57343292236328, + "learning_rate": 9.515116942384485e-06, + "loss": 3.7111, + "step": 1673 + }, + { + "epoch": 0.2864966626732843, + "grad_norm": 39.25908279418945, + "learning_rate": 9.520821448944666e-06, + "loss": 4.3091, + "step": 1674 + }, + { + "epoch": 0.2866678076330652, + "grad_norm": 41.76926803588867, + "learning_rate": 9.526525955504848e-06, + "loss": 4.7086, + "step": 1675 + }, + { + "epoch": 0.28683895259284614, + "grad_norm": 30.626611709594727, + "learning_rate": 
9.532230462065032e-06, + "loss": 3.1129, + "step": 1676 + }, + { + "epoch": 0.2870100975526271, + "grad_norm": 15.441875457763672, + "learning_rate": 9.537934968625213e-06, + "loss": 1.2888, + "step": 1677 + }, + { + "epoch": 0.287181242512408, + "grad_norm": 28.600982666015625, + "learning_rate": 9.543639475185397e-06, + "loss": 2.9279, + "step": 1678 + }, + { + "epoch": 0.28735238747218894, + "grad_norm": 31.3085994720459, + "learning_rate": 9.549343981745578e-06, + "loss": 3.8741, + "step": 1679 + }, + { + "epoch": 0.2875235324319699, + "grad_norm": 207.9216766357422, + "learning_rate": 9.555048488305763e-06, + "loss": 8.5201, + "step": 1680 + }, + { + "epoch": 0.2876946773917508, + "grad_norm": 38.76487731933594, + "learning_rate": 9.560752994865945e-06, + "loss": 4.8258, + "step": 1681 + }, + { + "epoch": 0.28786582235153174, + "grad_norm": 35.18633270263672, + "learning_rate": 9.566457501426127e-06, + "loss": 3.9555, + "step": 1682 + }, + { + "epoch": 0.2880369673113127, + "grad_norm": 153.19830322265625, + "learning_rate": 9.57216200798631e-06, + "loss": 8.1017, + "step": 1683 + }, + { + "epoch": 0.2882081122710936, + "grad_norm": 8.444355010986328, + "learning_rate": 9.577866514546492e-06, + "loss": 0.8761, + "step": 1684 + }, + { + "epoch": 0.28837925723087454, + "grad_norm": 44.78715515136719, + "learning_rate": 9.583571021106675e-06, + "loss": 8.4681, + "step": 1685 + }, + { + "epoch": 0.2885504021906555, + "grad_norm": 25.710901260375977, + "learning_rate": 9.589275527666857e-06, + "loss": 3.2682, + "step": 1686 + }, + { + "epoch": 0.2887215471504364, + "grad_norm": 161.20376586914062, + "learning_rate": 9.59498003422704e-06, + "loss": 8.3231, + "step": 1687 + }, + { + "epoch": 0.28889269211021734, + "grad_norm": 36.88936996459961, + "learning_rate": 9.600684540787222e-06, + "loss": 4.4629, + "step": 1688 + }, + { + "epoch": 0.2890638370699983, + "grad_norm": 33.05325698852539, + "learning_rate": 9.606389047347404e-06, + "loss": 4.2398, + "step": 1689 + }, + { + "epoch": 0.2892349820297792, + "grad_norm": 31.297021865844727, + "learning_rate": 9.612093553907587e-06, + "loss": 3.9676, + "step": 1690 + }, + { + "epoch": 0.28940612698956014, + "grad_norm": 33.626365661621094, + "learning_rate": 9.617798060467769e-06, + "loss": 4.2342, + "step": 1691 + }, + { + "epoch": 0.2895772719493411, + "grad_norm": 32.812740325927734, + "learning_rate": 9.623502567027954e-06, + "loss": 3.9633, + "step": 1692 + }, + { + "epoch": 0.289748416909122, + "grad_norm": 16.281417846679688, + "learning_rate": 9.629207073588135e-06, + "loss": 1.3504, + "step": 1693 + }, + { + "epoch": 0.28991956186890294, + "grad_norm": 36.80635070800781, + "learning_rate": 9.634911580148317e-06, + "loss": 4.8133, + "step": 1694 + }, + { + "epoch": 0.2900907068286839, + "grad_norm": 36.548397064208984, + "learning_rate": 9.6406160867085e-06, + "loss": 4.0484, + "step": 1695 + }, + { + "epoch": 0.2902618517884648, + "grad_norm": 35.513729095458984, + "learning_rate": 9.646320593268682e-06, + "loss": 4.3281, + "step": 1696 + }, + { + "epoch": 0.29043299674824574, + "grad_norm": 33.258995056152344, + "learning_rate": 9.652025099828866e-06, + "loss": 4.3749, + "step": 1697 + }, + { + "epoch": 0.2906041417080267, + "grad_norm": 30.854419708251953, + "learning_rate": 9.657729606389047e-06, + "loss": 3.4016, + "step": 1698 + }, + { + "epoch": 0.2907752866678076, + "grad_norm": 8.308602333068848, + "learning_rate": 9.66343411294923e-06, + "loss": 1.221, + "step": 1699 + }, + { + "epoch": 0.29094643162758854, + "grad_norm": 
10.515448570251465, + "learning_rate": 9.669138619509412e-06, + "loss": 0.9434, + "step": 1700 + }, + { + "epoch": 0.2911175765873695, + "grad_norm": 20.784067153930664, + "learning_rate": 9.674843126069594e-06, + "loss": 2.2873, + "step": 1701 + }, + { + "epoch": 0.2912887215471504, + "grad_norm": 44.417884826660156, + "learning_rate": 9.680547632629777e-06, + "loss": 8.2452, + "step": 1702 + }, + { + "epoch": 0.2914598665069314, + "grad_norm": 28.057279586791992, + "learning_rate": 9.68625213918996e-06, + "loss": 2.7355, + "step": 1703 + }, + { + "epoch": 0.29163101146671233, + "grad_norm": 50.71089553833008, + "learning_rate": 9.691956645750144e-06, + "loss": 8.431, + "step": 1704 + }, + { + "epoch": 0.29180215642649326, + "grad_norm": 44.918087005615234, + "learning_rate": 9.697661152310326e-06, + "loss": 7.8944, + "step": 1705 + }, + { + "epoch": 0.2919733013862742, + "grad_norm": 143.09837341308594, + "learning_rate": 9.703365658870507e-06, + "loss": 7.8241, + "step": 1706 + }, + { + "epoch": 0.29214444634605513, + "grad_norm": 22.652225494384766, + "learning_rate": 9.709070165430691e-06, + "loss": 2.888, + "step": 1707 + }, + { + "epoch": 0.29231559130583606, + "grad_norm": 32.992774963378906, + "learning_rate": 9.714774671990873e-06, + "loss": 3.7048, + "step": 1708 + }, + { + "epoch": 0.292486736265617, + "grad_norm": 30.531761169433594, + "learning_rate": 9.720479178551056e-06, + "loss": 3.8219, + "step": 1709 + }, + { + "epoch": 0.29265788122539793, + "grad_norm": 39.57463073730469, + "learning_rate": 9.726183685111238e-06, + "loss": 4.5677, + "step": 1710 + }, + { + "epoch": 0.29282902618517886, + "grad_norm": 32.177650451660156, + "learning_rate": 9.731888191671421e-06, + "loss": 3.9562, + "step": 1711 + }, + { + "epoch": 0.2930001711449598, + "grad_norm": 11.071613311767578, + "learning_rate": 9.737592698231603e-06, + "loss": 1.3818, + "step": 1712 + }, + { + "epoch": 0.29317131610474073, + "grad_norm": 183.07089233398438, + "learning_rate": 9.743297204791784e-06, + "loss": 8.202, + "step": 1713 + }, + { + "epoch": 0.29334246106452166, + "grad_norm": 127.64228057861328, + "learning_rate": 9.749001711351968e-06, + "loss": 7.7497, + "step": 1714 + }, + { + "epoch": 0.2935136060243026, + "grad_norm": 30.094449996948242, + "learning_rate": 9.754706217912151e-06, + "loss": 3.133, + "step": 1715 + }, + { + "epoch": 0.29368475098408353, + "grad_norm": 33.56199264526367, + "learning_rate": 9.760410724472334e-06, + "loss": 3.9969, + "step": 1716 + }, + { + "epoch": 0.29385589594386446, + "grad_norm": 30.969953536987305, + "learning_rate": 9.766115231032516e-06, + "loss": 3.2142, + "step": 1717 + }, + { + "epoch": 0.2940270409036454, + "grad_norm": 26.745988845825195, + "learning_rate": 9.7718197375927e-06, + "loss": 2.5008, + "step": 1718 + }, + { + "epoch": 0.29419818586342633, + "grad_norm": 19.184772491455078, + "learning_rate": 9.777524244152881e-06, + "loss": 1.7379, + "step": 1719 + }, + { + "epoch": 0.29436933082320726, + "grad_norm": 30.3228759765625, + "learning_rate": 9.783228750713063e-06, + "loss": 3.6131, + "step": 1720 + }, + { + "epoch": 0.2945404757829882, + "grad_norm": 30.700254440307617, + "learning_rate": 9.788933257273246e-06, + "loss": 3.4613, + "step": 1721 + }, + { + "epoch": 0.29471162074276913, + "grad_norm": 35.11033248901367, + "learning_rate": 9.794637763833428e-06, + "loss": 4.2554, + "step": 1722 + }, + { + "epoch": 0.29488276570255006, + "grad_norm": 47.508277893066406, + "learning_rate": 9.800342270393611e-06, + "loss": 8.2195, + "step": 1723 + }, + { 
+ "epoch": 0.295053910662331, + "grad_norm": 35.247528076171875, + "learning_rate": 9.806046776953793e-06, + "loss": 4.3316, + "step": 1724 + }, + { + "epoch": 0.29522505562211193, + "grad_norm": 27.610280990600586, + "learning_rate": 9.811751283513975e-06, + "loss": 3.7587, + "step": 1725 + }, + { + "epoch": 0.29539620058189286, + "grad_norm": 34.314918518066406, + "learning_rate": 9.81745579007416e-06, + "loss": 4.4546, + "step": 1726 + }, + { + "epoch": 0.2955673455416738, + "grad_norm": 31.43994140625, + "learning_rate": 9.823160296634341e-06, + "loss": 3.8653, + "step": 1727 + }, + { + "epoch": 0.29573849050145473, + "grad_norm": 15.655001640319824, + "learning_rate": 9.828864803194525e-06, + "loss": 1.2015, + "step": 1728 + }, + { + "epoch": 0.29590963546123566, + "grad_norm": 13.799985885620117, + "learning_rate": 9.834569309754707e-06, + "loss": 1.6485, + "step": 1729 + }, + { + "epoch": 0.2960807804210166, + "grad_norm": 35.408145904541016, + "learning_rate": 9.84027381631489e-06, + "loss": 4.6912, + "step": 1730 + }, + { + "epoch": 0.29625192538079753, + "grad_norm": 33.258941650390625, + "learning_rate": 9.845978322875072e-06, + "loss": 3.8243, + "step": 1731 + }, + { + "epoch": 0.29642307034057847, + "grad_norm": 34.537960052490234, + "learning_rate": 9.851682829435253e-06, + "loss": 3.6863, + "step": 1732 + }, + { + "epoch": 0.2965942153003594, + "grad_norm": 25.667997360229492, + "learning_rate": 9.857387335995437e-06, + "loss": 3.21, + "step": 1733 + }, + { + "epoch": 0.29676536026014033, + "grad_norm": 146.46380615234375, + "learning_rate": 9.863091842555618e-06, + "loss": 7.1136, + "step": 1734 + }, + { + "epoch": 0.29693650521992127, + "grad_norm": 20.732595443725586, + "learning_rate": 9.868796349115802e-06, + "loss": 2.1932, + "step": 1735 + }, + { + "epoch": 0.2971076501797022, + "grad_norm": 37.78299331665039, + "learning_rate": 9.874500855675983e-06, + "loss": 4.4365, + "step": 1736 + }, + { + "epoch": 0.29727879513948313, + "grad_norm": 30.049827575683594, + "learning_rate": 9.880205362236167e-06, + "loss": 3.2983, + "step": 1737 + }, + { + "epoch": 0.29744994009926407, + "grad_norm": 12.33377742767334, + "learning_rate": 9.88590986879635e-06, + "loss": 1.4362, + "step": 1738 + }, + { + "epoch": 0.297621085059045, + "grad_norm": 24.165996551513672, + "learning_rate": 9.891614375356532e-06, + "loss": 2.7512, + "step": 1739 + }, + { + "epoch": 0.29779223001882593, + "grad_norm": 34.980438232421875, + "learning_rate": 9.897318881916715e-06, + "loss": 3.4089, + "step": 1740 + }, + { + "epoch": 0.29796337497860687, + "grad_norm": 52.22333526611328, + "learning_rate": 9.903023388476897e-06, + "loss": 8.55, + "step": 1741 + }, + { + "epoch": 0.2981345199383878, + "grad_norm": 30.178720474243164, + "learning_rate": 9.90872789503708e-06, + "loss": 3.7629, + "step": 1742 + }, + { + "epoch": 0.29830566489816873, + "grad_norm": 12.83564281463623, + "learning_rate": 9.914432401597262e-06, + "loss": 1.5206, + "step": 1743 + }, + { + "epoch": 0.29847680985794967, + "grad_norm": 116.20635223388672, + "learning_rate": 9.920136908157444e-06, + "loss": 7.1701, + "step": 1744 + }, + { + "epoch": 0.2986479548177306, + "grad_norm": 28.332143783569336, + "learning_rate": 9.925841414717627e-06, + "loss": 4.0808, + "step": 1745 + }, + { + "epoch": 0.29881909977751153, + "grad_norm": 17.009302139282227, + "learning_rate": 9.931545921277809e-06, + "loss": 1.1237, + "step": 1746 + }, + { + "epoch": 0.29899024473729247, + "grad_norm": 22.102079391479492, + "learning_rate": 
9.937250427837992e-06, + "loss": 2.1591, + "step": 1747 + }, + { + "epoch": 0.2991613896970734, + "grad_norm": 31.704936981201172, + "learning_rate": 9.942954934398174e-06, + "loss": 3.3555, + "step": 1748 + }, + { + "epoch": 0.29933253465685433, + "grad_norm": 7.139681816101074, + "learning_rate": 9.948659440958359e-06, + "loss": 0.8492, + "step": 1749 + }, + { + "epoch": 0.29950367961663527, + "grad_norm": 37.93485641479492, + "learning_rate": 9.95436394751854e-06, + "loss": 4.3445, + "step": 1750 + }, + { + "epoch": 0.2996748245764162, + "grad_norm": 23.79175567626953, + "learning_rate": 9.960068454078722e-06, + "loss": 2.6891, + "step": 1751 + }, + { + "epoch": 0.29984596953619713, + "grad_norm": 26.583223342895508, + "learning_rate": 9.965772960638906e-06, + "loss": 3.0936, + "step": 1752 + }, + { + "epoch": 0.3000171144959781, + "grad_norm": 16.86503791809082, + "learning_rate": 9.971477467199087e-06, + "loss": 1.5367, + "step": 1753 + }, + { + "epoch": 0.30018825945575905, + "grad_norm": 37.780025482177734, + "learning_rate": 9.97718197375927e-06, + "loss": 4.5862, + "step": 1754 + }, + { + "epoch": 0.30018825945575905, + "eval_nli-pairs_loss": 3.577563524246216, + "eval_nli-pairs_runtime": 4.5158, + "eval_nli-pairs_samples_per_second": 44.289, + "eval_nli-pairs_steps_per_second": 1.55, + "eval_sts-test_pearson_cosine": 0.7051574603634622, + "eval_sts-test_pearson_dot": 0.5937802816639131, + "eval_sts-test_pearson_euclidean": 0.7000060119936138, + "eval_sts-test_pearson_manhattan": 0.7079127065958083, + "eval_sts-test_pearson_max": 0.7079127065958083, + "eval_sts-test_spearman_cosine": 0.6765504113809614, + "eval_sts-test_spearman_dot": 0.5611218190113842, + "eval_sts-test_spearman_euclidean": 0.6793571635918119, + "eval_sts-test_spearman_manhattan": 0.6864576898108908, + "eval_sts-test_spearman_max": 0.6864576898108908, + "step": 1754 + }, + { + "epoch": 0.30018825945575905, + "eval_vitaminc-pairs_loss": 2.382566213607788, + "eval_vitaminc-pairs_runtime": 2.7572, + "eval_vitaminc-pairs_samples_per_second": 72.538, + "eval_vitaminc-pairs_steps_per_second": 2.539, + "step": 1754 + }, + { + "epoch": 0.30018825945575905, + "eval_qnli-contrastive_loss": 7.762363910675049, + "eval_qnli-contrastive_runtime": 0.6686, + "eval_qnli-contrastive_samples_per_second": 299.128, + "eval_qnli-contrastive_steps_per_second": 10.469, + "step": 1754 + }, + { + "epoch": 0.30018825945575905, + "eval_scitail-pairs-qa_loss": 0.7197363972663879, + "eval_scitail-pairs-qa_runtime": 1.7426, + "eval_scitail-pairs-qa_samples_per_second": 114.768, + "eval_scitail-pairs-qa_steps_per_second": 4.017, + "step": 1754 + }, + { + "epoch": 0.30018825945575905, + "eval_scitail-pairs-pos_loss": 2.2759039402008057, + "eval_scitail-pairs-pos_runtime": 2.8206, + "eval_scitail-pairs-pos_samples_per_second": 70.906, + "eval_scitail-pairs-pos_steps_per_second": 2.482, + "step": 1754 + }, + { + "epoch": 0.30018825945575905, + "eval_xsum-pairs_loss": 2.1139955520629883, + "eval_xsum-pairs_runtime": 2.6563, + "eval_xsum-pairs_samples_per_second": 65.88, + "eval_xsum-pairs_steps_per_second": 2.259, + "step": 1754 + }, + { + "epoch": 0.30018825945575905, + "eval_compression-pairs_loss": 1.1527378559112549, + "eval_compression-pairs_runtime": 0.5278, + "eval_compression-pairs_samples_per_second": 378.929, + "eval_compression-pairs_steps_per_second": 13.263, + "step": 1754 + }, + { + "epoch": 0.30018825945575905, + "eval_sciq_pairs_loss": 6.166472434997559, + "eval_sciq_pairs_runtime": 9.2821, + "eval_sciq_pairs_samples_per_second": 
21.547, + "eval_sciq_pairs_steps_per_second": 0.754, + "step": 1754 + }, + { + "epoch": 0.30018825945575905, + "eval_qasc_pairs_loss": 8.247413635253906, + "eval_qasc_pairs_runtime": 2.7444, + "eval_qasc_pairs_samples_per_second": 72.876, + "eval_qasc_pairs_steps_per_second": 2.551, + "step": 1754 + }, + { + "epoch": 0.30018825945575905, + "eval_openbookqa_pairs_loss": 4.27993631362915, + "eval_openbookqa_pairs_runtime": 0.68, + "eval_openbookqa_pairs_samples_per_second": 101.475, + "eval_openbookqa_pairs_steps_per_second": 4.412, + "step": 1754 + }, + { + "epoch": 0.30018825945575905, + "eval_msmarco_pairs_loss": 3.4503884315490723, + "eval_msmarco_pairs_runtime": 4.1424, + "eval_msmarco_pairs_samples_per_second": 48.281, + "eval_msmarco_pairs_steps_per_second": 1.69, + "step": 1754 + }, + { + "epoch": 0.30018825945575905, + "eval_nq_pairs_loss": 4.303767204284668, + "eval_nq_pairs_runtime": 8.7194, + "eval_nq_pairs_samples_per_second": 22.937, + "eval_nq_pairs_steps_per_second": 0.803, + "step": 1754 + }, + { + "epoch": 0.30018825945575905, + "eval_trivia_pairs_loss": 3.893390417098999, + "eval_trivia_pairs_runtime": 13.177, + "eval_trivia_pairs_samples_per_second": 15.178, + "eval_trivia_pairs_steps_per_second": 0.531, + "step": 1754 + }, + { + "epoch": 0.30018825945575905, + "eval_quora_pairs_loss": 1.0257954597473145, + "eval_quora_pairs_runtime": 1.5896, + "eval_quora_pairs_samples_per_second": 125.821, + "eval_quora_pairs_steps_per_second": 4.404, + "step": 1754 + }, + { + "epoch": 0.30018825945575905, + "eval_gooaq_pairs_loss": 2.6827940940856934, + "eval_gooaq_pairs_runtime": 2.6669, + "eval_gooaq_pairs_samples_per_second": 74.993, + "eval_gooaq_pairs_steps_per_second": 2.625, + "step": 1754 + }, + { + "epoch": 0.30035940441554, + "grad_norm": 32.57681655883789, + "learning_rate": 9.982886480319452e-06, + "loss": 4.3391, + "step": 1755 + }, + { + "epoch": 0.3005305493753209, + "grad_norm": 26.65064811706543, + "learning_rate": 9.988590986879634e-06, + "loss": 2.7014, + "step": 1756 + }, + { + "epoch": 0.30070169433510185, + "grad_norm": 33.25247573852539, + "learning_rate": 9.994295493439817e-06, + "loss": 4.1446, + "step": 1757 + }, + { + "epoch": 0.3008728392948828, + "grad_norm": 25.792116165161133, + "learning_rate": 9.999999999999999e-06, + "loss": 2.7164, + "step": 1758 + }, + { + "epoch": 0.3010439842546637, + "grad_norm": 28.707399368286133, + "learning_rate": 1.0005704506560183e-05, + "loss": 3.1937, + "step": 1759 + }, + { + "epoch": 0.30121512921444465, + "grad_norm": 38.30696105957031, + "learning_rate": 1.0011409013120366e-05, + "loss": 4.2427, + "step": 1760 + }, + { + "epoch": 0.3013862741742256, + "grad_norm": 26.254148483276367, + "learning_rate": 1.001711351968055e-05, + "loss": 2.5525, + "step": 1761 + }, + { + "epoch": 0.3015574191340065, + "grad_norm": 7.5429487228393555, + "learning_rate": 1.0022818026240731e-05, + "loss": 0.8481, + "step": 1762 + }, + { + "epoch": 0.30172856409378745, + "grad_norm": 45.37841796875, + "learning_rate": 1.0028522532800913e-05, + "loss": 6.5584, + "step": 1763 + }, + { + "epoch": 0.3018997090535684, + "grad_norm": 17.617197036743164, + "learning_rate": 1.0034227039361096e-05, + "loss": 1.5689, + "step": 1764 + }, + { + "epoch": 0.3020708540133493, + "grad_norm": 8.921030044555664, + "learning_rate": 1.0039931545921278e-05, + "loss": 1.9049, + "step": 1765 + }, + { + "epoch": 0.30224199897313025, + "grad_norm": 11.456149101257324, + "learning_rate": 1.0045636052481461e-05, + "loss": 1.4351, + "step": 1766 + }, + { + "epoch": 
0.3024131439329112, + "grad_norm": 36.827125549316406, + "learning_rate": 1.0051340559041643e-05, + "loss": 3.8073, + "step": 1767 + }, + { + "epoch": 0.3025842888926921, + "grad_norm": 31.50043296813965, + "learning_rate": 1.0057045065601826e-05, + "loss": 3.4761, + "step": 1768 + }, + { + "epoch": 0.30275543385247305, + "grad_norm": 212.15618896484375, + "learning_rate": 1.0062749572162008e-05, + "loss": 8.804, + "step": 1769 + }, + { + "epoch": 0.302926578812254, + "grad_norm": 11.170289039611816, + "learning_rate": 1.006845407872219e-05, + "loss": 1.5324, + "step": 1770 + }, + { + "epoch": 0.3030977237720349, + "grad_norm": 11.275130271911621, + "learning_rate": 1.0074158585282373e-05, + "loss": 1.0326, + "step": 1771 + }, + { + "epoch": 0.30326886873181585, + "grad_norm": 37.139068603515625, + "learning_rate": 1.0079863091842556e-05, + "loss": 4.5464, + "step": 1772 + }, + { + "epoch": 0.3034400136915968, + "grad_norm": 24.030378341674805, + "learning_rate": 1.008556759840274e-05, + "loss": 1.9306, + "step": 1773 + }, + { + "epoch": 0.3036111586513777, + "grad_norm": 23.25863265991211, + "learning_rate": 1.0091272104962921e-05, + "loss": 1.8897, + "step": 1774 + }, + { + "epoch": 0.30378230361115865, + "grad_norm": 33.125823974609375, + "learning_rate": 1.0096976611523103e-05, + "loss": 3.4839, + "step": 1775 + }, + { + "epoch": 0.3039534485709396, + "grad_norm": 21.4809627532959, + "learning_rate": 1.0102681118083286e-05, + "loss": 2.866, + "step": 1776 + }, + { + "epoch": 0.3041245935307205, + "grad_norm": 54.2559928894043, + "learning_rate": 1.0108385624643468e-05, + "loss": 8.802, + "step": 1777 + }, + { + "epoch": 0.30429573849050146, + "grad_norm": 39.62715148925781, + "learning_rate": 1.0114090131203651e-05, + "loss": 5.1068, + "step": 1778 + }, + { + "epoch": 0.3044668834502824, + "grad_norm": 14.615751266479492, + "learning_rate": 1.0119794637763833e-05, + "loss": 1.2298, + "step": 1779 + }, + { + "epoch": 0.3046380284100633, + "grad_norm": 36.6978874206543, + "learning_rate": 1.0125499144324017e-05, + "loss": 4.1995, + "step": 1780 + }, + { + "epoch": 0.30480917336984426, + "grad_norm": 14.718832015991211, + "learning_rate": 1.0131203650884198e-05, + "loss": 1.1796, + "step": 1781 + }, + { + "epoch": 0.3049803183296252, + "grad_norm": 36.830204010009766, + "learning_rate": 1.013690815744438e-05, + "loss": 4.1858, + "step": 1782 + }, + { + "epoch": 0.3051514632894061, + "grad_norm": 23.391765594482422, + "learning_rate": 1.0142612664004565e-05, + "loss": 2.4115, + "step": 1783 + }, + { + "epoch": 0.30532260824918706, + "grad_norm": 35.27947998046875, + "learning_rate": 1.0148317170564747e-05, + "loss": 4.8061, + "step": 1784 + }, + { + "epoch": 0.305493753208968, + "grad_norm": 10.68021297454834, + "learning_rate": 1.015402167712493e-05, + "loss": 2.1324, + "step": 1785 + }, + { + "epoch": 0.3056648981687489, + "grad_norm": 23.529436111450195, + "learning_rate": 1.0159726183685112e-05, + "loss": 2.7194, + "step": 1786 + }, + { + "epoch": 0.30583604312852986, + "grad_norm": 32.76841354370117, + "learning_rate": 1.0165430690245295e-05, + "loss": 3.9735, + "step": 1787 + }, + { + "epoch": 0.3060071880883108, + "grad_norm": 20.872732162475586, + "learning_rate": 1.0171135196805477e-05, + "loss": 2.3385, + "step": 1788 + }, + { + "epoch": 0.3061783330480917, + "grad_norm": 14.08251953125, + "learning_rate": 1.0176839703365658e-05, + "loss": 1.8159, + "step": 1789 + }, + { + "epoch": 0.30634947800787266, + "grad_norm": 39.58723831176758, + "learning_rate": 1.0182544209925842e-05, + 
"loss": 4.7749, + "step": 1790 + }, + { + "epoch": 0.3065206229676536, + "grad_norm": 65.20591735839844, + "learning_rate": 1.0188248716486024e-05, + "loss": 6.4724, + "step": 1791 + }, + { + "epoch": 0.3066917679274345, + "grad_norm": 44.97452926635742, + "learning_rate": 1.0193953223046207e-05, + "loss": 4.9313, + "step": 1792 + }, + { + "epoch": 0.30686291288721546, + "grad_norm": 35.091163635253906, + "learning_rate": 1.0199657729606389e-05, + "loss": 3.4266, + "step": 1793 + }, + { + "epoch": 0.3070340578469964, + "grad_norm": 17.238380432128906, + "learning_rate": 1.020536223616657e-05, + "loss": 1.4114, + "step": 1794 + }, + { + "epoch": 0.3072052028067773, + "grad_norm": 12.661242485046387, + "learning_rate": 1.0211066742726755e-05, + "loss": 2.2799, + "step": 1795 + }, + { + "epoch": 0.30737634776655826, + "grad_norm": 29.67556381225586, + "learning_rate": 1.0216771249286937e-05, + "loss": 2.9217, + "step": 1796 + }, + { + "epoch": 0.3075474927263392, + "grad_norm": 34.465126037597656, + "learning_rate": 1.022247575584712e-05, + "loss": 3.9674, + "step": 1797 + }, + { + "epoch": 0.3077186376861201, + "grad_norm": 66.6548080444336, + "learning_rate": 1.0228180262407302e-05, + "loss": 6.0514, + "step": 1798 + }, + { + "epoch": 0.30788978264590106, + "grad_norm": 36.210044860839844, + "learning_rate": 1.0233884768967485e-05, + "loss": 4.2555, + "step": 1799 + }, + { + "epoch": 0.308060927605682, + "grad_norm": 24.441967010498047, + "learning_rate": 1.0239589275527667e-05, + "loss": 2.5473, + "step": 1800 + }, + { + "epoch": 0.3082320725654629, + "grad_norm": 20.574525833129883, + "learning_rate": 1.0245293782087849e-05, + "loss": 1.6693, + "step": 1801 + }, + { + "epoch": 0.30840321752524386, + "grad_norm": 26.07015037536621, + "learning_rate": 1.0250998288648032e-05, + "loss": 2.7451, + "step": 1802 + }, + { + "epoch": 0.3085743624850248, + "grad_norm": 29.663963317871094, + "learning_rate": 1.0256702795208214e-05, + "loss": 4.0482, + "step": 1803 + }, + { + "epoch": 0.3087455074448058, + "grad_norm": 27.77281379699707, + "learning_rate": 1.0262407301768397e-05, + "loss": 3.0752, + "step": 1804 + }, + { + "epoch": 0.3089166524045867, + "grad_norm": 34.827430725097656, + "learning_rate": 1.0268111808328579e-05, + "loss": 3.7669, + "step": 1805 + }, + { + "epoch": 0.30908779736436764, + "grad_norm": 37.112361907958984, + "learning_rate": 1.0273816314888762e-05, + "loss": 4.7788, + "step": 1806 + }, + { + "epoch": 0.3092589423241486, + "grad_norm": 53.2462272644043, + "learning_rate": 1.0279520821448946e-05, + "loss": 8.0593, + "step": 1807 + }, + { + "epoch": 0.3094300872839295, + "grad_norm": 38.18441390991211, + "learning_rate": 1.0285225328009127e-05, + "loss": 4.2028, + "step": 1808 + }, + { + "epoch": 0.30960123224371044, + "grad_norm": 13.605740547180176, + "learning_rate": 1.029092983456931e-05, + "loss": 2.5679, + "step": 1809 + }, + { + "epoch": 0.3097723772034914, + "grad_norm": 37.292240142822266, + "learning_rate": 1.0296634341129492e-05, + "loss": 4.0864, + "step": 1810 + }, + { + "epoch": 0.3099435221632723, + "grad_norm": 10.673694610595703, + "learning_rate": 1.0302338847689676e-05, + "loss": 0.953, + "step": 1811 + }, + { + "epoch": 0.31011466712305324, + "grad_norm": 30.847604751586914, + "learning_rate": 1.0308043354249858e-05, + "loss": 4.4181, + "step": 1812 + }, + { + "epoch": 0.3102858120828342, + "grad_norm": 25.303640365600586, + "learning_rate": 1.031374786081004e-05, + "loss": 3.0808, + "step": 1813 + }, + { + "epoch": 0.3104569570426151, + "grad_norm": 
31.284347534179688, + "learning_rate": 1.0319452367370223e-05, + "loss": 3.3148, + "step": 1814 + }, + { + "epoch": 0.31062810200239604, + "grad_norm": 18.292266845703125, + "learning_rate": 1.0325156873930404e-05, + "loss": 1.4786, + "step": 1815 + }, + { + "epoch": 0.310799246962177, + "grad_norm": 93.66471099853516, + "learning_rate": 1.0330861380490588e-05, + "loss": 6.8127, + "step": 1816 + }, + { + "epoch": 0.3109703919219579, + "grad_norm": 38.12440872192383, + "learning_rate": 1.033656588705077e-05, + "loss": 5.0019, + "step": 1817 + }, + { + "epoch": 0.31114153688173884, + "grad_norm": 32.61493682861328, + "learning_rate": 1.0342270393610954e-05, + "loss": 4.3171, + "step": 1818 + }, + { + "epoch": 0.3113126818415198, + "grad_norm": 38.087646484375, + "learning_rate": 1.0347974900171136e-05, + "loss": 7.6945, + "step": 1819 + }, + { + "epoch": 0.3114838268013007, + "grad_norm": 21.899497985839844, + "learning_rate": 1.0353679406731318e-05, + "loss": 1.7206, + "step": 1820 + }, + { + "epoch": 0.31165497176108165, + "grad_norm": 113.81354522705078, + "learning_rate": 1.0359383913291501e-05, + "loss": 7.2513, + "step": 1821 + }, + { + "epoch": 0.3118261167208626, + "grad_norm": 11.316397666931152, + "learning_rate": 1.0365088419851683e-05, + "loss": 2.1259, + "step": 1822 + }, + { + "epoch": 0.3119972616806435, + "grad_norm": 26.67529296875, + "learning_rate": 1.0370792926411866e-05, + "loss": 3.1664, + "step": 1823 + }, + { + "epoch": 0.31216840664042445, + "grad_norm": 25.253353118896484, + "learning_rate": 1.0376497432972048e-05, + "loss": 2.4222, + "step": 1824 + }, + { + "epoch": 0.3123395516002054, + "grad_norm": 8.143440246582031, + "learning_rate": 1.038220193953223e-05, + "loss": 0.7973, + "step": 1825 + }, + { + "epoch": 0.3125106965599863, + "grad_norm": 19.66392707824707, + "learning_rate": 1.0387906446092413e-05, + "loss": 1.5552, + "step": 1826 + }, + { + "epoch": 0.31268184151976725, + "grad_norm": 23.67314910888672, + "learning_rate": 1.0393610952652595e-05, + "loss": 3.07, + "step": 1827 + }, + { + "epoch": 0.3128529864795482, + "grad_norm": 26.236251831054688, + "learning_rate": 1.0399315459212778e-05, + "loss": 3.1091, + "step": 1828 + }, + { + "epoch": 0.3130241314393291, + "grad_norm": 28.10502815246582, + "learning_rate": 1.0405019965772961e-05, + "loss": 3.0707, + "step": 1829 + }, + { + "epoch": 0.31319527639911005, + "grad_norm": 34.508846282958984, + "learning_rate": 1.0410724472333145e-05, + "loss": 4.872, + "step": 1830 + }, + { + "epoch": 0.313366421358891, + "grad_norm": 34.22414016723633, + "learning_rate": 1.0416428978893326e-05, + "loss": 3.3169, + "step": 1831 + }, + { + "epoch": 0.3135375663186719, + "grad_norm": 46.06840515136719, + "learning_rate": 1.0422133485453508e-05, + "loss": 7.9438, + "step": 1832 + }, + { + "epoch": 0.31370871127845285, + "grad_norm": 18.041322708129883, + "learning_rate": 1.0427837992013692e-05, + "loss": 1.629, + "step": 1833 + }, + { + "epoch": 0.3138798562382338, + "grad_norm": 14.525741577148438, + "learning_rate": 1.0433542498573873e-05, + "loss": 1.1969, + "step": 1834 + }, + { + "epoch": 0.3140510011980147, + "grad_norm": 4.135936260223389, + "learning_rate": 1.0439247005134057e-05, + "loss": 0.7184, + "step": 1835 + }, + { + "epoch": 0.31422214615779565, + "grad_norm": 41.9599723815918, + "learning_rate": 1.0444951511694238e-05, + "loss": 4.2524, + "step": 1836 + }, + { + "epoch": 0.3143932911175766, + "grad_norm": 7.373823642730713, + "learning_rate": 1.0450656018254422e-05, + "loss": 1.8983, + "step": 1837 + }, 
+ { + "epoch": 0.3145644360773575, + "grad_norm": 31.084392547607422, + "learning_rate": 1.0456360524814603e-05, + "loss": 4.0436, + "step": 1838 + }, + { + "epoch": 0.31473558103713845, + "grad_norm": 11.967267036437988, + "learning_rate": 1.0462065031374785e-05, + "loss": 1.0282, + "step": 1839 + }, + { + "epoch": 0.3149067259969194, + "grad_norm": 33.466121673583984, + "learning_rate": 1.046776953793497e-05, + "loss": 3.9262, + "step": 1840 + }, + { + "epoch": 0.3150778709567003, + "grad_norm": 39.21562576293945, + "learning_rate": 1.0473474044495152e-05, + "loss": 4.844, + "step": 1841 + }, + { + "epoch": 0.31524901591648125, + "grad_norm": 33.843055725097656, + "learning_rate": 1.0479178551055335e-05, + "loss": 3.5103, + "step": 1842 + }, + { + "epoch": 0.3154201608762622, + "grad_norm": 35.37272644042969, + "learning_rate": 1.0484883057615517e-05, + "loss": 3.584, + "step": 1843 + }, + { + "epoch": 0.3155913058360431, + "grad_norm": 17.376483917236328, + "learning_rate": 1.0490587564175699e-05, + "loss": 1.4993, + "step": 1844 + }, + { + "epoch": 0.31576245079582405, + "grad_norm": 45.614688873291016, + "learning_rate": 1.0496292070735882e-05, + "loss": 8.1587, + "step": 1845 + }, + { + "epoch": 0.315933595755605, + "grad_norm": 31.185443878173828, + "learning_rate": 1.0501996577296064e-05, + "loss": 4.1762, + "step": 1846 + }, + { + "epoch": 0.3161047407153859, + "grad_norm": 33.703514099121094, + "learning_rate": 1.0507701083856247e-05, + "loss": 4.1885, + "step": 1847 + }, + { + "epoch": 0.31627588567516685, + "grad_norm": 24.48247718811035, + "learning_rate": 1.0513405590416429e-05, + "loss": 2.7277, + "step": 1848 + }, + { + "epoch": 0.3164470306349478, + "grad_norm": 25.966876983642578, + "learning_rate": 1.0519110096976612e-05, + "loss": 2.8921, + "step": 1849 + }, + { + "epoch": 0.3166181755947287, + "grad_norm": 35.0124626159668, + "learning_rate": 1.0524814603536794e-05, + "loss": 4.3145, + "step": 1850 + }, + { + "epoch": 0.31678932055450965, + "grad_norm": 33.62586975097656, + "learning_rate": 1.0530519110096975e-05, + "loss": 3.8524, + "step": 1851 + }, + { + "epoch": 0.3169604655142906, + "grad_norm": 30.16233253479004, + "learning_rate": 1.053622361665716e-05, + "loss": 3.3166, + "step": 1852 + }, + { + "epoch": 0.3171316104740715, + "grad_norm": 31.811193466186523, + "learning_rate": 1.0541928123217342e-05, + "loss": 3.5965, + "step": 1853 + }, + { + "epoch": 0.31730275543385245, + "grad_norm": 35.756778717041016, + "learning_rate": 1.0547632629777526e-05, + "loss": 4.4027, + "step": 1854 + }, + { + "epoch": 0.31747390039363343, + "grad_norm": 17.929304122924805, + "learning_rate": 1.0553337136337707e-05, + "loss": 2.2128, + "step": 1855 + }, + { + "epoch": 0.31764504535341437, + "grad_norm": 29.329362869262695, + "learning_rate": 1.0559041642897889e-05, + "loss": 2.4503, + "step": 1856 + }, + { + "epoch": 0.3178161903131953, + "grad_norm": 38.31791305541992, + "learning_rate": 1.0564746149458072e-05, + "loss": 4.1596, + "step": 1857 + }, + { + "epoch": 0.31798733527297623, + "grad_norm": 26.978776931762695, + "learning_rate": 1.0570450656018254e-05, + "loss": 2.5148, + "step": 1858 + }, + { + "epoch": 0.31815848023275717, + "grad_norm": 183.96864318847656, + "learning_rate": 1.0576155162578437e-05, + "loss": 7.8451, + "step": 1859 + }, + { + "epoch": 0.3183296251925381, + "grad_norm": 34.898677825927734, + "learning_rate": 1.0581859669138619e-05, + "loss": 3.631, + "step": 1860 + }, + { + "epoch": 0.31850077015231903, + "grad_norm": 18.749799728393555, + 
"learning_rate": 1.0587564175698802e-05, + "loss": 1.5066, + "step": 1861 + }, + { + "epoch": 0.31867191511209997, + "grad_norm": 32.26422882080078, + "learning_rate": 1.0593268682258984e-05, + "loss": 4.0466, + "step": 1862 + }, + { + "epoch": 0.3188430600718809, + "grad_norm": 9.538769721984863, + "learning_rate": 1.0598973188819167e-05, + "loss": 1.2133, + "step": 1863 + }, + { + "epoch": 0.31901420503166183, + "grad_norm": 9.156614303588867, + "learning_rate": 1.0604677695379351e-05, + "loss": 0.9202, + "step": 1864 + }, + { + "epoch": 0.31918534999144277, + "grad_norm": 137.56471252441406, + "learning_rate": 1.0610382201939533e-05, + "loss": 7.6205, + "step": 1865 + }, + { + "epoch": 0.3193564949512237, + "grad_norm": 24.30291748046875, + "learning_rate": 1.0616086708499716e-05, + "loss": 2.5704, + "step": 1866 + }, + { + "epoch": 0.31952763991100464, + "grad_norm": 32.78607940673828, + "learning_rate": 1.0621791215059898e-05, + "loss": 3.4866, + "step": 1867 + }, + { + "epoch": 0.31969878487078557, + "grad_norm": 25.44717025756836, + "learning_rate": 1.0627495721620081e-05, + "loss": 2.8747, + "step": 1868 + }, + { + "epoch": 0.3198699298305665, + "grad_norm": 71.5486831665039, + "learning_rate": 1.0633200228180263e-05, + "loss": 6.3834, + "step": 1869 + }, + { + "epoch": 0.32004107479034744, + "grad_norm": 36.36513900756836, + "learning_rate": 1.0638904734740444e-05, + "loss": 3.8896, + "step": 1870 + }, + { + "epoch": 0.32021221975012837, + "grad_norm": 14.369461059570312, + "learning_rate": 1.0644609241300628e-05, + "loss": 1.2576, + "step": 1871 + }, + { + "epoch": 0.3203833647099093, + "grad_norm": 34.6867561340332, + "learning_rate": 1.065031374786081e-05, + "loss": 3.4093, + "step": 1872 + }, + { + "epoch": 0.32055450966969024, + "grad_norm": 21.84122657775879, + "learning_rate": 1.0656018254420993e-05, + "loss": 2.2791, + "step": 1873 + }, + { + "epoch": 0.32072565462947117, + "grad_norm": 21.254135131835938, + "learning_rate": 1.0661722760981174e-05, + "loss": 2.2054, + "step": 1874 + }, + { + "epoch": 0.3208967995892521, + "grad_norm": 33.362220764160156, + "learning_rate": 1.0667427267541358e-05, + "loss": 4.1888, + "step": 1875 + }, + { + "epoch": 0.32106794454903304, + "grad_norm": 63.412601470947266, + "learning_rate": 1.0673131774101541e-05, + "loss": 8.5606, + "step": 1876 + }, + { + "epoch": 0.32123908950881397, + "grad_norm": 14.283455848693848, + "learning_rate": 1.0678836280661723e-05, + "loss": 0.9998, + "step": 1877 + }, + { + "epoch": 0.3214102344685949, + "grad_norm": 35.16504669189453, + "learning_rate": 1.0684540787221906e-05, + "loss": 4.2321, + "step": 1878 + }, + { + "epoch": 0.32158137942837584, + "grad_norm": 12.61963939666748, + "learning_rate": 1.0690245293782088e-05, + "loss": 1.5004, + "step": 1879 + }, + { + "epoch": 0.32175252438815677, + "grad_norm": 32.174076080322266, + "learning_rate": 1.0695949800342271e-05, + "loss": 3.5576, + "step": 1880 + }, + { + "epoch": 0.3219236693479377, + "grad_norm": 30.472043991088867, + "learning_rate": 1.0701654306902453e-05, + "loss": 3.4048, + "step": 1881 + }, + { + "epoch": 0.32209481430771864, + "grad_norm": 84.8609848022461, + "learning_rate": 1.0707358813462635e-05, + "loss": 6.2658, + "step": 1882 + }, + { + "epoch": 0.32226595926749957, + "grad_norm": 25.621240615844727, + "learning_rate": 1.0713063320022818e-05, + "loss": 2.6459, + "step": 1883 + }, + { + "epoch": 0.3224371042272805, + "grad_norm": 79.82257080078125, + "learning_rate": 1.0718767826583e-05, + "loss": 6.3192, + "step": 1884 + }, + { + 
"epoch": 0.32260824918706144, + "grad_norm": 7.729169845581055, + "learning_rate": 1.0724472333143183e-05, + "loss": 0.825, + "step": 1885 + }, + { + "epoch": 0.32277939414684237, + "grad_norm": 29.313451766967773, + "learning_rate": 1.0730176839703367e-05, + "loss": 2.9915, + "step": 1886 + }, + { + "epoch": 0.3229505391066233, + "grad_norm": 6.555768013000488, + "learning_rate": 1.073588134626355e-05, + "loss": 0.7525, + "step": 1887 + }, + { + "epoch": 0.32312168406640424, + "grad_norm": 35.07060241699219, + "learning_rate": 1.0741585852823732e-05, + "loss": 4.2147, + "step": 1888 + }, + { + "epoch": 0.32329282902618517, + "grad_norm": 10.583313941955566, + "learning_rate": 1.0747290359383913e-05, + "loss": 0.8557, + "step": 1889 + }, + { + "epoch": 0.3234639739859661, + "grad_norm": 26.075578689575195, + "learning_rate": 1.0752994865944097e-05, + "loss": 2.9433, + "step": 1890 + }, + { + "epoch": 0.32363511894574704, + "grad_norm": 17.7381591796875, + "learning_rate": 1.0758699372504278e-05, + "loss": 1.4998, + "step": 1891 + }, + { + "epoch": 0.32380626390552797, + "grad_norm": 16.11162567138672, + "learning_rate": 1.0764403879064462e-05, + "loss": 1.2949, + "step": 1892 + }, + { + "epoch": 0.3239774088653089, + "grad_norm": 28.165752410888672, + "learning_rate": 1.0770108385624643e-05, + "loss": 3.4363, + "step": 1893 + }, + { + "epoch": 0.32414855382508984, + "grad_norm": 37.37394714355469, + "learning_rate": 1.0775812892184825e-05, + "loss": 4.7016, + "step": 1894 + }, + { + "epoch": 0.32431969878487077, + "grad_norm": 35.620826721191406, + "learning_rate": 1.0781517398745008e-05, + "loss": 4.4153, + "step": 1895 + }, + { + "epoch": 0.3244908437446517, + "grad_norm": 35.83405303955078, + "learning_rate": 1.078722190530519e-05, + "loss": 4.4295, + "step": 1896 + }, + { + "epoch": 0.32466198870443264, + "grad_norm": 12.846619606018066, + "learning_rate": 1.0792926411865374e-05, + "loss": 1.4411, + "step": 1897 + }, + { + "epoch": 0.32483313366421357, + "grad_norm": 11.455179214477539, + "learning_rate": 1.0798630918425557e-05, + "loss": 1.1335, + "step": 1898 + }, + { + "epoch": 0.3250042786239945, + "grad_norm": 36.278289794921875, + "learning_rate": 1.080433542498574e-05, + "loss": 3.6505, + "step": 1899 + }, + { + "epoch": 0.32517542358377544, + "grad_norm": 37.59969711303711, + "learning_rate": 1.0810039931545922e-05, + "loss": 5.1473, + "step": 1900 + }, + { + "epoch": 0.32534656854355637, + "grad_norm": 27.851537704467773, + "learning_rate": 1.0815744438106104e-05, + "loss": 2.792, + "step": 1901 + }, + { + "epoch": 0.3255177135033373, + "grad_norm": 20.874591827392578, + "learning_rate": 1.0821448944666287e-05, + "loss": 2.5421, + "step": 1902 + }, + { + "epoch": 0.32568885846311824, + "grad_norm": 12.82272720336914, + "learning_rate": 1.0827153451226469e-05, + "loss": 0.9663, + "step": 1903 + }, + { + "epoch": 0.32586000342289917, + "grad_norm": 27.367874145507812, + "learning_rate": 1.0832857957786652e-05, + "loss": 2.6934, + "step": 1904 + }, + { + "epoch": 0.3260311483826801, + "grad_norm": 31.575483322143555, + "learning_rate": 1.0838562464346834e-05, + "loss": 3.3276, + "step": 1905 + }, + { + "epoch": 0.3262022933424611, + "grad_norm": 36.26526641845703, + "learning_rate": 1.0844266970907017e-05, + "loss": 4.196, + "step": 1906 + }, + { + "epoch": 0.326373438302242, + "grad_norm": 20.60125160217285, + "learning_rate": 1.0849971477467199e-05, + "loss": 1.5247, + "step": 1907 + }, + { + "epoch": 0.32654458326202296, + "grad_norm": 19.104351043701172, + "learning_rate": 
1.085567598402738e-05, + "loss": 1.9953, + "step": 1908 + }, + { + "epoch": 0.3267157282218039, + "grad_norm": 31.618993759155273, + "learning_rate": 1.0861380490587566e-05, + "loss": 3.2496, + "step": 1909 + }, + { + "epoch": 0.3268868731815848, + "grad_norm": 20.25756072998047, + "learning_rate": 1.0867084997147747e-05, + "loss": 1.4173, + "step": 1910 + }, + { + "epoch": 0.32705801814136576, + "grad_norm": 19.579376220703125, + "learning_rate": 1.087278950370793e-05, + "loss": 1.4559, + "step": 1911 + }, + { + "epoch": 0.3272291631011467, + "grad_norm": 33.51919174194336, + "learning_rate": 1.0878494010268112e-05, + "loss": 4.3546, + "step": 1912 + }, + { + "epoch": 0.3274003080609276, + "grad_norm": 34.54380416870117, + "learning_rate": 1.0884198516828294e-05, + "loss": 3.8532, + "step": 1913 + }, + { + "epoch": 0.32757145302070856, + "grad_norm": 43.39759063720703, + "learning_rate": 1.0889903023388477e-05, + "loss": 5.7, + "step": 1914 + }, + { + "epoch": 0.3277425979804895, + "grad_norm": 31.343278884887695, + "learning_rate": 1.0895607529948659e-05, + "loss": 3.6086, + "step": 1915 + }, + { + "epoch": 0.3279137429402704, + "grad_norm": 37.40540313720703, + "learning_rate": 1.0901312036508843e-05, + "loss": 3.6012, + "step": 1916 + }, + { + "epoch": 0.32808488790005136, + "grad_norm": 10.474573135375977, + "learning_rate": 1.0907016543069024e-05, + "loss": 0.9649, + "step": 1917 + }, + { + "epoch": 0.3282560328598323, + "grad_norm": 26.88408088684082, + "learning_rate": 1.0912721049629208e-05, + "loss": 2.6185, + "step": 1918 + }, + { + "epoch": 0.3284271778196132, + "grad_norm": 24.986539840698242, + "learning_rate": 1.091842555618939e-05, + "loss": 2.0861, + "step": 1919 + }, + { + "epoch": 0.32859832277939416, + "grad_norm": 36.754337310791016, + "learning_rate": 1.0924130062749573e-05, + "loss": 4.4734, + "step": 1920 + }, + { + "epoch": 0.3287694677391751, + "grad_norm": 36.0711555480957, + "learning_rate": 1.0929834569309756e-05, + "loss": 3.7612, + "step": 1921 + }, + { + "epoch": 0.328940612698956, + "grad_norm": 33.72808074951172, + "learning_rate": 1.0935539075869938e-05, + "loss": 3.6817, + "step": 1922 + }, + { + "epoch": 0.32911175765873696, + "grad_norm": 31.21643829345703, + "learning_rate": 1.0941243582430121e-05, + "loss": 3.1247, + "step": 1923 + }, + { + "epoch": 0.3292829026185179, + "grad_norm": 26.2045955657959, + "learning_rate": 1.0946948088990303e-05, + "loss": 3.1474, + "step": 1924 + }, + { + "epoch": 0.3294540475782988, + "grad_norm": 30.681350708007812, + "learning_rate": 1.0952652595550484e-05, + "loss": 3.1958, + "step": 1925 + }, + { + "epoch": 0.32962519253807976, + "grad_norm": 57.95525360107422, + "learning_rate": 1.0958357102110668e-05, + "loss": 8.8044, + "step": 1926 + }, + { + "epoch": 0.3297963374978607, + "grad_norm": 178.06443786621094, + "learning_rate": 1.096406160867085e-05, + "loss": 8.7701, + "step": 1927 + }, + { + "epoch": 0.3299674824576416, + "grad_norm": 35.5237922668457, + "learning_rate": 1.0969766115231033e-05, + "loss": 3.8513, + "step": 1928 + }, + { + "epoch": 0.33013862741742256, + "grad_norm": 39.186771392822266, + "learning_rate": 1.0975470621791215e-05, + "loss": 4.4358, + "step": 1929 + }, + { + "epoch": 0.3303097723772035, + "grad_norm": 25.387964248657227, + "learning_rate": 1.0981175128351398e-05, + "loss": 2.6496, + "step": 1930 + }, + { + "epoch": 0.3304809173369844, + "grad_norm": 41.67265319824219, + "learning_rate": 1.098687963491158e-05, + "loss": 4.5891, + "step": 1931 + }, + { + "epoch": 0.33065206229676536, + 
"grad_norm": 36.71438217163086, + "learning_rate": 1.0992584141471763e-05, + "loss": 4.1564, + "step": 1932 + }, + { + "epoch": 0.3308232072565463, + "grad_norm": 12.194602012634277, + "learning_rate": 1.0998288648031946e-05, + "loss": 1.3654, + "step": 1933 + }, + { + "epoch": 0.3309943522163272, + "grad_norm": 30.5019474029541, + "learning_rate": 1.1003993154592128e-05, + "loss": 2.9248, + "step": 1934 + }, + { + "epoch": 0.33116549717610816, + "grad_norm": 30.596206665039062, + "learning_rate": 1.1009697661152311e-05, + "loss": 3.6483, + "step": 1935 + }, + { + "epoch": 0.3313366421358891, + "grad_norm": 190.34573364257812, + "learning_rate": 1.1015402167712493e-05, + "loss": 9.976, + "step": 1936 + }, + { + "epoch": 0.33150778709567, + "grad_norm": 23.65143585205078, + "learning_rate": 1.1021106674272677e-05, + "loss": 2.6501, + "step": 1937 + }, + { + "epoch": 0.33167893205545096, + "grad_norm": 32.524288177490234, + "learning_rate": 1.1026811180832858e-05, + "loss": 3.6287, + "step": 1938 + }, + { + "epoch": 0.3318500770152319, + "grad_norm": 24.90087890625, + "learning_rate": 1.103251568739304e-05, + "loss": 2.8126, + "step": 1939 + }, + { + "epoch": 0.3320212219750128, + "grad_norm": 11.670059204101562, + "learning_rate": 1.1038220193953223e-05, + "loss": 0.9268, + "step": 1940 + }, + { + "epoch": 0.33219236693479376, + "grad_norm": 20.560199737548828, + "learning_rate": 1.1043924700513405e-05, + "loss": 2.0298, + "step": 1941 + }, + { + "epoch": 0.3323635118945747, + "grad_norm": 32.11676788330078, + "learning_rate": 1.1049629207073588e-05, + "loss": 3.379, + "step": 1942 + }, + { + "epoch": 0.3325346568543556, + "grad_norm": 31.273881912231445, + "learning_rate": 1.1055333713633772e-05, + "loss": 3.6115, + "step": 1943 + }, + { + "epoch": 0.33270580181413656, + "grad_norm": 76.62176513671875, + "learning_rate": 1.1061038220193953e-05, + "loss": 6.2689, + "step": 1944 + }, + { + "epoch": 0.3328769467739175, + "grad_norm": 29.79790496826172, + "learning_rate": 1.1066742726754137e-05, + "loss": 2.9922, + "step": 1945 + }, + { + "epoch": 0.3330480917336984, + "grad_norm": 28.528804779052734, + "learning_rate": 1.1072447233314318e-05, + "loss": 3.192, + "step": 1946 + }, + { + "epoch": 0.33321923669347936, + "grad_norm": 101.99966430664062, + "learning_rate": 1.1078151739874502e-05, + "loss": 6.9582, + "step": 1947 + }, + { + "epoch": 0.3333903816532603, + "grad_norm": 33.45838165283203, + "learning_rate": 1.1083856246434684e-05, + "loss": 4.3572, + "step": 1948 + }, + { + "epoch": 0.3335615266130412, + "grad_norm": 31.591665267944336, + "learning_rate": 1.1089560752994867e-05, + "loss": 3.7906, + "step": 1949 + }, + { + "epoch": 0.33373267157282216, + "grad_norm": 42.0833740234375, + "learning_rate": 1.1095265259555049e-05, + "loss": 4.95, + "step": 1950 + }, + { + "epoch": 0.3339038165326031, + "grad_norm": 94.96964263916016, + "learning_rate": 1.110096976611523e-05, + "loss": 6.5888, + "step": 1951 + }, + { + "epoch": 0.334074961492384, + "grad_norm": 35.450111389160156, + "learning_rate": 1.1106674272675414e-05, + "loss": 4.8891, + "step": 1952 + }, + { + "epoch": 0.33424610645216496, + "grad_norm": 32.57542037963867, + "learning_rate": 1.1112378779235595e-05, + "loss": 4.2762, + "step": 1953 + }, + { + "epoch": 0.3344172514119459, + "grad_norm": 24.635988235473633, + "learning_rate": 1.1118083285795779e-05, + "loss": 2.6646, + "step": 1954 + }, + { + "epoch": 0.3345883963717268, + "grad_norm": 22.50608253479004, + "learning_rate": 1.1123787792355962e-05, + "loss": 2.1994, + 
"step": 1955 + }, + { + "epoch": 0.33475954133150776, + "grad_norm": 35.915611267089844, + "learning_rate": 1.1129492298916144e-05, + "loss": 4.3539, + "step": 1956 + }, + { + "epoch": 0.33493068629128875, + "grad_norm": 39.85637283325195, + "learning_rate": 1.1135196805476327e-05, + "loss": 8.0766, + "step": 1957 + }, + { + "epoch": 0.3351018312510697, + "grad_norm": 31.60897445678711, + "learning_rate": 1.1140901312036509e-05, + "loss": 3.5052, + "step": 1958 + }, + { + "epoch": 0.3352729762108506, + "grad_norm": 10.988346099853516, + "learning_rate": 1.1146605818596692e-05, + "loss": 2.0192, + "step": 1959 + }, + { + "epoch": 0.33544412117063155, + "grad_norm": 77.31686401367188, + "learning_rate": 1.1152310325156874e-05, + "loss": 6.7873, + "step": 1960 + }, + { + "epoch": 0.3356152661304125, + "grad_norm": 37.3287239074707, + "learning_rate": 1.1158014831717057e-05, + "loss": 4.5134, + "step": 1961 + }, + { + "epoch": 0.3357864110901934, + "grad_norm": 28.940874099731445, + "learning_rate": 1.1163719338277239e-05, + "loss": 3.5488, + "step": 1962 + }, + { + "epoch": 0.33595755604997435, + "grad_norm": 27.005020141601562, + "learning_rate": 1.116942384483742e-05, + "loss": 3.4131, + "step": 1963 + }, + { + "epoch": 0.3361287010097553, + "grad_norm": 23.171354293823242, + "learning_rate": 1.1175128351397604e-05, + "loss": 3.0202, + "step": 1964 + }, + { + "epoch": 0.3362998459695362, + "grad_norm": 33.08194351196289, + "learning_rate": 1.1180832857957786e-05, + "loss": 3.5406, + "step": 1965 + }, + { + "epoch": 0.33647099092931715, + "grad_norm": 42.914058685302734, + "learning_rate": 1.118653736451797e-05, + "loss": 7.7143, + "step": 1966 + }, + { + "epoch": 0.3366421358890981, + "grad_norm": 6.044030666351318, + "learning_rate": 1.1192241871078152e-05, + "loss": 1.0934, + "step": 1967 + }, + { + "epoch": 0.336813280848879, + "grad_norm": 13.652383804321289, + "learning_rate": 1.1197946377638336e-05, + "loss": 1.2611, + "step": 1968 + }, + { + "epoch": 0.33698442580865995, + "grad_norm": 120.25743103027344, + "learning_rate": 1.1203650884198518e-05, + "loss": 6.9692, + "step": 1969 + }, + { + "epoch": 0.3371555707684409, + "grad_norm": 138.58935546875, + "learning_rate": 1.12093553907587e-05, + "loss": 6.7316, + "step": 1970 + }, + { + "epoch": 0.3373267157282218, + "grad_norm": 30.030006408691406, + "learning_rate": 1.1215059897318883e-05, + "loss": 4.1817, + "step": 1971 + }, + { + "epoch": 0.33749786068800275, + "grad_norm": 9.535407066345215, + "learning_rate": 1.1220764403879064e-05, + "loss": 0.9512, + "step": 1972 + }, + { + "epoch": 0.3376690056477837, + "grad_norm": 25.748254776000977, + "learning_rate": 1.1226468910439248e-05, + "loss": 3.1973, + "step": 1973 + }, + { + "epoch": 0.3378401506075646, + "grad_norm": 29.184724807739258, + "learning_rate": 1.123217341699943e-05, + "loss": 3.5403, + "step": 1974 + }, + { + "epoch": 0.33801129556734555, + "grad_norm": 36.09633255004883, + "learning_rate": 1.1237877923559611e-05, + "loss": 4.1013, + "step": 1975 + }, + { + "epoch": 0.3381824405271265, + "grad_norm": 31.967252731323242, + "learning_rate": 1.1243582430119794e-05, + "loss": 3.2354, + "step": 1976 + }, + { + "epoch": 0.3383535854869074, + "grad_norm": 38.74686813354492, + "learning_rate": 1.1249286936679976e-05, + "loss": 4.5663, + "step": 1977 + }, + { + "epoch": 0.33852473044668835, + "grad_norm": 30.3746395111084, + "learning_rate": 1.1254991443240161e-05, + "loss": 3.3973, + "step": 1978 + }, + { + "epoch": 0.3386958754064693, + "grad_norm": 11.366987228393555, + 
"learning_rate": 1.1260695949800343e-05, + "loss": 0.8323, + "step": 1979 + }, + { + "epoch": 0.3388670203662502, + "grad_norm": 20.15157699584961, + "learning_rate": 1.1266400456360526e-05, + "loss": 1.5111, + "step": 1980 + }, + { + "epoch": 0.33903816532603115, + "grad_norm": 25.638330459594727, + "learning_rate": 1.1272104962920708e-05, + "loss": 2.7039, + "step": 1981 + }, + { + "epoch": 0.3392093102858121, + "grad_norm": 30.38153839111328, + "learning_rate": 1.127780946948089e-05, + "loss": 3.6275, + "step": 1982 + }, + { + "epoch": 0.339380455245593, + "grad_norm": 31.235469818115234, + "learning_rate": 1.1283513976041073e-05, + "loss": 4.032, + "step": 1983 + }, + { + "epoch": 0.33955160020537395, + "grad_norm": 36.95757293701172, + "learning_rate": 1.1289218482601255e-05, + "loss": 4.052, + "step": 1984 + }, + { + "epoch": 0.3397227451651549, + "grad_norm": 5.83810567855835, + "learning_rate": 1.1294922989161438e-05, + "loss": 0.7531, + "step": 1985 + }, + { + "epoch": 0.3398938901249358, + "grad_norm": 187.32872009277344, + "learning_rate": 1.130062749572162e-05, + "loss": 8.1223, + "step": 1986 + }, + { + "epoch": 0.34006503508471675, + "grad_norm": 10.221015930175781, + "learning_rate": 1.1306332002281803e-05, + "loss": 1.3128, + "step": 1987 + }, + { + "epoch": 0.3402361800444977, + "grad_norm": 23.46990203857422, + "learning_rate": 1.1312036508841985e-05, + "loss": 2.2877, + "step": 1988 + }, + { + "epoch": 0.3404073250042786, + "grad_norm": 204.71218872070312, + "learning_rate": 1.1317741015402168e-05, + "loss": 9.0911, + "step": 1989 + }, + { + "epoch": 0.34057846996405955, + "grad_norm": 11.691418647766113, + "learning_rate": 1.1323445521962352e-05, + "loss": 2.0669, + "step": 1990 + }, + { + "epoch": 0.3407496149238405, + "grad_norm": 34.32474899291992, + "learning_rate": 1.1329150028522533e-05, + "loss": 3.8131, + "step": 1991 + }, + { + "epoch": 0.3409207598836214, + "grad_norm": 15.316189765930176, + "learning_rate": 1.1334854535082717e-05, + "loss": 1.4449, + "step": 1992 + }, + { + "epoch": 0.34109190484340235, + "grad_norm": 33.847110748291016, + "learning_rate": 1.1340559041642898e-05, + "loss": 3.6209, + "step": 1993 + }, + { + "epoch": 0.3412630498031833, + "grad_norm": 30.83047103881836, + "learning_rate": 1.134626354820308e-05, + "loss": 3.3044, + "step": 1994 + }, + { + "epoch": 0.3414341947629642, + "grad_norm": 23.169050216674805, + "learning_rate": 1.1351968054763263e-05, + "loss": 2.7778, + "step": 1995 + }, + { + "epoch": 0.34160533972274515, + "grad_norm": 28.009946823120117, + "learning_rate": 1.1357672561323445e-05, + "loss": 2.5658, + "step": 1996 + }, + { + "epoch": 0.3417764846825261, + "grad_norm": 24.620206832885742, + "learning_rate": 1.1363377067883628e-05, + "loss": 2.8611, + "step": 1997 + }, + { + "epoch": 0.341947629642307, + "grad_norm": 35.302894592285156, + "learning_rate": 1.136908157444381e-05, + "loss": 3.8368, + "step": 1998 + }, + { + "epoch": 0.34211877460208795, + "grad_norm": 48.49169921875, + "learning_rate": 1.1374786081003993e-05, + "loss": 8.3039, + "step": 1999 + }, + { + "epoch": 0.3422899195618689, + "grad_norm": 26.473003387451172, + "learning_rate": 1.1380490587564177e-05, + "loss": 2.6571, + "step": 2000 + }, + { + "epoch": 0.3424610645216498, + "grad_norm": 8.975080490112305, + "learning_rate": 1.1386195094124359e-05, + "loss": 0.8311, + "step": 2001 + }, + { + "epoch": 0.34263220948143075, + "grad_norm": 29.154399871826172, + "learning_rate": 1.1391899600684542e-05, + "loss": 3.3092, + "step": 2002 + }, + { + "epoch": 
0.3428033544412117, + "grad_norm": 9.116958618164062, + "learning_rate": 1.1397604107244724e-05, + "loss": 1.109, + "step": 2003 + }, + { + "epoch": 0.3429744994009926, + "grad_norm": 150.9268341064453, + "learning_rate": 1.1403308613804907e-05, + "loss": 6.7063, + "step": 2004 + }, + { + "epoch": 0.34314564436077355, + "grad_norm": 28.97213363647461, + "learning_rate": 1.1409013120365089e-05, + "loss": 3.4316, + "step": 2005 + }, + { + "epoch": 0.3433167893205545, + "grad_norm": 35.343074798583984, + "learning_rate": 1.1414717626925272e-05, + "loss": 4.1921, + "step": 2006 + }, + { + "epoch": 0.34348793428033547, + "grad_norm": 26.21539306640625, + "learning_rate": 1.1420422133485454e-05, + "loss": 2.8775, + "step": 2007 + }, + { + "epoch": 0.3436590792401164, + "grad_norm": 24.8580322265625, + "learning_rate": 1.1426126640045635e-05, + "loss": 2.7428, + "step": 2008 + }, + { + "epoch": 0.34383022419989734, + "grad_norm": 18.229679107666016, + "learning_rate": 1.1431831146605819e-05, + "loss": 2.1508, + "step": 2009 + }, + { + "epoch": 0.34400136915967827, + "grad_norm": 12.01388168334961, + "learning_rate": 1.1437535653166e-05, + "loss": 1.002, + "step": 2010 + }, + { + "epoch": 0.3441725141194592, + "grad_norm": 101.5674819946289, + "learning_rate": 1.1443240159726184e-05, + "loss": 6.9708, + "step": 2011 + }, + { + "epoch": 0.34434365907924014, + "grad_norm": 135.65138244628906, + "learning_rate": 1.1448944666286367e-05, + "loss": 6.0953, + "step": 2012 + }, + { + "epoch": 0.34451480403902107, + "grad_norm": 28.10844612121582, + "learning_rate": 1.1454649172846549e-05, + "loss": 3.5016, + "step": 2013 + }, + { + "epoch": 0.344685948998802, + "grad_norm": 31.837894439697266, + "learning_rate": 1.1460353679406732e-05, + "loss": 3.2448, + "step": 2014 + }, + { + "epoch": 0.34485709395858294, + "grad_norm": 28.26076889038086, + "learning_rate": 1.1466058185966914e-05, + "loss": 3.1378, + "step": 2015 + }, + { + "epoch": 0.34502823891836387, + "grad_norm": 32.99501419067383, + "learning_rate": 1.1471762692527097e-05, + "loss": 3.4328, + "step": 2016 + }, + { + "epoch": 0.3451993838781448, + "grad_norm": 31.268230438232422, + "learning_rate": 1.1477467199087279e-05, + "loss": 4.0378, + "step": 2017 + }, + { + "epoch": 0.34537052883792574, + "grad_norm": 32.19254684448242, + "learning_rate": 1.1483171705647462e-05, + "loss": 4.356, + "step": 2018 + }, + { + "epoch": 0.34554167379770667, + "grad_norm": 28.953779220581055, + "learning_rate": 1.1488876212207644e-05, + "loss": 3.8967, + "step": 2019 + }, + { + "epoch": 0.3457128187574876, + "grad_norm": 26.264999389648438, + "learning_rate": 1.1494580718767826e-05, + "loss": 2.7881, + "step": 2020 + }, + { + "epoch": 0.34588396371726854, + "grad_norm": 21.80779457092285, + "learning_rate": 1.150028522532801e-05, + "loss": 2.0569, + "step": 2021 + }, + { + "epoch": 0.34605510867704947, + "grad_norm": 5.897726535797119, + "learning_rate": 1.1505989731888191e-05, + "loss": 0.6854, + "step": 2022 + }, + { + "epoch": 0.3462262536368304, + "grad_norm": 18.685945510864258, + "learning_rate": 1.1511694238448376e-05, + "loss": 1.7189, + "step": 2023 + }, + { + "epoch": 0.34639739859661134, + "grad_norm": 16.55164909362793, + "learning_rate": 1.1517398745008558e-05, + "loss": 1.6266, + "step": 2024 + }, + { + "epoch": 0.3465685435563923, + "grad_norm": 26.497346878051758, + "learning_rate": 1.152310325156874e-05, + "loss": 3.1355, + "step": 2025 + }, + { + "epoch": 0.3467396885161732, + "grad_norm": 36.22391128540039, + "learning_rate": 
1.1528807758128923e-05, + "loss": 4.2871, + "step": 2026 + }, + { + "epoch": 0.34691083347595414, + "grad_norm": 25.69757080078125, + "learning_rate": 1.1534512264689104e-05, + "loss": 2.4604, + "step": 2027 + }, + { + "epoch": 0.3470819784357351, + "grad_norm": 34.47371292114258, + "learning_rate": 1.1540216771249288e-05, + "loss": 4.5727, + "step": 2028 + }, + { + "epoch": 0.347253123395516, + "grad_norm": 25.829330444335938, + "learning_rate": 1.154592127780947e-05, + "loss": 2.3708, + "step": 2029 + }, + { + "epoch": 0.34742426835529694, + "grad_norm": 23.152074813842773, + "learning_rate": 1.1551625784369653e-05, + "loss": 2.5885, + "step": 2030 + }, + { + "epoch": 0.3475954133150779, + "grad_norm": 33.27009582519531, + "learning_rate": 1.1557330290929834e-05, + "loss": 4.0326, + "step": 2031 + }, + { + "epoch": 0.3477665582748588, + "grad_norm": 11.642922401428223, + "learning_rate": 1.1563034797490016e-05, + "loss": 1.3036, + "step": 2032 + }, + { + "epoch": 0.34793770323463974, + "grad_norm": 16.035924911499023, + "learning_rate": 1.15687393040502e-05, + "loss": 1.3584, + "step": 2033 + }, + { + "epoch": 0.3481088481944207, + "grad_norm": 38.5884895324707, + "learning_rate": 1.1574443810610381e-05, + "loss": 5.2381, + "step": 2034 + }, + { + "epoch": 0.3482799931542016, + "grad_norm": 34.79248046875, + "learning_rate": 1.1580148317170566e-05, + "loss": 3.4977, + "step": 2035 + }, + { + "epoch": 0.34845113811398254, + "grad_norm": 24.086618423461914, + "learning_rate": 1.1585852823730748e-05, + "loss": 2.489, + "step": 2036 + }, + { + "epoch": 0.3486222830737635, + "grad_norm": 17.970691680908203, + "learning_rate": 1.1591557330290931e-05, + "loss": 1.2174, + "step": 2037 + }, + { + "epoch": 0.3487934280335444, + "grad_norm": 27.199962615966797, + "learning_rate": 1.1597261836851113e-05, + "loss": 2.4304, + "step": 2038 + }, + { + "epoch": 0.34896457299332534, + "grad_norm": 36.157230377197266, + "learning_rate": 1.1602966343411295e-05, + "loss": 4.5914, + "step": 2039 + }, + { + "epoch": 0.3491357179531063, + "grad_norm": 30.98073387145996, + "learning_rate": 1.1608670849971478e-05, + "loss": 3.1108, + "step": 2040 + }, + { + "epoch": 0.3493068629128872, + "grad_norm": 4.110781192779541, + "learning_rate": 1.161437535653166e-05, + "loss": 0.6784, + "step": 2041 + }, + { + "epoch": 0.34947800787266814, + "grad_norm": 7.259744644165039, + "learning_rate": 1.1620079863091843e-05, + "loss": 0.7546, + "step": 2042 + }, + { + "epoch": 0.3496491528324491, + "grad_norm": 9.056280136108398, + "learning_rate": 1.1625784369652025e-05, + "loss": 0.8102, + "step": 2043 + }, + { + "epoch": 0.34982029779223, + "grad_norm": 17.079927444458008, + "learning_rate": 1.1631488876212207e-05, + "loss": 1.8825, + "step": 2044 + }, + { + "epoch": 0.34999144275201094, + "grad_norm": 5.583414077758789, + "learning_rate": 1.163719338277239e-05, + "loss": 0.6958, + "step": 2045 + }, + { + "epoch": 0.3501625877117919, + "grad_norm": 32.52211380004883, + "learning_rate": 1.1642897889332573e-05, + "loss": 3.8308, + "step": 2046 + }, + { + "epoch": 0.3503337326715728, + "grad_norm": 8.453152656555176, + "learning_rate": 1.1648602395892757e-05, + "loss": 0.9997, + "step": 2047 + }, + { + "epoch": 0.35050487763135374, + "grad_norm": 17.828163146972656, + "learning_rate": 1.1654306902452938e-05, + "loss": 2.0197, + "step": 2048 + }, + { + "epoch": 0.3506760225911347, + "grad_norm": 33.86958312988281, + "learning_rate": 1.1660011409013122e-05, + "loss": 3.5889, + "step": 2049 + }, + { + "epoch": 0.3508471675509156, + 
"grad_norm": 39.53785705566406, + "learning_rate": 1.1665715915573303e-05, + "loss": 4.3322, + "step": 2050 + }, + { + "epoch": 0.35101831251069654, + "grad_norm": 119.68132019042969, + "learning_rate": 1.1671420422133485e-05, + "loss": 8.5534, + "step": 2051 + }, + { + "epoch": 0.3511894574704775, + "grad_norm": 20.703731536865234, + "learning_rate": 1.1677124928693669e-05, + "loss": 1.9145, + "step": 2052 + }, + { + "epoch": 0.3513606024302584, + "grad_norm": 32.62479019165039, + "learning_rate": 1.168282943525385e-05, + "loss": 3.4411, + "step": 2053 + }, + { + "epoch": 0.35153174739003934, + "grad_norm": 28.38721466064453, + "learning_rate": 1.1688533941814034e-05, + "loss": 2.913, + "step": 2054 + }, + { + "epoch": 0.3517028923498203, + "grad_norm": 11.139078140258789, + "learning_rate": 1.1694238448374215e-05, + "loss": 1.2331, + "step": 2055 + }, + { + "epoch": 0.3518740373096012, + "grad_norm": 36.095458984375, + "learning_rate": 1.1699942954934399e-05, + "loss": 4.4497, + "step": 2056 + }, + { + "epoch": 0.35204518226938214, + "grad_norm": 17.7105655670166, + "learning_rate": 1.170564746149458e-05, + "loss": 1.341, + "step": 2057 + }, + { + "epoch": 0.35221632722916313, + "grad_norm": 34.70029067993164, + "learning_rate": 1.1711351968054764e-05, + "loss": 3.8577, + "step": 2058 + }, + { + "epoch": 0.35238747218894406, + "grad_norm": 30.967939376831055, + "learning_rate": 1.1717056474614947e-05, + "loss": 3.5998, + "step": 2059 + }, + { + "epoch": 0.352558617148725, + "grad_norm": 175.67909240722656, + "learning_rate": 1.1722760981175129e-05, + "loss": 7.5725, + "step": 2060 + }, + { + "epoch": 0.35272976210850593, + "grad_norm": 14.09093189239502, + "learning_rate": 1.1728465487735312e-05, + "loss": 1.1863, + "step": 2061 + }, + { + "epoch": 0.35290090706828686, + "grad_norm": 16.4505672454834, + "learning_rate": 1.1734169994295494e-05, + "loss": 1.3923, + "step": 2062 + }, + { + "epoch": 0.3530720520280678, + "grad_norm": 30.69254493713379, + "learning_rate": 1.1739874500855676e-05, + "loss": 4.0609, + "step": 2063 + }, + { + "epoch": 0.35324319698784873, + "grad_norm": 35.82154846191406, + "learning_rate": 1.1745579007415859e-05, + "loss": 4.1915, + "step": 2064 + }, + { + "epoch": 0.35341434194762966, + "grad_norm": 34.619754791259766, + "learning_rate": 1.175128351397604e-05, + "loss": 4.8903, + "step": 2065 + }, + { + "epoch": 0.3535854869074106, + "grad_norm": 13.456661224365234, + "learning_rate": 1.1756988020536224e-05, + "loss": 1.4971, + "step": 2066 + }, + { + "epoch": 0.35375663186719153, + "grad_norm": 34.76420974731445, + "learning_rate": 1.1762692527096406e-05, + "loss": 3.9249, + "step": 2067 + }, + { + "epoch": 0.35392777682697246, + "grad_norm": 11.180761337280273, + "learning_rate": 1.1768397033656589e-05, + "loss": 0.7, + "step": 2068 + }, + { + "epoch": 0.3540989217867534, + "grad_norm": 195.03485107421875, + "learning_rate": 1.1774101540216772e-05, + "loss": 6.9708, + "step": 2069 + }, + { + "epoch": 0.35427006674653433, + "grad_norm": 34.15081787109375, + "learning_rate": 1.1779806046776954e-05, + "loss": 4.0197, + "step": 2070 + }, + { + "epoch": 0.35444121170631526, + "grad_norm": 44.15553283691406, + "learning_rate": 1.1785510553337137e-05, + "loss": 8.034, + "step": 2071 + }, + { + "epoch": 0.3546123566660962, + "grad_norm": 36.1580924987793, + "learning_rate": 1.1791215059897319e-05, + "loss": 4.3774, + "step": 2072 + }, + { + "epoch": 0.35478350162587713, + "grad_norm": 37.583351135253906, + "learning_rate": 1.1796919566457503e-05, + "loss": 5.0443, + 
"step": 2073 + }, + { + "epoch": 0.35495464658565806, + "grad_norm": 7.443456172943115, + "learning_rate": 1.1802624073017684e-05, + "loss": 0.7081, + "step": 2074 + }, + { + "epoch": 0.355125791545439, + "grad_norm": 27.195236206054688, + "learning_rate": 1.1808328579577866e-05, + "loss": 2.7896, + "step": 2075 + }, + { + "epoch": 0.35529693650521993, + "grad_norm": 10.81725788116455, + "learning_rate": 1.181403308613805e-05, + "loss": 2.1049, + "step": 2076 + }, + { + "epoch": 0.35546808146500086, + "grad_norm": 32.889869689941406, + "learning_rate": 1.1819737592698231e-05, + "loss": 3.9205, + "step": 2077 + }, + { + "epoch": 0.3556392264247818, + "grad_norm": 119.37525939941406, + "learning_rate": 1.1825442099258414e-05, + "loss": 7.0729, + "step": 2078 + }, + { + "epoch": 0.35581037138456273, + "grad_norm": 13.211540222167969, + "learning_rate": 1.1831146605818596e-05, + "loss": 1.5046, + "step": 2079 + }, + { + "epoch": 0.35598151634434366, + "grad_norm": 29.677011489868164, + "learning_rate": 1.183685111237878e-05, + "loss": 3.4441, + "step": 2080 + }, + { + "epoch": 0.3561526613041246, + "grad_norm": 116.09097290039062, + "learning_rate": 1.1842555618938963e-05, + "loss": 6.9657, + "step": 2081 + }, + { + "epoch": 0.35632380626390553, + "grad_norm": 36.9529914855957, + "learning_rate": 1.1848260125499144e-05, + "loss": 5.1966, + "step": 2082 + }, + { + "epoch": 0.35649495122368646, + "grad_norm": 32.45378112792969, + "learning_rate": 1.1853964632059328e-05, + "loss": 3.8259, + "step": 2083 + }, + { + "epoch": 0.3566660961834674, + "grad_norm": 28.279193878173828, + "learning_rate": 1.185966913861951e-05, + "loss": 3.0802, + "step": 2084 + }, + { + "epoch": 0.35683724114324833, + "grad_norm": 16.36111831665039, + "learning_rate": 1.1865373645179693e-05, + "loss": 1.6254, + "step": 2085 + }, + { + "epoch": 0.35700838610302926, + "grad_norm": 33.62881851196289, + "learning_rate": 1.1871078151739875e-05, + "loss": 3.482, + "step": 2086 + }, + { + "epoch": 0.3571795310628102, + "grad_norm": 22.785282135009766, + "learning_rate": 1.1876782658300058e-05, + "loss": 2.5492, + "step": 2087 + }, + { + "epoch": 0.35735067602259113, + "grad_norm": 18.783733367919922, + "learning_rate": 1.188248716486024e-05, + "loss": 2.1471, + "step": 2088 + }, + { + "epoch": 0.35752182098237206, + "grad_norm": 25.175399780273438, + "learning_rate": 1.1888191671420421e-05, + "loss": 2.901, + "step": 2089 + }, + { + "epoch": 0.357692965942153, + "grad_norm": 32.070228576660156, + "learning_rate": 1.1893896177980605e-05, + "loss": 4.0126, + "step": 2090 + }, + { + "epoch": 0.35786411090193393, + "grad_norm": 30.165206909179688, + "learning_rate": 1.1899600684540786e-05, + "loss": 3.1196, + "step": 2091 + }, + { + "epoch": 0.35803525586171486, + "grad_norm": 25.695375442504883, + "learning_rate": 1.1905305191100971e-05, + "loss": 2.5124, + "step": 2092 + }, + { + "epoch": 0.3582064008214958, + "grad_norm": 7.505849838256836, + "learning_rate": 1.1911009697661153e-05, + "loss": 1.0043, + "step": 2093 + }, + { + "epoch": 0.35837754578127673, + "grad_norm": 28.15729522705078, + "learning_rate": 1.1916714204221335e-05, + "loss": 3.8256, + "step": 2094 + }, + { + "epoch": 0.35854869074105766, + "grad_norm": 15.077316284179688, + "learning_rate": 1.1922418710781518e-05, + "loss": 0.9039, + "step": 2095 + }, + { + "epoch": 0.3587198357008386, + "grad_norm": 11.068819999694824, + "learning_rate": 1.19281232173417e-05, + "loss": 0.9256, + "step": 2096 + }, + { + "epoch": 0.35889098066061953, + "grad_norm": 
30.34836769104004, + "learning_rate": 1.1933827723901883e-05, + "loss": 3.3198, + "step": 2097 + }, + { + "epoch": 0.35906212562040046, + "grad_norm": 92.60661315917969, + "learning_rate": 1.1939532230462065e-05, + "loss": 5.7395, + "step": 2098 + }, + { + "epoch": 0.3592332705801814, + "grad_norm": 26.518394470214844, + "learning_rate": 1.1945236737022248e-05, + "loss": 2.7506, + "step": 2099 + }, + { + "epoch": 0.35940441553996233, + "grad_norm": 4.0069780349731445, + "learning_rate": 1.195094124358243e-05, + "loss": 0.622, + "step": 2100 + }, + { + "epoch": 0.35957556049974326, + "grad_norm": 25.66058349609375, + "learning_rate": 1.1956645750142612e-05, + "loss": 2.4436, + "step": 2101 + }, + { + "epoch": 0.3597467054595242, + "grad_norm": 16.090246200561523, + "learning_rate": 1.1962350256702795e-05, + "loss": 1.4181, + "step": 2102 + }, + { + "epoch": 0.35991785041930513, + "grad_norm": 9.653539657592773, + "learning_rate": 1.1968054763262978e-05, + "loss": 1.1303, + "step": 2103 + }, + { + "epoch": 0.36008899537908606, + "grad_norm": 26.997007369995117, + "learning_rate": 1.1973759269823162e-05, + "loss": 2.8454, + "step": 2104 + }, + { + "epoch": 0.360260140338867, + "grad_norm": 35.292945861816406, + "learning_rate": 1.1979463776383344e-05, + "loss": 4.4265, + "step": 2105 + }, + { + "epoch": 0.36043128529864793, + "grad_norm": 9.962848663330078, + "learning_rate": 1.1985168282943527e-05, + "loss": 1.1083, + "step": 2106 + }, + { + "epoch": 0.36060243025842886, + "grad_norm": 21.34442138671875, + "learning_rate": 1.1990872789503709e-05, + "loss": 1.9815, + "step": 2107 + }, + { + "epoch": 0.3607735752182098, + "grad_norm": 63.102256774902344, + "learning_rate": 1.199657729606389e-05, + "loss": 8.2906, + "step": 2108 + }, + { + "epoch": 0.3609447201779908, + "grad_norm": 31.640159606933594, + "learning_rate": 1.2002281802624074e-05, + "loss": 3.9734, + "step": 2109 + }, + { + "epoch": 0.3611158651377717, + "grad_norm": 29.008909225463867, + "learning_rate": 1.2007986309184255e-05, + "loss": 2.8619, + "step": 2110 + }, + { + "epoch": 0.36128701009755265, + "grad_norm": 158.99563598632812, + "learning_rate": 1.2013690815744439e-05, + "loss": 8.8876, + "step": 2111 + }, + { + "epoch": 0.3614581550573336, + "grad_norm": 12.028635025024414, + "learning_rate": 1.201939532230462e-05, + "loss": 1.1747, + "step": 2112 + }, + { + "epoch": 0.3616293000171145, + "grad_norm": 49.29413986206055, + "learning_rate": 1.2025099828864802e-05, + "loss": 8.4677, + "step": 2113 + }, + { + "epoch": 0.36180044497689545, + "grad_norm": 35.586788177490234, + "learning_rate": 1.2030804335424985e-05, + "loss": 4.3141, + "step": 2114 + }, + { + "epoch": 0.3619715899366764, + "grad_norm": 15.967235565185547, + "learning_rate": 1.2036508841985169e-05, + "loss": 1.4648, + "step": 2115 + }, + { + "epoch": 0.3621427348964573, + "grad_norm": 116.31715393066406, + "learning_rate": 1.2042213348545352e-05, + "loss": 5.9115, + "step": 2116 + }, + { + "epoch": 0.36231387985623825, + "grad_norm": 39.9970703125, + "learning_rate": 1.2047917855105534e-05, + "loss": 5.2751, + "step": 2117 + }, + { + "epoch": 0.3624850248160192, + "grad_norm": 15.636171340942383, + "learning_rate": 1.2053622361665717e-05, + "loss": 1.1331, + "step": 2118 + }, + { + "epoch": 0.3626561697758001, + "grad_norm": 29.51291847229004, + "learning_rate": 1.2059326868225899e-05, + "loss": 3.0782, + "step": 2119 + }, + { + "epoch": 0.36282731473558105, + "grad_norm": 33.99169921875, + "learning_rate": 1.206503137478608e-05, + "loss": 3.4875, + "step": 
2120 + }, + { + "epoch": 0.362998459695362, + "grad_norm": 8.469818115234375, + "learning_rate": 1.2070735881346264e-05, + "loss": 0.9351, + "step": 2121 + }, + { + "epoch": 0.3631696046551429, + "grad_norm": 87.96151733398438, + "learning_rate": 1.2076440387906446e-05, + "loss": 5.0553, + "step": 2122 + }, + { + "epoch": 0.36334074961492385, + "grad_norm": 11.59670352935791, + "learning_rate": 1.2082144894466629e-05, + "loss": 1.314, + "step": 2123 + }, + { + "epoch": 0.3635118945747048, + "grad_norm": 7.859058856964111, + "learning_rate": 1.208784940102681e-05, + "loss": 0.9692, + "step": 2124 + }, + { + "epoch": 0.3636830395344857, + "grad_norm": 48.24964904785156, + "learning_rate": 1.2093553907586992e-05, + "loss": 5.6168, + "step": 2125 + }, + { + "epoch": 0.36385418449426665, + "grad_norm": 35.264366149902344, + "learning_rate": 1.2099258414147178e-05, + "loss": 3.7475, + "step": 2126 + }, + { + "epoch": 0.3640253294540476, + "grad_norm": 30.4807071685791, + "learning_rate": 1.210496292070736e-05, + "loss": 3.6681, + "step": 2127 + }, + { + "epoch": 0.3641964744138285, + "grad_norm": 37.583274841308594, + "learning_rate": 1.2110667427267543e-05, + "loss": 7.6763, + "step": 2128 + }, + { + "epoch": 0.36436761937360945, + "grad_norm": 10.553574562072754, + "learning_rate": 1.2116371933827724e-05, + "loss": 0.7119, + "step": 2129 + }, + { + "epoch": 0.3645387643333904, + "grad_norm": 25.893739700317383, + "learning_rate": 1.2122076440387908e-05, + "loss": 2.7102, + "step": 2130 + }, + { + "epoch": 0.3647099092931713, + "grad_norm": 37.81182861328125, + "learning_rate": 1.212778094694809e-05, + "loss": 7.7056, + "step": 2131 + }, + { + "epoch": 0.36488105425295225, + "grad_norm": 24.436336517333984, + "learning_rate": 1.2133485453508271e-05, + "loss": 3.0385, + "step": 2132 + }, + { + "epoch": 0.3650521992127332, + "grad_norm": 33.72613525390625, + "learning_rate": 1.2139189960068454e-05, + "loss": 3.625, + "step": 2133 + }, + { + "epoch": 0.3652233441725141, + "grad_norm": 29.429370880126953, + "learning_rate": 1.2144894466628636e-05, + "loss": 3.2735, + "step": 2134 + }, + { + "epoch": 0.36539448913229505, + "grad_norm": 29.37833595275879, + "learning_rate": 1.215059897318882e-05, + "loss": 3.3102, + "step": 2135 + }, + { + "epoch": 0.365565634092076, + "grad_norm": 4.678672790527344, + "learning_rate": 1.2156303479749001e-05, + "loss": 0.6167, + "step": 2136 + }, + { + "epoch": 0.3657367790518569, + "grad_norm": 13.350298881530762, + "learning_rate": 1.2162007986309185e-05, + "loss": 0.9838, + "step": 2137 + }, + { + "epoch": 0.36590792401163785, + "grad_norm": 197.19981384277344, + "learning_rate": 1.2167712492869368e-05, + "loss": 10.1095, + "step": 2138 + }, + { + "epoch": 0.3660790689714188, + "grad_norm": 33.24477767944336, + "learning_rate": 1.217341699942955e-05, + "loss": 3.6562, + "step": 2139 + }, + { + "epoch": 0.3662502139311997, + "grad_norm": 31.698823928833008, + "learning_rate": 1.2179121505989733e-05, + "loss": 3.1984, + "step": 2140 + }, + { + "epoch": 0.36642135889098065, + "grad_norm": 28.302553176879883, + "learning_rate": 1.2184826012549915e-05, + "loss": 2.9794, + "step": 2141 + }, + { + "epoch": 0.3665925038507616, + "grad_norm": 26.840988159179688, + "learning_rate": 1.2190530519110098e-05, + "loss": 3.1451, + "step": 2142 + }, + { + "epoch": 0.3667636488105425, + "grad_norm": 10.02106761932373, + "learning_rate": 1.219623502567028e-05, + "loss": 1.7728, + "step": 2143 + }, + { + "epoch": 0.36693479377032345, + "grad_norm": 19.4163761138916, + 
"learning_rate": 1.2201939532230461e-05, + "loss": 1.4892, + "step": 2144 + }, + { + "epoch": 0.3671059387301044, + "grad_norm": 117.40380096435547, + "learning_rate": 1.2207644038790645e-05, + "loss": 6.037, + "step": 2145 + }, + { + "epoch": 0.3672770836898853, + "grad_norm": 36.802330017089844, + "learning_rate": 1.2213348545350826e-05, + "loss": 3.9472, + "step": 2146 + }, + { + "epoch": 0.36744822864966625, + "grad_norm": 26.534914016723633, + "learning_rate": 1.221905305191101e-05, + "loss": 2.9076, + "step": 2147 + }, + { + "epoch": 0.3676193736094472, + "grad_norm": 8.252175331115723, + "learning_rate": 1.2224757558471192e-05, + "loss": 1.7274, + "step": 2148 + }, + { + "epoch": 0.3677905185692281, + "grad_norm": 36.72080993652344, + "learning_rate": 1.2230462065031377e-05, + "loss": 3.9691, + "step": 2149 + }, + { + "epoch": 0.36796166352900905, + "grad_norm": 31.389694213867188, + "learning_rate": 1.2236166571591558e-05, + "loss": 3.662, + "step": 2150 + }, + { + "epoch": 0.36813280848879, + "grad_norm": 17.889827728271484, + "learning_rate": 1.224187107815174e-05, + "loss": 1.3585, + "step": 2151 + }, + { + "epoch": 0.3683039534485709, + "grad_norm": 37.55808639526367, + "learning_rate": 1.2247575584711923e-05, + "loss": 3.733, + "step": 2152 + }, + { + "epoch": 0.36847509840835185, + "grad_norm": 28.830768585205078, + "learning_rate": 1.2253280091272105e-05, + "loss": 2.596, + "step": 2153 + }, + { + "epoch": 0.3686462433681328, + "grad_norm": 11.456624031066895, + "learning_rate": 1.2258984597832288e-05, + "loss": 0.6827, + "step": 2154 + }, + { + "epoch": 0.3688173883279137, + "grad_norm": 29.137744903564453, + "learning_rate": 1.226468910439247e-05, + "loss": 3.4631, + "step": 2155 + }, + { + "epoch": 0.36898853328769465, + "grad_norm": 27.315082550048828, + "learning_rate": 1.2270393610952653e-05, + "loss": 2.4743, + "step": 2156 + }, + { + "epoch": 0.3691596782474756, + "grad_norm": 15.013253211975098, + "learning_rate": 1.2276098117512835e-05, + "loss": 1.2812, + "step": 2157 + }, + { + "epoch": 0.3693308232072565, + "grad_norm": 33.02097702026367, + "learning_rate": 1.2281802624073017e-05, + "loss": 3.1825, + "step": 2158 + }, + { + "epoch": 0.36950196816703745, + "grad_norm": 37.75695037841797, + "learning_rate": 1.22875071306332e-05, + "loss": 3.9585, + "step": 2159 + }, + { + "epoch": 0.36967311312681844, + "grad_norm": 33.56565475463867, + "learning_rate": 1.2293211637193382e-05, + "loss": 3.9576, + "step": 2160 + }, + { + "epoch": 0.3698442580865994, + "grad_norm": 8.82251262664795, + "learning_rate": 1.2298916143753567e-05, + "loss": 1.045, + "step": 2161 + }, + { + "epoch": 0.3700154030463803, + "grad_norm": 26.975778579711914, + "learning_rate": 1.2304620650313749e-05, + "loss": 2.5674, + "step": 2162 + }, + { + "epoch": 0.37018654800616124, + "grad_norm": 136.73780822753906, + "learning_rate": 1.231032515687393e-05, + "loss": 5.8467, + "step": 2163 + }, + { + "epoch": 0.3703576929659422, + "grad_norm": 29.269546508789062, + "learning_rate": 1.2316029663434114e-05, + "loss": 2.9019, + "step": 2164 + }, + { + "epoch": 0.3705288379257231, + "grad_norm": 31.114402770996094, + "learning_rate": 1.2321734169994295e-05, + "loss": 3.803, + "step": 2165 + }, + { + "epoch": 0.37069998288550404, + "grad_norm": 28.02252769470215, + "learning_rate": 1.2327438676554479e-05, + "loss": 3.334, + "step": 2166 + }, + { + "epoch": 0.370871127845285, + "grad_norm": 36.24296951293945, + "learning_rate": 1.233314318311466e-05, + "loss": 4.0221, + "step": 2167 + }, + { + "epoch": 
0.3710422728050659, + "grad_norm": 42.49361801147461, + "learning_rate": 1.2338847689674844e-05, + "loss": 4.4893, + "step": 2168 + }, + { + "epoch": 0.37121341776484684, + "grad_norm": 31.110870361328125, + "learning_rate": 1.2344552196235026e-05, + "loss": 3.2998, + "step": 2169 + }, + { + "epoch": 0.3713845627246278, + "grad_norm": 38.54166030883789, + "learning_rate": 1.2350256702795207e-05, + "loss": 3.9307, + "step": 2170 + }, + { + "epoch": 0.3715557076844087, + "grad_norm": 31.027143478393555, + "learning_rate": 1.235596120935539e-05, + "loss": 3.3663, + "step": 2171 + }, + { + "epoch": 0.37172685264418964, + "grad_norm": 19.778564453125, + "learning_rate": 1.2361665715915574e-05, + "loss": 1.4132, + "step": 2172 + }, + { + "epoch": 0.3718979976039706, + "grad_norm": 6.935482025146484, + "learning_rate": 1.2367370222475757e-05, + "loss": 0.7138, + "step": 2173 + }, + { + "epoch": 0.3720691425637515, + "grad_norm": 17.002243041992188, + "learning_rate": 1.2373074729035939e-05, + "loss": 1.363, + "step": 2174 + }, + { + "epoch": 0.37224028752353244, + "grad_norm": 36.1330451965332, + "learning_rate": 1.237877923559612e-05, + "loss": 4.304, + "step": 2175 + }, + { + "epoch": 0.3724114324833134, + "grad_norm": 37.96760940551758, + "learning_rate": 1.2384483742156304e-05, + "loss": 4.1877, + "step": 2176 + }, + { + "epoch": 0.3725825774430943, + "grad_norm": 37.3785400390625, + "learning_rate": 1.2390188248716486e-05, + "loss": 4.2806, + "step": 2177 + }, + { + "epoch": 0.37275372240287524, + "grad_norm": 124.93565368652344, + "learning_rate": 1.239589275527667e-05, + "loss": 5.4911, + "step": 2178 + }, + { + "epoch": 0.3729248673626562, + "grad_norm": 28.42656707763672, + "learning_rate": 1.2401597261836851e-05, + "loss": 2.685, + "step": 2179 + }, + { + "epoch": 0.3730960123224371, + "grad_norm": 44.78040313720703, + "learning_rate": 1.2407301768397034e-05, + "loss": 8.0281, + "step": 2180 + }, + { + "epoch": 0.37326715728221804, + "grad_norm": 106.5615005493164, + "learning_rate": 1.2413006274957216e-05, + "loss": 9.7692, + "step": 2181 + }, + { + "epoch": 0.373438302241999, + "grad_norm": 32.70700454711914, + "learning_rate": 1.2418710781517398e-05, + "loss": 3.7167, + "step": 2182 + }, + { + "epoch": 0.3736094472017799, + "grad_norm": 27.95832633972168, + "learning_rate": 1.2424415288077583e-05, + "loss": 3.4558, + "step": 2183 + }, + { + "epoch": 0.37378059216156084, + "grad_norm": 51.62168502807617, + "learning_rate": 1.2430119794637764e-05, + "loss": 7.8843, + "step": 2184 + }, + { + "epoch": 0.3739517371213418, + "grad_norm": 22.549152374267578, + "learning_rate": 1.2435824301197948e-05, + "loss": 2.2902, + "step": 2185 + }, + { + "epoch": 0.3741228820811227, + "grad_norm": 49.26498031616211, + "learning_rate": 1.244152880775813e-05, + "loss": 7.9836, + "step": 2186 + }, + { + "epoch": 0.37429402704090364, + "grad_norm": 32.918434143066406, + "learning_rate": 1.2447233314318313e-05, + "loss": 3.7321, + "step": 2187 + }, + { + "epoch": 0.3744651720006846, + "grad_norm": 115.87164306640625, + "learning_rate": 1.2452937820878495e-05, + "loss": 6.1209, + "step": 2188 + }, + { + "epoch": 0.3746363169604655, + "grad_norm": 32.60509490966797, + "learning_rate": 1.2458642327438676e-05, + "loss": 4.3652, + "step": 2189 + }, + { + "epoch": 0.37480746192024644, + "grad_norm": 40.821720123291016, + "learning_rate": 1.246434683399886e-05, + "loss": 7.5982, + "step": 2190 + }, + { + "epoch": 0.3749786068800274, + "grad_norm": 30.804649353027344, + "learning_rate": 1.2470051340559041e-05, + 
"loss": 3.6332, + "step": 2191 + }, + { + "epoch": 0.3751497518398083, + "grad_norm": 28.10482406616211, + "learning_rate": 1.2475755847119225e-05, + "loss": 3.1805, + "step": 2192 + }, + { + "epoch": 0.37532089679958924, + "grad_norm": 5.394840240478516, + "learning_rate": 1.2481460353679406e-05, + "loss": 0.6131, + "step": 2193 + }, + { + "epoch": 0.3754920417593702, + "grad_norm": 22.42398452758789, + "learning_rate": 1.2487164860239588e-05, + "loss": 2.9095, + "step": 2194 + }, + { + "epoch": 0.3756631867191511, + "grad_norm": 31.861984252929688, + "learning_rate": 1.2492869366799773e-05, + "loss": 3.7872, + "step": 2195 + }, + { + "epoch": 0.37583433167893204, + "grad_norm": 30.0163631439209, + "learning_rate": 1.2498573873359955e-05, + "loss": 3.2556, + "step": 2196 + }, + { + "epoch": 0.376005476638713, + "grad_norm": 43.01797103881836, + "learning_rate": 1.2504278379920138e-05, + "loss": 7.4534, + "step": 2197 + }, + { + "epoch": 0.3761766215984939, + "grad_norm": 26.029483795166016, + "learning_rate": 1.250998288648032e-05, + "loss": 3.4138, + "step": 2198 + }, + { + "epoch": 0.37634776655827484, + "grad_norm": 31.733152389526367, + "learning_rate": 1.2515687393040503e-05, + "loss": 3.7965, + "step": 2199 + }, + { + "epoch": 0.3765189115180558, + "grad_norm": 29.86209487915039, + "learning_rate": 1.2521391899600685e-05, + "loss": 3.1073, + "step": 2200 + }, + { + "epoch": 0.3766900564778367, + "grad_norm": 73.94261932373047, + "learning_rate": 1.2527096406160867e-05, + "loss": 6.7022, + "step": 2201 + }, + { + "epoch": 0.37686120143761764, + "grad_norm": 33.266666412353516, + "learning_rate": 1.253280091272105e-05, + "loss": 3.467, + "step": 2202 + }, + { + "epoch": 0.3770323463973986, + "grad_norm": 9.25309944152832, + "learning_rate": 1.2538505419281232e-05, + "loss": 0.9735, + "step": 2203 + }, + { + "epoch": 0.3772034913571795, + "grad_norm": 32.7879753112793, + "learning_rate": 1.2544209925841415e-05, + "loss": 4.3873, + "step": 2204 + }, + { + "epoch": 0.37737463631696044, + "grad_norm": 38.24089813232422, + "learning_rate": 1.2549914432401597e-05, + "loss": 4.1272, + "step": 2205 + }, + { + "epoch": 0.3775457812767414, + "grad_norm": 11.10142707824707, + "learning_rate": 1.2555618938961782e-05, + "loss": 0.8028, + "step": 2206 + }, + { + "epoch": 0.3777169262365223, + "grad_norm": 37.619815826416016, + "learning_rate": 1.2561323445521963e-05, + "loss": 3.8663, + "step": 2207 + }, + { + "epoch": 0.37788807119630324, + "grad_norm": 43.338417053222656, + "learning_rate": 1.2567027952082145e-05, + "loss": 4.6084, + "step": 2208 + }, + { + "epoch": 0.3780592161560842, + "grad_norm": 29.597476959228516, + "learning_rate": 1.2572732458642329e-05, + "loss": 4.3275, + "step": 2209 + }, + { + "epoch": 0.3782303611158651, + "grad_norm": 111.00467681884766, + "learning_rate": 1.257843696520251e-05, + "loss": 6.2678, + "step": 2210 + }, + { + "epoch": 0.3784015060756461, + "grad_norm": 28.328218460083008, + "learning_rate": 1.2584141471762694e-05, + "loss": 3.7021, + "step": 2211 + }, + { + "epoch": 0.37857265103542703, + "grad_norm": 7.334059238433838, + "learning_rate": 1.2589845978322875e-05, + "loss": 1.0266, + "step": 2212 + }, + { + "epoch": 0.37874379599520797, + "grad_norm": 12.333498001098633, + "learning_rate": 1.2595550484883057e-05, + "loss": 1.2937, + "step": 2213 + }, + { + "epoch": 0.3789149409549889, + "grad_norm": 33.395259857177734, + "learning_rate": 1.260125499144324e-05, + "loss": 3.7564, + "step": 2214 + }, + { + "epoch": 0.37908608591476983, + "grad_norm": 
12.443466186523438, + "learning_rate": 1.2606959498003422e-05, + "loss": 1.1804, + "step": 2215 + }, + { + "epoch": 0.37925723087455077, + "grad_norm": 29.2781982421875, + "learning_rate": 1.2612664004563605e-05, + "loss": 2.8089, + "step": 2216 + }, + { + "epoch": 0.3794283758343317, + "grad_norm": 30.066843032836914, + "learning_rate": 1.2618368511123787e-05, + "loss": 3.678, + "step": 2217 + }, + { + "epoch": 0.37959952079411263, + "grad_norm": 198.62889099121094, + "learning_rate": 1.2624073017683972e-05, + "loss": 10.5079, + "step": 2218 + }, + { + "epoch": 0.37977066575389357, + "grad_norm": 36.29426574707031, + "learning_rate": 1.2629777524244154e-05, + "loss": 3.7594, + "step": 2219 + }, + { + "epoch": 0.3799418107136745, + "grad_norm": 4.288938522338867, + "learning_rate": 1.2635482030804336e-05, + "loss": 0.6002, + "step": 2220 + }, + { + "epoch": 0.38011295567345543, + "grad_norm": 16.282394409179688, + "learning_rate": 1.2641186537364519e-05, + "loss": 1.2398, + "step": 2221 + }, + { + "epoch": 0.38028410063323637, + "grad_norm": 15.423003196716309, + "learning_rate": 1.26468910439247e-05, + "loss": 1.0447, + "step": 2222 + }, + { + "epoch": 0.3804552455930173, + "grad_norm": 8.580951690673828, + "learning_rate": 1.2652595550484884e-05, + "loss": 1.0967, + "step": 2223 + }, + { + "epoch": 0.38062639055279823, + "grad_norm": 23.481037139892578, + "learning_rate": 1.2658300057045066e-05, + "loss": 3.0215, + "step": 2224 + }, + { + "epoch": 0.38079753551257917, + "grad_norm": 31.463350296020508, + "learning_rate": 1.2664004563605247e-05, + "loss": 3.9185, + "step": 2225 + }, + { + "epoch": 0.3809686804723601, + "grad_norm": 33.95023727416992, + "learning_rate": 1.266970907016543e-05, + "loss": 4.4252, + "step": 2226 + }, + { + "epoch": 0.38113982543214103, + "grad_norm": 32.201377868652344, + "learning_rate": 1.2675413576725612e-05, + "loss": 3.1638, + "step": 2227 + }, + { + "epoch": 0.38131097039192197, + "grad_norm": 33.09391784667969, + "learning_rate": 1.2681118083285796e-05, + "loss": 4.5716, + "step": 2228 + }, + { + "epoch": 0.3814821153517029, + "grad_norm": 89.28120422363281, + "learning_rate": 1.2686822589845979e-05, + "loss": 5.4798, + "step": 2229 + }, + { + "epoch": 0.38165326031148383, + "grad_norm": 18.636362075805664, + "learning_rate": 1.2692527096406163e-05, + "loss": 1.3417, + "step": 2230 + }, + { + "epoch": 0.38182440527126477, + "grad_norm": 108.82768249511719, + "learning_rate": 1.2698231602966344e-05, + "loss": 5.2101, + "step": 2231 + }, + { + "epoch": 0.3819955502310457, + "grad_norm": 32.57135009765625, + "learning_rate": 1.2703936109526526e-05, + "loss": 4.3203, + "step": 2232 + }, + { + "epoch": 0.38216669519082663, + "grad_norm": 33.27009963989258, + "learning_rate": 1.270964061608671e-05, + "loss": 3.9393, + "step": 2233 + }, + { + "epoch": 0.38233784015060757, + "grad_norm": 16.50580406188965, + "learning_rate": 1.2715345122646891e-05, + "loss": 1.5263, + "step": 2234 + }, + { + "epoch": 0.3825089851103885, + "grad_norm": 18.65876579284668, + "learning_rate": 1.2721049629207074e-05, + "loss": 1.948, + "step": 2235 + }, + { + "epoch": 0.38268013007016943, + "grad_norm": 28.283248901367188, + "learning_rate": 1.2726754135767256e-05, + "loss": 2.8414, + "step": 2236 + }, + { + "epoch": 0.38285127502995037, + "grad_norm": 118.61890411376953, + "learning_rate": 1.273245864232744e-05, + "loss": 7.9885, + "step": 2237 + }, + { + "epoch": 0.3830224199897313, + "grad_norm": 16.00472640991211, + "learning_rate": 1.2738163148887621e-05, + "loss": 1.4454, + 
"step": 2238 + }, + { + "epoch": 0.38319356494951223, + "grad_norm": 18.229719161987305, + "learning_rate": 1.2743867655447803e-05, + "loss": 1.5843, + "step": 2239 + }, + { + "epoch": 0.38336470990929317, + "grad_norm": 26.571413040161133, + "learning_rate": 1.2749572162007986e-05, + "loss": 2.6752, + "step": 2240 + }, + { + "epoch": 0.3835358548690741, + "grad_norm": 66.64990234375, + "learning_rate": 1.275527666856817e-05, + "loss": 5.1676, + "step": 2241 + }, + { + "epoch": 0.38370699982885503, + "grad_norm": 19.84005355834961, + "learning_rate": 1.2760981175128353e-05, + "loss": 1.4974, + "step": 2242 + }, + { + "epoch": 0.38387814478863597, + "grad_norm": 18.671689987182617, + "learning_rate": 1.2766685681688535e-05, + "loss": 1.9852, + "step": 2243 + }, + { + "epoch": 0.3840492897484169, + "grad_norm": 98.68587493896484, + "learning_rate": 1.2772390188248716e-05, + "loss": 4.9745, + "step": 2244 + }, + { + "epoch": 0.38422043470819783, + "grad_norm": 6.933028221130371, + "learning_rate": 1.27780946948089e-05, + "loss": 0.6802, + "step": 2245 + }, + { + "epoch": 0.38439157966797877, + "grad_norm": 18.11700439453125, + "learning_rate": 1.2783799201369081e-05, + "loss": 1.5278, + "step": 2246 + }, + { + "epoch": 0.3845627246277597, + "grad_norm": 18.046253204345703, + "learning_rate": 1.2789503707929265e-05, + "loss": 2.1576, + "step": 2247 + }, + { + "epoch": 0.38473386958754063, + "grad_norm": 44.326602935791016, + "learning_rate": 1.2795208214489446e-05, + "loss": 7.2696, + "step": 2248 + }, + { + "epoch": 0.38490501454732157, + "grad_norm": 95.7645034790039, + "learning_rate": 1.280091272104963e-05, + "loss": 6.0354, + "step": 2249 + }, + { + "epoch": 0.3850761595071025, + "grad_norm": 7.336085796356201, + "learning_rate": 1.2806617227609811e-05, + "loss": 1.6021, + "step": 2250 + }, + { + "epoch": 0.38524730446688343, + "grad_norm": 21.488544464111328, + "learning_rate": 1.2812321734169993e-05, + "loss": 1.9826, + "step": 2251 + }, + { + "epoch": 0.38541844942666437, + "grad_norm": 34.97186279296875, + "learning_rate": 1.2818026240730178e-05, + "loss": 4.3046, + "step": 2252 + }, + { + "epoch": 0.3855895943864453, + "grad_norm": 4.4676008224487305, + "learning_rate": 1.282373074729036e-05, + "loss": 0.6622, + "step": 2253 + }, + { + "epoch": 0.38576073934622623, + "grad_norm": 6.151776313781738, + "learning_rate": 1.2829435253850543e-05, + "loss": 0.6381, + "step": 2254 + }, + { + "epoch": 0.38593188430600717, + "grad_norm": 6.36190938949585, + "learning_rate": 1.2835139760410725e-05, + "loss": 0.6743, + "step": 2255 + }, + { + "epoch": 0.3861030292657881, + "grad_norm": 24.97540283203125, + "learning_rate": 1.2840844266970908e-05, + "loss": 2.6396, + "step": 2256 + }, + { + "epoch": 0.38627417422556903, + "grad_norm": 141.4521026611328, + "learning_rate": 1.284654877353109e-05, + "loss": 4.7593, + "step": 2257 + }, + { + "epoch": 0.38644531918534997, + "grad_norm": 16.442359924316406, + "learning_rate": 1.2852253280091272e-05, + "loss": 1.3891, + "step": 2258 + }, + { + "epoch": 0.3866164641451309, + "grad_norm": 26.524892807006836, + "learning_rate": 1.2857957786651455e-05, + "loss": 2.8351, + "step": 2259 + }, + { + "epoch": 0.38678760910491183, + "grad_norm": 20.284482955932617, + "learning_rate": 1.2863662293211637e-05, + "loss": 2.2276, + "step": 2260 + }, + { + "epoch": 0.3869587540646928, + "grad_norm": 13.217884063720703, + "learning_rate": 1.286936679977182e-05, + "loss": 0.9694, + "step": 2261 + }, + { + "epoch": 0.38712989902447376, + "grad_norm": 33.80121612548828, + 
"learning_rate": 1.2875071306332002e-05, + "loss": 4.1736, + "step": 2262 + }, + { + "epoch": 0.3873010439842547, + "grad_norm": 33.30670928955078, + "learning_rate": 1.2880775812892185e-05, + "loss": 3.5895, + "step": 2263 + }, + { + "epoch": 0.3874721889440356, + "grad_norm": 24.27392578125, + "learning_rate": 1.2886480319452369e-05, + "loss": 2.6142, + "step": 2264 + }, + { + "epoch": 0.38764333390381656, + "grad_norm": 4.387927055358887, + "learning_rate": 1.289218482601255e-05, + "loss": 0.5611, + "step": 2265 + }, + { + "epoch": 0.3878144788635975, + "grad_norm": 11.723445892333984, + "learning_rate": 1.2897889332572734e-05, + "loss": 1.9691, + "step": 2266 + }, + { + "epoch": 0.3879856238233784, + "grad_norm": 31.290142059326172, + "learning_rate": 1.2903593839132915e-05, + "loss": 4.4918, + "step": 2267 + }, + { + "epoch": 0.38815676878315936, + "grad_norm": 29.301557540893555, + "learning_rate": 1.2909298345693099e-05, + "loss": 3.0265, + "step": 2268 + }, + { + "epoch": 0.3883279137429403, + "grad_norm": 102.96603393554688, + "learning_rate": 1.291500285225328e-05, + "loss": 5.4491, + "step": 2269 + }, + { + "epoch": 0.3884990587027212, + "grad_norm": 6.566992282867432, + "learning_rate": 1.2920707358813462e-05, + "loss": 0.6619, + "step": 2270 + }, + { + "epoch": 0.38867020366250216, + "grad_norm": 20.63521385192871, + "learning_rate": 1.2926411865373645e-05, + "loss": 1.4227, + "step": 2271 + }, + { + "epoch": 0.3888413486222831, + "grad_norm": 35.605445861816406, + "learning_rate": 1.2932116371933827e-05, + "loss": 3.0135, + "step": 2272 + }, + { + "epoch": 0.389012493582064, + "grad_norm": 26.535554885864258, + "learning_rate": 1.293782087849401e-05, + "loss": 2.8951, + "step": 2273 + }, + { + "epoch": 0.38918363854184496, + "grad_norm": 216.86865234375, + "learning_rate": 1.2943525385054192e-05, + "loss": 9.7157, + "step": 2274 + }, + { + "epoch": 0.3893547835016259, + "grad_norm": 30.289108276367188, + "learning_rate": 1.2949229891614376e-05, + "loss": 3.7199, + "step": 2275 + }, + { + "epoch": 0.3895259284614068, + "grad_norm": 70.54218292236328, + "learning_rate": 1.2954934398174559e-05, + "loss": 8.7961, + "step": 2276 + }, + { + "epoch": 0.38969707342118776, + "grad_norm": 37.42404556274414, + "learning_rate": 1.296063890473474e-05, + "loss": 4.6408, + "step": 2277 + }, + { + "epoch": 0.3898682183809687, + "grad_norm": 20.272388458251953, + "learning_rate": 1.2966343411294924e-05, + "loss": 2.2787, + "step": 2278 + }, + { + "epoch": 0.3900393633407496, + "grad_norm": 21.717552185058594, + "learning_rate": 1.2972047917855106e-05, + "loss": 2.6312, + "step": 2279 + }, + { + "epoch": 0.39021050830053056, + "grad_norm": 27.405563354492188, + "learning_rate": 1.2977752424415289e-05, + "loss": 2.922, + "step": 2280 + }, + { + "epoch": 0.3903816532603115, + "grad_norm": 9.014309883117676, + "learning_rate": 1.298345693097547e-05, + "loss": 0.7394, + "step": 2281 + }, + { + "epoch": 0.3905527982200924, + "grad_norm": 34.70540237426758, + "learning_rate": 1.2989161437535652e-05, + "loss": 4.5773, + "step": 2282 + }, + { + "epoch": 0.39072394317987336, + "grad_norm": 17.615568161010742, + "learning_rate": 1.2994865944095836e-05, + "loss": 1.3793, + "step": 2283 + }, + { + "epoch": 0.3908950881396543, + "grad_norm": 9.69536018371582, + "learning_rate": 1.3000570450656018e-05, + "loss": 1.4154, + "step": 2284 + }, + { + "epoch": 0.3910662330994352, + "grad_norm": 42.174076080322266, + "learning_rate": 1.3006274957216201e-05, + "loss": 3.7989, + "step": 2285 + }, + { + "epoch": 
0.39123737805921616, + "grad_norm": 23.85903549194336, + "learning_rate": 1.3011979463776384e-05, + "loss": 2.7904, + "step": 2286 + }, + { + "epoch": 0.3914085230189971, + "grad_norm": 12.722695350646973, + "learning_rate": 1.3017683970336568e-05, + "loss": 0.9675, + "step": 2287 + }, + { + "epoch": 0.391579667978778, + "grad_norm": 29.10125160217285, + "learning_rate": 1.302338847689675e-05, + "loss": 3.3556, + "step": 2288 + }, + { + "epoch": 0.39175081293855896, + "grad_norm": 28.335847854614258, + "learning_rate": 1.3029092983456931e-05, + "loss": 3.6913, + "step": 2289 + }, + { + "epoch": 0.3919219578983399, + "grad_norm": 27.098371505737305, + "learning_rate": 1.3034797490017114e-05, + "loss": 3.4171, + "step": 2290 + }, + { + "epoch": 0.3920931028581208, + "grad_norm": 24.61624526977539, + "learning_rate": 1.3040501996577296e-05, + "loss": 2.5601, + "step": 2291 + }, + { + "epoch": 0.39226424781790176, + "grad_norm": 36.29865264892578, + "learning_rate": 1.304620650313748e-05, + "loss": 4.1205, + "step": 2292 + }, + { + "epoch": 0.3924353927776827, + "grad_norm": Infinity, + "learning_rate": 1.304620650313748e-05, + "loss": 10.2854, + "step": 2293 + }, + { + "epoch": 0.3926065377374636, + "grad_norm": 43.4719352722168, + "learning_rate": 1.3051911009697661e-05, + "loss": 5.7968, + "step": 2294 + }, + { + "epoch": 0.39277768269724456, + "grad_norm": 34.03304672241211, + "learning_rate": 1.3057615516257843e-05, + "loss": 3.2482, + "step": 2295 + }, + { + "epoch": 0.3929488276570255, + "grad_norm": 28.92998695373535, + "learning_rate": 1.3063320022818026e-05, + "loss": 3.1048, + "step": 2296 + }, + { + "epoch": 0.3931199726168064, + "grad_norm": 30.764570236206055, + "learning_rate": 1.3069024529378208e-05, + "loss": 3.4204, + "step": 2297 + }, + { + "epoch": 0.39329111757658736, + "grad_norm": 30.185405731201172, + "learning_rate": 1.3074729035938391e-05, + "loss": 3.6159, + "step": 2298 + }, + { + "epoch": 0.3934622625363683, + "grad_norm": 15.160475730895996, + "learning_rate": 1.3080433542498575e-05, + "loss": 1.0351, + "step": 2299 + }, + { + "epoch": 0.3936334074961492, + "grad_norm": 30.460662841796875, + "learning_rate": 1.3086138049058758e-05, + "loss": 3.0676, + "step": 2300 + }, + { + "epoch": 0.39380455245593016, + "grad_norm": 31.176111221313477, + "learning_rate": 1.309184255561894e-05, + "loss": 3.07, + "step": 2301 + }, + { + "epoch": 0.3939756974157111, + "grad_norm": 40.287208557128906, + "learning_rate": 1.3097547062179121e-05, + "loss": 4.0389, + "step": 2302 + }, + { + "epoch": 0.394146842375492, + "grad_norm": 31.603471755981445, + "learning_rate": 1.3103251568739305e-05, + "loss": 3.1424, + "step": 2303 + }, + { + "epoch": 0.39431798733527296, + "grad_norm": 27.959386825561523, + "learning_rate": 1.3108956075299486e-05, + "loss": 2.7499, + "step": 2304 + }, + { + "epoch": 0.3944891322950539, + "grad_norm": 9.44000244140625, + "learning_rate": 1.311466058185967e-05, + "loss": 0.6843, + "step": 2305 + }, + { + "epoch": 0.3946602772548348, + "grad_norm": 31.026531219482422, + "learning_rate": 1.3120365088419852e-05, + "loss": 3.0951, + "step": 2306 + }, + { + "epoch": 0.39483142221461576, + "grad_norm": 28.429651260375977, + "learning_rate": 1.3126069594980035e-05, + "loss": 3.1954, + "step": 2307 + }, + { + "epoch": 0.3950025671743967, + "grad_norm": 36.807884216308594, + "learning_rate": 1.3131774101540217e-05, + "loss": 3.5692, + "step": 2308 + }, + { + "epoch": 0.3951737121341776, + "grad_norm": 27.523998260498047, + "learning_rate": 1.3137478608100398e-05, + 
"loss": 3.04, + "step": 2309 + }, + { + "epoch": 0.39534485709395856, + "grad_norm": 22.569734573364258, + "learning_rate": 1.3143183114660583e-05, + "loss": 2.6063, + "step": 2310 + }, + { + "epoch": 0.3955160020537395, + "grad_norm": 30.23894691467285, + "learning_rate": 1.3148887621220765e-05, + "loss": 2.6877, + "step": 2311 + }, + { + "epoch": 0.3956871470135205, + "grad_norm": 32.485286712646484, + "learning_rate": 1.3154592127780948e-05, + "loss": 4.2341, + "step": 2312 + }, + { + "epoch": 0.3958582919733014, + "grad_norm": 9.512272834777832, + "learning_rate": 1.316029663434113e-05, + "loss": 0.7438, + "step": 2313 + }, + { + "epoch": 0.39602943693308235, + "grad_norm": 30.39967918395996, + "learning_rate": 1.3166001140901312e-05, + "loss": 2.678, + "step": 2314 + }, + { + "epoch": 0.3962005818928633, + "grad_norm": 26.347349166870117, + "learning_rate": 1.3171705647461495e-05, + "loss": 2.6173, + "step": 2315 + }, + { + "epoch": 0.3963717268526442, + "grad_norm": 11.27676010131836, + "learning_rate": 1.3177410154021677e-05, + "loss": 0.9184, + "step": 2316 + }, + { + "epoch": 0.39654287181242515, + "grad_norm": 28.942106246948242, + "learning_rate": 1.318311466058186e-05, + "loss": 2.7751, + "step": 2317 + }, + { + "epoch": 0.3967140167722061, + "grad_norm": 155.31259155273438, + "learning_rate": 1.3188819167142042e-05, + "loss": 7.0884, + "step": 2318 + }, + { + "epoch": 0.396885161731987, + "grad_norm": 15.048434257507324, + "learning_rate": 1.3194523673702225e-05, + "loss": 1.0894, + "step": 2319 + }, + { + "epoch": 0.39705630669176795, + "grad_norm": 29.555904388427734, + "learning_rate": 1.3200228180262407e-05, + "loss": 3.0407, + "step": 2320 + }, + { + "epoch": 0.3972274516515489, + "grad_norm": 22.9705810546875, + "learning_rate": 1.3205932686822589e-05, + "loss": 2.1862, + "step": 2321 + }, + { + "epoch": 0.3973985966113298, + "grad_norm": 31.04474449157715, + "learning_rate": 1.3211637193382774e-05, + "loss": 3.5704, + "step": 2322 + }, + { + "epoch": 0.39756974157111075, + "grad_norm": 38.25536346435547, + "learning_rate": 1.3217341699942955e-05, + "loss": 3.7726, + "step": 2323 + }, + { + "epoch": 0.3977408865308917, + "grad_norm": 24.22712516784668, + "learning_rate": 1.3223046206503139e-05, + "loss": 2.9952, + "step": 2324 + }, + { + "epoch": 0.3979120314906726, + "grad_norm": 32.82272720336914, + "learning_rate": 1.322875071306332e-05, + "loss": 3.313, + "step": 2325 + }, + { + "epoch": 0.39808317645045355, + "grad_norm": 29.25124168395996, + "learning_rate": 1.3234455219623502e-05, + "loss": 2.9707, + "step": 2326 + }, + { + "epoch": 0.3982543214102345, + "grad_norm": 42.494041442871094, + "learning_rate": 1.3240159726183686e-05, + "loss": 4.4698, + "step": 2327 + }, + { + "epoch": 0.3984254663700154, + "grad_norm": 32.5220947265625, + "learning_rate": 1.3245864232743867e-05, + "loss": 3.7016, + "step": 2328 + }, + { + "epoch": 0.39859661132979635, + "grad_norm": 4.652500629425049, + "learning_rate": 1.325156873930405e-05, + "loss": 0.5571, + "step": 2329 + }, + { + "epoch": 0.3987677562895773, + "grad_norm": 136.5018768310547, + "learning_rate": 1.3257273245864232e-05, + "loss": 5.8926, + "step": 2330 + }, + { + "epoch": 0.3989389012493582, + "grad_norm": 51.504051208496094, + "learning_rate": 1.3262977752424416e-05, + "loss": 5.6096, + "step": 2331 + }, + { + "epoch": 0.39911004620913915, + "grad_norm": 18.578340530395508, + "learning_rate": 1.3268682258984597e-05, + "loss": 2.2877, + "step": 2332 + }, + { + "epoch": 0.3992811911689201, + "grad_norm": 
26.573881149291992, + "learning_rate": 1.327438676554478e-05, + "loss": 3.3675, + "step": 2333 + }, + { + "epoch": 0.399452336128701, + "grad_norm": 28.39176368713379, + "learning_rate": 1.3280091272104964e-05, + "loss": 3.4595, + "step": 2334 + }, + { + "epoch": 0.39962348108848195, + "grad_norm": 27.315298080444336, + "learning_rate": 1.3285795778665146e-05, + "loss": 3.7229, + "step": 2335 + }, + { + "epoch": 0.3997946260482629, + "grad_norm": 34.018280029296875, + "learning_rate": 1.329150028522533e-05, + "loss": 3.7833, + "step": 2336 + }, + { + "epoch": 0.3999657710080438, + "grad_norm": 35.161949157714844, + "learning_rate": 1.3297204791785511e-05, + "loss": 3.6256, + "step": 2337 + }, + { + "epoch": 0.40013691596782475, + "grad_norm": 54.34180450439453, + "learning_rate": 1.3302909298345694e-05, + "loss": 8.2972, + "step": 2338 + }, + { + "epoch": 0.4003080609276057, + "grad_norm": 37.41242980957031, + "learning_rate": 1.3308613804905876e-05, + "loss": 4.1554, + "step": 2339 + }, + { + "epoch": 0.4004792058873866, + "grad_norm": 24.117671966552734, + "learning_rate": 1.3314318311466058e-05, + "loss": 2.6779, + "step": 2340 + }, + { + "epoch": 0.40065035084716755, + "grad_norm": 75.865234375, + "learning_rate": 1.3320022818026241e-05, + "loss": 5.2769, + "step": 2341 + }, + { + "epoch": 0.4008214958069485, + "grad_norm": 10.710969924926758, + "learning_rate": 1.3325727324586423e-05, + "loss": 0.8464, + "step": 2342 + }, + { + "epoch": 0.4009926407667294, + "grad_norm": 32.42598342895508, + "learning_rate": 1.3331431831146606e-05, + "loss": 4.2176, + "step": 2343 + }, + { + "epoch": 0.40116378572651035, + "grad_norm": 31.665010452270508, + "learning_rate": 1.333713633770679e-05, + "loss": 3.6966, + "step": 2344 + }, + { + "epoch": 0.4013349306862913, + "grad_norm": 19.899494171142578, + "learning_rate": 1.3342840844266971e-05, + "loss": 1.5844, + "step": 2345 + }, + { + "epoch": 0.4015060756460722, + "grad_norm": 26.72218132019043, + "learning_rate": 1.3348545350827155e-05, + "loss": 2.6391, + "step": 2346 + }, + { + "epoch": 0.40167722060585315, + "grad_norm": 67.58808135986328, + "learning_rate": 1.3354249857387336e-05, + "loss": 4.8317, + "step": 2347 + }, + { + "epoch": 0.4018483655656341, + "grad_norm": 35.947166442871094, + "learning_rate": 1.335995436394752e-05, + "loss": 4.4359, + "step": 2348 + }, + { + "epoch": 0.402019510525415, + "grad_norm": 33.10310745239258, + "learning_rate": 1.3365658870507701e-05, + "loss": 4.2287, + "step": 2349 + }, + { + "epoch": 0.40219065548519595, + "grad_norm": 26.962339401245117, + "learning_rate": 1.3371363377067885e-05, + "loss": 2.8973, + "step": 2350 + }, + { + "epoch": 0.4023618004449769, + "grad_norm": 10.832432746887207, + "learning_rate": 1.3377067883628066e-05, + "loss": 1.1334, + "step": 2351 + }, + { + "epoch": 0.4025329454047578, + "grad_norm": 3.8930623531341553, + "learning_rate": 1.3382772390188248e-05, + "loss": 0.5691, + "step": 2352 + }, + { + "epoch": 0.40270409036453875, + "grad_norm": 30.617422103881836, + "learning_rate": 1.3388476896748431e-05, + "loss": 4.1985, + "step": 2353 + }, + { + "epoch": 0.4028752353243197, + "grad_norm": 29.522432327270508, + "learning_rate": 1.3394181403308613e-05, + "loss": 3.3929, + "step": 2354 + }, + { + "epoch": 0.4030463802841006, + "grad_norm": 29.46415901184082, + "learning_rate": 1.3399885909868796e-05, + "loss": 3.0491, + "step": 2355 + }, + { + "epoch": 0.40321752524388155, + "grad_norm": 28.462308883666992, + "learning_rate": 1.340559041642898e-05, + "loss": 3.4012, + "step": 
2356 + }, + { + "epoch": 0.4033886702036625, + "grad_norm": 26.383548736572266, + "learning_rate": 1.3411294922989163e-05, + "loss": 2.8104, + "step": 2357 + }, + { + "epoch": 0.4035598151634434, + "grad_norm": 5.228971004486084, + "learning_rate": 1.3416999429549345e-05, + "loss": 0.6984, + "step": 2358 + }, + { + "epoch": 0.40373096012322435, + "grad_norm": 65.2656021118164, + "learning_rate": 1.3422703936109527e-05, + "loss": 4.842, + "step": 2359 + }, + { + "epoch": 0.4039021050830053, + "grad_norm": 9.332406044006348, + "learning_rate": 1.342840844266971e-05, + "loss": 1.1261, + "step": 2360 + }, + { + "epoch": 0.4040732500427862, + "grad_norm": 19.021203994750977, + "learning_rate": 1.3434112949229892e-05, + "loss": 2.1595, + "step": 2361 + }, + { + "epoch": 0.40424439500256715, + "grad_norm": 26.664148330688477, + "learning_rate": 1.3439817455790075e-05, + "loss": 3.1312, + "step": 2362 + }, + { + "epoch": 0.40441553996234814, + "grad_norm": 14.464133262634277, + "learning_rate": 1.3445521962350257e-05, + "loss": 1.3528, + "step": 2363 + }, + { + "epoch": 0.40458668492212907, + "grad_norm": 25.62092399597168, + "learning_rate": 1.3451226468910438e-05, + "loss": 2.6933, + "step": 2364 + }, + { + "epoch": 0.40475782988191, + "grad_norm": 30.344446182250977, + "learning_rate": 1.3456930975470622e-05, + "loss": 3.5816, + "step": 2365 + }, + { + "epoch": 0.40492897484169094, + "grad_norm": 34.0131950378418, + "learning_rate": 1.3462635482030803e-05, + "loss": 3.8169, + "step": 2366 + }, + { + "epoch": 0.40510011980147187, + "grad_norm": 25.75012969970703, + "learning_rate": 1.3468339988590989e-05, + "loss": 2.9735, + "step": 2367 + }, + { + "epoch": 0.4052712647612528, + "grad_norm": 31.590328216552734, + "learning_rate": 1.347404449515117e-05, + "loss": 3.4589, + "step": 2368 + }, + { + "epoch": 0.40544240972103374, + "grad_norm": 24.881752014160156, + "learning_rate": 1.3479749001711354e-05, + "loss": 2.5041, + "step": 2369 + }, + { + "epoch": 0.40561355468081467, + "grad_norm": 20.611392974853516, + "learning_rate": 1.3485453508271535e-05, + "loss": 2.0903, + "step": 2370 + }, + { + "epoch": 0.4057846996405956, + "grad_norm": 35.99172592163086, + "learning_rate": 1.3491158014831717e-05, + "loss": 4.3005, + "step": 2371 + }, + { + "epoch": 0.40595584460037654, + "grad_norm": 44.53636932373047, + "learning_rate": 1.34968625213919e-05, + "loss": 7.8911, + "step": 2372 + }, + { + "epoch": 0.40612698956015747, + "grad_norm": 41.1456298828125, + "learning_rate": 1.3502567027952082e-05, + "loss": 7.4041, + "step": 2373 + }, + { + "epoch": 0.4062981345199384, + "grad_norm": 24.72629737854004, + "learning_rate": 1.3508271534512265e-05, + "loss": 2.0511, + "step": 2374 + }, + { + "epoch": 0.40646927947971934, + "grad_norm": 9.164275169372559, + "learning_rate": 1.3513976041072447e-05, + "loss": 1.0127, + "step": 2375 + }, + { + "epoch": 0.40664042443950027, + "grad_norm": 55.97251892089844, + "learning_rate": 1.351968054763263e-05, + "loss": 7.7883, + "step": 2376 + }, + { + "epoch": 0.4068115693992812, + "grad_norm": 15.729819297790527, + "learning_rate": 1.3525385054192812e-05, + "loss": 1.3747, + "step": 2377 + }, + { + "epoch": 0.40698271435906214, + "grad_norm": 31.85474967956543, + "learning_rate": 1.3531089560752994e-05, + "loss": 3.7341, + "step": 2378 + }, + { + "epoch": 0.40715385931884307, + "grad_norm": 32.369163513183594, + "learning_rate": 1.3536794067313179e-05, + "loss": 3.4044, + "step": 2379 + }, + { + "epoch": 0.407325004278624, + "grad_norm": 26.481473922729492, + 
"learning_rate": 1.354249857387336e-05, + "loss": 2.7264, + "step": 2380 + }, + { + "epoch": 0.40749614923840494, + "grad_norm": 36.87574005126953, + "learning_rate": 1.3548203080433544e-05, + "loss": 7.1091, + "step": 2381 + }, + { + "epoch": 0.40766729419818587, + "grad_norm": 34.68164825439453, + "learning_rate": 1.3553907586993726e-05, + "loss": 3.5182, + "step": 2382 + }, + { + "epoch": 0.4078384391579668, + "grad_norm": 4.539041042327881, + "learning_rate": 1.3559612093553907e-05, + "loss": 0.5712, + "step": 2383 + }, + { + "epoch": 0.40800958411774774, + "grad_norm": 18.264692306518555, + "learning_rate": 1.356531660011409e-05, + "loss": 1.6827, + "step": 2384 + }, + { + "epoch": 0.40818072907752867, + "grad_norm": 58.49655532836914, + "learning_rate": 1.3571021106674272e-05, + "loss": 4.7984, + "step": 2385 + }, + { + "epoch": 0.4083518740373096, + "grad_norm": 38.31999969482422, + "learning_rate": 1.3576725613234456e-05, + "loss": 4.9844, + "step": 2386 + }, + { + "epoch": 0.40852301899709054, + "grad_norm": 31.779747009277344, + "learning_rate": 1.3582430119794637e-05, + "loss": 4.109, + "step": 2387 + }, + { + "epoch": 0.40869416395687147, + "grad_norm": 28.318117141723633, + "learning_rate": 1.358813462635482e-05, + "loss": 3.1768, + "step": 2388 + }, + { + "epoch": 0.4088653089166524, + "grad_norm": 109.76797485351562, + "learning_rate": 1.3593839132915003e-05, + "loss": 8.1629, + "step": 2389 + }, + { + "epoch": 0.40903645387643334, + "grad_norm": 29.490888595581055, + "learning_rate": 1.3599543639475186e-05, + "loss": 2.8556, + "step": 2390 + }, + { + "epoch": 0.40920759883621427, + "grad_norm": 59.6926383972168, + "learning_rate": 1.360524814603537e-05, + "loss": 7.2838, + "step": 2391 + }, + { + "epoch": 0.4093787437959952, + "grad_norm": 26.968727111816406, + "learning_rate": 1.3610952652595551e-05, + "loss": 3.1073, + "step": 2392 + }, + { + "epoch": 0.40954988875577614, + "grad_norm": 14.444951057434082, + "learning_rate": 1.3616657159155734e-05, + "loss": 1.1151, + "step": 2393 + }, + { + "epoch": 0.40972103371555707, + "grad_norm": 27.179691314697266, + "learning_rate": 1.3622361665715916e-05, + "loss": 3.4778, + "step": 2394 + }, + { + "epoch": 0.409892178675338, + "grad_norm": 28.209474563598633, + "learning_rate": 1.3628066172276098e-05, + "loss": 3.1061, + "step": 2395 + }, + { + "epoch": 0.41006332363511894, + "grad_norm": 28.115158081054688, + "learning_rate": 1.3633770678836281e-05, + "loss": 3.3303, + "step": 2396 + }, + { + "epoch": 0.41023446859489987, + "grad_norm": 33.9571418762207, + "learning_rate": 1.3639475185396463e-05, + "loss": 3.476, + "step": 2397 + }, + { + "epoch": 0.4104056135546808, + "grad_norm": 99.95455932617188, + "learning_rate": 1.3645179691956646e-05, + "loss": 4.891, + "step": 2398 + }, + { + "epoch": 0.41057675851446174, + "grad_norm": 32.09910583496094, + "learning_rate": 1.3650884198516828e-05, + "loss": 4.5344, + "step": 2399 + }, + { + "epoch": 0.41074790347424267, + "grad_norm": 22.752981185913086, + "learning_rate": 1.3656588705077011e-05, + "loss": 2.5455, + "step": 2400 + }, + { + "epoch": 0.4109190484340236, + "grad_norm": 31.16071128845215, + "learning_rate": 1.3662293211637193e-05, + "loss": 3.5245, + "step": 2401 + }, + { + "epoch": 0.41109019339380454, + "grad_norm": 16.054365158081055, + "learning_rate": 1.3667997718197376e-05, + "loss": 1.1714, + "step": 2402 + }, + { + "epoch": 0.41126133835358547, + "grad_norm": 61.82563018798828, + "learning_rate": 1.367370222475756e-05, + "loss": 4.4332, + "step": 2403 + }, + { + 
"epoch": 0.4114324833133664, + "grad_norm": 25.521482467651367, + "learning_rate": 1.3679406731317741e-05, + "loss": 3.1523, + "step": 2404 + }, + { + "epoch": 0.41160362827314734, + "grad_norm": 28.02633285522461, + "learning_rate": 1.3685111237877925e-05, + "loss": 3.3077, + "step": 2405 + }, + { + "epoch": 0.41177477323292827, + "grad_norm": 31.012575149536133, + "learning_rate": 1.3690815744438106e-05, + "loss": 3.2338, + "step": 2406 + }, + { + "epoch": 0.4119459181927092, + "grad_norm": 24.693798065185547, + "learning_rate": 1.369652025099829e-05, + "loss": 3.1159, + "step": 2407 + }, + { + "epoch": 0.41211706315249014, + "grad_norm": 28.928600311279297, + "learning_rate": 1.3702224757558471e-05, + "loss": 3.231, + "step": 2408 + }, + { + "epoch": 0.41228820811227107, + "grad_norm": 30.929235458374023, + "learning_rate": 1.3707929264118653e-05, + "loss": 3.7399, + "step": 2409 + }, + { + "epoch": 0.412459353072052, + "grad_norm": 29.809967041015625, + "learning_rate": 1.3713633770678837e-05, + "loss": 2.8299, + "step": 2410 + }, + { + "epoch": 0.41263049803183294, + "grad_norm": 34.67237091064453, + "learning_rate": 1.3719338277239018e-05, + "loss": 3.7692, + "step": 2411 + }, + { + "epoch": 0.41280164299161387, + "grad_norm": 29.03022575378418, + "learning_rate": 1.3725042783799202e-05, + "loss": 3.1456, + "step": 2412 + }, + { + "epoch": 0.4129727879513948, + "grad_norm": 27.838979721069336, + "learning_rate": 1.3730747290359385e-05, + "loss": 2.7944, + "step": 2413 + }, + { + "epoch": 0.4131439329111758, + "grad_norm": 27.87117576599121, + "learning_rate": 1.3736451796919567e-05, + "loss": 2.9217, + "step": 2414 + }, + { + "epoch": 0.4133150778709567, + "grad_norm": 34.504295349121094, + "learning_rate": 1.374215630347975e-05, + "loss": 3.9797, + "step": 2415 + }, + { + "epoch": 0.41348622283073766, + "grad_norm": 21.570331573486328, + "learning_rate": 1.3747860810039932e-05, + "loss": 2.3607, + "step": 2416 + }, + { + "epoch": 0.4136573677905186, + "grad_norm": 132.19834899902344, + "learning_rate": 1.3753565316600115e-05, + "loss": 7.9895, + "step": 2417 + }, + { + "epoch": 0.4138285127502995, + "grad_norm": 29.5281925201416, + "learning_rate": 1.3759269823160297e-05, + "loss": 3.3183, + "step": 2418 + }, + { + "epoch": 0.41399965771008046, + "grad_norm": 7.409353256225586, + "learning_rate": 1.376497432972048e-05, + "loss": 0.6978, + "step": 2419 + }, + { + "epoch": 0.4141708026698614, + "grad_norm": 23.6326847076416, + "learning_rate": 1.3770678836280662e-05, + "loss": 2.4085, + "step": 2420 + }, + { + "epoch": 0.4143419476296423, + "grad_norm": 6.584427356719971, + "learning_rate": 1.3776383342840844e-05, + "loss": 0.7725, + "step": 2421 + }, + { + "epoch": 0.41451309258942326, + "grad_norm": 5.124080181121826, + "learning_rate": 1.3782087849401027e-05, + "loss": 0.5998, + "step": 2422 + }, + { + "epoch": 0.4146842375492042, + "grad_norm": 129.28781127929688, + "learning_rate": 1.3787792355961209e-05, + "loss": 5.2033, + "step": 2423 + }, + { + "epoch": 0.4148553825089851, + "grad_norm": 30.348461151123047, + "learning_rate": 1.3793496862521394e-05, + "loss": 3.4905, + "step": 2424 + }, + { + "epoch": 0.41502652746876606, + "grad_norm": 25.107507705688477, + "learning_rate": 1.3799201369081575e-05, + "loss": 2.4536, + "step": 2425 + }, + { + "epoch": 0.415197672428547, + "grad_norm": 20.649410247802734, + "learning_rate": 1.3804905875641757e-05, + "loss": 2.11, + "step": 2426 + }, + { + "epoch": 0.4153688173883279, + "grad_norm": 31.82566261291504, + "learning_rate": 
1.381061038220194e-05, + "loss": 3.4624, + "step": 2427 + }, + { + "epoch": 0.41553996234810886, + "grad_norm": 25.216468811035156, + "learning_rate": 1.3816314888762122e-05, + "loss": 2.7103, + "step": 2428 + }, + { + "epoch": 0.4157111073078898, + "grad_norm": 62.44169616699219, + "learning_rate": 1.3822019395322305e-05, + "loss": 7.9435, + "step": 2429 + }, + { + "epoch": 0.4158822522676707, + "grad_norm": 14.46311092376709, + "learning_rate": 1.3827723901882487e-05, + "loss": 1.3546, + "step": 2430 + }, + { + "epoch": 0.41605339722745166, + "grad_norm": 21.584251403808594, + "learning_rate": 1.383342840844267e-05, + "loss": 2.3481, + "step": 2431 + }, + { + "epoch": 0.4162245421872326, + "grad_norm": 28.41043472290039, + "learning_rate": 1.3839132915002852e-05, + "loss": 3.242, + "step": 2432 + }, + { + "epoch": 0.4163956871470135, + "grad_norm": 57.48540496826172, + "learning_rate": 1.3844837421563034e-05, + "loss": 8.205, + "step": 2433 + }, + { + "epoch": 0.41656683210679446, + "grad_norm": 20.560029983520508, + "learning_rate": 1.3850541928123217e-05, + "loss": 2.4152, + "step": 2434 + }, + { + "epoch": 0.4167379770665754, + "grad_norm": 29.860027313232422, + "learning_rate": 1.3856246434683399e-05, + "loss": 2.9946, + "step": 2435 + }, + { + "epoch": 0.4169091220263563, + "grad_norm": 34.29914855957031, + "learning_rate": 1.3861950941243584e-05, + "loss": 4.0154, + "step": 2436 + }, + { + "epoch": 0.41708026698613726, + "grad_norm": 31.778980255126953, + "learning_rate": 1.3867655447803766e-05, + "loss": 3.279, + "step": 2437 + }, + { + "epoch": 0.4172514119459182, + "grad_norm": 30.92992401123047, + "learning_rate": 1.3873359954363949e-05, + "loss": 3.7128, + "step": 2438 + }, + { + "epoch": 0.4174225569056991, + "grad_norm": 30.067113876342773, + "learning_rate": 1.387906446092413e-05, + "loss": 3.3294, + "step": 2439 + }, + { + "epoch": 0.41759370186548006, + "grad_norm": 78.94349670410156, + "learning_rate": 1.3884768967484312e-05, + "loss": 4.5151, + "step": 2440 + }, + { + "epoch": 0.417764846825261, + "grad_norm": 35.60622787475586, + "learning_rate": 1.3890473474044496e-05, + "loss": 3.3575, + "step": 2441 + }, + { + "epoch": 0.4179359917850419, + "grad_norm": 29.288429260253906, + "learning_rate": 1.3896177980604678e-05, + "loss": 3.0086, + "step": 2442 + }, + { + "epoch": 0.41810713674482286, + "grad_norm": 29.106294631958008, + "learning_rate": 1.3901882487164861e-05, + "loss": 3.228, + "step": 2443 + }, + { + "epoch": 0.4182782817046038, + "grad_norm": 20.533992767333984, + "learning_rate": 1.3907586993725043e-05, + "loss": 2.2425, + "step": 2444 + }, + { + "epoch": 0.4184494266643847, + "grad_norm": 26.775163650512695, + "learning_rate": 1.3913291500285224e-05, + "loss": 2.6486, + "step": 2445 + }, + { + "epoch": 0.41862057162416566, + "grad_norm": 23.887187957763672, + "learning_rate": 1.3918996006845408e-05, + "loss": 2.0254, + "step": 2446 + }, + { + "epoch": 0.4187917165839466, + "grad_norm": 32.54766082763672, + "learning_rate": 1.3924700513405591e-05, + "loss": 4.0827, + "step": 2447 + }, + { + "epoch": 0.4189628615437275, + "grad_norm": 24.691999435424805, + "learning_rate": 1.3930405019965774e-05, + "loss": 2.7637, + "step": 2448 + }, + { + "epoch": 0.41913400650350846, + "grad_norm": 36.446842193603516, + "learning_rate": 1.3936109526525956e-05, + "loss": 7.2209, + "step": 2449 + }, + { + "epoch": 0.4193051514632894, + "grad_norm": 24.245582580566406, + "learning_rate": 1.394181403308614e-05, + "loss": 2.5936, + "step": 2450 + }, + { + "epoch": 
0.4194762964230703, + "grad_norm": 34.520198822021484, + "learning_rate": 1.3947518539646321e-05, + "loss": 3.7529, + "step": 2451 + }, + { + "epoch": 0.41964744138285126, + "grad_norm": 34.79539489746094, + "learning_rate": 1.3953223046206503e-05, + "loss": 4.0982, + "step": 2452 + }, + { + "epoch": 0.4198185863426322, + "grad_norm": 6.38947057723999, + "learning_rate": 1.3958927552766686e-05, + "loss": 0.5817, + "step": 2453 + }, + { + "epoch": 0.4199897313024131, + "grad_norm": 35.33879852294922, + "learning_rate": 1.3964632059326868e-05, + "loss": 3.3258, + "step": 2454 + }, + { + "epoch": 0.42016087626219406, + "grad_norm": 8.833622932434082, + "learning_rate": 1.3970336565887051e-05, + "loss": 1.1178, + "step": 2455 + }, + { + "epoch": 0.420332021221975, + "grad_norm": 25.07313346862793, + "learning_rate": 1.3976041072447233e-05, + "loss": 2.7209, + "step": 2456 + }, + { + "epoch": 0.4205031661817559, + "grad_norm": 30.224679946899414, + "learning_rate": 1.3981745579007416e-05, + "loss": 3.4578, + "step": 2457 + }, + { + "epoch": 0.42067431114153686, + "grad_norm": 26.421674728393555, + "learning_rate": 1.3987450085567598e-05, + "loss": 2.6023, + "step": 2458 + }, + { + "epoch": 0.4208454561013178, + "grad_norm": 33.97099685668945, + "learning_rate": 1.3993154592127781e-05, + "loss": 4.066, + "step": 2459 + }, + { + "epoch": 0.4210166010610987, + "grad_norm": 28.07750701904297, + "learning_rate": 1.3998859098687965e-05, + "loss": 3.1308, + "step": 2460 + }, + { + "epoch": 0.42118774602087966, + "grad_norm": 33.50989532470703, + "learning_rate": 1.4004563605248146e-05, + "loss": 3.461, + "step": 2461 + }, + { + "epoch": 0.4213588909806606, + "grad_norm": 16.63654136657715, + "learning_rate": 1.401026811180833e-05, + "loss": 1.3419, + "step": 2462 + }, + { + "epoch": 0.4215300359404415, + "grad_norm": 16.42656707763672, + "learning_rate": 1.4015972618368512e-05, + "loss": 1.8682, + "step": 2463 + }, + { + "epoch": 0.42170118090022246, + "grad_norm": 30.457616806030273, + "learning_rate": 1.4021677124928693e-05, + "loss": 3.1266, + "step": 2464 + }, + { + "epoch": 0.42187232586000345, + "grad_norm": 20.4791202545166, + "learning_rate": 1.4027381631488877e-05, + "loss": 2.2995, + "step": 2465 + }, + { + "epoch": 0.4220434708197844, + "grad_norm": 8.99075698852539, + "learning_rate": 1.4033086138049058e-05, + "loss": 1.6982, + "step": 2466 + }, + { + "epoch": 0.4222146157795653, + "grad_norm": 57.64451599121094, + "learning_rate": 1.4038790644609242e-05, + "loss": 8.0313, + "step": 2467 + }, + { + "epoch": 0.42238576073934625, + "grad_norm": 13.558103561401367, + "learning_rate": 1.4044495151169423e-05, + "loss": 1.2578, + "step": 2468 + }, + { + "epoch": 0.4225569056991272, + "grad_norm": 21.366905212402344, + "learning_rate": 1.4050199657729607e-05, + "loss": 2.2006, + "step": 2469 + }, + { + "epoch": 0.4227280506589081, + "grad_norm": 14.984084129333496, + "learning_rate": 1.405590416428979e-05, + "loss": 1.1421, + "step": 2470 + }, + { + "epoch": 0.42289919561868905, + "grad_norm": 37.106781005859375, + "learning_rate": 1.4061608670849972e-05, + "loss": 3.448, + "step": 2471 + }, + { + "epoch": 0.42307034057847, + "grad_norm": 103.56417083740234, + "learning_rate": 1.4067313177410155e-05, + "loss": 8.869, + "step": 2472 + }, + { + "epoch": 0.4232414855382509, + "grad_norm": 34.15910339355469, + "learning_rate": 1.4073017683970337e-05, + "loss": 3.99, + "step": 2473 + }, + { + "epoch": 0.42341263049803185, + "grad_norm": 9.371402740478516, + "learning_rate": 1.407872219053052e-05, + 
"loss": 1.7126, + "step": 2474 + }, + { + "epoch": 0.4235837754578128, + "grad_norm": 5.610677719116211, + "learning_rate": 1.4084426697090702e-05, + "loss": 0.6139, + "step": 2475 + }, + { + "epoch": 0.4237549204175937, + "grad_norm": 25.18387222290039, + "learning_rate": 1.4090131203650885e-05, + "loss": 3.0951, + "step": 2476 + }, + { + "epoch": 0.42392606537737465, + "grad_norm": 21.81611442565918, + "learning_rate": 1.4095835710211067e-05, + "loss": 2.1739, + "step": 2477 + }, + { + "epoch": 0.4240972103371556, + "grad_norm": 37.39387893676758, + "learning_rate": 1.4101540216771249e-05, + "loss": 4.5252, + "step": 2478 + }, + { + "epoch": 0.4242683552969365, + "grad_norm": 5.823449611663818, + "learning_rate": 1.4107244723331432e-05, + "loss": 0.6128, + "step": 2479 + }, + { + "epoch": 0.42443950025671745, + "grad_norm": 31.69689178466797, + "learning_rate": 1.4112949229891614e-05, + "loss": 2.9986, + "step": 2480 + }, + { + "epoch": 0.4246106452164984, + "grad_norm": 35.987152099609375, + "learning_rate": 1.4118653736451797e-05, + "loss": 3.4619, + "step": 2481 + }, + { + "epoch": 0.4247817901762793, + "grad_norm": 16.255069732666016, + "learning_rate": 1.412435824301198e-05, + "loss": 1.2687, + "step": 2482 + }, + { + "epoch": 0.42495293513606025, + "grad_norm": 38.878501892089844, + "learning_rate": 1.4130062749572162e-05, + "loss": 4.4326, + "step": 2483 + }, + { + "epoch": 0.4251240800958412, + "grad_norm": 33.7603759765625, + "learning_rate": 1.4135767256132346e-05, + "loss": 3.6103, + "step": 2484 + }, + { + "epoch": 0.4252952250556221, + "grad_norm": 30.415058135986328, + "learning_rate": 1.4141471762692527e-05, + "loss": 2.9553, + "step": 2485 + }, + { + "epoch": 0.42546637001540305, + "grad_norm": 18.42668914794922, + "learning_rate": 1.414717626925271e-05, + "loss": 1.6697, + "step": 2486 + }, + { + "epoch": 0.425637514975184, + "grad_norm": 28.910137176513672, + "learning_rate": 1.4152880775812892e-05, + "loss": 2.7775, + "step": 2487 + }, + { + "epoch": 0.4258086599349649, + "grad_norm": 24.642555236816406, + "learning_rate": 1.4158585282373076e-05, + "loss": 2.6428, + "step": 2488 + }, + { + "epoch": 0.42597980489474585, + "grad_norm": 6.298614501953125, + "learning_rate": 1.4164289788933257e-05, + "loss": 0.5965, + "step": 2489 + }, + { + "epoch": 0.4261509498545268, + "grad_norm": 134.2201690673828, + "learning_rate": 1.4169994295493439e-05, + "loss": 5.7153, + "step": 2490 + }, + { + "epoch": 0.4263220948143077, + "grad_norm": 29.22636604309082, + "learning_rate": 1.4175698802053622e-05, + "loss": 3.1392, + "step": 2491 + }, + { + "epoch": 0.42649323977408865, + "grad_norm": 29.945589065551758, + "learning_rate": 1.4181403308613804e-05, + "loss": 2.9141, + "step": 2492 + }, + { + "epoch": 0.4266643847338696, + "grad_norm": 122.40117645263672, + "learning_rate": 1.418710781517399e-05, + "loss": 9.1376, + "step": 2493 + }, + { + "epoch": 0.4268355296936505, + "grad_norm": 12.278093338012695, + "learning_rate": 1.4192812321734171e-05, + "loss": 0.8576, + "step": 2494 + }, + { + "epoch": 0.42700667465343145, + "grad_norm": 29.33226776123047, + "learning_rate": 1.4198516828294353e-05, + "loss": 2.9443, + "step": 2495 + }, + { + "epoch": 0.4271778196132124, + "grad_norm": 31.89412498474121, + "learning_rate": 1.4204221334854536e-05, + "loss": 3.7789, + "step": 2496 + }, + { + "epoch": 0.4273489645729933, + "grad_norm": 30.404138565063477, + "learning_rate": 1.4209925841414718e-05, + "loss": 4.0024, + "step": 2497 + }, + { + "epoch": 0.42752010953277425, + "grad_norm": 
7.538527488708496, + "learning_rate": 1.4215630347974901e-05, + "loss": 0.6812, + "step": 2498 + }, + { + "epoch": 0.4276912544925552, + "grad_norm": 194.7794952392578, + "learning_rate": 1.4221334854535083e-05, + "loss": 10.7557, + "step": 2499 + }, + { + "epoch": 0.4278623994523361, + "grad_norm": 27.38447380065918, + "learning_rate": 1.4227039361095266e-05, + "loss": 3.0669, + "step": 2500 + }, + { + "epoch": 0.42803354441211705, + "grad_norm": 36.52588653564453, + "learning_rate": 1.4232743867655448e-05, + "loss": 3.7922, + "step": 2501 + }, + { + "epoch": 0.428204689371898, + "grad_norm": 14.776211738586426, + "learning_rate": 1.423844837421563e-05, + "loss": 1.0695, + "step": 2502 + }, + { + "epoch": 0.4283758343316789, + "grad_norm": 22.516334533691406, + "learning_rate": 1.4244152880775813e-05, + "loss": 2.175, + "step": 2503 + }, + { + "epoch": 0.42854697929145985, + "grad_norm": 31.414302825927734, + "learning_rate": 1.4249857387335995e-05, + "loss": 3.5488, + "step": 2504 + }, + { + "epoch": 0.4287181242512408, + "grad_norm": 20.823116302490234, + "learning_rate": 1.425556189389618e-05, + "loss": 2.0128, + "step": 2505 + }, + { + "epoch": 0.4288892692110217, + "grad_norm": 33.47979736328125, + "learning_rate": 1.4261266400456361e-05, + "loss": 3.6721, + "step": 2506 + }, + { + "epoch": 0.42906041417080265, + "grad_norm": 33.771358489990234, + "learning_rate": 1.4266970907016545e-05, + "loss": 4.2083, + "step": 2507 + }, + { + "epoch": 0.4292315591305836, + "grad_norm": 21.674623489379883, + "learning_rate": 1.4272675413576726e-05, + "loss": 1.8789, + "step": 2508 + }, + { + "epoch": 0.4294027040903645, + "grad_norm": 31.44987678527832, + "learning_rate": 1.4278379920136908e-05, + "loss": 3.6812, + "step": 2509 + }, + { + "epoch": 0.42957384905014545, + "grad_norm": 9.912192344665527, + "learning_rate": 1.4284084426697091e-05, + "loss": 1.2073, + "step": 2510 + }, + { + "epoch": 0.4297449940099264, + "grad_norm": 26.342119216918945, + "learning_rate": 1.4289788933257273e-05, + "loss": 2.4193, + "step": 2511 + }, + { + "epoch": 0.4299161389697073, + "grad_norm": 57.646331787109375, + "learning_rate": 1.4295493439817456e-05, + "loss": 8.0588, + "step": 2512 + }, + { + "epoch": 0.43008728392948825, + "grad_norm": 25.247426986694336, + "learning_rate": 1.4301197946377638e-05, + "loss": 2.5184, + "step": 2513 + }, + { + "epoch": 0.4302584288892692, + "grad_norm": 21.471519470214844, + "learning_rate": 1.430690245293782e-05, + "loss": 2.2334, + "step": 2514 + }, + { + "epoch": 0.4304295738490501, + "grad_norm": 25.605525970458984, + "learning_rate": 1.4312606959498003e-05, + "loss": 2.584, + "step": 2515 + }, + { + "epoch": 0.4306007188088311, + "grad_norm": 34.87372589111328, + "learning_rate": 1.4318311466058187e-05, + "loss": 3.7447, + "step": 2516 + }, + { + "epoch": 0.43077186376861204, + "grad_norm": 28.899642944335938, + "learning_rate": 1.432401597261837e-05, + "loss": 2.9852, + "step": 2517 + }, + { + "epoch": 0.430943008728393, + "grad_norm": 24.084014892578125, + "learning_rate": 1.4329720479178552e-05, + "loss": 2.5703, + "step": 2518 + }, + { + "epoch": 0.4311141536881739, + "grad_norm": 13.15533447265625, + "learning_rate": 1.4335424985738735e-05, + "loss": 0.8629, + "step": 2519 + }, + { + "epoch": 0.43128529864795484, + "grad_norm": 100.28350067138672, + "learning_rate": 1.4341129492298917e-05, + "loss": 5.5365, + "step": 2520 + }, + { + "epoch": 0.4314564436077358, + "grad_norm": 25.63288116455078, + "learning_rate": 1.4346833998859098e-05, + "loss": 2.2759, + "step": 
2521 + }, + { + "epoch": 0.4316275885675167, + "grad_norm": 4.588881969451904, + "learning_rate": 1.4352538505419282e-05, + "loss": 0.5143, + "step": 2522 + }, + { + "epoch": 0.43179873352729764, + "grad_norm": 31.304664611816406, + "learning_rate": 1.4358243011979463e-05, + "loss": 4.1463, + "step": 2523 + }, + { + "epoch": 0.4319698784870786, + "grad_norm": 18.030874252319336, + "learning_rate": 1.4363947518539647e-05, + "loss": 1.8934, + "step": 2524 + }, + { + "epoch": 0.4321410234468595, + "grad_norm": 29.338178634643555, + "learning_rate": 1.4369652025099829e-05, + "loss": 3.3346, + "step": 2525 + }, + { + "epoch": 0.43231216840664044, + "grad_norm": 29.54951286315918, + "learning_rate": 1.4375356531660012e-05, + "loss": 2.695, + "step": 2526 + }, + { + "epoch": 0.4324833133664214, + "grad_norm": 17.5317325592041, + "learning_rate": 1.4381061038220195e-05, + "loss": 1.4575, + "step": 2527 + }, + { + "epoch": 0.4326544583262023, + "grad_norm": 32.96657943725586, + "learning_rate": 1.4386765544780377e-05, + "loss": 3.592, + "step": 2528 + }, + { + "epoch": 0.43282560328598324, + "grad_norm": 19.32137107849121, + "learning_rate": 1.439247005134056e-05, + "loss": 1.2429, + "step": 2529 + }, + { + "epoch": 0.4329967482457642, + "grad_norm": 8.846491813659668, + "learning_rate": 1.4398174557900742e-05, + "loss": 1.021, + "step": 2530 + }, + { + "epoch": 0.4331678932055451, + "grad_norm": 4.180466651916504, + "learning_rate": 1.4403879064460925e-05, + "loss": 0.5108, + "step": 2531 + }, + { + "epoch": 0.43333903816532604, + "grad_norm": 28.7572078704834, + "learning_rate": 1.4409583571021107e-05, + "loss": 2.694, + "step": 2532 + }, + { + "epoch": 0.433510183125107, + "grad_norm": 34.48351287841797, + "learning_rate": 1.4415288077581289e-05, + "loss": 3.7674, + "step": 2533 + }, + { + "epoch": 0.4336813280848879, + "grad_norm": 27.524559020996094, + "learning_rate": 1.4420992584141472e-05, + "loss": 2.768, + "step": 2534 + }, + { + "epoch": 0.43385247304466884, + "grad_norm": 33.70855712890625, + "learning_rate": 1.4426697090701654e-05, + "loss": 3.1902, + "step": 2535 + }, + { + "epoch": 0.4340236180044498, + "grad_norm": 30.53034210205078, + "learning_rate": 1.4432401597261837e-05, + "loss": 3.4593, + "step": 2536 + }, + { + "epoch": 0.4341947629642307, + "grad_norm": 30.834991455078125, + "learning_rate": 1.4438106103822019e-05, + "loss": 3.439, + "step": 2537 + }, + { + "epoch": 0.43436590792401164, + "grad_norm": 31.815725326538086, + "learning_rate": 1.4443810610382202e-05, + "loss": 3.683, + "step": 2538 + }, + { + "epoch": 0.4345370528837926, + "grad_norm": 29.159996032714844, + "learning_rate": 1.4449515116942386e-05, + "loss": 2.9478, + "step": 2539 + }, + { + "epoch": 0.4347081978435735, + "grad_norm": 81.45700073242188, + "learning_rate": 1.4455219623502567e-05, + "loss": 9.0416, + "step": 2540 + }, + { + "epoch": 0.43487934280335444, + "grad_norm": 87.70926666259766, + "learning_rate": 1.446092413006275e-05, + "loss": 4.7629, + "step": 2541 + }, + { + "epoch": 0.4350504877631354, + "grad_norm": 9.934538841247559, + "learning_rate": 1.4466628636622932e-05, + "loss": 1.8243, + "step": 2542 + }, + { + "epoch": 0.4352216327229163, + "grad_norm": 9.613969802856445, + "learning_rate": 1.4472333143183116e-05, + "loss": 1.031, + "step": 2543 + }, + { + "epoch": 0.43539277768269724, + "grad_norm": 45.231689453125, + "learning_rate": 1.4478037649743297e-05, + "loss": 7.5705, + "step": 2544 + }, + { + "epoch": 0.4355639226424782, + "grad_norm": 9.317858695983887, + "learning_rate": 
1.4483742156303479e-05, + "loss": 0.6934, + "step": 2545 + }, + { + "epoch": 0.4357350676022591, + "grad_norm": 35.789794921875, + "learning_rate": 1.4489446662863663e-05, + "loss": 4.1345, + "step": 2546 + }, + { + "epoch": 0.43590621256204004, + "grad_norm": 11.596151351928711, + "learning_rate": 1.4495151169423844e-05, + "loss": 1.3557, + "step": 2547 + }, + { + "epoch": 0.436077357521821, + "grad_norm": 4.43747091293335, + "learning_rate": 1.4500855675984028e-05, + "loss": 0.4991, + "step": 2548 + }, + { + "epoch": 0.4362485024816019, + "grad_norm": 29.71784019470215, + "learning_rate": 1.450656018254421e-05, + "loss": 3.192, + "step": 2549 + }, + { + "epoch": 0.43641964744138284, + "grad_norm": 44.21783447265625, + "learning_rate": 1.4512264689104394e-05, + "loss": 3.6461, + "step": 2550 + }, + { + "epoch": 0.4365907924011638, + "grad_norm": 27.61203384399414, + "learning_rate": 1.4517969195664576e-05, + "loss": 3.3135, + "step": 2551 + }, + { + "epoch": 0.4367619373609447, + "grad_norm": 23.84665298461914, + "learning_rate": 1.4523673702224758e-05, + "loss": 2.5268, + "step": 2552 + }, + { + "epoch": 0.43693308232072564, + "grad_norm": 29.368938446044922, + "learning_rate": 1.4529378208784941e-05, + "loss": 2.7211, + "step": 2553 + }, + { + "epoch": 0.4371042272805066, + "grad_norm": 36.08073806762695, + "learning_rate": 1.4535082715345123e-05, + "loss": 3.7964, + "step": 2554 + }, + { + "epoch": 0.4372753722402875, + "grad_norm": 32.68186950683594, + "learning_rate": 1.4540787221905306e-05, + "loss": 3.6173, + "step": 2555 + }, + { + "epoch": 0.43744651720006844, + "grad_norm": 34.985904693603516, + "learning_rate": 1.4546491728465488e-05, + "loss": 4.2656, + "step": 2556 + }, + { + "epoch": 0.4376176621598494, + "grad_norm": 129.27252197265625, + "learning_rate": 1.4552196235025671e-05, + "loss": 8.7164, + "step": 2557 + }, + { + "epoch": 0.4377888071196303, + "grad_norm": 29.99295997619629, + "learning_rate": 1.4557900741585853e-05, + "loss": 3.6185, + "step": 2558 + }, + { + "epoch": 0.43795995207941124, + "grad_norm": 28.371896743774414, + "learning_rate": 1.4563605248146035e-05, + "loss": 2.9256, + "step": 2559 + }, + { + "epoch": 0.4381310970391922, + "grad_norm": 8.728231430053711, + "learning_rate": 1.4569309754706218e-05, + "loss": 0.9915, + "step": 2560 + }, + { + "epoch": 0.4383022419989731, + "grad_norm": 31.164567947387695, + "learning_rate": 1.45750142612664e-05, + "loss": 3.8704, + "step": 2561 + }, + { + "epoch": 0.43847338695875404, + "grad_norm": 32.18178176879883, + "learning_rate": 1.4580718767826585e-05, + "loss": 4.2259, + "step": 2562 + }, + { + "epoch": 0.438644531918535, + "grad_norm": 25.499011993408203, + "learning_rate": 1.4586423274386766e-05, + "loss": 2.6854, + "step": 2563 + }, + { + "epoch": 0.4388156768783159, + "grad_norm": 34.26057815551758, + "learning_rate": 1.4592127780946948e-05, + "loss": 3.4689, + "step": 2564 + }, + { + "epoch": 0.43898682183809684, + "grad_norm": 33.73667526245117, + "learning_rate": 1.4597832287507131e-05, + "loss": 4.3474, + "step": 2565 + }, + { + "epoch": 0.43915796679787783, + "grad_norm": 32.83565902709961, + "learning_rate": 1.4603536794067313e-05, + "loss": 3.475, + "step": 2566 + }, + { + "epoch": 0.43932911175765876, + "grad_norm": 3.187453269958496, + "learning_rate": 1.4609241300627497e-05, + "loss": 0.4736, + "step": 2567 + }, + { + "epoch": 0.4395002567174397, + "grad_norm": 19.98860740661621, + "learning_rate": 1.4614945807187678e-05, + "loss": 2.0086, + "step": 2568 + }, + { + "epoch": 0.43967140167722063, 
+ "grad_norm": 27.594697952270508, + "learning_rate": 1.4620650313747862e-05, + "loss": 3.2803, + "step": 2569 + }, + { + "epoch": 0.43984254663700156, + "grad_norm": 3.9966156482696533, + "learning_rate": 1.4626354820308043e-05, + "loss": 0.568, + "step": 2570 + }, + { + "epoch": 0.4400136915967825, + "grad_norm": 5.779835224151611, + "learning_rate": 1.4632059326868225e-05, + "loss": 0.648, + "step": 2571 + }, + { + "epoch": 0.44018483655656343, + "grad_norm": 39.23750305175781, + "learning_rate": 1.4637763833428408e-05, + "loss": 7.0941, + "step": 2572 + }, + { + "epoch": 0.44035598151634436, + "grad_norm": 28.068208694458008, + "learning_rate": 1.4643468339988592e-05, + "loss": 3.0381, + "step": 2573 + }, + { + "epoch": 0.4405271264761253, + "grad_norm": 25.783096313476562, + "learning_rate": 1.4649172846548775e-05, + "loss": 3.0511, + "step": 2574 + }, + { + "epoch": 0.44069827143590623, + "grad_norm": 29.101238250732422, + "learning_rate": 1.4654877353108957e-05, + "loss": 2.9123, + "step": 2575 + }, + { + "epoch": 0.44086941639568716, + "grad_norm": 14.171677589416504, + "learning_rate": 1.466058185966914e-05, + "loss": 1.0138, + "step": 2576 + }, + { + "epoch": 0.4410405613554681, + "grad_norm": 27.117347717285156, + "learning_rate": 1.4666286366229322e-05, + "loss": 3.1994, + "step": 2577 + }, + { + "epoch": 0.44121170631524903, + "grad_norm": 29.480358123779297, + "learning_rate": 1.4671990872789504e-05, + "loss": 3.4766, + "step": 2578 + }, + { + "epoch": 0.44138285127502996, + "grad_norm": 4.977560997009277, + "learning_rate": 1.4677695379349687e-05, + "loss": 0.7032, + "step": 2579 + }, + { + "epoch": 0.4415539962348109, + "grad_norm": 31.941097259521484, + "learning_rate": 1.4683399885909869e-05, + "loss": 2.931, + "step": 2580 + }, + { + "epoch": 0.44172514119459183, + "grad_norm": 136.83563232421875, + "learning_rate": 1.4689104392470052e-05, + "loss": 5.0846, + "step": 2581 + }, + { + "epoch": 0.44189628615437276, + "grad_norm": 9.305535316467285, + "learning_rate": 1.4694808899030234e-05, + "loss": 0.911, + "step": 2582 + }, + { + "epoch": 0.4420674311141537, + "grad_norm": 18.890281677246094, + "learning_rate": 1.4700513405590415e-05, + "loss": 1.4747, + "step": 2583 + }, + { + "epoch": 0.44223857607393463, + "grad_norm": 46.04558563232422, + "learning_rate": 1.4706217912150599e-05, + "loss": 7.673, + "step": 2584 + }, + { + "epoch": 0.44240972103371556, + "grad_norm": 24.37186050415039, + "learning_rate": 1.4711922418710782e-05, + "loss": 2.3299, + "step": 2585 + }, + { + "epoch": 0.4425808659934965, + "grad_norm": 38.21072006225586, + "learning_rate": 1.4717626925270965e-05, + "loss": 7.1275, + "step": 2586 + }, + { + "epoch": 0.44275201095327743, + "grad_norm": 77.47330474853516, + "learning_rate": 1.4723331431831147e-05, + "loss": 4.7878, + "step": 2587 + }, + { + "epoch": 0.44292315591305836, + "grad_norm": 37.18149185180664, + "learning_rate": 1.472903593839133e-05, + "loss": 3.7571, + "step": 2588 + }, + { + "epoch": 0.4430943008728393, + "grad_norm": 3.5262255668640137, + "learning_rate": 1.4734740444951512e-05, + "loss": 0.5224, + "step": 2589 + }, + { + "epoch": 0.44326544583262023, + "grad_norm": 11.645423889160156, + "learning_rate": 1.4740444951511694e-05, + "loss": 1.0846, + "step": 2590 + }, + { + "epoch": 0.44343659079240116, + "grad_norm": 6.892613410949707, + "learning_rate": 1.4746149458071877e-05, + "loss": 0.54, + "step": 2591 + }, + { + "epoch": 0.4436077357521821, + "grad_norm": 8.752089500427246, + "learning_rate": 1.4751853964632059e-05, + 
"loss": 0.6267, + "step": 2592 + }, + { + "epoch": 0.44377888071196303, + "grad_norm": 23.974550247192383, + "learning_rate": 1.4757558471192242e-05, + "loss": 2.6998, + "step": 2593 + }, + { + "epoch": 0.44395002567174396, + "grad_norm": 7.374299049377441, + "learning_rate": 1.4763262977752424e-05, + "loss": 0.8349, + "step": 2594 + }, + { + "epoch": 0.4441211706315249, + "grad_norm": 23.21881103515625, + "learning_rate": 1.4768967484312606e-05, + "loss": 2.7586, + "step": 2595 + }, + { + "epoch": 0.44429231559130583, + "grad_norm": 170.6956024169922, + "learning_rate": 1.477467199087279e-05, + "loss": 9.1929, + "step": 2596 + }, + { + "epoch": 0.44446346055108676, + "grad_norm": 20.11836814880371, + "learning_rate": 1.4780376497432972e-05, + "loss": 2.3475, + "step": 2597 + }, + { + "epoch": 0.4446346055108677, + "grad_norm": 24.03493881225586, + "learning_rate": 1.4786081003993156e-05, + "loss": 2.9464, + "step": 2598 + }, + { + "epoch": 0.44480575047064863, + "grad_norm": 27.76041603088379, + "learning_rate": 1.4791785510553338e-05, + "loss": 2.7217, + "step": 2599 + }, + { + "epoch": 0.44497689543042956, + "grad_norm": 17.792516708374023, + "learning_rate": 1.4797490017113521e-05, + "loss": 1.6209, + "step": 2600 + }, + { + "epoch": 0.4451480403902105, + "grad_norm": 34.788169860839844, + "learning_rate": 1.4803194523673703e-05, + "loss": 2.8761, + "step": 2601 + }, + { + "epoch": 0.44531918534999143, + "grad_norm": 18.824007034301758, + "learning_rate": 1.4808899030233884e-05, + "loss": 2.5789, + "step": 2602 + }, + { + "epoch": 0.44549033030977236, + "grad_norm": 19.51264190673828, + "learning_rate": 1.4814603536794068e-05, + "loss": 2.2163, + "step": 2603 + }, + { + "epoch": 0.4456614752695533, + "grad_norm": 31.428625106811523, + "learning_rate": 1.482030804335425e-05, + "loss": 2.9496, + "step": 2604 + }, + { + "epoch": 0.44583262022933423, + "grad_norm": 13.012333869934082, + "learning_rate": 1.4826012549914433e-05, + "loss": 0.8159, + "step": 2605 + }, + { + "epoch": 0.44600376518911516, + "grad_norm": 23.477638244628906, + "learning_rate": 1.4831717056474614e-05, + "loss": 2.4937, + "step": 2606 + }, + { + "epoch": 0.4461749101488961, + "grad_norm": 35.8111572265625, + "learning_rate": 1.48374215630348e-05, + "loss": 3.2833, + "step": 2607 + }, + { + "epoch": 0.44634605510867703, + "grad_norm": 32.99673080444336, + "learning_rate": 1.4843126069594981e-05, + "loss": 3.5874, + "step": 2608 + }, + { + "epoch": 0.44651720006845796, + "grad_norm": 3.853698253631592, + "learning_rate": 1.4848830576155163e-05, + "loss": 0.4709, + "step": 2609 + }, + { + "epoch": 0.4466883450282389, + "grad_norm": 27.9306583404541, + "learning_rate": 1.4854535082715346e-05, + "loss": 2.6247, + "step": 2610 + }, + { + "epoch": 0.44685948998801983, + "grad_norm": 11.854992866516113, + "learning_rate": 1.4860239589275528e-05, + "loss": 0.9967, + "step": 2611 + }, + { + "epoch": 0.44703063494780076, + "grad_norm": 49.759117126464844, + "learning_rate": 1.4865944095835711e-05, + "loss": 7.1995, + "step": 2612 + }, + { + "epoch": 0.4472017799075817, + "grad_norm": 31.380281448364258, + "learning_rate": 1.4871648602395893e-05, + "loss": 2.7301, + "step": 2613 + }, + { + "epoch": 0.44737292486736263, + "grad_norm": 29.84979820251465, + "learning_rate": 1.4877353108956075e-05, + "loss": 3.1099, + "step": 2614 + }, + { + "epoch": 0.44754406982714356, + "grad_norm": 13.841278076171875, + "learning_rate": 1.4883057615516258e-05, + "loss": 1.0569, + "step": 2615 + }, + { + "epoch": 0.4477152147869245, + 
"grad_norm": 28.414051055908203, + "learning_rate": 1.488876212207644e-05, + "loss": 2.6359, + "step": 2616 + }, + { + "epoch": 0.4478863597467055, + "grad_norm": 29.42824363708496, + "learning_rate": 1.4894466628636623e-05, + "loss": 3.3203, + "step": 2617 + }, + { + "epoch": 0.4480575047064864, + "grad_norm": 33.065799713134766, + "learning_rate": 1.4900171135196805e-05, + "loss": 4.0124, + "step": 2618 + }, + { + "epoch": 0.44822864966626735, + "grad_norm": 9.898391723632812, + "learning_rate": 1.490587564175699e-05, + "loss": 0.9107, + "step": 2619 + }, + { + "epoch": 0.4483997946260483, + "grad_norm": 23.923398971557617, + "learning_rate": 1.4911580148317172e-05, + "loss": 2.5028, + "step": 2620 + }, + { + "epoch": 0.4485709395858292, + "grad_norm": 25.825178146362305, + "learning_rate": 1.4917284654877353e-05, + "loss": 2.3647, + "step": 2621 + }, + { + "epoch": 0.44874208454561015, + "grad_norm": 24.46117401123047, + "learning_rate": 1.4922989161437537e-05, + "loss": 2.66, + "step": 2622 + }, + { + "epoch": 0.4489132295053911, + "grad_norm": 19.926624298095703, + "learning_rate": 1.4928693667997718e-05, + "loss": 1.771, + "step": 2623 + }, + { + "epoch": 0.449084374465172, + "grad_norm": 107.68805694580078, + "learning_rate": 1.4934398174557902e-05, + "loss": 9.219, + "step": 2624 + }, + { + "epoch": 0.44925551942495295, + "grad_norm": 18.121204376220703, + "learning_rate": 1.4940102681118083e-05, + "loss": 1.3726, + "step": 2625 + }, + { + "epoch": 0.4494266643847339, + "grad_norm": 27.648178100585938, + "learning_rate": 1.4945807187678267e-05, + "loss": 2.6469, + "step": 2626 + }, + { + "epoch": 0.4495978093445148, + "grad_norm": 28.146556854248047, + "learning_rate": 1.4951511694238448e-05, + "loss": 2.8926, + "step": 2627 + }, + { + "epoch": 0.44976895430429575, + "grad_norm": 52.536190032958984, + "learning_rate": 1.495721620079863e-05, + "loss": 7.5002, + "step": 2628 + }, + { + "epoch": 0.4499400992640767, + "grad_norm": 24.027881622314453, + "learning_rate": 1.4962920707358814e-05, + "loss": 2.3452, + "step": 2629 + }, + { + "epoch": 0.4501112442238576, + "grad_norm": 34.977684020996094, + "learning_rate": 1.4968625213918997e-05, + "loss": 3.9508, + "step": 2630 + }, + { + "epoch": 0.45028238918363855, + "grad_norm": 30.991193771362305, + "learning_rate": 1.497432972047918e-05, + "loss": 3.6064, + "step": 2631 + }, + { + "epoch": 0.45028238918363855, + "eval_nli-pairs_loss": 2.871744394302368, + "eval_nli-pairs_runtime": 4.2947, + "eval_nli-pairs_samples_per_second": 46.569, + "eval_nli-pairs_steps_per_second": 1.63, + "eval_sts-test_pearson_cosine": 0.7195428557259504, + "eval_sts-test_pearson_dot": 0.6098064793689061, + "eval_sts-test_pearson_euclidean": 0.7205423612792191, + "eval_sts-test_pearson_manhattan": 0.7293110123887395, + "eval_sts-test_pearson_max": 0.7293110123887395, + "eval_sts-test_spearman_cosine": 0.6966954300008318, + "eval_sts-test_spearman_dot": 0.5822364450229315, + "eval_sts-test_spearman_euclidean": 0.7004689124572796, + "eval_sts-test_spearman_manhattan": 0.7099498051685355, + "eval_sts-test_spearman_max": 0.7099498051685355, + "step": 2631 + }, + { + "epoch": 0.45028238918363855, + "eval_vitaminc-pairs_loss": 1.8629615306854248, + "eval_vitaminc-pairs_runtime": 2.7342, + "eval_vitaminc-pairs_samples_per_second": 73.148, + "eval_vitaminc-pairs_steps_per_second": 2.56, + "step": 2631 + }, + { + "epoch": 0.45028238918363855, + "eval_qnli-contrastive_loss": 5.418925762176514, + "eval_qnli-contrastive_runtime": 0.6359, + 
"eval_qnli-contrastive_samples_per_second": 314.496, + "eval_qnli-contrastive_steps_per_second": 11.007, + "step": 2631 + }, + { + "epoch": 0.45028238918363855, + "eval_scitail-pairs-qa_loss": 0.4216327965259552, + "eval_scitail-pairs-qa_runtime": 1.6135, + "eval_scitail-pairs-qa_samples_per_second": 123.956, + "eval_scitail-pairs-qa_steps_per_second": 4.338, + "step": 2631 + }, + { + "epoch": 0.45028238918363855, + "eval_scitail-pairs-pos_loss": 1.3018670082092285, + "eval_scitail-pairs-pos_runtime": 2.6103, + "eval_scitail-pairs-pos_samples_per_second": 76.619, + "eval_scitail-pairs-pos_steps_per_second": 2.682, + "step": 2631 + }, + { + "epoch": 0.45028238918363855, + "eval_xsum-pairs_loss": 1.584064245223999, + "eval_xsum-pairs_runtime": 2.6388, + "eval_xsum-pairs_samples_per_second": 66.317, + "eval_xsum-pairs_steps_per_second": 2.274, + "step": 2631 + }, + { + "epoch": 0.45028238918363855, + "eval_compression-pairs_loss": 0.7760603427886963, + "eval_compression-pairs_runtime": 0.5146, + "eval_compression-pairs_samples_per_second": 388.623, + "eval_compression-pairs_steps_per_second": 13.602, + "step": 2631 + }, + { + "epoch": 0.45028238918363855, + "eval_sciq_pairs_loss": 5.851566314697266, + "eval_sciq_pairs_runtime": 9.2089, + "eval_sciq_pairs_samples_per_second": 21.718, + "eval_sciq_pairs_steps_per_second": 0.76, + "step": 2631 + }, + { + "epoch": 0.45028238918363855, + "eval_qasc_pairs_loss": 7.442629814147949, + "eval_qasc_pairs_runtime": 2.6477, + "eval_qasc_pairs_samples_per_second": 75.537, + "eval_qasc_pairs_steps_per_second": 2.644, + "step": 2631 + }, + { + "epoch": 0.45028238918363855, + "eval_openbookqa_pairs_loss": 4.049252033233643, + "eval_openbookqa_pairs_runtime": 0.6399, + "eval_openbookqa_pairs_samples_per_second": 107.834, + "eval_openbookqa_pairs_steps_per_second": 4.688, + "step": 2631 + }, + { + "epoch": 0.45028238918363855, + "eval_msmarco_pairs_loss": 2.6957242488861084, + "eval_msmarco_pairs_runtime": 3.9586, + "eval_msmarco_pairs_samples_per_second": 50.523, + "eval_msmarco_pairs_steps_per_second": 1.768, + "step": 2631 + }, + { + "epoch": 0.45028238918363855, + "eval_nq_pairs_loss": 3.332510471343994, + "eval_nq_pairs_runtime": 8.6125, + "eval_nq_pairs_samples_per_second": 23.222, + "eval_nq_pairs_steps_per_second": 0.813, + "step": 2631 + }, + { + "epoch": 0.45028238918363855, + "eval_trivia_pairs_loss": 3.298595905303955, + "eval_trivia_pairs_runtime": 12.8335, + "eval_trivia_pairs_samples_per_second": 15.584, + "eval_trivia_pairs_steps_per_second": 0.545, + "step": 2631 + }, + { + "epoch": 0.45028238918363855, + "eval_quora_pairs_loss": 0.6931056380271912, + "eval_quora_pairs_runtime": 1.5975, + "eval_quora_pairs_samples_per_second": 125.194, + "eval_quora_pairs_steps_per_second": 4.382, + "step": 2631 + }, + { + "epoch": 0.45028238918363855, + "eval_gooaq_pairs_loss": 2.1408634185791016, + "eval_gooaq_pairs_runtime": 2.6505, + "eval_gooaq_pairs_samples_per_second": 75.457, + "eval_gooaq_pairs_steps_per_second": 2.641, + "step": 2631 + }, + { + "epoch": 0.4504535341434195, + "grad_norm": 25.071407318115234, + "learning_rate": 1.4980034227039362e-05, + "loss": 2.3902, + "step": 2632 + }, + { + "epoch": 0.4506246791032004, + "grad_norm": 14.988327026367188, + "learning_rate": 1.4985738733599544e-05, + "loss": 1.1409, + "step": 2633 + }, + { + "epoch": 0.45079582406298135, + "grad_norm": 26.867197036743164, + "learning_rate": 1.4991443240159727e-05, + "loss": 2.932, + "step": 2634 + }, + { + "epoch": 0.4509669690227623, + "grad_norm": 36.01612091064453, 
+ "learning_rate": 1.4997147746719909e-05, + "loss": 4.7158, + "step": 2635 + }, + { + "epoch": 0.4511381139825432, + "grad_norm": 16.741594314575195, + "learning_rate": 1.5002852253280092e-05, + "loss": 1.3786, + "step": 2636 + }, + { + "epoch": 0.45130925894232415, + "grad_norm": 27.737268447875977, + "learning_rate": 1.5008556759840274e-05, + "loss": 3.3069, + "step": 2637 + }, + { + "epoch": 0.4514804039021051, + "grad_norm": 12.152483940124512, + "learning_rate": 1.5014261266400457e-05, + "loss": 1.0332, + "step": 2638 + }, + { + "epoch": 0.451651548861886, + "grad_norm": 12.2247314453125, + "learning_rate": 1.5019965772960639e-05, + "loss": 0.825, + "step": 2639 + }, + { + "epoch": 0.45182269382166695, + "grad_norm": 10.578752517700195, + "learning_rate": 1.502567027952082e-05, + "loss": 1.097, + "step": 2640 + }, + { + "epoch": 0.4519938387814479, + "grad_norm": 27.834949493408203, + "learning_rate": 1.5031374786081004e-05, + "loss": 3.3057, + "step": 2641 + }, + { + "epoch": 0.4521649837412288, + "grad_norm": 31.40846824645996, + "learning_rate": 1.5037079292641186e-05, + "loss": 3.5691, + "step": 2642 + }, + { + "epoch": 0.45233612870100975, + "grad_norm": 37.22605514526367, + "learning_rate": 1.5042783799201369e-05, + "loss": 3.8636, + "step": 2643 + }, + { + "epoch": 0.4525072736607907, + "grad_norm": 10.362072944641113, + "learning_rate": 1.504848830576155e-05, + "loss": 0.9574, + "step": 2644 + }, + { + "epoch": 0.4526784186205716, + "grad_norm": 27.246967315673828, + "learning_rate": 1.5054192812321734e-05, + "loss": 2.6717, + "step": 2645 + }, + { + "epoch": 0.45284956358035255, + "grad_norm": 17.54155921936035, + "learning_rate": 1.5059897318881916e-05, + "loss": 1.1453, + "step": 2646 + }, + { + "epoch": 0.4530207085401335, + "grad_norm": 27.662446975708008, + "learning_rate": 1.50656018254421e-05, + "loss": 2.3804, + "step": 2647 + }, + { + "epoch": 0.4531918534999144, + "grad_norm": 94.5572738647461, + "learning_rate": 1.5071306332002284e-05, + "loss": 8.0449, + "step": 2648 + }, + { + "epoch": 0.45336299845969535, + "grad_norm": 30.91036605834961, + "learning_rate": 1.5077010838562466e-05, + "loss": 3.4507, + "step": 2649 + }, + { + "epoch": 0.4535341434194763, + "grad_norm": 24.88844108581543, + "learning_rate": 1.508271534512265e-05, + "loss": 2.4537, + "step": 2650 + }, + { + "epoch": 0.4537052883792572, + "grad_norm": 36.03679656982422, + "learning_rate": 1.5088419851682831e-05, + "loss": 7.241, + "step": 2651 + }, + { + "epoch": 0.45387643333903815, + "grad_norm": 25.070192337036133, + "learning_rate": 1.5094124358243013e-05, + "loss": 2.5557, + "step": 2652 + }, + { + "epoch": 0.4540475782988191, + "grad_norm": 7.677706718444824, + "learning_rate": 1.5099828864803196e-05, + "loss": 0.5579, + "step": 2653 + }, + { + "epoch": 0.4542187232586, + "grad_norm": 30.037338256835938, + "learning_rate": 1.5105533371363378e-05, + "loss": 3.2643, + "step": 2654 + }, + { + "epoch": 0.45438986821838095, + "grad_norm": 25.368310928344727, + "learning_rate": 1.5111237877923561e-05, + "loss": 2.722, + "step": 2655 + }, + { + "epoch": 0.4545610131781619, + "grad_norm": 36.92127990722656, + "learning_rate": 1.5116942384483743e-05, + "loss": 4.7207, + "step": 2656 + }, + { + "epoch": 0.4547321581379428, + "grad_norm": 22.686552047729492, + "learning_rate": 1.5122646891043926e-05, + "loss": 2.1942, + "step": 2657 + }, + { + "epoch": 0.45490330309772375, + "grad_norm": 53.640262603759766, + "learning_rate": 1.5128351397604108e-05, + "loss": 6.8632, + "step": 2658 + }, + { + "epoch": 
0.4550744480575047, + "grad_norm": 24.542247772216797, + "learning_rate": 1.513405590416429e-05, + "loss": 2.4562, + "step": 2659 + }, + { + "epoch": 0.4552455930172856, + "grad_norm": 5.353951930999756, + "learning_rate": 1.5139760410724473e-05, + "loss": 0.5523, + "step": 2660 + }, + { + "epoch": 0.45541673797706655, + "grad_norm": 32.79592514038086, + "learning_rate": 1.5145464917284655e-05, + "loss": 3.4424, + "step": 2661 + }, + { + "epoch": 0.4555878829368475, + "grad_norm": 35.7240104675293, + "learning_rate": 1.5151169423844838e-05, + "loss": 3.5062, + "step": 2662 + }, + { + "epoch": 0.4557590278966284, + "grad_norm": 30.997047424316406, + "learning_rate": 1.515687393040502e-05, + "loss": 3.9807, + "step": 2663 + }, + { + "epoch": 0.45593017285640935, + "grad_norm": 41.52260208129883, + "learning_rate": 1.5162578436965201e-05, + "loss": 4.4682, + "step": 2664 + }, + { + "epoch": 0.4561013178161903, + "grad_norm": 33.410797119140625, + "learning_rate": 1.5168282943525385e-05, + "loss": 3.3602, + "step": 2665 + }, + { + "epoch": 0.4562724627759712, + "grad_norm": 22.308074951171875, + "learning_rate": 1.5173987450085566e-05, + "loss": 2.5111, + "step": 2666 + }, + { + "epoch": 0.45644360773575215, + "grad_norm": 21.073827743530273, + "learning_rate": 1.517969195664575e-05, + "loss": 2.1332, + "step": 2667 + }, + { + "epoch": 0.45661475269553314, + "grad_norm": 36.2976188659668, + "learning_rate": 1.5185396463205931e-05, + "loss": 4.8541, + "step": 2668 + }, + { + "epoch": 0.4567858976553141, + "grad_norm": 37.76522445678711, + "learning_rate": 1.5191100969766115e-05, + "loss": 6.96, + "step": 2669 + }, + { + "epoch": 0.456957042615095, + "grad_norm": 29.864612579345703, + "learning_rate": 1.51968054763263e-05, + "loss": 2.896, + "step": 2670 + }, + { + "epoch": 0.45712818757487594, + "grad_norm": 22.04704475402832, + "learning_rate": 1.5202509982886482e-05, + "loss": 2.6772, + "step": 2671 + }, + { + "epoch": 0.4572993325346569, + "grad_norm": 19.153793334960938, + "learning_rate": 1.5208214489446665e-05, + "loss": 1.7357, + "step": 2672 + }, + { + "epoch": 0.4574704774944378, + "grad_norm": 30.495540618896484, + "learning_rate": 1.5213918996006847e-05, + "loss": 3.1067, + "step": 2673 + }, + { + "epoch": 0.45764162245421874, + "grad_norm": 12.724396705627441, + "learning_rate": 1.521962350256703e-05, + "loss": 0.9931, + "step": 2674 + }, + { + "epoch": 0.4578127674139997, + "grad_norm": 6.2942399978637695, + "learning_rate": 1.5225328009127212e-05, + "loss": 0.5454, + "step": 2675 + }, + { + "epoch": 0.4579839123737806, + "grad_norm": 10.231136322021484, + "learning_rate": 1.5231032515687395e-05, + "loss": 1.696, + "step": 2676 + }, + { + "epoch": 0.45815505733356154, + "grad_norm": 152.32469177246094, + "learning_rate": 1.5236737022247577e-05, + "loss": 8.8958, + "step": 2677 + }, + { + "epoch": 0.4583262022933425, + "grad_norm": 38.06270980834961, + "learning_rate": 1.5242441528807758e-05, + "loss": 3.9409, + "step": 2678 + }, + { + "epoch": 0.4584973472531234, + "grad_norm": 25.77074432373047, + "learning_rate": 1.5248146035367942e-05, + "loss": 2.6594, + "step": 2679 + }, + { + "epoch": 0.45866849221290434, + "grad_norm": 29.309284210205078, + "learning_rate": 1.5253850541928123e-05, + "loss": 3.3099, + "step": 2680 + }, + { + "epoch": 0.4588396371726853, + "grad_norm": 25.866558074951172, + "learning_rate": 1.5259555048488305e-05, + "loss": 3.4843, + "step": 2681 + }, + { + "epoch": 0.4590107821324662, + "grad_norm": 23.736045837402344, + "learning_rate": 
1.526525955504849e-05, + "loss": 2.5762, + "step": 2682 + }, + { + "epoch": 0.45918192709224714, + "grad_norm": 24.48654556274414, + "learning_rate": 1.5270964061608672e-05, + "loss": 2.4442, + "step": 2683 + }, + { + "epoch": 0.4593530720520281, + "grad_norm": 9.48880386352539, + "learning_rate": 1.5276668568168852e-05, + "loss": 1.0744, + "step": 2684 + }, + { + "epoch": 0.459524217011809, + "grad_norm": 5.050061225891113, + "learning_rate": 1.5282373074729035e-05, + "loss": 0.4942, + "step": 2685 + }, + { + "epoch": 0.45969536197158994, + "grad_norm": 22.153188705444336, + "learning_rate": 1.528807758128922e-05, + "loss": 2.3914, + "step": 2686 + }, + { + "epoch": 0.4598665069313709, + "grad_norm": 3.6951212882995605, + "learning_rate": 1.5293782087849402e-05, + "loss": 0.4808, + "step": 2687 + }, + { + "epoch": 0.4600376518911518, + "grad_norm": 11.074764251708984, + "learning_rate": 1.5299486594409582e-05, + "loss": 1.7231, + "step": 2688 + }, + { + "epoch": 0.46020879685093274, + "grad_norm": 19.63498306274414, + "learning_rate": 1.5305191100969765e-05, + "loss": 2.1931, + "step": 2689 + }, + { + "epoch": 0.4603799418107137, + "grad_norm": 30.376020431518555, + "learning_rate": 1.531089560752995e-05, + "loss": 3.2142, + "step": 2690 + }, + { + "epoch": 0.4605510867704946, + "grad_norm": 39.05623245239258, + "learning_rate": 1.531660011409013e-05, + "loss": 4.3445, + "step": 2691 + }, + { + "epoch": 0.46072223173027554, + "grad_norm": 34.95427703857422, + "learning_rate": 1.5322304620650312e-05, + "loss": 3.5087, + "step": 2692 + }, + { + "epoch": 0.4608933766900565, + "grad_norm": 24.339468002319336, + "learning_rate": 1.53280091272105e-05, + "loss": 2.139, + "step": 2693 + }, + { + "epoch": 0.4610645216498374, + "grad_norm": 36.85024642944336, + "learning_rate": 1.5333713633770682e-05, + "loss": 4.5667, + "step": 2694 + }, + { + "epoch": 0.46123566660961834, + "grad_norm": 121.48307037353516, + "learning_rate": 1.5339418140330862e-05, + "loss": 5.9012, + "step": 2695 + }, + { + "epoch": 0.4614068115693993, + "grad_norm": 7.473188877105713, + "learning_rate": 1.5345122646891046e-05, + "loss": 0.9301, + "step": 2696 + }, + { + "epoch": 0.4615779565291802, + "grad_norm": 27.48497200012207, + "learning_rate": 1.535082715345123e-05, + "loss": 3.0351, + "step": 2697 + }, + { + "epoch": 0.46174910148896114, + "grad_norm": 22.619394302368164, + "learning_rate": 1.535653166001141e-05, + "loss": 2.4788, + "step": 2698 + }, + { + "epoch": 0.4619202464487421, + "grad_norm": 25.0198974609375, + "learning_rate": 1.5362236166571592e-05, + "loss": 2.3989, + "step": 2699 + }, + { + "epoch": 0.462091391408523, + "grad_norm": 23.36564064025879, + "learning_rate": 1.5367940673131776e-05, + "loss": 2.4179, + "step": 2700 + }, + { + "epoch": 0.46226253636830394, + "grad_norm": 29.04068946838379, + "learning_rate": 1.537364517969196e-05, + "loss": 3.1229, + "step": 2701 + }, + { + "epoch": 0.4624336813280849, + "grad_norm": 27.629722595214844, + "learning_rate": 1.537934968625214e-05, + "loss": 2.7618, + "step": 2702 + }, + { + "epoch": 0.4626048262878658, + "grad_norm": 23.081079483032227, + "learning_rate": 1.5385054192812323e-05, + "loss": 2.8201, + "step": 2703 + }, + { + "epoch": 0.46277597124764674, + "grad_norm": 26.009172439575195, + "learning_rate": 1.5390758699372506e-05, + "loss": 3.1322, + "step": 2704 + }, + { + "epoch": 0.4629471162074277, + "grad_norm": 18.447147369384766, + "learning_rate": 1.5396463205932686e-05, + "loss": 1.2356, + "step": 2705 + }, + { + "epoch": 0.4631182611672086, + 
"grad_norm": 22.773012161254883, + "learning_rate": 1.540216771249287e-05, + "loss": 2.6551, + "step": 2706 + }, + { + "epoch": 0.46328940612698954, + "grad_norm": 32.899314880371094, + "learning_rate": 1.5407872219053053e-05, + "loss": 3.7763, + "step": 2707 + }, + { + "epoch": 0.4634605510867705, + "grad_norm": 97.4777603149414, + "learning_rate": 1.5413576725613233e-05, + "loss": 4.5767, + "step": 2708 + }, + { + "epoch": 0.4636316960465514, + "grad_norm": 41.41079330444336, + "learning_rate": 1.5419281232173416e-05, + "loss": 7.352, + "step": 2709 + }, + { + "epoch": 0.46380284100633234, + "grad_norm": 24.83094024658203, + "learning_rate": 1.54249857387336e-05, + "loss": 2.836, + "step": 2710 + }, + { + "epoch": 0.4639739859661133, + "grad_norm": 12.101001739501953, + "learning_rate": 1.5430690245293783e-05, + "loss": 0.9624, + "step": 2711 + }, + { + "epoch": 0.4641451309258942, + "grad_norm": 24.289182662963867, + "learning_rate": 1.5436394751853963e-05, + "loss": 2.3101, + "step": 2712 + }, + { + "epoch": 0.46431627588567514, + "grad_norm": 23.911334991455078, + "learning_rate": 1.5442099258414146e-05, + "loss": 2.4969, + "step": 2713 + }, + { + "epoch": 0.4644874208454561, + "grad_norm": 35.51081085205078, + "learning_rate": 1.544780376497433e-05, + "loss": 3.353, + "step": 2714 + }, + { + "epoch": 0.464658565805237, + "grad_norm": 21.24627113342285, + "learning_rate": 1.545350827153451e-05, + "loss": 2.5466, + "step": 2715 + }, + { + "epoch": 0.46482971076501794, + "grad_norm": 30.70880126953125, + "learning_rate": 1.5459212778094696e-05, + "loss": 3.8228, + "step": 2716 + }, + { + "epoch": 0.4650008557247989, + "grad_norm": 25.956119537353516, + "learning_rate": 1.546491728465488e-05, + "loss": 2.6475, + "step": 2717 + }, + { + "epoch": 0.4651720006845798, + "grad_norm": 37.32086944580078, + "learning_rate": 1.5470621791215063e-05, + "loss": 3.6192, + "step": 2718 + }, + { + "epoch": 0.4653431456443608, + "grad_norm": 25.61843490600586, + "learning_rate": 1.5476326297775243e-05, + "loss": 2.336, + "step": 2719 + }, + { + "epoch": 0.46551429060414173, + "grad_norm": 31.5511531829834, + "learning_rate": 1.5482030804335426e-05, + "loss": 3.1832, + "step": 2720 + }, + { + "epoch": 0.46568543556392267, + "grad_norm": 35.96617889404297, + "learning_rate": 1.548773531089561e-05, + "loss": 4.2684, + "step": 2721 + }, + { + "epoch": 0.4658565805237036, + "grad_norm": 12.214024543762207, + "learning_rate": 1.549343981745579e-05, + "loss": 0.8686, + "step": 2722 + }, + { + "epoch": 0.46602772548348453, + "grad_norm": 3.517146110534668, + "learning_rate": 1.5499144324015973e-05, + "loss": 0.4999, + "step": 2723 + }, + { + "epoch": 0.46619887044326547, + "grad_norm": 27.56136703491211, + "learning_rate": 1.5504848830576157e-05, + "loss": 2.7019, + "step": 2724 + }, + { + "epoch": 0.4663700154030464, + "grad_norm": 4.812444686889648, + "learning_rate": 1.551055333713634e-05, + "loss": 0.5107, + "step": 2725 + }, + { + "epoch": 0.46654116036282733, + "grad_norm": 30.523237228393555, + "learning_rate": 1.551625784369652e-05, + "loss": 3.2109, + "step": 2726 + }, + { + "epoch": 0.46671230532260827, + "grad_norm": 28.326934814453125, + "learning_rate": 1.5521962350256703e-05, + "loss": 3.2289, + "step": 2727 + }, + { + "epoch": 0.4668834502823892, + "grad_norm": 34.37868118286133, + "learning_rate": 1.5527666856816887e-05, + "loss": 3.6814, + "step": 2728 + }, + { + "epoch": 0.46705459524217013, + "grad_norm": 30.16160774230957, + "learning_rate": 1.5533371363377067e-05, + "loss": 3.4049, + 
"step": 2729 + }, + { + "epoch": 0.46722574020195107, + "grad_norm": 4.218698024749756, + "learning_rate": 1.553907586993725e-05, + "loss": 0.4987, + "step": 2730 + }, + { + "epoch": 0.467396885161732, + "grad_norm": 23.180875778198242, + "learning_rate": 1.5544780376497433e-05, + "loss": 2.2238, + "step": 2731 + }, + { + "epoch": 0.46756803012151293, + "grad_norm": 25.21503257751465, + "learning_rate": 1.5550484883057617e-05, + "loss": 2.4819, + "step": 2732 + }, + { + "epoch": 0.46773917508129387, + "grad_norm": 30.37474822998047, + "learning_rate": 1.5556189389617797e-05, + "loss": 3.2935, + "step": 2733 + }, + { + "epoch": 0.4679103200410748, + "grad_norm": 16.8712100982666, + "learning_rate": 1.556189389617798e-05, + "loss": 1.0892, + "step": 2734 + }, + { + "epoch": 0.46808146500085573, + "grad_norm": 23.52683448791504, + "learning_rate": 1.5567598402738164e-05, + "loss": 2.3256, + "step": 2735 + }, + { + "epoch": 0.46825260996063667, + "grad_norm": 37.76002502441406, + "learning_rate": 1.5573302909298344e-05, + "loss": 3.8535, + "step": 2736 + }, + { + "epoch": 0.4684237549204176, + "grad_norm": 31.672475814819336, + "learning_rate": 1.5579007415858527e-05, + "loss": 2.5348, + "step": 2737 + }, + { + "epoch": 0.46859489988019853, + "grad_norm": 59.173072814941406, + "learning_rate": 1.558471192241871e-05, + "loss": 7.7627, + "step": 2738 + }, + { + "epoch": 0.46876604483997947, + "grad_norm": 23.428421020507812, + "learning_rate": 1.5590416428978894e-05, + "loss": 2.3317, + "step": 2739 + }, + { + "epoch": 0.4689371897997604, + "grad_norm": 38.1778564453125, + "learning_rate": 1.5596120935539077e-05, + "loss": 7.6561, + "step": 2740 + }, + { + "epoch": 0.46910833475954133, + "grad_norm": 10.163063049316406, + "learning_rate": 1.560182544209926e-05, + "loss": 0.7524, + "step": 2741 + }, + { + "epoch": 0.46927947971932227, + "grad_norm": 3.395460367202759, + "learning_rate": 1.5607529948659444e-05, + "loss": 0.4881, + "step": 2742 + }, + { + "epoch": 0.4694506246791032, + "grad_norm": 28.233747482299805, + "learning_rate": 1.5613234455219624e-05, + "loss": 2.606, + "step": 2743 + }, + { + "epoch": 0.46962176963888413, + "grad_norm": 33.14704513549805, + "learning_rate": 1.5618938961779807e-05, + "loss": 3.4617, + "step": 2744 + }, + { + "epoch": 0.46979291459866507, + "grad_norm": 4.885557651519775, + "learning_rate": 1.562464346833999e-05, + "loss": 0.5159, + "step": 2745 + }, + { + "epoch": 0.469964059558446, + "grad_norm": 32.37671661376953, + "learning_rate": 1.563034797490017e-05, + "loss": 3.1744, + "step": 2746 + }, + { + "epoch": 0.47013520451822693, + "grad_norm": 26.680980682373047, + "learning_rate": 1.5636052481460354e-05, + "loss": 2.8685, + "step": 2747 + }, + { + "epoch": 0.47030634947800787, + "grad_norm": 27.004371643066406, + "learning_rate": 1.5641756988020537e-05, + "loss": 3.0092, + "step": 2748 + }, + { + "epoch": 0.4704774944377888, + "grad_norm": 21.964834213256836, + "learning_rate": 1.564746149458072e-05, + "loss": 2.2193, + "step": 2749 + }, + { + "epoch": 0.47064863939756973, + "grad_norm": 66.10285186767578, + "learning_rate": 1.56531660011409e-05, + "loss": 7.346, + "step": 2750 + }, + { + "epoch": 0.47081978435735067, + "grad_norm": 6.018518924713135, + "learning_rate": 1.5658870507701084e-05, + "loss": 0.5488, + "step": 2751 + }, + { + "epoch": 0.4709909293171316, + "grad_norm": 30.385318756103516, + "learning_rate": 1.5664575014261267e-05, + "loss": 3.0093, + "step": 2752 + }, + { + "epoch": 0.47116207427691253, + "grad_norm": 13.027030944824219, + 
"learning_rate": 1.5670279520821447e-05, + "loss": 1.9682, + "step": 2753 + }, + { + "epoch": 0.47133321923669347, + "grad_norm": 35.71416473388672, + "learning_rate": 1.567598402738163e-05, + "loss": 6.69, + "step": 2754 + }, + { + "epoch": 0.4715043641964744, + "grad_norm": 29.253435134887695, + "learning_rate": 1.5681688533941814e-05, + "loss": 2.932, + "step": 2755 + }, + { + "epoch": 0.47167550915625533, + "grad_norm": 30.07666778564453, + "learning_rate": 1.5687393040501998e-05, + "loss": 3.8444, + "step": 2756 + }, + { + "epoch": 0.47184665411603627, + "grad_norm": 35.976871490478516, + "learning_rate": 1.5693097547062178e-05, + "loss": 4.9907, + "step": 2757 + }, + { + "epoch": 0.4720177990758172, + "grad_norm": 87.46236419677734, + "learning_rate": 1.569880205362236e-05, + "loss": 4.691, + "step": 2758 + }, + { + "epoch": 0.47218894403559813, + "grad_norm": 22.59965705871582, + "learning_rate": 1.5704506560182544e-05, + "loss": 2.1086, + "step": 2759 + }, + { + "epoch": 0.47236008899537907, + "grad_norm": 8.256918907165527, + "learning_rate": 1.5710211066742724e-05, + "loss": 0.9678, + "step": 2760 + }, + { + "epoch": 0.47253123395516, + "grad_norm": 38.63548278808594, + "learning_rate": 1.5715915573302908e-05, + "loss": 3.4149, + "step": 2761 + }, + { + "epoch": 0.47270237891494093, + "grad_norm": 7.380117893218994, + "learning_rate": 1.5721620079863094e-05, + "loss": 0.6134, + "step": 2762 + }, + { + "epoch": 0.47287352387472187, + "grad_norm": 27.26441764831543, + "learning_rate": 1.5727324586423278e-05, + "loss": 2.8164, + "step": 2763 + }, + { + "epoch": 0.4730446688345028, + "grad_norm": 18.643917083740234, + "learning_rate": 1.5733029092983458e-05, + "loss": 1.9656, + "step": 2764 + }, + { + "epoch": 0.47321581379428374, + "grad_norm": 27.289445877075195, + "learning_rate": 1.573873359954364e-05, + "loss": 2.8402, + "step": 2765 + }, + { + "epoch": 0.47338695875406467, + "grad_norm": 6.67548942565918, + "learning_rate": 1.5744438106103825e-05, + "loss": 0.7842, + "step": 2766 + }, + { + "epoch": 0.4735581037138456, + "grad_norm": 33.30831527709961, + "learning_rate": 1.5750142612664005e-05, + "loss": 3.6011, + "step": 2767 + }, + { + "epoch": 0.47372924867362654, + "grad_norm": 104.88871765136719, + "learning_rate": 1.5755847119224188e-05, + "loss": 5.0141, + "step": 2768 + }, + { + "epoch": 0.47390039363340747, + "grad_norm": 5.102696418762207, + "learning_rate": 1.576155162578437e-05, + "loss": 0.5258, + "step": 2769 + }, + { + "epoch": 0.47407153859318846, + "grad_norm": 3.6947317123413086, + "learning_rate": 1.576725613234455e-05, + "loss": 0.4833, + "step": 2770 + }, + { + "epoch": 0.4742426835529694, + "grad_norm": 32.43489074707031, + "learning_rate": 1.5772960638904735e-05, + "loss": 3.7272, + "step": 2771 + }, + { + "epoch": 0.4744138285127503, + "grad_norm": 32.32176971435547, + "learning_rate": 1.5778665145464918e-05, + "loss": 3.4356, + "step": 2772 + }, + { + "epoch": 0.47458497347253126, + "grad_norm": 2.583381175994873, + "learning_rate": 1.57843696520251e-05, + "loss": 0.4212, + "step": 2773 + }, + { + "epoch": 0.4747561184323122, + "grad_norm": 38.029869079589844, + "learning_rate": 1.579007415858528e-05, + "loss": 7.2033, + "step": 2774 + }, + { + "epoch": 0.4749272633920931, + "grad_norm": 10.577736854553223, + "learning_rate": 1.5795778665145465e-05, + "loss": 1.2395, + "step": 2775 + }, + { + "epoch": 0.47509840835187406, + "grad_norm": 9.981147766113281, + "learning_rate": 1.5801483171705648e-05, + "loss": 1.4924, + "step": 2776 + }, + { + "epoch": 
0.475269553311655, + "grad_norm": 28.36383819580078, + "learning_rate": 1.5807187678265828e-05, + "loss": 3.8155, + "step": 2777 + }, + { + "epoch": 0.4754406982714359, + "grad_norm": 6.329680442810059, + "learning_rate": 1.581289218482601e-05, + "loss": 0.4932, + "step": 2778 + }, + { + "epoch": 0.47561184323121686, + "grad_norm": 17.587629318237305, + "learning_rate": 1.5818596691386195e-05, + "loss": 1.8358, + "step": 2779 + }, + { + "epoch": 0.4757829881909978, + "grad_norm": 32.48772048950195, + "learning_rate": 1.582430119794638e-05, + "loss": 4.1859, + "step": 2780 + }, + { + "epoch": 0.4759541331507787, + "grad_norm": 41.349056243896484, + "learning_rate": 1.583000570450656e-05, + "loss": 7.0338, + "step": 2781 + }, + { + "epoch": 0.47612527811055966, + "grad_norm": 32.28718185424805, + "learning_rate": 1.583571021106674e-05, + "loss": 3.439, + "step": 2782 + }, + { + "epoch": 0.4762964230703406, + "grad_norm": 52.53911209106445, + "learning_rate": 1.5841414717626925e-05, + "loss": 6.9516, + "step": 2783 + }, + { + "epoch": 0.4764675680301215, + "grad_norm": 6.450766086578369, + "learning_rate": 1.5847119224187105e-05, + "loss": 0.4587, + "step": 2784 + }, + { + "epoch": 0.47663871298990246, + "grad_norm": 31.295753479003906, + "learning_rate": 1.5852823730747292e-05, + "loss": 3.6037, + "step": 2785 + }, + { + "epoch": 0.4768098579496834, + "grad_norm": 10.392585754394531, + "learning_rate": 1.5858528237307475e-05, + "loss": 0.7695, + "step": 2786 + }, + { + "epoch": 0.4769810029094643, + "grad_norm": 31.578166961669922, + "learning_rate": 1.586423274386766e-05, + "loss": 3.4914, + "step": 2787 + }, + { + "epoch": 0.47715214786924526, + "grad_norm": 35.540199279785156, + "learning_rate": 1.586993725042784e-05, + "loss": 4.0507, + "step": 2788 + }, + { + "epoch": 0.4773232928290262, + "grad_norm": 30.065216064453125, + "learning_rate": 1.5875641756988022e-05, + "loss": 3.4183, + "step": 2789 + }, + { + "epoch": 0.4774944377888071, + "grad_norm": 3.6649258136749268, + "learning_rate": 1.5881346263548205e-05, + "loss": 0.4127, + "step": 2790 + }, + { + "epoch": 0.47766558274858806, + "grad_norm": 88.72532653808594, + "learning_rate": 1.5887050770108385e-05, + "loss": 4.5608, + "step": 2791 + }, + { + "epoch": 0.477836727708369, + "grad_norm": 23.770221710205078, + "learning_rate": 1.589275527666857e-05, + "loss": 2.2223, + "step": 2792 + }, + { + "epoch": 0.4780078726681499, + "grad_norm": 5.284163951873779, + "learning_rate": 1.5898459783228752e-05, + "loss": 0.5186, + "step": 2793 + }, + { + "epoch": 0.47817901762793086, + "grad_norm": 29.41139793395996, + "learning_rate": 1.5904164289788935e-05, + "loss": 3.0647, + "step": 2794 + }, + { + "epoch": 0.4783501625877118, + "grad_norm": 26.757612228393555, + "learning_rate": 1.5909868796349115e-05, + "loss": 2.3827, + "step": 2795 + }, + { + "epoch": 0.4785213075474927, + "grad_norm": 12.758798599243164, + "learning_rate": 1.59155733029093e-05, + "loss": 0.818, + "step": 2796 + }, + { + "epoch": 0.47869245250727366, + "grad_norm": 29.093143463134766, + "learning_rate": 1.5921277809469482e-05, + "loss": 2.9151, + "step": 2797 + }, + { + "epoch": 0.4788635974670546, + "grad_norm": 126.96649932861328, + "learning_rate": 1.5926982316029662e-05, + "loss": 8.4343, + "step": 2798 + }, + { + "epoch": 0.4790347424268355, + "grad_norm": 31.195518493652344, + "learning_rate": 1.5932686822589846e-05, + "loss": 3.7256, + "step": 2799 + }, + { + "epoch": 0.47920588738661646, + "grad_norm": 28.148395538330078, + "learning_rate": 
1.593839132915003e-05, + "loss": 2.8813, + "step": 2800 + }, + { + "epoch": 0.4793770323463974, + "grad_norm": 36.015403747558594, + "learning_rate": 1.5944095835710212e-05, + "loss": 4.6005, + "step": 2801 + }, + { + "epoch": 0.4795481773061783, + "grad_norm": 31.1592960357666, + "learning_rate": 1.5949800342270392e-05, + "loss": 3.1305, + "step": 2802 + }, + { + "epoch": 0.47971932226595926, + "grad_norm": 128.36007690429688, + "learning_rate": 1.5955504848830576e-05, + "loss": 4.3169, + "step": 2803 + }, + { + "epoch": 0.4798904672257402, + "grad_norm": 13.735505104064941, + "learning_rate": 1.596120935539076e-05, + "loss": 0.9245, + "step": 2804 + }, + { + "epoch": 0.4800616121855211, + "grad_norm": 42.414024353027344, + "learning_rate": 1.596691386195094e-05, + "loss": 6.975, + "step": 2805 + }, + { + "epoch": 0.48023275714530206, + "grad_norm": 31.763032913208008, + "learning_rate": 1.5972618368511122e-05, + "loss": 3.1284, + "step": 2806 + }, + { + "epoch": 0.480403902105083, + "grad_norm": 27.716442108154297, + "learning_rate": 1.597832287507131e-05, + "loss": 3.0007, + "step": 2807 + }, + { + "epoch": 0.4805750470648639, + "grad_norm": 32.059425354003906, + "learning_rate": 1.598402738163149e-05, + "loss": 4.0829, + "step": 2808 + }, + { + "epoch": 0.48074619202464486, + "grad_norm": 36.31050491333008, + "learning_rate": 1.5989731888191673e-05, + "loss": 4.8666, + "step": 2809 + }, + { + "epoch": 0.4809173369844258, + "grad_norm": 23.16267204284668, + "learning_rate": 1.5995436394751856e-05, + "loss": 2.0993, + "step": 2810 + }, + { + "epoch": 0.4810884819442067, + "grad_norm": 12.366683006286621, + "learning_rate": 1.600114090131204e-05, + "loss": 0.913, + "step": 2811 + }, + { + "epoch": 0.48125962690398766, + "grad_norm": 11.630936622619629, + "learning_rate": 1.600684540787222e-05, + "loss": 1.233, + "step": 2812 + }, + { + "epoch": 0.4814307718637686, + "grad_norm": 5.433574676513672, + "learning_rate": 1.6012549914432403e-05, + "loss": 0.5408, + "step": 2813 + }, + { + "epoch": 0.4816019168235495, + "grad_norm": 25.152584075927734, + "learning_rate": 1.6018254420992586e-05, + "loss": 2.7558, + "step": 2814 + }, + { + "epoch": 0.48177306178333046, + "grad_norm": 32.2104377746582, + "learning_rate": 1.6023958927552766e-05, + "loss": 3.56, + "step": 2815 + }, + { + "epoch": 0.4819442067431114, + "grad_norm": 21.62321662902832, + "learning_rate": 1.602966343411295e-05, + "loss": 2.626, + "step": 2816 + }, + { + "epoch": 0.4821153517028923, + "grad_norm": 27.26594352722168, + "learning_rate": 1.6035367940673133e-05, + "loss": 3.057, + "step": 2817 + }, + { + "epoch": 0.48228649666267326, + "grad_norm": 29.751848220825195, + "learning_rate": 1.6041072447233316e-05, + "loss": 3.0557, + "step": 2818 + }, + { + "epoch": 0.4824576416224542, + "grad_norm": 28.00129508972168, + "learning_rate": 1.6046776953793496e-05, + "loss": 2.9606, + "step": 2819 + }, + { + "epoch": 0.4826287865822352, + "grad_norm": 10.21130084991455, + "learning_rate": 1.605248146035368e-05, + "loss": 1.0526, + "step": 2820 + }, + { + "epoch": 0.4827999315420161, + "grad_norm": 194.53099060058594, + "learning_rate": 1.6058185966913863e-05, + "loss": 9.7692, + "step": 2821 + }, + { + "epoch": 0.48297107650179705, + "grad_norm": 20.116971969604492, + "learning_rate": 1.6063890473474043e-05, + "loss": 1.702, + "step": 2822 + }, + { + "epoch": 0.483142221461578, + "grad_norm": 25.585695266723633, + "learning_rate": 1.6069594980034226e-05, + "loss": 3.1031, + "step": 2823 + }, + { + "epoch": 0.4833133664213589, + 
"grad_norm": 10.690316200256348, + "learning_rate": 1.607529948659441e-05, + "loss": 1.112, + "step": 2824 + }, + { + "epoch": 0.48348451138113985, + "grad_norm": 46.31101989746094, + "learning_rate": 1.6081003993154593e-05, + "loss": 7.0695, + "step": 2825 + }, + { + "epoch": 0.4836556563409208, + "grad_norm": 2.7449769973754883, + "learning_rate": 1.6086708499714773e-05, + "loss": 0.412, + "step": 2826 + }, + { + "epoch": 0.4838268013007017, + "grad_norm": 31.60761260986328, + "learning_rate": 1.6092413006274956e-05, + "loss": 3.5248, + "step": 2827 + }, + { + "epoch": 0.48399794626048265, + "grad_norm": 30.04467010498047, + "learning_rate": 1.609811751283514e-05, + "loss": 3.5359, + "step": 2828 + }, + { + "epoch": 0.4841690912202636, + "grad_norm": 10.859264373779297, + "learning_rate": 1.610382201939532e-05, + "loss": 0.9806, + "step": 2829 + }, + { + "epoch": 0.4843402361800445, + "grad_norm": 24.42304229736328, + "learning_rate": 1.6109526525955507e-05, + "loss": 2.5163, + "step": 2830 + }, + { + "epoch": 0.48451138113982545, + "grad_norm": 32.02371597290039, + "learning_rate": 1.611523103251569e-05, + "loss": 4.1818, + "step": 2831 + }, + { + "epoch": 0.4846825260996064, + "grad_norm": 35.690147399902344, + "learning_rate": 1.6120935539075873e-05, + "loss": 3.3438, + "step": 2832 + }, + { + "epoch": 0.4848536710593873, + "grad_norm": 25.543243408203125, + "learning_rate": 1.6126640045636053e-05, + "loss": 2.5981, + "step": 2833 + }, + { + "epoch": 0.48502481601916825, + "grad_norm": 25.119115829467773, + "learning_rate": 1.6132344552196237e-05, + "loss": 2.1322, + "step": 2834 + }, + { + "epoch": 0.4851959609789492, + "grad_norm": 23.112409591674805, + "learning_rate": 1.613804905875642e-05, + "loss": 2.5395, + "step": 2835 + }, + { + "epoch": 0.4853671059387301, + "grad_norm": 91.41179656982422, + "learning_rate": 1.61437535653166e-05, + "loss": 4.2215, + "step": 2836 + }, + { + "epoch": 0.48553825089851105, + "grad_norm": 34.66135787963867, + "learning_rate": 1.6149458071876783e-05, + "loss": 3.1988, + "step": 2837 + }, + { + "epoch": 0.485709395858292, + "grad_norm": 28.888839721679688, + "learning_rate": 1.6155162578436967e-05, + "loss": 3.1345, + "step": 2838 + }, + { + "epoch": 0.4858805408180729, + "grad_norm": 63.08065414428711, + "learning_rate": 1.6160867084997147e-05, + "loss": 8.1288, + "step": 2839 + }, + { + "epoch": 0.48605168577785385, + "grad_norm": 148.98455810546875, + "learning_rate": 1.616657159155733e-05, + "loss": 4.9747, + "step": 2840 + }, + { + "epoch": 0.4862228307376348, + "grad_norm": 29.048202514648438, + "learning_rate": 1.6172276098117514e-05, + "loss": 2.9531, + "step": 2841 + }, + { + "epoch": 0.4863939756974157, + "grad_norm": 6.495917320251465, + "learning_rate": 1.6177980604677697e-05, + "loss": 0.5056, + "step": 2842 + }, + { + "epoch": 0.48656512065719665, + "grad_norm": 8.356714248657227, + "learning_rate": 1.6183685111237877e-05, + "loss": 0.9125, + "step": 2843 + }, + { + "epoch": 0.4867362656169776, + "grad_norm": 26.18461036682129, + "learning_rate": 1.618938961779806e-05, + "loss": 3.175, + "step": 2844 + }, + { + "epoch": 0.4869074105767585, + "grad_norm": 9.202829360961914, + "learning_rate": 1.6195094124358244e-05, + "loss": 1.0864, + "step": 2845 + }, + { + "epoch": 0.48707855553653945, + "grad_norm": 34.182373046875, + "learning_rate": 1.6200798630918424e-05, + "loss": 2.7523, + "step": 2846 + }, + { + "epoch": 0.4872497004963204, + "grad_norm": 68.9462890625, + "learning_rate": 1.6206503137478607e-05, + "loss": 3.7044, + "step": 
2847 + }, + { + "epoch": 0.4874208454561013, + "grad_norm": 24.633121490478516, + "learning_rate": 1.621220764403879e-05, + "loss": 2.6342, + "step": 2848 + }, + { + "epoch": 0.48759199041588225, + "grad_norm": 32.68869400024414, + "learning_rate": 1.6217912150598974e-05, + "loss": 4.6795, + "step": 2849 + }, + { + "epoch": 0.4877631353756632, + "grad_norm": 28.001712799072266, + "learning_rate": 1.6223616657159154e-05, + "loss": 3.3885, + "step": 2850 + }, + { + "epoch": 0.4879342803354441, + "grad_norm": 4.1197099685668945, + "learning_rate": 1.6229321163719337e-05, + "loss": 0.4097, + "step": 2851 + }, + { + "epoch": 0.48810542529522505, + "grad_norm": 29.35110092163086, + "learning_rate": 1.623502567027952e-05, + "loss": 3.5865, + "step": 2852 + }, + { + "epoch": 0.488276570255006, + "grad_norm": 26.92041778564453, + "learning_rate": 1.6240730176839704e-05, + "loss": 2.7247, + "step": 2853 + }, + { + "epoch": 0.4884477152147869, + "grad_norm": 34.873775482177734, + "learning_rate": 1.6246434683399887e-05, + "loss": 7.1172, + "step": 2854 + }, + { + "epoch": 0.48861886017456785, + "grad_norm": 24.180212020874023, + "learning_rate": 1.625213918996007e-05, + "loss": 2.4944, + "step": 2855 + }, + { + "epoch": 0.4887900051343488, + "grad_norm": 28.294334411621094, + "learning_rate": 1.6257843696520254e-05, + "loss": 3.4049, + "step": 2856 + }, + { + "epoch": 0.4889611500941297, + "grad_norm": 20.231170654296875, + "learning_rate": 1.6263548203080434e-05, + "loss": 2.2117, + "step": 2857 + }, + { + "epoch": 0.48913229505391065, + "grad_norm": 21.00507164001465, + "learning_rate": 1.6269252709640617e-05, + "loss": 1.8153, + "step": 2858 + }, + { + "epoch": 0.4893034400136916, + "grad_norm": 26.58632469177246, + "learning_rate": 1.62749572162008e-05, + "loss": 2.7509, + "step": 2859 + }, + { + "epoch": 0.4894745849734725, + "grad_norm": 25.922264099121094, + "learning_rate": 1.628066172276098e-05, + "loss": 3.0767, + "step": 2860 + }, + { + "epoch": 0.48964572993325345, + "grad_norm": 36.93525695800781, + "learning_rate": 1.6286366229321164e-05, + "loss": 6.587, + "step": 2861 + }, + { + "epoch": 0.4898168748930344, + "grad_norm": 30.786312103271484, + "learning_rate": 1.6292070735881348e-05, + "loss": 4.5453, + "step": 2862 + }, + { + "epoch": 0.4899880198528153, + "grad_norm": 10.850686073303223, + "learning_rate": 1.629777524244153e-05, + "loss": 0.8675, + "step": 2863 + }, + { + "epoch": 0.49015916481259625, + "grad_norm": 22.04916763305664, + "learning_rate": 1.630347974900171e-05, + "loss": 2.1868, + "step": 2864 + }, + { + "epoch": 0.4903303097723772, + "grad_norm": 27.125104904174805, + "learning_rate": 1.6309184255561894e-05, + "loss": 2.7107, + "step": 2865 + }, + { + "epoch": 0.4905014547321581, + "grad_norm": 26.232017517089844, + "learning_rate": 1.6314888762122078e-05, + "loss": 3.0023, + "step": 2866 + }, + { + "epoch": 0.49067259969193905, + "grad_norm": 2.6513617038726807, + "learning_rate": 1.6320593268682258e-05, + "loss": 0.4064, + "step": 2867 + }, + { + "epoch": 0.49084374465172, + "grad_norm": 29.269208908081055, + "learning_rate": 1.632629777524244e-05, + "loss": 2.9074, + "step": 2868 + }, + { + "epoch": 0.4910148896115009, + "grad_norm": 28.653419494628906, + "learning_rate": 1.6332002281802624e-05, + "loss": 2.749, + "step": 2869 + }, + { + "epoch": 0.49118603457128185, + "grad_norm": 24.419513702392578, + "learning_rate": 1.6337706788362808e-05, + "loss": 2.4733, + "step": 2870 + }, + { + "epoch": 0.49135717953106284, + "grad_norm": 31.81149673461914, + 
"learning_rate": 1.6343411294922988e-05, + "loss": 3.0798, + "step": 2871 + }, + { + "epoch": 0.49152832449084377, + "grad_norm": 7.05307149887085, + "learning_rate": 1.634911580148317e-05, + "loss": 0.6362, + "step": 2872 + }, + { + "epoch": 0.4916994694506247, + "grad_norm": 22.482975006103516, + "learning_rate": 1.6354820308043355e-05, + "loss": 2.0351, + "step": 2873 + }, + { + "epoch": 0.49187061441040564, + "grad_norm": 9.290128707885742, + "learning_rate": 1.6360524814603535e-05, + "loss": 0.6272, + "step": 2874 + }, + { + "epoch": 0.49204175937018657, + "grad_norm": 27.201467514038086, + "learning_rate": 1.6366229321163718e-05, + "loss": 2.6431, + "step": 2875 + }, + { + "epoch": 0.4922129043299675, + "grad_norm": 44.08928298950195, + "learning_rate": 1.6371933827723905e-05, + "loss": 6.6881, + "step": 2876 + }, + { + "epoch": 0.49238404928974844, + "grad_norm": 14.08613109588623, + "learning_rate": 1.6377638334284085e-05, + "loss": 1.0184, + "step": 2877 + }, + { + "epoch": 0.49255519424952937, + "grad_norm": 19.89874839782715, + "learning_rate": 1.6383342840844268e-05, + "loss": 2.0983, + "step": 2878 + }, + { + "epoch": 0.4927263392093103, + "grad_norm": 31.281314849853516, + "learning_rate": 1.638904734740445e-05, + "loss": 4.3604, + "step": 2879 + }, + { + "epoch": 0.49289748416909124, + "grad_norm": 4.3934245109558105, + "learning_rate": 1.6394751853964635e-05, + "loss": 0.4535, + "step": 2880 + }, + { + "epoch": 0.49306862912887217, + "grad_norm": 16.13640785217285, + "learning_rate": 1.6400456360524815e-05, + "loss": 1.4628, + "step": 2881 + }, + { + "epoch": 0.4932397740886531, + "grad_norm": 2.4228832721710205, + "learning_rate": 1.6406160867084998e-05, + "loss": 0.3669, + "step": 2882 + }, + { + "epoch": 0.49341091904843404, + "grad_norm": 39.298160552978516, + "learning_rate": 1.641186537364518e-05, + "loss": 5.1978, + "step": 2883 + }, + { + "epoch": 0.49358206400821497, + "grad_norm": 7.103499889373779, + "learning_rate": 1.641756988020536e-05, + "loss": 0.7534, + "step": 2884 + }, + { + "epoch": 0.4937532089679959, + "grad_norm": 36.24224090576172, + "learning_rate": 1.6423274386765545e-05, + "loss": 5.1747, + "step": 2885 + }, + { + "epoch": 0.49392435392777684, + "grad_norm": 88.6714859008789, + "learning_rate": 1.642897889332573e-05, + "loss": 4.1515, + "step": 2886 + }, + { + "epoch": 0.49409549888755777, + "grad_norm": 102.38868713378906, + "learning_rate": 1.6434683399885912e-05, + "loss": 4.1397, + "step": 2887 + }, + { + "epoch": 0.4942666438473387, + "grad_norm": 32.09382247924805, + "learning_rate": 1.6440387906446092e-05, + "loss": 3.822, + "step": 2888 + }, + { + "epoch": 0.49443778880711964, + "grad_norm": 27.632850646972656, + "learning_rate": 1.6446092413006275e-05, + "loss": 3.0071, + "step": 2889 + }, + { + "epoch": 0.49460893376690057, + "grad_norm": 29.850147247314453, + "learning_rate": 1.645179691956646e-05, + "loss": 4.5876, + "step": 2890 + }, + { + "epoch": 0.4947800787266815, + "grad_norm": 20.323644638061523, + "learning_rate": 1.645750142612664e-05, + "loss": 2.0093, + "step": 2891 + }, + { + "epoch": 0.49495122368646244, + "grad_norm": 28.592273712158203, + "learning_rate": 1.6463205932686822e-05, + "loss": 2.6316, + "step": 2892 + }, + { + "epoch": 0.49512236864624337, + "grad_norm": 29.890256881713867, + "learning_rate": 1.6468910439247005e-05, + "loss": 2.7351, + "step": 2893 + }, + { + "epoch": 0.4952935136060243, + "grad_norm": 25.856136322021484, + "learning_rate": 1.647461494580719e-05, + "loss": 2.7318, + "step": 2894 + }, + { + 
"epoch": 0.49546465856580524, + "grad_norm": 28.1647891998291, + "learning_rate": 1.648031945236737e-05, + "loss": 2.7787, + "step": 2895 + }, + { + "epoch": 0.49563580352558617, + "grad_norm": 24.757694244384766, + "learning_rate": 1.6486023958927552e-05, + "loss": 2.7135, + "step": 2896 + }, + { + "epoch": 0.4958069484853671, + "grad_norm": 42.44664764404297, + "learning_rate": 1.6491728465487735e-05, + "loss": 3.6649, + "step": 2897 + }, + { + "epoch": 0.49597809344514804, + "grad_norm": 30.2053279876709, + "learning_rate": 1.6497432972047915e-05, + "loss": 4.0259, + "step": 2898 + }, + { + "epoch": 0.49614923840492897, + "grad_norm": 12.054943084716797, + "learning_rate": 1.6503137478608102e-05, + "loss": 1.0105, + "step": 2899 + }, + { + "epoch": 0.4963203833647099, + "grad_norm": 17.974079132080078, + "learning_rate": 1.6508841985168286e-05, + "loss": 2.0786, + "step": 2900 + }, + { + "epoch": 0.49649152832449084, + "grad_norm": 12.725552558898926, + "learning_rate": 1.651454649172847e-05, + "loss": 1.0647, + "step": 2901 + }, + { + "epoch": 0.49666267328427177, + "grad_norm": 22.831754684448242, + "learning_rate": 1.652025099828865e-05, + "loss": 2.2329, + "step": 2902 + }, + { + "epoch": 0.4968338182440527, + "grad_norm": 21.267478942871094, + "learning_rate": 1.6525955504848832e-05, + "loss": 2.5314, + "step": 2903 + }, + { + "epoch": 0.49700496320383364, + "grad_norm": 27.087793350219727, + "learning_rate": 1.6531660011409016e-05, + "loss": 2.8437, + "step": 2904 + }, + { + "epoch": 0.49717610816361457, + "grad_norm": 19.73915672302246, + "learning_rate": 1.6537364517969196e-05, + "loss": 1.8543, + "step": 2905 + }, + { + "epoch": 0.4973472531233955, + "grad_norm": 2.955650806427002, + "learning_rate": 1.654306902452938e-05, + "loss": 0.4054, + "step": 2906 + }, + { + "epoch": 0.49751839808317644, + "grad_norm": 25.305593490600586, + "learning_rate": 1.6548773531089562e-05, + "loss": 2.5412, + "step": 2907 + }, + { + "epoch": 0.49768954304295737, + "grad_norm": 29.378746032714844, + "learning_rate": 1.6554478037649742e-05, + "loss": 2.7018, + "step": 2908 + }, + { + "epoch": 0.4978606880027383, + "grad_norm": 14.516071319580078, + "learning_rate": 1.6560182544209926e-05, + "loss": 1.9194, + "step": 2909 + }, + { + "epoch": 0.49803183296251924, + "grad_norm": 25.602577209472656, + "learning_rate": 1.656588705077011e-05, + "loss": 2.1128, + "step": 2910 + }, + { + "epoch": 0.49820297792230017, + "grad_norm": 109.72111511230469, + "learning_rate": 1.6571591557330293e-05, + "loss": 4.1774, + "step": 2911 + }, + { + "epoch": 0.4983741228820811, + "grad_norm": 19.274553298950195, + "learning_rate": 1.6577296063890472e-05, + "loss": 1.5632, + "step": 2912 + }, + { + "epoch": 0.49854526784186204, + "grad_norm": 29.17140007019043, + "learning_rate": 1.6583000570450656e-05, + "loss": 3.7158, + "step": 2913 + }, + { + "epoch": 0.49871641280164297, + "grad_norm": 31.559934616088867, + "learning_rate": 1.658870507701084e-05, + "loss": 4.5437, + "step": 2914 + }, + { + "epoch": 0.4988875577614239, + "grad_norm": 18.08380699157715, + "learning_rate": 1.659440958357102e-05, + "loss": 1.1722, + "step": 2915 + }, + { + "epoch": 0.49905870272120484, + "grad_norm": 29.155492782592773, + "learning_rate": 1.6600114090131203e-05, + "loss": 3.2768, + "step": 2916 + }, + { + "epoch": 0.49922984768098577, + "grad_norm": 36.51355743408203, + "learning_rate": 1.6605818596691386e-05, + "loss": 4.8346, + "step": 2917 + }, + { + "epoch": 0.4994009926407667, + "grad_norm": 18.29048728942871, + "learning_rate": 
1.661152310325157e-05, + "loss": 1.1614, + "step": 2918 + }, + { + "epoch": 0.49957213760054764, + "grad_norm": 29.851797103881836, + "learning_rate": 1.661722760981175e-05, + "loss": 2.9554, + "step": 2919 + }, + { + "epoch": 0.49974328256032857, + "grad_norm": 27.82573699951172, + "learning_rate": 1.6622932116371933e-05, + "loss": 3.4135, + "step": 2920 + }, + { + "epoch": 0.4999144275201095, + "grad_norm": 26.42146110534668, + "learning_rate": 1.6628636622932116e-05, + "loss": 2.5056, + "step": 2921 + }, + { + "epoch": 0.5000855724798905, + "grad_norm": 11.394399642944336, + "learning_rate": 1.66343411294923e-05, + "loss": 1.5378, + "step": 2922 + }, + { + "epoch": 0.5002567174396714, + "grad_norm": 76.39617156982422, + "learning_rate": 1.6640045636052483e-05, + "loss": 7.2706, + "step": 2923 + }, + { + "epoch": 0.5004278623994524, + "grad_norm": 30.514179229736328, + "learning_rate": 1.6645750142612666e-05, + "loss": 3.1234, + "step": 2924 + }, + { + "epoch": 0.5005990073592332, + "grad_norm": 25.776514053344727, + "learning_rate": 1.665145464917285e-05, + "loss": 2.852, + "step": 2925 + }, + { + "epoch": 0.5007701523190142, + "grad_norm": 33.94929122924805, + "learning_rate": 1.665715915573303e-05, + "loss": 4.5202, + "step": 2926 + }, + { + "epoch": 0.5009412972787951, + "grad_norm": 42.92927551269531, + "learning_rate": 1.6662863662293213e-05, + "loss": 7.0151, + "step": 2927 + }, + { + "epoch": 0.5011124422385761, + "grad_norm": 8.699772834777832, + "learning_rate": 1.6668568168853396e-05, + "loss": 0.8725, + "step": 2928 + }, + { + "epoch": 0.501283587198357, + "grad_norm": 27.853302001953125, + "learning_rate": 1.6674272675413576e-05, + "loss": 2.2825, + "step": 2929 + }, + { + "epoch": 0.501454732158138, + "grad_norm": 26.110185623168945, + "learning_rate": 1.667997718197376e-05, + "loss": 2.5107, + "step": 2930 + }, + { + "epoch": 0.5016258771179188, + "grad_norm": 4.521554946899414, + "learning_rate": 1.6685681688533943e-05, + "loss": 0.4957, + "step": 2931 + }, + { + "epoch": 0.5017970220776998, + "grad_norm": 42.245086669921875, + "learning_rate": 1.6691386195094127e-05, + "loss": 6.5318, + "step": 2932 + }, + { + "epoch": 0.5019681670374807, + "grad_norm": 25.86848258972168, + "learning_rate": 1.6697090701654307e-05, + "loss": 2.2382, + "step": 2933 + }, + { + "epoch": 0.5021393119972617, + "grad_norm": 48.50715637207031, + "learning_rate": 1.670279520821449e-05, + "loss": 7.148, + "step": 2934 + }, + { + "epoch": 0.5023104569570426, + "grad_norm": 32.559574127197266, + "learning_rate": 1.6708499714774673e-05, + "loss": 3.6438, + "step": 2935 + }, + { + "epoch": 0.5024816019168236, + "grad_norm": 24.84282112121582, + "learning_rate": 1.6714204221334853e-05, + "loss": 2.7729, + "step": 2936 + }, + { + "epoch": 0.5026527468766044, + "grad_norm": 14.403919219970703, + "learning_rate": 1.6719908727895037e-05, + "loss": 1.217, + "step": 2937 + }, + { + "epoch": 0.5028238918363854, + "grad_norm": 27.424219131469727, + "learning_rate": 1.672561323445522e-05, + "loss": 2.5197, + "step": 2938 + }, + { + "epoch": 0.5029950367961663, + "grad_norm": 9.789163589477539, + "learning_rate": 1.67313177410154e-05, + "loss": 1.7931, + "step": 2939 + }, + { + "epoch": 0.5031661817559473, + "grad_norm": 27.327239990234375, + "learning_rate": 1.6737022247575583e-05, + "loss": 3.3266, + "step": 2940 + }, + { + "epoch": 0.5033373267157282, + "grad_norm": 19.182161331176758, + "learning_rate": 1.6742726754135767e-05, + "loss": 1.9967, + "step": 2941 + }, + { + "epoch": 0.5035084716755092, + 
"grad_norm": 56.43001174926758, + "learning_rate": 1.674843126069595e-05, + "loss": 3.7189, + "step": 2942 + }, + { + "epoch": 0.50367961663529, + "grad_norm": 19.654386520385742, + "learning_rate": 1.675413576725613e-05, + "loss": 1.9999, + "step": 2943 + }, + { + "epoch": 0.503850761595071, + "grad_norm": 22.203187942504883, + "learning_rate": 1.6759840273816314e-05, + "loss": 2.1959, + "step": 2944 + }, + { + "epoch": 0.5040219065548519, + "grad_norm": 6.563319683074951, + "learning_rate": 1.67655447803765e-05, + "loss": 0.6367, + "step": 2945 + }, + { + "epoch": 0.5041930515146329, + "grad_norm": 10.192085266113281, + "learning_rate": 1.677124928693668e-05, + "loss": 0.7288, + "step": 2946 + }, + { + "epoch": 0.5043641964744139, + "grad_norm": 32.45716094970703, + "learning_rate": 1.6776953793496864e-05, + "loss": 3.9021, + "step": 2947 + }, + { + "epoch": 0.5045353414341948, + "grad_norm": 4.9417595863342285, + "learning_rate": 1.6782658300057047e-05, + "loss": 0.4681, + "step": 2948 + }, + { + "epoch": 0.5047064863939758, + "grad_norm": 27.206302642822266, + "learning_rate": 1.678836280661723e-05, + "loss": 3.3352, + "step": 2949 + }, + { + "epoch": 0.5048776313537566, + "grad_norm": 28.154144287109375, + "learning_rate": 1.679406731317741e-05, + "loss": 2.9377, + "step": 2950 + }, + { + "epoch": 0.5050487763135376, + "grad_norm": 21.303789138793945, + "learning_rate": 1.6799771819737594e-05, + "loss": 2.785, + "step": 2951 + }, + { + "epoch": 0.5052199212733185, + "grad_norm": 31.954051971435547, + "learning_rate": 1.6805476326297777e-05, + "loss": 3.1305, + "step": 2952 + }, + { + "epoch": 0.5053910662330995, + "grad_norm": 10.69640827178955, + "learning_rate": 1.6811180832857957e-05, + "loss": 1.6799, + "step": 2953 + }, + { + "epoch": 0.5055622111928804, + "grad_norm": 30.222347259521484, + "learning_rate": 1.681688533941814e-05, + "loss": 2.8247, + "step": 2954 + }, + { + "epoch": 0.5057333561526614, + "grad_norm": 96.27491760253906, + "learning_rate": 1.6822589845978324e-05, + "loss": 4.6357, + "step": 2955 + }, + { + "epoch": 0.5059045011124422, + "grad_norm": 28.582870483398438, + "learning_rate": 1.6828294352538507e-05, + "loss": 3.2733, + "step": 2956 + }, + { + "epoch": 0.5060756460722232, + "grad_norm": 41.087825775146484, + "learning_rate": 1.6833998859098687e-05, + "loss": 7.1278, + "step": 2957 + }, + { + "epoch": 0.5062467910320041, + "grad_norm": 7.500061511993408, + "learning_rate": 1.683970336565887e-05, + "loss": 0.8286, + "step": 2958 + }, + { + "epoch": 0.5064179359917851, + "grad_norm": 26.969345092773438, + "learning_rate": 1.6845407872219054e-05, + "loss": 2.235, + "step": 2959 + }, + { + "epoch": 0.506589080951566, + "grad_norm": 26.311525344848633, + "learning_rate": 1.6851112378779234e-05, + "loss": 3.0085, + "step": 2960 + }, + { + "epoch": 0.506760225911347, + "grad_norm": 31.306970596313477, + "learning_rate": 1.6856816885339417e-05, + "loss": 2.5939, + "step": 2961 + }, + { + "epoch": 0.5069313708711278, + "grad_norm": 24.608043670654297, + "learning_rate": 1.68625213918996e-05, + "loss": 2.3096, + "step": 2962 + }, + { + "epoch": 0.5071025158309088, + "grad_norm": 27.197254180908203, + "learning_rate": 1.6868225898459784e-05, + "loss": 2.9187, + "step": 2963 + }, + { + "epoch": 0.5072736607906897, + "grad_norm": 28.446548461914062, + "learning_rate": 1.6873930405019964e-05, + "loss": 3.2735, + "step": 2964 + }, + { + "epoch": 0.5074448057504707, + "grad_norm": 32.15707778930664, + "learning_rate": 1.6879634911580148e-05, + "loss": 6.7019, + "step": 
2965 + }, + { + "epoch": 0.5076159507102516, + "grad_norm": 23.724163055419922, + "learning_rate": 1.688533941814033e-05, + "loss": 2.1627, + "step": 2966 + }, + { + "epoch": 0.5077870956700326, + "grad_norm": 28.04530143737793, + "learning_rate": 1.689104392470051e-05, + "loss": 2.6273, + "step": 2967 + }, + { + "epoch": 0.5079582406298134, + "grad_norm": 30.895709991455078, + "learning_rate": 1.6896748431260698e-05, + "loss": 3.8368, + "step": 2968 + }, + { + "epoch": 0.5081293855895944, + "grad_norm": 14.024374961853027, + "learning_rate": 1.690245293782088e-05, + "loss": 1.0194, + "step": 2969 + }, + { + "epoch": 0.5083005305493753, + "grad_norm": 29.09341049194336, + "learning_rate": 1.690815744438106e-05, + "loss": 3.5337, + "step": 2970 + }, + { + "epoch": 0.5084716755091563, + "grad_norm": 28.34062385559082, + "learning_rate": 1.6913861950941244e-05, + "loss": 3.1743, + "step": 2971 + }, + { + "epoch": 0.5086428204689372, + "grad_norm": 25.496129989624023, + "learning_rate": 1.6919566457501428e-05, + "loss": 2.898, + "step": 2972 + }, + { + "epoch": 0.5088139654287182, + "grad_norm": 38.798343658447266, + "learning_rate": 1.692527096406161e-05, + "loss": 3.5201, + "step": 2973 + }, + { + "epoch": 0.508985110388499, + "grad_norm": 10.149602890014648, + "learning_rate": 1.693097547062179e-05, + "loss": 0.7939, + "step": 2974 + }, + { + "epoch": 0.50915625534828, + "grad_norm": 6.670815944671631, + "learning_rate": 1.6936679977181975e-05, + "loss": 0.854, + "step": 2975 + }, + { + "epoch": 0.5093274003080609, + "grad_norm": 75.72901153564453, + "learning_rate": 1.6942384483742158e-05, + "loss": 2.9379, + "step": 2976 + }, + { + "epoch": 0.5094985452678419, + "grad_norm": 26.788955688476562, + "learning_rate": 1.6948088990302338e-05, + "loss": 2.4457, + "step": 2977 + }, + { + "epoch": 0.5096696902276228, + "grad_norm": 14.796418190002441, + "learning_rate": 1.695379349686252e-05, + "loss": 1.0122, + "step": 2978 + }, + { + "epoch": 0.5098408351874038, + "grad_norm": 4.948236465454102, + "learning_rate": 1.6959498003422705e-05, + "loss": 0.5853, + "step": 2979 + }, + { + "epoch": 0.5100119801471846, + "grad_norm": 182.9610595703125, + "learning_rate": 1.6965202509982888e-05, + "loss": 8.9776, + "step": 2980 + }, + { + "epoch": 0.5101831251069656, + "grad_norm": 29.51963996887207, + "learning_rate": 1.6970907016543068e-05, + "loss": 2.9543, + "step": 2981 + }, + { + "epoch": 0.5103542700667465, + "grad_norm": 28.639034271240234, + "learning_rate": 1.697661152310325e-05, + "loss": 3.2262, + "step": 2982 + }, + { + "epoch": 0.5105254150265275, + "grad_norm": 29.50834846496582, + "learning_rate": 1.6982316029663435e-05, + "loss": 3.0001, + "step": 2983 + }, + { + "epoch": 0.5106965599863084, + "grad_norm": 15.582537651062012, + "learning_rate": 1.6988020536223615e-05, + "loss": 1.1638, + "step": 2984 + }, + { + "epoch": 0.5108677049460894, + "grad_norm": 27.667177200317383, + "learning_rate": 1.6993725042783798e-05, + "loss": 2.9351, + "step": 2985 + }, + { + "epoch": 0.5110388499058702, + "grad_norm": 28.853923797607422, + "learning_rate": 1.699942954934398e-05, + "loss": 3.6286, + "step": 2986 + }, + { + "epoch": 0.5112099948656512, + "grad_norm": 26.117013931274414, + "learning_rate": 1.7005134055904165e-05, + "loss": 2.8584, + "step": 2987 + }, + { + "epoch": 0.5113811398254321, + "grad_norm": 34.81660842895508, + "learning_rate": 1.7010838562464345e-05, + "loss": 4.3968, + "step": 2988 + }, + { + "epoch": 0.5115522847852131, + "grad_norm": 35.10283279418945, + "learning_rate": 
1.7016543069024528e-05, + "loss": 6.599, + "step": 2989 + }, + { + "epoch": 0.511723429744994, + "grad_norm": 19.16140365600586, + "learning_rate": 1.7022247575584715e-05, + "loss": 2.1204, + "step": 2990 + }, + { + "epoch": 0.511894574704775, + "grad_norm": 22.029394149780273, + "learning_rate": 1.7027952082144895e-05, + "loss": 1.8696, + "step": 2991 + }, + { + "epoch": 0.5120657196645558, + "grad_norm": 3.448702335357666, + "learning_rate": 1.703365658870508e-05, + "loss": 0.4607, + "step": 2992 + }, + { + "epoch": 0.5122368646243368, + "grad_norm": 22.506763458251953, + "learning_rate": 1.7039361095265262e-05, + "loss": 2.1106, + "step": 2993 + }, + { + "epoch": 0.5124080095841177, + "grad_norm": 31.842361450195312, + "learning_rate": 1.7045065601825445e-05, + "loss": 3.8676, + "step": 2994 + }, + { + "epoch": 0.5125791545438987, + "grad_norm": 141.6663818359375, + "learning_rate": 1.7050770108385625e-05, + "loss": 8.5208, + "step": 2995 + }, + { + "epoch": 0.5127502995036796, + "grad_norm": 62.276729583740234, + "learning_rate": 1.705647461494581e-05, + "loss": 3.2482, + "step": 2996 + }, + { + "epoch": 0.5129214444634606, + "grad_norm": 22.119609832763672, + "learning_rate": 1.7062179121505992e-05, + "loss": 1.9903, + "step": 2997 + }, + { + "epoch": 0.5130925894232415, + "grad_norm": 52.37403106689453, + "learning_rate": 1.7067883628066172e-05, + "loss": 6.8319, + "step": 2998 + }, + { + "epoch": 0.5132637343830224, + "grad_norm": 12.259587287902832, + "learning_rate": 1.7073588134626355e-05, + "loss": 1.162, + "step": 2999 + }, + { + "epoch": 0.5134348793428034, + "grad_norm": 8.290674209594727, + "learning_rate": 1.707929264118654e-05, + "loss": 0.9012, + "step": 3000 + }, + { + "epoch": 0.5136060243025843, + "grad_norm": 32.74642562866211, + "learning_rate": 1.7084997147746722e-05, + "loss": 3.4785, + "step": 3001 + }, + { + "epoch": 0.5137771692623653, + "grad_norm": 31.82801055908203, + "learning_rate": 1.7090701654306902e-05, + "loss": 4.2721, + "step": 3002 + }, + { + "epoch": 0.5139483142221462, + "grad_norm": 32.273136138916016, + "learning_rate": 1.7096406160867085e-05, + "loss": 3.2625, + "step": 3003 + }, + { + "epoch": 0.5141194591819271, + "grad_norm": 78.98668670654297, + "learning_rate": 1.710211066742727e-05, + "loss": 3.2698, + "step": 3004 + }, + { + "epoch": 0.514290604141708, + "grad_norm": 30.16362762451172, + "learning_rate": 1.710781517398745e-05, + "loss": 3.9137, + "step": 3005 + }, + { + "epoch": 0.514461749101489, + "grad_norm": 18.465227127075195, + "learning_rate": 1.7113519680547632e-05, + "loss": 1.8387, + "step": 3006 + }, + { + "epoch": 0.5146328940612699, + "grad_norm": 3.536219358444214, + "learning_rate": 1.7119224187107816e-05, + "loss": 0.446, + "step": 3007 + }, + { + "epoch": 0.5148040390210509, + "grad_norm": 17.390464782714844, + "learning_rate": 1.7124928693667996e-05, + "loss": 1.7668, + "step": 3008 + }, + { + "epoch": 0.5149751839808318, + "grad_norm": 18.47218894958496, + "learning_rate": 1.713063320022818e-05, + "loss": 2.1817, + "step": 3009 + }, + { + "epoch": 0.5151463289406127, + "grad_norm": 28.22992515563965, + "learning_rate": 1.7136337706788362e-05, + "loss": 2.9769, + "step": 3010 + }, + { + "epoch": 0.5153174739003936, + "grad_norm": 62.36894989013672, + "learning_rate": 1.7142042213348546e-05, + "loss": 7.6922, + "step": 3011 + }, + { + "epoch": 0.5154886188601746, + "grad_norm": 33.23900604248047, + "learning_rate": 1.7147746719908726e-05, + "loss": 3.3971, + "step": 3012 + }, + { + "epoch": 0.5156597638199555, + 
"grad_norm": 2.5457472801208496, + "learning_rate": 1.7153451226468912e-05, + "loss": 0.4122, + "step": 3013 + }, + { + "epoch": 0.5158309087797365, + "grad_norm": 26.533376693725586, + "learning_rate": 1.7159155733029096e-05, + "loss": 2.9528, + "step": 3014 + }, + { + "epoch": 0.5160020537395174, + "grad_norm": 33.18933868408203, + "learning_rate": 1.7164860239589276e-05, + "loss": 3.7197, + "step": 3015 + }, + { + "epoch": 0.5161731986992983, + "grad_norm": 25.48127555847168, + "learning_rate": 1.717056474614946e-05, + "loss": 2.8834, + "step": 3016 + }, + { + "epoch": 0.5163443436590792, + "grad_norm": 32.51988983154297, + "learning_rate": 1.7176269252709643e-05, + "loss": 3.6681, + "step": 3017 + }, + { + "epoch": 0.5165154886188602, + "grad_norm": 21.83390998840332, + "learning_rate": 1.7181973759269826e-05, + "loss": 2.3579, + "step": 3018 + }, + { + "epoch": 0.5166866335786411, + "grad_norm": 21.106168746948242, + "learning_rate": 1.7187678265830006e-05, + "loss": 2.1503, + "step": 3019 + }, + { + "epoch": 0.5168577785384221, + "grad_norm": 23.668697357177734, + "learning_rate": 1.719338277239019e-05, + "loss": 2.65, + "step": 3020 + }, + { + "epoch": 0.517028923498203, + "grad_norm": 56.29466247558594, + "learning_rate": 1.7199087278950373e-05, + "loss": 7.269, + "step": 3021 + }, + { + "epoch": 0.5172000684579839, + "grad_norm": 14.612650871276855, + "learning_rate": 1.7204791785510553e-05, + "loss": 1.5426, + "step": 3022 + }, + { + "epoch": 0.5173712134177648, + "grad_norm": 28.365121841430664, + "learning_rate": 1.7210496292070736e-05, + "loss": 3.636, + "step": 3023 + }, + { + "epoch": 0.5175423583775458, + "grad_norm": 25.329317092895508, + "learning_rate": 1.721620079863092e-05, + "loss": 2.3847, + "step": 3024 + }, + { + "epoch": 0.5177135033373267, + "grad_norm": 32.05517578125, + "learning_rate": 1.7221905305191103e-05, + "loss": 3.7742, + "step": 3025 + }, + { + "epoch": 0.5178846482971077, + "grad_norm": 11.009437561035156, + "learning_rate": 1.7227609811751283e-05, + "loss": 1.5541, + "step": 3026 + }, + { + "epoch": 0.5180557932568886, + "grad_norm": 4.6759490966796875, + "learning_rate": 1.7233314318311466e-05, + "loss": 0.448, + "step": 3027 + }, + { + "epoch": 0.5182269382166695, + "grad_norm": 23.18576431274414, + "learning_rate": 1.723901882487165e-05, + "loss": 2.3099, + "step": 3028 + }, + { + "epoch": 0.5183980831764504, + "grad_norm": 21.823318481445312, + "learning_rate": 1.724472333143183e-05, + "loss": 2.0502, + "step": 3029 + }, + { + "epoch": 0.5185692281362314, + "grad_norm": 33.11149215698242, + "learning_rate": 1.7250427837992013e-05, + "loss": 3.7448, + "step": 3030 + }, + { + "epoch": 0.5187403730960123, + "grad_norm": 32.03651809692383, + "learning_rate": 1.7256132344552196e-05, + "loss": 3.5141, + "step": 3031 + }, + { + "epoch": 0.5189115180557933, + "grad_norm": 29.257003784179688, + "learning_rate": 1.726183685111238e-05, + "loss": 3.5149, + "step": 3032 + }, + { + "epoch": 0.5190826630155742, + "grad_norm": 6.367782115936279, + "learning_rate": 1.726754135767256e-05, + "loss": 1.1882, + "step": 3033 + }, + { + "epoch": 0.5192538079753551, + "grad_norm": 21.6986083984375, + "learning_rate": 1.7273245864232743e-05, + "loss": 1.7948, + "step": 3034 + }, + { + "epoch": 0.519424952935136, + "grad_norm": 14.612825393676758, + "learning_rate": 1.7278950370792926e-05, + "loss": 1.1659, + "step": 3035 + }, + { + "epoch": 0.519596097894917, + "grad_norm": 28.725549697875977, + "learning_rate": 1.728465487735311e-05, + "loss": 2.7611, + "step": 3036 + 
}, + { + "epoch": 0.5197672428546979, + "grad_norm": 30.985149383544922, + "learning_rate": 1.7290359383913293e-05, + "loss": 2.9706, + "step": 3037 + }, + { + "epoch": 0.5199383878144789, + "grad_norm": 17.664464950561523, + "learning_rate": 1.7296063890473477e-05, + "loss": 1.6902, + "step": 3038 + }, + { + "epoch": 0.5201095327742598, + "grad_norm": 32.17440414428711, + "learning_rate": 1.7301768397033657e-05, + "loss": 3.8811, + "step": 3039 + }, + { + "epoch": 0.5202806777340407, + "grad_norm": 5.3300580978393555, + "learning_rate": 1.730747290359384e-05, + "loss": 0.4921, + "step": 3040 + }, + { + "epoch": 0.5204518226938216, + "grad_norm": 38.537044525146484, + "learning_rate": 1.7313177410154023e-05, + "loss": 3.2832, + "step": 3041 + }, + { + "epoch": 0.5206229676536026, + "grad_norm": 39.10978698730469, + "learning_rate": 1.7318881916714207e-05, + "loss": 3.8919, + "step": 3042 + }, + { + "epoch": 0.5207941126133835, + "grad_norm": 29.357208251953125, + "learning_rate": 1.7324586423274387e-05, + "loss": 3.115, + "step": 3043 + }, + { + "epoch": 0.5209652575731645, + "grad_norm": 15.655451774597168, + "learning_rate": 1.733029092983457e-05, + "loss": 1.4122, + "step": 3044 + }, + { + "epoch": 0.5211364025329454, + "grad_norm": 28.293025970458984, + "learning_rate": 1.7335995436394753e-05, + "loss": 2.9349, + "step": 3045 + }, + { + "epoch": 0.5213075474927263, + "grad_norm": 32.65211868286133, + "learning_rate": 1.7341699942954933e-05, + "loss": 3.2992, + "step": 3046 + }, + { + "epoch": 0.5214786924525072, + "grad_norm": 23.2037296295166, + "learning_rate": 1.7347404449515117e-05, + "loss": 2.3879, + "step": 3047 + }, + { + "epoch": 0.5216498374122882, + "grad_norm": 26.37859535217285, + "learning_rate": 1.73531089560753e-05, + "loss": 2.1987, + "step": 3048 + }, + { + "epoch": 0.5218209823720692, + "grad_norm": 18.490966796875, + "learning_rate": 1.7358813462635484e-05, + "loss": 1.461, + "step": 3049 + }, + { + "epoch": 0.5219921273318501, + "grad_norm": 31.97382354736328, + "learning_rate": 1.7364517969195664e-05, + "loss": 2.8849, + "step": 3050 + }, + { + "epoch": 0.5221632722916311, + "grad_norm": 49.7996711730957, + "learning_rate": 1.7370222475755847e-05, + "loss": 6.9056, + "step": 3051 + }, + { + "epoch": 0.5223344172514119, + "grad_norm": 28.981660842895508, + "learning_rate": 1.737592698231603e-05, + "loss": 3.0778, + "step": 3052 + }, + { + "epoch": 0.5225055622111929, + "grad_norm": 120.67489624023438, + "learning_rate": 1.738163148887621e-05, + "loss": 4.2964, + "step": 3053 + }, + { + "epoch": 0.5226767071709738, + "grad_norm": 158.1115264892578, + "learning_rate": 1.7387335995436394e-05, + "loss": 8.5312, + "step": 3054 + }, + { + "epoch": 0.5228478521307548, + "grad_norm": 28.185558319091797, + "learning_rate": 1.7393040501996577e-05, + "loss": 3.537, + "step": 3055 + }, + { + "epoch": 0.5230189970905357, + "grad_norm": 9.107454299926758, + "learning_rate": 1.739874500855676e-05, + "loss": 1.0684, + "step": 3056 + }, + { + "epoch": 0.5231901420503167, + "grad_norm": 36.81668472290039, + "learning_rate": 1.740444951511694e-05, + "loss": 3.8926, + "step": 3057 + }, + { + "epoch": 0.5233612870100975, + "grad_norm": 26.352327346801758, + "learning_rate": 1.7410154021677124e-05, + "loss": 2.664, + "step": 3058 + }, + { + "epoch": 0.5235324319698785, + "grad_norm": 21.38902473449707, + "learning_rate": 1.741585852823731e-05, + "loss": 2.2566, + "step": 3059 + }, + { + "epoch": 0.5237035769296594, + "grad_norm": 10.19254207611084, + "learning_rate": 
1.742156303479749e-05, + "loss": 0.8717, + "step": 3060 + }, + { + "epoch": 0.5238747218894404, + "grad_norm": 19.25916862487793, + "learning_rate": 1.7427267541357674e-05, + "loss": 1.3792, + "step": 3061 + }, + { + "epoch": 0.5240458668492213, + "grad_norm": 21.88836669921875, + "learning_rate": 1.7432972047917857e-05, + "loss": 2.2543, + "step": 3062 + }, + { + "epoch": 0.5242170118090023, + "grad_norm": 30.14661979675293, + "learning_rate": 1.743867655447804e-05, + "loss": 3.4672, + "step": 3063 + }, + { + "epoch": 0.5243881567687831, + "grad_norm": 25.134571075439453, + "learning_rate": 1.744438106103822e-05, + "loss": 2.5836, + "step": 3064 + }, + { + "epoch": 0.5245593017285641, + "grad_norm": 21.906818389892578, + "learning_rate": 1.7450085567598404e-05, + "loss": 2.4618, + "step": 3065 + }, + { + "epoch": 0.524730446688345, + "grad_norm": 31.330976486206055, + "learning_rate": 1.7455790074158587e-05, + "loss": 3.1397, + "step": 3066 + }, + { + "epoch": 0.524901591648126, + "grad_norm": 6.355524063110352, + "learning_rate": 1.7461494580718767e-05, + "loss": 0.5091, + "step": 3067 + }, + { + "epoch": 0.5250727366079069, + "grad_norm": 110.98942565917969, + "learning_rate": 1.746719908727895e-05, + "loss": 7.9114, + "step": 3068 + }, + { + "epoch": 0.5252438815676879, + "grad_norm": 31.17119789123535, + "learning_rate": 1.7472903593839134e-05, + "loss": 3.2594, + "step": 3069 + }, + { + "epoch": 0.5254150265274687, + "grad_norm": 24.364032745361328, + "learning_rate": 1.7478608100399318e-05, + "loss": 1.9217, + "step": 3070 + }, + { + "epoch": 0.5255861714872497, + "grad_norm": 34.264041900634766, + "learning_rate": 1.7484312606959498e-05, + "loss": 3.9812, + "step": 3071 + }, + { + "epoch": 0.5257573164470306, + "grad_norm": 27.54375648498535, + "learning_rate": 1.749001711351968e-05, + "loss": 2.8373, + "step": 3072 + }, + { + "epoch": 0.5259284614068116, + "grad_norm": 58.27510452270508, + "learning_rate": 1.7495721620079864e-05, + "loss": 7.2686, + "step": 3073 + }, + { + "epoch": 0.5260996063665925, + "grad_norm": 27.861116409301758, + "learning_rate": 1.7501426126640044e-05, + "loss": 3.2877, + "step": 3074 + }, + { + "epoch": 0.5262707513263735, + "grad_norm": 28.097177505493164, + "learning_rate": 1.7507130633200228e-05, + "loss": 2.3413, + "step": 3075 + }, + { + "epoch": 0.5264418962861543, + "grad_norm": 30.74901008605957, + "learning_rate": 1.751283513976041e-05, + "loss": 3.2284, + "step": 3076 + }, + { + "epoch": 0.5266130412459353, + "grad_norm": 5.434010982513428, + "learning_rate": 1.751853964632059e-05, + "loss": 0.5515, + "step": 3077 + }, + { + "epoch": 0.5267841862057162, + "grad_norm": 19.591594696044922, + "learning_rate": 1.7524244152880774e-05, + "loss": 1.8104, + "step": 3078 + }, + { + "epoch": 0.5269553311654972, + "grad_norm": 27.989707946777344, + "learning_rate": 1.7529948659440958e-05, + "loss": 2.4876, + "step": 3079 + }, + { + "epoch": 0.5271264761252781, + "grad_norm": 45.50398635864258, + "learning_rate": 1.753565316600114e-05, + "loss": 6.9276, + "step": 3080 + }, + { + "epoch": 0.5272976210850591, + "grad_norm": 29.907915115356445, + "learning_rate": 1.754135767256132e-05, + "loss": 3.8381, + "step": 3081 + }, + { + "epoch": 0.5274687660448399, + "grad_norm": 22.03485679626465, + "learning_rate": 1.7547062179121508e-05, + "loss": 1.8432, + "step": 3082 + }, + { + "epoch": 0.5276399110046209, + "grad_norm": 41.72187042236328, + "learning_rate": 1.755276668568169e-05, + "loss": 6.81, + "step": 3083 + }, + { + "epoch": 0.5278110559644018, + 
"grad_norm": 15.85753345489502, + "learning_rate": 1.755847119224187e-05, + "loss": 1.1867, + "step": 3084 + }, + { + "epoch": 0.5279822009241828, + "grad_norm": 14.52872085571289, + "learning_rate": 1.7564175698802055e-05, + "loss": 1.02, + "step": 3085 + }, + { + "epoch": 0.5281533458839637, + "grad_norm": 47.226070404052734, + "learning_rate": 1.7569880205362238e-05, + "loss": 6.5701, + "step": 3086 + }, + { + "epoch": 0.5283244908437447, + "grad_norm": 26.31117820739746, + "learning_rate": 1.757558471192242e-05, + "loss": 2.8588, + "step": 3087 + }, + { + "epoch": 0.5284956358035255, + "grad_norm": 24.817096710205078, + "learning_rate": 1.75812892184826e-05, + "loss": 2.5557, + "step": 3088 + }, + { + "epoch": 0.5286667807633065, + "grad_norm": 3.8697149753570557, + "learning_rate": 1.7586993725042785e-05, + "loss": 0.447, + "step": 3089 + }, + { + "epoch": 0.5288379257230874, + "grad_norm": 27.01019287109375, + "learning_rate": 1.7592698231602968e-05, + "loss": 2.6394, + "step": 3090 + }, + { + "epoch": 0.5290090706828684, + "grad_norm": 3.1552348136901855, + "learning_rate": 1.7598402738163148e-05, + "loss": 0.4523, + "step": 3091 + }, + { + "epoch": 0.5291802156426493, + "grad_norm": 30.454021453857422, + "learning_rate": 1.760410724472333e-05, + "loss": 3.3332, + "step": 3092 + }, + { + "epoch": 0.5293513606024303, + "grad_norm": 2.6408188343048096, + "learning_rate": 1.7609811751283515e-05, + "loss": 0.4075, + "step": 3093 + }, + { + "epoch": 0.5295225055622111, + "grad_norm": 27.623132705688477, + "learning_rate": 1.76155162578437e-05, + "loss": 2.7079, + "step": 3094 + }, + { + "epoch": 0.5296936505219921, + "grad_norm": 22.717605590820312, + "learning_rate": 1.762122076440388e-05, + "loss": 2.2378, + "step": 3095 + }, + { + "epoch": 0.529864795481773, + "grad_norm": 50.63970184326172, + "learning_rate": 1.7626925270964062e-05, + "loss": 3.3046, + "step": 3096 + }, + { + "epoch": 0.530035940441554, + "grad_norm": 47.14366912841797, + "learning_rate": 1.7632629777524245e-05, + "loss": 6.9276, + "step": 3097 + }, + { + "epoch": 0.5302070854013349, + "grad_norm": 26.201753616333008, + "learning_rate": 1.7638334284084425e-05, + "loss": 2.3728, + "step": 3098 + }, + { + "epoch": 0.5303782303611159, + "grad_norm": 33.462398529052734, + "learning_rate": 1.764403879064461e-05, + "loss": 6.8023, + "step": 3099 + }, + { + "epoch": 0.5305493753208969, + "grad_norm": 32.51939010620117, + "learning_rate": 1.7649743297204792e-05, + "loss": 3.992, + "step": 3100 + }, + { + "epoch": 0.5307205202806777, + "grad_norm": 14.161356925964355, + "learning_rate": 1.7655447803764975e-05, + "loss": 1.0058, + "step": 3101 + }, + { + "epoch": 0.5308916652404587, + "grad_norm": 91.61168670654297, + "learning_rate": 1.7661152310325155e-05, + "loss": 6.6935, + "step": 3102 + }, + { + "epoch": 0.5310628102002396, + "grad_norm": 26.40794563293457, + "learning_rate": 1.766685681688534e-05, + "loss": 2.5008, + "step": 3103 + }, + { + "epoch": 0.5312339551600206, + "grad_norm": 21.793699264526367, + "learning_rate": 1.7672561323445522e-05, + "loss": 2.0443, + "step": 3104 + }, + { + "epoch": 0.5314051001198015, + "grad_norm": 27.75925636291504, + "learning_rate": 1.7678265830005705e-05, + "loss": 2.893, + "step": 3105 + }, + { + "epoch": 0.5315762450795825, + "grad_norm": 22.48872947692871, + "learning_rate": 1.768397033656589e-05, + "loss": 1.8274, + "step": 3106 + }, + { + "epoch": 0.5317473900393633, + "grad_norm": 21.972978591918945, + "learning_rate": 1.7689674843126072e-05, + "loss": 2.0134, + "step": 3107 + 
}, + { + "epoch": 0.5319185349991443, + "grad_norm": 4.393357753753662, + "learning_rate": 1.7695379349686252e-05, + "loss": 0.4006, + "step": 3108 + }, + { + "epoch": 0.5320896799589252, + "grad_norm": 15.986166000366211, + "learning_rate": 1.7701083856246435e-05, + "loss": 1.0921, + "step": 3109 + }, + { + "epoch": 0.5322608249187062, + "grad_norm": 23.317607879638672, + "learning_rate": 1.770678836280662e-05, + "loss": 2.5234, + "step": 3110 + }, + { + "epoch": 0.5324319698784871, + "grad_norm": 81.8624267578125, + "learning_rate": 1.7712492869366802e-05, + "loss": 3.4206, + "step": 3111 + }, + { + "epoch": 0.532603114838268, + "grad_norm": 46.01921844482422, + "learning_rate": 1.7718197375926982e-05, + "loss": 3.2694, + "step": 3112 + }, + { + "epoch": 0.5327742597980489, + "grad_norm": 14.079997062683105, + "learning_rate": 1.7723901882487166e-05, + "loss": 1.1213, + "step": 3113 + }, + { + "epoch": 0.5329454047578299, + "grad_norm": 27.70348358154297, + "learning_rate": 1.772960638904735e-05, + "loss": 2.9553, + "step": 3114 + }, + { + "epoch": 0.5331165497176108, + "grad_norm": 13.08663558959961, + "learning_rate": 1.773531089560753e-05, + "loss": 0.9058, + "step": 3115 + }, + { + "epoch": 0.5332876946773918, + "grad_norm": 5.895364761352539, + "learning_rate": 1.7741015402167712e-05, + "loss": 0.5572, + "step": 3116 + }, + { + "epoch": 0.5334588396371727, + "grad_norm": 14.521390914916992, + "learning_rate": 1.7746719908727896e-05, + "loss": 1.2956, + "step": 3117 + }, + { + "epoch": 0.5336299845969537, + "grad_norm": 5.561517238616943, + "learning_rate": 1.775242441528808e-05, + "loss": 0.7001, + "step": 3118 + }, + { + "epoch": 0.5338011295567345, + "grad_norm": 12.158028602600098, + "learning_rate": 1.775812892184826e-05, + "loss": 0.8123, + "step": 3119 + }, + { + "epoch": 0.5339722745165155, + "grad_norm": 32.72988510131836, + "learning_rate": 1.7763833428408442e-05, + "loss": 2.9845, + "step": 3120 + }, + { + "epoch": 0.5341434194762964, + "grad_norm": 31.350831985473633, + "learning_rate": 1.7769537934968626e-05, + "loss": 3.6956, + "step": 3121 + }, + { + "epoch": 0.5343145644360774, + "grad_norm": 19.63844871520996, + "learning_rate": 1.7775242441528806e-05, + "loss": 2.0279, + "step": 3122 + }, + { + "epoch": 0.5344857093958583, + "grad_norm": 10.440444946289062, + "learning_rate": 1.778094694808899e-05, + "loss": 0.8947, + "step": 3123 + }, + { + "epoch": 0.5346568543556393, + "grad_norm": 28.158235549926758, + "learning_rate": 1.7786651454649173e-05, + "loss": 2.9307, + "step": 3124 + }, + { + "epoch": 0.5348279993154201, + "grad_norm": 25.009632110595703, + "learning_rate": 1.7792355961209356e-05, + "loss": 2.3439, + "step": 3125 + }, + { + "epoch": 0.5349991442752011, + "grad_norm": 25.99068832397461, + "learning_rate": 1.7798060467769536e-05, + "loss": 2.681, + "step": 3126 + }, + { + "epoch": 0.535170289234982, + "grad_norm": 16.541526794433594, + "learning_rate": 1.780376497432972e-05, + "loss": 1.7645, + "step": 3127 + }, + { + "epoch": 0.535341434194763, + "grad_norm": 32.52701950073242, + "learning_rate": 1.7809469480889906e-05, + "loss": 3.4399, + "step": 3128 + }, + { + "epoch": 0.5355125791545439, + "grad_norm": 1.9595259428024292, + "learning_rate": 1.7815173987450086e-05, + "loss": 0.3411, + "step": 3129 + }, + { + "epoch": 0.5356837241143249, + "grad_norm": 22.871707916259766, + "learning_rate": 1.782087849401027e-05, + "loss": 2.7152, + "step": 3130 + }, + { + "epoch": 0.5358548690741057, + "grad_norm": 30.88572120666504, + "learning_rate": 
1.7826583000570453e-05, + "loss": 3.0383, + "step": 3131 + }, + { + "epoch": 0.5360260140338867, + "grad_norm": 24.158727645874023, + "learning_rate": 1.7832287507130636e-05, + "loss": 2.7594, + "step": 3132 + }, + { + "epoch": 0.5361971589936676, + "grad_norm": 19.16653823852539, + "learning_rate": 1.7837992013690816e-05, + "loss": 1.7749, + "step": 3133 + }, + { + "epoch": 0.5363683039534486, + "grad_norm": 11.925354957580566, + "learning_rate": 1.7843696520251e-05, + "loss": 0.8569, + "step": 3134 + }, + { + "epoch": 0.5365394489132295, + "grad_norm": 20.42278289794922, + "learning_rate": 1.7849401026811183e-05, + "loss": 1.9146, + "step": 3135 + }, + { + "epoch": 0.5367105938730105, + "grad_norm": 36.13545227050781, + "learning_rate": 1.7855105533371363e-05, + "loss": 4.4798, + "step": 3136 + }, + { + "epoch": 0.5368817388327913, + "grad_norm": 4.70065975189209, + "learning_rate": 1.7860810039931546e-05, + "loss": 0.4304, + "step": 3137 + }, + { + "epoch": 0.5370528837925723, + "grad_norm": 24.28241539001465, + "learning_rate": 1.786651454649173e-05, + "loss": 2.4005, + "step": 3138 + }, + { + "epoch": 0.5372240287523532, + "grad_norm": 14.650952339172363, + "learning_rate": 1.787221905305191e-05, + "loss": 0.968, + "step": 3139 + }, + { + "epoch": 0.5373951737121342, + "grad_norm": 16.861696243286133, + "learning_rate": 1.7877923559612093e-05, + "loss": 1.7277, + "step": 3140 + }, + { + "epoch": 0.5375663186719151, + "grad_norm": 5.233786106109619, + "learning_rate": 1.7883628066172276e-05, + "loss": 0.7488, + "step": 3141 + }, + { + "epoch": 0.537737463631696, + "grad_norm": 32.38574981689453, + "learning_rate": 1.788933257273246e-05, + "loss": 3.4367, + "step": 3142 + }, + { + "epoch": 0.5379086085914769, + "grad_norm": 75.13265991210938, + "learning_rate": 1.789503707929264e-05, + "loss": 3.4272, + "step": 3143 + }, + { + "epoch": 0.5380797535512579, + "grad_norm": 23.56121063232422, + "learning_rate": 1.7900741585852823e-05, + "loss": 2.2267, + "step": 3144 + }, + { + "epoch": 0.5382508985110388, + "grad_norm": 6.575436592102051, + "learning_rate": 1.7906446092413007e-05, + "loss": 0.7715, + "step": 3145 + }, + { + "epoch": 0.5384220434708198, + "grad_norm": 30.233795166015625, + "learning_rate": 1.7912150598973187e-05, + "loss": 2.2247, + "step": 3146 + }, + { + "epoch": 0.5385931884306007, + "grad_norm": 18.158550262451172, + "learning_rate": 1.791785510553337e-05, + "loss": 1.4116, + "step": 3147 + }, + { + "epoch": 0.5387643333903817, + "grad_norm": 25.578800201416016, + "learning_rate": 1.7923559612093553e-05, + "loss": 2.237, + "step": 3148 + }, + { + "epoch": 0.5389354783501625, + "grad_norm": 4.3977460861206055, + "learning_rate": 1.7929264118653737e-05, + "loss": 0.4421, + "step": 3149 + }, + { + "epoch": 0.5391066233099435, + "grad_norm": 23.86539649963379, + "learning_rate": 1.793496862521392e-05, + "loss": 2.2867, + "step": 3150 + }, + { + "epoch": 0.5392777682697245, + "grad_norm": 2.900665283203125, + "learning_rate": 1.7940673131774103e-05, + "loss": 0.3778, + "step": 3151 + }, + { + "epoch": 0.5394489132295054, + "grad_norm": 28.02079200744629, + "learning_rate": 1.7946377638334287e-05, + "loss": 2.756, + "step": 3152 + }, + { + "epoch": 0.5396200581892864, + "grad_norm": 27.565895080566406, + "learning_rate": 1.7952082144894467e-05, + "loss": 2.3044, + "step": 3153 + }, + { + "epoch": 0.5397912031490673, + "grad_norm": 35.14018630981445, + "learning_rate": 1.795778665145465e-05, + "loss": 4.3437, + "step": 3154 + }, + { + "epoch": 0.5399623481088482, + 
"grad_norm": 24.932573318481445, + "learning_rate": 1.7963491158014834e-05, + "loss": 2.2505, + "step": 3155 + }, + { + "epoch": 0.5401334930686291, + "grad_norm": 26.866313934326172, + "learning_rate": 1.7969195664575017e-05, + "loss": 2.7324, + "step": 3156 + }, + { + "epoch": 0.5403046380284101, + "grad_norm": 22.461328506469727, + "learning_rate": 1.7974900171135197e-05, + "loss": 2.1863, + "step": 3157 + }, + { + "epoch": 0.540475782988191, + "grad_norm": 16.967121124267578, + "learning_rate": 1.798060467769538e-05, + "loss": 1.0429, + "step": 3158 + }, + { + "epoch": 0.540646927947972, + "grad_norm": 116.18841552734375, + "learning_rate": 1.7986309184255564e-05, + "loss": 3.4443, + "step": 3159 + }, + { + "epoch": 0.5408180729077529, + "grad_norm": 28.559480667114258, + "learning_rate": 1.7992013690815744e-05, + "loss": 2.4973, + "step": 3160 + }, + { + "epoch": 0.5409892178675338, + "grad_norm": 3.590916395187378, + "learning_rate": 1.7997718197375927e-05, + "loss": 0.4966, + "step": 3161 + }, + { + "epoch": 0.5411603628273147, + "grad_norm": 78.85108947753906, + "learning_rate": 1.800342270393611e-05, + "loss": 4.2312, + "step": 3162 + }, + { + "epoch": 0.5413315077870957, + "grad_norm": 25.83539390563965, + "learning_rate": 1.8009127210496294e-05, + "loss": 2.4909, + "step": 3163 + }, + { + "epoch": 0.5415026527468766, + "grad_norm": 4.292176246643066, + "learning_rate": 1.8014831717056474e-05, + "loss": 0.4314, + "step": 3164 + }, + { + "epoch": 0.5416737977066576, + "grad_norm": 6.629253387451172, + "learning_rate": 1.8020536223616657e-05, + "loss": 0.7743, + "step": 3165 + }, + { + "epoch": 0.5418449426664385, + "grad_norm": 22.770082473754883, + "learning_rate": 1.802624073017684e-05, + "loss": 2.2334, + "step": 3166 + }, + { + "epoch": 0.5420160876262194, + "grad_norm": 26.48427963256836, + "learning_rate": 1.803194523673702e-05, + "loss": 3.4341, + "step": 3167 + }, + { + "epoch": 0.5421872325860003, + "grad_norm": 9.429801940917969, + "learning_rate": 1.8037649743297204e-05, + "loss": 0.8092, + "step": 3168 + }, + { + "epoch": 0.5423583775457813, + "grad_norm": 56.79134750366211, + "learning_rate": 1.8043354249857387e-05, + "loss": 7.2408, + "step": 3169 + }, + { + "epoch": 0.5425295225055622, + "grad_norm": 26.484098434448242, + "learning_rate": 1.804905875641757e-05, + "loss": 2.4684, + "step": 3170 + }, + { + "epoch": 0.5427006674653432, + "grad_norm": 21.694990158081055, + "learning_rate": 1.805476326297775e-05, + "loss": 2.1088, + "step": 3171 + }, + { + "epoch": 0.5428718124251241, + "grad_norm": 23.824108123779297, + "learning_rate": 1.8060467769537934e-05, + "loss": 2.7028, + "step": 3172 + }, + { + "epoch": 0.543042957384905, + "grad_norm": 23.9963321685791, + "learning_rate": 1.806617227609812e-05, + "loss": 2.1508, + "step": 3173 + }, + { + "epoch": 0.5432141023446859, + "grad_norm": 23.810443878173828, + "learning_rate": 1.80718767826583e-05, + "loss": 3.0015, + "step": 3174 + }, + { + "epoch": 0.5433852473044669, + "grad_norm": 38.47050857543945, + "learning_rate": 1.8077581289218484e-05, + "loss": 7.0851, + "step": 3175 + }, + { + "epoch": 0.5435563922642478, + "grad_norm": 26.14175033569336, + "learning_rate": 1.8083285795778668e-05, + "loss": 2.8927, + "step": 3176 + }, + { + "epoch": 0.5437275372240288, + "grad_norm": 34.895294189453125, + "learning_rate": 1.8088990302338848e-05, + "loss": 6.4431, + "step": 3177 + }, + { + "epoch": 0.5438986821838097, + "grad_norm": 30.46366310119629, + "learning_rate": 1.809469480889903e-05, + "loss": 3.8774, + "step": 
3178 + }, + { + "epoch": 0.5440698271435906, + "grad_norm": 2.045729637145996, + "learning_rate": 1.8100399315459214e-05, + "loss": 0.3887, + "step": 3179 + }, + { + "epoch": 0.5442409721033715, + "grad_norm": 23.526275634765625, + "learning_rate": 1.8106103822019398e-05, + "loss": 2.2568, + "step": 3180 + }, + { + "epoch": 0.5444121170631525, + "grad_norm": 37.37553024291992, + "learning_rate": 1.8111808328579578e-05, + "loss": 5.5636, + "step": 3181 + }, + { + "epoch": 0.5445832620229334, + "grad_norm": 2.853957176208496, + "learning_rate": 1.811751283513976e-05, + "loss": 0.3745, + "step": 3182 + }, + { + "epoch": 0.5447544069827144, + "grad_norm": 3.5641119480133057, + "learning_rate": 1.8123217341699944e-05, + "loss": 0.424, + "step": 3183 + }, + { + "epoch": 0.5449255519424953, + "grad_norm": 8.759005546569824, + "learning_rate": 1.8128921848260124e-05, + "loss": 0.7256, + "step": 3184 + }, + { + "epoch": 0.5450966969022762, + "grad_norm": 54.43397521972656, + "learning_rate": 1.8134626354820308e-05, + "loss": 7.7319, + "step": 3185 + }, + { + "epoch": 0.5452678418620571, + "grad_norm": 26.35443878173828, + "learning_rate": 1.814033086138049e-05, + "loss": 3.1477, + "step": 3186 + }, + { + "epoch": 0.5454389868218381, + "grad_norm": 18.872291564941406, + "learning_rate": 1.8146035367940675e-05, + "loss": 1.8724, + "step": 3187 + }, + { + "epoch": 0.545610131781619, + "grad_norm": 22.673784255981445, + "learning_rate": 1.8151739874500855e-05, + "loss": 1.9899, + "step": 3188 + }, + { + "epoch": 0.5457812767414, + "grad_norm": 9.217958450317383, + "learning_rate": 1.8157444381061038e-05, + "loss": 0.726, + "step": 3189 + }, + { + "epoch": 0.5459524217011809, + "grad_norm": 2.148630380630493, + "learning_rate": 1.816314888762122e-05, + "loss": 0.323, + "step": 3190 + }, + { + "epoch": 0.5461235666609618, + "grad_norm": 26.988340377807617, + "learning_rate": 1.81688533941814e-05, + "loss": 2.586, + "step": 3191 + }, + { + "epoch": 0.5462947116207427, + "grad_norm": 37.6932373046875, + "learning_rate": 1.8174557900741585e-05, + "loss": 2.9146, + "step": 3192 + }, + { + "epoch": 0.5464658565805237, + "grad_norm": 74.42720794677734, + "learning_rate": 1.8180262407301768e-05, + "loss": 3.2535, + "step": 3193 + }, + { + "epoch": 0.5466370015403046, + "grad_norm": 29.757360458374023, + "learning_rate": 1.818596691386195e-05, + "loss": 2.8882, + "step": 3194 + }, + { + "epoch": 0.5468081465000856, + "grad_norm": 15.420557975769043, + "learning_rate": 1.819167142042213e-05, + "loss": 1.6278, + "step": 3195 + }, + { + "epoch": 0.5469792914598665, + "grad_norm": 31.367387771606445, + "learning_rate": 1.8197375926982318e-05, + "loss": 2.9209, + "step": 3196 + }, + { + "epoch": 0.5471504364196474, + "grad_norm": 28.30303382873535, + "learning_rate": 1.82030804335425e-05, + "loss": 3.066, + "step": 3197 + }, + { + "epoch": 0.5473215813794283, + "grad_norm": 27.540369033813477, + "learning_rate": 1.820878494010268e-05, + "loss": 3.3267, + "step": 3198 + }, + { + "epoch": 0.5474927263392093, + "grad_norm": 4.438743591308594, + "learning_rate": 1.8214489446662865e-05, + "loss": 0.3722, + "step": 3199 + }, + { + "epoch": 0.5476638712989902, + "grad_norm": 29.85404396057129, + "learning_rate": 1.822019395322305e-05, + "loss": 4.0139, + "step": 3200 + }, + { + "epoch": 0.5478350162587712, + "grad_norm": 28.56346893310547, + "learning_rate": 1.8225898459783232e-05, + "loss": 3.6119, + "step": 3201 + }, + { + "epoch": 0.5480061612185522, + "grad_norm": 29.419742584228516, + "learning_rate": 
1.8231602966343412e-05, + "loss": 2.9087, + "step": 3202 + }, + { + "epoch": 0.548177306178333, + "grad_norm": 44.72222900390625, + "learning_rate": 1.8237307472903595e-05, + "loss": 6.7043, + "step": 3203 + }, + { + "epoch": 0.548348451138114, + "grad_norm": 21.762168884277344, + "learning_rate": 1.824301197946378e-05, + "loss": 1.9849, + "step": 3204 + }, + { + "epoch": 0.5485195960978949, + "grad_norm": 26.09598731994629, + "learning_rate": 1.824871648602396e-05, + "loss": 2.6737, + "step": 3205 + }, + { + "epoch": 0.5486907410576759, + "grad_norm": 32.6449089050293, + "learning_rate": 1.8254420992584142e-05, + "loss": 2.7534, + "step": 3206 + }, + { + "epoch": 0.5488618860174568, + "grad_norm": 20.140134811401367, + "learning_rate": 1.8260125499144325e-05, + "loss": 2.0024, + "step": 3207 + }, + { + "epoch": 0.5490330309772378, + "grad_norm": 8.021845817565918, + "learning_rate": 1.8265830005704505e-05, + "loss": 0.8484, + "step": 3208 + }, + { + "epoch": 0.5492041759370186, + "grad_norm": 23.706680297851562, + "learning_rate": 1.827153451226469e-05, + "loss": 2.5228, + "step": 3209 + }, + { + "epoch": 0.5493753208967996, + "grad_norm": 25.105031967163086, + "learning_rate": 1.8277239018824872e-05, + "loss": 2.4342, + "step": 3210 + }, + { + "epoch": 0.5495464658565805, + "grad_norm": 16.53352165222168, + "learning_rate": 1.8282943525385055e-05, + "loss": 1.4315, + "step": 3211 + }, + { + "epoch": 0.5497176108163615, + "grad_norm": 24.55224609375, + "learning_rate": 1.8288648031945235e-05, + "loss": 2.469, + "step": 3212 + }, + { + "epoch": 0.5498887557761424, + "grad_norm": 3.4264721870422363, + "learning_rate": 1.829435253850542e-05, + "loss": 0.3677, + "step": 3213 + }, + { + "epoch": 0.5500599007359234, + "grad_norm": 20.305509567260742, + "learning_rate": 1.8300057045065602e-05, + "loss": 2.3214, + "step": 3214 + }, + { + "epoch": 0.5502310456957042, + "grad_norm": 27.69756507873535, + "learning_rate": 1.8305761551625782e-05, + "loss": 2.8746, + "step": 3215 + }, + { + "epoch": 0.5504021906554852, + "grad_norm": 100.86264038085938, + "learning_rate": 1.8311466058185965e-05, + "loss": 7.5686, + "step": 3216 + }, + { + "epoch": 0.5505733356152661, + "grad_norm": 26.603628158569336, + "learning_rate": 1.831717056474615e-05, + "loss": 2.5639, + "step": 3217 + }, + { + "epoch": 0.5507444805750471, + "grad_norm": 31.449655532836914, + "learning_rate": 1.8322875071306332e-05, + "loss": 3.9795, + "step": 3218 + }, + { + "epoch": 0.550915625534828, + "grad_norm": 25.562639236450195, + "learning_rate": 1.8328579577866516e-05, + "loss": 3.0795, + "step": 3219 + }, + { + "epoch": 0.551086770494609, + "grad_norm": 4.988560199737549, + "learning_rate": 1.83342840844267e-05, + "loss": 0.4445, + "step": 3220 + }, + { + "epoch": 0.5512579154543898, + "grad_norm": 31.045183181762695, + "learning_rate": 1.8339988590986882e-05, + "loss": 6.4413, + "step": 3221 + }, + { + "epoch": 0.5514290604141708, + "grad_norm": 32.938106536865234, + "learning_rate": 1.8345693097547062e-05, + "loss": 4.0821, + "step": 3222 + }, + { + "epoch": 0.5516002053739517, + "grad_norm": 23.498254776000977, + "learning_rate": 1.8351397604107246e-05, + "loss": 2.8752, + "step": 3223 + }, + { + "epoch": 0.5517713503337327, + "grad_norm": 27.559247970581055, + "learning_rate": 1.835710211066743e-05, + "loss": 3.3216, + "step": 3224 + }, + { + "epoch": 0.5519424952935136, + "grad_norm": 46.420135498046875, + "learning_rate": 1.8362806617227613e-05, + "loss": 6.9903, + "step": 3225 + }, + { + "epoch": 0.5521136402532946, + 
"grad_norm": 23.508155822753906, + "learning_rate": 1.8368511123787793e-05, + "loss": 2.1877, + "step": 3226 + }, + { + "epoch": 0.5522847852130754, + "grad_norm": 20.4776611328125, + "learning_rate": 1.8374215630347976e-05, + "loss": 1.8942, + "step": 3227 + }, + { + "epoch": 0.5524559301728564, + "grad_norm": 15.294054985046387, + "learning_rate": 1.837992013690816e-05, + "loss": 1.2082, + "step": 3228 + }, + { + "epoch": 0.5526270751326373, + "grad_norm": 22.51180076599121, + "learning_rate": 1.838562464346834e-05, + "loss": 2.2929, + "step": 3229 + }, + { + "epoch": 0.5527982200924183, + "grad_norm": 21.741634368896484, + "learning_rate": 1.8391329150028523e-05, + "loss": 1.884, + "step": 3230 + }, + { + "epoch": 0.5529693650521992, + "grad_norm": 4.330467224121094, + "learning_rate": 1.8397033656588706e-05, + "loss": 0.4163, + "step": 3231 + }, + { + "epoch": 0.5531405100119802, + "grad_norm": 26.344017028808594, + "learning_rate": 1.840273816314889e-05, + "loss": 2.5767, + "step": 3232 + }, + { + "epoch": 0.553311654971761, + "grad_norm": 53.116172790527344, + "learning_rate": 1.840844266970907e-05, + "loss": 2.7426, + "step": 3233 + }, + { + "epoch": 0.553482799931542, + "grad_norm": 15.442861557006836, + "learning_rate": 1.8414147176269253e-05, + "loss": 0.9586, + "step": 3234 + }, + { + "epoch": 0.5536539448913229, + "grad_norm": 28.15229606628418, + "learning_rate": 1.8419851682829436e-05, + "loss": 2.9423, + "step": 3235 + }, + { + "epoch": 0.5538250898511039, + "grad_norm": 21.91160011291504, + "learning_rate": 1.8425556189389616e-05, + "loss": 2.3152, + "step": 3236 + }, + { + "epoch": 0.5539962348108848, + "grad_norm": 21.20878028869629, + "learning_rate": 1.84312606959498e-05, + "loss": 2.117, + "step": 3237 + }, + { + "epoch": 0.5541673797706658, + "grad_norm": 1.7572641372680664, + "learning_rate": 1.8436965202509983e-05, + "loss": 0.3516, + "step": 3238 + }, + { + "epoch": 0.5543385247304466, + "grad_norm": 25.218217849731445, + "learning_rate": 1.8442669709070166e-05, + "loss": 2.5554, + "step": 3239 + }, + { + "epoch": 0.5545096696902276, + "grad_norm": 30.133291244506836, + "learning_rate": 1.8448374215630346e-05, + "loss": 3.078, + "step": 3240 + }, + { + "epoch": 0.5546808146500085, + "grad_norm": 30.298227310180664, + "learning_rate": 1.845407872219053e-05, + "loss": 3.2276, + "step": 3241 + }, + { + "epoch": 0.5548519596097895, + "grad_norm": 31.560077667236328, + "learning_rate": 1.8459783228750716e-05, + "loss": 3.713, + "step": 3242 + }, + { + "epoch": 0.5550231045695704, + "grad_norm": 7.287442207336426, + "learning_rate": 1.8465487735310896e-05, + "loss": 0.4505, + "step": 3243 + }, + { + "epoch": 0.5551942495293514, + "grad_norm": 12.331917762756348, + "learning_rate": 1.847119224187108e-05, + "loss": 0.8588, + "step": 3244 + }, + { + "epoch": 0.5553653944891322, + "grad_norm": 44.76494216918945, + "learning_rate": 1.8476896748431263e-05, + "loss": 7.2343, + "step": 3245 + }, + { + "epoch": 0.5555365394489132, + "grad_norm": 32.351600646972656, + "learning_rate": 1.8482601254991443e-05, + "loss": 4.0445, + "step": 3246 + }, + { + "epoch": 0.5557076844086941, + "grad_norm": 28.279098510742188, + "learning_rate": 1.8488305761551627e-05, + "loss": 3.0796, + "step": 3247 + }, + { + "epoch": 0.5558788293684751, + "grad_norm": 28.361543655395508, + "learning_rate": 1.849401026811181e-05, + "loss": 3.3123, + "step": 3248 + }, + { + "epoch": 0.556049974328256, + "grad_norm": 41.510597229003906, + "learning_rate": 1.8499714774671993e-05, + "loss": 6.904, + "step": 
3249 + }, + { + "epoch": 0.556221119288037, + "grad_norm": 28.658105850219727, + "learning_rate": 1.8505419281232173e-05, + "loss": 3.0404, + "step": 3250 + }, + { + "epoch": 0.5563922642478178, + "grad_norm": 28.752214431762695, + "learning_rate": 1.8511123787792357e-05, + "loss": 3.7672, + "step": 3251 + }, + { + "epoch": 0.5565634092075988, + "grad_norm": 34.223060607910156, + "learning_rate": 1.851682829435254e-05, + "loss": 5.3153, + "step": 3252 + }, + { + "epoch": 0.5567345541673798, + "grad_norm": 12.981977462768555, + "learning_rate": 1.852253280091272e-05, + "loss": 0.9685, + "step": 3253 + }, + { + "epoch": 0.5569056991271607, + "grad_norm": 20.176815032958984, + "learning_rate": 1.8528237307472903e-05, + "loss": 1.9294, + "step": 3254 + }, + { + "epoch": 0.5570768440869417, + "grad_norm": 24.55870246887207, + "learning_rate": 1.8533941814033087e-05, + "loss": 2.8725, + "step": 3255 + }, + { + "epoch": 0.5572479890467226, + "grad_norm": 21.821077346801758, + "learning_rate": 1.853964632059327e-05, + "loss": 2.0436, + "step": 3256 + }, + { + "epoch": 0.5574191340065036, + "grad_norm": 23.027362823486328, + "learning_rate": 1.854535082715345e-05, + "loss": 2.1357, + "step": 3257 + }, + { + "epoch": 0.5575902789662844, + "grad_norm": 10.035751342773438, + "learning_rate": 1.8551055333713634e-05, + "loss": 0.6818, + "step": 3258 + }, + { + "epoch": 0.5577614239260654, + "grad_norm": 4.451612949371338, + "learning_rate": 1.8556759840273817e-05, + "loss": 0.4069, + "step": 3259 + }, + { + "epoch": 0.5579325688858463, + "grad_norm": 25.953588485717773, + "learning_rate": 1.8562464346833997e-05, + "loss": 2.8325, + "step": 3260 + }, + { + "epoch": 0.5581037138456273, + "grad_norm": 63.914024353027344, + "learning_rate": 1.856816885339418e-05, + "loss": 8.0674, + "step": 3261 + }, + { + "epoch": 0.5582748588054082, + "grad_norm": 11.405961990356445, + "learning_rate": 1.8573873359954364e-05, + "loss": 0.8008, + "step": 3262 + }, + { + "epoch": 0.5584460037651892, + "grad_norm": 21.461894989013672, + "learning_rate": 1.8579577866514547e-05, + "loss": 2.1028, + "step": 3263 + }, + { + "epoch": 0.55861714872497, + "grad_norm": 22.72207260131836, + "learning_rate": 1.8585282373074727e-05, + "loss": 2.0172, + "step": 3264 + }, + { + "epoch": 0.558788293684751, + "grad_norm": 27.586618423461914, + "learning_rate": 1.8590986879634914e-05, + "loss": 2.0431, + "step": 3265 + }, + { + "epoch": 0.5589594386445319, + "grad_norm": 26.892311096191406, + "learning_rate": 1.8596691386195097e-05, + "loss": 2.5198, + "step": 3266 + }, + { + "epoch": 0.5591305836043129, + "grad_norm": 22.420379638671875, + "learning_rate": 1.8602395892755277e-05, + "loss": 2.155, + "step": 3267 + }, + { + "epoch": 0.5593017285640938, + "grad_norm": 8.758207321166992, + "learning_rate": 1.860810039931546e-05, + "loss": 1.3864, + "step": 3268 + }, + { + "epoch": 0.5594728735238748, + "grad_norm": 8.072163581848145, + "learning_rate": 1.8613804905875644e-05, + "loss": 0.5867, + "step": 3269 + }, + { + "epoch": 0.5596440184836556, + "grad_norm": 6.457092761993408, + "learning_rate": 1.8619509412435827e-05, + "loss": 0.6464, + "step": 3270 + }, + { + "epoch": 0.5598151634434366, + "grad_norm": 7.5770392417907715, + "learning_rate": 1.8625213918996007e-05, + "loss": 0.8377, + "step": 3271 + }, + { + "epoch": 0.5599863084032175, + "grad_norm": 28.0118350982666, + "learning_rate": 1.863091842555619e-05, + "loss": 3.5048, + "step": 3272 + }, + { + "epoch": 0.5601574533629985, + "grad_norm": 75.3667984008789, + "learning_rate": 
1.8636622932116374e-05, + "loss": 7.3634, + "step": 3273 + }, + { + "epoch": 0.5603285983227794, + "grad_norm": 6.486256122589111, + "learning_rate": 1.8642327438676554e-05, + "loss": 0.772, + "step": 3274 + }, + { + "epoch": 0.5604997432825604, + "grad_norm": 18.678125381469727, + "learning_rate": 1.8648031945236737e-05, + "loss": 1.9006, + "step": 3275 + }, + { + "epoch": 0.5606708882423412, + "grad_norm": 7.29653263092041, + "learning_rate": 1.865373645179692e-05, + "loss": 1.075, + "step": 3276 + }, + { + "epoch": 0.5608420332021222, + "grad_norm": 2.164841890335083, + "learning_rate": 1.86594409583571e-05, + "loss": 0.3693, + "step": 3277 + }, + { + "epoch": 0.5610131781619031, + "grad_norm": 21.217857360839844, + "learning_rate": 1.8665145464917284e-05, + "loss": 2.2186, + "step": 3278 + }, + { + "epoch": 0.5611843231216841, + "grad_norm": 8.882852554321289, + "learning_rate": 1.8670849971477468e-05, + "loss": 0.9134, + "step": 3279 + }, + { + "epoch": 0.561355468081465, + "grad_norm": 17.709449768066406, + "learning_rate": 1.867655447803765e-05, + "loss": 1.5424, + "step": 3280 + }, + { + "epoch": 0.561526613041246, + "grad_norm": 5.205548286437988, + "learning_rate": 1.868225898459783e-05, + "loss": 0.6676, + "step": 3281 + }, + { + "epoch": 0.5616977580010268, + "grad_norm": 52.959259033203125, + "learning_rate": 1.8687963491158014e-05, + "loss": 7.2033, + "step": 3282 + }, + { + "epoch": 0.5618689029608078, + "grad_norm": 29.030555725097656, + "learning_rate": 1.8693667997718198e-05, + "loss": 2.9477, + "step": 3283 + }, + { + "epoch": 0.5620400479205887, + "grad_norm": 27.745031356811523, + "learning_rate": 1.8699372504278378e-05, + "loss": 2.719, + "step": 3284 + }, + { + "epoch": 0.5622111928803697, + "grad_norm": 25.971677780151367, + "learning_rate": 1.870507701083856e-05, + "loss": 2.706, + "step": 3285 + }, + { + "epoch": 0.5623823378401506, + "grad_norm": 27.33722686767578, + "learning_rate": 1.8710781517398744e-05, + "loss": 2.568, + "step": 3286 + }, + { + "epoch": 0.5625534827999316, + "grad_norm": 22.52666473388672, + "learning_rate": 1.8716486023958928e-05, + "loss": 2.3127, + "step": 3287 + }, + { + "epoch": 0.5627246277597124, + "grad_norm": 10.016031265258789, + "learning_rate": 1.872219053051911e-05, + "loss": 1.4001, + "step": 3288 + }, + { + "epoch": 0.5628957727194934, + "grad_norm": 24.30003547668457, + "learning_rate": 1.8727895037079295e-05, + "loss": 2.4201, + "step": 3289 + }, + { + "epoch": 0.5630669176792743, + "grad_norm": 6.622725009918213, + "learning_rate": 1.8733599543639478e-05, + "loss": 0.7098, + "step": 3290 + }, + { + "epoch": 0.5632380626390553, + "grad_norm": 24.1121883392334, + "learning_rate": 1.8739304050199658e-05, + "loss": 1.887, + "step": 3291 + }, + { + "epoch": 0.5634092075988362, + "grad_norm": 31.559614181518555, + "learning_rate": 1.874500855675984e-05, + "loss": 3.5751, + "step": 3292 + }, + { + "epoch": 0.5635803525586172, + "grad_norm": 33.04099655151367, + "learning_rate": 1.8750713063320025e-05, + "loss": 3.2233, + "step": 3293 + }, + { + "epoch": 0.563751497518398, + "grad_norm": 81.57552337646484, + "learning_rate": 1.8756417569880208e-05, + "loss": 3.8443, + "step": 3294 + }, + { + "epoch": 0.563922642478179, + "grad_norm": 30.438037872314453, + "learning_rate": 1.8762122076440388e-05, + "loss": 3.7816, + "step": 3295 + }, + { + "epoch": 0.5640937874379599, + "grad_norm": 7.756313323974609, + "learning_rate": 1.876782658300057e-05, + "loss": 0.7225, + "step": 3296 + }, + { + "epoch": 0.5642649323977409, + "grad_norm": 
28.59238624572754, + "learning_rate": 1.8773531089560755e-05, + "loss": 3.1516, + "step": 3297 + }, + { + "epoch": 0.5644360773575218, + "grad_norm": 25.167417526245117, + "learning_rate": 1.8779235596120935e-05, + "loss": 3.174, + "step": 3298 + }, + { + "epoch": 0.5646072223173028, + "grad_norm": 86.82372283935547, + "learning_rate": 1.8784940102681118e-05, + "loss": 4.5193, + "step": 3299 + }, + { + "epoch": 0.5647783672770836, + "grad_norm": 30.278440475463867, + "learning_rate": 1.87906446092413e-05, + "loss": 3.2045, + "step": 3300 + }, + { + "epoch": 0.5649495122368646, + "grad_norm": 34.26241683959961, + "learning_rate": 1.8796349115801485e-05, + "loss": 3.7586, + "step": 3301 + }, + { + "epoch": 0.5651206571966455, + "grad_norm": 20.874797821044922, + "learning_rate": 1.8802053622361665e-05, + "loss": 1.9123, + "step": 3302 + }, + { + "epoch": 0.5652918021564265, + "grad_norm": 26.034624099731445, + "learning_rate": 1.8807758128921848e-05, + "loss": 2.522, + "step": 3303 + }, + { + "epoch": 0.5654629471162075, + "grad_norm": 11.349614143371582, + "learning_rate": 1.881346263548203e-05, + "loss": 0.9236, + "step": 3304 + }, + { + "epoch": 0.5656340920759884, + "grad_norm": 10.266570091247559, + "learning_rate": 1.881916714204221e-05, + "loss": 0.6643, + "step": 3305 + }, + { + "epoch": 0.5658052370357693, + "grad_norm": 32.189842224121094, + "learning_rate": 1.8824871648602395e-05, + "loss": 4.5101, + "step": 3306 + }, + { + "epoch": 0.5659763819955502, + "grad_norm": 24.921152114868164, + "learning_rate": 1.883057615516258e-05, + "loss": 2.9263, + "step": 3307 + }, + { + "epoch": 0.5661475269553312, + "grad_norm": 35.14552307128906, + "learning_rate": 1.883628066172276e-05, + "loss": 4.0464, + "step": 3308 + }, + { + "epoch": 0.5663186719151121, + "grad_norm": 37.087039947509766, + "learning_rate": 1.8841985168282942e-05, + "loss": 4.0199, + "step": 3309 + }, + { + "epoch": 0.5664898168748931, + "grad_norm": 26.691438674926758, + "learning_rate": 1.8847689674843125e-05, + "loss": 3.2809, + "step": 3310 + }, + { + "epoch": 0.566660961834674, + "grad_norm": 31.133575439453125, + "learning_rate": 1.8853394181403312e-05, + "loss": 3.4234, + "step": 3311 + }, + { + "epoch": 0.566832106794455, + "grad_norm": 98.82320404052734, + "learning_rate": 1.8859098687963492e-05, + "loss": 7.426, + "step": 3312 + }, + { + "epoch": 0.5670032517542358, + "grad_norm": 26.13225746154785, + "learning_rate": 1.8864803194523675e-05, + "loss": 2.5794, + "step": 3313 + }, + { + "epoch": 0.5671743967140168, + "grad_norm": 28.947038650512695, + "learning_rate": 1.887050770108386e-05, + "loss": 2.7049, + "step": 3314 + }, + { + "epoch": 0.5673455416737977, + "grad_norm": 23.491085052490234, + "learning_rate": 1.887621220764404e-05, + "loss": 2.1517, + "step": 3315 + }, + { + "epoch": 0.5675166866335787, + "grad_norm": 17.27471351623535, + "learning_rate": 1.8881916714204222e-05, + "loss": 1.4919, + "step": 3316 + }, + { + "epoch": 0.5676878315933596, + "grad_norm": 34.11532974243164, + "learning_rate": 1.8887621220764405e-05, + "loss": 4.5321, + "step": 3317 + }, + { + "epoch": 0.5678589765531405, + "grad_norm": 19.040075302124023, + "learning_rate": 1.889332572732459e-05, + "loss": 1.6601, + "step": 3318 + }, + { + "epoch": 0.5680301215129214, + "grad_norm": 18.085039138793945, + "learning_rate": 1.889903023388477e-05, + "loss": 1.5302, + "step": 3319 + }, + { + "epoch": 0.5682012664727024, + "grad_norm": 27.968341827392578, + "learning_rate": 1.8904734740444952e-05, + "loss": 3.3977, + "step": 3320 + }, + { 
+ "epoch": 0.5683724114324833, + "grad_norm": 2.1676626205444336, + "learning_rate": 1.8910439247005136e-05, + "loss": 0.3746, + "step": 3321 + }, + { + "epoch": 0.5685435563922643, + "grad_norm": 3.0772573947906494, + "learning_rate": 1.8916143753565316e-05, + "loss": 0.4364, + "step": 3322 + }, + { + "epoch": 0.5687147013520452, + "grad_norm": 25.465309143066406, + "learning_rate": 1.89218482601255e-05, + "loss": 2.5141, + "step": 3323 + }, + { + "epoch": 0.5688858463118261, + "grad_norm": 28.62706184387207, + "learning_rate": 1.8927552766685682e-05, + "loss": 2.9376, + "step": 3324 + }, + { + "epoch": 0.569056991271607, + "grad_norm": 10.36950969696045, + "learning_rate": 1.8933257273245866e-05, + "loss": 0.9048, + "step": 3325 + }, + { + "epoch": 0.569228136231388, + "grad_norm": 22.50096893310547, + "learning_rate": 1.8938961779806046e-05, + "loss": 2.3555, + "step": 3326 + }, + { + "epoch": 0.5693992811911689, + "grad_norm": 25.440292358398438, + "learning_rate": 1.894466628636623e-05, + "loss": 2.5734, + "step": 3327 + }, + { + "epoch": 0.5695704261509499, + "grad_norm": 5.363638401031494, + "learning_rate": 1.8950370792926412e-05, + "loss": 0.4315, + "step": 3328 + }, + { + "epoch": 0.5697415711107308, + "grad_norm": 29.033611297607422, + "learning_rate": 1.8956075299486592e-05, + "loss": 2.7605, + "step": 3329 + }, + { + "epoch": 0.5699127160705117, + "grad_norm": 6.961116790771484, + "learning_rate": 1.8961779806046776e-05, + "loss": 1.2524, + "step": 3330 + }, + { + "epoch": 0.5700838610302926, + "grad_norm": 29.2668399810791, + "learning_rate": 1.896748431260696e-05, + "loss": 3.3542, + "step": 3331 + }, + { + "epoch": 0.5702550059900736, + "grad_norm": 38.82827377319336, + "learning_rate": 1.8973188819167143e-05, + "loss": 4.5204, + "step": 3332 + }, + { + "epoch": 0.5704261509498545, + "grad_norm": 32.07524871826172, + "learning_rate": 1.8978893325727326e-05, + "loss": 4.3224, + "step": 3333 + }, + { + "epoch": 0.5705972959096355, + "grad_norm": 3.1426124572753906, + "learning_rate": 1.898459783228751e-05, + "loss": 0.4036, + "step": 3334 + }, + { + "epoch": 0.5707684408694164, + "grad_norm": 19.389469146728516, + "learning_rate": 1.8990302338847693e-05, + "loss": 2.1048, + "step": 3335 + }, + { + "epoch": 0.5709395858291973, + "grad_norm": 17.071313858032227, + "learning_rate": 1.8996006845407873e-05, + "loss": 1.6332, + "step": 3336 + }, + { + "epoch": 0.5711107307889782, + "grad_norm": 7.998443603515625, + "learning_rate": 1.9001711351968056e-05, + "loss": 0.6125, + "step": 3337 + }, + { + "epoch": 0.5712818757487592, + "grad_norm": 27.566017150878906, + "learning_rate": 1.900741585852824e-05, + "loss": 3.0635, + "step": 3338 + }, + { + "epoch": 0.5714530207085401, + "grad_norm": 6.867462158203125, + "learning_rate": 1.901312036508842e-05, + "loss": 0.5083, + "step": 3339 + }, + { + "epoch": 0.5716241656683211, + "grad_norm": 24.942699432373047, + "learning_rate": 1.9018824871648603e-05, + "loss": 2.4329, + "step": 3340 + }, + { + "epoch": 0.571795310628102, + "grad_norm": 17.44595718383789, + "learning_rate": 1.9024529378208786e-05, + "loss": 1.2566, + "step": 3341 + }, + { + "epoch": 0.571966455587883, + "grad_norm": 30.833187103271484, + "learning_rate": 1.903023388476897e-05, + "loss": 2.7353, + "step": 3342 + }, + { + "epoch": 0.5721376005476638, + "grad_norm": 31.722270965576172, + "learning_rate": 1.903593839132915e-05, + "loss": 3.8463, + "step": 3343 + }, + { + "epoch": 0.5723087455074448, + "grad_norm": 12.909158706665039, + "learning_rate": 
1.9041642897889333e-05, + "loss": 1.0365, + "step": 3344 + }, + { + "epoch": 0.5724798904672257, + "grad_norm": 32.17844772338867, + "learning_rate": 1.9047347404449516e-05, + "loss": 3.2414, + "step": 3345 + }, + { + "epoch": 0.5726510354270067, + "grad_norm": 25.432022094726562, + "learning_rate": 1.9053051911009696e-05, + "loss": 2.4539, + "step": 3346 + }, + { + "epoch": 0.5728221803867876, + "grad_norm": 2.373732805252075, + "learning_rate": 1.905875641756988e-05, + "loss": 0.3677, + "step": 3347 + }, + { + "epoch": 0.5729933253465685, + "grad_norm": 40.49632263183594, + "learning_rate": 1.9064460924130063e-05, + "loss": 3.9831, + "step": 3348 + }, + { + "epoch": 0.5731644703063494, + "grad_norm": 1.9657033681869507, + "learning_rate": 1.9070165430690246e-05, + "loss": 0.3775, + "step": 3349 + }, + { + "epoch": 0.5733356152661304, + "grad_norm": 61.38923645019531, + "learning_rate": 1.9075869937250426e-05, + "loss": 7.6187, + "step": 3350 + }, + { + "epoch": 0.5735067602259113, + "grad_norm": 24.892297744750977, + "learning_rate": 1.908157444381061e-05, + "loss": 2.5702, + "step": 3351 + }, + { + "epoch": 0.5736779051856923, + "grad_norm": 27.634868621826172, + "learning_rate": 1.9087278950370793e-05, + "loss": 2.6693, + "step": 3352 + }, + { + "epoch": 0.5738490501454732, + "grad_norm": 30.543689727783203, + "learning_rate": 1.9092983456930973e-05, + "loss": 2.881, + "step": 3353 + }, + { + "epoch": 0.5740201951052541, + "grad_norm": 20.875457763671875, + "learning_rate": 1.9098687963491157e-05, + "loss": 2.0431, + "step": 3354 + }, + { + "epoch": 0.5741913400650351, + "grad_norm": 10.260396003723145, + "learning_rate": 1.910439247005134e-05, + "loss": 1.0317, + "step": 3355 + }, + { + "epoch": 0.574362485024816, + "grad_norm": 28.790538787841797, + "learning_rate": 1.9110096976611527e-05, + "loss": 2.8238, + "step": 3356 + }, + { + "epoch": 0.574533629984597, + "grad_norm": 25.868772506713867, + "learning_rate": 1.9115801483171707e-05, + "loss": 2.5919, + "step": 3357 + }, + { + "epoch": 0.5747047749443779, + "grad_norm": 25.83347511291504, + "learning_rate": 1.912150598973189e-05, + "loss": 3.1996, + "step": 3358 + }, + { + "epoch": 0.5748759199041589, + "grad_norm": 29.329633712768555, + "learning_rate": 1.9127210496292073e-05, + "loss": 3.1316, + "step": 3359 + }, + { + "epoch": 0.5750470648639397, + "grad_norm": 9.001529693603516, + "learning_rate": 1.9132915002852253e-05, + "loss": 1.2363, + "step": 3360 + }, + { + "epoch": 0.5752182098237207, + "grad_norm": 5.358071804046631, + "learning_rate": 1.9138619509412437e-05, + "loss": 0.3949, + "step": 3361 + }, + { + "epoch": 0.5753893547835016, + "grad_norm": 13.40963363647461, + "learning_rate": 1.914432401597262e-05, + "loss": 0.7941, + "step": 3362 + }, + { + "epoch": 0.5755604997432826, + "grad_norm": 37.820556640625, + "learning_rate": 1.9150028522532804e-05, + "loss": 6.717, + "step": 3363 + }, + { + "epoch": 0.5757316447030635, + "grad_norm": 181.16746520996094, + "learning_rate": 1.9155733029092984e-05, + "loss": 9.5897, + "step": 3364 + }, + { + "epoch": 0.5759027896628445, + "grad_norm": 29.568683624267578, + "learning_rate": 1.9161437535653167e-05, + "loss": 2.4104, + "step": 3365 + }, + { + "epoch": 0.5760739346226253, + "grad_norm": 10.582496643066406, + "learning_rate": 1.916714204221335e-05, + "loss": 0.5204, + "step": 3366 + }, + { + "epoch": 0.5762450795824063, + "grad_norm": 37.75896072387695, + "learning_rate": 1.917284654877353e-05, + "loss": 6.4064, + "step": 3367 + }, + { + "epoch": 0.5764162245421872, + 
"grad_norm": 23.44141960144043, + "learning_rate": 1.9178551055333714e-05, + "loss": 2.2333, + "step": 3368 + }, + { + "epoch": 0.5765873695019682, + "grad_norm": 23.17081642150879, + "learning_rate": 1.9184255561893897e-05, + "loss": 2.3916, + "step": 3369 + }, + { + "epoch": 0.5767585144617491, + "grad_norm": 22.356122970581055, + "learning_rate": 1.918996006845408e-05, + "loss": 2.259, + "step": 3370 + }, + { + "epoch": 0.5769296594215301, + "grad_norm": 25.988954544067383, + "learning_rate": 1.919566457501426e-05, + "loss": 2.6056, + "step": 3371 + }, + { + "epoch": 0.577100804381311, + "grad_norm": 17.81022071838379, + "learning_rate": 1.9201369081574444e-05, + "loss": 1.3797, + "step": 3372 + }, + { + "epoch": 0.5772719493410919, + "grad_norm": 28.269866943359375, + "learning_rate": 1.9207073588134627e-05, + "loss": 2.3896, + "step": 3373 + }, + { + "epoch": 0.5774430943008728, + "grad_norm": 24.576251983642578, + "learning_rate": 1.9212778094694807e-05, + "loss": 2.2518, + "step": 3374 + }, + { + "epoch": 0.5776142392606538, + "grad_norm": 5.2097649574279785, + "learning_rate": 1.921848260125499e-05, + "loss": 0.3953, + "step": 3375 + }, + { + "epoch": 0.5777853842204347, + "grad_norm": 3.1124250888824463, + "learning_rate": 1.9224187107815174e-05, + "loss": 0.3687, + "step": 3376 + }, + { + "epoch": 0.5779565291802157, + "grad_norm": 20.81354331970215, + "learning_rate": 1.9229891614375354e-05, + "loss": 2.0595, + "step": 3377 + }, + { + "epoch": 0.5781276741399966, + "grad_norm": 29.21316909790039, + "learning_rate": 1.9235596120935537e-05, + "loss": 3.7875, + "step": 3378 + }, + { + "epoch": 0.5782988190997775, + "grad_norm": 84.69393157958984, + "learning_rate": 1.9241300627495724e-05, + "loss": 3.0112, + "step": 3379 + }, + { + "epoch": 0.5784699640595584, + "grad_norm": 1.8985782861709595, + "learning_rate": 1.9247005134055907e-05, + "loss": 0.3148, + "step": 3380 + }, + { + "epoch": 0.5786411090193394, + "grad_norm": 10.058646202087402, + "learning_rate": 1.9252709640616087e-05, + "loss": 0.8459, + "step": 3381 + }, + { + "epoch": 0.5788122539791203, + "grad_norm": 27.1168270111084, + "learning_rate": 1.925841414717627e-05, + "loss": 2.5347, + "step": 3382 + }, + { + "epoch": 0.5789833989389013, + "grad_norm": 89.62450408935547, + "learning_rate": 1.9264118653736454e-05, + "loss": 3.7248, + "step": 3383 + }, + { + "epoch": 0.5791545438986822, + "grad_norm": 4.5566558837890625, + "learning_rate": 1.9269823160296634e-05, + "loss": 0.6092, + "step": 3384 + }, + { + "epoch": 0.5793256888584631, + "grad_norm": 30.642803192138672, + "learning_rate": 1.9275527666856818e-05, + "loss": 3.5006, + "step": 3385 + }, + { + "epoch": 0.579496833818244, + "grad_norm": 27.308584213256836, + "learning_rate": 1.9281232173417e-05, + "loss": 3.4485, + "step": 3386 + }, + { + "epoch": 0.579667978778025, + "grad_norm": 29.646587371826172, + "learning_rate": 1.9286936679977184e-05, + "loss": 3.0531, + "step": 3387 + }, + { + "epoch": 0.5798391237378059, + "grad_norm": 14.223383903503418, + "learning_rate": 1.9292641186537364e-05, + "loss": 1.5835, + "step": 3388 + }, + { + "epoch": 0.5800102686975869, + "grad_norm": 24.695066452026367, + "learning_rate": 1.9298345693097548e-05, + "loss": 2.368, + "step": 3389 + }, + { + "epoch": 0.5801814136573678, + "grad_norm": 26.341815948486328, + "learning_rate": 1.930405019965773e-05, + "loss": 2.3547, + "step": 3390 + }, + { + "epoch": 0.5803525586171487, + "grad_norm": 18.38678741455078, + "learning_rate": 1.930975470621791e-05, + "loss": 1.7797, + "step": 
3391 + }, + { + "epoch": 0.5805237035769296, + "grad_norm": 25.61127471923828, + "learning_rate": 1.9315459212778094e-05, + "loss": 2.638, + "step": 3392 + }, + { + "epoch": 0.5806948485367106, + "grad_norm": 2.290560007095337, + "learning_rate": 1.9321163719338278e-05, + "loss": 0.3367, + "step": 3393 + }, + { + "epoch": 0.5808659934964915, + "grad_norm": 11.412469863891602, + "learning_rate": 1.932686822589846e-05, + "loss": 1.1406, + "step": 3394 + }, + { + "epoch": 0.5810371384562725, + "grad_norm": 8.998905181884766, + "learning_rate": 1.933257273245864e-05, + "loss": 0.6117, + "step": 3395 + }, + { + "epoch": 0.5812082834160534, + "grad_norm": 7.52636194229126, + "learning_rate": 1.9338277239018825e-05, + "loss": 0.653, + "step": 3396 + }, + { + "epoch": 0.5813794283758343, + "grad_norm": 27.57058334350586, + "learning_rate": 1.9343981745579008e-05, + "loss": 3.0084, + "step": 3397 + }, + { + "epoch": 0.5815505733356152, + "grad_norm": 26.775415420532227, + "learning_rate": 1.9349686252139188e-05, + "loss": 2.9132, + "step": 3398 + }, + { + "epoch": 0.5817217182953962, + "grad_norm": 3.1826353073120117, + "learning_rate": 1.935539075869937e-05, + "loss": 0.3669, + "step": 3399 + }, + { + "epoch": 0.5818928632551771, + "grad_norm": 6.152859210968018, + "learning_rate": 1.9361095265259555e-05, + "loss": 0.6359, + "step": 3400 + }, + { + "epoch": 0.5820640082149581, + "grad_norm": 1.7208553552627563, + "learning_rate": 1.9366799771819738e-05, + "loss": 0.3746, + "step": 3401 + }, + { + "epoch": 0.582235153174739, + "grad_norm": 7.883406162261963, + "learning_rate": 1.937250427837992e-05, + "loss": 1.0997, + "step": 3402 + }, + { + "epoch": 0.5824062981345199, + "grad_norm": 26.301164627075195, + "learning_rate": 1.9378208784940105e-05, + "loss": 2.4291, + "step": 3403 + }, + { + "epoch": 0.5825774430943008, + "grad_norm": 23.660444259643555, + "learning_rate": 1.9383913291500288e-05, + "loss": 2.6292, + "step": 3404 + }, + { + "epoch": 0.5827485880540818, + "grad_norm": 17.410369873046875, + "learning_rate": 1.9389617798060468e-05, + "loss": 1.4877, + "step": 3405 + }, + { + "epoch": 0.5829197330138628, + "grad_norm": 31.716928482055664, + "learning_rate": 1.939532230462065e-05, + "loss": 3.778, + "step": 3406 + }, + { + "epoch": 0.5830908779736437, + "grad_norm": 39.23788833618164, + "learning_rate": 1.9401026811180835e-05, + "loss": 3.4125, + "step": 3407 + }, + { + "epoch": 0.5832620229334247, + "grad_norm": 21.296669006347656, + "learning_rate": 1.9406731317741015e-05, + "loss": 1.9439, + "step": 3408 + }, + { + "epoch": 0.5834331678932055, + "grad_norm": 4.249104022979736, + "learning_rate": 1.94124358243012e-05, + "loss": 0.5641, + "step": 3409 + }, + { + "epoch": 0.5836043128529865, + "grad_norm": 25.32843780517578, + "learning_rate": 1.9418140330861382e-05, + "loss": 2.1923, + "step": 3410 + }, + { + "epoch": 0.5837754578127674, + "grad_norm": 31.81114387512207, + "learning_rate": 1.9423844837421565e-05, + "loss": 6.2289, + "step": 3411 + }, + { + "epoch": 0.5839466027725484, + "grad_norm": 34.15937423706055, + "learning_rate": 1.9429549343981745e-05, + "loss": 7.0035, + "step": 3412 + }, + { + "epoch": 0.5841177477323293, + "grad_norm": 27.947298049926758, + "learning_rate": 1.943525385054193e-05, + "loss": 3.2937, + "step": 3413 + }, + { + "epoch": 0.5842888926921103, + "grad_norm": 13.201940536499023, + "learning_rate": 1.9440958357102112e-05, + "loss": 0.9029, + "step": 3414 + }, + { + "epoch": 0.5844600376518911, + "grad_norm": 21.287315368652344, + "learning_rate": 
1.9446662863662292e-05, + "loss": 2.1476, + "step": 3415 + }, + { + "epoch": 0.5846311826116721, + "grad_norm": 27.151569366455078, + "learning_rate": 1.9452367370222475e-05, + "loss": 2.8889, + "step": 3416 + }, + { + "epoch": 0.584802327571453, + "grad_norm": 25.92886734008789, + "learning_rate": 1.945807187678266e-05, + "loss": 2.43, + "step": 3417 + }, + { + "epoch": 0.584973472531234, + "grad_norm": 16.41077423095703, + "learning_rate": 1.9463776383342842e-05, + "loss": 1.1515, + "step": 3418 + }, + { + "epoch": 0.5851446174910149, + "grad_norm": 7.387080669403076, + "learning_rate": 1.9469480889903022e-05, + "loss": 1.0438, + "step": 3419 + }, + { + "epoch": 0.5853157624507959, + "grad_norm": 28.30823516845703, + "learning_rate": 1.9475185396463205e-05, + "loss": 2.4817, + "step": 3420 + }, + { + "epoch": 0.5854869074105767, + "grad_norm": 19.957653045654297, + "learning_rate": 1.948088990302339e-05, + "loss": 2.492, + "step": 3421 + }, + { + "epoch": 0.5856580523703577, + "grad_norm": 26.708097457885742, + "learning_rate": 1.948659440958357e-05, + "loss": 2.9457, + "step": 3422 + }, + { + "epoch": 0.5858291973301386, + "grad_norm": 6.408317565917969, + "learning_rate": 1.9492298916143752e-05, + "loss": 0.7295, + "step": 3423 + }, + { + "epoch": 0.5860003422899196, + "grad_norm": 30.148130416870117, + "learning_rate": 1.9498003422703935e-05, + "loss": 3.3614, + "step": 3424 + }, + { + "epoch": 0.5861714872497005, + "grad_norm": 22.77581787109375, + "learning_rate": 1.9503707929264122e-05, + "loss": 1.9865, + "step": 3425 + }, + { + "epoch": 0.5863426322094815, + "grad_norm": 27.753477096557617, + "learning_rate": 1.9509412435824302e-05, + "loss": 2.899, + "step": 3426 + }, + { + "epoch": 0.5865137771692623, + "grad_norm": 6.288846015930176, + "learning_rate": 1.9515116942384486e-05, + "loss": 0.646, + "step": 3427 + }, + { + "epoch": 0.5866849221290433, + "grad_norm": 24.92253303527832, + "learning_rate": 1.952082144894467e-05, + "loss": 2.5784, + "step": 3428 + }, + { + "epoch": 0.5868560670888242, + "grad_norm": 24.49477767944336, + "learning_rate": 1.952652595550485e-05, + "loss": 2.1704, + "step": 3429 + }, + { + "epoch": 0.5870272120486052, + "grad_norm": 24.100597381591797, + "learning_rate": 1.9532230462065032e-05, + "loss": 2.5277, + "step": 3430 + }, + { + "epoch": 0.5871983570083861, + "grad_norm": 21.0911922454834, + "learning_rate": 1.9537934968625216e-05, + "loss": 2.2242, + "step": 3431 + }, + { + "epoch": 0.5873695019681671, + "grad_norm": 22.534944534301758, + "learning_rate": 1.95436394751854e-05, + "loss": 2.3912, + "step": 3432 + }, + { + "epoch": 0.5875406469279479, + "grad_norm": 22.132417678833008, + "learning_rate": 1.954934398174558e-05, + "loss": 2.3557, + "step": 3433 + }, + { + "epoch": 0.5877117918877289, + "grad_norm": 21.22612953186035, + "learning_rate": 1.9555048488305762e-05, + "loss": 1.8585, + "step": 3434 + }, + { + "epoch": 0.5878829368475098, + "grad_norm": 87.90875244140625, + "learning_rate": 1.9560752994865946e-05, + "loss": 4.0792, + "step": 3435 + }, + { + "epoch": 0.5880540818072908, + "grad_norm": 89.19034576416016, + "learning_rate": 1.9566457501426126e-05, + "loss": 4.43, + "step": 3436 + }, + { + "epoch": 0.5882252267670717, + "grad_norm": 19.258451461791992, + "learning_rate": 1.957216200798631e-05, + "loss": 1.7874, + "step": 3437 + }, + { + "epoch": 0.5883963717268527, + "grad_norm": 1.7522574663162231, + "learning_rate": 1.9577866514546493e-05, + "loss": 0.3112, + "step": 3438 + }, + { + "epoch": 0.5885675166866335, + "grad_norm": 
18.229957580566406, + "learning_rate": 1.9583571021106676e-05, + "loss": 2.766, + "step": 3439 + }, + { + "epoch": 0.5887386616464145, + "grad_norm": 36.58788299560547, + "learning_rate": 1.9589275527666856e-05, + "loss": 5.2847, + "step": 3440 + }, + { + "epoch": 0.5889098066061954, + "grad_norm": 23.946247100830078, + "learning_rate": 1.959498003422704e-05, + "loss": 2.3658, + "step": 3441 + }, + { + "epoch": 0.5890809515659764, + "grad_norm": 29.713180541992188, + "learning_rate": 1.9600684540787223e-05, + "loss": 3.6696, + "step": 3442 + }, + { + "epoch": 0.5892520965257573, + "grad_norm": 22.247447967529297, + "learning_rate": 1.9606389047347403e-05, + "loss": 2.6781, + "step": 3443 + }, + { + "epoch": 0.5894232414855383, + "grad_norm": 23.726993560791016, + "learning_rate": 1.9612093553907586e-05, + "loss": 2.765, + "step": 3444 + }, + { + "epoch": 0.5895943864453191, + "grad_norm": 39.94513702392578, + "learning_rate": 1.961779806046777e-05, + "loss": 7.0047, + "step": 3445 + }, + { + "epoch": 0.5897655314051001, + "grad_norm": 24.248090744018555, + "learning_rate": 1.962350256702795e-05, + "loss": 2.7753, + "step": 3446 + }, + { + "epoch": 0.589936676364881, + "grad_norm": 3.6631691455841064, + "learning_rate": 1.9629207073588133e-05, + "loss": 0.3529, + "step": 3447 + }, + { + "epoch": 0.590107821324662, + "grad_norm": 25.42365264892578, + "learning_rate": 1.963491158014832e-05, + "loss": 2.1961, + "step": 3448 + }, + { + "epoch": 0.5902789662844429, + "grad_norm": 25.308515548706055, + "learning_rate": 1.9640616086708503e-05, + "loss": 2.3387, + "step": 3449 + }, + { + "epoch": 0.5904501112442239, + "grad_norm": 19.806636810302734, + "learning_rate": 1.9646320593268683e-05, + "loss": 2.0071, + "step": 3450 + }, + { + "epoch": 0.5906212562040047, + "grad_norm": 17.552900314331055, + "learning_rate": 1.9652025099828866e-05, + "loss": 1.4304, + "step": 3451 + }, + { + "epoch": 0.5907924011637857, + "grad_norm": 23.210519790649414, + "learning_rate": 1.965772960638905e-05, + "loss": 2.1445, + "step": 3452 + }, + { + "epoch": 0.5909635461235666, + "grad_norm": 25.595361709594727, + "learning_rate": 1.966343411294923e-05, + "loss": 3.0135, + "step": 3453 + }, + { + "epoch": 0.5911346910833476, + "grad_norm": 3.9893271923065186, + "learning_rate": 1.9669138619509413e-05, + "loss": 0.4081, + "step": 3454 + }, + { + "epoch": 0.5913058360431286, + "grad_norm": 2.2912561893463135, + "learning_rate": 1.9674843126069596e-05, + "loss": 0.3066, + "step": 3455 + }, + { + "epoch": 0.5914769810029095, + "grad_norm": 23.45972442626953, + "learning_rate": 1.968054763262978e-05, + "loss": 2.4938, + "step": 3456 + }, + { + "epoch": 0.5916481259626905, + "grad_norm": 24.78557777404785, + "learning_rate": 1.968625213918996e-05, + "loss": 2.2888, + "step": 3457 + }, + { + "epoch": 0.5918192709224713, + "grad_norm": 56.51396560668945, + "learning_rate": 1.9691956645750143e-05, + "loss": 3.3839, + "step": 3458 + }, + { + "epoch": 0.5919904158822523, + "grad_norm": 15.350875854492188, + "learning_rate": 1.9697661152310327e-05, + "loss": 1.0531, + "step": 3459 + }, + { + "epoch": 0.5921615608420332, + "grad_norm": 73.21929931640625, + "learning_rate": 1.9703365658870507e-05, + "loss": 3.5199, + "step": 3460 + }, + { + "epoch": 0.5923327058018142, + "grad_norm": 29.828990936279297, + "learning_rate": 1.970907016543069e-05, + "loss": 3.8054, + "step": 3461 + }, + { + "epoch": 0.5925038507615951, + "grad_norm": 18.3194637298584, + "learning_rate": 1.9714774671990873e-05, + "loss": 1.7246, + "step": 3462 + }, + 
{ + "epoch": 0.592674995721376, + "grad_norm": 29.311429977416992, + "learning_rate": 1.9720479178551057e-05, + "loss": 3.2428, + "step": 3463 + }, + { + "epoch": 0.5928461406811569, + "grad_norm": 1.9222893714904785, + "learning_rate": 1.9726183685111237e-05, + "loss": 0.3078, + "step": 3464 + }, + { + "epoch": 0.5930172856409379, + "grad_norm": 6.286295413970947, + "learning_rate": 1.973188819167142e-05, + "loss": 0.6446, + "step": 3465 + }, + { + "epoch": 0.5931884306007188, + "grad_norm": 29.647480010986328, + "learning_rate": 1.9737592698231603e-05, + "loss": 2.9621, + "step": 3466 + }, + { + "epoch": 0.5933595755604998, + "grad_norm": 26.92269515991211, + "learning_rate": 1.9743297204791783e-05, + "loss": 2.5933, + "step": 3467 + }, + { + "epoch": 0.5935307205202807, + "grad_norm": 50.6396484375, + "learning_rate": 1.9749001711351967e-05, + "loss": 6.8098, + "step": 3468 + }, + { + "epoch": 0.5937018654800617, + "grad_norm": 25.224733352661133, + "learning_rate": 1.975470621791215e-05, + "loss": 2.4431, + "step": 3469 + }, + { + "epoch": 0.5938730104398425, + "grad_norm": 17.845563888549805, + "learning_rate": 1.9760410724472334e-05, + "loss": 1.7165, + "step": 3470 + }, + { + "epoch": 0.5940441553996235, + "grad_norm": 5.634066104888916, + "learning_rate": 1.9766115231032517e-05, + "loss": 0.5882, + "step": 3471 + }, + { + "epoch": 0.5942153003594044, + "grad_norm": 34.622920989990234, + "learning_rate": 1.97718197375927e-05, + "loss": 3.5714, + "step": 3472 + }, + { + "epoch": 0.5943864453191854, + "grad_norm": 63.40961837768555, + "learning_rate": 1.9777524244152884e-05, + "loss": 2.734, + "step": 3473 + }, + { + "epoch": 0.5945575902789663, + "grad_norm": 29.88731575012207, + "learning_rate": 1.9783228750713064e-05, + "loss": 3.8436, + "step": 3474 + }, + { + "epoch": 0.5947287352387473, + "grad_norm": 27.8708553314209, + "learning_rate": 1.9788933257273247e-05, + "loss": 2.3388, + "step": 3475 + }, + { + "epoch": 0.5948998801985281, + "grad_norm": 25.777362823486328, + "learning_rate": 1.979463776383343e-05, + "loss": 2.3517, + "step": 3476 + }, + { + "epoch": 0.5950710251583091, + "grad_norm": 14.805953979492188, + "learning_rate": 1.980034227039361e-05, + "loss": 1.5038, + "step": 3477 + }, + { + "epoch": 0.59524217011809, + "grad_norm": 19.073440551757812, + "learning_rate": 1.9806046776953794e-05, + "loss": 1.8955, + "step": 3478 + }, + { + "epoch": 0.595413315077871, + "grad_norm": 21.738014221191406, + "learning_rate": 1.9811751283513977e-05, + "loss": 2.3869, + "step": 3479 + }, + { + "epoch": 0.5955844600376519, + "grad_norm": 2.9714324474334717, + "learning_rate": 1.981745579007416e-05, + "loss": 0.3378, + "step": 3480 + }, + { + "epoch": 0.5957556049974329, + "grad_norm": 8.826178550720215, + "learning_rate": 1.982316029663434e-05, + "loss": 0.9817, + "step": 3481 + }, + { + "epoch": 0.5959267499572137, + "grad_norm": 16.54644012451172, + "learning_rate": 1.9828864803194524e-05, + "loss": 1.2827, + "step": 3482 + }, + { + "epoch": 0.5960978949169947, + "grad_norm": 9.384221076965332, + "learning_rate": 1.9834569309754707e-05, + "loss": 1.3316, + "step": 3483 + }, + { + "epoch": 0.5962690398767756, + "grad_norm": 25.255199432373047, + "learning_rate": 1.9840273816314887e-05, + "loss": 2.1236, + "step": 3484 + }, + { + "epoch": 0.5964401848365566, + "grad_norm": 27.23832893371582, + "learning_rate": 1.984597832287507e-05, + "loss": 2.8921, + "step": 3485 + }, + { + "epoch": 0.5966113297963375, + "grad_norm": 31.743816375732422, + "learning_rate": 1.9851682829435254e-05, 
+ "loss": 4.1041, + "step": 3486 + }, + { + "epoch": 0.5967824747561185, + "grad_norm": 23.10817527770996, + "learning_rate": 1.9857387335995437e-05, + "loss": 1.973, + "step": 3487 + }, + { + "epoch": 0.5969536197158993, + "grad_norm": 40.163639068603516, + "learning_rate": 1.9863091842555617e-05, + "loss": 6.3457, + "step": 3488 + }, + { + "epoch": 0.5971247646756803, + "grad_norm": 29.302976608276367, + "learning_rate": 1.98687963491158e-05, + "loss": 2.8273, + "step": 3489 + }, + { + "epoch": 0.5972959096354612, + "grad_norm": 29.635021209716797, + "learning_rate": 1.9874500855675984e-05, + "loss": 3.671, + "step": 3490 + }, + { + "epoch": 0.5974670545952422, + "grad_norm": 21.227108001708984, + "learning_rate": 1.9880205362236164e-05, + "loss": 2.1672, + "step": 3491 + }, + { + "epoch": 0.5976381995550231, + "grad_norm": 30.448522567749023, + "learning_rate": 1.9885909868796348e-05, + "loss": 3.0936, + "step": 3492 + }, + { + "epoch": 0.597809344514804, + "grad_norm": 27.133663177490234, + "learning_rate": 1.9891614375356534e-05, + "loss": 2.4887, + "step": 3493 + }, + { + "epoch": 0.5979804894745849, + "grad_norm": 39.466121673583984, + "learning_rate": 1.9897318881916718e-05, + "loss": 4.8888, + "step": 3494 + }, + { + "epoch": 0.5981516344343659, + "grad_norm": 39.85908889770508, + "learning_rate": 1.9903023388476898e-05, + "loss": 6.6469, + "step": 3495 + }, + { + "epoch": 0.5983227793941468, + "grad_norm": 19.293907165527344, + "learning_rate": 1.990872789503708e-05, + "loss": 2.0085, + "step": 3496 + }, + { + "epoch": 0.5984939243539278, + "grad_norm": 30.540531158447266, + "learning_rate": 1.9914432401597265e-05, + "loss": 2.8524, + "step": 3497 + }, + { + "epoch": 0.5986650693137087, + "grad_norm": 2.173297882080078, + "learning_rate": 1.9920136908157444e-05, + "loss": 0.4671, + "step": 3498 + }, + { + "epoch": 0.5988362142734897, + "grad_norm": 23.616220474243164, + "learning_rate": 1.9925841414717628e-05, + "loss": 2.1941, + "step": 3499 + }, + { + "epoch": 0.5990073592332705, + "grad_norm": 10.88476276397705, + "learning_rate": 1.993154592127781e-05, + "loss": 0.9849, + "step": 3500 + }, + { + "epoch": 0.5991785041930515, + "grad_norm": 35.73077392578125, + "learning_rate": 1.9937250427837995e-05, + "loss": 3.3526, + "step": 3501 + }, + { + "epoch": 0.5993496491528324, + "grad_norm": 16.617977142333984, + "learning_rate": 1.9942954934398175e-05, + "loss": 1.305, + "step": 3502 + }, + { + "epoch": 0.5995207941126134, + "grad_norm": 18.637554168701172, + "learning_rate": 1.9948659440958358e-05, + "loss": 1.7833, + "step": 3503 + }, + { + "epoch": 0.5996919390723943, + "grad_norm": 22.126482009887695, + "learning_rate": 1.995436394751854e-05, + "loss": 1.8701, + "step": 3504 + }, + { + "epoch": 0.5998630840321753, + "grad_norm": 19.62862777709961, + "learning_rate": 1.996006845407872e-05, + "loss": 1.9236, + "step": 3505 + }, + { + "epoch": 0.6000342289919562, + "grad_norm": 27.936777114868164, + "learning_rate": 1.9965772960638905e-05, + "loss": 2.4178, + "step": 3506 + }, + { + "epoch": 0.6002053739517371, + "grad_norm": 19.932191848754883, + "learning_rate": 1.9971477467199088e-05, + "loss": 2.1511, + "step": 3507 + }, + { + "epoch": 0.6003765189115181, + "grad_norm": 25.053146362304688, + "learning_rate": 1.9977181973759268e-05, + "loss": 2.4803, + "step": 3508 + }, + { + "epoch": 0.6003765189115181, + "eval_nli-pairs_loss": 2.4920291900634766, + "eval_nli-pairs_runtime": 4.6698, + "eval_nli-pairs_samples_per_second": 42.828, + "eval_nli-pairs_steps_per_second": 1.499, + 
"eval_sts-test_pearson_cosine": 0.7445126100709293, + "eval_sts-test_pearson_dot": 0.6267026529286148, + "eval_sts-test_pearson_euclidean": 0.7432252885023554, + "eval_sts-test_pearson_manhattan": 0.7498148030136934, + "eval_sts-test_pearson_max": 0.7498148030136934, + "eval_sts-test_spearman_cosine": 0.7257459075346154, + "eval_sts-test_spearman_dot": 0.6080996929747863, + "eval_sts-test_spearman_euclidean": 0.7251182727779897, + "eval_sts-test_spearman_manhattan": 0.7328124096687271, + "eval_sts-test_spearman_max": 0.7328124096687271, + "step": 3508 + }, + { + "epoch": 0.6003765189115181, + "eval_vitaminc-pairs_loss": 1.5536390542984009, + "eval_vitaminc-pairs_runtime": 2.8901, + "eval_vitaminc-pairs_samples_per_second": 69.202, + "eval_vitaminc-pairs_steps_per_second": 2.422, + "step": 3508 + }, + { + "epoch": 0.6003765189115181, + "eval_qnli-contrastive_loss": 3.72904109954834, + "eval_qnli-contrastive_runtime": 0.7044, + "eval_qnli-contrastive_samples_per_second": 283.946, + "eval_qnli-contrastive_steps_per_second": 9.938, + "step": 3508 + }, + { + "epoch": 0.6003765189115181, + "eval_scitail-pairs-qa_loss": 0.28478389978408813, + "eval_scitail-pairs-qa_runtime": 1.9184, + "eval_scitail-pairs-qa_samples_per_second": 104.251, + "eval_scitail-pairs-qa_steps_per_second": 3.649, + "step": 3508 + }, + { + "epoch": 0.6003765189115181, + "eval_scitail-pairs-pos_loss": 1.0560411214828491, + "eval_scitail-pairs-pos_runtime": 2.9426, + "eval_scitail-pairs-pos_samples_per_second": 67.966, + "eval_scitail-pairs-pos_steps_per_second": 2.379, + "step": 3508 + }, + { + "epoch": 0.6003765189115181, + "eval_xsum-pairs_loss": 1.246793508529663, + "eval_xsum-pairs_runtime": 2.6747, + "eval_xsum-pairs_samples_per_second": 65.429, + "eval_xsum-pairs_steps_per_second": 2.243, + "step": 3508 + }, + { + "epoch": 0.6003765189115181, + "eval_compression-pairs_loss": 0.5663356184959412, + "eval_compression-pairs_runtime": 0.5441, + "eval_compression-pairs_samples_per_second": 367.559, + "eval_compression-pairs_steps_per_second": 12.865, + "step": 3508 + }, + { + "epoch": 0.6003765189115181, + "eval_sciq_pairs_loss": 5.566298484802246, + "eval_sciq_pairs_runtime": 9.5047, + "eval_sciq_pairs_samples_per_second": 21.042, + "eval_sciq_pairs_steps_per_second": 0.736, + "step": 3508 + }, + { + "epoch": 0.6003765189115181, + "eval_qasc_pairs_loss": 6.534984588623047, + "eval_qasc_pairs_runtime": 2.8892, + "eval_qasc_pairs_samples_per_second": 69.224, + "eval_qasc_pairs_steps_per_second": 2.423, + "step": 3508 + }, + { + "epoch": 0.6003765189115181, + "eval_openbookqa_pairs_loss": 3.5413291454315186, + "eval_openbookqa_pairs_runtime": 0.7338, + "eval_openbookqa_pairs_samples_per_second": 94.027, + "eval_openbookqa_pairs_steps_per_second": 4.088, + "step": 3508 + }, + { + "epoch": 0.6003765189115181, + "eval_msmarco_pairs_loss": 2.2276792526245117, + "eval_msmarco_pairs_runtime": 4.1013, + "eval_msmarco_pairs_samples_per_second": 48.765, + "eval_msmarco_pairs_steps_per_second": 1.707, + "step": 3508 + }, + { + "epoch": 0.6003765189115181, + "eval_nq_pairs_loss": 2.868544340133667, + "eval_nq_pairs_runtime": 8.7773, + "eval_nq_pairs_samples_per_second": 22.786, + "eval_nq_pairs_steps_per_second": 0.798, + "step": 3508 + }, + { + "epoch": 0.6003765189115181, + "eval_trivia_pairs_loss": 2.8433399200439453, + "eval_trivia_pairs_runtime": 12.7884, + "eval_trivia_pairs_samples_per_second": 15.639, + "eval_trivia_pairs_steps_per_second": 0.547, + "step": 3508 + }, + { + "epoch": 0.6003765189115181, + "eval_quora_pairs_loss": 
0.5191998481750488, + "eval_quora_pairs_runtime": 1.6069, + "eval_quora_pairs_samples_per_second": 124.459, + "eval_quora_pairs_steps_per_second": 4.356, + "step": 3508 + }, + { + "epoch": 0.6003765189115181, + "eval_gooaq_pairs_loss": 1.7708619832992554, + "eval_gooaq_pairs_runtime": 2.6531, + "eval_gooaq_pairs_samples_per_second": 75.384, + "eval_gooaq_pairs_steps_per_second": 2.638, + "step": 3508 + }, + { + "epoch": 0.600547663871299, + "grad_norm": 25.026371002197266, + "learning_rate": 1.998288648031945e-05, + "loss": 2.5669, + "step": 3509 + }, + { + "epoch": 0.60071880883108, + "grad_norm": 27.141857147216797, + "learning_rate": 1.9988590986879635e-05, + "loss": 3.5018, + "step": 3510 + }, + { + "epoch": 0.6008899537908609, + "grad_norm": 14.711791038513184, + "learning_rate": 1.9994295493439818e-05, + "loss": 1.5597, + "step": 3511 + }, + { + "epoch": 0.6010610987506418, + "grad_norm": 21.489046096801758, + "learning_rate": 1.9999999999999998e-05, + "loss": 1.9611, + "step": 3512 + }, + { + "epoch": 0.6012322437104227, + "grad_norm": 25.058837890625, + "learning_rate": 2.000570450656018e-05, + "loss": 2.6108, + "step": 3513 + }, + { + "epoch": 0.6014033886702037, + "grad_norm": 35.174922943115234, + "learning_rate": 2.0011409013120365e-05, + "loss": 4.3163, + "step": 3514 + }, + { + "epoch": 0.6015745336299846, + "grad_norm": 24.40542984008789, + "learning_rate": 2.0017113519680545e-05, + "loss": 2.2731, + "step": 3515 + }, + { + "epoch": 0.6017456785897656, + "grad_norm": 25.144128799438477, + "learning_rate": 2.0022818026240732e-05, + "loss": 2.9364, + "step": 3516 + }, + { + "epoch": 0.6019168235495465, + "grad_norm": 24.48015594482422, + "learning_rate": 2.0028522532800915e-05, + "loss": 2.9065, + "step": 3517 + }, + { + "epoch": 0.6020879685093274, + "grad_norm": 3.758195161819458, + "learning_rate": 2.00342270393611e-05, + "loss": 0.3902, + "step": 3518 + }, + { + "epoch": 0.6022591134691083, + "grad_norm": 24.701047897338867, + "learning_rate": 2.003993154592128e-05, + "loss": 2.7503, + "step": 3519 + }, + { + "epoch": 0.6024302584288893, + "grad_norm": 35.492271423339844, + "learning_rate": 2.0045636052481462e-05, + "loss": 3.48, + "step": 3520 + }, + { + "epoch": 0.6026014033886702, + "grad_norm": 23.064668655395508, + "learning_rate": 2.0051340559041645e-05, + "loss": 2.0561, + "step": 3521 + }, + { + "epoch": 0.6027725483484512, + "grad_norm": 20.811628341674805, + "learning_rate": 2.0057045065601825e-05, + "loss": 2.1345, + "step": 3522 + }, + { + "epoch": 0.602943693308232, + "grad_norm": 29.092313766479492, + "learning_rate": 2.006274957216201e-05, + "loss": 3.3692, + "step": 3523 + }, + { + "epoch": 0.603114838268013, + "grad_norm": 4.536581993103027, + "learning_rate": 2.0068454078722192e-05, + "loss": 0.3497, + "step": 3524 + }, + { + "epoch": 0.6032859832277939, + "grad_norm": 30.311725616455078, + "learning_rate": 2.0074158585282375e-05, + "loss": 3.1903, + "step": 3525 + }, + { + "epoch": 0.6034571281875749, + "grad_norm": 22.41107749938965, + "learning_rate": 2.0079863091842555e-05, + "loss": 1.8769, + "step": 3526 + }, + { + "epoch": 0.6036282731473558, + "grad_norm": 110.15360260009766, + "learning_rate": 2.008556759840274e-05, + "loss": 8.2043, + "step": 3527 + }, + { + "epoch": 0.6037994181071368, + "grad_norm": 22.728544235229492, + "learning_rate": 2.0091272104962922e-05, + "loss": 2.2539, + "step": 3528 + }, + { + "epoch": 0.6039705630669177, + "grad_norm": 20.25188446044922, + "learning_rate": 2.0096976611523102e-05, + "loss": 2.1306, + "step": 3529 + 
}, + { + "epoch": 0.6041417080266986, + "grad_norm": 25.181900024414062, + "learning_rate": 2.0102681118083286e-05, + "loss": 2.7009, + "step": 3530 + }, + { + "epoch": 0.6043128529864795, + "grad_norm": 20.236555099487305, + "learning_rate": 2.010838562464347e-05, + "loss": 2.0943, + "step": 3531 + }, + { + "epoch": 0.6044839979462605, + "grad_norm": 28.511568069458008, + "learning_rate": 2.0114090131203652e-05, + "loss": 3.4193, + "step": 3532 + }, + { + "epoch": 0.6046551429060414, + "grad_norm": 20.58401870727539, + "learning_rate": 2.0119794637763832e-05, + "loss": 1.8277, + "step": 3533 + }, + { + "epoch": 0.6048262878658224, + "grad_norm": 28.2340087890625, + "learning_rate": 2.0125499144324016e-05, + "loss": 3.125, + "step": 3534 + }, + { + "epoch": 0.6049974328256033, + "grad_norm": 29.512651443481445, + "learning_rate": 2.01312036508842e-05, + "loss": 2.8504, + "step": 3535 + }, + { + "epoch": 0.6051685777853842, + "grad_norm": 20.814037322998047, + "learning_rate": 2.013690815744438e-05, + "loss": 1.988, + "step": 3536 + }, + { + "epoch": 0.6053397227451651, + "grad_norm": 31.458534240722656, + "learning_rate": 2.0142612664004562e-05, + "loss": 3.4279, + "step": 3537 + }, + { + "epoch": 0.6055108677049461, + "grad_norm": 6.904580116271973, + "learning_rate": 2.0148317170564746e-05, + "loss": 0.596, + "step": 3538 + }, + { + "epoch": 0.605682012664727, + "grad_norm": 4.964241981506348, + "learning_rate": 2.015402167712493e-05, + "loss": 0.5453, + "step": 3539 + }, + { + "epoch": 0.605853157624508, + "grad_norm": 13.879029273986816, + "learning_rate": 2.0159726183685113e-05, + "loss": 0.8886, + "step": 3540 + }, + { + "epoch": 0.6060243025842889, + "grad_norm": 5.926167964935303, + "learning_rate": 2.0165430690245296e-05, + "loss": 0.4682, + "step": 3541 + }, + { + "epoch": 0.6061954475440698, + "grad_norm": 37.99885177612305, + "learning_rate": 2.017113519680548e-05, + "loss": 2.6702, + "step": 3542 + }, + { + "epoch": 0.6063665925038507, + "grad_norm": 1.8707212209701538, + "learning_rate": 2.017683970336566e-05, + "loss": 0.3064, + "step": 3543 + }, + { + "epoch": 0.6065377374636317, + "grad_norm": 19.801597595214844, + "learning_rate": 2.0182544209925843e-05, + "loss": 1.9881, + "step": 3544 + }, + { + "epoch": 0.6067088824234126, + "grad_norm": 6.91411828994751, + "learning_rate": 2.0188248716486026e-05, + "loss": 0.6814, + "step": 3545 + }, + { + "epoch": 0.6068800273831936, + "grad_norm": 24.066730499267578, + "learning_rate": 2.0193953223046206e-05, + "loss": 2.9773, + "step": 3546 + }, + { + "epoch": 0.6070511723429745, + "grad_norm": 26.214096069335938, + "learning_rate": 2.019965772960639e-05, + "loss": 2.5413, + "step": 3547 + }, + { + "epoch": 0.6072223173027554, + "grad_norm": 26.099639892578125, + "learning_rate": 2.0205362236166573e-05, + "loss": 3.1048, + "step": 3548 + }, + { + "epoch": 0.6073934622625363, + "grad_norm": 5.310629844665527, + "learning_rate": 2.0211066742726756e-05, + "loss": 0.4841, + "step": 3549 + }, + { + "epoch": 0.6075646072223173, + "grad_norm": 31.81784439086914, + "learning_rate": 2.0216771249286936e-05, + "loss": 2.6404, + "step": 3550 + }, + { + "epoch": 0.6077357521820982, + "grad_norm": 20.07958221435547, + "learning_rate": 2.022247575584712e-05, + "loss": 2.0417, + "step": 3551 + }, + { + "epoch": 0.6079068971418792, + "grad_norm": 23.589279174804688, + "learning_rate": 2.0228180262407303e-05, + "loss": 1.8377, + "step": 3552 + }, + { + "epoch": 0.60807804210166, + "grad_norm": 7.810224533081055, + "learning_rate": 
2.0233884768967483e-05, + "loss": 0.6155, + "step": 3553 + }, + { + "epoch": 0.608249187061441, + "grad_norm": 25.48611068725586, + "learning_rate": 2.0239589275527666e-05, + "loss": 2.6452, + "step": 3554 + }, + { + "epoch": 0.6084203320212219, + "grad_norm": 28.041179656982422, + "learning_rate": 2.024529378208785e-05, + "loss": 2.7049, + "step": 3555 + }, + { + "epoch": 0.6085914769810029, + "grad_norm": 29.171598434448242, + "learning_rate": 2.0250998288648033e-05, + "loss": 2.9279, + "step": 3556 + }, + { + "epoch": 0.6087626219407839, + "grad_norm": 31.75198745727539, + "learning_rate": 2.0256702795208213e-05, + "loss": 3.5333, + "step": 3557 + }, + { + "epoch": 0.6089337669005648, + "grad_norm": 27.840137481689453, + "learning_rate": 2.0262407301768396e-05, + "loss": 2.582, + "step": 3558 + }, + { + "epoch": 0.6091049118603458, + "grad_norm": 30.64188575744629, + "learning_rate": 2.026811180832858e-05, + "loss": 3.4939, + "step": 3559 + }, + { + "epoch": 0.6092760568201266, + "grad_norm": 5.60610294342041, + "learning_rate": 2.027381631488876e-05, + "loss": 0.5944, + "step": 3560 + }, + { + "epoch": 0.6094472017799076, + "grad_norm": 6.2669148445129395, + "learning_rate": 2.0279520821448943e-05, + "loss": 0.6124, + "step": 3561 + }, + { + "epoch": 0.6096183467396885, + "grad_norm": 24.084915161132812, + "learning_rate": 2.028522532800913e-05, + "loss": 2.3808, + "step": 3562 + }, + { + "epoch": 0.6097894916994695, + "grad_norm": 25.230403900146484, + "learning_rate": 2.0290929834569313e-05, + "loss": 2.2763, + "step": 3563 + }, + { + "epoch": 0.6099606366592504, + "grad_norm": 3.4166452884674072, + "learning_rate": 2.0296634341129493e-05, + "loss": 0.3625, + "step": 3564 + }, + { + "epoch": 0.6101317816190314, + "grad_norm": 43.064022064208984, + "learning_rate": 2.0302338847689677e-05, + "loss": 7.0079, + "step": 3565 + }, + { + "epoch": 0.6103029265788122, + "grad_norm": 126.53868103027344, + "learning_rate": 2.030804335424986e-05, + "loss": 8.3917, + "step": 3566 + }, + { + "epoch": 0.6104740715385932, + "grad_norm": 131.35928344726562, + "learning_rate": 2.031374786081004e-05, + "loss": 3.4001, + "step": 3567 + }, + { + "epoch": 0.6106452164983741, + "grad_norm": 25.176708221435547, + "learning_rate": 2.0319452367370223e-05, + "loss": 2.6828, + "step": 3568 + }, + { + "epoch": 0.6108163614581551, + "grad_norm": 10.270312309265137, + "learning_rate": 2.0325156873930407e-05, + "loss": 0.6769, + "step": 3569 + }, + { + "epoch": 0.610987506417936, + "grad_norm": 134.7881317138672, + "learning_rate": 2.033086138049059e-05, + "loss": 9.4478, + "step": 3570 + }, + { + "epoch": 0.611158651377717, + "grad_norm": 29.06338119506836, + "learning_rate": 2.033656588705077e-05, + "loss": 3.239, + "step": 3571 + }, + { + "epoch": 0.6113297963374978, + "grad_norm": 28.089710235595703, + "learning_rate": 2.0342270393610954e-05, + "loss": 3.3457, + "step": 3572 + }, + { + "epoch": 0.6115009412972788, + "grad_norm": 30.67607879638672, + "learning_rate": 2.0347974900171137e-05, + "loss": 3.2694, + "step": 3573 + }, + { + "epoch": 0.6116720862570597, + "grad_norm": 22.92820167541504, + "learning_rate": 2.0353679406731317e-05, + "loss": 2.2235, + "step": 3574 + }, + { + "epoch": 0.6118432312168407, + "grad_norm": 34.413116455078125, + "learning_rate": 2.03593839132915e-05, + "loss": 4.2986, + "step": 3575 + }, + { + "epoch": 0.6120143761766216, + "grad_norm": 31.22587013244629, + "learning_rate": 2.0365088419851684e-05, + "loss": 3.0481, + "step": 3576 + }, + { + "epoch": 0.6121855211364026, + 
"grad_norm": 25.429521560668945, + "learning_rate": 2.0370792926411864e-05, + "loss": 2.229, + "step": 3577 + }, + { + "epoch": 0.6123566660961834, + "grad_norm": 11.0814208984375, + "learning_rate": 2.0376497432972047e-05, + "loss": 0.8888, + "step": 3578 + }, + { + "epoch": 0.6125278110559644, + "grad_norm": 7.115586757659912, + "learning_rate": 2.038220193953223e-05, + "loss": 0.7247, + "step": 3579 + }, + { + "epoch": 0.6126989560157453, + "grad_norm": 23.84605598449707, + "learning_rate": 2.0387906446092414e-05, + "loss": 2.5407, + "step": 3580 + }, + { + "epoch": 0.6128701009755263, + "grad_norm": 29.189983367919922, + "learning_rate": 2.0393610952652594e-05, + "loss": 2.9275, + "step": 3581 + }, + { + "epoch": 0.6130412459353072, + "grad_norm": 13.353384971618652, + "learning_rate": 2.0399315459212777e-05, + "loss": 1.2951, + "step": 3582 + }, + { + "epoch": 0.6132123908950882, + "grad_norm": 19.755294799804688, + "learning_rate": 2.040501996577296e-05, + "loss": 1.69, + "step": 3583 + }, + { + "epoch": 0.613383535854869, + "grad_norm": 42.32930374145508, + "learning_rate": 2.041072447233314e-05, + "loss": 6.7412, + "step": 3584 + }, + { + "epoch": 0.61355468081465, + "grad_norm": 28.144733428955078, + "learning_rate": 2.0416428978893327e-05, + "loss": 3.0857, + "step": 3585 + }, + { + "epoch": 0.6137258257744309, + "grad_norm": 9.064329147338867, + "learning_rate": 2.042213348545351e-05, + "loss": 0.6073, + "step": 3586 + }, + { + "epoch": 0.6138969707342119, + "grad_norm": 13.769346237182617, + "learning_rate": 2.0427837992013694e-05, + "loss": 0.9597, + "step": 3587 + }, + { + "epoch": 0.6140681156939928, + "grad_norm": 26.750154495239258, + "learning_rate": 2.0433542498573874e-05, + "loss": 2.5717, + "step": 3588 + }, + { + "epoch": 0.6142392606537738, + "grad_norm": 21.833545684814453, + "learning_rate": 2.0439247005134057e-05, + "loss": 2.2722, + "step": 3589 + }, + { + "epoch": 0.6144104056135546, + "grad_norm": 20.934206008911133, + "learning_rate": 2.044495151169424e-05, + "loss": 1.8977, + "step": 3590 + }, + { + "epoch": 0.6145815505733356, + "grad_norm": 20.740619659423828, + "learning_rate": 2.045065601825442e-05, + "loss": 2.1618, + "step": 3591 + }, + { + "epoch": 0.6147526955331165, + "grad_norm": 18.453035354614258, + "learning_rate": 2.0456360524814604e-05, + "loss": 1.5326, + "step": 3592 + }, + { + "epoch": 0.6149238404928975, + "grad_norm": 21.29932403564453, + "learning_rate": 2.0462065031374788e-05, + "loss": 1.8663, + "step": 3593 + }, + { + "epoch": 0.6150949854526784, + "grad_norm": 43.378116607666016, + "learning_rate": 2.046776953793497e-05, + "loss": 2.6436, + "step": 3594 + }, + { + "epoch": 0.6152661304124594, + "grad_norm": 30.3048038482666, + "learning_rate": 2.047347404449515e-05, + "loss": 2.4363, + "step": 3595 + }, + { + "epoch": 0.6154372753722402, + "grad_norm": 7.601318836212158, + "learning_rate": 2.0479178551055334e-05, + "loss": 0.5412, + "step": 3596 + }, + { + "epoch": 0.6156084203320212, + "grad_norm": 22.385950088500977, + "learning_rate": 2.0484883057615518e-05, + "loss": 2.1345, + "step": 3597 + }, + { + "epoch": 0.6157795652918021, + "grad_norm": 21.425384521484375, + "learning_rate": 2.0490587564175698e-05, + "loss": 2.3012, + "step": 3598 + }, + { + "epoch": 0.6159507102515831, + "grad_norm": 35.983375549316406, + "learning_rate": 2.049629207073588e-05, + "loss": 3.4264, + "step": 3599 + }, + { + "epoch": 0.616121855211364, + "grad_norm": 21.63048553466797, + "learning_rate": 2.0501996577296064e-05, + "loss": 1.8898, + "step": 3600 
+ }, + { + "epoch": 0.616293000171145, + "grad_norm": 22.900203704833984, + "learning_rate": 2.0507701083856248e-05, + "loss": 2.0752, + "step": 3601 + }, + { + "epoch": 0.6164641451309258, + "grad_norm": 36.20056915283203, + "learning_rate": 2.0513405590416428e-05, + "loss": 3.6793, + "step": 3602 + }, + { + "epoch": 0.6166352900907068, + "grad_norm": 65.48631286621094, + "learning_rate": 2.051911009697661e-05, + "loss": 3.0171, + "step": 3603 + }, + { + "epoch": 0.6168064350504877, + "grad_norm": 28.85053062438965, + "learning_rate": 2.0524814603536795e-05, + "loss": 2.9372, + "step": 3604 + }, + { + "epoch": 0.6169775800102687, + "grad_norm": 22.775283813476562, + "learning_rate": 2.0530519110096975e-05, + "loss": 2.0222, + "step": 3605 + }, + { + "epoch": 0.6171487249700496, + "grad_norm": 20.72616195678711, + "learning_rate": 2.0536223616657158e-05, + "loss": 1.9483, + "step": 3606 + }, + { + "epoch": 0.6173198699298306, + "grad_norm": 82.96855926513672, + "learning_rate": 2.054192812321734e-05, + "loss": 3.4477, + "step": 3607 + }, + { + "epoch": 0.6174910148896116, + "grad_norm": 31.961795806884766, + "learning_rate": 2.0547632629777525e-05, + "loss": 3.855, + "step": 3608 + }, + { + "epoch": 0.6176621598493924, + "grad_norm": 8.887676239013672, + "learning_rate": 2.0553337136337708e-05, + "loss": 0.9918, + "step": 3609 + }, + { + "epoch": 0.6178333048091734, + "grad_norm": 28.133888244628906, + "learning_rate": 2.055904164289789e-05, + "loss": 3.5373, + "step": 3610 + }, + { + "epoch": 0.6180044497689543, + "grad_norm": 15.302783966064453, + "learning_rate": 2.0564746149458075e-05, + "loss": 1.4358, + "step": 3611 + }, + { + "epoch": 0.6181755947287353, + "grad_norm": 1.7312853336334229, + "learning_rate": 2.0570450656018255e-05, + "loss": 0.3083, + "step": 3612 + }, + { + "epoch": 0.6183467396885162, + "grad_norm": 57.10562515258789, + "learning_rate": 2.0576155162578438e-05, + "loss": 2.6412, + "step": 3613 + }, + { + "epoch": 0.6185178846482972, + "grad_norm": 23.632875442504883, + "learning_rate": 2.058185966913862e-05, + "loss": 2.165, + "step": 3614 + }, + { + "epoch": 0.618689029608078, + "grad_norm": 12.391186714172363, + "learning_rate": 2.05875641756988e-05, + "loss": 0.8518, + "step": 3615 + }, + { + "epoch": 0.618860174567859, + "grad_norm": 33.55079650878906, + "learning_rate": 2.0593268682258985e-05, + "loss": 6.3741, + "step": 3616 + }, + { + "epoch": 0.6190313195276399, + "grad_norm": 30.267724990844727, + "learning_rate": 2.0598973188819168e-05, + "loss": 3.5029, + "step": 3617 + }, + { + "epoch": 0.6192024644874209, + "grad_norm": 23.680438995361328, + "learning_rate": 2.060467769537935e-05, + "loss": 2.4026, + "step": 3618 + }, + { + "epoch": 0.6193736094472018, + "grad_norm": 24.904333114624023, + "learning_rate": 2.061038220193953e-05, + "loss": 2.6435, + "step": 3619 + }, + { + "epoch": 0.6195447544069828, + "grad_norm": 16.03217124938965, + "learning_rate": 2.0616086708499715e-05, + "loss": 1.515, + "step": 3620 + }, + { + "epoch": 0.6197158993667636, + "grad_norm": 20.824888229370117, + "learning_rate": 2.06217912150599e-05, + "loss": 1.8808, + "step": 3621 + }, + { + "epoch": 0.6198870443265446, + "grad_norm": 18.110668182373047, + "learning_rate": 2.062749572162008e-05, + "loss": 1.6689, + "step": 3622 + }, + { + "epoch": 0.6200581892863255, + "grad_norm": 21.38336753845215, + "learning_rate": 2.0633200228180262e-05, + "loss": 2.0968, + "step": 3623 + }, + { + "epoch": 0.6202293342461065, + "grad_norm": 19.99571418762207, + "learning_rate": 
2.0638904734740445e-05, + "loss": 1.7209, + "step": 3624 + }, + { + "epoch": 0.6204004792058874, + "grad_norm": 26.232772827148438, + "learning_rate": 2.064460924130063e-05, + "loss": 2.5226, + "step": 3625 + }, + { + "epoch": 0.6205716241656684, + "grad_norm": 2.9006407260894775, + "learning_rate": 2.065031374786081e-05, + "loss": 0.3453, + "step": 3626 + }, + { + "epoch": 0.6207427691254492, + "grad_norm": 1.3548880815505981, + "learning_rate": 2.0656018254420992e-05, + "loss": 0.318, + "step": 3627 + }, + { + "epoch": 0.6209139140852302, + "grad_norm": 28.67076301574707, + "learning_rate": 2.0661722760981175e-05, + "loss": 2.7404, + "step": 3628 + }, + { + "epoch": 0.6210850590450111, + "grad_norm": 19.108945846557617, + "learning_rate": 2.0667427267541355e-05, + "loss": 1.9497, + "step": 3629 + }, + { + "epoch": 0.6212562040047921, + "grad_norm": 19.495290756225586, + "learning_rate": 2.067313177410154e-05, + "loss": 1.7563, + "step": 3630 + }, + { + "epoch": 0.621427348964573, + "grad_norm": 9.806105613708496, + "learning_rate": 2.0678836280661725e-05, + "loss": 0.7359, + "step": 3631 + }, + { + "epoch": 0.621598493924354, + "grad_norm": 21.068401336669922, + "learning_rate": 2.068454078722191e-05, + "loss": 1.8909, + "step": 3632 + }, + { + "epoch": 0.6217696388841348, + "grad_norm": 24.633346557617188, + "learning_rate": 2.069024529378209e-05, + "loss": 2.1693, + "step": 3633 + }, + { + "epoch": 0.6219407838439158, + "grad_norm": 20.427921295166016, + "learning_rate": 2.0695949800342272e-05, + "loss": 1.9249, + "step": 3634 + }, + { + "epoch": 0.6221119288036967, + "grad_norm": 31.32949447631836, + "learning_rate": 2.0701654306902456e-05, + "loss": 3.3678, + "step": 3635 + }, + { + "epoch": 0.6222830737634777, + "grad_norm": 1.1559346914291382, + "learning_rate": 2.0707358813462636e-05, + "loss": 0.2955, + "step": 3636 + }, + { + "epoch": 0.6224542187232586, + "grad_norm": 31.57821273803711, + "learning_rate": 2.071306332002282e-05, + "loss": 3.6773, + "step": 3637 + }, + { + "epoch": 0.6226253636830396, + "grad_norm": 54.79661560058594, + "learning_rate": 2.0718767826583002e-05, + "loss": 2.7538, + "step": 3638 + }, + { + "epoch": 0.6227965086428204, + "grad_norm": 22.394084930419922, + "learning_rate": 2.0724472333143186e-05, + "loss": 1.9239, + "step": 3639 + }, + { + "epoch": 0.6229676536026014, + "grad_norm": 27.102100372314453, + "learning_rate": 2.0730176839703366e-05, + "loss": 2.4047, + "step": 3640 + }, + { + "epoch": 0.6231387985623823, + "grad_norm": 10.830344200134277, + "learning_rate": 2.073588134626355e-05, + "loss": 0.9573, + "step": 3641 + }, + { + "epoch": 0.6233099435221633, + "grad_norm": 36.204898834228516, + "learning_rate": 2.0741585852823732e-05, + "loss": 4.9509, + "step": 3642 + }, + { + "epoch": 0.6234810884819442, + "grad_norm": 27.16095542907715, + "learning_rate": 2.0747290359383912e-05, + "loss": 3.0999, + "step": 3643 + }, + { + "epoch": 0.6236522334417252, + "grad_norm": 25.359394073486328, + "learning_rate": 2.0752994865944096e-05, + "loss": 1.8888, + "step": 3644 + }, + { + "epoch": 0.623823378401506, + "grad_norm": 27.978900909423828, + "learning_rate": 2.075869937250428e-05, + "loss": 2.641, + "step": 3645 + }, + { + "epoch": 0.623994523361287, + "grad_norm": 22.19740867614746, + "learning_rate": 2.076440387906446e-05, + "loss": 2.2979, + "step": 3646 + }, + { + "epoch": 0.6241656683210679, + "grad_norm": 21.350025177001953, + "learning_rate": 2.0770108385624643e-05, + "loss": 2.7609, + "step": 3647 + }, + { + "epoch": 0.6243368132808489, + 
"grad_norm": 24.283403396606445, + "learning_rate": 2.0775812892184826e-05, + "loss": 2.2185, + "step": 3648 + }, + { + "epoch": 0.6245079582406298, + "grad_norm": 24.77626609802246, + "learning_rate": 2.078151739874501e-05, + "loss": 3.0207, + "step": 3649 + }, + { + "epoch": 0.6246791032004108, + "grad_norm": 25.94424819946289, + "learning_rate": 2.078722190530519e-05, + "loss": 3.1535, + "step": 3650 + }, + { + "epoch": 0.6248502481601916, + "grad_norm": 23.725664138793945, + "learning_rate": 2.0792926411865373e-05, + "loss": 2.1339, + "step": 3651 + }, + { + "epoch": 0.6250213931199726, + "grad_norm": 20.791276931762695, + "learning_rate": 2.0798630918425556e-05, + "loss": 2.2234, + "step": 3652 + }, + { + "epoch": 0.6251925380797535, + "grad_norm": 18.761796951293945, + "learning_rate": 2.0804335424985736e-05, + "loss": 1.7418, + "step": 3653 + }, + { + "epoch": 0.6253636830395345, + "grad_norm": 19.412919998168945, + "learning_rate": 2.0810039931545923e-05, + "loss": 1.6532, + "step": 3654 + }, + { + "epoch": 0.6255348279993154, + "grad_norm": 34.13801193237305, + "learning_rate": 2.0815744438106106e-05, + "loss": 6.6811, + "step": 3655 + }, + { + "epoch": 0.6257059729590964, + "grad_norm": 28.877214431762695, + "learning_rate": 2.082144894466629e-05, + "loss": 3.9092, + "step": 3656 + }, + { + "epoch": 0.6258771179188772, + "grad_norm": 25.978158950805664, + "learning_rate": 2.082715345122647e-05, + "loss": 2.3646, + "step": 3657 + }, + { + "epoch": 0.6260482628786582, + "grad_norm": 45.43318176269531, + "learning_rate": 2.0832857957786653e-05, + "loss": 2.9491, + "step": 3658 + }, + { + "epoch": 0.6262194078384392, + "grad_norm": 25.172359466552734, + "learning_rate": 2.0838562464346836e-05, + "loss": 3.1014, + "step": 3659 + }, + { + "epoch": 0.6263905527982201, + "grad_norm": 31.28904914855957, + "learning_rate": 2.0844266970907016e-05, + "loss": 3.1078, + "step": 3660 + }, + { + "epoch": 0.6265616977580011, + "grad_norm": 28.047880172729492, + "learning_rate": 2.08499714774672e-05, + "loss": 3.2356, + "step": 3661 + }, + { + "epoch": 0.626732842717782, + "grad_norm": 14.974970817565918, + "learning_rate": 2.0855675984027383e-05, + "loss": 1.3538, + "step": 3662 + }, + { + "epoch": 0.626903987677563, + "grad_norm": 1.7350009679794312, + "learning_rate": 2.0861380490587566e-05, + "loss": 0.3083, + "step": 3663 + }, + { + "epoch": 0.6270751326373438, + "grad_norm": 24.56871795654297, + "learning_rate": 2.0867084997147746e-05, + "loss": 2.5915, + "step": 3664 + }, + { + "epoch": 0.6272462775971248, + "grad_norm": 24.486120223999023, + "learning_rate": 2.087278950370793e-05, + "loss": 2.461, + "step": 3665 + }, + { + "epoch": 0.6274174225569057, + "grad_norm": 24.524600982666016, + "learning_rate": 2.0878494010268113e-05, + "loss": 2.5748, + "step": 3666 + }, + { + "epoch": 0.6275885675166867, + "grad_norm": 5.067863464355469, + "learning_rate": 2.0884198516828293e-05, + "loss": 0.5522, + "step": 3667 + }, + { + "epoch": 0.6277597124764676, + "grad_norm": 15.868297576904297, + "learning_rate": 2.0889903023388477e-05, + "loss": 1.6833, + "step": 3668 + }, + { + "epoch": 0.6279308574362485, + "grad_norm": 25.489429473876953, + "learning_rate": 2.089560752994866e-05, + "loss": 2.5381, + "step": 3669 + }, + { + "epoch": 0.6281020023960294, + "grad_norm": 26.983837127685547, + "learning_rate": 2.0901312036508843e-05, + "loss": 3.3307, + "step": 3670 + }, + { + "epoch": 0.6282731473558104, + "grad_norm": 17.707273483276367, + "learning_rate": 2.0907016543069023e-05, + "loss": 1.8142, + 
"step": 3671 + }, + { + "epoch": 0.6284442923155913, + "grad_norm": 23.989248275756836, + "learning_rate": 2.0912721049629207e-05, + "loss": 2.5138, + "step": 3672 + }, + { + "epoch": 0.6286154372753723, + "grad_norm": 24.12046241760254, + "learning_rate": 2.091842555618939e-05, + "loss": 2.7494, + "step": 3673 + }, + { + "epoch": 0.6287865822351532, + "grad_norm": 1.9827460050582886, + "learning_rate": 2.092413006274957e-05, + "loss": 0.2992, + "step": 3674 + }, + { + "epoch": 0.6289577271949341, + "grad_norm": 15.272665977478027, + "learning_rate": 2.0929834569309753e-05, + "loss": 1.5809, + "step": 3675 + }, + { + "epoch": 0.629128872154715, + "grad_norm": 7.758640289306641, + "learning_rate": 2.093553907586994e-05, + "loss": 0.5765, + "step": 3676 + }, + { + "epoch": 0.629300017114496, + "grad_norm": 29.360862731933594, + "learning_rate": 2.094124358243012e-05, + "loss": 3.1532, + "step": 3677 + }, + { + "epoch": 0.6294711620742769, + "grad_norm": 2.6255311965942383, + "learning_rate": 2.0946948088990304e-05, + "loss": 0.3356, + "step": 3678 + }, + { + "epoch": 0.6296423070340579, + "grad_norm": 31.4219970703125, + "learning_rate": 2.0952652595550487e-05, + "loss": 6.6895, + "step": 3679 + }, + { + "epoch": 0.6298134519938388, + "grad_norm": 26.191577911376953, + "learning_rate": 2.095835710211067e-05, + "loss": 2.5616, + "step": 3680 + }, + { + "epoch": 0.6299845969536197, + "grad_norm": 22.00040054321289, + "learning_rate": 2.096406160867085e-05, + "loss": 2.0715, + "step": 3681 + }, + { + "epoch": 0.6301557419134006, + "grad_norm": 18.956966400146484, + "learning_rate": 2.0969766115231034e-05, + "loss": 1.9574, + "step": 3682 + }, + { + "epoch": 0.6303268868731816, + "grad_norm": 27.760032653808594, + "learning_rate": 2.0975470621791217e-05, + "loss": 3.7785, + "step": 3683 + }, + { + "epoch": 0.6304980318329625, + "grad_norm": 10.644538879394531, + "learning_rate": 2.0981175128351397e-05, + "loss": 0.8287, + "step": 3684 + }, + { + "epoch": 0.6306691767927435, + "grad_norm": 51.96141815185547, + "learning_rate": 2.098687963491158e-05, + "loss": 2.4722, + "step": 3685 + }, + { + "epoch": 0.6308403217525244, + "grad_norm": 7.464876174926758, + "learning_rate": 2.0992584141471764e-05, + "loss": 0.6759, + "step": 3686 + }, + { + "epoch": 0.6310114667123053, + "grad_norm": 18.195411682128906, + "learning_rate": 2.0998288648031947e-05, + "loss": 1.814, + "step": 3687 + }, + { + "epoch": 0.6311826116720862, + "grad_norm": 35.01757049560547, + "learning_rate": 2.1003993154592127e-05, + "loss": 4.6493, + "step": 3688 + }, + { + "epoch": 0.6313537566318672, + "grad_norm": 27.28526496887207, + "learning_rate": 2.100969766115231e-05, + "loss": 2.6136, + "step": 3689 + }, + { + "epoch": 0.6315249015916481, + "grad_norm": 10.132340431213379, + "learning_rate": 2.1015402167712494e-05, + "loss": 1.4228, + "step": 3690 + }, + { + "epoch": 0.6316960465514291, + "grad_norm": 29.740331649780273, + "learning_rate": 2.1021106674272674e-05, + "loss": 3.4955, + "step": 3691 + }, + { + "epoch": 0.63186719151121, + "grad_norm": 6.783731937408447, + "learning_rate": 2.1026811180832857e-05, + "loss": 0.589, + "step": 3692 + }, + { + "epoch": 0.632038336470991, + "grad_norm": 26.901226043701172, + "learning_rate": 2.103251568739304e-05, + "loss": 2.8409, + "step": 3693 + }, + { + "epoch": 0.6322094814307718, + "grad_norm": 67.48046112060547, + "learning_rate": 2.1038220193953224e-05, + "loss": 2.4784, + "step": 3694 + }, + { + "epoch": 0.6323806263905528, + "grad_norm": 16.813676834106445, + "learning_rate": 
2.1043924700513404e-05, + "loss": 1.3682, + "step": 3695 + }, + { + "epoch": 0.6325517713503337, + "grad_norm": 27.411855697631836, + "learning_rate": 2.1049629207073587e-05, + "loss": 2.8456, + "step": 3696 + }, + { + "epoch": 0.6327229163101147, + "grad_norm": 73.62898254394531, + "learning_rate": 2.105533371363377e-05, + "loss": 3.2879, + "step": 3697 + }, + { + "epoch": 0.6328940612698956, + "grad_norm": 22.297090530395508, + "learning_rate": 2.106103822019395e-05, + "loss": 2.3233, + "step": 3698 + }, + { + "epoch": 0.6330652062296765, + "grad_norm": 24.923654556274414, + "learning_rate": 2.1066742726754138e-05, + "loss": 2.1826, + "step": 3699 + }, + { + "epoch": 0.6332363511894574, + "grad_norm": 20.588891983032227, + "learning_rate": 2.107244723331432e-05, + "loss": 2.0226, + "step": 3700 + }, + { + "epoch": 0.6334074961492384, + "grad_norm": 5.975876331329346, + "learning_rate": 2.1078151739874504e-05, + "loss": 0.6341, + "step": 3701 + }, + { + "epoch": 0.6335786411090193, + "grad_norm": 21.88986587524414, + "learning_rate": 2.1083856246434684e-05, + "loss": 2.1575, + "step": 3702 + }, + { + "epoch": 0.6337497860688003, + "grad_norm": 19.65184211730957, + "learning_rate": 2.1089560752994868e-05, + "loss": 1.663, + "step": 3703 + }, + { + "epoch": 0.6339209310285812, + "grad_norm": 30.190269470214844, + "learning_rate": 2.109526525955505e-05, + "loss": 3.6871, + "step": 3704 + }, + { + "epoch": 0.6340920759883621, + "grad_norm": 8.569622993469238, + "learning_rate": 2.110096976611523e-05, + "loss": 0.9122, + "step": 3705 + }, + { + "epoch": 0.634263220948143, + "grad_norm": 59.63459777832031, + "learning_rate": 2.1106674272675414e-05, + "loss": 2.3494, + "step": 3706 + }, + { + "epoch": 0.634434365907924, + "grad_norm": 26.309602737426758, + "learning_rate": 2.1112378779235598e-05, + "loss": 3.1797, + "step": 3707 + }, + { + "epoch": 0.6346055108677049, + "grad_norm": 28.47893714904785, + "learning_rate": 2.1118083285795778e-05, + "loss": 3.2675, + "step": 3708 + }, + { + "epoch": 0.6347766558274859, + "grad_norm": 24.453359603881836, + "learning_rate": 2.112378779235596e-05, + "loss": 2.5852, + "step": 3709 + }, + { + "epoch": 0.6349478007872669, + "grad_norm": 24.291488647460938, + "learning_rate": 2.1129492298916145e-05, + "loss": 2.3409, + "step": 3710 + }, + { + "epoch": 0.6351189457470477, + "grad_norm": 26.9472599029541, + "learning_rate": 2.1135196805476328e-05, + "loss": 3.1146, + "step": 3711 + }, + { + "epoch": 0.6352900907068287, + "grad_norm": 20.753297805786133, + "learning_rate": 2.1140901312036508e-05, + "loss": 1.8245, + "step": 3712 + }, + { + "epoch": 0.6354612356666096, + "grad_norm": 11.65485668182373, + "learning_rate": 2.114660581859669e-05, + "loss": 0.8492, + "step": 3713 + }, + { + "epoch": 0.6356323806263906, + "grad_norm": 18.369417190551758, + "learning_rate": 2.1152310325156875e-05, + "loss": 1.8056, + "step": 3714 + }, + { + "epoch": 0.6358035255861715, + "grad_norm": 3.041557788848877, + "learning_rate": 2.1158014831717055e-05, + "loss": 0.3459, + "step": 3715 + }, + { + "epoch": 0.6359746705459525, + "grad_norm": 31.116910934448242, + "learning_rate": 2.1163719338277238e-05, + "loss": 2.4418, + "step": 3716 + }, + { + "epoch": 0.6361458155057333, + "grad_norm": 26.295557022094727, + "learning_rate": 2.116942384483742e-05, + "loss": 1.8444, + "step": 3717 + }, + { + "epoch": 0.6363169604655143, + "grad_norm": 25.38450813293457, + "learning_rate": 2.1175128351397605e-05, + "loss": 2.2447, + "step": 3718 + }, + { + "epoch": 0.6364881054252952, + 
"grad_norm": 25.307218551635742, + "learning_rate": 2.1180832857957785e-05, + "loss": 2.5005, + "step": 3719 + }, + { + "epoch": 0.6366592503850762, + "grad_norm": 2.224104642868042, + "learning_rate": 2.1186537364517968e-05, + "loss": 0.3241, + "step": 3720 + }, + { + "epoch": 0.6368303953448571, + "grad_norm": 17.863842010498047, + "learning_rate": 2.119224187107815e-05, + "loss": 1.8059, + "step": 3721 + }, + { + "epoch": 0.6370015403046381, + "grad_norm": 143.09255981445312, + "learning_rate": 2.1197946377638335e-05, + "loss": 7.8615, + "step": 3722 + }, + { + "epoch": 0.637172685264419, + "grad_norm": 20.51776695251465, + "learning_rate": 2.120365088419852e-05, + "loss": 1.9465, + "step": 3723 + }, + { + "epoch": 0.6373438302241999, + "grad_norm": 19.772676467895508, + "learning_rate": 2.1209355390758702e-05, + "loss": 1.874, + "step": 3724 + }, + { + "epoch": 0.6375149751839808, + "grad_norm": 29.25998306274414, + "learning_rate": 2.1215059897318885e-05, + "loss": 3.1729, + "step": 3725 + }, + { + "epoch": 0.6376861201437618, + "grad_norm": 68.94001770019531, + "learning_rate": 2.1220764403879065e-05, + "loss": 7.3574, + "step": 3726 + }, + { + "epoch": 0.6378572651035427, + "grad_norm": 26.3350887298584, + "learning_rate": 2.122646891043925e-05, + "loss": 3.0566, + "step": 3727 + }, + { + "epoch": 0.6380284100633237, + "grad_norm": 1.6111328601837158, + "learning_rate": 2.1232173416999432e-05, + "loss": 0.2934, + "step": 3728 + }, + { + "epoch": 0.6381995550231045, + "grad_norm": 20.667644500732422, + "learning_rate": 2.1237877923559612e-05, + "loss": 2.0751, + "step": 3729 + }, + { + "epoch": 0.6383706999828855, + "grad_norm": 14.264472961425781, + "learning_rate": 2.1243582430119795e-05, + "loss": 1.0027, + "step": 3730 + }, + { + "epoch": 0.6385418449426664, + "grad_norm": 22.407548904418945, + "learning_rate": 2.124928693667998e-05, + "loss": 1.9791, + "step": 3731 + }, + { + "epoch": 0.6387129899024474, + "grad_norm": 31.578723907470703, + "learning_rate": 2.1254991443240162e-05, + "loss": 2.7384, + "step": 3732 + }, + { + "epoch": 0.6388841348622283, + "grad_norm": 2.103879690170288, + "learning_rate": 2.1260695949800342e-05, + "loss": 0.312, + "step": 3733 + }, + { + "epoch": 0.6390552798220093, + "grad_norm": 68.59303283691406, + "learning_rate": 2.1266400456360525e-05, + "loss": 7.0889, + "step": 3734 + }, + { + "epoch": 0.6392264247817901, + "grad_norm": 7.920656681060791, + "learning_rate": 2.127210496292071e-05, + "loss": 1.09, + "step": 3735 + }, + { + "epoch": 0.6393975697415711, + "grad_norm": 29.83785057067871, + "learning_rate": 2.127780946948089e-05, + "loss": 2.6416, + "step": 3736 + }, + { + "epoch": 0.639568714701352, + "grad_norm": 22.06658172607422, + "learning_rate": 2.1283513976041072e-05, + "loss": 2.3255, + "step": 3737 + }, + { + "epoch": 0.639739859661133, + "grad_norm": 18.649507522583008, + "learning_rate": 2.1289218482601255e-05, + "loss": 1.5747, + "step": 3738 + }, + { + "epoch": 0.6399110046209139, + "grad_norm": 25.857921600341797, + "learning_rate": 2.129492298916144e-05, + "loss": 2.3616, + "step": 3739 + }, + { + "epoch": 0.6400821495806949, + "grad_norm": 27.74761390686035, + "learning_rate": 2.130062749572162e-05, + "loss": 3.559, + "step": 3740 + }, + { + "epoch": 0.6402532945404757, + "grad_norm": 42.965763092041016, + "learning_rate": 2.1306332002281802e-05, + "loss": 6.2665, + "step": 3741 + }, + { + "epoch": 0.6404244395002567, + "grad_norm": 13.754633903503418, + "learning_rate": 2.1312036508841986e-05, + "loss": 0.9964, + "step": 3742 
+ }, + { + "epoch": 0.6405955844600376, + "grad_norm": 30.93828010559082, + "learning_rate": 2.1317741015402166e-05, + "loss": 2.1571, + "step": 3743 + }, + { + "epoch": 0.6407667294198186, + "grad_norm": 13.315516471862793, + "learning_rate": 2.132344552196235e-05, + "loss": 0.9772, + "step": 3744 + }, + { + "epoch": 0.6409378743795995, + "grad_norm": 25.19696617126465, + "learning_rate": 2.1329150028522536e-05, + "loss": 2.2795, + "step": 3745 + }, + { + "epoch": 0.6411090193393805, + "grad_norm": 19.73836326599121, + "learning_rate": 2.1334854535082716e-05, + "loss": 2.1502, + "step": 3746 + }, + { + "epoch": 0.6412801642991613, + "grad_norm": 13.088768005371094, + "learning_rate": 2.13405590416429e-05, + "loss": 0.6967, + "step": 3747 + }, + { + "epoch": 0.6414513092589423, + "grad_norm": 8.49268627166748, + "learning_rate": 2.1346263548203082e-05, + "loss": 0.8886, + "step": 3748 + }, + { + "epoch": 0.6416224542187232, + "grad_norm": 14.05295181274414, + "learning_rate": 2.1351968054763266e-05, + "loss": 1.1393, + "step": 3749 + }, + { + "epoch": 0.6417935991785042, + "grad_norm": 35.74152755737305, + "learning_rate": 2.1357672561323446e-05, + "loss": 3.8807, + "step": 3750 + }, + { + "epoch": 0.6419647441382851, + "grad_norm": 12.692075729370117, + "learning_rate": 2.136337706788363e-05, + "loss": 0.9175, + "step": 3751 + }, + { + "epoch": 0.6421358890980661, + "grad_norm": 3.9487550258636475, + "learning_rate": 2.1369081574443813e-05, + "loss": 0.3691, + "step": 3752 + }, + { + "epoch": 0.642307034057847, + "grad_norm": 25.308374404907227, + "learning_rate": 2.1374786081003993e-05, + "loss": 3.566, + "step": 3753 + }, + { + "epoch": 0.6424781790176279, + "grad_norm": 27.15434455871582, + "learning_rate": 2.1380490587564176e-05, + "loss": 2.8604, + "step": 3754 + }, + { + "epoch": 0.6426493239774088, + "grad_norm": 28.912559509277344, + "learning_rate": 2.138619509412436e-05, + "loss": 2.7714, + "step": 3755 + }, + { + "epoch": 0.6428204689371898, + "grad_norm": 185.9123992919922, + "learning_rate": 2.1391899600684543e-05, + "loss": 11.7502, + "step": 3756 + }, + { + "epoch": 0.6429916138969707, + "grad_norm": 33.01267623901367, + "learning_rate": 2.1397604107244723e-05, + "loss": 1.9282, + "step": 3757 + }, + { + "epoch": 0.6431627588567517, + "grad_norm": 32.01244354248047, + "learning_rate": 2.1403308613804906e-05, + "loss": 3.0304, + "step": 3758 + }, + { + "epoch": 0.6433339038165325, + "grad_norm": 26.142648696899414, + "learning_rate": 2.140901312036509e-05, + "loss": 2.5164, + "step": 3759 + }, + { + "epoch": 0.6435050487763135, + "grad_norm": 24.568946838378906, + "learning_rate": 2.141471762692527e-05, + "loss": 2.3979, + "step": 3760 + }, + { + "epoch": 0.6436761937360945, + "grad_norm": 26.051137924194336, + "learning_rate": 2.1420422133485453e-05, + "loss": 3.0876, + "step": 3761 + }, + { + "epoch": 0.6438473386958754, + "grad_norm": 17.334243774414062, + "learning_rate": 2.1426126640045636e-05, + "loss": 1.4332, + "step": 3762 + }, + { + "epoch": 0.6440184836556564, + "grad_norm": 29.278783798217773, + "learning_rate": 2.143183114660582e-05, + "loss": 2.8496, + "step": 3763 + }, + { + "epoch": 0.6441896286154373, + "grad_norm": 24.168411254882812, + "learning_rate": 2.1437535653166e-05, + "loss": 2.4451, + "step": 3764 + }, + { + "epoch": 0.6443607735752183, + "grad_norm": 31.55498504638672, + "learning_rate": 2.1443240159726183e-05, + "loss": 2.0817, + "step": 3765 + }, + { + "epoch": 0.6445319185349991, + "grad_norm": 32.670005798339844, + "learning_rate": 
2.1448944666286366e-05, + "loss": 6.7024, + "step": 3766 + }, + { + "epoch": 0.6447030634947801, + "grad_norm": 2.9258251190185547, + "learning_rate": 2.1454649172846546e-05, + "loss": 0.3334, + "step": 3767 + }, + { + "epoch": 0.644874208454561, + "grad_norm": 12.174300193786621, + "learning_rate": 2.1460353679406733e-05, + "loss": 0.8169, + "step": 3768 + }, + { + "epoch": 0.645045353414342, + "grad_norm": 28.00621223449707, + "learning_rate": 2.1466058185966917e-05, + "loss": 3.2524, + "step": 3769 + }, + { + "epoch": 0.6452164983741229, + "grad_norm": 26.712377548217773, + "learning_rate": 2.14717626925271e-05, + "loss": 2.7371, + "step": 3770 + }, + { + "epoch": 0.6453876433339039, + "grad_norm": 21.200624465942383, + "learning_rate": 2.147746719908728e-05, + "loss": 2.6966, + "step": 3771 + }, + { + "epoch": 0.6455587882936847, + "grad_norm": 11.284048080444336, + "learning_rate": 2.1483171705647463e-05, + "loss": 0.7751, + "step": 3772 + }, + { + "epoch": 0.6457299332534657, + "grad_norm": 11.401342391967773, + "learning_rate": 2.1488876212207647e-05, + "loss": 0.7674, + "step": 3773 + }, + { + "epoch": 0.6459010782132466, + "grad_norm": 6.0696587562561035, + "learning_rate": 2.1494580718767827e-05, + "loss": 0.6054, + "step": 3774 + }, + { + "epoch": 0.6460722231730276, + "grad_norm": 27.059720993041992, + "learning_rate": 2.150028522532801e-05, + "loss": 3.0029, + "step": 3775 + }, + { + "epoch": 0.6462433681328085, + "grad_norm": 20.365650177001953, + "learning_rate": 2.1505989731888193e-05, + "loss": 2.0427, + "step": 3776 + }, + { + "epoch": 0.6464145130925895, + "grad_norm": 6.034448146820068, + "learning_rate": 2.1511694238448373e-05, + "loss": 0.6441, + "step": 3777 + }, + { + "epoch": 0.6465856580523703, + "grad_norm": 23.394229888916016, + "learning_rate": 2.1517398745008557e-05, + "loss": 2.4589, + "step": 3778 + }, + { + "epoch": 0.6467568030121513, + "grad_norm": 46.16388702392578, + "learning_rate": 2.152310325156874e-05, + "loss": 7.0883, + "step": 3779 + }, + { + "epoch": 0.6469279479719322, + "grad_norm": 23.118371963500977, + "learning_rate": 2.1528807758128924e-05, + "loss": 2.5132, + "step": 3780 + }, + { + "epoch": 0.6470990929317132, + "grad_norm": 29.06417465209961, + "learning_rate": 2.1534512264689103e-05, + "loss": 2.967, + "step": 3781 + }, + { + "epoch": 0.6472702378914941, + "grad_norm": 4.700016498565674, + "learning_rate": 2.1540216771249287e-05, + "loss": 0.3476, + "step": 3782 + }, + { + "epoch": 0.6474413828512751, + "grad_norm": 9.841592788696289, + "learning_rate": 2.154592127780947e-05, + "loss": 0.6132, + "step": 3783 + }, + { + "epoch": 0.6476125278110559, + "grad_norm": 6.57474946975708, + "learning_rate": 2.155162578436965e-05, + "loss": 0.5898, + "step": 3784 + }, + { + "epoch": 0.6477836727708369, + "grad_norm": 10.39101791381836, + "learning_rate": 2.1557330290929834e-05, + "loss": 0.9835, + "step": 3785 + }, + { + "epoch": 0.6479548177306178, + "grad_norm": 23.276077270507812, + "learning_rate": 2.1563034797490017e-05, + "loss": 2.6405, + "step": 3786 + }, + { + "epoch": 0.6481259626903988, + "grad_norm": 25.941986083984375, + "learning_rate": 2.15687393040502e-05, + "loss": 2.89, + "step": 3787 + }, + { + "epoch": 0.6482971076501797, + "grad_norm": 32.4000129699707, + "learning_rate": 2.157444381061038e-05, + "loss": 3.63, + "step": 3788 + }, + { + "epoch": 0.6484682526099607, + "grad_norm": 36.605247497558594, + "learning_rate": 2.1580148317170564e-05, + "loss": 6.3912, + "step": 3789 + }, + { + "epoch": 0.6486393975697415, + 
"grad_norm": 31.485267639160156, + "learning_rate": 2.1585852823730747e-05, + "loss": 3.3366, + "step": 3790 + }, + { + "epoch": 0.6488105425295225, + "grad_norm": 23.93595314025879, + "learning_rate": 2.159155733029093e-05, + "loss": 2.4522, + "step": 3791 + }, + { + "epoch": 0.6489816874893034, + "grad_norm": 25.628398895263672, + "learning_rate": 2.1597261836851114e-05, + "loss": 3.0389, + "step": 3792 + }, + { + "epoch": 0.6491528324490844, + "grad_norm": 25.61122703552246, + "learning_rate": 2.1602966343411297e-05, + "loss": 2.1422, + "step": 3793 + }, + { + "epoch": 0.6493239774088653, + "grad_norm": 26.866369247436523, + "learning_rate": 2.160867084997148e-05, + "loss": 2.8132, + "step": 3794 + }, + { + "epoch": 0.6494951223686463, + "grad_norm": 60.774818420410156, + "learning_rate": 2.161437535653166e-05, + "loss": 6.1755, + "step": 3795 + }, + { + "epoch": 0.6496662673284271, + "grad_norm": 12.326183319091797, + "learning_rate": 2.1620079863091844e-05, + "loss": 0.839, + "step": 3796 + }, + { + "epoch": 0.6498374122882081, + "grad_norm": 22.0472354888916, + "learning_rate": 2.1625784369652027e-05, + "loss": 2.4891, + "step": 3797 + }, + { + "epoch": 0.650008557247989, + "grad_norm": 8.49109935760498, + "learning_rate": 2.1631488876212207e-05, + "loss": 0.7858, + "step": 3798 + }, + { + "epoch": 0.65017970220777, + "grad_norm": 46.285579681396484, + "learning_rate": 2.163719338277239e-05, + "loss": 2.7199, + "step": 3799 + }, + { + "epoch": 0.6503508471675509, + "grad_norm": 16.816619873046875, + "learning_rate": 2.1642897889332574e-05, + "loss": 1.5917, + "step": 3800 + }, + { + "epoch": 0.6505219921273319, + "grad_norm": 28.71768569946289, + "learning_rate": 2.1648602395892758e-05, + "loss": 3.4323, + "step": 3801 + }, + { + "epoch": 0.6506931370871127, + "grad_norm": 23.60746192932129, + "learning_rate": 2.1654306902452938e-05, + "loss": 2.0654, + "step": 3802 + }, + { + "epoch": 0.6508642820468937, + "grad_norm": 26.327360153198242, + "learning_rate": 2.166001140901312e-05, + "loss": 1.8876, + "step": 3803 + }, + { + "epoch": 0.6510354270066746, + "grad_norm": 44.482337951660156, + "learning_rate": 2.1665715915573304e-05, + "loss": 6.5739, + "step": 3804 + }, + { + "epoch": 0.6512065719664556, + "grad_norm": 27.297197341918945, + "learning_rate": 2.1671420422133484e-05, + "loss": 2.1462, + "step": 3805 + }, + { + "epoch": 0.6513777169262365, + "grad_norm": 11.890837669372559, + "learning_rate": 2.1677124928693668e-05, + "loss": 0.7348, + "step": 3806 + }, + { + "epoch": 0.6515488618860175, + "grad_norm": 27.35932731628418, + "learning_rate": 2.168282943525385e-05, + "loss": 2.9561, + "step": 3807 + }, + { + "epoch": 0.6517200068457983, + "grad_norm": 25.932842254638672, + "learning_rate": 2.1688533941814034e-05, + "loss": 2.5524, + "step": 3808 + }, + { + "epoch": 0.6518911518055793, + "grad_norm": 7.344489574432373, + "learning_rate": 2.1694238448374214e-05, + "loss": 0.8045, + "step": 3809 + }, + { + "epoch": 0.6520622967653602, + "grad_norm": 24.049985885620117, + "learning_rate": 2.1699942954934398e-05, + "loss": 2.3474, + "step": 3810 + }, + { + "epoch": 0.6522334417251412, + "grad_norm": 25.154258728027344, + "learning_rate": 2.170564746149458e-05, + "loss": 2.8768, + "step": 3811 + }, + { + "epoch": 0.6524045866849222, + "grad_norm": 35.475502014160156, + "learning_rate": 2.171135196805476e-05, + "loss": 5.6263, + "step": 3812 + }, + { + "epoch": 0.6525757316447031, + "grad_norm": 18.898576736450195, + "learning_rate": 2.1717056474614945e-05, + "loss": 2.0987, + "step": 
3813 + }, + { + "epoch": 0.652746876604484, + "grad_norm": 64.42694091796875, + "learning_rate": 2.172276098117513e-05, + "loss": 1.9397, + "step": 3814 + }, + { + "epoch": 0.6529180215642649, + "grad_norm": 51.23388671875, + "learning_rate": 2.172846548773531e-05, + "loss": 2.187, + "step": 3815 + }, + { + "epoch": 0.6530891665240459, + "grad_norm": 24.042943954467773, + "learning_rate": 2.1734169994295495e-05, + "loss": 2.9904, + "step": 3816 + }, + { + "epoch": 0.6532603114838268, + "grad_norm": 4.368581295013428, + "learning_rate": 2.1739874500855678e-05, + "loss": 0.442, + "step": 3817 + }, + { + "epoch": 0.6534314564436078, + "grad_norm": 22.971675872802734, + "learning_rate": 2.174557900741586e-05, + "loss": 2.3311, + "step": 3818 + }, + { + "epoch": 0.6536026014033887, + "grad_norm": 23.986604690551758, + "learning_rate": 2.175128351397604e-05, + "loss": 2.6135, + "step": 3819 + }, + { + "epoch": 0.6537737463631696, + "grad_norm": 28.69915771484375, + "learning_rate": 2.1756988020536225e-05, + "loss": 3.5022, + "step": 3820 + }, + { + "epoch": 0.6539448913229505, + "grad_norm": 8.601239204406738, + "learning_rate": 2.1762692527096408e-05, + "loss": 0.6592, + "step": 3821 + }, + { + "epoch": 0.6541160362827315, + "grad_norm": 22.482227325439453, + "learning_rate": 2.1768397033656588e-05, + "loss": 2.4048, + "step": 3822 + }, + { + "epoch": 0.6542871812425124, + "grad_norm": 25.31351089477539, + "learning_rate": 2.177410154021677e-05, + "loss": 3.4277, + "step": 3823 + }, + { + "epoch": 0.6544583262022934, + "grad_norm": 20.58570671081543, + "learning_rate": 2.1779806046776955e-05, + "loss": 2.1318, + "step": 3824 + }, + { + "epoch": 0.6546294711620743, + "grad_norm": 15.284663200378418, + "learning_rate": 2.1785510553337138e-05, + "loss": 1.6332, + "step": 3825 + }, + { + "epoch": 0.6548006161218553, + "grad_norm": 22.946290969848633, + "learning_rate": 2.1791215059897318e-05, + "loss": 2.5015, + "step": 3826 + }, + { + "epoch": 0.6549717610816361, + "grad_norm": 49.23842239379883, + "learning_rate": 2.17969195664575e-05, + "loss": 7.6205, + "step": 3827 + }, + { + "epoch": 0.6551429060414171, + "grad_norm": 16.11168670654297, + "learning_rate": 2.1802624073017685e-05, + "loss": 1.5222, + "step": 3828 + }, + { + "epoch": 0.655314051001198, + "grad_norm": 25.72747039794922, + "learning_rate": 2.1808328579577865e-05, + "loss": 2.7138, + "step": 3829 + }, + { + "epoch": 0.655485195960979, + "grad_norm": 14.393827438354492, + "learning_rate": 2.181403308613805e-05, + "loss": 1.1036, + "step": 3830 + }, + { + "epoch": 0.6556563409207599, + "grad_norm": 27.66619300842285, + "learning_rate": 2.1819737592698232e-05, + "loss": 2.5863, + "step": 3831 + }, + { + "epoch": 0.6558274858805409, + "grad_norm": 34.8533935546875, + "learning_rate": 2.1825442099258415e-05, + "loss": 6.3297, + "step": 3832 + }, + { + "epoch": 0.6559986308403217, + "grad_norm": 32.486045837402344, + "learning_rate": 2.1831146605818595e-05, + "loss": 3.5127, + "step": 3833 + }, + { + "epoch": 0.6561697758001027, + "grad_norm": 22.248271942138672, + "learning_rate": 2.183685111237878e-05, + "loss": 2.0005, + "step": 3834 + }, + { + "epoch": 0.6563409207598836, + "grad_norm": 10.389534950256348, + "learning_rate": 2.1842555618938962e-05, + "loss": 0.7072, + "step": 3835 + }, + { + "epoch": 0.6565120657196646, + "grad_norm": 11.556964874267578, + "learning_rate": 2.1848260125499145e-05, + "loss": 0.8002, + "step": 3836 + }, + { + "epoch": 0.6566832106794455, + "grad_norm": 41.708778381347656, + "learning_rate": 
2.185396463205933e-05, + "loss": 6.6155, + "step": 3837 + }, + { + "epoch": 0.6568543556392265, + "grad_norm": 26.74636459350586, + "learning_rate": 2.1859669138619512e-05, + "loss": 3.0992, + "step": 3838 + }, + { + "epoch": 0.6570255005990073, + "grad_norm": 24.822227478027344, + "learning_rate": 2.1865373645179695e-05, + "loss": 1.993, + "step": 3839 + }, + { + "epoch": 0.6571966455587883, + "grad_norm": 28.681447982788086, + "learning_rate": 2.1871078151739875e-05, + "loss": 2.4919, + "step": 3840 + }, + { + "epoch": 0.6573677905185692, + "grad_norm": 24.745319366455078, + "learning_rate": 2.187678265830006e-05, + "loss": 2.4664, + "step": 3841 + }, + { + "epoch": 0.6575389354783502, + "grad_norm": 21.362070083618164, + "learning_rate": 2.1882487164860242e-05, + "loss": 1.9891, + "step": 3842 + }, + { + "epoch": 0.6577100804381311, + "grad_norm": 5.6285481452941895, + "learning_rate": 2.1888191671420422e-05, + "loss": 0.5344, + "step": 3843 + }, + { + "epoch": 0.657881225397912, + "grad_norm": 22.420122146606445, + "learning_rate": 2.1893896177980606e-05, + "loss": 2.1797, + "step": 3844 + }, + { + "epoch": 0.6580523703576929, + "grad_norm": 1.2516911029815674, + "learning_rate": 2.189960068454079e-05, + "loss": 0.284, + "step": 3845 + }, + { + "epoch": 0.6582235153174739, + "grad_norm": 16.150665283203125, + "learning_rate": 2.190530519110097e-05, + "loss": 1.2482, + "step": 3846 + }, + { + "epoch": 0.6583946602772548, + "grad_norm": 29.241182327270508, + "learning_rate": 2.1911009697661152e-05, + "loss": 3.0362, + "step": 3847 + }, + { + "epoch": 0.6585658052370358, + "grad_norm": 12.874473571777344, + "learning_rate": 2.1916714204221336e-05, + "loss": 1.3586, + "step": 3848 + }, + { + "epoch": 0.6587369501968167, + "grad_norm": 25.05373764038086, + "learning_rate": 2.192241871078152e-05, + "loss": 2.9891, + "step": 3849 + }, + { + "epoch": 0.6589080951565977, + "grad_norm": 5.338008880615234, + "learning_rate": 2.19281232173417e-05, + "loss": 0.6053, + "step": 3850 + }, + { + "epoch": 0.6590792401163785, + "grad_norm": 24.934619903564453, + "learning_rate": 2.1933827723901882e-05, + "loss": 3.4717, + "step": 3851 + }, + { + "epoch": 0.6592503850761595, + "grad_norm": 18.92520523071289, + "learning_rate": 2.1939532230462066e-05, + "loss": 1.792, + "step": 3852 + }, + { + "epoch": 0.6594215300359404, + "grad_norm": 22.526769638061523, + "learning_rate": 2.1945236737022246e-05, + "loss": 2.2758, + "step": 3853 + }, + { + "epoch": 0.6595926749957214, + "grad_norm": 13.877473831176758, + "learning_rate": 2.195094124358243e-05, + "loss": 0.9918, + "step": 3854 + }, + { + "epoch": 0.6597638199555023, + "grad_norm": 26.623685836791992, + "learning_rate": 2.1956645750142613e-05, + "loss": 3.1529, + "step": 3855 + }, + { + "epoch": 0.6599349649152833, + "grad_norm": 9.52644157409668, + "learning_rate": 2.1962350256702796e-05, + "loss": 1.4529, + "step": 3856 + }, + { + "epoch": 0.6601061098750641, + "grad_norm": 27.445514678955078, + "learning_rate": 2.1968054763262976e-05, + "loss": 3.2014, + "step": 3857 + }, + { + "epoch": 0.6602772548348451, + "grad_norm": 26.250980377197266, + "learning_rate": 2.197375926982316e-05, + "loss": 2.4078, + "step": 3858 + }, + { + "epoch": 0.660448399794626, + "grad_norm": 13.75600814819336, + "learning_rate": 2.1979463776383346e-05, + "loss": 1.2816, + "step": 3859 + }, + { + "epoch": 0.660619544754407, + "grad_norm": 39.159141540527344, + "learning_rate": 2.1985168282943526e-05, + "loss": 7.109, + "step": 3860 + }, + { + "epoch": 0.6607906897141879, + 
"grad_norm": 29.373641967773438, + "learning_rate": 2.199087278950371e-05, + "loss": 3.3187, + "step": 3861 + }, + { + "epoch": 0.6609618346739689, + "grad_norm": 21.516931533813477, + "learning_rate": 2.1996577296063893e-05, + "loss": 2.3298, + "step": 3862 + }, + { + "epoch": 0.6611329796337498, + "grad_norm": 95.70633697509766, + "learning_rate": 2.2002281802624076e-05, + "loss": 7.5026, + "step": 3863 + }, + { + "epoch": 0.6613041245935307, + "grad_norm": 34.55537796020508, + "learning_rate": 2.2007986309184256e-05, + "loss": 4.4655, + "step": 3864 + }, + { + "epoch": 0.6614752695533117, + "grad_norm": 6.342493534088135, + "learning_rate": 2.201369081574444e-05, + "loss": 0.7267, + "step": 3865 + }, + { + "epoch": 0.6616464145130926, + "grad_norm": 21.994108200073242, + "learning_rate": 2.2019395322304623e-05, + "loss": 1.894, + "step": 3866 + }, + { + "epoch": 0.6618175594728736, + "grad_norm": 62.81085205078125, + "learning_rate": 2.2025099828864803e-05, + "loss": 2.5491, + "step": 3867 + }, + { + "epoch": 0.6619887044326545, + "grad_norm": 13.31482219696045, + "learning_rate": 2.2030804335424986e-05, + "loss": 1.2874, + "step": 3868 + }, + { + "epoch": 0.6621598493924354, + "grad_norm": 37.990882873535156, + "learning_rate": 2.203650884198517e-05, + "loss": 6.5223, + "step": 3869 + }, + { + "epoch": 0.6623309943522163, + "grad_norm": 7.525701522827148, + "learning_rate": 2.2042213348545353e-05, + "loss": 0.7646, + "step": 3870 + }, + { + "epoch": 0.6625021393119973, + "grad_norm": 28.383960723876953, + "learning_rate": 2.2047917855105533e-05, + "loss": 3.8601, + "step": 3871 + }, + { + "epoch": 0.6626732842717782, + "grad_norm": 25.914175033569336, + "learning_rate": 2.2053622361665716e-05, + "loss": 2.8097, + "step": 3872 + }, + { + "epoch": 0.6628444292315592, + "grad_norm": 25.789932250976562, + "learning_rate": 2.20593268682259e-05, + "loss": 2.4306, + "step": 3873 + }, + { + "epoch": 0.66301557419134, + "grad_norm": 25.461977005004883, + "learning_rate": 2.206503137478608e-05, + "loss": 3.0559, + "step": 3874 + }, + { + "epoch": 0.663186719151121, + "grad_norm": 21.75908088684082, + "learning_rate": 2.2070735881346263e-05, + "loss": 2.186, + "step": 3875 + }, + { + "epoch": 0.6633578641109019, + "grad_norm": 26.937217712402344, + "learning_rate": 2.2076440387906447e-05, + "loss": 3.0092, + "step": 3876 + }, + { + "epoch": 0.6635290090706829, + "grad_norm": 23.69822883605957, + "learning_rate": 2.2082144894466627e-05, + "loss": 2.4399, + "step": 3877 + }, + { + "epoch": 0.6637001540304638, + "grad_norm": 2.651796579360962, + "learning_rate": 2.208784940102681e-05, + "loss": 0.3089, + "step": 3878 + }, + { + "epoch": 0.6638712989902448, + "grad_norm": 13.779840469360352, + "learning_rate": 2.2093553907586993e-05, + "loss": 1.4293, + "step": 3879 + }, + { + "epoch": 0.6640424439500257, + "grad_norm": 9.433732032775879, + "learning_rate": 2.2099258414147177e-05, + "loss": 0.738, + "step": 3880 + }, + { + "epoch": 0.6642135889098066, + "grad_norm": 8.053833961486816, + "learning_rate": 2.2104962920707357e-05, + "loss": 0.8599, + "step": 3881 + }, + { + "epoch": 0.6643847338695875, + "grad_norm": 8.789999008178711, + "learning_rate": 2.2110667427267543e-05, + "loss": 1.2145, + "step": 3882 + }, + { + "epoch": 0.6645558788293685, + "grad_norm": 35.39924240112305, + "learning_rate": 2.2116371933827727e-05, + "loss": 3.1608, + "step": 3883 + }, + { + "epoch": 0.6647270237891494, + "grad_norm": 123.84489440917969, + "learning_rate": 2.2122076440387907e-05, + "loss": 6.8157, + "step": 
3884 + }, + { + "epoch": 0.6648981687489304, + "grad_norm": 24.360124588012695, + "learning_rate": 2.212778094694809e-05, + "loss": 2.1383, + "step": 3885 + }, + { + "epoch": 0.6650693137087113, + "grad_norm": 26.01473045349121, + "learning_rate": 2.2133485453508274e-05, + "loss": 3.114, + "step": 3886 + }, + { + "epoch": 0.6652404586684922, + "grad_norm": 27.838552474975586, + "learning_rate": 2.2139189960068457e-05, + "loss": 2.8416, + "step": 3887 + }, + { + "epoch": 0.6654116036282731, + "grad_norm": 27.128395080566406, + "learning_rate": 2.2144894466628637e-05, + "loss": 2.7295, + "step": 3888 + }, + { + "epoch": 0.6655827485880541, + "grad_norm": 20.852027893066406, + "learning_rate": 2.215059897318882e-05, + "loss": 2.2252, + "step": 3889 + }, + { + "epoch": 0.665753893547835, + "grad_norm": 51.29511260986328, + "learning_rate": 2.2156303479749004e-05, + "loss": 2.4494, + "step": 3890 + }, + { + "epoch": 0.665925038507616, + "grad_norm": 27.06675910949707, + "learning_rate": 2.2162007986309184e-05, + "loss": 2.8832, + "step": 3891 + }, + { + "epoch": 0.6660961834673969, + "grad_norm": 8.248744010925293, + "learning_rate": 2.2167712492869367e-05, + "loss": 0.7378, + "step": 3892 + }, + { + "epoch": 0.6662673284271778, + "grad_norm": 23.798690795898438, + "learning_rate": 2.217341699942955e-05, + "loss": 3.3288, + "step": 3893 + }, + { + "epoch": 0.6664384733869587, + "grad_norm": 25.778766632080078, + "learning_rate": 2.2179121505989734e-05, + "loss": 3.0603, + "step": 3894 + }, + { + "epoch": 0.6666096183467397, + "grad_norm": 2.8828747272491455, + "learning_rate": 2.2184826012549914e-05, + "loss": 0.3141, + "step": 3895 + }, + { + "epoch": 0.6667807633065206, + "grad_norm": 26.101049423217773, + "learning_rate": 2.2190530519110097e-05, + "loss": 3.1024, + "step": 3896 + }, + { + "epoch": 0.6669519082663016, + "grad_norm": 20.340776443481445, + "learning_rate": 2.219623502567028e-05, + "loss": 1.7807, + "step": 3897 + }, + { + "epoch": 0.6671230532260825, + "grad_norm": 22.285655975341797, + "learning_rate": 2.220193953223046e-05, + "loss": 1.9623, + "step": 3898 + }, + { + "epoch": 0.6672941981858634, + "grad_norm": 10.816023826599121, + "learning_rate": 2.2207644038790644e-05, + "loss": 0.7881, + "step": 3899 + }, + { + "epoch": 0.6674653431456443, + "grad_norm": 25.382898330688477, + "learning_rate": 2.2213348545350827e-05, + "loss": 2.2422, + "step": 3900 + }, + { + "epoch": 0.6676364881054253, + "grad_norm": 18.11640167236328, + "learning_rate": 2.221905305191101e-05, + "loss": 1.323, + "step": 3901 + }, + { + "epoch": 0.6678076330652062, + "grad_norm": 30.607837677001953, + "learning_rate": 2.222475755847119e-05, + "loss": 2.3077, + "step": 3902 + }, + { + "epoch": 0.6679787780249872, + "grad_norm": 22.524381637573242, + "learning_rate": 2.2230462065031374e-05, + "loss": 2.7118, + "step": 3903 + }, + { + "epoch": 0.668149922984768, + "grad_norm": 22.379953384399414, + "learning_rate": 2.2236166571591557e-05, + "loss": 2.0846, + "step": 3904 + }, + { + "epoch": 0.668321067944549, + "grad_norm": 4.688474655151367, + "learning_rate": 2.224187107815174e-05, + "loss": 0.5239, + "step": 3905 + }, + { + "epoch": 0.6684922129043299, + "grad_norm": 26.99576759338379, + "learning_rate": 2.2247575584711924e-05, + "loss": 3.4886, + "step": 3906 + }, + { + "epoch": 0.6686633578641109, + "grad_norm": 29.186248779296875, + "learning_rate": 2.2253280091272108e-05, + "loss": 3.6048, + "step": 3907 + }, + { + "epoch": 0.6688345028238918, + "grad_norm": 37.26026916503906, + "learning_rate": 
2.2258984597832288e-05, + "loss": 6.4346, + "step": 3908 + }, + { + "epoch": 0.6690056477836728, + "grad_norm": 15.596887588500977, + "learning_rate": 2.226468910439247e-05, + "loss": 1.5087, + "step": 3909 + }, + { + "epoch": 0.6691767927434537, + "grad_norm": 22.914793014526367, + "learning_rate": 2.2270393610952654e-05, + "loss": 2.4031, + "step": 3910 + }, + { + "epoch": 0.6693479377032346, + "grad_norm": 34.148956298828125, + "learning_rate": 2.2276098117512838e-05, + "loss": 3.4389, + "step": 3911 + }, + { + "epoch": 0.6695190826630155, + "grad_norm": 21.66793441772461, + "learning_rate": 2.2281802624073018e-05, + "loss": 2.514, + "step": 3912 + }, + { + "epoch": 0.6696902276227965, + "grad_norm": 10.826380729675293, + "learning_rate": 2.22875071306332e-05, + "loss": 0.8586, + "step": 3913 + }, + { + "epoch": 0.6698613725825775, + "grad_norm": 25.435211181640625, + "learning_rate": 2.2293211637193384e-05, + "loss": 2.6889, + "step": 3914 + }, + { + "epoch": 0.6700325175423584, + "grad_norm": 35.62110900878906, + "learning_rate": 2.2298916143753564e-05, + "loss": 6.5931, + "step": 3915 + }, + { + "epoch": 0.6702036625021394, + "grad_norm": 30.739681243896484, + "learning_rate": 2.2304620650313748e-05, + "loss": 2.809, + "step": 3916 + }, + { + "epoch": 0.6703748074619202, + "grad_norm": 3.0653045177459717, + "learning_rate": 2.231032515687393e-05, + "loss": 0.3283, + "step": 3917 + }, + { + "epoch": 0.6705459524217012, + "grad_norm": 29.558330535888672, + "learning_rate": 2.2316029663434115e-05, + "loss": 2.0042, + "step": 3918 + }, + { + "epoch": 0.6707170973814821, + "grad_norm": 23.827219009399414, + "learning_rate": 2.2321734169994295e-05, + "loss": 2.737, + "step": 3919 + }, + { + "epoch": 0.6708882423412631, + "grad_norm": 33.99700927734375, + "learning_rate": 2.2327438676554478e-05, + "loss": 6.5747, + "step": 3920 + }, + { + "epoch": 0.671059387301044, + "grad_norm": 27.55402374267578, + "learning_rate": 2.233314318311466e-05, + "loss": 2.6091, + "step": 3921 + }, + { + "epoch": 0.671230532260825, + "grad_norm": 31.95720672607422, + "learning_rate": 2.233884768967484e-05, + "loss": 3.6537, + "step": 3922 + }, + { + "epoch": 0.6714016772206058, + "grad_norm": 25.14667510986328, + "learning_rate": 2.2344552196235025e-05, + "loss": 2.6334, + "step": 3923 + }, + { + "epoch": 0.6715728221803868, + "grad_norm": 23.47039794921875, + "learning_rate": 2.2350256702795208e-05, + "loss": 2.1698, + "step": 3924 + }, + { + "epoch": 0.6717439671401677, + "grad_norm": 18.31406021118164, + "learning_rate": 2.235596120935539e-05, + "loss": 1.5905, + "step": 3925 + }, + { + "epoch": 0.6719151120999487, + "grad_norm": 23.610937118530273, + "learning_rate": 2.236166571591557e-05, + "loss": 2.4129, + "step": 3926 + }, + { + "epoch": 0.6720862570597296, + "grad_norm": 26.94730567932129, + "learning_rate": 2.2367370222475755e-05, + "loss": 2.8646, + "step": 3927 + }, + { + "epoch": 0.6722574020195106, + "grad_norm": 19.2611026763916, + "learning_rate": 2.237307472903594e-05, + "loss": 1.685, + "step": 3928 + }, + { + "epoch": 0.6724285469792914, + "grad_norm": 7.879249095916748, + "learning_rate": 2.237877923559612e-05, + "loss": 0.6331, + "step": 3929 + }, + { + "epoch": 0.6725996919390724, + "grad_norm": 24.44508171081543, + "learning_rate": 2.2384483742156305e-05, + "loss": 2.3878, + "step": 3930 + }, + { + "epoch": 0.6727708368988533, + "grad_norm": 20.18474769592285, + "learning_rate": 2.239018824871649e-05, + "loss": 1.8989, + "step": 3931 + }, + { + "epoch": 0.6729419818586343, + "grad_norm": 
5.985182762145996, + "learning_rate": 2.2395892755276672e-05, + "loss": 0.6958, + "step": 3932 + }, + { + "epoch": 0.6731131268184152, + "grad_norm": 17.770193099975586, + "learning_rate": 2.240159726183685e-05, + "loss": 1.5905, + "step": 3933 + }, + { + "epoch": 0.6732842717781962, + "grad_norm": 28.44164276123047, + "learning_rate": 2.2407301768397035e-05, + "loss": 2.9128, + "step": 3934 + }, + { + "epoch": 0.673455416737977, + "grad_norm": 27.433252334594727, + "learning_rate": 2.241300627495722e-05, + "loss": 2.608, + "step": 3935 + }, + { + "epoch": 0.673626561697758, + "grad_norm": 27.8862247467041, + "learning_rate": 2.24187107815174e-05, + "loss": 3.3689, + "step": 3936 + }, + { + "epoch": 0.6737977066575389, + "grad_norm": 1.7033040523529053, + "learning_rate": 2.2424415288077582e-05, + "loss": 0.3299, + "step": 3937 + }, + { + "epoch": 0.6739688516173199, + "grad_norm": 24.00095558166504, + "learning_rate": 2.2430119794637765e-05, + "loss": 2.1404, + "step": 3938 + }, + { + "epoch": 0.6741399965771008, + "grad_norm": 28.09699821472168, + "learning_rate": 2.243582430119795e-05, + "loss": 2.9765, + "step": 3939 + }, + { + "epoch": 0.6743111415368818, + "grad_norm": 33.0010871887207, + "learning_rate": 2.244152880775813e-05, + "loss": 6.8094, + "step": 3940 + }, + { + "epoch": 0.6744822864966626, + "grad_norm": 25.918590545654297, + "learning_rate": 2.2447233314318312e-05, + "loss": 2.3787, + "step": 3941 + }, + { + "epoch": 0.6746534314564436, + "grad_norm": 6.518866062164307, + "learning_rate": 2.2452937820878495e-05, + "loss": 0.501, + "step": 3942 + }, + { + "epoch": 0.6748245764162245, + "grad_norm": 22.836181640625, + "learning_rate": 2.2458642327438675e-05, + "loss": 1.9666, + "step": 3943 + }, + { + "epoch": 0.6749957213760055, + "grad_norm": 21.074495315551758, + "learning_rate": 2.246434683399886e-05, + "loss": 1.9682, + "step": 3944 + }, + { + "epoch": 0.6751668663357864, + "grad_norm": 24.013696670532227, + "learning_rate": 2.2470051340559042e-05, + "loss": 2.5082, + "step": 3945 + }, + { + "epoch": 0.6753380112955674, + "grad_norm": 19.56346893310547, + "learning_rate": 2.2475755847119222e-05, + "loss": 2.1205, + "step": 3946 + }, + { + "epoch": 0.6755091562553482, + "grad_norm": 22.354597091674805, + "learning_rate": 2.2481460353679405e-05, + "loss": 2.1432, + "step": 3947 + }, + { + "epoch": 0.6756803012151292, + "grad_norm": 20.16799545288086, + "learning_rate": 2.248716486023959e-05, + "loss": 1.7877, + "step": 3948 + }, + { + "epoch": 0.6758514461749101, + "grad_norm": 31.60150146484375, + "learning_rate": 2.2492869366799772e-05, + "loss": 3.0757, + "step": 3949 + }, + { + "epoch": 0.6760225911346911, + "grad_norm": 27.673959732055664, + "learning_rate": 2.2498573873359952e-05, + "loss": 3.013, + "step": 3950 + }, + { + "epoch": 0.676193736094472, + "grad_norm": 20.703968048095703, + "learning_rate": 2.250427837992014e-05, + "loss": 2.0428, + "step": 3951 + }, + { + "epoch": 0.676364881054253, + "grad_norm": 1.652134656906128, + "learning_rate": 2.2509982886480322e-05, + "loss": 0.2892, + "step": 3952 + }, + { + "epoch": 0.6765360260140338, + "grad_norm": 4.8036017417907715, + "learning_rate": 2.2515687393040502e-05, + "loss": 0.3591, + "step": 3953 + }, + { + "epoch": 0.6767071709738148, + "grad_norm": 1.2371207475662231, + "learning_rate": 2.2521391899600686e-05, + "loss": 0.2731, + "step": 3954 + }, + { + "epoch": 0.6768783159335957, + "grad_norm": 34.3635368347168, + "learning_rate": 2.252709640616087e-05, + "loss": 6.5111, + "step": 3955 + }, + { + "epoch": 
0.6770494608933767, + "grad_norm": 26.6789493560791, + "learning_rate": 2.2532800912721052e-05, + "loss": 2.1933, + "step": 3956 + }, + { + "epoch": 0.6772206058531576, + "grad_norm": 2.4849586486816406, + "learning_rate": 2.2538505419281232e-05, + "loss": 0.3021, + "step": 3957 + }, + { + "epoch": 0.6773917508129386, + "grad_norm": 17.35017204284668, + "learning_rate": 2.2544209925841416e-05, + "loss": 1.7005, + "step": 3958 + }, + { + "epoch": 0.6775628957727194, + "grad_norm": 36.90235137939453, + "learning_rate": 2.25499144324016e-05, + "loss": 3.5547, + "step": 3959 + }, + { + "epoch": 0.6777340407325004, + "grad_norm": 14.82332706451416, + "learning_rate": 2.255561893896178e-05, + "loss": 1.2344, + "step": 3960 + }, + { + "epoch": 0.6779051856922813, + "grad_norm": 16.59126853942871, + "learning_rate": 2.2561323445521963e-05, + "loss": 1.6819, + "step": 3961 + }, + { + "epoch": 0.6780763306520623, + "grad_norm": 35.49818420410156, + "learning_rate": 2.2567027952082146e-05, + "loss": 5.1844, + "step": 3962 + }, + { + "epoch": 0.6782474756118432, + "grad_norm": 18.354028701782227, + "learning_rate": 2.257273245864233e-05, + "loss": 1.5253, + "step": 3963 + }, + { + "epoch": 0.6784186205716242, + "grad_norm": 33.30420684814453, + "learning_rate": 2.257843696520251e-05, + "loss": 3.3261, + "step": 3964 + }, + { + "epoch": 0.6785897655314052, + "grad_norm": 1.1292017698287964, + "learning_rate": 2.2584141471762693e-05, + "loss": 0.2504, + "step": 3965 + }, + { + "epoch": 0.678760910491186, + "grad_norm": 11.09749984741211, + "learning_rate": 2.2589845978322876e-05, + "loss": 0.7005, + "step": 3966 + }, + { + "epoch": 0.678932055450967, + "grad_norm": 26.572961807250977, + "learning_rate": 2.2595550484883056e-05, + "loss": 2.5559, + "step": 3967 + }, + { + "epoch": 0.6791032004107479, + "grad_norm": 28.02602767944336, + "learning_rate": 2.260125499144324e-05, + "loss": 2.5011, + "step": 3968 + }, + { + "epoch": 0.6792743453705289, + "grad_norm": 6.116679668426514, + "learning_rate": 2.2606959498003423e-05, + "loss": 0.6422, + "step": 3969 + }, + { + "epoch": 0.6794454903303098, + "grad_norm": 49.190433502197266, + "learning_rate": 2.2612664004563606e-05, + "loss": 2.2484, + "step": 3970 + }, + { + "epoch": 0.6796166352900908, + "grad_norm": 27.249277114868164, + "learning_rate": 2.2618368511123786e-05, + "loss": 2.8075, + "step": 3971 + }, + { + "epoch": 0.6797877802498716, + "grad_norm": 9.899073600769043, + "learning_rate": 2.262407301768397e-05, + "loss": 0.7593, + "step": 3972 + }, + { + "epoch": 0.6799589252096526, + "grad_norm": 32.11344528198242, + "learning_rate": 2.2629777524244153e-05, + "loss": 3.4356, + "step": 3973 + }, + { + "epoch": 0.6801300701694335, + "grad_norm": 33.132877349853516, + "learning_rate": 2.2635482030804336e-05, + "loss": 6.1209, + "step": 3974 + }, + { + "epoch": 0.6803012151292145, + "grad_norm": 23.018150329589844, + "learning_rate": 2.264118653736452e-05, + "loss": 2.6845, + "step": 3975 + }, + { + "epoch": 0.6804723600889954, + "grad_norm": 19.093454360961914, + "learning_rate": 2.2646891043924703e-05, + "loss": 1.9479, + "step": 3976 + }, + { + "epoch": 0.6806435050487764, + "grad_norm": 16.804319381713867, + "learning_rate": 2.2652595550484883e-05, + "loss": 1.7933, + "step": 3977 + }, + { + "epoch": 0.6808146500085572, + "grad_norm": 14.481977462768555, + "learning_rate": 2.2658300057045066e-05, + "loss": 1.2585, + "step": 3978 + }, + { + "epoch": 0.6809857949683382, + "grad_norm": 30.040294647216797, + "learning_rate": 2.266400456360525e-05, + 
"loss": 3.3274, + "step": 3979 + }, + { + "epoch": 0.6811569399281191, + "grad_norm": 13.815556526184082, + "learning_rate": 2.2669709070165433e-05, + "loss": 1.3265, + "step": 3980 + }, + { + "epoch": 0.6813280848879001, + "grad_norm": 6.664211273193359, + "learning_rate": 2.2675413576725613e-05, + "loss": 0.673, + "step": 3981 + }, + { + "epoch": 0.681499229847681, + "grad_norm": 28.066905975341797, + "learning_rate": 2.2681118083285797e-05, + "loss": 3.1732, + "step": 3982 + }, + { + "epoch": 0.681670374807462, + "grad_norm": 3.707343101501465, + "learning_rate": 2.268682258984598e-05, + "loss": 0.6108, + "step": 3983 + }, + { + "epoch": 0.6818415197672428, + "grad_norm": 3.5951898097991943, + "learning_rate": 2.269252709640616e-05, + "loss": 0.3119, + "step": 3984 + }, + { + "epoch": 0.6820126647270238, + "grad_norm": 34.33369064331055, + "learning_rate": 2.2698231602966343e-05, + "loss": 3.5062, + "step": 3985 + }, + { + "epoch": 0.6821838096868047, + "grad_norm": 10.984424591064453, + "learning_rate": 2.2703936109526527e-05, + "loss": 1.2835, + "step": 3986 + }, + { + "epoch": 0.6823549546465857, + "grad_norm": 44.93880844116211, + "learning_rate": 2.270964061608671e-05, + "loss": 2.0757, + "step": 3987 + }, + { + "epoch": 0.6825260996063666, + "grad_norm": 25.658374786376953, + "learning_rate": 2.271534512264689e-05, + "loss": 2.4198, + "step": 3988 + }, + { + "epoch": 0.6826972445661476, + "grad_norm": 23.74066162109375, + "learning_rate": 2.2721049629207073e-05, + "loss": 2.3887, + "step": 3989 + }, + { + "epoch": 0.6828683895259284, + "grad_norm": 22.6767578125, + "learning_rate": 2.2726754135767257e-05, + "loss": 2.0854, + "step": 3990 + }, + { + "epoch": 0.6830395344857094, + "grad_norm": 13.210683822631836, + "learning_rate": 2.2732458642327437e-05, + "loss": 0.7998, + "step": 3991 + }, + { + "epoch": 0.6832106794454903, + "grad_norm": 17.68242073059082, + "learning_rate": 2.273816314888762e-05, + "loss": 1.6588, + "step": 3992 + }, + { + "epoch": 0.6833818244052713, + "grad_norm": 29.65629005432129, + "learning_rate": 2.2743867655447804e-05, + "loss": 3.4329, + "step": 3993 + }, + { + "epoch": 0.6835529693650522, + "grad_norm": 23.901870727539062, + "learning_rate": 2.2749572162007987e-05, + "loss": 2.0216, + "step": 3994 + }, + { + "epoch": 0.6837241143248332, + "grad_norm": 26.25312614440918, + "learning_rate": 2.2755276668568167e-05, + "loss": 2.3758, + "step": 3995 + }, + { + "epoch": 0.683895259284614, + "grad_norm": 21.86573028564453, + "learning_rate": 2.2760981175128354e-05, + "loss": 1.8268, + "step": 3996 + }, + { + "epoch": 0.684066404244395, + "grad_norm": 32.05353546142578, + "learning_rate": 2.2766685681688537e-05, + "loss": 2.6875, + "step": 3997 + }, + { + "epoch": 0.6842375492041759, + "grad_norm": 24.166894912719727, + "learning_rate": 2.2772390188248717e-05, + "loss": 2.9649, + "step": 3998 + }, + { + "epoch": 0.6844086941639569, + "grad_norm": 2.158492088317871, + "learning_rate": 2.27780946948089e-05, + "loss": 0.3136, + "step": 3999 + }, + { + "epoch": 0.6845798391237378, + "grad_norm": 3.6095380783081055, + "learning_rate": 2.2783799201369084e-05, + "loss": 0.4003, + "step": 4000 + }, + { + "epoch": 0.6847509840835188, + "grad_norm": 15.331098556518555, + "learning_rate": 2.2789503707929267e-05, + "loss": 1.3303, + "step": 4001 + }, + { + "epoch": 0.6849221290432996, + "grad_norm": 72.33516693115234, + "learning_rate": 2.2795208214489447e-05, + "loss": 2.7759, + "step": 4002 + }, + { + "epoch": 0.6850932740030806, + "grad_norm": 28.182132720947266, + 
"learning_rate": 2.280091272104963e-05, + "loss": 2.6851, + "step": 4003 + }, + { + "epoch": 0.6852644189628615, + "grad_norm": 28.051651000976562, + "learning_rate": 2.2806617227609814e-05, + "loss": 2.7711, + "step": 4004 + }, + { + "epoch": 0.6854355639226425, + "grad_norm": 30.069196701049805, + "learning_rate": 2.2812321734169994e-05, + "loss": 3.0094, + "step": 4005 + }, + { + "epoch": 0.6856067088824234, + "grad_norm": 22.24188232421875, + "learning_rate": 2.2818026240730177e-05, + "loss": 2.049, + "step": 4006 + }, + { + "epoch": 0.6857778538422044, + "grad_norm": 26.202171325683594, + "learning_rate": 2.282373074729036e-05, + "loss": 2.9979, + "step": 4007 + }, + { + "epoch": 0.6859489988019852, + "grad_norm": 30.185165405273438, + "learning_rate": 2.2829435253850544e-05, + "loss": 2.4561, + "step": 4008 + }, + { + "epoch": 0.6861201437617662, + "grad_norm": 38.99590301513672, + "learning_rate": 2.2835139760410724e-05, + "loss": 6.2926, + "step": 4009 + }, + { + "epoch": 0.6862912887215471, + "grad_norm": 22.699562072753906, + "learning_rate": 2.2840844266970907e-05, + "loss": 2.1231, + "step": 4010 + }, + { + "epoch": 0.6864624336813281, + "grad_norm": 16.00046730041504, + "learning_rate": 2.284654877353109e-05, + "loss": 1.4517, + "step": 4011 + }, + { + "epoch": 0.686633578641109, + "grad_norm": 2.1720409393310547, + "learning_rate": 2.285225328009127e-05, + "loss": 0.2857, + "step": 4012 + }, + { + "epoch": 0.68680472360089, + "grad_norm": 24.013540267944336, + "learning_rate": 2.2857957786651454e-05, + "loss": 2.0506, + "step": 4013 + }, + { + "epoch": 0.6869758685606709, + "grad_norm": 31.45075798034668, + "learning_rate": 2.2863662293211638e-05, + "loss": 3.625, + "step": 4014 + }, + { + "epoch": 0.6871470135204518, + "grad_norm": 13.581439018249512, + "learning_rate": 2.2869366799771818e-05, + "loss": 1.0461, + "step": 4015 + }, + { + "epoch": 0.6873181584802328, + "grad_norm": 4.323340892791748, + "learning_rate": 2.2875071306332e-05, + "loss": 0.4245, + "step": 4016 + }, + { + "epoch": 0.6874893034400137, + "grad_norm": 22.576906204223633, + "learning_rate": 2.2880775812892184e-05, + "loss": 1.9977, + "step": 4017 + }, + { + "epoch": 0.6876604483997947, + "grad_norm": 24.18263053894043, + "learning_rate": 2.2886480319452368e-05, + "loss": 2.257, + "step": 4018 + }, + { + "epoch": 0.6878315933595756, + "grad_norm": 31.072450637817383, + "learning_rate": 2.289218482601255e-05, + "loss": 4.4122, + "step": 4019 + }, + { + "epoch": 0.6880027383193565, + "grad_norm": 6.332727432250977, + "learning_rate": 2.2897889332572734e-05, + "loss": 0.5804, + "step": 4020 + }, + { + "epoch": 0.6881738832791374, + "grad_norm": 28.49344825744629, + "learning_rate": 2.2903593839132918e-05, + "loss": 3.0976, + "step": 4021 + }, + { + "epoch": 0.6883450282389184, + "grad_norm": 25.831926345825195, + "learning_rate": 2.2909298345693098e-05, + "loss": 2.437, + "step": 4022 + }, + { + "epoch": 0.6885161731986993, + "grad_norm": 1.773380994796753, + "learning_rate": 2.291500285225328e-05, + "loss": 0.316, + "step": 4023 + }, + { + "epoch": 0.6886873181584803, + "grad_norm": 28.22812843322754, + "learning_rate": 2.2920707358813465e-05, + "loss": 3.162, + "step": 4024 + }, + { + "epoch": 0.6888584631182612, + "grad_norm": 25.598621368408203, + "learning_rate": 2.2926411865373648e-05, + "loss": 2.4509, + "step": 4025 + }, + { + "epoch": 0.6890296080780421, + "grad_norm": 23.125686645507812, + "learning_rate": 2.2932116371933828e-05, + "loss": 2.4663, + "step": 4026 + }, + { + "epoch": 
0.689200753037823, + "grad_norm": 16.213899612426758, + "learning_rate": 2.293782087849401e-05, + "loss": 1.3174, + "step": 4027 + }, + { + "epoch": 0.689371897997604, + "grad_norm": 21.237468719482422, + "learning_rate": 2.2943525385054195e-05, + "loss": 1.7555, + "step": 4028 + }, + { + "epoch": 0.6895430429573849, + "grad_norm": 18.583372116088867, + "learning_rate": 2.2949229891614375e-05, + "loss": 2.1211, + "step": 4029 + }, + { + "epoch": 0.6897141879171659, + "grad_norm": 23.49361228942871, + "learning_rate": 2.2954934398174558e-05, + "loss": 2.3383, + "step": 4030 + }, + { + "epoch": 0.6898853328769468, + "grad_norm": 18.21615219116211, + "learning_rate": 2.296063890473474e-05, + "loss": 1.7035, + "step": 4031 + }, + { + "epoch": 0.6900564778367277, + "grad_norm": 21.006032943725586, + "learning_rate": 2.2966343411294925e-05, + "loss": 1.9274, + "step": 4032 + }, + { + "epoch": 0.6902276227965086, + "grad_norm": 33.84695816040039, + "learning_rate": 2.2972047917855105e-05, + "loss": 4.7633, + "step": 4033 + }, + { + "epoch": 0.6903987677562896, + "grad_norm": 23.903696060180664, + "learning_rate": 2.2977752424415288e-05, + "loss": 2.2789, + "step": 4034 + }, + { + "epoch": 0.6905699127160705, + "grad_norm": 32.980133056640625, + "learning_rate": 2.298345693097547e-05, + "loss": 4.0104, + "step": 4035 + }, + { + "epoch": 0.6907410576758515, + "grad_norm": 26.50374984741211, + "learning_rate": 2.298916143753565e-05, + "loss": 3.0929, + "step": 4036 + }, + { + "epoch": 0.6909122026356324, + "grad_norm": 18.61760902404785, + "learning_rate": 2.2994865944095835e-05, + "loss": 1.5559, + "step": 4037 + }, + { + "epoch": 0.6910833475954133, + "grad_norm": 21.44686508178711, + "learning_rate": 2.300057045065602e-05, + "loss": 1.9526, + "step": 4038 + }, + { + "epoch": 0.6912544925551942, + "grad_norm": 21.688053131103516, + "learning_rate": 2.3006274957216202e-05, + "loss": 2.2512, + "step": 4039 + }, + { + "epoch": 0.6914256375149752, + "grad_norm": 16.904464721679688, + "learning_rate": 2.3011979463776382e-05, + "loss": 1.5846, + "step": 4040 + }, + { + "epoch": 0.6915967824747561, + "grad_norm": 15.30504035949707, + "learning_rate": 2.3017683970336565e-05, + "loss": 1.4085, + "step": 4041 + }, + { + "epoch": 0.6917679274345371, + "grad_norm": 38.0904655456543, + "learning_rate": 2.3023388476896752e-05, + "loss": 6.407, + "step": 4042 + }, + { + "epoch": 0.691939072394318, + "grad_norm": 1.5367884635925293, + "learning_rate": 2.3029092983456932e-05, + "loss": 0.2607, + "step": 4043 + }, + { + "epoch": 0.6921102173540989, + "grad_norm": 6.76278018951416, + "learning_rate": 2.3034797490017115e-05, + "loss": 0.5019, + "step": 4044 + }, + { + "epoch": 0.6922813623138798, + "grad_norm": 28.894147872924805, + "learning_rate": 2.30405019965773e-05, + "loss": 3.1341, + "step": 4045 + }, + { + "epoch": 0.6924525072736608, + "grad_norm": 21.855031967163086, + "learning_rate": 2.304620650313748e-05, + "loss": 2.6851, + "step": 4046 + }, + { + "epoch": 0.6926236522334417, + "grad_norm": 21.59860610961914, + "learning_rate": 2.3051911009697662e-05, + "loss": 2.0748, + "step": 4047 + }, + { + "epoch": 0.6927947971932227, + "grad_norm": 82.68946838378906, + "learning_rate": 2.3057615516257845e-05, + "loss": 2.8348, + "step": 4048 + }, + { + "epoch": 0.6929659421530036, + "grad_norm": 47.771636962890625, + "learning_rate": 2.306332002281803e-05, + "loss": 6.3127, + "step": 4049 + }, + { + "epoch": 0.6931370871127845, + "grad_norm": 19.974872589111328, + "learning_rate": 2.306902452937821e-05, + "loss": 
2.0138, + "step": 4050 + }, + { + "epoch": 0.6933082320725654, + "grad_norm": 28.05762481689453, + "learning_rate": 2.3074729035938392e-05, + "loss": 2.804, + "step": 4051 + }, + { + "epoch": 0.6934793770323464, + "grad_norm": 26.722768783569336, + "learning_rate": 2.3080433542498575e-05, + "loss": 3.3621, + "step": 4052 + }, + { + "epoch": 0.6936505219921273, + "grad_norm": 10.087456703186035, + "learning_rate": 2.3086138049058755e-05, + "loss": 0.6185, + "step": 4053 + }, + { + "epoch": 0.6938216669519083, + "grad_norm": 19.535655975341797, + "learning_rate": 2.309184255561894e-05, + "loss": 2.3951, + "step": 4054 + }, + { + "epoch": 0.6939928119116892, + "grad_norm": 24.08242416381836, + "learning_rate": 2.3097547062179122e-05, + "loss": 2.5265, + "step": 4055 + }, + { + "epoch": 0.6941639568714701, + "grad_norm": 10.019787788391113, + "learning_rate": 2.3103251568739306e-05, + "loss": 1.679, + "step": 4056 + }, + { + "epoch": 0.694335101831251, + "grad_norm": 23.744293212890625, + "learning_rate": 2.3108956075299486e-05, + "loss": 2.3542, + "step": 4057 + }, + { + "epoch": 0.694506246791032, + "grad_norm": 4.5092244148254395, + "learning_rate": 2.311466058185967e-05, + "loss": 0.4777, + "step": 4058 + }, + { + "epoch": 0.6946773917508129, + "grad_norm": 17.50345802307129, + "learning_rate": 2.3120365088419852e-05, + "loss": 1.8737, + "step": 4059 + }, + { + "epoch": 0.6948485367105939, + "grad_norm": 23.234378814697266, + "learning_rate": 2.3126069594980032e-05, + "loss": 2.307, + "step": 4060 + }, + { + "epoch": 0.6950196816703748, + "grad_norm": 2.192140579223633, + "learning_rate": 2.3131774101540216e-05, + "loss": 0.2844, + "step": 4061 + }, + { + "epoch": 0.6951908266301557, + "grad_norm": 22.02082633972168, + "learning_rate": 2.31374786081004e-05, + "loss": 2.0206, + "step": 4062 + }, + { + "epoch": 0.6953619715899366, + "grad_norm": 18.239028930664062, + "learning_rate": 2.3143183114660582e-05, + "loss": 1.5416, + "step": 4063 + }, + { + "epoch": 0.6955331165497176, + "grad_norm": 8.209535598754883, + "learning_rate": 2.3148887621220762e-05, + "loss": 0.8478, + "step": 4064 + }, + { + "epoch": 0.6957042615094986, + "grad_norm": 38.67818832397461, + "learning_rate": 2.315459212778095e-05, + "loss": 2.1182, + "step": 4065 + }, + { + "epoch": 0.6958754064692795, + "grad_norm": 27.814809799194336, + "learning_rate": 2.3160296634341133e-05, + "loss": 3.3903, + "step": 4066 + }, + { + "epoch": 0.6960465514290605, + "grad_norm": 1.4533177614212036, + "learning_rate": 2.3166001140901313e-05, + "loss": 0.2869, + "step": 4067 + }, + { + "epoch": 0.6962176963888413, + "grad_norm": 10.602791786193848, + "learning_rate": 2.3171705647461496e-05, + "loss": 0.6294, + "step": 4068 + }, + { + "epoch": 0.6963888413486223, + "grad_norm": 20.941123962402344, + "learning_rate": 2.317741015402168e-05, + "loss": 2.1678, + "step": 4069 + }, + { + "epoch": 0.6965599863084032, + "grad_norm": 26.170923233032227, + "learning_rate": 2.3183114660581863e-05, + "loss": 2.4743, + "step": 4070 + }, + { + "epoch": 0.6967311312681842, + "grad_norm": 24.81916618347168, + "learning_rate": 2.3188819167142043e-05, + "loss": 2.6961, + "step": 4071 + }, + { + "epoch": 0.6969022762279651, + "grad_norm": 21.444581985473633, + "learning_rate": 2.3194523673702226e-05, + "loss": 2.297, + "step": 4072 + }, + { + "epoch": 0.6970734211877461, + "grad_norm": 24.241352081298828, + "learning_rate": 2.320022818026241e-05, + "loss": 2.47, + "step": 4073 + }, + { + "epoch": 0.697244566147527, + "grad_norm": 19.334854125976562, + 
"learning_rate": 2.320593268682259e-05, + "loss": 1.8912, + "step": 4074 + }, + { + "epoch": 0.6974157111073079, + "grad_norm": 1.0382440090179443, + "learning_rate": 2.3211637193382773e-05, + "loss": 0.2496, + "step": 4075 + }, + { + "epoch": 0.6975868560670888, + "grad_norm": 9.449311256408691, + "learning_rate": 2.3217341699942956e-05, + "loss": 0.7254, + "step": 4076 + }, + { + "epoch": 0.6977580010268698, + "grad_norm": 73.3146743774414, + "learning_rate": 2.3223046206503136e-05, + "loss": 7.7671, + "step": 4077 + }, + { + "epoch": 0.6979291459866507, + "grad_norm": 1.6645268201828003, + "learning_rate": 2.322875071306332e-05, + "loss": 0.2892, + "step": 4078 + }, + { + "epoch": 0.6981002909464317, + "grad_norm": 4.770233631134033, + "learning_rate": 2.3234455219623503e-05, + "loss": 0.4124, + "step": 4079 + }, + { + "epoch": 0.6982714359062125, + "grad_norm": 28.565988540649414, + "learning_rate": 2.3240159726183686e-05, + "loss": 1.4992, + "step": 4080 + }, + { + "epoch": 0.6984425808659935, + "grad_norm": 12.26234245300293, + "learning_rate": 2.3245864232743866e-05, + "loss": 0.8612, + "step": 4081 + }, + { + "epoch": 0.6986137258257744, + "grad_norm": 25.978046417236328, + "learning_rate": 2.325156873930405e-05, + "loss": 2.1483, + "step": 4082 + }, + { + "epoch": 0.6987848707855554, + "grad_norm": 19.333969116210938, + "learning_rate": 2.3257273245864233e-05, + "loss": 2.0506, + "step": 4083 + }, + { + "epoch": 0.6989560157453363, + "grad_norm": 28.06734275817871, + "learning_rate": 2.3262977752424413e-05, + "loss": 2.6222, + "step": 4084 + }, + { + "epoch": 0.6991271607051173, + "grad_norm": 9.045817375183105, + "learning_rate": 2.3268682258984596e-05, + "loss": 0.8083, + "step": 4085 + }, + { + "epoch": 0.6992983056648981, + "grad_norm": 6.522378444671631, + "learning_rate": 2.327438676554478e-05, + "loss": 0.5991, + "step": 4086 + }, + { + "epoch": 0.6994694506246791, + "grad_norm": 24.520263671875, + "learning_rate": 2.3280091272104963e-05, + "loss": 1.9785, + "step": 4087 + }, + { + "epoch": 0.69964059558446, + "grad_norm": 25.192462921142578, + "learning_rate": 2.3285795778665147e-05, + "loss": 1.9242, + "step": 4088 + }, + { + "epoch": 0.699811740544241, + "grad_norm": 21.36008071899414, + "learning_rate": 2.329150028522533e-05, + "loss": 2.2738, + "step": 4089 + }, + { + "epoch": 0.6999828855040219, + "grad_norm": 15.994437217712402, + "learning_rate": 2.3297204791785513e-05, + "loss": 1.5939, + "step": 4090 + }, + { + "epoch": 0.7001540304638029, + "grad_norm": 13.80662727355957, + "learning_rate": 2.3302909298345693e-05, + "loss": 0.8196, + "step": 4091 + }, + { + "epoch": 0.7003251754235837, + "grad_norm": 1.906544804573059, + "learning_rate": 2.3308613804905877e-05, + "loss": 0.2608, + "step": 4092 + }, + { + "epoch": 0.7004963203833647, + "grad_norm": 6.288933753967285, + "learning_rate": 2.331431831146606e-05, + "loss": 0.8815, + "step": 4093 + }, + { + "epoch": 0.7006674653431456, + "grad_norm": 25.848539352416992, + "learning_rate": 2.3320022818026244e-05, + "loss": 3.1319, + "step": 4094 + }, + { + "epoch": 0.7008386103029266, + "grad_norm": 2.6723341941833496, + "learning_rate": 2.3325727324586424e-05, + "loss": 0.3466, + "step": 4095 + }, + { + "epoch": 0.7010097552627075, + "grad_norm": 34.171104431152344, + "learning_rate": 2.3331431831146607e-05, + "loss": 6.0408, + "step": 4096 + }, + { + "epoch": 0.7011809002224885, + "grad_norm": 6.2798662185668945, + "learning_rate": 2.333713633770679e-05, + "loss": 0.5715, + "step": 4097 + }, + { + "epoch": 
0.7013520451822693, + "grad_norm": 21.24723243713379, + "learning_rate": 2.334284084426697e-05, + "loss": 1.801, + "step": 4098 + }, + { + "epoch": 0.7015231901420503, + "grad_norm": 2.3047332763671875, + "learning_rate": 2.3348545350827154e-05, + "loss": 0.2806, + "step": 4099 + }, + { + "epoch": 0.7016943351018312, + "grad_norm": 35.639129638671875, + "learning_rate": 2.3354249857387337e-05, + "loss": 6.2857, + "step": 4100 + }, + { + "epoch": 0.7018654800616122, + "grad_norm": 22.578310012817383, + "learning_rate": 2.335995436394752e-05, + "loss": 2.3757, + "step": 4101 + }, + { + "epoch": 0.7020366250213931, + "grad_norm": 27.650184631347656, + "learning_rate": 2.33656588705077e-05, + "loss": 2.7948, + "step": 4102 + }, + { + "epoch": 0.7022077699811741, + "grad_norm": 22.47934913635254, + "learning_rate": 2.3371363377067884e-05, + "loss": 2.9155, + "step": 4103 + }, + { + "epoch": 0.702378914940955, + "grad_norm": 117.08856201171875, + "learning_rate": 2.3377067883628067e-05, + "loss": 8.9408, + "step": 4104 + }, + { + "epoch": 0.7025500599007359, + "grad_norm": 6.346577167510986, + "learning_rate": 2.3382772390188247e-05, + "loss": 0.4762, + "step": 4105 + }, + { + "epoch": 0.7027212048605168, + "grad_norm": 24.838397979736328, + "learning_rate": 2.338847689674843e-05, + "loss": 2.0305, + "step": 4106 + }, + { + "epoch": 0.7028923498202978, + "grad_norm": 5.3628716468811035, + "learning_rate": 2.3394181403308614e-05, + "loss": 0.4928, + "step": 4107 + }, + { + "epoch": 0.7030634947800787, + "grad_norm": 27.933374404907227, + "learning_rate": 2.3399885909868797e-05, + "loss": 2.7148, + "step": 4108 + }, + { + "epoch": 0.7032346397398597, + "grad_norm": 61.49900436401367, + "learning_rate": 2.3405590416428977e-05, + "loss": 2.22, + "step": 4109 + }, + { + "epoch": 0.7034057846996405, + "grad_norm": 35.6771354675293, + "learning_rate": 2.341129492298916e-05, + "loss": 2.7082, + "step": 4110 + }, + { + "epoch": 0.7035769296594215, + "grad_norm": 6.308041095733643, + "learning_rate": 2.3416999429549347e-05, + "loss": 0.5861, + "step": 4111 + }, + { + "epoch": 0.7037480746192024, + "grad_norm": 18.36146354675293, + "learning_rate": 2.3422703936109527e-05, + "loss": 1.5818, + "step": 4112 + }, + { + "epoch": 0.7039192195789834, + "grad_norm": 26.6254940032959, + "learning_rate": 2.342840844266971e-05, + "loss": 2.5972, + "step": 4113 + }, + { + "epoch": 0.7040903645387643, + "grad_norm": 41.90875244140625, + "learning_rate": 2.3434112949229894e-05, + "loss": 7.2004, + "step": 4114 + }, + { + "epoch": 0.7042615094985453, + "grad_norm": 26.454225540161133, + "learning_rate": 2.3439817455790074e-05, + "loss": 3.0425, + "step": 4115 + }, + { + "epoch": 0.7044326544583263, + "grad_norm": 67.04540252685547, + "learning_rate": 2.3445521962350258e-05, + "loss": 2.5417, + "step": 4116 + }, + { + "epoch": 0.7046037994181071, + "grad_norm": 29.956275939941406, + "learning_rate": 2.345122646891044e-05, + "loss": 3.9422, + "step": 4117 + }, + { + "epoch": 0.7047749443778881, + "grad_norm": 28.678985595703125, + "learning_rate": 2.3456930975470624e-05, + "loss": 2.8701, + "step": 4118 + }, + { + "epoch": 0.704946089337669, + "grad_norm": 21.728593826293945, + "learning_rate": 2.3462635482030804e-05, + "loss": 1.9539, + "step": 4119 + }, + { + "epoch": 0.70511723429745, + "grad_norm": 21.133193969726562, + "learning_rate": 2.3468339988590988e-05, + "loss": 2.0111, + "step": 4120 + }, + { + "epoch": 0.7052883792572309, + "grad_norm": 57.13780975341797, + "learning_rate": 2.347404449515117e-05, + "loss": 
2.4365, + "step": 4121 + }, + { + "epoch": 0.7054595242170119, + "grad_norm": 1.5548804998397827, + "learning_rate": 2.347974900171135e-05, + "loss": 0.2656, + "step": 4122 + }, + { + "epoch": 0.7056306691767927, + "grad_norm": 16.94969940185547, + "learning_rate": 2.3485453508271534e-05, + "loss": 1.5448, + "step": 4123 + }, + { + "epoch": 0.7058018141365737, + "grad_norm": 29.882259368896484, + "learning_rate": 2.3491158014831718e-05, + "loss": 1.7722, + "step": 4124 + }, + { + "epoch": 0.7059729590963546, + "grad_norm": 13.851731300354004, + "learning_rate": 2.34968625213919e-05, + "loss": 1.2664, + "step": 4125 + }, + { + "epoch": 0.7061441040561356, + "grad_norm": 19.379756927490234, + "learning_rate": 2.350256702795208e-05, + "loss": 1.5466, + "step": 4126 + }, + { + "epoch": 0.7063152490159165, + "grad_norm": 20.181297302246094, + "learning_rate": 2.3508271534512265e-05, + "loss": 2.0799, + "step": 4127 + }, + { + "epoch": 0.7064863939756975, + "grad_norm": 18.206491470336914, + "learning_rate": 2.3513976041072448e-05, + "loss": 1.7423, + "step": 4128 + }, + { + "epoch": 0.7066575389354783, + "grad_norm": 39.42982482910156, + "learning_rate": 2.3519680547632628e-05, + "loss": 6.003, + "step": 4129 + }, + { + "epoch": 0.7068286838952593, + "grad_norm": 10.509867668151855, + "learning_rate": 2.352538505419281e-05, + "loss": 0.7636, + "step": 4130 + }, + { + "epoch": 0.7069998288550402, + "grad_norm": 2.2939412593841553, + "learning_rate": 2.3531089560752995e-05, + "loss": 0.2719, + "step": 4131 + }, + { + "epoch": 0.7071709738148212, + "grad_norm": 25.9577579498291, + "learning_rate": 2.3536794067313178e-05, + "loss": 2.4994, + "step": 4132 + }, + { + "epoch": 0.7073421187746021, + "grad_norm": 72.57787322998047, + "learning_rate": 2.3542498573873358e-05, + "loss": 7.4552, + "step": 4133 + }, + { + "epoch": 0.7075132637343831, + "grad_norm": 138.49476623535156, + "learning_rate": 2.3548203080433545e-05, + "loss": 8.7384, + "step": 4134 + }, + { + "epoch": 0.7076844086941639, + "grad_norm": 18.31711769104004, + "learning_rate": 2.3553907586993728e-05, + "loss": 1.8353, + "step": 4135 + }, + { + "epoch": 0.7078555536539449, + "grad_norm": 23.117900848388672, + "learning_rate": 2.3559612093553908e-05, + "loss": 2.2019, + "step": 4136 + }, + { + "epoch": 0.7080266986137258, + "grad_norm": 29.79839515686035, + "learning_rate": 2.356531660011409e-05, + "loss": 3.6913, + "step": 4137 + }, + { + "epoch": 0.7081978435735068, + "grad_norm": 6.226611614227295, + "learning_rate": 2.3571021106674275e-05, + "loss": 0.3817, + "step": 4138 + }, + { + "epoch": 0.7083689885332877, + "grad_norm": 7.866857051849365, + "learning_rate": 2.3576725613234458e-05, + "loss": 0.7468, + "step": 4139 + }, + { + "epoch": 0.7085401334930687, + "grad_norm": 13.393908500671387, + "learning_rate": 2.3582430119794638e-05, + "loss": 0.9038, + "step": 4140 + }, + { + "epoch": 0.7087112784528495, + "grad_norm": 4.077215194702148, + "learning_rate": 2.358813462635482e-05, + "loss": 0.3156, + "step": 4141 + }, + { + "epoch": 0.7088824234126305, + "grad_norm": 35.0775146484375, + "learning_rate": 2.3593839132915005e-05, + "loss": 6.4035, + "step": 4142 + }, + { + "epoch": 0.7090535683724114, + "grad_norm": 27.827789306640625, + "learning_rate": 2.3599543639475185e-05, + "loss": 2.6875, + "step": 4143 + }, + { + "epoch": 0.7092247133321924, + "grad_norm": 7.222084045410156, + "learning_rate": 2.360524814603537e-05, + "loss": 0.6337, + "step": 4144 + }, + { + "epoch": 0.7093958582919733, + "grad_norm": 16.112009048461914, + 
"learning_rate": 2.3610952652595552e-05, + "loss": 1.5604, + "step": 4145 + }, + { + "epoch": 0.7095670032517543, + "grad_norm": 33.34373092651367, + "learning_rate": 2.3616657159155732e-05, + "loss": 6.1852, + "step": 4146 + }, + { + "epoch": 0.7097381482115351, + "grad_norm": 23.43242835998535, + "learning_rate": 2.3622361665715915e-05, + "loss": 2.2633, + "step": 4147 + }, + { + "epoch": 0.7099092931713161, + "grad_norm": 23.43446159362793, + "learning_rate": 2.36280661722761e-05, + "loss": 2.2967, + "step": 4148 + }, + { + "epoch": 0.710080438131097, + "grad_norm": 9.296281814575195, + "learning_rate": 2.3633770678836282e-05, + "loss": 0.9115, + "step": 4149 + }, + { + "epoch": 0.710251583090878, + "grad_norm": 23.257322311401367, + "learning_rate": 2.3639475185396462e-05, + "loss": 2.2038, + "step": 4150 + }, + { + "epoch": 0.7104227280506589, + "grad_norm": 30.016109466552734, + "learning_rate": 2.3645179691956645e-05, + "loss": 3.0839, + "step": 4151 + }, + { + "epoch": 0.7105938730104399, + "grad_norm": 25.682640075683594, + "learning_rate": 2.365088419851683e-05, + "loss": 2.8843, + "step": 4152 + }, + { + "epoch": 0.7107650179702207, + "grad_norm": 23.85866355895996, + "learning_rate": 2.365658870507701e-05, + "loss": 1.9993, + "step": 4153 + }, + { + "epoch": 0.7109361629300017, + "grad_norm": 4.204997539520264, + "learning_rate": 2.3662293211637192e-05, + "loss": 0.5192, + "step": 4154 + }, + { + "epoch": 0.7111073078897826, + "grad_norm": 24.741037368774414, + "learning_rate": 2.3667997718197375e-05, + "loss": 2.4707, + "step": 4155 + }, + { + "epoch": 0.7112784528495636, + "grad_norm": 5.143214225769043, + "learning_rate": 2.367370222475756e-05, + "loss": 0.3212, + "step": 4156 + }, + { + "epoch": 0.7114495978093445, + "grad_norm": 30.81825828552246, + "learning_rate": 2.3679406731317742e-05, + "loss": 2.3157, + "step": 4157 + }, + { + "epoch": 0.7116207427691255, + "grad_norm": 25.597097396850586, + "learning_rate": 2.3685111237877926e-05, + "loss": 2.8225, + "step": 4158 + }, + { + "epoch": 0.7117918877289063, + "grad_norm": 12.126123428344727, + "learning_rate": 2.369081574443811e-05, + "loss": 0.6904, + "step": 4159 + }, + { + "epoch": 0.7119630326886873, + "grad_norm": 17.513898849487305, + "learning_rate": 2.369652025099829e-05, + "loss": 1.9711, + "step": 4160 + }, + { + "epoch": 0.7121341776484682, + "grad_norm": 24.142879486083984, + "learning_rate": 2.3702224757558472e-05, + "loss": 2.5014, + "step": 4161 + }, + { + "epoch": 0.7123053226082492, + "grad_norm": 26.637598037719727, + "learning_rate": 2.3707929264118656e-05, + "loss": 2.5949, + "step": 4162 + }, + { + "epoch": 0.7124764675680301, + "grad_norm": 22.141407012939453, + "learning_rate": 2.371363377067884e-05, + "loss": 2.4929, + "step": 4163 + }, + { + "epoch": 0.7126476125278111, + "grad_norm": 26.212926864624023, + "learning_rate": 2.371933827723902e-05, + "loss": 2.6947, + "step": 4164 + }, + { + "epoch": 0.7128187574875919, + "grad_norm": 18.995336532592773, + "learning_rate": 2.3725042783799202e-05, + "loss": 1.6993, + "step": 4165 + }, + { + "epoch": 0.7129899024473729, + "grad_norm": 27.700637817382812, + "learning_rate": 2.3730747290359386e-05, + "loss": 3.3321, + "step": 4166 + }, + { + "epoch": 0.7131610474071539, + "grad_norm": 5.5868754386901855, + "learning_rate": 2.3736451796919566e-05, + "loss": 0.5355, + "step": 4167 + }, + { + "epoch": 0.7133321923669348, + "grad_norm": 15.529837608337402, + "learning_rate": 2.374215630347975e-05, + "loss": 1.5037, + "step": 4168 + }, + { + "epoch": 
0.7135033373267158, + "grad_norm": 21.299524307250977, + "learning_rate": 2.3747860810039933e-05, + "loss": 1.9929, + "step": 4169 + }, + { + "epoch": 0.7136744822864967, + "grad_norm": 45.21254348754883, + "learning_rate": 2.3753565316600116e-05, + "loss": 6.7553, + "step": 4170 + }, + { + "epoch": 0.7138456272462776, + "grad_norm": 26.380695343017578, + "learning_rate": 2.3759269823160296e-05, + "loss": 2.6056, + "step": 4171 + }, + { + "epoch": 0.7140167722060585, + "grad_norm": 3.5742998123168945, + "learning_rate": 2.376497432972048e-05, + "loss": 0.4821, + "step": 4172 + }, + { + "epoch": 0.7141879171658395, + "grad_norm": 31.626493453979492, + "learning_rate": 2.3770678836280663e-05, + "loss": 3.368, + "step": 4173 + }, + { + "epoch": 0.7143590621256204, + "grad_norm": 21.61958885192871, + "learning_rate": 2.3776383342840843e-05, + "loss": 1.709, + "step": 4174 + }, + { + "epoch": 0.7145302070854014, + "grad_norm": 6.5677809715271, + "learning_rate": 2.3782087849401026e-05, + "loss": 0.5463, + "step": 4175 + }, + { + "epoch": 0.7147013520451823, + "grad_norm": 151.24124145507812, + "learning_rate": 2.378779235596121e-05, + "loss": 8.6429, + "step": 4176 + }, + { + "epoch": 0.7148724970049632, + "grad_norm": 34.68931579589844, + "learning_rate": 2.3793496862521393e-05, + "loss": 6.1887, + "step": 4177 + }, + { + "epoch": 0.7150436419647441, + "grad_norm": 18.861997604370117, + "learning_rate": 2.3799201369081573e-05, + "loss": 1.8546, + "step": 4178 + }, + { + "epoch": 0.7152147869245251, + "grad_norm": 19.337419509887695, + "learning_rate": 2.380490587564176e-05, + "loss": 1.8454, + "step": 4179 + }, + { + "epoch": 0.715385931884306, + "grad_norm": 1.8883466720581055, + "learning_rate": 2.3810610382201943e-05, + "loss": 0.289, + "step": 4180 + }, + { + "epoch": 0.715557076844087, + "grad_norm": 110.51686096191406, + "learning_rate": 2.3816314888762123e-05, + "loss": 7.8896, + "step": 4181 + }, + { + "epoch": 0.7157282218038679, + "grad_norm": 69.93323516845703, + "learning_rate": 2.3822019395322306e-05, + "loss": 2.6675, + "step": 4182 + }, + { + "epoch": 0.7158993667636488, + "grad_norm": 23.35276222229004, + "learning_rate": 2.382772390188249e-05, + "loss": 2.0773, + "step": 4183 + }, + { + "epoch": 0.7160705117234297, + "grad_norm": 20.778461456298828, + "learning_rate": 2.383342840844267e-05, + "loss": 2.2745, + "step": 4184 + }, + { + "epoch": 0.7162416566832107, + "grad_norm": 13.58486557006836, + "learning_rate": 2.3839132915002853e-05, + "loss": 1.2723, + "step": 4185 + }, + { + "epoch": 0.7164128016429916, + "grad_norm": 6.069742202758789, + "learning_rate": 2.3844837421563036e-05, + "loss": 0.865, + "step": 4186 + }, + { + "epoch": 0.7165839466027726, + "grad_norm": 21.17997169494629, + "learning_rate": 2.385054192812322e-05, + "loss": 1.8901, + "step": 4187 + }, + { + "epoch": 0.7167550915625535, + "grad_norm": 24.12006187438965, + "learning_rate": 2.38562464346834e-05, + "loss": 2.7853, + "step": 4188 + }, + { + "epoch": 0.7169262365223344, + "grad_norm": 13.66297721862793, + "learning_rate": 2.3861950941243583e-05, + "loss": 0.9545, + "step": 4189 + }, + { + "epoch": 0.7170973814821153, + "grad_norm": 51.27836990356445, + "learning_rate": 2.3867655447803767e-05, + "loss": 1.9518, + "step": 4190 + }, + { + "epoch": 0.7172685264418963, + "grad_norm": 7.221101760864258, + "learning_rate": 2.3873359954363947e-05, + "loss": 0.5895, + "step": 4191 + }, + { + "epoch": 0.7174396714016772, + "grad_norm": 91.32466125488281, + "learning_rate": 2.387906446092413e-05, + "loss": 
7.6073, + "step": 4192 + }, + { + "epoch": 0.7176108163614582, + "grad_norm": 19.606168746948242, + "learning_rate": 2.3884768967484313e-05, + "loss": 1.8311, + "step": 4193 + }, + { + "epoch": 0.7177819613212391, + "grad_norm": 8.86071491241455, + "learning_rate": 2.3890473474044497e-05, + "loss": 0.8949, + "step": 4194 + }, + { + "epoch": 0.71795310628102, + "grad_norm": 21.827577590942383, + "learning_rate": 2.3896177980604677e-05, + "loss": 2.0555, + "step": 4195 + }, + { + "epoch": 0.7181242512408009, + "grad_norm": 24.964656829833984, + "learning_rate": 2.390188248716486e-05, + "loss": 3.0599, + "step": 4196 + }, + { + "epoch": 0.7182953962005819, + "grad_norm": 1.9594327211380005, + "learning_rate": 2.3907586993725043e-05, + "loss": 0.2823, + "step": 4197 + }, + { + "epoch": 0.7184665411603628, + "grad_norm": 18.58384132385254, + "learning_rate": 2.3913291500285223e-05, + "loss": 1.8147, + "step": 4198 + }, + { + "epoch": 0.7186376861201438, + "grad_norm": 22.12485122680664, + "learning_rate": 2.3918996006845407e-05, + "loss": 2.4456, + "step": 4199 + }, + { + "epoch": 0.7188088310799247, + "grad_norm": 20.110654830932617, + "learning_rate": 2.392470051340559e-05, + "loss": 1.9338, + "step": 4200 + }, + { + "epoch": 0.7189799760397056, + "grad_norm": 25.46828842163086, + "learning_rate": 2.3930405019965774e-05, + "loss": 1.6106, + "step": 4201 + }, + { + "epoch": 0.7191511209994865, + "grad_norm": 10.579689979553223, + "learning_rate": 2.3936109526525957e-05, + "loss": 0.6947, + "step": 4202 + }, + { + "epoch": 0.7193222659592675, + "grad_norm": 50.630008697509766, + "learning_rate": 2.394181403308614e-05, + "loss": 1.6672, + "step": 4203 + }, + { + "epoch": 0.7194934109190484, + "grad_norm": 22.716222763061523, + "learning_rate": 2.3947518539646324e-05, + "loss": 2.0138, + "step": 4204 + }, + { + "epoch": 0.7196645558788294, + "grad_norm": 16.84745216369629, + "learning_rate": 2.3953223046206504e-05, + "loss": 1.0839, + "step": 4205 + }, + { + "epoch": 0.7198357008386103, + "grad_norm": 1.9779396057128906, + "learning_rate": 2.3958927552766687e-05, + "loss": 0.2798, + "step": 4206 + }, + { + "epoch": 0.7200068457983912, + "grad_norm": 30.206113815307617, + "learning_rate": 2.396463205932687e-05, + "loss": 2.7617, + "step": 4207 + }, + { + "epoch": 0.7201779907581721, + "grad_norm": 14.300946235656738, + "learning_rate": 2.3970336565887054e-05, + "loss": 1.4846, + "step": 4208 + }, + { + "epoch": 0.7203491357179531, + "grad_norm": 14.153777122497559, + "learning_rate": 2.3976041072447234e-05, + "loss": 0.9053, + "step": 4209 + }, + { + "epoch": 0.720520280677734, + "grad_norm": 30.488285064697266, + "learning_rate": 2.3981745579007417e-05, + "loss": 3.6317, + "step": 4210 + }, + { + "epoch": 0.720691425637515, + "grad_norm": 40.644447326660156, + "learning_rate": 2.39874500855676e-05, + "loss": 6.598, + "step": 4211 + }, + { + "epoch": 0.7208625705972959, + "grad_norm": 18.58864402770996, + "learning_rate": 2.399315459212778e-05, + "loss": 1.7715, + "step": 4212 + }, + { + "epoch": 0.7210337155570768, + "grad_norm": 25.57322883605957, + "learning_rate": 2.3998859098687964e-05, + "loss": 2.5285, + "step": 4213 + }, + { + "epoch": 0.7212048605168577, + "grad_norm": 21.300174713134766, + "learning_rate": 2.4004563605248147e-05, + "loss": 2.5907, + "step": 4214 + }, + { + "epoch": 0.7213760054766387, + "grad_norm": 15.046341896057129, + "learning_rate": 2.4010268111808327e-05, + "loss": 1.2937, + "step": 4215 + }, + { + "epoch": 0.7215471504364196, + "grad_norm": 23.890518188476562, + 
"learning_rate": 2.401597261836851e-05, + "loss": 2.1427, + "step": 4216 + }, + { + "epoch": 0.7217182953962006, + "grad_norm": 12.539481163024902, + "learning_rate": 2.4021677124928694e-05, + "loss": 0.8432, + "step": 4217 + }, + { + "epoch": 0.7218894403559816, + "grad_norm": 21.108722686767578, + "learning_rate": 2.4027381631488877e-05, + "loss": 2.3811, + "step": 4218 + }, + { + "epoch": 0.7220605853157624, + "grad_norm": 18.81709861755371, + "learning_rate": 2.4033086138049057e-05, + "loss": 1.6914, + "step": 4219 + }, + { + "epoch": 0.7222317302755434, + "grad_norm": 20.49457359313965, + "learning_rate": 2.403879064460924e-05, + "loss": 1.9627, + "step": 4220 + }, + { + "epoch": 0.7224028752353243, + "grad_norm": 9.025213241577148, + "learning_rate": 2.4044495151169424e-05, + "loss": 0.6996, + "step": 4221 + }, + { + "epoch": 0.7225740201951053, + "grad_norm": 2.5068697929382324, + "learning_rate": 2.4050199657729604e-05, + "loss": 0.3039, + "step": 4222 + }, + { + "epoch": 0.7227451651548862, + "grad_norm": 9.711918830871582, + "learning_rate": 2.4055904164289788e-05, + "loss": 1.3666, + "step": 4223 + }, + { + "epoch": 0.7229163101146672, + "grad_norm": 17.298583984375, + "learning_rate": 2.406160867084997e-05, + "loss": 1.5982, + "step": 4224 + }, + { + "epoch": 0.723087455074448, + "grad_norm": 8.982203483581543, + "learning_rate": 2.4067313177410158e-05, + "loss": 0.6486, + "step": 4225 + }, + { + "epoch": 0.723258600034229, + "grad_norm": 24.49521827697754, + "learning_rate": 2.4073017683970338e-05, + "loss": 3.2304, + "step": 4226 + }, + { + "epoch": 0.7234297449940099, + "grad_norm": 21.123031616210938, + "learning_rate": 2.407872219053052e-05, + "loss": 1.8011, + "step": 4227 + }, + { + "epoch": 0.7236008899537909, + "grad_norm": 9.760648727416992, + "learning_rate": 2.4084426697090704e-05, + "loss": 0.6659, + "step": 4228 + }, + { + "epoch": 0.7237720349135718, + "grad_norm": 8.795894622802734, + "learning_rate": 2.4090131203650884e-05, + "loss": 0.8128, + "step": 4229 + }, + { + "epoch": 0.7239431798733528, + "grad_norm": 5.41191291809082, + "learning_rate": 2.4095835710211068e-05, + "loss": 0.4972, + "step": 4230 + }, + { + "epoch": 0.7241143248331336, + "grad_norm": 6.078951358795166, + "learning_rate": 2.410154021677125e-05, + "loss": 0.6283, + "step": 4231 + }, + { + "epoch": 0.7242854697929146, + "grad_norm": 9.235028266906738, + "learning_rate": 2.4107244723331435e-05, + "loss": 1.1542, + "step": 4232 + }, + { + "epoch": 0.7244566147526955, + "grad_norm": 18.9094295501709, + "learning_rate": 2.4112949229891615e-05, + "loss": 1.3619, + "step": 4233 + }, + { + "epoch": 0.7246277597124765, + "grad_norm": 8.397602081298828, + "learning_rate": 2.4118653736451798e-05, + "loss": 0.5137, + "step": 4234 + }, + { + "epoch": 0.7247989046722574, + "grad_norm": 35.325618743896484, + "learning_rate": 2.412435824301198e-05, + "loss": 5.9278, + "step": 4235 + }, + { + "epoch": 0.7249700496320384, + "grad_norm": 7.471834182739258, + "learning_rate": 2.413006274957216e-05, + "loss": 0.5769, + "step": 4236 + }, + { + "epoch": 0.7251411945918192, + "grad_norm": 10.854155540466309, + "learning_rate": 2.4135767256132345e-05, + "loss": 0.8743, + "step": 4237 + }, + { + "epoch": 0.7253123395516002, + "grad_norm": 26.70799446105957, + "learning_rate": 2.4141471762692528e-05, + "loss": 2.6374, + "step": 4238 + }, + { + "epoch": 0.7254834845113811, + "grad_norm": 23.99932861328125, + "learning_rate": 2.414717626925271e-05, + "loss": 1.9431, + "step": 4239 + }, + { + "epoch": 
0.7256546294711621, + "grad_norm": 7.719635963439941, + "learning_rate": 2.415288077581289e-05, + "loss": 0.5437, + "step": 4240 + }, + { + "epoch": 0.725825774430943, + "grad_norm": 11.576183319091797, + "learning_rate": 2.4158585282373075e-05, + "loss": 0.8931, + "step": 4241 + }, + { + "epoch": 0.725996919390724, + "grad_norm": 1.2357739210128784, + "learning_rate": 2.4164289788933258e-05, + "loss": 0.2441, + "step": 4242 + }, + { + "epoch": 0.7261680643505048, + "grad_norm": 32.51354217529297, + "learning_rate": 2.4169994295493438e-05, + "loss": 5.9717, + "step": 4243 + }, + { + "epoch": 0.7263392093102858, + "grad_norm": 20.202180862426758, + "learning_rate": 2.417569880205362e-05, + "loss": 1.909, + "step": 4244 + }, + { + "epoch": 0.7265103542700667, + "grad_norm": 84.00714111328125, + "learning_rate": 2.4181403308613805e-05, + "loss": 1.9052, + "step": 4245 + }, + { + "epoch": 0.7266814992298477, + "grad_norm": 28.061588287353516, + "learning_rate": 2.4187107815173985e-05, + "loss": 1.4887, + "step": 4246 + }, + { + "epoch": 0.7268526441896286, + "grad_norm": 2.4384922981262207, + "learning_rate": 2.419281232173417e-05, + "loss": 0.2672, + "step": 4247 + }, + { + "epoch": 0.7270237891494096, + "grad_norm": 2.515739679336548, + "learning_rate": 2.4198516828294355e-05, + "loss": 0.2884, + "step": 4248 + }, + { + "epoch": 0.7271949341091904, + "grad_norm": 29.182708740234375, + "learning_rate": 2.420422133485454e-05, + "loss": 3.5217, + "step": 4249 + }, + { + "epoch": 0.7273660790689714, + "grad_norm": 16.3360538482666, + "learning_rate": 2.420992584141472e-05, + "loss": 1.7574, + "step": 4250 + }, + { + "epoch": 0.7275372240287523, + "grad_norm": 37.92715072631836, + "learning_rate": 2.4215630347974902e-05, + "loss": 3.4675, + "step": 4251 + }, + { + "epoch": 0.7277083689885333, + "grad_norm": 26.36214256286621, + "learning_rate": 2.4221334854535085e-05, + "loss": 3.8431, + "step": 4252 + }, + { + "epoch": 0.7278795139483142, + "grad_norm": 26.715503692626953, + "learning_rate": 2.4227039361095265e-05, + "loss": 2.7822, + "step": 4253 + }, + { + "epoch": 0.7280506589080952, + "grad_norm": 27.2398624420166, + "learning_rate": 2.423274386765545e-05, + "loss": 3.1297, + "step": 4254 + }, + { + "epoch": 0.728221803867876, + "grad_norm": 30.56747055053711, + "learning_rate": 2.4238448374215632e-05, + "loss": 5.8095, + "step": 4255 + }, + { + "epoch": 0.728392948827657, + "grad_norm": 7.464992523193359, + "learning_rate": 2.4244152880775815e-05, + "loss": 0.6275, + "step": 4256 + }, + { + "epoch": 0.7285640937874379, + "grad_norm": 12.612624168395996, + "learning_rate": 2.4249857387335995e-05, + "loss": 0.8099, + "step": 4257 + }, + { + "epoch": 0.7287352387472189, + "grad_norm": 24.279560089111328, + "learning_rate": 2.425556189389618e-05, + "loss": 2.72, + "step": 4258 + }, + { + "epoch": 0.7289063837069998, + "grad_norm": 21.02090835571289, + "learning_rate": 2.4261266400456362e-05, + "loss": 2.1918, + "step": 4259 + }, + { + "epoch": 0.7290775286667808, + "grad_norm": 4.548274040222168, + "learning_rate": 2.4266970907016542e-05, + "loss": 0.3331, + "step": 4260 + }, + { + "epoch": 0.7292486736265616, + "grad_norm": 20.990015029907227, + "learning_rate": 2.4272675413576725e-05, + "loss": 2.199, + "step": 4261 + }, + { + "epoch": 0.7294198185863426, + "grad_norm": 3.5553269386291504, + "learning_rate": 2.427837992013691e-05, + "loss": 0.3964, + "step": 4262 + }, + { + "epoch": 0.7295909635461235, + "grad_norm": 17.434099197387695, + "learning_rate": 2.4284084426697092e-05, + "loss": 
1.7278, + "step": 4263 + }, + { + "epoch": 0.7297621085059045, + "grad_norm": 26.248044967651367, + "learning_rate": 2.4289788933257272e-05, + "loss": 3.4657, + "step": 4264 + }, + { + "epoch": 0.7299332534656854, + "grad_norm": 26.377473831176758, + "learning_rate": 2.4295493439817456e-05, + "loss": 2.8837, + "step": 4265 + }, + { + "epoch": 0.7301043984254664, + "grad_norm": 29.46402931213379, + "learning_rate": 2.430119794637764e-05, + "loss": 2.9555, + "step": 4266 + }, + { + "epoch": 0.7302755433852472, + "grad_norm": 27.894542694091797, + "learning_rate": 2.430690245293782e-05, + "loss": 2.4692, + "step": 4267 + }, + { + "epoch": 0.7304466883450282, + "grad_norm": 23.581409454345703, + "learning_rate": 2.4312606959498002e-05, + "loss": 2.2207, + "step": 4268 + }, + { + "epoch": 0.7306178333048092, + "grad_norm": 26.48499870300293, + "learning_rate": 2.4318311466058186e-05, + "loss": 3.1471, + "step": 4269 + }, + { + "epoch": 0.7307889782645901, + "grad_norm": 1.8997199535369873, + "learning_rate": 2.432401597261837e-05, + "loss": 0.2555, + "step": 4270 + }, + { + "epoch": 0.7309601232243711, + "grad_norm": 27.24410057067871, + "learning_rate": 2.4329720479178552e-05, + "loss": 5.7181, + "step": 4271 + }, + { + "epoch": 0.731131268184152, + "grad_norm": 20.901037216186523, + "learning_rate": 2.4335424985738736e-05, + "loss": 1.8875, + "step": 4272 + }, + { + "epoch": 0.731302413143933, + "grad_norm": 14.34110164642334, + "learning_rate": 2.434112949229892e-05, + "loss": 1.1739, + "step": 4273 + }, + { + "epoch": 0.7314735581037138, + "grad_norm": 1.0765384435653687, + "learning_rate": 2.43468339988591e-05, + "loss": 0.2476, + "step": 4274 + }, + { + "epoch": 0.7316447030634948, + "grad_norm": 32.98508834838867, + "learning_rate": 2.4352538505419283e-05, + "loss": 1.3042, + "step": 4275 + }, + { + "epoch": 0.7318158480232757, + "grad_norm": 16.529098510742188, + "learning_rate": 2.4358243011979466e-05, + "loss": 1.3914, + "step": 4276 + }, + { + "epoch": 0.7319869929830567, + "grad_norm": 25.088991165161133, + "learning_rate": 2.4363947518539646e-05, + "loss": 1.8855, + "step": 4277 + }, + { + "epoch": 0.7321581379428376, + "grad_norm": 16.27592658996582, + "learning_rate": 2.436965202509983e-05, + "loss": 1.2685, + "step": 4278 + }, + { + "epoch": 0.7323292829026186, + "grad_norm": 24.759716033935547, + "learning_rate": 2.4375356531660013e-05, + "loss": 2.4846, + "step": 4279 + }, + { + "epoch": 0.7325004278623994, + "grad_norm": 30.153457641601562, + "learning_rate": 2.4381061038220196e-05, + "loss": 3.6713, + "step": 4280 + }, + { + "epoch": 0.7326715728221804, + "grad_norm": 25.423839569091797, + "learning_rate": 2.4386765544780376e-05, + "loss": 2.4835, + "step": 4281 + }, + { + "epoch": 0.7328427177819613, + "grad_norm": 17.405637741088867, + "learning_rate": 2.439247005134056e-05, + "loss": 1.6224, + "step": 4282 + }, + { + "epoch": 0.7330138627417423, + "grad_norm": 21.038610458374023, + "learning_rate": 2.4398174557900743e-05, + "loss": 1.7716, + "step": 4283 + }, + { + "epoch": 0.7331850077015232, + "grad_norm": 30.59052848815918, + "learning_rate": 2.4403879064460923e-05, + "loss": 2.7644, + "step": 4284 + }, + { + "epoch": 0.7333561526613042, + "grad_norm": 22.464582443237305, + "learning_rate": 2.4409583571021106e-05, + "loss": 1.891, + "step": 4285 + }, + { + "epoch": 0.733527297621085, + "grad_norm": 7.699871063232422, + "learning_rate": 2.441528807758129e-05, + "loss": 0.7411, + "step": 4286 + }, + { + "epoch": 0.733698442580866, + "grad_norm": 4.126267910003662, + 
"learning_rate": 2.4420992584141473e-05, + "loss": 0.4547, + "step": 4287 + }, + { + "epoch": 0.7338695875406469, + "grad_norm": 35.644535064697266, + "learning_rate": 2.4426697090701653e-05, + "loss": 6.2534, + "step": 4288 + }, + { + "epoch": 0.7340407325004279, + "grad_norm": 28.631427764892578, + "learning_rate": 2.4432401597261836e-05, + "loss": 2.9862, + "step": 4289 + }, + { + "epoch": 0.7342118774602088, + "grad_norm": 2.322627067565918, + "learning_rate": 2.443810610382202e-05, + "loss": 0.2372, + "step": 4290 + }, + { + "epoch": 0.7343830224199898, + "grad_norm": 24.10704803466797, + "learning_rate": 2.44438106103822e-05, + "loss": 2.0144, + "step": 4291 + }, + { + "epoch": 0.7345541673797706, + "grad_norm": 30.596025466918945, + "learning_rate": 2.4449515116942383e-05, + "loss": 4.3575, + "step": 4292 + }, + { + "epoch": 0.7347253123395516, + "grad_norm": 16.235584259033203, + "learning_rate": 2.4455219623502566e-05, + "loss": 1.3492, + "step": 4293 + }, + { + "epoch": 0.7348964572993325, + "grad_norm": 8.513513565063477, + "learning_rate": 2.4460924130062753e-05, + "loss": 0.6521, + "step": 4294 + }, + { + "epoch": 0.7350676022591135, + "grad_norm": 20.195348739624023, + "learning_rate": 2.4466628636622933e-05, + "loss": 2.3305, + "step": 4295 + }, + { + "epoch": 0.7352387472188944, + "grad_norm": 18.629962921142578, + "learning_rate": 2.4472333143183117e-05, + "loss": 1.8291, + "step": 4296 + }, + { + "epoch": 0.7354098921786754, + "grad_norm": 26.3973388671875, + "learning_rate": 2.44780376497433e-05, + "loss": 3.334, + "step": 4297 + }, + { + "epoch": 0.7355810371384562, + "grad_norm": 0.9989029765129089, + "learning_rate": 2.448374215630348e-05, + "loss": 0.2261, + "step": 4298 + }, + { + "epoch": 0.7357521820982372, + "grad_norm": 22.981550216674805, + "learning_rate": 2.4489446662863663e-05, + "loss": 2.0884, + "step": 4299 + }, + { + "epoch": 0.7359233270580181, + "grad_norm": 22.626544952392578, + "learning_rate": 2.4495151169423847e-05, + "loss": 2.1202, + "step": 4300 + }, + { + "epoch": 0.7360944720177991, + "grad_norm": 16.54318618774414, + "learning_rate": 2.450085567598403e-05, + "loss": 1.5155, + "step": 4301 + }, + { + "epoch": 0.73626561697758, + "grad_norm": 116.46958923339844, + "learning_rate": 2.450656018254421e-05, + "loss": 8.2721, + "step": 4302 + }, + { + "epoch": 0.736436761937361, + "grad_norm": 19.956342697143555, + "learning_rate": 2.4512264689104393e-05, + "loss": 1.8512, + "step": 4303 + }, + { + "epoch": 0.7366079068971418, + "grad_norm": 27.972797393798828, + "learning_rate": 2.4517969195664577e-05, + "loss": 2.7114, + "step": 4304 + }, + { + "epoch": 0.7367790518569228, + "grad_norm": 11.407204627990723, + "learning_rate": 2.4523673702224757e-05, + "loss": 0.9488, + "step": 4305 + }, + { + "epoch": 0.7369501968167037, + "grad_norm": 33.092105865478516, + "learning_rate": 2.452937820878494e-05, + "loss": 2.9253, + "step": 4306 + }, + { + "epoch": 0.7371213417764847, + "grad_norm": 10.819000244140625, + "learning_rate": 2.4535082715345124e-05, + "loss": 0.7126, + "step": 4307 + }, + { + "epoch": 0.7372924867362656, + "grad_norm": 7.100123882293701, + "learning_rate": 2.4540787221905307e-05, + "loss": 0.5677, + "step": 4308 + }, + { + "epoch": 0.7374636316960466, + "grad_norm": 3.432849407196045, + "learning_rate": 2.4546491728465487e-05, + "loss": 0.2723, + "step": 4309 + }, + { + "epoch": 0.7376347766558274, + "grad_norm": 22.43340492248535, + "learning_rate": 2.455219623502567e-05, + "loss": 2.1355, + "step": 4310 + }, + { + "epoch": 
0.7378059216156084, + "grad_norm": 22.277841567993164, + "learning_rate": 2.4557900741585854e-05, + "loss": 3.1528, + "step": 4311 + }, + { + "epoch": 0.7379770665753893, + "grad_norm": 26.116939544677734, + "learning_rate": 2.4563605248146034e-05, + "loss": 2.7783, + "step": 4312 + }, + { + "epoch": 0.7381482115351703, + "grad_norm": 21.692012786865234, + "learning_rate": 2.4569309754706217e-05, + "loss": 2.08, + "step": 4313 + }, + { + "epoch": 0.7383193564949512, + "grad_norm": 2.461122751235962, + "learning_rate": 2.45750142612664e-05, + "loss": 0.2903, + "step": 4314 + }, + { + "epoch": 0.7384905014547322, + "grad_norm": 26.099124908447266, + "learning_rate": 2.458071876782658e-05, + "loss": 2.0427, + "step": 4315 + }, + { + "epoch": 0.738661646414513, + "grad_norm": 18.19993782043457, + "learning_rate": 2.4586423274386764e-05, + "loss": 1.5727, + "step": 4316 + }, + { + "epoch": 0.738832791374294, + "grad_norm": 28.66299057006836, + "learning_rate": 2.459212778094695e-05, + "loss": 2.5081, + "step": 4317 + }, + { + "epoch": 0.7390039363340749, + "grad_norm": 27.575727462768555, + "learning_rate": 2.4597832287507134e-05, + "loss": 2.7801, + "step": 4318 + }, + { + "epoch": 0.7391750812938559, + "grad_norm": 46.224586486816406, + "learning_rate": 2.4603536794067314e-05, + "loss": 6.5383, + "step": 4319 + }, + { + "epoch": 0.7393462262536369, + "grad_norm": 9.273065567016602, + "learning_rate": 2.4609241300627497e-05, + "loss": 1.2797, + "step": 4320 + }, + { + "epoch": 0.7395173712134178, + "grad_norm": 12.391923904418945, + "learning_rate": 2.461494580718768e-05, + "loss": 1.0646, + "step": 4321 + }, + { + "epoch": 0.7396885161731988, + "grad_norm": 33.82769012451172, + "learning_rate": 2.462065031374786e-05, + "loss": 4.4309, + "step": 4322 + }, + { + "epoch": 0.7398596611329796, + "grad_norm": 9.532022476196289, + "learning_rate": 2.4626354820308044e-05, + "loss": 1.0837, + "step": 4323 + }, + { + "epoch": 0.7400308060927606, + "grad_norm": 16.09440803527832, + "learning_rate": 2.4632059326868227e-05, + "loss": 1.2765, + "step": 4324 + }, + { + "epoch": 0.7402019510525415, + "grad_norm": 1.2351820468902588, + "learning_rate": 2.463776383342841e-05, + "loss": 0.2564, + "step": 4325 + }, + { + "epoch": 0.7403730960123225, + "grad_norm": 31.442874908447266, + "learning_rate": 2.464346833998859e-05, + "loss": 3.7296, + "step": 4326 + }, + { + "epoch": 0.7405442409721034, + "grad_norm": 21.058252334594727, + "learning_rate": 2.4649172846548774e-05, + "loss": 1.7717, + "step": 4327 + }, + { + "epoch": 0.7407153859318844, + "grad_norm": 25.62652015686035, + "learning_rate": 2.4654877353108958e-05, + "loss": 2.1194, + "step": 4328 + }, + { + "epoch": 0.7408865308916652, + "grad_norm": 16.54743003845215, + "learning_rate": 2.4660581859669138e-05, + "loss": 1.6091, + "step": 4329 + }, + { + "epoch": 0.7410576758514462, + "grad_norm": 9.183897018432617, + "learning_rate": 2.466628636622932e-05, + "loss": 0.7841, + "step": 4330 + }, + { + "epoch": 0.7412288208112271, + "grad_norm": 12.366703987121582, + "learning_rate": 2.4671990872789504e-05, + "loss": 1.2934, + "step": 4331 + }, + { + "epoch": 0.7413999657710081, + "grad_norm": 9.97996997833252, + "learning_rate": 2.4677695379349688e-05, + "loss": 0.648, + "step": 4332 + }, + { + "epoch": 0.741571110730789, + "grad_norm": 4.609592437744141, + "learning_rate": 2.4683399885909868e-05, + "loss": 0.4228, + "step": 4333 + }, + { + "epoch": 0.74174225569057, + "grad_norm": 40.74041748046875, + "learning_rate": 2.468910439247005e-05, + "loss": 
6.2855, + "step": 4334 + }, + { + "epoch": 0.7419134006503508, + "grad_norm": 34.59137725830078, + "learning_rate": 2.4694808899030234e-05, + "loss": 5.9909, + "step": 4335 + }, + { + "epoch": 0.7420845456101318, + "grad_norm": 8.708815574645996, + "learning_rate": 2.4700513405590414e-05, + "loss": 0.7173, + "step": 4336 + }, + { + "epoch": 0.7422556905699127, + "grad_norm": 16.282533645629883, + "learning_rate": 2.4706217912150598e-05, + "loss": 1.7374, + "step": 4337 + }, + { + "epoch": 0.7424268355296937, + "grad_norm": 11.787439346313477, + "learning_rate": 2.471192241871078e-05, + "loss": 0.7362, + "step": 4338 + }, + { + "epoch": 0.7425979804894746, + "grad_norm": 6.672769546508789, + "learning_rate": 2.4717626925270968e-05, + "loss": 0.8264, + "step": 4339 + }, + { + "epoch": 0.7427691254492556, + "grad_norm": 11.766725540161133, + "learning_rate": 2.4723331431831148e-05, + "loss": 1.1676, + "step": 4340 + }, + { + "epoch": 0.7429402704090364, + "grad_norm": 40.45454788208008, + "learning_rate": 2.472903593839133e-05, + "loss": 3.9712, + "step": 4341 + }, + { + "epoch": 0.7431114153688174, + "grad_norm": 26.344924926757812, + "learning_rate": 2.4734740444951515e-05, + "loss": 5.5252, + "step": 4342 + }, + { + "epoch": 0.7432825603285983, + "grad_norm": 23.83209228515625, + "learning_rate": 2.4740444951511695e-05, + "loss": 2.138, + "step": 4343 + }, + { + "epoch": 0.7434537052883793, + "grad_norm": 31.80006980895996, + "learning_rate": 2.4746149458071878e-05, + "loss": 3.0915, + "step": 4344 + }, + { + "epoch": 0.7436248502481602, + "grad_norm": 6.755828380584717, + "learning_rate": 2.475185396463206e-05, + "loss": 0.5986, + "step": 4345 + }, + { + "epoch": 0.7437959952079412, + "grad_norm": 20.988128662109375, + "learning_rate": 2.475755847119224e-05, + "loss": 2.3074, + "step": 4346 + }, + { + "epoch": 0.743967140167722, + "grad_norm": 15.407384872436523, + "learning_rate": 2.4763262977752425e-05, + "loss": 1.2107, + "step": 4347 + }, + { + "epoch": 0.744138285127503, + "grad_norm": 189.76187133789062, + "learning_rate": 2.4768967484312608e-05, + "loss": 7.4887, + "step": 4348 + }, + { + "epoch": 0.7443094300872839, + "grad_norm": 15.863279342651367, + "learning_rate": 2.477467199087279e-05, + "loss": 1.4751, + "step": 4349 + }, + { + "epoch": 0.7444805750470649, + "grad_norm": 15.989771842956543, + "learning_rate": 2.478037649743297e-05, + "loss": 1.5714, + "step": 4350 + }, + { + "epoch": 0.7446517200068458, + "grad_norm": 22.554706573486328, + "learning_rate": 2.4786081003993155e-05, + "loss": 2.0379, + "step": 4351 + }, + { + "epoch": 0.7448228649666268, + "grad_norm": 19.29569435119629, + "learning_rate": 2.479178551055334e-05, + "loss": 1.6695, + "step": 4352 + }, + { + "epoch": 0.7449940099264076, + "grad_norm": 32.85340881347656, + "learning_rate": 2.479749001711352e-05, + "loss": 6.1252, + "step": 4353 + }, + { + "epoch": 0.7451651548861886, + "grad_norm": 7.867766380310059, + "learning_rate": 2.4803194523673702e-05, + "loss": 0.5292, + "step": 4354 + }, + { + "epoch": 0.7453362998459695, + "grad_norm": 31.13640022277832, + "learning_rate": 2.4808899030233885e-05, + "loss": 3.6394, + "step": 4355 + }, + { + "epoch": 0.7455074448057505, + "grad_norm": 29.437509536743164, + "learning_rate": 2.481460353679407e-05, + "loss": 5.7376, + "step": 4356 + }, + { + "epoch": 0.7456785897655314, + "grad_norm": 22.40192222595215, + "learning_rate": 2.482030804335425e-05, + "loss": 2.1447, + "step": 4357 + }, + { + "epoch": 0.7458497347253124, + "grad_norm": 16.596187591552734, + 
"learning_rate": 2.4826012549914432e-05, + "loss": 1.5623, + "step": 4358 + }, + { + "epoch": 0.7460208796850932, + "grad_norm": 18.423601150512695, + "learning_rate": 2.4831717056474615e-05, + "loss": 1.8746, + "step": 4359 + }, + { + "epoch": 0.7461920246448742, + "grad_norm": 27.663909912109375, + "learning_rate": 2.4837421563034795e-05, + "loss": 3.0094, + "step": 4360 + }, + { + "epoch": 0.7463631696046551, + "grad_norm": 22.51800537109375, + "learning_rate": 2.484312606959498e-05, + "loss": 2.4867, + "step": 4361 + }, + { + "epoch": 0.7465343145644361, + "grad_norm": 14.807696342468262, + "learning_rate": 2.4848830576155165e-05, + "loss": 1.3953, + "step": 4362 + }, + { + "epoch": 0.746705459524217, + "grad_norm": 27.524484634399414, + "learning_rate": 2.485453508271535e-05, + "loss": 2.9434, + "step": 4363 + }, + { + "epoch": 0.746876604483998, + "grad_norm": 20.50353240966797, + "learning_rate": 2.486023958927553e-05, + "loss": 1.9921, + "step": 4364 + }, + { + "epoch": 0.7470477494437788, + "grad_norm": 8.223814010620117, + "learning_rate": 2.4865944095835712e-05, + "loss": 1.2869, + "step": 4365 + }, + { + "epoch": 0.7472188944035598, + "grad_norm": 9.882466316223145, + "learning_rate": 2.4871648602395896e-05, + "loss": 0.8515, + "step": 4366 + }, + { + "epoch": 0.7473900393633407, + "grad_norm": 9.574341773986816, + "learning_rate": 2.4877353108956075e-05, + "loss": 0.617, + "step": 4367 + }, + { + "epoch": 0.7475611843231217, + "grad_norm": 65.22699737548828, + "learning_rate": 2.488305761551626e-05, + "loss": 3.1356, + "step": 4368 + }, + { + "epoch": 0.7477323292829026, + "grad_norm": 3.961726427078247, + "learning_rate": 2.4888762122076442e-05, + "loss": 0.4726, + "step": 4369 + }, + { + "epoch": 0.7479034742426836, + "grad_norm": 24.925546646118164, + "learning_rate": 2.4894466628636626e-05, + "loss": 2.5579, + "step": 4370 + }, + { + "epoch": 0.7480746192024645, + "grad_norm": 18.773216247558594, + "learning_rate": 2.4900171135196806e-05, + "loss": 1.7584, + "step": 4371 + }, + { + "epoch": 0.7482457641622454, + "grad_norm": 29.555532455444336, + "learning_rate": 2.490587564175699e-05, + "loss": 6.1397, + "step": 4372 + }, + { + "epoch": 0.7484169091220264, + "grad_norm": 24.256031036376953, + "learning_rate": 2.4911580148317172e-05, + "loss": 1.9726, + "step": 4373 + }, + { + "epoch": 0.7485880540818073, + "grad_norm": 16.05461883544922, + "learning_rate": 2.4917284654877352e-05, + "loss": 1.264, + "step": 4374 + }, + { + "epoch": 0.7487591990415883, + "grad_norm": 24.213459014892578, + "learning_rate": 2.4922989161437536e-05, + "loss": 2.3189, + "step": 4375 + }, + { + "epoch": 0.7489303440013692, + "grad_norm": 29.8961181640625, + "learning_rate": 2.492869366799772e-05, + "loss": 3.6839, + "step": 4376 + }, + { + "epoch": 0.7491014889611501, + "grad_norm": 22.435625076293945, + "learning_rate": 2.4934398174557903e-05, + "loss": 2.2289, + "step": 4377 + }, + { + "epoch": 0.749272633920931, + "grad_norm": 30.220745086669922, + "learning_rate": 2.4940102681118082e-05, + "loss": 4.4896, + "step": 4378 + }, + { + "epoch": 0.749443778880712, + "grad_norm": 18.16118049621582, + "learning_rate": 2.4945807187678266e-05, + "loss": 1.8377, + "step": 4379 + }, + { + "epoch": 0.7496149238404929, + "grad_norm": 27.19329833984375, + "learning_rate": 2.495151169423845e-05, + "loss": 2.8044, + "step": 4380 + }, + { + "epoch": 0.7497860688002739, + "grad_norm": 22.332542419433594, + "learning_rate": 2.495721620079863e-05, + "loss": 1.8498, + "step": 4381 + }, + { + "epoch": 
0.7499572137600548, + "grad_norm": 4.663109302520752, + "learning_rate": 2.4962920707358813e-05, + "loss": 0.5393, + "step": 4382 + }, + { + "epoch": 0.7501283587198357, + "grad_norm": 18.246469497680664, + "learning_rate": 2.4968625213918996e-05, + "loss": 1.5186, + "step": 4383 + }, + { + "epoch": 0.7502995036796166, + "grad_norm": 28.977493286132812, + "learning_rate": 2.4974329720479176e-05, + "loss": 2.8719, + "step": 4384 + }, + { + "epoch": 0.7504706486393976, + "grad_norm": 6.890769958496094, + "learning_rate": 2.4980034227039363e-05, + "loss": 0.601, + "step": 4385 + }, + { + "epoch": 0.7504706486393976, + "eval_nli-pairs_loss": 2.050740957260132, + "eval_nli-pairs_runtime": 4.2795, + "eval_nli-pairs_samples_per_second": 46.734, + "eval_nli-pairs_steps_per_second": 1.636, + "eval_sts-test_pearson_cosine": 0.756734064986887, + "eval_sts-test_pearson_dot": 0.6528865740820513, + "eval_sts-test_pearson_euclidean": 0.7545477323381371, + "eval_sts-test_pearson_manhattan": 0.7602184258166524, + "eval_sts-test_pearson_max": 0.7602184258166524, + "eval_sts-test_spearman_cosine": 0.7444733315413253, + "eval_sts-test_spearman_dot": 0.6319213377688324, + "eval_sts-test_spearman_euclidean": 0.7398981584440489, + "eval_sts-test_spearman_manhattan": 0.7468720146418238, + "eval_sts-test_spearman_max": 0.7468720146418238, + "step": 4385 + }, + { + "epoch": 0.7504706486393976, + "eval_vitaminc-pairs_loss": 1.3987665176391602, + "eval_vitaminc-pairs_runtime": 2.7296, + "eval_vitaminc-pairs_samples_per_second": 73.272, + "eval_vitaminc-pairs_steps_per_second": 2.565, + "step": 4385 + }, + { + "epoch": 0.7504706486393976, + "eval_qnli-contrastive_loss": 2.7516510486602783, + "eval_qnli-contrastive_runtime": 0.6347, + "eval_qnli-contrastive_samples_per_second": 315.112, + "eval_qnli-contrastive_steps_per_second": 11.029, + "step": 4385 + }, + { + "epoch": 0.7504706486393976, + "eval_scitail-pairs-qa_loss": 0.22120414674282074, + "eval_scitail-pairs-qa_runtime": 1.6102, + "eval_scitail-pairs-qa_samples_per_second": 124.21, + "eval_scitail-pairs-qa_steps_per_second": 4.347, + "step": 4385 + }, + { + "epoch": 0.7504706486393976, + "eval_scitail-pairs-pos_loss": 0.9076427817344666, + "eval_scitail-pairs-pos_runtime": 2.6161, + "eval_scitail-pairs-pos_samples_per_second": 76.449, + "eval_scitail-pairs-pos_steps_per_second": 2.676, + "step": 4385 + }, + { + "epoch": 0.7504706486393976, + "eval_xsum-pairs_loss": 1.0805269479751587, + "eval_xsum-pairs_runtime": 2.6446, + "eval_xsum-pairs_samples_per_second": 66.172, + "eval_xsum-pairs_steps_per_second": 2.269, + "step": 4385 + }, + { + "epoch": 0.7504706486393976, + "eval_compression-pairs_loss": 0.44501441717147827, + "eval_compression-pairs_runtime": 0.5283, + "eval_compression-pairs_samples_per_second": 378.589, + "eval_compression-pairs_steps_per_second": 13.251, + "step": 4385 + }, + { + "epoch": 0.7504706486393976, + "eval_sciq_pairs_loss": 5.368130207061768, + "eval_sciq_pairs_runtime": 9.1813, + "eval_sciq_pairs_samples_per_second": 21.783, + "eval_sciq_pairs_steps_per_second": 0.762, + "step": 4385 + }, + { + "epoch": 0.7504706486393976, + "eval_qasc_pairs_loss": 5.916055679321289, + "eval_qasc_pairs_runtime": 2.6536, + "eval_qasc_pairs_samples_per_second": 75.369, + "eval_qasc_pairs_steps_per_second": 2.638, + "step": 4385 + }, + { + "epoch": 0.7504706486393976, + "eval_openbookqa_pairs_loss": 3.2691121101379395, + "eval_openbookqa_pairs_runtime": 0.6379, + "eval_openbookqa_pairs_samples_per_second": 108.16, + 
"eval_openbookqa_pairs_steps_per_second": 4.703, + "step": 4385 + }, + { + "epoch": 0.7504706486393976, + "eval_msmarco_pairs_loss": 1.845609426498413, + "eval_msmarco_pairs_runtime": 3.9718, + "eval_msmarco_pairs_samples_per_second": 50.355, + "eval_msmarco_pairs_steps_per_second": 1.762, + "step": 4385 + }, + { + "epoch": 0.7504706486393976, + "eval_nq_pairs_loss": 2.279620409011841, + "eval_nq_pairs_runtime": 8.6017, + "eval_nq_pairs_samples_per_second": 23.251, + "eval_nq_pairs_steps_per_second": 0.814, + "step": 4385 + }, + { + "epoch": 0.7504706486393976, + "eval_trivia_pairs_loss": 2.464531421661377, + "eval_trivia_pairs_runtime": 12.8394, + "eval_trivia_pairs_samples_per_second": 15.577, + "eval_trivia_pairs_steps_per_second": 0.545, + "step": 4385 + }, + { + "epoch": 0.7504706486393976, + "eval_quora_pairs_loss": 0.40776023268699646, + "eval_quora_pairs_runtime": 1.5837, + "eval_quora_pairs_samples_per_second": 126.29, + "eval_quora_pairs_steps_per_second": 4.42, + "step": 4385 + }, + { + "epoch": 0.7504706486393976, + "eval_gooaq_pairs_loss": 1.4506279230117798, + "eval_gooaq_pairs_runtime": 2.6527, + "eval_gooaq_pairs_samples_per_second": 75.396, + "eval_gooaq_pairs_steps_per_second": 2.639, + "step": 4385 + }, + { + "epoch": 0.7506417935991785, + "grad_norm": 29.356266021728516, + "learning_rate": 2.4985738733599546e-05, + "loss": 2.6466, + "step": 4386 + }, + { + "epoch": 0.7508129385589595, + "grad_norm": 20.16213607788086, + "learning_rate": 2.499144324015973e-05, + "loss": 2.096, + "step": 4387 + }, + { + "epoch": 0.7509840835187404, + "grad_norm": 6.681097984313965, + "learning_rate": 2.499714774671991e-05, + "loss": 0.4151, + "step": 4388 + }, + { + "epoch": 0.7511552284785213, + "grad_norm": 28.097131729125977, + "learning_rate": 2.5002852253280093e-05, + "loss": 2.6453, + "step": 4389 + }, + { + "epoch": 0.7513263734383022, + "grad_norm": 23.283533096313477, + "learning_rate": 2.5008556759840276e-05, + "loss": 1.8409, + "step": 4390 + }, + { + "epoch": 0.7514975183980832, + "grad_norm": 17.86503791809082, + "learning_rate": 2.5014261266400456e-05, + "loss": 1.9202, + "step": 4391 + }, + { + "epoch": 0.7516686633578641, + "grad_norm": 24.048988342285156, + "learning_rate": 2.501996577296064e-05, + "loss": 2.8925, + "step": 4392 + }, + { + "epoch": 0.7518398083176451, + "grad_norm": 14.309977531433105, + "learning_rate": 2.5025670279520823e-05, + "loss": 1.2498, + "step": 4393 + }, + { + "epoch": 0.752010953277426, + "grad_norm": 23.62467384338379, + "learning_rate": 2.5031374786081006e-05, + "loss": 2.1623, + "step": 4394 + }, + { + "epoch": 0.7521820982372069, + "grad_norm": 13.426614761352539, + "learning_rate": 2.5037079292641186e-05, + "loss": 1.299, + "step": 4395 + }, + { + "epoch": 0.7523532431969878, + "grad_norm": 8.141080856323242, + "learning_rate": 2.504278379920137e-05, + "loss": 1.2283, + "step": 4396 + }, + { + "epoch": 0.7525243881567688, + "grad_norm": 21.36097526550293, + "learning_rate": 2.5048488305761553e-05, + "loss": 1.5297, + "step": 4397 + }, + { + "epoch": 0.7526955331165497, + "grad_norm": 50.34371566772461, + "learning_rate": 2.5054192812321733e-05, + "loss": 2.2539, + "step": 4398 + }, + { + "epoch": 0.7528666780763307, + "grad_norm": 1.270694613456726, + "learning_rate": 2.5059897318881917e-05, + "loss": 0.2376, + "step": 4399 + }, + { + "epoch": 0.7530378230361116, + "grad_norm": 25.70792579650879, + "learning_rate": 2.50656018254421e-05, + "loss": 2.8688, + "step": 4400 + }, + { + "epoch": 0.7532089679958925, + "grad_norm": 
21.675689697265625, + "learning_rate": 2.5071306332002283e-05, + "loss": 2.2722, + "step": 4401 + }, + { + "epoch": 0.7533801129556734, + "grad_norm": 0.9436509013175964, + "learning_rate": 2.5077010838562463e-05, + "loss": 0.2405, + "step": 4402 + }, + { + "epoch": 0.7535512579154544, + "grad_norm": 17.242216110229492, + "learning_rate": 2.5082715345122647e-05, + "loss": 1.5298, + "step": 4403 + }, + { + "epoch": 0.7537224028752353, + "grad_norm": 23.45809555053711, + "learning_rate": 2.508841985168283e-05, + "loss": 1.9691, + "step": 4404 + }, + { + "epoch": 0.7538935478350163, + "grad_norm": 3.92891788482666, + "learning_rate": 2.509412435824301e-05, + "loss": 0.555, + "step": 4405 + }, + { + "epoch": 0.7540646927947972, + "grad_norm": 22.707956314086914, + "learning_rate": 2.5099828864803193e-05, + "loss": 2.3504, + "step": 4406 + }, + { + "epoch": 0.7542358377545781, + "grad_norm": 20.21977996826172, + "learning_rate": 2.5105533371363377e-05, + "loss": 2.0231, + "step": 4407 + }, + { + "epoch": 0.754406982714359, + "grad_norm": 1.7529772520065308, + "learning_rate": 2.5111237877923564e-05, + "loss": 0.2485, + "step": 4408 + }, + { + "epoch": 0.75457812767414, + "grad_norm": 1.6466134786605835, + "learning_rate": 2.5116942384483744e-05, + "loss": 0.2669, + "step": 4409 + }, + { + "epoch": 0.7547492726339209, + "grad_norm": 17.274892807006836, + "learning_rate": 2.5122646891043927e-05, + "loss": 1.6309, + "step": 4410 + }, + { + "epoch": 0.7549204175937019, + "grad_norm": 21.545635223388672, + "learning_rate": 2.512835139760411e-05, + "loss": 2.004, + "step": 4411 + }, + { + "epoch": 0.7550915625534828, + "grad_norm": 12.705281257629395, + "learning_rate": 2.513405590416429e-05, + "loss": 0.8108, + "step": 4412 + }, + { + "epoch": 0.7552627075132637, + "grad_norm": 21.702417373657227, + "learning_rate": 2.5139760410724474e-05, + "loss": 1.9521, + "step": 4413 + }, + { + "epoch": 0.7554338524730446, + "grad_norm": 5.507816314697266, + "learning_rate": 2.5145464917284657e-05, + "loss": 0.4295, + "step": 4414 + }, + { + "epoch": 0.7556049974328256, + "grad_norm": 44.05494689941406, + "learning_rate": 2.5151169423844837e-05, + "loss": 2.7, + "step": 4415 + }, + { + "epoch": 0.7557761423926065, + "grad_norm": 23.11194610595703, + "learning_rate": 2.515687393040502e-05, + "loss": 2.84, + "step": 4416 + }, + { + "epoch": 0.7559472873523875, + "grad_norm": 26.601444244384766, + "learning_rate": 2.5162578436965204e-05, + "loss": 2.6999, + "step": 4417 + }, + { + "epoch": 0.7561184323121684, + "grad_norm": 21.454803466796875, + "learning_rate": 2.5168282943525387e-05, + "loss": 2.186, + "step": 4418 + }, + { + "epoch": 0.7562895772719493, + "grad_norm": 19.278148651123047, + "learning_rate": 2.5173987450085567e-05, + "loss": 1.9123, + "step": 4419 + }, + { + "epoch": 0.7564607222317302, + "grad_norm": 25.827354431152344, + "learning_rate": 2.517969195664575e-05, + "loss": 2.32, + "step": 4420 + }, + { + "epoch": 0.7566318671915112, + "grad_norm": 24.202350616455078, + "learning_rate": 2.5185396463205934e-05, + "loss": 3.0039, + "step": 4421 + }, + { + "epoch": 0.7568030121512922, + "grad_norm": 16.3718318939209, + "learning_rate": 2.5191100969766114e-05, + "loss": 1.56, + "step": 4422 + }, + { + "epoch": 0.7569741571110731, + "grad_norm": 27.989944458007812, + "learning_rate": 2.5196805476326297e-05, + "loss": 3.6797, + "step": 4423 + }, + { + "epoch": 0.7571453020708541, + "grad_norm": 29.627376556396484, + "learning_rate": 2.520250998288648e-05, + "loss": 3.0339, + "step": 4424 + }, + { + 
"epoch": 0.7573164470306349, + "grad_norm": 23.632122039794922, + "learning_rate": 2.5208214489446664e-05, + "loss": 2.2779, + "step": 4425 + }, + { + "epoch": 0.7574875919904159, + "grad_norm": 9.593125343322754, + "learning_rate": 2.5213918996006844e-05, + "loss": 0.7422, + "step": 4426 + }, + { + "epoch": 0.7576587369501968, + "grad_norm": 17.23970603942871, + "learning_rate": 2.5219623502567027e-05, + "loss": 1.2251, + "step": 4427 + }, + { + "epoch": 0.7578298819099778, + "grad_norm": 26.436206817626953, + "learning_rate": 2.522532800912721e-05, + "loss": 2.0061, + "step": 4428 + }, + { + "epoch": 0.7580010268697587, + "grad_norm": 5.741855144500732, + "learning_rate": 2.523103251568739e-05, + "loss": 0.4587, + "step": 4429 + }, + { + "epoch": 0.7581721718295397, + "grad_norm": 28.37969398498535, + "learning_rate": 2.5236737022247574e-05, + "loss": 2.671, + "step": 4430 + }, + { + "epoch": 0.7583433167893205, + "grad_norm": 18.421100616455078, + "learning_rate": 2.524244152880776e-05, + "loss": 1.7614, + "step": 4431 + }, + { + "epoch": 0.7585144617491015, + "grad_norm": 4.070728302001953, + "learning_rate": 2.5248146035367944e-05, + "loss": 0.2731, + "step": 4432 + }, + { + "epoch": 0.7586856067088824, + "grad_norm": 1.1695162057876587, + "learning_rate": 2.5253850541928124e-05, + "loss": 0.2524, + "step": 4433 + }, + { + "epoch": 0.7588567516686634, + "grad_norm": 22.150390625, + "learning_rate": 2.5259555048488308e-05, + "loss": 2.1763, + "step": 4434 + }, + { + "epoch": 0.7590278966284443, + "grad_norm": 24.137784957885742, + "learning_rate": 2.526525955504849e-05, + "loss": 2.7464, + "step": 4435 + }, + { + "epoch": 0.7591990415882253, + "grad_norm": 97.06147003173828, + "learning_rate": 2.527096406160867e-05, + "loss": 8.2917, + "step": 4436 + }, + { + "epoch": 0.7593701865480061, + "grad_norm": 18.736169815063477, + "learning_rate": 2.5276668568168854e-05, + "loss": 1.8898, + "step": 4437 + }, + { + "epoch": 0.7595413315077871, + "grad_norm": 6.186467170715332, + "learning_rate": 2.5282373074729038e-05, + "loss": 0.6734, + "step": 4438 + }, + { + "epoch": 0.759712476467568, + "grad_norm": 2.713296413421631, + "learning_rate": 2.528807758128922e-05, + "loss": 0.2576, + "step": 4439 + }, + { + "epoch": 0.759883621427349, + "grad_norm": 5.079885482788086, + "learning_rate": 2.52937820878494e-05, + "loss": 0.928, + "step": 4440 + }, + { + "epoch": 0.7600547663871299, + "grad_norm": 7.678714275360107, + "learning_rate": 2.5299486594409585e-05, + "loss": 0.9906, + "step": 4441 + }, + { + "epoch": 0.7602259113469109, + "grad_norm": 28.940954208374023, + "learning_rate": 2.5305191100969768e-05, + "loss": 2.2819, + "step": 4442 + }, + { + "epoch": 0.7603970563066917, + "grad_norm": 19.63545799255371, + "learning_rate": 2.5310895607529948e-05, + "loss": 1.5862, + "step": 4443 + }, + { + "epoch": 0.7605682012664727, + "grad_norm": 24.078157424926758, + "learning_rate": 2.531660011409013e-05, + "loss": 1.9171, + "step": 4444 + }, + { + "epoch": 0.7607393462262536, + "grad_norm": 25.036775588989258, + "learning_rate": 2.5322304620650315e-05, + "loss": 2.7408, + "step": 4445 + }, + { + "epoch": 0.7609104911860346, + "grad_norm": 25.063066482543945, + "learning_rate": 2.5328009127210495e-05, + "loss": 2.0789, + "step": 4446 + }, + { + "epoch": 0.7610816361458155, + "grad_norm": 26.909889221191406, + "learning_rate": 2.5333713633770678e-05, + "loss": 3.0789, + "step": 4447 + }, + { + "epoch": 0.7612527811055965, + "grad_norm": 55.91215896606445, + "learning_rate": 2.533941814033086e-05, + 
"loss": 2.2273, + "step": 4448 + }, + { + "epoch": 0.7614239260653773, + "grad_norm": 16.553728103637695, + "learning_rate": 2.5345122646891045e-05, + "loss": 1.4893, + "step": 4449 + }, + { + "epoch": 0.7615950710251583, + "grad_norm": 35.240394592285156, + "learning_rate": 2.5350827153451225e-05, + "loss": 6.035, + "step": 4450 + }, + { + "epoch": 0.7617662159849392, + "grad_norm": 9.102727890014648, + "learning_rate": 2.5356531660011408e-05, + "loss": 0.5959, + "step": 4451 + }, + { + "epoch": 0.7619373609447202, + "grad_norm": 19.193933486938477, + "learning_rate": 2.536223616657159e-05, + "loss": 1.4628, + "step": 4452 + }, + { + "epoch": 0.7621085059045011, + "grad_norm": 31.62565803527832, + "learning_rate": 2.536794067313177e-05, + "loss": 5.7421, + "step": 4453 + }, + { + "epoch": 0.7622796508642821, + "grad_norm": 19.388065338134766, + "learning_rate": 2.5373645179691958e-05, + "loss": 1.8378, + "step": 4454 + }, + { + "epoch": 0.7624507958240629, + "grad_norm": 4.330984115600586, + "learning_rate": 2.537934968625214e-05, + "loss": 0.4705, + "step": 4455 + }, + { + "epoch": 0.7626219407838439, + "grad_norm": 5.933356761932373, + "learning_rate": 2.5385054192812325e-05, + "loss": 0.5481, + "step": 4456 + }, + { + "epoch": 0.7627930857436248, + "grad_norm": 30.84785270690918, + "learning_rate": 2.5390758699372505e-05, + "loss": 5.6798, + "step": 4457 + }, + { + "epoch": 0.7629642307034058, + "grad_norm": 22.500408172607422, + "learning_rate": 2.539646320593269e-05, + "loss": 2.1908, + "step": 4458 + }, + { + "epoch": 0.7631353756631867, + "grad_norm": 1.4484208822250366, + "learning_rate": 2.5402167712492872e-05, + "loss": 0.2429, + "step": 4459 + }, + { + "epoch": 0.7633065206229677, + "grad_norm": 18.820663452148438, + "learning_rate": 2.5407872219053052e-05, + "loss": 1.5979, + "step": 4460 + }, + { + "epoch": 0.7634776655827485, + "grad_norm": 29.224742889404297, + "learning_rate": 2.5413576725613235e-05, + "loss": 3.4263, + "step": 4461 + }, + { + "epoch": 0.7636488105425295, + "grad_norm": 27.641237258911133, + "learning_rate": 2.541928123217342e-05, + "loss": 2.9448, + "step": 4462 + }, + { + "epoch": 0.7638199555023104, + "grad_norm": 28.82822608947754, + "learning_rate": 2.5424985738733602e-05, + "loss": 3.3548, + "step": 4463 + }, + { + "epoch": 0.7639911004620914, + "grad_norm": 31.449243545532227, + "learning_rate": 2.5430690245293782e-05, + "loss": 3.569, + "step": 4464 + }, + { + "epoch": 0.7641622454218723, + "grad_norm": 22.96309471130371, + "learning_rate": 2.5436394751853965e-05, + "loss": 2.7067, + "step": 4465 + }, + { + "epoch": 0.7643333903816533, + "grad_norm": 54.38570022583008, + "learning_rate": 2.544209925841415e-05, + "loss": 1.9878, + "step": 4466 + }, + { + "epoch": 0.7645045353414341, + "grad_norm": 13.225268363952637, + "learning_rate": 2.544780376497433e-05, + "loss": 0.9805, + "step": 4467 + }, + { + "epoch": 0.7646756803012151, + "grad_norm": 11.44771957397461, + "learning_rate": 2.5453508271534512e-05, + "loss": 0.8328, + "step": 4468 + }, + { + "epoch": 0.764846825260996, + "grad_norm": 1.3261395692825317, + "learning_rate": 2.5459212778094695e-05, + "loss": 0.2307, + "step": 4469 + }, + { + "epoch": 0.765017970220777, + "grad_norm": 2.594686269760132, + "learning_rate": 2.546491728465488e-05, + "loss": 0.2594, + "step": 4470 + }, + { + "epoch": 0.7651891151805579, + "grad_norm": 1.5864319801330566, + "learning_rate": 2.547062179121506e-05, + "loss": 0.2351, + "step": 4471 + }, + { + "epoch": 0.7653602601403389, + "grad_norm": 23.80762481689453, 
+ "learning_rate": 2.5476326297775242e-05, + "loss": 2.3596, + "step": 4472 + }, + { + "epoch": 0.7655314051001199, + "grad_norm": 0.9860055446624756, + "learning_rate": 2.5482030804335426e-05, + "loss": 0.2113, + "step": 4473 + }, + { + "epoch": 0.7657025500599007, + "grad_norm": 24.054868698120117, + "learning_rate": 2.5487735310895606e-05, + "loss": 2.4686, + "step": 4474 + }, + { + "epoch": 0.7658736950196817, + "grad_norm": 20.621498107910156, + "learning_rate": 2.549343981745579e-05, + "loss": 1.9467, + "step": 4475 + }, + { + "epoch": 0.7660448399794626, + "grad_norm": 23.417434692382812, + "learning_rate": 2.5499144324015972e-05, + "loss": 2.6939, + "step": 4476 + }, + { + "epoch": 0.7662159849392436, + "grad_norm": 35.24362564086914, + "learning_rate": 2.5504848830576156e-05, + "loss": 2.1405, + "step": 4477 + }, + { + "epoch": 0.7663871298990245, + "grad_norm": 24.250762939453125, + "learning_rate": 2.551055333713634e-05, + "loss": 2.0974, + "step": 4478 + }, + { + "epoch": 0.7665582748588055, + "grad_norm": 34.85161590576172, + "learning_rate": 2.5516257843696522e-05, + "loss": 2.6934, + "step": 4479 + }, + { + "epoch": 0.7667294198185863, + "grad_norm": 23.28230857849121, + "learning_rate": 2.5521962350256706e-05, + "loss": 2.4125, + "step": 4480 + }, + { + "epoch": 0.7669005647783673, + "grad_norm": 24.412673950195312, + "learning_rate": 2.5527666856816886e-05, + "loss": 2.0085, + "step": 4481 + }, + { + "epoch": 0.7670717097381482, + "grad_norm": 5.911852836608887, + "learning_rate": 2.553337136337707e-05, + "loss": 0.4889, + "step": 4482 + }, + { + "epoch": 0.7672428546979292, + "grad_norm": 15.7787504196167, + "learning_rate": 2.5539075869937253e-05, + "loss": 1.6727, + "step": 4483 + }, + { + "epoch": 0.7674139996577101, + "grad_norm": 30.21489715576172, + "learning_rate": 2.5544780376497433e-05, + "loss": 3.3485, + "step": 4484 + }, + { + "epoch": 0.767585144617491, + "grad_norm": 1.4315123558044434, + "learning_rate": 2.5550484883057616e-05, + "loss": 0.2253, + "step": 4485 + }, + { + "epoch": 0.7677562895772719, + "grad_norm": 43.58045959472656, + "learning_rate": 2.55561893896178e-05, + "loss": 2.2129, + "step": 4486 + }, + { + "epoch": 0.7679274345370529, + "grad_norm": 15.73321533203125, + "learning_rate": 2.5561893896177983e-05, + "loss": 1.8963, + "step": 4487 + }, + { + "epoch": 0.7680985794968338, + "grad_norm": 15.624593734741211, + "learning_rate": 2.5567598402738163e-05, + "loss": 1.4826, + "step": 4488 + }, + { + "epoch": 0.7682697244566148, + "grad_norm": 19.84630012512207, + "learning_rate": 2.5573302909298346e-05, + "loss": 1.9842, + "step": 4489 + }, + { + "epoch": 0.7684408694163957, + "grad_norm": 23.67464828491211, + "learning_rate": 2.557900741585853e-05, + "loss": 1.9172, + "step": 4490 + }, + { + "epoch": 0.7686120143761767, + "grad_norm": 26.324172973632812, + "learning_rate": 2.558471192241871e-05, + "loss": 2.4427, + "step": 4491 + }, + { + "epoch": 0.7687831593359575, + "grad_norm": 29.327041625976562, + "learning_rate": 2.5590416428978893e-05, + "loss": 2.9564, + "step": 4492 + }, + { + "epoch": 0.7689543042957385, + "grad_norm": 18.07971954345703, + "learning_rate": 2.5596120935539076e-05, + "loss": 1.5832, + "step": 4493 + }, + { + "epoch": 0.7691254492555194, + "grad_norm": 25.531024932861328, + "learning_rate": 2.560182544209926e-05, + "loss": 2.5085, + "step": 4494 + }, + { + "epoch": 0.7692965942153004, + "grad_norm": 26.901735305786133, + "learning_rate": 2.560752994865944e-05, + "loss": 2.7555, + "step": 4495 + }, + { + "epoch": 
0.7694677391750813, + "grad_norm": 1.0921388864517212, + "learning_rate": 2.5613234455219623e-05, + "loss": 0.2269, + "step": 4496 + }, + { + "epoch": 0.7696388841348623, + "grad_norm": 24.066415786743164, + "learning_rate": 2.5618938961779806e-05, + "loss": 2.2307, + "step": 4497 + }, + { + "epoch": 0.7698100290946431, + "grad_norm": 27.785593032836914, + "learning_rate": 2.5624643468339986e-05, + "loss": 3.4921, + "step": 4498 + }, + { + "epoch": 0.7699811740544241, + "grad_norm": 29.524158477783203, + "learning_rate": 2.563034797490017e-05, + "loss": 3.7322, + "step": 4499 + }, + { + "epoch": 0.770152319014205, + "grad_norm": 8.157841682434082, + "learning_rate": 2.5636052481460356e-05, + "loss": 0.6122, + "step": 4500 + }, + { + "epoch": 0.770323463973986, + "grad_norm": 0.7895174026489258, + "learning_rate": 2.564175698802054e-05, + "loss": 0.2004, + "step": 4501 + }, + { + "epoch": 0.7704946089337669, + "grad_norm": 19.612058639526367, + "learning_rate": 2.564746149458072e-05, + "loss": 1.4928, + "step": 4502 + }, + { + "epoch": 0.7706657538935479, + "grad_norm": 24.910694122314453, + "learning_rate": 2.5653166001140903e-05, + "loss": 3.1004, + "step": 4503 + }, + { + "epoch": 0.7708368988533287, + "grad_norm": 16.94511604309082, + "learning_rate": 2.5658870507701087e-05, + "loss": 1.5157, + "step": 4504 + }, + { + "epoch": 0.7710080438131097, + "grad_norm": 18.35201072692871, + "learning_rate": 2.5664575014261267e-05, + "loss": 1.6228, + "step": 4505 + }, + { + "epoch": 0.7711791887728906, + "grad_norm": 1.8921977281570435, + "learning_rate": 2.567027952082145e-05, + "loss": 0.2404, + "step": 4506 + }, + { + "epoch": 0.7713503337326716, + "grad_norm": 15.37820053100586, + "learning_rate": 2.5675984027381633e-05, + "loss": 1.4636, + "step": 4507 + }, + { + "epoch": 0.7715214786924525, + "grad_norm": 45.28339767456055, + "learning_rate": 2.5681688533941817e-05, + "loss": 5.8686, + "step": 4508 + }, + { + "epoch": 0.7716926236522335, + "grad_norm": 10.306188583374023, + "learning_rate": 2.5687393040501997e-05, + "loss": 0.8692, + "step": 4509 + }, + { + "epoch": 0.7718637686120143, + "grad_norm": 7.009439468383789, + "learning_rate": 2.569309754706218e-05, + "loss": 0.4821, + "step": 4510 + }, + { + "epoch": 0.7720349135717953, + "grad_norm": 16.78397560119629, + "learning_rate": 2.5698802053622363e-05, + "loss": 1.4687, + "step": 4511 + }, + { + "epoch": 0.7722060585315762, + "grad_norm": 27.457345962524414, + "learning_rate": 2.5704506560182543e-05, + "loss": 5.8033, + "step": 4512 + }, + { + "epoch": 0.7723772034913572, + "grad_norm": 18.70033836364746, + "learning_rate": 2.5710211066742727e-05, + "loss": 1.7159, + "step": 4513 + }, + { + "epoch": 0.7725483484511381, + "grad_norm": 6.5526204109191895, + "learning_rate": 2.571591557330291e-05, + "loss": 0.6285, + "step": 4514 + }, + { + "epoch": 0.7727194934109191, + "grad_norm": 27.923290252685547, + "learning_rate": 2.572162007986309e-05, + "loss": 2.8709, + "step": 4515 + }, + { + "epoch": 0.7728906383706999, + "grad_norm": 21.99191665649414, + "learning_rate": 2.5727324586423274e-05, + "loss": 1.7881, + "step": 4516 + }, + { + "epoch": 0.7730617833304809, + "grad_norm": 23.24179458618164, + "learning_rate": 2.5733029092983457e-05, + "loss": 2.0511, + "step": 4517 + }, + { + "epoch": 0.7732329282902618, + "grad_norm": 17.54611587524414, + "learning_rate": 2.573873359954364e-05, + "loss": 1.326, + "step": 4518 + }, + { + "epoch": 0.7734040732500428, + "grad_norm": 30.525188446044922, + "learning_rate": 2.574443810610382e-05, + 
"loss": 5.8319, + "step": 4519 + }, + { + "epoch": 0.7735752182098237, + "grad_norm": 22.358930587768555, + "learning_rate": 2.5750142612664004e-05, + "loss": 2.7106, + "step": 4520 + }, + { + "epoch": 0.7737463631696047, + "grad_norm": 28.14476776123047, + "learning_rate": 2.5755847119224187e-05, + "loss": 2.6532, + "step": 4521 + }, + { + "epoch": 0.7739175081293856, + "grad_norm": 14.640401840209961, + "learning_rate": 2.576155162578437e-05, + "loss": 1.2756, + "step": 4522 + }, + { + "epoch": 0.7740886530891665, + "grad_norm": 22.830739974975586, + "learning_rate": 2.5767256132344554e-05, + "loss": 1.9665, + "step": 4523 + }, + { + "epoch": 0.7742597980489475, + "grad_norm": 2.0804736614227295, + "learning_rate": 2.5772960638904737e-05, + "loss": 0.2506, + "step": 4524 + }, + { + "epoch": 0.7744309430087284, + "grad_norm": 80.72746276855469, + "learning_rate": 2.577866514546492e-05, + "loss": 2.0284, + "step": 4525 + }, + { + "epoch": 0.7746020879685094, + "grad_norm": 1.6171777248382568, + "learning_rate": 2.57843696520251e-05, + "loss": 0.2526, + "step": 4526 + }, + { + "epoch": 0.7747732329282903, + "grad_norm": 1.1948031187057495, + "learning_rate": 2.5790074158585284e-05, + "loss": 0.2329, + "step": 4527 + }, + { + "epoch": 0.7749443778880712, + "grad_norm": 16.24471664428711, + "learning_rate": 2.5795778665145467e-05, + "loss": 1.2654, + "step": 4528 + }, + { + "epoch": 0.7751155228478521, + "grad_norm": 11.590794563293457, + "learning_rate": 2.5801483171705647e-05, + "loss": 0.7527, + "step": 4529 + }, + { + "epoch": 0.7752866678076331, + "grad_norm": 21.045690536499023, + "learning_rate": 2.580718767826583e-05, + "loss": 2.1076, + "step": 4530 + }, + { + "epoch": 0.775457812767414, + "grad_norm": 29.146739959716797, + "learning_rate": 2.5812892184826014e-05, + "loss": 2.3028, + "step": 4531 + }, + { + "epoch": 0.775628957727195, + "grad_norm": 27.92205810546875, + "learning_rate": 2.5818596691386197e-05, + "loss": 2.6337, + "step": 4532 + }, + { + "epoch": 0.7758001026869759, + "grad_norm": 4.507087707519531, + "learning_rate": 2.5824301197946377e-05, + "loss": 0.3644, + "step": 4533 + }, + { + "epoch": 0.7759712476467568, + "grad_norm": 31.79628562927246, + "learning_rate": 2.583000570450656e-05, + "loss": 2.0363, + "step": 4534 + }, + { + "epoch": 0.7761423926065377, + "grad_norm": 20.116243362426758, + "learning_rate": 2.5835710211066744e-05, + "loss": 2.037, + "step": 4535 + }, + { + "epoch": 0.7763135375663187, + "grad_norm": 20.690771102905273, + "learning_rate": 2.5841414717626924e-05, + "loss": 2.5927, + "step": 4536 + }, + { + "epoch": 0.7764846825260996, + "grad_norm": 26.378053665161133, + "learning_rate": 2.5847119224187108e-05, + "loss": 2.4488, + "step": 4537 + }, + { + "epoch": 0.7766558274858806, + "grad_norm": 1.2249003648757935, + "learning_rate": 2.585282373074729e-05, + "loss": 0.2434, + "step": 4538 + }, + { + "epoch": 0.7768269724456615, + "grad_norm": 6.3218793869018555, + "learning_rate": 2.5858528237307474e-05, + "loss": 0.5515, + "step": 4539 + }, + { + "epoch": 0.7769981174054424, + "grad_norm": 2.316464424133301, + "learning_rate": 2.5864232743867654e-05, + "loss": 0.2183, + "step": 4540 + }, + { + "epoch": 0.7771692623652233, + "grad_norm": 19.063066482543945, + "learning_rate": 2.5869937250427838e-05, + "loss": 1.6202, + "step": 4541 + }, + { + "epoch": 0.7773404073250043, + "grad_norm": 13.33887767791748, + "learning_rate": 2.587564175698802e-05, + "loss": 1.0805, + "step": 4542 + }, + { + "epoch": 0.7775115522847852, + "grad_norm": 
60.49800109863281, + "learning_rate": 2.58813462635482e-05, + "loss": 2.2874, + "step": 4543 + }, + { + "epoch": 0.7776826972445662, + "grad_norm": 1.045248031616211, + "learning_rate": 2.5887050770108384e-05, + "loss": 0.2115, + "step": 4544 + }, + { + "epoch": 0.7778538422043471, + "grad_norm": 0.962874710559845, + "learning_rate": 2.589275527666857e-05, + "loss": 0.2111, + "step": 4545 + }, + { + "epoch": 0.778024987164128, + "grad_norm": 9.272252082824707, + "learning_rate": 2.589845978322875e-05, + "loss": 0.6141, + "step": 4546 + }, + { + "epoch": 0.7781961321239089, + "grad_norm": 1.7271431684494019, + "learning_rate": 2.5904164289788935e-05, + "loss": 0.2153, + "step": 4547 + }, + { + "epoch": 0.7783672770836899, + "grad_norm": 10.894009590148926, + "learning_rate": 2.5909868796349118e-05, + "loss": 0.7097, + "step": 4548 + }, + { + "epoch": 0.7785384220434708, + "grad_norm": 26.39044761657715, + "learning_rate": 2.59155733029093e-05, + "loss": 2.3284, + "step": 4549 + }, + { + "epoch": 0.7787095670032518, + "grad_norm": 18.677852630615234, + "learning_rate": 2.592127780946948e-05, + "loss": 1.6088, + "step": 4550 + }, + { + "epoch": 0.7788807119630327, + "grad_norm": 24.6732177734375, + "learning_rate": 2.5926982316029665e-05, + "loss": 3.2763, + "step": 4551 + }, + { + "epoch": 0.7790518569228136, + "grad_norm": 19.83483123779297, + "learning_rate": 2.5932686822589848e-05, + "loss": 1.9389, + "step": 4552 + }, + { + "epoch": 0.7792230018825945, + "grad_norm": 20.399280548095703, + "learning_rate": 2.5938391329150028e-05, + "loss": 1.7964, + "step": 4553 + }, + { + "epoch": 0.7793941468423755, + "grad_norm": 17.091896057128906, + "learning_rate": 2.594409583571021e-05, + "loss": 1.3734, + "step": 4554 + }, + { + "epoch": 0.7795652918021564, + "grad_norm": 31.487939834594727, + "learning_rate": 2.5949800342270395e-05, + "loss": 2.4755, + "step": 4555 + }, + { + "epoch": 0.7797364367619374, + "grad_norm": 31.707317352294922, + "learning_rate": 2.5955504848830578e-05, + "loss": 4.5809, + "step": 4556 + }, + { + "epoch": 0.7799075817217183, + "grad_norm": 14.886727333068848, + "learning_rate": 2.5961209355390758e-05, + "loss": 0.8313, + "step": 4557 + }, + { + "epoch": 0.7800787266814992, + "grad_norm": 21.29567527770996, + "learning_rate": 2.596691386195094e-05, + "loss": 1.757, + "step": 4558 + }, + { + "epoch": 0.7802498716412801, + "grad_norm": 68.5009765625, + "learning_rate": 2.5972618368511125e-05, + "loss": 2.4949, + "step": 4559 + }, + { + "epoch": 0.7804210166010611, + "grad_norm": 34.35353088378906, + "learning_rate": 2.5978322875071305e-05, + "loss": 2.2289, + "step": 4560 + }, + { + "epoch": 0.780592161560842, + "grad_norm": 23.028303146362305, + "learning_rate": 2.598402738163149e-05, + "loss": 2.4301, + "step": 4561 + }, + { + "epoch": 0.780763306520623, + "grad_norm": 19.792776107788086, + "learning_rate": 2.5989731888191672e-05, + "loss": 1.9774, + "step": 4562 + }, + { + "epoch": 0.7809344514804039, + "grad_norm": 21.630163192749023, + "learning_rate": 2.5995436394751855e-05, + "loss": 1.9028, + "step": 4563 + }, + { + "epoch": 0.7811055964401848, + "grad_norm": 28.1812744140625, + "learning_rate": 2.6001140901312035e-05, + "loss": 3.1802, + "step": 4564 + }, + { + "epoch": 0.7812767413999657, + "grad_norm": 24.079713821411133, + "learning_rate": 2.600684540787222e-05, + "loss": 2.3985, + "step": 4565 + }, + { + "epoch": 0.7814478863597467, + "grad_norm": 20.737655639648438, + "learning_rate": 2.6012549914432402e-05, + "loss": 2.114, + "step": 4566 + }, + { + 
"epoch": 0.7816190313195276, + "grad_norm": 24.673608779907227, + "learning_rate": 2.6018254420992582e-05, + "loss": 2.7534, + "step": 4567 + }, + { + "epoch": 0.7817901762793086, + "grad_norm": 26.247447967529297, + "learning_rate": 2.602395892755277e-05, + "loss": 2.557, + "step": 4568 + }, + { + "epoch": 0.7819613212390895, + "grad_norm": 5.863075256347656, + "learning_rate": 2.6029663434112952e-05, + "loss": 0.5004, + "step": 4569 + }, + { + "epoch": 0.7821324661988704, + "grad_norm": 17.882400512695312, + "learning_rate": 2.6035367940673135e-05, + "loss": 1.7885, + "step": 4570 + }, + { + "epoch": 0.7823036111586513, + "grad_norm": 38.73212814331055, + "learning_rate": 2.6041072447233315e-05, + "loss": 6.2463, + "step": 4571 + }, + { + "epoch": 0.7824747561184323, + "grad_norm": 22.570146560668945, + "learning_rate": 2.60467769537935e-05, + "loss": 2.3429, + "step": 4572 + }, + { + "epoch": 0.7826459010782133, + "grad_norm": 33.48434066772461, + "learning_rate": 2.6052481460353682e-05, + "loss": 4.7106, + "step": 4573 + }, + { + "epoch": 0.7828170460379942, + "grad_norm": 8.184353828430176, + "learning_rate": 2.6058185966913862e-05, + "loss": 0.5866, + "step": 4574 + }, + { + "epoch": 0.7829881909977752, + "grad_norm": 24.859272003173828, + "learning_rate": 2.6063890473474045e-05, + "loss": 2.7946, + "step": 4575 + }, + { + "epoch": 0.783159335957556, + "grad_norm": 24.745906829833984, + "learning_rate": 2.606959498003423e-05, + "loss": 2.6296, + "step": 4576 + }, + { + "epoch": 0.783330480917337, + "grad_norm": 21.468034744262695, + "learning_rate": 2.6075299486594412e-05, + "loss": 2.0762, + "step": 4577 + }, + { + "epoch": 0.7835016258771179, + "grad_norm": 19.518083572387695, + "learning_rate": 2.6081003993154592e-05, + "loss": 1.7499, + "step": 4578 + }, + { + "epoch": 0.7836727708368989, + "grad_norm": 25.370912551879883, + "learning_rate": 2.6086708499714776e-05, + "loss": 2.0335, + "step": 4579 + }, + { + "epoch": 0.7838439157966798, + "grad_norm": 9.235052108764648, + "learning_rate": 2.609241300627496e-05, + "loss": 0.7306, + "step": 4580 + }, + { + "epoch": 0.7840150607564608, + "grad_norm": 26.33196449279785, + "learning_rate": 2.609811751283514e-05, + "loss": 2.8747, + "step": 4581 + }, + { + "epoch": 0.7841862057162416, + "grad_norm": 22.177465438842773, + "learning_rate": 2.6103822019395322e-05, + "loss": 1.9306, + "step": 4582 + }, + { + "epoch": 0.7843573506760226, + "grad_norm": 15.260997772216797, + "learning_rate": 2.6109526525955506e-05, + "loss": 1.5436, + "step": 4583 + }, + { + "epoch": 0.7845284956358035, + "grad_norm": 17.693405151367188, + "learning_rate": 2.6115231032515686e-05, + "loss": 1.7023, + "step": 4584 + }, + { + "epoch": 0.7846996405955845, + "grad_norm": 19.666189193725586, + "learning_rate": 2.612093553907587e-05, + "loss": 1.7938, + "step": 4585 + }, + { + "epoch": 0.7848707855553654, + "grad_norm": 1.3055251836776733, + "learning_rate": 2.6126640045636052e-05, + "loss": 0.2317, + "step": 4586 + }, + { + "epoch": 0.7850419305151464, + "grad_norm": 23.14202880859375, + "learning_rate": 2.6132344552196236e-05, + "loss": 2.4039, + "step": 4587 + }, + { + "epoch": 0.7852130754749272, + "grad_norm": 24.73307991027832, + "learning_rate": 2.6138049058756416e-05, + "loss": 2.5498, + "step": 4588 + }, + { + "epoch": 0.7853842204347082, + "grad_norm": 14.071855545043945, + "learning_rate": 2.61437535653166e-05, + "loss": 1.1738, + "step": 4589 + }, + { + "epoch": 0.7855553653944891, + "grad_norm": 31.454723358154297, + "learning_rate": 
2.6149458071876783e-05, + "loss": 6.066, + "step": 4590 + }, + { + "epoch": 0.7857265103542701, + "grad_norm": 10.21793270111084, + "learning_rate": 2.6155162578436966e-05, + "loss": 0.7021, + "step": 4591 + }, + { + "epoch": 0.785897655314051, + "grad_norm": 0.9553564190864563, + "learning_rate": 2.616086708499715e-05, + "loss": 0.2104, + "step": 4592 + }, + { + "epoch": 0.786068800273832, + "grad_norm": 10.402186393737793, + "learning_rate": 2.6166571591557333e-05, + "loss": 0.6858, + "step": 4593 + }, + { + "epoch": 0.7862399452336128, + "grad_norm": 83.13465118408203, + "learning_rate": 2.6172276098117516e-05, + "loss": 8.0191, + "step": 4594 + }, + { + "epoch": 0.7864110901933938, + "grad_norm": 79.72745513916016, + "learning_rate": 2.6177980604677696e-05, + "loss": 7.8977, + "step": 4595 + }, + { + "epoch": 0.7865822351531747, + "grad_norm": 19.631969451904297, + "learning_rate": 2.618368511123788e-05, + "loss": 1.6448, + "step": 4596 + }, + { + "epoch": 0.7867533801129557, + "grad_norm": 21.243122100830078, + "learning_rate": 2.6189389617798063e-05, + "loss": 2.725, + "step": 4597 + }, + { + "epoch": 0.7869245250727366, + "grad_norm": 27.33664321899414, + "learning_rate": 2.6195094124358243e-05, + "loss": 6.0632, + "step": 4598 + }, + { + "epoch": 0.7870956700325176, + "grad_norm": 27.157442092895508, + "learning_rate": 2.6200798630918426e-05, + "loss": 2.6574, + "step": 4599 + }, + { + "epoch": 0.7872668149922984, + "grad_norm": 16.595340728759766, + "learning_rate": 2.620650313747861e-05, + "loss": 1.8749, + "step": 4600 + }, + { + "epoch": 0.7874379599520794, + "grad_norm": 5.997491836547852, + "learning_rate": 2.6212207644038793e-05, + "loss": 0.8366, + "step": 4601 + }, + { + "epoch": 0.7876091049118603, + "grad_norm": 8.601006507873535, + "learning_rate": 2.6217912150598973e-05, + "loss": 0.6105, + "step": 4602 + }, + { + "epoch": 0.7877802498716413, + "grad_norm": 22.949264526367188, + "learning_rate": 2.6223616657159156e-05, + "loss": 2.0644, + "step": 4603 + }, + { + "epoch": 0.7879513948314222, + "grad_norm": 20.955198287963867, + "learning_rate": 2.622932116371934e-05, + "loss": 1.9205, + "step": 4604 + }, + { + "epoch": 0.7881225397912032, + "grad_norm": 4.057135105133057, + "learning_rate": 2.623502567027952e-05, + "loss": 0.531, + "step": 4605 + }, + { + "epoch": 0.788293684750984, + "grad_norm": 38.576942443847656, + "learning_rate": 2.6240730176839703e-05, + "loss": 1.9383, + "step": 4606 + }, + { + "epoch": 0.788464829710765, + "grad_norm": 28.404165267944336, + "learning_rate": 2.6246434683399886e-05, + "loss": 2.4168, + "step": 4607 + }, + { + "epoch": 0.7886359746705459, + "grad_norm": 30.351381301879883, + "learning_rate": 2.625213918996007e-05, + "loss": 5.746, + "step": 4608 + }, + { + "epoch": 0.7888071196303269, + "grad_norm": 6.266965389251709, + "learning_rate": 2.625784369652025e-05, + "loss": 0.5733, + "step": 4609 + }, + { + "epoch": 0.7889782645901078, + "grad_norm": 20.09222984313965, + "learning_rate": 2.6263548203080433e-05, + "loss": 1.7396, + "step": 4610 + }, + { + "epoch": 0.7891494095498888, + "grad_norm": 29.68480110168457, + "learning_rate": 2.6269252709640617e-05, + "loss": 3.2917, + "step": 4611 + }, + { + "epoch": 0.7893205545096696, + "grad_norm": 28.49522590637207, + "learning_rate": 2.6274957216200797e-05, + "loss": 3.904, + "step": 4612 + }, + { + "epoch": 0.7894916994694506, + "grad_norm": 24.60774803161621, + "learning_rate": 2.628066172276098e-05, + "loss": 2.4849, + "step": 4613 + }, + { + "epoch": 0.7896628444292315, + 
"grad_norm": 19.17363929748535, + "learning_rate": 2.6286366229321167e-05, + "loss": 1.6276, + "step": 4614 + }, + { + "epoch": 0.7898339893890125, + "grad_norm": 27.375503540039062, + "learning_rate": 2.6292070735881347e-05, + "loss": 3.1904, + "step": 4615 + }, + { + "epoch": 0.7900051343487934, + "grad_norm": 7.622123718261719, + "learning_rate": 2.629777524244153e-05, + "loss": 0.6504, + "step": 4616 + }, + { + "epoch": 0.7901762793085744, + "grad_norm": 18.307863235473633, + "learning_rate": 2.6303479749001713e-05, + "loss": 1.6608, + "step": 4617 + }, + { + "epoch": 0.7903474242683552, + "grad_norm": 23.98590850830078, + "learning_rate": 2.6309184255561897e-05, + "loss": 2.2049, + "step": 4618 + }, + { + "epoch": 0.7905185692281362, + "grad_norm": 5.612167835235596, + "learning_rate": 2.6314888762122077e-05, + "loss": 0.552, + "step": 4619 + }, + { + "epoch": 0.7906897141879171, + "grad_norm": 26.797847747802734, + "learning_rate": 2.632059326868226e-05, + "loss": 2.891, + "step": 4620 + }, + { + "epoch": 0.7908608591476981, + "grad_norm": 26.22957420349121, + "learning_rate": 2.6326297775242444e-05, + "loss": 2.4378, + "step": 4621 + }, + { + "epoch": 0.791032004107479, + "grad_norm": 1.1691210269927979, + "learning_rate": 2.6332002281802624e-05, + "loss": 0.2558, + "step": 4622 + }, + { + "epoch": 0.79120314906726, + "grad_norm": 3.431807041168213, + "learning_rate": 2.6337706788362807e-05, + "loss": 0.2987, + "step": 4623 + }, + { + "epoch": 0.791374294027041, + "grad_norm": 9.09062385559082, + "learning_rate": 2.634341129492299e-05, + "loss": 1.2888, + "step": 4624 + }, + { + "epoch": 0.7915454389868218, + "grad_norm": 15.95527172088623, + "learning_rate": 2.6349115801483174e-05, + "loss": 1.237, + "step": 4625 + }, + { + "epoch": 0.7917165839466028, + "grad_norm": 15.407099723815918, + "learning_rate": 2.6354820308043354e-05, + "loss": 1.4981, + "step": 4626 + }, + { + "epoch": 0.7918877289063837, + "grad_norm": 21.4428653717041, + "learning_rate": 2.6360524814603537e-05, + "loss": 1.9965, + "step": 4627 + }, + { + "epoch": 0.7920588738661647, + "grad_norm": 32.93854522705078, + "learning_rate": 2.636622932116372e-05, + "loss": 1.6353, + "step": 4628 + }, + { + "epoch": 0.7922300188259456, + "grad_norm": 17.02910804748535, + "learning_rate": 2.63719338277239e-05, + "loss": 1.6158, + "step": 4629 + }, + { + "epoch": 0.7924011637857266, + "grad_norm": 18.701148986816406, + "learning_rate": 2.6377638334284084e-05, + "loss": 1.4652, + "step": 4630 + }, + { + "epoch": 0.7925723087455074, + "grad_norm": 20.513490676879883, + "learning_rate": 2.6383342840844267e-05, + "loss": 2.1445, + "step": 4631 + }, + { + "epoch": 0.7927434537052884, + "grad_norm": 33.653228759765625, + "learning_rate": 2.638904734740445e-05, + "loss": 3.4384, + "step": 4632 + }, + { + "epoch": 0.7929145986650693, + "grad_norm": 22.135591506958008, + "learning_rate": 2.639475185396463e-05, + "loss": 1.9552, + "step": 4633 + }, + { + "epoch": 0.7930857436248503, + "grad_norm": 7.528356552124023, + "learning_rate": 2.6400456360524814e-05, + "loss": 0.6067, + "step": 4634 + }, + { + "epoch": 0.7932568885846312, + "grad_norm": 17.62458038330078, + "learning_rate": 2.6406160867084997e-05, + "loss": 1.4482, + "step": 4635 + }, + { + "epoch": 0.7934280335444122, + "grad_norm": 1.2217499017715454, + "learning_rate": 2.6411865373645177e-05, + "loss": 0.2197, + "step": 4636 + }, + { + "epoch": 0.793599178504193, + "grad_norm": 26.31184196472168, + "learning_rate": 2.6417569880205364e-05, + "loss": 2.5345, + "step": 4637 + }, 
+ { + "epoch": 0.793770323463974, + "grad_norm": 1.5256792306900024, + "learning_rate": 2.6423274386765547e-05, + "loss": 0.2251, + "step": 4638 + }, + { + "epoch": 0.7939414684237549, + "grad_norm": 14.517436981201172, + "learning_rate": 2.642897889332573e-05, + "loss": 0.9695, + "step": 4639 + }, + { + "epoch": 0.7941126133835359, + "grad_norm": 23.73163414001465, + "learning_rate": 2.643468339988591e-05, + "loss": 2.5979, + "step": 4640 + }, + { + "epoch": 0.7942837583433168, + "grad_norm": 4.0032172203063965, + "learning_rate": 2.6440387906446094e-05, + "loss": 0.3854, + "step": 4641 + }, + { + "epoch": 0.7944549033030978, + "grad_norm": 17.482643127441406, + "learning_rate": 2.6446092413006278e-05, + "loss": 1.5197, + "step": 4642 + }, + { + "epoch": 0.7946260482628786, + "grad_norm": 14.50070571899414, + "learning_rate": 2.6451796919566458e-05, + "loss": 1.1852, + "step": 4643 + }, + { + "epoch": 0.7947971932226596, + "grad_norm": 25.02318572998047, + "learning_rate": 2.645750142612664e-05, + "loss": 2.3457, + "step": 4644 + }, + { + "epoch": 0.7949683381824405, + "grad_norm": 31.2506103515625, + "learning_rate": 2.6463205932686824e-05, + "loss": 2.6284, + "step": 4645 + }, + { + "epoch": 0.7951394831422215, + "grad_norm": 7.928152084350586, + "learning_rate": 2.6468910439247004e-05, + "loss": 0.6422, + "step": 4646 + }, + { + "epoch": 0.7953106281020024, + "grad_norm": 0.9403290152549744, + "learning_rate": 2.6474614945807188e-05, + "loss": 0.1969, + "step": 4647 + }, + { + "epoch": 0.7954817730617834, + "grad_norm": 56.036251068115234, + "learning_rate": 2.648031945236737e-05, + "loss": 2.2416, + "step": 4648 + }, + { + "epoch": 0.7956529180215642, + "grad_norm": 35.657833099365234, + "learning_rate": 2.6486023958927554e-05, + "loss": 6.1541, + "step": 4649 + }, + { + "epoch": 0.7958240629813452, + "grad_norm": 14.264200210571289, + "learning_rate": 2.6491728465487734e-05, + "loss": 1.4948, + "step": 4650 + }, + { + "epoch": 0.7959952079411261, + "grad_norm": 0.9517439603805542, + "learning_rate": 2.6497432972047918e-05, + "loss": 0.1973, + "step": 4651 + }, + { + "epoch": 0.7961663529009071, + "grad_norm": 3.9652233123779297, + "learning_rate": 2.65031374786081e-05, + "loss": 0.3578, + "step": 4652 + }, + { + "epoch": 0.796337497860688, + "grad_norm": 20.68568229675293, + "learning_rate": 2.650884198516828e-05, + "loss": 1.6687, + "step": 4653 + }, + { + "epoch": 0.796508642820469, + "grad_norm": 36.3359375, + "learning_rate": 2.6514546491728465e-05, + "loss": 6.1452, + "step": 4654 + }, + { + "epoch": 0.7966797877802498, + "grad_norm": 13.956791877746582, + "learning_rate": 2.6520250998288648e-05, + "loss": 0.8717, + "step": 4655 + }, + { + "epoch": 0.7968509327400308, + "grad_norm": 30.4771671295166, + "learning_rate": 2.652595550484883e-05, + "loss": 3.0092, + "step": 4656 + }, + { + "epoch": 0.7970220776998117, + "grad_norm": 6.4612226486206055, + "learning_rate": 2.653166001140901e-05, + "loss": 0.6398, + "step": 4657 + }, + { + "epoch": 0.7971932226595927, + "grad_norm": 16.6127986907959, + "learning_rate": 2.6537364517969195e-05, + "loss": 1.8739, + "step": 4658 + }, + { + "epoch": 0.7973643676193736, + "grad_norm": 21.27251625061035, + "learning_rate": 2.6543069024529378e-05, + "loss": 1.9702, + "step": 4659 + }, + { + "epoch": 0.7975355125791546, + "grad_norm": 18.95979118347168, + "learning_rate": 2.654877353108956e-05, + "loss": 1.6295, + "step": 4660 + }, + { + "epoch": 0.7977066575389354, + "grad_norm": 10.863266944885254, + "learning_rate": 2.6554478037649745e-05, + 
"loss": 0.7761, + "step": 4661 + }, + { + "epoch": 0.7978778024987164, + "grad_norm": 10.78805160522461, + "learning_rate": 2.6560182544209928e-05, + "loss": 0.9169, + "step": 4662 + }, + { + "epoch": 0.7980489474584973, + "grad_norm": 21.447656631469727, + "learning_rate": 2.656588705077011e-05, + "loss": 2.3001, + "step": 4663 + }, + { + "epoch": 0.7982200924182783, + "grad_norm": 7.096908092498779, + "learning_rate": 2.657159155733029e-05, + "loss": 0.6332, + "step": 4664 + }, + { + "epoch": 0.7983912373780592, + "grad_norm": 11.815482139587402, + "learning_rate": 2.6577296063890475e-05, + "loss": 0.9556, + "step": 4665 + }, + { + "epoch": 0.7985623823378402, + "grad_norm": 25.208463668823242, + "learning_rate": 2.658300057045066e-05, + "loss": 2.7888, + "step": 4666 + }, + { + "epoch": 0.798733527297621, + "grad_norm": 28.86720848083496, + "learning_rate": 2.658870507701084e-05, + "loss": 1.907, + "step": 4667 + }, + { + "epoch": 0.798904672257402, + "grad_norm": 6.2774858474731445, + "learning_rate": 2.6594409583571022e-05, + "loss": 0.4982, + "step": 4668 + }, + { + "epoch": 0.7990758172171829, + "grad_norm": 31.023252487182617, + "learning_rate": 2.6600114090131205e-05, + "loss": 5.9759, + "step": 4669 + }, + { + "epoch": 0.7992469621769639, + "grad_norm": 22.48405647277832, + "learning_rate": 2.660581859669139e-05, + "loss": 2.4245, + "step": 4670 + }, + { + "epoch": 0.7994181071367448, + "grad_norm": 21.80152130126953, + "learning_rate": 2.661152310325157e-05, + "loss": 2.3967, + "step": 4671 + }, + { + "epoch": 0.7995892520965258, + "grad_norm": 24.37519645690918, + "learning_rate": 2.6617227609811752e-05, + "loss": 2.6876, + "step": 4672 + }, + { + "epoch": 0.7997603970563066, + "grad_norm": 24.82401466369629, + "learning_rate": 2.6622932116371935e-05, + "loss": 3.2294, + "step": 4673 + }, + { + "epoch": 0.7999315420160876, + "grad_norm": 0.9125476479530334, + "learning_rate": 2.6628636622932115e-05, + "loss": 0.2049, + "step": 4674 + }, + { + "epoch": 0.8001026869758686, + "grad_norm": 20.301301956176758, + "learning_rate": 2.66343411294923e-05, + "loss": 2.0656, + "step": 4675 + }, + { + "epoch": 0.8002738319356495, + "grad_norm": 17.966495513916016, + "learning_rate": 2.6640045636052482e-05, + "loss": 1.658, + "step": 4676 + }, + { + "epoch": 0.8004449768954305, + "grad_norm": 0.8491156697273254, + "learning_rate": 2.6645750142612665e-05, + "loss": 0.2129, + "step": 4677 + }, + { + "epoch": 0.8006161218552114, + "grad_norm": 21.60484504699707, + "learning_rate": 2.6651454649172845e-05, + "loss": 1.6655, + "step": 4678 + }, + { + "epoch": 0.8007872668149923, + "grad_norm": 19.46196174621582, + "learning_rate": 2.665715915573303e-05, + "loss": 1.7872, + "step": 4679 + }, + { + "epoch": 0.8009584117747732, + "grad_norm": 21.08289909362793, + "learning_rate": 2.6662863662293212e-05, + "loss": 2.101, + "step": 4680 + }, + { + "epoch": 0.8011295567345542, + "grad_norm": 19.137561798095703, + "learning_rate": 2.6668568168853392e-05, + "loss": 1.9419, + "step": 4681 + }, + { + "epoch": 0.8013007016943351, + "grad_norm": 22.642850875854492, + "learning_rate": 2.667427267541358e-05, + "loss": 2.2932, + "step": 4682 + }, + { + "epoch": 0.8014718466541161, + "grad_norm": 31.17798614501953, + "learning_rate": 2.6679977181973762e-05, + "loss": 2.9005, + "step": 4683 + }, + { + "epoch": 0.801642991613897, + "grad_norm": 21.248584747314453, + "learning_rate": 2.6685681688533942e-05, + "loss": 2.0125, + "step": 4684 + }, + { + "epoch": 0.801814136573678, + "grad_norm": 2.3411998748779297, + 
"learning_rate": 2.6691386195094126e-05, + "loss": 0.2708, + "step": 4685 + }, + { + "epoch": 0.8019852815334588, + "grad_norm": 1.263325810432434, + "learning_rate": 2.669709070165431e-05, + "loss": 0.2294, + "step": 4686 + }, + { + "epoch": 0.8021564264932398, + "grad_norm": 24.95157814025879, + "learning_rate": 2.6702795208214492e-05, + "loss": 3.1625, + "step": 4687 + }, + { + "epoch": 0.8023275714530207, + "grad_norm": 26.514177322387695, + "learning_rate": 2.6708499714774672e-05, + "loss": 3.3918, + "step": 4688 + }, + { + "epoch": 0.8024987164128017, + "grad_norm": 12.605335235595703, + "learning_rate": 2.6714204221334856e-05, + "loss": 0.9508, + "step": 4689 + }, + { + "epoch": 0.8026698613725826, + "grad_norm": 19.918142318725586, + "learning_rate": 2.671990872789504e-05, + "loss": 1.5214, + "step": 4690 + }, + { + "epoch": 0.8028410063323635, + "grad_norm": 1.9992042779922485, + "learning_rate": 2.672561323445522e-05, + "loss": 0.2171, + "step": 4691 + }, + { + "epoch": 0.8030121512921444, + "grad_norm": 9.957181930541992, + "learning_rate": 2.6731317741015403e-05, + "loss": 0.7101, + "step": 4692 + }, + { + "epoch": 0.8031832962519254, + "grad_norm": 25.742799758911133, + "learning_rate": 2.6737022247575586e-05, + "loss": 2.705, + "step": 4693 + }, + { + "epoch": 0.8033544412117063, + "grad_norm": 25.987510681152344, + "learning_rate": 2.674272675413577e-05, + "loss": 2.4936, + "step": 4694 + }, + { + "epoch": 0.8035255861714873, + "grad_norm": 19.842357635498047, + "learning_rate": 2.674843126069595e-05, + "loss": 1.7686, + "step": 4695 + }, + { + "epoch": 0.8036967311312682, + "grad_norm": 26.451980590820312, + "learning_rate": 2.6754135767256133e-05, + "loss": 2.3944, + "step": 4696 + }, + { + "epoch": 0.8038678760910491, + "grad_norm": 15.866630554199219, + "learning_rate": 2.6759840273816316e-05, + "loss": 1.239, + "step": 4697 + }, + { + "epoch": 0.80403902105083, + "grad_norm": 58.552215576171875, + "learning_rate": 2.6765544780376496e-05, + "loss": 1.9071, + "step": 4698 + }, + { + "epoch": 0.804210166010611, + "grad_norm": 19.024993896484375, + "learning_rate": 2.677124928693668e-05, + "loss": 1.7101, + "step": 4699 + }, + { + "epoch": 0.8043813109703919, + "grad_norm": 25.27145767211914, + "learning_rate": 2.6776953793496863e-05, + "loss": 2.543, + "step": 4700 + }, + { + "epoch": 0.8045524559301729, + "grad_norm": 7.051549911499023, + "learning_rate": 2.6782658300057046e-05, + "loss": 0.5373, + "step": 4701 + }, + { + "epoch": 0.8047236008899538, + "grad_norm": 24.07325553894043, + "learning_rate": 2.6788362806617226e-05, + "loss": 2.1983, + "step": 4702 + }, + { + "epoch": 0.8048947458497347, + "grad_norm": 16.690013885498047, + "learning_rate": 2.679406731317741e-05, + "loss": 1.6365, + "step": 4703 + }, + { + "epoch": 0.8050658908095156, + "grad_norm": 29.604305267333984, + "learning_rate": 2.6799771819737593e-05, + "loss": 1.4153, + "step": 4704 + }, + { + "epoch": 0.8052370357692966, + "grad_norm": 10.602456092834473, + "learning_rate": 2.6805476326297776e-05, + "loss": 0.8433, + "step": 4705 + }, + { + "epoch": 0.8054081807290775, + "grad_norm": 18.099092483520508, + "learning_rate": 2.681118083285796e-05, + "loss": 1.6778, + "step": 4706 + }, + { + "epoch": 0.8055793256888585, + "grad_norm": 26.6840763092041, + "learning_rate": 2.6816885339418143e-05, + "loss": 2.8567, + "step": 4707 + }, + { + "epoch": 0.8057504706486394, + "grad_norm": 25.60426139831543, + "learning_rate": 2.6822589845978326e-05, + "loss": 2.7501, + "step": 4708 + }, + { + "epoch": 
0.8059216156084203, + "grad_norm": 23.098224639892578, + "learning_rate": 2.6828294352538506e-05, + "loss": 2.5179, + "step": 4709 + }, + { + "epoch": 0.8060927605682012, + "grad_norm": 5.202851295471191, + "learning_rate": 2.683399885909869e-05, + "loss": 0.4799, + "step": 4710 + }, + { + "epoch": 0.8062639055279822, + "grad_norm": 1.1651976108551025, + "learning_rate": 2.6839703365658873e-05, + "loss": 0.2147, + "step": 4711 + }, + { + "epoch": 0.8064350504877631, + "grad_norm": 24.902393341064453, + "learning_rate": 2.6845407872219053e-05, + "loss": 2.4519, + "step": 4712 + }, + { + "epoch": 0.8066061954475441, + "grad_norm": 12.363120079040527, + "learning_rate": 2.6851112378779237e-05, + "loss": 0.9881, + "step": 4713 + }, + { + "epoch": 0.806777340407325, + "grad_norm": 5.912435054779053, + "learning_rate": 2.685681688533942e-05, + "loss": 0.5938, + "step": 4714 + }, + { + "epoch": 0.806948485367106, + "grad_norm": 106.52516174316406, + "learning_rate": 2.68625213918996e-05, + "loss": 7.5182, + "step": 4715 + }, + { + "epoch": 0.8071196303268868, + "grad_norm": 1.668204665184021, + "learning_rate": 2.6868225898459783e-05, + "loss": 0.2216, + "step": 4716 + }, + { + "epoch": 0.8072907752866678, + "grad_norm": 16.875843048095703, + "learning_rate": 2.6873930405019967e-05, + "loss": 1.6175, + "step": 4717 + }, + { + "epoch": 0.8074619202464487, + "grad_norm": 25.818157196044922, + "learning_rate": 2.687963491158015e-05, + "loss": 3.323, + "step": 4718 + }, + { + "epoch": 0.8076330652062297, + "grad_norm": 15.579858779907227, + "learning_rate": 2.688533941814033e-05, + "loss": 1.5144, + "step": 4719 + }, + { + "epoch": 0.8078042101660106, + "grad_norm": 16.2536563873291, + "learning_rate": 2.6891043924700513e-05, + "loss": 1.7982, + "step": 4720 + }, + { + "epoch": 0.8079753551257916, + "grad_norm": 24.011157989501953, + "learning_rate": 2.6896748431260697e-05, + "loss": 1.8936, + "step": 4721 + }, + { + "epoch": 0.8081465000855724, + "grad_norm": 2.2636773586273193, + "learning_rate": 2.6902452937820877e-05, + "loss": 0.2311, + "step": 4722 + }, + { + "epoch": 0.8083176450453534, + "grad_norm": 19.2104434967041, + "learning_rate": 2.690815744438106e-05, + "loss": 2.1646, + "step": 4723 + }, + { + "epoch": 0.8084887900051343, + "grad_norm": 18.140581130981445, + "learning_rate": 2.6913861950941244e-05, + "loss": 1.5111, + "step": 4724 + }, + { + "epoch": 0.8086599349649153, + "grad_norm": 17.46432113647461, + "learning_rate": 2.6919566457501427e-05, + "loss": 1.6828, + "step": 4725 + }, + { + "epoch": 0.8088310799246963, + "grad_norm": 9.412572860717773, + "learning_rate": 2.6925270964061607e-05, + "loss": 0.6319, + "step": 4726 + }, + { + "epoch": 0.8090022248844772, + "grad_norm": 27.534698486328125, + "learning_rate": 2.693097547062179e-05, + "loss": 3.4404, + "step": 4727 + }, + { + "epoch": 0.8091733698442581, + "grad_norm": 20.456804275512695, + "learning_rate": 2.6936679977181977e-05, + "loss": 1.8323, + "step": 4728 + }, + { + "epoch": 0.809344514804039, + "grad_norm": 29.691913604736328, + "learning_rate": 2.6942384483742157e-05, + "loss": 2.6556, + "step": 4729 + }, + { + "epoch": 0.80951565976382, + "grad_norm": 88.99212646484375, + "learning_rate": 2.694808899030234e-05, + "loss": 8.2099, + "step": 4730 + }, + { + "epoch": 0.8096868047236009, + "grad_norm": 24.28471565246582, + "learning_rate": 2.6953793496862524e-05, + "loss": 2.0042, + "step": 4731 + }, + { + "epoch": 0.8098579496833819, + "grad_norm": 7.255256175994873, + "learning_rate": 2.6959498003422707e-05, + "loss": 
0.5848, + "step": 4732 + }, + { + "epoch": 0.8100290946431628, + "grad_norm": 57.900230407714844, + "learning_rate": 2.6965202509982887e-05, + "loss": 1.7669, + "step": 4733 + }, + { + "epoch": 0.8102002396029437, + "grad_norm": 23.751659393310547, + "learning_rate": 2.697090701654307e-05, + "loss": 2.1911, + "step": 4734 + }, + { + "epoch": 0.8103713845627246, + "grad_norm": 29.752071380615234, + "learning_rate": 2.6976611523103254e-05, + "loss": 2.9697, + "step": 4735 + }, + { + "epoch": 0.8105425295225056, + "grad_norm": 24.04039192199707, + "learning_rate": 2.6982316029663434e-05, + "loss": 2.1467, + "step": 4736 + }, + { + "epoch": 0.8107136744822865, + "grad_norm": 7.993510723114014, + "learning_rate": 2.6988020536223617e-05, + "loss": 0.5852, + "step": 4737 + }, + { + "epoch": 0.8108848194420675, + "grad_norm": 20.55529022216797, + "learning_rate": 2.69937250427838e-05, + "loss": 1.6321, + "step": 4738 + }, + { + "epoch": 0.8110559644018484, + "grad_norm": 10.042283058166504, + "learning_rate": 2.6999429549343984e-05, + "loss": 0.7064, + "step": 4739 + }, + { + "epoch": 0.8112271093616293, + "grad_norm": 13.695612907409668, + "learning_rate": 2.7005134055904164e-05, + "loss": 1.1387, + "step": 4740 + }, + { + "epoch": 0.8113982543214102, + "grad_norm": 21.712265014648438, + "learning_rate": 2.7010838562464347e-05, + "loss": 1.6631, + "step": 4741 + }, + { + "epoch": 0.8115693992811912, + "grad_norm": 23.428848266601562, + "learning_rate": 2.701654306902453e-05, + "loss": 2.6433, + "step": 4742 + }, + { + "epoch": 0.8117405442409721, + "grad_norm": 25.32332420349121, + "learning_rate": 2.702224757558471e-05, + "loss": 2.2108, + "step": 4743 + }, + { + "epoch": 0.8119116892007531, + "grad_norm": 21.363313674926758, + "learning_rate": 2.7027952082144894e-05, + "loss": 2.5228, + "step": 4744 + }, + { + "epoch": 0.812082834160534, + "grad_norm": 8.273282051086426, + "learning_rate": 2.7033656588705078e-05, + "loss": 0.5786, + "step": 4745 + }, + { + "epoch": 0.8122539791203149, + "grad_norm": 14.859856605529785, + "learning_rate": 2.703936109526526e-05, + "loss": 1.1601, + "step": 4746 + }, + { + "epoch": 0.8124251240800958, + "grad_norm": 22.67235565185547, + "learning_rate": 2.704506560182544e-05, + "loss": 2.1171, + "step": 4747 + }, + { + "epoch": 0.8125962690398768, + "grad_norm": 20.38551139831543, + "learning_rate": 2.7050770108385624e-05, + "loss": 2.1415, + "step": 4748 + }, + { + "epoch": 0.8127674139996577, + "grad_norm": 15.979461669921875, + "learning_rate": 2.7056474614945808e-05, + "loss": 1.4184, + "step": 4749 + }, + { + "epoch": 0.8129385589594387, + "grad_norm": 15.520298957824707, + "learning_rate": 2.7062179121505988e-05, + "loss": 1.4953, + "step": 4750 + }, + { + "epoch": 0.8131097039192196, + "grad_norm": 25.01488494873047, + "learning_rate": 2.7067883628066174e-05, + "loss": 2.7059, + "step": 4751 + }, + { + "epoch": 0.8132808488790005, + "grad_norm": 5.247277736663818, + "learning_rate": 2.7073588134626358e-05, + "loss": 0.514, + "step": 4752 + }, + { + "epoch": 0.8134519938387814, + "grad_norm": 9.065279006958008, + "learning_rate": 2.7079292641186538e-05, + "loss": 0.8338, + "step": 4753 + }, + { + "epoch": 0.8136231387985624, + "grad_norm": 20.89956283569336, + "learning_rate": 2.708499714774672e-05, + "loss": 1.9556, + "step": 4754 + }, + { + "epoch": 0.8137942837583433, + "grad_norm": 22.0202579498291, + "learning_rate": 2.7090701654306905e-05, + "loss": 1.9706, + "step": 4755 + }, + { + "epoch": 0.8139654287181243, + "grad_norm": 23.732559204101562, + 
"learning_rate": 2.7096406160867088e-05, + "loss": 2.6891, + "step": 4756 + }, + { + "epoch": 0.8141365736779052, + "grad_norm": 33.41729736328125, + "learning_rate": 2.7102110667427268e-05, + "loss": 6.6742, + "step": 4757 + }, + { + "epoch": 0.8143077186376861, + "grad_norm": 24.37078285217285, + "learning_rate": 2.710781517398745e-05, + "loss": 2.2191, + "step": 4758 + }, + { + "epoch": 0.814478863597467, + "grad_norm": 1.2146947383880615, + "learning_rate": 2.7113519680547635e-05, + "loss": 0.2281, + "step": 4759 + }, + { + "epoch": 0.814650008557248, + "grad_norm": 18.136754989624023, + "learning_rate": 2.7119224187107815e-05, + "loss": 1.4566, + "step": 4760 + }, + { + "epoch": 0.8148211535170289, + "grad_norm": 51.885501861572266, + "learning_rate": 2.7124928693667998e-05, + "loss": 6.6106, + "step": 4761 + }, + { + "epoch": 0.8149922984768099, + "grad_norm": 23.535844802856445, + "learning_rate": 2.713063320022818e-05, + "loss": 2.0082, + "step": 4762 + }, + { + "epoch": 0.8151634434365908, + "grad_norm": 21.241649627685547, + "learning_rate": 2.7136337706788365e-05, + "loss": 2.3369, + "step": 4763 + }, + { + "epoch": 0.8153345883963717, + "grad_norm": 18.623498916625977, + "learning_rate": 2.7142042213348545e-05, + "loss": 1.9244, + "step": 4764 + }, + { + "epoch": 0.8155057333561526, + "grad_norm": 5.655921936035156, + "learning_rate": 2.7147746719908728e-05, + "loss": 0.5875, + "step": 4765 + }, + { + "epoch": 0.8156768783159336, + "grad_norm": 21.945968627929688, + "learning_rate": 2.715345122646891e-05, + "loss": 2.1567, + "step": 4766 + }, + { + "epoch": 0.8158480232757145, + "grad_norm": 51.72159957885742, + "learning_rate": 2.715915573302909e-05, + "loss": 6.8426, + "step": 4767 + }, + { + "epoch": 0.8160191682354955, + "grad_norm": 21.90216636657715, + "learning_rate": 2.7164860239589275e-05, + "loss": 2.084, + "step": 4768 + }, + { + "epoch": 0.8161903131952764, + "grad_norm": 11.635622024536133, + "learning_rate": 2.7170564746149458e-05, + "loss": 0.8471, + "step": 4769 + }, + { + "epoch": 0.8163614581550573, + "grad_norm": 4.031811714172363, + "learning_rate": 2.717626925270964e-05, + "loss": 0.4889, + "step": 4770 + }, + { + "epoch": 0.8165326031148382, + "grad_norm": 30.011260986328125, + "learning_rate": 2.718197375926982e-05, + "loss": 2.7194, + "step": 4771 + }, + { + "epoch": 0.8167037480746192, + "grad_norm": 18.62017250061035, + "learning_rate": 2.7187678265830005e-05, + "loss": 1.7315, + "step": 4772 + }, + { + "epoch": 0.8168748930344001, + "grad_norm": 27.6317138671875, + "learning_rate": 2.719338277239019e-05, + "loss": 3.5465, + "step": 4773 + }, + { + "epoch": 0.8170460379941811, + "grad_norm": 25.174705505371094, + "learning_rate": 2.7199087278950372e-05, + "loss": 2.5848, + "step": 4774 + }, + { + "epoch": 0.817217182953962, + "grad_norm": 25.61824607849121, + "learning_rate": 2.7204791785510555e-05, + "loss": 2.3883, + "step": 4775 + }, + { + "epoch": 0.8173883279137429, + "grad_norm": 23.317171096801758, + "learning_rate": 2.721049629207074e-05, + "loss": 1.9388, + "step": 4776 + }, + { + "epoch": 0.8175594728735239, + "grad_norm": 19.05599021911621, + "learning_rate": 2.7216200798630922e-05, + "loss": 1.9781, + "step": 4777 + }, + { + "epoch": 0.8177306178333048, + "grad_norm": 3.6496589183807373, + "learning_rate": 2.7221905305191102e-05, + "loss": 0.3759, + "step": 4778 + }, + { + "epoch": 0.8179017627930858, + "grad_norm": 1.0984550714492798, + "learning_rate": 2.7227609811751285e-05, + "loss": 0.212, + "step": 4779 + }, + { + "epoch": 
0.8180729077528667, + "grad_norm": 6.329287052154541, + "learning_rate": 2.723331431831147e-05, + "loss": 0.6039, + "step": 4780 + }, + { + "epoch": 0.8182440527126477, + "grad_norm": 2.0273239612579346, + "learning_rate": 2.723901882487165e-05, + "loss": 0.2497, + "step": 4781 + }, + { + "epoch": 0.8184151976724285, + "grad_norm": 25.492948532104492, + "learning_rate": 2.7244723331431832e-05, + "loss": 2.387, + "step": 4782 + }, + { + "epoch": 0.8185863426322095, + "grad_norm": 26.385509490966797, + "learning_rate": 2.7250427837992015e-05, + "loss": 2.7364, + "step": 4783 + }, + { + "epoch": 0.8187574875919904, + "grad_norm": 3.4072940349578857, + "learning_rate": 2.7256132344552195e-05, + "loss": 0.3718, + "step": 4784 + }, + { + "epoch": 0.8189286325517714, + "grad_norm": 14.639547348022461, + "learning_rate": 2.726183685111238e-05, + "loss": 1.3097, + "step": 4785 + }, + { + "epoch": 0.8190997775115523, + "grad_norm": 22.575746536254883, + "learning_rate": 2.7267541357672562e-05, + "loss": 1.9179, + "step": 4786 + }, + { + "epoch": 0.8192709224713333, + "grad_norm": 25.742076873779297, + "learning_rate": 2.7273245864232746e-05, + "loss": 3.313, + "step": 4787 + }, + { + "epoch": 0.8194420674311141, + "grad_norm": 37.464515686035156, + "learning_rate": 2.7278950370792926e-05, + "loss": 6.2871, + "step": 4788 + }, + { + "epoch": 0.8196132123908951, + "grad_norm": 18.994226455688477, + "learning_rate": 2.728465487735311e-05, + "loss": 1.8293, + "step": 4789 + }, + { + "epoch": 0.819784357350676, + "grad_norm": 21.22791290283203, + "learning_rate": 2.7290359383913292e-05, + "loss": 2.0439, + "step": 4790 + }, + { + "epoch": 0.819955502310457, + "grad_norm": 23.675783157348633, + "learning_rate": 2.7296063890473472e-05, + "loss": 2.7619, + "step": 4791 + }, + { + "epoch": 0.8201266472702379, + "grad_norm": 20.29876708984375, + "learning_rate": 2.7301768397033656e-05, + "loss": 1.6941, + "step": 4792 + }, + { + "epoch": 0.8202977922300189, + "grad_norm": 18.569841384887695, + "learning_rate": 2.730747290359384e-05, + "loss": 1.9272, + "step": 4793 + }, + { + "epoch": 0.8204689371897997, + "grad_norm": 45.40720748901367, + "learning_rate": 2.7313177410154022e-05, + "loss": 2.4204, + "step": 4794 + }, + { + "epoch": 0.8206400821495807, + "grad_norm": 20.549243927001953, + "learning_rate": 2.7318881916714202e-05, + "loss": 2.1771, + "step": 4795 + }, + { + "epoch": 0.8208112271093616, + "grad_norm": 17.86515235900879, + "learning_rate": 2.7324586423274386e-05, + "loss": 1.666, + "step": 4796 + }, + { + "epoch": 0.8209823720691426, + "grad_norm": 20.349185943603516, + "learning_rate": 2.7330290929834573e-05, + "loss": 1.7841, + "step": 4797 + }, + { + "epoch": 0.8211535170289235, + "grad_norm": 20.81956672668457, + "learning_rate": 2.7335995436394753e-05, + "loss": 1.9564, + "step": 4798 + }, + { + "epoch": 0.8213246619887045, + "grad_norm": 23.731029510498047, + "learning_rate": 2.7341699942954936e-05, + "loss": 1.8663, + "step": 4799 + }, + { + "epoch": 0.8214958069484853, + "grad_norm": 20.08209991455078, + "learning_rate": 2.734740444951512e-05, + "loss": 1.9046, + "step": 4800 + }, + { + "epoch": 0.8216669519082663, + "grad_norm": 153.8986053466797, + "learning_rate": 2.7353108956075303e-05, + "loss": 8.5195, + "step": 4801 + }, + { + "epoch": 0.8218380968680472, + "grad_norm": 21.99418067932129, + "learning_rate": 2.7358813462635483e-05, + "loss": 2.2924, + "step": 4802 + }, + { + "epoch": 0.8220092418278282, + "grad_norm": 20.278175354003906, + "learning_rate": 2.7364517969195666e-05, + 
"loss": 2.2382, + "step": 4803 + }, + { + "epoch": 0.8221803867876091, + "grad_norm": 0.9226766228675842, + "learning_rate": 2.737022247575585e-05, + "loss": 0.2091, + "step": 4804 + }, + { + "epoch": 0.8223515317473901, + "grad_norm": 25.265033721923828, + "learning_rate": 2.737592698231603e-05, + "loss": 2.7456, + "step": 4805 + }, + { + "epoch": 0.8225226767071709, + "grad_norm": 17.61090660095215, + "learning_rate": 2.7381631488876213e-05, + "loss": 1.7355, + "step": 4806 + }, + { + "epoch": 0.8226938216669519, + "grad_norm": 27.21466827392578, + "learning_rate": 2.7387335995436396e-05, + "loss": 2.3528, + "step": 4807 + }, + { + "epoch": 0.8228649666267328, + "grad_norm": 1.4176955223083496, + "learning_rate": 2.739304050199658e-05, + "loss": 0.2325, + "step": 4808 + }, + { + "epoch": 0.8230361115865138, + "grad_norm": 26.031166076660156, + "learning_rate": 2.739874500855676e-05, + "loss": 2.5746, + "step": 4809 + }, + { + "epoch": 0.8232072565462947, + "grad_norm": 23.706703186035156, + "learning_rate": 2.7404449515116943e-05, + "loss": 2.6316, + "step": 4810 + }, + { + "epoch": 0.8233784015060757, + "grad_norm": 2.113330841064453, + "learning_rate": 2.7410154021677126e-05, + "loss": 0.3267, + "step": 4811 + }, + { + "epoch": 0.8235495464658565, + "grad_norm": 13.31404972076416, + "learning_rate": 2.7415858528237306e-05, + "loss": 1.3433, + "step": 4812 + }, + { + "epoch": 0.8237206914256375, + "grad_norm": 22.22062110900879, + "learning_rate": 2.742156303479749e-05, + "loss": 2.5822, + "step": 4813 + }, + { + "epoch": 0.8238918363854184, + "grad_norm": 17.15830421447754, + "learning_rate": 2.7427267541357673e-05, + "loss": 1.4505, + "step": 4814 + }, + { + "epoch": 0.8240629813451994, + "grad_norm": 25.40969467163086, + "learning_rate": 2.7432972047917853e-05, + "loss": 1.8021, + "step": 4815 + }, + { + "epoch": 0.8242341263049803, + "grad_norm": 9.526328086853027, + "learning_rate": 2.7438676554478036e-05, + "loss": 1.0048, + "step": 4816 + }, + { + "epoch": 0.8244052712647613, + "grad_norm": 16.948484420776367, + "learning_rate": 2.744438106103822e-05, + "loss": 1.4061, + "step": 4817 + }, + { + "epoch": 0.8245764162245421, + "grad_norm": 1.4328776597976685, + "learning_rate": 2.7450085567598403e-05, + "loss": 0.2117, + "step": 4818 + }, + { + "epoch": 0.8247475611843231, + "grad_norm": 18.04222869873047, + "learning_rate": 2.7455790074158583e-05, + "loss": 1.676, + "step": 4819 + }, + { + "epoch": 0.824918706144104, + "grad_norm": 21.746219635009766, + "learning_rate": 2.746149458071877e-05, + "loss": 2.1541, + "step": 4820 + }, + { + "epoch": 0.825089851103885, + "grad_norm": 28.491777420043945, + "learning_rate": 2.7467199087278953e-05, + "loss": 4.0988, + "step": 4821 + }, + { + "epoch": 0.8252609960636659, + "grad_norm": 4.467132091522217, + "learning_rate": 2.7472903593839133e-05, + "loss": 0.4074, + "step": 4822 + }, + { + "epoch": 0.8254321410234469, + "grad_norm": 2.841317892074585, + "learning_rate": 2.7478608100399317e-05, + "loss": 0.2482, + "step": 4823 + }, + { + "epoch": 0.8256032859832277, + "grad_norm": 25.000173568725586, + "learning_rate": 2.74843126069595e-05, + "loss": 2.6425, + "step": 4824 + }, + { + "epoch": 0.8257744309430087, + "grad_norm": 22.400287628173828, + "learning_rate": 2.7490017113519683e-05, + "loss": 2.0586, + "step": 4825 + }, + { + "epoch": 0.8259455759027896, + "grad_norm": 0.8643401265144348, + "learning_rate": 2.7495721620079863e-05, + "loss": 0.1913, + "step": 4826 + }, + { + "epoch": 0.8261167208625706, + "grad_norm": 52.71999740600586, 
+ "learning_rate": 2.7501426126640047e-05, + "loss": 1.9317, + "step": 4827 + }, + { + "epoch": 0.8262878658223516, + "grad_norm": 16.348684310913086, + "learning_rate": 2.750713063320023e-05, + "loss": 1.3432, + "step": 4828 + }, + { + "epoch": 0.8264590107821325, + "grad_norm": 15.317984580993652, + "learning_rate": 2.751283513976041e-05, + "loss": 1.2614, + "step": 4829 + }, + { + "epoch": 0.8266301557419135, + "grad_norm": 5.9997406005859375, + "learning_rate": 2.7518539646320594e-05, + "loss": 0.5047, + "step": 4830 + }, + { + "epoch": 0.8268013007016943, + "grad_norm": 21.828754425048828, + "learning_rate": 2.7524244152880777e-05, + "loss": 2.1042, + "step": 4831 + }, + { + "epoch": 0.8269724456614753, + "grad_norm": 27.086246490478516, + "learning_rate": 2.752994865944096e-05, + "loss": 2.9267, + "step": 4832 + }, + { + "epoch": 0.8271435906212562, + "grad_norm": 22.872150421142578, + "learning_rate": 2.753565316600114e-05, + "loss": 2.6579, + "step": 4833 + }, + { + "epoch": 0.8273147355810372, + "grad_norm": 13.048178672790527, + "learning_rate": 2.7541357672561324e-05, + "loss": 1.2089, + "step": 4834 + }, + { + "epoch": 0.8274858805408181, + "grad_norm": 8.70570182800293, + "learning_rate": 2.7547062179121507e-05, + "loss": 0.6508, + "step": 4835 + }, + { + "epoch": 0.827657025500599, + "grad_norm": 6.766833782196045, + "learning_rate": 2.7552766685681687e-05, + "loss": 0.6133, + "step": 4836 + }, + { + "epoch": 0.8278281704603799, + "grad_norm": 5.038801193237305, + "learning_rate": 2.755847119224187e-05, + "loss": 0.4408, + "step": 4837 + }, + { + "epoch": 0.8279993154201609, + "grad_norm": 17.220415115356445, + "learning_rate": 2.7564175698802054e-05, + "loss": 1.5166, + "step": 4838 + }, + { + "epoch": 0.8281704603799418, + "grad_norm": 4.953532695770264, + "learning_rate": 2.7569880205362237e-05, + "loss": 0.467, + "step": 4839 + }, + { + "epoch": 0.8283416053397228, + "grad_norm": 1.3376152515411377, + "learning_rate": 2.7575584711922417e-05, + "loss": 0.2116, + "step": 4840 + }, + { + "epoch": 0.8285127502995037, + "grad_norm": 12.627934455871582, + "learning_rate": 2.75812892184826e-05, + "loss": 0.7787, + "step": 4841 + }, + { + "epoch": 0.8286838952592847, + "grad_norm": 24.3588809967041, + "learning_rate": 2.7586993725042787e-05, + "loss": 2.7193, + "step": 4842 + }, + { + "epoch": 0.8288550402190655, + "grad_norm": 11.016646385192871, + "learning_rate": 2.7592698231602967e-05, + "loss": 0.5866, + "step": 4843 + }, + { + "epoch": 0.8290261851788465, + "grad_norm": 29.477998733520508, + "learning_rate": 2.759840273816315e-05, + "loss": 3.4923, + "step": 4844 + }, + { + "epoch": 0.8291973301386274, + "grad_norm": 44.269596099853516, + "learning_rate": 2.7604107244723334e-05, + "loss": 1.683, + "step": 4845 + }, + { + "epoch": 0.8293684750984084, + "grad_norm": 1.1761341094970703, + "learning_rate": 2.7609811751283514e-05, + "loss": 0.2143, + "step": 4846 + }, + { + "epoch": 0.8295396200581893, + "grad_norm": 0.9938428401947021, + "learning_rate": 2.7615516257843697e-05, + "loss": 0.1943, + "step": 4847 + }, + { + "epoch": 0.8297107650179703, + "grad_norm": 24.60161590576172, + "learning_rate": 2.762122076440388e-05, + "loss": 1.7267, + "step": 4848 + }, + { + "epoch": 0.8298819099777511, + "grad_norm": 24.709163665771484, + "learning_rate": 2.7626925270964064e-05, + "loss": 2.423, + "step": 4849 + }, + { + "epoch": 0.8300530549375321, + "grad_norm": 7.876855850219727, + "learning_rate": 2.7632629777524244e-05, + "loss": 0.9071, + "step": 4850 + }, + { + "epoch": 
0.830224199897313, + "grad_norm": 6.107041358947754, + "learning_rate": 2.7638334284084428e-05, + "loss": 0.4236, + "step": 4851 + }, + { + "epoch": 0.830395344857094, + "grad_norm": 2.59680438041687, + "learning_rate": 2.764403879064461e-05, + "loss": 0.2292, + "step": 4852 + }, + { + "epoch": 0.8305664898168749, + "grad_norm": 17.364612579345703, + "learning_rate": 2.764974329720479e-05, + "loss": 1.3699, + "step": 4853 + }, + { + "epoch": 0.8307376347766559, + "grad_norm": 19.087657928466797, + "learning_rate": 2.7655447803764974e-05, + "loss": 1.1447, + "step": 4854 + }, + { + "epoch": 0.8309087797364367, + "grad_norm": 0.8207781910896301, + "learning_rate": 2.7661152310325158e-05, + "loss": 0.1838, + "step": 4855 + }, + { + "epoch": 0.8310799246962177, + "grad_norm": 5.4272894859313965, + "learning_rate": 2.766685681688534e-05, + "loss": 0.5259, + "step": 4856 + }, + { + "epoch": 0.8312510696559986, + "grad_norm": 21.006179809570312, + "learning_rate": 2.767256132344552e-05, + "loss": 1.4462, + "step": 4857 + }, + { + "epoch": 0.8314222146157796, + "grad_norm": 27.480995178222656, + "learning_rate": 2.7678265830005704e-05, + "loss": 2.9023, + "step": 4858 + }, + { + "epoch": 0.8315933595755605, + "grad_norm": 25.569726943969727, + "learning_rate": 2.7683970336565888e-05, + "loss": 2.3783, + "step": 4859 + }, + { + "epoch": 0.8317645045353415, + "grad_norm": 21.275972366333008, + "learning_rate": 2.7689674843126068e-05, + "loss": 2.207, + "step": 4860 + }, + { + "epoch": 0.8319356494951223, + "grad_norm": 18.127140045166016, + "learning_rate": 2.769537934968625e-05, + "loss": 1.7016, + "step": 4861 + }, + { + "epoch": 0.8321067944549033, + "grad_norm": 8.969490051269531, + "learning_rate": 2.7701083856246435e-05, + "loss": 0.7251, + "step": 4862 + }, + { + "epoch": 0.8322779394146842, + "grad_norm": 21.6286678314209, + "learning_rate": 2.7706788362806618e-05, + "loss": 2.1559, + "step": 4863 + }, + { + "epoch": 0.8324490843744652, + "grad_norm": 22.46690559387207, + "learning_rate": 2.7712492869366798e-05, + "loss": 2.5798, + "step": 4864 + }, + { + "epoch": 0.8326202293342461, + "grad_norm": 24.45953941345215, + "learning_rate": 2.7718197375926985e-05, + "loss": 2.7626, + "step": 4865 + }, + { + "epoch": 0.832791374294027, + "grad_norm": 23.767484664916992, + "learning_rate": 2.7723901882487168e-05, + "loss": 2.5827, + "step": 4866 + }, + { + "epoch": 0.8329625192538079, + "grad_norm": 12.12441349029541, + "learning_rate": 2.7729606389047348e-05, + "loss": 1.1174, + "step": 4867 + }, + { + "epoch": 0.8331336642135889, + "grad_norm": 3.884326934814453, + "learning_rate": 2.773531089560753e-05, + "loss": 0.4211, + "step": 4868 + }, + { + "epoch": 0.8333048091733698, + "grad_norm": 17.074594497680664, + "learning_rate": 2.7741015402167715e-05, + "loss": 1.4077, + "step": 4869 + }, + { + "epoch": 0.8334759541331508, + "grad_norm": 13.826687812805176, + "learning_rate": 2.7746719908727898e-05, + "loss": 1.2421, + "step": 4870 + }, + { + "epoch": 0.8336470990929317, + "grad_norm": 19.932655334472656, + "learning_rate": 2.7752424415288078e-05, + "loss": 1.6989, + "step": 4871 + }, + { + "epoch": 0.8338182440527127, + "grad_norm": 10.384773254394531, + "learning_rate": 2.775812892184826e-05, + "loss": 0.9357, + "step": 4872 + }, + { + "epoch": 0.8339893890124935, + "grad_norm": 21.14198112487793, + "learning_rate": 2.7763833428408445e-05, + "loss": 1.8342, + "step": 4873 + }, + { + "epoch": 0.8341605339722745, + "grad_norm": 1.3305509090423584, + "learning_rate": 2.7769537934968625e-05, + 
"loss": 0.2297, + "step": 4874 + }, + { + "epoch": 0.8343316789320554, + "grad_norm": 19.950328826904297, + "learning_rate": 2.777524244152881e-05, + "loss": 1.6679, + "step": 4875 + }, + { + "epoch": 0.8345028238918364, + "grad_norm": 0.8316130042076111, + "learning_rate": 2.7780946948088992e-05, + "loss": 0.1935, + "step": 4876 + }, + { + "epoch": 0.8346739688516173, + "grad_norm": 19.933990478515625, + "learning_rate": 2.7786651454649175e-05, + "loss": 1.7269, + "step": 4877 + }, + { + "epoch": 0.8348451138113983, + "grad_norm": 26.975513458251953, + "learning_rate": 2.7792355961209355e-05, + "loss": 1.4489, + "step": 4878 + }, + { + "epoch": 0.8350162587711792, + "grad_norm": 31.234407424926758, + "learning_rate": 2.779806046776954e-05, + "loss": 5.6514, + "step": 4879 + }, + { + "epoch": 0.8351874037309601, + "grad_norm": 26.34868049621582, + "learning_rate": 2.7803764974329722e-05, + "loss": 2.7207, + "step": 4880 + }, + { + "epoch": 0.8353585486907411, + "grad_norm": 22.602392196655273, + "learning_rate": 2.7809469480889902e-05, + "loss": 2.0605, + "step": 4881 + }, + { + "epoch": 0.835529693650522, + "grad_norm": 2.0254154205322266, + "learning_rate": 2.7815173987450085e-05, + "loss": 0.2238, + "step": 4882 + }, + { + "epoch": 0.835700838610303, + "grad_norm": 19.05372428894043, + "learning_rate": 2.782087849401027e-05, + "loss": 1.8702, + "step": 4883 + }, + { + "epoch": 0.8358719835700839, + "grad_norm": 17.850038528442383, + "learning_rate": 2.782658300057045e-05, + "loss": 1.7083, + "step": 4884 + }, + { + "epoch": 0.8360431285298648, + "grad_norm": 1.1266815662384033, + "learning_rate": 2.7832287507130632e-05, + "loss": 0.1847, + "step": 4885 + }, + { + "epoch": 0.8362142734896457, + "grad_norm": 145.55615234375, + "learning_rate": 2.7837992013690815e-05, + "loss": 8.2946, + "step": 4886 + }, + { + "epoch": 0.8363854184494267, + "grad_norm": 19.88404083251953, + "learning_rate": 2.7843696520251e-05, + "loss": 2.1403, + "step": 4887 + }, + { + "epoch": 0.8365565634092076, + "grad_norm": 18.304166793823242, + "learning_rate": 2.7849401026811182e-05, + "loss": 1.3405, + "step": 4888 + }, + { + "epoch": 0.8367277083689886, + "grad_norm": 31.058061599731445, + "learning_rate": 2.7855105533371365e-05, + "loss": 3.5569, + "step": 4889 + }, + { + "epoch": 0.8368988533287695, + "grad_norm": 3.8306217193603516, + "learning_rate": 2.786081003993155e-05, + "loss": 0.3999, + "step": 4890 + }, + { + "epoch": 0.8370699982885504, + "grad_norm": 21.254697799682617, + "learning_rate": 2.786651454649173e-05, + "loss": 2.2565, + "step": 4891 + }, + { + "epoch": 0.8372411432483313, + "grad_norm": 24.048858642578125, + "learning_rate": 2.7872219053051912e-05, + "loss": 2.2869, + "step": 4892 + }, + { + "epoch": 0.8374122882081123, + "grad_norm": 32.828433990478516, + "learning_rate": 2.7877923559612096e-05, + "loss": 6.1064, + "step": 4893 + }, + { + "epoch": 0.8375834331678932, + "grad_norm": 12.22719669342041, + "learning_rate": 2.788362806617228e-05, + "loss": 1.727, + "step": 4894 + }, + { + "epoch": 0.8377545781276742, + "grad_norm": 78.69731903076172, + "learning_rate": 2.788933257273246e-05, + "loss": 7.6272, + "step": 4895 + }, + { + "epoch": 0.837925723087455, + "grad_norm": 27.703277587890625, + "learning_rate": 2.7895037079292642e-05, + "loss": 5.7777, + "step": 4896 + }, + { + "epoch": 0.838096868047236, + "grad_norm": 31.29784393310547, + "learning_rate": 2.7900741585852826e-05, + "loss": 4.6196, + "step": 4897 + }, + { + "epoch": 0.8382680130070169, + "grad_norm": 18.404560089111328, 
+ "learning_rate": 2.7906446092413006e-05, + "loss": 1.4676, + "step": 4898 + }, + { + "epoch": 0.8384391579667979, + "grad_norm": 14.947914123535156, + "learning_rate": 2.791215059897319e-05, + "loss": 1.4166, + "step": 4899 + }, + { + "epoch": 0.8386103029265788, + "grad_norm": 13.326464653015137, + "learning_rate": 2.7917855105533372e-05, + "loss": 0.923, + "step": 4900 + }, + { + "epoch": 0.8387814478863598, + "grad_norm": 19.72146987915039, + "learning_rate": 2.7923559612093556e-05, + "loss": 1.8728, + "step": 4901 + }, + { + "epoch": 0.8389525928461407, + "grad_norm": 23.139135360717773, + "learning_rate": 2.7929264118653736e-05, + "loss": 2.6928, + "step": 4902 + }, + { + "epoch": 0.8391237378059216, + "grad_norm": 24.41969871520996, + "learning_rate": 2.793496862521392e-05, + "loss": 2.5487, + "step": 4903 + }, + { + "epoch": 0.8392948827657025, + "grad_norm": 18.01527214050293, + "learning_rate": 2.7940673131774103e-05, + "loss": 1.7845, + "step": 4904 + }, + { + "epoch": 0.8394660277254835, + "grad_norm": 33.79884719848633, + "learning_rate": 2.7946377638334283e-05, + "loss": 2.9576, + "step": 4905 + }, + { + "epoch": 0.8396371726852644, + "grad_norm": 20.432056427001953, + "learning_rate": 2.7952082144894466e-05, + "loss": 2.1053, + "step": 4906 + }, + { + "epoch": 0.8398083176450454, + "grad_norm": 53.79264831542969, + "learning_rate": 2.795778665145465e-05, + "loss": 2.1853, + "step": 4907 + }, + { + "epoch": 0.8399794626048263, + "grad_norm": 30.717485427856445, + "learning_rate": 2.7963491158014833e-05, + "loss": 5.9112, + "step": 4908 + }, + { + "epoch": 0.8401506075646072, + "grad_norm": 24.705486297607422, + "learning_rate": 2.7969195664575013e-05, + "loss": 3.0099, + "step": 4909 + }, + { + "epoch": 0.8403217525243881, + "grad_norm": 14.869185447692871, + "learning_rate": 2.7974900171135196e-05, + "loss": 1.0484, + "step": 4910 + }, + { + "epoch": 0.8404928974841691, + "grad_norm": 5.514411449432373, + "learning_rate": 2.7980604677695383e-05, + "loss": 0.3749, + "step": 4911 + }, + { + "epoch": 0.84066404244395, + "grad_norm": 25.00140380859375, + "learning_rate": 2.7986309184255563e-05, + "loss": 2.7309, + "step": 4912 + }, + { + "epoch": 0.840835187403731, + "grad_norm": 16.862314224243164, + "learning_rate": 2.7992013690815746e-05, + "loss": 1.5645, + "step": 4913 + }, + { + "epoch": 0.8410063323635119, + "grad_norm": 16.41586685180664, + "learning_rate": 2.799771819737593e-05, + "loss": 1.3737, + "step": 4914 + }, + { + "epoch": 0.8411774773232928, + "grad_norm": 4.305161476135254, + "learning_rate": 2.800342270393611e-05, + "loss": 0.4024, + "step": 4915 + }, + { + "epoch": 0.8413486222830737, + "grad_norm": 9.081815719604492, + "learning_rate": 2.8009127210496293e-05, + "loss": 0.5857, + "step": 4916 + }, + { + "epoch": 0.8415197672428547, + "grad_norm": 23.440717697143555, + "learning_rate": 2.8014831717056476e-05, + "loss": 2.1649, + "step": 4917 + }, + { + "epoch": 0.8416909122026356, + "grad_norm": 9.641824722290039, + "learning_rate": 2.802053622361666e-05, + "loss": 0.5756, + "step": 4918 + }, + { + "epoch": 0.8418620571624166, + "grad_norm": 22.86307716369629, + "learning_rate": 2.802624073017684e-05, + "loss": 2.4724, + "step": 4919 + }, + { + "epoch": 0.8420332021221975, + "grad_norm": 20.317447662353516, + "learning_rate": 2.8031945236737023e-05, + "loss": 2.1701, + "step": 4920 + }, + { + "epoch": 0.8422043470819784, + "grad_norm": 5.848948955535889, + "learning_rate": 2.8037649743297206e-05, + "loss": 0.5332, + "step": 4921 + }, + { + "epoch": 
0.8423754920417593, + "grad_norm": 29.350927352905273, + "learning_rate": 2.8043354249857386e-05, + "loss": 1.6343, + "step": 4922 + }, + { + "epoch": 0.8425466370015403, + "grad_norm": 23.29707145690918, + "learning_rate": 2.804905875641757e-05, + "loss": 2.0175, + "step": 4923 + }, + { + "epoch": 0.8427177819613212, + "grad_norm": 21.697025299072266, + "learning_rate": 2.8054763262977753e-05, + "loss": 2.3221, + "step": 4924 + }, + { + "epoch": 0.8428889269211022, + "grad_norm": 24.808443069458008, + "learning_rate": 2.8060467769537937e-05, + "loss": 2.1885, + "step": 4925 + }, + { + "epoch": 0.843060071880883, + "grad_norm": 18.006011962890625, + "learning_rate": 2.8066172276098117e-05, + "loss": 1.2711, + "step": 4926 + }, + { + "epoch": 0.843231216840664, + "grad_norm": 18.567319869995117, + "learning_rate": 2.80718767826583e-05, + "loss": 1.6861, + "step": 4927 + }, + { + "epoch": 0.8434023618004449, + "grad_norm": 19.155935287475586, + "learning_rate": 2.8077581289218483e-05, + "loss": 1.5559, + "step": 4928 + }, + { + "epoch": 0.8435735067602259, + "grad_norm": 14.273578643798828, + "learning_rate": 2.8083285795778663e-05, + "loss": 1.5563, + "step": 4929 + }, + { + "epoch": 0.8437446517200069, + "grad_norm": 23.349454879760742, + "learning_rate": 2.8088990302338847e-05, + "loss": 2.4812, + "step": 4930 + }, + { + "epoch": 0.8439157966797878, + "grad_norm": 1.6741917133331299, + "learning_rate": 2.809469480889903e-05, + "loss": 0.2197, + "step": 4931 + }, + { + "epoch": 0.8440869416395688, + "grad_norm": 24.860034942626953, + "learning_rate": 2.8100399315459213e-05, + "loss": 2.4578, + "step": 4932 + }, + { + "epoch": 0.8442580865993496, + "grad_norm": 14.901060104370117, + "learning_rate": 2.8106103822019393e-05, + "loss": 1.5235, + "step": 4933 + }, + { + "epoch": 0.8444292315591306, + "grad_norm": 9.101666450500488, + "learning_rate": 2.811180832857958e-05, + "loss": 0.5367, + "step": 4934 + }, + { + "epoch": 0.8446003765189115, + "grad_norm": 25.940553665161133, + "learning_rate": 2.8117512835139764e-05, + "loss": 2.8182, + "step": 4935 + }, + { + "epoch": 0.8447715214786925, + "grad_norm": 21.500452041625977, + "learning_rate": 2.8123217341699944e-05, + "loss": 2.2284, + "step": 4936 + }, + { + "epoch": 0.8449426664384734, + "grad_norm": 9.465425491333008, + "learning_rate": 2.8128921848260127e-05, + "loss": 0.8752, + "step": 4937 + }, + { + "epoch": 0.8451138113982544, + "grad_norm": 12.658205032348633, + "learning_rate": 2.813462635482031e-05, + "loss": 0.7855, + "step": 4938 + }, + { + "epoch": 0.8452849563580352, + "grad_norm": 3.6830475330352783, + "learning_rate": 2.8140330861380494e-05, + "loss": 0.3844, + "step": 4939 + }, + { + "epoch": 0.8454561013178162, + "grad_norm": 79.1318130493164, + "learning_rate": 2.8146035367940674e-05, + "loss": 7.1857, + "step": 4940 + }, + { + "epoch": 0.8456272462775971, + "grad_norm": 18.231271743774414, + "learning_rate": 2.8151739874500857e-05, + "loss": 1.4963, + "step": 4941 + }, + { + "epoch": 0.8457983912373781, + "grad_norm": 30.293922424316406, + "learning_rate": 2.815744438106104e-05, + "loss": 3.6746, + "step": 4942 + }, + { + "epoch": 0.845969536197159, + "grad_norm": 23.53990364074707, + "learning_rate": 2.816314888762122e-05, + "loss": 2.1727, + "step": 4943 + }, + { + "epoch": 0.84614068115694, + "grad_norm": 45.553245544433594, + "learning_rate": 2.8168853394181404e-05, + "loss": 6.6332, + "step": 4944 + }, + { + "epoch": 0.8463118261167208, + "grad_norm": 23.199127197265625, + "learning_rate": 2.8174557900741587e-05, + 
"loss": 1.8783, + "step": 4945 + }, + { + "epoch": 0.8464829710765018, + "grad_norm": 29.977962493896484, + "learning_rate": 2.818026240730177e-05, + "loss": 3.5376, + "step": 4946 + }, + { + "epoch": 0.8466541160362827, + "grad_norm": 13.712217330932617, + "learning_rate": 2.818596691386195e-05, + "loss": 1.2064, + "step": 4947 + }, + { + "epoch": 0.8468252609960637, + "grad_norm": 21.567646026611328, + "learning_rate": 2.8191671420422134e-05, + "loss": 1.7774, + "step": 4948 + }, + { + "epoch": 0.8469964059558446, + "grad_norm": 20.53908348083496, + "learning_rate": 2.8197375926982317e-05, + "loss": 1.5944, + "step": 4949 + }, + { + "epoch": 0.8471675509156256, + "grad_norm": 57.32319641113281, + "learning_rate": 2.8203080433542497e-05, + "loss": 1.6839, + "step": 4950 + }, + { + "epoch": 0.8473386958754064, + "grad_norm": 19.78857421875, + "learning_rate": 2.820878494010268e-05, + "loss": 1.645, + "step": 4951 + }, + { + "epoch": 0.8475098408351874, + "grad_norm": 18.463855743408203, + "learning_rate": 2.8214489446662864e-05, + "loss": 1.8035, + "step": 4952 + }, + { + "epoch": 0.8476809857949683, + "grad_norm": 3.614971160888672, + "learning_rate": 2.8220193953223044e-05, + "loss": 0.3524, + "step": 4953 + }, + { + "epoch": 0.8478521307547493, + "grad_norm": 44.223514556884766, + "learning_rate": 2.8225898459783227e-05, + "loss": 1.5668, + "step": 4954 + }, + { + "epoch": 0.8480232757145302, + "grad_norm": 28.211584091186523, + "learning_rate": 2.823160296634341e-05, + "loss": 2.7987, + "step": 4955 + }, + { + "epoch": 0.8481944206743112, + "grad_norm": 6.280284881591797, + "learning_rate": 2.8237307472903594e-05, + "loss": 0.5097, + "step": 4956 + }, + { + "epoch": 0.848365565634092, + "grad_norm": 27.26231575012207, + "learning_rate": 2.8243011979463778e-05, + "loss": 3.0276, + "step": 4957 + }, + { + "epoch": 0.848536710593873, + "grad_norm": 23.75172996520996, + "learning_rate": 2.824871648602396e-05, + "loss": 2.684, + "step": 4958 + }, + { + "epoch": 0.8487078555536539, + "grad_norm": 25.655179977416992, + "learning_rate": 2.8254420992584144e-05, + "loss": 2.2778, + "step": 4959 + }, + { + "epoch": 0.8488790005134349, + "grad_norm": 1.2349088191986084, + "learning_rate": 2.8260125499144324e-05, + "loss": 0.2043, + "step": 4960 + }, + { + "epoch": 0.8490501454732158, + "grad_norm": 29.569950103759766, + "learning_rate": 2.8265830005704508e-05, + "loss": 5.8835, + "step": 4961 + }, + { + "epoch": 0.8492212904329968, + "grad_norm": 21.26607322692871, + "learning_rate": 2.827153451226469e-05, + "loss": 1.8996, + "step": 4962 + }, + { + "epoch": 0.8493924353927776, + "grad_norm": 18.51241111755371, + "learning_rate": 2.8277239018824875e-05, + "loss": 1.6964, + "step": 4963 + }, + { + "epoch": 0.8495635803525586, + "grad_norm": 129.38690185546875, + "learning_rate": 2.8282943525385055e-05, + "loss": 8.2196, + "step": 4964 + }, + { + "epoch": 0.8497347253123395, + "grad_norm": 14.641566276550293, + "learning_rate": 2.8288648031945238e-05, + "loss": 1.8633, + "step": 4965 + }, + { + "epoch": 0.8499058702721205, + "grad_norm": 31.69512176513672, + "learning_rate": 2.829435253850542e-05, + "loss": 3.3676, + "step": 4966 + }, + { + "epoch": 0.8500770152319014, + "grad_norm": 1.0442372560501099, + "learning_rate": 2.83000570450656e-05, + "loss": 0.201, + "step": 4967 + }, + { + "epoch": 0.8502481601916824, + "grad_norm": 20.171241760253906, + "learning_rate": 2.8305761551625785e-05, + "loss": 1.8819, + "step": 4968 + }, + { + "epoch": 0.8504193051514632, + "grad_norm": 7.881687164306641, + 
"learning_rate": 2.8311466058185968e-05, + "loss": 0.5707, + "step": 4969 + }, + { + "epoch": 0.8505904501112442, + "grad_norm": 5.979015350341797, + "learning_rate": 2.831717056474615e-05, + "loss": 0.5286, + "step": 4970 + }, + { + "epoch": 0.8507615950710251, + "grad_norm": 0.8935146331787109, + "learning_rate": 2.832287507130633e-05, + "loss": 0.1914, + "step": 4971 + }, + { + "epoch": 0.8509327400308061, + "grad_norm": 28.41226577758789, + "learning_rate": 2.8328579577866515e-05, + "loss": 3.873, + "step": 4972 + }, + { + "epoch": 0.851103884990587, + "grad_norm": 23.3878116607666, + "learning_rate": 2.8334284084426698e-05, + "loss": 2.738, + "step": 4973 + }, + { + "epoch": 0.851275029950368, + "grad_norm": 1.1107969284057617, + "learning_rate": 2.8339988590986878e-05, + "loss": 0.2044, + "step": 4974 + }, + { + "epoch": 0.8514461749101488, + "grad_norm": 12.226590156555176, + "learning_rate": 2.834569309754706e-05, + "loss": 0.729, + "step": 4975 + }, + { + "epoch": 0.8516173198699298, + "grad_norm": 24.799179077148438, + "learning_rate": 2.8351397604107245e-05, + "loss": 1.7961, + "step": 4976 + }, + { + "epoch": 0.8517884648297107, + "grad_norm": 22.254865646362305, + "learning_rate": 2.8357102110667428e-05, + "loss": 2.345, + "step": 4977 + }, + { + "epoch": 0.8519596097894917, + "grad_norm": 22.752105712890625, + "learning_rate": 2.8362806617227608e-05, + "loss": 2.2281, + "step": 4978 + }, + { + "epoch": 0.8521307547492726, + "grad_norm": 2.3499200344085693, + "learning_rate": 2.836851112378779e-05, + "loss": 0.2585, + "step": 4979 + }, + { + "epoch": 0.8523018997090536, + "grad_norm": 24.360803604125977, + "learning_rate": 2.837421563034798e-05, + "loss": 2.2443, + "step": 4980 + }, + { + "epoch": 0.8524730446688346, + "grad_norm": 22.948808670043945, + "learning_rate": 2.837992013690816e-05, + "loss": 2.2422, + "step": 4981 + }, + { + "epoch": 0.8526441896286154, + "grad_norm": 7.84674072265625, + "learning_rate": 2.8385624643468342e-05, + "loss": 0.6825, + "step": 4982 + }, + { + "epoch": 0.8528153345883964, + "grad_norm": 11.925847053527832, + "learning_rate": 2.8391329150028525e-05, + "loss": 0.9807, + "step": 4983 + }, + { + "epoch": 0.8529864795481773, + "grad_norm": 12.705161094665527, + "learning_rate": 2.8397033656588705e-05, + "loss": 1.0903, + "step": 4984 + }, + { + "epoch": 0.8531576245079583, + "grad_norm": 22.5230770111084, + "learning_rate": 2.840273816314889e-05, + "loss": 2.2401, + "step": 4985 + }, + { + "epoch": 0.8533287694677392, + "grad_norm": 20.577150344848633, + "learning_rate": 2.8408442669709072e-05, + "loss": 1.4695, + "step": 4986 + }, + { + "epoch": 0.8534999144275202, + "grad_norm": 8.088134765625, + "learning_rate": 2.8414147176269255e-05, + "loss": 0.9019, + "step": 4987 + }, + { + "epoch": 0.853671059387301, + "grad_norm": 2.71311354637146, + "learning_rate": 2.8419851682829435e-05, + "loss": 0.3453, + "step": 4988 + }, + { + "epoch": 0.853842204347082, + "grad_norm": 22.019060134887695, + "learning_rate": 2.842555618938962e-05, + "loss": 1.6718, + "step": 4989 + }, + { + "epoch": 0.8540133493068629, + "grad_norm": 30.339303970336914, + "learning_rate": 2.8431260695949802e-05, + "loss": 2.9397, + "step": 4990 + }, + { + "epoch": 0.8541844942666439, + "grad_norm": 1.055464744567871, + "learning_rate": 2.8436965202509982e-05, + "loss": 0.1917, + "step": 4991 + }, + { + "epoch": 0.8543556392264248, + "grad_norm": 19.96885871887207, + "learning_rate": 2.8442669709070165e-05, + "loss": 1.551, + "step": 4992 + }, + { + "epoch": 0.8545267841862058, 
+ "grad_norm": 31.464706420898438, + "learning_rate": 2.844837421563035e-05, + "loss": 1.7026, + "step": 4993 + }, + { + "epoch": 0.8546979291459866, + "grad_norm": 6.268205165863037, + "learning_rate": 2.8454078722190532e-05, + "loss": 0.6232, + "step": 4994 + }, + { + "epoch": 0.8548690741057676, + "grad_norm": 1.3084250688552856, + "learning_rate": 2.8459783228750712e-05, + "loss": 0.194, + "step": 4995 + }, + { + "epoch": 0.8550402190655485, + "grad_norm": 24.96925163269043, + "learning_rate": 2.8465487735310896e-05, + "loss": 2.4543, + "step": 4996 + }, + { + "epoch": 0.8552113640253295, + "grad_norm": 20.334264755249023, + "learning_rate": 2.847119224187108e-05, + "loss": 1.9524, + "step": 4997 + }, + { + "epoch": 0.8553825089851104, + "grad_norm": 32.75785446166992, + "learning_rate": 2.847689674843126e-05, + "loss": 1.7313, + "step": 4998 + }, + { + "epoch": 0.8555536539448914, + "grad_norm": 20.249191284179688, + "learning_rate": 2.8482601254991442e-05, + "loss": 1.7413, + "step": 4999 + }, + { + "epoch": 0.8557247989046722, + "grad_norm": 9.610689163208008, + "learning_rate": 2.8488305761551626e-05, + "loss": 0.5728, + "step": 5000 + }, + { + "epoch": 0.8558959438644532, + "grad_norm": 20.597206115722656, + "learning_rate": 2.849401026811181e-05, + "loss": 1.9744, + "step": 5001 + }, + { + "epoch": 0.8560670888242341, + "grad_norm": 22.978784561157227, + "learning_rate": 2.849971477467199e-05, + "loss": 1.547, + "step": 5002 + }, + { + "epoch": 0.8562382337840151, + "grad_norm": 15.783121109008789, + "learning_rate": 2.8505419281232176e-05, + "loss": 1.2838, + "step": 5003 + }, + { + "epoch": 0.856409378743796, + "grad_norm": 30.393911361694336, + "learning_rate": 2.851112378779236e-05, + "loss": 5.1174, + "step": 5004 + }, + { + "epoch": 0.856580523703577, + "grad_norm": 30.374027252197266, + "learning_rate": 2.851682829435254e-05, + "loss": 6.2091, + "step": 5005 + }, + { + "epoch": 0.8567516686633578, + "grad_norm": 20.213665008544922, + "learning_rate": 2.8522532800912723e-05, + "loss": 1.4388, + "step": 5006 + }, + { + "epoch": 0.8569228136231388, + "grad_norm": 22.47203254699707, + "learning_rate": 2.8528237307472906e-05, + "loss": 2.1662, + "step": 5007 + }, + { + "epoch": 0.8570939585829197, + "grad_norm": 67.63754272460938, + "learning_rate": 2.853394181403309e-05, + "loss": 8.0588, + "step": 5008 + }, + { + "epoch": 0.8572651035427007, + "grad_norm": 18.833168029785156, + "learning_rate": 2.853964632059327e-05, + "loss": 1.5184, + "step": 5009 + }, + { + "epoch": 0.8574362485024816, + "grad_norm": 20.953800201416016, + "learning_rate": 2.8545350827153453e-05, + "loss": 1.9016, + "step": 5010 + }, + { + "epoch": 0.8576073934622626, + "grad_norm": 11.872946739196777, + "learning_rate": 2.8551055333713636e-05, + "loss": 0.689, + "step": 5011 + }, + { + "epoch": 0.8577785384220434, + "grad_norm": 20.248685836791992, + "learning_rate": 2.8556759840273816e-05, + "loss": 2.0556, + "step": 5012 + }, + { + "epoch": 0.8579496833818244, + "grad_norm": 12.471953392028809, + "learning_rate": 2.8562464346834e-05, + "loss": 0.8929, + "step": 5013 + }, + { + "epoch": 0.8581208283416053, + "grad_norm": 9.479846954345703, + "learning_rate": 2.8568168853394183e-05, + "loss": 0.8552, + "step": 5014 + }, + { + "epoch": 0.8582919733013863, + "grad_norm": 25.91791343688965, + "learning_rate": 2.8573873359954363e-05, + "loss": 2.4487, + "step": 5015 + }, + { + "epoch": 0.8584631182611672, + "grad_norm": 30.794086456298828, + "learning_rate": 2.8579577866514546e-05, + "loss": 1.6757, + "step": 
5016 + }, + { + "epoch": 0.8586342632209482, + "grad_norm": 30.115619659423828, + "learning_rate": 2.858528237307473e-05, + "loss": 2.2301, + "step": 5017 + }, + { + "epoch": 0.858805408180729, + "grad_norm": 15.450135231018066, + "learning_rate": 2.8590986879634913e-05, + "loss": 1.3106, + "step": 5018 + }, + { + "epoch": 0.85897655314051, + "grad_norm": 23.82478904724121, + "learning_rate": 2.8596691386195093e-05, + "loss": 2.1391, + "step": 5019 + }, + { + "epoch": 0.8591476981002909, + "grad_norm": 21.615318298339844, + "learning_rate": 2.8602395892755276e-05, + "loss": 1.7556, + "step": 5020 + }, + { + "epoch": 0.8593188430600719, + "grad_norm": 20.00986671447754, + "learning_rate": 2.860810039931546e-05, + "loss": 1.8048, + "step": 5021 + }, + { + "epoch": 0.8594899880198528, + "grad_norm": 1.3434984683990479, + "learning_rate": 2.861380490587564e-05, + "loss": 0.2228, + "step": 5022 + }, + { + "epoch": 0.8596611329796338, + "grad_norm": 12.534896850585938, + "learning_rate": 2.8619509412435823e-05, + "loss": 1.6978, + "step": 5023 + }, + { + "epoch": 0.8598322779394146, + "grad_norm": 30.588682174682617, + "learning_rate": 2.8625213918996006e-05, + "loss": 2.4971, + "step": 5024 + }, + { + "epoch": 0.8600034228991956, + "grad_norm": 16.27142333984375, + "learning_rate": 2.8630918425556193e-05, + "loss": 1.2722, + "step": 5025 + }, + { + "epoch": 0.8601745678589765, + "grad_norm": 8.828721046447754, + "learning_rate": 2.8636622932116373e-05, + "loss": 0.4231, + "step": 5026 + }, + { + "epoch": 0.8603457128187575, + "grad_norm": 11.027689933776855, + "learning_rate": 2.8642327438676557e-05, + "loss": 0.7595, + "step": 5027 + }, + { + "epoch": 0.8605168577785384, + "grad_norm": 23.096948623657227, + "learning_rate": 2.864803194523674e-05, + "loss": 2.6978, + "step": 5028 + }, + { + "epoch": 0.8606880027383194, + "grad_norm": 20.453035354614258, + "learning_rate": 2.865373645179692e-05, + "loss": 2.2458, + "step": 5029 + }, + { + "epoch": 0.8608591476981002, + "grad_norm": 16.359405517578125, + "learning_rate": 2.8659440958357103e-05, + "loss": 1.6137, + "step": 5030 + }, + { + "epoch": 0.8610302926578812, + "grad_norm": 21.571128845214844, + "learning_rate": 2.8665145464917287e-05, + "loss": 1.8346, + "step": 5031 + }, + { + "epoch": 0.8612014376176622, + "grad_norm": 30.24646759033203, + "learning_rate": 2.867084997147747e-05, + "loss": 3.9606, + "step": 5032 + }, + { + "epoch": 0.8613725825774431, + "grad_norm": 20.61684799194336, + "learning_rate": 2.867655447803765e-05, + "loss": 2.4751, + "step": 5033 + }, + { + "epoch": 0.8615437275372241, + "grad_norm": 22.913393020629883, + "learning_rate": 2.8682258984597833e-05, + "loss": 2.5773, + "step": 5034 + }, + { + "epoch": 0.861714872497005, + "grad_norm": 16.3863582611084, + "learning_rate": 2.8687963491158017e-05, + "loss": 1.2117, + "step": 5035 + }, + { + "epoch": 0.861886017456786, + "grad_norm": 1.4799400568008423, + "learning_rate": 2.8693667997718197e-05, + "loss": 0.2188, + "step": 5036 + }, + { + "epoch": 0.8620571624165668, + "grad_norm": 2.393829822540283, + "learning_rate": 2.869937250427838e-05, + "loss": 0.3249, + "step": 5037 + }, + { + "epoch": 0.8622283073763478, + "grad_norm": 6.015737533569336, + "learning_rate": 2.8705077010838564e-05, + "loss": 0.475, + "step": 5038 + }, + { + "epoch": 0.8623994523361287, + "grad_norm": 10.611120223999023, + "learning_rate": 2.8710781517398747e-05, + "loss": 0.6693, + "step": 5039 + }, + { + "epoch": 0.8625705972959097, + "grad_norm": 40.11723709106445, + "learning_rate": 
2.8716486023958927e-05, + "loss": 6.5789, + "step": 5040 + }, + { + "epoch": 0.8627417422556906, + "grad_norm": 18.530126571655273, + "learning_rate": 2.872219053051911e-05, + "loss": 1.5516, + "step": 5041 + }, + { + "epoch": 0.8629128872154715, + "grad_norm": 18.136308670043945, + "learning_rate": 2.8727895037079294e-05, + "loss": 1.4063, + "step": 5042 + }, + { + "epoch": 0.8630840321752524, + "grad_norm": 25.583057403564453, + "learning_rate": 2.8733599543639474e-05, + "loss": 1.7697, + "step": 5043 + }, + { + "epoch": 0.8632551771350334, + "grad_norm": 19.754838943481445, + "learning_rate": 2.8739304050199657e-05, + "loss": 1.9445, + "step": 5044 + }, + { + "epoch": 0.8634263220948143, + "grad_norm": 24.454795837402344, + "learning_rate": 2.874500855675984e-05, + "loss": 2.0861, + "step": 5045 + }, + { + "epoch": 0.8635974670545953, + "grad_norm": 21.876201629638672, + "learning_rate": 2.8750713063320024e-05, + "loss": 2.0072, + "step": 5046 + }, + { + "epoch": 0.8637686120143762, + "grad_norm": 6.661749839782715, + "learning_rate": 2.8756417569880204e-05, + "loss": 0.5025, + "step": 5047 + }, + { + "epoch": 0.8639397569741571, + "grad_norm": 190.44247436523438, + "learning_rate": 2.876212207644039e-05, + "loss": 8.5548, + "step": 5048 + }, + { + "epoch": 0.864110901933938, + "grad_norm": 22.162609100341797, + "learning_rate": 2.8767826583000574e-05, + "loss": 2.0999, + "step": 5049 + }, + { + "epoch": 0.864282046893719, + "grad_norm": 1.4531558752059937, + "learning_rate": 2.8773531089560754e-05, + "loss": 0.2048, + "step": 5050 + }, + { + "epoch": 0.8644531918534999, + "grad_norm": 24.412981033325195, + "learning_rate": 2.8779235596120937e-05, + "loss": 2.3353, + "step": 5051 + }, + { + "epoch": 0.8646243368132809, + "grad_norm": 31.54360580444336, + "learning_rate": 2.878494010268112e-05, + "loss": 1.6217, + "step": 5052 + }, + { + "epoch": 0.8647954817730618, + "grad_norm": 23.30241584777832, + "learning_rate": 2.87906446092413e-05, + "loss": 2.1526, + "step": 5053 + }, + { + "epoch": 0.8649666267328427, + "grad_norm": 18.61789321899414, + "learning_rate": 2.8796349115801484e-05, + "loss": 1.5916, + "step": 5054 + }, + { + "epoch": 0.8651377716926236, + "grad_norm": 8.036980628967285, + "learning_rate": 2.8802053622361667e-05, + "loss": 0.6728, + "step": 5055 + }, + { + "epoch": 0.8653089166524046, + "grad_norm": 1.1689972877502441, + "learning_rate": 2.880775812892185e-05, + "loss": 0.1996, + "step": 5056 + }, + { + "epoch": 0.8654800616121855, + "grad_norm": 20.204479217529297, + "learning_rate": 2.881346263548203e-05, + "loss": 1.8613, + "step": 5057 + }, + { + "epoch": 0.8656512065719665, + "grad_norm": 20.377853393554688, + "learning_rate": 2.8819167142042214e-05, + "loss": 1.9245, + "step": 5058 + }, + { + "epoch": 0.8658223515317474, + "grad_norm": 32.21659851074219, + "learning_rate": 2.8824871648602398e-05, + "loss": 4.4384, + "step": 5059 + }, + { + "epoch": 0.8659934964915283, + "grad_norm": 17.437664031982422, + "learning_rate": 2.8830576155162578e-05, + "loss": 1.8003, + "step": 5060 + }, + { + "epoch": 0.8661646414513092, + "grad_norm": 8.696148872375488, + "learning_rate": 2.883628066172276e-05, + "loss": 0.8588, + "step": 5061 + }, + { + "epoch": 0.8663357864110902, + "grad_norm": 38.84085464477539, + "learning_rate": 2.8841985168282944e-05, + "loss": 6.4101, + "step": 5062 + }, + { + "epoch": 0.8665069313708711, + "grad_norm": 22.779674530029297, + "learning_rate": 2.8847689674843128e-05, + "loss": 2.1035, + "step": 5063 + }, + { + "epoch": 0.8666780763306521, + 
"grad_norm": 20.018033981323242, + "learning_rate": 2.8853394181403308e-05, + "loss": 1.7867, + "step": 5064 + }, + { + "epoch": 0.866849221290433, + "grad_norm": 20.817026138305664, + "learning_rate": 2.885909868796349e-05, + "loss": 1.8507, + "step": 5065 + }, + { + "epoch": 0.867020366250214, + "grad_norm": 22.181928634643555, + "learning_rate": 2.8864803194523674e-05, + "loss": 2.3944, + "step": 5066 + }, + { + "epoch": 0.8671915112099948, + "grad_norm": 21.743671417236328, + "learning_rate": 2.8870507701083854e-05, + "loss": 2.0094, + "step": 5067 + }, + { + "epoch": 0.8673626561697758, + "grad_norm": 18.235118865966797, + "learning_rate": 2.8876212207644038e-05, + "loss": 1.7605, + "step": 5068 + }, + { + "epoch": 0.8675338011295567, + "grad_norm": 16.165708541870117, + "learning_rate": 2.888191671420422e-05, + "loss": 1.5405, + "step": 5069 + }, + { + "epoch": 0.8677049460893377, + "grad_norm": 33.633140563964844, + "learning_rate": 2.8887621220764405e-05, + "loss": 6.2029, + "step": 5070 + }, + { + "epoch": 0.8678760910491186, + "grad_norm": 9.106796264648438, + "learning_rate": 2.8893325727324588e-05, + "loss": 0.5696, + "step": 5071 + }, + { + "epoch": 0.8680472360088995, + "grad_norm": 10.702054023742676, + "learning_rate": 2.889903023388477e-05, + "loss": 0.8735, + "step": 5072 + }, + { + "epoch": 0.8682183809686804, + "grad_norm": 34.0421142578125, + "learning_rate": 2.8904734740444955e-05, + "loss": 2.06, + "step": 5073 + }, + { + "epoch": 0.8683895259284614, + "grad_norm": 26.271299362182617, + "learning_rate": 2.8910439247005135e-05, + "loss": 2.3461, + "step": 5074 + }, + { + "epoch": 0.8685606708882423, + "grad_norm": 24.45592498779297, + "learning_rate": 2.8916143753565318e-05, + "loss": 2.7885, + "step": 5075 + }, + { + "epoch": 0.8687318158480233, + "grad_norm": 29.799867630004883, + "learning_rate": 2.89218482601255e-05, + "loss": 3.7322, + "step": 5076 + }, + { + "epoch": 0.8689029608078042, + "grad_norm": 50.683589935302734, + "learning_rate": 2.8927552766685685e-05, + "loss": 1.8316, + "step": 5077 + }, + { + "epoch": 0.8690741057675851, + "grad_norm": 8.911881446838379, + "learning_rate": 2.8933257273245865e-05, + "loss": 0.9942, + "step": 5078 + }, + { + "epoch": 0.869245250727366, + "grad_norm": 14.339098930358887, + "learning_rate": 2.8938961779806048e-05, + "loss": 1.3359, + "step": 5079 + }, + { + "epoch": 0.869416395687147, + "grad_norm": 60.05573272705078, + "learning_rate": 2.894466628636623e-05, + "loss": 6.966, + "step": 5080 + }, + { + "epoch": 0.869587540646928, + "grad_norm": 30.60601806640625, + "learning_rate": 2.895037079292641e-05, + "loss": 5.8897, + "step": 5081 + }, + { + "epoch": 0.8697586856067089, + "grad_norm": 19.455875396728516, + "learning_rate": 2.8956075299486595e-05, + "loss": 1.5791, + "step": 5082 + }, + { + "epoch": 0.8699298305664899, + "grad_norm": 11.915603637695312, + "learning_rate": 2.896177980604678e-05, + "loss": 0.9361, + "step": 5083 + }, + { + "epoch": 0.8701009755262707, + "grad_norm": 20.202234268188477, + "learning_rate": 2.8967484312606958e-05, + "loss": 5.2419, + "step": 5084 + }, + { + "epoch": 0.8702721204860517, + "grad_norm": 6.621884346008301, + "learning_rate": 2.897318881916714e-05, + "loss": 0.6413, + "step": 5085 + }, + { + "epoch": 0.8704432654458326, + "grad_norm": 28.005216598510742, + "learning_rate": 2.8978893325727325e-05, + "loss": 2.8525, + "step": 5086 + }, + { + "epoch": 0.8706144104056136, + "grad_norm": 20.600162506103516, + "learning_rate": 2.898459783228751e-05, + "loss": 1.7191, + "step": 
5087 + }, + { + "epoch": 0.8707855553653945, + "grad_norm": 6.3435821533203125, + "learning_rate": 2.899030233884769e-05, + "loss": 0.5639, + "step": 5088 + }, + { + "epoch": 0.8709567003251755, + "grad_norm": 25.650978088378906, + "learning_rate": 2.8996006845407872e-05, + "loss": 5.7733, + "step": 5089 + }, + { + "epoch": 0.8711278452849563, + "grad_norm": 22.818950653076172, + "learning_rate": 2.9001711351968055e-05, + "loss": 2.128, + "step": 5090 + }, + { + "epoch": 0.8712989902447373, + "grad_norm": 5.952839374542236, + "learning_rate": 2.9007415858528235e-05, + "loss": 0.4348, + "step": 5091 + }, + { + "epoch": 0.8714701352045182, + "grad_norm": 1.6554477214813232, + "learning_rate": 2.901312036508842e-05, + "loss": 0.2175, + "step": 5092 + }, + { + "epoch": 0.8716412801642992, + "grad_norm": 22.035751342773438, + "learning_rate": 2.9018824871648602e-05, + "loss": 2.054, + "step": 5093 + }, + { + "epoch": 0.8718124251240801, + "grad_norm": 1.5505925416946411, + "learning_rate": 2.902452937820879e-05, + "loss": 0.2221, + "step": 5094 + }, + { + "epoch": 0.8719835700838611, + "grad_norm": 27.077693939208984, + "learning_rate": 2.903023388476897e-05, + "loss": 3.1948, + "step": 5095 + }, + { + "epoch": 0.872154715043642, + "grad_norm": 22.83882713317871, + "learning_rate": 2.9035938391329152e-05, + "loss": 2.3321, + "step": 5096 + }, + { + "epoch": 0.8723258600034229, + "grad_norm": 4.607493877410889, + "learning_rate": 2.9041642897889335e-05, + "loss": 0.4254, + "step": 5097 + }, + { + "epoch": 0.8724970049632038, + "grad_norm": 17.07670783996582, + "learning_rate": 2.9047347404449515e-05, + "loss": 1.4832, + "step": 5098 + }, + { + "epoch": 0.8726681499229848, + "grad_norm": 15.903471946716309, + "learning_rate": 2.90530519110097e-05, + "loss": 1.5948, + "step": 5099 + }, + { + "epoch": 0.8728392948827657, + "grad_norm": 1.760149598121643, + "learning_rate": 2.9058756417569882e-05, + "loss": 0.2047, + "step": 5100 + }, + { + "epoch": 0.8730104398425467, + "grad_norm": 6.830605983734131, + "learning_rate": 2.9064460924130066e-05, + "loss": 0.6232, + "step": 5101 + }, + { + "epoch": 0.8731815848023275, + "grad_norm": 24.54201316833496, + "learning_rate": 2.9070165430690246e-05, + "loss": 2.2265, + "step": 5102 + }, + { + "epoch": 0.8733527297621085, + "grad_norm": 20.139869689941406, + "learning_rate": 2.907586993725043e-05, + "loss": 2.3484, + "step": 5103 + }, + { + "epoch": 0.8735238747218894, + "grad_norm": 17.505416870117188, + "learning_rate": 2.9081574443810612e-05, + "loss": 1.434, + "step": 5104 + }, + { + "epoch": 0.8736950196816704, + "grad_norm": 33.22725296020508, + "learning_rate": 2.9087278950370792e-05, + "loss": 6.1542, + "step": 5105 + }, + { + "epoch": 0.8738661646414513, + "grad_norm": 146.86790466308594, + "learning_rate": 2.9092983456930976e-05, + "loss": 9.0732, + "step": 5106 + }, + { + "epoch": 0.8740373096012323, + "grad_norm": 16.295713424682617, + "learning_rate": 2.909868796349116e-05, + "loss": 1.4125, + "step": 5107 + }, + { + "epoch": 0.8742084545610131, + "grad_norm": 14.710768699645996, + "learning_rate": 2.9104392470051342e-05, + "loss": 1.1648, + "step": 5108 + }, + { + "epoch": 0.8743795995207941, + "grad_norm": 12.489028930664062, + "learning_rate": 2.9110096976611522e-05, + "loss": 1.268, + "step": 5109 + }, + { + "epoch": 0.874550744480575, + "grad_norm": 13.691608428955078, + "learning_rate": 2.9115801483171706e-05, + "loss": 1.085, + "step": 5110 + }, + { + "epoch": 0.874721889440356, + "grad_norm": 23.710769653320312, + "learning_rate": 
2.912150598973189e-05, + "loss": 2.4187, + "step": 5111 + }, + { + "epoch": 0.8748930344001369, + "grad_norm": 25.6949405670166, + "learning_rate": 2.912721049629207e-05, + "loss": 2.5228, + "step": 5112 + }, + { + "epoch": 0.8750641793599179, + "grad_norm": 21.368337631225586, + "learning_rate": 2.9132915002852253e-05, + "loss": 1.8782, + "step": 5113 + }, + { + "epoch": 0.8752353243196987, + "grad_norm": 82.89339447021484, + "learning_rate": 2.9138619509412436e-05, + "loss": 8.207, + "step": 5114 + }, + { + "epoch": 0.8754064692794797, + "grad_norm": 16.71074676513672, + "learning_rate": 2.914432401597262e-05, + "loss": 1.4629, + "step": 5115 + }, + { + "epoch": 0.8755776142392606, + "grad_norm": 2.993272542953491, + "learning_rate": 2.91500285225328e-05, + "loss": 0.394, + "step": 5116 + }, + { + "epoch": 0.8757487591990416, + "grad_norm": 14.644696235656738, + "learning_rate": 2.9155733029092986e-05, + "loss": 1.3512, + "step": 5117 + }, + { + "epoch": 0.8759199041588225, + "grad_norm": 19.816692352294922, + "learning_rate": 2.916143753565317e-05, + "loss": 2.3304, + "step": 5118 + }, + { + "epoch": 0.8760910491186035, + "grad_norm": 33.53482437133789, + "learning_rate": 2.916714204221335e-05, + "loss": 6.1537, + "step": 5119 + }, + { + "epoch": 0.8762621940783843, + "grad_norm": 18.77507972717285, + "learning_rate": 2.9172846548773533e-05, + "loss": 1.5366, + "step": 5120 + }, + { + "epoch": 0.8764333390381653, + "grad_norm": 19.784685134887695, + "learning_rate": 2.9178551055333716e-05, + "loss": 2.1277, + "step": 5121 + }, + { + "epoch": 0.8766044839979462, + "grad_norm": 9.049237251281738, + "learning_rate": 2.9184255561893896e-05, + "loss": 0.6134, + "step": 5122 + }, + { + "epoch": 0.8767756289577272, + "grad_norm": 8.538041114807129, + "learning_rate": 2.918996006845408e-05, + "loss": 0.5802, + "step": 5123 + }, + { + "epoch": 0.8769467739175081, + "grad_norm": 26.41038703918457, + "learning_rate": 2.9195664575014263e-05, + "loss": 2.5314, + "step": 5124 + }, + { + "epoch": 0.8771179188772891, + "grad_norm": 9.25900936126709, + "learning_rate": 2.9201369081574446e-05, + "loss": 0.7571, + "step": 5125 + }, + { + "epoch": 0.87728906383707, + "grad_norm": 28.40367317199707, + "learning_rate": 2.9207073588134626e-05, + "loss": 2.4607, + "step": 5126 + }, + { + "epoch": 0.8774602087968509, + "grad_norm": 28.677005767822266, + "learning_rate": 2.921277809469481e-05, + "loss": 3.544, + "step": 5127 + }, + { + "epoch": 0.8776313537566318, + "grad_norm": 1.4200770854949951, + "learning_rate": 2.9218482601254993e-05, + "loss": 0.2095, + "step": 5128 + }, + { + "epoch": 0.8778024987164128, + "grad_norm": 18.276594161987305, + "learning_rate": 2.9224187107815173e-05, + "loss": 1.494, + "step": 5129 + }, + { + "epoch": 0.8779736436761937, + "grad_norm": 24.763029098510742, + "learning_rate": 2.9229891614375356e-05, + "loss": 2.4242, + "step": 5130 + }, + { + "epoch": 0.8781447886359747, + "grad_norm": 10.431611061096191, + "learning_rate": 2.923559612093554e-05, + "loss": 0.6153, + "step": 5131 + }, + { + "epoch": 0.8783159335957557, + "grad_norm": 58.08489990234375, + "learning_rate": 2.9241300627495723e-05, + "loss": 1.706, + "step": 5132 + }, + { + "epoch": 0.8784870785555365, + "grad_norm": 22.97351837158203, + "learning_rate": 2.9247005134055903e-05, + "loss": 2.2527, + "step": 5133 + }, + { + "epoch": 0.8786582235153175, + "grad_norm": 27.15216827392578, + "learning_rate": 2.9252709640616087e-05, + "loss": 2.759, + "step": 5134 + }, + { + "epoch": 0.8788293684750984, + "grad_norm": 
22.588558197021484, + "learning_rate": 2.925841414717627e-05, + "loss": 2.6227, + "step": 5135 + }, + { + "epoch": 0.8790005134348794, + "grad_norm": 33.35072326660156, + "learning_rate": 2.926411865373645e-05, + "loss": 1.2665, + "step": 5136 + }, + { + "epoch": 0.8791716583946603, + "grad_norm": 41.13359451293945, + "learning_rate": 2.9269823160296633e-05, + "loss": 5.9406, + "step": 5137 + }, + { + "epoch": 0.8793428033544413, + "grad_norm": 23.982492446899414, + "learning_rate": 2.9275527666856817e-05, + "loss": 2.9734, + "step": 5138 + }, + { + "epoch": 0.8795139483142221, + "grad_norm": 16.065969467163086, + "learning_rate": 2.9281232173417e-05, + "loss": 1.4901, + "step": 5139 + }, + { + "epoch": 0.8796850932740031, + "grad_norm": 29.961244583129883, + "learning_rate": 2.9286936679977183e-05, + "loss": 1.5246, + "step": 5140 + }, + { + "epoch": 0.879856238233784, + "grad_norm": 33.30720138549805, + "learning_rate": 2.9292641186537367e-05, + "loss": 5.9995, + "step": 5141 + }, + { + "epoch": 0.880027383193565, + "grad_norm": 24.118555068969727, + "learning_rate": 2.929834569309755e-05, + "loss": 2.6075, + "step": 5142 + }, + { + "epoch": 0.8801985281533459, + "grad_norm": 19.221567153930664, + "learning_rate": 2.930405019965773e-05, + "loss": 1.7106, + "step": 5143 + }, + { + "epoch": 0.8803696731131269, + "grad_norm": 22.26023292541504, + "learning_rate": 2.9309754706217914e-05, + "loss": 2.5151, + "step": 5144 + }, + { + "epoch": 0.8805408180729077, + "grad_norm": 13.728766441345215, + "learning_rate": 2.9315459212778097e-05, + "loss": 1.1806, + "step": 5145 + }, + { + "epoch": 0.8807119630326887, + "grad_norm": 20.878938674926758, + "learning_rate": 2.932116371933828e-05, + "loss": 2.107, + "step": 5146 + }, + { + "epoch": 0.8808831079924696, + "grad_norm": 21.325651168823242, + "learning_rate": 2.932686822589846e-05, + "loss": 2.1829, + "step": 5147 + }, + { + "epoch": 0.8810542529522506, + "grad_norm": 1.0309278964996338, + "learning_rate": 2.9332572732458644e-05, + "loss": 0.1782, + "step": 5148 + }, + { + "epoch": 0.8812253979120315, + "grad_norm": 4.816126823425293, + "learning_rate": 2.9338277239018827e-05, + "loss": 0.3899, + "step": 5149 + }, + { + "epoch": 0.8813965428718125, + "grad_norm": 1.190704584121704, + "learning_rate": 2.9343981745579007e-05, + "loss": 0.2021, + "step": 5150 + }, + { + "epoch": 0.8815676878315933, + "grad_norm": 21.205900192260742, + "learning_rate": 2.934968625213919e-05, + "loss": 2.7627, + "step": 5151 + }, + { + "epoch": 0.8817388327913743, + "grad_norm": 1.5020864009857178, + "learning_rate": 2.9355390758699374e-05, + "loss": 0.2301, + "step": 5152 + }, + { + "epoch": 0.8819099777511552, + "grad_norm": 0.8392736315727234, + "learning_rate": 2.9361095265259554e-05, + "loss": 0.1906, + "step": 5153 + }, + { + "epoch": 0.8820811227109362, + "grad_norm": 0.8165884017944336, + "learning_rate": 2.9366799771819737e-05, + "loss": 0.1924, + "step": 5154 + }, + { + "epoch": 0.8822522676707171, + "grad_norm": 16.63069725036621, + "learning_rate": 2.937250427837992e-05, + "loss": 1.2813, + "step": 5155 + }, + { + "epoch": 0.8824234126304981, + "grad_norm": 8.837629318237305, + "learning_rate": 2.9378208784940104e-05, + "loss": 0.5551, + "step": 5156 + }, + { + "epoch": 0.8825945575902789, + "grad_norm": 30.035585403442383, + "learning_rate": 2.9383913291500284e-05, + "loss": 5.6298, + "step": 5157 + }, + { + "epoch": 0.8827657025500599, + "grad_norm": 18.92460060119629, + "learning_rate": 2.9389617798060467e-05, + "loss": 2.1136, + "step": 5158 + }, + { 
+ "epoch": 0.8829368475098408, + "grad_norm": 18.019941329956055, + "learning_rate": 2.939532230462065e-05, + "loss": 1.701, + "step": 5159 + }, + { + "epoch": 0.8831079924696218, + "grad_norm": 25.73262596130371, + "learning_rate": 2.940102681118083e-05, + "loss": 2.4158, + "step": 5160 + }, + { + "epoch": 0.8832791374294027, + "grad_norm": 30.859712600708008, + "learning_rate": 2.9406731317741014e-05, + "loss": 5.5833, + "step": 5161 + }, + { + "epoch": 0.8834502823891837, + "grad_norm": 18.802223205566406, + "learning_rate": 2.9412435824301197e-05, + "loss": 1.9511, + "step": 5162 + }, + { + "epoch": 0.8836214273489645, + "grad_norm": 5.314499855041504, + "learning_rate": 2.9418140330861384e-05, + "loss": 0.4971, + "step": 5163 + }, + { + "epoch": 0.8837925723087455, + "grad_norm": 27.17005157470703, + "learning_rate": 2.9423844837421564e-05, + "loss": 3.7148, + "step": 5164 + }, + { + "epoch": 0.8839637172685264, + "grad_norm": 25.687992095947266, + "learning_rate": 2.9429549343981748e-05, + "loss": 3.388, + "step": 5165 + }, + { + "epoch": 0.8841348622283074, + "grad_norm": 29.165775299072266, + "learning_rate": 2.943525385054193e-05, + "loss": 3.7435, + "step": 5166 + }, + { + "epoch": 0.8843060071880883, + "grad_norm": 21.13896942138672, + "learning_rate": 2.944095835710211e-05, + "loss": 1.7033, + "step": 5167 + }, + { + "epoch": 0.8844771521478693, + "grad_norm": 14.15404224395752, + "learning_rate": 2.9446662863662294e-05, + "loss": 1.156, + "step": 5168 + }, + { + "epoch": 0.8846482971076501, + "grad_norm": 27.660737991333008, + "learning_rate": 2.9452367370222478e-05, + "loss": 1.8129, + "step": 5169 + }, + { + "epoch": 0.8848194420674311, + "grad_norm": 24.719099044799805, + "learning_rate": 2.945807187678266e-05, + "loss": 2.121, + "step": 5170 + }, + { + "epoch": 0.884990587027212, + "grad_norm": 21.35886001586914, + "learning_rate": 2.946377638334284e-05, + "loss": 2.0431, + "step": 5171 + }, + { + "epoch": 0.885161731986993, + "grad_norm": 22.36219596862793, + "learning_rate": 2.9469480889903024e-05, + "loss": 2.134, + "step": 5172 + }, + { + "epoch": 0.8853328769467739, + "grad_norm": 24.10101890563965, + "learning_rate": 2.9475185396463208e-05, + "loss": 2.071, + "step": 5173 + }, + { + "epoch": 0.8855040219065549, + "grad_norm": 18.05703353881836, + "learning_rate": 2.9480889903023388e-05, + "loss": 1.558, + "step": 5174 + }, + { + "epoch": 0.8856751668663357, + "grad_norm": 26.822391510009766, + "learning_rate": 2.948659440958357e-05, + "loss": 2.3976, + "step": 5175 + }, + { + "epoch": 0.8858463118261167, + "grad_norm": 18.833152770996094, + "learning_rate": 2.9492298916143755e-05, + "loss": 1.6977, + "step": 5176 + }, + { + "epoch": 0.8860174567858976, + "grad_norm": 19.91875648498535, + "learning_rate": 2.9498003422703938e-05, + "loss": 1.5125, + "step": 5177 + }, + { + "epoch": 0.8861886017456786, + "grad_norm": 20.179113388061523, + "learning_rate": 2.9503707929264118e-05, + "loss": 1.9563, + "step": 5178 + }, + { + "epoch": 0.8863597467054595, + "grad_norm": 22.212738037109375, + "learning_rate": 2.95094124358243e-05, + "loss": 1.8149, + "step": 5179 + }, + { + "epoch": 0.8865308916652405, + "grad_norm": 19.415225982666016, + "learning_rate": 2.9515116942384485e-05, + "loss": 1.6777, + "step": 5180 + }, + { + "epoch": 0.8867020366250213, + "grad_norm": 31.313318252563477, + "learning_rate": 2.9520821448944665e-05, + "loss": 4.1531, + "step": 5181 + }, + { + "epoch": 0.8868731815848023, + "grad_norm": 34.813720703125, + "learning_rate": 2.9526525955504848e-05, + 
"loss": 5.9337, + "step": 5182 + }, + { + "epoch": 0.8870443265445833, + "grad_norm": 5.619294166564941, + "learning_rate": 2.953223046206503e-05, + "loss": 0.4206, + "step": 5183 + }, + { + "epoch": 0.8872154715043642, + "grad_norm": 18.10093116760254, + "learning_rate": 2.953793496862521e-05, + "loss": 1.8818, + "step": 5184 + }, + { + "epoch": 0.8873866164641452, + "grad_norm": 22.031768798828125, + "learning_rate": 2.9543639475185398e-05, + "loss": 2.6025, + "step": 5185 + }, + { + "epoch": 0.8875577614239261, + "grad_norm": 19.005178451538086, + "learning_rate": 2.954934398174558e-05, + "loss": 2.3032, + "step": 5186 + }, + { + "epoch": 0.887728906383707, + "grad_norm": 2.4968888759613037, + "learning_rate": 2.9555048488305765e-05, + "loss": 0.2314, + "step": 5187 + }, + { + "epoch": 0.8879000513434879, + "grad_norm": 15.055726051330566, + "learning_rate": 2.9560752994865945e-05, + "loss": 1.2134, + "step": 5188 + }, + { + "epoch": 0.8880711963032689, + "grad_norm": 16.972787857055664, + "learning_rate": 2.956645750142613e-05, + "loss": 1.5495, + "step": 5189 + }, + { + "epoch": 0.8882423412630498, + "grad_norm": 5.097226142883301, + "learning_rate": 2.9572162007986312e-05, + "loss": 0.4263, + "step": 5190 + }, + { + "epoch": 0.8884134862228308, + "grad_norm": 23.945755004882812, + "learning_rate": 2.9577866514546492e-05, + "loss": 1.9982, + "step": 5191 + }, + { + "epoch": 0.8885846311826117, + "grad_norm": 18.41358184814453, + "learning_rate": 2.9583571021106675e-05, + "loss": 1.5775, + "step": 5192 + }, + { + "epoch": 0.8887557761423927, + "grad_norm": 20.26495361328125, + "learning_rate": 2.958927552766686e-05, + "loss": 2.0699, + "step": 5193 + }, + { + "epoch": 0.8889269211021735, + "grad_norm": 21.891618728637695, + "learning_rate": 2.9594980034227042e-05, + "loss": 1.8668, + "step": 5194 + }, + { + "epoch": 0.8890980660619545, + "grad_norm": 18.51753807067871, + "learning_rate": 2.9600684540787222e-05, + "loss": 1.7949, + "step": 5195 + }, + { + "epoch": 0.8892692110217354, + "grad_norm": 21.540264129638672, + "learning_rate": 2.9606389047347405e-05, + "loss": 2.2159, + "step": 5196 + }, + { + "epoch": 0.8894403559815164, + "grad_norm": 25.46014976501465, + "learning_rate": 2.961209355390759e-05, + "loss": 2.5065, + "step": 5197 + }, + { + "epoch": 0.8896115009412973, + "grad_norm": 37.4268798828125, + "learning_rate": 2.961779806046777e-05, + "loss": 1.9917, + "step": 5198 + }, + { + "epoch": 0.8897826459010783, + "grad_norm": 25.361825942993164, + "learning_rate": 2.9623502567027952e-05, + "loss": 2.6214, + "step": 5199 + }, + { + "epoch": 0.8899537908608591, + "grad_norm": 8.111501693725586, + "learning_rate": 2.9629207073588135e-05, + "loss": 0.8144, + "step": 5200 + }, + { + "epoch": 0.8901249358206401, + "grad_norm": 18.618261337280273, + "learning_rate": 2.963491158014832e-05, + "loss": 1.9664, + "step": 5201 + }, + { + "epoch": 0.890296080780421, + "grad_norm": 17.81608009338379, + "learning_rate": 2.96406160867085e-05, + "loss": 1.5596, + "step": 5202 + }, + { + "epoch": 0.890467225740202, + "grad_norm": 5.169036388397217, + "learning_rate": 2.9646320593268682e-05, + "loss": 0.4681, + "step": 5203 + }, + { + "epoch": 0.8906383706999829, + "grad_norm": 21.34773826599121, + "learning_rate": 2.9652025099828865e-05, + "loss": 2.1088, + "step": 5204 + }, + { + "epoch": 0.8908095156597639, + "grad_norm": 13.635762214660645, + "learning_rate": 2.9657729606389045e-05, + "loss": 1.147, + "step": 5205 + }, + { + "epoch": 0.8909806606195447, + "grad_norm": 21.587596893310547, + 
"learning_rate": 2.966343411294923e-05, + "loss": 2.0986, + "step": 5206 + }, + { + "epoch": 0.8911518055793257, + "grad_norm": 1.29149329662323, + "learning_rate": 2.9669138619509412e-05, + "loss": 0.2093, + "step": 5207 + }, + { + "epoch": 0.8913229505391066, + "grad_norm": 20.227731704711914, + "learning_rate": 2.96748431260696e-05, + "loss": 1.682, + "step": 5208 + }, + { + "epoch": 0.8914940954988876, + "grad_norm": 12.934089660644531, + "learning_rate": 2.968054763262978e-05, + "loss": 0.6278, + "step": 5209 + }, + { + "epoch": 0.8916652404586685, + "grad_norm": 18.040390014648438, + "learning_rate": 2.9686252139189962e-05, + "loss": 1.561, + "step": 5210 + }, + { + "epoch": 0.8918363854184495, + "grad_norm": 24.204835891723633, + "learning_rate": 2.9691956645750146e-05, + "loss": 1.7786, + "step": 5211 + }, + { + "epoch": 0.8920075303782303, + "grad_norm": 23.571611404418945, + "learning_rate": 2.9697661152310326e-05, + "loss": 2.0676, + "step": 5212 + }, + { + "epoch": 0.8921786753380113, + "grad_norm": 22.09473991394043, + "learning_rate": 2.970336565887051e-05, + "loss": 2.099, + "step": 5213 + }, + { + "epoch": 0.8923498202977922, + "grad_norm": 1.3482792377471924, + "learning_rate": 2.9709070165430692e-05, + "loss": 0.2145, + "step": 5214 + }, + { + "epoch": 0.8925209652575732, + "grad_norm": 21.764923095703125, + "learning_rate": 2.9714774671990872e-05, + "loss": 2.0434, + "step": 5215 + }, + { + "epoch": 0.8926921102173541, + "grad_norm": 19.195314407348633, + "learning_rate": 2.9720479178551056e-05, + "loss": 1.4061, + "step": 5216 + }, + { + "epoch": 0.892863255177135, + "grad_norm": 87.61866760253906, + "learning_rate": 2.972618368511124e-05, + "loss": 7.9821, + "step": 5217 + }, + { + "epoch": 0.8930344001369159, + "grad_norm": 19.71204376220703, + "learning_rate": 2.9731888191671423e-05, + "loss": 1.7201, + "step": 5218 + }, + { + "epoch": 0.8932055450966969, + "grad_norm": 25.464221954345703, + "learning_rate": 2.9737592698231603e-05, + "loss": 2.8268, + "step": 5219 + }, + { + "epoch": 0.8933766900564778, + "grad_norm": 1.361387014389038, + "learning_rate": 2.9743297204791786e-05, + "loss": 0.2127, + "step": 5220 + }, + { + "epoch": 0.8935478350162588, + "grad_norm": 18.932161331176758, + "learning_rate": 2.974900171135197e-05, + "loss": 1.7737, + "step": 5221 + }, + { + "epoch": 0.8937189799760397, + "grad_norm": 6.523068904876709, + "learning_rate": 2.975470621791215e-05, + "loss": 0.5011, + "step": 5222 + }, + { + "epoch": 0.8938901249358207, + "grad_norm": 21.28632926940918, + "learning_rate": 2.9760410724472333e-05, + "loss": 2.0839, + "step": 5223 + }, + { + "epoch": 0.8940612698956015, + "grad_norm": 2.508774995803833, + "learning_rate": 2.9766115231032516e-05, + "loss": 0.2127, + "step": 5224 + }, + { + "epoch": 0.8942324148553825, + "grad_norm": 22.384605407714844, + "learning_rate": 2.97718197375927e-05, + "loss": 1.7595, + "step": 5225 + }, + { + "epoch": 0.8944035598151634, + "grad_norm": 13.072036743164062, + "learning_rate": 2.977752424415288e-05, + "loss": 1.0962, + "step": 5226 + }, + { + "epoch": 0.8945747047749444, + "grad_norm": 27.329818725585938, + "learning_rate": 2.9783228750713063e-05, + "loss": 2.492, + "step": 5227 + }, + { + "epoch": 0.8947458497347253, + "grad_norm": 0.9053159952163696, + "learning_rate": 2.9788933257273246e-05, + "loss": 0.1874, + "step": 5228 + }, + { + "epoch": 0.8949169946945063, + "grad_norm": 17.836803436279297, + "learning_rate": 2.9794637763833426e-05, + "loss": 1.5566, + "step": 5229 + }, + { + "epoch": 
0.8950881396542871, + "grad_norm": 15.548909187316895, + "learning_rate": 2.980034227039361e-05, + "loss": 1.2955, + "step": 5230 + }, + { + "epoch": 0.8952592846140681, + "grad_norm": 19.422529220581055, + "learning_rate": 2.9806046776953796e-05, + "loss": 2.5122, + "step": 5231 + }, + { + "epoch": 0.895430429573849, + "grad_norm": 0.8730620741844177, + "learning_rate": 2.981175128351398e-05, + "loss": 0.1783, + "step": 5232 + }, + { + "epoch": 0.89560157453363, + "grad_norm": 22.175594329833984, + "learning_rate": 2.981745579007416e-05, + "loss": 1.6344, + "step": 5233 + }, + { + "epoch": 0.895772719493411, + "grad_norm": 4.08912467956543, + "learning_rate": 2.9823160296634343e-05, + "loss": 0.3297, + "step": 5234 + }, + { + "epoch": 0.8959438644531919, + "grad_norm": 20.610801696777344, + "learning_rate": 2.9828864803194527e-05, + "loss": 2.0051, + "step": 5235 + }, + { + "epoch": 0.8961150094129728, + "grad_norm": 19.67643165588379, + "learning_rate": 2.9834569309754706e-05, + "loss": 1.281, + "step": 5236 + }, + { + "epoch": 0.8962861543727537, + "grad_norm": 22.13687515258789, + "learning_rate": 2.984027381631489e-05, + "loss": 2.5087, + "step": 5237 + }, + { + "epoch": 0.8964572993325347, + "grad_norm": 5.210666656494141, + "learning_rate": 2.9845978322875073e-05, + "loss": 0.4318, + "step": 5238 + }, + { + "epoch": 0.8966284442923156, + "grad_norm": 31.837879180908203, + "learning_rate": 2.9851682829435257e-05, + "loss": 3.9733, + "step": 5239 + }, + { + "epoch": 0.8967995892520966, + "grad_norm": 24.229183197021484, + "learning_rate": 2.9857387335995437e-05, + "loss": 2.0104, + "step": 5240 + }, + { + "epoch": 0.8969707342118775, + "grad_norm": 62.011810302734375, + "learning_rate": 2.986309184255562e-05, + "loss": 7.4416, + "step": 5241 + }, + { + "epoch": 0.8971418791716584, + "grad_norm": 23.459850311279297, + "learning_rate": 2.9868796349115803e-05, + "loss": 2.7344, + "step": 5242 + }, + { + "epoch": 0.8973130241314393, + "grad_norm": 106.41157531738281, + "learning_rate": 2.9874500855675983e-05, + "loss": 8.8184, + "step": 5243 + }, + { + "epoch": 0.8974841690912203, + "grad_norm": 18.093460083007812, + "learning_rate": 2.9880205362236167e-05, + "loss": 1.3483, + "step": 5244 + }, + { + "epoch": 0.8976553140510012, + "grad_norm": 6.958301544189453, + "learning_rate": 2.988590986879635e-05, + "loss": 0.5888, + "step": 5245 + }, + { + "epoch": 0.8978264590107822, + "grad_norm": 28.051706314086914, + "learning_rate": 2.9891614375356534e-05, + "loss": 3.8837, + "step": 5246 + }, + { + "epoch": 0.897997603970563, + "grad_norm": 16.499065399169922, + "learning_rate": 2.9897318881916713e-05, + "loss": 1.548, + "step": 5247 + }, + { + "epoch": 0.898168748930344, + "grad_norm": 24.6423282623291, + "learning_rate": 2.9903023388476897e-05, + "loss": 2.0936, + "step": 5248 + }, + { + "epoch": 0.8983398938901249, + "grad_norm": 23.600177764892578, + "learning_rate": 2.990872789503708e-05, + "loss": 2.4881, + "step": 5249 + }, + { + "epoch": 0.8985110388499059, + "grad_norm": 6.3298163414001465, + "learning_rate": 2.991443240159726e-05, + "loss": 0.8614, + "step": 5250 + }, + { + "epoch": 0.8986821838096868, + "grad_norm": 84.1204833984375, + "learning_rate": 2.9920136908157444e-05, + "loss": 7.2037, + "step": 5251 + }, + { + "epoch": 0.8988533287694678, + "grad_norm": 13.504063606262207, + "learning_rate": 2.9925841414717627e-05, + "loss": 1.2107, + "step": 5252 + }, + { + "epoch": 0.8990244737292487, + "grad_norm": 10.01652717590332, + "learning_rate": 2.9931545921277807e-05, + "loss": 
0.6106, + "step": 5253 + }, + { + "epoch": 0.8991956186890296, + "grad_norm": 139.29315185546875, + "learning_rate": 2.9937250427837994e-05, + "loss": 8.8993, + "step": 5254 + }, + { + "epoch": 0.8993667636488105, + "grad_norm": 31.561298370361328, + "learning_rate": 2.9942954934398177e-05, + "loss": 3.2313, + "step": 5255 + }, + { + "epoch": 0.8995379086085915, + "grad_norm": 2.1672093868255615, + "learning_rate": 2.994865944095836e-05, + "loss": 0.2206, + "step": 5256 + }, + { + "epoch": 0.8997090535683724, + "grad_norm": 6.4866414070129395, + "learning_rate": 2.995436394751854e-05, + "loss": 0.5123, + "step": 5257 + }, + { + "epoch": 0.8998801985281534, + "grad_norm": 12.993927955627441, + "learning_rate": 2.9960068454078724e-05, + "loss": 1.0112, + "step": 5258 + }, + { + "epoch": 0.9000513434879343, + "grad_norm": 27.867324829101562, + "learning_rate": 2.9965772960638907e-05, + "loss": 3.0155, + "step": 5259 + }, + { + "epoch": 0.9002224884477152, + "grad_norm": 27.61526870727539, + "learning_rate": 2.9971477467199087e-05, + "loss": 3.0738, + "step": 5260 + }, + { + "epoch": 0.9003936334074961, + "grad_norm": 38.995018005371094, + "learning_rate": 2.997718197375927e-05, + "loss": 1.5856, + "step": 5261 + }, + { + "epoch": 0.9005647783672771, + "grad_norm": 1.1415177583694458, + "learning_rate": 2.9982886480319454e-05, + "loss": 0.2026, + "step": 5262 + }, + { + "epoch": 0.9005647783672771, + "eval_nli-pairs_loss": 1.9112855195999146, + "eval_nli-pairs_runtime": 4.5969, + "eval_nli-pairs_samples_per_second": 43.508, + "eval_nli-pairs_steps_per_second": 1.523, + "eval_sts-test_pearson_cosine": 0.7624463325036241, + "eval_sts-test_pearson_dot": 0.6376176200770809, + "eval_sts-test_pearson_euclidean": 0.7595445366220308, + "eval_sts-test_pearson_manhattan": 0.7665949852672425, + "eval_sts-test_pearson_max": 0.7665949852672425, + "eval_sts-test_spearman_cosine": 0.7585611029472056, + "eval_sts-test_spearman_dot": 0.6199386840754815, + "eval_sts-test_spearman_euclidean": 0.7459963199554185, + "eval_sts-test_spearman_manhattan": 0.755338152133313, + "eval_sts-test_spearman_max": 0.7585611029472056, + "step": 5262 + }, + { + "epoch": 0.9005647783672771, + "eval_vitaminc-pairs_loss": 1.1131478548049927, + "eval_vitaminc-pairs_runtime": 2.815, + "eval_vitaminc-pairs_samples_per_second": 71.047, + "eval_vitaminc-pairs_steps_per_second": 2.487, + "step": 5262 + }, + { + "epoch": 0.9005647783672771, + "eval_qnli-contrastive_loss": 2.1876909732818604, + "eval_qnli-contrastive_runtime": 0.6866, + "eval_qnli-contrastive_samples_per_second": 291.295, + "eval_qnli-contrastive_steps_per_second": 10.195, + "step": 5262 + }, + { + "epoch": 0.9005647783672771, + "eval_scitail-pairs-qa_loss": 0.16065949201583862, + "eval_scitail-pairs-qa_runtime": 1.7154, + "eval_scitail-pairs-qa_samples_per_second": 116.59, + "eval_scitail-pairs-qa_steps_per_second": 4.081, + "step": 5262 + }, + { + "epoch": 0.9005647783672771, + "eval_scitail-pairs-pos_loss": 0.7642461061477661, + "eval_scitail-pairs-pos_runtime": 3.0222, + "eval_scitail-pairs-pos_samples_per_second": 66.177, + "eval_scitail-pairs-pos_steps_per_second": 2.316, + "step": 5262 + }, + { + "epoch": 0.9005647783672771, + "eval_xsum-pairs_loss": 0.9787455797195435, + "eval_xsum-pairs_runtime": 2.6515, + "eval_xsum-pairs_samples_per_second": 66.001, + "eval_xsum-pairs_steps_per_second": 2.263, + "step": 5262 + }, + { + "epoch": 0.9005647783672771, + "eval_compression-pairs_loss": 0.35531559586524963, + "eval_compression-pairs_runtime": 0.519, + 
"eval_compression-pairs_samples_per_second": 385.368, + "eval_compression-pairs_steps_per_second": 13.488, + "step": 5262 + }, + { + "epoch": 0.9005647783672771, + "eval_sciq_pairs_loss": 5.063950538635254, + "eval_sciq_pairs_runtime": 9.7486, + "eval_sciq_pairs_samples_per_second": 20.516, + "eval_sciq_pairs_steps_per_second": 0.718, + "step": 5262 + }, + { + "epoch": 0.9005647783672771, + "eval_qasc_pairs_loss": 6.048434257507324, + "eval_qasc_pairs_runtime": 2.9175, + "eval_qasc_pairs_samples_per_second": 68.551, + "eval_qasc_pairs_steps_per_second": 2.399, + "step": 5262 + }, + { + "epoch": 0.9005647783672771, + "eval_openbookqa_pairs_loss": 3.0121548175811768, + "eval_openbookqa_pairs_runtime": 0.6555, + "eval_openbookqa_pairs_samples_per_second": 105.258, + "eval_openbookqa_pairs_steps_per_second": 4.576, + "step": 5262 + }, + { + "epoch": 0.9005647783672771, + "eval_msmarco_pairs_loss": 1.5317801237106323, + "eval_msmarco_pairs_runtime": 3.9921, + "eval_msmarco_pairs_samples_per_second": 50.099, + "eval_msmarco_pairs_steps_per_second": 1.753, + "step": 5262 + }, + { + "epoch": 0.9005647783672771, + "eval_nq_pairs_loss": 1.7686961889266968, + "eval_nq_pairs_runtime": 8.7237, + "eval_nq_pairs_samples_per_second": 22.926, + "eval_nq_pairs_steps_per_second": 0.802, + "step": 5262 + }, + { + "epoch": 0.9005647783672771, + "eval_trivia_pairs_loss": 2.2582404613494873, + "eval_trivia_pairs_runtime": 12.9183, + "eval_trivia_pairs_samples_per_second": 15.482, + "eval_trivia_pairs_steps_per_second": 0.542, + "step": 5262 + }, + { + "epoch": 0.9005647783672771, + "eval_quora_pairs_loss": 0.30566632747650146, + "eval_quora_pairs_runtime": 1.5883, + "eval_quora_pairs_samples_per_second": 125.918, + "eval_quora_pairs_steps_per_second": 4.407, + "step": 5262 + }, + { + "epoch": 0.9005647783672771, + "eval_gooaq_pairs_loss": 1.1452974081039429, + "eval_gooaq_pairs_runtime": 2.6271, + "eval_gooaq_pairs_samples_per_second": 76.128, + "eval_gooaq_pairs_steps_per_second": 2.664, + "step": 5262 + }, + { + "epoch": 0.900735923327058, + "grad_norm": 17.04757308959961, + "learning_rate": 2.9988590986879637e-05, + "loss": 1.4081, + "step": 5263 + }, + { + "epoch": 0.900907068286839, + "grad_norm": 30.77292823791504, + "learning_rate": 2.9994295493439817e-05, + "loss": 5.9519, + "step": 5264 + }, + { + "epoch": 0.9010782132466199, + "grad_norm": 18.369760513305664, + "learning_rate": 3e-05, + "loss": 1.7582, + "step": 5265 + }, + { + "epoch": 0.9012493582064008, + "grad_norm": 33.92514419555664, + "learning_rate": 2.999999557498883e-05, + "loss": 5.7165, + "step": 5266 + }, + { + "epoch": 0.9014205031661817, + "grad_norm": 25.037368774414062, + "learning_rate": 2.999998229995793e-05, + "loss": 3.4615, + "step": 5267 + }, + { + "epoch": 0.9015916481259627, + "grad_norm": 1.5256587266921997, + "learning_rate": 2.9999960174915127e-05, + "loss": 0.2069, + "step": 5268 + }, + { + "epoch": 0.9017627930857436, + "grad_norm": 16.602886199951172, + "learning_rate": 2.999992919987348e-05, + "loss": 1.3721, + "step": 5269 + }, + { + "epoch": 0.9019339380455246, + "grad_norm": 16.512067794799805, + "learning_rate": 2.9999889374851267e-05, + "loss": 1.4327, + "step": 5270 + }, + { + "epoch": 0.9021050830053055, + "grad_norm": 36.09376525878906, + "learning_rate": 2.999984069987198e-05, + "loss": 5.6646, + "step": 5271 + }, + { + "epoch": 0.9022762279650864, + "grad_norm": 18.73316192626953, + "learning_rate": 2.9999783174964336e-05, + "loss": 1.9157, + "step": 5272 + }, + { + "epoch": 0.9024473729248673, + "grad_norm": 
24.654346466064453, + "learning_rate": 2.9999716800162275e-05, + "loss": 2.7708, + "step": 5273 + }, + { + "epoch": 0.9026185178846483, + "grad_norm": 7.934967517852783, + "learning_rate": 2.9999641575504964e-05, + "loss": 0.9629, + "step": 5274 + }, + { + "epoch": 0.9027896628444292, + "grad_norm": 13.806313514709473, + "learning_rate": 2.9999557501036782e-05, + "loss": 1.181, + "step": 5275 + }, + { + "epoch": 0.9029608078042102, + "grad_norm": 21.018030166625977, + "learning_rate": 2.999946457680733e-05, + "loss": 2.4935, + "step": 5276 + }, + { + "epoch": 0.903131952763991, + "grad_norm": 27.857648849487305, + "learning_rate": 2.999936280287144e-05, + "loss": 3.6943, + "step": 5277 + }, + { + "epoch": 0.903303097723772, + "grad_norm": 16.297943115234375, + "learning_rate": 2.9999252179289158e-05, + "loss": 1.3468, + "step": 5278 + }, + { + "epoch": 0.9034742426835529, + "grad_norm": 24.035676956176758, + "learning_rate": 2.9999132706125743e-05, + "loss": 2.1091, + "step": 5279 + }, + { + "epoch": 0.9036453876433339, + "grad_norm": 26.095598220825195, + "learning_rate": 2.9999004383451696e-05, + "loss": 5.4428, + "step": 5280 + }, + { + "epoch": 0.9038165326031148, + "grad_norm": 8.601061820983887, + "learning_rate": 2.9998867211342718e-05, + "loss": 0.7404, + "step": 5281 + }, + { + "epoch": 0.9039876775628958, + "grad_norm": 95.90294647216797, + "learning_rate": 2.999872118987975e-05, + "loss": 7.0901, + "step": 5282 + }, + { + "epoch": 0.9041588225226767, + "grad_norm": 20.32706069946289, + "learning_rate": 2.9998566319148938e-05, + "loss": 2.4138, + "step": 5283 + }, + { + "epoch": 0.9043299674824576, + "grad_norm": 16.315454483032227, + "learning_rate": 2.9998402599241654e-05, + "loss": 1.3743, + "step": 5284 + }, + { + "epoch": 0.9045011124422386, + "grad_norm": 28.84542465209961, + "learning_rate": 2.99982300302545e-05, + "loss": 5.7125, + "step": 5285 + }, + { + "epoch": 0.9046722574020195, + "grad_norm": 17.301790237426758, + "learning_rate": 2.9998048612289287e-05, + "loss": 1.1816, + "step": 5286 + }, + { + "epoch": 0.9048434023618005, + "grad_norm": 21.75240135192871, + "learning_rate": 2.9997858345453058e-05, + "loss": 1.8483, + "step": 5287 + }, + { + "epoch": 0.9050145473215814, + "grad_norm": 12.841358184814453, + "learning_rate": 2.9997659229858064e-05, + "loss": 1.3388, + "step": 5288 + }, + { + "epoch": 0.9051856922813624, + "grad_norm": 20.186582565307617, + "learning_rate": 2.999745126562179e-05, + "loss": 1.8119, + "step": 5289 + }, + { + "epoch": 0.9053568372411432, + "grad_norm": 18.223058700561523, + "learning_rate": 2.9997234452866925e-05, + "loss": 1.2181, + "step": 5290 + }, + { + "epoch": 0.9055279822009242, + "grad_norm": 8.343592643737793, + "learning_rate": 2.9997008791721397e-05, + "loss": 0.4821, + "step": 5291 + }, + { + "epoch": 0.9056991271607051, + "grad_norm": 23.329736709594727, + "learning_rate": 2.9996774282318344e-05, + "loss": 1.6308, + "step": 5292 + }, + { + "epoch": 0.9058702721204861, + "grad_norm": 18.162965774536133, + "learning_rate": 2.9996530924796127e-05, + "loss": 1.6218, + "step": 5293 + }, + { + "epoch": 0.906041417080267, + "grad_norm": 12.122421264648438, + "learning_rate": 2.999627871929833e-05, + "loss": 1.0583, + "step": 5294 + }, + { + "epoch": 0.906212562040048, + "grad_norm": 84.63624572753906, + "learning_rate": 2.999601766597375e-05, + "loss": 7.7833, + "step": 5295 + }, + { + "epoch": 0.9063837069998288, + "grad_norm": 8.922574043273926, + "learning_rate": 2.9995747764976414e-05, + "loss": 0.567, + "step": 5296 + }, + { 
+ "epoch": 0.9065548519596098, + "grad_norm": 50.66462707519531, + "learning_rate": 2.999546901646556e-05, + "loss": 2.4272, + "step": 5297 + }, + { + "epoch": 0.9067259969193907, + "grad_norm": 25.654756546020508, + "learning_rate": 2.9995181420605653e-05, + "loss": 3.151, + "step": 5298 + }, + { + "epoch": 0.9068971418791717, + "grad_norm": 20.01262664794922, + "learning_rate": 2.9994884977566372e-05, + "loss": 2.04, + "step": 5299 + }, + { + "epoch": 0.9070682868389526, + "grad_norm": 20.105493545532227, + "learning_rate": 2.9994579687522615e-05, + "loss": 2.3112, + "step": 5300 + }, + { + "epoch": 0.9072394317987336, + "grad_norm": 18.445158004760742, + "learning_rate": 2.9994265550654512e-05, + "loss": 1.6806, + "step": 5301 + }, + { + "epoch": 0.9074105767585144, + "grad_norm": 18.450517654418945, + "learning_rate": 2.9993942567147402e-05, + "loss": 1.6844, + "step": 5302 + }, + { + "epoch": 0.9075817217182954, + "grad_norm": 1.4754540920257568, + "learning_rate": 2.999361073719184e-05, + "loss": 0.2101, + "step": 5303 + }, + { + "epoch": 0.9077528666780763, + "grad_norm": 1.742583990097046, + "learning_rate": 2.999327006098362e-05, + "loss": 0.2184, + "step": 5304 + }, + { + "epoch": 0.9079240116378573, + "grad_norm": 55.38993835449219, + "learning_rate": 2.9992920538723722e-05, + "loss": 7.8441, + "step": 5305 + }, + { + "epoch": 0.9080951565976382, + "grad_norm": 18.097368240356445, + "learning_rate": 2.999256217061838e-05, + "loss": 2.0351, + "step": 5306 + }, + { + "epoch": 0.9082663015574192, + "grad_norm": 18.082368850708008, + "learning_rate": 2.9992194956879027e-05, + "loss": 1.5539, + "step": 5307 + }, + { + "epoch": 0.9084374465172, + "grad_norm": 25.358264923095703, + "learning_rate": 2.9991818897722315e-05, + "loss": 2.4061, + "step": 5308 + }, + { + "epoch": 0.908608591476981, + "grad_norm": 25.521377563476562, + "learning_rate": 2.9991433993370126e-05, + "loss": 2.4613, + "step": 5309 + }, + { + "epoch": 0.9087797364367619, + "grad_norm": 18.244693756103516, + "learning_rate": 2.9991040244049556e-05, + "loss": 1.9235, + "step": 5310 + }, + { + "epoch": 0.9089508813965429, + "grad_norm": 1.618678331375122, + "learning_rate": 2.999063764999291e-05, + "loss": 0.2182, + "step": 5311 + }, + { + "epoch": 0.9091220263563238, + "grad_norm": 15.342190742492676, + "learning_rate": 2.9990226211437717e-05, + "loss": 1.4192, + "step": 5312 + }, + { + "epoch": 0.9092931713161048, + "grad_norm": 19.77103614807129, + "learning_rate": 2.9989805928626736e-05, + "loss": 1.673, + "step": 5313 + }, + { + "epoch": 0.9094643162758856, + "grad_norm": 22.868837356567383, + "learning_rate": 2.9989376801807933e-05, + "loss": 2.5452, + "step": 5314 + }, + { + "epoch": 0.9096354612356666, + "grad_norm": 106.19268798828125, + "learning_rate": 2.998893883123449e-05, + "loss": 8.6557, + "step": 5315 + }, + { + "epoch": 0.9098066061954475, + "grad_norm": 17.946029663085938, + "learning_rate": 2.9988492017164812e-05, + "loss": 1.928, + "step": 5316 + }, + { + "epoch": 0.9099777511552285, + "grad_norm": 13.810042381286621, + "learning_rate": 2.9988036359862517e-05, + "loss": 1.3295, + "step": 5317 + }, + { + "epoch": 0.9101488961150094, + "grad_norm": 13.559564590454102, + "learning_rate": 2.9987571859596446e-05, + "loss": 0.9588, + "step": 5318 + }, + { + "epoch": 0.9103200410747904, + "grad_norm": 1.3217213153839111, + "learning_rate": 2.9987098516640656e-05, + "loss": 0.193, + "step": 5319 + }, + { + "epoch": 0.9104911860345712, + "grad_norm": 17.758501052856445, + "learning_rate": 
2.9986616331274415e-05, + "loss": 1.5314, + "step": 5320 + }, + { + "epoch": 0.9106623309943522, + "grad_norm": 9.879485130310059, + "learning_rate": 2.998612530378222e-05, + "loss": 0.6681, + "step": 5321 + }, + { + "epoch": 0.9108334759541331, + "grad_norm": 8.769416809082031, + "learning_rate": 2.9985625434453774e-05, + "loss": 0.6281, + "step": 5322 + }, + { + "epoch": 0.9110046209139141, + "grad_norm": 20.583568572998047, + "learning_rate": 2.9985116723584e-05, + "loss": 2.0724, + "step": 5323 + }, + { + "epoch": 0.911175765873695, + "grad_norm": 1.0460354089736938, + "learning_rate": 2.998459917147304e-05, + "loss": 0.1869, + "step": 5324 + }, + { + "epoch": 0.911346910833476, + "grad_norm": 26.880903244018555, + "learning_rate": 2.9984072778426246e-05, + "loss": 2.9832, + "step": 5325 + }, + { + "epoch": 0.9115180557932568, + "grad_norm": 5.727287292480469, + "learning_rate": 2.99835375447542e-05, + "loss": 0.5114, + "step": 5326 + }, + { + "epoch": 0.9116892007530378, + "grad_norm": 76.71002960205078, + "learning_rate": 2.9982993470772684e-05, + "loss": 7.6678, + "step": 5327 + }, + { + "epoch": 0.9118603457128187, + "grad_norm": 15.319390296936035, + "learning_rate": 2.99824405568027e-05, + "loss": 1.161, + "step": 5328 + }, + { + "epoch": 0.9120314906725997, + "grad_norm": 16.707138061523438, + "learning_rate": 2.9981878803170476e-05, + "loss": 1.7732, + "step": 5329 + }, + { + "epoch": 0.9122026356323806, + "grad_norm": 21.353525161743164, + "learning_rate": 2.9981308210207444e-05, + "loss": 1.9459, + "step": 5330 + }, + { + "epoch": 0.9123737805921616, + "grad_norm": 26.289155960083008, + "learning_rate": 2.998072877825025e-05, + "loss": 2.7606, + "step": 5331 + }, + { + "epoch": 0.9125449255519424, + "grad_norm": 1.3349337577819824, + "learning_rate": 2.9980140507640764e-05, + "loss": 0.1861, + "step": 5332 + }, + { + "epoch": 0.9127160705117234, + "grad_norm": 15.343520164489746, + "learning_rate": 2.9979543398726073e-05, + "loss": 1.3217, + "step": 5333 + }, + { + "epoch": 0.9128872154715043, + "grad_norm": 160.10142517089844, + "learning_rate": 2.9978937451858457e-05, + "loss": 9.5253, + "step": 5334 + }, + { + "epoch": 0.9130583604312853, + "grad_norm": 19.802030563354492, + "learning_rate": 2.997832266739544e-05, + "loss": 1.7929, + "step": 5335 + }, + { + "epoch": 0.9132295053910663, + "grad_norm": 0.8016830086708069, + "learning_rate": 2.9977699045699735e-05, + "loss": 0.1808, + "step": 5336 + }, + { + "epoch": 0.9134006503508472, + "grad_norm": 12.472278594970703, + "learning_rate": 2.9977066587139287e-05, + "loss": 0.8948, + "step": 5337 + }, + { + "epoch": 0.9135717953106282, + "grad_norm": 16.328351974487305, + "learning_rate": 2.9976425292087245e-05, + "loss": 1.5154, + "step": 5338 + }, + { + "epoch": 0.913742940270409, + "grad_norm": 15.434953689575195, + "learning_rate": 2.9975775160921972e-05, + "loss": 1.3687, + "step": 5339 + }, + { + "epoch": 0.91391408523019, + "grad_norm": 18.20110321044922, + "learning_rate": 2.9975116194027046e-05, + "loss": 2.0817, + "step": 5340 + }, + { + "epoch": 0.9140852301899709, + "grad_norm": 10.015130043029785, + "learning_rate": 2.9974448391791268e-05, + "loss": 0.6482, + "step": 5341 + }, + { + "epoch": 0.9142563751497519, + "grad_norm": 11.071905136108398, + "learning_rate": 2.997377175460863e-05, + "loss": 0.5993, + "step": 5342 + }, + { + "epoch": 0.9144275201095328, + "grad_norm": 15.070958137512207, + "learning_rate": 2.9973086282878353e-05, + "loss": 1.5188, + "step": 5343 + }, + { + "epoch": 0.9145986650693138, + 
"grad_norm": 6.0472917556762695, + "learning_rate": 2.9972391977004867e-05, + "loss": 0.4436, + "step": 5344 + }, + { + "epoch": 0.9147698100290946, + "grad_norm": 21.765090942382812, + "learning_rate": 2.9971688837397816e-05, + "loss": 1.858, + "step": 5345 + }, + { + "epoch": 0.9149409549888756, + "grad_norm": 23.28327178955078, + "learning_rate": 2.997097686447205e-05, + "loss": 2.1091, + "step": 5346 + }, + { + "epoch": 0.9151120999486565, + "grad_norm": 11.79105281829834, + "learning_rate": 2.9970256058647636e-05, + "loss": 0.8249, + "step": 5347 + }, + { + "epoch": 0.9152832449084375, + "grad_norm": 41.14191436767578, + "learning_rate": 2.996952642034985e-05, + "loss": 6.36, + "step": 5348 + }, + { + "epoch": 0.9154543898682184, + "grad_norm": 20.3240966796875, + "learning_rate": 2.996878795000918e-05, + "loss": 1.5731, + "step": 5349 + }, + { + "epoch": 0.9156255348279994, + "grad_norm": 20.114635467529297, + "learning_rate": 2.9968040648061324e-05, + "loss": 1.6892, + "step": 5350 + }, + { + "epoch": 0.9157966797877802, + "grad_norm": 26.455171585083008, + "learning_rate": 2.9967284514947192e-05, + "loss": 2.6576, + "step": 5351 + }, + { + "epoch": 0.9159678247475612, + "grad_norm": 20.05623435974121, + "learning_rate": 2.9966519551112904e-05, + "loss": 1.7239, + "step": 5352 + }, + { + "epoch": 0.9161389697073421, + "grad_norm": 16.712665557861328, + "learning_rate": 2.9965745757009784e-05, + "loss": 1.3935, + "step": 5353 + }, + { + "epoch": 0.9163101146671231, + "grad_norm": 24.99353790283203, + "learning_rate": 2.996496313309438e-05, + "loss": 2.6955, + "step": 5354 + }, + { + "epoch": 0.916481259626904, + "grad_norm": 28.002347946166992, + "learning_rate": 2.9964171679828438e-05, + "loss": 2.7177, + "step": 5355 + }, + { + "epoch": 0.916652404586685, + "grad_norm": 26.17693328857422, + "learning_rate": 2.9963371397678915e-05, + "loss": 2.1296, + "step": 5356 + }, + { + "epoch": 0.9168235495464658, + "grad_norm": 9.675960540771484, + "learning_rate": 2.9962562287117978e-05, + "loss": 0.6918, + "step": 5357 + }, + { + "epoch": 0.9169946945062468, + "grad_norm": 7.958675861358643, + "learning_rate": 2.996174434862301e-05, + "loss": 0.5486, + "step": 5358 + }, + { + "epoch": 0.9171658394660277, + "grad_norm": 26.32320785522461, + "learning_rate": 2.9960917582676586e-05, + "loss": 3.7321, + "step": 5359 + }, + { + "epoch": 0.9173369844258087, + "grad_norm": 24.980010986328125, + "learning_rate": 2.996008198976651e-05, + "loss": 2.9858, + "step": 5360 + }, + { + "epoch": 0.9175081293855896, + "grad_norm": 21.72593879699707, + "learning_rate": 2.9959237570385777e-05, + "loss": 1.9385, + "step": 5361 + }, + { + "epoch": 0.9176792743453706, + "grad_norm": 4.216024398803711, + "learning_rate": 2.995838432503259e-05, + "loss": 0.4022, + "step": 5362 + }, + { + "epoch": 0.9178504193051514, + "grad_norm": 7.37491512298584, + "learning_rate": 2.9957522254210375e-05, + "loss": 0.5214, + "step": 5363 + }, + { + "epoch": 0.9180215642649324, + "grad_norm": 18.15989875793457, + "learning_rate": 2.9956651358427758e-05, + "loss": 1.3646, + "step": 5364 + }, + { + "epoch": 0.9181927092247133, + "grad_norm": 1.2965679168701172, + "learning_rate": 2.9955771638198556e-05, + "loss": 0.2127, + "step": 5365 + }, + { + "epoch": 0.9183638541844943, + "grad_norm": 17.309446334838867, + "learning_rate": 2.9954883094041813e-05, + "loss": 1.3875, + "step": 5366 + }, + { + "epoch": 0.9185349991442752, + "grad_norm": 31.739850997924805, + "learning_rate": 2.9953985726481765e-05, + "loss": 5.3984, + "step": 5367 
+ }, + { + "epoch": 0.9187061441040562, + "grad_norm": 1.0152965784072876, + "learning_rate": 2.995307953604787e-05, + "loss": 0.2038, + "step": 5368 + }, + { + "epoch": 0.918877289063837, + "grad_norm": 26.772960662841797, + "learning_rate": 2.9952164523274775e-05, + "loss": 1.5391, + "step": 5369 + }, + { + "epoch": 0.919048434023618, + "grad_norm": 22.822797775268555, + "learning_rate": 2.9951240688702342e-05, + "loss": 2.2907, + "step": 5370 + }, + { + "epoch": 0.9192195789833989, + "grad_norm": 25.479511260986328, + "learning_rate": 2.9950308032875634e-05, + "loss": 3.1916, + "step": 5371 + }, + { + "epoch": 0.9193907239431799, + "grad_norm": 17.20477294921875, + "learning_rate": 2.9949366556344916e-05, + "loss": 1.8385, + "step": 5372 + }, + { + "epoch": 0.9195618689029608, + "grad_norm": 71.76097869873047, + "learning_rate": 2.9948416259665665e-05, + "loss": 7.5777, + "step": 5373 + }, + { + "epoch": 0.9197330138627418, + "grad_norm": 24.833290100097656, + "learning_rate": 2.9947457143398554e-05, + "loss": 2.549, + "step": 5374 + }, + { + "epoch": 0.9199041588225226, + "grad_norm": 7.05638313293457, + "learning_rate": 2.9946489208109468e-05, + "loss": 0.6932, + "step": 5375 + }, + { + "epoch": 0.9200753037823036, + "grad_norm": 20.02423858642578, + "learning_rate": 2.9945512454369485e-05, + "loss": 2.2629, + "step": 5376 + }, + { + "epoch": 0.9202464487420845, + "grad_norm": 5.115697860717773, + "learning_rate": 2.9944526882754894e-05, + "loss": 0.4451, + "step": 5377 + }, + { + "epoch": 0.9204175937018655, + "grad_norm": 21.760183334350586, + "learning_rate": 2.994353249384718e-05, + "loss": 1.7632, + "step": 5378 + }, + { + "epoch": 0.9205887386616464, + "grad_norm": 19.334026336669922, + "learning_rate": 2.994252928823304e-05, + "loss": 1.6726, + "step": 5379 + }, + { + "epoch": 0.9207598836214274, + "grad_norm": 8.374558448791504, + "learning_rate": 2.9941517266504363e-05, + "loss": 0.7515, + "step": 5380 + }, + { + "epoch": 0.9209310285812082, + "grad_norm": 17.895437240600586, + "learning_rate": 2.994049642925824e-05, + "loss": 1.7334, + "step": 5381 + }, + { + "epoch": 0.9211021735409892, + "grad_norm": 22.024200439453125, + "learning_rate": 2.9939466777096975e-05, + "loss": 2.2925, + "step": 5382 + }, + { + "epoch": 0.9212733185007701, + "grad_norm": 13.27621841430664, + "learning_rate": 2.9938428310628057e-05, + "loss": 1.1169, + "step": 5383 + }, + { + "epoch": 0.9214444634605511, + "grad_norm": 16.63943099975586, + "learning_rate": 2.993738103046419e-05, + "loss": 1.4458, + "step": 5384 + }, + { + "epoch": 0.921615608420332, + "grad_norm": 14.45860767364502, + "learning_rate": 2.9936324937223263e-05, + "loss": 1.2044, + "step": 5385 + }, + { + "epoch": 0.921786753380113, + "grad_norm": 23.660669326782227, + "learning_rate": 2.9935260031528377e-05, + "loss": 2.323, + "step": 5386 + }, + { + "epoch": 0.9219578983398939, + "grad_norm": 5.464766025543213, + "learning_rate": 2.993418631400783e-05, + "loss": 0.5573, + "step": 5387 + }, + { + "epoch": 0.9221290432996748, + "grad_norm": 4.731202125549316, + "learning_rate": 2.993310378529511e-05, + "loss": 0.4096, + "step": 5388 + }, + { + "epoch": 0.9223001882594558, + "grad_norm": 0.8416853547096252, + "learning_rate": 2.9932012446028916e-05, + "loss": 0.1837, + "step": 5389 + }, + { + "epoch": 0.9224713332192367, + "grad_norm": 84.00711822509766, + "learning_rate": 2.9930912296853136e-05, + "loss": 9.7173, + "step": 5390 + }, + { + "epoch": 0.9226424781790177, + "grad_norm": 19.461177825927734, + "learning_rate": 
2.9929803338416863e-05, + "loss": 1.7145, + "step": 5391 + }, + { + "epoch": 0.9228136231387986, + "grad_norm": 18.703411102294922, + "learning_rate": 2.992868557137438e-05, + "loss": 2.2235, + "step": 5392 + }, + { + "epoch": 0.9229847680985795, + "grad_norm": 29.23065185546875, + "learning_rate": 2.9927558996385178e-05, + "loss": 5.9163, + "step": 5393 + }, + { + "epoch": 0.9231559130583604, + "grad_norm": 3.1618072986602783, + "learning_rate": 2.9926423614113935e-05, + "loss": 0.3965, + "step": 5394 + }, + { + "epoch": 0.9233270580181414, + "grad_norm": 22.468812942504883, + "learning_rate": 2.9925279425230525e-05, + "loss": 2.0604, + "step": 5395 + }, + { + "epoch": 0.9234982029779223, + "grad_norm": 18.728565216064453, + "learning_rate": 2.9924126430410022e-05, + "loss": 1.6515, + "step": 5396 + }, + { + "epoch": 0.9236693479377033, + "grad_norm": 24.218313217163086, + "learning_rate": 2.99229646303327e-05, + "loss": 3.4235, + "step": 5397 + }, + { + "epoch": 0.9238404928974842, + "grad_norm": 22.003061294555664, + "learning_rate": 2.992179402568402e-05, + "loss": 2.5075, + "step": 5398 + }, + { + "epoch": 0.9240116378572651, + "grad_norm": 22.205307006835938, + "learning_rate": 2.9920614617154634e-05, + "loss": 2.9202, + "step": 5399 + }, + { + "epoch": 0.924182782817046, + "grad_norm": 8.464454650878906, + "learning_rate": 2.9919426405440406e-05, + "loss": 0.5435, + "step": 5400 + }, + { + "epoch": 0.924353927776827, + "grad_norm": 21.94829559326172, + "learning_rate": 2.991822939124237e-05, + "loss": 2.3394, + "step": 5401 + }, + { + "epoch": 0.9245250727366079, + "grad_norm": 17.917320251464844, + "learning_rate": 2.991702357526678e-05, + "loss": 1.4916, + "step": 5402 + }, + { + "epoch": 0.9246962176963889, + "grad_norm": 7.415956497192383, + "learning_rate": 2.9915808958225057e-05, + "loss": 0.5225, + "step": 5403 + }, + { + "epoch": 0.9248673626561698, + "grad_norm": 25.18984031677246, + "learning_rate": 2.9914585540833836e-05, + "loss": 1.8023, + "step": 5404 + }, + { + "epoch": 0.9250385076159507, + "grad_norm": 17.561384201049805, + "learning_rate": 2.9913353323814928e-05, + "loss": 1.46, + "step": 5405 + }, + { + "epoch": 0.9252096525757316, + "grad_norm": 4.7073588371276855, + "learning_rate": 2.9912112307895352e-05, + "loss": 0.3232, + "step": 5406 + }, + { + "epoch": 0.9253807975355126, + "grad_norm": 19.66071319580078, + "learning_rate": 2.9910862493807297e-05, + "loss": 2.3367, + "step": 5407 + }, + { + "epoch": 0.9255519424952935, + "grad_norm": 6.795912742614746, + "learning_rate": 2.9909603882288167e-05, + "loss": 0.5667, + "step": 5408 + }, + { + "epoch": 0.9257230874550745, + "grad_norm": 21.672094345092773, + "learning_rate": 2.9908336474080534e-05, + "loss": 1.8804, + "step": 5409 + }, + { + "epoch": 0.9258942324148554, + "grad_norm": 14.29305648803711, + "learning_rate": 2.9907060269932176e-05, + "loss": 1.4349, + "step": 5410 + }, + { + "epoch": 0.9260653773746363, + "grad_norm": 5.773280143737793, + "learning_rate": 2.9905775270596058e-05, + "loss": 0.36, + "step": 5411 + }, + { + "epoch": 0.9262365223344172, + "grad_norm": 8.370780944824219, + "learning_rate": 2.990448147683033e-05, + "loss": 0.5075, + "step": 5412 + }, + { + "epoch": 0.9264076672941982, + "grad_norm": 18.59760093688965, + "learning_rate": 2.9903178889398325e-05, + "loss": 1.7468, + "step": 5413 + }, + { + "epoch": 0.9265788122539791, + "grad_norm": 10.879481315612793, + "learning_rate": 2.9901867509068582e-05, + "loss": 0.5504, + "step": 5414 + }, + { + "epoch": 0.9267499572137601, + 
"grad_norm": 95.23054504394531, + "learning_rate": 2.9900547336614815e-05, + "loss": 2.7427, + "step": 5415 + }, + { + "epoch": 0.926921102173541, + "grad_norm": 24.241077423095703, + "learning_rate": 2.9899218372815923e-05, + "loss": 2.9172, + "step": 5416 + }, + { + "epoch": 0.927092247133322, + "grad_norm": 19.38135528564453, + "learning_rate": 2.9897880618456e-05, + "loss": 1.7318, + "step": 5417 + }, + { + "epoch": 0.9272633920931028, + "grad_norm": 19.093875885009766, + "learning_rate": 2.989653407432432e-05, + "loss": 1.5689, + "step": 5418 + }, + { + "epoch": 0.9274345370528838, + "grad_norm": 15.81525707244873, + "learning_rate": 2.989517874121535e-05, + "loss": 1.4285, + "step": 5419 + }, + { + "epoch": 0.9276056820126647, + "grad_norm": 6.848993301391602, + "learning_rate": 2.9893814619928737e-05, + "loss": 0.5049, + "step": 5420 + }, + { + "epoch": 0.9277768269724457, + "grad_norm": 45.37533187866211, + "learning_rate": 2.9892441711269315e-05, + "loss": 2.325, + "step": 5421 + }, + { + "epoch": 0.9279479719322266, + "grad_norm": 15.195178031921387, + "learning_rate": 2.9891060016047097e-05, + "loss": 1.2322, + "step": 5422 + }, + { + "epoch": 0.9281191168920075, + "grad_norm": 9.455146789550781, + "learning_rate": 2.9889669535077297e-05, + "loss": 1.2928, + "step": 5423 + }, + { + "epoch": 0.9282902618517884, + "grad_norm": 25.632465362548828, + "learning_rate": 2.988827026918029e-05, + "loss": 2.4087, + "step": 5424 + }, + { + "epoch": 0.9284614068115694, + "grad_norm": 16.109840393066406, + "learning_rate": 2.988686221918165e-05, + "loss": 1.2142, + "step": 5425 + }, + { + "epoch": 0.9286325517713503, + "grad_norm": 20.72352409362793, + "learning_rate": 2.9885445385912124e-05, + "loss": 1.6889, + "step": 5426 + }, + { + "epoch": 0.9288036967311313, + "grad_norm": 7.68108606338501, + "learning_rate": 2.9884019770207654e-05, + "loss": 0.7547, + "step": 5427 + }, + { + "epoch": 0.9289748416909122, + "grad_norm": 19.388324737548828, + "learning_rate": 2.9882585372909345e-05, + "loss": 1.7884, + "step": 5428 + }, + { + "epoch": 0.9291459866506931, + "grad_norm": 20.230783462524414, + "learning_rate": 2.9881142194863503e-05, + "loss": 2.2057, + "step": 5429 + }, + { + "epoch": 0.929317131610474, + "grad_norm": 1.923845648765564, + "learning_rate": 2.9879690236921604e-05, + "loss": 0.2006, + "step": 5430 + }, + { + "epoch": 0.929488276570255, + "grad_norm": 15.715072631835938, + "learning_rate": 2.98782294999403e-05, + "loss": 1.6502, + "step": 5431 + }, + { + "epoch": 0.9296594215300359, + "grad_norm": 135.404296875, + "learning_rate": 2.987675998478143e-05, + "loss": 9.1465, + "step": 5432 + }, + { + "epoch": 0.9298305664898169, + "grad_norm": 3.6420836448669434, + "learning_rate": 2.9875281692312005e-05, + "loss": 0.3347, + "step": 5433 + }, + { + "epoch": 0.9300017114495978, + "grad_norm": 23.983654022216797, + "learning_rate": 2.987379462340423e-05, + "loss": 2.8753, + "step": 5434 + }, + { + "epoch": 0.9301728564093787, + "grad_norm": 13.618528366088867, + "learning_rate": 2.9872298778935472e-05, + "loss": 1.1736, + "step": 5435 + }, + { + "epoch": 0.9303440013691596, + "grad_norm": 31.96976661682129, + "learning_rate": 2.9870794159788285e-05, + "loss": 3.0737, + "step": 5436 + }, + { + "epoch": 0.9305151463289406, + "grad_norm": 0.8422608971595764, + "learning_rate": 2.9869280766850397e-05, + "loss": 0.1952, + "step": 5437 + }, + { + "epoch": 0.9306862912887216, + "grad_norm": 12.685113906860352, + "learning_rate": 2.986775860101471e-05, + "loss": 0.9543, + "step": 5438 + }, 
+ { + "epoch": 0.9308574362485025, + "grad_norm": 17.57579231262207, + "learning_rate": 2.9866227663179295e-05, + "loss": 1.5615, + "step": 5439 + }, + { + "epoch": 0.9310285812082835, + "grad_norm": 166.62765502929688, + "learning_rate": 2.986468795424742e-05, + "loss": 8.5544, + "step": 5440 + }, + { + "epoch": 0.9311997261680643, + "grad_norm": 0.7508867383003235, + "learning_rate": 2.9863139475127515e-05, + "loss": 0.1882, + "step": 5441 + }, + { + "epoch": 0.9313708711278453, + "grad_norm": 16.12635612487793, + "learning_rate": 2.9861582226733176e-05, + "loss": 1.2267, + "step": 5442 + }, + { + "epoch": 0.9315420160876262, + "grad_norm": 19.12047576904297, + "learning_rate": 2.986001620998319e-05, + "loss": 2.0048, + "step": 5443 + }, + { + "epoch": 0.9317131610474072, + "grad_norm": 27.335081100463867, + "learning_rate": 2.985844142580151e-05, + "loss": 1.6885, + "step": 5444 + }, + { + "epoch": 0.9318843060071881, + "grad_norm": 7.98264217376709, + "learning_rate": 2.9856857875117254e-05, + "loss": 0.6538, + "step": 5445 + }, + { + "epoch": 0.9320554509669691, + "grad_norm": 29.969785690307617, + "learning_rate": 2.985526555886472e-05, + "loss": 5.6321, + "step": 5446 + }, + { + "epoch": 0.93222659592675, + "grad_norm": 12.692312240600586, + "learning_rate": 2.9853664477983386e-05, + "loss": 1.3322, + "step": 5447 + }, + { + "epoch": 0.9323977408865309, + "grad_norm": 0.7580317258834839, + "learning_rate": 2.9852054633417885e-05, + "loss": 0.1724, + "step": 5448 + }, + { + "epoch": 0.9325688858463118, + "grad_norm": 12.668038368225098, + "learning_rate": 2.9850436026118027e-05, + "loss": 1.2257, + "step": 5449 + }, + { + "epoch": 0.9327400308060928, + "grad_norm": 23.302888870239258, + "learning_rate": 2.9848808657038795e-05, + "loss": 1.729, + "step": 5450 + }, + { + "epoch": 0.9329111757658737, + "grad_norm": 0.7295454740524292, + "learning_rate": 2.9847172527140338e-05, + "loss": 0.1795, + "step": 5451 + }, + { + "epoch": 0.9330823207256547, + "grad_norm": 14.082499504089355, + "learning_rate": 2.9845527637387974e-05, + "loss": 2.0225, + "step": 5452 + }, + { + "epoch": 0.9332534656854355, + "grad_norm": 17.303613662719727, + "learning_rate": 2.9843873988752195e-05, + "loss": 1.432, + "step": 5453 + }, + { + "epoch": 0.9334246106452165, + "grad_norm": 14.602163314819336, + "learning_rate": 2.9842211582208652e-05, + "loss": 1.3601, + "step": 5454 + }, + { + "epoch": 0.9335957556049974, + "grad_norm": 17.85161018371582, + "learning_rate": 2.984054041873817e-05, + "loss": 1.72, + "step": 5455 + }, + { + "epoch": 0.9337669005647784, + "grad_norm": 19.920385360717773, + "learning_rate": 2.983886049932674e-05, + "loss": 2.7301, + "step": 5456 + }, + { + "epoch": 0.9339380455245593, + "grad_norm": 24.145822525024414, + "learning_rate": 2.9837171824965506e-05, + "loss": 3.4391, + "step": 5457 + }, + { + "epoch": 0.9341091904843403, + "grad_norm": 8.437520027160645, + "learning_rate": 2.9835474396650802e-05, + "loss": 0.5739, + "step": 5458 + }, + { + "epoch": 0.9342803354441211, + "grad_norm": 25.245956420898438, + "learning_rate": 2.98337682153841e-05, + "loss": 2.4175, + "step": 5459 + }, + { + "epoch": 0.9344514804039021, + "grad_norm": 0.8599206805229187, + "learning_rate": 2.9832053282172065e-05, + "loss": 0.1786, + "step": 5460 + }, + { + "epoch": 0.934622625363683, + "grad_norm": 19.534420013427734, + "learning_rate": 2.9830329598026498e-05, + "loss": 1.4205, + "step": 5461 + }, + { + "epoch": 0.934793770323464, + "grad_norm": 22.197025299072266, + "learning_rate": 
2.982859716396438e-05, + "loss": 2.7027, + "step": 5462 + }, + { + "epoch": 0.9349649152832449, + "grad_norm": 24.305126190185547, + "learning_rate": 2.9826855981007845e-05, + "loss": 2.2722, + "step": 5463 + }, + { + "epoch": 0.9351360602430259, + "grad_norm": 25.295196533203125, + "learning_rate": 2.98251060501842e-05, + "loss": 3.0028, + "step": 5464 + }, + { + "epoch": 0.9353072052028067, + "grad_norm": 9.921125411987305, + "learning_rate": 2.9823347372525905e-05, + "loss": 0.9076, + "step": 5465 + }, + { + "epoch": 0.9354783501625877, + "grad_norm": 92.25393676757812, + "learning_rate": 2.9821579949070577e-05, + "loss": 7.8437, + "step": 5466 + }, + { + "epoch": 0.9356494951223686, + "grad_norm": 22.936864852905273, + "learning_rate": 2.9819803780861006e-05, + "loss": 2.5465, + "step": 5467 + }, + { + "epoch": 0.9358206400821496, + "grad_norm": 26.565526962280273, + "learning_rate": 2.9818018868945135e-05, + "loss": 2.7099, + "step": 5468 + }, + { + "epoch": 0.9359917850419305, + "grad_norm": 21.221967697143555, + "learning_rate": 2.9816225214376052e-05, + "loss": 1.9547, + "step": 5469 + }, + { + "epoch": 0.9361629300017115, + "grad_norm": 15.053738594055176, + "learning_rate": 2.9814422818212032e-05, + "loss": 1.1589, + "step": 5470 + }, + { + "epoch": 0.9363340749614923, + "grad_norm": 18.163192749023438, + "learning_rate": 2.981261168151648e-05, + "loss": 1.5918, + "step": 5471 + }, + { + "epoch": 0.9365052199212733, + "grad_norm": 18.153732299804688, + "learning_rate": 2.9810791805357972e-05, + "loss": 1.503, + "step": 5472 + }, + { + "epoch": 0.9366763648810542, + "grad_norm": 0.5720072984695435, + "learning_rate": 2.980896319081024e-05, + "loss": 0.1676, + "step": 5473 + }, + { + "epoch": 0.9368475098408352, + "grad_norm": 36.6143798828125, + "learning_rate": 2.9807125838952168e-05, + "loss": 5.8291, + "step": 5474 + }, + { + "epoch": 0.9370186548006161, + "grad_norm": 134.6314697265625, + "learning_rate": 2.9805279750867796e-05, + "loss": 8.5043, + "step": 5475 + }, + { + "epoch": 0.9371897997603971, + "grad_norm": 17.09921646118164, + "learning_rate": 2.980342492764632e-05, + "loss": 1.6199, + "step": 5476 + }, + { + "epoch": 0.937360944720178, + "grad_norm": 16.151418685913086, + "learning_rate": 2.980156137038209e-05, + "loss": 1.4363, + "step": 5477 + }, + { + "epoch": 0.9375320896799589, + "grad_norm": 29.319974899291992, + "learning_rate": 2.97996890801746e-05, + "loss": 4.7161, + "step": 5478 + }, + { + "epoch": 0.9377032346397398, + "grad_norm": 21.868122100830078, + "learning_rate": 2.9797808058128513e-05, + "loss": 2.1905, + "step": 5479 + }, + { + "epoch": 0.9378743795995208, + "grad_norm": 19.497089385986328, + "learning_rate": 2.979591830535363e-05, + "loss": 2.518, + "step": 5480 + }, + { + "epoch": 0.9380455245593017, + "grad_norm": 9.813490867614746, + "learning_rate": 2.9794019822964908e-05, + "loss": 0.8437, + "step": 5481 + }, + { + "epoch": 0.9382166695190827, + "grad_norm": 15.766708374023438, + "learning_rate": 2.9792112612082455e-05, + "loss": 1.1947, + "step": 5482 + }, + { + "epoch": 0.9383878144788635, + "grad_norm": 21.335594177246094, + "learning_rate": 2.9790196673831532e-05, + "loss": 1.9788, + "step": 5483 + }, + { + "epoch": 0.9385589594386445, + "grad_norm": 20.775419235229492, + "learning_rate": 2.9788272009342537e-05, + "loss": 1.4183, + "step": 5484 + }, + { + "epoch": 0.9387301043984254, + "grad_norm": 18.061182022094727, + "learning_rate": 2.9786338619751033e-05, + "loss": 1.7959, + "step": 5485 + }, + { + "epoch": 0.9389012493582064, + 
"grad_norm": 18.276731491088867, + "learning_rate": 2.978439650619772e-05, + "loss": 1.7148, + "step": 5486 + }, + { + "epoch": 0.9390723943179873, + "grad_norm": 1.8720197677612305, + "learning_rate": 2.9782445669828445e-05, + "loss": 0.2043, + "step": 5487 + }, + { + "epoch": 0.9392435392777683, + "grad_norm": 25.51718521118164, + "learning_rate": 2.978048611179421e-05, + "loss": 2.6434, + "step": 5488 + }, + { + "epoch": 0.9394146842375493, + "grad_norm": 15.08374309539795, + "learning_rate": 2.977851783325115e-05, + "loss": 1.2173, + "step": 5489 + }, + { + "epoch": 0.9395858291973301, + "grad_norm": 13.691933631896973, + "learning_rate": 2.977654083536056e-05, + "loss": 1.22, + "step": 5490 + }, + { + "epoch": 0.9397569741571111, + "grad_norm": 16.964563369750977, + "learning_rate": 2.9774555119288868e-05, + "loss": 1.5784, + "step": 5491 + }, + { + "epoch": 0.939928119116892, + "grad_norm": 18.708364486694336, + "learning_rate": 2.977256068620765e-05, + "loss": 1.9451, + "step": 5492 + }, + { + "epoch": 0.940099264076673, + "grad_norm": 81.27881622314453, + "learning_rate": 2.9770557537293624e-05, + "loss": 6.9958, + "step": 5493 + }, + { + "epoch": 0.9402704090364539, + "grad_norm": 0.9163722395896912, + "learning_rate": 2.976854567372865e-05, + "loss": 0.1888, + "step": 5494 + }, + { + "epoch": 0.9404415539962349, + "grad_norm": 9.509047508239746, + "learning_rate": 2.976652509669973e-05, + "loss": 0.5597, + "step": 5495 + }, + { + "epoch": 0.9406126989560157, + "grad_norm": 6.115006923675537, + "learning_rate": 2.976449580739901e-05, + "loss": 0.5571, + "step": 5496 + }, + { + "epoch": 0.9407838439157967, + "grad_norm": 54.508541107177734, + "learning_rate": 2.976245780702377e-05, + "loss": 6.6135, + "step": 5497 + }, + { + "epoch": 0.9409549888755776, + "grad_norm": 5.758550643920898, + "learning_rate": 2.9760411096776442e-05, + "loss": 0.7069, + "step": 5498 + }, + { + "epoch": 0.9411261338353586, + "grad_norm": 16.58424949645996, + "learning_rate": 2.9758355677864574e-05, + "loss": 1.4041, + "step": 5499 + }, + { + "epoch": 0.9412972787951395, + "grad_norm": 13.171340942382812, + "learning_rate": 2.975629155150088e-05, + "loss": 0.9814, + "step": 5500 + }, + { + "epoch": 0.9414684237549205, + "grad_norm": 44.11324691772461, + "learning_rate": 2.975421871890319e-05, + "loss": 1.8447, + "step": 5501 + }, + { + "epoch": 0.9416395687147013, + "grad_norm": 13.571734428405762, + "learning_rate": 2.9752137181294477e-05, + "loss": 1.5262, + "step": 5502 + }, + { + "epoch": 0.9418107136744823, + "grad_norm": 23.19210433959961, + "learning_rate": 2.975004693990286e-05, + "loss": 2.9885, + "step": 5503 + }, + { + "epoch": 0.9419818586342632, + "grad_norm": 19.186378479003906, + "learning_rate": 2.9747947995961572e-05, + "loss": 1.8989, + "step": 5504 + }, + { + "epoch": 0.9421530035940442, + "grad_norm": 20.15719223022461, + "learning_rate": 2.974584035070901e-05, + "loss": 1.784, + "step": 5505 + }, + { + "epoch": 0.9423241485538251, + "grad_norm": 22.521728515625, + "learning_rate": 2.974372400538867e-05, + "loss": 1.5145, + "step": 5506 + }, + { + "epoch": 0.9424952935136061, + "grad_norm": 23.285200119018555, + "learning_rate": 2.974159896124921e-05, + "loss": 2.343, + "step": 5507 + }, + { + "epoch": 0.9426664384733869, + "grad_norm": 17.868976593017578, + "learning_rate": 2.97394652195444e-05, + "loss": 1.7375, + "step": 5508 + }, + { + "epoch": 0.9428375834331679, + "grad_norm": 5.771058559417725, + "learning_rate": 2.973732278153316e-05, + "loss": 0.3672, + "step": 5509 + }, + { + 
"epoch": 0.9430087283929488, + "grad_norm": 20.794448852539062, + "learning_rate": 2.9735171648479534e-05, + "loss": 2.0266, + "step": 5510 + }, + { + "epoch": 0.9431798733527298, + "grad_norm": 23.893619537353516, + "learning_rate": 2.973301182165268e-05, + "loss": 2.8642, + "step": 5511 + }, + { + "epoch": 0.9433510183125107, + "grad_norm": 26.520517349243164, + "learning_rate": 2.973084330232691e-05, + "loss": 2.3619, + "step": 5512 + }, + { + "epoch": 0.9435221632722917, + "grad_norm": 0.8600165843963623, + "learning_rate": 2.972866609178165e-05, + "loss": 0.1799, + "step": 5513 + }, + { + "epoch": 0.9436933082320725, + "grad_norm": 22.70649528503418, + "learning_rate": 2.972648019130146e-05, + "loss": 2.497, + "step": 5514 + }, + { + "epoch": 0.9438644531918535, + "grad_norm": 18.863407135009766, + "learning_rate": 2.972428560217602e-05, + "loss": 2.2419, + "step": 5515 + }, + { + "epoch": 0.9440355981516344, + "grad_norm": 21.091896057128906, + "learning_rate": 2.9722082325700145e-05, + "loss": 1.6153, + "step": 5516 + }, + { + "epoch": 0.9442067431114154, + "grad_norm": 16.721572875976562, + "learning_rate": 2.971987036317377e-05, + "loss": 1.4408, + "step": 5517 + }, + { + "epoch": 0.9443778880711963, + "grad_norm": 19.266984939575195, + "learning_rate": 2.9717649715901956e-05, + "loss": 1.6994, + "step": 5518 + }, + { + "epoch": 0.9445490330309773, + "grad_norm": 105.64752197265625, + "learning_rate": 2.971542038519489e-05, + "loss": 7.7849, + "step": 5519 + }, + { + "epoch": 0.9447201779907581, + "grad_norm": 17.627609252929688, + "learning_rate": 2.9713182372367874e-05, + "loss": 1.7657, + "step": 5520 + }, + { + "epoch": 0.9448913229505391, + "grad_norm": 15.488409042358398, + "learning_rate": 2.9710935678741347e-05, + "loss": 1.5876, + "step": 5521 + }, + { + "epoch": 0.94506246791032, + "grad_norm": 0.9112039804458618, + "learning_rate": 2.9708680305640856e-05, + "loss": 0.1914, + "step": 5522 + }, + { + "epoch": 0.945233612870101, + "grad_norm": 8.6788911819458, + "learning_rate": 2.9706416254397077e-05, + "loss": 0.6403, + "step": 5523 + }, + { + "epoch": 0.9454047578298819, + "grad_norm": 19.066268920898438, + "learning_rate": 2.970414352634581e-05, + "loss": 1.6665, + "step": 5524 + }, + { + "epoch": 0.9455759027896629, + "grad_norm": 20.287647247314453, + "learning_rate": 2.9701862122827953e-05, + "loss": 1.3302, + "step": 5525 + }, + { + "epoch": 0.9457470477494437, + "grad_norm": 18.065208435058594, + "learning_rate": 2.969957204518955e-05, + "loss": 1.5168, + "step": 5526 + }, + { + "epoch": 0.9459181927092247, + "grad_norm": 20.927453994750977, + "learning_rate": 2.969727329478174e-05, + "loss": 1.8764, + "step": 5527 + }, + { + "epoch": 0.9460893376690056, + "grad_norm": 13.100236892700195, + "learning_rate": 2.96949658729608e-05, + "loss": 1.3787, + "step": 5528 + }, + { + "epoch": 0.9462604826287866, + "grad_norm": 3.962916851043701, + "learning_rate": 2.969264978108811e-05, + "loss": 0.3372, + "step": 5529 + }, + { + "epoch": 0.9464316275885675, + "grad_norm": 24.814523696899414, + "learning_rate": 2.969032502053016e-05, + "loss": 3.0329, + "step": 5530 + }, + { + "epoch": 0.9466027725483485, + "grad_norm": 26.95598030090332, + "learning_rate": 2.9687991592658568e-05, + "loss": 3.471, + "step": 5531 + }, + { + "epoch": 0.9467739175081293, + "grad_norm": 18.62015724182129, + "learning_rate": 2.9685649498850063e-05, + "loss": 1.7259, + "step": 5532 + }, + { + "epoch": 0.9469450624679103, + "grad_norm": 22.18822479248047, + "learning_rate": 2.9683298740486477e-05, + 
"loss": 2.9985, + "step": 5533 + }, + { + "epoch": 0.9471162074276912, + "grad_norm": 22.694929122924805, + "learning_rate": 2.968093931895476e-05, + "loss": 2.6509, + "step": 5534 + }, + { + "epoch": 0.9472873523874722, + "grad_norm": 19.534927368164062, + "learning_rate": 2.9678571235646983e-05, + "loss": 1.6884, + "step": 5535 + }, + { + "epoch": 0.9474584973472531, + "grad_norm": 20.47797966003418, + "learning_rate": 2.9676194491960313e-05, + "loss": 1.9016, + "step": 5536 + }, + { + "epoch": 0.9476296423070341, + "grad_norm": 18.410903930664062, + "learning_rate": 2.9673809089297037e-05, + "loss": 1.614, + "step": 5537 + }, + { + "epoch": 0.9478007872668149, + "grad_norm": 1.0874691009521484, + "learning_rate": 2.967141502906454e-05, + "loss": 0.1918, + "step": 5538 + }, + { + "epoch": 0.9479719322265959, + "grad_norm": 17.28070068359375, + "learning_rate": 2.9669012312675317e-05, + "loss": 1.6688, + "step": 5539 + }, + { + "epoch": 0.9481430771863769, + "grad_norm": 22.55012321472168, + "learning_rate": 2.966660094154699e-05, + "loss": 2.8969, + "step": 5540 + }, + { + "epoch": 0.9483142221461578, + "grad_norm": 7.753538608551025, + "learning_rate": 2.966418091710226e-05, + "loss": 0.5489, + "step": 5541 + }, + { + "epoch": 0.9484853671059388, + "grad_norm": 15.588809967041016, + "learning_rate": 2.966175224076894e-05, + "loss": 1.3865, + "step": 5542 + }, + { + "epoch": 0.9486565120657197, + "grad_norm": 13.793673515319824, + "learning_rate": 2.9659314913979966e-05, + "loss": 1.036, + "step": 5543 + }, + { + "epoch": 0.9488276570255006, + "grad_norm": 14.182084083557129, + "learning_rate": 2.9656868938173353e-05, + "loss": 1.2276, + "step": 5544 + }, + { + "epoch": 0.9489988019852815, + "grad_norm": 9.446614265441895, + "learning_rate": 2.965441431479224e-05, + "loss": 0.8011, + "step": 5545 + }, + { + "epoch": 0.9491699469450625, + "grad_norm": 39.7092170715332, + "learning_rate": 2.9651951045284857e-05, + "loss": 6.266, + "step": 5546 + }, + { + "epoch": 0.9493410919048434, + "grad_norm": 23.74007225036621, + "learning_rate": 2.9649479131104533e-05, + "loss": 2.252, + "step": 5547 + }, + { + "epoch": 0.9495122368646244, + "grad_norm": 18.47795867919922, + "learning_rate": 2.9646998573709693e-05, + "loss": 2.0481, + "step": 5548 + }, + { + "epoch": 0.9496833818244053, + "grad_norm": 0.6889795660972595, + "learning_rate": 2.9644509374563887e-05, + "loss": 0.1743, + "step": 5549 + }, + { + "epoch": 0.9498545267841862, + "grad_norm": 29.57404899597168, + "learning_rate": 2.9642011535135736e-05, + "loss": 1.8229, + "step": 5550 + }, + { + "epoch": 0.9500256717439671, + "grad_norm": 93.30299377441406, + "learning_rate": 2.963950505689897e-05, + "loss": 7.0144, + "step": 5551 + }, + { + "epoch": 0.9501968167037481, + "grad_norm": 17.450119018554688, + "learning_rate": 2.9636989941332415e-05, + "loss": 1.3122, + "step": 5552 + }, + { + "epoch": 0.950367961663529, + "grad_norm": 22.121047973632812, + "learning_rate": 2.9634466189919995e-05, + "loss": 2.3305, + "step": 5553 + }, + { + "epoch": 0.95053910662331, + "grad_norm": 13.311408042907715, + "learning_rate": 2.9631933804150726e-05, + "loss": 1.3286, + "step": 5554 + }, + { + "epoch": 0.9507102515830909, + "grad_norm": 30.132402420043945, + "learning_rate": 2.9629392785518714e-05, + "loss": 5.6789, + "step": 5555 + }, + { + "epoch": 0.9508813965428718, + "grad_norm": 15.2648286819458, + "learning_rate": 2.9626843135523174e-05, + "loss": 1.4216, + "step": 5556 + }, + { + "epoch": 0.9510525415026527, + "grad_norm": 12.007978439331055, + 
"learning_rate": 2.9624284855668394e-05, + "loss": 1.3364, + "step": 5557 + }, + { + "epoch": 0.9512236864624337, + "grad_norm": 11.762513160705566, + "learning_rate": 2.9621717947463768e-05, + "loss": 1.1949, + "step": 5558 + }, + { + "epoch": 0.9513948314222146, + "grad_norm": 22.043636322021484, + "learning_rate": 2.9619142412423775e-05, + "loss": 2.9871, + "step": 5559 + }, + { + "epoch": 0.9515659763819956, + "grad_norm": 18.573747634887695, + "learning_rate": 2.9616558252067985e-05, + "loss": 1.5664, + "step": 5560 + }, + { + "epoch": 0.9517371213417765, + "grad_norm": 16.317903518676758, + "learning_rate": 2.9613965467921053e-05, + "loss": 2.1047, + "step": 5561 + }, + { + "epoch": 0.9519082663015574, + "grad_norm": 67.44075012207031, + "learning_rate": 2.9611364061512733e-05, + "loss": 7.6283, + "step": 5562 + }, + { + "epoch": 0.9520794112613383, + "grad_norm": 22.03914451599121, + "learning_rate": 2.960875403437785e-05, + "loss": 2.7385, + "step": 5563 + }, + { + "epoch": 0.9522505562211193, + "grad_norm": 21.74106788635254, + "learning_rate": 2.960613538805633e-05, + "loss": 2.1558, + "step": 5564 + }, + { + "epoch": 0.9524217011809002, + "grad_norm": 25.852855682373047, + "learning_rate": 2.9603508124093173e-05, + "loss": 2.4488, + "step": 5565 + }, + { + "epoch": 0.9525928461406812, + "grad_norm": 8.967046737670898, + "learning_rate": 2.9600872244038473e-05, + "loss": 6.6295, + "step": 5566 + }, + { + "epoch": 0.9527639911004621, + "grad_norm": 0.8483248353004456, + "learning_rate": 2.95982277494474e-05, + "loss": 0.1717, + "step": 5567 + }, + { + "epoch": 0.952935136060243, + "grad_norm": 21.750349044799805, + "learning_rate": 2.9595574641880213e-05, + "loss": 2.1601, + "step": 5568 + }, + { + "epoch": 0.9531062810200239, + "grad_norm": 16.128721237182617, + "learning_rate": 2.9592912922902246e-05, + "loss": 1.3876, + "step": 5569 + }, + { + "epoch": 0.9532774259798049, + "grad_norm": 8.323986053466797, + "learning_rate": 2.959024259408392e-05, + "loss": 0.5623, + "step": 5570 + }, + { + "epoch": 0.9534485709395858, + "grad_norm": 17.731412887573242, + "learning_rate": 2.9587563657000733e-05, + "loss": 1.6673, + "step": 5571 + }, + { + "epoch": 0.9536197158993668, + "grad_norm": 19.352418899536133, + "learning_rate": 2.9584876113233258e-05, + "loss": 2.8274, + "step": 5572 + }, + { + "epoch": 0.9537908608591477, + "grad_norm": 22.872282028198242, + "learning_rate": 2.9582179964367155e-05, + "loss": 1.9904, + "step": 5573 + }, + { + "epoch": 0.9539620058189286, + "grad_norm": 26.09086036682129, + "learning_rate": 2.957947521199315e-05, + "loss": 2.6627, + "step": 5574 + }, + { + "epoch": 0.9541331507787095, + "grad_norm": 12.8101806640625, + "learning_rate": 2.957676185770706e-05, + "loss": 1.1689, + "step": 5575 + }, + { + "epoch": 0.9543042957384905, + "grad_norm": 8.432300567626953, + "learning_rate": 2.957403990310976e-05, + "loss": 0.5261, + "step": 5576 + }, + { + "epoch": 0.9544754406982714, + "grad_norm": 3.770911693572998, + "learning_rate": 2.957130934980721e-05, + "loss": 0.3705, + "step": 5577 + }, + { + "epoch": 0.9546465856580524, + "grad_norm": 14.25076675415039, + "learning_rate": 2.9568570199410436e-05, + "loss": 1.3667, + "step": 5578 + }, + { + "epoch": 0.9548177306178333, + "grad_norm": 13.956623077392578, + "learning_rate": 2.9565822453535553e-05, + "loss": 7.2221, + "step": 5579 + }, + { + "epoch": 0.9549888755776142, + "grad_norm": 26.3257999420166, + "learning_rate": 2.956306611380372e-05, + "loss": 2.8963, + "step": 5580 + }, + { + "epoch": 
0.9551600205373951, + "grad_norm": 19.502458572387695, + "learning_rate": 2.956030118184119e-05, + "loss": 1.7251, + "step": 5581 + }, + { + "epoch": 0.9553311654971761, + "grad_norm": 15.120080947875977, + "learning_rate": 2.955752765927928e-05, + "loss": 6.4349, + "step": 5582 + }, + { + "epoch": 0.955502310456957, + "grad_norm": 18.764127731323242, + "learning_rate": 2.9554745547754364e-05, + "loss": 1.5513, + "step": 5583 + }, + { + "epoch": 0.955673455416738, + "grad_norm": 14.517118453979492, + "learning_rate": 2.9551954848907897e-05, + "loss": 1.4727, + "step": 5584 + }, + { + "epoch": 0.9558446003765189, + "grad_norm": 4.777843952178955, + "learning_rate": 2.9549155564386396e-05, + "loss": 0.5504, + "step": 5585 + }, + { + "epoch": 0.9560157453362998, + "grad_norm": 16.334457397460938, + "learning_rate": 2.9546347695841443e-05, + "loss": 1.4043, + "step": 5586 + }, + { + "epoch": 0.9561868902960807, + "grad_norm": 24.128767013549805, + "learning_rate": 2.9543531244929677e-05, + "loss": 2.866, + "step": 5587 + }, + { + "epoch": 0.9563580352558617, + "grad_norm": 31.431903839111328, + "learning_rate": 2.954070621331282e-05, + "loss": 6.181, + "step": 5588 + }, + { + "epoch": 0.9565291802156427, + "grad_norm": 20.100135803222656, + "learning_rate": 2.9537872602657637e-05, + "loss": 2.1438, + "step": 5589 + }, + { + "epoch": 0.9567003251754236, + "grad_norm": 22.727962493896484, + "learning_rate": 2.953503041463597e-05, + "loss": 3.139, + "step": 5590 + }, + { + "epoch": 0.9568714701352046, + "grad_norm": 20.488447189331055, + "learning_rate": 2.9532179650924702e-05, + "loss": 2.2346, + "step": 5591 + }, + { + "epoch": 0.9570426150949854, + "grad_norm": 17.242015838623047, + "learning_rate": 2.9529320313205797e-05, + "loss": 1.763, + "step": 5592 + }, + { + "epoch": 0.9572137600547664, + "grad_norm": 110.57696533203125, + "learning_rate": 2.9526452403166268e-05, + "loss": 8.8231, + "step": 5593 + }, + { + "epoch": 0.9573849050145473, + "grad_norm": 1.8073965311050415, + "learning_rate": 2.952357592249818e-05, + "loss": 0.1905, + "step": 5594 + }, + { + "epoch": 0.9575560499743283, + "grad_norm": 30.926698684692383, + "learning_rate": 2.952069087289867e-05, + "loss": 5.726, + "step": 5595 + }, + { + "epoch": 0.9577271949341092, + "grad_norm": 49.94111251831055, + "learning_rate": 2.9517797256069917e-05, + "loss": 5.4946, + "step": 5596 + }, + { + "epoch": 0.9578983398938902, + "grad_norm": 25.600858688354492, + "learning_rate": 2.951489507371916e-05, + "loss": 2.066, + "step": 5597 + }, + { + "epoch": 0.958069484853671, + "grad_norm": 17.886932373046875, + "learning_rate": 2.9511984327558687e-05, + "loss": 1.5243, + "step": 5598 + }, + { + "epoch": 0.958240629813452, + "grad_norm": 1.2921315431594849, + "learning_rate": 2.950906501930585e-05, + "loss": 0.1839, + "step": 5599 + }, + { + "epoch": 0.9584117747732329, + "grad_norm": 15.199446678161621, + "learning_rate": 2.950613715068303e-05, + "loss": 1.2364, + "step": 5600 + }, + { + "epoch": 0.9585829197330139, + "grad_norm": 45.226661682128906, + "learning_rate": 2.9503200723417697e-05, + "loss": 5.7504, + "step": 5601 + }, + { + "epoch": 0.9587540646927948, + "grad_norm": 46.436668395996094, + "learning_rate": 2.9500255739242333e-05, + "loss": 5.2193, + "step": 5602 + }, + { + "epoch": 0.9589252096525758, + "grad_norm": 23.483407974243164, + "learning_rate": 2.9497302199894482e-05, + "loss": 5.5494, + "step": 5603 + }, + { + "epoch": 0.9590963546123566, + "grad_norm": 21.826322555541992, + "learning_rate": 2.949434010711674e-05, + 
"loss": 2.1491, + "step": 5604 + }, + { + "epoch": 0.9592674995721376, + "grad_norm": 39.89952850341797, + "learning_rate": 2.949136946265675e-05, + "loss": 5.3737, + "step": 5605 + }, + { + "epoch": 0.9594386445319185, + "grad_norm": 51.043880462646484, + "learning_rate": 2.9488390268267186e-05, + "loss": 6.9771, + "step": 5606 + }, + { + "epoch": 0.9596097894916995, + "grad_norm": 1.1591174602508545, + "learning_rate": 2.948540252570579e-05, + "loss": 0.1753, + "step": 5607 + }, + { + "epoch": 0.9597809344514804, + "grad_norm": 4.65718412399292, + "learning_rate": 2.9482406236735328e-05, + "loss": 0.353, + "step": 5608 + }, + { + "epoch": 0.9599520794112614, + "grad_norm": 18.54163932800293, + "learning_rate": 2.947940140312361e-05, + "loss": 2.3331, + "step": 5609 + }, + { + "epoch": 0.9601232243710422, + "grad_norm": 16.34014892578125, + "learning_rate": 2.9476388026643504e-05, + "loss": 1.4218, + "step": 5610 + }, + { + "epoch": 0.9602943693308232, + "grad_norm": 5.687966346740723, + "learning_rate": 2.9473366109072895e-05, + "loss": 0.5198, + "step": 5611 + }, + { + "epoch": 0.9604655142906041, + "grad_norm": 6.7711968421936035, + "learning_rate": 2.947033565219473e-05, + "loss": 0.461, + "step": 5612 + }, + { + "epoch": 0.9606366592503851, + "grad_norm": 8.50647258758545, + "learning_rate": 2.9467296657796975e-05, + "loss": 0.8049, + "step": 5613 + }, + { + "epoch": 0.960807804210166, + "grad_norm": 21.197162628173828, + "learning_rate": 2.946424912767264e-05, + "loss": 1.9265, + "step": 5614 + }, + { + "epoch": 0.960978949169947, + "grad_norm": 1.9009312391281128, + "learning_rate": 2.9461193063619777e-05, + "loss": 0.2764, + "step": 5615 + }, + { + "epoch": 0.9611500941297278, + "grad_norm": 20.549768447875977, + "learning_rate": 2.9458128467441473e-05, + "loss": 1.8308, + "step": 5616 + }, + { + "epoch": 0.9613212390895088, + "grad_norm": 17.79680061340332, + "learning_rate": 2.945505534094583e-05, + "loss": 1.3624, + "step": 5617 + }, + { + "epoch": 0.9614923840492897, + "grad_norm": 19.88323211669922, + "learning_rate": 2.945197368594601e-05, + "loss": 2.3432, + "step": 5618 + }, + { + "epoch": 0.9616635290090707, + "grad_norm": 48.602989196777344, + "learning_rate": 2.944888350426019e-05, + "loss": 1.3937, + "step": 5619 + }, + { + "epoch": 0.9618346739688516, + "grad_norm": 1.3009952306747437, + "learning_rate": 2.944578479771158e-05, + "loss": 0.194, + "step": 5620 + }, + { + "epoch": 0.9620058189286326, + "grad_norm": 29.68500328063965, + "learning_rate": 2.9442677568128422e-05, + "loss": 1.4026, + "step": 5621 + }, + { + "epoch": 0.9621769638884135, + "grad_norm": 25.2308292388916, + "learning_rate": 2.943956181734399e-05, + "loss": 3.3288, + "step": 5622 + }, + { + "epoch": 0.9623481088481944, + "grad_norm": 15.033297538757324, + "learning_rate": 2.943643754719658e-05, + "loss": 1.8449, + "step": 5623 + }, + { + "epoch": 0.9625192538079753, + "grad_norm": 20.76141357421875, + "learning_rate": 2.943330475952951e-05, + "loss": 1.8632, + "step": 5624 + }, + { + "epoch": 0.9626903987677563, + "grad_norm": 3.1637096405029297, + "learning_rate": 2.9430163456191132e-05, + "loss": 0.3309, + "step": 5625 + }, + { + "epoch": 0.9628615437275372, + "grad_norm": 24.777233123779297, + "learning_rate": 2.9427013639034825e-05, + "loss": 2.8741, + "step": 5626 + }, + { + "epoch": 0.9630326886873182, + "grad_norm": 0.9104720950126648, + "learning_rate": 2.9423855309918986e-05, + "loss": 0.1717, + "step": 5627 + }, + { + "epoch": 0.963203833647099, + "grad_norm": 85.43341827392578, + 
"learning_rate": 2.942068847070703e-05, + "loss": 2.3461, + "step": 5628 + }, + { + "epoch": 0.96337497860688, + "grad_norm": 18.911468505859375, + "learning_rate": 2.941751312326739e-05, + "loss": 1.7253, + "step": 5629 + }, + { + "epoch": 0.9635461235666609, + "grad_norm": 17.639625549316406, + "learning_rate": 2.941432926947354e-05, + "loss": 0.6695, + "step": 5630 + }, + { + "epoch": 0.9637172685264419, + "grad_norm": 24.79645347595215, + "learning_rate": 2.9411136911203945e-05, + "loss": 2.3482, + "step": 5631 + }, + { + "epoch": 0.9638884134862228, + "grad_norm": 18.90751838684082, + "learning_rate": 2.9407936050342114e-05, + "loss": 0.6091, + "step": 5632 + }, + { + "epoch": 0.9640595584460038, + "grad_norm": 0.7830367684364319, + "learning_rate": 2.940472668877655e-05, + "loss": 0.1618, + "step": 5633 + }, + { + "epoch": 0.9642307034057847, + "grad_norm": 17.63616943359375, + "learning_rate": 2.940150882840079e-05, + "loss": 1.4876, + "step": 5634 + }, + { + "epoch": 0.9644018483655656, + "grad_norm": 13.081395149230957, + "learning_rate": 2.939828247111336e-05, + "loss": 1.0297, + "step": 5635 + }, + { + "epoch": 0.9645729933253465, + "grad_norm": 6.1109819412231445, + "learning_rate": 2.9395047618817837e-05, + "loss": 0.5382, + "step": 5636 + }, + { + "epoch": 0.9647441382851275, + "grad_norm": 20.527721405029297, + "learning_rate": 2.939180427342277e-05, + "loss": 2.6818, + "step": 5637 + }, + { + "epoch": 0.9649152832449084, + "grad_norm": 18.875795364379883, + "learning_rate": 2.938855243684175e-05, + "loss": 1.6882, + "step": 5638 + }, + { + "epoch": 0.9650864282046894, + "grad_norm": 1.780640721321106, + "learning_rate": 2.938529211099336e-05, + "loss": 0.2069, + "step": 5639 + }, + { + "epoch": 0.9652575731644704, + "grad_norm": 76.09477233886719, + "learning_rate": 2.9382023297801196e-05, + "loss": 7.8139, + "step": 5640 + }, + { + "epoch": 0.9654287181242512, + "grad_norm": 12.15816879272461, + "learning_rate": 2.9378745999193868e-05, + "loss": 0.8883, + "step": 5641 + }, + { + "epoch": 0.9655998630840322, + "grad_norm": 55.28908920288086, + "learning_rate": 2.9375460217104984e-05, + "loss": 0.6874, + "step": 5642 + }, + { + "epoch": 0.9657710080438131, + "grad_norm": 22.68217658996582, + "learning_rate": 2.937216595347316e-05, + "loss": 2.2194, + "step": 5643 + }, + { + "epoch": 0.9659421530035941, + "grad_norm": 25.656339645385742, + "learning_rate": 2.9368863210242015e-05, + "loss": 0.8989, + "step": 5644 + }, + { + "epoch": 0.966113297963375, + "grad_norm": 21.246002197265625, + "learning_rate": 2.9365551989360176e-05, + "loss": 1.7201, + "step": 5645 + }, + { + "epoch": 0.966284442923156, + "grad_norm": 76.55830383300781, + "learning_rate": 2.9362232292781267e-05, + "loss": 7.3444, + "step": 5646 + }, + { + "epoch": 0.9664555878829368, + "grad_norm": 0.7908893823623657, + "learning_rate": 2.935890412246391e-05, + "loss": 0.1723, + "step": 5647 + }, + { + "epoch": 0.9666267328427178, + "grad_norm": 78.82179260253906, + "learning_rate": 2.9355567480371734e-05, + "loss": 8.4267, + "step": 5648 + }, + { + "epoch": 0.9667978778024987, + "grad_norm": 11.044159889221191, + "learning_rate": 2.9352222368473366e-05, + "loss": 1.4268, + "step": 5649 + }, + { + "epoch": 0.9669690227622797, + "grad_norm": 13.06053638458252, + "learning_rate": 2.934886878874242e-05, + "loss": 0.9995, + "step": 5650 + }, + { + "epoch": 0.9671401677220606, + "grad_norm": 28.17104148864746, + "learning_rate": 2.934550674315752e-05, + "loss": 4.9991, + "step": 5651 + }, + { + "epoch": 
0.9673113126818416, + "grad_norm": 2.7776384353637695, + "learning_rate": 2.9342136233702272e-05, + "loss": 0.314, + "step": 5652 + }, + { + "epoch": 0.9674824576416224, + "grad_norm": 2.665609836578369, + "learning_rate": 2.9338757262365284e-05, + "loss": 0.2768, + "step": 5653 + }, + { + "epoch": 0.9676536026014034, + "grad_norm": 0.5785967707633972, + "learning_rate": 2.9335369831140155e-05, + "loss": 0.1556, + "step": 5654 + }, + { + "epoch": 0.9678247475611843, + "grad_norm": 27.51642417907715, + "learning_rate": 2.9331973942025476e-05, + "loss": 2.3748, + "step": 5655 + }, + { + "epoch": 0.9679958925209653, + "grad_norm": 9.160310745239258, + "learning_rate": 2.932856959702482e-05, + "loss": 0.5655, + "step": 5656 + }, + { + "epoch": 0.9681670374807462, + "grad_norm": 4.589384078979492, + "learning_rate": 2.932515679814676e-05, + "loss": 0.4441, + "step": 5657 + }, + { + "epoch": 0.9683381824405272, + "grad_norm": 22.23826026916504, + "learning_rate": 2.9321735547404857e-05, + "loss": 2.5487, + "step": 5658 + }, + { + "epoch": 0.968509327400308, + "grad_norm": 14.580739974975586, + "learning_rate": 2.931830584681765e-05, + "loss": 1.1825, + "step": 5659 + }, + { + "epoch": 0.968680472360089, + "grad_norm": 24.317304611206055, + "learning_rate": 2.931486769840866e-05, + "loss": 2.8601, + "step": 5660 + }, + { + "epoch": 0.9688516173198699, + "grad_norm": 16.11456871032715, + "learning_rate": 2.9311421104206407e-05, + "loss": 1.542, + "step": 5661 + }, + { + "epoch": 0.9690227622796509, + "grad_norm": 4.138606548309326, + "learning_rate": 2.9307966066244392e-05, + "loss": 0.4524, + "step": 5662 + }, + { + "epoch": 0.9691939072394318, + "grad_norm": 86.82868194580078, + "learning_rate": 2.9304502586561086e-05, + "loss": 7.8846, + "step": 5663 + }, + { + "epoch": 0.9693650521992128, + "grad_norm": 6.498536109924316, + "learning_rate": 2.9301030667199943e-05, + "loss": 0.3622, + "step": 5664 + }, + { + "epoch": 0.9695361971589936, + "grad_norm": 16.460418701171875, + "learning_rate": 2.929755031020941e-05, + "loss": 1.379, + "step": 5665 + }, + { + "epoch": 0.9697073421187746, + "grad_norm": 4.3522539138793945, + "learning_rate": 2.92940615176429e-05, + "loss": 0.4806, + "step": 5666 + }, + { + "epoch": 0.9698784870785555, + "grad_norm": 2.5449700355529785, + "learning_rate": 2.92905642915588e-05, + "loss": 0.4481, + "step": 5667 + }, + { + "epoch": 0.9700496320383365, + "grad_norm": 22.875797271728516, + "learning_rate": 2.928705863402048e-05, + "loss": 1.7887, + "step": 5668 + }, + { + "epoch": 0.9702207769981174, + "grad_norm": 1.1875667572021484, + "learning_rate": 2.928354454709629e-05, + "loss": 0.1802, + "step": 5669 + }, + { + "epoch": 0.9703919219578984, + "grad_norm": 0.7507598400115967, + "learning_rate": 2.9280022032859543e-05, + "loss": 0.1708, + "step": 5670 + }, + { + "epoch": 0.9705630669176792, + "grad_norm": 4.884947776794434, + "learning_rate": 2.9276491093388525e-05, + "loss": 0.3205, + "step": 5671 + }, + { + "epoch": 0.9707342118774602, + "grad_norm": 28.726037979125977, + "learning_rate": 2.9272951730766496e-05, + "loss": 1.3092, + "step": 5672 + }, + { + "epoch": 0.9709053568372411, + "grad_norm": 38.645179748535156, + "learning_rate": 2.9269403947081693e-05, + "loss": 6.425, + "step": 5673 + }, + { + "epoch": 0.9710765017970221, + "grad_norm": 30.966270446777344, + "learning_rate": 2.9265847744427305e-05, + "loss": 3.1593, + "step": 5674 + }, + { + "epoch": 0.971247646756803, + "grad_norm": 12.976767539978027, + "learning_rate": 2.92622831249015e-05, + "loss": 
0.7746, + "step": 5675 + }, + { + "epoch": 0.971418791716584, + "grad_norm": 17.295045852661133, + "learning_rate": 2.9258710090607405e-05, + "loss": 1.3965, + "step": 5676 + }, + { + "epoch": 0.9715899366763648, + "grad_norm": 19.398035049438477, + "learning_rate": 2.925512864365312e-05, + "loss": 2.2865, + "step": 5677 + }, + { + "epoch": 0.9717610816361458, + "grad_norm": 0.4852750301361084, + "learning_rate": 2.9251538786151702e-05, + "loss": 0.1467, + "step": 5678 + }, + { + "epoch": 0.9719322265959267, + "grad_norm": 0.7427716851234436, + "learning_rate": 2.9247940520221176e-05, + "loss": 0.1506, + "step": 5679 + }, + { + "epoch": 0.9721033715557077, + "grad_norm": 16.088226318359375, + "learning_rate": 2.9244333847984522e-05, + "loss": 1.5801, + "step": 5680 + }, + { + "epoch": 0.9722745165154886, + "grad_norm": 9.518933296203613, + "learning_rate": 2.9240718771569676e-05, + "loss": 0.5943, + "step": 5681 + }, + { + "epoch": 0.9724456614752696, + "grad_norm": 20.686717987060547, + "learning_rate": 2.923709529310955e-05, + "loss": 1.8611, + "step": 5682 + }, + { + "epoch": 0.9726168064350504, + "grad_norm": 13.390531539916992, + "learning_rate": 2.923346341474199e-05, + "loss": 1.3245, + "step": 5683 + }, + { + "epoch": 0.9727879513948314, + "grad_norm": 6.539584159851074, + "learning_rate": 2.922982313860982e-05, + "loss": 0.3497, + "step": 5684 + }, + { + "epoch": 0.9729590963546123, + "grad_norm": 32.39887619018555, + "learning_rate": 2.9226174466860797e-05, + "loss": 3.6324, + "step": 5685 + }, + { + "epoch": 0.9731302413143933, + "grad_norm": 1.9811031818389893, + "learning_rate": 2.922251740164765e-05, + "loss": 0.3822, + "step": 5686 + }, + { + "epoch": 0.9733013862741742, + "grad_norm": 18.403629302978516, + "learning_rate": 2.921885194512806e-05, + "loss": 1.9106, + "step": 5687 + }, + { + "epoch": 0.9734725312339552, + "grad_norm": 49.918663024902344, + "learning_rate": 2.921517809946464e-05, + "loss": 5.6214, + "step": 5688 + }, + { + "epoch": 0.973643676193736, + "grad_norm": 12.015061378479004, + "learning_rate": 2.921149586682497e-05, + "loss": 1.2064, + "step": 5689 + }, + { + "epoch": 0.973814821153517, + "grad_norm": 19.471036911010742, + "learning_rate": 2.9207805249381565e-05, + "loss": 1.6467, + "step": 5690 + }, + { + "epoch": 0.973985966113298, + "grad_norm": 32.368045806884766, + "learning_rate": 2.9204106249311904e-05, + "loss": 1.7296, + "step": 5691 + }, + { + "epoch": 0.9741571110730789, + "grad_norm": 26.181180953979492, + "learning_rate": 2.92003988687984e-05, + "loss": 2.8505, + "step": 5692 + }, + { + "epoch": 0.9743282560328599, + "grad_norm": 12.218537330627441, + "learning_rate": 2.9196683110028412e-05, + "loss": 0.8142, + "step": 5693 + }, + { + "epoch": 0.9744994009926408, + "grad_norm": 23.380990982055664, + "learning_rate": 2.9192958975194248e-05, + "loss": 1.4624, + "step": 5694 + }, + { + "epoch": 0.9746705459524218, + "grad_norm": 17.542766571044922, + "learning_rate": 2.9189226466493143e-05, + "loss": 1.4295, + "step": 5695 + }, + { + "epoch": 0.9748416909122026, + "grad_norm": 13.465328216552734, + "learning_rate": 2.9185485586127293e-05, + "loss": 1.2529, + "step": 5696 + }, + { + "epoch": 0.9750128358719836, + "grad_norm": 19.861682891845703, + "learning_rate": 2.9181736336303814e-05, + "loss": 2.0874, + "step": 5697 + }, + { + "epoch": 0.9751839808317645, + "grad_norm": 2.460387706756592, + "learning_rate": 2.9177978719234775e-05, + "loss": 0.345, + "step": 5698 + }, + { + "epoch": 0.9753551257915455, + "grad_norm": 11.749364852905273, + 
"learning_rate": 2.9174212737137177e-05, + "loss": 0.8893, + "step": 5699 + }, + { + "epoch": 0.9755262707513264, + "grad_norm": 8.463294982910156, + "learning_rate": 2.9170438392232947e-05, + "loss": 0.6591, + "step": 5700 + }, + { + "epoch": 0.9756974157111074, + "grad_norm": 23.321102142333984, + "learning_rate": 2.9166655686748964e-05, + "loss": 3.1519, + "step": 5701 + }, + { + "epoch": 0.9758685606708882, + "grad_norm": 43.21113586425781, + "learning_rate": 2.916286462291702e-05, + "loss": 5.9953, + "step": 5702 + }, + { + "epoch": 0.9760397056306692, + "grad_norm": 6.378363609313965, + "learning_rate": 2.915906520297386e-05, + "loss": 0.5454, + "step": 5703 + }, + { + "epoch": 0.9762108505904501, + "grad_norm": 22.250072479248047, + "learning_rate": 2.915525742916114e-05, + "loss": 2.7336, + "step": 5704 + }, + { + "epoch": 0.9763819955502311, + "grad_norm": 16.970355987548828, + "learning_rate": 2.915144130372545e-05, + "loss": 1.6552, + "step": 5705 + }, + { + "epoch": 0.976553140510012, + "grad_norm": 2.4970545768737793, + "learning_rate": 2.914761682891831e-05, + "loss": 0.3561, + "step": 5706 + }, + { + "epoch": 0.976724285469793, + "grad_norm": 24.48183822631836, + "learning_rate": 2.9143784006996174e-05, + "loss": 2.858, + "step": 5707 + }, + { + "epoch": 0.9768954304295738, + "grad_norm": 24.73363494873047, + "learning_rate": 2.9139942840220407e-05, + "loss": 2.3049, + "step": 5708 + }, + { + "epoch": 0.9770665753893548, + "grad_norm": 1.5006498098373413, + "learning_rate": 2.91360933308573e-05, + "loss": 0.2978, + "step": 5709 + }, + { + "epoch": 0.9772377203491357, + "grad_norm": Infinity, + "learning_rate": 2.91360933308573e-05, + "loss": 1.1356, + "step": 5710 + }, + { + "epoch": 0.9774088653089167, + "grad_norm": 27.760709762573242, + "learning_rate": 2.9132235481178077e-05, + "loss": 2.9795, + "step": 5711 + }, + { + "epoch": 0.9775800102686976, + "grad_norm": 3.4136054515838623, + "learning_rate": 2.912836929345887e-05, + "loss": 0.313, + "step": 5712 + }, + { + "epoch": 0.9777511552284786, + "grad_norm": 7.273303031921387, + "learning_rate": 2.912449476998073e-05, + "loss": 1.0696, + "step": 5713 + }, + { + "epoch": 0.9779223001882594, + "grad_norm": 13.993139266967773, + "learning_rate": 2.9120611913029645e-05, + "loss": 1.2567, + "step": 5714 + }, + { + "epoch": 0.9780934451480404, + "grad_norm": 14.55916690826416, + "learning_rate": 2.9116720724896495e-05, + "loss": 1.4643, + "step": 5715 + }, + { + "epoch": 0.9782645901078213, + "grad_norm": 19.891088485717773, + "learning_rate": 2.9112821207877092e-05, + "loss": 2.3964, + "step": 5716 + }, + { + "epoch": 0.9784357350676023, + "grad_norm": 19.82097625732422, + "learning_rate": 2.9108913364272157e-05, + "loss": 2.4607, + "step": 5717 + }, + { + "epoch": 0.9786068800273832, + "grad_norm": 23.450258255004883, + "learning_rate": 2.9104997196387325e-05, + "loss": 1.9309, + "step": 5718 + }, + { + "epoch": 0.9787780249871642, + "grad_norm": 55.61565017700195, + "learning_rate": 2.9101072706533134e-05, + "loss": 2.2202, + "step": 5719 + }, + { + "epoch": 0.978949169946945, + "grad_norm": 10.280345916748047, + "learning_rate": 2.9097139897025045e-05, + "loss": 1.3173, + "step": 5720 + }, + { + "epoch": 0.979120314906726, + "grad_norm": 24.033308029174805, + "learning_rate": 2.9093198770183416e-05, + "loss": 2.455, + "step": 5721 + }, + { + "epoch": 0.9792914598665069, + "grad_norm": 8.36307430267334, + "learning_rate": 2.9089249328333528e-05, + "loss": 0.7162, + "step": 5722 + }, + { + "epoch": 0.9794626048262879, + 
"grad_norm": 16.257606506347656, + "learning_rate": 2.9085291573805546e-05, + "loss": 1.3828, + "step": 5723 + }, + { + "epoch": 0.9796337497860688, + "grad_norm": 7.985149383544922, + "learning_rate": 2.9081325508934556e-05, + "loss": 0.4998, + "step": 5724 + }, + { + "epoch": 0.9798048947458498, + "grad_norm": 11.047928810119629, + "learning_rate": 2.9077351136060545e-05, + "loss": 0.8385, + "step": 5725 + }, + { + "epoch": 0.9799760397056306, + "grad_norm": 7.28739070892334, + "learning_rate": 2.907336845752839e-05, + "loss": 0.4586, + "step": 5726 + }, + { + "epoch": 0.9801471846654116, + "grad_norm": 90.9510269165039, + "learning_rate": 2.906937747568789e-05, + "loss": 0.9434, + "step": 5727 + }, + { + "epoch": 0.9803183296251925, + "grad_norm": 1.1575900316238403, + "learning_rate": 2.906537819289372e-05, + "loss": 0.1922, + "step": 5728 + }, + { + "epoch": 0.9804894745849735, + "grad_norm": 19.945966720581055, + "learning_rate": 2.906137061150547e-05, + "loss": 2.3853, + "step": 5729 + }, + { + "epoch": 0.9806606195447544, + "grad_norm": 22.720388412475586, + "learning_rate": 2.9057354733887612e-05, + "loss": 2.3841, + "step": 5730 + }, + { + "epoch": 0.9808317645045354, + "grad_norm": 22.19735336303711, + "learning_rate": 2.9053330562409525e-05, + "loss": 2.4123, + "step": 5731 + }, + { + "epoch": 0.9810029094643162, + "grad_norm": 0.8141705393791199, + "learning_rate": 2.9049298099445474e-05, + "loss": 0.1741, + "step": 5732 + }, + { + "epoch": 0.9811740544240972, + "grad_norm": 109.93119049072266, + "learning_rate": 2.9045257347374616e-05, + "loss": 7.3922, + "step": 5733 + }, + { + "epoch": 0.9813451993838781, + "grad_norm": 10.224390029907227, + "learning_rate": 2.9041208308581005e-05, + "loss": 0.6587, + "step": 5734 + }, + { + "epoch": 0.9815163443436591, + "grad_norm": 28.685192108154297, + "learning_rate": 2.903715098545358e-05, + "loss": 1.9348, + "step": 5735 + }, + { + "epoch": 0.98168748930344, + "grad_norm": 27.049457550048828, + "learning_rate": 2.9033085380386163e-05, + "loss": 3.427, + "step": 5736 + }, + { + "epoch": 0.981858634263221, + "grad_norm": 15.711125373840332, + "learning_rate": 2.902901149577747e-05, + "loss": 1.3106, + "step": 5737 + }, + { + "epoch": 0.9820297792230018, + "grad_norm": 28.39513397216797, + "learning_rate": 2.9024929334031102e-05, + "loss": 2.721, + "step": 5738 + }, + { + "epoch": 0.9822009241827828, + "grad_norm": 0.9611316919326782, + "learning_rate": 2.9020838897555534e-05, + "loss": 0.1634, + "step": 5739 + }, + { + "epoch": 0.9823720691425637, + "grad_norm": 20.618175506591797, + "learning_rate": 2.9016740188764137e-05, + "loss": 2.3602, + "step": 5740 + }, + { + "epoch": 0.9825432141023447, + "grad_norm": 16.827938079833984, + "learning_rate": 2.9012633210075146e-05, + "loss": 1.5323, + "step": 5741 + }, + { + "epoch": 0.9827143590621257, + "grad_norm": 10.538216590881348, + "learning_rate": 2.900851796391169e-05, + "loss": 0.8438, + "step": 5742 + }, + { + "epoch": 0.9828855040219066, + "grad_norm": 17.156232833862305, + "learning_rate": 2.9004394452701773e-05, + "loss": 1.5221, + "step": 5743 + }, + { + "epoch": 0.9830566489816875, + "grad_norm": 17.9312801361084, + "learning_rate": 2.9000262678878273e-05, + "loss": 1.7531, + "step": 5744 + }, + { + "epoch": 0.9832277939414684, + "grad_norm": 14.673002243041992, + "learning_rate": 2.8996122644878934e-05, + "loss": 1.2513, + "step": 5745 + }, + { + "epoch": 0.9833989389012494, + "grad_norm": 23.0510196685791, + "learning_rate": 2.899197435314639e-05, + "loss": 2.9687, + "step": 
5746 + }, + { + "epoch": 0.9835700838610303, + "grad_norm": 10.26144790649414, + "learning_rate": 2.8987817806128138e-05, + "loss": 1.1219, + "step": 5747 + }, + { + "epoch": 0.9837412288208113, + "grad_norm": 29.97235107421875, + "learning_rate": 2.8983653006276544e-05, + "loss": 1.2083, + "step": 5748 + }, + { + "epoch": 0.9839123737805922, + "grad_norm": 36.61714172363281, + "learning_rate": 2.897947995604885e-05, + "loss": 2.4298, + "step": 5749 + }, + { + "epoch": 0.9840835187403731, + "grad_norm": 22.347938537597656, + "learning_rate": 2.8975298657907158e-05, + "loss": 2.2842, + "step": 5750 + }, + { + "epoch": 0.984254663700154, + "grad_norm": 12.373258590698242, + "learning_rate": 2.8971109114318446e-05, + "loss": 0.78, + "step": 5751 + }, + { + "epoch": 0.984425808659935, + "grad_norm": 22.580699920654297, + "learning_rate": 2.8966911327754543e-05, + "loss": 3.0575, + "step": 5752 + }, + { + "epoch": 0.9845969536197159, + "grad_norm": 7.398533344268799, + "learning_rate": 2.8962705300692156e-05, + "loss": 0.9823, + "step": 5753 + }, + { + "epoch": 0.9847680985794969, + "grad_norm": 25.29918098449707, + "learning_rate": 2.8958491035612842e-05, + "loss": 2.2433, + "step": 5754 + }, + { + "epoch": 0.9849392435392778, + "grad_norm": 24.103534698486328, + "learning_rate": 2.895426853500303e-05, + "loss": 2.4689, + "step": 5755 + }, + { + "epoch": 0.9851103884990587, + "grad_norm": 20.637611389160156, + "learning_rate": 2.8950037801353995e-05, + "loss": 1.447, + "step": 5756 + }, + { + "epoch": 0.9852815334588396, + "grad_norm": 21.759105682373047, + "learning_rate": 2.894579883716188e-05, + "loss": 2.3284, + "step": 5757 + }, + { + "epoch": 0.9854526784186206, + "grad_norm": 15.341378211975098, + "learning_rate": 2.894155164492768e-05, + "loss": 1.1234, + "step": 5758 + }, + { + "epoch": 0.9856238233784015, + "grad_norm": 3.0613319873809814, + "learning_rate": 2.8937296227157242e-05, + "loss": 0.3687, + "step": 5759 + }, + { + "epoch": 0.9857949683381825, + "grad_norm": 13.759583473205566, + "learning_rate": 2.8933032586361275e-05, + "loss": 1.0151, + "step": 5760 + }, + { + "epoch": 0.9859661132979634, + "grad_norm": 17.648202896118164, + "learning_rate": 2.8928760725055328e-05, + "loss": 1.4579, + "step": 5761 + }, + { + "epoch": 0.9861372582577443, + "grad_norm": 11.312187194824219, + "learning_rate": 2.8924480645759805e-05, + "loss": 1.0932, + "step": 5762 + }, + { + "epoch": 0.9863084032175252, + "grad_norm": 27.215620040893555, + "learning_rate": 2.892019235099996e-05, + "loss": 2.8235, + "step": 5763 + }, + { + "epoch": 0.9864795481773062, + "grad_norm": 4.202890872955322, + "learning_rate": 2.8915895843305896e-05, + "loss": 0.3119, + "step": 5764 + }, + { + "epoch": 0.9866506931370871, + "grad_norm": 4.855360984802246, + "learning_rate": 2.891159112521256e-05, + "loss": 0.3622, + "step": 5765 + }, + { + "epoch": 0.9868218380968681, + "grad_norm": 21.578210830688477, + "learning_rate": 2.8907278199259737e-05, + "loss": 1.6091, + "step": 5766 + }, + { + "epoch": 0.986992983056649, + "grad_norm": 4.96875524520874, + "learning_rate": 2.8902957067992063e-05, + "loss": 0.5169, + "step": 5767 + }, + { + "epoch": 0.9871641280164299, + "grad_norm": 26.378623962402344, + "learning_rate": 2.8898627733959008e-05, + "loss": 3.8767, + "step": 5768 + }, + { + "epoch": 0.9873352729762108, + "grad_norm": 22.44243621826172, + "learning_rate": 2.8894290199714893e-05, + "loss": 2.9247, + "step": 5769 + }, + { + "epoch": 0.9875064179359918, + "grad_norm": 17.894241333007812, + "learning_rate": 
2.888994446781886e-05, + "loss": 2.0144, + "step": 5770 + }, + { + "epoch": 0.9876775628957727, + "grad_norm": 19.04180145263672, + "learning_rate": 2.888559054083491e-05, + "loss": 1.5276, + "step": 5771 + }, + { + "epoch": 0.9878487078555537, + "grad_norm": 16.866159439086914, + "learning_rate": 2.8881228421331854e-05, + "loss": 1.3376, + "step": 5772 + }, + { + "epoch": 0.9880198528153346, + "grad_norm": 18.463472366333008, + "learning_rate": 2.8876858111883352e-05, + "loss": 1.501, + "step": 5773 + }, + { + "epoch": 0.9881909977751155, + "grad_norm": 52.910152435302734, + "learning_rate": 2.88724796150679e-05, + "loss": 0.5458, + "step": 5774 + }, + { + "epoch": 0.9883621427348964, + "grad_norm": 15.363899230957031, + "learning_rate": 2.8868092933468808e-05, + "loss": 1.302, + "step": 5775 + }, + { + "epoch": 0.9885332876946774, + "grad_norm": 22.834556579589844, + "learning_rate": 2.886369806967423e-05, + "loss": 2.5358, + "step": 5776 + }, + { + "epoch": 0.9887044326544583, + "grad_norm": 7.162318229675293, + "learning_rate": 2.885929502627714e-05, + "loss": 0.4514, + "step": 5777 + }, + { + "epoch": 0.9888755776142393, + "grad_norm": 5.512144565582275, + "learning_rate": 2.8854883805875346e-05, + "loss": 0.2699, + "step": 5778 + }, + { + "epoch": 0.9890467225740202, + "grad_norm": 8.424713134765625, + "learning_rate": 2.8850464411071465e-05, + "loss": 1.254, + "step": 5779 + }, + { + "epoch": 0.9892178675338011, + "grad_norm": 9.863699913024902, + "learning_rate": 2.884603684447296e-05, + "loss": 0.6334, + "step": 5780 + }, + { + "epoch": 0.989389012493582, + "grad_norm": 7.864769458770752, + "learning_rate": 2.8841601108692086e-05, + "loss": 0.5012, + "step": 5781 + }, + { + "epoch": 0.989560157453363, + "grad_norm": 28.139631271362305, + "learning_rate": 2.883715720634594e-05, + "loss": 2.9025, + "step": 5782 + }, + { + "epoch": 0.9897313024131439, + "grad_norm": 20.099287033081055, + "learning_rate": 2.883270514005644e-05, + "loss": 1.7654, + "step": 5783 + }, + { + "epoch": 0.9899024473729249, + "grad_norm": 11.549965858459473, + "learning_rate": 2.8828244912450305e-05, + "loss": 0.3662, + "step": 5784 + }, + { + "epoch": 0.9900735923327058, + "grad_norm": 15.588282585144043, + "learning_rate": 2.8823776526159067e-05, + "loss": 1.2371, + "step": 5785 + }, + { + "epoch": 0.9902447372924867, + "grad_norm": 14.879979133605957, + "learning_rate": 2.8819299983819093e-05, + "loss": 1.2031, + "step": 5786 + }, + { + "epoch": 0.9904158822522676, + "grad_norm": 16.02366828918457, + "learning_rate": 2.8814815288071547e-05, + "loss": 1.3539, + "step": 5787 + }, + { + "epoch": 0.9905870272120486, + "grad_norm": 1.5881539583206177, + "learning_rate": 2.8810322441562403e-05, + "loss": 0.2036, + "step": 5788 + }, + { + "epoch": 0.9907581721718295, + "grad_norm": 106.81831359863281, + "learning_rate": 2.8805821446942446e-05, + "loss": 8.9651, + "step": 5789 + }, + { + "epoch": 0.9909293171316105, + "grad_norm": 20.03091812133789, + "learning_rate": 2.8801312306867275e-05, + "loss": 1.7219, + "step": 5790 + }, + { + "epoch": 0.9911004620913914, + "grad_norm": 30.344528198242188, + "learning_rate": 2.8796795023997282e-05, + "loss": 5.7801, + "step": 5791 + }, + { + "epoch": 0.9912716070511723, + "grad_norm": 21.556596755981445, + "learning_rate": 2.879226960099768e-05, + "loss": 1.8613, + "step": 5792 + }, + { + "epoch": 0.9914427520109533, + "grad_norm": 21.872007369995117, + "learning_rate": 2.8787736040538466e-05, + "loss": 1.746, + "step": 5793 + }, + { + "epoch": 0.9916138969707342, + 
"grad_norm": 25.079710006713867, + "learning_rate": 2.878319434529445e-05, + "loss": 3.4592, + "step": 5794 + }, + { + "epoch": 0.9917850419305152, + "grad_norm": 20.306957244873047, + "learning_rate": 2.877864451794525e-05, + "loss": 2.257, + "step": 5795 + }, + { + "epoch": 0.9919561868902961, + "grad_norm": 9.488975524902344, + "learning_rate": 2.8774086561175256e-05, + "loss": 0.6263, + "step": 5796 + }, + { + "epoch": 0.9921273318500771, + "grad_norm": 18.889690399169922, + "learning_rate": 2.8769520477673678e-05, + "loss": 2.3502, + "step": 5797 + }, + { + "epoch": 0.9922984768098579, + "grad_norm": 8.002593994140625, + "learning_rate": 2.8764946270134506e-05, + "loss": 0.3844, + "step": 5798 + }, + { + "epoch": 0.9924696217696389, + "grad_norm": 23.74399757385254, + "learning_rate": 2.8760363941256532e-05, + "loss": 3.349, + "step": 5799 + }, + { + "epoch": 0.9926407667294198, + "grad_norm": 7.996278285980225, + "learning_rate": 2.875577349374334e-05, + "loss": 0.9081, + "step": 5800 + }, + { + "epoch": 0.9928119116892008, + "grad_norm": 50.602272033691406, + "learning_rate": 2.8751174930303295e-05, + "loss": 1.317, + "step": 5801 + }, + { + "epoch": 0.9929830566489817, + "grad_norm": 3.7856621742248535, + "learning_rate": 2.8746568253649562e-05, + "loss": 0.3388, + "step": 5802 + }, + { + "epoch": 0.9931542016087627, + "grad_norm": 7.5464091300964355, + "learning_rate": 2.874195346650008e-05, + "loss": 0.9043, + "step": 5803 + }, + { + "epoch": 0.9933253465685435, + "grad_norm": 3.0814945697784424, + "learning_rate": 2.8737330571577588e-05, + "loss": 0.3952, + "step": 5804 + }, + { + "epoch": 0.9934964915283245, + "grad_norm": 22.453025817871094, + "learning_rate": 2.8732699571609594e-05, + "loss": 2.323, + "step": 5805 + }, + { + "epoch": 0.9936676364881054, + "grad_norm": 26.30789566040039, + "learning_rate": 2.8728060469328404e-05, + "loss": 2.5883, + "step": 5806 + }, + { + "epoch": 0.9938387814478864, + "grad_norm": 19.208297729492188, + "learning_rate": 2.8723413267471086e-05, + "loss": 1.9291, + "step": 5807 + }, + { + "epoch": 0.9940099264076673, + "grad_norm": 6.7294206619262695, + "learning_rate": 2.8718757968779503e-05, + "loss": 0.546, + "step": 5808 + }, + { + "epoch": 0.9941810713674483, + "grad_norm": 3.1863274574279785, + "learning_rate": 2.871409457600028e-05, + "loss": 0.2827, + "step": 5809 + }, + { + "epoch": 0.9943522163272291, + "grad_norm": 15.646027565002441, + "learning_rate": 2.8709423091884836e-05, + "loss": 1.2538, + "step": 5810 + }, + { + "epoch": 0.9945233612870101, + "grad_norm": 4.021656513214111, + "learning_rate": 2.8704743519189347e-05, + "loss": 0.3942, + "step": 5811 + }, + { + "epoch": 0.994694506246791, + "grad_norm": 6.282193183898926, + "learning_rate": 2.8700055860674765e-05, + "loss": 0.5794, + "step": 5812 + }, + { + "epoch": 0.994865651206572, + "grad_norm": 31.727582931518555, + "learning_rate": 2.8695360119106822e-05, + "loss": 1.9632, + "step": 5813 + }, + { + "epoch": 0.9950367961663529, + "grad_norm": 62.91410827636719, + "learning_rate": 2.8690656297256014e-05, + "loss": 6.5346, + "step": 5814 + }, + { + "epoch": 0.9952079411261339, + "grad_norm": 131.5074005126953, + "learning_rate": 2.868594439789759e-05, + "loss": 2.5371, + "step": 5815 + }, + { + "epoch": 0.9953790860859147, + "grad_norm": 9.428857803344727, + "learning_rate": 2.8681224423811595e-05, + "loss": 0.8309, + "step": 5816 + }, + { + "epoch": 0.9955502310456957, + "grad_norm": 6.470557689666748, + "learning_rate": 2.8676496377782808e-05, + "loss": 0.498, + "step": 
5817 + }, + { + "epoch": 0.9957213760054766, + "grad_norm": 138.75096130371094, + "learning_rate": 2.867176026260079e-05, + "loss": 2.177, + "step": 5818 + }, + { + "epoch": 0.9958925209652576, + "grad_norm": 24.097305297851562, + "learning_rate": 2.8667016081059853e-05, + "loss": 5.9194, + "step": 5819 + }, + { + "epoch": 0.9960636659250385, + "grad_norm": 26.96247673034668, + "learning_rate": 2.866226383595907e-05, + "loss": 5.791, + "step": 5820 + }, + { + "epoch": 0.9962348108848195, + "grad_norm": 18.310457229614258, + "learning_rate": 2.865750353010227e-05, + "loss": 1.7159, + "step": 5821 + }, + { + "epoch": 0.9964059558446003, + "grad_norm": 10.174948692321777, + "learning_rate": 2.8652735166298053e-05, + "loss": 0.6524, + "step": 5822 + }, + { + "epoch": 0.9965771008043813, + "grad_norm": 14.041560173034668, + "learning_rate": 2.864795874735975e-05, + "loss": 1.3189, + "step": 5823 + }, + { + "epoch": 0.9967482457641622, + "grad_norm": 25.18544578552246, + "learning_rate": 2.8643174276105456e-05, + "loss": 2.4232, + "step": 5824 + }, + { + "epoch": 0.9969193907239432, + "grad_norm": 15.618341445922852, + "learning_rate": 2.863838175535802e-05, + "loss": 0.35, + "step": 5825 + }, + { + "epoch": 0.9970905356837241, + "grad_norm": 7.583703517913818, + "learning_rate": 2.8633581187945035e-05, + "loss": 0.9977, + "step": 5826 + }, + { + "epoch": 0.9972616806435051, + "grad_norm": 30.84953498840332, + "learning_rate": 2.862877257669884e-05, + "loss": 5.2481, + "step": 5827 + }, + { + "epoch": 0.9974328256032859, + "grad_norm": 1.524729609489441, + "learning_rate": 2.8623955924456532e-05, + "loss": 0.1755, + "step": 5828 + }, + { + "epoch": 0.9976039705630669, + "grad_norm": 13.6453857421875, + "learning_rate": 2.8619131234059926e-05, + "loss": 1.3664, + "step": 5829 + }, + { + "epoch": 0.9977751155228478, + "grad_norm": 48.88385009765625, + "learning_rate": 2.8614298508355615e-05, + "loss": 1.7573, + "step": 5830 + }, + { + "epoch": 0.9979462604826288, + "grad_norm": 3.5198447704315186, + "learning_rate": 2.8609457750194903e-05, + "loss": 0.3419, + "step": 5831 + }, + { + "epoch": 0.9981174054424097, + "grad_norm": 11.469450950622559, + "learning_rate": 2.8604608962433847e-05, + "loss": 0.8417, + "step": 5832 + }, + { + "epoch": 0.9982885504021907, + "grad_norm": 13.271893501281738, + "learning_rate": 2.859975214793324e-05, + "loss": 1.218, + "step": 5833 + }, + { + "epoch": 0.9984596953619715, + "grad_norm": 82.31917572021484, + "learning_rate": 2.859488730955861e-05, + "loss": 6.6491, + "step": 5834 + }, + { + "epoch": 0.9986308403217525, + "grad_norm": 7.952483654022217, + "learning_rate": 2.8590014450180218e-05, + "loss": 0.4058, + "step": 5835 + }, + { + "epoch": 0.9988019852815334, + "grad_norm": 1.028527855873108, + "learning_rate": 2.858513357267306e-05, + "loss": 0.1751, + "step": 5836 + }, + { + "epoch": 0.9989731302413144, + "grad_norm": 10.787403106689453, + "learning_rate": 2.858024467991686e-05, + "loss": 0.5362, + "step": 5837 + }, + { + "epoch": 0.9991442752010953, + "grad_norm": 17.176511764526367, + "learning_rate": 2.8575347774796066e-05, + "loss": 1.5915, + "step": 5838 + }, + { + "epoch": 0.9993154201608763, + "grad_norm": 1.1093616485595703, + "learning_rate": 2.857044286019987e-05, + "loss": 0.1634, + "step": 5839 + }, + { + "epoch": 0.9994865651206571, + "grad_norm": 118.06546020507812, + "learning_rate": 2.8565529939022174e-05, + "loss": 5.0138, + "step": 5840 + }, + { + "epoch": 0.9996577100804381, + "grad_norm": 9.499468803405762, + "learning_rate": 
2.856060901416161e-05, + "loss": 0.4036, + "step": 5841 + }, + { + "epoch": 0.999828855040219, + "grad_norm": 8.388986587524414, + "learning_rate": 2.8555680088521526e-05, + "loss": 0.6442, + "step": 5842 + }, + { + "epoch": 1.0, + "grad_norm": 7.313854217529297, + "learning_rate": 2.855074316501e-05, + "loss": 0.3924, + "step": 5843 + }, + { + "epoch": 1.000171144959781, + "grad_norm": 17.557804107666016, + "learning_rate": 2.8545798246539824e-05, + "loss": 1.6257, + "step": 5844 + }, + { + "epoch": 1.000342289919562, + "grad_norm": 21.283592224121094, + "learning_rate": 2.8540845336028503e-05, + "loss": 1.7384, + "step": 5845 + }, + { + "epoch": 1.0005134348793427, + "grad_norm": 40.37034606933594, + "learning_rate": 2.8535884436398268e-05, + "loss": 1.7387, + "step": 5846 + }, + { + "epoch": 1.0006845798391237, + "grad_norm": 11.7813081741333, + "learning_rate": 2.8530915550576057e-05, + "loss": 0.9925, + "step": 5847 + }, + { + "epoch": 1.0008557247989047, + "grad_norm": 30.10382843017578, + "learning_rate": 2.8525938681493515e-05, + "loss": 6.0054, + "step": 5848 + }, + { + "epoch": 1.0010268697586857, + "grad_norm": 23.582857131958008, + "learning_rate": 2.8520953832087005e-05, + "loss": 2.8579, + "step": 5849 + }, + { + "epoch": 1.0011980147184665, + "grad_norm": 5.103301048278809, + "learning_rate": 2.8515961005297594e-05, + "loss": 0.45, + "step": 5850 + }, + { + "epoch": 1.0013691596782475, + "grad_norm": 148.56295776367188, + "learning_rate": 2.851096020407106e-05, + "loss": 9.776, + "step": 5851 + }, + { + "epoch": 1.0015403046380285, + "grad_norm": 21.1070499420166, + "learning_rate": 2.850595143135788e-05, + "loss": 1.6187, + "step": 5852 + }, + { + "epoch": 1.0017114495978094, + "grad_norm": 0.8959584832191467, + "learning_rate": 2.850093469011324e-05, + "loss": 0.1585, + "step": 5853 + }, + { + "epoch": 1.0018825945575902, + "grad_norm": 27.73525619506836, + "learning_rate": 2.8495909983297022e-05, + "loss": 1.3172, + "step": 5854 + }, + { + "epoch": 1.0020537395173712, + "grad_norm": 8.246828079223633, + "learning_rate": 2.8490877313873814e-05, + "loss": 0.4995, + "step": 5855 + }, + { + "epoch": 1.0022248844771522, + "grad_norm": 26.056087493896484, + "learning_rate": 2.8485836684812896e-05, + "loss": 2.0727, + "step": 5856 + }, + { + "epoch": 1.0023960294369332, + "grad_norm": 15.057696342468262, + "learning_rate": 2.848078809908825e-05, + "loss": 1.5215, + "step": 5857 + }, + { + "epoch": 1.002567174396714, + "grad_norm": 8.138750076293945, + "learning_rate": 2.8475731559678545e-05, + "loss": 0.7365, + "step": 5858 + }, + { + "epoch": 1.002738319356495, + "grad_norm": 6.88981819152832, + "learning_rate": 2.8470667069567146e-05, + "loss": 0.5962, + "step": 5859 + }, + { + "epoch": 1.002909464316276, + "grad_norm": 22.670846939086914, + "learning_rate": 2.8465594631742116e-05, + "loss": 2.5846, + "step": 5860 + }, + { + "epoch": 1.003080609276057, + "grad_norm": 8.0685453414917, + "learning_rate": 2.846051424919619e-05, + "loss": 1.0937, + "step": 5861 + }, + { + "epoch": 1.0032517542358377, + "grad_norm": 26.379934310913086, + "learning_rate": 2.8455425924926812e-05, + "loss": 2.7335, + "step": 5862 + }, + { + "epoch": 1.0034228991956187, + "grad_norm": 12.378397941589355, + "learning_rate": 2.84503296619361e-05, + "loss": 1.1461, + "step": 5863 + }, + { + "epoch": 1.0035940441553997, + "grad_norm": 27.61152458190918, + "learning_rate": 2.8445225463230852e-05, + "loss": 2.5595, + "step": 5864 + }, + { + "epoch": 1.0037651891151806, + "grad_norm": 27.199756622314453, + 
"learning_rate": 2.8440113331822553e-05, + "loss": 4.2857, + "step": 5865 + }, + { + "epoch": 1.0039363340749614, + "grad_norm": 5.878708362579346, + "learning_rate": 2.843499327072737e-05, + "loss": 0.4147, + "step": 5866 + }, + { + "epoch": 1.0041074790347424, + "grad_norm": 15.698190689086914, + "learning_rate": 2.8429865282966146e-05, + "loss": 1.2913, + "step": 5867 + }, + { + "epoch": 1.0042786239945234, + "grad_norm": 18.01776123046875, + "learning_rate": 2.8424729371564404e-05, + "loss": 1.56, + "step": 5868 + }, + { + "epoch": 1.0044497689543044, + "grad_norm": 15.176900863647461, + "learning_rate": 2.8419585539552334e-05, + "loss": 1.3356, + "step": 5869 + }, + { + "epoch": 1.0046209139140851, + "grad_norm": 21.878238677978516, + "learning_rate": 2.841443378996481e-05, + "loss": 2.5368, + "step": 5870 + }, + { + "epoch": 1.0047920588738661, + "grad_norm": 101.53121948242188, + "learning_rate": 2.840927412584137e-05, + "loss": 6.9461, + "step": 5871 + }, + { + "epoch": 1.0049632038336471, + "grad_norm": 16.709596633911133, + "learning_rate": 2.840410655022622e-05, + "loss": 1.3958, + "step": 5872 + }, + { + "epoch": 1.0051343487934281, + "grad_norm": 14.55434799194336, + "learning_rate": 2.8398931066168244e-05, + "loss": 1.1612, + "step": 5873 + }, + { + "epoch": 1.0053054937532089, + "grad_norm": 3.6406946182250977, + "learning_rate": 2.839374767672098e-05, + "loss": 0.3982, + "step": 5874 + }, + { + "epoch": 1.0054766387129899, + "grad_norm": 14.200058937072754, + "learning_rate": 2.8388556384942638e-05, + "loss": 1.1531, + "step": 5875 + }, + { + "epoch": 1.0056477836727709, + "grad_norm": 6.786914825439453, + "learning_rate": 2.8383357193896086e-05, + "loss": 0.4971, + "step": 5876 + }, + { + "epoch": 1.0058189286325518, + "grad_norm": 18.153135299682617, + "learning_rate": 2.8378150106648857e-05, + "loss": 1.4036, + "step": 5877 + }, + { + "epoch": 1.0059900735923326, + "grad_norm": 19.425262451171875, + "learning_rate": 2.837293512627314e-05, + "loss": 1.226, + "step": 5878 + }, + { + "epoch": 1.0061612185521136, + "grad_norm": 26.178579330444336, + "learning_rate": 2.8367712255845776e-05, + "loss": 2.1104, + "step": 5879 + }, + { + "epoch": 1.0063323635118946, + "grad_norm": 21.157712936401367, + "learning_rate": 2.8362481498448274e-05, + "loss": 1.9276, + "step": 5880 + }, + { + "epoch": 1.0065035084716756, + "grad_norm": 9.509711265563965, + "learning_rate": 2.8357242857166787e-05, + "loss": 0.6173, + "step": 5881 + }, + { + "epoch": 1.0066746534314563, + "grad_norm": 22.75120735168457, + "learning_rate": 2.8351996335092114e-05, + "loss": 2.034, + "step": 5882 + }, + { + "epoch": 1.0068457983912373, + "grad_norm": 14.869056701660156, + "learning_rate": 2.8346741935319716e-05, + "loss": 1.3493, + "step": 5883 + }, + { + "epoch": 1.0070169433510183, + "grad_norm": 21.23749351501465, + "learning_rate": 2.8341479660949704e-05, + "loss": 1.853, + "step": 5884 + }, + { + "epoch": 1.0071880883107993, + "grad_norm": 24.229785919189453, + "learning_rate": 2.8336209515086813e-05, + "loss": 2.0553, + "step": 5885 + }, + { + "epoch": 1.00735923327058, + "grad_norm": 20.623104095458984, + "learning_rate": 2.8330931500840446e-05, + "loss": 2.5378, + "step": 5886 + }, + { + "epoch": 1.007530378230361, + "grad_norm": 17.120683670043945, + "learning_rate": 2.8325645621324642e-05, + "loss": 1.6522, + "step": 5887 + }, + { + "epoch": 1.007701523190142, + "grad_norm": 21.083953857421875, + "learning_rate": 2.8320351879658065e-05, + "loss": 2.0134, + "step": 5888 + }, + { + "epoch": 
1.007872668149923, + "grad_norm": 15.527298927307129, + "learning_rate": 2.8315050278964046e-05, + "loss": 1.5302, + "step": 5889 + }, + { + "epoch": 1.0080438131097038, + "grad_norm": 25.979265213012695, + "learning_rate": 2.830974082237053e-05, + "loss": 1.8001, + "step": 5890 + }, + { + "epoch": 1.0082149580694848, + "grad_norm": 19.79598045349121, + "learning_rate": 2.8304423513010098e-05, + "loss": 1.5617, + "step": 5891 + }, + { + "epoch": 1.0083861030292658, + "grad_norm": 15.19708251953125, + "learning_rate": 2.8299098354019984e-05, + "loss": 1.3218, + "step": 5892 + }, + { + "epoch": 1.0085572479890468, + "grad_norm": 16.328197479248047, + "learning_rate": 2.8293765348542028e-05, + "loss": 1.3238, + "step": 5893 + }, + { + "epoch": 1.0087283929488278, + "grad_norm": 14.854755401611328, + "learning_rate": 2.8288424499722717e-05, + "loss": 1.0872, + "step": 5894 + }, + { + "epoch": 1.0088995379086085, + "grad_norm": 14.211106300354004, + "learning_rate": 2.828307581071316e-05, + "loss": 1.2177, + "step": 5895 + }, + { + "epoch": 1.0090706828683895, + "grad_norm": 34.42771911621094, + "learning_rate": 2.827771928466909e-05, + "loss": 5.7955, + "step": 5896 + }, + { + "epoch": 1.0092418278281705, + "grad_norm": 25.623085021972656, + "learning_rate": 2.8272354924750864e-05, + "loss": 2.0299, + "step": 5897 + }, + { + "epoch": 1.0094129727879515, + "grad_norm": 14.502741813659668, + "learning_rate": 2.8266982734123462e-05, + "loss": 1.0217, + "step": 5898 + }, + { + "epoch": 1.0095841177477323, + "grad_norm": 7.936575889587402, + "learning_rate": 2.826160271595649e-05, + "loss": 0.896, + "step": 5899 + }, + { + "epoch": 1.0097552627075133, + "grad_norm": 7.099348545074463, + "learning_rate": 2.8256214873424163e-05, + "loss": 0.6054, + "step": 5900 + }, + { + "epoch": 1.0099264076672942, + "grad_norm": 23.936155319213867, + "learning_rate": 2.8250819209705313e-05, + "loss": 1.9715, + "step": 5901 + }, + { + "epoch": 1.0100975526270752, + "grad_norm": 29.676971435546875, + "learning_rate": 2.8245415727983395e-05, + "loss": 1.3108, + "step": 5902 + }, + { + "epoch": 1.010268697586856, + "grad_norm": 8.874768257141113, + "learning_rate": 2.8240004431446472e-05, + "loss": 0.5545, + "step": 5903 + }, + { + "epoch": 1.010439842546637, + "grad_norm": 26.877277374267578, + "learning_rate": 2.823458532328721e-05, + "loss": 5.8937, + "step": 5904 + }, + { + "epoch": 1.010610987506418, + "grad_norm": 17.6776123046875, + "learning_rate": 2.822915840670289e-05, + "loss": 1.5149, + "step": 5905 + }, + { + "epoch": 1.010782132466199, + "grad_norm": 100.99117279052734, + "learning_rate": 2.8223723684895413e-05, + "loss": 8.8381, + "step": 5906 + }, + { + "epoch": 1.0109532774259797, + "grad_norm": 12.721230506896973, + "learning_rate": 2.8218281161071265e-05, + "loss": 1.1296, + "step": 5907 + }, + { + "epoch": 1.0111244223857607, + "grad_norm": 20.310392379760742, + "learning_rate": 2.8212830838441544e-05, + "loss": 1.5951, + "step": 5908 + }, + { + "epoch": 1.0112955673455417, + "grad_norm": 24.384496688842773, + "learning_rate": 2.8207372720221944e-05, + "loss": 1.6431, + "step": 5909 + }, + { + "epoch": 1.0114667123053227, + "grad_norm": 10.639677047729492, + "learning_rate": 2.820190680963277e-05, + "loss": 1.4176, + "step": 5910 + }, + { + "epoch": 1.0116378572651035, + "grad_norm": 4.812514305114746, + "learning_rate": 2.8196433109898917e-05, + "loss": 0.4569, + "step": 5911 + }, + { + "epoch": 1.0118090022248845, + "grad_norm": 15.104774475097656, + "learning_rate": 2.819095162424987e-05, + 
"loss": 0.9118, + "step": 5912 + }, + { + "epoch": 1.0119801471846654, + "grad_norm": 16.543947219848633, + "learning_rate": 2.8185462355919717e-05, + "loss": 1.4082, + "step": 5913 + }, + { + "epoch": 1.0121512921444464, + "grad_norm": 10.53313159942627, + "learning_rate": 2.817996530814714e-05, + "loss": 0.6856, + "step": 5914 + }, + { + "epoch": 1.0123224371042272, + "grad_norm": 5.993205547332764, + "learning_rate": 2.8174460484175396e-05, + "loss": 0.4012, + "step": 5915 + }, + { + "epoch": 1.0124935820640082, + "grad_norm": 28.994531631469727, + "learning_rate": 2.8168947887252344e-05, + "loss": 5.966, + "step": 5916 + }, + { + "epoch": 1.0126647270237892, + "grad_norm": 16.274168014526367, + "learning_rate": 2.8163427520630427e-05, + "loss": 1.4668, + "step": 5917 + }, + { + "epoch": 1.0128358719835702, + "grad_norm": 19.380239486694336, + "learning_rate": 2.815789938756666e-05, + "loss": 1.4762, + "step": 5918 + }, + { + "epoch": 1.013007016943351, + "grad_norm": 11.244359016418457, + "learning_rate": 2.8152363491322658e-05, + "loss": 0.8843, + "step": 5919 + }, + { + "epoch": 1.013178161903132, + "grad_norm": 22.26492691040039, + "learning_rate": 2.814681983516461e-05, + "loss": 2.1597, + "step": 5920 + }, + { + "epoch": 1.013349306862913, + "grad_norm": 19.49887466430664, + "learning_rate": 2.8141268422363276e-05, + "loss": 1.7275, + "step": 5921 + }, + { + "epoch": 1.013520451822694, + "grad_norm": 22.942861557006836, + "learning_rate": 2.8135709256194e-05, + "loss": 2.0654, + "step": 5922 + }, + { + "epoch": 1.0136915967824747, + "grad_norm": 19.50592803955078, + "learning_rate": 2.81301423399367e-05, + "loss": 1.7861, + "step": 5923 + }, + { + "epoch": 1.0138627417422557, + "grad_norm": 16.96805763244629, + "learning_rate": 2.8124567676875854e-05, + "loss": 1.3677, + "step": 5924 + }, + { + "epoch": 1.0140338867020366, + "grad_norm": 28.91179084777832, + "learning_rate": 2.8118985270300535e-05, + "loss": 2.7091, + "step": 5925 + }, + { + "epoch": 1.0142050316618176, + "grad_norm": 30.7325496673584, + "learning_rate": 2.811339512350437e-05, + "loss": 4.6992, + "step": 5926 + }, + { + "epoch": 1.0143761766215984, + "grad_norm": 7.521573543548584, + "learning_rate": 2.8107797239785545e-05, + "loss": 0.6563, + "step": 5927 + }, + { + "epoch": 1.0145473215813794, + "grad_norm": 8.949661254882812, + "learning_rate": 2.8102191622446825e-05, + "loss": 0.7485, + "step": 5928 + }, + { + "epoch": 1.0147184665411604, + "grad_norm": 17.26093292236328, + "learning_rate": 2.8096578274795534e-05, + "loss": 1.5096, + "step": 5929 + }, + { + "epoch": 1.0148896115009414, + "grad_norm": 15.956555366516113, + "learning_rate": 2.809095720014356e-05, + "loss": 1.6165, + "step": 5930 + }, + { + "epoch": 1.0150607564607221, + "grad_norm": 27.52875518798828, + "learning_rate": 2.8085328401807334e-05, + "loss": 3.6796, + "step": 5931 + }, + { + "epoch": 1.0152319014205031, + "grad_norm": 11.933248519897461, + "learning_rate": 2.8079691883107857e-05, + "loss": 1.0633, + "step": 5932 + }, + { + "epoch": 1.0154030463802841, + "grad_norm": 20.19046974182129, + "learning_rate": 2.807404764737069e-05, + "loss": 1.8556, + "step": 5933 + }, + { + "epoch": 1.015574191340065, + "grad_norm": 17.771652221679688, + "learning_rate": 2.806839569792594e-05, + "loss": 1.6362, + "step": 5934 + }, + { + "epoch": 1.0157453362998459, + "grad_norm": 21.638553619384766, + "learning_rate": 2.8062736038108263e-05, + "loss": 2.0634, + "step": 5935 + }, + { + "epoch": 1.0159164812596269, + "grad_norm": 9.296874046325684, + 
"learning_rate": 2.805706867125687e-05, + "loss": 0.6169, + "step": 5936 + }, + { + "epoch": 1.0160876262194078, + "grad_norm": 36.52355194091797, + "learning_rate": 2.8051393600715507e-05, + "loss": 5.7418, + "step": 5937 + }, + { + "epoch": 1.0162587711791888, + "grad_norm": 20.66568946838379, + "learning_rate": 2.8045710829832482e-05, + "loss": 1.9262, + "step": 5938 + }, + { + "epoch": 1.0164299161389696, + "grad_norm": 16.43720245361328, + "learning_rate": 2.804002036196064e-05, + "loss": 1.259, + "step": 5939 + }, + { + "epoch": 1.0166010610987506, + "grad_norm": 11.791168212890625, + "learning_rate": 2.8034322200457354e-05, + "loss": 0.7025, + "step": 5940 + }, + { + "epoch": 1.0167722060585316, + "grad_norm": 3.836411476135254, + "learning_rate": 2.802861634868456e-05, + "loss": 0.3997, + "step": 5941 + }, + { + "epoch": 1.0169433510183126, + "grad_norm": 7.429993152618408, + "learning_rate": 2.8022902810008718e-05, + "loss": 0.5388, + "step": 5942 + }, + { + "epoch": 1.0171144959780933, + "grad_norm": 16.87652587890625, + "learning_rate": 2.8017181587800816e-05, + "loss": 1.3096, + "step": 5943 + }, + { + "epoch": 1.0172856409378743, + "grad_norm": 22.756383895874023, + "learning_rate": 2.801145268543639e-05, + "loss": 2.1821, + "step": 5944 + }, + { + "epoch": 1.0174567858976553, + "grad_norm": 3.1953980922698975, + "learning_rate": 2.8005716106295502e-05, + "loss": 0.3136, + "step": 5945 + }, + { + "epoch": 1.0176279308574363, + "grad_norm": 19.231718063354492, + "learning_rate": 2.799997185376274e-05, + "loss": 2.0442, + "step": 5946 + }, + { + "epoch": 1.0177990758172173, + "grad_norm": 15.215984344482422, + "learning_rate": 2.7994219931227218e-05, + "loss": 1.3251, + "step": 5947 + }, + { + "epoch": 1.017970220776998, + "grad_norm": 19.25925064086914, + "learning_rate": 2.7988460342082582e-05, + "loss": 1.6565, + "step": 5948 + }, + { + "epoch": 1.018141365736779, + "grad_norm": 22.894515991210938, + "learning_rate": 2.7982693089727e-05, + "loss": 2.3551, + "step": 5949 + }, + { + "epoch": 1.01831251069656, + "grad_norm": 27.23731803894043, + "learning_rate": 2.7976918177563157e-05, + "loss": 2.5454, + "step": 5950 + }, + { + "epoch": 1.018483655656341, + "grad_norm": 26.405460357666016, + "learning_rate": 2.797113560899826e-05, + "loss": 2.8609, + "step": 5951 + }, + { + "epoch": 1.0186548006161218, + "grad_norm": 15.821601867675781, + "learning_rate": 2.7965345387444035e-05, + "loss": 1.1841, + "step": 5952 + }, + { + "epoch": 1.0188259455759028, + "grad_norm": 23.906511306762695, + "learning_rate": 2.7959547516316723e-05, + "loss": 1.3018, + "step": 5953 + }, + { + "epoch": 1.0189970905356838, + "grad_norm": 26.260948181152344, + "learning_rate": 2.7953741999037074e-05, + "loss": 2.1539, + "step": 5954 + }, + { + "epoch": 1.0191682354954648, + "grad_norm": 15.948780059814453, + "learning_rate": 2.7947928839030343e-05, + "loss": 1.423, + "step": 5955 + }, + { + "epoch": 1.0193393804552455, + "grad_norm": 21.961246490478516, + "learning_rate": 2.794210803972632e-05, + "loss": 1.9959, + "step": 5956 + }, + { + "epoch": 1.0195105254150265, + "grad_norm": 20.09709358215332, + "learning_rate": 2.7936279604559274e-05, + "loss": 1.9793, + "step": 5957 + }, + { + "epoch": 1.0196816703748075, + "grad_norm": 14.262060165405273, + "learning_rate": 2.793044353696799e-05, + "loss": 1.1946, + "step": 5958 + }, + { + "epoch": 1.0198528153345885, + "grad_norm": 20.70449447631836, + "learning_rate": 2.7924599840395764e-05, + "loss": 1.7147, + "step": 5959 + }, + { + "epoch": 
1.0200239602943693, + "grad_norm": 9.10728645324707, + "learning_rate": 2.791874851829038e-05, + "loss": 1.2659, + "step": 5960 + }, + { + "epoch": 1.0201951052541502, + "grad_norm": 14.673197746276855, + "learning_rate": 2.791288957410413e-05, + "loss": 1.3024, + "step": 5961 + }, + { + "epoch": 1.0203662502139312, + "grad_norm": 19.009380340576172, + "learning_rate": 2.7907023011293794e-05, + "loss": 1.7915, + "step": 5962 + }, + { + "epoch": 1.0205373951737122, + "grad_norm": 22.03476333618164, + "learning_rate": 2.790114883332066e-05, + "loss": 1.9748, + "step": 5963 + }, + { + "epoch": 1.020708540133493, + "grad_norm": 24.187057495117188, + "learning_rate": 2.7895267043650498e-05, + "loss": 2.6434, + "step": 5964 + }, + { + "epoch": 1.020879685093274, + "grad_norm": 15.004847526550293, + "learning_rate": 2.7889377645753573e-05, + "loss": 1.1828, + "step": 5965 + }, + { + "epoch": 1.021050830053055, + "grad_norm": 21.534664154052734, + "learning_rate": 2.7883480643104636e-05, + "loss": 2.6369, + "step": 5966 + }, + { + "epoch": 1.021221975012836, + "grad_norm": 16.689029693603516, + "learning_rate": 2.7877576039182934e-05, + "loss": 1.5116, + "step": 5967 + }, + { + "epoch": 1.0213931199726167, + "grad_norm": 18.7231502532959, + "learning_rate": 2.7871663837472186e-05, + "loss": 1.4577, + "step": 5968 + }, + { + "epoch": 1.0215642649323977, + "grad_norm": 21.580528259277344, + "learning_rate": 2.7865744041460606e-05, + "loss": 2.8208, + "step": 5969 + }, + { + "epoch": 1.0217354098921787, + "grad_norm": 5.064426422119141, + "learning_rate": 2.7859816654640876e-05, + "loss": 0.3887, + "step": 5970 + }, + { + "epoch": 1.0219065548519597, + "grad_norm": 12.983659744262695, + "learning_rate": 2.7853881680510165e-05, + "loss": 1.1746, + "step": 5971 + }, + { + "epoch": 1.0220776998117405, + "grad_norm": 5.966195106506348, + "learning_rate": 2.784793912257012e-05, + "loss": 0.4734, + "step": 5972 + }, + { + "epoch": 1.0222488447715214, + "grad_norm": 14.946609497070312, + "learning_rate": 2.784198898432685e-05, + "loss": 1.2327, + "step": 5973 + }, + { + "epoch": 1.0224199897313024, + "grad_norm": 17.650800704956055, + "learning_rate": 2.7836031269290958e-05, + "loss": 1.3377, + "step": 5974 + }, + { + "epoch": 1.0225911346910834, + "grad_norm": 0.8784258365631104, + "learning_rate": 2.7830065980977493e-05, + "loss": 0.1799, + "step": 5975 + }, + { + "epoch": 1.0227622796508642, + "grad_norm": 1.385658621788025, + "learning_rate": 2.7824093122905993e-05, + "loss": 0.1917, + "step": 5976 + }, + { + "epoch": 1.0229334246106452, + "grad_norm": 22.822864532470703, + "learning_rate": 2.7818112698600452e-05, + "loss": 2.8015, + "step": 5977 + }, + { + "epoch": 1.0231045695704262, + "grad_norm": 16.914342880249023, + "learning_rate": 2.7812124711589323e-05, + "loss": 1.241, + "step": 5978 + }, + { + "epoch": 1.0232757145302072, + "grad_norm": 15.315549850463867, + "learning_rate": 2.7806129165405535e-05, + "loss": 1.1594, + "step": 5979 + }, + { + "epoch": 1.023446859489988, + "grad_norm": 43.14786148071289, + "learning_rate": 2.780012606358646e-05, + "loss": 1.5922, + "step": 5980 + }, + { + "epoch": 1.023618004449769, + "grad_norm": 19.019437789916992, + "learning_rate": 2.7794115409673942e-05, + "loss": 2.0881, + "step": 5981 + }, + { + "epoch": 1.02378914940955, + "grad_norm": 28.388994216918945, + "learning_rate": 2.778809720721428e-05, + "loss": 3.914, + "step": 5982 + }, + { + "epoch": 1.023960294369331, + "grad_norm": 27.33300018310547, + "learning_rate": 2.7782071459758215e-05, + "loss": 
3.924, + "step": 5983 + }, + { + "epoch": 1.0241314393291117, + "grad_norm": 14.441838264465332, + "learning_rate": 2.7776038170860952e-05, + "loss": 1.2082, + "step": 5984 + }, + { + "epoch": 1.0243025842888926, + "grad_norm": 16.712003707885742, + "learning_rate": 2.776999734408214e-05, + "loss": 2.1091, + "step": 5985 + }, + { + "epoch": 1.0244737292486736, + "grad_norm": 20.41153335571289, + "learning_rate": 2.776394898298587e-05, + "loss": 1.7813, + "step": 5986 + }, + { + "epoch": 1.0246448742084546, + "grad_norm": 11.618141174316406, + "learning_rate": 2.7757893091140692e-05, + "loss": 1.3035, + "step": 5987 + }, + { + "epoch": 1.0248160191682354, + "grad_norm": 17.35718536376953, + "learning_rate": 2.775182967211959e-05, + "loss": 1.7141, + "step": 5988 + }, + { + "epoch": 1.0249871641280164, + "grad_norm": 16.45880699157715, + "learning_rate": 2.7745758729499983e-05, + "loss": 1.7197, + "step": 5989 + }, + { + "epoch": 1.0251583090877974, + "grad_norm": 10.551615715026855, + "learning_rate": 2.7739680266863744e-05, + "loss": 1.0905, + "step": 5990 + }, + { + "epoch": 1.0253294540475784, + "grad_norm": 26.532712936401367, + "learning_rate": 2.7733594287797172e-05, + "loss": 3.3279, + "step": 5991 + }, + { + "epoch": 1.0255005990073591, + "grad_norm": 19.217823028564453, + "learning_rate": 2.7727500795891007e-05, + "loss": 1.7208, + "step": 5992 + }, + { + "epoch": 1.0256717439671401, + "grad_norm": 12.649834632873535, + "learning_rate": 2.7721399794740412e-05, + "loss": 1.4955, + "step": 5993 + }, + { + "epoch": 1.025842888926921, + "grad_norm": 32.663753509521484, + "learning_rate": 2.771529128794499e-05, + "loss": 5.9865, + "step": 5994 + }, + { + "epoch": 1.026014033886702, + "grad_norm": 14.022912979125977, + "learning_rate": 2.770917527910877e-05, + "loss": 1.1403, + "step": 5995 + }, + { + "epoch": 1.0261851788464829, + "grad_norm": 20.389646530151367, + "learning_rate": 2.7703051771840205e-05, + "loss": 1.536, + "step": 5996 + }, + { + "epoch": 1.0263563238062638, + "grad_norm": 11.238503456115723, + "learning_rate": 2.7696920769752176e-05, + "loss": 0.8791, + "step": 5997 + }, + { + "epoch": 1.0265274687660448, + "grad_norm": 22.10092544555664, + "learning_rate": 2.7690782276461976e-05, + "loss": 1.9444, + "step": 5998 + }, + { + "epoch": 1.0266986137258258, + "grad_norm": 23.456623077392578, + "learning_rate": 2.768463629559134e-05, + "loss": 3.2625, + "step": 5999 + }, + { + "epoch": 1.0268697586856068, + "grad_norm": 3.355825901031494, + "learning_rate": 2.7678482830766388e-05, + "loss": 0.3481, + "step": 6000 + }, + { + "epoch": 1.0270409036453876, + "grad_norm": 3.933943510055542, + "learning_rate": 2.7672321885617683e-05, + "loss": 0.3642, + "step": 6001 + }, + { + "epoch": 1.0272120486051686, + "grad_norm": 22.027666091918945, + "learning_rate": 2.7666153463780192e-05, + "loss": 3.2508, + "step": 6002 + }, + { + "epoch": 1.0273831935649496, + "grad_norm": 28.67835807800293, + "learning_rate": 2.7659977568893294e-05, + "loss": 3.81, + "step": 6003 + }, + { + "epoch": 1.0275543385247305, + "grad_norm": 18.57536506652832, + "learning_rate": 2.765379420460077e-05, + "loss": 1.6178, + "step": 6004 + }, + { + "epoch": 1.0277254834845113, + "grad_norm": 17.354440689086914, + "learning_rate": 2.7647603374550814e-05, + "loss": 1.696, + "step": 6005 + }, + { + "epoch": 1.0278966284442923, + "grad_norm": 22.169416427612305, + "learning_rate": 2.7641405082396038e-05, + "loss": 1.9527, + "step": 6006 + }, + { + "epoch": 1.0280677734040733, + "grad_norm": 18.655746459960938, + 
"learning_rate": 2.7635199331793434e-05, + "loss": 1.5853, + "step": 6007 + }, + { + "epoch": 1.0282389183638543, + "grad_norm": 13.884475708007812, + "learning_rate": 2.7628986126404398e-05, + "loss": 1.1785, + "step": 6008 + }, + { + "epoch": 1.028410063323635, + "grad_norm": 5.888846397399902, + "learning_rate": 2.762276546989474e-05, + "loss": 0.5917, + "step": 6009 + }, + { + "epoch": 1.028581208283416, + "grad_norm": 3.593951940536499, + "learning_rate": 2.7616537365934652e-05, + "loss": 0.3687, + "step": 6010 + }, + { + "epoch": 1.028752353243197, + "grad_norm": 22.454221725463867, + "learning_rate": 2.761030181819873e-05, + "loss": 2.4793, + "step": 6011 + }, + { + "epoch": 1.028923498202978, + "grad_norm": 2.3373348712921143, + "learning_rate": 2.7604058830365952e-05, + "loss": 0.3585, + "step": 6012 + }, + { + "epoch": 1.0290946431627588, + "grad_norm": 1.7826164960861206, + "learning_rate": 2.759780840611969e-05, + "loss": 0.2326, + "step": 6013 + }, + { + "epoch": 1.0292657881225398, + "grad_norm": 29.18971824645996, + "learning_rate": 2.7591550549147704e-05, + "loss": 2.5937, + "step": 6014 + }, + { + "epoch": 1.0294369330823208, + "grad_norm": 26.53842544555664, + "learning_rate": 2.7585285263142143e-05, + "loss": 5.4126, + "step": 6015 + }, + { + "epoch": 1.0296080780421017, + "grad_norm": 20.951757431030273, + "learning_rate": 2.7579012551799526e-05, + "loss": 2.4848, + "step": 6016 + }, + { + "epoch": 1.0297792230018825, + "grad_norm": 21.251033782958984, + "learning_rate": 2.757273241882077e-05, + "loss": 2.3906, + "step": 6017 + }, + { + "epoch": 1.0299503679616635, + "grad_norm": 19.32236099243164, + "learning_rate": 2.756644486791116e-05, + "loss": 1.5559, + "step": 6018 + }, + { + "epoch": 1.0301215129214445, + "grad_norm": 9.028992652893066, + "learning_rate": 2.7560149902780358e-05, + "loss": 1.0507, + "step": 6019 + }, + { + "epoch": 1.0302926578812255, + "grad_norm": 3.18945574760437, + "learning_rate": 2.7553847527142412e-05, + "loss": 0.3364, + "step": 6020 + }, + { + "epoch": 1.0304638028410062, + "grad_norm": 0.8448656797409058, + "learning_rate": 2.7547537744715722e-05, + "loss": 0.1704, + "step": 6021 + }, + { + "epoch": 1.0306349478007872, + "grad_norm": 16.985177993774414, + "learning_rate": 2.7541220559223072e-05, + "loss": 1.6313, + "step": 6022 + }, + { + "epoch": 1.0308060927605682, + "grad_norm": 17.1005802154541, + "learning_rate": 2.7534895974391614e-05, + "loss": 1.7317, + "step": 6023 + }, + { + "epoch": 1.0309772377203492, + "grad_norm": 4.488323211669922, + "learning_rate": 2.7528563993952863e-05, + "loss": 0.386, + "step": 6024 + }, + { + "epoch": 1.03114838268013, + "grad_norm": 20.139062881469727, + "learning_rate": 2.7522224621642692e-05, + "loss": 2.5811, + "step": 6025 + }, + { + "epoch": 1.031319527639911, + "grad_norm": 19.923547744750977, + "learning_rate": 2.7515877861201348e-05, + "loss": 2.1045, + "step": 6026 + }, + { + "epoch": 1.031490672599692, + "grad_norm": 24.75724220275879, + "learning_rate": 2.7509523716373417e-05, + "loss": 2.4101, + "step": 6027 + }, + { + "epoch": 1.031661817559473, + "grad_norm": 13.764646530151367, + "learning_rate": 2.7503162190907868e-05, + "loss": 1.1857, + "step": 6028 + }, + { + "epoch": 1.0318329625192537, + "grad_norm": 0.987221896648407, + "learning_rate": 2.7496793288557997e-05, + "loss": 0.1712, + "step": 6029 + }, + { + "epoch": 1.0320041074790347, + "grad_norm": 21.300790786743164, + "learning_rate": 2.7490417013081475e-05, + "loss": 2.4975, + "step": 6030 + }, + { + "epoch": 
1.0321752524388157, + "grad_norm": 17.621726989746094, + "learning_rate": 2.7484033368240313e-05, + "loss": 1.2443, + "step": 6031 + }, + { + "epoch": 1.0323463973985967, + "grad_norm": 5.2010579109191895, + "learning_rate": 2.7477642357800867e-05, + "loss": 0.3915, + "step": 6032 + }, + { + "epoch": 1.0325175423583774, + "grad_norm": 20.610048294067383, + "learning_rate": 2.7471243985533842e-05, + "loss": 2.4583, + "step": 6033 + }, + { + "epoch": 1.0326886873181584, + "grad_norm": 16.952592849731445, + "learning_rate": 2.7464838255214296e-05, + "loss": 1.373, + "step": 6034 + }, + { + "epoch": 1.0328598322779394, + "grad_norm": 20.06004524230957, + "learning_rate": 2.745842517062161e-05, + "loss": 1.711, + "step": 6035 + }, + { + "epoch": 1.0330309772377204, + "grad_norm": 12.473008155822754, + "learning_rate": 2.7452004735539523e-05, + "loss": 0.9459, + "step": 6036 + }, + { + "epoch": 1.0332021221975012, + "grad_norm": 15.336243629455566, + "learning_rate": 2.7445576953756088e-05, + "loss": 1.5641, + "step": 6037 + }, + { + "epoch": 1.0333732671572822, + "grad_norm": 7.581173896789551, + "learning_rate": 2.7439141829063718e-05, + "loss": 0.4988, + "step": 6038 + }, + { + "epoch": 1.0335444121170632, + "grad_norm": 4.762014389038086, + "learning_rate": 2.7432699365259136e-05, + "loss": 0.3329, + "step": 6039 + }, + { + "epoch": 1.0337155570768441, + "grad_norm": 10.778675079345703, + "learning_rate": 2.742624956614341e-05, + "loss": 0.9549, + "step": 6040 + }, + { + "epoch": 1.033886702036625, + "grad_norm": 22.29326820373535, + "learning_rate": 2.7419792435521935e-05, + "loss": 1.9647, + "step": 6041 + }, + { + "epoch": 1.034057846996406, + "grad_norm": 26.956926345825195, + "learning_rate": 2.7413327977204426e-05, + "loss": 2.3277, + "step": 6042 + }, + { + "epoch": 1.034228991956187, + "grad_norm": 13.276328086853027, + "learning_rate": 2.740685619500492e-05, + "loss": 0.9316, + "step": 6043 + }, + { + "epoch": 1.0344001369159679, + "grad_norm": 38.53458023071289, + "learning_rate": 2.740037709274178e-05, + "loss": 1.8594, + "step": 6044 + }, + { + "epoch": 1.0345712818757486, + "grad_norm": 33.913692474365234, + "learning_rate": 2.7393890674237685e-05, + "loss": 1.6477, + "step": 6045 + }, + { + "epoch": 1.0347424268355296, + "grad_norm": 18.45242691040039, + "learning_rate": 2.7387396943319628e-05, + "loss": 2.0229, + "step": 6046 + }, + { + "epoch": 1.0349135717953106, + "grad_norm": 17.105792999267578, + "learning_rate": 2.7380895903818927e-05, + "loss": 1.2288, + "step": 6047 + }, + { + "epoch": 1.0350847167550916, + "grad_norm": 26.542585372924805, + "learning_rate": 2.7374387559571203e-05, + "loss": 5.4436, + "step": 6048 + }, + { + "epoch": 1.0352558617148726, + "grad_norm": 9.220451354980469, + "learning_rate": 2.7367871914416383e-05, + "loss": 1.0206, + "step": 6049 + }, + { + "epoch": 1.0354270066746534, + "grad_norm": 18.158157348632812, + "learning_rate": 2.736134897219872e-05, + "loss": 2.1711, + "step": 6050 + }, + { + "epoch": 1.0355981516344344, + "grad_norm": 11.473002433776855, + "learning_rate": 2.7354818736766747e-05, + "loss": 0.9204, + "step": 6051 + }, + { + "epoch": 1.0357692965942153, + "grad_norm": 4.108072757720947, + "learning_rate": 2.7348281211973317e-05, + "loss": 0.5207, + "step": 6052 + }, + { + "epoch": 1.0359404415539963, + "grad_norm": 26.6224308013916, + "learning_rate": 2.7341736401675578e-05, + "loss": 5.5512, + "step": 6053 + }, + { + "epoch": 1.036111586513777, + "grad_norm": 17.62623405456543, + "learning_rate": 2.7335184309734983e-05, + 
"loss": 1.689, + "step": 6054 + }, + { + "epoch": 1.036282731473558, + "grad_norm": 11.85800552368164, + "learning_rate": 2.732862494001727e-05, + "loss": 1.0557, + "step": 6055 + }, + { + "epoch": 1.036453876433339, + "grad_norm": 5.9652485847473145, + "learning_rate": 2.7322058296392484e-05, + "loss": 0.413, + "step": 6056 + }, + { + "epoch": 1.03662502139312, + "grad_norm": 14.44044303894043, + "learning_rate": 2.7315484382734947e-05, + "loss": 1.4248, + "step": 6057 + }, + { + "epoch": 1.0367961663529008, + "grad_norm": 7.914107799530029, + "learning_rate": 2.730890320292328e-05, + "loss": 0.7368, + "step": 6058 + }, + { + "epoch": 1.0369673113126818, + "grad_norm": 22.765840530395508, + "learning_rate": 2.730231476084039e-05, + "loss": 2.4051, + "step": 6059 + }, + { + "epoch": 1.0371384562724628, + "grad_norm": 22.832996368408203, + "learning_rate": 2.7295719060373468e-05, + "loss": 2.212, + "step": 6060 + }, + { + "epoch": 1.0373096012322438, + "grad_norm": 15.181253433227539, + "learning_rate": 2.7289116105413985e-05, + "loss": 1.2104, + "step": 6061 + }, + { + "epoch": 1.0374807461920246, + "grad_norm": 14.0794095993042, + "learning_rate": 2.7282505899857695e-05, + "loss": 1.2968, + "step": 6062 + }, + { + "epoch": 1.0376518911518056, + "grad_norm": 4.96496057510376, + "learning_rate": 2.7275888447604632e-05, + "loss": 0.3762, + "step": 6063 + }, + { + "epoch": 1.0378230361115865, + "grad_norm": 6.760622024536133, + "learning_rate": 2.7269263752559102e-05, + "loss": 0.4875, + "step": 6064 + }, + { + "epoch": 1.0379941810713675, + "grad_norm": 19.93670654296875, + "learning_rate": 2.7262631818629676e-05, + "loss": 1.7135, + "step": 6065 + }, + { + "epoch": 1.0381653260311483, + "grad_norm": 83.41024017333984, + "learning_rate": 2.7255992649729222e-05, + "loss": 6.0933, + "step": 6066 + }, + { + "epoch": 1.0383364709909293, + "grad_norm": 13.883889198303223, + "learning_rate": 2.7249346249774843e-05, + "loss": 1.3731, + "step": 6067 + }, + { + "epoch": 1.0385076159507103, + "grad_norm": 3.8786051273345947, + "learning_rate": 2.7242692622687934e-05, + "loss": 0.3786, + "step": 6068 + }, + { + "epoch": 1.0386787609104913, + "grad_norm": 4.438342571258545, + "learning_rate": 2.723603177239415e-05, + "loss": 0.3986, + "step": 6069 + }, + { + "epoch": 1.038849905870272, + "grad_norm": 11.963251113891602, + "learning_rate": 2.722936370282338e-05, + "loss": 1.0745, + "step": 6070 + }, + { + "epoch": 1.039021050830053, + "grad_norm": 13.7601957321167, + "learning_rate": 2.7222688417909817e-05, + "loss": 1.1668, + "step": 6071 + }, + { + "epoch": 1.039192195789834, + "grad_norm": 23.49478530883789, + "learning_rate": 2.7216005921591886e-05, + "loss": 2.3047, + "step": 6072 + }, + { + "epoch": 1.039363340749615, + "grad_norm": 20.574737548828125, + "learning_rate": 2.7209316217812263e-05, + "loss": 1.4941, + "step": 6073 + }, + { + "epoch": 1.0395344857093958, + "grad_norm": 94.0860824584961, + "learning_rate": 2.720261931051789e-05, + "loss": 8.5413, + "step": 6074 + }, + { + "epoch": 1.0397056306691768, + "grad_norm": 7.629838943481445, + "learning_rate": 2.719591520365994e-05, + "loss": 0.4664, + "step": 6075 + }, + { + "epoch": 1.0398767756289578, + "grad_norm": 14.49278450012207, + "learning_rate": 2.718920390119386e-05, + "loss": 1.3559, + "step": 6076 + }, + { + "epoch": 1.0400479205887387, + "grad_norm": 14.303519248962402, + "learning_rate": 2.7182485407079323e-05, + "loss": 1.189, + "step": 6077 + }, + { + "epoch": 1.0402190655485195, + "grad_norm": 5.337088584899902, + 
"learning_rate": 2.717575972528025e-05, + "loss": 0.4424, + "step": 6078 + }, + { + "epoch": 1.0403902105083005, + "grad_norm": 19.05851173400879, + "learning_rate": 2.7169026859764806e-05, + "loss": 1.741, + "step": 6079 + }, + { + "epoch": 1.0405613554680815, + "grad_norm": 20.25058364868164, + "learning_rate": 2.7162286814505385e-05, + "loss": 2.3799, + "step": 6080 + }, + { + "epoch": 1.0407325004278625, + "grad_norm": 23.0300350189209, + "learning_rate": 2.7155539593478633e-05, + "loss": 0.8189, + "step": 6081 + }, + { + "epoch": 1.0409036453876432, + "grad_norm": 43.651771545410156, + "learning_rate": 2.714878520066541e-05, + "loss": 6.1888, + "step": 6082 + }, + { + "epoch": 1.0410747903474242, + "grad_norm": 16.711088180541992, + "learning_rate": 2.7142023640050826e-05, + "loss": 1.1842, + "step": 6083 + }, + { + "epoch": 1.0412459353072052, + "grad_norm": 16.65302085876465, + "learning_rate": 2.7135254915624213e-05, + "loss": 1.3284, + "step": 6084 + }, + { + "epoch": 1.0414170802669862, + "grad_norm": 14.885180473327637, + "learning_rate": 2.712847903137912e-05, + "loss": 1.1705, + "step": 6085 + }, + { + "epoch": 1.041588225226767, + "grad_norm": 14.422985076904297, + "learning_rate": 2.7121695991313332e-05, + "loss": 1.3098, + "step": 6086 + }, + { + "epoch": 1.041759370186548, + "grad_norm": 23.700284957885742, + "learning_rate": 2.7114905799428863e-05, + "loss": 1.958, + "step": 6087 + }, + { + "epoch": 1.041930515146329, + "grad_norm": 17.216182708740234, + "learning_rate": 2.710810845973192e-05, + "loss": 1.7762, + "step": 6088 + }, + { + "epoch": 1.04210166010611, + "grad_norm": 15.392316818237305, + "learning_rate": 2.7101303976232955e-05, + "loss": 1.3241, + "step": 6089 + }, + { + "epoch": 1.0422728050658907, + "grad_norm": 18.830110549926758, + "learning_rate": 2.709449235294662e-05, + "loss": 1.5202, + "step": 6090 + }, + { + "epoch": 1.0424439500256717, + "grad_norm": 1.7132902145385742, + "learning_rate": 2.708767359389178e-05, + "loss": 0.2298, + "step": 6091 + }, + { + "epoch": 1.0426150949854527, + "grad_norm": 20.93419075012207, + "learning_rate": 2.7080847703091514e-05, + "loss": 2.1347, + "step": 6092 + }, + { + "epoch": 1.0427862399452337, + "grad_norm": 9.760008811950684, + "learning_rate": 2.7074014684573112e-05, + "loss": 1.3687, + "step": 6093 + }, + { + "epoch": 1.0429573849050144, + "grad_norm": 16.97105598449707, + "learning_rate": 2.7067174542368064e-05, + "loss": 1.5609, + "step": 6094 + }, + { + "epoch": 1.0431285298647954, + "grad_norm": 18.94205665588379, + "learning_rate": 2.7060327280512057e-05, + "loss": 1.3384, + "step": 6095 + }, + { + "epoch": 1.0432996748245764, + "grad_norm": 1.1413235664367676, + "learning_rate": 2.7053472903044994e-05, + "loss": 0.1765, + "step": 6096 + }, + { + "epoch": 1.0434708197843574, + "grad_norm": 3.6719653606414795, + "learning_rate": 2.7046611414010968e-05, + "loss": 0.3251, + "step": 6097 + }, + { + "epoch": 1.0436419647441384, + "grad_norm": 21.025039672851562, + "learning_rate": 2.703974281745826e-05, + "loss": 2.9787, + "step": 6098 + }, + { + "epoch": 1.0438131097039192, + "grad_norm": 32.26662063598633, + "learning_rate": 2.703286711743936e-05, + "loss": 5.5577, + "step": 6099 + }, + { + "epoch": 1.0439842546637002, + "grad_norm": 4.281724452972412, + "learning_rate": 2.702598431801094e-05, + "loss": 0.3655, + "step": 6100 + }, + { + "epoch": 1.0441553996234811, + "grad_norm": 24.29343032836914, + "learning_rate": 2.7019094423233853e-05, + "loss": 1.8119, + "step": 6101 + }, + { + "epoch": 
1.0443265445832621, + "grad_norm": 22.842308044433594, + "learning_rate": 2.701219743717316e-05, + "loss": 2.8777, + "step": 6102 + }, + { + "epoch": 1.044497689543043, + "grad_norm": 15.586821556091309, + "learning_rate": 2.7005293363898085e-05, + "loss": 1.6434, + "step": 6103 + }, + { + "epoch": 1.0446688345028239, + "grad_norm": 23.474470138549805, + "learning_rate": 2.6998382207482045e-05, + "loss": 2.3843, + "step": 6104 + }, + { + "epoch": 1.0448399794626049, + "grad_norm": 37.05857849121094, + "learning_rate": 2.6991463972002633e-05, + "loss": 5.6854, + "step": 6105 + }, + { + "epoch": 1.0450111244223859, + "grad_norm": 25.402162551879883, + "learning_rate": 2.6984538661541615e-05, + "loss": 3.2815, + "step": 6106 + }, + { + "epoch": 1.0451822693821666, + "grad_norm": 13.91854190826416, + "learning_rate": 2.6977606280184937e-05, + "loss": 1.1625, + "step": 6107 + }, + { + "epoch": 1.0453534143419476, + "grad_norm": 28.982385635375977, + "learning_rate": 2.6970666832022713e-05, + "loss": 2.5425, + "step": 6108 + }, + { + "epoch": 1.0455245593017286, + "grad_norm": 21.270137786865234, + "learning_rate": 2.696372032114923e-05, + "loss": 2.7429, + "step": 6109 + }, + { + "epoch": 1.0456957042615096, + "grad_norm": 18.849096298217773, + "learning_rate": 2.6956766751662936e-05, + "loss": 2.0294, + "step": 6110 + }, + { + "epoch": 1.0458668492212904, + "grad_norm": 23.60141944885254, + "learning_rate": 2.694980612766645e-05, + "loss": 2.7659, + "step": 6111 + }, + { + "epoch": 1.0460379941810714, + "grad_norm": 14.735956192016602, + "learning_rate": 2.6942838453266547e-05, + "loss": 1.3023, + "step": 6112 + }, + { + "epoch": 1.0462091391408523, + "grad_norm": 0.9047157168388367, + "learning_rate": 2.693586373257417e-05, + "loss": 0.1675, + "step": 6113 + }, + { + "epoch": 1.0463802841006333, + "grad_norm": 8.82510757446289, + "learning_rate": 2.6928881969704416e-05, + "loss": 0.6863, + "step": 6114 + }, + { + "epoch": 1.046551429060414, + "grad_norm": 25.968671798706055, + "learning_rate": 2.692189316877653e-05, + "loss": 1.1619, + "step": 6115 + }, + { + "epoch": 1.046722574020195, + "grad_norm": 18.711915969848633, + "learning_rate": 2.6914897333913914e-05, + "loss": 1.6528, + "step": 6116 + }, + { + "epoch": 1.046893718979976, + "grad_norm": 15.643209457397461, + "learning_rate": 2.6907894469244127e-05, + "loss": 1.3218, + "step": 6117 + }, + { + "epoch": 1.047064863939757, + "grad_norm": 22.364917755126953, + "learning_rate": 2.6900884578898872e-05, + "loss": 2.1141, + "step": 6118 + }, + { + "epoch": 1.0472360088995378, + "grad_norm": 21.417695999145508, + "learning_rate": 2.6893867667013983e-05, + "loss": 2.0941, + "step": 6119 + }, + { + "epoch": 1.0474071538593188, + "grad_norm": 14.733633995056152, + "learning_rate": 2.6886843737729457e-05, + "loss": 1.5188, + "step": 6120 + }, + { + "epoch": 1.0475782988190998, + "grad_norm": 23.66362953186035, + "learning_rate": 2.687981279518942e-05, + "loss": 2.0821, + "step": 6121 + }, + { + "epoch": 1.0477494437788808, + "grad_norm": 24.078081130981445, + "learning_rate": 2.687277484354214e-05, + "loss": 2.9703, + "step": 6122 + }, + { + "epoch": 1.0479205887386616, + "grad_norm": 21.633882522583008, + "learning_rate": 2.686572988694002e-05, + "loss": 1.7227, + "step": 6123 + }, + { + "epoch": 1.0480917336984426, + "grad_norm": 15.577491760253906, + "learning_rate": 2.685867792953959e-05, + "loss": 1.2526, + "step": 6124 + }, + { + "epoch": 1.0482628786582235, + "grad_norm": 0.8374304175376892, + "learning_rate": 2.685161897550152e-05, + 
"loss": 0.165, + "step": 6125 + }, + { + "epoch": 1.0484340236180045, + "grad_norm": 15.729249000549316, + "learning_rate": 2.68445530289906e-05, + "loss": 1.5607, + "step": 6126 + }, + { + "epoch": 1.0486051685777853, + "grad_norm": 3.7719273567199707, + "learning_rate": 2.6837480094175753e-05, + "loss": 0.3735, + "step": 6127 + }, + { + "epoch": 1.0487763135375663, + "grad_norm": 22.409833908081055, + "learning_rate": 2.6830400175230022e-05, + "loss": 2.0298, + "step": 6128 + }, + { + "epoch": 1.0489474584973473, + "grad_norm": 27.7617244720459, + "learning_rate": 2.682331327633056e-05, + "loss": 3.3434, + "step": 6129 + }, + { + "epoch": 1.0491186034571283, + "grad_norm": 20.888198852539062, + "learning_rate": 2.681621940165866e-05, + "loss": 2.0731, + "step": 6130 + }, + { + "epoch": 1.049289748416909, + "grad_norm": 17.169727325439453, + "learning_rate": 2.680911855539971e-05, + "loss": 1.4024, + "step": 6131 + }, + { + "epoch": 1.04946089337669, + "grad_norm": 32.940948486328125, + "learning_rate": 2.6802010741743227e-05, + "loss": 5.7171, + "step": 6132 + }, + { + "epoch": 1.049632038336471, + "grad_norm": 6.877800464630127, + "learning_rate": 2.6794895964882826e-05, + "loss": 0.4265, + "step": 6133 + }, + { + "epoch": 1.049803183296252, + "grad_norm": 31.221891403198242, + "learning_rate": 2.678777422901623e-05, + "loss": 1.3258, + "step": 6134 + }, + { + "epoch": 1.0499743282560328, + "grad_norm": 18.083770751953125, + "learning_rate": 2.6780645538345294e-05, + "loss": 1.3423, + "step": 6135 + }, + { + "epoch": 1.0501454732158138, + "grad_norm": 5.254927158355713, + "learning_rate": 2.677350989707594e-05, + "loss": 0.3645, + "step": 6136 + }, + { + "epoch": 1.0503166181755947, + "grad_norm": 20.59516143798828, + "learning_rate": 2.6766367309418206e-05, + "loss": 2.2506, + "step": 6137 + }, + { + "epoch": 1.0504877631353757, + "grad_norm": 1.0191950798034668, + "learning_rate": 2.6759217779586237e-05, + "loss": 0.1677, + "step": 6138 + }, + { + "epoch": 1.0506589080951565, + "grad_norm": 123.47958374023438, + "learning_rate": 2.675206131179826e-05, + "loss": 9.1588, + "step": 6139 + }, + { + "epoch": 1.0506589080951565, + "eval_nli-pairs_loss": 1.7010418176651, + "eval_nli-pairs_runtime": 4.3443, + "eval_nli-pairs_samples_per_second": 46.038, + "eval_nli-pairs_steps_per_second": 1.611, + "eval_sts-test_pearson_cosine": 0.7625267011985226, + "eval_sts-test_pearson_dot": 0.6259248949371231, + "eval_sts-test_pearson_euclidean": 0.7619764658843026, + "eval_sts-test_pearson_manhattan": 0.7680853132392583, + "eval_sts-test_pearson_max": 0.7680853132392583, + "eval_sts-test_spearman_cosine": 0.7625076199036728, + "eval_sts-test_spearman_dot": 0.6009276916665572, + "eval_sts-test_spearman_euclidean": 0.7523681562432721, + "eval_sts-test_spearman_manhattan": 0.7608907210750292, + "eval_sts-test_spearman_max": 0.7625076199036728, + "step": 6139 + }, + { + "epoch": 1.0506589080951565, + "eval_vitaminc-pairs_loss": 1.0161106586456299, + "eval_vitaminc-pairs_runtime": 2.7397, + "eval_vitaminc-pairs_samples_per_second": 73.0, + "eval_vitaminc-pairs_steps_per_second": 2.555, + "step": 6139 + }, + { + "epoch": 1.0506589080951565, + "eval_qnli-contrastive_loss": 1.7974004745483398, + "eval_qnli-contrastive_runtime": 0.6358, + "eval_qnli-contrastive_samples_per_second": 314.542, + "eval_qnli-contrastive_steps_per_second": 11.009, + "step": 6139 + }, + { + "epoch": 1.0506589080951565, + "eval_scitail-pairs-qa_loss": 0.1420755535364151, + "eval_scitail-pairs-qa_runtime": 1.5961, + 
"eval_scitail-pairs-qa_samples_per_second": 125.302, + "eval_scitail-pairs-qa_steps_per_second": 4.386, + "step": 6139 + }, + { + "epoch": 1.0506589080951565, + "eval_scitail-pairs-pos_loss": 0.7034513354301453, + "eval_scitail-pairs-pos_runtime": 2.627, + "eval_scitail-pairs-pos_samples_per_second": 76.133, + "eval_scitail-pairs-pos_steps_per_second": 2.665, + "step": 6139 + }, + { + "epoch": 1.0506589080951565, + "eval_xsum-pairs_loss": 0.8569247126579285, + "eval_xsum-pairs_runtime": 2.6434, + "eval_xsum-pairs_samples_per_second": 66.204, + "eval_xsum-pairs_steps_per_second": 2.27, + "step": 6139 + }, + { + "epoch": 1.0506589080951565, + "eval_compression-pairs_loss": 0.3054618835449219, + "eval_compression-pairs_runtime": 0.5142, + "eval_compression-pairs_samples_per_second": 388.927, + "eval_compression-pairs_steps_per_second": 13.612, + "step": 6139 + }, + { + "epoch": 1.0506589080951565, + "eval_sciq_pairs_loss": 0.5279684066772461, + "eval_sciq_pairs_runtime": 9.1732, + "eval_sciq_pairs_samples_per_second": 21.803, + "eval_sciq_pairs_steps_per_second": 0.763, + "step": 6139 + }, + { + "epoch": 1.0506589080951565, + "eval_qasc_pairs_loss": 5.473604679107666, + "eval_qasc_pairs_runtime": 2.6467, + "eval_qasc_pairs_samples_per_second": 75.564, + "eval_qasc_pairs_steps_per_second": 2.645, + "step": 6139 + }, + { + "epoch": 1.0506589080951565, + "eval_openbookqa_pairs_loss": 2.757842779159546, + "eval_openbookqa_pairs_runtime": 0.6388, + "eval_openbookqa_pairs_samples_per_second": 108.01, + "eval_openbookqa_pairs_steps_per_second": 4.696, + "step": 6139 + }, + { + "epoch": 1.0506589080951565, + "eval_msmarco_pairs_loss": 1.4087409973144531, + "eval_msmarco_pairs_runtime": 3.9825, + "eval_msmarco_pairs_samples_per_second": 50.22, + "eval_msmarco_pairs_steps_per_second": 1.758, + "step": 6139 + }, + { + "epoch": 1.0506589080951565, + "eval_nq_pairs_loss": 1.5596331357955933, + "eval_nq_pairs_runtime": 8.6663, + "eval_nq_pairs_samples_per_second": 23.078, + "eval_nq_pairs_steps_per_second": 0.808, + "step": 6139 + }, + { + "epoch": 1.0506589080951565, + "eval_trivia_pairs_loss": 1.9876388311386108, + "eval_trivia_pairs_runtime": 12.8237, + "eval_trivia_pairs_samples_per_second": 15.596, + "eval_trivia_pairs_steps_per_second": 0.546, + "step": 6139 + }, + { + "epoch": 1.0506589080951565, + "eval_quora_pairs_loss": 0.2655409276485443, + "eval_quora_pairs_runtime": 1.5911, + "eval_quora_pairs_samples_per_second": 125.702, + "eval_quora_pairs_steps_per_second": 4.4, + "step": 6139 + }, + { + "epoch": 1.0506589080951565, + "eval_gooaq_pairs_loss": 1.0326839685440063, + "eval_gooaq_pairs_runtime": 2.6507, + "eval_gooaq_pairs_samples_per_second": 75.453, + "eval_gooaq_pairs_steps_per_second": 2.641, + "step": 6139 + }, + { + "epoch": 1.0508300530549375, + "grad_norm": 1.1939899921417236, + "learning_rate": 2.674489791027661e-05, + "loss": 0.1777, + "step": 6140 + }, + { + "epoch": 1.0510011980147185, + "grad_norm": 7.697817802429199, + "learning_rate": 2.6737727579247696e-05, + "loss": 0.5119, + "step": 6141 + }, + { + "epoch": 1.0511723429744995, + "grad_norm": 24.694753646850586, + "learning_rate": 2.673055032294203e-05, + "loss": 2.7351, + "step": 6142 + }, + { + "epoch": 1.0513434879342802, + "grad_norm": 25.370515823364258, + "learning_rate": 2.67233661455942e-05, + "loss": 2.9894, + "step": 6143 + }, + { + "epoch": 1.0515146328940612, + "grad_norm": 29.25603485107422, + "learning_rate": 2.671617505144288e-05, + "loss": 5.7828, + "step": 6144 + }, + { + "epoch": 1.0516857778538422, + 
"grad_norm": 21.737051010131836, + "learning_rate": 2.6708977044730832e-05, + "loss": 2.0899, + "step": 6145 + }, + { + "epoch": 1.0518569228136232, + "grad_norm": 5.30400276184082, + "learning_rate": 2.6701772129704884e-05, + "loss": 0.4699, + "step": 6146 + }, + { + "epoch": 1.0520280677734042, + "grad_norm": 20.10776710510254, + "learning_rate": 2.669456031061595e-05, + "loss": 2.4938, + "step": 6147 + }, + { + "epoch": 1.052199212733185, + "grad_norm": 10.818122863769531, + "learning_rate": 2.668734159171902e-05, + "loss": 0.9672, + "step": 6148 + }, + { + "epoch": 1.052370357692966, + "grad_norm": 15.619196891784668, + "learning_rate": 2.668011597727314e-05, + "loss": 1.3626, + "step": 6149 + }, + { + "epoch": 1.052541502652747, + "grad_norm": 12.81397819519043, + "learning_rate": 2.6672883471541436e-05, + "loss": 1.1308, + "step": 6150 + }, + { + "epoch": 1.0527126476125277, + "grad_norm": 7.247302055358887, + "learning_rate": 2.6665644078791098e-05, + "loss": 0.8993, + "step": 6151 + }, + { + "epoch": 1.0528837925723087, + "grad_norm": 16.9379825592041, + "learning_rate": 2.6658397803293376e-05, + "loss": 1.2732, + "step": 6152 + }, + { + "epoch": 1.0530549375320897, + "grad_norm": 0.6092934608459473, + "learning_rate": 2.665114464932359e-05, + "loss": 0.1539, + "step": 6153 + }, + { + "epoch": 1.0532260824918707, + "grad_norm": 9.06037425994873, + "learning_rate": 2.6643884621161102e-05, + "loss": 0.6905, + "step": 6154 + }, + { + "epoch": 1.0533972274516517, + "grad_norm": 6.986600875854492, + "learning_rate": 2.6636617723089345e-05, + "loss": 0.5406, + "step": 6155 + }, + { + "epoch": 1.0535683724114324, + "grad_norm": 18.403789520263672, + "learning_rate": 2.6629343959395805e-05, + "loss": 1.4752, + "step": 6156 + }, + { + "epoch": 1.0537395173712134, + "grad_norm": 11.036933898925781, + "learning_rate": 2.6622063334372e-05, + "loss": 0.8559, + "step": 6157 + }, + { + "epoch": 1.0539106623309944, + "grad_norm": 21.17281150817871, + "learning_rate": 2.661477585231352e-05, + "loss": 2.043, + "step": 6158 + }, + { + "epoch": 1.0540818072907754, + "grad_norm": 17.409849166870117, + "learning_rate": 2.6607481517519984e-05, + "loss": 1.0679, + "step": 6159 + }, + { + "epoch": 1.0542529522505562, + "grad_norm": 4.291676044464111, + "learning_rate": 2.6600180334295073e-05, + "loss": 0.5096, + "step": 6160 + }, + { + "epoch": 1.0544240972103371, + "grad_norm": 20.648345947265625, + "learning_rate": 2.659287230694648e-05, + "loss": 1.899, + "step": 6161 + }, + { + "epoch": 1.0545952421701181, + "grad_norm": 0.6559057235717773, + "learning_rate": 2.658555743978596e-05, + "loss": 0.1645, + "step": 6162 + }, + { + "epoch": 1.0547663871298991, + "grad_norm": 15.449231147766113, + "learning_rate": 2.6578235737129292e-05, + "loss": 1.2553, + "step": 6163 + }, + { + "epoch": 1.0549375320896799, + "grad_norm": 11.673152923583984, + "learning_rate": 2.65709072032963e-05, + "loss": 1.1081, + "step": 6164 + }, + { + "epoch": 1.0551086770494609, + "grad_norm": 110.93241882324219, + "learning_rate": 2.6563571842610817e-05, + "loss": 9.2153, + "step": 6165 + }, + { + "epoch": 1.0552798220092419, + "grad_norm": 13.813591957092285, + "learning_rate": 2.6556229659400724e-05, + "loss": 1.1657, + "step": 6166 + }, + { + "epoch": 1.0554509669690229, + "grad_norm": 20.899248123168945, + "learning_rate": 2.6548880657997922e-05, + "loss": 2.6031, + "step": 6167 + }, + { + "epoch": 1.0556221119288036, + "grad_norm": 9.75277042388916, + "learning_rate": 2.6541524842738333e-05, + "loss": 0.9529, + "step": 6168 + 
}, + { + "epoch": 1.0557932568885846, + "grad_norm": 2.945573568344116, + "learning_rate": 2.653416221796189e-05, + "loss": 0.2978, + "step": 6169 + }, + { + "epoch": 1.0559644018483656, + "grad_norm": 24.812198638916016, + "learning_rate": 2.6526792788012562e-05, + "loss": 2.9515, + "step": 6170 + }, + { + "epoch": 1.0561355468081466, + "grad_norm": 3.9069230556488037, + "learning_rate": 2.651941655723832e-05, + "loss": 0.3683, + "step": 6171 + }, + { + "epoch": 1.0563066917679274, + "grad_norm": 27.411827087402344, + "learning_rate": 2.6512033529991148e-05, + "loss": 2.3892, + "step": 6172 + }, + { + "epoch": 1.0564778367277083, + "grad_norm": 18.866891860961914, + "learning_rate": 2.6504643710627054e-05, + "loss": 2.1363, + "step": 6173 + }, + { + "epoch": 1.0566489816874893, + "grad_norm": 23.880348205566406, + "learning_rate": 2.649724710350603e-05, + "loss": 2.8926, + "step": 6174 + }, + { + "epoch": 1.0568201266472703, + "grad_norm": 19.74806022644043, + "learning_rate": 2.6489843712992093e-05, + "loss": 1.891, + "step": 6175 + }, + { + "epoch": 1.056991271607051, + "grad_norm": 15.93543815612793, + "learning_rate": 2.648243354345325e-05, + "loss": 1.4833, + "step": 6176 + }, + { + "epoch": 1.057162416566832, + "grad_norm": 19.573871612548828, + "learning_rate": 2.6475016599261512e-05, + "loss": 1.5903, + "step": 6177 + }, + { + "epoch": 1.057333561526613, + "grad_norm": 13.658683776855469, + "learning_rate": 2.6467592884792892e-05, + "loss": 1.3537, + "step": 6178 + }, + { + "epoch": 1.057504706486394, + "grad_norm": 8.274129867553711, + "learning_rate": 2.6460162404427387e-05, + "loss": 1.3073, + "step": 6179 + }, + { + "epoch": 1.0576758514461748, + "grad_norm": 13.96843147277832, + "learning_rate": 2.6452725162548994e-05, + "loss": 1.0987, + "step": 6180 + }, + { + "epoch": 1.0578469964059558, + "grad_norm": 18.048133850097656, + "learning_rate": 2.6445281163545698e-05, + "loss": 1.5427, + "step": 6181 + }, + { + "epoch": 1.0580181413657368, + "grad_norm": 51.59361267089844, + "learning_rate": 2.643783041180947e-05, + "loss": 5.8192, + "step": 6182 + }, + { + "epoch": 1.0581892863255178, + "grad_norm": 6.808162212371826, + "learning_rate": 2.643037291173626e-05, + "loss": 0.4664, + "step": 6183 + }, + { + "epoch": 1.0583604312852986, + "grad_norm": 24.485843658447266, + "learning_rate": 2.6422908667726006e-05, + "loss": 2.1158, + "step": 6184 + }, + { + "epoch": 1.0585315762450795, + "grad_norm": 3.2217724323272705, + "learning_rate": 2.6415437684182626e-05, + "loss": 0.383, + "step": 6185 + }, + { + "epoch": 1.0587027212048605, + "grad_norm": 13.59959602355957, + "learning_rate": 2.640795996551401e-05, + "loss": 1.0761, + "step": 6186 + }, + { + "epoch": 1.0588738661646415, + "grad_norm": 15.931357383728027, + "learning_rate": 2.6400475516132022e-05, + "loss": 1.5406, + "step": 6187 + }, + { + "epoch": 1.0590450111244223, + "grad_norm": 24.72441864013672, + "learning_rate": 2.63929843404525e-05, + "loss": 2.225, + "step": 6188 + }, + { + "epoch": 1.0592161560842033, + "grad_norm": 4.713048458099365, + "learning_rate": 2.6385486442895244e-05, + "loss": 0.3465, + "step": 6189 + }, + { + "epoch": 1.0593873010439843, + "grad_norm": 16.011383056640625, + "learning_rate": 2.637798182788403e-05, + "loss": 1.3794, + "step": 6190 + }, + { + "epoch": 1.0595584460037653, + "grad_norm": 32.27912521362305, + "learning_rate": 2.637047049984659e-05, + "loss": 5.8596, + "step": 6191 + }, + { + "epoch": 1.059729590963546, + "grad_norm": 6.318559646606445, + "learning_rate": 
2.6362952463214628e-05, + "loss": 0.2795, + "step": 6192 + }, + { + "epoch": 1.059900735923327, + "grad_norm": 16.79538345336914, + "learning_rate": 2.6355427722423774e-05, + "loss": 1.5498, + "step": 6193 + }, + { + "epoch": 1.060071880883108, + "grad_norm": 22.450878143310547, + "learning_rate": 2.634789628191366e-05, + "loss": 2.1571, + "step": 6194 + }, + { + "epoch": 1.060243025842889, + "grad_norm": 17.413171768188477, + "learning_rate": 2.6340358146127835e-05, + "loss": 1.6402, + "step": 6195 + }, + { + "epoch": 1.06041417080267, + "grad_norm": 20.46025276184082, + "learning_rate": 2.6332813319513813e-05, + "loss": 1.7809, + "step": 6196 + }, + { + "epoch": 1.0605853157624507, + "grad_norm": 19.27883529663086, + "learning_rate": 2.6325261806523055e-05, + "loss": 1.7623, + "step": 6197 + }, + { + "epoch": 1.0607564607222317, + "grad_norm": 177.13131713867188, + "learning_rate": 2.6317703611610957e-05, + "loss": 8.661, + "step": 6198 + }, + { + "epoch": 1.0609276056820127, + "grad_norm": 14.9853515625, + "learning_rate": 2.6310138739236873e-05, + "loss": 1.2864, + "step": 6199 + }, + { + "epoch": 1.0610987506417935, + "grad_norm": 17.604280471801758, + "learning_rate": 2.6302567193864087e-05, + "loss": 1.5651, + "step": 6200 + }, + { + "epoch": 1.0612698956015745, + "grad_norm": 9.68821907043457, + "learning_rate": 2.6294988979959822e-05, + "loss": 0.5439, + "step": 6201 + }, + { + "epoch": 1.0614410405613555, + "grad_norm": 9.67438793182373, + "learning_rate": 2.6287404101995235e-05, + "loss": 1.3076, + "step": 6202 + }, + { + "epoch": 1.0616121855211365, + "grad_norm": 18.51534652709961, + "learning_rate": 2.627981256444542e-05, + "loss": 1.3825, + "step": 6203 + }, + { + "epoch": 1.0617833304809174, + "grad_norm": 18.125104904174805, + "learning_rate": 2.6272214371789385e-05, + "loss": 1.5245, + "step": 6204 + }, + { + "epoch": 1.0619544754406982, + "grad_norm": 19.579633712768555, + "learning_rate": 2.6264609528510084e-05, + "loss": 1.8224, + "step": 6205 + }, + { + "epoch": 1.0621256204004792, + "grad_norm": 23.845556259155273, + "learning_rate": 2.6256998039094383e-05, + "loss": 2.6739, + "step": 6206 + }, + { + "epoch": 1.0622967653602602, + "grad_norm": 22.428180694580078, + "learning_rate": 2.6249379908033074e-05, + "loss": 3.1672, + "step": 6207 + }, + { + "epoch": 1.0624679103200412, + "grad_norm": 29.91859245300293, + "learning_rate": 2.624175513982086e-05, + "loss": 3.3232, + "step": 6208 + }, + { + "epoch": 1.062639055279822, + "grad_norm": 17.00956153869629, + "learning_rate": 2.623412373895637e-05, + "loss": 1.4198, + "step": 6209 + }, + { + "epoch": 1.062810200239603, + "grad_norm": 11.492415428161621, + "learning_rate": 2.622648570994214e-05, + "loss": 0.934, + "step": 6210 + }, + { + "epoch": 1.062981345199384, + "grad_norm": 21.991470336914062, + "learning_rate": 2.6218841057284624e-05, + "loss": 1.5342, + "step": 6211 + }, + { + "epoch": 1.063152490159165, + "grad_norm": 23.230363845825195, + "learning_rate": 2.621118978549417e-05, + "loss": 1.2235, + "step": 6212 + }, + { + "epoch": 1.0633236351189457, + "grad_norm": 14.931122779846191, + "learning_rate": 2.620353189908505e-05, + "loss": 1.4041, + "step": 6213 + }, + { + "epoch": 1.0634947800787267, + "grad_norm": 24.851787567138672, + "learning_rate": 2.6195867402575414e-05, + "loss": 3.0139, + "step": 6214 + }, + { + "epoch": 1.0636659250385077, + "grad_norm": 16.248435974121094, + "learning_rate": 2.618819630048734e-05, + "loss": 1.4302, + "step": 6215 + }, + { + "epoch": 1.0638370699982886, + "grad_norm": 
16.73040771484375, + "learning_rate": 2.6180518597346788e-05, + "loss": 1.613, + "step": 6216 + }, + { + "epoch": 1.0640082149580694, + "grad_norm": 19.11425018310547, + "learning_rate": 2.61728342976836e-05, + "loss": 1.8297, + "step": 6217 + }, + { + "epoch": 1.0641793599178504, + "grad_norm": 15.915562629699707, + "learning_rate": 2.6165143406031547e-05, + "loss": 1.2021, + "step": 6218 + }, + { + "epoch": 1.0643505048776314, + "grad_norm": 16.527769088745117, + "learning_rate": 2.6157445926928247e-05, + "loss": 1.3672, + "step": 6219 + }, + { + "epoch": 1.0645216498374124, + "grad_norm": 20.64341163635254, + "learning_rate": 2.614974186491523e-05, + "loss": 2.6897, + "step": 6220 + }, + { + "epoch": 1.0646927947971931, + "grad_norm": 0.6564233899116516, + "learning_rate": 2.6142031224537907e-05, + "loss": 0.1614, + "step": 6221 + }, + { + "epoch": 1.0648639397569741, + "grad_norm": 20.409786224365234, + "learning_rate": 2.6134314010345565e-05, + "loss": 2.0853, + "step": 6222 + }, + { + "epoch": 1.0650350847167551, + "grad_norm": 2.155768394470215, + "learning_rate": 2.612659022689138e-05, + "loss": 0.336, + "step": 6223 + }, + { + "epoch": 1.065206229676536, + "grad_norm": 20.48274803161621, + "learning_rate": 2.6118859878732382e-05, + "loss": 2.0026, + "step": 6224 + }, + { + "epoch": 1.0653773746363169, + "grad_norm": 22.327909469604492, + "learning_rate": 2.6111122970429495e-05, + "loss": 2.4422, + "step": 6225 + }, + { + "epoch": 1.0655485195960979, + "grad_norm": 1.1469937562942505, + "learning_rate": 2.6103379506547513e-05, + "loss": 0.1708, + "step": 6226 + }, + { + "epoch": 1.0657196645558789, + "grad_norm": 106.68755340576172, + "learning_rate": 2.6095629491655074e-05, + "loss": 8.7931, + "step": 6227 + }, + { + "epoch": 1.0658908095156598, + "grad_norm": 1.095432162284851, + "learning_rate": 2.6087872930324714e-05, + "loss": 0.1697, + "step": 6228 + }, + { + "epoch": 1.0660619544754406, + "grad_norm": 5.803622722625732, + "learning_rate": 2.608010982713281e-05, + "loss": 0.5993, + "step": 6229 + }, + { + "epoch": 1.0662330994352216, + "grad_norm": 14.511138916015625, + "learning_rate": 2.60723401866596e-05, + "loss": 1.4565, + "step": 6230 + }, + { + "epoch": 1.0664042443950026, + "grad_norm": 5.559805870056152, + "learning_rate": 2.6064564013489195e-05, + "loss": 0.4986, + "step": 6231 + }, + { + "epoch": 1.0665753893547836, + "grad_norm": 24.374418258666992, + "learning_rate": 2.6056781312209537e-05, + "loss": 3.2554, + "step": 6232 + }, + { + "epoch": 1.0667465343145643, + "grad_norm": 0.6469919085502625, + "learning_rate": 2.6048992087412437e-05, + "loss": 0.1491, + "step": 6233 + }, + { + "epoch": 1.0669176792743453, + "grad_norm": 1.050114393234253, + "learning_rate": 2.604119634369355e-05, + "loss": 0.1731, + "step": 6234 + }, + { + "epoch": 1.0670888242341263, + "grad_norm": 17.869945526123047, + "learning_rate": 2.603339408565237e-05, + "loss": 1.8747, + "step": 6235 + }, + { + "epoch": 1.0672599691939073, + "grad_norm": 65.78507995605469, + "learning_rate": 2.602558531789225e-05, + "loss": 6.7831, + "step": 6236 + }, + { + "epoch": 1.067431114153688, + "grad_norm": 24.211278915405273, + "learning_rate": 2.601777004502037e-05, + "loss": 3.1394, + "step": 6237 + }, + { + "epoch": 1.067602259113469, + "grad_norm": 11.33743953704834, + "learning_rate": 2.6009948271647753e-05, + "loss": 0.8017, + "step": 6238 + }, + { + "epoch": 1.06777340407325, + "grad_norm": 13.077141761779785, + "learning_rate": 2.6002120002389257e-05, + "loss": 1.0105, + "step": 6239 + }, + { + 
"epoch": 1.067944549033031, + "grad_norm": 21.057144165039062, + "learning_rate": 2.5994285241863572e-05, + "loss": 2.6367, + "step": 6240 + }, + { + "epoch": 1.0681156939928118, + "grad_norm": 21.67076301574707, + "learning_rate": 2.5986443994693216e-05, + "loss": 1.9473, + "step": 6241 + }, + { + "epoch": 1.0682868389525928, + "grad_norm": 3.669816255569458, + "learning_rate": 2.5978596265504542e-05, + "loss": 0.3221, + "step": 6242 + }, + { + "epoch": 1.0684579839123738, + "grad_norm": 14.505615234375, + "learning_rate": 2.597074205892772e-05, + "loss": 1.5707, + "step": 6243 + }, + { + "epoch": 1.0686291288721548, + "grad_norm": 17.163841247558594, + "learning_rate": 2.5962881379596738e-05, + "loss": 1.9492, + "step": 6244 + }, + { + "epoch": 1.0688002738319358, + "grad_norm": 22.28460693359375, + "learning_rate": 2.595501423214942e-05, + "loss": 2.6652, + "step": 6245 + }, + { + "epoch": 1.0689714187917165, + "grad_norm": 24.26947593688965, + "learning_rate": 2.5947140621227384e-05, + "loss": 2.0312, + "step": 6246 + }, + { + "epoch": 1.0691425637514975, + "grad_norm": 17.7855224609375, + "learning_rate": 2.5939260551476075e-05, + "loss": 1.8179, + "step": 6247 + }, + { + "epoch": 1.0693137087112785, + "grad_norm": 10.421858787536621, + "learning_rate": 2.5931374027544752e-05, + "loss": 0.894, + "step": 6248 + }, + { + "epoch": 1.0694848536710593, + "grad_norm": 11.617718696594238, + "learning_rate": 2.5923481054086467e-05, + "loss": 1.0684, + "step": 6249 + }, + { + "epoch": 1.0696559986308403, + "grad_norm": 3.5524022579193115, + "learning_rate": 2.5915581635758093e-05, + "loss": 0.3627, + "step": 6250 + }, + { + "epoch": 1.0698271435906213, + "grad_norm": 24.77372169494629, + "learning_rate": 2.5907675777220293e-05, + "loss": 2.3108, + "step": 6251 + }, + { + "epoch": 1.0699982885504022, + "grad_norm": 16.17108917236328, + "learning_rate": 2.5899763483137538e-05, + "loss": 1.2411, + "step": 6252 + }, + { + "epoch": 1.0701694335101832, + "grad_norm": 18.850309371948242, + "learning_rate": 2.5891844758178092e-05, + "loss": 1.6959, + "step": 6253 + }, + { + "epoch": 1.070340578469964, + "grad_norm": 18.877622604370117, + "learning_rate": 2.588391960701402e-05, + "loss": 1.4854, + "step": 6254 + }, + { + "epoch": 1.070511723429745, + "grad_norm": 16.1739501953125, + "learning_rate": 2.5875988034321163e-05, + "loss": 1.3473, + "step": 6255 + }, + { + "epoch": 1.070682868389526, + "grad_norm": 44.91740036010742, + "learning_rate": 2.586805004477917e-05, + "loss": 1.6482, + "step": 6256 + }, + { + "epoch": 1.070854013349307, + "grad_norm": 3.3182432651519775, + "learning_rate": 2.5860105643071463e-05, + "loss": 0.3658, + "step": 6257 + }, + { + "epoch": 1.0710251583090877, + "grad_norm": 17.573970794677734, + "learning_rate": 2.585215483388525e-05, + "loss": 1.5352, + "step": 6258 + }, + { + "epoch": 1.0711963032688687, + "grad_norm": 18.447710037231445, + "learning_rate": 2.584419762191152e-05, + "loss": 1.7313, + "step": 6259 + }, + { + "epoch": 1.0713674482286497, + "grad_norm": 25.56972885131836, + "learning_rate": 2.5836234011845046e-05, + "loss": 2.1232, + "step": 6260 + }, + { + "epoch": 1.0715385931884307, + "grad_norm": 24.050052642822266, + "learning_rate": 2.5828264008384362e-05, + "loss": 2.7981, + "step": 6261 + }, + { + "epoch": 1.0717097381482115, + "grad_norm": 21.025928497314453, + "learning_rate": 2.5820287616231785e-05, + "loss": 1.5964, + "step": 6262 + }, + { + "epoch": 1.0718808831079925, + "grad_norm": 3.2581591606140137, + "learning_rate": 2.5812304840093397e-05, + 
"loss": 0.3287, + "step": 6263 + }, + { + "epoch": 1.0720520280677734, + "grad_norm": 0.9607375860214233, + "learning_rate": 2.5804315684679046e-05, + "loss": 0.1687, + "step": 6264 + }, + { + "epoch": 1.0722231730275544, + "grad_norm": 19.35166358947754, + "learning_rate": 2.5796320154702352e-05, + "loss": 1.6589, + "step": 6265 + }, + { + "epoch": 1.0723943179873352, + "grad_norm": 13.92859935760498, + "learning_rate": 2.578831825488069e-05, + "loss": 1.231, + "step": 6266 + }, + { + "epoch": 1.0725654629471162, + "grad_norm": 18.011184692382812, + "learning_rate": 2.578030998993518e-05, + "loss": 1.5781, + "step": 6267 + }, + { + "epoch": 1.0727366079068972, + "grad_norm": 18.9428768157959, + "learning_rate": 2.5772295364590726e-05, + "loss": 1.5683, + "step": 6268 + }, + { + "epoch": 1.0729077528666782, + "grad_norm": 0.7132593989372253, + "learning_rate": 2.576427438357596e-05, + "loss": 0.1564, + "step": 6269 + }, + { + "epoch": 1.073078897826459, + "grad_norm": 19.910579681396484, + "learning_rate": 2.5756247051623274e-05, + "loss": 2.2763, + "step": 6270 + }, + { + "epoch": 1.07325004278624, + "grad_norm": 1.4179282188415527, + "learning_rate": 2.5748213373468808e-05, + "loss": 0.1777, + "step": 6271 + }, + { + "epoch": 1.073421187746021, + "grad_norm": 4.847038269042969, + "learning_rate": 2.574017335385244e-05, + "loss": 0.3808, + "step": 6272 + }, + { + "epoch": 1.073592332705802, + "grad_norm": 6.342759132385254, + "learning_rate": 2.5732126997517798e-05, + "loss": 0.4684, + "step": 6273 + }, + { + "epoch": 1.0737634776655827, + "grad_norm": 1.4660619497299194, + "learning_rate": 2.5724074309212243e-05, + "loss": 0.1848, + "step": 6274 + }, + { + "epoch": 1.0739346226253637, + "grad_norm": 18.226266860961914, + "learning_rate": 2.571601529368687e-05, + "loss": 1.4087, + "step": 6275 + }, + { + "epoch": 1.0741057675851446, + "grad_norm": 35.625450134277344, + "learning_rate": 2.5707949955696513e-05, + "loss": 1.2727, + "step": 6276 + }, + { + "epoch": 1.0742769125449256, + "grad_norm": 8.416872024536133, + "learning_rate": 2.5699878299999738e-05, + "loss": 0.4758, + "step": 6277 + }, + { + "epoch": 1.0744480575047064, + "grad_norm": 17.871906280517578, + "learning_rate": 2.569180033135882e-05, + "loss": 1.6863, + "step": 6278 + }, + { + "epoch": 1.0746192024644874, + "grad_norm": 22.277774810791016, + "learning_rate": 2.5683716054539787e-05, + "loss": 2.9435, + "step": 6279 + }, + { + "epoch": 1.0747903474242684, + "grad_norm": 20.06029510498047, + "learning_rate": 2.5675625474312372e-05, + "loss": 1.9097, + "step": 6280 + }, + { + "epoch": 1.0749614923840494, + "grad_norm": 24.28558349609375, + "learning_rate": 2.5667528595450024e-05, + "loss": 2.217, + "step": 6281 + }, + { + "epoch": 1.0751326373438301, + "grad_norm": 10.94339370727539, + "learning_rate": 2.565942542272991e-05, + "loss": 1.2658, + "step": 6282 + }, + { + "epoch": 1.0753037823036111, + "grad_norm": 17.108922958374023, + "learning_rate": 2.5651315960932926e-05, + "loss": 1.2507, + "step": 6283 + }, + { + "epoch": 1.075474927263392, + "grad_norm": 3.6949167251586914, + "learning_rate": 2.5643200214843658e-05, + "loss": 0.3683, + "step": 6284 + }, + { + "epoch": 1.075646072223173, + "grad_norm": 3.6829192638397217, + "learning_rate": 2.5635078189250414e-05, + "loss": 0.3396, + "step": 6285 + }, + { + "epoch": 1.0758172171829539, + "grad_norm": 14.598087310791016, + "learning_rate": 2.5626949888945196e-05, + "loss": 1.435, + "step": 6286 + }, + { + "epoch": 1.0759883621427349, + "grad_norm": 20.382143020629883, + 
"learning_rate": 2.5618815318723713e-05, + "loss": 1.7844, + "step": 6287 + }, + { + "epoch": 1.0761595071025158, + "grad_norm": 5.522397994995117, + "learning_rate": 2.5610674483385373e-05, + "loss": 0.5656, + "step": 6288 + }, + { + "epoch": 1.0763306520622968, + "grad_norm": 31.37264633178711, + "learning_rate": 2.560252738773329e-05, + "loss": 6.2005, + "step": 6289 + }, + { + "epoch": 1.0765017970220776, + "grad_norm": 18.120845794677734, + "learning_rate": 2.559437403657425e-05, + "loss": 1.6842, + "step": 6290 + }, + { + "epoch": 1.0766729419818586, + "grad_norm": 10.900449752807617, + "learning_rate": 2.558621443471876e-05, + "loss": 0.798, + "step": 6291 + }, + { + "epoch": 1.0768440869416396, + "grad_norm": 20.480613708496094, + "learning_rate": 2.5578048586980974e-05, + "loss": 2.6181, + "step": 6292 + }, + { + "epoch": 1.0770152319014206, + "grad_norm": 14.86552619934082, + "learning_rate": 2.5569876498178774e-05, + "loss": 1.0591, + "step": 6293 + }, + { + "epoch": 1.0771863768612016, + "grad_norm": 23.034835815429688, + "learning_rate": 2.556169817313369e-05, + "loss": 2.9569, + "step": 6294 + }, + { + "epoch": 1.0773575218209823, + "grad_norm": 7.4421868324279785, + "learning_rate": 2.5553513616670957e-05, + "loss": 0.7049, + "step": 6295 + }, + { + "epoch": 1.0775286667807633, + "grad_norm": 4.225881576538086, + "learning_rate": 2.554532283361947e-05, + "loss": 0.4169, + "step": 6296 + }, + { + "epoch": 1.0776998117405443, + "grad_norm": 14.406049728393555, + "learning_rate": 2.5537125828811803e-05, + "loss": 1.1218, + "step": 6297 + }, + { + "epoch": 1.077870956700325, + "grad_norm": 10.225951194763184, + "learning_rate": 2.5528922607084203e-05, + "loss": 1.007, + "step": 6298 + }, + { + "epoch": 1.078042101660106, + "grad_norm": 14.981630325317383, + "learning_rate": 2.552071317327658e-05, + "loss": 1.2863, + "step": 6299 + }, + { + "epoch": 1.078213246619887, + "grad_norm": 13.425951957702637, + "learning_rate": 2.5512497532232517e-05, + "loss": 1.098, + "step": 6300 + }, + { + "epoch": 1.078384391579668, + "grad_norm": 20.084674835205078, + "learning_rate": 2.550427568879925e-05, + "loss": 2.7397, + "step": 6301 + }, + { + "epoch": 1.078555536539449, + "grad_norm": 16.5575008392334, + "learning_rate": 2.5496047647827688e-05, + "loss": 1.2174, + "step": 6302 + }, + { + "epoch": 1.0787266814992298, + "grad_norm": 21.359878540039062, + "learning_rate": 2.5487813414172378e-05, + "loss": 2.3573, + "step": 6303 + }, + { + "epoch": 1.0788978264590108, + "grad_norm": 12.596624374389648, + "learning_rate": 2.547957299269153e-05, + "loss": 1.2493, + "step": 6304 + }, + { + "epoch": 1.0790689714187918, + "grad_norm": 22.200517654418945, + "learning_rate": 2.547132638824701e-05, + "loss": 2.6415, + "step": 6305 + }, + { + "epoch": 1.0792401163785725, + "grad_norm": 20.53194808959961, + "learning_rate": 2.546307360570432e-05, + "loss": 0.7938, + "step": 6306 + }, + { + "epoch": 1.0794112613383535, + "grad_norm": 13.094803810119629, + "learning_rate": 2.545481464993262e-05, + "loss": 0.958, + "step": 6307 + }, + { + "epoch": 1.0795824062981345, + "grad_norm": 13.0464506149292, + "learning_rate": 2.5446549525804703e-05, + "loss": 1.1583, + "step": 6308 + }, + { + "epoch": 1.0797535512579155, + "grad_norm": 21.018848419189453, + "learning_rate": 2.5438278238197005e-05, + "loss": 2.0161, + "step": 6309 + }, + { + "epoch": 1.0799246962176965, + "grad_norm": 11.44277286529541, + "learning_rate": 2.5430000791989604e-05, + "loss": 0.957, + "step": 6310 + }, + { + "epoch": 
1.0800958411774773, + "grad_norm": 19.248382568359375, + "learning_rate": 2.5421717192066202e-05, + "loss": 1.7125, + "step": 6311 + }, + { + "epoch": 1.0802669861372582, + "grad_norm": 17.87828826904297, + "learning_rate": 2.541342744331413e-05, + "loss": 1.6807, + "step": 6312 + }, + { + "epoch": 1.0804381310970392, + "grad_norm": 14.122018814086914, + "learning_rate": 2.5405131550624355e-05, + "loss": 1.1676, + "step": 6313 + }, + { + "epoch": 1.0806092760568202, + "grad_norm": 18.704439163208008, + "learning_rate": 2.539682951889147e-05, + "loss": 1.4403, + "step": 6314 + }, + { + "epoch": 1.080780421016601, + "grad_norm": 18.118362426757812, + "learning_rate": 2.538852135301368e-05, + "loss": 1.4984, + "step": 6315 + }, + { + "epoch": 1.080951565976382, + "grad_norm": 20.102548599243164, + "learning_rate": 2.5380207057892822e-05, + "loss": 1.7471, + "step": 6316 + }, + { + "epoch": 1.081122710936163, + "grad_norm": 7.387364387512207, + "learning_rate": 2.5371886638434335e-05, + "loss": 0.6007, + "step": 6317 + }, + { + "epoch": 1.081293855895944, + "grad_norm": 20.12532615661621, + "learning_rate": 2.5363560099547282e-05, + "loss": 2.5947, + "step": 6318 + }, + { + "epoch": 1.0814650008557247, + "grad_norm": 12.348640441894531, + "learning_rate": 2.5355227446144337e-05, + "loss": 0.9146, + "step": 6319 + }, + { + "epoch": 1.0816361458155057, + "grad_norm": 1.4093433618545532, + "learning_rate": 2.5346888683141776e-05, + "loss": 0.1879, + "step": 6320 + }, + { + "epoch": 1.0818072907752867, + "grad_norm": 21.66387939453125, + "learning_rate": 2.533854381545948e-05, + "loss": 1.9458, + "step": 6321 + }, + { + "epoch": 1.0819784357350677, + "grad_norm": 29.38831329345703, + "learning_rate": 2.5330192848020935e-05, + "loss": 5.4342, + "step": 6322 + }, + { + "epoch": 1.0821495806948485, + "grad_norm": 15.972040176391602, + "learning_rate": 2.532183578575322e-05, + "loss": 1.4628, + "step": 6323 + }, + { + "epoch": 1.0823207256546294, + "grad_norm": 22.438627243041992, + "learning_rate": 2.5313472633587025e-05, + "loss": 1.9986, + "step": 6324 + }, + { + "epoch": 1.0824918706144104, + "grad_norm": 99.13875579833984, + "learning_rate": 2.5305103396456608e-05, + "loss": 7.4459, + "step": 6325 + }, + { + "epoch": 1.0826630155741914, + "grad_norm": 0.6867462396621704, + "learning_rate": 2.529672807929984e-05, + "loss": 0.1632, + "step": 6326 + }, + { + "epoch": 1.0828341605339722, + "grad_norm": 26.213483810424805, + "learning_rate": 2.5288346687058167e-05, + "loss": 2.6063, + "step": 6327 + }, + { + "epoch": 1.0830053054937532, + "grad_norm": 32.9008674621582, + "learning_rate": 2.5279959224676624e-05, + "loss": 6.1089, + "step": 6328 + }, + { + "epoch": 1.0831764504535342, + "grad_norm": 7.711669921875, + "learning_rate": 2.5271565697103828e-05, + "loss": 0.5029, + "step": 6329 + }, + { + "epoch": 1.0833475954133152, + "grad_norm": 0.9175840020179749, + "learning_rate": 2.526316610929197e-05, + "loss": 0.1591, + "step": 6330 + }, + { + "epoch": 1.083518740373096, + "grad_norm": 15.961565017700195, + "learning_rate": 2.5254760466196825e-05, + "loss": 1.2495, + "step": 6331 + }, + { + "epoch": 1.083689885332877, + "grad_norm": 14.952744483947754, + "learning_rate": 2.524634877277773e-05, + "loss": 1.2988, + "step": 6332 + }, + { + "epoch": 1.083861030292658, + "grad_norm": 21.237443923950195, + "learning_rate": 2.5237931033997598e-05, + "loss": 1.5518, + "step": 6333 + }, + { + "epoch": 1.084032175252439, + "grad_norm": 0.7877843379974365, + "learning_rate": 2.5229507254822905e-05, + "loss": 
0.1497, + "step": 6334 + }, + { + "epoch": 1.0842033202122197, + "grad_norm": 20.07823371887207, + "learning_rate": 2.5221077440223696e-05, + "loss": 2.3132, + "step": 6335 + }, + { + "epoch": 1.0843744651720006, + "grad_norm": 24.466449737548828, + "learning_rate": 2.521264159517357e-05, + "loss": 1.4605, + "step": 6336 + }, + { + "epoch": 1.0845456101317816, + "grad_norm": 6.049923896789551, + "learning_rate": 2.52041997246497e-05, + "loss": 0.5203, + "step": 6337 + }, + { + "epoch": 1.0847167550915626, + "grad_norm": 0.8076518774032593, + "learning_rate": 2.5195751833632784e-05, + "loss": 0.1521, + "step": 6338 + }, + { + "epoch": 1.0848879000513434, + "grad_norm": 5.725604057312012, + "learning_rate": 2.5187297927107106e-05, + "loss": 0.4024, + "step": 6339 + }, + { + "epoch": 1.0850590450111244, + "grad_norm": 1.9877190589904785, + "learning_rate": 2.5178838010060472e-05, + "loss": 0.2543, + "step": 6340 + }, + { + "epoch": 1.0852301899709054, + "grad_norm": 20.83760643005371, + "learning_rate": 2.517037208748426e-05, + "loss": 2.1836, + "step": 6341 + }, + { + "epoch": 1.0854013349306864, + "grad_norm": 19.120075225830078, + "learning_rate": 2.516190016437336e-05, + "loss": 1.5214, + "step": 6342 + }, + { + "epoch": 1.0855724798904671, + "grad_norm": 20.080074310302734, + "learning_rate": 2.5153422245726225e-05, + "loss": 1.8331, + "step": 6343 + }, + { + "epoch": 1.0857436248502481, + "grad_norm": 19.490802764892578, + "learning_rate": 2.514493833654485e-05, + "loss": 2.3582, + "step": 6344 + }, + { + "epoch": 1.085914769810029, + "grad_norm": 25.15721321105957, + "learning_rate": 2.5136448441834744e-05, + "loss": 3.3368, + "step": 6345 + }, + { + "epoch": 1.08608591476981, + "grad_norm": 30.424531936645508, + "learning_rate": 2.512795256660496e-05, + "loss": 5.8034, + "step": 6346 + }, + { + "epoch": 1.0862570597295909, + "grad_norm": 13.79897689819336, + "learning_rate": 2.511945071586807e-05, + "loss": 1.112, + "step": 6347 + }, + { + "epoch": 1.0864282046893718, + "grad_norm": 8.542878150939941, + "learning_rate": 2.5110942894640192e-05, + "loss": 0.6739, + "step": 6348 + }, + { + "epoch": 1.0865993496491528, + "grad_norm": 29.626609802246094, + "learning_rate": 2.5102429107940947e-05, + "loss": 5.7867, + "step": 6349 + }, + { + "epoch": 1.0867704946089338, + "grad_norm": 15.035479545593262, + "learning_rate": 2.509390936079348e-05, + "loss": 1.1878, + "step": 6350 + }, + { + "epoch": 1.0869416395687148, + "grad_norm": 15.836331367492676, + "learning_rate": 2.5085383658224454e-05, + "loss": 1.4794, + "step": 6351 + }, + { + "epoch": 1.0871127845284956, + "grad_norm": 16.121477127075195, + "learning_rate": 2.5076852005264045e-05, + "loss": 1.2524, + "step": 6352 + }, + { + "epoch": 1.0872839294882766, + "grad_norm": 18.068601608276367, + "learning_rate": 2.5068314406945948e-05, + "loss": 1.4017, + "step": 6353 + }, + { + "epoch": 1.0874550744480576, + "grad_norm": 20.634361267089844, + "learning_rate": 2.5059770868307353e-05, + "loss": 1.8565, + "step": 6354 + }, + { + "epoch": 1.0876262194078383, + "grad_norm": 156.67971801757812, + "learning_rate": 2.5051221394388965e-05, + "loss": 7.3016, + "step": 6355 + }, + { + "epoch": 1.0877973643676193, + "grad_norm": 1.6159067153930664, + "learning_rate": 2.5042665990234978e-05, + "loss": 0.259, + "step": 6356 + }, + { + "epoch": 1.0879685093274003, + "grad_norm": 21.53618049621582, + "learning_rate": 2.5034104660893102e-05, + "loss": 2.9373, + "step": 6357 + }, + { + "epoch": 1.0881396542871813, + "grad_norm": 20.488115310668945, + 
"learning_rate": 2.5025537411414532e-05, + "loss": 2.6575, + "step": 6358 + }, + { + "epoch": 1.0883107992469623, + "grad_norm": 12.624834060668945, + "learning_rate": 2.5016964246853952e-05, + "loss": 0.9089, + "step": 6359 + }, + { + "epoch": 1.088481944206743, + "grad_norm": 22.495460510253906, + "learning_rate": 2.500838517226955e-05, + "loss": 2.379, + "step": 6360 + }, + { + "epoch": 1.088653089166524, + "grad_norm": 18.44520378112793, + "learning_rate": 2.4999800192722988e-05, + "loss": 1.4829, + "step": 6361 + }, + { + "epoch": 1.088824234126305, + "grad_norm": 8.016738891601562, + "learning_rate": 2.499120931327942e-05, + "loss": 0.571, + "step": 6362 + }, + { + "epoch": 1.088995379086086, + "grad_norm": 24.428014755249023, + "learning_rate": 2.4982612539007474e-05, + "loss": 5.3953, + "step": 6363 + }, + { + "epoch": 1.0891665240458668, + "grad_norm": 20.098068237304688, + "learning_rate": 2.497400987497926e-05, + "loss": 2.2569, + "step": 6364 + }, + { + "epoch": 1.0893376690056478, + "grad_norm": 23.772600173950195, + "learning_rate": 2.4965401326270365e-05, + "loss": 3.1548, + "step": 6365 + }, + { + "epoch": 1.0895088139654288, + "grad_norm": 13.880364418029785, + "learning_rate": 2.4956786897959844e-05, + "loss": 1.3489, + "step": 6366 + }, + { + "epoch": 1.0896799589252097, + "grad_norm": 4.314807891845703, + "learning_rate": 2.4948166595130227e-05, + "loss": 0.3581, + "step": 6367 + }, + { + "epoch": 1.0898511038849905, + "grad_norm": 16.17949104309082, + "learning_rate": 2.4939540422867497e-05, + "loss": 1.3273, + "step": 6368 + }, + { + "epoch": 1.0900222488447715, + "grad_norm": 26.82145118713379, + "learning_rate": 2.493090838626112e-05, + "loss": 2.7974, + "step": 6369 + }, + { + "epoch": 1.0901933938045525, + "grad_norm": 18.014739990234375, + "learning_rate": 2.4922270490403994e-05, + "loss": 1.6901, + "step": 6370 + }, + { + "epoch": 1.0903645387643335, + "grad_norm": 14.148981094360352, + "learning_rate": 2.49136267403925e-05, + "loss": 1.2315, + "step": 6371 + }, + { + "epoch": 1.0905356837241142, + "grad_norm": 1.2226158380508423, + "learning_rate": 2.4904977141326468e-05, + "loss": 0.1799, + "step": 6372 + }, + { + "epoch": 1.0907068286838952, + "grad_norm": 0.6526600122451782, + "learning_rate": 2.489632169830917e-05, + "loss": 0.1585, + "step": 6373 + }, + { + "epoch": 1.0908779736436762, + "grad_norm": 19.312833786010742, + "learning_rate": 2.4887660416447326e-05, + "loss": 1.8635, + "step": 6374 + }, + { + "epoch": 1.0910491186034572, + "grad_norm": 16.932435989379883, + "learning_rate": 2.4878993300851115e-05, + "loss": 1.2672, + "step": 6375 + }, + { + "epoch": 1.091220263563238, + "grad_norm": 18.97432518005371, + "learning_rate": 2.4870320356634138e-05, + "loss": 1.2766, + "step": 6376 + }, + { + "epoch": 1.091391408523019, + "grad_norm": 37.76401901245117, + "learning_rate": 2.486164158891345e-05, + "loss": 5.9113, + "step": 6377 + }, + { + "epoch": 1.0915625534828, + "grad_norm": 20.928302764892578, + "learning_rate": 2.4852957002809534e-05, + "loss": 1.8063, + "step": 6378 + }, + { + "epoch": 1.091733698442581, + "grad_norm": 0.7269101142883301, + "learning_rate": 2.484426660344631e-05, + "loss": 0.1545, + "step": 6379 + }, + { + "epoch": 1.0919048434023617, + "grad_norm": 105.674072265625, + "learning_rate": 2.4835570395951133e-05, + "loss": 9.2848, + "step": 6380 + }, + { + "epoch": 1.0920759883621427, + "grad_norm": 19.896669387817383, + "learning_rate": 2.4826868385454767e-05, + "loss": 1.8101, + "step": 6381 + }, + { + "epoch": 
1.0922471333219237, + "grad_norm": 13.241930961608887, + "learning_rate": 2.4818160577091417e-05, + "loss": 1.3192, + "step": 6382 + }, + { + "epoch": 1.0924182782817047, + "grad_norm": 13.864144325256348, + "learning_rate": 2.4809446975998707e-05, + "loss": 1.1316, + "step": 6383 + }, + { + "epoch": 1.0925894232414854, + "grad_norm": 17.57133674621582, + "learning_rate": 2.480072758731767e-05, + "loss": 1.6042, + "step": 6384 + }, + { + "epoch": 1.0927605682012664, + "grad_norm": 14.61141300201416, + "learning_rate": 2.4792002416192754e-05, + "loss": 1.2322, + "step": 6385 + }, + { + "epoch": 1.0929317131610474, + "grad_norm": 24.17422866821289, + "learning_rate": 2.4783271467771832e-05, + "loss": 2.9147, + "step": 6386 + }, + { + "epoch": 1.0931028581208284, + "grad_norm": 14.182536125183105, + "learning_rate": 2.477453474720617e-05, + "loss": 1.534, + "step": 6387 + }, + { + "epoch": 1.0932740030806092, + "grad_norm": 142.10279846191406, + "learning_rate": 2.476579225965045e-05, + "loss": 7.7581, + "step": 6388 + }, + { + "epoch": 1.0934451480403902, + "grad_norm": 13.02221965789795, + "learning_rate": 2.475704401026275e-05, + "loss": 1.2573, + "step": 6389 + }, + { + "epoch": 1.0936162930001712, + "grad_norm": 14.556048393249512, + "learning_rate": 2.474829000420455e-05, + "loss": 1.2619, + "step": 6390 + }, + { + "epoch": 1.0937874379599521, + "grad_norm": 28.360790252685547, + "learning_rate": 2.473953024664073e-05, + "loss": 5.3109, + "step": 6391 + }, + { + "epoch": 1.093958582919733, + "grad_norm": 17.71285629272461, + "learning_rate": 2.4730764742739554e-05, + "loss": 1.9739, + "step": 6392 + }, + { + "epoch": 1.094129727879514, + "grad_norm": 16.858354568481445, + "learning_rate": 2.4721993497672693e-05, + "loss": 1.7266, + "step": 6393 + }, + { + "epoch": 1.094300872839295, + "grad_norm": 20.677833557128906, + "learning_rate": 2.4713216516615182e-05, + "loss": 2.622, + "step": 6394 + }, + { + "epoch": 1.0944720177990759, + "grad_norm": 25.51704978942871, + "learning_rate": 2.4704433804745465e-05, + "loss": 2.3596, + "step": 6395 + }, + { + "epoch": 1.0946431627588566, + "grad_norm": 22.32132339477539, + "learning_rate": 2.469564536724534e-05, + "loss": 2.2435, + "step": 6396 + }, + { + "epoch": 1.0948143077186376, + "grad_norm": 20.751523971557617, + "learning_rate": 2.4686851209300017e-05, + "loss": 1.7518, + "step": 6397 + }, + { + "epoch": 1.0949854526784186, + "grad_norm": 18.10664939880371, + "learning_rate": 2.4678051336098048e-05, + "loss": 1.968, + "step": 6398 + }, + { + "epoch": 1.0951565976381996, + "grad_norm": 17.6309814453125, + "learning_rate": 2.4669245752831375e-05, + "loss": 1.504, + "step": 6399 + }, + { + "epoch": 1.0953277425979806, + "grad_norm": 17.307374954223633, + "learning_rate": 2.4660434464695304e-05, + "loss": 1.3444, + "step": 6400 + }, + { + "epoch": 1.0954988875577614, + "grad_norm": 19.310840606689453, + "learning_rate": 2.465161747688851e-05, + "loss": 2.3097, + "step": 6401 + }, + { + "epoch": 1.0956700325175424, + "grad_norm": 26.530712127685547, + "learning_rate": 2.4642794794613027e-05, + "loss": 2.4516, + "step": 6402 + }, + { + "epoch": 1.0958411774773233, + "grad_norm": 14.693706512451172, + "learning_rate": 2.463396642307426e-05, + "loss": 1.1521, + "step": 6403 + }, + { + "epoch": 1.0960123224371041, + "grad_norm": 45.52013397216797, + "learning_rate": 2.4625132367480948e-05, + "loss": 6.0723, + "step": 6404 + }, + { + "epoch": 1.096183467396885, + "grad_norm": 21.434844970703125, + "learning_rate": 2.4616292633045203e-05, + "loss": 
2.6903, + "step": 6405 + }, + { + "epoch": 1.096354612356666, + "grad_norm": 16.72518539428711, + "learning_rate": 2.4607447224982487e-05, + "loss": 1.3194, + "step": 6406 + }, + { + "epoch": 1.096525757316447, + "grad_norm": 14.632075309753418, + "learning_rate": 2.4598596148511592e-05, + "loss": 1.5802, + "step": 6407 + }, + { + "epoch": 1.096696902276228, + "grad_norm": 5.6426167488098145, + "learning_rate": 2.4589739408854678e-05, + "loss": 0.4257, + "step": 6408 + }, + { + "epoch": 1.0968680472360088, + "grad_norm": 14.361030578613281, + "learning_rate": 2.4580877011237228e-05, + "loss": 1.1946, + "step": 6409 + }, + { + "epoch": 1.0970391921957898, + "grad_norm": 0.7481557130813599, + "learning_rate": 2.457200896088807e-05, + "loss": 0.1552, + "step": 6410 + }, + { + "epoch": 1.0972103371555708, + "grad_norm": 11.047932624816895, + "learning_rate": 2.4563135263039368e-05, + "loss": 1.1797, + "step": 6411 + }, + { + "epoch": 1.0973814821153518, + "grad_norm": 32.6690788269043, + "learning_rate": 2.4554255922926618e-05, + "loss": 5.7241, + "step": 6412 + }, + { + "epoch": 1.0975526270751326, + "grad_norm": 6.154735565185547, + "learning_rate": 2.4545370945788642e-05, + "loss": 0.4541, + "step": 6413 + }, + { + "epoch": 1.0977237720349136, + "grad_norm": 20.97507095336914, + "learning_rate": 2.453648033686759e-05, + "loss": 2.084, + "step": 6414 + }, + { + "epoch": 1.0978949169946945, + "grad_norm": 20.436906814575195, + "learning_rate": 2.452758410140893e-05, + "loss": 1.9525, + "step": 6415 + }, + { + "epoch": 1.0980660619544755, + "grad_norm": 13.217822074890137, + "learning_rate": 2.4518682244661466e-05, + "loss": 1.068, + "step": 6416 + }, + { + "epoch": 1.0982372069142563, + "grad_norm": 26.957794189453125, + "learning_rate": 2.45097747718773e-05, + "loss": 5.2028, + "step": 6417 + }, + { + "epoch": 1.0984083518740373, + "grad_norm": 26.779298782348633, + "learning_rate": 2.4500861688311852e-05, + "loss": 2.2781, + "step": 6418 + }, + { + "epoch": 1.0985794968338183, + "grad_norm": 18.87523651123047, + "learning_rate": 2.4491942999223856e-05, + "loss": 1.7808, + "step": 6419 + }, + { + "epoch": 1.0987506417935993, + "grad_norm": 18.582653045654297, + "learning_rate": 2.4483018709875357e-05, + "loss": 2.6275, + "step": 6420 + }, + { + "epoch": 1.09892178675338, + "grad_norm": 20.106109619140625, + "learning_rate": 2.4474088825531687e-05, + "loss": 2.4183, + "step": 6421 + }, + { + "epoch": 1.099092931713161, + "grad_norm": 23.127412796020508, + "learning_rate": 2.4465153351461507e-05, + "loss": 3.2691, + "step": 6422 + }, + { + "epoch": 1.099264076672942, + "grad_norm": 26.415891647338867, + "learning_rate": 2.4456212292936747e-05, + "loss": 3.624, + "step": 6423 + }, + { + "epoch": 1.099435221632723, + "grad_norm": 8.812840461730957, + "learning_rate": 2.444726565523265e-05, + "loss": 0.6166, + "step": 6424 + }, + { + "epoch": 1.0996063665925038, + "grad_norm": 18.31468963623047, + "learning_rate": 2.4438313443627748e-05, + "loss": 2.1999, + "step": 6425 + }, + { + "epoch": 1.0997775115522848, + "grad_norm": 0.5593311786651611, + "learning_rate": 2.442935566340385e-05, + "loss": 0.1435, + "step": 6426 + }, + { + "epoch": 1.0999486565120657, + "grad_norm": 5.1220011711120605, + "learning_rate": 2.4420392319846067e-05, + "loss": 0.3795, + "step": 6427 + }, + { + "epoch": 1.1001198014718467, + "grad_norm": 17.488069534301758, + "learning_rate": 2.441142341824279e-05, + "loss": 1.6355, + "step": 6428 + }, + { + "epoch": 1.1002909464316275, + "grad_norm": 14.01169490814209, + 
"learning_rate": 2.4402448963885672e-05, + "loss": 1.5106, + "step": 6429 + }, + { + "epoch": 1.1004620913914085, + "grad_norm": 21.192691802978516, + "learning_rate": 2.4393468962069663e-05, + "loss": 2.3361, + "step": 6430 + }, + { + "epoch": 1.1006332363511895, + "grad_norm": 15.216924667358398, + "learning_rate": 2.4384483418092976e-05, + "loss": 1.3677, + "step": 6431 + }, + { + "epoch": 1.1008043813109705, + "grad_norm": 16.910829544067383, + "learning_rate": 2.4375492337257097e-05, + "loss": 1.5384, + "step": 6432 + }, + { + "epoch": 1.1009755262707512, + "grad_norm": 6.67744779586792, + "learning_rate": 2.4366495724866772e-05, + "loss": 0.8838, + "step": 6433 + }, + { + "epoch": 1.1011466712305322, + "grad_norm": 18.879968643188477, + "learning_rate": 2.4357493586230018e-05, + "loss": 1.6657, + "step": 6434 + }, + { + "epoch": 1.1013178161903132, + "grad_norm": 23.374927520751953, + "learning_rate": 2.4348485926658108e-05, + "loss": 2.5072, + "step": 6435 + }, + { + "epoch": 1.1014889611500942, + "grad_norm": 5.638739109039307, + "learning_rate": 2.4339472751465584e-05, + "loss": 0.3209, + "step": 6436 + }, + { + "epoch": 1.101660106109875, + "grad_norm": 8.479720115661621, + "learning_rate": 2.433045406597022e-05, + "loss": 0.517, + "step": 6437 + }, + { + "epoch": 1.101831251069656, + "grad_norm": 10.97712230682373, + "learning_rate": 2.432142987549306e-05, + "loss": 1.1031, + "step": 6438 + }, + { + "epoch": 1.102002396029437, + "grad_norm": 5.807440280914307, + "learning_rate": 2.4312400185358393e-05, + "loss": 0.359, + "step": 6439 + }, + { + "epoch": 1.102173540989218, + "grad_norm": 0.7325602173805237, + "learning_rate": 2.4303365000893744e-05, + "loss": 0.1568, + "step": 6440 + }, + { + "epoch": 1.1023446859489987, + "grad_norm": 20.65593910217285, + "learning_rate": 2.4294324327429887e-05, + "loss": 1.6213, + "step": 6441 + }, + { + "epoch": 1.1025158309087797, + "grad_norm": 22.244678497314453, + "learning_rate": 2.4285278170300835e-05, + "loss": 2.1494, + "step": 6442 + }, + { + "epoch": 1.1026869758685607, + "grad_norm": 14.809113502502441, + "learning_rate": 2.4276226534843827e-05, + "loss": 0.8485, + "step": 6443 + }, + { + "epoch": 1.1028581208283417, + "grad_norm": 22.746294021606445, + "learning_rate": 2.4267169426399356e-05, + "loss": 1.7695, + "step": 6444 + }, + { + "epoch": 1.1030292657881224, + "grad_norm": 6.71605920791626, + "learning_rate": 2.425810685031111e-05, + "loss": 0.4067, + "step": 6445 + }, + { + "epoch": 1.1032004107479034, + "grad_norm": 4.4305524826049805, + "learning_rate": 2.4249038811926042e-05, + "loss": 0.3609, + "step": 6446 + }, + { + "epoch": 1.1033715557076844, + "grad_norm": 8.703330039978027, + "learning_rate": 2.4239965316594294e-05, + "loss": 0.8769, + "step": 6447 + }, + { + "epoch": 1.1035427006674654, + "grad_norm": 9.909598350524902, + "learning_rate": 2.4230886369669248e-05, + "loss": 0.5658, + "step": 6448 + }, + { + "epoch": 1.1037138456272464, + "grad_norm": 0.643650233745575, + "learning_rate": 2.4221801976507495e-05, + "loss": 0.1524, + "step": 6449 + }, + { + "epoch": 1.1038849905870272, + "grad_norm": 11.157785415649414, + "learning_rate": 2.421271214246884e-05, + "loss": 0.4894, + "step": 6450 + }, + { + "epoch": 1.1040561355468081, + "grad_norm": 18.122224807739258, + "learning_rate": 2.42036168729163e-05, + "loss": 1.4929, + "step": 6451 + }, + { + "epoch": 1.1042272805065891, + "grad_norm": 3.4521541595458984, + "learning_rate": 2.4194516173216097e-05, + "loss": 0.3654, + "step": 6452 + }, + { + "epoch": 
1.10439842546637, + "grad_norm": 21.733537673950195, + "learning_rate": 2.4185410048737654e-05, + "loss": 2.376, + "step": 6453 + }, + { + "epoch": 1.104569570426151, + "grad_norm": 7.599557876586914, + "learning_rate": 2.4176298504853604e-05, + "loss": 0.9135, + "step": 6454 + }, + { + "epoch": 1.1047407153859319, + "grad_norm": 0.5854821801185608, + "learning_rate": 2.4167181546939765e-05, + "loss": 0.1569, + "step": 6455 + }, + { + "epoch": 1.1049118603457129, + "grad_norm": 14.26248550415039, + "learning_rate": 2.415805918037516e-05, + "loss": 1.2171, + "step": 6456 + }, + { + "epoch": 1.1050830053054939, + "grad_norm": 21.380966186523438, + "learning_rate": 2.4148931410542e-05, + "loss": 1.8418, + "step": 6457 + }, + { + "epoch": 1.1052541502652746, + "grad_norm": 0.7476603984832764, + "learning_rate": 2.413979824282568e-05, + "loss": 0.1601, + "step": 6458 + }, + { + "epoch": 1.1054252952250556, + "grad_norm": 1.1973159313201904, + "learning_rate": 2.4130659682614783e-05, + "loss": 0.1709, + "step": 6459 + }, + { + "epoch": 1.1055964401848366, + "grad_norm": 24.046266555786133, + "learning_rate": 2.4121515735301076e-05, + "loss": 2.5584, + "step": 6460 + }, + { + "epoch": 1.1057675851446176, + "grad_norm": 7.6598052978515625, + "learning_rate": 2.4112366406279492e-05, + "loss": 0.5179, + "step": 6461 + }, + { + "epoch": 1.1059387301043984, + "grad_norm": 91.56566619873047, + "learning_rate": 2.4103211700948163e-05, + "loss": 6.9132, + "step": 6462 + }, + { + "epoch": 1.1061098750641793, + "grad_norm": 18.259281158447266, + "learning_rate": 2.4094051624708374e-05, + "loss": 1.5434, + "step": 6463 + }, + { + "epoch": 1.1062810200239603, + "grad_norm": 14.358467102050781, + "learning_rate": 2.4084886182964574e-05, + "loss": 1.3149, + "step": 6464 + }, + { + "epoch": 1.1064521649837413, + "grad_norm": 8.986652374267578, + "learning_rate": 2.4075715381124397e-05, + "loss": 0.6369, + "step": 6465 + }, + { + "epoch": 1.106623309943522, + "grad_norm": 20.910594940185547, + "learning_rate": 2.4066539224598623e-05, + "loss": 1.8807, + "step": 6466 + }, + { + "epoch": 1.106794454903303, + "grad_norm": 14.152681350708008, + "learning_rate": 2.405735771880121e-05, + "loss": 1.3497, + "step": 6467 + }, + { + "epoch": 1.106965599863084, + "grad_norm": 2.2874717712402344, + "learning_rate": 2.4048170869149248e-05, + "loss": 0.3023, + "step": 6468 + }, + { + "epoch": 1.107136744822865, + "grad_norm": 9.741914749145508, + "learning_rate": 2.4038978681062995e-05, + "loss": 1.1822, + "step": 6469 + }, + { + "epoch": 1.1073078897826458, + "grad_norm": 14.803202629089355, + "learning_rate": 2.402978115996586e-05, + "loss": 1.8127, + "step": 6470 + }, + { + "epoch": 1.1074790347424268, + "grad_norm": 16.078153610229492, + "learning_rate": 2.402057831128439e-05, + "loss": 1.2776, + "step": 6471 + }, + { + "epoch": 1.1076501797022078, + "grad_norm": 14.220911026000977, + "learning_rate": 2.4011370140448278e-05, + "loss": 1.0287, + "step": 6472 + }, + { + "epoch": 1.1078213246619888, + "grad_norm": 16.646638870239258, + "learning_rate": 2.4002156652890368e-05, + "loss": 1.501, + "step": 6473 + }, + { + "epoch": 1.1079924696217696, + "grad_norm": 14.892061233520508, + "learning_rate": 2.399293785404662e-05, + "loss": 1.0803, + "step": 6474 + }, + { + "epoch": 1.1081636145815505, + "grad_norm": 25.23822021484375, + "learning_rate": 2.398371374935614e-05, + "loss": 2.2634, + "step": 6475 + }, + { + "epoch": 1.1083347595413315, + "grad_norm": 164.55755615234375, + "learning_rate": 2.3974484344261175e-05, + 
"loss": 13.2938, + "step": 6476 + }, + { + "epoch": 1.1085059045011125, + "grad_norm": 18.659744262695312, + "learning_rate": 2.3965249644207072e-05, + "loss": 1.6686, + "step": 6477 + }, + { + "epoch": 1.1086770494608933, + "grad_norm": 17.61865997314453, + "learning_rate": 2.3956009654642333e-05, + "loss": 1.625, + "step": 6478 + }, + { + "epoch": 1.1088481944206743, + "grad_norm": 22.407318115234375, + "learning_rate": 2.394676438101855e-05, + "loss": 1.9881, + "step": 6479 + }, + { + "epoch": 1.1090193393804553, + "grad_norm": 10.573683738708496, + "learning_rate": 2.393751382879046e-05, + "loss": 0.6952, + "step": 6480 + }, + { + "epoch": 1.1091904843402363, + "grad_norm": 15.807626724243164, + "learning_rate": 2.3928258003415902e-05, + "loss": 1.4837, + "step": 6481 + }, + { + "epoch": 1.109361629300017, + "grad_norm": 8.468942642211914, + "learning_rate": 2.391899691035582e-05, + "loss": 0.5184, + "step": 6482 + }, + { + "epoch": 1.109532774259798, + "grad_norm": 14.21964168548584, + "learning_rate": 2.390973055507428e-05, + "loss": 1.099, + "step": 6483 + }, + { + "epoch": 1.109703919219579, + "grad_norm": 11.065580368041992, + "learning_rate": 2.3900458943038437e-05, + "loss": 0.8836, + "step": 6484 + }, + { + "epoch": 1.10987506417936, + "grad_norm": 23.09280776977539, + "learning_rate": 2.3891182079718563e-05, + "loss": 2.7036, + "step": 6485 + }, + { + "epoch": 1.1100462091391408, + "grad_norm": 13.219036102294922, + "learning_rate": 2.388189997058802e-05, + "loss": 1.2623, + "step": 6486 + }, + { + "epoch": 1.1102173540989217, + "grad_norm": 22.590919494628906, + "learning_rate": 2.3872612621123265e-05, + "loss": 2.8811, + "step": 6487 + }, + { + "epoch": 1.1103884990587027, + "grad_norm": 2.921581268310547, + "learning_rate": 2.3863320036803846e-05, + "loss": 0.3436, + "step": 6488 + }, + { + "epoch": 1.1105596440184837, + "grad_norm": 5.190805912017822, + "learning_rate": 2.3854022223112404e-05, + "loss": 0.4765, + "step": 6489 + }, + { + "epoch": 1.1107307889782645, + "grad_norm": 8.173125267028809, + "learning_rate": 2.3844719185534663e-05, + "loss": 1.0327, + "step": 6490 + }, + { + "epoch": 1.1109019339380455, + "grad_norm": 21.659151077270508, + "learning_rate": 2.383541092955943e-05, + "loss": 2.0525, + "step": 6491 + }, + { + "epoch": 1.1110730788978265, + "grad_norm": 16.777814865112305, + "learning_rate": 2.3826097460678588e-05, + "loss": 1.1159, + "step": 6492 + }, + { + "epoch": 1.1112442238576075, + "grad_norm": 16.834827423095703, + "learning_rate": 2.3816778784387094e-05, + "loss": 1.4569, + "step": 6493 + }, + { + "epoch": 1.1114153688173882, + "grad_norm": 6.3645853996276855, + "learning_rate": 2.3807454906182992e-05, + "loss": 0.6127, + "step": 6494 + }, + { + "epoch": 1.1115865137771692, + "grad_norm": 18.301231384277344, + "learning_rate": 2.3798125831567373e-05, + "loss": 1.6497, + "step": 6495 + }, + { + "epoch": 1.1117576587369502, + "grad_norm": 46.49019241333008, + "learning_rate": 2.378879156604441e-05, + "loss": 6.3514, + "step": 6496 + }, + { + "epoch": 1.1119288036967312, + "grad_norm": 13.724831581115723, + "learning_rate": 2.3779452115121332e-05, + "loss": 1.2679, + "step": 6497 + }, + { + "epoch": 1.1120999486565122, + "grad_norm": 13.0844144821167, + "learning_rate": 2.3770107484308435e-05, + "loss": 1.3047, + "step": 6498 + }, + { + "epoch": 1.112271093616293, + "grad_norm": 23.980138778686523, + "learning_rate": 2.376075767911905e-05, + "loss": 3.0273, + "step": 6499 + }, + { + "epoch": 1.112442238576074, + "grad_norm": 28.75403594970703, + 
"learning_rate": 2.375140270506959e-05, + "loss": 5.4808, + "step": 6500 + }, + { + "epoch": 1.112613383535855, + "grad_norm": 23.089555740356445, + "learning_rate": 2.3742042567679498e-05, + "loss": 2.0221, + "step": 6501 + }, + { + "epoch": 1.1127845284956357, + "grad_norm": 18.15275001525879, + "learning_rate": 2.373267727247127e-05, + "loss": 1.4721, + "step": 6502 + }, + { + "epoch": 1.1129556734554167, + "grad_norm": 2.7961483001708984, + "learning_rate": 2.372330682497045e-05, + "loss": 0.3007, + "step": 6503 + }, + { + "epoch": 1.1131268184151977, + "grad_norm": 6.274376392364502, + "learning_rate": 2.3713931230705603e-05, + "loss": 0.4561, + "step": 6504 + }, + { + "epoch": 1.1132979633749787, + "grad_norm": 15.660245895385742, + "learning_rate": 2.3704550495208356e-05, + "loss": 1.3623, + "step": 6505 + }, + { + "epoch": 1.1134691083347596, + "grad_norm": 25.518728256225586, + "learning_rate": 2.369516462401335e-05, + "loss": 5.5308, + "step": 6506 + }, + { + "epoch": 1.1136402532945404, + "grad_norm": 20.65359878540039, + "learning_rate": 2.3685773622658262e-05, + "loss": 1.6647, + "step": 6507 + }, + { + "epoch": 1.1138113982543214, + "grad_norm": 22.48552703857422, + "learning_rate": 2.3676377496683803e-05, + "loss": 1.9629, + "step": 6508 + }, + { + "epoch": 1.1139825432141024, + "grad_norm": 11.458088874816895, + "learning_rate": 2.366697625163369e-05, + "loss": 1.1358, + "step": 6509 + }, + { + "epoch": 1.1141536881738834, + "grad_norm": 21.160249710083008, + "learning_rate": 2.365756989305469e-05, + "loss": 1.4879, + "step": 6510 + }, + { + "epoch": 1.1143248331336641, + "grad_norm": 17.153186798095703, + "learning_rate": 2.3648158426496556e-05, + "loss": 2.1674, + "step": 6511 + }, + { + "epoch": 1.1144959780934451, + "grad_norm": 21.04764175415039, + "learning_rate": 2.3638741857512063e-05, + "loss": 1.7981, + "step": 6512 + }, + { + "epoch": 1.1146671230532261, + "grad_norm": 0.6828592419624329, + "learning_rate": 2.3629320191657012e-05, + "loss": 0.1629, + "step": 6513 + }, + { + "epoch": 1.1148382680130071, + "grad_norm": 17.496509552001953, + "learning_rate": 2.3619893434490194e-05, + "loss": 1.4577, + "step": 6514 + }, + { + "epoch": 1.1150094129727879, + "grad_norm": 18.580137252807617, + "learning_rate": 2.3610461591573408e-05, + "loss": 1.6204, + "step": 6515 + }, + { + "epoch": 1.1151805579325689, + "grad_norm": 24.80152702331543, + "learning_rate": 2.3601024668471462e-05, + "loss": 2.0046, + "step": 6516 + }, + { + "epoch": 1.1153517028923499, + "grad_norm": 14.06851577758789, + "learning_rate": 2.359158267075215e-05, + "loss": 1.3292, + "step": 6517 + }, + { + "epoch": 1.1155228478521308, + "grad_norm": 31.47075653076172, + "learning_rate": 2.3582135603986267e-05, + "loss": 5.1749, + "step": 6518 + }, + { + "epoch": 1.1156939928119116, + "grad_norm": 19.141130447387695, + "learning_rate": 2.3572683473747593e-05, + "loss": 1.5537, + "step": 6519 + }, + { + "epoch": 1.1158651377716926, + "grad_norm": 20.078285217285156, + "learning_rate": 2.35632262856129e-05, + "loss": 1.798, + "step": 6520 + }, + { + "epoch": 1.1160362827314736, + "grad_norm": 25.23849105834961, + "learning_rate": 2.3553764045161943e-05, + "loss": 2.3348, + "step": 6521 + }, + { + "epoch": 1.1162074276912546, + "grad_norm": 0.8329445719718933, + "learning_rate": 2.3544296757977465e-05, + "loss": 0.162, + "step": 6522 + }, + { + "epoch": 1.1163785726510354, + "grad_norm": 3.9662768840789795, + "learning_rate": 2.3534824429645163e-05, + "loss": 0.5279, + "step": 6523 + }, + { + "epoch": 
1.1165497176108163, + "grad_norm": 16.716726303100586, + "learning_rate": 2.352534706575374e-05, + "loss": 1.6044, + "step": 6524 + }, + { + "epoch": 1.1167208625705973, + "grad_norm": 0.695058286190033, + "learning_rate": 2.3515864671894853e-05, + "loss": 0.1527, + "step": 6525 + }, + { + "epoch": 1.1168920075303783, + "grad_norm": 13.73619270324707, + "learning_rate": 2.3506377253663125e-05, + "loss": 1.3362, + "step": 6526 + }, + { + "epoch": 1.117063152490159, + "grad_norm": 23.782678604125977, + "learning_rate": 2.3496884816656145e-05, + "loss": 0.9277, + "step": 6527 + }, + { + "epoch": 1.11723429744994, + "grad_norm": 14.975417137145996, + "learning_rate": 2.348738736647447e-05, + "loss": 1.1147, + "step": 6528 + }, + { + "epoch": 1.117405442409721, + "grad_norm": 24.206743240356445, + "learning_rate": 2.3477884908721605e-05, + "loss": 3.5489, + "step": 6529 + }, + { + "epoch": 1.117576587369502, + "grad_norm": 14.4812650680542, + "learning_rate": 2.3468377449004014e-05, + "loss": 1.1981, + "step": 6530 + }, + { + "epoch": 1.1177477323292828, + "grad_norm": 18.3228702545166, + "learning_rate": 2.3458864992931118e-05, + "loss": 1.5354, + "step": 6531 + }, + { + "epoch": 1.1179188772890638, + "grad_norm": 8.11532211303711, + "learning_rate": 2.3449347546115275e-05, + "loss": 0.8612, + "step": 6532 + }, + { + "epoch": 1.1180900222488448, + "grad_norm": 22.54753303527832, + "learning_rate": 2.34398251141718e-05, + "loss": 1.9849, + "step": 6533 + }, + { + "epoch": 1.1182611672086258, + "grad_norm": 8.902283668518066, + "learning_rate": 2.343029770271893e-05, + "loss": 0.74, + "step": 6534 + }, + { + "epoch": 1.1184323121684066, + "grad_norm": 20.976781845092773, + "learning_rate": 2.3420765317377864e-05, + "loss": 1.906, + "step": 6535 + }, + { + "epoch": 1.1186034571281875, + "grad_norm": 18.906484603881836, + "learning_rate": 2.3411227963772713e-05, + "loss": 2.3091, + "step": 6536 + }, + { + "epoch": 1.1187746020879685, + "grad_norm": 13.609125137329102, + "learning_rate": 2.340168564753054e-05, + "loss": 1.0348, + "step": 6537 + }, + { + "epoch": 1.1189457470477495, + "grad_norm": 17.64080047607422, + "learning_rate": 2.339213837428132e-05, + "loss": 1.4005, + "step": 6538 + }, + { + "epoch": 1.1191168920075303, + "grad_norm": 12.737320899963379, + "learning_rate": 2.338258614965796e-05, + "loss": 1.1334, + "step": 6539 + }, + { + "epoch": 1.1192880369673113, + "grad_norm": 4.324669361114502, + "learning_rate": 2.3373028979296286e-05, + "loss": 0.4197, + "step": 6540 + }, + { + "epoch": 1.1194591819270923, + "grad_norm": 24.128999710083008, + "learning_rate": 2.336346686883504e-05, + "loss": 3.4262, + "step": 6541 + }, + { + "epoch": 1.1196303268868733, + "grad_norm": 8.432863235473633, + "learning_rate": 2.3353899823915887e-05, + "loss": 0.7029, + "step": 6542 + }, + { + "epoch": 1.119801471846654, + "grad_norm": 21.715240478515625, + "learning_rate": 2.3344327850183395e-05, + "loss": 1.9738, + "step": 6543 + }, + { + "epoch": 1.119972616806435, + "grad_norm": 15.863061904907227, + "learning_rate": 2.3334750953285044e-05, + "loss": 1.254, + "step": 6544 + }, + { + "epoch": 1.120143761766216, + "grad_norm": 13.353907585144043, + "learning_rate": 2.3325169138871214e-05, + "loss": 0.964, + "step": 6545 + }, + { + "epoch": 1.120314906725997, + "grad_norm": 11.292017936706543, + "learning_rate": 2.3315582412595195e-05, + "loss": 0.841, + "step": 6546 + }, + { + "epoch": 1.1204860516857778, + "grad_norm": 12.882715225219727, + "learning_rate": 2.3305990780113163e-05, + "loss": 0.9168, + 
"step": 6547 + }, + { + "epoch": 1.1206571966455587, + "grad_norm": 2.7862045764923096, + "learning_rate": 2.3296394247084206e-05, + "loss": 0.3106, + "step": 6548 + }, + { + "epoch": 1.1208283416053397, + "grad_norm": 2.7119204998016357, + "learning_rate": 2.328679281917028e-05, + "loss": 0.3005, + "step": 6549 + }, + { + "epoch": 1.1209994865651207, + "grad_norm": 17.22968864440918, + "learning_rate": 2.327718650203624e-05, + "loss": 1.2516, + "step": 6550 + }, + { + "epoch": 1.1211706315249015, + "grad_norm": 11.979629516601562, + "learning_rate": 2.3267575301349852e-05, + "loss": 0.9529, + "step": 6551 + }, + { + "epoch": 1.1213417764846825, + "grad_norm": 103.80817413330078, + "learning_rate": 2.3257959222781708e-05, + "loss": 8.1736, + "step": 6552 + }, + { + "epoch": 1.1215129214444635, + "grad_norm": 23.488616943359375, + "learning_rate": 2.3248338272005332e-05, + "loss": 3.1435, + "step": 6553 + }, + { + "epoch": 1.1216840664042445, + "grad_norm": 23.113727569580078, + "learning_rate": 2.323871245469709e-05, + "loss": 3.0527, + "step": 6554 + }, + { + "epoch": 1.1218552113640254, + "grad_norm": 20.720781326293945, + "learning_rate": 2.3229081776536224e-05, + "loss": 2.6047, + "step": 6555 + }, + { + "epoch": 1.1220263563238062, + "grad_norm": 23.30956268310547, + "learning_rate": 2.3219446243204853e-05, + "loss": 2.7747, + "step": 6556 + }, + { + "epoch": 1.1221975012835872, + "grad_norm": 20.627832412719727, + "learning_rate": 2.3209805860387956e-05, + "loss": 2.2667, + "step": 6557 + }, + { + "epoch": 1.1223686462433682, + "grad_norm": 13.168161392211914, + "learning_rate": 2.3200160633773374e-05, + "loss": 1.1686, + "step": 6558 + }, + { + "epoch": 1.122539791203149, + "grad_norm": 0.711658239364624, + "learning_rate": 2.3190510569051806e-05, + "loss": 0.1682, + "step": 6559 + }, + { + "epoch": 1.12271093616293, + "grad_norm": 16.090927124023438, + "learning_rate": 2.31808556719168e-05, + "loss": 1.4364, + "step": 6560 + }, + { + "epoch": 1.122882081122711, + "grad_norm": 23.919294357299805, + "learning_rate": 2.3171195948064766e-05, + "loss": 3.6051, + "step": 6561 + }, + { + "epoch": 1.123053226082492, + "grad_norm": 21.282033920288086, + "learning_rate": 2.316153140319495e-05, + "loss": 2.1329, + "step": 6562 + }, + { + "epoch": 1.123224371042273, + "grad_norm": 17.056264877319336, + "learning_rate": 2.315186204300945e-05, + "loss": 1.4888, + "step": 6563 + }, + { + "epoch": 1.1233955160020537, + "grad_norm": 3.378387689590454, + "learning_rate": 2.3142187873213202e-05, + "loss": 0.3428, + "step": 6564 + }, + { + "epoch": 1.1235666609618347, + "grad_norm": 5.910409450531006, + "learning_rate": 2.313250889951398e-05, + "loss": 0.6965, + "step": 6565 + }, + { + "epoch": 1.1237378059216157, + "grad_norm": 15.086389541625977, + "learning_rate": 2.3122825127622397e-05, + "loss": 1.2835, + "step": 6566 + }, + { + "epoch": 1.1239089508813966, + "grad_norm": 13.626158714294434, + "learning_rate": 2.311313656325189e-05, + "loss": 0.9983, + "step": 6567 + }, + { + "epoch": 1.1240800958411774, + "grad_norm": 25.3629207611084, + "learning_rate": 2.3103443212118728e-05, + "loss": 3.3747, + "step": 6568 + }, + { + "epoch": 1.1242512408009584, + "grad_norm": 4.067588806152344, + "learning_rate": 2.3093745079942e-05, + "loss": 0.3772, + "step": 6569 + }, + { + "epoch": 1.1244223857607394, + "grad_norm": 2.650106906890869, + "learning_rate": 2.3084042172443615e-05, + "loss": 0.4619, + "step": 6570 + }, + { + "epoch": 1.1245935307205204, + "grad_norm": 23.48897933959961, + "learning_rate": 
2.307433449534831e-05, + "loss": 3.1404, + "step": 6571 + }, + { + "epoch": 1.1247646756803011, + "grad_norm": 18.580120086669922, + "learning_rate": 2.3064622054383628e-05, + "loss": 1.5701, + "step": 6572 + }, + { + "epoch": 1.1249358206400821, + "grad_norm": 9.494951248168945, + "learning_rate": 2.3054904855279924e-05, + "loss": 0.648, + "step": 6573 + }, + { + "epoch": 1.1251069655998631, + "grad_norm": 24.08775520324707, + "learning_rate": 2.304518290377035e-05, + "loss": 2.1751, + "step": 6574 + }, + { + "epoch": 1.125278110559644, + "grad_norm": 21.949031829833984, + "learning_rate": 2.3035456205590884e-05, + "loss": 2.5299, + "step": 6575 + }, + { + "epoch": 1.1254492555194249, + "grad_norm": 15.016345024108887, + "learning_rate": 2.3025724766480288e-05, + "loss": 1.218, + "step": 6576 + }, + { + "epoch": 1.1256204004792059, + "grad_norm": 23.55092430114746, + "learning_rate": 2.301598859218013e-05, + "loss": 2.9416, + "step": 6577 + }, + { + "epoch": 1.1257915454389869, + "grad_norm": 8.029715538024902, + "learning_rate": 2.300624768843476e-05, + "loss": 0.5919, + "step": 6578 + }, + { + "epoch": 1.1259626903987678, + "grad_norm": 13.06380844116211, + "learning_rate": 2.2996502060991327e-05, + "loss": 1.0507, + "step": 6579 + }, + { + "epoch": 1.1261338353585486, + "grad_norm": 15.745100021362305, + "learning_rate": 2.2986751715599767e-05, + "loss": 1.4026, + "step": 6580 + }, + { + "epoch": 1.1263049803183296, + "grad_norm": 6.315836429595947, + "learning_rate": 2.2976996658012805e-05, + "loss": 0.5273, + "step": 6581 + }, + { + "epoch": 1.1264761252781106, + "grad_norm": 13.709003448486328, + "learning_rate": 2.296723689398593e-05, + "loss": 1.1541, + "step": 6582 + }, + { + "epoch": 1.1266472702378916, + "grad_norm": 19.77781867980957, + "learning_rate": 2.295747242927742e-05, + "loss": 1.7555, + "step": 6583 + }, + { + "epoch": 1.1268184151976723, + "grad_norm": 19.866291046142578, + "learning_rate": 2.2947703269648323e-05, + "loss": 2.42, + "step": 6584 + }, + { + "epoch": 1.1269895601574533, + "grad_norm": 2.5817923545837402, + "learning_rate": 2.293792942086246e-05, + "loss": 0.3039, + "step": 6585 + }, + { + "epoch": 1.1271607051172343, + "grad_norm": 24.967405319213867, + "learning_rate": 2.2928150888686418e-05, + "loss": 3.0059, + "step": 6586 + }, + { + "epoch": 1.1273318500770153, + "grad_norm": 15.376130104064941, + "learning_rate": 2.2918367678889542e-05, + "loss": 1.2198, + "step": 6587 + }, + { + "epoch": 1.127502995036796, + "grad_norm": 23.588638305664062, + "learning_rate": 2.290857979724394e-05, + "loss": 2.7913, + "step": 6588 + }, + { + "epoch": 1.127674139996577, + "grad_norm": 13.602372169494629, + "learning_rate": 2.289878724952448e-05, + "loss": 1.3017, + "step": 6589 + }, + { + "epoch": 1.127845284956358, + "grad_norm": 12.126798629760742, + "learning_rate": 2.2888990041508775e-05, + "loss": 0.86, + "step": 6590 + }, + { + "epoch": 1.128016429916139, + "grad_norm": 25.87506866455078, + "learning_rate": 2.28791881789772e-05, + "loss": 2.8479, + "step": 6591 + }, + { + "epoch": 1.1281875748759198, + "grad_norm": 2.148634195327759, + "learning_rate": 2.2869381667712853e-05, + "loss": 0.3244, + "step": 6592 + }, + { + "epoch": 1.1283587198357008, + "grad_norm": 23.239213943481445, + "learning_rate": 2.2859570513501617e-05, + "loss": 2.9426, + "step": 6593 + }, + { + "epoch": 1.1285298647954818, + "grad_norm": 15.510618209838867, + "learning_rate": 2.2849754722132058e-05, + "loss": 1.3888, + "step": 6594 + }, + { + "epoch": 1.1287010097552628, + "grad_norm": 
5.460433006286621, + "learning_rate": 2.2839934299395526e-05, + "loss": 0.3742, + "step": 6595 + }, + { + "epoch": 1.1288721547150438, + "grad_norm": 15.822589874267578, + "learning_rate": 2.283010925108608e-05, + "loss": 1.7936, + "step": 6596 + }, + { + "epoch": 1.1290432996748245, + "grad_norm": 19.703838348388672, + "learning_rate": 2.2820279583000514e-05, + "loss": 2.0802, + "step": 6597 + }, + { + "epoch": 1.1292144446346055, + "grad_norm": 5.518841743469238, + "learning_rate": 2.2810445300938342e-05, + "loss": 0.4586, + "step": 6598 + }, + { + "epoch": 1.1293855895943865, + "grad_norm": 15.560166358947754, + "learning_rate": 2.2800606410701813e-05, + "loss": 1.2736, + "step": 6599 + }, + { + "epoch": 1.1295567345541673, + "grad_norm": 1.1018235683441162, + "learning_rate": 2.279076291809588e-05, + "loss": 0.2273, + "step": 6600 + }, + { + "epoch": 1.1297278795139483, + "grad_norm": 6.1512064933776855, + "learning_rate": 2.2780914828928223e-05, + "loss": 0.5965, + "step": 6601 + }, + { + "epoch": 1.1298990244737293, + "grad_norm": 7.797390937805176, + "learning_rate": 2.277106214900923e-05, + "loss": 0.8642, + "step": 6602 + }, + { + "epoch": 1.1300701694335102, + "grad_norm": 4.10001277923584, + "learning_rate": 2.2761204884151997e-05, + "loss": 0.3231, + "step": 6603 + }, + { + "epoch": 1.1302413143932912, + "grad_norm": 5.270298957824707, + "learning_rate": 2.2751343040172314e-05, + "loss": 0.4003, + "step": 6604 + }, + { + "epoch": 1.130412459353072, + "grad_norm": 12.963715553283691, + "learning_rate": 2.2741476622888697e-05, + "loss": 1.1548, + "step": 6605 + }, + { + "epoch": 1.130583604312853, + "grad_norm": 49.15763473510742, + "learning_rate": 2.2731605638122342e-05, + "loss": 6.4104, + "step": 6606 + }, + { + "epoch": 1.130754749272634, + "grad_norm": 17.410070419311523, + "learning_rate": 2.2721730091697142e-05, + "loss": 2.0867, + "step": 6607 + }, + { + "epoch": 1.1309258942324147, + "grad_norm": 25.885358810424805, + "learning_rate": 2.271184998943969e-05, + "loss": 5.2408, + "step": 6608 + }, + { + "epoch": 1.1310970391921957, + "grad_norm": 17.080461502075195, + "learning_rate": 2.2701965337179254e-05, + "loss": 1.3972, + "step": 6609 + }, + { + "epoch": 1.1312681841519767, + "grad_norm": 37.435577392578125, + "learning_rate": 2.26920761407478e-05, + "loss": 1.8542, + "step": 6610 + }, + { + "epoch": 1.1314393291117577, + "grad_norm": 16.12346649169922, + "learning_rate": 2.2682182405979963e-05, + "loss": 1.6343, + "step": 6611 + }, + { + "epoch": 1.1316104740715387, + "grad_norm": 5.131670951843262, + "learning_rate": 2.2672284138713066e-05, + "loss": 0.3993, + "step": 6612 + }, + { + "epoch": 1.1317816190313195, + "grad_norm": 16.831790924072266, + "learning_rate": 2.2662381344787106e-05, + "loss": 1.0834, + "step": 6613 + }, + { + "epoch": 1.1319527639911005, + "grad_norm": 2.409867286682129, + "learning_rate": 2.265247403004473e-05, + "loss": 0.1754, + "step": 6614 + }, + { + "epoch": 1.1321239089508814, + "grad_norm": 25.386507034301758, + "learning_rate": 2.264256220033128e-05, + "loss": 2.5655, + "step": 6615 + }, + { + "epoch": 1.1322950539106622, + "grad_norm": 5.757489204406738, + "learning_rate": 2.263264586149475e-05, + "loss": 0.5487, + "step": 6616 + }, + { + "epoch": 1.1324661988704432, + "grad_norm": 10.322013854980469, + "learning_rate": 2.2622725019385794e-05, + "loss": 0.7429, + "step": 6617 + }, + { + "epoch": 1.1326373438302242, + "grad_norm": 22.98650550842285, + "learning_rate": 2.2612799679857722e-05, + "loss": 2.7043, + "step": 6618 + }, + { 
+ "epoch": 1.1328084887900052, + "grad_norm": 14.2614164352417, + "learning_rate": 2.2602869848766497e-05, + "loss": 1.2144, + "step": 6619 + }, + { + "epoch": 1.1329796337497862, + "grad_norm": 13.950614929199219, + "learning_rate": 2.2592935531970742e-05, + "loss": 1.2144, + "step": 6620 + }, + { + "epoch": 1.133150778709567, + "grad_norm": 16.276235580444336, + "learning_rate": 2.2582996735331717e-05, + "loss": 1.7681, + "step": 6621 + }, + { + "epoch": 1.133321923669348, + "grad_norm": 22.497079849243164, + "learning_rate": 2.2573053464713314e-05, + "loss": 2.842, + "step": 6622 + }, + { + "epoch": 1.133493068629129, + "grad_norm": 20.166521072387695, + "learning_rate": 2.2563105725982094e-05, + "loss": 1.5608, + "step": 6623 + }, + { + "epoch": 1.13366421358891, + "grad_norm": 5.348848819732666, + "learning_rate": 2.2553153525007227e-05, + "loss": 0.3772, + "step": 6624 + }, + { + "epoch": 1.1338353585486907, + "grad_norm": 9.403623580932617, + "learning_rate": 2.2543196867660534e-05, + "loss": 0.9717, + "step": 6625 + }, + { + "epoch": 1.1340065035084717, + "grad_norm": 5.1142497062683105, + "learning_rate": 2.2533235759816454e-05, + "loss": 0.4947, + "step": 6626 + }, + { + "epoch": 1.1341776484682526, + "grad_norm": 14.59904956817627, + "learning_rate": 2.2523270207352046e-05, + "loss": 1.3381, + "step": 6627 + }, + { + "epoch": 1.1343487934280336, + "grad_norm": 54.968746185302734, + "learning_rate": 2.251330021614702e-05, + "loss": 6.1858, + "step": 6628 + }, + { + "epoch": 1.1345199383878144, + "grad_norm": 23.6578311920166, + "learning_rate": 2.250332579208367e-05, + "loss": 1.9513, + "step": 6629 + }, + { + "epoch": 1.1346910833475954, + "grad_norm": 6.994755268096924, + "learning_rate": 2.249334694104693e-05, + "loss": 0.7519, + "step": 6630 + }, + { + "epoch": 1.1348622283073764, + "grad_norm": 17.739059448242188, + "learning_rate": 2.2483363668924332e-05, + "loss": 1.5266, + "step": 6631 + }, + { + "epoch": 1.1350333732671574, + "grad_norm": 18.796348571777344, + "learning_rate": 2.2473375981606027e-05, + "loss": 1.5246, + "step": 6632 + }, + { + "epoch": 1.1352045182269381, + "grad_norm": 14.201302528381348, + "learning_rate": 2.246338388498476e-05, + "loss": 1.2143, + "step": 6633 + }, + { + "epoch": 1.1353756631867191, + "grad_norm": 25.871665954589844, + "learning_rate": 2.245338738495588e-05, + "loss": 5.4664, + "step": 6634 + }, + { + "epoch": 1.1355468081465, + "grad_norm": 13.160964012145996, + "learning_rate": 2.2443386487417345e-05, + "loss": 0.9775, + "step": 6635 + }, + { + "epoch": 1.135717953106281, + "grad_norm": 18.273212432861328, + "learning_rate": 2.2433381198269694e-05, + "loss": 1.9524, + "step": 6636 + }, + { + "epoch": 1.1358890980660619, + "grad_norm": 12.089645385742188, + "learning_rate": 2.2423371523416068e-05, + "loss": 1.1928, + "step": 6637 + }, + { + "epoch": 1.1360602430258429, + "grad_norm": 19.688276290893555, + "learning_rate": 2.2413357468762182e-05, + "loss": 2.09, + "step": 6638 + }, + { + "epoch": 1.1362313879856238, + "grad_norm": 20.712020874023438, + "learning_rate": 2.2403339040216348e-05, + "loss": 2.2678, + "step": 6639 + }, + { + "epoch": 1.1364025329454048, + "grad_norm": 4.0690693855285645, + "learning_rate": 2.239331624368946e-05, + "loss": 0.3117, + "step": 6640 + }, + { + "epoch": 1.1365736779051856, + "grad_norm": 14.07337474822998, + "learning_rate": 2.2383289085094966e-05, + "loss": 1.1598, + "step": 6641 + }, + { + "epoch": 1.1367448228649666, + "grad_norm": 21.29363441467285, + "learning_rate": 2.2373257570348917e-05, 
+ "loss": 1.1275, + "step": 6642 + }, + { + "epoch": 1.1369159678247476, + "grad_norm": 11.619843482971191, + "learning_rate": 2.2363221705369926e-05, + "loss": 1.3518, + "step": 6643 + }, + { + "epoch": 1.1370871127845286, + "grad_norm": 16.24192237854004, + "learning_rate": 2.2353181496079156e-05, + "loss": 1.3699, + "step": 6644 + }, + { + "epoch": 1.1372582577443096, + "grad_norm": 20.99095916748047, + "learning_rate": 2.234313694840035e-05, + "loss": 1.9589, + "step": 6645 + }, + { + "epoch": 1.1374294027040903, + "grad_norm": 12.040764808654785, + "learning_rate": 2.2333088068259812e-05, + "loss": 0.842, + "step": 6646 + }, + { + "epoch": 1.1376005476638713, + "grad_norm": 62.060768127441406, + "learning_rate": 2.2323034861586392e-05, + "loss": 6.5968, + "step": 6647 + }, + { + "epoch": 1.1377716926236523, + "grad_norm": 14.743884086608887, + "learning_rate": 2.2312977334311492e-05, + "loss": 1.351, + "step": 6648 + }, + { + "epoch": 1.137942837583433, + "grad_norm": 16.910991668701172, + "learning_rate": 2.2302915492369072e-05, + "loss": 1.6767, + "step": 6649 + }, + { + "epoch": 1.138113982543214, + "grad_norm": 25.29916763305664, + "learning_rate": 2.2292849341695637e-05, + "loss": 2.5186, + "step": 6650 + }, + { + "epoch": 1.138285127502995, + "grad_norm": 10.531685829162598, + "learning_rate": 2.2282778888230224e-05, + "loss": 0.8632, + "step": 6651 + }, + { + "epoch": 1.138456272462776, + "grad_norm": 36.49444580078125, + "learning_rate": 2.227270413791442e-05, + "loss": 5.9198, + "step": 6652 + }, + { + "epoch": 1.138627417422557, + "grad_norm": 2.5288755893707275, + "learning_rate": 2.226262509669235e-05, + "loss": 0.2905, + "step": 6653 + }, + { + "epoch": 1.1387985623823378, + "grad_norm": 1.1701017618179321, + "learning_rate": 2.225254177051065e-05, + "loss": 0.2359, + "step": 6654 + }, + { + "epoch": 1.1389697073421188, + "grad_norm": 0.9904645085334778, + "learning_rate": 2.2242454165318507e-05, + "loss": 0.169, + "step": 6655 + }, + { + "epoch": 1.1391408523018998, + "grad_norm": 18.088199615478516, + "learning_rate": 2.223236228706761e-05, + "loss": 1.7993, + "step": 6656 + }, + { + "epoch": 1.1393119972616805, + "grad_norm": 25.48227882385254, + "learning_rate": 2.2222266141712196e-05, + "loss": 3.7581, + "step": 6657 + }, + { + "epoch": 1.1394831422214615, + "grad_norm": 17.415260314941406, + "learning_rate": 2.2212165735209e-05, + "loss": 1.6087, + "step": 6658 + }, + { + "epoch": 1.1396542871812425, + "grad_norm": 12.266166687011719, + "learning_rate": 2.2202061073517285e-05, + "loss": 1.001, + "step": 6659 + }, + { + "epoch": 1.1398254321410235, + "grad_norm": 10.487218856811523, + "learning_rate": 2.21919521625988e-05, + "loss": 0.927, + "step": 6660 + }, + { + "epoch": 1.1399965771008045, + "grad_norm": 15.511195182800293, + "learning_rate": 2.2181839008417832e-05, + "loss": 1.3514, + "step": 6661 + }, + { + "epoch": 1.1401677220605853, + "grad_norm": 19.942005157470703, + "learning_rate": 2.217172161694115e-05, + "loss": 1.5787, + "step": 6662 + }, + { + "epoch": 1.1403388670203662, + "grad_norm": 21.329761505126953, + "learning_rate": 2.2161599994138035e-05, + "loss": 2.9634, + "step": 6663 + }, + { + "epoch": 1.1405100119801472, + "grad_norm": 3.755098342895508, + "learning_rate": 2.2151474145980255e-05, + "loss": 0.3287, + "step": 6664 + }, + { + "epoch": 1.140681156939928, + "grad_norm": 5.238820552825928, + "learning_rate": 2.2141344078442076e-05, + "loss": 0.3482, + "step": 6665 + }, + { + "epoch": 1.140852301899709, + "grad_norm": 21.611568450927734, + 
"learning_rate": 2.2131209797500253e-05, + "loss": 2.0606, + "step": 6666 + }, + { + "epoch": 1.14102344685949, + "grad_norm": 22.647153854370117, + "learning_rate": 2.2121071309134033e-05, + "loss": 2.3558, + "step": 6667 + }, + { + "epoch": 1.141194591819271, + "grad_norm": 14.586101531982422, + "learning_rate": 2.211092861932513e-05, + "loss": 1.4945, + "step": 6668 + }, + { + "epoch": 1.141365736779052, + "grad_norm": 17.73663330078125, + "learning_rate": 2.210078173405775e-05, + "loss": 1.4825, + "step": 6669 + }, + { + "epoch": 1.1415368817388327, + "grad_norm": 8.52775764465332, + "learning_rate": 2.209063065931857e-05, + "loss": 0.9648, + "step": 6670 + }, + { + "epoch": 1.1417080266986137, + "grad_norm": 22.95386505126953, + "learning_rate": 2.2080475401096743e-05, + "loss": 3.0247, + "step": 6671 + }, + { + "epoch": 1.1418791716583947, + "grad_norm": 22.214920043945312, + "learning_rate": 2.2070315965383883e-05, + "loss": 2.5544, + "step": 6672 + }, + { + "epoch": 1.1420503166181757, + "grad_norm": 12.52978515625, + "learning_rate": 2.2060152358174066e-05, + "loss": 1.3018, + "step": 6673 + }, + { + "epoch": 1.1422214615779565, + "grad_norm": 15.886680603027344, + "learning_rate": 2.204998458546385e-05, + "loss": 1.4821, + "step": 6674 + }, + { + "epoch": 1.1423926065377374, + "grad_norm": 19.420021057128906, + "learning_rate": 2.2039812653252227e-05, + "loss": 1.6811, + "step": 6675 + }, + { + "epoch": 1.1425637514975184, + "grad_norm": 6.738640785217285, + "learning_rate": 2.202963656754065e-05, + "loss": 0.4493, + "step": 6676 + }, + { + "epoch": 1.1427348964572994, + "grad_norm": 15.929643630981445, + "learning_rate": 2.2019456334333026e-05, + "loss": 1.3787, + "step": 6677 + }, + { + "epoch": 1.1429060414170802, + "grad_norm": 15.84787654876709, + "learning_rate": 2.2009271959635712e-05, + "loss": 1.482, + "step": 6678 + }, + { + "epoch": 1.1430771863768612, + "grad_norm": 18.48777961730957, + "learning_rate": 2.19990834494575e-05, + "loss": 2.077, + "step": 6679 + }, + { + "epoch": 1.1432483313366422, + "grad_norm": 5.023829936981201, + "learning_rate": 2.1988890809809632e-05, + "loss": 0.4019, + "step": 6680 + }, + { + "epoch": 1.1434194762964232, + "grad_norm": 3.970644474029541, + "learning_rate": 2.1978694046705773e-05, + "loss": 0.3509, + "step": 6681 + }, + { + "epoch": 1.143590621256204, + "grad_norm": 1.667288064956665, + "learning_rate": 2.1968493166162032e-05, + "loss": 0.1854, + "step": 6682 + }, + { + "epoch": 1.143761766215985, + "grad_norm": 21.447723388671875, + "learning_rate": 2.1958288174196947e-05, + "loss": 3.0801, + "step": 6683 + }, + { + "epoch": 1.143932911175766, + "grad_norm": 27.4541072845459, + "learning_rate": 2.1948079076831472e-05, + "loss": 5.3531, + "step": 6684 + }, + { + "epoch": 1.1441040561355469, + "grad_norm": 5.7024312019348145, + "learning_rate": 2.193786588008899e-05, + "loss": 0.3647, + "step": 6685 + }, + { + "epoch": 1.1442752010953277, + "grad_norm": 0.5887414813041687, + "learning_rate": 2.1927648589995312e-05, + "loss": 0.1526, + "step": 6686 + }, + { + "epoch": 1.1444463460551086, + "grad_norm": 104.833984375, + "learning_rate": 2.1917427212578644e-05, + "loss": 6.8034, + "step": 6687 + }, + { + "epoch": 1.1446174910148896, + "grad_norm": 43.65744400024414, + "learning_rate": 2.1907201753869618e-05, + "loss": 1.8006, + "step": 6688 + }, + { + "epoch": 1.1447886359746706, + "grad_norm": 12.72282886505127, + "learning_rate": 2.189697221990127e-05, + "loss": 1.0824, + "step": 6689 + }, + { + "epoch": 1.1449597809344514, + 
"grad_norm": 22.606210708618164, + "learning_rate": 2.1886738616709038e-05, + "loss": 2.2436, + "step": 6690 + }, + { + "epoch": 1.1451309258942324, + "grad_norm": 7.099259376525879, + "learning_rate": 2.1876500950330766e-05, + "loss": 0.4624, + "step": 6691 + }, + { + "epoch": 1.1453020708540134, + "grad_norm": 102.5890884399414, + "learning_rate": 2.186625922680669e-05, + "loss": 8.1622, + "step": 6692 + }, + { + "epoch": 1.1454732158137944, + "grad_norm": 11.016797065734863, + "learning_rate": 2.1856013452179443e-05, + "loss": 0.7039, + "step": 6693 + }, + { + "epoch": 1.1456443607735751, + "grad_norm": 15.703362464904785, + "learning_rate": 2.1845763632494046e-05, + "loss": 1.5509, + "step": 6694 + }, + { + "epoch": 1.145815505733356, + "grad_norm": 70.36822509765625, + "learning_rate": 2.183550977379791e-05, + "loss": 8.0359, + "step": 6695 + }, + { + "epoch": 1.145986650693137, + "grad_norm": 22.124250411987305, + "learning_rate": 2.182525188214083e-05, + "loss": 2.2052, + "step": 6696 + }, + { + "epoch": 1.146157795652918, + "grad_norm": 18.040904998779297, + "learning_rate": 2.181498996357497e-05, + "loss": 2.0674, + "step": 6697 + }, + { + "epoch": 1.1463289406126989, + "grad_norm": 3.606860399246216, + "learning_rate": 2.1804724024154883e-05, + "loss": 0.3582, + "step": 6698 + }, + { + "epoch": 1.1465000855724798, + "grad_norm": 71.82786560058594, + "learning_rate": 2.1794454069937485e-05, + "loss": 7.7528, + "step": 6699 + }, + { + "epoch": 1.1466712305322608, + "grad_norm": 35.79607009887695, + "learning_rate": 2.1784180106982063e-05, + "loss": 5.6748, + "step": 6700 + }, + { + "epoch": 1.1468423754920418, + "grad_norm": 12.36203670501709, + "learning_rate": 2.1773902141350277e-05, + "loss": 1.2623, + "step": 6701 + }, + { + "epoch": 1.1470135204518228, + "grad_norm": 18.953035354614258, + "learning_rate": 2.1763620179106137e-05, + "loss": 1.9056, + "step": 6702 + }, + { + "epoch": 1.1471846654116036, + "grad_norm": 5.2524189949035645, + "learning_rate": 2.175333422631602e-05, + "loss": 0.4216, + "step": 6703 + }, + { + "epoch": 1.1473558103713846, + "grad_norm": 16.611282348632812, + "learning_rate": 2.1743044289048647e-05, + "loss": 1.5661, + "step": 6704 + }, + { + "epoch": 1.1475269553311656, + "grad_norm": 20.023853302001953, + "learning_rate": 2.1732750373375098e-05, + "loss": 1.8265, + "step": 6705 + }, + { + "epoch": 1.1476981002909463, + "grad_norm": 4.804213523864746, + "learning_rate": 2.1722452485368808e-05, + "loss": 0.3534, + "step": 6706 + }, + { + "epoch": 1.1478692452507273, + "grad_norm": 17.737071990966797, + "learning_rate": 2.171215063110553e-05, + "loss": 1.6854, + "step": 6707 + }, + { + "epoch": 1.1480403902105083, + "grad_norm": 11.945717811584473, + "learning_rate": 2.1701844816663387e-05, + "loss": 0.9422, + "step": 6708 + }, + { + "epoch": 1.1482115351702893, + "grad_norm": 13.96307373046875, + "learning_rate": 2.1691535048122818e-05, + "loss": 0.9708, + "step": 6709 + }, + { + "epoch": 1.1483826801300703, + "grad_norm": 35.298011779785156, + "learning_rate": 2.1681221331566605e-05, + "loss": 1.4859, + "step": 6710 + }, + { + "epoch": 1.148553825089851, + "grad_norm": 2.048851728439331, + "learning_rate": 2.167090367307986e-05, + "loss": 0.2684, + "step": 6711 + }, + { + "epoch": 1.148724970049632, + "grad_norm": 9.77960205078125, + "learning_rate": 2.1660582078750006e-05, + "loss": 0.5625, + "step": 6712 + }, + { + "epoch": 1.148896115009413, + "grad_norm": 15.336026191711426, + "learning_rate": 2.1650256554666804e-05, + "loss": 1.2984, + "step": 
6713 + }, + { + "epoch": 1.1490672599691938, + "grad_norm": 15.743756294250488, + "learning_rate": 2.163992710692233e-05, + "loss": 1.1362, + "step": 6714 + }, + { + "epoch": 1.1492384049289748, + "grad_norm": 17.978227615356445, + "learning_rate": 2.1629593741610977e-05, + "loss": 1.5398, + "step": 6715 + }, + { + "epoch": 1.1494095498887558, + "grad_norm": 22.40999412536621, + "learning_rate": 2.1619256464829436e-05, + "loss": 1.8867, + "step": 6716 + }, + { + "epoch": 1.1495806948485368, + "grad_norm": 16.83521842956543, + "learning_rate": 2.1608915282676728e-05, + "loss": 1.4942, + "step": 6717 + }, + { + "epoch": 1.1497518398083177, + "grad_norm": 23.83708381652832, + "learning_rate": 2.1598570201254156e-05, + "loss": 2.1358, + "step": 6718 + }, + { + "epoch": 1.1499229847680985, + "grad_norm": 31.449512481689453, + "learning_rate": 2.1588221226665338e-05, + "loss": 5.8379, + "step": 6719 + }, + { + "epoch": 1.1500941297278795, + "grad_norm": 7.079745292663574, + "learning_rate": 2.1577868365016182e-05, + "loss": 0.675, + "step": 6720 + }, + { + "epoch": 1.1502652746876605, + "grad_norm": 21.983989715576172, + "learning_rate": 2.156751162241489e-05, + "loss": 1.9488, + "step": 6721 + }, + { + "epoch": 1.1504364196474415, + "grad_norm": 17.203046798706055, + "learning_rate": 2.155715100497197e-05, + "loss": 1.6076, + "step": 6722 + }, + { + "epoch": 1.1506075646072222, + "grad_norm": 12.79857063293457, + "learning_rate": 2.1546786518800182e-05, + "loss": 1.0769, + "step": 6723 + }, + { + "epoch": 1.1507787095670032, + "grad_norm": 8.416800498962402, + "learning_rate": 2.1536418170014595e-05, + "loss": 0.6304, + "step": 6724 + }, + { + "epoch": 1.1509498545267842, + "grad_norm": 24.620590209960938, + "learning_rate": 2.1526045964732556e-05, + "loss": 2.5394, + "step": 6725 + }, + { + "epoch": 1.1511209994865652, + "grad_norm": 14.570639610290527, + "learning_rate": 2.1515669909073675e-05, + "loss": 1.0799, + "step": 6726 + }, + { + "epoch": 1.151292144446346, + "grad_norm": 34.030250549316406, + "learning_rate": 2.1505290009159843e-05, + "loss": 5.7075, + "step": 6727 + }, + { + "epoch": 1.151463289406127, + "grad_norm": 0.49403926730155945, + "learning_rate": 2.149490627111522e-05, + "loss": 0.1496, + "step": 6728 + }, + { + "epoch": 1.151634434365908, + "grad_norm": 17.340473175048828, + "learning_rate": 2.148451870106622e-05, + "loss": 1.7152, + "step": 6729 + }, + { + "epoch": 1.151805579325689, + "grad_norm": 10.648886680603027, + "learning_rate": 2.1474127305141524e-05, + "loss": 0.7918, + "step": 6730 + }, + { + "epoch": 1.1519767242854697, + "grad_norm": 5.201786518096924, + "learning_rate": 2.1463732089472083e-05, + "loss": 0.6296, + "step": 6731 + }, + { + "epoch": 1.1521478692452507, + "grad_norm": 5.574512481689453, + "learning_rate": 2.145333306019108e-05, + "loss": 0.4166, + "step": 6732 + }, + { + "epoch": 1.1523190142050317, + "grad_norm": 9.512883186340332, + "learning_rate": 2.144293022343396e-05, + "loss": 0.553, + "step": 6733 + }, + { + "epoch": 1.1524901591648127, + "grad_norm": 22.8580379486084, + "learning_rate": 2.1432523585338406e-05, + "loss": 3.0396, + "step": 6734 + }, + { + "epoch": 1.1526613041245934, + "grad_norm": 11.338586807250977, + "learning_rate": 2.142211315204436e-05, + "loss": 1.0252, + "step": 6735 + }, + { + "epoch": 1.1528324490843744, + "grad_norm": 16.34046173095703, + "learning_rate": 2.141169892969399e-05, + "loss": 1.2929, + "step": 6736 + }, + { + "epoch": 1.1530035940441554, + "grad_norm": 0.7246559858322144, + "learning_rate": 
2.1401280924431694e-05, + "loss": 0.1495, + "step": 6737 + }, + { + "epoch": 1.1531747390039364, + "grad_norm": 5.994287967681885, + "learning_rate": 2.1390859142404124e-05, + "loss": 0.43, + "step": 6738 + }, + { + "epoch": 1.1533458839637172, + "grad_norm": 21.98945426940918, + "learning_rate": 2.1380433589760144e-05, + "loss": 1.791, + "step": 6739 + }, + { + "epoch": 1.1535170289234982, + "grad_norm": 0.6687926650047302, + "learning_rate": 2.1370004272650837e-05, + "loss": 0.1457, + "step": 6740 + }, + { + "epoch": 1.1536881738832792, + "grad_norm": 20.061038970947266, + "learning_rate": 2.1359571197229526e-05, + "loss": 2.5826, + "step": 6741 + }, + { + "epoch": 1.1538593188430601, + "grad_norm": 2.983421802520752, + "learning_rate": 2.1349134369651732e-05, + "loss": 0.2926, + "step": 6742 + }, + { + "epoch": 1.154030463802841, + "grad_norm": 6.651993751525879, + "learning_rate": 2.1338693796075205e-05, + "loss": 0.42, + "step": 6743 + }, + { + "epoch": 1.154201608762622, + "grad_norm": 11.948166847229004, + "learning_rate": 2.13282494826599e-05, + "loss": 1.0267, + "step": 6744 + }, + { + "epoch": 1.1543727537224029, + "grad_norm": 27.58370590209961, + "learning_rate": 2.1317801435567967e-05, + "loss": 5.8972, + "step": 6745 + }, + { + "epoch": 1.1545438986821839, + "grad_norm": 8.554266929626465, + "learning_rate": 2.1307349660963782e-05, + "loss": 0.5493, + "step": 6746 + }, + { + "epoch": 1.1547150436419646, + "grad_norm": 18.07694435119629, + "learning_rate": 2.12968941650139e-05, + "loss": 0.9702, + "step": 6747 + }, + { + "epoch": 1.1548861886017456, + "grad_norm": 14.886399269104004, + "learning_rate": 2.128643495388709e-05, + "loss": 1.5393, + "step": 6748 + }, + { + "epoch": 1.1550573335615266, + "grad_norm": 19.339351654052734, + "learning_rate": 2.1275972033754284e-05, + "loss": 1.6208, + "step": 6749 + }, + { + "epoch": 1.1552284785213076, + "grad_norm": 11.952751159667969, + "learning_rate": 2.1265505410788633e-05, + "loss": 0.77, + "step": 6750 + }, + { + "epoch": 1.1553996234810886, + "grad_norm": 15.437543869018555, + "learning_rate": 2.1255035091165456e-05, + "loss": 1.1533, + "step": 6751 + }, + { + "epoch": 1.1555707684408694, + "grad_norm": 10.964115142822266, + "learning_rate": 2.1244561081062262e-05, + "loss": 0.8684, + "step": 6752 + }, + { + "epoch": 1.1557419134006504, + "grad_norm": 3.070553779602051, + "learning_rate": 2.123408338665873e-05, + "loss": 0.3276, + "step": 6753 + }, + { + "epoch": 1.1559130583604313, + "grad_norm": 12.972532272338867, + "learning_rate": 2.1223602014136712e-05, + "loss": 1.2192, + "step": 6754 + }, + { + "epoch": 1.1560842033202121, + "grad_norm": 18.17832374572754, + "learning_rate": 2.1213116969680237e-05, + "loss": 1.6733, + "step": 6755 + }, + { + "epoch": 1.156255348279993, + "grad_norm": 1.0295546054840088, + "learning_rate": 2.1202628259475495e-05, + "loss": 0.1569, + "step": 6756 + }, + { + "epoch": 1.156426493239774, + "grad_norm": 16.78033447265625, + "learning_rate": 2.119213588971084e-05, + "loss": 1.4867, + "step": 6757 + }, + { + "epoch": 1.156597638199555, + "grad_norm": 2.5894768238067627, + "learning_rate": 2.118163986657679e-05, + "loss": 0.3114, + "step": 6758 + }, + { + "epoch": 1.156768783159336, + "grad_norm": 21.40312385559082, + "learning_rate": 2.1171140196266012e-05, + "loss": 2.1186, + "step": 6759 + }, + { + "epoch": 1.1569399281191168, + "grad_norm": 24.32610511779785, + "learning_rate": 2.1160636884973322e-05, + "loss": 2.1989, + "step": 6760 + }, + { + "epoch": 1.1571110730788978, + "grad_norm": 
8.965644836425781, + "learning_rate": 2.1150129938895695e-05, + "loss": 1.1102, + "step": 6761 + }, + { + "epoch": 1.1572822180386788, + "grad_norm": 22.843244552612305, + "learning_rate": 2.1139619364232247e-05, + "loss": 1.2688, + "step": 6762 + }, + { + "epoch": 1.1574533629984596, + "grad_norm": 19.694812774658203, + "learning_rate": 2.1129105167184227e-05, + "loss": 2.3136, + "step": 6763 + }, + { + "epoch": 1.1576245079582406, + "grad_norm": 3.668747663497925, + "learning_rate": 2.111858735395503e-05, + "loss": 0.4309, + "step": 6764 + }, + { + "epoch": 1.1577956529180216, + "grad_norm": 15.669942855834961, + "learning_rate": 2.110806593075018e-05, + "loss": 1.0937, + "step": 6765 + }, + { + "epoch": 1.1579667978778025, + "grad_norm": 12.45332145690918, + "learning_rate": 2.1097540903777333e-05, + "loss": 1.0317, + "step": 6766 + }, + { + "epoch": 1.1581379428375835, + "grad_norm": 20.682071685791016, + "learning_rate": 2.108701227924627e-05, + "loss": 2.0858, + "step": 6767 + }, + { + "epoch": 1.1583090877973643, + "grad_norm": 20.13377571105957, + "learning_rate": 2.10764800633689e-05, + "loss": 2.533, + "step": 6768 + }, + { + "epoch": 1.1584802327571453, + "grad_norm": 16.149690628051758, + "learning_rate": 2.1065944262359234e-05, + "loss": 1.3122, + "step": 6769 + }, + { + "epoch": 1.1586513777169263, + "grad_norm": 14.939237594604492, + "learning_rate": 2.1055404882433428e-05, + "loss": 1.205, + "step": 6770 + }, + { + "epoch": 1.1588225226767073, + "grad_norm": 12.55698299407959, + "learning_rate": 2.1044861929809712e-05, + "loss": 0.9223, + "step": 6771 + }, + { + "epoch": 1.158993667636488, + "grad_norm": 18.289710998535156, + "learning_rate": 2.103431541070846e-05, + "loss": 1.6357, + "step": 6772 + }, + { + "epoch": 1.159164812596269, + "grad_norm": 16.596107482910156, + "learning_rate": 2.102376533135213e-05, + "loss": 1.1197, + "step": 6773 + }, + { + "epoch": 1.15933595755605, + "grad_norm": 17.402799606323242, + "learning_rate": 2.101321169796528e-05, + "loss": 1.4677, + "step": 6774 + }, + { + "epoch": 1.159507102515831, + "grad_norm": 21.732892990112305, + "learning_rate": 2.100265451677457e-05, + "loss": 1.8113, + "step": 6775 + }, + { + "epoch": 1.1596782474756118, + "grad_norm": 15.907790184020996, + "learning_rate": 2.0992093794008755e-05, + "loss": 1.3312, + "step": 6776 + }, + { + "epoch": 1.1598493924353928, + "grad_norm": 18.397964477539062, + "learning_rate": 2.0981529535898676e-05, + "loss": 2.3183, + "step": 6777 + }, + { + "epoch": 1.1600205373951737, + "grad_norm": 28.13370704650879, + "learning_rate": 2.0970961748677267e-05, + "loss": 1.3794, + "step": 6778 + }, + { + "epoch": 1.1601916823549547, + "grad_norm": 0.6629721522331238, + "learning_rate": 2.0960390438579528e-05, + "loss": 0.1526, + "step": 6779 + }, + { + "epoch": 1.1603628273147355, + "grad_norm": 20.983154296875, + "learning_rate": 2.094981561184255e-05, + "loss": 2.3348, + "step": 6780 + }, + { + "epoch": 1.1605339722745165, + "grad_norm": 13.373003959655762, + "learning_rate": 2.09392372747055e-05, + "loss": 0.8657, + "step": 6781 + }, + { + "epoch": 1.1607051172342975, + "grad_norm": 12.408032417297363, + "learning_rate": 2.0928655433409614e-05, + "loss": 0.9496, + "step": 6782 + }, + { + "epoch": 1.1608762621940785, + "grad_norm": 28.76824378967285, + "learning_rate": 2.0918070094198195e-05, + "loss": 3.9569, + "step": 6783 + }, + { + "epoch": 1.1610474071538592, + "grad_norm": 2.336712121963501, + "learning_rate": 2.09074812633166e-05, + "loss": 0.2164, + "step": 6784 + }, + { + 
"epoch": 1.1612185521136402, + "grad_norm": 18.542802810668945, + "learning_rate": 2.0896888947012265e-05, + "loss": 1.59, + "step": 6785 + }, + { + "epoch": 1.1613896970734212, + "grad_norm": 22.71904945373535, + "learning_rate": 2.0886293151534663e-05, + "loss": 1.7135, + "step": 6786 + }, + { + "epoch": 1.1615608420332022, + "grad_norm": 21.552074432373047, + "learning_rate": 2.0875693883135336e-05, + "loss": 1.9509, + "step": 6787 + }, + { + "epoch": 1.161731986992983, + "grad_norm": 16.57906723022461, + "learning_rate": 2.0865091148067868e-05, + "loss": 1.3288, + "step": 6788 + }, + { + "epoch": 1.161903131952764, + "grad_norm": 3.9160079956054688, + "learning_rate": 2.085448495258789e-05, + "loss": 0.303, + "step": 6789 + }, + { + "epoch": 1.162074276912545, + "grad_norm": 15.484481811523438, + "learning_rate": 2.0843875302953064e-05, + "loss": 1.2086, + "step": 6790 + }, + { + "epoch": 1.162245421872326, + "grad_norm": 10.972047805786133, + "learning_rate": 2.0833262205423103e-05, + "loss": 1.0228, + "step": 6791 + }, + { + "epoch": 1.1624165668321067, + "grad_norm": 15.809880256652832, + "learning_rate": 2.0822645666259758e-05, + "loss": 1.0541, + "step": 6792 + }, + { + "epoch": 1.1625877117918877, + "grad_norm": 15.736574172973633, + "learning_rate": 2.0812025691726795e-05, + "loss": 1.5631, + "step": 6793 + }, + { + "epoch": 1.1627588567516687, + "grad_norm": 13.390948295593262, + "learning_rate": 2.080140228809002e-05, + "loss": 1.1472, + "step": 6794 + }, + { + "epoch": 1.1629300017114497, + "grad_norm": 18.913244247436523, + "learning_rate": 2.079077546161725e-05, + "loss": 2.0747, + "step": 6795 + }, + { + "epoch": 1.1631011466712304, + "grad_norm": 35.074134826660156, + "learning_rate": 2.0780145218578337e-05, + "loss": 6.1981, + "step": 6796 + }, + { + "epoch": 1.1632722916310114, + "grad_norm": 15.795833587646484, + "learning_rate": 2.076951156524513e-05, + "loss": 1.447, + "step": 6797 + }, + { + "epoch": 1.1634434365907924, + "grad_norm": 21.82452964782715, + "learning_rate": 2.0758874507891514e-05, + "loss": 1.8627, + "step": 6798 + }, + { + "epoch": 1.1636145815505734, + "grad_norm": 15.347325325012207, + "learning_rate": 2.0748234052793353e-05, + "loss": 1.3685, + "step": 6799 + }, + { + "epoch": 1.1637857265103544, + "grad_norm": 18.640275955200195, + "learning_rate": 2.0737590206228544e-05, + "loss": 1.7358, + "step": 6800 + }, + { + "epoch": 1.1639568714701352, + "grad_norm": 15.257899284362793, + "learning_rate": 2.0726942974476967e-05, + "loss": 1.3593, + "step": 6801 + }, + { + "epoch": 1.1641280164299161, + "grad_norm": 22.601909637451172, + "learning_rate": 2.0716292363820504e-05, + "loss": 2.0876, + "step": 6802 + }, + { + "epoch": 1.1642991613896971, + "grad_norm": 15.543705940246582, + "learning_rate": 2.0705638380543027e-05, + "loss": 0.8575, + "step": 6803 + }, + { + "epoch": 1.164470306349478, + "grad_norm": 16.10650062561035, + "learning_rate": 2.0694981030930417e-05, + "loss": 1.301, + "step": 6804 + }, + { + "epoch": 1.164641451309259, + "grad_norm": 14.519549369812012, + "learning_rate": 2.068432032127051e-05, + "loss": 1.2924, + "step": 6805 + }, + { + "epoch": 1.1648125962690399, + "grad_norm": 14.380958557128906, + "learning_rate": 2.0673656257853148e-05, + "loss": 1.3454, + "step": 6806 + }, + { + "epoch": 1.1649837412288209, + "grad_norm": 18.531217575073242, + "learning_rate": 2.0662988846970144e-05, + "loss": 1.6529, + "step": 6807 + }, + { + "epoch": 1.1651548861886019, + "grad_norm": 11.520040512084961, + "learning_rate": 
2.065231809491528e-05, + "loss": 1.0933, + "step": 6808 + }, + { + "epoch": 1.1653260311483826, + "grad_norm": 11.549054145812988, + "learning_rate": 2.064164400798433e-05, + "loss": 0.9791, + "step": 6809 + }, + { + "epoch": 1.1654971761081636, + "grad_norm": 24.388668060302734, + "learning_rate": 2.0630966592475006e-05, + "loss": 2.5074, + "step": 6810 + }, + { + "epoch": 1.1656683210679446, + "grad_norm": 2.8475899696350098, + "learning_rate": 2.062028585468701e-05, + "loss": 0.31, + "step": 6811 + }, + { + "epoch": 1.1658394660277254, + "grad_norm": 21.80243682861328, + "learning_rate": 2.0609601800921984e-05, + "loss": 1.9436, + "step": 6812 + }, + { + "epoch": 1.1660106109875064, + "grad_norm": 14.159065246582031, + "learning_rate": 2.0598914437483544e-05, + "loss": 1.2508, + "step": 6813 + }, + { + "epoch": 1.1661817559472873, + "grad_norm": 18.48175811767578, + "learning_rate": 2.0588223770677244e-05, + "loss": 2.2716, + "step": 6814 + }, + { + "epoch": 1.1663529009070683, + "grad_norm": 13.813920974731445, + "learning_rate": 2.0577529806810595e-05, + "loss": 1.0515, + "step": 6815 + }, + { + "epoch": 1.1665240458668493, + "grad_norm": 11.2750244140625, + "learning_rate": 2.0566832552193052e-05, + "loss": 0.9192, + "step": 6816 + }, + { + "epoch": 1.16669519082663, + "grad_norm": 19.322492599487305, + "learning_rate": 2.0556132013136013e-05, + "loss": 1.9372, + "step": 6817 + }, + { + "epoch": 1.166866335786411, + "grad_norm": 14.883321762084961, + "learning_rate": 2.0545428195952814e-05, + "loss": 1.647, + "step": 6818 + }, + { + "epoch": 1.167037480746192, + "grad_norm": 22.583385467529297, + "learning_rate": 2.0534721106958715e-05, + "loss": 1.8467, + "step": 6819 + }, + { + "epoch": 1.1672086257059728, + "grad_norm": 7.125509262084961, + "learning_rate": 2.0524010752470924e-05, + "loss": 0.4615, + "step": 6820 + }, + { + "epoch": 1.1673797706657538, + "grad_norm": 30.93987274169922, + "learning_rate": 2.051329713880856e-05, + "loss": 5.5572, + "step": 6821 + }, + { + "epoch": 1.1675509156255348, + "grad_norm": 10.377395629882812, + "learning_rate": 2.050258027229267e-05, + "loss": 0.8635, + "step": 6822 + }, + { + "epoch": 1.1677220605853158, + "grad_norm": 7.3766093254089355, + "learning_rate": 2.0491860159246226e-05, + "loss": 0.485, + "step": 6823 + }, + { + "epoch": 1.1678932055450968, + "grad_norm": 7.117273330688477, + "learning_rate": 2.0481136805994104e-05, + "loss": 0.3838, + "step": 6824 + }, + { + "epoch": 1.1680643505048776, + "grad_norm": 18.087154388427734, + "learning_rate": 2.0470410218863106e-05, + "loss": 1.3246, + "step": 6825 + }, + { + "epoch": 1.1682354954646585, + "grad_norm": 21.010772705078125, + "learning_rate": 2.045968040418193e-05, + "loss": 0.9737, + "step": 6826 + }, + { + "epoch": 1.1684066404244395, + "grad_norm": 16.369413375854492, + "learning_rate": 2.0448947368281183e-05, + "loss": 1.1337, + "step": 6827 + }, + { + "epoch": 1.1685777853842205, + "grad_norm": 14.726228713989258, + "learning_rate": 2.0438211117493374e-05, + "loss": 1.2854, + "step": 6828 + }, + { + "epoch": 1.1687489303440013, + "grad_norm": 20.079736709594727, + "learning_rate": 2.0427471658152902e-05, + "loss": 2.8156, + "step": 6829 + }, + { + "epoch": 1.1689200753037823, + "grad_norm": 18.067716598510742, + "learning_rate": 2.041672899659607e-05, + "loss": 1.5892, + "step": 6830 + }, + { + "epoch": 1.1690912202635633, + "grad_norm": 24.576330184936523, + "learning_rate": 2.0405983139161063e-05, + "loss": 5.732, + "step": 6831 + }, + { + "epoch": 1.1692623652233443, + 
"grad_norm": 18.32451057434082, + "learning_rate": 2.0395234092187953e-05, + "loss": 2.2328, + "step": 6832 + }, + { + "epoch": 1.169433510183125, + "grad_norm": 22.1658935546875, + "learning_rate": 2.038448186201869e-05, + "loss": 2.32, + "step": 6833 + }, + { + "epoch": 1.169604655142906, + "grad_norm": 9.66140365600586, + "learning_rate": 2.037372645499711e-05, + "loss": 0.6548, + "step": 6834 + }, + { + "epoch": 1.169775800102687, + "grad_norm": 0.6341655850410461, + "learning_rate": 2.0362967877468916e-05, + "loss": 0.1526, + "step": 6835 + }, + { + "epoch": 1.169946945062468, + "grad_norm": 10.674327850341797, + "learning_rate": 2.0352206135781683e-05, + "loss": 0.9537, + "step": 6836 + }, + { + "epoch": 1.1701180900222488, + "grad_norm": 8.676131248474121, + "learning_rate": 2.0341441236284865e-05, + "loss": 1.0683, + "step": 6837 + }, + { + "epoch": 1.1702892349820297, + "grad_norm": 18.046106338500977, + "learning_rate": 2.033067318532976e-05, + "loss": 1.4942, + "step": 6838 + }, + { + "epoch": 1.1704603799418107, + "grad_norm": 14.745805740356445, + "learning_rate": 2.0319901989269536e-05, + "loss": 1.0318, + "step": 6839 + }, + { + "epoch": 1.1706315249015917, + "grad_norm": 22.62310028076172, + "learning_rate": 2.0309127654459213e-05, + "loss": 2.3572, + "step": 6840 + }, + { + "epoch": 1.1708026698613725, + "grad_norm": 8.969188690185547, + "learning_rate": 2.0298350187255666e-05, + "loss": 1.2112, + "step": 6841 + }, + { + "epoch": 1.1709738148211535, + "grad_norm": 21.498462677001953, + "learning_rate": 2.0287569594017617e-05, + "loss": 2.7483, + "step": 6842 + }, + { + "epoch": 1.1711449597809345, + "grad_norm": 5.7461981773376465, + "learning_rate": 2.0276785881105635e-05, + "loss": 0.6604, + "step": 6843 + }, + { + "epoch": 1.1713161047407155, + "grad_norm": 10.238706588745117, + "learning_rate": 2.026599905488212e-05, + "loss": 0.912, + "step": 6844 + }, + { + "epoch": 1.1714872497004962, + "grad_norm": 14.441835403442383, + "learning_rate": 2.025520912171132e-05, + "loss": 1.1801, + "step": 6845 + }, + { + "epoch": 1.1716583946602772, + "grad_norm": 19.147550582885742, + "learning_rate": 2.024441608795931e-05, + "loss": 2.3638, + "step": 6846 + }, + { + "epoch": 1.1718295396200582, + "grad_norm": 6.082062244415283, + "learning_rate": 2.0233619959993997e-05, + "loss": 0.4729, + "step": 6847 + }, + { + "epoch": 1.1720006845798392, + "grad_norm": 39.79881286621094, + "learning_rate": 2.0222820744185113e-05, + "loss": 5.313, + "step": 6848 + }, + { + "epoch": 1.1721718295396202, + "grad_norm": 21.230663299560547, + "learning_rate": 2.0212018446904214e-05, + "loss": 1.9576, + "step": 6849 + }, + { + "epoch": 1.172342974499401, + "grad_norm": 19.801790237426758, + "learning_rate": 2.0201213074524664e-05, + "loss": 2.0392, + "step": 6850 + }, + { + "epoch": 1.172514119459182, + "grad_norm": 11.94463062286377, + "learning_rate": 2.019040463342165e-05, + "loss": 0.8956, + "step": 6851 + }, + { + "epoch": 1.172685264418963, + "grad_norm": 18.809799194335938, + "learning_rate": 2.0179593129972178e-05, + "loss": 2.1701, + "step": 6852 + }, + { + "epoch": 1.1728564093787437, + "grad_norm": 18.971500396728516, + "learning_rate": 2.016877857055504e-05, + "loss": 1.7869, + "step": 6853 + }, + { + "epoch": 1.1730275543385247, + "grad_norm": 14.83297061920166, + "learning_rate": 2.015796096155085e-05, + "loss": 1.3335, + "step": 6854 + }, + { + "epoch": 1.1731986992983057, + "grad_norm": 21.98466682434082, + "learning_rate": 2.0147140309342008e-05, + "loss": 1.9541, + "step": 6855 + }, 
+ { + "epoch": 1.1733698442580867, + "grad_norm": 14.660383224487305, + "learning_rate": 2.0136316620312723e-05, + "loss": 1.2516, + "step": 6856 + }, + { + "epoch": 1.1735409892178676, + "grad_norm": 23.2386474609375, + "learning_rate": 2.012548990084897e-05, + "loss": 5.376, + "step": 6857 + }, + { + "epoch": 1.1737121341776484, + "grad_norm": 16.182527542114258, + "learning_rate": 2.0114660157338545e-05, + "loss": 1.6976, + "step": 6858 + }, + { + "epoch": 1.1738832791374294, + "grad_norm": 14.8099946975708, + "learning_rate": 2.0103827396171014e-05, + "loss": 1.2173, + "step": 6859 + }, + { + "epoch": 1.1740544240972104, + "grad_norm": 22.097217559814453, + "learning_rate": 2.0092991623737716e-05, + "loss": 2.3721, + "step": 6860 + }, + { + "epoch": 1.1742255690569912, + "grad_norm": 5.707027435302734, + "learning_rate": 2.0082152846431775e-05, + "loss": 0.4328, + "step": 6861 + }, + { + "epoch": 1.1743967140167721, + "grad_norm": 8.321752548217773, + "learning_rate": 2.0071311070648083e-05, + "loss": 0.635, + "step": 6862 + }, + { + "epoch": 1.1745678589765531, + "grad_norm": 13.2294282913208, + "learning_rate": 2.0060466302783303e-05, + "loss": 0.9935, + "step": 6863 + }, + { + "epoch": 1.1747390039363341, + "grad_norm": 0.649167537689209, + "learning_rate": 2.0049618549235873e-05, + "loss": 0.1585, + "step": 6864 + }, + { + "epoch": 1.1749101488961151, + "grad_norm": 17.88936424255371, + "learning_rate": 2.0038767816405972e-05, + "loss": 1.5718, + "step": 6865 + }, + { + "epoch": 1.1750812938558959, + "grad_norm": 14.337617874145508, + "learning_rate": 2.0027914110695558e-05, + "loss": 1.1434, + "step": 6866 + }, + { + "epoch": 1.1752524388156769, + "grad_norm": 17.345027923583984, + "learning_rate": 2.001705743850833e-05, + "loss": 1.4073, + "step": 6867 + }, + { + "epoch": 1.1754235837754579, + "grad_norm": 24.623624801635742, + "learning_rate": 2.0006197806249737e-05, + "loss": 2.578, + "step": 6868 + }, + { + "epoch": 1.1755947287352386, + "grad_norm": 21.75896644592285, + "learning_rate": 1.9995335220326985e-05, + "loss": 1.8332, + "step": 6869 + }, + { + "epoch": 1.1757658736950196, + "grad_norm": 9.989360809326172, + "learning_rate": 1.998446968714901e-05, + "loss": 0.9041, + "step": 6870 + }, + { + "epoch": 1.1759370186548006, + "grad_norm": 18.148115158081055, + "learning_rate": 1.99736012131265e-05, + "loss": 1.3532, + "step": 6871 + }, + { + "epoch": 1.1761081636145816, + "grad_norm": 16.34587287902832, + "learning_rate": 1.9962729804671868e-05, + "loss": 1.2796, + "step": 6872 + }, + { + "epoch": 1.1762793085743626, + "grad_norm": 8.132262229919434, + "learning_rate": 1.995185546819925e-05, + "loss": 0.8756, + "step": 6873 + }, + { + "epoch": 1.1764504535341433, + "grad_norm": 8.244163513183594, + "learning_rate": 1.994097821012453e-05, + "loss": 0.7777, + "step": 6874 + }, + { + "epoch": 1.1766215984939243, + "grad_norm": 0.7307465672492981, + "learning_rate": 1.9930098036865315e-05, + "loss": 0.1506, + "step": 6875 + }, + { + "epoch": 1.1767927434537053, + "grad_norm": 26.213361740112305, + "learning_rate": 1.991921495484091e-05, + "loss": 2.6735, + "step": 6876 + }, + { + "epoch": 1.1769638884134863, + "grad_norm": 0.6290190815925598, + "learning_rate": 1.9908328970472363e-05, + "loss": 0.151, + "step": 6877 + }, + { + "epoch": 1.177135033373267, + "grad_norm": 12.63049602508545, + "learning_rate": 1.98974400901824e-05, + "loss": 0.8783, + "step": 6878 + }, + { + "epoch": 1.177306178333048, + "grad_norm": 149.73031616210938, + "learning_rate": 1.9886548320395496e-05, 
+ "loss": 9.0417, + "step": 6879 + }, + { + "epoch": 1.177477323292829, + "grad_norm": 5.016288757324219, + "learning_rate": 1.9875653667537804e-05, + "loss": 0.3981, + "step": 6880 + }, + { + "epoch": 1.17764846825261, + "grad_norm": 19.39210319519043, + "learning_rate": 1.9864756138037188e-05, + "loss": 1.8209, + "step": 6881 + }, + { + "epoch": 1.1778196132123908, + "grad_norm": 14.559964179992676, + "learning_rate": 1.9853855738323204e-05, + "loss": 1.193, + "step": 6882 + }, + { + "epoch": 1.1779907581721718, + "grad_norm": 16.556758880615234, + "learning_rate": 1.9842952474827102e-05, + "loss": 1.4446, + "step": 6883 + }, + { + "epoch": 1.1781619031319528, + "grad_norm": 12.186580657958984, + "learning_rate": 1.9832046353981826e-05, + "loss": 0.8997, + "step": 6884 + }, + { + "epoch": 1.1783330480917338, + "grad_norm": 18.176359176635742, + "learning_rate": 1.982113738222201e-05, + "loss": 1.4875, + "step": 6885 + }, + { + "epoch": 1.1785041930515145, + "grad_norm": 14.406021118164062, + "learning_rate": 1.981022556598395e-05, + "loss": 1.4021, + "step": 6886 + }, + { + "epoch": 1.1786753380112955, + "grad_norm": 16.016162872314453, + "learning_rate": 1.9799310911705654e-05, + "loss": 1.7261, + "step": 6887 + }, + { + "epoch": 1.1788464829710765, + "grad_norm": 20.404844284057617, + "learning_rate": 1.978839342582676e-05, + "loss": 1.9895, + "step": 6888 + }, + { + "epoch": 1.1790176279308575, + "grad_norm": 2.6637179851531982, + "learning_rate": 1.9777473114788612e-05, + "loss": 0.2496, + "step": 6889 + }, + { + "epoch": 1.1791887728906383, + "grad_norm": 25.14553451538086, + "learning_rate": 1.9766549985034213e-05, + "loss": 2.5686, + "step": 6890 + }, + { + "epoch": 1.1793599178504193, + "grad_norm": 0.7727463245391846, + "learning_rate": 1.9755624043008223e-05, + "loss": 0.1432, + "step": 6891 + }, + { + "epoch": 1.1795310628102003, + "grad_norm": 93.68418884277344, + "learning_rate": 1.9744695295156966e-05, + "loss": 8.0592, + "step": 6892 + }, + { + "epoch": 1.1797022077699812, + "grad_norm": 21.706905364990234, + "learning_rate": 1.973376374792842e-05, + "loss": 2.8132, + "step": 6893 + }, + { + "epoch": 1.179873352729762, + "grad_norm": 12.288810729980469, + "learning_rate": 1.9722829407772208e-05, + "loss": 1.0199, + "step": 6894 + }, + { + "epoch": 1.180044497689543, + "grad_norm": 14.261706352233887, + "learning_rate": 1.971189228113961e-05, + "loss": 1.2732, + "step": 6895 + }, + { + "epoch": 1.180215642649324, + "grad_norm": 37.62468719482422, + "learning_rate": 1.970095237448355e-05, + "loss": 5.9513, + "step": 6896 + }, + { + "epoch": 1.180386787609105, + "grad_norm": 16.761072158813477, + "learning_rate": 1.9690009694258593e-05, + "loss": 1.2978, + "step": 6897 + }, + { + "epoch": 1.180557932568886, + "grad_norm": 24.69586944580078, + "learning_rate": 1.9679064246920923e-05, + "loss": 5.4448, + "step": 6898 + }, + { + "epoch": 1.1807290775286667, + "grad_norm": 0.5545434355735779, + "learning_rate": 1.9668116038928377e-05, + "loss": 0.1292, + "step": 6899 + }, + { + "epoch": 1.1809002224884477, + "grad_norm": 13.614893913269043, + "learning_rate": 1.965716507674042e-05, + "loss": 1.2286, + "step": 6900 + }, + { + "epoch": 1.1810713674482287, + "grad_norm": 17.071117401123047, + "learning_rate": 1.964621136681813e-05, + "loss": 2.147, + "step": 6901 + }, + { + "epoch": 1.1812425124080095, + "grad_norm": 18.16098976135254, + "learning_rate": 1.963525491562421e-05, + "loss": 1.7085, + "step": 6902 + }, + { + "epoch": 1.1814136573677905, + "grad_norm": 18.80036163330078, 
+ "learning_rate": 1.962429572962299e-05, + "loss": 1.4874, + "step": 6903 + }, + { + "epoch": 1.1815848023275715, + "grad_norm": 20.31178855895996, + "learning_rate": 1.9613333815280404e-05, + "loss": 1.838, + "step": 6904 + }, + { + "epoch": 1.1817559472873524, + "grad_norm": 15.093534469604492, + "learning_rate": 1.9602369179063987e-05, + "loss": 1.3441, + "step": 6905 + }, + { + "epoch": 1.1819270922471334, + "grad_norm": 1.2397549152374268, + "learning_rate": 1.9591401827442904e-05, + "loss": 0.1521, + "step": 6906 + }, + { + "epoch": 1.1820982372069142, + "grad_norm": 6.578031063079834, + "learning_rate": 1.9580431766887904e-05, + "loss": 0.4449, + "step": 6907 + }, + { + "epoch": 1.1822693821666952, + "grad_norm": 0.47265729308128357, + "learning_rate": 1.956945900387134e-05, + "loss": 0.1383, + "step": 6908 + }, + { + "epoch": 1.1824405271264762, + "grad_norm": 7.973837375640869, + "learning_rate": 1.955848354486716e-05, + "loss": 0.8364, + "step": 6909 + }, + { + "epoch": 1.182611672086257, + "grad_norm": 24.104881286621094, + "learning_rate": 1.9547505396350893e-05, + "loss": 3.1458, + "step": 6910 + }, + { + "epoch": 1.182782817046038, + "grad_norm": 12.037428855895996, + "learning_rate": 1.9536524564799673e-05, + "loss": 0.9171, + "step": 6911 + }, + { + "epoch": 1.182953962005819, + "grad_norm": 27.550291061401367, + "learning_rate": 1.95255410566922e-05, + "loss": 3.0115, + "step": 6912 + }, + { + "epoch": 1.1831251069656, + "grad_norm": 33.36637496948242, + "learning_rate": 1.951455487850877e-05, + "loss": 5.617, + "step": 6913 + }, + { + "epoch": 1.183296251925381, + "grad_norm": 0.5798065066337585, + "learning_rate": 1.950356603673123e-05, + "loss": 0.1496, + "step": 6914 + }, + { + "epoch": 1.1834673968851617, + "grad_norm": 15.993162155151367, + "learning_rate": 1.9492574537843024e-05, + "loss": 1.5259, + "step": 6915 + }, + { + "epoch": 1.1836385418449427, + "grad_norm": 16.15716552734375, + "learning_rate": 1.948158038832914e-05, + "loss": 1.3994, + "step": 6916 + }, + { + "epoch": 1.1838096868047236, + "grad_norm": 6.515031337738037, + "learning_rate": 1.9470583594676164e-05, + "loss": 0.5043, + "step": 6917 + }, + { + "epoch": 1.1839808317645044, + "grad_norm": 16.923002243041992, + "learning_rate": 1.9459584163372203e-05, + "loss": 1.3346, + "step": 6918 + }, + { + "epoch": 1.1841519767242854, + "grad_norm": 5.997356414794922, + "learning_rate": 1.9448582100906943e-05, + "loss": 0.3956, + "step": 6919 + }, + { + "epoch": 1.1843231216840664, + "grad_norm": 8.468660354614258, + "learning_rate": 1.9437577413771623e-05, + "loss": 0.4804, + "step": 6920 + }, + { + "epoch": 1.1844942666438474, + "grad_norm": 2.0181360244750977, + "learning_rate": 1.9426570108459007e-05, + "loss": 0.2875, + "step": 6921 + }, + { + "epoch": 1.1846654116036284, + "grad_norm": 20.663789749145508, + "learning_rate": 1.9415560191463444e-05, + "loss": 2.119, + "step": 6922 + }, + { + "epoch": 1.1848365565634091, + "grad_norm": 16.835023880004883, + "learning_rate": 1.940454766928079e-05, + "loss": 1.2742, + "step": 6923 + }, + { + "epoch": 1.1850077015231901, + "grad_norm": 15.038171768188477, + "learning_rate": 1.9393532548408447e-05, + "loss": 1.3088, + "step": 6924 + }, + { + "epoch": 1.1851788464829711, + "grad_norm": 12.88936996459961, + "learning_rate": 1.938251483534536e-05, + "loss": 1.0233, + "step": 6925 + }, + { + "epoch": 1.185349991442752, + "grad_norm": 17.261011123657227, + "learning_rate": 1.937149453659199e-05, + "loss": 1.4455, + "step": 6926 + }, + { + "epoch": 
1.1855211364025329, + "grad_norm": 18.969623565673828, + "learning_rate": 1.9360471658650336e-05, + "loss": 1.8209, + "step": 6927 + }, + { + "epoch": 1.1856922813623139, + "grad_norm": 16.013376235961914, + "learning_rate": 1.9349446208023903e-05, + "loss": 1.4429, + "step": 6928 + }, + { + "epoch": 1.1858634263220948, + "grad_norm": 0.6081182360649109, + "learning_rate": 1.9338418191217732e-05, + "loss": 0.1424, + "step": 6929 + }, + { + "epoch": 1.1860345712818758, + "grad_norm": 27.403413772583008, + "learning_rate": 1.932738761473837e-05, + "loss": 4.3191, + "step": 6930 + }, + { + "epoch": 1.1862057162416566, + "grad_norm": 1.8343784809112549, + "learning_rate": 1.9316354485093866e-05, + "loss": 0.2407, + "step": 6931 + }, + { + "epoch": 1.1863768612014376, + "grad_norm": 12.688108444213867, + "learning_rate": 1.9305318808793783e-05, + "loss": 0.9057, + "step": 6932 + }, + { + "epoch": 1.1865480061612186, + "grad_norm": 2.889329433441162, + "learning_rate": 1.92942805923492e-05, + "loss": 0.4465, + "step": 6933 + }, + { + "epoch": 1.1867191511209996, + "grad_norm": 18.517244338989258, + "learning_rate": 1.9283239842272665e-05, + "loss": 1.6817, + "step": 6934 + }, + { + "epoch": 1.1868902960807803, + "grad_norm": 17.190340042114258, + "learning_rate": 1.9272196565078245e-05, + "loss": 2.0391, + "step": 6935 + }, + { + "epoch": 1.1870614410405613, + "grad_norm": 22.873920440673828, + "learning_rate": 1.9261150767281486e-05, + "loss": 1.8831, + "step": 6936 + }, + { + "epoch": 1.1872325860003423, + "grad_norm": 16.942474365234375, + "learning_rate": 1.9250102455399427e-05, + "loss": 1.4123, + "step": 6937 + }, + { + "epoch": 1.1874037309601233, + "grad_norm": 17.837316513061523, + "learning_rate": 1.9239051635950588e-05, + "loss": 1.5212, + "step": 6938 + }, + { + "epoch": 1.187574875919904, + "grad_norm": 20.059364318847656, + "learning_rate": 1.9227998315454976e-05, + "loss": 1.6603, + "step": 6939 + }, + { + "epoch": 1.187746020879685, + "grad_norm": 22.977336883544922, + "learning_rate": 1.9216942500434055e-05, + "loss": 2.8594, + "step": 6940 + }, + { + "epoch": 1.187917165839466, + "grad_norm": 15.848212242126465, + "learning_rate": 1.920588419741078e-05, + "loss": 1.3736, + "step": 6941 + }, + { + "epoch": 1.188088310799247, + "grad_norm": 13.386641502380371, + "learning_rate": 1.9194823412909562e-05, + "loss": 1.0159, + "step": 6942 + }, + { + "epoch": 1.1882594557590278, + "grad_norm": 4.138679027557373, + "learning_rate": 1.9183760153456286e-05, + "loss": 0.3584, + "step": 6943 + }, + { + "epoch": 1.1884306007188088, + "grad_norm": 3.90535306930542, + "learning_rate": 1.9172694425578288e-05, + "loss": 0.3002, + "step": 6944 + }, + { + "epoch": 1.1886017456785898, + "grad_norm": 75.9722900390625, + "learning_rate": 1.916162623580436e-05, + "loss": 5.6064, + "step": 6945 + }, + { + "epoch": 1.1887728906383708, + "grad_norm": 12.245502471923828, + "learning_rate": 1.9150555590664758e-05, + "loss": 0.9652, + "step": 6946 + }, + { + "epoch": 1.1889440355981515, + "grad_norm": 3.0320892333984375, + "learning_rate": 1.913948249669117e-05, + "loss": 0.3505, + "step": 6947 + }, + { + "epoch": 1.1891151805579325, + "grad_norm": 12.209064483642578, + "learning_rate": 1.912840696041675e-05, + "loss": 1.0553, + "step": 6948 + }, + { + "epoch": 1.1892863255177135, + "grad_norm": 0.6309384703636169, + "learning_rate": 1.9117328988376072e-05, + "loss": 0.1423, + "step": 6949 + }, + { + "epoch": 1.1894574704774945, + "grad_norm": 31.398828506469727, + "learning_rate": 1.9106248587105158e-05, 
+ "loss": 1.1428, + "step": 6950 + }, + { + "epoch": 1.1896286154372753, + "grad_norm": 13.952041625976562, + "learning_rate": 1.9095165763141463e-05, + "loss": 1.3932, + "step": 6951 + }, + { + "epoch": 1.1897997603970563, + "grad_norm": 18.571762084960938, + "learning_rate": 1.9084080523023866e-05, + "loss": 1.4609, + "step": 6952 + }, + { + "epoch": 1.1899709053568372, + "grad_norm": 13.063371658325195, + "learning_rate": 1.9072992873292676e-05, + "loss": 1.0676, + "step": 6953 + }, + { + "epoch": 1.1901420503166182, + "grad_norm": 20.675588607788086, + "learning_rate": 1.9061902820489628e-05, + "loss": 2.3088, + "step": 6954 + }, + { + "epoch": 1.1903131952763992, + "grad_norm": 18.69338607788086, + "learning_rate": 1.9050810371157865e-05, + "loss": 2.1148, + "step": 6955 + }, + { + "epoch": 1.19048434023618, + "grad_norm": 23.922927856445312, + "learning_rate": 1.9039715531841946e-05, + "loss": 2.4482, + "step": 6956 + }, + { + "epoch": 1.190655485195961, + "grad_norm": 21.081663131713867, + "learning_rate": 1.902861830908785e-05, + "loss": 2.1831, + "step": 6957 + }, + { + "epoch": 1.190826630155742, + "grad_norm": 20.321006774902344, + "learning_rate": 1.9017518709442946e-05, + "loss": 2.0846, + "step": 6958 + }, + { + "epoch": 1.1909977751155227, + "grad_norm": 2.0978660583496094, + "learning_rate": 1.9006416739456024e-05, + "loss": 0.3047, + "step": 6959 + }, + { + "epoch": 1.1911689200753037, + "grad_norm": 0.8055472373962402, + "learning_rate": 1.899531240567726e-05, + "loss": 0.1479, + "step": 6960 + }, + { + "epoch": 1.1913400650350847, + "grad_norm": 22.558717727661133, + "learning_rate": 1.8984205714658222e-05, + "loss": 2.9147, + "step": 6961 + }, + { + "epoch": 1.1915112099948657, + "grad_norm": 23.008995056152344, + "learning_rate": 1.897309667295188e-05, + "loss": 2.2334, + "step": 6962 + }, + { + "epoch": 1.1916823549546467, + "grad_norm": 19.32323455810547, + "learning_rate": 1.8961985287112583e-05, + "loss": 2.2991, + "step": 6963 + }, + { + "epoch": 1.1918534999144275, + "grad_norm": 14.19498348236084, + "learning_rate": 1.8950871563696058e-05, + "loss": 1.4139, + "step": 6964 + }, + { + "epoch": 1.1920246448742084, + "grad_norm": 15.657463073730469, + "learning_rate": 1.893975550925943e-05, + "loss": 1.2715, + "step": 6965 + }, + { + "epoch": 1.1921957898339894, + "grad_norm": 3.751906394958496, + "learning_rate": 1.892863713036119e-05, + "loss": 0.3137, + "step": 6966 + }, + { + "epoch": 1.1923669347937702, + "grad_norm": 13.209271430969238, + "learning_rate": 1.891751643356119e-05, + "loss": 1.2306, + "step": 6967 + }, + { + "epoch": 1.1925380797535512, + "grad_norm": 14.844564437866211, + "learning_rate": 1.8906393425420654e-05, + "loss": 1.1242, + "step": 6968 + }, + { + "epoch": 1.1927092247133322, + "grad_norm": 19.91156768798828, + "learning_rate": 1.8895268112502185e-05, + "loss": 1.7784, + "step": 6969 + }, + { + "epoch": 1.1928803696731132, + "grad_norm": 15.727069854736328, + "learning_rate": 1.8884140501369725e-05, + "loss": 1.2447, + "step": 6970 + }, + { + "epoch": 1.1930515146328942, + "grad_norm": 24.86445426940918, + "learning_rate": 1.887301059858858e-05, + "loss": 5.1445, + "step": 6971 + }, + { + "epoch": 1.193222659592675, + "grad_norm": 11.971946716308594, + "learning_rate": 1.886187841072542e-05, + "loss": 1.1217, + "step": 6972 + }, + { + "epoch": 1.193393804552456, + "grad_norm": 6.445328712463379, + "learning_rate": 1.885074394434824e-05, + "loss": 0.4316, + "step": 6973 + }, + { + "epoch": 1.193564949512237, + "grad_norm": 
6.6650261878967285, + "learning_rate": 1.88396072060264e-05, + "loss": 0.5171, + "step": 6974 + }, + { + "epoch": 1.193736094472018, + "grad_norm": 19.152889251708984, + "learning_rate": 1.8828468202330588e-05, + "loss": 1.4326, + "step": 6975 + }, + { + "epoch": 1.1939072394317987, + "grad_norm": 23.997262954711914, + "learning_rate": 1.8817326939832835e-05, + "loss": 3.1506, + "step": 6976 + }, + { + "epoch": 1.1940783843915797, + "grad_norm": 17.117610931396484, + "learning_rate": 1.88061834251065e-05, + "loss": 1.6154, + "step": 6977 + }, + { + "epoch": 1.1942495293513606, + "grad_norm": 14.683150291442871, + "learning_rate": 1.879503766472628e-05, + "loss": 1.2889, + "step": 6978 + }, + { + "epoch": 1.1944206743111416, + "grad_norm": 22.7612361907959, + "learning_rate": 1.8783889665268182e-05, + "loss": 2.5793, + "step": 6979 + }, + { + "epoch": 1.1945918192709224, + "grad_norm": 14.981850624084473, + "learning_rate": 1.877273943330954e-05, + "loss": 1.3815, + "step": 6980 + }, + { + "epoch": 1.1947629642307034, + "grad_norm": 6.467514991760254, + "learning_rate": 1.8761586975429022e-05, + "loss": 0.4525, + "step": 6981 + }, + { + "epoch": 1.1949341091904844, + "grad_norm": 26.042551040649414, + "learning_rate": 1.875043229820658e-05, + "loss": 2.9506, + "step": 6982 + }, + { + "epoch": 1.1951052541502654, + "grad_norm": 18.11719512939453, + "learning_rate": 1.8739275408223497e-05, + "loss": 2.2709, + "step": 6983 + }, + { + "epoch": 1.1952763991100461, + "grad_norm": 23.576017379760742, + "learning_rate": 1.872811631206236e-05, + "loss": 1.9806, + "step": 6984 + }, + { + "epoch": 1.1954475440698271, + "grad_norm": 16.061267852783203, + "learning_rate": 1.8716955016307035e-05, + "loss": 1.204, + "step": 6985 + }, + { + "epoch": 1.195618689029608, + "grad_norm": 18.873077392578125, + "learning_rate": 1.8705791527542723e-05, + "loss": 1.5474, + "step": 6986 + }, + { + "epoch": 1.195789833989389, + "grad_norm": 11.208637237548828, + "learning_rate": 1.8694625852355886e-05, + "loss": 0.8615, + "step": 6987 + }, + { + "epoch": 1.1959609789491699, + "grad_norm": 16.651269912719727, + "learning_rate": 1.8683457997334292e-05, + "loss": 1.2784, + "step": 6988 + }, + { + "epoch": 1.1961321239089509, + "grad_norm": 153.02740478515625, + "learning_rate": 1.8672287969067002e-05, + "loss": 7.5781, + "step": 6989 + }, + { + "epoch": 1.1963032688687318, + "grad_norm": 0.6318583488464355, + "learning_rate": 1.8661115774144333e-05, + "loss": 0.1455, + "step": 6990 + }, + { + "epoch": 1.1964744138285128, + "grad_norm": 18.384014129638672, + "learning_rate": 1.86499414191579e-05, + "loss": 2.5856, + "step": 6991 + }, + { + "epoch": 1.1966455587882936, + "grad_norm": 6.968575954437256, + "learning_rate": 1.86387649107006e-05, + "loss": 0.3881, + "step": 6992 + }, + { + "epoch": 1.1968167037480746, + "grad_norm": 17.390628814697266, + "learning_rate": 1.862758625536658e-05, + "loss": 1.4007, + "step": 6993 + }, + { + "epoch": 1.1969878487078556, + "grad_norm": 4.716192245483398, + "learning_rate": 1.861640545975127e-05, + "loss": 0.5143, + "step": 6994 + }, + { + "epoch": 1.1971589936676366, + "grad_norm": 20.03127670288086, + "learning_rate": 1.8605222530451354e-05, + "loss": 2.2603, + "step": 6995 + }, + { + "epoch": 1.1973301386274173, + "grad_norm": 18.754140853881836, + "learning_rate": 1.8594037474064767e-05, + "loss": 1.5199, + "step": 6996 + }, + { + "epoch": 1.1975012835871983, + "grad_norm": 19.000524520874023, + "learning_rate": 1.858285029719072e-05, + "loss": 1.7049, + "step": 6997 + }, + { + 
"epoch": 1.1976724285469793, + "grad_norm": 15.121638298034668, + "learning_rate": 1.857166100642966e-05, + "loss": 1.1847, + "step": 6998 + }, + { + "epoch": 1.1978435735067603, + "grad_norm": 0.5012795329093933, + "learning_rate": 1.8560469608383293e-05, + "loss": 0.1359, + "step": 6999 + }, + { + "epoch": 1.198014718466541, + "grad_norm": 35.48890686035156, + "learning_rate": 1.854927610965455e-05, + "loss": 5.816, + "step": 7000 + }, + { + "epoch": 1.198185863426322, + "grad_norm": 2.8732776641845703, + "learning_rate": 1.853808051684761e-05, + "loss": 0.2891, + "step": 7001 + }, + { + "epoch": 1.198357008386103, + "grad_norm": 3.7505767345428467, + "learning_rate": 1.8526882836567904e-05, + "loss": 0.3253, + "step": 7002 + }, + { + "epoch": 1.198528153345884, + "grad_norm": 19.846040725708008, + "learning_rate": 1.8515683075422066e-05, + "loss": 1.6157, + "step": 7003 + }, + { + "epoch": 1.198699298305665, + "grad_norm": 10.942571640014648, + "learning_rate": 1.8504481240017977e-05, + "loss": 0.9332, + "step": 7004 + }, + { + "epoch": 1.1988704432654458, + "grad_norm": 0.8276537656784058, + "learning_rate": 1.849327733696474e-05, + "loss": 0.1593, + "step": 7005 + }, + { + "epoch": 1.1990415882252268, + "grad_norm": 6.18898344039917, + "learning_rate": 1.8482071372872673e-05, + "loss": 0.4576, + "step": 7006 + }, + { + "epoch": 1.1992127331850078, + "grad_norm": 22.084545135498047, + "learning_rate": 1.84708633543533e-05, + "loss": 1.9511, + "step": 7007 + }, + { + "epoch": 1.1993838781447885, + "grad_norm": 33.36897659301758, + "learning_rate": 1.8459653288019385e-05, + "loss": 1.3345, + "step": 7008 + }, + { + "epoch": 1.1995550231045695, + "grad_norm": 14.181174278259277, + "learning_rate": 1.8448441180484876e-05, + "loss": 1.3218, + "step": 7009 + }, + { + "epoch": 1.1997261680643505, + "grad_norm": 1.7246726751327515, + "learning_rate": 1.8437227038364935e-05, + "loss": 0.1508, + "step": 7010 + }, + { + "epoch": 1.1998973130241315, + "grad_norm": 14.34754467010498, + "learning_rate": 1.842601086827592e-05, + "loss": 1.3846, + "step": 7011 + }, + { + "epoch": 1.2000684579839125, + "grad_norm": 24.781330108642578, + "learning_rate": 1.8414792676835392e-05, + "loss": 5.2621, + "step": 7012 + }, + { + "epoch": 1.2002396029436933, + "grad_norm": 2.6087961196899414, + "learning_rate": 1.8403572470662098e-05, + "loss": 0.2925, + "step": 7013 + }, + { + "epoch": 1.2004107479034742, + "grad_norm": 15.484484672546387, + "learning_rate": 1.8392350256375975e-05, + "loss": 1.1175, + "step": 7014 + }, + { + "epoch": 1.2005818928632552, + "grad_norm": 9.616670608520508, + "learning_rate": 1.838112604059815e-05, + "loss": 0.7138, + "step": 7015 + }, + { + "epoch": 1.200753037823036, + "grad_norm": 16.37866973876953, + "learning_rate": 1.8369899829950928e-05, + "loss": 1.4261, + "step": 7016 + }, + { + "epoch": 1.200753037823036, + "eval_nli-pairs_loss": 1.4953010082244873, + "eval_nli-pairs_runtime": 4.5627, + "eval_nli-pairs_samples_per_second": 43.833, + "eval_nli-pairs_steps_per_second": 1.534, + "eval_sts-test_pearson_cosine": 0.7630899675412521, + "eval_sts-test_pearson_dot": 0.6254421112157904, + "eval_sts-test_pearson_euclidean": 0.7580382057295076, + "eval_sts-test_pearson_manhattan": 0.7623571616238837, + "eval_sts-test_pearson_max": 0.7630899675412521, + "eval_sts-test_spearman_cosine": 0.7635056711829842, + "eval_sts-test_spearman_dot": 0.6093963604051945, + "eval_sts-test_spearman_euclidean": 0.7487226311935559, + "eval_sts-test_spearman_manhattan": 0.7547159555492929, + 
"eval_sts-test_spearman_max": 0.7635056711829842, + "step": 7016 + }, + { + "epoch": 1.200753037823036, + "eval_vitaminc-pairs_loss": 0.8603028655052185, + "eval_vitaminc-pairs_runtime": 2.8678, + "eval_vitaminc-pairs_samples_per_second": 69.74, + "eval_vitaminc-pairs_steps_per_second": 2.441, + "step": 7016 + }, + { + "epoch": 1.200753037823036, + "eval_qnli-contrastive_loss": 1.7720210552215576, + "eval_qnli-contrastive_runtime": 0.7553, + "eval_qnli-contrastive_samples_per_second": 264.8, + "eval_qnli-contrastive_steps_per_second": 9.268, + "step": 7016 + }, + { + "epoch": 1.200753037823036, + "eval_scitail-pairs-qa_loss": 0.12096086144447327, + "eval_scitail-pairs-qa_runtime": 1.8473, + "eval_scitail-pairs-qa_samples_per_second": 108.263, + "eval_scitail-pairs-qa_steps_per_second": 3.789, + "step": 7016 + }, + { + "epoch": 1.200753037823036, + "eval_scitail-pairs-pos_loss": 0.6861357092857361, + "eval_scitail-pairs-pos_runtime": 2.8905, + "eval_scitail-pairs-pos_samples_per_second": 69.193, + "eval_scitail-pairs-pos_steps_per_second": 2.422, + "step": 7016 + }, + { + "epoch": 1.200753037823036, + "eval_xsum-pairs_loss": 0.8077878952026367, + "eval_xsum-pairs_runtime": 2.6762, + "eval_xsum-pairs_samples_per_second": 65.392, + "eval_xsum-pairs_steps_per_second": 2.242, + "step": 7016 + }, + { + "epoch": 1.200753037823036, + "eval_compression-pairs_loss": 0.26878783106803894, + "eval_compression-pairs_runtime": 0.5264, + "eval_compression-pairs_samples_per_second": 379.949, + "eval_compression-pairs_steps_per_second": 13.298, + "step": 7016 + }, + { + "epoch": 1.200753037823036, + "eval_sciq_pairs_loss": 0.4642964005470276, + "eval_sciq_pairs_runtime": 9.6271, + "eval_sciq_pairs_samples_per_second": 20.775, + "eval_sciq_pairs_steps_per_second": 0.727, + "step": 7016 + }, + { + "epoch": 1.200753037823036, + "eval_qasc_pairs_loss": 5.549177169799805, + "eval_qasc_pairs_runtime": 2.7374, + "eval_qasc_pairs_samples_per_second": 73.063, + "eval_qasc_pairs_steps_per_second": 2.557, + "step": 7016 + }, + { + "epoch": 1.200753037823036, + "eval_openbookqa_pairs_loss": 2.5830934047698975, + "eval_openbookqa_pairs_runtime": 0.6669, + "eval_openbookqa_pairs_samples_per_second": 103.471, + "eval_openbookqa_pairs_steps_per_second": 4.499, + "step": 7016 + }, + { + "epoch": 1.200753037823036, + "eval_msmarco_pairs_loss": 1.228713870048523, + "eval_msmarco_pairs_runtime": 4.1215, + "eval_msmarco_pairs_samples_per_second": 48.526, + "eval_msmarco_pairs_steps_per_second": 1.698, + "step": 7016 + }, + { + "epoch": 1.200753037823036, + "eval_nq_pairs_loss": 1.4215295314788818, + "eval_nq_pairs_runtime": 8.7787, + "eval_nq_pairs_samples_per_second": 22.782, + "eval_nq_pairs_steps_per_second": 0.797, + "step": 7016 + }, + { + "epoch": 1.200753037823036, + "eval_trivia_pairs_loss": 1.794838309288025, + "eval_trivia_pairs_runtime": 12.9923, + "eval_trivia_pairs_samples_per_second": 15.394, + "eval_trivia_pairs_steps_per_second": 0.539, + "step": 7016 + }, + { + "epoch": 1.200753037823036, + "eval_quora_pairs_loss": 0.23021991550922394, + "eval_quora_pairs_runtime": 1.5922, + "eval_quora_pairs_samples_per_second": 125.616, + "eval_quora_pairs_steps_per_second": 4.397, + "step": 7016 + }, + { + "epoch": 1.200753037823036, + "eval_gooaq_pairs_loss": 0.9434043765068054, + "eval_gooaq_pairs_runtime": 2.665, + "eval_gooaq_pairs_samples_per_second": 75.047, + "eval_gooaq_pairs_steps_per_second": 2.627, + "step": 7016 + }, + { + "epoch": 1.200924182782817, + "grad_norm": 17.212543487548828, + "learning_rate": 
1.835867163105778e-05, + "loss": 1.3922, + "step": 7017 + }, + { + "epoch": 1.201095327742598, + "grad_norm": 5.500203609466553, + "learning_rate": 1.8347441450543373e-05, + "loss": 0.4196, + "step": 7018 + }, + { + "epoch": 1.201266472702379, + "grad_norm": 21.175825119018555, + "learning_rate": 1.833620929503352e-05, + "loss": 2.5103, + "step": 7019 + }, + { + "epoch": 1.20143761766216, + "grad_norm": 23.716127395629883, + "learning_rate": 1.8324975171155218e-05, + "loss": 2.388, + "step": 7020 + }, + { + "epoch": 1.2016087626219407, + "grad_norm": 22.948490142822266, + "learning_rate": 1.8313739085536613e-05, + "loss": 3.5063, + "step": 7021 + }, + { + "epoch": 1.2017799075817217, + "grad_norm": 21.151857376098633, + "learning_rate": 1.830250104480701e-05, + "loss": 2.0793, + "step": 7022 + }, + { + "epoch": 1.2019510525415027, + "grad_norm": 6.868299961090088, + "learning_rate": 1.8291261055596863e-05, + "loss": 0.6836, + "step": 7023 + }, + { + "epoch": 1.2021221975012835, + "grad_norm": 11.07442569732666, + "learning_rate": 1.82800191245378e-05, + "loss": 1.1184, + "step": 7024 + }, + { + "epoch": 1.2022933424610645, + "grad_norm": 17.20339012145996, + "learning_rate": 1.8268775258262567e-05, + "loss": 1.8645, + "step": 7025 + }, + { + "epoch": 1.2024644874208454, + "grad_norm": 8.099205017089844, + "learning_rate": 1.8257529463405063e-05, + "loss": 1.0813, + "step": 7026 + }, + { + "epoch": 1.2026356323806264, + "grad_norm": 10.576147079467773, + "learning_rate": 1.8246281746600325e-05, + "loss": 1.1647, + "step": 7027 + }, + { + "epoch": 1.2028067773404074, + "grad_norm": 23.977031707763672, + "learning_rate": 1.823503211448451e-05, + "loss": 2.1884, + "step": 7028 + }, + { + "epoch": 1.2029779223001882, + "grad_norm": 17.264558792114258, + "learning_rate": 1.8223780573694943e-05, + "loss": 1.5955, + "step": 7029 + }, + { + "epoch": 1.2031490672599692, + "grad_norm": 3.9181175231933594, + "learning_rate": 1.8212527130870032e-05, + "loss": 0.3104, + "step": 7030 + }, + { + "epoch": 1.2033202122197502, + "grad_norm": 28.664770126342773, + "learning_rate": 1.820127179264933e-05, + "loss": 5.3973, + "step": 7031 + }, + { + "epoch": 1.2034913571795312, + "grad_norm": 15.21064281463623, + "learning_rate": 1.81900145656735e-05, + "loss": 1.5112, + "step": 7032 + }, + { + "epoch": 1.203662502139312, + "grad_norm": 86.03211975097656, + "learning_rate": 1.8178755456584325e-05, + "loss": 7.2344, + "step": 7033 + }, + { + "epoch": 1.203833647099093, + "grad_norm": 3.1570894718170166, + "learning_rate": 1.8167494472024694e-05, + "loss": 0.2625, + "step": 7034 + }, + { + "epoch": 1.204004792058874, + "grad_norm": 14.07559585571289, + "learning_rate": 1.815623161863861e-05, + "loss": 1.3018, + "step": 7035 + }, + { + "epoch": 1.2041759370186549, + "grad_norm": 17.405441284179688, + "learning_rate": 1.814496690307117e-05, + "loss": 1.9948, + "step": 7036 + }, + { + "epoch": 1.2043470819784357, + "grad_norm": 4.415936470031738, + "learning_rate": 1.8133700331968572e-05, + "loss": 0.3861, + "step": 7037 + }, + { + "epoch": 1.2045182269382166, + "grad_norm": 18.057254791259766, + "learning_rate": 1.8122431911978114e-05, + "loss": 1.4661, + "step": 7038 + }, + { + "epoch": 1.2046893718979976, + "grad_norm": 9.419878005981445, + "learning_rate": 1.811116164974817e-05, + "loss": 0.5912, + "step": 7039 + }, + { + "epoch": 1.2048605168577786, + "grad_norm": 4.697745323181152, + "learning_rate": 1.809988955192822e-05, + "loss": 0.3498, + "step": 7040 + }, + { + "epoch": 1.2050316618175594, + "grad_norm": 
13.607497215270996, + "learning_rate": 1.808861562516882e-05, + "loss": 1.0024, + "step": 7041 + }, + { + "epoch": 1.2052028067773404, + "grad_norm": 17.339616775512695, + "learning_rate": 1.8077339876121604e-05, + "loss": 1.295, + "step": 7042 + }, + { + "epoch": 1.2053739517371214, + "grad_norm": 21.256227493286133, + "learning_rate": 1.8066062311439275e-05, + "loss": 1.472, + "step": 7043 + }, + { + "epoch": 1.2055450966969024, + "grad_norm": 13.709125518798828, + "learning_rate": 1.8054782937775607e-05, + "loss": 0.9876, + "step": 7044 + }, + { + "epoch": 1.2057162416566831, + "grad_norm": 21.412246704101562, + "learning_rate": 1.804350176178546e-05, + "loss": 1.8479, + "step": 7045 + }, + { + "epoch": 1.205887386616464, + "grad_norm": 15.640390396118164, + "learning_rate": 1.803221879012475e-05, + "loss": 1.6526, + "step": 7046 + }, + { + "epoch": 1.206058531576245, + "grad_norm": 22.768157958984375, + "learning_rate": 1.802093402945043e-05, + "loss": 1.0484, + "step": 7047 + }, + { + "epoch": 1.206229676536026, + "grad_norm": 12.367416381835938, + "learning_rate": 1.8009647486420535e-05, + "loss": 1.1592, + "step": 7048 + }, + { + "epoch": 1.2064008214958069, + "grad_norm": 11.553193092346191, + "learning_rate": 1.799835916769414e-05, + "loss": 0.9264, + "step": 7049 + }, + { + "epoch": 1.2065719664555878, + "grad_norm": 16.40020751953125, + "learning_rate": 1.7987069079931363e-05, + "loss": 1.5556, + "step": 7050 + }, + { + "epoch": 1.2067431114153688, + "grad_norm": 15.786770820617676, + "learning_rate": 1.7975777229793386e-05, + "loss": 1.2909, + "step": 7051 + }, + { + "epoch": 1.2069142563751498, + "grad_norm": 23.682838439941406, + "learning_rate": 1.7964483623942413e-05, + "loss": 2.1692, + "step": 7052 + }, + { + "epoch": 1.2070854013349308, + "grad_norm": 17.22081184387207, + "learning_rate": 1.7953188269041686e-05, + "loss": 1.985, + "step": 7053 + }, + { + "epoch": 1.2072565462947116, + "grad_norm": 33.2260627746582, + "learning_rate": 1.794189117175548e-05, + "loss": 5.3859, + "step": 7054 + }, + { + "epoch": 1.2074276912544926, + "grad_norm": 12.188201904296875, + "learning_rate": 1.7930592338749095e-05, + "loss": 1.0235, + "step": 7055 + }, + { + "epoch": 1.2075988362142736, + "grad_norm": 20.444673538208008, + "learning_rate": 1.7919291776688875e-05, + "loss": 2.1585, + "step": 7056 + }, + { + "epoch": 1.2077699811740543, + "grad_norm": 19.24138832092285, + "learning_rate": 1.7907989492242157e-05, + "loss": 1.724, + "step": 7057 + }, + { + "epoch": 1.2079411261338353, + "grad_norm": 3.7893197536468506, + "learning_rate": 1.7896685492077302e-05, + "loss": 0.3529, + "step": 7058 + }, + { + "epoch": 1.2081122710936163, + "grad_norm": 12.841107368469238, + "learning_rate": 1.7885379782863695e-05, + "loss": 1.0369, + "step": 7059 + }, + { + "epoch": 1.2082834160533973, + "grad_norm": 0.8358324766159058, + "learning_rate": 1.7874072371271714e-05, + "loss": 0.1581, + "step": 7060 + }, + { + "epoch": 1.2084545610131783, + "grad_norm": 10.016582489013672, + "learning_rate": 1.786276326397276e-05, + "loss": 0.8467, + "step": 7061 + }, + { + "epoch": 1.208625705972959, + "grad_norm": 14.59321403503418, + "learning_rate": 1.7851452467639218e-05, + "loss": 1.0185, + "step": 7062 + }, + { + "epoch": 1.20879685093274, + "grad_norm": 18.010021209716797, + "learning_rate": 1.784013998894447e-05, + "loss": 1.4267, + "step": 7063 + }, + { + "epoch": 1.208967995892521, + "grad_norm": 15.748735427856445, + "learning_rate": 1.7828825834562897e-05, + "loss": 1.1944, + "step": 7064 + }, + { + 
"epoch": 1.2091391408523018, + "grad_norm": 14.062028884887695, + "learning_rate": 1.7817510011169865e-05, + "loss": 1.2241, + "step": 7065 + }, + { + "epoch": 1.2093102858120828, + "grad_norm": 18.22197151184082, + "learning_rate": 1.7806192525441734e-05, + "loss": 1.3791, + "step": 7066 + }, + { + "epoch": 1.2094814307718638, + "grad_norm": 6.379945755004883, + "learning_rate": 1.7794873384055832e-05, + "loss": 0.4092, + "step": 7067 + }, + { + "epoch": 1.2096525757316448, + "grad_norm": 2.92317795753479, + "learning_rate": 1.778355259369047e-05, + "loss": 0.2682, + "step": 7068 + }, + { + "epoch": 1.2098237206914257, + "grad_norm": 19.18209457397461, + "learning_rate": 1.7772230161024935e-05, + "loss": 1.602, + "step": 7069 + }, + { + "epoch": 1.2099948656512065, + "grad_norm": 13.44565486907959, + "learning_rate": 1.776090609273947e-05, + "loss": 1.0468, + "step": 7070 + }, + { + "epoch": 1.2101660106109875, + "grad_norm": 20.34478187561035, + "learning_rate": 1.77495803955153e-05, + "loss": 2.189, + "step": 7071 + }, + { + "epoch": 1.2103371555707685, + "grad_norm": 2.485504388809204, + "learning_rate": 1.7738253076034604e-05, + "loss": 0.2954, + "step": 7072 + }, + { + "epoch": 1.2105083005305493, + "grad_norm": 17.274112701416016, + "learning_rate": 1.7726924140980513e-05, + "loss": 1.3465, + "step": 7073 + }, + { + "epoch": 1.2106794454903302, + "grad_norm": 13.669708251953125, + "learning_rate": 1.771559359703712e-05, + "loss": 1.1574, + "step": 7074 + }, + { + "epoch": 1.2108505904501112, + "grad_norm": 26.553890228271484, + "learning_rate": 1.7704261450889457e-05, + "loss": 1.2326, + "step": 7075 + }, + { + "epoch": 1.2110217354098922, + "grad_norm": 20.33255386352539, + "learning_rate": 1.769292770922351e-05, + "loss": 1.7637, + "step": 7076 + }, + { + "epoch": 1.2111928803696732, + "grad_norm": 13.613040924072266, + "learning_rate": 1.7681592378726203e-05, + "loss": 1.3363, + "step": 7077 + }, + { + "epoch": 1.211364025329454, + "grad_norm": 14.0830078125, + "learning_rate": 1.76702554660854e-05, + "loss": 1.0899, + "step": 7078 + }, + { + "epoch": 1.211535170289235, + "grad_norm": 12.427796363830566, + "learning_rate": 1.7658916977989897e-05, + "loss": 0.8537, + "step": 7079 + }, + { + "epoch": 1.211706315249016, + "grad_norm": 55.171695709228516, + "learning_rate": 1.7647576921129415e-05, + "loss": 6.3338, + "step": 7080 + }, + { + "epoch": 1.211877460208797, + "grad_norm": 23.50286293029785, + "learning_rate": 1.7636235302194604e-05, + "loss": 2.6936, + "step": 7081 + }, + { + "epoch": 1.2120486051685777, + "grad_norm": 14.088695526123047, + "learning_rate": 1.762489212787704e-05, + "loss": 1.2116, + "step": 7082 + }, + { + "epoch": 1.2122197501283587, + "grad_norm": 5.875244617462158, + "learning_rate": 1.7613547404869208e-05, + "loss": 0.4451, + "step": 7083 + }, + { + "epoch": 1.2123908950881397, + "grad_norm": 5.248507022857666, + "learning_rate": 1.7602201139864518e-05, + "loss": 0.3901, + "step": 7084 + }, + { + "epoch": 1.2125620400479207, + "grad_norm": 16.395782470703125, + "learning_rate": 1.7590853339557276e-05, + "loss": 1.8082, + "step": 7085 + }, + { + "epoch": 1.2127331850077014, + "grad_norm": 11.277172088623047, + "learning_rate": 1.757950401064271e-05, + "loss": 0.9103, + "step": 7086 + }, + { + "epoch": 1.2129043299674824, + "grad_norm": 16.459196090698242, + "learning_rate": 1.756815315981693e-05, + "loss": 1.3341, + "step": 7087 + }, + { + "epoch": 1.2130754749272634, + "grad_norm": 23.725290298461914, + "learning_rate": 1.755680079377696e-05, + "loss": 
2.6328, + "step": 7088 + }, + { + "epoch": 1.2132466198870444, + "grad_norm": 18.98377227783203, + "learning_rate": 1.754544691922072e-05, + "loss": 1.3258, + "step": 7089 + }, + { + "epoch": 1.2134177648468252, + "grad_norm": 18.941240310668945, + "learning_rate": 1.7534091542847005e-05, + "loss": 1.8892, + "step": 7090 + }, + { + "epoch": 1.2135889098066062, + "grad_norm": 0.5720072984695435, + "learning_rate": 1.75227346713555e-05, + "loss": 0.146, + "step": 7091 + }, + { + "epoch": 1.2137600547663872, + "grad_norm": 10.331928253173828, + "learning_rate": 1.7511376311446785e-05, + "loss": 0.7565, + "step": 7092 + }, + { + "epoch": 1.2139311997261681, + "grad_norm": 14.877519607543945, + "learning_rate": 1.75000164698223e-05, + "loss": 1.0429, + "step": 7093 + }, + { + "epoch": 1.214102344685949, + "grad_norm": 23.970779418945312, + "learning_rate": 1.748865515318438e-05, + "loss": 3.479, + "step": 7094 + }, + { + "epoch": 1.21427348964573, + "grad_norm": 16.405920028686523, + "learning_rate": 1.7477292368236214e-05, + "loss": 1.2732, + "step": 7095 + }, + { + "epoch": 1.2144446346055109, + "grad_norm": 10.476763725280762, + "learning_rate": 1.7465928121681858e-05, + "loss": 0.9867, + "step": 7096 + }, + { + "epoch": 1.2146157795652919, + "grad_norm": 7.040361404418945, + "learning_rate": 1.7454562420226242e-05, + "loss": 0.682, + "step": 7097 + }, + { + "epoch": 1.2147869245250726, + "grad_norm": 22.748382568359375, + "learning_rate": 1.7443195270575136e-05, + "loss": 2.2859, + "step": 7098 + }, + { + "epoch": 1.2149580694848536, + "grad_norm": 15.852254867553711, + "learning_rate": 1.7431826679435186e-05, + "loss": 1.3293, + "step": 7099 + }, + { + "epoch": 1.2151292144446346, + "grad_norm": 14.472835540771484, + "learning_rate": 1.742045665351387e-05, + "loss": 1.1619, + "step": 7100 + }, + { + "epoch": 1.2153003594044156, + "grad_norm": 29.56837272644043, + "learning_rate": 1.7409085199519524e-05, + "loss": 6.0521, + "step": 7101 + }, + { + "epoch": 1.2154715043641966, + "grad_norm": 21.541980743408203, + "learning_rate": 1.7397712324161322e-05, + "loss": 2.0102, + "step": 7102 + }, + { + "epoch": 1.2156426493239774, + "grad_norm": 17.5698184967041, + "learning_rate": 1.7386338034149276e-05, + "loss": 1.315, + "step": 7103 + }, + { + "epoch": 1.2158137942837584, + "grad_norm": 17.530529022216797, + "learning_rate": 1.737496233619424e-05, + "loss": 1.2779, + "step": 7104 + }, + { + "epoch": 1.2159849392435393, + "grad_norm": 9.821117401123047, + "learning_rate": 1.7363585237007886e-05, + "loss": 0.8769, + "step": 7105 + }, + { + "epoch": 1.21615608420332, + "grad_norm": 0.9964603185653687, + "learning_rate": 1.735220674330272e-05, + "loss": 0.1617, + "step": 7106 + }, + { + "epoch": 1.216327229163101, + "grad_norm": 18.4915714263916, + "learning_rate": 1.7340826861792072e-05, + "loss": 1.6341, + "step": 7107 + }, + { + "epoch": 1.216498374122882, + "grad_norm": 109.80364227294922, + "learning_rate": 1.732944559919009e-05, + "loss": 8.1499, + "step": 7108 + }, + { + "epoch": 1.216669519082663, + "grad_norm": 22.95547866821289, + "learning_rate": 1.7318062962211734e-05, + "loss": 2.3982, + "step": 7109 + }, + { + "epoch": 1.216840664042444, + "grad_norm": 15.885193824768066, + "learning_rate": 1.7306678957572778e-05, + "loss": 0.9601, + "step": 7110 + }, + { + "epoch": 1.2170118090022248, + "grad_norm": 13.969958305358887, + "learning_rate": 1.72952935919898e-05, + "loss": 1.2441, + "step": 7111 + }, + { + "epoch": 1.2171829539620058, + "grad_norm": 0.7053208947181702, + 
"learning_rate": 1.7283906872180185e-05, + "loss": 0.1422, + "step": 7112 + }, + { + "epoch": 1.2173540989217868, + "grad_norm": 20.290607452392578, + "learning_rate": 1.7272518804862115e-05, + "loss": 2.6781, + "step": 7113 + }, + { + "epoch": 1.2175252438815676, + "grad_norm": 4.332746982574463, + "learning_rate": 1.726112939675456e-05, + "loss": 0.3709, + "step": 7114 + }, + { + "epoch": 1.2176963888413486, + "grad_norm": 16.232742309570312, + "learning_rate": 1.72497386545773e-05, + "loss": 1.2398, + "step": 7115 + }, + { + "epoch": 1.2178675338011296, + "grad_norm": 12.56082534790039, + "learning_rate": 1.7238346585050878e-05, + "loss": 1.0542, + "step": 7116 + }, + { + "epoch": 1.2180386787609105, + "grad_norm": 18.8144474029541, + "learning_rate": 1.722695319489664e-05, + "loss": 1.6548, + "step": 7117 + }, + { + "epoch": 1.2182098237206915, + "grad_norm": 14.830066680908203, + "learning_rate": 1.7215558490836708e-05, + "loss": 1.4143, + "step": 7118 + }, + { + "epoch": 1.2183809686804723, + "grad_norm": 14.560158729553223, + "learning_rate": 1.7204162479593954e-05, + "loss": 0.8984, + "step": 7119 + }, + { + "epoch": 1.2185521136402533, + "grad_norm": 19.058626174926758, + "learning_rate": 1.7192765167892057e-05, + "loss": 1.8648, + "step": 7120 + }, + { + "epoch": 1.2187232586000343, + "grad_norm": 21.232755661010742, + "learning_rate": 1.7181366562455445e-05, + "loss": 1.7785, + "step": 7121 + }, + { + "epoch": 1.218894403559815, + "grad_norm": 19.12632942199707, + "learning_rate": 1.7169966670009313e-05, + "loss": 1.607, + "step": 7122 + }, + { + "epoch": 1.219065548519596, + "grad_norm": 4.814243316650391, + "learning_rate": 1.7158565497279616e-05, + "loss": 0.3387, + "step": 7123 + }, + { + "epoch": 1.219236693479377, + "grad_norm": 13.105286598205566, + "learning_rate": 1.714716305099306e-05, + "loss": 0.941, + "step": 7124 + }, + { + "epoch": 1.219407838439158, + "grad_norm": 22.902633666992188, + "learning_rate": 1.7135759337877103e-05, + "loss": 2.8356, + "step": 7125 + }, + { + "epoch": 1.219578983398939, + "grad_norm": 15.38435173034668, + "learning_rate": 1.7124354364659955e-05, + "loss": 0.9884, + "step": 7126 + }, + { + "epoch": 1.2197501283587198, + "grad_norm": 12.530359268188477, + "learning_rate": 1.711294813807057e-05, + "loss": 1.004, + "step": 7127 + }, + { + "epoch": 1.2199212733185008, + "grad_norm": 0.6558112502098083, + "learning_rate": 1.7101540664838635e-05, + "loss": 0.1397, + "step": 7128 + }, + { + "epoch": 1.2200924182782817, + "grad_norm": 13.729168891906738, + "learning_rate": 1.7090131951694577e-05, + "loss": 1.0953, + "step": 7129 + }, + { + "epoch": 1.2202635632380627, + "grad_norm": 6.121183395385742, + "learning_rate": 1.7078722005369552e-05, + "loss": 0.3995, + "step": 7130 + }, + { + "epoch": 1.2204347081978435, + "grad_norm": 19.05879020690918, + "learning_rate": 1.7067310832595453e-05, + "loss": 2.6498, + "step": 7131 + }, + { + "epoch": 1.2206058531576245, + "grad_norm": 8.5386962890625, + "learning_rate": 1.705589844010488e-05, + "loss": 1.0223, + "step": 7132 + }, + { + "epoch": 1.2207769981174055, + "grad_norm": 10.45544719696045, + "learning_rate": 1.7044484834631167e-05, + "loss": 1.5029, + "step": 7133 + }, + { + "epoch": 1.2209481430771865, + "grad_norm": 17.10525894165039, + "learning_rate": 1.703307002290836e-05, + "loss": 1.3586, + "step": 7134 + }, + { + "epoch": 1.2211192880369672, + "grad_norm": 4.275002956390381, + "learning_rate": 1.70216540116712e-05, + "loss": 0.3468, + "step": 7135 + }, + { + "epoch": 1.2212904329967482, 
+ "grad_norm": 16.066173553466797, + "learning_rate": 1.7010236807655172e-05, + "loss": 1.4318, + "step": 7136 + }, + { + "epoch": 1.2214615779565292, + "grad_norm": 24.005477905273438, + "learning_rate": 1.699881841759643e-05, + "loss": 3.3863, + "step": 7137 + }, + { + "epoch": 1.2216327229163102, + "grad_norm": 18.728178024291992, + "learning_rate": 1.6987398848231845e-05, + "loss": 1.5824, + "step": 7138 + }, + { + "epoch": 1.221803867876091, + "grad_norm": 12.951663970947266, + "learning_rate": 1.6975978106298984e-05, + "loss": 1.1184, + "step": 7139 + }, + { + "epoch": 1.221975012835872, + "grad_norm": 10.898070335388184, + "learning_rate": 1.6964556198536093e-05, + "loss": 1.1159, + "step": 7140 + }, + { + "epoch": 1.222146157795653, + "grad_norm": 19.487085342407227, + "learning_rate": 1.6953133131682116e-05, + "loss": 2.0669, + "step": 7141 + }, + { + "epoch": 1.222317302755434, + "grad_norm": 20.508235931396484, + "learning_rate": 1.6941708912476684e-05, + "loss": 1.7454, + "step": 7142 + }, + { + "epoch": 1.2224884477152147, + "grad_norm": 15.410446166992188, + "learning_rate": 1.6930283547660102e-05, + "loss": 1.3581, + "step": 7143 + }, + { + "epoch": 1.2226595926749957, + "grad_norm": 22.360820770263672, + "learning_rate": 1.6918857043973357e-05, + "loss": 2.0329, + "step": 7144 + }, + { + "epoch": 1.2228307376347767, + "grad_norm": 97.12826538085938, + "learning_rate": 1.6907429408158092e-05, + "loss": 8.4468, + "step": 7145 + }, + { + "epoch": 1.2230018825945577, + "grad_norm": 9.912070274353027, + "learning_rate": 1.6896000646956632e-05, + "loss": 0.7416, + "step": 7146 + }, + { + "epoch": 1.2231730275543384, + "grad_norm": 21.225231170654297, + "learning_rate": 1.6884570767111972e-05, + "loss": 1.7576, + "step": 7147 + }, + { + "epoch": 1.2233441725141194, + "grad_norm": 23.260570526123047, + "learning_rate": 1.6873139775367752e-05, + "loss": 3.2608, + "step": 7148 + }, + { + "epoch": 1.2235153174739004, + "grad_norm": 18.95728874206543, + "learning_rate": 1.6861707678468275e-05, + "loss": 1.7623, + "step": 7149 + }, + { + "epoch": 1.2236864624336814, + "grad_norm": 20.349519729614258, + "learning_rate": 1.6850274483158495e-05, + "loss": 1.5536, + "step": 7150 + }, + { + "epoch": 1.2238576073934622, + "grad_norm": 2.7749392986297607, + "learning_rate": 1.683884019618401e-05, + "loss": 0.2806, + "step": 7151 + }, + { + "epoch": 1.2240287523532432, + "grad_norm": 5.438803195953369, + "learning_rate": 1.682740482429107e-05, + "loss": 0.3878, + "step": 7152 + }, + { + "epoch": 1.2241998973130241, + "grad_norm": 4.200693130493164, + "learning_rate": 1.6815968374226565e-05, + "loss": 0.3669, + "step": 7153 + }, + { + "epoch": 1.2243710422728051, + "grad_norm": 13.428351402282715, + "learning_rate": 1.6804530852738016e-05, + "loss": 1.1932, + "step": 7154 + }, + { + "epoch": 1.224542187232586, + "grad_norm": 11.479659080505371, + "learning_rate": 1.6793092266573576e-05, + "loss": 1.0441, + "step": 7155 + }, + { + "epoch": 1.2247133321923669, + "grad_norm": 7.372691631317139, + "learning_rate": 1.6781652622482024e-05, + "loss": 0.4833, + "step": 7156 + }, + { + "epoch": 1.2248844771521479, + "grad_norm": 11.623597145080566, + "learning_rate": 1.677021192721277e-05, + "loss": 0.8838, + "step": 7157 + }, + { + "epoch": 1.2250556221119289, + "grad_norm": 1.917174220085144, + "learning_rate": 1.675877018751584e-05, + "loss": 0.2578, + "step": 7158 + }, + { + "epoch": 1.2252267670717099, + "grad_norm": 4.783413887023926, + "learning_rate": 1.674732741014189e-05, + "loss": 0.3817, + 
"step": 7159 + }, + { + "epoch": 1.2253979120314906, + "grad_norm": 19.55664825439453, + "learning_rate": 1.673588360184216e-05, + "loss": 1.3781, + "step": 7160 + }, + { + "epoch": 1.2255690569912716, + "grad_norm": 9.297699928283691, + "learning_rate": 1.6724438769368523e-05, + "loss": 0.5528, + "step": 7161 + }, + { + "epoch": 1.2257402019510526, + "grad_norm": 5.512182235717773, + "learning_rate": 1.6712992919473437e-05, + "loss": 0.7764, + "step": 7162 + }, + { + "epoch": 1.2259113469108334, + "grad_norm": 13.277100563049316, + "learning_rate": 1.6701546058909978e-05, + "loss": 1.1193, + "step": 7163 + }, + { + "epoch": 1.2260824918706144, + "grad_norm": 14.236153602600098, + "learning_rate": 1.6690098194431815e-05, + "loss": 1.0366, + "step": 7164 + }, + { + "epoch": 1.2262536368303953, + "grad_norm": 16.89085578918457, + "learning_rate": 1.66786493327932e-05, + "loss": 1.3351, + "step": 7165 + }, + { + "epoch": 1.2264247817901763, + "grad_norm": 4.800353527069092, + "learning_rate": 1.6667199480748975e-05, + "loss": 0.3385, + "step": 7166 + }, + { + "epoch": 1.2265959267499573, + "grad_norm": 16.8074893951416, + "learning_rate": 1.6655748645054575e-05, + "loss": 1.3478, + "step": 7167 + }, + { + "epoch": 1.226767071709738, + "grad_norm": 0.6463353037834167, + "learning_rate": 1.6644296832466e-05, + "loss": 0.1442, + "step": 7168 + }, + { + "epoch": 1.226938216669519, + "grad_norm": 13.736554145812988, + "learning_rate": 1.6632844049739856e-05, + "loss": 0.979, + "step": 7169 + }, + { + "epoch": 1.2271093616293, + "grad_norm": 18.092864990234375, + "learning_rate": 1.6621390303633287e-05, + "loss": 1.633, + "step": 7170 + }, + { + "epoch": 1.2272805065890808, + "grad_norm": 14.763882637023926, + "learning_rate": 1.6609935600904025e-05, + "loss": 1.2596, + "step": 7171 + }, + { + "epoch": 1.2274516515488618, + "grad_norm": 8.309615135192871, + "learning_rate": 1.659847994831036e-05, + "loss": 0.9655, + "step": 7172 + }, + { + "epoch": 1.2276227965086428, + "grad_norm": 12.788700103759766, + "learning_rate": 1.6587023352611137e-05, + "loss": 1.027, + "step": 7173 + }, + { + "epoch": 1.2277939414684238, + "grad_norm": 8.727684020996094, + "learning_rate": 1.657556582056578e-05, + "loss": 0.7562, + "step": 7174 + }, + { + "epoch": 1.2279650864282048, + "grad_norm": 18.728055953979492, + "learning_rate": 1.6564107358934242e-05, + "loss": 1.4026, + "step": 7175 + }, + { + "epoch": 1.2281362313879856, + "grad_norm": 19.143850326538086, + "learning_rate": 1.655264797447703e-05, + "loss": 1.6007, + "step": 7176 + }, + { + "epoch": 1.2283073763477665, + "grad_norm": 12.161046028137207, + "learning_rate": 1.6541187673955203e-05, + "loss": 0.929, + "step": 7177 + }, + { + "epoch": 1.2284785213075475, + "grad_norm": 6.196813583374023, + "learning_rate": 1.6529726464130348e-05, + "loss": 0.3963, + "step": 7178 + }, + { + "epoch": 1.2286496662673285, + "grad_norm": 14.087017059326172, + "learning_rate": 1.6518264351764606e-05, + "loss": 1.1864, + "step": 7179 + }, + { + "epoch": 1.2288208112271093, + "grad_norm": 10.506908416748047, + "learning_rate": 1.6506801343620635e-05, + "loss": 0.8129, + "step": 7180 + }, + { + "epoch": 1.2289919561868903, + "grad_norm": 13.82801342010498, + "learning_rate": 1.6495337446461623e-05, + "loss": 1.0856, + "step": 7181 + }, + { + "epoch": 1.2291631011466713, + "grad_norm": 41.28950500488281, + "learning_rate": 1.648387266705129e-05, + "loss": 6.4556, + "step": 7182 + }, + { + "epoch": 1.2293342461064523, + "grad_norm": 18.74609375, + "learning_rate": 
1.6472407012153877e-05, + "loss": 2.0057, + "step": 7183 + }, + { + "epoch": 1.229505391066233, + "grad_norm": 15.446449279785156, + "learning_rate": 1.646094048853413e-05, + "loss": 1.3507, + "step": 7184 + }, + { + "epoch": 1.229676536026014, + "grad_norm": 20.220766067504883, + "learning_rate": 1.6449473102957327e-05, + "loss": 2.4399, + "step": 7185 + }, + { + "epoch": 1.229847680985795, + "grad_norm": 0.7779582142829895, + "learning_rate": 1.6438004862189228e-05, + "loss": 0.1509, + "step": 7186 + }, + { + "epoch": 1.230018825945576, + "grad_norm": 1.8531559705734253, + "learning_rate": 1.642653577299612e-05, + "loss": 0.2327, + "step": 7187 + }, + { + "epoch": 1.2301899709053568, + "grad_norm": 0.5356231927871704, + "learning_rate": 1.641506584214478e-05, + "loss": 0.1463, + "step": 7188 + }, + { + "epoch": 1.2303611158651377, + "grad_norm": 17.74931526184082, + "learning_rate": 1.6403595076402483e-05, + "loss": 1.8497, + "step": 7189 + }, + { + "epoch": 1.2305322608249187, + "grad_norm": 0.6164054870605469, + "learning_rate": 1.6392123482537002e-05, + "loss": 0.1491, + "step": 7190 + }, + { + "epoch": 1.2307034057846997, + "grad_norm": 16.2725830078125, + "learning_rate": 1.6380651067316598e-05, + "loss": 1.2988, + "step": 7191 + }, + { + "epoch": 1.2308745507444805, + "grad_norm": 14.284119606018066, + "learning_rate": 1.6369177837510003e-05, + "loss": 1.3086, + "step": 7192 + }, + { + "epoch": 1.2310456957042615, + "grad_norm": 23.917457580566406, + "learning_rate": 1.635770379988645e-05, + "loss": 1.668, + "step": 7193 + }, + { + "epoch": 1.2312168406640425, + "grad_norm": 18.673789978027344, + "learning_rate": 1.634622896121562e-05, + "loss": 2.3279, + "step": 7194 + }, + { + "epoch": 1.2313879856238235, + "grad_norm": 14.05659294128418, + "learning_rate": 1.6334753328267706e-05, + "loss": 1.1137, + "step": 7195 + }, + { + "epoch": 1.2315591305836042, + "grad_norm": 106.70809936523438, + "learning_rate": 1.632327690781334e-05, + "loss": 7.4916, + "step": 7196 + }, + { + "epoch": 1.2317302755433852, + "grad_norm": 1.955734133720398, + "learning_rate": 1.631179970662363e-05, + "loss": 0.1702, + "step": 7197 + }, + { + "epoch": 1.2319014205031662, + "grad_norm": 16.24543571472168, + "learning_rate": 1.6300321731470136e-05, + "loss": 1.4861, + "step": 7198 + }, + { + "epoch": 1.2320725654629472, + "grad_norm": 5.551748275756836, + "learning_rate": 1.6288842989124883e-05, + "loss": 0.5526, + "step": 7199 + }, + { + "epoch": 1.232243710422728, + "grad_norm": 26.176286697387695, + "learning_rate": 1.6277363486360348e-05, + "loss": 3.4829, + "step": 7200 + }, + { + "epoch": 1.232414855382509, + "grad_norm": 1.213494896888733, + "learning_rate": 1.626588322994945e-05, + "loss": 0.1637, + "step": 7201 + }, + { + "epoch": 1.23258600034229, + "grad_norm": 13.539944648742676, + "learning_rate": 1.6254402226665566e-05, + "loss": 0.9396, + "step": 7202 + }, + { + "epoch": 1.232757145302071, + "grad_norm": 14.775613784790039, + "learning_rate": 1.62429204832825e-05, + "loss": 1.3005, + "step": 7203 + }, + { + "epoch": 1.2329282902618517, + "grad_norm": 6.944300174713135, + "learning_rate": 1.6231438006574496e-05, + "loss": 0.3753, + "step": 7204 + }, + { + "epoch": 1.2330994352216327, + "grad_norm": 7.613897800445557, + "learning_rate": 1.6219954803316233e-05, + "loss": 0.4657, + "step": 7205 + }, + { + "epoch": 1.2332705801814137, + "grad_norm": 19.34268569946289, + "learning_rate": 1.620847088028282e-05, + "loss": 1.7217, + "step": 7206 + }, + { + "epoch": 1.2334417251411947, + "grad_norm": 
18.5648136138916, + "learning_rate": 1.6196986244249786e-05, + "loss": 2.1591, + "step": 7207 + }, + { + "epoch": 1.2336128701009756, + "grad_norm": 8.385674476623535, + "learning_rate": 1.6185500901993086e-05, + "loss": 0.4739, + "step": 7208 + }, + { + "epoch": 1.2337840150607564, + "grad_norm": 19.39516258239746, + "learning_rate": 1.617401486028909e-05, + "loss": 1.554, + "step": 7209 + }, + { + "epoch": 1.2339551600205374, + "grad_norm": 16.113798141479492, + "learning_rate": 1.6162528125914575e-05, + "loss": 1.1473, + "step": 7210 + }, + { + "epoch": 1.2341263049803184, + "grad_norm": 14.98845100402832, + "learning_rate": 1.6151040705646737e-05, + "loss": 1.1081, + "step": 7211 + }, + { + "epoch": 1.2342974499400992, + "grad_norm": 16.200519561767578, + "learning_rate": 1.6139552606263167e-05, + "loss": 1.2292, + "step": 7212 + }, + { + "epoch": 1.2344685948998801, + "grad_norm": 19.726041793823242, + "learning_rate": 1.6128063834541862e-05, + "loss": 1.9293, + "step": 7213 + }, + { + "epoch": 1.2346397398596611, + "grad_norm": 21.031391143798828, + "learning_rate": 1.6116574397261217e-05, + "loss": 2.9587, + "step": 7214 + }, + { + "epoch": 1.2348108848194421, + "grad_norm": 50.508731842041016, + "learning_rate": 1.610508430120001e-05, + "loss": 7.2049, + "step": 7215 + }, + { + "epoch": 1.2349820297792231, + "grad_norm": 0.6097683906555176, + "learning_rate": 1.6093593553137416e-05, + "loss": 0.1377, + "step": 7216 + }, + { + "epoch": 1.2351531747390039, + "grad_norm": 29.152162551879883, + "learning_rate": 1.6082102159853005e-05, + "loss": 5.9683, + "step": 7217 + }, + { + "epoch": 1.2353243196987849, + "grad_norm": 0.45944657921791077, + "learning_rate": 1.6070610128126705e-05, + "loss": 0.1373, + "step": 7218 + }, + { + "epoch": 1.2354954646585659, + "grad_norm": 13.421952247619629, + "learning_rate": 1.6059117464738833e-05, + "loss": 1.136, + "step": 7219 + }, + { + "epoch": 1.2356666096183466, + "grad_norm": 1.488674521446228, + "learning_rate": 1.6047624176470083e-05, + "loss": 0.2489, + "step": 7220 + }, + { + "epoch": 1.2358377545781276, + "grad_norm": 21.020978927612305, + "learning_rate": 1.6036130270101503e-05, + "loss": 2.125, + "step": 7221 + }, + { + "epoch": 1.2360088995379086, + "grad_norm": 10.7100248336792, + "learning_rate": 1.6024635752414523e-05, + "loss": 0.9111, + "step": 7222 + }, + { + "epoch": 1.2361800444976896, + "grad_norm": 2.771242141723633, + "learning_rate": 1.6013140630190924e-05, + "loss": 0.295, + "step": 7223 + }, + { + "epoch": 1.2363511894574706, + "grad_norm": 21.59668731689453, + "learning_rate": 1.6001644910212843e-05, + "loss": 2.7739, + "step": 7224 + }, + { + "epoch": 1.2365223344172513, + "grad_norm": 11.862739562988281, + "learning_rate": 1.5990148599262772e-05, + "loss": 1.1786, + "step": 7225 + }, + { + "epoch": 1.2366934793770323, + "grad_norm": 2.565239906311035, + "learning_rate": 1.5978651704123557e-05, + "loss": 0.2939, + "step": 7226 + }, + { + "epoch": 1.2368646243368133, + "grad_norm": 15.954310417175293, + "learning_rate": 1.5967154231578377e-05, + "loss": 1.4216, + "step": 7227 + }, + { + "epoch": 1.2370357692965943, + "grad_norm": 9.40361213684082, + "learning_rate": 1.5955656188410756e-05, + "loss": 0.7256, + "step": 7228 + }, + { + "epoch": 1.237206914256375, + "grad_norm": 10.410623550415039, + "learning_rate": 1.5944157581404565e-05, + "loss": 0.9274, + "step": 7229 + }, + { + "epoch": 1.237378059216156, + "grad_norm": 16.96190643310547, + "learning_rate": 1.5932658417343995e-05, + "loss": 2.0504, + "step": 7230 + }, + 
{ + "epoch": 1.237549204175937, + "grad_norm": 16.113800048828125, + "learning_rate": 1.5921158703013566e-05, + "loss": 1.396, + "step": 7231 + }, + { + "epoch": 1.237720349135718, + "grad_norm": 17.308685302734375, + "learning_rate": 1.590965844519813e-05, + "loss": 1.4615, + "step": 7232 + }, + { + "epoch": 1.2378914940954988, + "grad_norm": 12.704358100891113, + "learning_rate": 1.5898157650682862e-05, + "loss": 1.1502, + "step": 7233 + }, + { + "epoch": 1.2380626390552798, + "grad_norm": 20.537282943725586, + "learning_rate": 1.5886656326253237e-05, + "loss": 1.7221, + "step": 7234 + }, + { + "epoch": 1.2382337840150608, + "grad_norm": 0.48646315932273865, + "learning_rate": 1.5875154478695064e-05, + "loss": 0.1359, + "step": 7235 + }, + { + "epoch": 1.2384049289748418, + "grad_norm": 30.423362731933594, + "learning_rate": 1.5863652114794442e-05, + "loss": 6.2909, + "step": 7236 + }, + { + "epoch": 1.2385760739346225, + "grad_norm": 0.5225798487663269, + "learning_rate": 1.585214924133778e-05, + "loss": 0.1348, + "step": 7237 + }, + { + "epoch": 1.2387472188944035, + "grad_norm": 12.045647621154785, + "learning_rate": 1.5840645865111804e-05, + "loss": 0.9722, + "step": 7238 + }, + { + "epoch": 1.2389183638541845, + "grad_norm": 20.3051815032959, + "learning_rate": 1.5829141992903513e-05, + "loss": 1.6011, + "step": 7239 + }, + { + "epoch": 1.2390895088139655, + "grad_norm": 17.894704818725586, + "learning_rate": 1.5817637631500213e-05, + "loss": 1.9778, + "step": 7240 + }, + { + "epoch": 1.2392606537737463, + "grad_norm": 22.45466423034668, + "learning_rate": 1.5806132787689492e-05, + "loss": 2.9642, + "step": 7241 + }, + { + "epoch": 1.2394317987335273, + "grad_norm": 2.576204538345337, + "learning_rate": 1.5794627468259224e-05, + "loss": 0.2153, + "step": 7242 + }, + { + "epoch": 1.2396029436933083, + "grad_norm": 10.791744232177734, + "learning_rate": 1.5783121679997564e-05, + "loss": 1.4476, + "step": 7243 + }, + { + "epoch": 1.2397740886530892, + "grad_norm": 10.57784366607666, + "learning_rate": 1.577161542969295e-05, + "loss": 0.9374, + "step": 7244 + }, + { + "epoch": 1.23994523361287, + "grad_norm": 3.045717477798462, + "learning_rate": 1.576010872413408e-05, + "loss": 0.3024, + "step": 7245 + }, + { + "epoch": 1.240116378572651, + "grad_norm": 11.742828369140625, + "learning_rate": 1.5748601570109935e-05, + "loss": 0.9983, + "step": 7246 + }, + { + "epoch": 1.240287523532432, + "grad_norm": 14.008795738220215, + "learning_rate": 1.573709397440975e-05, + "loss": 1.2937, + "step": 7247 + }, + { + "epoch": 1.240458668492213, + "grad_norm": 26.63939666748047, + "learning_rate": 1.5725585943823016e-05, + "loss": 3.5139, + "step": 7248 + }, + { + "epoch": 1.2406298134519937, + "grad_norm": 11.621224403381348, + "learning_rate": 1.57140774851395e-05, + "loss": 0.84, + "step": 7249 + }, + { + "epoch": 1.2408009584117747, + "grad_norm": 11.871199607849121, + "learning_rate": 1.57025686051492e-05, + "loss": 1.0298, + "step": 7250 + }, + { + "epoch": 1.2409721033715557, + "grad_norm": 19.76806640625, + "learning_rate": 1.569105931064238e-05, + "loss": 2.4675, + "step": 7251 + }, + { + "epoch": 1.2411432483313367, + "grad_norm": 16.911815643310547, + "learning_rate": 1.567954960840953e-05, + "loss": 1.4983, + "step": 7252 + }, + { + "epoch": 1.2413143932911175, + "grad_norm": 11.281521797180176, + "learning_rate": 1.5668039505241407e-05, + "loss": 0.8902, + "step": 7253 + }, + { + "epoch": 1.2414855382508985, + "grad_norm": 14.933623313903809, + "learning_rate": 1.565652900792898e-05, + 
"loss": 0.9997, + "step": 7254 + }, + { + "epoch": 1.2416566832106795, + "grad_norm": 9.2579345703125, + "learning_rate": 1.564501812326346e-05, + "loss": 0.5748, + "step": 7255 + }, + { + "epoch": 1.2418278281704604, + "grad_norm": 12.569852828979492, + "learning_rate": 1.5633506858036286e-05, + "loss": 0.9632, + "step": 7256 + }, + { + "epoch": 1.2419989731302414, + "grad_norm": 4.103747844696045, + "learning_rate": 1.5621995219039122e-05, + "loss": 0.2848, + "step": 7257 + }, + { + "epoch": 1.2421701180900222, + "grad_norm": 16.797086715698242, + "learning_rate": 1.561048321306385e-05, + "loss": 1.4364, + "step": 7258 + }, + { + "epoch": 1.2423412630498032, + "grad_norm": 3.778282642364502, + "learning_rate": 1.559897084690257e-05, + "loss": 0.3722, + "step": 7259 + }, + { + "epoch": 1.2425124080095842, + "grad_norm": 8.077884674072266, + "learning_rate": 1.5587458127347603e-05, + "loss": 0.4795, + "step": 7260 + }, + { + "epoch": 1.242683552969365, + "grad_norm": 7.148391246795654, + "learning_rate": 1.5575945061191467e-05, + "loss": 0.5597, + "step": 7261 + }, + { + "epoch": 1.242854697929146, + "grad_norm": 0.840991199016571, + "learning_rate": 1.5564431655226888e-05, + "loss": 0.1403, + "step": 7262 + }, + { + "epoch": 1.243025842888927, + "grad_norm": 21.8636531829834, + "learning_rate": 1.5552917916246792e-05, + "loss": 3.0481, + "step": 7263 + }, + { + "epoch": 1.243196987848708, + "grad_norm": 15.274545669555664, + "learning_rate": 1.5541403851044294e-05, + "loss": 1.2329, + "step": 7264 + }, + { + "epoch": 1.243368132808489, + "grad_norm": 21.48591423034668, + "learning_rate": 1.552988946641272e-05, + "loss": 1.768, + "step": 7265 + }, + { + "epoch": 1.2435392777682697, + "grad_norm": 35.31312561035156, + "learning_rate": 1.5518374769145577e-05, + "loss": 5.3356, + "step": 7266 + }, + { + "epoch": 1.2437104227280507, + "grad_norm": 12.25826644897461, + "learning_rate": 1.550685976603655e-05, + "loss": 0.9782, + "step": 7267 + }, + { + "epoch": 1.2438815676878316, + "grad_norm": 14.4639892578125, + "learning_rate": 1.5495344463879502e-05, + "loss": 1.1457, + "step": 7268 + }, + { + "epoch": 1.2440527126476124, + "grad_norm": 0.5839426517486572, + "learning_rate": 1.5483828869468477e-05, + "loss": 0.1417, + "step": 7269 + }, + { + "epoch": 1.2442238576073934, + "grad_norm": 19.122604370117188, + "learning_rate": 1.5472312989597703e-05, + "loss": 1.7344, + "step": 7270 + }, + { + "epoch": 1.2443950025671744, + "grad_norm": 19.182523727416992, + "learning_rate": 1.5460796831061564e-05, + "loss": 2.572, + "step": 7271 + }, + { + "epoch": 1.2445661475269554, + "grad_norm": 28.435495376586914, + "learning_rate": 1.5449280400654607e-05, + "loss": 5.9176, + "step": 7272 + }, + { + "epoch": 1.2447372924867364, + "grad_norm": 11.444164276123047, + "learning_rate": 1.543776370517155e-05, + "loss": 0.8763, + "step": 7273 + }, + { + "epoch": 1.2449084374465171, + "grad_norm": 15.6010160446167, + "learning_rate": 1.5426246751407248e-05, + "loss": 1.6153, + "step": 7274 + }, + { + "epoch": 1.2450795824062981, + "grad_norm": 0.541800320148468, + "learning_rate": 1.5414729546156723e-05, + "loss": 0.1371, + "step": 7275 + }, + { + "epoch": 1.2452507273660791, + "grad_norm": 17.38156509399414, + "learning_rate": 1.5403212096215158e-05, + "loss": 1.475, + "step": 7276 + }, + { + "epoch": 1.2454218723258599, + "grad_norm": 19.85516929626465, + "learning_rate": 1.539169440837785e-05, + "loss": 1.42, + "step": 7277 + }, + { + "epoch": 1.2455930172856409, + "grad_norm": 2.570357084274292, + 
"learning_rate": 1.538017648944026e-05, + "loss": 0.2723, + "step": 7278 + }, + { + "epoch": 1.2457641622454219, + "grad_norm": 6.349524974822998, + "learning_rate": 1.5368658346197977e-05, + "loss": 0.45, + "step": 7279 + }, + { + "epoch": 1.2459353072052028, + "grad_norm": 33.41670227050781, + "learning_rate": 1.5357139985446712e-05, + "loss": 5.7086, + "step": 7280 + }, + { + "epoch": 1.2461064521649838, + "grad_norm": 20.294418334960938, + "learning_rate": 1.5345621413982327e-05, + "loss": 2.0751, + "step": 7281 + }, + { + "epoch": 1.2462775971247646, + "grad_norm": 0.47335755825042725, + "learning_rate": 1.5334102638600797e-05, + "loss": 0.1342, + "step": 7282 + }, + { + "epoch": 1.2464487420845456, + "grad_norm": 103.37303161621094, + "learning_rate": 1.5322583666098214e-05, + "loss": 7.0826, + "step": 7283 + }, + { + "epoch": 1.2466198870443266, + "grad_norm": 11.035674095153809, + "learning_rate": 1.5311064503270783e-05, + "loss": 0.756, + "step": 7284 + }, + { + "epoch": 1.2467910320041076, + "grad_norm": 2.611645460128784, + "learning_rate": 1.5299545156914833e-05, + "loss": 0.2622, + "step": 7285 + }, + { + "epoch": 1.2469621769638883, + "grad_norm": 18.983016967773438, + "learning_rate": 1.528802563382679e-05, + "loss": 1.7266, + "step": 7286 + }, + { + "epoch": 1.2471333219236693, + "grad_norm": 15.634206771850586, + "learning_rate": 1.52765059408032e-05, + "loss": 1.177, + "step": 7287 + }, + { + "epoch": 1.2473044668834503, + "grad_norm": 20.26268768310547, + "learning_rate": 1.5264986084640694e-05, + "loss": 2.0696, + "step": 7288 + }, + { + "epoch": 1.2474756118432313, + "grad_norm": 10.667828559875488, + "learning_rate": 1.5253466072136002e-05, + "loss": 1.0634, + "step": 7289 + }, + { + "epoch": 1.247646756803012, + "grad_norm": 14.556381225585938, + "learning_rate": 1.524194591008595e-05, + "loss": 1.0867, + "step": 7290 + }, + { + "epoch": 1.247817901762793, + "grad_norm": 9.206250190734863, + "learning_rate": 1.5230425605287444e-05, + "loss": 0.5104, + "step": 7291 + }, + { + "epoch": 1.247989046722574, + "grad_norm": 21.278926849365234, + "learning_rate": 1.5218905164537493e-05, + "loss": 1.9708, + "step": 7292 + }, + { + "epoch": 1.248160191682355, + "grad_norm": 11.876038551330566, + "learning_rate": 1.5207384594633174e-05, + "loss": 0.917, + "step": 7293 + }, + { + "epoch": 1.2483313366421358, + "grad_norm": 2.246145725250244, + "learning_rate": 1.5195863902371633e-05, + "loss": 0.2441, + "step": 7294 + }, + { + "epoch": 1.2485024816019168, + "grad_norm": 12.991911888122559, + "learning_rate": 1.5184343094550094e-05, + "loss": 0.9051, + "step": 7295 + }, + { + "epoch": 1.2486736265616978, + "grad_norm": 17.714027404785156, + "learning_rate": 1.517282217796585e-05, + "loss": 1.5865, + "step": 7296 + }, + { + "epoch": 1.2488447715214788, + "grad_norm": 17.127317428588867, + "learning_rate": 1.516130115941627e-05, + "loss": 1.1275, + "step": 7297 + }, + { + "epoch": 1.2490159164812595, + "grad_norm": 16.890031814575195, + "learning_rate": 1.5149780045698768e-05, + "loss": 2.2108, + "step": 7298 + }, + { + "epoch": 1.2491870614410405, + "grad_norm": 0.5093104839324951, + "learning_rate": 1.5138258843610814e-05, + "loss": 0.1335, + "step": 7299 + }, + { + "epoch": 1.2493582064008215, + "grad_norm": 10.212474822998047, + "learning_rate": 1.5126737559949937e-05, + "loss": 0.9708, + "step": 7300 + }, + { + "epoch": 1.2495293513606025, + "grad_norm": 53.22825622558594, + "learning_rate": 1.511521620151371e-05, + "loss": 6.0426, + "step": 7301 + }, + { + "epoch": 
1.2497004963203833, + "grad_norm": 69.67967224121094, + "learning_rate": 1.5103694775099744e-05, + "loss": 7.1124, + "step": 7302 + }, + { + "epoch": 1.2498716412801643, + "grad_norm": 26.3138427734375, + "learning_rate": 1.5092173287505715e-05, + "loss": 5.7451, + "step": 7303 + }, + { + "epoch": 1.2500427862399452, + "grad_norm": 21.674297332763672, + "learning_rate": 1.508065174552931e-05, + "loss": 1.9393, + "step": 7304 + }, + { + "epoch": 1.2502139311997262, + "grad_norm": 21.02547836303711, + "learning_rate": 1.5069130155968256e-05, + "loss": 2.0444, + "step": 7305 + }, + { + "epoch": 1.2503850761595072, + "grad_norm": 17.467077255249023, + "learning_rate": 1.5057608525620305e-05, + "loss": 2.0333, + "step": 7306 + }, + { + "epoch": 1.250556221119288, + "grad_norm": 0.9757794737815857, + "learning_rate": 1.5046086861283228e-05, + "loss": 0.1458, + "step": 7307 + }, + { + "epoch": 1.250727366079069, + "grad_norm": 10.240049362182617, + "learning_rate": 1.5034565169754846e-05, + "loss": 0.727, + "step": 7308 + }, + { + "epoch": 1.25089851103885, + "grad_norm": 21.112850189208984, + "learning_rate": 1.502304345783296e-05, + "loss": 1.5793, + "step": 7309 + }, + { + "epoch": 1.2510696559986307, + "grad_norm": 9.824060440063477, + "learning_rate": 1.50115217323154e-05, + "loss": 0.7622, + "step": 7310 + }, + { + "epoch": 1.2512408009584117, + "grad_norm": 24.483760833740234, + "learning_rate": 1.5e-05, + "loss": 2.0368, + "step": 7311 + }, + { + "epoch": 1.2514119459181927, + "grad_norm": 14.989441871643066, + "learning_rate": 1.49884782676846e-05, + "loss": 1.1908, + "step": 7312 + }, + { + "epoch": 1.2515830908779737, + "grad_norm": 14.618575096130371, + "learning_rate": 1.497695654216704e-05, + "loss": 1.1606, + "step": 7313 + }, + { + "epoch": 1.2517542358377547, + "grad_norm": 79.52991485595703, + "learning_rate": 1.4965434830245154e-05, + "loss": 6.9577, + "step": 7314 + }, + { + "epoch": 1.2519253807975355, + "grad_norm": 14.076753616333008, + "learning_rate": 1.4953913138716775e-05, + "loss": 1.0866, + "step": 7315 + }, + { + "epoch": 1.2520965257573164, + "grad_norm": 17.31161880493164, + "learning_rate": 1.4942391474379706e-05, + "loss": 1.1286, + "step": 7316 + }, + { + "epoch": 1.2522676707170974, + "grad_norm": 14.135196685791016, + "learning_rate": 1.4930869844031755e-05, + "loss": 1.2252, + "step": 7317 + }, + { + "epoch": 1.2524388156768782, + "grad_norm": 25.48478126525879, + "learning_rate": 1.4919348254470692e-05, + "loss": 1.2945, + "step": 7318 + }, + { + "epoch": 1.2526099606366592, + "grad_norm": 0.5730566382408142, + "learning_rate": 1.4907826712494287e-05, + "loss": 0.1365, + "step": 7319 + }, + { + "epoch": 1.2527811055964402, + "grad_norm": 19.554141998291016, + "learning_rate": 1.4896305224900258e-05, + "loss": 1.6812, + "step": 7320 + }, + { + "epoch": 1.2529522505562212, + "grad_norm": 12.831884384155273, + "learning_rate": 1.4884783798486301e-05, + "loss": 0.9967, + "step": 7321 + }, + { + "epoch": 1.2531233955160022, + "grad_norm": 7.81449031829834, + "learning_rate": 1.4873262440050072e-05, + "loss": 0.9633, + "step": 7322 + }, + { + "epoch": 1.253294540475783, + "grad_norm": 14.348649978637695, + "learning_rate": 1.4861741156389186e-05, + "loss": 1.1456, + "step": 7323 + }, + { + "epoch": 1.253465685435564, + "grad_norm": 6.867240905761719, + "learning_rate": 1.4850219954301236e-05, + "loss": 0.4317, + "step": 7324 + }, + { + "epoch": 1.253636830395345, + "grad_norm": 17.300615310668945, + "learning_rate": 1.4838698840583732e-05, + "loss": 1.6941, + 
"step": 7325 + }, + { + "epoch": 1.2538079753551257, + "grad_norm": 20.17761993408203, + "learning_rate": 1.4827177822034152e-05, + "loss": 1.7142, + "step": 7326 + }, + { + "epoch": 1.2539791203149067, + "grad_norm": 3.2199995517730713, + "learning_rate": 1.4815656905449914e-05, + "loss": 0.2867, + "step": 7327 + }, + { + "epoch": 1.2541502652746876, + "grad_norm": 9.007351875305176, + "learning_rate": 1.4804136097628372e-05, + "loss": 0.6969, + "step": 7328 + }, + { + "epoch": 1.2543214102344686, + "grad_norm": 21.556840896606445, + "learning_rate": 1.479261540536683e-05, + "loss": 2.9136, + "step": 7329 + }, + { + "epoch": 1.2544925551942496, + "grad_norm": 21.820880889892578, + "learning_rate": 1.4781094835462506e-05, + "loss": 3.0347, + "step": 7330 + }, + { + "epoch": 1.2546637001540304, + "grad_norm": 12.111786842346191, + "learning_rate": 1.4769574394712555e-05, + "loss": 1.0611, + "step": 7331 + }, + { + "epoch": 1.2548348451138114, + "grad_norm": 22.66676139831543, + "learning_rate": 1.4758054089914054e-05, + "loss": 2.1661, + "step": 7332 + }, + { + "epoch": 1.2550059900735924, + "grad_norm": 15.881113052368164, + "learning_rate": 1.4746533927864e-05, + "loss": 1.4002, + "step": 7333 + }, + { + "epoch": 1.2551771350333731, + "grad_norm": 19.4534854888916, + "learning_rate": 1.4735013915359306e-05, + "loss": 1.1345, + "step": 7334 + }, + { + "epoch": 1.2553482799931541, + "grad_norm": 20.17005157470703, + "learning_rate": 1.4723494059196796e-05, + "loss": 2.3617, + "step": 7335 + }, + { + "epoch": 1.2555194249529351, + "grad_norm": 28.31744956970215, + "learning_rate": 1.4711974366173207e-05, + "loss": 3.8263, + "step": 7336 + }, + { + "epoch": 1.255690569912716, + "grad_norm": 7.080312252044678, + "learning_rate": 1.4700454843085167e-05, + "loss": 0.4506, + "step": 7337 + }, + { + "epoch": 1.255861714872497, + "grad_norm": 16.499441146850586, + "learning_rate": 1.4688935496729218e-05, + "loss": 1.4713, + "step": 7338 + }, + { + "epoch": 1.2560328598322779, + "grad_norm": 22.658084869384766, + "learning_rate": 1.4677416333901789e-05, + "loss": 1.8541, + "step": 7339 + }, + { + "epoch": 1.2562040047920588, + "grad_norm": 7.0036702156066895, + "learning_rate": 1.4665897361399205e-05, + "loss": 0.6589, + "step": 7340 + }, + { + "epoch": 1.2563751497518398, + "grad_norm": 21.883045196533203, + "learning_rate": 1.4654378586017674e-05, + "loss": 2.2609, + "step": 7341 + }, + { + "epoch": 1.2565462947116208, + "grad_norm": 10.533642768859863, + "learning_rate": 1.4642860014553292e-05, + "loss": 0.8586, + "step": 7342 + }, + { + "epoch": 1.2567174396714016, + "grad_norm": 13.321760177612305, + "learning_rate": 1.4631341653802032e-05, + "loss": 1.0397, + "step": 7343 + }, + { + "epoch": 1.2568885846311826, + "grad_norm": 3.57551908493042, + "learning_rate": 1.4619823510559747e-05, + "loss": 0.4809, + "step": 7344 + }, + { + "epoch": 1.2570597295909636, + "grad_norm": 22.513717651367188, + "learning_rate": 1.4608305591622153e-05, + "loss": 2.6994, + "step": 7345 + }, + { + "epoch": 1.2572308745507446, + "grad_norm": 10.400582313537598, + "learning_rate": 1.4596787903784848e-05, + "loss": 0.9155, + "step": 7346 + }, + { + "epoch": 1.2574020195105255, + "grad_norm": 27.71649742126465, + "learning_rate": 1.4585270453843277e-05, + "loss": 4.5367, + "step": 7347 + }, + { + "epoch": 1.2575731644703063, + "grad_norm": 16.393686294555664, + "learning_rate": 1.457375324859276e-05, + "loss": 1.3998, + "step": 7348 + }, + { + "epoch": 1.2577443094300873, + "grad_norm": 15.453287124633789, + 
"learning_rate": 1.4562236294828458e-05, + "loss": 1.0674, + "step": 7349 + }, + { + "epoch": 1.2579154543898683, + "grad_norm": 8.631450653076172, + "learning_rate": 1.4550719599345392e-05, + "loss": 0.4541, + "step": 7350 + }, + { + "epoch": 1.258086599349649, + "grad_norm": 21.972387313842773, + "learning_rate": 1.4539203168938434e-05, + "loss": 1.7009, + "step": 7351 + }, + { + "epoch": 1.25825774430943, + "grad_norm": 14.740167617797852, + "learning_rate": 1.4527687010402294e-05, + "loss": 0.9929, + "step": 7352 + }, + { + "epoch": 1.258428889269211, + "grad_norm": 5.436156749725342, + "learning_rate": 1.451617113053152e-05, + "loss": 0.3797, + "step": 7353 + }, + { + "epoch": 1.258600034228992, + "grad_norm": 12.183341979980469, + "learning_rate": 1.4504655536120502e-05, + "loss": 1.0988, + "step": 7354 + }, + { + "epoch": 1.258771179188773, + "grad_norm": 0.7028123736381531, + "learning_rate": 1.4493140233963452e-05, + "loss": 0.1615, + "step": 7355 + }, + { + "epoch": 1.2589423241485538, + "grad_norm": 16.393653869628906, + "learning_rate": 1.4481625230854426e-05, + "loss": 1.7488, + "step": 7356 + }, + { + "epoch": 1.2591134691083348, + "grad_norm": 16.673946380615234, + "learning_rate": 1.447011053358728e-05, + "loss": 1.0904, + "step": 7357 + }, + { + "epoch": 1.2592846140681158, + "grad_norm": 10.811583518981934, + "learning_rate": 1.4458596148955709e-05, + "loss": 0.6634, + "step": 7358 + }, + { + "epoch": 1.2594557590278965, + "grad_norm": 1.4404106140136719, + "learning_rate": 1.4447082083753217e-05, + "loss": 0.2295, + "step": 7359 + }, + { + "epoch": 1.2596269039876775, + "grad_norm": 1.8505138158798218, + "learning_rate": 1.4435568344773118e-05, + "loss": 0.1746, + "step": 7360 + }, + { + "epoch": 1.2597980489474585, + "grad_norm": 12.746377944946289, + "learning_rate": 1.4424054938808534e-05, + "loss": 0.913, + "step": 7361 + }, + { + "epoch": 1.2599691939072395, + "grad_norm": 7.94597864151001, + "learning_rate": 1.4412541872652397e-05, + "loss": 0.9712, + "step": 7362 + }, + { + "epoch": 1.2601403388670205, + "grad_norm": 16.183761596679688, + "learning_rate": 1.440102915309743e-05, + "loss": 1.7207, + "step": 7363 + }, + { + "epoch": 1.2603114838268012, + "grad_norm": 15.641918182373047, + "learning_rate": 1.4389516786936155e-05, + "loss": 1.6151, + "step": 7364 + }, + { + "epoch": 1.2604826287865822, + "grad_norm": 0.5059029459953308, + "learning_rate": 1.4378004780960885e-05, + "loss": 0.137, + "step": 7365 + }, + { + "epoch": 1.2606537737463632, + "grad_norm": 18.027734756469727, + "learning_rate": 1.4366493141963718e-05, + "loss": 1.6826, + "step": 7366 + }, + { + "epoch": 1.260824918706144, + "grad_norm": 9.822291374206543, + "learning_rate": 1.4354981876736543e-05, + "loss": 0.6362, + "step": 7367 + }, + { + "epoch": 1.260996063665925, + "grad_norm": 17.900630950927734, + "learning_rate": 1.4343470992071023e-05, + "loss": 1.6278, + "step": 7368 + }, + { + "epoch": 1.261167208625706, + "grad_norm": 17.575546264648438, + "learning_rate": 1.4331960494758594e-05, + "loss": 0.9771, + "step": 7369 + }, + { + "epoch": 1.261338353585487, + "grad_norm": 17.916412353515625, + "learning_rate": 1.4320450391590468e-05, + "loss": 2.165, + "step": 7370 + }, + { + "epoch": 1.261509498545268, + "grad_norm": 9.473165512084961, + "learning_rate": 1.430894068935762e-05, + "loss": 1.2812, + "step": 7371 + }, + { + "epoch": 1.2616806435050487, + "grad_norm": 21.768396377563477, + "learning_rate": 1.42974313948508e-05, + "loss": 2.4049, + "step": 7372 + }, + { + "epoch": 
1.2618517884648297, + "grad_norm": 5.626485824584961, + "learning_rate": 1.4285922514860502e-05, + "loss": 0.5733, + "step": 7373 + }, + { + "epoch": 1.2620229334246107, + "grad_norm": 15.431833267211914, + "learning_rate": 1.4274414056176982e-05, + "loss": 1.4185, + "step": 7374 + }, + { + "epoch": 1.2621940783843915, + "grad_norm": 20.183462142944336, + "learning_rate": 1.4262906025590251e-05, + "loss": 1.9588, + "step": 7375 + }, + { + "epoch": 1.2623652233441724, + "grad_norm": 0.3969564437866211, + "learning_rate": 1.4251398429890066e-05, + "loss": 0.1221, + "step": 7376 + }, + { + "epoch": 1.2625363683039534, + "grad_norm": 15.512309074401855, + "learning_rate": 1.4239891275865913e-05, + "loss": 1.231, + "step": 7377 + }, + { + "epoch": 1.2627075132637344, + "grad_norm": 28.235328674316406, + "learning_rate": 1.4228384570307047e-05, + "loss": 5.5077, + "step": 7378 + }, + { + "epoch": 1.2628786582235154, + "grad_norm": 14.990321159362793, + "learning_rate": 1.4216878320002431e-05, + "loss": 1.4904, + "step": 7379 + }, + { + "epoch": 1.2630498031832962, + "grad_norm": 27.102006912231445, + "learning_rate": 1.4205372531740779e-05, + "loss": 0.9632, + "step": 7380 + }, + { + "epoch": 1.2632209481430772, + "grad_norm": 14.286447525024414, + "learning_rate": 1.4193867212310512e-05, + "loss": 0.8609, + "step": 7381 + }, + { + "epoch": 1.2633920931028582, + "grad_norm": 27.299739837646484, + "learning_rate": 1.418236236849979e-05, + "loss": 5.5275, + "step": 7382 + }, + { + "epoch": 1.263563238062639, + "grad_norm": 20.601612091064453, + "learning_rate": 1.417085800709649e-05, + "loss": 2.0682, + "step": 7383 + }, + { + "epoch": 1.26373438302242, + "grad_norm": 4.8306169509887695, + "learning_rate": 1.4159354134888199e-05, + "loss": 0.4245, + "step": 7384 + }, + { + "epoch": 1.263905527982201, + "grad_norm": 14.180475234985352, + "learning_rate": 1.414785075866222e-05, + "loss": 1.1286, + "step": 7385 + }, + { + "epoch": 1.264076672941982, + "grad_norm": 0.5731605887413025, + "learning_rate": 1.4136347885205565e-05, + "loss": 0.1332, + "step": 7386 + }, + { + "epoch": 1.2642478179017629, + "grad_norm": 8.598193168640137, + "learning_rate": 1.4124845521304939e-05, + "loss": 0.6556, + "step": 7387 + }, + { + "epoch": 1.2644189628615436, + "grad_norm": 0.4574246406555176, + "learning_rate": 1.4113343673746762e-05, + "loss": 0.1275, + "step": 7388 + }, + { + "epoch": 1.2645901078213246, + "grad_norm": 14.310580253601074, + "learning_rate": 1.410184234931714e-05, + "loss": 1.2369, + "step": 7389 + }, + { + "epoch": 1.2647612527811056, + "grad_norm": 28.03304100036621, + "learning_rate": 1.409034155480187e-05, + "loss": 5.8028, + "step": 7390 + }, + { + "epoch": 1.2649323977408864, + "grad_norm": 14.480050086975098, + "learning_rate": 1.4078841296986435e-05, + "loss": 1.0287, + "step": 7391 + }, + { + "epoch": 1.2651035427006674, + "grad_norm": 21.460359573364258, + "learning_rate": 1.4067341582656011e-05, + "loss": 1.7921, + "step": 7392 + }, + { + "epoch": 1.2652746876604484, + "grad_norm": 1.9403401613235474, + "learning_rate": 1.4055842418595432e-05, + "loss": 0.2589, + "step": 7393 + }, + { + "epoch": 1.2654458326202294, + "grad_norm": 7.202798843383789, + "learning_rate": 1.4044343811589241e-05, + "loss": 0.4405, + "step": 7394 + }, + { + "epoch": 1.2656169775800103, + "grad_norm": 62.627830505371094, + "learning_rate": 1.4032845768421624e-05, + "loss": 7.4275, + "step": 7395 + }, + { + "epoch": 1.2657881225397913, + "grad_norm": 14.698546409606934, + "learning_rate": 
1.4021348295876447e-05, + "loss": 1.4408, + "step": 7396 + }, + { + "epoch": 1.265959267499572, + "grad_norm": 22.262781143188477, + "learning_rate": 1.4009851400737227e-05, + "loss": 2.1098, + "step": 7397 + }, + { + "epoch": 1.266130412459353, + "grad_norm": 15.667557716369629, + "learning_rate": 1.399835508978716e-05, + "loss": 1.3713, + "step": 7398 + }, + { + "epoch": 1.266301557419134, + "grad_norm": 8.630463600158691, + "learning_rate": 1.398685936980908e-05, + "loss": 0.7282, + "step": 7399 + }, + { + "epoch": 1.2664727023789148, + "grad_norm": 15.559676170349121, + "learning_rate": 1.397536424758548e-05, + "loss": 1.74, + "step": 7400 + }, + { + "epoch": 1.2666438473386958, + "grad_norm": 12.456164360046387, + "learning_rate": 1.3963869729898501e-05, + "loss": 1.2426, + "step": 7401 + }, + { + "epoch": 1.2668149922984768, + "grad_norm": 18.912338256835938, + "learning_rate": 1.3952375823529925e-05, + "loss": 2.2147, + "step": 7402 + }, + { + "epoch": 1.2669861372582578, + "grad_norm": 19.029582977294922, + "learning_rate": 1.3940882535261173e-05, + "loss": 1.9542, + "step": 7403 + }, + { + "epoch": 1.2671572822180388, + "grad_norm": 15.901615142822266, + "learning_rate": 1.3929389871873299e-05, + "loss": 1.1382, + "step": 7404 + }, + { + "epoch": 1.2673284271778196, + "grad_norm": 16.025964736938477, + "learning_rate": 1.3917897840147e-05, + "loss": 1.3947, + "step": 7405 + }, + { + "epoch": 1.2674995721376006, + "grad_norm": 3.6541223526000977, + "learning_rate": 1.3906406446862585e-05, + "loss": 0.2928, + "step": 7406 + }, + { + "epoch": 1.2676707170973815, + "grad_norm": 14.086526870727539, + "learning_rate": 1.3894915698799997e-05, + "loss": 1.2049, + "step": 7407 + }, + { + "epoch": 1.2678418620571623, + "grad_norm": 19.24121856689453, + "learning_rate": 1.3883425602738794e-05, + "loss": 2.2081, + "step": 7408 + }, + { + "epoch": 1.2680130070169433, + "grad_norm": 86.15853118896484, + "learning_rate": 1.3871936165458139e-05, + "loss": 7.7859, + "step": 7409 + }, + { + "epoch": 1.2681841519767243, + "grad_norm": 3.7285234928131104, + "learning_rate": 1.3860447393736834e-05, + "loss": 0.323, + "step": 7410 + }, + { + "epoch": 1.2683552969365053, + "grad_norm": 14.129486083984375, + "learning_rate": 1.3848959294353263e-05, + "loss": 1.1798, + "step": 7411 + }, + { + "epoch": 1.2685264418962863, + "grad_norm": 25.551198959350586, + "learning_rate": 1.3837471874085428e-05, + "loss": 1.2741, + "step": 7412 + }, + { + "epoch": 1.268697586856067, + "grad_norm": 19.952129364013672, + "learning_rate": 1.3825985139710915e-05, + "loss": 2.2252, + "step": 7413 + }, + { + "epoch": 1.268868731815848, + "grad_norm": 3.0930943489074707, + "learning_rate": 1.3814499098006911e-05, + "loss": 0.2683, + "step": 7414 + }, + { + "epoch": 1.269039876775629, + "grad_norm": 58.00749969482422, + "learning_rate": 1.3803013755750211e-05, + "loss": 6.766, + "step": 7415 + }, + { + "epoch": 1.2692110217354098, + "grad_norm": 14.905405044555664, + "learning_rate": 1.3791529119717183e-05, + "loss": 1.2094, + "step": 7416 + }, + { + "epoch": 1.2693821666951908, + "grad_norm": 14.11672592163086, + "learning_rate": 1.3780045196683771e-05, + "loss": 1.0501, + "step": 7417 + }, + { + "epoch": 1.2695533116549718, + "grad_norm": 17.17655372619629, + "learning_rate": 1.3768561993425508e-05, + "loss": 1.3703, + "step": 7418 + }, + { + "epoch": 1.2697244566147528, + "grad_norm": 23.263961791992188, + "learning_rate": 1.3757079516717503e-05, + "loss": 4.9318, + "step": 7419 + }, + { + "epoch": 1.2698956015745337, + 
"grad_norm": 10.799759864807129, + "learning_rate": 1.374559777333443e-05, + "loss": 1.4142, + "step": 7420 + }, + { + "epoch": 1.2700667465343145, + "grad_norm": 12.609829902648926, + "learning_rate": 1.3734116770050548e-05, + "loss": 1.0778, + "step": 7421 + }, + { + "epoch": 1.2702378914940955, + "grad_norm": 5.679696083068848, + "learning_rate": 1.3722636513639654e-05, + "loss": 0.4246, + "step": 7422 + }, + { + "epoch": 1.2704090364538765, + "grad_norm": 15.620946884155273, + "learning_rate": 1.371115701087512e-05, + "loss": 1.2893, + "step": 7423 + }, + { + "epoch": 1.2705801814136573, + "grad_norm": 15.387858390808105, + "learning_rate": 1.3699678268529866e-05, + "loss": 1.3466, + "step": 7424 + }, + { + "epoch": 1.2707513263734382, + "grad_norm": 9.487750053405762, + "learning_rate": 1.3688200293376372e-05, + "loss": 0.9842, + "step": 7425 + }, + { + "epoch": 1.2709224713332192, + "grad_norm": 0.6645063757896423, + "learning_rate": 1.367672309218666e-05, + "loss": 0.1334, + "step": 7426 + }, + { + "epoch": 1.2710936162930002, + "grad_norm": 0.7540750503540039, + "learning_rate": 1.3665246671732296e-05, + "loss": 0.1474, + "step": 7427 + }, + { + "epoch": 1.2712647612527812, + "grad_norm": 8.321049690246582, + "learning_rate": 1.3653771038784385e-05, + "loss": 0.6695, + "step": 7428 + }, + { + "epoch": 1.271435906212562, + "grad_norm": 15.873809814453125, + "learning_rate": 1.3642296200113564e-05, + "loss": 1.4287, + "step": 7429 + }, + { + "epoch": 1.271607051172343, + "grad_norm": 11.329692840576172, + "learning_rate": 1.3630822162489998e-05, + "loss": 1.0208, + "step": 7430 + }, + { + "epoch": 1.271778196132124, + "grad_norm": 12.636488914489746, + "learning_rate": 1.3619348932683404e-05, + "loss": 1.0985, + "step": 7431 + }, + { + "epoch": 1.2719493410919047, + "grad_norm": 23.067245483398438, + "learning_rate": 1.3607876517463e-05, + "loss": 2.325, + "step": 7432 + }, + { + "epoch": 1.2721204860516857, + "grad_norm": 0.6290095448493958, + "learning_rate": 1.3596404923597521e-05, + "loss": 0.144, + "step": 7433 + }, + { + "epoch": 1.2722916310114667, + "grad_norm": 21.06360626220703, + "learning_rate": 1.3584934157855227e-05, + "loss": 1.6601, + "step": 7434 + }, + { + "epoch": 1.2724627759712477, + "grad_norm": 14.543760299682617, + "learning_rate": 1.3573464227003888e-05, + "loss": 1.2144, + "step": 7435 + }, + { + "epoch": 1.2726339209310287, + "grad_norm": 0.4954783320426941, + "learning_rate": 1.3561995137810771e-05, + "loss": 0.1383, + "step": 7436 + }, + { + "epoch": 1.2728050658908094, + "grad_norm": 8.514019012451172, + "learning_rate": 1.3550526897042677e-05, + "loss": 0.501, + "step": 7437 + }, + { + "epoch": 1.2729762108505904, + "grad_norm": 0.661493718624115, + "learning_rate": 1.3539059511465868e-05, + "loss": 0.1446, + "step": 7438 + }, + { + "epoch": 1.2731473558103714, + "grad_norm": 20.316226959228516, + "learning_rate": 1.3527592987846124e-05, + "loss": 1.8445, + "step": 7439 + }, + { + "epoch": 1.2733185007701522, + "grad_norm": 0.44578152894973755, + "learning_rate": 1.3516127332948709e-05, + "loss": 0.1261, + "step": 7440 + }, + { + "epoch": 1.2734896457299332, + "grad_norm": 11.764918327331543, + "learning_rate": 1.350466255353838e-05, + "loss": 0.8638, + "step": 7441 + }, + { + "epoch": 1.2736607906897142, + "grad_norm": 14.545720100402832, + "learning_rate": 1.349319865637937e-05, + "loss": 1.0689, + "step": 7442 + }, + { + "epoch": 1.2738319356494952, + "grad_norm": 92.29745483398438, + "learning_rate": 1.3481735648235398e-05, + "loss": 6.7581, + 
"step": 7443 + }, + { + "epoch": 1.2740030806092761, + "grad_norm": 17.633432388305664, + "learning_rate": 1.3470273535869658e-05, + "loss": 1.9326, + "step": 7444 + }, + { + "epoch": 1.2741742255690571, + "grad_norm": 16.83913803100586, + "learning_rate": 1.3458812326044806e-05, + "loss": 1.3725, + "step": 7445 + }, + { + "epoch": 1.274345370528838, + "grad_norm": 3.9324121475219727, + "learning_rate": 1.3447352025522971e-05, + "loss": 0.6321, + "step": 7446 + }, + { + "epoch": 1.2745165154886189, + "grad_norm": 0.8707903623580933, + "learning_rate": 1.343589264106576e-05, + "loss": 0.1474, + "step": 7447 + }, + { + "epoch": 1.2746876604483999, + "grad_norm": 0.502506673336029, + "learning_rate": 1.3424434179434224e-05, + "loss": 0.1367, + "step": 7448 + }, + { + "epoch": 1.2748588054081806, + "grad_norm": 18.582305908203125, + "learning_rate": 1.3412976647388867e-05, + "loss": 1.4433, + "step": 7449 + }, + { + "epoch": 1.2750299503679616, + "grad_norm": 20.04930877685547, + "learning_rate": 1.340152005168965e-05, + "loss": 1.4891, + "step": 7450 + }, + { + "epoch": 1.2752010953277426, + "grad_norm": 9.783509254455566, + "learning_rate": 1.3390064399095984e-05, + "loss": 0.8113, + "step": 7451 + }, + { + "epoch": 1.2753722402875236, + "grad_norm": 20.472023010253906, + "learning_rate": 1.3378609696366715e-05, + "loss": 2.769, + "step": 7452 + }, + { + "epoch": 1.2755433852473046, + "grad_norm": 8.195066452026367, + "learning_rate": 1.3367155950260148e-05, + "loss": 0.7378, + "step": 7453 + }, + { + "epoch": 1.2757145302070854, + "grad_norm": 13.91455078125, + "learning_rate": 1.3355703167534e-05, + "loss": 0.9211, + "step": 7454 + }, + { + "epoch": 1.2758856751668664, + "grad_norm": 0.5569220185279846, + "learning_rate": 1.3344251354945433e-05, + "loss": 0.1286, + "step": 7455 + }, + { + "epoch": 1.2760568201266473, + "grad_norm": 19.011075973510742, + "learning_rate": 1.3332800519251031e-05, + "loss": 2.0409, + "step": 7456 + }, + { + "epoch": 1.276227965086428, + "grad_norm": 21.4835262298584, + "learning_rate": 1.3321350667206805e-05, + "loss": 1.8533, + "step": 7457 + }, + { + "epoch": 1.276399110046209, + "grad_norm": 13.103194236755371, + "learning_rate": 1.3309901805568186e-05, + "loss": 0.901, + "step": 7458 + }, + { + "epoch": 1.27657025500599, + "grad_norm": 40.336116790771484, + "learning_rate": 1.329845394109002e-05, + "loss": 6.137, + "step": 7459 + }, + { + "epoch": 1.276741399965771, + "grad_norm": 11.189400672912598, + "learning_rate": 1.3287007080526564e-05, + "loss": 0.84, + "step": 7460 + }, + { + "epoch": 1.276912544925552, + "grad_norm": 2.909088611602783, + "learning_rate": 1.3275561230631481e-05, + "loss": 0.2344, + "step": 7461 + }, + { + "epoch": 1.2770836898853328, + "grad_norm": 13.368642807006836, + "learning_rate": 1.3264116398157843e-05, + "loss": 1.0364, + "step": 7462 + }, + { + "epoch": 1.2772548348451138, + "grad_norm": 21.844898223876953, + "learning_rate": 1.325267258985811e-05, + "loss": 1.8354, + "step": 7463 + }, + { + "epoch": 1.2774259798048948, + "grad_norm": 16.96022605895996, + "learning_rate": 1.3241229812484153e-05, + "loss": 1.4636, + "step": 7464 + }, + { + "epoch": 1.2775971247646756, + "grad_norm": 3.6713128089904785, + "learning_rate": 1.322978807278723e-05, + "loss": 0.3187, + "step": 7465 + }, + { + "epoch": 1.2777682697244566, + "grad_norm": 23.283611297607422, + "learning_rate": 1.3218347377517979e-05, + "loss": 1.2698, + "step": 7466 + }, + { + "epoch": 1.2779394146842376, + "grad_norm": 4.398828506469727, + "learning_rate": 
1.320690773342643e-05, + "loss": 0.2959, + "step": 7467 + }, + { + "epoch": 1.2781105596440185, + "grad_norm": 21.414701461791992, + "learning_rate": 1.3195469147261987e-05, + "loss": 5.061, + "step": 7468 + }, + { + "epoch": 1.2782817046037995, + "grad_norm": 11.26382064819336, + "learning_rate": 1.318403162577344e-05, + "loss": 0.8673, + "step": 7469 + }, + { + "epoch": 1.2784528495635803, + "grad_norm": 7.07971715927124, + "learning_rate": 1.3172595175708934e-05, + "loss": 0.3555, + "step": 7470 + }, + { + "epoch": 1.2786239945233613, + "grad_norm": 17.331661224365234, + "learning_rate": 1.3161159803815996e-05, + "loss": 1.2332, + "step": 7471 + }, + { + "epoch": 1.2787951394831423, + "grad_norm": 3.61580753326416, + "learning_rate": 1.3149725516841514e-05, + "loss": 0.2735, + "step": 7472 + }, + { + "epoch": 1.278966284442923, + "grad_norm": 17.832317352294922, + "learning_rate": 1.313829232153173e-05, + "loss": 1.1603, + "step": 7473 + }, + { + "epoch": 1.279137429402704, + "grad_norm": 14.138060569763184, + "learning_rate": 1.3126860224632252e-05, + "loss": 1.0651, + "step": 7474 + }, + { + "epoch": 1.279308574362485, + "grad_norm": 3.1998164653778076, + "learning_rate": 1.311542923288803e-05, + "loss": 0.2512, + "step": 7475 + }, + { + "epoch": 1.279479719322266, + "grad_norm": 21.73483657836914, + "learning_rate": 1.3103999353043369e-05, + "loss": 1.6646, + "step": 7476 + }, + { + "epoch": 1.279650864282047, + "grad_norm": 0.5186206102371216, + "learning_rate": 1.3092570591841912e-05, + "loss": 0.1297, + "step": 7477 + }, + { + "epoch": 1.2798220092418278, + "grad_norm": 5.291880130767822, + "learning_rate": 1.308114295602665e-05, + "loss": 0.4219, + "step": 7478 + }, + { + "epoch": 1.2799931542016088, + "grad_norm": 4.802558898925781, + "learning_rate": 1.3069716452339897e-05, + "loss": 0.3575, + "step": 7479 + }, + { + "epoch": 1.2801642991613897, + "grad_norm": 20.09796142578125, + "learning_rate": 1.3058291087523315e-05, + "loss": 2.2933, + "step": 7480 + }, + { + "epoch": 1.2803354441211705, + "grad_norm": 17.5977840423584, + "learning_rate": 1.3046866868317883e-05, + "loss": 1.5162, + "step": 7481 + }, + { + "epoch": 1.2805065890809515, + "grad_norm": 19.869531631469727, + "learning_rate": 1.303544380146391e-05, + "loss": 2.4812, + "step": 7482 + }, + { + "epoch": 1.2806777340407325, + "grad_norm": 13.439567565917969, + "learning_rate": 1.3024021893701019e-05, + "loss": 1.1286, + "step": 7483 + }, + { + "epoch": 1.2808488790005135, + "grad_norm": 0.5329412817955017, + "learning_rate": 1.3012601151768157e-05, + "loss": 0.1252, + "step": 7484 + }, + { + "epoch": 1.2810200239602945, + "grad_norm": 27.384693145751953, + "learning_rate": 1.3001181582403573e-05, + "loss": 5.4171, + "step": 7485 + }, + { + "epoch": 1.2811911689200752, + "grad_norm": 83.22950744628906, + "learning_rate": 1.298976319234483e-05, + "loss": 7.6711, + "step": 7486 + }, + { + "epoch": 1.2813623138798562, + "grad_norm": 10.799430847167969, + "learning_rate": 1.29783459883288e-05, + "loss": 0.821, + "step": 7487 + }, + { + "epoch": 1.2815334588396372, + "grad_norm": 4.837438583374023, + "learning_rate": 1.296692997709165e-05, + "loss": 0.3963, + "step": 7488 + }, + { + "epoch": 1.281704603799418, + "grad_norm": 23.152280807495117, + "learning_rate": 1.2955515165368835e-05, + "loss": 2.2947, + "step": 7489 + }, + { + "epoch": 1.281875748759199, + "grad_norm": 4.582759380340576, + "learning_rate": 1.294410155989512e-05, + "loss": 0.3968, + "step": 7490 + }, + { + "epoch": 1.28204689371898, + "grad_norm": 
8.332213401794434, + "learning_rate": 1.293268916740455e-05, + "loss": 0.5057, + "step": 7491 + }, + { + "epoch": 1.282218038678761, + "grad_norm": 18.660024642944336, + "learning_rate": 1.292127799463045e-05, + "loss": 1.5929, + "step": 7492 + }, + { + "epoch": 1.282389183638542, + "grad_norm": 14.767051696777344, + "learning_rate": 1.2909868048305429e-05, + "loss": 1.321, + "step": 7493 + }, + { + "epoch": 1.2825603285983227, + "grad_norm": 0.5502254962921143, + "learning_rate": 1.2898459335161372e-05, + "loss": 0.1237, + "step": 7494 + }, + { + "epoch": 1.2827314735581037, + "grad_norm": 18.758405685424805, + "learning_rate": 1.2887051861929434e-05, + "loss": 1.7648, + "step": 7495 + }, + { + "epoch": 1.2829026185178847, + "grad_norm": 0.6086778044700623, + "learning_rate": 1.287564563534005e-05, + "loss": 0.1372, + "step": 7496 + }, + { + "epoch": 1.2830737634776657, + "grad_norm": 21.15096664428711, + "learning_rate": 1.2864240662122903e-05, + "loss": 1.7089, + "step": 7497 + }, + { + "epoch": 1.2832449084374464, + "grad_norm": 16.411664962768555, + "learning_rate": 1.2852836949006946e-05, + "loss": 1.412, + "step": 7498 + }, + { + "epoch": 1.2834160533972274, + "grad_norm": 4.890477180480957, + "learning_rate": 1.2841434502720388e-05, + "loss": 0.3891, + "step": 7499 + }, + { + "epoch": 1.2835871983570084, + "grad_norm": 17.419998168945312, + "learning_rate": 1.2830033329990685e-05, + "loss": 1.6243, + "step": 7500 + }, + { + "epoch": 1.2837583433167894, + "grad_norm": 15.588261604309082, + "learning_rate": 1.2818633437544555e-05, + "loss": 1.0956, + "step": 7501 + }, + { + "epoch": 1.2839294882765704, + "grad_norm": 15.560494422912598, + "learning_rate": 1.2807234832107943e-05, + "loss": 1.5608, + "step": 7502 + }, + { + "epoch": 1.2841006332363512, + "grad_norm": 25.251907348632812, + "learning_rate": 1.2795837520406048e-05, + "loss": 5.2597, + "step": 7503 + }, + { + "epoch": 1.2842717781961321, + "grad_norm": 12.475388526916504, + "learning_rate": 1.2784441509163297e-05, + "loss": 1.0031, + "step": 7504 + }, + { + "epoch": 1.2844429231559131, + "grad_norm": 12.28564167022705, + "learning_rate": 1.2773046805103353e-05, + "loss": 0.8902, + "step": 7505 + }, + { + "epoch": 1.284614068115694, + "grad_norm": 18.319833755493164, + "learning_rate": 1.2761653414949115e-05, + "loss": 1.701, + "step": 7506 + }, + { + "epoch": 1.2847852130754749, + "grad_norm": 18.66644859313965, + "learning_rate": 1.2750261345422696e-05, + "loss": 2.0435, + "step": 7507 + }, + { + "epoch": 1.2849563580352559, + "grad_norm": 2.7434170246124268, + "learning_rate": 1.2738870603245434e-05, + "loss": 0.2375, + "step": 7508 + }, + { + "epoch": 1.2851275029950369, + "grad_norm": 16.576847076416016, + "learning_rate": 1.2727481195137886e-05, + "loss": 1.5716, + "step": 7509 + }, + { + "epoch": 1.2852986479548179, + "grad_norm": 16.501052856445312, + "learning_rate": 1.2716093127819816e-05, + "loss": 1.0884, + "step": 7510 + }, + { + "epoch": 1.2854697929145986, + "grad_norm": 12.034847259521484, + "learning_rate": 1.2704706408010203e-05, + "loss": 0.9672, + "step": 7511 + }, + { + "epoch": 1.2856409378743796, + "grad_norm": 5.079814910888672, + "learning_rate": 1.2693321042427225e-05, + "loss": 0.3154, + "step": 7512 + }, + { + "epoch": 1.2858120828341606, + "grad_norm": 0.42951419949531555, + "learning_rate": 1.2681937037788272e-05, + "loss": 0.1267, + "step": 7513 + }, + { + "epoch": 1.2859832277939414, + "grad_norm": 24.724746704101562, + "learning_rate": 1.2670554400809915e-05, + "loss": 2.0771, + "step": 7514 
+ }, + { + "epoch": 1.2861543727537224, + "grad_norm": 15.345876693725586, + "learning_rate": 1.2659173138207933e-05, + "loss": 1.3652, + "step": 7515 + }, + { + "epoch": 1.2863255177135033, + "grad_norm": 17.33806037902832, + "learning_rate": 1.2647793256697284e-05, + "loss": 1.59, + "step": 7516 + }, + { + "epoch": 1.2864966626732843, + "grad_norm": 17.414993286132812, + "learning_rate": 1.2636414762992118e-05, + "loss": 1.5101, + "step": 7517 + }, + { + "epoch": 1.2866678076330653, + "grad_norm": 20.75063705444336, + "learning_rate": 1.2625037663805763e-05, + "loss": 1.6154, + "step": 7518 + }, + { + "epoch": 1.286838952592846, + "grad_norm": 14.97973918914795, + "learning_rate": 1.2613661965850725e-05, + "loss": 1.1913, + "step": 7519 + }, + { + "epoch": 1.287010097552627, + "grad_norm": 3.322770118713379, + "learning_rate": 1.2602287675838682e-05, + "loss": 0.2227, + "step": 7520 + }, + { + "epoch": 1.287181242512408, + "grad_norm": 15.625368118286133, + "learning_rate": 1.2590914800480482e-05, + "loss": 1.1869, + "step": 7521 + }, + { + "epoch": 1.2873523874721888, + "grad_norm": 15.407942771911621, + "learning_rate": 1.2579543346486132e-05, + "loss": 1.7004, + "step": 7522 + }, + { + "epoch": 1.2875235324319698, + "grad_norm": 29.231836318969727, + "learning_rate": 1.2568173320564815e-05, + "loss": 1.3564, + "step": 7523 + }, + { + "epoch": 1.2876946773917508, + "grad_norm": 19.370813369750977, + "learning_rate": 1.2556804729424863e-05, + "loss": 1.5538, + "step": 7524 + }, + { + "epoch": 1.2878658223515318, + "grad_norm": 16.274520874023438, + "learning_rate": 1.2545437579773762e-05, + "loss": 1.3551, + "step": 7525 + }, + { + "epoch": 1.2880369673113128, + "grad_norm": 12.730664253234863, + "learning_rate": 1.2534071878318143e-05, + "loss": 1.1659, + "step": 7526 + }, + { + "epoch": 1.2882081122710936, + "grad_norm": 0.45628058910369873, + "learning_rate": 1.252270763176379e-05, + "loss": 0.1251, + "step": 7527 + }, + { + "epoch": 1.2883792572308745, + "grad_norm": 26.614437103271484, + "learning_rate": 1.2511344846815621e-05, + "loss": 5.658, + "step": 7528 + }, + { + "epoch": 1.2885504021906555, + "grad_norm": 14.866990089416504, + "learning_rate": 1.24999835301777e-05, + "loss": 1.4717, + "step": 7529 + }, + { + "epoch": 1.2887215471504363, + "grad_norm": 67.76228332519531, + "learning_rate": 1.248862368855322e-05, + "loss": 7.1911, + "step": 7530 + }, + { + "epoch": 1.2888926921102173, + "grad_norm": 15.273962020874023, + "learning_rate": 1.2477265328644505e-05, + "loss": 1.2144, + "step": 7531 + }, + { + "epoch": 1.2890638370699983, + "grad_norm": 11.719128608703613, + "learning_rate": 1.2465908457153e-05, + "loss": 1.0968, + "step": 7532 + }, + { + "epoch": 1.2892349820297793, + "grad_norm": 25.215810775756836, + "learning_rate": 1.2454553080779283e-05, + "loss": 1.9797, + "step": 7533 + }, + { + "epoch": 1.2894061269895603, + "grad_norm": 20.87401580810547, + "learning_rate": 1.244319920622304e-05, + "loss": 1.9945, + "step": 7534 + }, + { + "epoch": 1.289577271949341, + "grad_norm": 19.922645568847656, + "learning_rate": 1.2431846840183074e-05, + "loss": 1.7993, + "step": 7535 + }, + { + "epoch": 1.289748416909122, + "grad_norm": 3.47524356842041, + "learning_rate": 1.2420495989357297e-05, + "loss": 0.2826, + "step": 7536 + }, + { + "epoch": 1.289919561868903, + "grad_norm": 21.989328384399414, + "learning_rate": 1.240914666044273e-05, + "loss": 2.259, + "step": 7537 + }, + { + "epoch": 1.2900907068286838, + "grad_norm": 14.711652755737305, + "learning_rate": 
1.2397798860135483e-05, + "loss": 1.169, + "step": 7538 + }, + { + "epoch": 1.2902618517884648, + "grad_norm": 16.291339874267578, + "learning_rate": 1.2386452595130793e-05, + "loss": 1.3224, + "step": 7539 + }, + { + "epoch": 1.2904329967482457, + "grad_norm": 18.962268829345703, + "learning_rate": 1.2375107872122963e-05, + "loss": 1.5555, + "step": 7540 + }, + { + "epoch": 1.2906041417080267, + "grad_norm": 13.209924697875977, + "learning_rate": 1.2363764697805402e-05, + "loss": 1.2939, + "step": 7541 + }, + { + "epoch": 1.2907752866678077, + "grad_norm": 1.855237603187561, + "learning_rate": 1.2352423078870592e-05, + "loss": 0.2471, + "step": 7542 + }, + { + "epoch": 1.2909464316275885, + "grad_norm": 0.9598819613456726, + "learning_rate": 1.2341083022010103e-05, + "loss": 0.1429, + "step": 7543 + }, + { + "epoch": 1.2911175765873695, + "grad_norm": 13.005754470825195, + "learning_rate": 1.23297445339146e-05, + "loss": 0.9044, + "step": 7544 + }, + { + "epoch": 1.2912887215471505, + "grad_norm": 26.620267868041992, + "learning_rate": 1.2318407621273798e-05, + "loss": 5.5246, + "step": 7545 + }, + { + "epoch": 1.2914598665069315, + "grad_norm": 7.993296146392822, + "learning_rate": 1.2307072290776492e-05, + "loss": 0.6265, + "step": 7546 + }, + { + "epoch": 1.2916310114667122, + "grad_norm": 28.138734817504883, + "learning_rate": 1.2295738549110547e-05, + "loss": 5.4191, + "step": 7547 + }, + { + "epoch": 1.2918021564264932, + "grad_norm": 30.03383445739746, + "learning_rate": 1.2284406402962877e-05, + "loss": 5.7041, + "step": 7548 + }, + { + "epoch": 1.2919733013862742, + "grad_norm": 8.908590316772461, + "learning_rate": 1.2273075859019486e-05, + "loss": 1.1269, + "step": 7549 + }, + { + "epoch": 1.2921444463460552, + "grad_norm": 13.159024238586426, + "learning_rate": 1.2261746923965395e-05, + "loss": 1.0702, + "step": 7550 + }, + { + "epoch": 1.2923155913058362, + "grad_norm": 22.145456314086914, + "learning_rate": 1.2250419604484698e-05, + "loss": 1.5557, + "step": 7551 + }, + { + "epoch": 1.292486736265617, + "grad_norm": 17.461442947387695, + "learning_rate": 1.223909390726053e-05, + "loss": 1.6463, + "step": 7552 + }, + { + "epoch": 1.292657881225398, + "grad_norm": 13.080147743225098, + "learning_rate": 1.2227769838975069e-05, + "loss": 1.0846, + "step": 7553 + }, + { + "epoch": 1.292829026185179, + "grad_norm": 16.844499588012695, + "learning_rate": 1.221644740630953e-05, + "loss": 1.587, + "step": 7554 + }, + { + "epoch": 1.2930001711449597, + "grad_norm": 1.6371265649795532, + "learning_rate": 1.220512661594417e-05, + "loss": 0.2438, + "step": 7555 + }, + { + "epoch": 1.2931713161047407, + "grad_norm": 61.34412384033203, + "learning_rate": 1.2193807474558268e-05, + "loss": 6.45, + "step": 7556 + }, + { + "epoch": 1.2933424610645217, + "grad_norm": 9.564046859741211, + "learning_rate": 1.2182489988830141e-05, + "loss": 1.0304, + "step": 7557 + }, + { + "epoch": 1.2935136060243027, + "grad_norm": 8.173623085021973, + "learning_rate": 1.2171174165437112e-05, + "loss": 0.6847, + "step": 7558 + }, + { + "epoch": 1.2936847509840836, + "grad_norm": 13.157476425170898, + "learning_rate": 1.2159860011055534e-05, + "loss": 1.353, + "step": 7559 + }, + { + "epoch": 1.2938558959438644, + "grad_norm": 17.588411331176758, + "learning_rate": 1.2148547532360786e-05, + "loss": 1.2431, + "step": 7560 + }, + { + "epoch": 1.2940270409036454, + "grad_norm": 10.51596450805664, + "learning_rate": 1.2137236736027241e-05, + "loss": 0.7703, + "step": 7561 + }, + { + "epoch": 1.2941981858634264, + 
"grad_norm": 5.292048454284668, + "learning_rate": 1.2125927628728285e-05, + "loss": 0.3808, + "step": 7562 + }, + { + "epoch": 1.2943693308232072, + "grad_norm": 17.15229034423828, + "learning_rate": 1.2114620217136309e-05, + "loss": 1.3197, + "step": 7563 + }, + { + "epoch": 1.2945404757829881, + "grad_norm": 12.941659927368164, + "learning_rate": 1.2103314507922697e-05, + "loss": 1.134, + "step": 7564 + }, + { + "epoch": 1.2947116207427691, + "grad_norm": 10.494157791137695, + "learning_rate": 1.2092010507757849e-05, + "loss": 0.9958, + "step": 7565 + }, + { + "epoch": 1.2948827657025501, + "grad_norm": 22.58926773071289, + "learning_rate": 1.2080708223311127e-05, + "loss": 5.5401, + "step": 7566 + }, + { + "epoch": 1.295053910662331, + "grad_norm": 20.25670051574707, + "learning_rate": 1.2069407661250903e-05, + "loss": 1.9963, + "step": 7567 + }, + { + "epoch": 1.2952250556221119, + "grad_norm": 14.322406768798828, + "learning_rate": 1.2058108828244524e-05, + "loss": 1.3138, + "step": 7568 + }, + { + "epoch": 1.2953962005818929, + "grad_norm": 10.519211769104004, + "learning_rate": 1.204681173095832e-05, + "loss": 1.0285, + "step": 7569 + }, + { + "epoch": 1.2955673455416739, + "grad_norm": 15.762115478515625, + "learning_rate": 1.2035516376057591e-05, + "loss": 1.5637, + "step": 7570 + }, + { + "epoch": 1.2957384905014546, + "grad_norm": 0.9862691164016724, + "learning_rate": 1.2024222770206614e-05, + "loss": 0.1459, + "step": 7571 + }, + { + "epoch": 1.2959096354612356, + "grad_norm": 7.247229099273682, + "learning_rate": 1.2012930920068638e-05, + "loss": 0.4876, + "step": 7572 + }, + { + "epoch": 1.2960807804210166, + "grad_norm": 19.82130241394043, + "learning_rate": 1.2001640832305872e-05, + "loss": 2.2351, + "step": 7573 + }, + { + "epoch": 1.2962519253807976, + "grad_norm": 20.365571975708008, + "learning_rate": 1.1990352513579476e-05, + "loss": 1.6286, + "step": 7574 + }, + { + "epoch": 1.2964230703405786, + "grad_norm": 14.565311431884766, + "learning_rate": 1.1979065970549573e-05, + "loss": 1.146, + "step": 7575 + }, + { + "epoch": 1.2965942153003593, + "grad_norm": 10.465729713439941, + "learning_rate": 1.1967781209875254e-05, + "loss": 0.8435, + "step": 7576 + }, + { + "epoch": 1.2967653602601403, + "grad_norm": 151.20823669433594, + "learning_rate": 1.195649823821454e-05, + "loss": 8.9416, + "step": 7577 + }, + { + "epoch": 1.2969365052199213, + "grad_norm": 5.1723103523254395, + "learning_rate": 1.1945217062224398e-05, + "loss": 0.5016, + "step": 7578 + }, + { + "epoch": 1.297107650179702, + "grad_norm": 20.893043518066406, + "learning_rate": 1.1933937688560737e-05, + "loss": 1.8287, + "step": 7579 + }, + { + "epoch": 1.297278795139483, + "grad_norm": 9.546403884887695, + "learning_rate": 1.1922660123878407e-05, + "loss": 0.8622, + "step": 7580 + }, + { + "epoch": 1.297449940099264, + "grad_norm": 3.183964490890503, + "learning_rate": 1.1911384374831184e-05, + "loss": 0.3136, + "step": 7581 + }, + { + "epoch": 1.297621085059045, + "grad_norm": 8.555353164672852, + "learning_rate": 1.1900110448071781e-05, + "loss": 0.78, + "step": 7582 + }, + { + "epoch": 1.297792230018826, + "grad_norm": 15.852571487426758, + "learning_rate": 1.1888838350251835e-05, + "loss": 1.3163, + "step": 7583 + }, + { + "epoch": 1.2979633749786068, + "grad_norm": 30.275362014770508, + "learning_rate": 1.1877568088021896e-05, + "loss": 5.5375, + "step": 7584 + }, + { + "epoch": 1.2981345199383878, + "grad_norm": 19.9348201751709, + "learning_rate": 1.1866299668031434e-05, + "loss": 1.3742, + "step": 
7585 + }, + { + "epoch": 1.2983056648981688, + "grad_norm": 4.104188919067383, + "learning_rate": 1.185503309692883e-05, + "loss": 0.3701, + "step": 7586 + }, + { + "epoch": 1.2984768098579496, + "grad_norm": 14.718408584594727, + "learning_rate": 1.184376838136139e-05, + "loss": 1.0145, + "step": 7587 + }, + { + "epoch": 1.2986479548177305, + "grad_norm": 16.853395462036133, + "learning_rate": 1.1832505527975305e-05, + "loss": 2.297, + "step": 7588 + }, + { + "epoch": 1.2988190997775115, + "grad_norm": 0.6066228151321411, + "learning_rate": 1.1821244543415678e-05, + "loss": 0.1384, + "step": 7589 + }, + { + "epoch": 1.2989902447372925, + "grad_norm": 6.714770317077637, + "learning_rate": 1.1809985434326502e-05, + "loss": 0.4904, + "step": 7590 + }, + { + "epoch": 1.2991613896970735, + "grad_norm": 12.138726234436035, + "learning_rate": 1.179872820735067e-05, + "loss": 1.1924, + "step": 7591 + }, + { + "epoch": 1.2993325346568543, + "grad_norm": 0.5841194987297058, + "learning_rate": 1.1787472869129965e-05, + "loss": 0.1313, + "step": 7592 + }, + { + "epoch": 1.2995036796166353, + "grad_norm": 15.870086669921875, + "learning_rate": 1.1776219426305055e-05, + "loss": 1.3421, + "step": 7593 + }, + { + "epoch": 1.2996748245764163, + "grad_norm": 8.11445140838623, + "learning_rate": 1.1764967885515483e-05, + "loss": 0.6656, + "step": 7594 + }, + { + "epoch": 1.299845969536197, + "grad_norm": 10.974652290344238, + "learning_rate": 1.1753718253399677e-05, + "loss": 0.9709, + "step": 7595 + }, + { + "epoch": 1.300017114495978, + "grad_norm": 8.759194374084473, + "learning_rate": 1.1742470536594938e-05, + "loss": 0.5787, + "step": 7596 + }, + { + "epoch": 1.300188259455759, + "grad_norm": 22.7238826751709, + "learning_rate": 1.1731224741737437e-05, + "loss": 1.7518, + "step": 7597 + }, + { + "epoch": 1.30035940441554, + "grad_norm": 15.450611114501953, + "learning_rate": 1.1719980875462205e-05, + "loss": 1.4182, + "step": 7598 + }, + { + "epoch": 1.300530549375321, + "grad_norm": 13.895708084106445, + "learning_rate": 1.170873894440314e-05, + "loss": 0.9097, + "step": 7599 + }, + { + "epoch": 1.300701694335102, + "grad_norm": 21.733577728271484, + "learning_rate": 1.1697498955193e-05, + "loss": 1.898, + "step": 7600 + }, + { + "epoch": 1.3008728392948827, + "grad_norm": 12.212570190429688, + "learning_rate": 1.1686260914463396e-05, + "loss": 1.0078, + "step": 7601 + }, + { + "epoch": 1.3010439842546637, + "grad_norm": 18.94055938720703, + "learning_rate": 1.1675024828844786e-05, + "loss": 1.3883, + "step": 7602 + }, + { + "epoch": 1.3012151292144447, + "grad_norm": 18.536867141723633, + "learning_rate": 1.1663790704966482e-05, + "loss": 1.6046, + "step": 7603 + }, + { + "epoch": 1.3013862741742255, + "grad_norm": 10.81980037689209, + "learning_rate": 1.165255854945663e-05, + "loss": 0.9442, + "step": 7604 + }, + { + "epoch": 1.3015574191340065, + "grad_norm": 0.4216785430908203, + "learning_rate": 1.1641328368942222e-05, + "loss": 0.1273, + "step": 7605 + }, + { + "epoch": 1.3017285640937875, + "grad_norm": 26.57358741760254, + "learning_rate": 1.1630100170049076e-05, + "loss": 3.7187, + "step": 7606 + }, + { + "epoch": 1.3018997090535684, + "grad_norm": 4.196712493896484, + "learning_rate": 1.1618873959401848e-05, + "loss": 0.3642, + "step": 7607 + }, + { + "epoch": 1.3020708540133494, + "grad_norm": 3.9136526584625244, + "learning_rate": 1.1607649743624024e-05, + "loss": 0.5315, + "step": 7608 + }, + { + "epoch": 1.3022419989731302, + "grad_norm": 2.5337984561920166, + "learning_rate": 
1.15964275293379e-05, + "loss": 0.2766, + "step": 7609 + }, + { + "epoch": 1.3024131439329112, + "grad_norm": 19.823795318603516, + "learning_rate": 1.1585207323164607e-05, + "loss": 1.5666, + "step": 7610 + }, + { + "epoch": 1.3025842888926922, + "grad_norm": 17.423213958740234, + "learning_rate": 1.1573989131724079e-05, + "loss": 1.2708, + "step": 7611 + }, + { + "epoch": 1.302755433852473, + "grad_norm": 23.07868194580078, + "learning_rate": 1.1562772961635064e-05, + "loss": 1.222, + "step": 7612 + }, + { + "epoch": 1.302926578812254, + "grad_norm": 3.356442928314209, + "learning_rate": 1.1551558819515127e-05, + "loss": 0.3407, + "step": 7613 + }, + { + "epoch": 1.303097723772035, + "grad_norm": 1.479873538017273, + "learning_rate": 1.154034671198062e-05, + "loss": 0.2154, + "step": 7614 + }, + { + "epoch": 1.303268868731816, + "grad_norm": 20.11029052734375, + "learning_rate": 1.1529136645646705e-05, + "loss": 2.3389, + "step": 7615 + }, + { + "epoch": 1.303440013691597, + "grad_norm": 6.832971096038818, + "learning_rate": 1.1517928627127338e-05, + "loss": 0.4051, + "step": 7616 + }, + { + "epoch": 1.3036111586513777, + "grad_norm": 7.990339279174805, + "learning_rate": 1.1506722663035266e-05, + "loss": 0.5021, + "step": 7617 + }, + { + "epoch": 1.3037823036111587, + "grad_norm": 15.064433097839355, + "learning_rate": 1.1495518759982024e-05, + "loss": 1.0979, + "step": 7618 + }, + { + "epoch": 1.3039534485709396, + "grad_norm": 14.080619812011719, + "learning_rate": 1.1484316924577938e-05, + "loss": 1.0404, + "step": 7619 + }, + { + "epoch": 1.3041245935307204, + "grad_norm": 25.42909812927246, + "learning_rate": 1.1473117163432102e-05, + "loss": 5.4308, + "step": 7620 + }, + { + "epoch": 1.3042957384905014, + "grad_norm": 18.933637619018555, + "learning_rate": 1.1461919483152392e-05, + "loss": 2.5292, + "step": 7621 + }, + { + "epoch": 1.3044668834502824, + "grad_norm": 0.5344691276550293, + "learning_rate": 1.1450723890345459e-05, + "loss": 0.1414, + "step": 7622 + }, + { + "epoch": 1.3046380284100634, + "grad_norm": 22.04081916809082, + "learning_rate": 1.1439530391616711e-05, + "loss": 1.8622, + "step": 7623 + }, + { + "epoch": 1.3048091733698444, + "grad_norm": 0.5450618863105774, + "learning_rate": 1.1428338993570341e-05, + "loss": 0.1399, + "step": 7624 + }, + { + "epoch": 1.3049803183296251, + "grad_norm": 19.328487396240234, + "learning_rate": 1.1417149702809283e-05, + "loss": 1.4455, + "step": 7625 + }, + { + "epoch": 1.3051514632894061, + "grad_norm": 10.009942054748535, + "learning_rate": 1.1405962525935237e-05, + "loss": 0.7642, + "step": 7626 + }, + { + "epoch": 1.305322608249187, + "grad_norm": 19.036094665527344, + "learning_rate": 1.1394777469548654e-05, + "loss": 1.6603, + "step": 7627 + }, + { + "epoch": 1.3054937532089679, + "grad_norm": 7.515437126159668, + "learning_rate": 1.1383594540248733e-05, + "loss": 0.6696, + "step": 7628 + }, + { + "epoch": 1.3056648981687489, + "grad_norm": 10.558402061462402, + "learning_rate": 1.1372413744633417e-05, + "loss": 0.7952, + "step": 7629 + }, + { + "epoch": 1.3058360431285299, + "grad_norm": 12.095398902893066, + "learning_rate": 1.1361235089299398e-05, + "loss": 1.2338, + "step": 7630 + }, + { + "epoch": 1.3060071880883108, + "grad_norm": 10.3566312789917, + "learning_rate": 1.1350058580842098e-05, + "loss": 0.9752, + "step": 7631 + }, + { + "epoch": 1.3061783330480918, + "grad_norm": 4.167954444885254, + "learning_rate": 1.133888422585567e-05, + "loss": 0.384, + "step": 7632 + }, + { + "epoch": 1.3063494780078726, + 
"grad_norm": 18.931066513061523, + "learning_rate": 1.1327712030933002e-05, + "loss": 2.0, + "step": 7633 + }, + { + "epoch": 1.3065206229676536, + "grad_norm": 123.077392578125, + "learning_rate": 1.1316542002665701e-05, + "loss": 7.1118, + "step": 7634 + }, + { + "epoch": 1.3066917679274346, + "grad_norm": 12.888384819030762, + "learning_rate": 1.1305374147644112e-05, + "loss": 1.194, + "step": 7635 + }, + { + "epoch": 1.3068629128872153, + "grad_norm": 13.995150566101074, + "learning_rate": 1.1294208472457276e-05, + "loss": 1.0899, + "step": 7636 + }, + { + "epoch": 1.3070340578469963, + "grad_norm": 7.124889373779297, + "learning_rate": 1.128304498369296e-05, + "loss": 0.4111, + "step": 7637 + }, + { + "epoch": 1.3072052028067773, + "grad_norm": 9.083507537841797, + "learning_rate": 1.1271883687937645e-05, + "loss": 1.0383, + "step": 7638 + }, + { + "epoch": 1.3073763477665583, + "grad_norm": 10.323081016540527, + "learning_rate": 1.1260724591776502e-05, + "loss": 0.8118, + "step": 7639 + }, + { + "epoch": 1.3075474927263393, + "grad_norm": 12.912384986877441, + "learning_rate": 1.1249567701793422e-05, + "loss": 1.1339, + "step": 7640 + }, + { + "epoch": 1.30771863768612, + "grad_norm": 63.08304214477539, + "learning_rate": 1.1238413024570982e-05, + "loss": 6.4905, + "step": 7641 + }, + { + "epoch": 1.307889782645901, + "grad_norm": 17.389144897460938, + "learning_rate": 1.122726056669046e-05, + "loss": 1.6189, + "step": 7642 + }, + { + "epoch": 1.308060927605682, + "grad_norm": 7.14937162399292, + "learning_rate": 1.1216110334731825e-05, + "loss": 0.6183, + "step": 7643 + }, + { + "epoch": 1.3082320725654628, + "grad_norm": 5.7636919021606445, + "learning_rate": 1.1204962335273728e-05, + "loss": 0.4276, + "step": 7644 + }, + { + "epoch": 1.3084032175252438, + "grad_norm": 13.262709617614746, + "learning_rate": 1.1193816574893499e-05, + "loss": 0.8461, + "step": 7645 + }, + { + "epoch": 1.3085743624850248, + "grad_norm": 17.416807174682617, + "learning_rate": 1.1182673060167168e-05, + "loss": 1.6257, + "step": 7646 + }, + { + "epoch": 1.3087455074448058, + "grad_norm": 21.898544311523438, + "learning_rate": 1.1171531797669413e-05, + "loss": 1.7339, + "step": 7647 + }, + { + "epoch": 1.3089166524045868, + "grad_norm": 21.608203887939453, + "learning_rate": 1.1160392793973605e-05, + "loss": 2.1039, + "step": 7648 + }, + { + "epoch": 1.3090877973643678, + "grad_norm": 22.45233726501465, + "learning_rate": 1.1149256055651767e-05, + "loss": 2.9647, + "step": 7649 + }, + { + "epoch": 1.3092589423241485, + "grad_norm": 21.876420974731445, + "learning_rate": 1.113812158927458e-05, + "loss": 5.0497, + "step": 7650 + }, + { + "epoch": 1.3094300872839295, + "grad_norm": 16.637968063354492, + "learning_rate": 1.1126989401411418e-05, + "loss": 1.7433, + "step": 7651 + }, + { + "epoch": 1.3096012322437105, + "grad_norm": 9.169917106628418, + "learning_rate": 1.1115859498630277e-05, + "loss": 1.1167, + "step": 7652 + }, + { + "epoch": 1.3097723772034913, + "grad_norm": 16.875045776367188, + "learning_rate": 1.1104731887497817e-05, + "loss": 1.5559, + "step": 7653 + }, + { + "epoch": 1.3099435221632723, + "grad_norm": 0.782894492149353, + "learning_rate": 1.1093606574579346e-05, + "loss": 0.135, + "step": 7654 + }, + { + "epoch": 1.3101146671230532, + "grad_norm": 19.62564468383789, + "learning_rate": 1.1082483566438814e-05, + "loss": 2.1235, + "step": 7655 + }, + { + "epoch": 1.3102858120828342, + "grad_norm": 14.405755996704102, + "learning_rate": 1.107136286963881e-05, + "loss": 1.208, + "step": 
7656 + }, + { + "epoch": 1.3104569570426152, + "grad_norm": 13.453514099121094, + "learning_rate": 1.1060244490740567e-05, + "loss": 1.0998, + "step": 7657 + }, + { + "epoch": 1.310628102002396, + "grad_norm": 7.483944892883301, + "learning_rate": 1.1049128436303943e-05, + "loss": 0.4954, + "step": 7658 + }, + { + "epoch": 1.310799246962177, + "grad_norm": 139.34796142578125, + "learning_rate": 1.1038014712887425e-05, + "loss": 7.8605, + "step": 7659 + }, + { + "epoch": 1.310970391921958, + "grad_norm": 20.454601287841797, + "learning_rate": 1.1026903327048128e-05, + "loss": 2.6431, + "step": 7660 + }, + { + "epoch": 1.3111415368817387, + "grad_norm": 18.389663696289062, + "learning_rate": 1.1015794285341782e-05, + "loss": 2.1709, + "step": 7661 + }, + { + "epoch": 1.3113126818415197, + "grad_norm": 39.07166290283203, + "learning_rate": 1.1004687594322747e-05, + "loss": 5.7758, + "step": 7662 + }, + { + "epoch": 1.3114838268013007, + "grad_norm": 5.378295421600342, + "learning_rate": 1.0993583260543978e-05, + "loss": 0.3618, + "step": 7663 + }, + { + "epoch": 1.3116549717610817, + "grad_norm": 23.937475204467773, + "learning_rate": 1.0982481290557056e-05, + "loss": 1.1674, + "step": 7664 + }, + { + "epoch": 1.3118261167208627, + "grad_norm": 7.371132850646973, + "learning_rate": 1.0971381690912159e-05, + "loss": 0.7905, + "step": 7665 + }, + { + "epoch": 1.3119972616806435, + "grad_norm": 9.415770530700684, + "learning_rate": 1.0960284468158055e-05, + "loss": 1.0082, + "step": 7666 + }, + { + "epoch": 1.3121684066404244, + "grad_norm": 12.06342887878418, + "learning_rate": 1.0949189628842139e-05, + "loss": 0.9626, + "step": 7667 + }, + { + "epoch": 1.3123395516002054, + "grad_norm": 0.5903443098068237, + "learning_rate": 1.0938097179510376e-05, + "loss": 0.1343, + "step": 7668 + }, + { + "epoch": 1.3125106965599862, + "grad_norm": 7.682429790496826, + "learning_rate": 1.0927007126707325e-05, + "loss": 0.4691, + "step": 7669 + }, + { + "epoch": 1.3126818415197672, + "grad_norm": 10.526029586791992, + "learning_rate": 1.0915919476976142e-05, + "loss": 1.0967, + "step": 7670 + }, + { + "epoch": 1.3128529864795482, + "grad_norm": 11.364990234375, + "learning_rate": 1.0904834236858544e-05, + "loss": 1.022, + "step": 7671 + }, + { + "epoch": 1.3130241314393292, + "grad_norm": 9.109969139099121, + "learning_rate": 1.0893751412894843e-05, + "loss": 0.7541, + "step": 7672 + }, + { + "epoch": 1.3131952763991102, + "grad_norm": 15.1395902633667, + "learning_rate": 1.0882671011623927e-05, + "loss": 1.9152, + "step": 7673 + }, + { + "epoch": 1.313366421358891, + "grad_norm": 17.07834243774414, + "learning_rate": 1.0871593039583253e-05, + "loss": 1.2759, + "step": 7674 + }, + { + "epoch": 1.313537566318672, + "grad_norm": 22.273035049438477, + "learning_rate": 1.086051750330883e-05, + "loss": 5.0596, + "step": 7675 + }, + { + "epoch": 1.313708711278453, + "grad_norm": 2.752251148223877, + "learning_rate": 1.0849444409335247e-05, + "loss": 0.2592, + "step": 7676 + }, + { + "epoch": 1.3138798562382337, + "grad_norm": 4.555734634399414, + "learning_rate": 1.0838373764195636e-05, + "loss": 0.3269, + "step": 7677 + }, + { + "epoch": 1.3140510011980147, + "grad_norm": 0.5538658499717712, + "learning_rate": 1.0827305574421713e-05, + "loss": 0.1312, + "step": 7678 + }, + { + "epoch": 1.3142221461577956, + "grad_norm": 15.580399513244629, + "learning_rate": 1.0816239846543714e-05, + "loss": 1.2762, + "step": 7679 + }, + { + "epoch": 1.3143932911175766, + "grad_norm": 4.781283378601074, + "learning_rate": 
1.0805176587090435e-05, + "loss": 0.5832, + "step": 7680 + }, + { + "epoch": 1.3145644360773576, + "grad_norm": 10.91511058807373, + "learning_rate": 1.079411580258922e-05, + "loss": 0.9796, + "step": 7681 + }, + { + "epoch": 1.3147355810371384, + "grad_norm": 1.3821643590927124, + "learning_rate": 1.0783057499565945e-05, + "loss": 0.1531, + "step": 7682 + }, + { + "epoch": 1.3149067259969194, + "grad_norm": 18.001895904541016, + "learning_rate": 1.0772001684545027e-05, + "loss": 1.8831, + "step": 7683 + }, + { + "epoch": 1.3150778709567004, + "grad_norm": 18.8228759765625, + "learning_rate": 1.0760948364049413e-05, + "loss": 1.4197, + "step": 7684 + }, + { + "epoch": 1.3152490159164811, + "grad_norm": 15.630839347839355, + "learning_rate": 1.0749897544600576e-05, + "loss": 1.2747, + "step": 7685 + }, + { + "epoch": 1.3154201608762621, + "grad_norm": 15.606013298034668, + "learning_rate": 1.0738849232718523e-05, + "loss": 1.2318, + "step": 7686 + }, + { + "epoch": 1.3155913058360431, + "grad_norm": 4.676835536956787, + "learning_rate": 1.0727803434921765e-05, + "loss": 0.2796, + "step": 7687 + }, + { + "epoch": 1.315762450795824, + "grad_norm": 26.167057037353516, + "learning_rate": 1.0716760157727336e-05, + "loss": 5.4689, + "step": 7688 + }, + { + "epoch": 1.315933595755605, + "grad_norm": 19.37649154663086, + "learning_rate": 1.0705719407650805e-05, + "loss": 1.9204, + "step": 7689 + }, + { + "epoch": 1.3161047407153859, + "grad_norm": 16.943334579467773, + "learning_rate": 1.0694681191206218e-05, + "loss": 1.5151, + "step": 7690 + }, + { + "epoch": 1.3162758856751668, + "grad_norm": 12.672527313232422, + "learning_rate": 1.068364551490614e-05, + "loss": 0.9585, + "step": 7691 + }, + { + "epoch": 1.3164470306349478, + "grad_norm": 14.246026992797852, + "learning_rate": 1.0672612385261636e-05, + "loss": 1.1833, + "step": 7692 + }, + { + "epoch": 1.3166181755947286, + "grad_norm": 19.577896118164062, + "learning_rate": 1.0661581808782264e-05, + "loss": 1.6338, + "step": 7693 + }, + { + "epoch": 1.3167893205545096, + "grad_norm": 18.652299880981445, + "learning_rate": 1.0650553791976096e-05, + "loss": 1.8381, + "step": 7694 + }, + { + "epoch": 1.3169604655142906, + "grad_norm": 14.095416069030762, + "learning_rate": 1.0639528341349668e-05, + "loss": 1.135, + "step": 7695 + }, + { + "epoch": 1.3171316104740716, + "grad_norm": 13.167633056640625, + "learning_rate": 1.062850546340801e-05, + "loss": 1.0979, + "step": 7696 + }, + { + "epoch": 1.3173027554338526, + "grad_norm": 18.160646438598633, + "learning_rate": 1.0617485164654645e-05, + "loss": 2.445, + "step": 7697 + }, + { + "epoch": 1.3174739003936335, + "grad_norm": 10.149754524230957, + "learning_rate": 1.0606467451591556e-05, + "loss": 0.8535, + "step": 7698 + }, + { + "epoch": 1.3176450453534143, + "grad_norm": 12.657750129699707, + "learning_rate": 1.0595452330719214e-05, + "loss": 0.8057, + "step": 7699 + }, + { + "epoch": 1.3178161903131953, + "grad_norm": 19.711631774902344, + "learning_rate": 1.058443980853656e-05, + "loss": 2.5439, + "step": 7700 + }, + { + "epoch": 1.3179873352729763, + "grad_norm": 13.596765518188477, + "learning_rate": 1.0573429891540995e-05, + "loss": 0.9922, + "step": 7701 + }, + { + "epoch": 1.318158480232757, + "grad_norm": 11.752197265625, + "learning_rate": 1.056242258622839e-05, + "loss": 0.9834, + "step": 7702 + }, + { + "epoch": 1.318329625192538, + "grad_norm": 13.766919136047363, + "learning_rate": 1.0551417899093064e-05, + "loss": 1.2064, + "step": 7703 + }, + { + "epoch": 1.318500770152319, + 
"grad_norm": 3.4079298973083496, + "learning_rate": 1.05404158366278e-05, + "loss": 0.3151, + "step": 7704 + }, + { + "epoch": 1.3186719151121, + "grad_norm": 17.611780166625977, + "learning_rate": 1.0529416405323839e-05, + "loss": 2.0194, + "step": 7705 + }, + { + "epoch": 1.318843060071881, + "grad_norm": 6.004448890686035, + "learning_rate": 1.0518419611670863e-05, + "loss": 0.3998, + "step": 7706 + }, + { + "epoch": 1.3190142050316618, + "grad_norm": 0.5409048795700073, + "learning_rate": 1.0507425462156985e-05, + "loss": 0.1352, + "step": 7707 + }, + { + "epoch": 1.3191853499914428, + "grad_norm": 11.109127044677734, + "learning_rate": 1.0496433963268778e-05, + "loss": 1.2146, + "step": 7708 + }, + { + "epoch": 1.3193564949512238, + "grad_norm": 13.240180969238281, + "learning_rate": 1.0485445121491234e-05, + "loss": 1.0734, + "step": 7709 + }, + { + "epoch": 1.3195276399110045, + "grad_norm": 11.445103645324707, + "learning_rate": 1.0474458943307803e-05, + "loss": 0.872, + "step": 7710 + }, + { + "epoch": 1.3196987848707855, + "grad_norm": 13.537281036376953, + "learning_rate": 1.0463475435200332e-05, + "loss": 1.038, + "step": 7711 + }, + { + "epoch": 1.3198699298305665, + "grad_norm": 100.9023666381836, + "learning_rate": 1.0452494603649108e-05, + "loss": 7.4707, + "step": 7712 + }, + { + "epoch": 1.3200410747903475, + "grad_norm": 14.019550323486328, + "learning_rate": 1.0441516455132846e-05, + "loss": 1.2618, + "step": 7713 + }, + { + "epoch": 1.3202122197501285, + "grad_norm": 4.731026649475098, + "learning_rate": 1.0430540996128663e-05, + "loss": 0.3173, + "step": 7714 + }, + { + "epoch": 1.3203833647099092, + "grad_norm": 12.830647468566895, + "learning_rate": 1.0419568233112095e-05, + "loss": 1.0202, + "step": 7715 + }, + { + "epoch": 1.3205545096696902, + "grad_norm": 8.369707107543945, + "learning_rate": 1.0408598172557096e-05, + "loss": 0.6916, + "step": 7716 + }, + { + "epoch": 1.3207256546294712, + "grad_norm": 7.795764446258545, + "learning_rate": 1.0397630820936014e-05, + "loss": 0.767, + "step": 7717 + }, + { + "epoch": 1.320896799589252, + "grad_norm": 15.821785926818848, + "learning_rate": 1.03866661847196e-05, + "loss": 1.7533, + "step": 7718 + }, + { + "epoch": 1.321067944549033, + "grad_norm": 30.86501693725586, + "learning_rate": 1.0375704270377012e-05, + "loss": 5.3286, + "step": 7719 + }, + { + "epoch": 1.321239089508814, + "grad_norm": 0.4868166446685791, + "learning_rate": 1.0364745084375787e-05, + "loss": 0.1375, + "step": 7720 + }, + { + "epoch": 1.321410234468595, + "grad_norm": 19.305397033691406, + "learning_rate": 1.035378863318187e-05, + "loss": 2.0752, + "step": 7721 + }, + { + "epoch": 1.321581379428376, + "grad_norm": 4.07341194152832, + "learning_rate": 1.034283492325958e-05, + "loss": 0.3295, + "step": 7722 + }, + { + "epoch": 1.3217525243881567, + "grad_norm": 15.446962356567383, + "learning_rate": 1.033188396107162e-05, + "loss": 1.3605, + "step": 7723 + }, + { + "epoch": 1.3219236693479377, + "grad_norm": 17.727609634399414, + "learning_rate": 1.0320935753079077e-05, + "loss": 1.2988, + "step": 7724 + }, + { + "epoch": 1.3220948143077187, + "grad_norm": 12.297486305236816, + "learning_rate": 1.0309990305741412e-05, + "loss": 1.0018, + "step": 7725 + }, + { + "epoch": 1.3222659592674995, + "grad_norm": 8.26491641998291, + "learning_rate": 1.0299047625516452e-05, + "loss": 0.8143, + "step": 7726 + }, + { + "epoch": 1.3224371042272804, + "grad_norm": 7.907574653625488, + "learning_rate": 1.028810771886039e-05, + "loss": 1.0222, + "step": 7727 + 
}, + { + "epoch": 1.3226082491870614, + "grad_norm": 0.49876004457473755, + "learning_rate": 1.0277170592227796e-05, + "loss": 0.1302, + "step": 7728 + }, + { + "epoch": 1.3227793941468424, + "grad_norm": 13.960297584533691, + "learning_rate": 1.0266236252071584e-05, + "loss": 1.0474, + "step": 7729 + }, + { + "epoch": 1.3229505391066234, + "grad_norm": 0.38284948468208313, + "learning_rate": 1.0255304704843037e-05, + "loss": 0.1207, + "step": 7730 + }, + { + "epoch": 1.3231216840664042, + "grad_norm": 21.4552001953125, + "learning_rate": 1.0244375956991776e-05, + "loss": 1.9789, + "step": 7731 + }, + { + "epoch": 1.3232928290261852, + "grad_norm": 0.8332533240318298, + "learning_rate": 1.0233450014965787e-05, + "loss": 0.1405, + "step": 7732 + }, + { + "epoch": 1.3234639739859662, + "grad_norm": 15.524604797363281, + "learning_rate": 1.022252688521139e-05, + "loss": 1.1942, + "step": 7733 + }, + { + "epoch": 1.323635118945747, + "grad_norm": 4.579717636108398, + "learning_rate": 1.0211606574173245e-05, + "loss": 0.3253, + "step": 7734 + }, + { + "epoch": 1.323806263905528, + "grad_norm": 5.621901988983154, + "learning_rate": 1.0200689088294356e-05, + "loss": 0.4184, + "step": 7735 + }, + { + "epoch": 1.323977408865309, + "grad_norm": 16.8519229888916, + "learning_rate": 1.0189774434016048e-05, + "loss": 1.1868, + "step": 7736 + }, + { + "epoch": 1.32414855382509, + "grad_norm": 20.170482635498047, + "learning_rate": 1.017886261777799e-05, + "loss": 2.5078, + "step": 7737 + }, + { + "epoch": 1.3243196987848709, + "grad_norm": 21.047101974487305, + "learning_rate": 1.0167953646018171e-05, + "loss": 2.4527, + "step": 7738 + }, + { + "epoch": 1.3244908437446516, + "grad_norm": 18.5394229888916, + "learning_rate": 1.0157047525172897e-05, + "loss": 2.0388, + "step": 7739 + }, + { + "epoch": 1.3246619887044326, + "grad_norm": 2.999378204345703, + "learning_rate": 1.0146144261676798e-05, + "loss": 0.2756, + "step": 7740 + }, + { + "epoch": 1.3248331336642136, + "grad_norm": 5.7144269943237305, + "learning_rate": 1.0135243861962813e-05, + "loss": 0.4059, + "step": 7741 + }, + { + "epoch": 1.3250042786239944, + "grad_norm": 13.805132865905762, + "learning_rate": 1.0124346332462198e-05, + "loss": 1.1937, + "step": 7742 + }, + { + "epoch": 1.3251754235837754, + "grad_norm": 12.285717964172363, + "learning_rate": 1.0113451679604507e-05, + "loss": 1.3567, + "step": 7743 + }, + { + "epoch": 1.3253465685435564, + "grad_norm": 10.607677459716797, + "learning_rate": 1.0102559909817604e-05, + "loss": 0.8848, + "step": 7744 + }, + { + "epoch": 1.3255177135033374, + "grad_norm": 9.427162170410156, + "learning_rate": 1.0091671029527644e-05, + "loss": 0.6905, + "step": 7745 + }, + { + "epoch": 1.3256888584631183, + "grad_norm": 0.6724772453308105, + "learning_rate": 1.0080785045159091e-05, + "loss": 0.1361, + "step": 7746 + }, + { + "epoch": 1.3258600034228991, + "grad_norm": 10.575011253356934, + "learning_rate": 1.0069901963134687e-05, + "loss": 0.7674, + "step": 7747 + }, + { + "epoch": 1.32603114838268, + "grad_norm": 10.72413158416748, + "learning_rate": 1.005902178987547e-05, + "loss": 0.8709, + "step": 7748 + }, + { + "epoch": 1.326202293342461, + "grad_norm": 21.01888656616211, + "learning_rate": 1.0048144531800754e-05, + "loss": 1.8226, + "step": 7749 + }, + { + "epoch": 1.326373438302242, + "grad_norm": 6.55217981338501, + "learning_rate": 1.0037270195328141e-05, + "loss": 0.382, + "step": 7750 + }, + { + "epoch": 1.3265445832620228, + "grad_norm": 10.578644752502441, + "learning_rate": 
1.0026398786873505e-05, + "loss": 0.7, + "step": 7751 + }, + { + "epoch": 1.3267157282218038, + "grad_norm": 14.880857467651367, + "learning_rate": 1.0015530312850989e-05, + "loss": 1.1683, + "step": 7752 + }, + { + "epoch": 1.3268868731815848, + "grad_norm": 9.10450553894043, + "learning_rate": 1.0004664779673017e-05, + "loss": 0.4336, + "step": 7753 + }, + { + "epoch": 1.3270580181413658, + "grad_norm": 7.297650337219238, + "learning_rate": 9.993802193750263e-06, + "loss": 0.458, + "step": 7754 + }, + { + "epoch": 1.3272291631011468, + "grad_norm": 20.097755432128906, + "learning_rate": 9.982942561491673e-06, + "loss": 2.2245, + "step": 7755 + }, + { + "epoch": 1.3274003080609276, + "grad_norm": 19.3148136138916, + "learning_rate": 9.972085889304445e-06, + "loss": 1.512, + "step": 7756 + }, + { + "epoch": 1.3275714530207086, + "grad_norm": 24.726036071777344, + "learning_rate": 9.96123218359403e-06, + "loss": 3.2435, + "step": 7757 + }, + { + "epoch": 1.3277425979804895, + "grad_norm": 14.569957733154297, + "learning_rate": 9.95038145076413e-06, + "loss": 1.122, + "step": 7758 + }, + { + "epoch": 1.3279137429402703, + "grad_norm": 16.211366653442383, + "learning_rate": 9.939533697216696e-06, + "loss": 1.1394, + "step": 7759 + }, + { + "epoch": 1.3280848879000513, + "grad_norm": 0.47790247201919556, + "learning_rate": 9.92868892935192e-06, + "loss": 0.132, + "step": 7760 + }, + { + "epoch": 1.3282560328598323, + "grad_norm": 10.083187103271484, + "learning_rate": 9.917847153568227e-06, + "loss": 0.687, + "step": 7761 + }, + { + "epoch": 1.3284271778196133, + "grad_norm": 10.120450973510742, + "learning_rate": 9.907008376262288e-06, + "loss": 0.6699, + "step": 7762 + }, + { + "epoch": 1.3285983227793943, + "grad_norm": 16.270835876464844, + "learning_rate": 9.896172603828982e-06, + "loss": 1.3664, + "step": 7763 + }, + { + "epoch": 1.328769467739175, + "grad_norm": 15.282939910888672, + "learning_rate": 9.88533984266145e-06, + "loss": 1.2255, + "step": 7764 + }, + { + "epoch": 1.328940612698956, + "grad_norm": 18.38252830505371, + "learning_rate": 9.874510099151028e-06, + "loss": 1.4593, + "step": 7765 + }, + { + "epoch": 1.329111757658737, + "grad_norm": 7.455735683441162, + "learning_rate": 9.863683379687281e-06, + "loss": 0.8564, + "step": 7766 + }, + { + "epoch": 1.3292829026185178, + "grad_norm": 7.662227630615234, + "learning_rate": 9.852859690657995e-06, + "loss": 0.6627, + "step": 7767 + }, + { + "epoch": 1.3294540475782988, + "grad_norm": 15.670798301696777, + "learning_rate": 9.842039038449153e-06, + "loss": 1.2757, + "step": 7768 + }, + { + "epoch": 1.3296251925380798, + "grad_norm": 33.80439758300781, + "learning_rate": 9.831221429444963e-06, + "loss": 5.7635, + "step": 7769 + }, + { + "epoch": 1.3297963374978607, + "grad_norm": 145.04127502441406, + "learning_rate": 9.820406870027826e-06, + "loss": 8.283, + "step": 7770 + }, + { + "epoch": 1.3299674824576417, + "grad_norm": 17.340248107910156, + "learning_rate": 9.809595366578351e-06, + "loss": 1.3826, + "step": 7771 + }, + { + "epoch": 1.3301386274174225, + "grad_norm": 21.254674911499023, + "learning_rate": 9.798786925475342e-06, + "loss": 1.8894, + "step": 7772 + }, + { + "epoch": 1.3303097723772035, + "grad_norm": 11.893777847290039, + "learning_rate": 9.787981553095794e-06, + "loss": 0.8193, + "step": 7773 + }, + { + "epoch": 1.3304809173369845, + "grad_norm": 22.362438201904297, + "learning_rate": 9.777179255814888e-06, + "loss": 2.1771, + "step": 7774 + }, + { + "epoch": 1.3306520622967652, + "grad_norm": 
21.80306053161621, + "learning_rate": 9.766380040006005e-06, + "loss": 3.0675, + "step": 7775 + }, + { + "epoch": 1.3308232072565462, + "grad_norm": 1.6046303510665894, + "learning_rate": 9.755583912040692e-06, + "loss": 0.2445, + "step": 7776 + }, + { + "epoch": 1.3309943522163272, + "grad_norm": 9.884397506713867, + "learning_rate": 9.744790878288683e-06, + "loss": 0.7851, + "step": 7777 + }, + { + "epoch": 1.3311654971761082, + "grad_norm": 20.357301712036133, + "learning_rate": 9.734000945117886e-06, + "loss": 1.7883, + "step": 7778 + }, + { + "epoch": 1.3313366421358892, + "grad_norm": 129.23085021972656, + "learning_rate": 9.723214118894366e-06, + "loss": 9.2255, + "step": 7779 + }, + { + "epoch": 1.33150778709567, + "grad_norm": 13.197471618652344, + "learning_rate": 9.712430405982382e-06, + "loss": 0.8971, + "step": 7780 + }, + { + "epoch": 1.331678932055451, + "grad_norm": 18.437232971191406, + "learning_rate": 9.701649812744335e-06, + "loss": 1.4011, + "step": 7781 + }, + { + "epoch": 1.331850077015232, + "grad_norm": 12.094624519348145, + "learning_rate": 9.69087234554079e-06, + "loss": 0.7557, + "step": 7782 + }, + { + "epoch": 1.3320212219750127, + "grad_norm": 0.46964266896247864, + "learning_rate": 9.680098010730468e-06, + "loss": 0.1422, + "step": 7783 + }, + { + "epoch": 1.3321923669347937, + "grad_norm": 10.259994506835938, + "learning_rate": 9.669326814670244e-06, + "loss": 0.8023, + "step": 7784 + }, + { + "epoch": 1.3323635118945747, + "grad_norm": 13.318816184997559, + "learning_rate": 9.658558763715139e-06, + "loss": 1.0209, + "step": 7785 + }, + { + "epoch": 1.3325346568543557, + "grad_norm": 14.060872077941895, + "learning_rate": 9.647793864218318e-06, + "loss": 1.2461, + "step": 7786 + }, + { + "epoch": 1.3327058018141367, + "grad_norm": 40.56553649902344, + "learning_rate": 9.63703212253109e-06, + "loss": 5.0031, + "step": 7787 + }, + { + "epoch": 1.3328769467739174, + "grad_norm": 12.271203994750977, + "learning_rate": 9.626273545002897e-06, + "loss": 1.0462, + "step": 7788 + }, + { + "epoch": 1.3330480917336984, + "grad_norm": 12.823019027709961, + "learning_rate": 9.615518137981317e-06, + "loss": 0.9287, + "step": 7789 + }, + { + "epoch": 1.3332192366934794, + "grad_norm": 72.58329772949219, + "learning_rate": 9.604765907812051e-06, + "loss": 7.3743, + "step": 7790 + }, + { + "epoch": 1.3333903816532602, + "grad_norm": 17.8341064453125, + "learning_rate": 9.59401686083894e-06, + "loss": 2.4335, + "step": 7791 + }, + { + "epoch": 1.3335615266130412, + "grad_norm": 18.05230140686035, + "learning_rate": 9.583271003403932e-06, + "loss": 1.3464, + "step": 7792 + }, + { + "epoch": 1.3337326715728222, + "grad_norm": 22.57405662536621, + "learning_rate": 9.5725283418471e-06, + "loss": 2.1458, + "step": 7793 + }, + { + "epoch": 1.3339038165326031, + "grad_norm": 21.594478607177734, + "learning_rate": 9.561788882506636e-06, + "loss": 1.1118, + "step": 7794 + }, + { + "epoch": 1.3340749614923841, + "grad_norm": 20.780794143676758, + "learning_rate": 9.55105263171882e-06, + "loss": 2.7362, + "step": 7795 + }, + { + "epoch": 1.334246106452165, + "grad_norm": 20.054019927978516, + "learning_rate": 9.540319595818072e-06, + "loss": 2.4585, + "step": 7796 + }, + { + "epoch": 1.334417251411946, + "grad_norm": 8.055940628051758, + "learning_rate": 9.529589781136899e-06, + "loss": 0.6487, + "step": 7797 + }, + { + "epoch": 1.3345883963717269, + "grad_norm": 13.177647590637207, + "learning_rate": 9.518863194005898e-06, + "loss": 0.9085, + "step": 7798 + }, + { + "epoch": 
1.3347595413315076, + "grad_norm": 16.22951316833496, + "learning_rate": 9.508139840753782e-06, + "loss": 1.3964, + "step": 7799 + }, + { + "epoch": 1.3349306862912886, + "grad_norm": 23.331989288330078, + "learning_rate": 9.49741972770733e-06, + "loss": 5.3252, + "step": 7800 + }, + { + "epoch": 1.3351018312510696, + "grad_norm": 12.572565078735352, + "learning_rate": 9.486702861191439e-06, + "loss": 1.1444, + "step": 7801 + }, + { + "epoch": 1.3352729762108506, + "grad_norm": 5.193902015686035, + "learning_rate": 9.475989247529075e-06, + "loss": 0.547, + "step": 7802 + }, + { + "epoch": 1.3354441211706316, + "grad_norm": 57.83475112915039, + "learning_rate": 9.465278893041285e-06, + "loss": 6.9681, + "step": 7803 + }, + { + "epoch": 1.3356152661304126, + "grad_norm": 17.487966537475586, + "learning_rate": 9.454571804047189e-06, + "loss": 1.8436, + "step": 7804 + }, + { + "epoch": 1.3357864110901934, + "grad_norm": 15.3053617477417, + "learning_rate": 9.443867986863986e-06, + "loss": 1.2377, + "step": 7805 + }, + { + "epoch": 1.3359575560499743, + "grad_norm": 15.734844207763672, + "learning_rate": 9.433167447806942e-06, + "loss": 1.4479, + "step": 7806 + }, + { + "epoch": 1.3361287010097553, + "grad_norm": 14.855875968933105, + "learning_rate": 9.422470193189406e-06, + "loss": 1.3216, + "step": 7807 + }, + { + "epoch": 1.336299845969536, + "grad_norm": 14.787861824035645, + "learning_rate": 9.411776229322759e-06, + "loss": 1.3146, + "step": 7808 + }, + { + "epoch": 1.336470990929317, + "grad_norm": 22.168827056884766, + "learning_rate": 9.40108556251646e-06, + "loss": 5.2831, + "step": 7809 + }, + { + "epoch": 1.336642135889098, + "grad_norm": 1.7247884273529053, + "learning_rate": 9.390398199078018e-06, + "loss": 0.2437, + "step": 7810 + }, + { + "epoch": 1.336813280848879, + "grad_norm": 5.1476593017578125, + "learning_rate": 9.379714145312994e-06, + "loss": 0.2906, + "step": 7811 + }, + { + "epoch": 1.33698442580866, + "grad_norm": 17.64693832397461, + "learning_rate": 9.369033407524996e-06, + "loss": 1.0584, + "step": 7812 + }, + { + "epoch": 1.3371555707684408, + "grad_norm": 15.794921875, + "learning_rate": 9.358355992015674e-06, + "loss": 1.1463, + "step": 7813 + }, + { + "epoch": 1.3373267157282218, + "grad_norm": 19.980100631713867, + "learning_rate": 9.34768190508472e-06, + "loss": 2.206, + "step": 7814 + }, + { + "epoch": 1.3374978606880028, + "grad_norm": 1.3618296384811401, + "learning_rate": 9.337011153029864e-06, + "loss": 0.2121, + "step": 7815 + }, + { + "epoch": 1.3376690056477836, + "grad_norm": 11.703173637390137, + "learning_rate": 9.326343742146853e-06, + "loss": 0.9959, + "step": 7816 + }, + { + "epoch": 1.3378401506075646, + "grad_norm": 13.13909912109375, + "learning_rate": 9.315679678729492e-06, + "loss": 1.4271, + "step": 7817 + }, + { + "epoch": 1.3380112955673455, + "grad_norm": 23.269561767578125, + "learning_rate": 9.305018969069586e-06, + "loss": 1.7931, + "step": 7818 + }, + { + "epoch": 1.3381824405271265, + "grad_norm": 13.245935440063477, + "learning_rate": 9.294361619456975e-06, + "loss": 1.0624, + "step": 7819 + }, + { + "epoch": 1.3383535854869075, + "grad_norm": 17.5803279876709, + "learning_rate": 9.283707636179504e-06, + "loss": 1.6743, + "step": 7820 + }, + { + "epoch": 1.3385247304466883, + "grad_norm": 13.90809154510498, + "learning_rate": 9.273057025523039e-06, + "loss": 1.1273, + "step": 7821 + }, + { + "epoch": 1.3386958754064693, + "grad_norm": 0.44874170422554016, + "learning_rate": 9.262409793771455e-06, + "loss": 0.1323, + "step": 7822 
+ }, + { + "epoch": 1.3388670203662503, + "grad_norm": 4.137473106384277, + "learning_rate": 9.251765947206648e-06, + "loss": 0.3071, + "step": 7823 + }, + { + "epoch": 1.339038165326031, + "grad_norm": 7.665952205657959, + "learning_rate": 9.24112549210849e-06, + "loss": 0.7075, + "step": 7824 + }, + { + "epoch": 1.339209310285812, + "grad_norm": 14.941093444824219, + "learning_rate": 9.230488434754869e-06, + "loss": 1.2963, + "step": 7825 + }, + { + "epoch": 1.339380455245593, + "grad_norm": 19.078445434570312, + "learning_rate": 9.219854781421665e-06, + "loss": 2.2177, + "step": 7826 + }, + { + "epoch": 1.339551600205374, + "grad_norm": 18.343894958496094, + "learning_rate": 9.209224538382751e-06, + "loss": 1.8106, + "step": 7827 + }, + { + "epoch": 1.339722745165155, + "grad_norm": 0.48707735538482666, + "learning_rate": 9.198597711909983e-06, + "loss": 0.1278, + "step": 7828 + }, + { + "epoch": 1.3398938901249358, + "grad_norm": 76.68790435791016, + "learning_rate": 9.187974308273206e-06, + "loss": 7.0306, + "step": 7829 + }, + { + "epoch": 1.3400650350847167, + "grad_norm": 3.141286849975586, + "learning_rate": 9.177354333740248e-06, + "loss": 0.2852, + "step": 7830 + }, + { + "epoch": 1.3402361800444977, + "grad_norm": 9.084844589233398, + "learning_rate": 9.166737794576901e-06, + "loss": 0.6802, + "step": 7831 + }, + { + "epoch": 1.3404073250042785, + "grad_norm": 86.43070220947266, + "learning_rate": 9.156124697046946e-06, + "loss": 7.8303, + "step": 7832 + }, + { + "epoch": 1.3405784699640595, + "grad_norm": 7.052088737487793, + "learning_rate": 9.145515047412115e-06, + "loss": 0.7937, + "step": 7833 + }, + { + "epoch": 1.3407496149238405, + "grad_norm": 13.691901206970215, + "learning_rate": 9.134908851932133e-06, + "loss": 1.1782, + "step": 7834 + }, + { + "epoch": 1.3409207598836215, + "grad_norm": 5.9041290283203125, + "learning_rate": 9.124306116864668e-06, + "loss": 0.4507, + "step": 7835 + }, + { + "epoch": 1.3410919048434025, + "grad_norm": 12.721549034118652, + "learning_rate": 9.113706848465341e-06, + "loss": 0.8944, + "step": 7836 + }, + { + "epoch": 1.3412630498031832, + "grad_norm": 16.74374771118164, + "learning_rate": 9.103111052987743e-06, + "loss": 1.4153, + "step": 7837 + }, + { + "epoch": 1.3414341947629642, + "grad_norm": 14.554159164428711, + "learning_rate": 9.0925187366834e-06, + "loss": 1.3382, + "step": 7838 + }, + { + "epoch": 1.3416053397227452, + "grad_norm": 13.031160354614258, + "learning_rate": 9.08192990580181e-06, + "loss": 0.746, + "step": 7839 + }, + { + "epoch": 1.341776484682526, + "grad_norm": 11.126832962036133, + "learning_rate": 9.071344566590387e-06, + "loss": 0.9381, + "step": 7840 + }, + { + "epoch": 1.341947629642307, + "grad_norm": 17.594663619995117, + "learning_rate": 9.060762725294501e-06, + "loss": 1.4099, + "step": 7841 + }, + { + "epoch": 1.342118774602088, + "grad_norm": 23.681806564331055, + "learning_rate": 9.050184388157454e-06, + "loss": 5.1249, + "step": 7842 + }, + { + "epoch": 1.342289919561869, + "grad_norm": 11.81534194946289, + "learning_rate": 9.039609561420477e-06, + "loss": 0.738, + "step": 7843 + }, + { + "epoch": 1.34246106452165, + "grad_norm": 0.513480544090271, + "learning_rate": 9.029038251322738e-06, + "loss": 0.1294, + "step": 7844 + }, + { + "epoch": 1.3426322094814307, + "grad_norm": 19.058822631835938, + "learning_rate": 9.018470464101325e-06, + "loss": 1.3968, + "step": 7845 + }, + { + "epoch": 1.3428033544412117, + "grad_norm": 2.775284767150879, + "learning_rate": 9.007906205991247e-06, + "loss": 
0.2575, + "step": 7846 + }, + { + "epoch": 1.3429744994009927, + "grad_norm": 8.478485107421875, + "learning_rate": 8.997345483225433e-06, + "loss": 1.0727, + "step": 7847 + }, + { + "epoch": 1.3431456443607734, + "grad_norm": 13.795414924621582, + "learning_rate": 8.986788302034724e-06, + "loss": 1.3467, + "step": 7848 + }, + { + "epoch": 1.3433167893205544, + "grad_norm": 21.440906524658203, + "learning_rate": 8.976234668647871e-06, + "loss": 1.8343, + "step": 7849 + }, + { + "epoch": 1.3434879342803354, + "grad_norm": 11.401413917541504, + "learning_rate": 8.965684589291537e-06, + "loss": 0.8268, + "step": 7850 + }, + { + "epoch": 1.3436590792401164, + "grad_norm": 11.775639533996582, + "learning_rate": 8.955138070190284e-06, + "loss": 1.1194, + "step": 7851 + }, + { + "epoch": 1.3438302241998974, + "grad_norm": 9.425366401672363, + "learning_rate": 8.944595117566574e-06, + "loss": 0.8341, + "step": 7852 + }, + { + "epoch": 1.3440013691596784, + "grad_norm": 1.781152606010437, + "learning_rate": 8.934055737640765e-06, + "loss": 0.1981, + "step": 7853 + }, + { + "epoch": 1.3441725141194591, + "grad_norm": 13.886120796203613, + "learning_rate": 8.923519936631102e-06, + "loss": 1.2971, + "step": 7854 + }, + { + "epoch": 1.3443436590792401, + "grad_norm": 12.310486793518066, + "learning_rate": 8.912987720753735e-06, + "loss": 0.7952, + "step": 7855 + }, + { + "epoch": 1.3445148040390211, + "grad_norm": 11.246068000793457, + "learning_rate": 8.902459096222673e-06, + "loss": 1.0557, + "step": 7856 + }, + { + "epoch": 1.344685948998802, + "grad_norm": 11.441505432128906, + "learning_rate": 8.891934069249827e-06, + "loss": 0.8978, + "step": 7857 + }, + { + "epoch": 1.3448570939585829, + "grad_norm": 13.720816612243652, + "learning_rate": 8.881412646044977e-06, + "loss": 1.0826, + "step": 7858 + }, + { + "epoch": 1.3450282389183639, + "grad_norm": 13.07275104522705, + "learning_rate": 8.870894832815776e-06, + "loss": 0.9818, + "step": 7859 + }, + { + "epoch": 1.3451993838781449, + "grad_norm": 13.763599395751953, + "learning_rate": 8.860380635767758e-06, + "loss": 1.3965, + "step": 7860 + }, + { + "epoch": 1.3453705288379258, + "grad_norm": 21.626535415649414, + "learning_rate": 8.849870061104309e-06, + "loss": 1.7967, + "step": 7861 + }, + { + "epoch": 1.3455416737977066, + "grad_norm": 20.506174087524414, + "learning_rate": 8.83936311502668e-06, + "loss": 2.3941, + "step": 7862 + }, + { + "epoch": 1.3457128187574876, + "grad_norm": 12.698630332946777, + "learning_rate": 8.828859803733994e-06, + "loss": 0.8943, + "step": 7863 + }, + { + "epoch": 1.3458839637172686, + "grad_norm": 10.888079643249512, + "learning_rate": 8.818360133423214e-06, + "loss": 0.6842, + "step": 7864 + }, + { + "epoch": 1.3460551086770494, + "grad_norm": 0.5401256680488586, + "learning_rate": 8.807864110289159e-06, + "loss": 0.1295, + "step": 7865 + }, + { + "epoch": 1.3462262536368304, + "grad_norm": 7.9375081062316895, + "learning_rate": 8.797371740524508e-06, + "loss": 0.5116, + "step": 7866 + }, + { + "epoch": 1.3463973985966113, + "grad_norm": 5.087529182434082, + "learning_rate": 8.786883030319765e-06, + "loss": 0.4002, + "step": 7867 + }, + { + "epoch": 1.3465685435563923, + "grad_norm": 12.330865859985352, + "learning_rate": 8.776397985863289e-06, + "loss": 1.0029, + "step": 7868 + }, + { + "epoch": 1.3467396885161733, + "grad_norm": 17.346025466918945, + "learning_rate": 8.765916613341272e-06, + "loss": 1.667, + "step": 7869 + }, + { + "epoch": 1.346910833475954, + "grad_norm": 14.988444328308105, + 
"learning_rate": 8.75543891893774e-06, + "loss": 1.1046, + "step": 7870 + }, + { + "epoch": 1.347081978435735, + "grad_norm": 20.520917892456055, + "learning_rate": 8.744964908834543e-06, + "loss": 2.4082, + "step": 7871 + }, + { + "epoch": 1.347253123395516, + "grad_norm": 11.348979949951172, + "learning_rate": 8.734494589211371e-06, + "loss": 1.0192, + "step": 7872 + }, + { + "epoch": 1.3474242683552968, + "grad_norm": 15.412310600280762, + "learning_rate": 8.724027966245718e-06, + "loss": 1.3022, + "step": 7873 + }, + { + "epoch": 1.3475954133150778, + "grad_norm": 20.15825080871582, + "learning_rate": 8.71356504611292e-06, + "loss": 2.031, + "step": 7874 + }, + { + "epoch": 1.3477665582748588, + "grad_norm": 5.575507640838623, + "learning_rate": 8.7031058349861e-06, + "loss": 0.3702, + "step": 7875 + }, + { + "epoch": 1.3479377032346398, + "grad_norm": 6.61264705657959, + "learning_rate": 8.692650339036217e-06, + "loss": 0.4081, + "step": 7876 + }, + { + "epoch": 1.3481088481944208, + "grad_norm": 21.451274871826172, + "learning_rate": 8.682198564432035e-06, + "loss": 2.5121, + "step": 7877 + }, + { + "epoch": 1.3482799931542016, + "grad_norm": 14.139251708984375, + "learning_rate": 8.671750517340103e-06, + "loss": 1.4738, + "step": 7878 + }, + { + "epoch": 1.3484511381139825, + "grad_norm": 15.205315589904785, + "learning_rate": 8.661306203924797e-06, + "loss": 1.1368, + "step": 7879 + }, + { + "epoch": 1.3486222830737635, + "grad_norm": 1.4786579608917236, + "learning_rate": 8.650865630348275e-06, + "loss": 0.195, + "step": 7880 + }, + { + "epoch": 1.3487934280335443, + "grad_norm": 10.230877876281738, + "learning_rate": 8.640428802770474e-06, + "loss": 0.7811, + "step": 7881 + }, + { + "epoch": 1.3489645729933253, + "grad_norm": 19.68021583557129, + "learning_rate": 8.629995727349164e-06, + "loss": 2.2237, + "step": 7882 + }, + { + "epoch": 1.3491357179531063, + "grad_norm": 10.938794136047363, + "learning_rate": 8.619566410239862e-06, + "loss": 0.9205, + "step": 7883 + }, + { + "epoch": 1.3493068629128873, + "grad_norm": 1.4069541692733765, + "learning_rate": 8.609140857595876e-06, + "loss": 0.1698, + "step": 7884 + }, + { + "epoch": 1.3494780078726683, + "grad_norm": 0.7581135034561157, + "learning_rate": 8.598719075568308e-06, + "loss": 0.1348, + "step": 7885 + }, + { + "epoch": 1.349649152832449, + "grad_norm": 0.621238648891449, + "learning_rate": 8.58830107030601e-06, + "loss": 0.1342, + "step": 7886 + }, + { + "epoch": 1.34982029779223, + "grad_norm": 7.031002521514893, + "learning_rate": 8.57788684795564e-06, + "loss": 0.582, + "step": 7887 + }, + { + "epoch": 1.349991442752011, + "grad_norm": 0.5243487358093262, + "learning_rate": 8.567476414661596e-06, + "loss": 0.1336, + "step": 7888 + }, + { + "epoch": 1.3501625877117918, + "grad_norm": 18.68903350830078, + "learning_rate": 8.557069776566044e-06, + "loss": 1.4371, + "step": 7889 + }, + { + "epoch": 1.3503337326715728, + "grad_norm": 1.3490283489227295, + "learning_rate": 8.546666939808924e-06, + "loss": 0.2222, + "step": 7890 + }, + { + "epoch": 1.3505048776313537, + "grad_norm": 9.998093605041504, + "learning_rate": 8.536267910527919e-06, + "loss": 0.7217, + "step": 7891 + }, + { + "epoch": 1.3506760225911347, + "grad_norm": 11.268606185913086, + "learning_rate": 8.52587269485847e-06, + "loss": 0.9479, + "step": 7892 + }, + { + "epoch": 1.3508471675509157, + "grad_norm": 20.187633514404297, + "learning_rate": 8.515481298933783e-06, + "loss": 1.8128, + "step": 7893 + }, + { + "epoch": 1.3508471675509157, + 
"eval_nli-pairs_loss": 1.4432600736618042, + "eval_nli-pairs_runtime": 4.3769, + "eval_nli-pairs_samples_per_second": 45.694, + "eval_nli-pairs_steps_per_second": 1.599, + "eval_sts-test_pearson_cosine": 0.7748113572228759, + "eval_sts-test_pearson_dot": 0.6390425293409608, + "eval_sts-test_pearson_euclidean": 0.7620744050210577, + "eval_sts-test_pearson_manhattan": 0.7657457138434305, + "eval_sts-test_pearson_max": 0.7748113572228759, + "eval_sts-test_spearman_cosine": 0.7729829193564915, + "eval_sts-test_spearman_dot": 0.6192746726630098, + "eval_sts-test_spearman_euclidean": 0.7504799466626302, + "eval_sts-test_spearman_manhattan": 0.755559036954118, + "eval_sts-test_spearman_max": 0.7729829193564915, + "step": 7893 + }, + { + "epoch": 1.3508471675509157, + "eval_vitaminc-pairs_loss": 0.7334175109863281, + "eval_vitaminc-pairs_runtime": 2.7709, + "eval_vitaminc-pairs_samples_per_second": 72.178, + "eval_vitaminc-pairs_steps_per_second": 2.526, + "step": 7893 + }, + { + "epoch": 1.3508471675509157, + "eval_qnli-contrastive_loss": 1.5698559284210205, + "eval_qnli-contrastive_runtime": 0.6423, + "eval_qnli-contrastive_samples_per_second": 311.394, + "eval_qnli-contrastive_steps_per_second": 10.899, + "step": 7893 + }, + { + "epoch": 1.3508471675509157, + "eval_scitail-pairs-qa_loss": 0.10769753158092499, + "eval_scitail-pairs-qa_runtime": 1.6203, + "eval_scitail-pairs-qa_samples_per_second": 123.431, + "eval_scitail-pairs-qa_steps_per_second": 4.32, + "step": 7893 + }, + { + "epoch": 1.3508471675509157, + "eval_scitail-pairs-pos_loss": 0.6731968522071838, + "eval_scitail-pairs-pos_runtime": 2.6601, + "eval_scitail-pairs-pos_samples_per_second": 75.186, + "eval_scitail-pairs-pos_steps_per_second": 2.631, + "step": 7893 + }, + { + "epoch": 1.3508471675509157, + "eval_xsum-pairs_loss": 0.7274036407470703, + "eval_xsum-pairs_runtime": 2.6535, + "eval_xsum-pairs_samples_per_second": 65.951, + "eval_xsum-pairs_steps_per_second": 2.261, + "step": 7893 + }, + { + "epoch": 1.3508471675509157, + "eval_compression-pairs_loss": 0.24030046164989471, + "eval_compression-pairs_runtime": 0.5226, + "eval_compression-pairs_samples_per_second": 382.686, + "eval_compression-pairs_steps_per_second": 13.394, + "step": 7893 + }, + { + "epoch": 1.3508471675509157, + "eval_sciq_pairs_loss": 0.43072450160980225, + "eval_sciq_pairs_runtime": 9.2015, + "eval_sciq_pairs_samples_per_second": 21.736, + "eval_sciq_pairs_steps_per_second": 0.761, + "step": 7893 + }, + { + "epoch": 1.3508471675509157, + "eval_qasc_pairs_loss": 5.355893611907959, + "eval_qasc_pairs_runtime": 2.7315, + "eval_qasc_pairs_samples_per_second": 73.219, + "eval_qasc_pairs_steps_per_second": 2.563, + "step": 7893 + }, + { + "epoch": 1.3508471675509157, + "eval_openbookqa_pairs_loss": 2.571211338043213, + "eval_openbookqa_pairs_runtime": 0.659, + "eval_openbookqa_pairs_samples_per_second": 104.704, + "eval_openbookqa_pairs_steps_per_second": 4.552, + "step": 7893 + }, + { + "epoch": 1.3508471675509157, + "eval_msmarco_pairs_loss": 1.1058056354522705, + "eval_msmarco_pairs_runtime": 4.0254, + "eval_msmarco_pairs_samples_per_second": 49.684, + "eval_msmarco_pairs_steps_per_second": 1.739, + "step": 7893 + }, + { + "epoch": 1.3508471675509157, + "eval_nq_pairs_loss": 1.2713885307312012, + "eval_nq_pairs_runtime": 8.6454, + "eval_nq_pairs_samples_per_second": 23.134, + "eval_nq_pairs_steps_per_second": 0.81, + "step": 7893 + }, + { + "epoch": 1.3508471675509157, + "eval_trivia_pairs_loss": 1.5911108255386353, + "eval_trivia_pairs_runtime": 12.8789, + 
"eval_trivia_pairs_samples_per_second": 15.529, + "eval_trivia_pairs_steps_per_second": 0.544, + "step": 7893 + }, + { + "epoch": 1.3508471675509157, + "eval_quora_pairs_loss": 0.21135039627552032, + "eval_quora_pairs_runtime": 1.6142, + "eval_quora_pairs_samples_per_second": 123.901, + "eval_quora_pairs_steps_per_second": 4.337, + "step": 7893 + }, + { + "epoch": 1.3508471675509157, + "eval_gooaq_pairs_loss": 0.8607009053230286, + "eval_gooaq_pairs_runtime": 2.7341, + "eval_gooaq_pairs_samples_per_second": 73.15, + "eval_gooaq_pairs_steps_per_second": 2.56, + "step": 7893 + }, + { + "epoch": 1.3510183125106965, + "grad_norm": 107.09418487548828, + "learning_rate": 8.50509372888478e-06, + "loss": 8.7501, + "step": 7894 + }, + { + "epoch": 1.3511894574704775, + "grad_norm": 11.88301944732666, + "learning_rate": 8.494709990840158e-06, + "loss": 0.8261, + "step": 7895 + }, + { + "epoch": 1.3513606024302585, + "grad_norm": 15.522592544555664, + "learning_rate": 8.484330090926324e-06, + "loss": 1.3149, + "step": 7896 + }, + { + "epoch": 1.3515317473900392, + "grad_norm": 13.993993759155273, + "learning_rate": 8.473954035267448e-06, + "loss": 1.0682, + "step": 7897 + }, + { + "epoch": 1.3517028923498202, + "grad_norm": 5.835932731628418, + "learning_rate": 8.463581829985406e-06, + "loss": 0.5868, + "step": 7898 + }, + { + "epoch": 1.3518740373096012, + "grad_norm": 19.500629425048828, + "learning_rate": 8.453213481199823e-06, + "loss": 2.0912, + "step": 7899 + }, + { + "epoch": 1.3520451822693822, + "grad_norm": 4.438239574432373, + "learning_rate": 8.44284899502804e-06, + "loss": 0.3397, + "step": 7900 + }, + { + "epoch": 1.3522163272291632, + "grad_norm": 13.467206001281738, + "learning_rate": 8.43248837758511e-06, + "loss": 0.9853, + "step": 7901 + }, + { + "epoch": 1.3523874721889442, + "grad_norm": 13.498573303222656, + "learning_rate": 8.422131634983819e-06, + "loss": 1.3403, + "step": 7902 + }, + { + "epoch": 1.352558617148725, + "grad_norm": 197.8994140625, + "learning_rate": 8.411778773334667e-06, + "loss": 8.94, + "step": 7903 + }, + { + "epoch": 1.352729762108506, + "grad_norm": 2.5428144931793213, + "learning_rate": 8.401429798745847e-06, + "loss": 0.2328, + "step": 7904 + }, + { + "epoch": 1.352900907068287, + "grad_norm": 3.9354376792907715, + "learning_rate": 8.391084717323278e-06, + "loss": 0.2556, + "step": 7905 + }, + { + "epoch": 1.3530720520280677, + "grad_norm": 17.734619140625, + "learning_rate": 8.380743535170563e-06, + "loss": 2.4227, + "step": 7906 + }, + { + "epoch": 1.3532431969878487, + "grad_norm": 21.069412231445312, + "learning_rate": 8.37040625838903e-06, + "loss": 1.9247, + "step": 7907 + }, + { + "epoch": 1.3534143419476297, + "grad_norm": 21.592254638671875, + "learning_rate": 8.360072893077672e-06, + "loss": 2.9089, + "step": 7908 + }, + { + "epoch": 1.3535854869074107, + "grad_norm": 3.6049587726593018, + "learning_rate": 8.349743445333196e-06, + "loss": 0.3444, + "step": 7909 + }, + { + "epoch": 1.3537566318671916, + "grad_norm": 16.893281936645508, + "learning_rate": 8.339417921249998e-06, + "loss": 1.6815, + "step": 7910 + }, + { + "epoch": 1.3539277768269724, + "grad_norm": 0.4270938038825989, + "learning_rate": 8.329096326920142e-06, + "loss": 0.1226, + "step": 7911 + }, + { + "epoch": 1.3540989217867534, + "grad_norm": 26.750118255615234, + "learning_rate": 8.318778668433396e-06, + "loss": 1.192, + "step": 7912 + }, + { + "epoch": 1.3542700667465344, + "grad_norm": 11.184526443481445, + "learning_rate": 8.308464951877181e-06, + "loss": 1.0331, + "step": 
7913 + }, + { + "epoch": 1.3544412117063152, + "grad_norm": 25.97614860534668, + "learning_rate": 8.298155183336617e-06, + "loss": 5.0103, + "step": 7914 + }, + { + "epoch": 1.3546123566660961, + "grad_norm": 16.017576217651367, + "learning_rate": 8.287849368894476e-06, + "loss": 1.6073, + "step": 7915 + }, + { + "epoch": 1.3547835016258771, + "grad_norm": 20.626556396484375, + "learning_rate": 8.277547514631201e-06, + "loss": 2.6192, + "step": 7916 + }, + { + "epoch": 1.3549546465856581, + "grad_norm": 0.4352658987045288, + "learning_rate": 8.267249626624908e-06, + "loss": 0.1248, + "step": 7917 + }, + { + "epoch": 1.355125791545439, + "grad_norm": 12.039044380187988, + "learning_rate": 8.256955710951354e-06, + "loss": 0.9049, + "step": 7918 + }, + { + "epoch": 1.3552969365052199, + "grad_norm": 7.01033353805542, + "learning_rate": 8.246665773683985e-06, + "loss": 0.8376, + "step": 7919 + }, + { + "epoch": 1.3554680814650009, + "grad_norm": 15.398770332336426, + "learning_rate": 8.236379820893868e-06, + "loss": 1.4464, + "step": 7920 + }, + { + "epoch": 1.3556392264247819, + "grad_norm": 24.395837783813477, + "learning_rate": 8.226097858649725e-06, + "loss": 1.5301, + "step": 7921 + }, + { + "epoch": 1.3558103713845626, + "grad_norm": 3.7129740715026855, + "learning_rate": 8.215819893017941e-06, + "loss": 0.3776, + "step": 7922 + }, + { + "epoch": 1.3559815163443436, + "grad_norm": 17.666425704956055, + "learning_rate": 8.20554593006252e-06, + "loss": 1.3419, + "step": 7923 + }, + { + "epoch": 1.3561526613041246, + "grad_norm": 20.95212173461914, + "learning_rate": 8.195275975845118e-06, + "loss": 1.4918, + "step": 7924 + }, + { + "epoch": 1.3563238062639056, + "grad_norm": 21.755355834960938, + "learning_rate": 8.185010036425032e-06, + "loss": 2.0847, + "step": 7925 + }, + { + "epoch": 1.3564949512236866, + "grad_norm": 15.192948341369629, + "learning_rate": 8.17474811785917e-06, + "loss": 1.4842, + "step": 7926 + }, + { + "epoch": 1.3566660961834673, + "grad_norm": 10.453164100646973, + "learning_rate": 8.164490226202092e-06, + "loss": 0.9547, + "step": 7927 + }, + { + "epoch": 1.3568372411432483, + "grad_norm": 10.041457176208496, + "learning_rate": 8.154236367505955e-06, + "loss": 0.6116, + "step": 7928 + }, + { + "epoch": 1.3570083861030293, + "grad_norm": 14.595532417297363, + "learning_rate": 8.143986547820556e-06, + "loss": 1.2874, + "step": 7929 + }, + { + "epoch": 1.35717953106281, + "grad_norm": 9.68789291381836, + "learning_rate": 8.133740773193313e-06, + "loss": 0.6884, + "step": 7930 + }, + { + "epoch": 1.357350676022591, + "grad_norm": 10.335504531860352, + "learning_rate": 8.123499049669234e-06, + "loss": 0.9279, + "step": 7931 + }, + { + "epoch": 1.357521820982372, + "grad_norm": 15.070721626281738, + "learning_rate": 8.113261383290964e-06, + "loss": 1.2174, + "step": 7932 + }, + { + "epoch": 1.357692965942153, + "grad_norm": 23.035741806030273, + "learning_rate": 8.10302778009873e-06, + "loss": 2.1438, + "step": 7933 + }, + { + "epoch": 1.357864110901934, + "grad_norm": 15.676365852355957, + "learning_rate": 8.092798246130377e-06, + "loss": 1.3295, + "step": 7934 + }, + { + "epoch": 1.3580352558617148, + "grad_norm": 14.068243980407715, + "learning_rate": 8.082572787421357e-06, + "loss": 0.8558, + "step": 7935 + }, + { + "epoch": 1.3582064008214958, + "grad_norm": 2.1047537326812744, + "learning_rate": 8.072351410004685e-06, + "loss": 0.2433, + "step": 7936 + }, + { + "epoch": 1.3583775457812768, + "grad_norm": 18.384124755859375, + "learning_rate": 
8.062134119911007e-06, + "loss": 2.3037, + "step": 7937 + }, + { + "epoch": 1.3585486907410576, + "grad_norm": 0.5251880288124084, + "learning_rate": 8.051920923168527e-06, + "loss": 0.1249, + "step": 7938 + }, + { + "epoch": 1.3587198357008385, + "grad_norm": 5.348263263702393, + "learning_rate": 8.041711825803055e-06, + "loss": 0.3089, + "step": 7939 + }, + { + "epoch": 1.3588909806606195, + "grad_norm": 12.722254753112793, + "learning_rate": 8.03150683383797e-06, + "loss": 0.9297, + "step": 7940 + }, + { + "epoch": 1.3590621256204005, + "grad_norm": 16.143598556518555, + "learning_rate": 8.02130595329423e-06, + "loss": 0.8998, + "step": 7941 + }, + { + "epoch": 1.3592332705801815, + "grad_norm": 12.335897445678711, + "learning_rate": 8.011109190190374e-06, + "loss": 0.9059, + "step": 7942 + }, + { + "epoch": 1.3594044155399623, + "grad_norm": 0.3998869061470032, + "learning_rate": 8.0009165505425e-06, + "loss": 0.1258, + "step": 7943 + }, + { + "epoch": 1.3595755604997433, + "grad_norm": 9.871674537658691, + "learning_rate": 7.990728040364294e-06, + "loss": 0.8648, + "step": 7944 + }, + { + "epoch": 1.3597467054595243, + "grad_norm": 5.811002731323242, + "learning_rate": 7.980543665666978e-06, + "loss": 0.3648, + "step": 7945 + }, + { + "epoch": 1.359917850419305, + "grad_norm": 3.4111969470977783, + "learning_rate": 7.970363432459352e-06, + "loss": 0.2808, + "step": 7946 + }, + { + "epoch": 1.360088995379086, + "grad_norm": 12.346128463745117, + "learning_rate": 7.96018734674778e-06, + "loss": 0.9339, + "step": 7947 + }, + { + "epoch": 1.360260140338867, + "grad_norm": 20.27092742919922, + "learning_rate": 7.950015414536152e-06, + "loss": 1.8533, + "step": 7948 + }, + { + "epoch": 1.360431285298648, + "grad_norm": 2.390983819961548, + "learning_rate": 7.939847641825934e-06, + "loss": 0.2616, + "step": 7949 + }, + { + "epoch": 1.360602430258429, + "grad_norm": 10.336169242858887, + "learning_rate": 7.929684034616122e-06, + "loss": 0.7228, + "step": 7950 + }, + { + "epoch": 1.3607735752182097, + "grad_norm": 23.217824935913086, + "learning_rate": 7.919524598903256e-06, + "loss": 5.2922, + "step": 7951 + }, + { + "epoch": 1.3609447201779907, + "grad_norm": 16.831451416015625, + "learning_rate": 7.90936934068143e-06, + "loss": 1.9508, + "step": 7952 + }, + { + "epoch": 1.3611158651377717, + "grad_norm": 11.5362548828125, + "learning_rate": 7.89921826594225e-06, + "loss": 0.971, + "step": 7953 + }, + { + "epoch": 1.3612870100975527, + "grad_norm": 121.41738891601562, + "learning_rate": 7.889071380674873e-06, + "loss": 8.451, + "step": 7954 + }, + { + "epoch": 1.3614581550573335, + "grad_norm": 1.763120412826538, + "learning_rate": 7.878928690865967e-06, + "loss": 0.2545, + "step": 7955 + }, + { + "epoch": 1.3616293000171145, + "grad_norm": 29.964916229248047, + "learning_rate": 7.868790202499748e-06, + "loss": 5.3622, + "step": 7956 + }, + { + "epoch": 1.3618004449768955, + "grad_norm": 18.546653747558594, + "learning_rate": 7.858655921557928e-06, + "loss": 1.8526, + "step": 7957 + }, + { + "epoch": 1.3619715899366764, + "grad_norm": 9.387438774108887, + "learning_rate": 7.848525854019749e-06, + "loss": 0.6086, + "step": 7958 + }, + { + "epoch": 1.3621427348964574, + "grad_norm": 9.829107284545898, + "learning_rate": 7.838400005861972e-06, + "loss": 1.1134, + "step": 7959 + }, + { + "epoch": 1.3623138798562382, + "grad_norm": 24.65566635131836, + "learning_rate": 7.828278383058852e-06, + "loss": 3.0146, + "step": 7960 + }, + { + "epoch": 1.3624850248160192, + "grad_norm": 1.144333004951477, 
+ "learning_rate": 7.818160991582167e-06, + "loss": 0.1983, + "step": 7961 + }, + { + "epoch": 1.3626561697758002, + "grad_norm": 14.87730884552002, + "learning_rate": 7.808047837401202e-06, + "loss": 1.1744, + "step": 7962 + }, + { + "epoch": 1.362827314735581, + "grad_norm": 18.721660614013672, + "learning_rate": 7.79793892648272e-06, + "loss": 1.5515, + "step": 7963 + }, + { + "epoch": 1.362998459695362, + "grad_norm": 2.341946601867676, + "learning_rate": 7.787834264791002e-06, + "loss": 0.2613, + "step": 7964 + }, + { + "epoch": 1.363169604655143, + "grad_norm": 10.254395484924316, + "learning_rate": 7.777733858287805e-06, + "loss": 0.7639, + "step": 7965 + }, + { + "epoch": 1.363340749614924, + "grad_norm": 2.7340681552886963, + "learning_rate": 7.767637712932395e-06, + "loss": 0.2966, + "step": 7966 + }, + { + "epoch": 1.363511894574705, + "grad_norm": 3.232351303100586, + "learning_rate": 7.7575458346815e-06, + "loss": 0.2851, + "step": 7967 + }, + { + "epoch": 1.3636830395344857, + "grad_norm": 13.136812210083008, + "learning_rate": 7.74745822948935e-06, + "loss": 1.3821, + "step": 7968 + }, + { + "epoch": 1.3638541844942667, + "grad_norm": 16.989933013916016, + "learning_rate": 7.737374903307653e-06, + "loss": 1.3374, + "step": 7969 + }, + { + "epoch": 1.3640253294540476, + "grad_norm": 15.131267547607422, + "learning_rate": 7.727295862085576e-06, + "loss": 1.7686, + "step": 7970 + }, + { + "epoch": 1.3641964744138284, + "grad_norm": 23.571996688842773, + "learning_rate": 7.717221111769777e-06, + "loss": 5.0176, + "step": 7971 + }, + { + "epoch": 1.3643676193736094, + "grad_norm": 0.3861672580242157, + "learning_rate": 7.707150658304364e-06, + "loss": 0.1201, + "step": 7972 + }, + { + "epoch": 1.3645387643333904, + "grad_norm": 11.791454315185547, + "learning_rate": 7.697084507630925e-06, + "loss": 0.9477, + "step": 7973 + }, + { + "epoch": 1.3647099092931714, + "grad_norm": 25.322511672973633, + "learning_rate": 7.68702266568851e-06, + "loss": 5.0495, + "step": 7974 + }, + { + "epoch": 1.3648810542529524, + "grad_norm": 15.774009704589844, + "learning_rate": 7.67696513841361e-06, + "loss": 1.4033, + "step": 7975 + }, + { + "epoch": 1.3650521992127331, + "grad_norm": 13.291998863220215, + "learning_rate": 7.66691193174019e-06, + "loss": 1.2573, + "step": 7976 + }, + { + "epoch": 1.3652233441725141, + "grad_norm": 14.601156234741211, + "learning_rate": 7.656863051599646e-06, + "loss": 1.1831, + "step": 7977 + }, + { + "epoch": 1.365394489132295, + "grad_norm": 13.060968399047852, + "learning_rate": 7.646818503920841e-06, + "loss": 1.1125, + "step": 7978 + }, + { + "epoch": 1.3655656340920759, + "grad_norm": 0.6287318468093872, + "learning_rate": 7.636778294630076e-06, + "loss": 0.1291, + "step": 7979 + }, + { + "epoch": 1.3657367790518569, + "grad_norm": 2.176541805267334, + "learning_rate": 7.6267424296510836e-06, + "loss": 0.2442, + "step": 7980 + }, + { + "epoch": 1.3659079240116379, + "grad_norm": 89.38330841064453, + "learning_rate": 7.616710914905035e-06, + "loss": 8.7055, + "step": 7981 + }, + { + "epoch": 1.3660790689714188, + "grad_norm": 14.080567359924316, + "learning_rate": 7.606683756310548e-06, + "loss": 1.318, + "step": 7982 + }, + { + "epoch": 1.3662502139311998, + "grad_norm": 15.819098472595215, + "learning_rate": 7.596660959783651e-06, + "loss": 1.3924, + "step": 7983 + }, + { + "epoch": 1.3664213588909806, + "grad_norm": 14.727864265441895, + "learning_rate": 7.586642531237823e-06, + "loss": 1.0125, + "step": 7984 + }, + { + "epoch": 1.3665925038507616, + 
"grad_norm": 19.342557907104492, + "learning_rate": 7.576628476583937e-06, + "loss": 1.4103, + "step": 7985 + }, + { + "epoch": 1.3667636488105426, + "grad_norm": 5.45033597946167, + "learning_rate": 7.56661880173031e-06, + "loss": 0.5339, + "step": 7986 + }, + { + "epoch": 1.3669347937703233, + "grad_norm": 6.75947904586792, + "learning_rate": 7.556613512582664e-06, + "loss": 0.4621, + "step": 7987 + }, + { + "epoch": 1.3671059387301043, + "grad_norm": 10.857300758361816, + "learning_rate": 7.54661261504412e-06, + "loss": 0.9741, + "step": 7988 + }, + { + "epoch": 1.3672770836898853, + "grad_norm": 17.877666473388672, + "learning_rate": 7.536616115015246e-06, + "loss": 1.5805, + "step": 7989 + }, + { + "epoch": 1.3674482286496663, + "grad_norm": 14.81043529510498, + "learning_rate": 7.526624018393975e-06, + "loss": 1.2154, + "step": 7990 + }, + { + "epoch": 1.3676193736094473, + "grad_norm": 6.165063858032227, + "learning_rate": 7.5166363310756705e-06, + "loss": 0.7289, + "step": 7991 + }, + { + "epoch": 1.367790518569228, + "grad_norm": 18.871471405029297, + "learning_rate": 7.506653058953077e-06, + "loss": 1.4732, + "step": 7992 + }, + { + "epoch": 1.367961663529009, + "grad_norm": 16.091135025024414, + "learning_rate": 7.496674207916326e-06, + "loss": 1.8831, + "step": 7993 + }, + { + "epoch": 1.36813280848879, + "grad_norm": 6.082664966583252, + "learning_rate": 7.486699783852983e-06, + "loss": 0.3378, + "step": 7994 + }, + { + "epoch": 1.3683039534485708, + "grad_norm": 15.225743293762207, + "learning_rate": 7.476729792647949e-06, + "loss": 1.3947, + "step": 7995 + }, + { + "epoch": 1.3684750984083518, + "grad_norm": 12.026875495910645, + "learning_rate": 7.466764240183551e-06, + "loss": 0.9757, + "step": 7996 + }, + { + "epoch": 1.3686462433681328, + "grad_norm": 0.3766089081764221, + "learning_rate": 7.456803132339472e-06, + "loss": 0.1186, + "step": 7997 + }, + { + "epoch": 1.3688173883279138, + "grad_norm": 15.264883041381836, + "learning_rate": 7.446846474992774e-06, + "loss": 1.2569, + "step": 7998 + }, + { + "epoch": 1.3689885332876948, + "grad_norm": 10.888883590698242, + "learning_rate": 7.4368942740179114e-06, + "loss": 0.8297, + "step": 7999 + }, + { + "epoch": 1.3691596782474755, + "grad_norm": 6.847597122192383, + "learning_rate": 7.426946535286687e-06, + "loss": 0.4545, + "step": 8000 + }, + { + "epoch": 1.3693308232072565, + "grad_norm": 10.688901901245117, + "learning_rate": 7.4170032646682915e-06, + "loss": 0.9802, + "step": 8001 + }, + { + "epoch": 1.3695019681670375, + "grad_norm": 16.83812141418457, + "learning_rate": 7.407064468029259e-06, + "loss": 1.4476, + "step": 8002 + }, + { + "epoch": 1.3696731131268185, + "grad_norm": 18.952674865722656, + "learning_rate": 7.3971301512335055e-06, + "loss": 1.4857, + "step": 8003 + }, + { + "epoch": 1.3698442580865993, + "grad_norm": 1.0890339612960815, + "learning_rate": 7.387200320142282e-06, + "loss": 0.209, + "step": 8004 + }, + { + "epoch": 1.3700154030463803, + "grad_norm": 14.084479331970215, + "learning_rate": 7.3772749806142056e-06, + "loss": 1.0087, + "step": 8005 + }, + { + "epoch": 1.3701865480061612, + "grad_norm": 10.847527503967285, + "learning_rate": 7.367354138505252e-06, + "loss": 0.7337, + "step": 8006 + }, + { + "epoch": 1.3703576929659422, + "grad_norm": 13.33262825012207, + "learning_rate": 7.35743779966872e-06, + "loss": 0.9783, + "step": 8007 + }, + { + "epoch": 1.3705288379257232, + "grad_norm": 15.647591590881348, + "learning_rate": 7.347525969955275e-06, + "loss": 1.6981, + "step": 8008 + }, + { 
+ "epoch": 1.370699982885504, + "grad_norm": 17.50412940979004, + "learning_rate": 7.337618655212906e-06, + "loss": 1.4768, + "step": 8009 + }, + { + "epoch": 1.370871127845285, + "grad_norm": 14.713836669921875, + "learning_rate": 7.327715861286931e-06, + "loss": 1.4001, + "step": 8010 + }, + { + "epoch": 1.371042272805066, + "grad_norm": 18.337644577026367, + "learning_rate": 7.317817594020038e-06, + "loss": 1.6328, + "step": 8011 + }, + { + "epoch": 1.3712134177648467, + "grad_norm": 12.138872146606445, + "learning_rate": 7.307923859252206e-06, + "loss": 1.1789, + "step": 8012 + }, + { + "epoch": 1.3713845627246277, + "grad_norm": 15.563578605651855, + "learning_rate": 7.298034662820748e-06, + "loss": 1.4695, + "step": 8013 + }, + { + "epoch": 1.3715557076844087, + "grad_norm": 17.70362091064453, + "learning_rate": 7.288150010560317e-06, + "loss": 1.4956, + "step": 8014 + }, + { + "epoch": 1.3717268526441897, + "grad_norm": 4.129745006561279, + "learning_rate": 7.278269908302854e-06, + "loss": 0.3229, + "step": 8015 + }, + { + "epoch": 1.3718979976039707, + "grad_norm": 0.6604208946228027, + "learning_rate": 7.268394361877659e-06, + "loss": 0.1311, + "step": 8016 + }, + { + "epoch": 1.3720691425637515, + "grad_norm": 5.454418659210205, + "learning_rate": 7.2585233771113065e-06, + "loss": 0.3518, + "step": 8017 + }, + { + "epoch": 1.3722402875235324, + "grad_norm": 17.043312072753906, + "learning_rate": 7.248656959827685e-06, + "loss": 1.6863, + "step": 8018 + }, + { + "epoch": 1.3724114324833134, + "grad_norm": 21.178543090820312, + "learning_rate": 7.23879511584801e-06, + "loss": 2.4076, + "step": 8019 + }, + { + "epoch": 1.3725825774430942, + "grad_norm": 15.764005661010742, + "learning_rate": 7.22893785099077e-06, + "loss": 1.4081, + "step": 8020 + }, + { + "epoch": 1.3727537224028752, + "grad_norm": 22.064271926879883, + "learning_rate": 7.219085171071771e-06, + "loss": 0.9252, + "step": 8021 + }, + { + "epoch": 1.3729248673626562, + "grad_norm": 15.479656219482422, + "learning_rate": 7.209237081904119e-06, + "loss": 1.0212, + "step": 8022 + }, + { + "epoch": 1.3730960123224372, + "grad_norm": 25.82483673095703, + "learning_rate": 7.199393589298185e-06, + "loss": 5.4379, + "step": 8023 + }, + { + "epoch": 1.3732671572822182, + "grad_norm": 59.71751403808594, + "learning_rate": 7.189554699061658e-06, + "loss": 7.4834, + "step": 8024 + }, + { + "epoch": 1.373438302241999, + "grad_norm": 15.977654457092285, + "learning_rate": 7.179720416999488e-06, + "loss": 1.209, + "step": 8025 + }, + { + "epoch": 1.37360944720178, + "grad_norm": 16.143056869506836, + "learning_rate": 7.169890748913924e-06, + "loss": 1.948, + "step": 8026 + }, + { + "epoch": 1.373780592161561, + "grad_norm": 27.495500564575195, + "learning_rate": 7.160065700604475e-06, + "loss": 5.4623, + "step": 8027 + }, + { + "epoch": 1.3739517371213417, + "grad_norm": 6.934070110321045, + "learning_rate": 7.150245277867945e-06, + "loss": 0.5984, + "step": 8028 + }, + { + "epoch": 1.3741228820811227, + "grad_norm": 22.05686378479004, + "learning_rate": 7.140429486498394e-06, + "loss": 5.0537, + "step": 8029 + }, + { + "epoch": 1.3742940270409036, + "grad_norm": 19.047924041748047, + "learning_rate": 7.130618332287147e-06, + "loss": 1.7073, + "step": 8030 + }, + { + "epoch": 1.3744651720006846, + "grad_norm": 9.164684295654297, + "learning_rate": 7.120811821022805e-06, + "loss": 1.0877, + "step": 8031 + }, + { + "epoch": 1.3746363169604656, + "grad_norm": 19.64408302307129, + "learning_rate": 7.11100995849123e-06, + "loss": 2.3523, 
+ "step": 8032 + }, + { + "epoch": 1.3748074619202464, + "grad_norm": 24.614337921142578, + "learning_rate": 7.101212750475524e-06, + "loss": 5.1241, + "step": 8033 + }, + { + "epoch": 1.3749786068800274, + "grad_norm": 14.775176048278809, + "learning_rate": 7.091420202756066e-06, + "loss": 1.2222, + "step": 8034 + }, + { + "epoch": 1.3751497518398084, + "grad_norm": 14.342777252197266, + "learning_rate": 7.0816323211104615e-06, + "loss": 1.2048, + "step": 8035 + }, + { + "epoch": 1.3753208967995891, + "grad_norm": 0.43484142422676086, + "learning_rate": 7.0718491113135815e-06, + "loss": 0.1246, + "step": 8036 + }, + { + "epoch": 1.3754920417593701, + "grad_norm": 9.955939292907715, + "learning_rate": 7.062070579137541e-06, + "loss": 1.0372, + "step": 8037 + }, + { + "epoch": 1.375663186719151, + "grad_norm": 18.3414306640625, + "learning_rate": 7.052296730351676e-06, + "loss": 1.8833, + "step": 8038 + }, + { + "epoch": 1.375834331678932, + "grad_norm": 13.475018501281738, + "learning_rate": 7.042527570722584e-06, + "loss": 1.1563, + "step": 8039 + }, + { + "epoch": 1.376005476638713, + "grad_norm": 29.00771713256836, + "learning_rate": 7.0327631060140705e-06, + "loss": 5.4663, + "step": 8040 + }, + { + "epoch": 1.3761766215984939, + "grad_norm": 15.355203628540039, + "learning_rate": 7.023003341987198e-06, + "loss": 1.4888, + "step": 8041 + }, + { + "epoch": 1.3763477665582748, + "grad_norm": 19.718971252441406, + "learning_rate": 7.01324828440023e-06, + "loss": 1.6287, + "step": 8042 + }, + { + "epoch": 1.3765189115180558, + "grad_norm": 12.399322509765625, + "learning_rate": 7.0034979390086755e-06, + "loss": 1.0184, + "step": 8043 + }, + { + "epoch": 1.3766900564778366, + "grad_norm": 79.78743743896484, + "learning_rate": 6.9937523115652464e-06, + "loss": 7.7877, + "step": 8044 + }, + { + "epoch": 1.3768612014376176, + "grad_norm": 8.104473114013672, + "learning_rate": 6.9840114078198745e-06, + "loss": 0.9961, + "step": 8045 + }, + { + "epoch": 1.3770323463973986, + "grad_norm": 1.3494822978973389, + "learning_rate": 6.974275233519717e-06, + "loss": 0.2309, + "step": 8046 + }, + { + "epoch": 1.3772034913571796, + "grad_norm": 21.887319564819336, + "learning_rate": 6.964543794409114e-06, + "loss": 2.6566, + "step": 8047 + }, + { + "epoch": 1.3773746363169606, + "grad_norm": 20.275388717651367, + "learning_rate": 6.954817096229651e-06, + "loss": 1.7164, + "step": 8048 + }, + { + "epoch": 1.3775457812767413, + "grad_norm": 0.4384884834289551, + "learning_rate": 6.9450951447200855e-06, + "loss": 0.1223, + "step": 8049 + }, + { + "epoch": 1.3777169262365223, + "grad_norm": 19.98276138305664, + "learning_rate": 6.935377945616375e-06, + "loss": 1.4869, + "step": 8050 + }, + { + "epoch": 1.3778880711963033, + "grad_norm": 16.372770309448242, + "learning_rate": 6.925665504651695e-06, + "loss": 1.4808, + "step": 8051 + }, + { + "epoch": 1.378059216156084, + "grad_norm": 18.803668975830078, + "learning_rate": 6.915957827556389e-06, + "loss": 2.2224, + "step": 8052 + }, + { + "epoch": 1.378230361115865, + "grad_norm": 7.975682735443115, + "learning_rate": 6.906254920058005e-06, + "loss": 0.8613, + "step": 8053 + }, + { + "epoch": 1.378401506075646, + "grad_norm": 18.069852828979492, + "learning_rate": 6.896556787881279e-06, + "loss": 1.431, + "step": 8054 + }, + { + "epoch": 1.378572651035427, + "grad_norm": 3.733964443206787, + "learning_rate": 6.8868634367481105e-06, + "loss": 0.2958, + "step": 8055 + }, + { + "epoch": 1.378743795995208, + "grad_norm": 4.760802268981934, + "learning_rate": 
6.877174872377608e-06, + "loss": 0.3132, + "step": 8056 + }, + { + "epoch": 1.378914940954989, + "grad_norm": 19.099594116210938, + "learning_rate": 6.867491100486021e-06, + "loss": 1.4491, + "step": 8057 + }, + { + "epoch": 1.3790860859147698, + "grad_norm": 3.7092506885528564, + "learning_rate": 6.857812126786798e-06, + "loss": 0.4602, + "step": 8058 + }, + { + "epoch": 1.3792572308745508, + "grad_norm": 15.550238609313965, + "learning_rate": 6.848137956990553e-06, + "loss": 1.0471, + "step": 8059 + }, + { + "epoch": 1.3794283758343318, + "grad_norm": 16.970060348510742, + "learning_rate": 6.8384685968050504e-06, + "loss": 2.0567, + "step": 8060 + }, + { + "epoch": 1.3795995207941125, + "grad_norm": 57.70185852050781, + "learning_rate": 6.828804051935237e-06, + "loss": 8.2924, + "step": 8061 + }, + { + "epoch": 1.3797706657538935, + "grad_norm": 17.050525665283203, + "learning_rate": 6.8191443280831985e-06, + "loss": 1.5775, + "step": 8062 + }, + { + "epoch": 1.3799418107136745, + "grad_norm": 0.41770151257514954, + "learning_rate": 6.809489430948192e-06, + "loss": 0.1234, + "step": 8063 + }, + { + "epoch": 1.3801129556734555, + "grad_norm": 3.295809268951416, + "learning_rate": 6.799839366226626e-06, + "loss": 0.2752, + "step": 8064 + }, + { + "epoch": 1.3802841006332365, + "grad_norm": 6.964579105377197, + "learning_rate": 6.790194139612041e-06, + "loss": 0.3692, + "step": 8065 + }, + { + "epoch": 1.3804552455930172, + "grad_norm": 4.134066104888916, + "learning_rate": 6.780553756795148e-06, + "loss": 0.3073, + "step": 8066 + }, + { + "epoch": 1.3806263905527982, + "grad_norm": 12.902695655822754, + "learning_rate": 6.770918223463776e-06, + "loss": 1.4663, + "step": 8067 + }, + { + "epoch": 1.3807975355125792, + "grad_norm": 20.563579559326172, + "learning_rate": 6.761287545302915e-06, + "loss": 2.075, + "step": 8068 + }, + { + "epoch": 1.38096868047236, + "grad_norm": 17.05385398864746, + "learning_rate": 6.751661727994672e-06, + "loss": 2.182, + "step": 8069 + }, + { + "epoch": 1.381139825432141, + "grad_norm": 15.212250709533691, + "learning_rate": 6.7420407772182906e-06, + "loss": 0.9807, + "step": 8070 + }, + { + "epoch": 1.381310970391922, + "grad_norm": 17.51760482788086, + "learning_rate": 6.732424698650156e-06, + "loss": 2.3006, + "step": 8071 + }, + { + "epoch": 1.381482115351703, + "grad_norm": 12.286190032958984, + "learning_rate": 6.722813497963758e-06, + "loss": 0.8639, + "step": 8072 + }, + { + "epoch": 1.381653260311484, + "grad_norm": 6.781225681304932, + "learning_rate": 6.713207180829729e-06, + "loss": 0.5811, + "step": 8073 + }, + { + "epoch": 1.3818244052712647, + "grad_norm": 8.380965232849121, + "learning_rate": 6.703605752915802e-06, + "loss": 0.9242, + "step": 8074 + }, + { + "epoch": 1.3819955502310457, + "grad_norm": 18.39959716796875, + "learning_rate": 6.694009219886838e-06, + "loss": 2.1914, + "step": 8075 + }, + { + "epoch": 1.3821666951908267, + "grad_norm": 17.51675796508789, + "learning_rate": 6.68441758740481e-06, + "loss": 1.4097, + "step": 8076 + }, + { + "epoch": 1.3823378401506075, + "grad_norm": 5.3710198402404785, + "learning_rate": 6.6748308611287855e-06, + "loss": 0.3662, + "step": 8077 + }, + { + "epoch": 1.3825089851103884, + "grad_norm": 10.967370986938477, + "learning_rate": 6.66524904671496e-06, + "loss": 0.7261, + "step": 8078 + }, + { + "epoch": 1.3826801300701694, + "grad_norm": 15.520840644836426, + "learning_rate": 6.655672149816605e-06, + "loss": 1.0765, + "step": 8079 + }, + { + "epoch": 1.3828512750299504, + "grad_norm": 
62.75824737548828, + "learning_rate": 6.646100176084111e-06, + "loss": 7.4534, + "step": 8080 + }, + { + "epoch": 1.3830224199897314, + "grad_norm": 5.519872188568115, + "learning_rate": 6.6365331311649604e-06, + "loss": 0.4165, + "step": 8081 + }, + { + "epoch": 1.3831935649495122, + "grad_norm": 5.536046981811523, + "learning_rate": 6.626971020703714e-06, + "loss": 0.3804, + "step": 8082 + }, + { + "epoch": 1.3833647099092932, + "grad_norm": 15.942901611328125, + "learning_rate": 6.617413850342042e-06, + "loss": 1.2424, + "step": 8083 + }, + { + "epoch": 1.3835358548690742, + "grad_norm": 14.011425971984863, + "learning_rate": 6.607861625718684e-06, + "loss": 1.0696, + "step": 8084 + }, + { + "epoch": 1.383706999828855, + "grad_norm": 3.4530327320098877, + "learning_rate": 6.598314352469461e-06, + "loss": 0.2756, + "step": 8085 + }, + { + "epoch": 1.383878144788636, + "grad_norm": 9.03433895111084, + "learning_rate": 6.58877203622729e-06, + "loss": 0.6991, + "step": 8086 + }, + { + "epoch": 1.384049289748417, + "grad_norm": 7.679855823516846, + "learning_rate": 6.579234682622139e-06, + "loss": 0.8322, + "step": 8087 + }, + { + "epoch": 1.3842204347081979, + "grad_norm": 0.48729658126831055, + "learning_rate": 6.5697022972810745e-06, + "loss": 0.1323, + "step": 8088 + }, + { + "epoch": 1.3843915796679789, + "grad_norm": 8.509507179260254, + "learning_rate": 6.5601748858282065e-06, + "loss": 0.5483, + "step": 8089 + }, + { + "epoch": 1.3845627246277596, + "grad_norm": 10.94564151763916, + "learning_rate": 6.550652453884724e-06, + "loss": 0.8701, + "step": 8090 + }, + { + "epoch": 1.3847338695875406, + "grad_norm": 27.77153968811035, + "learning_rate": 6.541135007068887e-06, + "loss": 5.4978, + "step": 8091 + }, + { + "epoch": 1.3849050145473216, + "grad_norm": 26.718948364257812, + "learning_rate": 6.531622550995986e-06, + "loss": 1.0582, + "step": 8092 + }, + { + "epoch": 1.3850761595071024, + "grad_norm": 5.982920169830322, + "learning_rate": 6.522115091278402e-06, + "loss": 0.6942, + "step": 8093 + }, + { + "epoch": 1.3852473044668834, + "grad_norm": 10.091400146484375, + "learning_rate": 6.512612633525535e-06, + "loss": 0.7003, + "step": 8094 + }, + { + "epoch": 1.3854184494266644, + "grad_norm": 19.210908889770508, + "learning_rate": 6.503115183343857e-06, + "loss": 1.8411, + "step": 8095 + }, + { + "epoch": 1.3855895943864454, + "grad_norm": 0.4305846691131592, + "learning_rate": 6.4936227463368795e-06, + "loss": 0.1227, + "step": 8096 + }, + { + "epoch": 1.3857607393462263, + "grad_norm": 0.4560820460319519, + "learning_rate": 6.484135328105148e-06, + "loss": 0.1265, + "step": 8097 + }, + { + "epoch": 1.3859318843060071, + "grad_norm": 0.42901623249053955, + "learning_rate": 6.474652934246262e-06, + "loss": 0.1273, + "step": 8098 + }, + { + "epoch": 1.386103029265788, + "grad_norm": 10.552484512329102, + "learning_rate": 6.465175570354837e-06, + "loss": 0.8805, + "step": 8099 + }, + { + "epoch": 1.386274174225569, + "grad_norm": 9.38833236694336, + "learning_rate": 6.455703242022543e-06, + "loss": 0.7382, + "step": 8100 + }, + { + "epoch": 1.3864453191853499, + "grad_norm": 5.672971725463867, + "learning_rate": 6.446235954838058e-06, + "loss": 0.3442, + "step": 8101 + }, + { + "epoch": 1.3866164641451308, + "grad_norm": 10.641942024230957, + "learning_rate": 6.4367737143871e-06, + "loss": 0.9234, + "step": 8102 + }, + { + "epoch": 1.3867876091049118, + "grad_norm": 9.72297191619873, + "learning_rate": 6.42731652625241e-06, + "loss": 0.9145, + "step": 8103 + }, + { + "epoch": 
1.3869587540646928, + "grad_norm": 2.3832085132598877, + "learning_rate": 6.417864396013735e-06, + "loss": 0.2521, + "step": 8104 + }, + { + "epoch": 1.3871298990244738, + "grad_norm": 16.631643295288086, + "learning_rate": 6.408417329247851e-06, + "loss": 1.4814, + "step": 8105 + }, + { + "epoch": 1.3873010439842548, + "grad_norm": 15.015097618103027, + "learning_rate": 6.398975331528536e-06, + "loss": 1.2678, + "step": 8106 + }, + { + "epoch": 1.3874721889440356, + "grad_norm": 16.07328224182129, + "learning_rate": 6.389538408426587e-06, + "loss": 1.0467, + "step": 8107 + }, + { + "epoch": 1.3876433339038166, + "grad_norm": 0.4330154061317444, + "learning_rate": 6.380106565509806e-06, + "loss": 0.1198, + "step": 8108 + }, + { + "epoch": 1.3878144788635975, + "grad_norm": 7.085061550140381, + "learning_rate": 6.370679808342991e-06, + "loss": 0.8164, + "step": 8109 + }, + { + "epoch": 1.3879856238233783, + "grad_norm": 18.764047622680664, + "learning_rate": 6.361258142487936e-06, + "loss": 2.5517, + "step": 8110 + }, + { + "epoch": 1.3881567687831593, + "grad_norm": 13.04540729522705, + "learning_rate": 6.35184157350345e-06, + "loss": 1.1178, + "step": 8111 + }, + { + "epoch": 1.3883279137429403, + "grad_norm": 7.839341163635254, + "learning_rate": 6.342430106945312e-06, + "loss": 0.9889, + "step": 8112 + }, + { + "epoch": 1.3884990587027213, + "grad_norm": 0.3881106972694397, + "learning_rate": 6.333023748366311e-06, + "loss": 0.1214, + "step": 8113 + }, + { + "epoch": 1.3886702036625023, + "grad_norm": 6.755177974700928, + "learning_rate": 6.323622503316201e-06, + "loss": 0.352, + "step": 8114 + }, + { + "epoch": 1.388841348622283, + "grad_norm": 14.873271942138672, + "learning_rate": 6.314226377341743e-06, + "loss": 1.1683, + "step": 8115 + }, + { + "epoch": 1.389012493582064, + "grad_norm": 15.769723892211914, + "learning_rate": 6.304835375986661e-06, + "loss": 1.0529, + "step": 8116 + }, + { + "epoch": 1.389183638541845, + "grad_norm": 94.52820587158203, + "learning_rate": 6.2954495047916445e-06, + "loss": 9.2915, + "step": 8117 + }, + { + "epoch": 1.3893547835016258, + "grad_norm": 19.46604347229004, + "learning_rate": 6.286068769294398e-06, + "loss": 2.3134, + "step": 8118 + }, + { + "epoch": 1.3895259284614068, + "grad_norm": 27.398771286010742, + "learning_rate": 6.276693175029553e-06, + "loss": 5.3409, + "step": 8119 + }, + { + "epoch": 1.3896970734211878, + "grad_norm": 16.219106674194336, + "learning_rate": 6.267322727528731e-06, + "loss": 1.6904, + "step": 8120 + }, + { + "epoch": 1.3898682183809687, + "grad_norm": 12.038756370544434, + "learning_rate": 6.257957432320506e-06, + "loss": 0.8749, + "step": 8121 + }, + { + "epoch": 1.3900393633407497, + "grad_norm": 12.382250785827637, + "learning_rate": 6.248597294930407e-06, + "loss": 1.0359, + "step": 8122 + }, + { + "epoch": 1.3902105083005305, + "grad_norm": 13.015780448913574, + "learning_rate": 6.23924232088095e-06, + "loss": 1.0681, + "step": 8123 + }, + { + "epoch": 1.3903816532603115, + "grad_norm": 0.49243029952049255, + "learning_rate": 6.229892515691566e-06, + "loss": 0.1276, + "step": 8124 + }, + { + "epoch": 1.3905527982200925, + "grad_norm": 20.274410247802734, + "learning_rate": 6.220547884878667e-06, + "loss": 2.0958, + "step": 8125 + }, + { + "epoch": 1.3907239431798732, + "grad_norm": 7.040147304534912, + "learning_rate": 6.211208433955592e-06, + "loss": 0.4615, + "step": 8126 + }, + { + "epoch": 1.3908950881396542, + "grad_norm": 8.364036560058594, + "learning_rate": 6.201874168432627e-06, + "loss": 0.6492, + 
"step": 8127 + }, + { + "epoch": 1.3910662330994352, + "grad_norm": 17.132287979125977, + "learning_rate": 6.192545093817011e-06, + "loss": 1.2063, + "step": 8128 + }, + { + "epoch": 1.3912373780592162, + "grad_norm": 16.411943435668945, + "learning_rate": 6.1832212156129045e-06, + "loss": 1.2658, + "step": 8129 + }, + { + "epoch": 1.3914085230189972, + "grad_norm": 5.050886631011963, + "learning_rate": 6.173902539321417e-06, + "loss": 0.314, + "step": 8130 + }, + { + "epoch": 1.391579667978778, + "grad_norm": 14.13430404663086, + "learning_rate": 6.164589070440572e-06, + "loss": 0.8482, + "step": 8131 + }, + { + "epoch": 1.391750812938559, + "grad_norm": 22.28693962097168, + "learning_rate": 6.155280814465341e-06, + "loss": 1.8001, + "step": 8132 + }, + { + "epoch": 1.39192195789834, + "grad_norm": 16.029760360717773, + "learning_rate": 6.145977776887599e-06, + "loss": 1.2511, + "step": 8133 + }, + { + "epoch": 1.3920931028581207, + "grad_norm": 12.385472297668457, + "learning_rate": 6.136679963196155e-06, + "loss": 0.9778, + "step": 8134 + }, + { + "epoch": 1.3922642478179017, + "grad_norm": 17.029630661010742, + "learning_rate": 6.127387378876741e-06, + "loss": 1.3992, + "step": 8135 + }, + { + "epoch": 1.3924353927776827, + "grad_norm": 122.83370208740234, + "learning_rate": 6.118100029411982e-06, + "loss": 10.1844, + "step": 8136 + }, + { + "epoch": 1.3926065377374637, + "grad_norm": 25.112655639648438, + "learning_rate": 6.108817920281441e-06, + "loss": 3.64, + "step": 8137 + }, + { + "epoch": 1.3927776826972447, + "grad_norm": 14.767163276672363, + "learning_rate": 6.099541056961565e-06, + "loss": 1.1958, + "step": 8138 + }, + { + "epoch": 1.3929488276570254, + "grad_norm": 11.507041931152344, + "learning_rate": 6.090269444925722e-06, + "loss": 1.0309, + "step": 8139 + }, + { + "epoch": 1.3931199726168064, + "grad_norm": 14.378401756286621, + "learning_rate": 6.081003089644182e-06, + "loss": 1.3041, + "step": 8140 + }, + { + "epoch": 1.3932911175765874, + "grad_norm": 9.112606048583984, + "learning_rate": 6.071741996584104e-06, + "loss": 0.9412, + "step": 8141 + }, + { + "epoch": 1.3934622625363682, + "grad_norm": 2.832979440689087, + "learning_rate": 6.062486171209541e-06, + "loss": 0.2715, + "step": 8142 + }, + { + "epoch": 1.3936334074961492, + "grad_norm": 12.30532455444336, + "learning_rate": 6.053235618981454e-06, + "loss": 1.1225, + "step": 8143 + }, + { + "epoch": 1.3938045524559302, + "grad_norm": 13.39280891418457, + "learning_rate": 6.0439903453576665e-06, + "loss": 1.1622, + "step": 8144 + }, + { + "epoch": 1.3939756974157111, + "grad_norm": 18.136594772338867, + "learning_rate": 6.034750355792927e-06, + "loss": 1.5756, + "step": 8145 + }, + { + "epoch": 1.3941468423754921, + "grad_norm": 14.948494911193848, + "learning_rate": 6.0255156557388295e-06, + "loss": 1.3044, + "step": 8146 + }, + { + "epoch": 1.394317987335273, + "grad_norm": 14.837949752807617, + "learning_rate": 6.016286250643859e-06, + "loss": 1.2203, + "step": 8147 + }, + { + "epoch": 1.394489132295054, + "grad_norm": 0.6035500168800354, + "learning_rate": 6.0070621459533846e-06, + "loss": 0.1311, + "step": 8148 + }, + { + "epoch": 1.3946602772548349, + "grad_norm": 12.36839771270752, + "learning_rate": 5.997843347109634e-06, + "loss": 0.9687, + "step": 8149 + }, + { + "epoch": 1.3948314222146156, + "grad_norm": 21.35314178466797, + "learning_rate": 5.988629859551719e-06, + "loss": 1.5717, + "step": 8150 + }, + { + "epoch": 1.3950025671743966, + "grad_norm": 15.543251991271973, + "learning_rate": 
5.979421688715612e-06, + "loss": 1.3761, + "step": 8151 + }, + { + "epoch": 1.3951737121341776, + "grad_norm": 13.013747215270996, + "learning_rate": 5.9702188400341394e-06, + "loss": 1.0351, + "step": 8152 + }, + { + "epoch": 1.3953448570939586, + "grad_norm": 14.551894187927246, + "learning_rate": 5.9610213189370054e-06, + "loss": 1.0881, + "step": 8153 + }, + { + "epoch": 1.3955160020537396, + "grad_norm": 19.03411102294922, + "learning_rate": 5.951829130850753e-06, + "loss": 1.3978, + "step": 8154 + }, + { + "epoch": 1.3956871470135206, + "grad_norm": 18.278907775878906, + "learning_rate": 5.9426422811987944e-06, + "loss": 2.1687, + "step": 8155 + }, + { + "epoch": 1.3958582919733014, + "grad_norm": 0.7327485680580139, + "learning_rate": 5.933460775401376e-06, + "loss": 0.1343, + "step": 8156 + }, + { + "epoch": 1.3960294369330823, + "grad_norm": 8.675280570983887, + "learning_rate": 5.9242846188756085e-06, + "loss": 0.6823, + "step": 8157 + }, + { + "epoch": 1.3962005818928633, + "grad_norm": 14.639605522155762, + "learning_rate": 5.915113817035433e-06, + "loss": 1.0151, + "step": 8158 + }, + { + "epoch": 1.396371726852644, + "grad_norm": 5.388650894165039, + "learning_rate": 5.905948375291635e-06, + "loss": 0.3492, + "step": 8159 + }, + { + "epoch": 1.396542871812425, + "grad_norm": 13.80362606048584, + "learning_rate": 5.896788299051837e-06, + "loss": 1.0125, + "step": 8160 + }, + { + "epoch": 1.396714016772206, + "grad_norm": 14.141159057617188, + "learning_rate": 5.887633593720509e-06, + "loss": 1.1325, + "step": 8161 + }, + { + "epoch": 1.396885161731987, + "grad_norm": 2.77217698097229, + "learning_rate": 5.878484264698927e-06, + "loss": 0.2681, + "step": 8162 + }, + { + "epoch": 1.397056306691768, + "grad_norm": 13.108981132507324, + "learning_rate": 5.869340317385221e-06, + "loss": 1.1958, + "step": 8163 + }, + { + "epoch": 1.3972274516515488, + "grad_norm": 17.22273826599121, + "learning_rate": 5.860201757174322e-06, + "loss": 1.1516, + "step": 8164 + }, + { + "epoch": 1.3973985966113298, + "grad_norm": 15.933062553405762, + "learning_rate": 5.851068589458e-06, + "loss": 1.5206, + "step": 8165 + }, + { + "epoch": 1.3975697415711108, + "grad_norm": 11.56320571899414, + "learning_rate": 5.841940819624841e-06, + "loss": 1.075, + "step": 8166 + }, + { + "epoch": 1.3977408865308916, + "grad_norm": 14.757344245910645, + "learning_rate": 5.832818453060236e-06, + "loss": 1.2473, + "step": 8167 + }, + { + "epoch": 1.3979120314906726, + "grad_norm": 14.71597671508789, + "learning_rate": 5.823701495146401e-06, + "loss": 1.3768, + "step": 8168 + }, + { + "epoch": 1.3980831764504535, + "grad_norm": 14.82431411743164, + "learning_rate": 5.814589951262346e-06, + "loss": 0.9769, + "step": 8169 + }, + { + "epoch": 1.3982543214102345, + "grad_norm": 24.57257652282715, + "learning_rate": 5.805483826783909e-06, + "loss": 2.1528, + "step": 8170 + }, + { + "epoch": 1.3984254663700155, + "grad_norm": 19.09847640991211, + "learning_rate": 5.796383127083702e-06, + "loss": 1.5526, + "step": 8171 + }, + { + "epoch": 1.3985966113297963, + "grad_norm": 0.5062642693519592, + "learning_rate": 5.787287857531164e-06, + "loss": 0.1277, + "step": 8172 + }, + { + "epoch": 1.3987677562895773, + "grad_norm": 14.45943832397461, + "learning_rate": 5.778198023492512e-06, + "loss": 1.0815, + "step": 8173 + }, + { + "epoch": 1.3989389012493583, + "grad_norm": 9.705041885375977, + "learning_rate": 5.769113630330755e-06, + "loss": 1.0699, + "step": 8174 + }, + { + "epoch": 1.399110046209139, + "grad_norm": 
13.191262245178223, + "learning_rate": 5.760034683405712e-06, + "loss": 0.9696, + "step": 8175 + }, + { + "epoch": 1.39928119116892, + "grad_norm": 9.288146018981934, + "learning_rate": 5.750961188073959e-06, + "loss": 0.9779, + "step": 8176 + }, + { + "epoch": 1.399452336128701, + "grad_norm": 16.641170501708984, + "learning_rate": 5.74189314968889e-06, + "loss": 1.7428, + "step": 8177 + }, + { + "epoch": 1.399623481088482, + "grad_norm": 11.01945686340332, + "learning_rate": 5.732830573600652e-06, + "loss": 1.127, + "step": 8178 + }, + { + "epoch": 1.399794626048263, + "grad_norm": 17.64215660095215, + "learning_rate": 5.723773465156174e-06, + "loss": 1.4615, + "step": 8179 + }, + { + "epoch": 1.3999657710080438, + "grad_norm": 19.89571189880371, + "learning_rate": 5.714721829699173e-06, + "loss": 1.4644, + "step": 8180 + }, + { + "epoch": 1.4001369159678247, + "grad_norm": 24.79058265686035, + "learning_rate": 5.705675672570117e-06, + "loss": 5.4109, + "step": 8181 + }, + { + "epoch": 1.4003080609276057, + "grad_norm": 19.724206924438477, + "learning_rate": 5.696634999106258e-06, + "loss": 1.9591, + "step": 8182 + }, + { + "epoch": 1.4004792058873865, + "grad_norm": 16.37856101989746, + "learning_rate": 5.687599814641612e-06, + "loss": 1.5471, + "step": 8183 + }, + { + "epoch": 1.4006503508471675, + "grad_norm": 9.3511381149292, + "learning_rate": 5.6785701245069405e-06, + "loss": 1.1499, + "step": 8184 + }, + { + "epoch": 1.4008214958069485, + "grad_norm": 3.557250499725342, + "learning_rate": 5.669545934029785e-06, + "loss": 0.265, + "step": 8185 + }, + { + "epoch": 1.4009926407667295, + "grad_norm": 16.56192398071289, + "learning_rate": 5.66052724853442e-06, + "loss": 1.3035, + "step": 8186 + }, + { + "epoch": 1.4011637857265105, + "grad_norm": 16.86762237548828, + "learning_rate": 5.651514073341889e-06, + "loss": 1.2929, + "step": 8187 + }, + { + "epoch": 1.4013349306862912, + "grad_norm": 5.57222843170166, + "learning_rate": 5.642506413769985e-06, + "loss": 0.3981, + "step": 8188 + }, + { + "epoch": 1.4015060756460722, + "grad_norm": 12.076986312866211, + "learning_rate": 5.633504275133228e-06, + "loss": 0.855, + "step": 8189 + }, + { + "epoch": 1.4016772206058532, + "grad_norm": 9.291732788085938, + "learning_rate": 5.624507662742907e-06, + "loss": 0.9672, + "step": 8190 + }, + { + "epoch": 1.401848365565634, + "grad_norm": 18.100425720214844, + "learning_rate": 5.615516581907022e-06, + "loss": 2.1209, + "step": 8191 + }, + { + "epoch": 1.402019510525415, + "grad_norm": 20.10270118713379, + "learning_rate": 5.606531037930333e-06, + "loss": 2.322, + "step": 8192 + }, + { + "epoch": 1.402190655485196, + "grad_norm": 12.470746040344238, + "learning_rate": 5.597551036114328e-06, + "loss": 0.8872, + "step": 8193 + }, + { + "epoch": 1.402361800444977, + "grad_norm": 2.3300442695617676, + "learning_rate": 5.58857658175721e-06, + "loss": 0.2696, + "step": 8194 + }, + { + "epoch": 1.402532945404758, + "grad_norm": 0.4978567361831665, + "learning_rate": 5.579607680153932e-06, + "loss": 0.1298, + "step": 8195 + }, + { + "epoch": 1.4027040903645387, + "grad_norm": 17.7264461517334, + "learning_rate": 5.57064433659615e-06, + "loss": 2.0535, + "step": 8196 + }, + { + "epoch": 1.4028752353243197, + "grad_norm": 18.33466911315918, + "learning_rate": 5.561686556372258e-06, + "loss": 1.4185, + "step": 8197 + }, + { + "epoch": 1.4030463802841007, + "grad_norm": 12.567920684814453, + "learning_rate": 5.552734344767356e-06, + "loss": 0.9471, + "step": 8198 + }, + { + "epoch": 1.4032175252438814, + 
"grad_norm": 17.757282257080078, + "learning_rate": 5.543787707063256e-06, + "loss": 1.9904, + "step": 8199 + }, + { + "epoch": 1.4033886702036624, + "grad_norm": 15.925894737243652, + "learning_rate": 5.534846648538499e-06, + "loss": 1.0907, + "step": 8200 + }, + { + "epoch": 1.4035598151634434, + "grad_norm": 0.56212317943573, + "learning_rate": 5.525911174468313e-06, + "loss": 0.1412, + "step": 8201 + }, + { + "epoch": 1.4037309601232244, + "grad_norm": 14.887899398803711, + "learning_rate": 5.5169812901246515e-06, + "loss": 0.9752, + "step": 8202 + }, + { + "epoch": 1.4039021050830054, + "grad_norm": 6.524936199188232, + "learning_rate": 5.508057000776145e-06, + "loss": 0.486, + "step": 8203 + }, + { + "epoch": 1.4040732500427862, + "grad_norm": 6.022520542144775, + "learning_rate": 5.499138311688148e-06, + "loss": 0.5675, + "step": 8204 + }, + { + "epoch": 1.4042443950025671, + "grad_norm": 21.253719329833984, + "learning_rate": 5.490225228122704e-06, + "loss": 1.5702, + "step": 8205 + }, + { + "epoch": 1.4044155399623481, + "grad_norm": 5.472929954528809, + "learning_rate": 5.481317755338534e-06, + "loss": 0.5731, + "step": 8206 + }, + { + "epoch": 1.4045866849221291, + "grad_norm": 12.250944137573242, + "learning_rate": 5.472415898591072e-06, + "loss": 0.9764, + "step": 8207 + }, + { + "epoch": 1.40475782988191, + "grad_norm": 17.810029983520508, + "learning_rate": 5.463519663132413e-06, + "loss": 1.3339, + "step": 8208 + }, + { + "epoch": 1.4049289748416909, + "grad_norm": 14.014885902404785, + "learning_rate": 5.45462905421136e-06, + "loss": 1.3219, + "step": 8209 + }, + { + "epoch": 1.4051001198014719, + "grad_norm": 13.23001480102539, + "learning_rate": 5.445744077073386e-06, + "loss": 1.1867, + "step": 8210 + }, + { + "epoch": 1.4052712647612529, + "grad_norm": 19.602142333984375, + "learning_rate": 5.4368647369606315e-06, + "loss": 2.0208, + "step": 8211 + }, + { + "epoch": 1.4054424097210338, + "grad_norm": 15.634562492370605, + "learning_rate": 5.4279910391119335e-06, + "loss": 1.2285, + "step": 8212 + }, + { + "epoch": 1.4056135546808146, + "grad_norm": 5.58061408996582, + "learning_rate": 5.419122988762777e-06, + "loss": 0.559, + "step": 8213 + }, + { + "epoch": 1.4057846996405956, + "grad_norm": 21.571144104003906, + "learning_rate": 5.410260591145324e-06, + "loss": 2.1966, + "step": 8214 + }, + { + "epoch": 1.4059558446003766, + "grad_norm": 28.33049201965332, + "learning_rate": 5.40140385148841e-06, + "loss": 5.4506, + "step": 8215 + }, + { + "epoch": 1.4061269895601574, + "grad_norm": 23.992944717407227, + "learning_rate": 5.392552775017515e-06, + "loss": 5.2613, + "step": 8216 + }, + { + "epoch": 1.4062981345199383, + "grad_norm": 9.046107292175293, + "learning_rate": 5.383707366954799e-06, + "loss": 0.758, + "step": 8217 + }, + { + "epoch": 1.4064692794797193, + "grad_norm": 1.370169997215271, + "learning_rate": 5.374867632519054e-06, + "loss": 0.2339, + "step": 8218 + }, + { + "epoch": 1.4066404244395003, + "grad_norm": 32.47026824951172, + "learning_rate": 5.3660335769257416e-06, + "loss": 5.6423, + "step": 8219 + }, + { + "epoch": 1.4068115693992813, + "grad_norm": 7.3732194900512695, + "learning_rate": 5.357205205386974e-06, + "loss": 0.6134, + "step": 8220 + }, + { + "epoch": 1.406982714359062, + "grad_norm": 19.994348526000977, + "learning_rate": 5.348382523111492e-06, + "loss": 2.0643, + "step": 8221 + }, + { + "epoch": 1.407153859318843, + "grad_norm": 19.23360252380371, + "learning_rate": 5.339565535304703e-06, + "loss": 1.7582, + "step": 8222 + }, + { + 
"epoch": 1.407325004278624, + "grad_norm": 13.627464294433594, + "learning_rate": 5.330754247168631e-06, + "loss": 1.0169, + "step": 8223 + }, + { + "epoch": 1.4074961492384048, + "grad_norm": 26.512380599975586, + "learning_rate": 5.321948663901956e-06, + "loss": 5.258, + "step": 8224 + }, + { + "epoch": 1.4076672941981858, + "grad_norm": 16.750289916992188, + "learning_rate": 5.313148790699989e-06, + "loss": 1.5109, + "step": 8225 + }, + { + "epoch": 1.4078384391579668, + "grad_norm": 0.5825438499450684, + "learning_rate": 5.304354632754657e-06, + "loss": 0.1341, + "step": 8226 + }, + { + "epoch": 1.4080095841177478, + "grad_norm": 9.757774353027344, + "learning_rate": 5.295566195254541e-06, + "loss": 0.6044, + "step": 8227 + }, + { + "epoch": 1.4081807290775288, + "grad_norm": 8.211784362792969, + "learning_rate": 5.2867834833848175e-06, + "loss": 0.8329, + "step": 8228 + }, + { + "epoch": 1.4083518740373095, + "grad_norm": 21.741884231567383, + "learning_rate": 5.278006502327305e-06, + "loss": 2.9324, + "step": 8229 + }, + { + "epoch": 1.4085230189970905, + "grad_norm": 17.97635269165039, + "learning_rate": 5.269235257260444e-06, + "loss": 1.7023, + "step": 8230 + }, + { + "epoch": 1.4086941639568715, + "grad_norm": 13.902000427246094, + "learning_rate": 5.260469753359268e-06, + "loss": 1.0075, + "step": 8231 + }, + { + "epoch": 1.4088653089166523, + "grad_norm": 128.4419403076172, + "learning_rate": 5.25170999579545e-06, + "loss": 10.1796, + "step": 8232 + }, + { + "epoch": 1.4090364538764333, + "grad_norm": 8.146458625793457, + "learning_rate": 5.242955989737255e-06, + "loss": 0.7365, + "step": 8233 + }, + { + "epoch": 1.4092075988362143, + "grad_norm": 25.456911087036133, + "learning_rate": 5.234207740349552e-06, + "loss": 5.0111, + "step": 8234 + }, + { + "epoch": 1.4093787437959953, + "grad_norm": 15.250171661376953, + "learning_rate": 5.22546525279383e-06, + "loss": 1.2126, + "step": 8235 + }, + { + "epoch": 1.4095498887557762, + "grad_norm": 7.476015567779541, + "learning_rate": 5.216728532228166e-06, + "loss": 0.4713, + "step": 8236 + }, + { + "epoch": 1.409721033715557, + "grad_norm": 16.156631469726562, + "learning_rate": 5.2079975838072454e-06, + "loss": 1.3638, + "step": 8237 + }, + { + "epoch": 1.409892178675338, + "grad_norm": 14.219999313354492, + "learning_rate": 5.199272412682336e-06, + "loss": 1.1954, + "step": 8238 + }, + { + "epoch": 1.410063323635119, + "grad_norm": 12.787342071533203, + "learning_rate": 5.190553024001294e-06, + "loss": 1.0364, + "step": 8239 + }, + { + "epoch": 1.4102344685948998, + "grad_norm": 16.65154266357422, + "learning_rate": 5.181839422908585e-06, + "loss": 1.4099, + "step": 8240 + }, + { + "epoch": 1.4104056135546807, + "grad_norm": 7.621182441711426, + "learning_rate": 5.173131614545234e-06, + "loss": 0.7902, + "step": 8241 + }, + { + "epoch": 1.4105767585144617, + "grad_norm": 22.970258712768555, + "learning_rate": 5.164429604048872e-06, + "loss": 2.7821, + "step": 8242 + }, + { + "epoch": 1.4107479034742427, + "grad_norm": 11.445488929748535, + "learning_rate": 5.155733396553691e-06, + "loss": 0.9885, + "step": 8243 + }, + { + "epoch": 1.4109190484340237, + "grad_norm": 17.15314483642578, + "learning_rate": 5.147042997190471e-06, + "loss": 1.3129, + "step": 8244 + }, + { + "epoch": 1.4110901933938045, + "grad_norm": 3.5751562118530273, + "learning_rate": 5.13835841108656e-06, + "loss": 0.3196, + "step": 8245 + }, + { + "epoch": 1.4112613383535855, + "grad_norm": 10.422638893127441, + "learning_rate": 5.129679643365864e-06, + "loss": 
0.8013, + "step": 8246 + }, + { + "epoch": 1.4114324833133665, + "grad_norm": 16.59783935546875, + "learning_rate": 5.121006699148889e-06, + "loss": 1.3876, + "step": 8247 + }, + { + "epoch": 1.4116036282731472, + "grad_norm": 10.978696823120117, + "learning_rate": 5.112339583552672e-06, + "loss": 0.9672, + "step": 8248 + }, + { + "epoch": 1.4117747732329282, + "grad_norm": 12.953923225402832, + "learning_rate": 5.103678301690833e-06, + "loss": 1.3148, + "step": 8249 + }, + { + "epoch": 1.4119459181927092, + "grad_norm": 14.40229606628418, + "learning_rate": 5.095022858673536e-06, + "loss": 1.1966, + "step": 8250 + }, + { + "epoch": 1.4121170631524902, + "grad_norm": 16.806026458740234, + "learning_rate": 5.086373259607495e-06, + "loss": 1.971, + "step": 8251 + }, + { + "epoch": 1.4122882081122712, + "grad_norm": 18.28573989868164, + "learning_rate": 5.077729509596009e-06, + "loss": 2.0556, + "step": 8252 + }, + { + "epoch": 1.412459353072052, + "grad_norm": 13.333511352539062, + "learning_rate": 5.069091613738883e-06, + "loss": 0.9267, + "step": 8253 + }, + { + "epoch": 1.412630498031833, + "grad_norm": 16.23652458190918, + "learning_rate": 5.060459577132504e-06, + "loss": 1.406, + "step": 8254 + }, + { + "epoch": 1.412801642991614, + "grad_norm": 18.61722183227539, + "learning_rate": 5.051833404869778e-06, + "loss": 1.7441, + "step": 8255 + }, + { + "epoch": 1.4129727879513947, + "grad_norm": 10.499286651611328, + "learning_rate": 5.043213102040155e-06, + "loss": 0.8584, + "step": 8256 + }, + { + "epoch": 1.4131439329111757, + "grad_norm": 13.57168960571289, + "learning_rate": 5.034598673729637e-06, + "loss": 1.0984, + "step": 8257 + }, + { + "epoch": 1.4133150778709567, + "grad_norm": 20.259376525878906, + "learning_rate": 5.02599012502074e-06, + "loss": 2.1547, + "step": 8258 + }, + { + "epoch": 1.4134862228307377, + "grad_norm": 10.417964935302734, + "learning_rate": 5.017387460992531e-06, + "loss": 0.7996, + "step": 8259 + }, + { + "epoch": 1.4136573677905186, + "grad_norm": 46.52947235107422, + "learning_rate": 5.0087906867205825e-06, + "loss": 7.2844, + "step": 8260 + }, + { + "epoch": 1.4138285127502996, + "grad_norm": 13.9273681640625, + "learning_rate": 5.000199807277016e-06, + "loss": 1.154, + "step": 8261 + }, + { + "epoch": 1.4139996577100804, + "grad_norm": 0.46313220262527466, + "learning_rate": 4.991614827730453e-06, + "loss": 0.1258, + "step": 8262 + }, + { + "epoch": 1.4141708026698614, + "grad_norm": 14.016765594482422, + "learning_rate": 4.983035753146048e-06, + "loss": 0.875, + "step": 8263 + }, + { + "epoch": 1.4143419476296424, + "grad_norm": 3.0206847190856934, + "learning_rate": 4.974462588585474e-06, + "loss": 0.2286, + "step": 8264 + }, + { + "epoch": 1.4145130925894231, + "grad_norm": 0.4846319854259491, + "learning_rate": 4.965895339106904e-06, + "loss": 0.1241, + "step": 8265 + }, + { + "epoch": 1.4146842375492041, + "grad_norm": 18.099964141845703, + "learning_rate": 4.957334009765025e-06, + "loss": 1.001, + "step": 8266 + }, + { + "epoch": 1.4148553825089851, + "grad_norm": 19.34149932861328, + "learning_rate": 4.9487786056110396e-06, + "loss": 2.1257, + "step": 8267 + }, + { + "epoch": 1.4150265274687661, + "grad_norm": 11.995976448059082, + "learning_rate": 4.940229131692646e-06, + "loss": 0.7773, + "step": 8268 + }, + { + "epoch": 1.415197672428547, + "grad_norm": 8.392796516418457, + "learning_rate": 4.931685593054055e-06, + "loss": 0.6936, + "step": 8269 + }, + { + "epoch": 1.4153688173883279, + "grad_norm": 22.823087692260742, + "learning_rate": 
4.923147994735959e-06, + "loss": 1.7489, + "step": 8270 + }, + { + "epoch": 1.4155399623481089, + "grad_norm": 8.401246070861816, + "learning_rate": 4.91461634177555e-06, + "loss": 0.949, + "step": 8271 + }, + { + "epoch": 1.4157111073078898, + "grad_norm": 21.816896438598633, + "learning_rate": 4.906090639206523e-06, + "loss": 5.0226, + "step": 8272 + }, + { + "epoch": 1.4158822522676706, + "grad_norm": 8.518943786621094, + "learning_rate": 4.897570892059052e-06, + "loss": 0.7506, + "step": 8273 + }, + { + "epoch": 1.4160533972274516, + "grad_norm": 13.235052108764648, + "learning_rate": 4.889057105359807e-06, + "loss": 0.9746, + "step": 8274 + }, + { + "epoch": 1.4162245421872326, + "grad_norm": 15.83570671081543, + "learning_rate": 4.880549284131929e-06, + "loss": 1.3973, + "step": 8275 + }, + { + "epoch": 1.4163956871470136, + "grad_norm": 39.7057991027832, + "learning_rate": 4.8720474333950415e-06, + "loss": 5.954, + "step": 8276 + }, + { + "epoch": 1.4165668321067946, + "grad_norm": 12.785017967224121, + "learning_rate": 4.86355155816526e-06, + "loss": 1.1006, + "step": 8277 + }, + { + "epoch": 1.4167379770665753, + "grad_norm": 21.51802635192871, + "learning_rate": 4.8550616634551505e-06, + "loss": 1.5379, + "step": 8278 + }, + { + "epoch": 1.4169091220263563, + "grad_norm": 18.137781143188477, + "learning_rate": 4.8465777542737686e-06, + "loss": 1.5255, + "step": 8279 + }, + { + "epoch": 1.4170802669861373, + "grad_norm": 18.3780517578125, + "learning_rate": 4.838099835626642e-06, + "loss": 1.4327, + "step": 8280 + }, + { + "epoch": 1.417251411945918, + "grad_norm": 17.565465927124023, + "learning_rate": 4.829627912515742e-06, + "loss": 1.7263, + "step": 8281 + }, + { + "epoch": 1.417422556905699, + "grad_norm": 16.71804428100586, + "learning_rate": 4.821161989939528e-06, + "loss": 1.7956, + "step": 8282 + }, + { + "epoch": 1.41759370186548, + "grad_norm": 16.671550750732422, + "learning_rate": 4.812702072892895e-06, + "loss": 1.1207, + "step": 8283 + }, + { + "epoch": 1.417764846825261, + "grad_norm": 14.933640480041504, + "learning_rate": 4.8042481663672185e-06, + "loss": 1.0693, + "step": 8284 + }, + { + "epoch": 1.417935991785042, + "grad_norm": 15.258363723754883, + "learning_rate": 4.795800275350304e-06, + "loss": 1.0977, + "step": 8285 + }, + { + "epoch": 1.4181071367448228, + "grad_norm": 19.424457550048828, + "learning_rate": 4.787358404826431e-06, + "loss": 1.6517, + "step": 8286 + }, + { + "epoch": 1.4182782817046038, + "grad_norm": 8.330910682678223, + "learning_rate": 4.778922559776311e-06, + "loss": 0.6448, + "step": 8287 + }, + { + "epoch": 1.4184494266643848, + "grad_norm": 15.775337219238281, + "learning_rate": 4.770492745177095e-06, + "loss": 1.314, + "step": 8288 + }, + { + "epoch": 1.4186205716241655, + "grad_norm": 8.920485496520996, + "learning_rate": 4.762068966002404e-06, + "loss": 0.7039, + "step": 8289 + }, + { + "epoch": 1.4187917165839465, + "grad_norm": 19.12442970275879, + "learning_rate": 4.753651227222274e-06, + "loss": 1.4776, + "step": 8290 + }, + { + "epoch": 1.4189628615437275, + "grad_norm": 10.677218437194824, + "learning_rate": 4.745239533803176e-06, + "loss": 0.9044, + "step": 8291 + }, + { + "epoch": 1.4191340065035085, + "grad_norm": 22.69997215270996, + "learning_rate": 4.7368338907080315e-06, + "loss": 5.6359, + "step": 8292 + }, + { + "epoch": 1.4193051514632895, + "grad_norm": 12.117168426513672, + "learning_rate": 4.728434302896173e-06, + "loss": 1.0645, + "step": 8293 + }, + { + "epoch": 1.4194762964230703, + "grad_norm": 
15.21337604522705, + "learning_rate": 4.720040775323374e-06, + "loss": 1.387, + "step": 8294 + }, + { + "epoch": 1.4196474413828513, + "grad_norm": 17.30541229248047, + "learning_rate": 4.711653312941836e-06, + "loss": 1.3167, + "step": 8295 + }, + { + "epoch": 1.4198185863426322, + "grad_norm": 0.8116150498390198, + "learning_rate": 4.703271920700162e-06, + "loss": 0.1334, + "step": 8296 + }, + { + "epoch": 1.419989731302413, + "grad_norm": 11.490621566772461, + "learning_rate": 4.694896603543396e-06, + "loss": 0.8766, + "step": 8297 + }, + { + "epoch": 1.420160876262194, + "grad_norm": 2.6469197273254395, + "learning_rate": 4.686527366412978e-06, + "loss": 0.2834, + "step": 8298 + }, + { + "epoch": 1.420332021221975, + "grad_norm": 14.399421691894531, + "learning_rate": 4.67816421424678e-06, + "loss": 1.3508, + "step": 8299 + }, + { + "epoch": 1.420503166181756, + "grad_norm": 15.663511276245117, + "learning_rate": 4.669807151979065e-06, + "loss": 1.4015, + "step": 8300 + }, + { + "epoch": 1.420674311141537, + "grad_norm": 11.911434173583984, + "learning_rate": 4.661456184540523e-06, + "loss": 0.9201, + "step": 8301 + }, + { + "epoch": 1.4208454561013177, + "grad_norm": 20.173824310302734, + "learning_rate": 4.6531113168582285e-06, + "loss": 1.6779, + "step": 8302 + }, + { + "epoch": 1.4210166010610987, + "grad_norm": 12.680366516113281, + "learning_rate": 4.644772553855665e-06, + "loss": 0.8848, + "step": 8303 + }, + { + "epoch": 1.4211877460208797, + "grad_norm": 15.29770565032959, + "learning_rate": 4.636439900452722e-06, + "loss": 1.1408, + "step": 8304 + }, + { + "epoch": 1.4213588909806605, + "grad_norm": 3.1856536865234375, + "learning_rate": 4.628113361565664e-06, + "loss": 0.2935, + "step": 8305 + }, + { + "epoch": 1.4215300359404415, + "grad_norm": 7.9707350730896, + "learning_rate": 4.619792942107183e-06, + "loss": 0.6461, + "step": 8306 + }, + { + "epoch": 1.4217011809002225, + "grad_norm": 17.432270050048828, + "learning_rate": 4.611478646986326e-06, + "loss": 1.5045, + "step": 8307 + }, + { + "epoch": 1.4218723258600034, + "grad_norm": 13.844263076782227, + "learning_rate": 4.603170481108535e-06, + "loss": 1.0557, + "step": 8308 + }, + { + "epoch": 1.4220434708197844, + "grad_norm": 6.255578994750977, + "learning_rate": 4.5948684493756515e-06, + "loss": 0.631, + "step": 8309 + }, + { + "epoch": 1.4222146157795654, + "grad_norm": 33.30855178833008, + "learning_rate": 4.586572556685876e-06, + "loss": 5.827, + "step": 8310 + }, + { + "epoch": 1.4223857607393462, + "grad_norm": 2.2865633964538574, + "learning_rate": 4.578282807933802e-06, + "loss": 0.267, + "step": 8311 + }, + { + "epoch": 1.4225569056991272, + "grad_norm": 6.593497276306152, + "learning_rate": 4.569999208010399e-06, + "loss": 0.7334, + "step": 8312 + }, + { + "epoch": 1.4227280506589082, + "grad_norm": 4.725667476654053, + "learning_rate": 4.5617217618029935e-06, + "loss": 0.4138, + "step": 8313 + }, + { + "epoch": 1.422899195618689, + "grad_norm": 14.847844123840332, + "learning_rate": 4.553450474195301e-06, + "loss": 1.2268, + "step": 8314 + }, + { + "epoch": 1.42307034057847, + "grad_norm": 56.064247131347656, + "learning_rate": 4.545185350067384e-06, + "loss": 7.2508, + "step": 8315 + }, + { + "epoch": 1.423241485538251, + "grad_norm": 18.788999557495117, + "learning_rate": 4.536926394295682e-06, + "loss": 1.6801, + "step": 8316 + }, + { + "epoch": 1.423412630498032, + "grad_norm": 5.343076229095459, + "learning_rate": 4.528673611752997e-06, + "loss": 0.5371, + "step": 8317 + }, + { + "epoch": 
1.423583775457813, + "grad_norm": 0.4705977141857147, + "learning_rate": 4.520427007308471e-06, + "loss": 0.1254, + "step": 8318 + }, + { + "epoch": 1.4237549204175937, + "grad_norm": 14.721872329711914, + "learning_rate": 4.512186585827626e-06, + "loss": 1.1522, + "step": 8319 + }, + { + "epoch": 1.4239260653773747, + "grad_norm": 10.518433570861816, + "learning_rate": 4.503952352172312e-06, + "loss": 0.7963, + "step": 8320 + }, + { + "epoch": 1.4240972103371556, + "grad_norm": 18.39733123779297, + "learning_rate": 4.495724311200743e-06, + "loss": 1.9861, + "step": 8321 + }, + { + "epoch": 1.4242683552969364, + "grad_norm": 0.5959325432777405, + "learning_rate": 4.487502467767481e-06, + "loss": 0.1356, + "step": 8322 + }, + { + "epoch": 1.4244395002567174, + "grad_norm": 15.7307710647583, + "learning_rate": 4.479286826723415e-06, + "loss": 1.1963, + "step": 8323 + }, + { + "epoch": 1.4246106452164984, + "grad_norm": 17.11187744140625, + "learning_rate": 4.471077392915798e-06, + "loss": 1.3984, + "step": 8324 + }, + { + "epoch": 1.4247817901762794, + "grad_norm": 3.4646151065826416, + "learning_rate": 4.462874171188197e-06, + "loss": 0.2938, + "step": 8325 + }, + { + "epoch": 1.4249529351360604, + "grad_norm": 16.55135726928711, + "learning_rate": 4.454677166380533e-06, + "loss": 1.4855, + "step": 8326 + }, + { + "epoch": 1.4251240800958411, + "grad_norm": 13.826451301574707, + "learning_rate": 4.446486383329048e-06, + "loss": 1.2132, + "step": 8327 + }, + { + "epoch": 1.4252952250556221, + "grad_norm": 13.233954429626465, + "learning_rate": 4.438301826866311e-06, + "loss": 1.1412, + "step": 8328 + }, + { + "epoch": 1.425466370015403, + "grad_norm": 5.975020885467529, + "learning_rate": 4.430123501821233e-06, + "loss": 0.5786, + "step": 8329 + }, + { + "epoch": 1.4256375149751839, + "grad_norm": 12.811347007751465, + "learning_rate": 4.421951413019028e-06, + "loss": 0.9023, + "step": 8330 + }, + { + "epoch": 1.4258086599349649, + "grad_norm": 11.870346069335938, + "learning_rate": 4.413785565281244e-06, + "loss": 1.0858, + "step": 8331 + }, + { + "epoch": 1.4259798048947459, + "grad_norm": 0.5878346562385559, + "learning_rate": 4.405625963425748e-06, + "loss": 0.1382, + "step": 8332 + }, + { + "epoch": 1.4261509498545268, + "grad_norm": 22.074859619140625, + "learning_rate": 4.3974726122667095e-06, + "loss": 0.9755, + "step": 8333 + }, + { + "epoch": 1.4263220948143078, + "grad_norm": 14.925138473510742, + "learning_rate": 4.389325516614628e-06, + "loss": 1.5146, + "step": 8334 + }, + { + "epoch": 1.4264932397740886, + "grad_norm": 12.06701374053955, + "learning_rate": 4.381184681276289e-06, + "loss": 1.0256, + "step": 8335 + }, + { + "epoch": 1.4266643847338696, + "grad_norm": 66.08161163330078, + "learning_rate": 4.37305011105481e-06, + "loss": 8.282, + "step": 8336 + }, + { + "epoch": 1.4268355296936506, + "grad_norm": 4.224128723144531, + "learning_rate": 4.36492181074959e-06, + "loss": 0.2899, + "step": 8337 + }, + { + "epoch": 1.4270066746534313, + "grad_norm": 16.93010902404785, + "learning_rate": 4.356799785156346e-06, + "loss": 1.2722, + "step": 8338 + }, + { + "epoch": 1.4271778196132123, + "grad_norm": 14.85118579864502, + "learning_rate": 4.3486840390670755e-06, + "loss": 1.4349, + "step": 8339 + }, + { + "epoch": 1.4273489645729933, + "grad_norm": 19.152435302734375, + "learning_rate": 4.3405745772700875e-06, + "loss": 2.5445, + "step": 8340 + }, + { + "epoch": 1.4275201095327743, + "grad_norm": 0.5115156769752502, + "learning_rate": 4.3324714045499815e-06, + "loss": 0.1364, + 
"step": 8341 + }, + { + "epoch": 1.4276912544925553, + "grad_norm": 81.3175048828125, + "learning_rate": 4.324374525687635e-06, + "loss": 8.4995, + "step": 8342 + }, + { + "epoch": 1.427862399452336, + "grad_norm": 18.543806076049805, + "learning_rate": 4.3162839454602135e-06, + "loss": 1.5013, + "step": 8343 + }, + { + "epoch": 1.428033544412117, + "grad_norm": 22.6982364654541, + "learning_rate": 4.3081996686411825e-06, + "loss": 2.4938, + "step": 8344 + }, + { + "epoch": 1.428204689371898, + "grad_norm": 6.561403751373291, + "learning_rate": 4.300121700000269e-06, + "loss": 0.3636, + "step": 8345 + }, + { + "epoch": 1.4283758343316788, + "grad_norm": 9.580253601074219, + "learning_rate": 4.2920500443034915e-06, + "loss": 0.585, + "step": 8346 + }, + { + "epoch": 1.4285469792914598, + "grad_norm": 16.961750030517578, + "learning_rate": 4.283984706313135e-06, + "loss": 2.085, + "step": 8347 + }, + { + "epoch": 1.4287181242512408, + "grad_norm": 5.839288711547852, + "learning_rate": 4.275925690787765e-06, + "loss": 0.5665, + "step": 8348 + }, + { + "epoch": 1.4288892692110218, + "grad_norm": 16.67837905883789, + "learning_rate": 4.267873002482213e-06, + "loss": 1.6816, + "step": 8349 + }, + { + "epoch": 1.4290604141708028, + "grad_norm": 17.28608512878418, + "learning_rate": 4.259826646147563e-06, + "loss": 1.6976, + "step": 8350 + }, + { + "epoch": 1.4292315591305835, + "grad_norm": 7.20570182800293, + "learning_rate": 4.251786626531195e-06, + "loss": 0.5685, + "step": 8351 + }, + { + "epoch": 1.4294027040903645, + "grad_norm": 24.501218795776367, + "learning_rate": 4.2437529483767305e-06, + "loss": 1.9365, + "step": 8352 + }, + { + "epoch": 1.4295738490501455, + "grad_norm": 5.079190254211426, + "learning_rate": 4.235725616424041e-06, + "loss": 0.3883, + "step": 8353 + }, + { + "epoch": 1.4297449940099263, + "grad_norm": 13.976761817932129, + "learning_rate": 4.227704635409279e-06, + "loss": 0.9349, + "step": 8354 + }, + { + "epoch": 1.4299161389697073, + "grad_norm": 26.27178382873535, + "learning_rate": 4.219690010064819e-06, + "loss": 5.4424, + "step": 8355 + }, + { + "epoch": 1.4300872839294883, + "grad_norm": 10.558706283569336, + "learning_rate": 4.2116817451193165e-06, + "loss": 0.6608, + "step": 8356 + }, + { + "epoch": 1.4302584288892692, + "grad_norm": 10.44459342956543, + "learning_rate": 4.203679845297648e-06, + "loss": 0.7136, + "step": 8357 + }, + { + "epoch": 1.4304295738490502, + "grad_norm": 12.273661613464355, + "learning_rate": 4.195684315320957e-06, + "loss": 1.0603, + "step": 8358 + }, + { + "epoch": 1.4306007188088312, + "grad_norm": 21.012882232666016, + "learning_rate": 4.18769515990661e-06, + "loss": 2.2086, + "step": 8359 + }, + { + "epoch": 1.430771863768612, + "grad_norm": 16.736093521118164, + "learning_rate": 4.179712383768221e-06, + "loss": 1.3113, + "step": 8360 + }, + { + "epoch": 1.430943008728393, + "grad_norm": 16.535369873046875, + "learning_rate": 4.171735991615636e-06, + "loss": 1.0518, + "step": 8361 + }, + { + "epoch": 1.431114153688174, + "grad_norm": 2.179553985595703, + "learning_rate": 4.163765988154954e-06, + "loss": 0.2467, + "step": 8362 + }, + { + "epoch": 1.4312852986479547, + "grad_norm": 12.279807090759277, + "learning_rate": 4.155802378088475e-06, + "loss": 1.116, + "step": 8363 + }, + { + "epoch": 1.4314564436077357, + "grad_norm": 16.3411808013916, + "learning_rate": 4.14784516611475e-06, + "loss": 0.9596, + "step": 8364 + }, + { + "epoch": 1.4316275885675167, + "grad_norm": 0.39746716618537903, + "learning_rate": 
4.139894356928535e-06, + "loss": 0.1217, + "step": 8365 + }, + { + "epoch": 1.4317987335272977, + "grad_norm": 16.14784049987793, + "learning_rate": 4.131949955220829e-06, + "loss": 1.7755, + "step": 8366 + }, + { + "epoch": 1.4319698784870787, + "grad_norm": 11.682435989379883, + "learning_rate": 4.124011965678838e-06, + "loss": 0.7324, + "step": 8367 + }, + { + "epoch": 1.4321410234468595, + "grad_norm": 15.93487548828125, + "learning_rate": 4.116080392985983e-06, + "loss": 1.5287, + "step": 8368 + }, + { + "epoch": 1.4323121684066404, + "grad_norm": 11.052955627441406, + "learning_rate": 4.10815524182191e-06, + "loss": 1.0045, + "step": 8369 + }, + { + "epoch": 1.4324833133664214, + "grad_norm": 4.440634727478027, + "learning_rate": 4.100236516862463e-06, + "loss": 0.3072, + "step": 8370 + }, + { + "epoch": 1.4326544583262022, + "grad_norm": 18.903459548950195, + "learning_rate": 4.092324222779711e-06, + "loss": 1.7344, + "step": 8371 + }, + { + "epoch": 1.4328256032859832, + "grad_norm": 6.7151031494140625, + "learning_rate": 4.0844183642419096e-06, + "loss": 0.3896, + "step": 8372 + }, + { + "epoch": 1.4329967482457642, + "grad_norm": 2.5013160705566406, + "learning_rate": 4.076518945913532e-06, + "loss": 0.271, + "step": 8373 + }, + { + "epoch": 1.4331678932055452, + "grad_norm": 0.4836881458759308, + "learning_rate": 4.068625972455251e-06, + "loss": 0.1247, + "step": 8374 + }, + { + "epoch": 1.4333390381653262, + "grad_norm": 14.715100288391113, + "learning_rate": 4.060739448523921e-06, + "loss": 1.1756, + "step": 8375 + }, + { + "epoch": 1.433510183125107, + "grad_norm": 16.3120059967041, + "learning_rate": 4.052859378772617e-06, + "loss": 1.486, + "step": 8376 + }, + { + "epoch": 1.433681328084888, + "grad_norm": 11.071383476257324, + "learning_rate": 4.04498576785058e-06, + "loss": 0.9383, + "step": 8377 + }, + { + "epoch": 1.433852473044669, + "grad_norm": 15.928082466125488, + "learning_rate": 4.0371186204032614e-06, + "loss": 1.5102, + "step": 8378 + }, + { + "epoch": 1.4340236180044497, + "grad_norm": 16.743022918701172, + "learning_rate": 4.029257941072286e-06, + "loss": 1.4692, + "step": 8379 + }, + { + "epoch": 1.4341947629642307, + "grad_norm": 15.075504302978516, + "learning_rate": 4.0214037344954604e-06, + "loss": 1.2071, + "step": 8380 + }, + { + "epoch": 1.4343659079240116, + "grad_norm": 21.42952537536621, + "learning_rate": 4.013556005306788e-06, + "loss": 1.9503, + "step": 8381 + }, + { + "epoch": 1.4345370528837926, + "grad_norm": 17.631309509277344, + "learning_rate": 4.0057147581364324e-06, + "loss": 1.4888, + "step": 8382 + }, + { + "epoch": 1.4347081978435736, + "grad_norm": 93.76021575927734, + "learning_rate": 3.997879997610745e-06, + "loss": 8.4484, + "step": 8383 + }, + { + "epoch": 1.4348793428033544, + "grad_norm": 11.445027351379395, + "learning_rate": 3.990051728352252e-06, + "loss": 0.9915, + "step": 8384 + }, + { + "epoch": 1.4350504877631354, + "grad_norm": 6.506800651550293, + "learning_rate": 3.982229954979631e-06, + "loss": 0.7748, + "step": 8385 + }, + { + "epoch": 1.4352216327229164, + "grad_norm": 3.774770736694336, + "learning_rate": 3.9744146821077546e-06, + "loss": 0.2667, + "step": 8386 + }, + { + "epoch": 1.4353927776826971, + "grad_norm": 28.999431610107422, + "learning_rate": 3.96660591434763e-06, + "loss": 5.4131, + "step": 8387 + }, + { + "epoch": 1.4355639226424781, + "grad_norm": 1.9287961721420288, + "learning_rate": 3.958803656306456e-06, + "loss": 0.211, + "step": 8388 + }, + { + "epoch": 1.435735067602259, + "grad_norm": 
21.30387306213379, + "learning_rate": 3.951007912587566e-06, + "loss": 2.5411, + "step": 8389 + }, + { + "epoch": 1.43590621256204, + "grad_norm": 2.638003349304199, + "learning_rate": 3.9432186877904684e-06, + "loss": 0.3276, + "step": 8390 + }, + { + "epoch": 1.436077357521821, + "grad_norm": 0.4537413716316223, + "learning_rate": 3.9354359865108154e-06, + "loss": 0.1262, + "step": 8391 + }, + { + "epoch": 1.4362485024816019, + "grad_norm": 13.961565971374512, + "learning_rate": 3.927659813340403e-06, + "loss": 1.0929, + "step": 8392 + }, + { + "epoch": 1.4364196474413828, + "grad_norm": 13.700946807861328, + "learning_rate": 3.919890172867191e-06, + "loss": 0.9157, + "step": 8393 + }, + { + "epoch": 1.4365907924011638, + "grad_norm": 17.101577758789062, + "learning_rate": 3.912127069675288e-06, + "loss": 1.072, + "step": 8394 + }, + { + "epoch": 1.4367619373609446, + "grad_norm": 5.47079610824585, + "learning_rate": 3.904370508344924e-06, + "loss": 0.6731, + "step": 8395 + }, + { + "epoch": 1.4369330823207256, + "grad_norm": 6.5655837059021, + "learning_rate": 3.896620493452493e-06, + "loss": 0.625, + "step": 8396 + }, + { + "epoch": 1.4371042272805066, + "grad_norm": 11.239481925964355, + "learning_rate": 3.888877029570503e-06, + "loss": 1.0318, + "step": 8397 + }, + { + "epoch": 1.4372753722402876, + "grad_norm": 21.253192901611328, + "learning_rate": 3.881140121267619e-06, + "loss": 1.5675, + "step": 8398 + }, + { + "epoch": 1.4374465172000686, + "grad_norm": 17.1704044342041, + "learning_rate": 3.873409773108625e-06, + "loss": 2.0079, + "step": 8399 + }, + { + "epoch": 1.4376176621598493, + "grad_norm": 87.91876983642578, + "learning_rate": 3.865685989654433e-06, + "loss": 8.8725, + "step": 8400 + }, + { + "epoch": 1.4377888071196303, + "grad_norm": 13.40868854522705, + "learning_rate": 3.857968775462096e-06, + "loss": 1.0831, + "step": 8401 + }, + { + "epoch": 1.4379599520794113, + "grad_norm": 17.707443237304688, + "learning_rate": 3.8502581350847716e-06, + "loss": 1.3818, + "step": 8402 + }, + { + "epoch": 1.438131097039192, + "grad_norm": 2.461596727371216, + "learning_rate": 3.84255407307176e-06, + "loss": 0.2992, + "step": 8403 + }, + { + "epoch": 1.438302241998973, + "grad_norm": 18.57525062561035, + "learning_rate": 3.834856593968456e-06, + "loss": 1.9091, + "step": 8404 + }, + { + "epoch": 1.438473386958754, + "grad_norm": 20.6262264251709, + "learning_rate": 3.827165702316395e-06, + "loss": 1.6094, + "step": 8405 + }, + { + "epoch": 1.438644531918535, + "grad_norm": 12.672618865966797, + "learning_rate": 3.8194814026532146e-06, + "loss": 0.858, + "step": 8406 + }, + { + "epoch": 1.438815676878316, + "grad_norm": 14.475008964538574, + "learning_rate": 3.81180369951266e-06, + "loss": 1.2888, + "step": 8407 + }, + { + "epoch": 1.4389868218380968, + "grad_norm": 16.547607421875, + "learning_rate": 3.8041325974245826e-06, + "loss": 1.5037, + "step": 8408 + }, + { + "epoch": 1.4391579667978778, + "grad_norm": 19.653106689453125, + "learning_rate": 3.7964681009149547e-06, + "loss": 1.5261, + "step": 8409 + }, + { + "epoch": 1.4393291117576588, + "grad_norm": 0.3853145241737366, + "learning_rate": 3.788810214505829e-06, + "loss": 0.1155, + "step": 8410 + }, + { + "epoch": 1.4395002567174398, + "grad_norm": 11.497426986694336, + "learning_rate": 3.7811589427153793e-06, + "loss": 0.6922, + "step": 8411 + }, + { + "epoch": 1.4396714016772205, + "grad_norm": 17.74052619934082, + "learning_rate": 3.7735142900578578e-06, + "loss": 1.2992, + "step": 8412 + }, + { + "epoch": 
1.4398425466370015, + "grad_norm": 0.7352682948112488, + "learning_rate": 3.7658762610436336e-06, + "loss": 0.1482, + "step": 8413 + }, + { + "epoch": 1.4400136915967825, + "grad_norm": 1.819342017173767, + "learning_rate": 3.758244860179142e-06, + "loss": 0.218, + "step": 8414 + }, + { + "epoch": 1.4401848365565635, + "grad_norm": 23.511795043945312, + "learning_rate": 3.7506200919669278e-06, + "loss": 5.0297, + "step": 8415 + }, + { + "epoch": 1.4403559815163445, + "grad_norm": 7.822660446166992, + "learning_rate": 3.7430019609056187e-06, + "loss": 0.7485, + "step": 8416 + }, + { + "epoch": 1.4405271264761252, + "grad_norm": 12.87449836730957, + "learning_rate": 3.735390471489915e-06, + "loss": 1.1747, + "step": 8417 + }, + { + "epoch": 1.4406982714359062, + "grad_norm": 15.970128059387207, + "learning_rate": 3.727785628210616e-06, + "loss": 1.2442, + "step": 8418 + }, + { + "epoch": 1.4408694163956872, + "grad_norm": 8.213173866271973, + "learning_rate": 3.7201874355545874e-06, + "loss": 0.4527, + "step": 8419 + }, + { + "epoch": 1.441040561355468, + "grad_norm": 15.708293914794922, + "learning_rate": 3.7125958980047662e-06, + "loss": 1.3131, + "step": 8420 + }, + { + "epoch": 1.441211706315249, + "grad_norm": 18.762908935546875, + "learning_rate": 3.7050110200401822e-06, + "loss": 1.6741, + "step": 8421 + }, + { + "epoch": 1.44138285127503, + "grad_norm": 0.9794007539749146, + "learning_rate": 3.6974328061359146e-06, + "loss": 0.2083, + "step": 8422 + }, + { + "epoch": 1.441553996234811, + "grad_norm": 5.408003807067871, + "learning_rate": 3.6898612607631327e-06, + "loss": 0.7957, + "step": 8423 + }, + { + "epoch": 1.441725141194592, + "grad_norm": 10.823113441467285, + "learning_rate": 3.6822963883890476e-06, + "loss": 0.9094, + "step": 8424 + }, + { + "epoch": 1.4418962861543727, + "grad_norm": 1.8446986675262451, + "learning_rate": 3.674738193476949e-06, + "loss": 0.2384, + "step": 8425 + }, + { + "epoch": 1.4420674311141537, + "grad_norm": 6.8515801429748535, + "learning_rate": 3.6671866804861903e-06, + "loss": 0.5296, + "step": 8426 + }, + { + "epoch": 1.4422385760739347, + "grad_norm": 25.481624603271484, + "learning_rate": 3.659641853872167e-06, + "loss": 5.4417, + "step": 8427 + }, + { + "epoch": 1.4424097210337155, + "grad_norm": 12.586084365844727, + "learning_rate": 3.652103718086344e-06, + "loss": 1.0079, + "step": 8428 + }, + { + "epoch": 1.4425808659934964, + "grad_norm": 23.349151611328125, + "learning_rate": 3.644572277576224e-06, + "loss": 5.2822, + "step": 8429 + }, + { + "epoch": 1.4427520109532774, + "grad_norm": 10.716401100158691, + "learning_rate": 3.637047536785379e-06, + "loss": 1.2512, + "step": 8430 + }, + { + "epoch": 1.4429231559130584, + "grad_norm": 13.124372482299805, + "learning_rate": 3.6295295001534133e-06, + "loss": 1.4293, + "step": 8431 + }, + { + "epoch": 1.4430943008728394, + "grad_norm": 0.47882649302482605, + "learning_rate": 3.622018172115973e-06, + "loss": 0.1324, + "step": 8432 + }, + { + "epoch": 1.4432654458326202, + "grad_norm": 5.195764064788818, + "learning_rate": 3.614513557104762e-06, + "loss": 0.4471, + "step": 8433 + }, + { + "epoch": 1.4434365907924012, + "grad_norm": 0.4021523594856262, + "learning_rate": 3.607015659547506e-06, + "loss": 0.126, + "step": 8434 + }, + { + "epoch": 1.4436077357521822, + "grad_norm": 0.5373860597610474, + "learning_rate": 3.5995244838679847e-06, + "loss": 0.1333, + "step": 8435 + }, + { + "epoch": 1.443778880711963, + "grad_norm": 13.348397254943848, + "learning_rate": 3.5920400344859905e-06, + 
"loss": 1.1046, + "step": 8436 + }, + { + "epoch": 1.443950025671744, + "grad_norm": 2.4842634201049805, + "learning_rate": 3.584562315817373e-06, + "loss": 0.2417, + "step": 8437 + }, + { + "epoch": 1.444121170631525, + "grad_norm": 10.33353328704834, + "learning_rate": 3.5770913322739947e-06, + "loss": 1.05, + "step": 8438 + }, + { + "epoch": 1.4442923155913059, + "grad_norm": 58.4034309387207, + "learning_rate": 3.5696270882637446e-06, + "loss": 7.4923, + "step": 8439 + }, + { + "epoch": 1.4444634605510869, + "grad_norm": 9.728248596191406, + "learning_rate": 3.562169588190533e-06, + "loss": 1.0112, + "step": 8440 + }, + { + "epoch": 1.4446346055108676, + "grad_norm": 12.72889232635498, + "learning_rate": 3.554718836454306e-06, + "loss": 1.015, + "step": 8441 + }, + { + "epoch": 1.4448057504706486, + "grad_norm": 13.545023918151855, + "learning_rate": 3.5472748374510065e-06, + "loss": 0.9717, + "step": 8442 + }, + { + "epoch": 1.4449768954304296, + "grad_norm": 5.7686285972595215, + "learning_rate": 3.539837595572617e-06, + "loss": 0.517, + "step": 8443 + }, + { + "epoch": 1.4451480403902104, + "grad_norm": 14.59744930267334, + "learning_rate": 3.5324071152071118e-06, + "loss": 1.037, + "step": 8444 + }, + { + "epoch": 1.4453191853499914, + "grad_norm": 8.939913749694824, + "learning_rate": 3.524983400738493e-06, + "loss": 0.8705, + "step": 8445 + }, + { + "epoch": 1.4454903303097724, + "grad_norm": 12.520122528076172, + "learning_rate": 3.517566456546758e-06, + "loss": 1.0416, + "step": 8446 + }, + { + "epoch": 1.4456614752695534, + "grad_norm": 14.85897445678711, + "learning_rate": 3.5101562870079085e-06, + "loss": 1.1178, + "step": 8447 + }, + { + "epoch": 1.4458326202293343, + "grad_norm": 1.4447020292282104, + "learning_rate": 3.502752896493969e-06, + "loss": 0.2198, + "step": 8448 + }, + { + "epoch": 1.446003765189115, + "grad_norm": 13.082090377807617, + "learning_rate": 3.495356289372948e-06, + "loss": 0.9224, + "step": 8449 + }, + { + "epoch": 1.446174910148896, + "grad_norm": 14.805837631225586, + "learning_rate": 3.487966470008847e-06, + "loss": 1.2862, + "step": 8450 + }, + { + "epoch": 1.446346055108677, + "grad_norm": 25.148759841918945, + "learning_rate": 3.4805834427616817e-06, + "loss": 1.9002, + "step": 8451 + }, + { + "epoch": 1.4465172000684579, + "grad_norm": 0.42858776450157166, + "learning_rate": 3.4732072119874376e-06, + "loss": 0.1169, + "step": 8452 + }, + { + "epoch": 1.4466883450282388, + "grad_norm": 13.413276672363281, + "learning_rate": 3.4658377820381114e-06, + "loss": 1.0795, + "step": 8453 + }, + { + "epoch": 1.4468594899880198, + "grad_norm": 3.4085307121276855, + "learning_rate": 3.4584751572616692e-06, + "loss": 0.2776, + "step": 8454 + }, + { + "epoch": 1.4470306349478008, + "grad_norm": 94.4915771484375, + "learning_rate": 3.4511193420020786e-06, + "loss": 7.4881, + "step": 8455 + }, + { + "epoch": 1.4472017799075818, + "grad_norm": 8.371195793151855, + "learning_rate": 3.4437703405992787e-06, + "loss": 0.7268, + "step": 8456 + }, + { + "epoch": 1.4473729248673626, + "grad_norm": 13.714017868041992, + "learning_rate": 3.436428157389181e-06, + "loss": 1.1406, + "step": 8457 + }, + { + "epoch": 1.4475440698271436, + "grad_norm": 3.720073699951172, + "learning_rate": 3.429092796703703e-06, + "loss": 0.2816, + "step": 8458 + }, + { + "epoch": 1.4477152147869246, + "grad_norm": 11.747239112854004, + "learning_rate": 3.421764262870709e-06, + "loss": 0.8131, + "step": 8459 + }, + { + "epoch": 1.4478863597467055, + "grad_norm": 21.724252700805664, + 
"learning_rate": 3.414442560214041e-06, + "loss": 1.8312, + "step": 8460 + }, + { + "epoch": 1.4480575047064863, + "grad_norm": 20.976316452026367, + "learning_rate": 3.407127693053524e-06, + "loss": 2.1807, + "step": 8461 + }, + { + "epoch": 1.4482286496662673, + "grad_norm": 2.2490921020507812, + "learning_rate": 3.39981966570493e-06, + "loss": 0.2436, + "step": 8462 + }, + { + "epoch": 1.4483997946260483, + "grad_norm": 15.60532283782959, + "learning_rate": 3.392518482480016e-06, + "loss": 1.0062, + "step": 8463 + }, + { + "epoch": 1.4485709395858293, + "grad_norm": 10.781365394592285, + "learning_rate": 3.385224147686482e-06, + "loss": 0.8134, + "step": 8464 + }, + { + "epoch": 1.4487420845456103, + "grad_norm": 14.161565780639648, + "learning_rate": 3.377936665628004e-06, + "loss": 1.1182, + "step": 8465 + }, + { + "epoch": 1.448913229505391, + "grad_norm": 10.106000900268555, + "learning_rate": 3.3706560406041996e-06, + "loss": 0.6549, + "step": 8466 + }, + { + "epoch": 1.449084374465172, + "grad_norm": 77.07288360595703, + "learning_rate": 3.3633822769106578e-06, + "loss": 8.6634, + "step": 8467 + }, + { + "epoch": 1.449255519424953, + "grad_norm": 6.6021037101745605, + "learning_rate": 3.3561153788388998e-06, + "loss": 0.4629, + "step": 8468 + }, + { + "epoch": 1.4494266643847338, + "grad_norm": 10.353342056274414, + "learning_rate": 3.348855350676412e-06, + "loss": 0.8539, + "step": 8469 + }, + { + "epoch": 1.4495978093445148, + "grad_norm": 16.80060577392578, + "learning_rate": 3.3416021967066256e-06, + "loss": 1.3293, + "step": 8470 + }, + { + "epoch": 1.4497689543042958, + "grad_norm": 29.66864585876465, + "learning_rate": 3.3343559212089083e-06, + "loss": 5.7813, + "step": 8471 + }, + { + "epoch": 1.4499400992640767, + "grad_norm": 12.348899841308594, + "learning_rate": 3.3271165284585677e-06, + "loss": 0.9477, + "step": 8472 + }, + { + "epoch": 1.4501112442238577, + "grad_norm": 23.271472930908203, + "learning_rate": 3.3198840227268657e-06, + "loss": 1.9464, + "step": 8473 + }, + { + "epoch": 1.4502823891836385, + "grad_norm": 15.827186584472656, + "learning_rate": 3.312658408280984e-06, + "loss": 1.828, + "step": 8474 + }, + { + "epoch": 1.4504535341434195, + "grad_norm": 11.40153980255127, + "learning_rate": 3.305439689384053e-06, + "loss": 0.7975, + "step": 8475 + }, + { + "epoch": 1.4506246791032005, + "grad_norm": 2.8747787475585938, + "learning_rate": 3.2982278702951195e-06, + "loss": 0.2592, + "step": 8476 + }, + { + "epoch": 1.4507958240629812, + "grad_norm": 13.116789817810059, + "learning_rate": 3.2910229552691763e-06, + "loss": 1.1737, + "step": 8477 + }, + { + "epoch": 1.4509669690227622, + "grad_norm": 24.355487823486328, + "learning_rate": 3.283824948557129e-06, + "loss": 3.3179, + "step": 8478 + }, + { + "epoch": 1.4511381139825432, + "grad_norm": 5.922363758087158, + "learning_rate": 3.276633854405805e-06, + "loss": 0.4773, + "step": 8479 + }, + { + "epoch": 1.4513092589423242, + "grad_norm": 18.42041778564453, + "learning_rate": 3.2694496770579727e-06, + "loss": 2.1234, + "step": 8480 + }, + { + "epoch": 1.4514804039021052, + "grad_norm": 5.232955455780029, + "learning_rate": 3.262272420752307e-06, + "loss": 0.3187, + "step": 8481 + }, + { + "epoch": 1.451651548861886, + "grad_norm": 2.305129289627075, + "learning_rate": 3.2551020897233914e-06, + "loss": 0.2394, + "step": 8482 + }, + { + "epoch": 1.451822693821667, + "grad_norm": 2.379843235015869, + "learning_rate": 3.247938688201742e-06, + "loss": 0.2671, + "step": 8483 + }, + { + "epoch": 
1.451993838781448, + "grad_norm": 17.739065170288086, + "learning_rate": 3.240782220413765e-06, + "loss": 1.3484, + "step": 8484 + }, + { + "epoch": 1.4521649837412287, + "grad_norm": 17.64982032775879, + "learning_rate": 3.2336326905817978e-06, + "loss": 1.6587, + "step": 8485 + }, + { + "epoch": 1.4523361287010097, + "grad_norm": 13.729183197021484, + "learning_rate": 3.226490102924064e-06, + "loss": 1.1662, + "step": 8486 + }, + { + "epoch": 1.4525072736607907, + "grad_norm": 3.2244491577148438, + "learning_rate": 3.21935446165471e-06, + "loss": 0.2813, + "step": 8487 + }, + { + "epoch": 1.4526784186205717, + "grad_norm": 14.36533260345459, + "learning_rate": 3.212225770983771e-06, + "loss": 1.0181, + "step": 8488 + }, + { + "epoch": 1.4528495635803527, + "grad_norm": 4.451156139373779, + "learning_rate": 3.2051040351171793e-06, + "loss": 0.2937, + "step": 8489 + }, + { + "epoch": 1.4530207085401334, + "grad_norm": 10.659801483154297, + "learning_rate": 3.197989258256773e-06, + "loss": 0.8253, + "step": 8490 + }, + { + "epoch": 1.4531918534999144, + "grad_norm": 67.64859008789062, + "learning_rate": 3.190881444600289e-06, + "loss": 7.7092, + "step": 8491 + }, + { + "epoch": 1.4533629984596954, + "grad_norm": 17.89799690246582, + "learning_rate": 3.1837805983413382e-06, + "loss": 2.0522, + "step": 8492 + }, + { + "epoch": 1.4535341434194762, + "grad_norm": 10.325241088867188, + "learning_rate": 3.176686723669438e-06, + "loss": 0.76, + "step": 8493 + }, + { + "epoch": 1.4537052883792572, + "grad_norm": 20.618074417114258, + "learning_rate": 3.1695998247699774e-06, + "loss": 5.0703, + "step": 8494 + }, + { + "epoch": 1.4538764333390382, + "grad_norm": 11.815807342529297, + "learning_rate": 3.1625199058242455e-06, + "loss": 0.9483, + "step": 8495 + }, + { + "epoch": 1.4540475782988191, + "grad_norm": 0.6206079125404358, + "learning_rate": 3.1554469710094e-06, + "loss": 0.1304, + "step": 8496 + }, + { + "epoch": 1.4542187232586001, + "grad_norm": 10.281078338623047, + "learning_rate": 3.1483810244984806e-06, + "loss": 0.7949, + "step": 8497 + }, + { + "epoch": 1.454389868218381, + "grad_norm": 11.412726402282715, + "learning_rate": 3.1413220704604133e-06, + "loss": 0.9857, + "step": 8498 + }, + { + "epoch": 1.4545610131781619, + "grad_norm": 21.183530807495117, + "learning_rate": 3.1342701130599823e-06, + "loss": 2.652, + "step": 8499 + }, + { + "epoch": 1.4547321581379429, + "grad_norm": 13.151970863342285, + "learning_rate": 3.1272251564578587e-06, + "loss": 0.9153, + "step": 8500 + }, + { + "epoch": 1.4549033030977236, + "grad_norm": 25.7855224609375, + "learning_rate": 3.1201872048105813e-06, + "loss": 5.2122, + "step": 8501 + }, + { + "epoch": 1.4550744480575046, + "grad_norm": 16.713407516479492, + "learning_rate": 3.1131562622705432e-06, + "loss": 1.2021, + "step": 8502 + }, + { + "epoch": 1.4552455930172856, + "grad_norm": 0.524751603603363, + "learning_rate": 3.1061323329860185e-06, + "loss": 0.1278, + "step": 8503 + }, + { + "epoch": 1.4554167379770666, + "grad_norm": 11.288585662841797, + "learning_rate": 3.0991154211011303e-06, + "loss": 1.0099, + "step": 8504 + }, + { + "epoch": 1.4555878829368476, + "grad_norm": 16.54046058654785, + "learning_rate": 3.0921055307558714e-06, + "loss": 1.4352, + "step": 8505 + }, + { + "epoch": 1.4557590278966284, + "grad_norm": 15.869155883789062, + "learning_rate": 3.0851026660860844e-06, + "loss": 1.9399, + "step": 8506 + }, + { + "epoch": 1.4559301728564094, + "grad_norm": 21.86159896850586, + "learning_rate": 3.0781068312234732e-06, + "loss": 
2.5342, + "step": 8507 + }, + { + "epoch": 1.4561013178161903, + "grad_norm": 13.476174354553223, + "learning_rate": 3.0711180302955888e-06, + "loss": 1.2246, + "step": 8508 + }, + { + "epoch": 1.456272462775971, + "grad_norm": 14.098896026611328, + "learning_rate": 3.0641362674258305e-06, + "loss": 1.0533, + "step": 8509 + }, + { + "epoch": 1.456443607735752, + "grad_norm": 9.295037269592285, + "learning_rate": 3.057161546733457e-06, + "loss": 0.7267, + "step": 8510 + }, + { + "epoch": 1.456614752695533, + "grad_norm": 20.52536392211914, + "learning_rate": 3.050193872333554e-06, + "loss": 2.0941, + "step": 8511 + }, + { + "epoch": 1.456785897655314, + "grad_norm": 22.563631057739258, + "learning_rate": 3.043233248337066e-06, + "loss": 5.463, + "step": 8512 + }, + { + "epoch": 1.456957042615095, + "grad_norm": 13.759818077087402, + "learning_rate": 3.036279678850776e-06, + "loss": 1.2918, + "step": 8513 + }, + { + "epoch": 1.457128187574876, + "grad_norm": 11.931748390197754, + "learning_rate": 3.0293331679772893e-06, + "loss": 1.1969, + "step": 8514 + }, + { + "epoch": 1.4572993325346568, + "grad_norm": 8.23759651184082, + "learning_rate": 3.0223937198150675e-06, + "loss": 0.6667, + "step": 8515 + }, + { + "epoch": 1.4574704774944378, + "grad_norm": 21.343276977539062, + "learning_rate": 3.015461338458386e-06, + "loss": 1.5682, + "step": 8516 + }, + { + "epoch": 1.4576416224542188, + "grad_norm": 4.6625142097473145, + "learning_rate": 3.0085360279973707e-06, + "loss": 0.2648, + "step": 8517 + }, + { + "epoch": 1.4578127674139996, + "grad_norm": 0.3689694404602051, + "learning_rate": 3.0016177925179555e-06, + "loss": 0.1261, + "step": 8518 + }, + { + "epoch": 1.4579839123737806, + "grad_norm": 5.428666114807129, + "learning_rate": 2.994706636101918e-06, + "loss": 0.578, + "step": 8519 + }, + { + "epoch": 1.4581550573335615, + "grad_norm": 76.22441101074219, + "learning_rate": 2.9878025628268467e-06, + "loss": 8.0354, + "step": 8520 + }, + { + "epoch": 1.4583262022933425, + "grad_norm": 20.39149284362793, + "learning_rate": 2.98090557676615e-06, + "loss": 1.7984, + "step": 8521 + }, + { + "epoch": 1.4584973472531235, + "grad_norm": 15.403654098510742, + "learning_rate": 2.974015681989063e-06, + "loss": 0.8956, + "step": 8522 + }, + { + "epoch": 1.4586684922129043, + "grad_norm": 17.665454864501953, + "learning_rate": 2.9671328825606414e-06, + "loss": 1.8613, + "step": 8523 + }, + { + "epoch": 1.4588396371726853, + "grad_norm": 16.507240295410156, + "learning_rate": 2.9602571825417383e-06, + "loss": 1.4073, + "step": 8524 + }, + { + "epoch": 1.4590107821324663, + "grad_norm": 12.627859115600586, + "learning_rate": 2.953388585989036e-06, + "loss": 1.0848, + "step": 8525 + }, + { + "epoch": 1.459181927092247, + "grad_norm": 10.567094802856445, + "learning_rate": 2.946527096955005e-06, + "loss": 0.7461, + "step": 8526 + }, + { + "epoch": 1.459353072052028, + "grad_norm": 7.751945972442627, + "learning_rate": 2.9396727194879446e-06, + "loss": 0.7472, + "step": 8527 + }, + { + "epoch": 1.459524217011809, + "grad_norm": 0.37635478377342224, + "learning_rate": 2.932825457631943e-06, + "loss": 0.118, + "step": 8528 + }, + { + "epoch": 1.45969536197159, + "grad_norm": 12.689519882202148, + "learning_rate": 2.92598531542689e-06, + "loss": 0.8756, + "step": 8529 + }, + { + "epoch": 1.459866506931371, + "grad_norm": 0.5429931879043579, + "learning_rate": 2.9191522969084895e-06, + "loss": 0.1225, + "step": 8530 + }, + { + "epoch": 1.4600376518911518, + "grad_norm": 6.63957405090332, + "learning_rate": 
2.9123264061082245e-06, + "loss": 0.5892, + "step": 8531 + }, + { + "epoch": 1.4602087968509327, + "grad_norm": 18.1749267578125, + "learning_rate": 2.9055076470533786e-06, + "loss": 1.3724, + "step": 8532 + }, + { + "epoch": 1.4603799418107137, + "grad_norm": 13.266453742980957, + "learning_rate": 2.898696023767044e-06, + "loss": 1.0603, + "step": 8533 + }, + { + "epoch": 1.4605510867704945, + "grad_norm": 20.12327766418457, + "learning_rate": 2.8918915402680758e-06, + "loss": 1.895, + "step": 8534 + }, + { + "epoch": 1.4607222317302755, + "grad_norm": 18.49260139465332, + "learning_rate": 2.8850942005711372e-06, + "loss": 1.4397, + "step": 8535 + }, + { + "epoch": 1.4608933766900565, + "grad_norm": 10.08874797821045, + "learning_rate": 2.8783040086866656e-06, + "loss": 0.7415, + "step": 8536 + }, + { + "epoch": 1.4610645216498375, + "grad_norm": 27.016815185546875, + "learning_rate": 2.8715209686208783e-06, + "loss": 2.752, + "step": 8537 + }, + { + "epoch": 1.4612356666096185, + "grad_norm": 15.482584953308105, + "learning_rate": 2.86474508437579e-06, + "loss": 0.9181, + "step": 8538 + }, + { + "epoch": 1.4614068115693992, + "grad_norm": 3.8112120628356934, + "learning_rate": 2.8579763599491715e-06, + "loss": 0.2904, + "step": 8539 + }, + { + "epoch": 1.4615779565291802, + "grad_norm": 13.542598724365234, + "learning_rate": 2.8512147993345898e-06, + "loss": 1.2935, + "step": 8540 + }, + { + "epoch": 1.4617491014889612, + "grad_norm": 10.883723258972168, + "learning_rate": 2.8444604065213693e-06, + "loss": 0.6941, + "step": 8541 + }, + { + "epoch": 1.461920246448742, + "grad_norm": 14.303625106811523, + "learning_rate": 2.8377131854946162e-06, + "loss": 1.1057, + "step": 8542 + }, + { + "epoch": 1.462091391408523, + "grad_norm": 12.465928077697754, + "learning_rate": 2.8309731402351957e-06, + "loss": 0.9065, + "step": 8543 + }, + { + "epoch": 1.462262536368304, + "grad_norm": 12.034969329833984, + "learning_rate": 2.824240274719748e-06, + "loss": 0.8766, + "step": 8544 + }, + { + "epoch": 1.462433681328085, + "grad_norm": 13.294035911560059, + "learning_rate": 2.8175145929206762e-06, + "loss": 0.9318, + "step": 8545 + }, + { + "epoch": 1.462604826287866, + "grad_norm": 17.143617630004883, + "learning_rate": 2.8107960988061376e-06, + "loss": 1.6987, + "step": 8546 + }, + { + "epoch": 1.4627759712476467, + "grad_norm": 20.387392044067383, + "learning_rate": 2.804084796340059e-06, + "loss": 1.8917, + "step": 8547 + }, + { + "epoch": 1.4629471162074277, + "grad_norm": 2.370588541030884, + "learning_rate": 2.797380689482116e-06, + "loss": 0.2348, + "step": 8548 + }, + { + "epoch": 1.4631182611672087, + "grad_norm": 12.722402572631836, + "learning_rate": 2.7906837821877373e-06, + "loss": 0.9854, + "step": 8549 + }, + { + "epoch": 1.4632894061269894, + "grad_norm": 17.461929321289062, + "learning_rate": 2.783994078408118e-06, + "loss": 1.605, + "step": 8550 + }, + { + "epoch": 1.4634605510867704, + "grad_norm": 11.405351638793945, + "learning_rate": 2.777311582090181e-06, + "loss": 0.8213, + "step": 8551 + }, + { + "epoch": 1.4636316960465514, + "grad_norm": 25.37442970275879, + "learning_rate": 2.7706362971766212e-06, + "loss": 5.4851, + "step": 8552 + }, + { + "epoch": 1.4638028410063324, + "grad_norm": 15.426714897155762, + "learning_rate": 2.7639682276058583e-06, + "loss": 1.2276, + "step": 8553 + }, + { + "epoch": 1.4639739859661134, + "grad_norm": 3.5394811630249023, + "learning_rate": 2.7573073773120645e-06, + "loss": 0.3064, + "step": 8554 + }, + { + "epoch": 1.4641451309258942, + 
"grad_norm": 13.284134864807129, + "learning_rate": 2.7506537502251582e-06, + "loss": 0.9739, + "step": 8555 + }, + { + "epoch": 1.4643162758856751, + "grad_norm": 12.07269287109375, + "learning_rate": 2.7440073502707796e-06, + "loss": 0.9605, + "step": 8556 + }, + { + "epoch": 1.4644874208454561, + "grad_norm": 16.062152862548828, + "learning_rate": 2.737368181370323e-06, + "loss": 1.4946, + "step": 8557 + }, + { + "epoch": 1.464658565805237, + "grad_norm": 15.084619522094727, + "learning_rate": 2.730736247440901e-06, + "loss": 1.3072, + "step": 8558 + }, + { + "epoch": 1.464829710765018, + "grad_norm": 18.281511306762695, + "learning_rate": 2.7241115523953707e-06, + "loss": 2.1375, + "step": 8559 + }, + { + "epoch": 1.4650008557247989, + "grad_norm": 11.279488563537598, + "learning_rate": 2.7174941001423083e-06, + "loss": 1.1234, + "step": 8560 + }, + { + "epoch": 1.4651720006845799, + "grad_norm": 10.39518928527832, + "learning_rate": 2.710883894586018e-06, + "loss": 0.9096, + "step": 8561 + }, + { + "epoch": 1.4653431456443609, + "grad_norm": 14.887674331665039, + "learning_rate": 2.7042809396265377e-06, + "loss": 1.049, + "step": 8562 + }, + { + "epoch": 1.4655142906041418, + "grad_norm": 15.582239151000977, + "learning_rate": 2.697685239159614e-06, + "loss": 1.2409, + "step": 8563 + }, + { + "epoch": 1.4656854355639226, + "grad_norm": 22.435829162597656, + "learning_rate": 2.691096797076726e-06, + "loss": 2.2822, + "step": 8564 + }, + { + "epoch": 1.4658565805237036, + "grad_norm": 5.671708106994629, + "learning_rate": 2.6845156172650536e-06, + "loss": 0.3486, + "step": 8565 + }, + { + "epoch": 1.4660277254834846, + "grad_norm": 0.4237624406814575, + "learning_rate": 2.677941703607515e-06, + "loss": 0.1282, + "step": 8566 + }, + { + "epoch": 1.4661988704432654, + "grad_norm": 12.79686164855957, + "learning_rate": 2.6713750599827287e-06, + "loss": 0.8523, + "step": 8567 + }, + { + "epoch": 1.4663700154030463, + "grad_norm": 0.37470367550849915, + "learning_rate": 2.664815690265019e-06, + "loss": 0.1181, + "step": 8568 + }, + { + "epoch": 1.4665411603628273, + "grad_norm": 12.265129089355469, + "learning_rate": 2.6582635983244203e-06, + "loss": 1.1186, + "step": 8569 + }, + { + "epoch": 1.4667123053226083, + "grad_norm": 11.174092292785645, + "learning_rate": 2.6517187880266853e-06, + "loss": 1.097, + "step": 8570 + }, + { + "epoch": 1.4668834502823893, + "grad_norm": 22.1987247467041, + "learning_rate": 2.645181263233255e-06, + "loss": 2.09, + "step": 8571 + }, + { + "epoch": 1.46705459524217, + "grad_norm": 18.45955467224121, + "learning_rate": 2.6386510278012844e-06, + "loss": 1.7561, + "step": 8572 + }, + { + "epoch": 1.467225740201951, + "grad_norm": 0.5314795970916748, + "learning_rate": 2.632128085583616e-06, + "loss": 0.1276, + "step": 8573 + }, + { + "epoch": 1.467396885161732, + "grad_norm": 11.474678039550781, + "learning_rate": 2.6256124404288017e-06, + "loss": 0.8089, + "step": 8574 + }, + { + "epoch": 1.4675680301215128, + "grad_norm": 13.592512130737305, + "learning_rate": 2.6191040961810716e-06, + "loss": 1.0905, + "step": 8575 + }, + { + "epoch": 1.4677391750812938, + "grad_norm": 17.502023696899414, + "learning_rate": 2.6126030566803714e-06, + "loss": 1.3819, + "step": 8576 + }, + { + "epoch": 1.4679103200410748, + "grad_norm": 5.090403079986572, + "learning_rate": 2.606109325762316e-06, + "loss": 0.3459, + "step": 8577 + }, + { + "epoch": 1.4680814650008558, + "grad_norm": 12.40960693359375, + "learning_rate": 2.599622907258223e-06, + "loss": 1.0282, + "step": 8578 + 
}, + { + "epoch": 1.4682526099606368, + "grad_norm": 18.674455642700195, + "learning_rate": 2.5931438049950794e-06, + "loss": 1.7005, + "step": 8579 + }, + { + "epoch": 1.4684237549204175, + "grad_norm": 15.828418731689453, + "learning_rate": 2.586672022795575e-06, + "loss": 1.1007, + "step": 8580 + }, + { + "epoch": 1.4685948998801985, + "grad_norm": 27.76209259033203, + "learning_rate": 2.5802075644780626e-06, + "loss": 5.3068, + "step": 8581 + }, + { + "epoch": 1.4687660448399795, + "grad_norm": 8.822689056396484, + "learning_rate": 2.5737504338565887e-06, + "loss": 0.6865, + "step": 8582 + }, + { + "epoch": 1.4689371897997603, + "grad_norm": 22.21077537536621, + "learning_rate": 2.5673006347408658e-06, + "loss": 5.1728, + "step": 8583 + }, + { + "epoch": 1.4691083347595413, + "grad_norm": 3.085292339324951, + "learning_rate": 2.5608581709362878e-06, + "loss": 0.2355, + "step": 8584 + }, + { + "epoch": 1.4692794797193223, + "grad_norm": 0.7077583074569702, + "learning_rate": 2.5544230462439175e-06, + "loss": 0.1373, + "step": 8585 + }, + { + "epoch": 1.4694506246791033, + "grad_norm": 15.744388580322266, + "learning_rate": 2.5479952644604786e-06, + "loss": 1.1297, + "step": 8586 + }, + { + "epoch": 1.4696217696388842, + "grad_norm": 15.804190635681152, + "learning_rate": 2.5415748293783887e-06, + "loss": 1.2382, + "step": 8587 + }, + { + "epoch": 1.469792914598665, + "grad_norm": 0.848552942276001, + "learning_rate": 2.5351617447857057e-06, + "loss": 0.1375, + "step": 8588 + }, + { + "epoch": 1.469964059558446, + "grad_norm": 11.672784805297852, + "learning_rate": 2.5287560144661563e-06, + "loss": 1.0634, + "step": 8589 + }, + { + "epoch": 1.470135204518227, + "grad_norm": 12.981428146362305, + "learning_rate": 2.5223576421991362e-06, + "loss": 1.2401, + "step": 8590 + }, + { + "epoch": 1.4703063494780078, + "grad_norm": 10.640856742858887, + "learning_rate": 2.51596663175969e-06, + "loss": 0.9131, + "step": 8591 + }, + { + "epoch": 1.4704774944377887, + "grad_norm": 8.373154640197754, + "learning_rate": 2.509582986918527e-06, + "loss": 0.622, + "step": 8592 + }, + { + "epoch": 1.4706486393975697, + "grad_norm": 59.0521354675293, + "learning_rate": 2.503206711442004e-06, + "loss": 7.9709, + "step": 8593 + }, + { + "epoch": 1.4708197843573507, + "grad_norm": 0.8816530704498291, + "learning_rate": 2.4968378090921375e-06, + "loss": 0.1537, + "step": 8594 + }, + { + "epoch": 1.4709909293171317, + "grad_norm": 15.040864944458008, + "learning_rate": 2.4904762836265873e-06, + "loss": 1.1132, + "step": 8595 + }, + { + "epoch": 1.4711620742769125, + "grad_norm": 7.551300048828125, + "learning_rate": 2.4841221387986577e-06, + "loss": 0.7954, + "step": 8596 + }, + { + "epoch": 1.4713332192366935, + "grad_norm": 20.366872787475586, + "learning_rate": 2.4777753783573078e-06, + "loss": 4.8486, + "step": 8597 + }, + { + "epoch": 1.4715043641964745, + "grad_norm": 17.65959930419922, + "learning_rate": 2.4714360060471375e-06, + "loss": 1.2182, + "step": 8598 + }, + { + "epoch": 1.4716755091562552, + "grad_norm": 18.21508026123047, + "learning_rate": 2.4651040256083857e-06, + "loss": 1.6719, + "step": 8599 + }, + { + "epoch": 1.4718466541160362, + "grad_norm": 22.4281063079834, + "learning_rate": 2.4587794407769304e-06, + "loss": 2.8263, + "step": 8600 + }, + { + "epoch": 1.4720177990758172, + "grad_norm": 16.28119659423828, + "learning_rate": 2.452462255284282e-06, + "loss": 1.1152, + "step": 8601 + }, + { + "epoch": 1.4721889440355982, + "grad_norm": 16.165203094482422, + "learning_rate": 
2.446152472857595e-06, + "loss": 0.9293, + "step": 8602 + }, + { + "epoch": 1.4723600889953792, + "grad_norm": 2.8993372917175293, + "learning_rate": 2.4398500972196423e-06, + "loss": 0.2628, + "step": 8603 + }, + { + "epoch": 1.47253123395516, + "grad_norm": 18.692373275756836, + "learning_rate": 2.433555132088846e-06, + "loss": 1.4589, + "step": 8604 + }, + { + "epoch": 1.472702378914941, + "grad_norm": 0.8061926960945129, + "learning_rate": 2.4272675811792348e-06, + "loss": 0.1395, + "step": 8605 + }, + { + "epoch": 1.472873523874722, + "grad_norm": 12.81713581085205, + "learning_rate": 2.42098744820048e-06, + "loss": 1.1243, + "step": 8606 + }, + { + "epoch": 1.4730446688345027, + "grad_norm": 13.554957389831543, + "learning_rate": 2.414714736857868e-06, + "loss": 0.753, + "step": 8607 + }, + { + "epoch": 1.4732158137942837, + "grad_norm": 16.154888153076172, + "learning_rate": 2.408449450852297e-06, + "loss": 1.0956, + "step": 8608 + }, + { + "epoch": 1.4733869587540647, + "grad_norm": 3.45200777053833, + "learning_rate": 2.4021915938803094e-06, + "loss": 0.2685, + "step": 8609 + }, + { + "epoch": 1.4735581037138457, + "grad_norm": 14.280316352844238, + "learning_rate": 2.3959411696340507e-06, + "loss": 1.3286, + "step": 8610 + }, + { + "epoch": 1.4737292486736266, + "grad_norm": 16.223106384277344, + "learning_rate": 2.3896981818012697e-06, + "loss": 1.2277, + "step": 8611 + }, + { + "epoch": 1.4739003936334074, + "grad_norm": 0.4728884696960449, + "learning_rate": 2.3834626340653476e-06, + "loss": 0.1268, + "step": 8612 + }, + { + "epoch": 1.4740715385931884, + "grad_norm": 0.5109286308288574, + "learning_rate": 2.3772345301052595e-06, + "loss": 0.1254, + "step": 8613 + }, + { + "epoch": 1.4742426835529694, + "grad_norm": 14.686467170715332, + "learning_rate": 2.3710138735956044e-06, + "loss": 1.4269, + "step": 8614 + }, + { + "epoch": 1.4744138285127504, + "grad_norm": 10.182634353637695, + "learning_rate": 2.36480066820657e-06, + "loss": 0.9876, + "step": 8615 + }, + { + "epoch": 1.4745849734725311, + "grad_norm": 0.5462074279785156, + "learning_rate": 2.3585949176039652e-06, + "loss": 0.1283, + "step": 8616 + }, + { + "epoch": 1.4747561184323121, + "grad_norm": 34.710514068603516, + "learning_rate": 2.3523966254491863e-06, + "loss": 5.5256, + "step": 8617 + }, + { + "epoch": 1.4749272633920931, + "grad_norm": 3.7688565254211426, + "learning_rate": 2.34620579539923e-06, + "loss": 0.3516, + "step": 8618 + }, + { + "epoch": 1.4750984083518741, + "grad_norm": 4.640887260437012, + "learning_rate": 2.340022431106706e-06, + "loss": 0.483, + "step": 8619 + }, + { + "epoch": 1.475269553311655, + "grad_norm": 20.615394592285156, + "learning_rate": 2.3338465362198074e-06, + "loss": 2.5548, + "step": 8620 + }, + { + "epoch": 1.4754406982714359, + "grad_norm": 0.4951671361923218, + "learning_rate": 2.327678114382315e-06, + "loss": 0.1262, + "step": 8621 + }, + { + "epoch": 1.4756118432312169, + "grad_norm": 5.877453327178955, + "learning_rate": 2.321517169233614e-06, + "loss": 0.5921, + "step": 8622 + }, + { + "epoch": 1.4757829881909978, + "grad_norm": 19.228952407836914, + "learning_rate": 2.3153637044086616e-06, + "loss": 2.3989, + "step": 8623 + }, + { + "epoch": 1.4759541331507786, + "grad_norm": 29.218852996826172, + "learning_rate": 2.3092177235380225e-06, + "loss": 5.3795, + "step": 8624 + }, + { + "epoch": 1.4761252781105596, + "grad_norm": 13.688109397888184, + "learning_rate": 2.303079230247827e-06, + "loss": 1.3003, + "step": 8625 + }, + { + "epoch": 1.4762964230703406, + 
"grad_norm": 26.70513343811035, + "learning_rate": 2.2969482281597953e-06, + "loss": 5.4995, + "step": 8626 + }, + { + "epoch": 1.4764675680301216, + "grad_norm": 0.4475199282169342, + "learning_rate": 2.2908247208912337e-06, + "loss": 0.1284, + "step": 8627 + }, + { + "epoch": 1.4766387129899026, + "grad_norm": 13.632052421569824, + "learning_rate": 2.284708712055012e-06, + "loss": 1.2663, + "step": 8628 + }, + { + "epoch": 1.4768098579496833, + "grad_norm": 1.8141242265701294, + "learning_rate": 2.278600205259589e-06, + "loss": 0.2333, + "step": 8629 + }, + { + "epoch": 1.4769810029094643, + "grad_norm": 15.47907829284668, + "learning_rate": 2.2724992041089965e-06, + "loss": 1.2852, + "step": 8630 + }, + { + "epoch": 1.4771521478692453, + "grad_norm": 21.15369987487793, + "learning_rate": 2.266405712202827e-06, + "loss": 1.6042, + "step": 8631 + }, + { + "epoch": 1.477323292829026, + "grad_norm": 17.852474212646484, + "learning_rate": 2.2603197331362564e-06, + "loss": 1.5289, + "step": 8632 + }, + { + "epoch": 1.477494437788807, + "grad_norm": 0.7824785709381104, + "learning_rate": 2.2542412705000153e-06, + "loss": 0.1304, + "step": 8633 + }, + { + "epoch": 1.477665582748588, + "grad_norm": 9.270437240600586, + "learning_rate": 2.248170327880414e-06, + "loss": 1.1586, + "step": 8634 + }, + { + "epoch": 1.477836727708369, + "grad_norm": 6.792728900909424, + "learning_rate": 2.2421069088593083e-06, + "loss": 0.5544, + "step": 8635 + }, + { + "epoch": 1.47800787266815, + "grad_norm": 0.3804835081100464, + "learning_rate": 2.2360510170141316e-06, + "loss": 0.1178, + "step": 8636 + }, + { + "epoch": 1.4781790176279308, + "grad_norm": 12.946402549743652, + "learning_rate": 2.2300026559178667e-06, + "loss": 1.2082, + "step": 8637 + }, + { + "epoch": 1.4783501625877118, + "grad_norm": 14.585646629333496, + "learning_rate": 2.223961829139051e-06, + "loss": 0.9811, + "step": 8638 + }, + { + "epoch": 1.4785213075474928, + "grad_norm": 5.403097629547119, + "learning_rate": 2.21792854024179e-06, + "loss": 0.2892, + "step": 8639 + }, + { + "epoch": 1.4786924525072735, + "grad_norm": 12.016549110412598, + "learning_rate": 2.211902792785725e-06, + "loss": 0.9699, + "step": 8640 + }, + { + "epoch": 1.4788635974670545, + "grad_norm": 42.36448669433594, + "learning_rate": 2.2058845903260595e-06, + "loss": 7.0021, + "step": 8641 + }, + { + "epoch": 1.4790347424268355, + "grad_norm": 18.061737060546875, + "learning_rate": 2.1998739364135446e-06, + "loss": 2.103, + "step": 8642 + }, + { + "epoch": 1.4792058873866165, + "grad_norm": 11.131844520568848, + "learning_rate": 2.1938708345944703e-06, + "loss": 0.8494, + "step": 8643 + }, + { + "epoch": 1.4793770323463975, + "grad_norm": 23.128252029418945, + "learning_rate": 2.18787528841068e-06, + "loss": 3.2058, + "step": 8644 + }, + { + "epoch": 1.4795481773061783, + "grad_norm": 16.91937828063965, + "learning_rate": 2.1818873013995495e-06, + "loss": 1.1437, + "step": 8645 + }, + { + "epoch": 1.4797193222659593, + "grad_norm": 10.451518058776855, + "learning_rate": 2.175906877094007e-06, + "loss": 0.6853, + "step": 8646 + }, + { + "epoch": 1.4798904672257402, + "grad_norm": 4.0505146980285645, + "learning_rate": 2.1699340190225057e-06, + "loss": 0.3101, + "step": 8647 + }, + { + "epoch": 1.480061612185521, + "grad_norm": 23.27374267578125, + "learning_rate": 2.163968730709045e-06, + "loss": 5.2235, + "step": 8648 + }, + { + "epoch": 1.480232757145302, + "grad_norm": 10.298914909362793, + "learning_rate": 2.1580110156731525e-06, + "loss": 1.0408, + "step": 8649 + 
}, + { + "epoch": 1.480403902105083, + "grad_norm": 9.977930068969727, + "learning_rate": 2.1520608774298815e-06, + "loss": 0.9702, + "step": 8650 + }, + { + "epoch": 1.480575047064864, + "grad_norm": 17.494873046875, + "learning_rate": 2.1461183194898325e-06, + "loss": 1.6695, + "step": 8651 + }, + { + "epoch": 1.480746192024645, + "grad_norm": 22.446674346923828, + "learning_rate": 2.140183345359124e-06, + "loss": 3.1251, + "step": 8652 + }, + { + "epoch": 1.4809173369844257, + "grad_norm": 12.42232608795166, + "learning_rate": 2.1342559585393933e-06, + "loss": 0.8405, + "step": 8653 + }, + { + "epoch": 1.4810884819442067, + "grad_norm": 1.9596792459487915, + "learning_rate": 2.1283361625278113e-06, + "loss": 0.2301, + "step": 8654 + }, + { + "epoch": 1.4812596269039877, + "grad_norm": 4.211131572723389, + "learning_rate": 2.1224239608170644e-06, + "loss": 0.3821, + "step": 8655 + }, + { + "epoch": 1.4814307718637685, + "grad_norm": 0.6189964413642883, + "learning_rate": 2.1165193568953633e-06, + "loss": 0.1314, + "step": 8656 + }, + { + "epoch": 1.4816019168235495, + "grad_norm": 16.088436126708984, + "learning_rate": 2.1106223542464304e-06, + "loss": 1.3685, + "step": 8657 + }, + { + "epoch": 1.4817730617833305, + "grad_norm": 17.937074661254883, + "learning_rate": 2.1047329563495036e-06, + "loss": 1.8444, + "step": 8658 + }, + { + "epoch": 1.4819442067431114, + "grad_norm": 11.515583038330078, + "learning_rate": 2.098851166679344e-06, + "loss": 1.2523, + "step": 8659 + }, + { + "epoch": 1.4821153517028924, + "grad_norm": 19.75164031982422, + "learning_rate": 2.0929769887062074e-06, + "loss": 1.6112, + "step": 8660 + }, + { + "epoch": 1.4822864966626732, + "grad_norm": 14.153520584106445, + "learning_rate": 2.087110425895869e-06, + "loss": 1.2514, + "step": 8661 + }, + { + "epoch": 1.4824576416224542, + "grad_norm": 13.268850326538086, + "learning_rate": 2.081251481709619e-06, + "loss": 1.0761, + "step": 8662 + }, + { + "epoch": 1.4826287865822352, + "grad_norm": 2.862231492996216, + "learning_rate": 2.075400159604234e-06, + "loss": 0.3298, + "step": 8663 + }, + { + "epoch": 1.4827999315420162, + "grad_norm": 40.3896484375, + "learning_rate": 2.0695564630320083e-06, + "loss": 6.5952, + "step": 8664 + }, + { + "epoch": 1.482971076501797, + "grad_norm": 5.492036819458008, + "learning_rate": 2.06372039544073e-06, + "loss": 0.5523, + "step": 8665 + }, + { + "epoch": 1.483142221461578, + "grad_norm": 14.369952201843262, + "learning_rate": 2.0578919602736813e-06, + "loss": 1.6392, + "step": 8666 + }, + { + "epoch": 1.483313366421359, + "grad_norm": 1.8615891933441162, + "learning_rate": 2.0520711609696573e-06, + "loss": 0.2698, + "step": 8667 + }, + { + "epoch": 1.48348451138114, + "grad_norm": 24.372041702270508, + "learning_rate": 2.04625800096293e-06, + "loss": 5.1996, + "step": 8668 + }, + { + "epoch": 1.483655656340921, + "grad_norm": 0.42217203974723816, + "learning_rate": 2.040452483683279e-06, + "loss": 0.124, + "step": 8669 + }, + { + "epoch": 1.4838268013007017, + "grad_norm": 18.064851760864258, + "learning_rate": 2.0346546125559622e-06, + "loss": 1.4861, + "step": 8670 + }, + { + "epoch": 1.4839979462604826, + "grad_norm": 13.571256637573242, + "learning_rate": 2.0288643910017405e-06, + "loss": 1.5556, + "step": 8671 + }, + { + "epoch": 1.4841690912202636, + "grad_norm": 4.089590072631836, + "learning_rate": 2.023081822436843e-06, + "loss": 0.3336, + "step": 8672 + }, + { + "epoch": 1.4843402361800444, + "grad_norm": 11.174686431884766, + "learning_rate": 2.0173069102729983e-06, 
+ "loss": 0.916, + "step": 8673 + }, + { + "epoch": 1.4845113811398254, + "grad_norm": 19.2801570892334, + "learning_rate": 2.0115396579174183e-06, + "loss": 2.3274, + "step": 8674 + }, + { + "epoch": 1.4846825260996064, + "grad_norm": 15.093779563903809, + "learning_rate": 2.0057800687727825e-06, + "loss": 1.1786, + "step": 8675 + }, + { + "epoch": 1.4848536710593874, + "grad_norm": 9.493477821350098, + "learning_rate": 2.000028146237264e-06, + "loss": 0.855, + "step": 8676 + }, + { + "epoch": 1.4850248160191684, + "grad_norm": 9.813469886779785, + "learning_rate": 1.9942838937045015e-06, + "loss": 0.7856, + "step": 8677 + }, + { + "epoch": 1.4851959609789491, + "grad_norm": 8.897295951843262, + "learning_rate": 1.98854731456361e-06, + "loss": 0.864, + "step": 8678 + }, + { + "epoch": 1.4853671059387301, + "grad_norm": 13.642171859741211, + "learning_rate": 1.982818412199187e-06, + "loss": 0.8152, + "step": 8679 + }, + { + "epoch": 1.485538250898511, + "grad_norm": 15.76082706451416, + "learning_rate": 1.9770971899912856e-06, + "loss": 1.2928, + "step": 8680 + }, + { + "epoch": 1.4857093958582919, + "grad_norm": 18.846452713012695, + "learning_rate": 1.9713836513154427e-06, + "loss": 1.6299, + "step": 8681 + }, + { + "epoch": 1.4858805408180729, + "grad_norm": 52.782161712646484, + "learning_rate": 1.965677799542647e-06, + "loss": 7.6975, + "step": 8682 + }, + { + "epoch": 1.4860516857778538, + "grad_norm": 17.99054527282715, + "learning_rate": 1.9599796380393632e-06, + "loss": 1.1316, + "step": 8683 + }, + { + "epoch": 1.4862228307376348, + "grad_norm": 11.52419662475586, + "learning_rate": 1.9542891701675206e-06, + "loss": 1.085, + "step": 8684 + }, + { + "epoch": 1.4863939756974158, + "grad_norm": 0.395636647939682, + "learning_rate": 1.948606399284495e-06, + "loss": 0.1223, + "step": 8685 + }, + { + "epoch": 1.4865651206571966, + "grad_norm": 3.862820625305176, + "learning_rate": 1.942931328743135e-06, + "loss": 0.305, + "step": 8686 + }, + { + "epoch": 1.4867362656169776, + "grad_norm": 17.18566131591797, + "learning_rate": 1.9372639618917378e-06, + "loss": 1.8984, + "step": 8687 + }, + { + "epoch": 1.4869074105767586, + "grad_norm": 5.835579872131348, + "learning_rate": 1.9316043020740616e-06, + "loss": 0.5111, + "step": 8688 + }, + { + "epoch": 1.4870785555365393, + "grad_norm": 13.971384048461914, + "learning_rate": 1.9259523526293123e-06, + "loss": 1.2442, + "step": 8689 + }, + { + "epoch": 1.4872497004963203, + "grad_norm": 16.53280258178711, + "learning_rate": 1.9203081168921454e-06, + "loss": 1.0105, + "step": 8690 + }, + { + "epoch": 1.4874208454561013, + "grad_norm": 10.08396053314209, + "learning_rate": 1.9146715981926743e-06, + "loss": 1.1688, + "step": 8691 + }, + { + "epoch": 1.4875919904158823, + "grad_norm": 21.6230411529541, + "learning_rate": 1.909042799856447e-06, + "loss": 3.0133, + "step": 8692 + }, + { + "epoch": 1.4877631353756633, + "grad_norm": 16.009479522705078, + "learning_rate": 1.9034217252044617e-06, + "loss": 1.4418, + "step": 8693 + }, + { + "epoch": 1.487934280335444, + "grad_norm": 0.32677701115608215, + "learning_rate": 1.897808377553174e-06, + "loss": 0.1142, + "step": 8694 + }, + { + "epoch": 1.488105425295225, + "grad_norm": 12.716803550720215, + "learning_rate": 1.8922027602144532e-06, + "loss": 1.1369, + "step": 8695 + }, + { + "epoch": 1.488276570255006, + "grad_norm": 12.640409469604492, + "learning_rate": 1.8866048764956318e-06, + "loss": 0.9637, + "step": 8696 + }, + { + "epoch": 1.4884477152147868, + "grad_norm": 21.609838485717773, + 
"learning_rate": 1.8810147296994663e-06, + "loss": 5.1968, + "step": 8697 + }, + { + "epoch": 1.4886188601745678, + "grad_norm": 12.18921184539795, + "learning_rate": 1.8754323231241466e-06, + "loss": 0.9312, + "step": 8698 + }, + { + "epoch": 1.4887900051343488, + "grad_norm": 18.10647964477539, + "learning_rate": 1.8698576600633066e-06, + "loss": 2.0963, + "step": 8699 + }, + { + "epoch": 1.4889611500941298, + "grad_norm": 13.537436485290527, + "learning_rate": 1.864290743806002e-06, + "loss": 1.1168, + "step": 8700 + }, + { + "epoch": 1.4891322950539108, + "grad_norm": 11.2808198928833, + "learning_rate": 1.858731577636727e-06, + "loss": 0.6953, + "step": 8701 + }, + { + "epoch": 1.4893034400136915, + "grad_norm": 15.852746963500977, + "learning_rate": 1.8531801648353913e-06, + "loss": 1.2251, + "step": 8702 + }, + { + "epoch": 1.4894745849734725, + "grad_norm": 19.878219604492188, + "learning_rate": 1.8476365086773417e-06, + "loss": 1.6645, + "step": 8703 + }, + { + "epoch": 1.4896457299332535, + "grad_norm": 26.932106018066406, + "learning_rate": 1.8421006124333368e-06, + "loss": 5.3491, + "step": 8704 + }, + { + "epoch": 1.4898168748930343, + "grad_norm": 20.04827880859375, + "learning_rate": 1.8365724793695754e-06, + "loss": 2.6135, + "step": 8705 + }, + { + "epoch": 1.4899880198528153, + "grad_norm": 3.5480690002441406, + "learning_rate": 1.8310521127476538e-06, + "loss": 0.2846, + "step": 8706 + }, + { + "epoch": 1.4901591648125962, + "grad_norm": 13.716347694396973, + "learning_rate": 1.8255395158246046e-06, + "loss": 0.9585, + "step": 8707 + }, + { + "epoch": 1.4903303097723772, + "grad_norm": 9.723167419433594, + "learning_rate": 1.8200346918528598e-06, + "loss": 0.8903, + "step": 8708 + }, + { + "epoch": 1.4905014547321582, + "grad_norm": 16.154159545898438, + "learning_rate": 1.8145376440802813e-06, + "loss": 1.6429, + "step": 8709 + }, + { + "epoch": 1.490672599691939, + "grad_norm": 0.3765588700771332, + "learning_rate": 1.8090483757501281e-06, + "loss": 0.1177, + "step": 8710 + }, + { + "epoch": 1.49084374465172, + "grad_norm": 14.236617088317871, + "learning_rate": 1.8035668901010844e-06, + "loss": 1.093, + "step": 8711 + }, + { + "epoch": 1.491014889611501, + "grad_norm": 13.419577598571777, + "learning_rate": 1.79809319036723e-06, + "loss": 0.9498, + "step": 8712 + }, + { + "epoch": 1.4911860345712817, + "grad_norm": 11.613882064819336, + "learning_rate": 1.7926272797780546e-06, + "loss": 0.954, + "step": 8713 + }, + { + "epoch": 1.4913571795310627, + "grad_norm": 15.349943161010742, + "learning_rate": 1.7871691615584607e-06, + "loss": 1.1268, + "step": 8714 + }, + { + "epoch": 1.4915283244908437, + "grad_norm": 1.526613473892212, + "learning_rate": 1.7817188389287337e-06, + "loss": 0.2058, + "step": 8715 + }, + { + "epoch": 1.4916994694506247, + "grad_norm": 12.078264236450195, + "learning_rate": 1.7762763151045863e-06, + "loss": 0.7651, + "step": 8716 + }, + { + "epoch": 1.4918706144104057, + "grad_norm": 0.5502970814704895, + "learning_rate": 1.7708415932971083e-06, + "loss": 0.1337, + "step": 8717 + }, + { + "epoch": 1.4920417593701867, + "grad_norm": 9.057726860046387, + "learning_rate": 1.7654146767127915e-06, + "loss": 0.8282, + "step": 8718 + }, + { + "epoch": 1.4922129043299674, + "grad_norm": 25.093358993530273, + "learning_rate": 1.759995568553533e-06, + "loss": 5.2401, + "step": 8719 + }, + { + "epoch": 1.4923840492897484, + "grad_norm": 3.6737241744995117, + "learning_rate": 1.754584272016605e-06, + "loss": 0.2781, + "step": 8720 + }, + { + "epoch": 
1.4925551942495294, + "grad_norm": 11.911739349365234, + "learning_rate": 1.7491807902946871e-06, + "loss": 0.9151, + "step": 8721 + }, + { + "epoch": 1.4927263392093102, + "grad_norm": 27.066675186157227, + "learning_rate": 1.7437851265758375e-06, + "loss": 3.2981, + "step": 8722 + }, + { + "epoch": 1.4928974841690912, + "grad_norm": 0.41131046414375305, + "learning_rate": 1.7383972840435115e-06, + "loss": 0.1204, + "step": 8723 + }, + { + "epoch": 1.4930686291288722, + "grad_norm": 9.278553009033203, + "learning_rate": 1.7330172658765391e-06, + "loss": 0.7775, + "step": 8724 + }, + { + "epoch": 1.4932397740886532, + "grad_norm": 0.3530387878417969, + "learning_rate": 1.7276450752491352e-06, + "loss": 0.117, + "step": 8725 + }, + { + "epoch": 1.4934109190484341, + "grad_norm": 23.6711368560791, + "learning_rate": 1.7222807153309123e-06, + "loss": 3.1004, + "step": 8726 + }, + { + "epoch": 1.493582064008215, + "grad_norm": 2.162137269973755, + "learning_rate": 1.7169241892868403e-06, + "loss": 0.2361, + "step": 8727 + }, + { + "epoch": 1.493753208967996, + "grad_norm": 24.51152229309082, + "learning_rate": 1.7115755002772848e-06, + "loss": 3.2392, + "step": 8728 + }, + { + "epoch": 1.493924353927777, + "grad_norm": 8.81994342803955, + "learning_rate": 1.7062346514579747e-06, + "loss": 0.9084, + "step": 8729 + }, + { + "epoch": 1.4940954988875577, + "grad_norm": 12.69940185546875, + "learning_rate": 1.7009016459800193e-06, + "loss": 0.7481, + "step": 8730 + }, + { + "epoch": 1.4942666438473386, + "grad_norm": 18.715112686157227, + "learning_rate": 1.695576486989905e-06, + "loss": 2.2736, + "step": 8731 + }, + { + "epoch": 1.4944377888071196, + "grad_norm": 17.015085220336914, + "learning_rate": 1.690259177629475e-06, + "loss": 1.2533, + "step": 8732 + }, + { + "epoch": 1.4946089337669006, + "grad_norm": 21.677297592163086, + "learning_rate": 1.6849497210359589e-06, + "loss": 3.0261, + "step": 8733 + }, + { + "epoch": 1.4947800787266816, + "grad_norm": 11.646499633789062, + "learning_rate": 1.6796481203419367e-06, + "loss": 0.8067, + "step": 8734 + }, + { + "epoch": 1.4949512236864624, + "grad_norm": 7.48187255859375, + "learning_rate": 1.674354378675365e-06, + "loss": 0.6705, + "step": 8735 + }, + { + "epoch": 1.4951223686462434, + "grad_norm": 14.336213111877441, + "learning_rate": 1.669068499159554e-06, + "loss": 1.146, + "step": 8736 + }, + { + "epoch": 1.4952935136060244, + "grad_norm": 9.391974449157715, + "learning_rate": 1.6637904849131886e-06, + "loss": 0.875, + "step": 8737 + }, + { + "epoch": 1.4954646585658051, + "grad_norm": 15.934311866760254, + "learning_rate": 1.6585203390502996e-06, + "loss": 1.2158, + "step": 8738 + }, + { + "epoch": 1.4956358035255861, + "grad_norm": 13.133450508117676, + "learning_rate": 1.6532580646802831e-06, + "loss": 0.9799, + "step": 8739 + }, + { + "epoch": 1.495806948485367, + "grad_norm": 13.175771713256836, + "learning_rate": 1.6480036649078856e-06, + "loss": 0.8717, + "step": 8740 + }, + { + "epoch": 1.495978093445148, + "grad_norm": 20.734832763671875, + "learning_rate": 1.6427571428332171e-06, + "loss": 2.6231, + "step": 8741 + }, + { + "epoch": 1.496149238404929, + "grad_norm": 4.2126970291137695, + "learning_rate": 1.6375185015517252e-06, + "loss": 0.2936, + "step": 8742 + }, + { + "epoch": 1.4963203833647098, + "grad_norm": 8.44736385345459, + "learning_rate": 1.632287744154224e-06, + "loss": 0.7255, + "step": 8743 + }, + { + "epoch": 1.4964915283244908, + "grad_norm": 7.583530902862549, + "learning_rate": 1.6270648737268646e-06, + "loss": 
0.4966, + "step": 8744 + }, + { + "epoch": 1.4966626732842718, + "grad_norm": 12.269996643066406, + "learning_rate": 1.6218498933511438e-06, + "loss": 1.023, + "step": 8745 + }, + { + "epoch": 1.4968338182440526, + "grad_norm": 12.351924896240234, + "learning_rate": 1.6166428061039174e-06, + "loss": 1.0181, + "step": 8746 + }, + { + "epoch": 1.4970049632038336, + "grad_norm": 12.050604820251465, + "learning_rate": 1.6114436150573607e-06, + "loss": 0.9488, + "step": 8747 + }, + { + "epoch": 1.4971761081636146, + "grad_norm": 10.327962875366211, + "learning_rate": 1.6062523232790172e-06, + "loss": 0.7329, + "step": 8748 + }, + { + "epoch": 1.4973472531233956, + "grad_norm": 0.43802082538604736, + "learning_rate": 1.6010689338317564e-06, + "loss": 0.1212, + "step": 8749 + }, + { + "epoch": 1.4975183980831765, + "grad_norm": 10.928634643554688, + "learning_rate": 1.595893449773777e-06, + "loss": 0.8106, + "step": 8750 + }, + { + "epoch": 1.4976895430429573, + "grad_norm": 11.663643836975098, + "learning_rate": 1.5907258741586316e-06, + "loss": 0.9545, + "step": 8751 + }, + { + "epoch": 1.4978606880027383, + "grad_norm": 8.708173751831055, + "learning_rate": 1.5855662100351897e-06, + "loss": 0.9869, + "step": 8752 + }, + { + "epoch": 1.4980318329625193, + "grad_norm": 18.2066707611084, + "learning_rate": 1.580414460447666e-06, + "loss": 0.5682, + "step": 8753 + }, + { + "epoch": 1.4982029779223, + "grad_norm": 14.140893936157227, + "learning_rate": 1.5752706284355993e-06, + "loss": 1.1747, + "step": 8754 + }, + { + "epoch": 1.498374122882081, + "grad_norm": 9.528616905212402, + "learning_rate": 1.5701347170338553e-06, + "loss": 0.6355, + "step": 8755 + }, + { + "epoch": 1.498545267841862, + "grad_norm": 18.601449966430664, + "learning_rate": 1.5650067292726332e-06, + "loss": 2.5453, + "step": 8756 + }, + { + "epoch": 1.498716412801643, + "grad_norm": 19.107868194580078, + "learning_rate": 1.5598866681774481e-06, + "loss": 2.7961, + "step": 8757 + }, + { + "epoch": 1.498887557761424, + "grad_norm": 4.660526275634766, + "learning_rate": 1.5547745367691486e-06, + "loss": 0.3109, + "step": 8758 + }, + { + "epoch": 1.4990587027212048, + "grad_norm": 13.609661102294922, + "learning_rate": 1.5496703380639016e-06, + "loss": 1.2268, + "step": 8759 + }, + { + "epoch": 1.4992298476809858, + "grad_norm": 23.278398513793945, + "learning_rate": 1.5445740750731852e-06, + "loss": 3.1897, + "step": 8760 + }, + { + "epoch": 1.4994009926407668, + "grad_norm": 5.577650547027588, + "learning_rate": 1.539485750803809e-06, + "loss": 0.3686, + "step": 8761 + }, + { + "epoch": 1.4995721376005475, + "grad_norm": 14.022244453430176, + "learning_rate": 1.5344053682578869e-06, + "loss": 1.1382, + "step": 8762 + }, + { + "epoch": 1.4997432825603285, + "grad_norm": 20.950279235839844, + "learning_rate": 1.5293329304328558e-06, + "loss": 1.6109, + "step": 8763 + }, + { + "epoch": 1.4999144275201095, + "grad_norm": 5.289403915405273, + "learning_rate": 1.5242684403214569e-06, + "loss": 0.6656, + "step": 8764 + }, + { + "epoch": 1.5000855724798905, + "grad_norm": 3.436375856399536, + "learning_rate": 1.5192119009117534e-06, + "loss": 0.4689, + "step": 8765 + } + ], + "logging_steps": 1, + "max_steps": 17529, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1753, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 
0.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}