diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,63102 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.9991117033089054, + "eval_steps": 500, + "global_step": 9004, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00044414834554741284, + "grad_norm": 14.241565838210393, + "learning_rate": 1.1098779134295229e-08, + "loss": 0.533, + "step": 1 + }, + { + "epoch": 0.0008882966910948257, + "grad_norm": 16.278912065775813, + "learning_rate": 2.2197558268590458e-08, + "loss": 0.5359, + "step": 2 + }, + { + "epoch": 0.0013324450366422385, + "grad_norm": 15.38532645021989, + "learning_rate": 3.329633740288568e-08, + "loss": 0.6437, + "step": 3 + }, + { + "epoch": 0.0017765933821896514, + "grad_norm": 13.00040105537638, + "learning_rate": 4.4395116537180915e-08, + "loss": 0.5256, + "step": 4 + }, + { + "epoch": 0.0022207417277370642, + "grad_norm": 13.99913175512498, + "learning_rate": 5.549389567147614e-08, + "loss": 0.559, + "step": 5 + }, + { + "epoch": 0.002664890073284477, + "grad_norm": 14.827666239037775, + "learning_rate": 6.659267480577137e-08, + "loss": 0.5342, + "step": 6 + }, + { + "epoch": 0.00310903841883189, + "grad_norm": 14.56618533431443, + "learning_rate": 7.76914539400666e-08, + "loss": 0.5818, + "step": 7 + }, + { + "epoch": 0.0035531867643793028, + "grad_norm": 14.021029607148948, + "learning_rate": 8.879023307436183e-08, + "loss": 0.5116, + "step": 8 + }, + { + "epoch": 0.003997335109926716, + "grad_norm": 13.230591443503124, + "learning_rate": 9.988901220865707e-08, + "loss": 0.5254, + "step": 9 + }, + { + "epoch": 0.0044414834554741284, + "grad_norm": 14.105717100587825, + "learning_rate": 1.1098779134295228e-07, + "loss": 0.5469, + "step": 10 + }, + { + "epoch": 0.004885631801021541, + "grad_norm": 13.393742170063549, + "learning_rate": 1.220865704772475e-07, + "loss": 0.5273, + "step": 11 + }, + { + "epoch": 0.005329780146568954, + "grad_norm": 13.614380052118197, + "learning_rate": 1.3318534961154273e-07, + "loss": 0.5127, + "step": 12 + }, + { + "epoch": 0.005773928492116367, + "grad_norm": 12.784423466221073, + "learning_rate": 1.4428412874583796e-07, + "loss": 0.4306, + "step": 13 + }, + { + "epoch": 0.00621807683766378, + "grad_norm": 13.80866223671461, + "learning_rate": 1.553829078801332e-07, + "loss": 0.5439, + "step": 14 + }, + { + "epoch": 0.006662225183211193, + "grad_norm": 14.041859557487046, + "learning_rate": 1.6648168701442844e-07, + "loss": 0.5188, + "step": 15 + }, + { + "epoch": 0.0071063735287586055, + "grad_norm": 14.936881953932252, + "learning_rate": 1.7758046614872366e-07, + "loss": 0.5683, + "step": 16 + }, + { + "epoch": 0.007550521874306018, + "grad_norm": 15.650434961515957, + "learning_rate": 1.886792452830189e-07, + "loss": 0.5353, + "step": 17 + }, + { + "epoch": 0.007994670219853431, + "grad_norm": 14.344720416498966, + "learning_rate": 1.9977802441731414e-07, + "loss": 0.4958, + "step": 18 + }, + { + "epoch": 0.008438818565400843, + "grad_norm": 15.791401491309836, + "learning_rate": 2.1087680355160934e-07, + "loss": 0.5652, + "step": 19 + }, + { + "epoch": 0.008882966910948257, + "grad_norm": 13.010603153221817, + "learning_rate": 2.2197558268590456e-07, + "loss": 0.4343, + "step": 20 + }, + { + "epoch": 0.009327115256495669, + "grad_norm": 15.52141534233184, + "learning_rate": 2.330743618201998e-07, + "loss": 0.5389, + "step": 21 + }, + { + "epoch": 0.009771263602043083, + "grad_norm": 15.756642888500245, + "learning_rate": 2.44173140954495e-07, + "loss": 0.5326, + "step": 22 + }, + { + "epoch": 0.010215411947590495, + "grad_norm": 15.268952002568577, + "learning_rate": 2.5527192008879024e-07, + "loss": 0.4824, + "step": 23 + }, + { + "epoch": 0.010659560293137908, + "grad_norm": 15.932949745262599, + "learning_rate": 2.6637069922308547e-07, + "loss": 0.5079, + "step": 24 + }, + { + "epoch": 0.01110370863868532, + "grad_norm": 15.58611205142026, + "learning_rate": 2.7746947835738074e-07, + "loss": 0.476, + "step": 25 + }, + { + "epoch": 0.011547856984232734, + "grad_norm": 15.375627217870338, + "learning_rate": 2.885682574916759e-07, + "loss": 0.4744, + "step": 26 + }, + { + "epoch": 0.011992005329780146, + "grad_norm": 16.173866872463734, + "learning_rate": 2.9966703662597114e-07, + "loss": 0.5543, + "step": 27 + }, + { + "epoch": 0.01243615367532756, + "grad_norm": 11.889556738869189, + "learning_rate": 3.107658157602664e-07, + "loss": 0.459, + "step": 28 + }, + { + "epoch": 0.012880302020874972, + "grad_norm": 9.805467442732954, + "learning_rate": 3.218645948945616e-07, + "loss": 0.4039, + "step": 29 + }, + { + "epoch": 0.013324450366422385, + "grad_norm": 9.109396128992588, + "learning_rate": 3.3296337402885687e-07, + "loss": 0.374, + "step": 30 + }, + { + "epoch": 0.013768598711969797, + "grad_norm": 10.263226704180568, + "learning_rate": 3.440621531631521e-07, + "loss": 0.5212, + "step": 31 + }, + { + "epoch": 0.014212747057517211, + "grad_norm": 8.67404802135948, + "learning_rate": 3.551609322974473e-07, + "loss": 0.449, + "step": 32 + }, + { + "epoch": 0.014656895403064623, + "grad_norm": 9.21127577620048, + "learning_rate": 3.6625971143174255e-07, + "loss": 0.4436, + "step": 33 + }, + { + "epoch": 0.015101043748612037, + "grad_norm": 8.76521126068558, + "learning_rate": 3.773584905660378e-07, + "loss": 0.4802, + "step": 34 + }, + { + "epoch": 0.015545192094159449, + "grad_norm": 8.042704299848994, + "learning_rate": 3.8845726970033295e-07, + "loss": 0.4144, + "step": 35 + }, + { + "epoch": 0.015989340439706862, + "grad_norm": 7.470228322799545, + "learning_rate": 3.995560488346283e-07, + "loss": 0.3999, + "step": 36 + }, + { + "epoch": 0.016433488785254274, + "grad_norm": 8.608115328544926, + "learning_rate": 4.1065482796892345e-07, + "loss": 0.4636, + "step": 37 + }, + { + "epoch": 0.016877637130801686, + "grad_norm": 7.697028697048977, + "learning_rate": 4.217536071032187e-07, + "loss": 0.3997, + "step": 38 + }, + { + "epoch": 0.017321785476349102, + "grad_norm": 5.663561386344377, + "learning_rate": 4.328523862375139e-07, + "loss": 0.3691, + "step": 39 + }, + { + "epoch": 0.017765933821896514, + "grad_norm": 5.938577871272599, + "learning_rate": 4.4395116537180913e-07, + "loss": 0.3926, + "step": 40 + }, + { + "epoch": 0.018210082167443926, + "grad_norm": 6.0845722806591365, + "learning_rate": 4.5504994450610435e-07, + "loss": 0.4168, + "step": 41 + }, + { + "epoch": 0.018654230512991338, + "grad_norm": 4.988124447126365, + "learning_rate": 4.661487236403996e-07, + "loss": 0.3995, + "step": 42 + }, + { + "epoch": 0.019098378858538753, + "grad_norm": 5.157391164042898, + "learning_rate": 4.772475027746949e-07, + "loss": 0.3846, + "step": 43 + }, + { + "epoch": 0.019542527204086165, + "grad_norm": 4.524592140728303, + "learning_rate": 4.8834628190899e-07, + "loss": 0.3393, + "step": 44 + }, + { + "epoch": 0.019986675549633577, + "grad_norm": 4.748116191926127, + "learning_rate": 4.994450610432853e-07, + "loss": 0.3278, + "step": 45 + }, + { + "epoch": 0.02043082389518099, + "grad_norm": 5.1431403204013755, + "learning_rate": 5.105438401775805e-07, + "loss": 0.3638, + "step": 46 + }, + { + "epoch": 0.020874972240728405, + "grad_norm": 5.298031590299299, + "learning_rate": 5.216426193118758e-07, + "loss": 0.3682, + "step": 47 + }, + { + "epoch": 0.021319120586275817, + "grad_norm": 4.678508163207935, + "learning_rate": 5.327413984461709e-07, + "loss": 0.3772, + "step": 48 + }, + { + "epoch": 0.02176326893182323, + "grad_norm": 4.39773491079716, + "learning_rate": 5.438401775804662e-07, + "loss": 0.3682, + "step": 49 + }, + { + "epoch": 0.02220741727737064, + "grad_norm": 4.313057042541789, + "learning_rate": 5.549389567147615e-07, + "loss": 0.3237, + "step": 50 + }, + { + "epoch": 0.022651565622918056, + "grad_norm": 4.449086461913763, + "learning_rate": 5.660377358490567e-07, + "loss": 0.3623, + "step": 51 + }, + { + "epoch": 0.023095713968465468, + "grad_norm": 2.9654000926481783, + "learning_rate": 5.771365149833518e-07, + "loss": 0.2696, + "step": 52 + }, + { + "epoch": 0.02353986231401288, + "grad_norm": 2.9606096375757986, + "learning_rate": 5.882352941176471e-07, + "loss": 0.2518, + "step": 53 + }, + { + "epoch": 0.023984010659560292, + "grad_norm": 3.4611785952391463, + "learning_rate": 5.993340732519423e-07, + "loss": 0.2782, + "step": 54 + }, + { + "epoch": 0.024428159005107707, + "grad_norm": 3.1814895723143284, + "learning_rate": 6.104328523862376e-07, + "loss": 0.2906, + "step": 55 + }, + { + "epoch": 0.02487230735065512, + "grad_norm": 3.1808671717848496, + "learning_rate": 6.215316315205328e-07, + "loss": 0.2983, + "step": 56 + }, + { + "epoch": 0.02531645569620253, + "grad_norm": 2.118090185635995, + "learning_rate": 6.32630410654828e-07, + "loss": 0.2158, + "step": 57 + }, + { + "epoch": 0.025760604041749943, + "grad_norm": 2.697428855954616, + "learning_rate": 6.437291897891232e-07, + "loss": 0.2815, + "step": 58 + }, + { + "epoch": 0.02620475238729736, + "grad_norm": 2.8924667859746926, + "learning_rate": 6.548279689234186e-07, + "loss": 0.2487, + "step": 59 + }, + { + "epoch": 0.02664890073284477, + "grad_norm": 2.346290297240456, + "learning_rate": 6.659267480577137e-07, + "loss": 0.2496, + "step": 60 + }, + { + "epoch": 0.027093049078392183, + "grad_norm": 2.498409492992046, + "learning_rate": 6.77025527192009e-07, + "loss": 0.2728, + "step": 61 + }, + { + "epoch": 0.027537197423939595, + "grad_norm": 3.674582885160151, + "learning_rate": 6.881243063263042e-07, + "loss": 0.2383, + "step": 62 + }, + { + "epoch": 0.02798134576948701, + "grad_norm": 2.799425394175707, + "learning_rate": 6.992230854605994e-07, + "loss": 0.3199, + "step": 63 + }, + { + "epoch": 0.028425494115034422, + "grad_norm": 2.4559947409849805, + "learning_rate": 7.103218645948946e-07, + "loss": 0.2693, + "step": 64 + }, + { + "epoch": 0.028869642460581834, + "grad_norm": 2.293048008449068, + "learning_rate": 7.214206437291898e-07, + "loss": 0.2519, + "step": 65 + }, + { + "epoch": 0.029313790806129246, + "grad_norm": 2.442411284742022, + "learning_rate": 7.325194228634851e-07, + "loss": 0.2574, + "step": 66 + }, + { + "epoch": 0.02975793915167666, + "grad_norm": 1.9858437264830013, + "learning_rate": 7.436182019977803e-07, + "loss": 0.2059, + "step": 67 + }, + { + "epoch": 0.030202087497224073, + "grad_norm": 2.561701423318645, + "learning_rate": 7.547169811320755e-07, + "loss": 0.2526, + "step": 68 + }, + { + "epoch": 0.030646235842771485, + "grad_norm": 1.7160142067734883, + "learning_rate": 7.658157602663707e-07, + "loss": 0.2103, + "step": 69 + }, + { + "epoch": 0.031090384188318897, + "grad_norm": 3.0262045436370473, + "learning_rate": 7.769145394006659e-07, + "loss": 0.278, + "step": 70 + }, + { + "epoch": 0.03153453253386631, + "grad_norm": 2.1467589387452786, + "learning_rate": 7.880133185349612e-07, + "loss": 0.2033, + "step": 71 + }, + { + "epoch": 0.031978680879413725, + "grad_norm": 2.161876762654834, + "learning_rate": 7.991120976692566e-07, + "loss": 0.2319, + "step": 72 + }, + { + "epoch": 0.03242282922496114, + "grad_norm": 2.376008135141836, + "learning_rate": 8.102108768035517e-07, + "loss": 0.2775, + "step": 73 + }, + { + "epoch": 0.03286697757050855, + "grad_norm": 2.12961751744709, + "learning_rate": 8.213096559378469e-07, + "loss": 0.2066, + "step": 74 + }, + { + "epoch": 0.033311125916055964, + "grad_norm": 2.087503322703807, + "learning_rate": 8.324084350721422e-07, + "loss": 0.2432, + "step": 75 + }, + { + "epoch": 0.03375527426160337, + "grad_norm": 2.5677509345382354, + "learning_rate": 8.435072142064374e-07, + "loss": 0.2027, + "step": 76 + }, + { + "epoch": 0.03419942260715079, + "grad_norm": 1.7422813109091475, + "learning_rate": 8.546059933407326e-07, + "loss": 0.1981, + "step": 77 + }, + { + "epoch": 0.034643570952698204, + "grad_norm": 1.9135429042700127, + "learning_rate": 8.657047724750278e-07, + "loss": 0.1914, + "step": 78 + }, + { + "epoch": 0.03508771929824561, + "grad_norm": 2.0631119197996357, + "learning_rate": 8.768035516093231e-07, + "loss": 0.2247, + "step": 79 + }, + { + "epoch": 0.03553186764379303, + "grad_norm": 2.3180619400309452, + "learning_rate": 8.879023307436183e-07, + "loss": 0.2491, + "step": 80 + }, + { + "epoch": 0.03597601598934044, + "grad_norm": 1.9425485940179281, + "learning_rate": 8.990011098779134e-07, + "loss": 0.2187, + "step": 81 + }, + { + "epoch": 0.03642016433488785, + "grad_norm": 1.8295253723979448, + "learning_rate": 9.100998890122087e-07, + "loss": 0.2012, + "step": 82 + }, + { + "epoch": 0.03686431268043527, + "grad_norm": 1.9729843745153348, + "learning_rate": 9.211986681465039e-07, + "loss": 0.209, + "step": 83 + }, + { + "epoch": 0.037308461025982675, + "grad_norm": 2.037422711387131, + "learning_rate": 9.322974472807992e-07, + "loss": 0.2094, + "step": 84 + }, + { + "epoch": 0.03775260937153009, + "grad_norm": 1.577701916798112, + "learning_rate": 9.433962264150944e-07, + "loss": 0.1858, + "step": 85 + }, + { + "epoch": 0.038196757717077506, + "grad_norm": 2.139670658650865, + "learning_rate": 9.544950055493897e-07, + "loss": 0.2254, + "step": 86 + }, + { + "epoch": 0.038640906062624915, + "grad_norm": 2.8502450993225956, + "learning_rate": 9.65593784683685e-07, + "loss": 0.2145, + "step": 87 + }, + { + "epoch": 0.03908505440817233, + "grad_norm": 1.7436844528376316, + "learning_rate": 9.7669256381798e-07, + "loss": 0.1927, + "step": 88 + }, + { + "epoch": 0.039529202753719746, + "grad_norm": 2.2129521928377796, + "learning_rate": 9.877913429522753e-07, + "loss": 0.2449, + "step": 89 + }, + { + "epoch": 0.039973351099267154, + "grad_norm": 1.7063140977463, + "learning_rate": 9.988901220865706e-07, + "loss": 0.2002, + "step": 90 + }, + { + "epoch": 0.04041749944481457, + "grad_norm": 1.7274557393115937, + "learning_rate": 1.009988901220866e-06, + "loss": 0.2143, + "step": 91 + }, + { + "epoch": 0.04086164779036198, + "grad_norm": 1.6036310126204065, + "learning_rate": 1.021087680355161e-06, + "loss": 0.209, + "step": 92 + }, + { + "epoch": 0.041305796135909394, + "grad_norm": 1.6067518053520686, + "learning_rate": 1.0321864594894562e-06, + "loss": 0.2051, + "step": 93 + }, + { + "epoch": 0.04174994448145681, + "grad_norm": 2.193990373798874, + "learning_rate": 1.0432852386237515e-06, + "loss": 0.2124, + "step": 94 + }, + { + "epoch": 0.04219409282700422, + "grad_norm": 2.0886683470736536, + "learning_rate": 1.0543840177580466e-06, + "loss": 0.2283, + "step": 95 + }, + { + "epoch": 0.04263824117255163, + "grad_norm": 2.111784187748928, + "learning_rate": 1.0654827968923419e-06, + "loss": 0.2035, + "step": 96 + }, + { + "epoch": 0.04308238951809905, + "grad_norm": 2.082762904011713, + "learning_rate": 1.0765815760266371e-06, + "loss": 0.1979, + "step": 97 + }, + { + "epoch": 0.04352653786364646, + "grad_norm": 1.9566752129472822, + "learning_rate": 1.0876803551609324e-06, + "loss": 0.2146, + "step": 98 + }, + { + "epoch": 0.04397068620919387, + "grad_norm": 1.553353783765945, + "learning_rate": 1.0987791342952277e-06, + "loss": 0.1682, + "step": 99 + }, + { + "epoch": 0.04441483455474128, + "grad_norm": 1.9387075500236326, + "learning_rate": 1.109877913429523e-06, + "loss": 0.1792, + "step": 100 + }, + { + "epoch": 0.044858982900288696, + "grad_norm": 1.980937561976891, + "learning_rate": 1.120976692563818e-06, + "loss": 0.1896, + "step": 101 + }, + { + "epoch": 0.04530313124583611, + "grad_norm": 2.4054051662002824, + "learning_rate": 1.1320754716981133e-06, + "loss": 0.2097, + "step": 102 + }, + { + "epoch": 0.04574727959138352, + "grad_norm": 1.8504294534254195, + "learning_rate": 1.1431742508324086e-06, + "loss": 0.1795, + "step": 103 + }, + { + "epoch": 0.046191427936930936, + "grad_norm": 1.8842730303029716, + "learning_rate": 1.1542730299667037e-06, + "loss": 0.1883, + "step": 104 + }, + { + "epoch": 0.046635576282478344, + "grad_norm": 1.5610944241824418, + "learning_rate": 1.165371809100999e-06, + "loss": 0.1655, + "step": 105 + }, + { + "epoch": 0.04707972462802576, + "grad_norm": 1.5819842411294622, + "learning_rate": 1.1764705882352942e-06, + "loss": 0.1811, + "step": 106 + }, + { + "epoch": 0.047523872973573175, + "grad_norm": 1.9979272057896553, + "learning_rate": 1.1875693673695895e-06, + "loss": 0.1823, + "step": 107 + }, + { + "epoch": 0.047968021319120584, + "grad_norm": 1.9458215900555198, + "learning_rate": 1.1986681465038846e-06, + "loss": 0.2136, + "step": 108 + }, + { + "epoch": 0.048412169664668, + "grad_norm": 1.8191486796264364, + "learning_rate": 1.2097669256381799e-06, + "loss": 0.2082, + "step": 109 + }, + { + "epoch": 0.048856318010215415, + "grad_norm": 1.9290831624594054, + "learning_rate": 1.2208657047724751e-06, + "loss": 0.1731, + "step": 110 + }, + { + "epoch": 0.04930046635576282, + "grad_norm": 2.1157513592152233, + "learning_rate": 1.2319644839067704e-06, + "loss": 0.1883, + "step": 111 + }, + { + "epoch": 0.04974461470131024, + "grad_norm": 2.0021290506932026, + "learning_rate": 1.2430632630410657e-06, + "loss": 0.1906, + "step": 112 + }, + { + "epoch": 0.05018876304685765, + "grad_norm": 1.9159228327922788, + "learning_rate": 1.254162042175361e-06, + "loss": 0.2176, + "step": 113 + }, + { + "epoch": 0.05063291139240506, + "grad_norm": 1.7650876352312228, + "learning_rate": 1.265260821309656e-06, + "loss": 0.1718, + "step": 114 + }, + { + "epoch": 0.05107705973795248, + "grad_norm": 2.03898072034729, + "learning_rate": 1.2763596004439513e-06, + "loss": 0.1898, + "step": 115 + }, + { + "epoch": 0.051521208083499886, + "grad_norm": 1.6693420710972704, + "learning_rate": 1.2874583795782464e-06, + "loss": 0.1759, + "step": 116 + }, + { + "epoch": 0.0519653564290473, + "grad_norm": 2.335357803271554, + "learning_rate": 1.2985571587125417e-06, + "loss": 0.1837, + "step": 117 + }, + { + "epoch": 0.05240950477459472, + "grad_norm": 1.682419177219224, + "learning_rate": 1.3096559378468371e-06, + "loss": 0.1808, + "step": 118 + }, + { + "epoch": 0.052853653120142126, + "grad_norm": 2.0519984686691126, + "learning_rate": 1.3207547169811322e-06, + "loss": 0.1982, + "step": 119 + }, + { + "epoch": 0.05329780146568954, + "grad_norm": 1.6368796114014708, + "learning_rate": 1.3318534961154275e-06, + "loss": 0.1758, + "step": 120 + }, + { + "epoch": 0.05374194981123695, + "grad_norm": 1.6286801352711888, + "learning_rate": 1.3429522752497226e-06, + "loss": 0.1936, + "step": 121 + }, + { + "epoch": 0.054186098156784365, + "grad_norm": 1.8633084178414114, + "learning_rate": 1.354051054384018e-06, + "loss": 0.202, + "step": 122 + }, + { + "epoch": 0.05463024650233178, + "grad_norm": 1.9496618888186672, + "learning_rate": 1.3651498335183131e-06, + "loss": 0.2025, + "step": 123 + }, + { + "epoch": 0.05507439484787919, + "grad_norm": 2.0859989784182353, + "learning_rate": 1.3762486126526084e-06, + "loss": 0.1903, + "step": 124 + }, + { + "epoch": 0.055518543193426605, + "grad_norm": 1.993272387486733, + "learning_rate": 1.3873473917869035e-06, + "loss": 0.2123, + "step": 125 + }, + { + "epoch": 0.05596269153897402, + "grad_norm": 2.2645503131575735, + "learning_rate": 1.3984461709211987e-06, + "loss": 0.2069, + "step": 126 + }, + { + "epoch": 0.05640683988452143, + "grad_norm": 1.581545113462678, + "learning_rate": 1.409544950055494e-06, + "loss": 0.1804, + "step": 127 + }, + { + "epoch": 0.056850988230068844, + "grad_norm": 1.9654703741649047, + "learning_rate": 1.4206437291897893e-06, + "loss": 0.1625, + "step": 128 + }, + { + "epoch": 0.05729513657561625, + "grad_norm": 2.452801004200468, + "learning_rate": 1.4317425083240844e-06, + "loss": 0.1779, + "step": 129 + }, + { + "epoch": 0.05773928492116367, + "grad_norm": 1.847246271423822, + "learning_rate": 1.4428412874583796e-06, + "loss": 0.1749, + "step": 130 + }, + { + "epoch": 0.05818343326671108, + "grad_norm": 1.9357616103684274, + "learning_rate": 1.4539400665926751e-06, + "loss": 0.2213, + "step": 131 + }, + { + "epoch": 0.05862758161225849, + "grad_norm": 1.495082490211886, + "learning_rate": 1.4650388457269702e-06, + "loss": 0.1579, + "step": 132 + }, + { + "epoch": 0.05907172995780591, + "grad_norm": 1.5859431774202346, + "learning_rate": 1.4761376248612655e-06, + "loss": 0.1606, + "step": 133 + }, + { + "epoch": 0.05951587830335332, + "grad_norm": 2.8376409272544687, + "learning_rate": 1.4872364039955605e-06, + "loss": 0.2481, + "step": 134 + }, + { + "epoch": 0.05996002664890073, + "grad_norm": 1.7612190558454792, + "learning_rate": 1.498335183129856e-06, + "loss": 0.1637, + "step": 135 + }, + { + "epoch": 0.06040417499444815, + "grad_norm": 2.070041455462641, + "learning_rate": 1.509433962264151e-06, + "loss": 0.2122, + "step": 136 + }, + { + "epoch": 0.060848323339995555, + "grad_norm": 2.3780145580102627, + "learning_rate": 1.5205327413984464e-06, + "loss": 0.186, + "step": 137 + }, + { + "epoch": 0.06129247168554297, + "grad_norm": 1.8658905992142587, + "learning_rate": 1.5316315205327414e-06, + "loss": 0.1673, + "step": 138 + }, + { + "epoch": 0.061736620031090386, + "grad_norm": 2.8172930803107077, + "learning_rate": 1.5427302996670367e-06, + "loss": 0.244, + "step": 139 + }, + { + "epoch": 0.062180768376637795, + "grad_norm": 1.6725142692022914, + "learning_rate": 1.5538290788013318e-06, + "loss": 0.157, + "step": 140 + }, + { + "epoch": 0.0626249167221852, + "grad_norm": 1.392484954569343, + "learning_rate": 1.5649278579356273e-06, + "loss": 0.1558, + "step": 141 + }, + { + "epoch": 0.06306906506773262, + "grad_norm": 1.4158061250880813, + "learning_rate": 1.5760266370699223e-06, + "loss": 0.148, + "step": 142 + }, + { + "epoch": 0.06351321341328003, + "grad_norm": 1.9695160587869651, + "learning_rate": 1.5871254162042176e-06, + "loss": 0.1468, + "step": 143 + }, + { + "epoch": 0.06395736175882745, + "grad_norm": 1.6387389663448713, + "learning_rate": 1.5982241953385131e-06, + "loss": 0.1363, + "step": 144 + }, + { + "epoch": 0.06440151010437487, + "grad_norm": 1.977141381047165, + "learning_rate": 1.6093229744728082e-06, + "loss": 0.1775, + "step": 145 + }, + { + "epoch": 0.06484565844992228, + "grad_norm": 1.866914454872281, + "learning_rate": 1.6204217536071035e-06, + "loss": 0.1676, + "step": 146 + }, + { + "epoch": 0.06528980679546968, + "grad_norm": 1.8479866982339428, + "learning_rate": 1.6315205327413985e-06, + "loss": 0.1795, + "step": 147 + }, + { + "epoch": 0.0657339551410171, + "grad_norm": 2.447758458580566, + "learning_rate": 1.6426193118756938e-06, + "loss": 0.1921, + "step": 148 + }, + { + "epoch": 0.06617810348656451, + "grad_norm": 1.928627100482751, + "learning_rate": 1.653718091009989e-06, + "loss": 0.1236, + "step": 149 + }, + { + "epoch": 0.06662225183211193, + "grad_norm": 2.496031778783847, + "learning_rate": 1.6648168701442844e-06, + "loss": 0.1263, + "step": 150 + }, + { + "epoch": 0.06706640017765934, + "grad_norm": 2.543270527791031, + "learning_rate": 1.6759156492785794e-06, + "loss": 0.1798, + "step": 151 + }, + { + "epoch": 0.06751054852320675, + "grad_norm": 2.6478561509220637, + "learning_rate": 1.6870144284128747e-06, + "loss": 0.1976, + "step": 152 + }, + { + "epoch": 0.06795469686875416, + "grad_norm": 2.046113685099785, + "learning_rate": 1.6981132075471698e-06, + "loss": 0.1412, + "step": 153 + }, + { + "epoch": 0.06839884521430158, + "grad_norm": 4.398993310024598, + "learning_rate": 1.7092119866814653e-06, + "loss": 0.2004, + "step": 154 + }, + { + "epoch": 0.06884299355984899, + "grad_norm": 1.7977539055103122, + "learning_rate": 1.7203107658157603e-06, + "loss": 0.1803, + "step": 155 + }, + { + "epoch": 0.06928714190539641, + "grad_norm": 1.9000289734288398, + "learning_rate": 1.7314095449500556e-06, + "loss": 0.1725, + "step": 156 + }, + { + "epoch": 0.06973129025094381, + "grad_norm": 2.8300001805659956, + "learning_rate": 1.742508324084351e-06, + "loss": 0.1396, + "step": 157 + }, + { + "epoch": 0.07017543859649122, + "grad_norm": 1.6221647641516457, + "learning_rate": 1.7536071032186462e-06, + "loss": 0.1377, + "step": 158 + }, + { + "epoch": 0.07061958694203864, + "grad_norm": 1.8070988364351561, + "learning_rate": 1.7647058823529414e-06, + "loss": 0.1466, + "step": 159 + }, + { + "epoch": 0.07106373528758606, + "grad_norm": 1.522016454575872, + "learning_rate": 1.7758046614872365e-06, + "loss": 0.1508, + "step": 160 + }, + { + "epoch": 0.07150788363313347, + "grad_norm": 2.5966599384816393, + "learning_rate": 1.7869034406215318e-06, + "loss": 0.1946, + "step": 161 + }, + { + "epoch": 0.07195203197868089, + "grad_norm": 2.055913601931369, + "learning_rate": 1.7980022197558269e-06, + "loss": 0.1485, + "step": 162 + }, + { + "epoch": 0.07239618032422829, + "grad_norm": 1.716589810835508, + "learning_rate": 1.8091009988901223e-06, + "loss": 0.1552, + "step": 163 + }, + { + "epoch": 0.0728403286697757, + "grad_norm": 1.324188540188503, + "learning_rate": 1.8201997780244174e-06, + "loss": 0.1217, + "step": 164 + }, + { + "epoch": 0.07328447701532312, + "grad_norm": 1.7875490213167293, + "learning_rate": 1.8312985571587127e-06, + "loss": 0.1814, + "step": 165 + }, + { + "epoch": 0.07372862536087053, + "grad_norm": 2.111076485427071, + "learning_rate": 1.8423973362930078e-06, + "loss": 0.1858, + "step": 166 + }, + { + "epoch": 0.07417277370641795, + "grad_norm": 1.8246301721111953, + "learning_rate": 1.8534961154273032e-06, + "loss": 0.1328, + "step": 167 + }, + { + "epoch": 0.07461692205196535, + "grad_norm": 7.017994935548065, + "learning_rate": 1.8645948945615983e-06, + "loss": 0.2175, + "step": 168 + }, + { + "epoch": 0.07506107039751277, + "grad_norm": 1.8833148111605775, + "learning_rate": 1.8756936736958936e-06, + "loss": 0.169, + "step": 169 + }, + { + "epoch": 0.07550521874306018, + "grad_norm": 2.1063106621517833, + "learning_rate": 1.8867924528301889e-06, + "loss": 0.1723, + "step": 170 + }, + { + "epoch": 0.0759493670886076, + "grad_norm": 2.1080508911518603, + "learning_rate": 1.8978912319644842e-06, + "loss": 0.1313, + "step": 171 + }, + { + "epoch": 0.07639351543415501, + "grad_norm": 2.042694432552121, + "learning_rate": 1.9089900110987794e-06, + "loss": 0.1561, + "step": 172 + }, + { + "epoch": 0.07683766377970241, + "grad_norm": 2.275654963885769, + "learning_rate": 1.9200887902330745e-06, + "loss": 0.2056, + "step": 173 + }, + { + "epoch": 0.07728181212524983, + "grad_norm": 1.4521350560986936, + "learning_rate": 1.93118756936737e-06, + "loss": 0.1539, + "step": 174 + }, + { + "epoch": 0.07772596047079725, + "grad_norm": 1.4721774252433903, + "learning_rate": 1.942286348501665e-06, + "loss": 0.1392, + "step": 175 + }, + { + "epoch": 0.07817010881634466, + "grad_norm": 2.236429259701826, + "learning_rate": 1.95338512763596e-06, + "loss": 0.1851, + "step": 176 + }, + { + "epoch": 0.07861425716189208, + "grad_norm": 2.0509681089981973, + "learning_rate": 1.964483906770255e-06, + "loss": 0.1613, + "step": 177 + }, + { + "epoch": 0.07905840550743949, + "grad_norm": 1.921297270021361, + "learning_rate": 1.9755826859045507e-06, + "loss": 0.1411, + "step": 178 + }, + { + "epoch": 0.07950255385298689, + "grad_norm": 2.3334407767674534, + "learning_rate": 1.9866814650388457e-06, + "loss": 0.1673, + "step": 179 + }, + { + "epoch": 0.07994670219853431, + "grad_norm": 2.4173674731585035, + "learning_rate": 1.9977802441731412e-06, + "loss": 0.1769, + "step": 180 + }, + { + "epoch": 0.08039085054408172, + "grad_norm": 2.672814520854723, + "learning_rate": 2.0088790233074363e-06, + "loss": 0.1882, + "step": 181 + }, + { + "epoch": 0.08083499888962914, + "grad_norm": 2.082354232015489, + "learning_rate": 2.019977802441732e-06, + "loss": 0.1594, + "step": 182 + }, + { + "epoch": 0.08127914723517655, + "grad_norm": 2.0548698535266965, + "learning_rate": 2.031076581576027e-06, + "loss": 0.162, + "step": 183 + }, + { + "epoch": 0.08172329558072396, + "grad_norm": 2.352430010700108, + "learning_rate": 2.042175360710322e-06, + "loss": 0.1429, + "step": 184 + }, + { + "epoch": 0.08216744392627137, + "grad_norm": 1.8421875981209739, + "learning_rate": 2.0532741398446174e-06, + "loss": 0.1575, + "step": 185 + }, + { + "epoch": 0.08261159227181879, + "grad_norm": 1.8037755953700159, + "learning_rate": 2.0643729189789125e-06, + "loss": 0.1474, + "step": 186 + }, + { + "epoch": 0.0830557406173662, + "grad_norm": 2.2301663182454887, + "learning_rate": 2.075471698113208e-06, + "loss": 0.1381, + "step": 187 + }, + { + "epoch": 0.08349988896291362, + "grad_norm": 1.8820242106298288, + "learning_rate": 2.086570477247503e-06, + "loss": 0.136, + "step": 188 + }, + { + "epoch": 0.08394403730846102, + "grad_norm": 2.0480577593839935, + "learning_rate": 2.097669256381798e-06, + "loss": 0.146, + "step": 189 + }, + { + "epoch": 0.08438818565400844, + "grad_norm": 1.844554763649842, + "learning_rate": 2.108768035516093e-06, + "loss": 0.1663, + "step": 190 + }, + { + "epoch": 0.08483233399955585, + "grad_norm": 1.5207370248204364, + "learning_rate": 2.1198668146503887e-06, + "loss": 0.1453, + "step": 191 + }, + { + "epoch": 0.08527648234510327, + "grad_norm": 1.7288600299730912, + "learning_rate": 2.1309655937846837e-06, + "loss": 0.1492, + "step": 192 + }, + { + "epoch": 0.08572063069065068, + "grad_norm": 1.6054586875361534, + "learning_rate": 2.1420643729189792e-06, + "loss": 0.1702, + "step": 193 + }, + { + "epoch": 0.0861647790361981, + "grad_norm": 2.082434204878279, + "learning_rate": 2.1531631520532743e-06, + "loss": 0.1417, + "step": 194 + }, + { + "epoch": 0.0866089273817455, + "grad_norm": 1.893129069527839, + "learning_rate": 2.1642619311875694e-06, + "loss": 0.1746, + "step": 195 + }, + { + "epoch": 0.08705307572729291, + "grad_norm": 1.5433606197712828, + "learning_rate": 2.175360710321865e-06, + "loss": 0.1434, + "step": 196 + }, + { + "epoch": 0.08749722407284033, + "grad_norm": 1.8343180586393633, + "learning_rate": 2.18645948945616e-06, + "loss": 0.1635, + "step": 197 + }, + { + "epoch": 0.08794137241838774, + "grad_norm": 1.9644492187331304, + "learning_rate": 2.1975582685904554e-06, + "loss": 0.1613, + "step": 198 + }, + { + "epoch": 0.08838552076393516, + "grad_norm": 2.3183987673393784, + "learning_rate": 2.2086570477247505e-06, + "loss": 0.1451, + "step": 199 + }, + { + "epoch": 0.08882966910948256, + "grad_norm": 1.8970811571077912, + "learning_rate": 2.219755826859046e-06, + "loss": 0.1693, + "step": 200 + }, + { + "epoch": 0.08927381745502998, + "grad_norm": 1.3438831493644725, + "learning_rate": 2.230854605993341e-06, + "loss": 0.1442, + "step": 201 + }, + { + "epoch": 0.08971796580057739, + "grad_norm": 2.0647130607040687, + "learning_rate": 2.241953385127636e-06, + "loss": 0.1483, + "step": 202 + }, + { + "epoch": 0.09016211414612481, + "grad_norm": 1.8137712195107252, + "learning_rate": 2.253052164261931e-06, + "loss": 0.1898, + "step": 203 + }, + { + "epoch": 0.09060626249167222, + "grad_norm": 1.5677896114201355, + "learning_rate": 2.2641509433962266e-06, + "loss": 0.1105, + "step": 204 + }, + { + "epoch": 0.09105041083721963, + "grad_norm": 1.5774232800494377, + "learning_rate": 2.2752497225305217e-06, + "loss": 0.1439, + "step": 205 + }, + { + "epoch": 0.09149455918276704, + "grad_norm": 2.191208018434963, + "learning_rate": 2.286348501664817e-06, + "loss": 0.1739, + "step": 206 + }, + { + "epoch": 0.09193870752831446, + "grad_norm": 1.8759698513356193, + "learning_rate": 2.2974472807991123e-06, + "loss": 0.1238, + "step": 207 + }, + { + "epoch": 0.09238285587386187, + "grad_norm": 2.193936812351504, + "learning_rate": 2.3085460599334073e-06, + "loss": 0.149, + "step": 208 + }, + { + "epoch": 0.09282700421940929, + "grad_norm": 1.7624384709819947, + "learning_rate": 2.319644839067703e-06, + "loss": 0.1395, + "step": 209 + }, + { + "epoch": 0.09327115256495669, + "grad_norm": 1.940248444895634, + "learning_rate": 2.330743618201998e-06, + "loss": 0.1321, + "step": 210 + }, + { + "epoch": 0.0937153009105041, + "grad_norm": 1.623293710115476, + "learning_rate": 2.3418423973362934e-06, + "loss": 0.1411, + "step": 211 + }, + { + "epoch": 0.09415944925605152, + "grad_norm": 2.311237234944747, + "learning_rate": 2.3529411764705885e-06, + "loss": 0.1451, + "step": 212 + }, + { + "epoch": 0.09460359760159893, + "grad_norm": 2.0964379988774366, + "learning_rate": 2.364039955604884e-06, + "loss": 0.1628, + "step": 213 + }, + { + "epoch": 0.09504774594714635, + "grad_norm": 1.57604767113886, + "learning_rate": 2.375138734739179e-06, + "loss": 0.1282, + "step": 214 + }, + { + "epoch": 0.09549189429269377, + "grad_norm": 1.446630991649051, + "learning_rate": 2.386237513873474e-06, + "loss": 0.1392, + "step": 215 + }, + { + "epoch": 0.09593604263824117, + "grad_norm": 2.274345473449785, + "learning_rate": 2.397336293007769e-06, + "loss": 0.1962, + "step": 216 + }, + { + "epoch": 0.09638019098378858, + "grad_norm": 1.5946628028114809, + "learning_rate": 2.4084350721420646e-06, + "loss": 0.138, + "step": 217 + }, + { + "epoch": 0.096824339329336, + "grad_norm": 1.882389417953473, + "learning_rate": 2.4195338512763597e-06, + "loss": 0.1903, + "step": 218 + }, + { + "epoch": 0.09726848767488341, + "grad_norm": 1.904432577295931, + "learning_rate": 2.430632630410655e-06, + "loss": 0.1481, + "step": 219 + }, + { + "epoch": 0.09771263602043083, + "grad_norm": 2.080561584835578, + "learning_rate": 2.4417314095449503e-06, + "loss": 0.1514, + "step": 220 + }, + { + "epoch": 0.09815678436597823, + "grad_norm": 1.6677943098084358, + "learning_rate": 2.4528301886792453e-06, + "loss": 0.1347, + "step": 221 + }, + { + "epoch": 0.09860093271152565, + "grad_norm": 1.7294352905463155, + "learning_rate": 2.463928967813541e-06, + "loss": 0.1386, + "step": 222 + }, + { + "epoch": 0.09904508105707306, + "grad_norm": 2.0928510956520934, + "learning_rate": 2.475027746947836e-06, + "loss": 0.1611, + "step": 223 + }, + { + "epoch": 0.09948922940262048, + "grad_norm": 1.6018044625885859, + "learning_rate": 2.4861265260821314e-06, + "loss": 0.1338, + "step": 224 + }, + { + "epoch": 0.09993337774816789, + "grad_norm": 1.9068721122681065, + "learning_rate": 2.4972253052164264e-06, + "loss": 0.1557, + "step": 225 + }, + { + "epoch": 0.1003775260937153, + "grad_norm": 1.5854054800345856, + "learning_rate": 2.508324084350722e-06, + "loss": 0.1582, + "step": 226 + }, + { + "epoch": 0.10082167443926271, + "grad_norm": 1.6327772348060883, + "learning_rate": 2.519422863485017e-06, + "loss": 0.153, + "step": 227 + }, + { + "epoch": 0.10126582278481013, + "grad_norm": 1.7221707573000986, + "learning_rate": 2.530521642619312e-06, + "loss": 0.1521, + "step": 228 + }, + { + "epoch": 0.10170997113035754, + "grad_norm": 3.8860493727438605, + "learning_rate": 2.541620421753607e-06, + "loss": 0.185, + "step": 229 + }, + { + "epoch": 0.10215411947590496, + "grad_norm": 2.2982967806121057, + "learning_rate": 2.5527192008879026e-06, + "loss": 0.1268, + "step": 230 + }, + { + "epoch": 0.10259826782145237, + "grad_norm": 2.633523229110552, + "learning_rate": 2.563817980022198e-06, + "loss": 0.1312, + "step": 231 + }, + { + "epoch": 0.10304241616699977, + "grad_norm": 1.8405348072939953, + "learning_rate": 2.5749167591564928e-06, + "loss": 0.1352, + "step": 232 + }, + { + "epoch": 0.10348656451254719, + "grad_norm": 1.5009853192812423, + "learning_rate": 2.5860155382907882e-06, + "loss": 0.1191, + "step": 233 + }, + { + "epoch": 0.1039307128580946, + "grad_norm": 1.3280238597160159, + "learning_rate": 2.5971143174250833e-06, + "loss": 0.1207, + "step": 234 + }, + { + "epoch": 0.10437486120364202, + "grad_norm": 1.412771085327836, + "learning_rate": 2.608213096559379e-06, + "loss": 0.1343, + "step": 235 + }, + { + "epoch": 0.10481900954918943, + "grad_norm": 1.4216505684340854, + "learning_rate": 2.6193118756936743e-06, + "loss": 0.1312, + "step": 236 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 1.5128985222362534, + "learning_rate": 2.630410654827969e-06, + "loss": 0.1351, + "step": 237 + }, + { + "epoch": 0.10570730624028425, + "grad_norm": 1.9599268561293408, + "learning_rate": 2.6415094339622644e-06, + "loss": 0.1403, + "step": 238 + }, + { + "epoch": 0.10615145458583167, + "grad_norm": 1.4181236006554954, + "learning_rate": 2.6526082130965595e-06, + "loss": 0.1377, + "step": 239 + }, + { + "epoch": 0.10659560293137908, + "grad_norm": 1.9917215037873872, + "learning_rate": 2.663706992230855e-06, + "loss": 0.1451, + "step": 240 + }, + { + "epoch": 0.1070397512769265, + "grad_norm": 1.683475509658225, + "learning_rate": 2.67480577136515e-06, + "loss": 0.1375, + "step": 241 + }, + { + "epoch": 0.1074838996224739, + "grad_norm": 1.8301967264376793, + "learning_rate": 2.685904550499445e-06, + "loss": 0.1464, + "step": 242 + }, + { + "epoch": 0.10792804796802132, + "grad_norm": 1.6846261045693358, + "learning_rate": 2.6970033296337406e-06, + "loss": 0.1332, + "step": 243 + }, + { + "epoch": 0.10837219631356873, + "grad_norm": 1.2774719377840502, + "learning_rate": 2.708102108768036e-06, + "loss": 0.1292, + "step": 244 + }, + { + "epoch": 0.10881634465911615, + "grad_norm": 1.595633542836346, + "learning_rate": 2.7192008879023307e-06, + "loss": 0.1386, + "step": 245 + }, + { + "epoch": 0.10926049300466356, + "grad_norm": 1.4528662121606482, + "learning_rate": 2.7302996670366262e-06, + "loss": 0.1392, + "step": 246 + }, + { + "epoch": 0.10970464135021098, + "grad_norm": 1.5309789173048087, + "learning_rate": 2.7413984461709213e-06, + "loss": 0.1271, + "step": 247 + }, + { + "epoch": 0.11014878969575838, + "grad_norm": 1.518105261331223, + "learning_rate": 2.7524972253052168e-06, + "loss": 0.1237, + "step": 248 + }, + { + "epoch": 0.1105929380413058, + "grad_norm": 1.4676341827820025, + "learning_rate": 2.7635960044395123e-06, + "loss": 0.1317, + "step": 249 + }, + { + "epoch": 0.11103708638685321, + "grad_norm": 2.146666992664208, + "learning_rate": 2.774694783573807e-06, + "loss": 0.1266, + "step": 250 + }, + { + "epoch": 0.11148123473240062, + "grad_norm": 1.450876621074019, + "learning_rate": 2.7857935627081024e-06, + "loss": 0.1124, + "step": 251 + }, + { + "epoch": 0.11192538307794804, + "grad_norm": 1.6447214379715893, + "learning_rate": 2.7968923418423975e-06, + "loss": 0.1176, + "step": 252 + }, + { + "epoch": 0.11236953142349544, + "grad_norm": 1.85196408048202, + "learning_rate": 2.807991120976693e-06, + "loss": 0.1196, + "step": 253 + }, + { + "epoch": 0.11281367976904286, + "grad_norm": 1.8297536053418253, + "learning_rate": 2.819089900110988e-06, + "loss": 0.1312, + "step": 254 + }, + { + "epoch": 0.11325782811459027, + "grad_norm": 1.7773730595281947, + "learning_rate": 2.830188679245283e-06, + "loss": 0.1343, + "step": 255 + }, + { + "epoch": 0.11370197646013769, + "grad_norm": 1.5584909202093926, + "learning_rate": 2.8412874583795786e-06, + "loss": 0.1075, + "step": 256 + }, + { + "epoch": 0.1141461248056851, + "grad_norm": 1.9920621657295152, + "learning_rate": 2.852386237513874e-06, + "loss": 0.1591, + "step": 257 + }, + { + "epoch": 0.1145902731512325, + "grad_norm": 2.0535793857620264, + "learning_rate": 2.8634850166481687e-06, + "loss": 0.1459, + "step": 258 + }, + { + "epoch": 0.11503442149677992, + "grad_norm": 1.432401039912359, + "learning_rate": 2.8745837957824642e-06, + "loss": 0.1235, + "step": 259 + }, + { + "epoch": 0.11547856984232734, + "grad_norm": 2.3190551632714427, + "learning_rate": 2.8856825749167593e-06, + "loss": 0.1608, + "step": 260 + }, + { + "epoch": 0.11592271818787475, + "grad_norm": 1.4461950992247072, + "learning_rate": 2.8967813540510548e-06, + "loss": 0.1242, + "step": 261 + }, + { + "epoch": 0.11636686653342217, + "grad_norm": 1.60132726584782, + "learning_rate": 2.9078801331853503e-06, + "loss": 0.1057, + "step": 262 + }, + { + "epoch": 0.11681101487896958, + "grad_norm": 2.5527110858786553, + "learning_rate": 2.918978912319645e-06, + "loss": 0.1735, + "step": 263 + }, + { + "epoch": 0.11725516322451698, + "grad_norm": 1.587535094612724, + "learning_rate": 2.9300776914539404e-06, + "loss": 0.1145, + "step": 264 + }, + { + "epoch": 0.1176993115700644, + "grad_norm": 1.8628254342286168, + "learning_rate": 2.9411764705882355e-06, + "loss": 0.1475, + "step": 265 + }, + { + "epoch": 0.11814345991561181, + "grad_norm": 2.029208074213613, + "learning_rate": 2.952275249722531e-06, + "loss": 0.1448, + "step": 266 + }, + { + "epoch": 0.11858760826115923, + "grad_norm": 1.788916557618341, + "learning_rate": 2.9633740288568256e-06, + "loss": 0.1223, + "step": 267 + }, + { + "epoch": 0.11903175660670665, + "grad_norm": 2.0663932534564147, + "learning_rate": 2.974472807991121e-06, + "loss": 0.1456, + "step": 268 + }, + { + "epoch": 0.11947590495225405, + "grad_norm": 1.5929695265003, + "learning_rate": 2.9855715871254166e-06, + "loss": 0.1427, + "step": 269 + }, + { + "epoch": 0.11992005329780146, + "grad_norm": 1.8366971238631566, + "learning_rate": 2.996670366259712e-06, + "loss": 0.1029, + "step": 270 + }, + { + "epoch": 0.12036420164334888, + "grad_norm": 1.4941973532109012, + "learning_rate": 3.0077691453940067e-06, + "loss": 0.1115, + "step": 271 + }, + { + "epoch": 0.1208083499888963, + "grad_norm": 2.1604567983633403, + "learning_rate": 3.018867924528302e-06, + "loss": 0.11, + "step": 272 + }, + { + "epoch": 0.12125249833444371, + "grad_norm": 1.5440708951155746, + "learning_rate": 3.0299667036625973e-06, + "loss": 0.1312, + "step": 273 + }, + { + "epoch": 0.12169664667999111, + "grad_norm": 1.1320500690500013, + "learning_rate": 3.0410654827968928e-06, + "loss": 0.1172, + "step": 274 + }, + { + "epoch": 0.12214079502553853, + "grad_norm": 2.1326279999447326, + "learning_rate": 3.0521642619311882e-06, + "loss": 0.1099, + "step": 275 + }, + { + "epoch": 0.12258494337108594, + "grad_norm": 1.8624808272019928, + "learning_rate": 3.063263041065483e-06, + "loss": 0.1061, + "step": 276 + }, + { + "epoch": 0.12302909171663336, + "grad_norm": 2.836562163833866, + "learning_rate": 3.0743618201997784e-06, + "loss": 0.1173, + "step": 277 + }, + { + "epoch": 0.12347324006218077, + "grad_norm": 1.5779920497766018, + "learning_rate": 3.0854605993340734e-06, + "loss": 0.1305, + "step": 278 + }, + { + "epoch": 0.12391738840772819, + "grad_norm": 1.4061427971159166, + "learning_rate": 3.096559378468369e-06, + "loss": 0.1063, + "step": 279 + }, + { + "epoch": 0.12436153675327559, + "grad_norm": 1.4735808311660463, + "learning_rate": 3.1076581576026636e-06, + "loss": 0.1478, + "step": 280 + }, + { + "epoch": 0.124805685098823, + "grad_norm": 1.9063172097566101, + "learning_rate": 3.118756936736959e-06, + "loss": 0.152, + "step": 281 + }, + { + "epoch": 0.1252498334443704, + "grad_norm": 1.2460125667748942, + "learning_rate": 3.1298557158712546e-06, + "loss": 0.1078, + "step": 282 + }, + { + "epoch": 0.12569398178991784, + "grad_norm": 1.422370758153891, + "learning_rate": 3.1409544950055496e-06, + "loss": 0.1143, + "step": 283 + }, + { + "epoch": 0.12613813013546524, + "grad_norm": 1.3901208342210212, + "learning_rate": 3.1520532741398447e-06, + "loss": 0.1232, + "step": 284 + }, + { + "epoch": 0.12658227848101267, + "grad_norm": 1.2571280817929795, + "learning_rate": 3.16315205327414e-06, + "loss": 0.1183, + "step": 285 + }, + { + "epoch": 0.12702642682656007, + "grad_norm": 1.7211835588268667, + "learning_rate": 3.1742508324084352e-06, + "loss": 0.1376, + "step": 286 + }, + { + "epoch": 0.1274705751721075, + "grad_norm": 1.7352722841271955, + "learning_rate": 3.1853496115427307e-06, + "loss": 0.1221, + "step": 287 + }, + { + "epoch": 0.1279147235176549, + "grad_norm": 1.608485934770127, + "learning_rate": 3.1964483906770262e-06, + "loss": 0.1287, + "step": 288 + }, + { + "epoch": 0.1283588718632023, + "grad_norm": 1.4070749655660284, + "learning_rate": 3.207547169811321e-06, + "loss": 0.1107, + "step": 289 + }, + { + "epoch": 0.12880302020874973, + "grad_norm": 1.8061966734954316, + "learning_rate": 3.2186459489456164e-06, + "loss": 0.1209, + "step": 290 + }, + { + "epoch": 0.12924716855429713, + "grad_norm": 1.9290505056364757, + "learning_rate": 3.2297447280799114e-06, + "loss": 0.1088, + "step": 291 + }, + { + "epoch": 0.12969131689984456, + "grad_norm": 1.2873703020140206, + "learning_rate": 3.240843507214207e-06, + "loss": 0.1107, + "step": 292 + }, + { + "epoch": 0.13013546524539196, + "grad_norm": 2.100967763487988, + "learning_rate": 3.2519422863485016e-06, + "loss": 0.1746, + "step": 293 + }, + { + "epoch": 0.13057961359093936, + "grad_norm": 2.9637529933084785, + "learning_rate": 3.263041065482797e-06, + "loss": 0.1408, + "step": 294 + }, + { + "epoch": 0.1310237619364868, + "grad_norm": 1.8502103362104685, + "learning_rate": 3.2741398446170925e-06, + "loss": 0.108, + "step": 295 + }, + { + "epoch": 0.1314679102820342, + "grad_norm": 1.5072896588301588, + "learning_rate": 3.2852386237513876e-06, + "loss": 0.1233, + "step": 296 + }, + { + "epoch": 0.13191205862758162, + "grad_norm": 1.9060937237646072, + "learning_rate": 3.2963374028856827e-06, + "loss": 0.1137, + "step": 297 + }, + { + "epoch": 0.13235620697312903, + "grad_norm": 1.4214687758215054, + "learning_rate": 3.307436182019978e-06, + "loss": 0.1181, + "step": 298 + }, + { + "epoch": 0.13280035531867643, + "grad_norm": 1.5173189244791243, + "learning_rate": 3.3185349611542732e-06, + "loss": 0.1221, + "step": 299 + }, + { + "epoch": 0.13324450366422386, + "grad_norm": 1.4086327372245158, + "learning_rate": 3.3296337402885687e-06, + "loss": 0.1598, + "step": 300 + }, + { + "epoch": 0.13368865200977126, + "grad_norm": 1.3949120100912162, + "learning_rate": 3.3407325194228642e-06, + "loss": 0.0996, + "step": 301 + }, + { + "epoch": 0.1341328003553187, + "grad_norm": 1.6249998744801628, + "learning_rate": 3.351831298557159e-06, + "loss": 0.0959, + "step": 302 + }, + { + "epoch": 0.1345769487008661, + "grad_norm": 1.7178562509007014, + "learning_rate": 3.3629300776914543e-06, + "loss": 0.1348, + "step": 303 + }, + { + "epoch": 0.1350210970464135, + "grad_norm": 1.7790098039504103, + "learning_rate": 3.3740288568257494e-06, + "loss": 0.1011, + "step": 304 + }, + { + "epoch": 0.13546524539196092, + "grad_norm": 1.4533709920798474, + "learning_rate": 3.385127635960045e-06, + "loss": 0.1177, + "step": 305 + }, + { + "epoch": 0.13590939373750832, + "grad_norm": 1.7170638072373428, + "learning_rate": 3.3962264150943395e-06, + "loss": 0.1264, + "step": 306 + }, + { + "epoch": 0.13635354208305575, + "grad_norm": 1.1622578542744249, + "learning_rate": 3.407325194228635e-06, + "loss": 0.1164, + "step": 307 + }, + { + "epoch": 0.13679769042860315, + "grad_norm": 1.7861497563291042, + "learning_rate": 3.4184239733629305e-06, + "loss": 0.1328, + "step": 308 + }, + { + "epoch": 0.13724183877415055, + "grad_norm": 1.2393311320446403, + "learning_rate": 3.4295227524972256e-06, + "loss": 0.0994, + "step": 309 + }, + { + "epoch": 0.13768598711969798, + "grad_norm": 1.779362058176627, + "learning_rate": 3.4406215316315207e-06, + "loss": 0.131, + "step": 310 + }, + { + "epoch": 0.13813013546524538, + "grad_norm": 1.384763433835653, + "learning_rate": 3.4517203107658157e-06, + "loss": 0.1011, + "step": 311 + }, + { + "epoch": 0.13857428381079281, + "grad_norm": 1.5455433688862117, + "learning_rate": 3.4628190899001112e-06, + "loss": 0.1216, + "step": 312 + }, + { + "epoch": 0.13901843215634022, + "grad_norm": 1.3658352699008705, + "learning_rate": 3.4739178690344067e-06, + "loss": 0.123, + "step": 313 + }, + { + "epoch": 0.13946258050188762, + "grad_norm": 1.3724682796873768, + "learning_rate": 3.485016648168702e-06, + "loss": 0.1209, + "step": 314 + }, + { + "epoch": 0.13990672884743505, + "grad_norm": 1.608691375904217, + "learning_rate": 3.496115427302997e-06, + "loss": 0.1141, + "step": 315 + }, + { + "epoch": 0.14035087719298245, + "grad_norm": 1.3598637431605427, + "learning_rate": 3.5072142064372923e-06, + "loss": 0.1528, + "step": 316 + }, + { + "epoch": 0.14079502553852988, + "grad_norm": 1.0876962111896626, + "learning_rate": 3.5183129855715874e-06, + "loss": 0.1049, + "step": 317 + }, + { + "epoch": 0.14123917388407728, + "grad_norm": 1.3385892109435766, + "learning_rate": 3.529411764705883e-06, + "loss": 0.1037, + "step": 318 + }, + { + "epoch": 0.14168332222962468, + "grad_norm": 1.7433775937165439, + "learning_rate": 3.5405105438401775e-06, + "loss": 0.1486, + "step": 319 + }, + { + "epoch": 0.1421274705751721, + "grad_norm": 1.5533508842477224, + "learning_rate": 3.551609322974473e-06, + "loss": 0.1049, + "step": 320 + }, + { + "epoch": 0.1425716189207195, + "grad_norm": 1.2029122587877374, + "learning_rate": 3.5627081021087685e-06, + "loss": 0.1098, + "step": 321 + }, + { + "epoch": 0.14301576726626694, + "grad_norm": 1.8995176312013884, + "learning_rate": 3.5738068812430636e-06, + "loss": 0.0944, + "step": 322 + }, + { + "epoch": 0.14345991561181434, + "grad_norm": 1.6602519149722867, + "learning_rate": 3.5849056603773586e-06, + "loss": 0.1243, + "step": 323 + }, + { + "epoch": 0.14390406395736177, + "grad_norm": 1.6075958194272566, + "learning_rate": 3.5960044395116537e-06, + "loss": 0.1196, + "step": 324 + }, + { + "epoch": 0.14434821230290917, + "grad_norm": 1.510226320322185, + "learning_rate": 3.607103218645949e-06, + "loss": 0.1422, + "step": 325 + }, + { + "epoch": 0.14479236064845658, + "grad_norm": 1.2802794537606514, + "learning_rate": 3.6182019977802447e-06, + "loss": 0.1244, + "step": 326 + }, + { + "epoch": 0.145236508994004, + "grad_norm": 1.1595946058732067, + "learning_rate": 3.6293007769145398e-06, + "loss": 0.0929, + "step": 327 + }, + { + "epoch": 0.1456806573395514, + "grad_norm": 1.2381936978069086, + "learning_rate": 3.640399556048835e-06, + "loss": 0.1002, + "step": 328 + }, + { + "epoch": 0.14612480568509884, + "grad_norm": 2.4354535778190742, + "learning_rate": 3.6514983351831303e-06, + "loss": 0.1377, + "step": 329 + }, + { + "epoch": 0.14656895403064624, + "grad_norm": 1.5157062223087485, + "learning_rate": 3.6625971143174254e-06, + "loss": 0.1206, + "step": 330 + }, + { + "epoch": 0.14701310237619364, + "grad_norm": 1.3681542301294034, + "learning_rate": 3.673695893451721e-06, + "loss": 0.1047, + "step": 331 + }, + { + "epoch": 0.14745725072174107, + "grad_norm": 1.9762614541590338, + "learning_rate": 3.6847946725860155e-06, + "loss": 0.1249, + "step": 332 + }, + { + "epoch": 0.14790139906728847, + "grad_norm": 1.391859368616253, + "learning_rate": 3.695893451720311e-06, + "loss": 0.1015, + "step": 333 + }, + { + "epoch": 0.1483455474128359, + "grad_norm": 1.1072542539549668, + "learning_rate": 3.7069922308546065e-06, + "loss": 0.0931, + "step": 334 + }, + { + "epoch": 0.1487896957583833, + "grad_norm": 1.4909584737380348, + "learning_rate": 3.7180910099889016e-06, + "loss": 0.1141, + "step": 335 + }, + { + "epoch": 0.1492338441039307, + "grad_norm": 1.7478929922992545, + "learning_rate": 3.7291897891231966e-06, + "loss": 0.0891, + "step": 336 + }, + { + "epoch": 0.14967799244947813, + "grad_norm": 1.5597867645297776, + "learning_rate": 3.7402885682574917e-06, + "loss": 0.128, + "step": 337 + }, + { + "epoch": 0.15012214079502553, + "grad_norm": 1.4083772110340225, + "learning_rate": 3.751387347391787e-06, + "loss": 0.1337, + "step": 338 + }, + { + "epoch": 0.15056628914057296, + "grad_norm": 2.011956681151715, + "learning_rate": 3.7624861265260827e-06, + "loss": 0.1036, + "step": 339 + }, + { + "epoch": 0.15101043748612036, + "grad_norm": 2.154404794046358, + "learning_rate": 3.7735849056603777e-06, + "loss": 0.1104, + "step": 340 + }, + { + "epoch": 0.15145458583166777, + "grad_norm": 2.1106357904010316, + "learning_rate": 3.784683684794673e-06, + "loss": 0.1686, + "step": 341 + }, + { + "epoch": 0.1518987341772152, + "grad_norm": 1.6933676844964125, + "learning_rate": 3.7957824639289683e-06, + "loss": 0.1375, + "step": 342 + }, + { + "epoch": 0.1523428825227626, + "grad_norm": 2.205537988403305, + "learning_rate": 3.8068812430632634e-06, + "loss": 0.1513, + "step": 343 + }, + { + "epoch": 0.15278703086831003, + "grad_norm": 1.2058069327729946, + "learning_rate": 3.817980022197559e-06, + "loss": 0.091, + "step": 344 + }, + { + "epoch": 0.15323117921385743, + "grad_norm": 1.400652563163732, + "learning_rate": 3.829078801331854e-06, + "loss": 0.0826, + "step": 345 + }, + { + "epoch": 0.15367532755940483, + "grad_norm": 2.057959821320217, + "learning_rate": 3.840177580466149e-06, + "loss": 0.118, + "step": 346 + }, + { + "epoch": 0.15411947590495226, + "grad_norm": 1.6604930714816526, + "learning_rate": 3.851276359600444e-06, + "loss": 0.1301, + "step": 347 + }, + { + "epoch": 0.15456362425049966, + "grad_norm": 1.033671229980745, + "learning_rate": 3.86237513873474e-06, + "loss": 0.0924, + "step": 348 + }, + { + "epoch": 0.1550077725960471, + "grad_norm": 1.17967777436608, + "learning_rate": 3.873473917869034e-06, + "loss": 0.0978, + "step": 349 + }, + { + "epoch": 0.1554519209415945, + "grad_norm": 1.650470297879078, + "learning_rate": 3.88457269700333e-06, + "loss": 0.133, + "step": 350 + }, + { + "epoch": 0.1558960692871419, + "grad_norm": 1.47519341296619, + "learning_rate": 3.895671476137625e-06, + "loss": 0.1254, + "step": 351 + }, + { + "epoch": 0.15634021763268932, + "grad_norm": 1.4035761818876917, + "learning_rate": 3.90677025527192e-06, + "loss": 0.1183, + "step": 352 + }, + { + "epoch": 0.15678436597823672, + "grad_norm": 2.2555395385843036, + "learning_rate": 3.917869034406216e-06, + "loss": 0.1202, + "step": 353 + }, + { + "epoch": 0.15722851432378415, + "grad_norm": 1.4759637016708067, + "learning_rate": 3.92896781354051e-06, + "loss": 0.1104, + "step": 354 + }, + { + "epoch": 0.15767266266933155, + "grad_norm": 2.952687567266444, + "learning_rate": 3.940066592674806e-06, + "loss": 0.122, + "step": 355 + }, + { + "epoch": 0.15811681101487898, + "grad_norm": 1.1679111801200106, + "learning_rate": 3.951165371809101e-06, + "loss": 0.0905, + "step": 356 + }, + { + "epoch": 0.15856095936042638, + "grad_norm": 1.660729161757867, + "learning_rate": 3.962264150943396e-06, + "loss": 0.1521, + "step": 357 + }, + { + "epoch": 0.15900510770597379, + "grad_norm": 1.3321535227222834, + "learning_rate": 3.9733629300776915e-06, + "loss": 0.1315, + "step": 358 + }, + { + "epoch": 0.15944925605152122, + "grad_norm": 1.1798749965091133, + "learning_rate": 3.9844617092119866e-06, + "loss": 0.0852, + "step": 359 + }, + { + "epoch": 0.15989340439706862, + "grad_norm": 1.198919817717748, + "learning_rate": 3.9955604883462825e-06, + "loss": 0.1029, + "step": 360 + }, + { + "epoch": 0.16033755274261605, + "grad_norm": 1.1799761966591196, + "learning_rate": 4.0066592674805775e-06, + "loss": 0.1009, + "step": 361 + }, + { + "epoch": 0.16078170108816345, + "grad_norm": 1.3843920770849105, + "learning_rate": 4.017758046614873e-06, + "loss": 0.1078, + "step": 362 + }, + { + "epoch": 0.16122584943371085, + "grad_norm": 1.230254034984121, + "learning_rate": 4.028856825749168e-06, + "loss": 0.1218, + "step": 363 + }, + { + "epoch": 0.16166999777925828, + "grad_norm": 1.0307816837555734, + "learning_rate": 4.039955604883464e-06, + "loss": 0.1138, + "step": 364 + }, + { + "epoch": 0.16211414612480568, + "grad_norm": 1.1369945329768874, + "learning_rate": 4.051054384017759e-06, + "loss": 0.0917, + "step": 365 + }, + { + "epoch": 0.1625582944703531, + "grad_norm": 1.3026077981860287, + "learning_rate": 4.062153163152054e-06, + "loss": 0.1209, + "step": 366 + }, + { + "epoch": 0.1630024428159005, + "grad_norm": 1.8570215631965663, + "learning_rate": 4.073251942286349e-06, + "loss": 0.103, + "step": 367 + }, + { + "epoch": 0.1634465911614479, + "grad_norm": 1.4261357373143895, + "learning_rate": 4.084350721420644e-06, + "loss": 0.1117, + "step": 368 + }, + { + "epoch": 0.16389073950699534, + "grad_norm": 1.2842977064487127, + "learning_rate": 4.09544950055494e-06, + "loss": 0.0945, + "step": 369 + }, + { + "epoch": 0.16433488785254274, + "grad_norm": 1.7942409210847103, + "learning_rate": 4.106548279689235e-06, + "loss": 0.1658, + "step": 370 + }, + { + "epoch": 0.16477903619809017, + "grad_norm": 1.2570893224456356, + "learning_rate": 4.11764705882353e-06, + "loss": 0.1238, + "step": 371 + }, + { + "epoch": 0.16522318454363757, + "grad_norm": 1.0685833693361235, + "learning_rate": 4.128745837957825e-06, + "loss": 0.0964, + "step": 372 + }, + { + "epoch": 0.16566733288918498, + "grad_norm": 1.2815898248385025, + "learning_rate": 4.13984461709212e-06, + "loss": 0.105, + "step": 373 + }, + { + "epoch": 0.1661114812347324, + "grad_norm": 1.64061710866675, + "learning_rate": 4.150943396226416e-06, + "loss": 0.1096, + "step": 374 + }, + { + "epoch": 0.1665556295802798, + "grad_norm": 1.3088826974530428, + "learning_rate": 4.16204217536071e-06, + "loss": 0.1211, + "step": 375 + }, + { + "epoch": 0.16699977792582724, + "grad_norm": 1.3003151707348597, + "learning_rate": 4.173140954495006e-06, + "loss": 0.1111, + "step": 376 + }, + { + "epoch": 0.16744392627137464, + "grad_norm": 1.3124364780409907, + "learning_rate": 4.184239733629301e-06, + "loss": 0.1085, + "step": 377 + }, + { + "epoch": 0.16788807461692204, + "grad_norm": 1.3347095844879298, + "learning_rate": 4.195338512763596e-06, + "loss": 0.1228, + "step": 378 + }, + { + "epoch": 0.16833222296246947, + "grad_norm": 1.3151237942729317, + "learning_rate": 4.206437291897892e-06, + "loss": 0.1041, + "step": 379 + }, + { + "epoch": 0.16877637130801687, + "grad_norm": 1.6082774402292863, + "learning_rate": 4.217536071032186e-06, + "loss": 0.0915, + "step": 380 + }, + { + "epoch": 0.1692205196535643, + "grad_norm": 1.2544764918032303, + "learning_rate": 4.228634850166482e-06, + "loss": 0.0903, + "step": 381 + }, + { + "epoch": 0.1696646679991117, + "grad_norm": 1.4709993024116135, + "learning_rate": 4.239733629300777e-06, + "loss": 0.1095, + "step": 382 + }, + { + "epoch": 0.1701088163446591, + "grad_norm": 1.090059538683836, + "learning_rate": 4.250832408435072e-06, + "loss": 0.09, + "step": 383 + }, + { + "epoch": 0.17055296469020653, + "grad_norm": 1.936001842072003, + "learning_rate": 4.2619311875693675e-06, + "loss": 0.125, + "step": 384 + }, + { + "epoch": 0.17099711303575393, + "grad_norm": 1.7118256852712324, + "learning_rate": 4.2730299667036625e-06, + "loss": 0.095, + "step": 385 + }, + { + "epoch": 0.17144126138130136, + "grad_norm": 2.0311105495228268, + "learning_rate": 4.2841287458379584e-06, + "loss": 0.1029, + "step": 386 + }, + { + "epoch": 0.17188540972684876, + "grad_norm": 1.444490027117435, + "learning_rate": 4.2952275249722535e-06, + "loss": 0.1232, + "step": 387 + }, + { + "epoch": 0.1723295580723962, + "grad_norm": 1.4378003146516516, + "learning_rate": 4.3063263041065486e-06, + "loss": 0.0932, + "step": 388 + }, + { + "epoch": 0.1727737064179436, + "grad_norm": 1.1043097695262434, + "learning_rate": 4.317425083240844e-06, + "loss": 0.0994, + "step": 389 + }, + { + "epoch": 0.173217854763491, + "grad_norm": 1.1954032097891434, + "learning_rate": 4.328523862375139e-06, + "loss": 0.101, + "step": 390 + }, + { + "epoch": 0.17366200310903843, + "grad_norm": 1.6447959614931191, + "learning_rate": 4.339622641509435e-06, + "loss": 0.1343, + "step": 391 + }, + { + "epoch": 0.17410615145458583, + "grad_norm": 1.5660886998105679, + "learning_rate": 4.35072142064373e-06, + "loss": 0.1052, + "step": 392 + }, + { + "epoch": 0.17455029980013326, + "grad_norm": 1.129037326613576, + "learning_rate": 4.361820199778025e-06, + "loss": 0.0949, + "step": 393 + }, + { + "epoch": 0.17499444814568066, + "grad_norm": 0.9299986418563283, + "learning_rate": 4.37291897891232e-06, + "loss": 0.091, + "step": 394 + }, + { + "epoch": 0.17543859649122806, + "grad_norm": 1.5748400271295198, + "learning_rate": 4.384017758046616e-06, + "loss": 0.1688, + "step": 395 + }, + { + "epoch": 0.1758827448367755, + "grad_norm": 0.8778899838246078, + "learning_rate": 4.395116537180911e-06, + "loss": 0.0983, + "step": 396 + }, + { + "epoch": 0.1763268931823229, + "grad_norm": 1.1516100437124952, + "learning_rate": 4.406215316315206e-06, + "loss": 0.1201, + "step": 397 + }, + { + "epoch": 0.17677104152787032, + "grad_norm": 0.9883123409205935, + "learning_rate": 4.417314095449501e-06, + "loss": 0.0693, + "step": 398 + }, + { + "epoch": 0.17721518987341772, + "grad_norm": 1.367128447993114, + "learning_rate": 4.428412874583796e-06, + "loss": 0.1009, + "step": 399 + }, + { + "epoch": 0.17765933821896512, + "grad_norm": 1.2639479044263988, + "learning_rate": 4.439511653718092e-06, + "loss": 0.093, + "step": 400 + }, + { + "epoch": 0.17810348656451255, + "grad_norm": 1.284067214420426, + "learning_rate": 4.450610432852386e-06, + "loss": 0.1029, + "step": 401 + }, + { + "epoch": 0.17854763491005995, + "grad_norm": 0.9723989283162116, + "learning_rate": 4.461709211986682e-06, + "loss": 0.093, + "step": 402 + }, + { + "epoch": 0.17899178325560738, + "grad_norm": 1.1814808630811793, + "learning_rate": 4.472807991120977e-06, + "loss": 0.1004, + "step": 403 + }, + { + "epoch": 0.17943593160115479, + "grad_norm": 1.2100875095234747, + "learning_rate": 4.483906770255272e-06, + "loss": 0.1165, + "step": 404 + }, + { + "epoch": 0.1798800799467022, + "grad_norm": 1.2480781051362362, + "learning_rate": 4.495005549389568e-06, + "loss": 0.1086, + "step": 405 + }, + { + "epoch": 0.18032422829224962, + "grad_norm": 1.4659555944931602, + "learning_rate": 4.506104328523862e-06, + "loss": 0.0966, + "step": 406 + }, + { + "epoch": 0.18076837663779702, + "grad_norm": 1.2905749286816455, + "learning_rate": 4.517203107658158e-06, + "loss": 0.1308, + "step": 407 + }, + { + "epoch": 0.18121252498334445, + "grad_norm": 1.2083135245558752, + "learning_rate": 4.528301886792453e-06, + "loss": 0.139, + "step": 408 + }, + { + "epoch": 0.18165667332889185, + "grad_norm": 1.2677821305643562, + "learning_rate": 4.539400665926748e-06, + "loss": 0.1522, + "step": 409 + }, + { + "epoch": 0.18210082167443925, + "grad_norm": 0.8309366049632025, + "learning_rate": 4.5504994450610434e-06, + "loss": 0.0955, + "step": 410 + }, + { + "epoch": 0.18254497001998668, + "grad_norm": 1.6708204443941888, + "learning_rate": 4.5615982241953385e-06, + "loss": 0.0983, + "step": 411 + }, + { + "epoch": 0.18298911836553408, + "grad_norm": 1.8028863536801523, + "learning_rate": 4.572697003329634e-06, + "loss": 0.1141, + "step": 412 + }, + { + "epoch": 0.1834332667110815, + "grad_norm": 1.6039168610165708, + "learning_rate": 4.5837957824639295e-06, + "loss": 0.14, + "step": 413 + }, + { + "epoch": 0.1838774150566289, + "grad_norm": 1.3909894579389195, + "learning_rate": 4.5948945615982245e-06, + "loss": 0.0924, + "step": 414 + }, + { + "epoch": 0.1843215634021763, + "grad_norm": 1.444002762358773, + "learning_rate": 4.60599334073252e-06, + "loss": 0.0882, + "step": 415 + }, + { + "epoch": 0.18476571174772374, + "grad_norm": 1.3517930787179577, + "learning_rate": 4.617092119866815e-06, + "loss": 0.1198, + "step": 416 + }, + { + "epoch": 0.18520986009327114, + "grad_norm": 0.9651918865616642, + "learning_rate": 4.628190899001111e-06, + "loss": 0.0794, + "step": 417 + }, + { + "epoch": 0.18565400843881857, + "grad_norm": 1.3087574560794024, + "learning_rate": 4.639289678135406e-06, + "loss": 0.1166, + "step": 418 + }, + { + "epoch": 0.18609815678436598, + "grad_norm": 1.1029726712835008, + "learning_rate": 4.650388457269701e-06, + "loss": 0.106, + "step": 419 + }, + { + "epoch": 0.18654230512991338, + "grad_norm": 1.481760865085893, + "learning_rate": 4.661487236403996e-06, + "loss": 0.1225, + "step": 420 + }, + { + "epoch": 0.1869864534754608, + "grad_norm": 1.125656650735202, + "learning_rate": 4.672586015538291e-06, + "loss": 0.0894, + "step": 421 + }, + { + "epoch": 0.1874306018210082, + "grad_norm": 1.1643460761006563, + "learning_rate": 4.683684794672587e-06, + "loss": 0.1049, + "step": 422 + }, + { + "epoch": 0.18787475016655564, + "grad_norm": 1.2081839286402132, + "learning_rate": 4.694783573806882e-06, + "loss": 0.0984, + "step": 423 + }, + { + "epoch": 0.18831889851210304, + "grad_norm": 1.0952768068994485, + "learning_rate": 4.705882352941177e-06, + "loss": 0.1083, + "step": 424 + }, + { + "epoch": 0.18876304685765047, + "grad_norm": 1.993217784490899, + "learning_rate": 4.716981132075472e-06, + "loss": 0.1198, + "step": 425 + }, + { + "epoch": 0.18920719520319787, + "grad_norm": 1.4709865866324339, + "learning_rate": 4.728079911209768e-06, + "loss": 0.1004, + "step": 426 + }, + { + "epoch": 0.18965134354874527, + "grad_norm": 1.1800698065694786, + "learning_rate": 4.739178690344062e-06, + "loss": 0.0942, + "step": 427 + }, + { + "epoch": 0.1900954918942927, + "grad_norm": 0.9004404155709735, + "learning_rate": 4.750277469478358e-06, + "loss": 0.0879, + "step": 428 + }, + { + "epoch": 0.1905396402398401, + "grad_norm": 1.3988615855599944, + "learning_rate": 4.761376248612653e-06, + "loss": 0.1313, + "step": 429 + }, + { + "epoch": 0.19098378858538753, + "grad_norm": 1.2097973380846636, + "learning_rate": 4.772475027746948e-06, + "loss": 0.1192, + "step": 430 + }, + { + "epoch": 0.19142793693093493, + "grad_norm": 1.795772492183597, + "learning_rate": 4.783573806881244e-06, + "loss": 0.1319, + "step": 431 + }, + { + "epoch": 0.19187208527648233, + "grad_norm": 1.5438576982175642, + "learning_rate": 4.794672586015538e-06, + "loss": 0.156, + "step": 432 + }, + { + "epoch": 0.19231623362202976, + "grad_norm": 1.4462148356085274, + "learning_rate": 4.805771365149834e-06, + "loss": 0.0977, + "step": 433 + }, + { + "epoch": 0.19276038196757717, + "grad_norm": 1.1013117212825567, + "learning_rate": 4.816870144284129e-06, + "loss": 0.123, + "step": 434 + }, + { + "epoch": 0.1932045303131246, + "grad_norm": 1.473729194266236, + "learning_rate": 4.827968923418424e-06, + "loss": 0.1219, + "step": 435 + }, + { + "epoch": 0.193648678658672, + "grad_norm": 1.060585950094457, + "learning_rate": 4.839067702552719e-06, + "loss": 0.0867, + "step": 436 + }, + { + "epoch": 0.1940928270042194, + "grad_norm": 1.1939948263664504, + "learning_rate": 4.8501664816870145e-06, + "loss": 0.1221, + "step": 437 + }, + { + "epoch": 0.19453697534976683, + "grad_norm": 1.0411525410331977, + "learning_rate": 4.86126526082131e-06, + "loss": 0.0955, + "step": 438 + }, + { + "epoch": 0.19498112369531423, + "grad_norm": 1.3074349171537876, + "learning_rate": 4.8723640399556054e-06, + "loss": 0.1092, + "step": 439 + }, + { + "epoch": 0.19542527204086166, + "grad_norm": 2.556790721991839, + "learning_rate": 4.8834628190899005e-06, + "loss": 0.1144, + "step": 440 + }, + { + "epoch": 0.19586942038640906, + "grad_norm": 1.5219706664834065, + "learning_rate": 4.894561598224196e-06, + "loss": 0.1353, + "step": 441 + }, + { + "epoch": 0.19631356873195646, + "grad_norm": 1.5771009476283926, + "learning_rate": 4.905660377358491e-06, + "loss": 0.1139, + "step": 442 + }, + { + "epoch": 0.1967577170775039, + "grad_norm": 1.7247841251255984, + "learning_rate": 4.9167591564927866e-06, + "loss": 0.0969, + "step": 443 + }, + { + "epoch": 0.1972018654230513, + "grad_norm": 1.118860834302166, + "learning_rate": 4.927857935627082e-06, + "loss": 0.0955, + "step": 444 + }, + { + "epoch": 0.19764601376859872, + "grad_norm": 1.4715029598755733, + "learning_rate": 4.938956714761377e-06, + "loss": 0.1001, + "step": 445 + }, + { + "epoch": 0.19809016211414612, + "grad_norm": 1.5680153762667055, + "learning_rate": 4.950055493895672e-06, + "loss": 0.1222, + "step": 446 + }, + { + "epoch": 0.19853431045969352, + "grad_norm": 1.7877852329091748, + "learning_rate": 4.961154273029967e-06, + "loss": 0.1057, + "step": 447 + }, + { + "epoch": 0.19897845880524095, + "grad_norm": 1.1650327458270198, + "learning_rate": 4.972253052164263e-06, + "loss": 0.1242, + "step": 448 + }, + { + "epoch": 0.19942260715078836, + "grad_norm": 1.0753006649574548, + "learning_rate": 4.983351831298557e-06, + "loss": 0.0912, + "step": 449 + }, + { + "epoch": 0.19986675549633579, + "grad_norm": 1.546863382694724, + "learning_rate": 4.994450610432853e-06, + "loss": 0.123, + "step": 450 + }, + { + "epoch": 0.2003109038418832, + "grad_norm": 1.3219021070497294, + "learning_rate": 5.005549389567148e-06, + "loss": 0.1167, + "step": 451 + }, + { + "epoch": 0.2007550521874306, + "grad_norm": 1.3154364501774296, + "learning_rate": 5.016648168701444e-06, + "loss": 0.1023, + "step": 452 + }, + { + "epoch": 0.20119920053297802, + "grad_norm": 1.2732051452073168, + "learning_rate": 5.027746947835739e-06, + "loss": 0.0717, + "step": 453 + }, + { + "epoch": 0.20164334887852542, + "grad_norm": 1.3206735580145712, + "learning_rate": 5.038845726970034e-06, + "loss": 0.1095, + "step": 454 + }, + { + "epoch": 0.20208749722407285, + "grad_norm": 1.0332614998616947, + "learning_rate": 5.049944506104328e-06, + "loss": 0.0845, + "step": 455 + }, + { + "epoch": 0.20253164556962025, + "grad_norm": 1.211744462644113, + "learning_rate": 5.061043285238624e-06, + "loss": 0.1066, + "step": 456 + }, + { + "epoch": 0.20297579391516768, + "grad_norm": 1.3376009797970827, + "learning_rate": 5.072142064372919e-06, + "loss": 0.0905, + "step": 457 + }, + { + "epoch": 0.20341994226071508, + "grad_norm": 1.2905140024192596, + "learning_rate": 5.083240843507214e-06, + "loss": 0.1011, + "step": 458 + }, + { + "epoch": 0.20386409060626248, + "grad_norm": 1.1671668945164224, + "learning_rate": 5.09433962264151e-06, + "loss": 0.1027, + "step": 459 + }, + { + "epoch": 0.2043082389518099, + "grad_norm": 2.1761489486755363, + "learning_rate": 5.105438401775805e-06, + "loss": 0.1093, + "step": 460 + }, + { + "epoch": 0.2047523872973573, + "grad_norm": 1.5496188187186781, + "learning_rate": 5.1165371809101e-06, + "loss": 0.1241, + "step": 461 + }, + { + "epoch": 0.20519653564290474, + "grad_norm": 1.555761458410687, + "learning_rate": 5.127635960044396e-06, + "loss": 0.1119, + "step": 462 + }, + { + "epoch": 0.20564068398845214, + "grad_norm": 1.4018925493874805, + "learning_rate": 5.138734739178691e-06, + "loss": 0.149, + "step": 463 + }, + { + "epoch": 0.20608483233399955, + "grad_norm": 1.2908877898264874, + "learning_rate": 5.1498335183129855e-06, + "loss": 0.1255, + "step": 464 + }, + { + "epoch": 0.20652898067954698, + "grad_norm": 1.2526168814511602, + "learning_rate": 5.1609322974472806e-06, + "loss": 0.1145, + "step": 465 + }, + { + "epoch": 0.20697312902509438, + "grad_norm": 1.1641091501138674, + "learning_rate": 5.1720310765815765e-06, + "loss": 0.1136, + "step": 466 + }, + { + "epoch": 0.2074172773706418, + "grad_norm": 1.1082496575030598, + "learning_rate": 5.1831298557158716e-06, + "loss": 0.0897, + "step": 467 + }, + { + "epoch": 0.2078614257161892, + "grad_norm": 1.174903547638247, + "learning_rate": 5.194228634850167e-06, + "loss": 0.1121, + "step": 468 + }, + { + "epoch": 0.2083055740617366, + "grad_norm": 1.0600460968712155, + "learning_rate": 5.2053274139844625e-06, + "loss": 0.0819, + "step": 469 + }, + { + "epoch": 0.20874972240728404, + "grad_norm": 1.3263800742507028, + "learning_rate": 5.216426193118758e-06, + "loss": 0.084, + "step": 470 + }, + { + "epoch": 0.20919387075283144, + "grad_norm": 1.604108989764318, + "learning_rate": 5.227524972253053e-06, + "loss": 0.1337, + "step": 471 + }, + { + "epoch": 0.20963801909837887, + "grad_norm": 0.9363413921696734, + "learning_rate": 5.238623751387349e-06, + "loss": 0.0863, + "step": 472 + }, + { + "epoch": 0.21008216744392627, + "grad_norm": 2.0569016097307578, + "learning_rate": 5.249722530521643e-06, + "loss": 0.1218, + "step": 473 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 2.0907599336843674, + "learning_rate": 5.260821309655938e-06, + "loss": 0.1032, + "step": 474 + }, + { + "epoch": 0.2109704641350211, + "grad_norm": 1.03795187986254, + "learning_rate": 5.271920088790233e-06, + "loss": 0.1002, + "step": 475 + }, + { + "epoch": 0.2114146124805685, + "grad_norm": 1.6168543989206048, + "learning_rate": 5.283018867924529e-06, + "loss": 0.1292, + "step": 476 + }, + { + "epoch": 0.21185876082611593, + "grad_norm": 1.555835178594497, + "learning_rate": 5.294117647058824e-06, + "loss": 0.0956, + "step": 477 + }, + { + "epoch": 0.21230290917166333, + "grad_norm": 1.4750911257012305, + "learning_rate": 5.305216426193119e-06, + "loss": 0.1025, + "step": 478 + }, + { + "epoch": 0.21274705751721074, + "grad_norm": 1.1903302332979158, + "learning_rate": 5.316315205327415e-06, + "loss": 0.0927, + "step": 479 + }, + { + "epoch": 0.21319120586275817, + "grad_norm": 1.799907699381259, + "learning_rate": 5.32741398446171e-06, + "loss": 0.1554, + "step": 480 + }, + { + "epoch": 0.21363535420830557, + "grad_norm": 1.1084539632711692, + "learning_rate": 5.338512763596004e-06, + "loss": 0.0963, + "step": 481 + }, + { + "epoch": 0.214079502553853, + "grad_norm": 1.0531894918937277, + "learning_rate": 5.3496115427303e-06, + "loss": 0.0808, + "step": 482 + }, + { + "epoch": 0.2145236508994004, + "grad_norm": 1.4912911807270512, + "learning_rate": 5.360710321864595e-06, + "loss": 0.1053, + "step": 483 + }, + { + "epoch": 0.2149677992449478, + "grad_norm": 1.1603558318841067, + "learning_rate": 5.37180910099889e-06, + "loss": 0.0859, + "step": 484 + }, + { + "epoch": 0.21541194759049523, + "grad_norm": 1.2647862898402058, + "learning_rate": 5.382907880133186e-06, + "loss": 0.1165, + "step": 485 + }, + { + "epoch": 0.21585609593604263, + "grad_norm": 1.3294179512793625, + "learning_rate": 5.394006659267481e-06, + "loss": 0.1006, + "step": 486 + }, + { + "epoch": 0.21630024428159006, + "grad_norm": 1.288870748090468, + "learning_rate": 5.405105438401776e-06, + "loss": 0.1174, + "step": 487 + }, + { + "epoch": 0.21674439262713746, + "grad_norm": 1.6526073205290404, + "learning_rate": 5.416204217536072e-06, + "loss": 0.1278, + "step": 488 + }, + { + "epoch": 0.2171885409726849, + "grad_norm": 1.1625080059645232, + "learning_rate": 5.427302996670367e-06, + "loss": 0.0962, + "step": 489 + }, + { + "epoch": 0.2176326893182323, + "grad_norm": 1.0239562074655422, + "learning_rate": 5.4384017758046615e-06, + "loss": 0.1137, + "step": 490 + }, + { + "epoch": 0.2180768376637797, + "grad_norm": 1.1775834154919058, + "learning_rate": 5.4495005549389565e-06, + "loss": 0.1167, + "step": 491 + }, + { + "epoch": 0.21852098600932712, + "grad_norm": 1.0527464833567977, + "learning_rate": 5.4605993340732525e-06, + "loss": 0.1035, + "step": 492 + }, + { + "epoch": 0.21896513435487452, + "grad_norm": 1.0639212306477968, + "learning_rate": 5.4716981132075475e-06, + "loss": 0.0798, + "step": 493 + }, + { + "epoch": 0.21940928270042195, + "grad_norm": 1.4130768874633084, + "learning_rate": 5.482796892341843e-06, + "loss": 0.1191, + "step": 494 + }, + { + "epoch": 0.21985343104596936, + "grad_norm": 1.1493987018132557, + "learning_rate": 5.4938956714761385e-06, + "loss": 0.0933, + "step": 495 + }, + { + "epoch": 0.22029757939151676, + "grad_norm": 1.5099978612143894, + "learning_rate": 5.5049944506104336e-06, + "loss": 0.1312, + "step": 496 + }, + { + "epoch": 0.2207417277370642, + "grad_norm": 1.7153041600616725, + "learning_rate": 5.516093229744729e-06, + "loss": 0.1631, + "step": 497 + }, + { + "epoch": 0.2211858760826116, + "grad_norm": 0.9681698333927948, + "learning_rate": 5.5271920088790245e-06, + "loss": 0.0811, + "step": 498 + }, + { + "epoch": 0.22163002442815902, + "grad_norm": 1.2720779336432644, + "learning_rate": 5.538290788013319e-06, + "loss": 0.1305, + "step": 499 + }, + { + "epoch": 0.22207417277370642, + "grad_norm": 1.2223556006920575, + "learning_rate": 5.549389567147614e-06, + "loss": 0.0942, + "step": 500 + }, + { + "epoch": 0.22251832111925382, + "grad_norm": 1.0558717161626225, + "learning_rate": 5.560488346281909e-06, + "loss": 0.1077, + "step": 501 + }, + { + "epoch": 0.22296246946480125, + "grad_norm": 1.1170017223897397, + "learning_rate": 5.571587125416205e-06, + "loss": 0.1309, + "step": 502 + }, + { + "epoch": 0.22340661781034865, + "grad_norm": 0.9964488397086242, + "learning_rate": 5.5826859045505e-06, + "loss": 0.0834, + "step": 503 + }, + { + "epoch": 0.22385076615589608, + "grad_norm": 1.2815734363162072, + "learning_rate": 5.593784683684795e-06, + "loss": 0.0901, + "step": 504 + }, + { + "epoch": 0.22429491450144348, + "grad_norm": 1.0732922534576572, + "learning_rate": 5.604883462819091e-06, + "loss": 0.0918, + "step": 505 + }, + { + "epoch": 0.22473906284699088, + "grad_norm": 1.0346342912485391, + "learning_rate": 5.615982241953386e-06, + "loss": 0.0983, + "step": 506 + }, + { + "epoch": 0.2251832111925383, + "grad_norm": 1.2353611959723767, + "learning_rate": 5.62708102108768e-06, + "loss": 0.1066, + "step": 507 + }, + { + "epoch": 0.22562735953808571, + "grad_norm": 1.5571341771119367, + "learning_rate": 5.638179800221976e-06, + "loss": 0.1197, + "step": 508 + }, + { + "epoch": 0.22607150788363314, + "grad_norm": 1.8593279138829277, + "learning_rate": 5.649278579356271e-06, + "loss": 0.1539, + "step": 509 + }, + { + "epoch": 0.22651565622918055, + "grad_norm": 1.3210670897027246, + "learning_rate": 5.660377358490566e-06, + "loss": 0.0972, + "step": 510 + }, + { + "epoch": 0.22695980457472795, + "grad_norm": 1.555152024387198, + "learning_rate": 5.671476137624862e-06, + "loss": 0.1017, + "step": 511 + }, + { + "epoch": 0.22740395292027538, + "grad_norm": 1.1784666679583122, + "learning_rate": 5.682574916759157e-06, + "loss": 0.0896, + "step": 512 + }, + { + "epoch": 0.22784810126582278, + "grad_norm": 0.897552288214822, + "learning_rate": 5.693673695893452e-06, + "loss": 0.0748, + "step": 513 + }, + { + "epoch": 0.2282922496113702, + "grad_norm": 0.9984908709528371, + "learning_rate": 5.704772475027748e-06, + "loss": 0.0962, + "step": 514 + }, + { + "epoch": 0.2287363979569176, + "grad_norm": 1.1522533946932125, + "learning_rate": 5.715871254162043e-06, + "loss": 0.1175, + "step": 515 + }, + { + "epoch": 0.229180546302465, + "grad_norm": 1.0543599181977559, + "learning_rate": 5.7269700332963374e-06, + "loss": 0.1386, + "step": 516 + }, + { + "epoch": 0.22962469464801244, + "grad_norm": 0.8788825627877698, + "learning_rate": 5.7380688124306325e-06, + "loss": 0.0947, + "step": 517 + }, + { + "epoch": 0.23006884299355984, + "grad_norm": 0.8916267274537737, + "learning_rate": 5.7491675915649284e-06, + "loss": 0.0875, + "step": 518 + }, + { + "epoch": 0.23051299133910727, + "grad_norm": 1.3741271628041778, + "learning_rate": 5.7602663706992235e-06, + "loss": 0.1431, + "step": 519 + }, + { + "epoch": 0.23095713968465467, + "grad_norm": 1.158157014144474, + "learning_rate": 5.7713651498335186e-06, + "loss": 0.1079, + "step": 520 + }, + { + "epoch": 0.2314012880302021, + "grad_norm": 0.9669483042814996, + "learning_rate": 5.7824639289678145e-06, + "loss": 0.0756, + "step": 521 + }, + { + "epoch": 0.2318454363757495, + "grad_norm": 0.9858439751900236, + "learning_rate": 5.7935627081021095e-06, + "loss": 0.1091, + "step": 522 + }, + { + "epoch": 0.2322895847212969, + "grad_norm": 1.0356934556615558, + "learning_rate": 5.804661487236405e-06, + "loss": 0.0899, + "step": 523 + }, + { + "epoch": 0.23273373306684433, + "grad_norm": 3.900766522495628, + "learning_rate": 5.8157602663707005e-06, + "loss": 0.0926, + "step": 524 + }, + { + "epoch": 0.23317788141239174, + "grad_norm": 1.1976168770257003, + "learning_rate": 5.826859045504995e-06, + "loss": 0.0963, + "step": 525 + }, + { + "epoch": 0.23362202975793916, + "grad_norm": 1.2441372325560522, + "learning_rate": 5.83795782463929e-06, + "loss": 0.1105, + "step": 526 + }, + { + "epoch": 0.23406617810348657, + "grad_norm": 1.379627337877928, + "learning_rate": 5.849056603773585e-06, + "loss": 0.0851, + "step": 527 + }, + { + "epoch": 0.23451032644903397, + "grad_norm": 1.0553207077914868, + "learning_rate": 5.860155382907881e-06, + "loss": 0.0876, + "step": 528 + }, + { + "epoch": 0.2349544747945814, + "grad_norm": 1.0117503811303656, + "learning_rate": 5.871254162042176e-06, + "loss": 0.0925, + "step": 529 + }, + { + "epoch": 0.2353986231401288, + "grad_norm": 1.2155239899129966, + "learning_rate": 5.882352941176471e-06, + "loss": 0.1177, + "step": 530 + }, + { + "epoch": 0.23584277148567623, + "grad_norm": 1.0134886033118073, + "learning_rate": 5.893451720310767e-06, + "loss": 0.0868, + "step": 531 + }, + { + "epoch": 0.23628691983122363, + "grad_norm": 1.2415034220382457, + "learning_rate": 5.904550499445062e-06, + "loss": 0.1016, + "step": 532 + }, + { + "epoch": 0.23673106817677103, + "grad_norm": 0.9266174346376681, + "learning_rate": 5.915649278579356e-06, + "loss": 0.1001, + "step": 533 + }, + { + "epoch": 0.23717521652231846, + "grad_norm": 1.3120341529200943, + "learning_rate": 5.926748057713651e-06, + "loss": 0.1248, + "step": 534 + }, + { + "epoch": 0.23761936486786586, + "grad_norm": 1.426598559638218, + "learning_rate": 5.937846836847947e-06, + "loss": 0.0932, + "step": 535 + }, + { + "epoch": 0.2380635132134133, + "grad_norm": 1.0182331505869047, + "learning_rate": 5.948945615982242e-06, + "loss": 0.1009, + "step": 536 + }, + { + "epoch": 0.2385076615589607, + "grad_norm": 1.4554113324351339, + "learning_rate": 5.960044395116537e-06, + "loss": 0.0998, + "step": 537 + }, + { + "epoch": 0.2389518099045081, + "grad_norm": 1.6220655384523863, + "learning_rate": 5.971143174250833e-06, + "loss": 0.105, + "step": 538 + }, + { + "epoch": 0.23939595825005552, + "grad_norm": 1.9400783770338148, + "learning_rate": 5.982241953385128e-06, + "loss": 0.1524, + "step": 539 + }, + { + "epoch": 0.23984010659560293, + "grad_norm": 1.4635091103881346, + "learning_rate": 5.993340732519424e-06, + "loss": 0.1233, + "step": 540 + }, + { + "epoch": 0.24028425494115035, + "grad_norm": 1.2937114895643234, + "learning_rate": 6.004439511653719e-06, + "loss": 0.1169, + "step": 541 + }, + { + "epoch": 0.24072840328669776, + "grad_norm": 1.2221129883336008, + "learning_rate": 6.015538290788013e-06, + "loss": 0.1049, + "step": 542 + }, + { + "epoch": 0.24117255163224516, + "grad_norm": 1.0735565355706922, + "learning_rate": 6.0266370699223085e-06, + "loss": 0.0886, + "step": 543 + }, + { + "epoch": 0.2416166999777926, + "grad_norm": 1.3261214209776406, + "learning_rate": 6.037735849056604e-06, + "loss": 0.1245, + "step": 544 + }, + { + "epoch": 0.24206084832334, + "grad_norm": 1.2122447330179282, + "learning_rate": 6.0488346281908995e-06, + "loss": 0.0824, + "step": 545 + }, + { + "epoch": 0.24250499666888742, + "grad_norm": 1.1700065718816541, + "learning_rate": 6.0599334073251945e-06, + "loss": 0.1062, + "step": 546 + }, + { + "epoch": 0.24294914501443482, + "grad_norm": 1.0126080671178852, + "learning_rate": 6.0710321864594904e-06, + "loss": 0.0876, + "step": 547 + }, + { + "epoch": 0.24339329335998222, + "grad_norm": 1.185450234336463, + "learning_rate": 6.0821309655937855e-06, + "loss": 0.1057, + "step": 548 + }, + { + "epoch": 0.24383744170552965, + "grad_norm": 0.94702847545811, + "learning_rate": 6.0932297447280806e-06, + "loss": 0.0704, + "step": 549 + }, + { + "epoch": 0.24428159005107705, + "grad_norm": 1.1538838071376252, + "learning_rate": 6.1043285238623765e-06, + "loss": 0.1062, + "step": 550 + }, + { + "epoch": 0.24472573839662448, + "grad_norm": 1.2787034775849095, + "learning_rate": 6.115427302996671e-06, + "loss": 0.1053, + "step": 551 + }, + { + "epoch": 0.24516988674217188, + "grad_norm": 2.111386576675479, + "learning_rate": 6.126526082130966e-06, + "loss": 0.1423, + "step": 552 + }, + { + "epoch": 0.24561403508771928, + "grad_norm": 0.8396851122657492, + "learning_rate": 6.137624861265261e-06, + "loss": 0.0857, + "step": 553 + }, + { + "epoch": 0.24605818343326671, + "grad_norm": 1.222223712186514, + "learning_rate": 6.148723640399557e-06, + "loss": 0.1241, + "step": 554 + }, + { + "epoch": 0.24650233177881412, + "grad_norm": 1.3542921033853719, + "learning_rate": 6.159822419533852e-06, + "loss": 0.1113, + "step": 555 + }, + { + "epoch": 0.24694648012436154, + "grad_norm": 0.921783149485065, + "learning_rate": 6.170921198668147e-06, + "loss": 0.0954, + "step": 556 + }, + { + "epoch": 0.24739062846990895, + "grad_norm": 1.170580173743532, + "learning_rate": 6.182019977802443e-06, + "loss": 0.0994, + "step": 557 + }, + { + "epoch": 0.24783477681545638, + "grad_norm": 1.4092773954441822, + "learning_rate": 6.193118756936738e-06, + "loss": 0.0931, + "step": 558 + }, + { + "epoch": 0.24827892516100378, + "grad_norm": 1.0651408845659296, + "learning_rate": 6.204217536071032e-06, + "loss": 0.1083, + "step": 559 + }, + { + "epoch": 0.24872307350655118, + "grad_norm": 1.2143455853200855, + "learning_rate": 6.215316315205327e-06, + "loss": 0.0952, + "step": 560 + }, + { + "epoch": 0.2491672218520986, + "grad_norm": 2.64991339168012, + "learning_rate": 6.226415094339623e-06, + "loss": 0.1145, + "step": 561 + }, + { + "epoch": 0.249611370197646, + "grad_norm": 0.7270989844733312, + "learning_rate": 6.237513873473918e-06, + "loss": 0.0861, + "step": 562 + }, + { + "epoch": 0.2500555185431934, + "grad_norm": 1.1431426285342126, + "learning_rate": 6.248612652608213e-06, + "loss": 0.1162, + "step": 563 + }, + { + "epoch": 0.2504996668887408, + "grad_norm": 0.860711327757446, + "learning_rate": 6.259711431742509e-06, + "loss": 0.0935, + "step": 564 + }, + { + "epoch": 0.25094381523428827, + "grad_norm": 0.8073003696766194, + "learning_rate": 6.270810210876804e-06, + "loss": 0.1079, + "step": 565 + }, + { + "epoch": 0.25138796357983567, + "grad_norm": 1.1101487411387352, + "learning_rate": 6.281908990011099e-06, + "loss": 0.1174, + "step": 566 + }, + { + "epoch": 0.2518321119253831, + "grad_norm": 1.0549286990788267, + "learning_rate": 6.293007769145395e-06, + "loss": 0.1051, + "step": 567 + }, + { + "epoch": 0.2522762602709305, + "grad_norm": 1.2019614422517741, + "learning_rate": 6.304106548279689e-06, + "loss": 0.1063, + "step": 568 + }, + { + "epoch": 0.2527204086164779, + "grad_norm": 0.9611079197173829, + "learning_rate": 6.3152053274139845e-06, + "loss": 0.0974, + "step": 569 + }, + { + "epoch": 0.25316455696202533, + "grad_norm": 1.1961495715533894, + "learning_rate": 6.32630410654828e-06, + "loss": 0.0991, + "step": 570 + }, + { + "epoch": 0.25360870530757273, + "grad_norm": 1.0576689858819426, + "learning_rate": 6.3374028856825754e-06, + "loss": 0.0953, + "step": 571 + }, + { + "epoch": 0.25405285365312014, + "grad_norm": 1.2656186745282128, + "learning_rate": 6.3485016648168705e-06, + "loss": 0.1083, + "step": 572 + }, + { + "epoch": 0.25449700199866754, + "grad_norm": 1.1776166049192853, + "learning_rate": 6.359600443951166e-06, + "loss": 0.1118, + "step": 573 + }, + { + "epoch": 0.254941150344215, + "grad_norm": 0.8664088781289958, + "learning_rate": 6.3706992230854615e-06, + "loss": 0.0863, + "step": 574 + }, + { + "epoch": 0.2553852986897624, + "grad_norm": 0.9925662850080711, + "learning_rate": 6.3817980022197565e-06, + "loss": 0.1058, + "step": 575 + }, + { + "epoch": 0.2558294470353098, + "grad_norm": 1.0606625159824246, + "learning_rate": 6.3928967813540525e-06, + "loss": 0.1075, + "step": 576 + }, + { + "epoch": 0.2562735953808572, + "grad_norm": 0.9900660508183841, + "learning_rate": 6.403995560488347e-06, + "loss": 0.0829, + "step": 577 + }, + { + "epoch": 0.2567177437264046, + "grad_norm": 0.8582749721826495, + "learning_rate": 6.415094339622642e-06, + "loss": 0.0914, + "step": 578 + }, + { + "epoch": 0.25716189207195206, + "grad_norm": 1.1617033243365338, + "learning_rate": 6.426193118756937e-06, + "loss": 0.1116, + "step": 579 + }, + { + "epoch": 0.25760604041749946, + "grad_norm": 1.0362694278635598, + "learning_rate": 6.437291897891233e-06, + "loss": 0.1145, + "step": 580 + }, + { + "epoch": 0.25805018876304686, + "grad_norm": 0.8723418719473582, + "learning_rate": 6.448390677025528e-06, + "loss": 0.0885, + "step": 581 + }, + { + "epoch": 0.25849433710859426, + "grad_norm": 1.0870215330646826, + "learning_rate": 6.459489456159823e-06, + "loss": 0.0894, + "step": 582 + }, + { + "epoch": 0.25893848545414166, + "grad_norm": 1.2407360085708667, + "learning_rate": 6.470588235294119e-06, + "loss": 0.0812, + "step": 583 + }, + { + "epoch": 0.2593826337996891, + "grad_norm": 1.8006078173259896, + "learning_rate": 6.481687014428414e-06, + "loss": 0.1429, + "step": 584 + }, + { + "epoch": 0.2598267821452365, + "grad_norm": 1.4748519282887915, + "learning_rate": 6.492785793562708e-06, + "loss": 0.0867, + "step": 585 + }, + { + "epoch": 0.2602709304907839, + "grad_norm": 1.0424639232757755, + "learning_rate": 6.503884572697003e-06, + "loss": 0.1146, + "step": 586 + }, + { + "epoch": 0.2607150788363313, + "grad_norm": 1.2786124769190668, + "learning_rate": 6.514983351831299e-06, + "loss": 0.1192, + "step": 587 + }, + { + "epoch": 0.26115922718187873, + "grad_norm": 0.9288875670879717, + "learning_rate": 6.526082130965594e-06, + "loss": 0.101, + "step": 588 + }, + { + "epoch": 0.2616033755274262, + "grad_norm": 0.6884103445991109, + "learning_rate": 6.537180910099889e-06, + "loss": 0.0743, + "step": 589 + }, + { + "epoch": 0.2620475238729736, + "grad_norm": 1.0193990545252207, + "learning_rate": 6.548279689234185e-06, + "loss": 0.0826, + "step": 590 + }, + { + "epoch": 0.262491672218521, + "grad_norm": 1.2098967104098979, + "learning_rate": 6.55937846836848e-06, + "loss": 0.116, + "step": 591 + }, + { + "epoch": 0.2629358205640684, + "grad_norm": 0.86346939462905, + "learning_rate": 6.570477247502775e-06, + "loss": 0.0833, + "step": 592 + }, + { + "epoch": 0.2633799689096158, + "grad_norm": 0.8577523664034169, + "learning_rate": 6.581576026637071e-06, + "loss": 0.0802, + "step": 593 + }, + { + "epoch": 0.26382411725516325, + "grad_norm": 0.7836334305793592, + "learning_rate": 6.592674805771365e-06, + "loss": 0.079, + "step": 594 + }, + { + "epoch": 0.26426826560071065, + "grad_norm": 0.9292792710410797, + "learning_rate": 6.60377358490566e-06, + "loss": 0.0841, + "step": 595 + }, + { + "epoch": 0.26471241394625805, + "grad_norm": 0.949730314196452, + "learning_rate": 6.614872364039956e-06, + "loss": 0.0779, + "step": 596 + }, + { + "epoch": 0.26515656229180545, + "grad_norm": 1.067206699117055, + "learning_rate": 6.625971143174251e-06, + "loss": 0.0813, + "step": 597 + }, + { + "epoch": 0.26560071063735285, + "grad_norm": 1.11737459145968, + "learning_rate": 6.6370699223085465e-06, + "loss": 0.0855, + "step": 598 + }, + { + "epoch": 0.2660448589829003, + "grad_norm": 1.3191268336017996, + "learning_rate": 6.648168701442842e-06, + "loss": 0.117, + "step": 599 + }, + { + "epoch": 0.2664890073284477, + "grad_norm": 1.0102499350799081, + "learning_rate": 6.6592674805771374e-06, + "loss": 0.0878, + "step": 600 + }, + { + "epoch": 0.2669331556739951, + "grad_norm": 0.878524576308141, + "learning_rate": 6.6703662597114325e-06, + "loss": 0.0886, + "step": 601 + }, + { + "epoch": 0.2673773040195425, + "grad_norm": 1.0385994736274717, + "learning_rate": 6.6814650388457284e-06, + "loss": 0.0895, + "step": 602 + }, + { + "epoch": 0.2678214523650899, + "grad_norm": 2.1746856925927305, + "learning_rate": 6.692563817980023e-06, + "loss": 0.1218, + "step": 603 + }, + { + "epoch": 0.2682656007106374, + "grad_norm": 1.1284346260514344, + "learning_rate": 6.703662597114318e-06, + "loss": 0.0984, + "step": 604 + }, + { + "epoch": 0.2687097490561848, + "grad_norm": 0.8795466679323811, + "learning_rate": 6.714761376248613e-06, + "loss": 0.1076, + "step": 605 + }, + { + "epoch": 0.2691538974017322, + "grad_norm": 0.7889851290279695, + "learning_rate": 6.725860155382909e-06, + "loss": 0.0809, + "step": 606 + }, + { + "epoch": 0.2695980457472796, + "grad_norm": 1.1495901760021123, + "learning_rate": 6.736958934517204e-06, + "loss": 0.1339, + "step": 607 + }, + { + "epoch": 0.270042194092827, + "grad_norm": 0.9404581532626506, + "learning_rate": 6.748057713651499e-06, + "loss": 0.0835, + "step": 608 + }, + { + "epoch": 0.27048634243837444, + "grad_norm": 1.1614393549481714, + "learning_rate": 6.759156492785795e-06, + "loss": 0.1022, + "step": 609 + }, + { + "epoch": 0.27093049078392184, + "grad_norm": 0.7805043342922547, + "learning_rate": 6.77025527192009e-06, + "loss": 0.0872, + "step": 610 + }, + { + "epoch": 0.27137463912946924, + "grad_norm": 1.1270320700845156, + "learning_rate": 6.781354051054384e-06, + "loss": 0.106, + "step": 611 + }, + { + "epoch": 0.27181878747501664, + "grad_norm": 0.8511881872825724, + "learning_rate": 6.792452830188679e-06, + "loss": 0.077, + "step": 612 + }, + { + "epoch": 0.27226293582056404, + "grad_norm": 0.8653445546881817, + "learning_rate": 6.803551609322975e-06, + "loss": 0.0703, + "step": 613 + }, + { + "epoch": 0.2727070841661115, + "grad_norm": 0.9249325812470924, + "learning_rate": 6.81465038845727e-06, + "loss": 0.1239, + "step": 614 + }, + { + "epoch": 0.2731512325116589, + "grad_norm": 1.2035733700107387, + "learning_rate": 6.825749167591565e-06, + "loss": 0.0803, + "step": 615 + }, + { + "epoch": 0.2735953808572063, + "grad_norm": 1.058685859950284, + "learning_rate": 6.836847946725861e-06, + "loss": 0.101, + "step": 616 + }, + { + "epoch": 0.2740395292027537, + "grad_norm": 0.9291079298055421, + "learning_rate": 6.847946725860156e-06, + "loss": 0.0907, + "step": 617 + }, + { + "epoch": 0.2744836775483011, + "grad_norm": 1.1511507883949077, + "learning_rate": 6.859045504994451e-06, + "loss": 0.1077, + "step": 618 + }, + { + "epoch": 0.27492782589384857, + "grad_norm": 1.0786240033108858, + "learning_rate": 6.870144284128747e-06, + "loss": 0.1251, + "step": 619 + }, + { + "epoch": 0.27537197423939597, + "grad_norm": 0.9285679879223824, + "learning_rate": 6.881243063263041e-06, + "loss": 0.079, + "step": 620 + }, + { + "epoch": 0.27581612258494337, + "grad_norm": 0.8117578679372189, + "learning_rate": 6.892341842397336e-06, + "loss": 0.0772, + "step": 621 + }, + { + "epoch": 0.27626027093049077, + "grad_norm": 1.0138147773665356, + "learning_rate": 6.9034406215316315e-06, + "loss": 0.1238, + "step": 622 + }, + { + "epoch": 0.27670441927603817, + "grad_norm": 0.988212212065645, + "learning_rate": 6.914539400665927e-06, + "loss": 0.0726, + "step": 623 + }, + { + "epoch": 0.27714856762158563, + "grad_norm": 1.2358456192045688, + "learning_rate": 6.9256381798002224e-06, + "loss": 0.1125, + "step": 624 + }, + { + "epoch": 0.27759271596713303, + "grad_norm": 1.2887749392067072, + "learning_rate": 6.9367369589345175e-06, + "loss": 0.1003, + "step": 625 + }, + { + "epoch": 0.27803686431268043, + "grad_norm": 0.9864604958886025, + "learning_rate": 6.947835738068813e-06, + "loss": 0.0831, + "step": 626 + }, + { + "epoch": 0.27848101265822783, + "grad_norm": 1.088865911862852, + "learning_rate": 6.9589345172031085e-06, + "loss": 0.101, + "step": 627 + }, + { + "epoch": 0.27892516100377523, + "grad_norm": 1.409906763744686, + "learning_rate": 6.970033296337404e-06, + "loss": 0.104, + "step": 628 + }, + { + "epoch": 0.2793693093493227, + "grad_norm": 1.2302985416041377, + "learning_rate": 6.981132075471699e-06, + "loss": 0.0969, + "step": 629 + }, + { + "epoch": 0.2798134576948701, + "grad_norm": 1.2642567912432223, + "learning_rate": 6.992230854605994e-06, + "loss": 0.1116, + "step": 630 + }, + { + "epoch": 0.2802576060404175, + "grad_norm": 1.2962548485310728, + "learning_rate": 7.003329633740289e-06, + "loss": 0.1144, + "step": 631 + }, + { + "epoch": 0.2807017543859649, + "grad_norm": 0.9021212064599634, + "learning_rate": 7.014428412874585e-06, + "loss": 0.0794, + "step": 632 + }, + { + "epoch": 0.2811459027315123, + "grad_norm": 0.9822662924030562, + "learning_rate": 7.02552719200888e-06, + "loss": 0.081, + "step": 633 + }, + { + "epoch": 0.28159005107705976, + "grad_norm": 0.8568051773621045, + "learning_rate": 7.036625971143175e-06, + "loss": 0.0838, + "step": 634 + }, + { + "epoch": 0.28203419942260716, + "grad_norm": 0.9134125872525329, + "learning_rate": 7.047724750277471e-06, + "loss": 0.0895, + "step": 635 + }, + { + "epoch": 0.28247834776815456, + "grad_norm": 1.0203574319172648, + "learning_rate": 7.058823529411766e-06, + "loss": 0.0872, + "step": 636 + }, + { + "epoch": 0.28292249611370196, + "grad_norm": 0.8320232571139462, + "learning_rate": 7.06992230854606e-06, + "loss": 0.0778, + "step": 637 + }, + { + "epoch": 0.28336664445924936, + "grad_norm": 1.1019251570363913, + "learning_rate": 7.081021087680355e-06, + "loss": 0.101, + "step": 638 + }, + { + "epoch": 0.2838107928047968, + "grad_norm": 0.9010932622870097, + "learning_rate": 7.092119866814651e-06, + "loss": 0.0852, + "step": 639 + }, + { + "epoch": 0.2842549411503442, + "grad_norm": 1.0180000788163226, + "learning_rate": 7.103218645948946e-06, + "loss": 0.0955, + "step": 640 + }, + { + "epoch": 0.2846990894958916, + "grad_norm": 0.9251835556617967, + "learning_rate": 7.114317425083241e-06, + "loss": 0.0871, + "step": 641 + }, + { + "epoch": 0.285143237841439, + "grad_norm": 0.8901698566560228, + "learning_rate": 7.125416204217537e-06, + "loss": 0.0706, + "step": 642 + }, + { + "epoch": 0.2855873861869865, + "grad_norm": 1.0154369539307455, + "learning_rate": 7.136514983351832e-06, + "loss": 0.0887, + "step": 643 + }, + { + "epoch": 0.2860315345325339, + "grad_norm": 1.321020159204264, + "learning_rate": 7.147613762486127e-06, + "loss": 0.0865, + "step": 644 + }, + { + "epoch": 0.2864756828780813, + "grad_norm": 2.535826739420158, + "learning_rate": 7.158712541620423e-06, + "loss": 0.1096, + "step": 645 + }, + { + "epoch": 0.2869198312236287, + "grad_norm": 1.0227251731125295, + "learning_rate": 7.169811320754717e-06, + "loss": 0.0977, + "step": 646 + }, + { + "epoch": 0.2873639795691761, + "grad_norm": 0.8089367869895305, + "learning_rate": 7.180910099889012e-06, + "loss": 0.0792, + "step": 647 + }, + { + "epoch": 0.28780812791472354, + "grad_norm": 0.9374423150392824, + "learning_rate": 7.1920088790233074e-06, + "loss": 0.1055, + "step": 648 + }, + { + "epoch": 0.28825227626027095, + "grad_norm": 1.07643087941822, + "learning_rate": 7.203107658157603e-06, + "loss": 0.0957, + "step": 649 + }, + { + "epoch": 0.28869642460581835, + "grad_norm": 0.9052355958500774, + "learning_rate": 7.214206437291898e-06, + "loss": 0.097, + "step": 650 + }, + { + "epoch": 0.28914057295136575, + "grad_norm": 1.1103580563722126, + "learning_rate": 7.2253052164261935e-06, + "loss": 0.1111, + "step": 651 + }, + { + "epoch": 0.28958472129691315, + "grad_norm": 1.7388144567675092, + "learning_rate": 7.236403995560489e-06, + "loss": 0.0939, + "step": 652 + }, + { + "epoch": 0.2900288696424606, + "grad_norm": 1.078770189866491, + "learning_rate": 7.2475027746947845e-06, + "loss": 0.0845, + "step": 653 + }, + { + "epoch": 0.290473017988008, + "grad_norm": 1.2137588838688103, + "learning_rate": 7.2586015538290795e-06, + "loss": 0.0868, + "step": 654 + }, + { + "epoch": 0.2909171663335554, + "grad_norm": 1.3727496776393755, + "learning_rate": 7.269700332963375e-06, + "loss": 0.0922, + "step": 655 + }, + { + "epoch": 0.2913613146791028, + "grad_norm": 1.2519592442230572, + "learning_rate": 7.28079911209767e-06, + "loss": 0.126, + "step": 656 + }, + { + "epoch": 0.2918054630246502, + "grad_norm": 0.8673663222128764, + "learning_rate": 7.291897891231965e-06, + "loss": 0.0778, + "step": 657 + }, + { + "epoch": 0.29224961137019767, + "grad_norm": 1.1245127772074948, + "learning_rate": 7.302996670366261e-06, + "loss": 0.094, + "step": 658 + }, + { + "epoch": 0.29269375971574507, + "grad_norm": 1.156956668230168, + "learning_rate": 7.314095449500556e-06, + "loss": 0.108, + "step": 659 + }, + { + "epoch": 0.2931379080612925, + "grad_norm": 0.8322393629574484, + "learning_rate": 7.325194228634851e-06, + "loss": 0.0933, + "step": 660 + }, + { + "epoch": 0.2935820564068399, + "grad_norm": 1.1303584309903236, + "learning_rate": 7.336293007769147e-06, + "loss": 0.1144, + "step": 661 + }, + { + "epoch": 0.2940262047523873, + "grad_norm": 1.1516682415849495, + "learning_rate": 7.347391786903442e-06, + "loss": 0.1161, + "step": 662 + }, + { + "epoch": 0.29447035309793473, + "grad_norm": 0.6573879684754841, + "learning_rate": 7.358490566037736e-06, + "loss": 0.0743, + "step": 663 + }, + { + "epoch": 0.29491450144348214, + "grad_norm": 0.7675390269253073, + "learning_rate": 7.369589345172031e-06, + "loss": 0.0733, + "step": 664 + }, + { + "epoch": 0.29535864978902954, + "grad_norm": 0.9852104843013336, + "learning_rate": 7.380688124306327e-06, + "loss": 0.1057, + "step": 665 + }, + { + "epoch": 0.29580279813457694, + "grad_norm": 1.3140910564234118, + "learning_rate": 7.391786903440622e-06, + "loss": 0.1013, + "step": 666 + }, + { + "epoch": 0.29624694648012434, + "grad_norm": 1.1588394898060206, + "learning_rate": 7.402885682574917e-06, + "loss": 0.1089, + "step": 667 + }, + { + "epoch": 0.2966910948256718, + "grad_norm": 1.1857494935699013, + "learning_rate": 7.413984461709213e-06, + "loss": 0.0865, + "step": 668 + }, + { + "epoch": 0.2971352431712192, + "grad_norm": 0.861410639016016, + "learning_rate": 7.425083240843508e-06, + "loss": 0.0771, + "step": 669 + }, + { + "epoch": 0.2975793915167666, + "grad_norm": 1.293845000506648, + "learning_rate": 7.436182019977803e-06, + "loss": 0.1212, + "step": 670 + }, + { + "epoch": 0.298023539862314, + "grad_norm": 1.0407435776743181, + "learning_rate": 7.447280799112099e-06, + "loss": 0.0948, + "step": 671 + }, + { + "epoch": 0.2984676882078614, + "grad_norm": 0.9347120808526156, + "learning_rate": 7.458379578246393e-06, + "loss": 0.0779, + "step": 672 + }, + { + "epoch": 0.29891183655340886, + "grad_norm": 0.9205867276903889, + "learning_rate": 7.469478357380688e-06, + "loss": 0.0838, + "step": 673 + }, + { + "epoch": 0.29935598489895626, + "grad_norm": 1.388402805970494, + "learning_rate": 7.480577136514983e-06, + "loss": 0.0924, + "step": 674 + }, + { + "epoch": 0.29980013324450366, + "grad_norm": 1.6172485445367795, + "learning_rate": 7.491675915649279e-06, + "loss": 0.1132, + "step": 675 + }, + { + "epoch": 0.30024428159005107, + "grad_norm": 0.9561370366379224, + "learning_rate": 7.502774694783574e-06, + "loss": 0.1032, + "step": 676 + }, + { + "epoch": 0.30068842993559847, + "grad_norm": 1.0171256621828135, + "learning_rate": 7.5138734739178694e-06, + "loss": 0.0763, + "step": 677 + }, + { + "epoch": 0.3011325782811459, + "grad_norm": 1.0399008828044272, + "learning_rate": 7.524972253052165e-06, + "loss": 0.0763, + "step": 678 + }, + { + "epoch": 0.3015767266266933, + "grad_norm": 1.433840414738971, + "learning_rate": 7.5360710321864604e-06, + "loss": 0.1028, + "step": 679 + }, + { + "epoch": 0.3020208749722407, + "grad_norm": 1.0023463068044798, + "learning_rate": 7.5471698113207555e-06, + "loss": 0.0899, + "step": 680 + }, + { + "epoch": 0.30246502331778813, + "grad_norm": 0.8983226693149738, + "learning_rate": 7.55826859045505e-06, + "loss": 0.0906, + "step": 681 + }, + { + "epoch": 0.30290917166333553, + "grad_norm": 1.0458814923796536, + "learning_rate": 7.569367369589346e-06, + "loss": 0.087, + "step": 682 + }, + { + "epoch": 0.303353320008883, + "grad_norm": 1.317009232453199, + "learning_rate": 7.580466148723641e-06, + "loss": 0.1185, + "step": 683 + }, + { + "epoch": 0.3037974683544304, + "grad_norm": 0.9515944992821098, + "learning_rate": 7.591564927857937e-06, + "loss": 0.0784, + "step": 684 + }, + { + "epoch": 0.3042416166999778, + "grad_norm": 0.8794173564696354, + "learning_rate": 7.602663706992232e-06, + "loss": 0.0689, + "step": 685 + }, + { + "epoch": 0.3046857650455252, + "grad_norm": 1.3164826313969946, + "learning_rate": 7.613762486126527e-06, + "loss": 0.109, + "step": 686 + }, + { + "epoch": 0.3051299133910726, + "grad_norm": 1.208021624972249, + "learning_rate": 7.624861265260823e-06, + "loss": 0.1006, + "step": 687 + }, + { + "epoch": 0.30557406173662005, + "grad_norm": 0.9049457634328961, + "learning_rate": 7.635960044395118e-06, + "loss": 0.0898, + "step": 688 + }, + { + "epoch": 0.30601821008216745, + "grad_norm": 1.715228836512271, + "learning_rate": 7.647058823529411e-06, + "loss": 0.0954, + "step": 689 + }, + { + "epoch": 0.30646235842771485, + "grad_norm": 0.8243753589934399, + "learning_rate": 7.658157602663708e-06, + "loss": 0.0775, + "step": 690 + }, + { + "epoch": 0.30690650677326226, + "grad_norm": 1.322320542517841, + "learning_rate": 7.669256381798003e-06, + "loss": 0.1016, + "step": 691 + }, + { + "epoch": 0.30735065511880966, + "grad_norm": 0.9618609396789024, + "learning_rate": 7.680355160932298e-06, + "loss": 0.0973, + "step": 692 + }, + { + "epoch": 0.3077948034643571, + "grad_norm": 0.9164782000859841, + "learning_rate": 7.691453940066593e-06, + "loss": 0.0625, + "step": 693 + }, + { + "epoch": 0.3082389518099045, + "grad_norm": 0.9162081625393108, + "learning_rate": 7.702552719200888e-06, + "loss": 0.089, + "step": 694 + }, + { + "epoch": 0.3086831001554519, + "grad_norm": 1.0274536861399304, + "learning_rate": 7.713651498335183e-06, + "loss": 0.0938, + "step": 695 + }, + { + "epoch": 0.3091272485009993, + "grad_norm": 1.2217184648988348, + "learning_rate": 7.72475027746948e-06, + "loss": 0.0944, + "step": 696 + }, + { + "epoch": 0.3095713968465467, + "grad_norm": 1.2610843560867633, + "learning_rate": 7.735849056603775e-06, + "loss": 0.1108, + "step": 697 + }, + { + "epoch": 0.3100155451920942, + "grad_norm": 0.9157424175535596, + "learning_rate": 7.746947835738068e-06, + "loss": 0.0955, + "step": 698 + }, + { + "epoch": 0.3104596935376416, + "grad_norm": 0.8026621451546831, + "learning_rate": 7.758046614872365e-06, + "loss": 0.0659, + "step": 699 + }, + { + "epoch": 0.310903841883189, + "grad_norm": 0.8085721007510808, + "learning_rate": 7.76914539400666e-06, + "loss": 0.0677, + "step": 700 + }, + { + "epoch": 0.3113479902287364, + "grad_norm": 0.9769745515486552, + "learning_rate": 7.780244173140955e-06, + "loss": 0.0896, + "step": 701 + }, + { + "epoch": 0.3117921385742838, + "grad_norm": 1.0969240019260509, + "learning_rate": 7.79134295227525e-06, + "loss": 0.1038, + "step": 702 + }, + { + "epoch": 0.31223628691983124, + "grad_norm": 0.6785759710369018, + "learning_rate": 7.802441731409545e-06, + "loss": 0.0559, + "step": 703 + }, + { + "epoch": 0.31268043526537864, + "grad_norm": 0.9042685928894448, + "learning_rate": 7.81354051054384e-06, + "loss": 0.0843, + "step": 704 + }, + { + "epoch": 0.31312458361092604, + "grad_norm": 0.8411568659072639, + "learning_rate": 7.824639289678137e-06, + "loss": 0.1247, + "step": 705 + }, + { + "epoch": 0.31356873195647345, + "grad_norm": 0.9154910517915918, + "learning_rate": 7.835738068812432e-06, + "loss": 0.0964, + "step": 706 + }, + { + "epoch": 0.3140128803020209, + "grad_norm": 1.31944775400599, + "learning_rate": 7.846836847946726e-06, + "loss": 0.0986, + "step": 707 + }, + { + "epoch": 0.3144570286475683, + "grad_norm": 1.0518670866955138, + "learning_rate": 7.85793562708102e-06, + "loss": 0.095, + "step": 708 + }, + { + "epoch": 0.3149011769931157, + "grad_norm": 1.580342102818051, + "learning_rate": 7.869034406215318e-06, + "loss": 0.0979, + "step": 709 + }, + { + "epoch": 0.3153453253386631, + "grad_norm": 0.9955202031527373, + "learning_rate": 7.880133185349613e-06, + "loss": 0.0941, + "step": 710 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 0.9719600353134535, + "learning_rate": 7.891231964483908e-06, + "loss": 0.0929, + "step": 711 + }, + { + "epoch": 0.31623362202975797, + "grad_norm": 1.1433817316569035, + "learning_rate": 7.902330743618203e-06, + "loss": 0.0771, + "step": 712 + }, + { + "epoch": 0.31667777037530537, + "grad_norm": 1.3256882881979881, + "learning_rate": 7.913429522752498e-06, + "loss": 0.0926, + "step": 713 + }, + { + "epoch": 0.31712191872085277, + "grad_norm": 1.0742097800389574, + "learning_rate": 7.924528301886793e-06, + "loss": 0.111, + "step": 714 + }, + { + "epoch": 0.31756606706640017, + "grad_norm": 0.8425793522673473, + "learning_rate": 7.935627081021088e-06, + "loss": 0.08, + "step": 715 + }, + { + "epoch": 0.31801021541194757, + "grad_norm": 1.2662850654524707, + "learning_rate": 7.946725860155383e-06, + "loss": 0.1285, + "step": 716 + }, + { + "epoch": 0.31845436375749503, + "grad_norm": 1.0639958089530621, + "learning_rate": 7.957824639289678e-06, + "loss": 0.0911, + "step": 717 + }, + { + "epoch": 0.31889851210304243, + "grad_norm": 0.9354143320657179, + "learning_rate": 7.968923418423973e-06, + "loss": 0.0937, + "step": 718 + }, + { + "epoch": 0.31934266044858983, + "grad_norm": 0.864109073064804, + "learning_rate": 7.98002219755827e-06, + "loss": 0.07, + "step": 719 + }, + { + "epoch": 0.31978680879413723, + "grad_norm": 0.9458708669966496, + "learning_rate": 7.991120976692565e-06, + "loss": 0.0906, + "step": 720 + }, + { + "epoch": 0.32023095713968464, + "grad_norm": 0.9008487720151787, + "learning_rate": 8.00221975582686e-06, + "loss": 0.0707, + "step": 721 + }, + { + "epoch": 0.3206751054852321, + "grad_norm": 1.395224062553485, + "learning_rate": 8.013318534961155e-06, + "loss": 0.1097, + "step": 722 + }, + { + "epoch": 0.3211192538307795, + "grad_norm": 1.1228102547899144, + "learning_rate": 8.02441731409545e-06, + "loss": 0.103, + "step": 723 + }, + { + "epoch": 0.3215634021763269, + "grad_norm": 0.8489026661679754, + "learning_rate": 8.035516093229745e-06, + "loss": 0.0737, + "step": 724 + }, + { + "epoch": 0.3220075505218743, + "grad_norm": 1.116811038077338, + "learning_rate": 8.04661487236404e-06, + "loss": 0.0861, + "step": 725 + }, + { + "epoch": 0.3224516988674217, + "grad_norm": 1.29632952775896, + "learning_rate": 8.057713651498335e-06, + "loss": 0.1178, + "step": 726 + }, + { + "epoch": 0.32289584721296916, + "grad_norm": 0.9812002112145898, + "learning_rate": 8.06881243063263e-06, + "loss": 0.0832, + "step": 727 + }, + { + "epoch": 0.32333999555851656, + "grad_norm": 1.4524326080678385, + "learning_rate": 8.079911209766927e-06, + "loss": 0.1157, + "step": 728 + }, + { + "epoch": 0.32378414390406396, + "grad_norm": 1.004750311195787, + "learning_rate": 8.091009988901222e-06, + "loss": 0.0817, + "step": 729 + }, + { + "epoch": 0.32422829224961136, + "grad_norm": 0.9522750516409738, + "learning_rate": 8.102108768035517e-06, + "loss": 0.0731, + "step": 730 + }, + { + "epoch": 0.32467244059515876, + "grad_norm": 1.0736358930231333, + "learning_rate": 8.113207547169812e-06, + "loss": 0.0994, + "step": 731 + }, + { + "epoch": 0.3251165889407062, + "grad_norm": 1.8191859683022853, + "learning_rate": 8.124306326304107e-06, + "loss": 0.0937, + "step": 732 + }, + { + "epoch": 0.3255607372862536, + "grad_norm": 1.5487234761111215, + "learning_rate": 8.135405105438403e-06, + "loss": 0.094, + "step": 733 + }, + { + "epoch": 0.326004885631801, + "grad_norm": 1.0058113971822433, + "learning_rate": 8.146503884572698e-06, + "loss": 0.0887, + "step": 734 + }, + { + "epoch": 0.3264490339773484, + "grad_norm": 2.7718590597147963, + "learning_rate": 8.157602663706993e-06, + "loss": 0.1335, + "step": 735 + }, + { + "epoch": 0.3268931823228958, + "grad_norm": 2.05367085918422, + "learning_rate": 8.168701442841288e-06, + "loss": 0.1106, + "step": 736 + }, + { + "epoch": 0.3273373306684433, + "grad_norm": 1.115500020772734, + "learning_rate": 8.179800221975583e-06, + "loss": 0.0722, + "step": 737 + }, + { + "epoch": 0.3277814790139907, + "grad_norm": 1.1750795388816762, + "learning_rate": 8.19089900110988e-06, + "loss": 0.0625, + "step": 738 + }, + { + "epoch": 0.3282256273595381, + "grad_norm": 0.9103834127535441, + "learning_rate": 8.201997780244175e-06, + "loss": 0.0796, + "step": 739 + }, + { + "epoch": 0.3286697757050855, + "grad_norm": 0.9850210452729228, + "learning_rate": 8.21309655937847e-06, + "loss": 0.0658, + "step": 740 + }, + { + "epoch": 0.3291139240506329, + "grad_norm": 0.599291679194688, + "learning_rate": 8.224195338512763e-06, + "loss": 0.0528, + "step": 741 + }, + { + "epoch": 0.32955807239618035, + "grad_norm": 1.1762184481853932, + "learning_rate": 8.23529411764706e-06, + "loss": 0.101, + "step": 742 + }, + { + "epoch": 0.33000222074172775, + "grad_norm": 1.4905112857899239, + "learning_rate": 8.246392896781355e-06, + "loss": 0.1072, + "step": 743 + }, + { + "epoch": 0.33044636908727515, + "grad_norm": 0.9022874687789386, + "learning_rate": 8.25749167591565e-06, + "loss": 0.0948, + "step": 744 + }, + { + "epoch": 0.33089051743282255, + "grad_norm": 0.7870509929638926, + "learning_rate": 8.268590455049945e-06, + "loss": 0.0968, + "step": 745 + }, + { + "epoch": 0.33133466577836995, + "grad_norm": 0.9927750763334663, + "learning_rate": 8.27968923418424e-06, + "loss": 0.0944, + "step": 746 + }, + { + "epoch": 0.3317788141239174, + "grad_norm": 0.6875980968846429, + "learning_rate": 8.290788013318535e-06, + "loss": 0.0636, + "step": 747 + }, + { + "epoch": 0.3322229624694648, + "grad_norm": 0.8442026433419401, + "learning_rate": 8.301886792452832e-06, + "loss": 0.0831, + "step": 748 + }, + { + "epoch": 0.3326671108150122, + "grad_norm": 0.9412453312595589, + "learning_rate": 8.312985571587127e-06, + "loss": 0.0776, + "step": 749 + }, + { + "epoch": 0.3331112591605596, + "grad_norm": 0.9364463887563221, + "learning_rate": 8.32408435072142e-06, + "loss": 0.0887, + "step": 750 + }, + { + "epoch": 0.333555407506107, + "grad_norm": 0.7227541011442813, + "learning_rate": 8.335183129855715e-06, + "loss": 0.0857, + "step": 751 + }, + { + "epoch": 0.3339995558516545, + "grad_norm": 0.9456038972922441, + "learning_rate": 8.346281908990012e-06, + "loss": 0.0831, + "step": 752 + }, + { + "epoch": 0.3344437041972019, + "grad_norm": 0.8779308403041339, + "learning_rate": 8.357380688124307e-06, + "loss": 0.1193, + "step": 753 + }, + { + "epoch": 0.3348878525427493, + "grad_norm": 0.9670324236179916, + "learning_rate": 8.368479467258602e-06, + "loss": 0.0701, + "step": 754 + }, + { + "epoch": 0.3353320008882967, + "grad_norm": 0.661653951539699, + "learning_rate": 8.379578246392897e-06, + "loss": 0.0601, + "step": 755 + }, + { + "epoch": 0.3357761492338441, + "grad_norm": 0.7315432781058985, + "learning_rate": 8.390677025527192e-06, + "loss": 0.0931, + "step": 756 + }, + { + "epoch": 0.33622029757939154, + "grad_norm": 0.821857058746102, + "learning_rate": 8.40177580466149e-06, + "loss": 0.0853, + "step": 757 + }, + { + "epoch": 0.33666444592493894, + "grad_norm": 0.8172880203687719, + "learning_rate": 8.412874583795784e-06, + "loss": 0.1047, + "step": 758 + }, + { + "epoch": 0.33710859427048634, + "grad_norm": 1.0390709540622176, + "learning_rate": 8.423973362930078e-06, + "loss": 0.0954, + "step": 759 + }, + { + "epoch": 0.33755274261603374, + "grad_norm": 0.854833081193649, + "learning_rate": 8.435072142064373e-06, + "loss": 0.07, + "step": 760 + }, + { + "epoch": 0.33799689096158114, + "grad_norm": 1.0343320589862304, + "learning_rate": 8.44617092119867e-06, + "loss": 0.1111, + "step": 761 + }, + { + "epoch": 0.3384410393071286, + "grad_norm": 0.9843875498635718, + "learning_rate": 8.457269700332965e-06, + "loss": 0.0885, + "step": 762 + }, + { + "epoch": 0.338885187652676, + "grad_norm": 1.301338158343054, + "learning_rate": 8.46836847946726e-06, + "loss": 0.0978, + "step": 763 + }, + { + "epoch": 0.3393293359982234, + "grad_norm": 1.8891259339303466, + "learning_rate": 8.479467258601555e-06, + "loss": 0.1068, + "step": 764 + }, + { + "epoch": 0.3397734843437708, + "grad_norm": 0.8759783579336403, + "learning_rate": 8.49056603773585e-06, + "loss": 0.0702, + "step": 765 + }, + { + "epoch": 0.3402176326893182, + "grad_norm": 1.0288011027453567, + "learning_rate": 8.501664816870145e-06, + "loss": 0.1032, + "step": 766 + }, + { + "epoch": 0.34066178103486566, + "grad_norm": 1.0389974353449243, + "learning_rate": 8.51276359600444e-06, + "loss": 0.0876, + "step": 767 + }, + { + "epoch": 0.34110592938041306, + "grad_norm": 0.8136721936839324, + "learning_rate": 8.523862375138735e-06, + "loss": 0.0825, + "step": 768 + }, + { + "epoch": 0.34155007772596047, + "grad_norm": 0.7788745832938482, + "learning_rate": 8.53496115427303e-06, + "loss": 0.0841, + "step": 769 + }, + { + "epoch": 0.34199422607150787, + "grad_norm": 0.7903184210262955, + "learning_rate": 8.546059933407325e-06, + "loss": 0.0798, + "step": 770 + }, + { + "epoch": 0.34243837441705527, + "grad_norm": 0.9106664043393771, + "learning_rate": 8.557158712541622e-06, + "loss": 0.073, + "step": 771 + }, + { + "epoch": 0.3428825227626027, + "grad_norm": 1.129345042296519, + "learning_rate": 8.568257491675917e-06, + "loss": 0.1122, + "step": 772 + }, + { + "epoch": 0.3433266711081501, + "grad_norm": 1.0693131682338626, + "learning_rate": 8.579356270810212e-06, + "loss": 0.0963, + "step": 773 + }, + { + "epoch": 0.34377081945369753, + "grad_norm": 0.9775496598799794, + "learning_rate": 8.590455049944507e-06, + "loss": 0.0917, + "step": 774 + }, + { + "epoch": 0.34421496779924493, + "grad_norm": 1.734237792325461, + "learning_rate": 8.601553829078802e-06, + "loss": 0.0964, + "step": 775 + }, + { + "epoch": 0.3446591161447924, + "grad_norm": 0.8386031594806228, + "learning_rate": 8.612652608213097e-06, + "loss": 0.0808, + "step": 776 + }, + { + "epoch": 0.3451032644903398, + "grad_norm": 1.0606813630969132, + "learning_rate": 8.623751387347392e-06, + "loss": 0.0689, + "step": 777 + }, + { + "epoch": 0.3455474128358872, + "grad_norm": 0.9348314863875357, + "learning_rate": 8.634850166481687e-06, + "loss": 0.0732, + "step": 778 + }, + { + "epoch": 0.3459915611814346, + "grad_norm": 1.3147018531805297, + "learning_rate": 8.645948945615982e-06, + "loss": 0.1498, + "step": 779 + }, + { + "epoch": 0.346435709526982, + "grad_norm": 0.6686475142401406, + "learning_rate": 8.657047724750277e-06, + "loss": 0.0643, + "step": 780 + }, + { + "epoch": 0.34687985787252945, + "grad_norm": 1.4294668088170326, + "learning_rate": 8.668146503884574e-06, + "loss": 0.0973, + "step": 781 + }, + { + "epoch": 0.34732400621807685, + "grad_norm": 1.0334036928446257, + "learning_rate": 8.67924528301887e-06, + "loss": 0.0907, + "step": 782 + }, + { + "epoch": 0.34776815456362425, + "grad_norm": 0.787480188698369, + "learning_rate": 8.690344062153164e-06, + "loss": 0.0965, + "step": 783 + }, + { + "epoch": 0.34821230290917166, + "grad_norm": 1.1443565433110983, + "learning_rate": 8.70144284128746e-06, + "loss": 0.1128, + "step": 784 + }, + { + "epoch": 0.34865645125471906, + "grad_norm": 1.4871900340672362, + "learning_rate": 8.712541620421754e-06, + "loss": 0.1402, + "step": 785 + }, + { + "epoch": 0.3491005996002665, + "grad_norm": 1.2517565722671151, + "learning_rate": 8.72364039955605e-06, + "loss": 0.0937, + "step": 786 + }, + { + "epoch": 0.3495447479458139, + "grad_norm": 1.2066836215604748, + "learning_rate": 8.734739178690345e-06, + "loss": 0.1142, + "step": 787 + }, + { + "epoch": 0.3499888962913613, + "grad_norm": 0.9107920976880283, + "learning_rate": 8.74583795782464e-06, + "loss": 0.0885, + "step": 788 + }, + { + "epoch": 0.3504330446369087, + "grad_norm": 0.9724494230585177, + "learning_rate": 8.756936736958935e-06, + "loss": 0.0957, + "step": 789 + }, + { + "epoch": 0.3508771929824561, + "grad_norm": 0.8845187547684428, + "learning_rate": 8.768035516093231e-06, + "loss": 0.0713, + "step": 790 + }, + { + "epoch": 0.3513213413280036, + "grad_norm": 0.9555294870332015, + "learning_rate": 8.779134295227527e-06, + "loss": 0.0709, + "step": 791 + }, + { + "epoch": 0.351765489673551, + "grad_norm": 0.9971954202922753, + "learning_rate": 8.790233074361822e-06, + "loss": 0.086, + "step": 792 + }, + { + "epoch": 0.3522096380190984, + "grad_norm": 0.8799552486323357, + "learning_rate": 8.801331853496115e-06, + "loss": 0.0878, + "step": 793 + }, + { + "epoch": 0.3526537863646458, + "grad_norm": 1.1586612806863954, + "learning_rate": 8.812430632630412e-06, + "loss": 0.1017, + "step": 794 + }, + { + "epoch": 0.3530979347101932, + "grad_norm": 1.0618777970371058, + "learning_rate": 8.823529411764707e-06, + "loss": 0.0932, + "step": 795 + }, + { + "epoch": 0.35354208305574064, + "grad_norm": 1.0717541227428644, + "learning_rate": 8.834628190899002e-06, + "loss": 0.095, + "step": 796 + }, + { + "epoch": 0.35398623140128804, + "grad_norm": 1.1705694879290995, + "learning_rate": 8.845726970033297e-06, + "loss": 0.1118, + "step": 797 + }, + { + "epoch": 0.35443037974683544, + "grad_norm": 1.1829108047712253, + "learning_rate": 8.856825749167592e-06, + "loss": 0.1028, + "step": 798 + }, + { + "epoch": 0.35487452809238285, + "grad_norm": 0.7445292910176993, + "learning_rate": 8.867924528301887e-06, + "loss": 0.0639, + "step": 799 + }, + { + "epoch": 0.35531867643793025, + "grad_norm": 1.0475116637450934, + "learning_rate": 8.879023307436184e-06, + "loss": 0.0706, + "step": 800 + }, + { + "epoch": 0.3557628247834777, + "grad_norm": 1.1028250244486228, + "learning_rate": 8.890122086570479e-06, + "loss": 0.0819, + "step": 801 + }, + { + "epoch": 0.3562069731290251, + "grad_norm": 1.0807445455171478, + "learning_rate": 8.901220865704772e-06, + "loss": 0.0888, + "step": 802 + }, + { + "epoch": 0.3566511214745725, + "grad_norm": 1.0684985644998497, + "learning_rate": 8.912319644839067e-06, + "loss": 0.0822, + "step": 803 + }, + { + "epoch": 0.3570952698201199, + "grad_norm": 1.3191397870753145, + "learning_rate": 8.923418423973364e-06, + "loss": 0.1026, + "step": 804 + }, + { + "epoch": 0.3575394181656673, + "grad_norm": 1.0262852726585219, + "learning_rate": 8.93451720310766e-06, + "loss": 0.0776, + "step": 805 + }, + { + "epoch": 0.35798356651121477, + "grad_norm": 1.0652219347423726, + "learning_rate": 8.945615982241954e-06, + "loss": 0.0907, + "step": 806 + }, + { + "epoch": 0.35842771485676217, + "grad_norm": 1.0116135773749395, + "learning_rate": 8.95671476137625e-06, + "loss": 0.0959, + "step": 807 + }, + { + "epoch": 0.35887186320230957, + "grad_norm": 0.7818161469235702, + "learning_rate": 8.967813540510544e-06, + "loss": 0.0687, + "step": 808 + }, + { + "epoch": 0.359316011547857, + "grad_norm": 1.2323283633104487, + "learning_rate": 8.97891231964484e-06, + "loss": 0.1036, + "step": 809 + }, + { + "epoch": 0.3597601598934044, + "grad_norm": 1.0483052311963612, + "learning_rate": 8.990011098779136e-06, + "loss": 0.0899, + "step": 810 + }, + { + "epoch": 0.36020430823895183, + "grad_norm": 0.8215378781764873, + "learning_rate": 9.00110987791343e-06, + "loss": 0.089, + "step": 811 + }, + { + "epoch": 0.36064845658449923, + "grad_norm": 1.2325336486761531, + "learning_rate": 9.012208657047725e-06, + "loss": 0.1338, + "step": 812 + }, + { + "epoch": 0.36109260493004663, + "grad_norm": 1.0281228699327587, + "learning_rate": 9.02330743618202e-06, + "loss": 0.0928, + "step": 813 + }, + { + "epoch": 0.36153675327559404, + "grad_norm": 0.9331667240788679, + "learning_rate": 9.034406215316316e-06, + "loss": 0.0795, + "step": 814 + }, + { + "epoch": 0.36198090162114144, + "grad_norm": 0.9152493219085753, + "learning_rate": 9.045504994450612e-06, + "loss": 0.1015, + "step": 815 + }, + { + "epoch": 0.3624250499666889, + "grad_norm": 0.8370218994947535, + "learning_rate": 9.056603773584907e-06, + "loss": 0.0873, + "step": 816 + }, + { + "epoch": 0.3628691983122363, + "grad_norm": 0.7774818767814435, + "learning_rate": 9.067702552719202e-06, + "loss": 0.0822, + "step": 817 + }, + { + "epoch": 0.3633133466577837, + "grad_norm": 1.6384223947850474, + "learning_rate": 9.078801331853497e-06, + "loss": 0.081, + "step": 818 + }, + { + "epoch": 0.3637574950033311, + "grad_norm": 0.991313719663541, + "learning_rate": 9.089900110987792e-06, + "loss": 0.0837, + "step": 819 + }, + { + "epoch": 0.3642016433488785, + "grad_norm": 0.8596136661125425, + "learning_rate": 9.100998890122087e-06, + "loss": 0.0918, + "step": 820 + }, + { + "epoch": 0.36464579169442596, + "grad_norm": 1.0812055701714487, + "learning_rate": 9.112097669256382e-06, + "loss": 0.0825, + "step": 821 + }, + { + "epoch": 0.36508994003997336, + "grad_norm": 0.8070207504890635, + "learning_rate": 9.123196448390677e-06, + "loss": 0.0843, + "step": 822 + }, + { + "epoch": 0.36553408838552076, + "grad_norm": 1.1679552400138797, + "learning_rate": 9.134295227524974e-06, + "loss": 0.0914, + "step": 823 + }, + { + "epoch": 0.36597823673106816, + "grad_norm": 0.9627855966278458, + "learning_rate": 9.145394006659269e-06, + "loss": 0.08, + "step": 824 + }, + { + "epoch": 0.36642238507661556, + "grad_norm": 0.8861606234396192, + "learning_rate": 9.156492785793564e-06, + "loss": 0.0921, + "step": 825 + }, + { + "epoch": 0.366866533422163, + "grad_norm": 1.0771512576751088, + "learning_rate": 9.167591564927859e-06, + "loss": 0.0981, + "step": 826 + }, + { + "epoch": 0.3673106817677104, + "grad_norm": 0.7667307978284066, + "learning_rate": 9.178690344062154e-06, + "loss": 0.0763, + "step": 827 + }, + { + "epoch": 0.3677548301132578, + "grad_norm": 0.8789921021859833, + "learning_rate": 9.189789123196449e-06, + "loss": 0.0969, + "step": 828 + }, + { + "epoch": 0.3681989784588052, + "grad_norm": 1.100607694945035, + "learning_rate": 9.200887902330744e-06, + "loss": 0.0945, + "step": 829 + }, + { + "epoch": 0.3686431268043526, + "grad_norm": 0.8889918245566973, + "learning_rate": 9.21198668146504e-06, + "loss": 0.1027, + "step": 830 + }, + { + "epoch": 0.3690872751499001, + "grad_norm": 0.9919838528287025, + "learning_rate": 9.223085460599334e-06, + "loss": 0.0876, + "step": 831 + }, + { + "epoch": 0.3695314234954475, + "grad_norm": 0.7322878388650527, + "learning_rate": 9.23418423973363e-06, + "loss": 0.0827, + "step": 832 + }, + { + "epoch": 0.3699755718409949, + "grad_norm": 0.8334148632374379, + "learning_rate": 9.245283018867926e-06, + "loss": 0.0836, + "step": 833 + }, + { + "epoch": 0.3704197201865423, + "grad_norm": 1.2813561868661674, + "learning_rate": 9.256381798002221e-06, + "loss": 0.0889, + "step": 834 + }, + { + "epoch": 0.3708638685320897, + "grad_norm": 1.339839760620212, + "learning_rate": 9.267480577136516e-06, + "loss": 0.1145, + "step": 835 + }, + { + "epoch": 0.37130801687763715, + "grad_norm": 0.9154079084300466, + "learning_rate": 9.278579356270811e-06, + "loss": 0.0816, + "step": 836 + }, + { + "epoch": 0.37175216522318455, + "grad_norm": 0.6790049839601768, + "learning_rate": 9.289678135405106e-06, + "loss": 0.052, + "step": 837 + }, + { + "epoch": 0.37219631356873195, + "grad_norm": 1.2902350399135438, + "learning_rate": 9.300776914539401e-06, + "loss": 0.1205, + "step": 838 + }, + { + "epoch": 0.37264046191427935, + "grad_norm": 0.6777630897746993, + "learning_rate": 9.311875693673697e-06, + "loss": 0.0626, + "step": 839 + }, + { + "epoch": 0.37308461025982675, + "grad_norm": 0.974153136606222, + "learning_rate": 9.322974472807992e-06, + "loss": 0.0872, + "step": 840 + }, + { + "epoch": 0.3735287586053742, + "grad_norm": 0.8802239921866019, + "learning_rate": 9.334073251942287e-06, + "loss": 0.0705, + "step": 841 + }, + { + "epoch": 0.3739729069509216, + "grad_norm": 0.8051312485970141, + "learning_rate": 9.345172031076582e-06, + "loss": 0.0878, + "step": 842 + }, + { + "epoch": 0.374417055296469, + "grad_norm": 1.160749633212372, + "learning_rate": 9.356270810210878e-06, + "loss": 0.1112, + "step": 843 + }, + { + "epoch": 0.3748612036420164, + "grad_norm": 0.8346210460548523, + "learning_rate": 9.367369589345174e-06, + "loss": 0.1035, + "step": 844 + }, + { + "epoch": 0.3753053519875639, + "grad_norm": 1.0464653523849117, + "learning_rate": 9.378468368479467e-06, + "loss": 0.0901, + "step": 845 + }, + { + "epoch": 0.3757495003331113, + "grad_norm": 0.7515018279210384, + "learning_rate": 9.389567147613764e-06, + "loss": 0.0618, + "step": 846 + }, + { + "epoch": 0.3761936486786587, + "grad_norm": 1.1240058131240411, + "learning_rate": 9.400665926748059e-06, + "loss": 0.0682, + "step": 847 + }, + { + "epoch": 0.3766377970242061, + "grad_norm": 0.8330871505734428, + "learning_rate": 9.411764705882354e-06, + "loss": 0.0815, + "step": 848 + }, + { + "epoch": 0.3770819453697535, + "grad_norm": 0.6597157022162015, + "learning_rate": 9.422863485016649e-06, + "loss": 0.0705, + "step": 849 + }, + { + "epoch": 0.37752609371530094, + "grad_norm": 0.870783524840563, + "learning_rate": 9.433962264150944e-06, + "loss": 0.0756, + "step": 850 + }, + { + "epoch": 0.37797024206084834, + "grad_norm": 0.7718200428803087, + "learning_rate": 9.445061043285239e-06, + "loss": 0.0688, + "step": 851 + }, + { + "epoch": 0.37841439040639574, + "grad_norm": 1.0565515531045717, + "learning_rate": 9.456159822419536e-06, + "loss": 0.096, + "step": 852 + }, + { + "epoch": 0.37885853875194314, + "grad_norm": 0.9116267321999116, + "learning_rate": 9.46725860155383e-06, + "loss": 0.1219, + "step": 853 + }, + { + "epoch": 0.37930268709749054, + "grad_norm": 0.614891198569231, + "learning_rate": 9.478357380688124e-06, + "loss": 0.0618, + "step": 854 + }, + { + "epoch": 0.379746835443038, + "grad_norm": 0.6996810655829085, + "learning_rate": 9.48945615982242e-06, + "loss": 0.0733, + "step": 855 + }, + { + "epoch": 0.3801909837885854, + "grad_norm": 0.8742171165547441, + "learning_rate": 9.500554938956716e-06, + "loss": 0.0943, + "step": 856 + }, + { + "epoch": 0.3806351321341328, + "grad_norm": 0.6941177838819981, + "learning_rate": 9.511653718091011e-06, + "loss": 0.0979, + "step": 857 + }, + { + "epoch": 0.3810792804796802, + "grad_norm": 0.9912966659857255, + "learning_rate": 9.522752497225306e-06, + "loss": 0.1146, + "step": 858 + }, + { + "epoch": 0.3815234288252276, + "grad_norm": 0.8052784568872317, + "learning_rate": 9.533851276359601e-06, + "loss": 0.0765, + "step": 859 + }, + { + "epoch": 0.38196757717077506, + "grad_norm": 0.6650776701454606, + "learning_rate": 9.544950055493896e-06, + "loss": 0.0598, + "step": 860 + }, + { + "epoch": 0.38241172551632246, + "grad_norm": 0.9207926874686408, + "learning_rate": 9.556048834628191e-06, + "loss": 0.0812, + "step": 861 + }, + { + "epoch": 0.38285587386186987, + "grad_norm": 0.9371867517767639, + "learning_rate": 9.567147613762488e-06, + "loss": 0.1016, + "step": 862 + }, + { + "epoch": 0.38330002220741727, + "grad_norm": 0.8381824885212333, + "learning_rate": 9.578246392896782e-06, + "loss": 0.0975, + "step": 863 + }, + { + "epoch": 0.38374417055296467, + "grad_norm": 0.8006129424637977, + "learning_rate": 9.589345172031077e-06, + "loss": 0.0768, + "step": 864 + }, + { + "epoch": 0.3841883188985121, + "grad_norm": 1.5462086828607524, + "learning_rate": 9.600443951165372e-06, + "loss": 0.0973, + "step": 865 + }, + { + "epoch": 0.38463246724405953, + "grad_norm": 0.9098699947791947, + "learning_rate": 9.611542730299668e-06, + "loss": 0.0862, + "step": 866 + }, + { + "epoch": 0.38507661558960693, + "grad_norm": 0.6758969469373516, + "learning_rate": 9.622641509433963e-06, + "loss": 0.0926, + "step": 867 + }, + { + "epoch": 0.38552076393515433, + "grad_norm": 0.8013932877888278, + "learning_rate": 9.633740288568259e-06, + "loss": 0.0788, + "step": 868 + }, + { + "epoch": 0.38596491228070173, + "grad_norm": 0.8330625416886887, + "learning_rate": 9.644839067702554e-06, + "loss": 0.0908, + "step": 869 + }, + { + "epoch": 0.3864090606262492, + "grad_norm": 0.8653830292054833, + "learning_rate": 9.655937846836849e-06, + "loss": 0.0845, + "step": 870 + }, + { + "epoch": 0.3868532089717966, + "grad_norm": 0.9667174949251367, + "learning_rate": 9.667036625971144e-06, + "loss": 0.0661, + "step": 871 + }, + { + "epoch": 0.387297357317344, + "grad_norm": 0.9162313384818003, + "learning_rate": 9.678135405105439e-06, + "loss": 0.0839, + "step": 872 + }, + { + "epoch": 0.3877415056628914, + "grad_norm": 1.05316579924942, + "learning_rate": 9.689234184239734e-06, + "loss": 0.0884, + "step": 873 + }, + { + "epoch": 0.3881856540084388, + "grad_norm": 0.6947987167903426, + "learning_rate": 9.700332963374029e-06, + "loss": 0.0676, + "step": 874 + }, + { + "epoch": 0.38862980235398625, + "grad_norm": 0.7637218765216399, + "learning_rate": 9.711431742508326e-06, + "loss": 0.0855, + "step": 875 + }, + { + "epoch": 0.38907395069953365, + "grad_norm": 0.8049041095871476, + "learning_rate": 9.72253052164262e-06, + "loss": 0.0783, + "step": 876 + }, + { + "epoch": 0.38951809904508106, + "grad_norm": 1.2616209833129097, + "learning_rate": 9.733629300776916e-06, + "loss": 0.0994, + "step": 877 + }, + { + "epoch": 0.38996224739062846, + "grad_norm": 0.8501327438099852, + "learning_rate": 9.744728079911211e-06, + "loss": 0.0865, + "step": 878 + }, + { + "epoch": 0.39040639573617586, + "grad_norm": 0.8980289128819791, + "learning_rate": 9.755826859045506e-06, + "loss": 0.0644, + "step": 879 + }, + { + "epoch": 0.3908505440817233, + "grad_norm": 0.7464217413483473, + "learning_rate": 9.766925638179801e-06, + "loss": 0.0751, + "step": 880 + }, + { + "epoch": 0.3912946924272707, + "grad_norm": 1.0405400428578027, + "learning_rate": 9.778024417314096e-06, + "loss": 0.0862, + "step": 881 + }, + { + "epoch": 0.3917388407728181, + "grad_norm": 0.7773928325897406, + "learning_rate": 9.789123196448391e-06, + "loss": 0.0877, + "step": 882 + }, + { + "epoch": 0.3921829891183655, + "grad_norm": 0.6831878151686481, + "learning_rate": 9.800221975582686e-06, + "loss": 0.0613, + "step": 883 + }, + { + "epoch": 0.3926271374639129, + "grad_norm": 0.8414451878698297, + "learning_rate": 9.811320754716981e-06, + "loss": 0.0742, + "step": 884 + }, + { + "epoch": 0.3930712858094604, + "grad_norm": 0.8370630047631802, + "learning_rate": 9.822419533851278e-06, + "loss": 0.0788, + "step": 885 + }, + { + "epoch": 0.3935154341550078, + "grad_norm": 0.66195315447052, + "learning_rate": 9.833518312985573e-06, + "loss": 0.0673, + "step": 886 + }, + { + "epoch": 0.3939595825005552, + "grad_norm": 0.5443847344778766, + "learning_rate": 9.844617092119868e-06, + "loss": 0.065, + "step": 887 + }, + { + "epoch": 0.3944037308461026, + "grad_norm": 0.7436541471597082, + "learning_rate": 9.855715871254163e-06, + "loss": 0.0718, + "step": 888 + }, + { + "epoch": 0.39484787919165, + "grad_norm": 0.9048716378979127, + "learning_rate": 9.866814650388458e-06, + "loss": 0.0647, + "step": 889 + }, + { + "epoch": 0.39529202753719744, + "grad_norm": 0.887466655199119, + "learning_rate": 9.877913429522753e-06, + "loss": 0.0742, + "step": 890 + }, + { + "epoch": 0.39573617588274485, + "grad_norm": 0.5558633344968636, + "learning_rate": 9.889012208657048e-06, + "loss": 0.0547, + "step": 891 + }, + { + "epoch": 0.39618032422829225, + "grad_norm": 0.6342481865802312, + "learning_rate": 9.900110987791344e-06, + "loss": 0.0652, + "step": 892 + }, + { + "epoch": 0.39662447257383965, + "grad_norm": 0.8609031233478771, + "learning_rate": 9.911209766925639e-06, + "loss": 0.0837, + "step": 893 + }, + { + "epoch": 0.39706862091938705, + "grad_norm": 0.8888916230177222, + "learning_rate": 9.922308546059934e-06, + "loss": 0.0892, + "step": 894 + }, + { + "epoch": 0.3975127692649345, + "grad_norm": 0.8338604612800079, + "learning_rate": 9.93340732519423e-06, + "loss": 0.101, + "step": 895 + }, + { + "epoch": 0.3979569176104819, + "grad_norm": 0.689341320326161, + "learning_rate": 9.944506104328525e-06, + "loss": 0.0778, + "step": 896 + }, + { + "epoch": 0.3984010659560293, + "grad_norm": 0.7629364933455176, + "learning_rate": 9.955604883462819e-06, + "loss": 0.0702, + "step": 897 + }, + { + "epoch": 0.3988452143015767, + "grad_norm": 0.7205533738637083, + "learning_rate": 9.966703662597114e-06, + "loss": 0.0712, + "step": 898 + }, + { + "epoch": 0.3992893626471241, + "grad_norm": 1.1452213067609647, + "learning_rate": 9.97780244173141e-06, + "loss": 0.0868, + "step": 899 + }, + { + "epoch": 0.39973351099267157, + "grad_norm": 1.7583678128183422, + "learning_rate": 9.988901220865706e-06, + "loss": 0.0866, + "step": 900 + }, + { + "epoch": 0.40017765933821897, + "grad_norm": 1.0237065065243787, + "learning_rate": 1e-05, + "loss": 0.0966, + "step": 901 + }, + { + "epoch": 0.4006218076837664, + "grad_norm": 0.9116552545055456, + "learning_rate": 9.999999624207532e-06, + "loss": 0.0752, + "step": 902 + }, + { + "epoch": 0.4010659560293138, + "grad_norm": 0.9745084022934988, + "learning_rate": 9.999998496830188e-06, + "loss": 0.0816, + "step": 903 + }, + { + "epoch": 0.4015101043748612, + "grad_norm": 0.8729574496188063, + "learning_rate": 9.999996617868132e-06, + "loss": 0.096, + "step": 904 + }, + { + "epoch": 0.40195425272040863, + "grad_norm": 0.8195978110162689, + "learning_rate": 9.999993987321651e-06, + "loss": 0.0848, + "step": 905 + }, + { + "epoch": 0.40239840106595604, + "grad_norm": 1.0642057405652703, + "learning_rate": 9.999990605191136e-06, + "loss": 0.0734, + "step": 906 + }, + { + "epoch": 0.40284254941150344, + "grad_norm": 1.055742691312144, + "learning_rate": 9.9999864714771e-06, + "loss": 0.1099, + "step": 907 + }, + { + "epoch": 0.40328669775705084, + "grad_norm": 1.2115140501137451, + "learning_rate": 9.999981586180161e-06, + "loss": 0.0742, + "step": 908 + }, + { + "epoch": 0.4037308461025983, + "grad_norm": 0.899219389432472, + "learning_rate": 9.999975949301057e-06, + "loss": 0.078, + "step": 909 + }, + { + "epoch": 0.4041749944481457, + "grad_norm": 0.8631042166204541, + "learning_rate": 9.99996956084063e-06, + "loss": 0.0721, + "step": 910 + }, + { + "epoch": 0.4046191427936931, + "grad_norm": 0.90232933367849, + "learning_rate": 9.999962420799846e-06, + "loss": 0.0855, + "step": 911 + }, + { + "epoch": 0.4050632911392405, + "grad_norm": 0.9832129529060516, + "learning_rate": 9.999954529179773e-06, + "loss": 0.0884, + "step": 912 + }, + { + "epoch": 0.4055074394847879, + "grad_norm": 0.6541793575769728, + "learning_rate": 9.999945885981603e-06, + "loss": 0.0649, + "step": 913 + }, + { + "epoch": 0.40595158783033536, + "grad_norm": 0.7878273299354217, + "learning_rate": 9.999936491206631e-06, + "loss": 0.0806, + "step": 914 + }, + { + "epoch": 0.40639573617588276, + "grad_norm": 1.1901788044990722, + "learning_rate": 9.99992634485627e-06, + "loss": 0.0926, + "step": 915 + }, + { + "epoch": 0.40683988452143016, + "grad_norm": 1.14952509907395, + "learning_rate": 9.999915446932045e-06, + "loss": 0.1071, + "step": 916 + }, + { + "epoch": 0.40728403286697756, + "grad_norm": 0.7798635545238589, + "learning_rate": 9.999903797435596e-06, + "loss": 0.0771, + "step": 917 + }, + { + "epoch": 0.40772818121252496, + "grad_norm": 1.0917620513168562, + "learning_rate": 9.999891396368672e-06, + "loss": 0.0949, + "step": 918 + }, + { + "epoch": 0.4081723295580724, + "grad_norm": 1.0952645685085969, + "learning_rate": 9.999878243733138e-06, + "loss": 0.0817, + "step": 919 + }, + { + "epoch": 0.4086164779036198, + "grad_norm": 0.8880652433088206, + "learning_rate": 9.99986433953097e-06, + "loss": 0.0776, + "step": 920 + }, + { + "epoch": 0.4090606262491672, + "grad_norm": 0.8333025562604276, + "learning_rate": 9.99984968376426e-06, + "loss": 0.0956, + "step": 921 + }, + { + "epoch": 0.4095047745947146, + "grad_norm": 1.0999527872907782, + "learning_rate": 9.99983427643521e-06, + "loss": 0.0782, + "step": 922 + }, + { + "epoch": 0.40994892294026203, + "grad_norm": 1.0065680465940938, + "learning_rate": 9.999818117546135e-06, + "loss": 0.0862, + "step": 923 + }, + { + "epoch": 0.4103930712858095, + "grad_norm": 0.7446846870231809, + "learning_rate": 9.999801207099464e-06, + "loss": 0.0693, + "step": 924 + }, + { + "epoch": 0.4108372196313569, + "grad_norm": 0.7664119308596476, + "learning_rate": 9.99978354509774e-06, + "loss": 0.062, + "step": 925 + }, + { + "epoch": 0.4112813679769043, + "grad_norm": 0.6216063626642743, + "learning_rate": 9.99976513154362e-06, + "loss": 0.0855, + "step": 926 + }, + { + "epoch": 0.4117255163224517, + "grad_norm": 0.7583401455167236, + "learning_rate": 9.99974596643987e-06, + "loss": 0.0902, + "step": 927 + }, + { + "epoch": 0.4121696646679991, + "grad_norm": 0.8236694673839551, + "learning_rate": 9.999726049789367e-06, + "loss": 0.094, + "step": 928 + }, + { + "epoch": 0.41261381301354655, + "grad_norm": 1.0278078628460734, + "learning_rate": 9.999705381595111e-06, + "loss": 0.1162, + "step": 929 + }, + { + "epoch": 0.41305796135909395, + "grad_norm": 0.9106208454624181, + "learning_rate": 9.999683961860205e-06, + "loss": 0.0832, + "step": 930 + }, + { + "epoch": 0.41350210970464135, + "grad_norm": 0.9583162458878111, + "learning_rate": 9.99966179058787e-06, + "loss": 0.0622, + "step": 931 + }, + { + "epoch": 0.41394625805018875, + "grad_norm": 1.02310895201938, + "learning_rate": 9.999638867781437e-06, + "loss": 0.0747, + "step": 932 + }, + { + "epoch": 0.41439040639573615, + "grad_norm": 0.9574965389717746, + "learning_rate": 9.999615193444354e-06, + "loss": 0.0826, + "step": 933 + }, + { + "epoch": 0.4148345547412836, + "grad_norm": 0.6820969947085748, + "learning_rate": 9.99959076758018e-06, + "loss": 0.0794, + "step": 934 + }, + { + "epoch": 0.415278703086831, + "grad_norm": 0.8271166638389166, + "learning_rate": 9.999565590192584e-06, + "loss": 0.0677, + "step": 935 + }, + { + "epoch": 0.4157228514323784, + "grad_norm": 0.7913919823378441, + "learning_rate": 9.999539661285354e-06, + "loss": 0.0899, + "step": 936 + }, + { + "epoch": 0.4161669997779258, + "grad_norm": 0.6933241808880691, + "learning_rate": 9.999512980862382e-06, + "loss": 0.0762, + "step": 937 + }, + { + "epoch": 0.4166111481234732, + "grad_norm": 1.0464472300779635, + "learning_rate": 9.999485548927686e-06, + "loss": 0.0879, + "step": 938 + }, + { + "epoch": 0.4170552964690207, + "grad_norm": 0.9713981902628005, + "learning_rate": 9.999457365485383e-06, + "loss": 0.0859, + "step": 939 + }, + { + "epoch": 0.4174994448145681, + "grad_norm": 0.7157769729378113, + "learning_rate": 9.999428430539713e-06, + "loss": 0.063, + "step": 940 + }, + { + "epoch": 0.4179435931601155, + "grad_norm": 1.3110820742809257, + "learning_rate": 9.999398744095024e-06, + "loss": 0.1083, + "step": 941 + }, + { + "epoch": 0.4183877415056629, + "grad_norm": 0.9278738003250622, + "learning_rate": 9.999368306155778e-06, + "loss": 0.0682, + "step": 942 + }, + { + "epoch": 0.4188318898512103, + "grad_norm": 1.081752257204933, + "learning_rate": 9.999337116726555e-06, + "loss": 0.082, + "step": 943 + }, + { + "epoch": 0.41927603819675774, + "grad_norm": 0.7655308857156353, + "learning_rate": 9.999305175812035e-06, + "loss": 0.0824, + "step": 944 + }, + { + "epoch": 0.41972018654230514, + "grad_norm": 0.843733636393823, + "learning_rate": 9.999272483417027e-06, + "loss": 0.0583, + "step": 945 + }, + { + "epoch": 0.42016433488785254, + "grad_norm": 1.0473704718025856, + "learning_rate": 9.99923903954644e-06, + "loss": 0.0893, + "step": 946 + }, + { + "epoch": 0.42060848323339994, + "grad_norm": 0.5121181388849327, + "learning_rate": 9.999204844205304e-06, + "loss": 0.0599, + "step": 947 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 0.9066795499623488, + "learning_rate": 9.999169897398757e-06, + "loss": 0.1344, + "step": 948 + }, + { + "epoch": 0.4214967799244948, + "grad_norm": 0.910793967335512, + "learning_rate": 9.999134199132054e-06, + "loss": 0.0669, + "step": 949 + }, + { + "epoch": 0.4219409282700422, + "grad_norm": 0.5923909271714048, + "learning_rate": 9.999097749410561e-06, + "loss": 0.0739, + "step": 950 + }, + { + "epoch": 0.4223850766155896, + "grad_norm": 0.8626478233669734, + "learning_rate": 9.999060548239757e-06, + "loss": 0.085, + "step": 951 + }, + { + "epoch": 0.422829224961137, + "grad_norm": 1.2157299829969963, + "learning_rate": 9.999022595625233e-06, + "loss": 0.0927, + "step": 952 + }, + { + "epoch": 0.4232733733066844, + "grad_norm": 0.8683032499306118, + "learning_rate": 9.998983891572693e-06, + "loss": 0.0857, + "step": 953 + }, + { + "epoch": 0.42371752165223187, + "grad_norm": 0.7958109904631795, + "learning_rate": 9.998944436087956e-06, + "loss": 0.0848, + "step": 954 + }, + { + "epoch": 0.42416166999777927, + "grad_norm": 0.8911128877165261, + "learning_rate": 9.998904229176955e-06, + "loss": 0.0684, + "step": 955 + }, + { + "epoch": 0.42460581834332667, + "grad_norm": 0.8480526510619464, + "learning_rate": 9.998863270845731e-06, + "loss": 0.0819, + "step": 956 + }, + { + "epoch": 0.42504996668887407, + "grad_norm": 1.1640161965795854, + "learning_rate": 9.99882156110044e-06, + "loss": 0.1114, + "step": 957 + }, + { + "epoch": 0.42549411503442147, + "grad_norm": 0.7232203452809441, + "learning_rate": 9.998779099947356e-06, + "loss": 0.0598, + "step": 958 + }, + { + "epoch": 0.42593826337996893, + "grad_norm": 0.9706452342240306, + "learning_rate": 9.998735887392858e-06, + "loss": 0.083, + "step": 959 + }, + { + "epoch": 0.42638241172551633, + "grad_norm": 0.9335641087127532, + "learning_rate": 9.998691923443442e-06, + "loss": 0.0866, + "step": 960 + }, + { + "epoch": 0.42682656007106373, + "grad_norm": 0.6482813505555395, + "learning_rate": 9.998647208105717e-06, + "loss": 0.0792, + "step": 961 + }, + { + "epoch": 0.42727070841661113, + "grad_norm": 1.0124751766766096, + "learning_rate": 9.998601741386404e-06, + "loss": 0.0981, + "step": 962 + }, + { + "epoch": 0.42771485676215854, + "grad_norm": 0.8713994807005938, + "learning_rate": 9.998555523292338e-06, + "loss": 0.112, + "step": 963 + }, + { + "epoch": 0.428159005107706, + "grad_norm": 0.7341323526220324, + "learning_rate": 9.998508553830468e-06, + "loss": 0.079, + "step": 964 + }, + { + "epoch": 0.4286031534532534, + "grad_norm": 0.7729393765262612, + "learning_rate": 9.99846083300785e-06, + "loss": 0.0783, + "step": 965 + }, + { + "epoch": 0.4290473017988008, + "grad_norm": 0.9156169394277074, + "learning_rate": 9.99841236083166e-06, + "loss": 0.0806, + "step": 966 + }, + { + "epoch": 0.4294914501443482, + "grad_norm": 0.8551663881257657, + "learning_rate": 9.998363137309187e-06, + "loss": 0.0618, + "step": 967 + }, + { + "epoch": 0.4299355984898956, + "grad_norm": 0.5370934826053735, + "learning_rate": 9.998313162447824e-06, + "loss": 0.0652, + "step": 968 + }, + { + "epoch": 0.43037974683544306, + "grad_norm": 0.7190678146601934, + "learning_rate": 9.998262436255087e-06, + "loss": 0.0574, + "step": 969 + }, + { + "epoch": 0.43082389518099046, + "grad_norm": 0.790706849866094, + "learning_rate": 9.998210958738601e-06, + "loss": 0.0805, + "step": 970 + }, + { + "epoch": 0.43126804352653786, + "grad_norm": 0.7463621831379682, + "learning_rate": 9.998158729906102e-06, + "loss": 0.0712, + "step": 971 + }, + { + "epoch": 0.43171219187208526, + "grad_norm": 1.0561578156156401, + "learning_rate": 9.998105749765444e-06, + "loss": 0.0901, + "step": 972 + }, + { + "epoch": 0.43215634021763266, + "grad_norm": 0.7180209621862901, + "learning_rate": 9.998052018324586e-06, + "loss": 0.0687, + "step": 973 + }, + { + "epoch": 0.4326004885631801, + "grad_norm": 0.8430240030491031, + "learning_rate": 9.99799753559161e-06, + "loss": 0.0763, + "step": 974 + }, + { + "epoch": 0.4330446369087275, + "grad_norm": 0.6157610719315397, + "learning_rate": 9.997942301574701e-06, + "loss": 0.0679, + "step": 975 + }, + { + "epoch": 0.4334887852542749, + "grad_norm": 0.8284454213431681, + "learning_rate": 9.997886316282167e-06, + "loss": 0.0901, + "step": 976 + }, + { + "epoch": 0.4339329335998223, + "grad_norm": 0.6138908846381919, + "learning_rate": 9.997829579722418e-06, + "loss": 0.068, + "step": 977 + }, + { + "epoch": 0.4343770819453698, + "grad_norm": 0.6977507297268527, + "learning_rate": 9.997772091903984e-06, + "loss": 0.0719, + "step": 978 + }, + { + "epoch": 0.4348212302909172, + "grad_norm": 0.672879000533416, + "learning_rate": 9.997713852835509e-06, + "loss": 0.0859, + "step": 979 + }, + { + "epoch": 0.4352653786364646, + "grad_norm": 1.0953915290421439, + "learning_rate": 9.997654862525746e-06, + "loss": 0.0984, + "step": 980 + }, + { + "epoch": 0.435709526982012, + "grad_norm": 0.7214317597556656, + "learning_rate": 9.997595120983561e-06, + "loss": 0.0788, + "step": 981 + }, + { + "epoch": 0.4361536753275594, + "grad_norm": 0.6240169495789859, + "learning_rate": 9.997534628217935e-06, + "loss": 0.0533, + "step": 982 + }, + { + "epoch": 0.43659782367310684, + "grad_norm": 1.0463824152225416, + "learning_rate": 9.997473384237962e-06, + "loss": 0.1256, + "step": 983 + }, + { + "epoch": 0.43704197201865425, + "grad_norm": 0.6075177918870873, + "learning_rate": 9.997411389052846e-06, + "loss": 0.0907, + "step": 984 + }, + { + "epoch": 0.43748612036420165, + "grad_norm": 0.8030490613184201, + "learning_rate": 9.997348642671906e-06, + "loss": 0.075, + "step": 985 + }, + { + "epoch": 0.43793026870974905, + "grad_norm": 0.6431009442762712, + "learning_rate": 9.997285145104578e-06, + "loss": 0.0681, + "step": 986 + }, + { + "epoch": 0.43837441705529645, + "grad_norm": 0.7359898616321635, + "learning_rate": 9.997220896360402e-06, + "loss": 0.0668, + "step": 987 + }, + { + "epoch": 0.4388185654008439, + "grad_norm": 0.9059235668805204, + "learning_rate": 9.997155896449037e-06, + "loss": 0.1006, + "step": 988 + }, + { + "epoch": 0.4392627137463913, + "grad_norm": 0.5647469272415054, + "learning_rate": 9.997090145380253e-06, + "loss": 0.0699, + "step": 989 + }, + { + "epoch": 0.4397068620919387, + "grad_norm": 0.7349393202965936, + "learning_rate": 9.997023643163937e-06, + "loss": 0.0731, + "step": 990 + }, + { + "epoch": 0.4401510104374861, + "grad_norm": 0.6226852410738429, + "learning_rate": 9.996956389810082e-06, + "loss": 0.0663, + "step": 991 + }, + { + "epoch": 0.4405951587830335, + "grad_norm": 0.6263169289849984, + "learning_rate": 9.996888385328798e-06, + "loss": 0.0733, + "step": 992 + }, + { + "epoch": 0.44103930712858097, + "grad_norm": 0.7290496435666083, + "learning_rate": 9.996819629730305e-06, + "loss": 0.0664, + "step": 993 + }, + { + "epoch": 0.4414834554741284, + "grad_norm": 0.6744456519687402, + "learning_rate": 9.996750123024943e-06, + "loss": 0.0637, + "step": 994 + }, + { + "epoch": 0.4419276038196758, + "grad_norm": 0.9231205047824739, + "learning_rate": 9.996679865223157e-06, + "loss": 0.0598, + "step": 995 + }, + { + "epoch": 0.4423717521652232, + "grad_norm": 0.6917241860836084, + "learning_rate": 9.99660885633551e-06, + "loss": 0.07, + "step": 996 + }, + { + "epoch": 0.4428159005107706, + "grad_norm": 1.0027314643568717, + "learning_rate": 9.996537096372672e-06, + "loss": 0.0998, + "step": 997 + }, + { + "epoch": 0.44326004885631803, + "grad_norm": 0.7262325240442806, + "learning_rate": 9.996464585345433e-06, + "loss": 0.0703, + "step": 998 + }, + { + "epoch": 0.44370419720186544, + "grad_norm": 0.7680770645535525, + "learning_rate": 9.996391323264693e-06, + "loss": 0.0884, + "step": 999 + }, + { + "epoch": 0.44414834554741284, + "grad_norm": 0.7559490106951665, + "learning_rate": 9.996317310141462e-06, + "loss": 0.0885, + "step": 1000 + }, + { + "epoch": 0.44459249389296024, + "grad_norm": 0.648213091365164, + "learning_rate": 9.996242545986868e-06, + "loss": 0.0657, + "step": 1001 + }, + { + "epoch": 0.44503664223850764, + "grad_norm": 0.941859001821441, + "learning_rate": 9.996167030812146e-06, + "loss": 0.0771, + "step": 1002 + }, + { + "epoch": 0.4454807905840551, + "grad_norm": 0.9336722754874243, + "learning_rate": 9.996090764628649e-06, + "loss": 0.062, + "step": 1003 + }, + { + "epoch": 0.4459249389296025, + "grad_norm": 0.6689537581560581, + "learning_rate": 9.996013747447844e-06, + "loss": 0.0676, + "step": 1004 + }, + { + "epoch": 0.4463690872751499, + "grad_norm": 0.6868609219124203, + "learning_rate": 9.995935979281304e-06, + "loss": 0.0698, + "step": 1005 + }, + { + "epoch": 0.4468132356206973, + "grad_norm": 0.7598513445760847, + "learning_rate": 9.995857460140719e-06, + "loss": 0.0663, + "step": 1006 + }, + { + "epoch": 0.4472573839662447, + "grad_norm": 0.8425846771969293, + "learning_rate": 9.995778190037893e-06, + "loss": 0.0709, + "step": 1007 + }, + { + "epoch": 0.44770153231179216, + "grad_norm": 0.7973802310676283, + "learning_rate": 9.995698168984743e-06, + "loss": 0.0669, + "step": 1008 + }, + { + "epoch": 0.44814568065733956, + "grad_norm": 0.8057530371483355, + "learning_rate": 9.995617396993297e-06, + "loss": 0.105, + "step": 1009 + }, + { + "epoch": 0.44858982900288696, + "grad_norm": 0.698450418542813, + "learning_rate": 9.995535874075692e-06, + "loss": 0.0886, + "step": 1010 + }, + { + "epoch": 0.44903397734843437, + "grad_norm": 0.6685872712573397, + "learning_rate": 9.99545360024419e-06, + "loss": 0.097, + "step": 1011 + }, + { + "epoch": 0.44947812569398177, + "grad_norm": 0.7945297170711836, + "learning_rate": 9.995370575511151e-06, + "loss": 0.0841, + "step": 1012 + }, + { + "epoch": 0.4499222740395292, + "grad_norm": 0.6520316868875402, + "learning_rate": 9.99528679988906e-06, + "loss": 0.0787, + "step": 1013 + }, + { + "epoch": 0.4503664223850766, + "grad_norm": 0.572444172084648, + "learning_rate": 9.995202273390505e-06, + "loss": 0.0594, + "step": 1014 + }, + { + "epoch": 0.450810570730624, + "grad_norm": 0.6063370258668184, + "learning_rate": 9.995116996028197e-06, + "loss": 0.0683, + "step": 1015 + }, + { + "epoch": 0.45125471907617143, + "grad_norm": 0.7022055879284854, + "learning_rate": 9.995030967814952e-06, + "loss": 0.0828, + "step": 1016 + }, + { + "epoch": 0.45169886742171883, + "grad_norm": 0.6787805656005202, + "learning_rate": 9.994944188763701e-06, + "loss": 0.0886, + "step": 1017 + }, + { + "epoch": 0.4521430157672663, + "grad_norm": 0.6743594234424202, + "learning_rate": 9.994856658887491e-06, + "loss": 0.0858, + "step": 1018 + }, + { + "epoch": 0.4525871641128137, + "grad_norm": 0.701751582175111, + "learning_rate": 9.994768378199476e-06, + "loss": 0.0877, + "step": 1019 + }, + { + "epoch": 0.4530313124583611, + "grad_norm": 0.7777579093243342, + "learning_rate": 9.994679346712927e-06, + "loss": 0.0809, + "step": 1020 + }, + { + "epoch": 0.4534754608039085, + "grad_norm": 0.7407021400306011, + "learning_rate": 9.994589564441229e-06, + "loss": 0.0667, + "step": 1021 + }, + { + "epoch": 0.4539196091494559, + "grad_norm": 0.8640591667177118, + "learning_rate": 9.994499031397874e-06, + "loss": 0.0779, + "step": 1022 + }, + { + "epoch": 0.45436375749500335, + "grad_norm": 0.741626283806636, + "learning_rate": 9.994407747596474e-06, + "loss": 0.0598, + "step": 1023 + }, + { + "epoch": 0.45480790584055075, + "grad_norm": 0.7426285442565452, + "learning_rate": 9.994315713050749e-06, + "loss": 0.0763, + "step": 1024 + }, + { + "epoch": 0.45525205418609815, + "grad_norm": 1.1459706806907142, + "learning_rate": 9.994222927774535e-06, + "loss": 0.0928, + "step": 1025 + }, + { + "epoch": 0.45569620253164556, + "grad_norm": 0.9102603719521771, + "learning_rate": 9.994129391781777e-06, + "loss": 0.0695, + "step": 1026 + }, + { + "epoch": 0.45614035087719296, + "grad_norm": 0.8406791604735616, + "learning_rate": 9.994035105086536e-06, + "loss": 0.0655, + "step": 1027 + }, + { + "epoch": 0.4565844992227404, + "grad_norm": 0.6764230445649027, + "learning_rate": 9.993940067702985e-06, + "loss": 0.0566, + "step": 1028 + }, + { + "epoch": 0.4570286475682878, + "grad_norm": 1.1241952343326866, + "learning_rate": 9.993844279645411e-06, + "loss": 0.1138, + "step": 1029 + }, + { + "epoch": 0.4574727959138352, + "grad_norm": 0.8203260077691763, + "learning_rate": 9.993747740928207e-06, + "loss": 0.0639, + "step": 1030 + }, + { + "epoch": 0.4579169442593826, + "grad_norm": 0.9349985900242503, + "learning_rate": 9.993650451565892e-06, + "loss": 0.078, + "step": 1031 + }, + { + "epoch": 0.45836109260493, + "grad_norm": 0.9978670067027727, + "learning_rate": 9.993552411573088e-06, + "loss": 0.0952, + "step": 1032 + }, + { + "epoch": 0.4588052409504775, + "grad_norm": 1.1441082866740984, + "learning_rate": 9.993453620964529e-06, + "loss": 0.0897, + "step": 1033 + }, + { + "epoch": 0.4592493892960249, + "grad_norm": 1.0696466066343144, + "learning_rate": 9.993354079755066e-06, + "loss": 0.0903, + "step": 1034 + }, + { + "epoch": 0.4596935376415723, + "grad_norm": 0.6531078727573577, + "learning_rate": 9.993253787959664e-06, + "loss": 0.0673, + "step": 1035 + }, + { + "epoch": 0.4601376859871197, + "grad_norm": 1.4198972391798237, + "learning_rate": 9.993152745593398e-06, + "loss": 0.083, + "step": 1036 + }, + { + "epoch": 0.4605818343326671, + "grad_norm": 1.070478952947314, + "learning_rate": 9.993050952671453e-06, + "loss": 0.0687, + "step": 1037 + }, + { + "epoch": 0.46102598267821454, + "grad_norm": 0.7417628375073222, + "learning_rate": 9.992948409209134e-06, + "loss": 0.0802, + "step": 1038 + }, + { + "epoch": 0.46147013102376194, + "grad_norm": 0.6201081139409721, + "learning_rate": 9.992845115221855e-06, + "loss": 0.0612, + "step": 1039 + }, + { + "epoch": 0.46191427936930934, + "grad_norm": 0.8344957050457038, + "learning_rate": 9.992741070725137e-06, + "loss": 0.0897, + "step": 1040 + }, + { + "epoch": 0.46235842771485675, + "grad_norm": 0.5628453325687238, + "learning_rate": 9.992636275734629e-06, + "loss": 0.069, + "step": 1041 + }, + { + "epoch": 0.4628025760604042, + "grad_norm": 0.6969650084016832, + "learning_rate": 9.992530730266078e-06, + "loss": 0.073, + "step": 1042 + }, + { + "epoch": 0.4632467244059516, + "grad_norm": 1.0860876745728663, + "learning_rate": 9.992424434335348e-06, + "loss": 0.0921, + "step": 1043 + }, + { + "epoch": 0.463690872751499, + "grad_norm": 0.5795193360907794, + "learning_rate": 9.99231738795842e-06, + "loss": 0.0504, + "step": 1044 + }, + { + "epoch": 0.4641350210970464, + "grad_norm": 0.6583352279848995, + "learning_rate": 9.992209591151386e-06, + "loss": 0.0831, + "step": 1045 + }, + { + "epoch": 0.4645791694425938, + "grad_norm": 0.5908927184246742, + "learning_rate": 9.992101043930444e-06, + "loss": 0.0996, + "step": 1046 + }, + { + "epoch": 0.46502331778814127, + "grad_norm": 0.7629134106102797, + "learning_rate": 9.991991746311916e-06, + "loss": 0.0595, + "step": 1047 + }, + { + "epoch": 0.46546746613368867, + "grad_norm": 0.8529582668397104, + "learning_rate": 9.991881698312229e-06, + "loss": 0.076, + "step": 1048 + }, + { + "epoch": 0.46591161447923607, + "grad_norm": 0.6804037477070127, + "learning_rate": 9.991770899947925e-06, + "loss": 0.075, + "step": 1049 + }, + { + "epoch": 0.46635576282478347, + "grad_norm": 0.8612466864991256, + "learning_rate": 9.991659351235662e-06, + "loss": 0.0796, + "step": 1050 + }, + { + "epoch": 0.4667999111703309, + "grad_norm": 0.6453565757322578, + "learning_rate": 9.991547052192203e-06, + "loss": 0.0625, + "step": 1051 + }, + { + "epoch": 0.46724405951587833, + "grad_norm": 1.7133852156999947, + "learning_rate": 9.99143400283443e-06, + "loss": 0.097, + "step": 1052 + }, + { + "epoch": 0.46768820786142573, + "grad_norm": 0.7043109079290237, + "learning_rate": 9.991320203179338e-06, + "loss": 0.0556, + "step": 1053 + }, + { + "epoch": 0.46813235620697313, + "grad_norm": 0.9119065164949084, + "learning_rate": 9.991205653244032e-06, + "loss": 0.0618, + "step": 1054 + }, + { + "epoch": 0.46857650455252053, + "grad_norm": 0.5740417394473022, + "learning_rate": 9.991090353045729e-06, + "loss": 0.0492, + "step": 1055 + }, + { + "epoch": 0.46902065289806794, + "grad_norm": 0.8813556475853607, + "learning_rate": 9.990974302601762e-06, + "loss": 0.0603, + "step": 1056 + }, + { + "epoch": 0.4694648012436154, + "grad_norm": 0.6951634771023119, + "learning_rate": 9.990857501929577e-06, + "loss": 0.0641, + "step": 1057 + }, + { + "epoch": 0.4699089495891628, + "grad_norm": 0.9703272601786429, + "learning_rate": 9.990739951046729e-06, + "loss": 0.1064, + "step": 1058 + }, + { + "epoch": 0.4703530979347102, + "grad_norm": 1.1076310196170094, + "learning_rate": 9.99062164997089e-06, + "loss": 0.077, + "step": 1059 + }, + { + "epoch": 0.4707972462802576, + "grad_norm": 0.813678875992106, + "learning_rate": 9.990502598719837e-06, + "loss": 0.0579, + "step": 1060 + }, + { + "epoch": 0.471241394625805, + "grad_norm": 0.8445949631628347, + "learning_rate": 9.990382797311474e-06, + "loss": 0.0922, + "step": 1061 + }, + { + "epoch": 0.47168554297135246, + "grad_norm": 0.6432477880049939, + "learning_rate": 9.990262245763802e-06, + "loss": 0.066, + "step": 1062 + }, + { + "epoch": 0.47212969131689986, + "grad_norm": 1.0190163753358932, + "learning_rate": 9.990140944094946e-06, + "loss": 0.1059, + "step": 1063 + }, + { + "epoch": 0.47257383966244726, + "grad_norm": 0.9736504358272349, + "learning_rate": 9.990018892323138e-06, + "loss": 0.0716, + "step": 1064 + }, + { + "epoch": 0.47301798800799466, + "grad_norm": 0.8551545600086438, + "learning_rate": 9.989896090466725e-06, + "loss": 0.0721, + "step": 1065 + }, + { + "epoch": 0.47346213635354206, + "grad_norm": 0.7846245356864615, + "learning_rate": 9.989772538544167e-06, + "loss": 0.0837, + "step": 1066 + }, + { + "epoch": 0.4739062846990895, + "grad_norm": 0.6875499187875691, + "learning_rate": 9.989648236574035e-06, + "loss": 0.0604, + "step": 1067 + }, + { + "epoch": 0.4743504330446369, + "grad_norm": 1.010871584074772, + "learning_rate": 9.989523184575013e-06, + "loss": 0.0836, + "step": 1068 + }, + { + "epoch": 0.4747945813901843, + "grad_norm": 0.8312720125551346, + "learning_rate": 9.989397382565898e-06, + "loss": 0.0712, + "step": 1069 + }, + { + "epoch": 0.4752387297357317, + "grad_norm": 0.96789104638543, + "learning_rate": 9.989270830565603e-06, + "loss": 0.0784, + "step": 1070 + }, + { + "epoch": 0.4756828780812791, + "grad_norm": 0.7404878797473026, + "learning_rate": 9.989143528593149e-06, + "loss": 0.0872, + "step": 1071 + }, + { + "epoch": 0.4761270264268266, + "grad_norm": 0.6280381360800792, + "learning_rate": 9.98901547666767e-06, + "loss": 0.0661, + "step": 1072 + }, + { + "epoch": 0.476571174772374, + "grad_norm": 0.8234412511968835, + "learning_rate": 9.988886674808418e-06, + "loss": 0.0621, + "step": 1073 + }, + { + "epoch": 0.4770153231179214, + "grad_norm": 0.5856951522424453, + "learning_rate": 9.988757123034753e-06, + "loss": 0.0657, + "step": 1074 + }, + { + "epoch": 0.4774594714634688, + "grad_norm": 0.9863728037623454, + "learning_rate": 9.988626821366147e-06, + "loss": 0.0787, + "step": 1075 + }, + { + "epoch": 0.4779036198090162, + "grad_norm": 0.8043968430169456, + "learning_rate": 9.988495769822188e-06, + "loss": 0.0712, + "step": 1076 + }, + { + "epoch": 0.47834776815456365, + "grad_norm": 0.7330127963179667, + "learning_rate": 9.988363968422577e-06, + "loss": 0.0749, + "step": 1077 + }, + { + "epoch": 0.47879191650011105, + "grad_norm": 0.8101029322049607, + "learning_rate": 9.988231417187122e-06, + "loss": 0.0901, + "step": 1078 + }, + { + "epoch": 0.47923606484565845, + "grad_norm": 0.9467235647843469, + "learning_rate": 9.98809811613575e-06, + "loss": 0.0923, + "step": 1079 + }, + { + "epoch": 0.47968021319120585, + "grad_norm": 0.7955200227969057, + "learning_rate": 9.9879640652885e-06, + "loss": 0.0802, + "step": 1080 + }, + { + "epoch": 0.48012436153675325, + "grad_norm": 1.0074849821784144, + "learning_rate": 9.987829264665518e-06, + "loss": 0.1319, + "step": 1081 + }, + { + "epoch": 0.4805685098823007, + "grad_norm": 0.680242996169655, + "learning_rate": 9.98769371428707e-06, + "loss": 0.0659, + "step": 1082 + }, + { + "epoch": 0.4810126582278481, + "grad_norm": 1.1307628357255757, + "learning_rate": 9.98755741417353e-06, + "loss": 0.0962, + "step": 1083 + }, + { + "epoch": 0.4814568065733955, + "grad_norm": 0.7637117418782801, + "learning_rate": 9.987420364345388e-06, + "loss": 0.0596, + "step": 1084 + }, + { + "epoch": 0.4819009549189429, + "grad_norm": 0.8434389453780773, + "learning_rate": 9.987282564823242e-06, + "loss": 0.0832, + "step": 1085 + }, + { + "epoch": 0.4823451032644903, + "grad_norm": 1.3802825505619392, + "learning_rate": 9.98714401562781e-06, + "loss": 0.1092, + "step": 1086 + }, + { + "epoch": 0.4827892516100378, + "grad_norm": 0.7526559659659204, + "learning_rate": 9.987004716779914e-06, + "loss": 0.0649, + "step": 1087 + }, + { + "epoch": 0.4832333999555852, + "grad_norm": 0.8564554305307791, + "learning_rate": 9.986864668300494e-06, + "loss": 0.0745, + "step": 1088 + }, + { + "epoch": 0.4836775483011326, + "grad_norm": 0.6121214143253134, + "learning_rate": 9.986723870210605e-06, + "loss": 0.0648, + "step": 1089 + }, + { + "epoch": 0.48412169664668, + "grad_norm": 1.2737874840599017, + "learning_rate": 9.986582322531406e-06, + "loss": 0.1167, + "step": 1090 + }, + { + "epoch": 0.4845658449922274, + "grad_norm": 1.1300452790006459, + "learning_rate": 9.986440025284177e-06, + "loss": 0.1004, + "step": 1091 + }, + { + "epoch": 0.48500999333777484, + "grad_norm": 0.695366891600787, + "learning_rate": 9.986296978490308e-06, + "loss": 0.0631, + "step": 1092 + }, + { + "epoch": 0.48545414168332224, + "grad_norm": 0.907315227077863, + "learning_rate": 9.9861531821713e-06, + "loss": 0.0975, + "step": 1093 + }, + { + "epoch": 0.48589829002886964, + "grad_norm": 0.8452739013500794, + "learning_rate": 9.986008636348771e-06, + "loss": 0.0725, + "step": 1094 + }, + { + "epoch": 0.48634243837441704, + "grad_norm": 0.7641747681242534, + "learning_rate": 9.985863341044444e-06, + "loss": 0.0745, + "step": 1095 + }, + { + "epoch": 0.48678658671996444, + "grad_norm": 1.0767424752885635, + "learning_rate": 9.985717296280165e-06, + "loss": 0.1343, + "step": 1096 + }, + { + "epoch": 0.4872307350655119, + "grad_norm": 1.082068622065411, + "learning_rate": 9.985570502077881e-06, + "loss": 0.0709, + "step": 1097 + }, + { + "epoch": 0.4876748834110593, + "grad_norm": 1.1385183378270511, + "learning_rate": 9.98542295845966e-06, + "loss": 0.1136, + "step": 1098 + }, + { + "epoch": 0.4881190317566067, + "grad_norm": 0.8790710582646716, + "learning_rate": 9.985274665447682e-06, + "loss": 0.0796, + "step": 1099 + }, + { + "epoch": 0.4885631801021541, + "grad_norm": 0.7645639481923898, + "learning_rate": 9.985125623064238e-06, + "loss": 0.1039, + "step": 1100 + }, + { + "epoch": 0.4890073284477015, + "grad_norm": 0.9695248420140831, + "learning_rate": 9.98497583133173e-06, + "loss": 0.0667, + "step": 1101 + }, + { + "epoch": 0.48945147679324896, + "grad_norm": 0.8791125306416613, + "learning_rate": 9.984825290272673e-06, + "loss": 0.0703, + "step": 1102 + }, + { + "epoch": 0.48989562513879636, + "grad_norm": 0.6608187335865532, + "learning_rate": 9.984673999909698e-06, + "loss": 0.0607, + "step": 1103 + }, + { + "epoch": 0.49033977348434377, + "grad_norm": 0.7238746077559932, + "learning_rate": 9.984521960265545e-06, + "loss": 0.0582, + "step": 1104 + }, + { + "epoch": 0.49078392182989117, + "grad_norm": 0.7543190655536871, + "learning_rate": 9.98436917136307e-06, + "loss": 0.075, + "step": 1105 + }, + { + "epoch": 0.49122807017543857, + "grad_norm": 0.9831510577501872, + "learning_rate": 9.98421563322524e-06, + "loss": 0.0832, + "step": 1106 + }, + { + "epoch": 0.491672218520986, + "grad_norm": 1.0953392860217965, + "learning_rate": 9.984061345875133e-06, + "loss": 0.1055, + "step": 1107 + }, + { + "epoch": 0.49211636686653343, + "grad_norm": 1.0004839390300868, + "learning_rate": 9.983906309335942e-06, + "loss": 0.0882, + "step": 1108 + }, + { + "epoch": 0.49256051521208083, + "grad_norm": 0.7389464810035322, + "learning_rate": 9.98375052363097e-06, + "loss": 0.0608, + "step": 1109 + }, + { + "epoch": 0.49300466355762823, + "grad_norm": 0.6469655367019491, + "learning_rate": 9.983593988783634e-06, + "loss": 0.0687, + "step": 1110 + }, + { + "epoch": 0.4934488119031757, + "grad_norm": 0.7833813336297293, + "learning_rate": 9.983436704817466e-06, + "loss": 0.0902, + "step": 1111 + }, + { + "epoch": 0.4938929602487231, + "grad_norm": 1.1676871219264449, + "learning_rate": 9.983278671756107e-06, + "loss": 0.0741, + "step": 1112 + }, + { + "epoch": 0.4943371085942705, + "grad_norm": 0.5948813388699934, + "learning_rate": 9.983119889623314e-06, + "loss": 0.0584, + "step": 1113 + }, + { + "epoch": 0.4947812569398179, + "grad_norm": 1.0932192784955042, + "learning_rate": 9.982960358442952e-06, + "loss": 0.0814, + "step": 1114 + }, + { + "epoch": 0.4952254052853653, + "grad_norm": 0.9576191702524788, + "learning_rate": 9.982800078239004e-06, + "loss": 0.0939, + "step": 1115 + }, + { + "epoch": 0.49566955363091275, + "grad_norm": 0.8091517385774816, + "learning_rate": 9.982639049035559e-06, + "loss": 0.0894, + "step": 1116 + }, + { + "epoch": 0.49611370197646015, + "grad_norm": 1.1318080237230808, + "learning_rate": 9.982477270856827e-06, + "loss": 0.0743, + "step": 1117 + }, + { + "epoch": 0.49655785032200755, + "grad_norm": 1.2019606615618712, + "learning_rate": 9.982314743727121e-06, + "loss": 0.107, + "step": 1118 + }, + { + "epoch": 0.49700199866755496, + "grad_norm": 0.9414473018138827, + "learning_rate": 9.982151467670876e-06, + "loss": 0.0746, + "step": 1119 + }, + { + "epoch": 0.49744614701310236, + "grad_norm": 0.7821900901787131, + "learning_rate": 9.981987442712634e-06, + "loss": 0.062, + "step": 1120 + }, + { + "epoch": 0.4978902953586498, + "grad_norm": 0.8874834331056065, + "learning_rate": 9.981822668877048e-06, + "loss": 0.0964, + "step": 1121 + }, + { + "epoch": 0.4983344437041972, + "grad_norm": 0.763549676147823, + "learning_rate": 9.98165714618889e-06, + "loss": 0.0683, + "step": 1122 + }, + { + "epoch": 0.4987785920497446, + "grad_norm": 0.7237088556818837, + "learning_rate": 9.98149087467304e-06, + "loss": 0.0706, + "step": 1123 + }, + { + "epoch": 0.499222740395292, + "grad_norm": 0.5389621815014336, + "learning_rate": 9.98132385435449e-06, + "loss": 0.0739, + "step": 1124 + }, + { + "epoch": 0.4996668887408394, + "grad_norm": 0.5780461463044186, + "learning_rate": 9.981156085258347e-06, + "loss": 0.0532, + "step": 1125 + }, + { + "epoch": 0.5001110370863868, + "grad_norm": 0.9337676739733405, + "learning_rate": 9.980987567409829e-06, + "loss": 0.0728, + "step": 1126 + }, + { + "epoch": 0.5005551854319342, + "grad_norm": 0.7121233826282255, + "learning_rate": 9.980818300834267e-06, + "loss": 0.0684, + "step": 1127 + }, + { + "epoch": 0.5009993337774816, + "grad_norm": 0.7441687384799486, + "learning_rate": 9.980648285557106e-06, + "loss": 0.0692, + "step": 1128 + }, + { + "epoch": 0.5014434821230291, + "grad_norm": 0.6423868990281854, + "learning_rate": 9.980477521603901e-06, + "loss": 0.0677, + "step": 1129 + }, + { + "epoch": 0.5018876304685765, + "grad_norm": 0.5495871418316753, + "learning_rate": 9.98030600900032e-06, + "loss": 0.0728, + "step": 1130 + }, + { + "epoch": 0.5023317788141239, + "grad_norm": 0.6241490380000925, + "learning_rate": 9.980133747772148e-06, + "loss": 0.0662, + "step": 1131 + }, + { + "epoch": 0.5027759271596713, + "grad_norm": 0.7343652098087937, + "learning_rate": 9.979960737945273e-06, + "loss": 0.0678, + "step": 1132 + }, + { + "epoch": 0.5032200755052187, + "grad_norm": 1.1015252671394224, + "learning_rate": 9.979786979545704e-06, + "loss": 0.0993, + "step": 1133 + }, + { + "epoch": 0.5036642238507661, + "grad_norm": 0.8816390886875364, + "learning_rate": 9.979612472599563e-06, + "loss": 0.0839, + "step": 1134 + }, + { + "epoch": 0.5041083721963135, + "grad_norm": 0.7833322958956014, + "learning_rate": 9.979437217133077e-06, + "loss": 0.082, + "step": 1135 + }, + { + "epoch": 0.504552520541861, + "grad_norm": 0.9520003222392113, + "learning_rate": 9.979261213172592e-06, + "loss": 0.0768, + "step": 1136 + }, + { + "epoch": 0.5049966688874084, + "grad_norm": 0.6650156355994222, + "learning_rate": 9.979084460744563e-06, + "loss": 0.0594, + "step": 1137 + }, + { + "epoch": 0.5054408172329558, + "grad_norm": 0.7702494870526182, + "learning_rate": 9.97890695987556e-06, + "loss": 0.0744, + "step": 1138 + }, + { + "epoch": 0.5058849655785033, + "grad_norm": 0.6441998434508551, + "learning_rate": 9.978728710592265e-06, + "loss": 0.0639, + "step": 1139 + }, + { + "epoch": 0.5063291139240507, + "grad_norm": 0.8638339516257324, + "learning_rate": 9.97854971292147e-06, + "loss": 0.0737, + "step": 1140 + }, + { + "epoch": 0.5067732622695981, + "grad_norm": 0.6599486193027585, + "learning_rate": 9.978369966890082e-06, + "loss": 0.0677, + "step": 1141 + }, + { + "epoch": 0.5072174106151455, + "grad_norm": 0.5684535397639588, + "learning_rate": 9.978189472525121e-06, + "loss": 0.0667, + "step": 1142 + }, + { + "epoch": 0.5076615589606929, + "grad_norm": 0.7930545362878336, + "learning_rate": 9.978008229853717e-06, + "loss": 0.0768, + "step": 1143 + }, + { + "epoch": 0.5081057073062403, + "grad_norm": 1.013824145483496, + "learning_rate": 9.977826238903116e-06, + "loss": 0.0878, + "step": 1144 + }, + { + "epoch": 0.5085498556517877, + "grad_norm": 0.7663207754691962, + "learning_rate": 9.97764349970067e-06, + "loss": 0.0757, + "step": 1145 + }, + { + "epoch": 0.5089940039973351, + "grad_norm": 0.7907421181118522, + "learning_rate": 9.977460012273854e-06, + "loss": 0.0784, + "step": 1146 + }, + { + "epoch": 0.5094381523428825, + "grad_norm": 0.6096514964784717, + "learning_rate": 9.977275776650244e-06, + "loss": 0.0806, + "step": 1147 + }, + { + "epoch": 0.50988230068843, + "grad_norm": 0.5159120034000472, + "learning_rate": 9.977090792857536e-06, + "loss": 0.0598, + "step": 1148 + }, + { + "epoch": 0.5103264490339774, + "grad_norm": 0.7102391658769992, + "learning_rate": 9.976905060923536e-06, + "loss": 0.077, + "step": 1149 + }, + { + "epoch": 0.5107705973795248, + "grad_norm": 0.7376867178754969, + "learning_rate": 9.97671858087616e-06, + "loss": 0.0726, + "step": 1150 + }, + { + "epoch": 0.5112147457250722, + "grad_norm": 0.7790796748709151, + "learning_rate": 9.976531352743445e-06, + "loss": 0.0806, + "step": 1151 + }, + { + "epoch": 0.5116588940706196, + "grad_norm": 0.618833561044351, + "learning_rate": 9.97634337655353e-06, + "loss": 0.0654, + "step": 1152 + }, + { + "epoch": 0.512103042416167, + "grad_norm": 0.808662756842487, + "learning_rate": 9.976154652334673e-06, + "loss": 0.1081, + "step": 1153 + }, + { + "epoch": 0.5125471907617144, + "grad_norm": 1.0180790521960883, + "learning_rate": 9.97596518011524e-06, + "loss": 0.1018, + "step": 1154 + }, + { + "epoch": 0.5129913391072618, + "grad_norm": 0.638695304240369, + "learning_rate": 9.975774959923717e-06, + "loss": 0.0655, + "step": 1155 + }, + { + "epoch": 0.5134354874528092, + "grad_norm": 0.7566614134333154, + "learning_rate": 9.975583991788691e-06, + "loss": 0.0658, + "step": 1156 + }, + { + "epoch": 0.5138796357983566, + "grad_norm": 0.7034717435901088, + "learning_rate": 9.97539227573887e-06, + "loss": 0.0815, + "step": 1157 + }, + { + "epoch": 0.5143237841439041, + "grad_norm": 0.7342548435589596, + "learning_rate": 9.975199811803073e-06, + "loss": 0.0755, + "step": 1158 + }, + { + "epoch": 0.5147679324894515, + "grad_norm": 0.881634196160518, + "learning_rate": 9.975006600010233e-06, + "loss": 0.0649, + "step": 1159 + }, + { + "epoch": 0.5152120808349989, + "grad_norm": 0.82644127591562, + "learning_rate": 9.97481264038939e-06, + "loss": 0.0589, + "step": 1160 + }, + { + "epoch": 0.5156562291805463, + "grad_norm": 0.8462515915216339, + "learning_rate": 9.974617932969697e-06, + "loss": 0.0735, + "step": 1161 + }, + { + "epoch": 0.5161003775260937, + "grad_norm": 0.6116708783372564, + "learning_rate": 9.974422477780426e-06, + "loss": 0.0593, + "step": 1162 + }, + { + "epoch": 0.5165445258716411, + "grad_norm": 0.7646830820557489, + "learning_rate": 9.974226274850956e-06, + "loss": 0.0866, + "step": 1163 + }, + { + "epoch": 0.5169886742171885, + "grad_norm": 0.6551362409279565, + "learning_rate": 9.97402932421078e-06, + "loss": 0.0742, + "step": 1164 + }, + { + "epoch": 0.5174328225627359, + "grad_norm": 0.5323199794273931, + "learning_rate": 9.973831625889501e-06, + "loss": 0.0599, + "step": 1165 + }, + { + "epoch": 0.5178769709082833, + "grad_norm": 0.694095375162108, + "learning_rate": 9.97363317991684e-06, + "loss": 0.0646, + "step": 1166 + }, + { + "epoch": 0.5183211192538307, + "grad_norm": 0.9040017314033096, + "learning_rate": 9.973433986322625e-06, + "loss": 0.0755, + "step": 1167 + }, + { + "epoch": 0.5187652675993782, + "grad_norm": 0.7059881945266824, + "learning_rate": 9.973234045136798e-06, + "loss": 0.0869, + "step": 1168 + }, + { + "epoch": 0.5192094159449256, + "grad_norm": 0.7791362124843618, + "learning_rate": 9.973033356389412e-06, + "loss": 0.0912, + "step": 1169 + }, + { + "epoch": 0.519653564290473, + "grad_norm": 0.7295114970239928, + "learning_rate": 9.972831920110635e-06, + "loss": 0.0769, + "step": 1170 + }, + { + "epoch": 0.5200977126360204, + "grad_norm": 0.5612528230166707, + "learning_rate": 9.972629736330748e-06, + "loss": 0.0679, + "step": 1171 + }, + { + "epoch": 0.5205418609815678, + "grad_norm": 0.7254809001904728, + "learning_rate": 9.972426805080141e-06, + "loss": 0.0715, + "step": 1172 + }, + { + "epoch": 0.5209860093271153, + "grad_norm": 0.7275192896210665, + "learning_rate": 9.97222312638932e-06, + "loss": 0.0841, + "step": 1173 + }, + { + "epoch": 0.5214301576726627, + "grad_norm": 0.8132581046482065, + "learning_rate": 9.972018700288898e-06, + "loss": 0.0715, + "step": 1174 + }, + { + "epoch": 0.52187430601821, + "grad_norm": 0.9966008387669913, + "learning_rate": 9.971813526809606e-06, + "loss": 0.0844, + "step": 1175 + }, + { + "epoch": 0.5223184543637575, + "grad_norm": 0.621117239882631, + "learning_rate": 9.971607605982285e-06, + "loss": 0.0572, + "step": 1176 + }, + { + "epoch": 0.5227626027093049, + "grad_norm": 0.8375409793845516, + "learning_rate": 9.971400937837887e-06, + "loss": 0.0728, + "step": 1177 + }, + { + "epoch": 0.5232067510548524, + "grad_norm": 0.9177354811192968, + "learning_rate": 9.97119352240748e-06, + "loss": 0.0878, + "step": 1178 + }, + { + "epoch": 0.5236508994003998, + "grad_norm": 0.7412042248938593, + "learning_rate": 9.97098535972224e-06, + "loss": 0.0649, + "step": 1179 + }, + { + "epoch": 0.5240950477459472, + "grad_norm": 0.75392664768525, + "learning_rate": 9.970776449813457e-06, + "loss": 0.0682, + "step": 1180 + }, + { + "epoch": 0.5245391960914946, + "grad_norm": 0.7437991387327917, + "learning_rate": 9.970566792712537e-06, + "loss": 0.0646, + "step": 1181 + }, + { + "epoch": 0.524983344437042, + "grad_norm": 0.708247278848663, + "learning_rate": 9.970356388450992e-06, + "loss": 0.0781, + "step": 1182 + }, + { + "epoch": 0.5254274927825894, + "grad_norm": 0.508933410042531, + "learning_rate": 9.97014523706045e-06, + "loss": 0.0535, + "step": 1183 + }, + { + "epoch": 0.5258716411281368, + "grad_norm": 1.0021738372524498, + "learning_rate": 9.96993333857265e-06, + "loss": 0.0925, + "step": 1184 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 0.8285101542802605, + "learning_rate": 9.969720693019447e-06, + "loss": 0.0653, + "step": 1185 + }, + { + "epoch": 0.5267599378192316, + "grad_norm": 0.6010536869971059, + "learning_rate": 9.9695073004328e-06, + "loss": 0.0601, + "step": 1186 + }, + { + "epoch": 0.527204086164779, + "grad_norm": 0.43111203380102636, + "learning_rate": 9.969293160844793e-06, + "loss": 0.0435, + "step": 1187 + }, + { + "epoch": 0.5276482345103265, + "grad_norm": 0.7272050961589046, + "learning_rate": 9.969078274287607e-06, + "loss": 0.0582, + "step": 1188 + }, + { + "epoch": 0.5280923828558739, + "grad_norm": 1.0434151274290433, + "learning_rate": 9.968862640793547e-06, + "loss": 0.0882, + "step": 1189 + }, + { + "epoch": 0.5285365312014213, + "grad_norm": 0.6610761510472204, + "learning_rate": 9.968646260395027e-06, + "loss": 0.0701, + "step": 1190 + }, + { + "epoch": 0.5289806795469687, + "grad_norm": 0.7703015492278306, + "learning_rate": 9.96842913312457e-06, + "loss": 0.0755, + "step": 1191 + }, + { + "epoch": 0.5294248278925161, + "grad_norm": 1.1334603205830818, + "learning_rate": 9.968211259014817e-06, + "loss": 0.102, + "step": 1192 + }, + { + "epoch": 0.5298689762380635, + "grad_norm": 0.5680812506173959, + "learning_rate": 9.967992638098517e-06, + "loss": 0.0592, + "step": 1193 + }, + { + "epoch": 0.5303131245836109, + "grad_norm": 0.7128376996533422, + "learning_rate": 9.96777327040853e-06, + "loss": 0.0757, + "step": 1194 + }, + { + "epoch": 0.5307572729291583, + "grad_norm": 0.6547153588619065, + "learning_rate": 9.967553155977833e-06, + "loss": 0.0513, + "step": 1195 + }, + { + "epoch": 0.5312014212747057, + "grad_norm": 0.8276344454564539, + "learning_rate": 9.967332294839514e-06, + "loss": 0.0722, + "step": 1196 + }, + { + "epoch": 0.5316455696202531, + "grad_norm": 0.5942789130977645, + "learning_rate": 9.967110687026769e-06, + "loss": 0.0735, + "step": 1197 + }, + { + "epoch": 0.5320897179658006, + "grad_norm": 0.858420102793841, + "learning_rate": 9.966888332572913e-06, + "loss": 0.0888, + "step": 1198 + }, + { + "epoch": 0.532533866311348, + "grad_norm": 0.5306419562771065, + "learning_rate": 9.966665231511367e-06, + "loss": 0.0735, + "step": 1199 + }, + { + "epoch": 0.5329780146568954, + "grad_norm": 0.8818696703835058, + "learning_rate": 9.96644138387567e-06, + "loss": 0.0836, + "step": 1200 + }, + { + "epoch": 0.5334221630024428, + "grad_norm": 0.6953800899178525, + "learning_rate": 9.966216789699466e-06, + "loss": 0.0609, + "step": 1201 + }, + { + "epoch": 0.5338663113479902, + "grad_norm": 0.6141704558915728, + "learning_rate": 9.965991449016517e-06, + "loss": 0.0593, + "step": 1202 + }, + { + "epoch": 0.5343104596935376, + "grad_norm": 0.7784143890008246, + "learning_rate": 9.965765361860696e-06, + "loss": 0.0682, + "step": 1203 + }, + { + "epoch": 0.534754608039085, + "grad_norm": 0.7591572831776785, + "learning_rate": 9.965538528265986e-06, + "loss": 0.0713, + "step": 1204 + }, + { + "epoch": 0.5351987563846324, + "grad_norm": 1.0112012318529477, + "learning_rate": 9.965310948266488e-06, + "loss": 0.0885, + "step": 1205 + }, + { + "epoch": 0.5356429047301798, + "grad_norm": 0.540867345550289, + "learning_rate": 9.965082621896407e-06, + "loss": 0.0683, + "step": 1206 + }, + { + "epoch": 0.5360870530757272, + "grad_norm": 0.6987948599480821, + "learning_rate": 9.964853549190067e-06, + "loss": 0.0623, + "step": 1207 + }, + { + "epoch": 0.5365312014212748, + "grad_norm": 1.0949702611838528, + "learning_rate": 9.9646237301819e-06, + "loss": 0.0715, + "step": 1208 + }, + { + "epoch": 0.5369753497668222, + "grad_norm": 0.5434822885792773, + "learning_rate": 9.964393164906452e-06, + "loss": 0.077, + "step": 1209 + }, + { + "epoch": 0.5374194981123696, + "grad_norm": 0.785351221183312, + "learning_rate": 9.964161853398381e-06, + "loss": 0.07, + "step": 1210 + }, + { + "epoch": 0.537863646457917, + "grad_norm": 0.5762754098127977, + "learning_rate": 9.963929795692458e-06, + "loss": 0.0553, + "step": 1211 + }, + { + "epoch": 0.5383077948034644, + "grad_norm": 0.7778031310614973, + "learning_rate": 9.963696991823563e-06, + "loss": 0.0671, + "step": 1212 + }, + { + "epoch": 0.5387519431490118, + "grad_norm": 0.7236031416359869, + "learning_rate": 9.963463441826693e-06, + "loss": 0.0861, + "step": 1213 + }, + { + "epoch": 0.5391960914945592, + "grad_norm": 0.8183548985521056, + "learning_rate": 9.963229145736952e-06, + "loss": 0.0905, + "step": 1214 + }, + { + "epoch": 0.5396402398401066, + "grad_norm": 0.8238577196866539, + "learning_rate": 9.96299410358956e-06, + "loss": 0.0791, + "step": 1215 + }, + { + "epoch": 0.540084388185654, + "grad_norm": 0.620826351541747, + "learning_rate": 9.962758315419847e-06, + "loss": 0.0627, + "step": 1216 + }, + { + "epoch": 0.5405285365312015, + "grad_norm": 0.8069644430296897, + "learning_rate": 9.962521781263259e-06, + "loss": 0.0846, + "step": 1217 + }, + { + "epoch": 0.5409726848767489, + "grad_norm": 0.6551620736321753, + "learning_rate": 9.962284501155347e-06, + "loss": 0.0626, + "step": 1218 + }, + { + "epoch": 0.5414168332222963, + "grad_norm": 0.7163185717239511, + "learning_rate": 9.96204647513178e-06, + "loss": 0.0742, + "step": 1219 + }, + { + "epoch": 0.5418609815678437, + "grad_norm": 0.6464830809225138, + "learning_rate": 9.96180770322834e-06, + "loss": 0.0597, + "step": 1220 + }, + { + "epoch": 0.5423051299133911, + "grad_norm": 0.4702533761313221, + "learning_rate": 9.961568185480912e-06, + "loss": 0.0663, + "step": 1221 + }, + { + "epoch": 0.5427492782589385, + "grad_norm": 0.5377074866144479, + "learning_rate": 9.961327921925506e-06, + "loss": 0.0682, + "step": 1222 + }, + { + "epoch": 0.5431934266044859, + "grad_norm": 0.6242182477426378, + "learning_rate": 9.961086912598232e-06, + "loss": 0.0558, + "step": 1223 + }, + { + "epoch": 0.5436375749500333, + "grad_norm": 0.919842928693314, + "learning_rate": 9.960845157535324e-06, + "loss": 0.0979, + "step": 1224 + }, + { + "epoch": 0.5440817232955807, + "grad_norm": 0.4865673297809684, + "learning_rate": 9.960602656773118e-06, + "loss": 0.0591, + "step": 1225 + }, + { + "epoch": 0.5445258716411281, + "grad_norm": 0.5994196236588832, + "learning_rate": 9.960359410348066e-06, + "loss": 0.06, + "step": 1226 + }, + { + "epoch": 0.5449700199866756, + "grad_norm": 0.6813026653368286, + "learning_rate": 9.960115418296734e-06, + "loss": 0.0563, + "step": 1227 + }, + { + "epoch": 0.545414168332223, + "grad_norm": 0.4200726079302436, + "learning_rate": 9.959870680655797e-06, + "loss": 0.0448, + "step": 1228 + }, + { + "epoch": 0.5458583166777704, + "grad_norm": 1.3484707743369218, + "learning_rate": 9.959625197462042e-06, + "loss": 0.058, + "step": 1229 + }, + { + "epoch": 0.5463024650233178, + "grad_norm": 0.91837601072641, + "learning_rate": 9.959378968752371e-06, + "loss": 0.1046, + "step": 1230 + }, + { + "epoch": 0.5467466133688652, + "grad_norm": 0.7675630782674875, + "learning_rate": 9.959131994563795e-06, + "loss": 0.0738, + "step": 1231 + }, + { + "epoch": 0.5471907617144126, + "grad_norm": 0.5684812066450396, + "learning_rate": 9.958884274933442e-06, + "loss": 0.0699, + "step": 1232 + }, + { + "epoch": 0.54763491005996, + "grad_norm": 0.6471523630810303, + "learning_rate": 9.958635809898544e-06, + "loss": 0.0713, + "step": 1233 + }, + { + "epoch": 0.5480790584055074, + "grad_norm": 0.6967540553492192, + "learning_rate": 9.95838659949645e-06, + "loss": 0.0776, + "step": 1234 + }, + { + "epoch": 0.5485232067510548, + "grad_norm": 0.6469245863206669, + "learning_rate": 9.958136643764624e-06, + "loss": 0.0764, + "step": 1235 + }, + { + "epoch": 0.5489673550966022, + "grad_norm": 0.8210874368087515, + "learning_rate": 9.957885942740635e-06, + "loss": 0.0779, + "step": 1236 + }, + { + "epoch": 0.5494115034421497, + "grad_norm": 0.6891702613627725, + "learning_rate": 9.957634496462169e-06, + "loss": 0.0575, + "step": 1237 + }, + { + "epoch": 0.5498556517876971, + "grad_norm": 0.5415934981486168, + "learning_rate": 9.957382304967024e-06, + "loss": 0.0653, + "step": 1238 + }, + { + "epoch": 0.5502998001332445, + "grad_norm": 0.606098328678092, + "learning_rate": 9.957129368293108e-06, + "loss": 0.0578, + "step": 1239 + }, + { + "epoch": 0.5507439484787919, + "grad_norm": 0.5816870040354246, + "learning_rate": 9.95687568647844e-06, + "loss": 0.0543, + "step": 1240 + }, + { + "epoch": 0.5511880968243393, + "grad_norm": 0.7853352727315064, + "learning_rate": 9.956621259561152e-06, + "loss": 0.0731, + "step": 1241 + }, + { + "epoch": 0.5516322451698867, + "grad_norm": 0.6165368112917523, + "learning_rate": 9.956366087579492e-06, + "loss": 0.0688, + "step": 1242 + }, + { + "epoch": 0.5520763935154341, + "grad_norm": 0.8107389760561104, + "learning_rate": 9.956110170571816e-06, + "loss": 0.0823, + "step": 1243 + }, + { + "epoch": 0.5525205418609815, + "grad_norm": 0.6920576051280234, + "learning_rate": 9.95585350857659e-06, + "loss": 0.0661, + "step": 1244 + }, + { + "epoch": 0.5529646902065289, + "grad_norm": 0.6032293972841873, + "learning_rate": 9.9555961016324e-06, + "loss": 0.0629, + "step": 1245 + }, + { + "epoch": 0.5534088385520763, + "grad_norm": 0.7423430185463719, + "learning_rate": 9.955337949777931e-06, + "loss": 0.0781, + "step": 1246 + }, + { + "epoch": 0.5538529868976239, + "grad_norm": 0.7004081423242674, + "learning_rate": 9.955079053051992e-06, + "loss": 0.0695, + "step": 1247 + }, + { + "epoch": 0.5542971352431713, + "grad_norm": 0.6468930634930644, + "learning_rate": 9.9548194114935e-06, + "loss": 0.0549, + "step": 1248 + }, + { + "epoch": 0.5547412835887187, + "grad_norm": 0.4810505823476616, + "learning_rate": 9.954559025141484e-06, + "loss": 0.0514, + "step": 1249 + }, + { + "epoch": 0.5551854319342661, + "grad_norm": 0.802783907262409, + "learning_rate": 9.95429789403508e-06, + "loss": 0.1085, + "step": 1250 + }, + { + "epoch": 0.5556295802798135, + "grad_norm": 0.5353100797059109, + "learning_rate": 9.954036018213548e-06, + "loss": 0.0448, + "step": 1251 + }, + { + "epoch": 0.5560737286253609, + "grad_norm": 0.5946904373037035, + "learning_rate": 9.953773397716247e-06, + "loss": 0.0724, + "step": 1252 + }, + { + "epoch": 0.5565178769709083, + "grad_norm": 1.0490812743358358, + "learning_rate": 9.953510032582652e-06, + "loss": 0.0675, + "step": 1253 + }, + { + "epoch": 0.5569620253164557, + "grad_norm": 0.5087723175679687, + "learning_rate": 9.953245922852355e-06, + "loss": 0.0599, + "step": 1254 + }, + { + "epoch": 0.5574061736620031, + "grad_norm": 0.5243033641589454, + "learning_rate": 9.952981068565055e-06, + "loss": 0.0569, + "step": 1255 + }, + { + "epoch": 0.5578503220075505, + "grad_norm": 0.7607475927289213, + "learning_rate": 9.952715469760566e-06, + "loss": 0.0775, + "step": 1256 + }, + { + "epoch": 0.558294470353098, + "grad_norm": 0.8559494882081514, + "learning_rate": 9.952449126478808e-06, + "loss": 0.0866, + "step": 1257 + }, + { + "epoch": 0.5587386186986454, + "grad_norm": 0.7656790089546245, + "learning_rate": 9.952182038759818e-06, + "loss": 0.0722, + "step": 1258 + }, + { + "epoch": 0.5591827670441928, + "grad_norm": 0.5660284655001593, + "learning_rate": 9.951914206643744e-06, + "loss": 0.0564, + "step": 1259 + }, + { + "epoch": 0.5596269153897402, + "grad_norm": 0.5835362013761695, + "learning_rate": 9.95164563017085e-06, + "loss": 0.0788, + "step": 1260 + }, + { + "epoch": 0.5600710637352876, + "grad_norm": 0.6769019916334489, + "learning_rate": 9.951376309381502e-06, + "loss": 0.0594, + "step": 1261 + }, + { + "epoch": 0.560515212080835, + "grad_norm": 0.5253934078988655, + "learning_rate": 9.951106244316184e-06, + "loss": 0.053, + "step": 1262 + }, + { + "epoch": 0.5609593604263824, + "grad_norm": 0.7300022918221285, + "learning_rate": 9.950835435015495e-06, + "loss": 0.0651, + "step": 1263 + }, + { + "epoch": 0.5614035087719298, + "grad_norm": 0.5278089608881616, + "learning_rate": 9.95056388152014e-06, + "loss": 0.0555, + "step": 1264 + }, + { + "epoch": 0.5618476571174772, + "grad_norm": 0.6301552291199896, + "learning_rate": 9.950291583870938e-06, + "loss": 0.0862, + "step": 1265 + }, + { + "epoch": 0.5622918054630246, + "grad_norm": 0.7781648389786646, + "learning_rate": 9.950018542108818e-06, + "loss": 0.0699, + "step": 1266 + }, + { + "epoch": 0.5627359538085721, + "grad_norm": 0.6889192633019509, + "learning_rate": 9.949744756274828e-06, + "loss": 0.059, + "step": 1267 + }, + { + "epoch": 0.5631801021541195, + "grad_norm": 0.84114983491113, + "learning_rate": 9.94947022641012e-06, + "loss": 0.0855, + "step": 1268 + }, + { + "epoch": 0.5636242504996669, + "grad_norm": 0.6070250007384902, + "learning_rate": 9.949194952555958e-06, + "loss": 0.0635, + "step": 1269 + }, + { + "epoch": 0.5640683988452143, + "grad_norm": 0.8282717974485503, + "learning_rate": 9.948918934753724e-06, + "loss": 0.0606, + "step": 1270 + }, + { + "epoch": 0.5645125471907617, + "grad_norm": 0.8327690010964344, + "learning_rate": 9.948642173044906e-06, + "loss": 0.0617, + "step": 1271 + }, + { + "epoch": 0.5649566955363091, + "grad_norm": 0.6842942727397727, + "learning_rate": 9.948364667471106e-06, + "loss": 0.0795, + "step": 1272 + }, + { + "epoch": 0.5654008438818565, + "grad_norm": 0.5302443265967214, + "learning_rate": 9.94808641807404e-06, + "loss": 0.0512, + "step": 1273 + }, + { + "epoch": 0.5658449922274039, + "grad_norm": 0.6110171632056333, + "learning_rate": 9.94780742489553e-06, + "loss": 0.0574, + "step": 1274 + }, + { + "epoch": 0.5662891405729513, + "grad_norm": 0.5491078913412093, + "learning_rate": 9.947527687977519e-06, + "loss": 0.0512, + "step": 1275 + }, + { + "epoch": 0.5667332889184987, + "grad_norm": 0.6269674055110014, + "learning_rate": 9.94724720736205e-06, + "loss": 0.074, + "step": 1276 + }, + { + "epoch": 0.5671774372640462, + "grad_norm": 0.7276246324646, + "learning_rate": 9.946965983091286e-06, + "loss": 0.1055, + "step": 1277 + }, + { + "epoch": 0.5676215856095936, + "grad_norm": 0.7755784175713893, + "learning_rate": 9.946684015207501e-06, + "loss": 0.0878, + "step": 1278 + }, + { + "epoch": 0.568065733955141, + "grad_norm": 0.6682673092927641, + "learning_rate": 9.94640130375308e-06, + "loss": 0.0985, + "step": 1279 + }, + { + "epoch": 0.5685098823006884, + "grad_norm": 0.71248575096518, + "learning_rate": 9.946117848770518e-06, + "loss": 0.0546, + "step": 1280 + }, + { + "epoch": 0.5689540306462358, + "grad_norm": 0.6304803324875314, + "learning_rate": 9.945833650302423e-06, + "loss": 0.0645, + "step": 1281 + }, + { + "epoch": 0.5693981789917832, + "grad_norm": 0.6709593851017193, + "learning_rate": 9.945548708391517e-06, + "loss": 0.0711, + "step": 1282 + }, + { + "epoch": 0.5698423273373306, + "grad_norm": 0.6036373550762799, + "learning_rate": 9.94526302308063e-06, + "loss": 0.0766, + "step": 1283 + }, + { + "epoch": 0.570286475682878, + "grad_norm": 0.6055441845545156, + "learning_rate": 9.944976594412702e-06, + "loss": 0.0663, + "step": 1284 + }, + { + "epoch": 0.5707306240284254, + "grad_norm": 1.1523396950548679, + "learning_rate": 9.944689422430794e-06, + "loss": 0.0876, + "step": 1285 + }, + { + "epoch": 0.571174772373973, + "grad_norm": 0.8498417793676747, + "learning_rate": 9.94440150717807e-06, + "loss": 0.0764, + "step": 1286 + }, + { + "epoch": 0.5716189207195204, + "grad_norm": 0.5983413012513809, + "learning_rate": 9.944112848697809e-06, + "loss": 0.0564, + "step": 1287 + }, + { + "epoch": 0.5720630690650678, + "grad_norm": 0.7260849111122668, + "learning_rate": 9.9438234470334e-06, + "loss": 0.0728, + "step": 1288 + }, + { + "epoch": 0.5725072174106152, + "grad_norm": 1.0050849305541534, + "learning_rate": 9.943533302228346e-06, + "loss": 0.0711, + "step": 1289 + }, + { + "epoch": 0.5729513657561626, + "grad_norm": 0.800396030085453, + "learning_rate": 9.943242414326263e-06, + "loss": 0.0724, + "step": 1290 + }, + { + "epoch": 0.57339551410171, + "grad_norm": 0.9224090719942251, + "learning_rate": 9.94295078337087e-06, + "loss": 0.1002, + "step": 1291 + }, + { + "epoch": 0.5738396624472574, + "grad_norm": 0.8382142062568964, + "learning_rate": 9.942658409406012e-06, + "loss": 0.0756, + "step": 1292 + }, + { + "epoch": 0.5742838107928048, + "grad_norm": 0.5876570577747893, + "learning_rate": 9.942365292475632e-06, + "loss": 0.0605, + "step": 1293 + }, + { + "epoch": 0.5747279591383522, + "grad_norm": 0.9775545409759507, + "learning_rate": 9.942071432623794e-06, + "loss": 0.0786, + "step": 1294 + }, + { + "epoch": 0.5751721074838996, + "grad_norm": 0.6598504226159471, + "learning_rate": 9.941776829894667e-06, + "loss": 0.0847, + "step": 1295 + }, + { + "epoch": 0.5756162558294471, + "grad_norm": 0.7984485905609663, + "learning_rate": 9.941481484332537e-06, + "loss": 0.0699, + "step": 1296 + }, + { + "epoch": 0.5760604041749945, + "grad_norm": 0.6762263367814108, + "learning_rate": 9.941185395981799e-06, + "loss": 0.0786, + "step": 1297 + }, + { + "epoch": 0.5765045525205419, + "grad_norm": 0.7731297634993985, + "learning_rate": 9.940888564886959e-06, + "loss": 0.0673, + "step": 1298 + }, + { + "epoch": 0.5769487008660893, + "grad_norm": 0.919743609676064, + "learning_rate": 9.940590991092639e-06, + "loss": 0.065, + "step": 1299 + }, + { + "epoch": 0.5773928492116367, + "grad_norm": 0.6382780908912015, + "learning_rate": 9.940292674643564e-06, + "loss": 0.0578, + "step": 1300 + }, + { + "epoch": 0.5778369975571841, + "grad_norm": 0.6620249742612588, + "learning_rate": 9.93999361558458e-06, + "loss": 0.0635, + "step": 1301 + }, + { + "epoch": 0.5782811459027315, + "grad_norm": 0.7452135944776405, + "learning_rate": 9.93969381396064e-06, + "loss": 0.0806, + "step": 1302 + }, + { + "epoch": 0.5787252942482789, + "grad_norm": 0.6403437438856001, + "learning_rate": 9.93939326981681e-06, + "loss": 0.0687, + "step": 1303 + }, + { + "epoch": 0.5791694425938263, + "grad_norm": 1.1014917271306792, + "learning_rate": 9.939091983198266e-06, + "loss": 0.098, + "step": 1304 + }, + { + "epoch": 0.5796135909393737, + "grad_norm": 1.0225638964559536, + "learning_rate": 9.938789954150296e-06, + "loss": 0.0644, + "step": 1305 + }, + { + "epoch": 0.5800577392849212, + "grad_norm": 0.637383310530645, + "learning_rate": 9.9384871827183e-06, + "loss": 0.06, + "step": 1306 + }, + { + "epoch": 0.5805018876304686, + "grad_norm": 0.6546930860338038, + "learning_rate": 9.93818366894779e-06, + "loss": 0.0681, + "step": 1307 + }, + { + "epoch": 0.580946035976016, + "grad_norm": 0.5341000978856885, + "learning_rate": 9.93787941288439e-06, + "loss": 0.0589, + "step": 1308 + }, + { + "epoch": 0.5813901843215634, + "grad_norm": 0.5540761577564739, + "learning_rate": 9.937574414573834e-06, + "loss": 0.07, + "step": 1309 + }, + { + "epoch": 0.5818343326671108, + "grad_norm": 0.6399098162524178, + "learning_rate": 9.937268674061968e-06, + "loss": 0.0681, + "step": 1310 + }, + { + "epoch": 0.5822784810126582, + "grad_norm": 0.9788827379867293, + "learning_rate": 9.936962191394753e-06, + "loss": 0.0775, + "step": 1311 + }, + { + "epoch": 0.5827226293582056, + "grad_norm": 0.5825969221377589, + "learning_rate": 9.936654966618255e-06, + "loss": 0.0662, + "step": 1312 + }, + { + "epoch": 0.583166777703753, + "grad_norm": 0.6990166610409554, + "learning_rate": 9.936346999778657e-06, + "loss": 0.0748, + "step": 1313 + }, + { + "epoch": 0.5836109260493004, + "grad_norm": 0.7726036624010885, + "learning_rate": 9.93603829092225e-06, + "loss": 0.0806, + "step": 1314 + }, + { + "epoch": 0.5840550743948478, + "grad_norm": 0.5892730393957506, + "learning_rate": 9.93572884009544e-06, + "loss": 0.0687, + "step": 1315 + }, + { + "epoch": 0.5844992227403953, + "grad_norm": 0.7324361900970796, + "learning_rate": 9.935418647344741e-06, + "loss": 0.0722, + "step": 1316 + }, + { + "epoch": 0.5849433710859427, + "grad_norm": 0.8954678705930711, + "learning_rate": 9.935107712716781e-06, + "loss": 0.0829, + "step": 1317 + }, + { + "epoch": 0.5853875194314901, + "grad_norm": 0.5986143926701439, + "learning_rate": 9.9347960362583e-06, + "loss": 0.068, + "step": 1318 + }, + { + "epoch": 0.5858316677770375, + "grad_norm": 0.6105531585151114, + "learning_rate": 9.934483618016148e-06, + "loss": 0.0719, + "step": 1319 + }, + { + "epoch": 0.586275816122585, + "grad_norm": 0.7814579585476528, + "learning_rate": 9.934170458037285e-06, + "loss": 0.0899, + "step": 1320 + }, + { + "epoch": 0.5867199644681323, + "grad_norm": 0.6130620918486331, + "learning_rate": 9.933856556368785e-06, + "loss": 0.0663, + "step": 1321 + }, + { + "epoch": 0.5871641128136798, + "grad_norm": 0.7911643262947309, + "learning_rate": 9.933541913057833e-06, + "loss": 0.0782, + "step": 1322 + }, + { + "epoch": 0.5876082611592272, + "grad_norm": 0.7091228860294103, + "learning_rate": 9.933226528151725e-06, + "loss": 0.0637, + "step": 1323 + }, + { + "epoch": 0.5880524095047746, + "grad_norm": 0.7999422262730185, + "learning_rate": 9.93291040169787e-06, + "loss": 0.0616, + "step": 1324 + }, + { + "epoch": 0.588496557850322, + "grad_norm": 0.7189040382281308, + "learning_rate": 9.932593533743786e-06, + "loss": 0.0602, + "step": 1325 + }, + { + "epoch": 0.5889407061958695, + "grad_norm": 0.5837515830643734, + "learning_rate": 9.932275924337104e-06, + "loss": 0.0586, + "step": 1326 + }, + { + "epoch": 0.5893848545414169, + "grad_norm": 0.8281667273984812, + "learning_rate": 9.931957573525566e-06, + "loss": 0.0648, + "step": 1327 + }, + { + "epoch": 0.5898290028869643, + "grad_norm": 0.5286485536715172, + "learning_rate": 9.931638481357024e-06, + "loss": 0.0531, + "step": 1328 + }, + { + "epoch": 0.5902731512325117, + "grad_norm": 0.6275875445640038, + "learning_rate": 9.931318647879445e-06, + "loss": 0.064, + "step": 1329 + }, + { + "epoch": 0.5907172995780591, + "grad_norm": 0.8183874857273685, + "learning_rate": 9.930998073140905e-06, + "loss": 0.1023, + "step": 1330 + }, + { + "epoch": 0.5911614479236065, + "grad_norm": 0.6557543608908961, + "learning_rate": 9.93067675718959e-06, + "loss": 0.0591, + "step": 1331 + }, + { + "epoch": 0.5916055962691539, + "grad_norm": 0.7946472702361359, + "learning_rate": 9.930354700073803e-06, + "loss": 0.0783, + "step": 1332 + }, + { + "epoch": 0.5920497446147013, + "grad_norm": 0.4755584849969953, + "learning_rate": 9.930031901841952e-06, + "loss": 0.065, + "step": 1333 + }, + { + "epoch": 0.5924938929602487, + "grad_norm": 0.6292133058734393, + "learning_rate": 9.929708362542559e-06, + "loss": 0.065, + "step": 1334 + }, + { + "epoch": 0.5929380413057961, + "grad_norm": 0.7194753693955848, + "learning_rate": 9.929384082224258e-06, + "loss": 0.0649, + "step": 1335 + }, + { + "epoch": 0.5933821896513436, + "grad_norm": 0.8182929882444262, + "learning_rate": 9.929059060935795e-06, + "loss": 0.0735, + "step": 1336 + }, + { + "epoch": 0.593826337996891, + "grad_norm": 0.6452166028375979, + "learning_rate": 9.928733298726024e-06, + "loss": 0.0773, + "step": 1337 + }, + { + "epoch": 0.5942704863424384, + "grad_norm": 0.6174785647804547, + "learning_rate": 9.928406795643913e-06, + "loss": 0.088, + "step": 1338 + }, + { + "epoch": 0.5947146346879858, + "grad_norm": 0.9075257756258466, + "learning_rate": 9.928079551738542e-06, + "loss": 0.0966, + "step": 1339 + }, + { + "epoch": 0.5951587830335332, + "grad_norm": 0.7930757535133779, + "learning_rate": 9.927751567059103e-06, + "loss": 0.0788, + "step": 1340 + }, + { + "epoch": 0.5956029313790806, + "grad_norm": 0.7918355971015104, + "learning_rate": 9.927422841654894e-06, + "loss": 0.0732, + "step": 1341 + }, + { + "epoch": 0.596047079724628, + "grad_norm": 0.5344795042621896, + "learning_rate": 9.92709337557533e-06, + "loss": 0.0555, + "step": 1342 + }, + { + "epoch": 0.5964912280701754, + "grad_norm": 0.7151020498309698, + "learning_rate": 9.926763168869935e-06, + "loss": 0.0699, + "step": 1343 + }, + { + "epoch": 0.5969353764157228, + "grad_norm": 0.8228281453238034, + "learning_rate": 9.926432221588342e-06, + "loss": 0.0723, + "step": 1344 + }, + { + "epoch": 0.5973795247612703, + "grad_norm": 0.6976750613205994, + "learning_rate": 9.926100533780304e-06, + "loss": 0.0877, + "step": 1345 + }, + { + "epoch": 0.5978236731068177, + "grad_norm": 0.8796022908023586, + "learning_rate": 9.925768105495675e-06, + "loss": 0.0867, + "step": 1346 + }, + { + "epoch": 0.5982678214523651, + "grad_norm": 0.6214669294617633, + "learning_rate": 9.925434936784426e-06, + "loss": 0.0724, + "step": 1347 + }, + { + "epoch": 0.5987119697979125, + "grad_norm": 0.5493814047261378, + "learning_rate": 9.925101027696636e-06, + "loss": 0.0546, + "step": 1348 + }, + { + "epoch": 0.5991561181434599, + "grad_norm": 0.6738873954295155, + "learning_rate": 9.924766378282499e-06, + "loss": 0.068, + "step": 1349 + }, + { + "epoch": 0.5996002664890073, + "grad_norm": 0.662002450028893, + "learning_rate": 9.92443098859232e-06, + "loss": 0.0618, + "step": 1350 + }, + { + "epoch": 0.6000444148345547, + "grad_norm": 0.5691844703394149, + "learning_rate": 9.92409485867651e-06, + "loss": 0.0675, + "step": 1351 + }, + { + "epoch": 0.6004885631801021, + "grad_norm": 0.5107643154244839, + "learning_rate": 9.923757988585599e-06, + "loss": 0.0582, + "step": 1352 + }, + { + "epoch": 0.6009327115256495, + "grad_norm": 0.4892180420795259, + "learning_rate": 9.923420378370221e-06, + "loss": 0.0707, + "step": 1353 + }, + { + "epoch": 0.6013768598711969, + "grad_norm": 0.5128501684775312, + "learning_rate": 9.923082028081125e-06, + "loss": 0.0515, + "step": 1354 + }, + { + "epoch": 0.6018210082167444, + "grad_norm": 0.6712578771133637, + "learning_rate": 9.922742937769172e-06, + "loss": 0.0668, + "step": 1355 + }, + { + "epoch": 0.6022651565622918, + "grad_norm": 0.8489596826538364, + "learning_rate": 9.922403107485335e-06, + "loss": 0.0852, + "step": 1356 + }, + { + "epoch": 0.6027093049078392, + "grad_norm": 0.7130581914563381, + "learning_rate": 9.922062537280692e-06, + "loss": 0.0877, + "step": 1357 + }, + { + "epoch": 0.6031534532533867, + "grad_norm": 0.5999507766881641, + "learning_rate": 9.921721227206438e-06, + "loss": 0.0693, + "step": 1358 + }, + { + "epoch": 0.603597601598934, + "grad_norm": 0.5747773135899322, + "learning_rate": 9.92137917731388e-06, + "loss": 0.0556, + "step": 1359 + }, + { + "epoch": 0.6040417499444815, + "grad_norm": 0.7771993716251702, + "learning_rate": 9.921036387654429e-06, + "loss": 0.0643, + "step": 1360 + }, + { + "epoch": 0.6044858982900289, + "grad_norm": 1.307956020844314, + "learning_rate": 9.920692858279616e-06, + "loss": 0.0551, + "step": 1361 + }, + { + "epoch": 0.6049300466355763, + "grad_norm": 0.6255379323591398, + "learning_rate": 9.92034858924108e-06, + "loss": 0.0636, + "step": 1362 + }, + { + "epoch": 0.6053741949811237, + "grad_norm": 0.553962551501975, + "learning_rate": 9.92000358059057e-06, + "loss": 0.0814, + "step": 1363 + }, + { + "epoch": 0.6058183433266711, + "grad_norm": 0.7454747633443178, + "learning_rate": 9.919657832379943e-06, + "loss": 0.0603, + "step": 1364 + }, + { + "epoch": 0.6062624916722186, + "grad_norm": 0.9733260117481642, + "learning_rate": 9.919311344661174e-06, + "loss": 0.1153, + "step": 1365 + }, + { + "epoch": 0.606706640017766, + "grad_norm": 0.613902681985122, + "learning_rate": 9.918964117486346e-06, + "loss": 0.0578, + "step": 1366 + }, + { + "epoch": 0.6071507883633134, + "grad_norm": 0.7376648326818326, + "learning_rate": 9.918616150907651e-06, + "loss": 0.066, + "step": 1367 + }, + { + "epoch": 0.6075949367088608, + "grad_norm": 0.6476167687641429, + "learning_rate": 9.918267444977398e-06, + "loss": 0.0467, + "step": 1368 + }, + { + "epoch": 0.6080390850544082, + "grad_norm": 0.6720043569712078, + "learning_rate": 9.917917999747999e-06, + "loss": 0.0647, + "step": 1369 + }, + { + "epoch": 0.6084832333999556, + "grad_norm": 0.6554718376719761, + "learning_rate": 9.917567815271986e-06, + "loss": 0.0652, + "step": 1370 + }, + { + "epoch": 0.608927381745503, + "grad_norm": 0.7668531453750762, + "learning_rate": 9.917216891601996e-06, + "loss": 0.07, + "step": 1371 + }, + { + "epoch": 0.6093715300910504, + "grad_norm": 0.8088811507082688, + "learning_rate": 9.916865228790776e-06, + "loss": 0.0771, + "step": 1372 + }, + { + "epoch": 0.6098156784365978, + "grad_norm": 0.687085496356637, + "learning_rate": 9.91651282689119e-06, + "loss": 0.0758, + "step": 1373 + }, + { + "epoch": 0.6102598267821452, + "grad_norm": 0.7118263597209087, + "learning_rate": 9.916159685956208e-06, + "loss": 0.0704, + "step": 1374 + }, + { + "epoch": 0.6107039751276927, + "grad_norm": 0.5722664877188374, + "learning_rate": 9.915805806038917e-06, + "loss": 0.0568, + "step": 1375 + }, + { + "epoch": 0.6111481234732401, + "grad_norm": 0.5809145124340924, + "learning_rate": 9.915451187192507e-06, + "loss": 0.0508, + "step": 1376 + }, + { + "epoch": 0.6115922718187875, + "grad_norm": 0.7936146709815162, + "learning_rate": 9.915095829470284e-06, + "loss": 0.0685, + "step": 1377 + }, + { + "epoch": 0.6120364201643349, + "grad_norm": 0.6343789137617302, + "learning_rate": 9.914739732925665e-06, + "loss": 0.0727, + "step": 1378 + }, + { + "epoch": 0.6124805685098823, + "grad_norm": 0.6049507318538985, + "learning_rate": 9.914382897612178e-06, + "loss": 0.0613, + "step": 1379 + }, + { + "epoch": 0.6129247168554297, + "grad_norm": 0.7820325942346816, + "learning_rate": 9.91402532358346e-06, + "loss": 0.0631, + "step": 1380 + }, + { + "epoch": 0.6133688652009771, + "grad_norm": 0.647686549326212, + "learning_rate": 9.913667010893261e-06, + "loss": 0.0593, + "step": 1381 + }, + { + "epoch": 0.6138130135465245, + "grad_norm": 0.5699910207599692, + "learning_rate": 9.913307959595443e-06, + "loss": 0.0617, + "step": 1382 + }, + { + "epoch": 0.6142571618920719, + "grad_norm": 0.8214987498163691, + "learning_rate": 9.912948169743977e-06, + "loss": 0.1062, + "step": 1383 + }, + { + "epoch": 0.6147013102376193, + "grad_norm": 0.7190447874189771, + "learning_rate": 9.912587641392943e-06, + "loss": 0.0816, + "step": 1384 + }, + { + "epoch": 0.6151454585831668, + "grad_norm": 0.4801110982287438, + "learning_rate": 9.912226374596536e-06, + "loss": 0.0531, + "step": 1385 + }, + { + "epoch": 0.6155896069287142, + "grad_norm": 0.5777279849946639, + "learning_rate": 9.911864369409062e-06, + "loss": 0.0593, + "step": 1386 + }, + { + "epoch": 0.6160337552742616, + "grad_norm": 0.6445627055234285, + "learning_rate": 9.911501625884934e-06, + "loss": 0.0562, + "step": 1387 + }, + { + "epoch": 0.616477903619809, + "grad_norm": 0.6402760221505619, + "learning_rate": 9.911138144078681e-06, + "loss": 0.0628, + "step": 1388 + }, + { + "epoch": 0.6169220519653564, + "grad_norm": 0.5943075838810645, + "learning_rate": 9.910773924044937e-06, + "loss": 0.0694, + "step": 1389 + }, + { + "epoch": 0.6173662003109038, + "grad_norm": 0.6887164265976037, + "learning_rate": 9.910408965838455e-06, + "loss": 0.0662, + "step": 1390 + }, + { + "epoch": 0.6178103486564512, + "grad_norm": 0.7025269495789147, + "learning_rate": 9.91004326951409e-06, + "loss": 0.0596, + "step": 1391 + }, + { + "epoch": 0.6182544970019986, + "grad_norm": 0.6968614560116964, + "learning_rate": 9.909676835126819e-06, + "loss": 0.08, + "step": 1392 + }, + { + "epoch": 0.618698645347546, + "grad_norm": 0.9599784608542535, + "learning_rate": 9.909309662731713e-06, + "loss": 0.0981, + "step": 1393 + }, + { + "epoch": 0.6191427936930934, + "grad_norm": 0.5936127126118025, + "learning_rate": 9.908941752383974e-06, + "loss": 0.0502, + "step": 1394 + }, + { + "epoch": 0.619586942038641, + "grad_norm": 0.6281665293752687, + "learning_rate": 9.9085731041389e-06, + "loss": 0.0894, + "step": 1395 + }, + { + "epoch": 0.6200310903841884, + "grad_norm": 0.7270920690197641, + "learning_rate": 9.908203718051907e-06, + "loss": 0.0772, + "step": 1396 + }, + { + "epoch": 0.6204752387297358, + "grad_norm": 0.9689326115460888, + "learning_rate": 9.90783359417852e-06, + "loss": 0.0955, + "step": 1397 + }, + { + "epoch": 0.6209193870752832, + "grad_norm": 0.5718494750591436, + "learning_rate": 9.907462732574373e-06, + "loss": 0.0586, + "step": 1398 + }, + { + "epoch": 0.6213635354208306, + "grad_norm": 0.547133748335618, + "learning_rate": 9.907091133295214e-06, + "loss": 0.0546, + "step": 1399 + }, + { + "epoch": 0.621807683766378, + "grad_norm": 0.7492139302396027, + "learning_rate": 9.906718796396901e-06, + "loss": 0.0726, + "step": 1400 + }, + { + "epoch": 0.6222518321119254, + "grad_norm": 1.1332966961373652, + "learning_rate": 9.906345721935402e-06, + "loss": 0.0837, + "step": 1401 + }, + { + "epoch": 0.6226959804574728, + "grad_norm": 0.6506799230647055, + "learning_rate": 9.905971909966798e-06, + "loss": 0.0676, + "step": 1402 + }, + { + "epoch": 0.6231401288030202, + "grad_norm": 0.7078271615105967, + "learning_rate": 9.905597360547276e-06, + "loss": 0.0763, + "step": 1403 + }, + { + "epoch": 0.6235842771485676, + "grad_norm": 0.8061592052669874, + "learning_rate": 9.90522207373314e-06, + "loss": 0.0941, + "step": 1404 + }, + { + "epoch": 0.6240284254941151, + "grad_norm": 0.7215188703499849, + "learning_rate": 9.904846049580804e-06, + "loss": 0.0913, + "step": 1405 + }, + { + "epoch": 0.6244725738396625, + "grad_norm": 0.7648810068346831, + "learning_rate": 9.904469288146785e-06, + "loss": 0.103, + "step": 1406 + }, + { + "epoch": 0.6249167221852099, + "grad_norm": 0.7848759400980109, + "learning_rate": 9.90409178948772e-06, + "loss": 0.0853, + "step": 1407 + }, + { + "epoch": 0.6253608705307573, + "grad_norm": 0.5249169314876924, + "learning_rate": 9.903713553660352e-06, + "loss": 0.0539, + "step": 1408 + }, + { + "epoch": 0.6258050188763047, + "grad_norm": 0.651696006803284, + "learning_rate": 9.90333458072154e-06, + "loss": 0.0725, + "step": 1409 + }, + { + "epoch": 0.6262491672218521, + "grad_norm": 0.5581372640238701, + "learning_rate": 9.902954870728246e-06, + "loss": 0.0516, + "step": 1410 + }, + { + "epoch": 0.6266933155673995, + "grad_norm": 0.46394395658623727, + "learning_rate": 9.902574423737547e-06, + "loss": 0.0543, + "step": 1411 + }, + { + "epoch": 0.6271374639129469, + "grad_norm": 0.6101555748112776, + "learning_rate": 9.902193239806634e-06, + "loss": 0.0688, + "step": 1412 + }, + { + "epoch": 0.6275816122584943, + "grad_norm": 0.6266576234326965, + "learning_rate": 9.901811318992802e-06, + "loss": 0.0619, + "step": 1413 + }, + { + "epoch": 0.6280257606040418, + "grad_norm": 0.9093069033662159, + "learning_rate": 9.901428661353462e-06, + "loss": 0.1009, + "step": 1414 + }, + { + "epoch": 0.6284699089495892, + "grad_norm": 0.796133446241698, + "learning_rate": 9.901045266946134e-06, + "loss": 0.0919, + "step": 1415 + }, + { + "epoch": 0.6289140572951366, + "grad_norm": 0.75094830010195, + "learning_rate": 9.900661135828448e-06, + "loss": 0.0594, + "step": 1416 + }, + { + "epoch": 0.629358205640684, + "grad_norm": 0.8379496784608408, + "learning_rate": 9.900276268058147e-06, + "loss": 0.0659, + "step": 1417 + }, + { + "epoch": 0.6298023539862314, + "grad_norm": 0.7794466397530133, + "learning_rate": 9.899890663693078e-06, + "loss": 0.086, + "step": 1418 + }, + { + "epoch": 0.6302465023317788, + "grad_norm": 1.781154839834256, + "learning_rate": 9.899504322791212e-06, + "loss": 0.1089, + "step": 1419 + }, + { + "epoch": 0.6306906506773262, + "grad_norm": 0.582797339406938, + "learning_rate": 9.899117245410615e-06, + "loss": 0.0449, + "step": 1420 + }, + { + "epoch": 0.6311347990228736, + "grad_norm": 0.7463622614388931, + "learning_rate": 9.898729431609477e-06, + "loss": 0.0542, + "step": 1421 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 0.7697052470584549, + "learning_rate": 9.89834088144609e-06, + "loss": 0.0635, + "step": 1422 + }, + { + "epoch": 0.6320230957139684, + "grad_norm": 0.900347396335065, + "learning_rate": 9.897951594978858e-06, + "loss": 0.0778, + "step": 1423 + }, + { + "epoch": 0.6324672440595159, + "grad_norm": 0.6468712468526486, + "learning_rate": 9.897561572266301e-06, + "loss": 0.066, + "step": 1424 + }, + { + "epoch": 0.6329113924050633, + "grad_norm": 0.4986782583243913, + "learning_rate": 9.897170813367045e-06, + "loss": 0.0662, + "step": 1425 + }, + { + "epoch": 0.6333555407506107, + "grad_norm": 0.6014710971137512, + "learning_rate": 9.896779318339826e-06, + "loss": 0.0657, + "step": 1426 + }, + { + "epoch": 0.6337996890961581, + "grad_norm": 0.8579597362562785, + "learning_rate": 9.896387087243496e-06, + "loss": 0.08, + "step": 1427 + }, + { + "epoch": 0.6342438374417055, + "grad_norm": 0.9386752220131082, + "learning_rate": 9.89599412013701e-06, + "loss": 0.1233, + "step": 1428 + }, + { + "epoch": 0.6346879857872529, + "grad_norm": 0.7426460359725069, + "learning_rate": 9.89560041707944e-06, + "loss": 0.0861, + "step": 1429 + }, + { + "epoch": 0.6351321341328003, + "grad_norm": 0.5143125228492677, + "learning_rate": 9.895205978129966e-06, + "loss": 0.0446, + "step": 1430 + }, + { + "epoch": 0.6355762824783477, + "grad_norm": 1.049177103786211, + "learning_rate": 9.894810803347878e-06, + "loss": 0.0912, + "step": 1431 + }, + { + "epoch": 0.6360204308238951, + "grad_norm": 0.5972518436510755, + "learning_rate": 9.894414892792579e-06, + "loss": 0.0657, + "step": 1432 + }, + { + "epoch": 0.6364645791694425, + "grad_norm": 0.4154671445638412, + "learning_rate": 9.894018246523577e-06, + "loss": 0.0532, + "step": 1433 + }, + { + "epoch": 0.6369087275149901, + "grad_norm": 0.65372867549855, + "learning_rate": 9.893620864600501e-06, + "loss": 0.087, + "step": 1434 + }, + { + "epoch": 0.6373528758605375, + "grad_norm": 0.7777909981283909, + "learning_rate": 9.89322274708308e-06, + "loss": 0.0795, + "step": 1435 + }, + { + "epoch": 0.6377970242060849, + "grad_norm": 0.6281013351353801, + "learning_rate": 9.892823894031159e-06, + "loss": 0.0525, + "step": 1436 + }, + { + "epoch": 0.6382411725516323, + "grad_norm": 0.5274870510804406, + "learning_rate": 9.89242430550469e-06, + "loss": 0.0701, + "step": 1437 + }, + { + "epoch": 0.6386853208971797, + "grad_norm": 0.9840882810513909, + "learning_rate": 9.892023981563744e-06, + "loss": 0.0888, + "step": 1438 + }, + { + "epoch": 0.6391294692427271, + "grad_norm": 0.7186121306665683, + "learning_rate": 9.89162292226849e-06, + "loss": 0.0666, + "step": 1439 + }, + { + "epoch": 0.6395736175882745, + "grad_norm": 0.7619824705257715, + "learning_rate": 9.891221127679216e-06, + "loss": 0.0654, + "step": 1440 + }, + { + "epoch": 0.6400177659338219, + "grad_norm": 0.6418832068583645, + "learning_rate": 9.89081859785632e-06, + "loss": 0.0706, + "step": 1441 + }, + { + "epoch": 0.6404619142793693, + "grad_norm": 0.6934692957378497, + "learning_rate": 9.890415332860308e-06, + "loss": 0.0684, + "step": 1442 + }, + { + "epoch": 0.6409060626249167, + "grad_norm": 0.6769661850243969, + "learning_rate": 9.8900113327518e-06, + "loss": 0.0684, + "step": 1443 + }, + { + "epoch": 0.6413502109704642, + "grad_norm": 0.6158605594742834, + "learning_rate": 9.88960659759152e-06, + "loss": 0.0652, + "step": 1444 + }, + { + "epoch": 0.6417943593160116, + "grad_norm": 0.5847451238116226, + "learning_rate": 9.88920112744031e-06, + "loss": 0.062, + "step": 1445 + }, + { + "epoch": 0.642238507661559, + "grad_norm": 0.8117410133012406, + "learning_rate": 9.888794922359116e-06, + "loss": 0.0611, + "step": 1446 + }, + { + "epoch": 0.6426826560071064, + "grad_norm": 0.7705515566871968, + "learning_rate": 9.888387982408998e-06, + "loss": 0.0752, + "step": 1447 + }, + { + "epoch": 0.6431268043526538, + "grad_norm": 0.777589892103545, + "learning_rate": 9.887980307651128e-06, + "loss": 0.0764, + "step": 1448 + }, + { + "epoch": 0.6435709526982012, + "grad_norm": 0.8491119564862617, + "learning_rate": 9.887571898146787e-06, + "loss": 0.081, + "step": 1449 + }, + { + "epoch": 0.6440151010437486, + "grad_norm": 0.7918875216411717, + "learning_rate": 9.887162753957362e-06, + "loss": 0.0626, + "step": 1450 + }, + { + "epoch": 0.644459249389296, + "grad_norm": 0.5756994501297381, + "learning_rate": 9.886752875144358e-06, + "loss": 0.0635, + "step": 1451 + }, + { + "epoch": 0.6449033977348434, + "grad_norm": 0.6492149852575843, + "learning_rate": 9.886342261769387e-06, + "loss": 0.0632, + "step": 1452 + }, + { + "epoch": 0.6453475460803908, + "grad_norm": 0.7503617012041023, + "learning_rate": 9.885930913894166e-06, + "loss": 0.0627, + "step": 1453 + }, + { + "epoch": 0.6457916944259383, + "grad_norm": 0.6405303313329822, + "learning_rate": 9.885518831580533e-06, + "loss": 0.0782, + "step": 1454 + }, + { + "epoch": 0.6462358427714857, + "grad_norm": 0.6250298341424497, + "learning_rate": 9.88510601489043e-06, + "loss": 0.0807, + "step": 1455 + }, + { + "epoch": 0.6466799911170331, + "grad_norm": 0.7939161691732389, + "learning_rate": 9.88469246388591e-06, + "loss": 0.069, + "step": 1456 + }, + { + "epoch": 0.6471241394625805, + "grad_norm": 0.7492359887972416, + "learning_rate": 9.884278178629134e-06, + "loss": 0.0854, + "step": 1457 + }, + { + "epoch": 0.6475682878081279, + "grad_norm": 0.6085287559130933, + "learning_rate": 9.883863159182379e-06, + "loss": 0.0608, + "step": 1458 + }, + { + "epoch": 0.6480124361536753, + "grad_norm": 0.6361755513609607, + "learning_rate": 9.883447405608032e-06, + "loss": 0.0433, + "step": 1459 + }, + { + "epoch": 0.6484565844992227, + "grad_norm": 0.6431691982633025, + "learning_rate": 9.88303091796858e-06, + "loss": 0.0419, + "step": 1460 + }, + { + "epoch": 0.6489007328447701, + "grad_norm": 0.8190085662166918, + "learning_rate": 9.882613696326634e-06, + "loss": 0.0767, + "step": 1461 + }, + { + "epoch": 0.6493448811903175, + "grad_norm": 0.755548239297633, + "learning_rate": 9.882195740744911e-06, + "loss": 0.0923, + "step": 1462 + }, + { + "epoch": 0.6497890295358649, + "grad_norm": 0.5719039887354458, + "learning_rate": 9.881777051286232e-06, + "loss": 0.0723, + "step": 1463 + }, + { + "epoch": 0.6502331778814124, + "grad_norm": 0.7825393714474173, + "learning_rate": 9.881357628013535e-06, + "loss": 0.0835, + "step": 1464 + }, + { + "epoch": 0.6506773262269598, + "grad_norm": 0.6610007380508074, + "learning_rate": 9.880937470989868e-06, + "loss": 0.0656, + "step": 1465 + }, + { + "epoch": 0.6511214745725072, + "grad_norm": 0.6218987787310035, + "learning_rate": 9.880516580278386e-06, + "loss": 0.0594, + "step": 1466 + }, + { + "epoch": 0.6515656229180546, + "grad_norm": 0.6871813560151103, + "learning_rate": 9.880094955942357e-06, + "loss": 0.0539, + "step": 1467 + }, + { + "epoch": 0.652009771263602, + "grad_norm": 0.6311698562321532, + "learning_rate": 9.879672598045156e-06, + "loss": 0.0806, + "step": 1468 + }, + { + "epoch": 0.6524539196091494, + "grad_norm": 0.6693355850981739, + "learning_rate": 9.879249506650275e-06, + "loss": 0.0856, + "step": 1469 + }, + { + "epoch": 0.6528980679546968, + "grad_norm": 0.7127562713682332, + "learning_rate": 9.878825681821306e-06, + "loss": 0.0685, + "step": 1470 + }, + { + "epoch": 0.6533422163002442, + "grad_norm": 0.9425811868582009, + "learning_rate": 9.878401123621963e-06, + "loss": 0.0823, + "step": 1471 + }, + { + "epoch": 0.6537863646457917, + "grad_norm": 0.6086374723526442, + "learning_rate": 9.87797583211606e-06, + "loss": 0.0614, + "step": 1472 + }, + { + "epoch": 0.654230512991339, + "grad_norm": 0.7763257358459474, + "learning_rate": 9.877549807367528e-06, + "loss": 0.0857, + "step": 1473 + }, + { + "epoch": 0.6546746613368866, + "grad_norm": 0.539995990500211, + "learning_rate": 9.877123049440405e-06, + "loss": 0.0531, + "step": 1474 + }, + { + "epoch": 0.655118809682434, + "grad_norm": 0.5915202116072419, + "learning_rate": 9.876695558398838e-06, + "loss": 0.0663, + "step": 1475 + }, + { + "epoch": 0.6555629580279814, + "grad_norm": 0.6814775856965638, + "learning_rate": 9.876267334307091e-06, + "loss": 0.0536, + "step": 1476 + }, + { + "epoch": 0.6560071063735288, + "grad_norm": 0.6565534779979346, + "learning_rate": 9.875838377229528e-06, + "loss": 0.0854, + "step": 1477 + }, + { + "epoch": 0.6564512547190762, + "grad_norm": 0.6074291178048928, + "learning_rate": 9.875408687230633e-06, + "loss": 0.0643, + "step": 1478 + }, + { + "epoch": 0.6568954030646236, + "grad_norm": 0.6683735797478915, + "learning_rate": 9.874978264374991e-06, + "loss": 0.0657, + "step": 1479 + }, + { + "epoch": 0.657339551410171, + "grad_norm": 0.6802029779866509, + "learning_rate": 9.874547108727306e-06, + "loss": 0.0571, + "step": 1480 + }, + { + "epoch": 0.6577836997557184, + "grad_norm": 0.7709030410876545, + "learning_rate": 9.874115220352386e-06, + "loss": 0.0515, + "step": 1481 + }, + { + "epoch": 0.6582278481012658, + "grad_norm": 0.6029748648689206, + "learning_rate": 9.873682599315152e-06, + "loss": 0.054, + "step": 1482 + }, + { + "epoch": 0.6586719964468133, + "grad_norm": 0.7218151069926805, + "learning_rate": 9.873249245680634e-06, + "loss": 0.0842, + "step": 1483 + }, + { + "epoch": 0.6591161447923607, + "grad_norm": 0.7118725638175186, + "learning_rate": 9.872815159513972e-06, + "loss": 0.0783, + "step": 1484 + }, + { + "epoch": 0.6595602931379081, + "grad_norm": 0.5278035850844903, + "learning_rate": 9.872380340880416e-06, + "loss": 0.0504, + "step": 1485 + }, + { + "epoch": 0.6600044414834555, + "grad_norm": 0.7090158968533085, + "learning_rate": 9.87194478984533e-06, + "loss": 0.0518, + "step": 1486 + }, + { + "epoch": 0.6604485898290029, + "grad_norm": 0.7140940579751807, + "learning_rate": 9.87150850647418e-06, + "loss": 0.0644, + "step": 1487 + }, + { + "epoch": 0.6608927381745503, + "grad_norm": 0.5233702996222387, + "learning_rate": 9.87107149083255e-06, + "loss": 0.0484, + "step": 1488 + }, + { + "epoch": 0.6613368865200977, + "grad_norm": 0.5255868844598361, + "learning_rate": 9.870633742986129e-06, + "loss": 0.0486, + "step": 1489 + }, + { + "epoch": 0.6617810348656451, + "grad_norm": 0.7456554727487221, + "learning_rate": 9.870195263000719e-06, + "loss": 0.0926, + "step": 1490 + }, + { + "epoch": 0.6622251832111925, + "grad_norm": 0.5094239000908518, + "learning_rate": 9.869756050942231e-06, + "loss": 0.0609, + "step": 1491 + }, + { + "epoch": 0.6626693315567399, + "grad_norm": 0.5934076694430587, + "learning_rate": 9.869316106876687e-06, + "loss": 0.0815, + "step": 1492 + }, + { + "epoch": 0.6631134799022874, + "grad_norm": 0.5800872547088418, + "learning_rate": 9.868875430870217e-06, + "loss": 0.0562, + "step": 1493 + }, + { + "epoch": 0.6635576282478348, + "grad_norm": 0.6466806398918721, + "learning_rate": 9.86843402298906e-06, + "loss": 0.0576, + "step": 1494 + }, + { + "epoch": 0.6640017765933822, + "grad_norm": 0.49149929770034256, + "learning_rate": 9.86799188329957e-06, + "loss": 0.0552, + "step": 1495 + }, + { + "epoch": 0.6644459249389296, + "grad_norm": 0.6869307446643815, + "learning_rate": 9.867549011868208e-06, + "loss": 0.0689, + "step": 1496 + }, + { + "epoch": 0.664890073284477, + "grad_norm": 1.0182179632443422, + "learning_rate": 9.867105408761544e-06, + "loss": 0.0658, + "step": 1497 + }, + { + "epoch": 0.6653342216300244, + "grad_norm": 0.5094314413955369, + "learning_rate": 9.866661074046258e-06, + "loss": 0.0509, + "step": 1498 + }, + { + "epoch": 0.6657783699755718, + "grad_norm": 0.6982818166570782, + "learning_rate": 9.866216007789145e-06, + "loss": 0.0728, + "step": 1499 + }, + { + "epoch": 0.6662225183211192, + "grad_norm": 0.7524936010719828, + "learning_rate": 9.8657702100571e-06, + "loss": 0.0911, + "step": 1500 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.7557138142113028, + "learning_rate": 9.86532368091714e-06, + "loss": 0.059, + "step": 1501 + }, + { + "epoch": 0.667110815012214, + "grad_norm": 0.6888932895497213, + "learning_rate": 9.864876420436383e-06, + "loss": 0.0614, + "step": 1502 + }, + { + "epoch": 0.6675549633577615, + "grad_norm": 0.5506269374818886, + "learning_rate": 9.86442842868206e-06, + "loss": 0.0599, + "step": 1503 + }, + { + "epoch": 0.667999111703309, + "grad_norm": 0.8018485119174299, + "learning_rate": 9.86397970572151e-06, + "loss": 0.0875, + "step": 1504 + }, + { + "epoch": 0.6684432600488563, + "grad_norm": 0.5631462958993229, + "learning_rate": 9.863530251622189e-06, + "loss": 0.0691, + "step": 1505 + }, + { + "epoch": 0.6688874083944037, + "grad_norm": 0.8176186875229494, + "learning_rate": 9.863080066451653e-06, + "loss": 0.072, + "step": 1506 + }, + { + "epoch": 0.6693315567399512, + "grad_norm": 0.7128371312290194, + "learning_rate": 9.862629150277574e-06, + "loss": 0.0773, + "step": 1507 + }, + { + "epoch": 0.6697757050854986, + "grad_norm": 0.759379781839776, + "learning_rate": 9.86217750316773e-06, + "loss": 0.0729, + "step": 1508 + }, + { + "epoch": 0.670219853431046, + "grad_norm": 0.6634457926439065, + "learning_rate": 9.861725125190017e-06, + "loss": 0.0595, + "step": 1509 + }, + { + "epoch": 0.6706640017765934, + "grad_norm": 0.7426433953184679, + "learning_rate": 9.861272016412429e-06, + "loss": 0.065, + "step": 1510 + }, + { + "epoch": 0.6711081501221408, + "grad_norm": 0.6600032637907064, + "learning_rate": 9.86081817690308e-06, + "loss": 0.0625, + "step": 1511 + }, + { + "epoch": 0.6715522984676882, + "grad_norm": 0.703748253196849, + "learning_rate": 9.860363606730185e-06, + "loss": 0.073, + "step": 1512 + }, + { + "epoch": 0.6719964468132357, + "grad_norm": 0.8768454660219879, + "learning_rate": 9.85990830596208e-06, + "loss": 0.093, + "step": 1513 + }, + { + "epoch": 0.6724405951587831, + "grad_norm": 0.5810307286903331, + "learning_rate": 9.859452274667199e-06, + "loss": 0.0664, + "step": 1514 + }, + { + "epoch": 0.6728847435043305, + "grad_norm": 0.6467795840989637, + "learning_rate": 9.858995512914096e-06, + "loss": 0.0736, + "step": 1515 + }, + { + "epoch": 0.6733288918498779, + "grad_norm": 0.8604239627870944, + "learning_rate": 9.858538020771424e-06, + "loss": 0.0819, + "step": 1516 + }, + { + "epoch": 0.6737730401954253, + "grad_norm": 0.5072234228135453, + "learning_rate": 9.858079798307959e-06, + "loss": 0.0764, + "step": 1517 + }, + { + "epoch": 0.6742171885409727, + "grad_norm": 0.8843203312811015, + "learning_rate": 9.857620845592573e-06, + "loss": 0.0941, + "step": 1518 + }, + { + "epoch": 0.6746613368865201, + "grad_norm": 0.7766118997696602, + "learning_rate": 9.85716116269426e-06, + "loss": 0.0697, + "step": 1519 + }, + { + "epoch": 0.6751054852320675, + "grad_norm": 0.5371610065691803, + "learning_rate": 9.856700749682114e-06, + "loss": 0.075, + "step": 1520 + }, + { + "epoch": 0.6755496335776149, + "grad_norm": 0.5048461380926635, + "learning_rate": 9.856239606625345e-06, + "loss": 0.0584, + "step": 1521 + }, + { + "epoch": 0.6759937819231623, + "grad_norm": 0.819913205588501, + "learning_rate": 9.855777733593269e-06, + "loss": 0.0748, + "step": 1522 + }, + { + "epoch": 0.6764379302687098, + "grad_norm": 0.6492894969300076, + "learning_rate": 9.855315130655315e-06, + "loss": 0.0573, + "step": 1523 + }, + { + "epoch": 0.6768820786142572, + "grad_norm": 1.8883752330861565, + "learning_rate": 9.854851797881018e-06, + "loss": 0.0586, + "step": 1524 + }, + { + "epoch": 0.6773262269598046, + "grad_norm": 0.6674563588470771, + "learning_rate": 9.854387735340028e-06, + "loss": 0.0618, + "step": 1525 + }, + { + "epoch": 0.677770375305352, + "grad_norm": 0.5745105185416173, + "learning_rate": 9.853922943102099e-06, + "loss": 0.0486, + "step": 1526 + }, + { + "epoch": 0.6782145236508994, + "grad_norm": 0.8823764363833241, + "learning_rate": 9.853457421237098e-06, + "loss": 0.0655, + "step": 1527 + }, + { + "epoch": 0.6786586719964468, + "grad_norm": 0.7502418552024257, + "learning_rate": 9.852991169815002e-06, + "loss": 0.0697, + "step": 1528 + }, + { + "epoch": 0.6791028203419942, + "grad_norm": 0.9371750933741902, + "learning_rate": 9.852524188905894e-06, + "loss": 0.0637, + "step": 1529 + }, + { + "epoch": 0.6795469686875416, + "grad_norm": 0.4760980667666431, + "learning_rate": 9.85205647857997e-06, + "loss": 0.0507, + "step": 1530 + }, + { + "epoch": 0.679991117033089, + "grad_norm": 0.5957775022770642, + "learning_rate": 9.851588038907536e-06, + "loss": 0.0725, + "step": 1531 + }, + { + "epoch": 0.6804352653786364, + "grad_norm": 0.8583868263839582, + "learning_rate": 9.851118869959006e-06, + "loss": 0.0926, + "step": 1532 + }, + { + "epoch": 0.6808794137241839, + "grad_norm": 0.5065773305637178, + "learning_rate": 9.850648971804903e-06, + "loss": 0.0519, + "step": 1533 + }, + { + "epoch": 0.6813235620697313, + "grad_norm": 0.6463242249375162, + "learning_rate": 9.850178344515861e-06, + "loss": 0.0495, + "step": 1534 + }, + { + "epoch": 0.6817677104152787, + "grad_norm": 0.713200223511551, + "learning_rate": 9.849706988162626e-06, + "loss": 0.0667, + "step": 1535 + }, + { + "epoch": 0.6822118587608261, + "grad_norm": 0.8106886906091921, + "learning_rate": 9.849234902816047e-06, + "loss": 0.078, + "step": 1536 + }, + { + "epoch": 0.6826560071063735, + "grad_norm": 0.618211559505216, + "learning_rate": 9.848762088547089e-06, + "loss": 0.0499, + "step": 1537 + }, + { + "epoch": 0.6831001554519209, + "grad_norm": 0.4683137101850568, + "learning_rate": 9.848288545426821e-06, + "loss": 0.0485, + "step": 1538 + }, + { + "epoch": 0.6835443037974683, + "grad_norm": 0.6498884933818719, + "learning_rate": 9.847814273526428e-06, + "loss": 0.0717, + "step": 1539 + }, + { + "epoch": 0.6839884521430157, + "grad_norm": 0.6112948857459086, + "learning_rate": 9.8473392729172e-06, + "loss": 0.0556, + "step": 1540 + }, + { + "epoch": 0.6844326004885631, + "grad_norm": 0.6350627430979878, + "learning_rate": 9.846863543670536e-06, + "loss": 0.067, + "step": 1541 + }, + { + "epoch": 0.6848767488341105, + "grad_norm": 1.1432261374175676, + "learning_rate": 9.846387085857949e-06, + "loss": 0.1195, + "step": 1542 + }, + { + "epoch": 0.685320897179658, + "grad_norm": 0.5445218211710267, + "learning_rate": 9.845909899551056e-06, + "loss": 0.0633, + "step": 1543 + }, + { + "epoch": 0.6857650455252055, + "grad_norm": 0.7666433728201425, + "learning_rate": 9.845431984821588e-06, + "loss": 0.0652, + "step": 1544 + }, + { + "epoch": 0.6862091938707529, + "grad_norm": 0.6445193247682808, + "learning_rate": 9.844953341741383e-06, + "loss": 0.044, + "step": 1545 + }, + { + "epoch": 0.6866533422163003, + "grad_norm": 0.6165834169963341, + "learning_rate": 9.844473970382391e-06, + "loss": 0.07, + "step": 1546 + }, + { + "epoch": 0.6870974905618477, + "grad_norm": 0.785019896367438, + "learning_rate": 9.843993870816665e-06, + "loss": 0.0692, + "step": 1547 + }, + { + "epoch": 0.6875416389073951, + "grad_norm": 0.8097989024467772, + "learning_rate": 9.843513043116378e-06, + "loss": 0.0714, + "step": 1548 + }, + { + "epoch": 0.6879857872529425, + "grad_norm": 0.7549105996829288, + "learning_rate": 9.843031487353803e-06, + "loss": 0.0642, + "step": 1549 + }, + { + "epoch": 0.6884299355984899, + "grad_norm": 0.4886439517505401, + "learning_rate": 9.842549203601327e-06, + "loss": 0.0562, + "step": 1550 + }, + { + "epoch": 0.6888740839440373, + "grad_norm": 0.7561494998266006, + "learning_rate": 9.842066191931442e-06, + "loss": 0.0643, + "step": 1551 + }, + { + "epoch": 0.6893182322895848, + "grad_norm": 0.854720220061704, + "learning_rate": 9.84158245241676e-06, + "loss": 0.0522, + "step": 1552 + }, + { + "epoch": 0.6897623806351322, + "grad_norm": 0.5346332465205076, + "learning_rate": 9.84109798512999e-06, + "loss": 0.0518, + "step": 1553 + }, + { + "epoch": 0.6902065289806796, + "grad_norm": 0.508165162429757, + "learning_rate": 9.840612790143958e-06, + "loss": 0.0538, + "step": 1554 + }, + { + "epoch": 0.690650677326227, + "grad_norm": 0.4808558159762227, + "learning_rate": 9.840126867531594e-06, + "loss": 0.0604, + "step": 1555 + }, + { + "epoch": 0.6910948256717744, + "grad_norm": 0.6036124527734282, + "learning_rate": 9.839640217365941e-06, + "loss": 0.0636, + "step": 1556 + }, + { + "epoch": 0.6915389740173218, + "grad_norm": 0.5365369852245644, + "learning_rate": 9.839152839720157e-06, + "loss": 0.0571, + "step": 1557 + }, + { + "epoch": 0.6919831223628692, + "grad_norm": 0.9675442829718458, + "learning_rate": 9.838664734667496e-06, + "loss": 0.0768, + "step": 1558 + }, + { + "epoch": 0.6924272707084166, + "grad_norm": 0.6682601437620973, + "learning_rate": 9.83817590228133e-06, + "loss": 0.0682, + "step": 1559 + }, + { + "epoch": 0.692871419053964, + "grad_norm": 0.915552570084289, + "learning_rate": 9.83768634263514e-06, + "loss": 0.0689, + "step": 1560 + }, + { + "epoch": 0.6933155673995114, + "grad_norm": 0.7294790821869619, + "learning_rate": 9.837196055802514e-06, + "loss": 0.0595, + "step": 1561 + }, + { + "epoch": 0.6937597157450589, + "grad_norm": 0.6877843622659181, + "learning_rate": 9.836705041857153e-06, + "loss": 0.0894, + "step": 1562 + }, + { + "epoch": 0.6942038640906063, + "grad_norm": 0.5609065701691318, + "learning_rate": 9.836213300872862e-06, + "loss": 0.0523, + "step": 1563 + }, + { + "epoch": 0.6946480124361537, + "grad_norm": 0.6750341373878816, + "learning_rate": 9.83572083292356e-06, + "loss": 0.0723, + "step": 1564 + }, + { + "epoch": 0.6950921607817011, + "grad_norm": 0.8179982672102986, + "learning_rate": 9.835227638083271e-06, + "loss": 0.0741, + "step": 1565 + }, + { + "epoch": 0.6955363091272485, + "grad_norm": 0.514148745778486, + "learning_rate": 9.834733716426133e-06, + "loss": 0.048, + "step": 1566 + }, + { + "epoch": 0.6959804574727959, + "grad_norm": 0.5288948126890644, + "learning_rate": 9.834239068026388e-06, + "loss": 0.0665, + "step": 1567 + }, + { + "epoch": 0.6964246058183433, + "grad_norm": 0.5107938326869891, + "learning_rate": 9.833743692958392e-06, + "loss": 0.0482, + "step": 1568 + }, + { + "epoch": 0.6968687541638907, + "grad_norm": 1.053299477751095, + "learning_rate": 9.83324759129661e-06, + "loss": 0.0726, + "step": 1569 + }, + { + "epoch": 0.6973129025094381, + "grad_norm": 0.6266804348595715, + "learning_rate": 9.832750763115611e-06, + "loss": 0.0738, + "step": 1570 + }, + { + "epoch": 0.6977570508549855, + "grad_norm": 0.7607794431765756, + "learning_rate": 9.83225320849008e-06, + "loss": 0.0739, + "step": 1571 + }, + { + "epoch": 0.698201199200533, + "grad_norm": 0.7778393457616919, + "learning_rate": 9.831754927494803e-06, + "loss": 0.0948, + "step": 1572 + }, + { + "epoch": 0.6986453475460804, + "grad_norm": 0.7195786654461274, + "learning_rate": 9.831255920204685e-06, + "loss": 0.0646, + "step": 1573 + }, + { + "epoch": 0.6990894958916278, + "grad_norm": 0.6366879939595091, + "learning_rate": 9.830756186694734e-06, + "loss": 0.0808, + "step": 1574 + }, + { + "epoch": 0.6995336442371752, + "grad_norm": 0.478436966515515, + "learning_rate": 9.830255727040066e-06, + "loss": 0.059, + "step": 1575 + }, + { + "epoch": 0.6999777925827226, + "grad_norm": 0.5378537626686002, + "learning_rate": 9.829754541315912e-06, + "loss": 0.0624, + "step": 1576 + }, + { + "epoch": 0.70042194092827, + "grad_norm": 0.5458661735869976, + "learning_rate": 9.829252629597607e-06, + "loss": 0.056, + "step": 1577 + }, + { + "epoch": 0.7008660892738174, + "grad_norm": 0.7598746622748797, + "learning_rate": 9.828749991960598e-06, + "loss": 0.0742, + "step": 1578 + }, + { + "epoch": 0.7013102376193648, + "grad_norm": 0.7874301906671837, + "learning_rate": 9.828246628480438e-06, + "loss": 0.1126, + "step": 1579 + }, + { + "epoch": 0.7017543859649122, + "grad_norm": 0.5696760012747649, + "learning_rate": 9.827742539232791e-06, + "loss": 0.0622, + "step": 1580 + }, + { + "epoch": 0.7021985343104596, + "grad_norm": 0.49299605250258527, + "learning_rate": 9.827237724293434e-06, + "loss": 0.0542, + "step": 1581 + }, + { + "epoch": 0.7026426826560072, + "grad_norm": 0.7950299040696133, + "learning_rate": 9.826732183738246e-06, + "loss": 0.0736, + "step": 1582 + }, + { + "epoch": 0.7030868310015546, + "grad_norm": 0.6701858553269311, + "learning_rate": 9.826225917643217e-06, + "loss": 0.0769, + "step": 1583 + }, + { + "epoch": 0.703530979347102, + "grad_norm": 0.7198204203508045, + "learning_rate": 9.825718926084451e-06, + "loss": 0.0607, + "step": 1584 + }, + { + "epoch": 0.7039751276926494, + "grad_norm": 0.7221114160407107, + "learning_rate": 9.825211209138154e-06, + "loss": 0.0662, + "step": 1585 + }, + { + "epoch": 0.7044192760381968, + "grad_norm": 0.5718930945068039, + "learning_rate": 9.82470276688065e-06, + "loss": 0.063, + "step": 1586 + }, + { + "epoch": 0.7048634243837442, + "grad_norm": 0.7156464008159005, + "learning_rate": 9.824193599388358e-06, + "loss": 0.061, + "step": 1587 + }, + { + "epoch": 0.7053075727292916, + "grad_norm": 0.4730100411783297, + "learning_rate": 9.823683706737824e-06, + "loss": 0.0538, + "step": 1588 + }, + { + "epoch": 0.705751721074839, + "grad_norm": 0.8151288747975431, + "learning_rate": 9.823173089005686e-06, + "loss": 0.0586, + "step": 1589 + }, + { + "epoch": 0.7061958694203864, + "grad_norm": 0.6500001880192491, + "learning_rate": 9.822661746268702e-06, + "loss": 0.0558, + "step": 1590 + }, + { + "epoch": 0.7066400177659338, + "grad_norm": 0.47387928526278555, + "learning_rate": 9.822149678603733e-06, + "loss": 0.0537, + "step": 1591 + }, + { + "epoch": 0.7070841661114813, + "grad_norm": 0.5533123786666888, + "learning_rate": 9.821636886087755e-06, + "loss": 0.049, + "step": 1592 + }, + { + "epoch": 0.7075283144570287, + "grad_norm": 0.5418212216604064, + "learning_rate": 9.82112336879785e-06, + "loss": 0.0443, + "step": 1593 + }, + { + "epoch": 0.7079724628025761, + "grad_norm": 0.595384134286072, + "learning_rate": 9.820609126811202e-06, + "loss": 0.0701, + "step": 1594 + }, + { + "epoch": 0.7084166111481235, + "grad_norm": 0.5686391716299167, + "learning_rate": 9.820094160205118e-06, + "loss": 0.0612, + "step": 1595 + }, + { + "epoch": 0.7088607594936709, + "grad_norm": 0.6501450869897375, + "learning_rate": 9.819578469057e-06, + "loss": 0.0571, + "step": 1596 + }, + { + "epoch": 0.7093049078392183, + "grad_norm": 0.8251798873678969, + "learning_rate": 9.819062053444369e-06, + "loss": 0.0903, + "step": 1597 + }, + { + "epoch": 0.7097490561847657, + "grad_norm": 0.659869482663948, + "learning_rate": 9.81854491344485e-06, + "loss": 0.077, + "step": 1598 + }, + { + "epoch": 0.7101932045303131, + "grad_norm": 0.5233735880329885, + "learning_rate": 9.818027049136177e-06, + "loss": 0.0756, + "step": 1599 + }, + { + "epoch": 0.7106373528758605, + "grad_norm": 0.5435283663423869, + "learning_rate": 9.817508460596195e-06, + "loss": 0.0517, + "step": 1600 + }, + { + "epoch": 0.7110815012214079, + "grad_norm": 0.8462175785185146, + "learning_rate": 9.816989147902855e-06, + "loss": 0.0776, + "step": 1601 + }, + { + "epoch": 0.7115256495669554, + "grad_norm": 0.5250146868792045, + "learning_rate": 9.816469111134221e-06, + "loss": 0.0636, + "step": 1602 + }, + { + "epoch": 0.7119697979125028, + "grad_norm": 0.4561566850818785, + "learning_rate": 9.81594835036846e-06, + "loss": 0.0549, + "step": 1603 + }, + { + "epoch": 0.7124139462580502, + "grad_norm": 0.639031123731633, + "learning_rate": 9.815426865683858e-06, + "loss": 0.0739, + "step": 1604 + }, + { + "epoch": 0.7128580946035976, + "grad_norm": 0.4546814856440393, + "learning_rate": 9.814904657158793e-06, + "loss": 0.0536, + "step": 1605 + }, + { + "epoch": 0.713302242949145, + "grad_norm": 0.49725993065397267, + "learning_rate": 9.81438172487177e-06, + "loss": 0.0538, + "step": 1606 + }, + { + "epoch": 0.7137463912946924, + "grad_norm": 0.7681224223760409, + "learning_rate": 9.813858068901391e-06, + "loss": 0.0738, + "step": 1607 + }, + { + "epoch": 0.7141905396402398, + "grad_norm": 0.5475444111433433, + "learning_rate": 9.813333689326371e-06, + "loss": 0.0532, + "step": 1608 + }, + { + "epoch": 0.7146346879857872, + "grad_norm": 0.5001622305576355, + "learning_rate": 9.812808586225533e-06, + "loss": 0.0504, + "step": 1609 + }, + { + "epoch": 0.7150788363313346, + "grad_norm": 0.5966203326606518, + "learning_rate": 9.812282759677811e-06, + "loss": 0.0632, + "step": 1610 + }, + { + "epoch": 0.715522984676882, + "grad_norm": 0.5024269719688089, + "learning_rate": 9.811756209762242e-06, + "loss": 0.054, + "step": 1611 + }, + { + "epoch": 0.7159671330224295, + "grad_norm": 0.5953895951996429, + "learning_rate": 9.811228936557977e-06, + "loss": 0.0687, + "step": 1612 + }, + { + "epoch": 0.7164112813679769, + "grad_norm": 0.9253768611459666, + "learning_rate": 9.810700940144275e-06, + "loss": 0.0936, + "step": 1613 + }, + { + "epoch": 0.7168554297135243, + "grad_norm": 0.40706001374840456, + "learning_rate": 9.810172220600503e-06, + "loss": 0.0501, + "step": 1614 + }, + { + "epoch": 0.7172995780590717, + "grad_norm": 0.6702001680907567, + "learning_rate": 9.809642778006135e-06, + "loss": 0.0721, + "step": 1615 + }, + { + "epoch": 0.7177437264046191, + "grad_norm": 0.5621892036366335, + "learning_rate": 9.809112612440757e-06, + "loss": 0.0624, + "step": 1616 + }, + { + "epoch": 0.7181878747501665, + "grad_norm": 0.6550556626236272, + "learning_rate": 9.808581723984059e-06, + "loss": 0.064, + "step": 1617 + }, + { + "epoch": 0.718632023095714, + "grad_norm": 1.171906822226206, + "learning_rate": 9.808050112715845e-06, + "loss": 0.1172, + "step": 1618 + }, + { + "epoch": 0.7190761714412613, + "grad_norm": 0.4455710256139167, + "learning_rate": 9.807517778716025e-06, + "loss": 0.045, + "step": 1619 + }, + { + "epoch": 0.7195203197868087, + "grad_norm": 0.6254516596662227, + "learning_rate": 9.806984722064616e-06, + "loss": 0.0705, + "step": 1620 + }, + { + "epoch": 0.7199644681323563, + "grad_norm": 0.5571332804687471, + "learning_rate": 9.806450942841747e-06, + "loss": 0.0502, + "step": 1621 + }, + { + "epoch": 0.7204086164779037, + "grad_norm": 1.0191625308451748, + "learning_rate": 9.805916441127657e-06, + "loss": 0.0589, + "step": 1622 + }, + { + "epoch": 0.7208527648234511, + "grad_norm": 0.7859502245676739, + "learning_rate": 9.805381217002684e-06, + "loss": 0.0431, + "step": 1623 + }, + { + "epoch": 0.7212969131689985, + "grad_norm": 0.8065166441211357, + "learning_rate": 9.804845270547288e-06, + "loss": 0.0728, + "step": 1624 + }, + { + "epoch": 0.7217410615145459, + "grad_norm": 0.7073921214139905, + "learning_rate": 9.804308601842026e-06, + "loss": 0.0753, + "step": 1625 + }, + { + "epoch": 0.7221852098600933, + "grad_norm": 0.7787625206064858, + "learning_rate": 9.80377121096757e-06, + "loss": 0.0854, + "step": 1626 + }, + { + "epoch": 0.7226293582056407, + "grad_norm": 1.148628838473968, + "learning_rate": 9.8032330980047e-06, + "loss": 0.092, + "step": 1627 + }, + { + "epoch": 0.7230735065511881, + "grad_norm": 0.5694973855182914, + "learning_rate": 9.802694263034302e-06, + "loss": 0.0661, + "step": 1628 + }, + { + "epoch": 0.7235176548967355, + "grad_norm": 0.6928354319078996, + "learning_rate": 9.802154706137372e-06, + "loss": 0.0588, + "step": 1629 + }, + { + "epoch": 0.7239618032422829, + "grad_norm": 0.5747608874251084, + "learning_rate": 9.801614427395018e-06, + "loss": 0.073, + "step": 1630 + }, + { + "epoch": 0.7244059515878304, + "grad_norm": 0.8245000270563847, + "learning_rate": 9.801073426888447e-06, + "loss": 0.0602, + "step": 1631 + }, + { + "epoch": 0.7248500999333778, + "grad_norm": 0.6063953480963461, + "learning_rate": 9.800531704698986e-06, + "loss": 0.0658, + "step": 1632 + }, + { + "epoch": 0.7252942482789252, + "grad_norm": 0.5240737295642546, + "learning_rate": 9.799989260908063e-06, + "loss": 0.0564, + "step": 1633 + }, + { + "epoch": 0.7257383966244726, + "grad_norm": 0.7016224236011144, + "learning_rate": 9.799446095597216e-06, + "loss": 0.0646, + "step": 1634 + }, + { + "epoch": 0.72618254497002, + "grad_norm": 0.6626808866197511, + "learning_rate": 9.798902208848093e-06, + "loss": 0.0818, + "step": 1635 + }, + { + "epoch": 0.7266266933155674, + "grad_norm": 0.6141836548174839, + "learning_rate": 9.79835760074245e-06, + "loss": 0.0508, + "step": 1636 + }, + { + "epoch": 0.7270708416611148, + "grad_norm": 0.6868383771343689, + "learning_rate": 9.797812271362149e-06, + "loss": 0.0552, + "step": 1637 + }, + { + "epoch": 0.7275149900066622, + "grad_norm": 0.4532209654763099, + "learning_rate": 9.79726622078916e-06, + "loss": 0.0451, + "step": 1638 + }, + { + "epoch": 0.7279591383522096, + "grad_norm": 0.6808218136734143, + "learning_rate": 9.79671944910557e-06, + "loss": 0.0571, + "step": 1639 + }, + { + "epoch": 0.728403286697757, + "grad_norm": 0.7510893535708428, + "learning_rate": 9.796171956393566e-06, + "loss": 0.0669, + "step": 1640 + }, + { + "epoch": 0.7288474350433045, + "grad_norm": 0.41663397739980335, + "learning_rate": 9.79562374273544e-06, + "loss": 0.0441, + "step": 1641 + }, + { + "epoch": 0.7292915833888519, + "grad_norm": 0.5592036648884667, + "learning_rate": 9.795074808213604e-06, + "loss": 0.0562, + "step": 1642 + }, + { + "epoch": 0.7297357317343993, + "grad_norm": 0.495293158677455, + "learning_rate": 9.794525152910573e-06, + "loss": 0.0621, + "step": 1643 + }, + { + "epoch": 0.7301798800799467, + "grad_norm": 0.5602886889611556, + "learning_rate": 9.793974776908963e-06, + "loss": 0.0531, + "step": 1644 + }, + { + "epoch": 0.7306240284254941, + "grad_norm": 0.5944252332564274, + "learning_rate": 9.79342368029151e-06, + "loss": 0.0563, + "step": 1645 + }, + { + "epoch": 0.7310681767710415, + "grad_norm": 0.482240455293522, + "learning_rate": 9.792871863141052e-06, + "loss": 0.0576, + "step": 1646 + }, + { + "epoch": 0.7315123251165889, + "grad_norm": 0.5733741929298166, + "learning_rate": 9.792319325540537e-06, + "loss": 0.0684, + "step": 1647 + }, + { + "epoch": 0.7319564734621363, + "grad_norm": 0.4985634447799434, + "learning_rate": 9.79176606757302e-06, + "loss": 0.0597, + "step": 1648 + }, + { + "epoch": 0.7324006218076837, + "grad_norm": 0.685246073677029, + "learning_rate": 9.791212089321662e-06, + "loss": 0.0721, + "step": 1649 + }, + { + "epoch": 0.7328447701532311, + "grad_norm": 0.7566932790234222, + "learning_rate": 9.790657390869742e-06, + "loss": 0.0665, + "step": 1650 + }, + { + "epoch": 0.7332889184987786, + "grad_norm": 0.47289806502397586, + "learning_rate": 9.790101972300635e-06, + "loss": 0.0536, + "step": 1651 + }, + { + "epoch": 0.733733066844326, + "grad_norm": 0.9971658246699995, + "learning_rate": 9.789545833697833e-06, + "loss": 0.0959, + "step": 1652 + }, + { + "epoch": 0.7341772151898734, + "grad_norm": 0.588355861783506, + "learning_rate": 9.788988975144933e-06, + "loss": 0.0556, + "step": 1653 + }, + { + "epoch": 0.7346213635354208, + "grad_norm": 0.5671908470000354, + "learning_rate": 9.788431396725637e-06, + "loss": 0.0669, + "step": 1654 + }, + { + "epoch": 0.7350655118809682, + "grad_norm": 0.758993148904425, + "learning_rate": 9.787873098523763e-06, + "loss": 0.0652, + "step": 1655 + }, + { + "epoch": 0.7355096602265156, + "grad_norm": 0.8048604855196553, + "learning_rate": 9.787314080623229e-06, + "loss": 0.0698, + "step": 1656 + }, + { + "epoch": 0.735953808572063, + "grad_norm": 0.8623667194773064, + "learning_rate": 9.786754343108066e-06, + "loss": 0.0779, + "step": 1657 + }, + { + "epoch": 0.7363979569176105, + "grad_norm": 0.61076619104796, + "learning_rate": 9.786193886062415e-06, + "loss": 0.0769, + "step": 1658 + }, + { + "epoch": 0.7368421052631579, + "grad_norm": 0.36227469684290603, + "learning_rate": 9.785632709570519e-06, + "loss": 0.0414, + "step": 1659 + }, + { + "epoch": 0.7372862536087053, + "grad_norm": 4.048112931451507, + "learning_rate": 9.785070813716733e-06, + "loss": 0.0454, + "step": 1660 + }, + { + "epoch": 0.7377304019542528, + "grad_norm": 0.6035767336706519, + "learning_rate": 9.784508198585519e-06, + "loss": 0.0633, + "step": 1661 + }, + { + "epoch": 0.7381745502998002, + "grad_norm": 0.469714457030227, + "learning_rate": 9.783944864261448e-06, + "loss": 0.0472, + "step": 1662 + }, + { + "epoch": 0.7386186986453476, + "grad_norm": 0.8129946899965498, + "learning_rate": 9.783380810829198e-06, + "loss": 0.0613, + "step": 1663 + }, + { + "epoch": 0.739062846990895, + "grad_norm": 0.6753504015110883, + "learning_rate": 9.782816038373556e-06, + "loss": 0.0902, + "step": 1664 + }, + { + "epoch": 0.7395069953364424, + "grad_norm": 0.6384325521859789, + "learning_rate": 9.782250546979421e-06, + "loss": 0.074, + "step": 1665 + }, + { + "epoch": 0.7399511436819898, + "grad_norm": 0.6956365636896437, + "learning_rate": 9.781684336731791e-06, + "loss": 0.0567, + "step": 1666 + }, + { + "epoch": 0.7403952920275372, + "grad_norm": 0.8256720537662366, + "learning_rate": 9.781117407715779e-06, + "loss": 0.0651, + "step": 1667 + }, + { + "epoch": 0.7408394403730846, + "grad_norm": 0.5355855493602046, + "learning_rate": 9.780549760016602e-06, + "loss": 0.0619, + "step": 1668 + }, + { + "epoch": 0.741283588718632, + "grad_norm": 0.5253416844707828, + "learning_rate": 9.77998139371959e-06, + "loss": 0.0592, + "step": 1669 + }, + { + "epoch": 0.7417277370641794, + "grad_norm": 0.6458381116052228, + "learning_rate": 9.779412308910176e-06, + "loss": 0.0631, + "step": 1670 + }, + { + "epoch": 0.7421718854097269, + "grad_norm": 0.6458810882415751, + "learning_rate": 9.778842505673906e-06, + "loss": 0.0721, + "step": 1671 + }, + { + "epoch": 0.7426160337552743, + "grad_norm": 0.578667702719237, + "learning_rate": 9.778271984096427e-06, + "loss": 0.0622, + "step": 1672 + }, + { + "epoch": 0.7430601821008217, + "grad_norm": 0.7294244004095107, + "learning_rate": 9.777700744263502e-06, + "loss": 0.0763, + "step": 1673 + }, + { + "epoch": 0.7435043304463691, + "grad_norm": 0.4949092508929545, + "learning_rate": 9.777128786260995e-06, + "loss": 0.0491, + "step": 1674 + }, + { + "epoch": 0.7439484787919165, + "grad_norm": 0.49019523675297216, + "learning_rate": 9.776556110174882e-06, + "loss": 0.0487, + "step": 1675 + }, + { + "epoch": 0.7443926271374639, + "grad_norm": 0.6147792916360124, + "learning_rate": 9.775982716091245e-06, + "loss": 0.0468, + "step": 1676 + }, + { + "epoch": 0.7448367754830113, + "grad_norm": 0.7728423947263149, + "learning_rate": 9.775408604096276e-06, + "loss": 0.0843, + "step": 1677 + }, + { + "epoch": 0.7452809238285587, + "grad_norm": 0.46460994551018925, + "learning_rate": 9.774833774276278e-06, + "loss": 0.0475, + "step": 1678 + }, + { + "epoch": 0.7457250721741061, + "grad_norm": 0.6020989880539805, + "learning_rate": 9.77425822671765e-06, + "loss": 0.0548, + "step": 1679 + }, + { + "epoch": 0.7461692205196535, + "grad_norm": 0.4185492378453488, + "learning_rate": 9.77368196150691e-06, + "loss": 0.0697, + "step": 1680 + }, + { + "epoch": 0.746613368865201, + "grad_norm": 0.47026038790260477, + "learning_rate": 9.77310497873068e-06, + "loss": 0.0577, + "step": 1681 + }, + { + "epoch": 0.7470575172107484, + "grad_norm": 0.7242331312035356, + "learning_rate": 9.772527278475694e-06, + "loss": 0.0646, + "step": 1682 + }, + { + "epoch": 0.7475016655562958, + "grad_norm": 0.5498639479583922, + "learning_rate": 9.771948860828783e-06, + "loss": 0.0768, + "step": 1683 + }, + { + "epoch": 0.7479458139018432, + "grad_norm": 0.5132952000374971, + "learning_rate": 9.7713697258769e-06, + "loss": 0.0749, + "step": 1684 + }, + { + "epoch": 0.7483899622473906, + "grad_norm": 0.567848025082148, + "learning_rate": 9.770789873707095e-06, + "loss": 0.0853, + "step": 1685 + }, + { + "epoch": 0.748834110592938, + "grad_norm": 0.5686689909367685, + "learning_rate": 9.770209304406531e-06, + "loss": 0.0628, + "step": 1686 + }, + { + "epoch": 0.7492782589384854, + "grad_norm": 0.6034469126916269, + "learning_rate": 9.769628018062477e-06, + "loss": 0.0479, + "step": 1687 + }, + { + "epoch": 0.7497224072840328, + "grad_norm": 0.43723219237091576, + "learning_rate": 9.769046014762307e-06, + "loss": 0.0654, + "step": 1688 + }, + { + "epoch": 0.7501665556295802, + "grad_norm": 0.6855260472494596, + "learning_rate": 9.76846329459351e-06, + "loss": 0.0651, + "step": 1689 + }, + { + "epoch": 0.7506107039751277, + "grad_norm": 0.722060394293234, + "learning_rate": 9.767879857643681e-06, + "loss": 0.0552, + "step": 1690 + }, + { + "epoch": 0.7510548523206751, + "grad_norm": 0.463106704754472, + "learning_rate": 9.767295704000514e-06, + "loss": 0.0534, + "step": 1691 + }, + { + "epoch": 0.7514990006662225, + "grad_norm": 0.6415358905679467, + "learning_rate": 9.766710833751823e-06, + "loss": 0.0806, + "step": 1692 + }, + { + "epoch": 0.75194314901177, + "grad_norm": 0.5352784756952503, + "learning_rate": 9.76612524698552e-06, + "loss": 0.0558, + "step": 1693 + }, + { + "epoch": 0.7523872973573174, + "grad_norm": 1.1814652854922993, + "learning_rate": 9.76553894378963e-06, + "loss": 0.0847, + "step": 1694 + }, + { + "epoch": 0.7528314457028648, + "grad_norm": 0.6986298733118028, + "learning_rate": 9.764951924252284e-06, + "loss": 0.05, + "step": 1695 + }, + { + "epoch": 0.7532755940484122, + "grad_norm": 0.5447554709895742, + "learning_rate": 9.764364188461723e-06, + "loss": 0.0485, + "step": 1696 + }, + { + "epoch": 0.7537197423939596, + "grad_norm": 0.47846902684606657, + "learning_rate": 9.76377573650629e-06, + "loss": 0.0551, + "step": 1697 + }, + { + "epoch": 0.754163890739507, + "grad_norm": 0.8115375903439791, + "learning_rate": 9.763186568474443e-06, + "loss": 0.0701, + "step": 1698 + }, + { + "epoch": 0.7546080390850544, + "grad_norm": 0.5954397107201211, + "learning_rate": 9.762596684454742e-06, + "loss": 0.0474, + "step": 1699 + }, + { + "epoch": 0.7550521874306019, + "grad_norm": 0.9043348007985654, + "learning_rate": 9.762006084535857e-06, + "loss": 0.0752, + "step": 1700 + }, + { + "epoch": 0.7554963357761493, + "grad_norm": 0.5876955603814114, + "learning_rate": 9.761414768806566e-06, + "loss": 0.058, + "step": 1701 + }, + { + "epoch": 0.7559404841216967, + "grad_norm": 0.6215494346347864, + "learning_rate": 9.76082273735575e-06, + "loss": 0.074, + "step": 1702 + }, + { + "epoch": 0.7563846324672441, + "grad_norm": 0.6691646984262034, + "learning_rate": 9.760229990272407e-06, + "loss": 0.0752, + "step": 1703 + }, + { + "epoch": 0.7568287808127915, + "grad_norm": 0.6530103461059469, + "learning_rate": 9.759636527645633e-06, + "loss": 0.0512, + "step": 1704 + }, + { + "epoch": 0.7572729291583389, + "grad_norm": 0.5915232768461202, + "learning_rate": 9.759042349564638e-06, + "loss": 0.0505, + "step": 1705 + }, + { + "epoch": 0.7577170775038863, + "grad_norm": 0.6211496225849954, + "learning_rate": 9.758447456118734e-06, + "loss": 0.0527, + "step": 1706 + }, + { + "epoch": 0.7581612258494337, + "grad_norm": 0.9060279971096812, + "learning_rate": 9.757851847397349e-06, + "loss": 0.0687, + "step": 1707 + }, + { + "epoch": 0.7586053741949811, + "grad_norm": 1.0250865769852973, + "learning_rate": 9.757255523490006e-06, + "loss": 0.0935, + "step": 1708 + }, + { + "epoch": 0.7590495225405285, + "grad_norm": 0.546436812313742, + "learning_rate": 9.756658484486348e-06, + "loss": 0.0667, + "step": 1709 + }, + { + "epoch": 0.759493670886076, + "grad_norm": 0.5515990500251661, + "learning_rate": 9.756060730476117e-06, + "loss": 0.0529, + "step": 1710 + }, + { + "epoch": 0.7599378192316234, + "grad_norm": 0.618132276932801, + "learning_rate": 9.755462261549167e-06, + "loss": 0.0506, + "step": 1711 + }, + { + "epoch": 0.7603819675771708, + "grad_norm": 0.6625277428767956, + "learning_rate": 9.754863077795459e-06, + "loss": 0.0718, + "step": 1712 + }, + { + "epoch": 0.7608261159227182, + "grad_norm": 0.45483688763929425, + "learning_rate": 9.754263179305058e-06, + "loss": 0.0446, + "step": 1713 + }, + { + "epoch": 0.7612702642682656, + "grad_norm": 0.6438763676440094, + "learning_rate": 9.753662566168142e-06, + "loss": 0.0856, + "step": 1714 + }, + { + "epoch": 0.761714412613813, + "grad_norm": 0.6595899132325235, + "learning_rate": 9.75306123847499e-06, + "loss": 0.0725, + "step": 1715 + }, + { + "epoch": 0.7621585609593604, + "grad_norm": 0.6533315196440966, + "learning_rate": 9.752459196315996e-06, + "loss": 0.0607, + "step": 1716 + }, + { + "epoch": 0.7626027093049078, + "grad_norm": 0.5435323069050279, + "learning_rate": 9.751856439781653e-06, + "loss": 0.0511, + "step": 1717 + }, + { + "epoch": 0.7630468576504552, + "grad_norm": 0.5454956836386448, + "learning_rate": 9.751252968962567e-06, + "loss": 0.0828, + "step": 1718 + }, + { + "epoch": 0.7634910059960026, + "grad_norm": 0.5715320834454507, + "learning_rate": 9.75064878394945e-06, + "loss": 0.0518, + "step": 1719 + }, + { + "epoch": 0.7639351543415501, + "grad_norm": 0.6744811382532548, + "learning_rate": 9.750043884833121e-06, + "loss": 0.0508, + "step": 1720 + }, + { + "epoch": 0.7643793026870975, + "grad_norm": 0.6108688890165651, + "learning_rate": 9.749438271704508e-06, + "loss": 0.0615, + "step": 1721 + }, + { + "epoch": 0.7648234510326449, + "grad_norm": 0.5674222482544424, + "learning_rate": 9.748831944654643e-06, + "loss": 0.0644, + "step": 1722 + }, + { + "epoch": 0.7652675993781923, + "grad_norm": 0.7356158997613679, + "learning_rate": 9.74822490377467e-06, + "loss": 0.0733, + "step": 1723 + }, + { + "epoch": 0.7657117477237397, + "grad_norm": 0.4849462211495647, + "learning_rate": 9.747617149155834e-06, + "loss": 0.0443, + "step": 1724 + }, + { + "epoch": 0.7661558960692871, + "grad_norm": 0.831191855315457, + "learning_rate": 9.747008680889493e-06, + "loss": 0.0659, + "step": 1725 + }, + { + "epoch": 0.7666000444148345, + "grad_norm": 0.6526473429491929, + "learning_rate": 9.746399499067109e-06, + "loss": 0.064, + "step": 1726 + }, + { + "epoch": 0.7670441927603819, + "grad_norm": 0.5803840610292479, + "learning_rate": 9.745789603780254e-06, + "loss": 0.053, + "step": 1727 + }, + { + "epoch": 0.7674883411059293, + "grad_norm": 0.6610162442549882, + "learning_rate": 9.745178995120604e-06, + "loss": 0.0626, + "step": 1728 + }, + { + "epoch": 0.7679324894514767, + "grad_norm": 0.4876204778498726, + "learning_rate": 9.744567673179946e-06, + "loss": 0.0522, + "step": 1729 + }, + { + "epoch": 0.7683766377970243, + "grad_norm": 0.5622038449381063, + "learning_rate": 9.743955638050169e-06, + "loss": 0.0524, + "step": 1730 + }, + { + "epoch": 0.7688207861425717, + "grad_norm": 0.577658138102478, + "learning_rate": 9.743342889823273e-06, + "loss": 0.0559, + "step": 1731 + }, + { + "epoch": 0.7692649344881191, + "grad_norm": 0.6597870381577758, + "learning_rate": 9.742729428591368e-06, + "loss": 0.0603, + "step": 1732 + }, + { + "epoch": 0.7697090828336665, + "grad_norm": 0.8344370622440579, + "learning_rate": 9.742115254446665e-06, + "loss": 0.0836, + "step": 1733 + }, + { + "epoch": 0.7701532311792139, + "grad_norm": 0.5622150768145531, + "learning_rate": 9.741500367481481e-06, + "loss": 0.0826, + "step": 1734 + }, + { + "epoch": 0.7705973795247613, + "grad_norm": 0.6330801870134145, + "learning_rate": 9.740884767788253e-06, + "loss": 0.0569, + "step": 1735 + }, + { + "epoch": 0.7710415278703087, + "grad_norm": 0.5484349976426807, + "learning_rate": 9.740268455459507e-06, + "loss": 0.0759, + "step": 1736 + }, + { + "epoch": 0.7714856762158561, + "grad_norm": 0.6003363970239456, + "learning_rate": 9.739651430587891e-06, + "loss": 0.0559, + "step": 1737 + }, + { + "epoch": 0.7719298245614035, + "grad_norm": 0.6514870858876586, + "learning_rate": 9.739033693266152e-06, + "loss": 0.0583, + "step": 1738 + }, + { + "epoch": 0.7723739729069509, + "grad_norm": 0.7128176043400777, + "learning_rate": 9.738415243587146e-06, + "loss": 0.0615, + "step": 1739 + }, + { + "epoch": 0.7728181212524984, + "grad_norm": 0.4997932884849887, + "learning_rate": 9.737796081643838e-06, + "loss": 0.0572, + "step": 1740 + }, + { + "epoch": 0.7732622695980458, + "grad_norm": 0.6809941857723402, + "learning_rate": 9.737176207529296e-06, + "loss": 0.0579, + "step": 1741 + }, + { + "epoch": 0.7737064179435932, + "grad_norm": 0.579421035714199, + "learning_rate": 9.736555621336701e-06, + "loss": 0.0553, + "step": 1742 + }, + { + "epoch": 0.7741505662891406, + "grad_norm": 0.49198429179195113, + "learning_rate": 9.735934323159337e-06, + "loss": 0.046, + "step": 1743 + }, + { + "epoch": 0.774594714634688, + "grad_norm": 0.6373312482523877, + "learning_rate": 9.735312313090593e-06, + "loss": 0.0645, + "step": 1744 + }, + { + "epoch": 0.7750388629802354, + "grad_norm": 0.49553655680906583, + "learning_rate": 9.734689591223971e-06, + "loss": 0.0523, + "step": 1745 + }, + { + "epoch": 0.7754830113257828, + "grad_norm": 0.7575640777298455, + "learning_rate": 9.734066157653075e-06, + "loss": 0.1155, + "step": 1746 + }, + { + "epoch": 0.7759271596713302, + "grad_norm": 0.3941642367752884, + "learning_rate": 9.733442012471617e-06, + "loss": 0.0494, + "step": 1747 + }, + { + "epoch": 0.7763713080168776, + "grad_norm": 0.6007166723222428, + "learning_rate": 9.732817155773417e-06, + "loss": 0.0809, + "step": 1748 + }, + { + "epoch": 0.7768154563624251, + "grad_norm": 0.6048168057227016, + "learning_rate": 9.732191587652402e-06, + "loss": 0.0827, + "step": 1749 + }, + { + "epoch": 0.7772596047079725, + "grad_norm": 0.5258636663852201, + "learning_rate": 9.731565308202607e-06, + "loss": 0.0532, + "step": 1750 + }, + { + "epoch": 0.7777037530535199, + "grad_norm": 0.5183503036370198, + "learning_rate": 9.73093831751817e-06, + "loss": 0.0597, + "step": 1751 + }, + { + "epoch": 0.7781479013990673, + "grad_norm": 0.4539057898000694, + "learning_rate": 9.73031061569334e-06, + "loss": 0.0449, + "step": 1752 + }, + { + "epoch": 0.7785920497446147, + "grad_norm": 0.43734440001807745, + "learning_rate": 9.72968220282247e-06, + "loss": 0.0496, + "step": 1753 + }, + { + "epoch": 0.7790361980901621, + "grad_norm": 0.4621698388749695, + "learning_rate": 9.729053079000021e-06, + "loss": 0.0446, + "step": 1754 + }, + { + "epoch": 0.7794803464357095, + "grad_norm": 0.8795540562598588, + "learning_rate": 9.728423244320561e-06, + "loss": 0.0756, + "step": 1755 + }, + { + "epoch": 0.7799244947812569, + "grad_norm": 0.517878545048104, + "learning_rate": 9.727792698878767e-06, + "loss": 0.0577, + "step": 1756 + }, + { + "epoch": 0.7803686431268043, + "grad_norm": 0.5755116631603886, + "learning_rate": 9.72716144276942e-06, + "loss": 0.0507, + "step": 1757 + }, + { + "epoch": 0.7808127914723517, + "grad_norm": 0.501557195914397, + "learning_rate": 9.726529476087406e-06, + "loss": 0.0695, + "step": 1758 + }, + { + "epoch": 0.7812569398178992, + "grad_norm": 0.6162038871090194, + "learning_rate": 9.725896798927724e-06, + "loss": 0.073, + "step": 1759 + }, + { + "epoch": 0.7817010881634466, + "grad_norm": 0.44599102883855124, + "learning_rate": 9.725263411385471e-06, + "loss": 0.046, + "step": 1760 + }, + { + "epoch": 0.782145236508994, + "grad_norm": 0.5089144227251173, + "learning_rate": 9.724629313555862e-06, + "loss": 0.0566, + "step": 1761 + }, + { + "epoch": 0.7825893848545414, + "grad_norm": 0.5159558058324487, + "learning_rate": 9.723994505534209e-06, + "loss": 0.062, + "step": 1762 + }, + { + "epoch": 0.7830335332000888, + "grad_norm": 0.7445098154181274, + "learning_rate": 9.723358987415933e-06, + "loss": 0.0774, + "step": 1763 + }, + { + "epoch": 0.7834776815456362, + "grad_norm": 0.5480997952522682, + "learning_rate": 9.722722759296568e-06, + "loss": 0.0446, + "step": 1764 + }, + { + "epoch": 0.7839218298911836, + "grad_norm": 0.6171588111915909, + "learning_rate": 9.722085821271747e-06, + "loss": 0.0695, + "step": 1765 + }, + { + "epoch": 0.784365978236731, + "grad_norm": 0.5440136789733718, + "learning_rate": 9.721448173437212e-06, + "loss": 0.0509, + "step": 1766 + }, + { + "epoch": 0.7848101265822784, + "grad_norm": 0.5213635915956231, + "learning_rate": 9.720809815888814e-06, + "loss": 0.0543, + "step": 1767 + }, + { + "epoch": 0.7852542749278258, + "grad_norm": 0.5190117014644481, + "learning_rate": 9.720170748722507e-06, + "loss": 0.0625, + "step": 1768 + }, + { + "epoch": 0.7856984232733734, + "grad_norm": 0.4927607718599878, + "learning_rate": 9.719530972034356e-06, + "loss": 0.0609, + "step": 1769 + }, + { + "epoch": 0.7861425716189208, + "grad_norm": 0.48217603174270474, + "learning_rate": 9.718890485920529e-06, + "loss": 0.0496, + "step": 1770 + }, + { + "epoch": 0.7865867199644682, + "grad_norm": 0.5424722566403071, + "learning_rate": 9.7182492904773e-06, + "loss": 0.0553, + "step": 1771 + }, + { + "epoch": 0.7870308683100156, + "grad_norm": 0.549948589003628, + "learning_rate": 9.717607385801055e-06, + "loss": 0.0673, + "step": 1772 + }, + { + "epoch": 0.787475016655563, + "grad_norm": 0.5315442944470616, + "learning_rate": 9.716964771988281e-06, + "loss": 0.0696, + "step": 1773 + }, + { + "epoch": 0.7879191650011104, + "grad_norm": 0.6429809896985884, + "learning_rate": 9.716321449135578e-06, + "loss": 0.0789, + "step": 1774 + }, + { + "epoch": 0.7883633133466578, + "grad_norm": 0.7545678487097474, + "learning_rate": 9.715677417339641e-06, + "loss": 0.0791, + "step": 1775 + }, + { + "epoch": 0.7888074616922052, + "grad_norm": 0.5539973797183765, + "learning_rate": 9.715032676697285e-06, + "loss": 0.059, + "step": 1776 + }, + { + "epoch": 0.7892516100377526, + "grad_norm": 0.5317979991080277, + "learning_rate": 9.714387227305422e-06, + "loss": 0.0622, + "step": 1777 + }, + { + "epoch": 0.7896957583833, + "grad_norm": 0.5252592441778533, + "learning_rate": 9.713741069261076e-06, + "loss": 0.0463, + "step": 1778 + }, + { + "epoch": 0.7901399067288475, + "grad_norm": 0.4965411657812668, + "learning_rate": 9.713094202661374e-06, + "loss": 0.0498, + "step": 1779 + }, + { + "epoch": 0.7905840550743949, + "grad_norm": 0.7856085130172293, + "learning_rate": 9.712446627603553e-06, + "loss": 0.0732, + "step": 1780 + }, + { + "epoch": 0.7910282034199423, + "grad_norm": 0.6433472150837076, + "learning_rate": 9.711798344184952e-06, + "loss": 0.0536, + "step": 1781 + }, + { + "epoch": 0.7914723517654897, + "grad_norm": 0.5926660288471911, + "learning_rate": 9.711149352503022e-06, + "loss": 0.0611, + "step": 1782 + }, + { + "epoch": 0.7919165001110371, + "grad_norm": 0.5407209189318974, + "learning_rate": 9.710499652655313e-06, + "loss": 0.0464, + "step": 1783 + }, + { + "epoch": 0.7923606484565845, + "grad_norm": 1.2981488377904062, + "learning_rate": 9.709849244739493e-06, + "loss": 0.111, + "step": 1784 + }, + { + "epoch": 0.7928047968021319, + "grad_norm": 0.6752401202019049, + "learning_rate": 9.709198128853323e-06, + "loss": 0.0662, + "step": 1785 + }, + { + "epoch": 0.7932489451476793, + "grad_norm": 0.5128430129414299, + "learning_rate": 9.708546305094679e-06, + "loss": 0.0892, + "step": 1786 + }, + { + "epoch": 0.7936930934932267, + "grad_norm": 0.5031597058072875, + "learning_rate": 9.707893773561541e-06, + "loss": 0.0552, + "step": 1787 + }, + { + "epoch": 0.7941372418387741, + "grad_norm": 0.7529404051762868, + "learning_rate": 9.707240534351995e-06, + "loss": 0.0829, + "step": 1788 + }, + { + "epoch": 0.7945813901843216, + "grad_norm": 0.549999308663066, + "learning_rate": 9.706586587564236e-06, + "loss": 0.0456, + "step": 1789 + }, + { + "epoch": 0.795025538529869, + "grad_norm": 0.5507988655050372, + "learning_rate": 9.705931933296563e-06, + "loss": 0.0565, + "step": 1790 + }, + { + "epoch": 0.7954696868754164, + "grad_norm": 0.5120352810106473, + "learning_rate": 9.705276571647377e-06, + "loss": 0.0529, + "step": 1791 + }, + { + "epoch": 0.7959138352209638, + "grad_norm": 0.6321538206444037, + "learning_rate": 9.704620502715196e-06, + "loss": 0.0604, + "step": 1792 + }, + { + "epoch": 0.7963579835665112, + "grad_norm": 0.5978551293263742, + "learning_rate": 9.703963726598636e-06, + "loss": 0.0615, + "step": 1793 + }, + { + "epoch": 0.7968021319120586, + "grad_norm": 0.6845514683107613, + "learning_rate": 9.70330624339642e-06, + "loss": 0.0759, + "step": 1794 + }, + { + "epoch": 0.797246280257606, + "grad_norm": 0.5162860917319786, + "learning_rate": 9.702648053207381e-06, + "loss": 0.0606, + "step": 1795 + }, + { + "epoch": 0.7976904286031534, + "grad_norm": 0.5757276725457067, + "learning_rate": 9.701989156130459e-06, + "loss": 0.0494, + "step": 1796 + }, + { + "epoch": 0.7981345769487008, + "grad_norm": 0.7240460732805346, + "learning_rate": 9.70132955226469e-06, + "loss": 0.0681, + "step": 1797 + }, + { + "epoch": 0.7985787252942482, + "grad_norm": 0.6852456458554386, + "learning_rate": 9.700669241709229e-06, + "loss": 0.0696, + "step": 1798 + }, + { + "epoch": 0.7990228736397957, + "grad_norm": 0.7728747951665925, + "learning_rate": 9.70000822456333e-06, + "loss": 0.075, + "step": 1799 + }, + { + "epoch": 0.7994670219853431, + "grad_norm": 0.44864271076401085, + "learning_rate": 9.699346500926357e-06, + "loss": 0.0396, + "step": 1800 + }, + { + "epoch": 0.7999111703308905, + "grad_norm": 0.45736945189860095, + "learning_rate": 9.698684070897774e-06, + "loss": 0.0528, + "step": 1801 + }, + { + "epoch": 0.8003553186764379, + "grad_norm": 0.7755479737213115, + "learning_rate": 9.69802093457716e-06, + "loss": 0.0604, + "step": 1802 + }, + { + "epoch": 0.8007994670219853, + "grad_norm": 0.5300657753961209, + "learning_rate": 9.697357092064196e-06, + "loss": 0.0675, + "step": 1803 + }, + { + "epoch": 0.8012436153675327, + "grad_norm": 0.5613880914239765, + "learning_rate": 9.696692543458666e-06, + "loss": 0.0565, + "step": 1804 + }, + { + "epoch": 0.8016877637130801, + "grad_norm": 1.5622355700759611, + "learning_rate": 9.696027288860463e-06, + "loss": 0.0748, + "step": 1805 + }, + { + "epoch": 0.8021319120586275, + "grad_norm": 0.6809476301245401, + "learning_rate": 9.695361328369588e-06, + "loss": 0.077, + "step": 1806 + }, + { + "epoch": 0.802576060404175, + "grad_norm": 0.573004020361602, + "learning_rate": 9.694694662086143e-06, + "loss": 0.0688, + "step": 1807 + }, + { + "epoch": 0.8030202087497224, + "grad_norm": 0.48241041676332075, + "learning_rate": 9.694027290110344e-06, + "loss": 0.046, + "step": 1808 + }, + { + "epoch": 0.8034643570952699, + "grad_norm": 0.726848223856282, + "learning_rate": 9.693359212542504e-06, + "loss": 0.0554, + "step": 1809 + }, + { + "epoch": 0.8039085054408173, + "grad_norm": 0.5939220725933183, + "learning_rate": 9.692690429483049e-06, + "loss": 0.0599, + "step": 1810 + }, + { + "epoch": 0.8043526537863647, + "grad_norm": 0.8703575105729704, + "learning_rate": 9.692020941032508e-06, + "loss": 0.0697, + "step": 1811 + }, + { + "epoch": 0.8047968021319121, + "grad_norm": 0.6745036941332984, + "learning_rate": 9.691350747291514e-06, + "loss": 0.0622, + "step": 1812 + }, + { + "epoch": 0.8052409504774595, + "grad_norm": 0.5924055978232626, + "learning_rate": 9.690679848360811e-06, + "loss": 0.068, + "step": 1813 + }, + { + "epoch": 0.8056850988230069, + "grad_norm": 0.6624345407427882, + "learning_rate": 9.690008244341247e-06, + "loss": 0.0671, + "step": 1814 + }, + { + "epoch": 0.8061292471685543, + "grad_norm": 0.760605230819851, + "learning_rate": 9.689335935333775e-06, + "loss": 0.0703, + "step": 1815 + }, + { + "epoch": 0.8065733955141017, + "grad_norm": 0.6543633249820602, + "learning_rate": 9.688662921439454e-06, + "loss": 0.0537, + "step": 1816 + }, + { + "epoch": 0.8070175438596491, + "grad_norm": 0.6888300517583433, + "learning_rate": 9.687989202759448e-06, + "loss": 0.0495, + "step": 1817 + }, + { + "epoch": 0.8074616922051966, + "grad_norm": 0.686773842932207, + "learning_rate": 9.68731477939503e-06, + "loss": 0.0659, + "step": 1818 + }, + { + "epoch": 0.807905840550744, + "grad_norm": 0.5300847493623394, + "learning_rate": 9.686639651447578e-06, + "loss": 0.0518, + "step": 1819 + }, + { + "epoch": 0.8083499888962914, + "grad_norm": 0.5159319768487426, + "learning_rate": 9.685963819018575e-06, + "loss": 0.0515, + "step": 1820 + }, + { + "epoch": 0.8087941372418388, + "grad_norm": 0.7950101365777946, + "learning_rate": 9.685287282209607e-06, + "loss": 0.0728, + "step": 1821 + }, + { + "epoch": 0.8092382855873862, + "grad_norm": 1.1412965984185115, + "learning_rate": 9.684610041122375e-06, + "loss": 0.0802, + "step": 1822 + }, + { + "epoch": 0.8096824339329336, + "grad_norm": 0.45513195688472363, + "learning_rate": 9.683932095858673e-06, + "loss": 0.0615, + "step": 1823 + }, + { + "epoch": 0.810126582278481, + "grad_norm": 0.7127474010386736, + "learning_rate": 9.683253446520412e-06, + "loss": 0.0617, + "step": 1824 + }, + { + "epoch": 0.8105707306240284, + "grad_norm": 0.5146350672182237, + "learning_rate": 9.682574093209603e-06, + "loss": 0.0821, + "step": 1825 + }, + { + "epoch": 0.8110148789695758, + "grad_norm": 0.5927766784074162, + "learning_rate": 9.681894036028365e-06, + "loss": 0.0616, + "step": 1826 + }, + { + "epoch": 0.8114590273151232, + "grad_norm": 0.6070773128654839, + "learning_rate": 9.681213275078922e-06, + "loss": 0.0615, + "step": 1827 + }, + { + "epoch": 0.8119031756606707, + "grad_norm": 0.532740364018843, + "learning_rate": 9.680531810463606e-06, + "loss": 0.0572, + "step": 1828 + }, + { + "epoch": 0.8123473240062181, + "grad_norm": 0.5406785460306237, + "learning_rate": 9.679849642284846e-06, + "loss": 0.0553, + "step": 1829 + }, + { + "epoch": 0.8127914723517655, + "grad_norm": 0.7113722607782379, + "learning_rate": 9.679166770645193e-06, + "loss": 0.0588, + "step": 1830 + }, + { + "epoch": 0.8132356206973129, + "grad_norm": 0.5228564923020365, + "learning_rate": 9.678483195647286e-06, + "loss": 0.0762, + "step": 1831 + }, + { + "epoch": 0.8136797690428603, + "grad_norm": 0.3998194117652634, + "learning_rate": 9.67779891739388e-06, + "loss": 0.045, + "step": 1832 + }, + { + "epoch": 0.8141239173884077, + "grad_norm": 0.46502028820157165, + "learning_rate": 9.677113935987839e-06, + "loss": 0.0508, + "step": 1833 + }, + { + "epoch": 0.8145680657339551, + "grad_norm": 0.6755084418924148, + "learning_rate": 9.67642825153212e-06, + "loss": 0.0935, + "step": 1834 + }, + { + "epoch": 0.8150122140795025, + "grad_norm": 0.6820496339271076, + "learning_rate": 9.675741864129797e-06, + "loss": 0.0648, + "step": 1835 + }, + { + "epoch": 0.8154563624250499, + "grad_norm": 0.51959592407304, + "learning_rate": 9.675054773884045e-06, + "loss": 0.0804, + "step": 1836 + }, + { + "epoch": 0.8159005107705973, + "grad_norm": 0.4703704489029568, + "learning_rate": 9.674366980898145e-06, + "loss": 0.0639, + "step": 1837 + }, + { + "epoch": 0.8163446591161448, + "grad_norm": 0.4874780271160304, + "learning_rate": 9.673678485275484e-06, + "loss": 0.0508, + "step": 1838 + }, + { + "epoch": 0.8167888074616922, + "grad_norm": 0.44148176077100115, + "learning_rate": 9.672989287119555e-06, + "loss": 0.0588, + "step": 1839 + }, + { + "epoch": 0.8172329558072396, + "grad_norm": 0.7005816504280763, + "learning_rate": 9.672299386533956e-06, + "loss": 0.074, + "step": 1840 + }, + { + "epoch": 0.817677104152787, + "grad_norm": 0.6435780803327446, + "learning_rate": 9.67160878362239e-06, + "loss": 0.064, + "step": 1841 + }, + { + "epoch": 0.8181212524983345, + "grad_norm": 0.6068596716742378, + "learning_rate": 9.670917478488669e-06, + "loss": 0.0626, + "step": 1842 + }, + { + "epoch": 0.8185654008438819, + "grad_norm": 0.48495659068419156, + "learning_rate": 9.670225471236703e-06, + "loss": 0.0566, + "step": 1843 + }, + { + "epoch": 0.8190095491894293, + "grad_norm": 0.5732730166801319, + "learning_rate": 9.669532761970518e-06, + "loss": 0.0594, + "step": 1844 + }, + { + "epoch": 0.8194536975349767, + "grad_norm": 0.6406816449651614, + "learning_rate": 9.668839350794236e-06, + "loss": 0.0602, + "step": 1845 + }, + { + "epoch": 0.8198978458805241, + "grad_norm": 0.7660699462244367, + "learning_rate": 9.66814523781209e-06, + "loss": 0.1004, + "step": 1846 + }, + { + "epoch": 0.8203419942260715, + "grad_norm": 0.6805404383061566, + "learning_rate": 9.667450423128417e-06, + "loss": 0.0727, + "step": 1847 + }, + { + "epoch": 0.820786142571619, + "grad_norm": 1.8549554381012934, + "learning_rate": 9.666754906847659e-06, + "loss": 0.0612, + "step": 1848 + }, + { + "epoch": 0.8212302909171664, + "grad_norm": 0.9016082075933314, + "learning_rate": 9.666058689074364e-06, + "loss": 0.0792, + "step": 1849 + }, + { + "epoch": 0.8216744392627138, + "grad_norm": 0.5403087387935102, + "learning_rate": 9.665361769913187e-06, + "loss": 0.045, + "step": 1850 + }, + { + "epoch": 0.8221185876082612, + "grad_norm": 0.45933630900068073, + "learning_rate": 9.664664149468885e-06, + "loss": 0.046, + "step": 1851 + }, + { + "epoch": 0.8225627359538086, + "grad_norm": 0.6057928280686538, + "learning_rate": 9.663965827846321e-06, + "loss": 0.053, + "step": 1852 + }, + { + "epoch": 0.823006884299356, + "grad_norm": 0.7862585104580586, + "learning_rate": 9.663266805150468e-06, + "loss": 0.0706, + "step": 1853 + }, + { + "epoch": 0.8234510326449034, + "grad_norm": 0.59770705811337, + "learning_rate": 9.662567081486398e-06, + "loss": 0.0568, + "step": 1854 + }, + { + "epoch": 0.8238951809904508, + "grad_norm": 0.5457681124884074, + "learning_rate": 9.661866656959293e-06, + "loss": 0.0534, + "step": 1855 + }, + { + "epoch": 0.8243393293359982, + "grad_norm": 0.7281386032100584, + "learning_rate": 9.661165531674438e-06, + "loss": 0.081, + "step": 1856 + }, + { + "epoch": 0.8247834776815456, + "grad_norm": 0.5330397841966769, + "learning_rate": 9.660463705737224e-06, + "loss": 0.0657, + "step": 1857 + }, + { + "epoch": 0.8252276260270931, + "grad_norm": 0.5773185928330131, + "learning_rate": 9.65976117925315e-06, + "loss": 0.0657, + "step": 1858 + }, + { + "epoch": 0.8256717743726405, + "grad_norm": 0.6769338412517576, + "learning_rate": 9.659057952327812e-06, + "loss": 0.0713, + "step": 1859 + }, + { + "epoch": 0.8261159227181879, + "grad_norm": 0.5999087119449273, + "learning_rate": 9.65835402506692e-06, + "loss": 0.0776, + "step": 1860 + }, + { + "epoch": 0.8265600710637353, + "grad_norm": 0.46186334132300766, + "learning_rate": 9.657649397576289e-06, + "loss": 0.0435, + "step": 1861 + }, + { + "epoch": 0.8270042194092827, + "grad_norm": 0.6132380100477, + "learning_rate": 9.656944069961832e-06, + "loss": 0.0503, + "step": 1862 + }, + { + "epoch": 0.8274483677548301, + "grad_norm": 0.511934760130446, + "learning_rate": 9.656238042329575e-06, + "loss": 0.047, + "step": 1863 + }, + { + "epoch": 0.8278925161003775, + "grad_norm": 0.9622008544679613, + "learning_rate": 9.655531314785643e-06, + "loss": 0.0727, + "step": 1864 + }, + { + "epoch": 0.8283366644459249, + "grad_norm": 0.41625404724105025, + "learning_rate": 9.654823887436272e-06, + "loss": 0.0452, + "step": 1865 + }, + { + "epoch": 0.8287808127914723, + "grad_norm": 0.6534224883169892, + "learning_rate": 9.6541157603878e-06, + "loss": 0.0812, + "step": 1866 + }, + { + "epoch": 0.8292249611370197, + "grad_norm": 0.3831024637275559, + "learning_rate": 9.653406933746667e-06, + "loss": 0.0406, + "step": 1867 + }, + { + "epoch": 0.8296691094825672, + "grad_norm": 0.5750420337273457, + "learning_rate": 9.652697407619425e-06, + "loss": 0.0655, + "step": 1868 + }, + { + "epoch": 0.8301132578281146, + "grad_norm": 0.5340996381251997, + "learning_rate": 9.651987182112727e-06, + "loss": 0.0684, + "step": 1869 + }, + { + "epoch": 0.830557406173662, + "grad_norm": 0.5216822693020573, + "learning_rate": 9.651276257333334e-06, + "loss": 0.0506, + "step": 1870 + }, + { + "epoch": 0.8310015545192094, + "grad_norm": 0.6549627952655241, + "learning_rate": 9.650564633388106e-06, + "loss": 0.0746, + "step": 1871 + }, + { + "epoch": 0.8314457028647568, + "grad_norm": 0.41543419779685914, + "learning_rate": 9.649852310384017e-06, + "loss": 0.0418, + "step": 1872 + }, + { + "epoch": 0.8318898512103042, + "grad_norm": 0.8710457342050398, + "learning_rate": 9.649139288428136e-06, + "loss": 0.0725, + "step": 1873 + }, + { + "epoch": 0.8323339995558516, + "grad_norm": 0.7280421909189481, + "learning_rate": 9.648425567627646e-06, + "loss": 0.0834, + "step": 1874 + }, + { + "epoch": 0.832778147901399, + "grad_norm": 0.48421338065941566, + "learning_rate": 9.647711148089829e-06, + "loss": 0.051, + "step": 1875 + }, + { + "epoch": 0.8332222962469464, + "grad_norm": 0.8228707840085564, + "learning_rate": 9.646996029922078e-06, + "loss": 0.0899, + "step": 1876 + }, + { + "epoch": 0.8336664445924938, + "grad_norm": 0.44548368481732253, + "learning_rate": 9.646280213231882e-06, + "loss": 0.0459, + "step": 1877 + }, + { + "epoch": 0.8341105929380414, + "grad_norm": 0.77874935532614, + "learning_rate": 9.645563698126846e-06, + "loss": 0.0874, + "step": 1878 + }, + { + "epoch": 0.8345547412835888, + "grad_norm": 0.6153818855236423, + "learning_rate": 9.64484648471467e-06, + "loss": 0.0538, + "step": 1879 + }, + { + "epoch": 0.8349988896291362, + "grad_norm": 0.6341254163576385, + "learning_rate": 9.644128573103166e-06, + "loss": 0.0794, + "step": 1880 + }, + { + "epoch": 0.8354430379746836, + "grad_norm": 1.0916751108937903, + "learning_rate": 9.643409963400247e-06, + "loss": 0.0775, + "step": 1881 + }, + { + "epoch": 0.835887186320231, + "grad_norm": 0.5090370659080116, + "learning_rate": 9.642690655713935e-06, + "loss": 0.0515, + "step": 1882 + }, + { + "epoch": 0.8363313346657784, + "grad_norm": 0.8125756753691618, + "learning_rate": 9.641970650152351e-06, + "loss": 0.0856, + "step": 1883 + }, + { + "epoch": 0.8367754830113258, + "grad_norm": 0.7917848803419996, + "learning_rate": 9.641249946823722e-06, + "loss": 0.0789, + "step": 1884 + }, + { + "epoch": 0.8372196313568732, + "grad_norm": 0.6341635191576055, + "learning_rate": 9.640528545836388e-06, + "loss": 0.0689, + "step": 1885 + }, + { + "epoch": 0.8376637797024206, + "grad_norm": 0.45285744978069686, + "learning_rate": 9.639806447298786e-06, + "loss": 0.0502, + "step": 1886 + }, + { + "epoch": 0.8381079280479681, + "grad_norm": 0.5918006498840634, + "learning_rate": 9.639083651319455e-06, + "loss": 0.077, + "step": 1887 + }, + { + "epoch": 0.8385520763935155, + "grad_norm": 0.5566719058184162, + "learning_rate": 9.638360158007049e-06, + "loss": 0.0518, + "step": 1888 + }, + { + "epoch": 0.8389962247390629, + "grad_norm": 0.5775755465954959, + "learning_rate": 9.637635967470317e-06, + "loss": 0.0583, + "step": 1889 + }, + { + "epoch": 0.8394403730846103, + "grad_norm": 0.5105748469342627, + "learning_rate": 9.636911079818121e-06, + "loss": 0.0547, + "step": 1890 + }, + { + "epoch": 0.8398845214301577, + "grad_norm": 0.7153133794510262, + "learning_rate": 9.636185495159423e-06, + "loss": 0.0598, + "step": 1891 + }, + { + "epoch": 0.8403286697757051, + "grad_norm": 0.580672148124565, + "learning_rate": 9.63545921360329e-06, + "loss": 0.0601, + "step": 1892 + }, + { + "epoch": 0.8407728181212525, + "grad_norm": 0.43212755100892075, + "learning_rate": 9.634732235258895e-06, + "loss": 0.0501, + "step": 1893 + }, + { + "epoch": 0.8412169664667999, + "grad_norm": 0.6336655954017367, + "learning_rate": 9.634004560235513e-06, + "loss": 0.0742, + "step": 1894 + }, + { + "epoch": 0.8416611148123473, + "grad_norm": 0.6609701387334967, + "learning_rate": 9.633276188642529e-06, + "loss": 0.0579, + "step": 1895 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 0.6932382794686757, + "learning_rate": 9.632547120589426e-06, + "loss": 0.0581, + "step": 1896 + }, + { + "epoch": 0.8425494115034422, + "grad_norm": 0.5148690171452629, + "learning_rate": 9.631817356185799e-06, + "loss": 0.0507, + "step": 1897 + }, + { + "epoch": 0.8429935598489896, + "grad_norm": 0.5764411558916347, + "learning_rate": 9.631086895541343e-06, + "loss": 0.0485, + "step": 1898 + }, + { + "epoch": 0.843437708194537, + "grad_norm": 0.713479181836388, + "learning_rate": 9.630355738765859e-06, + "loss": 0.0811, + "step": 1899 + }, + { + "epoch": 0.8438818565400844, + "grad_norm": 0.6095181273242816, + "learning_rate": 9.62962388596925e-06, + "loss": 0.051, + "step": 1900 + }, + { + "epoch": 0.8443260048856318, + "grad_norm": 0.6746333568364197, + "learning_rate": 9.628891337261527e-06, + "loss": 0.051, + "step": 1901 + }, + { + "epoch": 0.8447701532311792, + "grad_norm": 0.5956567675202917, + "learning_rate": 9.628158092752807e-06, + "loss": 0.0544, + "step": 1902 + }, + { + "epoch": 0.8452143015767266, + "grad_norm": 0.9239154351205555, + "learning_rate": 9.627424152553305e-06, + "loss": 0.0801, + "step": 1903 + }, + { + "epoch": 0.845658449922274, + "grad_norm": 0.8434016325954864, + "learning_rate": 9.626689516773348e-06, + "loss": 0.0597, + "step": 1904 + }, + { + "epoch": 0.8461025982678214, + "grad_norm": 0.6102688127848747, + "learning_rate": 9.625954185523361e-06, + "loss": 0.0751, + "step": 1905 + }, + { + "epoch": 0.8465467466133688, + "grad_norm": 0.7017970233762413, + "learning_rate": 9.62521815891388e-06, + "loss": 0.0665, + "step": 1906 + }, + { + "epoch": 0.8469908949589163, + "grad_norm": 0.6370603868487351, + "learning_rate": 9.624481437055542e-06, + "loss": 0.0584, + "step": 1907 + }, + { + "epoch": 0.8474350433044637, + "grad_norm": 0.8073786270061732, + "learning_rate": 9.623744020059086e-06, + "loss": 0.0739, + "step": 1908 + }, + { + "epoch": 0.8478791916500111, + "grad_norm": 0.6765473850641361, + "learning_rate": 9.623005908035362e-06, + "loss": 0.0578, + "step": 1909 + }, + { + "epoch": 0.8483233399955585, + "grad_norm": 0.7723618661951789, + "learning_rate": 9.622267101095318e-06, + "loss": 0.0477, + "step": 1910 + }, + { + "epoch": 0.8487674883411059, + "grad_norm": 0.5605923323935563, + "learning_rate": 9.621527599350008e-06, + "loss": 0.0603, + "step": 1911 + }, + { + "epoch": 0.8492116366866533, + "grad_norm": 0.7755772246757074, + "learning_rate": 9.620787402910597e-06, + "loss": 0.0983, + "step": 1912 + }, + { + "epoch": 0.8496557850322007, + "grad_norm": 0.7813757821925909, + "learning_rate": 9.620046511888343e-06, + "loss": 0.0499, + "step": 1913 + }, + { + "epoch": 0.8500999333777481, + "grad_norm": 0.8027788261683163, + "learning_rate": 9.619304926394619e-06, + "loss": 0.0588, + "step": 1914 + }, + { + "epoch": 0.8505440817232955, + "grad_norm": 0.8601044017181758, + "learning_rate": 9.618562646540897e-06, + "loss": 0.0638, + "step": 1915 + }, + { + "epoch": 0.8509882300688429, + "grad_norm": 0.5716901663276188, + "learning_rate": 9.617819672438754e-06, + "loss": 0.056, + "step": 1916 + }, + { + "epoch": 0.8514323784143905, + "grad_norm": 0.7888611302795855, + "learning_rate": 9.617076004199868e-06, + "loss": 0.08, + "step": 1917 + }, + { + "epoch": 0.8518765267599379, + "grad_norm": 0.5937752892253898, + "learning_rate": 9.616331641936031e-06, + "loss": 0.0545, + "step": 1918 + }, + { + "epoch": 0.8523206751054853, + "grad_norm": 0.6861875565077891, + "learning_rate": 9.61558658575913e-06, + "loss": 0.055, + "step": 1919 + }, + { + "epoch": 0.8527648234510327, + "grad_norm": 0.8458011501127822, + "learning_rate": 9.614840835781159e-06, + "loss": 0.1023, + "step": 1920 + }, + { + "epoch": 0.8532089717965801, + "grad_norm": 0.5027378978279742, + "learning_rate": 9.614094392114218e-06, + "loss": 0.0518, + "step": 1921 + }, + { + "epoch": 0.8536531201421275, + "grad_norm": 0.5758297033096836, + "learning_rate": 9.613347254870511e-06, + "loss": 0.0476, + "step": 1922 + }, + { + "epoch": 0.8540972684876749, + "grad_norm": 0.8773520125115069, + "learning_rate": 9.612599424162344e-06, + "loss": 0.0858, + "step": 1923 + }, + { + "epoch": 0.8545414168332223, + "grad_norm": 0.5342112118627843, + "learning_rate": 9.61185090010213e-06, + "loss": 0.0532, + "step": 1924 + }, + { + "epoch": 0.8549855651787697, + "grad_norm": 0.617753551302806, + "learning_rate": 9.611101682802383e-06, + "loss": 0.0664, + "step": 1925 + }, + { + "epoch": 0.8554297135243171, + "grad_norm": 0.6816206410848374, + "learning_rate": 9.610351772375724e-06, + "loss": 0.0919, + "step": 1926 + }, + { + "epoch": 0.8558738618698646, + "grad_norm": 0.636349806575952, + "learning_rate": 9.609601168934878e-06, + "loss": 0.0544, + "step": 1927 + }, + { + "epoch": 0.856318010215412, + "grad_norm": 0.5095901638429818, + "learning_rate": 9.608849872592674e-06, + "loss": 0.0584, + "step": 1928 + }, + { + "epoch": 0.8567621585609594, + "grad_norm": 0.5240679834795968, + "learning_rate": 9.608097883462043e-06, + "loss": 0.0553, + "step": 1929 + }, + { + "epoch": 0.8572063069065068, + "grad_norm": 0.6099517865434585, + "learning_rate": 9.60734520165602e-06, + "loss": 0.0657, + "step": 1930 + }, + { + "epoch": 0.8576504552520542, + "grad_norm": 0.555945227169772, + "learning_rate": 9.60659182728775e-06, + "loss": 0.0737, + "step": 1931 + }, + { + "epoch": 0.8580946035976016, + "grad_norm": 0.5341529189989662, + "learning_rate": 9.605837760470476e-06, + "loss": 0.0494, + "step": 1932 + }, + { + "epoch": 0.858538751943149, + "grad_norm": 0.5439610693418829, + "learning_rate": 9.605083001317547e-06, + "loss": 0.0566, + "step": 1933 + }, + { + "epoch": 0.8589829002886964, + "grad_norm": 0.6077814464368465, + "learning_rate": 9.604327549942415e-06, + "loss": 0.0615, + "step": 1934 + }, + { + "epoch": 0.8594270486342438, + "grad_norm": 0.7793330895236799, + "learning_rate": 9.603571406458641e-06, + "loss": 0.0669, + "step": 1935 + }, + { + "epoch": 0.8598711969797912, + "grad_norm": 0.5859164596399333, + "learning_rate": 9.60281457097988e-06, + "loss": 0.0554, + "step": 1936 + }, + { + "epoch": 0.8603153453253387, + "grad_norm": 0.5913578923102047, + "learning_rate": 9.602057043619903e-06, + "loss": 0.0819, + "step": 1937 + }, + { + "epoch": 0.8607594936708861, + "grad_norm": 0.3937188804578706, + "learning_rate": 9.601298824492577e-06, + "loss": 0.0455, + "step": 1938 + }, + { + "epoch": 0.8612036420164335, + "grad_norm": 0.5687600158251599, + "learning_rate": 9.600539913711876e-06, + "loss": 0.0703, + "step": 1939 + }, + { + "epoch": 0.8616477903619809, + "grad_norm": 0.7038982027083199, + "learning_rate": 9.599780311391876e-06, + "loss": 0.0559, + "step": 1940 + }, + { + "epoch": 0.8620919387075283, + "grad_norm": 0.5434923734263241, + "learning_rate": 9.599020017646758e-06, + "loss": 0.059, + "step": 1941 + }, + { + "epoch": 0.8625360870530757, + "grad_norm": 0.6288012353236909, + "learning_rate": 9.59825903259081e-06, + "loss": 0.0664, + "step": 1942 + }, + { + "epoch": 0.8629802353986231, + "grad_norm": 0.7037145137062264, + "learning_rate": 9.597497356338415e-06, + "loss": 0.069, + "step": 1943 + }, + { + "epoch": 0.8634243837441705, + "grad_norm": 0.5095647439957363, + "learning_rate": 9.59673498900407e-06, + "loss": 0.0618, + "step": 1944 + }, + { + "epoch": 0.8638685320897179, + "grad_norm": 0.6494185255511562, + "learning_rate": 9.595971930702372e-06, + "loss": 0.0658, + "step": 1945 + }, + { + "epoch": 0.8643126804352653, + "grad_norm": 0.5454132982093647, + "learning_rate": 9.595208181548022e-06, + "loss": 0.0591, + "step": 1946 + }, + { + "epoch": 0.8647568287808128, + "grad_norm": 0.7893076144618603, + "learning_rate": 9.594443741655823e-06, + "loss": 0.0582, + "step": 1947 + }, + { + "epoch": 0.8652009771263602, + "grad_norm": 0.7430564668455467, + "learning_rate": 9.593678611140683e-06, + "loss": 0.0836, + "step": 1948 + }, + { + "epoch": 0.8656451254719076, + "grad_norm": 0.6526222088648067, + "learning_rate": 9.592912790117614e-06, + "loss": 0.0612, + "step": 1949 + }, + { + "epoch": 0.866089273817455, + "grad_norm": 0.5060704004253027, + "learning_rate": 9.592146278701734e-06, + "loss": 0.0528, + "step": 1950 + }, + { + "epoch": 0.8665334221630024, + "grad_norm": 0.7447569328611817, + "learning_rate": 9.591379077008263e-06, + "loss": 0.0657, + "step": 1951 + }, + { + "epoch": 0.8669775705085498, + "grad_norm": 0.6783080197900007, + "learning_rate": 9.590611185152521e-06, + "loss": 0.0748, + "step": 1952 + }, + { + "epoch": 0.8674217188540972, + "grad_norm": 0.7292228470690477, + "learning_rate": 9.589842603249935e-06, + "loss": 0.0626, + "step": 1953 + }, + { + "epoch": 0.8678658671996446, + "grad_norm": 0.5752723662054593, + "learning_rate": 9.58907333141604e-06, + "loss": 0.0562, + "step": 1954 + }, + { + "epoch": 0.868310015545192, + "grad_norm": 0.5789129306578911, + "learning_rate": 9.588303369766469e-06, + "loss": 0.0523, + "step": 1955 + }, + { + "epoch": 0.8687541638907396, + "grad_norm": 0.5972967849185621, + "learning_rate": 9.58753271841696e-06, + "loss": 0.0638, + "step": 1956 + }, + { + "epoch": 0.869198312236287, + "grad_norm": 0.9160241760382546, + "learning_rate": 9.586761377483355e-06, + "loss": 0.083, + "step": 1957 + }, + { + "epoch": 0.8696424605818344, + "grad_norm": 0.6452629891186096, + "learning_rate": 9.585989347081599e-06, + "loss": 0.0765, + "step": 1958 + }, + { + "epoch": 0.8700866089273818, + "grad_norm": 0.5181341158162395, + "learning_rate": 9.58521662732774e-06, + "loss": 0.063, + "step": 1959 + }, + { + "epoch": 0.8705307572729292, + "grad_norm": 0.5208504155693663, + "learning_rate": 9.584443218337935e-06, + "loss": 0.0609, + "step": 1960 + }, + { + "epoch": 0.8709749056184766, + "grad_norm": 0.5883552043198282, + "learning_rate": 9.583669120228439e-06, + "loss": 0.0644, + "step": 1961 + }, + { + "epoch": 0.871419053964024, + "grad_norm": 0.5457449002927042, + "learning_rate": 9.582894333115608e-06, + "loss": 0.0567, + "step": 1962 + }, + { + "epoch": 0.8718632023095714, + "grad_norm": 0.6679359172890208, + "learning_rate": 9.58211885711591e-06, + "loss": 0.0652, + "step": 1963 + }, + { + "epoch": 0.8723073506551188, + "grad_norm": 0.5085227180546535, + "learning_rate": 9.581342692345913e-06, + "loss": 0.0734, + "step": 1964 + }, + { + "epoch": 0.8727514990006662, + "grad_norm": 0.63235227685218, + "learning_rate": 9.580565838922285e-06, + "loss": 0.0501, + "step": 1965 + }, + { + "epoch": 0.8731956473462137, + "grad_norm": 0.5297922964410929, + "learning_rate": 9.579788296961801e-06, + "loss": 0.0525, + "step": 1966 + }, + { + "epoch": 0.8736397956917611, + "grad_norm": 0.5775438947377388, + "learning_rate": 9.57901006658134e-06, + "loss": 0.0544, + "step": 1967 + }, + { + "epoch": 0.8740839440373085, + "grad_norm": 0.708123483636742, + "learning_rate": 9.57823114789788e-06, + "loss": 0.0617, + "step": 1968 + }, + { + "epoch": 0.8745280923828559, + "grad_norm": 0.5577152090781191, + "learning_rate": 9.577451541028509e-06, + "loss": 0.0583, + "step": 1969 + }, + { + "epoch": 0.8749722407284033, + "grad_norm": 0.6520169618466134, + "learning_rate": 9.576671246090415e-06, + "loss": 0.0566, + "step": 1970 + }, + { + "epoch": 0.8754163890739507, + "grad_norm": 0.49841809419135175, + "learning_rate": 9.575890263200887e-06, + "loss": 0.0497, + "step": 1971 + }, + { + "epoch": 0.8758605374194981, + "grad_norm": 0.5197540662913069, + "learning_rate": 9.575108592477322e-06, + "loss": 0.0555, + "step": 1972 + }, + { + "epoch": 0.8763046857650455, + "grad_norm": 0.5124464300264364, + "learning_rate": 9.57432623403722e-06, + "loss": 0.0477, + "step": 1973 + }, + { + "epoch": 0.8767488341105929, + "grad_norm": 0.5779510703301827, + "learning_rate": 9.57354318799818e-06, + "loss": 0.0685, + "step": 1974 + }, + { + "epoch": 0.8771929824561403, + "grad_norm": 0.5097058321667625, + "learning_rate": 9.572759454477907e-06, + "loss": 0.0488, + "step": 1975 + }, + { + "epoch": 0.8776371308016878, + "grad_norm": 0.8063636142109883, + "learning_rate": 9.57197503359421e-06, + "loss": 0.0544, + "step": 1976 + }, + { + "epoch": 0.8780812791472352, + "grad_norm": 0.4428063296481966, + "learning_rate": 9.571189925465002e-06, + "loss": 0.0533, + "step": 1977 + }, + { + "epoch": 0.8785254274927826, + "grad_norm": 0.4839122950764424, + "learning_rate": 9.570404130208297e-06, + "loss": 0.0479, + "step": 1978 + }, + { + "epoch": 0.87896957583833, + "grad_norm": 0.7120182407032511, + "learning_rate": 9.569617647942214e-06, + "loss": 0.0611, + "step": 1979 + }, + { + "epoch": 0.8794137241838774, + "grad_norm": 0.792336377588857, + "learning_rate": 9.568830478784975e-06, + "loss": 0.0618, + "step": 1980 + }, + { + "epoch": 0.8798578725294248, + "grad_norm": 0.7252084523924553, + "learning_rate": 9.568042622854902e-06, + "loss": 0.075, + "step": 1981 + }, + { + "epoch": 0.8803020208749722, + "grad_norm": 0.5594674149617587, + "learning_rate": 9.567254080270427e-06, + "loss": 0.0619, + "step": 1982 + }, + { + "epoch": 0.8807461692205196, + "grad_norm": 0.7969483073047114, + "learning_rate": 9.566464851150078e-06, + "loss": 0.0626, + "step": 1983 + }, + { + "epoch": 0.881190317566067, + "grad_norm": 0.760520710752294, + "learning_rate": 9.565674935612495e-06, + "loss": 0.0584, + "step": 1984 + }, + { + "epoch": 0.8816344659116144, + "grad_norm": 0.626615448212967, + "learning_rate": 9.564884333776408e-06, + "loss": 0.0596, + "step": 1985 + }, + { + "epoch": 0.8820786142571619, + "grad_norm": 0.6020936472164464, + "learning_rate": 9.564093045760663e-06, + "loss": 0.0623, + "step": 1986 + }, + { + "epoch": 0.8825227626027093, + "grad_norm": 0.7034061959408842, + "learning_rate": 9.563301071684203e-06, + "loss": 0.0549, + "step": 1987 + }, + { + "epoch": 0.8829669109482567, + "grad_norm": 0.5695284311311956, + "learning_rate": 9.562508411666077e-06, + "loss": 0.0633, + "step": 1988 + }, + { + "epoch": 0.8834110592938041, + "grad_norm": 0.9573516672168828, + "learning_rate": 9.56171506582543e-06, + "loss": 0.0653, + "step": 1989 + }, + { + "epoch": 0.8838552076393515, + "grad_norm": 0.5805029234381518, + "learning_rate": 9.56092103428152e-06, + "loss": 0.0607, + "step": 1990 + }, + { + "epoch": 0.884299355984899, + "grad_norm": 0.9854040215753008, + "learning_rate": 9.560126317153702e-06, + "loss": 0.0792, + "step": 1991 + }, + { + "epoch": 0.8847435043304464, + "grad_norm": 0.7570653217798166, + "learning_rate": 9.559330914561435e-06, + "loss": 0.0574, + "step": 1992 + }, + { + "epoch": 0.8851876526759938, + "grad_norm": 0.6760651246150844, + "learning_rate": 9.558534826624281e-06, + "loss": 0.0586, + "step": 1993 + }, + { + "epoch": 0.8856318010215412, + "grad_norm": 0.7446222818408107, + "learning_rate": 9.55773805346191e-06, + "loss": 0.0903, + "step": 1994 + }, + { + "epoch": 0.8860759493670886, + "grad_norm": 0.4971467452542264, + "learning_rate": 9.556940595194085e-06, + "loss": 0.0495, + "step": 1995 + }, + { + "epoch": 0.8865200977126361, + "grad_norm": 0.48127766478143535, + "learning_rate": 9.55614245194068e-06, + "loss": 0.0476, + "step": 1996 + }, + { + "epoch": 0.8869642460581835, + "grad_norm": 0.5264335771790009, + "learning_rate": 9.555343623821669e-06, + "loss": 0.0656, + "step": 1997 + }, + { + "epoch": 0.8874083944037309, + "grad_norm": 1.0687153694098346, + "learning_rate": 9.554544110957128e-06, + "loss": 0.1082, + "step": 1998 + }, + { + "epoch": 0.8878525427492783, + "grad_norm": 0.6068726497280238, + "learning_rate": 9.553743913467241e-06, + "loss": 0.0613, + "step": 1999 + }, + { + "epoch": 0.8882966910948257, + "grad_norm": 0.7498237482418018, + "learning_rate": 9.552943031472289e-06, + "loss": 0.0771, + "step": 2000 + }, + { + "epoch": 0.8887408394403731, + "grad_norm": 0.6023014910820245, + "learning_rate": 9.552141465092659e-06, + "loss": 0.0602, + "step": 2001 + }, + { + "epoch": 0.8891849877859205, + "grad_norm": 0.6380409827625847, + "learning_rate": 9.551339214448838e-06, + "loss": 0.0619, + "step": 2002 + }, + { + "epoch": 0.8896291361314679, + "grad_norm": 0.9507144923151234, + "learning_rate": 9.55053627966142e-06, + "loss": 0.0869, + "step": 2003 + }, + { + "epoch": 0.8900732844770153, + "grad_norm": 0.5426047915412493, + "learning_rate": 9.5497326608511e-06, + "loss": 0.0571, + "step": 2004 + }, + { + "epoch": 0.8905174328225627, + "grad_norm": 0.44148111117293815, + "learning_rate": 9.548928358138672e-06, + "loss": 0.0508, + "step": 2005 + }, + { + "epoch": 0.8909615811681102, + "grad_norm": 0.6815953588544817, + "learning_rate": 9.548123371645042e-06, + "loss": 0.0648, + "step": 2006 + }, + { + "epoch": 0.8914057295136576, + "grad_norm": 0.7228672896480205, + "learning_rate": 9.547317701491207e-06, + "loss": 0.0751, + "step": 2007 + }, + { + "epoch": 0.891849877859205, + "grad_norm": 0.5288810936079249, + "learning_rate": 9.546511347798278e-06, + "loss": 0.0718, + "step": 2008 + }, + { + "epoch": 0.8922940262047524, + "grad_norm": 0.6326908645656705, + "learning_rate": 9.545704310687462e-06, + "loss": 0.0643, + "step": 2009 + }, + { + "epoch": 0.8927381745502998, + "grad_norm": 0.715169955221305, + "learning_rate": 9.54489659028007e-06, + "loss": 0.0551, + "step": 2010 + }, + { + "epoch": 0.8931823228958472, + "grad_norm": 0.5887866106129909, + "learning_rate": 9.544088186697515e-06, + "loss": 0.0537, + "step": 2011 + }, + { + "epoch": 0.8936264712413946, + "grad_norm": 0.9378252587331583, + "learning_rate": 9.543279100061316e-06, + "loss": 0.0823, + "step": 2012 + }, + { + "epoch": 0.894070619586942, + "grad_norm": 0.5493178853844625, + "learning_rate": 9.542469330493092e-06, + "loss": 0.0518, + "step": 2013 + }, + { + "epoch": 0.8945147679324894, + "grad_norm": 1.0489392256223762, + "learning_rate": 9.541658878114564e-06, + "loss": 0.0951, + "step": 2014 + }, + { + "epoch": 0.8949589162780368, + "grad_norm": 0.9003525371835955, + "learning_rate": 9.540847743047556e-06, + "loss": 0.0823, + "step": 2015 + }, + { + "epoch": 0.8954030646235843, + "grad_norm": 0.6349100893476417, + "learning_rate": 9.540035925413997e-06, + "loss": 0.0687, + "step": 2016 + }, + { + "epoch": 0.8958472129691317, + "grad_norm": 0.6722685205867144, + "learning_rate": 9.539223425335919e-06, + "loss": 0.0584, + "step": 2017 + }, + { + "epoch": 0.8962913613146791, + "grad_norm": 0.5752288899218686, + "learning_rate": 9.53841024293545e-06, + "loss": 0.0615, + "step": 2018 + }, + { + "epoch": 0.8967355096602265, + "grad_norm": 0.6641589129330268, + "learning_rate": 9.537596378334827e-06, + "loss": 0.0599, + "step": 2019 + }, + { + "epoch": 0.8971796580057739, + "grad_norm": 0.6171698716517034, + "learning_rate": 9.53678183165639e-06, + "loss": 0.0614, + "step": 2020 + }, + { + "epoch": 0.8976238063513213, + "grad_norm": 0.46805323991408604, + "learning_rate": 9.535966603022578e-06, + "loss": 0.0497, + "step": 2021 + }, + { + "epoch": 0.8980679546968687, + "grad_norm": 0.5561153083603769, + "learning_rate": 9.53515069255593e-06, + "loss": 0.0549, + "step": 2022 + }, + { + "epoch": 0.8985121030424161, + "grad_norm": 0.8567619540393755, + "learning_rate": 9.534334100379095e-06, + "loss": 0.087, + "step": 2023 + }, + { + "epoch": 0.8989562513879635, + "grad_norm": 0.7529977371586785, + "learning_rate": 9.533516826614822e-06, + "loss": 0.071, + "step": 2024 + }, + { + "epoch": 0.899400399733511, + "grad_norm": 0.47166342806955874, + "learning_rate": 9.532698871385957e-06, + "loss": 0.0598, + "step": 2025 + }, + { + "epoch": 0.8998445480790584, + "grad_norm": 0.5143678735436241, + "learning_rate": 9.531880234815454e-06, + "loss": 0.0564, + "step": 2026 + }, + { + "epoch": 0.9002886964246058, + "grad_norm": 0.7879319949686112, + "learning_rate": 9.53106091702637e-06, + "loss": 0.0721, + "step": 2027 + }, + { + "epoch": 0.9007328447701533, + "grad_norm": 0.9434208137805631, + "learning_rate": 9.53024091814186e-06, + "loss": 0.0777, + "step": 2028 + }, + { + "epoch": 0.9011769931157007, + "grad_norm": 0.5393835319630947, + "learning_rate": 9.529420238285185e-06, + "loss": 0.0729, + "step": 2029 + }, + { + "epoch": 0.901621141461248, + "grad_norm": 1.1442231822471376, + "learning_rate": 9.528598877579707e-06, + "loss": 0.0577, + "step": 2030 + }, + { + "epoch": 0.9020652898067955, + "grad_norm": 0.6771073887283316, + "learning_rate": 9.52777683614889e-06, + "loss": 0.0524, + "step": 2031 + }, + { + "epoch": 0.9025094381523429, + "grad_norm": 0.4601735486943438, + "learning_rate": 9.5269541141163e-06, + "loss": 0.049, + "step": 2032 + }, + { + "epoch": 0.9029535864978903, + "grad_norm": 0.857950337442236, + "learning_rate": 9.526130711605609e-06, + "loss": 0.0729, + "step": 2033 + }, + { + "epoch": 0.9033977348434377, + "grad_norm": 0.6334716311616548, + "learning_rate": 9.525306628740585e-06, + "loss": 0.0557, + "step": 2034 + }, + { + "epoch": 0.9038418831889852, + "grad_norm": 0.5606743700857357, + "learning_rate": 9.524481865645105e-06, + "loss": 0.0529, + "step": 2035 + }, + { + "epoch": 0.9042860315345326, + "grad_norm": 0.7009413752452551, + "learning_rate": 9.523656422443142e-06, + "loss": 0.0705, + "step": 2036 + }, + { + "epoch": 0.90473017988008, + "grad_norm": 0.5676919166524141, + "learning_rate": 9.522830299258773e-06, + "loss": 0.0642, + "step": 2037 + }, + { + "epoch": 0.9051743282256274, + "grad_norm": 0.5909379090037243, + "learning_rate": 9.522003496216184e-06, + "loss": 0.0611, + "step": 2038 + }, + { + "epoch": 0.9056184765711748, + "grad_norm": 0.6412971030112977, + "learning_rate": 9.521176013439652e-06, + "loss": 0.0668, + "step": 2039 + }, + { + "epoch": 0.9060626249167222, + "grad_norm": 0.6697892166854362, + "learning_rate": 9.520347851053567e-06, + "loss": 0.0589, + "step": 2040 + }, + { + "epoch": 0.9065067732622696, + "grad_norm": 0.4472682187482963, + "learning_rate": 9.51951900918241e-06, + "loss": 0.0468, + "step": 2041 + }, + { + "epoch": 0.906950921607817, + "grad_norm": 0.6790472985515921, + "learning_rate": 9.518689487950772e-06, + "loss": 0.064, + "step": 2042 + }, + { + "epoch": 0.9073950699533644, + "grad_norm": 0.4982007942752425, + "learning_rate": 9.517859287483347e-06, + "loss": 0.0555, + "step": 2043 + }, + { + "epoch": 0.9078392182989118, + "grad_norm": 0.4637933782350014, + "learning_rate": 9.517028407904925e-06, + "loss": 0.0565, + "step": 2044 + }, + { + "epoch": 0.9082833666444593, + "grad_norm": 0.5832414481867781, + "learning_rate": 9.516196849340402e-06, + "loss": 0.0742, + "step": 2045 + }, + { + "epoch": 0.9087275149900067, + "grad_norm": 0.5576423922621532, + "learning_rate": 9.515364611914777e-06, + "loss": 0.0628, + "step": 2046 + }, + { + "epoch": 0.9091716633355541, + "grad_norm": 0.5229540500090215, + "learning_rate": 9.514531695753146e-06, + "loss": 0.0517, + "step": 2047 + }, + { + "epoch": 0.9096158116811015, + "grad_norm": 0.6260810656444592, + "learning_rate": 9.513698100980715e-06, + "loss": 0.0828, + "step": 2048 + }, + { + "epoch": 0.9100599600266489, + "grad_norm": 0.4707579422417085, + "learning_rate": 9.512863827722785e-06, + "loss": 0.0593, + "step": 2049 + }, + { + "epoch": 0.9105041083721963, + "grad_norm": 0.5601794044808186, + "learning_rate": 9.51202887610476e-06, + "loss": 0.063, + "step": 2050 + }, + { + "epoch": 0.9109482567177437, + "grad_norm": 0.6023400251451128, + "learning_rate": 9.51119324625215e-06, + "loss": 0.0686, + "step": 2051 + }, + { + "epoch": 0.9113924050632911, + "grad_norm": 0.5886804844960863, + "learning_rate": 9.510356938290562e-06, + "loss": 0.0618, + "step": 2052 + }, + { + "epoch": 0.9118365534088385, + "grad_norm": 0.5729230838893967, + "learning_rate": 9.509519952345709e-06, + "loss": 0.0594, + "step": 2053 + }, + { + "epoch": 0.9122807017543859, + "grad_norm": 0.5913444178469247, + "learning_rate": 9.508682288543405e-06, + "loss": 0.0544, + "step": 2054 + }, + { + "epoch": 0.9127248500999334, + "grad_norm": 0.5581180816842339, + "learning_rate": 9.507843947009562e-06, + "loss": 0.0489, + "step": 2055 + }, + { + "epoch": 0.9131689984454808, + "grad_norm": 0.9592080607831761, + "learning_rate": 9.507004927870202e-06, + "loss": 0.0674, + "step": 2056 + }, + { + "epoch": 0.9136131467910282, + "grad_norm": 0.48343586169312186, + "learning_rate": 9.506165231251438e-06, + "loss": 0.0536, + "step": 2057 + }, + { + "epoch": 0.9140572951365756, + "grad_norm": 0.5238333120956294, + "learning_rate": 9.505324857279494e-06, + "loss": 0.0521, + "step": 2058 + }, + { + "epoch": 0.914501443482123, + "grad_norm": 0.4166519007433999, + "learning_rate": 9.504483806080694e-06, + "loss": 0.0586, + "step": 2059 + }, + { + "epoch": 0.9149455918276704, + "grad_norm": 0.6437419685785489, + "learning_rate": 9.503642077781457e-06, + "loss": 0.0767, + "step": 2060 + }, + { + "epoch": 0.9153897401732178, + "grad_norm": 0.6570037258452504, + "learning_rate": 9.502799672508314e-06, + "loss": 0.0587, + "step": 2061 + }, + { + "epoch": 0.9158338885187652, + "grad_norm": 0.6138821258144063, + "learning_rate": 9.501956590387891e-06, + "loss": 0.0736, + "step": 2062 + }, + { + "epoch": 0.9162780368643126, + "grad_norm": 0.6641641647032757, + "learning_rate": 9.501112831546917e-06, + "loss": 0.0606, + "step": 2063 + }, + { + "epoch": 0.91672218520986, + "grad_norm": 0.4693758498407545, + "learning_rate": 9.500268396112224e-06, + "loss": 0.056, + "step": 2064 + }, + { + "epoch": 0.9171663335554076, + "grad_norm": 0.7149305856179645, + "learning_rate": 9.499423284210745e-06, + "loss": 0.0627, + "step": 2065 + }, + { + "epoch": 0.917610481900955, + "grad_norm": 0.522841296737736, + "learning_rate": 9.498577495969515e-06, + "loss": 0.0722, + "step": 2066 + }, + { + "epoch": 0.9180546302465024, + "grad_norm": 0.568329647769687, + "learning_rate": 9.497731031515669e-06, + "loss": 0.0546, + "step": 2067 + }, + { + "epoch": 0.9184987785920498, + "grad_norm": 0.7376883712368818, + "learning_rate": 9.496883890976445e-06, + "loss": 0.0743, + "step": 2068 + }, + { + "epoch": 0.9189429269375972, + "grad_norm": 0.8322448418666403, + "learning_rate": 9.496036074479184e-06, + "loss": 0.0505, + "step": 2069 + }, + { + "epoch": 0.9193870752831446, + "grad_norm": 0.47540090764910864, + "learning_rate": 9.495187582151328e-06, + "loss": 0.0503, + "step": 2070 + }, + { + "epoch": 0.919831223628692, + "grad_norm": 0.5694137714424392, + "learning_rate": 9.494338414120419e-06, + "loss": 0.0552, + "step": 2071 + }, + { + "epoch": 0.9202753719742394, + "grad_norm": 0.4737570623495173, + "learning_rate": 9.493488570514099e-06, + "loss": 0.0546, + "step": 2072 + }, + { + "epoch": 0.9207195203197868, + "grad_norm": 0.5080926402077233, + "learning_rate": 9.492638051460116e-06, + "loss": 0.0634, + "step": 2073 + }, + { + "epoch": 0.9211636686653342, + "grad_norm": 0.5930821340624887, + "learning_rate": 9.491786857086318e-06, + "loss": 0.0558, + "step": 2074 + }, + { + "epoch": 0.9216078170108817, + "grad_norm": 0.6489494784499551, + "learning_rate": 9.490934987520653e-06, + "loss": 0.0574, + "step": 2075 + }, + { + "epoch": 0.9220519653564291, + "grad_norm": 0.4274889745604907, + "learning_rate": 9.490082442891171e-06, + "loss": 0.0484, + "step": 2076 + }, + { + "epoch": 0.9224961137019765, + "grad_norm": 0.7132092664683398, + "learning_rate": 9.489229223326027e-06, + "loss": 0.0506, + "step": 2077 + }, + { + "epoch": 0.9229402620475239, + "grad_norm": 0.7098655395611968, + "learning_rate": 9.48837532895347e-06, + "loss": 0.0693, + "step": 2078 + }, + { + "epoch": 0.9233844103930713, + "grad_norm": 0.6125346051926611, + "learning_rate": 9.487520759901858e-06, + "loss": 0.0665, + "step": 2079 + }, + { + "epoch": 0.9238285587386187, + "grad_norm": 0.5007229228967727, + "learning_rate": 9.486665516299646e-06, + "loss": 0.0763, + "step": 2080 + }, + { + "epoch": 0.9242727070841661, + "grad_norm": 0.6228392832149312, + "learning_rate": 9.485809598275391e-06, + "loss": 0.0602, + "step": 2081 + }, + { + "epoch": 0.9247168554297135, + "grad_norm": 0.5392941744359442, + "learning_rate": 9.484953005957753e-06, + "loss": 0.0621, + "step": 2082 + }, + { + "epoch": 0.9251610037752609, + "grad_norm": 0.6769648255970269, + "learning_rate": 9.484095739475492e-06, + "loss": 0.0829, + "step": 2083 + }, + { + "epoch": 0.9256051521208084, + "grad_norm": 1.0001370312264637, + "learning_rate": 9.48323779895747e-06, + "loss": 0.0616, + "step": 2084 + }, + { + "epoch": 0.9260493004663558, + "grad_norm": 0.6960955983895617, + "learning_rate": 9.482379184532652e-06, + "loss": 0.0701, + "step": 2085 + }, + { + "epoch": 0.9264934488119032, + "grad_norm": 0.5402499843217174, + "learning_rate": 9.481519896330098e-06, + "loss": 0.0557, + "step": 2086 + }, + { + "epoch": 0.9269375971574506, + "grad_norm": 0.5511208018996919, + "learning_rate": 9.480659934478975e-06, + "loss": 0.0613, + "step": 2087 + }, + { + "epoch": 0.927381745502998, + "grad_norm": 0.5938957138905275, + "learning_rate": 9.479799299108553e-06, + "loss": 0.0808, + "step": 2088 + }, + { + "epoch": 0.9278258938485454, + "grad_norm": 0.7161437453790136, + "learning_rate": 9.478937990348196e-06, + "loss": 0.0606, + "step": 2089 + }, + { + "epoch": 0.9282700421940928, + "grad_norm": 0.5468536840129006, + "learning_rate": 9.478076008327377e-06, + "loss": 0.0513, + "step": 2090 + }, + { + "epoch": 0.9287141905396402, + "grad_norm": 0.9707762113595154, + "learning_rate": 9.477213353175663e-06, + "loss": 0.0869, + "step": 2091 + }, + { + "epoch": 0.9291583388851876, + "grad_norm": 0.7985679050710678, + "learning_rate": 9.476350025022728e-06, + "loss": 0.0794, + "step": 2092 + }, + { + "epoch": 0.929602487230735, + "grad_norm": 0.6127936826464601, + "learning_rate": 9.475486023998345e-06, + "loss": 0.055, + "step": 2093 + }, + { + "epoch": 0.9300466355762825, + "grad_norm": 0.6334292089301367, + "learning_rate": 9.474621350232387e-06, + "loss": 0.0801, + "step": 2094 + }, + { + "epoch": 0.9304907839218299, + "grad_norm": 0.5293672478661852, + "learning_rate": 9.47375600385483e-06, + "loss": 0.0463, + "step": 2095 + }, + { + "epoch": 0.9309349322673773, + "grad_norm": 0.6942808305366424, + "learning_rate": 9.47288998499575e-06, + "loss": 0.0772, + "step": 2096 + }, + { + "epoch": 0.9313790806129247, + "grad_norm": 0.801229643208364, + "learning_rate": 9.472023293785322e-06, + "loss": 0.0838, + "step": 2097 + }, + { + "epoch": 0.9318232289584721, + "grad_norm": 0.5539233738084255, + "learning_rate": 9.471155930353829e-06, + "loss": 0.0477, + "step": 2098 + }, + { + "epoch": 0.9322673773040195, + "grad_norm": 0.5083333963598577, + "learning_rate": 9.470287894831648e-06, + "loss": 0.0493, + "step": 2099 + }, + { + "epoch": 0.9327115256495669, + "grad_norm": 0.5714592576450407, + "learning_rate": 9.469419187349258e-06, + "loss": 0.0518, + "step": 2100 + }, + { + "epoch": 0.9331556739951143, + "grad_norm": 0.5821855293618007, + "learning_rate": 9.468549808037241e-06, + "loss": 0.0584, + "step": 2101 + }, + { + "epoch": 0.9335998223406617, + "grad_norm": 0.5808549016034106, + "learning_rate": 9.467679757026283e-06, + "loss": 0.0641, + "step": 2102 + }, + { + "epoch": 0.9340439706862091, + "grad_norm": 0.5185841811549224, + "learning_rate": 9.466809034447165e-06, + "loss": 0.0484, + "step": 2103 + }, + { + "epoch": 0.9344881190317567, + "grad_norm": 0.46447107395363035, + "learning_rate": 9.46593764043077e-06, + "loss": 0.0538, + "step": 2104 + }, + { + "epoch": 0.9349322673773041, + "grad_norm": 0.5957468835643678, + "learning_rate": 9.465065575108084e-06, + "loss": 0.0687, + "step": 2105 + }, + { + "epoch": 0.9353764157228515, + "grad_norm": 0.6722403187627196, + "learning_rate": 9.464192838610195e-06, + "loss": 0.0633, + "step": 2106 + }, + { + "epoch": 0.9358205640683989, + "grad_norm": 0.7367314225600079, + "learning_rate": 9.463319431068289e-06, + "loss": 0.0688, + "step": 2107 + }, + { + "epoch": 0.9362647124139463, + "grad_norm": 0.5272343499456363, + "learning_rate": 9.462445352613654e-06, + "loss": 0.0598, + "step": 2108 + }, + { + "epoch": 0.9367088607594937, + "grad_norm": 0.5251390084394981, + "learning_rate": 9.461570603377678e-06, + "loss": 0.0461, + "step": 2109 + }, + { + "epoch": 0.9371530091050411, + "grad_norm": 0.45228924863036196, + "learning_rate": 9.460695183491852e-06, + "loss": 0.0551, + "step": 2110 + }, + { + "epoch": 0.9375971574505885, + "grad_norm": 0.5609307631212084, + "learning_rate": 9.459819093087765e-06, + "loss": 0.0582, + "step": 2111 + }, + { + "epoch": 0.9380413057961359, + "grad_norm": 0.6083990162855326, + "learning_rate": 9.45894233229711e-06, + "loss": 0.0727, + "step": 2112 + }, + { + "epoch": 0.9384854541416833, + "grad_norm": 0.6031459810485845, + "learning_rate": 9.458064901251679e-06, + "loss": 0.055, + "step": 2113 + }, + { + "epoch": 0.9389296024872308, + "grad_norm": 0.5698111663947624, + "learning_rate": 9.457186800083363e-06, + "loss": 0.0542, + "step": 2114 + }, + { + "epoch": 0.9393737508327782, + "grad_norm": 0.5620094721834581, + "learning_rate": 9.456308028924157e-06, + "loss": 0.0647, + "step": 2115 + }, + { + "epoch": 0.9398178991783256, + "grad_norm": 0.47201186226610237, + "learning_rate": 9.455428587906154e-06, + "loss": 0.0587, + "step": 2116 + }, + { + "epoch": 0.940262047523873, + "grad_norm": 0.5090071386100868, + "learning_rate": 9.45454847716155e-06, + "loss": 0.0485, + "step": 2117 + }, + { + "epoch": 0.9407061958694204, + "grad_norm": 0.6100904698718481, + "learning_rate": 9.453667696822644e-06, + "loss": 0.055, + "step": 2118 + }, + { + "epoch": 0.9411503442149678, + "grad_norm": 0.5793242183604661, + "learning_rate": 9.452786247021825e-06, + "loss": 0.0633, + "step": 2119 + }, + { + "epoch": 0.9415944925605152, + "grad_norm": 0.6603367019430398, + "learning_rate": 9.451904127891593e-06, + "loss": 0.062, + "step": 2120 + }, + { + "epoch": 0.9420386409060626, + "grad_norm": 0.48144950864097125, + "learning_rate": 9.451021339564549e-06, + "loss": 0.0541, + "step": 2121 + }, + { + "epoch": 0.94248278925161, + "grad_norm": 0.8619120091253565, + "learning_rate": 9.450137882173385e-06, + "loss": 0.0622, + "step": 2122 + }, + { + "epoch": 0.9429269375971574, + "grad_norm": 0.574402948879313, + "learning_rate": 9.449253755850902e-06, + "loss": 0.0579, + "step": 2123 + }, + { + "epoch": 0.9433710859427049, + "grad_norm": 0.5040572133776431, + "learning_rate": 9.448368960730002e-06, + "loss": 0.0471, + "step": 2124 + }, + { + "epoch": 0.9438152342882523, + "grad_norm": 0.8130523767663201, + "learning_rate": 9.447483496943682e-06, + "loss": 0.0607, + "step": 2125 + }, + { + "epoch": 0.9442593826337997, + "grad_norm": 0.6488075902611455, + "learning_rate": 9.446597364625043e-06, + "loss": 0.0495, + "step": 2126 + }, + { + "epoch": 0.9447035309793471, + "grad_norm": 0.516081567488049, + "learning_rate": 9.445710563907286e-06, + "loss": 0.0597, + "step": 2127 + }, + { + "epoch": 0.9451476793248945, + "grad_norm": 0.6477844164341606, + "learning_rate": 9.444823094923712e-06, + "loss": 0.0581, + "step": 2128 + }, + { + "epoch": 0.9455918276704419, + "grad_norm": 0.5960225920848716, + "learning_rate": 9.44393495780772e-06, + "loss": 0.0693, + "step": 2129 + }, + { + "epoch": 0.9460359760159893, + "grad_norm": 0.6610698243004944, + "learning_rate": 9.443046152692818e-06, + "loss": 0.0602, + "step": 2130 + }, + { + "epoch": 0.9464801243615367, + "grad_norm": 0.501365231071439, + "learning_rate": 9.442156679712604e-06, + "loss": 0.0507, + "step": 2131 + }, + { + "epoch": 0.9469242727070841, + "grad_norm": 0.5333891976903705, + "learning_rate": 9.441266539000782e-06, + "loss": 0.0551, + "step": 2132 + }, + { + "epoch": 0.9473684210526315, + "grad_norm": 0.7140797441741807, + "learning_rate": 9.440375730691154e-06, + "loss": 0.0559, + "step": 2133 + }, + { + "epoch": 0.947812569398179, + "grad_norm": 1.0212073230522318, + "learning_rate": 9.439484254917626e-06, + "loss": 0.0601, + "step": 2134 + }, + { + "epoch": 0.9482567177437264, + "grad_norm": 0.6547086006006103, + "learning_rate": 9.4385921118142e-06, + "loss": 0.0678, + "step": 2135 + }, + { + "epoch": 0.9487008660892738, + "grad_norm": 0.5668847306270909, + "learning_rate": 9.437699301514983e-06, + "loss": 0.0562, + "step": 2136 + }, + { + "epoch": 0.9491450144348212, + "grad_norm": 0.535851688994489, + "learning_rate": 9.436805824154175e-06, + "loss": 0.0536, + "step": 2137 + }, + { + "epoch": 0.9495891627803686, + "grad_norm": 0.6725155861850945, + "learning_rate": 9.435911679866085e-06, + "loss": 0.0636, + "step": 2138 + }, + { + "epoch": 0.950033311125916, + "grad_norm": 0.472128745022765, + "learning_rate": 9.435016868785117e-06, + "loss": 0.0514, + "step": 2139 + }, + { + "epoch": 0.9504774594714634, + "grad_norm": 0.5611219746408826, + "learning_rate": 9.434121391045775e-06, + "loss": 0.057, + "step": 2140 + }, + { + "epoch": 0.9509216078170108, + "grad_norm": 0.5197846459934121, + "learning_rate": 9.433225246782664e-06, + "loss": 0.0497, + "step": 2141 + }, + { + "epoch": 0.9513657561625583, + "grad_norm": 0.6901510307382841, + "learning_rate": 9.432328436130493e-06, + "loss": 0.0588, + "step": 2142 + }, + { + "epoch": 0.9518099045081057, + "grad_norm": 0.6595671077813987, + "learning_rate": 9.431430959224067e-06, + "loss": 0.0852, + "step": 2143 + }, + { + "epoch": 0.9522540528536532, + "grad_norm": 0.5676059741434758, + "learning_rate": 9.43053281619829e-06, + "loss": 0.0507, + "step": 2144 + }, + { + "epoch": 0.9526982011992006, + "grad_norm": 0.4855036199663289, + "learning_rate": 9.429634007188169e-06, + "loss": 0.0474, + "step": 2145 + }, + { + "epoch": 0.953142349544748, + "grad_norm": 0.732709155194895, + "learning_rate": 9.42873453232881e-06, + "loss": 0.0619, + "step": 2146 + }, + { + "epoch": 0.9535864978902954, + "grad_norm": 0.6306866348324036, + "learning_rate": 9.42783439175542e-06, + "loss": 0.0772, + "step": 2147 + }, + { + "epoch": 0.9540306462358428, + "grad_norm": 0.6444304070837995, + "learning_rate": 9.426933585603304e-06, + "loss": 0.0565, + "step": 2148 + }, + { + "epoch": 0.9544747945813902, + "grad_norm": 0.5421189880885259, + "learning_rate": 9.42603211400787e-06, + "loss": 0.0655, + "step": 2149 + }, + { + "epoch": 0.9549189429269376, + "grad_norm": 0.5350689023537523, + "learning_rate": 9.425129977104626e-06, + "loss": 0.0598, + "step": 2150 + }, + { + "epoch": 0.955363091272485, + "grad_norm": 0.3832349941062708, + "learning_rate": 9.424227175029175e-06, + "loss": 0.0474, + "step": 2151 + }, + { + "epoch": 0.9558072396180324, + "grad_norm": 0.5764500588860708, + "learning_rate": 9.423323707917226e-06, + "loss": 0.0673, + "step": 2152 + }, + { + "epoch": 0.9562513879635799, + "grad_norm": 0.41134986778955357, + "learning_rate": 9.422419575904584e-06, + "loss": 0.0449, + "step": 2153 + }, + { + "epoch": 0.9566955363091273, + "grad_norm": 0.40411844058020524, + "learning_rate": 9.421514779127156e-06, + "loss": 0.0457, + "step": 2154 + }, + { + "epoch": 0.9571396846546747, + "grad_norm": 0.5724570461924485, + "learning_rate": 9.420609317720948e-06, + "loss": 0.064, + "step": 2155 + }, + { + "epoch": 0.9575838330002221, + "grad_norm": 0.5481138081401853, + "learning_rate": 9.419703191822067e-06, + "loss": 0.0512, + "step": 2156 + }, + { + "epoch": 0.9580279813457695, + "grad_norm": 0.5977006719684248, + "learning_rate": 9.418796401566719e-06, + "loss": 0.0452, + "step": 2157 + }, + { + "epoch": 0.9584721296913169, + "grad_norm": 0.5349085172455206, + "learning_rate": 9.417888947091208e-06, + "loss": 0.065, + "step": 2158 + }, + { + "epoch": 0.9589162780368643, + "grad_norm": 0.6787928669434861, + "learning_rate": 9.416980828531944e-06, + "loss": 0.0677, + "step": 2159 + }, + { + "epoch": 0.9593604263824117, + "grad_norm": 0.709282953075048, + "learning_rate": 9.416072046025429e-06, + "loss": 0.0729, + "step": 2160 + }, + { + "epoch": 0.9598045747279591, + "grad_norm": 0.6790718166354139, + "learning_rate": 9.415162599708268e-06, + "loss": 0.0699, + "step": 2161 + }, + { + "epoch": 0.9602487230735065, + "grad_norm": 0.545039186094067, + "learning_rate": 9.414252489717168e-06, + "loss": 0.0594, + "step": 2162 + }, + { + "epoch": 0.960692871419054, + "grad_norm": 0.7438799552703795, + "learning_rate": 9.413341716188934e-06, + "loss": 0.0653, + "step": 2163 + }, + { + "epoch": 0.9611370197646014, + "grad_norm": 0.5419774566213018, + "learning_rate": 9.412430279260473e-06, + "loss": 0.0452, + "step": 2164 + }, + { + "epoch": 0.9615811681101488, + "grad_norm": 0.5417611334038369, + "learning_rate": 9.411518179068785e-06, + "loss": 0.0695, + "step": 2165 + }, + { + "epoch": 0.9620253164556962, + "grad_norm": 0.556851360606536, + "learning_rate": 9.410605415750977e-06, + "loss": 0.0612, + "step": 2166 + }, + { + "epoch": 0.9624694648012436, + "grad_norm": 0.8789606927293152, + "learning_rate": 9.40969198944425e-06, + "loss": 0.0604, + "step": 2167 + }, + { + "epoch": 0.962913613146791, + "grad_norm": 0.5933413334023052, + "learning_rate": 9.40877790028591e-06, + "loss": 0.0623, + "step": 2168 + }, + { + "epoch": 0.9633577614923384, + "grad_norm": 0.5946398594923591, + "learning_rate": 9.407863148413361e-06, + "loss": 0.0419, + "step": 2169 + }, + { + "epoch": 0.9638019098378858, + "grad_norm": 0.48221233179240436, + "learning_rate": 9.406947733964103e-06, + "loss": 0.0546, + "step": 2170 + }, + { + "epoch": 0.9642460581834332, + "grad_norm": 0.6525276948819981, + "learning_rate": 9.40603165707574e-06, + "loss": 0.0526, + "step": 2171 + }, + { + "epoch": 0.9646902065289806, + "grad_norm": 0.7205786693472772, + "learning_rate": 9.405114917885973e-06, + "loss": 0.0684, + "step": 2172 + }, + { + "epoch": 0.9651343548745281, + "grad_norm": 0.5044988796095775, + "learning_rate": 9.404197516532605e-06, + "loss": 0.0478, + "step": 2173 + }, + { + "epoch": 0.9655785032200755, + "grad_norm": 0.44346462918671525, + "learning_rate": 9.403279453153536e-06, + "loss": 0.0446, + "step": 2174 + }, + { + "epoch": 0.966022651565623, + "grad_norm": 0.5173823021640699, + "learning_rate": 9.402360727886766e-06, + "loss": 0.0623, + "step": 2175 + }, + { + "epoch": 0.9664667999111703, + "grad_norm": 0.5687155650294274, + "learning_rate": 9.401441340870397e-06, + "loss": 0.0611, + "step": 2176 + }, + { + "epoch": 0.9669109482567178, + "grad_norm": 0.6467987726672418, + "learning_rate": 9.400521292242626e-06, + "loss": 0.0529, + "step": 2177 + }, + { + "epoch": 0.9673550966022652, + "grad_norm": 0.5130913991837206, + "learning_rate": 9.399600582141752e-06, + "loss": 0.0599, + "step": 2178 + }, + { + "epoch": 0.9677992449478126, + "grad_norm": 0.6670998717556929, + "learning_rate": 9.398679210706176e-06, + "loss": 0.0682, + "step": 2179 + }, + { + "epoch": 0.96824339329336, + "grad_norm": 0.7694218770583813, + "learning_rate": 9.397757178074392e-06, + "loss": 0.0686, + "step": 2180 + }, + { + "epoch": 0.9686875416389074, + "grad_norm": 0.7755428807155127, + "learning_rate": 9.396834484385e-06, + "loss": 0.0574, + "step": 2181 + }, + { + "epoch": 0.9691316899844548, + "grad_norm": 0.44478529742795087, + "learning_rate": 9.395911129776699e-06, + "loss": 0.0429, + "step": 2182 + }, + { + "epoch": 0.9695758383300023, + "grad_norm": 0.7140201406983687, + "learning_rate": 9.394987114388278e-06, + "loss": 0.0711, + "step": 2183 + }, + { + "epoch": 0.9700199866755497, + "grad_norm": 0.8530149060800487, + "learning_rate": 9.394062438358637e-06, + "loss": 0.0529, + "step": 2184 + }, + { + "epoch": 0.9704641350210971, + "grad_norm": 0.6524072835443322, + "learning_rate": 9.39313710182677e-06, + "loss": 0.0624, + "step": 2185 + }, + { + "epoch": 0.9709082833666445, + "grad_norm": 0.4290712533966452, + "learning_rate": 9.39221110493177e-06, + "loss": 0.041, + "step": 2186 + }, + { + "epoch": 0.9713524317121919, + "grad_norm": 0.5324670553025179, + "learning_rate": 9.39128444781283e-06, + "loss": 0.0477, + "step": 2187 + }, + { + "epoch": 0.9717965800577393, + "grad_norm": 0.5901470773440277, + "learning_rate": 9.390357130609243e-06, + "loss": 0.0568, + "step": 2188 + }, + { + "epoch": 0.9722407284032867, + "grad_norm": 0.7720115526671916, + "learning_rate": 9.3894291534604e-06, + "loss": 0.0559, + "step": 2189 + }, + { + "epoch": 0.9726848767488341, + "grad_norm": 0.6618879002055548, + "learning_rate": 9.38850051650579e-06, + "loss": 0.0789, + "step": 2190 + }, + { + "epoch": 0.9731290250943815, + "grad_norm": 0.6242983609779144, + "learning_rate": 9.387571219885008e-06, + "loss": 0.0709, + "step": 2191 + }, + { + "epoch": 0.9735731734399289, + "grad_norm": 0.5488182994392276, + "learning_rate": 9.386641263737736e-06, + "loss": 0.0483, + "step": 2192 + }, + { + "epoch": 0.9740173217854764, + "grad_norm": 0.5678755050400883, + "learning_rate": 9.38571064820377e-06, + "loss": 0.0639, + "step": 2193 + }, + { + "epoch": 0.9744614701310238, + "grad_norm": 0.7972177120971958, + "learning_rate": 9.384779373422992e-06, + "loss": 0.0688, + "step": 2194 + }, + { + "epoch": 0.9749056184765712, + "grad_norm": 0.4181144019615441, + "learning_rate": 9.38384743953539e-06, + "loss": 0.0434, + "step": 2195 + }, + { + "epoch": 0.9753497668221186, + "grad_norm": 1.0045462377103442, + "learning_rate": 9.382914846681049e-06, + "loss": 0.0626, + "step": 2196 + }, + { + "epoch": 0.975793915167666, + "grad_norm": 0.5663797823359984, + "learning_rate": 9.381981595000153e-06, + "loss": 0.0536, + "step": 2197 + }, + { + "epoch": 0.9762380635132134, + "grad_norm": 0.5230033029520286, + "learning_rate": 9.381047684632986e-06, + "loss": 0.0431, + "step": 2198 + }, + { + "epoch": 0.9766822118587608, + "grad_norm": 0.5941306490458375, + "learning_rate": 9.380113115719933e-06, + "loss": 0.0514, + "step": 2199 + }, + { + "epoch": 0.9771263602043082, + "grad_norm": 0.7074167144721983, + "learning_rate": 9.379177888401473e-06, + "loss": 0.0516, + "step": 2200 + }, + { + "epoch": 0.9775705085498556, + "grad_norm": 0.8936942623677783, + "learning_rate": 9.378242002818186e-06, + "loss": 0.0634, + "step": 2201 + }, + { + "epoch": 0.978014656895403, + "grad_norm": 0.8015956656924698, + "learning_rate": 9.377305459110754e-06, + "loss": 0.0701, + "step": 2202 + }, + { + "epoch": 0.9784588052409505, + "grad_norm": 0.5888281751564788, + "learning_rate": 9.376368257419955e-06, + "loss": 0.0735, + "step": 2203 + }, + { + "epoch": 0.9789029535864979, + "grad_norm": 0.6871960760484549, + "learning_rate": 9.375430397886661e-06, + "loss": 0.0528, + "step": 2204 + }, + { + "epoch": 0.9793471019320453, + "grad_norm": 0.5915896411171675, + "learning_rate": 9.374491880651856e-06, + "loss": 0.0577, + "step": 2205 + }, + { + "epoch": 0.9797912502775927, + "grad_norm": 0.6268683607318559, + "learning_rate": 9.373552705856612e-06, + "loss": 0.0511, + "step": 2206 + }, + { + "epoch": 0.9802353986231401, + "grad_norm": 0.5959527967947893, + "learning_rate": 9.372612873642101e-06, + "loss": 0.0577, + "step": 2207 + }, + { + "epoch": 0.9806795469686875, + "grad_norm": 0.6376605781281672, + "learning_rate": 9.3716723841496e-06, + "loss": 0.056, + "step": 2208 + }, + { + "epoch": 0.9811236953142349, + "grad_norm": 0.6655725615122635, + "learning_rate": 9.370731237520476e-06, + "loss": 0.0495, + "step": 2209 + }, + { + "epoch": 0.9815678436597823, + "grad_norm": 0.6178758449973094, + "learning_rate": 9.369789433896201e-06, + "loss": 0.0785, + "step": 2210 + }, + { + "epoch": 0.9820119920053297, + "grad_norm": 0.6203954478751535, + "learning_rate": 9.368846973418347e-06, + "loss": 0.0541, + "step": 2211 + }, + { + "epoch": 0.9824561403508771, + "grad_norm": 0.5844524216400101, + "learning_rate": 9.367903856228575e-06, + "loss": 0.0484, + "step": 2212 + }, + { + "epoch": 0.9829002886964247, + "grad_norm": 1.3376432627550732, + "learning_rate": 9.366960082468658e-06, + "loss": 0.0523, + "step": 2213 + }, + { + "epoch": 0.983344437041972, + "grad_norm": 0.7650415999284177, + "learning_rate": 9.36601565228046e-06, + "loss": 0.0758, + "step": 2214 + }, + { + "epoch": 0.9837885853875195, + "grad_norm": 0.6031770466123566, + "learning_rate": 9.365070565805941e-06, + "loss": 0.0552, + "step": 2215 + }, + { + "epoch": 0.9842327337330669, + "grad_norm": 0.5317856643301976, + "learning_rate": 9.364124823187169e-06, + "loss": 0.0495, + "step": 2216 + }, + { + "epoch": 0.9846768820786143, + "grad_norm": 0.697207175537678, + "learning_rate": 9.363178424566302e-06, + "loss": 0.0575, + "step": 2217 + }, + { + "epoch": 0.9851210304241617, + "grad_norm": 0.6901112062939687, + "learning_rate": 9.3622313700856e-06, + "loss": 0.0558, + "step": 2218 + }, + { + "epoch": 0.9855651787697091, + "grad_norm": 0.7725648574354754, + "learning_rate": 9.361283659887421e-06, + "loss": 0.0526, + "step": 2219 + }, + { + "epoch": 0.9860093271152565, + "grad_norm": 0.4919891771117441, + "learning_rate": 9.360335294114222e-06, + "loss": 0.0517, + "step": 2220 + }, + { + "epoch": 0.9864534754608039, + "grad_norm": 0.604042614778997, + "learning_rate": 9.359386272908561e-06, + "loss": 0.0654, + "step": 2221 + }, + { + "epoch": 0.9868976238063514, + "grad_norm": 0.533434108631547, + "learning_rate": 9.35843659641309e-06, + "loss": 0.0587, + "step": 2222 + }, + { + "epoch": 0.9873417721518988, + "grad_norm": 0.627831211955756, + "learning_rate": 9.35748626477056e-06, + "loss": 0.0577, + "step": 2223 + }, + { + "epoch": 0.9877859204974462, + "grad_norm": 0.48609069405336236, + "learning_rate": 9.356535278123826e-06, + "loss": 0.0625, + "step": 2224 + }, + { + "epoch": 0.9882300688429936, + "grad_norm": 0.5441221898111536, + "learning_rate": 9.355583636615832e-06, + "loss": 0.0671, + "step": 2225 + }, + { + "epoch": 0.988674217188541, + "grad_norm": 0.6653694627323021, + "learning_rate": 9.354631340389633e-06, + "loss": 0.0678, + "step": 2226 + }, + { + "epoch": 0.9891183655340884, + "grad_norm": 0.49687550533635927, + "learning_rate": 9.353678389588367e-06, + "loss": 0.048, + "step": 2227 + }, + { + "epoch": 0.9895625138796358, + "grad_norm": 0.5456376905073873, + "learning_rate": 9.352724784355286e-06, + "loss": 0.0763, + "step": 2228 + }, + { + "epoch": 0.9900066622251832, + "grad_norm": 0.5234550844694897, + "learning_rate": 9.35177052483373e-06, + "loss": 0.0573, + "step": 2229 + }, + { + "epoch": 0.9904508105707306, + "grad_norm": 0.3992910106511535, + "learning_rate": 9.35081561116714e-06, + "loss": 0.0422, + "step": 2230 + }, + { + "epoch": 0.990894958916278, + "grad_norm": 0.4091987827719479, + "learning_rate": 9.349860043499056e-06, + "loss": 0.0474, + "step": 2231 + }, + { + "epoch": 0.9913391072618255, + "grad_norm": 0.5342639072522524, + "learning_rate": 9.348903821973114e-06, + "loss": 0.0663, + "step": 2232 + }, + { + "epoch": 0.9917832556073729, + "grad_norm": 0.4467060129332395, + "learning_rate": 9.347946946733055e-06, + "loss": 0.0443, + "step": 2233 + }, + { + "epoch": 0.9922274039529203, + "grad_norm": 0.4523315857358909, + "learning_rate": 9.346989417922712e-06, + "loss": 0.0415, + "step": 2234 + }, + { + "epoch": 0.9926715522984677, + "grad_norm": 0.525064951465678, + "learning_rate": 9.346031235686014e-06, + "loss": 0.0663, + "step": 2235 + }, + { + "epoch": 0.9931157006440151, + "grad_norm": 0.5615837318533068, + "learning_rate": 9.345072400166999e-06, + "loss": 0.0558, + "step": 2236 + }, + { + "epoch": 0.9935598489895625, + "grad_norm": 0.5550096636418779, + "learning_rate": 9.34411291150979e-06, + "loss": 0.0699, + "step": 2237 + }, + { + "epoch": 0.9940039973351099, + "grad_norm": 0.9242044346261947, + "learning_rate": 9.343152769858616e-06, + "loss": 0.0623, + "step": 2238 + }, + { + "epoch": 0.9944481456806573, + "grad_norm": 0.6358712068894604, + "learning_rate": 9.342191975357806e-06, + "loss": 0.0675, + "step": 2239 + }, + { + "epoch": 0.9948922940262047, + "grad_norm": 0.5189758212655892, + "learning_rate": 9.34123052815178e-06, + "loss": 0.0446, + "step": 2240 + }, + { + "epoch": 0.9953364423717521, + "grad_norm": 0.4493321450297416, + "learning_rate": 9.340268428385062e-06, + "loss": 0.0416, + "step": 2241 + }, + { + "epoch": 0.9957805907172996, + "grad_norm": 0.4052216095743849, + "learning_rate": 9.339305676202268e-06, + "loss": 0.0524, + "step": 2242 + }, + { + "epoch": 0.996224739062847, + "grad_norm": 0.4257914645053657, + "learning_rate": 9.338342271748122e-06, + "loss": 0.0439, + "step": 2243 + }, + { + "epoch": 0.9966688874083944, + "grad_norm": 0.5443368549896095, + "learning_rate": 9.337378215167436e-06, + "loss": 0.0494, + "step": 2244 + }, + { + "epoch": 0.9971130357539418, + "grad_norm": 0.48744281644400594, + "learning_rate": 9.336413506605123e-06, + "loss": 0.0692, + "step": 2245 + }, + { + "epoch": 0.9975571840994892, + "grad_norm": 0.730043582195284, + "learning_rate": 9.335448146206201e-06, + "loss": 0.0613, + "step": 2246 + }, + { + "epoch": 0.9980013324450366, + "grad_norm": 0.6666928317513323, + "learning_rate": 9.334482134115774e-06, + "loss": 0.0626, + "step": 2247 + }, + { + "epoch": 0.998445480790584, + "grad_norm": 1.1582138795192063, + "learning_rate": 9.333515470479052e-06, + "loss": 0.0706, + "step": 2248 + }, + { + "epoch": 0.9988896291361314, + "grad_norm": 0.41845565011499725, + "learning_rate": 9.332548155441341e-06, + "loss": 0.0428, + "step": 2249 + }, + { + "epoch": 0.9993337774816788, + "grad_norm": 0.49853102857677845, + "learning_rate": 9.331580189148047e-06, + "loss": 0.0498, + "step": 2250 + }, + { + "epoch": 0.9997779258272262, + "grad_norm": 0.469460122533607, + "learning_rate": 9.330611571744668e-06, + "loss": 0.0584, + "step": 2251 + }, + { + "epoch": 0.9997779258272262, + "eval_loss": 0.06305240094661713, + "eval_runtime": 420.7122, + "eval_samples_per_second": 36.048, + "eval_steps_per_second": 1.127, + "step": 2251 + }, + { + "epoch": 1.0002220741727736, + "grad_norm": 0.8383200378207877, + "learning_rate": 9.329642303376806e-06, + "loss": 0.0866, + "step": 2252 + }, + { + "epoch": 1.0006662225183212, + "grad_norm": 0.4162547630354714, + "learning_rate": 9.328672384190158e-06, + "loss": 0.044, + "step": 2253 + }, + { + "epoch": 1.0011103708638684, + "grad_norm": 0.5811219466400265, + "learning_rate": 9.327701814330521e-06, + "loss": 0.0489, + "step": 2254 + }, + { + "epoch": 1.001554519209416, + "grad_norm": 0.4943074740960733, + "learning_rate": 9.326730593943784e-06, + "loss": 0.0532, + "step": 2255 + }, + { + "epoch": 1.0019986675549633, + "grad_norm": 0.5854940362247197, + "learning_rate": 9.325758723175942e-06, + "loss": 0.0447, + "step": 2256 + }, + { + "epoch": 1.0024428159005108, + "grad_norm": 0.6734583854523404, + "learning_rate": 9.324786202173082e-06, + "loss": 0.0565, + "step": 2257 + }, + { + "epoch": 1.0028869642460583, + "grad_norm": 0.626997176192022, + "learning_rate": 9.32381303108139e-06, + "loss": 0.0554, + "step": 2258 + }, + { + "epoch": 1.0033311125916056, + "grad_norm": 0.7899011567544635, + "learning_rate": 9.322839210047152e-06, + "loss": 0.0541, + "step": 2259 + }, + { + "epoch": 1.003775260937153, + "grad_norm": 0.44777931003199795, + "learning_rate": 9.321864739216747e-06, + "loss": 0.0475, + "step": 2260 + }, + { + "epoch": 1.0042194092827004, + "grad_norm": 0.7176524264984108, + "learning_rate": 9.320889618736657e-06, + "loss": 0.0657, + "step": 2261 + }, + { + "epoch": 1.0046635576282479, + "grad_norm": 0.6297899790013282, + "learning_rate": 9.319913848753457e-06, + "loss": 0.0548, + "step": 2262 + }, + { + "epoch": 1.0051077059737952, + "grad_norm": 0.6829050036731806, + "learning_rate": 9.318937429413823e-06, + "loss": 0.0677, + "step": 2263 + }, + { + "epoch": 1.0055518543193427, + "grad_norm": 0.6099066772979853, + "learning_rate": 9.31796036086453e-06, + "loss": 0.0674, + "step": 2264 + }, + { + "epoch": 1.00599600266489, + "grad_norm": 0.5220221509404842, + "learning_rate": 9.316982643252444e-06, + "loss": 0.0489, + "step": 2265 + }, + { + "epoch": 1.0064401510104375, + "grad_norm": 0.6319552346310404, + "learning_rate": 9.316004276724533e-06, + "loss": 0.0566, + "step": 2266 + }, + { + "epoch": 1.006884299355985, + "grad_norm": 0.75308545956684, + "learning_rate": 9.315025261427864e-06, + "loss": 0.0611, + "step": 2267 + }, + { + "epoch": 1.0073284477015323, + "grad_norm": 0.5854190329106237, + "learning_rate": 9.314045597509598e-06, + "loss": 0.0589, + "step": 2268 + }, + { + "epoch": 1.0077725960470798, + "grad_norm": 0.4592027406322817, + "learning_rate": 9.313065285116997e-06, + "loss": 0.0363, + "step": 2269 + }, + { + "epoch": 1.008216744392627, + "grad_norm": 0.7355227136248891, + "learning_rate": 9.312084324397416e-06, + "loss": 0.0666, + "step": 2270 + }, + { + "epoch": 1.0086608927381746, + "grad_norm": 0.6811364248406817, + "learning_rate": 9.311102715498312e-06, + "loss": 0.0664, + "step": 2271 + }, + { + "epoch": 1.009105041083722, + "grad_norm": 0.4859582181876178, + "learning_rate": 9.310120458567238e-06, + "loss": 0.0455, + "step": 2272 + }, + { + "epoch": 1.0095491894292694, + "grad_norm": 0.6126182761710498, + "learning_rate": 9.309137553751843e-06, + "loss": 0.0568, + "step": 2273 + }, + { + "epoch": 1.0099933377748167, + "grad_norm": 0.6903438432741064, + "learning_rate": 9.308154001199874e-06, + "loss": 0.0546, + "step": 2274 + }, + { + "epoch": 1.0104374861203642, + "grad_norm": 0.5254671203092518, + "learning_rate": 9.307169801059175e-06, + "loss": 0.0518, + "step": 2275 + }, + { + "epoch": 1.0108816344659115, + "grad_norm": 0.502322384644144, + "learning_rate": 9.30618495347769e-06, + "loss": 0.0528, + "step": 2276 + }, + { + "epoch": 1.011325782811459, + "grad_norm": 0.4570165148396037, + "learning_rate": 9.305199458603456e-06, + "loss": 0.0423, + "step": 2277 + }, + { + "epoch": 1.0117699311570065, + "grad_norm": 0.6475436488464652, + "learning_rate": 9.304213316584612e-06, + "loss": 0.0539, + "step": 2278 + }, + { + "epoch": 1.0122140795025538, + "grad_norm": 0.5830527566752789, + "learning_rate": 9.30322652756939e-06, + "loss": 0.0533, + "step": 2279 + }, + { + "epoch": 1.0126582278481013, + "grad_norm": 0.5218861392382439, + "learning_rate": 9.302239091706121e-06, + "loss": 0.0478, + "step": 2280 + }, + { + "epoch": 1.0131023761936486, + "grad_norm": 0.7938697599205536, + "learning_rate": 9.301251009143236e-06, + "loss": 0.0913, + "step": 2281 + }, + { + "epoch": 1.0135465245391961, + "grad_norm": 0.5550504645736817, + "learning_rate": 9.300262280029257e-06, + "loss": 0.0644, + "step": 2282 + }, + { + "epoch": 1.0139906728847434, + "grad_norm": 0.594954378218846, + "learning_rate": 9.29927290451281e-06, + "loss": 0.0541, + "step": 2283 + }, + { + "epoch": 1.014434821230291, + "grad_norm": 0.9298600097346652, + "learning_rate": 9.298282882742612e-06, + "loss": 0.0518, + "step": 2284 + }, + { + "epoch": 1.0148789695758382, + "grad_norm": 0.46811932472048295, + "learning_rate": 9.297292214867484e-06, + "loss": 0.0455, + "step": 2285 + }, + { + "epoch": 1.0153231179213857, + "grad_norm": 0.5871427629727887, + "learning_rate": 9.296300901036337e-06, + "loss": 0.0572, + "step": 2286 + }, + { + "epoch": 1.0157672662669333, + "grad_norm": 0.6945899846686857, + "learning_rate": 9.295308941398183e-06, + "loss": 0.061, + "step": 2287 + }, + { + "epoch": 1.0162114146124805, + "grad_norm": 0.640279460698154, + "learning_rate": 9.294316336102132e-06, + "loss": 0.0599, + "step": 2288 + }, + { + "epoch": 1.016655562958028, + "grad_norm": 0.31136909660088496, + "learning_rate": 9.293323085297386e-06, + "loss": 0.026, + "step": 2289 + }, + { + "epoch": 1.0170997113035753, + "grad_norm": 0.4114152449104369, + "learning_rate": 9.29232918913325e-06, + "loss": 0.0379, + "step": 2290 + }, + { + "epoch": 1.0175438596491229, + "grad_norm": 0.41732022602175256, + "learning_rate": 9.291334647759122e-06, + "loss": 0.0422, + "step": 2291 + }, + { + "epoch": 1.0179880079946702, + "grad_norm": 0.762539369842014, + "learning_rate": 9.2903394613245e-06, + "loss": 0.0597, + "step": 2292 + }, + { + "epoch": 1.0184321563402177, + "grad_norm": 0.9415453759336739, + "learning_rate": 9.289343629978978e-06, + "loss": 0.0739, + "step": 2293 + }, + { + "epoch": 1.018876304685765, + "grad_norm": 0.5123579743591409, + "learning_rate": 9.288347153872245e-06, + "loss": 0.0571, + "step": 2294 + }, + { + "epoch": 1.0193204530313125, + "grad_norm": 0.458862217726016, + "learning_rate": 9.287350033154088e-06, + "loss": 0.0468, + "step": 2295 + }, + { + "epoch": 1.01976460137686, + "grad_norm": 1.3873099747217557, + "learning_rate": 9.28635226797439e-06, + "loss": 0.0745, + "step": 2296 + }, + { + "epoch": 1.0202087497224073, + "grad_norm": 0.5997981500021724, + "learning_rate": 9.285353858483138e-06, + "loss": 0.0506, + "step": 2297 + }, + { + "epoch": 1.0206528980679548, + "grad_norm": 0.6396330710132598, + "learning_rate": 9.284354804830403e-06, + "loss": 0.0518, + "step": 2298 + }, + { + "epoch": 1.021097046413502, + "grad_norm": 0.8119329100233045, + "learning_rate": 9.283355107166361e-06, + "loss": 0.088, + "step": 2299 + }, + { + "epoch": 1.0215411947590496, + "grad_norm": 0.6001947150070646, + "learning_rate": 9.282354765641286e-06, + "loss": 0.0503, + "step": 2300 + }, + { + "epoch": 1.0219853431045969, + "grad_norm": 0.5257172181736586, + "learning_rate": 9.281353780405546e-06, + "loss": 0.0632, + "step": 2301 + }, + { + "epoch": 1.0224294914501444, + "grad_norm": 0.4904406999367498, + "learning_rate": 9.280352151609604e-06, + "loss": 0.0412, + "step": 2302 + }, + { + "epoch": 1.0228736397956917, + "grad_norm": 0.6023825108536248, + "learning_rate": 9.279349879404024e-06, + "loss": 0.0501, + "step": 2303 + }, + { + "epoch": 1.0233177881412392, + "grad_norm": 0.5713607373831051, + "learning_rate": 9.278346963939464e-06, + "loss": 0.0648, + "step": 2304 + }, + { + "epoch": 1.0237619364867865, + "grad_norm": 0.5117889653709818, + "learning_rate": 9.27734340536668e-06, + "loss": 0.0595, + "step": 2305 + }, + { + "epoch": 1.024206084832334, + "grad_norm": 0.5957895920503053, + "learning_rate": 9.27633920383652e-06, + "loss": 0.0588, + "step": 2306 + }, + { + "epoch": 1.0246502331778815, + "grad_norm": 0.6119278145978992, + "learning_rate": 9.275334359499936e-06, + "loss": 0.0615, + "step": 2307 + }, + { + "epoch": 1.0250943815234288, + "grad_norm": 0.5747304102651929, + "learning_rate": 9.274328872507973e-06, + "loss": 0.0609, + "step": 2308 + }, + { + "epoch": 1.0255385298689763, + "grad_norm": 0.5890004950126706, + "learning_rate": 9.273322743011775e-06, + "loss": 0.054, + "step": 2309 + }, + { + "epoch": 1.0259826782145236, + "grad_norm": 0.46856707487309196, + "learning_rate": 9.272315971162573e-06, + "loss": 0.0438, + "step": 2310 + }, + { + "epoch": 1.0264268265600711, + "grad_norm": 0.5705380710124494, + "learning_rate": 9.27130855711171e-06, + "loss": 0.0477, + "step": 2311 + }, + { + "epoch": 1.0268709749056184, + "grad_norm": 0.7064165326935903, + "learning_rate": 9.270300501010612e-06, + "loss": 0.0568, + "step": 2312 + }, + { + "epoch": 1.027315123251166, + "grad_norm": 0.5416989623481593, + "learning_rate": 9.26929180301081e-06, + "loss": 0.0682, + "step": 2313 + }, + { + "epoch": 1.0277592715967132, + "grad_norm": 0.49073246688028643, + "learning_rate": 9.268282463263928e-06, + "loss": 0.0629, + "step": 2314 + }, + { + "epoch": 1.0282034199422607, + "grad_norm": 0.47098174668190423, + "learning_rate": 9.267272481921686e-06, + "loss": 0.0492, + "step": 2315 + }, + { + "epoch": 1.0286475682878082, + "grad_norm": 0.3813955943581627, + "learning_rate": 9.266261859135901e-06, + "loss": 0.0308, + "step": 2316 + }, + { + "epoch": 1.0290917166333555, + "grad_norm": 0.6582567910792767, + "learning_rate": 9.265250595058486e-06, + "loss": 0.0573, + "step": 2317 + }, + { + "epoch": 1.029535864978903, + "grad_norm": 0.5953836977171739, + "learning_rate": 9.264238689841456e-06, + "loss": 0.0631, + "step": 2318 + }, + { + "epoch": 1.0299800133244503, + "grad_norm": 0.6292012725822909, + "learning_rate": 9.263226143636912e-06, + "loss": 0.0691, + "step": 2319 + }, + { + "epoch": 1.0304241616699978, + "grad_norm": 0.47225699541890964, + "learning_rate": 9.262212956597059e-06, + "loss": 0.0563, + "step": 2320 + }, + { + "epoch": 1.0308683100155451, + "grad_norm": 0.48390119987242786, + "learning_rate": 9.261199128874197e-06, + "loss": 0.054, + "step": 2321 + }, + { + "epoch": 1.0313124583610926, + "grad_norm": 0.5985733647635482, + "learning_rate": 9.26018466062072e-06, + "loss": 0.0582, + "step": 2322 + }, + { + "epoch": 1.03175660670664, + "grad_norm": 0.5869595699898266, + "learning_rate": 9.259169551989121e-06, + "loss": 0.0562, + "step": 2323 + }, + { + "epoch": 1.0322007550521874, + "grad_norm": 0.5529638354373985, + "learning_rate": 9.258153803131989e-06, + "loss": 0.055, + "step": 2324 + }, + { + "epoch": 1.0326449033977347, + "grad_norm": 0.39012786867402377, + "learning_rate": 9.257137414202006e-06, + "loss": 0.0389, + "step": 2325 + }, + { + "epoch": 1.0330890517432822, + "grad_norm": 0.5320314663484899, + "learning_rate": 9.256120385351953e-06, + "loss": 0.0593, + "step": 2326 + }, + { + "epoch": 1.0335332000888298, + "grad_norm": 0.47550720714560507, + "learning_rate": 9.255102716734709e-06, + "loss": 0.0383, + "step": 2327 + }, + { + "epoch": 1.033977348434377, + "grad_norm": 0.6123732206520373, + "learning_rate": 9.254084408503243e-06, + "loss": 0.0733, + "step": 2328 + }, + { + "epoch": 1.0344214967799246, + "grad_norm": 0.6340880346930579, + "learning_rate": 9.253065460810627e-06, + "loss": 0.0703, + "step": 2329 + }, + { + "epoch": 1.0348656451254719, + "grad_norm": 0.4558646564237513, + "learning_rate": 9.252045873810026e-06, + "loss": 0.0389, + "step": 2330 + }, + { + "epoch": 1.0353097934710194, + "grad_norm": 0.573613972583976, + "learning_rate": 9.251025647654698e-06, + "loss": 0.0578, + "step": 2331 + }, + { + "epoch": 1.0357539418165667, + "grad_norm": 0.45869687127989117, + "learning_rate": 9.250004782498006e-06, + "loss": 0.0486, + "step": 2332 + }, + { + "epoch": 1.0361980901621142, + "grad_norm": 0.700597406034261, + "learning_rate": 9.248983278493399e-06, + "loss": 0.0486, + "step": 2333 + }, + { + "epoch": 1.0366422385076615, + "grad_norm": 0.5940754007816903, + "learning_rate": 9.247961135794428e-06, + "loss": 0.0487, + "step": 2334 + }, + { + "epoch": 1.037086386853209, + "grad_norm": 0.7212210545701735, + "learning_rate": 9.246938354554737e-06, + "loss": 0.051, + "step": 2335 + }, + { + "epoch": 1.0375305351987565, + "grad_norm": 0.41449723705228086, + "learning_rate": 9.245914934928068e-06, + "loss": 0.0441, + "step": 2336 + }, + { + "epoch": 1.0379746835443038, + "grad_norm": 0.9446091827668867, + "learning_rate": 9.24489087706826e-06, + "loss": 0.0941, + "step": 2337 + }, + { + "epoch": 1.0384188318898513, + "grad_norm": 0.43472127189546134, + "learning_rate": 9.243866181129246e-06, + "loss": 0.0444, + "step": 2338 + }, + { + "epoch": 1.0388629802353986, + "grad_norm": 0.8234990673910981, + "learning_rate": 9.242840847265053e-06, + "loss": 0.0449, + "step": 2339 + }, + { + "epoch": 1.039307128580946, + "grad_norm": 0.5137093258500119, + "learning_rate": 9.241814875629806e-06, + "loss": 0.047, + "step": 2340 + }, + { + "epoch": 1.0397512769264934, + "grad_norm": 0.46164431011916773, + "learning_rate": 9.24078826637773e-06, + "loss": 0.0567, + "step": 2341 + }, + { + "epoch": 1.040195425272041, + "grad_norm": 0.5917045565013841, + "learning_rate": 9.239761019663139e-06, + "loss": 0.0529, + "step": 2342 + }, + { + "epoch": 1.0406395736175882, + "grad_norm": 0.49372320773080497, + "learning_rate": 9.238733135640445e-06, + "loss": 0.0503, + "step": 2343 + }, + { + "epoch": 1.0410837219631357, + "grad_norm": 0.47687223129773526, + "learning_rate": 9.237704614464157e-06, + "loss": 0.0538, + "step": 2344 + }, + { + "epoch": 1.0415278703086832, + "grad_norm": 0.5267358173749689, + "learning_rate": 9.236675456288879e-06, + "loss": 0.0533, + "step": 2345 + }, + { + "epoch": 1.0419720186542305, + "grad_norm": 0.5562866454702318, + "learning_rate": 9.235645661269313e-06, + "loss": 0.0617, + "step": 2346 + }, + { + "epoch": 1.042416166999778, + "grad_norm": 0.48761824525349823, + "learning_rate": 9.234615229560251e-06, + "loss": 0.0646, + "step": 2347 + }, + { + "epoch": 1.0428603153453253, + "grad_norm": 0.5413755626896984, + "learning_rate": 9.233584161316588e-06, + "loss": 0.0485, + "step": 2348 + }, + { + "epoch": 1.0433044636908728, + "grad_norm": 0.48592846139974233, + "learning_rate": 9.232552456693308e-06, + "loss": 0.0466, + "step": 2349 + }, + { + "epoch": 1.04374861203642, + "grad_norm": 0.3666523977412306, + "learning_rate": 9.231520115845495e-06, + "loss": 0.0415, + "step": 2350 + }, + { + "epoch": 1.0441927603819676, + "grad_norm": 0.5041685160042538, + "learning_rate": 9.23048713892833e-06, + "loss": 0.0461, + "step": 2351 + }, + { + "epoch": 1.044636908727515, + "grad_norm": 0.4557622908913518, + "learning_rate": 9.229453526097085e-06, + "loss": 0.0444, + "step": 2352 + }, + { + "epoch": 1.0450810570730624, + "grad_norm": 0.7840502309472853, + "learning_rate": 9.228419277507126e-06, + "loss": 0.0628, + "step": 2353 + }, + { + "epoch": 1.0455252054186097, + "grad_norm": 0.5291018953826038, + "learning_rate": 9.227384393313924e-06, + "loss": 0.0496, + "step": 2354 + }, + { + "epoch": 1.0459693537641572, + "grad_norm": 0.49003402431309595, + "learning_rate": 9.226348873673036e-06, + "loss": 0.0549, + "step": 2355 + }, + { + "epoch": 1.0464135021097047, + "grad_norm": 0.5208623353433574, + "learning_rate": 9.22531271874012e-06, + "loss": 0.0473, + "step": 2356 + }, + { + "epoch": 1.046857650455252, + "grad_norm": 0.6890800835128176, + "learning_rate": 9.224275928670925e-06, + "loss": 0.0554, + "step": 2357 + }, + { + "epoch": 1.0473017988007995, + "grad_norm": 0.6619670076578327, + "learning_rate": 9.223238503621302e-06, + "loss": 0.0577, + "step": 2358 + }, + { + "epoch": 1.0477459471463468, + "grad_norm": 0.5128511282981787, + "learning_rate": 9.22220044374719e-06, + "loss": 0.0406, + "step": 2359 + }, + { + "epoch": 1.0481900954918943, + "grad_norm": 0.521752374455369, + "learning_rate": 9.221161749204629e-06, + "loss": 0.05, + "step": 2360 + }, + { + "epoch": 1.0486342438374416, + "grad_norm": 0.6524832106347492, + "learning_rate": 9.220122420149753e-06, + "loss": 0.0548, + "step": 2361 + }, + { + "epoch": 1.0490783921829892, + "grad_norm": 0.49101446033388607, + "learning_rate": 9.219082456738788e-06, + "loss": 0.0392, + "step": 2362 + }, + { + "epoch": 1.0495225405285364, + "grad_norm": 0.40291059778432986, + "learning_rate": 9.218041859128062e-06, + "loss": 0.0403, + "step": 2363 + }, + { + "epoch": 1.049966688874084, + "grad_norm": 0.52475244536887, + "learning_rate": 9.217000627473993e-06, + "loss": 0.0518, + "step": 2364 + }, + { + "epoch": 1.0504108372196312, + "grad_norm": 0.6932318744101922, + "learning_rate": 9.215958761933093e-06, + "loss": 0.0586, + "step": 2365 + }, + { + "epoch": 1.0508549855651788, + "grad_norm": 0.4220631830774893, + "learning_rate": 9.214916262661977e-06, + "loss": 0.045, + "step": 2366 + }, + { + "epoch": 1.0512991339107263, + "grad_norm": 0.4512747724107325, + "learning_rate": 9.213873129817346e-06, + "loss": 0.0543, + "step": 2367 + }, + { + "epoch": 1.0517432822562736, + "grad_norm": 0.4570731167347105, + "learning_rate": 9.212829363556003e-06, + "loss": 0.0594, + "step": 2368 + }, + { + "epoch": 1.052187430601821, + "grad_norm": 0.508942846260648, + "learning_rate": 9.211784964034842e-06, + "loss": 0.0685, + "step": 2369 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 0.5865147734769778, + "learning_rate": 9.210739931410857e-06, + "loss": 0.064, + "step": 2370 + }, + { + "epoch": 1.0530757272929159, + "grad_norm": 0.6404273704289678, + "learning_rate": 9.209694265841132e-06, + "loss": 0.0449, + "step": 2371 + }, + { + "epoch": 1.0535198756384632, + "grad_norm": 0.5024824890374885, + "learning_rate": 9.208647967482849e-06, + "loss": 0.053, + "step": 2372 + }, + { + "epoch": 1.0539640239840107, + "grad_norm": 0.5079871293280772, + "learning_rate": 9.207601036493284e-06, + "loss": 0.0504, + "step": 2373 + }, + { + "epoch": 1.054408172329558, + "grad_norm": 0.5005289716599005, + "learning_rate": 9.206553473029807e-06, + "loss": 0.0469, + "step": 2374 + }, + { + "epoch": 1.0548523206751055, + "grad_norm": 0.6348636511274667, + "learning_rate": 9.205505277249888e-06, + "loss": 0.0664, + "step": 2375 + }, + { + "epoch": 1.055296469020653, + "grad_norm": 0.7124285353701557, + "learning_rate": 9.204456449311086e-06, + "loss": 0.0811, + "step": 2376 + }, + { + "epoch": 1.0557406173662003, + "grad_norm": 0.7155216157670515, + "learning_rate": 9.203406989371058e-06, + "loss": 0.068, + "step": 2377 + }, + { + "epoch": 1.0561847657117478, + "grad_norm": 0.6903207059209383, + "learning_rate": 9.202356897587556e-06, + "loss": 0.0625, + "step": 2378 + }, + { + "epoch": 1.056628914057295, + "grad_norm": 0.4483522509553745, + "learning_rate": 9.201306174118428e-06, + "loss": 0.0461, + "step": 2379 + }, + { + "epoch": 1.0570730624028426, + "grad_norm": 0.6352875132111704, + "learning_rate": 9.200254819121612e-06, + "loss": 0.0838, + "step": 2380 + }, + { + "epoch": 1.05751721074839, + "grad_norm": 0.600462738349843, + "learning_rate": 9.19920283275515e-06, + "loss": 0.0498, + "step": 2381 + }, + { + "epoch": 1.0579613590939374, + "grad_norm": 0.513630635347009, + "learning_rate": 9.198150215177168e-06, + "loss": 0.0557, + "step": 2382 + }, + { + "epoch": 1.0584055074394847, + "grad_norm": 0.674636182160706, + "learning_rate": 9.197096966545896e-06, + "loss": 0.0608, + "step": 2383 + }, + { + "epoch": 1.0588496557850322, + "grad_norm": 0.7211986590126924, + "learning_rate": 9.196043087019651e-06, + "loss": 0.0675, + "step": 2384 + }, + { + "epoch": 1.0592938041305797, + "grad_norm": 0.48469937945694536, + "learning_rate": 9.194988576756855e-06, + "loss": 0.0533, + "step": 2385 + }, + { + "epoch": 1.059737952476127, + "grad_norm": 0.6223548159808464, + "learning_rate": 9.193933435916013e-06, + "loss": 0.0609, + "step": 2386 + }, + { + "epoch": 1.0601821008216745, + "grad_norm": 0.4702625375198161, + "learning_rate": 9.192877664655736e-06, + "loss": 0.0629, + "step": 2387 + }, + { + "epoch": 1.0606262491672218, + "grad_norm": 0.6156516266579465, + "learning_rate": 9.191821263134718e-06, + "loss": 0.0519, + "step": 2388 + }, + { + "epoch": 1.0610703975127693, + "grad_norm": 0.41634294802260347, + "learning_rate": 9.19076423151176e-06, + "loss": 0.0339, + "step": 2389 + }, + { + "epoch": 1.0615145458583166, + "grad_norm": 0.5676744907352943, + "learning_rate": 9.189706569945749e-06, + "loss": 0.0672, + "step": 2390 + }, + { + "epoch": 1.0619586942038641, + "grad_norm": 0.5506701658376884, + "learning_rate": 9.188648278595669e-06, + "loss": 0.0549, + "step": 2391 + }, + { + "epoch": 1.0624028425494114, + "grad_norm": 0.5548831925430109, + "learning_rate": 9.187589357620602e-06, + "loss": 0.0495, + "step": 2392 + }, + { + "epoch": 1.062846990894959, + "grad_norm": 0.5733351694159566, + "learning_rate": 9.186529807179715e-06, + "loss": 0.0413, + "step": 2393 + }, + { + "epoch": 1.0632911392405062, + "grad_norm": 0.4263224419709196, + "learning_rate": 9.185469627432287e-06, + "loss": 0.0376, + "step": 2394 + }, + { + "epoch": 1.0637352875860537, + "grad_norm": 0.5107765087622207, + "learning_rate": 9.184408818537673e-06, + "loss": 0.0441, + "step": 2395 + }, + { + "epoch": 1.0641794359316012, + "grad_norm": 0.4468370290354222, + "learning_rate": 9.183347380655332e-06, + "loss": 0.0393, + "step": 2396 + }, + { + "epoch": 1.0646235842771485, + "grad_norm": 0.4890359715306359, + "learning_rate": 9.182285313944818e-06, + "loss": 0.0481, + "step": 2397 + }, + { + "epoch": 1.065067732622696, + "grad_norm": 0.45195434165164133, + "learning_rate": 9.181222618565777e-06, + "loss": 0.0509, + "step": 2398 + }, + { + "epoch": 1.0655118809682433, + "grad_norm": 0.49289142398585184, + "learning_rate": 9.180159294677948e-06, + "loss": 0.0515, + "step": 2399 + }, + { + "epoch": 1.0659560293137909, + "grad_norm": 0.42385971921718624, + "learning_rate": 9.179095342441171e-06, + "loss": 0.0448, + "step": 2400 + }, + { + "epoch": 1.0664001776593381, + "grad_norm": 0.6172728890244491, + "learning_rate": 9.178030762015372e-06, + "loss": 0.0401, + "step": 2401 + }, + { + "epoch": 1.0668443260048857, + "grad_norm": 0.5217625747110314, + "learning_rate": 9.176965553560578e-06, + "loss": 0.0578, + "step": 2402 + }, + { + "epoch": 1.067288474350433, + "grad_norm": 0.4881611280755445, + "learning_rate": 9.175899717236907e-06, + "loss": 0.0437, + "step": 2403 + }, + { + "epoch": 1.0677326226959805, + "grad_norm": 0.5243251412540129, + "learning_rate": 9.174833253204571e-06, + "loss": 0.0408, + "step": 2404 + }, + { + "epoch": 1.068176771041528, + "grad_norm": 0.4299956266707952, + "learning_rate": 9.17376616162388e-06, + "loss": 0.0416, + "step": 2405 + }, + { + "epoch": 1.0686209193870753, + "grad_norm": 0.5888767684513153, + "learning_rate": 9.172698442655236e-06, + "loss": 0.064, + "step": 2406 + }, + { + "epoch": 1.0690650677326228, + "grad_norm": 0.4022150666774181, + "learning_rate": 9.171630096459134e-06, + "loss": 0.0416, + "step": 2407 + }, + { + "epoch": 1.06950921607817, + "grad_norm": 0.9093917263815293, + "learning_rate": 9.170561123196165e-06, + "loss": 0.0573, + "step": 2408 + }, + { + "epoch": 1.0699533644237176, + "grad_norm": 0.5641484333920734, + "learning_rate": 9.169491523027012e-06, + "loss": 0.0527, + "step": 2409 + }, + { + "epoch": 1.0703975127692649, + "grad_norm": 0.5614858136344199, + "learning_rate": 9.168421296112457e-06, + "loss": 0.0525, + "step": 2410 + }, + { + "epoch": 1.0708416611148124, + "grad_norm": 0.5361742414415347, + "learning_rate": 9.167350442613371e-06, + "loss": 0.0512, + "step": 2411 + }, + { + "epoch": 1.0712858094603597, + "grad_norm": 0.4065744899599026, + "learning_rate": 9.166278962690724e-06, + "loss": 0.0352, + "step": 2412 + }, + { + "epoch": 1.0717299578059072, + "grad_norm": 0.7190511616030503, + "learning_rate": 9.165206856505577e-06, + "loss": 0.0711, + "step": 2413 + }, + { + "epoch": 1.0721741061514547, + "grad_norm": 0.5266458384013354, + "learning_rate": 9.164134124219085e-06, + "loss": 0.0436, + "step": 2414 + }, + { + "epoch": 1.072618254497002, + "grad_norm": 0.5599669781296026, + "learning_rate": 9.163060765992495e-06, + "loss": 0.0759, + "step": 2415 + }, + { + "epoch": 1.0730624028425495, + "grad_norm": 0.5128109774041522, + "learning_rate": 9.161986781987156e-06, + "loss": 0.0455, + "step": 2416 + }, + { + "epoch": 1.0735065511880968, + "grad_norm": 0.4078193083238796, + "learning_rate": 9.160912172364503e-06, + "loss": 0.0403, + "step": 2417 + }, + { + "epoch": 1.0739506995336443, + "grad_norm": 0.6961390181093496, + "learning_rate": 9.15983693728607e-06, + "loss": 0.0757, + "step": 2418 + }, + { + "epoch": 1.0743948478791916, + "grad_norm": 0.518134789703249, + "learning_rate": 9.158761076913481e-06, + "loss": 0.0458, + "step": 2419 + }, + { + "epoch": 1.074838996224739, + "grad_norm": 0.6466366059180623, + "learning_rate": 9.157684591408458e-06, + "loss": 0.0639, + "step": 2420 + }, + { + "epoch": 1.0752831445702864, + "grad_norm": 0.45726102190092227, + "learning_rate": 9.156607480932813e-06, + "loss": 0.0485, + "step": 2421 + }, + { + "epoch": 1.075727292915834, + "grad_norm": 0.47111677703640187, + "learning_rate": 9.155529745648457e-06, + "loss": 0.0576, + "step": 2422 + }, + { + "epoch": 1.0761714412613812, + "grad_norm": 0.4424777927187194, + "learning_rate": 9.154451385717387e-06, + "loss": 0.0415, + "step": 2423 + }, + { + "epoch": 1.0766155896069287, + "grad_norm": 0.5637865585741092, + "learning_rate": 9.153372401301706e-06, + "loss": 0.0569, + "step": 2424 + }, + { + "epoch": 1.0770597379524762, + "grad_norm": 0.6193090479661898, + "learning_rate": 9.152292792563596e-06, + "loss": 0.0541, + "step": 2425 + }, + { + "epoch": 1.0775038862980235, + "grad_norm": 0.5604767145835976, + "learning_rate": 9.151212559665345e-06, + "loss": 0.0487, + "step": 2426 + }, + { + "epoch": 1.077948034643571, + "grad_norm": 0.614420431496054, + "learning_rate": 9.150131702769332e-06, + "loss": 0.0543, + "step": 2427 + }, + { + "epoch": 1.0783921829891183, + "grad_norm": 0.500155979890897, + "learning_rate": 9.149050222038024e-06, + "loss": 0.0543, + "step": 2428 + }, + { + "epoch": 1.0788363313346658, + "grad_norm": 0.6046519445448296, + "learning_rate": 9.147968117633988e-06, + "loss": 0.0669, + "step": 2429 + }, + { + "epoch": 1.0792804796802131, + "grad_norm": 0.5730375661279963, + "learning_rate": 9.14688538971988e-06, + "loss": 0.0495, + "step": 2430 + }, + { + "epoch": 1.0797246280257606, + "grad_norm": 0.482430110901872, + "learning_rate": 9.145802038458457e-06, + "loss": 0.0484, + "step": 2431 + }, + { + "epoch": 1.080168776371308, + "grad_norm": 0.49413095926471534, + "learning_rate": 9.144718064012562e-06, + "loss": 0.0482, + "step": 2432 + }, + { + "epoch": 1.0806129247168554, + "grad_norm": 0.6274129207366034, + "learning_rate": 9.143633466545136e-06, + "loss": 0.0609, + "step": 2433 + }, + { + "epoch": 1.0810570730624027, + "grad_norm": 0.4754419676708797, + "learning_rate": 9.142548246219212e-06, + "loss": 0.0539, + "step": 2434 + }, + { + "epoch": 1.0815012214079502, + "grad_norm": 0.5129065641839461, + "learning_rate": 9.141462403197917e-06, + "loss": 0.0502, + "step": 2435 + }, + { + "epoch": 1.0819453697534978, + "grad_norm": 0.5492083841021022, + "learning_rate": 9.14037593764447e-06, + "loss": 0.0547, + "step": 2436 + }, + { + "epoch": 1.082389518099045, + "grad_norm": 0.5638343322048809, + "learning_rate": 9.139288849722188e-06, + "loss": 0.0552, + "step": 2437 + }, + { + "epoch": 1.0828336664445926, + "grad_norm": 0.48234420470469497, + "learning_rate": 9.138201139594478e-06, + "loss": 0.0509, + "step": 2438 + }, + { + "epoch": 1.0832778147901398, + "grad_norm": 0.48233545504362385, + "learning_rate": 9.137112807424842e-06, + "loss": 0.0618, + "step": 2439 + }, + { + "epoch": 1.0837219631356874, + "grad_norm": 0.6597245369594125, + "learning_rate": 9.136023853376872e-06, + "loss": 0.0505, + "step": 2440 + }, + { + "epoch": 1.0841661114812347, + "grad_norm": 0.8640144400788339, + "learning_rate": 9.134934277614258e-06, + "loss": 0.0552, + "step": 2441 + }, + { + "epoch": 1.0846102598267822, + "grad_norm": 0.5151117640182895, + "learning_rate": 9.133844080300783e-06, + "loss": 0.0552, + "step": 2442 + }, + { + "epoch": 1.0850544081723295, + "grad_norm": 0.5553941913181785, + "learning_rate": 9.13275326160032e-06, + "loss": 0.068, + "step": 2443 + }, + { + "epoch": 1.085498556517877, + "grad_norm": 1.2548272109304157, + "learning_rate": 9.131661821676839e-06, + "loss": 0.0615, + "step": 2444 + }, + { + "epoch": 1.0859427048634245, + "grad_norm": 0.7349991566989792, + "learning_rate": 9.130569760694402e-06, + "loss": 0.0592, + "step": 2445 + }, + { + "epoch": 1.0863868532089718, + "grad_norm": 0.4235071999153545, + "learning_rate": 9.129477078817165e-06, + "loss": 0.0399, + "step": 2446 + }, + { + "epoch": 1.0868310015545193, + "grad_norm": 0.5132669145700011, + "learning_rate": 9.128383776209372e-06, + "loss": 0.051, + "step": 2447 + }, + { + "epoch": 1.0872751499000666, + "grad_norm": 1.6678578417052072, + "learning_rate": 9.127289853035371e-06, + "loss": 0.068, + "step": 2448 + }, + { + "epoch": 1.087719298245614, + "grad_norm": 0.3444388637101336, + "learning_rate": 9.126195309459593e-06, + "loss": 0.0326, + "step": 2449 + }, + { + "epoch": 1.0881634465911614, + "grad_norm": 0.38355662180503935, + "learning_rate": 9.12510014564657e-06, + "loss": 0.0552, + "step": 2450 + }, + { + "epoch": 1.0886075949367089, + "grad_norm": 0.5490923753617978, + "learning_rate": 9.124004361760921e-06, + "loss": 0.0598, + "step": 2451 + }, + { + "epoch": 1.0890517432822562, + "grad_norm": 0.49443123554529045, + "learning_rate": 9.122907957967363e-06, + "loss": 0.0545, + "step": 2452 + }, + { + "epoch": 1.0894958916278037, + "grad_norm": 0.47265541042225623, + "learning_rate": 9.121810934430702e-06, + "loss": 0.0523, + "step": 2453 + }, + { + "epoch": 1.0899400399733512, + "grad_norm": 0.6134212234858121, + "learning_rate": 9.12071329131584e-06, + "loss": 0.0669, + "step": 2454 + }, + { + "epoch": 1.0903841883188985, + "grad_norm": 0.5102377020582352, + "learning_rate": 9.119615028787771e-06, + "loss": 0.0488, + "step": 2455 + }, + { + "epoch": 1.090828336664446, + "grad_norm": 0.5477789256953114, + "learning_rate": 9.118516147011585e-06, + "loss": 0.0609, + "step": 2456 + }, + { + "epoch": 1.0912724850099933, + "grad_norm": 0.5014262210362811, + "learning_rate": 9.117416646152459e-06, + "loss": 0.0455, + "step": 2457 + }, + { + "epoch": 1.0917166333555408, + "grad_norm": 0.5068151485473731, + "learning_rate": 9.11631652637567e-06, + "loss": 0.0558, + "step": 2458 + }, + { + "epoch": 1.092160781701088, + "grad_norm": 0.5967678649273153, + "learning_rate": 9.115215787846583e-06, + "loss": 0.0576, + "step": 2459 + }, + { + "epoch": 1.0926049300466356, + "grad_norm": 0.5382239114996434, + "learning_rate": 9.114114430730656e-06, + "loss": 0.0568, + "step": 2460 + }, + { + "epoch": 1.093049078392183, + "grad_norm": 0.46675820455299727, + "learning_rate": 9.113012455193444e-06, + "loss": 0.0437, + "step": 2461 + }, + { + "epoch": 1.0934932267377304, + "grad_norm": 0.6129912443014283, + "learning_rate": 9.111909861400594e-06, + "loss": 0.0584, + "step": 2462 + }, + { + "epoch": 1.0939373750832777, + "grad_norm": 0.49745724599799956, + "learning_rate": 9.110806649517841e-06, + "loss": 0.0541, + "step": 2463 + }, + { + "epoch": 1.0943815234288252, + "grad_norm": 0.5223020387615885, + "learning_rate": 9.109702819711018e-06, + "loss": 0.0579, + "step": 2464 + }, + { + "epoch": 1.0948256717743727, + "grad_norm": 0.522897777305782, + "learning_rate": 9.108598372146052e-06, + "loss": 0.065, + "step": 2465 + }, + { + "epoch": 1.09526982011992, + "grad_norm": 0.4978718424633595, + "learning_rate": 9.107493306988955e-06, + "loss": 0.057, + "step": 2466 + }, + { + "epoch": 1.0957139684654675, + "grad_norm": 0.41889427668506446, + "learning_rate": 9.10638762440584e-06, + "loss": 0.0546, + "step": 2467 + }, + { + "epoch": 1.0961581168110148, + "grad_norm": 0.4889531194168305, + "learning_rate": 9.10528132456291e-06, + "loss": 0.0475, + "step": 2468 + }, + { + "epoch": 1.0966022651565623, + "grad_norm": 0.49561727802236016, + "learning_rate": 9.10417440762646e-06, + "loss": 0.0624, + "step": 2469 + }, + { + "epoch": 1.0970464135021096, + "grad_norm": 0.5035282177847037, + "learning_rate": 9.10306687376288e-06, + "loss": 0.0613, + "step": 2470 + }, + { + "epoch": 1.0974905618476571, + "grad_norm": 0.44693886713635184, + "learning_rate": 9.101958723138651e-06, + "loss": 0.0505, + "step": 2471 + }, + { + "epoch": 1.0979347101932044, + "grad_norm": 0.6676580453833789, + "learning_rate": 9.100849955920344e-06, + "loss": 0.0637, + "step": 2472 + }, + { + "epoch": 1.098378858538752, + "grad_norm": 0.47501330783440915, + "learning_rate": 9.099740572274627e-06, + "loss": 0.0391, + "step": 2473 + }, + { + "epoch": 1.0988230068842995, + "grad_norm": 0.6923283723141113, + "learning_rate": 9.098630572368262e-06, + "loss": 0.0518, + "step": 2474 + }, + { + "epoch": 1.0992671552298467, + "grad_norm": 0.47645749570109275, + "learning_rate": 9.097519956368096e-06, + "loss": 0.0273, + "step": 2475 + }, + { + "epoch": 1.0997113035753943, + "grad_norm": 0.8125207380279094, + "learning_rate": 9.096408724441078e-06, + "loss": 0.062, + "step": 2476 + }, + { + "epoch": 1.1001554519209416, + "grad_norm": 0.6208955890687268, + "learning_rate": 9.09529687675424e-06, + "loss": 0.0399, + "step": 2477 + }, + { + "epoch": 1.100599600266489, + "grad_norm": 0.6086645282708443, + "learning_rate": 9.094184413474716e-06, + "loss": 0.0538, + "step": 2478 + }, + { + "epoch": 1.1010437486120364, + "grad_norm": 0.5638622958089239, + "learning_rate": 9.093071334769727e-06, + "loss": 0.087, + "step": 2479 + }, + { + "epoch": 1.1014878969575839, + "grad_norm": 0.5860122134432946, + "learning_rate": 9.091957640806585e-06, + "loss": 0.0514, + "step": 2480 + }, + { + "epoch": 1.1019320453031312, + "grad_norm": 0.5483193797542968, + "learning_rate": 9.090843331752704e-06, + "loss": 0.0837, + "step": 2481 + }, + { + "epoch": 1.1023761936486787, + "grad_norm": 0.671106831086069, + "learning_rate": 9.089728407775576e-06, + "loss": 0.0533, + "step": 2482 + }, + { + "epoch": 1.1028203419942262, + "grad_norm": 0.471025449998095, + "learning_rate": 9.088612869042794e-06, + "loss": 0.0694, + "step": 2483 + }, + { + "epoch": 1.1032644903397735, + "grad_norm": 0.46180112116306377, + "learning_rate": 9.087496715722049e-06, + "loss": 0.0426, + "step": 2484 + }, + { + "epoch": 1.103708638685321, + "grad_norm": 0.49860403863116065, + "learning_rate": 9.08637994798111e-06, + "loss": 0.0583, + "step": 2485 + }, + { + "epoch": 1.1041527870308683, + "grad_norm": 0.5897260124865925, + "learning_rate": 9.08526256598785e-06, + "loss": 0.0648, + "step": 2486 + }, + { + "epoch": 1.1045969353764158, + "grad_norm": 0.453592540271068, + "learning_rate": 9.084144569910229e-06, + "loss": 0.0455, + "step": 2487 + }, + { + "epoch": 1.105041083721963, + "grad_norm": 0.6113711566277047, + "learning_rate": 9.083025959916302e-06, + "loss": 0.0556, + "step": 2488 + }, + { + "epoch": 1.1054852320675106, + "grad_norm": 0.4026753684401193, + "learning_rate": 9.081906736174217e-06, + "loss": 0.0347, + "step": 2489 + }, + { + "epoch": 1.1059293804130579, + "grad_norm": 0.7535472922225546, + "learning_rate": 9.080786898852207e-06, + "loss": 0.0561, + "step": 2490 + }, + { + "epoch": 1.1063735287586054, + "grad_norm": 0.7712189521196268, + "learning_rate": 9.079666448118607e-06, + "loss": 0.0815, + "step": 2491 + }, + { + "epoch": 1.1068176771041527, + "grad_norm": 0.549393105917364, + "learning_rate": 9.07854538414184e-06, + "loss": 0.0411, + "step": 2492 + }, + { + "epoch": 1.1072618254497002, + "grad_norm": 0.8270414561268318, + "learning_rate": 9.077423707090418e-06, + "loss": 0.0697, + "step": 2493 + }, + { + "epoch": 1.1077059737952477, + "grad_norm": 0.4463853554583335, + "learning_rate": 9.07630141713295e-06, + "loss": 0.0426, + "step": 2494 + }, + { + "epoch": 1.108150122140795, + "grad_norm": 0.6269526052059413, + "learning_rate": 9.075178514438133e-06, + "loss": 0.059, + "step": 2495 + }, + { + "epoch": 1.1085942704863425, + "grad_norm": 0.7332597224037389, + "learning_rate": 9.074054999174762e-06, + "loss": 0.0647, + "step": 2496 + }, + { + "epoch": 1.1090384188318898, + "grad_norm": 0.6649774981712752, + "learning_rate": 9.072930871511718e-06, + "loss": 0.0751, + "step": 2497 + }, + { + "epoch": 1.1094825671774373, + "grad_norm": 0.8146175851807935, + "learning_rate": 9.071806131617976e-06, + "loss": 0.0591, + "step": 2498 + }, + { + "epoch": 1.1099267155229846, + "grad_norm": 0.6708407619412956, + "learning_rate": 9.070680779662606e-06, + "loss": 0.0552, + "step": 2499 + }, + { + "epoch": 1.1103708638685321, + "grad_norm": 0.6510598528132346, + "learning_rate": 9.069554815814765e-06, + "loss": 0.0547, + "step": 2500 + }, + { + "epoch": 1.1108150122140794, + "grad_norm": 0.7337961607233396, + "learning_rate": 9.068428240243705e-06, + "loss": 0.08, + "step": 2501 + }, + { + "epoch": 1.111259160559627, + "grad_norm": 0.9471208974140737, + "learning_rate": 9.067301053118773e-06, + "loss": 0.0575, + "step": 2502 + }, + { + "epoch": 1.1117033089051742, + "grad_norm": 0.6247228777438872, + "learning_rate": 9.066173254609399e-06, + "loss": 0.0593, + "step": 2503 + }, + { + "epoch": 1.1121474572507217, + "grad_norm": 0.4119422798089505, + "learning_rate": 9.065044844885111e-06, + "loss": 0.0369, + "step": 2504 + }, + { + "epoch": 1.1125916055962692, + "grad_norm": 0.5707033305872198, + "learning_rate": 9.063915824115531e-06, + "loss": 0.0586, + "step": 2505 + }, + { + "epoch": 1.1130357539418165, + "grad_norm": 0.5864819346852046, + "learning_rate": 9.062786192470372e-06, + "loss": 0.0541, + "step": 2506 + }, + { + "epoch": 1.113479902287364, + "grad_norm": 0.4941216006043351, + "learning_rate": 9.06165595011943e-06, + "loss": 0.0467, + "step": 2507 + }, + { + "epoch": 1.1139240506329113, + "grad_norm": 0.7179600112901418, + "learning_rate": 9.060525097232603e-06, + "loss": 0.061, + "step": 2508 + }, + { + "epoch": 1.1143681989784588, + "grad_norm": 0.536437166711899, + "learning_rate": 9.059393633979881e-06, + "loss": 0.0514, + "step": 2509 + }, + { + "epoch": 1.1148123473240061, + "grad_norm": 0.5200228688873401, + "learning_rate": 9.058261560531337e-06, + "loss": 0.0489, + "step": 2510 + }, + { + "epoch": 1.1152564956695536, + "grad_norm": 0.8639507676709893, + "learning_rate": 9.057128877057141e-06, + "loss": 0.0777, + "step": 2511 + }, + { + "epoch": 1.1157006440151012, + "grad_norm": 0.7459642047335504, + "learning_rate": 9.055995583727559e-06, + "loss": 0.0462, + "step": 2512 + }, + { + "epoch": 1.1161447923606485, + "grad_norm": 0.43557336969374605, + "learning_rate": 9.05486168071294e-06, + "loss": 0.0411, + "step": 2513 + }, + { + "epoch": 1.116588940706196, + "grad_norm": 0.4693246881167572, + "learning_rate": 9.05372716818373e-06, + "loss": 0.0317, + "step": 2514 + }, + { + "epoch": 1.1170330890517433, + "grad_norm": 0.5801314945806954, + "learning_rate": 9.052592046310466e-06, + "loss": 0.0504, + "step": 2515 + }, + { + "epoch": 1.1174772373972908, + "grad_norm": 0.5303403405055431, + "learning_rate": 9.051456315263775e-06, + "loss": 0.0417, + "step": 2516 + }, + { + "epoch": 1.117921385742838, + "grad_norm": 0.4682906045403869, + "learning_rate": 9.05031997521438e-06, + "loss": 0.0441, + "step": 2517 + }, + { + "epoch": 1.1183655340883856, + "grad_norm": 0.5610438227567353, + "learning_rate": 9.049183026333089e-06, + "loss": 0.054, + "step": 2518 + }, + { + "epoch": 1.1188096824339329, + "grad_norm": 0.5309141601127093, + "learning_rate": 9.048045468790805e-06, + "loss": 0.0599, + "step": 2519 + }, + { + "epoch": 1.1192538307794804, + "grad_norm": 0.4715253267557383, + "learning_rate": 9.04690730275852e-06, + "loss": 0.0501, + "step": 2520 + }, + { + "epoch": 1.1196979791250277, + "grad_norm": 0.42642161615167634, + "learning_rate": 9.045768528407326e-06, + "loss": 0.0352, + "step": 2521 + }, + { + "epoch": 1.1201421274705752, + "grad_norm": 0.8452679234650169, + "learning_rate": 9.044629145908397e-06, + "loss": 0.0792, + "step": 2522 + }, + { + "epoch": 1.1205862758161227, + "grad_norm": 0.5024078744040577, + "learning_rate": 9.043489155433e-06, + "loss": 0.0578, + "step": 2523 + }, + { + "epoch": 1.12103042416167, + "grad_norm": 0.6039439853816904, + "learning_rate": 9.042348557152495e-06, + "loss": 0.0591, + "step": 2524 + }, + { + "epoch": 1.1214745725072175, + "grad_norm": 0.5200721258931258, + "learning_rate": 9.041207351238336e-06, + "loss": 0.0589, + "step": 2525 + }, + { + "epoch": 1.1219187208527648, + "grad_norm": 0.6201687763530818, + "learning_rate": 9.040065537862063e-06, + "loss": 0.0564, + "step": 2526 + }, + { + "epoch": 1.1223628691983123, + "grad_norm": 0.4816887693066457, + "learning_rate": 9.038923117195313e-06, + "loss": 0.047, + "step": 2527 + }, + { + "epoch": 1.1228070175438596, + "grad_norm": 0.4930745832447622, + "learning_rate": 9.037780089409807e-06, + "loss": 0.0437, + "step": 2528 + }, + { + "epoch": 1.123251165889407, + "grad_norm": 0.4493292408106485, + "learning_rate": 9.036636454677363e-06, + "loss": 0.0587, + "step": 2529 + }, + { + "epoch": 1.1236953142349544, + "grad_norm": 0.5080635531655728, + "learning_rate": 9.035492213169892e-06, + "loss": 0.0536, + "step": 2530 + }, + { + "epoch": 1.124139462580502, + "grad_norm": 0.3142735249050316, + "learning_rate": 9.034347365059389e-06, + "loss": 0.0324, + "step": 2531 + }, + { + "epoch": 1.1245836109260492, + "grad_norm": 0.4572661512872438, + "learning_rate": 9.033201910517944e-06, + "loss": 0.0484, + "step": 2532 + }, + { + "epoch": 1.1250277592715967, + "grad_norm": 0.5858908999181529, + "learning_rate": 9.032055849717743e-06, + "loss": 0.041, + "step": 2533 + }, + { + "epoch": 1.1254719076171442, + "grad_norm": 0.5641689055994421, + "learning_rate": 9.030909182831052e-06, + "loss": 0.062, + "step": 2534 + }, + { + "epoch": 1.1259160559626915, + "grad_norm": 0.48230195194298414, + "learning_rate": 9.02976191003024e-06, + "loss": 0.055, + "step": 2535 + }, + { + "epoch": 1.126360204308239, + "grad_norm": 0.45378491026345646, + "learning_rate": 9.028614031487757e-06, + "loss": 0.0485, + "step": 2536 + }, + { + "epoch": 1.1268043526537863, + "grad_norm": 0.5351056809427763, + "learning_rate": 9.027465547376154e-06, + "loss": 0.0554, + "step": 2537 + }, + { + "epoch": 1.1272485009993338, + "grad_norm": 0.44870767793480254, + "learning_rate": 9.02631645786806e-06, + "loss": 0.0443, + "step": 2538 + }, + { + "epoch": 1.1276926493448811, + "grad_norm": 1.1371656771905172, + "learning_rate": 9.02516676313621e-06, + "loss": 0.0572, + "step": 2539 + }, + { + "epoch": 1.1281367976904286, + "grad_norm": 0.5747790388905519, + "learning_rate": 9.02401646335342e-06, + "loss": 0.0662, + "step": 2540 + }, + { + "epoch": 1.1285809460359761, + "grad_norm": 0.5477714109841172, + "learning_rate": 9.022865558692599e-06, + "loss": 0.0629, + "step": 2541 + }, + { + "epoch": 1.1290250943815234, + "grad_norm": 0.7201777798009189, + "learning_rate": 9.021714049326749e-06, + "loss": 0.0642, + "step": 2542 + }, + { + "epoch": 1.1294692427270707, + "grad_norm": 0.5084161768229669, + "learning_rate": 9.02056193542896e-06, + "loss": 0.0413, + "step": 2543 + }, + { + "epoch": 1.1299133910726182, + "grad_norm": 0.5818759215205692, + "learning_rate": 9.019409217172414e-06, + "loss": 0.0458, + "step": 2544 + }, + { + "epoch": 1.1303575394181657, + "grad_norm": 0.4060737061879425, + "learning_rate": 9.018255894730384e-06, + "loss": 0.0417, + "step": 2545 + }, + { + "epoch": 1.130801687763713, + "grad_norm": 0.37654168507528063, + "learning_rate": 9.017101968276237e-06, + "loss": 0.0356, + "step": 2546 + }, + { + "epoch": 1.1312458361092605, + "grad_norm": 0.5269591862707016, + "learning_rate": 9.015947437983423e-06, + "loss": 0.0502, + "step": 2547 + }, + { + "epoch": 1.1316899844548078, + "grad_norm": 0.6087429306966435, + "learning_rate": 9.014792304025492e-06, + "loss": 0.0549, + "step": 2548 + }, + { + "epoch": 1.1321341328003554, + "grad_norm": 0.47161508183894374, + "learning_rate": 9.013636566576078e-06, + "loss": 0.046, + "step": 2549 + }, + { + "epoch": 1.1325782811459026, + "grad_norm": 0.5566854251234638, + "learning_rate": 9.012480225808908e-06, + "loss": 0.0582, + "step": 2550 + }, + { + "epoch": 1.1330224294914502, + "grad_norm": 0.7226965671519505, + "learning_rate": 9.0113232818978e-06, + "loss": 0.0535, + "step": 2551 + }, + { + "epoch": 1.1334665778369977, + "grad_norm": 0.528548310886907, + "learning_rate": 9.010165735016663e-06, + "loss": 0.0471, + "step": 2552 + }, + { + "epoch": 1.133910726182545, + "grad_norm": 0.6224718792548548, + "learning_rate": 9.009007585339493e-06, + "loss": 0.0411, + "step": 2553 + }, + { + "epoch": 1.1343548745280925, + "grad_norm": 0.46637655730340244, + "learning_rate": 9.007848833040385e-06, + "loss": 0.0382, + "step": 2554 + }, + { + "epoch": 1.1347990228736398, + "grad_norm": 0.6524984653497812, + "learning_rate": 9.006689478293513e-06, + "loss": 0.0649, + "step": 2555 + }, + { + "epoch": 1.1352431712191873, + "grad_norm": 0.42573749158130464, + "learning_rate": 9.005529521273152e-06, + "loss": 0.0333, + "step": 2556 + }, + { + "epoch": 1.1356873195647346, + "grad_norm": 0.4720058401308709, + "learning_rate": 9.004368962153662e-06, + "loss": 0.0454, + "step": 2557 + }, + { + "epoch": 1.136131467910282, + "grad_norm": 0.4623215710338066, + "learning_rate": 9.003207801109495e-06, + "loss": 0.0413, + "step": 2558 + }, + { + "epoch": 1.1365756162558294, + "grad_norm": 0.6101080407770559, + "learning_rate": 9.002046038315192e-06, + "loss": 0.0534, + "step": 2559 + }, + { + "epoch": 1.1370197646013769, + "grad_norm": 0.5040799125891366, + "learning_rate": 9.000883673945387e-06, + "loss": 0.0392, + "step": 2560 + }, + { + "epoch": 1.1374639129469242, + "grad_norm": 0.6363733147612941, + "learning_rate": 8.999720708174802e-06, + "loss": 0.0628, + "step": 2561 + }, + { + "epoch": 1.1379080612924717, + "grad_norm": 0.5620665765918448, + "learning_rate": 8.998557141178252e-06, + "loss": 0.0524, + "step": 2562 + }, + { + "epoch": 1.1383522096380192, + "grad_norm": 0.48743987280774526, + "learning_rate": 8.99739297313064e-06, + "loss": 0.0502, + "step": 2563 + }, + { + "epoch": 1.1387963579835665, + "grad_norm": 0.4714981818194369, + "learning_rate": 8.99622820420696e-06, + "loss": 0.0477, + "step": 2564 + }, + { + "epoch": 1.139240506329114, + "grad_norm": 0.466559878027, + "learning_rate": 8.995062834582297e-06, + "loss": 0.0585, + "step": 2565 + }, + { + "epoch": 1.1396846546746613, + "grad_norm": 0.5806057329454937, + "learning_rate": 8.993896864431825e-06, + "loss": 0.0653, + "step": 2566 + }, + { + "epoch": 1.1401288030202088, + "grad_norm": 0.565404505122367, + "learning_rate": 8.992730293930812e-06, + "loss": 0.0375, + "step": 2567 + }, + { + "epoch": 1.140572951365756, + "grad_norm": 0.49869252300833783, + "learning_rate": 8.99156312325461e-06, + "loss": 0.0412, + "step": 2568 + }, + { + "epoch": 1.1410170997113036, + "grad_norm": 0.4533506243280891, + "learning_rate": 8.990395352578665e-06, + "loss": 0.0423, + "step": 2569 + }, + { + "epoch": 1.141461248056851, + "grad_norm": 0.5186554476274421, + "learning_rate": 8.989226982078513e-06, + "loss": 0.0486, + "step": 2570 + }, + { + "epoch": 1.1419053964023984, + "grad_norm": 0.5971608007231468, + "learning_rate": 8.988058011929781e-06, + "loss": 0.063, + "step": 2571 + }, + { + "epoch": 1.1423495447479457, + "grad_norm": 0.6848430211876612, + "learning_rate": 8.986888442308187e-06, + "loss": 0.0815, + "step": 2572 + }, + { + "epoch": 1.1427936930934932, + "grad_norm": 0.5213622343694703, + "learning_rate": 8.985718273389532e-06, + "loss": 0.0426, + "step": 2573 + }, + { + "epoch": 1.1432378414390407, + "grad_norm": 0.4678564391636386, + "learning_rate": 8.984547505349714e-06, + "loss": 0.0461, + "step": 2574 + }, + { + "epoch": 1.143681989784588, + "grad_norm": 0.6562307201524248, + "learning_rate": 8.983376138364723e-06, + "loss": 0.0666, + "step": 2575 + }, + { + "epoch": 1.1441261381301355, + "grad_norm": 0.7770057155172067, + "learning_rate": 8.982204172610632e-06, + "loss": 0.0761, + "step": 2576 + }, + { + "epoch": 1.1445702864756828, + "grad_norm": 0.5346372112341888, + "learning_rate": 8.981031608263608e-06, + "loss": 0.0742, + "step": 2577 + }, + { + "epoch": 1.1450144348212303, + "grad_norm": 0.549185701186776, + "learning_rate": 8.979858445499908e-06, + "loss": 0.0583, + "step": 2578 + }, + { + "epoch": 1.1454585831667776, + "grad_norm": 0.44014613661158175, + "learning_rate": 8.978684684495875e-06, + "loss": 0.0422, + "step": 2579 + }, + { + "epoch": 1.1459027315123251, + "grad_norm": 0.43518841056839336, + "learning_rate": 8.97751032542795e-06, + "loss": 0.0513, + "step": 2580 + }, + { + "epoch": 1.1463468798578726, + "grad_norm": 0.3840746634100509, + "learning_rate": 8.976335368472657e-06, + "loss": 0.0295, + "step": 2581 + }, + { + "epoch": 1.14679102820342, + "grad_norm": 0.7385893846304249, + "learning_rate": 8.97515981380661e-06, + "loss": 0.0794, + "step": 2582 + }, + { + "epoch": 1.1472351765489675, + "grad_norm": 0.38857897583910317, + "learning_rate": 8.97398366160652e-06, + "loss": 0.0361, + "step": 2583 + }, + { + "epoch": 1.1476793248945147, + "grad_norm": 0.6486029298679971, + "learning_rate": 8.972806912049178e-06, + "loss": 0.0732, + "step": 2584 + }, + { + "epoch": 1.1481234732400623, + "grad_norm": 0.4395809321084259, + "learning_rate": 8.971629565311471e-06, + "loss": 0.0426, + "step": 2585 + }, + { + "epoch": 1.1485676215856095, + "grad_norm": 0.42320652941064185, + "learning_rate": 8.970451621570376e-06, + "loss": 0.0476, + "step": 2586 + }, + { + "epoch": 1.149011769931157, + "grad_norm": 0.3926981271234761, + "learning_rate": 8.969273081002954e-06, + "loss": 0.0408, + "step": 2587 + }, + { + "epoch": 1.1494559182767043, + "grad_norm": 0.6068945137716437, + "learning_rate": 8.96809394378636e-06, + "loss": 0.0448, + "step": 2588 + }, + { + "epoch": 1.1499000666222519, + "grad_norm": 0.46200186631768236, + "learning_rate": 8.966914210097843e-06, + "loss": 0.0587, + "step": 2589 + }, + { + "epoch": 1.1503442149677992, + "grad_norm": 0.8019976203155184, + "learning_rate": 8.965733880114734e-06, + "loss": 0.0768, + "step": 2590 + }, + { + "epoch": 1.1507883633133467, + "grad_norm": 0.4272911771923715, + "learning_rate": 8.964552954014455e-06, + "loss": 0.0555, + "step": 2591 + }, + { + "epoch": 1.1512325116588942, + "grad_norm": 0.46591985962506655, + "learning_rate": 8.963371431974521e-06, + "loss": 0.0491, + "step": 2592 + }, + { + "epoch": 1.1516766600044415, + "grad_norm": 0.34427825623736114, + "learning_rate": 8.962189314172537e-06, + "loss": 0.0291, + "step": 2593 + }, + { + "epoch": 1.152120808349989, + "grad_norm": 0.3941777289275942, + "learning_rate": 8.961006600786191e-06, + "loss": 0.0386, + "step": 2594 + }, + { + "epoch": 1.1525649566955363, + "grad_norm": 0.7828627909758169, + "learning_rate": 8.959823291993268e-06, + "loss": 0.0644, + "step": 2595 + }, + { + "epoch": 1.1530091050410838, + "grad_norm": 0.44448667498264577, + "learning_rate": 8.95863938797164e-06, + "loss": 0.0437, + "step": 2596 + }, + { + "epoch": 1.153453253386631, + "grad_norm": 0.44346487742364565, + "learning_rate": 8.957454888899264e-06, + "loss": 0.0462, + "step": 2597 + }, + { + "epoch": 1.1538974017321786, + "grad_norm": 0.4503855208067358, + "learning_rate": 8.956269794954195e-06, + "loss": 0.0467, + "step": 2598 + }, + { + "epoch": 1.1543415500777259, + "grad_norm": 0.50075395685955, + "learning_rate": 8.95508410631457e-06, + "loss": 0.0513, + "step": 2599 + }, + { + "epoch": 1.1547856984232734, + "grad_norm": 1.5111182491065092, + "learning_rate": 8.953897823158618e-06, + "loss": 0.0422, + "step": 2600 + }, + { + "epoch": 1.1552298467688207, + "grad_norm": 0.4837059174435265, + "learning_rate": 8.95271094566466e-06, + "loss": 0.0484, + "step": 2601 + }, + { + "epoch": 1.1556739951143682, + "grad_norm": 0.9602960350821099, + "learning_rate": 8.9515234740111e-06, + "loss": 0.0596, + "step": 2602 + }, + { + "epoch": 1.1561181434599157, + "grad_norm": 0.4674309097106074, + "learning_rate": 8.950335408376438e-06, + "loss": 0.0388, + "step": 2603 + }, + { + "epoch": 1.156562291805463, + "grad_norm": 0.5494197343743635, + "learning_rate": 8.949146748939259e-06, + "loss": 0.0516, + "step": 2604 + }, + { + "epoch": 1.1570064401510105, + "grad_norm": 0.603296345877419, + "learning_rate": 8.94795749587824e-06, + "loss": 0.0648, + "step": 2605 + }, + { + "epoch": 1.1574505884965578, + "grad_norm": 0.6840688549848888, + "learning_rate": 8.946767649372144e-06, + "loss": 0.0633, + "step": 2606 + }, + { + "epoch": 1.1578947368421053, + "grad_norm": 0.7710851984691011, + "learning_rate": 8.945577209599829e-06, + "loss": 0.0557, + "step": 2607 + }, + { + "epoch": 1.1583388851876526, + "grad_norm": 0.5783757044901674, + "learning_rate": 8.944386176740233e-06, + "loss": 0.0552, + "step": 2608 + }, + { + "epoch": 1.1587830335332001, + "grad_norm": 0.4854270839314596, + "learning_rate": 8.943194550972392e-06, + "loss": 0.0531, + "step": 2609 + }, + { + "epoch": 1.1592271818787476, + "grad_norm": 0.5109780619188811, + "learning_rate": 8.942002332475428e-06, + "loss": 0.0512, + "step": 2610 + }, + { + "epoch": 1.159671330224295, + "grad_norm": 0.541497267825657, + "learning_rate": 8.940809521428551e-06, + "loss": 0.0622, + "step": 2611 + }, + { + "epoch": 1.1601154785698422, + "grad_norm": 0.6325619808955779, + "learning_rate": 8.939616118011058e-06, + "loss": 0.0577, + "step": 2612 + }, + { + "epoch": 1.1605596269153897, + "grad_norm": 0.5836195560024343, + "learning_rate": 8.938422122402342e-06, + "loss": 0.0608, + "step": 2613 + }, + { + "epoch": 1.1610037752609372, + "grad_norm": 0.5121529339351947, + "learning_rate": 8.937227534781878e-06, + "loss": 0.0496, + "step": 2614 + }, + { + "epoch": 1.1614479236064845, + "grad_norm": 0.5247406850098297, + "learning_rate": 8.936032355329233e-06, + "loss": 0.0509, + "step": 2615 + }, + { + "epoch": 1.161892071952032, + "grad_norm": 0.43915296270490234, + "learning_rate": 8.934836584224065e-06, + "loss": 0.0483, + "step": 2616 + }, + { + "epoch": 1.1623362202975793, + "grad_norm": 0.4849661298577939, + "learning_rate": 8.933640221646116e-06, + "loss": 0.0534, + "step": 2617 + }, + { + "epoch": 1.1627803686431268, + "grad_norm": 0.5053186137211374, + "learning_rate": 8.932443267775221e-06, + "loss": 0.0508, + "step": 2618 + }, + { + "epoch": 1.1632245169886741, + "grad_norm": 0.5361080174409868, + "learning_rate": 8.931245722791305e-06, + "loss": 0.0609, + "step": 2619 + }, + { + "epoch": 1.1636686653342216, + "grad_norm": 0.40985225869641795, + "learning_rate": 8.930047586874373e-06, + "loss": 0.0458, + "step": 2620 + }, + { + "epoch": 1.1641128136797692, + "grad_norm": 0.48841596818152816, + "learning_rate": 8.928848860204531e-06, + "loss": 0.0782, + "step": 2621 + }, + { + "epoch": 1.1645569620253164, + "grad_norm": 0.5219141696813542, + "learning_rate": 8.927649542961965e-06, + "loss": 0.0498, + "step": 2622 + }, + { + "epoch": 1.165001110370864, + "grad_norm": 0.6061661462461002, + "learning_rate": 8.926449635326954e-06, + "loss": 0.0586, + "step": 2623 + }, + { + "epoch": 1.1654452587164112, + "grad_norm": 0.4500746397097711, + "learning_rate": 8.925249137479864e-06, + "loss": 0.0399, + "step": 2624 + }, + { + "epoch": 1.1658894070619588, + "grad_norm": 0.5211157612826388, + "learning_rate": 8.92404804960115e-06, + "loss": 0.05, + "step": 2625 + }, + { + "epoch": 1.166333555407506, + "grad_norm": 0.4903699164468352, + "learning_rate": 8.922846371871355e-06, + "loss": 0.0496, + "step": 2626 + }, + { + "epoch": 1.1667777037530536, + "grad_norm": 0.7464483529673437, + "learning_rate": 8.921644104471114e-06, + "loss": 0.0601, + "step": 2627 + }, + { + "epoch": 1.1672218520986009, + "grad_norm": 0.4872521353158615, + "learning_rate": 8.920441247581148e-06, + "loss": 0.0525, + "step": 2628 + }, + { + "epoch": 1.1676660004441484, + "grad_norm": 0.5674275180118361, + "learning_rate": 8.919237801382265e-06, + "loss": 0.0552, + "step": 2629 + }, + { + "epoch": 1.1681101487896957, + "grad_norm": 0.8950605367456311, + "learning_rate": 8.918033766055364e-06, + "loss": 0.0602, + "step": 2630 + }, + { + "epoch": 1.1685542971352432, + "grad_norm": 0.6732326138636984, + "learning_rate": 8.916829141781432e-06, + "loss": 0.0558, + "step": 2631 + }, + { + "epoch": 1.1689984454807907, + "grad_norm": 0.48994316207093613, + "learning_rate": 8.915623928741546e-06, + "loss": 0.0628, + "step": 2632 + }, + { + "epoch": 1.169442593826338, + "grad_norm": 0.4212795487662145, + "learning_rate": 8.914418127116867e-06, + "loss": 0.0508, + "step": 2633 + }, + { + "epoch": 1.1698867421718855, + "grad_norm": 0.49361391712276814, + "learning_rate": 8.91321173708865e-06, + "loss": 0.0638, + "step": 2634 + }, + { + "epoch": 1.1703308905174328, + "grad_norm": 0.7512476603132378, + "learning_rate": 8.912004758838235e-06, + "loss": 0.0839, + "step": 2635 + }, + { + "epoch": 1.1707750388629803, + "grad_norm": 0.505025169980782, + "learning_rate": 8.910797192547051e-06, + "loss": 0.0457, + "step": 2636 + }, + { + "epoch": 1.1712191872085276, + "grad_norm": 0.5671039324048706, + "learning_rate": 8.909589038396617e-06, + "loss": 0.044, + "step": 2637 + }, + { + "epoch": 1.171663335554075, + "grad_norm": 0.4745487457505175, + "learning_rate": 8.908380296568537e-06, + "loss": 0.0479, + "step": 2638 + }, + { + "epoch": 1.1721074838996224, + "grad_norm": 0.8949118984931155, + "learning_rate": 8.907170967244508e-06, + "loss": 0.0576, + "step": 2639 + }, + { + "epoch": 1.17255163224517, + "grad_norm": 0.4520757913913456, + "learning_rate": 8.905961050606311e-06, + "loss": 0.0467, + "step": 2640 + }, + { + "epoch": 1.1729957805907172, + "grad_norm": 0.4749545091509921, + "learning_rate": 8.904750546835817e-06, + "loss": 0.0446, + "step": 2641 + }, + { + "epoch": 1.1734399289362647, + "grad_norm": 0.522016267548402, + "learning_rate": 8.903539456114988e-06, + "loss": 0.053, + "step": 2642 + }, + { + "epoch": 1.1738840772818122, + "grad_norm": 0.530579514310406, + "learning_rate": 8.902327778625865e-06, + "loss": 0.0437, + "step": 2643 + }, + { + "epoch": 1.1743282256273595, + "grad_norm": 0.6048157018439171, + "learning_rate": 8.90111551455059e-06, + "loss": 0.0435, + "step": 2644 + }, + { + "epoch": 1.174772373972907, + "grad_norm": 0.5906854069033922, + "learning_rate": 8.899902664071384e-06, + "loss": 0.0626, + "step": 2645 + }, + { + "epoch": 1.1752165223184543, + "grad_norm": 0.47030730215576705, + "learning_rate": 8.898689227370563e-06, + "loss": 0.0386, + "step": 2646 + }, + { + "epoch": 1.1756606706640018, + "grad_norm": 0.6351029367838139, + "learning_rate": 8.897475204630521e-06, + "loss": 0.0591, + "step": 2647 + }, + { + "epoch": 1.176104819009549, + "grad_norm": 0.6410082214075851, + "learning_rate": 8.89626059603375e-06, + "loss": 0.0579, + "step": 2648 + }, + { + "epoch": 1.1765489673550966, + "grad_norm": 0.5406985095669288, + "learning_rate": 8.895045401762825e-06, + "loss": 0.0568, + "step": 2649 + }, + { + "epoch": 1.1769931157006441, + "grad_norm": 0.7480895085443751, + "learning_rate": 8.893829622000412e-06, + "loss": 0.0707, + "step": 2650 + }, + { + "epoch": 1.1774372640461914, + "grad_norm": 0.45890411967998634, + "learning_rate": 8.892613256929261e-06, + "loss": 0.0416, + "step": 2651 + }, + { + "epoch": 1.177881412391739, + "grad_norm": 0.396661981002553, + "learning_rate": 8.891396306732214e-06, + "loss": 0.0423, + "step": 2652 + }, + { + "epoch": 1.1783255607372862, + "grad_norm": 0.5069251937753831, + "learning_rate": 8.890178771592198e-06, + "loss": 0.0487, + "step": 2653 + }, + { + "epoch": 1.1787697090828337, + "grad_norm": 0.7785700220445251, + "learning_rate": 8.888960651692231e-06, + "loss": 0.0719, + "step": 2654 + }, + { + "epoch": 1.179213857428381, + "grad_norm": 0.5673896965611613, + "learning_rate": 8.887741947215415e-06, + "loss": 0.0556, + "step": 2655 + }, + { + "epoch": 1.1796580057739285, + "grad_norm": 0.542725413232058, + "learning_rate": 8.886522658344944e-06, + "loss": 0.0493, + "step": 2656 + }, + { + "epoch": 1.1801021541194758, + "grad_norm": 0.6266382123491776, + "learning_rate": 8.885302785264098e-06, + "loss": 0.0383, + "step": 2657 + }, + { + "epoch": 1.1805463024650233, + "grad_norm": 0.46581618376933115, + "learning_rate": 8.884082328156243e-06, + "loss": 0.0419, + "step": 2658 + }, + { + "epoch": 1.1809904508105706, + "grad_norm": 0.5581446013530325, + "learning_rate": 8.882861287204836e-06, + "loss": 0.0498, + "step": 2659 + }, + { + "epoch": 1.1814345991561181, + "grad_norm": 0.42956852547953145, + "learning_rate": 8.881639662593417e-06, + "loss": 0.0381, + "step": 2660 + }, + { + "epoch": 1.1818787475016657, + "grad_norm": 0.45803495008537815, + "learning_rate": 8.880417454505622e-06, + "loss": 0.0492, + "step": 2661 + }, + { + "epoch": 1.182322895847213, + "grad_norm": 0.6914169208326956, + "learning_rate": 8.879194663125164e-06, + "loss": 0.0517, + "step": 2662 + }, + { + "epoch": 1.1827670441927605, + "grad_norm": 0.6395374873974822, + "learning_rate": 8.877971288635853e-06, + "loss": 0.0631, + "step": 2663 + }, + { + "epoch": 1.1832111925383078, + "grad_norm": 0.558846270469075, + "learning_rate": 8.876747331221583e-06, + "loss": 0.058, + "step": 2664 + }, + { + "epoch": 1.1836553408838553, + "grad_norm": 0.49754542881845054, + "learning_rate": 8.875522791066333e-06, + "loss": 0.0616, + "step": 2665 + }, + { + "epoch": 1.1840994892294026, + "grad_norm": 0.641582498045218, + "learning_rate": 8.874297668354175e-06, + "loss": 0.0432, + "step": 2666 + }, + { + "epoch": 1.18454363757495, + "grad_norm": 0.5357146614905643, + "learning_rate": 8.873071963269265e-06, + "loss": 0.0581, + "step": 2667 + }, + { + "epoch": 1.1849877859204974, + "grad_norm": 0.3864891516746441, + "learning_rate": 8.871845675995847e-06, + "loss": 0.0397, + "step": 2668 + }, + { + "epoch": 1.1854319342660449, + "grad_norm": 0.5817776113032109, + "learning_rate": 8.870618806718252e-06, + "loss": 0.0548, + "step": 2669 + }, + { + "epoch": 1.1858760826115922, + "grad_norm": 0.6451637896241947, + "learning_rate": 8.8693913556209e-06, + "loss": 0.0555, + "step": 2670 + }, + { + "epoch": 1.1863202309571397, + "grad_norm": 0.522938279572616, + "learning_rate": 8.868163322888298e-06, + "loss": 0.0429, + "step": 2671 + }, + { + "epoch": 1.1867643793026872, + "grad_norm": 0.6059080038510308, + "learning_rate": 8.86693470870504e-06, + "loss": 0.056, + "step": 2672 + }, + { + "epoch": 1.1872085276482345, + "grad_norm": 0.5024058690626159, + "learning_rate": 8.865705513255807e-06, + "loss": 0.0371, + "step": 2673 + }, + { + "epoch": 1.187652675993782, + "grad_norm": 0.5033221412559168, + "learning_rate": 8.864475736725369e-06, + "loss": 0.06, + "step": 2674 + }, + { + "epoch": 1.1880968243393293, + "grad_norm": 0.4156977033942008, + "learning_rate": 8.863245379298582e-06, + "loss": 0.0378, + "step": 2675 + }, + { + "epoch": 1.1885409726848768, + "grad_norm": 0.8144903668245238, + "learning_rate": 8.86201444116039e-06, + "loss": 0.0614, + "step": 2676 + }, + { + "epoch": 1.188985121030424, + "grad_norm": 0.4226975303790829, + "learning_rate": 8.860782922495821e-06, + "loss": 0.046, + "step": 2677 + }, + { + "epoch": 1.1894292693759716, + "grad_norm": 0.52956763181697, + "learning_rate": 8.859550823489997e-06, + "loss": 0.0384, + "step": 2678 + }, + { + "epoch": 1.189873417721519, + "grad_norm": 0.5489791027942442, + "learning_rate": 8.858318144328123e-06, + "loss": 0.0742, + "step": 2679 + }, + { + "epoch": 1.1903175660670664, + "grad_norm": 1.0552083022753025, + "learning_rate": 8.85708488519549e-06, + "loss": 0.0446, + "step": 2680 + }, + { + "epoch": 1.1907617144126137, + "grad_norm": 0.5078924313476717, + "learning_rate": 8.855851046277478e-06, + "loss": 0.054, + "step": 2681 + }, + { + "epoch": 1.1912058627581612, + "grad_norm": 0.5148608895352339, + "learning_rate": 8.854616627759553e-06, + "loss": 0.0523, + "step": 2682 + }, + { + "epoch": 1.1916500111037087, + "grad_norm": 0.7282958984855765, + "learning_rate": 8.853381629827272e-06, + "loss": 0.0506, + "step": 2683 + }, + { + "epoch": 1.192094159449256, + "grad_norm": 0.5448500153068538, + "learning_rate": 8.852146052666275e-06, + "loss": 0.0404, + "step": 2684 + }, + { + "epoch": 1.1925383077948035, + "grad_norm": 0.40034874918632524, + "learning_rate": 8.850909896462288e-06, + "loss": 0.0351, + "step": 2685 + }, + { + "epoch": 1.1929824561403508, + "grad_norm": 0.6365377232810542, + "learning_rate": 8.849673161401129e-06, + "loss": 0.0616, + "step": 2686 + }, + { + "epoch": 1.1934266044858983, + "grad_norm": 0.5874533737153114, + "learning_rate": 8.848435847668699e-06, + "loss": 0.06, + "step": 2687 + }, + { + "epoch": 1.1938707528314456, + "grad_norm": 0.4075740080944545, + "learning_rate": 8.847197955450988e-06, + "loss": 0.0399, + "step": 2688 + }, + { + "epoch": 1.1943149011769931, + "grad_norm": 0.4560997072221143, + "learning_rate": 8.845959484934073e-06, + "loss": 0.0482, + "step": 2689 + }, + { + "epoch": 1.1947590495225406, + "grad_norm": 0.7271393081297982, + "learning_rate": 8.844720436304113e-06, + "loss": 0.0694, + "step": 2690 + }, + { + "epoch": 1.195203197868088, + "grad_norm": 0.5135039124857559, + "learning_rate": 8.843480809747363e-06, + "loss": 0.064, + "step": 2691 + }, + { + "epoch": 1.1956473462136354, + "grad_norm": 0.5417817389406259, + "learning_rate": 8.842240605450158e-06, + "loss": 0.0816, + "step": 2692 + }, + { + "epoch": 1.1960914945591827, + "grad_norm": 0.40641657577367263, + "learning_rate": 8.840999823598921e-06, + "loss": 0.0378, + "step": 2693 + }, + { + "epoch": 1.1965356429047302, + "grad_norm": 0.7137594073963416, + "learning_rate": 8.839758464380163e-06, + "loss": 0.063, + "step": 2694 + }, + { + "epoch": 1.1969797912502775, + "grad_norm": 0.6152412711374633, + "learning_rate": 8.838516527980483e-06, + "loss": 0.0403, + "step": 2695 + }, + { + "epoch": 1.197423939595825, + "grad_norm": 0.5995872223176858, + "learning_rate": 8.837274014586564e-06, + "loss": 0.0461, + "step": 2696 + }, + { + "epoch": 1.1978680879413723, + "grad_norm": 0.5251664478854555, + "learning_rate": 8.836030924385175e-06, + "loss": 0.0558, + "step": 2697 + }, + { + "epoch": 1.1983122362869199, + "grad_norm": 0.444869598867083, + "learning_rate": 8.834787257563178e-06, + "loss": 0.0583, + "step": 2698 + }, + { + "epoch": 1.1987563846324671, + "grad_norm": 0.5140725378841854, + "learning_rate": 8.833543014307513e-06, + "loss": 0.0513, + "step": 2699 + }, + { + "epoch": 1.1992005329780147, + "grad_norm": 0.5779472328853184, + "learning_rate": 8.83229819480521e-06, + "loss": 0.0653, + "step": 2700 + }, + { + "epoch": 1.1996446813235622, + "grad_norm": 0.5908242045279332, + "learning_rate": 8.831052799243394e-06, + "loss": 0.0565, + "step": 2701 + }, + { + "epoch": 1.2000888296691095, + "grad_norm": 0.9242559999496844, + "learning_rate": 8.82980682780926e-06, + "loss": 0.0628, + "step": 2702 + }, + { + "epoch": 1.200532978014657, + "grad_norm": 0.4108528589278202, + "learning_rate": 8.828560280690104e-06, + "loss": 0.0438, + "step": 2703 + }, + { + "epoch": 1.2009771263602043, + "grad_norm": 0.4411744291440838, + "learning_rate": 8.827313158073304e-06, + "loss": 0.0567, + "step": 2704 + }, + { + "epoch": 1.2014212747057518, + "grad_norm": 0.5260335018456415, + "learning_rate": 8.826065460146318e-06, + "loss": 0.0579, + "step": 2705 + }, + { + "epoch": 1.201865423051299, + "grad_norm": 0.6891291032685649, + "learning_rate": 8.824817187096702e-06, + "loss": 0.056, + "step": 2706 + }, + { + "epoch": 1.2023095713968466, + "grad_norm": 0.7678244223744226, + "learning_rate": 8.823568339112089e-06, + "loss": 0.0577, + "step": 2707 + }, + { + "epoch": 1.2027537197423939, + "grad_norm": 0.605196561971164, + "learning_rate": 8.822318916380207e-06, + "loss": 0.0511, + "step": 2708 + }, + { + "epoch": 1.2031978680879414, + "grad_norm": 0.5123811082634186, + "learning_rate": 8.821068919088858e-06, + "loss": 0.0578, + "step": 2709 + }, + { + "epoch": 1.2036420164334887, + "grad_norm": 0.4168805885973522, + "learning_rate": 8.819818347425943e-06, + "loss": 0.0402, + "step": 2710 + }, + { + "epoch": 1.2040861647790362, + "grad_norm": 0.4322765343829467, + "learning_rate": 8.818567201579444e-06, + "loss": 0.0404, + "step": 2711 + }, + { + "epoch": 1.2045303131245837, + "grad_norm": 0.6086948606935972, + "learning_rate": 8.817315481737428e-06, + "loss": 0.0574, + "step": 2712 + }, + { + "epoch": 1.204974461470131, + "grad_norm": 0.5388644202630685, + "learning_rate": 8.816063188088049e-06, + "loss": 0.0526, + "step": 2713 + }, + { + "epoch": 1.2054186098156785, + "grad_norm": 0.4833033812042681, + "learning_rate": 8.814810320819551e-06, + "loss": 0.0522, + "step": 2714 + }, + { + "epoch": 1.2058627581612258, + "grad_norm": 0.9567007955598438, + "learning_rate": 8.81355688012026e-06, + "loss": 0.0926, + "step": 2715 + }, + { + "epoch": 1.2063069065067733, + "grad_norm": 0.6610578357230338, + "learning_rate": 8.812302866178586e-06, + "loss": 0.0508, + "step": 2716 + }, + { + "epoch": 1.2067510548523206, + "grad_norm": 0.5170485323761623, + "learning_rate": 8.811048279183034e-06, + "loss": 0.0482, + "step": 2717 + }, + { + "epoch": 1.207195203197868, + "grad_norm": 0.49170885647162377, + "learning_rate": 8.809793119322188e-06, + "loss": 0.0452, + "step": 2718 + }, + { + "epoch": 1.2076393515434156, + "grad_norm": 0.6569021320261341, + "learning_rate": 8.808537386784717e-06, + "loss": 0.0602, + "step": 2719 + }, + { + "epoch": 1.208083499888963, + "grad_norm": 0.5861436832219081, + "learning_rate": 8.807281081759382e-06, + "loss": 0.0503, + "step": 2720 + }, + { + "epoch": 1.2085276482345104, + "grad_norm": 0.6362413163945917, + "learning_rate": 8.806024204435024e-06, + "loss": 0.0619, + "step": 2721 + }, + { + "epoch": 1.2089717965800577, + "grad_norm": 0.8112427507399114, + "learning_rate": 8.804766755000577e-06, + "loss": 0.0878, + "step": 2722 + }, + { + "epoch": 1.2094159449256052, + "grad_norm": 0.6213886119436266, + "learning_rate": 8.803508733645056e-06, + "loss": 0.0391, + "step": 2723 + }, + { + "epoch": 1.2098600932711525, + "grad_norm": 0.4701764036600453, + "learning_rate": 8.80225014055756e-06, + "loss": 0.044, + "step": 2724 + }, + { + "epoch": 1.2103042416167, + "grad_norm": 0.6176542835674168, + "learning_rate": 8.80099097592728e-06, + "loss": 0.0717, + "step": 2725 + }, + { + "epoch": 1.2107483899622473, + "grad_norm": 0.5676221760627296, + "learning_rate": 8.799731239943488e-06, + "loss": 0.0485, + "step": 2726 + }, + { + "epoch": 1.2111925383077948, + "grad_norm": 0.7416139133479585, + "learning_rate": 8.798470932795545e-06, + "loss": 0.0848, + "step": 2727 + }, + { + "epoch": 1.2116366866533421, + "grad_norm": 0.8518948114813076, + "learning_rate": 8.797210054672897e-06, + "loss": 0.0603, + "step": 2728 + }, + { + "epoch": 1.2120808349988896, + "grad_norm": 0.5447845702100781, + "learning_rate": 8.795948605765071e-06, + "loss": 0.0503, + "step": 2729 + }, + { + "epoch": 1.2125249833444371, + "grad_norm": 0.6041323757362492, + "learning_rate": 8.794686586261692e-06, + "loss": 0.0606, + "step": 2730 + }, + { + "epoch": 1.2129691316899844, + "grad_norm": 0.3720241663572847, + "learning_rate": 8.793423996352458e-06, + "loss": 0.0396, + "step": 2731 + }, + { + "epoch": 1.213413280035532, + "grad_norm": 0.6614032276078902, + "learning_rate": 8.792160836227156e-06, + "loss": 0.0778, + "step": 2732 + }, + { + "epoch": 1.2138574283810792, + "grad_norm": 0.4371895131814461, + "learning_rate": 8.790897106075665e-06, + "loss": 0.0464, + "step": 2733 + }, + { + "epoch": 1.2143015767266268, + "grad_norm": 0.42112583525881137, + "learning_rate": 8.78963280608794e-06, + "loss": 0.0283, + "step": 2734 + }, + { + "epoch": 1.214745725072174, + "grad_norm": 0.8563223231466682, + "learning_rate": 8.788367936454033e-06, + "loss": 0.0588, + "step": 2735 + }, + { + "epoch": 1.2151898734177216, + "grad_norm": 0.5558136427910825, + "learning_rate": 8.78710249736407e-06, + "loss": 0.0438, + "step": 2736 + }, + { + "epoch": 1.2156340217632688, + "grad_norm": 0.5340710064178852, + "learning_rate": 8.78583648900827e-06, + "loss": 0.0517, + "step": 2737 + }, + { + "epoch": 1.2160781701088164, + "grad_norm": 0.4855046400709726, + "learning_rate": 8.784569911576937e-06, + "loss": 0.0429, + "step": 2738 + }, + { + "epoch": 1.2165223184543636, + "grad_norm": 0.5862019061765111, + "learning_rate": 8.783302765260456e-06, + "loss": 0.0549, + "step": 2739 + }, + { + "epoch": 1.2169664667999112, + "grad_norm": 0.754950173363004, + "learning_rate": 8.782035050249302e-06, + "loss": 0.051, + "step": 2740 + }, + { + "epoch": 1.2174106151454587, + "grad_norm": 0.5310823633824483, + "learning_rate": 8.780766766734037e-06, + "loss": 0.0535, + "step": 2741 + }, + { + "epoch": 1.217854763491006, + "grad_norm": 0.515619945566938, + "learning_rate": 8.779497914905302e-06, + "loss": 0.0518, + "step": 2742 + }, + { + "epoch": 1.2182989118365535, + "grad_norm": 0.5419625287890412, + "learning_rate": 8.778228494953826e-06, + "loss": 0.0629, + "step": 2743 + }, + { + "epoch": 1.2187430601821008, + "grad_norm": 0.41887351869316514, + "learning_rate": 8.776958507070427e-06, + "loss": 0.0436, + "step": 2744 + }, + { + "epoch": 1.2191872085276483, + "grad_norm": 0.5475369085076727, + "learning_rate": 8.775687951446007e-06, + "loss": 0.043, + "step": 2745 + }, + { + "epoch": 1.2196313568731956, + "grad_norm": 0.6765114475250111, + "learning_rate": 8.774416828271548e-06, + "loss": 0.0542, + "step": 2746 + }, + { + "epoch": 1.220075505218743, + "grad_norm": 0.678214320662665, + "learning_rate": 8.773145137738125e-06, + "loss": 0.0815, + "step": 2747 + }, + { + "epoch": 1.2205196535642906, + "grad_norm": 0.49324022002296203, + "learning_rate": 8.771872880036893e-06, + "loss": 0.0524, + "step": 2748 + }, + { + "epoch": 1.2209638019098379, + "grad_norm": 0.5958708788497837, + "learning_rate": 8.770600055359094e-06, + "loss": 0.0456, + "step": 2749 + }, + { + "epoch": 1.2214079502553852, + "grad_norm": 0.4999935592497019, + "learning_rate": 8.769326663896056e-06, + "loss": 0.0409, + "step": 2750 + }, + { + "epoch": 1.2218520986009327, + "grad_norm": 0.43642259806336103, + "learning_rate": 8.76805270583919e-06, + "loss": 0.05, + "step": 2751 + }, + { + "epoch": 1.2222962469464802, + "grad_norm": 0.6350373365457326, + "learning_rate": 8.766778181379993e-06, + "loss": 0.046, + "step": 2752 + }, + { + "epoch": 1.2227403952920275, + "grad_norm": 0.49370518082751286, + "learning_rate": 8.765503090710052e-06, + "loss": 0.0504, + "step": 2753 + }, + { + "epoch": 1.223184543637575, + "grad_norm": 0.6617812231173426, + "learning_rate": 8.76422743402103e-06, + "loss": 0.0524, + "step": 2754 + }, + { + "epoch": 1.2236286919831223, + "grad_norm": 0.46946114196024546, + "learning_rate": 8.762951211504682e-06, + "loss": 0.0459, + "step": 2755 + }, + { + "epoch": 1.2240728403286698, + "grad_norm": 0.49081430568764717, + "learning_rate": 8.761674423352844e-06, + "loss": 0.0438, + "step": 2756 + }, + { + "epoch": 1.224516988674217, + "grad_norm": 0.7508698890744291, + "learning_rate": 8.760397069757443e-06, + "loss": 0.0535, + "step": 2757 + }, + { + "epoch": 1.2249611370197646, + "grad_norm": 0.7095498367244547, + "learning_rate": 8.759119150910482e-06, + "loss": 0.0439, + "step": 2758 + }, + { + "epoch": 1.2254052853653121, + "grad_norm": 0.5423777599181153, + "learning_rate": 8.757840667004059e-06, + "loss": 0.0584, + "step": 2759 + }, + { + "epoch": 1.2258494337108594, + "grad_norm": 0.38826805829970257, + "learning_rate": 8.756561618230348e-06, + "loss": 0.0372, + "step": 2760 + }, + { + "epoch": 1.226293582056407, + "grad_norm": 0.4214243171446597, + "learning_rate": 8.755282004781613e-06, + "loss": 0.0387, + "step": 2761 + }, + { + "epoch": 1.2267377304019542, + "grad_norm": 0.6110973962790421, + "learning_rate": 8.754001826850201e-06, + "loss": 0.0504, + "step": 2762 + }, + { + "epoch": 1.2271818787475017, + "grad_norm": 0.5706371024485902, + "learning_rate": 8.752721084628545e-06, + "loss": 0.0447, + "step": 2763 + }, + { + "epoch": 1.227626027093049, + "grad_norm": 0.7667925876568604, + "learning_rate": 8.751439778309162e-06, + "loss": 0.0661, + "step": 2764 + }, + { + "epoch": 1.2280701754385965, + "grad_norm": 0.4947295202945595, + "learning_rate": 8.750157908084655e-06, + "loss": 0.0481, + "step": 2765 + }, + { + "epoch": 1.2285143237841438, + "grad_norm": 0.4844603657944363, + "learning_rate": 8.74887547414771e-06, + "loss": 0.0428, + "step": 2766 + }, + { + "epoch": 1.2289584721296913, + "grad_norm": 0.5262436629587928, + "learning_rate": 8.747592476691102e-06, + "loss": 0.0572, + "step": 2767 + }, + { + "epoch": 1.2294026204752386, + "grad_norm": 0.48653491041855407, + "learning_rate": 8.746308915907681e-06, + "loss": 0.0466, + "step": 2768 + }, + { + "epoch": 1.2298467688207861, + "grad_norm": 0.3598264709368313, + "learning_rate": 8.745024791990392e-06, + "loss": 0.036, + "step": 2769 + }, + { + "epoch": 1.2302909171663337, + "grad_norm": 0.4281231861639907, + "learning_rate": 8.74374010513226e-06, + "loss": 0.0375, + "step": 2770 + }, + { + "epoch": 1.230735065511881, + "grad_norm": 0.6227157372551906, + "learning_rate": 8.742454855526396e-06, + "loss": 0.06, + "step": 2771 + }, + { + "epoch": 1.2311792138574285, + "grad_norm": 0.5721086967364898, + "learning_rate": 8.741169043365994e-06, + "loss": 0.0517, + "step": 2772 + }, + { + "epoch": 1.2316233622029757, + "grad_norm": 0.4118993047204808, + "learning_rate": 8.739882668844332e-06, + "loss": 0.0433, + "step": 2773 + }, + { + "epoch": 1.2320675105485233, + "grad_norm": 0.7343040841582446, + "learning_rate": 8.738595732154776e-06, + "loss": 0.0525, + "step": 2774 + }, + { + "epoch": 1.2325116588940705, + "grad_norm": 0.4304569080551511, + "learning_rate": 8.737308233490775e-06, + "loss": 0.0548, + "step": 2775 + }, + { + "epoch": 1.232955807239618, + "grad_norm": 0.5483425130829543, + "learning_rate": 8.736020173045858e-06, + "loss": 0.0547, + "step": 2776 + }, + { + "epoch": 1.2333999555851654, + "grad_norm": 0.45127669817592087, + "learning_rate": 8.734731551013648e-06, + "loss": 0.0458, + "step": 2777 + }, + { + "epoch": 1.2338441039307129, + "grad_norm": 0.5076179219488122, + "learning_rate": 8.733442367587842e-06, + "loss": 0.0495, + "step": 2778 + }, + { + "epoch": 1.2342882522762602, + "grad_norm": 0.4061357633325624, + "learning_rate": 8.732152622962229e-06, + "loss": 0.038, + "step": 2779 + }, + { + "epoch": 1.2347324006218077, + "grad_norm": 0.6061025558148905, + "learning_rate": 8.730862317330678e-06, + "loss": 0.0552, + "step": 2780 + }, + { + "epoch": 1.2351765489673552, + "grad_norm": 0.7935186621762401, + "learning_rate": 8.729571450887145e-06, + "loss": 0.0497, + "step": 2781 + }, + { + "epoch": 1.2356206973129025, + "grad_norm": 0.4698985875136965, + "learning_rate": 8.728280023825667e-06, + "loss": 0.0449, + "step": 2782 + }, + { + "epoch": 1.23606484565845, + "grad_norm": 0.5845370985288897, + "learning_rate": 8.726988036340372e-06, + "loss": 0.0558, + "step": 2783 + }, + { + "epoch": 1.2365089940039973, + "grad_norm": 0.40081844727764104, + "learning_rate": 8.725695488625463e-06, + "loss": 0.0309, + "step": 2784 + }, + { + "epoch": 1.2369531423495448, + "grad_norm": 0.565816281763518, + "learning_rate": 8.724402380875234e-06, + "loss": 0.0527, + "step": 2785 + }, + { + "epoch": 1.237397290695092, + "grad_norm": 0.4908483433280354, + "learning_rate": 8.72310871328406e-06, + "loss": 0.0406, + "step": 2786 + }, + { + "epoch": 1.2378414390406396, + "grad_norm": 0.5334199926157142, + "learning_rate": 8.7218144860464e-06, + "loss": 0.0432, + "step": 2787 + }, + { + "epoch": 1.238285587386187, + "grad_norm": 0.43486952273189144, + "learning_rate": 8.720519699356804e-06, + "loss": 0.0449, + "step": 2788 + }, + { + "epoch": 1.2387297357317344, + "grad_norm": 0.5049916355349909, + "learning_rate": 8.719224353409895e-06, + "loss": 0.056, + "step": 2789 + }, + { + "epoch": 1.239173884077282, + "grad_norm": 0.44547159070564635, + "learning_rate": 8.717928448400387e-06, + "loss": 0.0444, + "step": 2790 + }, + { + "epoch": 1.2396180324228292, + "grad_norm": 0.4328408997299911, + "learning_rate": 8.716631984523076e-06, + "loss": 0.0461, + "step": 2791 + }, + { + "epoch": 1.2400621807683767, + "grad_norm": 0.42920188220498545, + "learning_rate": 8.715334961972844e-06, + "loss": 0.0477, + "step": 2792 + }, + { + "epoch": 1.240506329113924, + "grad_norm": 0.45226945207712443, + "learning_rate": 8.714037380944655e-06, + "loss": 0.0445, + "step": 2793 + }, + { + "epoch": 1.2409504774594715, + "grad_norm": 0.6142043461351961, + "learning_rate": 8.712739241633557e-06, + "loss": 0.0526, + "step": 2794 + }, + { + "epoch": 1.2413946258050188, + "grad_norm": 0.49524623051926364, + "learning_rate": 8.711440544234681e-06, + "loss": 0.0448, + "step": 2795 + }, + { + "epoch": 1.2418387741505663, + "grad_norm": 0.527589137144962, + "learning_rate": 8.710141288943247e-06, + "loss": 0.0681, + "step": 2796 + }, + { + "epoch": 1.2422829224961136, + "grad_norm": 0.45949156992557444, + "learning_rate": 8.708841475954551e-06, + "loss": 0.0597, + "step": 2797 + }, + { + "epoch": 1.2427270708416611, + "grad_norm": 0.5807053135952894, + "learning_rate": 8.707541105463982e-06, + "loss": 0.0602, + "step": 2798 + }, + { + "epoch": 1.2431712191872086, + "grad_norm": 0.47202242550842133, + "learning_rate": 8.706240177667003e-06, + "loss": 0.043, + "step": 2799 + }, + { + "epoch": 1.243615367532756, + "grad_norm": 0.5195621264387825, + "learning_rate": 8.704938692759166e-06, + "loss": 0.0384, + "step": 2800 + }, + { + "epoch": 1.2440595158783034, + "grad_norm": 0.40308041507113895, + "learning_rate": 8.703636650936108e-06, + "loss": 0.0352, + "step": 2801 + }, + { + "epoch": 1.2445036642238507, + "grad_norm": 0.5663764923556683, + "learning_rate": 8.70233405239355e-06, + "loss": 0.0487, + "step": 2802 + }, + { + "epoch": 1.2449478125693982, + "grad_norm": 0.6290221419877656, + "learning_rate": 8.70103089732729e-06, + "loss": 0.0518, + "step": 2803 + }, + { + "epoch": 1.2453919609149455, + "grad_norm": 0.4173665256358516, + "learning_rate": 8.699727185933215e-06, + "loss": 0.0352, + "step": 2804 + }, + { + "epoch": 1.245836109260493, + "grad_norm": 0.8204810767617772, + "learning_rate": 8.698422918407299e-06, + "loss": 0.0487, + "step": 2805 + }, + { + "epoch": 1.2462802576060403, + "grad_norm": 0.9850006429206544, + "learning_rate": 8.697118094945593e-06, + "loss": 0.0865, + "step": 2806 + }, + { + "epoch": 1.2467244059515878, + "grad_norm": 0.48735159482648766, + "learning_rate": 8.695812715744235e-06, + "loss": 0.0358, + "step": 2807 + }, + { + "epoch": 1.2471685542971351, + "grad_norm": 0.5010917630261117, + "learning_rate": 8.694506780999444e-06, + "loss": 0.0477, + "step": 2808 + }, + { + "epoch": 1.2476127026426826, + "grad_norm": 0.4869736837475569, + "learning_rate": 8.693200290907525e-06, + "loss": 0.0484, + "step": 2809 + }, + { + "epoch": 1.2480568509882302, + "grad_norm": 0.3824351180128345, + "learning_rate": 8.691893245664867e-06, + "loss": 0.0352, + "step": 2810 + }, + { + "epoch": 1.2485009993337775, + "grad_norm": 0.5417070708808928, + "learning_rate": 8.690585645467937e-06, + "loss": 0.0447, + "step": 2811 + }, + { + "epoch": 1.248945147679325, + "grad_norm": 0.5086804337705053, + "learning_rate": 8.689277490513295e-06, + "loss": 0.05, + "step": 2812 + }, + { + "epoch": 1.2493892960248723, + "grad_norm": 0.4431550852814911, + "learning_rate": 8.687968780997576e-06, + "loss": 0.0635, + "step": 2813 + }, + { + "epoch": 1.2498334443704198, + "grad_norm": 0.3783726536150246, + "learning_rate": 8.686659517117501e-06, + "loss": 0.0384, + "step": 2814 + }, + { + "epoch": 1.250277592715967, + "grad_norm": 0.5331217000478606, + "learning_rate": 8.685349699069875e-06, + "loss": 0.0409, + "step": 2815 + }, + { + "epoch": 1.2507217410615146, + "grad_norm": 0.38074858661235217, + "learning_rate": 8.684039327051586e-06, + "loss": 0.038, + "step": 2816 + }, + { + "epoch": 1.251165889407062, + "grad_norm": 0.6121662418691846, + "learning_rate": 8.682728401259606e-06, + "loss": 0.0556, + "step": 2817 + }, + { + "epoch": 1.2516100377526094, + "grad_norm": 0.8655438293508221, + "learning_rate": 8.681416921890988e-06, + "loss": 0.0559, + "step": 2818 + }, + { + "epoch": 1.2520541860981567, + "grad_norm": 0.5646419752199546, + "learning_rate": 8.680104889142871e-06, + "loss": 0.0546, + "step": 2819 + }, + { + "epoch": 1.2524983344437042, + "grad_norm": 0.47189997209382956, + "learning_rate": 8.678792303212474e-06, + "loss": 0.038, + "step": 2820 + }, + { + "epoch": 1.2529424827892517, + "grad_norm": 0.5347473900252707, + "learning_rate": 8.677479164297102e-06, + "loss": 0.0506, + "step": 2821 + }, + { + "epoch": 1.253386631134799, + "grad_norm": 0.6752833223029766, + "learning_rate": 8.676165472594145e-06, + "loss": 0.0553, + "step": 2822 + }, + { + "epoch": 1.2538307794803465, + "grad_norm": 0.6290436949126601, + "learning_rate": 8.674851228301066e-06, + "loss": 0.0628, + "step": 2823 + }, + { + "epoch": 1.2542749278258938, + "grad_norm": 0.5190239487706246, + "learning_rate": 8.673536431615426e-06, + "loss": 0.053, + "step": 2824 + }, + { + "epoch": 1.2547190761714413, + "grad_norm": 0.5039759155119382, + "learning_rate": 8.672221082734857e-06, + "loss": 0.0538, + "step": 2825 + }, + { + "epoch": 1.2551632245169886, + "grad_norm": 0.5007524426018746, + "learning_rate": 8.670905181857078e-06, + "loss": 0.0379, + "step": 2826 + }, + { + "epoch": 1.255607372862536, + "grad_norm": 0.4471624712255033, + "learning_rate": 8.669588729179895e-06, + "loss": 0.0425, + "step": 2827 + }, + { + "epoch": 1.2560515212080836, + "grad_norm": 0.6483988178781206, + "learning_rate": 8.668271724901188e-06, + "loss": 0.0667, + "step": 2828 + }, + { + "epoch": 1.256495669553631, + "grad_norm": 0.7074590212609329, + "learning_rate": 8.666954169218929e-06, + "loss": 0.0644, + "step": 2829 + }, + { + "epoch": 1.2569398178991782, + "grad_norm": 0.4799067719368653, + "learning_rate": 8.665636062331166e-06, + "loss": 0.0516, + "step": 2830 + }, + { + "epoch": 1.2573839662447257, + "grad_norm": 0.4519642992082907, + "learning_rate": 8.664317404436036e-06, + "loss": 0.0388, + "step": 2831 + }, + { + "epoch": 1.2578281145902732, + "grad_norm": 1.9317339882104076, + "learning_rate": 8.662998195731755e-06, + "loss": 0.0445, + "step": 2832 + }, + { + "epoch": 1.2582722629358205, + "grad_norm": 0.9801917673057263, + "learning_rate": 8.661678436416621e-06, + "loss": 0.0609, + "step": 2833 + }, + { + "epoch": 1.258716411281368, + "grad_norm": 0.4629508087617577, + "learning_rate": 8.660358126689015e-06, + "loss": 0.046, + "step": 2834 + }, + { + "epoch": 1.2591605596269153, + "grad_norm": 0.55871143511484, + "learning_rate": 8.659037266747405e-06, + "loss": 0.0669, + "step": 2835 + }, + { + "epoch": 1.2596047079724628, + "grad_norm": 0.4373848923476879, + "learning_rate": 8.65771585679034e-06, + "loss": 0.0365, + "step": 2836 + }, + { + "epoch": 1.2600488563180101, + "grad_norm": 0.4620726535649706, + "learning_rate": 8.656393897016446e-06, + "loss": 0.0508, + "step": 2837 + }, + { + "epoch": 1.2604930046635576, + "grad_norm": 0.7076034863902437, + "learning_rate": 8.655071387624439e-06, + "loss": 0.066, + "step": 2838 + }, + { + "epoch": 1.2609371530091051, + "grad_norm": 0.7170929705949399, + "learning_rate": 8.653748328813112e-06, + "loss": 0.0663, + "step": 2839 + }, + { + "epoch": 1.2613813013546524, + "grad_norm": 0.5278820360047365, + "learning_rate": 8.652424720781346e-06, + "loss": 0.0703, + "step": 2840 + }, + { + "epoch": 1.2618254497002, + "grad_norm": 0.6581330490442558, + "learning_rate": 8.6511005637281e-06, + "loss": 0.0475, + "step": 2841 + }, + { + "epoch": 1.2622695980457472, + "grad_norm": 0.5785660781607422, + "learning_rate": 8.649775857852419e-06, + "loss": 0.0539, + "step": 2842 + }, + { + "epoch": 1.2627137463912947, + "grad_norm": 0.7193486412721348, + "learning_rate": 8.648450603353427e-06, + "loss": 0.0527, + "step": 2843 + }, + { + "epoch": 1.263157894736842, + "grad_norm": 0.4879073594292654, + "learning_rate": 8.647124800430332e-06, + "loss": 0.0462, + "step": 2844 + }, + { + "epoch": 1.2636020430823895, + "grad_norm": 0.6283549482710774, + "learning_rate": 8.645798449282427e-06, + "loss": 0.047, + "step": 2845 + }, + { + "epoch": 1.264046191427937, + "grad_norm": 0.590026590546146, + "learning_rate": 8.644471550109084e-06, + "loss": 0.0417, + "step": 2846 + }, + { + "epoch": 1.2644903397734844, + "grad_norm": 0.5727280121631517, + "learning_rate": 8.643144103109757e-06, + "loss": 0.0461, + "step": 2847 + }, + { + "epoch": 1.2649344881190316, + "grad_norm": 0.5188962100480214, + "learning_rate": 8.641816108483987e-06, + "loss": 0.0528, + "step": 2848 + }, + { + "epoch": 1.2653786364645792, + "grad_norm": 0.6762837917534186, + "learning_rate": 8.64048756643139e-06, + "loss": 0.0551, + "step": 2849 + }, + { + "epoch": 1.2658227848101267, + "grad_norm": 0.6063186060719946, + "learning_rate": 8.639158477151673e-06, + "loss": 0.0529, + "step": 2850 + }, + { + "epoch": 1.266266933155674, + "grad_norm": 0.8416294217857067, + "learning_rate": 8.637828840844615e-06, + "loss": 0.0605, + "step": 2851 + }, + { + "epoch": 1.2667110815012215, + "grad_norm": 0.42460216144695934, + "learning_rate": 8.636498657710091e-06, + "loss": 0.0406, + "step": 2852 + }, + { + "epoch": 1.2671552298467688, + "grad_norm": 0.45348809902081694, + "learning_rate": 8.635167927948041e-06, + "loss": 0.0423, + "step": 2853 + }, + { + "epoch": 1.2675993781923163, + "grad_norm": 0.5925945869143667, + "learning_rate": 8.633836651758502e-06, + "loss": 0.0422, + "step": 2854 + }, + { + "epoch": 1.2680435265378636, + "grad_norm": 0.474702929089884, + "learning_rate": 8.632504829341588e-06, + "loss": 0.0365, + "step": 2855 + }, + { + "epoch": 1.268487674883411, + "grad_norm": 0.8276698250673459, + "learning_rate": 8.63117246089749e-06, + "loss": 0.0477, + "step": 2856 + }, + { + "epoch": 1.2689318232289586, + "grad_norm": 0.4891929607601022, + "learning_rate": 8.62983954662649e-06, + "loss": 0.044, + "step": 2857 + }, + { + "epoch": 1.2693759715745059, + "grad_norm": 0.6525774912230711, + "learning_rate": 8.628506086728947e-06, + "loss": 0.0651, + "step": 2858 + }, + { + "epoch": 1.2698201199200532, + "grad_norm": 0.6594678233990844, + "learning_rate": 8.6271720814053e-06, + "loss": 0.0381, + "step": 2859 + }, + { + "epoch": 1.2702642682656007, + "grad_norm": 0.3241646392485162, + "learning_rate": 8.625837530856074e-06, + "loss": 0.0309, + "step": 2860 + }, + { + "epoch": 1.2707084166111482, + "grad_norm": 0.7122663241070569, + "learning_rate": 8.624502435281875e-06, + "loss": 0.0617, + "step": 2861 + }, + { + "epoch": 1.2711525649566955, + "grad_norm": 0.7005320306757822, + "learning_rate": 8.623166794883393e-06, + "loss": 0.0581, + "step": 2862 + }, + { + "epoch": 1.271596713302243, + "grad_norm": 0.5328754180731429, + "learning_rate": 8.621830609861392e-06, + "loss": 0.0493, + "step": 2863 + }, + { + "epoch": 1.2720408616477903, + "grad_norm": 0.6269919214433372, + "learning_rate": 8.620493880416727e-06, + "loss": 0.0516, + "step": 2864 + }, + { + "epoch": 1.2724850099933378, + "grad_norm": 0.5844430878482636, + "learning_rate": 8.619156606750329e-06, + "loss": 0.0725, + "step": 2865 + }, + { + "epoch": 1.272929158338885, + "grad_norm": 0.4737556741912441, + "learning_rate": 8.617818789063217e-06, + "loss": 0.0421, + "step": 2866 + }, + { + "epoch": 1.2733733066844326, + "grad_norm": 0.7524484026464038, + "learning_rate": 8.616480427556484e-06, + "loss": 0.0708, + "step": 2867 + }, + { + "epoch": 1.2738174550299801, + "grad_norm": 0.8150405412731492, + "learning_rate": 8.61514152243131e-06, + "loss": 0.0704, + "step": 2868 + }, + { + "epoch": 1.2742616033755274, + "grad_norm": 0.6289008429543912, + "learning_rate": 8.613802073888953e-06, + "loss": 0.0544, + "step": 2869 + }, + { + "epoch": 1.274705751721075, + "grad_norm": 0.49267955313551814, + "learning_rate": 8.612462082130758e-06, + "loss": 0.0423, + "step": 2870 + }, + { + "epoch": 1.2751499000666222, + "grad_norm": 0.8023016461317273, + "learning_rate": 8.611121547358146e-06, + "loss": 0.0581, + "step": 2871 + }, + { + "epoch": 1.2755940484121697, + "grad_norm": 0.5971368904341509, + "learning_rate": 8.609780469772623e-06, + "loss": 0.0555, + "step": 2872 + }, + { + "epoch": 1.276038196757717, + "grad_norm": 0.497140537686175, + "learning_rate": 8.608438849575777e-06, + "loss": 0.0511, + "step": 2873 + }, + { + "epoch": 1.2764823451032645, + "grad_norm": 0.5696597646442083, + "learning_rate": 8.607096686969274e-06, + "loss": 0.0592, + "step": 2874 + }, + { + "epoch": 1.276926493448812, + "grad_norm": 0.7494228951484337, + "learning_rate": 8.605753982154865e-06, + "loss": 0.0486, + "step": 2875 + }, + { + "epoch": 1.2773706417943593, + "grad_norm": 0.5764143358267766, + "learning_rate": 8.604410735334383e-06, + "loss": 0.0551, + "step": 2876 + }, + { + "epoch": 1.2778147901399066, + "grad_norm": 0.6011073566193142, + "learning_rate": 8.603066946709739e-06, + "loss": 0.0567, + "step": 2877 + }, + { + "epoch": 1.2782589384854541, + "grad_norm": 0.44537454001664567, + "learning_rate": 8.601722616482927e-06, + "loss": 0.0473, + "step": 2878 + }, + { + "epoch": 1.2787030868310016, + "grad_norm": 0.45558471841390763, + "learning_rate": 8.600377744856024e-06, + "loss": 0.0577, + "step": 2879 + }, + { + "epoch": 1.279147235176549, + "grad_norm": 0.5192384143571988, + "learning_rate": 8.599032332031185e-06, + "loss": 0.0549, + "step": 2880 + }, + { + "epoch": 1.2795913835220964, + "grad_norm": 0.641251450966757, + "learning_rate": 8.59768637821065e-06, + "loss": 0.0599, + "step": 2881 + }, + { + "epoch": 1.2800355318676437, + "grad_norm": 0.5747913363539975, + "learning_rate": 8.596339883596738e-06, + "loss": 0.0546, + "step": 2882 + }, + { + "epoch": 1.2804796802131913, + "grad_norm": 0.5416056259419014, + "learning_rate": 8.594992848391852e-06, + "loss": 0.0536, + "step": 2883 + }, + { + "epoch": 1.2809238285587385, + "grad_norm": 0.6028308551659004, + "learning_rate": 8.59364527279847e-06, + "loss": 0.0699, + "step": 2884 + }, + { + "epoch": 1.281367976904286, + "grad_norm": 0.4368044045668833, + "learning_rate": 8.59229715701916e-06, + "loss": 0.0396, + "step": 2885 + }, + { + "epoch": 1.2818121252498336, + "grad_norm": 0.5104164604694876, + "learning_rate": 8.590948501256564e-06, + "loss": 0.04, + "step": 2886 + }, + { + "epoch": 1.2822562735953809, + "grad_norm": 0.7612641229651903, + "learning_rate": 8.58959930571341e-06, + "loss": 0.0624, + "step": 2887 + }, + { + "epoch": 1.2827004219409281, + "grad_norm": 0.5035114097597291, + "learning_rate": 8.588249570592502e-06, + "loss": 0.0437, + "step": 2888 + }, + { + "epoch": 1.2831445702864757, + "grad_norm": 0.499768070561634, + "learning_rate": 8.586899296096731e-06, + "loss": 0.0378, + "step": 2889 + }, + { + "epoch": 1.2835887186320232, + "grad_norm": 0.4925272487243439, + "learning_rate": 8.585548482429064e-06, + "loss": 0.0406, + "step": 2890 + }, + { + "epoch": 1.2840328669775705, + "grad_norm": 0.5079455242566149, + "learning_rate": 8.584197129792553e-06, + "loss": 0.049, + "step": 2891 + }, + { + "epoch": 1.284477015323118, + "grad_norm": 0.5999702910432165, + "learning_rate": 8.58284523839033e-06, + "loss": 0.0525, + "step": 2892 + }, + { + "epoch": 1.2849211636686653, + "grad_norm": 0.5762429387843919, + "learning_rate": 8.581492808425604e-06, + "loss": 0.0412, + "step": 2893 + }, + { + "epoch": 1.2853653120142128, + "grad_norm": 0.5170549640705353, + "learning_rate": 8.58013984010167e-06, + "loss": 0.0426, + "step": 2894 + }, + { + "epoch": 1.28580946035976, + "grad_norm": 0.44644738551782165, + "learning_rate": 8.578786333621902e-06, + "loss": 0.0449, + "step": 2895 + }, + { + "epoch": 1.2862536087053076, + "grad_norm": 0.49787403784527023, + "learning_rate": 8.577432289189755e-06, + "loss": 0.053, + "step": 2896 + }, + { + "epoch": 1.286697757050855, + "grad_norm": 0.4201343576902461, + "learning_rate": 8.576077707008766e-06, + "loss": 0.0548, + "step": 2897 + }, + { + "epoch": 1.2871419053964024, + "grad_norm": 0.43223595023539274, + "learning_rate": 8.57472258728255e-06, + "loss": 0.0414, + "step": 2898 + }, + { + "epoch": 1.2875860537419497, + "grad_norm": 0.45140599595731223, + "learning_rate": 8.573366930214807e-06, + "loss": 0.0512, + "step": 2899 + }, + { + "epoch": 1.2880302020874972, + "grad_norm": 0.7864322313017508, + "learning_rate": 8.57201073600931e-06, + "loss": 0.0611, + "step": 2900 + }, + { + "epoch": 1.2884743504330447, + "grad_norm": 0.4533939072001686, + "learning_rate": 8.570654004869924e-06, + "loss": 0.0455, + "step": 2901 + }, + { + "epoch": 1.288918498778592, + "grad_norm": 0.5822538472138225, + "learning_rate": 8.569296737000586e-06, + "loss": 0.0689, + "step": 2902 + }, + { + "epoch": 1.2893626471241395, + "grad_norm": 0.6558805640817998, + "learning_rate": 8.567938932605315e-06, + "loss": 0.0716, + "step": 2903 + }, + { + "epoch": 1.2898067954696868, + "grad_norm": 0.6962929137245736, + "learning_rate": 8.566580591888216e-06, + "loss": 0.0456, + "step": 2904 + }, + { + "epoch": 1.2902509438152343, + "grad_norm": 0.5885420821356683, + "learning_rate": 8.565221715053467e-06, + "loss": 0.0519, + "step": 2905 + }, + { + "epoch": 1.2906950921607816, + "grad_norm": 0.5367742959382944, + "learning_rate": 8.563862302305333e-06, + "loss": 0.0385, + "step": 2906 + }, + { + "epoch": 1.2911392405063291, + "grad_norm": 0.7323581959894143, + "learning_rate": 8.562502353848155e-06, + "loss": 0.0659, + "step": 2907 + }, + { + "epoch": 1.2915833888518766, + "grad_norm": 0.5051887556869058, + "learning_rate": 8.561141869886356e-06, + "loss": 0.0559, + "step": 2908 + }, + { + "epoch": 1.292027537197424, + "grad_norm": 0.5491304399843213, + "learning_rate": 8.55978085062444e-06, + "loss": 0.0639, + "step": 2909 + }, + { + "epoch": 1.2924716855429714, + "grad_norm": 0.8492250674519668, + "learning_rate": 8.558419296266995e-06, + "loss": 0.0636, + "step": 2910 + }, + { + "epoch": 1.2929158338885187, + "grad_norm": 0.3882329610931263, + "learning_rate": 8.557057207018681e-06, + "loss": 0.0487, + "step": 2911 + }, + { + "epoch": 1.2933599822340662, + "grad_norm": 0.625233318674613, + "learning_rate": 8.555694583084244e-06, + "loss": 0.0465, + "step": 2912 + }, + { + "epoch": 1.2938041305796135, + "grad_norm": 0.4026058166802596, + "learning_rate": 8.554331424668511e-06, + "loss": 0.0354, + "step": 2913 + }, + { + "epoch": 1.294248278925161, + "grad_norm": 0.5634837852049973, + "learning_rate": 8.552967731976388e-06, + "loss": 0.0541, + "step": 2914 + }, + { + "epoch": 1.2946924272707085, + "grad_norm": 0.48751917705852116, + "learning_rate": 8.551603505212862e-06, + "loss": 0.0322, + "step": 2915 + }, + { + "epoch": 1.2951365756162558, + "grad_norm": 0.5156212240116493, + "learning_rate": 8.550238744582997e-06, + "loss": 0.0489, + "step": 2916 + }, + { + "epoch": 1.2955807239618031, + "grad_norm": 0.6223211171347353, + "learning_rate": 8.548873450291939e-06, + "loss": 0.0457, + "step": 2917 + }, + { + "epoch": 1.2960248723073506, + "grad_norm": 0.47672895894969763, + "learning_rate": 8.547507622544916e-06, + "loss": 0.0463, + "step": 2918 + }, + { + "epoch": 1.2964690206528982, + "grad_norm": 0.4421458887458943, + "learning_rate": 8.546141261547238e-06, + "loss": 0.045, + "step": 2919 + }, + { + "epoch": 1.2969131689984454, + "grad_norm": 0.848623824582431, + "learning_rate": 8.544774367504291e-06, + "loss": 0.0605, + "step": 2920 + }, + { + "epoch": 1.297357317343993, + "grad_norm": 0.554513822402812, + "learning_rate": 8.54340694062154e-06, + "loss": 0.0518, + "step": 2921 + }, + { + "epoch": 1.2978014656895402, + "grad_norm": 0.5649747315400963, + "learning_rate": 8.542038981104532e-06, + "loss": 0.0565, + "step": 2922 + }, + { + "epoch": 1.2982456140350878, + "grad_norm": 0.42021656791448186, + "learning_rate": 8.540670489158899e-06, + "loss": 0.0426, + "step": 2923 + }, + { + "epoch": 1.298689762380635, + "grad_norm": 0.6402654468272049, + "learning_rate": 8.539301464990345e-06, + "loss": 0.0543, + "step": 2924 + }, + { + "epoch": 1.2991339107261826, + "grad_norm": 0.3618130143994902, + "learning_rate": 8.53793190880466e-06, + "loss": 0.038, + "step": 2925 + }, + { + "epoch": 1.29957805907173, + "grad_norm": 0.3384505689514398, + "learning_rate": 8.536561820807707e-06, + "loss": 0.0346, + "step": 2926 + }, + { + "epoch": 1.3000222074172774, + "grad_norm": 0.46188012553966973, + "learning_rate": 8.535191201205439e-06, + "loss": 0.0514, + "step": 2927 + }, + { + "epoch": 1.3004663557628247, + "grad_norm": 0.5300445785864025, + "learning_rate": 8.533820050203881e-06, + "loss": 0.0483, + "step": 2928 + }, + { + "epoch": 1.3009105041083722, + "grad_norm": 0.5536409349327417, + "learning_rate": 8.532448368009139e-06, + "loss": 0.0489, + "step": 2929 + }, + { + "epoch": 1.3013546524539197, + "grad_norm": 0.371658656980648, + "learning_rate": 8.531076154827402e-06, + "loss": 0.0445, + "step": 2930 + }, + { + "epoch": 1.301798800799467, + "grad_norm": 0.5129545567634131, + "learning_rate": 8.529703410864938e-06, + "loss": 0.0341, + "step": 2931 + }, + { + "epoch": 1.3022429491450145, + "grad_norm": 0.6790917670823785, + "learning_rate": 8.52833013632809e-06, + "loss": 0.0604, + "step": 2932 + }, + { + "epoch": 1.3026870974905618, + "grad_norm": 0.4758905378740349, + "learning_rate": 8.526956331423289e-06, + "loss": 0.0458, + "step": 2933 + }, + { + "epoch": 1.3031312458361093, + "grad_norm": 0.5428940197749453, + "learning_rate": 8.525581996357036e-06, + "loss": 0.047, + "step": 2934 + }, + { + "epoch": 1.3035753941816566, + "grad_norm": 0.5546360540394489, + "learning_rate": 8.52420713133592e-06, + "loss": 0.0585, + "step": 2935 + }, + { + "epoch": 1.304019542527204, + "grad_norm": 0.8256368560390267, + "learning_rate": 8.522831736566607e-06, + "loss": 0.0592, + "step": 2936 + }, + { + "epoch": 1.3044636908727516, + "grad_norm": 0.6807172008845478, + "learning_rate": 8.521455812255843e-06, + "loss": 0.0727, + "step": 2937 + }, + { + "epoch": 1.304907839218299, + "grad_norm": 0.5972239265610148, + "learning_rate": 8.52007935861045e-06, + "loss": 0.0531, + "step": 2938 + }, + { + "epoch": 1.3053519875638464, + "grad_norm": 0.5756047785651232, + "learning_rate": 8.518702375837335e-06, + "loss": 0.0484, + "step": 2939 + }, + { + "epoch": 1.3057961359093937, + "grad_norm": 0.5254079002689842, + "learning_rate": 8.51732486414348e-06, + "loss": 0.0551, + "step": 2940 + }, + { + "epoch": 1.3062402842549412, + "grad_norm": 0.5057686654115676, + "learning_rate": 8.515946823735948e-06, + "loss": 0.0555, + "step": 2941 + }, + { + "epoch": 1.3066844326004885, + "grad_norm": 0.5425819759814885, + "learning_rate": 8.514568254821884e-06, + "loss": 0.0461, + "step": 2942 + }, + { + "epoch": 1.307128580946036, + "grad_norm": 0.4919842279881122, + "learning_rate": 8.51318915760851e-06, + "loss": 0.0351, + "step": 2943 + }, + { + "epoch": 1.3075727292915835, + "grad_norm": 0.5651189893610779, + "learning_rate": 8.511809532303126e-06, + "loss": 0.0485, + "step": 2944 + }, + { + "epoch": 1.3080168776371308, + "grad_norm": 0.5135664399251805, + "learning_rate": 8.510429379113114e-06, + "loss": 0.0345, + "step": 2945 + }, + { + "epoch": 1.308461025982678, + "grad_norm": 0.698576525158577, + "learning_rate": 8.509048698245934e-06, + "loss": 0.0541, + "step": 2946 + }, + { + "epoch": 1.3089051743282256, + "grad_norm": 0.5914910850001974, + "learning_rate": 8.507667489909126e-06, + "loss": 0.0422, + "step": 2947 + }, + { + "epoch": 1.3093493226737731, + "grad_norm": 0.39464116666974663, + "learning_rate": 8.506285754310311e-06, + "loss": 0.0395, + "step": 2948 + }, + { + "epoch": 1.3097934710193204, + "grad_norm": 0.41501372577846396, + "learning_rate": 8.504903491657185e-06, + "loss": 0.0392, + "step": 2949 + }, + { + "epoch": 1.310237619364868, + "grad_norm": 0.5710008833310989, + "learning_rate": 8.503520702157527e-06, + "loss": 0.0486, + "step": 2950 + }, + { + "epoch": 1.3106817677104152, + "grad_norm": 0.912615051335769, + "learning_rate": 8.502137386019191e-06, + "loss": 0.0636, + "step": 2951 + }, + { + "epoch": 1.3111259160559627, + "grad_norm": 0.543540441011416, + "learning_rate": 8.500753543450118e-06, + "loss": 0.062, + "step": 2952 + }, + { + "epoch": 1.31157006440151, + "grad_norm": 0.39754298101965646, + "learning_rate": 8.499369174658318e-06, + "loss": 0.0383, + "step": 2953 + }, + { + "epoch": 1.3120142127470575, + "grad_norm": 0.5539227325166909, + "learning_rate": 8.497984279851888e-06, + "loss": 0.0499, + "step": 2954 + }, + { + "epoch": 1.312458361092605, + "grad_norm": 0.599023615011808, + "learning_rate": 8.496598859238997e-06, + "loss": 0.0472, + "step": 2955 + }, + { + "epoch": 1.3129025094381523, + "grad_norm": 0.4220558992968304, + "learning_rate": 8.495212913027906e-06, + "loss": 0.0446, + "step": 2956 + }, + { + "epoch": 1.3133466577836996, + "grad_norm": 0.49593231969327095, + "learning_rate": 8.493826441426937e-06, + "loss": 0.0636, + "step": 2957 + }, + { + "epoch": 1.3137908061292471, + "grad_norm": 0.5382980257808322, + "learning_rate": 8.492439444644506e-06, + "loss": 0.0462, + "step": 2958 + }, + { + "epoch": 1.3142349544747947, + "grad_norm": 0.4925896725925621, + "learning_rate": 8.4910519228891e-06, + "loss": 0.0602, + "step": 2959 + }, + { + "epoch": 1.314679102820342, + "grad_norm": 1.1877590532157762, + "learning_rate": 8.489663876369288e-06, + "loss": 0.0723, + "step": 2960 + }, + { + "epoch": 1.3151232511658895, + "grad_norm": 0.6920312294903902, + "learning_rate": 8.488275305293715e-06, + "loss": 0.0531, + "step": 2961 + }, + { + "epoch": 1.3155673995114368, + "grad_norm": 0.36761567736761985, + "learning_rate": 8.486886209871108e-06, + "loss": 0.0397, + "step": 2962 + }, + { + "epoch": 1.3160115478569843, + "grad_norm": 0.55057942024177, + "learning_rate": 8.485496590310274e-06, + "loss": 0.0591, + "step": 2963 + }, + { + "epoch": 1.3164556962025316, + "grad_norm": 0.5710852175463684, + "learning_rate": 8.484106446820094e-06, + "loss": 0.0471, + "step": 2964 + }, + { + "epoch": 1.316899844548079, + "grad_norm": 0.5828688111433763, + "learning_rate": 8.482715779609526e-06, + "loss": 0.0551, + "step": 2965 + }, + { + "epoch": 1.3173439928936266, + "grad_norm": 0.531963458189058, + "learning_rate": 8.481324588887619e-06, + "loss": 0.0504, + "step": 2966 + }, + { + "epoch": 1.3177881412391739, + "grad_norm": 0.9382025539874739, + "learning_rate": 8.47993287486349e-06, + "loss": 0.0778, + "step": 2967 + }, + { + "epoch": 1.3182322895847212, + "grad_norm": 0.5014993856992925, + "learning_rate": 8.478540637746334e-06, + "loss": 0.0635, + "step": 2968 + }, + { + "epoch": 1.3186764379302687, + "grad_norm": 0.396172698958811, + "learning_rate": 8.477147877745431e-06, + "loss": 0.0378, + "step": 2969 + }, + { + "epoch": 1.3191205862758162, + "grad_norm": 0.5285271119946755, + "learning_rate": 8.475754595070134e-06, + "loss": 0.0544, + "step": 2970 + }, + { + "epoch": 1.3195647346213635, + "grad_norm": 0.4032933677589254, + "learning_rate": 8.474360789929881e-06, + "loss": 0.0431, + "step": 2971 + }, + { + "epoch": 1.320008882966911, + "grad_norm": 0.45143103762565806, + "learning_rate": 8.47296646253418e-06, + "loss": 0.0509, + "step": 2972 + }, + { + "epoch": 1.3204530313124583, + "grad_norm": 0.5252154450521924, + "learning_rate": 8.471571613092626e-06, + "loss": 0.0594, + "step": 2973 + }, + { + "epoch": 1.3208971796580058, + "grad_norm": 0.34642633471428613, + "learning_rate": 8.470176241814886e-06, + "loss": 0.027, + "step": 2974 + }, + { + "epoch": 1.321341328003553, + "grad_norm": 0.4784318360661084, + "learning_rate": 8.46878034891071e-06, + "loss": 0.0444, + "step": 2975 + }, + { + "epoch": 1.3217854763491006, + "grad_norm": 0.580781905109618, + "learning_rate": 8.467383934589923e-06, + "loss": 0.043, + "step": 2976 + }, + { + "epoch": 1.322229624694648, + "grad_norm": 0.46709485088977415, + "learning_rate": 8.465986999062427e-06, + "loss": 0.0485, + "step": 2977 + }, + { + "epoch": 1.3226737730401954, + "grad_norm": 0.7439128060207424, + "learning_rate": 8.464589542538213e-06, + "loss": 0.0566, + "step": 2978 + }, + { + "epoch": 1.323117921385743, + "grad_norm": 0.5532502253043584, + "learning_rate": 8.463191565227336e-06, + "loss": 0.0486, + "step": 2979 + }, + { + "epoch": 1.3235620697312902, + "grad_norm": 0.46900654723685586, + "learning_rate": 8.461793067339936e-06, + "loss": 0.0384, + "step": 2980 + }, + { + "epoch": 1.3240062180768377, + "grad_norm": 0.4715987465734639, + "learning_rate": 8.460394049086232e-06, + "loss": 0.0467, + "step": 2981 + }, + { + "epoch": 1.324450366422385, + "grad_norm": 0.4464995773383881, + "learning_rate": 8.458994510676523e-06, + "loss": 0.0392, + "step": 2982 + }, + { + "epoch": 1.3248945147679325, + "grad_norm": 0.5120257592364733, + "learning_rate": 8.457594452321178e-06, + "loss": 0.0468, + "step": 2983 + }, + { + "epoch": 1.32533866311348, + "grad_norm": 0.5011205376177407, + "learning_rate": 8.456193874230656e-06, + "loss": 0.0479, + "step": 2984 + }, + { + "epoch": 1.3257828114590273, + "grad_norm": 0.4942109449168538, + "learning_rate": 8.454792776615482e-06, + "loss": 0.0456, + "step": 2985 + }, + { + "epoch": 1.3262269598045746, + "grad_norm": 0.5223005309985502, + "learning_rate": 8.453391159686268e-06, + "loss": 0.0578, + "step": 2986 + }, + { + "epoch": 1.3266711081501221, + "grad_norm": 0.4751851347230112, + "learning_rate": 8.4519890236537e-06, + "loss": 0.0496, + "step": 2987 + }, + { + "epoch": 1.3271152564956696, + "grad_norm": 0.46876537416517666, + "learning_rate": 8.450586368728541e-06, + "loss": 0.0556, + "step": 2988 + }, + { + "epoch": 1.327559404841217, + "grad_norm": 0.5342196778194671, + "learning_rate": 8.449183195121638e-06, + "loss": 0.0462, + "step": 2989 + }, + { + "epoch": 1.3280035531867644, + "grad_norm": 0.7891053866526658, + "learning_rate": 8.447779503043907e-06, + "loss": 0.0695, + "step": 2990 + }, + { + "epoch": 1.3284477015323117, + "grad_norm": 0.4837197404803745, + "learning_rate": 8.44637529270635e-06, + "loss": 0.0416, + "step": 2991 + }, + { + "epoch": 1.3288918498778592, + "grad_norm": 0.4822831093408252, + "learning_rate": 8.444970564320044e-06, + "loss": 0.0492, + "step": 2992 + }, + { + "epoch": 1.3293359982234065, + "grad_norm": 1.1468745158193232, + "learning_rate": 8.443565318096141e-06, + "loss": 0.0829, + "step": 2993 + }, + { + "epoch": 1.329780146568954, + "grad_norm": 0.3846877899730632, + "learning_rate": 8.442159554245875e-06, + "loss": 0.0383, + "step": 2994 + }, + { + "epoch": 1.3302242949145016, + "grad_norm": 0.48249463397520415, + "learning_rate": 8.440753272980555e-06, + "loss": 0.0487, + "step": 2995 + }, + { + "epoch": 1.3306684432600488, + "grad_norm": 0.5300221622862935, + "learning_rate": 8.439346474511572e-06, + "loss": 0.0506, + "step": 2996 + }, + { + "epoch": 1.3311125916055961, + "grad_norm": 0.6183171953601485, + "learning_rate": 8.437939159050388e-06, + "loss": 0.063, + "step": 2997 + }, + { + "epoch": 1.3315567399511437, + "grad_norm": 0.5988476945915722, + "learning_rate": 8.43653132680855e-06, + "loss": 0.0625, + "step": 2998 + }, + { + "epoch": 1.3320008882966912, + "grad_norm": 0.5556515413344068, + "learning_rate": 8.435122977997675e-06, + "loss": 0.0491, + "step": 2999 + }, + { + "epoch": 1.3324450366422385, + "grad_norm": 0.4154893093213718, + "learning_rate": 8.433714112829464e-06, + "loss": 0.0431, + "step": 3000 + }, + { + "epoch": 1.332889184987786, + "grad_norm": 0.6020512275698489, + "learning_rate": 8.432304731515695e-06, + "loss": 0.0596, + "step": 3001 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.41304989339620046, + "learning_rate": 8.430894834268218e-06, + "loss": 0.0436, + "step": 3002 + }, + { + "epoch": 1.3337774816788808, + "grad_norm": 0.5204666086414845, + "learning_rate": 8.429484421298968e-06, + "loss": 0.0581, + "step": 3003 + }, + { + "epoch": 1.334221630024428, + "grad_norm": 0.5429684889982698, + "learning_rate": 8.428073492819953e-06, + "loss": 0.0404, + "step": 3004 + }, + { + "epoch": 1.3346657783699756, + "grad_norm": 0.4188357634818928, + "learning_rate": 8.426662049043258e-06, + "loss": 0.0422, + "step": 3005 + }, + { + "epoch": 1.335109926715523, + "grad_norm": 0.40403921511813046, + "learning_rate": 8.42525009018105e-06, + "loss": 0.0376, + "step": 3006 + }, + { + "epoch": 1.3355540750610704, + "grad_norm": 0.40629767747303225, + "learning_rate": 8.423837616445568e-06, + "loss": 0.0404, + "step": 3007 + }, + { + "epoch": 1.335998223406618, + "grad_norm": 0.4159409808633564, + "learning_rate": 8.42242462804913e-06, + "loss": 0.0394, + "step": 3008 + }, + { + "epoch": 1.3364423717521652, + "grad_norm": 0.7041981820609958, + "learning_rate": 8.421011125204134e-06, + "loss": 0.0693, + "step": 3009 + }, + { + "epoch": 1.3368865200977127, + "grad_norm": 0.7951485702237924, + "learning_rate": 8.419597108123054e-06, + "loss": 0.0612, + "step": 3010 + }, + { + "epoch": 1.33733066844326, + "grad_norm": 0.3990758518161739, + "learning_rate": 8.418182577018438e-06, + "loss": 0.0511, + "step": 3011 + }, + { + "epoch": 1.3377748167888075, + "grad_norm": 0.9209585349240309, + "learning_rate": 8.416767532102918e-06, + "loss": 0.0416, + "step": 3012 + }, + { + "epoch": 1.338218965134355, + "grad_norm": 0.5459049872521075, + "learning_rate": 8.415351973589197e-06, + "loss": 0.0434, + "step": 3013 + }, + { + "epoch": 1.3386631134799023, + "grad_norm": 0.5057701190286418, + "learning_rate": 8.413935901690057e-06, + "loss": 0.0434, + "step": 3014 + }, + { + "epoch": 1.3391072618254496, + "grad_norm": 0.48056478669043734, + "learning_rate": 8.412519316618359e-06, + "loss": 0.0502, + "step": 3015 + }, + { + "epoch": 1.339551410170997, + "grad_norm": 0.45090544113428555, + "learning_rate": 8.411102218587039e-06, + "loss": 0.0419, + "step": 3016 + }, + { + "epoch": 1.3399955585165446, + "grad_norm": 0.4066569281524885, + "learning_rate": 8.40968460780911e-06, + "loss": 0.0368, + "step": 3017 + }, + { + "epoch": 1.340439706862092, + "grad_norm": 0.48568103858702, + "learning_rate": 8.408266484497664e-06, + "loss": 0.0416, + "step": 3018 + }, + { + "epoch": 1.3408838552076394, + "grad_norm": 0.5224793245797686, + "learning_rate": 8.406847848865871e-06, + "loss": 0.0573, + "step": 3019 + }, + { + "epoch": 1.3413280035531867, + "grad_norm": 0.5080673094422519, + "learning_rate": 8.405428701126973e-06, + "loss": 0.0496, + "step": 3020 + }, + { + "epoch": 1.3417721518987342, + "grad_norm": 0.5515622788542163, + "learning_rate": 8.404009041494292e-06, + "loss": 0.054, + "step": 3021 + }, + { + "epoch": 1.3422163002442815, + "grad_norm": 0.4992441781647522, + "learning_rate": 8.40258887018123e-06, + "loss": 0.0383, + "step": 3022 + }, + { + "epoch": 1.342660448589829, + "grad_norm": 0.5853495583783108, + "learning_rate": 8.40116818740126e-06, + "loss": 0.0419, + "step": 3023 + }, + { + "epoch": 1.3431045969353765, + "grad_norm": 0.47598229779455054, + "learning_rate": 8.399746993367936e-06, + "loss": 0.0342, + "step": 3024 + }, + { + "epoch": 1.3435487452809238, + "grad_norm": 0.46492013802658844, + "learning_rate": 8.398325288294886e-06, + "loss": 0.043, + "step": 3025 + }, + { + "epoch": 1.3439928936264711, + "grad_norm": 0.4287825450323698, + "learning_rate": 8.396903072395819e-06, + "loss": 0.0401, + "step": 3026 + }, + { + "epoch": 1.3444370419720186, + "grad_norm": 0.4586017747854982, + "learning_rate": 8.395480345884516e-06, + "loss": 0.0431, + "step": 3027 + }, + { + "epoch": 1.3448811903175661, + "grad_norm": 0.494703530250107, + "learning_rate": 8.39405710897484e-06, + "loss": 0.0462, + "step": 3028 + }, + { + "epoch": 1.3453253386631134, + "grad_norm": 0.5340552133978543, + "learning_rate": 8.392633361880724e-06, + "loss": 0.0567, + "step": 3029 + }, + { + "epoch": 1.345769487008661, + "grad_norm": 0.49912393023299867, + "learning_rate": 8.391209104816183e-06, + "loss": 0.0501, + "step": 3030 + }, + { + "epoch": 1.3462136353542082, + "grad_norm": 0.6779311071781129, + "learning_rate": 8.389784337995306e-06, + "loss": 0.0682, + "step": 3031 + }, + { + "epoch": 1.3466577836997558, + "grad_norm": 0.47717232493661027, + "learning_rate": 8.388359061632262e-06, + "loss": 0.043, + "step": 3032 + }, + { + "epoch": 1.347101932045303, + "grad_norm": 0.47326694390163015, + "learning_rate": 8.386933275941294e-06, + "loss": 0.0456, + "step": 3033 + }, + { + "epoch": 1.3475460803908506, + "grad_norm": 0.6662990359563103, + "learning_rate": 8.385506981136717e-06, + "loss": 0.0487, + "step": 3034 + }, + { + "epoch": 1.347990228736398, + "grad_norm": 0.718683780503916, + "learning_rate": 8.384080177432933e-06, + "loss": 0.0394, + "step": 3035 + }, + { + "epoch": 1.3484343770819454, + "grad_norm": 0.5289766475406907, + "learning_rate": 8.382652865044414e-06, + "loss": 0.0553, + "step": 3036 + }, + { + "epoch": 1.3488785254274926, + "grad_norm": 0.6441199621383914, + "learning_rate": 8.381225044185708e-06, + "loss": 0.0522, + "step": 3037 + }, + { + "epoch": 1.3493226737730402, + "grad_norm": 0.9166796511578611, + "learning_rate": 8.37979671507144e-06, + "loss": 0.0458, + "step": 3038 + }, + { + "epoch": 1.3497668221185877, + "grad_norm": 0.531028146563615, + "learning_rate": 8.378367877916313e-06, + "loss": 0.0522, + "step": 3039 + }, + { + "epoch": 1.350210970464135, + "grad_norm": 0.46742487240112696, + "learning_rate": 8.376938532935106e-06, + "loss": 0.0379, + "step": 3040 + }, + { + "epoch": 1.3506551188096825, + "grad_norm": 0.5443821001104316, + "learning_rate": 8.375508680342674e-06, + "loss": 0.0439, + "step": 3041 + }, + { + "epoch": 1.3510992671552298, + "grad_norm": 0.5727922217841197, + "learning_rate": 8.374078320353944e-06, + "loss": 0.0424, + "step": 3042 + }, + { + "epoch": 1.3515434155007773, + "grad_norm": 0.49458279067850347, + "learning_rate": 8.37264745318393e-06, + "loss": 0.0506, + "step": 3043 + }, + { + "epoch": 1.3519875638463246, + "grad_norm": 1.3831663223083857, + "learning_rate": 8.371216079047713e-06, + "loss": 0.0392, + "step": 3044 + }, + { + "epoch": 1.352431712191872, + "grad_norm": 0.4619345954223979, + "learning_rate": 8.369784198160451e-06, + "loss": 0.0413, + "step": 3045 + }, + { + "epoch": 1.3528758605374196, + "grad_norm": 0.4847101460239981, + "learning_rate": 8.368351810737383e-06, + "loss": 0.0389, + "step": 3046 + }, + { + "epoch": 1.3533200088829669, + "grad_norm": 0.5592056103162423, + "learning_rate": 8.366918916993817e-06, + "loss": 0.0527, + "step": 3047 + }, + { + "epoch": 1.3537641572285144, + "grad_norm": 0.3872649705998668, + "learning_rate": 8.365485517145145e-06, + "loss": 0.0324, + "step": 3048 + }, + { + "epoch": 1.3542083055740617, + "grad_norm": 0.4866737656887245, + "learning_rate": 8.364051611406829e-06, + "loss": 0.0438, + "step": 3049 + }, + { + "epoch": 1.3546524539196092, + "grad_norm": 0.969360380725499, + "learning_rate": 8.362617199994413e-06, + "loss": 0.0471, + "step": 3050 + }, + { + "epoch": 1.3550966022651565, + "grad_norm": 0.582815842887394, + "learning_rate": 8.36118228312351e-06, + "loss": 0.042, + "step": 3051 + }, + { + "epoch": 1.355540750610704, + "grad_norm": 0.721376890360811, + "learning_rate": 8.359746861009812e-06, + "loss": 0.0625, + "step": 3052 + }, + { + "epoch": 1.3559848989562515, + "grad_norm": 0.6319029678683107, + "learning_rate": 8.358310933869091e-06, + "loss": 0.0645, + "step": 3053 + }, + { + "epoch": 1.3564290473017988, + "grad_norm": 2.7049677243291828, + "learning_rate": 8.356874501917188e-06, + "loss": 0.0531, + "step": 3054 + }, + { + "epoch": 1.356873195647346, + "grad_norm": 0.3779085536381152, + "learning_rate": 8.355437565370022e-06, + "loss": 0.0356, + "step": 3055 + }, + { + "epoch": 1.3573173439928936, + "grad_norm": 0.7424758490934648, + "learning_rate": 8.354000124443594e-06, + "loss": 0.0524, + "step": 3056 + }, + { + "epoch": 1.3577614923384411, + "grad_norm": 0.6071475225743952, + "learning_rate": 8.352562179353971e-06, + "loss": 0.0476, + "step": 3057 + }, + { + "epoch": 1.3582056406839884, + "grad_norm": 0.4823292349978002, + "learning_rate": 8.351123730317303e-06, + "loss": 0.037, + "step": 3058 + }, + { + "epoch": 1.358649789029536, + "grad_norm": 0.42940088227364875, + "learning_rate": 8.349684777549813e-06, + "loss": 0.032, + "step": 3059 + }, + { + "epoch": 1.3590939373750832, + "grad_norm": 0.6291618703403149, + "learning_rate": 8.348245321267798e-06, + "loss": 0.0542, + "step": 3060 + }, + { + "epoch": 1.3595380857206307, + "grad_norm": 0.46443218939790437, + "learning_rate": 8.346805361687637e-06, + "loss": 0.0482, + "step": 3061 + }, + { + "epoch": 1.359982234066178, + "grad_norm": 0.44266144811788105, + "learning_rate": 8.345364899025776e-06, + "loss": 0.0508, + "step": 3062 + }, + { + "epoch": 1.3604263824117255, + "grad_norm": 0.5931758192805282, + "learning_rate": 8.343923933498742e-06, + "loss": 0.052, + "step": 3063 + }, + { + "epoch": 1.360870530757273, + "grad_norm": 0.4228622965921645, + "learning_rate": 8.342482465323141e-06, + "loss": 0.0342, + "step": 3064 + }, + { + "epoch": 1.3613146791028203, + "grad_norm": 0.6573207468033229, + "learning_rate": 8.341040494715644e-06, + "loss": 0.054, + "step": 3065 + }, + { + "epoch": 1.3617588274483676, + "grad_norm": 0.694757132531818, + "learning_rate": 8.339598021893007e-06, + "loss": 0.0573, + "step": 3066 + }, + { + "epoch": 1.3622029757939151, + "grad_norm": 0.5168331449919007, + "learning_rate": 8.338155047072058e-06, + "loss": 0.0476, + "step": 3067 + }, + { + "epoch": 1.3626471241394627, + "grad_norm": 0.5294718006665416, + "learning_rate": 8.336711570469698e-06, + "loss": 0.0536, + "step": 3068 + }, + { + "epoch": 1.36309127248501, + "grad_norm": 0.5902958008882144, + "learning_rate": 8.33526759230291e-06, + "loss": 0.0543, + "step": 3069 + }, + { + "epoch": 1.3635354208305575, + "grad_norm": 0.5123768514504031, + "learning_rate": 8.333823112788747e-06, + "loss": 0.055, + "step": 3070 + }, + { + "epoch": 1.3639795691761047, + "grad_norm": 0.4051444968388966, + "learning_rate": 8.332378132144336e-06, + "loss": 0.0408, + "step": 3071 + }, + { + "epoch": 1.3644237175216523, + "grad_norm": 0.45045135771966455, + "learning_rate": 8.330932650586887e-06, + "loss": 0.0449, + "step": 3072 + }, + { + "epoch": 1.3648678658671995, + "grad_norm": 0.6922855447849937, + "learning_rate": 8.329486668333677e-06, + "loss": 0.052, + "step": 3073 + }, + { + "epoch": 1.365312014212747, + "grad_norm": 0.3651988449178566, + "learning_rate": 8.328040185602063e-06, + "loss": 0.0383, + "step": 3074 + }, + { + "epoch": 1.3657561625582946, + "grad_norm": 0.6637690048102727, + "learning_rate": 8.326593202609475e-06, + "loss": 0.0571, + "step": 3075 + }, + { + "epoch": 1.3662003109038419, + "grad_norm": 0.5428561517455449, + "learning_rate": 8.325145719573419e-06, + "loss": 0.0384, + "step": 3076 + }, + { + "epoch": 1.3666444592493894, + "grad_norm": 0.7398928249310872, + "learning_rate": 8.323697736711478e-06, + "loss": 0.0535, + "step": 3077 + }, + { + "epoch": 1.3670886075949367, + "grad_norm": 0.6968067508458905, + "learning_rate": 8.322249254241309e-06, + "loss": 0.0624, + "step": 3078 + }, + { + "epoch": 1.3675327559404842, + "grad_norm": 0.42153823690397146, + "learning_rate": 8.320800272380639e-06, + "loss": 0.036, + "step": 3079 + }, + { + "epoch": 1.3679769042860315, + "grad_norm": 0.7070448290401131, + "learning_rate": 8.319350791347279e-06, + "loss": 0.0682, + "step": 3080 + }, + { + "epoch": 1.368421052631579, + "grad_norm": 0.5028115971086855, + "learning_rate": 8.31790081135911e-06, + "loss": 0.0543, + "step": 3081 + }, + { + "epoch": 1.3688652009771265, + "grad_norm": 0.5307360164451863, + "learning_rate": 8.316450332634084e-06, + "loss": 0.0535, + "step": 3082 + }, + { + "epoch": 1.3693093493226738, + "grad_norm": 0.6501199496749419, + "learning_rate": 8.31499935539024e-06, + "loss": 0.0461, + "step": 3083 + }, + { + "epoch": 1.369753497668221, + "grad_norm": 0.5797446911962002, + "learning_rate": 8.313547879845682e-06, + "loss": 0.0472, + "step": 3084 + }, + { + "epoch": 1.3701976460137686, + "grad_norm": 0.4268924633424178, + "learning_rate": 8.312095906218588e-06, + "loss": 0.0447, + "step": 3085 + }, + { + "epoch": 1.370641794359316, + "grad_norm": 0.6304839984425172, + "learning_rate": 8.310643434727216e-06, + "loss": 0.0625, + "step": 3086 + }, + { + "epoch": 1.3710859427048634, + "grad_norm": 0.5665653851714997, + "learning_rate": 8.3091904655899e-06, + "loss": 0.0539, + "step": 3087 + }, + { + "epoch": 1.371530091050411, + "grad_norm": 0.40002359226927725, + "learning_rate": 8.307736999025043e-06, + "loss": 0.0329, + "step": 3088 + }, + { + "epoch": 1.3719742393959582, + "grad_norm": 0.7513234692327438, + "learning_rate": 8.306283035251125e-06, + "loss": 0.0648, + "step": 3089 + }, + { + "epoch": 1.3724183877415057, + "grad_norm": 0.5469597650300881, + "learning_rate": 8.304828574486704e-06, + "loss": 0.0432, + "step": 3090 + }, + { + "epoch": 1.372862536087053, + "grad_norm": 0.48401783163541084, + "learning_rate": 8.303373616950408e-06, + "loss": 0.0457, + "step": 3091 + }, + { + "epoch": 1.3733066844326005, + "grad_norm": 0.4692359706390245, + "learning_rate": 8.301918162860944e-06, + "loss": 0.0422, + "step": 3092 + }, + { + "epoch": 1.373750832778148, + "grad_norm": 0.5378714608611251, + "learning_rate": 8.30046221243709e-06, + "loss": 0.0513, + "step": 3093 + }, + { + "epoch": 1.3741949811236953, + "grad_norm": 0.5625724873607021, + "learning_rate": 8.2990057658977e-06, + "loss": 0.0532, + "step": 3094 + }, + { + "epoch": 1.3746391294692426, + "grad_norm": 0.4899279935109507, + "learning_rate": 8.297548823461704e-06, + "loss": 0.0454, + "step": 3095 + }, + { + "epoch": 1.3750832778147901, + "grad_norm": 0.7726706942428412, + "learning_rate": 8.296091385348104e-06, + "loss": 0.0697, + "step": 3096 + }, + { + "epoch": 1.3755274261603376, + "grad_norm": 0.5012946731170276, + "learning_rate": 8.294633451775977e-06, + "loss": 0.0378, + "step": 3097 + }, + { + "epoch": 1.375971574505885, + "grad_norm": 0.48151498654702213, + "learning_rate": 8.293175022964476e-06, + "loss": 0.0434, + "step": 3098 + }, + { + "epoch": 1.3764157228514324, + "grad_norm": 0.4130737267146749, + "learning_rate": 8.291716099132829e-06, + "loss": 0.0359, + "step": 3099 + }, + { + "epoch": 1.3768598711969797, + "grad_norm": 0.4681903495603353, + "learning_rate": 8.290256680500336e-06, + "loss": 0.0398, + "step": 3100 + }, + { + "epoch": 1.3773040195425272, + "grad_norm": 0.6272554974390412, + "learning_rate": 8.28879676728637e-06, + "loss": 0.0627, + "step": 3101 + }, + { + "epoch": 1.3777481678880745, + "grad_norm": 0.5051341999455976, + "learning_rate": 8.287336359710386e-06, + "loss": 0.047, + "step": 3102 + }, + { + "epoch": 1.378192316233622, + "grad_norm": 0.5262863424826024, + "learning_rate": 8.285875457991903e-06, + "loss": 0.047, + "step": 3103 + }, + { + "epoch": 1.3786364645791696, + "grad_norm": 0.5493114786417674, + "learning_rate": 8.284414062350524e-06, + "loss": 0.0477, + "step": 3104 + }, + { + "epoch": 1.3790806129247168, + "grad_norm": 0.529779249166117, + "learning_rate": 8.282952173005916e-06, + "loss": 0.0489, + "step": 3105 + }, + { + "epoch": 1.3795247612702641, + "grad_norm": 0.5169327523467377, + "learning_rate": 8.28148979017783e-06, + "loss": 0.0568, + "step": 3106 + }, + { + "epoch": 1.3799689096158116, + "grad_norm": 0.6639124677133846, + "learning_rate": 8.280026914086086e-06, + "loss": 0.0831, + "step": 3107 + }, + { + "epoch": 1.3804130579613592, + "grad_norm": 0.6604146215653776, + "learning_rate": 8.278563544950579e-06, + "loss": 0.0688, + "step": 3108 + }, + { + "epoch": 1.3808572063069064, + "grad_norm": 0.499500034008347, + "learning_rate": 8.277099682991276e-06, + "loss": 0.0466, + "step": 3109 + }, + { + "epoch": 1.381301354652454, + "grad_norm": 0.5463437639449784, + "learning_rate": 8.275635328428226e-06, + "loss": 0.0576, + "step": 3110 + }, + { + "epoch": 1.3817455029980013, + "grad_norm": 0.43757681367045054, + "learning_rate": 8.274170481481541e-06, + "loss": 0.0348, + "step": 3111 + }, + { + "epoch": 1.3821896513435488, + "grad_norm": 0.5431219597419497, + "learning_rate": 8.272705142371414e-06, + "loss": 0.0592, + "step": 3112 + }, + { + "epoch": 1.382633799689096, + "grad_norm": 0.6091624737010345, + "learning_rate": 8.271239311318111e-06, + "loss": 0.0645, + "step": 3113 + }, + { + "epoch": 1.3830779480346436, + "grad_norm": 0.46664330067036014, + "learning_rate": 8.269772988541971e-06, + "loss": 0.0293, + "step": 3114 + }, + { + "epoch": 1.383522096380191, + "grad_norm": 0.39018350334607316, + "learning_rate": 8.268306174263407e-06, + "loss": 0.0488, + "step": 3115 + }, + { + "epoch": 1.3839662447257384, + "grad_norm": 0.7360437552444796, + "learning_rate": 8.266838868702904e-06, + "loss": 0.0539, + "step": 3116 + }, + { + "epoch": 1.3844103930712859, + "grad_norm": 0.3523843914894874, + "learning_rate": 8.265371072081028e-06, + "loss": 0.033, + "step": 3117 + }, + { + "epoch": 1.3848545414168332, + "grad_norm": 0.5841378028235703, + "learning_rate": 8.263902784618409e-06, + "loss": 0.0512, + "step": 3118 + }, + { + "epoch": 1.3852986897623807, + "grad_norm": 0.6604716728640524, + "learning_rate": 8.262434006535759e-06, + "loss": 0.0841, + "step": 3119 + }, + { + "epoch": 1.385742838107928, + "grad_norm": 0.43203367132780174, + "learning_rate": 8.260964738053859e-06, + "loss": 0.0474, + "step": 3120 + }, + { + "epoch": 1.3861869864534755, + "grad_norm": 0.48844306169229995, + "learning_rate": 8.259494979393563e-06, + "loss": 0.0481, + "step": 3121 + }, + { + "epoch": 1.386631134799023, + "grad_norm": 0.3906618714847538, + "learning_rate": 8.258024730775805e-06, + "loss": 0.0398, + "step": 3122 + }, + { + "epoch": 1.3870752831445703, + "grad_norm": 0.47997636522094667, + "learning_rate": 8.256553992421583e-06, + "loss": 0.0554, + "step": 3123 + }, + { + "epoch": 1.3875194314901176, + "grad_norm": 0.6434789607226818, + "learning_rate": 8.255082764551978e-06, + "loss": 0.0484, + "step": 3124 + }, + { + "epoch": 1.387963579835665, + "grad_norm": 0.3887987489990569, + "learning_rate": 8.25361104738814e-06, + "loss": 0.0464, + "step": 3125 + }, + { + "epoch": 1.3884077281812126, + "grad_norm": 0.487796373914711, + "learning_rate": 8.252138841151292e-06, + "loss": 0.0441, + "step": 3126 + }, + { + "epoch": 1.38885187652676, + "grad_norm": 0.6398270077190703, + "learning_rate": 8.250666146062732e-06, + "loss": 0.0419, + "step": 3127 + }, + { + "epoch": 1.3892960248723074, + "grad_norm": 0.4422350214805325, + "learning_rate": 8.249192962343829e-06, + "loss": 0.041, + "step": 3128 + }, + { + "epoch": 1.3897401732178547, + "grad_norm": 0.5252093775770235, + "learning_rate": 8.247719290216032e-06, + "loss": 0.0479, + "step": 3129 + }, + { + "epoch": 1.3901843215634022, + "grad_norm": 0.6735609599734196, + "learning_rate": 8.246245129900856e-06, + "loss": 0.038, + "step": 3130 + }, + { + "epoch": 1.3906284699089495, + "grad_norm": 0.5147679810299035, + "learning_rate": 8.244770481619892e-06, + "loss": 0.05, + "step": 3131 + }, + { + "epoch": 1.391072618254497, + "grad_norm": 0.4260669615511085, + "learning_rate": 8.243295345594807e-06, + "loss": 0.0351, + "step": 3132 + }, + { + "epoch": 1.3915167666000445, + "grad_norm": 0.5948196620222996, + "learning_rate": 8.241819722047337e-06, + "loss": 0.0532, + "step": 3133 + }, + { + "epoch": 1.3919609149455918, + "grad_norm": 0.4954498969782017, + "learning_rate": 8.240343611199294e-06, + "loss": 0.0395, + "step": 3134 + }, + { + "epoch": 1.3924050632911391, + "grad_norm": 0.5748384966095534, + "learning_rate": 8.238867013272562e-06, + "loss": 0.051, + "step": 3135 + }, + { + "epoch": 1.3928492116366866, + "grad_norm": 0.47868756581223826, + "learning_rate": 8.237389928489099e-06, + "loss": 0.0406, + "step": 3136 + }, + { + "epoch": 1.3932933599822341, + "grad_norm": 0.42581407271928307, + "learning_rate": 8.235912357070938e-06, + "loss": 0.0337, + "step": 3137 + }, + { + "epoch": 1.3937375083277814, + "grad_norm": 0.6781537407713069, + "learning_rate": 8.234434299240179e-06, + "loss": 0.0483, + "step": 3138 + }, + { + "epoch": 1.394181656673329, + "grad_norm": 0.598623626498893, + "learning_rate": 8.232955755219002e-06, + "loss": 0.0498, + "step": 3139 + }, + { + "epoch": 1.3946258050188762, + "grad_norm": 0.6505669269254222, + "learning_rate": 8.231476725229659e-06, + "loss": 0.0391, + "step": 3140 + }, + { + "epoch": 1.3950699533644237, + "grad_norm": 0.5150042095978116, + "learning_rate": 8.229997209494468e-06, + "loss": 0.0538, + "step": 3141 + }, + { + "epoch": 1.395514101709971, + "grad_norm": 0.48445621459488397, + "learning_rate": 8.228517208235829e-06, + "loss": 0.0485, + "step": 3142 + }, + { + "epoch": 1.3959582500555185, + "grad_norm": 0.42479311306640655, + "learning_rate": 8.22703672167621e-06, + "loss": 0.0454, + "step": 3143 + }, + { + "epoch": 1.396402398401066, + "grad_norm": 0.4968133842399203, + "learning_rate": 8.225555750038157e-06, + "loss": 0.0548, + "step": 3144 + }, + { + "epoch": 1.3968465467466133, + "grad_norm": 0.839408513781402, + "learning_rate": 8.22407429354428e-06, + "loss": 0.0508, + "step": 3145 + }, + { + "epoch": 1.3972906950921609, + "grad_norm": 0.4593510375216177, + "learning_rate": 8.222592352417268e-06, + "loss": 0.0484, + "step": 3146 + }, + { + "epoch": 1.3977348434377082, + "grad_norm": 0.41788586732862426, + "learning_rate": 8.221109926879885e-06, + "loss": 0.0394, + "step": 3147 + }, + { + "epoch": 1.3981789917832557, + "grad_norm": 0.6011495923589089, + "learning_rate": 8.219627017154962e-06, + "loss": 0.0538, + "step": 3148 + }, + { + "epoch": 1.398623140128803, + "grad_norm": 0.483716139104255, + "learning_rate": 8.218143623465407e-06, + "loss": 0.0479, + "step": 3149 + }, + { + "epoch": 1.3990672884743505, + "grad_norm": 0.7567570451553496, + "learning_rate": 8.216659746034199e-06, + "loss": 0.059, + "step": 3150 + }, + { + "epoch": 1.399511436819898, + "grad_norm": 0.48553150567848463, + "learning_rate": 8.215175385084389e-06, + "loss": 0.0617, + "step": 3151 + }, + { + "epoch": 1.3999555851654453, + "grad_norm": 0.47459878094993846, + "learning_rate": 8.2136905408391e-06, + "loss": 0.0527, + "step": 3152 + }, + { + "epoch": 1.4003997335109926, + "grad_norm": 0.5833223937440162, + "learning_rate": 8.212205213521535e-06, + "loss": 0.0626, + "step": 3153 + }, + { + "epoch": 1.40084388185654, + "grad_norm": 0.35626979864365443, + "learning_rate": 8.210719403354961e-06, + "loss": 0.033, + "step": 3154 + }, + { + "epoch": 1.4012880302020876, + "grad_norm": 0.7430849859689593, + "learning_rate": 8.209233110562719e-06, + "loss": 0.0645, + "step": 3155 + }, + { + "epoch": 1.4017321785476349, + "grad_norm": 0.45738357167910687, + "learning_rate": 8.207746335368223e-06, + "loss": 0.0378, + "step": 3156 + }, + { + "epoch": 1.4021763268931824, + "grad_norm": 0.5265228817791134, + "learning_rate": 8.206259077994966e-06, + "loss": 0.0653, + "step": 3157 + }, + { + "epoch": 1.4026204752387297, + "grad_norm": 0.5945877517368485, + "learning_rate": 8.204771338666504e-06, + "loss": 0.0517, + "step": 3158 + }, + { + "epoch": 1.4030646235842772, + "grad_norm": 0.48633704182422527, + "learning_rate": 8.20328311760647e-06, + "loss": 0.0467, + "step": 3159 + }, + { + "epoch": 1.4035087719298245, + "grad_norm": 0.45125218858977967, + "learning_rate": 8.201794415038569e-06, + "loss": 0.0436, + "step": 3160 + }, + { + "epoch": 1.403952920275372, + "grad_norm": 0.454560780546378, + "learning_rate": 8.200305231186578e-06, + "loss": 0.0566, + "step": 3161 + }, + { + "epoch": 1.4043970686209195, + "grad_norm": 0.4442970232995022, + "learning_rate": 8.198815566274346e-06, + "loss": 0.0343, + "step": 3162 + }, + { + "epoch": 1.4048412169664668, + "grad_norm": 0.4600734220829904, + "learning_rate": 8.197325420525797e-06, + "loss": 0.0471, + "step": 3163 + }, + { + "epoch": 1.405285365312014, + "grad_norm": 0.46601308549255893, + "learning_rate": 8.195834794164925e-06, + "loss": 0.0427, + "step": 3164 + }, + { + "epoch": 1.4057295136575616, + "grad_norm": 0.5996621159738204, + "learning_rate": 8.194343687415795e-06, + "loss": 0.0457, + "step": 3165 + }, + { + "epoch": 1.4061736620031091, + "grad_norm": 0.9069132714321548, + "learning_rate": 8.192852100502547e-06, + "loss": 0.0678, + "step": 3166 + }, + { + "epoch": 1.4066178103486564, + "grad_norm": 0.43166301548877434, + "learning_rate": 8.191360033649392e-06, + "loss": 0.0352, + "step": 3167 + }, + { + "epoch": 1.407061958694204, + "grad_norm": 0.5216023574393313, + "learning_rate": 8.18986748708061e-06, + "loss": 0.0475, + "step": 3168 + }, + { + "epoch": 1.4075061070397512, + "grad_norm": 0.5468879202276986, + "learning_rate": 8.18837446102056e-06, + "loss": 0.0547, + "step": 3169 + }, + { + "epoch": 1.4079502553852987, + "grad_norm": 0.4122751122216154, + "learning_rate": 8.186880955693667e-06, + "loss": 0.0357, + "step": 3170 + }, + { + "epoch": 1.408394403730846, + "grad_norm": 0.49605981026080986, + "learning_rate": 8.18538697132443e-06, + "loss": 0.0593, + "step": 3171 + }, + { + "epoch": 1.4088385520763935, + "grad_norm": 0.7328755570855009, + "learning_rate": 8.183892508137423e-06, + "loss": 0.0691, + "step": 3172 + }, + { + "epoch": 1.409282700421941, + "grad_norm": 0.5835998191367716, + "learning_rate": 8.182397566357286e-06, + "loss": 0.0621, + "step": 3173 + }, + { + "epoch": 1.4097268487674883, + "grad_norm": 0.45687160374129326, + "learning_rate": 8.180902146208734e-06, + "loss": 0.0529, + "step": 3174 + }, + { + "epoch": 1.4101709971130356, + "grad_norm": 0.4464006299057523, + "learning_rate": 8.179406247916555e-06, + "loss": 0.0462, + "step": 3175 + }, + { + "epoch": 1.4106151454585831, + "grad_norm": 0.5104992313269215, + "learning_rate": 8.17790987170561e-06, + "loss": 0.0397, + "step": 3176 + }, + { + "epoch": 1.4110592938041306, + "grad_norm": 0.4500241032324194, + "learning_rate": 8.176413017800828e-06, + "loss": 0.0588, + "step": 3177 + }, + { + "epoch": 1.411503442149678, + "grad_norm": 0.9698569462217295, + "learning_rate": 8.174915686427211e-06, + "loss": 0.0593, + "step": 3178 + }, + { + "epoch": 1.4119475904952254, + "grad_norm": 1.9467722797227018, + "learning_rate": 8.173417877809835e-06, + "loss": 0.0657, + "step": 3179 + }, + { + "epoch": 1.4123917388407727, + "grad_norm": 0.8619854581857347, + "learning_rate": 8.171919592173843e-06, + "loss": 0.0616, + "step": 3180 + }, + { + "epoch": 1.4128358871863202, + "grad_norm": 2.1957744086497324, + "learning_rate": 8.170420829744458e-06, + "loss": 0.0441, + "step": 3181 + }, + { + "epoch": 1.4132800355318675, + "grad_norm": 0.437679909069633, + "learning_rate": 8.168921590746964e-06, + "loss": 0.0448, + "step": 3182 + }, + { + "epoch": 1.413724183877415, + "grad_norm": 0.44545740554831986, + "learning_rate": 8.167421875406725e-06, + "loss": 0.0534, + "step": 3183 + }, + { + "epoch": 1.4141683322229626, + "grad_norm": 0.6661022181292509, + "learning_rate": 8.165921683949172e-06, + "loss": 0.0752, + "step": 3184 + }, + { + "epoch": 1.4146124805685099, + "grad_norm": 0.5379954758436954, + "learning_rate": 8.164421016599811e-06, + "loss": 0.0465, + "step": 3185 + }, + { + "epoch": 1.4150566289140574, + "grad_norm": 0.7649578135185098, + "learning_rate": 8.162919873584216e-06, + "loss": 0.0655, + "step": 3186 + }, + { + "epoch": 1.4155007772596047, + "grad_norm": 0.6155469523069211, + "learning_rate": 8.161418255128037e-06, + "loss": 0.058, + "step": 3187 + }, + { + "epoch": 1.4159449256051522, + "grad_norm": 0.43980966134894045, + "learning_rate": 8.15991616145699e-06, + "loss": 0.046, + "step": 3188 + }, + { + "epoch": 1.4163890739506995, + "grad_norm": 0.3772085957077231, + "learning_rate": 8.158413592796867e-06, + "loss": 0.0407, + "step": 3189 + }, + { + "epoch": 1.416833222296247, + "grad_norm": 0.4916035633372075, + "learning_rate": 8.156910549373529e-06, + "loss": 0.0509, + "step": 3190 + }, + { + "epoch": 1.4172773706417945, + "grad_norm": 0.44586244165360156, + "learning_rate": 8.15540703141291e-06, + "loss": 0.0421, + "step": 3191 + }, + { + "epoch": 1.4177215189873418, + "grad_norm": 0.6752971156056389, + "learning_rate": 8.153903039141011e-06, + "loss": 0.0394, + "step": 3192 + }, + { + "epoch": 1.418165667332889, + "grad_norm": 0.5021744951340577, + "learning_rate": 8.15239857278391e-06, + "loss": 0.0446, + "step": 3193 + }, + { + "epoch": 1.4186098156784366, + "grad_norm": 0.4057752198360003, + "learning_rate": 8.150893632567755e-06, + "loss": 0.0413, + "step": 3194 + }, + { + "epoch": 1.419053964023984, + "grad_norm": 0.44646591292416776, + "learning_rate": 8.149388218718763e-06, + "loss": 0.033, + "step": 3195 + }, + { + "epoch": 1.4194981123695314, + "grad_norm": 0.5430849460817241, + "learning_rate": 8.147882331463221e-06, + "loss": 0.0476, + "step": 3196 + }, + { + "epoch": 1.419942260715079, + "grad_norm": 0.5748975413489712, + "learning_rate": 8.146375971027492e-06, + "loss": 0.0422, + "step": 3197 + }, + { + "epoch": 1.4203864090606262, + "grad_norm": 0.8569726405019049, + "learning_rate": 8.144869137638008e-06, + "loss": 0.0586, + "step": 3198 + }, + { + "epoch": 1.4208305574061737, + "grad_norm": 0.48111798749968265, + "learning_rate": 8.14336183152127e-06, + "loss": 0.0511, + "step": 3199 + }, + { + "epoch": 1.421274705751721, + "grad_norm": 0.4447988259403195, + "learning_rate": 8.141854052903853e-06, + "loss": 0.0409, + "step": 3200 + }, + { + "epoch": 1.4217188540972685, + "grad_norm": 0.42318494315184574, + "learning_rate": 8.1403458020124e-06, + "loss": 0.0393, + "step": 3201 + }, + { + "epoch": 1.422163002442816, + "grad_norm": 0.527946330855177, + "learning_rate": 8.138837079073628e-06, + "loss": 0.0544, + "step": 3202 + }, + { + "epoch": 1.4226071507883633, + "grad_norm": 0.5168740647207917, + "learning_rate": 8.137327884314323e-06, + "loss": 0.042, + "step": 3203 + }, + { + "epoch": 1.4230512991339106, + "grad_norm": 0.44556932550096595, + "learning_rate": 8.135818217961344e-06, + "loss": 0.043, + "step": 3204 + }, + { + "epoch": 1.423495447479458, + "grad_norm": 0.4521168782601511, + "learning_rate": 8.13430808024162e-06, + "loss": 0.0616, + "step": 3205 + }, + { + "epoch": 1.4239395958250056, + "grad_norm": 0.998651913471337, + "learning_rate": 8.132797471382148e-06, + "loss": 0.0857, + "step": 3206 + }, + { + "epoch": 1.424383744170553, + "grad_norm": 0.4845757133637773, + "learning_rate": 8.131286391609996e-06, + "loss": 0.0316, + "step": 3207 + }, + { + "epoch": 1.4248278925161004, + "grad_norm": 0.5115534012315782, + "learning_rate": 8.129774841152311e-06, + "loss": 0.0442, + "step": 3208 + }, + { + "epoch": 1.4252720408616477, + "grad_norm": 0.4658784277879884, + "learning_rate": 8.128262820236302e-06, + "loss": 0.0392, + "step": 3209 + }, + { + "epoch": 1.4257161892071952, + "grad_norm": 0.7323138806429754, + "learning_rate": 8.12675032908925e-06, + "loss": 0.0476, + "step": 3210 + }, + { + "epoch": 1.4261603375527425, + "grad_norm": 0.7544916173168683, + "learning_rate": 8.125237367938511e-06, + "loss": 0.0539, + "step": 3211 + }, + { + "epoch": 1.42660448589829, + "grad_norm": 0.62197595498055, + "learning_rate": 8.123723937011507e-06, + "loss": 0.0526, + "step": 3212 + }, + { + "epoch": 1.4270486342438375, + "grad_norm": 0.411592343530701, + "learning_rate": 8.12221003653573e-06, + "loss": 0.0422, + "step": 3213 + }, + { + "epoch": 1.4274927825893848, + "grad_norm": 0.5467878759617523, + "learning_rate": 8.12069566673875e-06, + "loss": 0.0486, + "step": 3214 + }, + { + "epoch": 1.4279369309349323, + "grad_norm": 0.3951319611407401, + "learning_rate": 8.119180827848199e-06, + "loss": 0.0372, + "step": 3215 + }, + { + "epoch": 1.4283810792804796, + "grad_norm": 0.5355923646902133, + "learning_rate": 8.117665520091783e-06, + "loss": 0.0583, + "step": 3216 + }, + { + "epoch": 1.4288252276260272, + "grad_norm": 0.49870365746589324, + "learning_rate": 8.11614974369728e-06, + "loss": 0.0521, + "step": 3217 + }, + { + "epoch": 1.4292693759715744, + "grad_norm": 0.5463670187827259, + "learning_rate": 8.114633498892537e-06, + "loss": 0.0567, + "step": 3218 + }, + { + "epoch": 1.429713524317122, + "grad_norm": 0.5699638456777965, + "learning_rate": 8.11311678590547e-06, + "loss": 0.05, + "step": 3219 + }, + { + "epoch": 1.4301576726626695, + "grad_norm": 0.3789965811347938, + "learning_rate": 8.11159960496407e-06, + "loss": 0.0405, + "step": 3220 + }, + { + "epoch": 1.4306018210082168, + "grad_norm": 0.4496180621274656, + "learning_rate": 8.11008195629639e-06, + "loss": 0.0414, + "step": 3221 + }, + { + "epoch": 1.431045969353764, + "grad_norm": 0.6660820533473273, + "learning_rate": 8.10856384013056e-06, + "loss": 0.0519, + "step": 3222 + }, + { + "epoch": 1.4314901176993116, + "grad_norm": 0.42475615332155653, + "learning_rate": 8.107045256694782e-06, + "loss": 0.0507, + "step": 3223 + }, + { + "epoch": 1.431934266044859, + "grad_norm": 0.531367471068279, + "learning_rate": 8.105526206217322e-06, + "loss": 0.0438, + "step": 3224 + }, + { + "epoch": 1.4323784143904064, + "grad_norm": 0.6173452877900699, + "learning_rate": 8.104006688926518e-06, + "loss": 0.044, + "step": 3225 + }, + { + "epoch": 1.4328225627359539, + "grad_norm": 0.5047091534195701, + "learning_rate": 8.102486705050782e-06, + "loss": 0.0448, + "step": 3226 + }, + { + "epoch": 1.4332667110815012, + "grad_norm": 0.756671059089655, + "learning_rate": 8.100966254818591e-06, + "loss": 0.0472, + "step": 3227 + }, + { + "epoch": 1.4337108594270487, + "grad_norm": 0.6679241833390342, + "learning_rate": 8.099445338458496e-06, + "loss": 0.0428, + "step": 3228 + }, + { + "epoch": 1.434155007772596, + "grad_norm": 0.40517368896131173, + "learning_rate": 8.097923956199118e-06, + "loss": 0.0366, + "step": 3229 + }, + { + "epoch": 1.4345991561181435, + "grad_norm": 0.4490587246876595, + "learning_rate": 8.096402108269144e-06, + "loss": 0.0356, + "step": 3230 + }, + { + "epoch": 1.435043304463691, + "grad_norm": 0.5508426411635136, + "learning_rate": 8.094879794897333e-06, + "loss": 0.0499, + "step": 3231 + }, + { + "epoch": 1.4354874528092383, + "grad_norm": 0.39803990157367924, + "learning_rate": 8.093357016312518e-06, + "loss": 0.039, + "step": 3232 + }, + { + "epoch": 1.4359316011547856, + "grad_norm": 0.3911481714996281, + "learning_rate": 8.091833772743595e-06, + "loss": 0.0408, + "step": 3233 + }, + { + "epoch": 1.436375749500333, + "grad_norm": 0.43202710834535546, + "learning_rate": 8.090310064419536e-06, + "loss": 0.0517, + "step": 3234 + }, + { + "epoch": 1.4368198978458806, + "grad_norm": 0.63394524089963, + "learning_rate": 8.088785891569379e-06, + "loss": 0.0498, + "step": 3235 + }, + { + "epoch": 1.437264046191428, + "grad_norm": 0.6014431551871622, + "learning_rate": 8.087261254422232e-06, + "loss": 0.0553, + "step": 3236 + }, + { + "epoch": 1.4377081945369754, + "grad_norm": 0.6136207381555616, + "learning_rate": 8.085736153207277e-06, + "loss": 0.0594, + "step": 3237 + }, + { + "epoch": 1.4381523428825227, + "grad_norm": 0.5952470946539382, + "learning_rate": 8.08421058815376e-06, + "loss": 0.0769, + "step": 3238 + }, + { + "epoch": 1.4385964912280702, + "grad_norm": 0.5189933118580958, + "learning_rate": 8.082684559490999e-06, + "loss": 0.0554, + "step": 3239 + }, + { + "epoch": 1.4390406395736175, + "grad_norm": 0.4135426057188439, + "learning_rate": 8.081158067448385e-06, + "loss": 0.0396, + "step": 3240 + }, + { + "epoch": 1.439484787919165, + "grad_norm": 0.5443217663228829, + "learning_rate": 8.079631112255372e-06, + "loss": 0.044, + "step": 3241 + }, + { + "epoch": 1.4399289362647125, + "grad_norm": 0.5826766711918354, + "learning_rate": 8.078103694141487e-06, + "loss": 0.0446, + "step": 3242 + }, + { + "epoch": 1.4403730846102598, + "grad_norm": 0.648859138960752, + "learning_rate": 8.076575813336333e-06, + "loss": 0.0609, + "step": 3243 + }, + { + "epoch": 1.440817232955807, + "grad_norm": 0.4409431909915432, + "learning_rate": 8.07504747006957e-06, + "loss": 0.0531, + "step": 3244 + }, + { + "epoch": 1.4412613813013546, + "grad_norm": 0.3721919303810988, + "learning_rate": 8.073518664570938e-06, + "loss": 0.0386, + "step": 3245 + }, + { + "epoch": 1.4417055296469021, + "grad_norm": 0.43841622793791346, + "learning_rate": 8.07198939707024e-06, + "loss": 0.0493, + "step": 3246 + }, + { + "epoch": 1.4421496779924494, + "grad_norm": 0.5931350944063895, + "learning_rate": 8.070459667797351e-06, + "loss": 0.0634, + "step": 3247 + }, + { + "epoch": 1.442593826337997, + "grad_norm": 0.41937120746821677, + "learning_rate": 8.068929476982217e-06, + "loss": 0.0423, + "step": 3248 + }, + { + "epoch": 1.4430379746835442, + "grad_norm": 0.6050623457059973, + "learning_rate": 8.067398824854851e-06, + "loss": 0.07, + "step": 3249 + }, + { + "epoch": 1.4434821230290917, + "grad_norm": 0.36263255781004144, + "learning_rate": 8.065867711645334e-06, + "loss": 0.0488, + "step": 3250 + }, + { + "epoch": 1.443926271374639, + "grad_norm": 0.5452720169189981, + "learning_rate": 8.064336137583821e-06, + "loss": 0.0592, + "step": 3251 + }, + { + "epoch": 1.4443704197201865, + "grad_norm": 0.5648377578907434, + "learning_rate": 8.062804102900532e-06, + "loss": 0.045, + "step": 3252 + }, + { + "epoch": 1.444814568065734, + "grad_norm": 0.5203009129121725, + "learning_rate": 8.061271607825758e-06, + "loss": 0.0455, + "step": 3253 + }, + { + "epoch": 1.4452587164112813, + "grad_norm": 0.5388011169643779, + "learning_rate": 8.059738652589862e-06, + "loss": 0.0563, + "step": 3254 + }, + { + "epoch": 1.4457028647568289, + "grad_norm": 0.41394935328254073, + "learning_rate": 8.058205237423266e-06, + "loss": 0.0523, + "step": 3255 + }, + { + "epoch": 1.4461470131023761, + "grad_norm": 0.921759574953431, + "learning_rate": 8.056671362556476e-06, + "loss": 0.0356, + "step": 3256 + }, + { + "epoch": 1.4465911614479237, + "grad_norm": 0.8754840871870988, + "learning_rate": 8.055137028220058e-06, + "loss": 0.0615, + "step": 3257 + }, + { + "epoch": 1.447035309793471, + "grad_norm": 0.4878410786722894, + "learning_rate": 8.053602234644644e-06, + "loss": 0.0415, + "step": 3258 + }, + { + "epoch": 1.4474794581390185, + "grad_norm": 0.6534435017529012, + "learning_rate": 8.052066982060945e-06, + "loss": 0.0579, + "step": 3259 + }, + { + "epoch": 1.447923606484566, + "grad_norm": 0.9319839406156598, + "learning_rate": 8.050531270699731e-06, + "loss": 0.0484, + "step": 3260 + }, + { + "epoch": 1.4483677548301133, + "grad_norm": 0.4011114975003397, + "learning_rate": 8.048995100791847e-06, + "loss": 0.0372, + "step": 3261 + }, + { + "epoch": 1.4488119031756606, + "grad_norm": 0.5604426931227581, + "learning_rate": 8.047458472568208e-06, + "loss": 0.0496, + "step": 3262 + }, + { + "epoch": 1.449256051521208, + "grad_norm": 0.3856602729544308, + "learning_rate": 8.045921386259792e-06, + "loss": 0.0368, + "step": 3263 + }, + { + "epoch": 1.4497001998667556, + "grad_norm": 0.37413170967076825, + "learning_rate": 8.044383842097651e-06, + "loss": 0.0426, + "step": 3264 + }, + { + "epoch": 1.4501443482123029, + "grad_norm": 0.4129802274607094, + "learning_rate": 8.042845840312903e-06, + "loss": 0.0398, + "step": 3265 + }, + { + "epoch": 1.4505884965578504, + "grad_norm": 0.8418363464980803, + "learning_rate": 8.041307381136738e-06, + "loss": 0.0656, + "step": 3266 + }, + { + "epoch": 1.4510326449033977, + "grad_norm": 0.5365298547106184, + "learning_rate": 8.039768464800408e-06, + "loss": 0.048, + "step": 3267 + }, + { + "epoch": 1.4514767932489452, + "grad_norm": 1.0687462882240213, + "learning_rate": 8.038229091535244e-06, + "loss": 0.0503, + "step": 3268 + }, + { + "epoch": 1.4519209415944925, + "grad_norm": 0.49239171264959003, + "learning_rate": 8.036689261572636e-06, + "loss": 0.0563, + "step": 3269 + }, + { + "epoch": 1.45236508994004, + "grad_norm": 0.5011792630285886, + "learning_rate": 8.035148975144046e-06, + "loss": 0.0396, + "step": 3270 + }, + { + "epoch": 1.4528092382855875, + "grad_norm": 0.49603707035563677, + "learning_rate": 8.033608232481009e-06, + "loss": 0.0501, + "step": 3271 + }, + { + "epoch": 1.4532533866311348, + "grad_norm": 0.5939987053954954, + "learning_rate": 8.032067033815123e-06, + "loss": 0.0458, + "step": 3272 + }, + { + "epoch": 1.453697534976682, + "grad_norm": 0.5812600520648006, + "learning_rate": 8.030525379378053e-06, + "loss": 0.0447, + "step": 3273 + }, + { + "epoch": 1.4541416833222296, + "grad_norm": 0.6480809819716409, + "learning_rate": 8.028983269401542e-06, + "loss": 0.0608, + "step": 3274 + }, + { + "epoch": 1.454585831667777, + "grad_norm": 0.5090480104564535, + "learning_rate": 8.027440704117391e-06, + "loss": 0.0559, + "step": 3275 + }, + { + "epoch": 1.4550299800133244, + "grad_norm": 0.6602628376919341, + "learning_rate": 8.025897683757473e-06, + "loss": 0.0587, + "step": 3276 + }, + { + "epoch": 1.455474128358872, + "grad_norm": 0.5252379759239632, + "learning_rate": 8.024354208553735e-06, + "loss": 0.061, + "step": 3277 + }, + { + "epoch": 1.4559182767044192, + "grad_norm": 0.4725375990607396, + "learning_rate": 8.022810278738185e-06, + "loss": 0.0408, + "step": 3278 + }, + { + "epoch": 1.4563624250499667, + "grad_norm": 0.5401711443633535, + "learning_rate": 8.021265894542898e-06, + "loss": 0.0667, + "step": 3279 + }, + { + "epoch": 1.456806573395514, + "grad_norm": 0.5352342030628493, + "learning_rate": 8.019721056200027e-06, + "loss": 0.0458, + "step": 3280 + }, + { + "epoch": 1.4572507217410615, + "grad_norm": 0.4210073842896914, + "learning_rate": 8.018175763941784e-06, + "loss": 0.0437, + "step": 3281 + }, + { + "epoch": 1.457694870086609, + "grad_norm": 0.3904193074228819, + "learning_rate": 8.016630018000457e-06, + "loss": 0.0456, + "step": 3282 + }, + { + "epoch": 1.4581390184321563, + "grad_norm": 0.4133159146388692, + "learning_rate": 8.015083818608393e-06, + "loss": 0.0385, + "step": 3283 + }, + { + "epoch": 1.4585831667777038, + "grad_norm": 0.5275288891932939, + "learning_rate": 8.013537165998014e-06, + "loss": 0.0477, + "step": 3284 + }, + { + "epoch": 1.4590273151232511, + "grad_norm": 0.43540224877298483, + "learning_rate": 8.011990060401806e-06, + "loss": 0.0433, + "step": 3285 + }, + { + "epoch": 1.4594714634687986, + "grad_norm": 0.476963740475049, + "learning_rate": 8.010442502052329e-06, + "loss": 0.0549, + "step": 3286 + }, + { + "epoch": 1.459915611814346, + "grad_norm": 0.4927774183701049, + "learning_rate": 8.008894491182205e-06, + "loss": 0.0464, + "step": 3287 + }, + { + "epoch": 1.4603597601598934, + "grad_norm": 0.4716288282663322, + "learning_rate": 8.007346028024125e-06, + "loss": 0.0437, + "step": 3288 + }, + { + "epoch": 1.460803908505441, + "grad_norm": 0.5277979981560662, + "learning_rate": 8.005797112810854e-06, + "loss": 0.0452, + "step": 3289 + }, + { + "epoch": 1.4612480568509882, + "grad_norm": 0.6486682818717127, + "learning_rate": 8.004247745775216e-06, + "loss": 0.0636, + "step": 3290 + }, + { + "epoch": 1.4616922051965355, + "grad_norm": 0.693590181694553, + "learning_rate": 8.00269792715011e-06, + "loss": 0.0392, + "step": 3291 + }, + { + "epoch": 1.462136353542083, + "grad_norm": 0.4973813779816927, + "learning_rate": 8.001147657168497e-06, + "loss": 0.0523, + "step": 3292 + }, + { + "epoch": 1.4625805018876306, + "grad_norm": 0.42880946574568146, + "learning_rate": 7.99959693606341e-06, + "loss": 0.0365, + "step": 3293 + }, + { + "epoch": 1.4630246502331778, + "grad_norm": 0.4647071982633707, + "learning_rate": 7.99804576406795e-06, + "loss": 0.0473, + "step": 3294 + }, + { + "epoch": 1.4634687985787254, + "grad_norm": 0.5564296867063627, + "learning_rate": 7.996494141415284e-06, + "loss": 0.0654, + "step": 3295 + }, + { + "epoch": 1.4639129469242727, + "grad_norm": 0.4602454467326126, + "learning_rate": 7.994942068338647e-06, + "loss": 0.0375, + "step": 3296 + }, + { + "epoch": 1.4643570952698202, + "grad_norm": 0.44240678049891874, + "learning_rate": 7.993389545071341e-06, + "loss": 0.0538, + "step": 3297 + }, + { + "epoch": 1.4648012436153675, + "grad_norm": 0.5956552998865925, + "learning_rate": 7.991836571846739e-06, + "loss": 0.0483, + "step": 3298 + }, + { + "epoch": 1.465245391960915, + "grad_norm": 0.8272941029925692, + "learning_rate": 7.990283148898277e-06, + "loss": 0.0621, + "step": 3299 + }, + { + "epoch": 1.4656895403064625, + "grad_norm": 0.521288053624164, + "learning_rate": 7.988729276459463e-06, + "loss": 0.0389, + "step": 3300 + }, + { + "epoch": 1.4661336886520098, + "grad_norm": 0.3918246593118355, + "learning_rate": 7.987174954763867e-06, + "loss": 0.0351, + "step": 3301 + }, + { + "epoch": 1.466577836997557, + "grad_norm": 0.5695857738579163, + "learning_rate": 7.985620184045133e-06, + "loss": 0.051, + "step": 3302 + }, + { + "epoch": 1.4670219853431046, + "grad_norm": 0.5461168059678839, + "learning_rate": 7.98406496453697e-06, + "loss": 0.045, + "step": 3303 + }, + { + "epoch": 1.467466133688652, + "grad_norm": 0.45316446756220935, + "learning_rate": 7.982509296473151e-06, + "loss": 0.0376, + "step": 3304 + }, + { + "epoch": 1.4679102820341994, + "grad_norm": 0.5116686497553603, + "learning_rate": 7.98095318008752e-06, + "loss": 0.0523, + "step": 3305 + }, + { + "epoch": 1.4683544303797469, + "grad_norm": 0.5042517448525269, + "learning_rate": 7.97939661561399e-06, + "loss": 0.0473, + "step": 3306 + }, + { + "epoch": 1.4687985787252942, + "grad_norm": 0.5555029036352702, + "learning_rate": 7.977839603286537e-06, + "loss": 0.053, + "step": 3307 + }, + { + "epoch": 1.4692427270708417, + "grad_norm": 0.3410097103369562, + "learning_rate": 7.976282143339207e-06, + "loss": 0.0293, + "step": 3308 + }, + { + "epoch": 1.469686875416389, + "grad_norm": 0.6134381443649801, + "learning_rate": 7.974724236006113e-06, + "loss": 0.0528, + "step": 3309 + }, + { + "epoch": 1.4701310237619365, + "grad_norm": 0.4165723941603629, + "learning_rate": 7.973165881521435e-06, + "loss": 0.041, + "step": 3310 + }, + { + "epoch": 1.470575172107484, + "grad_norm": 0.4357045233722827, + "learning_rate": 7.971607080119418e-06, + "loss": 0.0499, + "step": 3311 + }, + { + "epoch": 1.4710193204530313, + "grad_norm": 0.5469760548909597, + "learning_rate": 7.97004783203438e-06, + "loss": 0.0436, + "step": 3312 + }, + { + "epoch": 1.4714634687985786, + "grad_norm": 0.4196612097876158, + "learning_rate": 7.968488137500699e-06, + "loss": 0.0474, + "step": 3313 + }, + { + "epoch": 1.471907617144126, + "grad_norm": 0.4878242528103949, + "learning_rate": 7.966927996752824e-06, + "loss": 0.0407, + "step": 3314 + }, + { + "epoch": 1.4723517654896736, + "grad_norm": 0.4654212754221821, + "learning_rate": 7.965367410025275e-06, + "loss": 0.0276, + "step": 3315 + }, + { + "epoch": 1.472795913835221, + "grad_norm": 0.4496002039864948, + "learning_rate": 7.96380637755263e-06, + "loss": 0.043, + "step": 3316 + }, + { + "epoch": 1.4732400621807684, + "grad_norm": 0.843662279100629, + "learning_rate": 7.96224489956954e-06, + "loss": 0.0507, + "step": 3317 + }, + { + "epoch": 1.4736842105263157, + "grad_norm": 0.32395601492358206, + "learning_rate": 7.960682976310721e-06, + "loss": 0.0356, + "step": 3318 + }, + { + "epoch": 1.4741283588718632, + "grad_norm": 0.6598994798652713, + "learning_rate": 7.959120608010959e-06, + "loss": 0.0532, + "step": 3319 + }, + { + "epoch": 1.4745725072174105, + "grad_norm": 0.6135947839100228, + "learning_rate": 7.957557794905104e-06, + "loss": 0.0568, + "step": 3320 + }, + { + "epoch": 1.475016655562958, + "grad_norm": 0.5386416254084523, + "learning_rate": 7.955994537228068e-06, + "loss": 0.0524, + "step": 3321 + }, + { + "epoch": 1.4754608039085055, + "grad_norm": 0.41178054802832886, + "learning_rate": 7.954430835214844e-06, + "loss": 0.0478, + "step": 3322 + }, + { + "epoch": 1.4759049522540528, + "grad_norm": 0.553965746901051, + "learning_rate": 7.952866689100476e-06, + "loss": 0.0497, + "step": 3323 + }, + { + "epoch": 1.4763491005996003, + "grad_norm": 0.5002040941699343, + "learning_rate": 7.951302099120087e-06, + "loss": 0.0479, + "step": 3324 + }, + { + "epoch": 1.4767932489451476, + "grad_norm": 0.5774527696932492, + "learning_rate": 7.949737065508856e-06, + "loss": 0.0447, + "step": 3325 + }, + { + "epoch": 1.4772373972906951, + "grad_norm": 0.5834169178186834, + "learning_rate": 7.948171588502036e-06, + "loss": 0.0442, + "step": 3326 + }, + { + "epoch": 1.4776815456362424, + "grad_norm": 0.48069558429150877, + "learning_rate": 7.946605668334947e-06, + "loss": 0.0509, + "step": 3327 + }, + { + "epoch": 1.47812569398179, + "grad_norm": 0.6916569805639394, + "learning_rate": 7.945039305242972e-06, + "loss": 0.0543, + "step": 3328 + }, + { + "epoch": 1.4785698423273375, + "grad_norm": 0.45596286134877817, + "learning_rate": 7.943472499461562e-06, + "loss": 0.0463, + "step": 3329 + }, + { + "epoch": 1.4790139906728847, + "grad_norm": 0.7085391238735931, + "learning_rate": 7.941905251226235e-06, + "loss": 0.0643, + "step": 3330 + }, + { + "epoch": 1.479458139018432, + "grad_norm": 0.878574496425455, + "learning_rate": 7.940337560772573e-06, + "loss": 0.0623, + "step": 3331 + }, + { + "epoch": 1.4799022873639796, + "grad_norm": 0.3602452580945399, + "learning_rate": 7.93876942833623e-06, + "loss": 0.0325, + "step": 3332 + }, + { + "epoch": 1.480346435709527, + "grad_norm": 0.5694019432619376, + "learning_rate": 7.937200854152917e-06, + "loss": 0.0527, + "step": 3333 + }, + { + "epoch": 1.4807905840550744, + "grad_norm": 0.46397062738751477, + "learning_rate": 7.935631838458426e-06, + "loss": 0.0435, + "step": 3334 + }, + { + "epoch": 1.4812347324006219, + "grad_norm": 0.38977994271678623, + "learning_rate": 7.9340623814886e-06, + "loss": 0.0346, + "step": 3335 + }, + { + "epoch": 1.4816788807461692, + "grad_norm": 0.5991214620269069, + "learning_rate": 7.932492483479358e-06, + "loss": 0.0519, + "step": 3336 + }, + { + "epoch": 1.4821230290917167, + "grad_norm": 0.42521594572783633, + "learning_rate": 7.930922144666679e-06, + "loss": 0.0407, + "step": 3337 + }, + { + "epoch": 1.482567177437264, + "grad_norm": 0.6068474747842975, + "learning_rate": 7.929351365286614e-06, + "loss": 0.0569, + "step": 3338 + }, + { + "epoch": 1.4830113257828115, + "grad_norm": 0.3673998638748444, + "learning_rate": 7.927780145575281e-06, + "loss": 0.0398, + "step": 3339 + }, + { + "epoch": 1.483455474128359, + "grad_norm": 0.6378602388998965, + "learning_rate": 7.926208485768856e-06, + "loss": 0.0445, + "step": 3340 + }, + { + "epoch": 1.4838996224739063, + "grad_norm": 0.4635352438588869, + "learning_rate": 7.924636386103588e-06, + "loss": 0.0472, + "step": 3341 + }, + { + "epoch": 1.4843437708194536, + "grad_norm": 0.43741344875712923, + "learning_rate": 7.923063846815791e-06, + "loss": 0.0365, + "step": 3342 + }, + { + "epoch": 1.484787919165001, + "grad_norm": 0.409274641921781, + "learning_rate": 7.921490868141843e-06, + "loss": 0.0437, + "step": 3343 + }, + { + "epoch": 1.4852320675105486, + "grad_norm": 0.44864312363339953, + "learning_rate": 7.91991745031819e-06, + "loss": 0.0365, + "step": 3344 + }, + { + "epoch": 1.4856762158560959, + "grad_norm": 0.7767506869882325, + "learning_rate": 7.918343593581344e-06, + "loss": 0.055, + "step": 3345 + }, + { + "epoch": 1.4861203642016434, + "grad_norm": 0.663893547353565, + "learning_rate": 7.916769298167881e-06, + "loss": 0.0539, + "step": 3346 + }, + { + "epoch": 1.4865645125471907, + "grad_norm": 0.44134070690444965, + "learning_rate": 7.915194564314446e-06, + "loss": 0.0451, + "step": 3347 + }, + { + "epoch": 1.4870086608927382, + "grad_norm": 0.5644812574628542, + "learning_rate": 7.913619392257748e-06, + "loss": 0.052, + "step": 3348 + }, + { + "epoch": 1.4874528092382855, + "grad_norm": 0.4293480800359804, + "learning_rate": 7.912043782234562e-06, + "loss": 0.0375, + "step": 3349 + }, + { + "epoch": 1.487896957583833, + "grad_norm": 0.4208900641552744, + "learning_rate": 7.910467734481726e-06, + "loss": 0.0263, + "step": 3350 + }, + { + "epoch": 1.4883411059293805, + "grad_norm": 0.5686191171590155, + "learning_rate": 7.90889124923615e-06, + "loss": 0.0552, + "step": 3351 + }, + { + "epoch": 1.4887852542749278, + "grad_norm": 0.45995768523750874, + "learning_rate": 7.907314326734807e-06, + "loss": 0.0392, + "step": 3352 + }, + { + "epoch": 1.4892294026204753, + "grad_norm": 0.6197595746880575, + "learning_rate": 7.905736967214735e-06, + "loss": 0.055, + "step": 3353 + }, + { + "epoch": 1.4896735509660226, + "grad_norm": 0.6356786842165147, + "learning_rate": 7.904159170913035e-06, + "loss": 0.046, + "step": 3354 + }, + { + "epoch": 1.4901176993115701, + "grad_norm": 0.4601859054406351, + "learning_rate": 7.902580938066878e-06, + "loss": 0.0414, + "step": 3355 + }, + { + "epoch": 1.4905618476571174, + "grad_norm": 0.36553812505176564, + "learning_rate": 7.901002268913501e-06, + "loss": 0.0322, + "step": 3356 + }, + { + "epoch": 1.491005996002665, + "grad_norm": 0.615226798089253, + "learning_rate": 7.899423163690204e-06, + "loss": 0.0549, + "step": 3357 + }, + { + "epoch": 1.4914501443482124, + "grad_norm": 0.4095361485944277, + "learning_rate": 7.897843622634352e-06, + "loss": 0.0387, + "step": 3358 + }, + { + "epoch": 1.4918942926937597, + "grad_norm": 0.5006702378564764, + "learning_rate": 7.896263645983378e-06, + "loss": 0.0517, + "step": 3359 + }, + { + "epoch": 1.492338441039307, + "grad_norm": 0.6244366583571412, + "learning_rate": 7.89468323397478e-06, + "loss": 0.0496, + "step": 3360 + }, + { + "epoch": 1.4927825893848545, + "grad_norm": 0.5367025221002972, + "learning_rate": 7.893102386846118e-06, + "loss": 0.0436, + "step": 3361 + }, + { + "epoch": 1.493226737730402, + "grad_norm": 0.4508013926132327, + "learning_rate": 7.891521104835023e-06, + "loss": 0.0344, + "step": 3362 + }, + { + "epoch": 1.4936708860759493, + "grad_norm": 0.6393470972778723, + "learning_rate": 7.889939388179188e-06, + "loss": 0.0526, + "step": 3363 + }, + { + "epoch": 1.4941150344214968, + "grad_norm": 0.472940776234741, + "learning_rate": 7.888357237116372e-06, + "loss": 0.0431, + "step": 3364 + }, + { + "epoch": 1.4945591827670441, + "grad_norm": 0.4993044626375099, + "learning_rate": 7.886774651884397e-06, + "loss": 0.0541, + "step": 3365 + }, + { + "epoch": 1.4950033311125916, + "grad_norm": 0.4991379094630535, + "learning_rate": 7.885191632721156e-06, + "loss": 0.0423, + "step": 3366 + }, + { + "epoch": 1.495447479458139, + "grad_norm": 0.6104766035319223, + "learning_rate": 7.8836081798646e-06, + "loss": 0.0489, + "step": 3367 + }, + { + "epoch": 1.4958916278036865, + "grad_norm": 0.582209332395513, + "learning_rate": 7.882024293552752e-06, + "loss": 0.0481, + "step": 3368 + }, + { + "epoch": 1.496335776149234, + "grad_norm": 1.2878711444761508, + "learning_rate": 7.880439974023694e-06, + "loss": 0.0895, + "step": 3369 + }, + { + "epoch": 1.4967799244947813, + "grad_norm": 0.47752757971222215, + "learning_rate": 7.87885522151558e-06, + "loss": 0.0556, + "step": 3370 + }, + { + "epoch": 1.4972240728403285, + "grad_norm": 0.34960537720365853, + "learning_rate": 7.877270036266622e-06, + "loss": 0.0361, + "step": 3371 + }, + { + "epoch": 1.497668221185876, + "grad_norm": 0.6171844191638863, + "learning_rate": 7.875684418515101e-06, + "loss": 0.0413, + "step": 3372 + }, + { + "epoch": 1.4981123695314236, + "grad_norm": 0.5888605884776164, + "learning_rate": 7.874098368499362e-06, + "loss": 0.0433, + "step": 3373 + }, + { + "epoch": 1.4985565178769709, + "grad_norm": 0.32860735680481784, + "learning_rate": 7.872511886457816e-06, + "loss": 0.029, + "step": 3374 + }, + { + "epoch": 1.4990006662225184, + "grad_norm": 0.4838256059081808, + "learning_rate": 7.87092497262894e-06, + "loss": 0.0428, + "step": 3375 + }, + { + "epoch": 1.4994448145680657, + "grad_norm": 0.5961233345859986, + "learning_rate": 7.86933762725127e-06, + "loss": 0.0568, + "step": 3376 + }, + { + "epoch": 1.4998889629136132, + "grad_norm": 0.4985166006472107, + "learning_rate": 7.867749850563414e-06, + "loss": 0.0539, + "step": 3377 + }, + { + "epoch": 1.5003331112591605, + "grad_norm": 0.8288419788268757, + "learning_rate": 7.86616164280404e-06, + "loss": 0.0376, + "step": 3378 + }, + { + "epoch": 1.500777259604708, + "grad_norm": 0.5437911475680088, + "learning_rate": 7.864573004211884e-06, + "loss": 0.0629, + "step": 3379 + }, + { + "epoch": 1.5012214079502555, + "grad_norm": 0.5173272400012955, + "learning_rate": 7.862983935025745e-06, + "loss": 0.0496, + "step": 3380 + }, + { + "epoch": 1.5016655562958028, + "grad_norm": 0.41368737678867235, + "learning_rate": 7.861394435484488e-06, + "loss": 0.0428, + "step": 3381 + }, + { + "epoch": 1.50210970464135, + "grad_norm": 0.382287625598364, + "learning_rate": 7.85980450582704e-06, + "loss": 0.0362, + "step": 3382 + }, + { + "epoch": 1.5025538529868976, + "grad_norm": 0.7914574257051636, + "learning_rate": 7.858214146292394e-06, + "loss": 0.0723, + "step": 3383 + }, + { + "epoch": 1.502998001332445, + "grad_norm": 0.600368806153385, + "learning_rate": 7.85662335711961e-06, + "loss": 0.0426, + "step": 3384 + }, + { + "epoch": 1.5034421496779924, + "grad_norm": 0.516289015563682, + "learning_rate": 7.855032138547811e-06, + "loss": 0.0401, + "step": 3385 + }, + { + "epoch": 1.50388629802354, + "grad_norm": 0.4579999933485943, + "learning_rate": 7.853440490816182e-06, + "loss": 0.0372, + "step": 3386 + }, + { + "epoch": 1.5043304463690874, + "grad_norm": 0.4924686770621345, + "learning_rate": 7.851848414163976e-06, + "loss": 0.0457, + "step": 3387 + }, + { + "epoch": 1.5047745947146347, + "grad_norm": 0.5624519083004446, + "learning_rate": 7.850255908830508e-06, + "loss": 0.0431, + "step": 3388 + }, + { + "epoch": 1.505218743060182, + "grad_norm": 0.46856624335860086, + "learning_rate": 7.848662975055161e-06, + "loss": 0.0374, + "step": 3389 + }, + { + "epoch": 1.5056628914057295, + "grad_norm": 0.6866568127731671, + "learning_rate": 7.847069613077377e-06, + "loss": 0.088, + "step": 3390 + }, + { + "epoch": 1.506107039751277, + "grad_norm": 0.5263852322647139, + "learning_rate": 7.845475823136669e-06, + "loss": 0.0475, + "step": 3391 + }, + { + "epoch": 1.5065511880968243, + "grad_norm": 0.6378963066155895, + "learning_rate": 7.843881605472606e-06, + "loss": 0.0579, + "step": 3392 + }, + { + "epoch": 1.5069953364423716, + "grad_norm": 0.8470563699341165, + "learning_rate": 7.84228696032483e-06, + "loss": 0.0685, + "step": 3393 + }, + { + "epoch": 1.5074394847879191, + "grad_norm": 0.5121438548331829, + "learning_rate": 7.840691887933042e-06, + "loss": 0.0482, + "step": 3394 + }, + { + "epoch": 1.5078836331334666, + "grad_norm": 0.46761061686619343, + "learning_rate": 7.839096388537008e-06, + "loss": 0.053, + "step": 3395 + }, + { + "epoch": 1.508327781479014, + "grad_norm": 0.568716582017773, + "learning_rate": 7.837500462376559e-06, + "loss": 0.0446, + "step": 3396 + }, + { + "epoch": 1.5087719298245614, + "grad_norm": 0.427488840303415, + "learning_rate": 7.83590410969159e-06, + "loss": 0.045, + "step": 3397 + }, + { + "epoch": 1.509216078170109, + "grad_norm": 0.5336130953887138, + "learning_rate": 7.834307330722059e-06, + "loss": 0.0501, + "step": 3398 + }, + { + "epoch": 1.5096602265156562, + "grad_norm": 0.6643085395902935, + "learning_rate": 7.832710125707991e-06, + "loss": 0.0714, + "step": 3399 + }, + { + "epoch": 1.5101043748612035, + "grad_norm": 0.7517073070810807, + "learning_rate": 7.831112494889472e-06, + "loss": 0.0574, + "step": 3400 + }, + { + "epoch": 1.510548523206751, + "grad_norm": 0.6793225641094393, + "learning_rate": 7.829514438506651e-06, + "loss": 0.054, + "step": 3401 + }, + { + "epoch": 1.5109926715522985, + "grad_norm": 0.4505747895513854, + "learning_rate": 7.827915956799745e-06, + "loss": 0.0417, + "step": 3402 + }, + { + "epoch": 1.5114368198978458, + "grad_norm": 0.7701556350457059, + "learning_rate": 7.826317050009035e-06, + "loss": 0.046, + "step": 3403 + }, + { + "epoch": 1.5118809682433931, + "grad_norm": 0.4711452448196391, + "learning_rate": 7.82471771837486e-06, + "loss": 0.0393, + "step": 3404 + }, + { + "epoch": 1.5123251165889409, + "grad_norm": 0.5955356460688755, + "learning_rate": 7.823117962137628e-06, + "loss": 0.0475, + "step": 3405 + }, + { + "epoch": 1.5127692649344882, + "grad_norm": 0.47383670753602875, + "learning_rate": 7.821517781537811e-06, + "loss": 0.0471, + "step": 3406 + }, + { + "epoch": 1.5132134132800354, + "grad_norm": 0.36906733260563584, + "learning_rate": 7.819917176815942e-06, + "loss": 0.0424, + "step": 3407 + }, + { + "epoch": 1.513657561625583, + "grad_norm": 0.5026639235608937, + "learning_rate": 7.818316148212619e-06, + "loss": 0.0478, + "step": 3408 + }, + { + "epoch": 1.5141017099711305, + "grad_norm": 0.5632301347650163, + "learning_rate": 7.816714695968503e-06, + "loss": 0.0582, + "step": 3409 + }, + { + "epoch": 1.5145458583166778, + "grad_norm": 0.5601350342899486, + "learning_rate": 7.815112820324322e-06, + "loss": 0.052, + "step": 3410 + }, + { + "epoch": 1.514990006662225, + "grad_norm": 0.44545554824483924, + "learning_rate": 7.813510521520864e-06, + "loss": 0.0437, + "step": 3411 + }, + { + "epoch": 1.5154341550077726, + "grad_norm": 0.5321000678999631, + "learning_rate": 7.811907799798981e-06, + "loss": 0.0446, + "step": 3412 + }, + { + "epoch": 1.51587830335332, + "grad_norm": 0.6123306795917254, + "learning_rate": 7.81030465539959e-06, + "loss": 0.0472, + "step": 3413 + }, + { + "epoch": 1.5163224516988674, + "grad_norm": 0.7182097530608341, + "learning_rate": 7.808701088563669e-06, + "loss": 0.0652, + "step": 3414 + }, + { + "epoch": 1.5167666000444149, + "grad_norm": 0.6740358195253988, + "learning_rate": 7.807097099532264e-06, + "loss": 0.0446, + "step": 3415 + }, + { + "epoch": 1.5172107483899624, + "grad_norm": 0.43545100676950116, + "learning_rate": 7.805492688546481e-06, + "loss": 0.0327, + "step": 3416 + }, + { + "epoch": 1.5176548967355097, + "grad_norm": 0.4493644872097168, + "learning_rate": 7.80388785584749e-06, + "loss": 0.0401, + "step": 3417 + }, + { + "epoch": 1.518099045081057, + "grad_norm": 0.4270665812232271, + "learning_rate": 7.802282601676522e-06, + "loss": 0.0517, + "step": 3418 + }, + { + "epoch": 1.5185431934266045, + "grad_norm": 0.48691036016457884, + "learning_rate": 7.800676926274881e-06, + "loss": 0.0398, + "step": 3419 + }, + { + "epoch": 1.518987341772152, + "grad_norm": 0.5623140589479151, + "learning_rate": 7.79907082988392e-06, + "loss": 0.0482, + "step": 3420 + }, + { + "epoch": 1.5194314901176993, + "grad_norm": 0.6102149116754874, + "learning_rate": 7.797464312745067e-06, + "loss": 0.0541, + "step": 3421 + }, + { + "epoch": 1.5198756384632466, + "grad_norm": 0.5623605927044153, + "learning_rate": 7.795857375099806e-06, + "loss": 0.05, + "step": 3422 + }, + { + "epoch": 1.520319786808794, + "grad_norm": 0.4402109113654038, + "learning_rate": 7.794250017189689e-06, + "loss": 0.046, + "step": 3423 + }, + { + "epoch": 1.5207639351543416, + "grad_norm": 0.48101120541164943, + "learning_rate": 7.792642239256327e-06, + "loss": 0.0461, + "step": 3424 + }, + { + "epoch": 1.521208083499889, + "grad_norm": 0.5296531840354222, + "learning_rate": 7.791034041541398e-06, + "loss": 0.0473, + "step": 3425 + }, + { + "epoch": 1.5216522318454364, + "grad_norm": 0.37685452582544965, + "learning_rate": 7.78942542428664e-06, + "loss": 0.0317, + "step": 3426 + }, + { + "epoch": 1.522096380190984, + "grad_norm": 0.3991510986249673, + "learning_rate": 7.78781638773386e-06, + "loss": 0.0347, + "step": 3427 + }, + { + "epoch": 1.5225405285365312, + "grad_norm": 0.7554838302963066, + "learning_rate": 7.786206932124918e-06, + "loss": 0.0586, + "step": 3428 + }, + { + "epoch": 1.5229846768820785, + "grad_norm": 0.42032863329599834, + "learning_rate": 7.784597057701745e-06, + "loss": 0.0252, + "step": 3429 + }, + { + "epoch": 1.523428825227626, + "grad_norm": 0.5551436730522737, + "learning_rate": 7.782986764706334e-06, + "loss": 0.0563, + "step": 3430 + }, + { + "epoch": 1.5238729735731735, + "grad_norm": 0.6187970153795604, + "learning_rate": 7.781376053380735e-06, + "loss": 0.0592, + "step": 3431 + }, + { + "epoch": 1.5243171219187208, + "grad_norm": 0.5501189193555737, + "learning_rate": 7.779764923967069e-06, + "loss": 0.0556, + "step": 3432 + }, + { + "epoch": 1.524761270264268, + "grad_norm": 0.44533955935715447, + "learning_rate": 7.778153376707513e-06, + "loss": 0.0507, + "step": 3433 + }, + { + "epoch": 1.5252054186098158, + "grad_norm": 0.4340430082400607, + "learning_rate": 7.776541411844315e-06, + "loss": 0.0385, + "step": 3434 + }, + { + "epoch": 1.5256495669553631, + "grad_norm": 0.5599425444294093, + "learning_rate": 7.774929029619775e-06, + "loss": 0.0454, + "step": 3435 + }, + { + "epoch": 1.5260937153009104, + "grad_norm": 0.3908201446163151, + "learning_rate": 7.773316230276267e-06, + "loss": 0.041, + "step": 3436 + }, + { + "epoch": 1.526537863646458, + "grad_norm": 0.6432918655836172, + "learning_rate": 7.771703014056217e-06, + "loss": 0.0664, + "step": 3437 + }, + { + "epoch": 1.5269820119920055, + "grad_norm": 0.4341973490141778, + "learning_rate": 7.770089381202121e-06, + "loss": 0.0536, + "step": 3438 + }, + { + "epoch": 1.5274261603375527, + "grad_norm": 0.5232821809072614, + "learning_rate": 7.768475331956537e-06, + "loss": 0.0506, + "step": 3439 + }, + { + "epoch": 1.5278703086831, + "grad_norm": 0.49831548658076835, + "learning_rate": 7.76686086656208e-06, + "loss": 0.0497, + "step": 3440 + }, + { + "epoch": 1.5283144570286475, + "grad_norm": 0.6350239638046895, + "learning_rate": 7.765245985261436e-06, + "loss": 0.0603, + "step": 3441 + }, + { + "epoch": 1.528758605374195, + "grad_norm": 0.8205513486577842, + "learning_rate": 7.763630688297347e-06, + "loss": 0.0555, + "step": 3442 + }, + { + "epoch": 1.5292027537197423, + "grad_norm": 0.5773711065726657, + "learning_rate": 7.76201497591262e-06, + "loss": 0.0564, + "step": 3443 + }, + { + "epoch": 1.5296469020652899, + "grad_norm": 0.42024558526878264, + "learning_rate": 7.760398848350121e-06, + "loss": 0.0327, + "step": 3444 + }, + { + "epoch": 1.5300910504108374, + "grad_norm": 0.5673807060962988, + "learning_rate": 7.758782305852787e-06, + "loss": 0.0551, + "step": 3445 + }, + { + "epoch": 1.5305351987563847, + "grad_norm": 0.4401980723100094, + "learning_rate": 7.757165348663606e-06, + "loss": 0.0341, + "step": 3446 + }, + { + "epoch": 1.530979347101932, + "grad_norm": 0.4134655160444097, + "learning_rate": 7.755547977025641e-06, + "loss": 0.0396, + "step": 3447 + }, + { + "epoch": 1.5314234954474795, + "grad_norm": 0.7907504517928898, + "learning_rate": 7.753930191182005e-06, + "loss": 0.072, + "step": 3448 + }, + { + "epoch": 1.531867643793027, + "grad_norm": 0.7425759733481122, + "learning_rate": 7.752311991375878e-06, + "loss": 0.0584, + "step": 3449 + }, + { + "epoch": 1.5323117921385743, + "grad_norm": 0.3701818263547308, + "learning_rate": 7.750693377850506e-06, + "loss": 0.0311, + "step": 3450 + }, + { + "epoch": 1.5327559404841216, + "grad_norm": 0.4194874739178355, + "learning_rate": 7.749074350849196e-06, + "loss": 0.0299, + "step": 3451 + }, + { + "epoch": 1.533200088829669, + "grad_norm": 0.399280041395756, + "learning_rate": 7.747454910615309e-06, + "loss": 0.0333, + "step": 3452 + }, + { + "epoch": 1.5336442371752166, + "grad_norm": 0.5116821227256184, + "learning_rate": 7.74583505739228e-06, + "loss": 0.053, + "step": 3453 + }, + { + "epoch": 1.5340883855207639, + "grad_norm": 0.6704114941806725, + "learning_rate": 7.744214791423597e-06, + "loss": 0.0559, + "step": 3454 + }, + { + "epoch": 1.5345325338663114, + "grad_norm": 0.4031097936225357, + "learning_rate": 7.742594112952816e-06, + "loss": 0.04, + "step": 3455 + }, + { + "epoch": 1.534976682211859, + "grad_norm": 0.39875784908896506, + "learning_rate": 7.74097302222355e-06, + "loss": 0.0359, + "step": 3456 + }, + { + "epoch": 1.5354208305574062, + "grad_norm": 0.49710136867375654, + "learning_rate": 7.739351519479479e-06, + "loss": 0.0433, + "step": 3457 + }, + { + "epoch": 1.5358649789029535, + "grad_norm": 0.587479752061736, + "learning_rate": 7.73772960496434e-06, + "loss": 0.0482, + "step": 3458 + }, + { + "epoch": 1.536309127248501, + "grad_norm": 0.9958134431642607, + "learning_rate": 7.736107278921937e-06, + "loss": 0.0604, + "step": 3459 + }, + { + "epoch": 1.5367532755940485, + "grad_norm": 0.5924386484163253, + "learning_rate": 7.73448454159613e-06, + "loss": 0.0643, + "step": 3460 + }, + { + "epoch": 1.5371974239395958, + "grad_norm": 0.4469041409309243, + "learning_rate": 7.732861393230845e-06, + "loss": 0.0377, + "step": 3461 + }, + { + "epoch": 1.537641572285143, + "grad_norm": 0.6971338285035604, + "learning_rate": 7.731237834070071e-06, + "loss": 0.0525, + "step": 3462 + }, + { + "epoch": 1.5380857206306906, + "grad_norm": 0.377836838682399, + "learning_rate": 7.729613864357854e-06, + "loss": 0.037, + "step": 3463 + }, + { + "epoch": 1.5385298689762381, + "grad_norm": 0.3909300894809345, + "learning_rate": 7.727989484338306e-06, + "loss": 0.0327, + "step": 3464 + }, + { + "epoch": 1.5389740173217854, + "grad_norm": 0.44535118870735724, + "learning_rate": 7.726364694255598e-06, + "loss": 0.0451, + "step": 3465 + }, + { + "epoch": 1.539418165667333, + "grad_norm": 0.4310182821670584, + "learning_rate": 7.724739494353963e-06, + "loss": 0.043, + "step": 3466 + }, + { + "epoch": 1.5398623140128804, + "grad_norm": 0.5072110735404695, + "learning_rate": 7.723113884877698e-06, + "loss": 0.0409, + "step": 3467 + }, + { + "epoch": 1.5403064623584277, + "grad_norm": 0.4243921084057989, + "learning_rate": 7.721487866071158e-06, + "loss": 0.0577, + "step": 3468 + }, + { + "epoch": 1.540750610703975, + "grad_norm": 0.6225210766153572, + "learning_rate": 7.71986143817876e-06, + "loss": 0.041, + "step": 3469 + }, + { + "epoch": 1.5411947590495225, + "grad_norm": 0.7207947800360287, + "learning_rate": 7.718234601444987e-06, + "loss": 0.0525, + "step": 3470 + }, + { + "epoch": 1.54163890739507, + "grad_norm": 0.6448910865817588, + "learning_rate": 7.716607356114378e-06, + "loss": 0.0691, + "step": 3471 + }, + { + "epoch": 1.5420830557406173, + "grad_norm": 0.4192180148347075, + "learning_rate": 7.714979702431537e-06, + "loss": 0.0401, + "step": 3472 + }, + { + "epoch": 1.5425272040861646, + "grad_norm": 0.5353520298031071, + "learning_rate": 7.713351640641127e-06, + "loss": 0.047, + "step": 3473 + }, + { + "epoch": 1.5429713524317124, + "grad_norm": 0.4566196176860585, + "learning_rate": 7.711723170987875e-06, + "loss": 0.0377, + "step": 3474 + }, + { + "epoch": 1.5434155007772596, + "grad_norm": 0.8744543565589462, + "learning_rate": 7.710094293716563e-06, + "loss": 0.0421, + "step": 3475 + }, + { + "epoch": 1.543859649122807, + "grad_norm": 0.4450101049065203, + "learning_rate": 7.708465009072046e-06, + "loss": 0.0368, + "step": 3476 + }, + { + "epoch": 1.5443037974683544, + "grad_norm": 0.3880510280013894, + "learning_rate": 7.706835317299228e-06, + "loss": 0.042, + "step": 3477 + }, + { + "epoch": 1.544747945813902, + "grad_norm": 0.5627812490127366, + "learning_rate": 7.705205218643079e-06, + "loss": 0.0485, + "step": 3478 + }, + { + "epoch": 1.5451920941594492, + "grad_norm": 0.5226084645577708, + "learning_rate": 7.703574713348633e-06, + "loss": 0.0392, + "step": 3479 + }, + { + "epoch": 1.5456362425049965, + "grad_norm": 0.646405817519741, + "learning_rate": 7.701943801660983e-06, + "loss": 0.0679, + "step": 3480 + }, + { + "epoch": 1.546080390850544, + "grad_norm": 0.4165107020464423, + "learning_rate": 7.700312483825281e-06, + "loss": 0.0398, + "step": 3481 + }, + { + "epoch": 1.5465245391960916, + "grad_norm": 0.4015766878565599, + "learning_rate": 7.698680760086743e-06, + "loss": 0.0397, + "step": 3482 + }, + { + "epoch": 1.5469686875416389, + "grad_norm": 0.5170446946635605, + "learning_rate": 7.697048630690642e-06, + "loss": 0.0437, + "step": 3483 + }, + { + "epoch": 1.5474128358871864, + "grad_norm": 0.6516919138739381, + "learning_rate": 7.69541609588232e-06, + "loss": 0.0487, + "step": 3484 + }, + { + "epoch": 1.5478569842327339, + "grad_norm": 0.3866667271027062, + "learning_rate": 7.69378315590717e-06, + "loss": 0.0455, + "step": 3485 + }, + { + "epoch": 1.5483011325782812, + "grad_norm": 0.4947363891869916, + "learning_rate": 7.692149811010651e-06, + "loss": 0.0455, + "step": 3486 + }, + { + "epoch": 1.5487452809238285, + "grad_norm": 0.3642222145788099, + "learning_rate": 7.690516061438287e-06, + "loss": 0.0302, + "step": 3487 + }, + { + "epoch": 1.549189429269376, + "grad_norm": 0.49548313003009337, + "learning_rate": 7.688881907435653e-06, + "loss": 0.0491, + "step": 3488 + }, + { + "epoch": 1.5496335776149235, + "grad_norm": 0.5051267185591125, + "learning_rate": 7.687247349248393e-06, + "loss": 0.037, + "step": 3489 + }, + { + "epoch": 1.5500777259604708, + "grad_norm": 0.35479785899347316, + "learning_rate": 7.685612387122206e-06, + "loss": 0.0393, + "step": 3490 + }, + { + "epoch": 1.550521874306018, + "grad_norm": 0.3491505040065738, + "learning_rate": 7.68397702130286e-06, + "loss": 0.0391, + "step": 3491 + }, + { + "epoch": 1.5509660226515656, + "grad_norm": 0.4064600432185325, + "learning_rate": 7.682341252036171e-06, + "loss": 0.0369, + "step": 3492 + }, + { + "epoch": 1.551410170997113, + "grad_norm": 0.5118602299189453, + "learning_rate": 7.68070507956803e-06, + "loss": 0.0455, + "step": 3493 + }, + { + "epoch": 1.5518543193426604, + "grad_norm": 0.4565926867283545, + "learning_rate": 7.679068504144378e-06, + "loss": 0.038, + "step": 3494 + }, + { + "epoch": 1.552298467688208, + "grad_norm": 0.7287584197582313, + "learning_rate": 7.677431526011218e-06, + "loss": 0.0569, + "step": 3495 + }, + { + "epoch": 1.5527426160337554, + "grad_norm": 0.5994194003053249, + "learning_rate": 7.67579414541462e-06, + "loss": 0.0652, + "step": 3496 + }, + { + "epoch": 1.5531867643793027, + "grad_norm": 0.826708798968826, + "learning_rate": 7.674156362600708e-06, + "loss": 0.0788, + "step": 3497 + }, + { + "epoch": 1.55363091272485, + "grad_norm": 0.6304316452414537, + "learning_rate": 7.672518177815669e-06, + "loss": 0.0447, + "step": 3498 + }, + { + "epoch": 1.5540750610703975, + "grad_norm": 0.5354755353841119, + "learning_rate": 7.67087959130575e-06, + "loss": 0.0398, + "step": 3499 + }, + { + "epoch": 1.554519209415945, + "grad_norm": 0.5926105062715095, + "learning_rate": 7.669240603317257e-06, + "loss": 0.0495, + "step": 3500 + }, + { + "epoch": 1.5549633577614923, + "grad_norm": 0.4138530773015362, + "learning_rate": 7.66760121409656e-06, + "loss": 0.0354, + "step": 3501 + }, + { + "epoch": 1.5554075061070396, + "grad_norm": 0.44256364969603434, + "learning_rate": 7.665961423890085e-06, + "loss": 0.045, + "step": 3502 + }, + { + "epoch": 1.5558516544525873, + "grad_norm": 0.47643325623563626, + "learning_rate": 7.664321232944321e-06, + "loss": 0.0403, + "step": 3503 + }, + { + "epoch": 1.5562958027981346, + "grad_norm": 0.3731544040854046, + "learning_rate": 7.662680641505817e-06, + "loss": 0.0412, + "step": 3504 + }, + { + "epoch": 1.556739951143682, + "grad_norm": 0.8553804228205281, + "learning_rate": 7.661039649821183e-06, + "loss": 0.0688, + "step": 3505 + }, + { + "epoch": 1.5571840994892294, + "grad_norm": 0.5134256859909045, + "learning_rate": 7.659398258137085e-06, + "loss": 0.044, + "step": 3506 + }, + { + "epoch": 1.557628247834777, + "grad_norm": 0.5647254301702445, + "learning_rate": 7.657756466700252e-06, + "loss": 0.0437, + "step": 3507 + }, + { + "epoch": 1.5580723961803242, + "grad_norm": 0.44604429848268956, + "learning_rate": 7.656114275757477e-06, + "loss": 0.0399, + "step": 3508 + }, + { + "epoch": 1.5585165445258715, + "grad_norm": 0.45991000971025053, + "learning_rate": 7.654471685555606e-06, + "loss": 0.0484, + "step": 3509 + }, + { + "epoch": 1.558960692871419, + "grad_norm": 0.5298662733782437, + "learning_rate": 7.65282869634155e-06, + "loss": 0.061, + "step": 3510 + }, + { + "epoch": 1.5594048412169665, + "grad_norm": 0.49816414515287183, + "learning_rate": 7.651185308362276e-06, + "loss": 0.0499, + "step": 3511 + }, + { + "epoch": 1.5598489895625138, + "grad_norm": 0.4052222432440109, + "learning_rate": 7.649541521864816e-06, + "loss": 0.0451, + "step": 3512 + }, + { + "epoch": 1.5602931379080613, + "grad_norm": 0.5042113116702226, + "learning_rate": 7.647897337096257e-06, + "loss": 0.0457, + "step": 3513 + }, + { + "epoch": 1.5607372862536089, + "grad_norm": 0.7798340643811736, + "learning_rate": 7.646252754303746e-06, + "loss": 0.1019, + "step": 3514 + }, + { + "epoch": 1.5611814345991561, + "grad_norm": 0.4972991596269239, + "learning_rate": 7.644607773734496e-06, + "loss": 0.0578, + "step": 3515 + }, + { + "epoch": 1.5616255829447034, + "grad_norm": 0.5179606694497955, + "learning_rate": 7.642962395635773e-06, + "loss": 0.0484, + "step": 3516 + }, + { + "epoch": 1.562069731290251, + "grad_norm": 0.4239628046426599, + "learning_rate": 7.641316620254907e-06, + "loss": 0.0401, + "step": 3517 + }, + { + "epoch": 1.5625138796357985, + "grad_norm": 0.41910897244128137, + "learning_rate": 7.639670447839284e-06, + "loss": 0.0431, + "step": 3518 + }, + { + "epoch": 1.5629580279813458, + "grad_norm": 0.6150053023433038, + "learning_rate": 7.638023878636353e-06, + "loss": 0.0401, + "step": 3519 + }, + { + "epoch": 1.563402176326893, + "grad_norm": 0.4401882181013756, + "learning_rate": 7.63637691289362e-06, + "loss": 0.0557, + "step": 3520 + }, + { + "epoch": 1.5638463246724406, + "grad_norm": 0.4753196454260942, + "learning_rate": 7.634729550858652e-06, + "loss": 0.0393, + "step": 3521 + }, + { + "epoch": 1.564290473017988, + "grad_norm": 0.36301949275943124, + "learning_rate": 7.633081792779079e-06, + "loss": 0.0352, + "step": 3522 + }, + { + "epoch": 1.5647346213635354, + "grad_norm": 0.6312670336926534, + "learning_rate": 7.631433638902583e-06, + "loss": 0.0458, + "step": 3523 + }, + { + "epoch": 1.5651787697090829, + "grad_norm": 0.4853790667151454, + "learning_rate": 7.629785089476912e-06, + "loss": 0.0465, + "step": 3524 + }, + { + "epoch": 1.5656229180546304, + "grad_norm": 0.5377885843949731, + "learning_rate": 7.628136144749867e-06, + "loss": 0.0432, + "step": 3525 + }, + { + "epoch": 1.5660670664001777, + "grad_norm": 0.4560714160433749, + "learning_rate": 7.626486804969316e-06, + "loss": 0.0443, + "step": 3526 + }, + { + "epoch": 1.566511214745725, + "grad_norm": 0.40682805145370315, + "learning_rate": 7.624837070383183e-06, + "loss": 0.0353, + "step": 3527 + }, + { + "epoch": 1.5669553630912725, + "grad_norm": 0.4791132302623183, + "learning_rate": 7.6231869412394495e-06, + "loss": 0.0502, + "step": 3528 + }, + { + "epoch": 1.56739951143682, + "grad_norm": 0.4343747460657118, + "learning_rate": 7.621536417786159e-06, + "loss": 0.0395, + "step": 3529 + }, + { + "epoch": 1.5678436597823673, + "grad_norm": 0.492794572243227, + "learning_rate": 7.619885500271413e-06, + "loss": 0.0361, + "step": 3530 + }, + { + "epoch": 1.5682878081279146, + "grad_norm": 0.3645392422465027, + "learning_rate": 7.618234188943372e-06, + "loss": 0.0389, + "step": 3531 + }, + { + "epoch": 1.568731956473462, + "grad_norm": 0.43518151556556955, + "learning_rate": 7.616582484050256e-06, + "loss": 0.037, + "step": 3532 + }, + { + "epoch": 1.5691761048190096, + "grad_norm": 0.5607869474661537, + "learning_rate": 7.614930385840345e-06, + "loss": 0.0494, + "step": 3533 + }, + { + "epoch": 1.5696202531645569, + "grad_norm": 0.3841819768936594, + "learning_rate": 7.613277894561978e-06, + "loss": 0.0426, + "step": 3534 + }, + { + "epoch": 1.5700644015101044, + "grad_norm": 0.3817054851480292, + "learning_rate": 7.611625010463549e-06, + "loss": 0.045, + "step": 3535 + }, + { + "epoch": 1.570508549855652, + "grad_norm": 0.8877604841062007, + "learning_rate": 7.60997173379352e-06, + "loss": 0.0511, + "step": 3536 + }, + { + "epoch": 1.5709526982011992, + "grad_norm": 0.5367152034215165, + "learning_rate": 7.608318064800403e-06, + "loss": 0.0437, + "step": 3537 + }, + { + "epoch": 1.5713968465467465, + "grad_norm": 0.37970223029486955, + "learning_rate": 7.606664003732771e-06, + "loss": 0.0426, + "step": 3538 + }, + { + "epoch": 1.571840994892294, + "grad_norm": 0.5586015168302153, + "learning_rate": 7.605009550839263e-06, + "loss": 0.0443, + "step": 3539 + }, + { + "epoch": 1.5722851432378415, + "grad_norm": 0.5314909266145166, + "learning_rate": 7.603354706368567e-06, + "loss": 0.0482, + "step": 3540 + }, + { + "epoch": 1.5727292915833888, + "grad_norm": 0.6999799538645216, + "learning_rate": 7.601699470569434e-06, + "loss": 0.0481, + "step": 3541 + }, + { + "epoch": 1.573173439928936, + "grad_norm": 0.6037904990943627, + "learning_rate": 7.600043843690677e-06, + "loss": 0.0563, + "step": 3542 + }, + { + "epoch": 1.5736175882744838, + "grad_norm": 0.47456339272048825, + "learning_rate": 7.5983878259811625e-06, + "loss": 0.069, + "step": 3543 + }, + { + "epoch": 1.5740617366200311, + "grad_norm": 0.9776402251476092, + "learning_rate": 7.59673141768982e-06, + "loss": 0.0634, + "step": 3544 + }, + { + "epoch": 1.5745058849655784, + "grad_norm": 0.430567824394842, + "learning_rate": 7.595074619065635e-06, + "loss": 0.0389, + "step": 3545 + }, + { + "epoch": 1.574950033311126, + "grad_norm": 0.592046861388675, + "learning_rate": 7.593417430357649e-06, + "loss": 0.0569, + "step": 3546 + }, + { + "epoch": 1.5753941816566734, + "grad_norm": 0.528657961671009, + "learning_rate": 7.591759851814972e-06, + "loss": 0.0477, + "step": 3547 + }, + { + "epoch": 1.5758383300022207, + "grad_norm": 0.6683751656639313, + "learning_rate": 7.590101883686761e-06, + "loss": 0.0567, + "step": 3548 + }, + { + "epoch": 1.576282478347768, + "grad_norm": 0.6152824586906789, + "learning_rate": 7.58844352622224e-06, + "loss": 0.0514, + "step": 3549 + }, + { + "epoch": 1.5767266266933155, + "grad_norm": 0.5175459666893117, + "learning_rate": 7.5867847796706865e-06, + "loss": 0.0466, + "step": 3550 + }, + { + "epoch": 1.577170775038863, + "grad_norm": 0.46509447514156393, + "learning_rate": 7.585125644281439e-06, + "loss": 0.0356, + "step": 3551 + }, + { + "epoch": 1.5776149233844103, + "grad_norm": 0.42445822473522876, + "learning_rate": 7.583466120303893e-06, + "loss": 0.0368, + "step": 3552 + }, + { + "epoch": 1.5780590717299579, + "grad_norm": 0.5278835867879181, + "learning_rate": 7.581806207987504e-06, + "loss": 0.0412, + "step": 3553 + }, + { + "epoch": 1.5785032200755054, + "grad_norm": 0.49560125396493776, + "learning_rate": 7.5801459075817865e-06, + "loss": 0.0435, + "step": 3554 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 0.5441296297037692, + "learning_rate": 7.578485219336307e-06, + "loss": 0.0408, + "step": 3555 + }, + { + "epoch": 1.5793915167666, + "grad_norm": 0.5569362734290482, + "learning_rate": 7.5768241435007e-06, + "loss": 0.0508, + "step": 3556 + }, + { + "epoch": 1.5798356651121475, + "grad_norm": 0.688395754837478, + "learning_rate": 7.57516268032465e-06, + "loss": 0.0575, + "step": 3557 + }, + { + "epoch": 1.580279813457695, + "grad_norm": 0.5366509995471049, + "learning_rate": 7.573500830057907e-06, + "loss": 0.0393, + "step": 3558 + }, + { + "epoch": 1.5807239618032423, + "grad_norm": 0.5521349728712537, + "learning_rate": 7.571838592950271e-06, + "loss": 0.0464, + "step": 3559 + }, + { + "epoch": 1.5811681101487896, + "grad_norm": 0.3794603822110102, + "learning_rate": 7.570175969251609e-06, + "loss": 0.0271, + "step": 3560 + }, + { + "epoch": 1.581612258494337, + "grad_norm": 0.954806853873049, + "learning_rate": 7.568512959211838e-06, + "loss": 0.0798, + "step": 3561 + }, + { + "epoch": 1.5820564068398846, + "grad_norm": 0.3774549347222219, + "learning_rate": 7.566849563080938e-06, + "loss": 0.034, + "step": 3562 + }, + { + "epoch": 1.5825005551854319, + "grad_norm": 0.5717245905532253, + "learning_rate": 7.565185781108944e-06, + "loss": 0.0499, + "step": 3563 + }, + { + "epoch": 1.5829447035309794, + "grad_norm": 0.5372548568959261, + "learning_rate": 7.563521613545954e-06, + "loss": 0.0543, + "step": 3564 + }, + { + "epoch": 1.583388851876527, + "grad_norm": 0.6132605143548788, + "learning_rate": 7.56185706064212e-06, + "loss": 0.0513, + "step": 3565 + }, + { + "epoch": 1.5838330002220742, + "grad_norm": 0.4784620611087414, + "learning_rate": 7.560192122647647e-06, + "loss": 0.0507, + "step": 3566 + }, + { + "epoch": 1.5842771485676215, + "grad_norm": 0.46891721716697665, + "learning_rate": 7.558526799812812e-06, + "loss": 0.0447, + "step": 3567 + }, + { + "epoch": 1.584721296913169, + "grad_norm": 0.6007245405053189, + "learning_rate": 7.556861092387937e-06, + "loss": 0.0461, + "step": 3568 + }, + { + "epoch": 1.5851654452587165, + "grad_norm": 0.6832271146214529, + "learning_rate": 7.555195000623404e-06, + "loss": 0.0615, + "step": 3569 + }, + { + "epoch": 1.5856095936042638, + "grad_norm": 0.628507751994035, + "learning_rate": 7.553528524769658e-06, + "loss": 0.0511, + "step": 3570 + }, + { + "epoch": 1.586053741949811, + "grad_norm": 0.47038947814194393, + "learning_rate": 7.551861665077199e-06, + "loss": 0.0503, + "step": 3571 + }, + { + "epoch": 1.5864978902953588, + "grad_norm": 0.5039990895794063, + "learning_rate": 7.550194421796583e-06, + "loss": 0.0757, + "step": 3572 + }, + { + "epoch": 1.586942038640906, + "grad_norm": 0.43304671963904556, + "learning_rate": 7.548526795178424e-06, + "loss": 0.0435, + "step": 3573 + }, + { + "epoch": 1.5873861869864534, + "grad_norm": 0.35567796030147675, + "learning_rate": 7.546858785473397e-06, + "loss": 0.0411, + "step": 3574 + }, + { + "epoch": 1.587830335332001, + "grad_norm": 0.5082433548982926, + "learning_rate": 7.54519039293223e-06, + "loss": 0.0476, + "step": 3575 + }, + { + "epoch": 1.5882744836775484, + "grad_norm": 0.4853723595468299, + "learning_rate": 7.543521617805711e-06, + "loss": 0.0403, + "step": 3576 + }, + { + "epoch": 1.5887186320230957, + "grad_norm": 0.4349996299659683, + "learning_rate": 7.541852460344687e-06, + "loss": 0.0444, + "step": 3577 + }, + { + "epoch": 1.589162780368643, + "grad_norm": 0.3738858521887595, + "learning_rate": 7.540182920800061e-06, + "loss": 0.0342, + "step": 3578 + }, + { + "epoch": 1.5896069287141905, + "grad_norm": 0.3860111205972228, + "learning_rate": 7.5385129994227916e-06, + "loss": 0.0326, + "step": 3579 + }, + { + "epoch": 1.590051077059738, + "grad_norm": 0.4607949804916419, + "learning_rate": 7.536842696463894e-06, + "loss": 0.0406, + "step": 3580 + }, + { + "epoch": 1.5904952254052853, + "grad_norm": 0.4090438356252611, + "learning_rate": 7.535172012174447e-06, + "loss": 0.0435, + "step": 3581 + }, + { + "epoch": 1.5909393737508328, + "grad_norm": 0.44992279283684267, + "learning_rate": 7.533500946805583e-06, + "loss": 0.0433, + "step": 3582 + }, + { + "epoch": 1.5913835220963803, + "grad_norm": 0.5608893338456544, + "learning_rate": 7.531829500608489e-06, + "loss": 0.0461, + "step": 3583 + }, + { + "epoch": 1.5918276704419276, + "grad_norm": 0.46587297637504893, + "learning_rate": 7.530157673834413e-06, + "loss": 0.0462, + "step": 3584 + }, + { + "epoch": 1.592271818787475, + "grad_norm": 0.4650339559246697, + "learning_rate": 7.528485466734658e-06, + "loss": 0.0463, + "step": 3585 + }, + { + "epoch": 1.5927159671330224, + "grad_norm": 0.6227086084109071, + "learning_rate": 7.526812879560586e-06, + "loss": 0.0579, + "step": 3586 + }, + { + "epoch": 1.59316011547857, + "grad_norm": 0.38020073620262274, + "learning_rate": 7.525139912563616e-06, + "loss": 0.0356, + "step": 3587 + }, + { + "epoch": 1.5936042638241172, + "grad_norm": 0.4538280786991795, + "learning_rate": 7.523466565995224e-06, + "loss": 0.041, + "step": 3588 + }, + { + "epoch": 1.5940484121696645, + "grad_norm": 0.41382625396603623, + "learning_rate": 7.521792840106937e-06, + "loss": 0.0309, + "step": 3589 + }, + { + "epoch": 1.594492560515212, + "grad_norm": 0.6086317820490929, + "learning_rate": 7.52011873515035e-06, + "loss": 0.055, + "step": 3590 + }, + { + "epoch": 1.5949367088607596, + "grad_norm": 0.5518315774986915, + "learning_rate": 7.518444251377108e-06, + "loss": 0.0448, + "step": 3591 + }, + { + "epoch": 1.5953808572063068, + "grad_norm": 0.46458719141632254, + "learning_rate": 7.516769389038915e-06, + "loss": 0.0375, + "step": 3592 + }, + { + "epoch": 1.5958250055518544, + "grad_norm": 0.6946531905139686, + "learning_rate": 7.515094148387529e-06, + "loss": 0.0637, + "step": 3593 + }, + { + "epoch": 1.5962691538974019, + "grad_norm": 0.47113343903410826, + "learning_rate": 7.51341852967477e-06, + "loss": 0.0403, + "step": 3594 + }, + { + "epoch": 1.5967133022429492, + "grad_norm": 0.34877248157599594, + "learning_rate": 7.511742533152509e-06, + "loss": 0.035, + "step": 3595 + }, + { + "epoch": 1.5971574505884965, + "grad_norm": 0.6597113801136277, + "learning_rate": 7.51006615907268e-06, + "loss": 0.0419, + "step": 3596 + }, + { + "epoch": 1.597601598934044, + "grad_norm": 0.6377364313042297, + "learning_rate": 7.508389407687267e-06, + "loss": 0.0503, + "step": 3597 + }, + { + "epoch": 1.5980457472795915, + "grad_norm": 0.45492565688398423, + "learning_rate": 7.506712279248316e-06, + "loss": 0.0399, + "step": 3598 + }, + { + "epoch": 1.5984898956251388, + "grad_norm": 0.4963351358417883, + "learning_rate": 7.5050347740079285e-06, + "loss": 0.0404, + "step": 3599 + }, + { + "epoch": 1.598934043970686, + "grad_norm": 0.3821109498168256, + "learning_rate": 7.503356892218261e-06, + "loss": 0.0341, + "step": 3600 + }, + { + "epoch": 1.5993781923162336, + "grad_norm": 0.5218817014308058, + "learning_rate": 7.501678634131528e-06, + "loss": 0.0457, + "step": 3601 + }, + { + "epoch": 1.599822340661781, + "grad_norm": 0.48140971211980904, + "learning_rate": 7.500000000000001e-06, + "loss": 0.0416, + "step": 3602 + }, + { + "epoch": 1.6002664890073284, + "grad_norm": 0.6724090547593772, + "learning_rate": 7.498320990076006e-06, + "loss": 0.0407, + "step": 3603 + }, + { + "epoch": 1.6007106373528759, + "grad_norm": 0.4673511339888186, + "learning_rate": 7.496641604611926e-06, + "loss": 0.0445, + "step": 3604 + }, + { + "epoch": 1.6011547856984234, + "grad_norm": 0.4736546778146936, + "learning_rate": 7.494961843860204e-06, + "loss": 0.0457, + "step": 3605 + }, + { + "epoch": 1.6015989340439707, + "grad_norm": 0.46683144779872127, + "learning_rate": 7.4932817080733345e-06, + "loss": 0.0534, + "step": 3606 + }, + { + "epoch": 1.602043082389518, + "grad_norm": 0.38959987746176267, + "learning_rate": 7.491601197503871e-06, + "loss": 0.037, + "step": 3607 + }, + { + "epoch": 1.6024872307350655, + "grad_norm": 0.5453813196476248, + "learning_rate": 7.489920312404422e-06, + "loss": 0.0524, + "step": 3608 + }, + { + "epoch": 1.602931379080613, + "grad_norm": 0.5800473949841606, + "learning_rate": 7.488239053027653e-06, + "loss": 0.0575, + "step": 3609 + }, + { + "epoch": 1.6033755274261603, + "grad_norm": 0.5289683983594908, + "learning_rate": 7.486557419626288e-06, + "loss": 0.0548, + "step": 3610 + }, + { + "epoch": 1.6038196757717076, + "grad_norm": 0.6070529095018132, + "learning_rate": 7.484875412453102e-06, + "loss": 0.0412, + "step": 3611 + }, + { + "epoch": 1.6042638241172553, + "grad_norm": 0.589551206830852, + "learning_rate": 7.483193031760932e-06, + "loss": 0.0548, + "step": 3612 + }, + { + "epoch": 1.6047079724628026, + "grad_norm": 0.4017721125553618, + "learning_rate": 7.481510277802667e-06, + "loss": 0.0294, + "step": 3613 + }, + { + "epoch": 1.60515212080835, + "grad_norm": 0.45650083282856746, + "learning_rate": 7.479827150831254e-06, + "loss": 0.0368, + "step": 3614 + }, + { + "epoch": 1.6055962691538974, + "grad_norm": 0.3377636070281279, + "learning_rate": 7.478143651099694e-06, + "loss": 0.0361, + "step": 3615 + }, + { + "epoch": 1.606040417499445, + "grad_norm": 0.4206873654117964, + "learning_rate": 7.4764597788610496e-06, + "loss": 0.0373, + "step": 3616 + }, + { + "epoch": 1.6064845658449922, + "grad_norm": 0.5196617152819853, + "learning_rate": 7.47477553436843e-06, + "loss": 0.0535, + "step": 3617 + }, + { + "epoch": 1.6069287141905395, + "grad_norm": 0.499359501911801, + "learning_rate": 7.47309091787501e-06, + "loss": 0.0414, + "step": 3618 + }, + { + "epoch": 1.607372862536087, + "grad_norm": 0.5994533032039974, + "learning_rate": 7.471405929634014e-06, + "loss": 0.0521, + "step": 3619 + }, + { + "epoch": 1.6078170108816345, + "grad_norm": 0.3309953500833457, + "learning_rate": 7.469720569898725e-06, + "loss": 0.0331, + "step": 3620 + }, + { + "epoch": 1.6082611592271818, + "grad_norm": 0.4509086857710446, + "learning_rate": 7.468034838922482e-06, + "loss": 0.0458, + "step": 3621 + }, + { + "epoch": 1.6087053075727293, + "grad_norm": 0.45137595767758343, + "learning_rate": 7.4663487369586776e-06, + "loss": 0.0469, + "step": 3622 + }, + { + "epoch": 1.6091494559182768, + "grad_norm": 0.4696695504363821, + "learning_rate": 7.464662264260761e-06, + "loss": 0.0452, + "step": 3623 + }, + { + "epoch": 1.6095936042638241, + "grad_norm": 0.4596849970347727, + "learning_rate": 7.46297542108224e-06, + "loss": 0.0438, + "step": 3624 + }, + { + "epoch": 1.6100377526093714, + "grad_norm": 0.5614676347080366, + "learning_rate": 7.4612882076766744e-06, + "loss": 0.0599, + "step": 3625 + }, + { + "epoch": 1.610481900954919, + "grad_norm": 0.3947391918010033, + "learning_rate": 7.459600624297681e-06, + "loss": 0.0347, + "step": 3626 + }, + { + "epoch": 1.6109260493004665, + "grad_norm": 0.6133457502317237, + "learning_rate": 7.4579126711989326e-06, + "loss": 0.053, + "step": 3627 + }, + { + "epoch": 1.6113701976460137, + "grad_norm": 0.45537205293561567, + "learning_rate": 7.456224348634158e-06, + "loss": 0.0436, + "step": 3628 + }, + { + "epoch": 1.611814345991561, + "grad_norm": 0.46644204998070415, + "learning_rate": 7.454535656857138e-06, + "loss": 0.0492, + "step": 3629 + }, + { + "epoch": 1.6122584943371085, + "grad_norm": 0.5228600296837344, + "learning_rate": 7.4528465961217145e-06, + "loss": 0.0521, + "step": 3630 + }, + { + "epoch": 1.612702642682656, + "grad_norm": 0.5557514684322911, + "learning_rate": 7.451157166681781e-06, + "loss": 0.0526, + "step": 3631 + }, + { + "epoch": 1.6131467910282034, + "grad_norm": 0.4758338953909275, + "learning_rate": 7.449467368791287e-06, + "loss": 0.0417, + "step": 3632 + }, + { + "epoch": 1.6135909393737509, + "grad_norm": 0.5619608884775461, + "learning_rate": 7.4477772027042395e-06, + "loss": 0.0582, + "step": 3633 + }, + { + "epoch": 1.6140350877192984, + "grad_norm": 0.46236286351850703, + "learning_rate": 7.4460866686746966e-06, + "loss": 0.0384, + "step": 3634 + }, + { + "epoch": 1.6144792360648457, + "grad_norm": 0.44941172754104425, + "learning_rate": 7.444395766956776e-06, + "loss": 0.0451, + "step": 3635 + }, + { + "epoch": 1.614923384410393, + "grad_norm": 0.5343367958828964, + "learning_rate": 7.4427044978046496e-06, + "loss": 0.0539, + "step": 3636 + }, + { + "epoch": 1.6153675327559405, + "grad_norm": 0.7114764542018213, + "learning_rate": 7.4410128614725406e-06, + "loss": 0.0632, + "step": 3637 + }, + { + "epoch": 1.615811681101488, + "grad_norm": 0.33754698523568316, + "learning_rate": 7.439320858214736e-06, + "loss": 0.0432, + "step": 3638 + }, + { + "epoch": 1.6162558294470353, + "grad_norm": 0.43643183273515707, + "learning_rate": 7.437628488285568e-06, + "loss": 0.0379, + "step": 3639 + }, + { + "epoch": 1.6166999777925826, + "grad_norm": 0.44390629309704815, + "learning_rate": 7.435935751939429e-06, + "loss": 0.0446, + "step": 3640 + }, + { + "epoch": 1.6171441261381303, + "grad_norm": 0.4354396645312695, + "learning_rate": 7.4342426494307695e-06, + "loss": 0.043, + "step": 3641 + }, + { + "epoch": 1.6175882744836776, + "grad_norm": 0.44873911764176055, + "learning_rate": 7.432549181014088e-06, + "loss": 0.0302, + "step": 3642 + }, + { + "epoch": 1.6180324228292249, + "grad_norm": 0.41154412795322054, + "learning_rate": 7.430855346943942e-06, + "loss": 0.0395, + "step": 3643 + }, + { + "epoch": 1.6184765711747724, + "grad_norm": 0.5287832714190103, + "learning_rate": 7.4291611474749455e-06, + "loss": 0.0473, + "step": 3644 + }, + { + "epoch": 1.61892071952032, + "grad_norm": 0.4007407767223763, + "learning_rate": 7.427466582861765e-06, + "loss": 0.0378, + "step": 3645 + }, + { + "epoch": 1.6193648678658672, + "grad_norm": 0.58929177581027, + "learning_rate": 7.42577165335912e-06, + "loss": 0.0551, + "step": 3646 + }, + { + "epoch": 1.6198090162114145, + "grad_norm": 0.3940542333112033, + "learning_rate": 7.42407635922179e-06, + "loss": 0.0294, + "step": 3647 + }, + { + "epoch": 1.620253164556962, + "grad_norm": 0.3862139275428533, + "learning_rate": 7.4223807007046045e-06, + "loss": 0.0318, + "step": 3648 + }, + { + "epoch": 1.6206973129025095, + "grad_norm": 0.4233634000349271, + "learning_rate": 7.4206846780624505e-06, + "loss": 0.0351, + "step": 3649 + }, + { + "epoch": 1.6211414612480568, + "grad_norm": 0.595696689600085, + "learning_rate": 7.418988291550271e-06, + "loss": 0.05, + "step": 3650 + }, + { + "epoch": 1.6215856095936043, + "grad_norm": 0.45676187183693634, + "learning_rate": 7.417291541423057e-06, + "loss": 0.0408, + "step": 3651 + }, + { + "epoch": 1.6220297579391518, + "grad_norm": 0.5662085190370286, + "learning_rate": 7.415594427935864e-06, + "loss": 0.0449, + "step": 3652 + }, + { + "epoch": 1.6224739062846991, + "grad_norm": 0.5893188179834489, + "learning_rate": 7.4138969513437945e-06, + "loss": 0.0514, + "step": 3653 + }, + { + "epoch": 1.6229180546302464, + "grad_norm": 1.1475511733222958, + "learning_rate": 7.412199111902007e-06, + "loss": 0.0539, + "step": 3654 + }, + { + "epoch": 1.623362202975794, + "grad_norm": 0.4595814551148835, + "learning_rate": 7.410500909865718e-06, + "loss": 0.0422, + "step": 3655 + }, + { + "epoch": 1.6238063513213414, + "grad_norm": 0.4905543614968632, + "learning_rate": 7.408802345490194e-06, + "loss": 0.0426, + "step": 3656 + }, + { + "epoch": 1.6242504996668887, + "grad_norm": 0.44546697092666765, + "learning_rate": 7.407103419030759e-06, + "loss": 0.0543, + "step": 3657 + }, + { + "epoch": 1.624694648012436, + "grad_norm": 0.4740804743160003, + "learning_rate": 7.405404130742793e-06, + "loss": 0.0493, + "step": 3658 + }, + { + "epoch": 1.6251387963579835, + "grad_norm": 1.5473311509032195, + "learning_rate": 7.4037044808817224e-06, + "loss": 0.0513, + "step": 3659 + }, + { + "epoch": 1.625582944703531, + "grad_norm": 0.6098239500874039, + "learning_rate": 7.402004469703038e-06, + "loss": 0.0367, + "step": 3660 + }, + { + "epoch": 1.6260270930490783, + "grad_norm": 0.6586494701114967, + "learning_rate": 7.4003040974622784e-06, + "loss": 0.0585, + "step": 3661 + }, + { + "epoch": 1.6264712413946258, + "grad_norm": 0.45627803806709566, + "learning_rate": 7.39860336441504e-06, + "loss": 0.048, + "step": 3662 + }, + { + "epoch": 1.6269153897401734, + "grad_norm": 0.5771583864203911, + "learning_rate": 7.3969022708169695e-06, + "loss": 0.0378, + "step": 3663 + }, + { + "epoch": 1.6273595380857206, + "grad_norm": 0.4931171394680339, + "learning_rate": 7.395200816923774e-06, + "loss": 0.033, + "step": 3664 + }, + { + "epoch": 1.627803686431268, + "grad_norm": 0.7206307962976604, + "learning_rate": 7.393499002991206e-06, + "loss": 0.0541, + "step": 3665 + }, + { + "epoch": 1.6282478347768155, + "grad_norm": 0.4622361606059328, + "learning_rate": 7.3917968292750785e-06, + "loss": 0.0415, + "step": 3666 + }, + { + "epoch": 1.628691983122363, + "grad_norm": 0.7660149085068831, + "learning_rate": 7.390094296031259e-06, + "loss": 0.0627, + "step": 3667 + }, + { + "epoch": 1.6291361314679103, + "grad_norm": 0.7776923661917167, + "learning_rate": 7.3883914035156666e-06, + "loss": 0.0396, + "step": 3668 + }, + { + "epoch": 1.6295802798134575, + "grad_norm": 0.6556574717804856, + "learning_rate": 7.386688151984275e-06, + "loss": 0.0449, + "step": 3669 + }, + { + "epoch": 1.630024428159005, + "grad_norm": 0.6486302576198958, + "learning_rate": 7.384984541693111e-06, + "loss": 0.0456, + "step": 3670 + }, + { + "epoch": 1.6304685765045526, + "grad_norm": 0.38753606074247743, + "learning_rate": 7.383280572898256e-06, + "loss": 0.0374, + "step": 3671 + }, + { + "epoch": 1.6309127248500999, + "grad_norm": 0.4446665040074296, + "learning_rate": 7.381576245855847e-06, + "loss": 0.0399, + "step": 3672 + }, + { + "epoch": 1.6313568731956474, + "grad_norm": 0.6808941373857945, + "learning_rate": 7.379871560822071e-06, + "loss": 0.0523, + "step": 3673 + }, + { + "epoch": 1.6318010215411949, + "grad_norm": 0.6878067318911092, + "learning_rate": 7.378166518053174e-06, + "loss": 0.0557, + "step": 3674 + }, + { + "epoch": 1.6322451698867422, + "grad_norm": 0.4460005003836504, + "learning_rate": 7.37646111780545e-06, + "loss": 0.0338, + "step": 3675 + }, + { + "epoch": 1.6326893182322895, + "grad_norm": 0.392173745509544, + "learning_rate": 7.374755360335253e-06, + "loss": 0.0358, + "step": 3676 + }, + { + "epoch": 1.633133466577837, + "grad_norm": 0.4942681170970006, + "learning_rate": 7.3730492458989825e-06, + "loss": 0.0451, + "step": 3677 + }, + { + "epoch": 1.6335776149233845, + "grad_norm": 0.35125391907425363, + "learning_rate": 7.371342774753101e-06, + "loss": 0.0376, + "step": 3678 + }, + { + "epoch": 1.6340217632689318, + "grad_norm": 0.5143098431682592, + "learning_rate": 7.369635947154119e-06, + "loss": 0.0469, + "step": 3679 + }, + { + "epoch": 1.634465911614479, + "grad_norm": 0.5196302676867172, + "learning_rate": 7.3679287633585995e-06, + "loss": 0.0436, + "step": 3680 + }, + { + "epoch": 1.6349100599600268, + "grad_norm": 0.5148856987270073, + "learning_rate": 7.366221223623163e-06, + "loss": 0.0429, + "step": 3681 + }, + { + "epoch": 1.635354208305574, + "grad_norm": 0.42764696599583774, + "learning_rate": 7.3645133282044835e-06, + "loss": 0.0333, + "step": 3682 + }, + { + "epoch": 1.6357983566511214, + "grad_norm": 0.3662627068077827, + "learning_rate": 7.362805077359283e-06, + "loss": 0.0312, + "step": 3683 + }, + { + "epoch": 1.636242504996669, + "grad_norm": 0.5318925951291891, + "learning_rate": 7.361096471344341e-06, + "loss": 0.0392, + "step": 3684 + }, + { + "epoch": 1.6366866533422164, + "grad_norm": 0.5101772714710058, + "learning_rate": 7.359387510416494e-06, + "loss": 0.0444, + "step": 3685 + }, + { + "epoch": 1.6371308016877637, + "grad_norm": 0.6838175180628748, + "learning_rate": 7.357678194832623e-06, + "loss": 0.0443, + "step": 3686 + }, + { + "epoch": 1.637574950033311, + "grad_norm": 0.4298262192631913, + "learning_rate": 7.355968524849671e-06, + "loss": 0.0397, + "step": 3687 + }, + { + "epoch": 1.6380190983788585, + "grad_norm": 0.5227260046796232, + "learning_rate": 7.354258500724627e-06, + "loss": 0.0428, + "step": 3688 + }, + { + "epoch": 1.638463246724406, + "grad_norm": 0.3848693505144518, + "learning_rate": 7.352548122714541e-06, + "loss": 0.038, + "step": 3689 + }, + { + "epoch": 1.6389073950699533, + "grad_norm": 0.35605605527782813, + "learning_rate": 7.350837391076509e-06, + "loss": 0.0385, + "step": 3690 + }, + { + "epoch": 1.6393515434155008, + "grad_norm": 0.5496746740887447, + "learning_rate": 7.349126306067681e-06, + "loss": 0.0399, + "step": 3691 + }, + { + "epoch": 1.6397956917610483, + "grad_norm": 0.3577570105037349, + "learning_rate": 7.347414867945266e-06, + "loss": 0.035, + "step": 3692 + }, + { + "epoch": 1.6402398401065956, + "grad_norm": 0.4964615292337297, + "learning_rate": 7.345703076966522e-06, + "loss": 0.0517, + "step": 3693 + }, + { + "epoch": 1.640683988452143, + "grad_norm": 0.7390592067058361, + "learning_rate": 7.343990933388757e-06, + "loss": 0.0367, + "step": 3694 + }, + { + "epoch": 1.6411281367976904, + "grad_norm": 0.3720816990200118, + "learning_rate": 7.342278437469338e-06, + "loss": 0.0385, + "step": 3695 + }, + { + "epoch": 1.641572285143238, + "grad_norm": 0.4417091992699088, + "learning_rate": 7.340565589465681e-06, + "loss": 0.0496, + "step": 3696 + }, + { + "epoch": 1.6420164334887852, + "grad_norm": 0.4387119629881648, + "learning_rate": 7.338852389635258e-06, + "loss": 0.0482, + "step": 3697 + }, + { + "epoch": 1.6424605818343325, + "grad_norm": 0.5237427797642873, + "learning_rate": 7.337138838235589e-06, + "loss": 0.0397, + "step": 3698 + }, + { + "epoch": 1.64290473017988, + "grad_norm": 0.5126897468237226, + "learning_rate": 7.335424935524254e-06, + "loss": 0.0354, + "step": 3699 + }, + { + "epoch": 1.6433488785254275, + "grad_norm": 0.5632224593772709, + "learning_rate": 7.333710681758876e-06, + "loss": 0.0515, + "step": 3700 + }, + { + "epoch": 1.6437930268709748, + "grad_norm": 0.37651906930453466, + "learning_rate": 7.331996077197141e-06, + "loss": 0.0263, + "step": 3701 + }, + { + "epoch": 1.6442371752165224, + "grad_norm": 0.403180672879644, + "learning_rate": 7.330281122096783e-06, + "loss": 0.0361, + "step": 3702 + }, + { + "epoch": 1.6446813235620699, + "grad_norm": 0.3870458503286661, + "learning_rate": 7.328565816715587e-06, + "loss": 0.0391, + "step": 3703 + }, + { + "epoch": 1.6451254719076172, + "grad_norm": 0.46910403259693756, + "learning_rate": 7.326850161311394e-06, + "loss": 0.0539, + "step": 3704 + }, + { + "epoch": 1.6455696202531644, + "grad_norm": 0.6277176707566176, + "learning_rate": 7.325134156142093e-06, + "loss": 0.0434, + "step": 3705 + }, + { + "epoch": 1.646013768598712, + "grad_norm": 0.5311345861511363, + "learning_rate": 7.323417801465633e-06, + "loss": 0.0528, + "step": 3706 + }, + { + "epoch": 1.6464579169442595, + "grad_norm": 0.49558316497497373, + "learning_rate": 7.32170109754001e-06, + "loss": 0.0518, + "step": 3707 + }, + { + "epoch": 1.6469020652898068, + "grad_norm": 0.4215235777284794, + "learning_rate": 7.319984044623274e-06, + "loss": 0.0374, + "step": 3708 + }, + { + "epoch": 1.647346213635354, + "grad_norm": 0.7018323227064814, + "learning_rate": 7.3182666429735236e-06, + "loss": 0.0589, + "step": 3709 + }, + { + "epoch": 1.6477903619809018, + "grad_norm": 0.3811938009083553, + "learning_rate": 7.316548892848919e-06, + "loss": 0.0336, + "step": 3710 + }, + { + "epoch": 1.648234510326449, + "grad_norm": 0.48162066324285185, + "learning_rate": 7.314830794507664e-06, + "loss": 0.0392, + "step": 3711 + }, + { + "epoch": 1.6486786586719964, + "grad_norm": 0.7241447798938966, + "learning_rate": 7.313112348208017e-06, + "loss": 0.0705, + "step": 3712 + }, + { + "epoch": 1.6491228070175439, + "grad_norm": 0.4040868157055559, + "learning_rate": 7.311393554208292e-06, + "loss": 0.0417, + "step": 3713 + }, + { + "epoch": 1.6495669553630914, + "grad_norm": 0.43212705890325137, + "learning_rate": 7.3096744127668515e-06, + "loss": 0.0516, + "step": 3714 + }, + { + "epoch": 1.6500111037086387, + "grad_norm": 0.7227761525435585, + "learning_rate": 7.307954924142113e-06, + "loss": 0.0427, + "step": 3715 + }, + { + "epoch": 1.650455252054186, + "grad_norm": 0.6454421002922318, + "learning_rate": 7.306235088592545e-06, + "loss": 0.0709, + "step": 3716 + }, + { + "epoch": 1.6508994003997335, + "grad_norm": 0.5263009072366842, + "learning_rate": 7.304514906376665e-06, + "loss": 0.0354, + "step": 3717 + }, + { + "epoch": 1.651343548745281, + "grad_norm": 0.4475608856046002, + "learning_rate": 7.3027943777530504e-06, + "loss": 0.0376, + "step": 3718 + }, + { + "epoch": 1.6517876970908283, + "grad_norm": 0.4187029855449, + "learning_rate": 7.301073502980321e-06, + "loss": 0.032, + "step": 3719 + }, + { + "epoch": 1.6522318454363758, + "grad_norm": 0.4688452297403804, + "learning_rate": 7.299352282317156e-06, + "loss": 0.0328, + "step": 3720 + }, + { + "epoch": 1.6526759937819233, + "grad_norm": 0.4905958290493516, + "learning_rate": 7.297630716022285e-06, + "loss": 0.0408, + "step": 3721 + }, + { + "epoch": 1.6531201421274706, + "grad_norm": 0.5196371163361964, + "learning_rate": 7.295908804354486e-06, + "loss": 0.0518, + "step": 3722 + }, + { + "epoch": 1.653564290473018, + "grad_norm": 0.5745303077003567, + "learning_rate": 7.294186547572593e-06, + "loss": 0.0525, + "step": 3723 + }, + { + "epoch": 1.6540084388185654, + "grad_norm": 0.4232397957508058, + "learning_rate": 7.292463945935492e-06, + "loss": 0.046, + "step": 3724 + }, + { + "epoch": 1.654452587164113, + "grad_norm": 0.5715502169683124, + "learning_rate": 7.290740999702117e-06, + "loss": 0.0502, + "step": 3725 + }, + { + "epoch": 1.6548967355096602, + "grad_norm": 0.45041445198482544, + "learning_rate": 7.289017709131456e-06, + "loss": 0.0385, + "step": 3726 + }, + { + "epoch": 1.6553408838552075, + "grad_norm": 0.5898309113830352, + "learning_rate": 7.287294074482551e-06, + "loss": 0.0586, + "step": 3727 + }, + { + "epoch": 1.655785032200755, + "grad_norm": 0.42537829603624505, + "learning_rate": 7.285570096014491e-06, + "loss": 0.0382, + "step": 3728 + }, + { + "epoch": 1.6562291805463025, + "grad_norm": 0.5068748041039001, + "learning_rate": 7.283845773986421e-06, + "loss": 0.0424, + "step": 3729 + }, + { + "epoch": 1.6566733288918498, + "grad_norm": 0.49158429307244256, + "learning_rate": 7.2821211086575365e-06, + "loss": 0.0413, + "step": 3730 + }, + { + "epoch": 1.6571174772373973, + "grad_norm": 0.5434741115681563, + "learning_rate": 7.280396100287082e-06, + "loss": 0.0415, + "step": 3731 + }, + { + "epoch": 1.6575616255829448, + "grad_norm": 0.54047842476067, + "learning_rate": 7.278670749134356e-06, + "loss": 0.0337, + "step": 3732 + }, + { + "epoch": 1.6580057739284921, + "grad_norm": 0.7610613678272631, + "learning_rate": 7.276945055458709e-06, + "loss": 0.0513, + "step": 3733 + }, + { + "epoch": 1.6584499222740394, + "grad_norm": 0.43103079664602795, + "learning_rate": 7.275219019519542e-06, + "loss": 0.0365, + "step": 3734 + }, + { + "epoch": 1.658894070619587, + "grad_norm": 0.6707297267218786, + "learning_rate": 7.2734926415763074e-06, + "loss": 0.0425, + "step": 3735 + }, + { + "epoch": 1.6593382189651344, + "grad_norm": 0.3906372094311339, + "learning_rate": 7.271765921888507e-06, + "loss": 0.0379, + "step": 3736 + }, + { + "epoch": 1.6597823673106817, + "grad_norm": 0.8497034117571121, + "learning_rate": 7.2700388607157e-06, + "loss": 0.0592, + "step": 3737 + }, + { + "epoch": 1.660226515656229, + "grad_norm": 0.4906869751133543, + "learning_rate": 7.268311458317491e-06, + "loss": 0.0448, + "step": 3738 + }, + { + "epoch": 1.6606706640017765, + "grad_norm": 0.46268372537996755, + "learning_rate": 7.266583714953536e-06, + "loss": 0.0363, + "step": 3739 + }, + { + "epoch": 1.661114812347324, + "grad_norm": 0.3974000929871205, + "learning_rate": 7.2648556308835476e-06, + "loss": 0.0375, + "step": 3740 + }, + { + "epoch": 1.6615589606928713, + "grad_norm": 0.3782320844424881, + "learning_rate": 7.263127206367285e-06, + "loss": 0.0327, + "step": 3741 + }, + { + "epoch": 1.6620031090384189, + "grad_norm": 0.7073636631807192, + "learning_rate": 7.2613984416645586e-06, + "loss": 0.0549, + "step": 3742 + }, + { + "epoch": 1.6624472573839664, + "grad_norm": 0.48125971182332594, + "learning_rate": 7.2596693370352325e-06, + "loss": 0.0342, + "step": 3743 + }, + { + "epoch": 1.6628914057295137, + "grad_norm": 0.4622696042568772, + "learning_rate": 7.257939892739221e-06, + "loss": 0.0366, + "step": 3744 + }, + { + "epoch": 1.663335554075061, + "grad_norm": 0.4506482837249478, + "learning_rate": 7.256210109036485e-06, + "loss": 0.0399, + "step": 3745 + }, + { + "epoch": 1.6637797024206085, + "grad_norm": 0.468794971821978, + "learning_rate": 7.254479986187045e-06, + "loss": 0.0486, + "step": 3746 + }, + { + "epoch": 1.664223850766156, + "grad_norm": 0.461349555479564, + "learning_rate": 7.252749524450967e-06, + "loss": 0.0399, + "step": 3747 + }, + { + "epoch": 1.6646679991117033, + "grad_norm": 0.35487507978196975, + "learning_rate": 7.251018724088367e-06, + "loss": 0.0399, + "step": 3748 + }, + { + "epoch": 1.6651121474572506, + "grad_norm": 0.5665408950529203, + "learning_rate": 7.249287585359416e-06, + "loss": 0.0515, + "step": 3749 + }, + { + "epoch": 1.6655562958027983, + "grad_norm": 0.47765729176525834, + "learning_rate": 7.24755610852433e-06, + "loss": 0.0524, + "step": 3750 + }, + { + "epoch": 1.6660004441483456, + "grad_norm": 0.49560492659442207, + "learning_rate": 7.245824293843382e-06, + "loss": 0.0474, + "step": 3751 + }, + { + "epoch": 1.6664445924938929, + "grad_norm": 0.47858236763124845, + "learning_rate": 7.244092141576895e-06, + "loss": 0.0404, + "step": 3752 + }, + { + "epoch": 1.6668887408394404, + "grad_norm": 0.5242443798051569, + "learning_rate": 7.2423596519852354e-06, + "loss": 0.0564, + "step": 3753 + }, + { + "epoch": 1.667332889184988, + "grad_norm": 0.4276118939925525, + "learning_rate": 7.240626825328832e-06, + "loss": 0.0396, + "step": 3754 + }, + { + "epoch": 1.6677770375305352, + "grad_norm": 0.365991499711862, + "learning_rate": 7.238893661868154e-06, + "loss": 0.0373, + "step": 3755 + }, + { + "epoch": 1.6682211858760825, + "grad_norm": 0.614415895237667, + "learning_rate": 7.237160161863725e-06, + "loss": 0.0528, + "step": 3756 + }, + { + "epoch": 1.66866533422163, + "grad_norm": 0.40303741888400607, + "learning_rate": 7.235426325576123e-06, + "loss": 0.0407, + "step": 3757 + }, + { + "epoch": 1.6691094825671775, + "grad_norm": 0.5735579153246894, + "learning_rate": 7.23369215326597e-06, + "loss": 0.047, + "step": 3758 + }, + { + "epoch": 1.6695536309127248, + "grad_norm": 0.6025767202418845, + "learning_rate": 7.231957645193943e-06, + "loss": 0.0375, + "step": 3759 + }, + { + "epoch": 1.6699977792582723, + "grad_norm": 0.4473170515938267, + "learning_rate": 7.2302228016207666e-06, + "loss": 0.0483, + "step": 3760 + }, + { + "epoch": 1.6704419276038198, + "grad_norm": 0.677052589914415, + "learning_rate": 7.2284876228072195e-06, + "loss": 0.0635, + "step": 3761 + }, + { + "epoch": 1.6708860759493671, + "grad_norm": 0.5710811815695489, + "learning_rate": 7.226752109014127e-06, + "loss": 0.0464, + "step": 3762 + }, + { + "epoch": 1.6713302242949144, + "grad_norm": 0.36528232704433305, + "learning_rate": 7.225016260502366e-06, + "loss": 0.0372, + "step": 3763 + }, + { + "epoch": 1.671774372640462, + "grad_norm": 0.5011404570603847, + "learning_rate": 7.223280077532866e-06, + "loss": 0.0361, + "step": 3764 + }, + { + "epoch": 1.6722185209860094, + "grad_norm": 0.9792675536360905, + "learning_rate": 7.221543560366602e-06, + "loss": 0.0695, + "step": 3765 + }, + { + "epoch": 1.6726626693315567, + "grad_norm": 0.40690296889128674, + "learning_rate": 7.219806709264605e-06, + "loss": 0.0434, + "step": 3766 + }, + { + "epoch": 1.673106817677104, + "grad_norm": 0.454187985377748, + "learning_rate": 7.21806952448795e-06, + "loss": 0.0353, + "step": 3767 + }, + { + "epoch": 1.6735509660226515, + "grad_norm": 0.41286435056484855, + "learning_rate": 7.216332006297769e-06, + "loss": 0.0379, + "step": 3768 + }, + { + "epoch": 1.673995114368199, + "grad_norm": 0.6022194735287347, + "learning_rate": 7.2145941549552364e-06, + "loss": 0.0491, + "step": 3769 + }, + { + "epoch": 1.6744392627137463, + "grad_norm": 0.368180906342434, + "learning_rate": 7.212855970721584e-06, + "loss": 0.0302, + "step": 3770 + }, + { + "epoch": 1.6748834110592938, + "grad_norm": 0.3923410824291727, + "learning_rate": 7.211117453858088e-06, + "loss": 0.0434, + "step": 3771 + }, + { + "epoch": 1.6753275594048413, + "grad_norm": 0.5009695065668602, + "learning_rate": 7.209378604626081e-06, + "loss": 0.042, + "step": 3772 + }, + { + "epoch": 1.6757717077503886, + "grad_norm": 0.41349339679074837, + "learning_rate": 7.207639423286938e-06, + "loss": 0.0363, + "step": 3773 + }, + { + "epoch": 1.676215856095936, + "grad_norm": 0.363113317588028, + "learning_rate": 7.205899910102087e-06, + "loss": 0.0502, + "step": 3774 + }, + { + "epoch": 1.6766600044414834, + "grad_norm": 0.5667120483543412, + "learning_rate": 7.204160065333009e-06, + "loss": 0.047, + "step": 3775 + }, + { + "epoch": 1.677104152787031, + "grad_norm": 0.6591252875236339, + "learning_rate": 7.202419889241231e-06, + "loss": 0.0622, + "step": 3776 + }, + { + "epoch": 1.6775483011325782, + "grad_norm": 0.4627599954413904, + "learning_rate": 7.2006793820883315e-06, + "loss": 0.0294, + "step": 3777 + }, + { + "epoch": 1.6779924494781255, + "grad_norm": 0.35617793994157315, + "learning_rate": 7.198938544135936e-06, + "loss": 0.0302, + "step": 3778 + }, + { + "epoch": 1.6784365978236733, + "grad_norm": 0.4097862631071463, + "learning_rate": 7.197197375645724e-06, + "loss": 0.0414, + "step": 3779 + }, + { + "epoch": 1.6788807461692206, + "grad_norm": 0.675818069401739, + "learning_rate": 7.195455876879425e-06, + "loss": 0.0386, + "step": 3780 + }, + { + "epoch": 1.6793248945147679, + "grad_norm": 0.5308299724023986, + "learning_rate": 7.193714048098812e-06, + "loss": 0.0425, + "step": 3781 + }, + { + "epoch": 1.6797690428603154, + "grad_norm": 0.5374963949654195, + "learning_rate": 7.191971889565713e-06, + "loss": 0.0381, + "step": 3782 + }, + { + "epoch": 1.6802131912058629, + "grad_norm": 0.5783394947551381, + "learning_rate": 7.190229401542004e-06, + "loss": 0.0568, + "step": 3783 + }, + { + "epoch": 1.6806573395514102, + "grad_norm": 0.5082898930645872, + "learning_rate": 7.18848658428961e-06, + "loss": 0.0421, + "step": 3784 + }, + { + "epoch": 1.6811014878969575, + "grad_norm": 0.4025027461987578, + "learning_rate": 7.186743438070507e-06, + "loss": 0.0311, + "step": 3785 + }, + { + "epoch": 1.681545636242505, + "grad_norm": 0.3763513501474927, + "learning_rate": 7.1849999631467194e-06, + "loss": 0.0334, + "step": 3786 + }, + { + "epoch": 1.6819897845880525, + "grad_norm": 0.4272449780002536, + "learning_rate": 7.183256159780321e-06, + "loss": 0.0461, + "step": 3787 + }, + { + "epoch": 1.6824339329335998, + "grad_norm": 0.6764426293492224, + "learning_rate": 7.181512028233433e-06, + "loss": 0.0494, + "step": 3788 + }, + { + "epoch": 1.6828780812791473, + "grad_norm": 0.4493107523320909, + "learning_rate": 7.17976756876823e-06, + "loss": 0.0407, + "step": 3789 + }, + { + "epoch": 1.6833222296246948, + "grad_norm": 0.5311611910450604, + "learning_rate": 7.178022781646936e-06, + "loss": 0.0446, + "step": 3790 + }, + { + "epoch": 1.683766377970242, + "grad_norm": 0.49447252037240147, + "learning_rate": 7.176277667131817e-06, + "loss": 0.0418, + "step": 3791 + }, + { + "epoch": 1.6842105263157894, + "grad_norm": 0.5692208839762621, + "learning_rate": 7.1745322254851966e-06, + "loss": 0.0492, + "step": 3792 + }, + { + "epoch": 1.684654674661337, + "grad_norm": 0.5169615968797148, + "learning_rate": 7.172786456969445e-06, + "loss": 0.0529, + "step": 3793 + }, + { + "epoch": 1.6850988230068844, + "grad_norm": 0.393440341612259, + "learning_rate": 7.171040361846979e-06, + "loss": 0.0467, + "step": 3794 + }, + { + "epoch": 1.6855429713524317, + "grad_norm": 0.4049011862637367, + "learning_rate": 7.1692939403802676e-06, + "loss": 0.0395, + "step": 3795 + }, + { + "epoch": 1.685987119697979, + "grad_norm": 0.33756471256165815, + "learning_rate": 7.167547192831827e-06, + "loss": 0.0321, + "step": 3796 + }, + { + "epoch": 1.6864312680435265, + "grad_norm": 0.4670323249605969, + "learning_rate": 7.1658001194642225e-06, + "loss": 0.0433, + "step": 3797 + }, + { + "epoch": 1.686875416389074, + "grad_norm": 0.47348531343201694, + "learning_rate": 7.16405272054007e-06, + "loss": 0.0391, + "step": 3798 + }, + { + "epoch": 1.6873195647346213, + "grad_norm": 0.5297792409513463, + "learning_rate": 7.1623049963220325e-06, + "loss": 0.0396, + "step": 3799 + }, + { + "epoch": 1.6877637130801688, + "grad_norm": 0.6022176716427052, + "learning_rate": 7.160556947072823e-06, + "loss": 0.0457, + "step": 3800 + }, + { + "epoch": 1.6882078614257163, + "grad_norm": 0.4342556651872498, + "learning_rate": 7.158808573055205e-06, + "loss": 0.0406, + "step": 3801 + }, + { + "epoch": 1.6886520097712636, + "grad_norm": 0.571370547347438, + "learning_rate": 7.157059874531982e-06, + "loss": 0.039, + "step": 3802 + }, + { + "epoch": 1.689096158116811, + "grad_norm": 0.6752816385308873, + "learning_rate": 7.155310851766022e-06, + "loss": 0.0518, + "step": 3803 + }, + { + "epoch": 1.6895403064623584, + "grad_norm": 0.4681922042318178, + "learning_rate": 7.153561505020228e-06, + "loss": 0.0487, + "step": 3804 + }, + { + "epoch": 1.689984454807906, + "grad_norm": 0.4722352367968629, + "learning_rate": 7.151811834557556e-06, + "loss": 0.0506, + "step": 3805 + }, + { + "epoch": 1.6904286031534532, + "grad_norm": 0.46550029686119654, + "learning_rate": 7.150061840641012e-06, + "loss": 0.042, + "step": 3806 + }, + { + "epoch": 1.6908727514990005, + "grad_norm": 0.46380378977431014, + "learning_rate": 7.148311523533652e-06, + "loss": 0.051, + "step": 3807 + }, + { + "epoch": 1.691316899844548, + "grad_norm": 0.40338431598829894, + "learning_rate": 7.146560883498575e-06, + "loss": 0.0358, + "step": 3808 + }, + { + "epoch": 1.6917610481900955, + "grad_norm": 0.5997068568205779, + "learning_rate": 7.144809920798934e-06, + "loss": 0.0549, + "step": 3809 + }, + { + "epoch": 1.6922051965356428, + "grad_norm": 0.746314445865342, + "learning_rate": 7.143058635697928e-06, + "loss": 0.0572, + "step": 3810 + }, + { + "epoch": 1.6926493448811903, + "grad_norm": 0.6126953828556693, + "learning_rate": 7.141307028458805e-06, + "loss": 0.0581, + "step": 3811 + }, + { + "epoch": 1.6930934932267379, + "grad_norm": 0.45183518843270887, + "learning_rate": 7.13955509934486e-06, + "loss": 0.0527, + "step": 3812 + }, + { + "epoch": 1.6935376415722851, + "grad_norm": 0.40741342750683424, + "learning_rate": 7.137802848619442e-06, + "loss": 0.0448, + "step": 3813 + }, + { + "epoch": 1.6939817899178324, + "grad_norm": 0.409139232347241, + "learning_rate": 7.136050276545937e-06, + "loss": 0.0381, + "step": 3814 + }, + { + "epoch": 1.69442593826338, + "grad_norm": 0.532572512466246, + "learning_rate": 7.134297383387794e-06, + "loss": 0.0541, + "step": 3815 + }, + { + "epoch": 1.6948700866089275, + "grad_norm": 0.38463725645897007, + "learning_rate": 7.1325441694084955e-06, + "loss": 0.0413, + "step": 3816 + }, + { + "epoch": 1.6953142349544748, + "grad_norm": 1.088465520533885, + "learning_rate": 7.130790634871585e-06, + "loss": 0.052, + "step": 3817 + }, + { + "epoch": 1.695758383300022, + "grad_norm": 0.629929584219581, + "learning_rate": 7.129036780040646e-06, + "loss": 0.0567, + "step": 3818 + }, + { + "epoch": 1.6962025316455698, + "grad_norm": 0.63053218226347, + "learning_rate": 7.127282605179311e-06, + "loss": 0.0532, + "step": 3819 + }, + { + "epoch": 1.696646679991117, + "grad_norm": 0.4433559445013171, + "learning_rate": 7.125528110551266e-06, + "loss": 0.0463, + "step": 3820 + }, + { + "epoch": 1.6970908283366644, + "grad_norm": 0.42229460674035696, + "learning_rate": 7.12377329642024e-06, + "loss": 0.0397, + "step": 3821 + }, + { + "epoch": 1.6975349766822119, + "grad_norm": 0.49873647457140674, + "learning_rate": 7.122018163050011e-06, + "loss": 0.0545, + "step": 3822 + }, + { + "epoch": 1.6979791250277594, + "grad_norm": 0.6060966583878594, + "learning_rate": 7.1202627107044035e-06, + "loss": 0.0457, + "step": 3823 + }, + { + "epoch": 1.6984232733733067, + "grad_norm": 0.603057650967115, + "learning_rate": 7.118506939647295e-06, + "loss": 0.0464, + "step": 3824 + }, + { + "epoch": 1.698867421718854, + "grad_norm": 0.6194476035587843, + "learning_rate": 7.116750850142606e-06, + "loss": 0.0457, + "step": 3825 + }, + { + "epoch": 1.6993115700644015, + "grad_norm": 0.5096734124949287, + "learning_rate": 7.114994442454306e-06, + "loss": 0.045, + "step": 3826 + }, + { + "epoch": 1.699755718409949, + "grad_norm": 0.5365035460325245, + "learning_rate": 7.113237716846416e-06, + "loss": 0.0623, + "step": 3827 + }, + { + "epoch": 1.7001998667554963, + "grad_norm": 0.4418379177592074, + "learning_rate": 7.111480673582998e-06, + "loss": 0.0434, + "step": 3828 + }, + { + "epoch": 1.7006440151010438, + "grad_norm": 0.5486324781269635, + "learning_rate": 7.1097233129281674e-06, + "loss": 0.0515, + "step": 3829 + }, + { + "epoch": 1.7010881634465913, + "grad_norm": 0.46832429223654365, + "learning_rate": 7.107965635146085e-06, + "loss": 0.036, + "step": 3830 + }, + { + "epoch": 1.7015323117921386, + "grad_norm": 0.4428576565727785, + "learning_rate": 7.106207640500959e-06, + "loss": 0.0431, + "step": 3831 + }, + { + "epoch": 1.7019764601376859, + "grad_norm": 0.44260932357845195, + "learning_rate": 7.104449329257047e-06, + "loss": 0.0488, + "step": 3832 + }, + { + "epoch": 1.7024206084832334, + "grad_norm": 0.6480909417028274, + "learning_rate": 7.10269070167865e-06, + "loss": 0.0538, + "step": 3833 + }, + { + "epoch": 1.702864756828781, + "grad_norm": 0.5907643196584621, + "learning_rate": 7.100931758030126e-06, + "loss": 0.0667, + "step": 3834 + }, + { + "epoch": 1.7033089051743282, + "grad_norm": 0.530721353895675, + "learning_rate": 7.0991724985758694e-06, + "loss": 0.0355, + "step": 3835 + }, + { + "epoch": 1.7037530535198755, + "grad_norm": 0.536328694373341, + "learning_rate": 7.0974129235803256e-06, + "loss": 0.0492, + "step": 3836 + }, + { + "epoch": 1.704197201865423, + "grad_norm": 0.47794780473543447, + "learning_rate": 7.095653033307992e-06, + "loss": 0.0394, + "step": 3837 + }, + { + "epoch": 1.7046413502109705, + "grad_norm": 0.5788739767339988, + "learning_rate": 7.093892828023408e-06, + "loss": 0.0684, + "step": 3838 + }, + { + "epoch": 1.7050854985565178, + "grad_norm": 0.4591656439375814, + "learning_rate": 7.092132307991163e-06, + "loss": 0.0426, + "step": 3839 + }, + { + "epoch": 1.7055296469020653, + "grad_norm": 0.4376969955300339, + "learning_rate": 7.090371473475894e-06, + "loss": 0.0424, + "step": 3840 + }, + { + "epoch": 1.7059737952476128, + "grad_norm": 0.43964430921337727, + "learning_rate": 7.088610324742282e-06, + "loss": 0.0348, + "step": 3841 + }, + { + "epoch": 1.7064179435931601, + "grad_norm": 0.40903365943544095, + "learning_rate": 7.086848862055059e-06, + "loss": 0.0317, + "step": 3842 + }, + { + "epoch": 1.7068620919387074, + "grad_norm": 0.5320572070600705, + "learning_rate": 7.085087085679003e-06, + "loss": 0.0493, + "step": 3843 + }, + { + "epoch": 1.707306240284255, + "grad_norm": 0.4182055882812196, + "learning_rate": 7.0833249958789396e-06, + "loss": 0.0413, + "step": 3844 + }, + { + "epoch": 1.7077503886298024, + "grad_norm": 0.5174573980801822, + "learning_rate": 7.081562592919737e-06, + "loss": 0.0468, + "step": 3845 + }, + { + "epoch": 1.7081945369753497, + "grad_norm": 0.36708254507339966, + "learning_rate": 7.07979987706632e-06, + "loss": 0.0392, + "step": 3846 + }, + { + "epoch": 1.708638685320897, + "grad_norm": 0.42988860867301804, + "learning_rate": 7.078036848583651e-06, + "loss": 0.0438, + "step": 3847 + }, + { + "epoch": 1.7090828336664448, + "grad_norm": 0.8624108371914289, + "learning_rate": 7.076273507736744e-06, + "loss": 0.0564, + "step": 3848 + }, + { + "epoch": 1.709526982011992, + "grad_norm": 0.3825294027975058, + "learning_rate": 7.074509854790659e-06, + "loss": 0.042, + "step": 3849 + }, + { + "epoch": 1.7099711303575393, + "grad_norm": 0.7845280836992656, + "learning_rate": 7.072745890010502e-06, + "loss": 0.0513, + "step": 3850 + }, + { + "epoch": 1.7104152787030868, + "grad_norm": 0.511895797137735, + "learning_rate": 7.070981613661429e-06, + "loss": 0.042, + "step": 3851 + }, + { + "epoch": 1.7108594270486344, + "grad_norm": 0.39864960740423, + "learning_rate": 7.06921702600864e-06, + "loss": 0.0435, + "step": 3852 + }, + { + "epoch": 1.7113035753941817, + "grad_norm": 0.3728345248612773, + "learning_rate": 7.067452127317381e-06, + "loss": 0.0414, + "step": 3853 + }, + { + "epoch": 1.711747723739729, + "grad_norm": 0.4146176183523404, + "learning_rate": 7.065686917852948e-06, + "loss": 0.0394, + "step": 3854 + }, + { + "epoch": 1.7121918720852765, + "grad_norm": 0.4550263965386846, + "learning_rate": 7.063921397880682e-06, + "loss": 0.0353, + "step": 3855 + }, + { + "epoch": 1.712636020430824, + "grad_norm": 0.6273234491307014, + "learning_rate": 7.062155567665969e-06, + "loss": 0.0336, + "step": 3856 + }, + { + "epoch": 1.7130801687763713, + "grad_norm": 0.5739885387932523, + "learning_rate": 7.0603894274742445e-06, + "loss": 0.0463, + "step": 3857 + }, + { + "epoch": 1.7135243171219188, + "grad_norm": 0.46562620789463843, + "learning_rate": 7.05862297757099e-06, + "loss": 0.0453, + "step": 3858 + }, + { + "epoch": 1.7139684654674663, + "grad_norm": 0.5078681314633765, + "learning_rate": 7.056856218221731e-06, + "loss": 0.0472, + "step": 3859 + }, + { + "epoch": 1.7144126138130136, + "grad_norm": 0.44379024742354006, + "learning_rate": 7.055089149692044e-06, + "loss": 0.0362, + "step": 3860 + }, + { + "epoch": 1.7148567621585609, + "grad_norm": 0.6765029010883838, + "learning_rate": 7.053321772247546e-06, + "loss": 0.0605, + "step": 3861 + }, + { + "epoch": 1.7153009105041084, + "grad_norm": 0.5667840551717872, + "learning_rate": 7.051554086153907e-06, + "loss": 0.052, + "step": 3862 + }, + { + "epoch": 1.715745058849656, + "grad_norm": 0.4505940404872499, + "learning_rate": 7.049786091676838e-06, + "loss": 0.0437, + "step": 3863 + }, + { + "epoch": 1.7161892071952032, + "grad_norm": 0.3740985912782501, + "learning_rate": 7.0480177890821e-06, + "loss": 0.034, + "step": 3864 + }, + { + "epoch": 1.7166333555407505, + "grad_norm": 0.5770251755986612, + "learning_rate": 7.046249178635499e-06, + "loss": 0.0502, + "step": 3865 + }, + { + "epoch": 1.717077503886298, + "grad_norm": 0.5237695494745064, + "learning_rate": 7.044480260602888e-06, + "loss": 0.0565, + "step": 3866 + }, + { + "epoch": 1.7175216522318455, + "grad_norm": 0.49695229712892924, + "learning_rate": 7.042711035250162e-06, + "loss": 0.0526, + "step": 3867 + }, + { + "epoch": 1.7179658005773928, + "grad_norm": 0.7710570644630359, + "learning_rate": 7.0409415028432685e-06, + "loss": 0.0353, + "step": 3868 + }, + { + "epoch": 1.7184099489229403, + "grad_norm": 0.5569345846072523, + "learning_rate": 7.0391716636481976e-06, + "loss": 0.0463, + "step": 3869 + }, + { + "epoch": 1.7188540972684878, + "grad_norm": 0.46227610895953286, + "learning_rate": 7.037401517930986e-06, + "loss": 0.0518, + "step": 3870 + }, + { + "epoch": 1.719298245614035, + "grad_norm": 0.3855965081893482, + "learning_rate": 7.035631065957718e-06, + "loss": 0.0345, + "step": 3871 + }, + { + "epoch": 1.7197423939595824, + "grad_norm": 0.4363418780122806, + "learning_rate": 7.03386030799452e-06, + "loss": 0.0492, + "step": 3872 + }, + { + "epoch": 1.72018654230513, + "grad_norm": 0.4427385688478732, + "learning_rate": 7.03208924430757e-06, + "loss": 0.0463, + "step": 3873 + }, + { + "epoch": 1.7206306906506774, + "grad_norm": 0.5897859219840801, + "learning_rate": 7.030317875163086e-06, + "loss": 0.0472, + "step": 3874 + }, + { + "epoch": 1.7210748389962247, + "grad_norm": 0.44136482686712686, + "learning_rate": 7.0285462008273365e-06, + "loss": 0.0339, + "step": 3875 + }, + { + "epoch": 1.721518987341772, + "grad_norm": 0.4341786806553805, + "learning_rate": 7.026774221566634e-06, + "loss": 0.0479, + "step": 3876 + }, + { + "epoch": 1.7219631356873195, + "grad_norm": 0.4859029892336248, + "learning_rate": 7.0250019376473375e-06, + "loss": 0.0481, + "step": 3877 + }, + { + "epoch": 1.722407284032867, + "grad_norm": 0.554393064292429, + "learning_rate": 7.0232293493358515e-06, + "loss": 0.0423, + "step": 3878 + }, + { + "epoch": 1.7228514323784143, + "grad_norm": 0.4954355417965089, + "learning_rate": 7.021456456898624e-06, + "loss": 0.0529, + "step": 3879 + }, + { + "epoch": 1.7232955807239618, + "grad_norm": 0.39348052805418005, + "learning_rate": 7.019683260602155e-06, + "loss": 0.04, + "step": 3880 + }, + { + "epoch": 1.7237397290695093, + "grad_norm": 0.4876831116193254, + "learning_rate": 7.017909760712982e-06, + "loss": 0.0416, + "step": 3881 + }, + { + "epoch": 1.7241838774150566, + "grad_norm": 0.5390273774730024, + "learning_rate": 7.016135957497693e-06, + "loss": 0.0685, + "step": 3882 + }, + { + "epoch": 1.724628025760604, + "grad_norm": 0.553831689747292, + "learning_rate": 7.014361851222923e-06, + "loss": 0.0575, + "step": 3883 + }, + { + "epoch": 1.7250721741061514, + "grad_norm": 0.5211453167042258, + "learning_rate": 7.012587442155349e-06, + "loss": 0.0508, + "step": 3884 + }, + { + "epoch": 1.725516322451699, + "grad_norm": 0.5052516566700862, + "learning_rate": 7.010812730561691e-06, + "loss": 0.0411, + "step": 3885 + }, + { + "epoch": 1.7259604707972462, + "grad_norm": 0.5073428394585604, + "learning_rate": 7.009037716708725e-06, + "loss": 0.0426, + "step": 3886 + }, + { + "epoch": 1.7264046191427935, + "grad_norm": 0.48746152511684293, + "learning_rate": 7.007262400863262e-06, + "loss": 0.043, + "step": 3887 + }, + { + "epoch": 1.7268487674883413, + "grad_norm": 0.48800631902973207, + "learning_rate": 7.005486783292164e-06, + "loss": 0.0416, + "step": 3888 + }, + { + "epoch": 1.7272929158338886, + "grad_norm": 0.5824704615616569, + "learning_rate": 7.003710864262333e-06, + "loss": 0.0485, + "step": 3889 + }, + { + "epoch": 1.7277370641794358, + "grad_norm": 0.4562507795280803, + "learning_rate": 7.0019346440407225e-06, + "loss": 0.0392, + "step": 3890 + }, + { + "epoch": 1.7281812125249834, + "grad_norm": 0.5660300341993364, + "learning_rate": 7.000158122894329e-06, + "loss": 0.0443, + "step": 3891 + }, + { + "epoch": 1.7286253608705309, + "grad_norm": 1.2237037452955262, + "learning_rate": 6.9983813010901925e-06, + "loss": 0.0446, + "step": 3892 + }, + { + "epoch": 1.7290695092160782, + "grad_norm": 0.3834611348340635, + "learning_rate": 6.996604178895398e-06, + "loss": 0.0321, + "step": 3893 + }, + { + "epoch": 1.7295136575616255, + "grad_norm": 0.500508115884937, + "learning_rate": 6.994826756577082e-06, + "loss": 0.0547, + "step": 3894 + }, + { + "epoch": 1.729957805907173, + "grad_norm": 0.41016179375129264, + "learning_rate": 6.993049034402417e-06, + "loss": 0.0299, + "step": 3895 + }, + { + "epoch": 1.7304019542527205, + "grad_norm": 0.5423253130879615, + "learning_rate": 6.991271012638626e-06, + "loss": 0.0616, + "step": 3896 + }, + { + "epoch": 1.7308461025982678, + "grad_norm": 0.5282418101161326, + "learning_rate": 6.9894926915529774e-06, + "loss": 0.0457, + "step": 3897 + }, + { + "epoch": 1.7312902509438153, + "grad_norm": 0.4989861140225304, + "learning_rate": 6.987714071412781e-06, + "loss": 0.0408, + "step": 3898 + }, + { + "epoch": 1.7317343992893628, + "grad_norm": 0.48460144374543296, + "learning_rate": 6.985935152485392e-06, + "loss": 0.0508, + "step": 3899 + }, + { + "epoch": 1.73217854763491, + "grad_norm": 0.37082720341713776, + "learning_rate": 6.984155935038217e-06, + "loss": 0.0376, + "step": 3900 + }, + { + "epoch": 1.7326226959804574, + "grad_norm": 0.6794555649441637, + "learning_rate": 6.9823764193387e-06, + "loss": 0.0568, + "step": 3901 + }, + { + "epoch": 1.7330668443260049, + "grad_norm": 0.5551808325017253, + "learning_rate": 6.980596605654332e-06, + "loss": 0.0502, + "step": 3902 + }, + { + "epoch": 1.7335109926715524, + "grad_norm": 0.5306956016396489, + "learning_rate": 6.9788164942526495e-06, + "loss": 0.0432, + "step": 3903 + }, + { + "epoch": 1.7339551410170997, + "grad_norm": 0.6362458412949452, + "learning_rate": 6.977036085401234e-06, + "loss": 0.0486, + "step": 3904 + }, + { + "epoch": 1.734399289362647, + "grad_norm": 0.29485239085618, + "learning_rate": 6.9752553793677105e-06, + "loss": 0.0245, + "step": 3905 + }, + { + "epoch": 1.7348434377081945, + "grad_norm": 0.5487150575741024, + "learning_rate": 6.9734743764197485e-06, + "loss": 0.0463, + "step": 3906 + }, + { + "epoch": 1.735287586053742, + "grad_norm": 0.7871819167162935, + "learning_rate": 6.9716930768250655e-06, + "loss": 0.0524, + "step": 3907 + }, + { + "epoch": 1.7357317343992893, + "grad_norm": 0.5219429848305476, + "learning_rate": 6.9699114808514215e-06, + "loss": 0.0549, + "step": 3908 + }, + { + "epoch": 1.7361758827448368, + "grad_norm": 0.4635849498371789, + "learning_rate": 6.968129588766617e-06, + "loss": 0.0604, + "step": 3909 + }, + { + "epoch": 1.7366200310903843, + "grad_norm": 0.43616273854041554, + "learning_rate": 6.966347400838502e-06, + "loss": 0.0373, + "step": 3910 + }, + { + "epoch": 1.7370641794359316, + "grad_norm": 0.593451901083156, + "learning_rate": 6.964564917334973e-06, + "loss": 0.0527, + "step": 3911 + }, + { + "epoch": 1.737508327781479, + "grad_norm": 0.6448036909846457, + "learning_rate": 6.962782138523963e-06, + "loss": 0.048, + "step": 3912 + }, + { + "epoch": 1.7379524761270264, + "grad_norm": 0.6507383892986597, + "learning_rate": 6.960999064673455e-06, + "loss": 0.0405, + "step": 3913 + }, + { + "epoch": 1.738396624472574, + "grad_norm": 0.3700619002880566, + "learning_rate": 6.959215696051478e-06, + "loss": 0.0375, + "step": 3914 + }, + { + "epoch": 1.7388407728181212, + "grad_norm": 0.5636095366670266, + "learning_rate": 6.957432032926099e-06, + "loss": 0.0473, + "step": 3915 + }, + { + "epoch": 1.7392849211636685, + "grad_norm": 0.31284491700710365, + "learning_rate": 6.955648075565435e-06, + "loss": 0.0262, + "step": 3916 + }, + { + "epoch": 1.7397290695092162, + "grad_norm": 0.88892656741503, + "learning_rate": 6.953863824237644e-06, + "loss": 0.0439, + "step": 3917 + }, + { + "epoch": 1.7401732178547635, + "grad_norm": 0.3940650039553802, + "learning_rate": 6.952079279210931e-06, + "loss": 0.0336, + "step": 3918 + }, + { + "epoch": 1.7406173662003108, + "grad_norm": 0.43415099426990134, + "learning_rate": 6.950294440753542e-06, + "loss": 0.0372, + "step": 3919 + }, + { + "epoch": 1.7410615145458583, + "grad_norm": 0.43498781141215753, + "learning_rate": 6.948509309133769e-06, + "loss": 0.0471, + "step": 3920 + }, + { + "epoch": 1.7415056628914058, + "grad_norm": 0.3673155534955715, + "learning_rate": 6.9467238846199465e-06, + "loss": 0.0329, + "step": 3921 + }, + { + "epoch": 1.7419498112369531, + "grad_norm": 0.3522859182784973, + "learning_rate": 6.944938167480456e-06, + "loss": 0.0337, + "step": 3922 + }, + { + "epoch": 1.7423939595825004, + "grad_norm": 0.40635886553673484, + "learning_rate": 6.943152157983719e-06, + "loss": 0.0429, + "step": 3923 + }, + { + "epoch": 1.742838107928048, + "grad_norm": 0.8451883765559832, + "learning_rate": 6.941365856398205e-06, + "loss": 0.0668, + "step": 3924 + }, + { + "epoch": 1.7432822562735955, + "grad_norm": 0.5600822140198348, + "learning_rate": 6.939579262992426e-06, + "loss": 0.0451, + "step": 3925 + }, + { + "epoch": 1.7437264046191427, + "grad_norm": 1.2545344075254676, + "learning_rate": 6.937792378034936e-06, + "loss": 0.0743, + "step": 3926 + }, + { + "epoch": 1.7441705529646903, + "grad_norm": 1.1071420667016871, + "learning_rate": 6.936005201794331e-06, + "loss": 0.073, + "step": 3927 + }, + { + "epoch": 1.7446147013102378, + "grad_norm": 0.4227881516723295, + "learning_rate": 6.93421773453926e-06, + "loss": 0.043, + "step": 3928 + }, + { + "epoch": 1.745058849655785, + "grad_norm": 0.46137549848480547, + "learning_rate": 6.932429976538407e-06, + "loss": 0.0413, + "step": 3929 + }, + { + "epoch": 1.7455029980013324, + "grad_norm": 0.5679012650124624, + "learning_rate": 6.930641928060501e-06, + "loss": 0.0331, + "step": 3930 + }, + { + "epoch": 1.7459471463468799, + "grad_norm": 0.5321986376581693, + "learning_rate": 6.928853589374318e-06, + "loss": 0.0467, + "step": 3931 + }, + { + "epoch": 1.7463912946924274, + "grad_norm": 0.6219676953962676, + "learning_rate": 6.927064960748675e-06, + "loss": 0.0479, + "step": 3932 + }, + { + "epoch": 1.7468354430379747, + "grad_norm": 0.5001191355293207, + "learning_rate": 6.925276042452433e-06, + "loss": 0.0535, + "step": 3933 + }, + { + "epoch": 1.747279591383522, + "grad_norm": 0.4563948076269455, + "learning_rate": 6.923486834754498e-06, + "loss": 0.0427, + "step": 3934 + }, + { + "epoch": 1.7477237397290695, + "grad_norm": 0.6974643048935755, + "learning_rate": 6.9216973379238175e-06, + "loss": 0.0485, + "step": 3935 + }, + { + "epoch": 1.748167888074617, + "grad_norm": 0.9428504067985796, + "learning_rate": 6.9199075522293815e-06, + "loss": 0.0612, + "step": 3936 + }, + { + "epoch": 1.7486120364201643, + "grad_norm": 0.8097896480889225, + "learning_rate": 6.918117477940227e-06, + "loss": 0.0467, + "step": 3937 + }, + { + "epoch": 1.7490561847657118, + "grad_norm": 0.5120773769465646, + "learning_rate": 6.916327115325434e-06, + "loss": 0.0554, + "step": 3938 + }, + { + "epoch": 1.7495003331112593, + "grad_norm": 0.49704552104919647, + "learning_rate": 6.914536464654123e-06, + "loss": 0.0429, + "step": 3939 + }, + { + "epoch": 1.7499444814568066, + "grad_norm": 0.4966530008001101, + "learning_rate": 6.912745526195457e-06, + "loss": 0.0416, + "step": 3940 + }, + { + "epoch": 1.7503886298023539, + "grad_norm": 0.6069162585490371, + "learning_rate": 6.910954300218648e-06, + "loss": 0.0407, + "step": 3941 + }, + { + "epoch": 1.7508327781479014, + "grad_norm": 0.5065891450556588, + "learning_rate": 6.9091627869929456e-06, + "loss": 0.0522, + "step": 3942 + }, + { + "epoch": 1.751276926493449, + "grad_norm": 0.5821716098410316, + "learning_rate": 6.907370986787647e-06, + "loss": 0.0588, + "step": 3943 + }, + { + "epoch": 1.7517210748389962, + "grad_norm": 0.4854527206747191, + "learning_rate": 6.905578899872085e-06, + "loss": 0.0347, + "step": 3944 + }, + { + "epoch": 1.7521652231845435, + "grad_norm": 0.6099586438595753, + "learning_rate": 6.903786526515648e-06, + "loss": 0.057, + "step": 3945 + }, + { + "epoch": 1.752609371530091, + "grad_norm": 0.39621484939213947, + "learning_rate": 6.901993866987755e-06, + "loss": 0.036, + "step": 3946 + }, + { + "epoch": 1.7530535198756385, + "grad_norm": 0.5825955749635414, + "learning_rate": 6.9002009215578736e-06, + "loss": 0.0371, + "step": 3947 + }, + { + "epoch": 1.7534976682211858, + "grad_norm": 0.5518034617506287, + "learning_rate": 6.898407690495516e-06, + "loss": 0.0535, + "step": 3948 + }, + { + "epoch": 1.7539418165667333, + "grad_norm": 0.7169225712774631, + "learning_rate": 6.896614174070234e-06, + "loss": 0.065, + "step": 3949 + }, + { + "epoch": 1.7543859649122808, + "grad_norm": 0.487246075580528, + "learning_rate": 6.894820372551624e-06, + "loss": 0.0425, + "step": 3950 + }, + { + "epoch": 1.7548301132578281, + "grad_norm": 0.5762490980340829, + "learning_rate": 6.893026286209324e-06, + "loss": 0.0579, + "step": 3951 + }, + { + "epoch": 1.7552742616033754, + "grad_norm": 0.40568305353842643, + "learning_rate": 6.891231915313017e-06, + "loss": 0.0447, + "step": 3952 + }, + { + "epoch": 1.755718409948923, + "grad_norm": 1.1277438855550295, + "learning_rate": 6.889437260132426e-06, + "loss": 0.0646, + "step": 3953 + }, + { + "epoch": 1.7561625582944704, + "grad_norm": 1.0348468689646162, + "learning_rate": 6.887642320937319e-06, + "loss": 0.0477, + "step": 3954 + }, + { + "epoch": 1.7566067066400177, + "grad_norm": 0.4516096937991291, + "learning_rate": 6.885847097997507e-06, + "loss": 0.0441, + "step": 3955 + }, + { + "epoch": 1.757050854985565, + "grad_norm": 0.5215834296906894, + "learning_rate": 6.884051591582838e-06, + "loss": 0.0413, + "step": 3956 + }, + { + "epoch": 1.7574950033311127, + "grad_norm": 0.4942925713506997, + "learning_rate": 6.882255801963215e-06, + "loss": 0.0313, + "step": 3957 + }, + { + "epoch": 1.75793915167666, + "grad_norm": 0.5511920080731652, + "learning_rate": 6.8804597294085676e-06, + "loss": 0.0468, + "step": 3958 + }, + { + "epoch": 1.7583833000222073, + "grad_norm": 0.8005773203813454, + "learning_rate": 6.87866337418888e-06, + "loss": 0.0723, + "step": 3959 + }, + { + "epoch": 1.7588274483677548, + "grad_norm": 0.6193224773136193, + "learning_rate": 6.876866736574175e-06, + "loss": 0.0458, + "step": 3960 + }, + { + "epoch": 1.7592715967133024, + "grad_norm": 0.7463153621358136, + "learning_rate": 6.875069816834517e-06, + "loss": 0.0452, + "step": 3961 + }, + { + "epoch": 1.7597157450588496, + "grad_norm": 0.6653689016758167, + "learning_rate": 6.873272615240013e-06, + "loss": 0.0379, + "step": 3962 + }, + { + "epoch": 1.760159893404397, + "grad_norm": 0.46506318034990934, + "learning_rate": 6.871475132060814e-06, + "loss": 0.0405, + "step": 3963 + }, + { + "epoch": 1.7606040417499444, + "grad_norm": 0.4803172632231074, + "learning_rate": 6.8696773675671125e-06, + "loss": 0.0494, + "step": 3964 + }, + { + "epoch": 1.761048190095492, + "grad_norm": 0.5683228177486831, + "learning_rate": 6.8678793220291406e-06, + "loss": 0.0474, + "step": 3965 + }, + { + "epoch": 1.7614923384410393, + "grad_norm": 0.6405917034428388, + "learning_rate": 6.866080995717179e-06, + "loss": 0.0444, + "step": 3966 + }, + { + "epoch": 1.7619364867865868, + "grad_norm": 0.469723345316827, + "learning_rate": 6.864282388901544e-06, + "loss": 0.0456, + "step": 3967 + }, + { + "epoch": 1.7623806351321343, + "grad_norm": 0.3596729988674669, + "learning_rate": 6.862483501852597e-06, + "loss": 0.0273, + "step": 3968 + }, + { + "epoch": 1.7628247834776816, + "grad_norm": 0.4095733039990315, + "learning_rate": 6.8606843348407416e-06, + "loss": 0.0423, + "step": 3969 + }, + { + "epoch": 1.7632689318232289, + "grad_norm": 0.5817103130183384, + "learning_rate": 6.858884888136423e-06, + "loss": 0.061, + "step": 3970 + }, + { + "epoch": 1.7637130801687764, + "grad_norm": 0.5515400819468078, + "learning_rate": 6.85708516201013e-06, + "loss": 0.0493, + "step": 3971 + }, + { + "epoch": 1.7641572285143239, + "grad_norm": 0.4010509115861224, + "learning_rate": 6.855285156732389e-06, + "loss": 0.0384, + "step": 3972 + }, + { + "epoch": 1.7646013768598712, + "grad_norm": 0.48651107038259084, + "learning_rate": 6.853484872573773e-06, + "loss": 0.0417, + "step": 3973 + }, + { + "epoch": 1.7650455252054185, + "grad_norm": 0.6804998963023164, + "learning_rate": 6.851684309804898e-06, + "loss": 0.0511, + "step": 3974 + }, + { + "epoch": 1.765489673550966, + "grad_norm": 0.5378576658955843, + "learning_rate": 6.849883468696414e-06, + "loss": 0.0466, + "step": 3975 + }, + { + "epoch": 1.7659338218965135, + "grad_norm": 0.6080913375351253, + "learning_rate": 6.848082349519021e-06, + "loss": 0.0504, + "step": 3976 + }, + { + "epoch": 1.7663779702420608, + "grad_norm": 0.4317853459587004, + "learning_rate": 6.846280952543459e-06, + "loss": 0.0419, + "step": 3977 + }, + { + "epoch": 1.7668221185876083, + "grad_norm": 0.44881873306480574, + "learning_rate": 6.844479278040506e-06, + "loss": 0.0373, + "step": 3978 + }, + { + "epoch": 1.7672662669331558, + "grad_norm": 0.38885972043532946, + "learning_rate": 6.842677326280984e-06, + "loss": 0.0396, + "step": 3979 + }, + { + "epoch": 1.767710415278703, + "grad_norm": 0.4524735899424509, + "learning_rate": 6.840875097535761e-06, + "loss": 0.0448, + "step": 3980 + }, + { + "epoch": 1.7681545636242504, + "grad_norm": 0.455210687429983, + "learning_rate": 6.8390725920757374e-06, + "loss": 0.0382, + "step": 3981 + }, + { + "epoch": 1.768598711969798, + "grad_norm": 0.398331712609469, + "learning_rate": 6.837269810171864e-06, + "loss": 0.0414, + "step": 3982 + }, + { + "epoch": 1.7690428603153454, + "grad_norm": 0.37522143210177494, + "learning_rate": 6.835466752095129e-06, + "loss": 0.0386, + "step": 3983 + }, + { + "epoch": 1.7694870086608927, + "grad_norm": 0.6174104941670406, + "learning_rate": 6.833663418116561e-06, + "loss": 0.0389, + "step": 3984 + }, + { + "epoch": 1.76993115700644, + "grad_norm": 0.6659454494298849, + "learning_rate": 6.831859808507233e-06, + "loss": 0.0447, + "step": 3985 + }, + { + "epoch": 1.7703753053519877, + "grad_norm": 0.47498410446306544, + "learning_rate": 6.830055923538258e-06, + "loss": 0.0417, + "step": 3986 + }, + { + "epoch": 1.770819453697535, + "grad_norm": 0.4396131477174768, + "learning_rate": 6.82825176348079e-06, + "loss": 0.0372, + "step": 3987 + }, + { + "epoch": 1.7712636020430823, + "grad_norm": 0.576388987980822, + "learning_rate": 6.826447328606026e-06, + "loss": 0.0432, + "step": 3988 + }, + { + "epoch": 1.7717077503886298, + "grad_norm": 0.5115559440177404, + "learning_rate": 6.8246426191852025e-06, + "loss": 0.046, + "step": 3989 + }, + { + "epoch": 1.7721518987341773, + "grad_norm": 0.3933392068642798, + "learning_rate": 6.822837635489597e-06, + "loss": 0.0424, + "step": 3990 + }, + { + "epoch": 1.7725960470797246, + "grad_norm": 0.48762159715898146, + "learning_rate": 6.821032377790533e-06, + "loss": 0.0428, + "step": 3991 + }, + { + "epoch": 1.773040195425272, + "grad_norm": 0.5275360991096351, + "learning_rate": 6.819226846359366e-06, + "loss": 0.0437, + "step": 3992 + }, + { + "epoch": 1.7734843437708194, + "grad_norm": 0.5998959208861094, + "learning_rate": 6.817421041467501e-06, + "loss": 0.0493, + "step": 3993 + }, + { + "epoch": 1.773928492116367, + "grad_norm": 0.558883606638067, + "learning_rate": 6.815614963386383e-06, + "loss": 0.0443, + "step": 3994 + }, + { + "epoch": 1.7743726404619142, + "grad_norm": 0.530805377008147, + "learning_rate": 6.813808612387493e-06, + "loss": 0.0493, + "step": 3995 + }, + { + "epoch": 1.7748167888074617, + "grad_norm": 0.44577841042032773, + "learning_rate": 6.812001988742356e-06, + "loss": 0.0423, + "step": 3996 + }, + { + "epoch": 1.7752609371530093, + "grad_norm": 0.4208112565964139, + "learning_rate": 6.81019509272254e-06, + "loss": 0.0383, + "step": 3997 + }, + { + "epoch": 1.7757050854985565, + "grad_norm": 0.5652594178574358, + "learning_rate": 6.808387924599653e-06, + "loss": 0.0469, + "step": 3998 + }, + { + "epoch": 1.7761492338441038, + "grad_norm": 0.47567339447317186, + "learning_rate": 6.806580484645342e-06, + "loss": 0.0536, + "step": 3999 + }, + { + "epoch": 1.7765933821896513, + "grad_norm": 0.5061651547732329, + "learning_rate": 6.804772773131294e-06, + "loss": 0.0477, + "step": 4000 + }, + { + "epoch": 1.7770375305351989, + "grad_norm": 0.3302628327205031, + "learning_rate": 6.80296479032924e-06, + "loss": 0.0287, + "step": 4001 + }, + { + "epoch": 1.7774816788807462, + "grad_norm": 0.5657117208796698, + "learning_rate": 6.801156536510953e-06, + "loss": 0.063, + "step": 4002 + }, + { + "epoch": 1.7779258272262934, + "grad_norm": 0.40290409532344185, + "learning_rate": 6.799348011948242e-06, + "loss": 0.0328, + "step": 4003 + }, + { + "epoch": 1.778369975571841, + "grad_norm": 0.4303302869190356, + "learning_rate": 6.797539216912958e-06, + "loss": 0.0389, + "step": 4004 + }, + { + "epoch": 1.7788141239173885, + "grad_norm": 0.4915190826163001, + "learning_rate": 6.795730151676996e-06, + "loss": 0.0506, + "step": 4005 + }, + { + "epoch": 1.7792582722629358, + "grad_norm": 0.4472370244553756, + "learning_rate": 6.793920816512287e-06, + "loss": 0.0479, + "step": 4006 + }, + { + "epoch": 1.7797024206084833, + "grad_norm": 0.46823865267198, + "learning_rate": 6.792111211690807e-06, + "loss": 0.04, + "step": 4007 + }, + { + "epoch": 1.7801465689540308, + "grad_norm": 0.713302627768962, + "learning_rate": 6.790301337484569e-06, + "loss": 0.0467, + "step": 4008 + }, + { + "epoch": 1.780590717299578, + "grad_norm": 0.544988693204241, + "learning_rate": 6.788491194165629e-06, + "loss": 0.0597, + "step": 4009 + }, + { + "epoch": 1.7810348656451254, + "grad_norm": 0.5488358962167847, + "learning_rate": 6.786680782006079e-06, + "loss": 0.0454, + "step": 4010 + }, + { + "epoch": 1.7814790139906729, + "grad_norm": 0.45588667933120275, + "learning_rate": 6.784870101278058e-06, + "loss": 0.0458, + "step": 4011 + }, + { + "epoch": 1.7819231623362204, + "grad_norm": 0.41134367023706303, + "learning_rate": 6.783059152253743e-06, + "loss": 0.0423, + "step": 4012 + }, + { + "epoch": 1.7823673106817677, + "grad_norm": 0.5379695606986187, + "learning_rate": 6.7812479352053465e-06, + "loss": 0.0434, + "step": 4013 + }, + { + "epoch": 1.782811459027315, + "grad_norm": 0.390627468228112, + "learning_rate": 6.779436450405127e-06, + "loss": 0.0459, + "step": 4014 + }, + { + "epoch": 1.7832556073728625, + "grad_norm": 0.7523391862616232, + "learning_rate": 6.7776246981253835e-06, + "loss": 0.0578, + "step": 4015 + }, + { + "epoch": 1.78369975571841, + "grad_norm": 0.4670417748350829, + "learning_rate": 6.775812678638449e-06, + "loss": 0.0391, + "step": 4016 + }, + { + "epoch": 1.7841439040639573, + "grad_norm": 0.6629585111671369, + "learning_rate": 6.7740003922167045e-06, + "loss": 0.0525, + "step": 4017 + }, + { + "epoch": 1.7845880524095048, + "grad_norm": 0.5468251468649823, + "learning_rate": 6.7721878391325655e-06, + "loss": 0.048, + "step": 4018 + }, + { + "epoch": 1.7850322007550523, + "grad_norm": 0.6407194911078166, + "learning_rate": 6.770375019658491e-06, + "loss": 0.0342, + "step": 4019 + }, + { + "epoch": 1.7854763491005996, + "grad_norm": 0.3555576600475007, + "learning_rate": 6.7685619340669775e-06, + "loss": 0.0424, + "step": 4020 + }, + { + "epoch": 1.785920497446147, + "grad_norm": 0.5655823535701915, + "learning_rate": 6.766748582630561e-06, + "loss": 0.0498, + "step": 4021 + }, + { + "epoch": 1.7863646457916944, + "grad_norm": 0.5446372824826297, + "learning_rate": 6.764934965621823e-06, + "loss": 0.0326, + "step": 4022 + }, + { + "epoch": 1.786808794137242, + "grad_norm": 0.7635864213363933, + "learning_rate": 6.763121083313378e-06, + "loss": 0.0469, + "step": 4023 + }, + { + "epoch": 1.7872529424827892, + "grad_norm": 0.5831230436540045, + "learning_rate": 6.761306935977883e-06, + "loss": 0.0437, + "step": 4024 + }, + { + "epoch": 1.7876970908283365, + "grad_norm": 0.4755466748257561, + "learning_rate": 6.759492523888036e-06, + "loss": 0.0349, + "step": 4025 + }, + { + "epoch": 1.7881412391738842, + "grad_norm": 0.3810710893901648, + "learning_rate": 6.757677847316576e-06, + "loss": 0.0304, + "step": 4026 + }, + { + "epoch": 1.7885853875194315, + "grad_norm": 0.5635909990676548, + "learning_rate": 6.755862906536276e-06, + "loss": 0.0509, + "step": 4027 + }, + { + "epoch": 1.7890295358649788, + "grad_norm": 0.44071811791526233, + "learning_rate": 6.754047701819954e-06, + "loss": 0.0409, + "step": 4028 + }, + { + "epoch": 1.7894736842105263, + "grad_norm": 0.6047548580207962, + "learning_rate": 6.752232233440469e-06, + "loss": 0.0472, + "step": 4029 + }, + { + "epoch": 1.7899178325560738, + "grad_norm": 0.4265056649643106, + "learning_rate": 6.750416501670712e-06, + "loss": 0.0341, + "step": 4030 + }, + { + "epoch": 1.7903619809016211, + "grad_norm": 0.4750323545017985, + "learning_rate": 6.74860050678362e-06, + "loss": 0.0399, + "step": 4031 + }, + { + "epoch": 1.7908061292471684, + "grad_norm": 0.43275311316068676, + "learning_rate": 6.74678424905217e-06, + "loss": 0.0381, + "step": 4032 + }, + { + "epoch": 1.791250277592716, + "grad_norm": 0.4591066133415168, + "learning_rate": 6.744967728749374e-06, + "loss": 0.049, + "step": 4033 + }, + { + "epoch": 1.7916944259382634, + "grad_norm": 0.46035832875677174, + "learning_rate": 6.743150946148286e-06, + "loss": 0.037, + "step": 4034 + }, + { + "epoch": 1.7921385742838107, + "grad_norm": 0.44089681593883423, + "learning_rate": 6.7413339015219995e-06, + "loss": 0.0489, + "step": 4035 + }, + { + "epoch": 1.7925827226293582, + "grad_norm": 0.3967713328470784, + "learning_rate": 6.739516595143649e-06, + "loss": 0.0348, + "step": 4036 + }, + { + "epoch": 1.7930268709749058, + "grad_norm": 0.47307408803909806, + "learning_rate": 6.737699027286404e-06, + "loss": 0.0488, + "step": 4037 + }, + { + "epoch": 1.793471019320453, + "grad_norm": 0.6980297064306341, + "learning_rate": 6.735881198223476e-06, + "loss": 0.0603, + "step": 4038 + }, + { + "epoch": 1.7939151676660003, + "grad_norm": 0.641654879554228, + "learning_rate": 6.734063108228118e-06, + "loss": 0.046, + "step": 4039 + }, + { + "epoch": 1.7943593160115479, + "grad_norm": 0.43889131829448813, + "learning_rate": 6.732244757573619e-06, + "loss": 0.0417, + "step": 4040 + }, + { + "epoch": 1.7948034643570954, + "grad_norm": 0.5791145146187624, + "learning_rate": 6.730426146533304e-06, + "loss": 0.0629, + "step": 4041 + }, + { + "epoch": 1.7952476127026427, + "grad_norm": 0.3191065387044477, + "learning_rate": 6.728607275380548e-06, + "loss": 0.034, + "step": 4042 + }, + { + "epoch": 1.79569176104819, + "grad_norm": 0.5785674908181686, + "learning_rate": 6.726788144388754e-06, + "loss": 0.0495, + "step": 4043 + }, + { + "epoch": 1.7961359093937375, + "grad_norm": 0.5420845224176172, + "learning_rate": 6.724968753831367e-06, + "loss": 0.0568, + "step": 4044 + }, + { + "epoch": 1.796580057739285, + "grad_norm": 0.4088444696114397, + "learning_rate": 6.723149103981874e-06, + "loss": 0.045, + "step": 4045 + }, + { + "epoch": 1.7970242060848323, + "grad_norm": 0.6392237685246868, + "learning_rate": 6.721329195113802e-06, + "loss": 0.0554, + "step": 4046 + }, + { + "epoch": 1.7974683544303798, + "grad_norm": 0.3977912647423139, + "learning_rate": 6.7195090275007104e-06, + "loss": 0.0361, + "step": 4047 + }, + { + "epoch": 1.7979125027759273, + "grad_norm": 0.36038011427697747, + "learning_rate": 6.717688601416201e-06, + "loss": 0.0406, + "step": 4048 + }, + { + "epoch": 1.7983566511214746, + "grad_norm": 0.5212415789499063, + "learning_rate": 6.715867917133919e-06, + "loss": 0.0422, + "step": 4049 + }, + { + "epoch": 1.7988007994670219, + "grad_norm": 0.49377763847355416, + "learning_rate": 6.714046974927539e-06, + "loss": 0.043, + "step": 4050 + }, + { + "epoch": 1.7992449478125694, + "grad_norm": 0.47840704302357145, + "learning_rate": 6.712225775070784e-06, + "loss": 0.044, + "step": 4051 + }, + { + "epoch": 1.799689096158117, + "grad_norm": 0.4112067824225565, + "learning_rate": 6.71040431783741e-06, + "loss": 0.0354, + "step": 4052 + }, + { + "epoch": 1.8001332445036642, + "grad_norm": 0.3668140961966663, + "learning_rate": 6.70858260350121e-06, + "loss": 0.0369, + "step": 4053 + }, + { + "epoch": 1.8005773928492115, + "grad_norm": 0.6061145451464093, + "learning_rate": 6.706760632336023e-06, + "loss": 0.0529, + "step": 4054 + }, + { + "epoch": 1.8010215411947592, + "grad_norm": 0.804408929113384, + "learning_rate": 6.704938404615718e-06, + "loss": 0.0695, + "step": 4055 + }, + { + "epoch": 1.8014656895403065, + "grad_norm": 0.44516753707686785, + "learning_rate": 6.703115920614212e-06, + "loss": 0.041, + "step": 4056 + }, + { + "epoch": 1.8019098378858538, + "grad_norm": 0.6944007446846528, + "learning_rate": 6.701293180605451e-06, + "loss": 0.063, + "step": 4057 + }, + { + "epoch": 1.8023539862314013, + "grad_norm": 0.6199411354460107, + "learning_rate": 6.699470184863423e-06, + "loss": 0.0478, + "step": 4058 + }, + { + "epoch": 1.8027981345769488, + "grad_norm": 0.6947222002075307, + "learning_rate": 6.6976469336621595e-06, + "loss": 0.0425, + "step": 4059 + }, + { + "epoch": 1.803242282922496, + "grad_norm": 0.5292009929059116, + "learning_rate": 6.6958234272757235e-06, + "loss": 0.0575, + "step": 4060 + }, + { + "epoch": 1.8036864312680434, + "grad_norm": 0.438201912001024, + "learning_rate": 6.6939996659782194e-06, + "loss": 0.0395, + "step": 4061 + }, + { + "epoch": 1.804130579613591, + "grad_norm": 0.5407542517674611, + "learning_rate": 6.692175650043789e-06, + "loss": 0.0403, + "step": 4062 + }, + { + "epoch": 1.8045747279591384, + "grad_norm": 0.5631536015470302, + "learning_rate": 6.690351379746613e-06, + "loss": 0.0421, + "step": 4063 + }, + { + "epoch": 1.8050188763046857, + "grad_norm": 0.5913174820930391, + "learning_rate": 6.6885268553609115e-06, + "loss": 0.0265, + "step": 4064 + }, + { + "epoch": 1.8054630246502332, + "grad_norm": 0.576629329530581, + "learning_rate": 6.68670207716094e-06, + "loss": 0.0393, + "step": 4065 + }, + { + "epoch": 1.8059071729957807, + "grad_norm": 0.47244671586292697, + "learning_rate": 6.6848770454209955e-06, + "loss": 0.043, + "step": 4066 + }, + { + "epoch": 1.806351321341328, + "grad_norm": 0.42376459190231364, + "learning_rate": 6.683051760415409e-06, + "loss": 0.0349, + "step": 4067 + }, + { + "epoch": 1.8067954696868753, + "grad_norm": 0.5089448244604045, + "learning_rate": 6.681226222418553e-06, + "loss": 0.0367, + "step": 4068 + }, + { + "epoch": 1.8072396180324228, + "grad_norm": 0.7287865087611148, + "learning_rate": 6.679400431704837e-06, + "loss": 0.0474, + "step": 4069 + }, + { + "epoch": 1.8076837663779703, + "grad_norm": 0.4089819648317366, + "learning_rate": 6.677574388548706e-06, + "loss": 0.0273, + "step": 4070 + }, + { + "epoch": 1.8081279147235176, + "grad_norm": 0.3655870554565193, + "learning_rate": 6.67574809322465e-06, + "loss": 0.0366, + "step": 4071 + }, + { + "epoch": 1.808572063069065, + "grad_norm": 0.4524590952965907, + "learning_rate": 6.6739215460071885e-06, + "loss": 0.0388, + "step": 4072 + }, + { + "epoch": 1.8090162114146124, + "grad_norm": 0.8966024978124474, + "learning_rate": 6.672094747170883e-06, + "loss": 0.0566, + "step": 4073 + }, + { + "epoch": 1.80946035976016, + "grad_norm": 0.3591914341706326, + "learning_rate": 6.670267696990335e-06, + "loss": 0.0318, + "step": 4074 + }, + { + "epoch": 1.8099045081057072, + "grad_norm": 0.44339585259060615, + "learning_rate": 6.668440395740178e-06, + "loss": 0.0363, + "step": 4075 + }, + { + "epoch": 1.8103486564512548, + "grad_norm": 0.3901591376438945, + "learning_rate": 6.666612843695087e-06, + "loss": 0.0485, + "step": 4076 + }, + { + "epoch": 1.8107928047968023, + "grad_norm": 0.4015830587763988, + "learning_rate": 6.664785041129777e-06, + "loss": 0.0383, + "step": 4077 + }, + { + "epoch": 1.8112369531423496, + "grad_norm": 0.5367818518459079, + "learning_rate": 6.662956988318994e-06, + "loss": 0.0456, + "step": 4078 + }, + { + "epoch": 1.8116811014878968, + "grad_norm": 0.40853437415084287, + "learning_rate": 6.661128685537526e-06, + "loss": 0.0322, + "step": 4079 + }, + { + "epoch": 1.8121252498334444, + "grad_norm": 0.6527784454525124, + "learning_rate": 6.659300133060201e-06, + "loss": 0.0507, + "step": 4080 + }, + { + "epoch": 1.8125693981789919, + "grad_norm": 0.5074533806315775, + "learning_rate": 6.657471331161878e-06, + "loss": 0.0432, + "step": 4081 + }, + { + "epoch": 1.8130135465245392, + "grad_norm": 0.4472089329933828, + "learning_rate": 6.65564228011746e-06, + "loss": 0.0599, + "step": 4082 + }, + { + "epoch": 1.8134576948700865, + "grad_norm": 0.47876435411805834, + "learning_rate": 6.653812980201882e-06, + "loss": 0.0478, + "step": 4083 + }, + { + "epoch": 1.813901843215634, + "grad_norm": 0.44056500378250846, + "learning_rate": 6.651983431690119e-06, + "loss": 0.0434, + "step": 4084 + }, + { + "epoch": 1.8143459915611815, + "grad_norm": 0.42532030681624505, + "learning_rate": 6.650153634857183e-06, + "loss": 0.0372, + "step": 4085 + }, + { + "epoch": 1.8147901399067288, + "grad_norm": 0.38789115474665614, + "learning_rate": 6.648323589978128e-06, + "loss": 0.0332, + "step": 4086 + }, + { + "epoch": 1.8152342882522763, + "grad_norm": 0.5287105455383381, + "learning_rate": 6.646493297328034e-06, + "loss": 0.0483, + "step": 4087 + }, + { + "epoch": 1.8156784365978238, + "grad_norm": 2.7184134061804475, + "learning_rate": 6.6446627571820295e-06, + "loss": 0.05, + "step": 4088 + }, + { + "epoch": 1.816122584943371, + "grad_norm": 0.48923408476238167, + "learning_rate": 6.642831969815275e-06, + "loss": 0.0524, + "step": 4089 + }, + { + "epoch": 1.8165667332889184, + "grad_norm": 0.4086306503799156, + "learning_rate": 6.641000935502968e-06, + "loss": 0.0369, + "step": 4090 + }, + { + "epoch": 1.817010881634466, + "grad_norm": 0.42279116752419027, + "learning_rate": 6.639169654520345e-06, + "loss": 0.0384, + "step": 4091 + }, + { + "epoch": 1.8174550299800134, + "grad_norm": 0.47809993483091023, + "learning_rate": 6.637338127142678e-06, + "loss": 0.0557, + "step": 4092 + }, + { + "epoch": 1.8178991783255607, + "grad_norm": 0.43894142864567726, + "learning_rate": 6.635506353645277e-06, + "loss": 0.0442, + "step": 4093 + }, + { + "epoch": 1.818343326671108, + "grad_norm": 0.4258410799351809, + "learning_rate": 6.633674334303489e-06, + "loss": 0.0453, + "step": 4094 + }, + { + "epoch": 1.8187874750166557, + "grad_norm": 0.4223073933724936, + "learning_rate": 6.631842069392698e-06, + "loss": 0.0425, + "step": 4095 + }, + { + "epoch": 1.819231623362203, + "grad_norm": 0.379351123137964, + "learning_rate": 6.630009559188323e-06, + "loss": 0.0333, + "step": 4096 + }, + { + "epoch": 1.8196757717077503, + "grad_norm": 0.38768367796199465, + "learning_rate": 6.628176803965823e-06, + "loss": 0.0391, + "step": 4097 + }, + { + "epoch": 1.8201199200532978, + "grad_norm": 0.34224780916225656, + "learning_rate": 6.62634380400069e-06, + "loss": 0.0341, + "step": 4098 + }, + { + "epoch": 1.8205640683988453, + "grad_norm": 0.42049665765180966, + "learning_rate": 6.624510559568458e-06, + "loss": 0.0332, + "step": 4099 + }, + { + "epoch": 1.8210082167443926, + "grad_norm": 0.4650798805753388, + "learning_rate": 6.622677070944692e-06, + "loss": 0.0522, + "step": 4100 + }, + { + "epoch": 1.82145236508994, + "grad_norm": 0.7403412237132078, + "learning_rate": 6.6208433384049974e-06, + "loss": 0.0468, + "step": 4101 + }, + { + "epoch": 1.8218965134354874, + "grad_norm": 0.5003084175625752, + "learning_rate": 6.619009362225017e-06, + "loss": 0.0369, + "step": 4102 + }, + { + "epoch": 1.822340661781035, + "grad_norm": 0.4709276604503224, + "learning_rate": 6.617175142680426e-06, + "loss": 0.0373, + "step": 4103 + }, + { + "epoch": 1.8227848101265822, + "grad_norm": 0.405634995248283, + "learning_rate": 6.615340680046941e-06, + "loss": 0.0402, + "step": 4104 + }, + { + "epoch": 1.8232289584721297, + "grad_norm": 0.48277401824042093, + "learning_rate": 6.613505974600313e-06, + "loss": 0.0427, + "step": 4105 + }, + { + "epoch": 1.8236731068176772, + "grad_norm": 0.5146304089304894, + "learning_rate": 6.611671026616328e-06, + "loss": 0.0449, + "step": 4106 + }, + { + "epoch": 1.8241172551632245, + "grad_norm": 0.5014137439041658, + "learning_rate": 6.609835836370808e-06, + "loss": 0.0462, + "step": 4107 + }, + { + "epoch": 1.8245614035087718, + "grad_norm": 0.5653319356301473, + "learning_rate": 6.6080004041396176e-06, + "loss": 0.0385, + "step": 4108 + }, + { + "epoch": 1.8250055518543193, + "grad_norm": 0.43607877589233934, + "learning_rate": 6.60616473019865e-06, + "loss": 0.0366, + "step": 4109 + }, + { + "epoch": 1.8254497001998669, + "grad_norm": 0.34512361350227927, + "learning_rate": 6.6043288148238405e-06, + "loss": 0.0327, + "step": 4110 + }, + { + "epoch": 1.8258938485454141, + "grad_norm": 0.46278297105670835, + "learning_rate": 6.6024926582911576e-06, + "loss": 0.0412, + "step": 4111 + }, + { + "epoch": 1.8263379968909614, + "grad_norm": 0.4906820801194115, + "learning_rate": 6.600656260876605e-06, + "loss": 0.0381, + "step": 4112 + }, + { + "epoch": 1.826782145236509, + "grad_norm": 0.48071884316968305, + "learning_rate": 6.598819622856227e-06, + "loss": 0.0409, + "step": 4113 + }, + { + "epoch": 1.8272262935820565, + "grad_norm": 0.47730730807604993, + "learning_rate": 6.596982744506101e-06, + "loss": 0.0557, + "step": 4114 + }, + { + "epoch": 1.8276704419276038, + "grad_norm": 0.3790889364580573, + "learning_rate": 6.595145626102339e-06, + "loss": 0.0404, + "step": 4115 + }, + { + "epoch": 1.8281145902731513, + "grad_norm": 0.463399073158891, + "learning_rate": 6.593308267921095e-06, + "loss": 0.0411, + "step": 4116 + }, + { + "epoch": 1.8285587386186988, + "grad_norm": 0.501176246654533, + "learning_rate": 6.59147067023855e-06, + "loss": 0.0506, + "step": 4117 + }, + { + "epoch": 1.829002886964246, + "grad_norm": 0.46578549229661426, + "learning_rate": 6.58963283333093e-06, + "loss": 0.0454, + "step": 4118 + }, + { + "epoch": 1.8294470353097934, + "grad_norm": 0.4499464900379103, + "learning_rate": 6.587794757474493e-06, + "loss": 0.0328, + "step": 4119 + }, + { + "epoch": 1.8298911836553409, + "grad_norm": 0.4379709545832753, + "learning_rate": 6.585956442945531e-06, + "loss": 0.0342, + "step": 4120 + }, + { + "epoch": 1.8303353320008884, + "grad_norm": 0.5260425346926577, + "learning_rate": 6.584117890020374e-06, + "loss": 0.0439, + "step": 4121 + }, + { + "epoch": 1.8307794803464357, + "grad_norm": 0.426360295901579, + "learning_rate": 6.5822790989753905e-06, + "loss": 0.0411, + "step": 4122 + }, + { + "epoch": 1.831223628691983, + "grad_norm": 0.45007999975919516, + "learning_rate": 6.5804400700869806e-06, + "loss": 0.0443, + "step": 4123 + }, + { + "epoch": 1.8316677770375307, + "grad_norm": 0.59501919115864, + "learning_rate": 6.578600803631579e-06, + "loss": 0.0454, + "step": 4124 + }, + { + "epoch": 1.832111925383078, + "grad_norm": 0.3671794598732458, + "learning_rate": 6.5767612998856625e-06, + "loss": 0.0393, + "step": 4125 + }, + { + "epoch": 1.8325560737286253, + "grad_norm": 0.36069972183791427, + "learning_rate": 6.574921559125737e-06, + "loss": 0.0375, + "step": 4126 + }, + { + "epoch": 1.8330002220741728, + "grad_norm": 0.47931723820550676, + "learning_rate": 6.573081581628349e-06, + "loss": 0.0398, + "step": 4127 + }, + { + "epoch": 1.8334443704197203, + "grad_norm": 0.4393540376137144, + "learning_rate": 6.571241367670077e-06, + "loss": 0.048, + "step": 4128 + }, + { + "epoch": 1.8338885187652676, + "grad_norm": 0.5502773042166398, + "learning_rate": 6.569400917527536e-06, + "loss": 0.0391, + "step": 4129 + }, + { + "epoch": 1.8343326671108149, + "grad_norm": 0.42922588665741507, + "learning_rate": 6.567560231477379e-06, + "loss": 0.0371, + "step": 4130 + }, + { + "epoch": 1.8347768154563624, + "grad_norm": 0.4243115655078441, + "learning_rate": 6.56571930979629e-06, + "loss": 0.0464, + "step": 4131 + }, + { + "epoch": 1.83522096380191, + "grad_norm": 0.5637304292180152, + "learning_rate": 6.563878152760992e-06, + "loss": 0.0587, + "step": 4132 + }, + { + "epoch": 1.8356651121474572, + "grad_norm": 0.5080427318268489, + "learning_rate": 6.562036760648242e-06, + "loss": 0.0445, + "step": 4133 + }, + { + "epoch": 1.8361092604930047, + "grad_norm": 0.5069784829710062, + "learning_rate": 6.560195133734833e-06, + "loss": 0.0329, + "step": 4134 + }, + { + "epoch": 1.8365534088385522, + "grad_norm": 0.3415574151612511, + "learning_rate": 6.55835327229759e-06, + "loss": 0.0292, + "step": 4135 + }, + { + "epoch": 1.8369975571840995, + "grad_norm": 0.4077726290203496, + "learning_rate": 6.556511176613381e-06, + "loss": 0.0347, + "step": 4136 + }, + { + "epoch": 1.8374417055296468, + "grad_norm": 0.5240761531044744, + "learning_rate": 6.554668846959102e-06, + "loss": 0.0454, + "step": 4137 + }, + { + "epoch": 1.8378858538751943, + "grad_norm": 0.5670009106024317, + "learning_rate": 6.552826283611684e-06, + "loss": 0.0488, + "step": 4138 + }, + { + "epoch": 1.8383300022207418, + "grad_norm": 0.5071828765865913, + "learning_rate": 6.5509834868480994e-06, + "loss": 0.0464, + "step": 4139 + }, + { + "epoch": 1.8387741505662891, + "grad_norm": 0.6942163546904666, + "learning_rate": 6.54914045694535e-06, + "loss": 0.0558, + "step": 4140 + }, + { + "epoch": 1.8392182989118364, + "grad_norm": 0.42462008853388156, + "learning_rate": 6.547297194180473e-06, + "loss": 0.028, + "step": 4141 + }, + { + "epoch": 1.839662447257384, + "grad_norm": 0.3935408254645297, + "learning_rate": 6.545453698830545e-06, + "loss": 0.028, + "step": 4142 + }, + { + "epoch": 1.8401065956029314, + "grad_norm": 0.43222089529601715, + "learning_rate": 6.543609971172673e-06, + "loss": 0.0439, + "step": 4143 + }, + { + "epoch": 1.8405507439484787, + "grad_norm": 0.8970440945065747, + "learning_rate": 6.541766011484001e-06, + "loss": 0.0569, + "step": 4144 + }, + { + "epoch": 1.8409948922940262, + "grad_norm": 0.43018474254953293, + "learning_rate": 6.539921820041708e-06, + "loss": 0.0429, + "step": 4145 + }, + { + "epoch": 1.8414390406395738, + "grad_norm": 0.5272129102858879, + "learning_rate": 6.538077397123006e-06, + "loss": 0.042, + "step": 4146 + }, + { + "epoch": 1.841883188985121, + "grad_norm": 0.5767831742144044, + "learning_rate": 6.536232743005144e-06, + "loss": 0.0505, + "step": 4147 + }, + { + "epoch": 1.8423273373306683, + "grad_norm": 0.5455867077197153, + "learning_rate": 6.534387857965405e-06, + "loss": 0.0499, + "step": 4148 + }, + { + "epoch": 1.8427714856762158, + "grad_norm": 0.4392562319330063, + "learning_rate": 6.532542742281105e-06, + "loss": 0.0404, + "step": 4149 + }, + { + "epoch": 1.8432156340217634, + "grad_norm": 0.41150977239123576, + "learning_rate": 6.5306973962296e-06, + "loss": 0.0388, + "step": 4150 + }, + { + "epoch": 1.8436597823673107, + "grad_norm": 1.313911582342841, + "learning_rate": 6.528851820088273e-06, + "loss": 0.0595, + "step": 4151 + }, + { + "epoch": 1.844103930712858, + "grad_norm": 0.3322454559510484, + "learning_rate": 6.527006014134546e-06, + "loss": 0.0274, + "step": 4152 + }, + { + "epoch": 1.8445480790584055, + "grad_norm": 0.44783811513765753, + "learning_rate": 6.525159978645876e-06, + "loss": 0.0391, + "step": 4153 + }, + { + "epoch": 1.844992227403953, + "grad_norm": 0.5214447787650061, + "learning_rate": 6.523313713899755e-06, + "loss": 0.0396, + "step": 4154 + }, + { + "epoch": 1.8454363757495003, + "grad_norm": 0.45172874563459914, + "learning_rate": 6.521467220173705e-06, + "loss": 0.0356, + "step": 4155 + }, + { + "epoch": 1.8458805240950478, + "grad_norm": 0.3652608177780825, + "learning_rate": 6.519620497745286e-06, + "loss": 0.0413, + "step": 4156 + }, + { + "epoch": 1.8463246724405953, + "grad_norm": 0.435616251575995, + "learning_rate": 6.5177735468920935e-06, + "loss": 0.0444, + "step": 4157 + }, + { + "epoch": 1.8467688207861426, + "grad_norm": 0.7590654646546372, + "learning_rate": 6.515926367891754e-06, + "loss": 0.049, + "step": 4158 + }, + { + "epoch": 1.8472129691316899, + "grad_norm": 0.45565692087369364, + "learning_rate": 6.51407896102193e-06, + "loss": 0.0377, + "step": 4159 + }, + { + "epoch": 1.8476571174772374, + "grad_norm": 1.8021394181856782, + "learning_rate": 6.512231326560319e-06, + "loss": 0.0632, + "step": 4160 + }, + { + "epoch": 1.8481012658227849, + "grad_norm": 0.37814866423712995, + "learning_rate": 6.510383464784651e-06, + "loss": 0.0357, + "step": 4161 + }, + { + "epoch": 1.8485454141683322, + "grad_norm": 0.5013395214860348, + "learning_rate": 6.508535375972691e-06, + "loss": 0.0359, + "step": 4162 + }, + { + "epoch": 1.8489895625138795, + "grad_norm": 0.6460169691477774, + "learning_rate": 6.506687060402238e-06, + "loss": 0.055, + "step": 4163 + }, + { + "epoch": 1.8494337108594272, + "grad_norm": 0.731410974029027, + "learning_rate": 6.504838518351127e-06, + "loss": 0.0418, + "step": 4164 + }, + { + "epoch": 1.8498778592049745, + "grad_norm": 0.4318554461256705, + "learning_rate": 6.502989750097224e-06, + "loss": 0.0423, + "step": 4165 + }, + { + "epoch": 1.8503220075505218, + "grad_norm": 0.4065429352215067, + "learning_rate": 6.501140755918428e-06, + "loss": 0.0325, + "step": 4166 + }, + { + "epoch": 1.8507661558960693, + "grad_norm": 0.5268522126273875, + "learning_rate": 6.499291536092679e-06, + "loss": 0.052, + "step": 4167 + }, + { + "epoch": 1.8512103042416168, + "grad_norm": 0.4963236453076724, + "learning_rate": 6.497442090897943e-06, + "loss": 0.0507, + "step": 4168 + }, + { + "epoch": 1.851654452587164, + "grad_norm": 0.5716468049405105, + "learning_rate": 6.495592420612224e-06, + "loss": 0.0557, + "step": 4169 + }, + { + "epoch": 1.8520986009327114, + "grad_norm": 0.36399318359325455, + "learning_rate": 6.493742525513556e-06, + "loss": 0.0318, + "step": 4170 + }, + { + "epoch": 1.852542749278259, + "grad_norm": 0.4368593597512668, + "learning_rate": 6.491892405880015e-06, + "loss": 0.036, + "step": 4171 + }, + { + "epoch": 1.8529868976238064, + "grad_norm": 0.490968844938355, + "learning_rate": 6.490042061989701e-06, + "loss": 0.0384, + "step": 4172 + }, + { + "epoch": 1.8534310459693537, + "grad_norm": 0.4895671851812533, + "learning_rate": 6.4881914941207545e-06, + "loss": 0.0526, + "step": 4173 + }, + { + "epoch": 1.8538751943149012, + "grad_norm": 0.4582350620997356, + "learning_rate": 6.486340702551347e-06, + "loss": 0.044, + "step": 4174 + }, + { + "epoch": 1.8543193426604487, + "grad_norm": 0.4289059463290211, + "learning_rate": 6.484489687559682e-06, + "loss": 0.0379, + "step": 4175 + }, + { + "epoch": 1.854763491005996, + "grad_norm": 0.3956879154243665, + "learning_rate": 6.4826384494240006e-06, + "loss": 0.0324, + "step": 4176 + }, + { + "epoch": 1.8552076393515433, + "grad_norm": 0.4900266012746407, + "learning_rate": 6.480786988422575e-06, + "loss": 0.0472, + "step": 4177 + }, + { + "epoch": 1.8556517876970908, + "grad_norm": 0.4376815416787356, + "learning_rate": 6.47893530483371e-06, + "loss": 0.0631, + "step": 4178 + }, + { + "epoch": 1.8560959360426383, + "grad_norm": 0.3452667188092541, + "learning_rate": 6.4770833989357464e-06, + "loss": 0.031, + "step": 4179 + }, + { + "epoch": 1.8565400843881856, + "grad_norm": 0.4575028834398297, + "learning_rate": 6.4752312710070565e-06, + "loss": 0.0357, + "step": 4180 + }, + { + "epoch": 1.856984232733733, + "grad_norm": 0.35802922814056004, + "learning_rate": 6.4733789213260465e-06, + "loss": 0.0363, + "step": 4181 + }, + { + "epoch": 1.8574283810792804, + "grad_norm": 0.4610516053299678, + "learning_rate": 6.471526350171158e-06, + "loss": 0.0421, + "step": 4182 + }, + { + "epoch": 1.857872529424828, + "grad_norm": 0.41994717663617964, + "learning_rate": 6.46967355782086e-06, + "loss": 0.0427, + "step": 4183 + }, + { + "epoch": 1.8583166777703752, + "grad_norm": 0.41511781298154615, + "learning_rate": 6.4678205445536615e-06, + "loss": 0.0388, + "step": 4184 + }, + { + "epoch": 1.8587608261159227, + "grad_norm": 0.4226703337206278, + "learning_rate": 6.465967310648103e-06, + "loss": 0.0363, + "step": 4185 + }, + { + "epoch": 1.8592049744614703, + "grad_norm": 0.3756285974893726, + "learning_rate": 6.464113856382752e-06, + "loss": 0.0431, + "step": 4186 + }, + { + "epoch": 1.8596491228070176, + "grad_norm": 0.7660609289210849, + "learning_rate": 6.46226018203622e-06, + "loss": 0.0548, + "step": 4187 + }, + { + "epoch": 1.8600932711525648, + "grad_norm": 0.44778181358321073, + "learning_rate": 6.460406287887142e-06, + "loss": 0.0402, + "step": 4188 + }, + { + "epoch": 1.8605374194981124, + "grad_norm": 0.4653083552128945, + "learning_rate": 6.4585521742141924e-06, + "loss": 0.0429, + "step": 4189 + }, + { + "epoch": 1.8609815678436599, + "grad_norm": 0.4092361015012088, + "learning_rate": 6.456697841296072e-06, + "loss": 0.0538, + "step": 4190 + }, + { + "epoch": 1.8614257161892072, + "grad_norm": 0.3943066511337006, + "learning_rate": 6.4548432894115236e-06, + "loss": 0.0422, + "step": 4191 + }, + { + "epoch": 1.8618698645347544, + "grad_norm": 0.574897072037143, + "learning_rate": 6.452988518839314e-06, + "loss": 0.0638, + "step": 4192 + }, + { + "epoch": 1.8623140128803022, + "grad_norm": 0.3525774963677566, + "learning_rate": 6.451133529858249e-06, + "loss": 0.0308, + "step": 4193 + }, + { + "epoch": 1.8627581612258495, + "grad_norm": 0.47268074934306253, + "learning_rate": 6.449278322747164e-06, + "loss": 0.0443, + "step": 4194 + }, + { + "epoch": 1.8632023095713968, + "grad_norm": 0.49332043887118054, + "learning_rate": 6.447422897784927e-06, + "loss": 0.0486, + "step": 4195 + }, + { + "epoch": 1.8636464579169443, + "grad_norm": 0.4599615865532288, + "learning_rate": 6.445567255250442e-06, + "loss": 0.0374, + "step": 4196 + }, + { + "epoch": 1.8640906062624918, + "grad_norm": 0.6216566278556651, + "learning_rate": 6.443711395422641e-06, + "loss": 0.0458, + "step": 4197 + }, + { + "epoch": 1.864534754608039, + "grad_norm": 0.45476603291439793, + "learning_rate": 6.4418553185804946e-06, + "loss": 0.0411, + "step": 4198 + }, + { + "epoch": 1.8649789029535864, + "grad_norm": 0.5568340998927974, + "learning_rate": 6.4399990250030005e-06, + "loss": 0.0469, + "step": 4199 + }, + { + "epoch": 1.8654230512991339, + "grad_norm": 0.5792755550634929, + "learning_rate": 6.438142514969192e-06, + "loss": 0.0486, + "step": 4200 + }, + { + "epoch": 1.8658671996446814, + "grad_norm": 0.5484197721468562, + "learning_rate": 6.436285788758133e-06, + "loss": 0.0373, + "step": 4201 + }, + { + "epoch": 1.8663113479902287, + "grad_norm": 0.408467206743211, + "learning_rate": 6.434428846648923e-06, + "loss": 0.0345, + "step": 4202 + }, + { + "epoch": 1.8667554963357762, + "grad_norm": 0.4273190617477224, + "learning_rate": 6.43257168892069e-06, + "loss": 0.0521, + "step": 4203 + }, + { + "epoch": 1.8671996446813237, + "grad_norm": 0.913822616436895, + "learning_rate": 6.430714315852595e-06, + "loss": 0.0731, + "step": 4204 + }, + { + "epoch": 1.867643793026871, + "grad_norm": 0.3654209870999856, + "learning_rate": 6.428856727723838e-06, + "loss": 0.0426, + "step": 4205 + }, + { + "epoch": 1.8680879413724183, + "grad_norm": 0.35677094964790335, + "learning_rate": 6.426998924813641e-06, + "loss": 0.0354, + "step": 4206 + }, + { + "epoch": 1.8685320897179658, + "grad_norm": 0.7442434568098797, + "learning_rate": 6.425140907401266e-06, + "loss": 0.057, + "step": 4207 + }, + { + "epoch": 1.8689762380635133, + "grad_norm": 0.5792617957834036, + "learning_rate": 6.423282675766002e-06, + "loss": 0.0409, + "step": 4208 + }, + { + "epoch": 1.8694203864090606, + "grad_norm": 0.7180220670929347, + "learning_rate": 6.4214242301871766e-06, + "loss": 0.055, + "step": 4209 + }, + { + "epoch": 1.869864534754608, + "grad_norm": 0.369340852755107, + "learning_rate": 6.4195655709441425e-06, + "loss": 0.0365, + "step": 4210 + }, + { + "epoch": 1.8703086831001554, + "grad_norm": 0.5907561592268787, + "learning_rate": 6.41770669831629e-06, + "loss": 0.05, + "step": 4211 + }, + { + "epoch": 1.870752831445703, + "grad_norm": 0.4737879055532281, + "learning_rate": 6.415847612583036e-06, + "loss": 0.0387, + "step": 4212 + }, + { + "epoch": 1.8711969797912502, + "grad_norm": 0.45358931580739376, + "learning_rate": 6.413988314023837e-06, + "loss": 0.0455, + "step": 4213 + }, + { + "epoch": 1.8716411281367977, + "grad_norm": 0.396008499582763, + "learning_rate": 6.412128802918174e-06, + "loss": 0.0428, + "step": 4214 + }, + { + "epoch": 1.8720852764823452, + "grad_norm": 0.446819083411039, + "learning_rate": 6.410269079545563e-06, + "loss": 0.0353, + "step": 4215 + }, + { + "epoch": 1.8725294248278925, + "grad_norm": 0.5360694926018593, + "learning_rate": 6.408409144185555e-06, + "loss": 0.0394, + "step": 4216 + }, + { + "epoch": 1.8729735731734398, + "grad_norm": 0.419696086205574, + "learning_rate": 6.406548997117728e-06, + "loss": 0.0442, + "step": 4217 + }, + { + "epoch": 1.8734177215189873, + "grad_norm": 0.5541434243753425, + "learning_rate": 6.404688638621691e-06, + "loss": 0.0362, + "step": 4218 + }, + { + "epoch": 1.8738618698645348, + "grad_norm": 0.5424748341749609, + "learning_rate": 6.402828068977092e-06, + "loss": 0.0385, + "step": 4219 + }, + { + "epoch": 1.8743060182100821, + "grad_norm": 0.5755082545471814, + "learning_rate": 6.400967288463604e-06, + "loss": 0.0408, + "step": 4220 + }, + { + "epoch": 1.8747501665556294, + "grad_norm": 0.5534620278515894, + "learning_rate": 6.399106297360934e-06, + "loss": 0.0394, + "step": 4221 + }, + { + "epoch": 1.875194314901177, + "grad_norm": 0.43339133128599205, + "learning_rate": 6.397245095948822e-06, + "loss": 0.0447, + "step": 4222 + }, + { + "epoch": 1.8756384632467245, + "grad_norm": 0.4242062003649521, + "learning_rate": 6.395383684507036e-06, + "loss": 0.0367, + "step": 4223 + }, + { + "epoch": 1.8760826115922717, + "grad_norm": 0.429738984394203, + "learning_rate": 6.393522063315379e-06, + "loss": 0.0434, + "step": 4224 + }, + { + "epoch": 1.8765267599378193, + "grad_norm": 0.5905398248805794, + "learning_rate": 6.391660232653685e-06, + "loss": 0.0387, + "step": 4225 + }, + { + "epoch": 1.8769709082833668, + "grad_norm": 0.46069485833459556, + "learning_rate": 6.389798192801816e-06, + "loss": 0.042, + "step": 4226 + }, + { + "epoch": 1.877415056628914, + "grad_norm": 0.3636972167781559, + "learning_rate": 6.387935944039672e-06, + "loss": 0.0355, + "step": 4227 + }, + { + "epoch": 1.8778592049744613, + "grad_norm": 0.4783149493763749, + "learning_rate": 6.3860734866471775e-06, + "loss": 0.0404, + "step": 4228 + }, + { + "epoch": 1.8783033533200089, + "grad_norm": 0.3543783240099781, + "learning_rate": 6.384210820904292e-06, + "loss": 0.0291, + "step": 4229 + }, + { + "epoch": 1.8787475016655564, + "grad_norm": 0.4823964461174852, + "learning_rate": 6.382347947091008e-06, + "loss": 0.0454, + "step": 4230 + }, + { + "epoch": 1.8791916500111037, + "grad_norm": 0.7938854928835402, + "learning_rate": 6.380484865487346e-06, + "loss": 0.0651, + "step": 4231 + }, + { + "epoch": 1.879635798356651, + "grad_norm": 0.43202860148998995, + "learning_rate": 6.378621576373356e-06, + "loss": 0.0373, + "step": 4232 + }, + { + "epoch": 1.8800799467021987, + "grad_norm": 0.40297846856972874, + "learning_rate": 6.376758080029126e-06, + "loss": 0.0399, + "step": 4233 + }, + { + "epoch": 1.880524095047746, + "grad_norm": 0.6777187231862498, + "learning_rate": 6.37489437673477e-06, + "loss": 0.0451, + "step": 4234 + }, + { + "epoch": 1.8809682433932933, + "grad_norm": 0.449724408023701, + "learning_rate": 6.3730304667704315e-06, + "loss": 0.0364, + "step": 4235 + }, + { + "epoch": 1.8814123917388408, + "grad_norm": 0.508560023808873, + "learning_rate": 6.371166350416293e-06, + "loss": 0.0357, + "step": 4236 + }, + { + "epoch": 1.8818565400843883, + "grad_norm": 0.45097496833863476, + "learning_rate": 6.369302027952559e-06, + "loss": 0.0314, + "step": 4237 + }, + { + "epoch": 1.8823006884299356, + "grad_norm": 0.3904261571231129, + "learning_rate": 6.36743749965947e-06, + "loss": 0.0417, + "step": 4238 + }, + { + "epoch": 1.8827448367754829, + "grad_norm": 0.3804238828895874, + "learning_rate": 6.365572765817295e-06, + "loss": 0.039, + "step": 4239 + }, + { + "epoch": 1.8831889851210304, + "grad_norm": 0.5067555487119486, + "learning_rate": 6.363707826706336e-06, + "loss": 0.0382, + "step": 4240 + }, + { + "epoch": 1.883633133466578, + "grad_norm": 0.4732566932753566, + "learning_rate": 6.3618426826069265e-06, + "loss": 0.0471, + "step": 4241 + }, + { + "epoch": 1.8840772818121252, + "grad_norm": 0.4900514966702779, + "learning_rate": 6.359977333799429e-06, + "loss": 0.0441, + "step": 4242 + }, + { + "epoch": 1.8845214301576727, + "grad_norm": 0.5196575689420362, + "learning_rate": 6.358111780564233e-06, + "loss": 0.0411, + "step": 4243 + }, + { + "epoch": 1.8849655785032202, + "grad_norm": 0.43773037282352034, + "learning_rate": 6.35624602318177e-06, + "loss": 0.0402, + "step": 4244 + }, + { + "epoch": 1.8854097268487675, + "grad_norm": 0.589548376792247, + "learning_rate": 6.354380061932489e-06, + "loss": 0.0381, + "step": 4245 + }, + { + "epoch": 1.8858538751943148, + "grad_norm": 0.5624692048798179, + "learning_rate": 6.352513897096878e-06, + "loss": 0.0394, + "step": 4246 + }, + { + "epoch": 1.8862980235398623, + "grad_norm": 0.48689251252375637, + "learning_rate": 6.3506475289554534e-06, + "loss": 0.0524, + "step": 4247 + }, + { + "epoch": 1.8867421718854098, + "grad_norm": 0.8189635814115437, + "learning_rate": 6.3487809577887625e-06, + "loss": 0.0587, + "step": 4248 + }, + { + "epoch": 1.8871863202309571, + "grad_norm": 0.43746947984188156, + "learning_rate": 6.346914183877379e-06, + "loss": 0.0316, + "step": 4249 + }, + { + "epoch": 1.8876304685765044, + "grad_norm": 0.5538154749300077, + "learning_rate": 6.345047207501916e-06, + "loss": 0.0471, + "step": 4250 + }, + { + "epoch": 1.888074616922052, + "grad_norm": 0.4360679802124053, + "learning_rate": 6.34318002894301e-06, + "loss": 0.0371, + "step": 4251 + }, + { + "epoch": 1.8885187652675994, + "grad_norm": 0.5075276918535925, + "learning_rate": 6.341312648481328e-06, + "loss": 0.0417, + "step": 4252 + }, + { + "epoch": 1.8889629136131467, + "grad_norm": 0.49156392064774324, + "learning_rate": 6.339445066397569e-06, + "loss": 0.0423, + "step": 4253 + }, + { + "epoch": 1.8894070619586942, + "grad_norm": 0.852671422019365, + "learning_rate": 6.337577282972465e-06, + "loss": 0.0524, + "step": 4254 + }, + { + "epoch": 1.8898512103042417, + "grad_norm": 0.37799180602912125, + "learning_rate": 6.335709298486773e-06, + "loss": 0.0303, + "step": 4255 + }, + { + "epoch": 1.890295358649789, + "grad_norm": 0.4549492374420902, + "learning_rate": 6.333841113221283e-06, + "loss": 0.0376, + "step": 4256 + }, + { + "epoch": 1.8907395069953363, + "grad_norm": 0.4682663299692366, + "learning_rate": 6.331972727456816e-06, + "loss": 0.0264, + "step": 4257 + }, + { + "epoch": 1.8911836553408838, + "grad_norm": 0.3907535099519968, + "learning_rate": 6.330104141474223e-06, + "loss": 0.0382, + "step": 4258 + }, + { + "epoch": 1.8916278036864314, + "grad_norm": 0.43935294276918296, + "learning_rate": 6.328235355554382e-06, + "loss": 0.0362, + "step": 4259 + }, + { + "epoch": 1.8920719520319786, + "grad_norm": 0.576541252998893, + "learning_rate": 6.326366369978204e-06, + "loss": 0.0481, + "step": 4260 + }, + { + "epoch": 1.892516100377526, + "grad_norm": 0.43778261815885045, + "learning_rate": 6.324497185026631e-06, + "loss": 0.034, + "step": 4261 + }, + { + "epoch": 1.8929602487230737, + "grad_norm": 0.44466828672780784, + "learning_rate": 6.3226278009806315e-06, + "loss": 0.0307, + "step": 4262 + }, + { + "epoch": 1.893404397068621, + "grad_norm": 0.41339084625105765, + "learning_rate": 6.320758218121205e-06, + "loss": 0.0443, + "step": 4263 + }, + { + "epoch": 1.8938485454141682, + "grad_norm": 0.4481199734504317, + "learning_rate": 6.318888436729382e-06, + "loss": 0.0354, + "step": 4264 + }, + { + "epoch": 1.8942926937597158, + "grad_norm": 0.6150629213528874, + "learning_rate": 6.317018457086226e-06, + "loss": 0.046, + "step": 4265 + }, + { + "epoch": 1.8947368421052633, + "grad_norm": 0.8066271182355852, + "learning_rate": 6.31514827947282e-06, + "loss": 0.0425, + "step": 4266 + }, + { + "epoch": 1.8951809904508106, + "grad_norm": 0.5827323877974867, + "learning_rate": 6.31327790417029e-06, + "loss": 0.0457, + "step": 4267 + }, + { + "epoch": 1.8956251387963579, + "grad_norm": 0.4984272032963798, + "learning_rate": 6.311407331459781e-06, + "loss": 0.03, + "step": 4268 + }, + { + "epoch": 1.8960692871419054, + "grad_norm": 0.5018245857439331, + "learning_rate": 6.309536561622474e-06, + "loss": 0.0399, + "step": 4269 + }, + { + "epoch": 1.8965134354874529, + "grad_norm": 0.46552046167173733, + "learning_rate": 6.307665594939575e-06, + "loss": 0.0379, + "step": 4270 + }, + { + "epoch": 1.8969575838330002, + "grad_norm": 0.48037243813958513, + "learning_rate": 6.3057944316923246e-06, + "loss": 0.0293, + "step": 4271 + }, + { + "epoch": 1.8974017321785477, + "grad_norm": 0.4527119332615495, + "learning_rate": 6.30392307216199e-06, + "loss": 0.0386, + "step": 4272 + }, + { + "epoch": 1.8978458805240952, + "grad_norm": 0.419684782365093, + "learning_rate": 6.3020515166298665e-06, + "loss": 0.0317, + "step": 4273 + }, + { + "epoch": 1.8982900288696425, + "grad_norm": 0.418987789446061, + "learning_rate": 6.300179765377283e-06, + "loss": 0.0361, + "step": 4274 + }, + { + "epoch": 1.8987341772151898, + "grad_norm": 0.58612758503126, + "learning_rate": 6.298307818685595e-06, + "loss": 0.0416, + "step": 4275 + }, + { + "epoch": 1.8991783255607373, + "grad_norm": 0.501397444674289, + "learning_rate": 6.296435676836188e-06, + "loss": 0.0443, + "step": 4276 + }, + { + "epoch": 1.8996224739062848, + "grad_norm": 0.4267458008276168, + "learning_rate": 6.294563340110474e-06, + "loss": 0.0465, + "step": 4277 + }, + { + "epoch": 1.900066622251832, + "grad_norm": 0.5455194479999508, + "learning_rate": 6.292690808789901e-06, + "loss": 0.055, + "step": 4278 + }, + { + "epoch": 1.9005107705973794, + "grad_norm": 0.678438654630706, + "learning_rate": 6.290818083155941e-06, + "loss": 0.0559, + "step": 4279 + }, + { + "epoch": 1.900954918942927, + "grad_norm": 0.5367006488443361, + "learning_rate": 6.288945163490093e-06, + "loss": 0.045, + "step": 4280 + }, + { + "epoch": 1.9013990672884744, + "grad_norm": 0.41451518482226474, + "learning_rate": 6.287072050073894e-06, + "loss": 0.0384, + "step": 4281 + }, + { + "epoch": 1.9018432156340217, + "grad_norm": 0.594043909692931, + "learning_rate": 6.2851987431889025e-06, + "loss": 0.0414, + "step": 4282 + }, + { + "epoch": 1.9022873639795692, + "grad_norm": 0.4491119239025248, + "learning_rate": 6.2833252431167066e-06, + "loss": 0.0393, + "step": 4283 + }, + { + "epoch": 1.9027315123251167, + "grad_norm": 0.39577109884937645, + "learning_rate": 6.2814515501389275e-06, + "loss": 0.035, + "step": 4284 + }, + { + "epoch": 1.903175660670664, + "grad_norm": 0.3708469471452283, + "learning_rate": 6.279577664537213e-06, + "loss": 0.026, + "step": 4285 + }, + { + "epoch": 1.9036198090162113, + "grad_norm": 0.4178266369288517, + "learning_rate": 6.2777035865932375e-06, + "loss": 0.0334, + "step": 4286 + }, + { + "epoch": 1.9040639573617588, + "grad_norm": 0.5196821555520535, + "learning_rate": 6.275829316588711e-06, + "loss": 0.0454, + "step": 4287 + }, + { + "epoch": 1.9045081057073063, + "grad_norm": 0.4709231790248879, + "learning_rate": 6.273954854805364e-06, + "loss": 0.0395, + "step": 4288 + }, + { + "epoch": 1.9049522540528536, + "grad_norm": 0.47040632233388197, + "learning_rate": 6.2720802015249615e-06, + "loss": 0.0459, + "step": 4289 + }, + { + "epoch": 1.905396402398401, + "grad_norm": 0.5132518236758158, + "learning_rate": 6.2702053570292976e-06, + "loss": 0.0453, + "step": 4290 + }, + { + "epoch": 1.9058405507439484, + "grad_norm": 0.4975445699296196, + "learning_rate": 6.26833032160019e-06, + "loss": 0.0384, + "step": 4291 + }, + { + "epoch": 1.906284699089496, + "grad_norm": 0.42703967829123596, + "learning_rate": 6.26645509551949e-06, + "loss": 0.036, + "step": 4292 + }, + { + "epoch": 1.9067288474350432, + "grad_norm": 0.4865047041410074, + "learning_rate": 6.264579679069077e-06, + "loss": 0.0371, + "step": 4293 + }, + { + "epoch": 1.9071729957805907, + "grad_norm": 0.5254303510911112, + "learning_rate": 6.262704072530856e-06, + "loss": 0.0501, + "step": 4294 + }, + { + "epoch": 1.9076171441261383, + "grad_norm": 0.36509386500252816, + "learning_rate": 6.260828276186762e-06, + "loss": 0.0366, + "step": 4295 + }, + { + "epoch": 1.9080612924716855, + "grad_norm": 0.4317077297037696, + "learning_rate": 6.258952290318763e-06, + "loss": 0.0367, + "step": 4296 + }, + { + "epoch": 1.9085054408172328, + "grad_norm": 0.4444631290624624, + "learning_rate": 6.257076115208847e-06, + "loss": 0.0366, + "step": 4297 + }, + { + "epoch": 1.9089495891627803, + "grad_norm": 0.3303398456720475, + "learning_rate": 6.255199751139036e-06, + "loss": 0.03, + "step": 4298 + }, + { + "epoch": 1.9093937375083279, + "grad_norm": 0.33258553726398404, + "learning_rate": 6.253323198391383e-06, + "loss": 0.0345, + "step": 4299 + }, + { + "epoch": 1.9098378858538752, + "grad_norm": 0.6445178538911288, + "learning_rate": 6.251446457247961e-06, + "loss": 0.0444, + "step": 4300 + }, + { + "epoch": 1.9102820341994224, + "grad_norm": 0.503486774604937, + "learning_rate": 6.249569527990878e-06, + "loss": 0.0508, + "step": 4301 + }, + { + "epoch": 1.9107261825449702, + "grad_norm": 0.532598812939261, + "learning_rate": 6.247692410902271e-06, + "loss": 0.0366, + "step": 4302 + }, + { + "epoch": 1.9111703308905175, + "grad_norm": 0.5689719802618134, + "learning_rate": 6.245815106264297e-06, + "loss": 0.0508, + "step": 4303 + }, + { + "epoch": 1.9116144792360648, + "grad_norm": 0.44795775453400327, + "learning_rate": 6.243937614359152e-06, + "loss": 0.0425, + "step": 4304 + }, + { + "epoch": 1.9120586275816123, + "grad_norm": 0.63806779442234, + "learning_rate": 6.242059935469051e-06, + "loss": 0.0409, + "step": 4305 + }, + { + "epoch": 1.9125027759271598, + "grad_norm": 0.41950353971072635, + "learning_rate": 6.240182069876244e-06, + "loss": 0.0344, + "step": 4306 + }, + { + "epoch": 1.912946924272707, + "grad_norm": 0.599261124567421, + "learning_rate": 6.238304017863005e-06, + "loss": 0.0415, + "step": 4307 + }, + { + "epoch": 1.9133910726182544, + "grad_norm": 0.364585724464603, + "learning_rate": 6.236425779711637e-06, + "loss": 0.0346, + "step": 4308 + }, + { + "epoch": 1.9138352209638019, + "grad_norm": 0.6254535972915019, + "learning_rate": 6.23454735570447e-06, + "loss": 0.0509, + "step": 4309 + }, + { + "epoch": 1.9142793693093494, + "grad_norm": 0.41487059504531737, + "learning_rate": 6.232668746123865e-06, + "loss": 0.0331, + "step": 4310 + }, + { + "epoch": 1.9147235176548967, + "grad_norm": 0.5104472659733067, + "learning_rate": 6.230789951252208e-06, + "loss": 0.0349, + "step": 4311 + }, + { + "epoch": 1.9151676660004442, + "grad_norm": 0.40794899664296186, + "learning_rate": 6.228910971371913e-06, + "loss": 0.0408, + "step": 4312 + }, + { + "epoch": 1.9156118143459917, + "grad_norm": 0.5437919242408334, + "learning_rate": 6.227031806765424e-06, + "loss": 0.0387, + "step": 4313 + }, + { + "epoch": 1.916055962691539, + "grad_norm": 0.4898004885001543, + "learning_rate": 6.225152457715211e-06, + "loss": 0.0501, + "step": 4314 + }, + { + "epoch": 1.9165001110370863, + "grad_norm": 0.32765369475691036, + "learning_rate": 6.223272924503773e-06, + "loss": 0.0313, + "step": 4315 + }, + { + "epoch": 1.9169442593826338, + "grad_norm": 1.1036222737179877, + "learning_rate": 6.221393207413634e-06, + "loss": 0.0555, + "step": 4316 + }, + { + "epoch": 1.9173884077281813, + "grad_norm": 0.5313402238818737, + "learning_rate": 6.219513306727347e-06, + "loss": 0.0435, + "step": 4317 + }, + { + "epoch": 1.9178325560737286, + "grad_norm": 0.44083561782231834, + "learning_rate": 6.217633222727495e-06, + "loss": 0.0375, + "step": 4318 + }, + { + "epoch": 1.918276704419276, + "grad_norm": 0.9040287143232594, + "learning_rate": 6.215752955696686e-06, + "loss": 0.0741, + "step": 4319 + }, + { + "epoch": 1.9187208527648234, + "grad_norm": 0.406508099143207, + "learning_rate": 6.213872505917554e-06, + "loss": 0.0309, + "step": 4320 + }, + { + "epoch": 1.919165001110371, + "grad_norm": 0.4001138707894935, + "learning_rate": 6.2119918736727666e-06, + "loss": 0.0292, + "step": 4321 + }, + { + "epoch": 1.9196091494559182, + "grad_norm": 0.37528739888488877, + "learning_rate": 6.210111059245011e-06, + "loss": 0.0323, + "step": 4322 + }, + { + "epoch": 1.9200532978014657, + "grad_norm": 0.4611567927734023, + "learning_rate": 6.2082300629170065e-06, + "loss": 0.0369, + "step": 4323 + }, + { + "epoch": 1.9204974461470132, + "grad_norm": 0.4615724924310186, + "learning_rate": 6.2063488849715e-06, + "loss": 0.0468, + "step": 4324 + }, + { + "epoch": 1.9209415944925605, + "grad_norm": 0.36847012924343564, + "learning_rate": 6.204467525691265e-06, + "loss": 0.0403, + "step": 4325 + }, + { + "epoch": 1.9213857428381078, + "grad_norm": 0.7462560129733484, + "learning_rate": 6.202585985359099e-06, + "loss": 0.0463, + "step": 4326 + }, + { + "epoch": 1.9218298911836553, + "grad_norm": 0.4111595385170657, + "learning_rate": 6.200704264257832e-06, + "loss": 0.0402, + "step": 4327 + }, + { + "epoch": 1.9222740395292028, + "grad_norm": 0.5886660338070746, + "learning_rate": 6.198822362670316e-06, + "loss": 0.0409, + "step": 4328 + }, + { + "epoch": 1.9227181878747501, + "grad_norm": 0.551351371253542, + "learning_rate": 6.196940280879436e-06, + "loss": 0.0371, + "step": 4329 + }, + { + "epoch": 1.9231623362202974, + "grad_norm": 0.5211858751187458, + "learning_rate": 6.1950580191681e-06, + "loss": 0.043, + "step": 4330 + }, + { + "epoch": 1.9236064845658452, + "grad_norm": 0.4258998039644192, + "learning_rate": 6.193175577819242e-06, + "loss": 0.0353, + "step": 4331 + }, + { + "epoch": 1.9240506329113924, + "grad_norm": 0.4043175914509727, + "learning_rate": 6.191292957115825e-06, + "loss": 0.0285, + "step": 4332 + }, + { + "epoch": 1.9244947812569397, + "grad_norm": 0.4563276066272647, + "learning_rate": 6.1894101573408425e-06, + "loss": 0.0371, + "step": 4333 + }, + { + "epoch": 1.9249389296024872, + "grad_norm": 0.54090150421601, + "learning_rate": 6.1875271787773075e-06, + "loss": 0.0519, + "step": 4334 + }, + { + "epoch": 1.9253830779480348, + "grad_norm": 0.3562700559697815, + "learning_rate": 6.185644021708266e-06, + "loss": 0.0267, + "step": 4335 + }, + { + "epoch": 1.925827226293582, + "grad_norm": 1.2138310918842605, + "learning_rate": 6.183760686416785e-06, + "loss": 0.0483, + "step": 4336 + }, + { + "epoch": 1.9262713746391293, + "grad_norm": 0.4024294145624342, + "learning_rate": 6.181877173185966e-06, + "loss": 0.0311, + "step": 4337 + }, + { + "epoch": 1.9267155229846769, + "grad_norm": 0.6790926459631366, + "learning_rate": 6.1799934822989315e-06, + "loss": 0.0494, + "step": 4338 + }, + { + "epoch": 1.9271596713302244, + "grad_norm": 0.48577305067511145, + "learning_rate": 6.178109614038832e-06, + "loss": 0.0412, + "step": 4339 + }, + { + "epoch": 1.9276038196757717, + "grad_norm": 0.4461768395787564, + "learning_rate": 6.176225568688844e-06, + "loss": 0.0337, + "step": 4340 + }, + { + "epoch": 1.9280479680213192, + "grad_norm": 0.38001980047585654, + "learning_rate": 6.174341346532173e-06, + "loss": 0.0404, + "step": 4341 + }, + { + "epoch": 1.9284921163668667, + "grad_norm": 0.4252479347353827, + "learning_rate": 6.1724569478520495e-06, + "loss": 0.0452, + "step": 4342 + }, + { + "epoch": 1.928936264712414, + "grad_norm": 0.3799614809822163, + "learning_rate": 6.1705723729317295e-06, + "loss": 0.0401, + "step": 4343 + }, + { + "epoch": 1.9293804130579613, + "grad_norm": 0.8795175854169174, + "learning_rate": 6.168687622054497e-06, + "loss": 0.0419, + "step": 4344 + }, + { + "epoch": 1.9298245614035088, + "grad_norm": 0.35580314936274277, + "learning_rate": 6.1668026955036645e-06, + "loss": 0.0275, + "step": 4345 + }, + { + "epoch": 1.9302687097490563, + "grad_norm": 0.5402722431604882, + "learning_rate": 6.1649175935625635e-06, + "loss": 0.0459, + "step": 4346 + }, + { + "epoch": 1.9307128580946036, + "grad_norm": 0.6557570062151068, + "learning_rate": 6.1630323165145615e-06, + "loss": 0.0463, + "step": 4347 + }, + { + "epoch": 1.9311570064401509, + "grad_norm": 0.45047259263477546, + "learning_rate": 6.161146864643045e-06, + "loss": 0.0408, + "step": 4348 + }, + { + "epoch": 1.9316011547856984, + "grad_norm": 0.4701310559005135, + "learning_rate": 6.159261238231431e-06, + "loss": 0.0379, + "step": 4349 + }, + { + "epoch": 1.932045303131246, + "grad_norm": 0.4022795195574756, + "learning_rate": 6.15737543756316e-06, + "loss": 0.0372, + "step": 4350 + }, + { + "epoch": 1.9324894514767932, + "grad_norm": 0.4838913493812182, + "learning_rate": 6.1554894629217e-06, + "loss": 0.0512, + "step": 4351 + }, + { + "epoch": 1.9329335998223407, + "grad_norm": 1.2278585350906228, + "learning_rate": 6.153603314590547e-06, + "loss": 0.0864, + "step": 4352 + }, + { + "epoch": 1.9333777481678882, + "grad_norm": 0.354715018531944, + "learning_rate": 6.1517169928532185e-06, + "loss": 0.0324, + "step": 4353 + }, + { + "epoch": 1.9338218965134355, + "grad_norm": 0.3643440217338777, + "learning_rate": 6.149830497993261e-06, + "loss": 0.0346, + "step": 4354 + }, + { + "epoch": 1.9342660448589828, + "grad_norm": 0.5368481909179409, + "learning_rate": 6.147943830294248e-06, + "loss": 0.0406, + "step": 4355 + }, + { + "epoch": 1.9347101932045303, + "grad_norm": 0.3847206541326022, + "learning_rate": 6.146056990039777e-06, + "loss": 0.0291, + "step": 4356 + }, + { + "epoch": 1.9351543415500778, + "grad_norm": 0.4195486386731195, + "learning_rate": 6.1441699775134724e-06, + "loss": 0.037, + "step": 4357 + }, + { + "epoch": 1.935598489895625, + "grad_norm": 0.4387846462865948, + "learning_rate": 6.142282792998985e-06, + "loss": 0.0336, + "step": 4358 + }, + { + "epoch": 1.9360426382411724, + "grad_norm": 0.33683241345246556, + "learning_rate": 6.14039543677999e-06, + "loss": 0.0357, + "step": 4359 + }, + { + "epoch": 1.93648678658672, + "grad_norm": 0.39849510173786257, + "learning_rate": 6.138507909140187e-06, + "loss": 0.0384, + "step": 4360 + }, + { + "epoch": 1.9369309349322674, + "grad_norm": 0.4846553810738608, + "learning_rate": 6.136620210363307e-06, + "loss": 0.0488, + "step": 4361 + }, + { + "epoch": 1.9373750832778147, + "grad_norm": 0.44711186795351665, + "learning_rate": 6.1347323407331e-06, + "loss": 0.0403, + "step": 4362 + }, + { + "epoch": 1.9378192316233622, + "grad_norm": 0.546459442186708, + "learning_rate": 6.132844300533348e-06, + "loss": 0.0442, + "step": 4363 + }, + { + "epoch": 1.9382633799689097, + "grad_norm": 0.6727853387902581, + "learning_rate": 6.130956090047852e-06, + "loss": 0.0542, + "step": 4364 + }, + { + "epoch": 1.938707528314457, + "grad_norm": 0.5269547149952615, + "learning_rate": 6.129067709560445e-06, + "loss": 0.0553, + "step": 4365 + }, + { + "epoch": 1.9391516766600043, + "grad_norm": 1.3382161278121019, + "learning_rate": 6.127179159354985e-06, + "loss": 0.0426, + "step": 4366 + }, + { + "epoch": 1.9395958250055518, + "grad_norm": 0.4304150369100395, + "learning_rate": 6.125290439715346e-06, + "loss": 0.0373, + "step": 4367 + }, + { + "epoch": 1.9400399733510993, + "grad_norm": 0.43883673281798163, + "learning_rate": 6.12340155092544e-06, + "loss": 0.0377, + "step": 4368 + }, + { + "epoch": 1.9404841216966466, + "grad_norm": 0.5188193338070988, + "learning_rate": 6.121512493269197e-06, + "loss": 0.0371, + "step": 4369 + }, + { + "epoch": 1.9409282700421941, + "grad_norm": 0.505484234164325, + "learning_rate": 6.119623267030576e-06, + "loss": 0.0402, + "step": 4370 + }, + { + "epoch": 1.9413724183877417, + "grad_norm": 0.5176850701235032, + "learning_rate": 6.1177338724935576e-06, + "loss": 0.0466, + "step": 4371 + }, + { + "epoch": 1.941816566733289, + "grad_norm": 0.4703785260705156, + "learning_rate": 6.115844309942153e-06, + "loss": 0.04, + "step": 4372 + }, + { + "epoch": 1.9422607150788362, + "grad_norm": 0.43937571329979475, + "learning_rate": 6.1139545796603925e-06, + "loss": 0.0395, + "step": 4373 + }, + { + "epoch": 1.9427048634243838, + "grad_norm": 0.5671825068350916, + "learning_rate": 6.112064681932335e-06, + "loss": 0.0445, + "step": 4374 + }, + { + "epoch": 1.9431490117699313, + "grad_norm": 0.38466158842057985, + "learning_rate": 6.110174617042066e-06, + "loss": 0.033, + "step": 4375 + }, + { + "epoch": 1.9435931601154786, + "grad_norm": 0.46940056697212595, + "learning_rate": 6.108284385273695e-06, + "loss": 0.0448, + "step": 4376 + }, + { + "epoch": 1.9440373084610258, + "grad_norm": 0.4747699143093753, + "learning_rate": 6.106393986911353e-06, + "loss": 0.0459, + "step": 4377 + }, + { + "epoch": 1.9444814568065734, + "grad_norm": 0.4922037733238623, + "learning_rate": 6.1045034222392e-06, + "loss": 0.0462, + "step": 4378 + }, + { + "epoch": 1.9449256051521209, + "grad_norm": 0.48074571927831655, + "learning_rate": 6.102612691541422e-06, + "loss": 0.0433, + "step": 4379 + }, + { + "epoch": 1.9453697534976682, + "grad_norm": 0.45838348922641425, + "learning_rate": 6.1007217951022244e-06, + "loss": 0.0567, + "step": 4380 + }, + { + "epoch": 1.9458139018432157, + "grad_norm": 0.39557432428184086, + "learning_rate": 6.098830733205844e-06, + "loss": 0.0318, + "step": 4381 + }, + { + "epoch": 1.9462580501887632, + "grad_norm": 0.4177848324601465, + "learning_rate": 6.096939506136539e-06, + "loss": 0.0475, + "step": 4382 + }, + { + "epoch": 1.9467021985343105, + "grad_norm": 0.4875377634328118, + "learning_rate": 6.095048114178591e-06, + "loss": 0.0385, + "step": 4383 + }, + { + "epoch": 1.9471463468798578, + "grad_norm": 0.3458943442785933, + "learning_rate": 6.093156557616311e-06, + "loss": 0.0259, + "step": 4384 + }, + { + "epoch": 1.9475904952254053, + "grad_norm": 0.48182367581929, + "learning_rate": 6.09126483673403e-06, + "loss": 0.0516, + "step": 4385 + }, + { + "epoch": 1.9480346435709528, + "grad_norm": 0.4073510855592602, + "learning_rate": 6.089372951816108e-06, + "loss": 0.0313, + "step": 4386 + }, + { + "epoch": 1.9484787919165, + "grad_norm": 0.48547952748532, + "learning_rate": 6.087480903146926e-06, + "loss": 0.0445, + "step": 4387 + }, + { + "epoch": 1.9489229402620474, + "grad_norm": 0.5136489604538488, + "learning_rate": 6.085588691010888e-06, + "loss": 0.0452, + "step": 4388 + }, + { + "epoch": 1.9493670886075949, + "grad_norm": 0.37826853557517365, + "learning_rate": 6.0836963156924335e-06, + "loss": 0.0356, + "step": 4389 + }, + { + "epoch": 1.9498112369531424, + "grad_norm": 0.4912572577888434, + "learning_rate": 6.081803777476012e-06, + "loss": 0.0462, + "step": 4390 + }, + { + "epoch": 1.9502553852986897, + "grad_norm": 0.4860207069904635, + "learning_rate": 6.079911076646106e-06, + "loss": 0.0425, + "step": 4391 + }, + { + "epoch": 1.9506995336442372, + "grad_norm": 0.3704904721214982, + "learning_rate": 6.07801821348722e-06, + "loss": 0.032, + "step": 4392 + }, + { + "epoch": 1.9511436819897847, + "grad_norm": 0.4740957603042776, + "learning_rate": 6.076125188283885e-06, + "loss": 0.0447, + "step": 4393 + }, + { + "epoch": 1.951587830335332, + "grad_norm": 0.4597613542477622, + "learning_rate": 6.074232001320654e-06, + "loss": 0.0508, + "step": 4394 + }, + { + "epoch": 1.9520319786808793, + "grad_norm": 0.35666990194120035, + "learning_rate": 6.072338652882105e-06, + "loss": 0.036, + "step": 4395 + }, + { + "epoch": 1.9524761270264268, + "grad_norm": 0.5521794806193095, + "learning_rate": 6.070445143252842e-06, + "loss": 0.0555, + "step": 4396 + }, + { + "epoch": 1.9529202753719743, + "grad_norm": 0.4402940395866301, + "learning_rate": 6.0685514727174885e-06, + "loss": 0.0365, + "step": 4397 + }, + { + "epoch": 1.9533644237175216, + "grad_norm": 0.45905507047472427, + "learning_rate": 6.066657641560697e-06, + "loss": 0.0371, + "step": 4398 + }, + { + "epoch": 1.953808572063069, + "grad_norm": 0.5661699028159852, + "learning_rate": 6.064763650067145e-06, + "loss": 0.0382, + "step": 4399 + }, + { + "epoch": 1.9542527204086166, + "grad_norm": 0.35700483978034786, + "learning_rate": 6.062869498521527e-06, + "loss": 0.0366, + "step": 4400 + }, + { + "epoch": 1.954696868754164, + "grad_norm": 0.43424661857402086, + "learning_rate": 6.060975187208569e-06, + "loss": 0.0489, + "step": 4401 + }, + { + "epoch": 1.9551410170997112, + "grad_norm": 0.4129343066796859, + "learning_rate": 6.059080716413016e-06, + "loss": 0.0389, + "step": 4402 + }, + { + "epoch": 1.9555851654452587, + "grad_norm": 0.5034728724416178, + "learning_rate": 6.057186086419643e-06, + "loss": 0.043, + "step": 4403 + }, + { + "epoch": 1.9560293137908062, + "grad_norm": 0.36382177248066033, + "learning_rate": 6.055291297513243e-06, + "loss": 0.0305, + "step": 4404 + }, + { + "epoch": 1.9564734621363535, + "grad_norm": 0.42171053505933703, + "learning_rate": 6.053396349978632e-06, + "loss": 0.0537, + "step": 4405 + }, + { + "epoch": 1.9569176104819008, + "grad_norm": 0.42351346045820926, + "learning_rate": 6.0515012441006574e-06, + "loss": 0.0444, + "step": 4406 + }, + { + "epoch": 1.9573617588274483, + "grad_norm": 0.3951815066981233, + "learning_rate": 6.0496059801641835e-06, + "loss": 0.0329, + "step": 4407 + }, + { + "epoch": 1.9578059071729959, + "grad_norm": 0.6939750072874196, + "learning_rate": 6.047710558454102e-06, + "loss": 0.0495, + "step": 4408 + }, + { + "epoch": 1.9582500555185431, + "grad_norm": 0.47139234544641123, + "learning_rate": 6.0458149792553245e-06, + "loss": 0.0338, + "step": 4409 + }, + { + "epoch": 1.9586942038640907, + "grad_norm": 0.4743764736759579, + "learning_rate": 6.043919242852792e-06, + "loss": 0.0401, + "step": 4410 + }, + { + "epoch": 1.9591383522096382, + "grad_norm": 0.7527833386589416, + "learning_rate": 6.042023349531463e-06, + "loss": 0.046, + "step": 4411 + }, + { + "epoch": 1.9595825005551855, + "grad_norm": 0.4434725389118798, + "learning_rate": 6.040127299576324e-06, + "loss": 0.0366, + "step": 4412 + }, + { + "epoch": 1.9600266489007327, + "grad_norm": 0.5310028240539822, + "learning_rate": 6.038231093272383e-06, + "loss": 0.0475, + "step": 4413 + }, + { + "epoch": 1.9604707972462803, + "grad_norm": 0.6870302944712058, + "learning_rate": 6.036334730904672e-06, + "loss": 0.0401, + "step": 4414 + }, + { + "epoch": 1.9609149455918278, + "grad_norm": 0.3754707520324698, + "learning_rate": 6.034438212758249e-06, + "loss": 0.0344, + "step": 4415 + }, + { + "epoch": 1.961359093937375, + "grad_norm": 0.7273648929204632, + "learning_rate": 6.032541539118188e-06, + "loss": 0.059, + "step": 4416 + }, + { + "epoch": 1.9618032422829224, + "grad_norm": 0.5563803052374947, + "learning_rate": 6.030644710269595e-06, + "loss": 0.0337, + "step": 4417 + }, + { + "epoch": 1.9622473906284699, + "grad_norm": 0.5196612260430283, + "learning_rate": 6.028747726497594e-06, + "loss": 0.043, + "step": 4418 + }, + { + "epoch": 1.9626915389740174, + "grad_norm": 0.37874941124980105, + "learning_rate": 6.026850588087334e-06, + "loss": 0.0438, + "step": 4419 + }, + { + "epoch": 1.9631356873195647, + "grad_norm": 0.3791041958907497, + "learning_rate": 6.024953295323987e-06, + "loss": 0.0318, + "step": 4420 + }, + { + "epoch": 1.9635798356651122, + "grad_norm": 0.7223736573889502, + "learning_rate": 6.02305584849275e-06, + "loss": 0.0406, + "step": 4421 + }, + { + "epoch": 1.9640239840106597, + "grad_norm": 0.5182650699628792, + "learning_rate": 6.02115824787884e-06, + "loss": 0.0292, + "step": 4422 + }, + { + "epoch": 1.964468132356207, + "grad_norm": 0.42934775755171317, + "learning_rate": 6.019260493767499e-06, + "loss": 0.0328, + "step": 4423 + }, + { + "epoch": 1.9649122807017543, + "grad_norm": 0.5027721140555967, + "learning_rate": 6.0173625864439924e-06, + "loss": 0.0328, + "step": 4424 + }, + { + "epoch": 1.9653564290473018, + "grad_norm": 0.4239046281164633, + "learning_rate": 6.015464526193605e-06, + "loss": 0.0402, + "step": 4425 + }, + { + "epoch": 1.9658005773928493, + "grad_norm": 0.3685503956667686, + "learning_rate": 6.013566313301651e-06, + "loss": 0.0356, + "step": 4426 + }, + { + "epoch": 1.9662447257383966, + "grad_norm": 0.5335432976601984, + "learning_rate": 6.011667948053462e-06, + "loss": 0.0488, + "step": 4427 + }, + { + "epoch": 1.9666888740839439, + "grad_norm": 0.41452010755189883, + "learning_rate": 6.009769430734395e-06, + "loss": 0.0458, + "step": 4428 + }, + { + "epoch": 1.9671330224294914, + "grad_norm": 0.4362710704789863, + "learning_rate": 6.007870761629831e-06, + "loss": 0.0399, + "step": 4429 + }, + { + "epoch": 1.967577170775039, + "grad_norm": 0.40405492276821936, + "learning_rate": 6.005971941025171e-06, + "loss": 0.0338, + "step": 4430 + }, + { + "epoch": 1.9680213191205862, + "grad_norm": 0.602695490290635, + "learning_rate": 6.004072969205838e-06, + "loss": 0.0684, + "step": 4431 + }, + { + "epoch": 1.9684654674661337, + "grad_norm": 0.47106411491289435, + "learning_rate": 6.002173846457282e-06, + "loss": 0.0402, + "step": 4432 + }, + { + "epoch": 1.9689096158116812, + "grad_norm": 0.5941226728121359, + "learning_rate": 6.0002745730649725e-06, + "loss": 0.0376, + "step": 4433 + }, + { + "epoch": 1.9693537641572285, + "grad_norm": 0.3952404366832551, + "learning_rate": 5.998375149314404e-06, + "loss": 0.0375, + "step": 4434 + }, + { + "epoch": 1.9697979125027758, + "grad_norm": 0.5414636177351856, + "learning_rate": 5.996475575491091e-06, + "loss": 0.036, + "step": 4435 + }, + { + "epoch": 1.9702420608483233, + "grad_norm": 0.35141134241433086, + "learning_rate": 5.994575851880571e-06, + "loss": 0.0299, + "step": 4436 + }, + { + "epoch": 1.9706862091938708, + "grad_norm": 0.5469528061005677, + "learning_rate": 5.992675978768406e-06, + "loss": 0.0312, + "step": 4437 + }, + { + "epoch": 1.9711303575394181, + "grad_norm": 0.4485140591176533, + "learning_rate": 5.99077595644018e-06, + "loss": 0.0397, + "step": 4438 + }, + { + "epoch": 1.9715745058849656, + "grad_norm": 0.4559475041999291, + "learning_rate": 5.988875785181496e-06, + "loss": 0.0399, + "step": 4439 + }, + { + "epoch": 1.9720186542305131, + "grad_norm": 0.5184191786649656, + "learning_rate": 5.986975465277983e-06, + "loss": 0.042, + "step": 4440 + }, + { + "epoch": 1.9724628025760604, + "grad_norm": 0.5018306786878316, + "learning_rate": 5.9850749970152935e-06, + "loss": 0.0433, + "step": 4441 + }, + { + "epoch": 1.9729069509216077, + "grad_norm": 0.41853899287083135, + "learning_rate": 5.983174380679096e-06, + "loss": 0.0311, + "step": 4442 + }, + { + "epoch": 1.9733510992671552, + "grad_norm": 0.3979151331311135, + "learning_rate": 5.98127361655509e-06, + "loss": 0.0393, + "step": 4443 + }, + { + "epoch": 1.9737952476127028, + "grad_norm": 0.4773579735193138, + "learning_rate": 5.979372704928991e-06, + "loss": 0.0404, + "step": 4444 + }, + { + "epoch": 1.97423939595825, + "grad_norm": 0.5742352016540867, + "learning_rate": 5.977471646086535e-06, + "loss": 0.0371, + "step": 4445 + }, + { + "epoch": 1.9746835443037973, + "grad_norm": 0.4462263069511724, + "learning_rate": 5.97557044031349e-06, + "loss": 0.0327, + "step": 4446 + }, + { + "epoch": 1.9751276926493448, + "grad_norm": 0.6732019962659918, + "learning_rate": 5.973669087895633e-06, + "loss": 0.0596, + "step": 4447 + }, + { + "epoch": 1.9755718409948924, + "grad_norm": 0.485770657972991, + "learning_rate": 5.971767589118772e-06, + "loss": 0.0337, + "step": 4448 + }, + { + "epoch": 1.9760159893404396, + "grad_norm": 0.342670431757156, + "learning_rate": 5.969865944268737e-06, + "loss": 0.0303, + "step": 4449 + }, + { + "epoch": 1.9764601376859872, + "grad_norm": 0.4112414603189444, + "learning_rate": 5.9679641536313734e-06, + "loss": 0.038, + "step": 4450 + }, + { + "epoch": 1.9769042860315347, + "grad_norm": 0.6127929425160972, + "learning_rate": 5.9660622174925564e-06, + "loss": 0.0537, + "step": 4451 + }, + { + "epoch": 1.977348434377082, + "grad_norm": 0.33024844150086474, + "learning_rate": 5.964160136138177e-06, + "loss": 0.0353, + "step": 4452 + }, + { + "epoch": 1.9777925827226293, + "grad_norm": 0.5529535095882723, + "learning_rate": 5.96225790985415e-06, + "loss": 0.0442, + "step": 4453 + }, + { + "epoch": 1.9782367310681768, + "grad_norm": 0.4225770450854345, + "learning_rate": 5.960355538926414e-06, + "loss": 0.0494, + "step": 4454 + }, + { + "epoch": 1.9786808794137243, + "grad_norm": 0.5282007627303459, + "learning_rate": 5.958453023640928e-06, + "loss": 0.0416, + "step": 4455 + }, + { + "epoch": 1.9791250277592716, + "grad_norm": 0.3898204372093461, + "learning_rate": 5.956550364283671e-06, + "loss": 0.0373, + "step": 4456 + }, + { + "epoch": 1.9795691761048189, + "grad_norm": 0.6543021886189491, + "learning_rate": 5.954647561140643e-06, + "loss": 0.0406, + "step": 4457 + }, + { + "epoch": 1.9800133244503664, + "grad_norm": 0.7003740987141756, + "learning_rate": 5.952744614497872e-06, + "loss": 0.0437, + "step": 4458 + }, + { + "epoch": 1.9804574727959139, + "grad_norm": 0.4937974164168222, + "learning_rate": 5.9508415246414e-06, + "loss": 0.0539, + "step": 4459 + }, + { + "epoch": 1.9809016211414612, + "grad_norm": 0.46863912061469276, + "learning_rate": 5.948938291857296e-06, + "loss": 0.0503, + "step": 4460 + }, + { + "epoch": 1.9813457694870087, + "grad_norm": 0.5235494039593215, + "learning_rate": 5.947034916431646e-06, + "loss": 0.0462, + "step": 4461 + }, + { + "epoch": 1.9817899178325562, + "grad_norm": 0.4097032398279417, + "learning_rate": 5.945131398650561e-06, + "loss": 0.038, + "step": 4462 + }, + { + "epoch": 1.9822340661781035, + "grad_norm": 0.4528672743260559, + "learning_rate": 5.943227738800172e-06, + "loss": 0.0558, + "step": 4463 + }, + { + "epoch": 1.9826782145236508, + "grad_norm": 0.5204018448947473, + "learning_rate": 5.941323937166632e-06, + "loss": 0.0522, + "step": 4464 + }, + { + "epoch": 1.9831223628691983, + "grad_norm": 0.3540421279343773, + "learning_rate": 5.939419994036113e-06, + "loss": 0.038, + "step": 4465 + }, + { + "epoch": 1.9835665112147458, + "grad_norm": 0.6263363230917521, + "learning_rate": 5.937515909694811e-06, + "loss": 0.0569, + "step": 4466 + }, + { + "epoch": 1.984010659560293, + "grad_norm": 0.4116502316940924, + "learning_rate": 5.9356116844289426e-06, + "loss": 0.0413, + "step": 4467 + }, + { + "epoch": 1.9844548079058404, + "grad_norm": 0.44077017956645004, + "learning_rate": 5.933707318524744e-06, + "loss": 0.0333, + "step": 4468 + }, + { + "epoch": 1.9848989562513881, + "grad_norm": 0.3729944832338479, + "learning_rate": 5.931802812268476e-06, + "loss": 0.0327, + "step": 4469 + }, + { + "epoch": 1.9853431045969354, + "grad_norm": 0.40626287554685864, + "learning_rate": 5.929898165946416e-06, + "loss": 0.0363, + "step": 4470 + }, + { + "epoch": 1.9857872529424827, + "grad_norm": 0.5950032125541065, + "learning_rate": 5.927993379844864e-06, + "loss": 0.0426, + "step": 4471 + }, + { + "epoch": 1.9862314012880302, + "grad_norm": 0.40799770820647496, + "learning_rate": 5.9260884542501455e-06, + "loss": 0.044, + "step": 4472 + }, + { + "epoch": 1.9866755496335777, + "grad_norm": 0.41140378050861237, + "learning_rate": 5.9241833894486e-06, + "loss": 0.0482, + "step": 4473 + }, + { + "epoch": 1.987119697979125, + "grad_norm": 0.40378379008973436, + "learning_rate": 5.922278185726591e-06, + "loss": 0.0327, + "step": 4474 + }, + { + "epoch": 1.9875638463246723, + "grad_norm": 0.41453258443290436, + "learning_rate": 5.920372843370504e-06, + "loss": 0.0416, + "step": 4475 + }, + { + "epoch": 1.9880079946702198, + "grad_norm": 0.5109670847768875, + "learning_rate": 5.9184673626667455e-06, + "loss": 0.039, + "step": 4476 + }, + { + "epoch": 1.9884521430157673, + "grad_norm": 0.5686294228064248, + "learning_rate": 5.9165617439017395e-06, + "loss": 0.0534, + "step": 4477 + }, + { + "epoch": 1.9888962913613146, + "grad_norm": 0.4064180941136422, + "learning_rate": 5.914655987361934e-06, + "loss": 0.0328, + "step": 4478 + }, + { + "epoch": 1.9893404397068621, + "grad_norm": 0.5070558821658653, + "learning_rate": 5.912750093333796e-06, + "loss": 0.0542, + "step": 4479 + }, + { + "epoch": 1.9897845880524097, + "grad_norm": 0.3530913079623509, + "learning_rate": 5.910844062103814e-06, + "loss": 0.0395, + "step": 4480 + }, + { + "epoch": 1.990228736397957, + "grad_norm": 0.43523429839148015, + "learning_rate": 5.908937893958497e-06, + "loss": 0.0366, + "step": 4481 + }, + { + "epoch": 1.9906728847435042, + "grad_norm": 0.4229033433291945, + "learning_rate": 5.907031589184374e-06, + "loss": 0.0383, + "step": 4482 + }, + { + "epoch": 1.9911170330890517, + "grad_norm": 0.39083478145330547, + "learning_rate": 5.905125148067997e-06, + "loss": 0.0371, + "step": 4483 + }, + { + "epoch": 1.9915611814345993, + "grad_norm": 0.5025271192866929, + "learning_rate": 5.9032185708959354e-06, + "loss": 0.0395, + "step": 4484 + }, + { + "epoch": 1.9920053297801465, + "grad_norm": 0.349650340955071, + "learning_rate": 5.901311857954777e-06, + "loss": 0.0266, + "step": 4485 + }, + { + "epoch": 1.9924494781256938, + "grad_norm": 0.5279876104400976, + "learning_rate": 5.899405009531136e-06, + "loss": 0.0402, + "step": 4486 + }, + { + "epoch": 1.9928936264712414, + "grad_norm": 0.6233845443742896, + "learning_rate": 5.897498025911645e-06, + "loss": 0.048, + "step": 4487 + }, + { + "epoch": 1.9933377748167889, + "grad_norm": 0.49026655627892524, + "learning_rate": 5.8955909073829555e-06, + "loss": 0.0412, + "step": 4488 + }, + { + "epoch": 1.9937819231623362, + "grad_norm": 0.5000915892039338, + "learning_rate": 5.893683654231737e-06, + "loss": 0.0394, + "step": 4489 + }, + { + "epoch": 1.9942260715078837, + "grad_norm": 0.5000990386217609, + "learning_rate": 5.891776266744686e-06, + "loss": 0.0355, + "step": 4490 + }, + { + "epoch": 1.9946702198534312, + "grad_norm": 0.39359957737650036, + "learning_rate": 5.889868745208514e-06, + "loss": 0.0304, + "step": 4491 + }, + { + "epoch": 1.9951143681989785, + "grad_norm": 0.9651263441707514, + "learning_rate": 5.8879610899099505e-06, + "loss": 0.0529, + "step": 4492 + }, + { + "epoch": 1.9955585165445258, + "grad_norm": 0.32907807731556993, + "learning_rate": 5.886053301135755e-06, + "loss": 0.0322, + "step": 4493 + }, + { + "epoch": 1.9960026648900733, + "grad_norm": 0.4231846959633054, + "learning_rate": 5.8841453791726944e-06, + "loss": 0.0362, + "step": 4494 + }, + { + "epoch": 1.9964468132356208, + "grad_norm": 0.4958581720016726, + "learning_rate": 5.882237324307564e-06, + "loss": 0.0304, + "step": 4495 + }, + { + "epoch": 1.996890961581168, + "grad_norm": 0.44600019470214586, + "learning_rate": 5.880329136827178e-06, + "loss": 0.0405, + "step": 4496 + }, + { + "epoch": 1.9973351099267154, + "grad_norm": 0.5659602027375302, + "learning_rate": 5.878420817018369e-06, + "loss": 0.0472, + "step": 4497 + }, + { + "epoch": 1.9977792582722629, + "grad_norm": 1.0132374448664145, + "learning_rate": 5.87651236516799e-06, + "loss": 0.0706, + "step": 4498 + }, + { + "epoch": 1.9982234066178104, + "grad_norm": 0.4914853042492375, + "learning_rate": 5.874603781562911e-06, + "loss": 0.0422, + "step": 4499 + }, + { + "epoch": 1.9986675549633577, + "grad_norm": 0.40024702771181153, + "learning_rate": 5.872695066490028e-06, + "loss": 0.0344, + "step": 4500 + }, + { + "epoch": 1.9991117033089052, + "grad_norm": 0.49874140037011355, + "learning_rate": 5.870786220236253e-06, + "loss": 0.0417, + "step": 4501 + }, + { + "epoch": 1.9995558516544527, + "grad_norm": 0.47988916644310126, + "learning_rate": 5.868877243088515e-06, + "loss": 0.0441, + "step": 4502 + }, + { + "epoch": 2.0, + "grad_norm": 0.904943869460001, + "learning_rate": 5.866968135333769e-06, + "loss": 0.0502, + "step": 4503 + }, + { + "epoch": 2.0, + "eval_loss": 0.04465880244970322, + "eval_runtime": 403.5178, + "eval_samples_per_second": 37.584, + "eval_steps_per_second": 1.175, + "step": 4503 + }, + { + "epoch": 2.0004441483455473, + "grad_norm": 0.4375728669928517, + "learning_rate": 5.8650588972589865e-06, + "loss": 0.0263, + "step": 4504 + }, + { + "epoch": 2.000888296691095, + "grad_norm": 0.9215657711078544, + "learning_rate": 5.863149529151154e-06, + "loss": 0.0492, + "step": 4505 + }, + { + "epoch": 2.0013324450366423, + "grad_norm": 0.5271072063506441, + "learning_rate": 5.8612400312972865e-06, + "loss": 0.0508, + "step": 4506 + }, + { + "epoch": 2.0017765933821896, + "grad_norm": 0.48411386914362037, + "learning_rate": 5.859330403984413e-06, + "loss": 0.0371, + "step": 4507 + }, + { + "epoch": 2.002220741727737, + "grad_norm": 0.4017233429265068, + "learning_rate": 5.85742064749958e-06, + "loss": 0.0418, + "step": 4508 + }, + { + "epoch": 2.0026648900732846, + "grad_norm": 0.4120119737365717, + "learning_rate": 5.85551076212986e-06, + "loss": 0.046, + "step": 4509 + }, + { + "epoch": 2.003109038418832, + "grad_norm": 0.4366707287416164, + "learning_rate": 5.8536007481623406e-06, + "loss": 0.0443, + "step": 4510 + }, + { + "epoch": 2.003553186764379, + "grad_norm": 0.7145508695971371, + "learning_rate": 5.851690605884127e-06, + "loss": 0.0532, + "step": 4511 + }, + { + "epoch": 2.0039973351099265, + "grad_norm": 0.5011936156881229, + "learning_rate": 5.84978033558235e-06, + "loss": 0.0354, + "step": 4512 + }, + { + "epoch": 2.0044414834554742, + "grad_norm": 0.36581304940892057, + "learning_rate": 5.847869937544151e-06, + "loss": 0.0294, + "step": 4513 + }, + { + "epoch": 2.0048856318010215, + "grad_norm": 0.37996248718965503, + "learning_rate": 5.845959412056699e-06, + "loss": 0.0286, + "step": 4514 + }, + { + "epoch": 2.005329780146569, + "grad_norm": 0.5097034078222489, + "learning_rate": 5.844048759407177e-06, + "loss": 0.0414, + "step": 4515 + }, + { + "epoch": 2.0057739284921166, + "grad_norm": 0.4963020471009325, + "learning_rate": 5.842137979882786e-06, + "loss": 0.0459, + "step": 4516 + }, + { + "epoch": 2.006218076837664, + "grad_norm": 0.6161319347902735, + "learning_rate": 5.840227073770754e-06, + "loss": 0.0629, + "step": 4517 + }, + { + "epoch": 2.006662225183211, + "grad_norm": 0.3511248203446856, + "learning_rate": 5.838316041358319e-06, + "loss": 0.0295, + "step": 4518 + }, + { + "epoch": 2.0071063735287584, + "grad_norm": 0.37830736549229443, + "learning_rate": 5.836404882932744e-06, + "loss": 0.0319, + "step": 4519 + }, + { + "epoch": 2.007550521874306, + "grad_norm": 0.6333845141434062, + "learning_rate": 5.8344935987813045e-06, + "loss": 0.0379, + "step": 4520 + }, + { + "epoch": 2.0079946702198535, + "grad_norm": 0.5166443794796353, + "learning_rate": 5.832582189191304e-06, + "loss": 0.0346, + "step": 4521 + }, + { + "epoch": 2.0084388185654007, + "grad_norm": 0.4317191740162663, + "learning_rate": 5.8306706544500544e-06, + "loss": 0.0319, + "step": 4522 + }, + { + "epoch": 2.0088829669109485, + "grad_norm": 0.4388602499538576, + "learning_rate": 5.828758994844896e-06, + "loss": 0.0377, + "step": 4523 + }, + { + "epoch": 2.0093271152564958, + "grad_norm": 0.5946947521373457, + "learning_rate": 5.826847210663184e-06, + "loss": 0.0421, + "step": 4524 + }, + { + "epoch": 2.009771263602043, + "grad_norm": 0.3821952135869883, + "learning_rate": 5.8249353021922895e-06, + "loss": 0.0303, + "step": 4525 + }, + { + "epoch": 2.0102154119475903, + "grad_norm": 0.5519322372289749, + "learning_rate": 5.823023269719606e-06, + "loss": 0.0513, + "step": 4526 + }, + { + "epoch": 2.010659560293138, + "grad_norm": 0.6313520947120921, + "learning_rate": 5.821111113532545e-06, + "loss": 0.0484, + "step": 4527 + }, + { + "epoch": 2.0111037086386854, + "grad_norm": 0.5368562306090153, + "learning_rate": 5.819198833918533e-06, + "loss": 0.051, + "step": 4528 + }, + { + "epoch": 2.0115478569842327, + "grad_norm": 0.39624325698963303, + "learning_rate": 5.817286431165024e-06, + "loss": 0.0333, + "step": 4529 + }, + { + "epoch": 2.01199200532978, + "grad_norm": 0.411233856841024, + "learning_rate": 5.815373905559478e-06, + "loss": 0.0312, + "step": 4530 + }, + { + "epoch": 2.0124361536753277, + "grad_norm": 0.3742461123258978, + "learning_rate": 5.813461257389384e-06, + "loss": 0.0268, + "step": 4531 + }, + { + "epoch": 2.012880302020875, + "grad_norm": 0.3749818385865246, + "learning_rate": 5.811548486942246e-06, + "loss": 0.0346, + "step": 4532 + }, + { + "epoch": 2.0133244503664223, + "grad_norm": 0.34116601308623506, + "learning_rate": 5.809635594505585e-06, + "loss": 0.0251, + "step": 4533 + }, + { + "epoch": 2.01376859871197, + "grad_norm": 0.5178521236398543, + "learning_rate": 5.807722580366939e-06, + "loss": 0.0381, + "step": 4534 + }, + { + "epoch": 2.0142127470575173, + "grad_norm": 0.32707032102652794, + "learning_rate": 5.805809444813869e-06, + "loss": 0.0275, + "step": 4535 + }, + { + "epoch": 2.0146568954030646, + "grad_norm": 0.4024749636693741, + "learning_rate": 5.80389618813395e-06, + "loss": 0.0365, + "step": 4536 + }, + { + "epoch": 2.015101043748612, + "grad_norm": 0.3506397945464826, + "learning_rate": 5.8019828106147805e-06, + "loss": 0.0252, + "step": 4537 + }, + { + "epoch": 2.0155451920941596, + "grad_norm": 0.5822851446048461, + "learning_rate": 5.80006931254397e-06, + "loss": 0.0369, + "step": 4538 + }, + { + "epoch": 2.015989340439707, + "grad_norm": 0.44067833638988113, + "learning_rate": 5.798155694209151e-06, + "loss": 0.0344, + "step": 4539 + }, + { + "epoch": 2.016433488785254, + "grad_norm": 0.4460004586157456, + "learning_rate": 5.796241955897972e-06, + "loss": 0.0381, + "step": 4540 + }, + { + "epoch": 2.0168776371308015, + "grad_norm": 0.3328661208832795, + "learning_rate": 5.7943280978981034e-06, + "loss": 0.0284, + "step": 4541 + }, + { + "epoch": 2.017321785476349, + "grad_norm": 0.48357435513498903, + "learning_rate": 5.792414120497227e-06, + "loss": 0.0361, + "step": 4542 + }, + { + "epoch": 2.0177659338218965, + "grad_norm": 0.4866683235779641, + "learning_rate": 5.790500023983049e-06, + "loss": 0.0359, + "step": 4543 + }, + { + "epoch": 2.018210082167444, + "grad_norm": 0.39359993621118305, + "learning_rate": 5.788585808643287e-06, + "loss": 0.0281, + "step": 4544 + }, + { + "epoch": 2.0186542305129915, + "grad_norm": 0.4058975435223384, + "learning_rate": 5.786671474765683e-06, + "loss": 0.0314, + "step": 4545 + }, + { + "epoch": 2.019098378858539, + "grad_norm": 0.38121689511830104, + "learning_rate": 5.784757022637993e-06, + "loss": 0.0291, + "step": 4546 + }, + { + "epoch": 2.019542527204086, + "grad_norm": 0.4508375438456433, + "learning_rate": 5.782842452547992e-06, + "loss": 0.0334, + "step": 4547 + }, + { + "epoch": 2.0199866755496334, + "grad_norm": 0.42673347363830266, + "learning_rate": 5.780927764783473e-06, + "loss": 0.0249, + "step": 4548 + }, + { + "epoch": 2.020430823895181, + "grad_norm": 0.43669165432209434, + "learning_rate": 5.779012959632244e-06, + "loss": 0.0358, + "step": 4549 + }, + { + "epoch": 2.0208749722407284, + "grad_norm": 0.4022820994220209, + "learning_rate": 5.777098037382135e-06, + "loss": 0.0293, + "step": 4550 + }, + { + "epoch": 2.0213191205862757, + "grad_norm": 0.42387266543810437, + "learning_rate": 5.77518299832099e-06, + "loss": 0.0384, + "step": 4551 + }, + { + "epoch": 2.021763268931823, + "grad_norm": 0.4585217445539665, + "learning_rate": 5.7732678427366725e-06, + "loss": 0.0349, + "step": 4552 + }, + { + "epoch": 2.0222074172773707, + "grad_norm": 0.44358027335422784, + "learning_rate": 5.771352570917062e-06, + "loss": 0.0332, + "step": 4553 + }, + { + "epoch": 2.022651565622918, + "grad_norm": 0.43507443913798416, + "learning_rate": 5.769437183150057e-06, + "loss": 0.0347, + "step": 4554 + }, + { + "epoch": 2.0230957139684653, + "grad_norm": 0.37205140444853446, + "learning_rate": 5.767521679723574e-06, + "loss": 0.029, + "step": 4555 + }, + { + "epoch": 2.023539862314013, + "grad_norm": 0.3603917915607534, + "learning_rate": 5.765606060925545e-06, + "loss": 0.0309, + "step": 4556 + }, + { + "epoch": 2.0239840106595604, + "grad_norm": 0.5400074623803027, + "learning_rate": 5.763690327043919e-06, + "loss": 0.0459, + "step": 4557 + }, + { + "epoch": 2.0244281590051076, + "grad_norm": 0.39760305609807395, + "learning_rate": 5.761774478366664e-06, + "loss": 0.0338, + "step": 4558 + }, + { + "epoch": 2.024872307350655, + "grad_norm": 0.44010489846206996, + "learning_rate": 5.759858515181763e-06, + "loss": 0.0305, + "step": 4559 + }, + { + "epoch": 2.0253164556962027, + "grad_norm": 0.38439690133076265, + "learning_rate": 5.757942437777222e-06, + "loss": 0.0281, + "step": 4560 + }, + { + "epoch": 2.02576060404175, + "grad_norm": 0.36756874447130394, + "learning_rate": 5.756026246441056e-06, + "loss": 0.0276, + "step": 4561 + }, + { + "epoch": 2.0262047523872972, + "grad_norm": 0.3543127710037651, + "learning_rate": 5.754109941461302e-06, + "loss": 0.0304, + "step": 4562 + }, + { + "epoch": 2.026648900732845, + "grad_norm": 0.4789471062184959, + "learning_rate": 5.7521935231260166e-06, + "loss": 0.0361, + "step": 4563 + }, + { + "epoch": 2.0270930490783923, + "grad_norm": 0.4658261130750723, + "learning_rate": 5.7502769917232635e-06, + "loss": 0.0297, + "step": 4564 + }, + { + "epoch": 2.0275371974239396, + "grad_norm": 0.37364282464180315, + "learning_rate": 5.748360347541136e-06, + "loss": 0.0301, + "step": 4565 + }, + { + "epoch": 2.027981345769487, + "grad_norm": 0.5624254377961666, + "learning_rate": 5.746443590867735e-06, + "loss": 0.0375, + "step": 4566 + }, + { + "epoch": 2.0284254941150346, + "grad_norm": 0.4305222164333765, + "learning_rate": 5.7445267219911815e-06, + "loss": 0.033, + "step": 4567 + }, + { + "epoch": 2.028869642460582, + "grad_norm": 0.669501618559028, + "learning_rate": 5.742609741199615e-06, + "loss": 0.0497, + "step": 4568 + }, + { + "epoch": 2.029313790806129, + "grad_norm": 0.38137453136293886, + "learning_rate": 5.740692648781191e-06, + "loss": 0.0377, + "step": 4569 + }, + { + "epoch": 2.0297579391516765, + "grad_norm": 0.4610447553595522, + "learning_rate": 5.738775445024078e-06, + "loss": 0.0394, + "step": 4570 + }, + { + "epoch": 2.030202087497224, + "grad_norm": 0.46477609561023486, + "learning_rate": 5.736858130216468e-06, + "loss": 0.044, + "step": 4571 + }, + { + "epoch": 2.0306462358427715, + "grad_norm": 0.34915412378821464, + "learning_rate": 5.7349407046465625e-06, + "loss": 0.0234, + "step": 4572 + }, + { + "epoch": 2.0310903841883188, + "grad_norm": 0.47023014530194523, + "learning_rate": 5.733023168602584e-06, + "loss": 0.0427, + "step": 4573 + }, + { + "epoch": 2.0315345325338665, + "grad_norm": 0.5052588063022365, + "learning_rate": 5.731105522372773e-06, + "loss": 0.0406, + "step": 4574 + }, + { + "epoch": 2.031978680879414, + "grad_norm": 0.5345856053201472, + "learning_rate": 5.729187766245382e-06, + "loss": 0.0347, + "step": 4575 + }, + { + "epoch": 2.032422829224961, + "grad_norm": 0.41581297859158317, + "learning_rate": 5.727269900508682e-06, + "loss": 0.0366, + "step": 4576 + }, + { + "epoch": 2.0328669775705084, + "grad_norm": 0.5119888393853211, + "learning_rate": 5.725351925450964e-06, + "loss": 0.0372, + "step": 4577 + }, + { + "epoch": 2.033311125916056, + "grad_norm": 0.4247836793476431, + "learning_rate": 5.723433841360528e-06, + "loss": 0.0259, + "step": 4578 + }, + { + "epoch": 2.0337552742616034, + "grad_norm": 0.3959142952189294, + "learning_rate": 5.721515648525698e-06, + "loss": 0.0358, + "step": 4579 + }, + { + "epoch": 2.0341994226071507, + "grad_norm": 0.4826767145510797, + "learning_rate": 5.719597347234809e-06, + "loss": 0.0404, + "step": 4580 + }, + { + "epoch": 2.034643570952698, + "grad_norm": 0.4091839747164647, + "learning_rate": 5.7176789377762155e-06, + "loss": 0.0421, + "step": 4581 + }, + { + "epoch": 2.0350877192982457, + "grad_norm": 0.41330812118932564, + "learning_rate": 5.715760420438284e-06, + "loss": 0.025, + "step": 4582 + }, + { + "epoch": 2.035531867643793, + "grad_norm": 0.3786001612535725, + "learning_rate": 5.713841795509405e-06, + "loss": 0.0304, + "step": 4583 + }, + { + "epoch": 2.0359760159893403, + "grad_norm": 0.38932682295296067, + "learning_rate": 5.711923063277979e-06, + "loss": 0.033, + "step": 4584 + }, + { + "epoch": 2.036420164334888, + "grad_norm": 0.3925654682001415, + "learning_rate": 5.710004224032421e-06, + "loss": 0.035, + "step": 4585 + }, + { + "epoch": 2.0368643126804353, + "grad_norm": 0.40837083418560577, + "learning_rate": 5.708085278061167e-06, + "loss": 0.0232, + "step": 4586 + }, + { + "epoch": 2.0373084610259826, + "grad_norm": 0.4396699075372347, + "learning_rate": 5.706166225652669e-06, + "loss": 0.0404, + "step": 4587 + }, + { + "epoch": 2.03775260937153, + "grad_norm": 0.486841383129979, + "learning_rate": 5.704247067095391e-06, + "loss": 0.0325, + "step": 4588 + }, + { + "epoch": 2.0381967577170776, + "grad_norm": 0.4503221450108852, + "learning_rate": 5.702327802677815e-06, + "loss": 0.0346, + "step": 4589 + }, + { + "epoch": 2.038640906062625, + "grad_norm": 0.4565221863051935, + "learning_rate": 5.70040843268844e-06, + "loss": 0.0404, + "step": 4590 + }, + { + "epoch": 2.0390850544081722, + "grad_norm": 0.39766027067422877, + "learning_rate": 5.698488957415782e-06, + "loss": 0.0325, + "step": 4591 + }, + { + "epoch": 2.03952920275372, + "grad_norm": 0.469121812114123, + "learning_rate": 5.6965693771483654e-06, + "loss": 0.0361, + "step": 4592 + }, + { + "epoch": 2.0399733510992673, + "grad_norm": 0.34294205911608633, + "learning_rate": 5.6946496921747394e-06, + "loss": 0.0274, + "step": 4593 + }, + { + "epoch": 2.0404174994448145, + "grad_norm": 0.45197263089243245, + "learning_rate": 5.692729902783467e-06, + "loss": 0.0357, + "step": 4594 + }, + { + "epoch": 2.040861647790362, + "grad_norm": 0.4143752687268431, + "learning_rate": 5.6908100092631215e-06, + "loss": 0.0325, + "step": 4595 + }, + { + "epoch": 2.0413057961359096, + "grad_norm": 0.5126979012787386, + "learning_rate": 5.688890011902295e-06, + "loss": 0.0402, + "step": 4596 + }, + { + "epoch": 2.041749944481457, + "grad_norm": 0.38129445526321254, + "learning_rate": 5.686969910989599e-06, + "loss": 0.0247, + "step": 4597 + }, + { + "epoch": 2.042194092827004, + "grad_norm": 0.43439418379805417, + "learning_rate": 5.685049706813657e-06, + "loss": 0.0334, + "step": 4598 + }, + { + "epoch": 2.0426382411725514, + "grad_norm": 0.3528242448836029, + "learning_rate": 5.683129399663105e-06, + "loss": 0.0237, + "step": 4599 + }, + { + "epoch": 2.043082389518099, + "grad_norm": 0.4979008424907728, + "learning_rate": 5.681208989826601e-06, + "loss": 0.0346, + "step": 4600 + }, + { + "epoch": 2.0435265378636465, + "grad_norm": 0.4344255344249799, + "learning_rate": 5.679288477592815e-06, + "loss": 0.0288, + "step": 4601 + }, + { + "epoch": 2.0439706862091938, + "grad_norm": 0.5875305767400001, + "learning_rate": 5.67736786325043e-06, + "loss": 0.0336, + "step": 4602 + }, + { + "epoch": 2.0444148345547415, + "grad_norm": 0.6007666135225987, + "learning_rate": 5.675447147088148e-06, + "loss": 0.0337, + "step": 4603 + }, + { + "epoch": 2.044858982900289, + "grad_norm": 0.3501303227446166, + "learning_rate": 5.673526329394688e-06, + "loss": 0.0316, + "step": 4604 + }, + { + "epoch": 2.045303131245836, + "grad_norm": 0.5466639051174087, + "learning_rate": 5.6716054104587784e-06, + "loss": 0.0513, + "step": 4605 + }, + { + "epoch": 2.0457472795913834, + "grad_norm": 0.3453198669378243, + "learning_rate": 5.669684390569167e-06, + "loss": 0.0292, + "step": 4606 + }, + { + "epoch": 2.046191427936931, + "grad_norm": 0.3855523349827862, + "learning_rate": 5.667763270014616e-06, + "loss": 0.0274, + "step": 4607 + }, + { + "epoch": 2.0466355762824784, + "grad_norm": 0.34309904151982246, + "learning_rate": 5.665842049083902e-06, + "loss": 0.0264, + "step": 4608 + }, + { + "epoch": 2.0470797246280257, + "grad_norm": 0.7952938969316026, + "learning_rate": 5.6639207280658194e-06, + "loss": 0.0534, + "step": 4609 + }, + { + "epoch": 2.047523872973573, + "grad_norm": 0.3602547692842435, + "learning_rate": 5.6619993072491694e-06, + "loss": 0.027, + "step": 4610 + }, + { + "epoch": 2.0479680213191207, + "grad_norm": 0.33009638203411035, + "learning_rate": 5.6600777869227805e-06, + "loss": 0.0267, + "step": 4611 + }, + { + "epoch": 2.048412169664668, + "grad_norm": 0.43117489579425794, + "learning_rate": 5.658156167375488e-06, + "loss": 0.038, + "step": 4612 + }, + { + "epoch": 2.0488563180102153, + "grad_norm": 0.7172674302994839, + "learning_rate": 5.656234448896142e-06, + "loss": 0.0461, + "step": 4613 + }, + { + "epoch": 2.049300466355763, + "grad_norm": 0.4507173154420471, + "learning_rate": 5.654312631773612e-06, + "loss": 0.027, + "step": 4614 + }, + { + "epoch": 2.0497446147013103, + "grad_norm": 0.6398319387217463, + "learning_rate": 5.652390716296778e-06, + "loss": 0.041, + "step": 4615 + }, + { + "epoch": 2.0501887630468576, + "grad_norm": 0.43458565414603195, + "learning_rate": 5.650468702754537e-06, + "loss": 0.0475, + "step": 4616 + }, + { + "epoch": 2.050632911392405, + "grad_norm": 0.4724564626628951, + "learning_rate": 5.6485465914358005e-06, + "loss": 0.039, + "step": 4617 + }, + { + "epoch": 2.0510770597379526, + "grad_norm": 0.4813725140216776, + "learning_rate": 5.646624382629495e-06, + "loss": 0.0442, + "step": 4618 + }, + { + "epoch": 2.0515212080835, + "grad_norm": 0.422057899941897, + "learning_rate": 5.64470207662456e-06, + "loss": 0.0354, + "step": 4619 + }, + { + "epoch": 2.051965356429047, + "grad_norm": 0.5520244840521104, + "learning_rate": 5.6427796737099515e-06, + "loss": 0.0393, + "step": 4620 + }, + { + "epoch": 2.0524095047745945, + "grad_norm": 0.5717238340997483, + "learning_rate": 5.64085717417464e-06, + "loss": 0.041, + "step": 4621 + }, + { + "epoch": 2.0528536531201422, + "grad_norm": 0.4162602609657127, + "learning_rate": 5.638934578307608e-06, + "loss": 0.0233, + "step": 4622 + }, + { + "epoch": 2.0532978014656895, + "grad_norm": 0.40689474682922516, + "learning_rate": 5.637011886397854e-06, + "loss": 0.0479, + "step": 4623 + }, + { + "epoch": 2.053741949811237, + "grad_norm": 0.46291048789069594, + "learning_rate": 5.635089098734394e-06, + "loss": 0.0345, + "step": 4624 + }, + { + "epoch": 2.0541860981567845, + "grad_norm": 0.6061112924495777, + "learning_rate": 5.633166215606254e-06, + "loss": 0.0519, + "step": 4625 + }, + { + "epoch": 2.054630246502332, + "grad_norm": 0.37385156175737194, + "learning_rate": 5.631243237302478e-06, + "loss": 0.0432, + "step": 4626 + }, + { + "epoch": 2.055074394847879, + "grad_norm": 0.32050349885999035, + "learning_rate": 5.629320164112116e-06, + "loss": 0.031, + "step": 4627 + }, + { + "epoch": 2.0555185431934264, + "grad_norm": 0.44064173532980155, + "learning_rate": 5.627396996324247e-06, + "loss": 0.0255, + "step": 4628 + }, + { + "epoch": 2.055962691538974, + "grad_norm": 0.35873848998988794, + "learning_rate": 5.625473734227952e-06, + "loss": 0.0352, + "step": 4629 + }, + { + "epoch": 2.0564068398845214, + "grad_norm": 0.6158412215543486, + "learning_rate": 5.623550378112328e-06, + "loss": 0.0442, + "step": 4630 + }, + { + "epoch": 2.0568509882300687, + "grad_norm": 0.4813854059047201, + "learning_rate": 5.621626928266489e-06, + "loss": 0.0478, + "step": 4631 + }, + { + "epoch": 2.0572951365756165, + "grad_norm": 0.4790652853444873, + "learning_rate": 5.619703384979566e-06, + "loss": 0.0463, + "step": 4632 + }, + { + "epoch": 2.0577392849211638, + "grad_norm": 0.41738179511720797, + "learning_rate": 5.617779748540695e-06, + "loss": 0.0338, + "step": 4633 + }, + { + "epoch": 2.058183433266711, + "grad_norm": 0.41477600567937595, + "learning_rate": 5.615856019239034e-06, + "loss": 0.0361, + "step": 4634 + }, + { + "epoch": 2.0586275816122583, + "grad_norm": 0.4967975613891594, + "learning_rate": 5.613932197363753e-06, + "loss": 0.0333, + "step": 4635 + }, + { + "epoch": 2.059071729957806, + "grad_norm": 0.49458901207002304, + "learning_rate": 5.612008283204033e-06, + "loss": 0.0427, + "step": 4636 + }, + { + "epoch": 2.0595158783033534, + "grad_norm": 0.42427512577649273, + "learning_rate": 5.610084277049071e-06, + "loss": 0.0354, + "step": 4637 + }, + { + "epoch": 2.0599600266489007, + "grad_norm": 0.43245137715403886, + "learning_rate": 5.608160179188079e-06, + "loss": 0.0326, + "step": 4638 + }, + { + "epoch": 2.060404174994448, + "grad_norm": 0.4874634691089771, + "learning_rate": 5.6062359899102815e-06, + "loss": 0.0371, + "step": 4639 + }, + { + "epoch": 2.0608483233399957, + "grad_norm": 0.5004987254382077, + "learning_rate": 5.604311709504917e-06, + "loss": 0.0397, + "step": 4640 + }, + { + "epoch": 2.061292471685543, + "grad_norm": 0.4751004755592279, + "learning_rate": 5.602387338261236e-06, + "loss": 0.0414, + "step": 4641 + }, + { + "epoch": 2.0617366200310903, + "grad_norm": 0.5299506551694105, + "learning_rate": 5.600462876468506e-06, + "loss": 0.0368, + "step": 4642 + }, + { + "epoch": 2.062180768376638, + "grad_norm": 0.5312690498368814, + "learning_rate": 5.598538324416007e-06, + "loss": 0.0449, + "step": 4643 + }, + { + "epoch": 2.0626249167221853, + "grad_norm": 0.38593169971896735, + "learning_rate": 5.5966136823930286e-06, + "loss": 0.0281, + "step": 4644 + }, + { + "epoch": 2.0630690650677326, + "grad_norm": 0.3958642054782115, + "learning_rate": 5.594688950688879e-06, + "loss": 0.034, + "step": 4645 + }, + { + "epoch": 2.06351321341328, + "grad_norm": 0.40570614021956714, + "learning_rate": 5.592764129592879e-06, + "loss": 0.0323, + "step": 4646 + }, + { + "epoch": 2.0639573617588276, + "grad_norm": 0.45390531771906856, + "learning_rate": 5.590839219394361e-06, + "loss": 0.0254, + "step": 4647 + }, + { + "epoch": 2.064401510104375, + "grad_norm": 0.39108513738732803, + "learning_rate": 5.58891422038267e-06, + "loss": 0.0278, + "step": 4648 + }, + { + "epoch": 2.064845658449922, + "grad_norm": 0.5223612545370472, + "learning_rate": 5.58698913284717e-06, + "loss": 0.0367, + "step": 4649 + }, + { + "epoch": 2.0652898067954695, + "grad_norm": 0.48567914535043527, + "learning_rate": 5.585063957077231e-06, + "loss": 0.0303, + "step": 4650 + }, + { + "epoch": 2.065733955141017, + "grad_norm": 0.3022843285079369, + "learning_rate": 5.583138693362241e-06, + "loss": 0.0268, + "step": 4651 + }, + { + "epoch": 2.0661781034865645, + "grad_norm": 0.3626425665807493, + "learning_rate": 5.5812133419916e-06, + "loss": 0.0282, + "step": 4652 + }, + { + "epoch": 2.066622251832112, + "grad_norm": 0.4001866579471354, + "learning_rate": 5.5792879032547205e-06, + "loss": 0.0346, + "step": 4653 + }, + { + "epoch": 2.0670664001776595, + "grad_norm": 0.4654295875105734, + "learning_rate": 5.577362377441029e-06, + "loss": 0.0348, + "step": 4654 + }, + { + "epoch": 2.067510548523207, + "grad_norm": 0.39677357569841354, + "learning_rate": 5.5754367648399644e-06, + "loss": 0.0274, + "step": 4655 + }, + { + "epoch": 2.067954696868754, + "grad_norm": 0.4143564936380847, + "learning_rate": 5.5735110657409775e-06, + "loss": 0.0427, + "step": 4656 + }, + { + "epoch": 2.0683988452143014, + "grad_norm": 0.3681145367593844, + "learning_rate": 5.571585280433537e-06, + "loss": 0.0321, + "step": 4657 + }, + { + "epoch": 2.068842993559849, + "grad_norm": 0.46007574827334824, + "learning_rate": 5.569659409207119e-06, + "loss": 0.0324, + "step": 4658 + }, + { + "epoch": 2.0692871419053964, + "grad_norm": 0.4377445107919845, + "learning_rate": 5.567733452351214e-06, + "loss": 0.0364, + "step": 4659 + }, + { + "epoch": 2.0697312902509437, + "grad_norm": 0.5314428707092943, + "learning_rate": 5.565807410155329e-06, + "loss": 0.039, + "step": 4660 + }, + { + "epoch": 2.0701754385964914, + "grad_norm": 0.4445997114256765, + "learning_rate": 5.563881282908976e-06, + "loss": 0.0451, + "step": 4661 + }, + { + "epoch": 2.0706195869420387, + "grad_norm": 0.41675191455829885, + "learning_rate": 5.561955070901689e-06, + "loss": 0.032, + "step": 4662 + }, + { + "epoch": 2.071063735287586, + "grad_norm": 0.49984516013690966, + "learning_rate": 5.56002877442301e-06, + "loss": 0.0379, + "step": 4663 + }, + { + "epoch": 2.0715078836331333, + "grad_norm": 0.48011634971976885, + "learning_rate": 5.558102393762491e-06, + "loss": 0.0289, + "step": 4664 + }, + { + "epoch": 2.071952031978681, + "grad_norm": 0.44249526000612904, + "learning_rate": 5.556175929209703e-06, + "loss": 0.0254, + "step": 4665 + }, + { + "epoch": 2.0723961803242283, + "grad_norm": 0.40597832272770745, + "learning_rate": 5.554249381054224e-06, + "loss": 0.0269, + "step": 4666 + }, + { + "epoch": 2.0728403286697756, + "grad_norm": 0.45148588795717426, + "learning_rate": 5.552322749585649e-06, + "loss": 0.0388, + "step": 4667 + }, + { + "epoch": 2.073284477015323, + "grad_norm": 0.598994856507402, + "learning_rate": 5.550396035093582e-06, + "loss": 0.0423, + "step": 4668 + }, + { + "epoch": 2.0737286253608707, + "grad_norm": 0.5181098142664673, + "learning_rate": 5.548469237867642e-06, + "loss": 0.0433, + "step": 4669 + }, + { + "epoch": 2.074172773706418, + "grad_norm": 0.5101400816921863, + "learning_rate": 5.546542358197458e-06, + "loss": 0.0463, + "step": 4670 + }, + { + "epoch": 2.0746169220519652, + "grad_norm": 0.5101937458580871, + "learning_rate": 5.544615396372673e-06, + "loss": 0.0341, + "step": 4671 + }, + { + "epoch": 2.075061070397513, + "grad_norm": 0.5392434065378148, + "learning_rate": 5.542688352682944e-06, + "loss": 0.0451, + "step": 4672 + }, + { + "epoch": 2.0755052187430603, + "grad_norm": 0.6897926097007651, + "learning_rate": 5.540761227417934e-06, + "loss": 0.0355, + "step": 4673 + }, + { + "epoch": 2.0759493670886076, + "grad_norm": 0.28051948597694654, + "learning_rate": 5.53883402086733e-06, + "loss": 0.0243, + "step": 4674 + }, + { + "epoch": 2.076393515434155, + "grad_norm": 0.793052180191732, + "learning_rate": 5.536906733320816e-06, + "loss": 0.0563, + "step": 4675 + }, + { + "epoch": 2.0768376637797026, + "grad_norm": 0.4215318597715739, + "learning_rate": 5.5349793650681006e-06, + "loss": 0.0354, + "step": 4676 + }, + { + "epoch": 2.07728181212525, + "grad_norm": 0.34425051368314813, + "learning_rate": 5.533051916398899e-06, + "loss": 0.0295, + "step": 4677 + }, + { + "epoch": 2.077725960470797, + "grad_norm": 0.6685404197751216, + "learning_rate": 5.531124387602938e-06, + "loss": 0.0399, + "step": 4678 + }, + { + "epoch": 2.0781701088163445, + "grad_norm": 0.31141908709280325, + "learning_rate": 5.529196778969961e-06, + "loss": 0.0271, + "step": 4679 + }, + { + "epoch": 2.078614257161892, + "grad_norm": 0.4606806580907262, + "learning_rate": 5.527269090789718e-06, + "loss": 0.0394, + "step": 4680 + }, + { + "epoch": 2.0790584055074395, + "grad_norm": 0.4252999601959849, + "learning_rate": 5.525341323351975e-06, + "loss": 0.0394, + "step": 4681 + }, + { + "epoch": 2.0795025538529868, + "grad_norm": 0.5409739246288477, + "learning_rate": 5.5234134769465065e-06, + "loss": 0.0521, + "step": 4682 + }, + { + "epoch": 2.0799467021985345, + "grad_norm": 0.4948477331543779, + "learning_rate": 5.5214855518631005e-06, + "loss": 0.0485, + "step": 4683 + }, + { + "epoch": 2.080390850544082, + "grad_norm": 0.3543340984392336, + "learning_rate": 5.519557548391557e-06, + "loss": 0.0242, + "step": 4684 + }, + { + "epoch": 2.080834998889629, + "grad_norm": 0.3751926442365401, + "learning_rate": 5.517629466821691e-06, + "loss": 0.036, + "step": 4685 + }, + { + "epoch": 2.0812791472351764, + "grad_norm": 0.4117924266399266, + "learning_rate": 5.515701307443321e-06, + "loss": 0.0485, + "step": 4686 + }, + { + "epoch": 2.081723295580724, + "grad_norm": 0.47251260035381365, + "learning_rate": 5.513773070546284e-06, + "loss": 0.0419, + "step": 4687 + }, + { + "epoch": 2.0821674439262714, + "grad_norm": 0.6076861002224307, + "learning_rate": 5.5118447564204295e-06, + "loss": 0.0391, + "step": 4688 + }, + { + "epoch": 2.0826115922718187, + "grad_norm": 0.4792646073001442, + "learning_rate": 5.50991636535561e-06, + "loss": 0.0295, + "step": 4689 + }, + { + "epoch": 2.0830557406173664, + "grad_norm": 0.32440405189073857, + "learning_rate": 5.5079878976417e-06, + "loss": 0.022, + "step": 4690 + }, + { + "epoch": 2.0834998889629137, + "grad_norm": 0.4221580236556979, + "learning_rate": 5.506059353568581e-06, + "loss": 0.0263, + "step": 4691 + }, + { + "epoch": 2.083944037308461, + "grad_norm": 0.35561395384075745, + "learning_rate": 5.504130733426145e-06, + "loss": 0.0273, + "step": 4692 + }, + { + "epoch": 2.0843881856540083, + "grad_norm": 0.4935918433503542, + "learning_rate": 5.502202037504293e-06, + "loss": 0.0328, + "step": 4693 + }, + { + "epoch": 2.084832333999556, + "grad_norm": 0.439030655630297, + "learning_rate": 5.500273266092947e-06, + "loss": 0.0278, + "step": 4694 + }, + { + "epoch": 2.0852764823451033, + "grad_norm": 0.5492104667996629, + "learning_rate": 5.49834441948203e-06, + "loss": 0.0477, + "step": 4695 + }, + { + "epoch": 2.0857206306906506, + "grad_norm": 0.48997887857295236, + "learning_rate": 5.496415497961482e-06, + "loss": 0.036, + "step": 4696 + }, + { + "epoch": 2.086164779036198, + "grad_norm": 0.9251501688761363, + "learning_rate": 5.49448650182125e-06, + "loss": 0.0543, + "step": 4697 + }, + { + "epoch": 2.0866089273817456, + "grad_norm": 0.44096687951153934, + "learning_rate": 5.492557431351298e-06, + "loss": 0.0372, + "step": 4698 + }, + { + "epoch": 2.087053075727293, + "grad_norm": 0.48261463177180974, + "learning_rate": 5.4906282868415974e-06, + "loss": 0.0323, + "step": 4699 + }, + { + "epoch": 2.08749722407284, + "grad_norm": 0.44612947746443893, + "learning_rate": 5.488699068582129e-06, + "loss": 0.0379, + "step": 4700 + }, + { + "epoch": 2.087941372418388, + "grad_norm": 0.6019236481517227, + "learning_rate": 5.486769776862891e-06, + "loss": 0.0392, + "step": 4701 + }, + { + "epoch": 2.0883855207639352, + "grad_norm": 0.41037964449549674, + "learning_rate": 5.484840411973888e-06, + "loss": 0.045, + "step": 4702 + }, + { + "epoch": 2.0888296691094825, + "grad_norm": 0.6139112370737255, + "learning_rate": 5.482910974205133e-06, + "loss": 0.0434, + "step": 4703 + }, + { + "epoch": 2.08927381745503, + "grad_norm": 0.47897436644713304, + "learning_rate": 5.480981463846655e-06, + "loss": 0.0407, + "step": 4704 + }, + { + "epoch": 2.0897179658005776, + "grad_norm": 0.4257961843862624, + "learning_rate": 5.479051881188494e-06, + "loss": 0.0402, + "step": 4705 + }, + { + "epoch": 2.090162114146125, + "grad_norm": 0.4458277075765739, + "learning_rate": 5.477122226520698e-06, + "loss": 0.0335, + "step": 4706 + }, + { + "epoch": 2.090606262491672, + "grad_norm": 0.5283109987423077, + "learning_rate": 5.475192500133324e-06, + "loss": 0.0321, + "step": 4707 + }, + { + "epoch": 2.0910504108372194, + "grad_norm": 0.6005581697681983, + "learning_rate": 5.473262702316447e-06, + "loss": 0.0369, + "step": 4708 + }, + { + "epoch": 2.091494559182767, + "grad_norm": 0.40161109082786944, + "learning_rate": 5.471332833360147e-06, + "loss": 0.0289, + "step": 4709 + }, + { + "epoch": 2.0919387075283145, + "grad_norm": 0.4371088945729004, + "learning_rate": 5.4694028935545126e-06, + "loss": 0.0343, + "step": 4710 + }, + { + "epoch": 2.0923828558738617, + "grad_norm": 1.1703243713580254, + "learning_rate": 5.467472883189653e-06, + "loss": 0.0507, + "step": 4711 + }, + { + "epoch": 2.0928270042194095, + "grad_norm": 0.40364633007175293, + "learning_rate": 5.465542802555677e-06, + "loss": 0.0295, + "step": 4712 + }, + { + "epoch": 2.0932711525649568, + "grad_norm": 0.46666883180291846, + "learning_rate": 5.4636126519427095e-06, + "loss": 0.0412, + "step": 4713 + }, + { + "epoch": 2.093715300910504, + "grad_norm": 0.4145958391325037, + "learning_rate": 5.461682431640885e-06, + "loss": 0.0319, + "step": 4714 + }, + { + "epoch": 2.0941594492560514, + "grad_norm": 0.3217656626768428, + "learning_rate": 5.459752141940347e-06, + "loss": 0.0318, + "step": 4715 + }, + { + "epoch": 2.094603597601599, + "grad_norm": 0.5490857014126957, + "learning_rate": 5.457821783131254e-06, + "loss": 0.046, + "step": 4716 + }, + { + "epoch": 2.0950477459471464, + "grad_norm": 0.4425046183099989, + "learning_rate": 5.455891355503768e-06, + "loss": 0.0299, + "step": 4717 + }, + { + "epoch": 2.0954918942926937, + "grad_norm": 0.6349791543930909, + "learning_rate": 5.453960859348069e-06, + "loss": 0.0565, + "step": 4718 + }, + { + "epoch": 2.095936042638241, + "grad_norm": 0.4072328187100098, + "learning_rate": 5.4520302949543415e-06, + "loss": 0.0349, + "step": 4719 + }, + { + "epoch": 2.0963801909837887, + "grad_norm": 0.3811118913667539, + "learning_rate": 5.450099662612781e-06, + "loss": 0.0305, + "step": 4720 + }, + { + "epoch": 2.096824339329336, + "grad_norm": 0.4774255455908721, + "learning_rate": 5.448168962613596e-06, + "loss": 0.0368, + "step": 4721 + }, + { + "epoch": 2.0972684876748833, + "grad_norm": 0.32813215206793156, + "learning_rate": 5.446238195247003e-06, + "loss": 0.0321, + "step": 4722 + }, + { + "epoch": 2.097712636020431, + "grad_norm": 0.5512782021205551, + "learning_rate": 5.44430736080323e-06, + "loss": 0.0387, + "step": 4723 + }, + { + "epoch": 2.0981567843659783, + "grad_norm": 0.5835802051010996, + "learning_rate": 5.44237645957251e-06, + "loss": 0.0354, + "step": 4724 + }, + { + "epoch": 2.0986009327115256, + "grad_norm": 0.3806252215797266, + "learning_rate": 5.440445491845095e-06, + "loss": 0.0363, + "step": 4725 + }, + { + "epoch": 2.099045081057073, + "grad_norm": 0.42251102762378473, + "learning_rate": 5.438514457911241e-06, + "loss": 0.0386, + "step": 4726 + }, + { + "epoch": 2.0994892294026206, + "grad_norm": 0.35748316520246215, + "learning_rate": 5.436583358061215e-06, + "loss": 0.0282, + "step": 4727 + }, + { + "epoch": 2.099933377748168, + "grad_norm": 0.46907671848937027, + "learning_rate": 5.434652192585294e-06, + "loss": 0.0411, + "step": 4728 + }, + { + "epoch": 2.100377526093715, + "grad_norm": 0.41478959673973786, + "learning_rate": 5.432720961773765e-06, + "loss": 0.0323, + "step": 4729 + }, + { + "epoch": 2.1008216744392625, + "grad_norm": 0.6441994593730457, + "learning_rate": 5.430789665916925e-06, + "loss": 0.0477, + "step": 4730 + }, + { + "epoch": 2.1012658227848102, + "grad_norm": 0.7936242646685359, + "learning_rate": 5.428858305305079e-06, + "loss": 0.0438, + "step": 4731 + }, + { + "epoch": 2.1017099711303575, + "grad_norm": 0.4542564531841097, + "learning_rate": 5.426926880228547e-06, + "loss": 0.039, + "step": 4732 + }, + { + "epoch": 2.102154119475905, + "grad_norm": 0.43879296755150776, + "learning_rate": 5.424995390977651e-06, + "loss": 0.024, + "step": 4733 + }, + { + "epoch": 2.1025982678214525, + "grad_norm": 0.4720404728255635, + "learning_rate": 5.423063837842728e-06, + "loss": 0.0366, + "step": 4734 + }, + { + "epoch": 2.103042416167, + "grad_norm": 0.3210152022294204, + "learning_rate": 5.421132221114124e-06, + "loss": 0.0287, + "step": 4735 + }, + { + "epoch": 2.103486564512547, + "grad_norm": 0.8156671041659774, + "learning_rate": 5.419200541082194e-06, + "loss": 0.0423, + "step": 4736 + }, + { + "epoch": 2.1039307128580944, + "grad_norm": 0.5417603660166322, + "learning_rate": 5.417268798037303e-06, + "loss": 0.0388, + "step": 4737 + }, + { + "epoch": 2.104374861203642, + "grad_norm": 0.46205560016247954, + "learning_rate": 5.415336992269821e-06, + "loss": 0.0363, + "step": 4738 + }, + { + "epoch": 2.1048190095491894, + "grad_norm": 0.3348835489607936, + "learning_rate": 5.413405124070134e-06, + "loss": 0.0206, + "step": 4739 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 0.46193255668613026, + "learning_rate": 5.411473193728636e-06, + "loss": 0.0295, + "step": 4740 + }, + { + "epoch": 2.1057073062402845, + "grad_norm": 0.4346087883354204, + "learning_rate": 5.409541201535727e-06, + "loss": 0.039, + "step": 4741 + }, + { + "epoch": 2.1061514545858318, + "grad_norm": 0.46994212871132296, + "learning_rate": 5.407609147781816e-06, + "loss": 0.0374, + "step": 4742 + }, + { + "epoch": 2.106595602931379, + "grad_norm": 0.4605573678630878, + "learning_rate": 5.405677032757329e-06, + "loss": 0.0371, + "step": 4743 + }, + { + "epoch": 2.1070397512769263, + "grad_norm": 0.6425519607388686, + "learning_rate": 5.403744856752691e-06, + "loss": 0.0337, + "step": 4744 + }, + { + "epoch": 2.107483899622474, + "grad_norm": 0.34739298130141844, + "learning_rate": 5.401812620058343e-06, + "loss": 0.0269, + "step": 4745 + }, + { + "epoch": 2.1079280479680214, + "grad_norm": 0.4490417625063517, + "learning_rate": 5.399880322964733e-06, + "loss": 0.0372, + "step": 4746 + }, + { + "epoch": 2.1083721963135686, + "grad_norm": 0.6489786195301251, + "learning_rate": 5.397947965762317e-06, + "loss": 0.0509, + "step": 4747 + }, + { + "epoch": 2.108816344659116, + "grad_norm": 0.326521120100658, + "learning_rate": 5.396015548741562e-06, + "loss": 0.0288, + "step": 4748 + }, + { + "epoch": 2.1092604930046637, + "grad_norm": 0.4078407477455278, + "learning_rate": 5.394083072192944e-06, + "loss": 0.0382, + "step": 4749 + }, + { + "epoch": 2.109704641350211, + "grad_norm": 0.5177065744163067, + "learning_rate": 5.392150536406945e-06, + "loss": 0.0449, + "step": 4750 + }, + { + "epoch": 2.1101487896957583, + "grad_norm": 0.33995975799200945, + "learning_rate": 5.39021794167406e-06, + "loss": 0.0264, + "step": 4751 + }, + { + "epoch": 2.110592938041306, + "grad_norm": 0.42003616424190965, + "learning_rate": 5.388285288284787e-06, + "loss": 0.0372, + "step": 4752 + }, + { + "epoch": 2.1110370863868533, + "grad_norm": 0.3472432854029145, + "learning_rate": 5.386352576529641e-06, + "loss": 0.0307, + "step": 4753 + }, + { + "epoch": 2.1114812347324006, + "grad_norm": 0.4130591691843907, + "learning_rate": 5.384419806699141e-06, + "loss": 0.0276, + "step": 4754 + }, + { + "epoch": 2.111925383077948, + "grad_norm": 0.4591952950693401, + "learning_rate": 5.382486979083812e-06, + "loss": 0.0363, + "step": 4755 + }, + { + "epoch": 2.1123695314234956, + "grad_norm": 0.3857971037693101, + "learning_rate": 5.380554093974193e-06, + "loss": 0.0359, + "step": 4756 + }, + { + "epoch": 2.112813679769043, + "grad_norm": 0.37782187453237637, + "learning_rate": 5.37862115166083e-06, + "loss": 0.0358, + "step": 4757 + }, + { + "epoch": 2.11325782811459, + "grad_norm": 0.3395886410665437, + "learning_rate": 5.376688152434275e-06, + "loss": 0.0243, + "step": 4758 + }, + { + "epoch": 2.1137019764601375, + "grad_norm": 0.4470609752189812, + "learning_rate": 5.374755096585093e-06, + "loss": 0.0246, + "step": 4759 + }, + { + "epoch": 2.114146124805685, + "grad_norm": 0.4371584818717651, + "learning_rate": 5.372821984403854e-06, + "loss": 0.028, + "step": 4760 + }, + { + "epoch": 2.1145902731512325, + "grad_norm": 0.49305971027495465, + "learning_rate": 5.370888816181138e-06, + "loss": 0.0362, + "step": 4761 + }, + { + "epoch": 2.11503442149678, + "grad_norm": 0.4175857188422132, + "learning_rate": 5.368955592207531e-06, + "loss": 0.033, + "step": 4762 + }, + { + "epoch": 2.1154785698423275, + "grad_norm": 0.5139037264567338, + "learning_rate": 5.367022312773633e-06, + "loss": 0.0323, + "step": 4763 + }, + { + "epoch": 2.115922718187875, + "grad_norm": 0.8385325013800425, + "learning_rate": 5.365088978170045e-06, + "loss": 0.0387, + "step": 4764 + }, + { + "epoch": 2.116366866533422, + "grad_norm": 0.4295996848147971, + "learning_rate": 5.363155588687383e-06, + "loss": 0.0297, + "step": 4765 + }, + { + "epoch": 2.1168110148789694, + "grad_norm": 0.5951452068654114, + "learning_rate": 5.361222144616267e-06, + "loss": 0.0395, + "step": 4766 + }, + { + "epoch": 2.117255163224517, + "grad_norm": 0.3679591157867813, + "learning_rate": 5.359288646247326e-06, + "loss": 0.0324, + "step": 4767 + }, + { + "epoch": 2.1176993115700644, + "grad_norm": 0.351749833643755, + "learning_rate": 5.357355093871199e-06, + "loss": 0.0262, + "step": 4768 + }, + { + "epoch": 2.1181434599156117, + "grad_norm": 0.4550233701632367, + "learning_rate": 5.355421487778529e-06, + "loss": 0.0405, + "step": 4769 + }, + { + "epoch": 2.1185876082611594, + "grad_norm": 0.5196049007489856, + "learning_rate": 5.353487828259973e-06, + "loss": 0.0354, + "step": 4770 + }, + { + "epoch": 2.1190317566067067, + "grad_norm": 0.5003818333286508, + "learning_rate": 5.351554115606194e-06, + "loss": 0.0346, + "step": 4771 + }, + { + "epoch": 2.119475904952254, + "grad_norm": 0.48260320470669443, + "learning_rate": 5.349620350107857e-06, + "loss": 0.0363, + "step": 4772 + }, + { + "epoch": 2.1199200532978013, + "grad_norm": 0.5154065971255504, + "learning_rate": 5.347686532055643e-06, + "loss": 0.037, + "step": 4773 + }, + { + "epoch": 2.120364201643349, + "grad_norm": 0.5641510479972823, + "learning_rate": 5.345752661740236e-06, + "loss": 0.0418, + "step": 4774 + }, + { + "epoch": 2.1208083499888963, + "grad_norm": 0.3842072493781642, + "learning_rate": 5.343818739452332e-06, + "loss": 0.0289, + "step": 4775 + }, + { + "epoch": 2.1212524983344436, + "grad_norm": 0.48007816649878726, + "learning_rate": 5.34188476548263e-06, + "loss": 0.0382, + "step": 4776 + }, + { + "epoch": 2.121696646679991, + "grad_norm": 0.403727363474673, + "learning_rate": 5.339950740121842e-06, + "loss": 0.0355, + "step": 4777 + }, + { + "epoch": 2.1221407950255387, + "grad_norm": 0.5060901595070586, + "learning_rate": 5.338016663660681e-06, + "loss": 0.0372, + "step": 4778 + }, + { + "epoch": 2.122584943371086, + "grad_norm": 0.991711640962585, + "learning_rate": 5.336082536389875e-06, + "loss": 0.0575, + "step": 4779 + }, + { + "epoch": 2.1230290917166332, + "grad_norm": 0.6280177866580242, + "learning_rate": 5.334148358600154e-06, + "loss": 0.0524, + "step": 4780 + }, + { + "epoch": 2.123473240062181, + "grad_norm": 0.38877559204447126, + "learning_rate": 5.332214130582259e-06, + "loss": 0.0252, + "step": 4781 + }, + { + "epoch": 2.1239173884077283, + "grad_norm": 0.5153735769890214, + "learning_rate": 5.330279852626936e-06, + "loss": 0.0399, + "step": 4782 + }, + { + "epoch": 2.1243615367532755, + "grad_norm": 0.4835493880223582, + "learning_rate": 5.32834552502494e-06, + "loss": 0.0398, + "step": 4783 + }, + { + "epoch": 2.124805685098823, + "grad_norm": 0.5440156606024599, + "learning_rate": 5.326411148067036e-06, + "loss": 0.0359, + "step": 4784 + }, + { + "epoch": 2.1252498334443706, + "grad_norm": 0.47437512421776024, + "learning_rate": 5.324476722043991e-06, + "loss": 0.0357, + "step": 4785 + }, + { + "epoch": 2.125693981789918, + "grad_norm": 0.4015404790313753, + "learning_rate": 5.322542247246583e-06, + "loss": 0.0259, + "step": 4786 + }, + { + "epoch": 2.126138130135465, + "grad_norm": 0.3327411043647303, + "learning_rate": 5.320607723965594e-06, + "loss": 0.0288, + "step": 4787 + }, + { + "epoch": 2.1265822784810124, + "grad_norm": 0.4112477949684857, + "learning_rate": 5.318673152491821e-06, + "loss": 0.0306, + "step": 4788 + }, + { + "epoch": 2.12702642682656, + "grad_norm": 0.4352341389187628, + "learning_rate": 5.316738533116058e-06, + "loss": 0.0294, + "step": 4789 + }, + { + "epoch": 2.1274705751721075, + "grad_norm": 0.5247331669668354, + "learning_rate": 5.314803866129114e-06, + "loss": 0.0455, + "step": 4790 + }, + { + "epoch": 2.1279147235176548, + "grad_norm": 0.5740961919642005, + "learning_rate": 5.3128691518218015e-06, + "loss": 0.0313, + "step": 4791 + }, + { + "epoch": 2.1283588718632025, + "grad_norm": 0.3931985647767843, + "learning_rate": 5.310934390484939e-06, + "loss": 0.0303, + "step": 4792 + }, + { + "epoch": 2.12880302020875, + "grad_norm": 0.4665172928488037, + "learning_rate": 5.308999582409357e-06, + "loss": 0.0324, + "step": 4793 + }, + { + "epoch": 2.129247168554297, + "grad_norm": 0.394722347425358, + "learning_rate": 5.307064727885889e-06, + "loss": 0.0386, + "step": 4794 + }, + { + "epoch": 2.1296913168998444, + "grad_norm": 0.3880325853085057, + "learning_rate": 5.305129827205375e-06, + "loss": 0.0305, + "step": 4795 + }, + { + "epoch": 2.130135465245392, + "grad_norm": 0.33971233702286885, + "learning_rate": 5.303194880658668e-06, + "loss": 0.0336, + "step": 4796 + }, + { + "epoch": 2.1305796135909394, + "grad_norm": 0.5540350723444334, + "learning_rate": 5.301259888536616e-06, + "loss": 0.0369, + "step": 4797 + }, + { + "epoch": 2.1310237619364867, + "grad_norm": 0.562001973004502, + "learning_rate": 5.299324851130086e-06, + "loss": 0.0463, + "step": 4798 + }, + { + "epoch": 2.1314679102820344, + "grad_norm": 0.40052051170910113, + "learning_rate": 5.297389768729949e-06, + "loss": 0.0343, + "step": 4799 + }, + { + "epoch": 2.1319120586275817, + "grad_norm": 0.37582805444109113, + "learning_rate": 5.295454641627076e-06, + "loss": 0.0279, + "step": 4800 + }, + { + "epoch": 2.132356206973129, + "grad_norm": 0.556183127195747, + "learning_rate": 5.293519470112351e-06, + "loss": 0.055, + "step": 4801 + }, + { + "epoch": 2.1328003553186763, + "grad_norm": 0.43600853079083646, + "learning_rate": 5.2915842544766645e-06, + "loss": 0.0341, + "step": 4802 + }, + { + "epoch": 2.133244503664224, + "grad_norm": 0.43024160171177206, + "learning_rate": 5.289648995010912e-06, + "loss": 0.0311, + "step": 4803 + }, + { + "epoch": 2.1336886520097713, + "grad_norm": 0.4112510401014529, + "learning_rate": 5.287713692005993e-06, + "loss": 0.0319, + "step": 4804 + }, + { + "epoch": 2.1341328003553186, + "grad_norm": 0.4594812201613338, + "learning_rate": 5.285778345752821e-06, + "loss": 0.0372, + "step": 4805 + }, + { + "epoch": 2.134576948700866, + "grad_norm": 0.3657072390383706, + "learning_rate": 5.2838429565423074e-06, + "loss": 0.0283, + "step": 4806 + }, + { + "epoch": 2.1350210970464136, + "grad_norm": 0.6140456097709794, + "learning_rate": 5.281907524665377e-06, + "loss": 0.0454, + "step": 4807 + }, + { + "epoch": 2.135465245391961, + "grad_norm": 0.39045919917216204, + "learning_rate": 5.279972050412957e-06, + "loss": 0.0378, + "step": 4808 + }, + { + "epoch": 2.135909393737508, + "grad_norm": 0.3927097404516126, + "learning_rate": 5.278036534075981e-06, + "loss": 0.0333, + "step": 4809 + }, + { + "epoch": 2.136353542083056, + "grad_norm": 0.5754764647926375, + "learning_rate": 5.276100975945393e-06, + "loss": 0.0327, + "step": 4810 + }, + { + "epoch": 2.1367976904286032, + "grad_norm": 0.9692035662201081, + "learning_rate": 5.274165376312136e-06, + "loss": 0.0433, + "step": 4811 + }, + { + "epoch": 2.1372418387741505, + "grad_norm": 0.3898618853951006, + "learning_rate": 5.272229735467166e-06, + "loss": 0.0301, + "step": 4812 + }, + { + "epoch": 2.137685987119698, + "grad_norm": 0.5041899434970386, + "learning_rate": 5.270294053701442e-06, + "loss": 0.0293, + "step": 4813 + }, + { + "epoch": 2.1381301354652456, + "grad_norm": 0.4402127129653806, + "learning_rate": 5.268358331305931e-06, + "loss": 0.0321, + "step": 4814 + }, + { + "epoch": 2.138574283810793, + "grad_norm": 0.45642467547731175, + "learning_rate": 5.266422568571604e-06, + "loss": 0.0359, + "step": 4815 + }, + { + "epoch": 2.13901843215634, + "grad_norm": 0.3935008736888044, + "learning_rate": 5.264486765789439e-06, + "loss": 0.034, + "step": 4816 + }, + { + "epoch": 2.1394625805018874, + "grad_norm": 0.334289910641262, + "learning_rate": 5.262550923250421e-06, + "loss": 0.0235, + "step": 4817 + }, + { + "epoch": 2.139906728847435, + "grad_norm": 0.45188823216686197, + "learning_rate": 5.260615041245538e-06, + "loss": 0.0293, + "step": 4818 + }, + { + "epoch": 2.1403508771929824, + "grad_norm": 0.5579134180377057, + "learning_rate": 5.25867912006579e-06, + "loss": 0.0428, + "step": 4819 + }, + { + "epoch": 2.1407950255385297, + "grad_norm": 0.33710370134564305, + "learning_rate": 5.256743160002174e-06, + "loss": 0.0265, + "step": 4820 + }, + { + "epoch": 2.1412391738840775, + "grad_norm": 0.5496665050765797, + "learning_rate": 5.254807161345699e-06, + "loss": 0.0482, + "step": 4821 + }, + { + "epoch": 2.1416833222296248, + "grad_norm": 0.41971477565058973, + "learning_rate": 5.2528711243873795e-06, + "loss": 0.032, + "step": 4822 + }, + { + "epoch": 2.142127470575172, + "grad_norm": 0.4604575178968341, + "learning_rate": 5.2509350494182365e-06, + "loss": 0.0366, + "step": 4823 + }, + { + "epoch": 2.1425716189207193, + "grad_norm": 0.3714626261982935, + "learning_rate": 5.2489989367292916e-06, + "loss": 0.0256, + "step": 4824 + }, + { + "epoch": 2.143015767266267, + "grad_norm": 0.40647254628304297, + "learning_rate": 5.247062786611575e-06, + "loss": 0.034, + "step": 4825 + }, + { + "epoch": 2.1434599156118144, + "grad_norm": 0.534798459704081, + "learning_rate": 5.245126599356126e-06, + "loss": 0.0344, + "step": 4826 + }, + { + "epoch": 2.1439040639573617, + "grad_norm": 0.3767967274585948, + "learning_rate": 5.243190375253987e-06, + "loss": 0.0388, + "step": 4827 + }, + { + "epoch": 2.1443482123029094, + "grad_norm": 0.43416732973374494, + "learning_rate": 5.241254114596201e-06, + "loss": 0.036, + "step": 4828 + }, + { + "epoch": 2.1447923606484567, + "grad_norm": 0.37510655690708544, + "learning_rate": 5.2393178176738246e-06, + "loss": 0.0339, + "step": 4829 + }, + { + "epoch": 2.145236508994004, + "grad_norm": 0.509295659153276, + "learning_rate": 5.237381484777914e-06, + "loss": 0.0449, + "step": 4830 + }, + { + "epoch": 2.1456806573395513, + "grad_norm": 0.43018107465023353, + "learning_rate": 5.235445116199536e-06, + "loss": 0.0386, + "step": 4831 + }, + { + "epoch": 2.146124805685099, + "grad_norm": 0.532799480118876, + "learning_rate": 5.2335087122297545e-06, + "loss": 0.0536, + "step": 4832 + }, + { + "epoch": 2.1465689540306463, + "grad_norm": 0.3898351411048676, + "learning_rate": 5.231572273159649e-06, + "loss": 0.0328, + "step": 4833 + }, + { + "epoch": 2.1470131023761936, + "grad_norm": 0.34769775883890974, + "learning_rate": 5.229635799280298e-06, + "loss": 0.0297, + "step": 4834 + }, + { + "epoch": 2.147457250721741, + "grad_norm": 0.4711513587854737, + "learning_rate": 5.2276992908827825e-06, + "loss": 0.0422, + "step": 4835 + }, + { + "epoch": 2.1479013990672886, + "grad_norm": 0.405896587319854, + "learning_rate": 5.2257627482581985e-06, + "loss": 0.033, + "step": 4836 + }, + { + "epoch": 2.148345547412836, + "grad_norm": 0.3745089869031137, + "learning_rate": 5.2238261716976375e-06, + "loss": 0.0246, + "step": 4837 + }, + { + "epoch": 2.148789695758383, + "grad_norm": 0.40688545389600017, + "learning_rate": 5.2218895614922e-06, + "loss": 0.044, + "step": 4838 + }, + { + "epoch": 2.1492338441039305, + "grad_norm": 0.48659815613858365, + "learning_rate": 5.219952917932993e-06, + "loss": 0.0412, + "step": 4839 + }, + { + "epoch": 2.149677992449478, + "grad_norm": 0.5300858396288847, + "learning_rate": 5.218016241311126e-06, + "loss": 0.0441, + "step": 4840 + }, + { + "epoch": 2.1501221407950255, + "grad_norm": 0.5970494126272853, + "learning_rate": 5.216079531917714e-06, + "loss": 0.0377, + "step": 4841 + }, + { + "epoch": 2.150566289140573, + "grad_norm": 0.4023665494232216, + "learning_rate": 5.2141427900438765e-06, + "loss": 0.0261, + "step": 4842 + }, + { + "epoch": 2.1510104374861205, + "grad_norm": 0.3360515019130085, + "learning_rate": 5.212206015980742e-06, + "loss": 0.0232, + "step": 4843 + }, + { + "epoch": 2.151454585831668, + "grad_norm": 0.37156982937954675, + "learning_rate": 5.210269210019438e-06, + "loss": 0.0313, + "step": 4844 + }, + { + "epoch": 2.151898734177215, + "grad_norm": 0.43881581226491895, + "learning_rate": 5.2083323724511e-06, + "loss": 0.0387, + "step": 4845 + }, + { + "epoch": 2.1523428825227624, + "grad_norm": 0.5133014074927194, + "learning_rate": 5.206395503566867e-06, + "loss": 0.033, + "step": 4846 + }, + { + "epoch": 2.15278703086831, + "grad_norm": 0.7393084909457972, + "learning_rate": 5.204458603657885e-06, + "loss": 0.0451, + "step": 4847 + }, + { + "epoch": 2.1532311792138574, + "grad_norm": 0.5276996020099548, + "learning_rate": 5.2025216730153016e-06, + "loss": 0.0244, + "step": 4848 + }, + { + "epoch": 2.1536753275594047, + "grad_norm": 0.47830343576785345, + "learning_rate": 5.200584711930267e-06, + "loss": 0.0387, + "step": 4849 + }, + { + "epoch": 2.1541194759049525, + "grad_norm": 0.5578861068890496, + "learning_rate": 5.198647720693948e-06, + "loss": 0.0421, + "step": 4850 + }, + { + "epoch": 2.1545636242504997, + "grad_norm": 0.3280166182279637, + "learning_rate": 5.1967106995975e-06, + "loss": 0.0233, + "step": 4851 + }, + { + "epoch": 2.155007772596047, + "grad_norm": 0.5452793800296186, + "learning_rate": 5.194773648932092e-06, + "loss": 0.0403, + "step": 4852 + }, + { + "epoch": 2.1554519209415943, + "grad_norm": 0.5130821544913388, + "learning_rate": 5.192836568988895e-06, + "loss": 0.0307, + "step": 4853 + }, + { + "epoch": 2.155896069287142, + "grad_norm": 0.33770672647996586, + "learning_rate": 5.190899460059088e-06, + "loss": 0.0232, + "step": 4854 + }, + { + "epoch": 2.1563402176326893, + "grad_norm": 0.6783231395478261, + "learning_rate": 5.188962322433848e-06, + "loss": 0.0359, + "step": 4855 + }, + { + "epoch": 2.1567843659782366, + "grad_norm": 0.49312348081568724, + "learning_rate": 5.187025156404361e-06, + "loss": 0.051, + "step": 4856 + }, + { + "epoch": 2.1572285143237844, + "grad_norm": 0.5188160057025217, + "learning_rate": 5.185087962261817e-06, + "loss": 0.037, + "step": 4857 + }, + { + "epoch": 2.1576726626693317, + "grad_norm": 0.4979475713257613, + "learning_rate": 5.183150740297407e-06, + "loss": 0.0457, + "step": 4858 + }, + { + "epoch": 2.158116811014879, + "grad_norm": 0.8466081637820392, + "learning_rate": 5.181213490802329e-06, + "loss": 0.0467, + "step": 4859 + }, + { + "epoch": 2.1585609593604262, + "grad_norm": 0.4708804574175826, + "learning_rate": 5.179276214067788e-06, + "loss": 0.0404, + "step": 4860 + }, + { + "epoch": 2.159005107705974, + "grad_norm": 0.3951062024143596, + "learning_rate": 5.1773389103849835e-06, + "loss": 0.0262, + "step": 4861 + }, + { + "epoch": 2.1594492560515213, + "grad_norm": 0.5862820412725568, + "learning_rate": 5.175401580045131e-06, + "loss": 0.0471, + "step": 4862 + }, + { + "epoch": 2.1598934043970686, + "grad_norm": 0.3583235113891142, + "learning_rate": 5.173464223339438e-06, + "loss": 0.0261, + "step": 4863 + }, + { + "epoch": 2.160337552742616, + "grad_norm": 0.7139304922452345, + "learning_rate": 5.171526840559129e-06, + "loss": 0.0581, + "step": 4864 + }, + { + "epoch": 2.1607817010881636, + "grad_norm": 0.41880853503590465, + "learning_rate": 5.169589431995421e-06, + "loss": 0.0305, + "step": 4865 + }, + { + "epoch": 2.161225849433711, + "grad_norm": 0.974797863480534, + "learning_rate": 5.16765199793954e-06, + "loss": 0.0476, + "step": 4866 + }, + { + "epoch": 2.161669997779258, + "grad_norm": 0.6058306397735719, + "learning_rate": 5.165714538682716e-06, + "loss": 0.0603, + "step": 4867 + }, + { + "epoch": 2.1621141461248055, + "grad_norm": 0.6017608849107138, + "learning_rate": 5.163777054516182e-06, + "loss": 0.0415, + "step": 4868 + }, + { + "epoch": 2.162558294470353, + "grad_norm": 1.5666330014886598, + "learning_rate": 5.161839545731175e-06, + "loss": 0.0424, + "step": 4869 + }, + { + "epoch": 2.1630024428159005, + "grad_norm": 0.4625370484241095, + "learning_rate": 5.159902012618933e-06, + "loss": 0.0337, + "step": 4870 + }, + { + "epoch": 2.1634465911614478, + "grad_norm": 0.6306574373250589, + "learning_rate": 5.1579644554707054e-06, + "loss": 0.057, + "step": 4871 + }, + { + "epoch": 2.1638907395069955, + "grad_norm": 0.9235120056329936, + "learning_rate": 5.156026874577735e-06, + "loss": 0.0561, + "step": 4872 + }, + { + "epoch": 2.164334887852543, + "grad_norm": 0.4148788359576288, + "learning_rate": 5.154089270231275e-06, + "loss": 0.0365, + "step": 4873 + }, + { + "epoch": 2.16477903619809, + "grad_norm": 0.5384050767235271, + "learning_rate": 5.152151642722582e-06, + "loss": 0.0397, + "step": 4874 + }, + { + "epoch": 2.1652231845436374, + "grad_norm": 0.42398179308700884, + "learning_rate": 5.15021399234291e-06, + "loss": 0.0244, + "step": 4875 + }, + { + "epoch": 2.165667332889185, + "grad_norm": 0.4428879862840884, + "learning_rate": 5.148276319383525e-06, + "loss": 0.0338, + "step": 4876 + }, + { + "epoch": 2.1661114812347324, + "grad_norm": 0.5151742619891833, + "learning_rate": 5.146338624135689e-06, + "loss": 0.0429, + "step": 4877 + }, + { + "epoch": 2.1665556295802797, + "grad_norm": 0.5906956484118873, + "learning_rate": 5.144400906890672e-06, + "loss": 0.0486, + "step": 4878 + }, + { + "epoch": 2.1669997779258274, + "grad_norm": 0.5089563089876176, + "learning_rate": 5.142463167939748e-06, + "loss": 0.0336, + "step": 4879 + }, + { + "epoch": 2.1674439262713747, + "grad_norm": 0.4990120356759535, + "learning_rate": 5.140525407574187e-06, + "loss": 0.0397, + "step": 4880 + }, + { + "epoch": 2.167888074616922, + "grad_norm": 0.47055574764670566, + "learning_rate": 5.138587626085271e-06, + "loss": 0.0411, + "step": 4881 + }, + { + "epoch": 2.1683322229624693, + "grad_norm": 0.49103425451011984, + "learning_rate": 5.136649823764281e-06, + "loss": 0.0422, + "step": 4882 + }, + { + "epoch": 2.168776371308017, + "grad_norm": 0.44674321466653016, + "learning_rate": 5.1347120009025005e-06, + "loss": 0.0312, + "step": 4883 + }, + { + "epoch": 2.1692205196535643, + "grad_norm": 0.2849178715545674, + "learning_rate": 5.132774157791218e-06, + "loss": 0.0226, + "step": 4884 + }, + { + "epoch": 2.1696646679991116, + "grad_norm": 0.46152868939886915, + "learning_rate": 5.130836294721726e-06, + "loss": 0.0364, + "step": 4885 + }, + { + "epoch": 2.170108816344659, + "grad_norm": 0.43325439116729864, + "learning_rate": 5.128898411985315e-06, + "loss": 0.0288, + "step": 4886 + }, + { + "epoch": 2.1705529646902066, + "grad_norm": 0.39479226787289406, + "learning_rate": 5.1269605098732825e-06, + "loss": 0.0251, + "step": 4887 + }, + { + "epoch": 2.170997113035754, + "grad_norm": 0.41019425261041265, + "learning_rate": 5.12502258867693e-06, + "loss": 0.031, + "step": 4888 + }, + { + "epoch": 2.1714412613813012, + "grad_norm": 0.508850109174841, + "learning_rate": 5.123084648687557e-06, + "loss": 0.0474, + "step": 4889 + }, + { + "epoch": 2.171885409726849, + "grad_norm": 0.42681095953850656, + "learning_rate": 5.121146690196472e-06, + "loss": 0.0244, + "step": 4890 + }, + { + "epoch": 2.1723295580723962, + "grad_norm": 0.32346458488317453, + "learning_rate": 5.1192087134949804e-06, + "loss": 0.0264, + "step": 4891 + }, + { + "epoch": 2.1727737064179435, + "grad_norm": 0.40304790268945045, + "learning_rate": 5.1172707188743955e-06, + "loss": 0.0264, + "step": 4892 + }, + { + "epoch": 2.173217854763491, + "grad_norm": 0.5433823527011491, + "learning_rate": 5.115332706626028e-06, + "loss": 0.0459, + "step": 4893 + }, + { + "epoch": 2.1736620031090386, + "grad_norm": 0.33519824313183216, + "learning_rate": 5.113394677041197e-06, + "loss": 0.0289, + "step": 4894 + }, + { + "epoch": 2.174106151454586, + "grad_norm": 0.399864916190364, + "learning_rate": 5.111456630411218e-06, + "loss": 0.026, + "step": 4895 + }, + { + "epoch": 2.174550299800133, + "grad_norm": 0.49817314831150256, + "learning_rate": 5.109518567027416e-06, + "loss": 0.0341, + "step": 4896 + }, + { + "epoch": 2.1749944481456804, + "grad_norm": 0.41814385729810516, + "learning_rate": 5.107580487181112e-06, + "loss": 0.0315, + "step": 4897 + }, + { + "epoch": 2.175438596491228, + "grad_norm": 0.44895548729597046, + "learning_rate": 5.105642391163633e-06, + "loss": 0.0403, + "step": 4898 + }, + { + "epoch": 2.1758827448367755, + "grad_norm": 0.3688472653021723, + "learning_rate": 5.10370427926631e-06, + "loss": 0.0345, + "step": 4899 + }, + { + "epoch": 2.1763268931823228, + "grad_norm": 0.4629343116213356, + "learning_rate": 5.1017661517804694e-06, + "loss": 0.0353, + "step": 4900 + }, + { + "epoch": 2.1767710415278705, + "grad_norm": 0.4559917864661212, + "learning_rate": 5.099828008997448e-06, + "loss": 0.0398, + "step": 4901 + }, + { + "epoch": 2.1772151898734178, + "grad_norm": 0.45480161362407634, + "learning_rate": 5.097889851208583e-06, + "loss": 0.035, + "step": 4902 + }, + { + "epoch": 2.177659338218965, + "grad_norm": 0.42473449296712756, + "learning_rate": 5.0959516787052085e-06, + "loss": 0.0317, + "step": 4903 + }, + { + "epoch": 2.1781034865645124, + "grad_norm": 0.4952860166933172, + "learning_rate": 5.094013491778668e-06, + "loss": 0.0443, + "step": 4904 + }, + { + "epoch": 2.17854763491006, + "grad_norm": 0.4001101623153066, + "learning_rate": 5.092075290720302e-06, + "loss": 0.0298, + "step": 4905 + }, + { + "epoch": 2.1789917832556074, + "grad_norm": 0.5225430247317463, + "learning_rate": 5.0901370758214565e-06, + "loss": 0.0398, + "step": 4906 + }, + { + "epoch": 2.1794359316011547, + "grad_norm": 0.6146833193819635, + "learning_rate": 5.088198847373477e-06, + "loss": 0.0464, + "step": 4907 + }, + { + "epoch": 2.1798800799467024, + "grad_norm": 0.47802919024440194, + "learning_rate": 5.086260605667712e-06, + "loss": 0.037, + "step": 4908 + }, + { + "epoch": 2.1803242282922497, + "grad_norm": 0.5368694025591854, + "learning_rate": 5.084322350995512e-06, + "loss": 0.0488, + "step": 4909 + }, + { + "epoch": 2.180768376637797, + "grad_norm": 0.32415608827454506, + "learning_rate": 5.0823840836482316e-06, + "loss": 0.0233, + "step": 4910 + }, + { + "epoch": 2.1812125249833443, + "grad_norm": 0.4761380556003533, + "learning_rate": 5.080445803917225e-06, + "loss": 0.0471, + "step": 4911 + }, + { + "epoch": 2.181656673328892, + "grad_norm": 0.3515304190735633, + "learning_rate": 5.078507512093844e-06, + "loss": 0.0259, + "step": 4912 + }, + { + "epoch": 2.1821008216744393, + "grad_norm": 0.6155251528359741, + "learning_rate": 5.076569208469454e-06, + "loss": 0.0474, + "step": 4913 + }, + { + "epoch": 2.1825449700199866, + "grad_norm": 0.4580217790122366, + "learning_rate": 5.0746308933354105e-06, + "loss": 0.0438, + "step": 4914 + }, + { + "epoch": 2.182989118365534, + "grad_norm": 0.408141050974567, + "learning_rate": 5.072692566983074e-06, + "loss": 0.0376, + "step": 4915 + }, + { + "epoch": 2.1834332667110816, + "grad_norm": 0.4622430926137514, + "learning_rate": 5.070754229703811e-06, + "loss": 0.0325, + "step": 4916 + }, + { + "epoch": 2.183877415056629, + "grad_norm": 0.49891927431787836, + "learning_rate": 5.068815881788986e-06, + "loss": 0.0383, + "step": 4917 + }, + { + "epoch": 2.184321563402176, + "grad_norm": 0.3944468340370449, + "learning_rate": 5.0668775235299636e-06, + "loss": 0.0364, + "step": 4918 + }, + { + "epoch": 2.184765711747724, + "grad_norm": 0.5143753892518631, + "learning_rate": 5.064939155218115e-06, + "loss": 0.0327, + "step": 4919 + }, + { + "epoch": 2.1852098600932712, + "grad_norm": 0.4613419832071711, + "learning_rate": 5.0630007771448064e-06, + "loss": 0.0365, + "step": 4920 + }, + { + "epoch": 2.1856540084388185, + "grad_norm": 0.42353949789442463, + "learning_rate": 5.061062389601413e-06, + "loss": 0.0305, + "step": 4921 + }, + { + "epoch": 2.186098156784366, + "grad_norm": 0.4615529011211128, + "learning_rate": 5.059123992879303e-06, + "loss": 0.0363, + "step": 4922 + }, + { + "epoch": 2.1865423051299135, + "grad_norm": 0.38154251403565953, + "learning_rate": 5.057185587269854e-06, + "loss": 0.0237, + "step": 4923 + }, + { + "epoch": 2.186986453475461, + "grad_norm": 0.45850860345485767, + "learning_rate": 5.05524717306444e-06, + "loss": 0.0309, + "step": 4924 + }, + { + "epoch": 2.187430601821008, + "grad_norm": 0.3296860610369256, + "learning_rate": 5.053308750554437e-06, + "loss": 0.0311, + "step": 4925 + }, + { + "epoch": 2.1878747501665554, + "grad_norm": 0.3804383024783399, + "learning_rate": 5.051370320031221e-06, + "loss": 0.0337, + "step": 4926 + }, + { + "epoch": 2.188318898512103, + "grad_norm": 0.44377911056623215, + "learning_rate": 5.049431881786176e-06, + "loss": 0.0451, + "step": 4927 + }, + { + "epoch": 2.1887630468576504, + "grad_norm": 0.3897615063112148, + "learning_rate": 5.04749343611068e-06, + "loss": 0.0264, + "step": 4928 + }, + { + "epoch": 2.1892071952031977, + "grad_norm": 0.857708991433585, + "learning_rate": 5.045554983296111e-06, + "loss": 0.0421, + "step": 4929 + }, + { + "epoch": 2.1896513435487455, + "grad_norm": 0.4460864492390492, + "learning_rate": 5.043616523633856e-06, + "loss": 0.0348, + "step": 4930 + }, + { + "epoch": 2.1900954918942928, + "grad_norm": 0.46033992290883, + "learning_rate": 5.0416780574152976e-06, + "loss": 0.0391, + "step": 4931 + }, + { + "epoch": 2.19053964023984, + "grad_norm": 0.36797085648755834, + "learning_rate": 5.0397395849318165e-06, + "loss": 0.0379, + "step": 4932 + }, + { + "epoch": 2.1909837885853873, + "grad_norm": 0.3675945320964035, + "learning_rate": 5.0378011064748025e-06, + "loss": 0.0252, + "step": 4933 + }, + { + "epoch": 2.191427936930935, + "grad_norm": 0.6163310761797228, + "learning_rate": 5.035862622335641e-06, + "loss": 0.0477, + "step": 4934 + }, + { + "epoch": 2.1918720852764824, + "grad_norm": 0.5755380695413213, + "learning_rate": 5.0339241328057164e-06, + "loss": 0.0321, + "step": 4935 + }, + { + "epoch": 2.1923162336220297, + "grad_norm": 0.7365554315970313, + "learning_rate": 5.0319856381764175e-06, + "loss": 0.0421, + "step": 4936 + }, + { + "epoch": 2.1927603819675774, + "grad_norm": 0.3786316850609603, + "learning_rate": 5.030047138739136e-06, + "loss": 0.0274, + "step": 4937 + }, + { + "epoch": 2.1932045303131247, + "grad_norm": 0.46503560614578676, + "learning_rate": 5.028108634785258e-06, + "loss": 0.0329, + "step": 4938 + }, + { + "epoch": 2.193648678658672, + "grad_norm": 0.39835692305695053, + "learning_rate": 5.0261701266061746e-06, + "loss": 0.0267, + "step": 4939 + }, + { + "epoch": 2.1940928270042193, + "grad_norm": 0.3883823936267887, + "learning_rate": 5.024231614493277e-06, + "loss": 0.0284, + "step": 4940 + }, + { + "epoch": 2.194536975349767, + "grad_norm": 0.38466420585767425, + "learning_rate": 5.022293098737957e-06, + "loss": 0.0302, + "step": 4941 + }, + { + "epoch": 2.1949811236953143, + "grad_norm": 0.5034584168375414, + "learning_rate": 5.0203545796316044e-06, + "loss": 0.05, + "step": 4942 + }, + { + "epoch": 2.1954252720408616, + "grad_norm": 0.5196054438026021, + "learning_rate": 5.0184160574656125e-06, + "loss": 0.0348, + "step": 4943 + }, + { + "epoch": 2.195869420386409, + "grad_norm": 0.4645537912636916, + "learning_rate": 5.0164775325313755e-06, + "loss": 0.0376, + "step": 4944 + }, + { + "epoch": 2.1963135687319566, + "grad_norm": 0.4894769238899238, + "learning_rate": 5.0145390051202846e-06, + "loss": 0.0357, + "step": 4945 + }, + { + "epoch": 2.196757717077504, + "grad_norm": 0.3225076116998955, + "learning_rate": 5.012600475523733e-06, + "loss": 0.0268, + "step": 4946 + }, + { + "epoch": 2.197201865423051, + "grad_norm": 0.46007489982886224, + "learning_rate": 5.010661944033118e-06, + "loss": 0.0406, + "step": 4947 + }, + { + "epoch": 2.197646013768599, + "grad_norm": 0.6157542604470476, + "learning_rate": 5.008723410939832e-06, + "loss": 0.0329, + "step": 4948 + }, + { + "epoch": 2.198090162114146, + "grad_norm": 0.41686085632858955, + "learning_rate": 5.006784876535268e-06, + "loss": 0.0416, + "step": 4949 + }, + { + "epoch": 2.1985343104596935, + "grad_norm": 0.5468745101745486, + "learning_rate": 5.004846341110822e-06, + "loss": 0.0499, + "step": 4950 + }, + { + "epoch": 2.198978458805241, + "grad_norm": 0.43554432758066686, + "learning_rate": 5.002907804957889e-06, + "loss": 0.0353, + "step": 4951 + }, + { + "epoch": 2.1994226071507885, + "grad_norm": 0.4925752472694445, + "learning_rate": 5.000969268367862e-06, + "loss": 0.0367, + "step": 4952 + }, + { + "epoch": 2.199866755496336, + "grad_norm": 0.4262017675003274, + "learning_rate": 4.999030731632139e-06, + "loss": 0.0444, + "step": 4953 + }, + { + "epoch": 2.200310903841883, + "grad_norm": 0.3899535716527509, + "learning_rate": 4.997092195042113e-06, + "loss": 0.0299, + "step": 4954 + }, + { + "epoch": 2.2007550521874304, + "grad_norm": 0.4534373375430896, + "learning_rate": 4.995153658889181e-06, + "loss": 0.0346, + "step": 4955 + }, + { + "epoch": 2.201199200532978, + "grad_norm": 0.32786374874254637, + "learning_rate": 4.993215123464734e-06, + "loss": 0.0292, + "step": 4956 + }, + { + "epoch": 2.2016433488785254, + "grad_norm": 0.48869610889723925, + "learning_rate": 4.991276589060169e-06, + "loss": 0.0435, + "step": 4957 + }, + { + "epoch": 2.2020874972240727, + "grad_norm": 0.41141706707817044, + "learning_rate": 4.989338055966883e-06, + "loss": 0.0317, + "step": 4958 + }, + { + "epoch": 2.2025316455696204, + "grad_norm": 0.35799145890163087, + "learning_rate": 4.987399524476268e-06, + "loss": 0.0343, + "step": 4959 + }, + { + "epoch": 2.2029757939151677, + "grad_norm": 0.39541269197255546, + "learning_rate": 4.985460994879717e-06, + "loss": 0.0303, + "step": 4960 + }, + { + "epoch": 2.203419942260715, + "grad_norm": 0.4446866574845067, + "learning_rate": 4.983522467468627e-06, + "loss": 0.0458, + "step": 4961 + }, + { + "epoch": 2.2038640906062623, + "grad_norm": 0.44841266243980166, + "learning_rate": 4.981583942534388e-06, + "loss": 0.0401, + "step": 4962 + }, + { + "epoch": 2.20430823895181, + "grad_norm": 0.40609549354200053, + "learning_rate": 4.979645420368397e-06, + "loss": 0.0318, + "step": 4963 + }, + { + "epoch": 2.2047523872973573, + "grad_norm": 0.4655683020461079, + "learning_rate": 4.977706901262045e-06, + "loss": 0.0318, + "step": 4964 + }, + { + "epoch": 2.2051965356429046, + "grad_norm": 0.4105935741895686, + "learning_rate": 4.975768385506725e-06, + "loss": 0.0365, + "step": 4965 + }, + { + "epoch": 2.2056406839884524, + "grad_norm": 0.4840996358141117, + "learning_rate": 4.973829873393827e-06, + "loss": 0.0288, + "step": 4966 + }, + { + "epoch": 2.2060848323339997, + "grad_norm": 0.43687024211685477, + "learning_rate": 4.971891365214743e-06, + "loss": 0.0284, + "step": 4967 + }, + { + "epoch": 2.206528980679547, + "grad_norm": 0.5919063970733193, + "learning_rate": 4.969952861260865e-06, + "loss": 0.0316, + "step": 4968 + }, + { + "epoch": 2.2069731290250942, + "grad_norm": 0.34222056631687486, + "learning_rate": 4.968014361823583e-06, + "loss": 0.0269, + "step": 4969 + }, + { + "epoch": 2.207417277370642, + "grad_norm": 0.577057583701676, + "learning_rate": 4.966075867194285e-06, + "loss": 0.0391, + "step": 4970 + }, + { + "epoch": 2.2078614257161893, + "grad_norm": 0.45130320193766277, + "learning_rate": 4.964137377664362e-06, + "loss": 0.0358, + "step": 4971 + }, + { + "epoch": 2.2083055740617366, + "grad_norm": 0.37843493642246295, + "learning_rate": 4.9621988935252e-06, + "loss": 0.0356, + "step": 4972 + }, + { + "epoch": 2.208749722407284, + "grad_norm": 0.37308118817776387, + "learning_rate": 4.9602604150681835e-06, + "loss": 0.0244, + "step": 4973 + }, + { + "epoch": 2.2091938707528316, + "grad_norm": 0.5336868565377078, + "learning_rate": 4.958321942584703e-06, + "loss": 0.0488, + "step": 4974 + }, + { + "epoch": 2.209638019098379, + "grad_norm": 0.3742330589994002, + "learning_rate": 4.956383476366145e-06, + "loss": 0.0309, + "step": 4975 + }, + { + "epoch": 2.210082167443926, + "grad_norm": 0.46020603183712594, + "learning_rate": 4.95444501670389e-06, + "loss": 0.0323, + "step": 4976 + }, + { + "epoch": 2.2105263157894735, + "grad_norm": 0.8475632066917597, + "learning_rate": 4.9525065638893226e-06, + "loss": 0.0465, + "step": 4977 + }, + { + "epoch": 2.210970464135021, + "grad_norm": 0.5219077605882717, + "learning_rate": 4.950568118213825e-06, + "loss": 0.0393, + "step": 4978 + }, + { + "epoch": 2.2114146124805685, + "grad_norm": 0.37860034305038076, + "learning_rate": 4.948629679968778e-06, + "loss": 0.0366, + "step": 4979 + }, + { + "epoch": 2.2118587608261158, + "grad_norm": 0.6128046042996554, + "learning_rate": 4.946691249445565e-06, + "loss": 0.04, + "step": 4980 + }, + { + "epoch": 2.2123029091716635, + "grad_norm": 0.4839071616267329, + "learning_rate": 4.944752826935562e-06, + "loss": 0.035, + "step": 4981 + }, + { + "epoch": 2.212747057517211, + "grad_norm": 0.45065394012575055, + "learning_rate": 4.942814412730147e-06, + "loss": 0.0267, + "step": 4982 + }, + { + "epoch": 2.213191205862758, + "grad_norm": 0.5986492760384308, + "learning_rate": 4.940876007120699e-06, + "loss": 0.0289, + "step": 4983 + }, + { + "epoch": 2.2136353542083054, + "grad_norm": 0.5082807449831283, + "learning_rate": 4.938937610398588e-06, + "loss": 0.034, + "step": 4984 + }, + { + "epoch": 2.214079502553853, + "grad_norm": 0.46534139162341914, + "learning_rate": 4.9369992228551935e-06, + "loss": 0.0323, + "step": 4985 + }, + { + "epoch": 2.2145236508994004, + "grad_norm": 0.4943061938684834, + "learning_rate": 4.935060844781886e-06, + "loss": 0.0353, + "step": 4986 + }, + { + "epoch": 2.2149677992449477, + "grad_norm": 0.4336318300801724, + "learning_rate": 4.933122476470038e-06, + "loss": 0.0275, + "step": 4987 + }, + { + "epoch": 2.2154119475904954, + "grad_norm": 0.4360854695437589, + "learning_rate": 4.931184118211016e-06, + "loss": 0.0349, + "step": 4988 + }, + { + "epoch": 2.2158560959360427, + "grad_norm": 0.6620193372024554, + "learning_rate": 4.929245770296191e-06, + "loss": 0.0483, + "step": 4989 + }, + { + "epoch": 2.21630024428159, + "grad_norm": 0.48994703833403463, + "learning_rate": 4.927307433016927e-06, + "loss": 0.0432, + "step": 4990 + }, + { + "epoch": 2.2167443926271373, + "grad_norm": 0.4622061500888628, + "learning_rate": 4.925369106664591e-06, + "loss": 0.0387, + "step": 4991 + }, + { + "epoch": 2.217188540972685, + "grad_norm": 0.4624898194765989, + "learning_rate": 4.923430791530547e-06, + "loss": 0.0196, + "step": 4992 + }, + { + "epoch": 2.2176326893182323, + "grad_norm": 0.3453109472117995, + "learning_rate": 4.9214924879061565e-06, + "loss": 0.0286, + "step": 4993 + }, + { + "epoch": 2.2180768376637796, + "grad_norm": 0.41871877730955476, + "learning_rate": 4.919554196082778e-06, + "loss": 0.0397, + "step": 4994 + }, + { + "epoch": 2.2185209860093273, + "grad_norm": 0.42587370028059823, + "learning_rate": 4.91761591635177e-06, + "loss": 0.0408, + "step": 4995 + }, + { + "epoch": 2.2189651343548746, + "grad_norm": 0.36571656649505513, + "learning_rate": 4.9156776490044875e-06, + "loss": 0.0335, + "step": 4996 + }, + { + "epoch": 2.219409282700422, + "grad_norm": 0.4229412174299105, + "learning_rate": 4.91373939433229e-06, + "loss": 0.0396, + "step": 4997 + }, + { + "epoch": 2.219853431045969, + "grad_norm": 0.3904502538208118, + "learning_rate": 4.911801152626525e-06, + "loss": 0.0352, + "step": 4998 + }, + { + "epoch": 2.220297579391517, + "grad_norm": 0.7376575867139856, + "learning_rate": 4.909862924178545e-06, + "loss": 0.04, + "step": 4999 + }, + { + "epoch": 2.2207417277370642, + "grad_norm": 0.45886434872272697, + "learning_rate": 4.9079247092797e-06, + "loss": 0.0375, + "step": 5000 + }, + { + "epoch": 2.2211858760826115, + "grad_norm": 0.38398725710621795, + "learning_rate": 4.905986508221333e-06, + "loss": 0.034, + "step": 5001 + }, + { + "epoch": 2.221630024428159, + "grad_norm": 0.4363686982956202, + "learning_rate": 4.904048321294791e-06, + "loss": 0.0395, + "step": 5002 + }, + { + "epoch": 2.2220741727737066, + "grad_norm": 0.47908384837669443, + "learning_rate": 4.9021101487914185e-06, + "loss": 0.0451, + "step": 5003 + }, + { + "epoch": 2.222518321119254, + "grad_norm": 0.49404577583056375, + "learning_rate": 4.900171991002553e-06, + "loss": 0.0382, + "step": 5004 + }, + { + "epoch": 2.222962469464801, + "grad_norm": 0.4254636742864719, + "learning_rate": 4.898233848219532e-06, + "loss": 0.0385, + "step": 5005 + }, + { + "epoch": 2.2234066178103484, + "grad_norm": 0.5219692883962996, + "learning_rate": 4.896295720733694e-06, + "loss": 0.04, + "step": 5006 + }, + { + "epoch": 2.223850766155896, + "grad_norm": 0.8849555604502084, + "learning_rate": 4.894357608836368e-06, + "loss": 0.0357, + "step": 5007 + }, + { + "epoch": 2.2242949145014435, + "grad_norm": 0.772611179602613, + "learning_rate": 4.89241951281889e-06, + "loss": 0.0418, + "step": 5008 + }, + { + "epoch": 2.2247390628469907, + "grad_norm": 0.5532053379610344, + "learning_rate": 4.890481432972586e-06, + "loss": 0.0352, + "step": 5009 + }, + { + "epoch": 2.2251832111925385, + "grad_norm": 0.4003154527748429, + "learning_rate": 4.8885433695887836e-06, + "loss": 0.032, + "step": 5010 + }, + { + "epoch": 2.2256273595380858, + "grad_norm": 0.45204263035502396, + "learning_rate": 4.886605322958806e-06, + "loss": 0.0303, + "step": 5011 + }, + { + "epoch": 2.226071507883633, + "grad_norm": 0.3440942170404326, + "learning_rate": 4.884667293373973e-06, + "loss": 0.022, + "step": 5012 + }, + { + "epoch": 2.2265156562291804, + "grad_norm": 0.7116582097517625, + "learning_rate": 4.882729281125605e-06, + "loss": 0.0523, + "step": 5013 + }, + { + "epoch": 2.226959804574728, + "grad_norm": 0.4053030773514669, + "learning_rate": 4.88079128650502e-06, + "loss": 0.027, + "step": 5014 + }, + { + "epoch": 2.2274039529202754, + "grad_norm": 0.6656245142980673, + "learning_rate": 4.878853309803529e-06, + "loss": 0.0462, + "step": 5015 + }, + { + "epoch": 2.2278481012658227, + "grad_norm": 0.38760410447982047, + "learning_rate": 4.876915351312444e-06, + "loss": 0.0265, + "step": 5016 + }, + { + "epoch": 2.2282922496113704, + "grad_norm": 0.407666066744564, + "learning_rate": 4.874977411323073e-06, + "loss": 0.0354, + "step": 5017 + }, + { + "epoch": 2.2287363979569177, + "grad_norm": 0.483920298386755, + "learning_rate": 4.873039490126718e-06, + "loss": 0.0367, + "step": 5018 + }, + { + "epoch": 2.229180546302465, + "grad_norm": 0.45353775305411304, + "learning_rate": 4.871101588014686e-06, + "loss": 0.0358, + "step": 5019 + }, + { + "epoch": 2.2296246946480123, + "grad_norm": 0.4339146999685688, + "learning_rate": 4.869163705278276e-06, + "loss": 0.0326, + "step": 5020 + }, + { + "epoch": 2.23006884299356, + "grad_norm": 1.02624057870213, + "learning_rate": 4.867225842208783e-06, + "loss": 0.0598, + "step": 5021 + }, + { + "epoch": 2.2305129913391073, + "grad_norm": 0.44283161622309714, + "learning_rate": 4.8652879990975e-06, + "loss": 0.0378, + "step": 5022 + }, + { + "epoch": 2.2309571396846546, + "grad_norm": 0.44468206860911264, + "learning_rate": 4.863350176235721e-06, + "loss": 0.0435, + "step": 5023 + }, + { + "epoch": 2.2314012880302023, + "grad_norm": 0.4317246743183953, + "learning_rate": 4.861412373914729e-06, + "loss": 0.0339, + "step": 5024 + }, + { + "epoch": 2.2318454363757496, + "grad_norm": 1.9956444643720284, + "learning_rate": 4.8594745924258144e-06, + "loss": 0.0611, + "step": 5025 + }, + { + "epoch": 2.232289584721297, + "grad_norm": 0.5268379152925193, + "learning_rate": 4.857536832060255e-06, + "loss": 0.0408, + "step": 5026 + }, + { + "epoch": 2.232733733066844, + "grad_norm": 0.4937129380739942, + "learning_rate": 4.85559909310933e-06, + "loss": 0.0523, + "step": 5027 + }, + { + "epoch": 2.233177881412392, + "grad_norm": 0.42104370790377565, + "learning_rate": 4.853661375864313e-06, + "loss": 0.0226, + "step": 5028 + }, + { + "epoch": 2.233622029757939, + "grad_norm": 0.38358747557836487, + "learning_rate": 4.851723680616477e-06, + "loss": 0.0288, + "step": 5029 + }, + { + "epoch": 2.2340661781034865, + "grad_norm": 0.35278248179931115, + "learning_rate": 4.84978600765709e-06, + "loss": 0.0273, + "step": 5030 + }, + { + "epoch": 2.234510326449034, + "grad_norm": 0.5070929341551949, + "learning_rate": 4.84784835727742e-06, + "loss": 0.0421, + "step": 5031 + }, + { + "epoch": 2.2349544747945815, + "grad_norm": 0.4414918243805681, + "learning_rate": 4.845910729768726e-06, + "loss": 0.0299, + "step": 5032 + }, + { + "epoch": 2.235398623140129, + "grad_norm": 0.4272744129564785, + "learning_rate": 4.843973125422266e-06, + "loss": 0.0377, + "step": 5033 + }, + { + "epoch": 2.235842771485676, + "grad_norm": 0.4042780850426815, + "learning_rate": 4.842035544529296e-06, + "loss": 0.0273, + "step": 5034 + }, + { + "epoch": 2.2362869198312234, + "grad_norm": 0.384529185725243, + "learning_rate": 4.8400979873810675e-06, + "loss": 0.0297, + "step": 5035 + }, + { + "epoch": 2.236731068176771, + "grad_norm": 0.5143165626824721, + "learning_rate": 4.838160454268827e-06, + "loss": 0.0378, + "step": 5036 + }, + { + "epoch": 2.2371752165223184, + "grad_norm": 0.48737479457986954, + "learning_rate": 4.8362229454838185e-06, + "loss": 0.0343, + "step": 5037 + }, + { + "epoch": 2.2376193648678657, + "grad_norm": 0.4465431723189805, + "learning_rate": 4.834285461317286e-06, + "loss": 0.0367, + "step": 5038 + }, + { + "epoch": 2.2380635132134135, + "grad_norm": 0.5195671115110356, + "learning_rate": 4.832348002060461e-06, + "loss": 0.0396, + "step": 5039 + }, + { + "epoch": 2.2385076615589607, + "grad_norm": 0.46391257174900896, + "learning_rate": 4.830410568004581e-06, + "loss": 0.036, + "step": 5040 + }, + { + "epoch": 2.238951809904508, + "grad_norm": 0.5290347592358237, + "learning_rate": 4.8284731594408715e-06, + "loss": 0.0448, + "step": 5041 + }, + { + "epoch": 2.2393959582500553, + "grad_norm": 0.4737259034687236, + "learning_rate": 4.826535776660562e-06, + "loss": 0.0328, + "step": 5042 + }, + { + "epoch": 2.239840106595603, + "grad_norm": 0.6929115316947699, + "learning_rate": 4.824598419954871e-06, + "loss": 0.0473, + "step": 5043 + }, + { + "epoch": 2.2402842549411504, + "grad_norm": 0.6258064825937694, + "learning_rate": 4.822661089615017e-06, + "loss": 0.0594, + "step": 5044 + }, + { + "epoch": 2.2407284032866976, + "grad_norm": 0.4537831813620375, + "learning_rate": 4.8207237859322144e-06, + "loss": 0.0413, + "step": 5045 + }, + { + "epoch": 2.2411725516322454, + "grad_norm": 0.37664633790712776, + "learning_rate": 4.818786509197672e-06, + "loss": 0.0303, + "step": 5046 + }, + { + "epoch": 2.2416166999777927, + "grad_norm": 0.3886976621007685, + "learning_rate": 4.816849259702594e-06, + "loss": 0.0311, + "step": 5047 + }, + { + "epoch": 2.24206084832334, + "grad_norm": 0.4486972497289991, + "learning_rate": 4.814912037738185e-06, + "loss": 0.0442, + "step": 5048 + }, + { + "epoch": 2.2425049966688873, + "grad_norm": 0.35002545439343274, + "learning_rate": 4.812974843595641e-06, + "loss": 0.0318, + "step": 5049 + }, + { + "epoch": 2.242949145014435, + "grad_norm": 0.48300556886418516, + "learning_rate": 4.811037677566154e-06, + "loss": 0.0395, + "step": 5050 + }, + { + "epoch": 2.2433932933599823, + "grad_norm": 0.45349978147674774, + "learning_rate": 4.8091005399409145e-06, + "loss": 0.0371, + "step": 5051 + }, + { + "epoch": 2.2438374417055296, + "grad_norm": 0.4518413356936792, + "learning_rate": 4.807163431011107e-06, + "loss": 0.0311, + "step": 5052 + }, + { + "epoch": 2.244281590051077, + "grad_norm": 0.3777032140049169, + "learning_rate": 4.80522635106791e-06, + "loss": 0.0292, + "step": 5053 + }, + { + "epoch": 2.2447257383966246, + "grad_norm": 0.4369243999495971, + "learning_rate": 4.8032893004025016e-06, + "loss": 0.0264, + "step": 5054 + }, + { + "epoch": 2.245169886742172, + "grad_norm": 0.6447452509815467, + "learning_rate": 4.801352279306054e-06, + "loss": 0.0427, + "step": 5055 + }, + { + "epoch": 2.245614035087719, + "grad_norm": 0.5030519012852677, + "learning_rate": 4.799415288069733e-06, + "loss": 0.0381, + "step": 5056 + }, + { + "epoch": 2.246058183433267, + "grad_norm": 0.4348315875534344, + "learning_rate": 4.797478326984702e-06, + "loss": 0.0235, + "step": 5057 + }, + { + "epoch": 2.246502331778814, + "grad_norm": 0.47850023846789697, + "learning_rate": 4.795541396342116e-06, + "loss": 0.036, + "step": 5058 + }, + { + "epoch": 2.2469464801243615, + "grad_norm": 0.5227923023136899, + "learning_rate": 4.793604496433133e-06, + "loss": 0.0382, + "step": 5059 + }, + { + "epoch": 2.247390628469909, + "grad_norm": 0.5404616232699001, + "learning_rate": 4.791667627548902e-06, + "loss": 0.0426, + "step": 5060 + }, + { + "epoch": 2.2478347768154565, + "grad_norm": 0.5031844080280006, + "learning_rate": 4.7897307899805624e-06, + "loss": 0.0296, + "step": 5061 + }, + { + "epoch": 2.248278925161004, + "grad_norm": 0.557884313184048, + "learning_rate": 4.78779398401926e-06, + "loss": 0.0324, + "step": 5062 + }, + { + "epoch": 2.248723073506551, + "grad_norm": 0.46196149242671997, + "learning_rate": 4.785857209956124e-06, + "loss": 0.031, + "step": 5063 + }, + { + "epoch": 2.2491672218520984, + "grad_norm": 0.40473001188667446, + "learning_rate": 4.783920468082288e-06, + "loss": 0.0341, + "step": 5064 + }, + { + "epoch": 2.249611370197646, + "grad_norm": 0.558756976897473, + "learning_rate": 4.781983758688876e-06, + "loss": 0.0257, + "step": 5065 + }, + { + "epoch": 2.2500555185431934, + "grad_norm": 0.4215840569154271, + "learning_rate": 4.780047082067009e-06, + "loss": 0.0402, + "step": 5066 + }, + { + "epoch": 2.2504996668887407, + "grad_norm": 0.41373536715916187, + "learning_rate": 4.778110438507801e-06, + "loss": 0.0352, + "step": 5067 + }, + { + "epoch": 2.2509438152342884, + "grad_norm": 0.41096920941351195, + "learning_rate": 4.776173828302365e-06, + "loss": 0.0411, + "step": 5068 + }, + { + "epoch": 2.2513879635798357, + "grad_norm": 0.4730245858374104, + "learning_rate": 4.774237251741805e-06, + "loss": 0.0327, + "step": 5069 + }, + { + "epoch": 2.251832111925383, + "grad_norm": 0.38391334338943445, + "learning_rate": 4.7723007091172175e-06, + "loss": 0.0368, + "step": 5070 + }, + { + "epoch": 2.2522762602709303, + "grad_norm": 0.34213411038422287, + "learning_rate": 4.770364200719703e-06, + "loss": 0.0288, + "step": 5071 + }, + { + "epoch": 2.252720408616478, + "grad_norm": 0.3470528030999554, + "learning_rate": 4.7684277268403515e-06, + "loss": 0.0218, + "step": 5072 + }, + { + "epoch": 2.2531645569620253, + "grad_norm": 0.48424365825920085, + "learning_rate": 4.766491287770246e-06, + "loss": 0.047, + "step": 5073 + }, + { + "epoch": 2.2536087053075726, + "grad_norm": 0.44693843285607443, + "learning_rate": 4.7645548838004665e-06, + "loss": 0.0314, + "step": 5074 + }, + { + "epoch": 2.2540528536531204, + "grad_norm": 0.43485150995828376, + "learning_rate": 4.762618515222085e-06, + "loss": 0.0342, + "step": 5075 + }, + { + "epoch": 2.2544970019986676, + "grad_norm": 0.4279292260890883, + "learning_rate": 4.760682182326176e-06, + "loss": 0.0407, + "step": 5076 + }, + { + "epoch": 2.254941150344215, + "grad_norm": 0.4726345938038404, + "learning_rate": 4.7587458854038e-06, + "loss": 0.0366, + "step": 5077 + }, + { + "epoch": 2.2553852986897622, + "grad_norm": 0.4982236299340504, + "learning_rate": 4.756809624746015e-06, + "loss": 0.0351, + "step": 5078 + }, + { + "epoch": 2.25582944703531, + "grad_norm": 0.416498003591409, + "learning_rate": 4.754873400643875e-06, + "loss": 0.0345, + "step": 5079 + }, + { + "epoch": 2.2562735953808573, + "grad_norm": 0.5784538425140838, + "learning_rate": 4.7529372133884265e-06, + "loss": 0.0351, + "step": 5080 + }, + { + "epoch": 2.2567177437264045, + "grad_norm": 0.4069962515786798, + "learning_rate": 4.75100106327071e-06, + "loss": 0.0287, + "step": 5081 + }, + { + "epoch": 2.2571618920719523, + "grad_norm": 0.44000239841542776, + "learning_rate": 4.749064950581765e-06, + "loss": 0.0318, + "step": 5082 + }, + { + "epoch": 2.2576060404174996, + "grad_norm": 0.4673614175152977, + "learning_rate": 4.747128875612621e-06, + "loss": 0.0298, + "step": 5083 + }, + { + "epoch": 2.258050188763047, + "grad_norm": 0.577011350942782, + "learning_rate": 4.745192838654304e-06, + "loss": 0.0388, + "step": 5084 + }, + { + "epoch": 2.258494337108594, + "grad_norm": 0.40314614436836343, + "learning_rate": 4.743256839997828e-06, + "loss": 0.0318, + "step": 5085 + }, + { + "epoch": 2.2589384854541414, + "grad_norm": 0.3926832668709334, + "learning_rate": 4.741320879934213e-06, + "loss": 0.0361, + "step": 5086 + }, + { + "epoch": 2.259382633799689, + "grad_norm": 0.5101559666731487, + "learning_rate": 4.739384958754461e-06, + "loss": 0.0451, + "step": 5087 + }, + { + "epoch": 2.2598267821452365, + "grad_norm": 0.35566284403310655, + "learning_rate": 4.73744907674958e-06, + "loss": 0.0228, + "step": 5088 + }, + { + "epoch": 2.2602709304907838, + "grad_norm": 0.5266642057409271, + "learning_rate": 4.7355132342105615e-06, + "loss": 0.0387, + "step": 5089 + }, + { + "epoch": 2.2607150788363315, + "grad_norm": 0.4021115546007318, + "learning_rate": 4.733577431428398e-06, + "loss": 0.0313, + "step": 5090 + }, + { + "epoch": 2.261159227181879, + "grad_norm": 0.48902419392318525, + "learning_rate": 4.73164166869407e-06, + "loss": 0.0465, + "step": 5091 + }, + { + "epoch": 2.261603375527426, + "grad_norm": 0.4831670488263336, + "learning_rate": 4.729705946298557e-06, + "loss": 0.0298, + "step": 5092 + }, + { + "epoch": 2.2620475238729734, + "grad_norm": 1.0045520985152046, + "learning_rate": 4.727770264532835e-06, + "loss": 0.0386, + "step": 5093 + }, + { + "epoch": 2.262491672218521, + "grad_norm": 0.5424294421568248, + "learning_rate": 4.725834623687866e-06, + "loss": 0.0448, + "step": 5094 + }, + { + "epoch": 2.2629358205640684, + "grad_norm": 0.37090231243704935, + "learning_rate": 4.723899024054609e-06, + "loss": 0.0276, + "step": 5095 + }, + { + "epoch": 2.2633799689096157, + "grad_norm": 0.41587596025980633, + "learning_rate": 4.7219634659240195e-06, + "loss": 0.037, + "step": 5096 + }, + { + "epoch": 2.2638241172551634, + "grad_norm": 0.4579599445978751, + "learning_rate": 4.720027949587046e-06, + "loss": 0.0355, + "step": 5097 + }, + { + "epoch": 2.2642682656007107, + "grad_norm": 0.33053691402038254, + "learning_rate": 4.718092475334623e-06, + "loss": 0.0237, + "step": 5098 + }, + { + "epoch": 2.264712413946258, + "grad_norm": 0.4734298805101396, + "learning_rate": 4.716157043457692e-06, + "loss": 0.0406, + "step": 5099 + }, + { + "epoch": 2.2651565622918053, + "grad_norm": 0.495566512713766, + "learning_rate": 4.71422165424718e-06, + "loss": 0.0446, + "step": 5100 + }, + { + "epoch": 2.265600710637353, + "grad_norm": 0.5845335631241523, + "learning_rate": 4.712286307994008e-06, + "loss": 0.0483, + "step": 5101 + }, + { + "epoch": 2.2660448589829003, + "grad_norm": 0.5632138786502331, + "learning_rate": 4.71035100498909e-06, + "loss": 0.0301, + "step": 5102 + }, + { + "epoch": 2.2664890073284476, + "grad_norm": 0.4679864832864739, + "learning_rate": 4.708415745523338e-06, + "loss": 0.0282, + "step": 5103 + }, + { + "epoch": 2.2669331556739953, + "grad_norm": 0.3347447433418278, + "learning_rate": 4.70648052988765e-06, + "loss": 0.0284, + "step": 5104 + }, + { + "epoch": 2.2673773040195426, + "grad_norm": 0.3922788691977759, + "learning_rate": 4.704545358372926e-06, + "loss": 0.0376, + "step": 5105 + }, + { + "epoch": 2.26782145236509, + "grad_norm": 0.574421309924243, + "learning_rate": 4.702610231270053e-06, + "loss": 0.0357, + "step": 5106 + }, + { + "epoch": 2.268265600710637, + "grad_norm": 0.3970980176428026, + "learning_rate": 4.7006751488699145e-06, + "loss": 0.0352, + "step": 5107 + }, + { + "epoch": 2.268709749056185, + "grad_norm": 0.48320849905313684, + "learning_rate": 4.698740111463386e-06, + "loss": 0.0482, + "step": 5108 + }, + { + "epoch": 2.2691538974017322, + "grad_norm": 0.48488451433259316, + "learning_rate": 4.696805119341334e-06, + "loss": 0.0323, + "step": 5109 + }, + { + "epoch": 2.2695980457472795, + "grad_norm": 0.38461440123857293, + "learning_rate": 4.694870172794625e-06, + "loss": 0.0256, + "step": 5110 + }, + { + "epoch": 2.270042194092827, + "grad_norm": 0.5557330047140479, + "learning_rate": 4.692935272114113e-06, + "loss": 0.0303, + "step": 5111 + }, + { + "epoch": 2.2704863424383745, + "grad_norm": 0.4256236131352179, + "learning_rate": 4.6910004175906435e-06, + "loss": 0.0233, + "step": 5112 + }, + { + "epoch": 2.270930490783922, + "grad_norm": 0.46988918438082106, + "learning_rate": 4.689065609515062e-06, + "loss": 0.0363, + "step": 5113 + }, + { + "epoch": 2.271374639129469, + "grad_norm": 0.49373029930466805, + "learning_rate": 4.687130848178202e-06, + "loss": 0.0483, + "step": 5114 + }, + { + "epoch": 2.2718187874750164, + "grad_norm": 0.34972548300588746, + "learning_rate": 4.685196133870887e-06, + "loss": 0.0255, + "step": 5115 + }, + { + "epoch": 2.272262935820564, + "grad_norm": 0.581571662658351, + "learning_rate": 4.683261466883942e-06, + "loss": 0.0414, + "step": 5116 + }, + { + "epoch": 2.2727070841661114, + "grad_norm": 0.44401771745072516, + "learning_rate": 4.681326847508181e-06, + "loss": 0.032, + "step": 5117 + }, + { + "epoch": 2.2731512325116587, + "grad_norm": 0.5278060546522586, + "learning_rate": 4.6793922760344065e-06, + "loss": 0.0418, + "step": 5118 + }, + { + "epoch": 2.2735953808572065, + "grad_norm": 0.5206864377402288, + "learning_rate": 4.6774577527534195e-06, + "loss": 0.0388, + "step": 5119 + }, + { + "epoch": 2.2740395292027538, + "grad_norm": 0.4082716881176943, + "learning_rate": 4.675523277956011e-06, + "loss": 0.0283, + "step": 5120 + }, + { + "epoch": 2.274483677548301, + "grad_norm": 0.4188100723565611, + "learning_rate": 4.673588851932964e-06, + "loss": 0.0377, + "step": 5121 + }, + { + "epoch": 2.2749278258938483, + "grad_norm": 0.32957447472088847, + "learning_rate": 4.671654474975061e-06, + "loss": 0.0284, + "step": 5122 + }, + { + "epoch": 2.275371974239396, + "grad_norm": 0.5325173700148402, + "learning_rate": 4.669720147373065e-06, + "loss": 0.0304, + "step": 5123 + }, + { + "epoch": 2.2758161225849434, + "grad_norm": 0.7259350026566369, + "learning_rate": 4.667785869417744e-06, + "loss": 0.0484, + "step": 5124 + }, + { + "epoch": 2.2762602709304907, + "grad_norm": 0.34263569388234766, + "learning_rate": 4.6658516413998486e-06, + "loss": 0.0302, + "step": 5125 + }, + { + "epoch": 2.2767044192760384, + "grad_norm": 0.4050161385405534, + "learning_rate": 4.663917463610128e-06, + "loss": 0.0327, + "step": 5126 + }, + { + "epoch": 2.2771485676215857, + "grad_norm": 0.4351043792797754, + "learning_rate": 4.661983336339319e-06, + "loss": 0.0347, + "step": 5127 + }, + { + "epoch": 2.277592715967133, + "grad_norm": 0.4544801574632252, + "learning_rate": 4.66004925987816e-06, + "loss": 0.0335, + "step": 5128 + }, + { + "epoch": 2.2780368643126803, + "grad_norm": 0.5298789728156417, + "learning_rate": 4.6581152345173714e-06, + "loss": 0.0481, + "step": 5129 + }, + { + "epoch": 2.278481012658228, + "grad_norm": 0.42879788700967447, + "learning_rate": 4.656181260547669e-06, + "loss": 0.0306, + "step": 5130 + }, + { + "epoch": 2.2789251610037753, + "grad_norm": 0.432457604882885, + "learning_rate": 4.654247338259766e-06, + "loss": 0.0348, + "step": 5131 + }, + { + "epoch": 2.2793693093493226, + "grad_norm": 0.4238026290724064, + "learning_rate": 4.652313467944358e-06, + "loss": 0.0362, + "step": 5132 + }, + { + "epoch": 2.2798134576948703, + "grad_norm": 0.37078187284702663, + "learning_rate": 4.650379649892145e-06, + "loss": 0.0225, + "step": 5133 + }, + { + "epoch": 2.2802576060404176, + "grad_norm": 0.541283465006368, + "learning_rate": 4.648445884393808e-06, + "loss": 0.0427, + "step": 5134 + }, + { + "epoch": 2.280701754385965, + "grad_norm": 0.5756809781245248, + "learning_rate": 4.646512171740028e-06, + "loss": 0.0416, + "step": 5135 + }, + { + "epoch": 2.281145902731512, + "grad_norm": 0.6172689341906578, + "learning_rate": 4.6445785122214715e-06, + "loss": 0.0468, + "step": 5136 + }, + { + "epoch": 2.28159005107706, + "grad_norm": 0.4664467318852636, + "learning_rate": 4.6426449061288035e-06, + "loss": 0.0394, + "step": 5137 + }, + { + "epoch": 2.282034199422607, + "grad_norm": 0.3708061416037573, + "learning_rate": 4.640711353752675e-06, + "loss": 0.0308, + "step": 5138 + }, + { + "epoch": 2.2824783477681545, + "grad_norm": 0.4312428309303351, + "learning_rate": 4.638777855383735e-06, + "loss": 0.0361, + "step": 5139 + }, + { + "epoch": 2.282922496113702, + "grad_norm": 0.3963057474327594, + "learning_rate": 4.636844411312618e-06, + "loss": 0.0244, + "step": 5140 + }, + { + "epoch": 2.2833666444592495, + "grad_norm": 0.48035729602544386, + "learning_rate": 4.634911021829956e-06, + "loss": 0.0388, + "step": 5141 + }, + { + "epoch": 2.283810792804797, + "grad_norm": 0.3951503991589514, + "learning_rate": 4.63297768722637e-06, + "loss": 0.0311, + "step": 5142 + }, + { + "epoch": 2.284254941150344, + "grad_norm": 0.3422700971686053, + "learning_rate": 4.6310444077924705e-06, + "loss": 0.0244, + "step": 5143 + }, + { + "epoch": 2.2846990894958914, + "grad_norm": 0.6947651223374093, + "learning_rate": 4.629111183818863e-06, + "loss": 0.0351, + "step": 5144 + }, + { + "epoch": 2.285143237841439, + "grad_norm": 0.4898604396865276, + "learning_rate": 4.627178015596147e-06, + "loss": 0.0369, + "step": 5145 + }, + { + "epoch": 2.2855873861869864, + "grad_norm": 0.384272758272605, + "learning_rate": 4.625244903414908e-06, + "loss": 0.0378, + "step": 5146 + }, + { + "epoch": 2.2860315345325337, + "grad_norm": 0.6383973512520991, + "learning_rate": 4.623311847565725e-06, + "loss": 0.0371, + "step": 5147 + }, + { + "epoch": 2.2864756828780815, + "grad_norm": 0.43116354217379305, + "learning_rate": 4.621378848339172e-06, + "loss": 0.0385, + "step": 5148 + }, + { + "epoch": 2.2869198312236287, + "grad_norm": 0.37732659096182064, + "learning_rate": 4.619445906025807e-06, + "loss": 0.0302, + "step": 5149 + }, + { + "epoch": 2.287363979569176, + "grad_norm": 0.38619197439251224, + "learning_rate": 4.6175130209161894e-06, + "loss": 0.0309, + "step": 5150 + }, + { + "epoch": 2.2878081279147233, + "grad_norm": 0.3996619270485742, + "learning_rate": 4.615580193300861e-06, + "loss": 0.0342, + "step": 5151 + }, + { + "epoch": 2.288252276260271, + "grad_norm": 0.4975335885399077, + "learning_rate": 4.613647423470361e-06, + "loss": 0.0295, + "step": 5152 + }, + { + "epoch": 2.2886964246058183, + "grad_norm": 0.3800465801231585, + "learning_rate": 4.611714711715215e-06, + "loss": 0.0287, + "step": 5153 + }, + { + "epoch": 2.2891405729513656, + "grad_norm": 0.4187996600512099, + "learning_rate": 4.609782058325944e-06, + "loss": 0.0357, + "step": 5154 + }, + { + "epoch": 2.2895847212969134, + "grad_norm": 0.6627247371766289, + "learning_rate": 4.607849463593056e-06, + "loss": 0.0319, + "step": 5155 + }, + { + "epoch": 2.2900288696424607, + "grad_norm": 0.44696110840918063, + "learning_rate": 4.6059169278070576e-06, + "loss": 0.0439, + "step": 5156 + }, + { + "epoch": 2.290473017988008, + "grad_norm": 0.43597000773880085, + "learning_rate": 4.603984451258439e-06, + "loss": 0.0309, + "step": 5157 + }, + { + "epoch": 2.2909171663335552, + "grad_norm": 0.3416433662316786, + "learning_rate": 4.602052034237684e-06, + "loss": 0.0244, + "step": 5158 + }, + { + "epoch": 2.291361314679103, + "grad_norm": 0.4086413859197415, + "learning_rate": 4.600119677035269e-06, + "loss": 0.0357, + "step": 5159 + }, + { + "epoch": 2.2918054630246503, + "grad_norm": 0.4905561101374712, + "learning_rate": 4.598187379941659e-06, + "loss": 0.036, + "step": 5160 + }, + { + "epoch": 2.2922496113701976, + "grad_norm": 0.47425089286645844, + "learning_rate": 4.59625514324731e-06, + "loss": 0.0344, + "step": 5161 + }, + { + "epoch": 2.2926937597157453, + "grad_norm": 0.4768903592454644, + "learning_rate": 4.594322967242673e-06, + "loss": 0.0445, + "step": 5162 + }, + { + "epoch": 2.2931379080612926, + "grad_norm": 0.6604194357601725, + "learning_rate": 4.592390852218185e-06, + "loss": 0.0353, + "step": 5163 + }, + { + "epoch": 2.29358205640684, + "grad_norm": 0.6477706525687013, + "learning_rate": 4.590458798464275e-06, + "loss": 0.0346, + "step": 5164 + }, + { + "epoch": 2.294026204752387, + "grad_norm": 0.4782427676732749, + "learning_rate": 4.588526806271366e-06, + "loss": 0.0332, + "step": 5165 + }, + { + "epoch": 2.294470353097935, + "grad_norm": 0.33598954590538455, + "learning_rate": 4.5865948759298656e-06, + "loss": 0.0242, + "step": 5166 + }, + { + "epoch": 2.294914501443482, + "grad_norm": 0.5314954081213854, + "learning_rate": 4.58466300773018e-06, + "loss": 0.0375, + "step": 5167 + }, + { + "epoch": 2.2953586497890295, + "grad_norm": 0.45504963601939913, + "learning_rate": 4.582731201962699e-06, + "loss": 0.0325, + "step": 5168 + }, + { + "epoch": 2.2958027981345768, + "grad_norm": 0.5020378540227112, + "learning_rate": 4.5807994589178066e-06, + "loss": 0.0397, + "step": 5169 + }, + { + "epoch": 2.2962469464801245, + "grad_norm": 0.44086322018058155, + "learning_rate": 4.578867778885877e-06, + "loss": 0.0385, + "step": 5170 + }, + { + "epoch": 2.296691094825672, + "grad_norm": 0.4190415887827377, + "learning_rate": 4.5769361621572735e-06, + "loss": 0.0351, + "step": 5171 + }, + { + "epoch": 2.297135243171219, + "grad_norm": 0.40356235040543037, + "learning_rate": 4.575004609022349e-06, + "loss": 0.0384, + "step": 5172 + }, + { + "epoch": 2.2975793915167664, + "grad_norm": 0.5214338427099589, + "learning_rate": 4.573073119771455e-06, + "loss": 0.0414, + "step": 5173 + }, + { + "epoch": 2.298023539862314, + "grad_norm": 0.6042586649916282, + "learning_rate": 4.571141694694922e-06, + "loss": 0.0398, + "step": 5174 + }, + { + "epoch": 2.2984676882078614, + "grad_norm": 0.5451011557742893, + "learning_rate": 4.569210334083077e-06, + "loss": 0.0284, + "step": 5175 + }, + { + "epoch": 2.2989118365534087, + "grad_norm": 0.6693775358695208, + "learning_rate": 4.567279038226237e-06, + "loss": 0.0393, + "step": 5176 + }, + { + "epoch": 2.2993559848989564, + "grad_norm": 0.4565914726092485, + "learning_rate": 4.565347807414709e-06, + "loss": 0.0369, + "step": 5177 + }, + { + "epoch": 2.2998001332445037, + "grad_norm": 0.3909149508886464, + "learning_rate": 4.563416641938786e-06, + "loss": 0.0312, + "step": 5178 + }, + { + "epoch": 2.300244281590051, + "grad_norm": 0.48805937276628275, + "learning_rate": 4.5614855420887595e-06, + "loss": 0.0417, + "step": 5179 + }, + { + "epoch": 2.3006884299355983, + "grad_norm": 0.47062502463527334, + "learning_rate": 4.559554508154906e-06, + "loss": 0.0359, + "step": 5180 + }, + { + "epoch": 2.301132578281146, + "grad_norm": 0.41598990819931514, + "learning_rate": 4.557623540427492e-06, + "loss": 0.0298, + "step": 5181 + }, + { + "epoch": 2.3015767266266933, + "grad_norm": 0.5301634737897044, + "learning_rate": 4.555692639196774e-06, + "loss": 0.0419, + "step": 5182 + }, + { + "epoch": 2.3020208749722406, + "grad_norm": 0.3965912885980032, + "learning_rate": 4.553761804752997e-06, + "loss": 0.0268, + "step": 5183 + }, + { + "epoch": 2.3024650233177884, + "grad_norm": 0.3853380797199742, + "learning_rate": 4.551831037386405e-06, + "loss": 0.0296, + "step": 5184 + }, + { + "epoch": 2.3029091716633356, + "grad_norm": 0.4626560137060772, + "learning_rate": 4.54990033738722e-06, + "loss": 0.0347, + "step": 5185 + }, + { + "epoch": 2.303353320008883, + "grad_norm": 0.4662767664074809, + "learning_rate": 4.54796970504566e-06, + "loss": 0.0382, + "step": 5186 + }, + { + "epoch": 2.3037974683544302, + "grad_norm": 0.5212735203918284, + "learning_rate": 4.546039140651932e-06, + "loss": 0.0347, + "step": 5187 + }, + { + "epoch": 2.304241616699978, + "grad_norm": 0.6660381074871294, + "learning_rate": 4.544108644496232e-06, + "loss": 0.0414, + "step": 5188 + }, + { + "epoch": 2.3046857650455252, + "grad_norm": 0.49995652041639127, + "learning_rate": 4.542178216868746e-06, + "loss": 0.0394, + "step": 5189 + }, + { + "epoch": 2.3051299133910725, + "grad_norm": 0.38693338487127227, + "learning_rate": 4.540247858059654e-06, + "loss": 0.0244, + "step": 5190 + }, + { + "epoch": 2.3055740617366203, + "grad_norm": 0.42802350164666275, + "learning_rate": 4.538317568359117e-06, + "loss": 0.0234, + "step": 5191 + }, + { + "epoch": 2.3060182100821676, + "grad_norm": 0.33959690635237333, + "learning_rate": 4.536387348057292e-06, + "loss": 0.0273, + "step": 5192 + }, + { + "epoch": 2.306462358427715, + "grad_norm": 0.560656839558957, + "learning_rate": 4.5344571974443255e-06, + "loss": 0.0361, + "step": 5193 + }, + { + "epoch": 2.306906506773262, + "grad_norm": 0.3618868056091977, + "learning_rate": 4.5325271168103496e-06, + "loss": 0.0289, + "step": 5194 + }, + { + "epoch": 2.3073506551188094, + "grad_norm": 0.38869936597852134, + "learning_rate": 4.530597106445487e-06, + "loss": 0.0335, + "step": 5195 + }, + { + "epoch": 2.307794803464357, + "grad_norm": 0.451335091229613, + "learning_rate": 4.528667166639855e-06, + "loss": 0.0293, + "step": 5196 + }, + { + "epoch": 2.3082389518099045, + "grad_norm": 0.3756798518286319, + "learning_rate": 4.526737297683554e-06, + "loss": 0.0307, + "step": 5197 + }, + { + "epoch": 2.3086831001554518, + "grad_norm": 0.41638755620692586, + "learning_rate": 4.524807499866678e-06, + "loss": 0.0313, + "step": 5198 + }, + { + "epoch": 2.3091272485009995, + "grad_norm": 0.49512523947166825, + "learning_rate": 4.522877773479305e-06, + "loss": 0.0341, + "step": 5199 + }, + { + "epoch": 2.3095713968465468, + "grad_norm": 0.38110036222157506, + "learning_rate": 4.520948118811508e-06, + "loss": 0.0301, + "step": 5200 + }, + { + "epoch": 2.310015545192094, + "grad_norm": 0.4176831056196918, + "learning_rate": 4.519018536153346e-06, + "loss": 0.0325, + "step": 5201 + }, + { + "epoch": 2.3104596935376414, + "grad_norm": 0.37358674774847195, + "learning_rate": 4.517089025794869e-06, + "loss": 0.0223, + "step": 5202 + }, + { + "epoch": 2.310903841883189, + "grad_norm": 0.5722574491295905, + "learning_rate": 4.515159588026114e-06, + "loss": 0.0385, + "step": 5203 + }, + { + "epoch": 2.3113479902287364, + "grad_norm": 0.3452449788849875, + "learning_rate": 4.51323022313711e-06, + "loss": 0.0239, + "step": 5204 + }, + { + "epoch": 2.3117921385742837, + "grad_norm": 0.5594235169002529, + "learning_rate": 4.511300931417872e-06, + "loss": 0.0378, + "step": 5205 + }, + { + "epoch": 2.3122362869198314, + "grad_norm": 0.4629383899400429, + "learning_rate": 4.509371713158404e-06, + "loss": 0.0465, + "step": 5206 + }, + { + "epoch": 2.3126804352653787, + "grad_norm": 0.344536109032876, + "learning_rate": 4.507442568648702e-06, + "loss": 0.0214, + "step": 5207 + }, + { + "epoch": 2.313124583610926, + "grad_norm": 0.5327665523608216, + "learning_rate": 4.505513498178752e-06, + "loss": 0.0474, + "step": 5208 + }, + { + "epoch": 2.3135687319564733, + "grad_norm": 0.43197164521390413, + "learning_rate": 4.503584502038521e-06, + "loss": 0.0307, + "step": 5209 + }, + { + "epoch": 2.314012880302021, + "grad_norm": 0.46940506061763837, + "learning_rate": 4.501655580517972e-06, + "loss": 0.0405, + "step": 5210 + }, + { + "epoch": 2.3144570286475683, + "grad_norm": 0.3043189475892609, + "learning_rate": 4.499726733907056e-06, + "loss": 0.0208, + "step": 5211 + }, + { + "epoch": 2.3149011769931156, + "grad_norm": 0.5162978477725472, + "learning_rate": 4.497797962495707e-06, + "loss": 0.0417, + "step": 5212 + }, + { + "epoch": 2.3153453253386633, + "grad_norm": 0.4166068328105937, + "learning_rate": 4.495869266573857e-06, + "loss": 0.0363, + "step": 5213 + }, + { + "epoch": 2.3157894736842106, + "grad_norm": 0.5916403656140393, + "learning_rate": 4.49394064643142e-06, + "loss": 0.0401, + "step": 5214 + }, + { + "epoch": 2.316233622029758, + "grad_norm": 0.6022617123765319, + "learning_rate": 4.492012102358301e-06, + "loss": 0.0307, + "step": 5215 + }, + { + "epoch": 2.316677770375305, + "grad_norm": 0.4369169634282751, + "learning_rate": 4.490083634644391e-06, + "loss": 0.042, + "step": 5216 + }, + { + "epoch": 2.317121918720853, + "grad_norm": 0.5201884422005028, + "learning_rate": 4.488155243579574e-06, + "loss": 0.0353, + "step": 5217 + }, + { + "epoch": 2.3175660670664002, + "grad_norm": 0.4192988414508763, + "learning_rate": 4.486226929453716e-06, + "loss": 0.0347, + "step": 5218 + }, + { + "epoch": 2.3180102154119475, + "grad_norm": 0.47826969893341476, + "learning_rate": 4.4842986925566805e-06, + "loss": 0.0412, + "step": 5219 + }, + { + "epoch": 2.3184543637574953, + "grad_norm": 0.4411896998427744, + "learning_rate": 4.482370533178311e-06, + "loss": 0.0473, + "step": 5220 + }, + { + "epoch": 2.3188985121030425, + "grad_norm": 0.7154366399859141, + "learning_rate": 4.4804424516084435e-06, + "loss": 0.0318, + "step": 5221 + }, + { + "epoch": 2.31934266044859, + "grad_norm": 0.4486899236378927, + "learning_rate": 4.478514448136901e-06, + "loss": 0.03, + "step": 5222 + }, + { + "epoch": 2.319786808794137, + "grad_norm": 0.6696629896177303, + "learning_rate": 4.476586523053494e-06, + "loss": 0.0447, + "step": 5223 + }, + { + "epoch": 2.3202309571396844, + "grad_norm": 0.36764590834428945, + "learning_rate": 4.474658676648025e-06, + "loss": 0.0317, + "step": 5224 + }, + { + "epoch": 2.320675105485232, + "grad_norm": 0.4074350155089678, + "learning_rate": 4.4727309092102825e-06, + "loss": 0.0338, + "step": 5225 + }, + { + "epoch": 2.3211192538307794, + "grad_norm": 0.5410060660419875, + "learning_rate": 4.47080322103004e-06, + "loss": 0.0406, + "step": 5226 + }, + { + "epoch": 2.3215634021763267, + "grad_norm": 0.3869245785320258, + "learning_rate": 4.4688756123970625e-06, + "loss": 0.025, + "step": 5227 + }, + { + "epoch": 2.3220075505218745, + "grad_norm": 0.3306613006808203, + "learning_rate": 4.466948083601103e-06, + "loss": 0.0216, + "step": 5228 + }, + { + "epoch": 2.3224516988674218, + "grad_norm": 0.3885322170673341, + "learning_rate": 4.4650206349319e-06, + "loss": 0.0334, + "step": 5229 + }, + { + "epoch": 2.322895847212969, + "grad_norm": 0.45727532111911046, + "learning_rate": 4.463093266679185e-06, + "loss": 0.0314, + "step": 5230 + }, + { + "epoch": 2.3233399955585163, + "grad_norm": 0.40470162685593614, + "learning_rate": 4.4611659791326726e-06, + "loss": 0.0225, + "step": 5231 + }, + { + "epoch": 2.323784143904064, + "grad_norm": 0.3709631261080152, + "learning_rate": 4.459238772582067e-06, + "loss": 0.0281, + "step": 5232 + }, + { + "epoch": 2.3242282922496114, + "grad_norm": 0.5000330379973269, + "learning_rate": 4.457311647317058e-06, + "loss": 0.0408, + "step": 5233 + }, + { + "epoch": 2.3246724405951587, + "grad_norm": 0.40795990787328895, + "learning_rate": 4.4553846036273294e-06, + "loss": 0.0304, + "step": 5234 + }, + { + "epoch": 2.3251165889407064, + "grad_norm": 0.41652506953547214, + "learning_rate": 4.453457641802542e-06, + "loss": 0.0283, + "step": 5235 + }, + { + "epoch": 2.3255607372862537, + "grad_norm": 0.422544506737885, + "learning_rate": 4.451530762132359e-06, + "loss": 0.0247, + "step": 5236 + }, + { + "epoch": 2.326004885631801, + "grad_norm": 0.42941673106868544, + "learning_rate": 4.4496039649064185e-06, + "loss": 0.0271, + "step": 5237 + }, + { + "epoch": 2.3264490339773483, + "grad_norm": 0.5518213591251218, + "learning_rate": 4.4476772504143525e-06, + "loss": 0.0283, + "step": 5238 + }, + { + "epoch": 2.326893182322896, + "grad_norm": 0.3905942457168205, + "learning_rate": 4.445750618945778e-06, + "loss": 0.0338, + "step": 5239 + }, + { + "epoch": 2.3273373306684433, + "grad_norm": 0.5264154088819042, + "learning_rate": 4.443824070790298e-06, + "loss": 0.0373, + "step": 5240 + }, + { + "epoch": 2.3277814790139906, + "grad_norm": 0.4859563195815842, + "learning_rate": 4.4418976062375095e-06, + "loss": 0.0472, + "step": 5241 + }, + { + "epoch": 2.3282256273595383, + "grad_norm": 0.43082619257296717, + "learning_rate": 4.439971225576992e-06, + "loss": 0.0321, + "step": 5242 + }, + { + "epoch": 2.3286697757050856, + "grad_norm": 0.4148209810054243, + "learning_rate": 4.438044929098312e-06, + "loss": 0.027, + "step": 5243 + }, + { + "epoch": 2.329113924050633, + "grad_norm": 0.3906796792959702, + "learning_rate": 4.436118717091025e-06, + "loss": 0.0302, + "step": 5244 + }, + { + "epoch": 2.32955807239618, + "grad_norm": 0.5074169188006878, + "learning_rate": 4.434192589844674e-06, + "loss": 0.033, + "step": 5245 + }, + { + "epoch": 2.330002220741728, + "grad_norm": 0.5700168097501661, + "learning_rate": 4.432266547648786e-06, + "loss": 0.0386, + "step": 5246 + }, + { + "epoch": 2.330446369087275, + "grad_norm": 0.7002894794054985, + "learning_rate": 4.430340590792883e-06, + "loss": 0.0452, + "step": 5247 + }, + { + "epoch": 2.3308905174328225, + "grad_norm": 0.43587113620478884, + "learning_rate": 4.428414719566464e-06, + "loss": 0.0455, + "step": 5248 + }, + { + "epoch": 2.33133466577837, + "grad_norm": 0.4414200442172319, + "learning_rate": 4.426488934259023e-06, + "loss": 0.038, + "step": 5249 + }, + { + "epoch": 2.3317788141239175, + "grad_norm": 0.3603980656114873, + "learning_rate": 4.424563235160039e-06, + "loss": 0.0284, + "step": 5250 + }, + { + "epoch": 2.332222962469465, + "grad_norm": 0.3831150016755091, + "learning_rate": 4.422637622558973e-06, + "loss": 0.0302, + "step": 5251 + }, + { + "epoch": 2.332667110815012, + "grad_norm": 0.3992416324305334, + "learning_rate": 4.42071209674528e-06, + "loss": 0.0324, + "step": 5252 + }, + { + "epoch": 2.3331112591605594, + "grad_norm": 0.4795655074267854, + "learning_rate": 4.4187866580084005e-06, + "loss": 0.0456, + "step": 5253 + }, + { + "epoch": 2.333555407506107, + "grad_norm": 0.3484662534276821, + "learning_rate": 4.41686130663776e-06, + "loss": 0.0223, + "step": 5254 + }, + { + "epoch": 2.3339995558516544, + "grad_norm": 0.4234114697000452, + "learning_rate": 4.4149360429227695e-06, + "loss": 0.0296, + "step": 5255 + }, + { + "epoch": 2.3344437041972017, + "grad_norm": 0.4632049279941834, + "learning_rate": 4.4130108671528315e-06, + "loss": 0.0264, + "step": 5256 + }, + { + "epoch": 2.3348878525427494, + "grad_norm": 0.5054504530004318, + "learning_rate": 4.41108577961733e-06, + "loss": 0.0423, + "step": 5257 + }, + { + "epoch": 2.3353320008882967, + "grad_norm": 0.4132515605214912, + "learning_rate": 4.40916078060564e-06, + "loss": 0.026, + "step": 5258 + }, + { + "epoch": 2.335776149233844, + "grad_norm": 0.4886399938932667, + "learning_rate": 4.407235870407122e-06, + "loss": 0.0378, + "step": 5259 + }, + { + "epoch": 2.3362202975793913, + "grad_norm": 0.48169156288056764, + "learning_rate": 4.4053110493111226e-06, + "loss": 0.0366, + "step": 5260 + }, + { + "epoch": 2.336664445924939, + "grad_norm": 0.5264104312524703, + "learning_rate": 4.403386317606972e-06, + "loss": 0.0427, + "step": 5261 + }, + { + "epoch": 2.3371085942704863, + "grad_norm": 0.3726652002849068, + "learning_rate": 4.4014616755839955e-06, + "loss": 0.026, + "step": 5262 + }, + { + "epoch": 2.3375527426160336, + "grad_norm": 0.531223484070124, + "learning_rate": 4.399537123531494e-06, + "loss": 0.0389, + "step": 5263 + }, + { + "epoch": 2.3379968909615814, + "grad_norm": 0.5377515081195285, + "learning_rate": 4.3976126617387645e-06, + "loss": 0.0466, + "step": 5264 + }, + { + "epoch": 2.3384410393071287, + "grad_norm": 0.4256267011064791, + "learning_rate": 4.395688290495084e-06, + "loss": 0.0381, + "step": 5265 + }, + { + "epoch": 2.338885187652676, + "grad_norm": 0.4792927506755932, + "learning_rate": 4.393764010089719e-06, + "loss": 0.046, + "step": 5266 + }, + { + "epoch": 2.3393293359982232, + "grad_norm": 0.5281025771868061, + "learning_rate": 4.391839820811923e-06, + "loss": 0.0396, + "step": 5267 + }, + { + "epoch": 2.339773484343771, + "grad_norm": 0.4893018865311666, + "learning_rate": 4.389915722950931e-06, + "loss": 0.0368, + "step": 5268 + }, + { + "epoch": 2.3402176326893183, + "grad_norm": 0.4896668042908599, + "learning_rate": 4.387991716795968e-06, + "loss": 0.0406, + "step": 5269 + }, + { + "epoch": 2.3406617810348656, + "grad_norm": 0.39529432125848185, + "learning_rate": 4.386067802636249e-06, + "loss": 0.0323, + "step": 5270 + }, + { + "epoch": 2.3411059293804133, + "grad_norm": 0.38393804306481244, + "learning_rate": 4.384143980760968e-06, + "loss": 0.0321, + "step": 5271 + }, + { + "epoch": 2.3415500777259606, + "grad_norm": 0.4030478157940605, + "learning_rate": 4.382220251459306e-06, + "loss": 0.0348, + "step": 5272 + }, + { + "epoch": 2.341994226071508, + "grad_norm": 0.46335758617758316, + "learning_rate": 4.380296615020437e-06, + "loss": 0.0481, + "step": 5273 + }, + { + "epoch": 2.342438374417055, + "grad_norm": 0.39799741674310996, + "learning_rate": 4.3783730717335124e-06, + "loss": 0.0365, + "step": 5274 + }, + { + "epoch": 2.342882522762603, + "grad_norm": 0.6313177165283153, + "learning_rate": 4.376449621887674e-06, + "loss": 0.0357, + "step": 5275 + }, + { + "epoch": 2.34332667110815, + "grad_norm": 0.46331907567288616, + "learning_rate": 4.37452626577205e-06, + "loss": 0.0404, + "step": 5276 + }, + { + "epoch": 2.3437708194536975, + "grad_norm": 0.45182075889541223, + "learning_rate": 4.372603003675755e-06, + "loss": 0.0273, + "step": 5277 + }, + { + "epoch": 2.3442149677992448, + "grad_norm": 0.45396241594109277, + "learning_rate": 4.370679835887885e-06, + "loss": 0.0334, + "step": 5278 + }, + { + "epoch": 2.3446591161447925, + "grad_norm": 0.4008935982764722, + "learning_rate": 4.368756762697525e-06, + "loss": 0.0295, + "step": 5279 + }, + { + "epoch": 2.34510326449034, + "grad_norm": 0.4497120560175991, + "learning_rate": 4.366833784393746e-06, + "loss": 0.034, + "step": 5280 + }, + { + "epoch": 2.345547412835887, + "grad_norm": 0.5274538552861234, + "learning_rate": 4.364910901265607e-06, + "loss": 0.0358, + "step": 5281 + }, + { + "epoch": 2.3459915611814344, + "grad_norm": 0.5012732757402258, + "learning_rate": 4.362988113602147e-06, + "loss": 0.0351, + "step": 5282 + }, + { + "epoch": 2.346435709526982, + "grad_norm": 0.41702907826337277, + "learning_rate": 4.361065421692394e-06, + "loss": 0.0311, + "step": 5283 + }, + { + "epoch": 2.3468798578725294, + "grad_norm": 0.4723862662210319, + "learning_rate": 4.3591428258253634e-06, + "loss": 0.0283, + "step": 5284 + }, + { + "epoch": 2.3473240062180767, + "grad_norm": 0.4642888220640226, + "learning_rate": 4.35722032629005e-06, + "loss": 0.0341, + "step": 5285 + }, + { + "epoch": 2.3477681545636244, + "grad_norm": 0.5515201989991365, + "learning_rate": 4.35529792337544e-06, + "loss": 0.0426, + "step": 5286 + }, + { + "epoch": 2.3482123029091717, + "grad_norm": 0.42814364810008887, + "learning_rate": 4.353375617370506e-06, + "loss": 0.0386, + "step": 5287 + }, + { + "epoch": 2.348656451254719, + "grad_norm": 0.422713836528967, + "learning_rate": 4.3514534085642e-06, + "loss": 0.0477, + "step": 5288 + }, + { + "epoch": 2.3491005996002663, + "grad_norm": 0.388857891163739, + "learning_rate": 4.349531297245464e-06, + "loss": 0.0284, + "step": 5289 + }, + { + "epoch": 2.349544747945814, + "grad_norm": 0.33430013810360537, + "learning_rate": 4.347609283703224e-06, + "loss": 0.0222, + "step": 5290 + }, + { + "epoch": 2.3499888962913613, + "grad_norm": 0.4550663748035299, + "learning_rate": 4.345687368226391e-06, + "loss": 0.0484, + "step": 5291 + }, + { + "epoch": 2.3504330446369086, + "grad_norm": 0.4904858360330206, + "learning_rate": 4.343765551103859e-06, + "loss": 0.0431, + "step": 5292 + }, + { + "epoch": 2.3508771929824563, + "grad_norm": 0.418865690563144, + "learning_rate": 4.3418438326245134e-06, + "loss": 0.0264, + "step": 5293 + }, + { + "epoch": 2.3513213413280036, + "grad_norm": 0.3313326958383252, + "learning_rate": 4.33992221307722e-06, + "loss": 0.0359, + "step": 5294 + }, + { + "epoch": 2.351765489673551, + "grad_norm": 0.38342759888274064, + "learning_rate": 4.338000692750832e-06, + "loss": 0.03, + "step": 5295 + }, + { + "epoch": 2.352209638019098, + "grad_norm": 0.6016424768870177, + "learning_rate": 4.336079271934184e-06, + "loss": 0.0432, + "step": 5296 + }, + { + "epoch": 2.352653786364646, + "grad_norm": 0.5554548071020693, + "learning_rate": 4.334157950916098e-06, + "loss": 0.0659, + "step": 5297 + }, + { + "epoch": 2.3530979347101932, + "grad_norm": 0.5014661231015702, + "learning_rate": 4.332236729985385e-06, + "loss": 0.0343, + "step": 5298 + }, + { + "epoch": 2.3535420830557405, + "grad_norm": 0.5146747988924121, + "learning_rate": 4.330315609430835e-06, + "loss": 0.0351, + "step": 5299 + }, + { + "epoch": 2.3539862314012883, + "grad_norm": 0.3876173155044, + "learning_rate": 4.328394589541223e-06, + "loss": 0.0282, + "step": 5300 + }, + { + "epoch": 2.3544303797468356, + "grad_norm": 0.414824658223231, + "learning_rate": 4.326473670605315e-06, + "loss": 0.0323, + "step": 5301 + }, + { + "epoch": 2.354874528092383, + "grad_norm": 0.47728404717752354, + "learning_rate": 4.324552852911854e-06, + "loss": 0.0459, + "step": 5302 + }, + { + "epoch": 2.35531867643793, + "grad_norm": 0.40142086992053616, + "learning_rate": 4.322632136749572e-06, + "loss": 0.0329, + "step": 5303 + }, + { + "epoch": 2.355762824783478, + "grad_norm": 0.4704980131084727, + "learning_rate": 4.3207115224071874e-06, + "loss": 0.0366, + "step": 5304 + }, + { + "epoch": 2.356206973129025, + "grad_norm": 0.402777301290612, + "learning_rate": 4.318791010173401e-06, + "loss": 0.0319, + "step": 5305 + }, + { + "epoch": 2.3566511214745725, + "grad_norm": 0.41278158938376935, + "learning_rate": 4.316870600336896e-06, + "loss": 0.0217, + "step": 5306 + }, + { + "epoch": 2.3570952698201197, + "grad_norm": 0.5124207452989102, + "learning_rate": 4.314950293186346e-06, + "loss": 0.0398, + "step": 5307 + }, + { + "epoch": 2.3575394181656675, + "grad_norm": 0.4244779016373727, + "learning_rate": 4.3130300890104035e-06, + "loss": 0.0263, + "step": 5308 + }, + { + "epoch": 2.3579835665112148, + "grad_norm": 0.6692076129286184, + "learning_rate": 4.311109988097706e-06, + "loss": 0.0347, + "step": 5309 + }, + { + "epoch": 2.358427714856762, + "grad_norm": 0.5345819569804751, + "learning_rate": 4.30918999073688e-06, + "loss": 0.0406, + "step": 5310 + }, + { + "epoch": 2.3588718632023093, + "grad_norm": 0.4416731105459865, + "learning_rate": 4.307270097216535e-06, + "loss": 0.0478, + "step": 5311 + }, + { + "epoch": 2.359316011547857, + "grad_norm": 0.4789720007194078, + "learning_rate": 4.305350307825261e-06, + "loss": 0.0347, + "step": 5312 + }, + { + "epoch": 2.3597601598934044, + "grad_norm": 0.39908333172506805, + "learning_rate": 4.303430622851635e-06, + "loss": 0.0346, + "step": 5313 + }, + { + "epoch": 2.3602043082389517, + "grad_norm": 0.49076992885563064, + "learning_rate": 4.301511042584219e-06, + "loss": 0.035, + "step": 5314 + }, + { + "epoch": 2.3606484565844994, + "grad_norm": 0.3862322602452697, + "learning_rate": 4.29959156731156e-06, + "loss": 0.0409, + "step": 5315 + }, + { + "epoch": 2.3610926049300467, + "grad_norm": 0.4148195535388891, + "learning_rate": 4.297672197322186e-06, + "loss": 0.0382, + "step": 5316 + }, + { + "epoch": 2.361536753275594, + "grad_norm": 0.5373449885131869, + "learning_rate": 4.29575293290461e-06, + "loss": 0.0371, + "step": 5317 + }, + { + "epoch": 2.3619809016211413, + "grad_norm": 0.41088601906859734, + "learning_rate": 4.293833774347333e-06, + "loss": 0.0297, + "step": 5318 + }, + { + "epoch": 2.362425049966689, + "grad_norm": 1.1416110662239514, + "learning_rate": 4.291914721938835e-06, + "loss": 0.0382, + "step": 5319 + }, + { + "epoch": 2.3628691983122363, + "grad_norm": 0.4504483504568535, + "learning_rate": 4.289995775967581e-06, + "loss": 0.0297, + "step": 5320 + }, + { + "epoch": 2.3633133466577836, + "grad_norm": 0.38320982061969394, + "learning_rate": 4.2880769367220234e-06, + "loss": 0.024, + "step": 5321 + }, + { + "epoch": 2.3637574950033313, + "grad_norm": 0.3866521874991146, + "learning_rate": 4.2861582044905966e-06, + "loss": 0.0264, + "step": 5322 + }, + { + "epoch": 2.3642016433488786, + "grad_norm": 0.5381622631772952, + "learning_rate": 4.284239579561718e-06, + "loss": 0.0514, + "step": 5323 + }, + { + "epoch": 2.364645791694426, + "grad_norm": 0.4285272602850889, + "learning_rate": 4.282321062223788e-06, + "loss": 0.0365, + "step": 5324 + }, + { + "epoch": 2.365089940039973, + "grad_norm": 0.5903609551581849, + "learning_rate": 4.280402652765194e-06, + "loss": 0.0428, + "step": 5325 + }, + { + "epoch": 2.365534088385521, + "grad_norm": 0.6051857396994949, + "learning_rate": 4.278484351474303e-06, + "loss": 0.0336, + "step": 5326 + }, + { + "epoch": 2.365978236731068, + "grad_norm": 0.569315522539356, + "learning_rate": 4.2765661586394736e-06, + "loss": 0.0334, + "step": 5327 + }, + { + "epoch": 2.3664223850766155, + "grad_norm": 0.6976731675123818, + "learning_rate": 4.2746480745490385e-06, + "loss": 0.0367, + "step": 5328 + }, + { + "epoch": 2.3668665334221632, + "grad_norm": 0.5611945388042203, + "learning_rate": 4.272730099491319e-06, + "loss": 0.0407, + "step": 5329 + }, + { + "epoch": 2.3673106817677105, + "grad_norm": 0.49730153273797245, + "learning_rate": 4.27081223375462e-06, + "loss": 0.038, + "step": 5330 + }, + { + "epoch": 2.367754830113258, + "grad_norm": 0.5313463154313218, + "learning_rate": 4.268894477627229e-06, + "loss": 0.0435, + "step": 5331 + }, + { + "epoch": 2.368198978458805, + "grad_norm": 0.48983566744126533, + "learning_rate": 4.2669768313974155e-06, + "loss": 0.0373, + "step": 5332 + }, + { + "epoch": 2.3686431268043524, + "grad_norm": 0.5179274904316602, + "learning_rate": 4.265059295353439e-06, + "loss": 0.0445, + "step": 5333 + }, + { + "epoch": 2.3690872751499, + "grad_norm": 0.5588093911942073, + "learning_rate": 4.2631418697835335e-06, + "loss": 0.0351, + "step": 5334 + }, + { + "epoch": 2.3695314234954474, + "grad_norm": 0.3586724840114786, + "learning_rate": 4.261224554975923e-06, + "loss": 0.0214, + "step": 5335 + }, + { + "epoch": 2.3699755718409947, + "grad_norm": 0.418317991811463, + "learning_rate": 4.259307351218812e-06, + "loss": 0.0301, + "step": 5336 + }, + { + "epoch": 2.3704197201865425, + "grad_norm": 0.33540637763336467, + "learning_rate": 4.2573902588003844e-06, + "loss": 0.023, + "step": 5337 + }, + { + "epoch": 2.3708638685320897, + "grad_norm": 0.45375734251758604, + "learning_rate": 4.2554732780088185e-06, + "loss": 0.0394, + "step": 5338 + }, + { + "epoch": 2.371308016877637, + "grad_norm": 0.5337079104144298, + "learning_rate": 4.253556409132267e-06, + "loss": 0.0441, + "step": 5339 + }, + { + "epoch": 2.3717521652231843, + "grad_norm": 0.5739232408785224, + "learning_rate": 4.251639652458866e-06, + "loss": 0.0417, + "step": 5340 + }, + { + "epoch": 2.372196313568732, + "grad_norm": 0.6130343197365772, + "learning_rate": 4.249723008276737e-06, + "loss": 0.0387, + "step": 5341 + }, + { + "epoch": 2.3726404619142794, + "grad_norm": 0.4104117521503695, + "learning_rate": 4.247806476873987e-06, + "loss": 0.033, + "step": 5342 + }, + { + "epoch": 2.3730846102598266, + "grad_norm": 0.48677001689089716, + "learning_rate": 4.245890058538697e-06, + "loss": 0.0377, + "step": 5343 + }, + { + "epoch": 2.3735287586053744, + "grad_norm": 0.3714192923602575, + "learning_rate": 4.2439737535589455e-06, + "loss": 0.0294, + "step": 5344 + }, + { + "epoch": 2.3739729069509217, + "grad_norm": 0.40375546238165533, + "learning_rate": 4.2420575622227786e-06, + "loss": 0.0314, + "step": 5345 + }, + { + "epoch": 2.374417055296469, + "grad_norm": 0.7948951370956706, + "learning_rate": 4.240141484818238e-06, + "loss": 0.0393, + "step": 5346 + }, + { + "epoch": 2.3748612036420162, + "grad_norm": 0.7705250912114215, + "learning_rate": 4.238225521633339e-06, + "loss": 0.0446, + "step": 5347 + }, + { + "epoch": 2.375305351987564, + "grad_norm": 0.5226003605622348, + "learning_rate": 4.2363096729560824e-06, + "loss": 0.0397, + "step": 5348 + }, + { + "epoch": 2.3757495003331113, + "grad_norm": 0.37536659049033705, + "learning_rate": 4.234393939074456e-06, + "loss": 0.0264, + "step": 5349 + }, + { + "epoch": 2.3761936486786586, + "grad_norm": 0.35662293585999577, + "learning_rate": 4.2324783202764265e-06, + "loss": 0.0294, + "step": 5350 + }, + { + "epoch": 2.3766377970242063, + "grad_norm": 0.5673307864129802, + "learning_rate": 4.230562816849944e-06, + "loss": 0.0471, + "step": 5351 + }, + { + "epoch": 2.3770819453697536, + "grad_norm": 0.42418386406735814, + "learning_rate": 4.228647429082939e-06, + "loss": 0.0306, + "step": 5352 + }, + { + "epoch": 2.377526093715301, + "grad_norm": 0.5746067334007909, + "learning_rate": 4.22673215726333e-06, + "loss": 0.031, + "step": 5353 + }, + { + "epoch": 2.377970242060848, + "grad_norm": 0.3492096810181002, + "learning_rate": 4.224817001679011e-06, + "loss": 0.021, + "step": 5354 + }, + { + "epoch": 2.378414390406396, + "grad_norm": 0.5532015823221401, + "learning_rate": 4.222901962617867e-06, + "loss": 0.0432, + "step": 5355 + }, + { + "epoch": 2.378858538751943, + "grad_norm": 0.5532405005608227, + "learning_rate": 4.220987040367757e-06, + "loss": 0.0407, + "step": 5356 + }, + { + "epoch": 2.3793026870974905, + "grad_norm": 0.35180832498879067, + "learning_rate": 4.219072235216529e-06, + "loss": 0.028, + "step": 5357 + }, + { + "epoch": 2.379746835443038, + "grad_norm": 0.7448501299403424, + "learning_rate": 4.2171575474520084e-06, + "loss": 0.0328, + "step": 5358 + }, + { + "epoch": 2.3801909837885855, + "grad_norm": 0.4554992316854567, + "learning_rate": 4.215242977362009e-06, + "loss": 0.0301, + "step": 5359 + }, + { + "epoch": 2.380635132134133, + "grad_norm": 0.5712009195638152, + "learning_rate": 4.213328525234317e-06, + "loss": 0.0344, + "step": 5360 + }, + { + "epoch": 2.38107928047968, + "grad_norm": 0.46746160638085626, + "learning_rate": 4.211414191356714e-06, + "loss": 0.0387, + "step": 5361 + }, + { + "epoch": 2.3815234288252274, + "grad_norm": 0.4318040553120997, + "learning_rate": 4.209499976016953e-06, + "loss": 0.0359, + "step": 5362 + }, + { + "epoch": 2.381967577170775, + "grad_norm": 0.4278461319811184, + "learning_rate": 4.2075858795027745e-06, + "loss": 0.0368, + "step": 5363 + }, + { + "epoch": 2.3824117255163224, + "grad_norm": 0.32663597876713696, + "learning_rate": 4.205671902101899e-06, + "loss": 0.0256, + "step": 5364 + }, + { + "epoch": 2.3828558738618697, + "grad_norm": 0.389353159520307, + "learning_rate": 4.203758044102029e-06, + "loss": 0.0259, + "step": 5365 + }, + { + "epoch": 2.3833000222074174, + "grad_norm": 0.5653053154481631, + "learning_rate": 4.2018443057908495e-06, + "loss": 0.047, + "step": 5366 + }, + { + "epoch": 2.3837441705529647, + "grad_norm": 0.34982509990328564, + "learning_rate": 4.199930687456031e-06, + "loss": 0.0267, + "step": 5367 + }, + { + "epoch": 2.384188318898512, + "grad_norm": 0.41740396260225204, + "learning_rate": 4.198017189385221e-06, + "loss": 0.0293, + "step": 5368 + }, + { + "epoch": 2.3846324672440593, + "grad_norm": 0.38322501636096945, + "learning_rate": 4.1961038118660504e-06, + "loss": 0.0329, + "step": 5369 + }, + { + "epoch": 2.385076615589607, + "grad_norm": 0.45223291597674525, + "learning_rate": 4.194190555186133e-06, + "loss": 0.0347, + "step": 5370 + }, + { + "epoch": 2.3855207639351543, + "grad_norm": 0.5113109910932524, + "learning_rate": 4.1922774196330614e-06, + "loss": 0.037, + "step": 5371 + }, + { + "epoch": 2.3859649122807016, + "grad_norm": 0.4161439641942477, + "learning_rate": 4.190364405494417e-06, + "loss": 0.032, + "step": 5372 + }, + { + "epoch": 2.3864090606262494, + "grad_norm": 0.3241401849079966, + "learning_rate": 4.1884515130577545e-06, + "loss": 0.0277, + "step": 5373 + }, + { + "epoch": 2.3868532089717966, + "grad_norm": 0.41604888730983397, + "learning_rate": 4.1865387426106165e-06, + "loss": 0.0342, + "step": 5374 + }, + { + "epoch": 2.387297357317344, + "grad_norm": 0.4441137282989938, + "learning_rate": 4.184626094440524e-06, + "loss": 0.0284, + "step": 5375 + }, + { + "epoch": 2.3877415056628912, + "grad_norm": 0.5249221470565395, + "learning_rate": 4.182713568834979e-06, + "loss": 0.0363, + "step": 5376 + }, + { + "epoch": 2.388185654008439, + "grad_norm": 0.5338153166350271, + "learning_rate": 4.180801166081466e-06, + "loss": 0.0505, + "step": 5377 + }, + { + "epoch": 2.3886298023539863, + "grad_norm": 0.6473388727476899, + "learning_rate": 4.178888886467457e-06, + "loss": 0.0426, + "step": 5378 + }, + { + "epoch": 2.3890739506995335, + "grad_norm": 0.362371921356082, + "learning_rate": 4.176976730280396e-06, + "loss": 0.0367, + "step": 5379 + }, + { + "epoch": 2.3895180990450813, + "grad_norm": 0.401397401331949, + "learning_rate": 4.175064697807712e-06, + "loss": 0.0315, + "step": 5380 + }, + { + "epoch": 2.3899622473906286, + "grad_norm": 0.4918164976839154, + "learning_rate": 4.173152789336818e-06, + "loss": 0.0377, + "step": 5381 + }, + { + "epoch": 2.390406395736176, + "grad_norm": 0.4428239856696088, + "learning_rate": 4.171241005155105e-06, + "loss": 0.0424, + "step": 5382 + }, + { + "epoch": 2.390850544081723, + "grad_norm": 0.5071830732288508, + "learning_rate": 4.169329345549945e-06, + "loss": 0.0343, + "step": 5383 + }, + { + "epoch": 2.391294692427271, + "grad_norm": 0.5154632693129214, + "learning_rate": 4.167417810808698e-06, + "loss": 0.0342, + "step": 5384 + }, + { + "epoch": 2.391738840772818, + "grad_norm": 0.6827121614415474, + "learning_rate": 4.165506401218697e-06, + "loss": 0.0537, + "step": 5385 + }, + { + "epoch": 2.3921829891183655, + "grad_norm": 0.5360898324290861, + "learning_rate": 4.163595117067258e-06, + "loss": 0.0415, + "step": 5386 + }, + { + "epoch": 2.3926271374639128, + "grad_norm": 0.4268060401836661, + "learning_rate": 4.1616839586416825e-06, + "loss": 0.0295, + "step": 5387 + }, + { + "epoch": 2.3930712858094605, + "grad_norm": 0.6427383929847603, + "learning_rate": 4.159772926229247e-06, + "loss": 0.0312, + "step": 5388 + }, + { + "epoch": 2.393515434155008, + "grad_norm": 0.5325554425749491, + "learning_rate": 4.1578620201172144e-06, + "loss": 0.0354, + "step": 5389 + }, + { + "epoch": 2.393959582500555, + "grad_norm": 0.43938556377430205, + "learning_rate": 4.155951240592825e-06, + "loss": 0.0372, + "step": 5390 + }, + { + "epoch": 2.3944037308461024, + "grad_norm": 0.5152442233098135, + "learning_rate": 4.154040587943303e-06, + "loss": 0.0402, + "step": 5391 + }, + { + "epoch": 2.39484787919165, + "grad_norm": 0.42703323082019723, + "learning_rate": 4.1521300624558516e-06, + "loss": 0.0373, + "step": 5392 + }, + { + "epoch": 2.3952920275371974, + "grad_norm": 0.402722970762066, + "learning_rate": 4.150219664417653e-06, + "loss": 0.0311, + "step": 5393 + }, + { + "epoch": 2.3957361758827447, + "grad_norm": 0.6214432623647976, + "learning_rate": 4.148309394115872e-06, + "loss": 0.0364, + "step": 5394 + }, + { + "epoch": 2.3961803242282924, + "grad_norm": 0.44276990850604514, + "learning_rate": 4.14639925183766e-06, + "loss": 0.0341, + "step": 5395 + }, + { + "epoch": 2.3966244725738397, + "grad_norm": 0.4294877304529703, + "learning_rate": 4.144489237870141e-06, + "loss": 0.0298, + "step": 5396 + }, + { + "epoch": 2.397068620919387, + "grad_norm": 0.46553831317336336, + "learning_rate": 4.142579352500421e-06, + "loss": 0.0318, + "step": 5397 + }, + { + "epoch": 2.3975127692649343, + "grad_norm": 0.4319310212143851, + "learning_rate": 4.14066959601559e-06, + "loss": 0.0412, + "step": 5398 + }, + { + "epoch": 2.397956917610482, + "grad_norm": 0.6999580389169097, + "learning_rate": 4.138759968702716e-06, + "loss": 0.0496, + "step": 5399 + }, + { + "epoch": 2.3984010659560293, + "grad_norm": 0.6261420455213397, + "learning_rate": 4.1368504708488476e-06, + "loss": 0.0387, + "step": 5400 + }, + { + "epoch": 2.3988452143015766, + "grad_norm": 0.352239243573041, + "learning_rate": 4.134941102741016e-06, + "loss": 0.0286, + "step": 5401 + }, + { + "epoch": 2.3992893626471243, + "grad_norm": 0.4855267366461917, + "learning_rate": 4.133031864666232e-06, + "loss": 0.042, + "step": 5402 + }, + { + "epoch": 2.3997335109926716, + "grad_norm": 0.48074181838542007, + "learning_rate": 4.1311227569114855e-06, + "loss": 0.0516, + "step": 5403 + }, + { + "epoch": 2.400177659338219, + "grad_norm": 0.4770686273769997, + "learning_rate": 4.12921377976375e-06, + "loss": 0.0378, + "step": 5404 + }, + { + "epoch": 2.400621807683766, + "grad_norm": 0.5247613321525735, + "learning_rate": 4.127304933509972e-06, + "loss": 0.0414, + "step": 5405 + }, + { + "epoch": 2.401065956029314, + "grad_norm": 0.4623711119275284, + "learning_rate": 4.125396218437089e-06, + "loss": 0.0589, + "step": 5406 + }, + { + "epoch": 2.4015101043748612, + "grad_norm": 0.49698176360744994, + "learning_rate": 4.123487634832011e-06, + "loss": 0.0456, + "step": 5407 + }, + { + "epoch": 2.4019542527204085, + "grad_norm": 0.4203922139028562, + "learning_rate": 4.121579182981632e-06, + "loss": 0.0312, + "step": 5408 + }, + { + "epoch": 2.4023984010659563, + "grad_norm": 0.3669217976875289, + "learning_rate": 4.119670863172824e-06, + "loss": 0.0289, + "step": 5409 + }, + { + "epoch": 2.4028425494115035, + "grad_norm": 0.38804870930885565, + "learning_rate": 4.117762675692437e-06, + "loss": 0.0251, + "step": 5410 + }, + { + "epoch": 2.403286697757051, + "grad_norm": 0.45435459736787537, + "learning_rate": 4.115854620827306e-06, + "loss": 0.0257, + "step": 5411 + }, + { + "epoch": 2.403730846102598, + "grad_norm": 0.46297467981750196, + "learning_rate": 4.1139466988642475e-06, + "loss": 0.0433, + "step": 5412 + }, + { + "epoch": 2.404174994448146, + "grad_norm": 0.4824849195132302, + "learning_rate": 4.11203891009005e-06, + "loss": 0.0484, + "step": 5413 + }, + { + "epoch": 2.404619142793693, + "grad_norm": 0.45065972785277525, + "learning_rate": 4.110131254791489e-06, + "loss": 0.028, + "step": 5414 + }, + { + "epoch": 2.4050632911392404, + "grad_norm": 0.5431313239180846, + "learning_rate": 4.108223733255316e-06, + "loss": 0.0442, + "step": 5415 + }, + { + "epoch": 2.4055074394847877, + "grad_norm": 0.46821071422858296, + "learning_rate": 4.106316345768265e-06, + "loss": 0.027, + "step": 5416 + }, + { + "epoch": 2.4059515878303355, + "grad_norm": 0.4155972801004647, + "learning_rate": 4.104409092617047e-06, + "loss": 0.033, + "step": 5417 + }, + { + "epoch": 2.4063957361758828, + "grad_norm": 0.6893775471251186, + "learning_rate": 4.1025019740883556e-06, + "loss": 0.0504, + "step": 5418 + }, + { + "epoch": 2.40683988452143, + "grad_norm": 0.4083036792334355, + "learning_rate": 4.100594990468865e-06, + "loss": 0.0333, + "step": 5419 + }, + { + "epoch": 2.4072840328669773, + "grad_norm": 0.4849564201026396, + "learning_rate": 4.0986881420452254e-06, + "loss": 0.0338, + "step": 5420 + }, + { + "epoch": 2.407728181212525, + "grad_norm": 0.4268462015661056, + "learning_rate": 4.096781429104068e-06, + "loss": 0.0341, + "step": 5421 + }, + { + "epoch": 2.4081723295580724, + "grad_norm": 0.4834546635420465, + "learning_rate": 4.094874851932002e-06, + "loss": 0.035, + "step": 5422 + }, + { + "epoch": 2.4086164779036197, + "grad_norm": 0.46585094752698974, + "learning_rate": 4.092968410815625e-06, + "loss": 0.0288, + "step": 5423 + }, + { + "epoch": 2.4090606262491674, + "grad_norm": 0.6920655509720604, + "learning_rate": 4.091062106041504e-06, + "loss": 0.0296, + "step": 5424 + }, + { + "epoch": 2.4095047745947147, + "grad_norm": 0.5102739827979796, + "learning_rate": 4.089155937896187e-06, + "loss": 0.0316, + "step": 5425 + }, + { + "epoch": 2.409948922940262, + "grad_norm": 0.36415630770650137, + "learning_rate": 4.087249906666206e-06, + "loss": 0.0296, + "step": 5426 + }, + { + "epoch": 2.4103930712858093, + "grad_norm": 0.3263196544116076, + "learning_rate": 4.085344012638067e-06, + "loss": 0.0273, + "step": 5427 + }, + { + "epoch": 2.410837219631357, + "grad_norm": 0.5467141941835238, + "learning_rate": 4.083438256098261e-06, + "loss": 0.0368, + "step": 5428 + }, + { + "epoch": 2.4112813679769043, + "grad_norm": 0.5887360709523032, + "learning_rate": 4.081532637333255e-06, + "loss": 0.0389, + "step": 5429 + }, + { + "epoch": 2.4117255163224516, + "grad_norm": 0.4074807509483992, + "learning_rate": 4.079627156629497e-06, + "loss": 0.0239, + "step": 5430 + }, + { + "epoch": 2.4121696646679993, + "grad_norm": 0.3424789553984037, + "learning_rate": 4.07772181427341e-06, + "loss": 0.0301, + "step": 5431 + }, + { + "epoch": 2.4126138130135466, + "grad_norm": 0.607202231589749, + "learning_rate": 4.075816610551402e-06, + "loss": 0.0496, + "step": 5432 + }, + { + "epoch": 2.413057961359094, + "grad_norm": 0.47864092786503787, + "learning_rate": 4.073911545749857e-06, + "loss": 0.0316, + "step": 5433 + }, + { + "epoch": 2.413502109704641, + "grad_norm": 0.5034557440074656, + "learning_rate": 4.072006620155136e-06, + "loss": 0.026, + "step": 5434 + }, + { + "epoch": 2.413946258050189, + "grad_norm": 0.36403294815194714, + "learning_rate": 4.070101834053585e-06, + "loss": 0.0304, + "step": 5435 + }, + { + "epoch": 2.414390406395736, + "grad_norm": 0.4715313044352068, + "learning_rate": 4.068197187731526e-06, + "loss": 0.0369, + "step": 5436 + }, + { + "epoch": 2.4148345547412835, + "grad_norm": 0.27768669661494494, + "learning_rate": 4.066292681475257e-06, + "loss": 0.0174, + "step": 5437 + }, + { + "epoch": 2.4152787030868312, + "grad_norm": 0.6691350505746541, + "learning_rate": 4.064388315571059e-06, + "loss": 0.0318, + "step": 5438 + }, + { + "epoch": 2.4157228514323785, + "grad_norm": 0.4180829624879483, + "learning_rate": 4.062484090305191e-06, + "loss": 0.0416, + "step": 5439 + }, + { + "epoch": 2.416166999777926, + "grad_norm": 0.3034927493418548, + "learning_rate": 4.060580005963888e-06, + "loss": 0.0249, + "step": 5440 + }, + { + "epoch": 2.416611148123473, + "grad_norm": 0.42896525131188157, + "learning_rate": 4.05867606283337e-06, + "loss": 0.0345, + "step": 5441 + }, + { + "epoch": 2.417055296469021, + "grad_norm": 0.356634005220255, + "learning_rate": 4.0567722611998285e-06, + "loss": 0.0244, + "step": 5442 + }, + { + "epoch": 2.417499444814568, + "grad_norm": 0.49362452123065936, + "learning_rate": 4.054868601349441e-06, + "loss": 0.0349, + "step": 5443 + }, + { + "epoch": 2.4179435931601154, + "grad_norm": 0.3476206524561192, + "learning_rate": 4.052965083568356e-06, + "loss": 0.0262, + "step": 5444 + }, + { + "epoch": 2.4183877415056627, + "grad_norm": 0.4046428177539252, + "learning_rate": 4.051061708142705e-06, + "loss": 0.0308, + "step": 5445 + }, + { + "epoch": 2.4188318898512104, + "grad_norm": 0.6886851057569472, + "learning_rate": 4.0491584753586e-06, + "loss": 0.0489, + "step": 5446 + }, + { + "epoch": 2.4192760381967577, + "grad_norm": 0.4422724381498167, + "learning_rate": 4.047255385502129e-06, + "loss": 0.0338, + "step": 5447 + }, + { + "epoch": 2.419720186542305, + "grad_norm": 0.35168629623661846, + "learning_rate": 4.045352438859359e-06, + "loss": 0.0202, + "step": 5448 + }, + { + "epoch": 2.4201643348878523, + "grad_norm": 0.449273667282489, + "learning_rate": 4.043449635716332e-06, + "loss": 0.0396, + "step": 5449 + }, + { + "epoch": 2.4206084832334, + "grad_norm": 0.5131783172439474, + "learning_rate": 4.0415469763590745e-06, + "loss": 0.0416, + "step": 5450 + }, + { + "epoch": 2.4210526315789473, + "grad_norm": 0.46807114802560923, + "learning_rate": 4.0396444610735865e-06, + "loss": 0.0439, + "step": 5451 + }, + { + "epoch": 2.4214967799244946, + "grad_norm": 0.6426317590879459, + "learning_rate": 4.037742090145851e-06, + "loss": 0.05, + "step": 5452 + }, + { + "epoch": 2.4219409282700424, + "grad_norm": 0.5241231307427713, + "learning_rate": 4.0358398638618245e-06, + "loss": 0.0279, + "step": 5453 + }, + { + "epoch": 2.4223850766155897, + "grad_norm": 0.3603709645668666, + "learning_rate": 4.033937782507445e-06, + "loss": 0.0323, + "step": 5454 + }, + { + "epoch": 2.422829224961137, + "grad_norm": 0.4940232401715231, + "learning_rate": 4.032035846368627e-06, + "loss": 0.0316, + "step": 5455 + }, + { + "epoch": 2.4232733733066842, + "grad_norm": 0.4547687513915308, + "learning_rate": 4.030134055731266e-06, + "loss": 0.0398, + "step": 5456 + }, + { + "epoch": 2.423717521652232, + "grad_norm": 0.42013634054715737, + "learning_rate": 4.028232410881228e-06, + "loss": 0.0218, + "step": 5457 + }, + { + "epoch": 2.4241616699977793, + "grad_norm": 0.44875489093858906, + "learning_rate": 4.026330912104369e-06, + "loss": 0.0447, + "step": 5458 + }, + { + "epoch": 2.4246058183433266, + "grad_norm": 0.827939430580337, + "learning_rate": 4.024429559686513e-06, + "loss": 0.0678, + "step": 5459 + }, + { + "epoch": 2.4250499666888743, + "grad_norm": 0.7485263459812835, + "learning_rate": 4.022528353913466e-06, + "loss": 0.0442, + "step": 5460 + }, + { + "epoch": 2.4254941150344216, + "grad_norm": 0.3550612167644366, + "learning_rate": 4.020627295071012e-06, + "loss": 0.0334, + "step": 5461 + }, + { + "epoch": 2.425938263379969, + "grad_norm": 0.5079952410434588, + "learning_rate": 4.018726383444911e-06, + "loss": 0.0424, + "step": 5462 + }, + { + "epoch": 2.426382411725516, + "grad_norm": 0.3588389906173018, + "learning_rate": 4.016825619320904e-06, + "loss": 0.0256, + "step": 5463 + }, + { + "epoch": 2.426826560071064, + "grad_norm": 0.37557127358260195, + "learning_rate": 4.014925002984708e-06, + "loss": 0.027, + "step": 5464 + }, + { + "epoch": 2.427270708416611, + "grad_norm": 0.3878661608771635, + "learning_rate": 4.013024534722018e-06, + "loss": 0.0331, + "step": 5465 + }, + { + "epoch": 2.4277148567621585, + "grad_norm": 0.5433675019773777, + "learning_rate": 4.011124214818506e-06, + "loss": 0.032, + "step": 5466 + }, + { + "epoch": 2.428159005107706, + "grad_norm": 0.3899751784915578, + "learning_rate": 4.0092240435598225e-06, + "loss": 0.0356, + "step": 5467 + }, + { + "epoch": 2.4286031534532535, + "grad_norm": 0.4327878275713323, + "learning_rate": 4.007324021231594e-06, + "loss": 0.0319, + "step": 5468 + }, + { + "epoch": 2.429047301798801, + "grad_norm": 0.49698866083093113, + "learning_rate": 4.00542414811943e-06, + "loss": 0.0305, + "step": 5469 + }, + { + "epoch": 2.429491450144348, + "grad_norm": 0.4026428626742401, + "learning_rate": 4.00352442450891e-06, + "loss": 0.0348, + "step": 5470 + }, + { + "epoch": 2.4299355984898954, + "grad_norm": 0.48636219393391805, + "learning_rate": 4.001624850685598e-06, + "loss": 0.0346, + "step": 5471 + }, + { + "epoch": 2.430379746835443, + "grad_norm": 0.4112453328110524, + "learning_rate": 3.999725426935029e-06, + "loss": 0.0428, + "step": 5472 + }, + { + "epoch": 2.4308238951809904, + "grad_norm": 0.5337831095831886, + "learning_rate": 3.99782615354272e-06, + "loss": 0.0442, + "step": 5473 + }, + { + "epoch": 2.4312680435265377, + "grad_norm": 0.37305930397141557, + "learning_rate": 3.995927030794163e-06, + "loss": 0.0357, + "step": 5474 + }, + { + "epoch": 2.4317121918720854, + "grad_norm": 0.4326214227306239, + "learning_rate": 3.994028058974832e-06, + "loss": 0.0294, + "step": 5475 + }, + { + "epoch": 2.4321563402176327, + "grad_norm": 0.35769209966498045, + "learning_rate": 3.992129238370171e-06, + "loss": 0.033, + "step": 5476 + }, + { + "epoch": 2.43260048856318, + "grad_norm": 0.5540164347858187, + "learning_rate": 3.9902305692656056e-06, + "loss": 0.0396, + "step": 5477 + }, + { + "epoch": 2.4330446369087273, + "grad_norm": 0.6878308326999009, + "learning_rate": 3.98833205194654e-06, + "loss": 0.0291, + "step": 5478 + }, + { + "epoch": 2.433488785254275, + "grad_norm": 0.6656069204526499, + "learning_rate": 3.98643368669835e-06, + "loss": 0.0377, + "step": 5479 + }, + { + "epoch": 2.4339329335998223, + "grad_norm": 0.36440064361824326, + "learning_rate": 3.984535473806395e-06, + "loss": 0.031, + "step": 5480 + }, + { + "epoch": 2.4343770819453696, + "grad_norm": 0.46753874608394186, + "learning_rate": 3.98263741355601e-06, + "loss": 0.0296, + "step": 5481 + }, + { + "epoch": 2.4348212302909173, + "grad_norm": 0.41167122039004567, + "learning_rate": 3.980739506232503e-06, + "loss": 0.0308, + "step": 5482 + }, + { + "epoch": 2.4352653786364646, + "grad_norm": 0.4469078274911405, + "learning_rate": 3.978841752121161e-06, + "loss": 0.0355, + "step": 5483 + }, + { + "epoch": 2.435709526982012, + "grad_norm": 0.39826295605317574, + "learning_rate": 3.976944151507251e-06, + "loss": 0.0288, + "step": 5484 + }, + { + "epoch": 2.436153675327559, + "grad_norm": 0.4535357297321569, + "learning_rate": 3.975046704676014e-06, + "loss": 0.0286, + "step": 5485 + }, + { + "epoch": 2.436597823673107, + "grad_norm": 0.4396407655662091, + "learning_rate": 3.973149411912668e-06, + "loss": 0.0305, + "step": 5486 + }, + { + "epoch": 2.4370419720186542, + "grad_norm": 0.3507364407629326, + "learning_rate": 3.971252273502407e-06, + "loss": 0.0248, + "step": 5487 + }, + { + "epoch": 2.4374861203642015, + "grad_norm": 0.3450849459589068, + "learning_rate": 3.969355289730407e-06, + "loss": 0.0206, + "step": 5488 + }, + { + "epoch": 2.4379302687097493, + "grad_norm": 0.503140049702594, + "learning_rate": 3.967458460881815e-06, + "loss": 0.04, + "step": 5489 + }, + { + "epoch": 2.4383744170552966, + "grad_norm": 0.5640472250485181, + "learning_rate": 3.965561787241754e-06, + "loss": 0.0404, + "step": 5490 + }, + { + "epoch": 2.438818565400844, + "grad_norm": 0.6056306520006238, + "learning_rate": 3.963665269095328e-06, + "loss": 0.0404, + "step": 5491 + }, + { + "epoch": 2.439262713746391, + "grad_norm": 0.4258100611472551, + "learning_rate": 3.961768906727618e-06, + "loss": 0.028, + "step": 5492 + }, + { + "epoch": 2.439706862091939, + "grad_norm": 0.38264655994704755, + "learning_rate": 3.959872700423678e-06, + "loss": 0.0256, + "step": 5493 + }, + { + "epoch": 2.440151010437486, + "grad_norm": 0.389861248826636, + "learning_rate": 3.957976650468539e-06, + "loss": 0.0333, + "step": 5494 + }, + { + "epoch": 2.4405951587830335, + "grad_norm": 0.6710261752550767, + "learning_rate": 3.956080757147211e-06, + "loss": 0.0288, + "step": 5495 + }, + { + "epoch": 2.441039307128581, + "grad_norm": 0.533539192209703, + "learning_rate": 3.9541850207446754e-06, + "loss": 0.029, + "step": 5496 + }, + { + "epoch": 2.4414834554741285, + "grad_norm": 0.7167183091153162, + "learning_rate": 3.9522894415459e-06, + "loss": 0.0473, + "step": 5497 + }, + { + "epoch": 2.4419276038196758, + "grad_norm": 0.322074964494659, + "learning_rate": 3.950394019835817e-06, + "loss": 0.0258, + "step": 5498 + }, + { + "epoch": 2.442371752165223, + "grad_norm": 0.3900552673739537, + "learning_rate": 3.948498755899344e-06, + "loss": 0.0287, + "step": 5499 + }, + { + "epoch": 2.4428159005107704, + "grad_norm": 0.4466770793345454, + "learning_rate": 3.94660365002137e-06, + "loss": 0.0292, + "step": 5500 + }, + { + "epoch": 2.443260048856318, + "grad_norm": 0.5418862551373604, + "learning_rate": 3.94470870248676e-06, + "loss": 0.0434, + "step": 5501 + }, + { + "epoch": 2.4437041972018654, + "grad_norm": 0.5170924051250978, + "learning_rate": 3.942813913580358e-06, + "loss": 0.044, + "step": 5502 + }, + { + "epoch": 2.4441483455474127, + "grad_norm": 0.5438284418911795, + "learning_rate": 3.940919283586985e-06, + "loss": 0.0448, + "step": 5503 + }, + { + "epoch": 2.4445924938929604, + "grad_norm": 0.5198158479237611, + "learning_rate": 3.9390248127914325e-06, + "loss": 0.0528, + "step": 5504 + }, + { + "epoch": 2.4450366422385077, + "grad_norm": 0.520564892893801, + "learning_rate": 3.937130501478475e-06, + "loss": 0.0488, + "step": 5505 + }, + { + "epoch": 2.445480790584055, + "grad_norm": 0.46307199806507215, + "learning_rate": 3.935236349932858e-06, + "loss": 0.0334, + "step": 5506 + }, + { + "epoch": 2.4459249389296023, + "grad_norm": 0.5703039507859231, + "learning_rate": 3.933342358439304e-06, + "loss": 0.0388, + "step": 5507 + }, + { + "epoch": 2.44636908727515, + "grad_norm": 0.480094036827002, + "learning_rate": 3.931448527282512e-06, + "loss": 0.0383, + "step": 5508 + }, + { + "epoch": 2.4468132356206973, + "grad_norm": 0.5416338368617786, + "learning_rate": 3.9295548567471595e-06, + "loss": 0.044, + "step": 5509 + }, + { + "epoch": 2.4472573839662446, + "grad_norm": 0.3789299998421564, + "learning_rate": 3.927661347117896e-06, + "loss": 0.0308, + "step": 5510 + }, + { + "epoch": 2.4477015323117923, + "grad_norm": 0.42767639025063464, + "learning_rate": 3.925767998679347e-06, + "loss": 0.032, + "step": 5511 + }, + { + "epoch": 2.4481456806573396, + "grad_norm": 0.3723173022056738, + "learning_rate": 3.923874811716116e-06, + "loss": 0.0299, + "step": 5512 + }, + { + "epoch": 2.448589829002887, + "grad_norm": 0.647876325917259, + "learning_rate": 3.92198178651278e-06, + "loss": 0.0334, + "step": 5513 + }, + { + "epoch": 2.449033977348434, + "grad_norm": 0.3651110551254141, + "learning_rate": 3.920088923353895e-06, + "loss": 0.0404, + "step": 5514 + }, + { + "epoch": 2.449478125693982, + "grad_norm": 0.43323931222905593, + "learning_rate": 3.918196222523989e-06, + "loss": 0.0458, + "step": 5515 + }, + { + "epoch": 2.4499222740395292, + "grad_norm": 0.4066604237779864, + "learning_rate": 3.916303684307568e-06, + "loss": 0.0291, + "step": 5516 + }, + { + "epoch": 2.4503664223850765, + "grad_norm": 0.4166876666433253, + "learning_rate": 3.914411308989113e-06, + "loss": 0.027, + "step": 5517 + }, + { + "epoch": 2.4508105707306242, + "grad_norm": 0.45791540033658323, + "learning_rate": 3.9125190968530766e-06, + "loss": 0.0271, + "step": 5518 + }, + { + "epoch": 2.4512547190761715, + "grad_norm": 0.4364649579290464, + "learning_rate": 3.910627048183893e-06, + "loss": 0.0325, + "step": 5519 + }, + { + "epoch": 2.451698867421719, + "grad_norm": 0.34758907020261137, + "learning_rate": 3.908735163265971e-06, + "loss": 0.0225, + "step": 5520 + }, + { + "epoch": 2.452143015767266, + "grad_norm": 0.386095366133868, + "learning_rate": 3.906843442383691e-06, + "loss": 0.0253, + "step": 5521 + }, + { + "epoch": 2.452587164112814, + "grad_norm": 0.5285696022742041, + "learning_rate": 3.90495188582141e-06, + "loss": 0.0288, + "step": 5522 + }, + { + "epoch": 2.453031312458361, + "grad_norm": 0.5685349400918212, + "learning_rate": 3.903060493863463e-06, + "loss": 0.0384, + "step": 5523 + }, + { + "epoch": 2.4534754608039084, + "grad_norm": 0.33536529902813805, + "learning_rate": 3.901169266794158e-06, + "loss": 0.0256, + "step": 5524 + }, + { + "epoch": 2.4539196091494557, + "grad_norm": 0.3367209757616414, + "learning_rate": 3.899278204897777e-06, + "loss": 0.0308, + "step": 5525 + }, + { + "epoch": 2.4543637574950035, + "grad_norm": 0.35301257937407804, + "learning_rate": 3.89738730845858e-06, + "loss": 0.0326, + "step": 5526 + }, + { + "epoch": 2.4548079058405508, + "grad_norm": 0.44674618292196533, + "learning_rate": 3.895496577760802e-06, + "loss": 0.0378, + "step": 5527 + }, + { + "epoch": 2.455252054186098, + "grad_norm": 0.33090638899042374, + "learning_rate": 3.893606013088649e-06, + "loss": 0.0287, + "step": 5528 + }, + { + "epoch": 2.4556962025316453, + "grad_norm": 0.6629654415706786, + "learning_rate": 3.8917156147263075e-06, + "loss": 0.0361, + "step": 5529 + }, + { + "epoch": 2.456140350877193, + "grad_norm": 0.4137805309428058, + "learning_rate": 3.889825382957935e-06, + "loss": 0.0381, + "step": 5530 + }, + { + "epoch": 2.4565844992227404, + "grad_norm": 0.48510449046292115, + "learning_rate": 3.887935318067665e-06, + "loss": 0.0369, + "step": 5531 + }, + { + "epoch": 2.4570286475682876, + "grad_norm": 0.4301148483997256, + "learning_rate": 3.886045420339608e-06, + "loss": 0.0288, + "step": 5532 + }, + { + "epoch": 2.4574727959138354, + "grad_norm": 0.42246466717722975, + "learning_rate": 3.884155690057849e-06, + "loss": 0.0301, + "step": 5533 + }, + { + "epoch": 2.4579169442593827, + "grad_norm": 0.4345894945232888, + "learning_rate": 3.882266127506444e-06, + "loss": 0.0327, + "step": 5534 + }, + { + "epoch": 2.45836109260493, + "grad_norm": 0.47014262773033944, + "learning_rate": 3.880376732969427e-06, + "loss": 0.0406, + "step": 5535 + }, + { + "epoch": 2.4588052409504773, + "grad_norm": 0.5529245797017762, + "learning_rate": 3.8784875067308035e-06, + "loss": 0.0428, + "step": 5536 + }, + { + "epoch": 2.459249389296025, + "grad_norm": 0.5853637583441337, + "learning_rate": 3.876598449074561e-06, + "loss": 0.0484, + "step": 5537 + }, + { + "epoch": 2.4596935376415723, + "grad_norm": 0.4308021015437979, + "learning_rate": 3.874709560284655e-06, + "loss": 0.0331, + "step": 5538 + }, + { + "epoch": 2.4601376859871196, + "grad_norm": 0.41105828222307955, + "learning_rate": 3.872820840645017e-06, + "loss": 0.0291, + "step": 5539 + }, + { + "epoch": 2.4605818343326673, + "grad_norm": 0.4669061473028007, + "learning_rate": 3.8709322904395556e-06, + "loss": 0.0275, + "step": 5540 + }, + { + "epoch": 2.4610259826782146, + "grad_norm": 0.4419324556460313, + "learning_rate": 3.869043909952149e-06, + "loss": 0.046, + "step": 5541 + }, + { + "epoch": 2.461470131023762, + "grad_norm": 0.4298170656814399, + "learning_rate": 3.867155699466653e-06, + "loss": 0.0376, + "step": 5542 + }, + { + "epoch": 2.461914279369309, + "grad_norm": 0.9256029510191909, + "learning_rate": 3.865267659266901e-06, + "loss": 0.0365, + "step": 5543 + }, + { + "epoch": 2.462358427714857, + "grad_norm": 0.4535797528392373, + "learning_rate": 3.863379789636696e-06, + "loss": 0.0302, + "step": 5544 + }, + { + "epoch": 2.462802576060404, + "grad_norm": 0.5482760330729667, + "learning_rate": 3.861492090859816e-06, + "loss": 0.0365, + "step": 5545 + }, + { + "epoch": 2.4632467244059515, + "grad_norm": 0.36157460865248275, + "learning_rate": 3.8596045632200126e-06, + "loss": 0.0266, + "step": 5546 + }, + { + "epoch": 2.4636908727514992, + "grad_norm": 0.4873814379100533, + "learning_rate": 3.857717207001017e-06, + "loss": 0.037, + "step": 5547 + }, + { + "epoch": 2.4641350210970465, + "grad_norm": 0.7370873255745318, + "learning_rate": 3.855830022486528e-06, + "loss": 0.0415, + "step": 5548 + }, + { + "epoch": 2.464579169442594, + "grad_norm": 0.493249372834125, + "learning_rate": 3.853943009960225e-06, + "loss": 0.0386, + "step": 5549 + }, + { + "epoch": 2.465023317788141, + "grad_norm": 0.6919203726918821, + "learning_rate": 3.852056169705753e-06, + "loss": 0.0389, + "step": 5550 + }, + { + "epoch": 2.465467466133689, + "grad_norm": 0.4116990565345076, + "learning_rate": 3.850169502006741e-06, + "loss": 0.0302, + "step": 5551 + }, + { + "epoch": 2.465911614479236, + "grad_norm": 0.34358727354238183, + "learning_rate": 3.848283007146784e-06, + "loss": 0.0325, + "step": 5552 + }, + { + "epoch": 2.4663557628247834, + "grad_norm": 0.35614401203480484, + "learning_rate": 3.846396685409455e-06, + "loss": 0.0322, + "step": 5553 + }, + { + "epoch": 2.4667999111703307, + "grad_norm": 0.25982925770375787, + "learning_rate": 3.8445105370782995e-06, + "loss": 0.0262, + "step": 5554 + }, + { + "epoch": 2.4672440595158784, + "grad_norm": 0.3510865325198374, + "learning_rate": 3.842624562436841e-06, + "loss": 0.0386, + "step": 5555 + }, + { + "epoch": 2.4676882078614257, + "grad_norm": 0.4049390820424808, + "learning_rate": 3.8407387617685696e-06, + "loss": 0.0247, + "step": 5556 + }, + { + "epoch": 2.468132356206973, + "grad_norm": 0.42859540412907965, + "learning_rate": 3.838853135356956e-06, + "loss": 0.0368, + "step": 5557 + }, + { + "epoch": 2.4685765045525203, + "grad_norm": 0.35853923916250774, + "learning_rate": 3.836967683485441e-06, + "loss": 0.0329, + "step": 5558 + }, + { + "epoch": 2.469020652898068, + "grad_norm": 0.4278309028616722, + "learning_rate": 3.835082406437437e-06, + "loss": 0.0362, + "step": 5559 + }, + { + "epoch": 2.4694648012436153, + "grad_norm": 1.0923622589898494, + "learning_rate": 3.833197304496336e-06, + "loss": 0.044, + "step": 5560 + }, + { + "epoch": 2.4699089495891626, + "grad_norm": 0.37654012560879, + "learning_rate": 3.8313123779455035e-06, + "loss": 0.0262, + "step": 5561 + }, + { + "epoch": 2.4703530979347104, + "grad_norm": 0.41898160595248646, + "learning_rate": 3.829427627068272e-06, + "loss": 0.0344, + "step": 5562 + }, + { + "epoch": 2.4707972462802577, + "grad_norm": 0.3844261706628325, + "learning_rate": 3.827543052147952e-06, + "loss": 0.0273, + "step": 5563 + }, + { + "epoch": 2.471241394625805, + "grad_norm": 0.4489319056107684, + "learning_rate": 3.8256586534678285e-06, + "loss": 0.0318, + "step": 5564 + }, + { + "epoch": 2.4716855429713522, + "grad_norm": 0.6833288857085646, + "learning_rate": 3.8237744313111565e-06, + "loss": 0.0404, + "step": 5565 + }, + { + "epoch": 2.4721296913169, + "grad_norm": 0.430664114118146, + "learning_rate": 3.82189038596117e-06, + "loss": 0.0352, + "step": 5566 + }, + { + "epoch": 2.4725738396624473, + "grad_norm": 0.4030254224585019, + "learning_rate": 3.820006517701069e-06, + "loss": 0.0321, + "step": 5567 + }, + { + "epoch": 2.4730179880079945, + "grad_norm": 0.5685844819493571, + "learning_rate": 3.8181228268140354e-06, + "loss": 0.0507, + "step": 5568 + }, + { + "epoch": 2.4734621363535423, + "grad_norm": 0.44264389289592554, + "learning_rate": 3.816239313583217e-06, + "loss": 0.0376, + "step": 5569 + }, + { + "epoch": 2.4739062846990896, + "grad_norm": 0.3377904339220037, + "learning_rate": 3.814355978291736e-06, + "loss": 0.0327, + "step": 5570 + }, + { + "epoch": 2.474350433044637, + "grad_norm": 0.630683589245786, + "learning_rate": 3.8124728212226938e-06, + "loss": 0.0471, + "step": 5571 + }, + { + "epoch": 2.474794581390184, + "grad_norm": 0.42602963126310434, + "learning_rate": 3.810589842659159e-06, + "loss": 0.0297, + "step": 5572 + }, + { + "epoch": 2.475238729735732, + "grad_norm": 0.4633136103256966, + "learning_rate": 3.808707042884176e-06, + "loss": 0.028, + "step": 5573 + }, + { + "epoch": 2.475682878081279, + "grad_norm": 0.5782431299481687, + "learning_rate": 3.8068244221807606e-06, + "loss": 0.0383, + "step": 5574 + }, + { + "epoch": 2.4761270264268265, + "grad_norm": 0.5372901069709559, + "learning_rate": 3.8049419808319033e-06, + "loss": 0.036, + "step": 5575 + }, + { + "epoch": 2.476571174772374, + "grad_norm": 0.2611922701250809, + "learning_rate": 3.8030597191205643e-06, + "loss": 0.0217, + "step": 5576 + }, + { + "epoch": 2.4770153231179215, + "grad_norm": 0.36585043972090653, + "learning_rate": 3.8011776373296837e-06, + "loss": 0.0286, + "step": 5577 + }, + { + "epoch": 2.477459471463469, + "grad_norm": 0.5157568238144473, + "learning_rate": 3.79929573574217e-06, + "loss": 0.0343, + "step": 5578 + }, + { + "epoch": 2.477903619809016, + "grad_norm": 0.4610943022253674, + "learning_rate": 3.797414014640903e-06, + "loss": 0.0318, + "step": 5579 + }, + { + "epoch": 2.478347768154564, + "grad_norm": 0.47373180317876196, + "learning_rate": 3.795532474308737e-06, + "loss": 0.0427, + "step": 5580 + }, + { + "epoch": 2.478791916500111, + "grad_norm": 0.4145736062776239, + "learning_rate": 3.7936511150285014e-06, + "loss": 0.038, + "step": 5581 + }, + { + "epoch": 2.4792360648456584, + "grad_norm": 0.5254698644896055, + "learning_rate": 3.7917699370829935e-06, + "loss": 0.0349, + "step": 5582 + }, + { + "epoch": 2.4796802131912057, + "grad_norm": 0.4171576347283061, + "learning_rate": 3.789888940754991e-06, + "loss": 0.0303, + "step": 5583 + }, + { + "epoch": 2.4801243615367534, + "grad_norm": 0.39892945172204647, + "learning_rate": 3.788008126327235e-06, + "loss": 0.041, + "step": 5584 + }, + { + "epoch": 2.4805685098823007, + "grad_norm": 0.4156599218368941, + "learning_rate": 3.7861274940824473e-06, + "loss": 0.0308, + "step": 5585 + }, + { + "epoch": 2.481012658227848, + "grad_norm": 0.3949693643427857, + "learning_rate": 3.784247044303317e-06, + "loss": 0.0311, + "step": 5586 + }, + { + "epoch": 2.4814568065733953, + "grad_norm": 0.3619897883910703, + "learning_rate": 3.782366777272506e-06, + "loss": 0.0304, + "step": 5587 + }, + { + "epoch": 2.481900954918943, + "grad_norm": 0.3797535629139442, + "learning_rate": 3.7804866932726535e-06, + "loss": 0.0265, + "step": 5588 + }, + { + "epoch": 2.4823451032644903, + "grad_norm": 0.4514704220606347, + "learning_rate": 3.778606792586368e-06, + "loss": 0.0242, + "step": 5589 + }, + { + "epoch": 2.4827892516100376, + "grad_norm": 0.5885462347335189, + "learning_rate": 3.7767270754962294e-06, + "loss": 0.0462, + "step": 5590 + }, + { + "epoch": 2.4832333999555853, + "grad_norm": 0.46737727413834057, + "learning_rate": 3.7748475422847896e-06, + "loss": 0.0476, + "step": 5591 + }, + { + "epoch": 2.4836775483011326, + "grad_norm": 0.34347582817428995, + "learning_rate": 3.7729681932345776e-06, + "loss": 0.0309, + "step": 5592 + }, + { + "epoch": 2.48412169664668, + "grad_norm": 0.5416760213809743, + "learning_rate": 3.771089028628087e-06, + "loss": 0.0301, + "step": 5593 + }, + { + "epoch": 2.484565844992227, + "grad_norm": 0.5279294508428536, + "learning_rate": 3.7692100487477936e-06, + "loss": 0.0338, + "step": 5594 + }, + { + "epoch": 2.485009993337775, + "grad_norm": 0.40771734537444654, + "learning_rate": 3.7673312538761362e-06, + "loss": 0.0244, + "step": 5595 + }, + { + "epoch": 2.4854541416833222, + "grad_norm": 0.3225283085841615, + "learning_rate": 3.765452644295532e-06, + "loss": 0.0252, + "step": 5596 + }, + { + "epoch": 2.4858982900288695, + "grad_norm": 0.4343781070994849, + "learning_rate": 3.7635742202883664e-06, + "loss": 0.0301, + "step": 5597 + }, + { + "epoch": 2.4863424383744173, + "grad_norm": 0.48736918945639, + "learning_rate": 3.761695982136997e-06, + "loss": 0.0283, + "step": 5598 + }, + { + "epoch": 2.4867865867199646, + "grad_norm": 0.4628216568506521, + "learning_rate": 3.759817930123756e-06, + "loss": 0.0325, + "step": 5599 + }, + { + "epoch": 2.487230735065512, + "grad_norm": 0.4979234883742358, + "learning_rate": 3.75794006453095e-06, + "loss": 0.046, + "step": 5600 + }, + { + "epoch": 2.487674883411059, + "grad_norm": 0.5295486655610855, + "learning_rate": 3.7560623856408496e-06, + "loss": 0.0421, + "step": 5601 + }, + { + "epoch": 2.488119031756607, + "grad_norm": 0.3550831132877616, + "learning_rate": 3.7541848937357037e-06, + "loss": 0.0282, + "step": 5602 + }, + { + "epoch": 2.488563180102154, + "grad_norm": 0.4820776586497428, + "learning_rate": 3.7523075890977323e-06, + "loss": 0.0327, + "step": 5603 + }, + { + "epoch": 2.4890073284477015, + "grad_norm": 0.36069709664433147, + "learning_rate": 3.7504304720091227e-06, + "loss": 0.0282, + "step": 5604 + }, + { + "epoch": 2.489451476793249, + "grad_norm": 0.3378590317401163, + "learning_rate": 3.7485535427520393e-06, + "loss": 0.0327, + "step": 5605 + }, + { + "epoch": 2.4898956251387965, + "grad_norm": 0.5101551267438247, + "learning_rate": 3.7466768016086187e-06, + "loss": 0.0426, + "step": 5606 + }, + { + "epoch": 2.4903397734843438, + "grad_norm": 0.4554277064950327, + "learning_rate": 3.7448002488609647e-06, + "loss": 0.0363, + "step": 5607 + }, + { + "epoch": 2.490783921829891, + "grad_norm": 0.5190026980108126, + "learning_rate": 3.7429238847911555e-06, + "loss": 0.0303, + "step": 5608 + }, + { + "epoch": 2.4912280701754383, + "grad_norm": 0.4937954972214749, + "learning_rate": 3.7410477096812402e-06, + "loss": 0.0489, + "step": 5609 + }, + { + "epoch": 2.491672218520986, + "grad_norm": 0.36659365050297416, + "learning_rate": 3.7391717238132386e-06, + "loss": 0.0266, + "step": 5610 + }, + { + "epoch": 2.4921163668665334, + "grad_norm": 0.391793755436005, + "learning_rate": 3.737295927469146e-06, + "loss": 0.0288, + "step": 5611 + }, + { + "epoch": 2.4925605152120807, + "grad_norm": 0.386017966668196, + "learning_rate": 3.7354203209309246e-06, + "loss": 0.0347, + "step": 5612 + }, + { + "epoch": 2.4930046635576284, + "grad_norm": 0.30330961970277515, + "learning_rate": 3.733544904480512e-06, + "loss": 0.0248, + "step": 5613 + }, + { + "epoch": 2.4934488119031757, + "grad_norm": 0.4407907387683592, + "learning_rate": 3.7316696783998124e-06, + "loss": 0.0347, + "step": 5614 + }, + { + "epoch": 2.493892960248723, + "grad_norm": 0.37646444767265064, + "learning_rate": 3.7297946429707045e-06, + "loss": 0.0328, + "step": 5615 + }, + { + "epoch": 2.4943371085942703, + "grad_norm": 0.4575522855234119, + "learning_rate": 3.727919798475038e-06, + "loss": 0.0352, + "step": 5616 + }, + { + "epoch": 2.494781256939818, + "grad_norm": 0.35695162304512706, + "learning_rate": 3.7260451451946365e-06, + "loss": 0.0316, + "step": 5617 + }, + { + "epoch": 2.4952254052853653, + "grad_norm": 0.43360958825611745, + "learning_rate": 3.724170683411291e-06, + "loss": 0.0332, + "step": 5618 + }, + { + "epoch": 2.4956695536309126, + "grad_norm": 0.4262372003773419, + "learning_rate": 3.722296413406763e-06, + "loss": 0.0255, + "step": 5619 + }, + { + "epoch": 2.4961137019764603, + "grad_norm": 0.43363995486396417, + "learning_rate": 3.7204223354627894e-06, + "loss": 0.031, + "step": 5620 + }, + { + "epoch": 2.4965578503220076, + "grad_norm": 0.47249288635203707, + "learning_rate": 3.718548449861074e-06, + "loss": 0.0331, + "step": 5621 + }, + { + "epoch": 2.497001998667555, + "grad_norm": 0.4319106936736054, + "learning_rate": 3.716674756883295e-06, + "loss": 0.0392, + "step": 5622 + }, + { + "epoch": 2.497446147013102, + "grad_norm": 0.5119693456166591, + "learning_rate": 3.714801256811099e-06, + "loss": 0.0409, + "step": 5623 + }, + { + "epoch": 2.49789029535865, + "grad_norm": 0.40570657622630646, + "learning_rate": 3.712927949926108e-06, + "loss": 0.0266, + "step": 5624 + }, + { + "epoch": 2.498334443704197, + "grad_norm": 0.40821652834924144, + "learning_rate": 3.7110548365099075e-06, + "loss": 0.0402, + "step": 5625 + }, + { + "epoch": 2.4987785920497445, + "grad_norm": 0.3940755701591239, + "learning_rate": 3.7091819168440624e-06, + "loss": 0.0315, + "step": 5626 + }, + { + "epoch": 2.4992227403952922, + "grad_norm": 0.495948418850682, + "learning_rate": 3.7073091912101002e-06, + "loss": 0.0326, + "step": 5627 + }, + { + "epoch": 2.4996668887408395, + "grad_norm": 0.4435967840620372, + "learning_rate": 3.705436659889527e-06, + "loss": 0.0368, + "step": 5628 + }, + { + "epoch": 2.500111037086387, + "grad_norm": 0.32392962929044805, + "learning_rate": 3.7035643231638135e-06, + "loss": 0.029, + "step": 5629 + }, + { + "epoch": 2.500555185431934, + "grad_norm": 0.6179012687291638, + "learning_rate": 3.7016921813144063e-06, + "loss": 0.0382, + "step": 5630 + }, + { + "epoch": 2.5009993337774814, + "grad_norm": 0.45744910004367445, + "learning_rate": 3.6998202346227183e-06, + "loss": 0.0418, + "step": 5631 + }, + { + "epoch": 2.501443482123029, + "grad_norm": 0.5527549347471298, + "learning_rate": 3.697948483370135e-06, + "loss": 0.0257, + "step": 5632 + }, + { + "epoch": 2.5018876304685764, + "grad_norm": 0.5434768947853562, + "learning_rate": 3.696076927838011e-06, + "loss": 0.0389, + "step": 5633 + }, + { + "epoch": 2.502331778814124, + "grad_norm": 0.3410765055084389, + "learning_rate": 3.6942055683076767e-06, + "loss": 0.0299, + "step": 5634 + }, + { + "epoch": 2.5027759271596715, + "grad_norm": 0.7036711081682665, + "learning_rate": 3.692334405060427e-06, + "loss": 0.0405, + "step": 5635 + }, + { + "epoch": 2.5032200755052187, + "grad_norm": 0.3864467247613326, + "learning_rate": 3.6904634383775283e-06, + "loss": 0.0293, + "step": 5636 + }, + { + "epoch": 2.503664223850766, + "grad_norm": 0.5152666553526902, + "learning_rate": 3.6885926685402213e-06, + "loss": 0.042, + "step": 5637 + }, + { + "epoch": 2.5041083721963133, + "grad_norm": 0.3913873319828441, + "learning_rate": 3.6867220958297132e-06, + "loss": 0.0341, + "step": 5638 + }, + { + "epoch": 2.504552520541861, + "grad_norm": 0.3712133677859356, + "learning_rate": 3.6848517205271805e-06, + "loss": 0.0257, + "step": 5639 + }, + { + "epoch": 2.5049966688874084, + "grad_norm": 0.42947535443731244, + "learning_rate": 3.682981542913776e-06, + "loss": 0.0369, + "step": 5640 + }, + { + "epoch": 2.5054408172329556, + "grad_norm": 0.5274708204343582, + "learning_rate": 3.6811115632706185e-06, + "loss": 0.0324, + "step": 5641 + }, + { + "epoch": 2.5058849655785034, + "grad_norm": 0.4291181112678621, + "learning_rate": 3.6792417818787972e-06, + "loss": 0.0293, + "step": 5642 + }, + { + "epoch": 2.5063291139240507, + "grad_norm": 0.42231381186795797, + "learning_rate": 3.677372199019371e-06, + "loss": 0.0366, + "step": 5643 + }, + { + "epoch": 2.506773262269598, + "grad_norm": 0.4789995350453658, + "learning_rate": 3.6755028149733697e-06, + "loss": 0.0327, + "step": 5644 + }, + { + "epoch": 2.5072174106151452, + "grad_norm": 0.40689270267790856, + "learning_rate": 3.6736336300217964e-06, + "loss": 0.0237, + "step": 5645 + }, + { + "epoch": 2.507661558960693, + "grad_norm": 0.5913689239490344, + "learning_rate": 3.6717646444456196e-06, + "loss": 0.0366, + "step": 5646 + }, + { + "epoch": 2.5081057073062403, + "grad_norm": 0.3914862469261757, + "learning_rate": 3.669895858525778e-06, + "loss": 0.025, + "step": 5647 + }, + { + "epoch": 2.5085498556517876, + "grad_norm": 0.42201187516388383, + "learning_rate": 3.6680272725431854e-06, + "loss": 0.0336, + "step": 5648 + }, + { + "epoch": 2.5089940039973353, + "grad_norm": 0.42314579939123975, + "learning_rate": 3.6661588867787183e-06, + "loss": 0.0368, + "step": 5649 + }, + { + "epoch": 2.5094381523428826, + "grad_norm": 0.46190362474830615, + "learning_rate": 3.664290701513229e-06, + "loss": 0.0366, + "step": 5650 + }, + { + "epoch": 2.50988230068843, + "grad_norm": 0.5151193711763113, + "learning_rate": 3.662422717027536e-06, + "loss": 0.0343, + "step": 5651 + }, + { + "epoch": 2.510326449033977, + "grad_norm": 0.4683557027001918, + "learning_rate": 3.6605549336024327e-06, + "loss": 0.0502, + "step": 5652 + }, + { + "epoch": 2.510770597379525, + "grad_norm": 0.3966005086612889, + "learning_rate": 3.658687351518674e-06, + "loss": 0.0307, + "step": 5653 + }, + { + "epoch": 2.511214745725072, + "grad_norm": 0.4901749101545214, + "learning_rate": 3.656819971056992e-06, + "loss": 0.0514, + "step": 5654 + }, + { + "epoch": 2.5116588940706195, + "grad_norm": 0.44088737564599295, + "learning_rate": 3.654952792498086e-06, + "loss": 0.035, + "step": 5655 + }, + { + "epoch": 2.512103042416167, + "grad_norm": 0.637540920302593, + "learning_rate": 3.653085816122621e-06, + "loss": 0.0433, + "step": 5656 + }, + { + "epoch": 2.5125471907617145, + "grad_norm": 0.3989139686742009, + "learning_rate": 3.651219042211239e-06, + "loss": 0.0266, + "step": 5657 + }, + { + "epoch": 2.512991339107262, + "grad_norm": 0.49156765417598675, + "learning_rate": 3.649352471044548e-06, + "loss": 0.0232, + "step": 5658 + }, + { + "epoch": 2.513435487452809, + "grad_norm": 0.3796366437427131, + "learning_rate": 3.647486102903124e-06, + "loss": 0.0254, + "step": 5659 + }, + { + "epoch": 2.5138796357983564, + "grad_norm": 0.4428861844260197, + "learning_rate": 3.6456199380675128e-06, + "loss": 0.0336, + "step": 5660 + }, + { + "epoch": 2.514323784143904, + "grad_norm": 0.4660014764332606, + "learning_rate": 3.6437539768182305e-06, + "loss": 0.0379, + "step": 5661 + }, + { + "epoch": 2.5147679324894514, + "grad_norm": 0.48208527317355315, + "learning_rate": 3.6418882194357662e-06, + "loss": 0.0309, + "step": 5662 + }, + { + "epoch": 2.515212080834999, + "grad_norm": 0.46984872075734047, + "learning_rate": 3.6400226662005733e-06, + "loss": 0.0281, + "step": 5663 + }, + { + "epoch": 2.5156562291805464, + "grad_norm": 0.3674696428042144, + "learning_rate": 3.638157317393074e-06, + "loss": 0.0254, + "step": 5664 + }, + { + "epoch": 2.5161003775260937, + "grad_norm": 0.3790734362034619, + "learning_rate": 3.636292173293665e-06, + "loss": 0.0322, + "step": 5665 + }, + { + "epoch": 2.516544525871641, + "grad_norm": 0.36816453960949835, + "learning_rate": 3.634427234182708e-06, + "loss": 0.0339, + "step": 5666 + }, + { + "epoch": 2.5169886742171883, + "grad_norm": 0.310536315755628, + "learning_rate": 3.632562500340532e-06, + "loss": 0.0204, + "step": 5667 + }, + { + "epoch": 2.517432822562736, + "grad_norm": 0.5208883625261909, + "learning_rate": 3.6306979720474424e-06, + "loss": 0.0385, + "step": 5668 + }, + { + "epoch": 2.5178769709082833, + "grad_norm": 0.4448683196893281, + "learning_rate": 3.6288336495837085e-06, + "loss": 0.0357, + "step": 5669 + }, + { + "epoch": 2.5183211192538306, + "grad_norm": 0.44092160751797316, + "learning_rate": 3.6269695332295697e-06, + "loss": 0.0344, + "step": 5670 + }, + { + "epoch": 2.5187652675993784, + "grad_norm": 0.45635940839778255, + "learning_rate": 3.6251056232652327e-06, + "loss": 0.0305, + "step": 5671 + }, + { + "epoch": 2.5192094159449256, + "grad_norm": 0.5533263657116287, + "learning_rate": 3.6232419199708764e-06, + "loss": 0.0284, + "step": 5672 + }, + { + "epoch": 2.519653564290473, + "grad_norm": 0.37835546841851936, + "learning_rate": 3.6213784236266447e-06, + "loss": 0.0326, + "step": 5673 + }, + { + "epoch": 2.5200977126360202, + "grad_norm": 0.4665241718562087, + "learning_rate": 3.6195151345126556e-06, + "loss": 0.036, + "step": 5674 + }, + { + "epoch": 2.520541860981568, + "grad_norm": 0.3945669545233313, + "learning_rate": 3.6176520529089932e-06, + "loss": 0.0339, + "step": 5675 + }, + { + "epoch": 2.5209860093271153, + "grad_norm": 0.37055474601271443, + "learning_rate": 3.6157891790957096e-06, + "loss": 0.0339, + "step": 5676 + }, + { + "epoch": 2.5214301576726625, + "grad_norm": 0.492060267079228, + "learning_rate": 3.6139265133528246e-06, + "loss": 0.0316, + "step": 5677 + }, + { + "epoch": 2.5218743060182103, + "grad_norm": 0.5701337130019367, + "learning_rate": 3.612064055960331e-06, + "loss": 0.0464, + "step": 5678 + }, + { + "epoch": 2.5223184543637576, + "grad_norm": 0.39863450477642115, + "learning_rate": 3.6102018071981846e-06, + "loss": 0.0334, + "step": 5679 + }, + { + "epoch": 2.522762602709305, + "grad_norm": 0.515203363747466, + "learning_rate": 3.6083397673463172e-06, + "loss": 0.0486, + "step": 5680 + }, + { + "epoch": 2.523206751054852, + "grad_norm": 0.3893233450260606, + "learning_rate": 3.606477936684622e-06, + "loss": 0.0359, + "step": 5681 + }, + { + "epoch": 2.5236508994004, + "grad_norm": 0.45713551996484814, + "learning_rate": 3.6046163154929657e-06, + "loss": 0.0374, + "step": 5682 + }, + { + "epoch": 2.524095047745947, + "grad_norm": 0.5641320893715582, + "learning_rate": 3.6027549040511806e-06, + "loss": 0.058, + "step": 5683 + }, + { + "epoch": 2.5245391960914945, + "grad_norm": 0.3265494892390024, + "learning_rate": 3.600893702639067e-06, + "loss": 0.023, + "step": 5684 + }, + { + "epoch": 2.524983344437042, + "grad_norm": 0.41936797741330234, + "learning_rate": 3.5990327115363967e-06, + "loss": 0.0256, + "step": 5685 + }, + { + "epoch": 2.5254274927825895, + "grad_norm": 0.46801276167069494, + "learning_rate": 3.5971719310229093e-06, + "loss": 0.0254, + "step": 5686 + }, + { + "epoch": 2.525871641128137, + "grad_norm": 0.49189838872219027, + "learning_rate": 3.595311361378311e-06, + "loss": 0.0364, + "step": 5687 + }, + { + "epoch": 2.526315789473684, + "grad_norm": 0.5586944397026717, + "learning_rate": 3.593451002882275e-06, + "loss": 0.0335, + "step": 5688 + }, + { + "epoch": 2.5267599378192314, + "grad_norm": 0.4422926900473203, + "learning_rate": 3.5915908558144476e-06, + "loss": 0.0298, + "step": 5689 + }, + { + "epoch": 2.527204086164779, + "grad_norm": 0.3434949432587989, + "learning_rate": 3.5897309204544375e-06, + "loss": 0.0336, + "step": 5690 + }, + { + "epoch": 2.5276482345103264, + "grad_norm": 0.3621563955807786, + "learning_rate": 3.587871197081828e-06, + "loss": 0.0254, + "step": 5691 + }, + { + "epoch": 2.528092382855874, + "grad_norm": 0.4132257794043078, + "learning_rate": 3.586011685976164e-06, + "loss": 0.0332, + "step": 5692 + }, + { + "epoch": 2.5285365312014214, + "grad_norm": 0.5011026660419655, + "learning_rate": 3.5841523874169648e-06, + "loss": 0.0345, + "step": 5693 + }, + { + "epoch": 2.5289806795469687, + "grad_norm": 0.45248007625680975, + "learning_rate": 3.582293301683713e-06, + "loss": 0.0293, + "step": 5694 + }, + { + "epoch": 2.529424827892516, + "grad_norm": 0.7885398440569542, + "learning_rate": 3.580434429055859e-06, + "loss": 0.032, + "step": 5695 + }, + { + "epoch": 2.5298689762380633, + "grad_norm": 0.8455065643807562, + "learning_rate": 3.578575769812824e-06, + "loss": 0.0566, + "step": 5696 + }, + { + "epoch": 2.530313124583611, + "grad_norm": 0.3581406871410735, + "learning_rate": 3.576717324233998e-06, + "loss": 0.0283, + "step": 5697 + }, + { + "epoch": 2.5307572729291583, + "grad_norm": 0.37566727714871373, + "learning_rate": 3.5748590925987347e-06, + "loss": 0.0274, + "step": 5698 + }, + { + "epoch": 2.5312014212747056, + "grad_norm": 0.40595777932831095, + "learning_rate": 3.5730010751863605e-06, + "loss": 0.029, + "step": 5699 + }, + { + "epoch": 2.5316455696202533, + "grad_norm": 0.47532996991241144, + "learning_rate": 3.571143272276164e-06, + "loss": 0.0532, + "step": 5700 + }, + { + "epoch": 2.5320897179658006, + "grad_norm": 0.44327227035484773, + "learning_rate": 3.5692856841474045e-06, + "loss": 0.0355, + "step": 5701 + }, + { + "epoch": 2.532533866311348, + "grad_norm": 0.34290979724264187, + "learning_rate": 3.5674283110793105e-06, + "loss": 0.0219, + "step": 5702 + }, + { + "epoch": 2.532978014656895, + "grad_norm": 0.6517538589913022, + "learning_rate": 3.5655711533510783e-06, + "loss": 0.0354, + "step": 5703 + }, + { + "epoch": 2.533422163002443, + "grad_norm": 0.40477955297885887, + "learning_rate": 3.5637142112418684e-06, + "loss": 0.0292, + "step": 5704 + }, + { + "epoch": 2.5338663113479902, + "grad_norm": 0.47637177118935925, + "learning_rate": 3.5618574850308095e-06, + "loss": 0.0256, + "step": 5705 + }, + { + "epoch": 2.5343104596935375, + "grad_norm": 0.4050526950368258, + "learning_rate": 3.560000974997001e-06, + "loss": 0.0279, + "step": 5706 + }, + { + "epoch": 2.5347546080390853, + "grad_norm": 0.4510491005204953, + "learning_rate": 3.5581446814195054e-06, + "loss": 0.039, + "step": 5707 + }, + { + "epoch": 2.5351987563846325, + "grad_norm": 0.43059426602406603, + "learning_rate": 3.556288604577359e-06, + "loss": 0.0344, + "step": 5708 + }, + { + "epoch": 2.53564290473018, + "grad_norm": 0.38353556571368813, + "learning_rate": 3.5544327447495598e-06, + "loss": 0.033, + "step": 5709 + }, + { + "epoch": 2.536087053075727, + "grad_norm": 0.3857238286700737, + "learning_rate": 3.5525771022150746e-06, + "loss": 0.0282, + "step": 5710 + }, + { + "epoch": 2.536531201421275, + "grad_norm": 0.48091169542814494, + "learning_rate": 3.5507216772528392e-06, + "loss": 0.0354, + "step": 5711 + }, + { + "epoch": 2.536975349766822, + "grad_norm": 1.0810667528006974, + "learning_rate": 3.548866470141753e-06, + "loss": 0.0411, + "step": 5712 + }, + { + "epoch": 2.5374194981123694, + "grad_norm": 0.37822644450804105, + "learning_rate": 3.547011481160686e-06, + "loss": 0.0287, + "step": 5713 + }, + { + "epoch": 2.537863646457917, + "grad_norm": 0.44547431389156866, + "learning_rate": 3.5451567105884777e-06, + "loss": 0.0446, + "step": 5714 + }, + { + "epoch": 2.5383077948034645, + "grad_norm": 0.33403093136486245, + "learning_rate": 3.543302158703929e-06, + "loss": 0.0304, + "step": 5715 + }, + { + "epoch": 2.5387519431490118, + "grad_norm": 0.3853720547985273, + "learning_rate": 3.5414478257858097e-06, + "loss": 0.0316, + "step": 5716 + }, + { + "epoch": 2.539196091494559, + "grad_norm": 0.4078086186517431, + "learning_rate": 3.53959371211286e-06, + "loss": 0.0359, + "step": 5717 + }, + { + "epoch": 2.5396402398401063, + "grad_norm": 0.3855600056546902, + "learning_rate": 3.5377398179637807e-06, + "loss": 0.0201, + "step": 5718 + }, + { + "epoch": 2.540084388185654, + "grad_norm": 0.3712283572301996, + "learning_rate": 3.5358861436172487e-06, + "loss": 0.0327, + "step": 5719 + }, + { + "epoch": 2.5405285365312014, + "grad_norm": 0.4121457515255565, + "learning_rate": 3.5340326893518993e-06, + "loss": 0.0337, + "step": 5720 + }, + { + "epoch": 2.540972684876749, + "grad_norm": 0.4589358591248568, + "learning_rate": 3.5321794554463397e-06, + "loss": 0.0282, + "step": 5721 + }, + { + "epoch": 2.5414168332222964, + "grad_norm": 0.41283412784679185, + "learning_rate": 3.530326442179142e-06, + "loss": 0.0433, + "step": 5722 + }, + { + "epoch": 2.5418609815678437, + "grad_norm": 0.3832687248919424, + "learning_rate": 3.5284736498288452e-06, + "loss": 0.0275, + "step": 5723 + }, + { + "epoch": 2.542305129913391, + "grad_norm": 0.4017937297419123, + "learning_rate": 3.526621078673954e-06, + "loss": 0.0301, + "step": 5724 + }, + { + "epoch": 2.5427492782589383, + "grad_norm": 0.3799788724726438, + "learning_rate": 3.5247687289929443e-06, + "loss": 0.0352, + "step": 5725 + }, + { + "epoch": 2.543193426604486, + "grad_norm": 0.3381062594883136, + "learning_rate": 3.5229166010642544e-06, + "loss": 0.0237, + "step": 5726 + }, + { + "epoch": 2.5436375749500333, + "grad_norm": 0.9370246012561935, + "learning_rate": 3.521064695166292e-06, + "loss": 0.0417, + "step": 5727 + }, + { + "epoch": 2.5440817232955806, + "grad_norm": 0.5170089647125808, + "learning_rate": 3.5192130115774283e-06, + "loss": 0.0379, + "step": 5728 + }, + { + "epoch": 2.5445258716411283, + "grad_norm": 0.5478100844231081, + "learning_rate": 3.5173615505760015e-06, + "loss": 0.0327, + "step": 5729 + }, + { + "epoch": 2.5449700199866756, + "grad_norm": 0.3263956967060353, + "learning_rate": 3.5155103124403184e-06, + "loss": 0.0235, + "step": 5730 + }, + { + "epoch": 2.545414168332223, + "grad_norm": 0.4151765481340016, + "learning_rate": 3.513659297448655e-06, + "loss": 0.0315, + "step": 5731 + }, + { + "epoch": 2.54585831667777, + "grad_norm": 0.4249849360391111, + "learning_rate": 3.511808505879247e-06, + "loss": 0.037, + "step": 5732 + }, + { + "epoch": 2.546302465023318, + "grad_norm": 0.430816357070103, + "learning_rate": 3.5099579380103e-06, + "loss": 0.0313, + "step": 5733 + }, + { + "epoch": 2.546746613368865, + "grad_norm": 0.36068218836576676, + "learning_rate": 3.508107594119987e-06, + "loss": 0.0248, + "step": 5734 + }, + { + "epoch": 2.5471907617144125, + "grad_norm": 0.5603158518242672, + "learning_rate": 3.506257474486444e-06, + "loss": 0.0407, + "step": 5735 + }, + { + "epoch": 2.5476349100599602, + "grad_norm": 0.457105631896688, + "learning_rate": 3.5044075793877784e-06, + "loss": 0.0285, + "step": 5736 + }, + { + "epoch": 2.5480790584055075, + "grad_norm": 0.4513751440943639, + "learning_rate": 3.5025579091020584e-06, + "loss": 0.0295, + "step": 5737 + }, + { + "epoch": 2.548523206751055, + "grad_norm": 0.4467104682114999, + "learning_rate": 3.500708463907323e-06, + "loss": 0.0297, + "step": 5738 + }, + { + "epoch": 2.548967355096602, + "grad_norm": 0.3775580565507929, + "learning_rate": 3.498859244081573e-06, + "loss": 0.0302, + "step": 5739 + }, + { + "epoch": 2.54941150344215, + "grad_norm": 0.4736436006509606, + "learning_rate": 3.4970102499027787e-06, + "loss": 0.0324, + "step": 5740 + }, + { + "epoch": 2.549855651787697, + "grad_norm": 0.38970978243484516, + "learning_rate": 3.4951614816488733e-06, + "loss": 0.0239, + "step": 5741 + }, + { + "epoch": 2.5502998001332444, + "grad_norm": 0.3858729170075839, + "learning_rate": 3.4933129395977627e-06, + "loss": 0.0282, + "step": 5742 + }, + { + "epoch": 2.550743948478792, + "grad_norm": 0.46868923999899664, + "learning_rate": 3.491464624027311e-06, + "loss": 0.0309, + "step": 5743 + }, + { + "epoch": 2.5511880968243394, + "grad_norm": 0.42676283372780777, + "learning_rate": 3.489616535215351e-06, + "loss": 0.0288, + "step": 5744 + }, + { + "epoch": 2.5516322451698867, + "grad_norm": 0.47537306808660484, + "learning_rate": 3.487768673439684e-06, + "loss": 0.0409, + "step": 5745 + }, + { + "epoch": 2.552076393515434, + "grad_norm": 0.5509640851909691, + "learning_rate": 3.4859210389780717e-06, + "loss": 0.0537, + "step": 5746 + }, + { + "epoch": 2.5525205418609813, + "grad_norm": 0.4532699668327261, + "learning_rate": 3.484073632108248e-06, + "loss": 0.0374, + "step": 5747 + }, + { + "epoch": 2.552964690206529, + "grad_norm": 0.30162200474796635, + "learning_rate": 3.4822264531079074e-06, + "loss": 0.0346, + "step": 5748 + }, + { + "epoch": 2.5534088385520763, + "grad_norm": 0.41586878305661834, + "learning_rate": 3.4803795022547152e-06, + "loss": 0.0385, + "step": 5749 + }, + { + "epoch": 2.553852986897624, + "grad_norm": 0.5284306714224316, + "learning_rate": 3.478532779826297e-06, + "loss": 0.0329, + "step": 5750 + }, + { + "epoch": 2.5542971352431714, + "grad_norm": 0.3711615788486314, + "learning_rate": 3.476686286100247e-06, + "loss": 0.0277, + "step": 5751 + }, + { + "epoch": 2.5547412835887187, + "grad_norm": 0.4470649615235087, + "learning_rate": 3.4748400213541233e-06, + "loss": 0.0463, + "step": 5752 + }, + { + "epoch": 2.555185431934266, + "grad_norm": 0.40717240259190046, + "learning_rate": 3.4729939858654548e-06, + "loss": 0.0296, + "step": 5753 + }, + { + "epoch": 2.5556295802798132, + "grad_norm": 0.4846910790786083, + "learning_rate": 3.471148179911728e-06, + "loss": 0.0404, + "step": 5754 + }, + { + "epoch": 2.556073728625361, + "grad_norm": 0.37627232321927323, + "learning_rate": 3.4693026037704012e-06, + "loss": 0.0334, + "step": 5755 + }, + { + "epoch": 2.5565178769709083, + "grad_norm": 0.34629139309877494, + "learning_rate": 3.467457257718896e-06, + "loss": 0.0272, + "step": 5756 + }, + { + "epoch": 2.5569620253164556, + "grad_norm": 0.3421995312642631, + "learning_rate": 3.4656121420345968e-06, + "loss": 0.0317, + "step": 5757 + }, + { + "epoch": 2.5574061736620033, + "grad_norm": 0.44077802939094335, + "learning_rate": 3.463767256994856e-06, + "loss": 0.0549, + "step": 5758 + }, + { + "epoch": 2.5578503220075506, + "grad_norm": 0.3954224399718614, + "learning_rate": 3.461922602876995e-06, + "loss": 0.0334, + "step": 5759 + }, + { + "epoch": 2.558294470353098, + "grad_norm": 0.41169694619074404, + "learning_rate": 3.460078179958294e-06, + "loss": 0.0341, + "step": 5760 + }, + { + "epoch": 2.558738618698645, + "grad_norm": 0.4812985244340657, + "learning_rate": 3.458233988516e-06, + "loss": 0.0407, + "step": 5761 + }, + { + "epoch": 2.559182767044193, + "grad_norm": 0.3331989432004887, + "learning_rate": 3.4563900288273287e-06, + "loss": 0.0247, + "step": 5762 + }, + { + "epoch": 2.55962691538974, + "grad_norm": 0.3885261825147752, + "learning_rate": 3.454546301169458e-06, + "loss": 0.0294, + "step": 5763 + }, + { + "epoch": 2.5600710637352875, + "grad_norm": 0.43455546109798127, + "learning_rate": 3.4527028058195276e-06, + "loss": 0.0276, + "step": 5764 + }, + { + "epoch": 2.560515212080835, + "grad_norm": 0.41019774224604005, + "learning_rate": 3.4508595430546516e-06, + "loss": 0.0376, + "step": 5765 + }, + { + "epoch": 2.5609593604263825, + "grad_norm": 0.5340205931151287, + "learning_rate": 3.4490165131519027e-06, + "loss": 0.0363, + "step": 5766 + }, + { + "epoch": 2.56140350877193, + "grad_norm": 0.32981175279651753, + "learning_rate": 3.4471737163883178e-06, + "loss": 0.0341, + "step": 5767 + }, + { + "epoch": 2.561847657117477, + "grad_norm": 0.4191900996748185, + "learning_rate": 3.4453311530409008e-06, + "loss": 0.0336, + "step": 5768 + }, + { + "epoch": 2.5622918054630244, + "grad_norm": 0.4533154388512584, + "learning_rate": 3.4434888233866205e-06, + "loss": 0.0379, + "step": 5769 + }, + { + "epoch": 2.562735953808572, + "grad_norm": 0.4083356757917847, + "learning_rate": 3.4416467277024097e-06, + "loss": 0.0256, + "step": 5770 + }, + { + "epoch": 2.5631801021541194, + "grad_norm": 0.46425748132873407, + "learning_rate": 3.4398048662651693e-06, + "loss": 0.0455, + "step": 5771 + }, + { + "epoch": 2.563624250499667, + "grad_norm": 0.3043869275862561, + "learning_rate": 3.4379632393517593e-06, + "loss": 0.022, + "step": 5772 + }, + { + "epoch": 2.5640683988452144, + "grad_norm": 0.7475044005307006, + "learning_rate": 3.43612184723901e-06, + "loss": 0.0502, + "step": 5773 + }, + { + "epoch": 2.5645125471907617, + "grad_norm": 0.32220506593319287, + "learning_rate": 3.4342806902037118e-06, + "loss": 0.0271, + "step": 5774 + }, + { + "epoch": 2.564956695536309, + "grad_norm": 0.36616059681338226, + "learning_rate": 3.4324397685226217e-06, + "loss": 0.0327, + "step": 5775 + }, + { + "epoch": 2.5654008438818563, + "grad_norm": 0.4598617313466593, + "learning_rate": 3.4305990824724645e-06, + "loss": 0.0255, + "step": 5776 + }, + { + "epoch": 2.565844992227404, + "grad_norm": 0.4169024524406747, + "learning_rate": 3.428758632329925e-06, + "loss": 0.0261, + "step": 5777 + }, + { + "epoch": 2.5662891405729513, + "grad_norm": 0.46153415454886376, + "learning_rate": 3.426918418371652e-06, + "loss": 0.0313, + "step": 5778 + }, + { + "epoch": 2.5667332889184986, + "grad_norm": 0.5358677284520018, + "learning_rate": 3.4250784408742644e-06, + "loss": 0.045, + "step": 5779 + }, + { + "epoch": 2.5671774372640463, + "grad_norm": 0.4437829933438522, + "learning_rate": 3.4232387001143396e-06, + "loss": 0.0423, + "step": 5780 + }, + { + "epoch": 2.5676215856095936, + "grad_norm": 0.39105850673238723, + "learning_rate": 3.4213991963684212e-06, + "loss": 0.0353, + "step": 5781 + }, + { + "epoch": 2.568065733955141, + "grad_norm": 0.5049529429986211, + "learning_rate": 3.419559929913021e-06, + "loss": 0.0345, + "step": 5782 + }, + { + "epoch": 2.568509882300688, + "grad_norm": 0.4292519880189004, + "learning_rate": 3.4177209010246104e-06, + "loss": 0.0372, + "step": 5783 + }, + { + "epoch": 2.568954030646236, + "grad_norm": 0.373978887044069, + "learning_rate": 3.415882109979627e-06, + "loss": 0.0428, + "step": 5784 + }, + { + "epoch": 2.5693981789917832, + "grad_norm": 0.500647171885995, + "learning_rate": 3.4140435570544708e-06, + "loss": 0.036, + "step": 5785 + }, + { + "epoch": 2.5698423273373305, + "grad_norm": 0.5585745761796531, + "learning_rate": 3.4122052425255097e-06, + "loss": 0.0371, + "step": 5786 + }, + { + "epoch": 2.5702864756828783, + "grad_norm": 0.40623789381422887, + "learning_rate": 3.4103671666690706e-06, + "loss": 0.0346, + "step": 5787 + }, + { + "epoch": 2.5707306240284256, + "grad_norm": 0.5204708462096757, + "learning_rate": 3.4085293297614513e-06, + "loss": 0.0341, + "step": 5788 + }, + { + "epoch": 2.571174772373973, + "grad_norm": 0.38514191058110864, + "learning_rate": 3.406691732078907e-06, + "loss": 0.0247, + "step": 5789 + }, + { + "epoch": 2.57161892071952, + "grad_norm": 0.47415946915349233, + "learning_rate": 3.4048543738976624e-06, + "loss": 0.03, + "step": 5790 + }, + { + "epoch": 2.572063069065068, + "grad_norm": 0.35195512662300593, + "learning_rate": 3.4030172554939022e-06, + "loss": 0.0179, + "step": 5791 + }, + { + "epoch": 2.572507217410615, + "grad_norm": 0.455360916505429, + "learning_rate": 3.401180377143774e-06, + "loss": 0.0355, + "step": 5792 + }, + { + "epoch": 2.5729513657561625, + "grad_norm": 0.6594611426447926, + "learning_rate": 3.399343739123395e-06, + "loss": 0.04, + "step": 5793 + }, + { + "epoch": 2.57339551410171, + "grad_norm": 0.30764478468780626, + "learning_rate": 3.3975073417088445e-06, + "loss": 0.024, + "step": 5794 + }, + { + "epoch": 2.5738396624472575, + "grad_norm": 0.5139517791311652, + "learning_rate": 3.3956711851761603e-06, + "loss": 0.0386, + "step": 5795 + }, + { + "epoch": 2.5742838107928048, + "grad_norm": 0.44276110456272694, + "learning_rate": 3.393835269801351e-06, + "loss": 0.0276, + "step": 5796 + }, + { + "epoch": 2.574727959138352, + "grad_norm": 0.6084624820775043, + "learning_rate": 3.3919995958603845e-06, + "loss": 0.0349, + "step": 5797 + }, + { + "epoch": 2.5751721074838994, + "grad_norm": 0.6187475898080996, + "learning_rate": 3.3901641636291925e-06, + "loss": 0.0369, + "step": 5798 + }, + { + "epoch": 2.575616255829447, + "grad_norm": 0.4166403983258688, + "learning_rate": 3.388328973383673e-06, + "loss": 0.028, + "step": 5799 + }, + { + "epoch": 2.5760604041749944, + "grad_norm": 0.37895711615417893, + "learning_rate": 3.3864940253996885e-06, + "loss": 0.0309, + "step": 5800 + }, + { + "epoch": 2.576504552520542, + "grad_norm": 0.4165330638891502, + "learning_rate": 3.3846593199530598e-06, + "loss": 0.0324, + "step": 5801 + }, + { + "epoch": 2.5769487008660894, + "grad_norm": 0.3850803256916831, + "learning_rate": 3.3828248573195744e-06, + "loss": 0.0243, + "step": 5802 + }, + { + "epoch": 2.5773928492116367, + "grad_norm": 0.4535252370266945, + "learning_rate": 3.3809906377749853e-06, + "loss": 0.026, + "step": 5803 + }, + { + "epoch": 2.577836997557184, + "grad_norm": 0.4473597091096733, + "learning_rate": 3.3791566615950034e-06, + "loss": 0.0446, + "step": 5804 + }, + { + "epoch": 2.5782811459027313, + "grad_norm": 0.4461365886821962, + "learning_rate": 3.37732292905531e-06, + "loss": 0.0353, + "step": 5805 + }, + { + "epoch": 2.578725294248279, + "grad_norm": 0.4044232712398919, + "learning_rate": 3.375489440431544e-06, + "loss": 0.0356, + "step": 5806 + }, + { + "epoch": 2.5791694425938263, + "grad_norm": 0.4100077927626587, + "learning_rate": 3.373656195999312e-06, + "loss": 0.0288, + "step": 5807 + }, + { + "epoch": 2.5796135909393736, + "grad_norm": 0.49681714136189903, + "learning_rate": 3.3718231960341807e-06, + "loss": 0.0392, + "step": 5808 + }, + { + "epoch": 2.5800577392849213, + "grad_norm": 0.3508735282517198, + "learning_rate": 3.3699904408116778e-06, + "loss": 0.0266, + "step": 5809 + }, + { + "epoch": 2.5805018876304686, + "grad_norm": 0.4685055266479743, + "learning_rate": 3.368157930607303e-06, + "loss": 0.0322, + "step": 5810 + }, + { + "epoch": 2.580946035976016, + "grad_norm": 0.3392092135667048, + "learning_rate": 3.3663256656965115e-06, + "loss": 0.0266, + "step": 5811 + }, + { + "epoch": 2.581390184321563, + "grad_norm": 0.3673851977352648, + "learning_rate": 3.364493646354724e-06, + "loss": 0.0291, + "step": 5812 + }, + { + "epoch": 2.581834332667111, + "grad_norm": 0.4826180223086163, + "learning_rate": 3.3626618728573233e-06, + "loss": 0.0366, + "step": 5813 + }, + { + "epoch": 2.5822784810126582, + "grad_norm": 0.46379192115203366, + "learning_rate": 3.3608303454796578e-06, + "loss": 0.0359, + "step": 5814 + }, + { + "epoch": 2.5827226293582055, + "grad_norm": 0.33507634737295205, + "learning_rate": 3.3589990644970325e-06, + "loss": 0.0293, + "step": 5815 + }, + { + "epoch": 2.5831667777037532, + "grad_norm": 0.394019845804789, + "learning_rate": 3.3571680301847265e-06, + "loss": 0.0287, + "step": 5816 + }, + { + "epoch": 2.5836109260493005, + "grad_norm": 0.4267750495828387, + "learning_rate": 3.355337242817972e-06, + "loss": 0.0225, + "step": 5817 + }, + { + "epoch": 2.584055074394848, + "grad_norm": 0.40408538263164145, + "learning_rate": 3.3535067026719683e-06, + "loss": 0.0354, + "step": 5818 + }, + { + "epoch": 2.584499222740395, + "grad_norm": 0.3264537051893323, + "learning_rate": 3.3516764100218744e-06, + "loss": 0.0184, + "step": 5819 + }, + { + "epoch": 2.584943371085943, + "grad_norm": 0.5823548169064077, + "learning_rate": 3.3498463651428183e-06, + "loss": 0.0555, + "step": 5820 + }, + { + "epoch": 2.58538751943149, + "grad_norm": 0.6018406880891104, + "learning_rate": 3.348016568309882e-06, + "loss": 0.0323, + "step": 5821 + }, + { + "epoch": 2.5858316677770374, + "grad_norm": 0.4499473097806689, + "learning_rate": 3.3461870197981205e-06, + "loss": 0.0313, + "step": 5822 + }, + { + "epoch": 2.586275816122585, + "grad_norm": 0.4573988840299094, + "learning_rate": 3.3443577198825416e-06, + "loss": 0.0217, + "step": 5823 + }, + { + "epoch": 2.5867199644681325, + "grad_norm": 0.47710512867182214, + "learning_rate": 3.342528668838123e-06, + "loss": 0.0305, + "step": 5824 + }, + { + "epoch": 2.5871641128136798, + "grad_norm": 0.3333521539172945, + "learning_rate": 3.3406998669398015e-06, + "loss": 0.0276, + "step": 5825 + }, + { + "epoch": 2.587608261159227, + "grad_norm": 0.4557037415155258, + "learning_rate": 3.338871314462474e-06, + "loss": 0.0283, + "step": 5826 + }, + { + "epoch": 2.5880524095047743, + "grad_norm": 0.4444177366240355, + "learning_rate": 3.337043011681007e-06, + "loss": 0.0328, + "step": 5827 + }, + { + "epoch": 2.588496557850322, + "grad_norm": 0.3980372800704896, + "learning_rate": 3.335214958870225e-06, + "loss": 0.0312, + "step": 5828 + }, + { + "epoch": 2.5889407061958694, + "grad_norm": 0.38163158317470414, + "learning_rate": 3.333387156304914e-06, + "loss": 0.021, + "step": 5829 + }, + { + "epoch": 2.589384854541417, + "grad_norm": 0.3444340204979025, + "learning_rate": 3.3315596042598235e-06, + "loss": 0.0283, + "step": 5830 + }, + { + "epoch": 2.5898290028869644, + "grad_norm": 0.4610922606737187, + "learning_rate": 3.3297323030096672e-06, + "loss": 0.032, + "step": 5831 + }, + { + "epoch": 2.5902731512325117, + "grad_norm": 0.41870835356631375, + "learning_rate": 3.327905252829117e-06, + "loss": 0.0448, + "step": 5832 + }, + { + "epoch": 2.590717299578059, + "grad_norm": 0.4779584466171409, + "learning_rate": 3.326078453992813e-06, + "loss": 0.0242, + "step": 5833 + }, + { + "epoch": 2.5911614479236063, + "grad_norm": 0.5325628313786069, + "learning_rate": 3.324251906775351e-06, + "loss": 0.0395, + "step": 5834 + }, + { + "epoch": 2.591605596269154, + "grad_norm": 0.358759607760571, + "learning_rate": 3.3224256114512953e-06, + "loss": 0.0316, + "step": 5835 + }, + { + "epoch": 2.5920497446147013, + "grad_norm": 0.43363197580686524, + "learning_rate": 3.3205995682951666e-06, + "loss": 0.0334, + "step": 5836 + }, + { + "epoch": 2.5924938929602486, + "grad_norm": 0.4445842523123569, + "learning_rate": 3.31877377758145e-06, + "loss": 0.033, + "step": 5837 + }, + { + "epoch": 2.5929380413057963, + "grad_norm": 0.603779398622737, + "learning_rate": 3.316948239584592e-06, + "loss": 0.0449, + "step": 5838 + }, + { + "epoch": 2.5933821896513436, + "grad_norm": 0.4392799290696822, + "learning_rate": 3.3151229545790066e-06, + "loss": 0.0384, + "step": 5839 + }, + { + "epoch": 2.593826337996891, + "grad_norm": 0.38613834204137615, + "learning_rate": 3.3132979228390615e-06, + "loss": 0.0248, + "step": 5840 + }, + { + "epoch": 2.594270486342438, + "grad_norm": 0.31483135068991924, + "learning_rate": 3.3114731446390897e-06, + "loss": 0.0204, + "step": 5841 + }, + { + "epoch": 2.594714634687986, + "grad_norm": 0.42067924470792134, + "learning_rate": 3.3096486202533884e-06, + "loss": 0.0312, + "step": 5842 + }, + { + "epoch": 2.595158783033533, + "grad_norm": 0.4747651071435795, + "learning_rate": 3.3078243499562126e-06, + "loss": 0.0325, + "step": 5843 + }, + { + "epoch": 2.5956029313790805, + "grad_norm": 0.3567548194704181, + "learning_rate": 3.3060003340217822e-06, + "loss": 0.0241, + "step": 5844 + }, + { + "epoch": 2.5960470797246282, + "grad_norm": 0.3574932064483991, + "learning_rate": 3.3041765727242773e-06, + "loss": 0.0302, + "step": 5845 + }, + { + "epoch": 2.5964912280701755, + "grad_norm": 0.8739424998669374, + "learning_rate": 3.302353066337842e-06, + "loss": 0.0458, + "step": 5846 + }, + { + "epoch": 2.596935376415723, + "grad_norm": 0.4781713811806308, + "learning_rate": 3.300529815136577e-06, + "loss": 0.0328, + "step": 5847 + }, + { + "epoch": 2.59737952476127, + "grad_norm": 0.4686560316809108, + "learning_rate": 3.2987068193945515e-06, + "loss": 0.0392, + "step": 5848 + }, + { + "epoch": 2.597823673106818, + "grad_norm": 0.7682784843583998, + "learning_rate": 3.296884079385789e-06, + "loss": 0.0432, + "step": 5849 + }, + { + "epoch": 2.598267821452365, + "grad_norm": 0.43912852313892203, + "learning_rate": 3.2950615953842816e-06, + "loss": 0.0357, + "step": 5850 + }, + { + "epoch": 2.5987119697979124, + "grad_norm": 0.4363862214055466, + "learning_rate": 3.293239367663978e-06, + "loss": 0.0355, + "step": 5851 + }, + { + "epoch": 2.59915611814346, + "grad_norm": 0.4194478678628829, + "learning_rate": 3.2914173964987905e-06, + "loss": 0.0376, + "step": 5852 + }, + { + "epoch": 2.5996002664890074, + "grad_norm": 0.3939580447848396, + "learning_rate": 3.289595682162593e-06, + "loss": 0.0351, + "step": 5853 + }, + { + "epoch": 2.6000444148345547, + "grad_norm": 0.3563672060439901, + "learning_rate": 3.2877742249292174e-06, + "loss": 0.0264, + "step": 5854 + }, + { + "epoch": 2.600488563180102, + "grad_norm": 0.45772110095911156, + "learning_rate": 3.2859530250724604e-06, + "loss": 0.0286, + "step": 5855 + }, + { + "epoch": 2.6009327115256493, + "grad_norm": 0.3788943790281068, + "learning_rate": 3.284132082866083e-06, + "loss": 0.0231, + "step": 5856 + }, + { + "epoch": 2.601376859871197, + "grad_norm": 0.42056256980059764, + "learning_rate": 3.2823113985837996e-06, + "loss": 0.0301, + "step": 5857 + }, + { + "epoch": 2.6018210082167443, + "grad_norm": 0.4731371391351043, + "learning_rate": 3.2804909724992917e-06, + "loss": 0.0421, + "step": 5858 + }, + { + "epoch": 2.602265156562292, + "grad_norm": 0.4171211336356501, + "learning_rate": 3.2786708048862e-06, + "loss": 0.0267, + "step": 5859 + }, + { + "epoch": 2.6027093049078394, + "grad_norm": 0.33335158042812324, + "learning_rate": 3.276850896018128e-06, + "loss": 0.0287, + "step": 5860 + }, + { + "epoch": 2.6031534532533867, + "grad_norm": 0.6871125152697936, + "learning_rate": 3.2750312461686346e-06, + "loss": 0.0253, + "step": 5861 + }, + { + "epoch": 2.603597601598934, + "grad_norm": 0.38868974447379917, + "learning_rate": 3.273211855611248e-06, + "loss": 0.0384, + "step": 5862 + }, + { + "epoch": 2.6040417499444812, + "grad_norm": 0.41427271665417736, + "learning_rate": 3.271392724619454e-06, + "loss": 0.0283, + "step": 5863 + }, + { + "epoch": 2.604485898290029, + "grad_norm": 0.47635109894278754, + "learning_rate": 3.2695738534666964e-06, + "loss": 0.0307, + "step": 5864 + }, + { + "epoch": 2.6049300466355763, + "grad_norm": 0.43305461847503646, + "learning_rate": 3.2677552424263836e-06, + "loss": 0.0323, + "step": 5865 + }, + { + "epoch": 2.6053741949811235, + "grad_norm": 0.5925027302243593, + "learning_rate": 3.2659368917718813e-06, + "loss": 0.0349, + "step": 5866 + }, + { + "epoch": 2.6058183433266713, + "grad_norm": 0.5670868601287159, + "learning_rate": 3.264118801776524e-06, + "loss": 0.0272, + "step": 5867 + }, + { + "epoch": 2.6062624916722186, + "grad_norm": 0.4639294675991596, + "learning_rate": 3.262300972713598e-06, + "loss": 0.0281, + "step": 5868 + }, + { + "epoch": 2.606706640017766, + "grad_norm": 0.5113196767933541, + "learning_rate": 3.2604834048563527e-06, + "loss": 0.0258, + "step": 5869 + }, + { + "epoch": 2.607150788363313, + "grad_norm": 0.37055184296693755, + "learning_rate": 3.2586660984780017e-06, + "loss": 0.0292, + "step": 5870 + }, + { + "epoch": 2.607594936708861, + "grad_norm": 0.5340070208292783, + "learning_rate": 3.256849053851716e-06, + "loss": 0.0362, + "step": 5871 + }, + { + "epoch": 2.608039085054408, + "grad_norm": 0.6461515771963295, + "learning_rate": 3.2550322712506265e-06, + "loss": 0.0378, + "step": 5872 + }, + { + "epoch": 2.6084832333999555, + "grad_norm": 0.43751867677053585, + "learning_rate": 3.2532157509478313e-06, + "loss": 0.0297, + "step": 5873 + }, + { + "epoch": 2.608927381745503, + "grad_norm": 0.4731032790298419, + "learning_rate": 3.2513994932163806e-06, + "loss": 0.0426, + "step": 5874 + }, + { + "epoch": 2.6093715300910505, + "grad_norm": 0.40443116847842614, + "learning_rate": 3.2495834983292894e-06, + "loss": 0.0315, + "step": 5875 + }, + { + "epoch": 2.609815678436598, + "grad_norm": 0.4867165911113075, + "learning_rate": 3.2477677665595333e-06, + "loss": 0.0319, + "step": 5876 + }, + { + "epoch": 2.610259826782145, + "grad_norm": 0.42932234052521123, + "learning_rate": 3.2459522981800473e-06, + "loss": 0.0465, + "step": 5877 + }, + { + "epoch": 2.610703975127693, + "grad_norm": 0.35081568597263124, + "learning_rate": 3.244137093463725e-06, + "loss": 0.0194, + "step": 5878 + }, + { + "epoch": 2.61114812347324, + "grad_norm": 0.5690602488132523, + "learning_rate": 3.2423221526834253e-06, + "loss": 0.0381, + "step": 5879 + }, + { + "epoch": 2.6115922718187874, + "grad_norm": 0.36352934356142463, + "learning_rate": 3.2405074761119648e-06, + "loss": 0.0282, + "step": 5880 + }, + { + "epoch": 2.612036420164335, + "grad_norm": 0.3774759920653436, + "learning_rate": 3.2386930640221193e-06, + "loss": 0.0344, + "step": 5881 + }, + { + "epoch": 2.6124805685098824, + "grad_norm": 0.5371544305687926, + "learning_rate": 3.2368789166866244e-06, + "loss": 0.0497, + "step": 5882 + }, + { + "epoch": 2.6129247168554297, + "grad_norm": 0.38943009927535477, + "learning_rate": 3.2350650343781775e-06, + "loss": 0.0227, + "step": 5883 + }, + { + "epoch": 2.613368865200977, + "grad_norm": 0.4501911900625063, + "learning_rate": 3.2332514173694396e-06, + "loss": 0.0395, + "step": 5884 + }, + { + "epoch": 2.6138130135465243, + "grad_norm": 0.4234499617743984, + "learning_rate": 3.2314380659330246e-06, + "loss": 0.0301, + "step": 5885 + }, + { + "epoch": 2.614257161892072, + "grad_norm": 0.4318219863554864, + "learning_rate": 3.22962498034151e-06, + "loss": 0.0294, + "step": 5886 + }, + { + "epoch": 2.6147013102376193, + "grad_norm": 0.503946757308322, + "learning_rate": 3.227812160867436e-06, + "loss": 0.0394, + "step": 5887 + }, + { + "epoch": 2.615145458583167, + "grad_norm": 0.3890770124231559, + "learning_rate": 3.2259996077832976e-06, + "loss": 0.0267, + "step": 5888 + }, + { + "epoch": 2.6155896069287143, + "grad_norm": 0.3697580136655012, + "learning_rate": 3.2241873213615514e-06, + "loss": 0.034, + "step": 5889 + }, + { + "epoch": 2.6160337552742616, + "grad_norm": 0.5649235090654835, + "learning_rate": 3.2223753018746186e-06, + "loss": 0.0272, + "step": 5890 + }, + { + "epoch": 2.616477903619809, + "grad_norm": 0.3708345448824024, + "learning_rate": 3.220563549594874e-06, + "loss": 0.035, + "step": 5891 + }, + { + "epoch": 2.616922051965356, + "grad_norm": 0.3432122520668072, + "learning_rate": 3.2187520647946547e-06, + "loss": 0.0283, + "step": 5892 + }, + { + "epoch": 2.617366200310904, + "grad_norm": 0.42022838385345057, + "learning_rate": 3.2169408477462594e-06, + "loss": 0.0364, + "step": 5893 + }, + { + "epoch": 2.6178103486564512, + "grad_norm": 0.44719094760718237, + "learning_rate": 3.2151298987219437e-06, + "loss": 0.0325, + "step": 5894 + }, + { + "epoch": 2.6182544970019985, + "grad_norm": 0.35618319174657215, + "learning_rate": 3.2133192179939215e-06, + "loss": 0.0256, + "step": 5895 + }, + { + "epoch": 2.6186986453475463, + "grad_norm": 0.3925862521483163, + "learning_rate": 3.2115088058343725e-06, + "loss": 0.0282, + "step": 5896 + }, + { + "epoch": 2.6191427936930936, + "grad_norm": 0.45474329229543986, + "learning_rate": 3.209698662515432e-06, + "loss": 0.0296, + "step": 5897 + }, + { + "epoch": 2.619586942038641, + "grad_norm": 0.3953757491585112, + "learning_rate": 3.2078887883091948e-06, + "loss": 0.0422, + "step": 5898 + }, + { + "epoch": 2.620031090384188, + "grad_norm": 0.36604722451208876, + "learning_rate": 3.2060791834877136e-06, + "loss": 0.029, + "step": 5899 + }, + { + "epoch": 2.620475238729736, + "grad_norm": 0.32800664474353197, + "learning_rate": 3.204269848323004e-06, + "loss": 0.0295, + "step": 5900 + }, + { + "epoch": 2.620919387075283, + "grad_norm": 0.5458287273191036, + "learning_rate": 3.2024607830870424e-06, + "loss": 0.0475, + "step": 5901 + }, + { + "epoch": 2.6213635354208304, + "grad_norm": 0.43819732935782096, + "learning_rate": 3.2006519880517597e-06, + "loss": 0.0452, + "step": 5902 + }, + { + "epoch": 2.621807683766378, + "grad_norm": 0.36142275590555617, + "learning_rate": 3.1988434634890476e-06, + "loss": 0.0277, + "step": 5903 + }, + { + "epoch": 2.6222518321119255, + "grad_norm": 0.4477755001983317, + "learning_rate": 3.197035209670761e-06, + "loss": 0.0294, + "step": 5904 + }, + { + "epoch": 2.6226959804574728, + "grad_norm": 0.46965829732778464, + "learning_rate": 3.1952272268687083e-06, + "loss": 0.0336, + "step": 5905 + }, + { + "epoch": 2.62314012880302, + "grad_norm": 0.39994107522556216, + "learning_rate": 3.19341951535466e-06, + "loss": 0.0214, + "step": 5906 + }, + { + "epoch": 2.6235842771485673, + "grad_norm": 0.4238276596672287, + "learning_rate": 3.1916120754003475e-06, + "loss": 0.0269, + "step": 5907 + }, + { + "epoch": 2.624028425494115, + "grad_norm": 1.8594679605411257, + "learning_rate": 3.1898049072774605e-06, + "loss": 0.0367, + "step": 5908 + }, + { + "epoch": 2.6244725738396624, + "grad_norm": 0.31000445284347417, + "learning_rate": 3.1879980112576457e-06, + "loss": 0.0248, + "step": 5909 + }, + { + "epoch": 2.62491672218521, + "grad_norm": 0.40069203774397927, + "learning_rate": 3.1861913876125093e-06, + "loss": 0.0314, + "step": 5910 + }, + { + "epoch": 2.6253608705307574, + "grad_norm": 0.39410237773498447, + "learning_rate": 3.1843850366136198e-06, + "loss": 0.0293, + "step": 5911 + }, + { + "epoch": 2.6258050188763047, + "grad_norm": 0.3268432306409947, + "learning_rate": 3.182578958532499e-06, + "loss": 0.0266, + "step": 5912 + }, + { + "epoch": 2.626249167221852, + "grad_norm": 0.45108591338261844, + "learning_rate": 3.180773153640635e-06, + "loss": 0.0341, + "step": 5913 + }, + { + "epoch": 2.6266933155673993, + "grad_norm": 0.43982934139295143, + "learning_rate": 3.178967622209469e-06, + "loss": 0.0328, + "step": 5914 + }, + { + "epoch": 2.627137463912947, + "grad_norm": 0.34491560520514963, + "learning_rate": 3.177162364510404e-06, + "loss": 0.0243, + "step": 5915 + }, + { + "epoch": 2.6275816122584943, + "grad_norm": 0.404112232349346, + "learning_rate": 3.175357380814799e-06, + "loss": 0.0233, + "step": 5916 + }, + { + "epoch": 2.628025760604042, + "grad_norm": 0.4687180614976695, + "learning_rate": 3.1735526713939757e-06, + "loss": 0.0337, + "step": 5917 + }, + { + "epoch": 2.6284699089495893, + "grad_norm": 0.37747833327236135, + "learning_rate": 3.1717482365192106e-06, + "loss": 0.0263, + "step": 5918 + }, + { + "epoch": 2.6289140572951366, + "grad_norm": 0.38430475009185994, + "learning_rate": 3.1699440764617432e-06, + "loss": 0.0398, + "step": 5919 + }, + { + "epoch": 2.629358205640684, + "grad_norm": 0.39716398492985894, + "learning_rate": 3.1681401914927678e-06, + "loss": 0.0324, + "step": 5920 + }, + { + "epoch": 2.629802353986231, + "grad_norm": 0.40985834454742587, + "learning_rate": 3.1663365818834406e-06, + "loss": 0.0365, + "step": 5921 + }, + { + "epoch": 2.630246502331779, + "grad_norm": 0.32571897096876656, + "learning_rate": 3.1645332479048734e-06, + "loss": 0.03, + "step": 5922 + }, + { + "epoch": 2.630690650677326, + "grad_norm": 0.41303965455433855, + "learning_rate": 3.1627301898281364e-06, + "loss": 0.0256, + "step": 5923 + }, + { + "epoch": 2.6311347990228735, + "grad_norm": 0.4244488216427459, + "learning_rate": 3.1609274079242625e-06, + "loss": 0.0383, + "step": 5924 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 0.5719404451011891, + "learning_rate": 3.159124902464241e-06, + "loss": 0.0549, + "step": 5925 + }, + { + "epoch": 2.6320230957139685, + "grad_norm": 0.4271035366652883, + "learning_rate": 3.1573226737190164e-06, + "loss": 0.0365, + "step": 5926 + }, + { + "epoch": 2.632467244059516, + "grad_norm": 0.3779054549587856, + "learning_rate": 3.155520721959496e-06, + "loss": 0.0338, + "step": 5927 + }, + { + "epoch": 2.632911392405063, + "grad_norm": 0.3266233056591113, + "learning_rate": 3.1537190474565437e-06, + "loss": 0.0219, + "step": 5928 + }, + { + "epoch": 2.633355540750611, + "grad_norm": 0.6026007410470748, + "learning_rate": 3.151917650480979e-06, + "loss": 0.0593, + "step": 5929 + }, + { + "epoch": 2.633799689096158, + "grad_norm": 0.47698196221281425, + "learning_rate": 3.1501165313035877e-06, + "loss": 0.0393, + "step": 5930 + }, + { + "epoch": 2.6342438374417054, + "grad_norm": 0.47225489877722016, + "learning_rate": 3.148315690195104e-06, + "loss": 0.036, + "step": 5931 + }, + { + "epoch": 2.634687985787253, + "grad_norm": 0.32874590760012345, + "learning_rate": 3.146515127426228e-06, + "loss": 0.021, + "step": 5932 + }, + { + "epoch": 2.6351321341328005, + "grad_norm": 0.4293763106482794, + "learning_rate": 3.144714843267613e-06, + "loss": 0.0334, + "step": 5933 + }, + { + "epoch": 2.6355762824783477, + "grad_norm": 0.34918767908800713, + "learning_rate": 3.142914837989873e-06, + "loss": 0.0235, + "step": 5934 + }, + { + "epoch": 2.636020430823895, + "grad_norm": 0.514273957879937, + "learning_rate": 3.1411151118635774e-06, + "loss": 0.0335, + "step": 5935 + }, + { + "epoch": 2.6364645791694423, + "grad_norm": 0.42893295828776457, + "learning_rate": 3.1393156651592597e-06, + "loss": 0.0271, + "step": 5936 + }, + { + "epoch": 2.63690872751499, + "grad_norm": 0.5880664186723723, + "learning_rate": 3.137516498147405e-06, + "loss": 0.0346, + "step": 5937 + }, + { + "epoch": 2.6373528758605373, + "grad_norm": 0.4554614929699438, + "learning_rate": 3.1357176110984578e-06, + "loss": 0.0466, + "step": 5938 + }, + { + "epoch": 2.637797024206085, + "grad_norm": 0.4626649260029378, + "learning_rate": 3.1339190042828227e-06, + "loss": 0.0325, + "step": 5939 + }, + { + "epoch": 2.6382411725516324, + "grad_norm": 0.385619247407723, + "learning_rate": 3.132120677970859e-06, + "loss": 0.0317, + "step": 5940 + }, + { + "epoch": 2.6386853208971797, + "grad_norm": 0.772804476594651, + "learning_rate": 3.1303226324328896e-06, + "loss": 0.041, + "step": 5941 + }, + { + "epoch": 2.639129469242727, + "grad_norm": 0.7245537321538326, + "learning_rate": 3.1285248679391866e-06, + "loss": 0.0335, + "step": 5942 + }, + { + "epoch": 2.6395736175882742, + "grad_norm": 0.4668910845302412, + "learning_rate": 3.1267273847599888e-06, + "loss": 0.0282, + "step": 5943 + }, + { + "epoch": 2.640017765933822, + "grad_norm": 0.4437588379794779, + "learning_rate": 3.1249301831654842e-06, + "loss": 0.0248, + "step": 5944 + }, + { + "epoch": 2.6404619142793693, + "grad_norm": 0.391809580171727, + "learning_rate": 3.123133263425827e-06, + "loss": 0.0218, + "step": 5945 + }, + { + "epoch": 2.6409060626249166, + "grad_norm": 0.4573419094850795, + "learning_rate": 3.1213366258111207e-06, + "loss": 0.0377, + "step": 5946 + }, + { + "epoch": 2.6413502109704643, + "grad_norm": 0.3567944286524997, + "learning_rate": 3.1195402705914337e-06, + "loss": 0.0244, + "step": 5947 + }, + { + "epoch": 2.6417943593160116, + "grad_norm": 0.5003131202018632, + "learning_rate": 3.1177441980367873e-06, + "loss": 0.0389, + "step": 5948 + }, + { + "epoch": 2.642238507661559, + "grad_norm": 0.5664593011176533, + "learning_rate": 3.115948408417162e-06, + "loss": 0.027, + "step": 5949 + }, + { + "epoch": 2.642682656007106, + "grad_norm": 0.4398029849585822, + "learning_rate": 3.1141529020024964e-06, + "loss": 0.0306, + "step": 5950 + }, + { + "epoch": 2.643126804352654, + "grad_norm": 0.3797625962528218, + "learning_rate": 3.1123576790626825e-06, + "loss": 0.0266, + "step": 5951 + }, + { + "epoch": 2.643570952698201, + "grad_norm": 0.41053216679804944, + "learning_rate": 3.1105627398675743e-06, + "loss": 0.0352, + "step": 5952 + }, + { + "epoch": 2.6440151010437485, + "grad_norm": 0.3836633925963885, + "learning_rate": 3.1087680846869844e-06, + "loss": 0.0324, + "step": 5953 + }, + { + "epoch": 2.644459249389296, + "grad_norm": 0.439056518784003, + "learning_rate": 3.1069737137906776e-06, + "loss": 0.042, + "step": 5954 + }, + { + "epoch": 2.6449033977348435, + "grad_norm": 0.45357743660503874, + "learning_rate": 3.1051796274483776e-06, + "loss": 0.0343, + "step": 5955 + }, + { + "epoch": 2.645347546080391, + "grad_norm": 0.38982067943155413, + "learning_rate": 3.1033858259297677e-06, + "loss": 0.0374, + "step": 5956 + }, + { + "epoch": 2.645791694425938, + "grad_norm": 0.36926658548126234, + "learning_rate": 3.1015923095044844e-06, + "loss": 0.0307, + "step": 5957 + }, + { + "epoch": 2.646235842771486, + "grad_norm": 0.5321168036178672, + "learning_rate": 3.0997990784421273e-06, + "loss": 0.0571, + "step": 5958 + }, + { + "epoch": 2.646679991117033, + "grad_norm": 0.46761980336540304, + "learning_rate": 3.0980061330122463e-06, + "loss": 0.0398, + "step": 5959 + }, + { + "epoch": 2.6471241394625804, + "grad_norm": 0.41367845862555414, + "learning_rate": 3.096213473484354e-06, + "loss": 0.0211, + "step": 5960 + }, + { + "epoch": 2.647568287808128, + "grad_norm": 0.42099626188495787, + "learning_rate": 3.094421100127916e-06, + "loss": 0.0365, + "step": 5961 + }, + { + "epoch": 2.6480124361536754, + "grad_norm": 0.4995405826324344, + "learning_rate": 3.092629013212356e-06, + "loss": 0.0394, + "step": 5962 + }, + { + "epoch": 2.6484565844992227, + "grad_norm": 0.4212882639660684, + "learning_rate": 3.090837213007054e-06, + "loss": 0.0357, + "step": 5963 + }, + { + "epoch": 2.64890073284477, + "grad_norm": 0.4419706138728259, + "learning_rate": 3.0890456997813534e-06, + "loss": 0.0382, + "step": 5964 + }, + { + "epoch": 2.6493448811903173, + "grad_norm": 0.5281379267202536, + "learning_rate": 3.087254473804544e-06, + "loss": 0.0442, + "step": 5965 + }, + { + "epoch": 2.649789029535865, + "grad_norm": 0.9881997927190076, + "learning_rate": 3.0854635353458795e-06, + "loss": 0.0348, + "step": 5966 + }, + { + "epoch": 2.6502331778814123, + "grad_norm": 0.4051851073824622, + "learning_rate": 3.083672884674568e-06, + "loss": 0.0328, + "step": 5967 + }, + { + "epoch": 2.65067732622696, + "grad_norm": 0.43295642683770935, + "learning_rate": 3.081882522059774e-06, + "loss": 0.0445, + "step": 5968 + }, + { + "epoch": 2.6511214745725074, + "grad_norm": 0.7752250617010704, + "learning_rate": 3.0800924477706185e-06, + "loss": 0.0366, + "step": 5969 + }, + { + "epoch": 2.6515656229180546, + "grad_norm": 0.551166830656724, + "learning_rate": 3.0783026620761846e-06, + "loss": 0.0313, + "step": 5970 + }, + { + "epoch": 2.652009771263602, + "grad_norm": 0.35152614881285826, + "learning_rate": 3.076513165245504e-06, + "loss": 0.0279, + "step": 5971 + }, + { + "epoch": 2.6524539196091492, + "grad_norm": 0.39759843537124434, + "learning_rate": 3.0747239575475674e-06, + "loss": 0.0363, + "step": 5972 + }, + { + "epoch": 2.652898067954697, + "grad_norm": 0.4918571550335759, + "learning_rate": 3.072935039251327e-06, + "loss": 0.03, + "step": 5973 + }, + { + "epoch": 2.6533422163002442, + "grad_norm": 0.3903119645882834, + "learning_rate": 3.071146410625682e-06, + "loss": 0.0262, + "step": 5974 + }, + { + "epoch": 2.6537863646457915, + "grad_norm": 0.3990290251739691, + "learning_rate": 3.0693580719395e-06, + "loss": 0.0457, + "step": 5975 + }, + { + "epoch": 2.6542305129913393, + "grad_norm": 0.4382406546867813, + "learning_rate": 3.067570023461594e-06, + "loss": 0.0345, + "step": 5976 + }, + { + "epoch": 2.6546746613368866, + "grad_norm": 0.4165975886070993, + "learning_rate": 3.065782265460741e-06, + "loss": 0.0273, + "step": 5977 + }, + { + "epoch": 2.655118809682434, + "grad_norm": 0.5153364937600957, + "learning_rate": 3.06399479820567e-06, + "loss": 0.0386, + "step": 5978 + }, + { + "epoch": 2.655562958027981, + "grad_norm": 0.5238135987655357, + "learning_rate": 3.062207621965067e-06, + "loss": 0.0352, + "step": 5979 + }, + { + "epoch": 2.656007106373529, + "grad_norm": 0.427195897063526, + "learning_rate": 3.0604207370075743e-06, + "loss": 0.0252, + "step": 5980 + }, + { + "epoch": 2.656451254719076, + "grad_norm": 0.46584491776786274, + "learning_rate": 3.0586341436017954e-06, + "loss": 0.0476, + "step": 5981 + }, + { + "epoch": 2.6568954030646235, + "grad_norm": 0.43759944048446714, + "learning_rate": 3.056847842016282e-06, + "loss": 0.0273, + "step": 5982 + }, + { + "epoch": 2.657339551410171, + "grad_norm": 0.47962982437840673, + "learning_rate": 3.0550618325195457e-06, + "loss": 0.0292, + "step": 5983 + }, + { + "epoch": 2.6577836997557185, + "grad_norm": 0.40526954332365667, + "learning_rate": 3.053276115380055e-06, + "loss": 0.0249, + "step": 5984 + }, + { + "epoch": 2.6582278481012658, + "grad_norm": 0.5052423442608693, + "learning_rate": 3.0514906908662346e-06, + "loss": 0.0247, + "step": 5985 + }, + { + "epoch": 2.658671996446813, + "grad_norm": 0.5146431419630214, + "learning_rate": 3.0497055592464596e-06, + "loss": 0.0381, + "step": 5986 + }, + { + "epoch": 2.659116144792361, + "grad_norm": 0.40107763434385424, + "learning_rate": 3.04792072078907e-06, + "loss": 0.0305, + "step": 5987 + }, + { + "epoch": 2.659560293137908, + "grad_norm": 0.5005148653434598, + "learning_rate": 3.046136175762357e-06, + "loss": 0.0308, + "step": 5988 + }, + { + "epoch": 2.6600044414834554, + "grad_norm": 0.5429487251371317, + "learning_rate": 3.0443519244345666e-06, + "loss": 0.0359, + "step": 5989 + }, + { + "epoch": 2.660448589829003, + "grad_norm": 0.5062800108359077, + "learning_rate": 3.0425679670739026e-06, + "loss": 0.0409, + "step": 5990 + }, + { + "epoch": 2.6608927381745504, + "grad_norm": 0.37298776061190037, + "learning_rate": 3.040784303948523e-06, + "loss": 0.0314, + "step": 5991 + }, + { + "epoch": 2.6613368865200977, + "grad_norm": 0.6131690894387388, + "learning_rate": 3.0390009353265458e-06, + "loss": 0.0276, + "step": 5992 + }, + { + "epoch": 2.661781034865645, + "grad_norm": 0.4287293989906706, + "learning_rate": 3.0372178614760382e-06, + "loss": 0.038, + "step": 5993 + }, + { + "epoch": 2.6622251832111923, + "grad_norm": 0.494407979190499, + "learning_rate": 3.035435082665029e-06, + "loss": 0.0328, + "step": 5994 + }, + { + "epoch": 2.66266933155674, + "grad_norm": 0.4258763284446213, + "learning_rate": 3.033652599161499e-06, + "loss": 0.0268, + "step": 5995 + }, + { + "epoch": 2.6631134799022873, + "grad_norm": 0.4705172448160009, + "learning_rate": 3.0318704112333847e-06, + "loss": 0.0314, + "step": 5996 + }, + { + "epoch": 2.663557628247835, + "grad_norm": 0.4749657917164724, + "learning_rate": 3.0300885191485797e-06, + "loss": 0.0363, + "step": 5997 + }, + { + "epoch": 2.6640017765933823, + "grad_norm": 0.7383043422980394, + "learning_rate": 3.0283069231749344e-06, + "loss": 0.0361, + "step": 5998 + }, + { + "epoch": 2.6644459249389296, + "grad_norm": 0.38814202685884, + "learning_rate": 3.026525623580252e-06, + "loss": 0.026, + "step": 5999 + }, + { + "epoch": 2.664890073284477, + "grad_norm": 0.3764976664025451, + "learning_rate": 3.0247446206322916e-06, + "loss": 0.0321, + "step": 6000 + }, + { + "epoch": 2.665334221630024, + "grad_norm": 0.45096374436929443, + "learning_rate": 3.0229639145987687e-06, + "loss": 0.034, + "step": 6001 + }, + { + "epoch": 2.665778369975572, + "grad_norm": 0.36848556078596323, + "learning_rate": 3.021183505747354e-06, + "loss": 0.031, + "step": 6002 + }, + { + "epoch": 2.6662225183211192, + "grad_norm": 0.3699352216981277, + "learning_rate": 3.0194033943456696e-06, + "loss": 0.028, + "step": 6003 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.381719196592165, + "learning_rate": 3.0176235806613008e-06, + "loss": 0.0245, + "step": 6004 + }, + { + "epoch": 2.6671108150122143, + "grad_norm": 0.4602734928315702, + "learning_rate": 3.0158440649617836e-06, + "loss": 0.039, + "step": 6005 + }, + { + "epoch": 2.6675549633577615, + "grad_norm": 0.34646717181854414, + "learning_rate": 3.014064847514609e-06, + "loss": 0.0248, + "step": 6006 + }, + { + "epoch": 2.667999111703309, + "grad_norm": 0.41630402405704753, + "learning_rate": 3.0122859285872214e-06, + "loss": 0.0364, + "step": 6007 + }, + { + "epoch": 2.668443260048856, + "grad_norm": 0.39925464642353814, + "learning_rate": 3.010507308447025e-06, + "loss": 0.0286, + "step": 6008 + }, + { + "epoch": 2.668887408394404, + "grad_norm": 0.5709915993898592, + "learning_rate": 3.0087289873613746e-06, + "loss": 0.0409, + "step": 6009 + }, + { + "epoch": 2.669331556739951, + "grad_norm": 0.5153833490415598, + "learning_rate": 3.0069509655975835e-06, + "loss": 0.0388, + "step": 6010 + }, + { + "epoch": 2.6697757050854984, + "grad_norm": 0.37873158972984705, + "learning_rate": 3.0051732434229185e-06, + "loss": 0.0254, + "step": 6011 + }, + { + "epoch": 2.670219853431046, + "grad_norm": 0.4404773311355675, + "learning_rate": 3.003395821104602e-06, + "loss": 0.0308, + "step": 6012 + }, + { + "epoch": 2.6706640017765935, + "grad_norm": 0.5123588004903873, + "learning_rate": 3.001618698909809e-06, + "loss": 0.032, + "step": 6013 + }, + { + "epoch": 2.6711081501221408, + "grad_norm": 0.41512949784084213, + "learning_rate": 2.999841877105672e-06, + "loss": 0.0277, + "step": 6014 + }, + { + "epoch": 2.671552298467688, + "grad_norm": 0.4025208152025212, + "learning_rate": 2.9980653559592775e-06, + "loss": 0.0313, + "step": 6015 + }, + { + "epoch": 2.671996446813236, + "grad_norm": 0.5344242242777247, + "learning_rate": 2.996289135737668e-06, + "loss": 0.0351, + "step": 6016 + }, + { + "epoch": 2.672440595158783, + "grad_norm": 0.4264946062568879, + "learning_rate": 2.994513216707838e-06, + "loss": 0.0263, + "step": 6017 + }, + { + "epoch": 2.6728847435043304, + "grad_norm": 0.3811830713887272, + "learning_rate": 2.992737599136739e-06, + "loss": 0.0268, + "step": 6018 + }, + { + "epoch": 2.673328891849878, + "grad_norm": 0.386704376318305, + "learning_rate": 2.9909622832912767e-06, + "loss": 0.0315, + "step": 6019 + }, + { + "epoch": 2.6737730401954254, + "grad_norm": 0.4136667792416568, + "learning_rate": 2.989187269438308e-06, + "loss": 0.0307, + "step": 6020 + }, + { + "epoch": 2.6742171885409727, + "grad_norm": 0.4174591961736545, + "learning_rate": 2.987412557844653e-06, + "loss": 0.0319, + "step": 6021 + }, + { + "epoch": 2.67466133688652, + "grad_norm": 0.42669426474826655, + "learning_rate": 2.985638148777078e-06, + "loss": 0.0288, + "step": 6022 + }, + { + "epoch": 2.6751054852320673, + "grad_norm": 0.3748882818930551, + "learning_rate": 2.983864042502308e-06, + "loss": 0.0263, + "step": 6023 + }, + { + "epoch": 2.675549633577615, + "grad_norm": 0.4706253013201654, + "learning_rate": 2.9820902392870197e-06, + "loss": 0.0284, + "step": 6024 + }, + { + "epoch": 2.6759937819231623, + "grad_norm": 0.4662239351305156, + "learning_rate": 2.980316739397847e-06, + "loss": 0.0358, + "step": 6025 + }, + { + "epoch": 2.67643793026871, + "grad_norm": 0.314532858998878, + "learning_rate": 2.9785435431013755e-06, + "loss": 0.0232, + "step": 6026 + }, + { + "epoch": 2.6768820786142573, + "grad_norm": 0.5337532314358315, + "learning_rate": 2.97677065066415e-06, + "loss": 0.0333, + "step": 6027 + }, + { + "epoch": 2.6773262269598046, + "grad_norm": 0.5956371023489798, + "learning_rate": 2.9749980623526633e-06, + "loss": 0.0381, + "step": 6028 + }, + { + "epoch": 2.677770375305352, + "grad_norm": 0.3325969522067864, + "learning_rate": 2.9732257784333673e-06, + "loss": 0.0249, + "step": 6029 + }, + { + "epoch": 2.678214523650899, + "grad_norm": 0.44226896551023326, + "learning_rate": 2.9714537991726656e-06, + "loss": 0.0321, + "step": 6030 + }, + { + "epoch": 2.678658671996447, + "grad_norm": 0.407289289939863, + "learning_rate": 2.9696821248369152e-06, + "loss": 0.0294, + "step": 6031 + }, + { + "epoch": 2.679102820341994, + "grad_norm": 0.42539794841254347, + "learning_rate": 2.9679107556924314e-06, + "loss": 0.0312, + "step": 6032 + }, + { + "epoch": 2.6795469686875415, + "grad_norm": 0.3452430100807996, + "learning_rate": 2.966139692005481e-06, + "loss": 0.0311, + "step": 6033 + }, + { + "epoch": 2.6799911170330892, + "grad_norm": 0.47114526962385006, + "learning_rate": 2.9643689340422844e-06, + "loss": 0.0351, + "step": 6034 + }, + { + "epoch": 2.6804352653786365, + "grad_norm": 0.3056012283706184, + "learning_rate": 2.962598482069015e-06, + "loss": 0.0254, + "step": 6035 + }, + { + "epoch": 2.680879413724184, + "grad_norm": 0.39834900420012637, + "learning_rate": 2.960828336351804e-06, + "loss": 0.0329, + "step": 6036 + }, + { + "epoch": 2.681323562069731, + "grad_norm": 0.4921282055431307, + "learning_rate": 2.9590584971567327e-06, + "loss": 0.0378, + "step": 6037 + }, + { + "epoch": 2.681767710415279, + "grad_norm": 0.401942181093132, + "learning_rate": 2.957288964749839e-06, + "loss": 0.0282, + "step": 6038 + }, + { + "epoch": 2.682211858760826, + "grad_norm": 0.5998754085660062, + "learning_rate": 2.955519739397114e-06, + "loss": 0.04, + "step": 6039 + }, + { + "epoch": 2.6826560071063734, + "grad_norm": 0.5229301227786665, + "learning_rate": 2.9537508213645026e-06, + "loss": 0.0439, + "step": 6040 + }, + { + "epoch": 2.683100155451921, + "grad_norm": 0.4131624612623256, + "learning_rate": 2.9519822109179007e-06, + "loss": 0.0324, + "step": 6041 + }, + { + "epoch": 2.6835443037974684, + "grad_norm": 0.3861923836427622, + "learning_rate": 2.950213908323164e-06, + "loss": 0.0277, + "step": 6042 + }, + { + "epoch": 2.6839884521430157, + "grad_norm": 0.4648268575218616, + "learning_rate": 2.948445913846094e-06, + "loss": 0.0289, + "step": 6043 + }, + { + "epoch": 2.684432600488563, + "grad_norm": 0.4732042527119129, + "learning_rate": 2.9466782277524554e-06, + "loss": 0.0309, + "step": 6044 + }, + { + "epoch": 2.6848767488341103, + "grad_norm": 0.3982899300782419, + "learning_rate": 2.944910850307958e-06, + "loss": 0.0314, + "step": 6045 + }, + { + "epoch": 2.685320897179658, + "grad_norm": 0.4153109085037015, + "learning_rate": 2.9431437817782705e-06, + "loss": 0.0339, + "step": 6046 + }, + { + "epoch": 2.6857650455252053, + "grad_norm": 0.43324892108850316, + "learning_rate": 2.9413770224290126e-06, + "loss": 0.0311, + "step": 6047 + }, + { + "epoch": 2.686209193870753, + "grad_norm": 0.28406951308335854, + "learning_rate": 2.9396105725257563e-06, + "loss": 0.0231, + "step": 6048 + }, + { + "epoch": 2.6866533422163004, + "grad_norm": 0.39298370689744644, + "learning_rate": 2.9378444323340316e-06, + "loss": 0.0304, + "step": 6049 + }, + { + "epoch": 2.6870974905618477, + "grad_norm": 0.4485933834660012, + "learning_rate": 2.9360786021193192e-06, + "loss": 0.0285, + "step": 6050 + }, + { + "epoch": 2.687541638907395, + "grad_norm": 0.35520292176022183, + "learning_rate": 2.934313082147053e-06, + "loss": 0.0226, + "step": 6051 + }, + { + "epoch": 2.6879857872529422, + "grad_norm": 0.44687364826159626, + "learning_rate": 2.93254787268262e-06, + "loss": 0.0357, + "step": 6052 + }, + { + "epoch": 2.68842993559849, + "grad_norm": 0.5271674075113904, + "learning_rate": 2.930782973991362e-06, + "loss": 0.0377, + "step": 6053 + }, + { + "epoch": 2.6888740839440373, + "grad_norm": 0.4626973474509376, + "learning_rate": 2.929018386338571e-06, + "loss": 0.0502, + "step": 6054 + }, + { + "epoch": 2.689318232289585, + "grad_norm": 0.4073720832807278, + "learning_rate": 2.927254109989499e-06, + "loss": 0.0328, + "step": 6055 + }, + { + "epoch": 2.6897623806351323, + "grad_norm": 0.38757290739790723, + "learning_rate": 2.9254901452093424e-06, + "loss": 0.031, + "step": 6056 + }, + { + "epoch": 2.6902065289806796, + "grad_norm": 0.3981460707866334, + "learning_rate": 2.923726492263258e-06, + "loss": 0.0329, + "step": 6057 + }, + { + "epoch": 2.690650677326227, + "grad_norm": 0.34586475207518175, + "learning_rate": 2.9219631514163514e-06, + "loss": 0.0229, + "step": 6058 + }, + { + "epoch": 2.691094825671774, + "grad_norm": 0.3814494157202918, + "learning_rate": 2.9202001229336817e-06, + "loss": 0.0252, + "step": 6059 + }, + { + "epoch": 2.691538974017322, + "grad_norm": 0.5320910876205124, + "learning_rate": 2.9184374070802633e-06, + "loss": 0.0528, + "step": 6060 + }, + { + "epoch": 2.691983122362869, + "grad_norm": 0.5320769049550915, + "learning_rate": 2.916675004121062e-06, + "loss": 0.0374, + "step": 6061 + }, + { + "epoch": 2.6924272707084165, + "grad_norm": 0.46814435114343156, + "learning_rate": 2.9149129143209974e-06, + "loss": 0.0366, + "step": 6062 + }, + { + "epoch": 2.692871419053964, + "grad_norm": 0.46820101387242424, + "learning_rate": 2.9131511379449428e-06, + "loss": 0.0398, + "step": 6063 + }, + { + "epoch": 2.6933155673995115, + "grad_norm": 0.42299234767365035, + "learning_rate": 2.9113896752577205e-06, + "loss": 0.0306, + "step": 6064 + }, + { + "epoch": 2.693759715745059, + "grad_norm": 0.3540454635472545, + "learning_rate": 2.9096285265241063e-06, + "loss": 0.0253, + "step": 6065 + }, + { + "epoch": 2.694203864090606, + "grad_norm": 0.47552195678010667, + "learning_rate": 2.9078676920088378e-06, + "loss": 0.0304, + "step": 6066 + }, + { + "epoch": 2.694648012436154, + "grad_norm": 0.41330422015572915, + "learning_rate": 2.9061071719765933e-06, + "loss": 0.0334, + "step": 6067 + }, + { + "epoch": 2.695092160781701, + "grad_norm": 0.3903849704575043, + "learning_rate": 2.9043469666920088e-06, + "loss": 0.0261, + "step": 6068 + }, + { + "epoch": 2.6955363091272484, + "grad_norm": 0.37706353026097844, + "learning_rate": 2.902587076419676e-06, + "loss": 0.0294, + "step": 6069 + }, + { + "epoch": 2.695980457472796, + "grad_norm": 0.3862061240373804, + "learning_rate": 2.900827501424133e-06, + "loss": 0.0354, + "step": 6070 + }, + { + "epoch": 2.6964246058183434, + "grad_norm": 0.4432184785361431, + "learning_rate": 2.899068241969876e-06, + "loss": 0.032, + "step": 6071 + }, + { + "epoch": 2.6968687541638907, + "grad_norm": 0.4814495852537153, + "learning_rate": 2.8973092983213493e-06, + "loss": 0.0335, + "step": 6072 + }, + { + "epoch": 2.697312902509438, + "grad_norm": 0.444449262745695, + "learning_rate": 2.8955506707429545e-06, + "loss": 0.0428, + "step": 6073 + }, + { + "epoch": 2.6977570508549853, + "grad_norm": 0.38128392307642117, + "learning_rate": 2.8937923594990435e-06, + "loss": 0.0253, + "step": 6074 + }, + { + "epoch": 2.698201199200533, + "grad_norm": 0.4767935660247025, + "learning_rate": 2.8920343648539174e-06, + "loss": 0.0261, + "step": 6075 + }, + { + "epoch": 2.6986453475460803, + "grad_norm": 0.5621830464805478, + "learning_rate": 2.8902766870718347e-06, + "loss": 0.0478, + "step": 6076 + }, + { + "epoch": 2.699089495891628, + "grad_norm": 0.34049032210297114, + "learning_rate": 2.8885193264170036e-06, + "loss": 0.0294, + "step": 6077 + }, + { + "epoch": 2.6995336442371753, + "grad_norm": 0.4750456467312549, + "learning_rate": 2.886762283153586e-06, + "loss": 0.0371, + "step": 6078 + }, + { + "epoch": 2.6999777925827226, + "grad_norm": 0.4893300140355996, + "learning_rate": 2.885005557545694e-06, + "loss": 0.0285, + "step": 6079 + }, + { + "epoch": 2.70042194092827, + "grad_norm": 0.4397921048743652, + "learning_rate": 2.8832491498573965e-06, + "loss": 0.0365, + "step": 6080 + }, + { + "epoch": 2.700866089273817, + "grad_norm": 0.4948308694304273, + "learning_rate": 2.8814930603527067e-06, + "loss": 0.0318, + "step": 6081 + }, + { + "epoch": 2.701310237619365, + "grad_norm": 0.4221767743708031, + "learning_rate": 2.8797372892955978e-06, + "loss": 0.0355, + "step": 6082 + }, + { + "epoch": 2.7017543859649122, + "grad_norm": 0.548027854837134, + "learning_rate": 2.877981836949991e-06, + "loss": 0.0286, + "step": 6083 + }, + { + "epoch": 2.7021985343104595, + "grad_norm": 0.50981889703332, + "learning_rate": 2.8762267035797607e-06, + "loss": 0.0346, + "step": 6084 + }, + { + "epoch": 2.7026426826560073, + "grad_norm": 0.49304824695259686, + "learning_rate": 2.8744718894487345e-06, + "loss": 0.0392, + "step": 6085 + }, + { + "epoch": 2.7030868310015546, + "grad_norm": 0.33965710652598363, + "learning_rate": 2.8727173948206905e-06, + "loss": 0.0211, + "step": 6086 + }, + { + "epoch": 2.703530979347102, + "grad_norm": 0.4906097802841706, + "learning_rate": 2.870963219959357e-06, + "loss": 0.0339, + "step": 6087 + }, + { + "epoch": 2.703975127692649, + "grad_norm": 0.45131838531292917, + "learning_rate": 2.869209365128417e-06, + "loss": 0.0327, + "step": 6088 + }, + { + "epoch": 2.704419276038197, + "grad_norm": 0.44310343452513123, + "learning_rate": 2.8674558305915057e-06, + "loss": 0.0413, + "step": 6089 + }, + { + "epoch": 2.704863424383744, + "grad_norm": 0.4694661668866994, + "learning_rate": 2.865702616612208e-06, + "loss": 0.0303, + "step": 6090 + }, + { + "epoch": 2.7053075727292915, + "grad_norm": 0.42182727027273254, + "learning_rate": 2.8639497234540646e-06, + "loss": 0.0322, + "step": 6091 + }, + { + "epoch": 2.705751721074839, + "grad_norm": 0.34911280190910793, + "learning_rate": 2.862197151380561e-06, + "loss": 0.0297, + "step": 6092 + }, + { + "epoch": 2.7061958694203865, + "grad_norm": 0.5070815814024743, + "learning_rate": 2.8604449006551406e-06, + "loss": 0.0535, + "step": 6093 + }, + { + "epoch": 2.7066400177659338, + "grad_norm": 0.6056329049163846, + "learning_rate": 2.8586929715411963e-06, + "loss": 0.0589, + "step": 6094 + }, + { + "epoch": 2.707084166111481, + "grad_norm": 0.42386614097773856, + "learning_rate": 2.8569413643020725e-06, + "loss": 0.0337, + "step": 6095 + }, + { + "epoch": 2.707528314457029, + "grad_norm": 0.5481706485098858, + "learning_rate": 2.855190079201067e-06, + "loss": 0.0346, + "step": 6096 + }, + { + "epoch": 2.707972462802576, + "grad_norm": 0.4317878891723994, + "learning_rate": 2.8534391165014275e-06, + "loss": 0.0447, + "step": 6097 + }, + { + "epoch": 2.7084166111481234, + "grad_norm": 0.4182536684547791, + "learning_rate": 2.8516884764663512e-06, + "loss": 0.0381, + "step": 6098 + }, + { + "epoch": 2.708860759493671, + "grad_norm": 0.39160951294185736, + "learning_rate": 2.849938159358989e-06, + "loss": 0.0261, + "step": 6099 + }, + { + "epoch": 2.7093049078392184, + "grad_norm": 0.3551951087739855, + "learning_rate": 2.848188165442446e-06, + "loss": 0.0293, + "step": 6100 + }, + { + "epoch": 2.7097490561847657, + "grad_norm": 0.4415214034913667, + "learning_rate": 2.846438494979774e-06, + "loss": 0.0374, + "step": 6101 + }, + { + "epoch": 2.710193204530313, + "grad_norm": 0.6570425809867534, + "learning_rate": 2.844689148233979e-06, + "loss": 0.0493, + "step": 6102 + }, + { + "epoch": 2.7106373528758603, + "grad_norm": 0.42365442096686196, + "learning_rate": 2.842940125468019e-06, + "loss": 0.0382, + "step": 6103 + }, + { + "epoch": 2.711081501221408, + "grad_norm": 0.4801471324952032, + "learning_rate": 2.8411914269447984e-06, + "loss": 0.0252, + "step": 6104 + }, + { + "epoch": 2.7115256495669553, + "grad_norm": 0.3451437825598062, + "learning_rate": 2.8394430529271777e-06, + "loss": 0.0299, + "step": 6105 + }, + { + "epoch": 2.711969797912503, + "grad_norm": 0.5219358142266073, + "learning_rate": 2.8376950036779683e-06, + "loss": 0.0295, + "step": 6106 + }, + { + "epoch": 2.7124139462580503, + "grad_norm": 0.3693933163750994, + "learning_rate": 2.8359472794599307e-06, + "loss": 0.0277, + "step": 6107 + }, + { + "epoch": 2.7128580946035976, + "grad_norm": 0.4033904609462883, + "learning_rate": 2.8341998805357796e-06, + "loss": 0.0265, + "step": 6108 + }, + { + "epoch": 2.713302242949145, + "grad_norm": 0.4144874366737247, + "learning_rate": 2.832452807168175e-06, + "loss": 0.0268, + "step": 6109 + }, + { + "epoch": 2.713746391294692, + "grad_norm": 0.5838661054136332, + "learning_rate": 2.8307060596197337e-06, + "loss": 0.0384, + "step": 6110 + }, + { + "epoch": 2.71419053964024, + "grad_norm": 0.5420801857113182, + "learning_rate": 2.8289596381530214e-06, + "loss": 0.0305, + "step": 6111 + }, + { + "epoch": 2.714634687985787, + "grad_norm": 0.42444475722005587, + "learning_rate": 2.8272135430305558e-06, + "loss": 0.041, + "step": 6112 + }, + { + "epoch": 2.7150788363313345, + "grad_norm": 0.36578048462525586, + "learning_rate": 2.825467774514803e-06, + "loss": 0.0224, + "step": 6113 + }, + { + "epoch": 2.7155229846768822, + "grad_norm": 0.33007914592544324, + "learning_rate": 2.823722332868185e-06, + "loss": 0.0252, + "step": 6114 + }, + { + "epoch": 2.7159671330224295, + "grad_norm": 0.5564364593461901, + "learning_rate": 2.821977218353067e-06, + "loss": 0.0304, + "step": 6115 + }, + { + "epoch": 2.716411281367977, + "grad_norm": 0.539249473006622, + "learning_rate": 2.820232431231771e-06, + "loss": 0.0417, + "step": 6116 + }, + { + "epoch": 2.716855429713524, + "grad_norm": 0.39638502515608776, + "learning_rate": 2.818487971766568e-06, + "loss": 0.0276, + "step": 6117 + }, + { + "epoch": 2.717299578059072, + "grad_norm": 0.3923969070351993, + "learning_rate": 2.816743840219681e-06, + "loss": 0.0345, + "step": 6118 + }, + { + "epoch": 2.717743726404619, + "grad_norm": 0.7090345157048296, + "learning_rate": 2.8150000368532826e-06, + "loss": 0.0459, + "step": 6119 + }, + { + "epoch": 2.7181878747501664, + "grad_norm": 0.49769141697815394, + "learning_rate": 2.8132565619294943e-06, + "loss": 0.0312, + "step": 6120 + }, + { + "epoch": 2.718632023095714, + "grad_norm": 0.3567290862892367, + "learning_rate": 2.8115134157103906e-06, + "loss": 0.0427, + "step": 6121 + }, + { + "epoch": 2.7190761714412615, + "grad_norm": 0.41308801601010114, + "learning_rate": 2.809770598457997e-06, + "loss": 0.0308, + "step": 6122 + }, + { + "epoch": 2.7195203197868087, + "grad_norm": 0.42911807998450446, + "learning_rate": 2.8080281104342875e-06, + "loss": 0.0389, + "step": 6123 + }, + { + "epoch": 2.719964468132356, + "grad_norm": 0.3668652303738415, + "learning_rate": 2.8062859519011885e-06, + "loss": 0.031, + "step": 6124 + }, + { + "epoch": 2.7204086164779038, + "grad_norm": 0.4729128881363199, + "learning_rate": 2.8045441231205773e-06, + "loss": 0.0359, + "step": 6125 + }, + { + "epoch": 2.720852764823451, + "grad_norm": 0.49961860986096596, + "learning_rate": 2.802802624354276e-06, + "loss": 0.0284, + "step": 6126 + }, + { + "epoch": 2.7212969131689984, + "grad_norm": 0.38704332861461993, + "learning_rate": 2.8010614558640653e-06, + "loss": 0.0306, + "step": 6127 + }, + { + "epoch": 2.721741061514546, + "grad_norm": 0.36762472796404744, + "learning_rate": 2.7993206179116706e-06, + "loss": 0.0297, + "step": 6128 + }, + { + "epoch": 2.7221852098600934, + "grad_norm": 0.40237829108230383, + "learning_rate": 2.79758011075877e-06, + "loss": 0.0379, + "step": 6129 + }, + { + "epoch": 2.7226293582056407, + "grad_norm": 0.39422460053368863, + "learning_rate": 2.7958399346669916e-06, + "loss": 0.0324, + "step": 6130 + }, + { + "epoch": 2.723073506551188, + "grad_norm": 0.4944769619389698, + "learning_rate": 2.7941000898979153e-06, + "loss": 0.0365, + "step": 6131 + }, + { + "epoch": 2.7235176548967353, + "grad_norm": 0.43625796235808223, + "learning_rate": 2.7923605767130644e-06, + "loss": 0.0385, + "step": 6132 + }, + { + "epoch": 2.723961803242283, + "grad_norm": 0.39500536045009, + "learning_rate": 2.790621395373921e-06, + "loss": 0.0247, + "step": 6133 + }, + { + "epoch": 2.7244059515878303, + "grad_norm": 0.4784483734678597, + "learning_rate": 2.7888825461419124e-06, + "loss": 0.0226, + "step": 6134 + }, + { + "epoch": 2.724850099933378, + "grad_norm": 0.5298205495609496, + "learning_rate": 2.7871440292784167e-06, + "loss": 0.0407, + "step": 6135 + }, + { + "epoch": 2.7252942482789253, + "grad_norm": 0.46575202716742126, + "learning_rate": 2.7854058450447657e-06, + "loss": 0.0412, + "step": 6136 + }, + { + "epoch": 2.7257383966244726, + "grad_norm": 0.4738047535073062, + "learning_rate": 2.783667993702234e-06, + "loss": 0.0385, + "step": 6137 + }, + { + "epoch": 2.72618254497002, + "grad_norm": 0.3869731324535632, + "learning_rate": 2.7819304755120514e-06, + "loss": 0.0295, + "step": 6138 + }, + { + "epoch": 2.726626693315567, + "grad_norm": 0.3652035596631365, + "learning_rate": 2.7801932907353966e-06, + "loss": 0.0295, + "step": 6139 + }, + { + "epoch": 2.727070841661115, + "grad_norm": 0.41533452786136, + "learning_rate": 2.778456439633398e-06, + "loss": 0.0242, + "step": 6140 + }, + { + "epoch": 2.727514990006662, + "grad_norm": 0.35015673870621206, + "learning_rate": 2.776719922467135e-06, + "loss": 0.0313, + "step": 6141 + }, + { + "epoch": 2.7279591383522095, + "grad_norm": 0.48649742529511003, + "learning_rate": 2.7749837394976353e-06, + "loss": 0.0403, + "step": 6142 + }, + { + "epoch": 2.7284032866977572, + "grad_norm": 0.4158217744500075, + "learning_rate": 2.773247890985874e-06, + "loss": 0.0294, + "step": 6143 + }, + { + "epoch": 2.7288474350433045, + "grad_norm": 0.3886529547809444, + "learning_rate": 2.7715123771927817e-06, + "loss": 0.0285, + "step": 6144 + }, + { + "epoch": 2.729291583388852, + "grad_norm": 0.48618253354322, + "learning_rate": 2.7697771983792334e-06, + "loss": 0.0381, + "step": 6145 + }, + { + "epoch": 2.729735731734399, + "grad_norm": 0.4239190944164018, + "learning_rate": 2.7680423548060574e-06, + "loss": 0.028, + "step": 6146 + }, + { + "epoch": 2.730179880079947, + "grad_norm": 0.6790565769570187, + "learning_rate": 2.766307846734032e-06, + "loss": 0.0344, + "step": 6147 + }, + { + "epoch": 2.730624028425494, + "grad_norm": 0.4083045746295625, + "learning_rate": 2.764573674423879e-06, + "loss": 0.0292, + "step": 6148 + }, + { + "epoch": 2.7310681767710414, + "grad_norm": 0.39741625214319276, + "learning_rate": 2.7628398381362765e-06, + "loss": 0.0369, + "step": 6149 + }, + { + "epoch": 2.731512325116589, + "grad_norm": 0.3364754032489686, + "learning_rate": 2.7611063381318483e-06, + "loss": 0.0223, + "step": 6150 + }, + { + "epoch": 2.7319564734621364, + "grad_norm": 0.33915812547001584, + "learning_rate": 2.7593731746711695e-06, + "loss": 0.0276, + "step": 6151 + }, + { + "epoch": 2.7324006218076837, + "grad_norm": 0.3730496797762164, + "learning_rate": 2.757640348014764e-06, + "loss": 0.0274, + "step": 6152 + }, + { + "epoch": 2.732844770153231, + "grad_norm": 0.3751953423117338, + "learning_rate": 2.755907858423108e-06, + "loss": 0.0363, + "step": 6153 + }, + { + "epoch": 2.7332889184987788, + "grad_norm": 0.42372086556075383, + "learning_rate": 2.754175706156619e-06, + "loss": 0.0244, + "step": 6154 + }, + { + "epoch": 2.733733066844326, + "grad_norm": 0.3686003953358143, + "learning_rate": 2.7524438914756714e-06, + "loss": 0.0227, + "step": 6155 + }, + { + "epoch": 2.7341772151898733, + "grad_norm": 0.38850199266907187, + "learning_rate": 2.750712414640588e-06, + "loss": 0.0318, + "step": 6156 + }, + { + "epoch": 2.734621363535421, + "grad_norm": 0.4212753598660164, + "learning_rate": 2.748981275911633e-06, + "loss": 0.027, + "step": 6157 + }, + { + "epoch": 2.7350655118809684, + "grad_norm": 0.3950070118539609, + "learning_rate": 2.747250475549033e-06, + "loss": 0.0347, + "step": 6158 + }, + { + "epoch": 2.7355096602265156, + "grad_norm": 0.5989665019227255, + "learning_rate": 2.745520013812956e-06, + "loss": 0.0413, + "step": 6159 + }, + { + "epoch": 2.735953808572063, + "grad_norm": 0.4076839235525957, + "learning_rate": 2.743789890963516e-06, + "loss": 0.0239, + "step": 6160 + }, + { + "epoch": 2.7363979569176102, + "grad_norm": 0.4896550081248881, + "learning_rate": 2.742060107260781e-06, + "loss": 0.0284, + "step": 6161 + }, + { + "epoch": 2.736842105263158, + "grad_norm": 0.5961993037262533, + "learning_rate": 2.740330662964768e-06, + "loss": 0.0399, + "step": 6162 + }, + { + "epoch": 2.7372862536087053, + "grad_norm": 0.3528802270631205, + "learning_rate": 2.7386015583354414e-06, + "loss": 0.0271, + "step": 6163 + }, + { + "epoch": 2.737730401954253, + "grad_norm": 0.38484518351915703, + "learning_rate": 2.736872793632717e-06, + "loss": 0.0369, + "step": 6164 + }, + { + "epoch": 2.7381745502998003, + "grad_norm": 0.4319515735643185, + "learning_rate": 2.7351443691164537e-06, + "loss": 0.0228, + "step": 6165 + }, + { + "epoch": 2.7386186986453476, + "grad_norm": 0.42098419937374343, + "learning_rate": 2.7334162850464645e-06, + "loss": 0.0292, + "step": 6166 + }, + { + "epoch": 2.739062846990895, + "grad_norm": 0.35177937154537675, + "learning_rate": 2.7316885416825123e-06, + "loss": 0.0218, + "step": 6167 + }, + { + "epoch": 2.739506995336442, + "grad_norm": 0.41712238277892294, + "learning_rate": 2.7299611392843005e-06, + "loss": 0.0368, + "step": 6168 + }, + { + "epoch": 2.73995114368199, + "grad_norm": 0.39108614184700036, + "learning_rate": 2.7282340781114926e-06, + "loss": 0.0261, + "step": 6169 + }, + { + "epoch": 2.740395292027537, + "grad_norm": 0.526808362535383, + "learning_rate": 2.726507358423695e-06, + "loss": 0.0267, + "step": 6170 + }, + { + "epoch": 2.7408394403730845, + "grad_norm": 0.3749087607755556, + "learning_rate": 2.7247809804804593e-06, + "loss": 0.0198, + "step": 6171 + }, + { + "epoch": 2.741283588718632, + "grad_norm": 0.41805354629010627, + "learning_rate": 2.723054944541292e-06, + "loss": 0.0358, + "step": 6172 + }, + { + "epoch": 2.7417277370641795, + "grad_norm": 0.37604430696616314, + "learning_rate": 2.721329250865646e-06, + "loss": 0.0273, + "step": 6173 + }, + { + "epoch": 2.742171885409727, + "grad_norm": 0.4611005258922287, + "learning_rate": 2.719603899712919e-06, + "loss": 0.031, + "step": 6174 + }, + { + "epoch": 2.742616033755274, + "grad_norm": 0.41841947895126935, + "learning_rate": 2.7178788913424635e-06, + "loss": 0.0315, + "step": 6175 + }, + { + "epoch": 2.743060182100822, + "grad_norm": 0.41801694735137535, + "learning_rate": 2.7161542260135797e-06, + "loss": 0.0336, + "step": 6176 + }, + { + "epoch": 2.743504330446369, + "grad_norm": 0.4765481557698237, + "learning_rate": 2.7144299039855105e-06, + "loss": 0.0375, + "step": 6177 + }, + { + "epoch": 2.7439484787919164, + "grad_norm": 0.3580598459132636, + "learning_rate": 2.7127059255174504e-06, + "loss": 0.0286, + "step": 6178 + }, + { + "epoch": 2.744392627137464, + "grad_norm": 0.4101523804697921, + "learning_rate": 2.7109822908685445e-06, + "loss": 0.0274, + "step": 6179 + }, + { + "epoch": 2.7448367754830114, + "grad_norm": 0.4715955568359928, + "learning_rate": 2.7092590002978837e-06, + "loss": 0.0345, + "step": 6180 + }, + { + "epoch": 2.7452809238285587, + "grad_norm": 0.6089181044007468, + "learning_rate": 2.70753605406451e-06, + "loss": 0.0343, + "step": 6181 + }, + { + "epoch": 2.745725072174106, + "grad_norm": 0.5806324207660223, + "learning_rate": 2.7058134524274083e-06, + "loss": 0.0346, + "step": 6182 + }, + { + "epoch": 2.7461692205196533, + "grad_norm": 0.40045495221538185, + "learning_rate": 2.7040911956455153e-06, + "loss": 0.0278, + "step": 6183 + }, + { + "epoch": 2.746613368865201, + "grad_norm": 0.45888251149836196, + "learning_rate": 2.702369283977718e-06, + "loss": 0.0321, + "step": 6184 + }, + { + "epoch": 2.7470575172107483, + "grad_norm": 0.39535983779938544, + "learning_rate": 2.7006477176828443e-06, + "loss": 0.0291, + "step": 6185 + }, + { + "epoch": 2.747501665556296, + "grad_norm": 0.40307288228268773, + "learning_rate": 2.6989264970196795e-06, + "loss": 0.031, + "step": 6186 + }, + { + "epoch": 2.7479458139018433, + "grad_norm": 0.42537830184812087, + "learning_rate": 2.697205622246952e-06, + "loss": 0.0379, + "step": 6187 + }, + { + "epoch": 2.7483899622473906, + "grad_norm": 0.7384045353118717, + "learning_rate": 2.6954850936233357e-06, + "loss": 0.0388, + "step": 6188 + }, + { + "epoch": 2.748834110592938, + "grad_norm": 0.49237325688101374, + "learning_rate": 2.693764911407456e-06, + "loss": 0.0357, + "step": 6189 + }, + { + "epoch": 2.749278258938485, + "grad_norm": 0.43303507384385986, + "learning_rate": 2.6920450758578885e-06, + "loss": 0.0305, + "step": 6190 + }, + { + "epoch": 2.749722407284033, + "grad_norm": 0.5198323318463377, + "learning_rate": 2.690325587233148e-06, + "loss": 0.042, + "step": 6191 + }, + { + "epoch": 2.7501665556295802, + "grad_norm": 0.5579062089534479, + "learning_rate": 2.6886064457917094e-06, + "loss": 0.0442, + "step": 6192 + }, + { + "epoch": 2.750610703975128, + "grad_norm": 0.2992794413549194, + "learning_rate": 2.6868876517919845e-06, + "loss": 0.0214, + "step": 6193 + }, + { + "epoch": 2.7510548523206753, + "grad_norm": 0.5197412003207367, + "learning_rate": 2.6851692054923385e-06, + "loss": 0.0447, + "step": 6194 + }, + { + "epoch": 2.7514990006662225, + "grad_norm": 0.45629204802490403, + "learning_rate": 2.6834511071510823e-06, + "loss": 0.0335, + "step": 6195 + }, + { + "epoch": 2.75194314901177, + "grad_norm": 0.3939710962212484, + "learning_rate": 2.681733357026476e-06, + "loss": 0.0334, + "step": 6196 + }, + { + "epoch": 2.752387297357317, + "grad_norm": 0.4395100384242003, + "learning_rate": 2.680015955376727e-06, + "loss": 0.0269, + "step": 6197 + }, + { + "epoch": 2.752831445702865, + "grad_norm": 0.35669081976582817, + "learning_rate": 2.6782989024599913e-06, + "loss": 0.0239, + "step": 6198 + }, + { + "epoch": 2.753275594048412, + "grad_norm": 0.42101464193049454, + "learning_rate": 2.6765821985343676e-06, + "loss": 0.0297, + "step": 6199 + }, + { + "epoch": 2.7537197423939594, + "grad_norm": 0.4443116484682141, + "learning_rate": 2.6748658438579075e-06, + "loss": 0.0367, + "step": 6200 + }, + { + "epoch": 2.754163890739507, + "grad_norm": 0.3576485677615678, + "learning_rate": 2.6731498386886094e-06, + "loss": 0.0269, + "step": 6201 + }, + { + "epoch": 2.7546080390850545, + "grad_norm": 0.8029786113817223, + "learning_rate": 2.6714341832844137e-06, + "loss": 0.0366, + "step": 6202 + }, + { + "epoch": 2.7550521874306018, + "grad_norm": 0.4869826644782611, + "learning_rate": 2.6697188779032173e-06, + "loss": 0.0293, + "step": 6203 + }, + { + "epoch": 2.755496335776149, + "grad_norm": 0.43061987045181044, + "learning_rate": 2.6680039228028603e-06, + "loss": 0.0352, + "step": 6204 + }, + { + "epoch": 2.755940484121697, + "grad_norm": 0.3632393699444563, + "learning_rate": 2.6662893182411255e-06, + "loss": 0.0256, + "step": 6205 + }, + { + "epoch": 2.756384632467244, + "grad_norm": 0.37775571779218736, + "learning_rate": 2.6645750644757484e-06, + "loss": 0.0342, + "step": 6206 + }, + { + "epoch": 2.7568287808127914, + "grad_norm": 0.31327481469952234, + "learning_rate": 2.6628611617644133e-06, + "loss": 0.0209, + "step": 6207 + }, + { + "epoch": 2.757272929158339, + "grad_norm": 0.5059332837930003, + "learning_rate": 2.6611476103647425e-06, + "loss": 0.0286, + "step": 6208 + }, + { + "epoch": 2.7577170775038864, + "grad_norm": 0.39890166964039236, + "learning_rate": 2.6594344105343207e-06, + "loss": 0.0335, + "step": 6209 + }, + { + "epoch": 2.7581612258494337, + "grad_norm": 0.40337347616568126, + "learning_rate": 2.657721562530664e-06, + "loss": 0.0206, + "step": 6210 + }, + { + "epoch": 2.758605374194981, + "grad_norm": 0.44935773733297946, + "learning_rate": 2.656009066611244e-06, + "loss": 0.03, + "step": 6211 + }, + { + "epoch": 2.7590495225405283, + "grad_norm": 0.417217650015585, + "learning_rate": 2.654296923033481e-06, + "loss": 0.0354, + "step": 6212 + }, + { + "epoch": 2.759493670886076, + "grad_norm": 0.39156228698655793, + "learning_rate": 2.652585132054734e-06, + "loss": 0.0301, + "step": 6213 + }, + { + "epoch": 2.7599378192316233, + "grad_norm": 0.4181623279832991, + "learning_rate": 2.6508736939323187e-06, + "loss": 0.0416, + "step": 6214 + }, + { + "epoch": 2.760381967577171, + "grad_norm": 0.38946142096359226, + "learning_rate": 2.649162608923493e-06, + "loss": 0.022, + "step": 6215 + }, + { + "epoch": 2.7608261159227183, + "grad_norm": 0.28459219563440136, + "learning_rate": 2.6474518772854606e-06, + "loss": 0.0211, + "step": 6216 + }, + { + "epoch": 2.7612702642682656, + "grad_norm": 0.39516529374066633, + "learning_rate": 2.6457414992753728e-06, + "loss": 0.031, + "step": 6217 + }, + { + "epoch": 2.761714412613813, + "grad_norm": 0.6507735193198333, + "learning_rate": 2.6440314751503314e-06, + "loss": 0.0405, + "step": 6218 + }, + { + "epoch": 2.76215856095936, + "grad_norm": 0.39863052089616485, + "learning_rate": 2.6423218051673766e-06, + "loss": 0.0257, + "step": 6219 + }, + { + "epoch": 2.762602709304908, + "grad_norm": 0.45747750899314005, + "learning_rate": 2.6406124895835084e-06, + "loss": 0.0309, + "step": 6220 + }, + { + "epoch": 2.763046857650455, + "grad_norm": 0.39469825591698227, + "learning_rate": 2.6389035286556598e-06, + "loss": 0.0301, + "step": 6221 + }, + { + "epoch": 2.7634910059960025, + "grad_norm": 0.4493000737157054, + "learning_rate": 2.637194922640719e-06, + "loss": 0.0336, + "step": 6222 + }, + { + "epoch": 2.7639351543415502, + "grad_norm": 0.3609277682343467, + "learning_rate": 2.6354866717955186e-06, + "loss": 0.0303, + "step": 6223 + }, + { + "epoch": 2.7643793026870975, + "grad_norm": 0.3650445917615703, + "learning_rate": 2.6337787763768384e-06, + "loss": 0.0242, + "step": 6224 + }, + { + "epoch": 2.764823451032645, + "grad_norm": 0.48546314397013385, + "learning_rate": 2.6320712366414005e-06, + "loss": 0.0294, + "step": 6225 + }, + { + "epoch": 2.765267599378192, + "grad_norm": 0.32486540563930044, + "learning_rate": 2.6303640528458834e-06, + "loss": 0.0223, + "step": 6226 + }, + { + "epoch": 2.76571174772374, + "grad_norm": 0.3434852926148082, + "learning_rate": 2.6286572252469e-06, + "loss": 0.0316, + "step": 6227 + }, + { + "epoch": 2.766155896069287, + "grad_norm": 0.7142093600067161, + "learning_rate": 2.626950754101018e-06, + "loss": 0.056, + "step": 6228 + }, + { + "epoch": 2.7666000444148344, + "grad_norm": 0.6028670535609708, + "learning_rate": 2.6252446396647503e-06, + "loss": 0.0323, + "step": 6229 + }, + { + "epoch": 2.767044192760382, + "grad_norm": 0.42675838339191474, + "learning_rate": 2.6235388821945497e-06, + "loss": 0.0297, + "step": 6230 + }, + { + "epoch": 2.7674883411059295, + "grad_norm": 0.7192216736608809, + "learning_rate": 2.621833481946826e-06, + "loss": 0.0357, + "step": 6231 + }, + { + "epoch": 2.7679324894514767, + "grad_norm": 0.4766794360092108, + "learning_rate": 2.6201284391779303e-06, + "loss": 0.0331, + "step": 6232 + }, + { + "epoch": 2.768376637797024, + "grad_norm": 0.4250368407242259, + "learning_rate": 2.618423754144155e-06, + "loss": 0.0321, + "step": 6233 + }, + { + "epoch": 2.7688207861425718, + "grad_norm": 0.5212475478820174, + "learning_rate": 2.616719427101745e-06, + "loss": 0.0324, + "step": 6234 + }, + { + "epoch": 2.769264934488119, + "grad_norm": 0.3008339483542143, + "learning_rate": 2.6150154583068922e-06, + "loss": 0.0226, + "step": 6235 + }, + { + "epoch": 2.7697090828336663, + "grad_norm": 0.3816969610714372, + "learning_rate": 2.613311848015725e-06, + "loss": 0.0259, + "step": 6236 + }, + { + "epoch": 2.770153231179214, + "grad_norm": 0.38596290627054197, + "learning_rate": 2.611608596484335e-06, + "loss": 0.0288, + "step": 6237 + }, + { + "epoch": 2.7705973795247614, + "grad_norm": 0.4303882315778446, + "learning_rate": 2.609905703968742e-06, + "loss": 0.0223, + "step": 6238 + }, + { + "epoch": 2.7710415278703087, + "grad_norm": 0.4053342617572821, + "learning_rate": 2.6082031707249223e-06, + "loss": 0.03, + "step": 6239 + }, + { + "epoch": 2.771485676215856, + "grad_norm": 0.7823903528274498, + "learning_rate": 2.6065009970087974e-06, + "loss": 0.0324, + "step": 6240 + }, + { + "epoch": 2.7719298245614032, + "grad_norm": 0.42522914693804564, + "learning_rate": 2.6047991830762297e-06, + "loss": 0.038, + "step": 6241 + }, + { + "epoch": 2.772373972906951, + "grad_norm": 0.3532987358681675, + "learning_rate": 2.60309772918303e-06, + "loss": 0.0235, + "step": 6242 + }, + { + "epoch": 2.7728181212524983, + "grad_norm": 0.33902634374584634, + "learning_rate": 2.6013966355849618e-06, + "loss": 0.0327, + "step": 6243 + }, + { + "epoch": 2.773262269598046, + "grad_norm": 0.44149160136100024, + "learning_rate": 2.5996959025377224e-06, + "loss": 0.0492, + "step": 6244 + }, + { + "epoch": 2.7737064179435933, + "grad_norm": 0.48103215081730055, + "learning_rate": 2.597995530296963e-06, + "loss": 0.0267, + "step": 6245 + }, + { + "epoch": 2.7741505662891406, + "grad_norm": 0.3235934136296388, + "learning_rate": 2.5962955191182792e-06, + "loss": 0.0183, + "step": 6246 + }, + { + "epoch": 2.774594714634688, + "grad_norm": 0.3462677290866656, + "learning_rate": 2.59459586925721e-06, + "loss": 0.0272, + "step": 6247 + }, + { + "epoch": 2.775038862980235, + "grad_norm": 0.32294840719861506, + "learning_rate": 2.592896580969242e-06, + "loss": 0.0281, + "step": 6248 + }, + { + "epoch": 2.775483011325783, + "grad_norm": 0.3499956168537934, + "learning_rate": 2.591197654509807e-06, + "loss": 0.0309, + "step": 6249 + }, + { + "epoch": 2.77592715967133, + "grad_norm": 0.40008372827541966, + "learning_rate": 2.5894990901342833e-06, + "loss": 0.0268, + "step": 6250 + }, + { + "epoch": 2.7763713080168775, + "grad_norm": 0.40878589097636386, + "learning_rate": 2.587800888097993e-06, + "loss": 0.0313, + "step": 6251 + }, + { + "epoch": 2.776815456362425, + "grad_norm": 0.7688538094871912, + "learning_rate": 2.5861030486562084e-06, + "loss": 0.0479, + "step": 6252 + }, + { + "epoch": 2.7772596047079725, + "grad_norm": 0.40809425302097857, + "learning_rate": 2.5844055720641357e-06, + "loss": 0.0311, + "step": 6253 + }, + { + "epoch": 2.77770375305352, + "grad_norm": 0.37450235777366314, + "learning_rate": 2.5827084585769436e-06, + "loss": 0.0286, + "step": 6254 + }, + { + "epoch": 2.778147901399067, + "grad_norm": 0.3661400531713744, + "learning_rate": 2.581011708449731e-06, + "loss": 0.0271, + "step": 6255 + }, + { + "epoch": 2.778592049744615, + "grad_norm": 0.37170138943829045, + "learning_rate": 2.57931532193755e-06, + "loss": 0.0346, + "step": 6256 + }, + { + "epoch": 2.779036198090162, + "grad_norm": 0.30626642522716074, + "learning_rate": 2.577619299295398e-06, + "loss": 0.026, + "step": 6257 + }, + { + "epoch": 2.7794803464357094, + "grad_norm": 0.3007717640940463, + "learning_rate": 2.5759236407782128e-06, + "loss": 0.0247, + "step": 6258 + }, + { + "epoch": 2.779924494781257, + "grad_norm": 0.5105689813889596, + "learning_rate": 2.5742283466408803e-06, + "loss": 0.0477, + "step": 6259 + }, + { + "epoch": 2.7803686431268044, + "grad_norm": 0.49116305516555464, + "learning_rate": 2.572533417138237e-06, + "loss": 0.0257, + "step": 6260 + }, + { + "epoch": 2.7808127914723517, + "grad_norm": 0.356325832140254, + "learning_rate": 2.570838852525055e-06, + "loss": 0.0263, + "step": 6261 + }, + { + "epoch": 2.781256939817899, + "grad_norm": 0.4435162531660302, + "learning_rate": 2.569144653056058e-06, + "loss": 0.0435, + "step": 6262 + }, + { + "epoch": 2.7817010881634467, + "grad_norm": 0.5419805855499744, + "learning_rate": 2.5674508189859147e-06, + "loss": 0.0314, + "step": 6263 + }, + { + "epoch": 2.782145236508994, + "grad_norm": 0.3656656264231573, + "learning_rate": 2.565757350569233e-06, + "loss": 0.0241, + "step": 6264 + }, + { + "epoch": 2.7825893848545413, + "grad_norm": 0.43315585203736967, + "learning_rate": 2.5640642480605722e-06, + "loss": 0.0372, + "step": 6265 + }, + { + "epoch": 2.783033533200089, + "grad_norm": 0.6118620035354352, + "learning_rate": 2.5623715117144337e-06, + "loss": 0.0389, + "step": 6266 + }, + { + "epoch": 2.7834776815456364, + "grad_norm": 1.003007529298919, + "learning_rate": 2.5606791417852655e-06, + "loss": 0.0489, + "step": 6267 + }, + { + "epoch": 2.7839218298911836, + "grad_norm": 0.335632805390222, + "learning_rate": 2.558987138527461e-06, + "loss": 0.0298, + "step": 6268 + }, + { + "epoch": 2.784365978236731, + "grad_norm": 0.49218374523384917, + "learning_rate": 2.5572955021953525e-06, + "loss": 0.0504, + "step": 6269 + }, + { + "epoch": 2.7848101265822782, + "grad_norm": 0.3662564117457857, + "learning_rate": 2.555604233043224e-06, + "loss": 0.024, + "step": 6270 + }, + { + "epoch": 2.785254274927826, + "grad_norm": 0.4410048606518427, + "learning_rate": 2.553913331325305e-06, + "loss": 0.0308, + "step": 6271 + }, + { + "epoch": 2.7856984232733732, + "grad_norm": 0.6748277215653389, + "learning_rate": 2.5522227972957626e-06, + "loss": 0.0353, + "step": 6272 + }, + { + "epoch": 2.786142571618921, + "grad_norm": 0.4988731713313121, + "learning_rate": 2.550532631208713e-06, + "loss": 0.0262, + "step": 6273 + }, + { + "epoch": 2.7865867199644683, + "grad_norm": 0.3910346837842304, + "learning_rate": 2.5488428333182213e-06, + "loss": 0.0369, + "step": 6274 + }, + { + "epoch": 2.7870308683100156, + "grad_norm": 0.5189622685954451, + "learning_rate": 2.5471534038782876e-06, + "loss": 0.0365, + "step": 6275 + }, + { + "epoch": 2.787475016655563, + "grad_norm": 0.4108944491644082, + "learning_rate": 2.545464343142862e-06, + "loss": 0.0322, + "step": 6276 + }, + { + "epoch": 2.78791916500111, + "grad_norm": 0.3860699292761844, + "learning_rate": 2.543775651365844e-06, + "loss": 0.0232, + "step": 6277 + }, + { + "epoch": 2.788363313346658, + "grad_norm": 0.47557488338460535, + "learning_rate": 2.5420873288010682e-06, + "loss": 0.0387, + "step": 6278 + }, + { + "epoch": 2.788807461692205, + "grad_norm": 0.3839192710393702, + "learning_rate": 2.5403993757023193e-06, + "loss": 0.0261, + "step": 6279 + }, + { + "epoch": 2.7892516100377525, + "grad_norm": 0.4089956846684774, + "learning_rate": 2.538711792323328e-06, + "loss": 0.0393, + "step": 6280 + }, + { + "epoch": 2.7896957583833, + "grad_norm": 0.3534578778564665, + "learning_rate": 2.5370245789177615e-06, + "loss": 0.0234, + "step": 6281 + }, + { + "epoch": 2.7901399067288475, + "grad_norm": 0.38317967580423545, + "learning_rate": 2.53533773573924e-06, + "loss": 0.0264, + "step": 6282 + }, + { + "epoch": 2.7905840550743948, + "grad_norm": 0.42936934153083767, + "learning_rate": 2.533651263041324e-06, + "loss": 0.0393, + "step": 6283 + }, + { + "epoch": 2.791028203419942, + "grad_norm": 0.554120368450601, + "learning_rate": 2.5319651610775194e-06, + "loss": 0.0317, + "step": 6284 + }, + { + "epoch": 2.79147235176549, + "grad_norm": 0.4081200054049489, + "learning_rate": 2.5302794301012766e-06, + "loss": 0.0297, + "step": 6285 + }, + { + "epoch": 2.791916500111037, + "grad_norm": 0.7070187464980049, + "learning_rate": 2.528594070365988e-06, + "loss": 0.0356, + "step": 6286 + }, + { + "epoch": 2.7923606484565844, + "grad_norm": 0.45395646778800836, + "learning_rate": 2.52690908212499e-06, + "loss": 0.0344, + "step": 6287 + }, + { + "epoch": 2.792804796802132, + "grad_norm": 0.7713383991827947, + "learning_rate": 2.525224465631571e-06, + "loss": 0.0323, + "step": 6288 + }, + { + "epoch": 2.7932489451476794, + "grad_norm": 0.43262436851175007, + "learning_rate": 2.5235402211389525e-06, + "loss": 0.0311, + "step": 6289 + }, + { + "epoch": 2.7936930934932267, + "grad_norm": 0.9524822174469669, + "learning_rate": 2.5218563489003062e-06, + "loss": 0.0307, + "step": 6290 + }, + { + "epoch": 2.794137241838774, + "grad_norm": 0.35257313740001833, + "learning_rate": 2.520172849168749e-06, + "loss": 0.0321, + "step": 6291 + }, + { + "epoch": 2.7945813901843217, + "grad_norm": 0.4825645734393702, + "learning_rate": 2.518489722197335e-06, + "loss": 0.0374, + "step": 6292 + }, + { + "epoch": 2.795025538529869, + "grad_norm": 0.7297573078351898, + "learning_rate": 2.51680696823907e-06, + "loss": 0.041, + "step": 6293 + }, + { + "epoch": 2.7954696868754163, + "grad_norm": 0.4555687972584318, + "learning_rate": 2.5151245875468993e-06, + "loss": 0.0386, + "step": 6294 + }, + { + "epoch": 2.795913835220964, + "grad_norm": 0.4970797678676715, + "learning_rate": 2.5134425803737137e-06, + "loss": 0.0418, + "step": 6295 + }, + { + "epoch": 2.7963579835665113, + "grad_norm": 0.6816809012387476, + "learning_rate": 2.511760946972348e-06, + "loss": 0.0409, + "step": 6296 + }, + { + "epoch": 2.7968021319120586, + "grad_norm": 0.5264337163942395, + "learning_rate": 2.5100796875955815e-06, + "loss": 0.0473, + "step": 6297 + }, + { + "epoch": 2.797246280257606, + "grad_norm": 0.5625709988931173, + "learning_rate": 2.508398802496132e-06, + "loss": 0.0372, + "step": 6298 + }, + { + "epoch": 2.797690428603153, + "grad_norm": 0.5237928838120754, + "learning_rate": 2.5067182919266676e-06, + "loss": 0.0311, + "step": 6299 + }, + { + "epoch": 2.798134576948701, + "grad_norm": 0.4778467003244999, + "learning_rate": 2.5050381561397974e-06, + "loss": 0.0429, + "step": 6300 + }, + { + "epoch": 2.7985787252942482, + "grad_norm": 0.43069407371279944, + "learning_rate": 2.503358395388074e-06, + "loss": 0.0435, + "step": 6301 + }, + { + "epoch": 2.799022873639796, + "grad_norm": 0.35842408526989356, + "learning_rate": 2.501679009923997e-06, + "loss": 0.0279, + "step": 6302 + }, + { + "epoch": 2.7994670219853433, + "grad_norm": 0.3642207986414275, + "learning_rate": 2.5000000000000015e-06, + "loss": 0.0311, + "step": 6303 + }, + { + "epoch": 2.7999111703308905, + "grad_norm": 0.655911709523861, + "learning_rate": 2.498321365868471e-06, + "loss": 0.036, + "step": 6304 + }, + { + "epoch": 2.800355318676438, + "grad_norm": 0.3959150499907849, + "learning_rate": 2.49664310778174e-06, + "loss": 0.0253, + "step": 6305 + }, + { + "epoch": 2.800799467021985, + "grad_norm": 0.5170912208789272, + "learning_rate": 2.4949652259920727e-06, + "loss": 0.0279, + "step": 6306 + }, + { + "epoch": 2.801243615367533, + "grad_norm": 0.4192727714282743, + "learning_rate": 2.4932877207516844e-06, + "loss": 0.0288, + "step": 6307 + }, + { + "epoch": 2.80168776371308, + "grad_norm": 0.4736817072181479, + "learning_rate": 2.4916105923127355e-06, + "loss": 0.0395, + "step": 6308 + }, + { + "epoch": 2.8021319120586274, + "grad_norm": 0.4164208273038199, + "learning_rate": 2.489933840927323e-06, + "loss": 0.025, + "step": 6309 + }, + { + "epoch": 2.802576060404175, + "grad_norm": 0.4237935178364351, + "learning_rate": 2.4882574668474925e-06, + "loss": 0.0326, + "step": 6310 + }, + { + "epoch": 2.8030202087497225, + "grad_norm": 0.3766334421419762, + "learning_rate": 2.486581470325232e-06, + "loss": 0.0228, + "step": 6311 + }, + { + "epoch": 2.8034643570952698, + "grad_norm": 0.41806136620537815, + "learning_rate": 2.484905851612471e-06, + "loss": 0.0364, + "step": 6312 + }, + { + "epoch": 2.803908505440817, + "grad_norm": 0.5567970380481517, + "learning_rate": 2.4832306109610877e-06, + "loss": 0.0575, + "step": 6313 + }, + { + "epoch": 2.804352653786365, + "grad_norm": 0.38489435628632473, + "learning_rate": 2.4815557486228937e-06, + "loss": 0.0243, + "step": 6314 + }, + { + "epoch": 2.804796802131912, + "grad_norm": 0.33130559466084525, + "learning_rate": 2.479881264849651e-06, + "loss": 0.0237, + "step": 6315 + }, + { + "epoch": 2.8052409504774594, + "grad_norm": 1.680243538004499, + "learning_rate": 2.478207159893064e-06, + "loss": 0.032, + "step": 6316 + }, + { + "epoch": 2.805685098823007, + "grad_norm": 0.48052932859070213, + "learning_rate": 2.476533434004779e-06, + "loss": 0.0311, + "step": 6317 + }, + { + "epoch": 2.8061292471685544, + "grad_norm": 0.36896833958182274, + "learning_rate": 2.474860087436384e-06, + "loss": 0.0261, + "step": 6318 + }, + { + "epoch": 2.8065733955141017, + "grad_norm": 0.633026238867653, + "learning_rate": 2.4731871204394155e-06, + "loss": 0.048, + "step": 6319 + }, + { + "epoch": 2.807017543859649, + "grad_norm": 0.4418796227317967, + "learning_rate": 2.4715145332653433e-06, + "loss": 0.0359, + "step": 6320 + }, + { + "epoch": 2.8074616922051967, + "grad_norm": 0.4502178155781612, + "learning_rate": 2.4698423261655887e-06, + "loss": 0.0306, + "step": 6321 + }, + { + "epoch": 2.807905840550744, + "grad_norm": 0.4566900893012358, + "learning_rate": 2.468170499391512e-06, + "loss": 0.0362, + "step": 6322 + }, + { + "epoch": 2.8083499888962913, + "grad_norm": 0.38715597816758157, + "learning_rate": 2.4664990531944176e-06, + "loss": 0.0404, + "step": 6323 + }, + { + "epoch": 2.808794137241839, + "grad_norm": 0.43966470207668296, + "learning_rate": 2.4648279878255523e-06, + "loss": 0.0258, + "step": 6324 + }, + { + "epoch": 2.8092382855873863, + "grad_norm": 0.3596530155746306, + "learning_rate": 2.4631573035361073e-06, + "loss": 0.0294, + "step": 6325 + }, + { + "epoch": 2.8096824339329336, + "grad_norm": 0.44583220946986085, + "learning_rate": 2.4614870005772105e-06, + "loss": 0.0223, + "step": 6326 + }, + { + "epoch": 2.810126582278481, + "grad_norm": 0.5911442996714825, + "learning_rate": 2.45981707919994e-06, + "loss": 0.0393, + "step": 6327 + }, + { + "epoch": 2.810570730624028, + "grad_norm": 0.5581799888907024, + "learning_rate": 2.458147539655313e-06, + "loss": 0.0326, + "step": 6328 + }, + { + "epoch": 2.811014878969576, + "grad_norm": 0.39311733798131593, + "learning_rate": 2.4564783821942884e-06, + "loss": 0.029, + "step": 6329 + }, + { + "epoch": 2.811459027315123, + "grad_norm": 0.4450796457627704, + "learning_rate": 2.454809607067772e-06, + "loss": 0.0369, + "step": 6330 + }, + { + "epoch": 2.811903175660671, + "grad_norm": 0.43590500853966985, + "learning_rate": 2.4531412145266055e-06, + "loss": 0.0386, + "step": 6331 + }, + { + "epoch": 2.8123473240062182, + "grad_norm": 0.457972822762452, + "learning_rate": 2.4514732048215774e-06, + "loss": 0.042, + "step": 6332 + }, + { + "epoch": 2.8127914723517655, + "grad_norm": 0.32284714510978635, + "learning_rate": 2.4498055782034187e-06, + "loss": 0.0334, + "step": 6333 + }, + { + "epoch": 2.813235620697313, + "grad_norm": 0.36527473928195653, + "learning_rate": 2.4481383349228016e-06, + "loss": 0.0253, + "step": 6334 + }, + { + "epoch": 2.81367976904286, + "grad_norm": 0.3860105760601358, + "learning_rate": 2.446471475230342e-06, + "loss": 0.0391, + "step": 6335 + }, + { + "epoch": 2.814123917388408, + "grad_norm": 0.41855117247115486, + "learning_rate": 2.4448049993765975e-06, + "loss": 0.0296, + "step": 6336 + }, + { + "epoch": 2.814568065733955, + "grad_norm": 0.4146118074577428, + "learning_rate": 2.4431389076120657e-06, + "loss": 0.0325, + "step": 6337 + }, + { + "epoch": 2.8150122140795024, + "grad_norm": 0.3990029724898874, + "learning_rate": 2.4414732001871892e-06, + "loss": 0.0337, + "step": 6338 + }, + { + "epoch": 2.81545636242505, + "grad_norm": 0.5638606230897282, + "learning_rate": 2.4398078773523526e-06, + "loss": 0.0445, + "step": 6339 + }, + { + "epoch": 2.8159005107705974, + "grad_norm": 0.45902483366087526, + "learning_rate": 2.438142939357882e-06, + "loss": 0.0417, + "step": 6340 + }, + { + "epoch": 2.8163446591161447, + "grad_norm": 0.3183617092675609, + "learning_rate": 2.4364783864540482e-06, + "loss": 0.0252, + "step": 6341 + }, + { + "epoch": 2.816788807461692, + "grad_norm": 0.3554967849352285, + "learning_rate": 2.434814218891057e-06, + "loss": 0.0253, + "step": 6342 + }, + { + "epoch": 2.8172329558072398, + "grad_norm": 0.39321716328224376, + "learning_rate": 2.433150436919064e-06, + "loss": 0.0272, + "step": 6343 + }, + { + "epoch": 2.817677104152787, + "grad_norm": 0.5605795416471407, + "learning_rate": 2.4314870407881637e-06, + "loss": 0.0348, + "step": 6344 + }, + { + "epoch": 2.8181212524983343, + "grad_norm": 0.3732190806583137, + "learning_rate": 2.4298240307483923e-06, + "loss": 0.0233, + "step": 6345 + }, + { + "epoch": 2.818565400843882, + "grad_norm": 0.32751466948341074, + "learning_rate": 2.4281614070497282e-06, + "loss": 0.0284, + "step": 6346 + }, + { + "epoch": 2.8190095491894294, + "grad_norm": 0.38489431013084585, + "learning_rate": 2.4264991699420953e-06, + "loss": 0.0235, + "step": 6347 + }, + { + "epoch": 2.8194536975349767, + "grad_norm": 0.5207677415942126, + "learning_rate": 2.4248373196753512e-06, + "loss": 0.0362, + "step": 6348 + }, + { + "epoch": 2.819897845880524, + "grad_norm": 0.39019062549891126, + "learning_rate": 2.423175856499302e-06, + "loss": 0.0221, + "step": 6349 + }, + { + "epoch": 2.8203419942260712, + "grad_norm": 0.4152809637919385, + "learning_rate": 2.4215147806636942e-06, + "loss": 0.0314, + "step": 6350 + }, + { + "epoch": 2.820786142571619, + "grad_norm": 0.4000109524825699, + "learning_rate": 2.4198540924182156e-06, + "loss": 0.0307, + "step": 6351 + }, + { + "epoch": 2.8212302909171663, + "grad_norm": 0.4388997785747502, + "learning_rate": 2.4181937920124966e-06, + "loss": 0.0298, + "step": 6352 + }, + { + "epoch": 2.821674439262714, + "grad_norm": 0.4019919408619044, + "learning_rate": 2.4165338796961093e-06, + "loss": 0.033, + "step": 6353 + }, + { + "epoch": 2.8221185876082613, + "grad_norm": 0.570406566396068, + "learning_rate": 2.414874355718563e-06, + "loss": 0.0379, + "step": 6354 + }, + { + "epoch": 2.8225627359538086, + "grad_norm": 0.4407385335769093, + "learning_rate": 2.413215220329315e-06, + "loss": 0.0282, + "step": 6355 + }, + { + "epoch": 2.823006884299356, + "grad_norm": 0.38569267409608843, + "learning_rate": 2.411556473777761e-06, + "loss": 0.0282, + "step": 6356 + }, + { + "epoch": 2.823451032644903, + "grad_norm": 0.4606495839160357, + "learning_rate": 2.4098981163132395e-06, + "loss": 0.0342, + "step": 6357 + }, + { + "epoch": 2.823895180990451, + "grad_norm": 0.43263213722768684, + "learning_rate": 2.4082401481850306e-06, + "loss": 0.0369, + "step": 6358 + }, + { + "epoch": 2.824339329335998, + "grad_norm": 0.5551215018246358, + "learning_rate": 2.4065825696423522e-06, + "loss": 0.0327, + "step": 6359 + }, + { + "epoch": 2.8247834776815455, + "grad_norm": 0.3806554615954976, + "learning_rate": 2.4049253809343678e-06, + "loss": 0.0333, + "step": 6360 + }, + { + "epoch": 2.825227626027093, + "grad_norm": 0.4594230517569721, + "learning_rate": 2.4032685823101814e-06, + "loss": 0.0302, + "step": 6361 + }, + { + "epoch": 2.8256717743726405, + "grad_norm": 0.49282566341591966, + "learning_rate": 2.4016121740188375e-06, + "loss": 0.0283, + "step": 6362 + }, + { + "epoch": 2.826115922718188, + "grad_norm": 0.37575563744052526, + "learning_rate": 2.3999561563093234e-06, + "loss": 0.0349, + "step": 6363 + }, + { + "epoch": 2.826560071063735, + "grad_norm": 0.4760513100430366, + "learning_rate": 2.3983005294305673e-06, + "loss": 0.0374, + "step": 6364 + }, + { + "epoch": 2.827004219409283, + "grad_norm": 0.395278083467666, + "learning_rate": 2.396645293631435e-06, + "loss": 0.0248, + "step": 6365 + }, + { + "epoch": 2.82744836775483, + "grad_norm": 0.4876360922819094, + "learning_rate": 2.3949904491607384e-06, + "loss": 0.0342, + "step": 6366 + }, + { + "epoch": 2.8278925161003774, + "grad_norm": 0.40885930978989254, + "learning_rate": 2.393335996267229e-06, + "loss": 0.0325, + "step": 6367 + }, + { + "epoch": 2.828336664445925, + "grad_norm": 0.44935957035165386, + "learning_rate": 2.3916819351995984e-06, + "loss": 0.0273, + "step": 6368 + }, + { + "epoch": 2.8287808127914724, + "grad_norm": 1.506178960229344, + "learning_rate": 2.3900282662064806e-06, + "loss": 0.059, + "step": 6369 + }, + { + "epoch": 2.8292249611370197, + "grad_norm": 0.30959880923138633, + "learning_rate": 2.3883749895364523e-06, + "loss": 0.0217, + "step": 6370 + }, + { + "epoch": 2.829669109482567, + "grad_norm": 0.7823253783794836, + "learning_rate": 2.3867221054380244e-06, + "loss": 0.0329, + "step": 6371 + }, + { + "epoch": 2.8301132578281147, + "grad_norm": 0.4806013956209637, + "learning_rate": 2.3850696141596563e-06, + "loss": 0.0455, + "step": 6372 + }, + { + "epoch": 2.830557406173662, + "grad_norm": 0.6896449670646502, + "learning_rate": 2.3834175159497446e-06, + "loss": 0.0562, + "step": 6373 + }, + { + "epoch": 2.8310015545192093, + "grad_norm": 0.4709885726846783, + "learning_rate": 2.3817658110566288e-06, + "loss": 0.0326, + "step": 6374 + }, + { + "epoch": 2.831445702864757, + "grad_norm": 0.4631209021676538, + "learning_rate": 2.380114499728589e-06, + "loss": 0.0265, + "step": 6375 + }, + { + "epoch": 2.8318898512103043, + "grad_norm": 0.4360372773811446, + "learning_rate": 2.3784635822138424e-06, + "loss": 0.0409, + "step": 6376 + }, + { + "epoch": 2.8323339995558516, + "grad_norm": 0.36688129563252986, + "learning_rate": 2.3768130587605513e-06, + "loss": 0.0248, + "step": 6377 + }, + { + "epoch": 2.832778147901399, + "grad_norm": 0.37437289245252986, + "learning_rate": 2.3751629296168177e-06, + "loss": 0.0305, + "step": 6378 + }, + { + "epoch": 2.833222296246946, + "grad_norm": 0.531778233535108, + "learning_rate": 2.3735131950306845e-06, + "loss": 0.0352, + "step": 6379 + }, + { + "epoch": 2.833666444592494, + "grad_norm": 0.4416928778863043, + "learning_rate": 2.371863855250134e-06, + "loss": 0.0356, + "step": 6380 + }, + { + "epoch": 2.8341105929380412, + "grad_norm": 0.3539473400154312, + "learning_rate": 2.3702149105230914e-06, + "loss": 0.0329, + "step": 6381 + }, + { + "epoch": 2.834554741283589, + "grad_norm": 0.8474469374112829, + "learning_rate": 2.3685663610974193e-06, + "loss": 0.0339, + "step": 6382 + }, + { + "epoch": 2.8349988896291363, + "grad_norm": 0.42747029305832396, + "learning_rate": 2.3669182072209225e-06, + "loss": 0.0381, + "step": 6383 + }, + { + "epoch": 2.8354430379746836, + "grad_norm": 0.7608488814785124, + "learning_rate": 2.3652704491413477e-06, + "loss": 0.032, + "step": 6384 + }, + { + "epoch": 2.835887186320231, + "grad_norm": 0.388700958157769, + "learning_rate": 2.3636230871063803e-06, + "loss": 0.0273, + "step": 6385 + }, + { + "epoch": 2.836331334665778, + "grad_norm": 0.5022022192841024, + "learning_rate": 2.3619761213636496e-06, + "loss": 0.0437, + "step": 6386 + }, + { + "epoch": 2.836775483011326, + "grad_norm": 0.34657365126440165, + "learning_rate": 2.360329552160718e-06, + "loss": 0.0246, + "step": 6387 + }, + { + "epoch": 2.837219631356873, + "grad_norm": 0.3939450249882695, + "learning_rate": 2.358683379745094e-06, + "loss": 0.0267, + "step": 6388 + }, + { + "epoch": 2.8376637797024205, + "grad_norm": 0.41677776471298256, + "learning_rate": 2.357037604364229e-06, + "loss": 0.0324, + "step": 6389 + }, + { + "epoch": 2.838107928047968, + "grad_norm": 0.3562978131516035, + "learning_rate": 2.3553922262655045e-06, + "loss": 0.0289, + "step": 6390 + }, + { + "epoch": 2.8385520763935155, + "grad_norm": 0.3800097306965494, + "learning_rate": 2.3537472456962536e-06, + "loss": 0.0269, + "step": 6391 + }, + { + "epoch": 2.8389962247390628, + "grad_norm": 0.3927488750912727, + "learning_rate": 2.3521026629037456e-06, + "loss": 0.032, + "step": 6392 + }, + { + "epoch": 2.83944037308461, + "grad_norm": 0.6796389696750893, + "learning_rate": 2.3504584781351857e-06, + "loss": 0.035, + "step": 6393 + }, + { + "epoch": 2.839884521430158, + "grad_norm": 0.4312865368452842, + "learning_rate": 2.3488146916377246e-06, + "loss": 0.024, + "step": 6394 + }, + { + "epoch": 2.840328669775705, + "grad_norm": 0.5503469902463891, + "learning_rate": 2.3471713036584507e-06, + "loss": 0.0387, + "step": 6395 + }, + { + "epoch": 2.8407728181212524, + "grad_norm": 0.39537225808331916, + "learning_rate": 2.345528314444394e-06, + "loss": 0.0365, + "step": 6396 + }, + { + "epoch": 2.8412169664668, + "grad_norm": 0.4828036261450407, + "learning_rate": 2.343885724242523e-06, + "loss": 0.0359, + "step": 6397 + }, + { + "epoch": 2.8416611148123474, + "grad_norm": 0.3405572301380993, + "learning_rate": 2.342243533299749e-06, + "loss": 0.0296, + "step": 6398 + }, + { + "epoch": 2.8421052631578947, + "grad_norm": 0.3471570803326242, + "learning_rate": 2.3406017418629173e-06, + "loss": 0.0295, + "step": 6399 + }, + { + "epoch": 2.842549411503442, + "grad_norm": 0.30534728377719983, + "learning_rate": 2.3389603501788187e-06, + "loss": 0.0161, + "step": 6400 + }, + { + "epoch": 2.8429935598489897, + "grad_norm": 0.4261866679259168, + "learning_rate": 2.3373193584941833e-06, + "loss": 0.0213, + "step": 6401 + }, + { + "epoch": 2.843437708194537, + "grad_norm": 0.5685316006056542, + "learning_rate": 2.335678767055679e-06, + "loss": 0.0434, + "step": 6402 + }, + { + "epoch": 2.8438818565400843, + "grad_norm": 0.49631869614829277, + "learning_rate": 2.334038576109917e-06, + "loss": 0.0356, + "step": 6403 + }, + { + "epoch": 2.844326004885632, + "grad_norm": 0.5515165392461051, + "learning_rate": 2.332398785903442e-06, + "loss": 0.0363, + "step": 6404 + }, + { + "epoch": 2.8447701532311793, + "grad_norm": 0.532355259631848, + "learning_rate": 2.330759396682744e-06, + "loss": 0.0399, + "step": 6405 + }, + { + "epoch": 2.8452143015767266, + "grad_norm": 0.5194908272702733, + "learning_rate": 2.329120408694253e-06, + "loss": 0.0419, + "step": 6406 + }, + { + "epoch": 2.845658449922274, + "grad_norm": 0.4352160286424336, + "learning_rate": 2.327481822184331e-06, + "loss": 0.0323, + "step": 6407 + }, + { + "epoch": 2.846102598267821, + "grad_norm": 0.40556462446399455, + "learning_rate": 2.3258436373992914e-06, + "loss": 0.0295, + "step": 6408 + }, + { + "epoch": 2.846546746613369, + "grad_norm": 0.48488189817066807, + "learning_rate": 2.3242058545853806e-06, + "loss": 0.0323, + "step": 6409 + }, + { + "epoch": 2.846990894958916, + "grad_norm": 0.7693171033124235, + "learning_rate": 2.322568473988782e-06, + "loss": 0.0304, + "step": 6410 + }, + { + "epoch": 2.847435043304464, + "grad_norm": 0.6057507153145041, + "learning_rate": 2.3209314958556232e-06, + "loss": 0.0262, + "step": 6411 + }, + { + "epoch": 2.8478791916500112, + "grad_norm": 0.4479104368079513, + "learning_rate": 2.319294920431972e-06, + "loss": 0.0363, + "step": 6412 + }, + { + "epoch": 2.8483233399955585, + "grad_norm": 0.5011576633143359, + "learning_rate": 2.317658747963828e-06, + "loss": 0.0415, + "step": 6413 + }, + { + "epoch": 2.848767488341106, + "grad_norm": 0.43395301260402036, + "learning_rate": 2.316022978697143e-06, + "loss": 0.0305, + "step": 6414 + }, + { + "epoch": 2.849211636686653, + "grad_norm": 0.4161396554904535, + "learning_rate": 2.314387612877795e-06, + "loss": 0.031, + "step": 6415 + }, + { + "epoch": 2.849655785032201, + "grad_norm": 0.453771160365058, + "learning_rate": 2.312752650751609e-06, + "loss": 0.0328, + "step": 6416 + }, + { + "epoch": 2.850099933377748, + "grad_norm": 0.4057605385720707, + "learning_rate": 2.3111180925643477e-06, + "loss": 0.0308, + "step": 6417 + }, + { + "epoch": 2.8505440817232954, + "grad_norm": 0.4469537485385078, + "learning_rate": 2.309483938561714e-06, + "loss": 0.0345, + "step": 6418 + }, + { + "epoch": 2.850988230068843, + "grad_norm": 0.3300098185481985, + "learning_rate": 2.3078501889893477e-06, + "loss": 0.028, + "step": 6419 + }, + { + "epoch": 2.8514323784143905, + "grad_norm": 0.36465566926853343, + "learning_rate": 2.3062168440928324e-06, + "loss": 0.0299, + "step": 6420 + }, + { + "epoch": 2.8518765267599377, + "grad_norm": 0.39053368090966234, + "learning_rate": 2.304583904117682e-06, + "loss": 0.029, + "step": 6421 + }, + { + "epoch": 2.852320675105485, + "grad_norm": 0.3264096837914728, + "learning_rate": 2.302951369309358e-06, + "loss": 0.0245, + "step": 6422 + }, + { + "epoch": 2.8527648234510328, + "grad_norm": 0.4328229716057836, + "learning_rate": 2.30131923991326e-06, + "loss": 0.0383, + "step": 6423 + }, + { + "epoch": 2.85320897179658, + "grad_norm": 0.4077375134882527, + "learning_rate": 2.2996875161747194e-06, + "loss": 0.038, + "step": 6424 + }, + { + "epoch": 2.8536531201421274, + "grad_norm": 0.4094918043040497, + "learning_rate": 2.298056198339017e-06, + "loss": 0.0298, + "step": 6425 + }, + { + "epoch": 2.854097268487675, + "grad_norm": 0.32420798744454565, + "learning_rate": 2.296425286651368e-06, + "loss": 0.0335, + "step": 6426 + }, + { + "epoch": 2.8545414168332224, + "grad_norm": 0.4249764666477243, + "learning_rate": 2.294794781356922e-06, + "loss": 0.0373, + "step": 6427 + }, + { + "epoch": 2.8549855651787697, + "grad_norm": 0.8704460304111548, + "learning_rate": 2.293164682700774e-06, + "loss": 0.0456, + "step": 6428 + }, + { + "epoch": 2.855429713524317, + "grad_norm": 0.5067415787866943, + "learning_rate": 2.2915349909279573e-06, + "loss": 0.0348, + "step": 6429 + }, + { + "epoch": 2.8558738618698647, + "grad_norm": 0.38730263257871017, + "learning_rate": 2.2899057062834363e-06, + "loss": 0.0508, + "step": 6430 + }, + { + "epoch": 2.856318010215412, + "grad_norm": 0.426646849291928, + "learning_rate": 2.2882768290121277e-06, + "loss": 0.03, + "step": 6431 + }, + { + "epoch": 2.8567621585609593, + "grad_norm": 0.4433931975798466, + "learning_rate": 2.286648359358874e-06, + "loss": 0.0361, + "step": 6432 + }, + { + "epoch": 2.857206306906507, + "grad_norm": 0.41545375813689084, + "learning_rate": 2.2850202975684637e-06, + "loss": 0.0302, + "step": 6433 + }, + { + "epoch": 2.8576504552520543, + "grad_norm": 0.3777035808530584, + "learning_rate": 2.283392643885624e-06, + "loss": 0.0287, + "step": 6434 + }, + { + "epoch": 2.8580946035976016, + "grad_norm": 0.7240655817753611, + "learning_rate": 2.2817653985550132e-06, + "loss": 0.0336, + "step": 6435 + }, + { + "epoch": 2.858538751943149, + "grad_norm": 0.3261261166151642, + "learning_rate": 2.2801385618212395e-06, + "loss": 0.0244, + "step": 6436 + }, + { + "epoch": 2.858982900288696, + "grad_norm": 0.365601220986991, + "learning_rate": 2.2785121339288446e-06, + "loss": 0.0281, + "step": 6437 + }, + { + "epoch": 2.859427048634244, + "grad_norm": 0.4479492062462993, + "learning_rate": 2.276886115122304e-06, + "loss": 0.0345, + "step": 6438 + }, + { + "epoch": 2.859871196979791, + "grad_norm": 0.3824493182929994, + "learning_rate": 2.2752605056460374e-06, + "loss": 0.0263, + "step": 6439 + }, + { + "epoch": 2.860315345325339, + "grad_norm": 0.444006507921262, + "learning_rate": 2.2736353057444045e-06, + "loss": 0.0286, + "step": 6440 + }, + { + "epoch": 2.8607594936708862, + "grad_norm": 0.4135565678132424, + "learning_rate": 2.272010515661694e-06, + "loss": 0.031, + "step": 6441 + }, + { + "epoch": 2.8612036420164335, + "grad_norm": 0.4704071890757161, + "learning_rate": 2.2703861356421476e-06, + "loss": 0.0374, + "step": 6442 + }, + { + "epoch": 2.861647790361981, + "grad_norm": 0.5654612362616459, + "learning_rate": 2.268762165929931e-06, + "loss": 0.0298, + "step": 6443 + }, + { + "epoch": 2.862091938707528, + "grad_norm": 0.4525755398497248, + "learning_rate": 2.267138606769156e-06, + "loss": 0.0373, + "step": 6444 + }, + { + "epoch": 2.862536087053076, + "grad_norm": 0.3792079187603561, + "learning_rate": 2.2655154584038718e-06, + "loss": 0.0274, + "step": 6445 + }, + { + "epoch": 2.862980235398623, + "grad_norm": 0.6472477866875229, + "learning_rate": 2.263892721078067e-06, + "loss": 0.0306, + "step": 6446 + }, + { + "epoch": 2.8634243837441704, + "grad_norm": 0.4724763504374727, + "learning_rate": 2.2622703950356607e-06, + "loss": 0.0395, + "step": 6447 + }, + { + "epoch": 2.863868532089718, + "grad_norm": 0.5178919025923505, + "learning_rate": 2.2606484805205235e-06, + "loss": 0.0329, + "step": 6448 + }, + { + "epoch": 2.8643126804352654, + "grad_norm": 0.4061558909889993, + "learning_rate": 2.2590269777764516e-06, + "loss": 0.0299, + "step": 6449 + }, + { + "epoch": 2.8647568287808127, + "grad_norm": 0.39901609043961084, + "learning_rate": 2.257405887047186e-06, + "loss": 0.0328, + "step": 6450 + }, + { + "epoch": 2.86520097712636, + "grad_norm": 0.49608870962855456, + "learning_rate": 2.2557852085764053e-06, + "loss": 0.0303, + "step": 6451 + }, + { + "epoch": 2.8656451254719078, + "grad_norm": 0.38259419645750875, + "learning_rate": 2.254164942607721e-06, + "loss": 0.0219, + "step": 6452 + }, + { + "epoch": 2.866089273817455, + "grad_norm": 0.42520263669940206, + "learning_rate": 2.2525450893846906e-06, + "loss": 0.0248, + "step": 6453 + }, + { + "epoch": 2.8665334221630023, + "grad_norm": 0.343086064868294, + "learning_rate": 2.2509256491508063e-06, + "loss": 0.0226, + "step": 6454 + }, + { + "epoch": 2.86697757050855, + "grad_norm": 0.48422523578882104, + "learning_rate": 2.249306622149494e-06, + "loss": 0.0336, + "step": 6455 + }, + { + "epoch": 2.8674217188540974, + "grad_norm": 0.40460550415553925, + "learning_rate": 2.2476880086241225e-06, + "loss": 0.0325, + "step": 6456 + }, + { + "epoch": 2.8678658671996446, + "grad_norm": 0.38849375983156137, + "learning_rate": 2.2460698088179985e-06, + "loss": 0.034, + "step": 6457 + }, + { + "epoch": 2.868310015545192, + "grad_norm": 0.3659863472898491, + "learning_rate": 2.24445202297436e-06, + "loss": 0.0311, + "step": 6458 + }, + { + "epoch": 2.8687541638907397, + "grad_norm": 0.32032734176784455, + "learning_rate": 2.242834651336394e-06, + "loss": 0.0304, + "step": 6459 + }, + { + "epoch": 2.869198312236287, + "grad_norm": 0.2895901124804986, + "learning_rate": 2.2412176941472146e-06, + "loss": 0.0252, + "step": 6460 + }, + { + "epoch": 2.8696424605818343, + "grad_norm": 0.4544732858743473, + "learning_rate": 2.2396011516498794e-06, + "loss": 0.0378, + "step": 6461 + }, + { + "epoch": 2.870086608927382, + "grad_norm": 0.33869349353641764, + "learning_rate": 2.2379850240873836e-06, + "loss": 0.0243, + "step": 6462 + }, + { + "epoch": 2.8705307572729293, + "grad_norm": 0.39779581383793367, + "learning_rate": 2.2363693117026554e-06, + "loss": 0.0328, + "step": 6463 + }, + { + "epoch": 2.8709749056184766, + "grad_norm": 0.5359509284893266, + "learning_rate": 2.2347540147385636e-06, + "loss": 0.0276, + "step": 6464 + }, + { + "epoch": 2.871419053964024, + "grad_norm": 0.3901564758848709, + "learning_rate": 2.2331391334379205e-06, + "loss": 0.0286, + "step": 6465 + }, + { + "epoch": 2.871863202309571, + "grad_norm": 0.4664762506585298, + "learning_rate": 2.231524668043465e-06, + "loss": 0.0421, + "step": 6466 + }, + { + "epoch": 2.872307350655119, + "grad_norm": 0.4098655625284004, + "learning_rate": 2.229910618797879e-06, + "loss": 0.0376, + "step": 6467 + }, + { + "epoch": 2.872751499000666, + "grad_norm": 0.4583633277386958, + "learning_rate": 2.228296985943785e-06, + "loss": 0.0383, + "step": 6468 + }, + { + "epoch": 2.873195647346214, + "grad_norm": 0.37685547831341093, + "learning_rate": 2.226683769723734e-06, + "loss": 0.028, + "step": 6469 + }, + { + "epoch": 2.873639795691761, + "grad_norm": 0.400587688868656, + "learning_rate": 2.225070970380224e-06, + "loss": 0.0337, + "step": 6470 + }, + { + "epoch": 2.8740839440373085, + "grad_norm": 0.3702484950984152, + "learning_rate": 2.2234585881556864e-06, + "loss": 0.0274, + "step": 6471 + }, + { + "epoch": 2.874528092382856, + "grad_norm": 0.4674975669326644, + "learning_rate": 2.2218466232924867e-06, + "loss": 0.03, + "step": 6472 + }, + { + "epoch": 2.874972240728403, + "grad_norm": 0.37515826872200714, + "learning_rate": 2.2202350760329328e-06, + "loss": 0.0257, + "step": 6473 + }, + { + "epoch": 2.875416389073951, + "grad_norm": 0.3257808540366288, + "learning_rate": 2.2186239466192676e-06, + "loss": 0.0219, + "step": 6474 + }, + { + "epoch": 2.875860537419498, + "grad_norm": 0.4753925237563355, + "learning_rate": 2.2170132352936675e-06, + "loss": 0.0434, + "step": 6475 + }, + { + "epoch": 2.8763046857650454, + "grad_norm": 0.44581196717512095, + "learning_rate": 2.2154029422982563e-06, + "loss": 0.0371, + "step": 6476 + }, + { + "epoch": 2.876748834110593, + "grad_norm": 0.4263527151388236, + "learning_rate": 2.2137930678750835e-06, + "loss": 0.0376, + "step": 6477 + }, + { + "epoch": 2.8771929824561404, + "grad_norm": 0.46476168558248493, + "learning_rate": 2.2121836122661416e-06, + "loss": 0.034, + "step": 6478 + }, + { + "epoch": 2.8776371308016877, + "grad_norm": 0.34003139533262217, + "learning_rate": 2.2105745757133612e-06, + "loss": 0.0294, + "step": 6479 + }, + { + "epoch": 2.878081279147235, + "grad_norm": 0.49849907569693336, + "learning_rate": 2.2089659584586047e-06, + "loss": 0.0377, + "step": 6480 + }, + { + "epoch": 2.8785254274927827, + "grad_norm": 0.5382448752849822, + "learning_rate": 2.2073577607436737e-06, + "loss": 0.0443, + "step": 6481 + }, + { + "epoch": 2.87896957583833, + "grad_norm": 0.3879946256129708, + "learning_rate": 2.2057499828103142e-06, + "loss": 0.0327, + "step": 6482 + }, + { + "epoch": 2.8794137241838773, + "grad_norm": 0.5104922320743165, + "learning_rate": 2.2041426249001955e-06, + "loss": 0.0362, + "step": 6483 + }, + { + "epoch": 2.879857872529425, + "grad_norm": 0.4930647183156878, + "learning_rate": 2.2025356872549345e-06, + "loss": 0.0256, + "step": 6484 + }, + { + "epoch": 2.8803020208749723, + "grad_norm": 0.3966997553776206, + "learning_rate": 2.2009291701160817e-06, + "loss": 0.0271, + "step": 6485 + }, + { + "epoch": 2.8807461692205196, + "grad_norm": 0.4962925195644924, + "learning_rate": 2.1993230737251216e-06, + "loss": 0.0371, + "step": 6486 + }, + { + "epoch": 2.881190317566067, + "grad_norm": 0.42642379725740076, + "learning_rate": 2.197717398323477e-06, + "loss": 0.0263, + "step": 6487 + }, + { + "epoch": 2.881634465911614, + "grad_norm": 0.4141444359146802, + "learning_rate": 2.1961121441525113e-06, + "loss": 0.0344, + "step": 6488 + }, + { + "epoch": 2.882078614257162, + "grad_norm": 0.5311512207572496, + "learning_rate": 2.19450731145352e-06, + "loss": 0.0364, + "step": 6489 + }, + { + "epoch": 2.8825227626027092, + "grad_norm": 0.4267919708323801, + "learning_rate": 2.192902900467736e-06, + "loss": 0.0274, + "step": 6490 + }, + { + "epoch": 2.882966910948257, + "grad_norm": 0.31959536449480613, + "learning_rate": 2.1912989114363326e-06, + "loss": 0.0242, + "step": 6491 + }, + { + "epoch": 2.8834110592938043, + "grad_norm": 0.44718270123585074, + "learning_rate": 2.1896953446004104e-06, + "loss": 0.0397, + "step": 6492 + }, + { + "epoch": 2.8838552076393515, + "grad_norm": 0.4584663064416918, + "learning_rate": 2.1880922002010208e-06, + "loss": 0.0321, + "step": 6493 + }, + { + "epoch": 2.884299355984899, + "grad_norm": 0.6111455889547676, + "learning_rate": 2.186489478479137e-06, + "loss": 0.0423, + "step": 6494 + }, + { + "epoch": 2.884743504330446, + "grad_norm": 0.5635891360279379, + "learning_rate": 2.1848871796756784e-06, + "loss": 0.0559, + "step": 6495 + }, + { + "epoch": 2.885187652675994, + "grad_norm": 0.49224878038120795, + "learning_rate": 2.183285304031498e-06, + "loss": 0.0381, + "step": 6496 + }, + { + "epoch": 2.885631801021541, + "grad_norm": 0.412800271356595, + "learning_rate": 2.1816838517873834e-06, + "loss": 0.0281, + "step": 6497 + }, + { + "epoch": 2.8860759493670884, + "grad_norm": 0.42819522838748797, + "learning_rate": 2.1800828231840583e-06, + "loss": 0.0349, + "step": 6498 + }, + { + "epoch": 2.886520097712636, + "grad_norm": 0.3172148118997228, + "learning_rate": 2.178482218462191e-06, + "loss": 0.0198, + "step": 6499 + }, + { + "epoch": 2.8869642460581835, + "grad_norm": 0.6336802661233113, + "learning_rate": 2.176882037862373e-06, + "loss": 0.0527, + "step": 6500 + }, + { + "epoch": 2.8874083944037308, + "grad_norm": 0.6279273972202338, + "learning_rate": 2.1752822816251405e-06, + "loss": 0.0366, + "step": 6501 + }, + { + "epoch": 2.887852542749278, + "grad_norm": 0.6022931994329644, + "learning_rate": 2.173682949990968e-06, + "loss": 0.0306, + "step": 6502 + }, + { + "epoch": 2.888296691094826, + "grad_norm": 0.5362069754281357, + "learning_rate": 2.172084043200256e-06, + "loss": 0.0342, + "step": 6503 + }, + { + "epoch": 2.888740839440373, + "grad_norm": 0.7929642629941647, + "learning_rate": 2.17048556149335e-06, + "loss": 0.0333, + "step": 6504 + }, + { + "epoch": 2.8891849877859204, + "grad_norm": 0.33575039821832753, + "learning_rate": 2.16888750511053e-06, + "loss": 0.0242, + "step": 6505 + }, + { + "epoch": 2.889629136131468, + "grad_norm": 0.3741108552741579, + "learning_rate": 2.1672898742920094e-06, + "loss": 0.0273, + "step": 6506 + }, + { + "epoch": 2.8900732844770154, + "grad_norm": 0.3774493813914148, + "learning_rate": 2.1656926692779423e-06, + "loss": 0.0313, + "step": 6507 + }, + { + "epoch": 2.8905174328225627, + "grad_norm": 0.9718206954815033, + "learning_rate": 2.1640958903084118e-06, + "loss": 0.0401, + "step": 6508 + }, + { + "epoch": 2.89096158116811, + "grad_norm": 0.3694310192990195, + "learning_rate": 2.1624995376234403e-06, + "loss": 0.0348, + "step": 6509 + }, + { + "epoch": 2.8914057295136577, + "grad_norm": 0.6520572022397477, + "learning_rate": 2.1609036114629933e-06, + "loss": 0.0402, + "step": 6510 + }, + { + "epoch": 2.891849877859205, + "grad_norm": 0.4215772661238576, + "learning_rate": 2.159308112066959e-06, + "loss": 0.0293, + "step": 6511 + }, + { + "epoch": 2.8922940262047523, + "grad_norm": 0.38467024272056793, + "learning_rate": 2.1577130396751705e-06, + "loss": 0.0325, + "step": 6512 + }, + { + "epoch": 2.8927381745503, + "grad_norm": 0.38039523075207066, + "learning_rate": 2.1561183945273958e-06, + "loss": 0.029, + "step": 6513 + }, + { + "epoch": 2.8931823228958473, + "grad_norm": 0.35012157636808233, + "learning_rate": 2.154524176863334e-06, + "loss": 0.0298, + "step": 6514 + }, + { + "epoch": 2.8936264712413946, + "grad_norm": 0.32604790956444796, + "learning_rate": 2.1529303869226244e-06, + "loss": 0.0223, + "step": 6515 + }, + { + "epoch": 2.894070619586942, + "grad_norm": 0.6062038168527448, + "learning_rate": 2.151337024944841e-06, + "loss": 0.0328, + "step": 6516 + }, + { + "epoch": 2.894514767932489, + "grad_norm": 0.4616383089873512, + "learning_rate": 2.149744091169493e-06, + "loss": 0.0355, + "step": 6517 + }, + { + "epoch": 2.894958916278037, + "grad_norm": 0.31498629669392003, + "learning_rate": 2.1481515858360254e-06, + "loss": 0.0223, + "step": 6518 + }, + { + "epoch": 2.895403064623584, + "grad_norm": 0.3418552975452249, + "learning_rate": 2.1465595091838204e-06, + "loss": 0.0284, + "step": 6519 + }, + { + "epoch": 2.895847212969132, + "grad_norm": 0.6349150689262983, + "learning_rate": 2.144967861452191e-06, + "loss": 0.0288, + "step": 6520 + }, + { + "epoch": 2.8962913613146792, + "grad_norm": 0.3672718765769208, + "learning_rate": 2.143376642880391e-06, + "loss": 0.028, + "step": 6521 + }, + { + "epoch": 2.8967355096602265, + "grad_norm": 0.6329188736427485, + "learning_rate": 2.141785853707607e-06, + "loss": 0.0419, + "step": 6522 + }, + { + "epoch": 2.897179658005774, + "grad_norm": 0.35536621143471536, + "learning_rate": 2.1401954941729614e-06, + "loss": 0.0276, + "step": 6523 + }, + { + "epoch": 2.897623806351321, + "grad_norm": 0.4803884960432198, + "learning_rate": 2.1386055645155144e-06, + "loss": 0.0334, + "step": 6524 + }, + { + "epoch": 2.898067954696869, + "grad_norm": 0.7351089421454166, + "learning_rate": 2.137016064974256e-06, + "loss": 0.034, + "step": 6525 + }, + { + "epoch": 2.898512103042416, + "grad_norm": 0.35740753737071096, + "learning_rate": 2.135426995788115e-06, + "loss": 0.0255, + "step": 6526 + }, + { + "epoch": 2.8989562513879634, + "grad_norm": 0.40072320482835744, + "learning_rate": 2.133838357195961e-06, + "loss": 0.0369, + "step": 6527 + }, + { + "epoch": 2.899400399733511, + "grad_norm": 0.3671291054027193, + "learning_rate": 2.1322501494365873e-06, + "loss": 0.0248, + "step": 6528 + }, + { + "epoch": 2.8998445480790584, + "grad_norm": 0.46498735962046045, + "learning_rate": 2.1306623727487306e-06, + "loss": 0.0336, + "step": 6529 + }, + { + "epoch": 2.9002886964246057, + "grad_norm": 0.37582439553874314, + "learning_rate": 2.1290750273710625e-06, + "loss": 0.0311, + "step": 6530 + }, + { + "epoch": 2.900732844770153, + "grad_norm": 0.3769841744886795, + "learning_rate": 2.127488113542185e-06, + "loss": 0.0219, + "step": 6531 + }, + { + "epoch": 2.9011769931157008, + "grad_norm": 0.5408699565947783, + "learning_rate": 2.1259016315006388e-06, + "loss": 0.0369, + "step": 6532 + }, + { + "epoch": 2.901621141461248, + "grad_norm": 0.41566157060894093, + "learning_rate": 2.1243155814849003e-06, + "loss": 0.0322, + "step": 6533 + }, + { + "epoch": 2.9020652898067953, + "grad_norm": 0.4200346887421879, + "learning_rate": 2.1227299637333793e-06, + "loss": 0.0265, + "step": 6534 + }, + { + "epoch": 2.902509438152343, + "grad_norm": 0.481827142730724, + "learning_rate": 2.1211447784844223e-06, + "loss": 0.0332, + "step": 6535 + }, + { + "epoch": 2.9029535864978904, + "grad_norm": 0.41218410599797384, + "learning_rate": 2.1195600259763064e-06, + "loss": 0.0344, + "step": 6536 + }, + { + "epoch": 2.9033977348434377, + "grad_norm": 0.5125400394558322, + "learning_rate": 2.1179757064472495e-06, + "loss": 0.0311, + "step": 6537 + }, + { + "epoch": 2.903841883188985, + "grad_norm": 0.5243375032164893, + "learning_rate": 2.1163918201354005e-06, + "loss": 0.0442, + "step": 6538 + }, + { + "epoch": 2.9042860315345327, + "grad_norm": 0.36105640514776777, + "learning_rate": 2.114808367278845e-06, + "loss": 0.0315, + "step": 6539 + }, + { + "epoch": 2.90473017988008, + "grad_norm": 0.37491287513386307, + "learning_rate": 2.113225348115603e-06, + "loss": 0.0273, + "step": 6540 + }, + { + "epoch": 2.9051743282256273, + "grad_norm": 0.3675658951021628, + "learning_rate": 2.11164276288363e-06, + "loss": 0.0249, + "step": 6541 + }, + { + "epoch": 2.905618476571175, + "grad_norm": 0.47441864691687574, + "learning_rate": 2.110060611820813e-06, + "loss": 0.0439, + "step": 6542 + }, + { + "epoch": 2.9060626249167223, + "grad_norm": 0.4331767301461994, + "learning_rate": 2.1084788951649753e-06, + "loss": 0.0308, + "step": 6543 + }, + { + "epoch": 2.9065067732622696, + "grad_norm": 0.32889369683548264, + "learning_rate": 2.106897613153882e-06, + "loss": 0.0269, + "step": 6544 + }, + { + "epoch": 2.906950921607817, + "grad_norm": 0.46027509577610654, + "learning_rate": 2.105316766025221e-06, + "loss": 0.0309, + "step": 6545 + }, + { + "epoch": 2.907395069953364, + "grad_norm": 0.5494880014843939, + "learning_rate": 2.1037363540166224e-06, + "loss": 0.0273, + "step": 6546 + }, + { + "epoch": 2.907839218298912, + "grad_norm": 0.4011778001737706, + "learning_rate": 2.1021563773656493e-06, + "loss": 0.0352, + "step": 6547 + }, + { + "epoch": 2.908283366644459, + "grad_norm": 0.45266951475580575, + "learning_rate": 2.1005768363097977e-06, + "loss": 0.0301, + "step": 6548 + }, + { + "epoch": 2.908727514990007, + "grad_norm": 0.49868301394140047, + "learning_rate": 2.0989977310865e-06, + "loss": 0.0339, + "step": 6549 + }, + { + "epoch": 2.909171663335554, + "grad_norm": 0.6954722852910884, + "learning_rate": 2.0974190619331224e-06, + "loss": 0.0349, + "step": 6550 + }, + { + "epoch": 2.9096158116811015, + "grad_norm": 0.4383145753772345, + "learning_rate": 2.0958408290869662e-06, + "loss": 0.0356, + "step": 6551 + }, + { + "epoch": 2.910059960026649, + "grad_norm": 0.3704293568574071, + "learning_rate": 2.0942630327852687e-06, + "loss": 0.0303, + "step": 6552 + }, + { + "epoch": 2.910504108372196, + "grad_norm": 0.3498231235044318, + "learning_rate": 2.092685673265195e-06, + "loss": 0.0318, + "step": 6553 + }, + { + "epoch": 2.910948256717744, + "grad_norm": 0.36641412131018597, + "learning_rate": 2.0911087507638513e-06, + "loss": 0.0237, + "step": 6554 + }, + { + "epoch": 2.911392405063291, + "grad_norm": 0.4005634976837491, + "learning_rate": 2.0895322655182754e-06, + "loss": 0.0327, + "step": 6555 + }, + { + "epoch": 2.9118365534088384, + "grad_norm": 0.36560498009667564, + "learning_rate": 2.0879562177654404e-06, + "loss": 0.0247, + "step": 6556 + }, + { + "epoch": 2.912280701754386, + "grad_norm": 0.38952731591083495, + "learning_rate": 2.0863806077422534e-06, + "loss": 0.027, + "step": 6557 + }, + { + "epoch": 2.9127248500999334, + "grad_norm": 0.4138708556738211, + "learning_rate": 2.0848054356855557e-06, + "loss": 0.0304, + "step": 6558 + }, + { + "epoch": 2.9131689984454807, + "grad_norm": 0.5073707851794668, + "learning_rate": 2.08323070183212e-06, + "loss": 0.0353, + "step": 6559 + }, + { + "epoch": 2.913613146791028, + "grad_norm": 0.5942930274632409, + "learning_rate": 2.081656406418658e-06, + "loss": 0.0331, + "step": 6560 + }, + { + "epoch": 2.9140572951365757, + "grad_norm": 0.35640079528674945, + "learning_rate": 2.080082549681811e-06, + "loss": 0.0279, + "step": 6561 + }, + { + "epoch": 2.914501443482123, + "grad_norm": 0.3814193495027854, + "learning_rate": 2.0785091318581577e-06, + "loss": 0.0305, + "step": 6562 + }, + { + "epoch": 2.9149455918276703, + "grad_norm": 0.358555755728585, + "learning_rate": 2.076936153184211e-06, + "loss": 0.0239, + "step": 6563 + }, + { + "epoch": 2.915389740173218, + "grad_norm": 0.4250693357842293, + "learning_rate": 2.0753636138964134e-06, + "loss": 0.0286, + "step": 6564 + }, + { + "epoch": 2.9158338885187653, + "grad_norm": 0.6221059510930096, + "learning_rate": 2.0737915142311454e-06, + "loss": 0.0441, + "step": 6565 + }, + { + "epoch": 2.9162780368643126, + "grad_norm": 0.38815256704717155, + "learning_rate": 2.07221985442472e-06, + "loss": 0.0292, + "step": 6566 + }, + { + "epoch": 2.91672218520986, + "grad_norm": 0.420249928229787, + "learning_rate": 2.0706486347133853e-06, + "loss": 0.0289, + "step": 6567 + }, + { + "epoch": 2.9171663335554077, + "grad_norm": 0.39867842058238456, + "learning_rate": 2.0690778553333215e-06, + "loss": 0.0399, + "step": 6568 + }, + { + "epoch": 2.917610481900955, + "grad_norm": 0.35700246538754193, + "learning_rate": 2.0675075165206456e-06, + "loss": 0.0328, + "step": 6569 + }, + { + "epoch": 2.9180546302465022, + "grad_norm": 0.39862639265555505, + "learning_rate": 2.0659376185114024e-06, + "loss": 0.0282, + "step": 6570 + }, + { + "epoch": 2.91849877859205, + "grad_norm": 0.3874386180786859, + "learning_rate": 2.064368161541576e-06, + "loss": 0.0269, + "step": 6571 + }, + { + "epoch": 2.9189429269375973, + "grad_norm": 0.35734528243866814, + "learning_rate": 2.0627991458470826e-06, + "loss": 0.0277, + "step": 6572 + }, + { + "epoch": 2.9193870752831446, + "grad_norm": 0.43326587014764417, + "learning_rate": 2.061230571663772e-06, + "loss": 0.0287, + "step": 6573 + }, + { + "epoch": 2.919831223628692, + "grad_norm": 0.442123671529554, + "learning_rate": 2.0596624392274277e-06, + "loss": 0.0348, + "step": 6574 + }, + { + "epoch": 2.920275371974239, + "grad_norm": 0.41360740653208006, + "learning_rate": 2.058094748773768e-06, + "loss": 0.0245, + "step": 6575 + }, + { + "epoch": 2.920719520319787, + "grad_norm": 0.4521622442444811, + "learning_rate": 2.05652750053844e-06, + "loss": 0.0272, + "step": 6576 + }, + { + "epoch": 2.921163668665334, + "grad_norm": 0.6652238384193798, + "learning_rate": 2.0549606947570295e-06, + "loss": 0.0403, + "step": 6577 + }, + { + "epoch": 2.921607817010882, + "grad_norm": 0.33561551996607286, + "learning_rate": 2.053394331665054e-06, + "loss": 0.023, + "step": 6578 + }, + { + "epoch": 2.922051965356429, + "grad_norm": 0.4533033136781839, + "learning_rate": 2.051828411497964e-06, + "loss": 0.0418, + "step": 6579 + }, + { + "epoch": 2.9224961137019765, + "grad_norm": 0.2933903835866281, + "learning_rate": 2.0502629344911475e-06, + "loss": 0.0204, + "step": 6580 + }, + { + "epoch": 2.9229402620475238, + "grad_norm": 0.4807368532427772, + "learning_rate": 2.0486979008799164e-06, + "loss": 0.0388, + "step": 6581 + }, + { + "epoch": 2.923384410393071, + "grad_norm": 0.39312159469638, + "learning_rate": 2.047133310899525e-06, + "loss": 0.0352, + "step": 6582 + }, + { + "epoch": 2.923828558738619, + "grad_norm": 0.49386287194544953, + "learning_rate": 2.045569164785157e-06, + "loss": 0.0318, + "step": 6583 + }, + { + "epoch": 2.924272707084166, + "grad_norm": 0.3909685186859985, + "learning_rate": 2.044005462771931e-06, + "loss": 0.0244, + "step": 6584 + }, + { + "epoch": 2.9247168554297134, + "grad_norm": 0.4648834902668276, + "learning_rate": 2.0424422050948976e-06, + "loss": 0.0387, + "step": 6585 + }, + { + "epoch": 2.925161003775261, + "grad_norm": 0.36487456799403395, + "learning_rate": 2.0408793919890424e-06, + "loss": 0.0288, + "step": 6586 + }, + { + "epoch": 2.9256051521208084, + "grad_norm": 0.5349577863858896, + "learning_rate": 2.0393170236892795e-06, + "loss": 0.0426, + "step": 6587 + }, + { + "epoch": 2.9260493004663557, + "grad_norm": 0.35432394100233194, + "learning_rate": 2.0377551004304613e-06, + "loss": 0.0304, + "step": 6588 + }, + { + "epoch": 2.926493448811903, + "grad_norm": 0.5133489099915194, + "learning_rate": 2.036193622447371e-06, + "loss": 0.0436, + "step": 6589 + }, + { + "epoch": 2.9269375971574507, + "grad_norm": 0.40489974505235427, + "learning_rate": 2.034632589974726e-06, + "loss": 0.0319, + "step": 6590 + }, + { + "epoch": 2.927381745502998, + "grad_norm": 0.3994253955209103, + "learning_rate": 2.033072003247175e-06, + "loss": 0.0308, + "step": 6591 + }, + { + "epoch": 2.9278258938485453, + "grad_norm": 0.34532953088972845, + "learning_rate": 2.0315118624993035e-06, + "loss": 0.0261, + "step": 6592 + }, + { + "epoch": 2.928270042194093, + "grad_norm": 0.35520607631530465, + "learning_rate": 2.0299521679656225e-06, + "loss": 0.0228, + "step": 6593 + }, + { + "epoch": 2.9287141905396403, + "grad_norm": 0.32280141712366317, + "learning_rate": 2.0283929198805837e-06, + "loss": 0.0189, + "step": 6594 + }, + { + "epoch": 2.9291583388851876, + "grad_norm": 0.43110913730957606, + "learning_rate": 2.0268341184785674e-06, + "loss": 0.0378, + "step": 6595 + }, + { + "epoch": 2.929602487230735, + "grad_norm": 0.354328605582052, + "learning_rate": 2.025275763993888e-06, + "loss": 0.0313, + "step": 6596 + }, + { + "epoch": 2.9300466355762826, + "grad_norm": 0.3880427101102111, + "learning_rate": 2.023717856660795e-06, + "loss": 0.0283, + "step": 6597 + }, + { + "epoch": 2.93049078392183, + "grad_norm": 0.37092577723252107, + "learning_rate": 2.0221603967134645e-06, + "loss": 0.0309, + "step": 6598 + }, + { + "epoch": 2.9309349322673772, + "grad_norm": 0.3853764249821867, + "learning_rate": 2.0206033843860113e-06, + "loss": 0.036, + "step": 6599 + }, + { + "epoch": 2.931379080612925, + "grad_norm": 0.32195066732125943, + "learning_rate": 2.0190468199124804e-06, + "loss": 0.0266, + "step": 6600 + }, + { + "epoch": 2.9318232289584722, + "grad_norm": 0.377934707171494, + "learning_rate": 2.01749070352685e-06, + "loss": 0.0323, + "step": 6601 + }, + { + "epoch": 2.9322673773040195, + "grad_norm": 0.4090601636781386, + "learning_rate": 2.0159350354630307e-06, + "loss": 0.0275, + "step": 6602 + }, + { + "epoch": 2.932711525649567, + "grad_norm": 0.4773326358669816, + "learning_rate": 2.0143798159548677e-06, + "loss": 0.0331, + "step": 6603 + }, + { + "epoch": 2.933155673995114, + "grad_norm": 0.4210797373291252, + "learning_rate": 2.0128250452361334e-06, + "loss": 0.0362, + "step": 6604 + }, + { + "epoch": 2.933599822340662, + "grad_norm": 0.4248736186586538, + "learning_rate": 2.0112707235405386e-06, + "loss": 0.0309, + "step": 6605 + }, + { + "epoch": 2.934043970686209, + "grad_norm": 0.3339641218919188, + "learning_rate": 2.0097168511017234e-06, + "loss": 0.0187, + "step": 6606 + }, + { + "epoch": 2.934488119031757, + "grad_norm": 0.5167791996296286, + "learning_rate": 2.0081634281532613e-06, + "loss": 0.0315, + "step": 6607 + }, + { + "epoch": 2.934932267377304, + "grad_norm": 0.4991803228092949, + "learning_rate": 2.0066104549286602e-06, + "loss": 0.0436, + "step": 6608 + }, + { + "epoch": 2.9353764157228515, + "grad_norm": 0.3779855137295941, + "learning_rate": 2.005057931661355e-06, + "loss": 0.0273, + "step": 6609 + }, + { + "epoch": 2.9358205640683988, + "grad_norm": 0.5036801733855913, + "learning_rate": 2.0035058585847173e-06, + "loss": 0.0328, + "step": 6610 + }, + { + "epoch": 2.936264712413946, + "grad_norm": 0.3544894089098337, + "learning_rate": 2.001954235932051e-06, + "loss": 0.0235, + "step": 6611 + }, + { + "epoch": 2.9367088607594938, + "grad_norm": 0.42819328292612396, + "learning_rate": 2.0004030639365907e-06, + "loss": 0.0308, + "step": 6612 + }, + { + "epoch": 2.937153009105041, + "grad_norm": 0.39806520440496346, + "learning_rate": 1.9988523428315045e-06, + "loss": 0.0277, + "step": 6613 + }, + { + "epoch": 2.9375971574505884, + "grad_norm": 0.41174193415314053, + "learning_rate": 1.997302072849893e-06, + "loss": 0.0282, + "step": 6614 + }, + { + "epoch": 2.938041305796136, + "grad_norm": 0.4414650832704903, + "learning_rate": 1.995752254224786e-06, + "loss": 0.0303, + "step": 6615 + }, + { + "epoch": 2.9384854541416834, + "grad_norm": 0.4432623492720395, + "learning_rate": 1.994202887189148e-06, + "loss": 0.0253, + "step": 6616 + }, + { + "epoch": 2.9389296024872307, + "grad_norm": 0.4800911306612613, + "learning_rate": 1.9926539719758747e-06, + "loss": 0.0388, + "step": 6617 + }, + { + "epoch": 2.939373750832778, + "grad_norm": 0.4059304636548806, + "learning_rate": 1.9911055088177967e-06, + "loss": 0.0276, + "step": 6618 + }, + { + "epoch": 2.9398178991783257, + "grad_norm": 0.5632995645750524, + "learning_rate": 1.9895574979476717e-06, + "loss": 0.0364, + "step": 6619 + }, + { + "epoch": 2.940262047523873, + "grad_norm": 0.3730668889292687, + "learning_rate": 1.9880099395981954e-06, + "loss": 0.0266, + "step": 6620 + }, + { + "epoch": 2.9407061958694203, + "grad_norm": 0.4942132883415099, + "learning_rate": 1.986462834001989e-06, + "loss": 0.0424, + "step": 6621 + }, + { + "epoch": 2.941150344214968, + "grad_norm": 0.35051082450586374, + "learning_rate": 1.984916181391609e-06, + "loss": 0.029, + "step": 6622 + }, + { + "epoch": 2.9415944925605153, + "grad_norm": 0.49757968739541286, + "learning_rate": 1.983369981999544e-06, + "loss": 0.0391, + "step": 6623 + }, + { + "epoch": 2.9420386409060626, + "grad_norm": 0.3866532839130346, + "learning_rate": 1.9818242360582145e-06, + "loss": 0.0196, + "step": 6624 + }, + { + "epoch": 2.94248278925161, + "grad_norm": 0.3312297948056805, + "learning_rate": 1.980278943799974e-06, + "loss": 0.024, + "step": 6625 + }, + { + "epoch": 2.942926937597157, + "grad_norm": 0.369854753089198, + "learning_rate": 1.978734105457103e-06, + "loss": 0.0261, + "step": 6626 + }, + { + "epoch": 2.943371085942705, + "grad_norm": 0.40104725649008754, + "learning_rate": 1.9771897212618172e-06, + "loss": 0.0338, + "step": 6627 + }, + { + "epoch": 2.943815234288252, + "grad_norm": 0.34952669002936676, + "learning_rate": 1.9756457914462677e-06, + "loss": 0.0245, + "step": 6628 + }, + { + "epoch": 2.9442593826338, + "grad_norm": 0.41129129029247236, + "learning_rate": 1.9741023162425265e-06, + "loss": 0.0313, + "step": 6629 + }, + { + "epoch": 2.9447035309793472, + "grad_norm": 0.5211003073625236, + "learning_rate": 1.9725592958826102e-06, + "loss": 0.0421, + "step": 6630 + }, + { + "epoch": 2.9451476793248945, + "grad_norm": 0.3984417865901208, + "learning_rate": 1.9710167305984607e-06, + "loss": 0.0375, + "step": 6631 + }, + { + "epoch": 2.945591827670442, + "grad_norm": 0.30941970478869846, + "learning_rate": 1.9694746206219477e-06, + "loss": 0.0216, + "step": 6632 + }, + { + "epoch": 2.946035976015989, + "grad_norm": 0.4229304742479391, + "learning_rate": 1.9679329661848795e-06, + "loss": 0.034, + "step": 6633 + }, + { + "epoch": 2.946480124361537, + "grad_norm": 0.476591153079035, + "learning_rate": 1.966391767518992e-06, + "loss": 0.0516, + "step": 6634 + }, + { + "epoch": 2.946924272707084, + "grad_norm": 0.6821582454050819, + "learning_rate": 1.9648510248559546e-06, + "loss": 0.034, + "step": 6635 + }, + { + "epoch": 2.9473684210526314, + "grad_norm": 0.43772389004233875, + "learning_rate": 1.9633107384273668e-06, + "loss": 0.0296, + "step": 6636 + }, + { + "epoch": 2.947812569398179, + "grad_norm": 0.49311057599048846, + "learning_rate": 1.9617709084647584e-06, + "loss": 0.0471, + "step": 6637 + }, + { + "epoch": 2.9482567177437264, + "grad_norm": 0.48536816799978794, + "learning_rate": 1.9602315351995928e-06, + "loss": 0.0393, + "step": 6638 + }, + { + "epoch": 2.9487008660892737, + "grad_norm": 0.5141630455505187, + "learning_rate": 1.958692618863264e-06, + "loss": 0.0342, + "step": 6639 + }, + { + "epoch": 2.949145014434821, + "grad_norm": 0.4586328384368867, + "learning_rate": 1.9571541596870974e-06, + "loss": 0.0391, + "step": 6640 + }, + { + "epoch": 2.9495891627803688, + "grad_norm": 0.4178954318106239, + "learning_rate": 1.9556161579023493e-06, + "loss": 0.0315, + "step": 6641 + }, + { + "epoch": 2.950033311125916, + "grad_norm": 0.4366977616184424, + "learning_rate": 1.9540786137402097e-06, + "loss": 0.0325, + "step": 6642 + }, + { + "epoch": 2.9504774594714633, + "grad_norm": 0.35044020018653177, + "learning_rate": 1.952541527431794e-06, + "loss": 0.0238, + "step": 6643 + }, + { + "epoch": 2.950921607817011, + "grad_norm": 0.36474391941516165, + "learning_rate": 1.951004899208154e-06, + "loss": 0.0284, + "step": 6644 + }, + { + "epoch": 2.9513657561625584, + "grad_norm": 0.3808970857069157, + "learning_rate": 1.9494687293002724e-06, + "loss": 0.0262, + "step": 6645 + }, + { + "epoch": 2.9518099045081057, + "grad_norm": 0.49087581676544356, + "learning_rate": 1.947933017939057e-06, + "loss": 0.0334, + "step": 6646 + }, + { + "epoch": 2.952254052853653, + "grad_norm": 0.4067523864649502, + "learning_rate": 1.946397765355356e-06, + "loss": 0.0224, + "step": 6647 + }, + { + "epoch": 2.9526982011992007, + "grad_norm": 0.5385094196478863, + "learning_rate": 1.9448629717799444e-06, + "loss": 0.0274, + "step": 6648 + }, + { + "epoch": 2.953142349544748, + "grad_norm": 0.4534794423284923, + "learning_rate": 1.9433286374435243e-06, + "loss": 0.0307, + "step": 6649 + }, + { + "epoch": 2.9535864978902953, + "grad_norm": 0.42491711765749635, + "learning_rate": 1.9417947625767338e-06, + "loss": 0.0282, + "step": 6650 + }, + { + "epoch": 2.954030646235843, + "grad_norm": 0.43191372654900584, + "learning_rate": 1.9402613474101418e-06, + "loss": 0.0284, + "step": 6651 + }, + { + "epoch": 2.9544747945813903, + "grad_norm": 0.4659070261041793, + "learning_rate": 1.9387283921742417e-06, + "loss": 0.0348, + "step": 6652 + }, + { + "epoch": 2.9549189429269376, + "grad_norm": 0.3521658502328422, + "learning_rate": 1.9371958970994697e-06, + "loss": 0.0264, + "step": 6653 + }, + { + "epoch": 2.955363091272485, + "grad_norm": 0.3744257822368085, + "learning_rate": 1.935663862416181e-06, + "loss": 0.0233, + "step": 6654 + }, + { + "epoch": 2.955807239618032, + "grad_norm": 0.3533483955176666, + "learning_rate": 1.934132288354667e-06, + "loss": 0.0223, + "step": 6655 + }, + { + "epoch": 2.95625138796358, + "grad_norm": 0.48990756810689967, + "learning_rate": 1.9326011751451523e-06, + "loss": 0.0296, + "step": 6656 + }, + { + "epoch": 2.956695536309127, + "grad_norm": 0.4155057113271156, + "learning_rate": 1.9310705230177834e-06, + "loss": 0.032, + "step": 6657 + }, + { + "epoch": 2.957139684654675, + "grad_norm": 0.46467269039209746, + "learning_rate": 1.9295403322026485e-06, + "loss": 0.0302, + "step": 6658 + }, + { + "epoch": 2.957583833000222, + "grad_norm": 0.436904391432258, + "learning_rate": 1.928010602929762e-06, + "loss": 0.0249, + "step": 6659 + }, + { + "epoch": 2.9580279813457695, + "grad_norm": 0.4078298528030376, + "learning_rate": 1.9264813354290635e-06, + "loss": 0.0339, + "step": 6660 + }, + { + "epoch": 2.958472129691317, + "grad_norm": 0.43921352541689956, + "learning_rate": 1.92495252993043e-06, + "loss": 0.0293, + "step": 6661 + }, + { + "epoch": 2.958916278036864, + "grad_norm": 0.47221505355593735, + "learning_rate": 1.9234241866636693e-06, + "loss": 0.0267, + "step": 6662 + }, + { + "epoch": 2.959360426382412, + "grad_norm": 0.3695229541487737, + "learning_rate": 1.9218963058585117e-06, + "loss": 0.0273, + "step": 6663 + }, + { + "epoch": 2.959804574727959, + "grad_norm": 0.3129170332680405, + "learning_rate": 1.9203688877446285e-06, + "loss": 0.0177, + "step": 6664 + }, + { + "epoch": 2.9602487230735064, + "grad_norm": 0.3892558088874507, + "learning_rate": 1.9188419325516177e-06, + "loss": 0.0381, + "step": 6665 + }, + { + "epoch": 2.960692871419054, + "grad_norm": 0.3716168146474431, + "learning_rate": 1.9173154405090024e-06, + "loss": 0.0222, + "step": 6666 + }, + { + "epoch": 2.9611370197646014, + "grad_norm": 0.4147701146563668, + "learning_rate": 1.9157894118462416e-06, + "loss": 0.0307, + "step": 6667 + }, + { + "epoch": 2.9615811681101487, + "grad_norm": 0.39499526563472726, + "learning_rate": 1.9142638467927254e-06, + "loss": 0.0365, + "step": 6668 + }, + { + "epoch": 2.962025316455696, + "grad_norm": 0.3874596806249992, + "learning_rate": 1.9127387455777673e-06, + "loss": 0.0321, + "step": 6669 + }, + { + "epoch": 2.9624694648012437, + "grad_norm": 0.42027304319891273, + "learning_rate": 1.911214108430623e-06, + "loss": 0.033, + "step": 6670 + }, + { + "epoch": 2.962913613146791, + "grad_norm": 0.44739921526779486, + "learning_rate": 1.9096899355804655e-06, + "loss": 0.0275, + "step": 6671 + }, + { + "epoch": 2.9633577614923383, + "grad_norm": 0.37802801428083804, + "learning_rate": 1.9081662272564055e-06, + "loss": 0.0319, + "step": 6672 + }, + { + "epoch": 2.963801909837886, + "grad_norm": 0.38442939573511425, + "learning_rate": 1.9066429836874844e-06, + "loss": 0.0295, + "step": 6673 + }, + { + "epoch": 2.9642460581834333, + "grad_norm": 0.6866561926311151, + "learning_rate": 1.9051202051026669e-06, + "loss": 0.0381, + "step": 6674 + }, + { + "epoch": 2.9646902065289806, + "grad_norm": 0.39471908589991933, + "learning_rate": 1.9035978917308568e-06, + "loss": 0.0282, + "step": 6675 + }, + { + "epoch": 2.965134354874528, + "grad_norm": 0.3683316118105389, + "learning_rate": 1.902076043800884e-06, + "loss": 0.0237, + "step": 6676 + }, + { + "epoch": 2.9655785032200757, + "grad_norm": 0.37626637800602947, + "learning_rate": 1.9005546615415044e-06, + "loss": 0.0241, + "step": 6677 + }, + { + "epoch": 2.966022651565623, + "grad_norm": 0.3496930484588171, + "learning_rate": 1.8990337451814095e-06, + "loss": 0.0286, + "step": 6678 + }, + { + "epoch": 2.9664667999111702, + "grad_norm": 0.41344810193656406, + "learning_rate": 1.897513294949221e-06, + "loss": 0.0273, + "step": 6679 + }, + { + "epoch": 2.966910948256718, + "grad_norm": 0.378120591478157, + "learning_rate": 1.895993311073483e-06, + "loss": 0.0294, + "step": 6680 + }, + { + "epoch": 2.9673550966022653, + "grad_norm": 0.33697535541119333, + "learning_rate": 1.8944737937826813e-06, + "loss": 0.0273, + "step": 6681 + }, + { + "epoch": 2.9677992449478126, + "grad_norm": 0.3663439761387446, + "learning_rate": 1.8929547433052202e-06, + "loss": 0.032, + "step": 6682 + }, + { + "epoch": 2.96824339329336, + "grad_norm": 0.5047952130541636, + "learning_rate": 1.8914361598694408e-06, + "loss": 0.0339, + "step": 6683 + }, + { + "epoch": 2.968687541638907, + "grad_norm": 0.4039062494421022, + "learning_rate": 1.8899180437036119e-06, + "loss": 0.0375, + "step": 6684 + }, + { + "epoch": 2.969131689984455, + "grad_norm": 0.3025200589569862, + "learning_rate": 1.8884003950359337e-06, + "loss": 0.0262, + "step": 6685 + }, + { + "epoch": 2.969575838330002, + "grad_norm": 0.44229043636076315, + "learning_rate": 1.8868832140945297e-06, + "loss": 0.028, + "step": 6686 + }, + { + "epoch": 2.97001998667555, + "grad_norm": 0.4708612100607534, + "learning_rate": 1.8853665011074645e-06, + "loss": 0.027, + "step": 6687 + }, + { + "epoch": 2.970464135021097, + "grad_norm": 0.3722598107484293, + "learning_rate": 1.8838502563027212e-06, + "loss": 0.0253, + "step": 6688 + }, + { + "epoch": 2.9709082833666445, + "grad_norm": 0.6036467003416066, + "learning_rate": 1.8823344799082177e-06, + "loss": 0.0284, + "step": 6689 + }, + { + "epoch": 2.9713524317121918, + "grad_norm": 0.34918427015083503, + "learning_rate": 1.8808191721518043e-06, + "loss": 0.0294, + "step": 6690 + }, + { + "epoch": 2.971796580057739, + "grad_norm": 0.42849383719138034, + "learning_rate": 1.879304333261251e-06, + "loss": 0.0278, + "step": 6691 + }, + { + "epoch": 2.972240728403287, + "grad_norm": 0.48072735895909585, + "learning_rate": 1.87778996346427e-06, + "loss": 0.0417, + "step": 6692 + }, + { + "epoch": 2.972684876748834, + "grad_norm": 0.29126197319559194, + "learning_rate": 1.8762760629884958e-06, + "loss": 0.0243, + "step": 6693 + }, + { + "epoch": 2.9731290250943814, + "grad_norm": 0.3610203458255818, + "learning_rate": 1.8747626320614904e-06, + "loss": 0.0269, + "step": 6694 + }, + { + "epoch": 2.973573173439929, + "grad_norm": 0.4145287000272817, + "learning_rate": 1.87324967091075e-06, + "loss": 0.0461, + "step": 6695 + }, + { + "epoch": 2.9740173217854764, + "grad_norm": 0.3579965634087584, + "learning_rate": 1.8717371797637002e-06, + "loss": 0.0296, + "step": 6696 + }, + { + "epoch": 2.9744614701310237, + "grad_norm": 0.41883841344207373, + "learning_rate": 1.8702251588476889e-06, + "loss": 0.0314, + "step": 6697 + }, + { + "epoch": 2.974905618476571, + "grad_norm": 0.5313573343030277, + "learning_rate": 1.868713608390005e-06, + "loss": 0.0408, + "step": 6698 + }, + { + "epoch": 2.9753497668221187, + "grad_norm": 0.39733118203820383, + "learning_rate": 1.8672025286178546e-06, + "loss": 0.0357, + "step": 6699 + }, + { + "epoch": 2.975793915167666, + "grad_norm": 0.4305571327880009, + "learning_rate": 1.8656919197583816e-06, + "loss": 0.0287, + "step": 6700 + }, + { + "epoch": 2.9762380635132133, + "grad_norm": 0.38007221498778115, + "learning_rate": 1.8641817820386576e-06, + "loss": 0.0256, + "step": 6701 + }, + { + "epoch": 2.976682211858761, + "grad_norm": 0.35438367987841246, + "learning_rate": 1.862672115685678e-06, + "loss": 0.0318, + "step": 6702 + }, + { + "epoch": 2.9771263602043083, + "grad_norm": 0.3521742278005089, + "learning_rate": 1.861162920926372e-06, + "loss": 0.0246, + "step": 6703 + }, + { + "epoch": 2.9775705085498556, + "grad_norm": 0.4587000972835763, + "learning_rate": 1.8596541979876016e-06, + "loss": 0.0403, + "step": 6704 + }, + { + "epoch": 2.978014656895403, + "grad_norm": 0.4391686978547565, + "learning_rate": 1.8581459470961488e-06, + "loss": 0.033, + "step": 6705 + }, + { + "epoch": 2.9784588052409506, + "grad_norm": 0.8681641943800276, + "learning_rate": 1.856638168478731e-06, + "loss": 0.0271, + "step": 6706 + }, + { + "epoch": 2.978902953586498, + "grad_norm": 0.3412875403024399, + "learning_rate": 1.8551308623619945e-06, + "loss": 0.0278, + "step": 6707 + }, + { + "epoch": 2.979347101932045, + "grad_norm": 0.39261569640506333, + "learning_rate": 1.8536240289725078e-06, + "loss": 0.035, + "step": 6708 + }, + { + "epoch": 2.979791250277593, + "grad_norm": 0.3999387719956585, + "learning_rate": 1.8521176685367804e-06, + "loss": 0.0287, + "step": 6709 + }, + { + "epoch": 2.9802353986231402, + "grad_norm": 0.3873753971954603, + "learning_rate": 1.850611781281239e-06, + "loss": 0.0322, + "step": 6710 + }, + { + "epoch": 2.9806795469686875, + "grad_norm": 0.4380866274093617, + "learning_rate": 1.8491063674322457e-06, + "loss": 0.033, + "step": 6711 + }, + { + "epoch": 2.981123695314235, + "grad_norm": 0.4171386719486352, + "learning_rate": 1.8476014272160896e-06, + "loss": 0.0354, + "step": 6712 + }, + { + "epoch": 2.981567843659782, + "grad_norm": 0.37606765896917305, + "learning_rate": 1.8460969608589913e-06, + "loss": 0.0309, + "step": 6713 + }, + { + "epoch": 2.98201199200533, + "grad_norm": 0.33316278037626945, + "learning_rate": 1.8445929685870912e-06, + "loss": 0.0281, + "step": 6714 + }, + { + "epoch": 2.982456140350877, + "grad_norm": 0.4295124088150545, + "learning_rate": 1.8430894506264724e-06, + "loss": 0.0281, + "step": 6715 + }, + { + "epoch": 2.982900288696425, + "grad_norm": 0.6219155251106016, + "learning_rate": 1.8415864072031335e-06, + "loss": 0.0412, + "step": 6716 + }, + { + "epoch": 2.983344437041972, + "grad_norm": 1.0649791746325505, + "learning_rate": 1.8400838385430104e-06, + "loss": 0.0296, + "step": 6717 + }, + { + "epoch": 2.9837885853875195, + "grad_norm": 0.4321630566554828, + "learning_rate": 1.838581744871965e-06, + "loss": 0.0349, + "step": 6718 + }, + { + "epoch": 2.9842327337330667, + "grad_norm": 0.3860040470778094, + "learning_rate": 1.8370801264157857e-06, + "loss": 0.029, + "step": 6719 + }, + { + "epoch": 2.984676882078614, + "grad_norm": 0.38156983645011616, + "learning_rate": 1.8355789834001898e-06, + "loss": 0.0284, + "step": 6720 + }, + { + "epoch": 2.9851210304241618, + "grad_norm": 0.5127281233435997, + "learning_rate": 1.8340783160508297e-06, + "loss": 0.0355, + "step": 6721 + }, + { + "epoch": 2.985565178769709, + "grad_norm": 0.3767235771133534, + "learning_rate": 1.8325781245932772e-06, + "loss": 0.0297, + "step": 6722 + }, + { + "epoch": 2.9860093271152564, + "grad_norm": 0.4559507813085918, + "learning_rate": 1.8310784092530376e-06, + "loss": 0.0362, + "step": 6723 + }, + { + "epoch": 2.986453475460804, + "grad_norm": 0.36189720669347597, + "learning_rate": 1.8295791702555455e-06, + "loss": 0.0294, + "step": 6724 + }, + { + "epoch": 2.9868976238063514, + "grad_norm": 0.3457736093542848, + "learning_rate": 1.8280804078261577e-06, + "loss": 0.0305, + "step": 6725 + }, + { + "epoch": 2.9873417721518987, + "grad_norm": 0.5911976787943448, + "learning_rate": 1.826582122190167e-06, + "loss": 0.0368, + "step": 6726 + }, + { + "epoch": 2.987785920497446, + "grad_norm": 0.4315279094342813, + "learning_rate": 1.8250843135727898e-06, + "loss": 0.0365, + "step": 6727 + }, + { + "epoch": 2.9882300688429937, + "grad_norm": 0.3782115314248128, + "learning_rate": 1.8235869821991726e-06, + "loss": 0.0297, + "step": 6728 + }, + { + "epoch": 2.988674217188541, + "grad_norm": 0.42412903793940915, + "learning_rate": 1.8220901282943915e-06, + "loss": 0.0348, + "step": 6729 + }, + { + "epoch": 2.9891183655340883, + "grad_norm": 0.42286509801094985, + "learning_rate": 1.820593752083446e-06, + "loss": 0.0356, + "step": 6730 + }, + { + "epoch": 2.989562513879636, + "grad_norm": 0.48807857838384455, + "learning_rate": 1.8190978537912662e-06, + "loss": 0.0441, + "step": 6731 + }, + { + "epoch": 2.9900066622251833, + "grad_norm": 0.4660013011382873, + "learning_rate": 1.8176024336427167e-06, + "loss": 0.0356, + "step": 6732 + }, + { + "epoch": 2.9904508105707306, + "grad_norm": 0.3894973608346122, + "learning_rate": 1.8161074918625792e-06, + "loss": 0.0234, + "step": 6733 + }, + { + "epoch": 2.990894958916278, + "grad_norm": 0.43148032392899044, + "learning_rate": 1.8146130286755704e-06, + "loss": 0.0316, + "step": 6734 + }, + { + "epoch": 2.9913391072618256, + "grad_norm": 0.4541609973903636, + "learning_rate": 1.8131190443063357e-06, + "loss": 0.0228, + "step": 6735 + }, + { + "epoch": 2.991783255607373, + "grad_norm": 0.37931640734503375, + "learning_rate": 1.8116255389794418e-06, + "loss": 0.03, + "step": 6736 + }, + { + "epoch": 2.99222740395292, + "grad_norm": 0.38246202047737776, + "learning_rate": 1.8101325129193897e-06, + "loss": 0.0432, + "step": 6737 + }, + { + "epoch": 2.992671552298468, + "grad_norm": 0.42501044694023266, + "learning_rate": 1.8086399663506099e-06, + "loss": 0.0378, + "step": 6738 + }, + { + "epoch": 2.993115700644015, + "grad_norm": 0.4349574825819462, + "learning_rate": 1.8071478994974534e-06, + "loss": 0.022, + "step": 6739 + }, + { + "epoch": 2.9935598489895625, + "grad_norm": 0.36182919079127596, + "learning_rate": 1.8056563125842046e-06, + "loss": 0.0281, + "step": 6740 + }, + { + "epoch": 2.99400399733511, + "grad_norm": 0.3297563936891292, + "learning_rate": 1.8041652058350768e-06, + "loss": 0.0258, + "step": 6741 + }, + { + "epoch": 2.994448145680657, + "grad_norm": 0.7003299863726014, + "learning_rate": 1.802674579474204e-06, + "loss": 0.0351, + "step": 6742 + }, + { + "epoch": 2.994892294026205, + "grad_norm": 0.6139326715543403, + "learning_rate": 1.801184433725655e-06, + "loss": 0.0379, + "step": 6743 + }, + { + "epoch": 2.995336442371752, + "grad_norm": 0.5062057494487118, + "learning_rate": 1.7996947688134241e-06, + "loss": 0.0315, + "step": 6744 + }, + { + "epoch": 2.9957805907173, + "grad_norm": 0.7604714501213958, + "learning_rate": 1.7982055849614327e-06, + "loss": 0.0435, + "step": 6745 + }, + { + "epoch": 2.996224739062847, + "grad_norm": 0.4500952174598657, + "learning_rate": 1.7967168823935333e-06, + "loss": 0.0364, + "step": 6746 + }, + { + "epoch": 2.9966688874083944, + "grad_norm": 0.6302920187503901, + "learning_rate": 1.7952286613334986e-06, + "loss": 0.0319, + "step": 6747 + }, + { + "epoch": 2.9971130357539417, + "grad_norm": 0.36539112186296896, + "learning_rate": 1.793740922005034e-06, + "loss": 0.0308, + "step": 6748 + }, + { + "epoch": 2.997557184099489, + "grad_norm": 0.43381962390876766, + "learning_rate": 1.7922536646317767e-06, + "loss": 0.0377, + "step": 6749 + }, + { + "epoch": 2.9980013324450367, + "grad_norm": 0.33856021759003035, + "learning_rate": 1.7907668894372826e-06, + "loss": 0.0239, + "step": 6750 + }, + { + "epoch": 2.998445480790584, + "grad_norm": 0.37584653604138824, + "learning_rate": 1.78928059664504e-06, + "loss": 0.024, + "step": 6751 + }, + { + "epoch": 2.9988896291361313, + "grad_norm": 0.361912705997716, + "learning_rate": 1.7877947864784662e-06, + "loss": 0.0261, + "step": 6752 + }, + { + "epoch": 2.999333777481679, + "grad_norm": 0.35751593403707377, + "learning_rate": 1.7863094591609003e-06, + "loss": 0.0254, + "step": 6753 + }, + { + "epoch": 2.9997779258272264, + "grad_norm": 0.4724255587714232, + "learning_rate": 1.7848246149156134e-06, + "loss": 0.0421, + "step": 6754 + }, + { + "epoch": 2.9997779258272264, + "eval_loss": 0.03672339767217636, + "eval_runtime": 403.6019, + "eval_samples_per_second": 37.577, + "eval_steps_per_second": 1.174, + "step": 6754 + }, + { + "epoch": 3.0002220741727736, + "grad_norm": 0.3269520644923097, + "learning_rate": 1.783340253965803e-06, + "loss": 0.0215, + "step": 6755 + }, + { + "epoch": 3.000666222518321, + "grad_norm": 0.3360624322653449, + "learning_rate": 1.7818563765345942e-06, + "loss": 0.0284, + "step": 6756 + }, + { + "epoch": 3.0011103708638687, + "grad_norm": 0.3740872446254905, + "learning_rate": 1.7803729828450405e-06, + "loss": 0.0207, + "step": 6757 + }, + { + "epoch": 3.001554519209416, + "grad_norm": 0.29900746203751005, + "learning_rate": 1.7788900731201174e-06, + "loss": 0.0206, + "step": 6758 + }, + { + "epoch": 3.0019986675549633, + "grad_norm": 0.4119590022541271, + "learning_rate": 1.7774076475827335e-06, + "loss": 0.0239, + "step": 6759 + }, + { + "epoch": 3.002442815900511, + "grad_norm": 0.4073542383308494, + "learning_rate": 1.7759257064557229e-06, + "loss": 0.022, + "step": 6760 + }, + { + "epoch": 3.0028869642460583, + "grad_norm": 0.6159273406264103, + "learning_rate": 1.7744442499618453e-06, + "loss": 0.032, + "step": 6761 + }, + { + "epoch": 3.0033311125916056, + "grad_norm": 0.8514717794851285, + "learning_rate": 1.77296327832379e-06, + "loss": 0.0421, + "step": 6762 + }, + { + "epoch": 3.003775260937153, + "grad_norm": 0.3400576483770952, + "learning_rate": 1.7714827917641737e-06, + "loss": 0.0209, + "step": 6763 + }, + { + "epoch": 3.0042194092827006, + "grad_norm": 0.38696151832577724, + "learning_rate": 1.7700027905055344e-06, + "loss": 0.0232, + "step": 6764 + }, + { + "epoch": 3.004663557628248, + "grad_norm": 0.40863063034907293, + "learning_rate": 1.7685232747703424e-06, + "loss": 0.0246, + "step": 6765 + }, + { + "epoch": 3.005107705973795, + "grad_norm": 0.39214643564722806, + "learning_rate": 1.7670442447809989e-06, + "loss": 0.0283, + "step": 6766 + }, + { + "epoch": 3.0055518543193425, + "grad_norm": 0.5937092447872463, + "learning_rate": 1.7655657007598216e-06, + "loss": 0.0269, + "step": 6767 + }, + { + "epoch": 3.00599600266489, + "grad_norm": 0.43120240703534374, + "learning_rate": 1.7640876429290633e-06, + "loss": 0.0238, + "step": 6768 + }, + { + "epoch": 3.0064401510104375, + "grad_norm": 0.405113495539343, + "learning_rate": 1.7626100715109018e-06, + "loss": 0.0206, + "step": 6769 + }, + { + "epoch": 3.006884299355985, + "grad_norm": 0.35078016785761906, + "learning_rate": 1.761132986727439e-06, + "loss": 0.0207, + "step": 6770 + }, + { + "epoch": 3.0073284477015325, + "grad_norm": 0.36832682307040443, + "learning_rate": 1.7596563888007073e-06, + "loss": 0.0246, + "step": 6771 + }, + { + "epoch": 3.00777259604708, + "grad_norm": 0.38464153090067194, + "learning_rate": 1.7581802779526642e-06, + "loss": 0.0187, + "step": 6772 + }, + { + "epoch": 3.008216744392627, + "grad_norm": 0.44328209442836686, + "learning_rate": 1.7567046544051935e-06, + "loss": 0.0254, + "step": 6773 + }, + { + "epoch": 3.0086608927381744, + "grad_norm": 0.4311223508530591, + "learning_rate": 1.7552295183801093e-06, + "loss": 0.0282, + "step": 6774 + }, + { + "epoch": 3.009105041083722, + "grad_norm": 0.4187784150098612, + "learning_rate": 1.7537548700991463e-06, + "loss": 0.026, + "step": 6775 + }, + { + "epoch": 3.0095491894292694, + "grad_norm": 0.49752308362720904, + "learning_rate": 1.75228070978397e-06, + "loss": 0.0333, + "step": 6776 + }, + { + "epoch": 3.0099933377748167, + "grad_norm": 0.3661955272882175, + "learning_rate": 1.750807037656172e-06, + "loss": 0.0263, + "step": 6777 + }, + { + "epoch": 3.010437486120364, + "grad_norm": 0.3951032549450352, + "learning_rate": 1.7493338539372701e-06, + "loss": 0.0245, + "step": 6778 + }, + { + "epoch": 3.0108816344659117, + "grad_norm": 0.4288937494263082, + "learning_rate": 1.7478611588487098e-06, + "loss": 0.0224, + "step": 6779 + }, + { + "epoch": 3.011325782811459, + "grad_norm": 1.0043569602354414, + "learning_rate": 1.7463889526118628e-06, + "loss": 0.0321, + "step": 6780 + }, + { + "epoch": 3.0117699311570063, + "grad_norm": 0.345117052755733, + "learning_rate": 1.7449172354480236e-06, + "loss": 0.0222, + "step": 6781 + }, + { + "epoch": 3.012214079502554, + "grad_norm": 0.37263032859522277, + "learning_rate": 1.7434460075784183e-06, + "loss": 0.0371, + "step": 6782 + }, + { + "epoch": 3.0126582278481013, + "grad_norm": 0.42797870753405914, + "learning_rate": 1.741975269224197e-06, + "loss": 0.0275, + "step": 6783 + }, + { + "epoch": 3.0131023761936486, + "grad_norm": 0.4487558433311281, + "learning_rate": 1.7405050206064372e-06, + "loss": 0.0278, + "step": 6784 + }, + { + "epoch": 3.013546524539196, + "grad_norm": 0.3104547304379723, + "learning_rate": 1.739035261946142e-06, + "loss": 0.0162, + "step": 6785 + }, + { + "epoch": 3.0139906728847436, + "grad_norm": 0.4274226742898415, + "learning_rate": 1.7375659934642425e-06, + "loss": 0.0232, + "step": 6786 + }, + { + "epoch": 3.014434821230291, + "grad_norm": 0.4378508355204737, + "learning_rate": 1.7360972153815919e-06, + "loss": 0.0263, + "step": 6787 + }, + { + "epoch": 3.0148789695758382, + "grad_norm": 0.35914322608698723, + "learning_rate": 1.7346289279189732e-06, + "loss": 0.0236, + "step": 6788 + }, + { + "epoch": 3.015323117921386, + "grad_norm": 0.3717761462102794, + "learning_rate": 1.7331611312970965e-06, + "loss": 0.0223, + "step": 6789 + }, + { + "epoch": 3.0157672662669333, + "grad_norm": 0.49078555792657397, + "learning_rate": 1.7316938257365945e-06, + "loss": 0.0479, + "step": 6790 + }, + { + "epoch": 3.0162114146124805, + "grad_norm": 0.5303389635543914, + "learning_rate": 1.7302270114580316e-06, + "loss": 0.0257, + "step": 6791 + }, + { + "epoch": 3.016655562958028, + "grad_norm": 0.5963610689982617, + "learning_rate": 1.7287606886818914e-06, + "loss": 0.025, + "step": 6792 + }, + { + "epoch": 3.0170997113035756, + "grad_norm": 0.3358156185785528, + "learning_rate": 1.7272948576285874e-06, + "loss": 0.0259, + "step": 6793 + }, + { + "epoch": 3.017543859649123, + "grad_norm": 0.44451083436086486, + "learning_rate": 1.7258295185184604e-06, + "loss": 0.0371, + "step": 6794 + }, + { + "epoch": 3.01798800799467, + "grad_norm": 0.8409978484969116, + "learning_rate": 1.7243646715717754e-06, + "loss": 0.0198, + "step": 6795 + }, + { + "epoch": 3.0184321563402174, + "grad_norm": 0.40598942285053524, + "learning_rate": 1.7229003170087232e-06, + "loss": 0.0253, + "step": 6796 + }, + { + "epoch": 3.018876304685765, + "grad_norm": 0.40045735020677964, + "learning_rate": 1.7214364550494235e-06, + "loss": 0.0305, + "step": 6797 + }, + { + "epoch": 3.0193204530313125, + "grad_norm": 0.4205212377847487, + "learning_rate": 1.7199730859139157e-06, + "loss": 0.0233, + "step": 6798 + }, + { + "epoch": 3.0197646013768598, + "grad_norm": 0.39200127688818526, + "learning_rate": 1.7185102098221713e-06, + "loss": 0.0225, + "step": 6799 + }, + { + "epoch": 3.0202087497224075, + "grad_norm": 0.4741877419032876, + "learning_rate": 1.717047826994085e-06, + "loss": 0.0262, + "step": 6800 + }, + { + "epoch": 3.020652898067955, + "grad_norm": 0.40147810647504373, + "learning_rate": 1.7155859376494776e-06, + "loss": 0.0267, + "step": 6801 + }, + { + "epoch": 3.021097046413502, + "grad_norm": 0.4280335792767891, + "learning_rate": 1.7141245420080982e-06, + "loss": 0.0242, + "step": 6802 + }, + { + "epoch": 3.0215411947590494, + "grad_norm": 0.34267631232424817, + "learning_rate": 1.7126636402896158e-06, + "loss": 0.0228, + "step": 6803 + }, + { + "epoch": 3.021985343104597, + "grad_norm": 0.517542942077693, + "learning_rate": 1.7112032327136296e-06, + "loss": 0.0289, + "step": 6804 + }, + { + "epoch": 3.0224294914501444, + "grad_norm": 0.48842613500647986, + "learning_rate": 1.7097433194996654e-06, + "loss": 0.0362, + "step": 6805 + }, + { + "epoch": 3.0228736397956917, + "grad_norm": 0.5122441524272605, + "learning_rate": 1.7082839008671714e-06, + "loss": 0.0327, + "step": 6806 + }, + { + "epoch": 3.023317788141239, + "grad_norm": 0.3656414000536619, + "learning_rate": 1.706824977035524e-06, + "loss": 0.0239, + "step": 6807 + }, + { + "epoch": 3.0237619364867867, + "grad_norm": 0.35844316810361615, + "learning_rate": 1.705366548224025e-06, + "loss": 0.022, + "step": 6808 + }, + { + "epoch": 3.024206084832334, + "grad_norm": 0.5208246273651607, + "learning_rate": 1.7039086146518986e-06, + "loss": 0.0396, + "step": 6809 + }, + { + "epoch": 3.0246502331778813, + "grad_norm": 0.35888427376762055, + "learning_rate": 1.7024511765382978e-06, + "loss": 0.0218, + "step": 6810 + }, + { + "epoch": 3.025094381523429, + "grad_norm": 0.4996038911163621, + "learning_rate": 1.7009942341023012e-06, + "loss": 0.031, + "step": 6811 + }, + { + "epoch": 3.0255385298689763, + "grad_norm": 0.4715140989011031, + "learning_rate": 1.699537787562911e-06, + "loss": 0.027, + "step": 6812 + }, + { + "epoch": 3.0259826782145236, + "grad_norm": 0.4700851815071929, + "learning_rate": 1.6980818371390567e-06, + "loss": 0.0352, + "step": 6813 + }, + { + "epoch": 3.026426826560071, + "grad_norm": 0.42536546436391975, + "learning_rate": 1.6966263830495939e-06, + "loss": 0.0307, + "step": 6814 + }, + { + "epoch": 3.0268709749056186, + "grad_norm": 0.3928810675523391, + "learning_rate": 1.6951714255132985e-06, + "loss": 0.0317, + "step": 6815 + }, + { + "epoch": 3.027315123251166, + "grad_norm": 0.36328069775398497, + "learning_rate": 1.6937169647488765e-06, + "loss": 0.0225, + "step": 6816 + }, + { + "epoch": 3.027759271596713, + "grad_norm": 0.4176283792585422, + "learning_rate": 1.6922630009749592e-06, + "loss": 0.0335, + "step": 6817 + }, + { + "epoch": 3.0282034199422605, + "grad_norm": 0.45941453677809496, + "learning_rate": 1.6908095344101016e-06, + "loss": 0.0307, + "step": 6818 + }, + { + "epoch": 3.0286475682878082, + "grad_norm": 0.4022633883354449, + "learning_rate": 1.6893565652727857e-06, + "loss": 0.0285, + "step": 6819 + }, + { + "epoch": 3.0290917166333555, + "grad_norm": 0.46606194138773704, + "learning_rate": 1.687904093781414e-06, + "loss": 0.0346, + "step": 6820 + }, + { + "epoch": 3.029535864978903, + "grad_norm": 0.3781526947765934, + "learning_rate": 1.68645212015432e-06, + "loss": 0.0217, + "step": 6821 + }, + { + "epoch": 3.0299800133244505, + "grad_norm": 0.40815913991276614, + "learning_rate": 1.68500064460976e-06, + "loss": 0.0303, + "step": 6822 + }, + { + "epoch": 3.030424161669998, + "grad_norm": 0.42153363035695735, + "learning_rate": 1.6835496673659145e-06, + "loss": 0.0329, + "step": 6823 + }, + { + "epoch": 3.030868310015545, + "grad_norm": 0.5257269502253524, + "learning_rate": 1.6820991886408911e-06, + "loss": 0.0342, + "step": 6824 + }, + { + "epoch": 3.0313124583610924, + "grad_norm": 0.36375569874164343, + "learning_rate": 1.6806492086527226e-06, + "loss": 0.0268, + "step": 6825 + }, + { + "epoch": 3.03175660670664, + "grad_norm": 0.4517518524381332, + "learning_rate": 1.6791997276193623e-06, + "loss": 0.0291, + "step": 6826 + }, + { + "epoch": 3.0322007550521874, + "grad_norm": 0.41358501413398874, + "learning_rate": 1.6777507457586933e-06, + "loss": 0.0263, + "step": 6827 + }, + { + "epoch": 3.0326449033977347, + "grad_norm": 0.4371773593425646, + "learning_rate": 1.6763022632885223e-06, + "loss": 0.026, + "step": 6828 + }, + { + "epoch": 3.0330890517432825, + "grad_norm": 0.41878585193753987, + "learning_rate": 1.674854280426581e-06, + "loss": 0.0289, + "step": 6829 + }, + { + "epoch": 3.0335332000888298, + "grad_norm": 0.6072899498645479, + "learning_rate": 1.6734067973905272e-06, + "loss": 0.046, + "step": 6830 + }, + { + "epoch": 3.033977348434377, + "grad_norm": 0.37768879943900463, + "learning_rate": 1.6719598143979392e-06, + "loss": 0.0175, + "step": 6831 + }, + { + "epoch": 3.0344214967799243, + "grad_norm": 0.42243370237101113, + "learning_rate": 1.6705133316663247e-06, + "loss": 0.0318, + "step": 6832 + }, + { + "epoch": 3.034865645125472, + "grad_norm": 0.4272111914281022, + "learning_rate": 1.6690673494131143e-06, + "loss": 0.0318, + "step": 6833 + }, + { + "epoch": 3.0353097934710194, + "grad_norm": 0.555542458187917, + "learning_rate": 1.6676218678556637e-06, + "loss": 0.0333, + "step": 6834 + }, + { + "epoch": 3.0357539418165667, + "grad_norm": 0.5174806483008364, + "learning_rate": 1.6661768872112544e-06, + "loss": 0.0257, + "step": 6835 + }, + { + "epoch": 3.036198090162114, + "grad_norm": 0.40261446781293897, + "learning_rate": 1.6647324076970917e-06, + "loss": 0.0271, + "step": 6836 + }, + { + "epoch": 3.0366422385076617, + "grad_norm": 0.28515858656757614, + "learning_rate": 1.663288429530303e-06, + "loss": 0.0139, + "step": 6837 + }, + { + "epoch": 3.037086386853209, + "grad_norm": 0.3801745026945722, + "learning_rate": 1.661844952927944e-06, + "loss": 0.0278, + "step": 6838 + }, + { + "epoch": 3.0375305351987563, + "grad_norm": 0.5653418682210891, + "learning_rate": 1.660401978106994e-06, + "loss": 0.0356, + "step": 6839 + }, + { + "epoch": 3.037974683544304, + "grad_norm": 0.4351497414805404, + "learning_rate": 1.6589595052843567e-06, + "loss": 0.0255, + "step": 6840 + }, + { + "epoch": 3.0384188318898513, + "grad_norm": 0.36924524660599933, + "learning_rate": 1.6575175346768597e-06, + "loss": 0.0213, + "step": 6841 + }, + { + "epoch": 3.0388629802353986, + "grad_norm": 0.36560298465093416, + "learning_rate": 1.6560760665012581e-06, + "loss": 0.0291, + "step": 6842 + }, + { + "epoch": 3.039307128580946, + "grad_norm": 0.5231650605635862, + "learning_rate": 1.6546351009742252e-06, + "loss": 0.0416, + "step": 6843 + }, + { + "epoch": 3.0397512769264936, + "grad_norm": 0.40002607438188575, + "learning_rate": 1.6531946383123647e-06, + "loss": 0.0279, + "step": 6844 + }, + { + "epoch": 3.040195425272041, + "grad_norm": 0.3820335905823042, + "learning_rate": 1.6517546787322019e-06, + "loss": 0.0332, + "step": 6845 + }, + { + "epoch": 3.040639573617588, + "grad_norm": 0.4052075541407361, + "learning_rate": 1.6503152224501883e-06, + "loss": 0.0273, + "step": 6846 + }, + { + "epoch": 3.0410837219631355, + "grad_norm": 0.40616006159309487, + "learning_rate": 1.6488762696826992e-06, + "loss": 0.029, + "step": 6847 + }, + { + "epoch": 3.041527870308683, + "grad_norm": 0.41149108296545567, + "learning_rate": 1.6474378206460306e-06, + "loss": 0.0239, + "step": 6848 + }, + { + "epoch": 3.0419720186542305, + "grad_norm": 0.35312483842889825, + "learning_rate": 1.6459998755564078e-06, + "loss": 0.0265, + "step": 6849 + }, + { + "epoch": 3.042416166999778, + "grad_norm": 0.44423084298706683, + "learning_rate": 1.64456243462998e-06, + "loss": 0.0214, + "step": 6850 + }, + { + "epoch": 3.0428603153453255, + "grad_norm": 0.3441262864593607, + "learning_rate": 1.6431254980828137e-06, + "loss": 0.0261, + "step": 6851 + }, + { + "epoch": 3.043304463690873, + "grad_norm": 0.5333130750896956, + "learning_rate": 1.6416890661309098e-06, + "loss": 0.0345, + "step": 6852 + }, + { + "epoch": 3.04374861203642, + "grad_norm": 0.32916273546434566, + "learning_rate": 1.6402531389901894e-06, + "loss": 0.0196, + "step": 6853 + }, + { + "epoch": 3.0441927603819674, + "grad_norm": 0.4575672416898921, + "learning_rate": 1.6388177168764919e-06, + "loss": 0.0316, + "step": 6854 + }, + { + "epoch": 3.044636908727515, + "grad_norm": 0.4231274981211455, + "learning_rate": 1.6373828000055886e-06, + "loss": 0.0267, + "step": 6855 + }, + { + "epoch": 3.0450810570730624, + "grad_norm": 0.35267922234888455, + "learning_rate": 1.6359483885931709e-06, + "loss": 0.0224, + "step": 6856 + }, + { + "epoch": 3.0455252054186097, + "grad_norm": 0.4270485321761482, + "learning_rate": 1.634514482854856e-06, + "loss": 0.0215, + "step": 6857 + }, + { + "epoch": 3.045969353764157, + "grad_norm": 0.45838498705619496, + "learning_rate": 1.6330810830061833e-06, + "loss": 0.0259, + "step": 6858 + }, + { + "epoch": 3.0464135021097047, + "grad_norm": 0.39622697822045655, + "learning_rate": 1.6316481892626202e-06, + "loss": 0.0227, + "step": 6859 + }, + { + "epoch": 3.046857650455252, + "grad_norm": 0.4767421948555728, + "learning_rate": 1.6302158018395504e-06, + "loss": 0.021, + "step": 6860 + }, + { + "epoch": 3.0473017988007993, + "grad_norm": 0.4108572908804955, + "learning_rate": 1.6287839209522883e-06, + "loss": 0.0243, + "step": 6861 + }, + { + "epoch": 3.047745947146347, + "grad_norm": 0.3361055439360816, + "learning_rate": 1.62735254681607e-06, + "loss": 0.016, + "step": 6862 + }, + { + "epoch": 3.0481900954918943, + "grad_norm": 0.4890820081997742, + "learning_rate": 1.6259216796460553e-06, + "loss": 0.0332, + "step": 6863 + }, + { + "epoch": 3.0486342438374416, + "grad_norm": 0.3446651596488856, + "learning_rate": 1.6244913196573291e-06, + "loss": 0.0231, + "step": 6864 + }, + { + "epoch": 3.049078392182989, + "grad_norm": 0.4912264351287514, + "learning_rate": 1.623061467064896e-06, + "loss": 0.0272, + "step": 6865 + }, + { + "epoch": 3.0495225405285367, + "grad_norm": 0.4925885654573531, + "learning_rate": 1.6216321220836885e-06, + "loss": 0.0248, + "step": 6866 + }, + { + "epoch": 3.049966688874084, + "grad_norm": 0.3824243589770029, + "learning_rate": 1.6202032849285626e-06, + "loss": 0.0225, + "step": 6867 + }, + { + "epoch": 3.0504108372196312, + "grad_norm": 0.36019367540666763, + "learning_rate": 1.618774955814293e-06, + "loss": 0.0179, + "step": 6868 + }, + { + "epoch": 3.050854985565179, + "grad_norm": 0.5143104427412662, + "learning_rate": 1.6173471349555858e-06, + "loss": 0.0376, + "step": 6869 + }, + { + "epoch": 3.0512991339107263, + "grad_norm": 0.5934473311861377, + "learning_rate": 1.6159198225670676e-06, + "loss": 0.0448, + "step": 6870 + }, + { + "epoch": 3.0517432822562736, + "grad_norm": 0.45164863726044957, + "learning_rate": 1.6144930188632835e-06, + "loss": 0.0278, + "step": 6871 + }, + { + "epoch": 3.052187430601821, + "grad_norm": 0.5798490797388325, + "learning_rate": 1.6130667240587083e-06, + "loss": 0.0297, + "step": 6872 + }, + { + "epoch": 3.0526315789473686, + "grad_norm": 0.38501590071391395, + "learning_rate": 1.6116409383677383e-06, + "loss": 0.0244, + "step": 6873 + }, + { + "epoch": 3.053075727292916, + "grad_norm": 0.4626822552615348, + "learning_rate": 1.6102156620046937e-06, + "loss": 0.0234, + "step": 6874 + }, + { + "epoch": 3.053519875638463, + "grad_norm": 0.4709097565849538, + "learning_rate": 1.6087908951838193e-06, + "loss": 0.0325, + "step": 6875 + }, + { + "epoch": 3.0539640239840105, + "grad_norm": 0.38512614079510565, + "learning_rate": 1.6073666381192777e-06, + "loss": 0.0185, + "step": 6876 + }, + { + "epoch": 3.054408172329558, + "grad_norm": 0.3981638223693639, + "learning_rate": 1.6059428910251617e-06, + "loss": 0.0297, + "step": 6877 + }, + { + "epoch": 3.0548523206751055, + "grad_norm": 0.40972645998312435, + "learning_rate": 1.604519654115484e-06, + "loss": 0.0217, + "step": 6878 + }, + { + "epoch": 3.0552964690206528, + "grad_norm": 0.44788376402765917, + "learning_rate": 1.6030969276041813e-06, + "loss": 0.0255, + "step": 6879 + }, + { + "epoch": 3.0557406173662005, + "grad_norm": 0.45493435461264287, + "learning_rate": 1.6016747117051135e-06, + "loss": 0.0367, + "step": 6880 + }, + { + "epoch": 3.056184765711748, + "grad_norm": 0.5367527315236188, + "learning_rate": 1.6002530066320659e-06, + "loss": 0.0386, + "step": 6881 + }, + { + "epoch": 3.056628914057295, + "grad_norm": 0.36716772288492844, + "learning_rate": 1.5988318125987412e-06, + "loss": 0.0243, + "step": 6882 + }, + { + "epoch": 3.0570730624028424, + "grad_norm": 0.39700109646152437, + "learning_rate": 1.597411129818771e-06, + "loss": 0.0331, + "step": 6883 + }, + { + "epoch": 3.05751721074839, + "grad_norm": 0.4200578523175907, + "learning_rate": 1.5959909585057099e-06, + "loss": 0.0254, + "step": 6884 + }, + { + "epoch": 3.0579613590939374, + "grad_norm": 0.5911121451638379, + "learning_rate": 1.5945712988730278e-06, + "loss": 0.0418, + "step": 6885 + }, + { + "epoch": 3.0584055074394847, + "grad_norm": 0.3316421396306147, + "learning_rate": 1.5931521511341292e-06, + "loss": 0.0266, + "step": 6886 + }, + { + "epoch": 3.058849655785032, + "grad_norm": 0.38444781154877616, + "learning_rate": 1.5917335155023368e-06, + "loss": 0.018, + "step": 6887 + }, + { + "epoch": 3.0592938041305797, + "grad_norm": 0.38538195654260493, + "learning_rate": 1.590315392190891e-06, + "loss": 0.0223, + "step": 6888 + }, + { + "epoch": 3.059737952476127, + "grad_norm": 0.3188424830635555, + "learning_rate": 1.5888977814129625e-06, + "loss": 0.0162, + "step": 6889 + }, + { + "epoch": 3.0601821008216743, + "grad_norm": 0.5604048534911154, + "learning_rate": 1.5874806833816436e-06, + "loss": 0.0451, + "step": 6890 + }, + { + "epoch": 3.060626249167222, + "grad_norm": 0.4396556865933855, + "learning_rate": 1.5860640983099435e-06, + "loss": 0.027, + "step": 6891 + }, + { + "epoch": 3.0610703975127693, + "grad_norm": 0.5338475568401356, + "learning_rate": 1.584648026410805e-06, + "loss": 0.0428, + "step": 6892 + }, + { + "epoch": 3.0615145458583166, + "grad_norm": 0.41364738868710765, + "learning_rate": 1.583232467897083e-06, + "loss": 0.0255, + "step": 6893 + }, + { + "epoch": 3.061958694203864, + "grad_norm": 0.44128675402771145, + "learning_rate": 1.581817422981562e-06, + "loss": 0.0266, + "step": 6894 + }, + { + "epoch": 3.0624028425494116, + "grad_norm": 0.40811877248427353, + "learning_rate": 1.5804028918769488e-06, + "loss": 0.0202, + "step": 6895 + }, + { + "epoch": 3.062846990894959, + "grad_norm": 0.38420681148774133, + "learning_rate": 1.5789888747958666e-06, + "loss": 0.0299, + "step": 6896 + }, + { + "epoch": 3.0632911392405062, + "grad_norm": 0.3556137019009215, + "learning_rate": 1.5775753719508708e-06, + "loss": 0.021, + "step": 6897 + }, + { + "epoch": 3.063735287586054, + "grad_norm": 0.2882748081505788, + "learning_rate": 1.5761623835544348e-06, + "loss": 0.0258, + "step": 6898 + }, + { + "epoch": 3.0641794359316012, + "grad_norm": 0.3787886197229445, + "learning_rate": 1.5747499098189524e-06, + "loss": 0.0281, + "step": 6899 + }, + { + "epoch": 3.0646235842771485, + "grad_norm": 0.3354175757689207, + "learning_rate": 1.5733379509567426e-06, + "loss": 0.0234, + "step": 6900 + }, + { + "epoch": 3.065067732622696, + "grad_norm": 0.3209071922045546, + "learning_rate": 1.5719265071800498e-06, + "loss": 0.0182, + "step": 6901 + }, + { + "epoch": 3.0655118809682436, + "grad_norm": 0.43501967745626874, + "learning_rate": 1.5705155787010324e-06, + "loss": 0.0353, + "step": 6902 + }, + { + "epoch": 3.065956029313791, + "grad_norm": 0.4929319158649113, + "learning_rate": 1.5691051657317835e-06, + "loss": 0.0271, + "step": 6903 + }, + { + "epoch": 3.066400177659338, + "grad_norm": 0.3584810561885387, + "learning_rate": 1.5676952684843072e-06, + "loss": 0.0258, + "step": 6904 + }, + { + "epoch": 3.0668443260048854, + "grad_norm": 0.4638657269405188, + "learning_rate": 1.5662858871705366e-06, + "loss": 0.0236, + "step": 6905 + }, + { + "epoch": 3.067288474350433, + "grad_norm": 0.5065060680688717, + "learning_rate": 1.5648770220023263e-06, + "loss": 0.0287, + "step": 6906 + }, + { + "epoch": 3.0677326226959805, + "grad_norm": 0.4859275697928332, + "learning_rate": 1.5634686731914533e-06, + "loss": 0.0434, + "step": 6907 + }, + { + "epoch": 3.0681767710415278, + "grad_norm": 0.38551762262809786, + "learning_rate": 1.562060840949612e-06, + "loss": 0.0265, + "step": 6908 + }, + { + "epoch": 3.0686209193870755, + "grad_norm": 0.6596634602131578, + "learning_rate": 1.5606535254884297e-06, + "loss": 0.0343, + "step": 6909 + }, + { + "epoch": 3.0690650677326228, + "grad_norm": 0.40308217569658306, + "learning_rate": 1.5592467270194456e-06, + "loss": 0.0306, + "step": 6910 + }, + { + "epoch": 3.06950921607817, + "grad_norm": 0.6298585513540671, + "learning_rate": 1.5578404457541264e-06, + "loss": 0.0276, + "step": 6911 + }, + { + "epoch": 3.0699533644237174, + "grad_norm": 0.6519358046715888, + "learning_rate": 1.5564346819038616e-06, + "loss": 0.0335, + "step": 6912 + }, + { + "epoch": 3.070397512769265, + "grad_norm": 0.4424893479760814, + "learning_rate": 1.5550294356799573e-06, + "loss": 0.0249, + "step": 6913 + }, + { + "epoch": 3.0708416611148124, + "grad_norm": 0.44065297318246016, + "learning_rate": 1.55362470729365e-06, + "loss": 0.031, + "step": 6914 + }, + { + "epoch": 3.0712858094603597, + "grad_norm": 0.3171014816611921, + "learning_rate": 1.5522204969560945e-06, + "loss": 0.0228, + "step": 6915 + }, + { + "epoch": 3.071729957805907, + "grad_norm": 0.48982852285102907, + "learning_rate": 1.5508168048783645e-06, + "loss": 0.0243, + "step": 6916 + }, + { + "epoch": 3.0721741061514547, + "grad_norm": 0.36638227513318433, + "learning_rate": 1.5494136312714598e-06, + "loss": 0.0209, + "step": 6917 + }, + { + "epoch": 3.072618254497002, + "grad_norm": 0.40243990074588076, + "learning_rate": 1.5480109763463031e-06, + "loss": 0.0323, + "step": 6918 + }, + { + "epoch": 3.0730624028425493, + "grad_norm": 0.6138342320939462, + "learning_rate": 1.5466088403137326e-06, + "loss": 0.0303, + "step": 6919 + }, + { + "epoch": 3.073506551188097, + "grad_norm": 0.41638032443840284, + "learning_rate": 1.5452072233845194e-06, + "loss": 0.0302, + "step": 6920 + }, + { + "epoch": 3.0739506995336443, + "grad_norm": 0.3958771196139635, + "learning_rate": 1.5438061257693459e-06, + "loss": 0.025, + "step": 6921 + }, + { + "epoch": 3.0743948478791916, + "grad_norm": 0.409201101395247, + "learning_rate": 1.5424055476788219e-06, + "loss": 0.0281, + "step": 6922 + }, + { + "epoch": 3.074838996224739, + "grad_norm": 0.36270140865155653, + "learning_rate": 1.54100548932348e-06, + "loss": 0.0211, + "step": 6923 + }, + { + "epoch": 3.0752831445702866, + "grad_norm": 0.3939239641466573, + "learning_rate": 1.5396059509137694e-06, + "loss": 0.0221, + "step": 6924 + }, + { + "epoch": 3.075727292915834, + "grad_norm": 0.3910124202096161, + "learning_rate": 1.5382069326600645e-06, + "loss": 0.0247, + "step": 6925 + }, + { + "epoch": 3.076171441261381, + "grad_norm": 0.4045068362861821, + "learning_rate": 1.536808434772667e-06, + "loss": 0.0252, + "step": 6926 + }, + { + "epoch": 3.076615589606929, + "grad_norm": 0.48842679698944363, + "learning_rate": 1.5354104574617889e-06, + "loss": 0.0272, + "step": 6927 + }, + { + "epoch": 3.0770597379524762, + "grad_norm": 0.5568048405499652, + "learning_rate": 1.5340130009375725e-06, + "loss": 0.0271, + "step": 6928 + }, + { + "epoch": 3.0775038862980235, + "grad_norm": 0.35344319652310796, + "learning_rate": 1.5326160654100803e-06, + "loss": 0.0203, + "step": 6929 + }, + { + "epoch": 3.077948034643571, + "grad_norm": 0.41134278821618964, + "learning_rate": 1.5312196510892907e-06, + "loss": 0.0203, + "step": 6930 + }, + { + "epoch": 3.0783921829891185, + "grad_norm": 0.43439941775477803, + "learning_rate": 1.529823758185115e-06, + "loss": 0.0307, + "step": 6931 + }, + { + "epoch": 3.078836331334666, + "grad_norm": 0.36710103440734276, + "learning_rate": 1.5284283869073753e-06, + "loss": 0.0226, + "step": 6932 + }, + { + "epoch": 3.079280479680213, + "grad_norm": 0.3795190028300577, + "learning_rate": 1.5270335374658202e-06, + "loss": 0.0193, + "step": 6933 + }, + { + "epoch": 3.0797246280257604, + "grad_norm": 0.416086597648837, + "learning_rate": 1.5256392100701201e-06, + "loss": 0.0341, + "step": 6934 + }, + { + "epoch": 3.080168776371308, + "grad_norm": 0.47903570547844193, + "learning_rate": 1.5242454049298672e-06, + "loss": 0.0326, + "step": 6935 + }, + { + "epoch": 3.0806129247168554, + "grad_norm": 0.41924576417637155, + "learning_rate": 1.5228521222545694e-06, + "loss": 0.0232, + "step": 6936 + }, + { + "epoch": 3.0810570730624027, + "grad_norm": 0.4086739351412735, + "learning_rate": 1.5214593622536677e-06, + "loss": 0.0191, + "step": 6937 + }, + { + "epoch": 3.0815012214079505, + "grad_norm": 0.42090655404898863, + "learning_rate": 1.5200671251365118e-06, + "loss": 0.0234, + "step": 6938 + }, + { + "epoch": 3.0819453697534978, + "grad_norm": 0.4491694516637582, + "learning_rate": 1.5186754111123814e-06, + "loss": 0.0309, + "step": 6939 + }, + { + "epoch": 3.082389518099045, + "grad_norm": 0.3593852978556601, + "learning_rate": 1.5172842203904752e-06, + "loss": 0.0219, + "step": 6940 + }, + { + "epoch": 3.0828336664445923, + "grad_norm": 0.4142188627731481, + "learning_rate": 1.5158935531799102e-06, + "loss": 0.0231, + "step": 6941 + }, + { + "epoch": 3.08327781479014, + "grad_norm": 0.3433910778382263, + "learning_rate": 1.5145034096897271e-06, + "loss": 0.0232, + "step": 6942 + }, + { + "epoch": 3.0837219631356874, + "grad_norm": 0.42040065985676595, + "learning_rate": 1.5131137901288928e-06, + "loss": 0.0274, + "step": 6943 + }, + { + "epoch": 3.0841661114812347, + "grad_norm": 0.37811344114245793, + "learning_rate": 1.5117246947062864e-06, + "loss": 0.0278, + "step": 6944 + }, + { + "epoch": 3.084610259826782, + "grad_norm": 0.6332645566079246, + "learning_rate": 1.5103361236307135e-06, + "loss": 0.0199, + "step": 6945 + }, + { + "epoch": 3.0850544081723297, + "grad_norm": 0.3367889244804879, + "learning_rate": 1.5089480771109021e-06, + "loss": 0.0221, + "step": 6946 + }, + { + "epoch": 3.085498556517877, + "grad_norm": 0.45979021485293203, + "learning_rate": 1.507560555355494e-06, + "loss": 0.0259, + "step": 6947 + }, + { + "epoch": 3.0859427048634243, + "grad_norm": 0.3846307657242651, + "learning_rate": 1.5061735585730636e-06, + "loss": 0.0252, + "step": 6948 + }, + { + "epoch": 3.086386853208972, + "grad_norm": 0.36534660664962226, + "learning_rate": 1.504787086972096e-06, + "loss": 0.0173, + "step": 6949 + }, + { + "epoch": 3.0868310015545193, + "grad_norm": 0.6011247419581762, + "learning_rate": 1.5034011407610021e-06, + "loss": 0.0399, + "step": 6950 + }, + { + "epoch": 3.0872751499000666, + "grad_norm": 0.4220887506749395, + "learning_rate": 1.502015720148115e-06, + "loss": 0.0265, + "step": 6951 + }, + { + "epoch": 3.087719298245614, + "grad_norm": 0.383622304930661, + "learning_rate": 1.5006308253416846e-06, + "loss": 0.0235, + "step": 6952 + }, + { + "epoch": 3.0881634465911616, + "grad_norm": 0.48234757617698376, + "learning_rate": 1.4992464565498831e-06, + "loss": 0.0354, + "step": 6953 + }, + { + "epoch": 3.088607594936709, + "grad_norm": 0.4645377835459482, + "learning_rate": 1.4978626139808094e-06, + "loss": 0.0332, + "step": 6954 + }, + { + "epoch": 3.089051743282256, + "grad_norm": 0.4061976190338302, + "learning_rate": 1.4964792978424746e-06, + "loss": 0.0231, + "step": 6955 + }, + { + "epoch": 3.089495891627804, + "grad_norm": 0.5482133012861865, + "learning_rate": 1.495096508342816e-06, + "loss": 0.0369, + "step": 6956 + }, + { + "epoch": 3.089940039973351, + "grad_norm": 0.3661650923331457, + "learning_rate": 1.4937142456896907e-06, + "loss": 0.0231, + "step": 6957 + }, + { + "epoch": 3.0903841883188985, + "grad_norm": 0.4124097316559942, + "learning_rate": 1.4923325100908749e-06, + "loss": 0.026, + "step": 6958 + }, + { + "epoch": 3.090828336664446, + "grad_norm": 0.5506003037235153, + "learning_rate": 1.490951301754066e-06, + "loss": 0.0272, + "step": 6959 + }, + { + "epoch": 3.0912724850099935, + "grad_norm": 0.39939980904986866, + "learning_rate": 1.4895706208868876e-06, + "loss": 0.028, + "step": 6960 + }, + { + "epoch": 3.091716633355541, + "grad_norm": 0.46904750344961466, + "learning_rate": 1.4881904676968756e-06, + "loss": 0.0258, + "step": 6961 + }, + { + "epoch": 3.092160781701088, + "grad_norm": 0.46041949493275547, + "learning_rate": 1.4868108423914913e-06, + "loss": 0.0284, + "step": 6962 + }, + { + "epoch": 3.0926049300466354, + "grad_norm": 0.3681186282877754, + "learning_rate": 1.4854317451781175e-06, + "loss": 0.0232, + "step": 6963 + }, + { + "epoch": 3.093049078392183, + "grad_norm": 0.4723461677240559, + "learning_rate": 1.4840531762640524e-06, + "loss": 0.0277, + "step": 6964 + }, + { + "epoch": 3.0934932267377304, + "grad_norm": 0.38175424189288976, + "learning_rate": 1.4826751358565211e-06, + "loss": 0.019, + "step": 6965 + }, + { + "epoch": 3.0939373750832777, + "grad_norm": 0.41086502374434375, + "learning_rate": 1.4812976241626659e-06, + "loss": 0.0287, + "step": 6966 + }, + { + "epoch": 3.0943815234288254, + "grad_norm": 0.39348852384191785, + "learning_rate": 1.4799206413895494e-06, + "loss": 0.026, + "step": 6967 + }, + { + "epoch": 3.0948256717743727, + "grad_norm": 0.4897251060835583, + "learning_rate": 1.4785441877441587e-06, + "loss": 0.0283, + "step": 6968 + }, + { + "epoch": 3.09526982011992, + "grad_norm": 0.396902884934688, + "learning_rate": 1.4771682634333933e-06, + "loss": 0.0241, + "step": 6969 + }, + { + "epoch": 3.0957139684654673, + "grad_norm": 0.6024718135186303, + "learning_rate": 1.4757928686640788e-06, + "loss": 0.0324, + "step": 6970 + }, + { + "epoch": 3.096158116811015, + "grad_norm": 0.3483981306238304, + "learning_rate": 1.4744180036429656e-06, + "loss": 0.0203, + "step": 6971 + }, + { + "epoch": 3.0966022651565623, + "grad_norm": 0.45268776356014967, + "learning_rate": 1.4730436685767135e-06, + "loss": 0.0302, + "step": 6972 + }, + { + "epoch": 3.0970464135021096, + "grad_norm": 0.5030795577403537, + "learning_rate": 1.4716698636719107e-06, + "loss": 0.0291, + "step": 6973 + }, + { + "epoch": 3.097490561847657, + "grad_norm": 0.4396401120191148, + "learning_rate": 1.470296589135065e-06, + "loss": 0.0342, + "step": 6974 + }, + { + "epoch": 3.0979347101932047, + "grad_norm": 0.3979132301675201, + "learning_rate": 1.4689238451725995e-06, + "loss": 0.0201, + "step": 6975 + }, + { + "epoch": 3.098378858538752, + "grad_norm": 0.3787151125083963, + "learning_rate": 1.4675516319908629e-06, + "loss": 0.0257, + "step": 6976 + }, + { + "epoch": 3.0988230068842992, + "grad_norm": 0.37709012703404926, + "learning_rate": 1.466179949796121e-06, + "loss": 0.031, + "step": 6977 + }, + { + "epoch": 3.099267155229847, + "grad_norm": 0.46313083842641917, + "learning_rate": 1.4648087987945625e-06, + "loss": 0.0187, + "step": 6978 + }, + { + "epoch": 3.0997113035753943, + "grad_norm": 0.41171533054686754, + "learning_rate": 1.4634381791922936e-06, + "loss": 0.0265, + "step": 6979 + }, + { + "epoch": 3.1001554519209416, + "grad_norm": 0.35930958286366377, + "learning_rate": 1.4620680911953433e-06, + "loss": 0.0287, + "step": 6980 + }, + { + "epoch": 3.100599600266489, + "grad_norm": 0.5934414847112055, + "learning_rate": 1.460698535009657e-06, + "loss": 0.021, + "step": 6981 + }, + { + "epoch": 3.1010437486120366, + "grad_norm": 0.4183538172458836, + "learning_rate": 1.4593295108411027e-06, + "loss": 0.0239, + "step": 6982 + }, + { + "epoch": 3.101487896957584, + "grad_norm": 0.3854427898688108, + "learning_rate": 1.4579610188954685e-06, + "loss": 0.0274, + "step": 6983 + }, + { + "epoch": 3.101932045303131, + "grad_norm": 0.4711207189434762, + "learning_rate": 1.4565930593784616e-06, + "loss": 0.0282, + "step": 6984 + }, + { + "epoch": 3.1023761936486784, + "grad_norm": 0.4616599969953312, + "learning_rate": 1.455225632495712e-06, + "loss": 0.029, + "step": 6985 + }, + { + "epoch": 3.102820341994226, + "grad_norm": 0.4076291882386512, + "learning_rate": 1.453858738452763e-06, + "loss": 0.0326, + "step": 6986 + }, + { + "epoch": 3.1032644903397735, + "grad_norm": 0.31683467511767754, + "learning_rate": 1.4524923774550825e-06, + "loss": 0.0176, + "step": 6987 + }, + { + "epoch": 3.1037086386853208, + "grad_norm": 0.39424949712455315, + "learning_rate": 1.4511265497080624e-06, + "loss": 0.0302, + "step": 6988 + }, + { + "epoch": 3.1041527870308685, + "grad_norm": 0.4103257196866521, + "learning_rate": 1.4497612554170054e-06, + "loss": 0.0273, + "step": 6989 + }, + { + "epoch": 3.104596935376416, + "grad_norm": 0.39987357613853897, + "learning_rate": 1.4483964947871392e-06, + "loss": 0.0255, + "step": 6990 + }, + { + "epoch": 3.105041083721963, + "grad_norm": 0.41100905630593115, + "learning_rate": 1.4470322680236132e-06, + "loss": 0.0302, + "step": 6991 + }, + { + "epoch": 3.1054852320675104, + "grad_norm": 0.4415871306778436, + "learning_rate": 1.4456685753314898e-06, + "loss": 0.0248, + "step": 6992 + }, + { + "epoch": 3.105929380413058, + "grad_norm": 0.3985527776293508, + "learning_rate": 1.4443054169157566e-06, + "loss": 0.024, + "step": 6993 + }, + { + "epoch": 3.1063735287586054, + "grad_norm": 0.5025368665991552, + "learning_rate": 1.4429427929813205e-06, + "loss": 0.0327, + "step": 6994 + }, + { + "epoch": 3.1068176771041527, + "grad_norm": 0.543210885126374, + "learning_rate": 1.4415807037330065e-06, + "loss": 0.0244, + "step": 6995 + }, + { + "epoch": 3.1072618254497, + "grad_norm": 0.5795921769157866, + "learning_rate": 1.4402191493755614e-06, + "loss": 0.0319, + "step": 6996 + }, + { + "epoch": 3.1077059737952477, + "grad_norm": 0.33181453971757086, + "learning_rate": 1.4388581301136463e-06, + "loss": 0.0239, + "step": 6997 + }, + { + "epoch": 3.108150122140795, + "grad_norm": 0.35538456672100055, + "learning_rate": 1.4374976461518475e-06, + "loss": 0.0202, + "step": 6998 + }, + { + "epoch": 3.1085942704863423, + "grad_norm": 0.30394370356689626, + "learning_rate": 1.436137697694669e-06, + "loss": 0.0148, + "step": 6999 + }, + { + "epoch": 3.10903841883189, + "grad_norm": 0.5358782537926752, + "learning_rate": 1.4347782849465335e-06, + "loss": 0.0361, + "step": 7000 + }, + { + "epoch": 3.1094825671774373, + "grad_norm": 0.45423657841045967, + "learning_rate": 1.4334194081117853e-06, + "loss": 0.0296, + "step": 7001 + }, + { + "epoch": 3.1099267155229846, + "grad_norm": 0.391451201654507, + "learning_rate": 1.4320610673946862e-06, + "loss": 0.0219, + "step": 7002 + }, + { + "epoch": 3.110370863868532, + "grad_norm": 0.3695407430957425, + "learning_rate": 1.4307032629994162e-06, + "loss": 0.0275, + "step": 7003 + }, + { + "epoch": 3.1108150122140796, + "grad_norm": 0.3522320469665786, + "learning_rate": 1.4293459951300775e-06, + "loss": 0.0191, + "step": 7004 + }, + { + "epoch": 3.111259160559627, + "grad_norm": 0.39441008031342695, + "learning_rate": 1.4279892639906906e-06, + "loss": 0.022, + "step": 7005 + }, + { + "epoch": 3.111703308905174, + "grad_norm": 0.3604951087918798, + "learning_rate": 1.4266330697851955e-06, + "loss": 0.0291, + "step": 7006 + }, + { + "epoch": 3.112147457250722, + "grad_norm": 0.3966852264525251, + "learning_rate": 1.4252774127174502e-06, + "loss": 0.0336, + "step": 7007 + }, + { + "epoch": 3.1125916055962692, + "grad_norm": 0.4920706934740084, + "learning_rate": 1.4239222929912354e-06, + "loss": 0.0339, + "step": 7008 + }, + { + "epoch": 3.1130357539418165, + "grad_norm": 0.452968414032015, + "learning_rate": 1.422567710810246e-06, + "loss": 0.0334, + "step": 7009 + }, + { + "epoch": 3.113479902287364, + "grad_norm": 0.34946018464557227, + "learning_rate": 1.421213666378099e-06, + "loss": 0.0259, + "step": 7010 + }, + { + "epoch": 3.1139240506329116, + "grad_norm": 0.3785739486603564, + "learning_rate": 1.419860159898331e-06, + "loss": 0.0248, + "step": 7011 + }, + { + "epoch": 3.114368198978459, + "grad_norm": 0.48985400850238237, + "learning_rate": 1.418507191574397e-06, + "loss": 0.0245, + "step": 7012 + }, + { + "epoch": 3.114812347324006, + "grad_norm": 0.37256560600518596, + "learning_rate": 1.4171547616096726e-06, + "loss": 0.0263, + "step": 7013 + }, + { + "epoch": 3.1152564956695534, + "grad_norm": 0.4745149660869735, + "learning_rate": 1.4158028702074478e-06, + "loss": 0.0315, + "step": 7014 + }, + { + "epoch": 3.115700644015101, + "grad_norm": 0.4272234241381395, + "learning_rate": 1.4144515175709366e-06, + "loss": 0.0359, + "step": 7015 + }, + { + "epoch": 3.1161447923606485, + "grad_norm": 0.4717714680449159, + "learning_rate": 1.4131007039032702e-06, + "loss": 0.0247, + "step": 7016 + }, + { + "epoch": 3.1165889407061957, + "grad_norm": 0.3661175444111234, + "learning_rate": 1.4117504294074985e-06, + "loss": 0.0199, + "step": 7017 + }, + { + "epoch": 3.1170330890517435, + "grad_norm": 0.39156431702822175, + "learning_rate": 1.4104006942865911e-06, + "loss": 0.0225, + "step": 7018 + }, + { + "epoch": 3.1174772373972908, + "grad_norm": 0.4841602294941045, + "learning_rate": 1.4090514987434372e-06, + "loss": 0.0224, + "step": 7019 + }, + { + "epoch": 3.117921385742838, + "grad_norm": 0.3409883421268121, + "learning_rate": 1.4077028429808415e-06, + "loss": 0.0184, + "step": 7020 + }, + { + "epoch": 3.1183655340883853, + "grad_norm": 0.4571971842074867, + "learning_rate": 1.4063547272015305e-06, + "loss": 0.0334, + "step": 7021 + }, + { + "epoch": 3.118809682433933, + "grad_norm": 0.4859572155144659, + "learning_rate": 1.4050071516081499e-06, + "loss": 0.0236, + "step": 7022 + }, + { + "epoch": 3.1192538307794804, + "grad_norm": 0.36054003070051327, + "learning_rate": 1.4036601164032626e-06, + "loss": 0.0244, + "step": 7023 + }, + { + "epoch": 3.1196979791250277, + "grad_norm": 0.4164627314286096, + "learning_rate": 1.4023136217893518e-06, + "loss": 0.0276, + "step": 7024 + }, + { + "epoch": 3.120142127470575, + "grad_norm": 0.5468105573468193, + "learning_rate": 1.4009676679688167e-06, + "loss": 0.0344, + "step": 7025 + }, + { + "epoch": 3.1205862758161227, + "grad_norm": 0.5699602944440265, + "learning_rate": 1.399622255143978e-06, + "loss": 0.0227, + "step": 7026 + }, + { + "epoch": 3.12103042416167, + "grad_norm": 0.3763296846537518, + "learning_rate": 1.3982773835170738e-06, + "loss": 0.0233, + "step": 7027 + }, + { + "epoch": 3.1214745725072173, + "grad_norm": 0.3809901917507222, + "learning_rate": 1.396933053290262e-06, + "loss": 0.0261, + "step": 7028 + }, + { + "epoch": 3.121918720852765, + "grad_norm": 0.3942636268702358, + "learning_rate": 1.3955892646656172e-06, + "loss": 0.0244, + "step": 7029 + }, + { + "epoch": 3.1223628691983123, + "grad_norm": 0.4034606683568663, + "learning_rate": 1.3942460178451357e-06, + "loss": 0.0267, + "step": 7030 + }, + { + "epoch": 3.1228070175438596, + "grad_norm": 0.4219843417449712, + "learning_rate": 1.3929033130307273e-06, + "loss": 0.0291, + "step": 7031 + }, + { + "epoch": 3.123251165889407, + "grad_norm": 0.39268086756994935, + "learning_rate": 1.3915611504242248e-06, + "loss": 0.0242, + "step": 7032 + }, + { + "epoch": 3.1236953142349546, + "grad_norm": 0.41199677628842846, + "learning_rate": 1.390219530227378e-06, + "loss": 0.0284, + "step": 7033 + }, + { + "epoch": 3.124139462580502, + "grad_norm": 0.39148197640312005, + "learning_rate": 1.3888784526418552e-06, + "loss": 0.0268, + "step": 7034 + }, + { + "epoch": 3.124583610926049, + "grad_norm": 0.6462152373026261, + "learning_rate": 1.3875379178692433e-06, + "loss": 0.042, + "step": 7035 + }, + { + "epoch": 3.125027759271597, + "grad_norm": 0.5120068554721335, + "learning_rate": 1.3861979261110493e-06, + "loss": 0.0352, + "step": 7036 + }, + { + "epoch": 3.125471907617144, + "grad_norm": 0.5096494527748306, + "learning_rate": 1.3848584775686923e-06, + "loss": 0.0329, + "step": 7037 + }, + { + "epoch": 3.1259160559626915, + "grad_norm": 0.49019540876863293, + "learning_rate": 1.3835195724435175e-06, + "loss": 0.0274, + "step": 7038 + }, + { + "epoch": 3.126360204308239, + "grad_norm": 0.4378287851338807, + "learning_rate": 1.3821812109367838e-06, + "loss": 0.0261, + "step": 7039 + }, + { + "epoch": 3.1268043526537865, + "grad_norm": 0.4665401569139363, + "learning_rate": 1.38084339324967e-06, + "loss": 0.0244, + "step": 7040 + }, + { + "epoch": 3.127248500999334, + "grad_norm": 0.4839568704489146, + "learning_rate": 1.3795061195832749e-06, + "loss": 0.0185, + "step": 7041 + }, + { + "epoch": 3.127692649344881, + "grad_norm": 0.38633411841281007, + "learning_rate": 1.3781693901386094e-06, + "loss": 0.0206, + "step": 7042 + }, + { + "epoch": 3.1281367976904284, + "grad_norm": 0.3852498984003089, + "learning_rate": 1.3768332051166089e-06, + "loss": 0.023, + "step": 7043 + }, + { + "epoch": 3.128580946035976, + "grad_norm": 0.3527312929509418, + "learning_rate": 1.3754975647181245e-06, + "loss": 0.0272, + "step": 7044 + }, + { + "epoch": 3.1290250943815234, + "grad_norm": 0.4078162247196801, + "learning_rate": 1.374162469143926e-06, + "loss": 0.0184, + "step": 7045 + }, + { + "epoch": 3.1294692427270707, + "grad_norm": 0.6947202101134006, + "learning_rate": 1.3728279185947002e-06, + "loss": 0.0287, + "step": 7046 + }, + { + "epoch": 3.1299133910726185, + "grad_norm": 0.3650330793591096, + "learning_rate": 1.3714939132710547e-06, + "loss": 0.0203, + "step": 7047 + }, + { + "epoch": 3.1303575394181657, + "grad_norm": 0.3894198333673228, + "learning_rate": 1.3701604533735102e-06, + "loss": 0.0383, + "step": 7048 + }, + { + "epoch": 3.130801687763713, + "grad_norm": 0.3626357108946466, + "learning_rate": 1.3688275391025096e-06, + "loss": 0.019, + "step": 7049 + }, + { + "epoch": 3.1312458361092603, + "grad_norm": 0.421447173128161, + "learning_rate": 1.3674951706584134e-06, + "loss": 0.0182, + "step": 7050 + }, + { + "epoch": 3.131689984454808, + "grad_norm": 0.4627577920703621, + "learning_rate": 1.3661633482414977e-06, + "loss": 0.0265, + "step": 7051 + }, + { + "epoch": 3.1321341328003554, + "grad_norm": 0.43244182202980463, + "learning_rate": 1.3648320720519592e-06, + "loss": 0.0196, + "step": 7052 + }, + { + "epoch": 3.1325782811459026, + "grad_norm": 0.4592125815303857, + "learning_rate": 1.3635013422899124e-06, + "loss": 0.0298, + "step": 7053 + }, + { + "epoch": 3.13302242949145, + "grad_norm": 0.44770730468191844, + "learning_rate": 1.3621711591553854e-06, + "loss": 0.0347, + "step": 7054 + }, + { + "epoch": 3.1334665778369977, + "grad_norm": 0.4417156773267622, + "learning_rate": 1.3608415228483291e-06, + "loss": 0.0286, + "step": 7055 + }, + { + "epoch": 3.133910726182545, + "grad_norm": 0.33340946983600855, + "learning_rate": 1.3595124335686104e-06, + "loss": 0.0239, + "step": 7056 + }, + { + "epoch": 3.1343548745280922, + "grad_norm": 0.45018385116758614, + "learning_rate": 1.3581838915160145e-06, + "loss": 0.0307, + "step": 7057 + }, + { + "epoch": 3.13479902287364, + "grad_norm": 0.49692015647969867, + "learning_rate": 1.3568558968902445e-06, + "loss": 0.0291, + "step": 7058 + }, + { + "epoch": 3.1352431712191873, + "grad_norm": 0.43270232873861747, + "learning_rate": 1.3555284498909183e-06, + "loss": 0.031, + "step": 7059 + }, + { + "epoch": 3.1356873195647346, + "grad_norm": 0.40820462560013654, + "learning_rate": 1.3542015507175743e-06, + "loss": 0.0235, + "step": 7060 + }, + { + "epoch": 3.136131467910282, + "grad_norm": 0.4972923121321135, + "learning_rate": 1.3528751995696688e-06, + "loss": 0.051, + "step": 7061 + }, + { + "epoch": 3.1365756162558296, + "grad_norm": 0.41578377070138484, + "learning_rate": 1.3515493966465743e-06, + "loss": 0.028, + "step": 7062 + }, + { + "epoch": 3.137019764601377, + "grad_norm": 0.4491202264177913, + "learning_rate": 1.350224142147582e-06, + "loss": 0.0172, + "step": 7063 + }, + { + "epoch": 3.137463912946924, + "grad_norm": 0.4586826960329923, + "learning_rate": 1.3488994362719016e-06, + "loss": 0.0244, + "step": 7064 + }, + { + "epoch": 3.137908061292472, + "grad_norm": 0.3366156664630001, + "learning_rate": 1.3475752792186559e-06, + "loss": 0.0177, + "step": 7065 + }, + { + "epoch": 3.138352209638019, + "grad_norm": 0.4038403979608836, + "learning_rate": 1.3462516711868894e-06, + "loss": 0.0283, + "step": 7066 + }, + { + "epoch": 3.1387963579835665, + "grad_norm": 0.36057350319064324, + "learning_rate": 1.3449286123755628e-06, + "loss": 0.0282, + "step": 7067 + }, + { + "epoch": 3.1392405063291138, + "grad_norm": 0.4323404311148623, + "learning_rate": 1.343606102983555e-06, + "loss": 0.0285, + "step": 7068 + }, + { + "epoch": 3.1396846546746615, + "grad_norm": 0.42881049252651987, + "learning_rate": 1.3422841432096623e-06, + "loss": 0.0299, + "step": 7069 + }, + { + "epoch": 3.140128803020209, + "grad_norm": 0.2902742598750208, + "learning_rate": 1.3409627332525954e-06, + "loss": 0.0181, + "step": 7070 + }, + { + "epoch": 3.140572951365756, + "grad_norm": 0.4433992896224678, + "learning_rate": 1.3396418733109856e-06, + "loss": 0.0257, + "step": 7071 + }, + { + "epoch": 3.1410170997113034, + "grad_norm": 0.6894028385625418, + "learning_rate": 1.3383215635833829e-06, + "loss": 0.0304, + "step": 7072 + }, + { + "epoch": 3.141461248056851, + "grad_norm": 0.5245286511325312, + "learning_rate": 1.337001804268247e-06, + "loss": 0.0269, + "step": 7073 + }, + { + "epoch": 3.1419053964023984, + "grad_norm": 0.5000099376315333, + "learning_rate": 1.3356825955639645e-06, + "loss": 0.0308, + "step": 7074 + }, + { + "epoch": 3.1423495447479457, + "grad_norm": 0.6132659503324829, + "learning_rate": 1.3343639376688355e-06, + "loss": 0.032, + "step": 7075 + }, + { + "epoch": 3.1427936930934934, + "grad_norm": 0.4546873288387578, + "learning_rate": 1.3330458307810734e-06, + "loss": 0.0246, + "step": 7076 + }, + { + "epoch": 3.1432378414390407, + "grad_norm": 0.385185952241093, + "learning_rate": 1.3317282750988137e-06, + "loss": 0.0184, + "step": 7077 + }, + { + "epoch": 3.143681989784588, + "grad_norm": 0.39438378600816204, + "learning_rate": 1.3304112708201073e-06, + "loss": 0.0279, + "step": 7078 + }, + { + "epoch": 3.1441261381301353, + "grad_norm": 0.4806029181761233, + "learning_rate": 1.329094818142922e-06, + "loss": 0.0241, + "step": 7079 + }, + { + "epoch": 3.144570286475683, + "grad_norm": 0.5228838911668552, + "learning_rate": 1.327778917265144e-06, + "loss": 0.0274, + "step": 7080 + }, + { + "epoch": 3.1450144348212303, + "grad_norm": 0.46531223651019554, + "learning_rate": 1.3264635683845755e-06, + "loss": 0.0241, + "step": 7081 + }, + { + "epoch": 3.1454585831667776, + "grad_norm": 0.32902026350154623, + "learning_rate": 1.3251487716989341e-06, + "loss": 0.0169, + "step": 7082 + }, + { + "epoch": 3.145902731512325, + "grad_norm": 0.3895325085859838, + "learning_rate": 1.3238345274058572e-06, + "loss": 0.0219, + "step": 7083 + }, + { + "epoch": 3.1463468798578726, + "grad_norm": 0.4481405965406686, + "learning_rate": 1.322520835702898e-06, + "loss": 0.025, + "step": 7084 + }, + { + "epoch": 3.14679102820342, + "grad_norm": 0.5128203909715668, + "learning_rate": 1.3212076967875265e-06, + "loss": 0.0348, + "step": 7085 + }, + { + "epoch": 3.1472351765489672, + "grad_norm": 0.46447445611495575, + "learning_rate": 1.3198951108571312e-06, + "loss": 0.0313, + "step": 7086 + }, + { + "epoch": 3.147679324894515, + "grad_norm": 0.29983798874260054, + "learning_rate": 1.3185830781090136e-06, + "loss": 0.0142, + "step": 7087 + }, + { + "epoch": 3.1481234732400623, + "grad_norm": 0.39996284685169026, + "learning_rate": 1.3172715987403955e-06, + "loss": 0.0279, + "step": 7088 + }, + { + "epoch": 3.1485676215856095, + "grad_norm": 0.44271681407903213, + "learning_rate": 1.3159606729484165e-06, + "loss": 0.0309, + "step": 7089 + }, + { + "epoch": 3.149011769931157, + "grad_norm": 0.39836027232808446, + "learning_rate": 1.3146503009301258e-06, + "loss": 0.0258, + "step": 7090 + }, + { + "epoch": 3.1494559182767046, + "grad_norm": 0.38922372733595656, + "learning_rate": 1.3133404828824998e-06, + "loss": 0.0249, + "step": 7091 + }, + { + "epoch": 3.149900066622252, + "grad_norm": 0.4629939120495913, + "learning_rate": 1.3120312190024265e-06, + "loss": 0.0177, + "step": 7092 + }, + { + "epoch": 3.150344214967799, + "grad_norm": 0.5175669897186087, + "learning_rate": 1.3107225094867066e-06, + "loss": 0.0352, + "step": 7093 + }, + { + "epoch": 3.150788363313347, + "grad_norm": 0.4487683854548024, + "learning_rate": 1.3094143545320636e-06, + "loss": 0.0309, + "step": 7094 + }, + { + "epoch": 3.151232511658894, + "grad_norm": 0.4853986633083335, + "learning_rate": 1.3081067543351351e-06, + "loss": 0.0351, + "step": 7095 + }, + { + "epoch": 3.1516766600044415, + "grad_norm": 0.4346141380035006, + "learning_rate": 1.3067997090924755e-06, + "loss": 0.0275, + "step": 7096 + }, + { + "epoch": 3.1521208083499888, + "grad_norm": 0.7565904341486527, + "learning_rate": 1.305493219000558e-06, + "loss": 0.0331, + "step": 7097 + }, + { + "epoch": 3.1525649566955365, + "grad_norm": 0.4041064888863333, + "learning_rate": 1.3041872842557669e-06, + "loss": 0.0303, + "step": 7098 + }, + { + "epoch": 3.153009105041084, + "grad_norm": 0.39758421079788286, + "learning_rate": 1.3028819050544078e-06, + "loss": 0.02, + "step": 7099 + }, + { + "epoch": 3.153453253386631, + "grad_norm": 0.43003387120599507, + "learning_rate": 1.3015770815927009e-06, + "loss": 0.034, + "step": 7100 + }, + { + "epoch": 3.1538974017321784, + "grad_norm": 0.4278201479573488, + "learning_rate": 1.3002728140667847e-06, + "loss": 0.0233, + "step": 7101 + }, + { + "epoch": 3.154341550077726, + "grad_norm": 0.37161324406542584, + "learning_rate": 1.2989691026727114e-06, + "loss": 0.0197, + "step": 7102 + }, + { + "epoch": 3.1547856984232734, + "grad_norm": 0.49261598275828133, + "learning_rate": 1.2976659476064528e-06, + "loss": 0.0332, + "step": 7103 + }, + { + "epoch": 3.1552298467688207, + "grad_norm": 0.41968097121084996, + "learning_rate": 1.2963633490638927e-06, + "loss": 0.0266, + "step": 7104 + }, + { + "epoch": 3.155673995114368, + "grad_norm": 0.37929111645310976, + "learning_rate": 1.2950613072408352e-06, + "loss": 0.0206, + "step": 7105 + }, + { + "epoch": 3.1561181434599157, + "grad_norm": 0.4490037141777807, + "learning_rate": 1.2937598223330006e-06, + "loss": 0.0335, + "step": 7106 + }, + { + "epoch": 3.156562291805463, + "grad_norm": 0.6757819685260033, + "learning_rate": 1.2924588945360195e-06, + "loss": 0.0387, + "step": 7107 + }, + { + "epoch": 3.1570064401510103, + "grad_norm": 0.38030068064549954, + "learning_rate": 1.2911585240454483e-06, + "loss": 0.0225, + "step": 7108 + }, + { + "epoch": 3.157450588496558, + "grad_norm": 0.3303111905779037, + "learning_rate": 1.2898587110567546e-06, + "loss": 0.0158, + "step": 7109 + }, + { + "epoch": 3.1578947368421053, + "grad_norm": 0.36066815697274596, + "learning_rate": 1.2885594557653197e-06, + "loss": 0.0271, + "step": 7110 + }, + { + "epoch": 3.1583388851876526, + "grad_norm": 0.5116023254049137, + "learning_rate": 1.2872607583664443e-06, + "loss": 0.0332, + "step": 7111 + }, + { + "epoch": 3.1587830335332, + "grad_norm": 0.4098679802743982, + "learning_rate": 1.2859626190553459e-06, + "loss": 0.0206, + "step": 7112 + }, + { + "epoch": 3.1592271818787476, + "grad_norm": 0.3664260571550095, + "learning_rate": 1.2846650380271563e-06, + "loss": 0.0215, + "step": 7113 + }, + { + "epoch": 3.159671330224295, + "grad_norm": 0.4801826025425368, + "learning_rate": 1.283368015476925e-06, + "loss": 0.0207, + "step": 7114 + }, + { + "epoch": 3.160115478569842, + "grad_norm": 0.43450502438961686, + "learning_rate": 1.2820715515996146e-06, + "loss": 0.02, + "step": 7115 + }, + { + "epoch": 3.16055962691539, + "grad_norm": 0.4768531731139893, + "learning_rate": 1.280775646590106e-06, + "loss": 0.0321, + "step": 7116 + }, + { + "epoch": 3.1610037752609372, + "grad_norm": 0.3425693376069199, + "learning_rate": 1.2794803006431984e-06, + "loss": 0.0188, + "step": 7117 + }, + { + "epoch": 3.1614479236064845, + "grad_norm": 0.5214149879054594, + "learning_rate": 1.2781855139535988e-06, + "loss": 0.0262, + "step": 7118 + }, + { + "epoch": 3.161892071952032, + "grad_norm": 0.40187878997324955, + "learning_rate": 1.2768912867159406e-06, + "loss": 0.0361, + "step": 7119 + }, + { + "epoch": 3.1623362202975795, + "grad_norm": 0.5083763707824389, + "learning_rate": 1.2755976191247682e-06, + "loss": 0.021, + "step": 7120 + }, + { + "epoch": 3.162780368643127, + "grad_norm": 0.4055192408506625, + "learning_rate": 1.2743045113745385e-06, + "loss": 0.0227, + "step": 7121 + }, + { + "epoch": 3.163224516988674, + "grad_norm": 0.4094976103066204, + "learning_rate": 1.2730119636596288e-06, + "loss": 0.0281, + "step": 7122 + }, + { + "epoch": 3.163668665334222, + "grad_norm": 0.45282087571650875, + "learning_rate": 1.2717199761743336e-06, + "loss": 0.0251, + "step": 7123 + }, + { + "epoch": 3.164112813679769, + "grad_norm": 0.4299624187324527, + "learning_rate": 1.2704285491128553e-06, + "loss": 0.0283, + "step": 7124 + }, + { + "epoch": 3.1645569620253164, + "grad_norm": 0.4009907594057322, + "learning_rate": 1.2691376826693235e-06, + "loss": 0.0281, + "step": 7125 + }, + { + "epoch": 3.1650011103708637, + "grad_norm": 0.36139049623487757, + "learning_rate": 1.2678473770377726e-06, + "loss": 0.0227, + "step": 7126 + }, + { + "epoch": 3.1654452587164115, + "grad_norm": 0.3911090099172185, + "learning_rate": 1.2665576324121587e-06, + "loss": 0.019, + "step": 7127 + }, + { + "epoch": 3.1658894070619588, + "grad_norm": 0.4537050187193512, + "learning_rate": 1.2652684489863532e-06, + "loss": 0.0171, + "step": 7128 + }, + { + "epoch": 3.166333555407506, + "grad_norm": 0.4737431638279446, + "learning_rate": 1.2639798269541432e-06, + "loss": 0.0262, + "step": 7129 + }, + { + "epoch": 3.1667777037530533, + "grad_norm": 0.5356720104817232, + "learning_rate": 1.2626917665092265e-06, + "loss": 0.0367, + "step": 7130 + }, + { + "epoch": 3.167221852098601, + "grad_norm": 0.41884008779805304, + "learning_rate": 1.2614042678452254e-06, + "loss": 0.0277, + "step": 7131 + }, + { + "epoch": 3.1676660004441484, + "grad_norm": 0.3813088744442865, + "learning_rate": 1.260117331155669e-06, + "loss": 0.0241, + "step": 7132 + }, + { + "epoch": 3.1681101487896957, + "grad_norm": 0.4824659904614872, + "learning_rate": 1.258830956634008e-06, + "loss": 0.0266, + "step": 7133 + }, + { + "epoch": 3.168554297135243, + "grad_norm": 0.3933046865818541, + "learning_rate": 1.2575451444736065e-06, + "loss": 0.0241, + "step": 7134 + }, + { + "epoch": 3.1689984454807907, + "grad_norm": 0.4675792699661695, + "learning_rate": 1.25625989486774e-06, + "loss": 0.0231, + "step": 7135 + }, + { + "epoch": 3.169442593826338, + "grad_norm": 0.47235339239033924, + "learning_rate": 1.2549752080096078e-06, + "loss": 0.0228, + "step": 7136 + }, + { + "epoch": 3.1698867421718853, + "grad_norm": 0.5375969536724705, + "learning_rate": 1.2536910840923205e-06, + "loss": 0.0221, + "step": 7137 + }, + { + "epoch": 3.170330890517433, + "grad_norm": 0.4497621301881044, + "learning_rate": 1.2524075233089e-06, + "loss": 0.0343, + "step": 7138 + }, + { + "epoch": 3.1707750388629803, + "grad_norm": 0.292034357924613, + "learning_rate": 1.251124525852289e-06, + "loss": 0.0126, + "step": 7139 + }, + { + "epoch": 3.1712191872085276, + "grad_norm": 0.35964553866038385, + "learning_rate": 1.2498420919153464e-06, + "loss": 0.0275, + "step": 7140 + }, + { + "epoch": 3.171663335554075, + "grad_norm": 0.35000228782660847, + "learning_rate": 1.2485602216908378e-06, + "loss": 0.0175, + "step": 7141 + }, + { + "epoch": 3.1721074838996226, + "grad_norm": 0.3682428993293255, + "learning_rate": 1.2472789153714572e-06, + "loss": 0.0239, + "step": 7142 + }, + { + "epoch": 3.17255163224517, + "grad_norm": 0.5552689170116238, + "learning_rate": 1.245998173149801e-06, + "loss": 0.0296, + "step": 7143 + }, + { + "epoch": 3.172995780590717, + "grad_norm": 0.4582090480824963, + "learning_rate": 1.244717995218389e-06, + "loss": 0.0331, + "step": 7144 + }, + { + "epoch": 3.173439928936265, + "grad_norm": 0.5212899725680629, + "learning_rate": 1.2434383817696548e-06, + "loss": 0.0258, + "step": 7145 + }, + { + "epoch": 3.173884077281812, + "grad_norm": 0.381695700080771, + "learning_rate": 1.2421593329959437e-06, + "loss": 0.0243, + "step": 7146 + }, + { + "epoch": 3.1743282256273595, + "grad_norm": 0.3974320066485413, + "learning_rate": 1.2408808490895176e-06, + "loss": 0.0296, + "step": 7147 + }, + { + "epoch": 3.174772373972907, + "grad_norm": 0.37908843626651684, + "learning_rate": 1.2396029302425589e-06, + "loss": 0.0233, + "step": 7148 + }, + { + "epoch": 3.1752165223184545, + "grad_norm": 0.3916584808786519, + "learning_rate": 1.2383255766471564e-06, + "loss": 0.0231, + "step": 7149 + }, + { + "epoch": 3.175660670664002, + "grad_norm": 0.4921323330468778, + "learning_rate": 1.2370487884953198e-06, + "loss": 0.0282, + "step": 7150 + }, + { + "epoch": 3.176104819009549, + "grad_norm": 0.4474378513318365, + "learning_rate": 1.2357725659789727e-06, + "loss": 0.0316, + "step": 7151 + }, + { + "epoch": 3.1765489673550964, + "grad_norm": 0.507768145335404, + "learning_rate": 1.234496909289949e-06, + "loss": 0.0314, + "step": 7152 + }, + { + "epoch": 3.176993115700644, + "grad_norm": 0.43929762431880826, + "learning_rate": 1.2332218186200062e-06, + "loss": 0.036, + "step": 7153 + }, + { + "epoch": 3.1774372640461914, + "grad_norm": 0.45050762765015684, + "learning_rate": 1.2319472941608118e-06, + "loss": 0.0243, + "step": 7154 + }, + { + "epoch": 3.1778814123917387, + "grad_norm": 0.35294587274588873, + "learning_rate": 1.2306733361039457e-06, + "loss": 0.0214, + "step": 7155 + }, + { + "epoch": 3.1783255607372864, + "grad_norm": 0.41208302752606807, + "learning_rate": 1.2293999446409067e-06, + "loss": 0.0261, + "step": 7156 + }, + { + "epoch": 3.1787697090828337, + "grad_norm": 0.5041315813475921, + "learning_rate": 1.228127119963109e-06, + "loss": 0.0285, + "step": 7157 + }, + { + "epoch": 3.179213857428381, + "grad_norm": 0.44300292068122016, + "learning_rate": 1.2268548622618753e-06, + "loss": 0.0297, + "step": 7158 + }, + { + "epoch": 3.1796580057739283, + "grad_norm": 0.45740073144316085, + "learning_rate": 1.2255831717284528e-06, + "loss": 0.0279, + "step": 7159 + }, + { + "epoch": 3.180102154119476, + "grad_norm": 0.3604803514556297, + "learning_rate": 1.2243120485539944e-06, + "loss": 0.0223, + "step": 7160 + }, + { + "epoch": 3.1805463024650233, + "grad_norm": 0.3321847278445319, + "learning_rate": 1.223041492929573e-06, + "loss": 0.0165, + "step": 7161 + }, + { + "epoch": 3.1809904508105706, + "grad_norm": 0.35593201085972326, + "learning_rate": 1.221771505046176e-06, + "loss": 0.0214, + "step": 7162 + }, + { + "epoch": 3.181434599156118, + "grad_norm": 0.48430071345607734, + "learning_rate": 1.2205020850947009e-06, + "loss": 0.0324, + "step": 7163 + }, + { + "epoch": 3.1818787475016657, + "grad_norm": 0.3786369491408754, + "learning_rate": 1.219233233265964e-06, + "loss": 0.0196, + "step": 7164 + }, + { + "epoch": 3.182322895847213, + "grad_norm": 0.422396003313245, + "learning_rate": 1.2179649497506984e-06, + "loss": 0.0235, + "step": 7165 + }, + { + "epoch": 3.1827670441927602, + "grad_norm": 0.5088819210251279, + "learning_rate": 1.216697234739545e-06, + "loss": 0.0245, + "step": 7166 + }, + { + "epoch": 3.183211192538308, + "grad_norm": 0.44593024209496107, + "learning_rate": 1.2154300884230647e-06, + "loss": 0.0248, + "step": 7167 + }, + { + "epoch": 3.1836553408838553, + "grad_norm": 0.44151282819218973, + "learning_rate": 1.2141635109917322e-06, + "loss": 0.0243, + "step": 7168 + }, + { + "epoch": 3.1840994892294026, + "grad_norm": 0.5179310919609202, + "learning_rate": 1.2128975026359308e-06, + "loss": 0.0303, + "step": 7169 + }, + { + "epoch": 3.18454363757495, + "grad_norm": 0.47378558079444577, + "learning_rate": 1.2116320635459694e-06, + "loss": 0.0308, + "step": 7170 + }, + { + "epoch": 3.1849877859204976, + "grad_norm": 0.3483756788167636, + "learning_rate": 1.2103671939120603e-06, + "loss": 0.0205, + "step": 7171 + }, + { + "epoch": 3.185431934266045, + "grad_norm": 0.43627278086001237, + "learning_rate": 1.2091028939243372e-06, + "loss": 0.0348, + "step": 7172 + }, + { + "epoch": 3.185876082611592, + "grad_norm": 0.49521087150532456, + "learning_rate": 1.207839163772845e-06, + "loss": 0.0312, + "step": 7173 + }, + { + "epoch": 3.18632023095714, + "grad_norm": 0.4856268760629859, + "learning_rate": 1.206576003647545e-06, + "loss": 0.02, + "step": 7174 + }, + { + "epoch": 3.186764379302687, + "grad_norm": 0.39637861480749204, + "learning_rate": 1.2053134137383082e-06, + "loss": 0.0255, + "step": 7175 + }, + { + "epoch": 3.1872085276482345, + "grad_norm": 0.43819489132093886, + "learning_rate": 1.2040513942349285e-06, + "loss": 0.0197, + "step": 7176 + }, + { + "epoch": 3.1876526759937818, + "grad_norm": 0.4216663259343465, + "learning_rate": 1.2027899453271046e-06, + "loss": 0.023, + "step": 7177 + }, + { + "epoch": 3.1880968243393295, + "grad_norm": 0.36509628207338957, + "learning_rate": 1.2015290672044555e-06, + "loss": 0.0213, + "step": 7178 + }, + { + "epoch": 3.188540972684877, + "grad_norm": 0.5214855632257979, + "learning_rate": 1.2002687600565138e-06, + "loss": 0.0326, + "step": 7179 + }, + { + "epoch": 3.188985121030424, + "grad_norm": 0.4141851354865393, + "learning_rate": 1.199009024072722e-06, + "loss": 0.0224, + "step": 7180 + }, + { + "epoch": 3.1894292693759714, + "grad_norm": 0.33059616966272004, + "learning_rate": 1.1977498594424404e-06, + "loss": 0.0197, + "step": 7181 + }, + { + "epoch": 3.189873417721519, + "grad_norm": 0.5492371676948207, + "learning_rate": 1.196491266354946e-06, + "loss": 0.0251, + "step": 7182 + }, + { + "epoch": 3.1903175660670664, + "grad_norm": 0.4321665285793292, + "learning_rate": 1.1952332449994236e-06, + "loss": 0.0254, + "step": 7183 + }, + { + "epoch": 3.1907617144126137, + "grad_norm": 0.5421889351399802, + "learning_rate": 1.1939757955649762e-06, + "loss": 0.0205, + "step": 7184 + }, + { + "epoch": 3.1912058627581614, + "grad_norm": 0.3637124855912809, + "learning_rate": 1.1927189182406207e-06, + "loss": 0.028, + "step": 7185 + }, + { + "epoch": 3.1916500111037087, + "grad_norm": 0.4272124427935157, + "learning_rate": 1.191462613215284e-06, + "loss": 0.0245, + "step": 7186 + }, + { + "epoch": 3.192094159449256, + "grad_norm": 0.5695918619211989, + "learning_rate": 1.190206880677815e-06, + "loss": 0.0275, + "step": 7187 + }, + { + "epoch": 3.1925383077948033, + "grad_norm": 0.47921802084665843, + "learning_rate": 1.188951720816967e-06, + "loss": 0.0327, + "step": 7188 + }, + { + "epoch": 3.192982456140351, + "grad_norm": 0.47676512936038173, + "learning_rate": 1.1876971338214144e-06, + "loss": 0.0237, + "step": 7189 + }, + { + "epoch": 3.1934266044858983, + "grad_norm": 0.32414238199829704, + "learning_rate": 1.1864431198797433e-06, + "loss": 0.0223, + "step": 7190 + }, + { + "epoch": 3.1938707528314456, + "grad_norm": 0.37243806607221175, + "learning_rate": 1.1851896791804507e-06, + "loss": 0.0226, + "step": 7191 + }, + { + "epoch": 3.194314901176993, + "grad_norm": 0.45790234199301505, + "learning_rate": 1.1839368119119504e-06, + "loss": 0.0366, + "step": 7192 + }, + { + "epoch": 3.1947590495225406, + "grad_norm": 0.3980186449229834, + "learning_rate": 1.182684518262574e-06, + "loss": 0.0285, + "step": 7193 + }, + { + "epoch": 3.195203197868088, + "grad_norm": 0.37521542650727085, + "learning_rate": 1.1814327984205576e-06, + "loss": 0.0262, + "step": 7194 + }, + { + "epoch": 3.195647346213635, + "grad_norm": 0.5161650488172221, + "learning_rate": 1.1801816525740578e-06, + "loss": 0.0314, + "step": 7195 + }, + { + "epoch": 3.196091494559183, + "grad_norm": 0.4308857816197324, + "learning_rate": 1.1789310809111444e-06, + "loss": 0.0214, + "step": 7196 + }, + { + "epoch": 3.1965356429047302, + "grad_norm": 0.34195463841205975, + "learning_rate": 1.1776810836197965e-06, + "loss": 0.0148, + "step": 7197 + }, + { + "epoch": 3.1969797912502775, + "grad_norm": 0.40193187911181305, + "learning_rate": 1.1764316608879122e-06, + "loss": 0.0246, + "step": 7198 + }, + { + "epoch": 3.197423939595825, + "grad_norm": 0.45090018760439604, + "learning_rate": 1.1751828129033e-06, + "loss": 0.0332, + "step": 7199 + }, + { + "epoch": 3.1978680879413726, + "grad_norm": 0.45570245011519855, + "learning_rate": 1.1739345398536834e-06, + "loss": 0.0287, + "step": 7200 + }, + { + "epoch": 3.19831223628692, + "grad_norm": 0.3951478023748843, + "learning_rate": 1.1726868419266985e-06, + "loss": 0.022, + "step": 7201 + }, + { + "epoch": 3.198756384632467, + "grad_norm": 0.45157628259237137, + "learning_rate": 1.1714397193098975e-06, + "loss": 0.0254, + "step": 7202 + }, + { + "epoch": 3.199200532978015, + "grad_norm": 0.577449890397031, + "learning_rate": 1.1701931721907417e-06, + "loss": 0.0475, + "step": 7203 + }, + { + "epoch": 3.199644681323562, + "grad_norm": 0.3377142483011571, + "learning_rate": 1.1689472007566082e-06, + "loss": 0.0233, + "step": 7204 + }, + { + "epoch": 3.2000888296691095, + "grad_norm": 0.5632120326062124, + "learning_rate": 1.1677018051947898e-06, + "loss": 0.0306, + "step": 7205 + }, + { + "epoch": 3.2005329780146567, + "grad_norm": 0.3154600517639766, + "learning_rate": 1.1664569856924885e-06, + "loss": 0.0185, + "step": 7206 + }, + { + "epoch": 3.2009771263602045, + "grad_norm": 0.3451810701202185, + "learning_rate": 1.1652127424368248e-06, + "loss": 0.0188, + "step": 7207 + }, + { + "epoch": 3.2014212747057518, + "grad_norm": 0.4315345094850551, + "learning_rate": 1.1639690756148258e-06, + "loss": 0.0247, + "step": 7208 + }, + { + "epoch": 3.201865423051299, + "grad_norm": 0.46106139760465975, + "learning_rate": 1.162725985413436e-06, + "loss": 0.0373, + "step": 7209 + }, + { + "epoch": 3.2023095713968464, + "grad_norm": 0.35725860550074096, + "learning_rate": 1.1614834720195173e-06, + "loss": 0.0225, + "step": 7210 + }, + { + "epoch": 3.202753719742394, + "grad_norm": 0.4457350062298839, + "learning_rate": 1.1602415356198366e-06, + "loss": 0.0257, + "step": 7211 + }, + { + "epoch": 3.2031978680879414, + "grad_norm": 0.414436238208724, + "learning_rate": 1.1590001764010795e-06, + "loss": 0.0233, + "step": 7212 + }, + { + "epoch": 3.2036420164334887, + "grad_norm": 0.5552444496185882, + "learning_rate": 1.1577593945498439e-06, + "loss": 0.0267, + "step": 7213 + }, + { + "epoch": 3.2040861647790364, + "grad_norm": 0.9572529892460224, + "learning_rate": 1.156519190252638e-06, + "loss": 0.0208, + "step": 7214 + }, + { + "epoch": 3.2045303131245837, + "grad_norm": 0.4000797957812149, + "learning_rate": 1.1552795636958874e-06, + "loss": 0.021, + "step": 7215 + }, + { + "epoch": 3.204974461470131, + "grad_norm": 0.44923949008034364, + "learning_rate": 1.154040515065929e-06, + "loss": 0.0257, + "step": 7216 + }, + { + "epoch": 3.2054186098156783, + "grad_norm": 0.3620392807393453, + "learning_rate": 1.1528020445490122e-06, + "loss": 0.0168, + "step": 7217 + }, + { + "epoch": 3.205862758161226, + "grad_norm": 0.40007534369795833, + "learning_rate": 1.1515641523313026e-06, + "loss": 0.019, + "step": 7218 + }, + { + "epoch": 3.2063069065067733, + "grad_norm": 0.48298838416688933, + "learning_rate": 1.1503268385988726e-06, + "loss": 0.0289, + "step": 7219 + }, + { + "epoch": 3.2067510548523206, + "grad_norm": 0.42002715971416216, + "learning_rate": 1.1490901035377127e-06, + "loss": 0.0218, + "step": 7220 + }, + { + "epoch": 3.207195203197868, + "grad_norm": 0.3286114150705751, + "learning_rate": 1.147853947333727e-06, + "loss": 0.0197, + "step": 7221 + }, + { + "epoch": 3.2076393515434156, + "grad_norm": 0.4609222221897965, + "learning_rate": 1.1466183701727285e-06, + "loss": 0.0324, + "step": 7222 + }, + { + "epoch": 3.208083499888963, + "grad_norm": 0.685447809015932, + "learning_rate": 1.1453833722404467e-06, + "loss": 0.0324, + "step": 7223 + }, + { + "epoch": 3.20852764823451, + "grad_norm": 0.3436945045836064, + "learning_rate": 1.1441489537225242e-06, + "loss": 0.0157, + "step": 7224 + }, + { + "epoch": 3.208971796580058, + "grad_norm": 0.5032138454546828, + "learning_rate": 1.142915114804512e-06, + "loss": 0.0305, + "step": 7225 + }, + { + "epoch": 3.2094159449256052, + "grad_norm": 0.4731526252440794, + "learning_rate": 1.1416818556718766e-06, + "loss": 0.0213, + "step": 7226 + }, + { + "epoch": 3.2098600932711525, + "grad_norm": 0.4103989387772799, + "learning_rate": 1.1404491765100028e-06, + "loss": 0.0233, + "step": 7227 + }, + { + "epoch": 3.2103042416167, + "grad_norm": 0.4494816164510247, + "learning_rate": 1.1392170775041788e-06, + "loss": 0.0297, + "step": 7228 + }, + { + "epoch": 3.2107483899622475, + "grad_norm": 0.49371762049111106, + "learning_rate": 1.1379855588396111e-06, + "loss": 0.0352, + "step": 7229 + }, + { + "epoch": 3.211192538307795, + "grad_norm": 0.5061710158808476, + "learning_rate": 1.1367546207014197e-06, + "loss": 0.029, + "step": 7230 + }, + { + "epoch": 3.211636686653342, + "grad_norm": 0.8145152873713564, + "learning_rate": 1.1355242632746322e-06, + "loss": 0.0262, + "step": 7231 + }, + { + "epoch": 3.21208083499889, + "grad_norm": 0.3714708449077717, + "learning_rate": 1.134294486744194e-06, + "loss": 0.0195, + "step": 7232 + }, + { + "epoch": 3.212524983344437, + "grad_norm": 0.3746783694058875, + "learning_rate": 1.1330652912949614e-06, + "loss": 0.0315, + "step": 7233 + }, + { + "epoch": 3.2129691316899844, + "grad_norm": 0.3552761658812224, + "learning_rate": 1.131836677111703e-06, + "loss": 0.0195, + "step": 7234 + }, + { + "epoch": 3.2134132800355317, + "grad_norm": 0.425007652814607, + "learning_rate": 1.130608644379102e-06, + "loss": 0.025, + "step": 7235 + }, + { + "epoch": 3.2138574283810795, + "grad_norm": 0.43005151453900625, + "learning_rate": 1.12938119328175e-06, + "loss": 0.0209, + "step": 7236 + }, + { + "epoch": 3.2143015767266268, + "grad_norm": 0.4595931551885754, + "learning_rate": 1.1281543240041553e-06, + "loss": 0.0299, + "step": 7237 + }, + { + "epoch": 3.214745725072174, + "grad_norm": 0.46371024210057155, + "learning_rate": 1.1269280367307366e-06, + "loss": 0.0323, + "step": 7238 + }, + { + "epoch": 3.2151898734177213, + "grad_norm": 0.6313936018053397, + "learning_rate": 1.125702331645826e-06, + "loss": 0.045, + "step": 7239 + }, + { + "epoch": 3.215634021763269, + "grad_norm": 0.4072252776090633, + "learning_rate": 1.1244772089336676e-06, + "loss": 0.0201, + "step": 7240 + }, + { + "epoch": 3.2160781701088164, + "grad_norm": 0.4916818117136901, + "learning_rate": 1.1232526687784196e-06, + "loss": 0.0321, + "step": 7241 + }, + { + "epoch": 3.2165223184543636, + "grad_norm": 0.4020576743291894, + "learning_rate": 1.1220287113641487e-06, + "loss": 0.0235, + "step": 7242 + }, + { + "epoch": 3.216966466799911, + "grad_norm": 0.3816616945761622, + "learning_rate": 1.1208053368748379e-06, + "loss": 0.0195, + "step": 7243 + }, + { + "epoch": 3.2174106151454587, + "grad_norm": 0.7096021175300753, + "learning_rate": 1.1195825454943805e-06, + "loss": 0.0277, + "step": 7244 + }, + { + "epoch": 3.217854763491006, + "grad_norm": 0.4537143505718799, + "learning_rate": 1.1183603374065832e-06, + "loss": 0.0317, + "step": 7245 + }, + { + "epoch": 3.2182989118365533, + "grad_norm": 0.3124829853038914, + "learning_rate": 1.1171387127951667e-06, + "loss": 0.0134, + "step": 7246 + }, + { + "epoch": 3.218743060182101, + "grad_norm": 0.3523041765470259, + "learning_rate": 1.1159176718437581e-06, + "loss": 0.0197, + "step": 7247 + }, + { + "epoch": 3.2191872085276483, + "grad_norm": 0.46781406748141774, + "learning_rate": 1.114697214735903e-06, + "loss": 0.03, + "step": 7248 + }, + { + "epoch": 3.2196313568731956, + "grad_norm": 0.45497498562705085, + "learning_rate": 1.113477341655056e-06, + "loss": 0.0326, + "step": 7249 + }, + { + "epoch": 3.220075505218743, + "grad_norm": 0.34355725383797536, + "learning_rate": 1.1122580527845844e-06, + "loss": 0.0199, + "step": 7250 + }, + { + "epoch": 3.2205196535642906, + "grad_norm": 0.4222581331896639, + "learning_rate": 1.1110393483077697e-06, + "loss": 0.0248, + "step": 7251 + }, + { + "epoch": 3.220963801909838, + "grad_norm": 0.3466800241459592, + "learning_rate": 1.1098212284078037e-06, + "loss": 0.0262, + "step": 7252 + }, + { + "epoch": 3.221407950255385, + "grad_norm": 0.4157855377746249, + "learning_rate": 1.108603693267788e-06, + "loss": 0.0256, + "step": 7253 + }, + { + "epoch": 3.221852098600933, + "grad_norm": 0.383389548238552, + "learning_rate": 1.1073867430707409e-06, + "loss": 0.024, + "step": 7254 + }, + { + "epoch": 3.22229624694648, + "grad_norm": 0.30999218839566933, + "learning_rate": 1.1061703779995903e-06, + "loss": 0.0132, + "step": 7255 + }, + { + "epoch": 3.2227403952920275, + "grad_norm": 0.4028830849965787, + "learning_rate": 1.1049545982371763e-06, + "loss": 0.0237, + "step": 7256 + }, + { + "epoch": 3.223184543637575, + "grad_norm": 0.399211499707528, + "learning_rate": 1.1037394039662514e-06, + "loss": 0.025, + "step": 7257 + }, + { + "epoch": 3.2236286919831225, + "grad_norm": 0.4718629067386526, + "learning_rate": 1.1025247953694812e-06, + "loss": 0.022, + "step": 7258 + }, + { + "epoch": 3.22407284032867, + "grad_norm": 0.37912161530644367, + "learning_rate": 1.1013107726294398e-06, + "loss": 0.0218, + "step": 7259 + }, + { + "epoch": 3.224516988674217, + "grad_norm": 0.34533958399869225, + "learning_rate": 1.100097335928616e-06, + "loss": 0.0165, + "step": 7260 + }, + { + "epoch": 3.224961137019765, + "grad_norm": 0.5551950864778356, + "learning_rate": 1.0988844854494108e-06, + "loss": 0.0351, + "step": 7261 + }, + { + "epoch": 3.225405285365312, + "grad_norm": 0.5024454449508828, + "learning_rate": 1.0976722213741353e-06, + "loss": 0.0264, + "step": 7262 + }, + { + "epoch": 3.2258494337108594, + "grad_norm": 0.44064209832820683, + "learning_rate": 1.0964605438850157e-06, + "loss": 0.0277, + "step": 7263 + }, + { + "epoch": 3.2262935820564067, + "grad_norm": 0.3886834550548472, + "learning_rate": 1.0952494531641845e-06, + "loss": 0.0254, + "step": 7264 + }, + { + "epoch": 3.2267377304019544, + "grad_norm": 0.44369145821666817, + "learning_rate": 1.0940389493936903e-06, + "loss": 0.0305, + "step": 7265 + }, + { + "epoch": 3.2271818787475017, + "grad_norm": 0.5083410379311883, + "learning_rate": 1.092829032755493e-06, + "loss": 0.0234, + "step": 7266 + }, + { + "epoch": 3.227626027093049, + "grad_norm": 0.5641232827648482, + "learning_rate": 1.091619703431463e-06, + "loss": 0.033, + "step": 7267 + }, + { + "epoch": 3.2280701754385963, + "grad_norm": 0.49450571188792775, + "learning_rate": 1.0904109616033837e-06, + "loss": 0.0259, + "step": 7268 + }, + { + "epoch": 3.228514323784144, + "grad_norm": 0.40475539158874035, + "learning_rate": 1.0892028074529504e-06, + "loss": 0.0304, + "step": 7269 + }, + { + "epoch": 3.2289584721296913, + "grad_norm": 0.3822868070952512, + "learning_rate": 1.0879952411617668e-06, + "loss": 0.0258, + "step": 7270 + }, + { + "epoch": 3.2294026204752386, + "grad_norm": 0.32534348009223135, + "learning_rate": 1.0867882629113512e-06, + "loss": 0.0165, + "step": 7271 + }, + { + "epoch": 3.229846768820786, + "grad_norm": 0.44997009638741614, + "learning_rate": 1.085581872883134e-06, + "loss": 0.0288, + "step": 7272 + }, + { + "epoch": 3.2302909171663337, + "grad_norm": 0.3775740023239124, + "learning_rate": 1.0843760712584557e-06, + "loss": 0.0191, + "step": 7273 + }, + { + "epoch": 3.230735065511881, + "grad_norm": 0.36873567439006616, + "learning_rate": 1.0831708582185684e-06, + "loss": 0.0215, + "step": 7274 + }, + { + "epoch": 3.2311792138574282, + "grad_norm": 0.41830159166514674, + "learning_rate": 1.081966233944638e-06, + "loss": 0.0349, + "step": 7275 + }, + { + "epoch": 3.231623362202976, + "grad_norm": 0.3474261761721994, + "learning_rate": 1.0807621986177369e-06, + "loss": 0.0197, + "step": 7276 + }, + { + "epoch": 3.2320675105485233, + "grad_norm": 0.4726466217876324, + "learning_rate": 1.0795587524188532e-06, + "loss": 0.0257, + "step": 7277 + }, + { + "epoch": 3.2325116588940705, + "grad_norm": 0.5853092492971105, + "learning_rate": 1.0783558955288864e-06, + "loss": 0.033, + "step": 7278 + }, + { + "epoch": 3.232955807239618, + "grad_norm": 0.4076799037255477, + "learning_rate": 1.0771536281286454e-06, + "loss": 0.0251, + "step": 7279 + }, + { + "epoch": 3.2333999555851656, + "grad_norm": 0.3975030901670652, + "learning_rate": 1.0759519503988525e-06, + "loss": 0.0223, + "step": 7280 + }, + { + "epoch": 3.233844103930713, + "grad_norm": 0.40051231781743457, + "learning_rate": 1.0747508625201387e-06, + "loss": 0.0247, + "step": 7281 + }, + { + "epoch": 3.23428825227626, + "grad_norm": 0.44390849962202206, + "learning_rate": 1.0735503646730483e-06, + "loss": 0.0281, + "step": 7282 + }, + { + "epoch": 3.234732400621808, + "grad_norm": 0.4052200206226338, + "learning_rate": 1.0723504570380367e-06, + "loss": 0.0286, + "step": 7283 + }, + { + "epoch": 3.235176548967355, + "grad_norm": 0.37902885647278484, + "learning_rate": 1.0711511397954706e-06, + "loss": 0.0205, + "step": 7284 + }, + { + "epoch": 3.2356206973129025, + "grad_norm": 0.3395460769637531, + "learning_rate": 1.0699524131256273e-06, + "loss": 0.0201, + "step": 7285 + }, + { + "epoch": 3.2360648456584498, + "grad_norm": 0.3790836643740825, + "learning_rate": 1.0687542772086978e-06, + "loss": 0.0213, + "step": 7286 + }, + { + "epoch": 3.2365089940039975, + "grad_norm": 0.3344270224539923, + "learning_rate": 1.0675567322247794e-06, + "loss": 0.0188, + "step": 7287 + }, + { + "epoch": 3.236953142349545, + "grad_norm": 0.49193821672434174, + "learning_rate": 1.0663597783538843e-06, + "loss": 0.0258, + "step": 7288 + }, + { + "epoch": 3.237397290695092, + "grad_norm": 0.40685832395170973, + "learning_rate": 1.0651634157759361e-06, + "loss": 0.0266, + "step": 7289 + }, + { + "epoch": 3.2378414390406394, + "grad_norm": 0.4090407200808684, + "learning_rate": 1.063967644670767e-06, + "loss": 0.0264, + "step": 7290 + }, + { + "epoch": 3.238285587386187, + "grad_norm": 0.3883951968082906, + "learning_rate": 1.0627724652181237e-06, + "loss": 0.033, + "step": 7291 + }, + { + "epoch": 3.2387297357317344, + "grad_norm": 0.43888578494036506, + "learning_rate": 1.06157787759766e-06, + "loss": 0.0291, + "step": 7292 + }, + { + "epoch": 3.2391738840772817, + "grad_norm": 0.5336092270346524, + "learning_rate": 1.0603838819889429e-06, + "loss": 0.031, + "step": 7293 + }, + { + "epoch": 3.2396180324228294, + "grad_norm": 0.4556708038719475, + "learning_rate": 1.0591904785714507e-06, + "loss": 0.0189, + "step": 7294 + }, + { + "epoch": 3.2400621807683767, + "grad_norm": 0.42245566909222265, + "learning_rate": 1.0579976675245724e-06, + "loss": 0.0241, + "step": 7295 + }, + { + "epoch": 3.240506329113924, + "grad_norm": 0.4528542240509129, + "learning_rate": 1.0568054490276075e-06, + "loss": 0.0251, + "step": 7296 + }, + { + "epoch": 3.2409504774594713, + "grad_norm": 0.4541061461339724, + "learning_rate": 1.0556138232597684e-06, + "loss": 0.0201, + "step": 7297 + }, + { + "epoch": 3.241394625805019, + "grad_norm": 0.3790673698790023, + "learning_rate": 1.054422790400173e-06, + "loss": 0.019, + "step": 7298 + }, + { + "epoch": 3.2418387741505663, + "grad_norm": 0.41427572592093276, + "learning_rate": 1.0532323506278564e-06, + "loss": 0.0302, + "step": 7299 + }, + { + "epoch": 3.2422829224961136, + "grad_norm": 0.4164805864455333, + "learning_rate": 1.0520425041217613e-06, + "loss": 0.0374, + "step": 7300 + }, + { + "epoch": 3.242727070841661, + "grad_norm": 0.5061586285788595, + "learning_rate": 1.0508532510607421e-06, + "loss": 0.0369, + "step": 7301 + }, + { + "epoch": 3.2431712191872086, + "grad_norm": 0.51039712429684, + "learning_rate": 1.049664591623563e-06, + "loss": 0.0256, + "step": 7302 + }, + { + "epoch": 3.243615367532756, + "grad_norm": 0.3147661792422783, + "learning_rate": 1.0484765259889024e-06, + "loss": 0.0178, + "step": 7303 + }, + { + "epoch": 3.244059515878303, + "grad_norm": 0.34967275161446504, + "learning_rate": 1.0472890543353425e-06, + "loss": 0.022, + "step": 7304 + }, + { + "epoch": 3.244503664223851, + "grad_norm": 0.425569161792133, + "learning_rate": 1.0461021768413827e-06, + "loss": 0.0327, + "step": 7305 + }, + { + "epoch": 3.2449478125693982, + "grad_norm": 0.3260603432721134, + "learning_rate": 1.0449158936854308e-06, + "loss": 0.019, + "step": 7306 + }, + { + "epoch": 3.2453919609149455, + "grad_norm": 0.40232295144807395, + "learning_rate": 1.0437302050458053e-06, + "loss": 0.0343, + "step": 7307 + }, + { + "epoch": 3.245836109260493, + "grad_norm": 0.435190410566104, + "learning_rate": 1.0425451111007368e-06, + "loss": 0.0371, + "step": 7308 + }, + { + "epoch": 3.2462802576060406, + "grad_norm": 0.36886684345005827, + "learning_rate": 1.0413606120283616e-06, + "loss": 0.0159, + "step": 7309 + }, + { + "epoch": 3.246724405951588, + "grad_norm": 0.6185604106742186, + "learning_rate": 1.040176708006732e-06, + "loss": 0.0317, + "step": 7310 + }, + { + "epoch": 3.247168554297135, + "grad_norm": 0.4287929879516384, + "learning_rate": 1.0389933992138106e-06, + "loss": 0.0285, + "step": 7311 + }, + { + "epoch": 3.247612702642683, + "grad_norm": 0.5172218558312743, + "learning_rate": 1.0378106858274639e-06, + "loss": 0.0269, + "step": 7312 + }, + { + "epoch": 3.24805685098823, + "grad_norm": 0.47500334875701583, + "learning_rate": 1.036628568025479e-06, + "loss": 0.0291, + "step": 7313 + }, + { + "epoch": 3.2485009993337775, + "grad_norm": 0.36315045000352464, + "learning_rate": 1.035447045985547e-06, + "loss": 0.0205, + "step": 7314 + }, + { + "epoch": 3.2489451476793247, + "grad_norm": 0.3702839335927883, + "learning_rate": 1.0342661198852689e-06, + "loss": 0.0272, + "step": 7315 + }, + { + "epoch": 3.2493892960248725, + "grad_norm": 0.4172702979013527, + "learning_rate": 1.0330857899021584e-06, + "loss": 0.0288, + "step": 7316 + }, + { + "epoch": 3.2498334443704198, + "grad_norm": 0.4003396546345466, + "learning_rate": 1.03190605621364e-06, + "loss": 0.0249, + "step": 7317 + }, + { + "epoch": 3.250277592715967, + "grad_norm": 0.36419349273758006, + "learning_rate": 1.0307269189970482e-06, + "loss": 0.0227, + "step": 7318 + }, + { + "epoch": 3.250721741061515, + "grad_norm": 0.46337961532186456, + "learning_rate": 1.0295483784296274e-06, + "loss": 0.0292, + "step": 7319 + }, + { + "epoch": 3.251165889407062, + "grad_norm": 0.4844395758839405, + "learning_rate": 1.0283704346885303e-06, + "loss": 0.0315, + "step": 7320 + }, + { + "epoch": 3.2516100377526094, + "grad_norm": 0.43388879742046443, + "learning_rate": 1.027193087950823e-06, + "loss": 0.0253, + "step": 7321 + }, + { + "epoch": 3.2520541860981567, + "grad_norm": 0.43253571684648157, + "learning_rate": 1.0260163383934807e-06, + "loss": 0.0292, + "step": 7322 + }, + { + "epoch": 3.252498334443704, + "grad_norm": 0.37106610809103213, + "learning_rate": 1.0248401861933888e-06, + "loss": 0.0208, + "step": 7323 + }, + { + "epoch": 3.2529424827892517, + "grad_norm": 0.35259048739158694, + "learning_rate": 1.0236646315273436e-06, + "loss": 0.0223, + "step": 7324 + }, + { + "epoch": 3.253386631134799, + "grad_norm": 0.5151899180549023, + "learning_rate": 1.0224896745720513e-06, + "loss": 0.0313, + "step": 7325 + }, + { + "epoch": 3.2538307794803463, + "grad_norm": 0.4063218873557718, + "learning_rate": 1.0213153155041255e-06, + "loss": 0.0291, + "step": 7326 + }, + { + "epoch": 3.254274927825894, + "grad_norm": 0.6251543341151611, + "learning_rate": 1.0201415545000941e-06, + "loss": 0.0267, + "step": 7327 + }, + { + "epoch": 3.2547190761714413, + "grad_norm": 0.4171212598426182, + "learning_rate": 1.0189683917363947e-06, + "loss": 0.0227, + "step": 7328 + }, + { + "epoch": 3.2551632245169886, + "grad_norm": 0.49598266444833977, + "learning_rate": 1.0177958273893684e-06, + "loss": 0.0239, + "step": 7329 + }, + { + "epoch": 3.255607372862536, + "grad_norm": 0.3899965486282789, + "learning_rate": 1.016623861635277e-06, + "loss": 0.0209, + "step": 7330 + }, + { + "epoch": 3.2560515212080836, + "grad_norm": 0.43687191924955027, + "learning_rate": 1.0154524946502864e-06, + "loss": 0.0272, + "step": 7331 + }, + { + "epoch": 3.256495669553631, + "grad_norm": 0.4063747702825569, + "learning_rate": 1.01428172661047e-06, + "loss": 0.0255, + "step": 7332 + }, + { + "epoch": 3.256939817899178, + "grad_norm": 0.48223387404929335, + "learning_rate": 1.0131115576918154e-06, + "loss": 0.031, + "step": 7333 + }, + { + "epoch": 3.257383966244726, + "grad_norm": 0.39589428565081675, + "learning_rate": 1.011941988070219e-06, + "loss": 0.0222, + "step": 7334 + }, + { + "epoch": 3.257828114590273, + "grad_norm": 0.4100082037653643, + "learning_rate": 1.0107730179214875e-06, + "loss": 0.0359, + "step": 7335 + }, + { + "epoch": 3.2582722629358205, + "grad_norm": 0.7048819567321716, + "learning_rate": 1.0096046474213378e-06, + "loss": 0.0239, + "step": 7336 + }, + { + "epoch": 3.258716411281368, + "grad_norm": 0.4205899532427439, + "learning_rate": 1.008436876745393e-06, + "loss": 0.0329, + "step": 7337 + }, + { + "epoch": 3.2591605596269155, + "grad_norm": 0.3747110714323454, + "learning_rate": 1.00726970606919e-06, + "loss": 0.0217, + "step": 7338 + }, + { + "epoch": 3.259604707972463, + "grad_norm": 0.4970634544493451, + "learning_rate": 1.0061031355681766e-06, + "loss": 0.029, + "step": 7339 + }, + { + "epoch": 3.26004885631801, + "grad_norm": 0.35455430411205807, + "learning_rate": 1.0049371654177036e-06, + "loss": 0.0241, + "step": 7340 + }, + { + "epoch": 3.260493004663558, + "grad_norm": 0.38458418543297923, + "learning_rate": 1.0037717957930404e-06, + "loss": 0.0264, + "step": 7341 + }, + { + "epoch": 3.260937153009105, + "grad_norm": 0.44567315711458705, + "learning_rate": 1.0026070268693616e-06, + "loss": 0.033, + "step": 7342 + }, + { + "epoch": 3.2613813013546524, + "grad_norm": 0.6191592999982571, + "learning_rate": 1.0014428588217495e-06, + "loss": 0.0433, + "step": 7343 + }, + { + "epoch": 3.2618254497001997, + "grad_norm": 0.31811859198013076, + "learning_rate": 1.0002792918251991e-06, + "loss": 0.0178, + "step": 7344 + }, + { + "epoch": 3.2622695980457475, + "grad_norm": 0.4500830212917085, + "learning_rate": 9.991163260546154e-07, + "loss": 0.0283, + "step": 7345 + }, + { + "epoch": 3.2627137463912947, + "grad_norm": 0.45351163208898665, + "learning_rate": 9.979539616848088e-07, + "loss": 0.0347, + "step": 7346 + }, + { + "epoch": 3.263157894736842, + "grad_norm": 0.35335672660967254, + "learning_rate": 9.96792198890506e-07, + "loss": 0.0188, + "step": 7347 + }, + { + "epoch": 3.2636020430823893, + "grad_norm": 0.44507187430087025, + "learning_rate": 9.956310378463397e-07, + "loss": 0.028, + "step": 7348 + }, + { + "epoch": 3.264046191427937, + "grad_norm": 0.5621390992373146, + "learning_rate": 9.94470478726849e-07, + "loss": 0.0318, + "step": 7349 + }, + { + "epoch": 3.2644903397734844, + "grad_norm": 0.4752468135015428, + "learning_rate": 9.933105217064876e-07, + "loss": 0.0301, + "step": 7350 + }, + { + "epoch": 3.2649344881190316, + "grad_norm": 0.4866383052323718, + "learning_rate": 9.921511669596169e-07, + "loss": 0.0258, + "step": 7351 + }, + { + "epoch": 3.265378636464579, + "grad_norm": 0.4858680526762395, + "learning_rate": 9.909924146605065e-07, + "loss": 0.0337, + "step": 7352 + }, + { + "epoch": 3.2658227848101267, + "grad_norm": 0.39515608234465965, + "learning_rate": 9.898342649833392e-07, + "loss": 0.0192, + "step": 7353 + }, + { + "epoch": 3.266266933155674, + "grad_norm": 0.36506881604203717, + "learning_rate": 9.88676718102201e-07, + "loss": 0.021, + "step": 7354 + }, + { + "epoch": 3.2667110815012212, + "grad_norm": 0.4187792232437751, + "learning_rate": 9.87519774191093e-07, + "loss": 0.0331, + "step": 7355 + }, + { + "epoch": 3.267155229846769, + "grad_norm": 0.46579982125230246, + "learning_rate": 9.863634334239241e-07, + "loss": 0.0231, + "step": 7356 + }, + { + "epoch": 3.2675993781923163, + "grad_norm": 0.38694241705258164, + "learning_rate": 9.852076959745082e-07, + "loss": 0.0259, + "step": 7357 + }, + { + "epoch": 3.2680435265378636, + "grad_norm": 0.46212859105639015, + "learning_rate": 9.840525620165763e-07, + "loss": 0.0317, + "step": 7358 + }, + { + "epoch": 3.268487674883411, + "grad_norm": 0.5473555001130638, + "learning_rate": 9.828980317237652e-07, + "loss": 0.0396, + "step": 7359 + }, + { + "epoch": 3.2689318232289586, + "grad_norm": 0.36341029826160076, + "learning_rate": 9.817441052696164e-07, + "loss": 0.0185, + "step": 7360 + }, + { + "epoch": 3.269375971574506, + "grad_norm": 0.44926471264353585, + "learning_rate": 9.805907828275874e-07, + "loss": 0.0315, + "step": 7361 + }, + { + "epoch": 3.269820119920053, + "grad_norm": 0.3017207630293574, + "learning_rate": 9.794380645710428e-07, + "loss": 0.017, + "step": 7362 + }, + { + "epoch": 3.270264268265601, + "grad_norm": 0.4066309090272054, + "learning_rate": 9.782859506732517e-07, + "loss": 0.0252, + "step": 7363 + }, + { + "epoch": 3.270708416611148, + "grad_norm": 0.45822398666676095, + "learning_rate": 9.771344413074018e-07, + "loss": 0.0289, + "step": 7364 + }, + { + "epoch": 3.2711525649566955, + "grad_norm": 0.45754692936117375, + "learning_rate": 9.75983536646581e-07, + "loss": 0.0204, + "step": 7365 + }, + { + "epoch": 3.2715967133022428, + "grad_norm": 0.4003872658949929, + "learning_rate": 9.748332368637903e-07, + "loss": 0.0239, + "step": 7366 + }, + { + "epoch": 3.2720408616477905, + "grad_norm": 0.40491759770498775, + "learning_rate": 9.736835421319397e-07, + "loss": 0.016, + "step": 7367 + }, + { + "epoch": 3.272485009993338, + "grad_norm": 0.40672999266918314, + "learning_rate": 9.725344526238495e-07, + "loss": 0.0275, + "step": 7368 + }, + { + "epoch": 3.272929158338885, + "grad_norm": 0.4720762654089572, + "learning_rate": 9.713859685122428e-07, + "loss": 0.0267, + "step": 7369 + }, + { + "epoch": 3.273373306684433, + "grad_norm": 0.41020011208140905, + "learning_rate": 9.702380899697621e-07, + "loss": 0.0231, + "step": 7370 + }, + { + "epoch": 3.27381745502998, + "grad_norm": 0.51306227756855, + "learning_rate": 9.69090817168949e-07, + "loss": 0.0213, + "step": 7371 + }, + { + "epoch": 3.2742616033755274, + "grad_norm": 0.5514386921409069, + "learning_rate": 9.67944150282259e-07, + "loss": 0.0388, + "step": 7372 + }, + { + "epoch": 3.2747057517210747, + "grad_norm": 0.47876201361463355, + "learning_rate": 9.667980894820572e-07, + "loss": 0.0402, + "step": 7373 + }, + { + "epoch": 3.2751499000666224, + "grad_norm": 0.35596410172080417, + "learning_rate": 9.65652634940612e-07, + "loss": 0.0211, + "step": 7374 + }, + { + "epoch": 3.2755940484121697, + "grad_norm": 0.5330920908251411, + "learning_rate": 9.64507786830109e-07, + "loss": 0.0312, + "step": 7375 + }, + { + "epoch": 3.276038196757717, + "grad_norm": 0.44331071434194597, + "learning_rate": 9.633635453226376e-07, + "loss": 0.0241, + "step": 7376 + }, + { + "epoch": 3.2764823451032643, + "grad_norm": 0.4178851244614469, + "learning_rate": 9.622199105901947e-07, + "loss": 0.0251, + "step": 7377 + }, + { + "epoch": 3.276926493448812, + "grad_norm": 0.4925055411838903, + "learning_rate": 9.610768828046891e-07, + "loss": 0.0312, + "step": 7378 + }, + { + "epoch": 3.2773706417943593, + "grad_norm": 0.3674042441682116, + "learning_rate": 9.59934462137938e-07, + "loss": 0.0176, + "step": 7379 + }, + { + "epoch": 3.2778147901399066, + "grad_norm": 0.41837211074854475, + "learning_rate": 9.58792648761664e-07, + "loss": 0.0272, + "step": 7380 + }, + { + "epoch": 3.278258938485454, + "grad_norm": 0.35983457178846384, + "learning_rate": 9.576514428475058e-07, + "loss": 0.0182, + "step": 7381 + }, + { + "epoch": 3.2787030868310016, + "grad_norm": 0.5213483263717448, + "learning_rate": 9.565108445670013e-07, + "loss": 0.0285, + "step": 7382 + }, + { + "epoch": 3.279147235176549, + "grad_norm": 0.3798413510246776, + "learning_rate": 9.55370854091604e-07, + "loss": 0.0199, + "step": 7383 + }, + { + "epoch": 3.2795913835220962, + "grad_norm": 0.40316029556208016, + "learning_rate": 9.542314715926753e-07, + "loss": 0.023, + "step": 7384 + }, + { + "epoch": 3.280035531867644, + "grad_norm": 0.3279127936611918, + "learning_rate": 9.5309269724148e-07, + "loss": 0.0197, + "step": 7385 + }, + { + "epoch": 3.2804796802131913, + "grad_norm": 0.3077370020937381, + "learning_rate": 9.519545312091966e-07, + "loss": 0.0186, + "step": 7386 + }, + { + "epoch": 3.2809238285587385, + "grad_norm": 0.29637415446133053, + "learning_rate": 9.508169736669137e-07, + "loss": 0.0149, + "step": 7387 + }, + { + "epoch": 3.281367976904286, + "grad_norm": 0.36806133901057364, + "learning_rate": 9.496800247856219e-07, + "loss": 0.0255, + "step": 7388 + }, + { + "epoch": 3.2818121252498336, + "grad_norm": 0.42318209672526275, + "learning_rate": 9.485436847362257e-07, + "loss": 0.0368, + "step": 7389 + }, + { + "epoch": 3.282256273595381, + "grad_norm": 0.4481117630581104, + "learning_rate": 9.474079536895365e-07, + "loss": 0.0323, + "step": 7390 + }, + { + "epoch": 3.282700421940928, + "grad_norm": 0.4094546536057865, + "learning_rate": 9.462728318162712e-07, + "loss": 0.0216, + "step": 7391 + }, + { + "epoch": 3.283144570286476, + "grad_norm": 0.4833214117317346, + "learning_rate": 9.451383192870623e-07, + "loss": 0.0314, + "step": 7392 + }, + { + "epoch": 3.283588718632023, + "grad_norm": 0.49172490663115703, + "learning_rate": 9.440044162724432e-07, + "loss": 0.0229, + "step": 7393 + }, + { + "epoch": 3.2840328669775705, + "grad_norm": 0.3837206768061979, + "learning_rate": 9.428711229428594e-07, + "loss": 0.0281, + "step": 7394 + }, + { + "epoch": 3.2844770153231178, + "grad_norm": 0.38763854772417794, + "learning_rate": 9.417384394686646e-07, + "loss": 0.0256, + "step": 7395 + }, + { + "epoch": 3.2849211636686655, + "grad_norm": 0.4292993654748122, + "learning_rate": 9.406063660201214e-07, + "loss": 0.0178, + "step": 7396 + }, + { + "epoch": 3.285365312014213, + "grad_norm": 0.4608324783734576, + "learning_rate": 9.394749027673955e-07, + "loss": 0.03, + "step": 7397 + }, + { + "epoch": 3.28580946035976, + "grad_norm": 0.4116770755294882, + "learning_rate": 9.383440498805712e-07, + "loss": 0.0266, + "step": 7398 + }, + { + "epoch": 3.286253608705308, + "grad_norm": 0.4151574100082846, + "learning_rate": 9.3721380752963e-07, + "loss": 0.0316, + "step": 7399 + }, + { + "epoch": 3.286697757050855, + "grad_norm": 0.5201578859027275, + "learning_rate": 9.36084175884468e-07, + "loss": 0.0273, + "step": 7400 + }, + { + "epoch": 3.2871419053964024, + "grad_norm": 0.517408046194756, + "learning_rate": 9.3495515511489e-07, + "loss": 0.039, + "step": 7401 + }, + { + "epoch": 3.2875860537419497, + "grad_norm": 0.4065347285711147, + "learning_rate": 9.338267453906036e-07, + "loss": 0.0193, + "step": 7402 + }, + { + "epoch": 3.2880302020874974, + "grad_norm": 0.35637207302672474, + "learning_rate": 9.326989468812281e-07, + "loss": 0.0252, + "step": 7403 + }, + { + "epoch": 3.2884743504330447, + "grad_norm": 0.3942806613898594, + "learning_rate": 9.315717597562951e-07, + "loss": 0.0224, + "step": 7404 + }, + { + "epoch": 3.288918498778592, + "grad_norm": 0.38601773480071766, + "learning_rate": 9.304451841852358e-07, + "loss": 0.0291, + "step": 7405 + }, + { + "epoch": 3.2893626471241393, + "grad_norm": 0.4412874940460958, + "learning_rate": 9.293192203373952e-07, + "loss": 0.0223, + "step": 7406 + }, + { + "epoch": 3.289806795469687, + "grad_norm": 0.36047506716707345, + "learning_rate": 9.281938683820258e-07, + "loss": 0.0196, + "step": 7407 + }, + { + "epoch": 3.2902509438152343, + "grad_norm": 0.38105181354876544, + "learning_rate": 9.270691284882826e-07, + "loss": 0.0215, + "step": 7408 + }, + { + "epoch": 3.2906950921607816, + "grad_norm": 0.5134105812079635, + "learning_rate": 9.259450008252396e-07, + "loss": 0.0407, + "step": 7409 + }, + { + "epoch": 3.291139240506329, + "grad_norm": 0.3562398576839381, + "learning_rate": 9.248214855618676e-07, + "loss": 0.0267, + "step": 7410 + }, + { + "epoch": 3.2915833888518766, + "grad_norm": 0.40663195402797286, + "learning_rate": 9.236985828670519e-07, + "loss": 0.0187, + "step": 7411 + }, + { + "epoch": 3.292027537197424, + "grad_norm": 0.7855995128015365, + "learning_rate": 9.225762929095844e-07, + "loss": 0.0372, + "step": 7412 + }, + { + "epoch": 3.292471685542971, + "grad_norm": 0.4097302898181189, + "learning_rate": 9.214546158581622e-07, + "loss": 0.0252, + "step": 7413 + }, + { + "epoch": 3.292915833888519, + "grad_norm": 0.4179153487359494, + "learning_rate": 9.203335518813922e-07, + "loss": 0.0285, + "step": 7414 + }, + { + "epoch": 3.2933599822340662, + "grad_norm": 0.5431971211223403, + "learning_rate": 9.192131011477934e-07, + "loss": 0.0303, + "step": 7415 + }, + { + "epoch": 3.2938041305796135, + "grad_norm": 0.5065980372967815, + "learning_rate": 9.180932638257845e-07, + "loss": 0.028, + "step": 7416 + }, + { + "epoch": 3.294248278925161, + "grad_norm": 0.38822134135830527, + "learning_rate": 9.169740400836974e-07, + "loss": 0.029, + "step": 7417 + }, + { + "epoch": 3.2946924272707085, + "grad_norm": 0.3332433458833639, + "learning_rate": 9.158554300897727e-07, + "loss": 0.0174, + "step": 7418 + }, + { + "epoch": 3.295136575616256, + "grad_norm": 0.3544713822711631, + "learning_rate": 9.147374340121523e-07, + "loss": 0.022, + "step": 7419 + }, + { + "epoch": 3.295580723961803, + "grad_norm": 0.32602497933389224, + "learning_rate": 9.13620052018892e-07, + "loss": 0.0205, + "step": 7420 + }, + { + "epoch": 3.296024872307351, + "grad_norm": 0.45839560888184433, + "learning_rate": 9.125032842779535e-07, + "loss": 0.0285, + "step": 7421 + }, + { + "epoch": 3.296469020652898, + "grad_norm": 0.4451229775575657, + "learning_rate": 9.113871309572059e-07, + "loss": 0.0314, + "step": 7422 + }, + { + "epoch": 3.2969131689984454, + "grad_norm": 0.4294843373554615, + "learning_rate": 9.10271592224426e-07, + "loss": 0.0279, + "step": 7423 + }, + { + "epoch": 3.2973573173439927, + "grad_norm": 0.49025642020365773, + "learning_rate": 9.091566682472991e-07, + "loss": 0.0319, + "step": 7424 + }, + { + "epoch": 3.2978014656895405, + "grad_norm": 0.39446663575000246, + "learning_rate": 9.08042359193414e-07, + "loss": 0.0264, + "step": 7425 + }, + { + "epoch": 3.2982456140350878, + "grad_norm": 0.3073338148065069, + "learning_rate": 9.06928665230275e-07, + "loss": 0.022, + "step": 7426 + }, + { + "epoch": 3.298689762380635, + "grad_norm": 0.5363424487989449, + "learning_rate": 9.058155865252854e-07, + "loss": 0.0369, + "step": 7427 + }, + { + "epoch": 3.299133910726183, + "grad_norm": 0.4241449124561232, + "learning_rate": 9.047031232457609e-07, + "loss": 0.0284, + "step": 7428 + }, + { + "epoch": 3.29957805907173, + "grad_norm": 0.5039477021882938, + "learning_rate": 9.035912755589254e-07, + "loss": 0.0296, + "step": 7429 + }, + { + "epoch": 3.3000222074172774, + "grad_norm": 0.6325313716948877, + "learning_rate": 9.024800436319059e-07, + "loss": 0.0356, + "step": 7430 + }, + { + "epoch": 3.3004663557628247, + "grad_norm": 0.35326550695737485, + "learning_rate": 9.013694276317392e-07, + "loss": 0.0293, + "step": 7431 + }, + { + "epoch": 3.3009105041083724, + "grad_norm": 0.3943172052588995, + "learning_rate": 9.002594277253735e-07, + "loss": 0.0256, + "step": 7432 + }, + { + "epoch": 3.3013546524539197, + "grad_norm": 0.3559437962740476, + "learning_rate": 8.991500440796569e-07, + "loss": 0.0261, + "step": 7433 + }, + { + "epoch": 3.301798800799467, + "grad_norm": 0.3905214750787488, + "learning_rate": 8.9804127686135e-07, + "loss": 0.0166, + "step": 7434 + }, + { + "epoch": 3.3022429491450143, + "grad_norm": 0.5100939910295724, + "learning_rate": 8.969331262371206e-07, + "loss": 0.0305, + "step": 7435 + }, + { + "epoch": 3.302687097490562, + "grad_norm": 0.4081830294338292, + "learning_rate": 8.958255923735404e-07, + "loss": 0.0229, + "step": 7436 + }, + { + "epoch": 3.3031312458361093, + "grad_norm": 0.41704083656049834, + "learning_rate": 8.947186754370907e-07, + "loss": 0.0281, + "step": 7437 + }, + { + "epoch": 3.3035753941816566, + "grad_norm": 0.4789522896492798, + "learning_rate": 8.936123755941611e-07, + "loss": 0.0309, + "step": 7438 + }, + { + "epoch": 3.304019542527204, + "grad_norm": 0.5252768558443547, + "learning_rate": 8.925066930110465e-07, + "loss": 0.0265, + "step": 7439 + }, + { + "epoch": 3.3044636908727516, + "grad_norm": 0.34510316412181224, + "learning_rate": 8.914016278539516e-07, + "loss": 0.019, + "step": 7440 + }, + { + "epoch": 3.304907839218299, + "grad_norm": 0.5857701871825775, + "learning_rate": 8.902971802889832e-07, + "loss": 0.0387, + "step": 7441 + }, + { + "epoch": 3.305351987563846, + "grad_norm": 0.5033019496461394, + "learning_rate": 8.891933504821604e-07, + "loss": 0.0275, + "step": 7442 + }, + { + "epoch": 3.305796135909394, + "grad_norm": 0.3999151889282477, + "learning_rate": 8.880901385994079e-07, + "loss": 0.0233, + "step": 7443 + }, + { + "epoch": 3.306240284254941, + "grad_norm": 0.45097536087731294, + "learning_rate": 8.869875448065563e-07, + "loss": 0.0281, + "step": 7444 + }, + { + "epoch": 3.3066844326004885, + "grad_norm": 0.4146176937666059, + "learning_rate": 8.858855692693446e-07, + "loss": 0.0294, + "step": 7445 + }, + { + "epoch": 3.307128580946036, + "grad_norm": 0.32341587117342246, + "learning_rate": 8.847842121534195e-07, + "loss": 0.0254, + "step": 7446 + }, + { + "epoch": 3.3075727292915835, + "grad_norm": 0.3794842748021583, + "learning_rate": 8.836834736243316e-07, + "loss": 0.0219, + "step": 7447 + }, + { + "epoch": 3.308016877637131, + "grad_norm": 0.5092740919087244, + "learning_rate": 8.825833538475403e-07, + "loss": 0.0279, + "step": 7448 + }, + { + "epoch": 3.308461025982678, + "grad_norm": 0.3833821462873475, + "learning_rate": 8.814838529884162e-07, + "loss": 0.0263, + "step": 7449 + }, + { + "epoch": 3.308905174328226, + "grad_norm": 0.7887453397226605, + "learning_rate": 8.803849712122292e-07, + "loss": 0.0208, + "step": 7450 + }, + { + "epoch": 3.309349322673773, + "grad_norm": 0.3907962831950114, + "learning_rate": 8.792867086841605e-07, + "loss": 0.0206, + "step": 7451 + }, + { + "epoch": 3.3097934710193204, + "grad_norm": 0.3760419094116477, + "learning_rate": 8.781890655692998e-07, + "loss": 0.022, + "step": 7452 + }, + { + "epoch": 3.3102376193648677, + "grad_norm": 0.32255052704787457, + "learning_rate": 8.770920420326384e-07, + "loss": 0.0181, + "step": 7453 + }, + { + "epoch": 3.3106817677104154, + "grad_norm": 0.6578667511681319, + "learning_rate": 8.759956382390794e-07, + "loss": 0.0254, + "step": 7454 + }, + { + "epoch": 3.3111259160559627, + "grad_norm": 0.42423201463173366, + "learning_rate": 8.748998543534304e-07, + "loss": 0.0331, + "step": 7455 + }, + { + "epoch": 3.31157006440151, + "grad_norm": 0.4695388522211957, + "learning_rate": 8.738046905404069e-07, + "loss": 0.04, + "step": 7456 + }, + { + "epoch": 3.3120142127470578, + "grad_norm": 0.488606567568898, + "learning_rate": 8.72710146964631e-07, + "loss": 0.0274, + "step": 7457 + }, + { + "epoch": 3.312458361092605, + "grad_norm": 0.5406502377024667, + "learning_rate": 8.716162237906289e-07, + "loss": 0.028, + "step": 7458 + }, + { + "epoch": 3.3129025094381523, + "grad_norm": 0.5699097094725243, + "learning_rate": 8.705229211828376e-07, + "loss": 0.03, + "step": 7459 + }, + { + "epoch": 3.3133466577836996, + "grad_norm": 0.44717354800350295, + "learning_rate": 8.694302393055992e-07, + "loss": 0.0208, + "step": 7460 + }, + { + "epoch": 3.313790806129247, + "grad_norm": 0.4253321381991377, + "learning_rate": 8.683381783231615e-07, + "loss": 0.0234, + "step": 7461 + }, + { + "epoch": 3.3142349544747947, + "grad_norm": 0.3858599803822746, + "learning_rate": 8.672467383996802e-07, + "loss": 0.0281, + "step": 7462 + }, + { + "epoch": 3.314679102820342, + "grad_norm": 1.0071452454502907, + "learning_rate": 8.661559196992186e-07, + "loss": 0.029, + "step": 7463 + }, + { + "epoch": 3.3151232511658892, + "grad_norm": 0.5059569170522437, + "learning_rate": 8.650657223857428e-07, + "loss": 0.0304, + "step": 7464 + }, + { + "epoch": 3.315567399511437, + "grad_norm": 0.4605836239110675, + "learning_rate": 8.639761466231294e-07, + "loss": 0.0374, + "step": 7465 + }, + { + "epoch": 3.3160115478569843, + "grad_norm": 0.5663222302112452, + "learning_rate": 8.628871925751598e-07, + "loss": 0.029, + "step": 7466 + }, + { + "epoch": 3.3164556962025316, + "grad_norm": 0.4770838107638845, + "learning_rate": 8.617988604055222e-07, + "loss": 0.029, + "step": 7467 + }, + { + "epoch": 3.316899844548079, + "grad_norm": 0.5186866939183644, + "learning_rate": 8.607111502778121e-07, + "loss": 0.0367, + "step": 7468 + }, + { + "epoch": 3.3173439928936266, + "grad_norm": 0.43011652502356673, + "learning_rate": 8.596240623555313e-07, + "loss": 0.0276, + "step": 7469 + }, + { + "epoch": 3.317788141239174, + "grad_norm": 0.39270558043084164, + "learning_rate": 8.585375968020854e-07, + "loss": 0.0244, + "step": 7470 + }, + { + "epoch": 3.318232289584721, + "grad_norm": 0.3922390336468883, + "learning_rate": 8.574517537807897e-07, + "loss": 0.0247, + "step": 7471 + }, + { + "epoch": 3.318676437930269, + "grad_norm": 0.4905605263072924, + "learning_rate": 8.563665334548654e-07, + "loss": 0.0281, + "step": 7472 + }, + { + "epoch": 3.319120586275816, + "grad_norm": 0.4695814471855208, + "learning_rate": 8.552819359874387e-07, + "loss": 0.0222, + "step": 7473 + }, + { + "epoch": 3.3195647346213635, + "grad_norm": 0.49486185686147216, + "learning_rate": 8.541979615415446e-07, + "loss": 0.0304, + "step": 7474 + }, + { + "epoch": 3.3200088829669108, + "grad_norm": 0.40290781496927697, + "learning_rate": 8.531146102801208e-07, + "loss": 0.0221, + "step": 7475 + }, + { + "epoch": 3.3204530313124585, + "grad_norm": 0.45538047230870293, + "learning_rate": 8.520318823660146e-07, + "loss": 0.0257, + "step": 7476 + }, + { + "epoch": 3.320897179658006, + "grad_norm": 0.4425830829496296, + "learning_rate": 8.50949777961978e-07, + "loss": 0.027, + "step": 7477 + }, + { + "epoch": 3.321341328003553, + "grad_norm": 0.32164145024703766, + "learning_rate": 8.498682972306693e-07, + "loss": 0.0171, + "step": 7478 + }, + { + "epoch": 3.321785476349101, + "grad_norm": 0.33243355641729805, + "learning_rate": 8.487874403346547e-07, + "loss": 0.0171, + "step": 7479 + }, + { + "epoch": 3.322229624694648, + "grad_norm": 0.4774020513789674, + "learning_rate": 8.477072074364051e-07, + "loss": 0.0242, + "step": 7480 + }, + { + "epoch": 3.3226737730401954, + "grad_norm": 0.7842160045286404, + "learning_rate": 8.466275986982963e-07, + "loss": 0.0482, + "step": 7481 + }, + { + "epoch": 3.3231179213857427, + "grad_norm": 0.4462293368851325, + "learning_rate": 8.455486142826135e-07, + "loss": 0.0206, + "step": 7482 + }, + { + "epoch": 3.3235620697312904, + "grad_norm": 0.3371098785501344, + "learning_rate": 8.444702543515454e-07, + "loss": 0.016, + "step": 7483 + }, + { + "epoch": 3.3240062180768377, + "grad_norm": 0.4689818865427122, + "learning_rate": 8.433925190671876e-07, + "loss": 0.0294, + "step": 7484 + }, + { + "epoch": 3.324450366422385, + "grad_norm": 0.46126294870782475, + "learning_rate": 8.423154085915447e-07, + "loss": 0.0405, + "step": 7485 + }, + { + "epoch": 3.3248945147679323, + "grad_norm": 0.4269127171229397, + "learning_rate": 8.412389230865209e-07, + "loss": 0.0215, + "step": 7486 + }, + { + "epoch": 3.32533866311348, + "grad_norm": 0.3601691103633931, + "learning_rate": 8.401630627139317e-07, + "loss": 0.0283, + "step": 7487 + }, + { + "epoch": 3.3257828114590273, + "grad_norm": 0.3308167717075761, + "learning_rate": 8.39087827635498e-07, + "loss": 0.0203, + "step": 7488 + }, + { + "epoch": 3.3262269598045746, + "grad_norm": 0.4504744849786426, + "learning_rate": 8.380132180128453e-07, + "loss": 0.0308, + "step": 7489 + }, + { + "epoch": 3.326671108150122, + "grad_norm": 0.43274682572551426, + "learning_rate": 8.369392340075056e-07, + "loss": 0.0192, + "step": 7490 + }, + { + "epoch": 3.3271152564956696, + "grad_norm": 0.38752988514350534, + "learning_rate": 8.358658757809179e-07, + "loss": 0.0326, + "step": 7491 + }, + { + "epoch": 3.327559404841217, + "grad_norm": 0.3966193669552188, + "learning_rate": 8.347931434944245e-07, + "loss": 0.0218, + "step": 7492 + }, + { + "epoch": 3.328003553186764, + "grad_norm": 0.37005902190632034, + "learning_rate": 8.337210373092763e-07, + "loss": 0.02, + "step": 7493 + }, + { + "epoch": 3.328447701532312, + "grad_norm": 0.3982123833521082, + "learning_rate": 8.326495573866284e-07, + "loss": 0.0216, + "step": 7494 + }, + { + "epoch": 3.3288918498778592, + "grad_norm": 0.41678676657338065, + "learning_rate": 8.315787038875434e-07, + "loss": 0.0284, + "step": 7495 + }, + { + "epoch": 3.3293359982234065, + "grad_norm": 0.36841312927749376, + "learning_rate": 8.305084769729882e-07, + "loss": 0.0294, + "step": 7496 + }, + { + "epoch": 3.329780146568954, + "grad_norm": 0.42365099250088206, + "learning_rate": 8.294388768038375e-07, + "loss": 0.0294, + "step": 7497 + }, + { + "epoch": 3.3302242949145016, + "grad_norm": 0.40673867597662916, + "learning_rate": 8.283699035408677e-07, + "loss": 0.0235, + "step": 7498 + }, + { + "epoch": 3.330668443260049, + "grad_norm": 0.40817718954362553, + "learning_rate": 8.273015573447646e-07, + "loss": 0.0229, + "step": 7499 + }, + { + "epoch": 3.331112591605596, + "grad_norm": 0.47791402955813106, + "learning_rate": 8.262338383761199e-07, + "loss": 0.0306, + "step": 7500 + }, + { + "epoch": 3.331556739951144, + "grad_norm": 0.3730312585666912, + "learning_rate": 8.251667467954289e-07, + "loss": 0.0241, + "step": 7501 + }, + { + "epoch": 3.332000888296691, + "grad_norm": 0.5006016285563061, + "learning_rate": 8.241002827630945e-07, + "loss": 0.0284, + "step": 7502 + }, + { + "epoch": 3.3324450366422385, + "grad_norm": 0.5039257261200133, + "learning_rate": 8.230344464394236e-07, + "loss": 0.0234, + "step": 7503 + }, + { + "epoch": 3.3328891849877857, + "grad_norm": 0.38046904572274926, + "learning_rate": 8.219692379846289e-07, + "loss": 0.033, + "step": 7504 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.3757296330009624, + "learning_rate": 8.209046575588303e-07, + "loss": 0.0291, + "step": 7505 + }, + { + "epoch": 3.3337774816788808, + "grad_norm": 0.380415197644764, + "learning_rate": 8.198407053220519e-07, + "loss": 0.0219, + "step": 7506 + }, + { + "epoch": 3.334221630024428, + "grad_norm": 0.39647003849848267, + "learning_rate": 8.187773814342242e-07, + "loss": 0.0307, + "step": 7507 + }, + { + "epoch": 3.334665778369976, + "grad_norm": 0.48671927289549416, + "learning_rate": 8.177146860551838e-07, + "loss": 0.0289, + "step": 7508 + }, + { + "epoch": 3.335109926715523, + "grad_norm": 0.4169191245545791, + "learning_rate": 8.166526193446695e-07, + "loss": 0.0286, + "step": 7509 + }, + { + "epoch": 3.3355540750610704, + "grad_norm": 0.6209631358998482, + "learning_rate": 8.155911814623291e-07, + "loss": 0.0268, + "step": 7510 + }, + { + "epoch": 3.3359982234066177, + "grad_norm": 0.4569957343588699, + "learning_rate": 8.145303725677145e-07, + "loss": 0.025, + "step": 7511 + }, + { + "epoch": 3.3364423717521654, + "grad_norm": 0.45123275529280976, + "learning_rate": 8.134701928202843e-07, + "loss": 0.0207, + "step": 7512 + }, + { + "epoch": 3.3368865200977127, + "grad_norm": 0.3644402406540846, + "learning_rate": 8.124106423794015e-07, + "loss": 0.0156, + "step": 7513 + }, + { + "epoch": 3.33733066844326, + "grad_norm": 0.6935267382582291, + "learning_rate": 8.113517214043326e-07, + "loss": 0.042, + "step": 7514 + }, + { + "epoch": 3.3377748167888073, + "grad_norm": 0.40069923978491, + "learning_rate": 8.102934300542531e-07, + "loss": 0.0282, + "step": 7515 + }, + { + "epoch": 3.338218965134355, + "grad_norm": 0.4259846167113701, + "learning_rate": 8.092357684882413e-07, + "loss": 0.0289, + "step": 7516 + }, + { + "epoch": 3.3386631134799023, + "grad_norm": 0.4192011063222261, + "learning_rate": 8.081787368652822e-07, + "loss": 0.023, + "step": 7517 + }, + { + "epoch": 3.3391072618254496, + "grad_norm": 0.35804521930200645, + "learning_rate": 8.071223353442658e-07, + "loss": 0.0294, + "step": 7518 + }, + { + "epoch": 3.339551410170997, + "grad_norm": 0.3873707899082061, + "learning_rate": 8.060665640839882e-07, + "loss": 0.0228, + "step": 7519 + }, + { + "epoch": 3.3399955585165446, + "grad_norm": 0.33653268211511883, + "learning_rate": 8.050114232431472e-07, + "loss": 0.0229, + "step": 7520 + }, + { + "epoch": 3.340439706862092, + "grad_norm": 0.4424670871666316, + "learning_rate": 8.039569129803493e-07, + "loss": 0.0273, + "step": 7521 + }, + { + "epoch": 3.340883855207639, + "grad_norm": 0.36754810698365975, + "learning_rate": 8.029030334541061e-07, + "loss": 0.0212, + "step": 7522 + }, + { + "epoch": 3.341328003553187, + "grad_norm": 0.40011975008051315, + "learning_rate": 8.01849784822833e-07, + "loss": 0.0213, + "step": 7523 + }, + { + "epoch": 3.3417721518987342, + "grad_norm": 0.4091440045335287, + "learning_rate": 8.007971672448511e-07, + "loss": 0.0326, + "step": 7524 + }, + { + "epoch": 3.3422163002442815, + "grad_norm": 0.3979067624482782, + "learning_rate": 7.997451808783884e-07, + "loss": 0.0292, + "step": 7525 + }, + { + "epoch": 3.342660448589829, + "grad_norm": 0.4350141704240594, + "learning_rate": 7.986938258815741e-07, + "loss": 0.0294, + "step": 7526 + }, + { + "epoch": 3.3431045969353765, + "grad_norm": 0.3288238940788284, + "learning_rate": 7.976431024124448e-07, + "loss": 0.02, + "step": 7527 + }, + { + "epoch": 3.343548745280924, + "grad_norm": 0.40499639306856394, + "learning_rate": 7.965930106289432e-07, + "loss": 0.0249, + "step": 7528 + }, + { + "epoch": 3.343992893626471, + "grad_norm": 0.4511936733531736, + "learning_rate": 7.955435506889154e-07, + "loss": 0.0225, + "step": 7529 + }, + { + "epoch": 3.344437041972019, + "grad_norm": 0.32996474391647085, + "learning_rate": 7.944947227501143e-07, + "loss": 0.019, + "step": 7530 + }, + { + "epoch": 3.344881190317566, + "grad_norm": 0.33989523301533825, + "learning_rate": 7.934465269701941e-07, + "loss": 0.0281, + "step": 7531 + }, + { + "epoch": 3.3453253386631134, + "grad_norm": 0.5424696702981813, + "learning_rate": 7.923989635067181e-07, + "loss": 0.0341, + "step": 7532 + }, + { + "epoch": 3.3457694870086607, + "grad_norm": 0.5354455170131127, + "learning_rate": 7.913520325171537e-07, + "loss": 0.0332, + "step": 7533 + }, + { + "epoch": 3.3462136353542085, + "grad_norm": 0.5188210769342587, + "learning_rate": 7.903057341588683e-07, + "loss": 0.0346, + "step": 7534 + }, + { + "epoch": 3.3466577836997558, + "grad_norm": 0.32324444187136564, + "learning_rate": 7.892600685891433e-07, + "loss": 0.0202, + "step": 7535 + }, + { + "epoch": 3.347101932045303, + "grad_norm": 0.38834278141098566, + "learning_rate": 7.882150359651586e-07, + "loss": 0.0233, + "step": 7536 + }, + { + "epoch": 3.3475460803908508, + "grad_norm": 0.44587329880583815, + "learning_rate": 7.871706364439985e-07, + "loss": 0.0285, + "step": 7537 + }, + { + "epoch": 3.347990228736398, + "grad_norm": 0.38089139300391106, + "learning_rate": 7.861268701826552e-07, + "loss": 0.0264, + "step": 7538 + }, + { + "epoch": 3.3484343770819454, + "grad_norm": 0.3825436500451749, + "learning_rate": 7.850837373380244e-07, + "loss": 0.0206, + "step": 7539 + }, + { + "epoch": 3.3488785254274926, + "grad_norm": 0.3827903172923659, + "learning_rate": 7.840412380669071e-07, + "loss": 0.0221, + "step": 7540 + }, + { + "epoch": 3.3493226737730404, + "grad_norm": 0.38625911378052546, + "learning_rate": 7.829993725260082e-07, + "loss": 0.0192, + "step": 7541 + }, + { + "epoch": 3.3497668221185877, + "grad_norm": 0.37721835784709495, + "learning_rate": 7.81958140871939e-07, + "loss": 0.0291, + "step": 7542 + }, + { + "epoch": 3.350210970464135, + "grad_norm": 0.39810907293172026, + "learning_rate": 7.809175432612126e-07, + "loss": 0.0239, + "step": 7543 + }, + { + "epoch": 3.3506551188096823, + "grad_norm": 1.1010267945006187, + "learning_rate": 7.798775798502484e-07, + "loss": 0.0314, + "step": 7544 + }, + { + "epoch": 3.35109926715523, + "grad_norm": 0.4353628491116733, + "learning_rate": 7.788382507953718e-07, + "loss": 0.022, + "step": 7545 + }, + { + "epoch": 3.3515434155007773, + "grad_norm": 0.392235080799356, + "learning_rate": 7.777995562528107e-07, + "loss": 0.0187, + "step": 7546 + }, + { + "epoch": 3.3519875638463246, + "grad_norm": 0.4931466449389555, + "learning_rate": 7.767614963787007e-07, + "loss": 0.0176, + "step": 7547 + }, + { + "epoch": 3.352431712191872, + "grad_norm": 0.39853671183474193, + "learning_rate": 7.757240713290764e-07, + "loss": 0.0259, + "step": 7548 + }, + { + "epoch": 3.3528758605374196, + "grad_norm": 0.4566981980981673, + "learning_rate": 7.746872812598821e-07, + "loss": 0.0255, + "step": 7549 + }, + { + "epoch": 3.353320008882967, + "grad_norm": 0.4075307870306547, + "learning_rate": 7.736511263269664e-07, + "loss": 0.0259, + "step": 7550 + }, + { + "epoch": 3.353764157228514, + "grad_norm": 0.44508580022829525, + "learning_rate": 7.726156066860769e-07, + "loss": 0.0221, + "step": 7551 + }, + { + "epoch": 3.354208305574062, + "grad_norm": 0.5633393812829065, + "learning_rate": 7.715807224928734e-07, + "loss": 0.0295, + "step": 7552 + }, + { + "epoch": 3.354652453919609, + "grad_norm": 0.4540973249501273, + "learning_rate": 7.705464739029172e-07, + "loss": 0.0262, + "step": 7553 + }, + { + "epoch": 3.3550966022651565, + "grad_norm": 0.5423667462933897, + "learning_rate": 7.695128610716707e-07, + "loss": 0.0373, + "step": 7554 + }, + { + "epoch": 3.355540750610704, + "grad_norm": 0.41533591822222976, + "learning_rate": 7.684798841545043e-07, + "loss": 0.02, + "step": 7555 + }, + { + "epoch": 3.3559848989562515, + "grad_norm": 0.37206716629030406, + "learning_rate": 7.674475433066925e-07, + "loss": 0.027, + "step": 7556 + }, + { + "epoch": 3.356429047301799, + "grad_norm": 0.489116896622887, + "learning_rate": 7.664158386834131e-07, + "loss": 0.0276, + "step": 7557 + }, + { + "epoch": 3.356873195647346, + "grad_norm": 0.3415873437320046, + "learning_rate": 7.653847704397504e-07, + "loss": 0.0199, + "step": 7558 + }, + { + "epoch": 3.357317343992894, + "grad_norm": 0.4918207209258021, + "learning_rate": 7.643543387306896e-07, + "loss": 0.0232, + "step": 7559 + }, + { + "epoch": 3.357761492338441, + "grad_norm": 0.4284932846117368, + "learning_rate": 7.63324543711122e-07, + "loss": 0.0304, + "step": 7560 + }, + { + "epoch": 3.3582056406839884, + "grad_norm": 0.3697104694086724, + "learning_rate": 7.622953855358456e-07, + "loss": 0.0206, + "step": 7561 + }, + { + "epoch": 3.3586497890295357, + "grad_norm": 0.3548831840036858, + "learning_rate": 7.612668643595561e-07, + "loss": 0.0241, + "step": 7562 + }, + { + "epoch": 3.3590939373750834, + "grad_norm": 0.3814303904037155, + "learning_rate": 7.60238980336862e-07, + "loss": 0.0285, + "step": 7563 + }, + { + "epoch": 3.3595380857206307, + "grad_norm": 0.40972342736425, + "learning_rate": 7.592117336222709e-07, + "loss": 0.0231, + "step": 7564 + }, + { + "epoch": 3.359982234066178, + "grad_norm": 0.4313904313100148, + "learning_rate": 7.581851243701938e-07, + "loss": 0.0272, + "step": 7565 + }, + { + "epoch": 3.3604263824117258, + "grad_norm": 0.42249063117216307, + "learning_rate": 7.571591527349481e-07, + "loss": 0.0245, + "step": 7566 + }, + { + "epoch": 3.360870530757273, + "grad_norm": 0.6077788918648797, + "learning_rate": 7.561338188707562e-07, + "loss": 0.0245, + "step": 7567 + }, + { + "epoch": 3.3613146791028203, + "grad_norm": 0.3504403283983759, + "learning_rate": 7.551091229317398e-07, + "loss": 0.0233, + "step": 7568 + }, + { + "epoch": 3.3617588274483676, + "grad_norm": 0.5216233787292206, + "learning_rate": 7.540850650719317e-07, + "loss": 0.0323, + "step": 7569 + }, + { + "epoch": 3.3622029757939154, + "grad_norm": 0.350140151698931, + "learning_rate": 7.530616454452644e-07, + "loss": 0.0177, + "step": 7570 + }, + { + "epoch": 3.3626471241394627, + "grad_norm": 0.4407132954191557, + "learning_rate": 7.520388642055737e-07, + "loss": 0.0315, + "step": 7571 + }, + { + "epoch": 3.36309127248501, + "grad_norm": 0.39660867426600405, + "learning_rate": 7.510167215066022e-07, + "loss": 0.0219, + "step": 7572 + }, + { + "epoch": 3.3635354208305572, + "grad_norm": 0.4580507155342704, + "learning_rate": 7.499952175019947e-07, + "loss": 0.0289, + "step": 7573 + }, + { + "epoch": 3.363979569176105, + "grad_norm": 0.3682289326212527, + "learning_rate": 7.489743523453013e-07, + "loss": 0.0239, + "step": 7574 + }, + { + "epoch": 3.3644237175216523, + "grad_norm": 0.488168671743432, + "learning_rate": 7.479541261899758e-07, + "loss": 0.0257, + "step": 7575 + }, + { + "epoch": 3.3648678658671995, + "grad_norm": 0.5131286442085937, + "learning_rate": 7.469345391893739e-07, + "loss": 0.0251, + "step": 7576 + }, + { + "epoch": 3.365312014212747, + "grad_norm": 0.5086234654496984, + "learning_rate": 7.459155914967581e-07, + "loss": 0.0312, + "step": 7577 + }, + { + "epoch": 3.3657561625582946, + "grad_norm": 1.2159009232304825, + "learning_rate": 7.448972832652939e-07, + "loss": 0.034, + "step": 7578 + }, + { + "epoch": 3.366200310903842, + "grad_norm": 0.42264429800124254, + "learning_rate": 7.438796146480471e-07, + "loss": 0.0366, + "step": 7579 + }, + { + "epoch": 3.366644459249389, + "grad_norm": 0.474511735916579, + "learning_rate": 7.428625857979943e-07, + "loss": 0.0282, + "step": 7580 + }, + { + "epoch": 3.367088607594937, + "grad_norm": 0.48689953790784196, + "learning_rate": 7.418461968680124e-07, + "loss": 0.0303, + "step": 7581 + }, + { + "epoch": 3.367532755940484, + "grad_norm": 0.34637453409423774, + "learning_rate": 7.408304480108791e-07, + "loss": 0.0213, + "step": 7582 + }, + { + "epoch": 3.3679769042860315, + "grad_norm": 0.4489470189672078, + "learning_rate": 7.398153393792801e-07, + "loss": 0.0314, + "step": 7583 + }, + { + "epoch": 3.3684210526315788, + "grad_norm": 0.38625164102595216, + "learning_rate": 7.388008711258049e-07, + "loss": 0.0205, + "step": 7584 + }, + { + "epoch": 3.3688652009771265, + "grad_norm": 0.38974030926942727, + "learning_rate": 7.37787043402941e-07, + "loss": 0.0263, + "step": 7585 + }, + { + "epoch": 3.369309349322674, + "grad_norm": 0.43774082705869743, + "learning_rate": 7.367738563630894e-07, + "loss": 0.0218, + "step": 7586 + }, + { + "epoch": 3.369753497668221, + "grad_norm": 0.6108985899563969, + "learning_rate": 7.357613101585459e-07, + "loss": 0.0311, + "step": 7587 + }, + { + "epoch": 3.370197646013769, + "grad_norm": 0.46815029644071443, + "learning_rate": 7.347494049415139e-07, + "loss": 0.031, + "step": 7588 + }, + { + "epoch": 3.370641794359316, + "grad_norm": 0.38759059737412377, + "learning_rate": 7.337381408641004e-07, + "loss": 0.0201, + "step": 7589 + }, + { + "epoch": 3.3710859427048634, + "grad_norm": 0.44352496378452017, + "learning_rate": 7.327275180783156e-07, + "loss": 0.0321, + "step": 7590 + }, + { + "epoch": 3.3715300910504107, + "grad_norm": 0.48865244874942404, + "learning_rate": 7.317175367360729e-07, + "loss": 0.024, + "step": 7591 + }, + { + "epoch": 3.3719742393959584, + "grad_norm": 0.4173965077203295, + "learning_rate": 7.30708196989191e-07, + "loss": 0.0208, + "step": 7592 + }, + { + "epoch": 3.3724183877415057, + "grad_norm": 0.4555470735195906, + "learning_rate": 7.296994989893885e-07, + "loss": 0.0217, + "step": 7593 + }, + { + "epoch": 3.372862536087053, + "grad_norm": 0.3350515129151666, + "learning_rate": 7.286914428882913e-07, + "loss": 0.021, + "step": 7594 + }, + { + "epoch": 3.3733066844326007, + "grad_norm": 0.310648697981219, + "learning_rate": 7.276840288374281e-07, + "loss": 0.0257, + "step": 7595 + }, + { + "epoch": 3.373750832778148, + "grad_norm": 0.5863731801123462, + "learning_rate": 7.266772569882269e-07, + "loss": 0.0311, + "step": 7596 + }, + { + "epoch": 3.3741949811236953, + "grad_norm": 0.31868586904538343, + "learning_rate": 7.256711274920264e-07, + "loss": 0.0194, + "step": 7597 + }, + { + "epoch": 3.3746391294692426, + "grad_norm": 0.3360320574193569, + "learning_rate": 7.246656405000646e-07, + "loss": 0.0219, + "step": 7598 + }, + { + "epoch": 3.37508327781479, + "grad_norm": 0.4187991374923001, + "learning_rate": 7.236607961634812e-07, + "loss": 0.0266, + "step": 7599 + }, + { + "epoch": 3.3755274261603376, + "grad_norm": 0.3828524070852837, + "learning_rate": 7.22656594633322e-07, + "loss": 0.026, + "step": 7600 + }, + { + "epoch": 3.375971574505885, + "grad_norm": 0.4895737837652768, + "learning_rate": 7.216530360605379e-07, + "loss": 0.0413, + "step": 7601 + }, + { + "epoch": 3.376415722851432, + "grad_norm": 0.5857587373369051, + "learning_rate": 7.206501205959759e-07, + "loss": 0.0219, + "step": 7602 + }, + { + "epoch": 3.37685987119698, + "grad_norm": 0.383441583877617, + "learning_rate": 7.196478483903968e-07, + "loss": 0.0196, + "step": 7603 + }, + { + "epoch": 3.3773040195425272, + "grad_norm": 0.33141940449265894, + "learning_rate": 7.186462195944555e-07, + "loss": 0.0217, + "step": 7604 + }, + { + "epoch": 3.3777481678880745, + "grad_norm": 0.37300245379755964, + "learning_rate": 7.176452343587148e-07, + "loss": 0.0216, + "step": 7605 + }, + { + "epoch": 3.378192316233622, + "grad_norm": 0.33936395007321735, + "learning_rate": 7.166448928336411e-07, + "loss": 0.0222, + "step": 7606 + }, + { + "epoch": 3.3786364645791696, + "grad_norm": 0.5246214544898155, + "learning_rate": 7.156451951696003e-07, + "loss": 0.0235, + "step": 7607 + }, + { + "epoch": 3.379080612924717, + "grad_norm": 0.3948135131762046, + "learning_rate": 7.146461415168637e-07, + "loss": 0.0231, + "step": 7608 + }, + { + "epoch": 3.379524761270264, + "grad_norm": 0.37493268321499074, + "learning_rate": 7.136477320256102e-07, + "loss": 0.0328, + "step": 7609 + }, + { + "epoch": 3.379968909615812, + "grad_norm": 0.38860406022862326, + "learning_rate": 7.126499668459135e-07, + "loss": 0.0215, + "step": 7610 + }, + { + "epoch": 3.380413057961359, + "grad_norm": 0.443708157737675, + "learning_rate": 7.116528461277561e-07, + "loss": 0.0272, + "step": 7611 + }, + { + "epoch": 3.3808572063069064, + "grad_norm": 0.4409205615918148, + "learning_rate": 7.106563700210234e-07, + "loss": 0.0199, + "step": 7612 + }, + { + "epoch": 3.3813013546524537, + "grad_norm": 0.37530972086273984, + "learning_rate": 7.096605386754995e-07, + "loss": 0.0291, + "step": 7613 + }, + { + "epoch": 3.3817455029980015, + "grad_norm": 0.4774461806256775, + "learning_rate": 7.086653522408788e-07, + "loss": 0.0274, + "step": 7614 + }, + { + "epoch": 3.3821896513435488, + "grad_norm": 0.4294747376572903, + "learning_rate": 7.076708108667512e-07, + "loss": 0.024, + "step": 7615 + }, + { + "epoch": 3.382633799689096, + "grad_norm": 0.41100245317627226, + "learning_rate": 7.066769147026154e-07, + "loss": 0.0287, + "step": 7616 + }, + { + "epoch": 3.383077948034644, + "grad_norm": 0.3045061371856552, + "learning_rate": 7.056836638978698e-07, + "loss": 0.0198, + "step": 7617 + }, + { + "epoch": 3.383522096380191, + "grad_norm": 0.3136165513411521, + "learning_rate": 7.046910586018186e-07, + "loss": 0.0145, + "step": 7618 + }, + { + "epoch": 3.3839662447257384, + "grad_norm": 0.443848462635569, + "learning_rate": 7.036990989636628e-07, + "loss": 0.0296, + "step": 7619 + }, + { + "epoch": 3.3844103930712857, + "grad_norm": 0.4329635290692527, + "learning_rate": 7.027077851325164e-07, + "loss": 0.0286, + "step": 7620 + }, + { + "epoch": 3.3848545414168334, + "grad_norm": 0.4425986943763542, + "learning_rate": 7.017171172573872e-07, + "loss": 0.0319, + "step": 7621 + }, + { + "epoch": 3.3852986897623807, + "grad_norm": 0.3486098761277207, + "learning_rate": 7.007270954871903e-07, + "loss": 0.0269, + "step": 7622 + }, + { + "epoch": 3.385742838107928, + "grad_norm": 0.35813998328060914, + "learning_rate": 6.997377199707439e-07, + "loss": 0.0212, + "step": 7623 + }, + { + "epoch": 3.3861869864534753, + "grad_norm": 0.38415273327037347, + "learning_rate": 6.987489908567663e-07, + "loss": 0.0283, + "step": 7624 + }, + { + "epoch": 3.386631134799023, + "grad_norm": 0.42667107713144753, + "learning_rate": 6.977609082938791e-07, + "loss": 0.0364, + "step": 7625 + }, + { + "epoch": 3.3870752831445703, + "grad_norm": 0.4255004286761174, + "learning_rate": 6.967734724306119e-07, + "loss": 0.0374, + "step": 7626 + }, + { + "epoch": 3.3875194314901176, + "grad_norm": 0.3765149927746781, + "learning_rate": 6.957866834153898e-07, + "loss": 0.028, + "step": 7627 + }, + { + "epoch": 3.387963579835665, + "grad_norm": 0.5330001693156503, + "learning_rate": 6.948005413965448e-07, + "loss": 0.0379, + "step": 7628 + }, + { + "epoch": 3.3884077281812126, + "grad_norm": 0.5059080740347456, + "learning_rate": 6.938150465223126e-07, + "loss": 0.0244, + "step": 7629 + }, + { + "epoch": 3.38885187652676, + "grad_norm": 0.43454836222929405, + "learning_rate": 6.928301989408253e-07, + "loss": 0.0323, + "step": 7630 + }, + { + "epoch": 3.389296024872307, + "grad_norm": 0.3908471485523718, + "learning_rate": 6.918459988001281e-07, + "loss": 0.0312, + "step": 7631 + }, + { + "epoch": 3.389740173217855, + "grad_norm": 0.4420676727706239, + "learning_rate": 6.908624462481584e-07, + "loss": 0.026, + "step": 7632 + }, + { + "epoch": 3.390184321563402, + "grad_norm": 0.35748857241333987, + "learning_rate": 6.898795414327624e-07, + "loss": 0.0212, + "step": 7633 + }, + { + "epoch": 3.3906284699089495, + "grad_norm": 0.3032775147821817, + "learning_rate": 6.888972845016889e-07, + "loss": 0.0255, + "step": 7634 + }, + { + "epoch": 3.391072618254497, + "grad_norm": 0.3982622931840567, + "learning_rate": 6.879156756025851e-07, + "loss": 0.0281, + "step": 7635 + }, + { + "epoch": 3.3915167666000445, + "grad_norm": 0.5203243077622092, + "learning_rate": 6.869347148830035e-07, + "loss": 0.0249, + "step": 7636 + }, + { + "epoch": 3.391960914945592, + "grad_norm": 0.5073777888896281, + "learning_rate": 6.85954402490403e-07, + "loss": 0.0408, + "step": 7637 + }, + { + "epoch": 3.392405063291139, + "grad_norm": 0.6112073451304307, + "learning_rate": 6.849747385721373e-07, + "loss": 0.0262, + "step": 7638 + }, + { + "epoch": 3.392849211636687, + "grad_norm": 0.4334382118989557, + "learning_rate": 6.839957232754679e-07, + "loss": 0.0247, + "step": 7639 + }, + { + "epoch": 3.393293359982234, + "grad_norm": 0.37151198090225157, + "learning_rate": 6.830173567475584e-07, + "loss": 0.0251, + "step": 7640 + }, + { + "epoch": 3.3937375083277814, + "grad_norm": 0.47432508657043243, + "learning_rate": 6.820396391354722e-07, + "loss": 0.0361, + "step": 7641 + }, + { + "epoch": 3.3941816566733287, + "grad_norm": 0.39488273115178324, + "learning_rate": 6.810625705861762e-07, + "loss": 0.0226, + "step": 7642 + }, + { + "epoch": 3.3946258050188765, + "grad_norm": 0.5607339834034196, + "learning_rate": 6.80086151246544e-07, + "loss": 0.0403, + "step": 7643 + }, + { + "epoch": 3.3950699533644237, + "grad_norm": 0.4287176433981967, + "learning_rate": 6.791103812633443e-07, + "loss": 0.026, + "step": 7644 + }, + { + "epoch": 3.395514101709971, + "grad_norm": 0.4160527081898972, + "learning_rate": 6.781352607832536e-07, + "loss": 0.0248, + "step": 7645 + }, + { + "epoch": 3.3959582500555188, + "grad_norm": 0.31792810620229006, + "learning_rate": 6.771607899528504e-07, + "loss": 0.0179, + "step": 7646 + }, + { + "epoch": 3.396402398401066, + "grad_norm": 0.5611086221445625, + "learning_rate": 6.761869689186101e-07, + "loss": 0.0266, + "step": 7647 + }, + { + "epoch": 3.3968465467466133, + "grad_norm": 0.5050890369850448, + "learning_rate": 6.752137978269191e-07, + "loss": 0.0362, + "step": 7648 + }, + { + "epoch": 3.3972906950921606, + "grad_norm": 0.3650794330064718, + "learning_rate": 6.742412768240586e-07, + "loss": 0.0259, + "step": 7649 + }, + { + "epoch": 3.3977348434377084, + "grad_norm": 0.38912734986088593, + "learning_rate": 6.732694060562162e-07, + "loss": 0.0222, + "step": 7650 + }, + { + "epoch": 3.3981789917832557, + "grad_norm": 0.3696530901709022, + "learning_rate": 6.722981856694811e-07, + "loss": 0.0242, + "step": 7651 + }, + { + "epoch": 3.398623140128803, + "grad_norm": 0.35581748069826435, + "learning_rate": 6.713276158098425e-07, + "loss": 0.0203, + "step": 7652 + }, + { + "epoch": 3.3990672884743502, + "grad_norm": 0.3267819000046619, + "learning_rate": 6.703576966231939e-07, + "loss": 0.0177, + "step": 7653 + }, + { + "epoch": 3.399511436819898, + "grad_norm": 1.0254709261428898, + "learning_rate": 6.693884282553332e-07, + "loss": 0.0272, + "step": 7654 + }, + { + "epoch": 3.3999555851654453, + "grad_norm": 0.5090905796653779, + "learning_rate": 6.684198108519546e-07, + "loss": 0.026, + "step": 7655 + }, + { + "epoch": 3.4003997335109926, + "grad_norm": 0.47438772486404496, + "learning_rate": 6.674518445586592e-07, + "loss": 0.0293, + "step": 7656 + }, + { + "epoch": 3.40084388185654, + "grad_norm": 0.508053594793761, + "learning_rate": 6.664845295209499e-07, + "loss": 0.0397, + "step": 7657 + }, + { + "epoch": 3.4012880302020876, + "grad_norm": 0.361105004393522, + "learning_rate": 6.655178658842282e-07, + "loss": 0.0197, + "step": 7658 + }, + { + "epoch": 3.401732178547635, + "grad_norm": 0.36194311552740505, + "learning_rate": 6.645518537938012e-07, + "loss": 0.0233, + "step": 7659 + }, + { + "epoch": 3.402176326893182, + "grad_norm": 0.4835513612825343, + "learning_rate": 6.635864933948771e-07, + "loss": 0.0243, + "step": 7660 + }, + { + "epoch": 3.40262047523873, + "grad_norm": 0.40734978570499325, + "learning_rate": 6.626217848325656e-07, + "loss": 0.0235, + "step": 7661 + }, + { + "epoch": 3.403064623584277, + "grad_norm": 0.48301835002180094, + "learning_rate": 6.616577282518794e-07, + "loss": 0.0307, + "step": 7662 + }, + { + "epoch": 3.4035087719298245, + "grad_norm": 0.4251997392123568, + "learning_rate": 6.606943237977331e-07, + "loss": 0.0193, + "step": 7663 + }, + { + "epoch": 3.4039529202753718, + "grad_norm": 0.45591767603385597, + "learning_rate": 6.597315716149394e-07, + "loss": 0.0348, + "step": 7664 + }, + { + "epoch": 3.4043970686209195, + "grad_norm": 0.38978320299224184, + "learning_rate": 6.587694718482213e-07, + "loss": 0.024, + "step": 7665 + }, + { + "epoch": 3.404841216966467, + "grad_norm": 0.36098640085183586, + "learning_rate": 6.578080246421947e-07, + "loss": 0.0231, + "step": 7666 + }, + { + "epoch": 3.405285365312014, + "grad_norm": 0.42381127870689095, + "learning_rate": 6.568472301413836e-07, + "loss": 0.0293, + "step": 7667 + }, + { + "epoch": 3.405729513657562, + "grad_norm": 0.36978104460309597, + "learning_rate": 6.558870884902119e-07, + "loss": 0.0228, + "step": 7668 + }, + { + "epoch": 3.406173662003109, + "grad_norm": 0.5467829172657439, + "learning_rate": 6.549275998330029e-07, + "loss": 0.027, + "step": 7669 + }, + { + "epoch": 3.4066178103486564, + "grad_norm": 0.42439207241368543, + "learning_rate": 6.539687643139847e-07, + "loss": 0.0243, + "step": 7670 + }, + { + "epoch": 3.4070619586942037, + "grad_norm": 0.34272251460913966, + "learning_rate": 6.530105820772897e-07, + "loss": 0.0151, + "step": 7671 + }, + { + "epoch": 3.4075061070397514, + "grad_norm": 0.4182566186357329, + "learning_rate": 6.52053053266945e-07, + "loss": 0.0205, + "step": 7672 + }, + { + "epoch": 3.4079502553852987, + "grad_norm": 0.350434375510393, + "learning_rate": 6.51096178026886e-07, + "loss": 0.0237, + "step": 7673 + }, + { + "epoch": 3.408394403730846, + "grad_norm": 0.36842396127077415, + "learning_rate": 6.50139956500947e-07, + "loss": 0.0248, + "step": 7674 + }, + { + "epoch": 3.4088385520763937, + "grad_norm": 0.3776577384417059, + "learning_rate": 6.491843888328625e-07, + "loss": 0.026, + "step": 7675 + }, + { + "epoch": 3.409282700421941, + "grad_norm": 0.4739319012198071, + "learning_rate": 6.482294751662721e-07, + "loss": 0.0353, + "step": 7676 + }, + { + "epoch": 3.4097268487674883, + "grad_norm": 0.35847335034757755, + "learning_rate": 6.472752156447148e-07, + "loss": 0.0198, + "step": 7677 + }, + { + "epoch": 3.4101709971130356, + "grad_norm": 0.41470003345548434, + "learning_rate": 6.463216104116327e-07, + "loss": 0.0206, + "step": 7678 + }, + { + "epoch": 3.4106151454585834, + "grad_norm": 0.41774456358583356, + "learning_rate": 6.453686596103697e-07, + "loss": 0.0296, + "step": 7679 + }, + { + "epoch": 3.4110592938041306, + "grad_norm": 0.4014386567309541, + "learning_rate": 6.444163633841688e-07, + "loss": 0.0283, + "step": 7680 + }, + { + "epoch": 3.411503442149678, + "grad_norm": 0.5677232388670659, + "learning_rate": 6.434647218761764e-07, + "loss": 0.0399, + "step": 7681 + }, + { + "epoch": 3.4119475904952252, + "grad_norm": 0.38057897363042437, + "learning_rate": 6.425137352294408e-07, + "loss": 0.0241, + "step": 7682 + }, + { + "epoch": 3.412391738840773, + "grad_norm": 0.4520213263061807, + "learning_rate": 6.415634035869117e-07, + "loss": 0.0242, + "step": 7683 + }, + { + "epoch": 3.4128358871863202, + "grad_norm": 0.4445568386773207, + "learning_rate": 6.406137270914404e-07, + "loss": 0.0327, + "step": 7684 + }, + { + "epoch": 3.4132800355318675, + "grad_norm": 0.48242749215653374, + "learning_rate": 6.396647058857792e-07, + "loss": 0.031, + "step": 7685 + }, + { + "epoch": 3.413724183877415, + "grad_norm": 0.3708344900625483, + "learning_rate": 6.387163401125812e-07, + "loss": 0.0233, + "step": 7686 + }, + { + "epoch": 3.4141683322229626, + "grad_norm": 0.3782593306434555, + "learning_rate": 6.377686299144025e-07, + "loss": 0.0269, + "step": 7687 + }, + { + "epoch": 3.41461248056851, + "grad_norm": 0.43908345913471114, + "learning_rate": 6.368215754337004e-07, + "loss": 0.0307, + "step": 7688 + }, + { + "epoch": 3.415056628914057, + "grad_norm": 0.5407716737755903, + "learning_rate": 6.358751768128324e-07, + "loss": 0.0259, + "step": 7689 + }, + { + "epoch": 3.415500777259605, + "grad_norm": 0.3714076929525467, + "learning_rate": 6.349294341940593e-07, + "loss": 0.0182, + "step": 7690 + }, + { + "epoch": 3.415944925605152, + "grad_norm": 0.5168319772596983, + "learning_rate": 6.339843477195423e-07, + "loss": 0.0405, + "step": 7691 + }, + { + "epoch": 3.4163890739506995, + "grad_norm": 0.470888647389716, + "learning_rate": 6.330399175313429e-07, + "loss": 0.0326, + "step": 7692 + }, + { + "epoch": 3.4168332222962468, + "grad_norm": 0.3446195227420253, + "learning_rate": 6.320961437714257e-07, + "loss": 0.0136, + "step": 7693 + }, + { + "epoch": 3.4172773706417945, + "grad_norm": 0.37511059576115724, + "learning_rate": 6.311530265816551e-07, + "loss": 0.0336, + "step": 7694 + }, + { + "epoch": 3.4177215189873418, + "grad_norm": 0.362187597907496, + "learning_rate": 6.302105661037988e-07, + "loss": 0.0212, + "step": 7695 + }, + { + "epoch": 3.418165667332889, + "grad_norm": 0.4435954280766343, + "learning_rate": 6.292687624795257e-07, + "loss": 0.0315, + "step": 7696 + }, + { + "epoch": 3.418609815678437, + "grad_norm": 0.36818994844478536, + "learning_rate": 6.283276158504015e-07, + "loss": 0.0226, + "step": 7697 + }, + { + "epoch": 3.419053964023984, + "grad_norm": 0.35373090512917094, + "learning_rate": 6.27387126357899e-07, + "loss": 0.0165, + "step": 7698 + }, + { + "epoch": 3.4194981123695314, + "grad_norm": 0.364744423870996, + "learning_rate": 6.264472941433886e-07, + "loss": 0.0159, + "step": 7699 + }, + { + "epoch": 3.4199422607150787, + "grad_norm": 0.43254863668472154, + "learning_rate": 6.255081193481438e-07, + "loss": 0.0269, + "step": 7700 + }, + { + "epoch": 3.4203864090606264, + "grad_norm": 0.3606692968436507, + "learning_rate": 6.24569602113338e-07, + "loss": 0.0304, + "step": 7701 + }, + { + "epoch": 3.4208305574061737, + "grad_norm": 0.5007533786139542, + "learning_rate": 6.236317425800481e-07, + "loss": 0.0287, + "step": 7702 + }, + { + "epoch": 3.421274705751721, + "grad_norm": 0.41340230947723855, + "learning_rate": 6.226945408892477e-07, + "loss": 0.0199, + "step": 7703 + }, + { + "epoch": 3.4217188540972687, + "grad_norm": 0.365085912872857, + "learning_rate": 6.21757997181815e-07, + "loss": 0.021, + "step": 7704 + }, + { + "epoch": 3.422163002442816, + "grad_norm": 0.5548854003945028, + "learning_rate": 6.208221115985285e-07, + "loss": 0.0336, + "step": 7705 + }, + { + "epoch": 3.4226071507883633, + "grad_norm": 0.5898794387281542, + "learning_rate": 6.198868842800681e-07, + "loss": 0.0329, + "step": 7706 + }, + { + "epoch": 3.4230512991339106, + "grad_norm": 0.670696139265725, + "learning_rate": 6.189523153670152e-07, + "loss": 0.0411, + "step": 7707 + }, + { + "epoch": 3.4234954474794583, + "grad_norm": 0.44881661557818314, + "learning_rate": 6.180184049998489e-07, + "loss": 0.0294, + "step": 7708 + }, + { + "epoch": 3.4239395958250056, + "grad_norm": 0.41059208128468105, + "learning_rate": 6.170851533189537e-07, + "loss": 0.0221, + "step": 7709 + }, + { + "epoch": 3.424383744170553, + "grad_norm": 0.4264078976239932, + "learning_rate": 6.161525604646124e-07, + "loss": 0.0247, + "step": 7710 + }, + { + "epoch": 3.4248278925161, + "grad_norm": 0.4696315553933537, + "learning_rate": 6.152206265770095e-07, + "loss": 0.026, + "step": 7711 + }, + { + "epoch": 3.425272040861648, + "grad_norm": 0.45378823926056083, + "learning_rate": 6.142893517962312e-07, + "loss": 0.0235, + "step": 7712 + }, + { + "epoch": 3.4257161892071952, + "grad_norm": 0.4121833782140256, + "learning_rate": 6.133587362622645e-07, + "loss": 0.0272, + "step": 7713 + }, + { + "epoch": 3.4261603375527425, + "grad_norm": 0.3934369295423382, + "learning_rate": 6.124287801149942e-07, + "loss": 0.0254, + "step": 7714 + }, + { + "epoch": 3.42660448589829, + "grad_norm": 0.5182408476406597, + "learning_rate": 6.114994834942106e-07, + "loss": 0.0311, + "step": 7715 + }, + { + "epoch": 3.4270486342438375, + "grad_norm": 0.443783825192744, + "learning_rate": 6.105708465396021e-07, + "loss": 0.0245, + "step": 7716 + }, + { + "epoch": 3.427492782589385, + "grad_norm": 0.47129625335261466, + "learning_rate": 6.096428693907591e-07, + "loss": 0.0254, + "step": 7717 + }, + { + "epoch": 3.427936930934932, + "grad_norm": 0.38995292895347056, + "learning_rate": 6.087155521871713e-07, + "loss": 0.028, + "step": 7718 + }, + { + "epoch": 3.42838107928048, + "grad_norm": 0.38715826339897036, + "learning_rate": 6.077888950682326e-07, + "loss": 0.0228, + "step": 7719 + }, + { + "epoch": 3.428825227626027, + "grad_norm": 0.4108315391155564, + "learning_rate": 6.068628981732322e-07, + "loss": 0.0235, + "step": 7720 + }, + { + "epoch": 3.4292693759715744, + "grad_norm": 0.3702934691579192, + "learning_rate": 6.059375616413643e-07, + "loss": 0.0211, + "step": 7721 + }, + { + "epoch": 3.4297135243171217, + "grad_norm": 0.3963620092994827, + "learning_rate": 6.050128856117232e-07, + "loss": 0.019, + "step": 7722 + }, + { + "epoch": 3.4301576726626695, + "grad_norm": 0.37150524249459593, + "learning_rate": 6.040888702233033e-07, + "loss": 0.0224, + "step": 7723 + }, + { + "epoch": 3.4306018210082168, + "grad_norm": 0.44821126190431915, + "learning_rate": 6.031655156150007e-07, + "loss": 0.0305, + "step": 7724 + }, + { + "epoch": 3.431045969353764, + "grad_norm": 0.35283254586034607, + "learning_rate": 6.022428219256087e-07, + "loss": 0.024, + "step": 7725 + }, + { + "epoch": 3.431490117699312, + "grad_norm": 0.4935930259013348, + "learning_rate": 6.013207892938261e-07, + "loss": 0.0457, + "step": 7726 + }, + { + "epoch": 3.431934266044859, + "grad_norm": 0.45369029101077796, + "learning_rate": 6.003994178582489e-07, + "loss": 0.0243, + "step": 7727 + }, + { + "epoch": 3.4323784143904064, + "grad_norm": 0.48244218877941225, + "learning_rate": 5.994787077573754e-07, + "loss": 0.0231, + "step": 7728 + }, + { + "epoch": 3.4328225627359537, + "grad_norm": 0.3599007846172956, + "learning_rate": 5.985586591296044e-07, + "loss": 0.0182, + "step": 7729 + }, + { + "epoch": 3.4332667110815014, + "grad_norm": 0.40382161309843, + "learning_rate": 5.976392721132351e-07, + "loss": 0.0152, + "step": 7730 + }, + { + "epoch": 3.4337108594270487, + "grad_norm": 0.44806877408277146, + "learning_rate": 5.967205468464648e-07, + "loss": 0.026, + "step": 7731 + }, + { + "epoch": 3.434155007772596, + "grad_norm": 0.3566806457323869, + "learning_rate": 5.958024834673953e-07, + "loss": 0.0192, + "step": 7732 + }, + { + "epoch": 3.4345991561181437, + "grad_norm": 0.4251189473202698, + "learning_rate": 5.948850821140267e-07, + "loss": 0.0218, + "step": 7733 + }, + { + "epoch": 3.435043304463691, + "grad_norm": 0.38225003080929293, + "learning_rate": 5.939683429242604e-07, + "loss": 0.0217, + "step": 7734 + }, + { + "epoch": 3.4354874528092383, + "grad_norm": 0.34970498765009345, + "learning_rate": 5.930522660358973e-07, + "loss": 0.0167, + "step": 7735 + }, + { + "epoch": 3.4359316011547856, + "grad_norm": 0.41748998353130423, + "learning_rate": 5.921368515866405e-07, + "loss": 0.0314, + "step": 7736 + }, + { + "epoch": 3.436375749500333, + "grad_norm": 0.5166214998053038, + "learning_rate": 5.912220997140905e-07, + "loss": 0.0327, + "step": 7737 + }, + { + "epoch": 3.4368198978458806, + "grad_norm": 0.4503364511144795, + "learning_rate": 5.903080105557507e-07, + "loss": 0.0305, + "step": 7738 + }, + { + "epoch": 3.437264046191428, + "grad_norm": 0.38794068759388, + "learning_rate": 5.893945842490245e-07, + "loss": 0.0234, + "step": 7739 + }, + { + "epoch": 3.437708194536975, + "grad_norm": 0.4805056088393158, + "learning_rate": 5.884818209312159e-07, + "loss": 0.0298, + "step": 7740 + }, + { + "epoch": 3.438152342882523, + "grad_norm": 0.5076559758304271, + "learning_rate": 5.875697207395286e-07, + "loss": 0.035, + "step": 7741 + }, + { + "epoch": 3.43859649122807, + "grad_norm": 0.42338511678046004, + "learning_rate": 5.866582838110657e-07, + "loss": 0.0363, + "step": 7742 + }, + { + "epoch": 3.4390406395736175, + "grad_norm": 0.36808395417139134, + "learning_rate": 5.857475102828325e-07, + "loss": 0.0188, + "step": 7743 + }, + { + "epoch": 3.439484787919165, + "grad_norm": 0.3177380221983695, + "learning_rate": 5.848374002917329e-07, + "loss": 0.0193, + "step": 7744 + }, + { + "epoch": 3.4399289362647125, + "grad_norm": 0.3914364144553906, + "learning_rate": 5.839279539745729e-07, + "loss": 0.0223, + "step": 7745 + }, + { + "epoch": 3.44037308461026, + "grad_norm": 0.402651938874195, + "learning_rate": 5.830191714680578e-07, + "loss": 0.0199, + "step": 7746 + }, + { + "epoch": 3.440817232955807, + "grad_norm": 0.3522412455115687, + "learning_rate": 5.821110529087932e-07, + "loss": 0.0247, + "step": 7747 + }, + { + "epoch": 3.441261381301355, + "grad_norm": 0.4021635163074727, + "learning_rate": 5.812035984332832e-07, + "loss": 0.0229, + "step": 7748 + }, + { + "epoch": 3.441705529646902, + "grad_norm": 0.4251094753016676, + "learning_rate": 5.802968081779342e-07, + "loss": 0.024, + "step": 7749 + }, + { + "epoch": 3.4421496779924494, + "grad_norm": 0.3662362137440131, + "learning_rate": 5.79390682279053e-07, + "loss": 0.0247, + "step": 7750 + }, + { + "epoch": 3.4425938263379967, + "grad_norm": 0.4585215842279442, + "learning_rate": 5.784852208728453e-07, + "loss": 0.0295, + "step": 7751 + }, + { + "epoch": 3.4430379746835444, + "grad_norm": 0.5141537675737569, + "learning_rate": 5.775804240954181e-07, + "loss": 0.0318, + "step": 7752 + }, + { + "epoch": 3.4434821230290917, + "grad_norm": 0.5180769012624266, + "learning_rate": 5.766762920827762e-07, + "loss": 0.0255, + "step": 7753 + }, + { + "epoch": 3.443926271374639, + "grad_norm": 0.4003620998147694, + "learning_rate": 5.757728249708261e-07, + "loss": 0.0271, + "step": 7754 + }, + { + "epoch": 3.4443704197201868, + "grad_norm": 0.3418440737923465, + "learning_rate": 5.748700228953758e-07, + "loss": 0.0184, + "step": 7755 + }, + { + "epoch": 3.444814568065734, + "grad_norm": 0.687451428270991, + "learning_rate": 5.739678859921299e-07, + "loss": 0.0362, + "step": 7756 + }, + { + "epoch": 3.4452587164112813, + "grad_norm": 0.5024182127797775, + "learning_rate": 5.730664143966969e-07, + "loss": 0.0278, + "step": 7757 + }, + { + "epoch": 3.4457028647568286, + "grad_norm": 0.38303277714696055, + "learning_rate": 5.721656082445825e-07, + "loss": 0.0193, + "step": 7758 + }, + { + "epoch": 3.4461470131023764, + "grad_norm": 0.47510936774303797, + "learning_rate": 5.712654676711921e-07, + "loss": 0.0246, + "step": 7759 + }, + { + "epoch": 3.4465911614479237, + "grad_norm": 0.3741311721785513, + "learning_rate": 5.703659928118333e-07, + "loss": 0.0181, + "step": 7760 + }, + { + "epoch": 3.447035309793471, + "grad_norm": 0.4576763768736328, + "learning_rate": 5.694671838017119e-07, + "loss": 0.0274, + "step": 7761 + }, + { + "epoch": 3.4474794581390182, + "grad_norm": 0.36322150440607, + "learning_rate": 5.685690407759342e-07, + "loss": 0.0198, + "step": 7762 + }, + { + "epoch": 3.447923606484566, + "grad_norm": 0.5039426250866698, + "learning_rate": 5.676715638695063e-07, + "loss": 0.0299, + "step": 7763 + }, + { + "epoch": 3.4483677548301133, + "grad_norm": 0.499760373842697, + "learning_rate": 5.667747532173362e-07, + "loss": 0.0244, + "step": 7764 + }, + { + "epoch": 3.4488119031756606, + "grad_norm": 0.42349126436586376, + "learning_rate": 5.658786089542262e-07, + "loss": 0.0265, + "step": 7765 + }, + { + "epoch": 3.449256051521208, + "grad_norm": 0.5133192715069692, + "learning_rate": 5.649831312148845e-07, + "loss": 0.0381, + "step": 7766 + }, + { + "epoch": 3.4497001998667556, + "grad_norm": 0.4265674001652618, + "learning_rate": 5.640883201339154e-07, + "loss": 0.0254, + "step": 7767 + }, + { + "epoch": 3.450144348212303, + "grad_norm": 0.5615805590965591, + "learning_rate": 5.631941758458254e-07, + "loss": 0.028, + "step": 7768 + }, + { + "epoch": 3.45058849655785, + "grad_norm": 0.6650175323233352, + "learning_rate": 5.623006984850193e-07, + "loss": 0.0313, + "step": 7769 + }, + { + "epoch": 3.451032644903398, + "grad_norm": 0.4291032020769662, + "learning_rate": 5.61407888185801e-07, + "loss": 0.031, + "step": 7770 + }, + { + "epoch": 3.451476793248945, + "grad_norm": 0.438818710394981, + "learning_rate": 5.60515745082375e-07, + "loss": 0.0271, + "step": 7771 + }, + { + "epoch": 3.4519209415944925, + "grad_norm": 0.4109123087663869, + "learning_rate": 5.596242693088478e-07, + "loss": 0.022, + "step": 7772 + }, + { + "epoch": 3.4523650899400398, + "grad_norm": 0.5366106862877935, + "learning_rate": 5.587334609992195e-07, + "loss": 0.04, + "step": 7773 + }, + { + "epoch": 3.4528092382855875, + "grad_norm": 0.37769149540290886, + "learning_rate": 5.578433202873967e-07, + "loss": 0.0292, + "step": 7774 + }, + { + "epoch": 3.453253386631135, + "grad_norm": 0.4649378802579783, + "learning_rate": 5.569538473071834e-07, + "loss": 0.0227, + "step": 7775 + }, + { + "epoch": 3.453697534976682, + "grad_norm": 0.48743583603374796, + "learning_rate": 5.560650421922798e-07, + "loss": 0.0315, + "step": 7776 + }, + { + "epoch": 3.45414168332223, + "grad_norm": 0.5101610442676535, + "learning_rate": 5.551769050762895e-07, + "loss": 0.03, + "step": 7777 + }, + { + "epoch": 3.454585831667777, + "grad_norm": 0.3279417409470981, + "learning_rate": 5.542894360927148e-07, + "loss": 0.0171, + "step": 7778 + }, + { + "epoch": 3.4550299800133244, + "grad_norm": 0.36215562405473767, + "learning_rate": 5.534026353749572e-07, + "loss": 0.0249, + "step": 7779 + }, + { + "epoch": 3.4554741283588717, + "grad_norm": 0.33716482541449544, + "learning_rate": 5.52516503056319e-07, + "loss": 0.0205, + "step": 7780 + }, + { + "epoch": 3.4559182767044194, + "grad_norm": 0.3903797663782476, + "learning_rate": 5.516310392699991e-07, + "loss": 0.0192, + "step": 7781 + }, + { + "epoch": 3.4563624250499667, + "grad_norm": 0.4369763600207604, + "learning_rate": 5.507462441490985e-07, + "loss": 0.0254, + "step": 7782 + }, + { + "epoch": 3.456806573395514, + "grad_norm": 0.4196928785980343, + "learning_rate": 5.498621178266167e-07, + "loss": 0.0242, + "step": 7783 + }, + { + "epoch": 3.4572507217410617, + "grad_norm": 0.36290486329076327, + "learning_rate": 5.489786604354535e-07, + "loss": 0.0284, + "step": 7784 + }, + { + "epoch": 3.457694870086609, + "grad_norm": 0.4926206504376682, + "learning_rate": 5.480958721084074e-07, + "loss": 0.0281, + "step": 7785 + }, + { + "epoch": 3.4581390184321563, + "grad_norm": 0.4384128683223243, + "learning_rate": 5.472137529781768e-07, + "loss": 0.0281, + "step": 7786 + }, + { + "epoch": 3.4585831667777036, + "grad_norm": 0.42664316505284405, + "learning_rate": 5.463323031773581e-07, + "loss": 0.0286, + "step": 7787 + }, + { + "epoch": 3.4590273151232513, + "grad_norm": 0.47490376622622354, + "learning_rate": 5.454515228384493e-07, + "loss": 0.0208, + "step": 7788 + }, + { + "epoch": 3.4594714634687986, + "grad_norm": 0.40714987915665685, + "learning_rate": 5.445714120938467e-07, + "loss": 0.0321, + "step": 7789 + }, + { + "epoch": 3.459915611814346, + "grad_norm": 0.3507834802550716, + "learning_rate": 5.436919710758432e-07, + "loss": 0.0225, + "step": 7790 + }, + { + "epoch": 3.460359760159893, + "grad_norm": 0.3642385320392474, + "learning_rate": 5.42813199916637e-07, + "loss": 0.025, + "step": 7791 + }, + { + "epoch": 3.460803908505441, + "grad_norm": 0.3676585444584738, + "learning_rate": 5.419350987483224e-07, + "loss": 0.026, + "step": 7792 + }, + { + "epoch": 3.4612480568509882, + "grad_norm": 0.42929474369532705, + "learning_rate": 5.410576677028906e-07, + "loss": 0.0316, + "step": 7793 + }, + { + "epoch": 3.4616922051965355, + "grad_norm": 0.45518167323880976, + "learning_rate": 5.401809069122354e-07, + "loss": 0.0284, + "step": 7794 + }, + { + "epoch": 3.462136353542083, + "grad_norm": 0.4983281791130265, + "learning_rate": 5.393048165081493e-07, + "loss": 0.0411, + "step": 7795 + }, + { + "epoch": 3.4625805018876306, + "grad_norm": 0.419339473620287, + "learning_rate": 5.384293966223231e-07, + "loss": 0.0215, + "step": 7796 + }, + { + "epoch": 3.463024650233178, + "grad_norm": 0.3599252115295799, + "learning_rate": 5.37554647386348e-07, + "loss": 0.0254, + "step": 7797 + }, + { + "epoch": 3.463468798578725, + "grad_norm": 0.35977012849103657, + "learning_rate": 5.366805689317129e-07, + "loss": 0.0293, + "step": 7798 + }, + { + "epoch": 3.463912946924273, + "grad_norm": 0.5206144817249008, + "learning_rate": 5.358071613898064e-07, + "loss": 0.0272, + "step": 7799 + }, + { + "epoch": 3.46435709526982, + "grad_norm": 0.2890681249275682, + "learning_rate": 5.349344248919175e-07, + "loss": 0.0199, + "step": 7800 + }, + { + "epoch": 3.4648012436153675, + "grad_norm": 0.4535682999458182, + "learning_rate": 5.340623595692313e-07, + "loss": 0.0312, + "step": 7801 + }, + { + "epoch": 3.4652453919609147, + "grad_norm": 0.4712735624226538, + "learning_rate": 5.331909655528361e-07, + "loss": 0.0294, + "step": 7802 + }, + { + "epoch": 3.4656895403064625, + "grad_norm": 0.39969281186884054, + "learning_rate": 5.323202429737179e-07, + "loss": 0.0214, + "step": 7803 + }, + { + "epoch": 3.4661336886520098, + "grad_norm": 0.40892930316739623, + "learning_rate": 5.31450191962759e-07, + "loss": 0.0273, + "step": 7804 + }, + { + "epoch": 3.466577836997557, + "grad_norm": 0.40420963258096243, + "learning_rate": 5.305808126507433e-07, + "loss": 0.0257, + "step": 7805 + }, + { + "epoch": 3.467021985343105, + "grad_norm": 0.4298719054004784, + "learning_rate": 5.297121051683546e-07, + "loss": 0.0202, + "step": 7806 + }, + { + "epoch": 3.467466133688652, + "grad_norm": 0.4156335033725289, + "learning_rate": 5.288440696461716e-07, + "loss": 0.0206, + "step": 7807 + }, + { + "epoch": 3.4679102820341994, + "grad_norm": 0.6442580410348813, + "learning_rate": 5.279767062146784e-07, + "loss": 0.0335, + "step": 7808 + }, + { + "epoch": 3.4683544303797467, + "grad_norm": 0.42219725211126974, + "learning_rate": 5.271100150042518e-07, + "loss": 0.0263, + "step": 7809 + }, + { + "epoch": 3.4687985787252944, + "grad_norm": 0.39548539897151663, + "learning_rate": 5.262439961451709e-07, + "loss": 0.0249, + "step": 7810 + }, + { + "epoch": 3.4692427270708417, + "grad_norm": 0.3080627076539832, + "learning_rate": 5.253786497676134e-07, + "loss": 0.0189, + "step": 7811 + }, + { + "epoch": 3.469686875416389, + "grad_norm": 0.435674321335036, + "learning_rate": 5.245139760016549e-07, + "loss": 0.026, + "step": 7812 + }, + { + "epoch": 3.4701310237619367, + "grad_norm": 0.3308167388953746, + "learning_rate": 5.236499749772716e-07, + "loss": 0.0192, + "step": 7813 + }, + { + "epoch": 3.470575172107484, + "grad_norm": 0.411730156267874, + "learning_rate": 5.227866468243376e-07, + "loss": 0.0278, + "step": 7814 + }, + { + "epoch": 3.4710193204530313, + "grad_norm": 0.4770932092793088, + "learning_rate": 5.219239916726243e-07, + "loss": 0.0356, + "step": 7815 + }, + { + "epoch": 3.4714634687985786, + "grad_norm": 0.5082942621753359, + "learning_rate": 5.210620096518044e-07, + "loss": 0.0335, + "step": 7816 + }, + { + "epoch": 3.4719076171441263, + "grad_norm": 0.5849666623331924, + "learning_rate": 5.202007008914489e-07, + "loss": 0.0298, + "step": 7817 + }, + { + "epoch": 3.4723517654896736, + "grad_norm": 0.5043080758971297, + "learning_rate": 5.193400655210251e-07, + "loss": 0.0257, + "step": 7818 + }, + { + "epoch": 3.472795913835221, + "grad_norm": 0.35362037908035154, + "learning_rate": 5.184801036699033e-07, + "loss": 0.0195, + "step": 7819 + }, + { + "epoch": 3.473240062180768, + "grad_norm": 0.3793466003982311, + "learning_rate": 5.176208154673502e-07, + "loss": 0.0198, + "step": 7820 + }, + { + "epoch": 3.473684210526316, + "grad_norm": 0.3602963289642836, + "learning_rate": 5.167622010425305e-07, + "loss": 0.0241, + "step": 7821 + }, + { + "epoch": 3.474128358871863, + "grad_norm": 0.4045415464931093, + "learning_rate": 5.159042605245085e-07, + "loss": 0.0308, + "step": 7822 + }, + { + "epoch": 3.4745725072174105, + "grad_norm": 0.3631349056889571, + "learning_rate": 5.150469940422487e-07, + "loss": 0.0199, + "step": 7823 + }, + { + "epoch": 3.475016655562958, + "grad_norm": 0.47233718849299045, + "learning_rate": 5.141904017246097e-07, + "loss": 0.0192, + "step": 7824 + }, + { + "epoch": 3.4754608039085055, + "grad_norm": 0.4948999022011918, + "learning_rate": 5.133344837003557e-07, + "loss": 0.0354, + "step": 7825 + }, + { + "epoch": 3.475904952254053, + "grad_norm": 0.37036897500750726, + "learning_rate": 5.124792400981432e-07, + "loss": 0.0236, + "step": 7826 + }, + { + "epoch": 3.4763491005996, + "grad_norm": 0.33159665048430614, + "learning_rate": 5.116246710465306e-07, + "loss": 0.0243, + "step": 7827 + }, + { + "epoch": 3.476793248945148, + "grad_norm": 0.4934189727153532, + "learning_rate": 5.10770776673975e-07, + "loss": 0.0221, + "step": 7828 + }, + { + "epoch": 3.477237397290695, + "grad_norm": 0.40096875281363703, + "learning_rate": 5.099175571088283e-07, + "loss": 0.0224, + "step": 7829 + }, + { + "epoch": 3.4776815456362424, + "grad_norm": 0.5694968604223205, + "learning_rate": 5.090650124793472e-07, + "loss": 0.0344, + "step": 7830 + }, + { + "epoch": 3.4781256939817897, + "grad_norm": 0.347016380477264, + "learning_rate": 5.082131429136833e-07, + "loss": 0.0246, + "step": 7831 + }, + { + "epoch": 3.4785698423273375, + "grad_norm": 0.3939110962708003, + "learning_rate": 5.073619485398845e-07, + "loss": 0.0216, + "step": 7832 + }, + { + "epoch": 3.4790139906728847, + "grad_norm": 0.5312038258908143, + "learning_rate": 5.065114294859019e-07, + "loss": 0.026, + "step": 7833 + }, + { + "epoch": 3.479458139018432, + "grad_norm": 0.3820612120014884, + "learning_rate": 5.056615858795838e-07, + "loss": 0.0231, + "step": 7834 + }, + { + "epoch": 3.4799022873639798, + "grad_norm": 0.5500948375085558, + "learning_rate": 5.048124178486724e-07, + "loss": 0.0371, + "step": 7835 + }, + { + "epoch": 3.480346435709527, + "grad_norm": 0.4385367139000315, + "learning_rate": 5.039639255208156e-07, + "loss": 0.032, + "step": 7836 + }, + { + "epoch": 3.4807905840550744, + "grad_norm": 0.3825085947680331, + "learning_rate": 5.031161090235559e-07, + "loss": 0.0214, + "step": 7837 + }, + { + "epoch": 3.4812347324006216, + "grad_norm": 0.42377290578412, + "learning_rate": 5.022689684843329e-07, + "loss": 0.0253, + "step": 7838 + }, + { + "epoch": 3.4816788807461694, + "grad_norm": 0.3200977768252115, + "learning_rate": 5.014225040304871e-07, + "loss": 0.0172, + "step": 7839 + }, + { + "epoch": 3.4821230290917167, + "grad_norm": 0.4075927728120571, + "learning_rate": 5.005767157892572e-07, + "loss": 0.0234, + "step": 7840 + }, + { + "epoch": 3.482567177437264, + "grad_norm": 0.494712633880696, + "learning_rate": 4.99731603887777e-07, + "loss": 0.0308, + "step": 7841 + }, + { + "epoch": 3.4830113257828117, + "grad_norm": 0.3951757300545286, + "learning_rate": 4.98887168453085e-07, + "loss": 0.0249, + "step": 7842 + }, + { + "epoch": 3.483455474128359, + "grad_norm": 0.3887321324351885, + "learning_rate": 4.980434096121106e-07, + "loss": 0.0299, + "step": 7843 + }, + { + "epoch": 3.4838996224739063, + "grad_norm": 0.403812873928378, + "learning_rate": 4.97200327491687e-07, + "loss": 0.0219, + "step": 7844 + }, + { + "epoch": 3.4843437708194536, + "grad_norm": 0.4454713027127471, + "learning_rate": 4.963579222185444e-07, + "loss": 0.0247, + "step": 7845 + }, + { + "epoch": 3.4847879191650013, + "grad_norm": 0.41374791040066133, + "learning_rate": 4.955161939193087e-07, + "loss": 0.0213, + "step": 7846 + }, + { + "epoch": 3.4852320675105486, + "grad_norm": 0.460381233413127, + "learning_rate": 4.946751427205054e-07, + "loss": 0.0359, + "step": 7847 + }, + { + "epoch": 3.485676215856096, + "grad_norm": 0.5274863598727876, + "learning_rate": 4.938347687485629e-07, + "loss": 0.0249, + "step": 7848 + }, + { + "epoch": 3.486120364201643, + "grad_norm": 0.3376353042657825, + "learning_rate": 4.929950721297993e-07, + "loss": 0.0197, + "step": 7849 + }, + { + "epoch": 3.486564512547191, + "grad_norm": 0.4896159029658331, + "learning_rate": 4.921560529904374e-07, + "loss": 0.0291, + "step": 7850 + }, + { + "epoch": 3.487008660892738, + "grad_norm": 0.36649328899036365, + "learning_rate": 4.913177114565964e-07, + "loss": 0.0198, + "step": 7851 + }, + { + "epoch": 3.4874528092382855, + "grad_norm": 0.46757175677374957, + "learning_rate": 4.90480047654291e-07, + "loss": 0.0359, + "step": 7852 + }, + { + "epoch": 3.487896957583833, + "grad_norm": 0.4027846982490894, + "learning_rate": 4.896430617094389e-07, + "loss": 0.0284, + "step": 7853 + }, + { + "epoch": 3.4883411059293805, + "grad_norm": 0.41506130883345627, + "learning_rate": 4.888067537478519e-07, + "loss": 0.0253, + "step": 7854 + }, + { + "epoch": 3.488785254274928, + "grad_norm": 0.4526519663181836, + "learning_rate": 4.879711238952412e-07, + "loss": 0.0327, + "step": 7855 + }, + { + "epoch": 3.489229402620475, + "grad_norm": 0.3779652676361818, + "learning_rate": 4.871361722772166e-07, + "loss": 0.0336, + "step": 7856 + }, + { + "epoch": 3.489673550966023, + "grad_norm": 0.6066583572691198, + "learning_rate": 4.86301899019287e-07, + "loss": 0.0277, + "step": 7857 + }, + { + "epoch": 3.49011769931157, + "grad_norm": 0.516592567980747, + "learning_rate": 4.854683042468538e-07, + "loss": 0.0345, + "step": 7858 + }, + { + "epoch": 3.4905618476571174, + "grad_norm": 0.3300726184397244, + "learning_rate": 4.84635388085225e-07, + "loss": 0.0208, + "step": 7859 + }, + { + "epoch": 3.4910059960026647, + "grad_norm": 0.4075507938842562, + "learning_rate": 4.838031506595992e-07, + "loss": 0.0253, + "step": 7860 + }, + { + "epoch": 3.4914501443482124, + "grad_norm": 0.41601787060330486, + "learning_rate": 4.829715920950761e-07, + "loss": 0.0237, + "step": 7861 + }, + { + "epoch": 3.4918942926937597, + "grad_norm": 0.5863384888723406, + "learning_rate": 4.821407125166549e-07, + "loss": 0.0264, + "step": 7862 + }, + { + "epoch": 3.492338441039307, + "grad_norm": 0.47433808650477144, + "learning_rate": 4.81310512049229e-07, + "loss": 0.0285, + "step": 7863 + }, + { + "epoch": 3.4927825893848548, + "grad_norm": 0.4906763276486267, + "learning_rate": 4.804809908175911e-07, + "loss": 0.0251, + "step": 7864 + }, + { + "epoch": 3.493226737730402, + "grad_norm": 0.46826469445803337, + "learning_rate": 4.796521489464351e-07, + "loss": 0.0269, + "step": 7865 + }, + { + "epoch": 3.4936708860759493, + "grad_norm": 0.3869063106968133, + "learning_rate": 4.788239865603478e-07, + "loss": 0.0304, + "step": 7866 + }, + { + "epoch": 3.4941150344214966, + "grad_norm": 0.4155671863362175, + "learning_rate": 4.779965037838164e-07, + "loss": 0.0222, + "step": 7867 + }, + { + "epoch": 3.4945591827670444, + "grad_norm": 0.40399166433294564, + "learning_rate": 4.771697007412268e-07, + "loss": 0.0296, + "step": 7868 + }, + { + "epoch": 3.4950033311125916, + "grad_norm": 0.4132725467300106, + "learning_rate": 4.763435775568592e-07, + "loss": 0.0263, + "step": 7869 + }, + { + "epoch": 3.495447479458139, + "grad_norm": 0.40246185290766967, + "learning_rate": 4.7551813435489703e-07, + "loss": 0.0256, + "step": 7870 + }, + { + "epoch": 3.4958916278036867, + "grad_norm": 0.4363439376254432, + "learning_rate": 4.746933712594154e-07, + "loss": 0.0303, + "step": 7871 + }, + { + "epoch": 3.496335776149234, + "grad_norm": 0.346578595904209, + "learning_rate": 4.7386928839439183e-07, + "loss": 0.0216, + "step": 7872 + }, + { + "epoch": 3.4967799244947813, + "grad_norm": 0.40218292971261804, + "learning_rate": 4.7304588588370113e-07, + "loss": 0.022, + "step": 7873 + }, + { + "epoch": 3.4972240728403285, + "grad_norm": 0.3613752874610863, + "learning_rate": 4.722231638511121e-07, + "loss": 0.0188, + "step": 7874 + }, + { + "epoch": 3.497668221185876, + "grad_norm": 0.4305489342659995, + "learning_rate": 4.7140112242029356e-07, + "loss": 0.0244, + "step": 7875 + }, + { + "epoch": 3.4981123695314236, + "grad_norm": 0.3319846218251371, + "learning_rate": 4.7057976171481614e-07, + "loss": 0.0159, + "step": 7876 + }, + { + "epoch": 3.498556517876971, + "grad_norm": 0.3685184737755979, + "learning_rate": 4.69759081858141e-07, + "loss": 0.0248, + "step": 7877 + }, + { + "epoch": 3.499000666222518, + "grad_norm": 0.5404043718315673, + "learning_rate": 4.689390829736312e-07, + "loss": 0.031, + "step": 7878 + }, + { + "epoch": 3.499444814568066, + "grad_norm": 0.3833802780855093, + "learning_rate": 4.681197651845476e-07, + "loss": 0.0156, + "step": 7879 + }, + { + "epoch": 3.499888962913613, + "grad_norm": 0.387890807698906, + "learning_rate": 4.6730112861404497e-07, + "loss": 0.028, + "step": 7880 + }, + { + "epoch": 3.5003331112591605, + "grad_norm": 0.5278897289398781, + "learning_rate": 4.6648317338518045e-07, + "loss": 0.0305, + "step": 7881 + }, + { + "epoch": 3.5007772596047078, + "grad_norm": 0.429242176272356, + "learning_rate": 4.656658996209057e-07, + "loss": 0.0299, + "step": 7882 + }, + { + "epoch": 3.5012214079502555, + "grad_norm": 0.4369670295816076, + "learning_rate": 4.6484930744407074e-07, + "loss": 0.0327, + "step": 7883 + }, + { + "epoch": 3.501665556295803, + "grad_norm": 0.4901187441061219, + "learning_rate": 4.6403339697742413e-07, + "loss": 0.0228, + "step": 7884 + }, + { + "epoch": 3.50210970464135, + "grad_norm": 0.3420486455934448, + "learning_rate": 4.63218168343611e-07, + "loss": 0.0216, + "step": 7885 + }, + { + "epoch": 3.502553852986898, + "grad_norm": 0.40939983156980164, + "learning_rate": 4.624036216651723e-07, + "loss": 0.0263, + "step": 7886 + }, + { + "epoch": 3.502998001332445, + "grad_norm": 0.3910656718501315, + "learning_rate": 4.615897570645511e-07, + "loss": 0.0183, + "step": 7887 + }, + { + "epoch": 3.5034421496779924, + "grad_norm": 0.4314655959506559, + "learning_rate": 4.6077657466408245e-07, + "loss": 0.0312, + "step": 7888 + }, + { + "epoch": 3.5038862980235397, + "grad_norm": 0.3568409941841216, + "learning_rate": 4.599640745860029e-07, + "loss": 0.0271, + "step": 7889 + }, + { + "epoch": 3.5043304463690874, + "grad_norm": 0.4093953752586413, + "learning_rate": 4.5915225695244536e-07, + "loss": 0.0241, + "step": 7890 + }, + { + "epoch": 3.5047745947146347, + "grad_norm": 0.3977398657560647, + "learning_rate": 4.583411218854383e-07, + "loss": 0.0237, + "step": 7891 + }, + { + "epoch": 3.505218743060182, + "grad_norm": 0.4275747909847334, + "learning_rate": 4.575306695069087e-07, + "loss": 0.0301, + "step": 7892 + }, + { + "epoch": 3.5056628914057297, + "grad_norm": 0.5388719609229746, + "learning_rate": 4.567208999386852e-07, + "loss": 0.035, + "step": 7893 + }, + { + "epoch": 3.506107039751277, + "grad_norm": 0.36096697143744105, + "learning_rate": 4.5591181330248534e-07, + "loss": 0.0207, + "step": 7894 + }, + { + "epoch": 3.5065511880968243, + "grad_norm": 0.4142429440699439, + "learning_rate": 4.5510340971993086e-07, + "loss": 0.0186, + "step": 7895 + }, + { + "epoch": 3.5069953364423716, + "grad_norm": 0.4393049642057595, + "learning_rate": 4.542956893125394e-07, + "loss": 0.025, + "step": 7896 + }, + { + "epoch": 3.507439484787919, + "grad_norm": 0.6384180216184727, + "learning_rate": 4.534886522017229e-07, + "loss": 0.0266, + "step": 7897 + }, + { + "epoch": 3.5078836331334666, + "grad_norm": 0.43433435288410693, + "learning_rate": 4.526822985087931e-07, + "loss": 0.031, + "step": 7898 + }, + { + "epoch": 3.508327781479014, + "grad_norm": 0.4028630240642223, + "learning_rate": 4.5187662835495974e-07, + "loss": 0.0225, + "step": 7899 + }, + { + "epoch": 3.5087719298245617, + "grad_norm": 0.3939689206368112, + "learning_rate": 4.510716418613281e-07, + "loss": 0.0224, + "step": 7900 + }, + { + "epoch": 3.509216078170109, + "grad_norm": 0.515126016444997, + "learning_rate": 4.502673391489026e-07, + "loss": 0.029, + "step": 7901 + }, + { + "epoch": 3.5096602265156562, + "grad_norm": 0.4945699098085951, + "learning_rate": 4.4946372033858157e-07, + "loss": 0.028, + "step": 7902 + }, + { + "epoch": 3.5101043748612035, + "grad_norm": 0.43373798757249604, + "learning_rate": 4.486607855511627e-07, + "loss": 0.0272, + "step": 7903 + }, + { + "epoch": 3.510548523206751, + "grad_norm": 0.40314045697996204, + "learning_rate": 4.4785853490734277e-07, + "loss": 0.027, + "step": 7904 + }, + { + "epoch": 3.5109926715522985, + "grad_norm": 0.4288828381204982, + "learning_rate": 4.470569685277115e-07, + "loss": 0.021, + "step": 7905 + }, + { + "epoch": 3.511436819897846, + "grad_norm": 0.47380514557902603, + "learning_rate": 4.462560865327592e-07, + "loss": 0.0352, + "step": 7906 + }, + { + "epoch": 3.511880968243393, + "grad_norm": 0.6718664840187395, + "learning_rate": 4.454558890428728e-07, + "loss": 0.037, + "step": 7907 + }, + { + "epoch": 3.512325116588941, + "grad_norm": 0.42187384792878063, + "learning_rate": 4.446563761783329e-07, + "loss": 0.0265, + "step": 7908 + }, + { + "epoch": 3.512769264934488, + "grad_norm": 0.4115565734686187, + "learning_rate": 4.43857548059321e-07, + "loss": 0.0292, + "step": 7909 + }, + { + "epoch": 3.5132134132800354, + "grad_norm": 0.5047831437385433, + "learning_rate": 4.430594048059167e-07, + "loss": 0.0283, + "step": 7910 + }, + { + "epoch": 3.5136575616255827, + "grad_norm": 0.4497401686326858, + "learning_rate": 4.422619465380917e-07, + "loss": 0.0257, + "step": 7911 + }, + { + "epoch": 3.5141017099711305, + "grad_norm": 0.41364265915281817, + "learning_rate": 4.4146517337571857e-07, + "loss": 0.0219, + "step": 7912 + }, + { + "epoch": 3.5145458583166778, + "grad_norm": 0.5685124202462428, + "learning_rate": 4.4066908543856704e-07, + "loss": 0.0421, + "step": 7913 + }, + { + "epoch": 3.514990006662225, + "grad_norm": 0.31669420534645987, + "learning_rate": 4.3987368284630015e-07, + "loss": 0.0161, + "step": 7914 + }, + { + "epoch": 3.515434155007773, + "grad_norm": 0.3911190961806719, + "learning_rate": 4.3907896571848187e-07, + "loss": 0.0295, + "step": 7915 + }, + { + "epoch": 3.51587830335332, + "grad_norm": 0.49151230231883697, + "learning_rate": 4.382849341745715e-07, + "loss": 0.0323, + "step": 7916 + }, + { + "epoch": 3.5163224516988674, + "grad_norm": 0.3734463542020498, + "learning_rate": 4.3749158833392535e-07, + "loss": 0.0233, + "step": 7917 + }, + { + "epoch": 3.5167666000444147, + "grad_norm": 0.4234162479441925, + "learning_rate": 4.366989283157985e-07, + "loss": 0.0221, + "step": 7918 + }, + { + "epoch": 3.5172107483899624, + "grad_norm": 0.36581058726451665, + "learning_rate": 4.3590695423933795e-07, + "loss": 0.0232, + "step": 7919 + }, + { + "epoch": 3.5176548967355097, + "grad_norm": 0.41877616966330095, + "learning_rate": 4.3511566622359224e-07, + "loss": 0.032, + "step": 7920 + }, + { + "epoch": 3.518099045081057, + "grad_norm": 0.381478457217342, + "learning_rate": 4.3432506438750745e-07, + "loss": 0.0198, + "step": 7921 + }, + { + "epoch": 3.5185431934266047, + "grad_norm": 0.5312067893718723, + "learning_rate": 4.335351488499218e-07, + "loss": 0.0341, + "step": 7922 + }, + { + "epoch": 3.518987341772152, + "grad_norm": 0.32613641832900486, + "learning_rate": 4.327459197295736e-07, + "loss": 0.0153, + "step": 7923 + }, + { + "epoch": 3.5194314901176993, + "grad_norm": 0.4186436124064849, + "learning_rate": 4.319573771450991e-07, + "loss": 0.0239, + "step": 7924 + }, + { + "epoch": 3.5198756384632466, + "grad_norm": 0.5213492953580083, + "learning_rate": 4.3116952121502686e-07, + "loss": 0.0245, + "step": 7925 + }, + { + "epoch": 3.520319786808794, + "grad_norm": 0.4470067109711501, + "learning_rate": 4.303823520577871e-07, + "loss": 0.029, + "step": 7926 + }, + { + "epoch": 3.5207639351543416, + "grad_norm": 0.44926277605141984, + "learning_rate": 4.295958697917035e-07, + "loss": 0.0251, + "step": 7927 + }, + { + "epoch": 3.521208083499889, + "grad_norm": 0.4428984780991236, + "learning_rate": 4.288100745349988e-07, + "loss": 0.0283, + "step": 7928 + }, + { + "epoch": 3.5216522318454366, + "grad_norm": 0.4649455292363394, + "learning_rate": 4.2802496640579115e-07, + "loss": 0.0261, + "step": 7929 + }, + { + "epoch": 3.522096380190984, + "grad_norm": 0.4546600325025468, + "learning_rate": 4.2724054552209515e-07, + "loss": 0.0272, + "step": 7930 + }, + { + "epoch": 3.522540528536531, + "grad_norm": 0.44460414877352883, + "learning_rate": 4.2645681200182197e-07, + "loss": 0.02, + "step": 7931 + }, + { + "epoch": 3.5229846768820785, + "grad_norm": 0.3712603269325949, + "learning_rate": 4.256737659627813e-07, + "loss": 0.0252, + "step": 7932 + }, + { + "epoch": 3.523428825227626, + "grad_norm": 0.4419446907695471, + "learning_rate": 4.248914075226779e-07, + "loss": 0.0296, + "step": 7933 + }, + { + "epoch": 3.5238729735731735, + "grad_norm": 0.4371622849898781, + "learning_rate": 4.2410973679911317e-07, + "loss": 0.0193, + "step": 7934 + }, + { + "epoch": 3.524317121918721, + "grad_norm": 0.42263243970502656, + "learning_rate": 4.2332875390958707e-07, + "loss": 0.0284, + "step": 7935 + }, + { + "epoch": 3.524761270264268, + "grad_norm": 0.4360454559382049, + "learning_rate": 4.225484589714918e-07, + "loss": 0.0267, + "step": 7936 + }, + { + "epoch": 3.525205418609816, + "grad_norm": 0.4221730269522906, + "learning_rate": 4.2176885210212127e-07, + "loss": 0.0283, + "step": 7937 + }, + { + "epoch": 3.525649566955363, + "grad_norm": 0.34554558425959747, + "learning_rate": 4.209899334186623e-07, + "loss": 0.015, + "step": 7938 + }, + { + "epoch": 3.5260937153009104, + "grad_norm": 0.3944057611136553, + "learning_rate": 4.2021170303820025e-07, + "loss": 0.0247, + "step": 7939 + }, + { + "epoch": 3.5265378636464577, + "grad_norm": 0.3950267732155638, + "learning_rate": 4.1943416107771585e-07, + "loss": 0.0237, + "step": 7940 + }, + { + "epoch": 3.5269820119920055, + "grad_norm": 0.5276030208132706, + "learning_rate": 4.186573076540884e-07, + "loss": 0.0293, + "step": 7941 + }, + { + "epoch": 3.5274261603375527, + "grad_norm": 0.42130456420073215, + "learning_rate": 4.178811428840901e-07, + "loss": 0.0221, + "step": 7942 + }, + { + "epoch": 3.5278703086831, + "grad_norm": 0.4967582028112826, + "learning_rate": 4.1710566688439314e-07, + "loss": 0.0264, + "step": 7943 + }, + { + "epoch": 3.5283144570286478, + "grad_norm": 0.38558337467937864, + "learning_rate": 4.163308797715637e-07, + "loss": 0.0276, + "step": 7944 + }, + { + "epoch": 3.528758605374195, + "grad_norm": 0.5044853945425457, + "learning_rate": 4.155567816620659e-07, + "loss": 0.0204, + "step": 7945 + }, + { + "epoch": 3.5292027537197423, + "grad_norm": 0.8869805733234495, + "learning_rate": 4.147833726722611e-07, + "loss": 0.0291, + "step": 7946 + }, + { + "epoch": 3.5296469020652896, + "grad_norm": 0.3536249761317266, + "learning_rate": 4.140106529184035e-07, + "loss": 0.0235, + "step": 7947 + }, + { + "epoch": 3.5300910504108374, + "grad_norm": 0.4154592460581955, + "learning_rate": 4.1323862251664684e-07, + "loss": 0.0233, + "step": 7948 + }, + { + "epoch": 3.5305351987563847, + "grad_norm": 0.3504099993535625, + "learning_rate": 4.1246728158304107e-07, + "loss": 0.0187, + "step": 7949 + }, + { + "epoch": 3.530979347101932, + "grad_norm": 0.4437614260756121, + "learning_rate": 4.1169663023353124e-07, + "loss": 0.0262, + "step": 7950 + }, + { + "epoch": 3.5314234954474797, + "grad_norm": 0.3600199231026137, + "learning_rate": 4.109266685839597e-07, + "loss": 0.02, + "step": 7951 + }, + { + "epoch": 3.531867643793027, + "grad_norm": 0.3877433229131516, + "learning_rate": 4.101573967500655e-07, + "loss": 0.0236, + "step": 7952 + }, + { + "epoch": 3.5323117921385743, + "grad_norm": 0.39474586031469927, + "learning_rate": 4.0938881484748116e-07, + "loss": 0.0261, + "step": 7953 + }, + { + "epoch": 3.5327559404841216, + "grad_norm": 0.4148159595077121, + "learning_rate": 4.086209229917387e-07, + "loss": 0.0255, + "step": 7954 + }, + { + "epoch": 3.533200088829669, + "grad_norm": 0.47070012710839254, + "learning_rate": 4.0785372129826586e-07, + "loss": 0.0318, + "step": 7955 + }, + { + "epoch": 3.5336442371752166, + "grad_norm": 0.3680844624737353, + "learning_rate": 4.0708720988238584e-07, + "loss": 0.0166, + "step": 7956 + }, + { + "epoch": 3.534088385520764, + "grad_norm": 0.3875786535924789, + "learning_rate": 4.063213888593176e-07, + "loss": 0.0256, + "step": 7957 + }, + { + "epoch": 3.5345325338663116, + "grad_norm": 0.39292909474931037, + "learning_rate": 4.0555625834417857e-07, + "loss": 0.0244, + "step": 7958 + }, + { + "epoch": 3.534976682211859, + "grad_norm": 0.4238019487678821, + "learning_rate": 4.047918184519789e-07, + "loss": 0.0265, + "step": 7959 + }, + { + "epoch": 3.535420830557406, + "grad_norm": 0.3868503914395472, + "learning_rate": 4.040280692976278e-07, + "loss": 0.025, + "step": 7960 + }, + { + "epoch": 3.5358649789029535, + "grad_norm": 0.5061016125162293, + "learning_rate": 4.032650109959302e-07, + "loss": 0.0305, + "step": 7961 + }, + { + "epoch": 3.5363091272485008, + "grad_norm": 0.44915195593836177, + "learning_rate": 4.0250264366158643e-07, + "loss": 0.0227, + "step": 7962 + }, + { + "epoch": 3.5367532755940485, + "grad_norm": 0.3701090299363028, + "learning_rate": 4.017409674091932e-07, + "loss": 0.0275, + "step": 7963 + }, + { + "epoch": 3.537197423939596, + "grad_norm": 0.33071866294982566, + "learning_rate": 4.009799823532434e-07, + "loss": 0.0166, + "step": 7964 + }, + { + "epoch": 3.537641572285143, + "grad_norm": 0.3869700046276894, + "learning_rate": 4.0021968860812556e-07, + "loss": 0.025, + "step": 7965 + }, + { + "epoch": 3.538085720630691, + "grad_norm": 0.34946266928394254, + "learning_rate": 3.994600862881248e-07, + "loss": 0.0205, + "step": 7966 + }, + { + "epoch": 3.538529868976238, + "grad_norm": 0.391173218273872, + "learning_rate": 3.9870117550742273e-07, + "loss": 0.0263, + "step": 7967 + }, + { + "epoch": 3.5389740173217854, + "grad_norm": 0.3665430028174577, + "learning_rate": 3.9794295638009683e-07, + "loss": 0.022, + "step": 7968 + }, + { + "epoch": 3.5394181656673327, + "grad_norm": 0.3931083199354571, + "learning_rate": 3.971854290201205e-07, + "loss": 0.0233, + "step": 7969 + }, + { + "epoch": 3.5398623140128804, + "grad_norm": 0.4886210610765913, + "learning_rate": 3.964285935413609e-07, + "loss": 0.0213, + "step": 7970 + }, + { + "epoch": 3.5403064623584277, + "grad_norm": 0.3576029073891739, + "learning_rate": 3.9567245005758537e-07, + "loss": 0.023, + "step": 7971 + }, + { + "epoch": 3.540750610703975, + "grad_norm": 0.4994177352645637, + "learning_rate": 3.9491699868245414e-07, + "loss": 0.0419, + "step": 7972 + }, + { + "epoch": 3.5411947590495227, + "grad_norm": 0.43622296138249816, + "learning_rate": 3.941622395295247e-07, + "loss": 0.0318, + "step": 7973 + }, + { + "epoch": 3.54163890739507, + "grad_norm": 0.4309663417901952, + "learning_rate": 3.934081727122513e-07, + "loss": 0.0342, + "step": 7974 + }, + { + "epoch": 3.5420830557406173, + "grad_norm": 0.4349122435505889, + "learning_rate": 3.9265479834398103e-07, + "loss": 0.0286, + "step": 7975 + }, + { + "epoch": 3.5425272040861646, + "grad_norm": 0.4420907520326438, + "learning_rate": 3.919021165379594e-07, + "loss": 0.0192, + "step": 7976 + }, + { + "epoch": 3.5429713524317124, + "grad_norm": 0.40134602793452784, + "learning_rate": 3.911501274073276e-07, + "loss": 0.0222, + "step": 7977 + }, + { + "epoch": 3.5434155007772596, + "grad_norm": 0.4162128270891882, + "learning_rate": 3.9039883106512243e-07, + "loss": 0.0263, + "step": 7978 + }, + { + "epoch": 3.543859649122807, + "grad_norm": 0.43122424865043063, + "learning_rate": 3.8964822762427633e-07, + "loss": 0.0251, + "step": 7979 + }, + { + "epoch": 3.5443037974683547, + "grad_norm": 0.351905898436228, + "learning_rate": 3.888983171976185e-07, + "loss": 0.0225, + "step": 7980 + }, + { + "epoch": 3.544747945813902, + "grad_norm": 0.37615773316072754, + "learning_rate": 3.8814909989787155e-07, + "loss": 0.02, + "step": 7981 + }, + { + "epoch": 3.5451920941594492, + "grad_norm": 0.42395585895532933, + "learning_rate": 3.87400575837657e-07, + "loss": 0.0202, + "step": 7982 + }, + { + "epoch": 3.5456362425049965, + "grad_norm": 0.3990970301083849, + "learning_rate": 3.8665274512948994e-07, + "loss": 0.0201, + "step": 7983 + }, + { + "epoch": 3.546080390850544, + "grad_norm": 0.5764101771289384, + "learning_rate": 3.859056078857826e-07, + "loss": 0.0381, + "step": 7984 + }, + { + "epoch": 3.5465245391960916, + "grad_norm": 0.3820488932414435, + "learning_rate": 3.851591642188418e-07, + "loss": 0.0242, + "step": 7985 + }, + { + "epoch": 3.546968687541639, + "grad_norm": 0.3459706819898099, + "learning_rate": 3.8441341424087233e-07, + "loss": 0.0238, + "step": 7986 + }, + { + "epoch": 3.5474128358871866, + "grad_norm": 0.4227398045664523, + "learning_rate": 3.836683580639705e-07, + "loss": 0.0244, + "step": 7987 + }, + { + "epoch": 3.547856984232734, + "grad_norm": 0.4309060643690936, + "learning_rate": 3.829239958001324e-07, + "loss": 0.0194, + "step": 7988 + }, + { + "epoch": 3.548301132578281, + "grad_norm": 0.42116836328880336, + "learning_rate": 3.8218032756124844e-07, + "loss": 0.0204, + "step": 7989 + }, + { + "epoch": 3.5487452809238285, + "grad_norm": 0.37821657931686165, + "learning_rate": 3.814373534591037e-07, + "loss": 0.0194, + "step": 7990 + }, + { + "epoch": 3.5491894292693758, + "grad_norm": 0.42284081107970184, + "learning_rate": 3.8069507360538163e-07, + "loss": 0.0221, + "step": 7991 + }, + { + "epoch": 3.5496335776149235, + "grad_norm": 0.4662610237314549, + "learning_rate": 3.799534881116573e-07, + "loss": 0.0276, + "step": 7992 + }, + { + "epoch": 3.5500777259604708, + "grad_norm": 0.45003555474688933, + "learning_rate": 3.7921259708940503e-07, + "loss": 0.0274, + "step": 7993 + }, + { + "epoch": 3.550521874306018, + "grad_norm": 0.5571173920439147, + "learning_rate": 3.7847240064999233e-07, + "loss": 0.0294, + "step": 7994 + }, + { + "epoch": 3.550966022651566, + "grad_norm": 0.4728510369440864, + "learning_rate": 3.7773289890468414e-07, + "loss": 0.026, + "step": 7995 + }, + { + "epoch": 3.551410170997113, + "grad_norm": 0.3553733406102519, + "learning_rate": 3.7699409196463977e-07, + "loss": 0.0175, + "step": 7996 + }, + { + "epoch": 3.5518543193426604, + "grad_norm": 0.4021957600650995, + "learning_rate": 3.762559799409149e-07, + "loss": 0.0268, + "step": 7997 + }, + { + "epoch": 3.5522984676882077, + "grad_norm": 0.5246882857506893, + "learning_rate": 3.7551856294445967e-07, + "loss": 0.0243, + "step": 7998 + }, + { + "epoch": 3.5527426160337554, + "grad_norm": 0.4805020243101749, + "learning_rate": 3.7478184108612036e-07, + "loss": 0.0185, + "step": 7999 + }, + { + "epoch": 3.5531867643793027, + "grad_norm": 0.49276293248029474, + "learning_rate": 3.74045814476639e-07, + "loss": 0.0322, + "step": 8000 + }, + { + "epoch": 3.55363091272485, + "grad_norm": 0.5382419991792012, + "learning_rate": 3.733104832266532e-07, + "loss": 0.0273, + "step": 8001 + }, + { + "epoch": 3.5540750610703977, + "grad_norm": 0.4236743982252115, + "learning_rate": 3.7257584744669615e-07, + "loss": 0.024, + "step": 8002 + }, + { + "epoch": 3.554519209415945, + "grad_norm": 0.3569656738216824, + "learning_rate": 3.718419072471946e-07, + "loss": 0.0206, + "step": 8003 + }, + { + "epoch": 3.5549633577614923, + "grad_norm": 0.6957355605647527, + "learning_rate": 3.7110866273847356e-07, + "loss": 0.0286, + "step": 8004 + }, + { + "epoch": 3.5554075061070396, + "grad_norm": 0.39146983502282784, + "learning_rate": 3.70376114030751e-07, + "loss": 0.0211, + "step": 8005 + }, + { + "epoch": 3.5558516544525873, + "grad_norm": 0.4183758252296795, + "learning_rate": 3.696442612341422e-07, + "loss": 0.023, + "step": 8006 + }, + { + "epoch": 3.5562958027981346, + "grad_norm": 0.4440464207564376, + "learning_rate": 3.6891310445865693e-07, + "loss": 0.0284, + "step": 8007 + }, + { + "epoch": 3.556739951143682, + "grad_norm": 0.3721283097773306, + "learning_rate": 3.681826438142011e-07, + "loss": 0.0228, + "step": 8008 + }, + { + "epoch": 3.5571840994892296, + "grad_norm": 0.4139366078303485, + "learning_rate": 3.6745287941057417e-07, + "loss": 0.0229, + "step": 8009 + }, + { + "epoch": 3.557628247834777, + "grad_norm": 0.5055064614278328, + "learning_rate": 3.6672381135747284e-07, + "loss": 0.0278, + "step": 8010 + }, + { + "epoch": 3.5580723961803242, + "grad_norm": 0.44529281390449127, + "learning_rate": 3.6599543976448884e-07, + "loss": 0.0261, + "step": 8011 + }, + { + "epoch": 3.5585165445258715, + "grad_norm": 0.4907105184886079, + "learning_rate": 3.6526776474110627e-07, + "loss": 0.0244, + "step": 8012 + }, + { + "epoch": 3.558960692871419, + "grad_norm": 0.4951645075676849, + "learning_rate": 3.645407863967104e-07, + "loss": 0.037, + "step": 8013 + }, + { + "epoch": 3.5594048412169665, + "grad_norm": 0.37863500443452736, + "learning_rate": 3.6381450484057777e-07, + "loss": 0.0252, + "step": 8014 + }, + { + "epoch": 3.559848989562514, + "grad_norm": 0.39735984529923846, + "learning_rate": 3.630889201818788e-07, + "loss": 0.0206, + "step": 8015 + }, + { + "epoch": 3.5602931379080616, + "grad_norm": 0.4346987177337589, + "learning_rate": 3.623640325296829e-07, + "loss": 0.0257, + "step": 8016 + }, + { + "epoch": 3.560737286253609, + "grad_norm": 0.33730436829577654, + "learning_rate": 3.616398419929523e-07, + "loss": 0.0189, + "step": 8017 + }, + { + "epoch": 3.561181434599156, + "grad_norm": 0.3665779162327361, + "learning_rate": 3.6091634868054557e-07, + "loss": 0.0189, + "step": 8018 + }, + { + "epoch": 3.5616255829447034, + "grad_norm": 0.3886589674388881, + "learning_rate": 3.601935527012168e-07, + "loss": 0.0225, + "step": 8019 + }, + { + "epoch": 3.5620697312902507, + "grad_norm": 0.46005381668223383, + "learning_rate": 3.594714541636124e-07, + "loss": 0.0251, + "step": 8020 + }, + { + "epoch": 3.5625138796357985, + "grad_norm": 0.437136127998239, + "learning_rate": 3.5875005317627776e-07, + "loss": 0.0336, + "step": 8021 + }, + { + "epoch": 3.5629580279813458, + "grad_norm": 0.41144565522854637, + "learning_rate": 3.580293498476517e-07, + "loss": 0.0321, + "step": 8022 + }, + { + "epoch": 3.563402176326893, + "grad_norm": 0.4402714548710384, + "learning_rate": 3.573093442860659e-07, + "loss": 0.0218, + "step": 8023 + }, + { + "epoch": 3.563846324672441, + "grad_norm": 0.46313910275690856, + "learning_rate": 3.565900365997521e-07, + "loss": 0.038, + "step": 8024 + }, + { + "epoch": 3.564290473017988, + "grad_norm": 0.4585923557925007, + "learning_rate": 3.558714268968344e-07, + "loss": 0.0242, + "step": 8025 + }, + { + "epoch": 3.5647346213635354, + "grad_norm": 0.40511430211397, + "learning_rate": 3.5515351528533024e-07, + "loss": 0.0217, + "step": 8026 + }, + { + "epoch": 3.5651787697090827, + "grad_norm": 0.4897536707816401, + "learning_rate": 3.5443630187315504e-07, + "loss": 0.0293, + "step": 8027 + }, + { + "epoch": 3.5656229180546304, + "grad_norm": 0.3947409838204125, + "learning_rate": 3.537197867681191e-07, + "loss": 0.0231, + "step": 8028 + }, + { + "epoch": 3.5660670664001777, + "grad_norm": 0.42936961733669526, + "learning_rate": 3.5300397007792364e-07, + "loss": 0.0266, + "step": 8029 + }, + { + "epoch": 3.566511214745725, + "grad_norm": 0.3453771987941547, + "learning_rate": 3.5228885191017084e-07, + "loss": 0.0181, + "step": 8030 + }, + { + "epoch": 3.5669553630912727, + "grad_norm": 0.38903220671574995, + "learning_rate": 3.515744323723558e-07, + "loss": 0.0254, + "step": 8031 + }, + { + "epoch": 3.56739951143682, + "grad_norm": 0.39781787732509344, + "learning_rate": 3.508607115718654e-07, + "loss": 0.0181, + "step": 8032 + }, + { + "epoch": 3.5678436597823673, + "grad_norm": 0.42441099532371085, + "learning_rate": 3.50147689615985e-07, + "loss": 0.0222, + "step": 8033 + }, + { + "epoch": 3.5682878081279146, + "grad_norm": 0.38842537901922425, + "learning_rate": 3.494353666118938e-07, + "loss": 0.0378, + "step": 8034 + }, + { + "epoch": 3.568731956473462, + "grad_norm": 0.35803044170961684, + "learning_rate": 3.4872374266666674e-07, + "loss": 0.0241, + "step": 8035 + }, + { + "epoch": 3.5691761048190096, + "grad_norm": 0.41680398082335124, + "learning_rate": 3.4801281788727326e-07, + "loss": 0.0226, + "step": 8036 + }, + { + "epoch": 3.569620253164557, + "grad_norm": 0.4978684696151685, + "learning_rate": 3.4730259238057563e-07, + "loss": 0.0265, + "step": 8037 + }, + { + "epoch": 3.5700644015101046, + "grad_norm": 0.526050521323082, + "learning_rate": 3.46593066253334e-07, + "loss": 0.0241, + "step": 8038 + }, + { + "epoch": 3.570508549855652, + "grad_norm": 0.4100317457053481, + "learning_rate": 3.4588423961220306e-07, + "loss": 0.0244, + "step": 8039 + }, + { + "epoch": 3.570952698201199, + "grad_norm": 0.4007810456784438, + "learning_rate": 3.4517611256372875e-07, + "loss": 0.0207, + "step": 8040 + }, + { + "epoch": 3.5713968465467465, + "grad_norm": 0.44507883967406275, + "learning_rate": 3.444686852143575e-07, + "loss": 0.0203, + "step": 8041 + }, + { + "epoch": 3.571840994892294, + "grad_norm": 0.4110302975726491, + "learning_rate": 3.4376195767042706e-07, + "loss": 0.0196, + "step": 8042 + }, + { + "epoch": 3.5722851432378415, + "grad_norm": 0.42428375307760136, + "learning_rate": 3.4305593003816917e-07, + "loss": 0.0239, + "step": 8043 + }, + { + "epoch": 3.572729291583389, + "grad_norm": 0.4068665526914318, + "learning_rate": 3.423506024237122e-07, + "loss": 0.0251, + "step": 8044 + }, + { + "epoch": 3.573173439928936, + "grad_norm": 0.5621905349071596, + "learning_rate": 3.416459749330808e-07, + "loss": 0.0436, + "step": 8045 + }, + { + "epoch": 3.573617588274484, + "grad_norm": 0.4013053666322752, + "learning_rate": 3.409420476721892e-07, + "loss": 0.0233, + "step": 8046 + }, + { + "epoch": 3.574061736620031, + "grad_norm": 0.40409509056363074, + "learning_rate": 3.4023882074685266e-07, + "loss": 0.0196, + "step": 8047 + }, + { + "epoch": 3.5745058849655784, + "grad_norm": 0.401583653285129, + "learning_rate": 3.3953629426277666e-07, + "loss": 0.0254, + "step": 8048 + }, + { + "epoch": 3.5749500333111257, + "grad_norm": 0.39247780188844755, + "learning_rate": 3.3883446832556286e-07, + "loss": 0.0226, + "step": 8049 + }, + { + "epoch": 3.5753941816566734, + "grad_norm": 0.4595707850818669, + "learning_rate": 3.381333430407074e-07, + "loss": 0.0331, + "step": 8050 + }, + { + "epoch": 3.5758383300022207, + "grad_norm": 0.38262273059202667, + "learning_rate": 3.3743291851360215e-07, + "loss": 0.032, + "step": 8051 + }, + { + "epoch": 3.576282478347768, + "grad_norm": 0.5184355648640612, + "learning_rate": 3.3673319484953224e-07, + "loss": 0.0535, + "step": 8052 + }, + { + "epoch": 3.5767266266933158, + "grad_norm": 0.4837920320850102, + "learning_rate": 3.3603417215367916e-07, + "loss": 0.028, + "step": 8053 + }, + { + "epoch": 3.577170775038863, + "grad_norm": 0.4493839583313447, + "learning_rate": 3.3533585053111604e-07, + "loss": 0.0239, + "step": 8054 + }, + { + "epoch": 3.5776149233844103, + "grad_norm": 0.45676379041549287, + "learning_rate": 3.346382300868134e-07, + "loss": 0.0183, + "step": 8055 + }, + { + "epoch": 3.5780590717299576, + "grad_norm": 0.42408641826733806, + "learning_rate": 3.339413109256362e-07, + "loss": 0.019, + "step": 8056 + }, + { + "epoch": 3.5785032200755054, + "grad_norm": 0.41032911192121174, + "learning_rate": 3.3324509315234066e-07, + "loss": 0.0243, + "step": 8057 + }, + { + "epoch": 3.5789473684210527, + "grad_norm": 0.3781126839349984, + "learning_rate": 3.325495768715831e-07, + "loss": 0.023, + "step": 8058 + }, + { + "epoch": 3.5793915167666, + "grad_norm": 0.48422608200816336, + "learning_rate": 3.318547621879109e-07, + "loss": 0.0209, + "step": 8059 + }, + { + "epoch": 3.5798356651121477, + "grad_norm": 0.5803863155007879, + "learning_rate": 3.311606492057651e-07, + "loss": 0.03, + "step": 8060 + }, + { + "epoch": 3.580279813457695, + "grad_norm": 0.4295397381772821, + "learning_rate": 3.304672380294832e-07, + "loss": 0.0276, + "step": 8061 + }, + { + "epoch": 3.5807239618032423, + "grad_norm": 0.4359738260640305, + "learning_rate": 3.2977452876329806e-07, + "loss": 0.0226, + "step": 8062 + }, + { + "epoch": 3.5811681101487896, + "grad_norm": 0.6578716400312555, + "learning_rate": 3.290825215113325e-07, + "loss": 0.0317, + "step": 8063 + }, + { + "epoch": 3.581612258494337, + "grad_norm": 0.31354942464141744, + "learning_rate": 3.2839121637761095e-07, + "loss": 0.021, + "step": 8064 + }, + { + "epoch": 3.5820564068398846, + "grad_norm": 0.3962803838449033, + "learning_rate": 3.277006134660454e-07, + "loss": 0.0279, + "step": 8065 + }, + { + "epoch": 3.582500555185432, + "grad_norm": 0.4214624266956511, + "learning_rate": 3.270107128804462e-07, + "loss": 0.0344, + "step": 8066 + }, + { + "epoch": 3.5829447035309796, + "grad_norm": 0.3602571631127595, + "learning_rate": 3.26321514724518e-07, + "loss": 0.0228, + "step": 8067 + }, + { + "epoch": 3.583388851876527, + "grad_norm": 0.37827106337184596, + "learning_rate": 3.2563301910185585e-07, + "loss": 0.0229, + "step": 8068 + }, + { + "epoch": 3.583833000222074, + "grad_norm": 0.3969145699088532, + "learning_rate": 3.249452261159558e-07, + "loss": 0.0216, + "step": 8069 + }, + { + "epoch": 3.5842771485676215, + "grad_norm": 0.3539428309291324, + "learning_rate": 3.242581358702046e-07, + "loss": 0.0219, + "step": 8070 + }, + { + "epoch": 3.5847212969131688, + "grad_norm": 0.46162564562225794, + "learning_rate": 3.235717484678813e-07, + "loss": 0.0219, + "step": 8071 + }, + { + "epoch": 3.5851654452587165, + "grad_norm": 0.395257543151881, + "learning_rate": 3.2288606401216283e-07, + "loss": 0.0283, + "step": 8072 + }, + { + "epoch": 3.585609593604264, + "grad_norm": 0.3543800684377955, + "learning_rate": 3.2220108260612e-07, + "loss": 0.022, + "step": 8073 + }, + { + "epoch": 3.586053741949811, + "grad_norm": 0.3956501467663245, + "learning_rate": 3.2151680435271504e-07, + "loss": 0.0227, + "step": 8074 + }, + { + "epoch": 3.586497890295359, + "grad_norm": 0.6908876097601867, + "learning_rate": 3.208332293548094e-07, + "loss": 0.0402, + "step": 8075 + }, + { + "epoch": 3.586942038640906, + "grad_norm": 0.41210563747436485, + "learning_rate": 3.2015035771515377e-07, + "loss": 0.0207, + "step": 8076 + }, + { + "epoch": 3.5873861869864534, + "grad_norm": 0.4210761069284605, + "learning_rate": 3.1946818953639604e-07, + "loss": 0.0199, + "step": 8077 + }, + { + "epoch": 3.5878303353320007, + "grad_norm": 0.562680504099952, + "learning_rate": 3.1878672492107796e-07, + "loss": 0.0333, + "step": 8078 + }, + { + "epoch": 3.5882744836775484, + "grad_norm": 0.45181012070367105, + "learning_rate": 3.181059639716355e-07, + "loss": 0.0292, + "step": 8079 + }, + { + "epoch": 3.5887186320230957, + "grad_norm": 0.3560150149469517, + "learning_rate": 3.1742590679039675e-07, + "loss": 0.0263, + "step": 8080 + }, + { + "epoch": 3.589162780368643, + "grad_norm": 0.2727495861254254, + "learning_rate": 3.167465534795888e-07, + "loss": 0.0142, + "step": 8081 + }, + { + "epoch": 3.5896069287141907, + "grad_norm": 0.3723555834846713, + "learning_rate": 3.1606790414132784e-07, + "loss": 0.0204, + "step": 8082 + }, + { + "epoch": 3.590051077059738, + "grad_norm": 0.3414027162319298, + "learning_rate": 3.153899588776266e-07, + "loss": 0.0229, + "step": 8083 + }, + { + "epoch": 3.5904952254052853, + "grad_norm": 0.42338168106939433, + "learning_rate": 3.147127177903936e-07, + "loss": 0.0232, + "step": 8084 + }, + { + "epoch": 3.5909393737508326, + "grad_norm": 0.41145971331036213, + "learning_rate": 3.1403618098142683e-07, + "loss": 0.0196, + "step": 8085 + }, + { + "epoch": 3.5913835220963803, + "grad_norm": 0.4400619534919001, + "learning_rate": 3.133603485524217e-07, + "loss": 0.0305, + "step": 8086 + }, + { + "epoch": 3.5918276704419276, + "grad_norm": 0.3445253112185815, + "learning_rate": 3.126852206049702e-07, + "loss": 0.0184, + "step": 8087 + }, + { + "epoch": 3.592271818787475, + "grad_norm": 0.4780681637104892, + "learning_rate": 3.1201079724055284e-07, + "loss": 0.0217, + "step": 8088 + }, + { + "epoch": 3.5927159671330227, + "grad_norm": 0.3835860710733381, + "learning_rate": 3.113370785605474e-07, + "loss": 0.0177, + "step": 8089 + }, + { + "epoch": 3.59316011547857, + "grad_norm": 0.36022249138379353, + "learning_rate": 3.106640646662268e-07, + "loss": 0.0229, + "step": 8090 + }, + { + "epoch": 3.5936042638241172, + "grad_norm": 0.3513194805512465, + "learning_rate": 3.099917556587534e-07, + "loss": 0.0183, + "step": 8091 + }, + { + "epoch": 3.5940484121696645, + "grad_norm": 0.44275629565228397, + "learning_rate": 3.0932015163918973e-07, + "loss": 0.0295, + "step": 8092 + }, + { + "epoch": 3.594492560515212, + "grad_norm": 0.4202067622758764, + "learning_rate": 3.0864925270848725e-07, + "loss": 0.022, + "step": 8093 + }, + { + "epoch": 3.5949367088607596, + "grad_norm": 0.35964770608777313, + "learning_rate": 3.079790589674947e-07, + "loss": 0.019, + "step": 8094 + }, + { + "epoch": 3.595380857206307, + "grad_norm": 0.33143293773682697, + "learning_rate": 3.073095705169532e-07, + "loss": 0.0174, + "step": 8095 + }, + { + "epoch": 3.5958250055518546, + "grad_norm": 0.35370082577225564, + "learning_rate": 3.066407874574978e-07, + "loss": 0.0245, + "step": 8096 + }, + { + "epoch": 3.596269153897402, + "grad_norm": 0.46712833979581136, + "learning_rate": 3.05972709889657e-07, + "loss": 0.0248, + "step": 8097 + }, + { + "epoch": 3.596713302242949, + "grad_norm": 0.38025100976241183, + "learning_rate": 3.0530533791385765e-07, + "loss": 0.0244, + "step": 8098 + }, + { + "epoch": 3.5971574505884965, + "grad_norm": 0.342220120184616, + "learning_rate": 3.0463867163041396e-07, + "loss": 0.0155, + "step": 8099 + }, + { + "epoch": 3.5976015989340437, + "grad_norm": 0.36933627653900136, + "learning_rate": 3.0397271113953796e-07, + "loss": 0.0209, + "step": 8100 + }, + { + "epoch": 3.5980457472795915, + "grad_norm": 0.3882041575523787, + "learning_rate": 3.0330745654133576e-07, + "loss": 0.0289, + "step": 8101 + }, + { + "epoch": 3.5984898956251388, + "grad_norm": 0.39071440323579304, + "learning_rate": 3.026429079358051e-07, + "loss": 0.0225, + "step": 8102 + }, + { + "epoch": 3.598934043970686, + "grad_norm": 0.5224321778667499, + "learning_rate": 3.0197906542283996e-07, + "loss": 0.0366, + "step": 8103 + }, + { + "epoch": 3.599378192316234, + "grad_norm": 0.463268142143758, + "learning_rate": 3.013159291022261e-07, + "loss": 0.0254, + "step": 8104 + }, + { + "epoch": 3.599822340661781, + "grad_norm": 0.4758792312691063, + "learning_rate": 3.006534990736448e-07, + "loss": 0.0177, + "step": 8105 + }, + { + "epoch": 3.6002664890073284, + "grad_norm": 0.3801891704764608, + "learning_rate": 2.99991775436671e-07, + "loss": 0.0285, + "step": 8106 + }, + { + "epoch": 3.6007106373528757, + "grad_norm": 0.47832635752425606, + "learning_rate": 2.993307582907728e-07, + "loss": 0.0393, + "step": 8107 + }, + { + "epoch": 3.6011547856984234, + "grad_norm": 0.4174887456808177, + "learning_rate": 2.9867044773531083e-07, + "loss": 0.0304, + "step": 8108 + }, + { + "epoch": 3.6015989340439707, + "grad_norm": 0.4160783686882648, + "learning_rate": 2.9801084386954337e-07, + "loss": 0.0224, + "step": 8109 + }, + { + "epoch": 3.602043082389518, + "grad_norm": 0.3728837062303793, + "learning_rate": 2.9735194679261835e-07, + "loss": 0.0212, + "step": 8110 + }, + { + "epoch": 3.6024872307350657, + "grad_norm": 0.7022594125847725, + "learning_rate": 2.966937566035799e-07, + "loss": 0.0301, + "step": 8111 + }, + { + "epoch": 3.602931379080613, + "grad_norm": 0.4052906698509884, + "learning_rate": 2.9603627340136553e-07, + "loss": 0.0293, + "step": 8112 + }, + { + "epoch": 3.6033755274261603, + "grad_norm": 0.3921299918104816, + "learning_rate": 2.953794972848051e-07, + "loss": 0.025, + "step": 8113 + }, + { + "epoch": 3.6038196757717076, + "grad_norm": 0.406918068376974, + "learning_rate": 2.947234283526229e-07, + "loss": 0.0278, + "step": 8114 + }, + { + "epoch": 3.6042638241172553, + "grad_norm": 0.4175532634627423, + "learning_rate": 2.940680667034396e-07, + "loss": 0.0318, + "step": 8115 + }, + { + "epoch": 3.6047079724628026, + "grad_norm": 0.43540375334253767, + "learning_rate": 2.934134124357646e-07, + "loss": 0.0268, + "step": 8116 + }, + { + "epoch": 3.60515212080835, + "grad_norm": 0.4718399188563787, + "learning_rate": 2.927594656480054e-07, + "loss": 0.0319, + "step": 8117 + }, + { + "epoch": 3.6055962691538976, + "grad_norm": 0.4746986163908478, + "learning_rate": 2.921062264384605e-07, + "loss": 0.0239, + "step": 8118 + }, + { + "epoch": 3.606040417499445, + "grad_norm": 0.43178883447784155, + "learning_rate": 2.914536949053226e-07, + "loss": 0.0252, + "step": 8119 + }, + { + "epoch": 3.606484565844992, + "grad_norm": 0.44193392412448756, + "learning_rate": 2.908018711466787e-07, + "loss": 0.0293, + "step": 8120 + }, + { + "epoch": 3.6069287141905395, + "grad_norm": 0.361976350777308, + "learning_rate": 2.901507552605087e-07, + "loss": 0.0155, + "step": 8121 + }, + { + "epoch": 3.607372862536087, + "grad_norm": 0.4277695581236822, + "learning_rate": 2.895003473446861e-07, + "loss": 0.0286, + "step": 8122 + }, + { + "epoch": 3.6078170108816345, + "grad_norm": 0.4459278029555017, + "learning_rate": 2.8885064749697987e-07, + "loss": 0.028, + "step": 8123 + }, + { + "epoch": 3.608261159227182, + "grad_norm": 0.566646999620044, + "learning_rate": 2.882016558150491e-07, + "loss": 0.0364, + "step": 8124 + }, + { + "epoch": 3.6087053075727296, + "grad_norm": 0.4313671970565694, + "learning_rate": 2.87553372396448e-07, + "loss": 0.031, + "step": 8125 + }, + { + "epoch": 3.609149455918277, + "grad_norm": 0.40292145142350155, + "learning_rate": 2.869057973386269e-07, + "loss": 0.0272, + "step": 8126 + }, + { + "epoch": 3.609593604263824, + "grad_norm": 0.41179464266273536, + "learning_rate": 2.8625893073892577e-07, + "loss": 0.0208, + "step": 8127 + }, + { + "epoch": 3.6100377526093714, + "grad_norm": 0.4344939010412826, + "learning_rate": 2.85612772694579e-07, + "loss": 0.0246, + "step": 8128 + }, + { + "epoch": 3.6104819009549187, + "grad_norm": 0.4503615177393324, + "learning_rate": 2.8496732330271726e-07, + "loss": 0.0318, + "step": 8129 + }, + { + "epoch": 3.6109260493004665, + "grad_norm": 0.49743560082851446, + "learning_rate": 2.8432258266036016e-07, + "loss": 0.0242, + "step": 8130 + }, + { + "epoch": 3.6113701976460137, + "grad_norm": 0.418557316697792, + "learning_rate": 2.8367855086442353e-07, + "loss": 0.0279, + "step": 8131 + }, + { + "epoch": 3.611814345991561, + "grad_norm": 0.3450331986237824, + "learning_rate": 2.830352280117188e-07, + "loss": 0.022, + "step": 8132 + }, + { + "epoch": 3.6122584943371088, + "grad_norm": 0.44492026164748794, + "learning_rate": 2.8239261419894526e-07, + "loss": 0.0323, + "step": 8133 + }, + { + "epoch": 3.612702642682656, + "grad_norm": 0.39841298833512895, + "learning_rate": 2.8175070952270014e-07, + "loss": 0.0285, + "step": 8134 + }, + { + "epoch": 3.6131467910282034, + "grad_norm": 0.43301418955026805, + "learning_rate": 2.811095140794734e-07, + "loss": 0.0274, + "step": 8135 + }, + { + "epoch": 3.6135909393737506, + "grad_norm": 0.5096476202086357, + "learning_rate": 2.804690279656458e-07, + "loss": 0.051, + "step": 8136 + }, + { + "epoch": 3.6140350877192984, + "grad_norm": 0.3293367472926153, + "learning_rate": 2.7982925127749416e-07, + "loss": 0.0227, + "step": 8137 + }, + { + "epoch": 3.6144792360648457, + "grad_norm": 0.37776627801259316, + "learning_rate": 2.791901841111877e-07, + "loss": 0.0254, + "step": 8138 + }, + { + "epoch": 3.614923384410393, + "grad_norm": 0.40922593503138716, + "learning_rate": 2.78551826562789e-07, + "loss": 0.045, + "step": 8139 + }, + { + "epoch": 3.6153675327559407, + "grad_norm": 0.42201659981374173, + "learning_rate": 2.779141787282547e-07, + "loss": 0.0251, + "step": 8140 + }, + { + "epoch": 3.615811681101488, + "grad_norm": 0.4053495089948873, + "learning_rate": 2.7727724070343296e-07, + "loss": 0.023, + "step": 8141 + }, + { + "epoch": 3.6162558294470353, + "grad_norm": 0.4556874573880963, + "learning_rate": 2.7664101258406626e-07, + "loss": 0.0227, + "step": 8142 + }, + { + "epoch": 3.6166999777925826, + "grad_norm": 0.3450822468016936, + "learning_rate": 2.7600549446579306e-07, + "loss": 0.018, + "step": 8143 + }, + { + "epoch": 3.6171441261381303, + "grad_norm": 0.35001380471776244, + "learning_rate": 2.753706864441391e-07, + "loss": 0.0232, + "step": 8144 + }, + { + "epoch": 3.6175882744836776, + "grad_norm": 0.364015851517707, + "learning_rate": 2.7473658861452923e-07, + "loss": 0.0185, + "step": 8145 + }, + { + "epoch": 3.618032422829225, + "grad_norm": 0.4713893596705623, + "learning_rate": 2.741032010722788e-07, + "loss": 0.0337, + "step": 8146 + }, + { + "epoch": 3.6184765711747726, + "grad_norm": 0.3440041200074425, + "learning_rate": 2.734705239125951e-07, + "loss": 0.0197, + "step": 8147 + }, + { + "epoch": 3.61892071952032, + "grad_norm": 0.41811195093373094, + "learning_rate": 2.728385572305814e-07, + "loss": 0.0278, + "step": 8148 + }, + { + "epoch": 3.619364867865867, + "grad_norm": 0.339333589639759, + "learning_rate": 2.7220730112123337e-07, + "loss": 0.0187, + "step": 8149 + }, + { + "epoch": 3.6198090162114145, + "grad_norm": 0.7462572452071613, + "learning_rate": 2.715767556794391e-07, + "loss": 0.0425, + "step": 8150 + }, + { + "epoch": 3.620253164556962, + "grad_norm": 0.5298727144349012, + "learning_rate": 2.7094692099997986e-07, + "loss": 0.0308, + "step": 8151 + }, + { + "epoch": 3.6206973129025095, + "grad_norm": 0.411102053477688, + "learning_rate": 2.7031779717753223e-07, + "loss": 0.0249, + "step": 8152 + }, + { + "epoch": 3.621141461248057, + "grad_norm": 0.4354411151955604, + "learning_rate": 2.696893843066617e-07, + "loss": 0.0307, + "step": 8153 + }, + { + "epoch": 3.6215856095936045, + "grad_norm": 0.3792115584655087, + "learning_rate": 2.6906168248183095e-07, + "loss": 0.0248, + "step": 8154 + }, + { + "epoch": 3.622029757939152, + "grad_norm": 0.3457882220586487, + "learning_rate": 2.68434691797394e-07, + "loss": 0.0197, + "step": 8155 + }, + { + "epoch": 3.622473906284699, + "grad_norm": 0.5361251704803487, + "learning_rate": 2.6780841234759826e-07, + "loss": 0.0382, + "step": 8156 + }, + { + "epoch": 3.6229180546302464, + "grad_norm": 0.45121861235515354, + "learning_rate": 2.6718284422658447e-07, + "loss": 0.0226, + "step": 8157 + }, + { + "epoch": 3.6233622029757937, + "grad_norm": 0.3726443210546779, + "learning_rate": 2.665579875283847e-07, + "loss": 0.0223, + "step": 8158 + }, + { + "epoch": 3.6238063513213414, + "grad_norm": 0.39308466518304386, + "learning_rate": 2.6593384234692597e-07, + "loss": 0.0259, + "step": 8159 + }, + { + "epoch": 3.6242504996668887, + "grad_norm": 0.3787552610454571, + "learning_rate": 2.6531040877602997e-07, + "loss": 0.0194, + "step": 8160 + }, + { + "epoch": 3.624694648012436, + "grad_norm": 0.41262964342629177, + "learning_rate": 2.646876869094073e-07, + "loss": 0.0197, + "step": 8161 + }, + { + "epoch": 3.6251387963579838, + "grad_norm": 0.34296275500584317, + "learning_rate": 2.640656768406641e-07, + "loss": 0.0215, + "step": 8162 + }, + { + "epoch": 3.625582944703531, + "grad_norm": 0.37447434249834716, + "learning_rate": 2.634443786632995e-07, + "loss": 0.0251, + "step": 8163 + }, + { + "epoch": 3.6260270930490783, + "grad_norm": 0.44671265535557425, + "learning_rate": 2.628237924707044e-07, + "loss": 0.0251, + "step": 8164 + }, + { + "epoch": 3.6264712413946256, + "grad_norm": 0.39092522082367537, + "learning_rate": 2.622039183561642e-07, + "loss": 0.0245, + "step": 8165 + }, + { + "epoch": 3.6269153897401734, + "grad_norm": 0.3971038919445469, + "learning_rate": 2.6158475641285544e-07, + "loss": 0.0297, + "step": 8166 + }, + { + "epoch": 3.6273595380857206, + "grad_norm": 0.7481153045495028, + "learning_rate": 2.609663067338497e-07, + "loss": 0.0379, + "step": 8167 + }, + { + "epoch": 3.627803686431268, + "grad_norm": 0.3705770719942717, + "learning_rate": 2.6034856941211104e-07, + "loss": 0.0238, + "step": 8168 + }, + { + "epoch": 3.6282478347768157, + "grad_norm": 0.42989268956281285, + "learning_rate": 2.597315445404941e-07, + "loss": 0.0264, + "step": 8169 + }, + { + "epoch": 3.628691983122363, + "grad_norm": 0.3871931353713456, + "learning_rate": 2.5911523221174963e-07, + "loss": 0.024, + "step": 8170 + }, + { + "epoch": 3.6291361314679103, + "grad_norm": 0.5499381201238311, + "learning_rate": 2.584996325185185e-07, + "loss": 0.036, + "step": 8171 + }, + { + "epoch": 3.6295802798134575, + "grad_norm": 0.42718459557037247, + "learning_rate": 2.5788474555333675e-07, + "loss": 0.0284, + "step": 8172 + }, + { + "epoch": 3.630024428159005, + "grad_norm": 0.45150263461678425, + "learning_rate": 2.5727057140863266e-07, + "loss": 0.0424, + "step": 8173 + }, + { + "epoch": 3.6304685765045526, + "grad_norm": 0.4604751454319313, + "learning_rate": 2.566571101767268e-07, + "loss": 0.0229, + "step": 8174 + }, + { + "epoch": 3.6309127248501, + "grad_norm": 0.40368285144270716, + "learning_rate": 2.5604436194983204e-07, + "loss": 0.0241, + "step": 8175 + }, + { + "epoch": 3.6313568731956476, + "grad_norm": 0.35454391047270123, + "learning_rate": 2.554323268200559e-07, + "loss": 0.0194, + "step": 8176 + }, + { + "epoch": 3.631801021541195, + "grad_norm": 0.4841769515138133, + "learning_rate": 2.548210048793964e-07, + "loss": 0.0253, + "step": 8177 + }, + { + "epoch": 3.632245169886742, + "grad_norm": 0.34838665066054253, + "learning_rate": 2.5421039621974677e-07, + "loss": 0.0219, + "step": 8178 + }, + { + "epoch": 3.6326893182322895, + "grad_norm": 0.40000850001012683, + "learning_rate": 2.5360050093289123e-07, + "loss": 0.0281, + "step": 8179 + }, + { + "epoch": 3.6331334665778368, + "grad_norm": 0.3910550154100924, + "learning_rate": 2.529913191105088e-07, + "loss": 0.0223, + "step": 8180 + }, + { + "epoch": 3.6335776149233845, + "grad_norm": 0.48803942739172496, + "learning_rate": 2.523828508441672e-07, + "loss": 0.025, + "step": 8181 + }, + { + "epoch": 3.634021763268932, + "grad_norm": 0.5754863723730677, + "learning_rate": 2.5177509622533183e-07, + "loss": 0.0406, + "step": 8182 + }, + { + "epoch": 3.634465911614479, + "grad_norm": 0.4286654639108965, + "learning_rate": 2.511680553453572e-07, + "loss": 0.0216, + "step": 8183 + }, + { + "epoch": 3.634910059960027, + "grad_norm": 0.35950293556792007, + "learning_rate": 2.5056172829549254e-07, + "loss": 0.0331, + "step": 8184 + }, + { + "epoch": 3.635354208305574, + "grad_norm": 0.40214970790890675, + "learning_rate": 2.4995611516688003e-07, + "loss": 0.0233, + "step": 8185 + }, + { + "epoch": 3.6357983566511214, + "grad_norm": 0.36548024593908557, + "learning_rate": 2.4935121605055125e-07, + "loss": 0.0261, + "step": 8186 + }, + { + "epoch": 3.6362425049966687, + "grad_norm": 0.5386358859158857, + "learning_rate": 2.487470310374346e-07, + "loss": 0.0215, + "step": 8187 + }, + { + "epoch": 3.6366866533422164, + "grad_norm": 0.38408368355582256, + "learning_rate": 2.481435602183485e-07, + "loss": 0.0227, + "step": 8188 + }, + { + "epoch": 3.6371308016877637, + "grad_norm": 0.3671311848219369, + "learning_rate": 2.475408036840055e-07, + "loss": 0.0203, + "step": 8189 + }, + { + "epoch": 3.637574950033311, + "grad_norm": 0.46572660180603004, + "learning_rate": 2.469387615250096e-07, + "loss": 0.0274, + "step": 8190 + }, + { + "epoch": 3.6380190983788587, + "grad_norm": 0.4911123345635677, + "learning_rate": 2.4633743383185917e-07, + "loss": 0.0252, + "step": 8191 + }, + { + "epoch": 3.638463246724406, + "grad_norm": 0.3574418971645557, + "learning_rate": 2.4573682069494234e-07, + "loss": 0.0205, + "step": 8192 + }, + { + "epoch": 3.6389073950699533, + "grad_norm": 0.4154556222921757, + "learning_rate": 2.451369222045419e-07, + "loss": 0.0248, + "step": 8193 + }, + { + "epoch": 3.6393515434155006, + "grad_norm": 0.3834951658389965, + "learning_rate": 2.445377384508335e-07, + "loss": 0.0228, + "step": 8194 + }, + { + "epoch": 3.6397956917610483, + "grad_norm": 0.3958357851929866, + "learning_rate": 2.4393926952388405e-07, + "loss": 0.0195, + "step": 8195 + }, + { + "epoch": 3.6402398401065956, + "grad_norm": 0.481708316236042, + "learning_rate": 2.433415155136543e-07, + "loss": 0.0245, + "step": 8196 + }, + { + "epoch": 3.640683988452143, + "grad_norm": 0.45974860157380204, + "learning_rate": 2.427444765099951e-07, + "loss": 0.0228, + "step": 8197 + }, + { + "epoch": 3.6411281367976907, + "grad_norm": 0.4731819261447121, + "learning_rate": 2.4214815260265367e-07, + "loss": 0.0498, + "step": 8198 + }, + { + "epoch": 3.641572285143238, + "grad_norm": 0.40140095766392536, + "learning_rate": 2.4155254388126605e-07, + "loss": 0.0309, + "step": 8199 + }, + { + "epoch": 3.6420164334887852, + "grad_norm": 0.34222283638473855, + "learning_rate": 2.4095765043536335e-07, + "loss": 0.0196, + "step": 8200 + }, + { + "epoch": 3.6424605818343325, + "grad_norm": 0.4008139911989502, + "learning_rate": 2.403634723543674e-07, + "loss": 0.0163, + "step": 8201 + }, + { + "epoch": 3.64290473017988, + "grad_norm": 0.4485823633640726, + "learning_rate": 2.3977000972759454e-07, + "loss": 0.0247, + "step": 8202 + }, + { + "epoch": 3.6433488785254275, + "grad_norm": 0.32021601023059565, + "learning_rate": 2.391772626442507e-07, + "loss": 0.0162, + "step": 8203 + }, + { + "epoch": 3.643793026870975, + "grad_norm": 0.43897160516812744, + "learning_rate": 2.385852311934367e-07, + "loss": 0.0192, + "step": 8204 + }, + { + "epoch": 3.6442371752165226, + "grad_norm": 0.35383635594441887, + "learning_rate": 2.379939154641442e-07, + "loss": 0.0224, + "step": 8205 + }, + { + "epoch": 3.64468132356207, + "grad_norm": 0.4656360521009951, + "learning_rate": 2.3740331554525875e-07, + "loss": 0.0281, + "step": 8206 + }, + { + "epoch": 3.645125471907617, + "grad_norm": 0.3752654330901412, + "learning_rate": 2.3681343152555768e-07, + "loss": 0.021, + "step": 8207 + }, + { + "epoch": 3.6455696202531644, + "grad_norm": 0.43380693965119627, + "learning_rate": 2.3622426349371064e-07, + "loss": 0.0233, + "step": 8208 + }, + { + "epoch": 3.6460137685987117, + "grad_norm": 0.3468026081865424, + "learning_rate": 2.3563581153827897e-07, + "loss": 0.0208, + "step": 8209 + }, + { + "epoch": 3.6464579169442595, + "grad_norm": 0.3963696177580075, + "learning_rate": 2.3504807574771638e-07, + "loss": 0.0234, + "step": 8210 + }, + { + "epoch": 3.6469020652898068, + "grad_norm": 0.3493278085805506, + "learning_rate": 2.3446105621037108e-07, + "loss": 0.018, + "step": 8211 + }, + { + "epoch": 3.647346213635354, + "grad_norm": 0.448389498368257, + "learning_rate": 2.3387475301448138e-07, + "loss": 0.0262, + "step": 8212 + }, + { + "epoch": 3.647790361980902, + "grad_norm": 0.4639744403941518, + "learning_rate": 2.33289166248179e-07, + "loss": 0.0299, + "step": 8213 + }, + { + "epoch": 3.648234510326449, + "grad_norm": 0.34846046299317557, + "learning_rate": 2.327042959994863e-07, + "loss": 0.0197, + "step": 8214 + }, + { + "epoch": 3.6486786586719964, + "grad_norm": 0.35001828631869614, + "learning_rate": 2.3212014235632074e-07, + "loss": 0.0184, + "step": 8215 + }, + { + "epoch": 3.6491228070175437, + "grad_norm": 0.3539191157147139, + "learning_rate": 2.3153670540648932e-07, + "loss": 0.0202, + "step": 8216 + }, + { + "epoch": 3.6495669553630914, + "grad_norm": 0.39166688802077126, + "learning_rate": 2.3095398523769353e-07, + "loss": 0.0237, + "step": 8217 + }, + { + "epoch": 3.6500111037086387, + "grad_norm": 0.41163896619179474, + "learning_rate": 2.3037198193752553e-07, + "loss": 0.0254, + "step": 8218 + }, + { + "epoch": 3.650455252054186, + "grad_norm": 0.43685004953046314, + "learning_rate": 2.2979069559347088e-07, + "loss": 0.0237, + "step": 8219 + }, + { + "epoch": 3.6508994003997337, + "grad_norm": 0.5309766077496497, + "learning_rate": 2.292101262929064e-07, + "loss": 0.0292, + "step": 8220 + }, + { + "epoch": 3.651343548745281, + "grad_norm": 0.41452507271756617, + "learning_rate": 2.2863027412310056e-07, + "loss": 0.0292, + "step": 8221 + }, + { + "epoch": 3.6517876970908283, + "grad_norm": 0.44136007566907587, + "learning_rate": 2.2805113917121647e-07, + "loss": 0.0284, + "step": 8222 + }, + { + "epoch": 3.6522318454363756, + "grad_norm": 0.3476979126592021, + "learning_rate": 2.274727215243072e-07, + "loss": 0.0197, + "step": 8223 + }, + { + "epoch": 3.6526759937819233, + "grad_norm": 0.38817929599514056, + "learning_rate": 2.2689502126931938e-07, + "loss": 0.0224, + "step": 8224 + }, + { + "epoch": 3.6531201421274706, + "grad_norm": 0.4746782142221425, + "learning_rate": 2.2631803849309076e-07, + "loss": 0.0352, + "step": 8225 + }, + { + "epoch": 3.653564290473018, + "grad_norm": 0.40968095773887653, + "learning_rate": 2.2574177328235137e-07, + "loss": 0.0276, + "step": 8226 + }, + { + "epoch": 3.6540084388185656, + "grad_norm": 0.41401682518721233, + "learning_rate": 2.2516622572372416e-07, + "loss": 0.0283, + "step": 8227 + }, + { + "epoch": 3.654452587164113, + "grad_norm": 0.4117127298389683, + "learning_rate": 2.2459139590372325e-07, + "loss": 0.0228, + "step": 8228 + }, + { + "epoch": 3.65489673550966, + "grad_norm": 0.4311069896164078, + "learning_rate": 2.240172839087551e-07, + "loss": 0.0281, + "step": 8229 + }, + { + "epoch": 3.6553408838552075, + "grad_norm": 0.4327836235259352, + "learning_rate": 2.2344388982512012e-07, + "loss": 0.0228, + "step": 8230 + }, + { + "epoch": 3.655785032200755, + "grad_norm": 0.4573606402349546, + "learning_rate": 2.2287121373900712e-07, + "loss": 0.0319, + "step": 8231 + }, + { + "epoch": 3.6562291805463025, + "grad_norm": 0.4345901205357684, + "learning_rate": 2.2229925573650001e-07, + "loss": 0.0269, + "step": 8232 + }, + { + "epoch": 3.65667332889185, + "grad_norm": 0.3825914316334593, + "learning_rate": 2.2172801590357395e-07, + "loss": 0.0282, + "step": 8233 + }, + { + "epoch": 3.6571174772373976, + "grad_norm": 0.31366037192320456, + "learning_rate": 2.2115749432609524e-07, + "loss": 0.0167, + "step": 8234 + }, + { + "epoch": 3.657561625582945, + "grad_norm": 0.3788275375636668, + "learning_rate": 2.205876910898236e-07, + "loss": 0.0306, + "step": 8235 + }, + { + "epoch": 3.658005773928492, + "grad_norm": 0.38419523963619157, + "learning_rate": 2.2001860628041106e-07, + "loss": 0.0299, + "step": 8236 + }, + { + "epoch": 3.6584499222740394, + "grad_norm": 0.4550088476883181, + "learning_rate": 2.1945023998339865e-07, + "loss": 0.0347, + "step": 8237 + }, + { + "epoch": 3.6588940706195867, + "grad_norm": 0.597248639500098, + "learning_rate": 2.1888259228422248e-07, + "loss": 0.0194, + "step": 8238 + }, + { + "epoch": 3.6593382189651344, + "grad_norm": 0.5188207400851287, + "learning_rate": 2.1831566326820986e-07, + "loss": 0.0276, + "step": 8239 + }, + { + "epoch": 3.6597823673106817, + "grad_norm": 0.32831002856910196, + "learning_rate": 2.177494530205798e-07, + "loss": 0.0245, + "step": 8240 + }, + { + "epoch": 3.660226515656229, + "grad_norm": 0.3836254147289484, + "learning_rate": 2.1718396162644319e-07, + "loss": 0.0212, + "step": 8241 + }, + { + "epoch": 3.6606706640017768, + "grad_norm": 0.4548723138202921, + "learning_rate": 2.1661918917080304e-07, + "loss": 0.0253, + "step": 8242 + }, + { + "epoch": 3.661114812347324, + "grad_norm": 0.3684605663304441, + "learning_rate": 2.1605513573855375e-07, + "loss": 0.0178, + "step": 8243 + }, + { + "epoch": 3.6615589606928713, + "grad_norm": 0.3733701839690256, + "learning_rate": 2.1549180141448356e-07, + "loss": 0.0194, + "step": 8244 + }, + { + "epoch": 3.6620031090384186, + "grad_norm": 0.4105795750947663, + "learning_rate": 2.1492918628326864e-07, + "loss": 0.0148, + "step": 8245 + }, + { + "epoch": 3.6624472573839664, + "grad_norm": 0.41761384153375, + "learning_rate": 2.143672904294819e-07, + "loss": 0.0293, + "step": 8246 + }, + { + "epoch": 3.6628914057295137, + "grad_norm": 0.3728653241577783, + "learning_rate": 2.1380611393758576e-07, + "loss": 0.0223, + "step": 8247 + }, + { + "epoch": 3.663335554075061, + "grad_norm": 0.3955291650313776, + "learning_rate": 2.1324565689193332e-07, + "loss": 0.025, + "step": 8248 + }, + { + "epoch": 3.6637797024206087, + "grad_norm": 0.5976258676447095, + "learning_rate": 2.1268591937677164e-07, + "loss": 0.0328, + "step": 8249 + }, + { + "epoch": 3.664223850766156, + "grad_norm": 0.7950146359972061, + "learning_rate": 2.1212690147623894e-07, + "loss": 0.0295, + "step": 8250 + }, + { + "epoch": 3.6646679991117033, + "grad_norm": 0.5530704513836777, + "learning_rate": 2.1156860327436302e-07, + "loss": 0.0302, + "step": 8251 + }, + { + "epoch": 3.6651121474572506, + "grad_norm": 0.29807912775738393, + "learning_rate": 2.1101102485506842e-07, + "loss": 0.0178, + "step": 8252 + }, + { + "epoch": 3.6655562958027983, + "grad_norm": 0.4517710106710957, + "learning_rate": 2.1045416630216808e-07, + "loss": 0.0242, + "step": 8253 + }, + { + "epoch": 3.6660004441483456, + "grad_norm": 0.4254157446074955, + "learning_rate": 2.0989802769936563e-07, + "loss": 0.0268, + "step": 8254 + }, + { + "epoch": 3.666444592493893, + "grad_norm": 0.3693301648290188, + "learning_rate": 2.0934260913025973e-07, + "loss": 0.0266, + "step": 8255 + }, + { + "epoch": 3.6668887408394406, + "grad_norm": 0.4631155691374193, + "learning_rate": 2.0878791067833805e-07, + "loss": 0.0293, + "step": 8256 + }, + { + "epoch": 3.667332889184988, + "grad_norm": 0.3868441750288522, + "learning_rate": 2.0823393242698275e-07, + "loss": 0.0261, + "step": 8257 + }, + { + "epoch": 3.667777037530535, + "grad_norm": 0.430361544922243, + "learning_rate": 2.0768067445946506e-07, + "loss": 0.0247, + "step": 8258 + }, + { + "epoch": 3.6682211858760825, + "grad_norm": 0.4116233802052011, + "learning_rate": 2.0712813685894894e-07, + "loss": 0.0315, + "step": 8259 + }, + { + "epoch": 3.6686653342216298, + "grad_norm": 0.3788192226900461, + "learning_rate": 2.0657631970849078e-07, + "loss": 0.0229, + "step": 8260 + }, + { + "epoch": 3.6691094825671775, + "grad_norm": 0.37557385002997257, + "learning_rate": 2.0602522309103813e-07, + "loss": 0.0236, + "step": 8261 + }, + { + "epoch": 3.669553630912725, + "grad_norm": 0.3805674438167674, + "learning_rate": 2.054748470894291e-07, + "loss": 0.0252, + "step": 8262 + }, + { + "epoch": 3.6699977792582725, + "grad_norm": 0.3690281782315337, + "learning_rate": 2.0492519178639536e-07, + "loss": 0.0161, + "step": 8263 + }, + { + "epoch": 3.67044192760382, + "grad_norm": 0.7116450161368049, + "learning_rate": 2.0437625726456024e-07, + "loss": 0.0292, + "step": 8264 + }, + { + "epoch": 3.670886075949367, + "grad_norm": 0.5939366820148858, + "learning_rate": 2.0382804360643603e-07, + "loss": 0.0382, + "step": 8265 + }, + { + "epoch": 3.6713302242949144, + "grad_norm": 0.4269858268062977, + "learning_rate": 2.0328055089443023e-07, + "loss": 0.0236, + "step": 8266 + }, + { + "epoch": 3.6717743726404617, + "grad_norm": 0.43163831702859096, + "learning_rate": 2.027337792108397e-07, + "loss": 0.0244, + "step": 8267 + }, + { + "epoch": 3.6722185209860094, + "grad_norm": 0.5025109334901467, + "learning_rate": 2.0218772863785263e-07, + "loss": 0.0296, + "step": 8268 + }, + { + "epoch": 3.6726626693315567, + "grad_norm": 0.6343391256263229, + "learning_rate": 2.016423992575517e-07, + "loss": 0.0248, + "step": 8269 + }, + { + "epoch": 3.673106817677104, + "grad_norm": 0.4000275239829487, + "learning_rate": 2.0109779115190742e-07, + "loss": 0.0198, + "step": 8270 + }, + { + "epoch": 3.6735509660226517, + "grad_norm": 0.3303791647606292, + "learning_rate": 2.0055390440278376e-07, + "loss": 0.0258, + "step": 8271 + }, + { + "epoch": 3.673995114368199, + "grad_norm": 0.36510639486980745, + "learning_rate": 2.0001073909193702e-07, + "loss": 0.0173, + "step": 8272 + }, + { + "epoch": 3.6744392627137463, + "grad_norm": 0.4445660217933993, + "learning_rate": 1.9946829530101408e-07, + "loss": 0.0279, + "step": 8273 + }, + { + "epoch": 3.6748834110592936, + "grad_norm": 0.3323295635321309, + "learning_rate": 1.989265731115525e-07, + "loss": 0.0211, + "step": 8274 + }, + { + "epoch": 3.6753275594048413, + "grad_norm": 0.4082377088884518, + "learning_rate": 1.983855726049838e-07, + "loss": 0.0281, + "step": 8275 + }, + { + "epoch": 3.6757717077503886, + "grad_norm": 0.5538219755733254, + "learning_rate": 1.9784529386262798e-07, + "loss": 0.0375, + "step": 8276 + }, + { + "epoch": 3.676215856095936, + "grad_norm": 0.4054313962604177, + "learning_rate": 1.9730573696569888e-07, + "loss": 0.0221, + "step": 8277 + }, + { + "epoch": 3.6766600044414837, + "grad_norm": 0.338502488617252, + "learning_rate": 1.9676690199530169e-07, + "loss": 0.0231, + "step": 8278 + }, + { + "epoch": 3.677104152787031, + "grad_norm": 0.4016411596672554, + "learning_rate": 1.9622878903243104e-07, + "loss": 0.024, + "step": 8279 + }, + { + "epoch": 3.6775483011325782, + "grad_norm": 0.4045075471165877, + "learning_rate": 1.95691398157975e-07, + "loss": 0.0179, + "step": 8280 + }, + { + "epoch": 3.6779924494781255, + "grad_norm": 0.40898342731485937, + "learning_rate": 1.9515472945271396e-07, + "loss": 0.0337, + "step": 8281 + }, + { + "epoch": 3.6784365978236733, + "grad_norm": 0.3742680249776215, + "learning_rate": 1.946187829973162e-07, + "loss": 0.0181, + "step": 8282 + }, + { + "epoch": 3.6788807461692206, + "grad_norm": 0.38742125913942227, + "learning_rate": 1.9408355887234443e-07, + "loss": 0.0267, + "step": 8283 + }, + { + "epoch": 3.679324894514768, + "grad_norm": 0.31803662853945275, + "learning_rate": 1.9354905715825323e-07, + "loss": 0.0106, + "step": 8284 + }, + { + "epoch": 3.6797690428603156, + "grad_norm": 0.3842318920408417, + "learning_rate": 1.9301527793538445e-07, + "loss": 0.0208, + "step": 8285 + }, + { + "epoch": 3.680213191205863, + "grad_norm": 0.34972059951342765, + "learning_rate": 1.9248222128397663e-07, + "loss": 0.0209, + "step": 8286 + }, + { + "epoch": 3.68065733955141, + "grad_norm": 0.4945754865360583, + "learning_rate": 1.9194988728415632e-07, + "loss": 0.0285, + "step": 8287 + }, + { + "epoch": 3.6811014878969575, + "grad_norm": 0.4665722830251576, + "learning_rate": 1.9141827601594221e-07, + "loss": 0.0228, + "step": 8288 + }, + { + "epoch": 3.6815456362425047, + "grad_norm": 0.33231118424653433, + "learning_rate": 1.908873875592454e-07, + "loss": 0.0215, + "step": 8289 + }, + { + "epoch": 3.6819897845880525, + "grad_norm": 0.41202362800291603, + "learning_rate": 1.9035722199386542e-07, + "loss": 0.0219, + "step": 8290 + }, + { + "epoch": 3.6824339329335998, + "grad_norm": 0.4209126047122881, + "learning_rate": 1.8982777939949736e-07, + "loss": 0.0356, + "step": 8291 + }, + { + "epoch": 3.6828780812791475, + "grad_norm": 0.35749851244891667, + "learning_rate": 1.8929905985572484e-07, + "loss": 0.0194, + "step": 8292 + }, + { + "epoch": 3.683322229624695, + "grad_norm": 0.38704831620698, + "learning_rate": 1.8877106344202312e-07, + "loss": 0.0329, + "step": 8293 + }, + { + "epoch": 3.683766377970242, + "grad_norm": 0.526720346124437, + "learning_rate": 1.8824379023775874e-07, + "loss": 0.0357, + "step": 8294 + }, + { + "epoch": 3.6842105263157894, + "grad_norm": 0.42113199922195677, + "learning_rate": 1.877172403221905e-07, + "loss": 0.0315, + "step": 8295 + }, + { + "epoch": 3.6846546746613367, + "grad_norm": 0.3078574734152821, + "learning_rate": 1.871914137744668e-07, + "loss": 0.0165, + "step": 8296 + }, + { + "epoch": 3.6850988230068844, + "grad_norm": 0.4123152188226272, + "learning_rate": 1.866663106736294e-07, + "loss": 0.0401, + "step": 8297 + }, + { + "epoch": 3.6855429713524317, + "grad_norm": 0.3318217390451302, + "learning_rate": 1.8614193109860955e-07, + "loss": 0.0228, + "step": 8298 + }, + { + "epoch": 3.685987119697979, + "grad_norm": 0.4377759250920508, + "learning_rate": 1.8561827512823095e-07, + "loss": 0.0271, + "step": 8299 + }, + { + "epoch": 3.6864312680435267, + "grad_norm": 0.8207219751826093, + "learning_rate": 1.8509534284120721e-07, + "loss": 0.0344, + "step": 8300 + }, + { + "epoch": 3.686875416389074, + "grad_norm": 0.8375758066775824, + "learning_rate": 1.84573134316145e-07, + "loss": 0.0367, + "step": 8301 + }, + { + "epoch": 3.6873195647346213, + "grad_norm": 0.36428841642403553, + "learning_rate": 1.840516496315392e-07, + "loss": 0.0217, + "step": 8302 + }, + { + "epoch": 3.6877637130801686, + "grad_norm": 0.5076188683241221, + "learning_rate": 1.8353088886578053e-07, + "loss": 0.0294, + "step": 8303 + }, + { + "epoch": 3.6882078614257163, + "grad_norm": 0.6798347292105588, + "learning_rate": 1.830108520971463e-07, + "loss": 0.0405, + "step": 8304 + }, + { + "epoch": 3.6886520097712636, + "grad_norm": 0.3568052667311966, + "learning_rate": 1.8249153940380738e-07, + "loss": 0.0279, + "step": 8305 + }, + { + "epoch": 3.689096158116811, + "grad_norm": 0.4023381862191607, + "learning_rate": 1.8197295086382515e-07, + "loss": 0.0271, + "step": 8306 + }, + { + "epoch": 3.6895403064623586, + "grad_norm": 0.3720480087573231, + "learning_rate": 1.8145508655515177e-07, + "loss": 0.0199, + "step": 8307 + }, + { + "epoch": 3.689984454807906, + "grad_norm": 0.4144873947838932, + "learning_rate": 1.8093794655563214e-07, + "loss": 0.0215, + "step": 8308 + }, + { + "epoch": 3.6904286031534532, + "grad_norm": 0.39511282629465616, + "learning_rate": 1.804215309430013e-07, + "loss": 0.0175, + "step": 8309 + }, + { + "epoch": 3.6908727514990005, + "grad_norm": 0.5527099360678772, + "learning_rate": 1.799058397948844e-07, + "loss": 0.0381, + "step": 8310 + }, + { + "epoch": 3.691316899844548, + "grad_norm": 0.46007077628363413, + "learning_rate": 1.7939087318879833e-07, + "loss": 0.0333, + "step": 8311 + }, + { + "epoch": 3.6917610481900955, + "grad_norm": 0.43813513461840425, + "learning_rate": 1.788766312021528e-07, + "loss": 0.0298, + "step": 8312 + }, + { + "epoch": 3.692205196535643, + "grad_norm": 0.4101350408780889, + "learning_rate": 1.7836311391224494e-07, + "loss": 0.0259, + "step": 8313 + }, + { + "epoch": 3.6926493448811906, + "grad_norm": 0.2992536206219798, + "learning_rate": 1.7785032139626734e-07, + "loss": 0.0185, + "step": 8314 + }, + { + "epoch": 3.693093493226738, + "grad_norm": 0.365121204268502, + "learning_rate": 1.7733825373129954e-07, + "loss": 0.0216, + "step": 8315 + }, + { + "epoch": 3.693537641572285, + "grad_norm": 0.416511886265595, + "learning_rate": 1.7682691099431548e-07, + "loss": 0.0372, + "step": 8316 + }, + { + "epoch": 3.6939817899178324, + "grad_norm": 0.3845252122577115, + "learning_rate": 1.763162932621787e-07, + "loss": 0.0257, + "step": 8317 + }, + { + "epoch": 3.6944259382633797, + "grad_norm": 0.4233868381729098, + "learning_rate": 1.7580640061164223e-07, + "loss": 0.0294, + "step": 8318 + }, + { + "epoch": 3.6948700866089275, + "grad_norm": 0.462463687883106, + "learning_rate": 1.7529723311935198e-07, + "loss": 0.0364, + "step": 8319 + }, + { + "epoch": 3.6953142349544748, + "grad_norm": 0.43549161369245626, + "learning_rate": 1.7478879086184564e-07, + "loss": 0.0263, + "step": 8320 + }, + { + "epoch": 3.695758383300022, + "grad_norm": 0.39333187210295867, + "learning_rate": 1.742810739155504e-07, + "loss": 0.0246, + "step": 8321 + }, + { + "epoch": 3.6962025316455698, + "grad_norm": 0.35014124457318635, + "learning_rate": 1.737740823567835e-07, + "loss": 0.0218, + "step": 8322 + }, + { + "epoch": 3.696646679991117, + "grad_norm": 0.42978622961517393, + "learning_rate": 1.7326781626175627e-07, + "loss": 0.0257, + "step": 8323 + }, + { + "epoch": 3.6970908283366644, + "grad_norm": 0.4118470726335261, + "learning_rate": 1.727622757065678e-07, + "loss": 0.0261, + "step": 8324 + }, + { + "epoch": 3.6975349766822116, + "grad_norm": 0.471980787223015, + "learning_rate": 1.7225746076720894e-07, + "loss": 0.0226, + "step": 8325 + }, + { + "epoch": 3.6979791250277594, + "grad_norm": 0.4565879663220365, + "learning_rate": 1.717533715195635e-07, + "loss": 0.0248, + "step": 8326 + }, + { + "epoch": 3.6984232733733067, + "grad_norm": 0.3976292965837061, + "learning_rate": 1.712500080394036e-07, + "loss": 0.0324, + "step": 8327 + }, + { + "epoch": 3.698867421718854, + "grad_norm": 0.4111827692476157, + "learning_rate": 1.7074737040239375e-07, + "loss": 0.0248, + "step": 8328 + }, + { + "epoch": 3.6993115700644017, + "grad_norm": 0.36585319212766676, + "learning_rate": 1.7024545868408903e-07, + "loss": 0.0209, + "step": 8329 + }, + { + "epoch": 3.699755718409949, + "grad_norm": 0.47005920699211934, + "learning_rate": 1.6974427295993412e-07, + "loss": 0.0203, + "step": 8330 + }, + { + "epoch": 3.7001998667554963, + "grad_norm": 0.36932538349452104, + "learning_rate": 1.6924381330526817e-07, + "loss": 0.0212, + "step": 8331 + }, + { + "epoch": 3.7006440151010436, + "grad_norm": 0.446723580160887, + "learning_rate": 1.6874407979531604e-07, + "loss": 0.0247, + "step": 8332 + }, + { + "epoch": 3.7010881634465913, + "grad_norm": 0.445687716290222, + "learning_rate": 1.682450725051976e-07, + "loss": 0.0251, + "step": 8333 + }, + { + "epoch": 3.7015323117921386, + "grad_norm": 0.3855963805554051, + "learning_rate": 1.677467915099229e-07, + "loss": 0.0161, + "step": 8334 + }, + { + "epoch": 3.701976460137686, + "grad_norm": 0.39452015734075696, + "learning_rate": 1.6724923688439033e-07, + "loss": 0.0238, + "step": 8335 + }, + { + "epoch": 3.7024206084832336, + "grad_norm": 0.45003118141173504, + "learning_rate": 1.667524087033906e-07, + "loss": 0.0264, + "step": 8336 + }, + { + "epoch": 3.702864756828781, + "grad_norm": 0.42598757489225364, + "learning_rate": 1.6625630704160788e-07, + "loss": 0.0336, + "step": 8337 + }, + { + "epoch": 3.703308905174328, + "grad_norm": 0.38079855634441273, + "learning_rate": 1.6576093197361253e-07, + "loss": 0.0286, + "step": 8338 + }, + { + "epoch": 3.7037530535198755, + "grad_norm": 0.3983365488271041, + "learning_rate": 1.652662835738683e-07, + "loss": 0.0205, + "step": 8339 + }, + { + "epoch": 3.704197201865423, + "grad_norm": 0.29963760935621175, + "learning_rate": 1.6477236191673018e-07, + "loss": 0.0189, + "step": 8340 + }, + { + "epoch": 3.7046413502109705, + "grad_norm": 0.771557737936414, + "learning_rate": 1.6427916707644153e-07, + "loss": 0.0319, + "step": 8341 + }, + { + "epoch": 3.705085498556518, + "grad_norm": 0.4894556058318927, + "learning_rate": 1.6378669912713862e-07, + "loss": 0.0316, + "step": 8342 + }, + { + "epoch": 3.7055296469020655, + "grad_norm": 0.3795606721666376, + "learning_rate": 1.6329495814284778e-07, + "loss": 0.0196, + "step": 8343 + }, + { + "epoch": 3.705973795247613, + "grad_norm": 0.3778626203357572, + "learning_rate": 1.62803944197486e-07, + "loss": 0.0266, + "step": 8344 + }, + { + "epoch": 3.70641794359316, + "grad_norm": 0.4426801167498142, + "learning_rate": 1.6231365736486093e-07, + "loss": 0.0288, + "step": 8345 + }, + { + "epoch": 3.7068620919387074, + "grad_norm": 0.35264417606088994, + "learning_rate": 1.6182409771867137e-07, + "loss": 0.0189, + "step": 8346 + }, + { + "epoch": 3.7073062402842547, + "grad_norm": 0.38892860903541526, + "learning_rate": 1.6133526533250566e-07, + "loss": 0.0318, + "step": 8347 + }, + { + "epoch": 3.7077503886298024, + "grad_norm": 0.3867272658310112, + "learning_rate": 1.6084716027984503e-07, + "loss": 0.0246, + "step": 8348 + }, + { + "epoch": 3.7081945369753497, + "grad_norm": 0.6221413345427776, + "learning_rate": 1.6035978263405804e-07, + "loss": 0.0333, + "step": 8349 + }, + { + "epoch": 3.708638685320897, + "grad_norm": 0.5154361170774239, + "learning_rate": 1.5987313246840718e-07, + "loss": 0.0342, + "step": 8350 + }, + { + "epoch": 3.7090828336664448, + "grad_norm": 0.3524314183478963, + "learning_rate": 1.593872098560445e-07, + "loss": 0.0211, + "step": 8351 + }, + { + "epoch": 3.709526982011992, + "grad_norm": 0.38476894824992575, + "learning_rate": 1.58902014870011e-07, + "loss": 0.0211, + "step": 8352 + }, + { + "epoch": 3.7099711303575393, + "grad_norm": 0.4245656298307411, + "learning_rate": 1.5841754758324058e-07, + "loss": 0.0209, + "step": 8353 + }, + { + "epoch": 3.7104152787030866, + "grad_norm": 0.3820971674731869, + "learning_rate": 1.579338080685572e-07, + "loss": 0.0221, + "step": 8354 + }, + { + "epoch": 3.7108594270486344, + "grad_norm": 0.37340680941918303, + "learning_rate": 1.5745079639867488e-07, + "loss": 0.0297, + "step": 8355 + }, + { + "epoch": 3.7113035753941817, + "grad_norm": 0.29722253097229934, + "learning_rate": 1.5696851264619785e-07, + "loss": 0.0178, + "step": 8356 + }, + { + "epoch": 3.711747723739729, + "grad_norm": 0.4252952177272931, + "learning_rate": 1.5648695688362304e-07, + "loss": 0.0257, + "step": 8357 + }, + { + "epoch": 3.7121918720852767, + "grad_norm": 0.49778617697788585, + "learning_rate": 1.560061291833348e-07, + "loss": 0.0377, + "step": 8358 + }, + { + "epoch": 3.712636020430824, + "grad_norm": 0.45908962801821235, + "learning_rate": 1.5552602961761033e-07, + "loss": 0.0236, + "step": 8359 + }, + { + "epoch": 3.7130801687763713, + "grad_norm": 0.35236790788170025, + "learning_rate": 1.5504665825861687e-07, + "loss": 0.018, + "step": 8360 + }, + { + "epoch": 3.7135243171219185, + "grad_norm": 0.516154033831261, + "learning_rate": 1.5456801517841236e-07, + "loss": 0.0338, + "step": 8361 + }, + { + "epoch": 3.7139684654674663, + "grad_norm": 0.2906873978074194, + "learning_rate": 1.540901004489448e-07, + "loss": 0.0257, + "step": 8362 + }, + { + "epoch": 3.7144126138130136, + "grad_norm": 0.363688530548789, + "learning_rate": 1.5361291414205226e-07, + "loss": 0.0175, + "step": 8363 + }, + { + "epoch": 3.714856762158561, + "grad_norm": 0.4720314053715725, + "learning_rate": 1.5313645632946407e-07, + "loss": 0.0406, + "step": 8364 + }, + { + "epoch": 3.7153009105041086, + "grad_norm": 0.36832914963768065, + "learning_rate": 1.5266072708280177e-07, + "loss": 0.0232, + "step": 8365 + }, + { + "epoch": 3.715745058849656, + "grad_norm": 0.40219479844398387, + "learning_rate": 1.5218572647357265e-07, + "loss": 0.0325, + "step": 8366 + }, + { + "epoch": 3.716189207195203, + "grad_norm": 0.3686119581029194, + "learning_rate": 1.517114545731796e-07, + "loss": 0.019, + "step": 8367 + }, + { + "epoch": 3.7166333555407505, + "grad_norm": 0.4442003891466178, + "learning_rate": 1.5123791145291332e-07, + "loss": 0.0275, + "step": 8368 + }, + { + "epoch": 3.7170775038862978, + "grad_norm": 0.36490090535494746, + "learning_rate": 1.5076509718395416e-07, + "loss": 0.0155, + "step": 8369 + }, + { + "epoch": 3.7175216522318455, + "grad_norm": 0.4009221914723727, + "learning_rate": 1.502930118373752e-07, + "loss": 0.0223, + "step": 8370 + }, + { + "epoch": 3.717965800577393, + "grad_norm": 0.43768126067136603, + "learning_rate": 1.4982165548413862e-07, + "loss": 0.0192, + "step": 8371 + }, + { + "epoch": 3.7184099489229405, + "grad_norm": 0.5261889966594541, + "learning_rate": 1.4935102819509717e-07, + "loss": 0.0353, + "step": 8372 + }, + { + "epoch": 3.718854097268488, + "grad_norm": 0.34594837099051023, + "learning_rate": 1.488811300409948e-07, + "loss": 0.0188, + "step": 8373 + }, + { + "epoch": 3.719298245614035, + "grad_norm": 0.5250244707730272, + "learning_rate": 1.4841196109246448e-07, + "loss": 0.0355, + "step": 8374 + }, + { + "epoch": 3.7197423939595824, + "grad_norm": 0.3336300196833185, + "learning_rate": 1.4794352142003088e-07, + "loss": 0.0199, + "step": 8375 + }, + { + "epoch": 3.7201865423051297, + "grad_norm": 0.37689522491294797, + "learning_rate": 1.4747581109410713e-07, + "loss": 0.0174, + "step": 8376 + }, + { + "epoch": 3.7206306906506774, + "grad_norm": 0.48406735678739193, + "learning_rate": 1.4700883018499979e-07, + "loss": 0.025, + "step": 8377 + }, + { + "epoch": 3.7210748389962247, + "grad_norm": 0.45821951845296044, + "learning_rate": 1.4654257876290267e-07, + "loss": 0.0392, + "step": 8378 + }, + { + "epoch": 3.721518987341772, + "grad_norm": 0.43841923148082784, + "learning_rate": 1.4607705689790197e-07, + "loss": 0.0213, + "step": 8379 + }, + { + "epoch": 3.7219631356873197, + "grad_norm": 0.38180624149856224, + "learning_rate": 1.4561226465997337e-07, + "loss": 0.0244, + "step": 8380 + }, + { + "epoch": 3.722407284032867, + "grad_norm": 0.38238991365264746, + "learning_rate": 1.4514820211898263e-07, + "loss": 0.0235, + "step": 8381 + }, + { + "epoch": 3.7228514323784143, + "grad_norm": 0.4147995047117634, + "learning_rate": 1.4468486934468728e-07, + "loss": 0.0244, + "step": 8382 + }, + { + "epoch": 3.7232955807239616, + "grad_norm": 0.37825138010304116, + "learning_rate": 1.442222664067333e-07, + "loss": 0.0204, + "step": 8383 + }, + { + "epoch": 3.7237397290695093, + "grad_norm": 0.43713370136688856, + "learning_rate": 1.437603933746573e-07, + "loss": 0.0283, + "step": 8384 + }, + { + "epoch": 3.7241838774150566, + "grad_norm": 0.3959829514177559, + "learning_rate": 1.4329925031788815e-07, + "loss": 0.024, + "step": 8385 + }, + { + "epoch": 3.724628025760604, + "grad_norm": 0.36795641365580717, + "learning_rate": 1.4283883730574212e-07, + "loss": 0.0242, + "step": 8386 + }, + { + "epoch": 3.7250721741061517, + "grad_norm": 0.33923664126749403, + "learning_rate": 1.4237915440742768e-07, + "loss": 0.0259, + "step": 8387 + }, + { + "epoch": 3.725516322451699, + "grad_norm": 0.43212191542717354, + "learning_rate": 1.4192020169204292e-07, + "loss": 0.0205, + "step": 8388 + }, + { + "epoch": 3.7259604707972462, + "grad_norm": 0.5186925399242536, + "learning_rate": 1.4146197922857597e-07, + "loss": 0.032, + "step": 8389 + }, + { + "epoch": 3.7264046191427935, + "grad_norm": 0.3400385668731632, + "learning_rate": 1.410044870859062e-07, + "loss": 0.0224, + "step": 8390 + }, + { + "epoch": 3.7268487674883413, + "grad_norm": 0.4227136704820241, + "learning_rate": 1.4054772533280137e-07, + "loss": 0.0217, + "step": 8391 + }, + { + "epoch": 3.7272929158338886, + "grad_norm": 0.4011138415574072, + "learning_rate": 1.4009169403792154e-07, + "loss": 0.025, + "step": 8392 + }, + { + "epoch": 3.727737064179436, + "grad_norm": 0.3003293847098222, + "learning_rate": 1.396363932698147e-07, + "loss": 0.0174, + "step": 8393 + }, + { + "epoch": 3.7281812125249836, + "grad_norm": 0.4465399668144551, + "learning_rate": 1.3918182309692164e-07, + "loss": 0.0361, + "step": 8394 + }, + { + "epoch": 3.728625360870531, + "grad_norm": 0.3571377988241023, + "learning_rate": 1.3872798358757155e-07, + "loss": 0.0186, + "step": 8395 + }, + { + "epoch": 3.729069509216078, + "grad_norm": 0.3618570693279145, + "learning_rate": 1.3827487480998437e-07, + "loss": 0.0204, + "step": 8396 + }, + { + "epoch": 3.7295136575616255, + "grad_norm": 0.4415246137854366, + "learning_rate": 1.3782249683226946e-07, + "loss": 0.0215, + "step": 8397 + }, + { + "epoch": 3.7299578059071727, + "grad_norm": 0.3988226147588891, + "learning_rate": 1.373708497224263e-07, + "loss": 0.025, + "step": 8398 + }, + { + "epoch": 3.7304019542527205, + "grad_norm": 0.3588668524798711, + "learning_rate": 1.3691993354834733e-07, + "loss": 0.017, + "step": 8399 + }, + { + "epoch": 3.7308461025982678, + "grad_norm": 0.4766338898375096, + "learning_rate": 1.3646974837781102e-07, + "loss": 0.0246, + "step": 8400 + }, + { + "epoch": 3.7312902509438155, + "grad_norm": 0.4346113992522893, + "learning_rate": 1.3602029427848885e-07, + "loss": 0.0224, + "step": 8401 + }, + { + "epoch": 3.731734399289363, + "grad_norm": 0.4308704837627075, + "learning_rate": 1.355715713179412e-07, + "loss": 0.0228, + "step": 8402 + }, + { + "epoch": 3.73217854763491, + "grad_norm": 0.3138469968218578, + "learning_rate": 1.35123579563618e-07, + "loss": 0.0211, + "step": 8403 + }, + { + "epoch": 3.7326226959804574, + "grad_norm": 0.3603696166551371, + "learning_rate": 1.346763190828604e-07, + "loss": 0.0202, + "step": 8404 + }, + { + "epoch": 3.7330668443260047, + "grad_norm": 0.472833085205998, + "learning_rate": 1.3422978994290014e-07, + "loss": 0.032, + "step": 8405 + }, + { + "epoch": 3.7335109926715524, + "grad_norm": 0.5373078797879084, + "learning_rate": 1.3378399221085691e-07, + "loss": 0.0331, + "step": 8406 + }, + { + "epoch": 3.7339551410170997, + "grad_norm": 0.32376977842916665, + "learning_rate": 1.3333892595374265e-07, + "loss": 0.0193, + "step": 8407 + }, + { + "epoch": 3.734399289362647, + "grad_norm": 0.5384681977899171, + "learning_rate": 1.3289459123845772e-07, + "loss": 0.0439, + "step": 8408 + }, + { + "epoch": 3.7348434377081947, + "grad_norm": 0.360367652411676, + "learning_rate": 1.3245098813179315e-07, + "loss": 0.0259, + "step": 8409 + }, + { + "epoch": 3.735287586053742, + "grad_norm": 0.34043662992938706, + "learning_rate": 1.3200811670043057e-07, + "loss": 0.0156, + "step": 8410 + }, + { + "epoch": 3.7357317343992893, + "grad_norm": 0.46845973568867966, + "learning_rate": 1.3156597701094065e-07, + "loss": 0.0223, + "step": 8411 + }, + { + "epoch": 3.7361758827448366, + "grad_norm": 0.4170724326037999, + "learning_rate": 1.3112456912978467e-07, + "loss": 0.0183, + "step": 8412 + }, + { + "epoch": 3.7366200310903843, + "grad_norm": 0.6062425923257145, + "learning_rate": 1.3068389312331398e-07, + "loss": 0.0326, + "step": 8413 + }, + { + "epoch": 3.7370641794359316, + "grad_norm": 0.32063888344376806, + "learning_rate": 1.3024394905776893e-07, + "loss": 0.0155, + "step": 8414 + }, + { + "epoch": 3.737508327781479, + "grad_norm": 0.4839068579509274, + "learning_rate": 1.298047369992811e-07, + "loss": 0.0285, + "step": 8415 + }, + { + "epoch": 3.7379524761270266, + "grad_norm": 0.4686148975397421, + "learning_rate": 1.2936625701387152e-07, + "loss": 0.0291, + "step": 8416 + }, + { + "epoch": 3.738396624472574, + "grad_norm": 0.40247010635972097, + "learning_rate": 1.289285091674508e-07, + "loss": 0.0248, + "step": 8417 + }, + { + "epoch": 3.738840772818121, + "grad_norm": 0.3788928148883821, + "learning_rate": 1.2849149352582135e-07, + "loss": 0.0277, + "step": 8418 + }, + { + "epoch": 3.7392849211636685, + "grad_norm": 0.45807438580819443, + "learning_rate": 1.280552101546717e-07, + "loss": 0.0311, + "step": 8419 + }, + { + "epoch": 3.7397290695092162, + "grad_norm": 0.513273904365022, + "learning_rate": 1.2761965911958385e-07, + "loss": 0.0245, + "step": 8420 + }, + { + "epoch": 3.7401732178547635, + "grad_norm": 0.39246829057407184, + "learning_rate": 1.2718484048602876e-07, + "loss": 0.0231, + "step": 8421 + }, + { + "epoch": 3.740617366200311, + "grad_norm": 0.41649329925637596, + "learning_rate": 1.267507543193669e-07, + "loss": 0.0199, + "step": 8422 + }, + { + "epoch": 3.7410615145458586, + "grad_norm": 0.42862616530797626, + "learning_rate": 1.2631740068484888e-07, + "loss": 0.0307, + "step": 8423 + }, + { + "epoch": 3.741505662891406, + "grad_norm": 0.35644680831072795, + "learning_rate": 1.258847796476148e-07, + "loss": 0.0241, + "step": 8424 + }, + { + "epoch": 3.741949811236953, + "grad_norm": 0.32721758016106417, + "learning_rate": 1.2545289127269488e-07, + "loss": 0.0235, + "step": 8425 + }, + { + "epoch": 3.7423939595825004, + "grad_norm": 0.40082921797531834, + "learning_rate": 1.2502173562500995e-07, + "loss": 0.021, + "step": 8426 + }, + { + "epoch": 3.7428381079280477, + "grad_norm": 0.39626883986681716, + "learning_rate": 1.2459131276936876e-07, + "loss": 0.0242, + "step": 8427 + }, + { + "epoch": 3.7432822562735955, + "grad_norm": 0.5073039426598518, + "learning_rate": 1.241616227704723e-07, + "loss": 0.0278, + "step": 8428 + }, + { + "epoch": 3.7437264046191427, + "grad_norm": 0.3603567472819875, + "learning_rate": 1.2373266569290997e-07, + "loss": 0.032, + "step": 8429 + }, + { + "epoch": 3.7441705529646905, + "grad_norm": 0.4679950484890861, + "learning_rate": 1.2330444160116196e-07, + "loss": 0.0208, + "step": 8430 + }, + { + "epoch": 3.7446147013102378, + "grad_norm": 0.468095341335435, + "learning_rate": 1.2287695055959615e-07, + "loss": 0.0223, + "step": 8431 + }, + { + "epoch": 3.745058849655785, + "grad_norm": 0.4819559753911326, + "learning_rate": 1.2245019263247283e-07, + "loss": 0.0262, + "step": 8432 + }, + { + "epoch": 3.7455029980013324, + "grad_norm": 0.4954170402461803, + "learning_rate": 1.2202416788394067e-07, + "loss": 0.0333, + "step": 8433 + }, + { + "epoch": 3.7459471463468796, + "grad_norm": 0.6174858065052228, + "learning_rate": 1.215988763780379e-07, + "loss": 0.0313, + "step": 8434 + }, + { + "epoch": 3.7463912946924274, + "grad_norm": 0.42561144433663495, + "learning_rate": 1.2117431817869453e-07, + "loss": 0.0277, + "step": 8435 + }, + { + "epoch": 3.7468354430379747, + "grad_norm": 0.3709807838740829, + "learning_rate": 1.207504933497272e-07, + "loss": 0.0243, + "step": 8436 + }, + { + "epoch": 3.747279591383522, + "grad_norm": 0.3000902649747397, + "learning_rate": 1.2032740195484448e-07, + "loss": 0.0147, + "step": 8437 + }, + { + "epoch": 3.7477237397290697, + "grad_norm": 0.5581005324860493, + "learning_rate": 1.1990504405764492e-07, + "loss": 0.0302, + "step": 8438 + }, + { + "epoch": 3.748167888074617, + "grad_norm": 0.5633312132178465, + "learning_rate": 1.1948341972161492e-07, + "loss": 0.0309, + "step": 8439 + }, + { + "epoch": 3.7486120364201643, + "grad_norm": 0.36629488791856035, + "learning_rate": 1.1906252901013271e-07, + "loss": 0.0238, + "step": 8440 + }, + { + "epoch": 3.7490561847657116, + "grad_norm": 0.49491041444267353, + "learning_rate": 1.1864237198646544e-07, + "loss": 0.0298, + "step": 8441 + }, + { + "epoch": 3.7495003331112593, + "grad_norm": 0.564678297749189, + "learning_rate": 1.1822294871376928e-07, + "loss": 0.0305, + "step": 8442 + }, + { + "epoch": 3.7499444814568066, + "grad_norm": 0.40055447684951356, + "learning_rate": 1.1780425925509043e-07, + "loss": 0.0272, + "step": 8443 + }, + { + "epoch": 3.750388629802354, + "grad_norm": 0.470018676790432, + "learning_rate": 1.1738630367336579e-07, + "loss": 0.032, + "step": 8444 + }, + { + "epoch": 3.7508327781479016, + "grad_norm": 0.4384932264169123, + "learning_rate": 1.1696908203142066e-07, + "loss": 0.0256, + "step": 8445 + }, + { + "epoch": 3.751276926493449, + "grad_norm": 0.4653823127550869, + "learning_rate": 1.1655259439197042e-07, + "loss": 0.03, + "step": 8446 + }, + { + "epoch": 3.751721074838996, + "grad_norm": 0.41947700796617987, + "learning_rate": 1.1613684081762111e-07, + "loss": 0.0236, + "step": 8447 + }, + { + "epoch": 3.7521652231845435, + "grad_norm": 0.38536838427097736, + "learning_rate": 1.1572182137086662e-07, + "loss": 0.0242, + "step": 8448 + }, + { + "epoch": 3.7526093715300908, + "grad_norm": 0.341291576535971, + "learning_rate": 1.1530753611409151e-07, + "loss": 0.0229, + "step": 8449 + }, + { + "epoch": 3.7530535198756385, + "grad_norm": 0.3792987155782225, + "learning_rate": 1.1489398510957039e-07, + "loss": 0.0237, + "step": 8450 + }, + { + "epoch": 3.753497668221186, + "grad_norm": 0.37854108977812184, + "learning_rate": 1.1448116841946688e-07, + "loss": 0.0287, + "step": 8451 + }, + { + "epoch": 3.7539418165667335, + "grad_norm": 0.46875753276752063, + "learning_rate": 1.1406908610583467e-07, + "loss": 0.0311, + "step": 8452 + }, + { + "epoch": 3.754385964912281, + "grad_norm": 0.4064331207182262, + "learning_rate": 1.1365773823061532e-07, + "loss": 0.0283, + "step": 8453 + }, + { + "epoch": 3.754830113257828, + "grad_norm": 0.4002763856777381, + "learning_rate": 1.1324712485564271e-07, + "loss": 0.024, + "step": 8454 + }, + { + "epoch": 3.7552742616033754, + "grad_norm": 0.45827533835175305, + "learning_rate": 1.1283724604263857e-07, + "loss": 0.0246, + "step": 8455 + }, + { + "epoch": 3.7557184099489227, + "grad_norm": 0.39394686171582316, + "learning_rate": 1.1242810185321473e-07, + "loss": 0.03, + "step": 8456 + }, + { + "epoch": 3.7561625582944704, + "grad_norm": 0.42301627464562236, + "learning_rate": 1.1201969234887256e-07, + "loss": 0.0321, + "step": 8457 + }, + { + "epoch": 3.7566067066400177, + "grad_norm": 0.5331607328416476, + "learning_rate": 1.1161201759100349e-07, + "loss": 0.0279, + "step": 8458 + }, + { + "epoch": 3.757050854985565, + "grad_norm": 0.44317622469926854, + "learning_rate": 1.1120507764088684e-07, + "loss": 0.0449, + "step": 8459 + }, + { + "epoch": 3.7574950033311127, + "grad_norm": 0.3625167956575573, + "learning_rate": 1.1079887255969257e-07, + "loss": 0.0194, + "step": 8460 + }, + { + "epoch": 3.75793915167666, + "grad_norm": 0.38235614382527433, + "learning_rate": 1.1039340240848129e-07, + "loss": 0.0286, + "step": 8461 + }, + { + "epoch": 3.7583833000222073, + "grad_norm": 0.4781732717681245, + "learning_rate": 1.0998866724820145e-07, + "loss": 0.0275, + "step": 8462 + }, + { + "epoch": 3.7588274483677546, + "grad_norm": 0.36277030824808554, + "learning_rate": 1.0958466713969218e-07, + "loss": 0.02, + "step": 8463 + }, + { + "epoch": 3.7592715967133024, + "grad_norm": 0.4310996548128226, + "learning_rate": 1.09181402143681e-07, + "loss": 0.0271, + "step": 8464 + }, + { + "epoch": 3.7597157450588496, + "grad_norm": 0.44245437066064086, + "learning_rate": 1.0877887232078499e-07, + "loss": 0.0221, + "step": 8465 + }, + { + "epoch": 3.760159893404397, + "grad_norm": 0.45765152233325074, + "learning_rate": 1.0837707773151185e-07, + "loss": 0.0229, + "step": 8466 + }, + { + "epoch": 3.7606040417499447, + "grad_norm": 0.5026222951099458, + "learning_rate": 1.0797601843625827e-07, + "loss": 0.0253, + "step": 8467 + }, + { + "epoch": 3.761048190095492, + "grad_norm": 0.5808566690208385, + "learning_rate": 1.0757569449530991e-07, + "loss": 0.0229, + "step": 8468 + }, + { + "epoch": 3.7614923384410393, + "grad_norm": 0.36877164338905205, + "learning_rate": 1.0717610596884309e-07, + "loss": 0.0285, + "step": 8469 + }, + { + "epoch": 3.7619364867865865, + "grad_norm": 0.3700848691513958, + "learning_rate": 1.0677725291692143e-07, + "loss": 0.0204, + "step": 8470 + }, + { + "epoch": 3.7623806351321343, + "grad_norm": 0.3125009535356397, + "learning_rate": 1.0637913539950029e-07, + "loss": 0.0181, + "step": 8471 + }, + { + "epoch": 3.7628247834776816, + "grad_norm": 0.4808509442055829, + "learning_rate": 1.0598175347642293e-07, + "loss": 0.028, + "step": 8472 + }, + { + "epoch": 3.763268931823229, + "grad_norm": 0.4442187709817902, + "learning_rate": 1.0558510720742265e-07, + "loss": 0.0316, + "step": 8473 + }, + { + "epoch": 3.7637130801687766, + "grad_norm": 0.5324409399106215, + "learning_rate": 1.0518919665212235e-07, + "loss": 0.0319, + "step": 8474 + }, + { + "epoch": 3.764157228514324, + "grad_norm": 0.42314697759763437, + "learning_rate": 1.0479402187003496e-07, + "loss": 0.0222, + "step": 8475 + }, + { + "epoch": 3.764601376859871, + "grad_norm": 0.378666602483765, + "learning_rate": 1.0439958292056074e-07, + "loss": 0.029, + "step": 8476 + }, + { + "epoch": 3.7650455252054185, + "grad_norm": 0.5003419916114011, + "learning_rate": 1.040058798629906e-07, + "loss": 0.0259, + "step": 8477 + }, + { + "epoch": 3.7654896735509658, + "grad_norm": 0.5187802895889871, + "learning_rate": 1.0361291275650498e-07, + "loss": 0.0412, + "step": 8478 + }, + { + "epoch": 3.7659338218965135, + "grad_norm": 0.41358816649715524, + "learning_rate": 1.0322068166017386e-07, + "loss": 0.0325, + "step": 8479 + }, + { + "epoch": 3.766377970242061, + "grad_norm": 0.3554104231788337, + "learning_rate": 1.0282918663295616e-07, + "loss": 0.0266, + "step": 8480 + }, + { + "epoch": 3.7668221185876085, + "grad_norm": 0.433977374572574, + "learning_rate": 1.0243842773369983e-07, + "loss": 0.0262, + "step": 8481 + }, + { + "epoch": 3.767266266933156, + "grad_norm": 0.3050089012609615, + "learning_rate": 1.0204840502114288e-07, + "loss": 0.016, + "step": 8482 + }, + { + "epoch": 3.767710415278703, + "grad_norm": 0.35035155995481493, + "learning_rate": 1.0165911855391286e-07, + "loss": 0.022, + "step": 8483 + }, + { + "epoch": 3.7681545636242504, + "grad_norm": 0.3852773325944005, + "learning_rate": 1.0127056839052462e-07, + "loss": 0.017, + "step": 8484 + }, + { + "epoch": 3.7685987119697977, + "grad_norm": 0.43026798549784784, + "learning_rate": 1.0088275458938535e-07, + "loss": 0.0246, + "step": 8485 + }, + { + "epoch": 3.7690428603153454, + "grad_norm": 0.49903822216529625, + "learning_rate": 1.004956772087895e-07, + "loss": 0.0278, + "step": 8486 + }, + { + "epoch": 3.7694870086608927, + "grad_norm": 0.30382121537402157, + "learning_rate": 1.0010933630692166e-07, + "loss": 0.0168, + "step": 8487 + }, + { + "epoch": 3.76993115700644, + "grad_norm": 0.41575029483477577, + "learning_rate": 9.972373194185481e-08, + "loss": 0.0273, + "step": 8488 + }, + { + "epoch": 3.7703753053519877, + "grad_norm": 0.425652535729631, + "learning_rate": 9.933886417155258e-08, + "loss": 0.0258, + "step": 8489 + }, + { + "epoch": 3.770819453697535, + "grad_norm": 0.4620847686387116, + "learning_rate": 9.895473305386593e-08, + "loss": 0.024, + "step": 8490 + }, + { + "epoch": 3.7712636020430823, + "grad_norm": 0.4272116851275043, + "learning_rate": 9.857133864653812e-08, + "loss": 0.0246, + "step": 8491 + }, + { + "epoch": 3.7717077503886296, + "grad_norm": 0.3869470039777947, + "learning_rate": 9.818868100719803e-08, + "loss": 0.0266, + "step": 8492 + }, + { + "epoch": 3.7721518987341773, + "grad_norm": 0.38019015167069964, + "learning_rate": 9.780676019336632e-08, + "loss": 0.0235, + "step": 8493 + }, + { + "epoch": 3.7725960470797246, + "grad_norm": 0.6659937799546494, + "learning_rate": 9.742557626245264e-08, + "loss": 0.0336, + "step": 8494 + }, + { + "epoch": 3.773040195425272, + "grad_norm": 0.5490763929102886, + "learning_rate": 9.704512927175502e-08, + "loss": 0.0325, + "step": 8495 + }, + { + "epoch": 3.7734843437708196, + "grad_norm": 0.5157099798547079, + "learning_rate": 9.666541927846107e-08, + "loss": 0.0408, + "step": 8496 + }, + { + "epoch": 3.773928492116367, + "grad_norm": 0.6662930057806212, + "learning_rate": 9.62864463396479e-08, + "loss": 0.0181, + "step": 8497 + }, + { + "epoch": 3.7743726404619142, + "grad_norm": 0.36908564910307085, + "learning_rate": 9.590821051228105e-08, + "loss": 0.0196, + "step": 8498 + }, + { + "epoch": 3.7748167888074615, + "grad_norm": 0.34743147522098267, + "learning_rate": 9.553071185321616e-08, + "loss": 0.0177, + "step": 8499 + }, + { + "epoch": 3.7752609371530093, + "grad_norm": 0.4868308303940559, + "learning_rate": 9.515395041919839e-08, + "loss": 0.0292, + "step": 8500 + }, + { + "epoch": 3.7757050854985565, + "grad_norm": 0.4939860503734759, + "learning_rate": 9.477792626685966e-08, + "loss": 0.026, + "step": 8501 + }, + { + "epoch": 3.776149233844104, + "grad_norm": 0.31320876766724953, + "learning_rate": 9.440263945272365e-08, + "loss": 0.018, + "step": 8502 + }, + { + "epoch": 3.7765933821896516, + "grad_norm": 0.36925404461957756, + "learning_rate": 9.402809003320357e-08, + "loss": 0.022, + "step": 8503 + }, + { + "epoch": 3.777037530535199, + "grad_norm": 0.5099202980127734, + "learning_rate": 9.365427806459826e-08, + "loss": 0.0219, + "step": 8504 + }, + { + "epoch": 3.777481678880746, + "grad_norm": 0.4048961589102498, + "learning_rate": 9.32812036031e-08, + "loss": 0.02, + "step": 8505 + }, + { + "epoch": 3.7779258272262934, + "grad_norm": 0.3828893218593422, + "learning_rate": 9.290886670478727e-08, + "loss": 0.0329, + "step": 8506 + }, + { + "epoch": 3.7783699755718407, + "grad_norm": 0.47619859186660657, + "learning_rate": 9.253726742562808e-08, + "loss": 0.0339, + "step": 8507 + }, + { + "epoch": 3.7788141239173885, + "grad_norm": 0.3817560157892672, + "learning_rate": 9.216640582148218e-08, + "loss": 0.0245, + "step": 8508 + }, + { + "epoch": 3.7792582722629358, + "grad_norm": 0.44955998588034224, + "learning_rate": 9.179628194809387e-08, + "loss": 0.036, + "step": 8509 + }, + { + "epoch": 3.7797024206084835, + "grad_norm": 0.39409371816193695, + "learning_rate": 9.142689586110032e-08, + "loss": 0.0243, + "step": 8510 + }, + { + "epoch": 3.780146568954031, + "grad_norm": 0.3329695501761576, + "learning_rate": 9.105824761602711e-08, + "loss": 0.0177, + "step": 8511 + }, + { + "epoch": 3.780590717299578, + "grad_norm": 0.41637622163142424, + "learning_rate": 9.069033726828657e-08, + "loss": 0.0243, + "step": 8512 + }, + { + "epoch": 3.7810348656451254, + "grad_norm": 0.513878740141547, + "learning_rate": 9.032316487318338e-08, + "loss": 0.0422, + "step": 8513 + }, + { + "epoch": 3.7814790139906727, + "grad_norm": 0.3779536127482641, + "learning_rate": 8.995673048591002e-08, + "loss": 0.0243, + "step": 8514 + }, + { + "epoch": 3.7819231623362204, + "grad_norm": 0.40656693503772895, + "learning_rate": 8.959103416154635e-08, + "loss": 0.0204, + "step": 8515 + }, + { + "epoch": 3.7823673106817677, + "grad_norm": 0.37125663642968887, + "learning_rate": 8.922607595506339e-08, + "loss": 0.0222, + "step": 8516 + }, + { + "epoch": 3.782811459027315, + "grad_norm": 0.4060879405476042, + "learning_rate": 8.886185592132113e-08, + "loss": 0.0213, + "step": 8517 + }, + { + "epoch": 3.7832556073728627, + "grad_norm": 0.4488358277812707, + "learning_rate": 8.849837411506745e-08, + "loss": 0.0344, + "step": 8518 + }, + { + "epoch": 3.78369975571841, + "grad_norm": 0.4066455322610377, + "learning_rate": 8.813563059093977e-08, + "loss": 0.0242, + "step": 8519 + }, + { + "epoch": 3.7841439040639573, + "grad_norm": 0.4551394122387141, + "learning_rate": 8.777362540346501e-08, + "loss": 0.0293, + "step": 8520 + }, + { + "epoch": 3.7845880524095046, + "grad_norm": 0.42457221786596655, + "learning_rate": 8.741235860705855e-08, + "loss": 0.0259, + "step": 8521 + }, + { + "epoch": 3.7850322007550523, + "grad_norm": 0.3043935300931366, + "learning_rate": 8.70518302560247e-08, + "loss": 0.0182, + "step": 8522 + }, + { + "epoch": 3.7854763491005996, + "grad_norm": 0.4462173753124753, + "learning_rate": 8.669204040455737e-08, + "loss": 0.0265, + "step": 8523 + }, + { + "epoch": 3.785920497446147, + "grad_norm": 0.41601964127930957, + "learning_rate": 8.633298910673826e-08, + "loss": 0.0288, + "step": 8524 + }, + { + "epoch": 3.7863646457916946, + "grad_norm": 0.2813068890343098, + "learning_rate": 8.597467641654034e-08, + "loss": 0.0212, + "step": 8525 + }, + { + "epoch": 3.786808794137242, + "grad_norm": 0.43623034836188324, + "learning_rate": 8.561710238782272e-08, + "loss": 0.0249, + "step": 8526 + }, + { + "epoch": 3.787252942482789, + "grad_norm": 0.5751855788252586, + "learning_rate": 8.526026707433577e-08, + "loss": 0.0413, + "step": 8527 + }, + { + "epoch": 3.7876970908283365, + "grad_norm": 0.3868182612344314, + "learning_rate": 8.490417052971766e-08, + "loss": 0.0281, + "step": 8528 + }, + { + "epoch": 3.7881412391738842, + "grad_norm": 0.356372518499826, + "learning_rate": 8.45488128074945e-08, + "loss": 0.022, + "step": 8529 + }, + { + "epoch": 3.7885853875194315, + "grad_norm": 0.32262084504669236, + "learning_rate": 8.419419396108464e-08, + "loss": 0.0168, + "step": 8530 + }, + { + "epoch": 3.789029535864979, + "grad_norm": 0.4043798709101995, + "learning_rate": 8.384031404379211e-08, + "loss": 0.024, + "step": 8531 + }, + { + "epoch": 3.7894736842105265, + "grad_norm": 0.41478103237109665, + "learning_rate": 8.34871731088116e-08, + "loss": 0.027, + "step": 8532 + }, + { + "epoch": 3.789917832556074, + "grad_norm": 0.4196449951874737, + "learning_rate": 8.313477120922563e-08, + "loss": 0.0209, + "step": 8533 + }, + { + "epoch": 3.790361980901621, + "grad_norm": 0.4317330818971433, + "learning_rate": 8.278310839800685e-08, + "loss": 0.0284, + "step": 8534 + }, + { + "epoch": 3.7908061292471684, + "grad_norm": 0.44713201608661013, + "learning_rate": 8.243218472801461e-08, + "loss": 0.0438, + "step": 8535 + }, + { + "epoch": 3.7912502775927157, + "grad_norm": 0.4371441171214935, + "learning_rate": 8.208200025200119e-08, + "loss": 0.0201, + "step": 8536 + }, + { + "epoch": 3.7916944259382634, + "grad_norm": 0.5043700394965174, + "learning_rate": 8.173255502260336e-08, + "loss": 0.0285, + "step": 8537 + }, + { + "epoch": 3.7921385742838107, + "grad_norm": 0.3893482313023411, + "learning_rate": 8.138384909234964e-08, + "loss": 0.0258, + "step": 8538 + }, + { + "epoch": 3.7925827226293585, + "grad_norm": 0.3403931667558864, + "learning_rate": 8.103588251365534e-08, + "loss": 0.0199, + "step": 8539 + }, + { + "epoch": 3.7930268709749058, + "grad_norm": 0.4249904672408077, + "learning_rate": 8.068865533882752e-08, + "loss": 0.0221, + "step": 8540 + }, + { + "epoch": 3.793471019320453, + "grad_norm": 0.4067407706983934, + "learning_rate": 8.034216762005831e-08, + "loss": 0.0374, + "step": 8541 + }, + { + "epoch": 3.7939151676660003, + "grad_norm": 0.41570749972762994, + "learning_rate": 7.99964194094327e-08, + "loss": 0.024, + "step": 8542 + }, + { + "epoch": 3.7943593160115476, + "grad_norm": 0.48499039450946113, + "learning_rate": 7.96514107589208e-08, + "loss": 0.0341, + "step": 8543 + }, + { + "epoch": 3.7948034643570954, + "grad_norm": 0.40373614464609714, + "learning_rate": 7.93071417203839e-08, + "loss": 0.0268, + "step": 8544 + }, + { + "epoch": 3.7952476127026427, + "grad_norm": 0.35452268236632206, + "learning_rate": 7.896361234557226e-08, + "loss": 0.0192, + "step": 8545 + }, + { + "epoch": 3.79569176104819, + "grad_norm": 0.4666753180876556, + "learning_rate": 7.862082268612237e-08, + "loss": 0.0342, + "step": 8546 + }, + { + "epoch": 3.7961359093937377, + "grad_norm": 0.39409303578975985, + "learning_rate": 7.8278772793563e-08, + "loss": 0.0244, + "step": 8547 + }, + { + "epoch": 3.796580057739285, + "grad_norm": 0.543942317931089, + "learning_rate": 7.793746271930968e-08, + "loss": 0.0232, + "step": 8548 + }, + { + "epoch": 3.7970242060848323, + "grad_norm": 0.4331154507830711, + "learning_rate": 7.759689251466695e-08, + "loss": 0.0314, + "step": 8549 + }, + { + "epoch": 3.7974683544303796, + "grad_norm": 0.4001739441704062, + "learning_rate": 7.72570622308283e-08, + "loss": 0.0201, + "step": 8550 + }, + { + "epoch": 3.7979125027759273, + "grad_norm": 0.4891526654283636, + "learning_rate": 7.691797191887618e-08, + "loss": 0.027, + "step": 8551 + }, + { + "epoch": 3.7983566511214746, + "grad_norm": 0.40920185175638346, + "learning_rate": 7.657962162978038e-08, + "loss": 0.0264, + "step": 8552 + }, + { + "epoch": 3.798800799467022, + "grad_norm": 0.4710270896413282, + "learning_rate": 7.624201141440301e-08, + "loss": 0.0295, + "step": 8553 + }, + { + "epoch": 3.7992449478125696, + "grad_norm": 0.3793884463205298, + "learning_rate": 7.59051413234907e-08, + "loss": 0.0162, + "step": 8554 + }, + { + "epoch": 3.799689096158117, + "grad_norm": 0.2877108795069413, + "learning_rate": 7.556901140768125e-08, + "loss": 0.0134, + "step": 8555 + }, + { + "epoch": 3.800133244503664, + "grad_norm": 0.3350085177589909, + "learning_rate": 7.523362171750148e-08, + "loss": 0.0139, + "step": 8556 + }, + { + "epoch": 3.8005773928492115, + "grad_norm": 0.4349812650446206, + "learning_rate": 7.489897230336496e-08, + "loss": 0.0276, + "step": 8557 + }, + { + "epoch": 3.801021541194759, + "grad_norm": 0.3673667656791951, + "learning_rate": 7.456506321557533e-08, + "loss": 0.0212, + "step": 8558 + }, + { + "epoch": 3.8014656895403065, + "grad_norm": 0.4327339896844779, + "learning_rate": 7.423189450432633e-08, + "loss": 0.0234, + "step": 8559 + }, + { + "epoch": 3.801909837885854, + "grad_norm": 0.3414463099110009, + "learning_rate": 7.389946621969679e-08, + "loss": 0.02, + "step": 8560 + }, + { + "epoch": 3.8023539862314015, + "grad_norm": 0.35401161038931783, + "learning_rate": 7.356777841165786e-08, + "loss": 0.0236, + "step": 8561 + }, + { + "epoch": 3.802798134576949, + "grad_norm": 0.3318458174565355, + "learning_rate": 7.32368311300674e-08, + "loss": 0.0228, + "step": 8562 + }, + { + "epoch": 3.803242282922496, + "grad_norm": 0.4407338624172669, + "learning_rate": 7.290662442467178e-08, + "loss": 0.0243, + "step": 8563 + }, + { + "epoch": 3.8036864312680434, + "grad_norm": 0.3688164392706434, + "learning_rate": 7.257715834510737e-08, + "loss": 0.0194, + "step": 8564 + }, + { + "epoch": 3.8041305796135907, + "grad_norm": 0.4497260712320215, + "learning_rate": 7.224843294089844e-08, + "loss": 0.0287, + "step": 8565 + }, + { + "epoch": 3.8045747279591384, + "grad_norm": 0.33236407987313205, + "learning_rate": 7.192044826145772e-08, + "loss": 0.0191, + "step": 8566 + }, + { + "epoch": 3.8050188763046857, + "grad_norm": 0.37564613529503493, + "learning_rate": 7.159320435608741e-08, + "loss": 0.0161, + "step": 8567 + }, + { + "epoch": 3.8054630246502335, + "grad_norm": 0.3522232268407622, + "learning_rate": 7.126670127397705e-08, + "loss": 0.0215, + "step": 8568 + }, + { + "epoch": 3.8059071729957807, + "grad_norm": 0.429595537532537, + "learning_rate": 7.094093906420629e-08, + "loss": 0.0288, + "step": 8569 + }, + { + "epoch": 3.806351321341328, + "grad_norm": 0.4178495453889158, + "learning_rate": 7.061591777574261e-08, + "loss": 0.0273, + "step": 8570 + }, + { + "epoch": 3.8067954696868753, + "grad_norm": 0.34370415504576896, + "learning_rate": 7.029163745744194e-08, + "loss": 0.0198, + "step": 8571 + }, + { + "epoch": 3.8072396180324226, + "grad_norm": 0.6738541750686097, + "learning_rate": 6.996809815804917e-08, + "loss": 0.0216, + "step": 8572 + }, + { + "epoch": 3.8076837663779703, + "grad_norm": 0.3104036826233125, + "learning_rate": 6.964529992619817e-08, + "loss": 0.0158, + "step": 8573 + }, + { + "epoch": 3.8081279147235176, + "grad_norm": 0.3875694214420725, + "learning_rate": 6.932324281041014e-08, + "loss": 0.0282, + "step": 8574 + }, + { + "epoch": 3.808572063069065, + "grad_norm": 0.7370247616254277, + "learning_rate": 6.900192685909635e-08, + "loss": 0.0427, + "step": 8575 + }, + { + "epoch": 3.8090162114146127, + "grad_norm": 0.3713027373129603, + "learning_rate": 6.868135212055649e-08, + "loss": 0.0273, + "step": 8576 + }, + { + "epoch": 3.80946035976016, + "grad_norm": 0.4118005174789893, + "learning_rate": 6.836151864297702e-08, + "loss": 0.0256, + "step": 8577 + }, + { + "epoch": 3.8099045081057072, + "grad_norm": 0.47276389861446366, + "learning_rate": 6.80424264744356e-08, + "loss": 0.0345, + "step": 8578 + }, + { + "epoch": 3.8103486564512545, + "grad_norm": 0.49450924911427574, + "learning_rate": 6.772407566289718e-08, + "loss": 0.0299, + "step": 8579 + }, + { + "epoch": 3.8107928047968023, + "grad_norm": 0.3914626594412097, + "learning_rate": 6.740646625621461e-08, + "loss": 0.0244, + "step": 8580 + }, + { + "epoch": 3.8112369531423496, + "grad_norm": 0.3930999395745575, + "learning_rate": 6.708959830213024e-08, + "loss": 0.0359, + "step": 8581 + }, + { + "epoch": 3.811681101487897, + "grad_norm": 0.49573525692180653, + "learning_rate": 6.677347184827487e-08, + "loss": 0.0319, + "step": 8582 + }, + { + "epoch": 3.8121252498334446, + "grad_norm": 0.49861370512502395, + "learning_rate": 6.645808694216715e-08, + "loss": 0.0256, + "step": 8583 + }, + { + "epoch": 3.812569398178992, + "grad_norm": 0.4401777666781036, + "learning_rate": 6.614344363121583e-08, + "loss": 0.0223, + "step": 8584 + }, + { + "epoch": 3.813013546524539, + "grad_norm": 0.36977940669747483, + "learning_rate": 6.582954196271641e-08, + "loss": 0.0214, + "step": 8585 + }, + { + "epoch": 3.8134576948700865, + "grad_norm": 0.4927485327079211, + "learning_rate": 6.55163819838528e-08, + "loss": 0.0287, + "step": 8586 + }, + { + "epoch": 3.8139018432156337, + "grad_norm": 0.37546001627014713, + "learning_rate": 6.520396374170013e-08, + "loss": 0.0229, + "step": 8587 + }, + { + "epoch": 3.8143459915611815, + "grad_norm": 0.4196889003684699, + "learning_rate": 6.489228728321917e-08, + "loss": 0.0276, + "step": 8588 + }, + { + "epoch": 3.8147901399067288, + "grad_norm": 0.4820349858082287, + "learning_rate": 6.458135265525967e-08, + "loss": 0.0274, + "step": 8589 + }, + { + "epoch": 3.8152342882522765, + "grad_norm": 0.3797443254174065, + "learning_rate": 6.427115990456201e-08, + "loss": 0.0237, + "step": 8590 + }, + { + "epoch": 3.815678436597824, + "grad_norm": 0.4390761052826011, + "learning_rate": 6.396170907775167e-08, + "loss": 0.03, + "step": 8591 + }, + { + "epoch": 3.816122584943371, + "grad_norm": 0.32997954286919473, + "learning_rate": 6.365300022134479e-08, + "loss": 0.0242, + "step": 8592 + }, + { + "epoch": 3.8165667332889184, + "grad_norm": 0.45875132604087615, + "learning_rate": 6.334503338174646e-08, + "loss": 0.0269, + "step": 8593 + }, + { + "epoch": 3.8170108816344657, + "grad_norm": 0.48148563808244804, + "learning_rate": 6.303780860524855e-08, + "loss": 0.0273, + "step": 8594 + }, + { + "epoch": 3.8174550299800134, + "grad_norm": 0.4353447524714429, + "learning_rate": 6.273132593803189e-08, + "loss": 0.0195, + "step": 8595 + }, + { + "epoch": 3.8178991783255607, + "grad_norm": 0.38567398492319127, + "learning_rate": 6.242558542616739e-08, + "loss": 0.0193, + "step": 8596 + }, + { + "epoch": 3.818343326671108, + "grad_norm": 0.4173237164922603, + "learning_rate": 6.212058711561165e-08, + "loss": 0.0278, + "step": 8597 + }, + { + "epoch": 3.8187874750166557, + "grad_norm": 0.3154210166282003, + "learning_rate": 6.18163310522113e-08, + "loss": 0.0189, + "step": 8598 + }, + { + "epoch": 3.819231623362203, + "grad_norm": 0.5134503050749415, + "learning_rate": 6.151281728170144e-08, + "loss": 0.0289, + "step": 8599 + }, + { + "epoch": 3.8196757717077503, + "grad_norm": 0.46640864477037924, + "learning_rate": 6.121004584970558e-08, + "loss": 0.0308, + "step": 8600 + }, + { + "epoch": 3.8201199200532976, + "grad_norm": 0.36566694826665086, + "learning_rate": 6.090801680173563e-08, + "loss": 0.022, + "step": 8601 + }, + { + "epoch": 3.8205640683988453, + "grad_norm": 0.39260388522181044, + "learning_rate": 6.060673018319085e-08, + "loss": 0.033, + "step": 8602 + }, + { + "epoch": 3.8210082167443926, + "grad_norm": 0.32628501291346773, + "learning_rate": 6.030618603935945e-08, + "loss": 0.0195, + "step": 8603 + }, + { + "epoch": 3.82145236508994, + "grad_norm": 0.5032358609244834, + "learning_rate": 6.000638441542029e-08, + "loss": 0.0367, + "step": 8604 + }, + { + "epoch": 3.8218965134354876, + "grad_norm": 0.49029558220343267, + "learning_rate": 5.970732535643675e-08, + "loss": 0.0357, + "step": 8605 + }, + { + "epoch": 3.822340661781035, + "grad_norm": 0.4041938487036102, + "learning_rate": 5.94090089073629e-08, + "loss": 0.0192, + "step": 8606 + }, + { + "epoch": 3.8227848101265822, + "grad_norm": 0.35662580614791967, + "learning_rate": 5.911143511304174e-08, + "loss": 0.0223, + "step": 8607 + }, + { + "epoch": 3.8232289584721295, + "grad_norm": 0.42334321858420404, + "learning_rate": 5.8814604018202494e-08, + "loss": 0.0333, + "step": 8608 + }, + { + "epoch": 3.8236731068176772, + "grad_norm": 0.3454601103621082, + "learning_rate": 5.851851566746392e-08, + "loss": 0.0184, + "step": 8609 + }, + { + "epoch": 3.8241172551632245, + "grad_norm": 0.39337857078220745, + "learning_rate": 5.8223170105333734e-08, + "loss": 0.0228, + "step": 8610 + }, + { + "epoch": 3.824561403508772, + "grad_norm": 0.3888924660914583, + "learning_rate": 5.792856737620756e-08, + "loss": 0.0222, + "step": 8611 + }, + { + "epoch": 3.8250055518543196, + "grad_norm": 0.3633339990171769, + "learning_rate": 5.763470752436884e-08, + "loss": 0.0178, + "step": 8612 + }, + { + "epoch": 3.825449700199867, + "grad_norm": 0.5069652365071539, + "learning_rate": 5.734159059398947e-08, + "loss": 0.0311, + "step": 8613 + }, + { + "epoch": 3.825893848545414, + "grad_norm": 0.36481775204860845, + "learning_rate": 5.7049216629129764e-08, + "loss": 0.0214, + "step": 8614 + }, + { + "epoch": 3.8263379968909614, + "grad_norm": 0.4066225715814007, + "learning_rate": 5.6757585673739014e-08, + "loss": 0.0196, + "step": 8615 + }, + { + "epoch": 3.8267821452365087, + "grad_norm": 0.38508300233088066, + "learning_rate": 5.6466697771654365e-08, + "loss": 0.0359, + "step": 8616 + }, + { + "epoch": 3.8272262935820565, + "grad_norm": 0.3857136200508219, + "learning_rate": 5.617655296660085e-08, + "loss": 0.0295, + "step": 8617 + }, + { + "epoch": 3.8276704419276038, + "grad_norm": 0.38361748458889544, + "learning_rate": 5.5887151302192465e-08, + "loss": 0.0225, + "step": 8618 + }, + { + "epoch": 3.8281145902731515, + "grad_norm": 0.4136696628077852, + "learning_rate": 5.5598492821931083e-08, + "loss": 0.0259, + "step": 8619 + }, + { + "epoch": 3.8285587386186988, + "grad_norm": 0.4157902458106096, + "learning_rate": 5.531057756920644e-08, + "loss": 0.0299, + "step": 8620 + }, + { + "epoch": 3.829002886964246, + "grad_norm": 0.41824752196070275, + "learning_rate": 5.502340558729835e-08, + "loss": 0.0276, + "step": 8621 + }, + { + "epoch": 3.8294470353097934, + "grad_norm": 0.3542732476447781, + "learning_rate": 5.4736976919372295e-08, + "loss": 0.0214, + "step": 8622 + }, + { + "epoch": 3.8298911836553406, + "grad_norm": 0.47614032589088634, + "learning_rate": 5.445129160848384e-08, + "loss": 0.0271, + "step": 8623 + }, + { + "epoch": 3.8303353320008884, + "grad_norm": 0.38508281181821147, + "learning_rate": 5.416634969757695e-08, + "loss": 0.0184, + "step": 8624 + }, + { + "epoch": 3.8307794803464357, + "grad_norm": 0.3910260501124829, + "learning_rate": 5.388215122948237e-08, + "loss": 0.0259, + "step": 8625 + }, + { + "epoch": 3.831223628691983, + "grad_norm": 0.31939094873163437, + "learning_rate": 5.359869624692038e-08, + "loss": 0.0197, + "step": 8626 + }, + { + "epoch": 3.8316677770375307, + "grad_norm": 0.30534186293474436, + "learning_rate": 5.331598479249911e-08, + "loss": 0.0201, + "step": 8627 + }, + { + "epoch": 3.832111925383078, + "grad_norm": 0.33277037895962464, + "learning_rate": 5.303401690871457e-08, + "loss": 0.0203, + "step": 8628 + }, + { + "epoch": 3.8325560737286253, + "grad_norm": 0.46652287015112265, + "learning_rate": 5.275279263795175e-08, + "loss": 0.0282, + "step": 8629 + }, + { + "epoch": 3.8330002220741726, + "grad_norm": 0.3848369302003942, + "learning_rate": 5.2472312022483486e-08, + "loss": 0.0258, + "step": 8630 + }, + { + "epoch": 3.8334443704197203, + "grad_norm": 0.4448218074585927, + "learning_rate": 5.2192575104469956e-08, + "loss": 0.0254, + "step": 8631 + }, + { + "epoch": 3.8338885187652676, + "grad_norm": 0.384660032948373, + "learning_rate": 5.1913581925960853e-08, + "loss": 0.0298, + "step": 8632 + }, + { + "epoch": 3.834332667110815, + "grad_norm": 0.4204621984497172, + "learning_rate": 5.16353325288943e-08, + "loss": 0.0242, + "step": 8633 + }, + { + "epoch": 3.8347768154563626, + "grad_norm": 0.46188483366801475, + "learning_rate": 5.135782695509461e-08, + "loss": 0.0231, + "step": 8634 + }, + { + "epoch": 3.83522096380191, + "grad_norm": 0.31629549729913453, + "learning_rate": 5.1081065246277314e-08, + "loss": 0.0158, + "step": 8635 + }, + { + "epoch": 3.835665112147457, + "grad_norm": 0.4353107611550551, + "learning_rate": 5.0805047444042467e-08, + "loss": 0.0287, + "step": 8636 + }, + { + "epoch": 3.8361092604930045, + "grad_norm": 0.40299033179334315, + "learning_rate": 5.0529773589881315e-08, + "loss": 0.0195, + "step": 8637 + }, + { + "epoch": 3.8365534088385522, + "grad_norm": 0.39085359021822447, + "learning_rate": 5.0255243725171876e-08, + "loss": 0.0279, + "step": 8638 + }, + { + "epoch": 3.8369975571840995, + "grad_norm": 0.48519972906519826, + "learning_rate": 4.998145789118114e-08, + "loss": 0.0333, + "step": 8639 + }, + { + "epoch": 3.837441705529647, + "grad_norm": 0.3517351658799901, + "learning_rate": 4.970841612906285e-08, + "loss": 0.0224, + "step": 8640 + }, + { + "epoch": 3.8378858538751945, + "grad_norm": 0.4190846054803923, + "learning_rate": 4.943611847986085e-08, + "loss": 0.0234, + "step": 8641 + }, + { + "epoch": 3.838330002220742, + "grad_norm": 0.35500926856677334, + "learning_rate": 4.9164564984505723e-08, + "loss": 0.0198, + "step": 8642 + }, + { + "epoch": 3.838774150566289, + "grad_norm": 0.34433663873966086, + "learning_rate": 4.889375568381594e-08, + "loss": 0.0174, + "step": 8643 + }, + { + "epoch": 3.8392182989118364, + "grad_norm": 0.38443528004632466, + "learning_rate": 4.8623690618499474e-08, + "loss": 0.0229, + "step": 8644 + }, + { + "epoch": 3.8396624472573837, + "grad_norm": 0.6274521829246321, + "learning_rate": 4.835436982915165e-08, + "loss": 0.0344, + "step": 8645 + }, + { + "epoch": 3.8401065956029314, + "grad_norm": 0.6266172275143739, + "learning_rate": 4.808579335625563e-08, + "loss": 0.0327, + "step": 8646 + }, + { + "epoch": 3.8405507439484787, + "grad_norm": 0.34058549807195543, + "learning_rate": 4.7817961240183567e-08, + "loss": 0.0224, + "step": 8647 + }, + { + "epoch": 3.8409948922940265, + "grad_norm": 0.4390835521149124, + "learning_rate": 4.7550873521194364e-08, + "loss": 0.0195, + "step": 8648 + }, + { + "epoch": 3.8414390406395738, + "grad_norm": 0.3842688631078936, + "learning_rate": 4.728453023943591e-08, + "loss": 0.0231, + "step": 8649 + }, + { + "epoch": 3.841883188985121, + "grad_norm": 0.4692045722646363, + "learning_rate": 4.701893143494507e-08, + "loss": 0.0324, + "step": 8650 + }, + { + "epoch": 3.8423273373306683, + "grad_norm": 0.45747006524072764, + "learning_rate": 4.675407714764491e-08, + "loss": 0.0308, + "step": 8651 + }, + { + "epoch": 3.8427714856762156, + "grad_norm": 0.3514460328352755, + "learning_rate": 4.648996741734857e-08, + "loss": 0.0207, + "step": 8652 + }, + { + "epoch": 3.8432156340217634, + "grad_norm": 0.3912542737417533, + "learning_rate": 4.622660228375486e-08, + "loss": 0.0219, + "step": 8653 + }, + { + "epoch": 3.8436597823673107, + "grad_norm": 0.3136962524397564, + "learning_rate": 4.596398178645323e-08, + "loss": 0.0178, + "step": 8654 + }, + { + "epoch": 3.844103930712858, + "grad_norm": 0.35110936427287714, + "learning_rate": 4.5702105964919305e-08, + "loss": 0.0163, + "step": 8655 + }, + { + "epoch": 3.8445480790584057, + "grad_norm": 0.3924180108841284, + "learning_rate": 4.5440974858517174e-08, + "loss": 0.0185, + "step": 8656 + }, + { + "epoch": 3.844992227403953, + "grad_norm": 0.41446900223517336, + "learning_rate": 4.5180588506500424e-08, + "loss": 0.0294, + "step": 8657 + }, + { + "epoch": 3.8454363757495003, + "grad_norm": 0.3695057701384476, + "learning_rate": 4.492094694800886e-08, + "loss": 0.0269, + "step": 8658 + }, + { + "epoch": 3.8458805240950475, + "grad_norm": 0.3819021189729295, + "learning_rate": 4.4662050222070707e-08, + "loss": 0.0242, + "step": 8659 + }, + { + "epoch": 3.8463246724405953, + "grad_norm": 0.4011303876812642, + "learning_rate": 4.440389836760317e-08, + "loss": 0.028, + "step": 8660 + }, + { + "epoch": 3.8467688207861426, + "grad_norm": 0.42320733559927765, + "learning_rate": 4.414649142341021e-08, + "loss": 0.023, + "step": 8661 + }, + { + "epoch": 3.84721296913169, + "grad_norm": 0.4205022645129385, + "learning_rate": 4.388982942818476e-08, + "loss": 0.0261, + "step": 8662 + }, + { + "epoch": 3.8476571174772376, + "grad_norm": 0.3312900714579485, + "learning_rate": 4.363391242050819e-08, + "loss": 0.0191, + "step": 8663 + }, + { + "epoch": 3.848101265822785, + "grad_norm": 0.40420539853558396, + "learning_rate": 4.3378740438848045e-08, + "loss": 0.0317, + "step": 8664 + }, + { + "epoch": 3.848545414168332, + "grad_norm": 0.41550220010293626, + "learning_rate": 4.312431352156143e-08, + "loss": 0.0255, + "step": 8665 + }, + { + "epoch": 3.8489895625138795, + "grad_norm": 0.34141480859605966, + "learning_rate": 4.287063170689332e-08, + "loss": 0.0219, + "step": 8666 + }, + { + "epoch": 3.849433710859427, + "grad_norm": 0.38457339309127786, + "learning_rate": 4.261769503297597e-08, + "loss": 0.0241, + "step": 8667 + }, + { + "epoch": 3.8498778592049745, + "grad_norm": 0.3865401623267944, + "learning_rate": 4.236550353783009e-08, + "loss": 0.0177, + "step": 8668 + }, + { + "epoch": 3.850322007550522, + "grad_norm": 0.40767308229409815, + "learning_rate": 4.211405725936535e-08, + "loss": 0.0255, + "step": 8669 + }, + { + "epoch": 3.8507661558960695, + "grad_norm": 0.3151055767858847, + "learning_rate": 4.186335623537707e-08, + "loss": 0.0233, + "step": 8670 + }, + { + "epoch": 3.851210304241617, + "grad_norm": 0.3928367577202992, + "learning_rate": 4.1613400503550114e-08, + "loss": 0.0212, + "step": 8671 + }, + { + "epoch": 3.851654452587164, + "grad_norm": 0.4221600614877391, + "learning_rate": 4.13641901014572e-08, + "loss": 0.034, + "step": 8672 + }, + { + "epoch": 3.8520986009327114, + "grad_norm": 0.3485554952889989, + "learning_rate": 4.1115725066559476e-08, + "loss": 0.0189, + "step": 8673 + }, + { + "epoch": 3.8525427492782587, + "grad_norm": 0.4184669816003682, + "learning_rate": 4.086800543620484e-08, + "loss": 0.0265, + "step": 8674 + }, + { + "epoch": 3.8529868976238064, + "grad_norm": 0.40633919115050987, + "learning_rate": 4.062103124763017e-08, + "loss": 0.0178, + "step": 8675 + }, + { + "epoch": 3.8534310459693537, + "grad_norm": 0.5255893189755798, + "learning_rate": 4.0374802537959114e-08, + "loss": 0.0255, + "step": 8676 + }, + { + "epoch": 3.8538751943149014, + "grad_norm": 0.4396123179486424, + "learning_rate": 4.012931934420483e-08, + "loss": 0.0317, + "step": 8677 + }, + { + "epoch": 3.8543193426604487, + "grad_norm": 0.3657385642630762, + "learning_rate": 3.9884581703267254e-08, + "loss": 0.022, + "step": 8678 + }, + { + "epoch": 3.854763491005996, + "grad_norm": 0.34626474417977715, + "learning_rate": 3.964058965193473e-08, + "loss": 0.022, + "step": 8679 + }, + { + "epoch": 3.8552076393515433, + "grad_norm": 0.38562426634481894, + "learning_rate": 3.939734322688349e-08, + "loss": 0.0338, + "step": 8680 + }, + { + "epoch": 3.8556517876970906, + "grad_norm": 0.4035987782299242, + "learning_rate": 3.9154842464677045e-08, + "loss": 0.0236, + "step": 8681 + }, + { + "epoch": 3.8560959360426383, + "grad_norm": 0.4159647047153014, + "learning_rate": 3.8913087401767914e-08, + "loss": 0.024, + "step": 8682 + }, + { + "epoch": 3.8565400843881856, + "grad_norm": 0.5090881621984651, + "learning_rate": 3.867207807449591e-08, + "loss": 0.03, + "step": 8683 + }, + { + "epoch": 3.856984232733733, + "grad_norm": 0.3724391004939136, + "learning_rate": 3.843181451908928e-08, + "loss": 0.0222, + "step": 8684 + }, + { + "epoch": 3.8574283810792807, + "grad_norm": 0.3994107927570375, + "learning_rate": 3.8192296771663026e-08, + "loss": 0.0365, + "step": 8685 + }, + { + "epoch": 3.857872529424828, + "grad_norm": 0.534388923736043, + "learning_rate": 3.795352486822057e-08, + "loss": 0.0286, + "step": 8686 + }, + { + "epoch": 3.8583166777703752, + "grad_norm": 0.3902311138190663, + "learning_rate": 3.7715498844653755e-08, + "loss": 0.0332, + "step": 8687 + }, + { + "epoch": 3.8587608261159225, + "grad_norm": 0.6121843349896643, + "learning_rate": 3.7478218736742286e-08, + "loss": 0.0354, + "step": 8688 + }, + { + "epoch": 3.8592049744614703, + "grad_norm": 0.3370534483485381, + "learning_rate": 3.724168458015265e-08, + "loss": 0.0221, + "step": 8689 + }, + { + "epoch": 3.8596491228070176, + "grad_norm": 0.450573063405829, + "learning_rate": 3.700589641044083e-08, + "loss": 0.0225, + "step": 8690 + }, + { + "epoch": 3.860093271152565, + "grad_norm": 0.49278309629574163, + "learning_rate": 3.677085426304905e-08, + "loss": 0.029, + "step": 8691 + }, + { + "epoch": 3.8605374194981126, + "grad_norm": 0.4055229539649227, + "learning_rate": 3.6536558173308476e-08, + "loss": 0.025, + "step": 8692 + }, + { + "epoch": 3.86098156784366, + "grad_norm": 0.3771030172546445, + "learning_rate": 3.630300817643762e-08, + "loss": 0.0196, + "step": 8693 + }, + { + "epoch": 3.861425716189207, + "grad_norm": 0.4460409041626926, + "learning_rate": 3.607020430754338e-08, + "loss": 0.0263, + "step": 8694 + }, + { + "epoch": 3.8618698645347544, + "grad_norm": 0.36455946204318906, + "learning_rate": 3.583814660161944e-08, + "loss": 0.0254, + "step": 8695 + }, + { + "epoch": 3.862314012880302, + "grad_norm": 0.45869268016899095, + "learning_rate": 3.5606835093548456e-08, + "loss": 0.0364, + "step": 8696 + }, + { + "epoch": 3.8627581612258495, + "grad_norm": 0.3785302300726189, + "learning_rate": 3.537626981810094e-08, + "loss": 0.0203, + "step": 8697 + }, + { + "epoch": 3.8632023095713968, + "grad_norm": 0.41102132869975794, + "learning_rate": 3.514645080993362e-08, + "loss": 0.0304, + "step": 8698 + }, + { + "epoch": 3.8636464579169445, + "grad_norm": 0.3842024608267308, + "learning_rate": 3.49173781035933e-08, + "loss": 0.0217, + "step": 8699 + }, + { + "epoch": 3.864090606262492, + "grad_norm": 0.5453458258267285, + "learning_rate": 3.4689051733513e-08, + "loss": 0.0423, + "step": 8700 + }, + { + "epoch": 3.864534754608039, + "grad_norm": 0.4282646993236422, + "learning_rate": 3.446147173401415e-08, + "loss": 0.0283, + "step": 8701 + }, + { + "epoch": 3.8649789029535864, + "grad_norm": 0.3536636171621975, + "learning_rate": 3.4234638139306055e-08, + "loss": 0.0239, + "step": 8702 + }, + { + "epoch": 3.8654230512991337, + "grad_norm": 0.32067352971181584, + "learning_rate": 3.4008550983484766e-08, + "loss": 0.019, + "step": 8703 + }, + { + "epoch": 3.8658671996446814, + "grad_norm": 0.42623278208292326, + "learning_rate": 3.378321030053644e-08, + "loss": 0.0235, + "step": 8704 + }, + { + "epoch": 3.8663113479902287, + "grad_norm": 0.4469975288314765, + "learning_rate": 3.355861612433231e-08, + "loss": 0.0309, + "step": 8705 + }, + { + "epoch": 3.8667554963357764, + "grad_norm": 0.4959636073318917, + "learning_rate": 3.3334768488633706e-08, + "loss": 0.0284, + "step": 8706 + }, + { + "epoch": 3.8671996446813237, + "grad_norm": 0.503087622712729, + "learning_rate": 3.31116674270876e-08, + "loss": 0.0256, + "step": 8707 + }, + { + "epoch": 3.867643793026871, + "grad_norm": 0.36447403400661404, + "learning_rate": 3.2889312973231616e-08, + "loss": 0.0187, + "step": 8708 + }, + { + "epoch": 3.8680879413724183, + "grad_norm": 0.39131073302761393, + "learning_rate": 3.266770516048734e-08, + "loss": 0.0227, + "step": 8709 + }, + { + "epoch": 3.8685320897179656, + "grad_norm": 0.39335266998944496, + "learning_rate": 3.2446844022167576e-08, + "loss": 0.0302, + "step": 8710 + }, + { + "epoch": 3.8689762380635133, + "grad_norm": 0.3587677625718898, + "learning_rate": 3.2226729591471326e-08, + "loss": 0.0196, + "step": 8711 + }, + { + "epoch": 3.8694203864090606, + "grad_norm": 0.360223764784636, + "learning_rate": 3.2007361901485455e-08, + "loss": 0.0202, + "step": 8712 + }, + { + "epoch": 3.869864534754608, + "grad_norm": 0.45416360753066903, + "learning_rate": 3.1788740985184144e-08, + "loss": 0.0296, + "step": 8713 + }, + { + "epoch": 3.8703086831001556, + "grad_norm": 0.4540255704066019, + "learning_rate": 3.1570866875430536e-08, + "loss": 0.03, + "step": 8714 + }, + { + "epoch": 3.870752831445703, + "grad_norm": 0.39674557183309156, + "learning_rate": 3.135373960497401e-08, + "loss": 0.0246, + "step": 8715 + }, + { + "epoch": 3.87119697979125, + "grad_norm": 0.4615034431538654, + "learning_rate": 3.113735920645344e-08, + "loss": 0.0209, + "step": 8716 + }, + { + "epoch": 3.8716411281367975, + "grad_norm": 0.40129516372174695, + "learning_rate": 3.092172571239338e-08, + "loss": 0.0239, + "step": 8717 + }, + { + "epoch": 3.8720852764823452, + "grad_norm": 0.3795932466717671, + "learning_rate": 3.070683915520845e-08, + "loss": 0.0275, + "step": 8718 + }, + { + "epoch": 3.8725294248278925, + "grad_norm": 0.5210559447705239, + "learning_rate": 3.049269956719891e-08, + "loss": 0.0261, + "step": 8719 + }, + { + "epoch": 3.87297357317344, + "grad_norm": 0.4437613175714331, + "learning_rate": 3.0279306980554034e-08, + "loss": 0.0295, + "step": 8720 + }, + { + "epoch": 3.8734177215189876, + "grad_norm": 0.3896516822182538, + "learning_rate": 3.006666142734982e-08, + "loss": 0.0246, + "step": 8721 + }, + { + "epoch": 3.873861869864535, + "grad_norm": 0.43115789241542724, + "learning_rate": 2.9854762939551254e-08, + "loss": 0.0314, + "step": 8722 + }, + { + "epoch": 3.874306018210082, + "grad_norm": 0.34436582503878505, + "learning_rate": 2.9643611549008967e-08, + "loss": 0.0219, + "step": 8723 + }, + { + "epoch": 3.8747501665556294, + "grad_norm": 0.430010546023126, + "learning_rate": 2.9433207287464238e-08, + "loss": 0.0267, + "step": 8724 + }, + { + "epoch": 3.8751943149011767, + "grad_norm": 0.28855368882245647, + "learning_rate": 2.9223550186543435e-08, + "loss": 0.0137, + "step": 8725 + }, + { + "epoch": 3.8756384632467245, + "grad_norm": 0.3534494715975773, + "learning_rate": 2.9014640277761353e-08, + "loss": 0.0274, + "step": 8726 + }, + { + "epoch": 3.8760826115922717, + "grad_norm": 0.3093760532300058, + "learning_rate": 2.8806477592521755e-08, + "loss": 0.015, + "step": 8727 + }, + { + "epoch": 3.8765267599378195, + "grad_norm": 0.6497895297842237, + "learning_rate": 2.8599062162114056e-08, + "loss": 0.037, + "step": 8728 + }, + { + "epoch": 3.8769709082833668, + "grad_norm": 0.5014893102635929, + "learning_rate": 2.8392394017716095e-08, + "loss": 0.0344, + "step": 8729 + }, + { + "epoch": 3.877415056628914, + "grad_norm": 0.3936192272437833, + "learning_rate": 2.8186473190395246e-08, + "loss": 0.0303, + "step": 8730 + }, + { + "epoch": 3.8778592049744613, + "grad_norm": 0.4010118354626057, + "learning_rate": 2.798129971110286e-08, + "loss": 0.0307, + "step": 8731 + }, + { + "epoch": 3.8783033533200086, + "grad_norm": 0.3784503093468852, + "learning_rate": 2.7776873610681486e-08, + "loss": 0.02, + "step": 8732 + }, + { + "epoch": 3.8787475016655564, + "grad_norm": 0.4230263125616594, + "learning_rate": 2.7573194919859325e-08, + "loss": 0.0224, + "step": 8733 + }, + { + "epoch": 3.8791916500111037, + "grad_norm": 0.4140738675854264, + "learning_rate": 2.737026366925244e-08, + "loss": 0.0258, + "step": 8734 + }, + { + "epoch": 3.879635798356651, + "grad_norm": 0.42420423811982005, + "learning_rate": 2.716807988936532e-08, + "loss": 0.022, + "step": 8735 + }, + { + "epoch": 3.8800799467021987, + "grad_norm": 0.3980720494346127, + "learning_rate": 2.696664361058976e-08, + "loss": 0.0269, + "step": 8736 + }, + { + "epoch": 3.880524095047746, + "grad_norm": 0.3946254166839832, + "learning_rate": 2.6765954863204323e-08, + "loss": 0.0288, + "step": 8737 + }, + { + "epoch": 3.8809682433932933, + "grad_norm": 0.3788227216815116, + "learning_rate": 2.6566013677376545e-08, + "loss": 0.0238, + "step": 8738 + }, + { + "epoch": 3.8814123917388406, + "grad_norm": 0.34261939630726157, + "learning_rate": 2.6366820083160715e-08, + "loss": 0.0145, + "step": 8739 + }, + { + "epoch": 3.8818565400843883, + "grad_norm": 0.42749605488948866, + "learning_rate": 2.6168374110498995e-08, + "loss": 0.0257, + "step": 8740 + }, + { + "epoch": 3.8823006884299356, + "grad_norm": 0.4042334016798025, + "learning_rate": 2.5970675789220855e-08, + "loss": 0.0186, + "step": 8741 + }, + { + "epoch": 3.882744836775483, + "grad_norm": 0.39831255423745965, + "learning_rate": 2.577372514904475e-08, + "loss": 0.0209, + "step": 8742 + }, + { + "epoch": 3.8831889851210306, + "grad_norm": 0.42360207246789, + "learning_rate": 2.5577522219575324e-08, + "loss": 0.0275, + "step": 8743 + }, + { + "epoch": 3.883633133466578, + "grad_norm": 0.48707455874974, + "learning_rate": 2.5382067030304546e-08, + "loss": 0.0282, + "step": 8744 + }, + { + "epoch": 3.884077281812125, + "grad_norm": 0.4723400265193081, + "learning_rate": 2.5187359610612805e-08, + "loss": 0.0382, + "step": 8745 + }, + { + "epoch": 3.8845214301576725, + "grad_norm": 0.6222325293211652, + "learning_rate": 2.499339998976835e-08, + "loss": 0.0342, + "step": 8746 + }, + { + "epoch": 3.88496557850322, + "grad_norm": 0.35612361294226674, + "learning_rate": 2.4800188196926757e-08, + "loss": 0.0183, + "step": 8747 + }, + { + "epoch": 3.8854097268487675, + "grad_norm": 0.42945931063185067, + "learning_rate": 2.4607724261130893e-08, + "loss": 0.0249, + "step": 8748 + }, + { + "epoch": 3.885853875194315, + "grad_norm": 0.3845905954635827, + "learning_rate": 2.441600821131096e-08, + "loss": 0.0219, + "step": 8749 + }, + { + "epoch": 3.8862980235398625, + "grad_norm": 0.3836148595852599, + "learning_rate": 2.422504007628501e-08, + "loss": 0.0207, + "step": 8750 + }, + { + "epoch": 3.88674217188541, + "grad_norm": 0.3404491277084874, + "learning_rate": 2.4034819884759532e-08, + "loss": 0.0198, + "step": 8751 + }, + { + "epoch": 3.887186320230957, + "grad_norm": 0.3619930083510835, + "learning_rate": 2.3845347665327202e-08, + "loss": 0.0191, + "step": 8752 + }, + { + "epoch": 3.8876304685765044, + "grad_norm": 0.4286874704038195, + "learning_rate": 2.3656623446469684e-08, + "loss": 0.0299, + "step": 8753 + }, + { + "epoch": 3.8880746169220517, + "grad_norm": 0.3418186033416821, + "learning_rate": 2.3468647256554845e-08, + "loss": 0.0201, + "step": 8754 + }, + { + "epoch": 3.8885187652675994, + "grad_norm": 0.4273211578376309, + "learning_rate": 2.3281419123838966e-08, + "loss": 0.0254, + "step": 8755 + }, + { + "epoch": 3.8889629136131467, + "grad_norm": 0.46117222518267137, + "learning_rate": 2.3094939076465095e-08, + "loss": 0.0235, + "step": 8756 + }, + { + "epoch": 3.8894070619586945, + "grad_norm": 0.30961316233144026, + "learning_rate": 2.2909207142464695e-08, + "loss": 0.0178, + "step": 8757 + }, + { + "epoch": 3.8898512103042417, + "grad_norm": 0.4313089227592287, + "learning_rate": 2.2724223349756547e-08, + "loss": 0.0334, + "step": 8758 + }, + { + "epoch": 3.890295358649789, + "grad_norm": 0.45892558729550764, + "learning_rate": 2.253998772614674e-08, + "loss": 0.0358, + "step": 8759 + }, + { + "epoch": 3.8907395069953363, + "grad_norm": 0.4320146677307238, + "learning_rate": 2.235650029932923e-08, + "loss": 0.0275, + "step": 8760 + }, + { + "epoch": 3.8911836553408836, + "grad_norm": 0.47469416867571373, + "learning_rate": 2.2173761096884737e-08, + "loss": 0.0313, + "step": 8761 + }, + { + "epoch": 3.8916278036864314, + "grad_norm": 0.38809247080958487, + "learning_rate": 2.1991770146282953e-08, + "loss": 0.0254, + "step": 8762 + }, + { + "epoch": 3.8920719520319786, + "grad_norm": 0.36883556562596115, + "learning_rate": 2.181052747487922e-08, + "loss": 0.0223, + "step": 8763 + }, + { + "epoch": 3.892516100377526, + "grad_norm": 0.4140272360800753, + "learning_rate": 2.1630033109918403e-08, + "loss": 0.0305, + "step": 8764 + }, + { + "epoch": 3.8929602487230737, + "grad_norm": 0.4360012174993987, + "learning_rate": 2.1450287078531028e-08, + "loss": 0.0247, + "step": 8765 + }, + { + "epoch": 3.893404397068621, + "grad_norm": 0.46749106481514396, + "learning_rate": 2.127128940773604e-08, + "loss": 0.0336, + "step": 8766 + }, + { + "epoch": 3.8938485454141682, + "grad_norm": 0.48628664662774723, + "learning_rate": 2.1093040124440246e-08, + "loss": 0.0364, + "step": 8767 + }, + { + "epoch": 3.8942926937597155, + "grad_norm": 0.4655918973858269, + "learning_rate": 2.091553925543721e-08, + "loss": 0.029, + "step": 8768 + }, + { + "epoch": 3.8947368421052633, + "grad_norm": 0.4385695873892939, + "learning_rate": 2.073878682740893e-08, + "loss": 0.0332, + "step": 8769 + }, + { + "epoch": 3.8951809904508106, + "grad_norm": 0.42920683680763916, + "learning_rate": 2.056278286692359e-08, + "loss": 0.0234, + "step": 8770 + }, + { + "epoch": 3.895625138796358, + "grad_norm": 0.44077294633964265, + "learning_rate": 2.0387527400437812e-08, + "loss": 0.0171, + "step": 8771 + }, + { + "epoch": 3.8960692871419056, + "grad_norm": 0.4448703366882169, + "learning_rate": 2.0213020454295517e-08, + "loss": 0.0241, + "step": 8772 + }, + { + "epoch": 3.896513435487453, + "grad_norm": 0.3962409747625572, + "learning_rate": 2.003926205472795e-08, + "loss": 0.0303, + "step": 8773 + }, + { + "epoch": 3.896957583833, + "grad_norm": 0.45182968264253826, + "learning_rate": 1.986625222785421e-08, + "loss": 0.0517, + "step": 8774 + }, + { + "epoch": 3.8974017321785475, + "grad_norm": 0.3768583215614216, + "learning_rate": 1.9693990999680167e-08, + "loss": 0.0243, + "step": 8775 + }, + { + "epoch": 3.897845880524095, + "grad_norm": 0.37480588059892506, + "learning_rate": 1.952247839610011e-08, + "loss": 0.0232, + "step": 8776 + }, + { + "epoch": 3.8982900288696425, + "grad_norm": 0.43502480214398537, + "learning_rate": 1.9351714442895077e-08, + "loss": 0.0243, + "step": 8777 + }, + { + "epoch": 3.8987341772151898, + "grad_norm": 0.33331146266153816, + "learning_rate": 1.918169916573398e-08, + "loss": 0.0223, + "step": 8778 + }, + { + "epoch": 3.8991783255607375, + "grad_norm": 0.3283297894096133, + "learning_rate": 1.9012432590172493e-08, + "loss": 0.0182, + "step": 8779 + }, + { + "epoch": 3.899622473906285, + "grad_norm": 0.4613996841914982, + "learning_rate": 1.8843914741654146e-08, + "loss": 0.0264, + "step": 8780 + }, + { + "epoch": 3.900066622251832, + "grad_norm": 0.3728645160784641, + "learning_rate": 1.8676145645511456e-08, + "loss": 0.0219, + "step": 8781 + }, + { + "epoch": 3.9005107705973794, + "grad_norm": 0.367364355087082, + "learning_rate": 1.850912532696092e-08, + "loss": 0.0169, + "step": 8782 + }, + { + "epoch": 3.9009549189429267, + "grad_norm": 0.43782054130287257, + "learning_rate": 1.8342853811110227e-08, + "loss": 0.0244, + "step": 8783 + }, + { + "epoch": 3.9013990672884744, + "grad_norm": 0.46621513501531153, + "learning_rate": 1.817733112295217e-08, + "loss": 0.0237, + "step": 8784 + }, + { + "epoch": 3.9018432156340217, + "grad_norm": 0.3948393185866851, + "learning_rate": 1.8012557287367394e-08, + "loss": 0.0197, + "step": 8785 + }, + { + "epoch": 3.9022873639795694, + "grad_norm": 0.5251739552887689, + "learning_rate": 1.7848532329124978e-08, + "loss": 0.0376, + "step": 8786 + }, + { + "epoch": 3.9027315123251167, + "grad_norm": 0.3262674897809267, + "learning_rate": 1.7685256272879646e-08, + "loss": 0.0189, + "step": 8787 + }, + { + "epoch": 3.903175660670664, + "grad_norm": 0.32400288553912415, + "learning_rate": 1.7522729143174545e-08, + "loss": 0.0185, + "step": 8788 + }, + { + "epoch": 3.9036198090162113, + "grad_norm": 0.36601512988828766, + "learning_rate": 1.7360950964441236e-08, + "loss": 0.0196, + "step": 8789 + }, + { + "epoch": 3.9040639573617586, + "grad_norm": 0.5081663386558348, + "learning_rate": 1.7199921760997494e-08, + "loss": 0.0312, + "step": 8790 + }, + { + "epoch": 3.9045081057073063, + "grad_norm": 0.717337078541349, + "learning_rate": 1.7039641557048402e-08, + "loss": 0.035, + "step": 8791 + }, + { + "epoch": 3.9049522540528536, + "grad_norm": 0.44298957493772145, + "learning_rate": 1.6880110376686353e-08, + "loss": 0.0368, + "step": 8792 + }, + { + "epoch": 3.905396402398401, + "grad_norm": 0.34392561304252184, + "learning_rate": 1.672132824389272e-08, + "loss": 0.0178, + "step": 8793 + }, + { + "epoch": 3.9058405507439486, + "grad_norm": 0.42295010360189583, + "learning_rate": 1.6563295182534524e-08, + "loss": 0.0251, + "step": 8794 + }, + { + "epoch": 3.906284699089496, + "grad_norm": 0.3790549559366509, + "learning_rate": 1.6406011216366647e-08, + "loss": 0.0235, + "step": 8795 + }, + { + "epoch": 3.9067288474350432, + "grad_norm": 0.41052130644928436, + "learning_rate": 1.6249476369031845e-08, + "loss": 0.0186, + "step": 8796 + }, + { + "epoch": 3.9071729957805905, + "grad_norm": 0.4367038611052701, + "learning_rate": 1.6093690664059635e-08, + "loss": 0.0339, + "step": 8797 + }, + { + "epoch": 3.9076171441261383, + "grad_norm": 0.37822957010835845, + "learning_rate": 1.5938654124867394e-08, + "loss": 0.0215, + "step": 8798 + }, + { + "epoch": 3.9080612924716855, + "grad_norm": 0.3875394125371318, + "learning_rate": 1.5784366774760362e-08, + "loss": 0.0217, + "step": 8799 + }, + { + "epoch": 3.908505440817233, + "grad_norm": 0.4043786874203682, + "learning_rate": 1.563082863692944e-08, + "loss": 0.0236, + "step": 8800 + }, + { + "epoch": 3.9089495891627806, + "grad_norm": 0.4592008857137564, + "learning_rate": 1.5478039734455053e-08, + "loss": 0.0229, + "step": 8801 + }, + { + "epoch": 3.909393737508328, + "grad_norm": 0.29589640326829375, + "learning_rate": 1.5326000090303272e-08, + "loss": 0.0162, + "step": 8802 + }, + { + "epoch": 3.909837885853875, + "grad_norm": 0.42576417588302456, + "learning_rate": 1.5174709727328595e-08, + "loss": 0.0359, + "step": 8803 + }, + { + "epoch": 3.9102820341994224, + "grad_norm": 0.4093180434069703, + "learning_rate": 1.5024168668272275e-08, + "loss": 0.0264, + "step": 8804 + }, + { + "epoch": 3.91072618254497, + "grad_norm": 0.46276973652514886, + "learning_rate": 1.4874376935763434e-08, + "loss": 0.0289, + "step": 8805 + }, + { + "epoch": 3.9111703308905175, + "grad_norm": 0.4688883489908628, + "learning_rate": 1.4725334552318504e-08, + "loss": 0.0181, + "step": 8806 + }, + { + "epoch": 3.9116144792360648, + "grad_norm": 0.3039807302776599, + "learning_rate": 1.4577041540340676e-08, + "loss": 0.015, + "step": 8807 + }, + { + "epoch": 3.9120586275816125, + "grad_norm": 0.3741536582807147, + "learning_rate": 1.442949792212045e-08, + "loss": 0.0235, + "step": 8808 + }, + { + "epoch": 3.91250277592716, + "grad_norm": 0.5513005564638584, + "learning_rate": 1.428270371983731e-08, + "loss": 0.0238, + "step": 8809 + }, + { + "epoch": 3.912946924272707, + "grad_norm": 0.37263633834750676, + "learning_rate": 1.4136658955556381e-08, + "loss": 0.0249, + "step": 8810 + }, + { + "epoch": 3.9133910726182544, + "grad_norm": 0.36432340872278923, + "learning_rate": 1.3991363651230106e-08, + "loss": 0.0207, + "step": 8811 + }, + { + "epoch": 3.9138352209638017, + "grad_norm": 0.49111074770917923, + "learning_rate": 1.38468178286999e-08, + "loss": 0.0296, + "step": 8812 + }, + { + "epoch": 3.9142793693093494, + "grad_norm": 0.36763184203730237, + "learning_rate": 1.3703021509692827e-08, + "loss": 0.0262, + "step": 8813 + }, + { + "epoch": 3.9147235176548967, + "grad_norm": 0.4095976827275787, + "learning_rate": 1.3559974715823266e-08, + "loss": 0.0297, + "step": 8814 + }, + { + "epoch": 3.9151676660004444, + "grad_norm": 0.4248033682883231, + "learning_rate": 1.3417677468595125e-08, + "loss": 0.025, + "step": 8815 + }, + { + "epoch": 3.9156118143459917, + "grad_norm": 0.4638888518779016, + "learning_rate": 1.3276129789397407e-08, + "loss": 0.0225, + "step": 8816 + }, + { + "epoch": 3.916055962691539, + "grad_norm": 0.5594291449958435, + "learning_rate": 1.3135331699506426e-08, + "loss": 0.0285, + "step": 8817 + }, + { + "epoch": 3.9165001110370863, + "grad_norm": 0.5046033957125147, + "learning_rate": 1.2995283220087473e-08, + "loss": 0.0329, + "step": 8818 + }, + { + "epoch": 3.9169442593826336, + "grad_norm": 0.4673687167446593, + "learning_rate": 1.2855984372191488e-08, + "loss": 0.0256, + "step": 8819 + }, + { + "epoch": 3.9173884077281813, + "grad_norm": 0.37484792767681735, + "learning_rate": 1.2717435176758386e-08, + "loss": 0.0271, + "step": 8820 + }, + { + "epoch": 3.9178325560737286, + "grad_norm": 0.5235629211435013, + "learning_rate": 1.2579635654613176e-08, + "loss": 0.0353, + "step": 8821 + }, + { + "epoch": 3.918276704419276, + "grad_norm": 0.37938171555742584, + "learning_rate": 1.24425858264704e-08, + "loss": 0.0236, + "step": 8822 + }, + { + "epoch": 3.9187208527648236, + "grad_norm": 0.31241390505019695, + "learning_rate": 1.2306285712931354e-08, + "loss": 0.0175, + "step": 8823 + }, + { + "epoch": 3.919165001110371, + "grad_norm": 0.3890372627975678, + "learning_rate": 1.2170735334482986e-08, + "loss": 0.0323, + "step": 8824 + }, + { + "epoch": 3.919609149455918, + "grad_norm": 0.5060461363110739, + "learning_rate": 1.2035934711501773e-08, + "loss": 0.0213, + "step": 8825 + }, + { + "epoch": 3.9200532978014655, + "grad_norm": 0.4210215040945434, + "learning_rate": 1.1901883864250396e-08, + "loss": 0.0201, + "step": 8826 + }, + { + "epoch": 3.9204974461470132, + "grad_norm": 0.3987058322406445, + "learning_rate": 1.1768582812878848e-08, + "loss": 0.0265, + "step": 8827 + }, + { + "epoch": 3.9209415944925605, + "grad_norm": 0.3908166480296464, + "learning_rate": 1.1636031577424434e-08, + "loss": 0.0204, + "step": 8828 + }, + { + "epoch": 3.921385742838108, + "grad_norm": 0.4484182047406765, + "learning_rate": 1.150423017781177e-08, + "loss": 0.0262, + "step": 8829 + }, + { + "epoch": 3.9218298911836555, + "grad_norm": 0.39560743278914223, + "learning_rate": 1.1373178633853344e-08, + "loss": 0.0301, + "step": 8830 + }, + { + "epoch": 3.922274039529203, + "grad_norm": 0.44427517578613046, + "learning_rate": 1.124287696524784e-08, + "loss": 0.033, + "step": 8831 + }, + { + "epoch": 3.92271818787475, + "grad_norm": 0.48713519459485133, + "learning_rate": 1.111332519158237e-08, + "loss": 0.0293, + "step": 8832 + }, + { + "epoch": 3.9231623362202974, + "grad_norm": 0.38920760124214965, + "learning_rate": 1.0984523332330244e-08, + "loss": 0.0313, + "step": 8833 + }, + { + "epoch": 3.923606484565845, + "grad_norm": 0.39034349333664137, + "learning_rate": 1.0856471406852642e-08, + "loss": 0.0227, + "step": 8834 + }, + { + "epoch": 3.9240506329113924, + "grad_norm": 0.5491500358691043, + "learning_rate": 1.0729169434398613e-08, + "loss": 0.046, + "step": 8835 + }, + { + "epoch": 3.9244947812569397, + "grad_norm": 0.38151095276050573, + "learning_rate": 1.0602617434102846e-08, + "loss": 0.0248, + "step": 8836 + }, + { + "epoch": 3.9249389296024875, + "grad_norm": 0.4149228267505079, + "learning_rate": 1.0476815424989018e-08, + "loss": 0.0212, + "step": 8837 + }, + { + "epoch": 3.9253830779480348, + "grad_norm": 0.4455456792947203, + "learning_rate": 1.0351763425966999e-08, + "loss": 0.0276, + "step": 8838 + }, + { + "epoch": 3.925827226293582, + "grad_norm": 0.38613196310803816, + "learning_rate": 1.022746145583453e-08, + "loss": 0.0224, + "step": 8839 + }, + { + "epoch": 3.9262713746391293, + "grad_norm": 0.4192361066154168, + "learning_rate": 1.0103909533275557e-08, + "loss": 0.0299, + "step": 8840 + }, + { + "epoch": 3.9267155229846766, + "grad_norm": 0.6267500325799473, + "learning_rate": 9.981107676862444e-09, + "loss": 0.0255, + "step": 8841 + }, + { + "epoch": 3.9271596713302244, + "grad_norm": 0.43799023094491873, + "learning_rate": 9.859055905054871e-09, + "loss": 0.0318, + "step": 8842 + }, + { + "epoch": 3.9276038196757717, + "grad_norm": 0.40925366502801463, + "learning_rate": 9.737754236198716e-09, + "loss": 0.0237, + "step": 8843 + }, + { + "epoch": 3.9280479680213194, + "grad_norm": 0.34421339563935327, + "learning_rate": 9.617202688527727e-09, + "loss": 0.02, + "step": 8844 + }, + { + "epoch": 3.9284921163668667, + "grad_norm": 0.36452089076221383, + "learning_rate": 9.497401280162966e-09, + "loss": 0.0272, + "step": 8845 + }, + { + "epoch": 3.928936264712414, + "grad_norm": 0.37821446185307267, + "learning_rate": 9.378350029112248e-09, + "loss": 0.0187, + "step": 8846 + }, + { + "epoch": 3.9293804130579613, + "grad_norm": 0.4769435842108248, + "learning_rate": 9.260048953271817e-09, + "loss": 0.0304, + "step": 8847 + }, + { + "epoch": 3.9298245614035086, + "grad_norm": 0.42416826190385143, + "learning_rate": 9.142498070424111e-09, + "loss": 0.0252, + "step": 8848 + }, + { + "epoch": 3.9302687097490563, + "grad_norm": 0.4780719986413718, + "learning_rate": 9.02569739823833e-09, + "loss": 0.0243, + "step": 8849 + }, + { + "epoch": 3.9307128580946036, + "grad_norm": 0.5064893232422892, + "learning_rate": 8.9096469542721e-09, + "loss": 0.029, + "step": 8850 + }, + { + "epoch": 3.931157006440151, + "grad_norm": 0.472928807421496, + "learning_rate": 8.794346755969795e-09, + "loss": 0.0234, + "step": 8851 + }, + { + "epoch": 3.9316011547856986, + "grad_norm": 0.40013223823456623, + "learning_rate": 8.679796820663111e-09, + "loss": 0.0344, + "step": 8852 + }, + { + "epoch": 3.932045303131246, + "grad_norm": 0.40854096697741643, + "learning_rate": 8.565997165570494e-09, + "loss": 0.0317, + "step": 8853 + }, + { + "epoch": 3.932489451476793, + "grad_norm": 0.6774113697293435, + "learning_rate": 8.452947807798261e-09, + "loss": 0.0265, + "step": 8854 + }, + { + "epoch": 3.9329335998223405, + "grad_norm": 0.47756331700457766, + "learning_rate": 8.340648764339487e-09, + "loss": 0.0245, + "step": 8855 + }, + { + "epoch": 3.933377748167888, + "grad_norm": 0.408015599055906, + "learning_rate": 8.229100052074557e-09, + "loss": 0.0266, + "step": 8856 + }, + { + "epoch": 3.9338218965134355, + "grad_norm": 0.3723159509813085, + "learning_rate": 8.118301687771169e-09, + "loss": 0.0244, + "step": 8857 + }, + { + "epoch": 3.934266044858983, + "grad_norm": 0.44846859119809684, + "learning_rate": 8.008253688084888e-09, + "loss": 0.0199, + "step": 8858 + }, + { + "epoch": 3.9347101932045305, + "grad_norm": 0.3532724419502754, + "learning_rate": 7.898956069556375e-09, + "loss": 0.0167, + "step": 8859 + }, + { + "epoch": 3.935154341550078, + "grad_norm": 0.38377139505288993, + "learning_rate": 7.790408848616371e-09, + "loss": 0.025, + "step": 8860 + }, + { + "epoch": 3.935598489895625, + "grad_norm": 0.47706384193903933, + "learning_rate": 7.682612041580161e-09, + "loss": 0.0298, + "step": 8861 + }, + { + "epoch": 3.9360426382411724, + "grad_norm": 0.47799572814835295, + "learning_rate": 7.575565664652562e-09, + "loss": 0.0232, + "step": 8862 + }, + { + "epoch": 3.9364867865867197, + "grad_norm": 0.42569464791407546, + "learning_rate": 7.469269733923478e-09, + "loss": 0.0284, + "step": 8863 + }, + { + "epoch": 3.9369309349322674, + "grad_norm": 0.49128556583785543, + "learning_rate": 7.363724265371796e-09, + "loss": 0.0322, + "step": 8864 + }, + { + "epoch": 3.9373750832778147, + "grad_norm": 0.4426824222148885, + "learning_rate": 7.258929274862048e-09, + "loss": 0.0244, + "step": 8865 + }, + { + "epoch": 3.9378192316233624, + "grad_norm": 0.4220498782198706, + "learning_rate": 7.154884778147187e-09, + "loss": 0.0381, + "step": 8866 + }, + { + "epoch": 3.9382633799689097, + "grad_norm": 0.40573724780385634, + "learning_rate": 7.051590790866925e-09, + "loss": 0.0208, + "step": 8867 + }, + { + "epoch": 3.938707528314457, + "grad_norm": 0.36192582224558784, + "learning_rate": 6.949047328547731e-09, + "loss": 0.0186, + "step": 8868 + }, + { + "epoch": 3.9391516766600043, + "grad_norm": 0.44081181805446706, + "learning_rate": 6.847254406603943e-09, + "loss": 0.0305, + "step": 8869 + }, + { + "epoch": 3.9395958250055516, + "grad_norm": 0.47409619445972684, + "learning_rate": 6.746212040336653e-09, + "loss": 0.0276, + "step": 8870 + }, + { + "epoch": 3.9400399733510993, + "grad_norm": 0.3907822464278477, + "learning_rate": 6.645920244934267e-09, + "loss": 0.0265, + "step": 8871 + }, + { + "epoch": 3.9404841216966466, + "grad_norm": 0.3877502095250313, + "learning_rate": 6.546379035472505e-09, + "loss": 0.0196, + "step": 8872 + }, + { + "epoch": 3.9409282700421944, + "grad_norm": 0.36359890877779866, + "learning_rate": 6.447588426913287e-09, + "loss": 0.0207, + "step": 8873 + }, + { + "epoch": 3.9413724183877417, + "grad_norm": 0.4046244907134214, + "learning_rate": 6.349548434108066e-09, + "loss": 0.0245, + "step": 8874 + }, + { + "epoch": 3.941816566733289, + "grad_norm": 0.30688221794365883, + "learning_rate": 6.252259071792277e-09, + "loss": 0.0252, + "step": 8875 + }, + { + "epoch": 3.9422607150788362, + "grad_norm": 0.3706992283845886, + "learning_rate": 6.155720354590888e-09, + "loss": 0.0179, + "step": 8876 + }, + { + "epoch": 3.9427048634243835, + "grad_norm": 0.3606911516685271, + "learning_rate": 6.059932297015625e-09, + "loss": 0.0155, + "step": 8877 + }, + { + "epoch": 3.9431490117699313, + "grad_norm": 0.3448866184147424, + "learning_rate": 5.964894913464969e-09, + "loss": 0.0238, + "step": 8878 + }, + { + "epoch": 3.9435931601154786, + "grad_norm": 0.42743435978316485, + "learning_rate": 5.8706082182241605e-09, + "loss": 0.019, + "step": 8879 + }, + { + "epoch": 3.944037308461026, + "grad_norm": 0.38374298902655224, + "learning_rate": 5.777072225466307e-09, + "loss": 0.0232, + "step": 8880 + }, + { + "epoch": 3.9444814568065736, + "grad_norm": 0.3737396272194393, + "learning_rate": 5.684286949251272e-09, + "loss": 0.0218, + "step": 8881 + }, + { + "epoch": 3.944925605152121, + "grad_norm": 0.3717422178901769, + "learning_rate": 5.592252403526788e-09, + "loss": 0.018, + "step": 8882 + }, + { + "epoch": 3.945369753497668, + "grad_norm": 0.4530028394270273, + "learning_rate": 5.500968602126788e-09, + "loss": 0.0311, + "step": 8883 + }, + { + "epoch": 3.9458139018432155, + "grad_norm": 0.34242197949439085, + "learning_rate": 5.410435558773075e-09, + "loss": 0.0229, + "step": 8884 + }, + { + "epoch": 3.946258050188763, + "grad_norm": 0.39126976030692906, + "learning_rate": 5.3206532870742065e-09, + "loss": 0.0253, + "step": 8885 + }, + { + "epoch": 3.9467021985343105, + "grad_norm": 0.43259511059270805, + "learning_rate": 5.231621800525499e-09, + "loss": 0.0222, + "step": 8886 + }, + { + "epoch": 3.9471463468798578, + "grad_norm": 0.5231951089873627, + "learning_rate": 5.143341112510691e-09, + "loss": 0.0432, + "step": 8887 + }, + { + "epoch": 3.9475904952254055, + "grad_norm": 0.4699803462882175, + "learning_rate": 5.055811236299724e-09, + "loss": 0.0242, + "step": 8888 + }, + { + "epoch": 3.948034643570953, + "grad_norm": 0.4043702599111712, + "learning_rate": 4.969032185049294e-09, + "loss": 0.0367, + "step": 8889 + }, + { + "epoch": 3.9484787919165, + "grad_norm": 0.41944855374085116, + "learning_rate": 4.883003971803968e-09, + "loss": 0.0236, + "step": 8890 + }, + { + "epoch": 3.9489229402620474, + "grad_norm": 0.40457657146672565, + "learning_rate": 4.797726609495623e-09, + "loss": 0.0188, + "step": 8891 + }, + { + "epoch": 3.9493670886075947, + "grad_norm": 0.4052288455507649, + "learning_rate": 4.7132001109423396e-09, + "loss": 0.0266, + "step": 8892 + }, + { + "epoch": 3.9498112369531424, + "grad_norm": 0.5765436543403538, + "learning_rate": 4.629424488850065e-09, + "loss": 0.0362, + "step": 8893 + }, + { + "epoch": 3.9502553852986897, + "grad_norm": 0.4676308344783614, + "learning_rate": 4.546399755812059e-09, + "loss": 0.0249, + "step": 8894 + }, + { + "epoch": 3.9506995336442374, + "grad_norm": 0.28808412670077804, + "learning_rate": 4.4641259243077825e-09, + "loss": 0.0137, + "step": 8895 + }, + { + "epoch": 3.9511436819897847, + "grad_norm": 0.47058698890390266, + "learning_rate": 4.382603006705121e-09, + "loss": 0.0319, + "step": 8896 + }, + { + "epoch": 3.951587830335332, + "grad_norm": 0.41572017055736116, + "learning_rate": 4.301831015257607e-09, + "loss": 0.0221, + "step": 8897 + }, + { + "epoch": 3.9520319786808793, + "grad_norm": 0.39457081647577197, + "learning_rate": 4.221809962107193e-09, + "loss": 0.0248, + "step": 8898 + }, + { + "epoch": 3.9524761270264266, + "grad_norm": 0.5337090298271753, + "learning_rate": 4.142539859282035e-09, + "loss": 0.036, + "step": 8899 + }, + { + "epoch": 3.9529202753719743, + "grad_norm": 0.4292452210463301, + "learning_rate": 4.064020718698158e-09, + "loss": 0.0276, + "step": 8900 + }, + { + "epoch": 3.9533644237175216, + "grad_norm": 0.35441583703246865, + "learning_rate": 3.986252552157788e-09, + "loss": 0.0228, + "step": 8901 + }, + { + "epoch": 3.953808572063069, + "grad_norm": 0.4031250072105655, + "learning_rate": 3.909235371351017e-09, + "loss": 0.028, + "step": 8902 + }, + { + "epoch": 3.9542527204086166, + "grad_norm": 0.32842995453695917, + "learning_rate": 3.832969187855251e-09, + "loss": 0.0244, + "step": 8903 + }, + { + "epoch": 3.954696868754164, + "grad_norm": 0.4192351275506548, + "learning_rate": 3.757454013134099e-09, + "loss": 0.0215, + "step": 8904 + }, + { + "epoch": 3.955141017099711, + "grad_norm": 0.4517856702968858, + "learning_rate": 3.682689858539035e-09, + "loss": 0.0308, + "step": 8905 + }, + { + "epoch": 3.9555851654452585, + "grad_norm": 0.4000805272591098, + "learning_rate": 3.608676735308292e-09, + "loss": 0.0257, + "step": 8906 + }, + { + "epoch": 3.9560293137908062, + "grad_norm": 0.39861614075630786, + "learning_rate": 3.5354146545668597e-09, + "loss": 0.0268, + "step": 8907 + }, + { + "epoch": 3.9564734621363535, + "grad_norm": 0.3761331007710691, + "learning_rate": 3.462903627328151e-09, + "loss": 0.0212, + "step": 8908 + }, + { + "epoch": 3.956917610481901, + "grad_norm": 0.552448221226994, + "learning_rate": 3.3911436644912256e-09, + "loss": 0.0272, + "step": 8909 + }, + { + "epoch": 3.9573617588274486, + "grad_norm": 0.40174202534071046, + "learning_rate": 3.3201347768430093e-09, + "loss": 0.0289, + "step": 8910 + }, + { + "epoch": 3.957805907172996, + "grad_norm": 0.3916127999245288, + "learning_rate": 3.249876975057187e-09, + "loss": 0.0251, + "step": 8911 + }, + { + "epoch": 3.958250055518543, + "grad_norm": 0.40161063795102814, + "learning_rate": 3.1803702696947547e-09, + "loss": 0.0201, + "step": 8912 + }, + { + "epoch": 3.9586942038640904, + "grad_norm": 0.4175398836504673, + "learning_rate": 3.111614671204022e-09, + "loss": 0.0236, + "step": 8913 + }, + { + "epoch": 3.959138352209638, + "grad_norm": 0.5695134220357339, + "learning_rate": 3.043610189919499e-09, + "loss": 0.0289, + "step": 8914 + }, + { + "epoch": 3.9595825005551855, + "grad_norm": 0.47624104829660213, + "learning_rate": 2.97635683606412e-09, + "loss": 0.0319, + "step": 8915 + }, + { + "epoch": 3.9600266489007327, + "grad_norm": 0.4611136348425102, + "learning_rate": 2.909854619747021e-09, + "loss": 0.0207, + "step": 8916 + }, + { + "epoch": 3.9604707972462805, + "grad_norm": 0.49713796334737675, + "learning_rate": 2.8441035509640947e-09, + "loss": 0.0273, + "step": 8917 + }, + { + "epoch": 3.9609149455918278, + "grad_norm": 0.5029210915242077, + "learning_rate": 2.7791036395996563e-09, + "loss": 0.0413, + "step": 8918 + }, + { + "epoch": 3.961359093937375, + "grad_norm": 0.4809399615159702, + "learning_rate": 2.7148548954236687e-09, + "loss": 0.0326, + "step": 8919 + }, + { + "epoch": 3.9618032422829224, + "grad_norm": 0.4107926474040777, + "learning_rate": 2.6513573280939618e-09, + "loss": 0.0243, + "step": 8920 + }, + { + "epoch": 3.9622473906284696, + "grad_norm": 0.39038109738475957, + "learning_rate": 2.5886109471551233e-09, + "loss": 0.0251, + "step": 8921 + }, + { + "epoch": 3.9626915389740174, + "grad_norm": 0.328940131759137, + "learning_rate": 2.526615762039608e-09, + "loss": 0.0163, + "step": 8922 + }, + { + "epoch": 3.9631356873195647, + "grad_norm": 0.4003085651392341, + "learning_rate": 2.465371782066073e-09, + "loss": 0.0236, + "step": 8923 + }, + { + "epoch": 3.9635798356651124, + "grad_norm": 0.4454242880030863, + "learning_rate": 2.4048790164404866e-09, + "loss": 0.0232, + "step": 8924 + }, + { + "epoch": 3.9640239840106597, + "grad_norm": 0.34629533945323293, + "learning_rate": 2.3451374742555764e-09, + "loss": 0.0177, + "step": 8925 + }, + { + "epoch": 3.964468132356207, + "grad_norm": 0.4133618406412272, + "learning_rate": 2.2861471644919363e-09, + "loss": 0.0191, + "step": 8926 + }, + { + "epoch": 3.9649122807017543, + "grad_norm": 0.42649199370055857, + "learning_rate": 2.2279080960163625e-09, + "loss": 0.027, + "step": 8927 + }, + { + "epoch": 3.9653564290473016, + "grad_norm": 0.3956675634414063, + "learning_rate": 2.170420277584073e-09, + "loss": 0.0263, + "step": 8928 + }, + { + "epoch": 3.9658005773928493, + "grad_norm": 0.4346162397069076, + "learning_rate": 2.1136837178353797e-09, + "loss": 0.0258, + "step": 8929 + }, + { + "epoch": 3.9662447257383966, + "grad_norm": 0.4508730863193407, + "learning_rate": 2.05769842529957e-09, + "loss": 0.0355, + "step": 8930 + }, + { + "epoch": 3.966688874083944, + "grad_norm": 0.44677554596928787, + "learning_rate": 2.002464408392135e-09, + "loss": 0.0245, + "step": 8931 + }, + { + "epoch": 3.9671330224294916, + "grad_norm": 0.3494473501914632, + "learning_rate": 1.9479816754147672e-09, + "loss": 0.0231, + "step": 8932 + }, + { + "epoch": 3.967577170775039, + "grad_norm": 0.3733325402809653, + "learning_rate": 1.894250234558137e-09, + "loss": 0.0177, + "step": 8933 + }, + { + "epoch": 3.968021319120586, + "grad_norm": 0.42732301014072877, + "learning_rate": 1.8412700938985618e-09, + "loss": 0.0245, + "step": 8934 + }, + { + "epoch": 3.9684654674661335, + "grad_norm": 0.5850533107887496, + "learning_rate": 1.7890412614002262e-09, + "loss": 0.0173, + "step": 8935 + }, + { + "epoch": 3.9689096158116812, + "grad_norm": 0.42327533255156274, + "learning_rate": 1.7375637449135174e-09, + "loss": 0.0217, + "step": 8936 + }, + { + "epoch": 3.9693537641572285, + "grad_norm": 0.45340718199873464, + "learning_rate": 1.68683755217669e-09, + "loss": 0.0329, + "step": 8937 + }, + { + "epoch": 3.969797912502776, + "grad_norm": 0.43589126986170634, + "learning_rate": 1.6368626908147556e-09, + "loss": 0.0294, + "step": 8938 + }, + { + "epoch": 3.9702420608483235, + "grad_norm": 0.37544409700894205, + "learning_rate": 1.5876391683400383e-09, + "loss": 0.0356, + "step": 8939 + }, + { + "epoch": 3.970686209193871, + "grad_norm": 0.3711146055897954, + "learning_rate": 1.5391669921505093e-09, + "loss": 0.0304, + "step": 8940 + }, + { + "epoch": 3.971130357539418, + "grad_norm": 0.40437917019760156, + "learning_rate": 1.4914461695336723e-09, + "loss": 0.0186, + "step": 8941 + }, + { + "epoch": 3.9715745058849654, + "grad_norm": 0.36049022928808616, + "learning_rate": 1.4444767076626787e-09, + "loss": 0.0221, + "step": 8942 + }, + { + "epoch": 3.972018654230513, + "grad_norm": 0.49140822525474664, + "learning_rate": 1.3982586135968813e-09, + "loss": 0.0267, + "step": 8943 + }, + { + "epoch": 3.9724628025760604, + "grad_norm": 0.5302909084020938, + "learning_rate": 1.3527918942840556e-09, + "loss": 0.041, + "step": 8944 + }, + { + "epoch": 3.9729069509216077, + "grad_norm": 0.5476906455774254, + "learning_rate": 1.3080765565592902e-09, + "loss": 0.0273, + "step": 8945 + }, + { + "epoch": 3.9733510992671555, + "grad_norm": 0.3333489521628196, + "learning_rate": 1.2641126071433197e-09, + "loss": 0.0268, + "step": 8946 + }, + { + "epoch": 3.9737952476127028, + "grad_norm": 0.39064699714492984, + "learning_rate": 1.2209000526447469e-09, + "loss": 0.0204, + "step": 8947 + }, + { + "epoch": 3.97423939595825, + "grad_norm": 0.5569247363465006, + "learning_rate": 1.1784388995594864e-09, + "loss": 0.0321, + "step": 8948 + }, + { + "epoch": 3.9746835443037973, + "grad_norm": 0.4766119680419059, + "learning_rate": 1.1367291542702107e-09, + "loss": 0.0284, + "step": 8949 + }, + { + "epoch": 3.9751276926493446, + "grad_norm": 0.5034524006899211, + "learning_rate": 1.0957708230457942e-09, + "loss": 0.0271, + "step": 8950 + }, + { + "epoch": 3.9755718409948924, + "grad_norm": 0.3381689403401387, + "learning_rate": 1.0555639120440887e-09, + "loss": 0.0201, + "step": 8951 + }, + { + "epoch": 3.9760159893404396, + "grad_norm": 0.46151159178049117, + "learning_rate": 1.0161084273080378e-09, + "loss": 0.0296, + "step": 8952 + }, + { + "epoch": 3.9764601376859874, + "grad_norm": 0.4521959026876067, + "learning_rate": 9.774043747690087e-10, + "loss": 0.0311, + "step": 8953 + }, + { + "epoch": 3.9769042860315347, + "grad_norm": 0.48398453476246417, + "learning_rate": 9.394517602445697e-10, + "loss": 0.0309, + "step": 8954 + }, + { + "epoch": 3.977348434377082, + "grad_norm": 0.3856389581537802, + "learning_rate": 9.022505894396017e-10, + "loss": 0.0216, + "step": 8955 + }, + { + "epoch": 3.9777925827226293, + "grad_norm": 0.3450747091674961, + "learning_rate": 8.658008679462981e-10, + "loss": 0.0157, + "step": 8956 + }, + { + "epoch": 3.9782367310681765, + "grad_norm": 0.3798481275168741, + "learning_rate": 8.301026012436098e-10, + "loss": 0.0197, + "step": 8957 + }, + { + "epoch": 3.9786808794137243, + "grad_norm": 0.4175802111530846, + "learning_rate": 7.951557946972444e-10, + "loss": 0.0254, + "step": 8958 + }, + { + "epoch": 3.9791250277592716, + "grad_norm": 0.3948056525475575, + "learning_rate": 7.609604535613324e-10, + "loss": 0.0198, + "step": 8959 + }, + { + "epoch": 3.979569176104819, + "grad_norm": 0.5080884160142413, + "learning_rate": 7.275165829745412e-10, + "loss": 0.0204, + "step": 8960 + }, + { + "epoch": 3.9800133244503666, + "grad_norm": 0.3995310763869156, + "learning_rate": 6.948241879650708e-10, + "loss": 0.0296, + "step": 8961 + }, + { + "epoch": 3.980457472795914, + "grad_norm": 0.37430290093081664, + "learning_rate": 6.628832734467683e-10, + "loss": 0.0238, + "step": 8962 + }, + { + "epoch": 3.980901621141461, + "grad_norm": 0.43982737443124764, + "learning_rate": 6.316938442213483e-10, + "loss": 0.0209, + "step": 8963 + }, + { + "epoch": 3.9813457694870085, + "grad_norm": 0.3775182755482849, + "learning_rate": 6.012559049761723e-10, + "loss": 0.025, + "step": 8964 + }, + { + "epoch": 3.981789917832556, + "grad_norm": 0.39039801224690585, + "learning_rate": 5.715694602875799e-10, + "loss": 0.0207, + "step": 8965 + }, + { + "epoch": 3.9822340661781035, + "grad_norm": 0.4199762038802494, + "learning_rate": 5.426345146175571e-10, + "loss": 0.0191, + "step": 8966 + }, + { + "epoch": 3.982678214523651, + "grad_norm": 0.38333717972878917, + "learning_rate": 5.144510723154028e-10, + "loss": 0.0284, + "step": 8967 + }, + { + "epoch": 3.9831223628691985, + "grad_norm": 0.4104443185350045, + "learning_rate": 4.87019137617728e-10, + "loss": 0.0189, + "step": 8968 + }, + { + "epoch": 3.983566511214746, + "grad_norm": 0.5433078848500579, + "learning_rate": 4.60338714647901e-10, + "loss": 0.0302, + "step": 8969 + }, + { + "epoch": 3.984010659560293, + "grad_norm": 0.3909125068421461, + "learning_rate": 4.3440980741660254e-10, + "loss": 0.0238, + "step": 8970 + }, + { + "epoch": 3.9844548079058404, + "grad_norm": 0.41307839626778975, + "learning_rate": 4.092324198212705e-10, + "loss": 0.0235, + "step": 8971 + }, + { + "epoch": 3.984898956251388, + "grad_norm": 0.321331298647483, + "learning_rate": 3.848065556461e-10, + "loss": 0.0236, + "step": 8972 + }, + { + "epoch": 3.9853431045969354, + "grad_norm": 0.3312110185029968, + "learning_rate": 3.6113221856370896e-10, + "loss": 0.0219, + "step": 8973 + }, + { + "epoch": 3.9857872529424827, + "grad_norm": 0.4192103414685381, + "learning_rate": 3.382094121318069e-10, + "loss": 0.0266, + "step": 8974 + }, + { + "epoch": 3.9862314012880304, + "grad_norm": 0.35796707903705355, + "learning_rate": 3.160381397965262e-10, + "loss": 0.0177, + "step": 8975 + }, + { + "epoch": 3.9866755496335777, + "grad_norm": 0.3908253098336717, + "learning_rate": 2.946184048902012e-10, + "loss": 0.0235, + "step": 8976 + }, + { + "epoch": 3.987119697979125, + "grad_norm": 0.42457467356965617, + "learning_rate": 2.7395021063303385e-10, + "loss": 0.0397, + "step": 8977 + }, + { + "epoch": 3.9875638463246723, + "grad_norm": 0.4884888876304642, + "learning_rate": 2.540335601319832e-10, + "loss": 0.0228, + "step": 8978 + }, + { + "epoch": 3.9880079946702196, + "grad_norm": 0.45397167330183835, + "learning_rate": 2.348684563802106e-10, + "loss": 0.0215, + "step": 8979 + }, + { + "epoch": 3.9884521430157673, + "grad_norm": 0.48150914332233963, + "learning_rate": 2.1645490225929989e-10, + "loss": 0.0207, + "step": 8980 + }, + { + "epoch": 3.9888962913613146, + "grad_norm": 0.518549165106476, + "learning_rate": 1.9879290053592682e-10, + "loss": 0.0276, + "step": 8981 + }, + { + "epoch": 3.9893404397068624, + "grad_norm": 0.3673159296164483, + "learning_rate": 1.8188245386629998e-10, + "loss": 0.0188, + "step": 8982 + }, + { + "epoch": 3.9897845880524097, + "grad_norm": 0.34298096212254114, + "learning_rate": 1.657235647917199e-10, + "loss": 0.0169, + "step": 8983 + }, + { + "epoch": 3.990228736397957, + "grad_norm": 0.4555639884295353, + "learning_rate": 1.5031623574135456e-10, + "loss": 0.0288, + "step": 8984 + }, + { + "epoch": 3.9906728847435042, + "grad_norm": 0.3767875220229666, + "learning_rate": 1.3566046903057405e-10, + "loss": 0.0245, + "step": 8985 + }, + { + "epoch": 3.9911170330890515, + "grad_norm": 0.3435105743106647, + "learning_rate": 1.217562668631711e-10, + "loss": 0.0192, + "step": 8986 + }, + { + "epoch": 3.9915611814345993, + "grad_norm": 0.5264624768016674, + "learning_rate": 1.0860363132914053e-10, + "loss": 0.0215, + "step": 8987 + }, + { + "epoch": 3.9920053297801465, + "grad_norm": 0.4654928703557639, + "learning_rate": 9.620256440467934e-11, + "loss": 0.0253, + "step": 8988 + }, + { + "epoch": 3.992449478125694, + "grad_norm": 0.42855294671291005, + "learning_rate": 8.455306795496221e-11, + "loss": 0.0322, + "step": 8989 + }, + { + "epoch": 3.9928936264712416, + "grad_norm": 0.33395128737336627, + "learning_rate": 7.365514373081084e-11, + "loss": 0.0159, + "step": 8990 + }, + { + "epoch": 3.993337774816789, + "grad_norm": 0.7884624510448948, + "learning_rate": 6.35087933698042e-11, + "loss": 0.0429, + "step": 8991 + }, + { + "epoch": 3.993781923162336, + "grad_norm": 0.4471065505434232, + "learning_rate": 5.411401839738872e-11, + "loss": 0.032, + "step": 8992 + }, + { + "epoch": 3.9942260715078834, + "grad_norm": 0.3926302149338509, + "learning_rate": 4.547082022632321e-11, + "loss": 0.0269, + "step": 8993 + }, + { + "epoch": 3.994670219853431, + "grad_norm": 0.35432194327597677, + "learning_rate": 3.7579200155013483e-11, + "loss": 0.024, + "step": 8994 + }, + { + "epoch": 3.9951143681989785, + "grad_norm": 0.39044461483214893, + "learning_rate": 3.043915937028796e-11, + "loss": 0.025, + "step": 8995 + }, + { + "epoch": 3.9955585165445258, + "grad_norm": 0.3981648535488869, + "learning_rate": 2.4050698944622087e-11, + "loss": 0.0221, + "step": 8996 + }, + { + "epoch": 3.9960026648900735, + "grad_norm": 0.42669487275827683, + "learning_rate": 1.8413819839468993e-11, + "loss": 0.0231, + "step": 8997 + }, + { + "epoch": 3.996446813235621, + "grad_norm": 0.33538547874165714, + "learning_rate": 1.3528522901373741e-11, + "loss": 0.0253, + "step": 8998 + }, + { + "epoch": 3.996890961581168, + "grad_norm": 0.4935415446009399, + "learning_rate": 9.394808864748861e-12, + "loss": 0.0315, + "step": 8999 + }, + { + "epoch": 3.9973351099267154, + "grad_norm": 0.37018534448771545, + "learning_rate": 6.012678351319245e-12, + "loss": 0.0243, + "step": 9000 + }, + { + "epoch": 3.9977792582722627, + "grad_norm": 0.38372140221372397, + "learning_rate": 3.3821318690119286e-12, + "loss": 0.021, + "step": 9001 + }, + { + "epoch": 3.9982234066178104, + "grad_norm": 0.3907968845917238, + "learning_rate": 1.503169813621419e-12, + "loss": 0.0222, + "step": 9002 + }, + { + "epoch": 3.9986675549633577, + "grad_norm": 0.4131506866019968, + "learning_rate": 3.757924676994762e-13, + "loss": 0.0311, + "step": 9003 + }, + { + "epoch": 3.9991117033089054, + "grad_norm": 0.47497534756496, + "learning_rate": 0.0, + "loss": 0.0269, + "step": 9004 + }, + { + "epoch": 3.9991117033089054, + "eval_loss": 0.03634560480713844, + "eval_runtime": 402.6359, + "eval_samples_per_second": 37.667, + "eval_steps_per_second": 1.177, + "step": 9004 + }, + { + "epoch": 3.9991117033089054, + "step": 9004, + "total_flos": 1010387024977920.0, + "train_loss": 0.05092642861352228, + "train_runtime": 98184.5209, + "train_samples_per_second": 11.738, + "train_steps_per_second": 0.092 + } + ], + "logging_steps": 1, + "max_steps": 9004, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1010387024977920.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}