diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,54714 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.999936, + "eval_steps": 500, + "global_step": 7812, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 48.072471618652344, + "learning_rate": 2.1276595744680853e-06, + "loss": 9.3378, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 43.48775863647461, + "learning_rate": 4.255319148936171e-06, + "loss": 9.3274, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 43.6014518737793, + "learning_rate": 6.3829787234042555e-06, + "loss": 9.3945, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 44.38250732421875, + "learning_rate": 8.510638297872341e-06, + "loss": 9.3164, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 38.3822135925293, + "learning_rate": 1.0638297872340426e-05, + "loss": 9.0362, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 32.224544525146484, + "learning_rate": 1.2765957446808511e-05, + "loss": 9.0261, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 29.361282348632812, + "learning_rate": 1.4893617021276598e-05, + "loss": 8.3959, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 33.011817932128906, + "learning_rate": 1.7021276595744682e-05, + "loss": 8.6945, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 38.45726013183594, + "learning_rate": 1.9148936170212766e-05, + "loss": 8.4528, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 40.37697982788086, + "learning_rate": 2.1276595744680852e-05, + "loss": 7.6175, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 34.739898681640625, + "learning_rate": 2.3404255319148935e-05, + "loss": 7.2495, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 25.367467880249023, + "learning_rate": 2.5531914893617022e-05, + "loss": 7.1284, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 18.876768112182617, + "learning_rate": 2.7659574468085105e-05, + "loss": 6.8116, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 20.27662467956543, + "learning_rate": 2.9787234042553195e-05, + "loss": 6.7344, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 25.331371307373047, + "learning_rate": 3.1914893617021275e-05, + "loss": 6.4643, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 24.46682357788086, + "learning_rate": 3.4042553191489365e-05, + "loss": 6.318, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 14.2562837600708, + "learning_rate": 3.617021276595744e-05, + "loss": 5.8803, + "step": 17 + }, + { + "epoch": 0.0, + "grad_norm": 9.656604766845703, + "learning_rate": 3.829787234042553e-05, + "loss": 5.6743, + "step": 18 + }, + { + "epoch": 0.0, + "grad_norm": 8.968253135681152, + "learning_rate": 4.042553191489362e-05, + "loss": 5.4764, + "step": 19 + }, + { + "epoch": 0.0, + "grad_norm": 8.436161041259766, + "learning_rate": 4.2553191489361704e-05, + "loss": 5.4851, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 7.585102558135986, + "learning_rate": 4.468085106382979e-05, + "loss": 5.2845, + "step": 21 + }, + { + "epoch": 0.0, + "grad_norm": 5.892800807952881, + "learning_rate": 4.680851063829787e-05, + "loss": 5.1436, + "step": 22 + }, + { + "epoch": 0.0, + "grad_norm": 6.763721942901611, + "learning_rate": 4.893617021276596e-05, + "loss": 5.1792, + "step": 23 + }, + { + "epoch": 0.0, + "grad_norm": 4.559952735900879, + "learning_rate": 5.1063829787234044e-05, + "loss": 4.788, + "step": 24 + }, + { + "epoch": 0.0, + "grad_norm": 5.66323184967041, + "learning_rate": 5.319148936170213e-05, + "loss": 5.0929, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 4.006740093231201, + "learning_rate": 5.531914893617021e-05, + "loss": 5.1313, + "step": 26 + }, + { + "epoch": 0.0, + "grad_norm": 3.960521697998047, + "learning_rate": 5.74468085106383e-05, + "loss": 4.7744, + "step": 27 + }, + { + "epoch": 0.0, + "grad_norm": 3.6684303283691406, + "learning_rate": 5.957446808510639e-05, + "loss": 4.7888, + "step": 28 + }, + { + "epoch": 0.0, + "grad_norm": 3.18186616897583, + "learning_rate": 6.170212765957447e-05, + "loss": 4.7324, + "step": 29 + }, + { + "epoch": 0.0, + "grad_norm": 3.048969268798828, + "learning_rate": 6.382978723404255e-05, + "loss": 4.7644, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 2.6776039600372314, + "learning_rate": 6.595744680851063e-05, + "loss": 4.8169, + "step": 31 + }, + { + "epoch": 0.0, + "grad_norm": 2.373857021331787, + "learning_rate": 6.808510638297873e-05, + "loss": 4.7869, + "step": 32 + }, + { + "epoch": 0.0, + "grad_norm": 2.241915702819824, + "learning_rate": 7.021276595744681e-05, + "loss": 4.6614, + "step": 33 + }, + { + "epoch": 0.0, + "grad_norm": 2.0473105907440186, + "learning_rate": 7.234042553191488e-05, + "loss": 4.6913, + "step": 34 + }, + { + "epoch": 0.0, + "grad_norm": 1.8055814504623413, + "learning_rate": 7.446808510638298e-05, + "loss": 4.4816, + "step": 35 + }, + { + "epoch": 0.0, + "grad_norm": 1.7576919794082642, + "learning_rate": 7.659574468085106e-05, + "loss": 4.4663, + "step": 36 + }, + { + "epoch": 0.0, + "grad_norm": 1.6539785861968994, + "learning_rate": 7.872340425531916e-05, + "loss": 4.4183, + "step": 37 + }, + { + "epoch": 0.0, + "grad_norm": 1.6569417715072632, + "learning_rate": 8.085106382978724e-05, + "loss": 4.5726, + "step": 38 + }, + { + "epoch": 0.0, + "grad_norm": 1.4927167892456055, + "learning_rate": 8.297872340425531e-05, + "loss": 4.5231, + "step": 39 + }, + { + "epoch": 0.01, + "grad_norm": 1.593092679977417, + "learning_rate": 8.510638297872341e-05, + "loss": 4.4761, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 1.4695898294448853, + "learning_rate": 8.723404255319149e-05, + "loss": 4.309, + "step": 41 + }, + { + "epoch": 0.01, + "grad_norm": 1.461921215057373, + "learning_rate": 8.936170212765958e-05, + "loss": 4.591, + "step": 42 + }, + { + "epoch": 0.01, + "grad_norm": 1.3230671882629395, + "learning_rate": 9.148936170212766e-05, + "loss": 4.5867, + "step": 43 + }, + { + "epoch": 0.01, + "grad_norm": 1.3359841108322144, + "learning_rate": 9.361702127659574e-05, + "loss": 4.5312, + "step": 44 + }, + { + "epoch": 0.01, + "grad_norm": 1.1943374872207642, + "learning_rate": 9.574468085106382e-05, + "loss": 4.444, + "step": 45 + }, + { + "epoch": 0.01, + "grad_norm": 1.1472824811935425, + "learning_rate": 9.787234042553192e-05, + "loss": 4.5007, + "step": 46 + }, + { + "epoch": 0.01, + "grad_norm": 1.113065481185913, + "learning_rate": 0.0001, + "loss": 4.4621, + "step": 47 + }, + { + "epoch": 0.01, + "grad_norm": 1.2492742538452148, + "learning_rate": 0.00010212765957446809, + "loss": 4.5417, + "step": 48 + }, + { + "epoch": 0.01, + "grad_norm": 1.063219428062439, + "learning_rate": 0.00010425531914893617, + "loss": 4.3506, + "step": 49 + }, + { + "epoch": 0.01, + "grad_norm": 1.3006094694137573, + "learning_rate": 0.00010638297872340425, + "loss": 4.2705, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 1.0823702812194824, + "learning_rate": 0.00010851063829787235, + "loss": 4.5639, + "step": 51 + }, + { + "epoch": 0.01, + "grad_norm": 1.171970248222351, + "learning_rate": 0.00011063829787234042, + "loss": 4.3682, + "step": 52 + }, + { + "epoch": 0.01, + "grad_norm": 1.0258631706237793, + "learning_rate": 0.0001127659574468085, + "loss": 4.4626, + "step": 53 + }, + { + "epoch": 0.01, + "grad_norm": 0.9669986963272095, + "learning_rate": 0.0001148936170212766, + "loss": 4.5103, + "step": 54 + }, + { + "epoch": 0.01, + "grad_norm": 0.8417139649391174, + "learning_rate": 0.00011702127659574468, + "loss": 4.3414, + "step": 55 + }, + { + "epoch": 0.01, + "grad_norm": 0.9252743721008301, + "learning_rate": 0.00011914893617021278, + "loss": 4.4555, + "step": 56 + }, + { + "epoch": 0.01, + "grad_norm": 0.9143895506858826, + "learning_rate": 0.00012127659574468085, + "loss": 4.3624, + "step": 57 + }, + { + "epoch": 0.01, + "grad_norm": 0.8701967597007751, + "learning_rate": 0.00012340425531914893, + "loss": 4.3983, + "step": 58 + }, + { + "epoch": 0.01, + "grad_norm": 0.8114748001098633, + "learning_rate": 0.00012553191489361702, + "loss": 4.2508, + "step": 59 + }, + { + "epoch": 0.01, + "grad_norm": 0.831774115562439, + "learning_rate": 0.0001276595744680851, + "loss": 4.284, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 0.8641481399536133, + "learning_rate": 0.00012978723404255318, + "loss": 4.3339, + "step": 61 + }, + { + "epoch": 0.01, + "grad_norm": 0.7552957534790039, + "learning_rate": 0.00013191489361702127, + "loss": 4.3536, + "step": 62 + }, + { + "epoch": 0.01, + "grad_norm": 0.7908809781074524, + "learning_rate": 0.00013404255319148938, + "loss": 4.4432, + "step": 63 + }, + { + "epoch": 0.01, + "grad_norm": 0.7348148822784424, + "learning_rate": 0.00013617021276595746, + "loss": 4.369, + "step": 64 + }, + { + "epoch": 0.01, + "grad_norm": 0.8431515693664551, + "learning_rate": 0.00013829787234042554, + "loss": 4.4683, + "step": 65 + }, + { + "epoch": 0.01, + "grad_norm": 0.6354610919952393, + "learning_rate": 0.00014042553191489363, + "loss": 4.2249, + "step": 66 + }, + { + "epoch": 0.01, + "grad_norm": 0.7959691286087036, + "learning_rate": 0.0001425531914893617, + "loss": 4.4541, + "step": 67 + }, + { + "epoch": 0.01, + "grad_norm": 0.6812201738357544, + "learning_rate": 0.00014468085106382977, + "loss": 4.3104, + "step": 68 + }, + { + "epoch": 0.01, + "grad_norm": 0.6995226740837097, + "learning_rate": 0.00014680851063829788, + "loss": 4.2944, + "step": 69 + }, + { + "epoch": 0.01, + "grad_norm": 0.6939712166786194, + "learning_rate": 0.00014893617021276596, + "loss": 4.3758, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 0.6890800595283508, + "learning_rate": 0.00015106382978723404, + "loss": 4.4136, + "step": 71 + }, + { + "epoch": 0.01, + "grad_norm": 0.7452850937843323, + "learning_rate": 0.00015319148936170213, + "loss": 4.179, + "step": 72 + }, + { + "epoch": 0.01, + "grad_norm": 0.7114067077636719, + "learning_rate": 0.0001553191489361702, + "loss": 4.4111, + "step": 73 + }, + { + "epoch": 0.01, + "grad_norm": 0.6431289911270142, + "learning_rate": 0.00015744680851063832, + "loss": 4.3288, + "step": 74 + }, + { + "epoch": 0.01, + "grad_norm": 0.7086479067802429, + "learning_rate": 0.0001595744680851064, + "loss": 4.321, + "step": 75 + }, + { + "epoch": 0.01, + "grad_norm": 0.6875836849212646, + "learning_rate": 0.00016170212765957449, + "loss": 4.4143, + "step": 76 + }, + { + "epoch": 0.01, + "grad_norm": 0.697288990020752, + "learning_rate": 0.00016382978723404254, + "loss": 4.2854, + "step": 77 + }, + { + "epoch": 0.01, + "grad_norm": 0.6955673694610596, + "learning_rate": 0.00016595744680851062, + "loss": 4.3061, + "step": 78 + }, + { + "epoch": 0.01, + "grad_norm": 0.6468881368637085, + "learning_rate": 0.0001680851063829787, + "loss": 4.3326, + "step": 79 + }, + { + "epoch": 0.01, + "grad_norm": 0.678005039691925, + "learning_rate": 0.00017021276595744682, + "loss": 4.356, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 0.6774478554725647, + "learning_rate": 0.0001723404255319149, + "loss": 4.2089, + "step": 81 + }, + { + "epoch": 0.01, + "grad_norm": 0.7283062934875488, + "learning_rate": 0.00017446808510638298, + "loss": 4.3926, + "step": 82 + }, + { + "epoch": 0.01, + "grad_norm": 0.7186111807823181, + "learning_rate": 0.00017659574468085107, + "loss": 4.3649, + "step": 83 + }, + { + "epoch": 0.01, + "grad_norm": 0.6334117650985718, + "learning_rate": 0.00017872340425531915, + "loss": 4.3125, + "step": 84 + }, + { + "epoch": 0.01, + "grad_norm": 0.6184477806091309, + "learning_rate": 0.00018085106382978726, + "loss": 4.3224, + "step": 85 + }, + { + "epoch": 0.01, + "grad_norm": 0.6729727983474731, + "learning_rate": 0.00018297872340425532, + "loss": 4.2833, + "step": 86 + }, + { + "epoch": 0.01, + "grad_norm": 0.720475971698761, + "learning_rate": 0.0001851063829787234, + "loss": 4.3743, + "step": 87 + }, + { + "epoch": 0.01, + "grad_norm": 0.699727475643158, + "learning_rate": 0.00018723404255319148, + "loss": 4.3787, + "step": 88 + }, + { + "epoch": 0.01, + "grad_norm": 0.7167297005653381, + "learning_rate": 0.00018936170212765957, + "loss": 4.4618, + "step": 89 + }, + { + "epoch": 0.01, + "grad_norm": 0.7404178977012634, + "learning_rate": 0.00019148936170212765, + "loss": 4.4533, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 0.7304544448852539, + "learning_rate": 0.00019361702127659576, + "loss": 4.4291, + "step": 91 + }, + { + "epoch": 0.01, + "grad_norm": 0.7182232141494751, + "learning_rate": 0.00019574468085106384, + "loss": 4.3131, + "step": 92 + }, + { + "epoch": 0.01, + "grad_norm": 0.7111774682998657, + "learning_rate": 0.00019787234042553193, + "loss": 4.2267, + "step": 93 + }, + { + "epoch": 0.01, + "grad_norm": 0.7227694988250732, + "learning_rate": 0.0002, + "loss": 4.2252, + "step": 94 + }, + { + "epoch": 0.01, + "grad_norm": 0.6536330580711365, + "learning_rate": 0.00020212765957446807, + "loss": 4.2824, + "step": 95 + }, + { + "epoch": 0.01, + "grad_norm": 0.7753093242645264, + "learning_rate": 0.00020425531914893618, + "loss": 4.2619, + "step": 96 + }, + { + "epoch": 0.01, + "grad_norm": 0.6675902009010315, + "learning_rate": 0.00020638297872340426, + "loss": 4.1745, + "step": 97 + }, + { + "epoch": 0.01, + "grad_norm": 0.7651579976081848, + "learning_rate": 0.00020851063829787234, + "loss": 4.4243, + "step": 98 + }, + { + "epoch": 0.01, + "grad_norm": 0.6589416265487671, + "learning_rate": 0.00021063829787234043, + "loss": 4.2501, + "step": 99 + }, + { + "epoch": 0.01, + "grad_norm": 0.6965423822402954, + "learning_rate": 0.0002127659574468085, + "loss": 4.2279, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 0.7227066159248352, + "learning_rate": 0.0002148936170212766, + "loss": 4.3669, + "step": 101 + }, + { + "epoch": 0.01, + "grad_norm": 0.7057831287384033, + "learning_rate": 0.0002170212765957447, + "loss": 4.1575, + "step": 102 + }, + { + "epoch": 0.01, + "grad_norm": 0.7887274622917175, + "learning_rate": 0.00021914893617021279, + "loss": 4.2733, + "step": 103 + }, + { + "epoch": 0.01, + "grad_norm": 0.601609468460083, + "learning_rate": 0.00022127659574468084, + "loss": 4.0538, + "step": 104 + }, + { + "epoch": 0.01, + "grad_norm": 0.783682107925415, + "learning_rate": 0.00022340425531914892, + "loss": 4.2188, + "step": 105 + }, + { + "epoch": 0.01, + "grad_norm": 0.7353456616401672, + "learning_rate": 0.000225531914893617, + "loss": 4.318, + "step": 106 + }, + { + "epoch": 0.01, + "grad_norm": 0.7265023589134216, + "learning_rate": 0.00022765957446808512, + "loss": 4.1894, + "step": 107 + }, + { + "epoch": 0.01, + "grad_norm": 0.710670530796051, + "learning_rate": 0.0002297872340425532, + "loss": 4.2075, + "step": 108 + }, + { + "epoch": 0.01, + "grad_norm": 0.7732564806938171, + "learning_rate": 0.00023191489361702128, + "loss": 4.193, + "step": 109 + }, + { + "epoch": 0.01, + "grad_norm": 0.8331496715545654, + "learning_rate": 0.00023404255319148937, + "loss": 4.313, + "step": 110 + }, + { + "epoch": 0.01, + "grad_norm": 0.8698146939277649, + "learning_rate": 0.00023617021276595745, + "loss": 4.1215, + "step": 111 + }, + { + "epoch": 0.01, + "grad_norm": 0.7106320261955261, + "learning_rate": 0.00023829787234042556, + "loss": 4.1094, + "step": 112 + }, + { + "epoch": 0.01, + "grad_norm": 0.8148285150527954, + "learning_rate": 0.00024042553191489362, + "loss": 4.3917, + "step": 113 + }, + { + "epoch": 0.01, + "grad_norm": 0.8150539994239807, + "learning_rate": 0.0002425531914893617, + "loss": 4.2951, + "step": 114 + }, + { + "epoch": 0.01, + "grad_norm": 0.7719513773918152, + "learning_rate": 0.0002446808510638298, + "loss": 4.2279, + "step": 115 + }, + { + "epoch": 0.01, + "grad_norm": 0.9067126512527466, + "learning_rate": 0.00024680851063829787, + "loss": 4.2159, + "step": 116 + }, + { + "epoch": 0.01, + "grad_norm": 0.8653740286827087, + "learning_rate": 0.000248936170212766, + "loss": 4.3633, + "step": 117 + }, + { + "epoch": 0.02, + "grad_norm": 0.8690366744995117, + "learning_rate": 0.00025106382978723403, + "loss": 4.2861, + "step": 118 + }, + { + "epoch": 0.02, + "grad_norm": 0.8911768794059753, + "learning_rate": 0.0002531914893617021, + "loss": 4.2543, + "step": 119 + }, + { + "epoch": 0.02, + "grad_norm": 0.9171234965324402, + "learning_rate": 0.0002553191489361702, + "loss": 4.1359, + "step": 120 + }, + { + "epoch": 0.02, + "grad_norm": 0.9806623458862305, + "learning_rate": 0.0002574468085106383, + "loss": 4.2147, + "step": 121 + }, + { + "epoch": 0.02, + "grad_norm": 0.9419201016426086, + "learning_rate": 0.00025957446808510637, + "loss": 4.1773, + "step": 122 + }, + { + "epoch": 0.02, + "grad_norm": 0.9396133422851562, + "learning_rate": 0.0002617021276595745, + "loss": 4.2412, + "step": 123 + }, + { + "epoch": 0.02, + "grad_norm": 0.9173053503036499, + "learning_rate": 0.00026382978723404253, + "loss": 4.3375, + "step": 124 + }, + { + "epoch": 0.02, + "grad_norm": 0.922222912311554, + "learning_rate": 0.00026595744680851064, + "loss": 4.0584, + "step": 125 + }, + { + "epoch": 0.02, + "grad_norm": 1.0579538345336914, + "learning_rate": 0.00026808510638297875, + "loss": 4.2104, + "step": 126 + }, + { + "epoch": 0.02, + "grad_norm": 0.862234890460968, + "learning_rate": 0.0002702127659574468, + "loss": 4.2614, + "step": 127 + }, + { + "epoch": 0.02, + "grad_norm": 1.0590007305145264, + "learning_rate": 0.0002723404255319149, + "loss": 4.3827, + "step": 128 + }, + { + "epoch": 0.02, + "grad_norm": 1.0478700399398804, + "learning_rate": 0.000274468085106383, + "loss": 4.1992, + "step": 129 + }, + { + "epoch": 0.02, + "grad_norm": 1.0220136642456055, + "learning_rate": 0.0002765957446808511, + "loss": 4.2198, + "step": 130 + }, + { + "epoch": 0.02, + "grad_norm": 1.0901641845703125, + "learning_rate": 0.00027872340425531914, + "loss": 4.2635, + "step": 131 + }, + { + "epoch": 0.02, + "grad_norm": 1.0935157537460327, + "learning_rate": 0.00028085106382978725, + "loss": 4.1894, + "step": 132 + }, + { + "epoch": 0.02, + "grad_norm": 1.286113977432251, + "learning_rate": 0.00028297872340425536, + "loss": 4.3281, + "step": 133 + }, + { + "epoch": 0.02, + "grad_norm": 0.9515753388404846, + "learning_rate": 0.0002851063829787234, + "loss": 4.2958, + "step": 134 + }, + { + "epoch": 0.02, + "grad_norm": 1.2348365783691406, + "learning_rate": 0.00028723404255319153, + "loss": 4.0989, + "step": 135 + }, + { + "epoch": 0.02, + "grad_norm": 1.276850700378418, + "learning_rate": 0.00028936170212765953, + "loss": 4.1455, + "step": 136 + }, + { + "epoch": 0.02, + "grad_norm": 1.26995050907135, + "learning_rate": 0.00029148936170212764, + "loss": 4.1291, + "step": 137 + }, + { + "epoch": 0.02, + "grad_norm": 1.523987889289856, + "learning_rate": 0.00029361702127659575, + "loss": 4.0667, + "step": 138 + }, + { + "epoch": 0.02, + "grad_norm": 1.3023549318313599, + "learning_rate": 0.0002957446808510638, + "loss": 4.1695, + "step": 139 + }, + { + "epoch": 0.02, + "grad_norm": 1.365368127822876, + "learning_rate": 0.0002978723404255319, + "loss": 4.1791, + "step": 140 + }, + { + "epoch": 0.02, + "grad_norm": 1.2599130868911743, + "learning_rate": 0.0003, + "loss": 4.2204, + "step": 141 + }, + { + "epoch": 0.02, + "grad_norm": 1.4464629888534546, + "learning_rate": 0.0003021276595744681, + "loss": 4.2318, + "step": 142 + }, + { + "epoch": 0.02, + "grad_norm": 1.1470637321472168, + "learning_rate": 0.0003042553191489362, + "loss": 4.1789, + "step": 143 + }, + { + "epoch": 0.02, + "grad_norm": 1.162804365158081, + "learning_rate": 0.00030638297872340425, + "loss": 4.1139, + "step": 144 + }, + { + "epoch": 0.02, + "grad_norm": 1.187334418296814, + "learning_rate": 0.00030851063829787236, + "loss": 3.9921, + "step": 145 + }, + { + "epoch": 0.02, + "grad_norm": 1.4393917322158813, + "learning_rate": 0.0003106382978723404, + "loss": 4.1667, + "step": 146 + }, + { + "epoch": 0.02, + "grad_norm": 1.2334777116775513, + "learning_rate": 0.0003127659574468085, + "loss": 3.9603, + "step": 147 + }, + { + "epoch": 0.02, + "grad_norm": 1.4265984296798706, + "learning_rate": 0.00031489361702127664, + "loss": 4.1494, + "step": 148 + }, + { + "epoch": 0.02, + "grad_norm": 1.3457194566726685, + "learning_rate": 0.0003170212765957447, + "loss": 4.035, + "step": 149 + }, + { + "epoch": 0.02, + "grad_norm": 1.254904866218567, + "learning_rate": 0.0003191489361702128, + "loss": 4.1278, + "step": 150 + }, + { + "epoch": 0.02, + "grad_norm": 1.3408393859863281, + "learning_rate": 0.00032127659574468086, + "loss": 4.0868, + "step": 151 + }, + { + "epoch": 0.02, + "grad_norm": 1.306312918663025, + "learning_rate": 0.00032340425531914897, + "loss": 4.0424, + "step": 152 + }, + { + "epoch": 0.02, + "grad_norm": 1.2912030220031738, + "learning_rate": 0.0003255319148936171, + "loss": 3.9918, + "step": 153 + }, + { + "epoch": 0.02, + "grad_norm": 1.4844059944152832, + "learning_rate": 0.0003276595744680851, + "loss": 4.0833, + "step": 154 + }, + { + "epoch": 0.02, + "grad_norm": 1.3825430870056152, + "learning_rate": 0.0003297872340425532, + "loss": 4.0956, + "step": 155 + }, + { + "epoch": 0.02, + "grad_norm": 1.3500663042068481, + "learning_rate": 0.00033191489361702125, + "loss": 4.1236, + "step": 156 + }, + { + "epoch": 0.02, + "grad_norm": 1.284753441810608, + "learning_rate": 0.00033404255319148936, + "loss": 4.0041, + "step": 157 + }, + { + "epoch": 0.02, + "grad_norm": 1.225710391998291, + "learning_rate": 0.0003361702127659574, + "loss": 4.0542, + "step": 158 + }, + { + "epoch": 0.02, + "grad_norm": 1.3104676008224487, + "learning_rate": 0.0003382978723404255, + "loss": 4.1674, + "step": 159 + }, + { + "epoch": 0.02, + "grad_norm": 1.2814124822616577, + "learning_rate": 0.00034042553191489364, + "loss": 4.1104, + "step": 160 + }, + { + "epoch": 0.02, + "grad_norm": 1.258671760559082, + "learning_rate": 0.0003425531914893617, + "loss": 3.9289, + "step": 161 + }, + { + "epoch": 0.02, + "grad_norm": 1.3215150833129883, + "learning_rate": 0.0003446808510638298, + "loss": 4.0127, + "step": 162 + }, + { + "epoch": 0.02, + "grad_norm": 1.372815489768982, + "learning_rate": 0.00034680851063829786, + "loss": 4.1305, + "step": 163 + }, + { + "epoch": 0.02, + "grad_norm": 1.322662115097046, + "learning_rate": 0.00034893617021276597, + "loss": 3.956, + "step": 164 + }, + { + "epoch": 0.02, + "grad_norm": 1.5933729410171509, + "learning_rate": 0.0003510638297872341, + "loss": 4.0883, + "step": 165 + }, + { + "epoch": 0.02, + "grad_norm": 1.2900516986846924, + "learning_rate": 0.00035319148936170213, + "loss": 4.0051, + "step": 166 + }, + { + "epoch": 0.02, + "grad_norm": 1.4232264757156372, + "learning_rate": 0.00035531914893617025, + "loss": 4.0694, + "step": 167 + }, + { + "epoch": 0.02, + "grad_norm": 1.377969741821289, + "learning_rate": 0.0003574468085106383, + "loss": 4.0636, + "step": 168 + }, + { + "epoch": 0.02, + "grad_norm": 1.3479586839675903, + "learning_rate": 0.0003595744680851064, + "loss": 4.1845, + "step": 169 + }, + { + "epoch": 0.02, + "grad_norm": 1.3103374242782593, + "learning_rate": 0.0003617021276595745, + "loss": 3.9658, + "step": 170 + }, + { + "epoch": 0.02, + "grad_norm": 1.3195881843566895, + "learning_rate": 0.0003638297872340426, + "loss": 4.0497, + "step": 171 + }, + { + "epoch": 0.02, + "grad_norm": 1.282640814781189, + "learning_rate": 0.00036595744680851063, + "loss": 4.024, + "step": 172 + }, + { + "epoch": 0.02, + "grad_norm": 1.4359363317489624, + "learning_rate": 0.0003680851063829787, + "loss": 3.9915, + "step": 173 + }, + { + "epoch": 0.02, + "grad_norm": 1.3655848503112793, + "learning_rate": 0.0003702127659574468, + "loss": 4.0806, + "step": 174 + }, + { + "epoch": 0.02, + "grad_norm": 1.3621926307678223, + "learning_rate": 0.0003723404255319149, + "loss": 4.1544, + "step": 175 + }, + { + "epoch": 0.02, + "grad_norm": 1.3997646570205688, + "learning_rate": 0.00037446808510638297, + "loss": 3.8485, + "step": 176 + }, + { + "epoch": 0.02, + "grad_norm": 1.539406180381775, + "learning_rate": 0.0003765957446808511, + "loss": 4.1827, + "step": 177 + }, + { + "epoch": 0.02, + "grad_norm": 1.5509058237075806, + "learning_rate": 0.00037872340425531913, + "loss": 3.9315, + "step": 178 + }, + { + "epoch": 0.02, + "grad_norm": 1.381636142730713, + "learning_rate": 0.00038085106382978724, + "loss": 3.9717, + "step": 179 + }, + { + "epoch": 0.02, + "grad_norm": 1.3316289186477661, + "learning_rate": 0.0003829787234042553, + "loss": 3.9606, + "step": 180 + }, + { + "epoch": 0.02, + "grad_norm": 1.3810575008392334, + "learning_rate": 0.0003851063829787234, + "loss": 3.9959, + "step": 181 + }, + { + "epoch": 0.02, + "grad_norm": 1.6070252656936646, + "learning_rate": 0.0003872340425531915, + "loss": 4.0106, + "step": 182 + }, + { + "epoch": 0.02, + "grad_norm": 1.1589950323104858, + "learning_rate": 0.0003893617021276596, + "loss": 3.9195, + "step": 183 + }, + { + "epoch": 0.02, + "grad_norm": 1.1375452280044556, + "learning_rate": 0.0003914893617021277, + "loss": 3.9347, + "step": 184 + }, + { + "epoch": 0.02, + "grad_norm": 1.451849102973938, + "learning_rate": 0.00039361702127659574, + "loss": 4.1053, + "step": 185 + }, + { + "epoch": 0.02, + "grad_norm": 1.171995997428894, + "learning_rate": 0.00039574468085106385, + "loss": 4.0659, + "step": 186 + }, + { + "epoch": 0.02, + "grad_norm": 1.3602004051208496, + "learning_rate": 0.00039787234042553196, + "loss": 3.9373, + "step": 187 + }, + { + "epoch": 0.02, + "grad_norm": 1.4252485036849976, + "learning_rate": 0.0004, + "loss": 3.9485, + "step": 188 + }, + { + "epoch": 0.02, + "grad_norm": 1.3669577836990356, + "learning_rate": 0.00040212765957446813, + "loss": 3.9626, + "step": 189 + }, + { + "epoch": 0.02, + "grad_norm": 1.4510173797607422, + "learning_rate": 0.00040425531914893613, + "loss": 4.0763, + "step": 190 + }, + { + "epoch": 0.02, + "grad_norm": 1.6044769287109375, + "learning_rate": 0.00040638297872340424, + "loss": 3.9054, + "step": 191 + }, + { + "epoch": 0.02, + "grad_norm": 1.3575087785720825, + "learning_rate": 0.00040851063829787235, + "loss": 3.9598, + "step": 192 + }, + { + "epoch": 0.02, + "grad_norm": 1.4169284105300903, + "learning_rate": 0.0004106382978723404, + "loss": 4.0932, + "step": 193 + }, + { + "epoch": 0.02, + "grad_norm": 1.3452703952789307, + "learning_rate": 0.0004127659574468085, + "loss": 3.9784, + "step": 194 + }, + { + "epoch": 0.02, + "grad_norm": 1.2284389734268188, + "learning_rate": 0.0004148936170212766, + "loss": 4.149, + "step": 195 + }, + { + "epoch": 0.03, + "grad_norm": 1.5200728178024292, + "learning_rate": 0.0004170212765957447, + "loss": 3.9903, + "step": 196 + }, + { + "epoch": 0.03, + "grad_norm": 1.7464290857315063, + "learning_rate": 0.0004191489361702128, + "loss": 3.9171, + "step": 197 + }, + { + "epoch": 0.03, + "grad_norm": 1.219582438468933, + "learning_rate": 0.00042127659574468085, + "loss": 3.9455, + "step": 198 + }, + { + "epoch": 0.03, + "grad_norm": 1.3889364004135132, + "learning_rate": 0.00042340425531914896, + "loss": 4.0542, + "step": 199 + }, + { + "epoch": 0.03, + "grad_norm": 1.5458612442016602, + "learning_rate": 0.000425531914893617, + "loss": 3.9974, + "step": 200 + }, + { + "epoch": 0.03, + "grad_norm": 1.3700460195541382, + "learning_rate": 0.00042765957446808513, + "loss": 3.919, + "step": 201 + }, + { + "epoch": 0.03, + "grad_norm": 1.6473002433776855, + "learning_rate": 0.0004297872340425532, + "loss": 3.9714, + "step": 202 + }, + { + "epoch": 0.03, + "grad_norm": 1.1797369718551636, + "learning_rate": 0.0004319148936170213, + "loss": 3.9766, + "step": 203 + }, + { + "epoch": 0.03, + "grad_norm": 1.1952027082443237, + "learning_rate": 0.0004340425531914894, + "loss": 3.9486, + "step": 204 + }, + { + "epoch": 0.03, + "grad_norm": 1.4129725694656372, + "learning_rate": 0.00043617021276595746, + "loss": 3.9886, + "step": 205 + }, + { + "epoch": 0.03, + "grad_norm": 1.2671421766281128, + "learning_rate": 0.00043829787234042557, + "loss": 3.7758, + "step": 206 + }, + { + "epoch": 0.03, + "grad_norm": 1.422463059425354, + "learning_rate": 0.0004404255319148936, + "loss": 3.9077, + "step": 207 + }, + { + "epoch": 0.03, + "grad_norm": 1.4845503568649292, + "learning_rate": 0.0004425531914893617, + "loss": 3.8417, + "step": 208 + }, + { + "epoch": 0.03, + "grad_norm": 1.202187418937683, + "learning_rate": 0.0004446808510638298, + "loss": 3.9389, + "step": 209 + }, + { + "epoch": 0.03, + "grad_norm": 1.444098711013794, + "learning_rate": 0.00044680851063829785, + "loss": 3.951, + "step": 210 + }, + { + "epoch": 0.03, + "grad_norm": 1.4373562335968018, + "learning_rate": 0.00044893617021276596, + "loss": 3.9285, + "step": 211 + }, + { + "epoch": 0.03, + "grad_norm": 1.4413760900497437, + "learning_rate": 0.000451063829787234, + "loss": 3.9084, + "step": 212 + }, + { + "epoch": 0.03, + "grad_norm": 1.457393765449524, + "learning_rate": 0.0004531914893617021, + "loss": 3.8667, + "step": 213 + }, + { + "epoch": 0.03, + "grad_norm": 1.4463461637496948, + "learning_rate": 0.00045531914893617024, + "loss": 3.8545, + "step": 214 + }, + { + "epoch": 0.03, + "grad_norm": 1.5317926406860352, + "learning_rate": 0.0004574468085106383, + "loss": 3.8489, + "step": 215 + }, + { + "epoch": 0.03, + "grad_norm": 1.3613853454589844, + "learning_rate": 0.0004595744680851064, + "loss": 3.8174, + "step": 216 + }, + { + "epoch": 0.03, + "grad_norm": 1.4517394304275513, + "learning_rate": 0.00046170212765957446, + "loss": 3.8263, + "step": 217 + }, + { + "epoch": 0.03, + "grad_norm": 1.1728559732437134, + "learning_rate": 0.00046382978723404257, + "loss": 3.8656, + "step": 218 + }, + { + "epoch": 0.03, + "grad_norm": 1.242660641670227, + "learning_rate": 0.0004659574468085107, + "loss": 3.7996, + "step": 219 + }, + { + "epoch": 0.03, + "grad_norm": 1.4940431118011475, + "learning_rate": 0.00046808510638297874, + "loss": 4.0835, + "step": 220 + }, + { + "epoch": 0.03, + "grad_norm": 1.3037731647491455, + "learning_rate": 0.00047021276595744685, + "loss": 4.0274, + "step": 221 + }, + { + "epoch": 0.03, + "grad_norm": 1.3616893291473389, + "learning_rate": 0.0004723404255319149, + "loss": 4.0012, + "step": 222 + }, + { + "epoch": 0.03, + "grad_norm": 1.081254482269287, + "learning_rate": 0.000474468085106383, + "loss": 3.7804, + "step": 223 + }, + { + "epoch": 0.03, + "grad_norm": 1.778045654296875, + "learning_rate": 0.0004765957446808511, + "loss": 3.9001, + "step": 224 + }, + { + "epoch": 0.03, + "grad_norm": 1.1809135675430298, + "learning_rate": 0.0004787234042553192, + "loss": 3.8351, + "step": 225 + }, + { + "epoch": 0.03, + "grad_norm": 1.2070225477218628, + "learning_rate": 0.00048085106382978723, + "loss": 3.9147, + "step": 226 + }, + { + "epoch": 0.03, + "grad_norm": 1.4028456211090088, + "learning_rate": 0.0004829787234042553, + "loss": 3.8441, + "step": 227 + }, + { + "epoch": 0.03, + "grad_norm": 1.1965014934539795, + "learning_rate": 0.0004851063829787234, + "loss": 3.9585, + "step": 228 + }, + { + "epoch": 0.03, + "grad_norm": 1.2288951873779297, + "learning_rate": 0.00048723404255319146, + "loss": 3.9131, + "step": 229 + }, + { + "epoch": 0.03, + "grad_norm": 1.5023062229156494, + "learning_rate": 0.0004893617021276596, + "loss": 3.6967, + "step": 230 + }, + { + "epoch": 0.03, + "grad_norm": 1.2283579111099243, + "learning_rate": 0.0004914893617021277, + "loss": 3.949, + "step": 231 + }, + { + "epoch": 0.03, + "grad_norm": 1.3287562131881714, + "learning_rate": 0.0004936170212765957, + "loss": 3.9456, + "step": 232 + }, + { + "epoch": 0.03, + "grad_norm": 1.2116198539733887, + "learning_rate": 0.0004957446808510638, + "loss": 3.9013, + "step": 233 + }, + { + "epoch": 0.03, + "grad_norm": 1.242817759513855, + "learning_rate": 0.000497872340425532, + "loss": 3.7293, + "step": 234 + }, + { + "epoch": 0.03, + "grad_norm": 1.2188082933425903, + "learning_rate": 0.0005, + "loss": 3.889, + "step": 235 + }, + { + "epoch": 0.03, + "grad_norm": 1.3149223327636719, + "learning_rate": 0.0004999999785110509, + "loss": 3.8368, + "step": 236 + }, + { + "epoch": 0.03, + "grad_norm": 1.1687288284301758, + "learning_rate": 0.0004999999140442072, + "loss": 3.7987, + "step": 237 + }, + { + "epoch": 0.03, + "grad_norm": 1.2290496826171875, + "learning_rate": 0.0004999998065994801, + "loss": 3.933, + "step": 238 + }, + { + "epoch": 0.03, + "grad_norm": 1.2678278684616089, + "learning_rate": 0.0004999996561768879, + "loss": 4.0096, + "step": 239 + }, + { + "epoch": 0.03, + "grad_norm": 1.1861952543258667, + "learning_rate": 0.0004999994627764566, + "loss": 3.8446, + "step": 240 + }, + { + "epoch": 0.03, + "grad_norm": 1.275098443031311, + "learning_rate": 0.0004999992263982194, + "loss": 3.8766, + "step": 241 + }, + { + "epoch": 0.03, + "grad_norm": 1.190368413925171, + "learning_rate": 0.000499998947042217, + "loss": 3.7794, + "step": 242 + }, + { + "epoch": 0.03, + "grad_norm": 1.6217900514602661, + "learning_rate": 0.0004999986247084974, + "loss": 3.8662, + "step": 243 + }, + { + "epoch": 0.03, + "grad_norm": 1.160539150238037, + "learning_rate": 0.0004999982593971157, + "loss": 3.9881, + "step": 244 + }, + { + "epoch": 0.03, + "grad_norm": 1.2364122867584229, + "learning_rate": 0.0004999978511081353, + "loss": 3.8424, + "step": 245 + }, + { + "epoch": 0.03, + "grad_norm": 1.2432687282562256, + "learning_rate": 0.0004999973998416259, + "loss": 3.8258, + "step": 246 + }, + { + "epoch": 0.03, + "grad_norm": 1.4316729307174683, + "learning_rate": 0.0004999969055976653, + "loss": 3.8667, + "step": 247 + }, + { + "epoch": 0.03, + "grad_norm": 1.1562659740447998, + "learning_rate": 0.0004999963683763384, + "loss": 3.7427, + "step": 248 + }, + { + "epoch": 0.03, + "grad_norm": 1.1819325685501099, + "learning_rate": 0.0004999957881777376, + "loss": 3.7863, + "step": 249 + }, + { + "epoch": 0.03, + "grad_norm": 1.3457486629486084, + "learning_rate": 0.0004999951650019627, + "loss": 4.0465, + "step": 250 + }, + { + "epoch": 0.03, + "grad_norm": 1.232448697090149, + "learning_rate": 0.0004999944988491207, + "loss": 3.7483, + "step": 251 + }, + { + "epoch": 0.03, + "grad_norm": 1.208624243736267, + "learning_rate": 0.000499993789719326, + "loss": 3.9846, + "step": 252 + }, + { + "epoch": 0.03, + "grad_norm": 1.267774224281311, + "learning_rate": 0.0004999930376127007, + "loss": 3.9279, + "step": 253 + }, + { + "epoch": 0.03, + "grad_norm": 1.2672343254089355, + "learning_rate": 0.0004999922425293743, + "loss": 3.7615, + "step": 254 + }, + { + "epoch": 0.03, + "grad_norm": 1.0653913021087646, + "learning_rate": 0.000499991404469483, + "loss": 3.9252, + "step": 255 + }, + { + "epoch": 0.03, + "grad_norm": 1.3523491621017456, + "learning_rate": 0.0004999905234331712, + "loss": 3.8587, + "step": 256 + }, + { + "epoch": 0.03, + "grad_norm": 1.3088959455490112, + "learning_rate": 0.0004999895994205903, + "loss": 3.9086, + "step": 257 + }, + { + "epoch": 0.03, + "grad_norm": 1.2633506059646606, + "learning_rate": 0.0004999886324318992, + "loss": 3.799, + "step": 258 + }, + { + "epoch": 0.03, + "grad_norm": 1.1152255535125732, + "learning_rate": 0.000499987622467264, + "loss": 3.8537, + "step": 259 + }, + { + "epoch": 0.03, + "grad_norm": 1.1998374462127686, + "learning_rate": 0.0004999865695268584, + "loss": 3.8254, + "step": 260 + }, + { + "epoch": 0.03, + "grad_norm": 1.329124927520752, + "learning_rate": 0.0004999854736108633, + "loss": 3.8573, + "step": 261 + }, + { + "epoch": 0.03, + "grad_norm": 1.2538161277770996, + "learning_rate": 0.0004999843347194674, + "loss": 3.9046, + "step": 262 + }, + { + "epoch": 0.03, + "grad_norm": 1.1962872743606567, + "learning_rate": 0.0004999831528528662, + "loss": 3.8586, + "step": 263 + }, + { + "epoch": 0.03, + "grad_norm": 1.3084690570831299, + "learning_rate": 0.0004999819280112629, + "loss": 3.896, + "step": 264 + }, + { + "epoch": 0.03, + "grad_norm": 1.0731728076934814, + "learning_rate": 0.0004999806601948682, + "loss": 3.8193, + "step": 265 + }, + { + "epoch": 0.03, + "grad_norm": 1.3075398206710815, + "learning_rate": 0.0004999793494039, + "loss": 3.9347, + "step": 266 + }, + { + "epoch": 0.03, + "grad_norm": 1.2276506423950195, + "learning_rate": 0.0004999779956385836, + "loss": 3.8585, + "step": 267 + }, + { + "epoch": 0.03, + "grad_norm": 0.9807101488113403, + "learning_rate": 0.0004999765988991518, + "loss": 3.7606, + "step": 268 + }, + { + "epoch": 0.03, + "grad_norm": 1.0475084781646729, + "learning_rate": 0.0004999751591858447, + "loss": 3.7161, + "step": 269 + }, + { + "epoch": 0.03, + "grad_norm": 1.1833949089050293, + "learning_rate": 0.0004999736764989096, + "loss": 3.7091, + "step": 270 + }, + { + "epoch": 0.03, + "grad_norm": 1.1262840032577515, + "learning_rate": 0.0004999721508386018, + "loss": 3.8663, + "step": 271 + }, + { + "epoch": 0.03, + "grad_norm": 1.1360872983932495, + "learning_rate": 0.0004999705822051832, + "loss": 3.8956, + "step": 272 + }, + { + "epoch": 0.03, + "grad_norm": 1.2529104948043823, + "learning_rate": 0.0004999689705989237, + "loss": 3.8955, + "step": 273 + }, + { + "epoch": 0.04, + "grad_norm": 1.1704559326171875, + "learning_rate": 0.0004999673160201001, + "loss": 3.683, + "step": 274 + }, + { + "epoch": 0.04, + "grad_norm": 1.3770536184310913, + "learning_rate": 0.0004999656184689972, + "loss": 3.8874, + "step": 275 + }, + { + "epoch": 0.04, + "grad_norm": 1.2912108898162842, + "learning_rate": 0.0004999638779459065, + "loss": 3.8122, + "step": 276 + }, + { + "epoch": 0.04, + "grad_norm": 1.2223007678985596, + "learning_rate": 0.0004999620944511274, + "loss": 3.8967, + "step": 277 + }, + { + "epoch": 0.04, + "grad_norm": 1.205087661743164, + "learning_rate": 0.0004999602679849665, + "loss": 3.7982, + "step": 278 + }, + { + "epoch": 0.04, + "grad_norm": 1.1759825944900513, + "learning_rate": 0.0004999583985477377, + "loss": 3.7433, + "step": 279 + }, + { + "epoch": 0.04, + "grad_norm": 1.3122222423553467, + "learning_rate": 0.0004999564861397624, + "loss": 3.7569, + "step": 280 + }, + { + "epoch": 0.04, + "grad_norm": 1.177197813987732, + "learning_rate": 0.0004999545307613695, + "loss": 3.9638, + "step": 281 + }, + { + "epoch": 0.04, + "grad_norm": 1.0554687976837158, + "learning_rate": 0.0004999525324128949, + "loss": 3.7023, + "step": 282 + }, + { + "epoch": 0.04, + "grad_norm": 1.1935759782791138, + "learning_rate": 0.0004999504910946824, + "loss": 3.8672, + "step": 283 + }, + { + "epoch": 0.04, + "grad_norm": 1.2850805521011353, + "learning_rate": 0.0004999484068070827, + "loss": 3.7865, + "step": 284 + }, + { + "epoch": 0.04, + "grad_norm": 1.1198339462280273, + "learning_rate": 0.0004999462795504542, + "loss": 3.829, + "step": 285 + }, + { + "epoch": 0.04, + "grad_norm": 1.138165831565857, + "learning_rate": 0.0004999441093251627, + "loss": 3.854, + "step": 286 + }, + { + "epoch": 0.04, + "grad_norm": 1.0754280090332031, + "learning_rate": 0.0004999418961315812, + "loss": 3.832, + "step": 287 + }, + { + "epoch": 0.04, + "grad_norm": 1.0576859712600708, + "learning_rate": 0.0004999396399700902, + "loss": 3.8533, + "step": 288 + }, + { + "epoch": 0.04, + "grad_norm": 1.2031567096710205, + "learning_rate": 0.0004999373408410775, + "loss": 3.9285, + "step": 289 + }, + { + "epoch": 0.04, + "grad_norm": 1.1341794729232788, + "learning_rate": 0.0004999349987449384, + "loss": 3.9406, + "step": 290 + }, + { + "epoch": 0.04, + "grad_norm": 1.0620028972625732, + "learning_rate": 0.0004999326136820754, + "loss": 3.8794, + "step": 291 + }, + { + "epoch": 0.04, + "grad_norm": 1.2464852333068848, + "learning_rate": 0.0004999301856528989, + "loss": 3.9019, + "step": 292 + }, + { + "epoch": 0.04, + "grad_norm": 1.0631821155548096, + "learning_rate": 0.0004999277146578258, + "loss": 3.6962, + "step": 293 + }, + { + "epoch": 0.04, + "grad_norm": 1.0158509016036987, + "learning_rate": 0.0004999252006972813, + "loss": 3.7356, + "step": 294 + }, + { + "epoch": 0.04, + "grad_norm": 1.038588047027588, + "learning_rate": 0.0004999226437716974, + "loss": 3.7918, + "step": 295 + }, + { + "epoch": 0.04, + "grad_norm": 1.1335421800613403, + "learning_rate": 0.0004999200438815136, + "loss": 3.7246, + "step": 296 + }, + { + "epoch": 0.04, + "grad_norm": 1.126384973526001, + "learning_rate": 0.000499917401027177, + "loss": 3.7059, + "step": 297 + }, + { + "epoch": 0.04, + "grad_norm": 1.0297484397888184, + "learning_rate": 0.0004999147152091419, + "loss": 3.8476, + "step": 298 + }, + { + "epoch": 0.04, + "grad_norm": 1.1043933629989624, + "learning_rate": 0.0004999119864278699, + "loss": 3.7188, + "step": 299 + }, + { + "epoch": 0.04, + "grad_norm": 1.0596013069152832, + "learning_rate": 0.0004999092146838302, + "loss": 3.7761, + "step": 300 + }, + { + "epoch": 0.04, + "grad_norm": 1.2465026378631592, + "learning_rate": 0.0004999063999774994, + "loss": 3.8733, + "step": 301 + }, + { + "epoch": 0.04, + "grad_norm": 1.1994378566741943, + "learning_rate": 0.0004999035423093612, + "loss": 3.7724, + "step": 302 + }, + { + "epoch": 0.04, + "grad_norm": 1.140863060951233, + "learning_rate": 0.000499900641679907, + "loss": 3.8162, + "step": 303 + }, + { + "epoch": 0.04, + "grad_norm": 1.0662174224853516, + "learning_rate": 0.0004998976980896354, + "loss": 3.831, + "step": 304 + }, + { + "epoch": 0.04, + "grad_norm": 1.201188325881958, + "learning_rate": 0.0004998947115390524, + "loss": 3.9026, + "step": 305 + }, + { + "epoch": 0.04, + "grad_norm": 1.1321618556976318, + "learning_rate": 0.0004998916820286714, + "loss": 3.8513, + "step": 306 + }, + { + "epoch": 0.04, + "grad_norm": 0.9739273190498352, + "learning_rate": 0.0004998886095590134, + "loss": 3.9243, + "step": 307 + }, + { + "epoch": 0.04, + "grad_norm": 1.112061858177185, + "learning_rate": 0.0004998854941306064, + "loss": 3.8003, + "step": 308 + }, + { + "epoch": 0.04, + "grad_norm": 1.1328226327896118, + "learning_rate": 0.000499882335743986, + "loss": 3.7843, + "step": 309 + }, + { + "epoch": 0.04, + "grad_norm": 1.1005468368530273, + "learning_rate": 0.0004998791343996952, + "loss": 3.7154, + "step": 310 + }, + { + "epoch": 0.04, + "grad_norm": 1.102019190788269, + "learning_rate": 0.0004998758900982845, + "loss": 3.7542, + "step": 311 + }, + { + "epoch": 0.04, + "grad_norm": 1.2541944980621338, + "learning_rate": 0.0004998726028403114, + "loss": 3.7998, + "step": 312 + }, + { + "epoch": 0.04, + "grad_norm": 1.0166407823562622, + "learning_rate": 0.000499869272626341, + "loss": 3.7849, + "step": 313 + }, + { + "epoch": 0.04, + "grad_norm": 1.1922158002853394, + "learning_rate": 0.0004998658994569459, + "loss": 3.861, + "step": 314 + }, + { + "epoch": 0.04, + "grad_norm": 1.2430380582809448, + "learning_rate": 0.0004998624833327061, + "loss": 3.8154, + "step": 315 + }, + { + "epoch": 0.04, + "grad_norm": 1.2010189294815063, + "learning_rate": 0.0004998590242542087, + "loss": 3.8315, + "step": 316 + }, + { + "epoch": 0.04, + "grad_norm": 1.1250263452529907, + "learning_rate": 0.0004998555222220485, + "loss": 3.8601, + "step": 317 + }, + { + "epoch": 0.04, + "grad_norm": 1.1066341400146484, + "learning_rate": 0.0004998519772368273, + "loss": 3.8251, + "step": 318 + }, + { + "epoch": 0.04, + "grad_norm": 1.1382642984390259, + "learning_rate": 0.0004998483892991549, + "loss": 3.6828, + "step": 319 + }, + { + "epoch": 0.04, + "grad_norm": 1.06110680103302, + "learning_rate": 0.0004998447584096477, + "loss": 3.8914, + "step": 320 + }, + { + "epoch": 0.04, + "grad_norm": 1.044512391090393, + "learning_rate": 0.0004998410845689301, + "loss": 3.7508, + "step": 321 + }, + { + "epoch": 0.04, + "grad_norm": 1.056077003479004, + "learning_rate": 0.0004998373677776337, + "loss": 3.7714, + "step": 322 + }, + { + "epoch": 0.04, + "grad_norm": 1.1969995498657227, + "learning_rate": 0.0004998336080363975, + "loss": 3.7438, + "step": 323 + }, + { + "epoch": 0.04, + "grad_norm": 1.0868432521820068, + "learning_rate": 0.0004998298053458676, + "loss": 3.7956, + "step": 324 + }, + { + "epoch": 0.04, + "grad_norm": 0.9396175146102905, + "learning_rate": 0.000499825959706698, + "loss": 3.6727, + "step": 325 + }, + { + "epoch": 0.04, + "grad_norm": 1.0107090473175049, + "learning_rate": 0.0004998220711195496, + "loss": 3.8141, + "step": 326 + }, + { + "epoch": 0.04, + "grad_norm": 1.1681958436965942, + "learning_rate": 0.0004998181395850911, + "loss": 3.6731, + "step": 327 + }, + { + "epoch": 0.04, + "grad_norm": 0.969027042388916, + "learning_rate": 0.0004998141651039982, + "loss": 3.7648, + "step": 328 + }, + { + "epoch": 0.04, + "grad_norm": 1.016277551651001, + "learning_rate": 0.0004998101476769542, + "loss": 3.8649, + "step": 329 + }, + { + "epoch": 0.04, + "grad_norm": 1.2155287265777588, + "learning_rate": 0.0004998060873046498, + "loss": 3.911, + "step": 330 + }, + { + "epoch": 0.04, + "grad_norm": 1.138216495513916, + "learning_rate": 0.000499801983987783, + "loss": 3.7283, + "step": 331 + }, + { + "epoch": 0.04, + "grad_norm": 0.9919595718383789, + "learning_rate": 0.0004997978377270591, + "loss": 3.7001, + "step": 332 + }, + { + "epoch": 0.04, + "grad_norm": 1.0648598670959473, + "learning_rate": 0.0004997936485231911, + "loss": 3.8649, + "step": 333 + }, + { + "epoch": 0.04, + "grad_norm": 1.069198727607727, + "learning_rate": 0.0004997894163768992, + "loss": 3.7799, + "step": 334 + }, + { + "epoch": 0.04, + "grad_norm": 1.1835734844207764, + "learning_rate": 0.0004997851412889106, + "loss": 3.7385, + "step": 335 + }, + { + "epoch": 0.04, + "grad_norm": 1.1583417654037476, + "learning_rate": 0.0004997808232599604, + "loss": 3.6885, + "step": 336 + }, + { + "epoch": 0.04, + "grad_norm": 1.1449414491653442, + "learning_rate": 0.0004997764622907911, + "loss": 3.6956, + "step": 337 + }, + { + "epoch": 0.04, + "grad_norm": 1.168426513671875, + "learning_rate": 0.0004997720583821523, + "loss": 3.7565, + "step": 338 + }, + { + "epoch": 0.04, + "grad_norm": 1.0819423198699951, + "learning_rate": 0.000499767611534801, + "loss": 3.7627, + "step": 339 + }, + { + "epoch": 0.04, + "grad_norm": 1.2998234033584595, + "learning_rate": 0.0004997631217495018, + "loss": 3.7833, + "step": 340 + }, + { + "epoch": 0.04, + "grad_norm": 0.9576529860496521, + "learning_rate": 0.0004997585890270265, + "loss": 3.8186, + "step": 341 + }, + { + "epoch": 0.04, + "grad_norm": 1.0528411865234375, + "learning_rate": 0.0004997540133681541, + "loss": 3.706, + "step": 342 + }, + { + "epoch": 0.04, + "grad_norm": 1.0465948581695557, + "learning_rate": 0.0004997493947736715, + "loss": 3.7249, + "step": 343 + }, + { + "epoch": 0.04, + "grad_norm": 0.9764485955238342, + "learning_rate": 0.0004997447332443727, + "loss": 3.6947, + "step": 344 + }, + { + "epoch": 0.04, + "grad_norm": 0.9489080309867859, + "learning_rate": 0.0004997400287810587, + "loss": 3.7476, + "step": 345 + }, + { + "epoch": 0.04, + "grad_norm": 0.995637059211731, + "learning_rate": 0.0004997352813845388, + "loss": 3.7786, + "step": 346 + }, + { + "epoch": 0.04, + "grad_norm": 0.9482467174530029, + "learning_rate": 0.0004997304910556288, + "loss": 3.74, + "step": 347 + }, + { + "epoch": 0.04, + "grad_norm": 1.1336299180984497, + "learning_rate": 0.0004997256577951521, + "loss": 3.7968, + "step": 348 + }, + { + "epoch": 0.04, + "grad_norm": 1.0206025838851929, + "learning_rate": 0.0004997207816039398, + "loss": 3.6832, + "step": 349 + }, + { + "epoch": 0.04, + "grad_norm": 1.0714061260223389, + "learning_rate": 0.0004997158624828303, + "loss": 3.6713, + "step": 350 + }, + { + "epoch": 0.04, + "grad_norm": 1.0384105443954468, + "learning_rate": 0.000499710900432669, + "loss": 3.7539, + "step": 351 + }, + { + "epoch": 0.05, + "grad_norm": 1.263920783996582, + "learning_rate": 0.0004997058954543089, + "loss": 3.847, + "step": 352 + }, + { + "epoch": 0.05, + "grad_norm": 0.9722920656204224, + "learning_rate": 0.0004997008475486107, + "loss": 3.6932, + "step": 353 + }, + { + "epoch": 0.05, + "grad_norm": 0.9932896494865417, + "learning_rate": 0.000499695756716442, + "loss": 3.7424, + "step": 354 + }, + { + "epoch": 0.05, + "grad_norm": 1.5072218179702759, + "learning_rate": 0.0004996906229586778, + "loss": 3.7184, + "step": 355 + }, + { + "epoch": 0.05, + "grad_norm": 1.0924286842346191, + "learning_rate": 0.000499685446276201, + "loss": 3.741, + "step": 356 + }, + { + "epoch": 0.05, + "grad_norm": 0.9837846755981445, + "learning_rate": 0.0004996802266699014, + "loss": 3.6977, + "step": 357 + }, + { + "epoch": 0.05, + "grad_norm": 1.0560784339904785, + "learning_rate": 0.0004996749641406763, + "loss": 3.7476, + "step": 358 + }, + { + "epoch": 0.05, + "grad_norm": 1.043904423713684, + "learning_rate": 0.0004996696586894304, + "loss": 3.8986, + "step": 359 + }, + { + "epoch": 0.05, + "grad_norm": 1.0850636959075928, + "learning_rate": 0.0004996643103170757, + "loss": 3.7739, + "step": 360 + }, + { + "epoch": 0.05, + "grad_norm": 1.0535517930984497, + "learning_rate": 0.0004996589190245318, + "loss": 3.9393, + "step": 361 + }, + { + "epoch": 0.05, + "grad_norm": 0.8589828610420227, + "learning_rate": 0.0004996534848127253, + "loss": 3.651, + "step": 362 + }, + { + "epoch": 0.05, + "grad_norm": 1.2381101846694946, + "learning_rate": 0.0004996480076825906, + "loss": 3.8707, + "step": 363 + }, + { + "epoch": 0.05, + "grad_norm": 0.9625450372695923, + "learning_rate": 0.0004996424876350692, + "loss": 3.7679, + "step": 364 + }, + { + "epoch": 0.05, + "grad_norm": 0.9322208762168884, + "learning_rate": 0.00049963692467111, + "loss": 3.6827, + "step": 365 + }, + { + "epoch": 0.05, + "grad_norm": 1.0279086828231812, + "learning_rate": 0.0004996313187916694, + "loss": 3.7496, + "step": 366 + }, + { + "epoch": 0.05, + "grad_norm": 1.1358555555343628, + "learning_rate": 0.0004996256699977112, + "loss": 3.7333, + "step": 367 + }, + { + "epoch": 0.05, + "grad_norm": 1.1529221534729004, + "learning_rate": 0.0004996199782902064, + "loss": 3.6571, + "step": 368 + }, + { + "epoch": 0.05, + "grad_norm": 1.101511001586914, + "learning_rate": 0.0004996142436701336, + "loss": 3.692, + "step": 369 + }, + { + "epoch": 0.05, + "grad_norm": 1.1611484289169312, + "learning_rate": 0.0004996084661384783, + "loss": 3.8783, + "step": 370 + }, + { + "epoch": 0.05, + "grad_norm": 1.027530312538147, + "learning_rate": 0.0004996026456962341, + "loss": 3.7984, + "step": 371 + }, + { + "epoch": 0.05, + "grad_norm": 1.1063411235809326, + "learning_rate": 0.0004995967823444015, + "loss": 3.6956, + "step": 372 + }, + { + "epoch": 0.05, + "grad_norm": 1.212572693824768, + "learning_rate": 0.0004995908760839884, + "loss": 3.8137, + "step": 373 + }, + { + "epoch": 0.05, + "grad_norm": 1.067589282989502, + "learning_rate": 0.0004995849269160102, + "loss": 3.8697, + "step": 374 + }, + { + "epoch": 0.05, + "grad_norm": 1.170344352722168, + "learning_rate": 0.0004995789348414897, + "loss": 3.7037, + "step": 375 + }, + { + "epoch": 0.05, + "grad_norm": 1.0547147989273071, + "learning_rate": 0.0004995728998614567, + "loss": 3.8312, + "step": 376 + }, + { + "epoch": 0.05, + "grad_norm": 0.9910465478897095, + "learning_rate": 0.0004995668219769492, + "loss": 3.7954, + "step": 377 + }, + { + "epoch": 0.05, + "grad_norm": 0.950949490070343, + "learning_rate": 0.0004995607011890115, + "loss": 3.6607, + "step": 378 + }, + { + "epoch": 0.05, + "grad_norm": 0.9628477692604065, + "learning_rate": 0.0004995545374986963, + "loss": 3.7922, + "step": 379 + }, + { + "epoch": 0.05, + "grad_norm": 1.106607437133789, + "learning_rate": 0.000499548330907063, + "loss": 3.8673, + "step": 380 + }, + { + "epoch": 0.05, + "grad_norm": 1.1569526195526123, + "learning_rate": 0.0004995420814151786, + "loss": 3.7485, + "step": 381 + }, + { + "epoch": 0.05, + "grad_norm": 0.9462524056434631, + "learning_rate": 0.0004995357890241174, + "loss": 3.6742, + "step": 382 + }, + { + "epoch": 0.05, + "grad_norm": 1.117493748664856, + "learning_rate": 0.0004995294537349612, + "loss": 3.7589, + "step": 383 + }, + { + "epoch": 0.05, + "grad_norm": 1.0731674432754517, + "learning_rate": 0.0004995230755487992, + "loss": 3.7896, + "step": 384 + }, + { + "epoch": 0.05, + "grad_norm": 0.9641139507293701, + "learning_rate": 0.0004995166544667278, + "loss": 3.6979, + "step": 385 + }, + { + "epoch": 0.05, + "grad_norm": 0.982068657875061, + "learning_rate": 0.0004995101904898507, + "loss": 3.6966, + "step": 386 + }, + { + "epoch": 0.05, + "grad_norm": 0.9323788285255432, + "learning_rate": 0.0004995036836192793, + "loss": 3.7303, + "step": 387 + }, + { + "epoch": 0.05, + "grad_norm": 1.083184838294983, + "learning_rate": 0.0004994971338561323, + "loss": 3.721, + "step": 388 + }, + { + "epoch": 0.05, + "grad_norm": 1.071229100227356, + "learning_rate": 0.0004994905412015356, + "loss": 3.7621, + "step": 389 + }, + { + "epoch": 0.05, + "grad_norm": 1.0086432695388794, + "learning_rate": 0.0004994839056566225, + "loss": 3.8187, + "step": 390 + }, + { + "epoch": 0.05, + "grad_norm": 0.9265891313552856, + "learning_rate": 0.0004994772272225337, + "loss": 3.7917, + "step": 391 + }, + { + "epoch": 0.05, + "grad_norm": 1.0019060373306274, + "learning_rate": 0.0004994705059004174, + "loss": 3.648, + "step": 392 + }, + { + "epoch": 0.05, + "grad_norm": 1.1281073093414307, + "learning_rate": 0.000499463741691429, + "loss": 3.7661, + "step": 393 + }, + { + "epoch": 0.05, + "grad_norm": 1.1330609321594238, + "learning_rate": 0.0004994569345967314, + "loss": 3.6828, + "step": 394 + }, + { + "epoch": 0.05, + "grad_norm": 1.4806430339813232, + "learning_rate": 0.0004994500846174947, + "loss": 3.8634, + "step": 395 + }, + { + "epoch": 0.05, + "grad_norm": 1.1160086393356323, + "learning_rate": 0.0004994431917548966, + "loss": 3.6678, + "step": 396 + }, + { + "epoch": 0.05, + "grad_norm": 0.9635677933692932, + "learning_rate": 0.0004994362560101221, + "loss": 3.7078, + "step": 397 + }, + { + "epoch": 0.05, + "grad_norm": 1.0511142015457153, + "learning_rate": 0.0004994292773843635, + "loss": 3.7122, + "step": 398 + }, + { + "epoch": 0.05, + "grad_norm": 1.0360469818115234, + "learning_rate": 0.0004994222558788204, + "loss": 3.8101, + "step": 399 + }, + { + "epoch": 0.05, + "grad_norm": 0.9969998002052307, + "learning_rate": 0.0004994151914947001, + "loss": 3.6297, + "step": 400 + }, + { + "epoch": 0.05, + "grad_norm": 1.1051974296569824, + "learning_rate": 0.0004994080842332168, + "loss": 3.7338, + "step": 401 + }, + { + "epoch": 0.05, + "grad_norm": 0.9800593256950378, + "learning_rate": 0.0004994009340955924, + "loss": 3.8023, + "step": 402 + }, + { + "epoch": 0.05, + "grad_norm": 0.9644533395767212, + "learning_rate": 0.0004993937410830561, + "loss": 3.7269, + "step": 403 + }, + { + "epoch": 0.05, + "grad_norm": 1.1003131866455078, + "learning_rate": 0.0004993865051968447, + "loss": 3.7922, + "step": 404 + }, + { + "epoch": 0.05, + "grad_norm": 1.1602671146392822, + "learning_rate": 0.0004993792264382017, + "loss": 3.656, + "step": 405 + }, + { + "epoch": 0.05, + "grad_norm": 1.0589845180511475, + "learning_rate": 0.0004993719048083788, + "loss": 3.6774, + "step": 406 + }, + { + "epoch": 0.05, + "grad_norm": 1.1800709962844849, + "learning_rate": 0.0004993645403086344, + "loss": 3.8222, + "step": 407 + }, + { + "epoch": 0.05, + "grad_norm": 1.1202436685562134, + "learning_rate": 0.0004993571329402346, + "loss": 3.8316, + "step": 408 + }, + { + "epoch": 0.05, + "grad_norm": 1.017310380935669, + "learning_rate": 0.0004993496827044529, + "loss": 3.6912, + "step": 409 + }, + { + "epoch": 0.05, + "grad_norm": 1.1052725315093994, + "learning_rate": 0.00049934218960257, + "loss": 3.8175, + "step": 410 + }, + { + "epoch": 0.05, + "grad_norm": 1.2691749334335327, + "learning_rate": 0.0004993346536358742, + "loss": 3.7636, + "step": 411 + }, + { + "epoch": 0.05, + "grad_norm": 1.0025804042816162, + "learning_rate": 0.0004993270748056607, + "loss": 3.7851, + "step": 412 + }, + { + "epoch": 0.05, + "grad_norm": 1.0031421184539795, + "learning_rate": 0.0004993194531132328, + "loss": 3.7159, + "step": 413 + }, + { + "epoch": 0.05, + "grad_norm": 1.0373765230178833, + "learning_rate": 0.0004993117885599004, + "loss": 3.6204, + "step": 414 + }, + { + "epoch": 0.05, + "grad_norm": 0.9760125875473022, + "learning_rate": 0.0004993040811469815, + "loss": 3.7047, + "step": 415 + }, + { + "epoch": 0.05, + "grad_norm": 1.1238311529159546, + "learning_rate": 0.0004992963308758006, + "loss": 3.8231, + "step": 416 + }, + { + "epoch": 0.05, + "grad_norm": 1.0206835269927979, + "learning_rate": 0.0004992885377476904, + "loss": 3.8061, + "step": 417 + }, + { + "epoch": 0.05, + "grad_norm": 0.9614518284797668, + "learning_rate": 0.0004992807017639906, + "loss": 3.6469, + "step": 418 + }, + { + "epoch": 0.05, + "grad_norm": 1.1135566234588623, + "learning_rate": 0.0004992728229260484, + "loss": 3.7795, + "step": 419 + }, + { + "epoch": 0.05, + "grad_norm": 1.140252709388733, + "learning_rate": 0.000499264901235218, + "loss": 3.7001, + "step": 420 + }, + { + "epoch": 0.05, + "grad_norm": 1.1376891136169434, + "learning_rate": 0.0004992569366928612, + "loss": 3.6246, + "step": 421 + }, + { + "epoch": 0.05, + "grad_norm": 1.0964652299880981, + "learning_rate": 0.0004992489293003475, + "loss": 3.6815, + "step": 422 + }, + { + "epoch": 0.05, + "grad_norm": 0.9947781562805176, + "learning_rate": 0.0004992408790590534, + "loss": 3.7321, + "step": 423 + }, + { + "epoch": 0.05, + "grad_norm": 1.073929786682129, + "learning_rate": 0.0004992327859703626, + "loss": 3.7496, + "step": 424 + }, + { + "epoch": 0.05, + "grad_norm": 1.0482063293457031, + "learning_rate": 0.0004992246500356665, + "loss": 3.6617, + "step": 425 + }, + { + "epoch": 0.05, + "grad_norm": 1.0412204265594482, + "learning_rate": 0.0004992164712563639, + "loss": 3.7565, + "step": 426 + }, + { + "epoch": 0.05, + "grad_norm": 1.1041027307510376, + "learning_rate": 0.0004992082496338607, + "loss": 3.7869, + "step": 427 + }, + { + "epoch": 0.05, + "grad_norm": 0.9422649145126343, + "learning_rate": 0.0004991999851695704, + "loss": 3.6871, + "step": 428 + }, + { + "epoch": 0.05, + "grad_norm": 0.9400534629821777, + "learning_rate": 0.0004991916778649135, + "loss": 3.7735, + "step": 429 + }, + { + "epoch": 0.06, + "grad_norm": 0.8632922172546387, + "learning_rate": 0.0004991833277213183, + "loss": 3.7803, + "step": 430 + }, + { + "epoch": 0.06, + "grad_norm": 0.9322618246078491, + "learning_rate": 0.0004991749347402205, + "loss": 3.7936, + "step": 431 + }, + { + "epoch": 0.06, + "grad_norm": 1.067453384399414, + "learning_rate": 0.0004991664989230626, + "loss": 3.8727, + "step": 432 + }, + { + "epoch": 0.06, + "grad_norm": 0.9637219309806824, + "learning_rate": 0.0004991580202712949, + "loss": 3.6587, + "step": 433 + }, + { + "epoch": 0.06, + "grad_norm": 1.019960880279541, + "learning_rate": 0.0004991494987863751, + "loss": 3.7486, + "step": 434 + }, + { + "epoch": 0.06, + "grad_norm": 0.9948220252990723, + "learning_rate": 0.0004991409344697681, + "loss": 3.7524, + "step": 435 + }, + { + "epoch": 0.06, + "grad_norm": 0.9766054749488831, + "learning_rate": 0.0004991323273229461, + "loss": 3.5528, + "step": 436 + }, + { + "epoch": 0.06, + "grad_norm": 0.9753291606903076, + "learning_rate": 0.0004991236773473889, + "loss": 3.7247, + "step": 437 + }, + { + "epoch": 0.06, + "grad_norm": 1.0976330041885376, + "learning_rate": 0.0004991149845445834, + "loss": 3.5883, + "step": 438 + }, + { + "epoch": 0.06, + "grad_norm": 1.1932387351989746, + "learning_rate": 0.0004991062489160241, + "loss": 3.754, + "step": 439 + }, + { + "epoch": 0.06, + "grad_norm": 0.9403764605522156, + "learning_rate": 0.0004990974704632128, + "loss": 3.6795, + "step": 440 + }, + { + "epoch": 0.06, + "grad_norm": 1.0581969022750854, + "learning_rate": 0.0004990886491876583, + "loss": 3.7334, + "step": 441 + }, + { + "epoch": 0.06, + "grad_norm": 0.9379636645317078, + "learning_rate": 0.0004990797850908775, + "loss": 3.7665, + "step": 442 + }, + { + "epoch": 0.06, + "grad_norm": 0.97157883644104, + "learning_rate": 0.0004990708781743942, + "loss": 3.6782, + "step": 443 + }, + { + "epoch": 0.06, + "grad_norm": 0.9525388479232788, + "learning_rate": 0.0004990619284397393, + "loss": 3.8114, + "step": 444 + }, + { + "epoch": 0.06, + "grad_norm": 0.8939195275306702, + "learning_rate": 0.0004990529358884515, + "loss": 3.7224, + "step": 445 + }, + { + "epoch": 0.06, + "grad_norm": 1.0911587476730347, + "learning_rate": 0.0004990439005220768, + "loss": 3.7623, + "step": 446 + }, + { + "epoch": 0.06, + "grad_norm": 1.012174367904663, + "learning_rate": 0.0004990348223421685, + "loss": 3.7234, + "step": 447 + }, + { + "epoch": 0.06, + "grad_norm": 1.0412369966506958, + "learning_rate": 0.0004990257013502871, + "loss": 3.8048, + "step": 448 + }, + { + "epoch": 0.06, + "grad_norm": 1.0075637102127075, + "learning_rate": 0.0004990165375480007, + "loss": 3.7575, + "step": 449 + }, + { + "epoch": 0.06, + "grad_norm": 1.041245460510254, + "learning_rate": 0.0004990073309368847, + "loss": 3.6959, + "step": 450 + }, + { + "epoch": 0.06, + "grad_norm": 0.9612911343574524, + "learning_rate": 0.0004989980815185217, + "loss": 3.7159, + "step": 451 + }, + { + "epoch": 0.06, + "grad_norm": 1.071912407875061, + "learning_rate": 0.0004989887892945018, + "loss": 3.6945, + "step": 452 + }, + { + "epoch": 0.06, + "grad_norm": 1.0854805707931519, + "learning_rate": 0.0004989794542664226, + "loss": 3.766, + "step": 453 + }, + { + "epoch": 0.06, + "grad_norm": 1.0065691471099854, + "learning_rate": 0.0004989700764358888, + "loss": 3.6882, + "step": 454 + }, + { + "epoch": 0.06, + "grad_norm": 1.0710570812225342, + "learning_rate": 0.0004989606558045126, + "loss": 3.6353, + "step": 455 + }, + { + "epoch": 0.06, + "grad_norm": 1.0000087022781372, + "learning_rate": 0.0004989511923739133, + "loss": 3.6365, + "step": 456 + }, + { + "epoch": 0.06, + "grad_norm": 0.9962633848190308, + "learning_rate": 0.0004989416861457181, + "loss": 3.7622, + "step": 457 + }, + { + "epoch": 0.06, + "grad_norm": 1.0738786458969116, + "learning_rate": 0.000498932137121561, + "loss": 3.7331, + "step": 458 + }, + { + "epoch": 0.06, + "grad_norm": 0.9874961972236633, + "learning_rate": 0.0004989225453030837, + "loss": 3.6447, + "step": 459 + }, + { + "epoch": 0.06, + "grad_norm": 0.9918415546417236, + "learning_rate": 0.000498912910691935, + "loss": 3.6844, + "step": 460 + }, + { + "epoch": 0.06, + "grad_norm": 0.9814335107803345, + "learning_rate": 0.0004989032332897714, + "loss": 3.7584, + "step": 461 + }, + { + "epoch": 0.06, + "grad_norm": 0.9892833828926086, + "learning_rate": 0.0004988935130982564, + "loss": 3.6989, + "step": 462 + }, + { + "epoch": 0.06, + "grad_norm": 1.126273274421692, + "learning_rate": 0.000498883750119061, + "loss": 3.6964, + "step": 463 + }, + { + "epoch": 0.06, + "grad_norm": 1.0810917615890503, + "learning_rate": 0.0004988739443538638, + "loss": 3.6984, + "step": 464 + }, + { + "epoch": 0.06, + "grad_norm": 1.0964237451553345, + "learning_rate": 0.0004988640958043504, + "loss": 3.7274, + "step": 465 + }, + { + "epoch": 0.06, + "grad_norm": 1.0471069812774658, + "learning_rate": 0.0004988542044722138, + "loss": 3.735, + "step": 466 + }, + { + "epoch": 0.06, + "grad_norm": 1.0022653341293335, + "learning_rate": 0.0004988442703591545, + "loss": 3.7472, + "step": 467 + }, + { + "epoch": 0.06, + "grad_norm": 0.9772243499755859, + "learning_rate": 0.0004988342934668801, + "loss": 3.7419, + "step": 468 + }, + { + "epoch": 0.06, + "grad_norm": 1.0073323249816895, + "learning_rate": 0.0004988242737971061, + "loss": 3.7259, + "step": 469 + }, + { + "epoch": 0.06, + "grad_norm": 0.921754777431488, + "learning_rate": 0.0004988142113515548, + "loss": 3.6942, + "step": 470 + }, + { + "epoch": 0.06, + "grad_norm": 0.8903483152389526, + "learning_rate": 0.000498804106131956, + "loss": 3.6989, + "step": 471 + }, + { + "epoch": 0.06, + "grad_norm": 0.968635082244873, + "learning_rate": 0.000498793958140047, + "loss": 3.7787, + "step": 472 + }, + { + "epoch": 0.06, + "grad_norm": 0.9479289650917053, + "learning_rate": 0.0004987837673775723, + "loss": 3.6884, + "step": 473 + }, + { + "epoch": 0.06, + "grad_norm": 1.1050664186477661, + "learning_rate": 0.000498773533846284, + "loss": 3.6962, + "step": 474 + }, + { + "epoch": 0.06, + "grad_norm": 0.9367997646331787, + "learning_rate": 0.000498763257547941, + "loss": 3.6899, + "step": 475 + }, + { + "epoch": 0.06, + "grad_norm": 1.061821699142456, + "learning_rate": 0.0004987529384843102, + "loss": 3.5729, + "step": 476 + }, + { + "epoch": 0.06, + "grad_norm": 0.9343668818473816, + "learning_rate": 0.0004987425766571655, + "loss": 3.6983, + "step": 477 + }, + { + "epoch": 0.06, + "grad_norm": 1.063883900642395, + "learning_rate": 0.0004987321720682882, + "loss": 3.7127, + "step": 478 + }, + { + "epoch": 0.06, + "grad_norm": 1.0057201385498047, + "learning_rate": 0.000498721724719467, + "loss": 3.8145, + "step": 479 + }, + { + "epoch": 0.06, + "grad_norm": 0.9163222908973694, + "learning_rate": 0.0004987112346124978, + "loss": 3.625, + "step": 480 + }, + { + "epoch": 0.06, + "grad_norm": 0.9235097169876099, + "learning_rate": 0.0004987007017491842, + "loss": 3.6672, + "step": 481 + }, + { + "epoch": 0.06, + "grad_norm": 1.2759689092636108, + "learning_rate": 0.0004986901261313366, + "loss": 3.8087, + "step": 482 + }, + { + "epoch": 0.06, + "grad_norm": 1.0272804498672485, + "learning_rate": 0.0004986795077607733, + "loss": 3.7867, + "step": 483 + }, + { + "epoch": 0.06, + "grad_norm": 0.9846420884132385, + "learning_rate": 0.0004986688466393198, + "loss": 3.6247, + "step": 484 + }, + { + "epoch": 0.06, + "grad_norm": 1.0052454471588135, + "learning_rate": 0.0004986581427688086, + "loss": 3.7265, + "step": 485 + }, + { + "epoch": 0.06, + "grad_norm": 0.9914617538452148, + "learning_rate": 0.0004986473961510801, + "loss": 3.6451, + "step": 486 + }, + { + "epoch": 0.06, + "grad_norm": 0.9145068526268005, + "learning_rate": 0.0004986366067879815, + "loss": 3.8237, + "step": 487 + }, + { + "epoch": 0.06, + "grad_norm": 1.084983468055725, + "learning_rate": 0.0004986257746813678, + "loss": 3.6703, + "step": 488 + }, + { + "epoch": 0.06, + "grad_norm": 1.0903581380844116, + "learning_rate": 0.0004986148998331011, + "loss": 3.6616, + "step": 489 + }, + { + "epoch": 0.06, + "grad_norm": 0.89190673828125, + "learning_rate": 0.0004986039822450509, + "loss": 3.7088, + "step": 490 + }, + { + "epoch": 0.06, + "grad_norm": 1.0566867589950562, + "learning_rate": 0.0004985930219190942, + "loss": 3.7702, + "step": 491 + }, + { + "epoch": 0.06, + "grad_norm": 0.9850021004676819, + "learning_rate": 0.0004985820188571151, + "loss": 3.6232, + "step": 492 + }, + { + "epoch": 0.06, + "grad_norm": 1.0485903024673462, + "learning_rate": 0.0004985709730610051, + "loss": 3.7174, + "step": 493 + }, + { + "epoch": 0.06, + "grad_norm": 0.9693613052368164, + "learning_rate": 0.0004985598845326631, + "loss": 3.6675, + "step": 494 + }, + { + "epoch": 0.06, + "grad_norm": 1.0052372217178345, + "learning_rate": 0.0004985487532739955, + "loss": 3.7537, + "step": 495 + }, + { + "epoch": 0.06, + "grad_norm": 1.14641273021698, + "learning_rate": 0.0004985375792869157, + "loss": 3.6944, + "step": 496 + }, + { + "epoch": 0.06, + "grad_norm": 0.9626569747924805, + "learning_rate": 0.0004985263625733447, + "loss": 3.6664, + "step": 497 + }, + { + "epoch": 0.06, + "grad_norm": 1.0079176425933838, + "learning_rate": 0.0004985151031352108, + "loss": 3.7537, + "step": 498 + }, + { + "epoch": 0.06, + "grad_norm": 0.9485942721366882, + "learning_rate": 0.0004985038009744498, + "loss": 3.7139, + "step": 499 + }, + { + "epoch": 0.06, + "grad_norm": 1.0791044235229492, + "learning_rate": 0.0004984924560930044, + "loss": 3.7171, + "step": 500 + }, + { + "epoch": 0.06, + "grad_norm": 0.9445507526397705, + "learning_rate": 0.0004984810684928251, + "loss": 3.764, + "step": 501 + }, + { + "epoch": 0.06, + "grad_norm": 1.0104825496673584, + "learning_rate": 0.0004984696381758693, + "loss": 3.7245, + "step": 502 + }, + { + "epoch": 0.06, + "grad_norm": 0.9643205404281616, + "learning_rate": 0.0004984581651441024, + "loss": 3.7313, + "step": 503 + }, + { + "epoch": 0.06, + "grad_norm": 1.0203408002853394, + "learning_rate": 0.0004984466493994964, + "loss": 3.7902, + "step": 504 + }, + { + "epoch": 0.06, + "grad_norm": 0.9394329786300659, + "learning_rate": 0.0004984350909440311, + "loss": 3.5563, + "step": 505 + }, + { + "epoch": 0.06, + "grad_norm": 0.996798038482666, + "learning_rate": 0.0004984234897796937, + "loss": 3.7845, + "step": 506 + }, + { + "epoch": 0.06, + "grad_norm": 0.9923459887504578, + "learning_rate": 0.0004984118459084783, + "loss": 3.7403, + "step": 507 + }, + { + "epoch": 0.07, + "grad_norm": 1.4790393114089966, + "learning_rate": 0.0004984001593323868, + "loss": 3.6578, + "step": 508 + }, + { + "epoch": 0.07, + "grad_norm": 1.0736733675003052, + "learning_rate": 0.0004983884300534283, + "loss": 3.7221, + "step": 509 + }, + { + "epoch": 0.07, + "grad_norm": 0.9900732636451721, + "learning_rate": 0.000498376658073619, + "loss": 3.5539, + "step": 510 + }, + { + "epoch": 0.07, + "grad_norm": 0.9970335364341736, + "learning_rate": 0.0004983648433949828, + "loss": 3.6776, + "step": 511 + }, + { + "epoch": 0.07, + "grad_norm": 1.0395703315734863, + "learning_rate": 0.0004983529860195507, + "loss": 3.712, + "step": 512 + }, + { + "epoch": 0.07, + "grad_norm": 0.904090940952301, + "learning_rate": 0.0004983410859493611, + "loss": 3.6631, + "step": 513 + }, + { + "epoch": 0.07, + "grad_norm": 0.9814820289611816, + "learning_rate": 0.0004983291431864599, + "loss": 3.6864, + "step": 514 + }, + { + "epoch": 0.07, + "grad_norm": 1.0582082271575928, + "learning_rate": 0.0004983171577329001, + "loss": 3.6732, + "step": 515 + }, + { + "epoch": 0.07, + "grad_norm": 1.0297218561172485, + "learning_rate": 0.0004983051295907421, + "loss": 3.8229, + "step": 516 + }, + { + "epoch": 0.07, + "grad_norm": 0.9799910187721252, + "learning_rate": 0.0004982930587620537, + "loss": 3.6739, + "step": 517 + }, + { + "epoch": 0.07, + "grad_norm": 1.02519690990448, + "learning_rate": 0.0004982809452489101, + "loss": 3.6983, + "step": 518 + }, + { + "epoch": 0.07, + "grad_norm": 0.9558513164520264, + "learning_rate": 0.0004982687890533936, + "loss": 3.5971, + "step": 519 + }, + { + "epoch": 0.07, + "grad_norm": 1.03365957736969, + "learning_rate": 0.0004982565901775943, + "loss": 3.8095, + "step": 520 + }, + { + "epoch": 0.07, + "grad_norm": 1.0138533115386963, + "learning_rate": 0.0004982443486236089, + "loss": 3.7355, + "step": 521 + }, + { + "epoch": 0.07, + "grad_norm": 1.0666346549987793, + "learning_rate": 0.0004982320643935421, + "loss": 3.7923, + "step": 522 + }, + { + "epoch": 0.07, + "grad_norm": 1.0163183212280273, + "learning_rate": 0.0004982197374895058, + "loss": 3.7284, + "step": 523 + }, + { + "epoch": 0.07, + "grad_norm": 1.1301817893981934, + "learning_rate": 0.0004982073679136189, + "loss": 3.758, + "step": 524 + }, + { + "epoch": 0.07, + "grad_norm": 1.0546684265136719, + "learning_rate": 0.000498194955668008, + "loss": 3.7278, + "step": 525 + }, + { + "epoch": 0.07, + "grad_norm": 0.9236823916435242, + "learning_rate": 0.000498182500754807, + "loss": 3.6028, + "step": 526 + }, + { + "epoch": 0.07, + "grad_norm": 0.940798282623291, + "learning_rate": 0.0004981700031761567, + "loss": 3.6304, + "step": 527 + }, + { + "epoch": 0.07, + "grad_norm": 1.0910943746566772, + "learning_rate": 0.0004981574629342061, + "loss": 3.7663, + "step": 528 + }, + { + "epoch": 0.07, + "grad_norm": 0.9817875027656555, + "learning_rate": 0.0004981448800311105, + "loss": 3.7661, + "step": 529 + }, + { + "epoch": 0.07, + "grad_norm": 0.9952785968780518, + "learning_rate": 0.0004981322544690335, + "loss": 3.7484, + "step": 530 + }, + { + "epoch": 0.07, + "grad_norm": 0.9223330020904541, + "learning_rate": 0.0004981195862501452, + "loss": 3.6418, + "step": 531 + }, + { + "epoch": 0.07, + "grad_norm": 1.0352973937988281, + "learning_rate": 0.0004981068753766237, + "loss": 3.7409, + "step": 532 + }, + { + "epoch": 0.07, + "grad_norm": 1.0839977264404297, + "learning_rate": 0.000498094121850654, + "loss": 3.729, + "step": 533 + }, + { + "epoch": 0.07, + "grad_norm": 1.0956603288650513, + "learning_rate": 0.0004980813256744286, + "loss": 3.6688, + "step": 534 + }, + { + "epoch": 0.07, + "grad_norm": 1.0349833965301514, + "learning_rate": 0.0004980684868501472, + "loss": 3.707, + "step": 535 + }, + { + "epoch": 0.07, + "grad_norm": 0.9875361323356628, + "learning_rate": 0.0004980556053800171, + "loss": 3.7233, + "step": 536 + }, + { + "epoch": 0.07, + "grad_norm": 1.0196729898452759, + "learning_rate": 0.0004980426812662527, + "loss": 3.5573, + "step": 537 + }, + { + "epoch": 0.07, + "grad_norm": 0.9404676556587219, + "learning_rate": 0.0004980297145110759, + "loss": 3.639, + "step": 538 + }, + { + "epoch": 0.07, + "grad_norm": 0.9518227577209473, + "learning_rate": 0.0004980167051167158, + "loss": 3.7485, + "step": 539 + }, + { + "epoch": 0.07, + "grad_norm": 1.0937633514404297, + "learning_rate": 0.0004980036530854088, + "loss": 3.679, + "step": 540 + }, + { + "epoch": 0.07, + "grad_norm": 0.9536413550376892, + "learning_rate": 0.0004979905584193986, + "loss": 3.7071, + "step": 541 + }, + { + "epoch": 0.07, + "grad_norm": 0.9441630840301514, + "learning_rate": 0.0004979774211209367, + "loss": 3.628, + "step": 542 + }, + { + "epoch": 0.07, + "grad_norm": 0.9995726943016052, + "learning_rate": 0.0004979642411922811, + "loss": 3.6734, + "step": 543 + }, + { + "epoch": 0.07, + "grad_norm": 0.9759696125984192, + "learning_rate": 0.0004979510186356979, + "loss": 3.7699, + "step": 544 + }, + { + "epoch": 0.07, + "grad_norm": 0.9783552289009094, + "learning_rate": 0.00049793775345346, + "loss": 3.7217, + "step": 545 + }, + { + "epoch": 0.07, + "grad_norm": 1.1201858520507812, + "learning_rate": 0.000497924445647848, + "loss": 3.7629, + "step": 546 + }, + { + "epoch": 0.07, + "grad_norm": 1.0142176151275635, + "learning_rate": 0.0004979110952211496, + "loss": 3.7308, + "step": 547 + }, + { + "epoch": 0.07, + "grad_norm": 0.9891877770423889, + "learning_rate": 0.00049789770217566, + "loss": 3.5363, + "step": 548 + }, + { + "epoch": 0.07, + "grad_norm": 0.984727144241333, + "learning_rate": 0.0004978842665136814, + "loss": 3.7136, + "step": 549 + }, + { + "epoch": 0.07, + "grad_norm": 0.9455428123474121, + "learning_rate": 0.0004978707882375237, + "loss": 3.7227, + "step": 550 + }, + { + "epoch": 0.07, + "grad_norm": 0.9751055836677551, + "learning_rate": 0.000497857267349504, + "loss": 3.7307, + "step": 551 + }, + { + "epoch": 0.07, + "grad_norm": 0.9746074080467224, + "learning_rate": 0.0004978437038519465, + "loss": 3.6678, + "step": 552 + }, + { + "epoch": 0.07, + "grad_norm": 0.972135066986084, + "learning_rate": 0.0004978300977471832, + "loss": 3.8389, + "step": 553 + }, + { + "epoch": 0.07, + "grad_norm": 0.883399486541748, + "learning_rate": 0.0004978164490375529, + "loss": 3.7556, + "step": 554 + }, + { + "epoch": 0.07, + "grad_norm": 0.8993391990661621, + "learning_rate": 0.000497802757725402, + "loss": 3.6435, + "step": 555 + }, + { + "epoch": 0.07, + "grad_norm": 0.9784232378005981, + "learning_rate": 0.0004977890238130844, + "loss": 3.6716, + "step": 556 + }, + { + "epoch": 0.07, + "grad_norm": 1.0136913061141968, + "learning_rate": 0.000497775247302961, + "loss": 3.6346, + "step": 557 + }, + { + "epoch": 0.07, + "grad_norm": 1.1846816539764404, + "learning_rate": 0.0004977614281974002, + "loss": 3.7713, + "step": 558 + }, + { + "epoch": 0.07, + "grad_norm": 1.601712703704834, + "learning_rate": 0.0004977475664987773, + "loss": 3.6996, + "step": 559 + }, + { + "epoch": 0.07, + "grad_norm": 0.9286705851554871, + "learning_rate": 0.0004977336622094759, + "loss": 3.6156, + "step": 560 + }, + { + "epoch": 0.07, + "grad_norm": 0.9827843308448792, + "learning_rate": 0.0004977197153318858, + "loss": 3.6352, + "step": 561 + }, + { + "epoch": 0.07, + "grad_norm": 1.022339105606079, + "learning_rate": 0.000497705725868405, + "loss": 3.664, + "step": 562 + }, + { + "epoch": 0.07, + "grad_norm": 0.9422778487205505, + "learning_rate": 0.0004976916938214381, + "loss": 3.6822, + "step": 563 + }, + { + "epoch": 0.07, + "grad_norm": 0.8874156475067139, + "learning_rate": 0.0004976776191933976, + "loss": 3.651, + "step": 564 + }, + { + "epoch": 0.07, + "grad_norm": 0.979041337966919, + "learning_rate": 0.000497663501986703, + "loss": 3.6669, + "step": 565 + }, + { + "epoch": 0.07, + "grad_norm": 1.0292575359344482, + "learning_rate": 0.0004976493422037813, + "loss": 3.727, + "step": 566 + }, + { + "epoch": 0.07, + "grad_norm": 0.9404982328414917, + "learning_rate": 0.0004976351398470667, + "loss": 3.7099, + "step": 567 + }, + { + "epoch": 0.07, + "grad_norm": 1.0229207277297974, + "learning_rate": 0.0004976208949190007, + "loss": 3.6508, + "step": 568 + }, + { + "epoch": 0.07, + "grad_norm": 0.935306191444397, + "learning_rate": 0.0004976066074220322, + "loss": 3.8563, + "step": 569 + }, + { + "epoch": 0.07, + "grad_norm": 0.9937973618507385, + "learning_rate": 0.0004975922773586173, + "loss": 3.7509, + "step": 570 + }, + { + "epoch": 0.07, + "grad_norm": 0.9270636439323425, + "learning_rate": 0.0004975779047312198, + "loss": 3.6503, + "step": 571 + }, + { + "epoch": 0.07, + "grad_norm": 1.0206480026245117, + "learning_rate": 0.0004975634895423101, + "loss": 3.8005, + "step": 572 + }, + { + "epoch": 0.07, + "grad_norm": 1.0152143239974976, + "learning_rate": 0.0004975490317943666, + "loss": 3.7362, + "step": 573 + }, + { + "epoch": 0.07, + "grad_norm": 0.946738600730896, + "learning_rate": 0.0004975345314898747, + "loss": 3.585, + "step": 574 + }, + { + "epoch": 0.07, + "grad_norm": 0.9271712303161621, + "learning_rate": 0.0004975199886313272, + "loss": 3.6641, + "step": 575 + }, + { + "epoch": 0.07, + "grad_norm": 0.9697306752204895, + "learning_rate": 0.0004975054032212241, + "loss": 3.7098, + "step": 576 + }, + { + "epoch": 0.07, + "grad_norm": 0.8851280808448792, + "learning_rate": 0.0004974907752620729, + "loss": 3.6736, + "step": 577 + }, + { + "epoch": 0.07, + "grad_norm": 0.9549417495727539, + "learning_rate": 0.0004974761047563883, + "loss": 3.536, + "step": 578 + }, + { + "epoch": 0.07, + "grad_norm": 0.9481258392333984, + "learning_rate": 0.0004974613917066923, + "loss": 3.6974, + "step": 579 + }, + { + "epoch": 0.07, + "grad_norm": 1.0045398473739624, + "learning_rate": 0.0004974466361155141, + "loss": 3.7196, + "step": 580 + }, + { + "epoch": 0.07, + "grad_norm": 0.9385789036750793, + "learning_rate": 0.0004974318379853907, + "loss": 3.724, + "step": 581 + }, + { + "epoch": 0.07, + "grad_norm": 1.0042544603347778, + "learning_rate": 0.0004974169973188658, + "loss": 3.6453, + "step": 582 + }, + { + "epoch": 0.07, + "grad_norm": 0.9435723423957825, + "learning_rate": 0.0004974021141184908, + "loss": 3.7277, + "step": 583 + }, + { + "epoch": 0.07, + "grad_norm": 0.9772533774375916, + "learning_rate": 0.0004973871883868242, + "loss": 3.6198, + "step": 584 + }, + { + "epoch": 0.07, + "grad_norm": 1.0190731287002563, + "learning_rate": 0.0004973722201264319, + "loss": 3.7273, + "step": 585 + }, + { + "epoch": 0.08, + "grad_norm": 1.0444413423538208, + "learning_rate": 0.0004973572093398873, + "loss": 3.7142, + "step": 586 + }, + { + "epoch": 0.08, + "grad_norm": 0.9854151606559753, + "learning_rate": 0.0004973421560297707, + "loss": 3.632, + "step": 587 + }, + { + "epoch": 0.08, + "grad_norm": 0.9795171618461609, + "learning_rate": 0.00049732706019867, + "loss": 3.5712, + "step": 588 + }, + { + "epoch": 0.08, + "grad_norm": 0.9471103549003601, + "learning_rate": 0.0004973119218491805, + "loss": 3.6515, + "step": 589 + }, + { + "epoch": 0.08, + "grad_norm": 1.0504335165023804, + "learning_rate": 0.0004972967409839045, + "loss": 3.7154, + "step": 590 + }, + { + "epoch": 0.08, + "grad_norm": 0.8781349062919617, + "learning_rate": 0.0004972815176054519, + "loss": 3.6802, + "step": 591 + }, + { + "epoch": 0.08, + "grad_norm": 0.8920720815658569, + "learning_rate": 0.0004972662517164396, + "loss": 3.5971, + "step": 592 + }, + { + "epoch": 0.08, + "grad_norm": 0.8624701499938965, + "learning_rate": 0.0004972509433194922, + "loss": 3.7251, + "step": 593 + }, + { + "epoch": 0.08, + "grad_norm": 0.8881915211677551, + "learning_rate": 0.0004972355924172411, + "loss": 3.6237, + "step": 594 + }, + { + "epoch": 0.08, + "grad_norm": 1.0436822175979614, + "learning_rate": 0.0004972201990123255, + "loss": 3.6556, + "step": 595 + }, + { + "epoch": 0.08, + "grad_norm": 0.9346268177032471, + "learning_rate": 0.0004972047631073917, + "loss": 3.7369, + "step": 596 + }, + { + "epoch": 0.08, + "grad_norm": 0.9199187159538269, + "learning_rate": 0.0004971892847050933, + "loss": 3.697, + "step": 597 + }, + { + "epoch": 0.08, + "grad_norm": 0.8995476961135864, + "learning_rate": 0.0004971737638080912, + "loss": 3.5557, + "step": 598 + }, + { + "epoch": 0.08, + "grad_norm": 0.8915233612060547, + "learning_rate": 0.0004971582004190536, + "loss": 3.6867, + "step": 599 + }, + { + "epoch": 0.08, + "grad_norm": 0.9910312294960022, + "learning_rate": 0.000497142594540656, + "loss": 3.7324, + "step": 600 + }, + { + "epoch": 0.08, + "grad_norm": 0.9255796074867249, + "learning_rate": 0.0004971269461755813, + "loss": 3.6542, + "step": 601 + }, + { + "epoch": 0.08, + "grad_norm": 0.9073544144630432, + "learning_rate": 0.0004971112553265195, + "loss": 3.6904, + "step": 602 + }, + { + "epoch": 0.08, + "grad_norm": 0.9548100233078003, + "learning_rate": 0.0004970955219961683, + "loss": 3.8423, + "step": 603 + }, + { + "epoch": 0.08, + "grad_norm": 0.9389156699180603, + "learning_rate": 0.0004970797461872323, + "loss": 3.8538, + "step": 604 + }, + { + "epoch": 0.08, + "grad_norm": 0.9951800107955933, + "learning_rate": 0.0004970639279024235, + "loss": 3.7453, + "step": 605 + }, + { + "epoch": 0.08, + "grad_norm": 0.9465222358703613, + "learning_rate": 0.0004970480671444613, + "loss": 3.727, + "step": 606 + }, + { + "epoch": 0.08, + "grad_norm": 1.013688564300537, + "learning_rate": 0.0004970321639160723, + "loss": 3.7229, + "step": 607 + }, + { + "epoch": 0.08, + "grad_norm": 1.0122108459472656, + "learning_rate": 0.0004970162182199904, + "loss": 3.5292, + "step": 608 + }, + { + "epoch": 0.08, + "grad_norm": 1.031880497932434, + "learning_rate": 0.0004970002300589571, + "loss": 3.718, + "step": 609 + }, + { + "epoch": 0.08, + "grad_norm": 0.9461362957954407, + "learning_rate": 0.0004969841994357207, + "loss": 3.7372, + "step": 610 + }, + { + "epoch": 0.08, + "grad_norm": 0.8112237453460693, + "learning_rate": 0.0004969681263530372, + "loss": 3.556, + "step": 611 + }, + { + "epoch": 0.08, + "grad_norm": 0.9459866881370544, + "learning_rate": 0.0004969520108136696, + "loss": 3.7211, + "step": 612 + }, + { + "epoch": 0.08, + "grad_norm": 0.87845778465271, + "learning_rate": 0.0004969358528203884, + "loss": 3.6556, + "step": 613 + }, + { + "epoch": 0.08, + "grad_norm": 0.8740374445915222, + "learning_rate": 0.0004969196523759713, + "loss": 3.6772, + "step": 614 + }, + { + "epoch": 0.08, + "grad_norm": 0.9206240177154541, + "learning_rate": 0.0004969034094832036, + "loss": 3.6392, + "step": 615 + }, + { + "epoch": 0.08, + "grad_norm": 0.9973964691162109, + "learning_rate": 0.0004968871241448774, + "loss": 3.6835, + "step": 616 + }, + { + "epoch": 0.08, + "grad_norm": 0.8693141937255859, + "learning_rate": 0.0004968707963637924, + "loss": 3.6174, + "step": 617 + }, + { + "epoch": 0.08, + "grad_norm": 0.9450755715370178, + "learning_rate": 0.0004968544261427555, + "loss": 3.7216, + "step": 618 + }, + { + "epoch": 0.08, + "grad_norm": 0.9990179538726807, + "learning_rate": 0.0004968380134845811, + "loss": 3.8213, + "step": 619 + }, + { + "epoch": 0.08, + "grad_norm": 0.8987605571746826, + "learning_rate": 0.0004968215583920903, + "loss": 3.614, + "step": 620 + }, + { + "epoch": 0.08, + "grad_norm": 0.8715751767158508, + "learning_rate": 0.0004968050608681125, + "loss": 3.5939, + "step": 621 + }, + { + "epoch": 0.08, + "grad_norm": 0.9483709335327148, + "learning_rate": 0.0004967885209154833, + "loss": 3.6713, + "step": 622 + }, + { + "epoch": 0.08, + "grad_norm": 0.9490387439727783, + "learning_rate": 0.0004967719385370465, + "loss": 3.5721, + "step": 623 + }, + { + "epoch": 0.08, + "grad_norm": 0.9424468278884888, + "learning_rate": 0.0004967553137356527, + "loss": 3.7448, + "step": 624 + }, + { + "epoch": 0.08, + "grad_norm": 1.0020875930786133, + "learning_rate": 0.0004967386465141596, + "loss": 3.6202, + "step": 625 + }, + { + "epoch": 0.08, + "grad_norm": 0.9360184073448181, + "learning_rate": 0.0004967219368754329, + "loss": 3.5842, + "step": 626 + }, + { + "epoch": 0.08, + "grad_norm": 0.961264431476593, + "learning_rate": 0.000496705184822345, + "loss": 3.6114, + "step": 627 + }, + { + "epoch": 0.08, + "grad_norm": 0.9263641238212585, + "learning_rate": 0.0004966883903577757, + "loss": 3.5792, + "step": 628 + }, + { + "epoch": 0.08, + "grad_norm": 0.8265263438224792, + "learning_rate": 0.0004966715534846123, + "loss": 3.6818, + "step": 629 + }, + { + "epoch": 0.08, + "grad_norm": 0.9687116742134094, + "learning_rate": 0.0004966546742057491, + "loss": 3.7072, + "step": 630 + }, + { + "epoch": 0.08, + "grad_norm": 0.9059762358665466, + "learning_rate": 0.000496637752524088, + "loss": 3.6933, + "step": 631 + }, + { + "epoch": 0.08, + "grad_norm": 0.9927087426185608, + "learning_rate": 0.000496620788442538, + "loss": 3.593, + "step": 632 + }, + { + "epoch": 0.08, + "grad_norm": 0.9516920447349548, + "learning_rate": 0.0004966037819640153, + "loss": 3.6701, + "step": 633 + }, + { + "epoch": 0.08, + "grad_norm": 0.9022125005722046, + "learning_rate": 0.0004965867330914437, + "loss": 3.6787, + "step": 634 + }, + { + "epoch": 0.08, + "grad_norm": 0.8861287832260132, + "learning_rate": 0.000496569641827754, + "loss": 3.7285, + "step": 635 + }, + { + "epoch": 0.08, + "grad_norm": 0.8411892056465149, + "learning_rate": 0.0004965525081758843, + "loss": 3.7808, + "step": 636 + }, + { + "epoch": 0.08, + "grad_norm": 0.8907370567321777, + "learning_rate": 0.0004965353321387803, + "loss": 3.6188, + "step": 637 + }, + { + "epoch": 0.08, + "grad_norm": 0.9059764742851257, + "learning_rate": 0.0004965181137193946, + "loss": 3.6357, + "step": 638 + }, + { + "epoch": 0.08, + "grad_norm": 0.9706633687019348, + "learning_rate": 0.0004965008529206872, + "loss": 3.4703, + "step": 639 + }, + { + "epoch": 0.08, + "grad_norm": 0.8777306079864502, + "learning_rate": 0.0004964835497456255, + "loss": 3.6519, + "step": 640 + }, + { + "epoch": 0.08, + "grad_norm": 0.9662359356880188, + "learning_rate": 0.0004964662041971841, + "loss": 3.6009, + "step": 641 + }, + { + "epoch": 0.08, + "grad_norm": 0.9643966555595398, + "learning_rate": 0.000496448816278345, + "loss": 3.5765, + "step": 642 + }, + { + "epoch": 0.08, + "grad_norm": 0.965904951095581, + "learning_rate": 0.0004964313859920972, + "loss": 3.7324, + "step": 643 + }, + { + "epoch": 0.08, + "grad_norm": 0.9563618898391724, + "learning_rate": 0.0004964139133414373, + "loss": 3.7137, + "step": 644 + }, + { + "epoch": 0.08, + "grad_norm": 0.9152633547782898, + "learning_rate": 0.0004963963983293691, + "loss": 3.7103, + "step": 645 + }, + { + "epoch": 0.08, + "grad_norm": 0.92173832654953, + "learning_rate": 0.0004963788409589035, + "loss": 3.6352, + "step": 646 + }, + { + "epoch": 0.08, + "grad_norm": 1.0018508434295654, + "learning_rate": 0.0004963612412330589, + "loss": 3.6581, + "step": 647 + }, + { + "epoch": 0.08, + "grad_norm": 0.9896060228347778, + "learning_rate": 0.0004963435991548608, + "loss": 3.7006, + "step": 648 + }, + { + "epoch": 0.08, + "grad_norm": 0.9749443531036377, + "learning_rate": 0.0004963259147273422, + "loss": 3.8044, + "step": 649 + }, + { + "epoch": 0.08, + "grad_norm": 0.8835206031799316, + "learning_rate": 0.0004963081879535431, + "loss": 3.6068, + "step": 650 + }, + { + "epoch": 0.08, + "grad_norm": 0.8468639850616455, + "learning_rate": 0.0004962904188365112, + "loss": 3.7224, + "step": 651 + }, + { + "epoch": 0.08, + "grad_norm": 0.8811071515083313, + "learning_rate": 0.000496272607379301, + "loss": 3.6173, + "step": 652 + }, + { + "epoch": 0.08, + "grad_norm": 0.8909751772880554, + "learning_rate": 0.0004962547535849745, + "loss": 3.6294, + "step": 653 + }, + { + "epoch": 0.08, + "grad_norm": 0.9179403185844421, + "learning_rate": 0.0004962368574566011, + "loss": 3.6692, + "step": 654 + }, + { + "epoch": 0.08, + "grad_norm": 0.989331066608429, + "learning_rate": 0.0004962189189972573, + "loss": 3.6605, + "step": 655 + }, + { + "epoch": 0.08, + "grad_norm": 0.9141680598258972, + "learning_rate": 0.0004962009382100268, + "loss": 3.7447, + "step": 656 + }, + { + "epoch": 0.08, + "grad_norm": 1.0156253576278687, + "learning_rate": 0.0004961829150980009, + "loss": 3.8072, + "step": 657 + }, + { + "epoch": 0.08, + "grad_norm": 0.904231071472168, + "learning_rate": 0.0004961648496642778, + "loss": 3.649, + "step": 658 + }, + { + "epoch": 0.08, + "grad_norm": 1.1194738149642944, + "learning_rate": 0.0004961467419119634, + "loss": 3.7544, + "step": 659 + }, + { + "epoch": 0.08, + "grad_norm": 0.9274888634681702, + "learning_rate": 0.0004961285918441704, + "loss": 3.6904, + "step": 660 + }, + { + "epoch": 0.08, + "grad_norm": 0.9246535897254944, + "learning_rate": 0.0004961103994640192, + "loss": 3.5769, + "step": 661 + }, + { + "epoch": 0.08, + "grad_norm": 1.013225793838501, + "learning_rate": 0.000496092164774637, + "loss": 3.6117, + "step": 662 + }, + { + "epoch": 0.08, + "grad_norm": 0.9869426488876343, + "learning_rate": 0.0004960738877791589, + "loss": 3.601, + "step": 663 + }, + { + "epoch": 0.08, + "grad_norm": 0.9690237045288086, + "learning_rate": 0.0004960555684807266, + "loss": 3.6647, + "step": 664 + }, + { + "epoch": 0.09, + "grad_norm": 0.9183991551399231, + "learning_rate": 0.0004960372068824896, + "loss": 3.6451, + "step": 665 + }, + { + "epoch": 0.09, + "grad_norm": 0.9161506295204163, + "learning_rate": 0.0004960188029876044, + "loss": 3.6659, + "step": 666 + }, + { + "epoch": 0.09, + "grad_norm": 1.0477174520492554, + "learning_rate": 0.000496000356799235, + "loss": 3.7068, + "step": 667 + }, + { + "epoch": 0.09, + "grad_norm": 0.9970843195915222, + "learning_rate": 0.0004959818683205523, + "loss": 3.7798, + "step": 668 + }, + { + "epoch": 0.09, + "grad_norm": 0.9160288572311401, + "learning_rate": 0.0004959633375547348, + "loss": 3.6924, + "step": 669 + }, + { + "epoch": 0.09, + "grad_norm": 0.8371219635009766, + "learning_rate": 0.0004959447645049681, + "loss": 3.6469, + "step": 670 + }, + { + "epoch": 0.09, + "grad_norm": 1.2257660627365112, + "learning_rate": 0.0004959261491744452, + "loss": 3.6198, + "step": 671 + }, + { + "epoch": 0.09, + "grad_norm": 0.9227054715156555, + "learning_rate": 0.0004959074915663661, + "loss": 3.5663, + "step": 672 + }, + { + "epoch": 0.09, + "grad_norm": 0.9690736532211304, + "learning_rate": 0.0004958887916839386, + "loss": 3.7607, + "step": 673 + }, + { + "epoch": 0.09, + "grad_norm": 0.9061338305473328, + "learning_rate": 0.000495870049530377, + "loss": 3.7882, + "step": 674 + }, + { + "epoch": 0.09, + "grad_norm": 0.9429351091384888, + "learning_rate": 0.0004958512651089036, + "loss": 3.6591, + "step": 675 + }, + { + "epoch": 0.09, + "grad_norm": 0.8772246241569519, + "learning_rate": 0.0004958324384227477, + "loss": 3.5851, + "step": 676 + }, + { + "epoch": 0.09, + "grad_norm": 0.8267006874084473, + "learning_rate": 0.0004958135694751455, + "loss": 3.6412, + "step": 677 + }, + { + "epoch": 0.09, + "grad_norm": 0.8813744783401489, + "learning_rate": 0.0004957946582693412, + "loss": 3.7528, + "step": 678 + }, + { + "epoch": 0.09, + "grad_norm": 0.9163822531700134, + "learning_rate": 0.0004957757048085856, + "loss": 3.7047, + "step": 679 + }, + { + "epoch": 0.09, + "grad_norm": 0.8698851466178894, + "learning_rate": 0.0004957567090961369, + "loss": 3.7463, + "step": 680 + }, + { + "epoch": 0.09, + "grad_norm": 0.9418593049049377, + "learning_rate": 0.000495737671135261, + "loss": 3.7552, + "step": 681 + }, + { + "epoch": 0.09, + "grad_norm": 0.9326760768890381, + "learning_rate": 0.0004957185909292306, + "loss": 3.7517, + "step": 682 + }, + { + "epoch": 0.09, + "grad_norm": 0.9184606075286865, + "learning_rate": 0.0004956994684813257, + "loss": 3.6209, + "step": 683 + }, + { + "epoch": 0.09, + "grad_norm": 0.8675989508628845, + "learning_rate": 0.0004956803037948338, + "loss": 3.704, + "step": 684 + }, + { + "epoch": 0.09, + "grad_norm": 0.9268028140068054, + "learning_rate": 0.0004956610968730495, + "loss": 3.7821, + "step": 685 + }, + { + "epoch": 0.09, + "grad_norm": 0.9288507699966431, + "learning_rate": 0.0004956418477192748, + "loss": 3.6035, + "step": 686 + }, + { + "epoch": 0.09, + "grad_norm": 0.8930710554122925, + "learning_rate": 0.0004956225563368187, + "loss": 3.7, + "step": 687 + }, + { + "epoch": 0.09, + "grad_norm": 1.0005407333374023, + "learning_rate": 0.0004956032227289976, + "loss": 3.7182, + "step": 688 + }, + { + "epoch": 0.09, + "grad_norm": 1.0349626541137695, + "learning_rate": 0.0004955838468991353, + "loss": 3.6968, + "step": 689 + }, + { + "epoch": 0.09, + "grad_norm": 0.8456838130950928, + "learning_rate": 0.0004955644288505627, + "loss": 3.6307, + "step": 690 + }, + { + "epoch": 0.09, + "grad_norm": 0.9580467343330383, + "learning_rate": 0.0004955449685866179, + "loss": 3.6693, + "step": 691 + }, + { + "epoch": 0.09, + "grad_norm": 0.8633206486701965, + "learning_rate": 0.0004955254661106464, + "loss": 3.563, + "step": 692 + }, + { + "epoch": 0.09, + "grad_norm": 0.9008590579032898, + "learning_rate": 0.000495505921426001, + "loss": 3.5747, + "step": 693 + }, + { + "epoch": 0.09, + "grad_norm": 0.9075567722320557, + "learning_rate": 0.0004954863345360414, + "loss": 3.5997, + "step": 694 + }, + { + "epoch": 0.09, + "grad_norm": 0.8881756663322449, + "learning_rate": 0.000495466705444135, + "loss": 3.5613, + "step": 695 + }, + { + "epoch": 0.09, + "grad_norm": 0.8162988424301147, + "learning_rate": 0.0004954470341536563, + "loss": 3.623, + "step": 696 + }, + { + "epoch": 0.09, + "grad_norm": 0.9488250017166138, + "learning_rate": 0.000495427320667987, + "loss": 3.6673, + "step": 697 + }, + { + "epoch": 0.09, + "grad_norm": 0.8821372985839844, + "learning_rate": 0.000495407564990516, + "loss": 3.5725, + "step": 698 + }, + { + "epoch": 0.09, + "grad_norm": 0.8585495352745056, + "learning_rate": 0.0004953877671246395, + "loss": 3.5775, + "step": 699 + }, + { + "epoch": 0.09, + "grad_norm": 0.8403851985931396, + "learning_rate": 0.0004953679270737611, + "loss": 3.8019, + "step": 700 + }, + { + "epoch": 0.09, + "grad_norm": 0.9948201179504395, + "learning_rate": 0.0004953480448412914, + "loss": 3.6131, + "step": 701 + }, + { + "epoch": 0.09, + "grad_norm": 0.8229441046714783, + "learning_rate": 0.0004953281204306487, + "loss": 3.5401, + "step": 702 + }, + { + "epoch": 0.09, + "grad_norm": 0.8649178147315979, + "learning_rate": 0.0004953081538452579, + "loss": 3.4134, + "step": 703 + }, + { + "epoch": 0.09, + "grad_norm": 0.8966211080551147, + "learning_rate": 0.0004952881450885515, + "loss": 3.7468, + "step": 704 + }, + { + "epoch": 0.09, + "grad_norm": 1.0185855627059937, + "learning_rate": 0.0004952680941639694, + "loss": 3.6969, + "step": 705 + }, + { + "epoch": 0.09, + "grad_norm": 0.8968634605407715, + "learning_rate": 0.0004952480010749585, + "loss": 3.676, + "step": 706 + }, + { + "epoch": 0.09, + "grad_norm": 0.8802434206008911, + "learning_rate": 0.0004952278658249731, + "loss": 3.6423, + "step": 707 + }, + { + "epoch": 0.09, + "grad_norm": 0.9503376483917236, + "learning_rate": 0.0004952076884174746, + "loss": 3.6148, + "step": 708 + }, + { + "epoch": 0.09, + "grad_norm": 0.9169230461120605, + "learning_rate": 0.0004951874688559318, + "loss": 3.6117, + "step": 709 + }, + { + "epoch": 0.09, + "grad_norm": 0.9358011484146118, + "learning_rate": 0.0004951672071438207, + "loss": 3.7518, + "step": 710 + }, + { + "epoch": 0.09, + "grad_norm": 0.9428926110267639, + "learning_rate": 0.0004951469032846243, + "loss": 3.7773, + "step": 711 + }, + { + "epoch": 0.09, + "grad_norm": 0.9884240627288818, + "learning_rate": 0.0004951265572818334, + "loss": 3.6447, + "step": 712 + }, + { + "epoch": 0.09, + "grad_norm": 0.8522127866744995, + "learning_rate": 0.0004951061691389455, + "loss": 3.5563, + "step": 713 + }, + { + "epoch": 0.09, + "grad_norm": 0.8171650767326355, + "learning_rate": 0.0004950857388594656, + "loss": 3.6495, + "step": 714 + }, + { + "epoch": 0.09, + "grad_norm": 0.8851253390312195, + "learning_rate": 0.0004950652664469059, + "loss": 3.6131, + "step": 715 + }, + { + "epoch": 0.09, + "grad_norm": 0.8465543985366821, + "learning_rate": 0.000495044751904786, + "loss": 3.6781, + "step": 716 + }, + { + "epoch": 0.09, + "grad_norm": 0.9340558648109436, + "learning_rate": 0.0004950241952366322, + "loss": 3.7341, + "step": 717 + }, + { + "epoch": 0.09, + "grad_norm": 0.8248949646949768, + "learning_rate": 0.0004950035964459789, + "loss": 3.5921, + "step": 718 + }, + { + "epoch": 0.09, + "grad_norm": 0.9273523092269897, + "learning_rate": 0.0004949829555363669, + "loss": 3.6564, + "step": 719 + }, + { + "epoch": 0.09, + "grad_norm": 0.9097087979316711, + "learning_rate": 0.0004949622725113448, + "loss": 3.5832, + "step": 720 + }, + { + "epoch": 0.09, + "grad_norm": 0.799820601940155, + "learning_rate": 0.0004949415473744683, + "loss": 3.6226, + "step": 721 + }, + { + "epoch": 0.09, + "grad_norm": 0.9359858632087708, + "learning_rate": 0.0004949207801293001, + "loss": 3.6449, + "step": 722 + }, + { + "epoch": 0.09, + "grad_norm": 0.8636605739593506, + "learning_rate": 0.0004948999707794105, + "loss": 3.6189, + "step": 723 + }, + { + "epoch": 0.09, + "grad_norm": 0.8730395436286926, + "learning_rate": 0.0004948791193283765, + "loss": 3.6658, + "step": 724 + }, + { + "epoch": 0.09, + "grad_norm": 0.9037566781044006, + "learning_rate": 0.0004948582257797834, + "loss": 3.6856, + "step": 725 + }, + { + "epoch": 0.09, + "grad_norm": 0.8765959143638611, + "learning_rate": 0.0004948372901372224, + "loss": 3.5039, + "step": 726 + }, + { + "epoch": 0.09, + "grad_norm": 0.9535045623779297, + "learning_rate": 0.000494816312404293, + "loss": 3.6947, + "step": 727 + }, + { + "epoch": 0.09, + "grad_norm": 0.9718337059020996, + "learning_rate": 0.0004947952925846013, + "loss": 3.5829, + "step": 728 + }, + { + "epoch": 0.09, + "grad_norm": 0.9038040041923523, + "learning_rate": 0.0004947742306817608, + "loss": 3.5878, + "step": 729 + }, + { + "epoch": 0.09, + "grad_norm": 0.9002828598022461, + "learning_rate": 0.0004947531266993924, + "loss": 3.6424, + "step": 730 + }, + { + "epoch": 0.09, + "grad_norm": 0.9017542600631714, + "learning_rate": 0.0004947319806411243, + "loss": 3.7354, + "step": 731 + }, + { + "epoch": 0.09, + "grad_norm": 0.8402687907218933, + "learning_rate": 0.0004947107925105912, + "loss": 3.6144, + "step": 732 + }, + { + "epoch": 0.09, + "grad_norm": 0.8400694727897644, + "learning_rate": 0.0004946895623114362, + "loss": 3.6566, + "step": 733 + }, + { + "epoch": 0.09, + "grad_norm": 0.899815022945404, + "learning_rate": 0.0004946682900473087, + "loss": 3.6615, + "step": 734 + }, + { + "epoch": 0.09, + "grad_norm": 0.9010912179946899, + "learning_rate": 0.0004946469757218657, + "loss": 3.7581, + "step": 735 + }, + { + "epoch": 0.09, + "grad_norm": 0.8260379433631897, + "learning_rate": 0.0004946256193387714, + "loss": 3.6277, + "step": 736 + }, + { + "epoch": 0.09, + "grad_norm": 0.8734118342399597, + "learning_rate": 0.000494604220901697, + "loss": 3.5827, + "step": 737 + }, + { + "epoch": 0.09, + "grad_norm": 0.8689841032028198, + "learning_rate": 0.0004945827804143216, + "loss": 3.7219, + "step": 738 + }, + { + "epoch": 0.09, + "grad_norm": 0.8634633421897888, + "learning_rate": 0.0004945612978803307, + "loss": 3.4742, + "step": 739 + }, + { + "epoch": 0.09, + "grad_norm": 0.9236171841621399, + "learning_rate": 0.0004945397733034175, + "loss": 3.6558, + "step": 740 + }, + { + "epoch": 0.09, + "grad_norm": 0.9720011949539185, + "learning_rate": 0.0004945182066872823, + "loss": 3.6096, + "step": 741 + }, + { + "epoch": 0.09, + "grad_norm": 0.9223105311393738, + "learning_rate": 0.0004944965980356326, + "loss": 3.6497, + "step": 742 + }, + { + "epoch": 0.1, + "grad_norm": 0.8663374781608582, + "learning_rate": 0.0004944749473521834, + "loss": 3.695, + "step": 743 + }, + { + "epoch": 0.1, + "grad_norm": 0.7778543829917908, + "learning_rate": 0.0004944532546406565, + "loss": 3.6928, + "step": 744 + }, + { + "epoch": 0.1, + "grad_norm": 0.9464306831359863, + "learning_rate": 0.0004944315199047812, + "loss": 3.7198, + "step": 745 + }, + { + "epoch": 0.1, + "grad_norm": 0.871847927570343, + "learning_rate": 0.0004944097431482939, + "loss": 3.6822, + "step": 746 + }, + { + "epoch": 0.1, + "grad_norm": 0.8734187483787537, + "learning_rate": 0.0004943879243749382, + "loss": 3.647, + "step": 747 + }, + { + "epoch": 0.1, + "grad_norm": 0.9731336832046509, + "learning_rate": 0.0004943660635884652, + "loss": 3.6557, + "step": 748 + }, + { + "epoch": 0.1, + "grad_norm": 0.9924262166023254, + "learning_rate": 0.000494344160792633, + "loss": 3.7084, + "step": 749 + }, + { + "epoch": 0.1, + "grad_norm": 0.9291201829910278, + "learning_rate": 0.0004943222159912069, + "loss": 3.6337, + "step": 750 + }, + { + "epoch": 0.1, + "grad_norm": 0.9033045172691345, + "learning_rate": 0.0004943002291879593, + "loss": 3.6388, + "step": 751 + }, + { + "epoch": 0.1, + "grad_norm": 1.0025124549865723, + "learning_rate": 0.0004942782003866703, + "loss": 3.7116, + "step": 752 + }, + { + "epoch": 0.1, + "grad_norm": 0.9180055260658264, + "learning_rate": 0.0004942561295911267, + "loss": 3.6105, + "step": 753 + }, + { + "epoch": 0.1, + "grad_norm": 0.8433054089546204, + "learning_rate": 0.0004942340168051226, + "loss": 3.6694, + "step": 754 + }, + { + "epoch": 0.1, + "grad_norm": 0.9406448602676392, + "learning_rate": 0.0004942118620324597, + "loss": 3.6136, + "step": 755 + }, + { + "epoch": 0.1, + "grad_norm": 0.8985976576805115, + "learning_rate": 0.0004941896652769466, + "loss": 3.6743, + "step": 756 + }, + { + "epoch": 0.1, + "grad_norm": 0.8895725011825562, + "learning_rate": 0.0004941674265423992, + "loss": 3.5532, + "step": 757 + }, + { + "epoch": 0.1, + "grad_norm": 0.8536813855171204, + "learning_rate": 0.0004941451458326404, + "loss": 3.7154, + "step": 758 + }, + { + "epoch": 0.1, + "grad_norm": 0.8130298256874084, + "learning_rate": 0.0004941228231515008, + "loss": 3.4941, + "step": 759 + }, + { + "epoch": 0.1, + "grad_norm": 0.900328516960144, + "learning_rate": 0.0004941004585028177, + "loss": 3.6643, + "step": 760 + }, + { + "epoch": 0.1, + "grad_norm": 0.9134331345558167, + "learning_rate": 0.000494078051890436, + "loss": 3.6062, + "step": 761 + }, + { + "epoch": 0.1, + "grad_norm": 0.8630452156066895, + "learning_rate": 0.0004940556033182075, + "loss": 3.5736, + "step": 762 + }, + { + "epoch": 0.1, + "grad_norm": 0.872016966342926, + "learning_rate": 0.0004940331127899915, + "loss": 3.7443, + "step": 763 + }, + { + "epoch": 0.1, + "grad_norm": 0.8469942808151245, + "learning_rate": 0.0004940105803096544, + "loss": 3.498, + "step": 764 + }, + { + "epoch": 0.1, + "grad_norm": 0.9625217914581299, + "learning_rate": 0.0004939880058810696, + "loss": 3.5907, + "step": 765 + }, + { + "epoch": 0.1, + "grad_norm": 1.0333412885665894, + "learning_rate": 0.0004939653895081181, + "loss": 3.5302, + "step": 766 + }, + { + "epoch": 0.1, + "grad_norm": 0.9309979677200317, + "learning_rate": 0.000493942731194688, + "loss": 3.5486, + "step": 767 + }, + { + "epoch": 0.1, + "grad_norm": 0.873720109462738, + "learning_rate": 0.0004939200309446741, + "loss": 3.6667, + "step": 768 + }, + { + "epoch": 0.1, + "grad_norm": 1.0009433031082153, + "learning_rate": 0.0004938972887619793, + "loss": 3.6041, + "step": 769 + }, + { + "epoch": 0.1, + "grad_norm": 0.8196940422058105, + "learning_rate": 0.0004938745046505129, + "loss": 3.6099, + "step": 770 + }, + { + "epoch": 0.1, + "grad_norm": 0.9370154738426208, + "learning_rate": 0.0004938516786141921, + "loss": 3.7256, + "step": 771 + }, + { + "epoch": 0.1, + "grad_norm": 0.9386774301528931, + "learning_rate": 0.0004938288106569407, + "loss": 3.7586, + "step": 772 + }, + { + "epoch": 0.1, + "grad_norm": 0.7597833275794983, + "learning_rate": 0.0004938059007826901, + "loss": 3.541, + "step": 773 + }, + { + "epoch": 0.1, + "grad_norm": 0.8484575748443604, + "learning_rate": 0.0004937829489953787, + "loss": 3.7377, + "step": 774 + }, + { + "epoch": 0.1, + "grad_norm": 0.830502450466156, + "learning_rate": 0.0004937599552989521, + "loss": 3.6238, + "step": 775 + }, + { + "epoch": 0.1, + "grad_norm": 0.8506859540939331, + "learning_rate": 0.0004937369196973633, + "loss": 3.6765, + "step": 776 + }, + { + "epoch": 0.1, + "grad_norm": 0.8493182063102722, + "learning_rate": 0.0004937138421945724, + "loss": 3.6292, + "step": 777 + }, + { + "epoch": 0.1, + "grad_norm": 0.8898391723632812, + "learning_rate": 0.0004936907227945467, + "loss": 3.6496, + "step": 778 + }, + { + "epoch": 0.1, + "grad_norm": 0.9002225399017334, + "learning_rate": 0.0004936675615012606, + "loss": 3.673, + "step": 779 + }, + { + "epoch": 0.1, + "grad_norm": 0.8686527609825134, + "learning_rate": 0.0004936443583186958, + "loss": 3.663, + "step": 780 + }, + { + "epoch": 0.1, + "grad_norm": 0.8821991086006165, + "learning_rate": 0.0004936211132508413, + "loss": 3.6627, + "step": 781 + }, + { + "epoch": 0.1, + "grad_norm": 0.852533757686615, + "learning_rate": 0.0004935978263016931, + "loss": 3.538, + "step": 782 + }, + { + "epoch": 0.1, + "grad_norm": 0.8961055278778076, + "learning_rate": 0.0004935744974752546, + "loss": 3.6461, + "step": 783 + }, + { + "epoch": 0.1, + "grad_norm": 0.9409698843955994, + "learning_rate": 0.0004935511267755361, + "loss": 3.545, + "step": 784 + }, + { + "epoch": 0.1, + "grad_norm": 0.8036273717880249, + "learning_rate": 0.0004935277142065556, + "loss": 3.5735, + "step": 785 + }, + { + "epoch": 0.1, + "grad_norm": 0.8235346674919128, + "learning_rate": 0.0004935042597723376, + "loss": 3.6634, + "step": 786 + }, + { + "epoch": 0.1, + "grad_norm": 0.8754570484161377, + "learning_rate": 0.0004934807634769145, + "loss": 3.7106, + "step": 787 + }, + { + "epoch": 0.1, + "grad_norm": 0.8720112442970276, + "learning_rate": 0.0004934572253243255, + "loss": 3.5811, + "step": 788 + }, + { + "epoch": 0.1, + "grad_norm": 0.8393718600273132, + "learning_rate": 0.0004934336453186171, + "loss": 3.6455, + "step": 789 + }, + { + "epoch": 0.1, + "grad_norm": 0.795387327671051, + "learning_rate": 0.0004934100234638429, + "loss": 3.7253, + "step": 790 + }, + { + "epoch": 0.1, + "grad_norm": 0.9040440917015076, + "learning_rate": 0.0004933863597640638, + "loss": 3.623, + "step": 791 + }, + { + "epoch": 0.1, + "grad_norm": 0.8612719178199768, + "learning_rate": 0.0004933626542233478, + "loss": 3.7458, + "step": 792 + }, + { + "epoch": 0.1, + "grad_norm": 0.8143512606620789, + "learning_rate": 0.0004933389068457704, + "loss": 3.5961, + "step": 793 + }, + { + "epoch": 0.1, + "grad_norm": 0.7966370582580566, + "learning_rate": 0.0004933151176354138, + "loss": 3.6326, + "step": 794 + }, + { + "epoch": 0.1, + "grad_norm": 0.8990934491157532, + "learning_rate": 0.0004932912865963677, + "loss": 3.6588, + "step": 795 + }, + { + "epoch": 0.1, + "grad_norm": 0.9208589792251587, + "learning_rate": 0.000493267413732729, + "loss": 3.682, + "step": 796 + }, + { + "epoch": 0.1, + "grad_norm": 0.840647280216217, + "learning_rate": 0.0004932434990486017, + "loss": 3.6069, + "step": 797 + }, + { + "epoch": 0.1, + "grad_norm": 0.799932062625885, + "learning_rate": 0.000493219542548097, + "loss": 3.4894, + "step": 798 + }, + { + "epoch": 0.1, + "grad_norm": 0.9299008846282959, + "learning_rate": 0.0004931955442353333, + "loss": 3.7636, + "step": 799 + }, + { + "epoch": 0.1, + "grad_norm": 0.8207058906555176, + "learning_rate": 0.0004931715041144361, + "loss": 3.7038, + "step": 800 + }, + { + "epoch": 0.1, + "grad_norm": 0.9075020551681519, + "learning_rate": 0.0004931474221895383, + "loss": 3.58, + "step": 801 + }, + { + "epoch": 0.1, + "grad_norm": 0.8652316331863403, + "learning_rate": 0.0004931232984647798, + "loss": 3.5738, + "step": 802 + }, + { + "epoch": 0.1, + "grad_norm": 0.9437047243118286, + "learning_rate": 0.0004930991329443079, + "loss": 3.7435, + "step": 803 + }, + { + "epoch": 0.1, + "grad_norm": 0.8851674199104309, + "learning_rate": 0.0004930749256322766, + "loss": 3.7059, + "step": 804 + }, + { + "epoch": 0.1, + "grad_norm": 0.8456943035125732, + "learning_rate": 0.0004930506765328477, + "loss": 3.6934, + "step": 805 + }, + { + "epoch": 0.1, + "grad_norm": 0.8298068046569824, + "learning_rate": 0.0004930263856501899, + "loss": 3.51, + "step": 806 + }, + { + "epoch": 0.1, + "grad_norm": 0.8694912195205688, + "learning_rate": 0.0004930020529884789, + "loss": 3.6908, + "step": 807 + }, + { + "epoch": 0.1, + "grad_norm": 0.8628725409507751, + "learning_rate": 0.000492977678551898, + "loss": 3.7173, + "step": 808 + }, + { + "epoch": 0.1, + "grad_norm": 0.8303777575492859, + "learning_rate": 0.0004929532623446372, + "loss": 3.5715, + "step": 809 + }, + { + "epoch": 0.1, + "grad_norm": 0.7596468329429626, + "learning_rate": 0.000492928804370894, + "loss": 3.535, + "step": 810 + }, + { + "epoch": 0.1, + "grad_norm": 0.9455280900001526, + "learning_rate": 0.0004929043046348732, + "loss": 3.6283, + "step": 811 + }, + { + "epoch": 0.1, + "grad_norm": 0.8608766198158264, + "learning_rate": 0.0004928797631407863, + "loss": 3.6562, + "step": 812 + }, + { + "epoch": 0.1, + "grad_norm": 0.8241950869560242, + "learning_rate": 0.0004928551798928525, + "loss": 3.6279, + "step": 813 + }, + { + "epoch": 0.1, + "grad_norm": 0.836773157119751, + "learning_rate": 0.0004928305548952978, + "loss": 3.5184, + "step": 814 + }, + { + "epoch": 0.1, + "grad_norm": 0.8365269303321838, + "learning_rate": 0.0004928058881523557, + "loss": 3.6565, + "step": 815 + }, + { + "epoch": 0.1, + "grad_norm": 0.8501466512680054, + "learning_rate": 0.0004927811796682666, + "loss": 3.7131, + "step": 816 + }, + { + "epoch": 0.1, + "grad_norm": 0.8528035879135132, + "learning_rate": 0.000492756429447278, + "loss": 3.5438, + "step": 817 + }, + { + "epoch": 0.1, + "grad_norm": 0.8384031057357788, + "learning_rate": 0.0004927316374936449, + "loss": 3.6685, + "step": 818 + }, + { + "epoch": 0.1, + "grad_norm": 1.13548743724823, + "learning_rate": 0.0004927068038116293, + "loss": 3.6312, + "step": 819 + }, + { + "epoch": 0.1, + "grad_norm": 0.8726980090141296, + "learning_rate": 0.0004926819284055006, + "loss": 3.704, + "step": 820 + }, + { + "epoch": 0.11, + "grad_norm": 0.8911085724830627, + "learning_rate": 0.0004926570112795349, + "loss": 3.6911, + "step": 821 + }, + { + "epoch": 0.11, + "grad_norm": 0.8530990481376648, + "learning_rate": 0.0004926320524380159, + "loss": 3.6812, + "step": 822 + }, + { + "epoch": 0.11, + "grad_norm": 0.9349095821380615, + "learning_rate": 0.0004926070518852341, + "loss": 3.5695, + "step": 823 + }, + { + "epoch": 0.11, + "grad_norm": 0.8291351795196533, + "learning_rate": 0.0004925820096254877, + "loss": 3.5025, + "step": 824 + }, + { + "epoch": 0.11, + "grad_norm": 0.9228273630142212, + "learning_rate": 0.0004925569256630816, + "loss": 3.6815, + "step": 825 + }, + { + "epoch": 0.11, + "grad_norm": 0.8727307319641113, + "learning_rate": 0.0004925318000023279, + "loss": 3.5924, + "step": 826 + }, + { + "epoch": 0.11, + "grad_norm": 0.853922426700592, + "learning_rate": 0.0004925066326475461, + "loss": 3.5766, + "step": 827 + }, + { + "epoch": 0.11, + "grad_norm": 0.9327370524406433, + "learning_rate": 0.0004924814236030629, + "loss": 3.6038, + "step": 828 + }, + { + "epoch": 0.11, + "grad_norm": 0.8882296681404114, + "learning_rate": 0.0004924561728732118, + "loss": 3.6118, + "step": 829 + }, + { + "epoch": 0.11, + "grad_norm": 0.8594300746917725, + "learning_rate": 0.0004924308804623339, + "loss": 3.6208, + "step": 830 + }, + { + "epoch": 0.11, + "grad_norm": 0.8235106468200684, + "learning_rate": 0.0004924055463747769, + "loss": 3.5596, + "step": 831 + }, + { + "epoch": 0.11, + "grad_norm": 0.8358860611915588, + "learning_rate": 0.0004923801706148965, + "loss": 3.5078, + "step": 832 + }, + { + "epoch": 0.11, + "grad_norm": 0.863490104675293, + "learning_rate": 0.0004923547531870548, + "loss": 3.7563, + "step": 833 + }, + { + "epoch": 0.11, + "grad_norm": 0.841188371181488, + "learning_rate": 0.0004923292940956215, + "loss": 3.6264, + "step": 834 + }, + { + "epoch": 0.11, + "grad_norm": 0.9689059257507324, + "learning_rate": 0.0004923037933449731, + "loss": 3.6982, + "step": 835 + }, + { + "epoch": 0.11, + "grad_norm": 0.8078739643096924, + "learning_rate": 0.0004922782509394937, + "loss": 3.627, + "step": 836 + }, + { + "epoch": 0.11, + "grad_norm": 0.8360235095024109, + "learning_rate": 0.0004922526668835741, + "loss": 3.5822, + "step": 837 + }, + { + "epoch": 0.11, + "grad_norm": 0.8151078820228577, + "learning_rate": 0.0004922270411816126, + "loss": 3.6288, + "step": 838 + }, + { + "epoch": 0.11, + "grad_norm": 0.8234032392501831, + "learning_rate": 0.0004922013738380147, + "loss": 3.6923, + "step": 839 + }, + { + "epoch": 0.11, + "grad_norm": 0.8429092168807983, + "learning_rate": 0.0004921756648571928, + "loss": 3.6477, + "step": 840 + }, + { + "epoch": 0.11, + "grad_norm": 0.7991187572479248, + "learning_rate": 0.0004921499142435666, + "loss": 3.6118, + "step": 841 + }, + { + "epoch": 0.11, + "grad_norm": 0.7951987981796265, + "learning_rate": 0.0004921241220015627, + "loss": 3.6724, + "step": 842 + }, + { + "epoch": 0.11, + "grad_norm": 0.8163877129554749, + "learning_rate": 0.0004920982881356156, + "loss": 3.6376, + "step": 843 + }, + { + "epoch": 0.11, + "grad_norm": 0.8272448778152466, + "learning_rate": 0.0004920724126501659, + "loss": 3.622, + "step": 844 + }, + { + "epoch": 0.11, + "grad_norm": 0.9418625831604004, + "learning_rate": 0.0004920464955496622, + "loss": 3.6689, + "step": 845 + }, + { + "epoch": 0.11, + "grad_norm": 0.8513910174369812, + "learning_rate": 0.0004920205368385598, + "loss": 3.4536, + "step": 846 + }, + { + "epoch": 0.11, + "grad_norm": 0.7715315222740173, + "learning_rate": 0.0004919945365213214, + "loss": 3.5744, + "step": 847 + }, + { + "epoch": 0.11, + "grad_norm": 0.8094183802604675, + "learning_rate": 0.0004919684946024168, + "loss": 3.7615, + "step": 848 + }, + { + "epoch": 0.11, + "grad_norm": 0.8636196851730347, + "learning_rate": 0.0004919424110863227, + "loss": 3.6143, + "step": 849 + }, + { + "epoch": 0.11, + "grad_norm": 0.8138225674629211, + "learning_rate": 0.0004919162859775235, + "loss": 3.6715, + "step": 850 + }, + { + "epoch": 0.11, + "grad_norm": 0.8703018426895142, + "learning_rate": 0.00049189011928051, + "loss": 3.5849, + "step": 851 + }, + { + "epoch": 0.11, + "grad_norm": 0.7516616582870483, + "learning_rate": 0.0004918639109997809, + "loss": 3.6899, + "step": 852 + }, + { + "epoch": 0.11, + "grad_norm": 0.8473501205444336, + "learning_rate": 0.0004918376611398415, + "loss": 3.6175, + "step": 853 + }, + { + "epoch": 0.11, + "grad_norm": 0.9071518778800964, + "learning_rate": 0.0004918113697052046, + "loss": 3.6823, + "step": 854 + }, + { + "epoch": 0.11, + "grad_norm": 0.7907838225364685, + "learning_rate": 0.0004917850367003898, + "loss": 3.612, + "step": 855 + }, + { + "epoch": 0.11, + "grad_norm": 0.8364929556846619, + "learning_rate": 0.0004917586621299243, + "loss": 3.6682, + "step": 856 + }, + { + "epoch": 0.11, + "grad_norm": 0.8281691670417786, + "learning_rate": 0.0004917322459983421, + "loss": 3.5506, + "step": 857 + }, + { + "epoch": 0.11, + "grad_norm": 0.7997773289680481, + "learning_rate": 0.0004917057883101843, + "loss": 3.5342, + "step": 858 + }, + { + "epoch": 0.11, + "grad_norm": 0.9194799065589905, + "learning_rate": 0.0004916792890699995, + "loss": 3.5594, + "step": 859 + }, + { + "epoch": 0.11, + "grad_norm": 0.8962545394897461, + "learning_rate": 0.000491652748282343, + "loss": 3.7193, + "step": 860 + }, + { + "epoch": 0.11, + "grad_norm": 0.8621538281440735, + "learning_rate": 0.0004916261659517777, + "loss": 3.6989, + "step": 861 + }, + { + "epoch": 0.11, + "grad_norm": 0.7588165402412415, + "learning_rate": 0.0004915995420828732, + "loss": 3.4604, + "step": 862 + }, + { + "epoch": 0.11, + "grad_norm": 0.9234063029289246, + "learning_rate": 0.0004915728766802066, + "loss": 3.6228, + "step": 863 + }, + { + "epoch": 0.11, + "grad_norm": 0.8838184475898743, + "learning_rate": 0.000491546169748362, + "loss": 3.6242, + "step": 864 + }, + { + "epoch": 0.11, + "grad_norm": 0.8651776313781738, + "learning_rate": 0.0004915194212919305, + "loss": 3.7815, + "step": 865 + }, + { + "epoch": 0.11, + "grad_norm": 1.0226529836654663, + "learning_rate": 0.0004914926313155106, + "loss": 3.4453, + "step": 866 + }, + { + "epoch": 0.11, + "grad_norm": 0.8438092470169067, + "learning_rate": 0.0004914657998237078, + "loss": 3.6444, + "step": 867 + }, + { + "epoch": 0.11, + "grad_norm": 0.9107071161270142, + "learning_rate": 0.0004914389268211346, + "loss": 3.7138, + "step": 868 + }, + { + "epoch": 0.11, + "grad_norm": 0.7964327335357666, + "learning_rate": 0.0004914120123124108, + "loss": 3.6776, + "step": 869 + }, + { + "epoch": 0.11, + "grad_norm": 0.8652677536010742, + "learning_rate": 0.0004913850563021636, + "loss": 3.78, + "step": 870 + }, + { + "epoch": 0.11, + "grad_norm": 0.8645572662353516, + "learning_rate": 0.0004913580587950267, + "loss": 3.5867, + "step": 871 + }, + { + "epoch": 0.11, + "grad_norm": 0.805211067199707, + "learning_rate": 0.0004913310197956416, + "loss": 3.6309, + "step": 872 + }, + { + "epoch": 0.11, + "grad_norm": 0.8409008979797363, + "learning_rate": 0.0004913039393086563, + "loss": 3.5873, + "step": 873 + }, + { + "epoch": 0.11, + "grad_norm": 0.8912443518638611, + "learning_rate": 0.0004912768173387264, + "loss": 3.6151, + "step": 874 + }, + { + "epoch": 0.11, + "grad_norm": 0.8684778809547424, + "learning_rate": 0.0004912496538905145, + "loss": 3.6755, + "step": 875 + }, + { + "epoch": 0.11, + "grad_norm": 0.8110811114311218, + "learning_rate": 0.0004912224489686903, + "loss": 3.4404, + "step": 876 + }, + { + "epoch": 0.11, + "grad_norm": 0.9000563025474548, + "learning_rate": 0.0004911952025779306, + "loss": 3.7488, + "step": 877 + }, + { + "epoch": 0.11, + "grad_norm": 0.8925985097885132, + "learning_rate": 0.0004911679147229194, + "loss": 3.7425, + "step": 878 + }, + { + "epoch": 0.11, + "grad_norm": 0.7726958394050598, + "learning_rate": 0.0004911405854083479, + "loss": 3.6524, + "step": 879 + }, + { + "epoch": 0.11, + "grad_norm": 0.786937415599823, + "learning_rate": 0.0004911132146389141, + "loss": 3.6178, + "step": 880 + }, + { + "epoch": 0.11, + "grad_norm": 0.8084043264389038, + "learning_rate": 0.0004910858024193236, + "loss": 3.531, + "step": 881 + }, + { + "epoch": 0.11, + "grad_norm": 0.8019929528236389, + "learning_rate": 0.0004910583487542886, + "loss": 3.7352, + "step": 882 + }, + { + "epoch": 0.11, + "grad_norm": 0.7468410730361938, + "learning_rate": 0.0004910308536485291, + "loss": 3.484, + "step": 883 + }, + { + "epoch": 0.11, + "grad_norm": 0.8613249659538269, + "learning_rate": 0.0004910033171067713, + "loss": 3.5572, + "step": 884 + }, + { + "epoch": 0.11, + "grad_norm": 0.8318970203399658, + "learning_rate": 0.0004909757391337496, + "loss": 3.5578, + "step": 885 + }, + { + "epoch": 0.11, + "grad_norm": 0.8411234617233276, + "learning_rate": 0.0004909481197342046, + "loss": 3.5871, + "step": 886 + }, + { + "epoch": 0.11, + "grad_norm": 0.7730182409286499, + "learning_rate": 0.0004909204589128845, + "loss": 3.6533, + "step": 887 + }, + { + "epoch": 0.11, + "grad_norm": 0.8421241641044617, + "learning_rate": 0.0004908927566745446, + "loss": 3.6285, + "step": 888 + }, + { + "epoch": 0.11, + "grad_norm": 0.8909102082252502, + "learning_rate": 0.000490865013023947, + "loss": 3.6253, + "step": 889 + }, + { + "epoch": 0.11, + "grad_norm": 0.8533161878585815, + "learning_rate": 0.0004908372279658614, + "loss": 3.5395, + "step": 890 + }, + { + "epoch": 0.11, + "grad_norm": 0.8164993524551392, + "learning_rate": 0.0004908094015050643, + "loss": 3.5685, + "step": 891 + }, + { + "epoch": 0.11, + "grad_norm": 0.7901835441589355, + "learning_rate": 0.0004907815336463394, + "loss": 3.6154, + "step": 892 + }, + { + "epoch": 0.11, + "grad_norm": 0.9597400426864624, + "learning_rate": 0.0004907536243944774, + "loss": 3.7743, + "step": 893 + }, + { + "epoch": 0.11, + "grad_norm": 0.8602568507194519, + "learning_rate": 0.0004907256737542765, + "loss": 3.5457, + "step": 894 + }, + { + "epoch": 0.11, + "grad_norm": 0.8954022526741028, + "learning_rate": 0.0004906976817305413, + "loss": 3.5667, + "step": 895 + }, + { + "epoch": 0.11, + "grad_norm": 0.9862939119338989, + "learning_rate": 0.0004906696483280844, + "loss": 3.664, + "step": 896 + }, + { + "epoch": 0.11, + "grad_norm": 0.8378450870513916, + "learning_rate": 0.0004906415735517248, + "loss": 3.68, + "step": 897 + }, + { + "epoch": 0.11, + "grad_norm": 0.7724777460098267, + "learning_rate": 0.0004906134574062888, + "loss": 3.5164, + "step": 898 + }, + { + "epoch": 0.12, + "grad_norm": 0.8759022951126099, + "learning_rate": 0.0004905852998966102, + "loss": 3.5678, + "step": 899 + }, + { + "epoch": 0.12, + "grad_norm": 0.7621853947639465, + "learning_rate": 0.0004905571010275295, + "loss": 3.466, + "step": 900 + }, + { + "epoch": 0.12, + "grad_norm": 0.8283673524856567, + "learning_rate": 0.0004905288608038943, + "loss": 3.5888, + "step": 901 + }, + { + "epoch": 0.12, + "grad_norm": 0.8321616053581238, + "learning_rate": 0.0004905005792305595, + "loss": 3.6799, + "step": 902 + }, + { + "epoch": 0.12, + "grad_norm": 0.7852991223335266, + "learning_rate": 0.0004904722563123869, + "loss": 3.566, + "step": 903 + }, + { + "epoch": 0.12, + "grad_norm": 0.8530229330062866, + "learning_rate": 0.0004904438920542457, + "loss": 3.58, + "step": 904 + }, + { + "epoch": 0.12, + "grad_norm": 0.8264917135238647, + "learning_rate": 0.000490415486461012, + "loss": 3.6062, + "step": 905 + }, + { + "epoch": 0.12, + "grad_norm": 0.7997876405715942, + "learning_rate": 0.0004903870395375691, + "loss": 3.4823, + "step": 906 + }, + { + "epoch": 0.12, + "grad_norm": 0.8135886192321777, + "learning_rate": 0.0004903585512888072, + "loss": 3.5646, + "step": 907 + }, + { + "epoch": 0.12, + "grad_norm": 0.8352776765823364, + "learning_rate": 0.0004903300217196239, + "loss": 3.6718, + "step": 908 + }, + { + "epoch": 0.12, + "grad_norm": 0.8962761163711548, + "learning_rate": 0.0004903014508349237, + "loss": 3.5917, + "step": 909 + }, + { + "epoch": 0.12, + "grad_norm": 0.8091096878051758, + "learning_rate": 0.0004902728386396184, + "loss": 3.6274, + "step": 910 + }, + { + "epoch": 0.12, + "grad_norm": 0.7973507642745972, + "learning_rate": 0.0004902441851386266, + "loss": 3.6302, + "step": 911 + }, + { + "epoch": 0.12, + "grad_norm": 0.8099647164344788, + "learning_rate": 0.0004902154903368741, + "loss": 3.544, + "step": 912 + }, + { + "epoch": 0.12, + "grad_norm": 0.7872130870819092, + "learning_rate": 0.0004901867542392942, + "loss": 3.6173, + "step": 913 + }, + { + "epoch": 0.12, + "grad_norm": 0.8407408595085144, + "learning_rate": 0.0004901579768508265, + "loss": 3.6839, + "step": 914 + }, + { + "epoch": 0.12, + "grad_norm": 0.8333362936973572, + "learning_rate": 0.0004901291581764186, + "loss": 3.6181, + "step": 915 + }, + { + "epoch": 0.12, + "grad_norm": 0.8242466449737549, + "learning_rate": 0.0004901002982210246, + "loss": 3.6812, + "step": 916 + }, + { + "epoch": 0.12, + "grad_norm": 0.7317993640899658, + "learning_rate": 0.0004900713969896059, + "loss": 3.598, + "step": 917 + }, + { + "epoch": 0.12, + "grad_norm": 0.8498689532279968, + "learning_rate": 0.0004900424544871308, + "loss": 3.6426, + "step": 918 + }, + { + "epoch": 0.12, + "grad_norm": 0.8187620043754578, + "learning_rate": 0.0004900134707185751, + "loss": 3.6288, + "step": 919 + }, + { + "epoch": 0.12, + "grad_norm": 0.7988176941871643, + "learning_rate": 0.000489984445688921, + "loss": 3.6693, + "step": 920 + }, + { + "epoch": 0.12, + "grad_norm": 0.9066382646560669, + "learning_rate": 0.0004899553794031589, + "loss": 3.5619, + "step": 921 + }, + { + "epoch": 0.12, + "grad_norm": 0.7891650199890137, + "learning_rate": 0.0004899262718662851, + "loss": 3.8446, + "step": 922 + }, + { + "epoch": 0.12, + "grad_norm": 0.8057774305343628, + "learning_rate": 0.0004898971230833037, + "loss": 3.6976, + "step": 923 + }, + { + "epoch": 0.12, + "grad_norm": 0.8969120979309082, + "learning_rate": 0.0004898679330592259, + "loss": 3.5516, + "step": 924 + }, + { + "epoch": 0.12, + "grad_norm": 0.8318272829055786, + "learning_rate": 0.0004898387017990694, + "loss": 3.525, + "step": 925 + }, + { + "epoch": 0.12, + "grad_norm": 0.8637136220932007, + "learning_rate": 0.0004898094293078597, + "loss": 3.6406, + "step": 926 + }, + { + "epoch": 0.12, + "grad_norm": 0.7998780608177185, + "learning_rate": 0.000489780115590629, + "loss": 3.4636, + "step": 927 + }, + { + "epoch": 0.12, + "grad_norm": 0.8367539048194885, + "learning_rate": 0.0004897507606524167, + "loss": 3.6949, + "step": 928 + }, + { + "epoch": 0.12, + "grad_norm": 0.8102987408638, + "learning_rate": 0.0004897213644982691, + "loss": 3.5146, + "step": 929 + }, + { + "epoch": 0.12, + "grad_norm": 0.7719710469245911, + "learning_rate": 0.0004896919271332399, + "loss": 3.492, + "step": 930 + }, + { + "epoch": 0.12, + "grad_norm": 0.7758731245994568, + "learning_rate": 0.0004896624485623896, + "loss": 3.5188, + "step": 931 + }, + { + "epoch": 0.12, + "grad_norm": 0.7850921154022217, + "learning_rate": 0.0004896329287907862, + "loss": 3.5319, + "step": 932 + }, + { + "epoch": 0.12, + "grad_norm": 0.7877423167228699, + "learning_rate": 0.0004896033678235041, + "loss": 3.6709, + "step": 933 + }, + { + "epoch": 0.12, + "grad_norm": 0.736785352230072, + "learning_rate": 0.0004895737656656254, + "loss": 3.555, + "step": 934 + }, + { + "epoch": 0.12, + "grad_norm": 0.770460307598114, + "learning_rate": 0.000489544122322239, + "loss": 3.6151, + "step": 935 + }, + { + "epoch": 0.12, + "grad_norm": 0.8316922783851624, + "learning_rate": 0.000489514437798441, + "loss": 3.648, + "step": 936 + }, + { + "epoch": 0.12, + "grad_norm": 0.8192751407623291, + "learning_rate": 0.0004894847120993344, + "loss": 3.528, + "step": 937 + }, + { + "epoch": 0.12, + "grad_norm": 0.8254303932189941, + "learning_rate": 0.0004894549452300294, + "loss": 3.6145, + "step": 938 + }, + { + "epoch": 0.12, + "grad_norm": 0.8344351053237915, + "learning_rate": 0.0004894251371956433, + "loss": 3.6076, + "step": 939 + }, + { + "epoch": 0.12, + "grad_norm": 0.8560154438018799, + "learning_rate": 0.0004893952880013005, + "loss": 3.5497, + "step": 940 + }, + { + "epoch": 0.12, + "grad_norm": 0.8088158369064331, + "learning_rate": 0.0004893653976521325, + "loss": 3.5911, + "step": 941 + }, + { + "epoch": 0.12, + "grad_norm": 0.8597379922866821, + "learning_rate": 0.0004893354661532776, + "loss": 3.6256, + "step": 942 + }, + { + "epoch": 0.12, + "grad_norm": 0.7602822780609131, + "learning_rate": 0.0004893054935098814, + "loss": 3.607, + "step": 943 + }, + { + "epoch": 0.12, + "grad_norm": 0.8641725778579712, + "learning_rate": 0.0004892754797270967, + "loss": 3.5751, + "step": 944 + }, + { + "epoch": 0.12, + "grad_norm": 0.8781532049179077, + "learning_rate": 0.000489245424810083, + "loss": 3.6763, + "step": 945 + }, + { + "epoch": 0.12, + "grad_norm": 0.8362348675727844, + "learning_rate": 0.0004892153287640072, + "loss": 3.6466, + "step": 946 + }, + { + "epoch": 0.12, + "grad_norm": 0.8070446252822876, + "learning_rate": 0.0004891851915940433, + "loss": 3.5171, + "step": 947 + }, + { + "epoch": 0.12, + "grad_norm": 0.8270018696784973, + "learning_rate": 0.000489155013305372, + "loss": 3.5635, + "step": 948 + }, + { + "epoch": 0.12, + "grad_norm": 0.8150656819343567, + "learning_rate": 0.0004891247939031814, + "loss": 3.5941, + "step": 949 + }, + { + "epoch": 0.12, + "grad_norm": 0.830181360244751, + "learning_rate": 0.0004890945333926665, + "loss": 3.6124, + "step": 950 + }, + { + "epoch": 0.12, + "grad_norm": 0.808355450630188, + "learning_rate": 0.0004890642317790296, + "loss": 3.5877, + "step": 951 + }, + { + "epoch": 0.12, + "grad_norm": 0.7882115840911865, + "learning_rate": 0.0004890338890674797, + "loss": 3.6326, + "step": 952 + }, + { + "epoch": 0.12, + "grad_norm": 0.828498125076294, + "learning_rate": 0.0004890035052632333, + "loss": 3.5982, + "step": 953 + }, + { + "epoch": 0.12, + "grad_norm": 0.7736223936080933, + "learning_rate": 0.0004889730803715133, + "loss": 3.6843, + "step": 954 + }, + { + "epoch": 0.12, + "grad_norm": 0.8319690227508545, + "learning_rate": 0.0004889426143975507, + "loss": 3.621, + "step": 955 + }, + { + "epoch": 0.12, + "grad_norm": 0.8388370275497437, + "learning_rate": 0.0004889121073465825, + "loss": 3.6456, + "step": 956 + }, + { + "epoch": 0.12, + "grad_norm": 0.8199196457862854, + "learning_rate": 0.0004888815592238532, + "loss": 3.5886, + "step": 957 + }, + { + "epoch": 0.12, + "grad_norm": 0.7664119005203247, + "learning_rate": 0.0004888509700346146, + "loss": 3.5408, + "step": 958 + }, + { + "epoch": 0.12, + "grad_norm": 0.7739691734313965, + "learning_rate": 0.0004888203397841253, + "loss": 3.5643, + "step": 959 + }, + { + "epoch": 0.12, + "grad_norm": 0.7797976136207581, + "learning_rate": 0.0004887896684776508, + "loss": 3.5595, + "step": 960 + }, + { + "epoch": 0.12, + "grad_norm": 0.8414448499679565, + "learning_rate": 0.0004887589561204642, + "loss": 3.5498, + "step": 961 + }, + { + "epoch": 0.12, + "grad_norm": 0.8154247999191284, + "learning_rate": 0.000488728202717845, + "loss": 3.6078, + "step": 962 + }, + { + "epoch": 0.12, + "grad_norm": 0.8003179430961609, + "learning_rate": 0.0004886974082750803, + "loss": 3.4887, + "step": 963 + }, + { + "epoch": 0.12, + "grad_norm": 0.8092113137245178, + "learning_rate": 0.0004886665727974638, + "loss": 3.5388, + "step": 964 + }, + { + "epoch": 0.12, + "grad_norm": 0.7909213304519653, + "learning_rate": 0.0004886356962902965, + "loss": 3.6211, + "step": 965 + }, + { + "epoch": 0.12, + "grad_norm": 0.7772814035415649, + "learning_rate": 0.0004886047787588866, + "loss": 3.594, + "step": 966 + }, + { + "epoch": 0.12, + "grad_norm": 0.801804780960083, + "learning_rate": 0.0004885738202085491, + "loss": 3.6257, + "step": 967 + }, + { + "epoch": 0.12, + "grad_norm": 0.7865604758262634, + "learning_rate": 0.0004885428206446061, + "loss": 3.5625, + "step": 968 + }, + { + "epoch": 0.12, + "grad_norm": 0.8398476839065552, + "learning_rate": 0.0004885117800723869, + "loss": 3.6431, + "step": 969 + }, + { + "epoch": 0.12, + "grad_norm": 0.853718638420105, + "learning_rate": 0.0004884806984972274, + "loss": 3.6405, + "step": 970 + }, + { + "epoch": 0.12, + "grad_norm": 0.8108567595481873, + "learning_rate": 0.0004884495759244713, + "loss": 3.5849, + "step": 971 + }, + { + "epoch": 0.12, + "grad_norm": 0.8177028298377991, + "learning_rate": 0.0004884184123594687, + "loss": 3.6298, + "step": 972 + }, + { + "epoch": 0.12, + "grad_norm": 0.8156768083572388, + "learning_rate": 0.0004883872078075771, + "loss": 3.5049, + "step": 973 + }, + { + "epoch": 0.12, + "grad_norm": 0.8004074692726135, + "learning_rate": 0.0004883559622741607, + "loss": 3.6292, + "step": 974 + }, + { + "epoch": 0.12, + "grad_norm": 0.8555276989936829, + "learning_rate": 0.0004883246757645914, + "loss": 3.6365, + "step": 975 + }, + { + "epoch": 0.12, + "grad_norm": 0.8024552464485168, + "learning_rate": 0.0004882933482842472, + "loss": 3.4914, + "step": 976 + }, + { + "epoch": 0.13, + "grad_norm": 0.7471779584884644, + "learning_rate": 0.00048826197983851396, + "loss": 3.5452, + "step": 977 + }, + { + "epoch": 0.13, + "grad_norm": 0.8516356945037842, + "learning_rate": 0.00048823057043278427, + "loss": 3.6427, + "step": 978 + }, + { + "epoch": 0.13, + "grad_norm": 0.8830351829528809, + "learning_rate": 0.00048819912007245765, + "loss": 3.6421, + "step": 979 + }, + { + "epoch": 0.13, + "grad_norm": 0.8727092742919922, + "learning_rate": 0.0004881676287629409, + "loss": 3.5592, + "step": 980 + }, + { + "epoch": 0.13, + "grad_norm": 0.8191733956336975, + "learning_rate": 0.0004881360965096476, + "loss": 3.5201, + "step": 981 + }, + { + "epoch": 0.13, + "grad_norm": 0.6910346150398254, + "learning_rate": 0.0004881045233179987, + "loss": 3.4483, + "step": 982 + }, + { + "epoch": 0.13, + "grad_norm": 0.8177911639213562, + "learning_rate": 0.00048807290919342187, + "loss": 3.6568, + "step": 983 + }, + { + "epoch": 0.13, + "grad_norm": 0.8026474714279175, + "learning_rate": 0.00048804125414135194, + "loss": 3.6078, + "step": 984 + }, + { + "epoch": 0.13, + "grad_norm": 0.7814860343933105, + "learning_rate": 0.00048800955816723083, + "loss": 3.4803, + "step": 985 + }, + { + "epoch": 0.13, + "grad_norm": 0.772703230381012, + "learning_rate": 0.0004879778212765075, + "loss": 3.5197, + "step": 986 + }, + { + "epoch": 0.13, + "grad_norm": 0.8092082142829895, + "learning_rate": 0.0004879460434746377, + "loss": 3.5248, + "step": 987 + }, + { + "epoch": 0.13, + "grad_norm": 0.8448889255523682, + "learning_rate": 0.0004879142247670846, + "loss": 3.6247, + "step": 988 + }, + { + "epoch": 0.13, + "grad_norm": 0.814842939376831, + "learning_rate": 0.00048788236515931807, + "loss": 3.6079, + "step": 989 + }, + { + "epoch": 0.13, + "grad_norm": 0.789014995098114, + "learning_rate": 0.00048785046465681515, + "loss": 3.5347, + "step": 990 + }, + { + "epoch": 0.13, + "grad_norm": 0.7849494814872742, + "learning_rate": 0.00048781852326506, + "loss": 3.5534, + "step": 991 + }, + { + "epoch": 0.13, + "grad_norm": 0.8483198881149292, + "learning_rate": 0.0004877865409895438, + "loss": 3.734, + "step": 992 + }, + { + "epoch": 0.13, + "grad_norm": 0.7456423044204712, + "learning_rate": 0.0004877545178357644, + "loss": 3.5662, + "step": 993 + }, + { + "epoch": 0.13, + "grad_norm": 0.8502069711685181, + "learning_rate": 0.0004877224538092271, + "loss": 3.6267, + "step": 994 + }, + { + "epoch": 0.13, + "grad_norm": 0.794623851776123, + "learning_rate": 0.00048769034891544416, + "loss": 3.6479, + "step": 995 + }, + { + "epoch": 0.13, + "grad_norm": 0.8556898832321167, + "learning_rate": 0.00048765820315993465, + "loss": 3.5804, + "step": 996 + }, + { + "epoch": 0.13, + "grad_norm": 0.8531051874160767, + "learning_rate": 0.0004876260165482248, + "loss": 3.5223, + "step": 997 + }, + { + "epoch": 0.13, + "grad_norm": 0.8192583322525024, + "learning_rate": 0.000487593789085848, + "loss": 3.5352, + "step": 998 + }, + { + "epoch": 0.13, + "grad_norm": 0.8201632499694824, + "learning_rate": 0.00048756152077834435, + "loss": 3.7469, + "step": 999 + }, + { + "epoch": 0.13, + "grad_norm": 0.767375648021698, + "learning_rate": 0.0004875292116312613, + "loss": 3.66, + "step": 1000 + }, + { + "epoch": 0.13, + "grad_norm": 0.7790427803993225, + "learning_rate": 0.00048749686165015305, + "loss": 3.3572, + "step": 1001 + }, + { + "epoch": 0.13, + "grad_norm": 0.7836480140686035, + "learning_rate": 0.00048746447084058105, + "loss": 3.5477, + "step": 1002 + }, + { + "epoch": 0.13, + "grad_norm": 0.7594608068466187, + "learning_rate": 0.0004874320392081135, + "loss": 3.6529, + "step": 1003 + }, + { + "epoch": 0.13, + "grad_norm": 0.8746945261955261, + "learning_rate": 0.00048739956675832594, + "loss": 3.5856, + "step": 1004 + }, + { + "epoch": 0.13, + "grad_norm": 0.8683459758758545, + "learning_rate": 0.00048736705349680063, + "loss": 3.6031, + "step": 1005 + }, + { + "epoch": 0.13, + "grad_norm": 0.7671228051185608, + "learning_rate": 0.000487334499429127, + "loss": 3.5888, + "step": 1006 + }, + { + "epoch": 0.13, + "grad_norm": 0.7802176475524902, + "learning_rate": 0.00048730190456090154, + "loss": 3.5723, + "step": 1007 + }, + { + "epoch": 0.13, + "grad_norm": 0.7627329230308533, + "learning_rate": 0.00048726926889772773, + "loss": 3.5489, + "step": 1008 + }, + { + "epoch": 0.13, + "grad_norm": 0.8381568193435669, + "learning_rate": 0.00048723659244521576, + "loss": 3.5454, + "step": 1009 + }, + { + "epoch": 0.13, + "grad_norm": 0.7972750663757324, + "learning_rate": 0.00048720387520898343, + "loss": 3.5405, + "step": 1010 + }, + { + "epoch": 0.13, + "grad_norm": 0.8035085797309875, + "learning_rate": 0.0004871711171946549, + "loss": 3.5673, + "step": 1011 + }, + { + "epoch": 0.13, + "grad_norm": 0.8251034021377563, + "learning_rate": 0.00048713831840786195, + "loss": 3.6157, + "step": 1012 + }, + { + "epoch": 0.13, + "grad_norm": 0.785926342010498, + "learning_rate": 0.0004871054788542428, + "loss": 3.5963, + "step": 1013 + }, + { + "epoch": 0.13, + "grad_norm": 0.7499585747718811, + "learning_rate": 0.0004870725985394431, + "loss": 3.4179, + "step": 1014 + }, + { + "epoch": 0.13, + "grad_norm": 0.8161090612411499, + "learning_rate": 0.00048703967746911535, + "loss": 3.6996, + "step": 1015 + }, + { + "epoch": 0.13, + "grad_norm": 0.8196116089820862, + "learning_rate": 0.0004870067156489191, + "loss": 3.6429, + "step": 1016 + }, + { + "epoch": 0.13, + "grad_norm": 0.7670063972473145, + "learning_rate": 0.00048697371308452077, + "loss": 3.6078, + "step": 1017 + }, + { + "epoch": 0.13, + "grad_norm": 0.857550859451294, + "learning_rate": 0.0004869406697815939, + "loss": 3.6364, + "step": 1018 + }, + { + "epoch": 0.13, + "grad_norm": 0.8311168551445007, + "learning_rate": 0.00048690758574581905, + "loss": 3.3805, + "step": 1019 + }, + { + "epoch": 0.13, + "grad_norm": 0.8697710037231445, + "learning_rate": 0.00048687446098288374, + "loss": 3.497, + "step": 1020 + }, + { + "epoch": 0.13, + "grad_norm": 0.8013821244239807, + "learning_rate": 0.0004868412954984825, + "loss": 3.5215, + "step": 1021 + }, + { + "epoch": 0.13, + "grad_norm": 0.8414774537086487, + "learning_rate": 0.0004868080892983169, + "loss": 3.7113, + "step": 1022 + }, + { + "epoch": 0.13, + "grad_norm": 0.7996682524681091, + "learning_rate": 0.0004867748423880954, + "loss": 3.5724, + "step": 1023 + }, + { + "epoch": 0.13, + "grad_norm": 0.7667548060417175, + "learning_rate": 0.00048674155477353354, + "loss": 3.4878, + "step": 1024 + }, + { + "epoch": 0.13, + "grad_norm": 0.7549991011619568, + "learning_rate": 0.00048670822646035397, + "loss": 3.4915, + "step": 1025 + }, + { + "epoch": 0.13, + "grad_norm": 0.809350311756134, + "learning_rate": 0.00048667485745428594, + "loss": 3.5255, + "step": 1026 + }, + { + "epoch": 0.13, + "grad_norm": 0.8527643084526062, + "learning_rate": 0.00048664144776106635, + "loss": 3.5635, + "step": 1027 + }, + { + "epoch": 0.13, + "grad_norm": 0.8219663500785828, + "learning_rate": 0.00048660799738643837, + "loss": 3.5104, + "step": 1028 + }, + { + "epoch": 0.13, + "grad_norm": 0.910542905330658, + "learning_rate": 0.0004865745063361526, + "loss": 3.4771, + "step": 1029 + }, + { + "epoch": 0.13, + "grad_norm": 0.7761480212211609, + "learning_rate": 0.00048654097461596675, + "loss": 3.6563, + "step": 1030 + }, + { + "epoch": 0.13, + "grad_norm": 0.7041064500808716, + "learning_rate": 0.00048650740223164503, + "loss": 3.4496, + "step": 1031 + }, + { + "epoch": 0.13, + "grad_norm": 0.7807188630104065, + "learning_rate": 0.00048647378918895904, + "loss": 3.6611, + "step": 1032 + }, + { + "epoch": 0.13, + "grad_norm": 0.7950218319892883, + "learning_rate": 0.0004864401354936873, + "loss": 3.4923, + "step": 1033 + }, + { + "epoch": 0.13, + "grad_norm": 0.8378319144248962, + "learning_rate": 0.00048640644115161523, + "loss": 3.622, + "step": 1034 + }, + { + "epoch": 0.13, + "grad_norm": 0.7991623878479004, + "learning_rate": 0.00048637270616853527, + "loss": 3.5162, + "step": 1035 + }, + { + "epoch": 0.13, + "grad_norm": 0.8185338973999023, + "learning_rate": 0.0004863389305502467, + "loss": 3.6818, + "step": 1036 + }, + { + "epoch": 0.13, + "grad_norm": 0.8307229280471802, + "learning_rate": 0.0004863051143025563, + "loss": 3.6152, + "step": 1037 + }, + { + "epoch": 0.13, + "grad_norm": 0.7207797169685364, + "learning_rate": 0.0004862712574312771, + "loss": 3.5715, + "step": 1038 + }, + { + "epoch": 0.13, + "grad_norm": 0.8925043344497681, + "learning_rate": 0.00048623735994222976, + "loss": 3.7058, + "step": 1039 + }, + { + "epoch": 0.13, + "grad_norm": 0.8738099336624146, + "learning_rate": 0.00048620342184124156, + "loss": 3.6992, + "step": 1040 + }, + { + "epoch": 0.13, + "grad_norm": 0.7224860191345215, + "learning_rate": 0.0004861694431341468, + "loss": 3.4439, + "step": 1041 + }, + { + "epoch": 0.13, + "grad_norm": 0.8354774713516235, + "learning_rate": 0.0004861354238267868, + "loss": 3.6577, + "step": 1042 + }, + { + "epoch": 0.13, + "grad_norm": 0.9021112322807312, + "learning_rate": 0.00048610136392501004, + "loss": 3.6562, + "step": 1043 + }, + { + "epoch": 0.13, + "grad_norm": 0.9086030721664429, + "learning_rate": 0.0004860672634346716, + "loss": 3.6625, + "step": 1044 + }, + { + "epoch": 0.13, + "grad_norm": 0.7535606622695923, + "learning_rate": 0.0004860331223616339, + "loss": 3.6021, + "step": 1045 + }, + { + "epoch": 0.13, + "grad_norm": 0.8022335171699524, + "learning_rate": 0.0004859989407117661, + "loss": 3.68, + "step": 1046 + }, + { + "epoch": 0.13, + "grad_norm": 0.7648836374282837, + "learning_rate": 0.0004859647184909445, + "loss": 3.6551, + "step": 1047 + }, + { + "epoch": 0.13, + "grad_norm": 0.7472444176673889, + "learning_rate": 0.00048593045570505223, + "loss": 3.5951, + "step": 1048 + }, + { + "epoch": 0.13, + "grad_norm": 0.7668543457984924, + "learning_rate": 0.00048589615235997954, + "loss": 3.4882, + "step": 1049 + }, + { + "epoch": 0.13, + "grad_norm": 0.7949318289756775, + "learning_rate": 0.0004858618084616234, + "loss": 3.6551, + "step": 1050 + }, + { + "epoch": 0.13, + "grad_norm": 0.7682514190673828, + "learning_rate": 0.00048582742401588814, + "loss": 3.5208, + "step": 1051 + }, + { + "epoch": 0.13, + "grad_norm": 0.7590826749801636, + "learning_rate": 0.0004857929990286847, + "loss": 3.632, + "step": 1052 + }, + { + "epoch": 0.13, + "grad_norm": 0.7686331868171692, + "learning_rate": 0.0004857585335059312, + "loss": 3.5244, + "step": 1053 + }, + { + "epoch": 0.13, + "grad_norm": 0.8039591312408447, + "learning_rate": 0.0004857240274535526, + "loss": 3.5638, + "step": 1054 + }, + { + "epoch": 0.14, + "grad_norm": 0.7751696109771729, + "learning_rate": 0.000485689480877481, + "loss": 3.5261, + "step": 1055 + }, + { + "epoch": 0.14, + "grad_norm": 0.7615006566047668, + "learning_rate": 0.00048565489378365524, + "loss": 3.5094, + "step": 1056 + }, + { + "epoch": 0.14, + "grad_norm": 0.7346290946006775, + "learning_rate": 0.00048562026617802133, + "loss": 3.4812, + "step": 1057 + }, + { + "epoch": 0.14, + "grad_norm": 0.8013098239898682, + "learning_rate": 0.00048558559806653214, + "loss": 3.5405, + "step": 1058 + }, + { + "epoch": 0.14, + "grad_norm": 0.8227730393409729, + "learning_rate": 0.0004855508894551474, + "loss": 3.7083, + "step": 1059 + }, + { + "epoch": 0.14, + "grad_norm": 0.8623093962669373, + "learning_rate": 0.0004855161403498341, + "loss": 3.7428, + "step": 1060 + }, + { + "epoch": 0.14, + "grad_norm": 0.7669157981872559, + "learning_rate": 0.00048548135075656595, + "loss": 3.6555, + "step": 1061 + }, + { + "epoch": 0.14, + "grad_norm": 0.7940523028373718, + "learning_rate": 0.00048544652068132363, + "loss": 3.6781, + "step": 1062 + }, + { + "epoch": 0.14, + "grad_norm": 0.7166727781295776, + "learning_rate": 0.0004854116501300949, + "loss": 3.4161, + "step": 1063 + }, + { + "epoch": 0.14, + "grad_norm": 0.809562623500824, + "learning_rate": 0.00048537673910887435, + "loss": 3.5386, + "step": 1064 + }, + { + "epoch": 0.14, + "grad_norm": 0.7145100235939026, + "learning_rate": 0.00048534178762366364, + "loss": 3.6118, + "step": 1065 + }, + { + "epoch": 0.14, + "grad_norm": 0.7584477066993713, + "learning_rate": 0.0004853067956804713, + "loss": 3.5688, + "step": 1066 + }, + { + "epoch": 0.14, + "grad_norm": 0.8329288959503174, + "learning_rate": 0.0004852717632853129, + "loss": 3.5135, + "step": 1067 + }, + { + "epoch": 0.14, + "grad_norm": 0.8227055668830872, + "learning_rate": 0.0004852366904442109, + "loss": 3.6212, + "step": 1068 + }, + { + "epoch": 0.14, + "grad_norm": 0.7810276746749878, + "learning_rate": 0.00048520157716319467, + "loss": 3.5009, + "step": 1069 + }, + { + "epoch": 0.14, + "grad_norm": 0.7705519199371338, + "learning_rate": 0.00048516642344830077, + "loss": 3.6416, + "step": 1070 + }, + { + "epoch": 0.14, + "grad_norm": 0.8475935459136963, + "learning_rate": 0.0004851312293055722, + "loss": 3.5342, + "step": 1071 + }, + { + "epoch": 0.14, + "grad_norm": 0.9521417617797852, + "learning_rate": 0.0004850959947410596, + "loss": 3.6362, + "step": 1072 + }, + { + "epoch": 0.14, + "grad_norm": 0.7932273745536804, + "learning_rate": 0.00048506071976081995, + "loss": 3.5728, + "step": 1073 + }, + { + "epoch": 0.14, + "grad_norm": 0.7444732189178467, + "learning_rate": 0.00048502540437091754, + "loss": 3.5696, + "step": 1074 + }, + { + "epoch": 0.14, + "grad_norm": 0.7930514812469482, + "learning_rate": 0.00048499004857742347, + "loss": 3.5353, + "step": 1075 + }, + { + "epoch": 0.14, + "grad_norm": 0.8234135508537292, + "learning_rate": 0.0004849546523864158, + "loss": 3.6051, + "step": 1076 + }, + { + "epoch": 0.14, + "grad_norm": 0.818554162979126, + "learning_rate": 0.00048491921580397956, + "loss": 3.6196, + "step": 1077 + }, + { + "epoch": 0.14, + "grad_norm": 0.7329668998718262, + "learning_rate": 0.00048488373883620676, + "loss": 3.5506, + "step": 1078 + }, + { + "epoch": 0.14, + "grad_norm": 0.8094332218170166, + "learning_rate": 0.00048484822148919614, + "loss": 3.5721, + "step": 1079 + }, + { + "epoch": 0.14, + "grad_norm": 0.8114806413650513, + "learning_rate": 0.0004848126637690537, + "loss": 3.5818, + "step": 1080 + }, + { + "epoch": 0.14, + "grad_norm": 0.7351101040840149, + "learning_rate": 0.00048477706568189215, + "loss": 3.5156, + "step": 1081 + }, + { + "epoch": 0.14, + "grad_norm": 0.7758623957633972, + "learning_rate": 0.0004847414272338313, + "loss": 3.6086, + "step": 1082 + }, + { + "epoch": 0.14, + "grad_norm": 0.7500880360603333, + "learning_rate": 0.0004847057484309977, + "loss": 3.5158, + "step": 1083 + }, + { + "epoch": 0.14, + "grad_norm": 0.7690058350563049, + "learning_rate": 0.00048467002927952507, + "loss": 3.4918, + "step": 1084 + }, + { + "epoch": 0.14, + "grad_norm": 0.7274742126464844, + "learning_rate": 0.0004846342697855538, + "loss": 3.4803, + "step": 1085 + }, + { + "epoch": 0.14, + "grad_norm": 0.736820638179779, + "learning_rate": 0.0004845984699552315, + "loss": 3.5871, + "step": 1086 + }, + { + "epoch": 0.14, + "grad_norm": 0.7568606734275818, + "learning_rate": 0.0004845626297947125, + "loss": 3.4236, + "step": 1087 + }, + { + "epoch": 0.14, + "grad_norm": 0.7281172871589661, + "learning_rate": 0.0004845267493101582, + "loss": 3.5822, + "step": 1088 + }, + { + "epoch": 0.14, + "grad_norm": 0.7004794478416443, + "learning_rate": 0.0004844908285077367, + "loss": 3.5296, + "step": 1089 + }, + { + "epoch": 0.14, + "grad_norm": 0.698142409324646, + "learning_rate": 0.00048445486739362345, + "loss": 3.5203, + "step": 1090 + }, + { + "epoch": 0.14, + "grad_norm": 0.7830263376235962, + "learning_rate": 0.00048441886597400043, + "loss": 3.5399, + "step": 1091 + }, + { + "epoch": 0.14, + "grad_norm": 0.735805094242096, + "learning_rate": 0.0004843828242550566, + "loss": 3.5375, + "step": 1092 + }, + { + "epoch": 0.14, + "grad_norm": 0.7252518534660339, + "learning_rate": 0.00048434674224298824, + "loss": 3.5794, + "step": 1093 + }, + { + "epoch": 0.14, + "grad_norm": 0.8261983394622803, + "learning_rate": 0.0004843106199439981, + "loss": 3.7589, + "step": 1094 + }, + { + "epoch": 0.14, + "grad_norm": 0.8342580199241638, + "learning_rate": 0.000484274457364296, + "loss": 3.6699, + "step": 1095 + }, + { + "epoch": 0.14, + "grad_norm": 0.7839307188987732, + "learning_rate": 0.0004842382545100987, + "loss": 3.5323, + "step": 1096 + }, + { + "epoch": 0.14, + "grad_norm": 0.8153157234191895, + "learning_rate": 0.00048420201138762997, + "loss": 3.6985, + "step": 1097 + }, + { + "epoch": 0.14, + "grad_norm": 0.7271867394447327, + "learning_rate": 0.0004841657280031204, + "loss": 3.3959, + "step": 1098 + }, + { + "epoch": 0.14, + "grad_norm": 0.7656581401824951, + "learning_rate": 0.0004841294043628074, + "loss": 3.6082, + "step": 1099 + }, + { + "epoch": 0.14, + "grad_norm": 0.8586941957473755, + "learning_rate": 0.00048409304047293566, + "loss": 3.4874, + "step": 1100 + }, + { + "epoch": 0.14, + "grad_norm": 0.7712840437889099, + "learning_rate": 0.0004840566363397564, + "loss": 3.6099, + "step": 1101 + }, + { + "epoch": 0.14, + "grad_norm": 0.7870004773139954, + "learning_rate": 0.00048402019196952783, + "loss": 3.5546, + "step": 1102 + }, + { + "epoch": 0.14, + "grad_norm": 0.7695828676223755, + "learning_rate": 0.0004839837073685154, + "loss": 3.6071, + "step": 1103 + }, + { + "epoch": 0.14, + "grad_norm": 0.7353809475898743, + "learning_rate": 0.00048394718254299096, + "loss": 3.5236, + "step": 1104 + }, + { + "epoch": 0.14, + "grad_norm": 0.6917027831077576, + "learning_rate": 0.0004839106174992338, + "loss": 3.3997, + "step": 1105 + }, + { + "epoch": 0.14, + "grad_norm": 0.7143733501434326, + "learning_rate": 0.0004838740122435298, + "loss": 3.4964, + "step": 1106 + }, + { + "epoch": 0.14, + "grad_norm": 0.7869515419006348, + "learning_rate": 0.00048383736678217173, + "loss": 3.5131, + "step": 1107 + }, + { + "epoch": 0.14, + "grad_norm": 0.7774674296379089, + "learning_rate": 0.00048380068112145936, + "loss": 3.5641, + "step": 1108 + }, + { + "epoch": 0.14, + "grad_norm": 0.7332097291946411, + "learning_rate": 0.0004837639552676996, + "loss": 3.4813, + "step": 1109 + }, + { + "epoch": 0.14, + "grad_norm": 0.737568736076355, + "learning_rate": 0.00048372718922720583, + "loss": 3.5873, + "step": 1110 + }, + { + "epoch": 0.14, + "grad_norm": 0.8007314801216125, + "learning_rate": 0.00048369038300629866, + "loss": 3.5338, + "step": 1111 + }, + { + "epoch": 0.14, + "grad_norm": 0.8464657068252563, + "learning_rate": 0.0004836535366113054, + "loss": 3.5136, + "step": 1112 + }, + { + "epoch": 0.14, + "grad_norm": 0.7695111632347107, + "learning_rate": 0.0004836166500485605, + "loss": 3.5132, + "step": 1113 + }, + { + "epoch": 0.14, + "grad_norm": 0.7972545623779297, + "learning_rate": 0.0004835797233244052, + "loss": 3.4944, + "step": 1114 + }, + { + "epoch": 0.14, + "grad_norm": 0.8033908009529114, + "learning_rate": 0.0004835427564451875, + "loss": 3.5781, + "step": 1115 + }, + { + "epoch": 0.14, + "grad_norm": 0.7441543340682983, + "learning_rate": 0.0004835057494172625, + "loss": 3.4414, + "step": 1116 + }, + { + "epoch": 0.14, + "grad_norm": 0.8694853782653809, + "learning_rate": 0.00048346870224699225, + "loss": 3.5197, + "step": 1117 + }, + { + "epoch": 0.14, + "grad_norm": 0.833680272102356, + "learning_rate": 0.0004834316149407454, + "loss": 3.4152, + "step": 1118 + }, + { + "epoch": 0.14, + "grad_norm": 0.9092240929603577, + "learning_rate": 0.0004833944875048978, + "loss": 3.5894, + "step": 1119 + }, + { + "epoch": 0.14, + "grad_norm": 0.7969741225242615, + "learning_rate": 0.000483357319945832, + "loss": 3.6246, + "step": 1120 + }, + { + "epoch": 0.14, + "grad_norm": 0.8331993222236633, + "learning_rate": 0.0004833201122699377, + "loss": 3.7095, + "step": 1121 + }, + { + "epoch": 0.14, + "grad_norm": 0.7968263030052185, + "learning_rate": 0.0004832828644836111, + "loss": 3.5234, + "step": 1122 + }, + { + "epoch": 0.14, + "grad_norm": 0.7297738790512085, + "learning_rate": 0.0004832455765932557, + "loss": 3.6672, + "step": 1123 + }, + { + "epoch": 0.14, + "grad_norm": 0.7510380148887634, + "learning_rate": 0.00048320824860528165, + "loss": 3.5526, + "step": 1124 + }, + { + "epoch": 0.14, + "grad_norm": 0.6990682482719421, + "learning_rate": 0.0004831708805261061, + "loss": 3.6454, + "step": 1125 + }, + { + "epoch": 0.14, + "grad_norm": 0.7424012422561646, + "learning_rate": 0.00048313347236215304, + "loss": 3.5578, + "step": 1126 + }, + { + "epoch": 0.14, + "grad_norm": 0.7626960277557373, + "learning_rate": 0.0004830960241198534, + "loss": 3.5351, + "step": 1127 + }, + { + "epoch": 0.14, + "grad_norm": 0.7487967610359192, + "learning_rate": 0.00048305853580564483, + "loss": 3.5848, + "step": 1128 + }, + { + "epoch": 0.14, + "grad_norm": 0.7351952195167542, + "learning_rate": 0.0004830210074259722, + "loss": 3.57, + "step": 1129 + }, + { + "epoch": 0.14, + "grad_norm": 0.7571800351142883, + "learning_rate": 0.000482983438987287, + "loss": 3.5361, + "step": 1130 + }, + { + "epoch": 0.14, + "grad_norm": 0.7057940363883972, + "learning_rate": 0.00048294583049604756, + "loss": 3.5936, + "step": 1131 + }, + { + "epoch": 0.14, + "grad_norm": 0.713702917098999, + "learning_rate": 0.00048290818195871946, + "loss": 3.5384, + "step": 1132 + }, + { + "epoch": 0.15, + "grad_norm": 0.7351755499839783, + "learning_rate": 0.0004828704933817746, + "loss": 3.5273, + "step": 1133 + }, + { + "epoch": 0.15, + "grad_norm": 0.7459946274757385, + "learning_rate": 0.0004828327647716924, + "loss": 3.6704, + "step": 1134 + }, + { + "epoch": 0.15, + "grad_norm": 0.7656434178352356, + "learning_rate": 0.0004827949961349587, + "loss": 3.6132, + "step": 1135 + }, + { + "epoch": 0.15, + "grad_norm": 0.7966132164001465, + "learning_rate": 0.0004827571874780663, + "loss": 3.53, + "step": 1136 + }, + { + "epoch": 0.15, + "grad_norm": 0.7394230365753174, + "learning_rate": 0.0004827193388075151, + "loss": 3.5357, + "step": 1137 + }, + { + "epoch": 0.15, + "grad_norm": 0.7922983765602112, + "learning_rate": 0.00048268145012981157, + "loss": 3.5202, + "step": 1138 + }, + { + "epoch": 0.15, + "grad_norm": 0.8919665813446045, + "learning_rate": 0.0004826435214514693, + "loss": 3.4918, + "step": 1139 + }, + { + "epoch": 0.15, + "grad_norm": 0.7942013740539551, + "learning_rate": 0.0004826055527790087, + "loss": 3.6465, + "step": 1140 + }, + { + "epoch": 0.15, + "grad_norm": 0.8015908598899841, + "learning_rate": 0.00048256754411895697, + "loss": 3.4992, + "step": 1141 + }, + { + "epoch": 0.15, + "grad_norm": 0.8327279686927795, + "learning_rate": 0.0004825294954778482, + "loss": 3.7522, + "step": 1142 + }, + { + "epoch": 0.15, + "grad_norm": 0.7686452865600586, + "learning_rate": 0.0004824914068622235, + "loss": 3.5357, + "step": 1143 + }, + { + "epoch": 0.15, + "grad_norm": 0.8076480627059937, + "learning_rate": 0.0004824532782786307, + "loss": 3.7146, + "step": 1144 + }, + { + "epoch": 0.15, + "grad_norm": 0.7591144442558289, + "learning_rate": 0.00048241510973362446, + "loss": 3.6272, + "step": 1145 + }, + { + "epoch": 0.15, + "grad_norm": 0.764695942401886, + "learning_rate": 0.00048237690123376656, + "loss": 3.5952, + "step": 1146 + }, + { + "epoch": 0.15, + "grad_norm": 0.737902045249939, + "learning_rate": 0.0004823386527856253, + "loss": 3.5014, + "step": 1147 + }, + { + "epoch": 0.15, + "grad_norm": 0.7147963643074036, + "learning_rate": 0.00048230036439577615, + "loss": 3.4634, + "step": 1148 + }, + { + "epoch": 0.15, + "grad_norm": 0.7402358055114746, + "learning_rate": 0.0004822620360708014, + "loss": 3.5073, + "step": 1149 + }, + { + "epoch": 0.15, + "grad_norm": 0.8155220150947571, + "learning_rate": 0.00048222366781729, + "loss": 3.7277, + "step": 1150 + }, + { + "epoch": 0.15, + "grad_norm": 0.8458031415939331, + "learning_rate": 0.0004821852596418379, + "loss": 3.5606, + "step": 1151 + }, + { + "epoch": 0.15, + "grad_norm": 0.730534017086029, + "learning_rate": 0.0004821468115510479, + "loss": 3.6716, + "step": 1152 + }, + { + "epoch": 0.15, + "grad_norm": 0.7740826606750488, + "learning_rate": 0.0004821083235515298, + "loss": 3.6137, + "step": 1153 + }, + { + "epoch": 0.15, + "grad_norm": 0.7634129524230957, + "learning_rate": 0.0004820697956499, + "loss": 3.6249, + "step": 1154 + }, + { + "epoch": 0.15, + "grad_norm": 0.7905582189559937, + "learning_rate": 0.00048203122785278195, + "loss": 3.5613, + "step": 1155 + }, + { + "epoch": 0.15, + "grad_norm": 0.7820989489555359, + "learning_rate": 0.0004819926201668059, + "loss": 3.5696, + "step": 1156 + }, + { + "epoch": 0.15, + "grad_norm": 0.7696579098701477, + "learning_rate": 0.00048195397259860894, + "loss": 3.5467, + "step": 1157 + }, + { + "epoch": 0.15, + "grad_norm": 0.8046624064445496, + "learning_rate": 0.00048191528515483504, + "loss": 3.6504, + "step": 1158 + }, + { + "epoch": 0.15, + "grad_norm": 0.7997883558273315, + "learning_rate": 0.0004818765578421351, + "loss": 3.5677, + "step": 1159 + }, + { + "epoch": 0.15, + "grad_norm": 0.7544205784797668, + "learning_rate": 0.0004818377906671667, + "loss": 3.6235, + "step": 1160 + }, + { + "epoch": 0.15, + "grad_norm": 0.7897548079490662, + "learning_rate": 0.00048179898363659435, + "loss": 3.6026, + "step": 1161 + }, + { + "epoch": 0.15, + "grad_norm": 0.7426437735557556, + "learning_rate": 0.0004817601367570895, + "loss": 3.5164, + "step": 1162 + }, + { + "epoch": 0.15, + "grad_norm": 0.7978121638298035, + "learning_rate": 0.00048172125003533034, + "loss": 3.6887, + "step": 1163 + }, + { + "epoch": 0.15, + "grad_norm": 0.7501871585845947, + "learning_rate": 0.00048168232347800197, + "loss": 3.4855, + "step": 1164 + }, + { + "epoch": 0.15, + "grad_norm": 0.7330640554428101, + "learning_rate": 0.0004816433570917963, + "loss": 3.4732, + "step": 1165 + }, + { + "epoch": 0.15, + "grad_norm": 0.7803331017494202, + "learning_rate": 0.000481604350883412, + "loss": 3.5258, + "step": 1166 + }, + { + "epoch": 0.15, + "grad_norm": 0.7541994452476501, + "learning_rate": 0.00048156530485955496, + "loss": 3.4746, + "step": 1167 + }, + { + "epoch": 0.15, + "grad_norm": 0.779971182346344, + "learning_rate": 0.0004815262190269374, + "loss": 3.4933, + "step": 1168 + }, + { + "epoch": 0.15, + "grad_norm": 0.7977837920188904, + "learning_rate": 0.00048148709339227867, + "loss": 3.5619, + "step": 1169 + }, + { + "epoch": 0.15, + "grad_norm": 0.8055017590522766, + "learning_rate": 0.000481447927962305, + "loss": 3.4803, + "step": 1170 + }, + { + "epoch": 0.15, + "grad_norm": 0.854934811592102, + "learning_rate": 0.00048140872274374935, + "loss": 3.5664, + "step": 1171 + }, + { + "epoch": 0.15, + "grad_norm": 0.7964548468589783, + "learning_rate": 0.00048136947774335154, + "loss": 3.6014, + "step": 1172 + }, + { + "epoch": 0.15, + "grad_norm": 0.76549232006073, + "learning_rate": 0.00048133019296785825, + "loss": 3.6647, + "step": 1173 + }, + { + "epoch": 0.15, + "grad_norm": 0.7582719922065735, + "learning_rate": 0.0004812908684240229, + "loss": 3.4162, + "step": 1174 + }, + { + "epoch": 0.15, + "grad_norm": 0.7677952647209167, + "learning_rate": 0.000481251504118606, + "loss": 3.6258, + "step": 1175 + }, + { + "epoch": 0.15, + "grad_norm": 0.820792019367218, + "learning_rate": 0.00048121210005837463, + "loss": 3.6514, + "step": 1176 + }, + { + "epoch": 0.15, + "grad_norm": 0.8015343546867371, + "learning_rate": 0.0004811726562501028, + "loss": 3.499, + "step": 1177 + }, + { + "epoch": 0.15, + "grad_norm": 1.0623503923416138, + "learning_rate": 0.00048113317270057136, + "loss": 3.6353, + "step": 1178 + }, + { + "epoch": 0.15, + "grad_norm": 0.7586822509765625, + "learning_rate": 0.0004810936494165681, + "loss": 3.5914, + "step": 1179 + }, + { + "epoch": 0.15, + "grad_norm": 0.744174063205719, + "learning_rate": 0.00048105408640488733, + "loss": 3.4081, + "step": 1180 + }, + { + "epoch": 0.15, + "grad_norm": 0.7595331072807312, + "learning_rate": 0.0004810144836723305, + "loss": 3.5483, + "step": 1181 + }, + { + "epoch": 0.15, + "grad_norm": 0.7022777199745178, + "learning_rate": 0.00048097484122570585, + "loss": 3.4242, + "step": 1182 + }, + { + "epoch": 0.15, + "grad_norm": 0.7804768085479736, + "learning_rate": 0.0004809351590718283, + "loss": 3.5572, + "step": 1183 + }, + { + "epoch": 0.15, + "grad_norm": 0.7230439782142639, + "learning_rate": 0.00048089543721751963, + "loss": 3.538, + "step": 1184 + }, + { + "epoch": 0.15, + "grad_norm": 0.8133188486099243, + "learning_rate": 0.0004808556756696085, + "loss": 3.7011, + "step": 1185 + }, + { + "epoch": 0.15, + "grad_norm": 0.7499143481254578, + "learning_rate": 0.00048081587443493044, + "loss": 3.5564, + "step": 1186 + }, + { + "epoch": 0.15, + "grad_norm": 0.7320449948310852, + "learning_rate": 0.0004807760335203277, + "loss": 3.5104, + "step": 1187 + }, + { + "epoch": 0.15, + "grad_norm": 0.6878542304039001, + "learning_rate": 0.0004807361529326495, + "loss": 3.4946, + "step": 1188 + }, + { + "epoch": 0.15, + "grad_norm": 0.7565658688545227, + "learning_rate": 0.0004806962326787516, + "loss": 3.4685, + "step": 1189 + }, + { + "epoch": 0.15, + "grad_norm": 1.302788496017456, + "learning_rate": 0.00048065627276549697, + "loss": 3.4976, + "step": 1190 + }, + { + "epoch": 0.15, + "grad_norm": 0.8028513789176941, + "learning_rate": 0.00048061627319975493, + "loss": 3.5922, + "step": 1191 + }, + { + "epoch": 0.15, + "grad_norm": 0.7514439225196838, + "learning_rate": 0.000480576233988402, + "loss": 3.5921, + "step": 1192 + }, + { + "epoch": 0.15, + "grad_norm": 0.8432983160018921, + "learning_rate": 0.0004805361551383214, + "loss": 3.5551, + "step": 1193 + }, + { + "epoch": 0.15, + "grad_norm": 0.7517931461334229, + "learning_rate": 0.0004804960366564032, + "loss": 3.5159, + "step": 1194 + }, + { + "epoch": 0.15, + "grad_norm": 0.7453200817108154, + "learning_rate": 0.000480455878549544, + "loss": 3.4694, + "step": 1195 + }, + { + "epoch": 0.15, + "grad_norm": 0.7308317422866821, + "learning_rate": 0.00048041568082464773, + "loss": 3.4796, + "step": 1196 + }, + { + "epoch": 0.15, + "grad_norm": 0.7596452236175537, + "learning_rate": 0.00048037544348862474, + "loss": 3.4077, + "step": 1197 + }, + { + "epoch": 0.15, + "grad_norm": 0.703974187374115, + "learning_rate": 0.00048033516654839213, + "loss": 3.607, + "step": 1198 + }, + { + "epoch": 0.15, + "grad_norm": 0.666873574256897, + "learning_rate": 0.0004802948500108743, + "loss": 3.5187, + "step": 1199 + }, + { + "epoch": 0.15, + "grad_norm": 0.7371575832366943, + "learning_rate": 0.0004802544938830018, + "loss": 3.6909, + "step": 1200 + }, + { + "epoch": 0.15, + "grad_norm": 0.7255220413208008, + "learning_rate": 0.0004802140981717125, + "loss": 3.6693, + "step": 1201 + }, + { + "epoch": 0.15, + "grad_norm": 0.757788896560669, + "learning_rate": 0.0004801736628839509, + "loss": 3.5358, + "step": 1202 + }, + { + "epoch": 0.15, + "grad_norm": 0.7462393045425415, + "learning_rate": 0.0004801331880266682, + "loss": 3.5705, + "step": 1203 + }, + { + "epoch": 0.15, + "grad_norm": 0.827492892742157, + "learning_rate": 0.0004800926736068225, + "loss": 3.6334, + "step": 1204 + }, + { + "epoch": 0.15, + "grad_norm": 0.803577721118927, + "learning_rate": 0.0004800521196313788, + "loss": 3.4963, + "step": 1205 + }, + { + "epoch": 0.15, + "grad_norm": 0.8225682377815247, + "learning_rate": 0.0004800115261073088, + "loss": 3.4747, + "step": 1206 + }, + { + "epoch": 0.15, + "grad_norm": 0.806121826171875, + "learning_rate": 0.00047997089304159085, + "loss": 3.544, + "step": 1207 + }, + { + "epoch": 0.15, + "grad_norm": 0.8418048024177551, + "learning_rate": 0.00047993022044121036, + "loss": 3.6103, + "step": 1208 + }, + { + "epoch": 0.15, + "grad_norm": 0.7457411289215088, + "learning_rate": 0.00047988950831315947, + "loss": 3.5944, + "step": 1209 + }, + { + "epoch": 0.15, + "grad_norm": 0.7606494426727295, + "learning_rate": 0.0004798487566644369, + "loss": 3.6472, + "step": 1210 + }, + { + "epoch": 0.16, + "grad_norm": 0.7392157912254333, + "learning_rate": 0.00047980796550204844, + "loss": 3.4584, + "step": 1211 + }, + { + "epoch": 0.16, + "grad_norm": 0.7845761179924011, + "learning_rate": 0.0004797671348330066, + "loss": 3.6173, + "step": 1212 + }, + { + "epoch": 0.16, + "grad_norm": 0.7397451996803284, + "learning_rate": 0.0004797262646643305, + "loss": 3.5771, + "step": 1213 + }, + { + "epoch": 0.16, + "grad_norm": 0.7049579620361328, + "learning_rate": 0.0004796853550030463, + "loss": 3.4425, + "step": 1214 + }, + { + "epoch": 0.16, + "grad_norm": 0.7008839249610901, + "learning_rate": 0.000479644405856187, + "loss": 3.5106, + "step": 1215 + }, + { + "epoch": 0.16, + "grad_norm": 0.7417147755622864, + "learning_rate": 0.00047960341723079185, + "loss": 3.4835, + "step": 1216 + }, + { + "epoch": 0.16, + "grad_norm": 0.8001255989074707, + "learning_rate": 0.0004795623891339076, + "loss": 3.5156, + "step": 1217 + }, + { + "epoch": 0.16, + "grad_norm": 0.7629938125610352, + "learning_rate": 0.0004795213215725873, + "loss": 3.5797, + "step": 1218 + }, + { + "epoch": 0.16, + "grad_norm": 0.7776039242744446, + "learning_rate": 0.000479480214553891, + "loss": 3.6313, + "step": 1219 + }, + { + "epoch": 0.16, + "grad_norm": 0.7509599924087524, + "learning_rate": 0.00047943906808488546, + "loss": 3.4204, + "step": 1220 + }, + { + "epoch": 0.16, + "grad_norm": 0.837563157081604, + "learning_rate": 0.00047939788217264424, + "loss": 3.5478, + "step": 1221 + }, + { + "epoch": 0.16, + "grad_norm": 0.7498551607131958, + "learning_rate": 0.00047935665682424767, + "loss": 3.5485, + "step": 1222 + }, + { + "epoch": 0.16, + "grad_norm": 0.7325605154037476, + "learning_rate": 0.00047931539204678286, + "loss": 3.4524, + "step": 1223 + }, + { + "epoch": 0.16, + "grad_norm": 0.691773533821106, + "learning_rate": 0.0004792740878473437, + "loss": 3.5594, + "step": 1224 + }, + { + "epoch": 0.16, + "grad_norm": 0.7024340033531189, + "learning_rate": 0.0004792327442330309, + "loss": 3.4423, + "step": 1225 + }, + { + "epoch": 0.16, + "grad_norm": 0.7005559802055359, + "learning_rate": 0.00047919136121095173, + "loss": 3.4901, + "step": 1226 + }, + { + "epoch": 0.16, + "grad_norm": 0.7247700691223145, + "learning_rate": 0.00047914993878822067, + "loss": 3.6014, + "step": 1227 + }, + { + "epoch": 0.16, + "grad_norm": 0.6847344636917114, + "learning_rate": 0.0004791084769719586, + "loss": 3.4629, + "step": 1228 + }, + { + "epoch": 0.16, + "grad_norm": 0.7518942356109619, + "learning_rate": 0.00047906697576929327, + "loss": 3.6171, + "step": 1229 + }, + { + "epoch": 0.16, + "grad_norm": 0.7332186102867126, + "learning_rate": 0.0004790254351873592, + "loss": 3.5396, + "step": 1230 + }, + { + "epoch": 0.16, + "grad_norm": 0.7920958995819092, + "learning_rate": 0.0004789838552332978, + "loss": 3.5926, + "step": 1231 + }, + { + "epoch": 0.16, + "grad_norm": 0.7580140829086304, + "learning_rate": 0.000478942235914257, + "loss": 3.4796, + "step": 1232 + }, + { + "epoch": 0.16, + "grad_norm": 0.728755533695221, + "learning_rate": 0.00047890057723739177, + "loss": 3.5943, + "step": 1233 + }, + { + "epoch": 0.16, + "grad_norm": 0.7519229054450989, + "learning_rate": 0.0004788588792098637, + "loss": 3.5283, + "step": 1234 + }, + { + "epoch": 0.16, + "grad_norm": 0.7821226119995117, + "learning_rate": 0.00047881714183884105, + "loss": 3.6506, + "step": 1235 + }, + { + "epoch": 0.16, + "grad_norm": 0.8124991059303284, + "learning_rate": 0.0004787753651314991, + "loss": 3.6492, + "step": 1236 + }, + { + "epoch": 0.16, + "grad_norm": 0.7577442526817322, + "learning_rate": 0.00047873354909501963, + "loss": 3.4795, + "step": 1237 + }, + { + "epoch": 0.16, + "grad_norm": 0.7887035012245178, + "learning_rate": 0.00047869169373659145, + "loss": 3.4658, + "step": 1238 + }, + { + "epoch": 0.16, + "grad_norm": 0.7463725805282593, + "learning_rate": 0.0004786497990634099, + "loss": 3.5866, + "step": 1239 + }, + { + "epoch": 0.16, + "grad_norm": 0.8219547867774963, + "learning_rate": 0.0004786078650826771, + "loss": 3.6231, + "step": 1240 + }, + { + "epoch": 0.16, + "grad_norm": 0.6816055774688721, + "learning_rate": 0.00047856589180160204, + "loss": 3.5007, + "step": 1241 + }, + { + "epoch": 0.16, + "grad_norm": 0.7251628637313843, + "learning_rate": 0.00047852387922740037, + "loss": 3.4619, + "step": 1242 + }, + { + "epoch": 0.16, + "grad_norm": 0.7888408303260803, + "learning_rate": 0.0004784818273672947, + "loss": 3.5239, + "step": 1243 + }, + { + "epoch": 0.16, + "grad_norm": 0.7807585000991821, + "learning_rate": 0.00047843973622851407, + "loss": 3.5853, + "step": 1244 + }, + { + "epoch": 0.16, + "grad_norm": 0.7896850109100342, + "learning_rate": 0.00047839760581829444, + "loss": 3.5603, + "step": 1245 + }, + { + "epoch": 0.16, + "grad_norm": 0.706608235836029, + "learning_rate": 0.0004783554361438786, + "loss": 3.6437, + "step": 1246 + }, + { + "epoch": 0.16, + "grad_norm": 0.7114312052726746, + "learning_rate": 0.0004783132272125159, + "loss": 3.5691, + "step": 1247 + }, + { + "epoch": 0.16, + "grad_norm": 0.6859767436981201, + "learning_rate": 0.00047827097903146266, + "loss": 3.446, + "step": 1248 + }, + { + "epoch": 0.16, + "grad_norm": 0.8105244636535645, + "learning_rate": 0.00047822869160798185, + "loss": 3.5147, + "step": 1249 + }, + { + "epoch": 0.16, + "grad_norm": 0.761033296585083, + "learning_rate": 0.00047818636494934295, + "loss": 3.5753, + "step": 1250 + }, + { + "epoch": 0.16, + "grad_norm": 0.7481651306152344, + "learning_rate": 0.0004781439990628227, + "loss": 3.6124, + "step": 1251 + }, + { + "epoch": 0.16, + "grad_norm": 0.7689448595046997, + "learning_rate": 0.00047810159395570405, + "loss": 3.5605, + "step": 1252 + }, + { + "epoch": 0.16, + "grad_norm": 0.758472740650177, + "learning_rate": 0.00047805914963527707, + "loss": 3.518, + "step": 1253 + }, + { + "epoch": 0.16, + "grad_norm": 0.7787196040153503, + "learning_rate": 0.00047801666610883833, + "loss": 3.3512, + "step": 1254 + }, + { + "epoch": 0.16, + "grad_norm": 0.8059257864952087, + "learning_rate": 0.00047797414338369125, + "loss": 3.5148, + "step": 1255 + }, + { + "epoch": 0.16, + "grad_norm": 0.8021636009216309, + "learning_rate": 0.0004779315814671461, + "loss": 3.6353, + "step": 1256 + }, + { + "epoch": 0.16, + "grad_norm": 0.817902684211731, + "learning_rate": 0.00047788898036651967, + "loss": 3.6066, + "step": 1257 + }, + { + "epoch": 0.16, + "grad_norm": 0.8657376170158386, + "learning_rate": 0.00047784634008913565, + "loss": 3.636, + "step": 1258 + }, + { + "epoch": 0.16, + "grad_norm": 0.7575492262840271, + "learning_rate": 0.00047780366064232427, + "loss": 3.4298, + "step": 1259 + }, + { + "epoch": 0.16, + "grad_norm": 0.7412144541740417, + "learning_rate": 0.0004777609420334227, + "loss": 3.5585, + "step": 1260 + }, + { + "epoch": 0.16, + "grad_norm": 0.7263116836547852, + "learning_rate": 0.00047771818426977475, + "loss": 3.5636, + "step": 1261 + }, + { + "epoch": 0.16, + "grad_norm": 0.7642979621887207, + "learning_rate": 0.0004776753873587311, + "loss": 3.6134, + "step": 1262 + }, + { + "epoch": 0.16, + "grad_norm": 0.7336152195930481, + "learning_rate": 0.0004776325513076488, + "loss": 3.5417, + "step": 1263 + }, + { + "epoch": 0.16, + "grad_norm": 0.7328927516937256, + "learning_rate": 0.00047758967612389206, + "loss": 3.4612, + "step": 1264 + }, + { + "epoch": 0.16, + "grad_norm": 0.7464151382446289, + "learning_rate": 0.00047754676181483146, + "loss": 3.6533, + "step": 1265 + }, + { + "epoch": 0.16, + "grad_norm": 0.7545056343078613, + "learning_rate": 0.0004775038083878446, + "loss": 3.5238, + "step": 1266 + }, + { + "epoch": 0.16, + "grad_norm": 0.7579200863838196, + "learning_rate": 0.00047746081585031566, + "loss": 3.4982, + "step": 1267 + }, + { + "epoch": 0.16, + "grad_norm": 0.7628843188285828, + "learning_rate": 0.00047741778420963554, + "loss": 3.5309, + "step": 1268 + }, + { + "epoch": 0.16, + "grad_norm": 0.7624720931053162, + "learning_rate": 0.0004773747134732018, + "loss": 3.689, + "step": 1269 + }, + { + "epoch": 0.16, + "grad_norm": 0.6972590684890747, + "learning_rate": 0.0004773316036484189, + "loss": 3.5652, + "step": 1270 + }, + { + "epoch": 0.16, + "grad_norm": 0.7528889775276184, + "learning_rate": 0.0004772884547426979, + "loss": 3.599, + "step": 1271 + }, + { + "epoch": 0.16, + "grad_norm": 0.6812743544578552, + "learning_rate": 0.0004772452667634565, + "loss": 3.4173, + "step": 1272 + }, + { + "epoch": 0.16, + "grad_norm": 0.7377366423606873, + "learning_rate": 0.0004772020397181194, + "loss": 3.5766, + "step": 1273 + }, + { + "epoch": 0.16, + "grad_norm": 0.7267171740531921, + "learning_rate": 0.0004771587736141176, + "loss": 3.5296, + "step": 1274 + }, + { + "epoch": 0.16, + "grad_norm": 0.7521837949752808, + "learning_rate": 0.0004771154684588892, + "loss": 3.6125, + "step": 1275 + }, + { + "epoch": 0.16, + "grad_norm": 0.6818501949310303, + "learning_rate": 0.00047707212425987885, + "loss": 3.5312, + "step": 1276 + }, + { + "epoch": 0.16, + "grad_norm": 0.7633997797966003, + "learning_rate": 0.00047702874102453785, + "loss": 3.4141, + "step": 1277 + }, + { + "epoch": 0.16, + "grad_norm": 0.7707627415657043, + "learning_rate": 0.00047698531876032437, + "loss": 3.5135, + "step": 1278 + }, + { + "epoch": 0.16, + "grad_norm": 0.7294227480888367, + "learning_rate": 0.0004769418574747032, + "loss": 3.6082, + "step": 1279 + }, + { + "epoch": 0.16, + "grad_norm": 0.7334704995155334, + "learning_rate": 0.00047689835717514576, + "loss": 3.4137, + "step": 1280 + }, + { + "epoch": 0.16, + "grad_norm": 0.7663036584854126, + "learning_rate": 0.0004768548178691302, + "loss": 3.605, + "step": 1281 + }, + { + "epoch": 0.16, + "grad_norm": 0.7646681070327759, + "learning_rate": 0.00047681123956414164, + "loss": 3.571, + "step": 1282 + }, + { + "epoch": 0.16, + "grad_norm": 0.7949337959289551, + "learning_rate": 0.0004767676222676715, + "loss": 3.7506, + "step": 1283 + }, + { + "epoch": 0.16, + "grad_norm": 0.725398063659668, + "learning_rate": 0.00047672396598721824, + "loss": 3.5381, + "step": 1284 + }, + { + "epoch": 0.16, + "grad_norm": 0.7394816279411316, + "learning_rate": 0.00047668027073028676, + "loss": 3.555, + "step": 1285 + }, + { + "epoch": 0.16, + "grad_norm": 0.7550774216651917, + "learning_rate": 0.0004766365365043889, + "loss": 3.4987, + "step": 1286 + }, + { + "epoch": 0.16, + "grad_norm": 0.736352801322937, + "learning_rate": 0.00047659276331704295, + "loss": 3.3987, + "step": 1287 + }, + { + "epoch": 0.16, + "grad_norm": 0.7474049925804138, + "learning_rate": 0.00047654895117577414, + "loss": 3.4919, + "step": 1288 + }, + { + "epoch": 0.16, + "grad_norm": 0.8658040761947632, + "learning_rate": 0.0004765051000881142, + "loss": 3.7013, + "step": 1289 + }, + { + "epoch": 0.17, + "grad_norm": 0.7833415269851685, + "learning_rate": 0.0004764612100616017, + "loss": 3.6412, + "step": 1290 + }, + { + "epoch": 0.17, + "grad_norm": 0.7740939855575562, + "learning_rate": 0.0004764172811037818, + "loss": 3.5839, + "step": 1291 + }, + { + "epoch": 0.17, + "grad_norm": 0.8426244258880615, + "learning_rate": 0.0004763733132222065, + "loss": 3.3914, + "step": 1292 + }, + { + "epoch": 0.17, + "grad_norm": 0.733086884021759, + "learning_rate": 0.0004763293064244343, + "loss": 3.6397, + "step": 1293 + }, + { + "epoch": 0.17, + "grad_norm": 0.8235949873924255, + "learning_rate": 0.00047628526071803046, + "loss": 3.5314, + "step": 1294 + }, + { + "epoch": 0.17, + "grad_norm": 0.7923353314399719, + "learning_rate": 0.00047624117611056696, + "loss": 3.4461, + "step": 1295 + }, + { + "epoch": 0.17, + "grad_norm": 0.7275211811065674, + "learning_rate": 0.0004761970526096225, + "loss": 3.6302, + "step": 1296 + }, + { + "epoch": 0.17, + "grad_norm": 0.7095808386802673, + "learning_rate": 0.0004761528902227824, + "loss": 3.5677, + "step": 1297 + }, + { + "epoch": 0.17, + "grad_norm": 0.7407586574554443, + "learning_rate": 0.00047610868895763865, + "loss": 3.4968, + "step": 1298 + }, + { + "epoch": 0.17, + "grad_norm": 0.7782431244850159, + "learning_rate": 0.00047606444882179, + "loss": 3.717, + "step": 1299 + }, + { + "epoch": 0.17, + "grad_norm": 0.6857940554618835, + "learning_rate": 0.0004760201698228419, + "loss": 3.4115, + "step": 1300 + }, + { + "epoch": 0.17, + "grad_norm": 0.8255395293235779, + "learning_rate": 0.0004759758519684063, + "loss": 3.6371, + "step": 1301 + }, + { + "epoch": 0.17, + "grad_norm": 0.7767965197563171, + "learning_rate": 0.000475931495266102, + "loss": 3.6189, + "step": 1302 + }, + { + "epoch": 0.17, + "grad_norm": 0.7226409316062927, + "learning_rate": 0.0004758870997235545, + "loss": 3.5922, + "step": 1303 + }, + { + "epoch": 0.17, + "grad_norm": 0.7387113571166992, + "learning_rate": 0.00047584266534839577, + "loss": 3.5281, + "step": 1304 + }, + { + "epoch": 0.17, + "grad_norm": 0.7528977394104004, + "learning_rate": 0.0004757981921482647, + "loss": 3.5226, + "step": 1305 + }, + { + "epoch": 0.17, + "grad_norm": 0.8862900137901306, + "learning_rate": 0.00047575368013080675, + "loss": 3.4604, + "step": 1306 + }, + { + "epoch": 0.17, + "grad_norm": 0.8055890202522278, + "learning_rate": 0.000475709129303674, + "loss": 3.5837, + "step": 1307 + }, + { + "epoch": 0.17, + "grad_norm": 0.7440634965896606, + "learning_rate": 0.0004756645396745253, + "loss": 3.5179, + "step": 1308 + }, + { + "epoch": 0.17, + "grad_norm": 0.7975066304206848, + "learning_rate": 0.0004756199112510261, + "loss": 3.4997, + "step": 1309 + }, + { + "epoch": 0.17, + "grad_norm": 0.7775300145149231, + "learning_rate": 0.0004755752440408485, + "loss": 3.5861, + "step": 1310 + }, + { + "epoch": 0.17, + "grad_norm": 0.7195447683334351, + "learning_rate": 0.00047553053805167136, + "loss": 3.4673, + "step": 1311 + }, + { + "epoch": 0.17, + "grad_norm": 0.7289524078369141, + "learning_rate": 0.0004754857932911802, + "loss": 3.4836, + "step": 1312 + }, + { + "epoch": 0.17, + "grad_norm": 0.7409530282020569, + "learning_rate": 0.0004754410097670671, + "loss": 3.4789, + "step": 1313 + }, + { + "epoch": 0.17, + "grad_norm": 0.7885916829109192, + "learning_rate": 0.0004753961874870309, + "loss": 3.4292, + "step": 1314 + }, + { + "epoch": 0.17, + "grad_norm": 0.7096027731895447, + "learning_rate": 0.000475351326458777, + "loss": 3.4578, + "step": 1315 + }, + { + "epoch": 0.17, + "grad_norm": 0.802874743938446, + "learning_rate": 0.0004753064266900177, + "loss": 3.5491, + "step": 1316 + }, + { + "epoch": 0.17, + "grad_norm": 0.7888942360877991, + "learning_rate": 0.00047526148818847157, + "loss": 3.451, + "step": 1317 + }, + { + "epoch": 0.17, + "grad_norm": 0.7798328995704651, + "learning_rate": 0.00047521651096186417, + "loss": 3.6149, + "step": 1318 + }, + { + "epoch": 0.17, + "grad_norm": 0.7312204241752625, + "learning_rate": 0.00047517149501792767, + "loss": 3.6303, + "step": 1319 + }, + { + "epoch": 0.17, + "grad_norm": 0.7797731757164001, + "learning_rate": 0.0004751264403644007, + "loss": 3.4529, + "step": 1320 + }, + { + "epoch": 0.17, + "grad_norm": 0.7609673142433167, + "learning_rate": 0.00047508134700902874, + "loss": 3.5962, + "step": 1321 + }, + { + "epoch": 0.17, + "grad_norm": 0.6947163939476013, + "learning_rate": 0.0004750362149595639, + "loss": 3.4685, + "step": 1322 + }, + { + "epoch": 0.17, + "grad_norm": 0.7074885964393616, + "learning_rate": 0.0004749910442237648, + "loss": 3.4133, + "step": 1323 + }, + { + "epoch": 0.17, + "grad_norm": 0.7470065355300903, + "learning_rate": 0.000474945834809397, + "loss": 3.5763, + "step": 1324 + }, + { + "epoch": 0.17, + "grad_norm": 0.752415120601654, + "learning_rate": 0.0004749005867242323, + "loss": 3.5299, + "step": 1325 + }, + { + "epoch": 0.17, + "grad_norm": 0.7444427609443665, + "learning_rate": 0.00047485529997604947, + "loss": 3.5643, + "step": 1326 + }, + { + "epoch": 0.17, + "grad_norm": 0.7178149819374084, + "learning_rate": 0.00047480997457263394, + "loss": 3.5211, + "step": 1327 + }, + { + "epoch": 0.17, + "grad_norm": 0.7430973052978516, + "learning_rate": 0.0004747646105217774, + "loss": 3.5017, + "step": 1328 + }, + { + "epoch": 0.17, + "grad_norm": 0.7771696448326111, + "learning_rate": 0.00047471920783127874, + "loss": 3.5547, + "step": 1329 + }, + { + "epoch": 0.17, + "grad_norm": 0.8592125773429871, + "learning_rate": 0.000474673766508943, + "loss": 3.6266, + "step": 1330 + }, + { + "epoch": 0.17, + "grad_norm": 0.7617010474205017, + "learning_rate": 0.0004746282865625822, + "loss": 3.7154, + "step": 1331 + }, + { + "epoch": 0.17, + "grad_norm": 0.7075968980789185, + "learning_rate": 0.00047458276800001485, + "loss": 3.5496, + "step": 1332 + }, + { + "epoch": 0.17, + "grad_norm": 0.8044525384902954, + "learning_rate": 0.0004745372108290661, + "loss": 3.5343, + "step": 1333 + }, + { + "epoch": 0.17, + "grad_norm": 0.7208424806594849, + "learning_rate": 0.00047449161505756766, + "loss": 3.6053, + "step": 1334 + }, + { + "epoch": 0.17, + "grad_norm": 0.6809619069099426, + "learning_rate": 0.0004744459806933581, + "loss": 3.4405, + "step": 1335 + }, + { + "epoch": 0.17, + "grad_norm": 0.7048726081848145, + "learning_rate": 0.0004744003077442825, + "loss": 3.541, + "step": 1336 + }, + { + "epoch": 0.17, + "grad_norm": 0.7939316630363464, + "learning_rate": 0.00047435459621819257, + "loss": 3.6154, + "step": 1337 + }, + { + "epoch": 0.17, + "grad_norm": 0.7555193305015564, + "learning_rate": 0.00047430884612294645, + "loss": 3.4798, + "step": 1338 + }, + { + "epoch": 0.17, + "grad_norm": 0.7547550201416016, + "learning_rate": 0.0004742630574664094, + "loss": 3.4525, + "step": 1339 + }, + { + "epoch": 0.17, + "grad_norm": 0.7709076404571533, + "learning_rate": 0.0004742172302564528, + "loss": 3.4592, + "step": 1340 + }, + { + "epoch": 0.17, + "grad_norm": 0.7275447845458984, + "learning_rate": 0.00047417136450095504, + "loss": 3.6308, + "step": 1341 + }, + { + "epoch": 0.17, + "grad_norm": 0.773053765296936, + "learning_rate": 0.0004741254602078009, + "loss": 3.5169, + "step": 1342 + }, + { + "epoch": 0.17, + "grad_norm": 0.7104327082633972, + "learning_rate": 0.00047407951738488187, + "loss": 3.5841, + "step": 1343 + }, + { + "epoch": 0.17, + "grad_norm": 0.7337343692779541, + "learning_rate": 0.00047403353604009605, + "loss": 3.4557, + "step": 1344 + }, + { + "epoch": 0.17, + "grad_norm": 0.8200117945671082, + "learning_rate": 0.0004739875161813481, + "loss": 3.6261, + "step": 1345 + }, + { + "epoch": 0.17, + "grad_norm": 0.7903836369514465, + "learning_rate": 0.0004739414578165495, + "loss": 3.5732, + "step": 1346 + }, + { + "epoch": 0.17, + "grad_norm": 0.8006786108016968, + "learning_rate": 0.00047389536095361807, + "loss": 3.708, + "step": 1347 + }, + { + "epoch": 0.17, + "grad_norm": 0.7467408776283264, + "learning_rate": 0.00047384922560047855, + "loss": 3.5375, + "step": 1348 + }, + { + "epoch": 0.17, + "grad_norm": 0.7394277453422546, + "learning_rate": 0.00047380305176506203, + "loss": 3.4562, + "step": 1349 + }, + { + "epoch": 0.17, + "grad_norm": 0.7740619778633118, + "learning_rate": 0.0004737568394553064, + "loss": 3.6413, + "step": 1350 + }, + { + "epoch": 0.17, + "grad_norm": 0.7343472242355347, + "learning_rate": 0.00047371058867915606, + "loss": 3.4088, + "step": 1351 + }, + { + "epoch": 0.17, + "grad_norm": 0.7520578503608704, + "learning_rate": 0.000473664299444562, + "loss": 3.6681, + "step": 1352 + }, + { + "epoch": 0.17, + "grad_norm": 0.8136751651763916, + "learning_rate": 0.000473617971759482, + "loss": 3.5463, + "step": 1353 + }, + { + "epoch": 0.17, + "grad_norm": 0.6983349323272705, + "learning_rate": 0.0004735716056318802, + "loss": 3.4922, + "step": 1354 + }, + { + "epoch": 0.17, + "grad_norm": 0.7741418480873108, + "learning_rate": 0.0004735252010697275, + "loss": 3.6106, + "step": 1355 + }, + { + "epoch": 0.17, + "grad_norm": 0.692803144454956, + "learning_rate": 0.00047347875808100145, + "loss": 3.4923, + "step": 1356 + }, + { + "epoch": 0.17, + "grad_norm": 0.7374471426010132, + "learning_rate": 0.00047343227667368605, + "loss": 3.5939, + "step": 1357 + }, + { + "epoch": 0.17, + "grad_norm": 0.7494756579399109, + "learning_rate": 0.0004733857568557721, + "loss": 3.5447, + "step": 1358 + }, + { + "epoch": 0.17, + "grad_norm": 0.7510193586349487, + "learning_rate": 0.0004733391986352568, + "loss": 3.5397, + "step": 1359 + }, + { + "epoch": 0.17, + "grad_norm": 0.7476963996887207, + "learning_rate": 0.0004732926020201441, + "loss": 3.5822, + "step": 1360 + }, + { + "epoch": 0.17, + "grad_norm": 0.7323873043060303, + "learning_rate": 0.0004732459670184445, + "loss": 3.5792, + "step": 1361 + }, + { + "epoch": 0.17, + "grad_norm": 0.7584927678108215, + "learning_rate": 0.000473199293638175, + "loss": 3.5797, + "step": 1362 + }, + { + "epoch": 0.17, + "grad_norm": 0.7634500861167908, + "learning_rate": 0.00047315258188735954, + "loss": 3.4843, + "step": 1363 + }, + { + "epoch": 0.17, + "grad_norm": 0.750939667224884, + "learning_rate": 0.0004731058317740281, + "loss": 3.3346, + "step": 1364 + }, + { + "epoch": 0.17, + "grad_norm": 0.7266926765441895, + "learning_rate": 0.0004730590433062178, + "loss": 3.416, + "step": 1365 + }, + { + "epoch": 0.17, + "grad_norm": 0.722404420375824, + "learning_rate": 0.000473012216491972, + "loss": 3.6912, + "step": 1366 + }, + { + "epoch": 0.17, + "grad_norm": 0.6876066327095032, + "learning_rate": 0.0004729653513393408, + "loss": 3.5736, + "step": 1367 + }, + { + "epoch": 0.18, + "grad_norm": 0.7364122271537781, + "learning_rate": 0.00047291844785638085, + "loss": 3.6518, + "step": 1368 + }, + { + "epoch": 0.18, + "grad_norm": 0.7460653781890869, + "learning_rate": 0.0004728715060511555, + "loss": 3.5476, + "step": 1369 + }, + { + "epoch": 0.18, + "grad_norm": 0.6969454288482666, + "learning_rate": 0.0004728245259317344, + "loss": 3.4781, + "step": 1370 + }, + { + "epoch": 0.18, + "grad_norm": 0.7020079493522644, + "learning_rate": 0.00047277750750619415, + "loss": 3.5167, + "step": 1371 + }, + { + "epoch": 0.18, + "grad_norm": 0.7419220805168152, + "learning_rate": 0.00047273045078261765, + "loss": 3.5804, + "step": 1372 + }, + { + "epoch": 0.18, + "grad_norm": 0.7115669846534729, + "learning_rate": 0.0004726833557690946, + "loss": 3.5391, + "step": 1373 + }, + { + "epoch": 0.18, + "grad_norm": 0.7343298196792603, + "learning_rate": 0.0004726362224737211, + "loss": 3.5557, + "step": 1374 + }, + { + "epoch": 0.18, + "grad_norm": 0.7240042090415955, + "learning_rate": 0.00047258905090459996, + "loss": 3.5377, + "step": 1375 + }, + { + "epoch": 0.18, + "grad_norm": 0.7479480504989624, + "learning_rate": 0.0004725418410698405, + "loss": 3.4328, + "step": 1376 + }, + { + "epoch": 0.18, + "grad_norm": 0.7856737971305847, + "learning_rate": 0.0004724945929775586, + "loss": 3.5831, + "step": 1377 + }, + { + "epoch": 0.18, + "grad_norm": 0.7368362545967102, + "learning_rate": 0.0004724473066358768, + "loss": 3.4749, + "step": 1378 + }, + { + "epoch": 0.18, + "grad_norm": 0.7227674126625061, + "learning_rate": 0.00047239998205292425, + "loss": 3.5272, + "step": 1379 + }, + { + "epoch": 0.18, + "grad_norm": 0.7797960042953491, + "learning_rate": 0.0004723526192368364, + "loss": 3.4572, + "step": 1380 + }, + { + "epoch": 0.18, + "grad_norm": 0.7110636234283447, + "learning_rate": 0.0004723052181957556, + "loss": 3.5097, + "step": 1381 + }, + { + "epoch": 0.18, + "grad_norm": 0.7274758219718933, + "learning_rate": 0.00047225777893783054, + "loss": 3.5489, + "step": 1382 + }, + { + "epoch": 0.18, + "grad_norm": 0.7741771340370178, + "learning_rate": 0.0004722103014712167, + "loss": 3.4818, + "step": 1383 + }, + { + "epoch": 0.18, + "grad_norm": 0.7286615371704102, + "learning_rate": 0.00047216278580407603, + "loss": 3.517, + "step": 1384 + }, + { + "epoch": 0.18, + "grad_norm": 0.7044836282730103, + "learning_rate": 0.00047211523194457683, + "loss": 3.5782, + "step": 1385 + }, + { + "epoch": 0.18, + "grad_norm": 0.7229937314987183, + "learning_rate": 0.0004720676399008943, + "loss": 3.5155, + "step": 1386 + }, + { + "epoch": 0.18, + "grad_norm": 0.7034634351730347, + "learning_rate": 0.00047202000968121007, + "loss": 3.5449, + "step": 1387 + }, + { + "epoch": 0.18, + "grad_norm": 0.6847106218338013, + "learning_rate": 0.00047197234129371234, + "loss": 3.5222, + "step": 1388 + }, + { + "epoch": 0.18, + "grad_norm": 0.7309967279434204, + "learning_rate": 0.00047192463474659576, + "loss": 3.5243, + "step": 1389 + }, + { + "epoch": 0.18, + "grad_norm": 0.7710961103439331, + "learning_rate": 0.0004718768900480617, + "loss": 3.5449, + "step": 1390 + }, + { + "epoch": 0.18, + "grad_norm": 0.7054787278175354, + "learning_rate": 0.00047182910720631804, + "loss": 3.5429, + "step": 1391 + }, + { + "epoch": 0.18, + "grad_norm": 0.732120931148529, + "learning_rate": 0.00047178128622957916, + "loss": 3.5658, + "step": 1392 + }, + { + "epoch": 0.18, + "grad_norm": 0.7226571440696716, + "learning_rate": 0.00047173342712606613, + "loss": 3.5384, + "step": 1393 + }, + { + "epoch": 0.18, + "grad_norm": 0.753541111946106, + "learning_rate": 0.00047168552990400637, + "loss": 3.5268, + "step": 1394 + }, + { + "epoch": 0.18, + "grad_norm": 0.7406083941459656, + "learning_rate": 0.000471637594571634, + "loss": 3.5432, + "step": 1395 + }, + { + "epoch": 0.18, + "grad_norm": 0.7489895224571228, + "learning_rate": 0.0004715896211371897, + "loss": 3.3925, + "step": 1396 + }, + { + "epoch": 0.18, + "grad_norm": 0.742703378200531, + "learning_rate": 0.00047154160960892065, + "loss": 3.4482, + "step": 1397 + }, + { + "epoch": 0.18, + "grad_norm": 0.7691521048545837, + "learning_rate": 0.00047149355999508066, + "loss": 3.6035, + "step": 1398 + }, + { + "epoch": 0.18, + "grad_norm": 0.847844660282135, + "learning_rate": 0.0004714454723039299, + "loss": 3.6594, + "step": 1399 + }, + { + "epoch": 0.18, + "grad_norm": 0.6853086948394775, + "learning_rate": 0.00047139734654373514, + "loss": 3.4047, + "step": 1400 + }, + { + "epoch": 0.18, + "grad_norm": 0.6940163373947144, + "learning_rate": 0.0004713491827227699, + "loss": 3.552, + "step": 1401 + }, + { + "epoch": 0.18, + "grad_norm": 0.7334827780723572, + "learning_rate": 0.00047130098084931406, + "loss": 3.5911, + "step": 1402 + }, + { + "epoch": 0.18, + "grad_norm": 0.7077183723449707, + "learning_rate": 0.00047125274093165405, + "loss": 3.6431, + "step": 1403 + }, + { + "epoch": 0.18, + "grad_norm": 0.7522363066673279, + "learning_rate": 0.00047120446297808287, + "loss": 3.4579, + "step": 1404 + }, + { + "epoch": 0.18, + "grad_norm": 0.6966984868049622, + "learning_rate": 0.00047115614699690014, + "loss": 3.5049, + "step": 1405 + }, + { + "epoch": 0.18, + "grad_norm": 0.7402704358100891, + "learning_rate": 0.0004711077929964118, + "loss": 3.606, + "step": 1406 + }, + { + "epoch": 0.18, + "grad_norm": 0.7326617240905762, + "learning_rate": 0.0004710594009849306, + "loss": 3.5428, + "step": 1407 + }, + { + "epoch": 0.18, + "grad_norm": 0.7809644937515259, + "learning_rate": 0.0004710109709707757, + "loss": 3.6287, + "step": 1408 + }, + { + "epoch": 0.18, + "grad_norm": 0.7565213441848755, + "learning_rate": 0.0004709625029622726, + "loss": 3.3696, + "step": 1409 + }, + { + "epoch": 0.18, + "grad_norm": 0.7164292931556702, + "learning_rate": 0.0004709139969677537, + "loss": 3.3233, + "step": 1410 + }, + { + "epoch": 0.18, + "grad_norm": 0.7206988334655762, + "learning_rate": 0.0004708654529955576, + "loss": 3.6306, + "step": 1411 + }, + { + "epoch": 0.18, + "grad_norm": 0.713064968585968, + "learning_rate": 0.00047081687105402967, + "loss": 3.4532, + "step": 1412 + }, + { + "epoch": 0.18, + "grad_norm": 0.6608602404594421, + "learning_rate": 0.00047076825115152166, + "loss": 3.5389, + "step": 1413 + }, + { + "epoch": 0.18, + "grad_norm": 0.7010859251022339, + "learning_rate": 0.0004707195932963919, + "loss": 3.4688, + "step": 1414 + }, + { + "epoch": 0.18, + "grad_norm": 0.700348973274231, + "learning_rate": 0.00047067089749700534, + "loss": 3.4571, + "step": 1415 + }, + { + "epoch": 0.18, + "grad_norm": 0.6964651346206665, + "learning_rate": 0.00047062216376173315, + "loss": 3.5361, + "step": 1416 + }, + { + "epoch": 0.18, + "grad_norm": 0.7558305859565735, + "learning_rate": 0.0004705733920989534, + "loss": 3.4434, + "step": 1417 + }, + { + "epoch": 0.18, + "grad_norm": 0.784369945526123, + "learning_rate": 0.00047052458251705043, + "loss": 3.6063, + "step": 1418 + }, + { + "epoch": 0.18, + "grad_norm": 0.7479367256164551, + "learning_rate": 0.0004704757350244152, + "loss": 3.5864, + "step": 1419 + }, + { + "epoch": 0.18, + "grad_norm": 0.7332322001457214, + "learning_rate": 0.0004704268496294451, + "loss": 3.5199, + "step": 1420 + }, + { + "epoch": 0.18, + "grad_norm": 0.7303996086120605, + "learning_rate": 0.00047037792634054416, + "loss": 3.5091, + "step": 1421 + }, + { + "epoch": 0.18, + "grad_norm": 0.7379922270774841, + "learning_rate": 0.0004703289651661229, + "loss": 3.5665, + "step": 1422 + }, + { + "epoch": 0.18, + "grad_norm": 0.7322970628738403, + "learning_rate": 0.0004702799661145981, + "loss": 3.5029, + "step": 1423 + }, + { + "epoch": 0.18, + "grad_norm": 0.7706109285354614, + "learning_rate": 0.0004702309291943936, + "loss": 3.632, + "step": 1424 + }, + { + "epoch": 0.18, + "grad_norm": 0.6745412349700928, + "learning_rate": 0.00047018185441393914, + "loss": 3.5001, + "step": 1425 + }, + { + "epoch": 0.18, + "grad_norm": 0.7590019106864929, + "learning_rate": 0.00047013274178167136, + "loss": 3.6288, + "step": 1426 + }, + { + "epoch": 0.18, + "grad_norm": 1.681084156036377, + "learning_rate": 0.00047008359130603326, + "loss": 3.6016, + "step": 1427 + }, + { + "epoch": 0.18, + "grad_norm": 0.7189148664474487, + "learning_rate": 0.00047003440299547437, + "loss": 3.7531, + "step": 1428 + }, + { + "epoch": 0.18, + "grad_norm": 0.6640321016311646, + "learning_rate": 0.0004699851768584508, + "loss": 3.4807, + "step": 1429 + }, + { + "epoch": 0.18, + "grad_norm": 0.7376559376716614, + "learning_rate": 0.0004699359129034251, + "loss": 3.483, + "step": 1430 + }, + { + "epoch": 0.18, + "grad_norm": 0.6923270225524902, + "learning_rate": 0.0004698866111388661, + "loss": 3.5111, + "step": 1431 + }, + { + "epoch": 0.18, + "grad_norm": 0.6715227961540222, + "learning_rate": 0.00046983727157324964, + "loss": 3.5518, + "step": 1432 + }, + { + "epoch": 0.18, + "grad_norm": 0.6784403920173645, + "learning_rate": 0.0004697878942150575, + "loss": 3.4985, + "step": 1433 + }, + { + "epoch": 0.18, + "grad_norm": 0.7522211670875549, + "learning_rate": 0.00046973847907277844, + "loss": 3.4804, + "step": 1434 + }, + { + "epoch": 0.18, + "grad_norm": 0.8737341165542603, + "learning_rate": 0.0004696890261549073, + "loss": 3.4973, + "step": 1435 + }, + { + "epoch": 0.18, + "grad_norm": 0.7290940880775452, + "learning_rate": 0.00046963953546994583, + "loss": 3.4597, + "step": 1436 + }, + { + "epoch": 0.18, + "grad_norm": 1.3438891172409058, + "learning_rate": 0.00046959000702640185, + "loss": 3.5925, + "step": 1437 + }, + { + "epoch": 0.18, + "grad_norm": 0.8091414570808411, + "learning_rate": 0.00046954044083279004, + "loss": 3.4549, + "step": 1438 + }, + { + "epoch": 0.18, + "grad_norm": 0.8499096035957336, + "learning_rate": 0.00046949083689763114, + "loss": 3.6355, + "step": 1439 + }, + { + "epoch": 0.18, + "grad_norm": 0.7301666736602783, + "learning_rate": 0.00046944119522945307, + "loss": 3.4349, + "step": 1440 + }, + { + "epoch": 0.18, + "grad_norm": 0.7843241095542908, + "learning_rate": 0.0004693915158367894, + "loss": 3.4676, + "step": 1441 + }, + { + "epoch": 0.18, + "grad_norm": 0.7359977960586548, + "learning_rate": 0.00046934179872818073, + "loss": 3.5503, + "step": 1442 + }, + { + "epoch": 0.18, + "grad_norm": 0.7981695532798767, + "learning_rate": 0.00046929204391217414, + "loss": 3.4857, + "step": 1443 + }, + { + "epoch": 0.18, + "grad_norm": 0.8104868531227112, + "learning_rate": 0.0004692422513973229, + "loss": 3.5549, + "step": 1444 + }, + { + "epoch": 0.18, + "grad_norm": 0.7411574721336365, + "learning_rate": 0.000469192421192187, + "loss": 3.3795, + "step": 1445 + }, + { + "epoch": 0.19, + "grad_norm": 0.715009868144989, + "learning_rate": 0.00046914255330533273, + "loss": 3.613, + "step": 1446 + }, + { + "epoch": 0.19, + "grad_norm": 0.7242358326911926, + "learning_rate": 0.00046909264774533307, + "loss": 3.537, + "step": 1447 + }, + { + "epoch": 0.19, + "grad_norm": 0.8142811059951782, + "learning_rate": 0.0004690427045207673, + "loss": 3.4606, + "step": 1448 + }, + { + "epoch": 0.19, + "grad_norm": 0.7344624996185303, + "learning_rate": 0.00046899272364022126, + "loss": 3.433, + "step": 1449 + }, + { + "epoch": 0.19, + "grad_norm": 0.7056677937507629, + "learning_rate": 0.0004689427051122873, + "loss": 3.5493, + "step": 1450 + }, + { + "epoch": 0.19, + "grad_norm": 0.6946106553077698, + "learning_rate": 0.00046889264894556406, + "loss": 3.5235, + "step": 1451 + }, + { + "epoch": 0.19, + "grad_norm": 0.7007893323898315, + "learning_rate": 0.00046884255514865694, + "loss": 3.4545, + "step": 1452 + }, + { + "epoch": 0.19, + "grad_norm": 0.6814204454421997, + "learning_rate": 0.00046879242373017746, + "loss": 3.5959, + "step": 1453 + }, + { + "epoch": 0.19, + "grad_norm": 0.7359189987182617, + "learning_rate": 0.0004687422546987439, + "loss": 3.7141, + "step": 1454 + }, + { + "epoch": 0.19, + "grad_norm": 0.7151730060577393, + "learning_rate": 0.00046869204806298094, + "loss": 3.5196, + "step": 1455 + }, + { + "epoch": 0.19, + "grad_norm": 0.7060539126396179, + "learning_rate": 0.0004686418038315196, + "loss": 3.5085, + "step": 1456 + }, + { + "epoch": 0.19, + "grad_norm": 0.6896477341651917, + "learning_rate": 0.00046859152201299736, + "loss": 3.5511, + "step": 1457 + }, + { + "epoch": 0.19, + "grad_norm": 0.731194019317627, + "learning_rate": 0.0004685412026160584, + "loss": 3.437, + "step": 1458 + }, + { + "epoch": 0.19, + "grad_norm": 0.8108443021774292, + "learning_rate": 0.00046849084564935323, + "loss": 3.5826, + "step": 1459 + }, + { + "epoch": 0.19, + "grad_norm": 0.751708447933197, + "learning_rate": 0.00046844045112153865, + "loss": 3.5659, + "step": 1460 + }, + { + "epoch": 0.19, + "grad_norm": 0.7761220932006836, + "learning_rate": 0.0004683900190412782, + "loss": 3.5145, + "step": 1461 + }, + { + "epoch": 0.19, + "grad_norm": 0.8092109560966492, + "learning_rate": 0.0004683395494172417, + "loss": 3.5084, + "step": 1462 + }, + { + "epoch": 0.19, + "grad_norm": 0.8220993876457214, + "learning_rate": 0.0004682890422581054, + "loss": 3.5945, + "step": 1463 + }, + { + "epoch": 0.19, + "grad_norm": 0.7602437734603882, + "learning_rate": 0.0004682384975725522, + "loss": 3.4458, + "step": 1464 + }, + { + "epoch": 0.19, + "grad_norm": 0.6738101840019226, + "learning_rate": 0.0004681879153692711, + "loss": 3.486, + "step": 1465 + }, + { + "epoch": 0.19, + "grad_norm": 0.7281554341316223, + "learning_rate": 0.00046813729565695793, + "loss": 3.7172, + "step": 1466 + }, + { + "epoch": 0.19, + "grad_norm": 0.7399675846099854, + "learning_rate": 0.0004680866384443149, + "loss": 3.6074, + "step": 1467 + }, + { + "epoch": 0.19, + "grad_norm": 0.6513975262641907, + "learning_rate": 0.0004680359437400503, + "loss": 3.3342, + "step": 1468 + }, + { + "epoch": 0.19, + "grad_norm": 0.6898742914199829, + "learning_rate": 0.0004679852115528793, + "loss": 3.4383, + "step": 1469 + }, + { + "epoch": 0.19, + "grad_norm": 0.7081462740898132, + "learning_rate": 0.0004679344418915234, + "loss": 3.5978, + "step": 1470 + }, + { + "epoch": 0.19, + "grad_norm": 0.7182134985923767, + "learning_rate": 0.0004678836347647104, + "loss": 3.5795, + "step": 1471 + }, + { + "epoch": 0.19, + "grad_norm": 0.7415251135826111, + "learning_rate": 0.0004678327901811746, + "loss": 3.5322, + "step": 1472 + }, + { + "epoch": 0.19, + "grad_norm": 0.7213219404220581, + "learning_rate": 0.00046778190814965694, + "loss": 3.5142, + "step": 1473 + }, + { + "epoch": 0.19, + "grad_norm": 0.9378483295440674, + "learning_rate": 0.0004677309886789044, + "loss": 3.5648, + "step": 1474 + }, + { + "epoch": 0.19, + "grad_norm": 0.7091275453567505, + "learning_rate": 0.0004676800317776708, + "loss": 3.5034, + "step": 1475 + }, + { + "epoch": 0.19, + "grad_norm": 0.7190051078796387, + "learning_rate": 0.0004676290374547162, + "loss": 3.4522, + "step": 1476 + }, + { + "epoch": 0.19, + "grad_norm": 0.7434428930282593, + "learning_rate": 0.0004675780057188071, + "loss": 3.4756, + "step": 1477 + }, + { + "epoch": 0.19, + "grad_norm": 0.822962760925293, + "learning_rate": 0.00046752693657871645, + "loss": 3.5878, + "step": 1478 + }, + { + "epoch": 0.19, + "grad_norm": 0.7522849440574646, + "learning_rate": 0.00046747583004322357, + "loss": 3.6562, + "step": 1479 + }, + { + "epoch": 0.19, + "grad_norm": 0.769645631313324, + "learning_rate": 0.0004674246861211143, + "loss": 3.5938, + "step": 1480 + }, + { + "epoch": 0.19, + "grad_norm": 0.8104962706565857, + "learning_rate": 0.0004673735048211809, + "loss": 3.6472, + "step": 1481 + }, + { + "epoch": 0.19, + "grad_norm": 0.7202017903327942, + "learning_rate": 0.00046732228615222203, + "loss": 3.4748, + "step": 1482 + }, + { + "epoch": 0.19, + "grad_norm": 0.7173405289649963, + "learning_rate": 0.00046727103012304274, + "loss": 3.4189, + "step": 1483 + }, + { + "epoch": 0.19, + "grad_norm": 0.7136857509613037, + "learning_rate": 0.00046721973674245453, + "loss": 3.4613, + "step": 1484 + }, + { + "epoch": 0.19, + "grad_norm": 0.7353072762489319, + "learning_rate": 0.00046716840601927534, + "loss": 3.5693, + "step": 1485 + }, + { + "epoch": 0.19, + "grad_norm": 0.7930543422698975, + "learning_rate": 0.00046711703796232954, + "loss": 3.5676, + "step": 1486 + }, + { + "epoch": 0.19, + "grad_norm": 0.6626320481300354, + "learning_rate": 0.0004670656325804479, + "loss": 3.4426, + "step": 1487 + }, + { + "epoch": 0.19, + "grad_norm": 0.7427566647529602, + "learning_rate": 0.0004670141898824676, + "loss": 3.4905, + "step": 1488 + }, + { + "epoch": 0.19, + "grad_norm": 0.7173172831535339, + "learning_rate": 0.0004669627098772321, + "loss": 3.5909, + "step": 1489 + }, + { + "epoch": 0.19, + "grad_norm": 0.768047034740448, + "learning_rate": 0.0004669111925735916, + "loss": 3.4904, + "step": 1490 + }, + { + "epoch": 0.19, + "grad_norm": 0.6736437082290649, + "learning_rate": 0.00046685963798040247, + "loss": 3.4495, + "step": 1491 + }, + { + "epoch": 0.19, + "grad_norm": 0.7259313464164734, + "learning_rate": 0.0004668080461065275, + "loss": 3.5522, + "step": 1492 + }, + { + "epoch": 0.19, + "grad_norm": 0.7390080690383911, + "learning_rate": 0.00046675641696083595, + "loss": 3.5692, + "step": 1493 + }, + { + "epoch": 0.19, + "grad_norm": 0.7455681562423706, + "learning_rate": 0.00046670475055220347, + "loss": 3.5092, + "step": 1494 + }, + { + "epoch": 0.19, + "grad_norm": 0.7226245999336243, + "learning_rate": 0.0004666530468895121, + "loss": 3.4348, + "step": 1495 + }, + { + "epoch": 0.19, + "grad_norm": 0.8105984926223755, + "learning_rate": 0.0004666013059816503, + "loss": 3.5211, + "step": 1496 + }, + { + "epoch": 0.19, + "grad_norm": 0.6822407841682434, + "learning_rate": 0.000466549527837513, + "loss": 3.5263, + "step": 1497 + }, + { + "epoch": 0.19, + "grad_norm": 0.7036212086677551, + "learning_rate": 0.00046649771246600136, + "loss": 3.4583, + "step": 1498 + }, + { + "epoch": 0.19, + "grad_norm": 0.7508665323257446, + "learning_rate": 0.00046644585987602304, + "loss": 3.4535, + "step": 1499 + }, + { + "epoch": 0.19, + "grad_norm": 0.6954589486122131, + "learning_rate": 0.0004663939700764923, + "loss": 3.5332, + "step": 1500 + }, + { + "epoch": 0.19, + "grad_norm": 0.6855521202087402, + "learning_rate": 0.0004663420430763293, + "loss": 3.3302, + "step": 1501 + }, + { + "epoch": 0.19, + "grad_norm": 0.7412664890289307, + "learning_rate": 0.00046629007888446115, + "loss": 3.4485, + "step": 1502 + }, + { + "epoch": 0.19, + "grad_norm": 0.7577595114707947, + "learning_rate": 0.00046623807750982094, + "loss": 3.4485, + "step": 1503 + }, + { + "epoch": 0.19, + "grad_norm": 0.7610722184181213, + "learning_rate": 0.00046618603896134836, + "loss": 3.6569, + "step": 1504 + }, + { + "epoch": 0.19, + "grad_norm": 0.7613515257835388, + "learning_rate": 0.00046613396324798943, + "loss": 3.4656, + "step": 1505 + }, + { + "epoch": 0.19, + "grad_norm": 0.7554030418395996, + "learning_rate": 0.0004660818503786965, + "loss": 3.4672, + "step": 1506 + }, + { + "epoch": 0.19, + "grad_norm": 0.7763833999633789, + "learning_rate": 0.00046602970036242866, + "loss": 3.6245, + "step": 1507 + }, + { + "epoch": 0.19, + "grad_norm": 0.8957259058952332, + "learning_rate": 0.00046597751320815074, + "loss": 3.4625, + "step": 1508 + }, + { + "epoch": 0.19, + "grad_norm": 0.7575466632843018, + "learning_rate": 0.00046592528892483453, + "loss": 3.5558, + "step": 1509 + }, + { + "epoch": 0.19, + "grad_norm": 0.7638049125671387, + "learning_rate": 0.00046587302752145793, + "loss": 3.5342, + "step": 1510 + }, + { + "epoch": 0.19, + "grad_norm": 0.7103486657142639, + "learning_rate": 0.00046582072900700524, + "loss": 3.4839, + "step": 1511 + }, + { + "epoch": 0.19, + "grad_norm": 0.7354874610900879, + "learning_rate": 0.00046576839339046726, + "loss": 3.547, + "step": 1512 + }, + { + "epoch": 0.19, + "grad_norm": 0.6934744715690613, + "learning_rate": 0.00046571602068084107, + "loss": 3.4264, + "step": 1513 + }, + { + "epoch": 0.19, + "grad_norm": 0.7283271551132202, + "learning_rate": 0.00046566361088713016, + "loss": 3.5117, + "step": 1514 + }, + { + "epoch": 0.19, + "grad_norm": 0.738408088684082, + "learning_rate": 0.00046561116401834426, + "loss": 3.4262, + "step": 1515 + }, + { + "epoch": 0.19, + "grad_norm": 0.6994336843490601, + "learning_rate": 0.0004655586800834998, + "loss": 3.3924, + "step": 1516 + }, + { + "epoch": 0.19, + "grad_norm": 0.723712682723999, + "learning_rate": 0.0004655061590916191, + "loss": 3.4571, + "step": 1517 + }, + { + "epoch": 0.19, + "grad_norm": 0.7158104777336121, + "learning_rate": 0.0004654536010517314, + "loss": 3.5852, + "step": 1518 + }, + { + "epoch": 0.19, + "grad_norm": 0.7097963690757751, + "learning_rate": 0.00046540100597287193, + "loss": 3.5034, + "step": 1519 + }, + { + "epoch": 0.19, + "grad_norm": 0.6684868335723877, + "learning_rate": 0.00046534837386408236, + "loss": 3.3736, + "step": 1520 + }, + { + "epoch": 0.19, + "grad_norm": 0.7114806175231934, + "learning_rate": 0.0004652957047344108, + "loss": 3.4636, + "step": 1521 + }, + { + "epoch": 0.19, + "grad_norm": 0.7204274535179138, + "learning_rate": 0.00046524299859291164, + "loss": 3.5365, + "step": 1522 + }, + { + "epoch": 0.19, + "grad_norm": 0.6938864588737488, + "learning_rate": 0.0004651902554486458, + "loss": 3.4385, + "step": 1523 + }, + { + "epoch": 0.2, + "grad_norm": 0.6994484663009644, + "learning_rate": 0.0004651374753106803, + "loss": 3.5652, + "step": 1524 + }, + { + "epoch": 0.2, + "grad_norm": 0.6984156966209412, + "learning_rate": 0.00046508465818808866, + "loss": 3.5164, + "step": 1525 + }, + { + "epoch": 0.2, + "grad_norm": 0.7477048635482788, + "learning_rate": 0.00046503180408995085, + "loss": 3.6011, + "step": 1526 + }, + { + "epoch": 0.2, + "grad_norm": 0.7297093868255615, + "learning_rate": 0.00046497891302535303, + "loss": 3.6558, + "step": 1527 + }, + { + "epoch": 0.2, + "grad_norm": 0.7676709890365601, + "learning_rate": 0.00046492598500338787, + "loss": 3.5184, + "step": 1528 + }, + { + "epoch": 0.2, + "grad_norm": 0.7769712209701538, + "learning_rate": 0.0004648730200331542, + "loss": 3.5287, + "step": 1529 + }, + { + "epoch": 0.2, + "grad_norm": 0.7823746204376221, + "learning_rate": 0.0004648200181237574, + "loss": 3.4073, + "step": 1530 + }, + { + "epoch": 0.2, + "grad_norm": 0.7949132323265076, + "learning_rate": 0.000464766979284309, + "loss": 3.6725, + "step": 1531 + }, + { + "epoch": 0.2, + "grad_norm": 0.7303690910339355, + "learning_rate": 0.0004647139035239272, + "loss": 3.6023, + "step": 1532 + }, + { + "epoch": 0.2, + "grad_norm": 0.7428348064422607, + "learning_rate": 0.0004646607908517361, + "loss": 3.582, + "step": 1533 + }, + { + "epoch": 0.2, + "grad_norm": 0.7381561994552612, + "learning_rate": 0.0004646076412768665, + "loss": 3.6566, + "step": 1534 + }, + { + "epoch": 0.2, + "grad_norm": 0.7015333771705627, + "learning_rate": 0.00046455445480845543, + "loss": 3.5161, + "step": 1535 + }, + { + "epoch": 0.2, + "grad_norm": 0.7148547172546387, + "learning_rate": 0.0004645012314556463, + "loss": 3.5613, + "step": 1536 + }, + { + "epoch": 0.2, + "grad_norm": 0.6725534796714783, + "learning_rate": 0.0004644479712275887, + "loss": 3.4803, + "step": 1537 + }, + { + "epoch": 0.2, + "grad_norm": 0.7428320050239563, + "learning_rate": 0.0004643946741334387, + "loss": 3.53, + "step": 1538 + }, + { + "epoch": 0.2, + "grad_norm": 0.6852494478225708, + "learning_rate": 0.00046434134018235885, + "loss": 3.584, + "step": 1539 + }, + { + "epoch": 0.2, + "grad_norm": 0.717242419719696, + "learning_rate": 0.0004642879693835178, + "loss": 3.6089, + "step": 1540 + }, + { + "epoch": 0.2, + "grad_norm": 0.7618151903152466, + "learning_rate": 0.00046423456174609045, + "loss": 3.5611, + "step": 1541 + }, + { + "epoch": 0.2, + "grad_norm": 0.7496387362480164, + "learning_rate": 0.0004641811172792584, + "loss": 3.5066, + "step": 1542 + }, + { + "epoch": 0.2, + "grad_norm": 0.6788463592529297, + "learning_rate": 0.00046412763599220925, + "loss": 3.6358, + "step": 1543 + }, + { + "epoch": 0.2, + "grad_norm": 0.735870361328125, + "learning_rate": 0.00046407411789413714, + "loss": 3.6166, + "step": 1544 + }, + { + "epoch": 0.2, + "grad_norm": 0.766202986240387, + "learning_rate": 0.0004640205629942423, + "loss": 3.4675, + "step": 1545 + }, + { + "epoch": 0.2, + "grad_norm": 0.7808355093002319, + "learning_rate": 0.00046396697130173165, + "loss": 3.4217, + "step": 1546 + }, + { + "epoch": 0.2, + "grad_norm": 0.7486604452133179, + "learning_rate": 0.0004639133428258181, + "loss": 3.571, + "step": 1547 + }, + { + "epoch": 0.2, + "grad_norm": 0.693970263004303, + "learning_rate": 0.000463859677575721, + "loss": 3.4341, + "step": 1548 + }, + { + "epoch": 0.2, + "grad_norm": 0.7594770193099976, + "learning_rate": 0.00046380597556066607, + "loss": 3.459, + "step": 1549 + }, + { + "epoch": 0.2, + "grad_norm": 0.740746259689331, + "learning_rate": 0.0004637522367898852, + "loss": 3.5711, + "step": 1550 + }, + { + "epoch": 0.2, + "grad_norm": 0.7217738032341003, + "learning_rate": 0.00046369846127261696, + "loss": 3.5348, + "step": 1551 + }, + { + "epoch": 0.2, + "grad_norm": 0.7293241620063782, + "learning_rate": 0.0004636446490181057, + "loss": 3.5425, + "step": 1552 + }, + { + "epoch": 0.2, + "grad_norm": 0.7295405268669128, + "learning_rate": 0.0004635908000356025, + "loss": 3.6256, + "step": 1553 + }, + { + "epoch": 0.2, + "grad_norm": 0.7225673198699951, + "learning_rate": 0.00046353691433436464, + "loss": 3.5693, + "step": 1554 + }, + { + "epoch": 0.2, + "grad_norm": 0.7160700559616089, + "learning_rate": 0.00046348299192365566, + "loss": 3.6359, + "step": 1555 + }, + { + "epoch": 0.2, + "grad_norm": 0.7208033204078674, + "learning_rate": 0.00046342903281274553, + "loss": 3.4644, + "step": 1556 + }, + { + "epoch": 0.2, + "grad_norm": 0.725329577922821, + "learning_rate": 0.00046337503701091026, + "loss": 3.5131, + "step": 1557 + }, + { + "epoch": 0.2, + "grad_norm": 0.66863614320755, + "learning_rate": 0.0004633210045274325, + "loss": 3.5729, + "step": 1558 + }, + { + "epoch": 0.2, + "grad_norm": 0.6695690751075745, + "learning_rate": 0.00046326693537160113, + "loss": 3.5065, + "step": 1559 + }, + { + "epoch": 0.2, + "grad_norm": 0.6696643829345703, + "learning_rate": 0.0004632128295527111, + "loss": 3.5872, + "step": 1560 + }, + { + "epoch": 0.2, + "grad_norm": 0.6737921833992004, + "learning_rate": 0.00046315868708006393, + "loss": 3.3002, + "step": 1561 + }, + { + "epoch": 0.2, + "grad_norm": 0.6874504089355469, + "learning_rate": 0.0004631045079629672, + "loss": 3.4913, + "step": 1562 + }, + { + "epoch": 0.2, + "grad_norm": 0.7577842473983765, + "learning_rate": 0.00046305029221073516, + "loss": 3.5991, + "step": 1563 + }, + { + "epoch": 0.2, + "grad_norm": 0.7580335736274719, + "learning_rate": 0.0004629960398326879, + "loss": 3.541, + "step": 1564 + }, + { + "epoch": 0.2, + "grad_norm": 0.7388274669647217, + "learning_rate": 0.00046294175083815215, + "loss": 3.5442, + "step": 1565 + }, + { + "epoch": 0.2, + "grad_norm": 0.7765889763832092, + "learning_rate": 0.0004628874252364609, + "loss": 3.557, + "step": 1566 + }, + { + "epoch": 0.2, + "grad_norm": 0.758765459060669, + "learning_rate": 0.00046283306303695316, + "loss": 3.5705, + "step": 1567 + }, + { + "epoch": 0.2, + "grad_norm": 0.7454854846000671, + "learning_rate": 0.0004627786642489745, + "loss": 3.5865, + "step": 1568 + }, + { + "epoch": 0.2, + "grad_norm": 0.7236392498016357, + "learning_rate": 0.0004627242288818767, + "loss": 3.4642, + "step": 1569 + }, + { + "epoch": 0.2, + "grad_norm": 0.7050071954727173, + "learning_rate": 0.0004626697569450179, + "loss": 3.3845, + "step": 1570 + }, + { + "epoch": 0.2, + "grad_norm": 0.6823355555534363, + "learning_rate": 0.00046261524844776235, + "loss": 3.4824, + "step": 1571 + }, + { + "epoch": 0.2, + "grad_norm": 0.7080915570259094, + "learning_rate": 0.0004625607033994808, + "loss": 3.6313, + "step": 1572 + }, + { + "epoch": 0.2, + "grad_norm": 0.7229819893836975, + "learning_rate": 0.0004625061218095501, + "loss": 3.4139, + "step": 1573 + }, + { + "epoch": 0.2, + "grad_norm": 0.7221214175224304, + "learning_rate": 0.00046245150368735344, + "loss": 3.5608, + "step": 1574 + }, + { + "epoch": 0.2, + "grad_norm": 0.7289392352104187, + "learning_rate": 0.0004623968490422804, + "loss": 3.5833, + "step": 1575 + }, + { + "epoch": 0.2, + "grad_norm": 0.7396621704101562, + "learning_rate": 0.0004623421578837267, + "loss": 3.4815, + "step": 1576 + }, + { + "epoch": 0.2, + "grad_norm": 0.753093957901001, + "learning_rate": 0.0004622874302210943, + "loss": 3.5491, + "step": 1577 + }, + { + "epoch": 0.2, + "grad_norm": 0.7183561325073242, + "learning_rate": 0.00046223266606379166, + "loss": 3.5169, + "step": 1578 + }, + { + "epoch": 0.2, + "grad_norm": 0.7659851312637329, + "learning_rate": 0.0004621778654212333, + "loss": 3.6561, + "step": 1579 + }, + { + "epoch": 0.2, + "grad_norm": 0.74198317527771, + "learning_rate": 0.00046212302830284015, + "loss": 3.5021, + "step": 1580 + }, + { + "epoch": 0.2, + "grad_norm": 0.7400766015052795, + "learning_rate": 0.0004620681547180392, + "loss": 3.5211, + "step": 1581 + }, + { + "epoch": 0.2, + "grad_norm": 0.7454343438148499, + "learning_rate": 0.00046201324467626405, + "loss": 3.6262, + "step": 1582 + }, + { + "epoch": 0.2, + "grad_norm": 0.6770966649055481, + "learning_rate": 0.0004619582981869542, + "loss": 3.4959, + "step": 1583 + }, + { + "epoch": 0.2, + "grad_norm": 0.6676375269889832, + "learning_rate": 0.00046190331525955566, + "loss": 3.4918, + "step": 1584 + }, + { + "epoch": 0.2, + "grad_norm": 0.7769284248352051, + "learning_rate": 0.0004618482959035206, + "loss": 3.5092, + "step": 1585 + }, + { + "epoch": 0.2, + "grad_norm": 0.7751719951629639, + "learning_rate": 0.0004617932401283076, + "loss": 3.4876, + "step": 1586 + }, + { + "epoch": 0.2, + "grad_norm": 0.6945412755012512, + "learning_rate": 0.0004617381479433813, + "loss": 3.3976, + "step": 1587 + }, + { + "epoch": 0.2, + "grad_norm": 0.803017795085907, + "learning_rate": 0.0004616830193582127, + "loss": 3.4917, + "step": 1588 + }, + { + "epoch": 0.2, + "grad_norm": 0.7180715799331665, + "learning_rate": 0.00046162785438227895, + "loss": 3.4079, + "step": 1589 + }, + { + "epoch": 0.2, + "grad_norm": 0.7560957074165344, + "learning_rate": 0.0004615726530250637, + "loss": 3.5088, + "step": 1590 + }, + { + "epoch": 0.2, + "grad_norm": 0.8058301210403442, + "learning_rate": 0.00046151741529605654, + "loss": 3.5657, + "step": 1591 + }, + { + "epoch": 0.2, + "grad_norm": 0.7559605240821838, + "learning_rate": 0.00046146214120475367, + "loss": 3.635, + "step": 1592 + }, + { + "epoch": 0.2, + "grad_norm": 0.7491931319236755, + "learning_rate": 0.0004614068307606572, + "loss": 3.4993, + "step": 1593 + }, + { + "epoch": 0.2, + "grad_norm": 0.7718519568443298, + "learning_rate": 0.0004613514839732757, + "loss": 3.5777, + "step": 1594 + }, + { + "epoch": 0.2, + "grad_norm": 0.7517576813697815, + "learning_rate": 0.00046129610085212394, + "loss": 3.4151, + "step": 1595 + }, + { + "epoch": 0.2, + "grad_norm": 0.7024547457695007, + "learning_rate": 0.00046124068140672284, + "loss": 3.4908, + "step": 1596 + }, + { + "epoch": 0.2, + "grad_norm": 0.7161441445350647, + "learning_rate": 0.0004611852256465997, + "loss": 3.5818, + "step": 1597 + }, + { + "epoch": 0.2, + "grad_norm": 0.6977832317352295, + "learning_rate": 0.00046112973358128796, + "loss": 3.4198, + "step": 1598 + }, + { + "epoch": 0.2, + "grad_norm": 0.6729937195777893, + "learning_rate": 0.0004610742052203275, + "loss": 3.6251, + "step": 1599 + }, + { + "epoch": 0.2, + "grad_norm": 0.7269434332847595, + "learning_rate": 0.0004610186405732641, + "loss": 3.5543, + "step": 1600 + }, + { + "epoch": 0.2, + "grad_norm": 0.7198590636253357, + "learning_rate": 0.00046096303964965004, + "loss": 3.5553, + "step": 1601 + }, + { + "epoch": 0.21, + "grad_norm": 0.7438438534736633, + "learning_rate": 0.00046090740245904383, + "loss": 3.4807, + "step": 1602 + }, + { + "epoch": 0.21, + "grad_norm": 0.7416555285453796, + "learning_rate": 0.00046085172901101006, + "loss": 3.4353, + "step": 1603 + }, + { + "epoch": 0.21, + "grad_norm": 0.7552011013031006, + "learning_rate": 0.0004607960193151197, + "loss": 3.5533, + "step": 1604 + }, + { + "epoch": 0.21, + "grad_norm": 0.6969509124755859, + "learning_rate": 0.00046074027338094983, + "loss": 3.5417, + "step": 1605 + }, + { + "epoch": 0.21, + "grad_norm": 0.6895467042922974, + "learning_rate": 0.0004606844912180839, + "loss": 3.5665, + "step": 1606 + }, + { + "epoch": 0.21, + "grad_norm": 0.6941189765930176, + "learning_rate": 0.0004606286728361113, + "loss": 3.5748, + "step": 1607 + }, + { + "epoch": 0.21, + "grad_norm": 0.6663969159126282, + "learning_rate": 0.0004605728182446282, + "loss": 3.3967, + "step": 1608 + }, + { + "epoch": 0.21, + "grad_norm": 0.7020400166511536, + "learning_rate": 0.0004605169274532364, + "loss": 3.4237, + "step": 1609 + }, + { + "epoch": 0.21, + "grad_norm": 0.7018489837646484, + "learning_rate": 0.00046046100047154425, + "loss": 3.6572, + "step": 1610 + }, + { + "epoch": 0.21, + "grad_norm": 0.7054951190948486, + "learning_rate": 0.0004604050373091663, + "loss": 3.3447, + "step": 1611 + }, + { + "epoch": 0.21, + "grad_norm": 0.8908802270889282, + "learning_rate": 0.0004603490379757232, + "loss": 3.5154, + "step": 1612 + }, + { + "epoch": 0.21, + "grad_norm": 0.6973745226860046, + "learning_rate": 0.00046029300248084183, + "loss": 3.4249, + "step": 1613 + }, + { + "epoch": 0.21, + "grad_norm": 0.6773630976676941, + "learning_rate": 0.0004602369308341555, + "loss": 3.4243, + "step": 1614 + }, + { + "epoch": 0.21, + "grad_norm": 0.7102717757225037, + "learning_rate": 0.0004601808230453034, + "loss": 3.6138, + "step": 1615 + }, + { + "epoch": 0.21, + "grad_norm": 0.6981760859489441, + "learning_rate": 0.00046012467912393126, + "loss": 3.3881, + "step": 1616 + }, + { + "epoch": 0.21, + "grad_norm": 0.7004696726799011, + "learning_rate": 0.0004600684990796907, + "loss": 3.5152, + "step": 1617 + }, + { + "epoch": 0.21, + "grad_norm": 0.77801513671875, + "learning_rate": 0.00046001228292223993, + "loss": 3.5409, + "step": 1618 + }, + { + "epoch": 0.21, + "grad_norm": 0.7654480338096619, + "learning_rate": 0.00045995603066124305, + "loss": 3.375, + "step": 1619 + }, + { + "epoch": 0.21, + "grad_norm": 0.7013776302337646, + "learning_rate": 0.00045989974230637045, + "loss": 3.4237, + "step": 1620 + }, + { + "epoch": 0.21, + "grad_norm": 0.7696542739868164, + "learning_rate": 0.0004598434178672988, + "loss": 3.6602, + "step": 1621 + }, + { + "epoch": 0.21, + "grad_norm": 0.912253201007843, + "learning_rate": 0.00045978705735371083, + "loss": 3.5991, + "step": 1622 + }, + { + "epoch": 0.21, + "grad_norm": 0.7334491610527039, + "learning_rate": 0.00045973066077529574, + "loss": 3.3882, + "step": 1623 + }, + { + "epoch": 0.21, + "grad_norm": 0.7463875412940979, + "learning_rate": 0.00045967422814174863, + "loss": 3.4675, + "step": 1624 + }, + { + "epoch": 0.21, + "grad_norm": 0.730761706829071, + "learning_rate": 0.0004596177594627709, + "loss": 3.6244, + "step": 1625 + }, + { + "epoch": 0.21, + "grad_norm": 0.6837449669837952, + "learning_rate": 0.00045956125474807034, + "loss": 3.3481, + "step": 1626 + }, + { + "epoch": 0.21, + "grad_norm": 0.722831130027771, + "learning_rate": 0.0004595047140073605, + "loss": 3.546, + "step": 1627 + }, + { + "epoch": 0.21, + "grad_norm": 0.7261539101600647, + "learning_rate": 0.0004594481372503616, + "loss": 3.5072, + "step": 1628 + }, + { + "epoch": 0.21, + "grad_norm": 0.7304078936576843, + "learning_rate": 0.00045939152448679977, + "loss": 3.5156, + "step": 1629 + }, + { + "epoch": 0.21, + "grad_norm": 0.6343042254447937, + "learning_rate": 0.0004593348757264074, + "loss": 3.567, + "step": 1630 + }, + { + "epoch": 0.21, + "grad_norm": 0.7293105125427246, + "learning_rate": 0.0004592781909789231, + "loss": 3.536, + "step": 1631 + }, + { + "epoch": 0.21, + "grad_norm": 0.7170096635818481, + "learning_rate": 0.0004592214702540916, + "loss": 3.5915, + "step": 1632 + }, + { + "epoch": 0.21, + "grad_norm": 0.7097491025924683, + "learning_rate": 0.00045916471356166383, + "loss": 3.5545, + "step": 1633 + }, + { + "epoch": 0.21, + "grad_norm": 0.6664694547653198, + "learning_rate": 0.000459107920911397, + "loss": 3.5604, + "step": 1634 + }, + { + "epoch": 0.21, + "grad_norm": 0.7300134301185608, + "learning_rate": 0.00045905109231305437, + "loss": 3.4362, + "step": 1635 + }, + { + "epoch": 0.21, + "grad_norm": 0.7318281531333923, + "learning_rate": 0.00045899422777640543, + "loss": 3.5118, + "step": 1636 + }, + { + "epoch": 0.21, + "grad_norm": 0.7411096692085266, + "learning_rate": 0.00045893732731122584, + "loss": 3.472, + "step": 1637 + }, + { + "epoch": 0.21, + "grad_norm": 0.6925008893013, + "learning_rate": 0.0004588803909272975, + "loss": 3.4881, + "step": 1638 + }, + { + "epoch": 0.21, + "grad_norm": 0.735426664352417, + "learning_rate": 0.0004588234186344084, + "loss": 3.5375, + "step": 1639 + }, + { + "epoch": 0.21, + "grad_norm": 0.7070428729057312, + "learning_rate": 0.0004587664104423528, + "loss": 3.5516, + "step": 1640 + }, + { + "epoch": 0.21, + "grad_norm": 0.6997687816619873, + "learning_rate": 0.000458709366360931, + "loss": 3.5889, + "step": 1641 + }, + { + "epoch": 0.21, + "grad_norm": 0.709517240524292, + "learning_rate": 0.0004586522863999495, + "loss": 3.6729, + "step": 1642 + }, + { + "epoch": 0.21, + "grad_norm": 0.7886548638343811, + "learning_rate": 0.0004585951705692211, + "loss": 3.6147, + "step": 1643 + }, + { + "epoch": 0.21, + "grad_norm": 0.7466222643852234, + "learning_rate": 0.0004585380188785646, + "loss": 3.4541, + "step": 1644 + }, + { + "epoch": 0.21, + "grad_norm": 0.6836808323860168, + "learning_rate": 0.0004584808313378051, + "loss": 3.4846, + "step": 1645 + }, + { + "epoch": 0.21, + "grad_norm": 0.6720550060272217, + "learning_rate": 0.0004584236079567738, + "loss": 3.6527, + "step": 1646 + }, + { + "epoch": 0.21, + "grad_norm": 0.6817904114723206, + "learning_rate": 0.000458366348745308, + "loss": 3.4742, + "step": 1647 + }, + { + "epoch": 0.21, + "grad_norm": 0.7797605991363525, + "learning_rate": 0.00045830905371325125, + "loss": 3.5033, + "step": 1648 + }, + { + "epoch": 0.21, + "grad_norm": 0.7567112445831299, + "learning_rate": 0.0004582517228704533, + "loss": 3.4196, + "step": 1649 + }, + { + "epoch": 0.21, + "grad_norm": 0.7362318634986877, + "learning_rate": 0.00045819435622676985, + "loss": 3.5975, + "step": 1650 + }, + { + "epoch": 0.21, + "grad_norm": 0.7481383085250854, + "learning_rate": 0.0004581369537920631, + "loss": 3.5421, + "step": 1651 + }, + { + "epoch": 0.21, + "grad_norm": 0.741283118724823, + "learning_rate": 0.000458079515576201, + "loss": 3.6068, + "step": 1652 + }, + { + "epoch": 0.21, + "grad_norm": 0.7184082865715027, + "learning_rate": 0.00045802204158905787, + "loss": 3.4343, + "step": 1653 + }, + { + "epoch": 0.21, + "grad_norm": 0.7001619935035706, + "learning_rate": 0.00045796453184051417, + "loss": 3.5601, + "step": 1654 + }, + { + "epoch": 0.21, + "grad_norm": 0.7675745487213135, + "learning_rate": 0.0004579069863404566, + "loss": 3.5921, + "step": 1655 + }, + { + "epoch": 0.21, + "grad_norm": 0.6656587719917297, + "learning_rate": 0.0004578494050987777, + "loss": 3.5763, + "step": 1656 + }, + { + "epoch": 0.21, + "grad_norm": 0.7041658759117126, + "learning_rate": 0.0004577917881253766, + "loss": 3.5689, + "step": 1657 + }, + { + "epoch": 0.21, + "grad_norm": 0.6746188998222351, + "learning_rate": 0.0004577341354301581, + "loss": 3.4682, + "step": 1658 + }, + { + "epoch": 0.21, + "grad_norm": 0.7306371927261353, + "learning_rate": 0.00045767644702303346, + "loss": 3.5342, + "step": 1659 + }, + { + "epoch": 0.21, + "grad_norm": 0.6800777912139893, + "learning_rate": 0.00045761872291392005, + "loss": 3.3995, + "step": 1660 + }, + { + "epoch": 0.21, + "grad_norm": 0.7372608780860901, + "learning_rate": 0.00045756096311274123, + "loss": 3.4871, + "step": 1661 + }, + { + "epoch": 0.21, + "grad_norm": 0.7147559523582458, + "learning_rate": 0.00045750316762942656, + "loss": 3.4816, + "step": 1662 + }, + { + "epoch": 0.21, + "grad_norm": 0.6870706677436829, + "learning_rate": 0.0004574453364739118, + "loss": 3.3661, + "step": 1663 + }, + { + "epoch": 0.21, + "grad_norm": 0.7589773535728455, + "learning_rate": 0.00045738746965613876, + "loss": 3.5928, + "step": 1664 + }, + { + "epoch": 0.21, + "grad_norm": 0.7578521966934204, + "learning_rate": 0.0004573295671860555, + "loss": 3.5298, + "step": 1665 + }, + { + "epoch": 0.21, + "grad_norm": 0.7245187163352966, + "learning_rate": 0.0004572716290736161, + "loss": 3.4699, + "step": 1666 + }, + { + "epoch": 0.21, + "grad_norm": 0.807602047920227, + "learning_rate": 0.00045721365532878065, + "loss": 3.4792, + "step": 1667 + }, + { + "epoch": 0.21, + "grad_norm": 0.735205352306366, + "learning_rate": 0.0004571556459615157, + "loss": 3.5042, + "step": 1668 + }, + { + "epoch": 0.21, + "grad_norm": 0.7455587983131409, + "learning_rate": 0.00045709760098179363, + "loss": 3.6033, + "step": 1669 + }, + { + "epoch": 0.21, + "grad_norm": 0.7397889494895935, + "learning_rate": 0.00045703952039959306, + "loss": 3.6053, + "step": 1670 + }, + { + "epoch": 0.21, + "grad_norm": 0.6866623163223267, + "learning_rate": 0.00045698140422489875, + "loss": 3.5595, + "step": 1671 + }, + { + "epoch": 0.21, + "grad_norm": 0.6840174198150635, + "learning_rate": 0.00045692325246770156, + "loss": 3.4672, + "step": 1672 + }, + { + "epoch": 0.21, + "grad_norm": 0.7191321849822998, + "learning_rate": 0.0004568650651379984, + "loss": 3.5568, + "step": 1673 + }, + { + "epoch": 0.21, + "grad_norm": 0.6546381115913391, + "learning_rate": 0.0004568068422457923, + "loss": 3.5321, + "step": 1674 + }, + { + "epoch": 0.21, + "grad_norm": 0.6783313155174255, + "learning_rate": 0.0004567485838010925, + "loss": 3.5561, + "step": 1675 + }, + { + "epoch": 0.21, + "grad_norm": 0.7149572968482971, + "learning_rate": 0.00045669028981391434, + "loss": 3.4968, + "step": 1676 + }, + { + "epoch": 0.21, + "grad_norm": 0.754173755645752, + "learning_rate": 0.00045663196029427925, + "loss": 3.4835, + "step": 1677 + }, + { + "epoch": 0.21, + "grad_norm": 0.7751779556274414, + "learning_rate": 0.00045657359525221465, + "loss": 3.4883, + "step": 1678 + }, + { + "epoch": 0.21, + "grad_norm": 0.6891859769821167, + "learning_rate": 0.0004565151946977542, + "loss": 3.507, + "step": 1679 + }, + { + "epoch": 0.22, + "grad_norm": 0.7515348196029663, + "learning_rate": 0.00045645675864093766, + "loss": 3.5647, + "step": 1680 + }, + { + "epoch": 0.22, + "grad_norm": 0.7117422223091125, + "learning_rate": 0.0004563982870918109, + "loss": 3.4871, + "step": 1681 + }, + { + "epoch": 0.22, + "grad_norm": 0.7142488956451416, + "learning_rate": 0.00045633978006042575, + "loss": 3.5224, + "step": 1682 + }, + { + "epoch": 0.22, + "grad_norm": 0.7000136375427246, + "learning_rate": 0.00045628123755684036, + "loss": 3.4883, + "step": 1683 + }, + { + "epoch": 0.22, + "grad_norm": 0.766757071018219, + "learning_rate": 0.0004562226595911188, + "loss": 3.5763, + "step": 1684 + }, + { + "epoch": 0.22, + "grad_norm": 0.7023899555206299, + "learning_rate": 0.0004561640461733313, + "loss": 3.4384, + "step": 1685 + }, + { + "epoch": 0.22, + "grad_norm": 0.7456826567649841, + "learning_rate": 0.0004561053973135543, + "loss": 3.5366, + "step": 1686 + }, + { + "epoch": 0.22, + "grad_norm": 0.7246689200401306, + "learning_rate": 0.00045604671302187, + "loss": 3.6063, + "step": 1687 + }, + { + "epoch": 0.22, + "grad_norm": 0.7071058750152588, + "learning_rate": 0.000455987993308367, + "loss": 3.5176, + "step": 1688 + }, + { + "epoch": 0.22, + "grad_norm": 0.7235510349273682, + "learning_rate": 0.00045592923818314014, + "loss": 3.4324, + "step": 1689 + }, + { + "epoch": 0.22, + "grad_norm": 0.6815816760063171, + "learning_rate": 0.00045587044765628973, + "loss": 3.405, + "step": 1690 + }, + { + "epoch": 0.22, + "grad_norm": 0.7223538756370544, + "learning_rate": 0.0004558116217379228, + "loss": 3.5051, + "step": 1691 + }, + { + "epoch": 0.22, + "grad_norm": 0.698800265789032, + "learning_rate": 0.00045575276043815203, + "loss": 3.5592, + "step": 1692 + }, + { + "epoch": 0.22, + "grad_norm": 0.7396284937858582, + "learning_rate": 0.00045569386376709655, + "loss": 3.4582, + "step": 1693 + }, + { + "epoch": 0.22, + "grad_norm": 0.6998549103736877, + "learning_rate": 0.0004556349317348812, + "loss": 3.4187, + "step": 1694 + }, + { + "epoch": 0.22, + "grad_norm": 0.6666313409805298, + "learning_rate": 0.0004555759643516372, + "loss": 3.546, + "step": 1695 + }, + { + "epoch": 0.22, + "grad_norm": 0.7003994584083557, + "learning_rate": 0.0004555169616275017, + "loss": 3.5365, + "step": 1696 + }, + { + "epoch": 0.22, + "grad_norm": 0.7141129374504089, + "learning_rate": 0.00045545792357261784, + "loss": 3.5452, + "step": 1697 + }, + { + "epoch": 0.22, + "grad_norm": 0.6956498622894287, + "learning_rate": 0.0004553988501971351, + "loss": 3.4388, + "step": 1698 + }, + { + "epoch": 0.22, + "grad_norm": 0.6885257363319397, + "learning_rate": 0.00045533974151120896, + "loss": 3.4795, + "step": 1699 + }, + { + "epoch": 0.22, + "grad_norm": 0.8513539433479309, + "learning_rate": 0.0004552805975250006, + "loss": 3.5629, + "step": 1700 + }, + { + "epoch": 0.22, + "grad_norm": 0.7191634178161621, + "learning_rate": 0.0004552214182486777, + "loss": 3.4614, + "step": 1701 + }, + { + "epoch": 0.22, + "grad_norm": 0.6562285423278809, + "learning_rate": 0.0004551622036924139, + "loss": 3.6592, + "step": 1702 + }, + { + "epoch": 0.22, + "grad_norm": 0.673965334892273, + "learning_rate": 0.0004551029538663889, + "loss": 3.6226, + "step": 1703 + }, + { + "epoch": 0.22, + "grad_norm": 0.7001772522926331, + "learning_rate": 0.00045504366878078826, + "loss": 3.4116, + "step": 1704 + }, + { + "epoch": 0.22, + "grad_norm": 0.6701346635818481, + "learning_rate": 0.0004549843484458041, + "loss": 3.4371, + "step": 1705 + }, + { + "epoch": 0.22, + "grad_norm": 0.659070611000061, + "learning_rate": 0.0004549249928716338, + "loss": 3.4649, + "step": 1706 + }, + { + "epoch": 0.22, + "grad_norm": 0.7089521884918213, + "learning_rate": 0.0004548656020684817, + "loss": 3.4124, + "step": 1707 + }, + { + "epoch": 0.22, + "grad_norm": 0.6980241537094116, + "learning_rate": 0.0004548061760465575, + "loss": 3.4206, + "step": 1708 + }, + { + "epoch": 0.22, + "grad_norm": 0.7338328957557678, + "learning_rate": 0.00045474671481607744, + "loss": 3.4859, + "step": 1709 + }, + { + "epoch": 0.22, + "grad_norm": 0.7270656824111938, + "learning_rate": 0.00045468721838726336, + "loss": 3.5368, + "step": 1710 + }, + { + "epoch": 0.22, + "grad_norm": 0.6947497129440308, + "learning_rate": 0.00045462768677034354, + "loss": 3.5811, + "step": 1711 + }, + { + "epoch": 0.22, + "grad_norm": 0.7697454690933228, + "learning_rate": 0.00045456811997555215, + "loss": 3.47, + "step": 1712 + }, + { + "epoch": 0.22, + "grad_norm": 0.7647615671157837, + "learning_rate": 0.0004545085180131293, + "loss": 3.523, + "step": 1713 + }, + { + "epoch": 0.22, + "grad_norm": 0.7090777158737183, + "learning_rate": 0.0004544488808933214, + "loss": 3.4826, + "step": 1714 + }, + { + "epoch": 0.22, + "grad_norm": 0.6906019449234009, + "learning_rate": 0.0004543892086263807, + "loss": 3.4338, + "step": 1715 + }, + { + "epoch": 0.22, + "grad_norm": 0.709435760974884, + "learning_rate": 0.0004543295012225656, + "loss": 3.3788, + "step": 1716 + }, + { + "epoch": 0.22, + "grad_norm": 0.6874732971191406, + "learning_rate": 0.00045426975869214035, + "loss": 3.5806, + "step": 1717 + }, + { + "epoch": 0.22, + "grad_norm": 0.7035239338874817, + "learning_rate": 0.0004542099810453755, + "loss": 3.5524, + "step": 1718 + }, + { + "epoch": 0.22, + "grad_norm": 0.682473361492157, + "learning_rate": 0.0004541501682925475, + "loss": 3.4603, + "step": 1719 + }, + { + "epoch": 0.22, + "grad_norm": 0.756766676902771, + "learning_rate": 0.0004540903204439389, + "loss": 3.5824, + "step": 1720 + }, + { + "epoch": 0.22, + "grad_norm": 0.716594934463501, + "learning_rate": 0.0004540304375098382, + "loss": 3.4851, + "step": 1721 + }, + { + "epoch": 0.22, + "grad_norm": 0.6676660180091858, + "learning_rate": 0.0004539705195005399, + "loss": 3.4948, + "step": 1722 + }, + { + "epoch": 0.22, + "grad_norm": 0.7688947319984436, + "learning_rate": 0.00045391056642634476, + "loss": 3.3265, + "step": 1723 + }, + { + "epoch": 0.22, + "grad_norm": 0.7121802568435669, + "learning_rate": 0.00045385057829755925, + "loss": 3.4246, + "step": 1724 + }, + { + "epoch": 0.22, + "grad_norm": 0.7320714592933655, + "learning_rate": 0.00045379055512449615, + "loss": 3.5915, + "step": 1725 + }, + { + "epoch": 0.22, + "grad_norm": 0.7020367980003357, + "learning_rate": 0.00045373049691747403, + "loss": 3.585, + "step": 1726 + }, + { + "epoch": 0.22, + "grad_norm": 0.7828790545463562, + "learning_rate": 0.0004536704036868177, + "loss": 3.4396, + "step": 1727 + }, + { + "epoch": 0.22, + "grad_norm": 0.7378258109092712, + "learning_rate": 0.0004536102754428577, + "loss": 3.5462, + "step": 1728 + }, + { + "epoch": 0.22, + "grad_norm": 0.6563218235969543, + "learning_rate": 0.00045355011219593103, + "loss": 3.5116, + "step": 1729 + }, + { + "epoch": 0.22, + "grad_norm": 0.7141712307929993, + "learning_rate": 0.00045348991395638026, + "loss": 3.6652, + "step": 1730 + }, + { + "epoch": 0.22, + "grad_norm": 0.6334406733512878, + "learning_rate": 0.00045342968073455427, + "loss": 3.5444, + "step": 1731 + }, + { + "epoch": 0.22, + "grad_norm": 0.6695120334625244, + "learning_rate": 0.0004533694125408078, + "loss": 3.5061, + "step": 1732 + }, + { + "epoch": 0.22, + "grad_norm": 0.6740787029266357, + "learning_rate": 0.00045330910938550157, + "loss": 3.6269, + "step": 1733 + }, + { + "epoch": 0.22, + "grad_norm": 0.6897600889205933, + "learning_rate": 0.00045324877127900253, + "loss": 3.3207, + "step": 1734 + }, + { + "epoch": 0.22, + "grad_norm": 0.6967800259590149, + "learning_rate": 0.00045318839823168345, + "loss": 3.4868, + "step": 1735 + }, + { + "epoch": 0.22, + "grad_norm": 0.8088453412055969, + "learning_rate": 0.00045312799025392313, + "loss": 3.5164, + "step": 1736 + }, + { + "epoch": 0.22, + "grad_norm": 0.7367091774940491, + "learning_rate": 0.00045306754735610643, + "loss": 3.4248, + "step": 1737 + }, + { + "epoch": 0.22, + "grad_norm": 0.7140554189682007, + "learning_rate": 0.00045300706954862425, + "loss": 3.3806, + "step": 1738 + }, + { + "epoch": 0.22, + "grad_norm": 0.7674543857574463, + "learning_rate": 0.00045294655684187325, + "loss": 3.5559, + "step": 1739 + }, + { + "epoch": 0.22, + "grad_norm": 0.745649516582489, + "learning_rate": 0.00045288600924625643, + "loss": 3.5134, + "step": 1740 + }, + { + "epoch": 0.22, + "grad_norm": 0.7173015475273132, + "learning_rate": 0.00045282542677218255, + "loss": 3.4533, + "step": 1741 + }, + { + "epoch": 0.22, + "grad_norm": 0.6862587928771973, + "learning_rate": 0.00045276480943006646, + "loss": 3.6471, + "step": 1742 + }, + { + "epoch": 0.22, + "grad_norm": 0.5865806341171265, + "learning_rate": 0.00045270415723032897, + "loss": 3.3734, + "step": 1743 + }, + { + "epoch": 0.22, + "grad_norm": 0.682898223400116, + "learning_rate": 0.0004526434701833969, + "loss": 3.6128, + "step": 1744 + }, + { + "epoch": 0.22, + "grad_norm": 0.6636016964912415, + "learning_rate": 0.000452582748299703, + "loss": 3.5103, + "step": 1745 + }, + { + "epoch": 0.22, + "grad_norm": 0.6695670485496521, + "learning_rate": 0.0004525219915896863, + "loss": 3.4878, + "step": 1746 + }, + { + "epoch": 0.22, + "grad_norm": 0.7170305848121643, + "learning_rate": 0.00045246120006379125, + "loss": 3.5729, + "step": 1747 + }, + { + "epoch": 0.22, + "grad_norm": 0.6496245265007019, + "learning_rate": 0.00045240037373246885, + "loss": 3.5039, + "step": 1748 + }, + { + "epoch": 0.22, + "grad_norm": 0.727599561214447, + "learning_rate": 0.0004523395126061757, + "loss": 3.4796, + "step": 1749 + }, + { + "epoch": 0.22, + "grad_norm": 0.6873193383216858, + "learning_rate": 0.00045227861669537475, + "loss": 3.5528, + "step": 1750 + }, + { + "epoch": 0.22, + "grad_norm": 0.7259441614151001, + "learning_rate": 0.0004522176860105345, + "loss": 3.5181, + "step": 1751 + }, + { + "epoch": 0.22, + "grad_norm": 0.7488285303115845, + "learning_rate": 0.00045215672056212966, + "loss": 3.4781, + "step": 1752 + }, + { + "epoch": 0.22, + "grad_norm": 0.7147641777992249, + "learning_rate": 0.00045209572036064105, + "loss": 3.6021, + "step": 1753 + }, + { + "epoch": 0.22, + "grad_norm": 0.760727047920227, + "learning_rate": 0.00045203468541655514, + "loss": 3.4344, + "step": 1754 + }, + { + "epoch": 0.22, + "grad_norm": 0.7726491689682007, + "learning_rate": 0.00045197361574036466, + "loss": 3.4809, + "step": 1755 + }, + { + "epoch": 0.22, + "grad_norm": 0.7099810838699341, + "learning_rate": 0.0004519125113425681, + "loss": 3.5666, + "step": 1756 + }, + { + "epoch": 0.22, + "grad_norm": 0.6434507966041565, + "learning_rate": 0.0004518513722336701, + "loss": 3.4913, + "step": 1757 + }, + { + "epoch": 0.23, + "grad_norm": 0.6790302395820618, + "learning_rate": 0.00045179019842418126, + "loss": 3.5466, + "step": 1758 + }, + { + "epoch": 0.23, + "grad_norm": 0.7132038474082947, + "learning_rate": 0.0004517289899246178, + "loss": 3.4942, + "step": 1759 + }, + { + "epoch": 0.23, + "grad_norm": 0.6917662620544434, + "learning_rate": 0.0004516677467455024, + "loss": 3.5228, + "step": 1760 + }, + { + "epoch": 0.23, + "grad_norm": 0.7276488542556763, + "learning_rate": 0.00045160646889736333, + "loss": 3.491, + "step": 1761 + }, + { + "epoch": 0.23, + "grad_norm": 0.671357274055481, + "learning_rate": 0.00045154515639073513, + "loss": 3.476, + "step": 1762 + }, + { + "epoch": 0.23, + "grad_norm": 0.6840657591819763, + "learning_rate": 0.00045148380923615804, + "loss": 3.4813, + "step": 1763 + }, + { + "epoch": 0.23, + "grad_norm": 0.6774691343307495, + "learning_rate": 0.0004514224274441783, + "loss": 3.46, + "step": 1764 + }, + { + "epoch": 0.23, + "grad_norm": 0.7096502184867859, + "learning_rate": 0.00045136101102534823, + "loss": 3.3175, + "step": 1765 + }, + { + "epoch": 0.23, + "grad_norm": 0.7626500129699707, + "learning_rate": 0.00045129955999022585, + "loss": 3.4774, + "step": 1766 + }, + { + "epoch": 0.23, + "grad_norm": 0.7550767064094543, + "learning_rate": 0.0004512380743493756, + "loss": 3.5936, + "step": 1767 + }, + { + "epoch": 0.23, + "grad_norm": 0.718117892742157, + "learning_rate": 0.00045117655411336735, + "loss": 3.4214, + "step": 1768 + }, + { + "epoch": 0.23, + "grad_norm": 0.7564663887023926, + "learning_rate": 0.00045111499929277723, + "loss": 3.5129, + "step": 1769 + }, + { + "epoch": 0.23, + "grad_norm": 0.7053793668746948, + "learning_rate": 0.0004510534098981872, + "loss": 3.4276, + "step": 1770 + }, + { + "epoch": 0.23, + "grad_norm": 0.7066434621810913, + "learning_rate": 0.0004509917859401852, + "loss": 3.4055, + "step": 1771 + }, + { + "epoch": 0.23, + "grad_norm": 0.6779627799987793, + "learning_rate": 0.0004509301274293651, + "loss": 3.3915, + "step": 1772 + }, + { + "epoch": 0.23, + "grad_norm": 0.6719902157783508, + "learning_rate": 0.00045086843437632673, + "loss": 3.3999, + "step": 1773 + }, + { + "epoch": 0.23, + "grad_norm": 0.6965342164039612, + "learning_rate": 0.0004508067067916758, + "loss": 3.3755, + "step": 1774 + }, + { + "epoch": 0.23, + "grad_norm": 0.7080169320106506, + "learning_rate": 0.000450744944686024, + "loss": 3.602, + "step": 1775 + }, + { + "epoch": 0.23, + "grad_norm": 0.8627835512161255, + "learning_rate": 0.000450683148069989, + "loss": 3.4469, + "step": 1776 + }, + { + "epoch": 0.23, + "grad_norm": 0.7523432374000549, + "learning_rate": 0.00045062131695419434, + "loss": 3.5508, + "step": 1777 + }, + { + "epoch": 0.23, + "grad_norm": 0.7530139088630676, + "learning_rate": 0.00045055945134926944, + "loss": 3.4976, + "step": 1778 + }, + { + "epoch": 0.23, + "grad_norm": 0.7500725388526917, + "learning_rate": 0.00045049755126584987, + "loss": 3.4723, + "step": 1779 + }, + { + "epoch": 0.23, + "grad_norm": 0.7203992009162903, + "learning_rate": 0.00045043561671457677, + "loss": 3.5489, + "step": 1780 + }, + { + "epoch": 0.23, + "grad_norm": 0.7485403418540955, + "learning_rate": 0.0004503736477060976, + "loss": 3.6029, + "step": 1781 + }, + { + "epoch": 0.23, + "grad_norm": 0.65134197473526, + "learning_rate": 0.0004503116442510654, + "loss": 3.5845, + "step": 1782 + }, + { + "epoch": 0.23, + "grad_norm": 0.671821653842926, + "learning_rate": 0.00045024960636013935, + "loss": 3.4996, + "step": 1783 + }, + { + "epoch": 0.23, + "grad_norm": 0.7114441394805908, + "learning_rate": 0.00045018753404398444, + "loss": 3.5538, + "step": 1784 + }, + { + "epoch": 0.23, + "grad_norm": 0.6860888600349426, + "learning_rate": 0.0004501254273132717, + "loss": 3.5501, + "step": 1785 + }, + { + "epoch": 0.23, + "grad_norm": 0.6691362261772156, + "learning_rate": 0.0004500632861786779, + "loss": 3.495, + "step": 1786 + }, + { + "epoch": 0.23, + "grad_norm": 0.6683976054191589, + "learning_rate": 0.00045000111065088597, + "loss": 3.4391, + "step": 1787 + }, + { + "epoch": 0.23, + "grad_norm": 0.6653891801834106, + "learning_rate": 0.00044993890074058443, + "loss": 3.5648, + "step": 1788 + }, + { + "epoch": 0.23, + "grad_norm": 0.6728489995002747, + "learning_rate": 0.000449876656458468, + "loss": 3.4839, + "step": 1789 + }, + { + "epoch": 0.23, + "grad_norm": 0.6840108036994934, + "learning_rate": 0.00044981437781523714, + "loss": 3.4492, + "step": 1790 + }, + { + "epoch": 0.23, + "grad_norm": 0.7206718921661377, + "learning_rate": 0.00044975206482159827, + "loss": 3.4534, + "step": 1791 + }, + { + "epoch": 0.23, + "grad_norm": 0.6660496592521667, + "learning_rate": 0.0004496897174882637, + "loss": 3.4322, + "step": 1792 + }, + { + "epoch": 0.23, + "grad_norm": 0.6936440467834473, + "learning_rate": 0.0004496273358259517, + "loss": 3.5069, + "step": 1793 + }, + { + "epoch": 0.23, + "grad_norm": 0.7100087404251099, + "learning_rate": 0.0004495649198453865, + "loss": 3.5615, + "step": 1794 + }, + { + "epoch": 0.23, + "grad_norm": 0.7237424850463867, + "learning_rate": 0.00044950246955729794, + "loss": 3.6016, + "step": 1795 + }, + { + "epoch": 0.23, + "grad_norm": 0.7479493618011475, + "learning_rate": 0.00044943998497242203, + "loss": 3.5855, + "step": 1796 + }, + { + "epoch": 0.23, + "grad_norm": 0.6842243075370789, + "learning_rate": 0.00044937746610150065, + "loss": 3.3482, + "step": 1797 + }, + { + "epoch": 0.23, + "grad_norm": 0.7530412077903748, + "learning_rate": 0.00044931491295528144, + "loss": 3.5059, + "step": 1798 + }, + { + "epoch": 0.23, + "grad_norm": 0.6601586937904358, + "learning_rate": 0.000449252325544518, + "loss": 3.3957, + "step": 1799 + }, + { + "epoch": 0.23, + "grad_norm": 0.7175249457359314, + "learning_rate": 0.0004491897038799699, + "loss": 3.5889, + "step": 1800 + }, + { + "epoch": 0.23, + "grad_norm": 0.7499895095825195, + "learning_rate": 0.00044912704797240243, + "loss": 3.3806, + "step": 1801 + }, + { + "epoch": 0.23, + "grad_norm": 0.7383548617362976, + "learning_rate": 0.000449064357832587, + "loss": 3.4735, + "step": 1802 + }, + { + "epoch": 0.23, + "grad_norm": 0.6868014335632324, + "learning_rate": 0.00044900163347130073, + "loss": 3.5739, + "step": 1803 + }, + { + "epoch": 0.23, + "grad_norm": 0.7332748770713806, + "learning_rate": 0.0004489388748993266, + "loss": 3.3961, + "step": 1804 + }, + { + "epoch": 0.23, + "grad_norm": 0.695375382900238, + "learning_rate": 0.0004488760821274536, + "loss": 3.4391, + "step": 1805 + }, + { + "epoch": 0.23, + "grad_norm": 0.6780656576156616, + "learning_rate": 0.00044881325516647654, + "loss": 3.5523, + "step": 1806 + }, + { + "epoch": 0.23, + "grad_norm": 0.6870729923248291, + "learning_rate": 0.00044875039402719606, + "loss": 3.4855, + "step": 1807 + }, + { + "epoch": 0.23, + "grad_norm": 0.7325615286827087, + "learning_rate": 0.0004486874987204187, + "loss": 3.5555, + "step": 1808 + }, + { + "epoch": 0.23, + "grad_norm": 0.7024620175361633, + "learning_rate": 0.0004486245692569569, + "loss": 3.4482, + "step": 1809 + }, + { + "epoch": 0.23, + "grad_norm": 0.7054506540298462, + "learning_rate": 0.00044856160564762904, + "loss": 3.4569, + "step": 1810 + }, + { + "epoch": 0.23, + "grad_norm": 0.7186829447746277, + "learning_rate": 0.0004484986079032592, + "loss": 3.4807, + "step": 1811 + }, + { + "epoch": 0.23, + "grad_norm": 0.6961916089057922, + "learning_rate": 0.00044843557603467754, + "loss": 3.4053, + "step": 1812 + }, + { + "epoch": 0.23, + "grad_norm": 0.797313928604126, + "learning_rate": 0.00044837251005271984, + "loss": 3.4249, + "step": 1813 + }, + { + "epoch": 0.23, + "grad_norm": 0.7082807421684265, + "learning_rate": 0.000448309409968228, + "loss": 3.6033, + "step": 1814 + }, + { + "epoch": 0.23, + "grad_norm": 0.7119776606559753, + "learning_rate": 0.00044824627579204953, + "loss": 3.4228, + "step": 1815 + }, + { + "epoch": 0.23, + "grad_norm": 0.724306583404541, + "learning_rate": 0.000448183107535038, + "loss": 3.42, + "step": 1816 + }, + { + "epoch": 0.23, + "grad_norm": 0.7299683690071106, + "learning_rate": 0.0004481199052080527, + "loss": 3.4756, + "step": 1817 + }, + { + "epoch": 0.23, + "grad_norm": 0.7112978100776672, + "learning_rate": 0.0004480566688219589, + "loss": 3.4947, + "step": 1818 + }, + { + "epoch": 0.23, + "grad_norm": 0.6957636475563049, + "learning_rate": 0.0004479933983876278, + "loss": 3.5313, + "step": 1819 + }, + { + "epoch": 0.23, + "grad_norm": 0.6685547232627869, + "learning_rate": 0.000447930093915936, + "loss": 3.4602, + "step": 1820 + }, + { + "epoch": 0.23, + "grad_norm": 0.6561869382858276, + "learning_rate": 0.00044786675541776653, + "loss": 3.4492, + "step": 1821 + }, + { + "epoch": 0.23, + "grad_norm": 0.6853393912315369, + "learning_rate": 0.0004478033829040079, + "loss": 3.5309, + "step": 1822 + }, + { + "epoch": 0.23, + "grad_norm": 0.8650455474853516, + "learning_rate": 0.0004477399763855547, + "loss": 3.4467, + "step": 1823 + }, + { + "epoch": 0.23, + "grad_norm": 0.6917409896850586, + "learning_rate": 0.0004476765358733071, + "loss": 3.3812, + "step": 1824 + }, + { + "epoch": 0.23, + "grad_norm": 0.6957911252975464, + "learning_rate": 0.0004476130613781713, + "loss": 3.5238, + "step": 1825 + }, + { + "epoch": 0.23, + "grad_norm": 0.7351962327957153, + "learning_rate": 0.0004475495529110594, + "loss": 3.6196, + "step": 1826 + }, + { + "epoch": 0.23, + "grad_norm": 0.7193084955215454, + "learning_rate": 0.00044748601048288906, + "loss": 3.4937, + "step": 1827 + }, + { + "epoch": 0.23, + "grad_norm": 0.6810345649719238, + "learning_rate": 0.00044742243410458415, + "loss": 3.5354, + "step": 1828 + }, + { + "epoch": 0.23, + "grad_norm": 0.7194337844848633, + "learning_rate": 0.00044735882378707406, + "loss": 3.4368, + "step": 1829 + }, + { + "epoch": 0.23, + "grad_norm": 0.6895034909248352, + "learning_rate": 0.00044729517954129416, + "loss": 3.4814, + "step": 1830 + }, + { + "epoch": 0.23, + "grad_norm": 0.7407213449478149, + "learning_rate": 0.0004472315013781857, + "loss": 3.5947, + "step": 1831 + }, + { + "epoch": 0.23, + "grad_norm": 0.6990834474563599, + "learning_rate": 0.0004471677893086956, + "loss": 3.5198, + "step": 1832 + }, + { + "epoch": 0.23, + "grad_norm": 0.6894617080688477, + "learning_rate": 0.00044710404334377684, + "loss": 3.3916, + "step": 1833 + }, + { + "epoch": 0.23, + "grad_norm": 0.7195038795471191, + "learning_rate": 0.000447040263494388, + "loss": 3.5726, + "step": 1834 + }, + { + "epoch": 0.23, + "grad_norm": 0.6901001334190369, + "learning_rate": 0.00044697644977149346, + "loss": 3.5009, + "step": 1835 + }, + { + "epoch": 0.24, + "grad_norm": 0.7657378911972046, + "learning_rate": 0.0004469126021860637, + "loss": 3.483, + "step": 1836 + }, + { + "epoch": 0.24, + "grad_norm": 0.6822826862335205, + "learning_rate": 0.0004468487207490749, + "loss": 3.5181, + "step": 1837 + }, + { + "epoch": 0.24, + "grad_norm": 0.7345016598701477, + "learning_rate": 0.00044678480547150886, + "loss": 3.4361, + "step": 1838 + }, + { + "epoch": 0.24, + "grad_norm": 0.7520294189453125, + "learning_rate": 0.00044672085636435346, + "loss": 3.5237, + "step": 1839 + }, + { + "epoch": 0.24, + "grad_norm": 0.7586507797241211, + "learning_rate": 0.0004466568734386023, + "loss": 3.4596, + "step": 1840 + }, + { + "epoch": 0.24, + "grad_norm": 0.6959397196769714, + "learning_rate": 0.00044659285670525464, + "loss": 3.4634, + "step": 1841 + }, + { + "epoch": 0.24, + "grad_norm": 0.708108127117157, + "learning_rate": 0.00044652880617531587, + "loss": 3.4588, + "step": 1842 + }, + { + "epoch": 0.24, + "grad_norm": 0.7102747559547424, + "learning_rate": 0.000446464721859797, + "loss": 3.6206, + "step": 1843 + }, + { + "epoch": 0.24, + "grad_norm": 0.6688615083694458, + "learning_rate": 0.0004464006037697148, + "loss": 3.3673, + "step": 1844 + }, + { + "epoch": 0.24, + "grad_norm": 0.7083567976951599, + "learning_rate": 0.000446336451916092, + "loss": 3.4632, + "step": 1845 + }, + { + "epoch": 0.24, + "grad_norm": 0.655749499797821, + "learning_rate": 0.0004462722663099569, + "loss": 3.4694, + "step": 1846 + }, + { + "epoch": 0.24, + "grad_norm": 0.7156842947006226, + "learning_rate": 0.00044620804696234387, + "loss": 3.4285, + "step": 1847 + }, + { + "epoch": 0.24, + "grad_norm": 0.6834205985069275, + "learning_rate": 0.0004461437938842929, + "loss": 3.5335, + "step": 1848 + }, + { + "epoch": 0.24, + "grad_norm": 0.6689387559890747, + "learning_rate": 0.00044607950708685, + "loss": 3.4063, + "step": 1849 + }, + { + "epoch": 0.24, + "grad_norm": 0.7429822683334351, + "learning_rate": 0.00044601518658106653, + "loss": 3.4659, + "step": 1850 + }, + { + "epoch": 0.24, + "grad_norm": 0.6987894177436829, + "learning_rate": 0.0004459508323780001, + "loss": 3.636, + "step": 1851 + }, + { + "epoch": 0.24, + "grad_norm": 0.6989704370498657, + "learning_rate": 0.00044588644448871395, + "loss": 3.4733, + "step": 1852 + }, + { + "epoch": 0.24, + "grad_norm": 0.69915372133255, + "learning_rate": 0.0004458220229242771, + "loss": 3.5979, + "step": 1853 + }, + { + "epoch": 0.24, + "grad_norm": 0.7444015145301819, + "learning_rate": 0.0004457575676957644, + "loss": 3.5744, + "step": 1854 + }, + { + "epoch": 0.24, + "grad_norm": 0.6756484508514404, + "learning_rate": 0.0004456930788142563, + "loss": 3.4098, + "step": 1855 + }, + { + "epoch": 0.24, + "grad_norm": 0.6732588410377502, + "learning_rate": 0.0004456285562908393, + "loss": 3.4806, + "step": 1856 + }, + { + "epoch": 0.24, + "grad_norm": 0.6970331072807312, + "learning_rate": 0.0004455640001366056, + "loss": 3.6507, + "step": 1857 + }, + { + "epoch": 0.24, + "grad_norm": 0.7336878180503845, + "learning_rate": 0.00044549941036265306, + "loss": 3.6066, + "step": 1858 + }, + { + "epoch": 0.24, + "grad_norm": 0.6298251152038574, + "learning_rate": 0.00044543478698008546, + "loss": 3.4287, + "step": 1859 + }, + { + "epoch": 0.24, + "grad_norm": 0.6941573023796082, + "learning_rate": 0.0004453701300000124, + "loss": 3.5047, + "step": 1860 + }, + { + "epoch": 0.24, + "grad_norm": 0.6953024864196777, + "learning_rate": 0.00044530543943354896, + "loss": 3.521, + "step": 1861 + }, + { + "epoch": 0.24, + "grad_norm": 0.6954164505004883, + "learning_rate": 0.0004452407152918163, + "loss": 3.4325, + "step": 1862 + }, + { + "epoch": 0.24, + "grad_norm": 0.7267298102378845, + "learning_rate": 0.0004451759575859413, + "loss": 3.4498, + "step": 1863 + }, + { + "epoch": 0.24, + "grad_norm": 0.6570194959640503, + "learning_rate": 0.0004451111663270565, + "loss": 3.4265, + "step": 1864 + }, + { + "epoch": 0.24, + "grad_norm": 0.7331037521362305, + "learning_rate": 0.00044504634152630024, + "loss": 3.5847, + "step": 1865 + }, + { + "epoch": 0.24, + "grad_norm": 0.7002983689308167, + "learning_rate": 0.0004449814831948168, + "loss": 3.5015, + "step": 1866 + }, + { + "epoch": 0.24, + "grad_norm": 0.7224550247192383, + "learning_rate": 0.00044491659134375587, + "loss": 3.6215, + "step": 1867 + }, + { + "epoch": 0.24, + "grad_norm": 0.6379685997962952, + "learning_rate": 0.0004448516659842733, + "loss": 3.5124, + "step": 1868 + }, + { + "epoch": 0.24, + "grad_norm": 0.6856171488761902, + "learning_rate": 0.00044478670712753043, + "loss": 3.3486, + "step": 1869 + }, + { + "epoch": 0.24, + "grad_norm": 0.671360969543457, + "learning_rate": 0.0004447217147846944, + "loss": 3.4822, + "step": 1870 + }, + { + "epoch": 0.24, + "grad_norm": 0.7054650783538818, + "learning_rate": 0.0004446566889669382, + "loss": 3.5302, + "step": 1871 + }, + { + "epoch": 0.24, + "grad_norm": 0.7077500820159912, + "learning_rate": 0.00044459162968544055, + "loss": 3.5209, + "step": 1872 + }, + { + "epoch": 0.24, + "grad_norm": 0.7756421566009521, + "learning_rate": 0.00044452653695138585, + "loss": 3.5391, + "step": 1873 + }, + { + "epoch": 0.24, + "grad_norm": 0.7363382577896118, + "learning_rate": 0.00044446141077596424, + "loss": 3.6557, + "step": 1874 + }, + { + "epoch": 0.24, + "grad_norm": 0.6910688877105713, + "learning_rate": 0.00044439625117037183, + "loss": 3.3939, + "step": 1875 + }, + { + "epoch": 0.24, + "grad_norm": 0.6563981771469116, + "learning_rate": 0.0004443310581458102, + "loss": 3.4331, + "step": 1876 + }, + { + "epoch": 0.24, + "grad_norm": 0.6942573189735413, + "learning_rate": 0.00044426583171348666, + "loss": 3.463, + "step": 1877 + }, + { + "epoch": 0.24, + "grad_norm": 0.7408201098442078, + "learning_rate": 0.0004442005718846147, + "loss": 3.5645, + "step": 1878 + }, + { + "epoch": 0.24, + "grad_norm": 0.7020135521888733, + "learning_rate": 0.00044413527867041293, + "loss": 3.5329, + "step": 1879 + }, + { + "epoch": 0.24, + "grad_norm": 0.7310847640037537, + "learning_rate": 0.0004440699520821062, + "loss": 3.4169, + "step": 1880 + }, + { + "epoch": 0.24, + "grad_norm": 0.7448086142539978, + "learning_rate": 0.00044400459213092487, + "loss": 3.4812, + "step": 1881 + }, + { + "epoch": 0.24, + "grad_norm": 0.6619753837585449, + "learning_rate": 0.0004439391988281051, + "loss": 3.5827, + "step": 1882 + }, + { + "epoch": 0.24, + "grad_norm": 0.6850958466529846, + "learning_rate": 0.00044387377218488863, + "loss": 3.4599, + "step": 1883 + }, + { + "epoch": 0.24, + "grad_norm": 0.7675847411155701, + "learning_rate": 0.00044380831221252316, + "loss": 3.4061, + "step": 1884 + }, + { + "epoch": 0.24, + "grad_norm": 0.7381026148796082, + "learning_rate": 0.00044374281892226204, + "loss": 3.4934, + "step": 1885 + }, + { + "epoch": 0.24, + "grad_norm": 0.7035375833511353, + "learning_rate": 0.00044367729232536423, + "loss": 3.7558, + "step": 1886 + }, + { + "epoch": 0.24, + "grad_norm": 0.6353806853294373, + "learning_rate": 0.0004436117324330946, + "loss": 3.4913, + "step": 1887 + }, + { + "epoch": 0.24, + "grad_norm": 0.687185525894165, + "learning_rate": 0.0004435461392567236, + "loss": 3.6783, + "step": 1888 + }, + { + "epoch": 0.24, + "grad_norm": 0.6932180523872375, + "learning_rate": 0.00044348051280752756, + "loss": 3.4155, + "step": 1889 + }, + { + "epoch": 0.24, + "grad_norm": 0.6853898763656616, + "learning_rate": 0.0004434148530967883, + "loss": 3.3687, + "step": 1890 + }, + { + "epoch": 0.24, + "grad_norm": 0.6757237911224365, + "learning_rate": 0.0004433491601357935, + "loss": 3.5026, + "step": 1891 + }, + { + "epoch": 0.24, + "grad_norm": 0.653641402721405, + "learning_rate": 0.0004432834339358367, + "loss": 3.3618, + "step": 1892 + }, + { + "epoch": 0.24, + "grad_norm": 0.6783159971237183, + "learning_rate": 0.00044321767450821673, + "loss": 3.5522, + "step": 1893 + }, + { + "epoch": 0.24, + "grad_norm": 0.6962612867355347, + "learning_rate": 0.0004431518818642386, + "loss": 3.3733, + "step": 1894 + }, + { + "epoch": 0.24, + "grad_norm": 0.7038624286651611, + "learning_rate": 0.0004430860560152128, + "loss": 3.4771, + "step": 1895 + }, + { + "epoch": 0.24, + "grad_norm": 0.6692087650299072, + "learning_rate": 0.00044302019697245546, + "loss": 3.5181, + "step": 1896 + }, + { + "epoch": 0.24, + "grad_norm": 0.6483880877494812, + "learning_rate": 0.0004429543047472886, + "loss": 3.5177, + "step": 1897 + }, + { + "epoch": 0.24, + "grad_norm": 0.687965452671051, + "learning_rate": 0.0004428883793510399, + "loss": 3.4136, + "step": 1898 + }, + { + "epoch": 0.24, + "grad_norm": 0.7006435394287109, + "learning_rate": 0.0004428224207950425, + "loss": 3.4867, + "step": 1899 + }, + { + "epoch": 0.24, + "grad_norm": 0.7762485146522522, + "learning_rate": 0.0004427564290906357, + "loss": 3.4821, + "step": 1900 + }, + { + "epoch": 0.24, + "grad_norm": 0.672616183757782, + "learning_rate": 0.00044269040424916407, + "loss": 3.5393, + "step": 1901 + }, + { + "epoch": 0.24, + "grad_norm": 0.75141841173172, + "learning_rate": 0.00044262434628197805, + "loss": 3.6058, + "step": 1902 + }, + { + "epoch": 0.24, + "grad_norm": 0.6640050411224365, + "learning_rate": 0.00044255825520043393, + "loss": 3.4241, + "step": 1903 + }, + { + "epoch": 0.24, + "grad_norm": 0.6671122908592224, + "learning_rate": 0.00044249213101589323, + "loss": 3.3735, + "step": 1904 + }, + { + "epoch": 0.24, + "grad_norm": 0.6726486086845398, + "learning_rate": 0.0004424259737397238, + "loss": 3.4756, + "step": 1905 + }, + { + "epoch": 0.24, + "grad_norm": 0.6656337380409241, + "learning_rate": 0.00044235978338329863, + "loss": 3.4923, + "step": 1906 + }, + { + "epoch": 0.24, + "grad_norm": 0.6552636623382568, + "learning_rate": 0.0004422935599579967, + "loss": 3.4121, + "step": 1907 + }, + { + "epoch": 0.24, + "grad_norm": 0.7019140720367432, + "learning_rate": 0.00044222730347520255, + "loss": 3.4499, + "step": 1908 + }, + { + "epoch": 0.24, + "grad_norm": 0.6775514483451843, + "learning_rate": 0.0004421610139463064, + "loss": 3.3372, + "step": 1909 + }, + { + "epoch": 0.24, + "grad_norm": 0.6777592301368713, + "learning_rate": 0.0004420946913827043, + "loss": 3.4424, + "step": 1910 + }, + { + "epoch": 0.24, + "grad_norm": 0.7406557202339172, + "learning_rate": 0.00044202833579579776, + "loss": 3.3874, + "step": 1911 + }, + { + "epoch": 0.24, + "grad_norm": 0.6490644216537476, + "learning_rate": 0.0004419619471969941, + "loss": 3.3859, + "step": 1912 + }, + { + "epoch": 0.24, + "grad_norm": 0.7046037316322327, + "learning_rate": 0.00044189552559770635, + "loss": 3.5542, + "step": 1913 + }, + { + "epoch": 0.24, + "grad_norm": 0.681759774684906, + "learning_rate": 0.0004418290710093531, + "loss": 3.4768, + "step": 1914 + }, + { + "epoch": 0.25, + "grad_norm": 0.7443521618843079, + "learning_rate": 0.0004417625834433586, + "loss": 3.6244, + "step": 1915 + }, + { + "epoch": 0.25, + "grad_norm": 0.6745380163192749, + "learning_rate": 0.00044169606291115295, + "loss": 3.3206, + "step": 1916 + }, + { + "epoch": 0.25, + "grad_norm": 0.7358022332191467, + "learning_rate": 0.0004416295094241718, + "loss": 3.3504, + "step": 1917 + }, + { + "epoch": 0.25, + "grad_norm": 0.7005290985107422, + "learning_rate": 0.00044156292299385636, + "loss": 3.5177, + "step": 1918 + }, + { + "epoch": 0.25, + "grad_norm": 0.6522749066352844, + "learning_rate": 0.0004414963036316537, + "loss": 3.504, + "step": 1919 + }, + { + "epoch": 0.25, + "grad_norm": 0.7436257600784302, + "learning_rate": 0.00044142965134901635, + "loss": 3.5254, + "step": 1920 + }, + { + "epoch": 0.25, + "grad_norm": 0.682768702507019, + "learning_rate": 0.0004413629661574028, + "loss": 3.1955, + "step": 1921 + }, + { + "epoch": 0.25, + "grad_norm": 0.715193510055542, + "learning_rate": 0.00044129624806827684, + "loss": 3.4625, + "step": 1922 + }, + { + "epoch": 0.25, + "grad_norm": 0.7119811773300171, + "learning_rate": 0.00044122949709310817, + "loss": 3.5076, + "step": 1923 + }, + { + "epoch": 0.25, + "grad_norm": 0.6985951066017151, + "learning_rate": 0.00044116271324337196, + "loss": 3.4379, + "step": 1924 + }, + { + "epoch": 0.25, + "grad_norm": 0.644829273223877, + "learning_rate": 0.00044109589653054925, + "loss": 3.5178, + "step": 1925 + }, + { + "epoch": 0.25, + "grad_norm": 0.7521076798439026, + "learning_rate": 0.0004410290469661266, + "loss": 3.5462, + "step": 1926 + }, + { + "epoch": 0.25, + "grad_norm": 0.7019145488739014, + "learning_rate": 0.0004409621645615961, + "loss": 3.4109, + "step": 1927 + }, + { + "epoch": 0.25, + "grad_norm": 0.731626033782959, + "learning_rate": 0.0004408952493284557, + "loss": 3.548, + "step": 1928 + }, + { + "epoch": 0.25, + "grad_norm": 0.6920309066772461, + "learning_rate": 0.00044082830127820897, + "loss": 3.5405, + "step": 1929 + }, + { + "epoch": 0.25, + "grad_norm": 0.7317379713058472, + "learning_rate": 0.0004407613204223648, + "loss": 3.4448, + "step": 1930 + }, + { + "epoch": 0.25, + "grad_norm": 0.7225735187530518, + "learning_rate": 0.00044069430677243834, + "loss": 3.4579, + "step": 1931 + }, + { + "epoch": 0.25, + "grad_norm": 0.6989052891731262, + "learning_rate": 0.0004406272603399497, + "loss": 3.4951, + "step": 1932 + }, + { + "epoch": 0.25, + "grad_norm": 0.706935703754425, + "learning_rate": 0.00044056018113642514, + "loss": 3.4935, + "step": 1933 + }, + { + "epoch": 0.25, + "grad_norm": 0.6633912920951843, + "learning_rate": 0.00044049306917339626, + "loss": 3.5158, + "step": 1934 + }, + { + "epoch": 0.25, + "grad_norm": 0.6701186895370483, + "learning_rate": 0.00044042592446240044, + "loss": 3.4929, + "step": 1935 + }, + { + "epoch": 0.25, + "grad_norm": 0.6783208250999451, + "learning_rate": 0.0004403587470149806, + "loss": 3.4407, + "step": 1936 + }, + { + "epoch": 0.25, + "grad_norm": 0.8055753707885742, + "learning_rate": 0.00044029153684268526, + "loss": 3.3577, + "step": 1937 + }, + { + "epoch": 0.25, + "grad_norm": 0.6336120963096619, + "learning_rate": 0.0004402242939570687, + "loss": 3.3685, + "step": 1938 + }, + { + "epoch": 0.25, + "grad_norm": 0.6730412840843201, + "learning_rate": 0.00044015701836969075, + "loss": 3.5368, + "step": 1939 + }, + { + "epoch": 0.25, + "grad_norm": 0.7138683795928955, + "learning_rate": 0.00044008971009211684, + "loss": 3.5457, + "step": 1940 + }, + { + "epoch": 0.25, + "grad_norm": 0.759942889213562, + "learning_rate": 0.0004400223691359181, + "loss": 3.4003, + "step": 1941 + }, + { + "epoch": 0.25, + "grad_norm": 0.752402126789093, + "learning_rate": 0.00043995499551267115, + "loss": 3.4526, + "step": 1942 + }, + { + "epoch": 0.25, + "grad_norm": 0.6543063521385193, + "learning_rate": 0.0004398875892339583, + "loss": 3.5303, + "step": 1943 + }, + { + "epoch": 0.25, + "grad_norm": 0.7147708535194397, + "learning_rate": 0.0004398201503113675, + "loss": 3.4889, + "step": 1944 + }, + { + "epoch": 0.25, + "grad_norm": 0.6864874958992004, + "learning_rate": 0.0004397526787564923, + "loss": 3.5839, + "step": 1945 + }, + { + "epoch": 0.25, + "grad_norm": 0.7960362434387207, + "learning_rate": 0.00043968517458093184, + "loss": 3.5179, + "step": 1946 + }, + { + "epoch": 0.25, + "grad_norm": 0.7662225961685181, + "learning_rate": 0.0004396176377962908, + "loss": 3.5034, + "step": 1947 + }, + { + "epoch": 0.25, + "grad_norm": 0.6254611015319824, + "learning_rate": 0.0004395500684141797, + "loss": 3.3934, + "step": 1948 + }, + { + "epoch": 0.25, + "grad_norm": 0.6561282873153687, + "learning_rate": 0.00043948246644621427, + "loss": 3.5359, + "step": 1949 + }, + { + "epoch": 0.25, + "grad_norm": 0.6546863317489624, + "learning_rate": 0.0004394148319040162, + "loss": 3.464, + "step": 1950 + }, + { + "epoch": 0.25, + "grad_norm": 0.6724408864974976, + "learning_rate": 0.00043934716479921267, + "loss": 3.4396, + "step": 1951 + }, + { + "epoch": 0.25, + "grad_norm": 0.6835522055625916, + "learning_rate": 0.00043927946514343635, + "loss": 3.4584, + "step": 1952 + }, + { + "epoch": 0.25, + "grad_norm": 0.7021266222000122, + "learning_rate": 0.00043921173294832574, + "loss": 3.3387, + "step": 1953 + }, + { + "epoch": 0.25, + "grad_norm": 0.7060502767562866, + "learning_rate": 0.0004391439682255247, + "loss": 3.4091, + "step": 1954 + }, + { + "epoch": 0.25, + "grad_norm": 0.7331516742706299, + "learning_rate": 0.0004390761709866827, + "loss": 3.5226, + "step": 1955 + }, + { + "epoch": 0.25, + "grad_norm": 0.7018919587135315, + "learning_rate": 0.000439008341243455, + "loss": 3.5414, + "step": 1956 + }, + { + "epoch": 0.25, + "grad_norm": 0.8398563861846924, + "learning_rate": 0.00043894047900750225, + "loss": 3.4953, + "step": 1957 + }, + { + "epoch": 0.25, + "grad_norm": 0.6954603791236877, + "learning_rate": 0.0004388725842904908, + "loss": 3.5214, + "step": 1958 + }, + { + "epoch": 0.25, + "grad_norm": 0.6625902056694031, + "learning_rate": 0.00043880465710409243, + "loss": 3.4665, + "step": 1959 + }, + { + "epoch": 0.25, + "grad_norm": 0.6996656060218811, + "learning_rate": 0.0004387366974599848, + "loss": 3.4562, + "step": 1960 + }, + { + "epoch": 0.25, + "grad_norm": 0.7215577363967896, + "learning_rate": 0.0004386687053698508, + "loss": 3.4082, + "step": 1961 + }, + { + "epoch": 0.25, + "grad_norm": 0.7131018042564392, + "learning_rate": 0.0004386006808453792, + "loss": 3.436, + "step": 1962 + }, + { + "epoch": 0.25, + "grad_norm": 0.7280438542366028, + "learning_rate": 0.000438532623898264, + "loss": 3.3886, + "step": 1963 + }, + { + "epoch": 0.25, + "grad_norm": 0.7057799696922302, + "learning_rate": 0.00043846453454020513, + "loss": 3.4871, + "step": 1964 + }, + { + "epoch": 0.25, + "grad_norm": 0.7401341795921326, + "learning_rate": 0.00043839641278290787, + "loss": 3.5752, + "step": 1965 + }, + { + "epoch": 0.25, + "grad_norm": 0.7227087616920471, + "learning_rate": 0.0004383282586380832, + "loss": 3.5417, + "step": 1966 + }, + { + "epoch": 0.25, + "grad_norm": 0.778991162776947, + "learning_rate": 0.0004382600721174477, + "loss": 3.493, + "step": 1967 + }, + { + "epoch": 0.25, + "grad_norm": 0.6657295227050781, + "learning_rate": 0.00043819185323272313, + "loss": 3.4669, + "step": 1968 + }, + { + "epoch": 0.25, + "grad_norm": 0.6682909727096558, + "learning_rate": 0.0004381236019956374, + "loss": 3.3524, + "step": 1969 + }, + { + "epoch": 0.25, + "grad_norm": 0.6612628102302551, + "learning_rate": 0.00043805531841792345, + "loss": 3.5043, + "step": 1970 + }, + { + "epoch": 0.25, + "grad_norm": 0.7476475238800049, + "learning_rate": 0.0004379870025113203, + "loss": 3.5374, + "step": 1971 + }, + { + "epoch": 0.25, + "grad_norm": 0.6901301741600037, + "learning_rate": 0.00043791865428757196, + "loss": 3.3478, + "step": 1972 + }, + { + "epoch": 0.25, + "grad_norm": 0.6781008243560791, + "learning_rate": 0.00043785027375842846, + "loss": 3.4154, + "step": 1973 + }, + { + "epoch": 0.25, + "grad_norm": 0.7976479530334473, + "learning_rate": 0.0004377818609356451, + "loss": 3.4864, + "step": 1974 + }, + { + "epoch": 0.25, + "grad_norm": 0.6746860146522522, + "learning_rate": 0.00043771341583098293, + "loss": 3.3971, + "step": 1975 + }, + { + "epoch": 0.25, + "grad_norm": 0.6861461400985718, + "learning_rate": 0.00043764493845620847, + "loss": 3.5625, + "step": 1976 + }, + { + "epoch": 0.25, + "grad_norm": 0.6744572520256042, + "learning_rate": 0.00043757642882309364, + "loss": 3.4734, + "step": 1977 + }, + { + "epoch": 0.25, + "grad_norm": 0.684846818447113, + "learning_rate": 0.00043750788694341613, + "loss": 3.388, + "step": 1978 + }, + { + "epoch": 0.25, + "grad_norm": 0.6695400476455688, + "learning_rate": 0.0004374393128289591, + "loss": 3.5149, + "step": 1979 + }, + { + "epoch": 0.25, + "grad_norm": 0.6745089292526245, + "learning_rate": 0.0004373707064915112, + "loss": 3.376, + "step": 1980 + }, + { + "epoch": 0.25, + "grad_norm": 0.7593763470649719, + "learning_rate": 0.0004373020679428667, + "loss": 3.5023, + "step": 1981 + }, + { + "epoch": 0.25, + "grad_norm": 0.6825028657913208, + "learning_rate": 0.0004372333971948253, + "loss": 3.5532, + "step": 1982 + }, + { + "epoch": 0.25, + "grad_norm": 0.7344127893447876, + "learning_rate": 0.0004371646942591924, + "loss": 3.3933, + "step": 1983 + }, + { + "epoch": 0.25, + "grad_norm": 0.7311540246009827, + "learning_rate": 0.00043709595914777865, + "loss": 3.3001, + "step": 1984 + }, + { + "epoch": 0.25, + "grad_norm": 0.7440100312232971, + "learning_rate": 0.0004370271918724006, + "loss": 3.5478, + "step": 1985 + }, + { + "epoch": 0.25, + "grad_norm": 0.7262304425239563, + "learning_rate": 0.00043695839244488, + "loss": 3.432, + "step": 1986 + }, + { + "epoch": 0.25, + "grad_norm": 0.7508566379547119, + "learning_rate": 0.00043688956087704434, + "loss": 3.451, + "step": 1987 + }, + { + "epoch": 0.25, + "grad_norm": 0.701371431350708, + "learning_rate": 0.0004368206971807266, + "loss": 3.4946, + "step": 1988 + }, + { + "epoch": 0.25, + "grad_norm": 0.6847150921821594, + "learning_rate": 0.00043675180136776515, + "loss": 3.3828, + "step": 1989 + }, + { + "epoch": 0.25, + "grad_norm": 0.7214049100875854, + "learning_rate": 0.00043668287345000403, + "loss": 3.5557, + "step": 1990 + }, + { + "epoch": 0.25, + "grad_norm": 0.6651790142059326, + "learning_rate": 0.0004366139134392928, + "loss": 3.3798, + "step": 1991 + }, + { + "epoch": 0.25, + "grad_norm": 0.7046433091163635, + "learning_rate": 0.00043654492134748634, + "loss": 3.4396, + "step": 1992 + }, + { + "epoch": 0.26, + "grad_norm": 0.7146850824356079, + "learning_rate": 0.00043647589718644544, + "loss": 3.4823, + "step": 1993 + }, + { + "epoch": 0.26, + "grad_norm": 0.6687988042831421, + "learning_rate": 0.00043640684096803585, + "loss": 3.4075, + "step": 1994 + }, + { + "epoch": 0.26, + "grad_norm": 0.6961211562156677, + "learning_rate": 0.0004363377527041294, + "loss": 3.408, + "step": 1995 + }, + { + "epoch": 0.26, + "grad_norm": 0.7626001238822937, + "learning_rate": 0.00043626863240660296, + "loss": 3.6133, + "step": 1996 + }, + { + "epoch": 0.26, + "grad_norm": 0.6749371886253357, + "learning_rate": 0.0004361994800873392, + "loss": 3.366, + "step": 1997 + }, + { + "epoch": 0.26, + "grad_norm": 0.74397212266922, + "learning_rate": 0.00043613029575822625, + "loss": 3.4656, + "step": 1998 + }, + { + "epoch": 0.26, + "grad_norm": 0.6702881455421448, + "learning_rate": 0.0004360610794311577, + "loss": 3.5005, + "step": 1999 + }, + { + "epoch": 0.26, + "grad_norm": 0.6869195699691772, + "learning_rate": 0.0004359918311180325, + "loss": 3.6151, + "step": 2000 + }, + { + "epoch": 0.26, + "grad_norm": 0.7465648055076599, + "learning_rate": 0.0004359225508307554, + "loss": 3.5636, + "step": 2001 + }, + { + "epoch": 0.26, + "grad_norm": 0.6688586473464966, + "learning_rate": 0.00043585323858123635, + "loss": 3.5731, + "step": 2002 + }, + { + "epoch": 0.26, + "grad_norm": 0.6723427176475525, + "learning_rate": 0.000435783894381391, + "loss": 3.5229, + "step": 2003 + }, + { + "epoch": 0.26, + "grad_norm": 0.6933841109275818, + "learning_rate": 0.0004357145182431405, + "loss": 3.504, + "step": 2004 + }, + { + "epoch": 0.26, + "grad_norm": 0.6604121327400208, + "learning_rate": 0.00043564511017841123, + "loss": 3.3467, + "step": 2005 + }, + { + "epoch": 0.26, + "grad_norm": 0.6695713400840759, + "learning_rate": 0.00043557567019913534, + "loss": 3.5138, + "step": 2006 + }, + { + "epoch": 0.26, + "grad_norm": 0.7155663371086121, + "learning_rate": 0.00043550619831725037, + "loss": 3.4378, + "step": 2007 + }, + { + "epoch": 0.26, + "grad_norm": 0.7241122722625732, + "learning_rate": 0.00043543669454469935, + "loss": 3.391, + "step": 2008 + }, + { + "epoch": 0.26, + "grad_norm": 0.6676546335220337, + "learning_rate": 0.00043536715889343084, + "loss": 3.4446, + "step": 2009 + }, + { + "epoch": 0.26, + "grad_norm": 0.7022714614868164, + "learning_rate": 0.0004352975913753987, + "loss": 3.4875, + "step": 2010 + }, + { + "epoch": 0.26, + "grad_norm": 0.6732816696166992, + "learning_rate": 0.0004352279920025624, + "loss": 3.3945, + "step": 2011 + }, + { + "epoch": 0.26, + "grad_norm": 0.7686198949813843, + "learning_rate": 0.00043515836078688696, + "loss": 3.3351, + "step": 2012 + }, + { + "epoch": 0.26, + "grad_norm": 0.7455644011497498, + "learning_rate": 0.00043508869774034275, + "loss": 3.5383, + "step": 2013 + }, + { + "epoch": 0.26, + "grad_norm": 0.7511581778526306, + "learning_rate": 0.0004350190028749057, + "loss": 3.4616, + "step": 2014 + }, + { + "epoch": 0.26, + "grad_norm": 0.6484218835830688, + "learning_rate": 0.00043494927620255715, + "loss": 3.4581, + "step": 2015 + }, + { + "epoch": 0.26, + "grad_norm": 0.7067961096763611, + "learning_rate": 0.0004348795177352839, + "loss": 3.4788, + "step": 2016 + }, + { + "epoch": 0.26, + "grad_norm": 0.6927537322044373, + "learning_rate": 0.0004348097274850782, + "loss": 3.4899, + "step": 2017 + }, + { + "epoch": 0.26, + "grad_norm": 0.6653082370758057, + "learning_rate": 0.00043473990546393784, + "loss": 3.5604, + "step": 2018 + }, + { + "epoch": 0.26, + "grad_norm": 0.7367866039276123, + "learning_rate": 0.0004346700516838661, + "loss": 3.4888, + "step": 2019 + }, + { + "epoch": 0.26, + "grad_norm": 0.7152118682861328, + "learning_rate": 0.0004346001661568715, + "loss": 3.5531, + "step": 2020 + }, + { + "epoch": 0.26, + "grad_norm": 0.6760360598564148, + "learning_rate": 0.0004345302488949684, + "loss": 3.4032, + "step": 2021 + }, + { + "epoch": 0.26, + "grad_norm": 0.6933254599571228, + "learning_rate": 0.0004344602999101761, + "loss": 3.5824, + "step": 2022 + }, + { + "epoch": 0.26, + "grad_norm": 0.6791170239448547, + "learning_rate": 0.00043439031921451994, + "loss": 3.299, + "step": 2023 + }, + { + "epoch": 0.26, + "grad_norm": 0.6987645030021667, + "learning_rate": 0.0004343203068200302, + "loss": 3.4105, + "step": 2024 + }, + { + "epoch": 0.26, + "grad_norm": 0.6751917004585266, + "learning_rate": 0.0004342502627387429, + "loss": 3.4956, + "step": 2025 + }, + { + "epoch": 0.26, + "grad_norm": 0.7118316888809204, + "learning_rate": 0.00043418018698269945, + "loss": 3.4484, + "step": 2026 + }, + { + "epoch": 0.26, + "grad_norm": 0.7071229815483093, + "learning_rate": 0.00043411007956394666, + "loss": 3.3642, + "step": 2027 + }, + { + "epoch": 0.26, + "grad_norm": 0.6810958981513977, + "learning_rate": 0.0004340399404945368, + "loss": 3.5119, + "step": 2028 + }, + { + "epoch": 0.26, + "grad_norm": 0.7358177900314331, + "learning_rate": 0.0004339697697865276, + "loss": 3.4273, + "step": 2029 + }, + { + "epoch": 0.26, + "grad_norm": 0.7156998515129089, + "learning_rate": 0.0004338995674519822, + "loss": 3.5882, + "step": 2030 + }, + { + "epoch": 0.26, + "grad_norm": 0.7202603220939636, + "learning_rate": 0.00043382933350296916, + "loss": 3.5176, + "step": 2031 + }, + { + "epoch": 0.26, + "grad_norm": 0.7155426740646362, + "learning_rate": 0.0004337590679515626, + "loss": 3.4288, + "step": 2032 + }, + { + "epoch": 0.26, + "grad_norm": 0.7273238301277161, + "learning_rate": 0.00043368877080984195, + "loss": 3.5389, + "step": 2033 + }, + { + "epoch": 0.26, + "grad_norm": 0.7046866416931152, + "learning_rate": 0.0004336184420898921, + "loss": 3.4743, + "step": 2034 + }, + { + "epoch": 0.26, + "grad_norm": 0.724859356880188, + "learning_rate": 0.00043354808180380333, + "loss": 3.4958, + "step": 2035 + }, + { + "epoch": 0.26, + "grad_norm": 0.7297008633613586, + "learning_rate": 0.0004334776899636714, + "loss": 3.5521, + "step": 2036 + }, + { + "epoch": 0.26, + "grad_norm": 0.7214579582214355, + "learning_rate": 0.00043340726658159764, + "loss": 3.3943, + "step": 2037 + }, + { + "epoch": 0.26, + "grad_norm": 0.682220458984375, + "learning_rate": 0.0004333368116696884, + "loss": 3.4364, + "step": 2038 + }, + { + "epoch": 0.26, + "grad_norm": 0.6749826073646545, + "learning_rate": 0.00043326632524005583, + "loss": 3.5132, + "step": 2039 + }, + { + "epoch": 0.26, + "grad_norm": 0.6864546537399292, + "learning_rate": 0.0004331958073048174, + "loss": 3.568, + "step": 2040 + }, + { + "epoch": 0.26, + "grad_norm": 0.7034383416175842, + "learning_rate": 0.0004331252578760959, + "loss": 3.5975, + "step": 2041 + }, + { + "epoch": 0.26, + "grad_norm": 0.7162829637527466, + "learning_rate": 0.00043305467696601953, + "loss": 3.4938, + "step": 2042 + }, + { + "epoch": 0.26, + "grad_norm": 0.6995472311973572, + "learning_rate": 0.00043298406458672207, + "loss": 3.5466, + "step": 2043 + }, + { + "epoch": 0.26, + "grad_norm": 0.672080397605896, + "learning_rate": 0.00043291342075034255, + "loss": 3.3307, + "step": 2044 + }, + { + "epoch": 0.26, + "grad_norm": 0.6504019498825073, + "learning_rate": 0.00043284274546902555, + "loss": 3.497, + "step": 2045 + }, + { + "epoch": 0.26, + "grad_norm": 0.6735064387321472, + "learning_rate": 0.00043277203875492087, + "loss": 3.4493, + "step": 2046 + }, + { + "epoch": 0.26, + "grad_norm": 0.7060967087745667, + "learning_rate": 0.0004327013006201839, + "loss": 3.554, + "step": 2047 + }, + { + "epoch": 0.26, + "grad_norm": 0.6433796882629395, + "learning_rate": 0.00043263053107697524, + "loss": 3.5399, + "step": 2048 + }, + { + "epoch": 0.26, + "grad_norm": 0.6759264469146729, + "learning_rate": 0.0004325597301374611, + "loss": 3.5371, + "step": 2049 + }, + { + "epoch": 0.26, + "grad_norm": 0.6785297393798828, + "learning_rate": 0.00043248889781381285, + "loss": 3.4896, + "step": 2050 + }, + { + "epoch": 0.26, + "grad_norm": 0.6828736662864685, + "learning_rate": 0.00043241803411820756, + "loss": 3.3398, + "step": 2051 + }, + { + "epoch": 0.26, + "grad_norm": 0.699984610080719, + "learning_rate": 0.0004323471390628274, + "loss": 3.3404, + "step": 2052 + }, + { + "epoch": 0.26, + "grad_norm": 0.6868505477905273, + "learning_rate": 0.00043227621265986004, + "loss": 3.4911, + "step": 2053 + }, + { + "epoch": 0.26, + "grad_norm": 0.6925286054611206, + "learning_rate": 0.0004322052549214987, + "loss": 3.3701, + "step": 2054 + }, + { + "epoch": 0.26, + "grad_norm": 0.7424176931381226, + "learning_rate": 0.0004321342658599416, + "loss": 3.4955, + "step": 2055 + }, + { + "epoch": 0.26, + "grad_norm": 0.6669638156890869, + "learning_rate": 0.0004320632454873929, + "loss": 3.3172, + "step": 2056 + }, + { + "epoch": 0.26, + "grad_norm": 0.6610919237136841, + "learning_rate": 0.00043199219381606153, + "loss": 3.5237, + "step": 2057 + }, + { + "epoch": 0.26, + "grad_norm": 0.6433104276657104, + "learning_rate": 0.0004319211108581622, + "loss": 3.4581, + "step": 2058 + }, + { + "epoch": 0.26, + "grad_norm": 0.6824292540550232, + "learning_rate": 0.000431849996625915, + "loss": 3.5064, + "step": 2059 + }, + { + "epoch": 0.26, + "grad_norm": 0.6911894679069519, + "learning_rate": 0.00043177885113154503, + "loss": 3.4415, + "step": 2060 + }, + { + "epoch": 0.26, + "grad_norm": 0.7084999084472656, + "learning_rate": 0.0004317076743872833, + "loss": 3.4936, + "step": 2061 + }, + { + "epoch": 0.26, + "grad_norm": 0.7088779211044312, + "learning_rate": 0.0004316364664053658, + "loss": 3.4208, + "step": 2062 + }, + { + "epoch": 0.26, + "grad_norm": 0.6912571787834167, + "learning_rate": 0.000431565227198034, + "loss": 3.5643, + "step": 2063 + }, + { + "epoch": 0.26, + "grad_norm": 0.6518218517303467, + "learning_rate": 0.0004314939567775347, + "loss": 3.4999, + "step": 2064 + }, + { + "epoch": 0.26, + "grad_norm": 2.758113384246826, + "learning_rate": 0.0004314226551561202, + "loss": 3.4841, + "step": 2065 + }, + { + "epoch": 0.26, + "grad_norm": 0.6662098169326782, + "learning_rate": 0.00043135132234604814, + "loss": 3.4227, + "step": 2066 + }, + { + "epoch": 0.26, + "grad_norm": 0.7417008876800537, + "learning_rate": 0.0004312799583595813, + "loss": 3.5523, + "step": 2067 + }, + { + "epoch": 0.26, + "grad_norm": 0.7159901857376099, + "learning_rate": 0.00043120856320898806, + "loss": 3.4275, + "step": 2068 + }, + { + "epoch": 0.26, + "grad_norm": 0.7324928045272827, + "learning_rate": 0.000431137136906542, + "loss": 3.5198, + "step": 2069 + }, + { + "epoch": 0.26, + "grad_norm": 0.7330513000488281, + "learning_rate": 0.00043106567946452225, + "loss": 3.6361, + "step": 2070 + }, + { + "epoch": 0.27, + "grad_norm": 0.7240403294563293, + "learning_rate": 0.0004309941908952131, + "loss": 3.4916, + "step": 2071 + }, + { + "epoch": 0.27, + "grad_norm": 0.7005897760391235, + "learning_rate": 0.0004309226712109042, + "loss": 3.4402, + "step": 2072 + }, + { + "epoch": 0.27, + "grad_norm": 0.7218866348266602, + "learning_rate": 0.00043085112042389075, + "loss": 3.5906, + "step": 2073 + }, + { + "epoch": 0.27, + "grad_norm": 0.7103626132011414, + "learning_rate": 0.000430779538546473, + "loss": 3.5515, + "step": 2074 + }, + { + "epoch": 0.27, + "grad_norm": 0.7686103582382202, + "learning_rate": 0.0004307079255909569, + "loss": 3.4956, + "step": 2075 + }, + { + "epoch": 0.27, + "grad_norm": 0.7192513942718506, + "learning_rate": 0.0004306362815696534, + "loss": 3.5604, + "step": 2076 + }, + { + "epoch": 0.27, + "grad_norm": 0.6870089173316956, + "learning_rate": 0.00043056460649487904, + "loss": 3.4542, + "step": 2077 + }, + { + "epoch": 0.27, + "grad_norm": 0.7580736875534058, + "learning_rate": 0.0004304929003789555, + "loss": 3.4443, + "step": 2078 + }, + { + "epoch": 0.27, + "grad_norm": 0.6816487908363342, + "learning_rate": 0.0004304211632342099, + "loss": 3.3623, + "step": 2079 + }, + { + "epoch": 0.27, + "grad_norm": 0.724460780620575, + "learning_rate": 0.0004303493950729748, + "loss": 3.3841, + "step": 2080 + }, + { + "epoch": 0.27, + "grad_norm": 0.719296395778656, + "learning_rate": 0.0004302775959075878, + "loss": 3.3878, + "step": 2081 + }, + { + "epoch": 0.27, + "grad_norm": 0.6661654114723206, + "learning_rate": 0.00043020576575039215, + "loss": 3.4579, + "step": 2082 + }, + { + "epoch": 0.27, + "grad_norm": 0.6661612391471863, + "learning_rate": 0.00043013390461373626, + "loss": 3.4368, + "step": 2083 + }, + { + "epoch": 0.27, + "grad_norm": 0.683428168296814, + "learning_rate": 0.0004300620125099738, + "loss": 3.4621, + "step": 2084 + }, + { + "epoch": 0.27, + "grad_norm": 0.7038209438323975, + "learning_rate": 0.000429990089451464, + "loss": 3.5481, + "step": 2085 + }, + { + "epoch": 0.27, + "grad_norm": 0.6688196659088135, + "learning_rate": 0.0004299181354505712, + "loss": 3.4501, + "step": 2086 + }, + { + "epoch": 0.27, + "grad_norm": 0.6909682154655457, + "learning_rate": 0.00042984615051966515, + "loss": 3.4476, + "step": 2087 + }, + { + "epoch": 0.27, + "grad_norm": 0.70419842004776, + "learning_rate": 0.00042977413467112084, + "loss": 3.6017, + "step": 2088 + }, + { + "epoch": 0.27, + "grad_norm": 0.8000498414039612, + "learning_rate": 0.00042970208791731857, + "loss": 3.413, + "step": 2089 + }, + { + "epoch": 0.27, + "grad_norm": 0.6451414227485657, + "learning_rate": 0.00042963001027064416, + "loss": 3.5047, + "step": 2090 + }, + { + "epoch": 0.27, + "grad_norm": 0.7211129069328308, + "learning_rate": 0.00042955790174348864, + "loss": 3.5806, + "step": 2091 + }, + { + "epoch": 0.27, + "grad_norm": 0.7346986532211304, + "learning_rate": 0.0004294857623482481, + "loss": 3.502, + "step": 2092 + }, + { + "epoch": 0.27, + "grad_norm": 0.7390275597572327, + "learning_rate": 0.0004294135920973242, + "loss": 3.4617, + "step": 2093 + }, + { + "epoch": 0.27, + "grad_norm": 0.7164575457572937, + "learning_rate": 0.000429341391003124, + "loss": 3.5179, + "step": 2094 + }, + { + "epoch": 0.27, + "grad_norm": 0.7850090861320496, + "learning_rate": 0.0004292691590780595, + "loss": 3.5023, + "step": 2095 + }, + { + "epoch": 0.27, + "grad_norm": 0.7475020289421082, + "learning_rate": 0.00042919689633454827, + "loss": 3.4549, + "step": 2096 + }, + { + "epoch": 0.27, + "grad_norm": 0.7047721147537231, + "learning_rate": 0.0004291246027850132, + "loss": 3.4779, + "step": 2097 + }, + { + "epoch": 0.27, + "grad_norm": 0.656097948551178, + "learning_rate": 0.00042905227844188226, + "loss": 3.3747, + "step": 2098 + }, + { + "epoch": 0.27, + "grad_norm": 0.6438138484954834, + "learning_rate": 0.00042897992331758896, + "loss": 3.5208, + "step": 2099 + }, + { + "epoch": 0.27, + "grad_norm": 0.6644891500473022, + "learning_rate": 0.0004289075374245719, + "loss": 3.4167, + "step": 2100 + }, + { + "epoch": 0.27, + "grad_norm": 0.6867865324020386, + "learning_rate": 0.00042883512077527506, + "loss": 3.615, + "step": 2101 + }, + { + "epoch": 0.27, + "grad_norm": 0.7456568479537964, + "learning_rate": 0.0004287626733821477, + "loss": 3.4706, + "step": 2102 + }, + { + "epoch": 0.27, + "grad_norm": 0.7014803290367126, + "learning_rate": 0.0004286901952576445, + "loss": 3.5279, + "step": 2103 + }, + { + "epoch": 0.27, + "grad_norm": 0.6818939447402954, + "learning_rate": 0.00042861768641422505, + "loss": 3.4957, + "step": 2104 + }, + { + "epoch": 0.27, + "grad_norm": 0.6658956408500671, + "learning_rate": 0.0004285451468643546, + "loss": 3.5563, + "step": 2105 + }, + { + "epoch": 0.27, + "grad_norm": 0.6703715920448303, + "learning_rate": 0.0004284725766205037, + "loss": 3.45, + "step": 2106 + }, + { + "epoch": 0.27, + "grad_norm": 0.697333574295044, + "learning_rate": 0.00042839997569514766, + "loss": 3.4967, + "step": 2107 + }, + { + "epoch": 0.27, + "grad_norm": 0.6819553375244141, + "learning_rate": 0.0004283273441007677, + "loss": 3.2826, + "step": 2108 + }, + { + "epoch": 0.27, + "grad_norm": 0.7194439768791199, + "learning_rate": 0.00042825468184984995, + "loss": 3.4084, + "step": 2109 + }, + { + "epoch": 0.27, + "grad_norm": 0.7378134727478027, + "learning_rate": 0.0004281819889548858, + "loss": 3.4858, + "step": 2110 + }, + { + "epoch": 0.27, + "grad_norm": 0.7612019777297974, + "learning_rate": 0.00042810926542837213, + "loss": 3.56, + "step": 2111 + }, + { + "epoch": 0.27, + "grad_norm": 0.6910973191261292, + "learning_rate": 0.000428036511282811, + "loss": 3.4338, + "step": 2112 + }, + { + "epoch": 0.27, + "grad_norm": 0.7197093963623047, + "learning_rate": 0.00042796372653070946, + "loss": 3.5613, + "step": 2113 + }, + { + "epoch": 0.27, + "grad_norm": 0.673733651638031, + "learning_rate": 0.00042789091118458037, + "loss": 3.4107, + "step": 2114 + }, + { + "epoch": 0.27, + "grad_norm": 0.6490559577941895, + "learning_rate": 0.00042781806525694124, + "loss": 3.5269, + "step": 2115 + }, + { + "epoch": 0.27, + "grad_norm": 0.6586740612983704, + "learning_rate": 0.0004277451887603152, + "loss": 3.4372, + "step": 2116 + }, + { + "epoch": 0.27, + "grad_norm": 0.6464582085609436, + "learning_rate": 0.0004276722817072307, + "loss": 3.3763, + "step": 2117 + }, + { + "epoch": 0.27, + "grad_norm": 0.7137161493301392, + "learning_rate": 0.0004275993441102212, + "loss": 3.4224, + "step": 2118 + }, + { + "epoch": 0.27, + "grad_norm": 0.6621758341789246, + "learning_rate": 0.00042752637598182555, + "loss": 3.4509, + "step": 2119 + }, + { + "epoch": 0.27, + "grad_norm": 1.6240204572677612, + "learning_rate": 0.0004274533773345878, + "loss": 3.4812, + "step": 2120 + }, + { + "epoch": 0.27, + "grad_norm": 0.6391758918762207, + "learning_rate": 0.0004273803481810573, + "loss": 3.4246, + "step": 2121 + }, + { + "epoch": 0.27, + "grad_norm": 0.702078104019165, + "learning_rate": 0.0004273072885337885, + "loss": 3.5051, + "step": 2122 + }, + { + "epoch": 0.27, + "grad_norm": 0.767562985420227, + "learning_rate": 0.0004272341984053413, + "loss": 3.4407, + "step": 2123 + }, + { + "epoch": 0.27, + "grad_norm": 0.6934359073638916, + "learning_rate": 0.0004271610778082807, + "loss": 3.5191, + "step": 2124 + }, + { + "epoch": 0.27, + "grad_norm": 0.7259646058082581, + "learning_rate": 0.00042708792675517703, + "loss": 3.5403, + "step": 2125 + }, + { + "epoch": 0.27, + "grad_norm": 0.7107850909233093, + "learning_rate": 0.00042701474525860583, + "loss": 3.5209, + "step": 2126 + }, + { + "epoch": 0.27, + "grad_norm": 0.7186352014541626, + "learning_rate": 0.0004269415333311477, + "loss": 3.402, + "step": 2127 + }, + { + "epoch": 0.27, + "grad_norm": 0.6743739247322083, + "learning_rate": 0.0004268682909853888, + "loss": 3.4583, + "step": 2128 + }, + { + "epoch": 0.27, + "grad_norm": 0.7082521319389343, + "learning_rate": 0.0004267950182339201, + "loss": 3.4462, + "step": 2129 + }, + { + "epoch": 0.27, + "grad_norm": 0.7342261672019958, + "learning_rate": 0.0004267217150893383, + "loss": 3.4504, + "step": 2130 + }, + { + "epoch": 0.27, + "grad_norm": 0.70721834897995, + "learning_rate": 0.0004266483815642449, + "loss": 3.4127, + "step": 2131 + }, + { + "epoch": 0.27, + "grad_norm": 0.6457509994506836, + "learning_rate": 0.00042657501767124685, + "loss": 3.3845, + "step": 2132 + }, + { + "epoch": 0.27, + "grad_norm": 0.7977797389030457, + "learning_rate": 0.0004265016234229563, + "loss": 3.4275, + "step": 2133 + }, + { + "epoch": 0.27, + "grad_norm": 0.7149450182914734, + "learning_rate": 0.00042642819883199033, + "loss": 3.5007, + "step": 2134 + }, + { + "epoch": 0.27, + "grad_norm": 0.6619107723236084, + "learning_rate": 0.0004263547439109717, + "loss": 3.3743, + "step": 2135 + }, + { + "epoch": 0.27, + "grad_norm": 0.6747405529022217, + "learning_rate": 0.0004262812586725282, + "loss": 3.5574, + "step": 2136 + }, + { + "epoch": 0.27, + "grad_norm": 0.7770102620124817, + "learning_rate": 0.00042620774312929265, + "loss": 3.5815, + "step": 2137 + }, + { + "epoch": 0.27, + "grad_norm": 0.711304247379303, + "learning_rate": 0.0004261341972939033, + "loss": 3.4309, + "step": 2138 + }, + { + "epoch": 0.27, + "grad_norm": 0.6804766058921814, + "learning_rate": 0.0004260606211790035, + "loss": 3.3478, + "step": 2139 + }, + { + "epoch": 0.27, + "grad_norm": 0.700182318687439, + "learning_rate": 0.0004259870147972419, + "loss": 3.4619, + "step": 2140 + }, + { + "epoch": 0.27, + "grad_norm": 0.6286213994026184, + "learning_rate": 0.0004259133781612722, + "loss": 3.5719, + "step": 2141 + }, + { + "epoch": 0.27, + "grad_norm": 0.7141399383544922, + "learning_rate": 0.0004258397112837534, + "loss": 3.5161, + "step": 2142 + }, + { + "epoch": 0.27, + "grad_norm": 0.6732556819915771, + "learning_rate": 0.0004257660141773497, + "loss": 3.4989, + "step": 2143 + }, + { + "epoch": 0.27, + "grad_norm": 0.6593499183654785, + "learning_rate": 0.0004256922868547306, + "loss": 3.3962, + "step": 2144 + }, + { + "epoch": 0.27, + "grad_norm": 0.6658518314361572, + "learning_rate": 0.00042561852932857045, + "loss": 3.4766, + "step": 2145 + }, + { + "epoch": 0.27, + "grad_norm": 0.6705014705657959, + "learning_rate": 0.00042554474161154933, + "loss": 3.4349, + "step": 2146 + }, + { + "epoch": 0.27, + "grad_norm": 0.6827144622802734, + "learning_rate": 0.000425470923716352, + "loss": 3.484, + "step": 2147 + }, + { + "epoch": 0.27, + "grad_norm": 0.7057637572288513, + "learning_rate": 0.0004253970756556685, + "loss": 3.3785, + "step": 2148 + }, + { + "epoch": 0.28, + "grad_norm": 0.6769725680351257, + "learning_rate": 0.0004253231974421945, + "loss": 3.4597, + "step": 2149 + }, + { + "epoch": 0.28, + "grad_norm": 0.7392288446426392, + "learning_rate": 0.00042524928908863025, + "loss": 3.4506, + "step": 2150 + }, + { + "epoch": 0.28, + "grad_norm": 0.6754869222640991, + "learning_rate": 0.0004251753506076816, + "loss": 3.4491, + "step": 2151 + }, + { + "epoch": 0.28, + "grad_norm": 0.688669741153717, + "learning_rate": 0.00042510138201205935, + "loss": 3.622, + "step": 2152 + }, + { + "epoch": 0.28, + "grad_norm": 0.7528844475746155, + "learning_rate": 0.0004250273833144795, + "loss": 3.5109, + "step": 2153 + }, + { + "epoch": 0.28, + "grad_norm": 0.7025592923164368, + "learning_rate": 0.00042495335452766346, + "loss": 3.5784, + "step": 2154 + }, + { + "epoch": 0.28, + "grad_norm": 0.678760290145874, + "learning_rate": 0.0004248792956643376, + "loss": 3.5936, + "step": 2155 + }, + { + "epoch": 0.28, + "grad_norm": 0.7542569041252136, + "learning_rate": 0.00042480520673723334, + "loss": 3.6332, + "step": 2156 + }, + { + "epoch": 0.28, + "grad_norm": 0.7015278935432434, + "learning_rate": 0.0004247310877590875, + "loss": 3.3464, + "step": 2157 + }, + { + "epoch": 0.28, + "grad_norm": 0.7442424893379211, + "learning_rate": 0.00042465693874264203, + "loss": 3.4356, + "step": 2158 + }, + { + "epoch": 0.28, + "grad_norm": 0.7039641737937927, + "learning_rate": 0.00042458275970064404, + "loss": 3.3722, + "step": 2159 + }, + { + "epoch": 0.28, + "grad_norm": 0.6853299736976624, + "learning_rate": 0.0004245085506458457, + "loss": 3.4757, + "step": 2160 + }, + { + "epoch": 0.28, + "grad_norm": 0.6797823309898376, + "learning_rate": 0.0004244343115910044, + "loss": 3.4409, + "step": 2161 + }, + { + "epoch": 0.28, + "grad_norm": 0.7026341557502747, + "learning_rate": 0.00042436004254888286, + "loss": 3.4307, + "step": 2162 + }, + { + "epoch": 0.28, + "grad_norm": 0.7010685205459595, + "learning_rate": 0.00042428574353224846, + "loss": 3.3267, + "step": 2163 + }, + { + "epoch": 0.28, + "grad_norm": 0.6813393831253052, + "learning_rate": 0.0004242114145538744, + "loss": 3.3881, + "step": 2164 + }, + { + "epoch": 0.28, + "grad_norm": 0.7339474558830261, + "learning_rate": 0.00042413705562653847, + "loss": 3.4078, + "step": 2165 + }, + { + "epoch": 0.28, + "grad_norm": 0.6485927104949951, + "learning_rate": 0.0004240626667630239, + "loss": 3.4319, + "step": 2166 + }, + { + "epoch": 0.28, + "grad_norm": 0.6844644546508789, + "learning_rate": 0.000423988247976119, + "loss": 3.412, + "step": 2167 + }, + { + "epoch": 0.28, + "grad_norm": 0.6630472540855408, + "learning_rate": 0.0004239137992786173, + "loss": 3.4041, + "step": 2168 + }, + { + "epoch": 0.28, + "grad_norm": 0.715288519859314, + "learning_rate": 0.00042383932068331727, + "loss": 3.4277, + "step": 2169 + }, + { + "epoch": 0.28, + "grad_norm": 0.7145828604698181, + "learning_rate": 0.0004237648122030227, + "loss": 3.407, + "step": 2170 + }, + { + "epoch": 0.28, + "grad_norm": 0.7224670648574829, + "learning_rate": 0.00042369027385054245, + "loss": 3.501, + "step": 2171 + }, + { + "epoch": 0.28, + "grad_norm": 0.7074441909790039, + "learning_rate": 0.00042361570563869056, + "loss": 3.5399, + "step": 2172 + }, + { + "epoch": 0.28, + "grad_norm": 0.6857562065124512, + "learning_rate": 0.00042354110758028614, + "loss": 3.4285, + "step": 2173 + }, + { + "epoch": 0.28, + "grad_norm": 0.6324169039726257, + "learning_rate": 0.00042346647968815346, + "loss": 3.311, + "step": 2174 + }, + { + "epoch": 0.28, + "grad_norm": 0.7110226154327393, + "learning_rate": 0.00042339182197512193, + "loss": 3.5438, + "step": 2175 + }, + { + "epoch": 0.28, + "grad_norm": 0.6994052529335022, + "learning_rate": 0.000423317134454026, + "loss": 3.5169, + "step": 2176 + }, + { + "epoch": 0.28, + "grad_norm": 0.6708390712738037, + "learning_rate": 0.0004232424171377055, + "loss": 3.3347, + "step": 2177 + }, + { + "epoch": 0.28, + "grad_norm": 0.681258499622345, + "learning_rate": 0.00042316767003900503, + "loss": 3.5251, + "step": 2178 + }, + { + "epoch": 0.28, + "grad_norm": 0.7101171612739563, + "learning_rate": 0.0004230928931707746, + "loss": 3.453, + "step": 2179 + }, + { + "epoch": 0.28, + "grad_norm": 0.66106778383255, + "learning_rate": 0.00042301808654586915, + "loss": 3.4855, + "step": 2180 + }, + { + "epoch": 0.28, + "grad_norm": 0.7216284871101379, + "learning_rate": 0.00042294325017714875, + "loss": 3.5795, + "step": 2181 + }, + { + "epoch": 0.28, + "grad_norm": 0.6846936345100403, + "learning_rate": 0.00042286838407747877, + "loss": 3.603, + "step": 2182 + }, + { + "epoch": 0.28, + "grad_norm": 0.6562172174453735, + "learning_rate": 0.00042279348825972955, + "loss": 3.3686, + "step": 2183 + }, + { + "epoch": 0.28, + "grad_norm": 0.6876118779182434, + "learning_rate": 0.0004227185627367765, + "loss": 3.3676, + "step": 2184 + }, + { + "epoch": 0.28, + "grad_norm": 0.6920157670974731, + "learning_rate": 0.0004226436075215001, + "loss": 3.4078, + "step": 2185 + }, + { + "epoch": 0.28, + "grad_norm": 0.6572172045707703, + "learning_rate": 0.0004225686226267862, + "loss": 3.4962, + "step": 2186 + }, + { + "epoch": 0.28, + "grad_norm": 0.6494218707084656, + "learning_rate": 0.0004224936080655255, + "loss": 3.4851, + "step": 2187 + }, + { + "epoch": 0.28, + "grad_norm": 0.7011899948120117, + "learning_rate": 0.0004224185638506138, + "loss": 3.4603, + "step": 2188 + }, + { + "epoch": 0.28, + "grad_norm": 0.6653608083724976, + "learning_rate": 0.0004223434899949521, + "loss": 3.4034, + "step": 2189 + }, + { + "epoch": 0.28, + "grad_norm": 0.6927925944328308, + "learning_rate": 0.0004222683865114465, + "loss": 3.4491, + "step": 2190 + }, + { + "epoch": 0.28, + "grad_norm": 0.650065004825592, + "learning_rate": 0.0004221932534130082, + "loss": 3.4603, + "step": 2191 + }, + { + "epoch": 0.28, + "grad_norm": 0.6736721992492676, + "learning_rate": 0.00042211809071255344, + "loss": 3.3901, + "step": 2192 + }, + { + "epoch": 0.28, + "grad_norm": 0.6661523580551147, + "learning_rate": 0.00042204289842300344, + "loss": 3.4231, + "step": 2193 + }, + { + "epoch": 0.28, + "grad_norm": 0.7264108061790466, + "learning_rate": 0.00042196767655728473, + "loss": 3.62, + "step": 2194 + }, + { + "epoch": 0.28, + "grad_norm": 0.6918498277664185, + "learning_rate": 0.00042189242512832875, + "loss": 3.5098, + "step": 2195 + }, + { + "epoch": 0.28, + "grad_norm": 0.7076089978218079, + "learning_rate": 0.00042181714414907224, + "loss": 3.4579, + "step": 2196 + }, + { + "epoch": 0.28, + "grad_norm": 0.6871483325958252, + "learning_rate": 0.00042174183363245674, + "loss": 3.4601, + "step": 2197 + }, + { + "epoch": 0.28, + "grad_norm": 0.6901012063026428, + "learning_rate": 0.0004216664935914291, + "loss": 3.5506, + "step": 2198 + }, + { + "epoch": 0.28, + "grad_norm": 0.6661196947097778, + "learning_rate": 0.0004215911240389409, + "loss": 3.3883, + "step": 2199 + }, + { + "epoch": 0.28, + "grad_norm": 0.6737020611763, + "learning_rate": 0.0004215157249879493, + "loss": 3.4412, + "step": 2200 + }, + { + "epoch": 0.28, + "grad_norm": 0.6551852822303772, + "learning_rate": 0.0004214402964514163, + "loss": 3.3396, + "step": 2201 + }, + { + "epoch": 0.28, + "grad_norm": 0.682401180267334, + "learning_rate": 0.00042136483844230877, + "loss": 3.4852, + "step": 2202 + }, + { + "epoch": 0.28, + "grad_norm": 0.6794586777687073, + "learning_rate": 0.0004212893509735989, + "loss": 3.3587, + "step": 2203 + }, + { + "epoch": 0.28, + "grad_norm": 0.6415001153945923, + "learning_rate": 0.00042121383405826373, + "loss": 3.4704, + "step": 2204 + }, + { + "epoch": 0.28, + "grad_norm": 0.6676735877990723, + "learning_rate": 0.00042113828770928575, + "loss": 3.4323, + "step": 2205 + }, + { + "epoch": 0.28, + "grad_norm": 0.7048613429069519, + "learning_rate": 0.000421062711939652, + "loss": 3.3844, + "step": 2206 + }, + { + "epoch": 0.28, + "grad_norm": 0.6825292706489563, + "learning_rate": 0.000420987106762355, + "loss": 3.5218, + "step": 2207 + }, + { + "epoch": 0.28, + "grad_norm": 0.6771185398101807, + "learning_rate": 0.00042091147219039203, + "loss": 3.3899, + "step": 2208 + }, + { + "epoch": 0.28, + "grad_norm": 0.6721455454826355, + "learning_rate": 0.0004208358082367657, + "loss": 3.4341, + "step": 2209 + }, + { + "epoch": 0.28, + "grad_norm": 0.7785545587539673, + "learning_rate": 0.0004207601149144834, + "loss": 3.516, + "step": 2210 + }, + { + "epoch": 0.28, + "grad_norm": 0.7200364470481873, + "learning_rate": 0.00042068439223655774, + "loss": 3.4922, + "step": 2211 + }, + { + "epoch": 0.28, + "grad_norm": 0.6667042374610901, + "learning_rate": 0.0004206086402160063, + "loss": 3.3768, + "step": 2212 + }, + { + "epoch": 0.28, + "grad_norm": 0.6721999645233154, + "learning_rate": 0.0004205328588658517, + "loss": 3.4743, + "step": 2213 + }, + { + "epoch": 0.28, + "grad_norm": 0.6536881923675537, + "learning_rate": 0.0004204570481991217, + "loss": 3.3868, + "step": 2214 + }, + { + "epoch": 0.28, + "grad_norm": 0.6732337474822998, + "learning_rate": 0.00042038120822884904, + "loss": 3.3522, + "step": 2215 + }, + { + "epoch": 0.28, + "grad_norm": 0.6934797167778015, + "learning_rate": 0.00042030533896807143, + "loss": 3.5299, + "step": 2216 + }, + { + "epoch": 0.28, + "grad_norm": 0.6694208383560181, + "learning_rate": 0.0004202294404298317, + "loss": 3.4935, + "step": 2217 + }, + { + "epoch": 0.28, + "grad_norm": 0.6776825189590454, + "learning_rate": 0.0004201535126271777, + "loss": 3.4775, + "step": 2218 + }, + { + "epoch": 0.28, + "grad_norm": 0.6322728395462036, + "learning_rate": 0.0004200775555731623, + "loss": 3.496, + "step": 2219 + }, + { + "epoch": 0.28, + "grad_norm": 0.6704550981521606, + "learning_rate": 0.00042000156928084336, + "loss": 3.5436, + "step": 2220 + }, + { + "epoch": 0.28, + "grad_norm": 0.6826850175857544, + "learning_rate": 0.00041992555376328385, + "loss": 3.577, + "step": 2221 + }, + { + "epoch": 0.28, + "grad_norm": 0.6952018737792969, + "learning_rate": 0.00041984950903355166, + "loss": 3.4282, + "step": 2222 + }, + { + "epoch": 0.28, + "grad_norm": 0.6880483031272888, + "learning_rate": 0.0004197734351047199, + "loss": 3.4562, + "step": 2223 + }, + { + "epoch": 0.28, + "grad_norm": 0.7053331732749939, + "learning_rate": 0.0004196973319898664, + "loss": 3.4515, + "step": 2224 + }, + { + "epoch": 0.28, + "grad_norm": 0.6839045882225037, + "learning_rate": 0.0004196211997020742, + "loss": 3.5339, + "step": 2225 + }, + { + "epoch": 0.28, + "grad_norm": 0.6083318591117859, + "learning_rate": 0.0004195450382544315, + "loss": 3.4035, + "step": 2226 + }, + { + "epoch": 0.29, + "grad_norm": 0.6685528755187988, + "learning_rate": 0.000419468847660031, + "loss": 3.447, + "step": 2227 + }, + { + "epoch": 0.29, + "grad_norm": 0.6794456243515015, + "learning_rate": 0.00041939262793197105, + "loss": 3.3722, + "step": 2228 + }, + { + "epoch": 0.29, + "grad_norm": 0.6989598274230957, + "learning_rate": 0.00041931637908335453, + "loss": 3.4522, + "step": 2229 + }, + { + "epoch": 0.29, + "grad_norm": 0.7004005312919617, + "learning_rate": 0.0004192401011272896, + "loss": 3.2647, + "step": 2230 + }, + { + "epoch": 0.29, + "grad_norm": 0.6671735048294067, + "learning_rate": 0.0004191637940768893, + "loss": 3.4905, + "step": 2231 + }, + { + "epoch": 0.29, + "grad_norm": 0.7051229476928711, + "learning_rate": 0.0004190874579452716, + "loss": 3.3168, + "step": 2232 + }, + { + "epoch": 0.29, + "grad_norm": 0.6781712174415588, + "learning_rate": 0.0004190110927455597, + "loss": 3.4034, + "step": 2233 + }, + { + "epoch": 0.29, + "grad_norm": 0.7480825185775757, + "learning_rate": 0.0004189346984908816, + "loss": 3.3314, + "step": 2234 + }, + { + "epoch": 0.29, + "grad_norm": 0.7438943386077881, + "learning_rate": 0.00041885827519437047, + "loss": 3.4312, + "step": 2235 + }, + { + "epoch": 0.29, + "grad_norm": 0.6712598204612732, + "learning_rate": 0.0004187818228691641, + "loss": 3.5298, + "step": 2236 + }, + { + "epoch": 0.29, + "grad_norm": 0.6709827184677124, + "learning_rate": 0.0004187053415284058, + "loss": 3.3879, + "step": 2237 + }, + { + "epoch": 0.29, + "grad_norm": 0.65529465675354, + "learning_rate": 0.0004186288311852435, + "loss": 3.3781, + "step": 2238 + }, + { + "epoch": 0.29, + "grad_norm": 0.6793820858001709, + "learning_rate": 0.0004185522918528302, + "loss": 3.51, + "step": 2239 + }, + { + "epoch": 0.29, + "grad_norm": 0.6451491713523865, + "learning_rate": 0.0004184757235443238, + "loss": 3.366, + "step": 2240 + }, + { + "epoch": 0.29, + "grad_norm": 0.7233899235725403, + "learning_rate": 0.0004183991262728875, + "loss": 3.4044, + "step": 2241 + }, + { + "epoch": 0.29, + "grad_norm": 0.6855872273445129, + "learning_rate": 0.0004183225000516891, + "loss": 3.3616, + "step": 2242 + }, + { + "epoch": 0.29, + "grad_norm": 0.7209010720252991, + "learning_rate": 0.0004182458448939016, + "loss": 3.4268, + "step": 2243 + }, + { + "epoch": 0.29, + "grad_norm": 0.6535835862159729, + "learning_rate": 0.00041816916081270286, + "loss": 3.5758, + "step": 2244 + }, + { + "epoch": 0.29, + "grad_norm": 0.6555159091949463, + "learning_rate": 0.00041809244782127573, + "loss": 3.4047, + "step": 2245 + }, + { + "epoch": 0.29, + "grad_norm": 0.6891555190086365, + "learning_rate": 0.00041801570593280824, + "loss": 3.4448, + "step": 2246 + }, + { + "epoch": 0.29, + "grad_norm": 0.6463344693183899, + "learning_rate": 0.00041793893516049307, + "loss": 3.4945, + "step": 2247 + }, + { + "epoch": 0.29, + "grad_norm": 0.6066601276397705, + "learning_rate": 0.00041786213551752804, + "loss": 3.3468, + "step": 2248 + }, + { + "epoch": 0.29, + "grad_norm": 0.6507159471511841, + "learning_rate": 0.0004177853070171159, + "loss": 3.355, + "step": 2249 + }, + { + "epoch": 0.29, + "grad_norm": 0.6900511980056763, + "learning_rate": 0.00041770844967246423, + "loss": 3.4342, + "step": 2250 + }, + { + "epoch": 0.29, + "grad_norm": 0.7094230055809021, + "learning_rate": 0.0004176315634967859, + "loss": 3.5087, + "step": 2251 + }, + { + "epoch": 0.29, + "grad_norm": 0.6796755194664001, + "learning_rate": 0.00041755464850329847, + "loss": 3.4755, + "step": 2252 + }, + { + "epoch": 0.29, + "grad_norm": 0.7061808109283447, + "learning_rate": 0.0004174777047052245, + "loss": 3.449, + "step": 2253 + }, + { + "epoch": 0.29, + "grad_norm": 0.6908069252967834, + "learning_rate": 0.00041740073211579156, + "loss": 3.4706, + "step": 2254 + }, + { + "epoch": 0.29, + "grad_norm": 0.6602137684822083, + "learning_rate": 0.00041732373074823204, + "loss": 3.521, + "step": 2255 + }, + { + "epoch": 0.29, + "grad_norm": 0.670946478843689, + "learning_rate": 0.0004172467006157834, + "loss": 3.5037, + "step": 2256 + }, + { + "epoch": 0.29, + "grad_norm": 0.6751448512077332, + "learning_rate": 0.00041716964173168803, + "loss": 3.3769, + "step": 2257 + }, + { + "epoch": 0.29, + "grad_norm": 0.651528000831604, + "learning_rate": 0.00041709255410919335, + "loss": 3.4461, + "step": 2258 + }, + { + "epoch": 0.29, + "grad_norm": 0.6197723150253296, + "learning_rate": 0.00041701543776155136, + "loss": 3.4872, + "step": 2259 + }, + { + "epoch": 0.29, + "grad_norm": 0.6545820832252502, + "learning_rate": 0.0004169382927020196, + "loss": 3.3789, + "step": 2260 + }, + { + "epoch": 0.29, + "grad_norm": 0.6743528246879578, + "learning_rate": 0.0004168611189438598, + "loss": 3.442, + "step": 2261 + }, + { + "epoch": 0.29, + "grad_norm": 0.7139307856559753, + "learning_rate": 0.00041678391650033943, + "loss": 3.4165, + "step": 2262 + }, + { + "epoch": 0.29, + "grad_norm": 0.694196879863739, + "learning_rate": 0.0004167066853847302, + "loss": 3.5189, + "step": 2263 + }, + { + "epoch": 0.29, + "grad_norm": 0.6422727704048157, + "learning_rate": 0.00041662942561030905, + "loss": 3.5793, + "step": 2264 + }, + { + "epoch": 0.29, + "grad_norm": 0.6588732004165649, + "learning_rate": 0.000416552137190358, + "loss": 3.5192, + "step": 2265 + }, + { + "epoch": 0.29, + "grad_norm": 0.6799389123916626, + "learning_rate": 0.00041647482013816366, + "loss": 3.4602, + "step": 2266 + }, + { + "epoch": 0.29, + "grad_norm": 0.6588906645774841, + "learning_rate": 0.0004163974744670179, + "loss": 3.388, + "step": 2267 + }, + { + "epoch": 0.29, + "grad_norm": 0.7076383233070374, + "learning_rate": 0.00041632010019021715, + "loss": 3.3662, + "step": 2268 + }, + { + "epoch": 0.29, + "grad_norm": 0.6735315918922424, + "learning_rate": 0.000416242697321063, + "loss": 3.3095, + "step": 2269 + }, + { + "epoch": 0.29, + "grad_norm": 0.6909252405166626, + "learning_rate": 0.000416165265872862, + "loss": 3.486, + "step": 2270 + }, + { + "epoch": 0.29, + "grad_norm": 0.7013283371925354, + "learning_rate": 0.0004160878058589255, + "loss": 3.4681, + "step": 2271 + }, + { + "epoch": 0.29, + "grad_norm": 0.6996357440948486, + "learning_rate": 0.00041601031729256963, + "loss": 3.2933, + "step": 2272 + }, + { + "epoch": 0.29, + "grad_norm": 0.7182360887527466, + "learning_rate": 0.00041593280018711567, + "loss": 3.4998, + "step": 2273 + }, + { + "epoch": 0.29, + "grad_norm": 0.7021991610527039, + "learning_rate": 0.0004158552545558897, + "loss": 3.4124, + "step": 2274 + }, + { + "epoch": 0.29, + "grad_norm": 0.6800808906555176, + "learning_rate": 0.00041577768041222265, + "loss": 3.3624, + "step": 2275 + }, + { + "epoch": 0.29, + "grad_norm": 0.6459739208221436, + "learning_rate": 0.0004157000777694506, + "loss": 3.3325, + "step": 2276 + }, + { + "epoch": 0.29, + "grad_norm": 0.621931791305542, + "learning_rate": 0.0004156224466409141, + "loss": 3.4246, + "step": 2277 + }, + { + "epoch": 0.29, + "grad_norm": 0.7608678936958313, + "learning_rate": 0.00041554478703995903, + "loss": 3.6148, + "step": 2278 + }, + { + "epoch": 0.29, + "grad_norm": 0.6395514011383057, + "learning_rate": 0.00041546709897993594, + "loss": 3.3696, + "step": 2279 + }, + { + "epoch": 0.29, + "grad_norm": 0.671046257019043, + "learning_rate": 0.0004153893824742002, + "loss": 3.552, + "step": 2280 + }, + { + "epoch": 0.29, + "grad_norm": 0.6708778142929077, + "learning_rate": 0.00041531163753611236, + "loss": 3.4801, + "step": 2281 + }, + { + "epoch": 0.29, + "grad_norm": 0.6961725950241089, + "learning_rate": 0.00041523386417903745, + "loss": 3.4896, + "step": 2282 + }, + { + "epoch": 0.29, + "grad_norm": 0.7047480940818787, + "learning_rate": 0.00041515606241634577, + "loss": 3.4363, + "step": 2283 + }, + { + "epoch": 0.29, + "grad_norm": 0.6927662491798401, + "learning_rate": 0.0004150782322614124, + "loss": 3.454, + "step": 2284 + }, + { + "epoch": 0.29, + "grad_norm": 0.6934167146682739, + "learning_rate": 0.00041500037372761705, + "loss": 3.4441, + "step": 2285 + }, + { + "epoch": 0.29, + "grad_norm": 0.6866016983985901, + "learning_rate": 0.00041492248682834464, + "loss": 3.4035, + "step": 2286 + }, + { + "epoch": 0.29, + "grad_norm": 0.7255225777626038, + "learning_rate": 0.00041484457157698473, + "loss": 3.6204, + "step": 2287 + }, + { + "epoch": 0.29, + "grad_norm": 0.657615602016449, + "learning_rate": 0.00041476662798693205, + "loss": 3.4804, + "step": 2288 + }, + { + "epoch": 0.29, + "grad_norm": 0.7463390231132507, + "learning_rate": 0.0004146886560715858, + "loss": 3.5594, + "step": 2289 + }, + { + "epoch": 0.29, + "grad_norm": 0.7018217444419861, + "learning_rate": 0.0004146106558443504, + "loss": 3.4931, + "step": 2290 + }, + { + "epoch": 0.29, + "grad_norm": 0.686687171459198, + "learning_rate": 0.0004145326273186348, + "loss": 3.4155, + "step": 2291 + }, + { + "epoch": 0.29, + "grad_norm": 0.7059077620506287, + "learning_rate": 0.0004144545705078533, + "loss": 3.5494, + "step": 2292 + }, + { + "epoch": 0.29, + "grad_norm": 0.6666622757911682, + "learning_rate": 0.0004143764854254245, + "loss": 3.3401, + "step": 2293 + }, + { + "epoch": 0.29, + "grad_norm": 0.6963295936584473, + "learning_rate": 0.0004142983720847723, + "loss": 3.3885, + "step": 2294 + }, + { + "epoch": 0.29, + "grad_norm": 0.6787527203559875, + "learning_rate": 0.0004142202304993252, + "loss": 3.479, + "step": 2295 + }, + { + "epoch": 0.29, + "grad_norm": 0.658642053604126, + "learning_rate": 0.0004141420606825166, + "loss": 3.4086, + "step": 2296 + }, + { + "epoch": 0.29, + "grad_norm": 0.625916600227356, + "learning_rate": 0.00041406386264778496, + "loss": 3.3912, + "step": 2297 + }, + { + "epoch": 0.29, + "grad_norm": 0.6570123434066772, + "learning_rate": 0.0004139856364085733, + "loss": 3.5255, + "step": 2298 + }, + { + "epoch": 0.29, + "grad_norm": 0.6459956765174866, + "learning_rate": 0.00041390738197832975, + "loss": 3.5721, + "step": 2299 + }, + { + "epoch": 0.29, + "grad_norm": 0.667670488357544, + "learning_rate": 0.00041382909937050694, + "loss": 3.5063, + "step": 2300 + }, + { + "epoch": 0.29, + "grad_norm": 0.7464162707328796, + "learning_rate": 0.00041375078859856275, + "loss": 3.3505, + "step": 2301 + }, + { + "epoch": 0.29, + "grad_norm": 0.6404237747192383, + "learning_rate": 0.00041367244967595963, + "loss": 3.4405, + "step": 2302 + }, + { + "epoch": 0.29, + "grad_norm": 0.6771771907806396, + "learning_rate": 0.0004135940826161649, + "loss": 3.4206, + "step": 2303 + }, + { + "epoch": 0.29, + "grad_norm": 0.6354660391807556, + "learning_rate": 0.0004135156874326509, + "loss": 3.4234, + "step": 2304 + }, + { + "epoch": 0.3, + "grad_norm": 0.6784554123878479, + "learning_rate": 0.0004134372641388946, + "loss": 3.5672, + "step": 2305 + }, + { + "epoch": 0.3, + "grad_norm": 0.6180399060249329, + "learning_rate": 0.0004133588127483778, + "loss": 3.5065, + "step": 2306 + }, + { + "epoch": 0.3, + "grad_norm": 0.7070502638816833, + "learning_rate": 0.00041328033327458726, + "loss": 3.3293, + "step": 2307 + }, + { + "epoch": 0.3, + "grad_norm": 0.6411788463592529, + "learning_rate": 0.00041320182573101463, + "loss": 3.2946, + "step": 2308 + }, + { + "epoch": 0.3, + "grad_norm": 0.6687417030334473, + "learning_rate": 0.0004131232901311561, + "loss": 3.4136, + "step": 2309 + }, + { + "epoch": 0.3, + "grad_norm": 0.618337094783783, + "learning_rate": 0.00041304472648851285, + "loss": 3.2932, + "step": 2310 + }, + { + "epoch": 0.3, + "grad_norm": 0.7408819794654846, + "learning_rate": 0.00041296613481659096, + "loss": 3.3637, + "step": 2311 + }, + { + "epoch": 0.3, + "grad_norm": 0.6739766597747803, + "learning_rate": 0.0004128875151289013, + "loss": 3.508, + "step": 2312 + }, + { + "epoch": 0.3, + "grad_norm": 0.7727325558662415, + "learning_rate": 0.0004128088674389594, + "loss": 3.4848, + "step": 2313 + }, + { + "epoch": 0.3, + "grad_norm": 0.6911073923110962, + "learning_rate": 0.0004127301917602857, + "loss": 3.4613, + "step": 2314 + }, + { + "epoch": 0.3, + "grad_norm": 0.7270300984382629, + "learning_rate": 0.0004126514881064055, + "loss": 3.4827, + "step": 2315 + }, + { + "epoch": 0.3, + "grad_norm": 0.6577227711677551, + "learning_rate": 0.00041257275649084896, + "loss": 3.4467, + "step": 2316 + }, + { + "epoch": 0.3, + "grad_norm": 0.7177078127861023, + "learning_rate": 0.00041249399692715085, + "loss": 3.2912, + "step": 2317 + }, + { + "epoch": 0.3, + "grad_norm": 0.7087587714195251, + "learning_rate": 0.00041241520942885085, + "loss": 3.4813, + "step": 2318 + }, + { + "epoch": 0.3, + "grad_norm": 0.6729384660720825, + "learning_rate": 0.00041233639400949345, + "loss": 3.5241, + "step": 2319 + }, + { + "epoch": 0.3, + "grad_norm": 0.6875993013381958, + "learning_rate": 0.00041225755068262804, + "loss": 3.5853, + "step": 2320 + }, + { + "epoch": 0.3, + "grad_norm": 0.7308191061019897, + "learning_rate": 0.00041217867946180856, + "loss": 3.4682, + "step": 2321 + }, + { + "epoch": 0.3, + "grad_norm": 0.697512149810791, + "learning_rate": 0.00041209978036059393, + "loss": 3.3957, + "step": 2322 + }, + { + "epoch": 0.3, + "grad_norm": 0.7060604691505432, + "learning_rate": 0.00041202085339254785, + "loss": 3.5362, + "step": 2323 + }, + { + "epoch": 0.3, + "grad_norm": 0.697181224822998, + "learning_rate": 0.00041194189857123877, + "loss": 3.4826, + "step": 2324 + }, + { + "epoch": 0.3, + "grad_norm": 0.7139546871185303, + "learning_rate": 0.00041186291591023994, + "loss": 3.5293, + "step": 2325 + }, + { + "epoch": 0.3, + "grad_norm": 0.7130452990531921, + "learning_rate": 0.00041178390542312936, + "loss": 3.3592, + "step": 2326 + }, + { + "epoch": 0.3, + "grad_norm": 0.680930495262146, + "learning_rate": 0.0004117048671234899, + "loss": 3.408, + "step": 2327 + }, + { + "epoch": 0.3, + "grad_norm": 0.7060506343841553, + "learning_rate": 0.00041162580102490915, + "loss": 3.3227, + "step": 2328 + }, + { + "epoch": 0.3, + "grad_norm": 0.6682820916175842, + "learning_rate": 0.0004115467071409794, + "loss": 3.4366, + "step": 2329 + }, + { + "epoch": 0.3, + "grad_norm": 0.6864296197891235, + "learning_rate": 0.000411467585485298, + "loss": 3.3235, + "step": 2330 + }, + { + "epoch": 0.3, + "grad_norm": 0.6731339693069458, + "learning_rate": 0.0004113884360714667, + "loss": 3.4194, + "step": 2331 + }, + { + "epoch": 0.3, + "grad_norm": 0.6872637271881104, + "learning_rate": 0.0004113092589130923, + "loss": 3.4454, + "step": 2332 + }, + { + "epoch": 0.3, + "grad_norm": 0.6677448153495789, + "learning_rate": 0.00041123005402378615, + "loss": 3.3855, + "step": 2333 + }, + { + "epoch": 0.3, + "grad_norm": 0.7034282088279724, + "learning_rate": 0.00041115082141716464, + "loss": 3.4623, + "step": 2334 + }, + { + "epoch": 0.3, + "grad_norm": 0.6950244307518005, + "learning_rate": 0.00041107156110684875, + "loss": 3.3473, + "step": 2335 + }, + { + "epoch": 0.3, + "grad_norm": 0.6312594413757324, + "learning_rate": 0.00041099227310646415, + "loss": 3.5203, + "step": 2336 + }, + { + "epoch": 0.3, + "grad_norm": 0.6454058289527893, + "learning_rate": 0.0004109129574296414, + "loss": 3.3099, + "step": 2337 + }, + { + "epoch": 0.3, + "grad_norm": 0.6628250479698181, + "learning_rate": 0.0004108336140900158, + "loss": 3.3756, + "step": 2338 + }, + { + "epoch": 0.3, + "grad_norm": 0.6997645497322083, + "learning_rate": 0.00041075424310122745, + "loss": 3.3865, + "step": 2339 + }, + { + "epoch": 0.3, + "grad_norm": 0.6497191190719604, + "learning_rate": 0.00041067484447692104, + "loss": 3.5008, + "step": 2340 + }, + { + "epoch": 0.3, + "grad_norm": 0.7443307042121887, + "learning_rate": 0.0004105954182307462, + "loss": 3.5211, + "step": 2341 + }, + { + "epoch": 0.3, + "grad_norm": 0.7580959796905518, + "learning_rate": 0.00041051596437635717, + "loss": 3.377, + "step": 2342 + }, + { + "epoch": 0.3, + "grad_norm": 0.6966285109519958, + "learning_rate": 0.000410436482927413, + "loss": 3.4408, + "step": 2343 + }, + { + "epoch": 0.3, + "grad_norm": 0.6279584765434265, + "learning_rate": 0.00041035697389757745, + "loss": 3.405, + "step": 2344 + }, + { + "epoch": 0.3, + "grad_norm": 0.6910143494606018, + "learning_rate": 0.0004102774373005191, + "loss": 3.3697, + "step": 2345 + }, + { + "epoch": 0.3, + "grad_norm": 0.7200734615325928, + "learning_rate": 0.0004101978731499112, + "loss": 3.4753, + "step": 2346 + }, + { + "epoch": 0.3, + "grad_norm": 0.7063745856285095, + "learning_rate": 0.00041011828145943167, + "loss": 3.4143, + "step": 2347 + }, + { + "epoch": 0.3, + "grad_norm": 0.6820845603942871, + "learning_rate": 0.00041003866224276333, + "loss": 3.49, + "step": 2348 + }, + { + "epoch": 0.3, + "grad_norm": 0.6298126578330994, + "learning_rate": 0.00040995901551359366, + "loss": 3.4667, + "step": 2349 + }, + { + "epoch": 0.3, + "grad_norm": 0.6085259914398193, + "learning_rate": 0.00040987934128561477, + "loss": 3.4444, + "step": 2350 + }, + { + "epoch": 0.3, + "grad_norm": 0.7015394568443298, + "learning_rate": 0.0004097996395725237, + "loss": 3.3604, + "step": 2351 + }, + { + "epoch": 0.3, + "grad_norm": 0.6495139002799988, + "learning_rate": 0.0004097199103880219, + "loss": 3.3688, + "step": 2352 + }, + { + "epoch": 0.3, + "grad_norm": 0.729513943195343, + "learning_rate": 0.00040964015374581606, + "loss": 3.5373, + "step": 2353 + }, + { + "epoch": 0.3, + "grad_norm": 0.6324521899223328, + "learning_rate": 0.00040956036965961694, + "loss": 3.5387, + "step": 2354 + }, + { + "epoch": 0.3, + "grad_norm": 0.6699215173721313, + "learning_rate": 0.0004094805581431406, + "loss": 3.521, + "step": 2355 + }, + { + "epoch": 0.3, + "grad_norm": 0.6890226602554321, + "learning_rate": 0.00040940071921010735, + "loss": 3.3704, + "step": 2356 + }, + { + "epoch": 0.3, + "grad_norm": 0.7144984602928162, + "learning_rate": 0.0004093208528742426, + "loss": 3.3831, + "step": 2357 + }, + { + "epoch": 0.3, + "grad_norm": 0.6853955984115601, + "learning_rate": 0.0004092409591492762, + "loss": 3.4309, + "step": 2358 + }, + { + "epoch": 0.3, + "grad_norm": 0.7000523209571838, + "learning_rate": 0.0004091610380489429, + "loss": 3.3446, + "step": 2359 + }, + { + "epoch": 0.3, + "grad_norm": 0.6360917091369629, + "learning_rate": 0.000409081089586982, + "loss": 3.5088, + "step": 2360 + }, + { + "epoch": 0.3, + "grad_norm": 0.6459645628929138, + "learning_rate": 0.00040900111377713743, + "loss": 3.5955, + "step": 2361 + }, + { + "epoch": 0.3, + "grad_norm": 0.7057053446769714, + "learning_rate": 0.0004089211106331583, + "loss": 3.4484, + "step": 2362 + }, + { + "epoch": 0.3, + "grad_norm": 0.6538532376289368, + "learning_rate": 0.0004088410801687977, + "loss": 3.4237, + "step": 2363 + }, + { + "epoch": 0.3, + "grad_norm": 0.727017343044281, + "learning_rate": 0.00040876102239781407, + "loss": 3.4437, + "step": 2364 + }, + { + "epoch": 0.3, + "grad_norm": 0.7080045342445374, + "learning_rate": 0.0004086809373339703, + "loss": 3.4506, + "step": 2365 + }, + { + "epoch": 0.3, + "grad_norm": 0.6796673536300659, + "learning_rate": 0.00040860082499103356, + "loss": 3.5654, + "step": 2366 + }, + { + "epoch": 0.3, + "grad_norm": 0.739126443862915, + "learning_rate": 0.0004085206853827765, + "loss": 3.4897, + "step": 2367 + }, + { + "epoch": 0.3, + "grad_norm": 0.6407649517059326, + "learning_rate": 0.0004084405185229759, + "loss": 3.3694, + "step": 2368 + }, + { + "epoch": 0.3, + "grad_norm": 1.063560962677002, + "learning_rate": 0.0004083603244254134, + "loss": 3.5015, + "step": 2369 + }, + { + "epoch": 0.3, + "grad_norm": 0.6587662696838379, + "learning_rate": 0.0004082801031038751, + "loss": 3.4524, + "step": 2370 + }, + { + "epoch": 0.3, + "grad_norm": 0.6542607545852661, + "learning_rate": 0.00040819985457215223, + "loss": 3.3431, + "step": 2371 + }, + { + "epoch": 0.3, + "grad_norm": 0.7463224530220032, + "learning_rate": 0.0004081195788440404, + "loss": 3.4656, + "step": 2372 + }, + { + "epoch": 0.3, + "grad_norm": 0.7438334822654724, + "learning_rate": 0.0004080392759333398, + "loss": 3.3541, + "step": 2373 + }, + { + "epoch": 0.3, + "grad_norm": 0.6712779402732849, + "learning_rate": 0.00040795894585385557, + "loss": 3.475, + "step": 2374 + }, + { + "epoch": 0.3, + "grad_norm": 0.6458290219306946, + "learning_rate": 0.0004078785886193973, + "loss": 3.3563, + "step": 2375 + }, + { + "epoch": 0.3, + "grad_norm": 0.6500002145767212, + "learning_rate": 0.00040779820424377935, + "loss": 3.4716, + "step": 2376 + }, + { + "epoch": 0.3, + "grad_norm": 0.6747543811798096, + "learning_rate": 0.0004077177927408208, + "loss": 3.5333, + "step": 2377 + }, + { + "epoch": 0.3, + "grad_norm": 0.7234296798706055, + "learning_rate": 0.0004076373541243452, + "loss": 3.5203, + "step": 2378 + }, + { + "epoch": 0.3, + "grad_norm": 0.6669906973838806, + "learning_rate": 0.00040755688840818095, + "loss": 3.425, + "step": 2379 + }, + { + "epoch": 0.3, + "grad_norm": 0.6469231843948364, + "learning_rate": 0.000407476395606161, + "loss": 3.4563, + "step": 2380 + }, + { + "epoch": 0.3, + "grad_norm": 0.8278177976608276, + "learning_rate": 0.0004073958757321231, + "loss": 3.4265, + "step": 2381 + }, + { + "epoch": 0.3, + "grad_norm": 0.6605141758918762, + "learning_rate": 0.0004073153287999094, + "loss": 3.4845, + "step": 2382 + }, + { + "epoch": 0.31, + "grad_norm": 0.6517539620399475, + "learning_rate": 0.00040723475482336703, + "loss": 3.5435, + "step": 2383 + }, + { + "epoch": 0.31, + "grad_norm": 0.6293243169784546, + "learning_rate": 0.00040715415381634737, + "loss": 3.4723, + "step": 2384 + }, + { + "epoch": 0.31, + "grad_norm": 0.6572651267051697, + "learning_rate": 0.0004070735257927068, + "loss": 3.3577, + "step": 2385 + }, + { + "epoch": 0.31, + "grad_norm": 0.6305806636810303, + "learning_rate": 0.0004069928707663062, + "loss": 3.4141, + "step": 2386 + }, + { + "epoch": 0.31, + "grad_norm": 0.6208084225654602, + "learning_rate": 0.00040691218875101113, + "loss": 3.4277, + "step": 2387 + }, + { + "epoch": 0.31, + "grad_norm": 0.7063469290733337, + "learning_rate": 0.0004068314797606917, + "loss": 3.3206, + "step": 2388 + }, + { + "epoch": 0.31, + "grad_norm": 0.6245595812797546, + "learning_rate": 0.0004067507438092227, + "loss": 3.4599, + "step": 2389 + }, + { + "epoch": 0.31, + "grad_norm": 0.7018944025039673, + "learning_rate": 0.0004066699809104837, + "loss": 3.4735, + "step": 2390 + }, + { + "epoch": 0.31, + "grad_norm": 0.6688552498817444, + "learning_rate": 0.0004065891910783587, + "loss": 3.4035, + "step": 2391 + }, + { + "epoch": 0.31, + "grad_norm": 0.699343740940094, + "learning_rate": 0.00040650837432673647, + "loss": 3.4211, + "step": 2392 + }, + { + "epoch": 0.31, + "grad_norm": 0.6850763559341431, + "learning_rate": 0.00040642753066951024, + "loss": 3.5136, + "step": 2393 + }, + { + "epoch": 0.31, + "grad_norm": 0.714257538318634, + "learning_rate": 0.00040634666012057797, + "loss": 3.3978, + "step": 2394 + }, + { + "epoch": 0.31, + "grad_norm": 0.6932154893875122, + "learning_rate": 0.00040626576269384244, + "loss": 3.4032, + "step": 2395 + }, + { + "epoch": 0.31, + "grad_norm": 0.6759898066520691, + "learning_rate": 0.0004061848384032106, + "loss": 3.3437, + "step": 2396 + }, + { + "epoch": 0.31, + "grad_norm": 0.6919507384300232, + "learning_rate": 0.0004061038872625944, + "loss": 3.4002, + "step": 2397 + }, + { + "epoch": 0.31, + "grad_norm": 0.6822661757469177, + "learning_rate": 0.00040602290928591026, + "loss": 3.4483, + "step": 2398 + }, + { + "epoch": 0.31, + "grad_norm": 0.7360864281654358, + "learning_rate": 0.00040594190448707925, + "loss": 3.2807, + "step": 2399 + }, + { + "epoch": 0.31, + "grad_norm": 0.6921037435531616, + "learning_rate": 0.00040586087288002707, + "loss": 3.3602, + "step": 2400 + }, + { + "epoch": 0.31, + "grad_norm": 0.7205747365951538, + "learning_rate": 0.0004057798144786839, + "loss": 3.4747, + "step": 2401 + }, + { + "epoch": 0.31, + "grad_norm": 0.6903738379478455, + "learning_rate": 0.0004056987292969847, + "loss": 3.4369, + "step": 2402 + }, + { + "epoch": 0.31, + "grad_norm": 0.7312140464782715, + "learning_rate": 0.00040561761734886885, + "loss": 3.4649, + "step": 2403 + }, + { + "epoch": 0.31, + "grad_norm": 0.6566363573074341, + "learning_rate": 0.0004055364786482806, + "loss": 3.4276, + "step": 2404 + }, + { + "epoch": 0.31, + "grad_norm": 0.6498070359230042, + "learning_rate": 0.00040545531320916844, + "loss": 3.4, + "step": 2405 + }, + { + "epoch": 0.31, + "grad_norm": 0.6509925723075867, + "learning_rate": 0.0004053741210454859, + "loss": 3.3766, + "step": 2406 + }, + { + "epoch": 0.31, + "grad_norm": 0.6313583254814148, + "learning_rate": 0.00040529290217119053, + "loss": 3.4926, + "step": 2407 + }, + { + "epoch": 0.31, + "grad_norm": 0.6571773290634155, + "learning_rate": 0.00040521165660024503, + "loss": 3.5759, + "step": 2408 + }, + { + "epoch": 0.31, + "grad_norm": 0.6563805341720581, + "learning_rate": 0.0004051303843466164, + "loss": 3.4646, + "step": 2409 + }, + { + "epoch": 0.31, + "grad_norm": 0.6175151467323303, + "learning_rate": 0.0004050490854242763, + "loss": 3.3364, + "step": 2410 + }, + { + "epoch": 0.31, + "grad_norm": 0.7010701894760132, + "learning_rate": 0.0004049677598472009, + "loss": 3.4465, + "step": 2411 + }, + { + "epoch": 0.31, + "grad_norm": 0.7390778660774231, + "learning_rate": 0.000404886407629371, + "loss": 3.5124, + "step": 2412 + }, + { + "epoch": 0.31, + "grad_norm": 0.6527614593505859, + "learning_rate": 0.0004048050287847721, + "loss": 3.4107, + "step": 2413 + }, + { + "epoch": 0.31, + "grad_norm": 0.6538462042808533, + "learning_rate": 0.0004047236233273941, + "loss": 3.5326, + "step": 2414 + }, + { + "epoch": 0.31, + "grad_norm": 0.6972126364707947, + "learning_rate": 0.00040464219127123147, + "loss": 3.4837, + "step": 2415 + }, + { + "epoch": 0.31, + "grad_norm": 0.7578407526016235, + "learning_rate": 0.00040456073263028347, + "loss": 3.5219, + "step": 2416 + }, + { + "epoch": 0.31, + "grad_norm": 0.6868325471878052, + "learning_rate": 0.0004044792474185537, + "loss": 3.5467, + "step": 2417 + }, + { + "epoch": 0.31, + "grad_norm": 0.6207484006881714, + "learning_rate": 0.00040439773565005034, + "loss": 3.3662, + "step": 2418 + }, + { + "epoch": 0.31, + "grad_norm": 0.6857748031616211, + "learning_rate": 0.00040431619733878637, + "loss": 3.3901, + "step": 2419 + }, + { + "epoch": 0.31, + "grad_norm": 0.6813026070594788, + "learning_rate": 0.0004042346324987791, + "loss": 3.2639, + "step": 2420 + }, + { + "epoch": 0.31, + "grad_norm": 0.6619722843170166, + "learning_rate": 0.0004041530411440504, + "loss": 3.5003, + "step": 2421 + }, + { + "epoch": 0.31, + "grad_norm": 0.6328409314155579, + "learning_rate": 0.0004040714232886269, + "loss": 3.5067, + "step": 2422 + }, + { + "epoch": 0.31, + "grad_norm": 0.68046635389328, + "learning_rate": 0.00040398977894653954, + "loss": 3.4216, + "step": 2423 + }, + { + "epoch": 0.31, + "grad_norm": 0.6497738361358643, + "learning_rate": 0.000403908108131824, + "loss": 3.4894, + "step": 2424 + }, + { + "epoch": 0.31, + "grad_norm": 0.6499060988426208, + "learning_rate": 0.0004038264108585204, + "loss": 3.4544, + "step": 2425 + }, + { + "epoch": 0.31, + "grad_norm": 0.6749792695045471, + "learning_rate": 0.00040374468714067345, + "loss": 3.4488, + "step": 2426 + }, + { + "epoch": 0.31, + "grad_norm": 0.7037678360939026, + "learning_rate": 0.00040366293699233246, + "loss": 3.4747, + "step": 2427 + }, + { + "epoch": 0.31, + "grad_norm": 0.6845188736915588, + "learning_rate": 0.00040358116042755115, + "loss": 3.4011, + "step": 2428 + }, + { + "epoch": 0.31, + "grad_norm": 0.651651918888092, + "learning_rate": 0.00040349935746038793, + "loss": 3.4295, + "step": 2429 + }, + { + "epoch": 0.31, + "grad_norm": 0.6486951112747192, + "learning_rate": 0.0004034175281049056, + "loss": 3.3702, + "step": 2430 + }, + { + "epoch": 0.31, + "grad_norm": 0.7313594818115234, + "learning_rate": 0.00040333567237517165, + "loss": 3.4048, + "step": 2431 + }, + { + "epoch": 0.31, + "grad_norm": 0.6645215153694153, + "learning_rate": 0.000403253790285258, + "loss": 3.4562, + "step": 2432 + }, + { + "epoch": 0.31, + "grad_norm": 0.6896566152572632, + "learning_rate": 0.00040317188184924116, + "loss": 3.3233, + "step": 2433 + }, + { + "epoch": 0.31, + "grad_norm": 0.6973954439163208, + "learning_rate": 0.00040308994708120207, + "loss": 3.4185, + "step": 2434 + }, + { + "epoch": 0.31, + "grad_norm": 0.6570093631744385, + "learning_rate": 0.00040300798599522626, + "loss": 3.3588, + "step": 2435 + }, + { + "epoch": 0.31, + "grad_norm": 0.6232661604881287, + "learning_rate": 0.00040292599860540393, + "loss": 3.4517, + "step": 2436 + }, + { + "epoch": 0.31, + "grad_norm": 0.6920954585075378, + "learning_rate": 0.00040284398492582954, + "loss": 3.4176, + "step": 2437 + }, + { + "epoch": 0.31, + "grad_norm": 0.7019113898277283, + "learning_rate": 0.00040276194497060227, + "loss": 3.4126, + "step": 2438 + }, + { + "epoch": 0.31, + "grad_norm": 0.6729170680046082, + "learning_rate": 0.0004026798787538256, + "loss": 3.4647, + "step": 2439 + }, + { + "epoch": 0.31, + "grad_norm": 0.6775332689285278, + "learning_rate": 0.00040259778628960773, + "loss": 3.4055, + "step": 2440 + }, + { + "epoch": 0.31, + "grad_norm": 0.7059245109558105, + "learning_rate": 0.0004025156675920614, + "loss": 3.4201, + "step": 2441 + }, + { + "epoch": 0.31, + "grad_norm": 0.6790223717689514, + "learning_rate": 0.0004024335226753037, + "loss": 3.4517, + "step": 2442 + }, + { + "epoch": 0.31, + "grad_norm": 0.6908589005470276, + "learning_rate": 0.0004023513515534563, + "loss": 3.3657, + "step": 2443 + }, + { + "epoch": 0.31, + "grad_norm": 0.6590731143951416, + "learning_rate": 0.0004022691542406453, + "loss": 3.4982, + "step": 2444 + }, + { + "epoch": 0.31, + "grad_norm": 0.6416826248168945, + "learning_rate": 0.00040218693075100146, + "loss": 3.3781, + "step": 2445 + }, + { + "epoch": 0.31, + "grad_norm": 0.6760276556015015, + "learning_rate": 0.0004021046810986599, + "loss": 3.4182, + "step": 2446 + }, + { + "epoch": 0.31, + "grad_norm": 0.6423786282539368, + "learning_rate": 0.0004020224052977604, + "loss": 3.5088, + "step": 2447 + }, + { + "epoch": 0.31, + "grad_norm": 0.7082242369651794, + "learning_rate": 0.00040194010336244696, + "loss": 3.3883, + "step": 2448 + }, + { + "epoch": 0.31, + "grad_norm": 0.6391067504882812, + "learning_rate": 0.0004018577753068683, + "loss": 3.2714, + "step": 2449 + }, + { + "epoch": 0.31, + "grad_norm": 0.7196114659309387, + "learning_rate": 0.00040177542114517765, + "loss": 3.5096, + "step": 2450 + }, + { + "epoch": 0.31, + "grad_norm": 0.6951587200164795, + "learning_rate": 0.0004016930408915325, + "loss": 3.3887, + "step": 2451 + }, + { + "epoch": 0.31, + "grad_norm": 0.622295081615448, + "learning_rate": 0.00040161063456009506, + "loss": 3.3613, + "step": 2452 + }, + { + "epoch": 0.31, + "grad_norm": 0.6327260732650757, + "learning_rate": 0.00040152820216503196, + "loss": 3.4621, + "step": 2453 + }, + { + "epoch": 0.31, + "grad_norm": 0.6431770920753479, + "learning_rate": 0.00040144574372051416, + "loss": 3.4108, + "step": 2454 + }, + { + "epoch": 0.31, + "grad_norm": 0.6560462117195129, + "learning_rate": 0.0004013632592407174, + "loss": 3.4254, + "step": 2455 + }, + { + "epoch": 0.31, + "grad_norm": 0.672125518321991, + "learning_rate": 0.00040128074873982156, + "loss": 3.45, + "step": 2456 + }, + { + "epoch": 0.31, + "grad_norm": 0.6163976788520813, + "learning_rate": 0.0004011982122320112, + "loss": 3.3067, + "step": 2457 + }, + { + "epoch": 0.31, + "grad_norm": 0.667873740196228, + "learning_rate": 0.0004011156497314754, + "loss": 3.3488, + "step": 2458 + }, + { + "epoch": 0.31, + "grad_norm": 0.6594126224517822, + "learning_rate": 0.00040103306125240746, + "loss": 3.5699, + "step": 2459 + }, + { + "epoch": 0.31, + "grad_norm": 0.6237178444862366, + "learning_rate": 0.0004009504468090054, + "loss": 3.5031, + "step": 2460 + }, + { + "epoch": 0.32, + "grad_norm": 0.6896465420722961, + "learning_rate": 0.00040086780641547157, + "loss": 3.4244, + "step": 2461 + }, + { + "epoch": 0.32, + "grad_norm": 0.6883692145347595, + "learning_rate": 0.00040078514008601274, + "loss": 3.4431, + "step": 2462 + }, + { + "epoch": 0.32, + "grad_norm": 0.6940760016441345, + "learning_rate": 0.00040070244783484035, + "loss": 3.3533, + "step": 2463 + }, + { + "epoch": 0.32, + "grad_norm": 0.6569497585296631, + "learning_rate": 0.00040061972967617013, + "loss": 3.4175, + "step": 2464 + }, + { + "epoch": 0.32, + "grad_norm": 0.6395750641822815, + "learning_rate": 0.00040053698562422216, + "loss": 3.5232, + "step": 2465 + }, + { + "epoch": 0.32, + "grad_norm": 0.6653283834457397, + "learning_rate": 0.00040045421569322116, + "loss": 3.3437, + "step": 2466 + }, + { + "epoch": 0.32, + "grad_norm": 0.6142536997795105, + "learning_rate": 0.00040037141989739626, + "loss": 3.3303, + "step": 2467 + }, + { + "epoch": 0.32, + "grad_norm": 0.6975305080413818, + "learning_rate": 0.0004002885982509811, + "loss": 3.3792, + "step": 2468 + }, + { + "epoch": 0.32, + "grad_norm": 0.6583127975463867, + "learning_rate": 0.0004002057507682136, + "loss": 3.5056, + "step": 2469 + }, + { + "epoch": 0.32, + "grad_norm": 0.6988340616226196, + "learning_rate": 0.0004001228774633362, + "loss": 3.4311, + "step": 2470 + }, + { + "epoch": 0.32, + "grad_norm": 0.7292492985725403, + "learning_rate": 0.0004000399783505957, + "loss": 3.3629, + "step": 2471 + }, + { + "epoch": 0.32, + "grad_norm": 0.6581091284751892, + "learning_rate": 0.0003999570534442436, + "loss": 3.3712, + "step": 2472 + }, + { + "epoch": 0.32, + "grad_norm": 0.6563900113105774, + "learning_rate": 0.0003998741027585356, + "loss": 3.4227, + "step": 2473 + }, + { + "epoch": 0.32, + "grad_norm": 0.7054997086524963, + "learning_rate": 0.0003997911263077318, + "loss": 3.4234, + "step": 2474 + }, + { + "epoch": 0.32, + "grad_norm": 0.7008715271949768, + "learning_rate": 0.0003997081241060968, + "loss": 3.505, + "step": 2475 + }, + { + "epoch": 0.32, + "grad_norm": 0.6388863921165466, + "learning_rate": 0.00039962509616789984, + "loss": 3.2693, + "step": 2476 + }, + { + "epoch": 0.32, + "grad_norm": 0.8109036087989807, + "learning_rate": 0.0003995420425074141, + "loss": 3.3847, + "step": 2477 + }, + { + "epoch": 0.32, + "grad_norm": 0.6357604265213013, + "learning_rate": 0.00039945896313891774, + "loss": 3.3092, + "step": 2478 + }, + { + "epoch": 0.32, + "grad_norm": 0.6333456039428711, + "learning_rate": 0.00039937585807669286, + "loss": 3.29, + "step": 2479 + }, + { + "epoch": 0.32, + "grad_norm": 0.6567932963371277, + "learning_rate": 0.00039929272733502623, + "loss": 3.5207, + "step": 2480 + }, + { + "epoch": 0.32, + "grad_norm": 0.6872777938842773, + "learning_rate": 0.0003992095709282091, + "loss": 3.3851, + "step": 2481 + }, + { + "epoch": 0.32, + "grad_norm": 0.6259168982505798, + "learning_rate": 0.0003991263888705369, + "loss": 3.4827, + "step": 2482 + }, + { + "epoch": 0.32, + "grad_norm": 0.637649655342102, + "learning_rate": 0.00039904318117630965, + "loss": 3.6457, + "step": 2483 + }, + { + "epoch": 0.32, + "grad_norm": 0.6806047558784485, + "learning_rate": 0.0003989599478598317, + "loss": 3.3285, + "step": 2484 + }, + { + "epoch": 0.32, + "grad_norm": 0.6640322208404541, + "learning_rate": 0.0003988766889354118, + "loss": 3.4148, + "step": 2485 + }, + { + "epoch": 0.32, + "grad_norm": 0.6474546790122986, + "learning_rate": 0.00039879340441736314, + "loss": 3.3632, + "step": 2486 + }, + { + "epoch": 0.32, + "grad_norm": 0.6708735823631287, + "learning_rate": 0.0003987100943200033, + "loss": 3.4945, + "step": 2487 + }, + { + "epoch": 0.32, + "grad_norm": 0.7857782244682312, + "learning_rate": 0.0003986267586576543, + "loss": 3.45, + "step": 2488 + }, + { + "epoch": 0.32, + "grad_norm": 0.733543872833252, + "learning_rate": 0.0003985433974446424, + "loss": 3.4585, + "step": 2489 + }, + { + "epoch": 0.32, + "grad_norm": 0.687157392501831, + "learning_rate": 0.0003984600106952985, + "loss": 3.2763, + "step": 2490 + }, + { + "epoch": 0.32, + "grad_norm": 0.6997659206390381, + "learning_rate": 0.00039837659842395756, + "loss": 3.529, + "step": 2491 + }, + { + "epoch": 0.32, + "grad_norm": 0.6506561636924744, + "learning_rate": 0.0003982931606449592, + "loss": 3.5072, + "step": 2492 + }, + { + "epoch": 0.32, + "grad_norm": 0.6445373296737671, + "learning_rate": 0.00039820969737264743, + "loss": 3.3944, + "step": 2493 + }, + { + "epoch": 0.32, + "grad_norm": 0.6242894530296326, + "learning_rate": 0.00039812620862137056, + "loss": 3.4263, + "step": 2494 + }, + { + "epoch": 0.32, + "grad_norm": 0.5803747773170471, + "learning_rate": 0.0003980426944054811, + "loss": 3.3099, + "step": 2495 + }, + { + "epoch": 0.32, + "grad_norm": 0.6158910989761353, + "learning_rate": 0.00039795915473933623, + "loss": 3.5117, + "step": 2496 + }, + { + "epoch": 0.32, + "grad_norm": 0.6407814621925354, + "learning_rate": 0.00039787558963729744, + "loss": 3.3517, + "step": 2497 + }, + { + "epoch": 0.32, + "grad_norm": 0.6843712329864502, + "learning_rate": 0.0003977919991137304, + "loss": 3.4314, + "step": 2498 + }, + { + "epoch": 0.32, + "grad_norm": 0.7148441076278687, + "learning_rate": 0.0003977083831830054, + "loss": 3.3656, + "step": 2499 + }, + { + "epoch": 0.32, + "grad_norm": 0.6620389819145203, + "learning_rate": 0.00039762474185949686, + "loss": 3.4349, + "step": 2500 + }, + { + "epoch": 0.32, + "grad_norm": 0.6523193120956421, + "learning_rate": 0.0003975410751575839, + "loss": 3.4915, + "step": 2501 + }, + { + "epoch": 0.32, + "grad_norm": 0.6660597920417786, + "learning_rate": 0.0003974573830916496, + "loss": 3.4874, + "step": 2502 + }, + { + "epoch": 0.32, + "grad_norm": 0.7087952494621277, + "learning_rate": 0.0003973736656760817, + "loss": 3.5746, + "step": 2503 + }, + { + "epoch": 0.32, + "grad_norm": 0.7073147296905518, + "learning_rate": 0.0003972899229252721, + "loss": 3.3991, + "step": 2504 + }, + { + "epoch": 0.32, + "grad_norm": 0.6184029579162598, + "learning_rate": 0.00039720615485361733, + "loss": 3.4825, + "step": 2505 + }, + { + "epoch": 0.32, + "grad_norm": 0.6575698256492615, + "learning_rate": 0.00039712236147551795, + "loss": 3.4063, + "step": 2506 + }, + { + "epoch": 0.32, + "grad_norm": 0.648165762424469, + "learning_rate": 0.000397038542805379, + "loss": 3.5048, + "step": 2507 + }, + { + "epoch": 0.32, + "grad_norm": 0.6533029079437256, + "learning_rate": 0.00039695469885761, + "loss": 3.3698, + "step": 2508 + }, + { + "epoch": 0.32, + "grad_norm": 0.6076450943946838, + "learning_rate": 0.0003968708296466245, + "loss": 3.3721, + "step": 2509 + }, + { + "epoch": 0.32, + "grad_norm": 0.631220281124115, + "learning_rate": 0.00039678693518684083, + "loss": 3.3633, + "step": 2510 + }, + { + "epoch": 0.32, + "grad_norm": 0.6237044334411621, + "learning_rate": 0.0003967030154926813, + "loss": 3.4118, + "step": 2511 + }, + { + "epoch": 0.32, + "grad_norm": 0.712161660194397, + "learning_rate": 0.00039661907057857263, + "loss": 3.386, + "step": 2512 + }, + { + "epoch": 0.32, + "grad_norm": 0.6828798651695251, + "learning_rate": 0.0003965351004589459, + "loss": 3.4011, + "step": 2513 + }, + { + "epoch": 0.32, + "grad_norm": 0.6511789560317993, + "learning_rate": 0.0003964511051482367, + "loss": 3.3554, + "step": 2514 + }, + { + "epoch": 0.32, + "grad_norm": 0.7005689740180969, + "learning_rate": 0.00039636708466088476, + "loss": 3.4527, + "step": 2515 + }, + { + "epoch": 0.32, + "grad_norm": 0.6651748418807983, + "learning_rate": 0.00039628303901133413, + "loss": 3.4961, + "step": 2516 + }, + { + "epoch": 0.32, + "grad_norm": 0.6886200308799744, + "learning_rate": 0.00039619896821403315, + "loss": 3.3853, + "step": 2517 + }, + { + "epoch": 0.32, + "grad_norm": 0.6898598670959473, + "learning_rate": 0.0003961148722834347, + "loss": 3.3678, + "step": 2518 + }, + { + "epoch": 0.32, + "grad_norm": 0.6316580772399902, + "learning_rate": 0.0003960307512339958, + "loss": 3.3588, + "step": 2519 + }, + { + "epoch": 0.32, + "grad_norm": 0.6656391620635986, + "learning_rate": 0.00039594660508017774, + "loss": 3.4436, + "step": 2520 + }, + { + "epoch": 0.32, + "grad_norm": 0.7290733456611633, + "learning_rate": 0.00039586243383644645, + "loss": 3.4326, + "step": 2521 + }, + { + "epoch": 0.32, + "grad_norm": 0.684203565120697, + "learning_rate": 0.00039577823751727175, + "loss": 3.4813, + "step": 2522 + }, + { + "epoch": 0.32, + "grad_norm": 0.6514954566955566, + "learning_rate": 0.00039569401613712797, + "loss": 3.4506, + "step": 2523 + }, + { + "epoch": 0.32, + "grad_norm": 0.6980857849121094, + "learning_rate": 0.00039560976971049386, + "loss": 3.3717, + "step": 2524 + }, + { + "epoch": 0.32, + "grad_norm": 0.6839146614074707, + "learning_rate": 0.00039552549825185224, + "loss": 3.3792, + "step": 2525 + }, + { + "epoch": 0.32, + "grad_norm": 0.6302343010902405, + "learning_rate": 0.0003954412017756904, + "loss": 3.4761, + "step": 2526 + }, + { + "epoch": 0.32, + "grad_norm": 0.6123268604278564, + "learning_rate": 0.00039535688029649984, + "loss": 3.3494, + "step": 2527 + }, + { + "epoch": 0.32, + "grad_norm": 0.6835804581642151, + "learning_rate": 0.0003952725338287765, + "loss": 3.4457, + "step": 2528 + }, + { + "epoch": 0.32, + "grad_norm": 0.6334277391433716, + "learning_rate": 0.0003951881623870204, + "loss": 3.3608, + "step": 2529 + }, + { + "epoch": 0.32, + "grad_norm": 0.6541914939880371, + "learning_rate": 0.00039510376598573605, + "loss": 3.4332, + "step": 2530 + }, + { + "epoch": 0.32, + "grad_norm": 0.6691238880157471, + "learning_rate": 0.0003950193446394321, + "loss": 3.553, + "step": 2531 + }, + { + "epoch": 0.32, + "grad_norm": 0.5947278738021851, + "learning_rate": 0.00039493489836262165, + "loss": 3.4345, + "step": 2532 + }, + { + "epoch": 0.32, + "grad_norm": 0.6068588495254517, + "learning_rate": 0.0003948504271698219, + "loss": 3.4011, + "step": 2533 + }, + { + "epoch": 0.32, + "grad_norm": 0.6819738745689392, + "learning_rate": 0.0003947659310755545, + "loss": 3.3812, + "step": 2534 + }, + { + "epoch": 0.32, + "grad_norm": 0.6502918004989624, + "learning_rate": 0.00039468141009434533, + "loss": 3.5099, + "step": 2535 + }, + { + "epoch": 0.32, + "grad_norm": 0.6570164561271667, + "learning_rate": 0.0003945968642407244, + "loss": 3.4287, + "step": 2536 + }, + { + "epoch": 0.32, + "grad_norm": 0.7019553184509277, + "learning_rate": 0.0003945122935292261, + "loss": 3.3873, + "step": 2537 + }, + { + "epoch": 0.32, + "grad_norm": 0.6633791923522949, + "learning_rate": 0.00039442769797438937, + "loss": 3.4676, + "step": 2538 + }, + { + "epoch": 0.32, + "grad_norm": 0.6608754396438599, + "learning_rate": 0.0003943430775907569, + "loss": 3.3399, + "step": 2539 + }, + { + "epoch": 0.33, + "grad_norm": 0.6182772517204285, + "learning_rate": 0.0003942584323928762, + "loss": 3.4271, + "step": 2540 + }, + { + "epoch": 0.33, + "grad_norm": 0.7042610049247742, + "learning_rate": 0.00039417376239529847, + "loss": 3.4318, + "step": 2541 + }, + { + "epoch": 0.33, + "grad_norm": 0.691856861114502, + "learning_rate": 0.00039408906761257957, + "loss": 3.435, + "step": 2542 + }, + { + "epoch": 0.33, + "grad_norm": 0.692318856716156, + "learning_rate": 0.00039400434805927947, + "loss": 3.5448, + "step": 2543 + }, + { + "epoch": 0.33, + "grad_norm": 0.6041357517242432, + "learning_rate": 0.00039391960374996263, + "loss": 3.3525, + "step": 2544 + }, + { + "epoch": 0.33, + "grad_norm": 0.7062011361122131, + "learning_rate": 0.0003938348346991973, + "loss": 3.5398, + "step": 2545 + }, + { + "epoch": 0.33, + "grad_norm": 0.6786819100379944, + "learning_rate": 0.0003937500409215565, + "loss": 3.3596, + "step": 2546 + }, + { + "epoch": 0.33, + "grad_norm": 0.6160567998886108, + "learning_rate": 0.0003936652224316172, + "loss": 3.3977, + "step": 2547 + }, + { + "epoch": 0.33, + "grad_norm": 0.6385580897331238, + "learning_rate": 0.00039358037924396055, + "loss": 3.4078, + "step": 2548 + }, + { + "epoch": 0.33, + "grad_norm": 0.6399730443954468, + "learning_rate": 0.0003934955113731723, + "loss": 3.329, + "step": 2549 + }, + { + "epoch": 0.33, + "grad_norm": 0.6369842290878296, + "learning_rate": 0.000393410618833842, + "loss": 3.5006, + "step": 2550 + }, + { + "epoch": 0.33, + "grad_norm": 0.6642215251922607, + "learning_rate": 0.0003933257016405638, + "loss": 3.3534, + "step": 2551 + }, + { + "epoch": 0.33, + "grad_norm": 0.7062375545501709, + "learning_rate": 0.0003932407598079359, + "loss": 3.4078, + "step": 2552 + }, + { + "epoch": 0.33, + "grad_norm": 0.6758290529251099, + "learning_rate": 0.0003931557933505608, + "loss": 3.3559, + "step": 2553 + }, + { + "epoch": 0.33, + "grad_norm": 0.6561087965965271, + "learning_rate": 0.00039307080228304526, + "loss": 3.4428, + "step": 2554 + }, + { + "epoch": 0.33, + "grad_norm": 0.6365450620651245, + "learning_rate": 0.0003929857866200002, + "loss": 3.3149, + "step": 2555 + }, + { + "epoch": 0.33, + "grad_norm": 0.6529794335365295, + "learning_rate": 0.0003929007463760407, + "loss": 3.4746, + "step": 2556 + }, + { + "epoch": 0.33, + "grad_norm": 0.6441689133644104, + "learning_rate": 0.0003928156815657863, + "loss": 3.3952, + "step": 2557 + }, + { + "epoch": 0.33, + "grad_norm": 0.6540311574935913, + "learning_rate": 0.00039273059220386065, + "loss": 3.522, + "step": 2558 + }, + { + "epoch": 0.33, + "grad_norm": 0.6645547747612, + "learning_rate": 0.0003926454783048914, + "loss": 3.4708, + "step": 2559 + }, + { + "epoch": 0.33, + "grad_norm": 0.6746283173561096, + "learning_rate": 0.0003925603398835108, + "loss": 3.3483, + "step": 2560 + }, + { + "epoch": 0.33, + "grad_norm": 0.6782620549201965, + "learning_rate": 0.00039247517695435507, + "loss": 3.3578, + "step": 2561 + }, + { + "epoch": 0.33, + "grad_norm": 0.6735712289810181, + "learning_rate": 0.00039238998953206473, + "loss": 3.3935, + "step": 2562 + }, + { + "epoch": 0.33, + "grad_norm": 0.6714909672737122, + "learning_rate": 0.0003923047776312844, + "loss": 3.4678, + "step": 2563 + }, + { + "epoch": 0.33, + "grad_norm": 0.7226071953773499, + "learning_rate": 0.00039221954126666313, + "loss": 3.5213, + "step": 2564 + }, + { + "epoch": 0.33, + "grad_norm": 0.6498965620994568, + "learning_rate": 0.00039213428045285385, + "loss": 3.4121, + "step": 2565 + }, + { + "epoch": 0.33, + "grad_norm": 0.6417312622070312, + "learning_rate": 0.0003920489952045141, + "loss": 3.3316, + "step": 2566 + }, + { + "epoch": 0.33, + "grad_norm": 0.6538100838661194, + "learning_rate": 0.00039196368553630535, + "loss": 3.3282, + "step": 2567 + }, + { + "epoch": 0.33, + "grad_norm": 0.6624993085861206, + "learning_rate": 0.0003918783514628932, + "loss": 3.3879, + "step": 2568 + }, + { + "epoch": 0.33, + "grad_norm": 0.662112295627594, + "learning_rate": 0.0003917929929989476, + "loss": 3.3907, + "step": 2569 + }, + { + "epoch": 0.33, + "grad_norm": 0.6703125238418579, + "learning_rate": 0.0003917076101591427, + "loss": 3.5137, + "step": 2570 + }, + { + "epoch": 0.33, + "grad_norm": 0.6566250920295715, + "learning_rate": 0.00039162220295815684, + "loss": 3.4737, + "step": 2571 + }, + { + "epoch": 0.33, + "grad_norm": 0.6487689018249512, + "learning_rate": 0.00039153677141067254, + "loss": 3.3527, + "step": 2572 + }, + { + "epoch": 0.33, + "grad_norm": 0.704270601272583, + "learning_rate": 0.0003914513155313763, + "loss": 3.5182, + "step": 2573 + }, + { + "epoch": 0.33, + "grad_norm": 0.793128252029419, + "learning_rate": 0.00039136583533495905, + "loss": 3.3832, + "step": 2574 + }, + { + "epoch": 0.33, + "grad_norm": 0.6730992197990417, + "learning_rate": 0.0003912803308361159, + "loss": 3.6286, + "step": 2575 + }, + { + "epoch": 0.33, + "grad_norm": 0.6524163484573364, + "learning_rate": 0.00039119480204954606, + "loss": 3.4998, + "step": 2576 + }, + { + "epoch": 0.33, + "grad_norm": 0.7632148265838623, + "learning_rate": 0.00039110924898995284, + "loss": 3.4395, + "step": 2577 + }, + { + "epoch": 0.33, + "grad_norm": 0.6825524568557739, + "learning_rate": 0.0003910236716720438, + "loss": 3.4272, + "step": 2578 + }, + { + "epoch": 0.33, + "grad_norm": 0.6332438588142395, + "learning_rate": 0.00039093807011053073, + "loss": 3.4783, + "step": 2579 + }, + { + "epoch": 0.33, + "grad_norm": 0.6003892421722412, + "learning_rate": 0.0003908524443201296, + "loss": 3.3101, + "step": 2580 + }, + { + "epoch": 0.33, + "grad_norm": 0.6262263655662537, + "learning_rate": 0.0003907667943155603, + "loss": 3.4023, + "step": 2581 + }, + { + "epoch": 0.33, + "grad_norm": 0.6281822919845581, + "learning_rate": 0.00039068112011154715, + "loss": 3.4619, + "step": 2582 + }, + { + "epoch": 0.33, + "grad_norm": 0.6222302317619324, + "learning_rate": 0.0003905954217228186, + "loss": 3.2979, + "step": 2583 + }, + { + "epoch": 0.33, + "grad_norm": 0.6705178022384644, + "learning_rate": 0.0003905096991641071, + "loss": 3.4174, + "step": 2584 + }, + { + "epoch": 0.33, + "grad_norm": 0.6498988270759583, + "learning_rate": 0.00039042395245014933, + "loss": 3.4733, + "step": 2585 + }, + { + "epoch": 0.33, + "grad_norm": 0.6690146327018738, + "learning_rate": 0.00039033818159568624, + "loss": 3.3617, + "step": 2586 + }, + { + "epoch": 0.33, + "grad_norm": 0.7530152201652527, + "learning_rate": 0.0003902523866154628, + "loss": 3.3473, + "step": 2587 + }, + { + "epoch": 0.33, + "grad_norm": 0.647878110408783, + "learning_rate": 0.00039016656752422814, + "loss": 3.3918, + "step": 2588 + }, + { + "epoch": 0.33, + "grad_norm": 0.6253050565719604, + "learning_rate": 0.00039008072433673556, + "loss": 3.3227, + "step": 2589 + }, + { + "epoch": 0.33, + "grad_norm": 0.6417919993400574, + "learning_rate": 0.00038999485706774263, + "loss": 3.3752, + "step": 2590 + }, + { + "epoch": 0.33, + "grad_norm": 0.7062673568725586, + "learning_rate": 0.0003899089657320107, + "loss": 3.4478, + "step": 2591 + }, + { + "epoch": 0.33, + "grad_norm": 0.6611924767494202, + "learning_rate": 0.0003898230503443055, + "loss": 3.4042, + "step": 2592 + }, + { + "epoch": 0.33, + "grad_norm": 0.6270459294319153, + "learning_rate": 0.0003897371109193972, + "loss": 3.4388, + "step": 2593 + }, + { + "epoch": 0.33, + "grad_norm": 0.6408137083053589, + "learning_rate": 0.00038965114747205944, + "loss": 3.3497, + "step": 2594 + }, + { + "epoch": 0.33, + "grad_norm": 0.7065438032150269, + "learning_rate": 0.0003895651600170705, + "loss": 3.5494, + "step": 2595 + }, + { + "epoch": 0.33, + "grad_norm": 0.6970211863517761, + "learning_rate": 0.0003894791485692125, + "loss": 3.4482, + "step": 2596 + }, + { + "epoch": 0.33, + "grad_norm": 0.6874566078186035, + "learning_rate": 0.0003893931131432719, + "loss": 3.3486, + "step": 2597 + }, + { + "epoch": 0.33, + "grad_norm": 0.6408682465553284, + "learning_rate": 0.00038930705375403923, + "loss": 3.4459, + "step": 2598 + }, + { + "epoch": 0.33, + "grad_norm": 0.6832528114318848, + "learning_rate": 0.000389220970416309, + "loss": 3.479, + "step": 2599 + }, + { + "epoch": 0.33, + "grad_norm": 0.7190806865692139, + "learning_rate": 0.00038913486314488, + "loss": 3.3861, + "step": 2600 + }, + { + "epoch": 0.33, + "grad_norm": 0.6788740754127502, + "learning_rate": 0.000389048731954555, + "loss": 3.5252, + "step": 2601 + }, + { + "epoch": 0.33, + "grad_norm": 0.7985811233520508, + "learning_rate": 0.0003889625768601409, + "loss": 3.3963, + "step": 2602 + }, + { + "epoch": 0.33, + "grad_norm": 0.654829740524292, + "learning_rate": 0.00038887639787644904, + "loss": 3.4419, + "step": 2603 + }, + { + "epoch": 0.33, + "grad_norm": 0.6960733532905579, + "learning_rate": 0.00038879019501829423, + "loss": 3.3564, + "step": 2604 + }, + { + "epoch": 0.33, + "grad_norm": 0.6409716010093689, + "learning_rate": 0.00038870396830049595, + "loss": 3.4309, + "step": 2605 + }, + { + "epoch": 0.33, + "grad_norm": 0.6938721537590027, + "learning_rate": 0.00038861771773787744, + "loss": 3.407, + "step": 2606 + }, + { + "epoch": 0.33, + "grad_norm": 0.6538400053977966, + "learning_rate": 0.00038853144334526643, + "loss": 3.4347, + "step": 2607 + }, + { + "epoch": 0.33, + "grad_norm": 0.6794121265411377, + "learning_rate": 0.0003884451451374942, + "loss": 3.3708, + "step": 2608 + }, + { + "epoch": 0.33, + "grad_norm": 0.6384621858596802, + "learning_rate": 0.00038835882312939656, + "loss": 3.4581, + "step": 2609 + }, + { + "epoch": 0.33, + "grad_norm": 0.6200606226921082, + "learning_rate": 0.0003882724773358133, + "loss": 3.2224, + "step": 2610 + }, + { + "epoch": 0.33, + "grad_norm": 0.7154632210731506, + "learning_rate": 0.00038818610777158804, + "loss": 3.3628, + "step": 2611 + }, + { + "epoch": 0.33, + "grad_norm": 0.7068455219268799, + "learning_rate": 0.000388099714451569, + "loss": 3.4331, + "step": 2612 + }, + { + "epoch": 0.33, + "grad_norm": 0.6456942558288574, + "learning_rate": 0.000388013297390608, + "loss": 3.3843, + "step": 2613 + }, + { + "epoch": 0.33, + "grad_norm": 0.7880919575691223, + "learning_rate": 0.0003879268566035612, + "loss": 3.3801, + "step": 2614 + }, + { + "epoch": 0.33, + "grad_norm": 0.7418599128723145, + "learning_rate": 0.0003878403921052887, + "loss": 3.436, + "step": 2615 + }, + { + "epoch": 0.33, + "grad_norm": 0.7047538161277771, + "learning_rate": 0.0003877539039106549, + "loss": 3.3842, + "step": 2616 + }, + { + "epoch": 0.33, + "grad_norm": 0.6789987683296204, + "learning_rate": 0.000387667392034528, + "loss": 3.4646, + "step": 2617 + }, + { + "epoch": 0.34, + "grad_norm": 0.6848494410514832, + "learning_rate": 0.00038758085649178044, + "loss": 3.4293, + "step": 2618 + }, + { + "epoch": 0.34, + "grad_norm": 0.6500349044799805, + "learning_rate": 0.0003874942972972887, + "loss": 3.4177, + "step": 2619 + }, + { + "epoch": 0.34, + "grad_norm": 0.7213976979255676, + "learning_rate": 0.00038740771446593323, + "loss": 3.5621, + "step": 2620 + }, + { + "epoch": 0.34, + "grad_norm": 0.6804102659225464, + "learning_rate": 0.00038732110801259874, + "loss": 3.4393, + "step": 2621 + }, + { + "epoch": 0.34, + "grad_norm": 0.6893405318260193, + "learning_rate": 0.00038723447795217374, + "loss": 3.482, + "step": 2622 + }, + { + "epoch": 0.34, + "grad_norm": 0.6176344752311707, + "learning_rate": 0.0003871478242995511, + "loss": 3.4335, + "step": 2623 + }, + { + "epoch": 0.34, + "grad_norm": 0.6017330288887024, + "learning_rate": 0.0003870611470696275, + "loss": 3.3067, + "step": 2624 + }, + { + "epoch": 0.34, + "grad_norm": 0.5874341726303101, + "learning_rate": 0.00038697444627730374, + "loss": 3.4075, + "step": 2625 + }, + { + "epoch": 0.34, + "grad_norm": 0.6801062226295471, + "learning_rate": 0.00038688772193748466, + "loss": 3.4583, + "step": 2626 + }, + { + "epoch": 0.34, + "grad_norm": 0.6636302471160889, + "learning_rate": 0.0003868009740650794, + "loss": 3.464, + "step": 2627 + }, + { + "epoch": 0.34, + "grad_norm": 0.6342689990997314, + "learning_rate": 0.00038671420267500067, + "loss": 3.4211, + "step": 2628 + }, + { + "epoch": 0.34, + "grad_norm": 0.679341733455658, + "learning_rate": 0.0003866274077821655, + "loss": 3.5502, + "step": 2629 + }, + { + "epoch": 0.34, + "grad_norm": 0.6680846810340881, + "learning_rate": 0.0003865405894014951, + "loss": 3.4471, + "step": 2630 + }, + { + "epoch": 0.34, + "grad_norm": 0.6836916208267212, + "learning_rate": 0.00038645374754791445, + "loss": 3.499, + "step": 2631 + }, + { + "epoch": 0.34, + "grad_norm": 0.698597252368927, + "learning_rate": 0.00038636688223635273, + "loss": 3.5174, + "step": 2632 + }, + { + "epoch": 0.34, + "grad_norm": 0.6660541296005249, + "learning_rate": 0.000386279993481743, + "loss": 3.4333, + "step": 2633 + }, + { + "epoch": 0.34, + "grad_norm": 0.6841675043106079, + "learning_rate": 0.0003861930812990225, + "loss": 3.511, + "step": 2634 + }, + { + "epoch": 0.34, + "grad_norm": 0.6387922167778015, + "learning_rate": 0.00038610614570313244, + "loss": 3.3794, + "step": 2635 + }, + { + "epoch": 0.34, + "grad_norm": 0.6555662751197815, + "learning_rate": 0.00038601918670901807, + "loss": 3.3818, + "step": 2636 + }, + { + "epoch": 0.34, + "grad_norm": 0.6639174222946167, + "learning_rate": 0.0003859322043316287, + "loss": 3.5306, + "step": 2637 + }, + { + "epoch": 0.34, + "grad_norm": 0.6673606038093567, + "learning_rate": 0.0003858451985859175, + "loss": 3.489, + "step": 2638 + }, + { + "epoch": 0.34, + "grad_norm": 0.6728341579437256, + "learning_rate": 0.0003857581694868417, + "loss": 3.3419, + "step": 2639 + }, + { + "epoch": 0.34, + "grad_norm": 0.6590186953544617, + "learning_rate": 0.00038567111704936285, + "loss": 3.3493, + "step": 2640 + }, + { + "epoch": 0.34, + "grad_norm": 0.6790418028831482, + "learning_rate": 0.00038558404128844604, + "loss": 3.4853, + "step": 2641 + }, + { + "epoch": 0.34, + "grad_norm": 0.660210371017456, + "learning_rate": 0.00038549694221906084, + "loss": 3.4395, + "step": 2642 + }, + { + "epoch": 0.34, + "grad_norm": 0.6540093421936035, + "learning_rate": 0.00038540981985618036, + "loss": 3.4581, + "step": 2643 + }, + { + "epoch": 0.34, + "grad_norm": 0.647119402885437, + "learning_rate": 0.00038532267421478205, + "loss": 3.5108, + "step": 2644 + }, + { + "epoch": 0.34, + "grad_norm": 0.6743528842926025, + "learning_rate": 0.0003852355053098473, + "loss": 3.3326, + "step": 2645 + }, + { + "epoch": 0.34, + "grad_norm": 0.6162041425704956, + "learning_rate": 0.00038514831315636134, + "loss": 3.4508, + "step": 2646 + }, + { + "epoch": 0.34, + "grad_norm": 0.6437183022499084, + "learning_rate": 0.00038506109776931366, + "loss": 3.3653, + "step": 2647 + }, + { + "epoch": 0.34, + "grad_norm": 0.6612119674682617, + "learning_rate": 0.0003849738591636974, + "loss": 3.4162, + "step": 2648 + }, + { + "epoch": 0.34, + "grad_norm": 0.6219061613082886, + "learning_rate": 0.00038488659735451, + "loss": 3.4372, + "step": 2649 + }, + { + "epoch": 0.34, + "grad_norm": 0.6129502654075623, + "learning_rate": 0.00038479931235675294, + "loss": 3.4622, + "step": 2650 + }, + { + "epoch": 0.34, + "grad_norm": 0.6514201164245605, + "learning_rate": 0.0003847120041854312, + "loss": 3.4952, + "step": 2651 + }, + { + "epoch": 0.34, + "grad_norm": 0.6656510233879089, + "learning_rate": 0.00038462467285555423, + "loss": 3.4955, + "step": 2652 + }, + { + "epoch": 0.34, + "grad_norm": 0.6361547708511353, + "learning_rate": 0.00038453731838213533, + "loss": 3.4209, + "step": 2653 + }, + { + "epoch": 0.34, + "grad_norm": 0.6556544303894043, + "learning_rate": 0.0003844499407801918, + "loss": 3.4026, + "step": 2654 + }, + { + "epoch": 0.34, + "grad_norm": 0.6475023627281189, + "learning_rate": 0.0003843625400647446, + "loss": 3.5694, + "step": 2655 + }, + { + "epoch": 0.34, + "grad_norm": 0.6740972995758057, + "learning_rate": 0.00038427511625081925, + "loss": 3.457, + "step": 2656 + }, + { + "epoch": 0.34, + "grad_norm": 0.7039819955825806, + "learning_rate": 0.00038418766935344466, + "loss": 3.5753, + "step": 2657 + }, + { + "epoch": 0.34, + "grad_norm": 0.6474445462226868, + "learning_rate": 0.00038410019938765413, + "loss": 3.4807, + "step": 2658 + }, + { + "epoch": 0.34, + "grad_norm": 0.6333473920822144, + "learning_rate": 0.00038401270636848474, + "loss": 3.423, + "step": 2659 + }, + { + "epoch": 0.34, + "grad_norm": 0.6756442785263062, + "learning_rate": 0.00038392519031097745, + "loss": 3.3655, + "step": 2660 + }, + { + "epoch": 0.34, + "grad_norm": 0.6421991586685181, + "learning_rate": 0.0003838376512301773, + "loss": 3.3833, + "step": 2661 + }, + { + "epoch": 0.34, + "grad_norm": 0.650489330291748, + "learning_rate": 0.0003837500891411334, + "loss": 3.4429, + "step": 2662 + }, + { + "epoch": 0.34, + "grad_norm": 0.6732882857322693, + "learning_rate": 0.0003836625040588986, + "loss": 3.3899, + "step": 2663 + }, + { + "epoch": 0.34, + "grad_norm": 0.6878411173820496, + "learning_rate": 0.00038357489599852984, + "loss": 3.4397, + "step": 2664 + }, + { + "epoch": 0.34, + "grad_norm": 0.6262323260307312, + "learning_rate": 0.0003834872649750879, + "loss": 3.2506, + "step": 2665 + }, + { + "epoch": 0.34, + "grad_norm": 0.6440932750701904, + "learning_rate": 0.00038339961100363753, + "loss": 3.4439, + "step": 2666 + }, + { + "epoch": 0.34, + "grad_norm": 0.6685276031494141, + "learning_rate": 0.0003833119340992476, + "loss": 3.5265, + "step": 2667 + }, + { + "epoch": 0.34, + "grad_norm": 0.6493540406227112, + "learning_rate": 0.0003832242342769907, + "loss": 3.3814, + "step": 2668 + }, + { + "epoch": 0.34, + "grad_norm": 0.6593091487884521, + "learning_rate": 0.00038313651155194345, + "loss": 3.3719, + "step": 2669 + }, + { + "epoch": 0.34, + "grad_norm": 0.647038459777832, + "learning_rate": 0.00038304876593918646, + "loss": 3.359, + "step": 2670 + }, + { + "epoch": 0.34, + "grad_norm": 0.697995126247406, + "learning_rate": 0.0003829609974538041, + "loss": 3.4759, + "step": 2671 + }, + { + "epoch": 0.34, + "grad_norm": 0.6449376344680786, + "learning_rate": 0.00038287320611088486, + "loss": 3.4125, + "step": 2672 + }, + { + "epoch": 0.34, + "grad_norm": 0.674048900604248, + "learning_rate": 0.00038278539192552113, + "loss": 3.4293, + "step": 2673 + }, + { + "epoch": 0.34, + "grad_norm": 0.6361321806907654, + "learning_rate": 0.00038269755491280916, + "loss": 3.4286, + "step": 2674 + }, + { + "epoch": 0.34, + "grad_norm": 0.673592746257782, + "learning_rate": 0.00038260969508784916, + "loss": 3.4306, + "step": 2675 + }, + { + "epoch": 0.34, + "grad_norm": 0.597294807434082, + "learning_rate": 0.00038252181246574516, + "loss": 3.3668, + "step": 2676 + }, + { + "epoch": 0.34, + "grad_norm": 0.6507534384727478, + "learning_rate": 0.0003824339070616053, + "loss": 3.4632, + "step": 2677 + }, + { + "epoch": 0.34, + "grad_norm": 0.6554521322250366, + "learning_rate": 0.0003823459788905415, + "loss": 3.5535, + "step": 2678 + }, + { + "epoch": 0.34, + "grad_norm": 0.668842077255249, + "learning_rate": 0.00038225802796766964, + "loss": 3.6099, + "step": 2679 + }, + { + "epoch": 0.34, + "grad_norm": 0.6702171564102173, + "learning_rate": 0.0003821700543081095, + "loss": 3.4481, + "step": 2680 + }, + { + "epoch": 0.34, + "grad_norm": 0.7185189127922058, + "learning_rate": 0.00038208205792698483, + "loss": 3.3188, + "step": 2681 + }, + { + "epoch": 0.34, + "grad_norm": 0.6936577558517456, + "learning_rate": 0.0003819940388394232, + "loss": 3.4154, + "step": 2682 + }, + { + "epoch": 0.34, + "grad_norm": 0.6211004257202148, + "learning_rate": 0.00038190599706055595, + "loss": 3.3764, + "step": 2683 + }, + { + "epoch": 0.34, + "grad_norm": 0.7152443528175354, + "learning_rate": 0.00038181793260551875, + "loss": 3.499, + "step": 2684 + }, + { + "epoch": 0.34, + "grad_norm": 0.5948370695114136, + "learning_rate": 0.00038172984548945067, + "loss": 3.3368, + "step": 2685 + }, + { + "epoch": 0.34, + "grad_norm": 0.6092764139175415, + "learning_rate": 0.00038164173572749504, + "loss": 3.3187, + "step": 2686 + }, + { + "epoch": 0.34, + "grad_norm": 0.6423379182815552, + "learning_rate": 0.0003815536033347989, + "loss": 3.5358, + "step": 2687 + }, + { + "epoch": 0.34, + "grad_norm": 0.6405006051063538, + "learning_rate": 0.0003814654483265134, + "loss": 3.3378, + "step": 2688 + }, + { + "epoch": 0.34, + "grad_norm": 0.7445230484008789, + "learning_rate": 0.00038137727071779304, + "loss": 3.4888, + "step": 2689 + }, + { + "epoch": 0.34, + "grad_norm": 0.6603292226791382, + "learning_rate": 0.0003812890705237969, + "loss": 3.4612, + "step": 2690 + }, + { + "epoch": 0.34, + "grad_norm": 0.6725325584411621, + "learning_rate": 0.0003812008477596875, + "loss": 3.4146, + "step": 2691 + }, + { + "epoch": 0.34, + "grad_norm": 0.647088885307312, + "learning_rate": 0.0003811126024406314, + "loss": 3.3068, + "step": 2692 + }, + { + "epoch": 0.34, + "grad_norm": 0.6597961187362671, + "learning_rate": 0.00038102433458179883, + "loss": 3.4599, + "step": 2693 + }, + { + "epoch": 0.34, + "grad_norm": 0.6334750652313232, + "learning_rate": 0.0003809360441983643, + "loss": 3.2942, + "step": 2694 + }, + { + "epoch": 0.34, + "grad_norm": 0.6588044166564941, + "learning_rate": 0.00038084773130550576, + "loss": 3.2866, + "step": 2695 + }, + { + "epoch": 0.35, + "grad_norm": 0.5897568464279175, + "learning_rate": 0.0003807593959184053, + "loss": 3.4746, + "step": 2696 + }, + { + "epoch": 0.35, + "grad_norm": 0.6646286845207214, + "learning_rate": 0.0003806710380522488, + "loss": 3.3076, + "step": 2697 + }, + { + "epoch": 0.35, + "grad_norm": 0.6470707654953003, + "learning_rate": 0.000380582657722226, + "loss": 3.504, + "step": 2698 + }, + { + "epoch": 0.35, + "grad_norm": 0.6288967728614807, + "learning_rate": 0.00038049425494353047, + "loss": 3.4962, + "step": 2699 + }, + { + "epoch": 0.35, + "grad_norm": 0.6686561703681946, + "learning_rate": 0.00038040582973135974, + "loss": 3.4595, + "step": 2700 + }, + { + "epoch": 0.35, + "grad_norm": 0.6200351119041443, + "learning_rate": 0.00038031738210091506, + "loss": 3.3326, + "step": 2701 + }, + { + "epoch": 0.35, + "grad_norm": 0.634735643863678, + "learning_rate": 0.0003802289120674016, + "loss": 3.3659, + "step": 2702 + }, + { + "epoch": 0.35, + "grad_norm": 0.6304633021354675, + "learning_rate": 0.00038014041964602843, + "loss": 3.4768, + "step": 2703 + }, + { + "epoch": 0.35, + "grad_norm": 0.6305013298988342, + "learning_rate": 0.00038005190485200844, + "loss": 3.5768, + "step": 2704 + }, + { + "epoch": 0.35, + "grad_norm": 0.6871489882469177, + "learning_rate": 0.0003799633677005583, + "loss": 3.4581, + "step": 2705 + }, + { + "epoch": 0.35, + "grad_norm": 0.6479337215423584, + "learning_rate": 0.0003798748082068986, + "loss": 3.4541, + "step": 2706 + }, + { + "epoch": 0.35, + "grad_norm": 0.6679111123085022, + "learning_rate": 0.0003797862263862537, + "loss": 3.5238, + "step": 2707 + }, + { + "epoch": 0.35, + "grad_norm": 0.6705268621444702, + "learning_rate": 0.00037969762225385176, + "loss": 3.5165, + "step": 2708 + }, + { + "epoch": 0.35, + "grad_norm": 0.6185088157653809, + "learning_rate": 0.00037960899582492515, + "loss": 3.3261, + "step": 2709 + }, + { + "epoch": 0.35, + "grad_norm": 0.641598105430603, + "learning_rate": 0.00037952034711470953, + "loss": 3.3865, + "step": 2710 + }, + { + "epoch": 0.35, + "grad_norm": 0.6470401883125305, + "learning_rate": 0.00037943167613844477, + "loss": 3.4629, + "step": 2711 + }, + { + "epoch": 0.35, + "grad_norm": 0.6144194602966309, + "learning_rate": 0.0003793429829113743, + "loss": 3.2882, + "step": 2712 + }, + { + "epoch": 0.35, + "grad_norm": 0.6533951163291931, + "learning_rate": 0.0003792542674487456, + "loss": 3.4835, + "step": 2713 + }, + { + "epoch": 0.35, + "grad_norm": 0.6079481244087219, + "learning_rate": 0.00037916552976580993, + "loss": 3.408, + "step": 2714 + }, + { + "epoch": 0.35, + "grad_norm": 0.6616194248199463, + "learning_rate": 0.00037907676987782225, + "loss": 3.354, + "step": 2715 + }, + { + "epoch": 0.35, + "grad_norm": 0.6493124961853027, + "learning_rate": 0.0003789879878000415, + "loss": 3.3074, + "step": 2716 + }, + { + "epoch": 0.35, + "grad_norm": 0.6650919318199158, + "learning_rate": 0.00037889918354773016, + "loss": 3.2749, + "step": 2717 + }, + { + "epoch": 0.35, + "grad_norm": 0.6138140559196472, + "learning_rate": 0.00037881035713615495, + "loss": 3.3002, + "step": 2718 + }, + { + "epoch": 0.35, + "grad_norm": 0.6505001783370972, + "learning_rate": 0.000378721508580586, + "loss": 3.2198, + "step": 2719 + }, + { + "epoch": 0.35, + "grad_norm": 0.6526826024055481, + "learning_rate": 0.00037863263789629753, + "loss": 3.4036, + "step": 2720 + }, + { + "epoch": 0.35, + "grad_norm": 0.711990475654602, + "learning_rate": 0.00037854374509856733, + "loss": 3.4434, + "step": 2721 + }, + { + "epoch": 0.35, + "grad_norm": 0.6838361620903015, + "learning_rate": 0.00037845483020267713, + "loss": 3.4861, + "step": 2722 + }, + { + "epoch": 0.35, + "grad_norm": 0.7093937993049622, + "learning_rate": 0.00037836589322391234, + "loss": 3.3011, + "step": 2723 + }, + { + "epoch": 0.35, + "grad_norm": 0.6648284196853638, + "learning_rate": 0.00037827693417756254, + "loss": 3.3851, + "step": 2724 + }, + { + "epoch": 0.35, + "grad_norm": 0.6639228463172913, + "learning_rate": 0.00037818795307892057, + "loss": 3.4538, + "step": 2725 + }, + { + "epoch": 0.35, + "grad_norm": 0.6106069087982178, + "learning_rate": 0.0003780989499432833, + "loss": 3.3033, + "step": 2726 + }, + { + "epoch": 0.35, + "grad_norm": 0.6387519240379333, + "learning_rate": 0.0003780099247859516, + "loss": 3.371, + "step": 2727 + }, + { + "epoch": 0.35, + "grad_norm": 0.6642487049102783, + "learning_rate": 0.0003779208776222298, + "loss": 3.4189, + "step": 2728 + }, + { + "epoch": 0.35, + "grad_norm": 0.6124227046966553, + "learning_rate": 0.00037783180846742617, + "loss": 3.3224, + "step": 2729 + }, + { + "epoch": 0.35, + "grad_norm": 0.6687057614326477, + "learning_rate": 0.0003777427173368526, + "loss": 3.4333, + "step": 2730 + }, + { + "epoch": 0.35, + "grad_norm": 0.6118618249893188, + "learning_rate": 0.000377653604245825, + "loss": 3.4947, + "step": 2731 + }, + { + "epoch": 0.35, + "grad_norm": 0.6521013379096985, + "learning_rate": 0.00037756446920966303, + "loss": 3.3169, + "step": 2732 + }, + { + "epoch": 0.35, + "grad_norm": 0.6459051370620728, + "learning_rate": 0.00037747531224368994, + "loss": 3.4129, + "step": 2733 + }, + { + "epoch": 0.35, + "grad_norm": 0.6482908725738525, + "learning_rate": 0.0003773861333632328, + "loss": 3.4848, + "step": 2734 + }, + { + "epoch": 0.35, + "grad_norm": 0.6932248473167419, + "learning_rate": 0.00037729693258362263, + "loss": 3.3471, + "step": 2735 + }, + { + "epoch": 0.35, + "grad_norm": 0.7700514197349548, + "learning_rate": 0.00037720770992019393, + "loss": 3.4745, + "step": 2736 + }, + { + "epoch": 0.35, + "grad_norm": 0.6803534030914307, + "learning_rate": 0.0003771184653882852, + "loss": 3.4634, + "step": 2737 + }, + { + "epoch": 0.35, + "grad_norm": 0.655329704284668, + "learning_rate": 0.00037702919900323856, + "loss": 3.4858, + "step": 2738 + }, + { + "epoch": 0.35, + "grad_norm": 0.666143536567688, + "learning_rate": 0.0003769399107804, + "loss": 3.3936, + "step": 2739 + }, + { + "epoch": 0.35, + "grad_norm": 0.6388490200042725, + "learning_rate": 0.0003768506007351191, + "loss": 3.3838, + "step": 2740 + }, + { + "epoch": 0.35, + "grad_norm": 0.6686370372772217, + "learning_rate": 0.00037676126888274943, + "loss": 3.4085, + "step": 2741 + }, + { + "epoch": 0.35, + "grad_norm": 0.7050430178642273, + "learning_rate": 0.0003766719152386481, + "loss": 3.3971, + "step": 2742 + }, + { + "epoch": 0.35, + "grad_norm": 0.687915563583374, + "learning_rate": 0.00037658253981817603, + "loss": 3.4354, + "step": 2743 + }, + { + "epoch": 0.35, + "grad_norm": 0.6310634016990662, + "learning_rate": 0.00037649314263669785, + "loss": 3.4568, + "step": 2744 + }, + { + "epoch": 0.35, + "grad_norm": 0.6493231654167175, + "learning_rate": 0.000376403723709582, + "loss": 3.3632, + "step": 2745 + }, + { + "epoch": 0.35, + "grad_norm": 0.669558048248291, + "learning_rate": 0.0003763142830522007, + "loss": 3.3468, + "step": 2746 + }, + { + "epoch": 0.35, + "grad_norm": 0.721985399723053, + "learning_rate": 0.00037622482067992976, + "loss": 3.4897, + "step": 2747 + }, + { + "epoch": 0.35, + "grad_norm": 0.6772878766059875, + "learning_rate": 0.0003761353366081488, + "loss": 3.5065, + "step": 2748 + }, + { + "epoch": 0.35, + "grad_norm": 0.7323068380355835, + "learning_rate": 0.00037604583085224126, + "loss": 3.5024, + "step": 2749 + }, + { + "epoch": 0.35, + "grad_norm": 0.6881877779960632, + "learning_rate": 0.0003759563034275941, + "loss": 3.4767, + "step": 2750 + }, + { + "epoch": 0.35, + "grad_norm": 0.7242856621742249, + "learning_rate": 0.0003758667543495982, + "loss": 3.5577, + "step": 2751 + }, + { + "epoch": 0.35, + "grad_norm": 0.6376597285270691, + "learning_rate": 0.000375777183633648, + "loss": 3.5718, + "step": 2752 + }, + { + "epoch": 0.35, + "grad_norm": 0.6114027500152588, + "learning_rate": 0.00037568759129514185, + "loss": 3.324, + "step": 2753 + }, + { + "epoch": 0.35, + "grad_norm": 0.6938390731811523, + "learning_rate": 0.0003755979773494816, + "loss": 3.3563, + "step": 2754 + }, + { + "epoch": 0.35, + "grad_norm": 0.6962039470672607, + "learning_rate": 0.000375508341812073, + "loss": 3.4616, + "step": 2755 + }, + { + "epoch": 0.35, + "grad_norm": 0.6147626042366028, + "learning_rate": 0.00037541868469832547, + "loss": 3.3883, + "step": 2756 + }, + { + "epoch": 0.35, + "grad_norm": 0.646985650062561, + "learning_rate": 0.00037532900602365205, + "loss": 3.428, + "step": 2757 + }, + { + "epoch": 0.35, + "grad_norm": 0.669619083404541, + "learning_rate": 0.00037523930580346955, + "loss": 3.43, + "step": 2758 + }, + { + "epoch": 0.35, + "grad_norm": 0.720965564250946, + "learning_rate": 0.00037514958405319846, + "loss": 3.5539, + "step": 2759 + }, + { + "epoch": 0.35, + "grad_norm": 0.7012642025947571, + "learning_rate": 0.00037505984078826303, + "loss": 3.4185, + "step": 2760 + }, + { + "epoch": 0.35, + "grad_norm": 0.6531978845596313, + "learning_rate": 0.0003749700760240912, + "loss": 3.355, + "step": 2761 + }, + { + "epoch": 0.35, + "grad_norm": 0.6693536043167114, + "learning_rate": 0.0003748802897761144, + "loss": 3.4314, + "step": 2762 + }, + { + "epoch": 0.35, + "grad_norm": 0.6955552697181702, + "learning_rate": 0.0003747904820597682, + "loss": 3.308, + "step": 2763 + }, + { + "epoch": 0.35, + "grad_norm": 0.6971192955970764, + "learning_rate": 0.00037470065289049135, + "loss": 3.4387, + "step": 2764 + }, + { + "epoch": 0.35, + "grad_norm": 0.6421964764595032, + "learning_rate": 0.0003746108022837266, + "loss": 3.4253, + "step": 2765 + }, + { + "epoch": 0.35, + "grad_norm": 0.6529431939125061, + "learning_rate": 0.0003745209302549204, + "loss": 3.3858, + "step": 2766 + }, + { + "epoch": 0.35, + "grad_norm": 0.6435075998306274, + "learning_rate": 0.0003744310368195227, + "loss": 3.3976, + "step": 2767 + }, + { + "epoch": 0.35, + "grad_norm": 0.5953412055969238, + "learning_rate": 0.0003743411219929872, + "loss": 3.3261, + "step": 2768 + }, + { + "epoch": 0.35, + "grad_norm": 0.6816799640655518, + "learning_rate": 0.0003742511857907713, + "loss": 3.4159, + "step": 2769 + }, + { + "epoch": 0.35, + "grad_norm": 0.6983208656311035, + "learning_rate": 0.0003741612282283362, + "loss": 3.5298, + "step": 2770 + }, + { + "epoch": 0.35, + "grad_norm": 0.6705322861671448, + "learning_rate": 0.0003740712493211466, + "loss": 3.3556, + "step": 2771 + }, + { + "epoch": 0.35, + "grad_norm": 0.640531599521637, + "learning_rate": 0.0003739812490846708, + "loss": 3.3856, + "step": 2772 + }, + { + "epoch": 0.35, + "grad_norm": 0.6185344457626343, + "learning_rate": 0.000373891227534381, + "loss": 3.2589, + "step": 2773 + }, + { + "epoch": 0.36, + "grad_norm": 0.6326441168785095, + "learning_rate": 0.0003738011846857529, + "loss": 3.309, + "step": 2774 + }, + { + "epoch": 0.36, + "grad_norm": 0.6673996448516846, + "learning_rate": 0.000373711120554266, + "loss": 3.4861, + "step": 2775 + }, + { + "epoch": 0.36, + "grad_norm": 0.6781924962997437, + "learning_rate": 0.0003736210351554032, + "loss": 3.5098, + "step": 2776 + }, + { + "epoch": 0.36, + "grad_norm": 0.6595224142074585, + "learning_rate": 0.0003735309285046513, + "loss": 3.5276, + "step": 2777 + }, + { + "epoch": 0.36, + "grad_norm": 0.6162325143814087, + "learning_rate": 0.0003734408006175008, + "loss": 3.2696, + "step": 2778 + }, + { + "epoch": 0.36, + "grad_norm": 0.6548352241516113, + "learning_rate": 0.00037335065150944556, + "loss": 3.4189, + "step": 2779 + }, + { + "epoch": 0.36, + "grad_norm": 0.6298694610595703, + "learning_rate": 0.0003732604811959834, + "loss": 3.5247, + "step": 2780 + }, + { + "epoch": 0.36, + "grad_norm": 0.632470428943634, + "learning_rate": 0.00037317028969261547, + "loss": 3.3577, + "step": 2781 + }, + { + "epoch": 0.36, + "grad_norm": 0.6624454855918884, + "learning_rate": 0.00037308007701484684, + "loss": 3.3773, + "step": 2782 + }, + { + "epoch": 0.36, + "grad_norm": 0.641908586025238, + "learning_rate": 0.00037298984317818613, + "loss": 3.4905, + "step": 2783 + }, + { + "epoch": 0.36, + "grad_norm": 0.6339795589447021, + "learning_rate": 0.00037289958819814557, + "loss": 3.4135, + "step": 2784 + }, + { + "epoch": 0.36, + "grad_norm": 0.6595509052276611, + "learning_rate": 0.00037280931209024106, + "loss": 3.5151, + "step": 2785 + }, + { + "epoch": 0.36, + "grad_norm": 0.6234700679779053, + "learning_rate": 0.000372719014869992, + "loss": 3.3336, + "step": 2786 + }, + { + "epoch": 0.36, + "grad_norm": 0.6775194406509399, + "learning_rate": 0.0003726286965529216, + "loss": 3.3802, + "step": 2787 + }, + { + "epoch": 0.36, + "grad_norm": 0.6999110579490662, + "learning_rate": 0.00037253835715455664, + "loss": 3.4496, + "step": 2788 + }, + { + "epoch": 0.36, + "grad_norm": 0.6700520515441895, + "learning_rate": 0.00037244799669042754, + "loss": 3.4214, + "step": 2789 + }, + { + "epoch": 0.36, + "grad_norm": 0.659300684928894, + "learning_rate": 0.00037235761517606826, + "loss": 3.3645, + "step": 2790 + }, + { + "epoch": 0.36, + "grad_norm": 0.6916134357452393, + "learning_rate": 0.00037226721262701633, + "loss": 3.4435, + "step": 2791 + }, + { + "epoch": 0.36, + "grad_norm": 0.6676251292228699, + "learning_rate": 0.00037217678905881324, + "loss": 3.4753, + "step": 2792 + }, + { + "epoch": 0.36, + "grad_norm": 0.6123244762420654, + "learning_rate": 0.00037208634448700374, + "loss": 3.2478, + "step": 2793 + }, + { + "epoch": 0.36, + "grad_norm": 0.6385574340820312, + "learning_rate": 0.00037199587892713617, + "loss": 3.3875, + "step": 2794 + }, + { + "epoch": 0.36, + "grad_norm": 0.6606059074401855, + "learning_rate": 0.0003719053923947628, + "loss": 3.3749, + "step": 2795 + }, + { + "epoch": 0.36, + "grad_norm": 0.6060505509376526, + "learning_rate": 0.0003718148849054391, + "loss": 3.3241, + "step": 2796 + }, + { + "epoch": 0.36, + "grad_norm": 0.6498494148254395, + "learning_rate": 0.00037172435647472466, + "loss": 3.398, + "step": 2797 + }, + { + "epoch": 0.36, + "grad_norm": 0.6244494318962097, + "learning_rate": 0.0003716338071181821, + "loss": 3.531, + "step": 2798 + }, + { + "epoch": 0.36, + "grad_norm": 0.6035483479499817, + "learning_rate": 0.00037154323685137803, + "loss": 3.4189, + "step": 2799 + }, + { + "epoch": 0.36, + "grad_norm": 0.6718020439147949, + "learning_rate": 0.0003714526456898824, + "loss": 3.4351, + "step": 2800 + }, + { + "epoch": 0.36, + "grad_norm": 0.6160071492195129, + "learning_rate": 0.000371362033649269, + "loss": 3.396, + "step": 2801 + }, + { + "epoch": 0.36, + "grad_norm": 0.664903461933136, + "learning_rate": 0.00037127140074511516, + "loss": 3.4186, + "step": 2802 + }, + { + "epoch": 0.36, + "grad_norm": 0.6706874966621399, + "learning_rate": 0.0003711807469930016, + "loss": 3.4037, + "step": 2803 + }, + { + "epoch": 0.36, + "grad_norm": 0.6155506372451782, + "learning_rate": 0.0003710900724085128, + "loss": 3.3301, + "step": 2804 + }, + { + "epoch": 0.36, + "grad_norm": 0.616083025932312, + "learning_rate": 0.00037099937700723663, + "loss": 3.2487, + "step": 2805 + }, + { + "epoch": 0.36, + "grad_norm": 0.6222215890884399, + "learning_rate": 0.00037090866080476495, + "loss": 3.3808, + "step": 2806 + }, + { + "epoch": 0.36, + "grad_norm": 0.5854479670524597, + "learning_rate": 0.00037081792381669275, + "loss": 3.4023, + "step": 2807 + }, + { + "epoch": 0.36, + "grad_norm": 0.6443006992340088, + "learning_rate": 0.0003707271660586188, + "loss": 3.3804, + "step": 2808 + }, + { + "epoch": 0.36, + "grad_norm": 0.6444540619850159, + "learning_rate": 0.0003706363875461454, + "loss": 3.4384, + "step": 2809 + }, + { + "epoch": 0.36, + "grad_norm": 0.6260697841644287, + "learning_rate": 0.00037054558829487837, + "loss": 3.4442, + "step": 2810 + }, + { + "epoch": 0.36, + "grad_norm": 0.6451106667518616, + "learning_rate": 0.00037045476832042734, + "loss": 3.282, + "step": 2811 + }, + { + "epoch": 0.36, + "grad_norm": 0.6747676134109497, + "learning_rate": 0.00037036392763840513, + "loss": 3.4037, + "step": 2812 + }, + { + "epoch": 0.36, + "grad_norm": 0.6337499618530273, + "learning_rate": 0.00037027306626442847, + "loss": 3.3404, + "step": 2813 + }, + { + "epoch": 0.36, + "grad_norm": 0.6735157370567322, + "learning_rate": 0.0003701821842141173, + "loss": 3.5064, + "step": 2814 + }, + { + "epoch": 0.36, + "grad_norm": 0.6192700266838074, + "learning_rate": 0.0003700912815030955, + "loss": 3.3892, + "step": 2815 + }, + { + "epoch": 0.36, + "grad_norm": 0.6197276711463928, + "learning_rate": 0.0003700003581469901, + "loss": 3.4002, + "step": 2816 + }, + { + "epoch": 0.36, + "grad_norm": 0.6331234574317932, + "learning_rate": 0.000369909414161432, + "loss": 3.434, + "step": 2817 + }, + { + "epoch": 0.36, + "grad_norm": 0.6590570211410522, + "learning_rate": 0.0003698184495620555, + "loss": 3.5184, + "step": 2818 + }, + { + "epoch": 0.36, + "grad_norm": 0.6396684050559998, + "learning_rate": 0.00036972746436449844, + "loss": 3.4706, + "step": 2819 + }, + { + "epoch": 0.36, + "grad_norm": 0.6504268050193787, + "learning_rate": 0.0003696364585844023, + "loss": 3.4953, + "step": 2820 + }, + { + "epoch": 0.36, + "grad_norm": 0.6527693867683411, + "learning_rate": 0.000369545432237412, + "loss": 3.487, + "step": 2821 + }, + { + "epoch": 0.36, + "grad_norm": 0.6094881892204285, + "learning_rate": 0.0003694543853391759, + "loss": 3.2907, + "step": 2822 + }, + { + "epoch": 0.36, + "grad_norm": 0.6652140617370605, + "learning_rate": 0.0003693633179053462, + "loss": 3.3554, + "step": 2823 + }, + { + "epoch": 0.36, + "grad_norm": 0.6317107677459717, + "learning_rate": 0.00036927222995157837, + "loss": 3.4327, + "step": 2824 + }, + { + "epoch": 0.36, + "grad_norm": 0.6838727593421936, + "learning_rate": 0.0003691811214935315, + "loss": 3.3449, + "step": 2825 + }, + { + "epoch": 0.36, + "grad_norm": 0.6608859300613403, + "learning_rate": 0.00036908999254686817, + "loss": 3.4886, + "step": 2826 + }, + { + "epoch": 0.36, + "grad_norm": 0.7537527680397034, + "learning_rate": 0.00036899884312725453, + "loss": 3.4995, + "step": 2827 + }, + { + "epoch": 0.36, + "grad_norm": 0.6652011871337891, + "learning_rate": 0.0003689076732503601, + "loss": 3.5182, + "step": 2828 + }, + { + "epoch": 0.36, + "grad_norm": 0.6610903739929199, + "learning_rate": 0.0003688164829318583, + "loss": 3.4118, + "step": 2829 + }, + { + "epoch": 0.36, + "grad_norm": 0.6542768478393555, + "learning_rate": 0.00036872527218742557, + "loss": 3.2537, + "step": 2830 + }, + { + "epoch": 0.36, + "grad_norm": 0.6550228595733643, + "learning_rate": 0.00036863404103274215, + "loss": 3.4192, + "step": 2831 + }, + { + "epoch": 0.36, + "grad_norm": 0.6263365745544434, + "learning_rate": 0.00036854278948349184, + "loss": 3.3837, + "step": 2832 + }, + { + "epoch": 0.36, + "grad_norm": 0.6227504014968872, + "learning_rate": 0.0003684515175553616, + "loss": 3.3923, + "step": 2833 + }, + { + "epoch": 0.36, + "grad_norm": 0.7218218445777893, + "learning_rate": 0.00036836022526404237, + "loss": 3.4124, + "step": 2834 + }, + { + "epoch": 0.36, + "grad_norm": 0.6374344229698181, + "learning_rate": 0.0003682689126252283, + "loss": 3.3984, + "step": 2835 + }, + { + "epoch": 0.36, + "grad_norm": 0.6587662696838379, + "learning_rate": 0.0003681775796546171, + "loss": 3.4672, + "step": 2836 + }, + { + "epoch": 0.36, + "grad_norm": 0.6785812377929688, + "learning_rate": 0.0003680862263679097, + "loss": 3.3096, + "step": 2837 + }, + { + "epoch": 0.36, + "grad_norm": 0.6638329029083252, + "learning_rate": 0.00036799485278081126, + "loss": 3.4854, + "step": 2838 + }, + { + "epoch": 0.36, + "grad_norm": 0.6179649829864502, + "learning_rate": 0.0003679034589090296, + "loss": 3.4328, + "step": 2839 + }, + { + "epoch": 0.36, + "grad_norm": 0.5974080562591553, + "learning_rate": 0.0003678120447682765, + "loss": 3.2263, + "step": 2840 + }, + { + "epoch": 0.36, + "grad_norm": 0.6987083554267883, + "learning_rate": 0.0003677206103742671, + "loss": 3.486, + "step": 2841 + }, + { + "epoch": 0.36, + "grad_norm": 0.6530375480651855, + "learning_rate": 0.0003676291557427201, + "loss": 3.3044, + "step": 2842 + }, + { + "epoch": 0.36, + "grad_norm": 0.6578952670097351, + "learning_rate": 0.0003675376808893575, + "loss": 3.5347, + "step": 2843 + }, + { + "epoch": 0.36, + "grad_norm": 0.6601744890213013, + "learning_rate": 0.00036744618582990496, + "loss": 3.2729, + "step": 2844 + }, + { + "epoch": 0.36, + "grad_norm": 0.6604732275009155, + "learning_rate": 0.00036735467058009153, + "loss": 3.5031, + "step": 2845 + }, + { + "epoch": 0.36, + "grad_norm": 0.6677465438842773, + "learning_rate": 0.0003672631351556498, + "loss": 3.3901, + "step": 2846 + }, + { + "epoch": 0.36, + "grad_norm": 0.6700396537780762, + "learning_rate": 0.0003671715795723156, + "loss": 3.4247, + "step": 2847 + }, + { + "epoch": 0.36, + "grad_norm": 0.6824732422828674, + "learning_rate": 0.00036708000384582854, + "loss": 3.2445, + "step": 2848 + }, + { + "epoch": 0.36, + "grad_norm": 0.6635594964027405, + "learning_rate": 0.00036698840799193153, + "loss": 3.5453, + "step": 2849 + }, + { + "epoch": 0.36, + "grad_norm": 0.6748292446136475, + "learning_rate": 0.00036689679202637094, + "loss": 3.4804, + "step": 2850 + }, + { + "epoch": 0.36, + "grad_norm": 0.584503173828125, + "learning_rate": 0.0003668051559648965, + "loss": 3.4471, + "step": 2851 + }, + { + "epoch": 0.37, + "grad_norm": 0.6716008186340332, + "learning_rate": 0.00036671349982326173, + "loss": 3.3442, + "step": 2852 + }, + { + "epoch": 0.37, + "grad_norm": 0.6274146437644958, + "learning_rate": 0.00036662182361722333, + "loss": 3.3139, + "step": 2853 + }, + { + "epoch": 0.37, + "grad_norm": 0.6152539849281311, + "learning_rate": 0.00036653012736254136, + "loss": 3.4598, + "step": 2854 + }, + { + "epoch": 0.37, + "grad_norm": 0.6842306852340698, + "learning_rate": 0.0003664384110749797, + "loss": 3.4182, + "step": 2855 + }, + { + "epoch": 0.37, + "grad_norm": 0.705602765083313, + "learning_rate": 0.00036634667477030515, + "loss": 3.3613, + "step": 2856 + }, + { + "epoch": 0.37, + "grad_norm": 0.6812083125114441, + "learning_rate": 0.0003662549184642885, + "loss": 3.3681, + "step": 2857 + }, + { + "epoch": 0.37, + "grad_norm": 0.615364670753479, + "learning_rate": 0.00036616314217270354, + "loss": 3.3965, + "step": 2858 + }, + { + "epoch": 0.37, + "grad_norm": 0.5788252353668213, + "learning_rate": 0.00036607134591132783, + "loss": 3.3278, + "step": 2859 + }, + { + "epoch": 0.37, + "grad_norm": 0.6545475125312805, + "learning_rate": 0.00036597952969594217, + "loss": 3.4891, + "step": 2860 + }, + { + "epoch": 0.37, + "grad_norm": 0.6398218274116516, + "learning_rate": 0.0003658876935423307, + "loss": 3.429, + "step": 2861 + }, + { + "epoch": 0.37, + "grad_norm": 0.6287596821784973, + "learning_rate": 0.00036579583746628125, + "loss": 3.4181, + "step": 2862 + }, + { + "epoch": 0.37, + "grad_norm": 0.6481361985206604, + "learning_rate": 0.00036570396148358496, + "loss": 3.3941, + "step": 2863 + }, + { + "epoch": 0.37, + "grad_norm": 0.6730766892433167, + "learning_rate": 0.00036561206561003636, + "loss": 3.5306, + "step": 2864 + }, + { + "epoch": 0.37, + "grad_norm": 0.6656659841537476, + "learning_rate": 0.00036552014986143336, + "loss": 3.3658, + "step": 2865 + }, + { + "epoch": 0.37, + "grad_norm": 0.6573489904403687, + "learning_rate": 0.0003654282142535773, + "loss": 3.4707, + "step": 2866 + }, + { + "epoch": 0.37, + "grad_norm": 0.6266399621963501, + "learning_rate": 0.0003653362588022732, + "loss": 3.4187, + "step": 2867 + }, + { + "epoch": 0.37, + "grad_norm": 0.6363852620124817, + "learning_rate": 0.0003652442835233291, + "loss": 3.314, + "step": 2868 + }, + { + "epoch": 0.37, + "grad_norm": 0.6544065475463867, + "learning_rate": 0.0003651522884325565, + "loss": 3.3298, + "step": 2869 + }, + { + "epoch": 0.37, + "grad_norm": 0.6237480640411377, + "learning_rate": 0.0003650602735457706, + "loss": 3.2989, + "step": 2870 + }, + { + "epoch": 0.37, + "grad_norm": 0.627240777015686, + "learning_rate": 0.0003649682388787898, + "loss": 3.3804, + "step": 2871 + }, + { + "epoch": 0.37, + "grad_norm": 0.7018293142318726, + "learning_rate": 0.000364876184447436, + "loss": 3.3385, + "step": 2872 + }, + { + "epoch": 0.37, + "grad_norm": 0.6876696348190308, + "learning_rate": 0.0003647841102675342, + "loss": 3.3663, + "step": 2873 + }, + { + "epoch": 0.37, + "grad_norm": 0.6464250683784485, + "learning_rate": 0.0003646920163549132, + "loss": 3.4385, + "step": 2874 + }, + { + "epoch": 0.37, + "grad_norm": 0.703446626663208, + "learning_rate": 0.00036459990272540507, + "loss": 3.4518, + "step": 2875 + }, + { + "epoch": 0.37, + "grad_norm": 0.6278776526451111, + "learning_rate": 0.00036450776939484495, + "loss": 3.4672, + "step": 2876 + }, + { + "epoch": 0.37, + "grad_norm": 0.622167706489563, + "learning_rate": 0.0003644156163790719, + "loss": 3.4943, + "step": 2877 + }, + { + "epoch": 0.37, + "grad_norm": 0.643734335899353, + "learning_rate": 0.00036432344369392795, + "loss": 3.3365, + "step": 2878 + }, + { + "epoch": 0.37, + "grad_norm": 0.6084978580474854, + "learning_rate": 0.0003642312513552586, + "loss": 3.4268, + "step": 2879 + }, + { + "epoch": 0.37, + "grad_norm": 0.6124347448348999, + "learning_rate": 0.0003641390393789129, + "loss": 3.2882, + "step": 2880 + }, + { + "epoch": 0.37, + "grad_norm": 0.6220024824142456, + "learning_rate": 0.00036404680778074315, + "loss": 3.3692, + "step": 2881 + }, + { + "epoch": 0.37, + "grad_norm": 0.6398417949676514, + "learning_rate": 0.00036395455657660505, + "loss": 3.3702, + "step": 2882 + }, + { + "epoch": 0.37, + "grad_norm": 0.6671603918075562, + "learning_rate": 0.0003638622857823575, + "loss": 3.4744, + "step": 2883 + }, + { + "epoch": 0.37, + "grad_norm": 0.663680374622345, + "learning_rate": 0.000363769995413863, + "loss": 3.4448, + "step": 2884 + }, + { + "epoch": 0.37, + "grad_norm": 0.7110529541969299, + "learning_rate": 0.00036367768548698745, + "loss": 3.4378, + "step": 2885 + }, + { + "epoch": 0.37, + "grad_norm": 0.6788253784179688, + "learning_rate": 0.0003635853560175998, + "loss": 3.4987, + "step": 2886 + }, + { + "epoch": 0.37, + "grad_norm": 0.6238468289375305, + "learning_rate": 0.0003634930070215727, + "loss": 3.5116, + "step": 2887 + }, + { + "epoch": 0.37, + "grad_norm": 0.6631554365158081, + "learning_rate": 0.000363400638514782, + "loss": 3.2929, + "step": 2888 + }, + { + "epoch": 0.37, + "grad_norm": 0.6805015206336975, + "learning_rate": 0.0003633082505131069, + "loss": 3.4705, + "step": 2889 + }, + { + "epoch": 0.37, + "grad_norm": 0.6465250849723816, + "learning_rate": 0.0003632158430324299, + "loss": 3.3477, + "step": 2890 + }, + { + "epoch": 0.37, + "grad_norm": 0.5989782810211182, + "learning_rate": 0.000363123416088637, + "loss": 3.2646, + "step": 2891 + }, + { + "epoch": 0.37, + "grad_norm": 0.6498996019363403, + "learning_rate": 0.0003630309696976175, + "loss": 3.5004, + "step": 2892 + }, + { + "epoch": 0.37, + "grad_norm": 0.6347329020500183, + "learning_rate": 0.00036293850387526377, + "loss": 3.4345, + "step": 2893 + }, + { + "epoch": 0.37, + "grad_norm": 0.6744681000709534, + "learning_rate": 0.000362846018637472, + "loss": 3.3837, + "step": 2894 + }, + { + "epoch": 0.37, + "grad_norm": 0.6721587181091309, + "learning_rate": 0.00036275351400014144, + "loss": 3.3357, + "step": 2895 + }, + { + "epoch": 0.37, + "grad_norm": 0.6546547412872314, + "learning_rate": 0.00036266098997917476, + "loss": 3.4083, + "step": 2896 + }, + { + "epoch": 0.37, + "grad_norm": 0.6251435875892639, + "learning_rate": 0.00036256844659047767, + "loss": 3.4083, + "step": 2897 + }, + { + "epoch": 0.37, + "grad_norm": 0.6282581686973572, + "learning_rate": 0.0003624758838499596, + "loss": 3.4596, + "step": 2898 + }, + { + "epoch": 0.37, + "grad_norm": 0.6379485130310059, + "learning_rate": 0.0003623833017735333, + "loss": 3.3715, + "step": 2899 + }, + { + "epoch": 0.37, + "grad_norm": 0.6379944682121277, + "learning_rate": 0.00036229070037711446, + "loss": 3.2941, + "step": 2900 + }, + { + "epoch": 0.37, + "grad_norm": 0.7039225697517395, + "learning_rate": 0.0003621980796766225, + "loss": 3.4274, + "step": 2901 + }, + { + "epoch": 0.37, + "grad_norm": 0.5860920548439026, + "learning_rate": 0.0003621054396879798, + "loss": 3.333, + "step": 2902 + }, + { + "epoch": 0.37, + "grad_norm": 0.688922107219696, + "learning_rate": 0.00036201278042711257, + "loss": 3.4805, + "step": 2903 + }, + { + "epoch": 0.37, + "grad_norm": 0.6412578821182251, + "learning_rate": 0.0003619201019099497, + "loss": 3.4894, + "step": 2904 + }, + { + "epoch": 0.37, + "grad_norm": 0.7155341506004333, + "learning_rate": 0.0003618274041524239, + "loss": 3.4138, + "step": 2905 + }, + { + "epoch": 0.37, + "grad_norm": 0.7104753255844116, + "learning_rate": 0.00036173468717047086, + "loss": 3.4911, + "step": 2906 + }, + { + "epoch": 0.37, + "grad_norm": 0.6984987854957581, + "learning_rate": 0.0003616419509800297, + "loss": 3.4344, + "step": 2907 + }, + { + "epoch": 0.37, + "grad_norm": 0.6836965084075928, + "learning_rate": 0.00036154919559704303, + "loss": 3.2565, + "step": 2908 + }, + { + "epoch": 0.37, + "grad_norm": 0.6347119808197021, + "learning_rate": 0.0003614564210374563, + "loss": 3.3677, + "step": 2909 + }, + { + "epoch": 0.37, + "grad_norm": 0.577217698097229, + "learning_rate": 0.00036136362731721886, + "loss": 3.3257, + "step": 2910 + }, + { + "epoch": 0.37, + "grad_norm": 0.6259645819664001, + "learning_rate": 0.0003612708144522827, + "loss": 3.5003, + "step": 2911 + }, + { + "epoch": 0.37, + "grad_norm": 0.6633533239364624, + "learning_rate": 0.00036117798245860364, + "loss": 3.3711, + "step": 2912 + }, + { + "epoch": 0.37, + "grad_norm": 0.6357201337814331, + "learning_rate": 0.0003610851313521405, + "loss": 3.4742, + "step": 2913 + }, + { + "epoch": 0.37, + "grad_norm": 0.6130713820457458, + "learning_rate": 0.00036099226114885545, + "loss": 3.3955, + "step": 2914 + }, + { + "epoch": 0.37, + "grad_norm": 0.6467316150665283, + "learning_rate": 0.000360899371864714, + "loss": 3.5528, + "step": 2915 + }, + { + "epoch": 0.37, + "grad_norm": 0.6171731948852539, + "learning_rate": 0.00036080646351568485, + "loss": 3.3667, + "step": 2916 + }, + { + "epoch": 0.37, + "grad_norm": 0.6676499843597412, + "learning_rate": 0.0003607135361177401, + "loss": 3.3411, + "step": 2917 + }, + { + "epoch": 0.37, + "grad_norm": 0.6495273113250732, + "learning_rate": 0.000360620589686855, + "loss": 3.3802, + "step": 2918 + }, + { + "epoch": 0.37, + "grad_norm": 0.6387211680412292, + "learning_rate": 0.000360527624239008, + "loss": 3.3713, + "step": 2919 + }, + { + "epoch": 0.37, + "grad_norm": 0.6411932110786438, + "learning_rate": 0.0003604346397901811, + "loss": 3.3835, + "step": 2920 + }, + { + "epoch": 0.37, + "grad_norm": 0.614315927028656, + "learning_rate": 0.0003603416363563593, + "loss": 3.3809, + "step": 2921 + }, + { + "epoch": 0.37, + "grad_norm": 0.6927322745323181, + "learning_rate": 0.0003602486139535311, + "loss": 3.4335, + "step": 2922 + }, + { + "epoch": 0.37, + "grad_norm": 0.6915454864501953, + "learning_rate": 0.000360155572597688, + "loss": 3.3863, + "step": 2923 + }, + { + "epoch": 0.37, + "grad_norm": 0.6321678161621094, + "learning_rate": 0.000360062512304825, + "loss": 3.4565, + "step": 2924 + }, + { + "epoch": 0.37, + "grad_norm": 0.6756112575531006, + "learning_rate": 0.0003599694330909401, + "loss": 3.4476, + "step": 2925 + }, + { + "epoch": 0.37, + "grad_norm": 0.6556795239448547, + "learning_rate": 0.00035987633497203483, + "loss": 3.1846, + "step": 2926 + }, + { + "epoch": 0.37, + "grad_norm": 0.6876122355461121, + "learning_rate": 0.00035978321796411374, + "loss": 3.3614, + "step": 2927 + }, + { + "epoch": 0.37, + "grad_norm": 0.68753981590271, + "learning_rate": 0.00035969008208318483, + "loss": 3.3603, + "step": 2928 + }, + { + "epoch": 0.37, + "grad_norm": 0.6918085813522339, + "learning_rate": 0.0003595969273452591, + "loss": 3.2825, + "step": 2929 + }, + { + "epoch": 0.38, + "grad_norm": 0.6255301833152771, + "learning_rate": 0.00035950375376635104, + "loss": 3.4746, + "step": 2930 + }, + { + "epoch": 0.38, + "grad_norm": 0.6332477331161499, + "learning_rate": 0.00035941056136247826, + "loss": 3.3609, + "step": 2931 + }, + { + "epoch": 0.38, + "grad_norm": 0.6548829674720764, + "learning_rate": 0.0003593173501496616, + "loss": 3.4232, + "step": 2932 + }, + { + "epoch": 0.38, + "grad_norm": 0.6732133030891418, + "learning_rate": 0.00035922412014392503, + "loss": 3.4736, + "step": 2933 + }, + { + "epoch": 0.38, + "grad_norm": 0.6473472118377686, + "learning_rate": 0.0003591308713612961, + "loss": 3.4885, + "step": 2934 + }, + { + "epoch": 0.38, + "grad_norm": 0.7150130271911621, + "learning_rate": 0.0003590376038178051, + "loss": 3.352, + "step": 2935 + }, + { + "epoch": 0.38, + "grad_norm": 0.6881107687950134, + "learning_rate": 0.00035894431752948603, + "loss": 3.4317, + "step": 2936 + }, + { + "epoch": 0.38, + "grad_norm": 0.6364063024520874, + "learning_rate": 0.0003588510125123757, + "loss": 3.4329, + "step": 2937 + }, + { + "epoch": 0.38, + "grad_norm": 0.616174042224884, + "learning_rate": 0.00035875768878251444, + "loss": 3.4787, + "step": 2938 + }, + { + "epoch": 0.38, + "grad_norm": 0.7194497585296631, + "learning_rate": 0.0003586643463559457, + "loss": 3.5125, + "step": 2939 + }, + { + "epoch": 0.38, + "grad_norm": 0.6634018421173096, + "learning_rate": 0.000358570985248716, + "loss": 3.5313, + "step": 2940 + }, + { + "epoch": 0.38, + "grad_norm": 0.6058551073074341, + "learning_rate": 0.00035847760547687535, + "loss": 3.3203, + "step": 2941 + }, + { + "epoch": 0.38, + "grad_norm": 0.6337698698043823, + "learning_rate": 0.0003583842070564768, + "loss": 3.4544, + "step": 2942 + }, + { + "epoch": 0.38, + "grad_norm": 0.6180306077003479, + "learning_rate": 0.00035829079000357643, + "loss": 3.4748, + "step": 2943 + }, + { + "epoch": 0.38, + "grad_norm": 0.6458591222763062, + "learning_rate": 0.0003581973543342338, + "loss": 3.3677, + "step": 2944 + }, + { + "epoch": 0.38, + "grad_norm": 0.669301450252533, + "learning_rate": 0.00035810390006451177, + "loss": 3.3496, + "step": 2945 + }, + { + "epoch": 0.38, + "grad_norm": 0.6883298754692078, + "learning_rate": 0.00035801042721047606, + "loss": 3.4491, + "step": 2946 + }, + { + "epoch": 0.38, + "grad_norm": 0.565108597278595, + "learning_rate": 0.00035791693578819564, + "loss": 3.2918, + "step": 2947 + }, + { + "epoch": 0.38, + "grad_norm": 0.5855687856674194, + "learning_rate": 0.00035782342581374294, + "loss": 3.4187, + "step": 2948 + }, + { + "epoch": 0.38, + "grad_norm": 0.6642069220542908, + "learning_rate": 0.0003577298973031933, + "loss": 3.2816, + "step": 2949 + }, + { + "epoch": 0.38, + "grad_norm": 0.6544449925422668, + "learning_rate": 0.0003576363502726255, + "loss": 3.4615, + "step": 2950 + }, + { + "epoch": 0.38, + "grad_norm": 0.6845769882202148, + "learning_rate": 0.0003575427847381212, + "loss": 3.4364, + "step": 2951 + }, + { + "epoch": 0.38, + "grad_norm": 0.6053401827812195, + "learning_rate": 0.0003574492007157655, + "loss": 3.4218, + "step": 2952 + }, + { + "epoch": 0.38, + "grad_norm": 0.6466869711875916, + "learning_rate": 0.0003573555982216465, + "loss": 3.4526, + "step": 2953 + }, + { + "epoch": 0.38, + "grad_norm": 0.6494457721710205, + "learning_rate": 0.0003572619772718556, + "loss": 3.5021, + "step": 2954 + }, + { + "epoch": 0.38, + "grad_norm": 0.6246287226676941, + "learning_rate": 0.0003571683378824873, + "loss": 3.2659, + "step": 2955 + }, + { + "epoch": 0.38, + "grad_norm": 0.6110139489173889, + "learning_rate": 0.00035707468006963936, + "loss": 3.3118, + "step": 2956 + }, + { + "epoch": 0.38, + "grad_norm": 0.7253236174583435, + "learning_rate": 0.00035698100384941257, + "loss": 3.3502, + "step": 2957 + }, + { + "epoch": 0.38, + "grad_norm": 0.6442704796791077, + "learning_rate": 0.0003568873092379109, + "loss": 3.4506, + "step": 2958 + }, + { + "epoch": 0.38, + "grad_norm": 0.6649218201637268, + "learning_rate": 0.00035679359625124174, + "loss": 3.3949, + "step": 2959 + }, + { + "epoch": 0.38, + "grad_norm": 0.6611589789390564, + "learning_rate": 0.00035669986490551535, + "loss": 3.4551, + "step": 2960 + }, + { + "epoch": 0.38, + "grad_norm": 0.7117242217063904, + "learning_rate": 0.00035660611521684516, + "loss": 3.5832, + "step": 2961 + }, + { + "epoch": 0.38, + "grad_norm": 0.6103567481040955, + "learning_rate": 0.0003565123472013478, + "loss": 3.3479, + "step": 2962 + }, + { + "epoch": 0.38, + "grad_norm": 0.630219042301178, + "learning_rate": 0.00035641856087514337, + "loss": 3.3387, + "step": 2963 + }, + { + "epoch": 0.38, + "grad_norm": 0.5978261828422546, + "learning_rate": 0.0003563247562543545, + "loss": 3.3176, + "step": 2964 + }, + { + "epoch": 0.38, + "grad_norm": 0.6091091632843018, + "learning_rate": 0.00035623093335510736, + "loss": 3.2688, + "step": 2965 + }, + { + "epoch": 0.38, + "grad_norm": 0.6181175708770752, + "learning_rate": 0.0003561370921935313, + "loss": 3.2935, + "step": 2966 + }, + { + "epoch": 0.38, + "grad_norm": 0.6397256851196289, + "learning_rate": 0.00035604323278575856, + "loss": 3.3793, + "step": 2967 + }, + { + "epoch": 0.38, + "grad_norm": 0.6449773907661438, + "learning_rate": 0.0003559493551479249, + "loss": 3.4294, + "step": 2968 + }, + { + "epoch": 0.38, + "grad_norm": 0.6925517320632935, + "learning_rate": 0.00035585545929616867, + "loss": 3.4042, + "step": 2969 + }, + { + "epoch": 0.38, + "grad_norm": 0.6502744555473328, + "learning_rate": 0.0003557615452466319, + "loss": 3.5209, + "step": 2970 + }, + { + "epoch": 0.38, + "grad_norm": 0.6259154677391052, + "learning_rate": 0.0003556676130154594, + "loss": 3.3345, + "step": 2971 + }, + { + "epoch": 0.38, + "grad_norm": 0.6316712498664856, + "learning_rate": 0.00035557366261879916, + "loss": 3.304, + "step": 2972 + }, + { + "epoch": 0.38, + "grad_norm": 0.5849267840385437, + "learning_rate": 0.0003554796940728024, + "loss": 3.2746, + "step": 2973 + }, + { + "epoch": 0.38, + "grad_norm": 0.6785985231399536, + "learning_rate": 0.0003553857073936235, + "loss": 3.5004, + "step": 2974 + }, + { + "epoch": 0.38, + "grad_norm": 0.6512728333473206, + "learning_rate": 0.00035529170259741973, + "loss": 3.354, + "step": 2975 + }, + { + "epoch": 0.38, + "grad_norm": 0.6976585388183594, + "learning_rate": 0.00035519767970035147, + "loss": 3.3957, + "step": 2976 + }, + { + "epoch": 0.38, + "grad_norm": 0.6594891548156738, + "learning_rate": 0.0003551036387185827, + "loss": 3.2895, + "step": 2977 + }, + { + "epoch": 0.38, + "grad_norm": 0.7098269462585449, + "learning_rate": 0.00035500957966827994, + "loss": 3.5641, + "step": 2978 + }, + { + "epoch": 0.38, + "grad_norm": 0.6764531135559082, + "learning_rate": 0.000354915502565613, + "loss": 3.4566, + "step": 2979 + }, + { + "epoch": 0.38, + "grad_norm": 0.6904491186141968, + "learning_rate": 0.000354821407426755, + "loss": 3.3431, + "step": 2980 + }, + { + "epoch": 0.38, + "grad_norm": 0.6289408206939697, + "learning_rate": 0.00035472729426788176, + "loss": 3.4602, + "step": 2981 + }, + { + "epoch": 0.38, + "grad_norm": 0.5534930229187012, + "learning_rate": 0.0003546331631051726, + "loss": 3.2183, + "step": 2982 + }, + { + "epoch": 0.38, + "grad_norm": 0.6324417591094971, + "learning_rate": 0.0003545390139548096, + "loss": 3.3314, + "step": 2983 + }, + { + "epoch": 0.38, + "grad_norm": 0.6006765961647034, + "learning_rate": 0.0003544448468329783, + "loss": 3.4489, + "step": 2984 + }, + { + "epoch": 0.38, + "grad_norm": 1.3233200311660767, + "learning_rate": 0.0003543506617558669, + "loss": 3.4516, + "step": 2985 + }, + { + "epoch": 0.38, + "grad_norm": 0.6815298199653625, + "learning_rate": 0.0003542564587396671, + "loss": 3.3749, + "step": 2986 + }, + { + "epoch": 0.38, + "grad_norm": 0.5787482857704163, + "learning_rate": 0.0003541622378005733, + "loss": 3.3712, + "step": 2987 + }, + { + "epoch": 0.38, + "grad_norm": 0.6376410126686096, + "learning_rate": 0.0003540679989547833, + "loss": 3.2685, + "step": 2988 + }, + { + "epoch": 0.38, + "grad_norm": 0.6883410811424255, + "learning_rate": 0.00035397374221849786, + "loss": 3.5234, + "step": 2989 + }, + { + "epoch": 0.38, + "grad_norm": 0.7224739789962769, + "learning_rate": 0.00035387946760792073, + "loss": 3.576, + "step": 2990 + }, + { + "epoch": 0.38, + "grad_norm": 0.6121779084205627, + "learning_rate": 0.00035378517513925885, + "loss": 3.2946, + "step": 2991 + }, + { + "epoch": 0.38, + "grad_norm": 0.710949718952179, + "learning_rate": 0.0003536908648287222, + "loss": 3.4816, + "step": 2992 + }, + { + "epoch": 0.38, + "grad_norm": 0.5981950163841248, + "learning_rate": 0.0003535965366925238, + "loss": 3.4682, + "step": 2993 + }, + { + "epoch": 0.38, + "grad_norm": 0.6085212826728821, + "learning_rate": 0.0003535021907468797, + "loss": 3.2202, + "step": 2994 + }, + { + "epoch": 0.38, + "grad_norm": 0.6419751644134521, + "learning_rate": 0.0003534078270080091, + "loss": 3.344, + "step": 2995 + }, + { + "epoch": 0.38, + "grad_norm": 0.6410698890686035, + "learning_rate": 0.00035331344549213435, + "loss": 3.3171, + "step": 2996 + }, + { + "epoch": 0.38, + "grad_norm": 0.66268390417099, + "learning_rate": 0.0003532190462154805, + "loss": 3.3896, + "step": 2997 + }, + { + "epoch": 0.38, + "grad_norm": 0.6045586466789246, + "learning_rate": 0.000353124629194276, + "loss": 3.3563, + "step": 2998 + }, + { + "epoch": 0.38, + "grad_norm": 0.6830160021781921, + "learning_rate": 0.00035303019444475224, + "loss": 3.4542, + "step": 2999 + }, + { + "epoch": 0.38, + "grad_norm": 0.6896335482597351, + "learning_rate": 0.0003529357419831437, + "loss": 3.4167, + "step": 3000 + }, + { + "epoch": 0.38, + "grad_norm": 0.6890286803245544, + "learning_rate": 0.00035284127182568767, + "loss": 3.3208, + "step": 3001 + }, + { + "epoch": 0.38, + "grad_norm": 0.6445521712303162, + "learning_rate": 0.0003527467839886248, + "loss": 3.3125, + "step": 3002 + }, + { + "epoch": 0.38, + "grad_norm": 0.6485596299171448, + "learning_rate": 0.00035265227848819866, + "loss": 3.4856, + "step": 3003 + }, + { + "epoch": 0.38, + "grad_norm": 0.5812082886695862, + "learning_rate": 0.0003525577553406557, + "loss": 3.4143, + "step": 3004 + }, + { + "epoch": 0.38, + "grad_norm": 0.6749499440193176, + "learning_rate": 0.0003524632145622457, + "loss": 3.3787, + "step": 3005 + }, + { + "epoch": 0.38, + "grad_norm": 0.6414878964424133, + "learning_rate": 0.0003523686561692213, + "loss": 3.4175, + "step": 3006 + }, + { + "epoch": 0.38, + "grad_norm": 0.6318088173866272, + "learning_rate": 0.00035227408017783813, + "loss": 3.337, + "step": 3007 + }, + { + "epoch": 0.39, + "grad_norm": 0.6485508680343628, + "learning_rate": 0.0003521794866043549, + "loss": 3.4666, + "step": 3008 + }, + { + "epoch": 0.39, + "grad_norm": 0.6505287289619446, + "learning_rate": 0.0003520848754650333, + "loss": 3.4531, + "step": 3009 + }, + { + "epoch": 0.39, + "grad_norm": 0.6688392758369446, + "learning_rate": 0.00035199024677613826, + "loss": 3.5422, + "step": 3010 + }, + { + "epoch": 0.39, + "grad_norm": 0.6040188074111938, + "learning_rate": 0.00035189560055393744, + "loss": 3.2983, + "step": 3011 + }, + { + "epoch": 0.39, + "grad_norm": 0.6814796924591064, + "learning_rate": 0.0003518009368147015, + "loss": 3.4625, + "step": 3012 + }, + { + "epoch": 0.39, + "grad_norm": 0.6356769800186157, + "learning_rate": 0.00035170625557470433, + "loss": 3.4569, + "step": 3013 + }, + { + "epoch": 0.39, + "grad_norm": 0.6647804975509644, + "learning_rate": 0.0003516115568502228, + "loss": 3.3751, + "step": 3014 + }, + { + "epoch": 0.39, + "grad_norm": 0.651704728603363, + "learning_rate": 0.00035151684065753665, + "loss": 3.3811, + "step": 3015 + }, + { + "epoch": 0.39, + "grad_norm": 0.6553705930709839, + "learning_rate": 0.00035142210701292873, + "loss": 3.522, + "step": 3016 + }, + { + "epoch": 0.39, + "grad_norm": 0.6140713095664978, + "learning_rate": 0.0003513273559326848, + "loss": 3.371, + "step": 3017 + }, + { + "epoch": 0.39, + "grad_norm": 0.636650562286377, + "learning_rate": 0.00035123258743309375, + "loss": 3.3064, + "step": 3018 + }, + { + "epoch": 0.39, + "grad_norm": 0.6213472485542297, + "learning_rate": 0.0003511378015304473, + "loss": 3.5391, + "step": 3019 + }, + { + "epoch": 0.39, + "grad_norm": 0.6385437846183777, + "learning_rate": 0.00035104299824104026, + "loss": 3.3907, + "step": 3020 + }, + { + "epoch": 0.39, + "grad_norm": 0.6367380023002625, + "learning_rate": 0.0003509481775811705, + "loss": 3.4114, + "step": 3021 + }, + { + "epoch": 0.39, + "grad_norm": 0.6261208057403564, + "learning_rate": 0.0003508533395671386, + "loss": 3.3958, + "step": 3022 + }, + { + "epoch": 0.39, + "grad_norm": 0.6035501956939697, + "learning_rate": 0.00035075848421524847, + "loss": 3.3486, + "step": 3023 + }, + { + "epoch": 0.39, + "grad_norm": 0.6485708951950073, + "learning_rate": 0.0003506636115418069, + "loss": 3.4355, + "step": 3024 + }, + { + "epoch": 0.39, + "grad_norm": 0.6530906558036804, + "learning_rate": 0.0003505687215631235, + "loss": 3.4487, + "step": 3025 + }, + { + "epoch": 0.39, + "grad_norm": 0.6586419939994812, + "learning_rate": 0.0003504738142955109, + "loss": 3.4389, + "step": 3026 + }, + { + "epoch": 0.39, + "grad_norm": 0.6540868282318115, + "learning_rate": 0.0003503788897552848, + "loss": 3.3689, + "step": 3027 + }, + { + "epoch": 0.39, + "grad_norm": 0.6517311930656433, + "learning_rate": 0.00035028394795876396, + "loss": 3.5082, + "step": 3028 + }, + { + "epoch": 0.39, + "grad_norm": 0.5996099710464478, + "learning_rate": 0.00035018898892226984, + "loss": 3.3514, + "step": 3029 + }, + { + "epoch": 0.39, + "grad_norm": 0.6310666799545288, + "learning_rate": 0.00035009401266212706, + "loss": 3.3497, + "step": 3030 + }, + { + "epoch": 0.39, + "grad_norm": 0.623947262763977, + "learning_rate": 0.00034999901919466304, + "loss": 3.2739, + "step": 3031 + }, + { + "epoch": 0.39, + "grad_norm": 0.6056064963340759, + "learning_rate": 0.00034990400853620836, + "loss": 3.4216, + "step": 3032 + }, + { + "epoch": 0.39, + "grad_norm": 0.6672126054763794, + "learning_rate": 0.00034980898070309643, + "loss": 3.4154, + "step": 3033 + }, + { + "epoch": 0.39, + "grad_norm": 0.6156476736068726, + "learning_rate": 0.0003497139357116637, + "loss": 3.3667, + "step": 3034 + }, + { + "epoch": 0.39, + "grad_norm": 0.6445411443710327, + "learning_rate": 0.00034961887357824934, + "loss": 3.345, + "step": 3035 + }, + { + "epoch": 0.39, + "grad_norm": 0.6492227911949158, + "learning_rate": 0.0003495237943191957, + "loss": 3.333, + "step": 3036 + }, + { + "epoch": 0.39, + "grad_norm": 0.6277163624763489, + "learning_rate": 0.00034942869795084815, + "loss": 3.4001, + "step": 3037 + }, + { + "epoch": 0.39, + "grad_norm": 0.7017390131950378, + "learning_rate": 0.0003493335844895547, + "loss": 3.3455, + "step": 3038 + }, + { + "epoch": 0.39, + "grad_norm": 0.6569957733154297, + "learning_rate": 0.00034923845395166655, + "loss": 3.4971, + "step": 3039 + }, + { + "epoch": 0.39, + "grad_norm": 0.6246014833450317, + "learning_rate": 0.00034914330635353765, + "loss": 3.3908, + "step": 3040 + }, + { + "epoch": 0.39, + "grad_norm": 0.6579762697219849, + "learning_rate": 0.00034904814171152497, + "loss": 3.4869, + "step": 3041 + }, + { + "epoch": 0.39, + "grad_norm": 0.6611732840538025, + "learning_rate": 0.00034895296004198854, + "loss": 3.3098, + "step": 3042 + }, + { + "epoch": 0.39, + "grad_norm": 0.6445024609565735, + "learning_rate": 0.0003488577613612911, + "loss": 3.4028, + "step": 3043 + }, + { + "epoch": 0.39, + "grad_norm": 0.7002721428871155, + "learning_rate": 0.0003487625456857984, + "loss": 3.3255, + "step": 3044 + }, + { + "epoch": 0.39, + "grad_norm": 0.6145609617233276, + "learning_rate": 0.00034866731303187916, + "loss": 3.4165, + "step": 3045 + }, + { + "epoch": 0.39, + "grad_norm": 0.6688539385795593, + "learning_rate": 0.00034857206341590497, + "loss": 3.3142, + "step": 3046 + }, + { + "epoch": 0.39, + "grad_norm": 0.7072897553443909, + "learning_rate": 0.00034847679685425024, + "loss": 3.4557, + "step": 3047 + }, + { + "epoch": 0.39, + "grad_norm": 0.664849579334259, + "learning_rate": 0.00034838151336329267, + "loss": 3.5348, + "step": 3048 + }, + { + "epoch": 0.39, + "grad_norm": 0.6490936875343323, + "learning_rate": 0.0003482862129594123, + "loss": 3.3588, + "step": 3049 + }, + { + "epoch": 0.39, + "grad_norm": 0.5882840156555176, + "learning_rate": 0.00034819089565899246, + "loss": 3.2787, + "step": 3050 + }, + { + "epoch": 0.39, + "grad_norm": 0.622391402721405, + "learning_rate": 0.00034809556147841935, + "loss": 3.4638, + "step": 3051 + }, + { + "epoch": 0.39, + "grad_norm": 0.6462031602859497, + "learning_rate": 0.0003480002104340821, + "loss": 3.4412, + "step": 3052 + }, + { + "epoch": 0.39, + "grad_norm": 0.7033052444458008, + "learning_rate": 0.0003479048425423725, + "loss": 3.4586, + "step": 3053 + }, + { + "epoch": 0.39, + "grad_norm": 0.6103463768959045, + "learning_rate": 0.0003478094578196854, + "loss": 3.2599, + "step": 3054 + }, + { + "epoch": 0.39, + "grad_norm": 0.6081366539001465, + "learning_rate": 0.00034771405628241867, + "loss": 3.4528, + "step": 3055 + }, + { + "epoch": 0.39, + "grad_norm": 0.612507164478302, + "learning_rate": 0.00034761863794697287, + "loss": 3.4103, + "step": 3056 + }, + { + "epoch": 0.39, + "grad_norm": 0.6493777632713318, + "learning_rate": 0.00034752320282975145, + "loss": 3.3473, + "step": 3057 + }, + { + "epoch": 0.39, + "grad_norm": 0.7184935212135315, + "learning_rate": 0.00034742775094716093, + "loss": 3.4663, + "step": 3058 + }, + { + "epoch": 0.39, + "grad_norm": 0.6439271569252014, + "learning_rate": 0.00034733228231561056, + "loss": 3.2053, + "step": 3059 + }, + { + "epoch": 0.39, + "grad_norm": 0.6064172983169556, + "learning_rate": 0.00034723679695151244, + "loss": 3.4793, + "step": 3060 + }, + { + "epoch": 0.39, + "grad_norm": 0.6584929823875427, + "learning_rate": 0.00034714129487128166, + "loss": 3.402, + "step": 3061 + }, + { + "epoch": 0.39, + "grad_norm": 0.6590911149978638, + "learning_rate": 0.00034704577609133614, + "loss": 3.4054, + "step": 3062 + }, + { + "epoch": 0.39, + "grad_norm": 0.6318978667259216, + "learning_rate": 0.00034695024062809666, + "loss": 3.3923, + "step": 3063 + }, + { + "epoch": 0.39, + "grad_norm": 0.6369098424911499, + "learning_rate": 0.0003468546884979868, + "loss": 3.3737, + "step": 3064 + }, + { + "epoch": 0.39, + "grad_norm": 0.9716199040412903, + "learning_rate": 0.0003467591197174332, + "loss": 3.2404, + "step": 3065 + }, + { + "epoch": 0.39, + "grad_norm": 0.676282525062561, + "learning_rate": 0.00034666353430286534, + "loss": 3.3358, + "step": 3066 + }, + { + "epoch": 0.39, + "grad_norm": 0.655470609664917, + "learning_rate": 0.0003465679322707152, + "loss": 3.5153, + "step": 3067 + }, + { + "epoch": 0.39, + "grad_norm": 0.6594122648239136, + "learning_rate": 0.00034647231363741797, + "loss": 3.282, + "step": 3068 + }, + { + "epoch": 0.39, + "grad_norm": 0.6830422878265381, + "learning_rate": 0.00034637667841941154, + "loss": 3.4893, + "step": 3069 + }, + { + "epoch": 0.39, + "grad_norm": 2.9497554302215576, + "learning_rate": 0.000346281026633137, + "loss": 3.3481, + "step": 3070 + }, + { + "epoch": 0.39, + "grad_norm": 0.6353447437286377, + "learning_rate": 0.0003461853582950377, + "loss": 3.3376, + "step": 3071 + }, + { + "epoch": 0.39, + "grad_norm": 0.6244276165962219, + "learning_rate": 0.00034608967342156016, + "loss": 3.3032, + "step": 3072 + }, + { + "epoch": 0.39, + "grad_norm": 0.6454404592514038, + "learning_rate": 0.00034599397202915384, + "loss": 3.3049, + "step": 3073 + }, + { + "epoch": 0.39, + "grad_norm": 0.6199542880058289, + "learning_rate": 0.0003458982541342709, + "loss": 3.3146, + "step": 3074 + }, + { + "epoch": 0.39, + "grad_norm": 0.7060653567314148, + "learning_rate": 0.00034580251975336636, + "loss": 3.4034, + "step": 3075 + }, + { + "epoch": 0.39, + "grad_norm": 0.6460721492767334, + "learning_rate": 0.00034570676890289794, + "loss": 3.3228, + "step": 3076 + }, + { + "epoch": 0.39, + "grad_norm": 0.6966212391853333, + "learning_rate": 0.00034561100159932653, + "loss": 3.5031, + "step": 3077 + }, + { + "epoch": 0.39, + "grad_norm": 0.6779978275299072, + "learning_rate": 0.00034551521785911537, + "loss": 3.466, + "step": 3078 + }, + { + "epoch": 0.39, + "grad_norm": 0.6837643980979919, + "learning_rate": 0.0003454194176987311, + "loss": 3.3513, + "step": 3079 + }, + { + "epoch": 0.39, + "grad_norm": 0.6901535391807556, + "learning_rate": 0.0003453236011346427, + "loss": 3.3933, + "step": 3080 + }, + { + "epoch": 0.39, + "grad_norm": 0.6862813830375671, + "learning_rate": 0.0003452277681833221, + "loss": 3.4137, + "step": 3081 + }, + { + "epoch": 0.39, + "grad_norm": 0.686568558216095, + "learning_rate": 0.0003451319188612443, + "loss": 3.5128, + "step": 3082 + }, + { + "epoch": 0.39, + "grad_norm": 0.6437336802482605, + "learning_rate": 0.0003450360531848866, + "loss": 3.4203, + "step": 3083 + }, + { + "epoch": 0.39, + "grad_norm": 0.6524853706359863, + "learning_rate": 0.00034494017117072973, + "loss": 3.3342, + "step": 3084 + }, + { + "epoch": 0.39, + "grad_norm": 0.6673775911331177, + "learning_rate": 0.00034484427283525685, + "loss": 3.4508, + "step": 3085 + }, + { + "epoch": 0.4, + "grad_norm": 0.6682735681533813, + "learning_rate": 0.0003447483581949538, + "loss": 3.4717, + "step": 3086 + }, + { + "epoch": 0.4, + "grad_norm": 0.6896383762359619, + "learning_rate": 0.0003446524272663096, + "loss": 3.41, + "step": 3087 + }, + { + "epoch": 0.4, + "grad_norm": 0.6797161102294922, + "learning_rate": 0.0003445564800658159, + "loss": 3.4678, + "step": 3088 + }, + { + "epoch": 0.4, + "grad_norm": 0.7360920906066895, + "learning_rate": 0.00034446051660996714, + "loss": 3.5791, + "step": 3089 + }, + { + "epoch": 0.4, + "grad_norm": 0.6560612320899963, + "learning_rate": 0.00034436453691526035, + "loss": 3.4878, + "step": 3090 + }, + { + "epoch": 0.4, + "grad_norm": 0.7072687745094299, + "learning_rate": 0.00034426854099819584, + "loss": 3.4051, + "step": 3091 + }, + { + "epoch": 0.4, + "grad_norm": 1.1792899370193481, + "learning_rate": 0.00034417252887527616, + "loss": 3.3725, + "step": 3092 + }, + { + "epoch": 0.4, + "grad_norm": 0.6721067428588867, + "learning_rate": 0.0003440765005630071, + "loss": 3.4718, + "step": 3093 + }, + { + "epoch": 0.4, + "grad_norm": 0.6501315832138062, + "learning_rate": 0.0003439804560778968, + "loss": 3.3161, + "step": 3094 + }, + { + "epoch": 0.4, + "grad_norm": 0.6631578207015991, + "learning_rate": 0.00034388439543645683, + "loss": 3.5299, + "step": 3095 + }, + { + "epoch": 0.4, + "grad_norm": 0.6838598251342773, + "learning_rate": 0.0003437883186552008, + "loss": 3.3965, + "step": 3096 + }, + { + "epoch": 0.4, + "grad_norm": 1.6257017850875854, + "learning_rate": 0.0003436922257506454, + "loss": 3.2741, + "step": 3097 + }, + { + "epoch": 0.4, + "grad_norm": 0.6229751706123352, + "learning_rate": 0.00034359611673931027, + "loss": 3.3837, + "step": 3098 + }, + { + "epoch": 0.4, + "grad_norm": 0.6309128999710083, + "learning_rate": 0.0003434999916377177, + "loss": 3.4685, + "step": 3099 + }, + { + "epoch": 0.4, + "grad_norm": 0.6558688282966614, + "learning_rate": 0.0003434038504623926, + "loss": 3.4456, + "step": 3100 + }, + { + "epoch": 0.4, + "grad_norm": 0.6286501288414001, + "learning_rate": 0.00034330769322986267, + "loss": 3.4159, + "step": 3101 + }, + { + "epoch": 0.4, + "grad_norm": 0.6474177837371826, + "learning_rate": 0.0003432115199566586, + "loss": 3.4199, + "step": 3102 + }, + { + "epoch": 0.4, + "grad_norm": 0.5949915647506714, + "learning_rate": 0.00034311533065931374, + "loss": 3.5586, + "step": 3103 + }, + { + "epoch": 0.4, + "grad_norm": 0.584155797958374, + "learning_rate": 0.00034301912535436395, + "loss": 3.4332, + "step": 3104 + }, + { + "epoch": 0.4, + "grad_norm": 0.6542870402336121, + "learning_rate": 0.0003429229040583482, + "loss": 3.4701, + "step": 3105 + }, + { + "epoch": 0.4, + "grad_norm": 0.634269654750824, + "learning_rate": 0.0003428266667878079, + "loss": 3.3685, + "step": 3106 + }, + { + "epoch": 0.4, + "grad_norm": 0.7071416974067688, + "learning_rate": 0.0003427304135592876, + "loss": 3.4869, + "step": 3107 + }, + { + "epoch": 0.4, + "grad_norm": 0.6527315378189087, + "learning_rate": 0.00034263414438933407, + "loss": 3.5248, + "step": 3108 + }, + { + "epoch": 0.4, + "grad_norm": 0.634596586227417, + "learning_rate": 0.00034253785929449723, + "loss": 3.5002, + "step": 3109 + }, + { + "epoch": 0.4, + "grad_norm": 0.6589738130569458, + "learning_rate": 0.00034244155829132953, + "loss": 3.4708, + "step": 3110 + }, + { + "epoch": 0.4, + "grad_norm": 0.6533238887786865, + "learning_rate": 0.0003423452413963864, + "loss": 3.3354, + "step": 3111 + }, + { + "epoch": 0.4, + "grad_norm": 0.6337494254112244, + "learning_rate": 0.00034224890862622566, + "loss": 3.4836, + "step": 3112 + }, + { + "epoch": 0.4, + "grad_norm": 0.6837114691734314, + "learning_rate": 0.00034215255999740806, + "loss": 3.4551, + "step": 3113 + }, + { + "epoch": 0.4, + "grad_norm": 0.6452608108520508, + "learning_rate": 0.00034205619552649715, + "loss": 3.5575, + "step": 3114 + }, + { + "epoch": 0.4, + "grad_norm": 0.6340389847755432, + "learning_rate": 0.00034195981523005894, + "loss": 3.3972, + "step": 3115 + }, + { + "epoch": 0.4, + "grad_norm": 0.6513810157775879, + "learning_rate": 0.0003418634191246624, + "loss": 3.3774, + "step": 3116 + }, + { + "epoch": 0.4, + "grad_norm": 0.632177472114563, + "learning_rate": 0.0003417670072268792, + "loss": 3.4479, + "step": 3117 + }, + { + "epoch": 0.4, + "grad_norm": 0.6260270476341248, + "learning_rate": 0.00034167057955328365, + "loss": 3.499, + "step": 3118 + }, + { + "epoch": 0.4, + "grad_norm": 0.66219162940979, + "learning_rate": 0.0003415741361204526, + "loss": 3.4481, + "step": 3119 + }, + { + "epoch": 0.4, + "grad_norm": 0.6389588117599487, + "learning_rate": 0.00034147767694496604, + "loss": 3.3789, + "step": 3120 + }, + { + "epoch": 0.4, + "grad_norm": 0.6469547152519226, + "learning_rate": 0.00034138120204340623, + "loss": 3.4273, + "step": 3121 + }, + { + "epoch": 0.4, + "grad_norm": 0.6160049438476562, + "learning_rate": 0.00034128471143235847, + "loss": 3.5509, + "step": 3122 + }, + { + "epoch": 0.4, + "grad_norm": 0.6240808367729187, + "learning_rate": 0.00034118820512841054, + "loss": 3.4135, + "step": 3123 + }, + { + "epoch": 0.4, + "grad_norm": 0.6506920456886292, + "learning_rate": 0.000341091683148153, + "loss": 3.4338, + "step": 3124 + }, + { + "epoch": 0.4, + "grad_norm": 0.6893854737281799, + "learning_rate": 0.00034099514550817904, + "loss": 3.3949, + "step": 3125 + }, + { + "epoch": 0.4, + "grad_norm": 0.6406022310256958, + "learning_rate": 0.00034089859222508474, + "loss": 3.5067, + "step": 3126 + }, + { + "epoch": 0.4, + "grad_norm": 0.5929657220840454, + "learning_rate": 0.0003408020233154686, + "loss": 3.3074, + "step": 3127 + }, + { + "epoch": 0.4, + "grad_norm": 0.6202057600021362, + "learning_rate": 0.00034070543879593206, + "loss": 3.3579, + "step": 3128 + }, + { + "epoch": 0.4, + "grad_norm": 0.6425173878669739, + "learning_rate": 0.00034060883868307893, + "loss": 3.3264, + "step": 3129 + }, + { + "epoch": 0.4, + "grad_norm": 0.6303429007530212, + "learning_rate": 0.0003405122229935161, + "loss": 3.4209, + "step": 3130 + }, + { + "epoch": 0.4, + "grad_norm": 0.7062411308288574, + "learning_rate": 0.0003404155917438528, + "loss": 3.4781, + "step": 3131 + }, + { + "epoch": 0.4, + "grad_norm": 0.5809078812599182, + "learning_rate": 0.0003403189449507012, + "loss": 3.3356, + "step": 3132 + }, + { + "epoch": 0.4, + "grad_norm": 0.6062414050102234, + "learning_rate": 0.0003402222826306757, + "loss": 3.3905, + "step": 3133 + }, + { + "epoch": 0.4, + "grad_norm": 0.651741623878479, + "learning_rate": 0.0003401256048003939, + "loss": 3.3978, + "step": 3134 + }, + { + "epoch": 0.4, + "grad_norm": 0.6697308421134949, + "learning_rate": 0.0003400289114764759, + "loss": 3.4333, + "step": 3135 + }, + { + "epoch": 0.4, + "grad_norm": 0.668836236000061, + "learning_rate": 0.0003399322026755443, + "loss": 3.3994, + "step": 3136 + }, + { + "epoch": 0.4, + "grad_norm": 0.6523700952529907, + "learning_rate": 0.0003398354784142244, + "loss": 3.4765, + "step": 3137 + }, + { + "epoch": 0.4, + "grad_norm": 0.6296630501747131, + "learning_rate": 0.00033973873870914433, + "loss": 3.4468, + "step": 3138 + }, + { + "epoch": 0.4, + "grad_norm": 0.6403985023498535, + "learning_rate": 0.00033964198357693475, + "loss": 3.4852, + "step": 3139 + }, + { + "epoch": 0.4, + "grad_norm": 0.6643931269645691, + "learning_rate": 0.00033954521303422905, + "loss": 3.3481, + "step": 3140 + }, + { + "epoch": 0.4, + "grad_norm": 0.6210636496543884, + "learning_rate": 0.00033944842709766307, + "loss": 3.4064, + "step": 3141 + }, + { + "epoch": 0.4, + "grad_norm": 0.6406473517417908, + "learning_rate": 0.00033935162578387545, + "loss": 3.356, + "step": 3142 + }, + { + "epoch": 0.4, + "grad_norm": 0.6525415778160095, + "learning_rate": 0.00033925480910950757, + "loss": 3.4282, + "step": 3143 + }, + { + "epoch": 0.4, + "grad_norm": 0.637851893901825, + "learning_rate": 0.0003391579770912032, + "loss": 3.4665, + "step": 3144 + }, + { + "epoch": 0.4, + "grad_norm": 0.6637407541275024, + "learning_rate": 0.00033906112974560904, + "loss": 3.3821, + "step": 3145 + }, + { + "epoch": 0.4, + "grad_norm": 0.6572132110595703, + "learning_rate": 0.0003389642670893742, + "loss": 3.385, + "step": 3146 + }, + { + "epoch": 0.4, + "grad_norm": 0.6763609647750854, + "learning_rate": 0.0003388673891391504, + "loss": 3.2326, + "step": 3147 + }, + { + "epoch": 0.4, + "grad_norm": 0.630200982093811, + "learning_rate": 0.00033877049591159213, + "loss": 3.3535, + "step": 3148 + }, + { + "epoch": 0.4, + "grad_norm": 0.671553373336792, + "learning_rate": 0.00033867358742335665, + "loss": 3.4566, + "step": 3149 + }, + { + "epoch": 0.4, + "grad_norm": 0.6508437395095825, + "learning_rate": 0.0003385766636911035, + "loss": 3.3464, + "step": 3150 + }, + { + "epoch": 0.4, + "grad_norm": 0.6745772957801819, + "learning_rate": 0.00033847972473149485, + "loss": 3.4444, + "step": 3151 + }, + { + "epoch": 0.4, + "grad_norm": 0.5989139676094055, + "learning_rate": 0.0003383827705611958, + "loss": 3.2801, + "step": 3152 + }, + { + "epoch": 0.4, + "grad_norm": 0.6041255593299866, + "learning_rate": 0.00033828580119687395, + "loss": 3.412, + "step": 3153 + }, + { + "epoch": 0.4, + "grad_norm": 0.6439777612686157, + "learning_rate": 0.0003381888166551994, + "loss": 3.4712, + "step": 3154 + }, + { + "epoch": 0.4, + "grad_norm": 0.6362516283988953, + "learning_rate": 0.0003380918169528448, + "loss": 3.3767, + "step": 3155 + }, + { + "epoch": 0.4, + "grad_norm": 0.6359475255012512, + "learning_rate": 0.00033799480210648566, + "loss": 3.3803, + "step": 3156 + }, + { + "epoch": 0.4, + "grad_norm": 0.6420576572418213, + "learning_rate": 0.0003378977721327999, + "loss": 3.3852, + "step": 3157 + }, + { + "epoch": 0.4, + "grad_norm": 0.6671146750450134, + "learning_rate": 0.0003378007270484681, + "loss": 3.5088, + "step": 3158 + }, + { + "epoch": 0.4, + "grad_norm": 0.6650534868240356, + "learning_rate": 0.00033770366687017346, + "loss": 3.4395, + "step": 3159 + }, + { + "epoch": 0.4, + "grad_norm": 0.6656182408332825, + "learning_rate": 0.00033760659161460177, + "loss": 3.4362, + "step": 3160 + }, + { + "epoch": 0.4, + "grad_norm": 0.6491706967353821, + "learning_rate": 0.00033750950129844134, + "loss": 3.3303, + "step": 3161 + }, + { + "epoch": 0.4, + "grad_norm": 0.6670464277267456, + "learning_rate": 0.0003374123959383831, + "loss": 3.3414, + "step": 3162 + }, + { + "epoch": 0.4, + "grad_norm": 0.6329524517059326, + "learning_rate": 0.0003373152755511207, + "loss": 3.4744, + "step": 3163 + }, + { + "epoch": 0.4, + "grad_norm": 0.6125639081001282, + "learning_rate": 0.0003372181401533501, + "loss": 3.3642, + "step": 3164 + }, + { + "epoch": 0.41, + "grad_norm": 0.6461024880409241, + "learning_rate": 0.00033712098976177015, + "loss": 3.4479, + "step": 3165 + }, + { + "epoch": 0.41, + "grad_norm": 0.6762595772743225, + "learning_rate": 0.000337023824393082, + "loss": 3.3881, + "step": 3166 + }, + { + "epoch": 0.41, + "grad_norm": 0.632960319519043, + "learning_rate": 0.0003369266440639897, + "loss": 3.476, + "step": 3167 + }, + { + "epoch": 0.41, + "grad_norm": 0.6530134677886963, + "learning_rate": 0.0003368294487911995, + "loss": 3.3649, + "step": 3168 + }, + { + "epoch": 0.41, + "grad_norm": 0.5805623531341553, + "learning_rate": 0.00033673223859142034, + "loss": 3.3062, + "step": 3169 + }, + { + "epoch": 0.41, + "grad_norm": 0.6365443468093872, + "learning_rate": 0.0003366350134813639, + "loss": 3.4325, + "step": 3170 + }, + { + "epoch": 0.41, + "grad_norm": 0.6298810839653015, + "learning_rate": 0.0003365377734777444, + "loss": 3.3691, + "step": 3171 + }, + { + "epoch": 0.41, + "grad_norm": 0.67560875415802, + "learning_rate": 0.0003364405185972783, + "loss": 3.444, + "step": 3172 + }, + { + "epoch": 0.41, + "grad_norm": 0.6174789071083069, + "learning_rate": 0.000336343248856685, + "loss": 3.3915, + "step": 3173 + }, + { + "epoch": 0.41, + "grad_norm": 0.6995825171470642, + "learning_rate": 0.00033624596427268627, + "loss": 3.4796, + "step": 3174 + }, + { + "epoch": 0.41, + "grad_norm": 0.676274836063385, + "learning_rate": 0.00033614866486200636, + "loss": 3.4281, + "step": 3175 + }, + { + "epoch": 0.41, + "grad_norm": 0.6361021399497986, + "learning_rate": 0.0003360513506413722, + "loss": 3.3182, + "step": 3176 + }, + { + "epoch": 0.41, + "grad_norm": 0.6266982555389404, + "learning_rate": 0.00033595402162751344, + "loss": 3.3538, + "step": 3177 + }, + { + "epoch": 0.41, + "grad_norm": 0.6403414011001587, + "learning_rate": 0.00033585667783716177, + "loss": 3.5016, + "step": 3178 + }, + { + "epoch": 0.41, + "grad_norm": 0.6274116039276123, + "learning_rate": 0.00033575931928705186, + "loss": 3.4155, + "step": 3179 + }, + { + "epoch": 0.41, + "grad_norm": 0.6344843506813049, + "learning_rate": 0.00033566194599392074, + "loss": 3.3734, + "step": 3180 + }, + { + "epoch": 0.41, + "grad_norm": 0.6367878913879395, + "learning_rate": 0.00033556455797450807, + "loss": 3.4927, + "step": 3181 + }, + { + "epoch": 0.41, + "grad_norm": 0.6092458963394165, + "learning_rate": 0.00033546715524555593, + "loss": 3.3209, + "step": 3182 + }, + { + "epoch": 0.41, + "grad_norm": 0.64115971326828, + "learning_rate": 0.00033536973782380896, + "loss": 3.3135, + "step": 3183 + }, + { + "epoch": 0.41, + "grad_norm": 0.6055205464363098, + "learning_rate": 0.0003352723057260144, + "loss": 3.4147, + "step": 3184 + }, + { + "epoch": 0.41, + "grad_norm": 0.65650874376297, + "learning_rate": 0.0003351748589689219, + "loss": 3.3556, + "step": 3185 + }, + { + "epoch": 0.41, + "grad_norm": 0.6385210752487183, + "learning_rate": 0.00033507739756928373, + "loss": 3.3497, + "step": 3186 + }, + { + "epoch": 0.41, + "grad_norm": 0.6704872846603394, + "learning_rate": 0.0003349799215438546, + "loss": 3.4498, + "step": 3187 + }, + { + "epoch": 0.41, + "grad_norm": 0.6331678628921509, + "learning_rate": 0.0003348824309093918, + "loss": 3.3611, + "step": 3188 + }, + { + "epoch": 0.41, + "grad_norm": 0.6341878175735474, + "learning_rate": 0.00033478492568265507, + "loss": 3.4496, + "step": 3189 + }, + { + "epoch": 0.41, + "grad_norm": 0.6627287268638611, + "learning_rate": 0.00033468740588040667, + "loss": 3.4181, + "step": 3190 + }, + { + "epoch": 0.41, + "grad_norm": 0.6297523975372314, + "learning_rate": 0.0003345898715194115, + "loss": 3.3115, + "step": 3191 + }, + { + "epoch": 0.41, + "grad_norm": 0.6300864815711975, + "learning_rate": 0.00033449232261643675, + "loss": 3.2662, + "step": 3192 + }, + { + "epoch": 0.41, + "grad_norm": 0.6536930203437805, + "learning_rate": 0.00033439475918825215, + "loss": 3.2435, + "step": 3193 + }, + { + "epoch": 0.41, + "grad_norm": 0.6698246002197266, + "learning_rate": 0.00033429718125163013, + "loss": 3.4452, + "step": 3194 + }, + { + "epoch": 0.41, + "grad_norm": 0.6215016841888428, + "learning_rate": 0.0003341995888233453, + "loss": 3.4275, + "step": 3195 + }, + { + "epoch": 0.41, + "grad_norm": 0.623037576675415, + "learning_rate": 0.00033410198192017517, + "loss": 3.3506, + "step": 3196 + }, + { + "epoch": 0.41, + "grad_norm": 0.6066840887069702, + "learning_rate": 0.00033400436055889925, + "loss": 3.329, + "step": 3197 + }, + { + "epoch": 0.41, + "grad_norm": 0.6748824715614319, + "learning_rate": 0.0003339067247562999, + "loss": 3.4759, + "step": 3198 + }, + { + "epoch": 0.41, + "grad_norm": 0.6000560522079468, + "learning_rate": 0.0003338090745291619, + "loss": 3.256, + "step": 3199 + }, + { + "epoch": 0.41, + "grad_norm": 0.6228510141372681, + "learning_rate": 0.0003337114098942723, + "loss": 3.4539, + "step": 3200 + }, + { + "epoch": 0.41, + "grad_norm": 0.6298632025718689, + "learning_rate": 0.0003336137308684209, + "loss": 3.2606, + "step": 3201 + }, + { + "epoch": 0.41, + "grad_norm": 0.6771034598350525, + "learning_rate": 0.00033351603746839987, + "loss": 3.4276, + "step": 3202 + }, + { + "epoch": 0.41, + "grad_norm": 0.6480305790901184, + "learning_rate": 0.00033341832971100376, + "loss": 3.3418, + "step": 3203 + }, + { + "epoch": 0.41, + "grad_norm": 0.6708489656448364, + "learning_rate": 0.00033332060761302973, + "loss": 3.4634, + "step": 3204 + }, + { + "epoch": 0.41, + "grad_norm": 0.6806980967521667, + "learning_rate": 0.0003332228711912773, + "loss": 3.4818, + "step": 3205 + }, + { + "epoch": 0.41, + "grad_norm": 0.6376710534095764, + "learning_rate": 0.0003331251204625485, + "loss": 3.4056, + "step": 3206 + }, + { + "epoch": 0.41, + "grad_norm": 0.6477569937705994, + "learning_rate": 0.00033302735544364794, + "loss": 3.2535, + "step": 3207 + }, + { + "epoch": 0.41, + "grad_norm": 0.6241592764854431, + "learning_rate": 0.0003329295761513822, + "loss": 3.4414, + "step": 3208 + }, + { + "epoch": 0.41, + "grad_norm": 0.6071279048919678, + "learning_rate": 0.0003328317826025611, + "loss": 3.2777, + "step": 3209 + }, + { + "epoch": 0.41, + "grad_norm": 0.585581362247467, + "learning_rate": 0.0003327339748139963, + "loss": 3.3901, + "step": 3210 + }, + { + "epoch": 0.41, + "grad_norm": 0.6255743503570557, + "learning_rate": 0.000332636152802502, + "loss": 3.351, + "step": 3211 + }, + { + "epoch": 0.41, + "grad_norm": 0.6320606470108032, + "learning_rate": 0.0003325383165848951, + "loss": 3.4299, + "step": 3212 + }, + { + "epoch": 0.41, + "grad_norm": 0.6342218518257141, + "learning_rate": 0.00033244046617799473, + "loss": 3.3082, + "step": 3213 + }, + { + "epoch": 0.41, + "grad_norm": 0.6162002682685852, + "learning_rate": 0.00033234260159862244, + "loss": 3.3157, + "step": 3214 + }, + { + "epoch": 0.41, + "grad_norm": 0.5912756323814392, + "learning_rate": 0.00033224472286360235, + "loss": 3.2381, + "step": 3215 + }, + { + "epoch": 0.41, + "grad_norm": 0.6357508897781372, + "learning_rate": 0.00033214682998976096, + "loss": 3.4788, + "step": 3216 + }, + { + "epoch": 0.41, + "grad_norm": 0.6640059351921082, + "learning_rate": 0.00033204892299392715, + "loss": 3.3802, + "step": 3217 + }, + { + "epoch": 0.41, + "grad_norm": 0.6069798469543457, + "learning_rate": 0.0003319510018929323, + "loss": 3.34, + "step": 3218 + }, + { + "epoch": 0.41, + "grad_norm": 0.6166238188743591, + "learning_rate": 0.00033185306670361015, + "loss": 3.4072, + "step": 3219 + }, + { + "epoch": 0.41, + "grad_norm": 0.5803248882293701, + "learning_rate": 0.00033175511744279695, + "loss": 3.3099, + "step": 3220 + }, + { + "epoch": 0.41, + "grad_norm": 0.6132253408432007, + "learning_rate": 0.0003316571541273312, + "loss": 3.3057, + "step": 3221 + }, + { + "epoch": 0.41, + "grad_norm": 0.5996150374412537, + "learning_rate": 0.00033155917677405404, + "loss": 3.4043, + "step": 3222 + }, + { + "epoch": 0.41, + "grad_norm": 0.684146523475647, + "learning_rate": 0.00033146118539980886, + "loss": 3.3739, + "step": 3223 + }, + { + "epoch": 0.41, + "grad_norm": 0.5961861610412598, + "learning_rate": 0.0003313631800214416, + "loss": 3.371, + "step": 3224 + }, + { + "epoch": 0.41, + "grad_norm": 0.6282367706298828, + "learning_rate": 0.0003312651606558004, + "loss": 3.3869, + "step": 3225 + }, + { + "epoch": 0.41, + "grad_norm": 0.5975234508514404, + "learning_rate": 0.00033116712731973597, + "loss": 3.3008, + "step": 3226 + }, + { + "epoch": 0.41, + "grad_norm": 0.6370994448661804, + "learning_rate": 0.0003310690800301014, + "loss": 3.4603, + "step": 3227 + }, + { + "epoch": 0.41, + "grad_norm": 0.6064805388450623, + "learning_rate": 0.0003309710188037521, + "loss": 3.2699, + "step": 3228 + }, + { + "epoch": 0.41, + "grad_norm": 0.620232105255127, + "learning_rate": 0.00033087294365754605, + "loss": 3.5315, + "step": 3229 + }, + { + "epoch": 0.41, + "grad_norm": 0.635590136051178, + "learning_rate": 0.0003307748546083434, + "loss": 3.5964, + "step": 3230 + }, + { + "epoch": 0.41, + "grad_norm": 0.6045178174972534, + "learning_rate": 0.00033067675167300673, + "loss": 3.2993, + "step": 3231 + }, + { + "epoch": 0.41, + "grad_norm": 0.6263872385025024, + "learning_rate": 0.0003305786348684013, + "loss": 3.5154, + "step": 3232 + }, + { + "epoch": 0.41, + "grad_norm": 0.6414194703102112, + "learning_rate": 0.0003304805042113943, + "loss": 3.3522, + "step": 3233 + }, + { + "epoch": 0.41, + "grad_norm": 0.5951778292655945, + "learning_rate": 0.00033038235971885566, + "loss": 3.4194, + "step": 3234 + }, + { + "epoch": 0.41, + "grad_norm": 0.700951337814331, + "learning_rate": 0.00033028420140765745, + "loss": 3.406, + "step": 3235 + }, + { + "epoch": 0.41, + "grad_norm": 0.6830416321754456, + "learning_rate": 0.0003301860292946743, + "loss": 3.4461, + "step": 3236 + }, + { + "epoch": 0.41, + "grad_norm": 0.657326340675354, + "learning_rate": 0.0003300878433967832, + "loss": 3.37, + "step": 3237 + }, + { + "epoch": 0.41, + "grad_norm": 0.665967583656311, + "learning_rate": 0.0003299896437308633, + "loss": 3.3558, + "step": 3238 + }, + { + "epoch": 0.41, + "grad_norm": 0.6418531537055969, + "learning_rate": 0.0003298914303137964, + "loss": 3.4379, + "step": 3239 + }, + { + "epoch": 0.41, + "grad_norm": 0.6427636742591858, + "learning_rate": 0.0003297932031624663, + "loss": 3.3113, + "step": 3240 + }, + { + "epoch": 0.41, + "grad_norm": 0.6846147179603577, + "learning_rate": 0.0003296949622937596, + "loss": 3.3432, + "step": 3241 + }, + { + "epoch": 0.41, + "grad_norm": 0.5913913249969482, + "learning_rate": 0.000329596707724565, + "loss": 3.3784, + "step": 3242 + }, + { + "epoch": 0.42, + "grad_norm": 0.6321174502372742, + "learning_rate": 0.0003294984394717736, + "loss": 3.4374, + "step": 3243 + }, + { + "epoch": 0.42, + "grad_norm": 0.7750719785690308, + "learning_rate": 0.0003294001575522788, + "loss": 3.3214, + "step": 3244 + }, + { + "epoch": 0.42, + "grad_norm": 0.6348200440406799, + "learning_rate": 0.0003293018619829764, + "loss": 3.3458, + "step": 3245 + }, + { + "epoch": 0.42, + "grad_norm": 0.6247170567512512, + "learning_rate": 0.00032920355278076466, + "loss": 3.3382, + "step": 3246 + }, + { + "epoch": 0.42, + "grad_norm": 0.6537448167800903, + "learning_rate": 0.0003291052299625439, + "loss": 3.5199, + "step": 3247 + }, + { + "epoch": 0.42, + "grad_norm": 0.6451489329338074, + "learning_rate": 0.00032900689354521705, + "loss": 3.3319, + "step": 3248 + }, + { + "epoch": 0.42, + "grad_norm": 0.6447813510894775, + "learning_rate": 0.00032890854354568935, + "loss": 3.3783, + "step": 3249 + }, + { + "epoch": 0.42, + "grad_norm": 0.6620329022407532, + "learning_rate": 0.00032881017998086817, + "loss": 3.439, + "step": 3250 + }, + { + "epoch": 0.42, + "grad_norm": 0.6444206237792969, + "learning_rate": 0.0003287118028676634, + "loss": 3.4197, + "step": 3251 + }, + { + "epoch": 0.42, + "grad_norm": 0.6440566778182983, + "learning_rate": 0.00032861341222298725, + "loss": 3.29, + "step": 3252 + }, + { + "epoch": 0.42, + "grad_norm": 0.64480060338974, + "learning_rate": 0.0003285150080637542, + "loss": 3.4399, + "step": 3253 + }, + { + "epoch": 0.42, + "grad_norm": 0.6301221251487732, + "learning_rate": 0.00032841659040688093, + "loss": 3.3271, + "step": 3254 + }, + { + "epoch": 0.42, + "grad_norm": 0.6687061786651611, + "learning_rate": 0.00032831815926928675, + "loss": 3.4727, + "step": 3255 + }, + { + "epoch": 0.42, + "grad_norm": 0.7571885585784912, + "learning_rate": 0.00032821971466789304, + "loss": 3.3199, + "step": 3256 + }, + { + "epoch": 0.42, + "grad_norm": 0.6162340641021729, + "learning_rate": 0.0003281212566196236, + "loss": 3.53, + "step": 3257 + }, + { + "epoch": 0.42, + "grad_norm": 0.7048136591911316, + "learning_rate": 0.00032802278514140444, + "loss": 3.3968, + "step": 3258 + }, + { + "epoch": 0.42, + "grad_norm": 0.7401905655860901, + "learning_rate": 0.0003279243002501641, + "loss": 3.5071, + "step": 3259 + }, + { + "epoch": 0.42, + "grad_norm": 0.66029953956604, + "learning_rate": 0.00032782580196283307, + "loss": 3.3956, + "step": 3260 + }, + { + "epoch": 0.42, + "grad_norm": 0.6099365949630737, + "learning_rate": 0.0003277272902963445, + "loss": 3.4846, + "step": 3261 + }, + { + "epoch": 0.42, + "grad_norm": 0.636657178401947, + "learning_rate": 0.00032762876526763355, + "loss": 3.4264, + "step": 3262 + }, + { + "epoch": 0.42, + "grad_norm": 0.6442105770111084, + "learning_rate": 0.00032753022689363797, + "loss": 3.2815, + "step": 3263 + }, + { + "epoch": 0.42, + "grad_norm": 0.6299678087234497, + "learning_rate": 0.0003274316751912976, + "loss": 3.4017, + "step": 3264 + }, + { + "epoch": 0.42, + "grad_norm": 0.6098466515541077, + "learning_rate": 0.0003273331101775546, + "loss": 3.2246, + "step": 3265 + }, + { + "epoch": 0.42, + "grad_norm": 0.5735189914703369, + "learning_rate": 0.0003272345318693534, + "loss": 3.4318, + "step": 3266 + }, + { + "epoch": 0.42, + "grad_norm": 0.6352166533470154, + "learning_rate": 0.0003271359402836408, + "loss": 3.2772, + "step": 3267 + }, + { + "epoch": 0.42, + "grad_norm": 0.6368380784988403, + "learning_rate": 0.00032703733543736584, + "loss": 3.4721, + "step": 3268 + }, + { + "epoch": 0.42, + "grad_norm": 0.6893098950386047, + "learning_rate": 0.00032693871734747984, + "loss": 3.3131, + "step": 3269 + }, + { + "epoch": 0.42, + "grad_norm": 0.660503089427948, + "learning_rate": 0.00032684008603093634, + "loss": 3.375, + "step": 3270 + }, + { + "epoch": 0.42, + "grad_norm": 0.6506989598274231, + "learning_rate": 0.00032674144150469133, + "loss": 3.3489, + "step": 3271 + }, + { + "epoch": 0.42, + "grad_norm": 0.612584114074707, + "learning_rate": 0.0003266427837857027, + "loss": 3.2255, + "step": 3272 + }, + { + "epoch": 0.42, + "grad_norm": 0.6004265546798706, + "learning_rate": 0.0003265441128909311, + "loss": 3.4389, + "step": 3273 + }, + { + "epoch": 0.42, + "grad_norm": 0.6289479732513428, + "learning_rate": 0.0003264454288373392, + "loss": 3.5457, + "step": 3274 + }, + { + "epoch": 0.42, + "grad_norm": 0.6494922041893005, + "learning_rate": 0.00032634673164189175, + "loss": 3.4552, + "step": 3275 + }, + { + "epoch": 0.42, + "grad_norm": 0.6543040871620178, + "learning_rate": 0.0003262480213215561, + "loss": 3.3199, + "step": 3276 + }, + { + "epoch": 0.42, + "grad_norm": 0.6678844690322876, + "learning_rate": 0.0003261492978933016, + "loss": 3.3874, + "step": 3277 + }, + { + "epoch": 0.42, + "grad_norm": 0.614801287651062, + "learning_rate": 0.0003260505613741, + "loss": 3.4826, + "step": 3278 + }, + { + "epoch": 0.42, + "grad_norm": 0.6474515199661255, + "learning_rate": 0.00032595181178092525, + "loss": 3.4237, + "step": 3279 + }, + { + "epoch": 0.42, + "grad_norm": 0.6905706524848938, + "learning_rate": 0.00032585304913075353, + "loss": 3.5723, + "step": 3280 + }, + { + "epoch": 0.42, + "grad_norm": 0.6413430571556091, + "learning_rate": 0.0003257542734405634, + "loss": 3.4314, + "step": 3281 + }, + { + "epoch": 0.42, + "grad_norm": 0.5918505787849426, + "learning_rate": 0.0003256554847273353, + "loss": 3.2961, + "step": 3282 + }, + { + "epoch": 0.42, + "grad_norm": 0.658825695514679, + "learning_rate": 0.00032555668300805236, + "loss": 3.4234, + "step": 3283 + }, + { + "epoch": 0.42, + "grad_norm": 0.7836254835128784, + "learning_rate": 0.0003254578682996997, + "loss": 3.2429, + "step": 3284 + }, + { + "epoch": 0.42, + "grad_norm": 0.6375472545623779, + "learning_rate": 0.00032535904061926473, + "loss": 3.4131, + "step": 3285 + }, + { + "epoch": 0.42, + "grad_norm": 0.6533145308494568, + "learning_rate": 0.000325260199983737, + "loss": 3.4104, + "step": 3286 + }, + { + "epoch": 0.42, + "grad_norm": 0.6835505962371826, + "learning_rate": 0.00032516134641010836, + "loss": 3.3225, + "step": 3287 + }, + { + "epoch": 0.42, + "grad_norm": 0.6734188199043274, + "learning_rate": 0.000325062479915373, + "loss": 3.4174, + "step": 3288 + }, + { + "epoch": 0.42, + "grad_norm": 0.6586959362030029, + "learning_rate": 0.00032496360051652713, + "loss": 3.4702, + "step": 3289 + }, + { + "epoch": 0.42, + "grad_norm": 0.5997111201286316, + "learning_rate": 0.00032486470823056925, + "loss": 3.2781, + "step": 3290 + }, + { + "epoch": 0.42, + "grad_norm": 0.6650357842445374, + "learning_rate": 0.0003247658030745001, + "loss": 3.3645, + "step": 3291 + }, + { + "epoch": 0.42, + "grad_norm": 0.6417433619499207, + "learning_rate": 0.0003246668850653227, + "loss": 3.4462, + "step": 3292 + }, + { + "epoch": 0.42, + "grad_norm": 0.6094464063644409, + "learning_rate": 0.00032456795422004217, + "loss": 3.4812, + "step": 3293 + }, + { + "epoch": 0.42, + "grad_norm": 0.6310552954673767, + "learning_rate": 0.00032446901055566574, + "loss": 3.4605, + "step": 3294 + }, + { + "epoch": 0.42, + "grad_norm": 0.6238374710083008, + "learning_rate": 0.0003243700540892031, + "loss": 3.3442, + "step": 3295 + }, + { + "epoch": 0.42, + "grad_norm": 0.6162290573120117, + "learning_rate": 0.000324271084837666, + "loss": 3.4078, + "step": 3296 + }, + { + "epoch": 0.42, + "grad_norm": 0.6043792366981506, + "learning_rate": 0.0003241721028180684, + "loss": 3.3685, + "step": 3297 + }, + { + "epoch": 0.42, + "grad_norm": 0.6471230983734131, + "learning_rate": 0.0003240731080474265, + "loss": 3.26, + "step": 3298 + }, + { + "epoch": 0.42, + "grad_norm": 0.6456151008605957, + "learning_rate": 0.0003239741005427586, + "loss": 3.4192, + "step": 3299 + }, + { + "epoch": 0.42, + "grad_norm": 0.6801007986068726, + "learning_rate": 0.0003238750803210851, + "loss": 3.4669, + "step": 3300 + }, + { + "epoch": 0.42, + "grad_norm": 0.6284809708595276, + "learning_rate": 0.000323776047399429, + "loss": 3.3718, + "step": 3301 + }, + { + "epoch": 0.42, + "grad_norm": 0.5813315510749817, + "learning_rate": 0.000323677001794815, + "loss": 3.4051, + "step": 3302 + }, + { + "epoch": 0.42, + "grad_norm": 0.6104584336280823, + "learning_rate": 0.00032357794352427035, + "loss": 3.2907, + "step": 3303 + }, + { + "epoch": 0.42, + "grad_norm": 0.6234532594680786, + "learning_rate": 0.0003234788726048241, + "loss": 3.3436, + "step": 3304 + }, + { + "epoch": 0.42, + "grad_norm": 0.633009135723114, + "learning_rate": 0.00032337978905350786, + "loss": 3.3842, + "step": 3305 + }, + { + "epoch": 0.42, + "grad_norm": 0.5902324318885803, + "learning_rate": 0.0003232806928873552, + "loss": 3.1955, + "step": 3306 + }, + { + "epoch": 0.42, + "grad_norm": 0.636461079120636, + "learning_rate": 0.00032318158412340185, + "loss": 3.3598, + "step": 3307 + }, + { + "epoch": 0.42, + "grad_norm": 0.6180666089057922, + "learning_rate": 0.0003230824627786858, + "loss": 3.5227, + "step": 3308 + }, + { + "epoch": 0.42, + "grad_norm": 0.5958223938941956, + "learning_rate": 0.00032298332887024716, + "loss": 3.396, + "step": 3309 + }, + { + "epoch": 0.42, + "grad_norm": 0.6031109094619751, + "learning_rate": 0.00032288418241512826, + "loss": 3.3235, + "step": 3310 + }, + { + "epoch": 0.42, + "grad_norm": 0.583393394947052, + "learning_rate": 0.00032278502343037344, + "loss": 3.379, + "step": 3311 + }, + { + "epoch": 0.42, + "grad_norm": 0.626627504825592, + "learning_rate": 0.0003226858519330292, + "loss": 3.4544, + "step": 3312 + }, + { + "epoch": 0.42, + "grad_norm": 0.6014848351478577, + "learning_rate": 0.0003225866679401444, + "loss": 3.3026, + "step": 3313 + }, + { + "epoch": 0.42, + "grad_norm": 0.6087637543678284, + "learning_rate": 0.0003224874714687699, + "loss": 3.3001, + "step": 3314 + }, + { + "epoch": 0.42, + "grad_norm": 0.6320148706436157, + "learning_rate": 0.0003223882625359587, + "loss": 3.3306, + "step": 3315 + }, + { + "epoch": 0.42, + "grad_norm": 0.6071418523788452, + "learning_rate": 0.00032228904115876603, + "loss": 3.418, + "step": 3316 + }, + { + "epoch": 0.42, + "grad_norm": 0.6349049806594849, + "learning_rate": 0.00032218980735424906, + "loss": 3.4945, + "step": 3317 + }, + { + "epoch": 0.42, + "grad_norm": 0.6723031997680664, + "learning_rate": 0.00032209056113946733, + "loss": 3.5165, + "step": 3318 + }, + { + "epoch": 0.42, + "grad_norm": 0.5983545780181885, + "learning_rate": 0.0003219913025314824, + "loss": 3.2624, + "step": 3319 + }, + { + "epoch": 0.42, + "grad_norm": 0.5908249020576477, + "learning_rate": 0.00032189203154735804, + "loss": 3.3144, + "step": 3320 + }, + { + "epoch": 0.43, + "grad_norm": 0.6061389446258545, + "learning_rate": 0.0003217927482041599, + "loss": 3.2718, + "step": 3321 + }, + { + "epoch": 0.43, + "grad_norm": 0.6404465436935425, + "learning_rate": 0.0003216934525189561, + "loss": 3.3912, + "step": 3322 + }, + { + "epoch": 0.43, + "grad_norm": 0.6172356605529785, + "learning_rate": 0.00032159414450881667, + "loss": 3.4738, + "step": 3323 + }, + { + "epoch": 0.43, + "grad_norm": 0.6124842762947083, + "learning_rate": 0.0003214948241908138, + "loss": 3.5237, + "step": 3324 + }, + { + "epoch": 0.43, + "grad_norm": 0.628217875957489, + "learning_rate": 0.00032139549158202187, + "loss": 3.5643, + "step": 3325 + }, + { + "epoch": 0.43, + "grad_norm": 0.6124613881111145, + "learning_rate": 0.00032129614669951717, + "loss": 3.3889, + "step": 3326 + }, + { + "epoch": 0.43, + "grad_norm": 0.6479032039642334, + "learning_rate": 0.0003211967895603784, + "loss": 3.5663, + "step": 3327 + }, + { + "epoch": 0.43, + "grad_norm": 0.6235885620117188, + "learning_rate": 0.00032109742018168605, + "loss": 3.3537, + "step": 3328 + }, + { + "epoch": 0.43, + "grad_norm": 0.6726940870285034, + "learning_rate": 0.000320998038580523, + "loss": 3.3948, + "step": 3329 + }, + { + "epoch": 0.43, + "grad_norm": 0.6240055561065674, + "learning_rate": 0.000320898644773974, + "loss": 3.2737, + "step": 3330 + }, + { + "epoch": 0.43, + "grad_norm": 0.6884067058563232, + "learning_rate": 0.00032079923877912606, + "loss": 3.3856, + "step": 3331 + }, + { + "epoch": 0.43, + "grad_norm": 0.6275650858879089, + "learning_rate": 0.00032069982061306814, + "loss": 3.4113, + "step": 3332 + }, + { + "epoch": 0.43, + "grad_norm": 0.6332144141197205, + "learning_rate": 0.0003206003902928914, + "loss": 3.2665, + "step": 3333 + }, + { + "epoch": 0.43, + "grad_norm": 0.6318677663803101, + "learning_rate": 0.00032050094783568915, + "loss": 3.2838, + "step": 3334 + }, + { + "epoch": 0.43, + "grad_norm": 0.6149763464927673, + "learning_rate": 0.0003204014932585567, + "loss": 3.2776, + "step": 3335 + }, + { + "epoch": 0.43, + "grad_norm": 0.6720367074012756, + "learning_rate": 0.0003203020265785914, + "loss": 3.4391, + "step": 3336 + }, + { + "epoch": 0.43, + "grad_norm": 0.6117024421691895, + "learning_rate": 0.0003202025478128926, + "loss": 3.2989, + "step": 3337 + }, + { + "epoch": 0.43, + "grad_norm": 0.6262743473052979, + "learning_rate": 0.00032010305697856214, + "loss": 3.4251, + "step": 3338 + }, + { + "epoch": 0.43, + "grad_norm": 0.635115921497345, + "learning_rate": 0.0003200035540927034, + "loss": 3.5293, + "step": 3339 + }, + { + "epoch": 0.43, + "grad_norm": 0.6411104798316956, + "learning_rate": 0.0003199040391724221, + "loss": 3.3625, + "step": 3340 + }, + { + "epoch": 0.43, + "grad_norm": 0.6015012860298157, + "learning_rate": 0.00031980451223482615, + "loss": 3.288, + "step": 3341 + }, + { + "epoch": 0.43, + "grad_norm": 0.5813014507293701, + "learning_rate": 0.00031970497329702525, + "loss": 3.316, + "step": 3342 + }, + { + "epoch": 0.43, + "grad_norm": 0.619752824306488, + "learning_rate": 0.00031960542237613145, + "loss": 3.4123, + "step": 3343 + }, + { + "epoch": 0.43, + "grad_norm": 0.6338851451873779, + "learning_rate": 0.0003195058594892585, + "loss": 3.3534, + "step": 3344 + }, + { + "epoch": 0.43, + "grad_norm": 0.661378026008606, + "learning_rate": 0.00031940628465352254, + "loss": 3.4172, + "step": 3345 + }, + { + "epoch": 0.43, + "grad_norm": 0.5985472202301025, + "learning_rate": 0.0003193066978860416, + "loss": 3.353, + "step": 3346 + }, + { + "epoch": 0.43, + "grad_norm": 0.5869202017784119, + "learning_rate": 0.00031920709920393593, + "loss": 3.3536, + "step": 3347 + }, + { + "epoch": 0.43, + "grad_norm": 0.5923748016357422, + "learning_rate": 0.0003191074886243275, + "loss": 3.3209, + "step": 3348 + }, + { + "epoch": 0.43, + "grad_norm": 0.5979978442192078, + "learning_rate": 0.0003190078661643406, + "loss": 3.3844, + "step": 3349 + }, + { + "epoch": 0.43, + "grad_norm": 0.6152983903884888, + "learning_rate": 0.0003189082318411016, + "loss": 3.3938, + "step": 3350 + }, + { + "epoch": 0.43, + "grad_norm": 0.6220348477363586, + "learning_rate": 0.00031880858567173855, + "loss": 3.4024, + "step": 3351 + }, + { + "epoch": 0.43, + "grad_norm": 0.650018572807312, + "learning_rate": 0.00031870892767338197, + "loss": 3.5015, + "step": 3352 + }, + { + "epoch": 0.43, + "grad_norm": 0.6519098281860352, + "learning_rate": 0.0003186092578631643, + "loss": 3.3158, + "step": 3353 + }, + { + "epoch": 0.43, + "grad_norm": 0.6117034554481506, + "learning_rate": 0.0003185095762582197, + "loss": 3.4774, + "step": 3354 + }, + { + "epoch": 0.43, + "grad_norm": 0.6011911034584045, + "learning_rate": 0.00031840988287568474, + "loss": 3.3814, + "step": 3355 + }, + { + "epoch": 0.43, + "grad_norm": 0.6428784728050232, + "learning_rate": 0.00031831017773269784, + "loss": 3.3486, + "step": 3356 + }, + { + "epoch": 0.43, + "grad_norm": 0.6304425001144409, + "learning_rate": 0.0003182104608463995, + "loss": 3.4399, + "step": 3357 + }, + { + "epoch": 0.43, + "grad_norm": 0.6032813787460327, + "learning_rate": 0.00031811073223393205, + "loss": 3.3493, + "step": 3358 + }, + { + "epoch": 0.43, + "grad_norm": 0.6222589015960693, + "learning_rate": 0.0003180109919124403, + "loss": 3.4479, + "step": 3359 + }, + { + "epoch": 0.43, + "grad_norm": 0.6171223521232605, + "learning_rate": 0.00031791123989907043, + "loss": 3.2138, + "step": 3360 + }, + { + "epoch": 0.43, + "grad_norm": 0.6898640394210815, + "learning_rate": 0.00031781147621097125, + "loss": 3.4864, + "step": 3361 + }, + { + "epoch": 0.43, + "grad_norm": 0.6144060492515564, + "learning_rate": 0.0003177117008652931, + "loss": 3.4161, + "step": 3362 + }, + { + "epoch": 0.43, + "grad_norm": 0.6295124292373657, + "learning_rate": 0.00031761191387918867, + "loss": 3.3, + "step": 3363 + }, + { + "epoch": 0.43, + "grad_norm": 0.6242650747299194, + "learning_rate": 0.00031751211526981235, + "loss": 3.3961, + "step": 3364 + }, + { + "epoch": 0.43, + "grad_norm": 0.6680017113685608, + "learning_rate": 0.00031741230505432066, + "loss": 3.4074, + "step": 3365 + }, + { + "epoch": 0.43, + "grad_norm": 0.6949132680892944, + "learning_rate": 0.0003173124832498724, + "loss": 3.3573, + "step": 3366 + }, + { + "epoch": 0.43, + "grad_norm": 0.8361984491348267, + "learning_rate": 0.0003172126498736278, + "loss": 3.396, + "step": 3367 + }, + { + "epoch": 0.43, + "grad_norm": 0.6838335990905762, + "learning_rate": 0.0003171128049427496, + "loss": 3.2927, + "step": 3368 + }, + { + "epoch": 0.43, + "grad_norm": 0.6476494073867798, + "learning_rate": 0.00031701294847440197, + "loss": 3.4097, + "step": 3369 + }, + { + "epoch": 0.43, + "grad_norm": 0.6326252222061157, + "learning_rate": 0.00031691308048575185, + "loss": 3.4594, + "step": 3370 + }, + { + "epoch": 0.43, + "grad_norm": 0.655439019203186, + "learning_rate": 0.0003168132009939674, + "loss": 3.3764, + "step": 3371 + }, + { + "epoch": 0.43, + "grad_norm": 0.6298957467079163, + "learning_rate": 0.00031671331001621906, + "loss": 3.3612, + "step": 3372 + }, + { + "epoch": 0.43, + "grad_norm": 0.6700654625892639, + "learning_rate": 0.00031661340756967943, + "loss": 3.4241, + "step": 3373 + }, + { + "epoch": 0.43, + "grad_norm": 0.6177738308906555, + "learning_rate": 0.0003165134936715227, + "loss": 3.3666, + "step": 3374 + }, + { + "epoch": 0.43, + "grad_norm": 0.6501065492630005, + "learning_rate": 0.00031641356833892544, + "loss": 3.4278, + "step": 3375 + }, + { + "epoch": 0.43, + "grad_norm": 0.655293345451355, + "learning_rate": 0.0003163136315890657, + "loss": 3.3245, + "step": 3376 + }, + { + "epoch": 0.43, + "grad_norm": 0.659319281578064, + "learning_rate": 0.000316213683439124, + "loss": 3.3409, + "step": 3377 + }, + { + "epoch": 0.43, + "grad_norm": 0.615034818649292, + "learning_rate": 0.0003161137239062825, + "loss": 3.3011, + "step": 3378 + }, + { + "epoch": 0.43, + "grad_norm": 0.645544707775116, + "learning_rate": 0.00031601375300772537, + "loss": 3.4239, + "step": 3379 + }, + { + "epoch": 0.43, + "grad_norm": 0.595169723033905, + "learning_rate": 0.0003159137707606388, + "loss": 3.2856, + "step": 3380 + }, + { + "epoch": 0.43, + "grad_norm": 0.607345700263977, + "learning_rate": 0.0003158137771822109, + "loss": 3.4077, + "step": 3381 + }, + { + "epoch": 0.43, + "grad_norm": 0.5927847027778625, + "learning_rate": 0.00031571377228963184, + "loss": 3.3385, + "step": 3382 + }, + { + "epoch": 0.43, + "grad_norm": 0.6054478883743286, + "learning_rate": 0.0003156137561000933, + "loss": 3.4818, + "step": 3383 + }, + { + "epoch": 0.43, + "grad_norm": 0.6634422540664673, + "learning_rate": 0.00031551372863078956, + "loss": 3.2794, + "step": 3384 + }, + { + "epoch": 0.43, + "grad_norm": 0.591606080532074, + "learning_rate": 0.0003154136898989164, + "loss": 3.443, + "step": 3385 + }, + { + "epoch": 0.43, + "grad_norm": 0.672604501247406, + "learning_rate": 0.0003153136399216715, + "loss": 3.3172, + "step": 3386 + }, + { + "epoch": 0.43, + "grad_norm": 0.5803119540214539, + "learning_rate": 0.0003152135787162548, + "loss": 3.2582, + "step": 3387 + }, + { + "epoch": 0.43, + "grad_norm": 0.6184684634208679, + "learning_rate": 0.0003151135062998678, + "loss": 3.3465, + "step": 3388 + }, + { + "epoch": 0.43, + "grad_norm": 0.6289113163948059, + "learning_rate": 0.00031501342268971434, + "loss": 3.3795, + "step": 3389 + }, + { + "epoch": 0.43, + "grad_norm": 0.6137139201164246, + "learning_rate": 0.00031491332790299973, + "loss": 3.3531, + "step": 3390 + }, + { + "epoch": 0.43, + "grad_norm": 0.6423360705375671, + "learning_rate": 0.0003148132219569315, + "loss": 3.4468, + "step": 3391 + }, + { + "epoch": 0.43, + "grad_norm": 0.6035122871398926, + "learning_rate": 0.0003147131048687191, + "loss": 3.3405, + "step": 3392 + }, + { + "epoch": 0.43, + "grad_norm": 0.598874032497406, + "learning_rate": 0.0003146129766555737, + "loss": 3.3547, + "step": 3393 + }, + { + "epoch": 0.43, + "grad_norm": 0.6974937319755554, + "learning_rate": 0.0003145128373347086, + "loss": 3.4845, + "step": 3394 + }, + { + "epoch": 0.43, + "grad_norm": 0.6463556289672852, + "learning_rate": 0.0003144126869233389, + "loss": 3.2956, + "step": 3395 + }, + { + "epoch": 0.43, + "grad_norm": 0.69242262840271, + "learning_rate": 0.0003143125254386815, + "loss": 3.5167, + "step": 3396 + }, + { + "epoch": 0.43, + "grad_norm": 0.6210297346115112, + "learning_rate": 0.00031421235289795534, + "loss": 3.5848, + "step": 3397 + }, + { + "epoch": 0.43, + "grad_norm": 1.0055627822875977, + "learning_rate": 0.0003141121693183814, + "loss": 3.3848, + "step": 3398 + }, + { + "epoch": 0.44, + "grad_norm": 0.7247527241706848, + "learning_rate": 0.00031401197471718235, + "loss": 3.3554, + "step": 3399 + }, + { + "epoch": 0.44, + "grad_norm": 0.5828782916069031, + "learning_rate": 0.0003139117691115827, + "loss": 3.4218, + "step": 3400 + }, + { + "epoch": 0.44, + "grad_norm": 0.6309610605239868, + "learning_rate": 0.000313811552518809, + "loss": 3.433, + "step": 3401 + }, + { + "epoch": 0.44, + "grad_norm": 0.6716001033782959, + "learning_rate": 0.0003137113249560896, + "loss": 3.3232, + "step": 3402 + }, + { + "epoch": 0.44, + "grad_norm": 0.6095533967018127, + "learning_rate": 0.00031361108644065497, + "loss": 3.3684, + "step": 3403 + }, + { + "epoch": 0.44, + "grad_norm": 0.6483668088912964, + "learning_rate": 0.00031351083698973704, + "loss": 3.2485, + "step": 3404 + }, + { + "epoch": 0.44, + "grad_norm": 0.6276049613952637, + "learning_rate": 0.00031341057662057, + "loss": 3.3453, + "step": 3405 + }, + { + "epoch": 0.44, + "grad_norm": 0.6165592074394226, + "learning_rate": 0.00031331030535038963, + "loss": 3.4065, + "step": 3406 + }, + { + "epoch": 0.44, + "grad_norm": 0.5991979837417603, + "learning_rate": 0.00031321002319643394, + "loss": 3.2915, + "step": 3407 + }, + { + "epoch": 0.44, + "grad_norm": 0.6335907578468323, + "learning_rate": 0.00031310973017594233, + "loss": 3.4469, + "step": 3408 + }, + { + "epoch": 0.44, + "grad_norm": 0.6385035514831543, + "learning_rate": 0.00031300942630615666, + "loss": 3.3794, + "step": 3409 + }, + { + "epoch": 0.44, + "grad_norm": 0.6390557885169983, + "learning_rate": 0.00031290911160432, + "loss": 3.4735, + "step": 3410 + }, + { + "epoch": 0.44, + "grad_norm": 0.5964511632919312, + "learning_rate": 0.0003128087860876778, + "loss": 3.2477, + "step": 3411 + }, + { + "epoch": 0.44, + "grad_norm": 0.6422802209854126, + "learning_rate": 0.00031270844977347706, + "loss": 3.464, + "step": 3412 + }, + { + "epoch": 0.44, + "grad_norm": 0.6574854850769043, + "learning_rate": 0.00031260810267896686, + "loss": 3.3135, + "step": 3413 + }, + { + "epoch": 0.44, + "grad_norm": 0.6653745770454407, + "learning_rate": 0.0003125077448213981, + "loss": 3.4306, + "step": 3414 + }, + { + "epoch": 0.44, + "grad_norm": 0.6036469340324402, + "learning_rate": 0.0003124073762180232, + "loss": 3.4437, + "step": 3415 + }, + { + "epoch": 0.44, + "grad_norm": 0.6745374798774719, + "learning_rate": 0.00031230699688609693, + "loss": 3.2753, + "step": 3416 + }, + { + "epoch": 0.44, + "grad_norm": 0.6684151291847229, + "learning_rate": 0.0003122066068428756, + "loss": 3.3819, + "step": 3417 + }, + { + "epoch": 0.44, + "grad_norm": 0.6250439882278442, + "learning_rate": 0.00031210620610561736, + "loss": 3.3884, + "step": 3418 + }, + { + "epoch": 0.44, + "grad_norm": 0.6410674452781677, + "learning_rate": 0.0003120057946915822, + "loss": 3.3788, + "step": 3419 + }, + { + "epoch": 0.44, + "grad_norm": 0.6384410262107849, + "learning_rate": 0.00031190537261803215, + "loss": 3.4571, + "step": 3420 + }, + { + "epoch": 0.44, + "grad_norm": 0.6605309844017029, + "learning_rate": 0.0003118049399022309, + "loss": 3.3624, + "step": 3421 + }, + { + "epoch": 0.44, + "grad_norm": 0.6254794001579285, + "learning_rate": 0.000311704496561444, + "loss": 3.3778, + "step": 3422 + }, + { + "epoch": 0.44, + "grad_norm": 0.6189180016517639, + "learning_rate": 0.0003116040426129388, + "loss": 3.3505, + "step": 3423 + }, + { + "epoch": 0.44, + "grad_norm": 0.6452104449272156, + "learning_rate": 0.0003115035780739845, + "loss": 3.3617, + "step": 3424 + }, + { + "epoch": 0.44, + "grad_norm": 0.6219657063484192, + "learning_rate": 0.000311403102961852, + "loss": 3.3035, + "step": 3425 + }, + { + "epoch": 0.44, + "grad_norm": 0.6689308285713196, + "learning_rate": 0.0003113026172938144, + "loss": 3.4442, + "step": 3426 + }, + { + "epoch": 0.44, + "grad_norm": 0.6046881079673767, + "learning_rate": 0.0003112021210871462, + "loss": 3.2917, + "step": 3427 + }, + { + "epoch": 0.44, + "grad_norm": 0.6043495535850525, + "learning_rate": 0.00031110161435912384, + "loss": 3.4112, + "step": 3428 + }, + { + "epoch": 0.44, + "grad_norm": 0.6852877140045166, + "learning_rate": 0.0003110010971270256, + "loss": 3.3943, + "step": 3429 + }, + { + "epoch": 0.44, + "grad_norm": 0.6157641410827637, + "learning_rate": 0.0003109005694081315, + "loss": 3.2675, + "step": 3430 + }, + { + "epoch": 0.44, + "grad_norm": 0.5903221368789673, + "learning_rate": 0.0003108000312197236, + "loss": 3.3839, + "step": 3431 + }, + { + "epoch": 0.44, + "grad_norm": 0.6359903812408447, + "learning_rate": 0.0003106994825790855, + "loss": 3.4311, + "step": 3432 + }, + { + "epoch": 0.44, + "grad_norm": 0.6401050090789795, + "learning_rate": 0.0003105989235035026, + "loss": 3.3574, + "step": 3433 + }, + { + "epoch": 0.44, + "grad_norm": 0.5832918286323547, + "learning_rate": 0.0003104983540102622, + "loss": 3.3243, + "step": 3434 + }, + { + "epoch": 0.44, + "grad_norm": 0.6364819407463074, + "learning_rate": 0.0003103977741166535, + "loss": 3.3136, + "step": 3435 + }, + { + "epoch": 0.44, + "grad_norm": 0.5973476767539978, + "learning_rate": 0.0003102971838399672, + "loss": 3.4813, + "step": 3436 + }, + { + "epoch": 0.44, + "grad_norm": 0.6046728491783142, + "learning_rate": 0.0003101965831974959, + "loss": 3.2598, + "step": 3437 + }, + { + "epoch": 0.44, + "grad_norm": 0.6494289636611938, + "learning_rate": 0.00031009597220653417, + "loss": 3.3655, + "step": 3438 + }, + { + "epoch": 0.44, + "grad_norm": 0.6511953473091125, + "learning_rate": 0.00030999535088437806, + "loss": 3.4139, + "step": 3439 + }, + { + "epoch": 0.44, + "grad_norm": 0.6205424070358276, + "learning_rate": 0.0003098947192483256, + "loss": 3.32, + "step": 3440 + }, + { + "epoch": 0.44, + "grad_norm": 0.6335713863372803, + "learning_rate": 0.0003097940773156766, + "loss": 3.4043, + "step": 3441 + }, + { + "epoch": 0.44, + "grad_norm": 0.6370178461074829, + "learning_rate": 0.00030969342510373253, + "loss": 3.3922, + "step": 3442 + }, + { + "epoch": 0.44, + "grad_norm": 0.6742094159126282, + "learning_rate": 0.00030959276262979663, + "loss": 3.4295, + "step": 3443 + }, + { + "epoch": 0.44, + "grad_norm": 0.5739489197731018, + "learning_rate": 0.00030949208991117396, + "loss": 3.3301, + "step": 3444 + }, + { + "epoch": 0.44, + "grad_norm": 0.6312046051025391, + "learning_rate": 0.00030939140696517137, + "loss": 3.2065, + "step": 3445 + }, + { + "epoch": 0.44, + "grad_norm": 0.6611394882202148, + "learning_rate": 0.0003092907138090974, + "loss": 3.3841, + "step": 3446 + }, + { + "epoch": 0.44, + "grad_norm": 0.6275805234909058, + "learning_rate": 0.0003091900104602623, + "loss": 3.3972, + "step": 3447 + }, + { + "epoch": 0.44, + "grad_norm": 0.6278124451637268, + "learning_rate": 0.00030908929693597827, + "loss": 3.3839, + "step": 3448 + }, + { + "epoch": 0.44, + "grad_norm": 0.6052056550979614, + "learning_rate": 0.00030898857325355905, + "loss": 3.4597, + "step": 3449 + }, + { + "epoch": 0.44, + "grad_norm": 0.6291190385818481, + "learning_rate": 0.00030888783943032027, + "loss": 3.2394, + "step": 3450 + }, + { + "epoch": 0.44, + "grad_norm": 0.6044225692749023, + "learning_rate": 0.0003087870954835792, + "loss": 3.4053, + "step": 3451 + }, + { + "epoch": 0.44, + "grad_norm": 0.6691856980323792, + "learning_rate": 0.00030868634143065487, + "loss": 3.3968, + "step": 3452 + }, + { + "epoch": 0.44, + "grad_norm": 0.6347593069076538, + "learning_rate": 0.00030858557728886813, + "loss": 3.3502, + "step": 3453 + }, + { + "epoch": 0.44, + "grad_norm": 0.6214762330055237, + "learning_rate": 0.0003084848030755415, + "loss": 3.3677, + "step": 3454 + }, + { + "epoch": 0.44, + "grad_norm": 0.6205227375030518, + "learning_rate": 0.0003083840188079991, + "loss": 3.4138, + "step": 3455 + }, + { + "epoch": 0.44, + "grad_norm": 0.6083345413208008, + "learning_rate": 0.0003082832245035671, + "loss": 3.4667, + "step": 3456 + }, + { + "epoch": 0.44, + "grad_norm": 0.6635501980781555, + "learning_rate": 0.0003081824201795731, + "loss": 3.3298, + "step": 3457 + }, + { + "epoch": 0.44, + "grad_norm": 0.6177193522453308, + "learning_rate": 0.00030808160585334653, + "loss": 3.3713, + "step": 3458 + }, + { + "epoch": 0.44, + "grad_norm": 0.6598303914070129, + "learning_rate": 0.0003079807815422187, + "loss": 3.4621, + "step": 3459 + }, + { + "epoch": 0.44, + "grad_norm": 0.6189092397689819, + "learning_rate": 0.0003078799472635223, + "loss": 3.3052, + "step": 3460 + }, + { + "epoch": 0.44, + "grad_norm": 0.5945866107940674, + "learning_rate": 0.00030777910303459187, + "loss": 3.3532, + "step": 3461 + }, + { + "epoch": 0.44, + "grad_norm": 0.6236907839775085, + "learning_rate": 0.00030767824887276385, + "loss": 3.2658, + "step": 3462 + }, + { + "epoch": 0.44, + "grad_norm": 0.6429352164268494, + "learning_rate": 0.00030757738479537624, + "loss": 3.5409, + "step": 3463 + }, + { + "epoch": 0.44, + "grad_norm": 0.6596398949623108, + "learning_rate": 0.0003074765108197687, + "loss": 3.3922, + "step": 3464 + }, + { + "epoch": 0.44, + "grad_norm": 0.6131845116615295, + "learning_rate": 0.00030737562696328254, + "loss": 3.5003, + "step": 3465 + }, + { + "epoch": 0.44, + "grad_norm": 0.6321629881858826, + "learning_rate": 0.000307274733243261, + "loss": 3.3875, + "step": 3466 + }, + { + "epoch": 0.44, + "grad_norm": 0.5874630212783813, + "learning_rate": 0.0003071738296770488, + "loss": 3.3837, + "step": 3467 + }, + { + "epoch": 0.44, + "grad_norm": 0.6010831594467163, + "learning_rate": 0.0003070729162819925, + "loss": 3.3813, + "step": 3468 + }, + { + "epoch": 0.44, + "grad_norm": 0.6139161586761475, + "learning_rate": 0.00030697199307544014, + "loss": 3.3483, + "step": 3469 + }, + { + "epoch": 0.44, + "grad_norm": 0.6040095090866089, + "learning_rate": 0.0003068710600747418, + "loss": 3.4057, + "step": 3470 + }, + { + "epoch": 0.44, + "grad_norm": 0.6296358704566956, + "learning_rate": 0.0003067701172972489, + "loss": 3.3732, + "step": 3471 + }, + { + "epoch": 0.44, + "grad_norm": 0.6295837163925171, + "learning_rate": 0.0003066691647603147, + "loss": 3.3701, + "step": 3472 + }, + { + "epoch": 0.44, + "grad_norm": 0.6771396398544312, + "learning_rate": 0.0003065682024812941, + "loss": 3.31, + "step": 3473 + }, + { + "epoch": 0.44, + "grad_norm": 0.6971045732498169, + "learning_rate": 0.0003064672304775438, + "loss": 3.4291, + "step": 3474 + }, + { + "epoch": 0.44, + "grad_norm": 0.5894261598587036, + "learning_rate": 0.0003063662487664218, + "loss": 3.4009, + "step": 3475 + }, + { + "epoch": 0.44, + "grad_norm": 0.6298110485076904, + "learning_rate": 0.0003062652573652882, + "loss": 3.3064, + "step": 3476 + }, + { + "epoch": 0.45, + "grad_norm": 0.6593163013458252, + "learning_rate": 0.00030616425629150467, + "loss": 3.3307, + "step": 3477 + }, + { + "epoch": 0.45, + "grad_norm": 0.6416672468185425, + "learning_rate": 0.00030606324556243435, + "loss": 3.3134, + "step": 3478 + }, + { + "epoch": 0.45, + "grad_norm": 0.6436814665794373, + "learning_rate": 0.00030596222519544216, + "loss": 3.4234, + "step": 3479 + }, + { + "epoch": 0.45, + "grad_norm": 0.6197558641433716, + "learning_rate": 0.00030586119520789465, + "loss": 3.3476, + "step": 3480 + }, + { + "epoch": 0.45, + "grad_norm": 0.6113980412483215, + "learning_rate": 0.00030576015561716014, + "loss": 3.2472, + "step": 3481 + }, + { + "epoch": 0.45, + "grad_norm": 0.64369136095047, + "learning_rate": 0.00030565910644060845, + "loss": 3.4599, + "step": 3482 + }, + { + "epoch": 0.45, + "grad_norm": 0.6595032811164856, + "learning_rate": 0.00030555804769561107, + "loss": 3.307, + "step": 3483 + }, + { + "epoch": 0.45, + "grad_norm": 0.6180384159088135, + "learning_rate": 0.00030545697939954124, + "loss": 3.2461, + "step": 3484 + }, + { + "epoch": 0.45, + "grad_norm": 0.6257269382476807, + "learning_rate": 0.00030535590156977375, + "loss": 3.3398, + "step": 3485 + }, + { + "epoch": 0.45, + "grad_norm": 0.6344363689422607, + "learning_rate": 0.00030525481422368503, + "loss": 3.4293, + "step": 3486 + }, + { + "epoch": 0.45, + "grad_norm": 0.6301414370536804, + "learning_rate": 0.00030515371737865313, + "loss": 3.3741, + "step": 3487 + }, + { + "epoch": 0.45, + "grad_norm": 0.6213796138763428, + "learning_rate": 0.00030505261105205785, + "loss": 3.4556, + "step": 3488 + }, + { + "epoch": 0.45, + "grad_norm": 1.0104610919952393, + "learning_rate": 0.00030495149526128053, + "loss": 3.3195, + "step": 3489 + }, + { + "epoch": 0.45, + "grad_norm": 0.6484731435775757, + "learning_rate": 0.000304850370023704, + "loss": 3.2919, + "step": 3490 + }, + { + "epoch": 0.45, + "grad_norm": 0.6554868221282959, + "learning_rate": 0.0003047492353567131, + "loss": 3.3719, + "step": 3491 + }, + { + "epoch": 0.45, + "grad_norm": 0.5816743969917297, + "learning_rate": 0.000304648091277694, + "loss": 3.1774, + "step": 3492 + }, + { + "epoch": 0.45, + "grad_norm": 0.6390587091445923, + "learning_rate": 0.0003045469378040344, + "loss": 3.3829, + "step": 3493 + }, + { + "epoch": 0.45, + "grad_norm": 0.6130589842796326, + "learning_rate": 0.00030444577495312373, + "loss": 3.3452, + "step": 3494 + }, + { + "epoch": 0.45, + "grad_norm": 0.6152821183204651, + "learning_rate": 0.0003043446027423533, + "loss": 3.3599, + "step": 3495 + }, + { + "epoch": 0.45, + "grad_norm": 0.6135820746421814, + "learning_rate": 0.0003042434211891156, + "loss": 3.4351, + "step": 3496 + }, + { + "epoch": 0.45, + "grad_norm": 0.5926715731620789, + "learning_rate": 0.0003041422303108049, + "loss": 3.5196, + "step": 3497 + }, + { + "epoch": 0.45, + "grad_norm": 0.6294316649436951, + "learning_rate": 0.0003040410301248172, + "loss": 3.4092, + "step": 3498 + }, + { + "epoch": 0.45, + "grad_norm": 0.6111506819725037, + "learning_rate": 0.00030393982064854985, + "loss": 3.2415, + "step": 3499 + }, + { + "epoch": 0.45, + "grad_norm": 0.6075042486190796, + "learning_rate": 0.0003038386018994021, + "loss": 3.3598, + "step": 3500 + }, + { + "epoch": 0.45, + "grad_norm": 0.586727499961853, + "learning_rate": 0.00030373737389477446, + "loss": 3.4594, + "step": 3501 + }, + { + "epoch": 0.45, + "grad_norm": 0.7585539221763611, + "learning_rate": 0.00030363613665206936, + "loss": 3.4881, + "step": 3502 + }, + { + "epoch": 0.45, + "grad_norm": 0.5955037474632263, + "learning_rate": 0.0003035348901886905, + "loss": 3.4064, + "step": 3503 + }, + { + "epoch": 0.45, + "grad_norm": 0.5939720869064331, + "learning_rate": 0.00030343363452204334, + "loss": 3.3177, + "step": 3504 + }, + { + "epoch": 0.45, + "grad_norm": 0.6512664556503296, + "learning_rate": 0.000303332369669535, + "loss": 3.3653, + "step": 3505 + }, + { + "epoch": 0.45, + "grad_norm": 0.6145567297935486, + "learning_rate": 0.0003032310956485741, + "loss": 3.3844, + "step": 3506 + }, + { + "epoch": 0.45, + "grad_norm": 0.6215113997459412, + "learning_rate": 0.0003031298124765707, + "loss": 3.3526, + "step": 3507 + }, + { + "epoch": 0.45, + "grad_norm": 0.619853675365448, + "learning_rate": 0.00030302852017093654, + "loss": 3.4662, + "step": 3508 + }, + { + "epoch": 0.45, + "grad_norm": 0.6302598118782043, + "learning_rate": 0.000302927218749085, + "loss": 3.3894, + "step": 3509 + }, + { + "epoch": 0.45, + "grad_norm": 0.6114816069602966, + "learning_rate": 0.0003028259082284311, + "loss": 3.3717, + "step": 3510 + }, + { + "epoch": 0.45, + "grad_norm": 0.6622714400291443, + "learning_rate": 0.00030272458862639105, + "loss": 3.3851, + "step": 3511 + }, + { + "epoch": 0.45, + "grad_norm": 0.6394220590591431, + "learning_rate": 0.00030262325996038305, + "loss": 3.3535, + "step": 3512 + }, + { + "epoch": 0.45, + "grad_norm": 0.6466477513313293, + "learning_rate": 0.00030252192224782654, + "loss": 3.2252, + "step": 3513 + }, + { + "epoch": 0.45, + "grad_norm": 0.582334578037262, + "learning_rate": 0.00030242057550614276, + "loss": 3.3672, + "step": 3514 + }, + { + "epoch": 0.45, + "grad_norm": 0.6474951505661011, + "learning_rate": 0.0003023192197527543, + "loss": 3.3184, + "step": 3515 + }, + { + "epoch": 0.45, + "grad_norm": 0.6074023246765137, + "learning_rate": 0.0003022178550050855, + "loss": 3.2978, + "step": 3516 + }, + { + "epoch": 0.45, + "grad_norm": 0.6092503070831299, + "learning_rate": 0.00030211648128056196, + "loss": 3.4776, + "step": 3517 + }, + { + "epoch": 0.45, + "grad_norm": 0.6604886054992676, + "learning_rate": 0.0003020150985966111, + "loss": 3.4106, + "step": 3518 + }, + { + "epoch": 0.45, + "grad_norm": 0.599673330783844, + "learning_rate": 0.00030191370697066187, + "loss": 3.4092, + "step": 3519 + }, + { + "epoch": 0.45, + "grad_norm": 0.6192481517791748, + "learning_rate": 0.00030181230642014456, + "loss": 3.3152, + "step": 3520 + }, + { + "epoch": 0.45, + "grad_norm": 0.6263299584388733, + "learning_rate": 0.0003017108969624911, + "loss": 3.4797, + "step": 3521 + }, + { + "epoch": 0.45, + "grad_norm": 0.6188412308692932, + "learning_rate": 0.000301609478615135, + "loss": 3.329, + "step": 3522 + }, + { + "epoch": 0.45, + "grad_norm": 0.6249926090240479, + "learning_rate": 0.00030150805139551114, + "loss": 3.2953, + "step": 3523 + }, + { + "epoch": 0.45, + "grad_norm": 0.6381181478500366, + "learning_rate": 0.0003014066153210562, + "loss": 3.4152, + "step": 3524 + }, + { + "epoch": 0.45, + "grad_norm": 0.6533681750297546, + "learning_rate": 0.0003013051704092081, + "loss": 3.3498, + "step": 3525 + }, + { + "epoch": 0.45, + "grad_norm": 0.6145551800727844, + "learning_rate": 0.0003012037166774063, + "loss": 3.4357, + "step": 3526 + }, + { + "epoch": 0.45, + "grad_norm": 0.6011974215507507, + "learning_rate": 0.00030110225414309213, + "loss": 3.365, + "step": 3527 + }, + { + "epoch": 0.45, + "grad_norm": 0.6385951042175293, + "learning_rate": 0.00030100078282370804, + "loss": 3.3649, + "step": 3528 + }, + { + "epoch": 0.45, + "grad_norm": 0.5937651991844177, + "learning_rate": 0.0003008993027366981, + "loss": 3.271, + "step": 3529 + }, + { + "epoch": 0.45, + "grad_norm": 0.6630327701568604, + "learning_rate": 0.000300797813899508, + "loss": 3.2964, + "step": 3530 + }, + { + "epoch": 0.45, + "grad_norm": 0.7359730005264282, + "learning_rate": 0.00030069631632958474, + "loss": 3.3561, + "step": 3531 + }, + { + "epoch": 0.45, + "grad_norm": 0.631149172782898, + "learning_rate": 0.00030059481004437696, + "loss": 3.3569, + "step": 3532 + }, + { + "epoch": 0.45, + "grad_norm": 0.6512892842292786, + "learning_rate": 0.00030049329506133483, + "loss": 3.3519, + "step": 3533 + }, + { + "epoch": 0.45, + "grad_norm": 0.6602842807769775, + "learning_rate": 0.0003003917713979099, + "loss": 3.5396, + "step": 3534 + }, + { + "epoch": 0.45, + "grad_norm": 0.6186116337776184, + "learning_rate": 0.0003002902390715553, + "loss": 3.4058, + "step": 3535 + }, + { + "epoch": 0.45, + "grad_norm": 0.6194803714752197, + "learning_rate": 0.00030018869809972555, + "loss": 3.33, + "step": 3536 + }, + { + "epoch": 0.45, + "grad_norm": 0.6519399285316467, + "learning_rate": 0.0003000871484998767, + "loss": 3.3879, + "step": 3537 + }, + { + "epoch": 0.45, + "grad_norm": 0.6017206311225891, + "learning_rate": 0.00029998559028946646, + "loss": 3.3329, + "step": 3538 + }, + { + "epoch": 0.45, + "grad_norm": 0.6431175470352173, + "learning_rate": 0.0002998840234859538, + "loss": 3.3358, + "step": 3539 + }, + { + "epoch": 0.45, + "grad_norm": 0.6184296011924744, + "learning_rate": 0.00029978244810679913, + "loss": 3.4958, + "step": 3540 + }, + { + "epoch": 0.45, + "grad_norm": 0.625480592250824, + "learning_rate": 0.0002996808641694644, + "loss": 3.3546, + "step": 3541 + }, + { + "epoch": 0.45, + "grad_norm": 0.6198711395263672, + "learning_rate": 0.00029957927169141336, + "loss": 3.2546, + "step": 3542 + }, + { + "epoch": 0.45, + "grad_norm": 0.6898179650306702, + "learning_rate": 0.0002994776706901107, + "loss": 3.3301, + "step": 3543 + }, + { + "epoch": 0.45, + "grad_norm": 0.6591171622276306, + "learning_rate": 0.0002993760611830228, + "loss": 3.2971, + "step": 3544 + }, + { + "epoch": 0.45, + "grad_norm": 0.6145486831665039, + "learning_rate": 0.00029927444318761755, + "loss": 3.2786, + "step": 3545 + }, + { + "epoch": 0.45, + "grad_norm": 0.6447986364364624, + "learning_rate": 0.0002991728167213643, + "loss": 3.2749, + "step": 3546 + }, + { + "epoch": 0.45, + "grad_norm": 0.6319825649261475, + "learning_rate": 0.0002990711818017338, + "loss": 3.2753, + "step": 3547 + }, + { + "epoch": 0.45, + "grad_norm": 0.6972938776016235, + "learning_rate": 0.00029896953844619833, + "loss": 3.5112, + "step": 3548 + }, + { + "epoch": 0.45, + "grad_norm": 0.618682861328125, + "learning_rate": 0.0002988678866722315, + "loss": 3.2993, + "step": 3549 + }, + { + "epoch": 0.45, + "grad_norm": 0.5898873209953308, + "learning_rate": 0.0002987662264973083, + "loss": 3.4458, + "step": 3550 + }, + { + "epoch": 0.45, + "grad_norm": 0.6917723417282104, + "learning_rate": 0.00029866455793890544, + "loss": 3.3236, + "step": 3551 + }, + { + "epoch": 0.45, + "grad_norm": 0.6008872389793396, + "learning_rate": 0.000298562881014501, + "loss": 3.4311, + "step": 3552 + }, + { + "epoch": 0.45, + "grad_norm": 0.674096941947937, + "learning_rate": 0.0002984611957415743, + "loss": 3.3805, + "step": 3553 + }, + { + "epoch": 0.45, + "grad_norm": 0.6469094157218933, + "learning_rate": 0.0002983595021376062, + "loss": 3.3483, + "step": 3554 + }, + { + "epoch": 0.46, + "grad_norm": 0.6237218379974365, + "learning_rate": 0.000298257800220079, + "loss": 3.3999, + "step": 3555 + }, + { + "epoch": 0.46, + "grad_norm": 1.1701252460479736, + "learning_rate": 0.0002981560900064766, + "loss": 3.4764, + "step": 3556 + }, + { + "epoch": 0.46, + "grad_norm": 0.6512659788131714, + "learning_rate": 0.00029805437151428403, + "loss": 3.3742, + "step": 3557 + }, + { + "epoch": 0.46, + "grad_norm": 0.6231496334075928, + "learning_rate": 0.00029795264476098786, + "loss": 3.2932, + "step": 3558 + }, + { + "epoch": 0.46, + "grad_norm": 0.5999680757522583, + "learning_rate": 0.00029785090976407614, + "loss": 3.3411, + "step": 3559 + }, + { + "epoch": 0.46, + "grad_norm": 0.6761013269424438, + "learning_rate": 0.0002977491665410383, + "loss": 3.472, + "step": 3560 + }, + { + "epoch": 0.46, + "grad_norm": 0.6520906090736389, + "learning_rate": 0.0002976474151093653, + "loss": 3.3184, + "step": 3561 + }, + { + "epoch": 0.46, + "grad_norm": 0.6317535638809204, + "learning_rate": 0.0002975456554865491, + "loss": 3.3547, + "step": 3562 + }, + { + "epoch": 0.46, + "grad_norm": 0.635465681552887, + "learning_rate": 0.00029744388769008365, + "loss": 3.453, + "step": 3563 + }, + { + "epoch": 0.46, + "grad_norm": 0.6405307650566101, + "learning_rate": 0.0002973421117374637, + "loss": 3.2819, + "step": 3564 + }, + { + "epoch": 0.46, + "grad_norm": 0.622593343257904, + "learning_rate": 0.00029724032764618605, + "loss": 3.4364, + "step": 3565 + }, + { + "epoch": 0.46, + "grad_norm": 0.6837420463562012, + "learning_rate": 0.00029713853543374846, + "loss": 3.4766, + "step": 3566 + }, + { + "epoch": 0.46, + "grad_norm": 0.6647900938987732, + "learning_rate": 0.0002970367351176501, + "loss": 3.4147, + "step": 3567 + }, + { + "epoch": 0.46, + "grad_norm": 0.6509084105491638, + "learning_rate": 0.00029693492671539165, + "loss": 3.3641, + "step": 3568 + }, + { + "epoch": 0.46, + "grad_norm": 0.5970133543014526, + "learning_rate": 0.0002968331102444752, + "loss": 3.3489, + "step": 3569 + }, + { + "epoch": 0.46, + "grad_norm": 0.7497248649597168, + "learning_rate": 0.0002967312857224042, + "loss": 3.4326, + "step": 3570 + }, + { + "epoch": 0.46, + "grad_norm": 0.6541271805763245, + "learning_rate": 0.0002966294531666834, + "loss": 3.4407, + "step": 3571 + }, + { + "epoch": 0.46, + "grad_norm": 0.6058679819107056, + "learning_rate": 0.000296527612594819, + "loss": 3.433, + "step": 3572 + }, + { + "epoch": 0.46, + "grad_norm": 0.5970894694328308, + "learning_rate": 0.0002964257640243186, + "loss": 3.381, + "step": 3573 + }, + { + "epoch": 0.46, + "grad_norm": 0.6197634339332581, + "learning_rate": 0.00029632390747269115, + "loss": 3.3566, + "step": 3574 + }, + { + "epoch": 0.46, + "grad_norm": 0.5964658260345459, + "learning_rate": 0.000296222042957447, + "loss": 3.2884, + "step": 3575 + }, + { + "epoch": 0.46, + "grad_norm": 0.5752972960472107, + "learning_rate": 0.0002961201704960978, + "loss": 3.4334, + "step": 3576 + }, + { + "epoch": 0.46, + "grad_norm": 0.6440456509590149, + "learning_rate": 0.00029601829010615665, + "loss": 3.4766, + "step": 3577 + }, + { + "epoch": 0.46, + "grad_norm": 0.6194326877593994, + "learning_rate": 0.0002959164018051379, + "loss": 3.3685, + "step": 3578 + }, + { + "epoch": 0.46, + "grad_norm": 0.6755337119102478, + "learning_rate": 0.0002958145056105575, + "loss": 3.4672, + "step": 3579 + }, + { + "epoch": 0.46, + "grad_norm": 0.6031014919281006, + "learning_rate": 0.00029571260153993224, + "loss": 3.2198, + "step": 3580 + }, + { + "epoch": 0.46, + "grad_norm": 0.601763904094696, + "learning_rate": 0.000295610689610781, + "loss": 3.25, + "step": 3581 + }, + { + "epoch": 0.46, + "grad_norm": 0.5831199884414673, + "learning_rate": 0.00029550876984062337, + "loss": 3.315, + "step": 3582 + }, + { + "epoch": 0.46, + "grad_norm": 0.6840198040008545, + "learning_rate": 0.00029540684224698056, + "loss": 3.4738, + "step": 3583 + }, + { + "epoch": 0.46, + "grad_norm": 0.6487699151039124, + "learning_rate": 0.0002953049068473753, + "loss": 3.3388, + "step": 3584 + }, + { + "epoch": 0.46, + "grad_norm": 0.6016435027122498, + "learning_rate": 0.0002952029636593313, + "loss": 3.2886, + "step": 3585 + }, + { + "epoch": 0.46, + "grad_norm": 0.6112717390060425, + "learning_rate": 0.00029510101270037377, + "loss": 3.4304, + "step": 3586 + }, + { + "epoch": 0.46, + "grad_norm": 0.6386420130729675, + "learning_rate": 0.00029499905398802927, + "loss": 3.3713, + "step": 3587 + }, + { + "epoch": 0.46, + "grad_norm": 0.6484072804450989, + "learning_rate": 0.0002948970875398258, + "loss": 3.4334, + "step": 3588 + }, + { + "epoch": 0.46, + "grad_norm": 0.6308383345603943, + "learning_rate": 0.0002947951133732925, + "loss": 3.2712, + "step": 3589 + }, + { + "epoch": 0.46, + "grad_norm": 0.6079784035682678, + "learning_rate": 0.00029469313150595983, + "loss": 3.4595, + "step": 3590 + }, + { + "epoch": 0.46, + "grad_norm": 0.6424318552017212, + "learning_rate": 0.00029459114195535977, + "loss": 3.408, + "step": 3591 + }, + { + "epoch": 0.46, + "grad_norm": 0.5923727750778198, + "learning_rate": 0.00029448914473902544, + "loss": 3.3762, + "step": 3592 + }, + { + "epoch": 0.46, + "grad_norm": 0.5885934829711914, + "learning_rate": 0.0002943871398744914, + "loss": 3.4226, + "step": 3593 + }, + { + "epoch": 0.46, + "grad_norm": 0.6437953114509583, + "learning_rate": 0.00029428512737929334, + "loss": 3.3637, + "step": 3594 + }, + { + "epoch": 0.46, + "grad_norm": 0.6215007901191711, + "learning_rate": 0.00029418310727096864, + "loss": 3.3717, + "step": 3595 + }, + { + "epoch": 0.46, + "grad_norm": 0.6558961868286133, + "learning_rate": 0.00029408107956705544, + "loss": 3.1917, + "step": 3596 + }, + { + "epoch": 0.46, + "grad_norm": 0.606677770614624, + "learning_rate": 0.0002939790442850937, + "loss": 3.3311, + "step": 3597 + }, + { + "epoch": 0.46, + "grad_norm": 0.5694128274917603, + "learning_rate": 0.0002938770014426244, + "loss": 3.2667, + "step": 3598 + }, + { + "epoch": 0.46, + "grad_norm": 0.6147801876068115, + "learning_rate": 0.00029377495105718993, + "loss": 3.5328, + "step": 3599 + }, + { + "epoch": 0.46, + "grad_norm": 0.5825375318527222, + "learning_rate": 0.00029367289314633383, + "loss": 3.3569, + "step": 3600 + }, + { + "epoch": 0.46, + "grad_norm": 0.6331030130386353, + "learning_rate": 0.00029357082772760103, + "loss": 3.3965, + "step": 3601 + }, + { + "epoch": 0.46, + "grad_norm": 0.6068410873413086, + "learning_rate": 0.000293468754818538, + "loss": 3.2792, + "step": 3602 + }, + { + "epoch": 0.46, + "grad_norm": 0.6427342891693115, + "learning_rate": 0.00029336667443669197, + "loss": 3.3827, + "step": 3603 + }, + { + "epoch": 0.46, + "grad_norm": 0.6383602619171143, + "learning_rate": 0.0002932645865996119, + "loss": 3.3444, + "step": 3604 + }, + { + "epoch": 0.46, + "grad_norm": 0.671267569065094, + "learning_rate": 0.00029316249132484775, + "loss": 3.4324, + "step": 3605 + }, + { + "epoch": 0.46, + "grad_norm": 0.6190444231033325, + "learning_rate": 0.000293060388629951, + "loss": 3.3451, + "step": 3606 + }, + { + "epoch": 0.46, + "grad_norm": 0.6308755278587341, + "learning_rate": 0.0002929582785324743, + "loss": 3.4204, + "step": 3607 + }, + { + "epoch": 0.46, + "grad_norm": 0.6411634087562561, + "learning_rate": 0.0002928561610499714, + "loss": 3.434, + "step": 3608 + }, + { + "epoch": 0.46, + "grad_norm": 0.6541515588760376, + "learning_rate": 0.0002927540361999977, + "loss": 3.3911, + "step": 3609 + }, + { + "epoch": 0.46, + "grad_norm": 0.6651307940483093, + "learning_rate": 0.0002926519040001094, + "loss": 3.2249, + "step": 3610 + }, + { + "epoch": 0.46, + "grad_norm": 0.6221703290939331, + "learning_rate": 0.00029254976446786447, + "loss": 3.2465, + "step": 3611 + }, + { + "epoch": 0.46, + "grad_norm": 0.6201170086860657, + "learning_rate": 0.00029244761762082166, + "loss": 3.3435, + "step": 3612 + }, + { + "epoch": 0.46, + "grad_norm": 0.7041198015213013, + "learning_rate": 0.00029234546347654133, + "loss": 3.3102, + "step": 3613 + }, + { + "epoch": 0.46, + "grad_norm": 0.6195172667503357, + "learning_rate": 0.0002922433020525849, + "loss": 3.4033, + "step": 3614 + }, + { + "epoch": 0.46, + "grad_norm": 0.5839033722877502, + "learning_rate": 0.0002921411333665151, + "loss": 3.4524, + "step": 3615 + }, + { + "epoch": 0.46, + "grad_norm": 0.5902976393699646, + "learning_rate": 0.00029203895743589597, + "loss": 3.3454, + "step": 3616 + }, + { + "epoch": 0.46, + "grad_norm": 0.583528459072113, + "learning_rate": 0.00029193677427829276, + "loss": 3.365, + "step": 3617 + }, + { + "epoch": 0.46, + "grad_norm": 0.6564446091651917, + "learning_rate": 0.0002918345839112717, + "loss": 3.3221, + "step": 3618 + }, + { + "epoch": 0.46, + "grad_norm": 0.6363897919654846, + "learning_rate": 0.0002917323863524008, + "loss": 3.349, + "step": 3619 + }, + { + "epoch": 0.46, + "grad_norm": 0.6170995831489563, + "learning_rate": 0.00029163018161924885, + "loss": 3.3675, + "step": 3620 + }, + { + "epoch": 0.46, + "grad_norm": 0.5862497687339783, + "learning_rate": 0.0002915279697293862, + "loss": 3.2222, + "step": 3621 + }, + { + "epoch": 0.46, + "grad_norm": 0.6244729161262512, + "learning_rate": 0.0002914257507003839, + "loss": 3.343, + "step": 3622 + }, + { + "epoch": 0.46, + "grad_norm": 0.6157928109169006, + "learning_rate": 0.0002913235245498149, + "loss": 3.3368, + "step": 3623 + }, + { + "epoch": 0.46, + "grad_norm": 0.5982532501220703, + "learning_rate": 0.000291221291295253, + "loss": 3.3366, + "step": 3624 + }, + { + "epoch": 0.46, + "grad_norm": 0.6363582015037537, + "learning_rate": 0.00029111905095427325, + "loss": 3.3574, + "step": 3625 + }, + { + "epoch": 0.46, + "grad_norm": 0.6117314100265503, + "learning_rate": 0.00029101680354445185, + "loss": 3.3052, + "step": 3626 + }, + { + "epoch": 0.46, + "grad_norm": 0.6107704043388367, + "learning_rate": 0.0002909145490833666, + "loss": 3.3321, + "step": 3627 + }, + { + "epoch": 0.46, + "grad_norm": 0.6260195970535278, + "learning_rate": 0.00029081228758859583, + "loss": 3.3971, + "step": 3628 + }, + { + "epoch": 0.46, + "grad_norm": 0.7223565578460693, + "learning_rate": 0.00029071001907771983, + "loss": 3.4466, + "step": 3629 + }, + { + "epoch": 0.46, + "grad_norm": 0.6113673448562622, + "learning_rate": 0.00029060774356831946, + "loss": 3.3126, + "step": 3630 + }, + { + "epoch": 0.46, + "grad_norm": 0.6077743768692017, + "learning_rate": 0.0002905054610779774, + "loss": 3.3025, + "step": 3631 + }, + { + "epoch": 0.46, + "grad_norm": 0.6696673631668091, + "learning_rate": 0.00029040317162427686, + "loss": 3.3966, + "step": 3632 + }, + { + "epoch": 0.47, + "grad_norm": 0.5980106592178345, + "learning_rate": 0.0002903008752248027, + "loss": 3.3802, + "step": 3633 + }, + { + "epoch": 0.47, + "grad_norm": 0.666287362575531, + "learning_rate": 0.00029019857189714095, + "loss": 3.3208, + "step": 3634 + }, + { + "epoch": 0.47, + "grad_norm": 0.6521688103675842, + "learning_rate": 0.0002900962616588787, + "loss": 3.3544, + "step": 3635 + }, + { + "epoch": 0.47, + "grad_norm": 0.5930438041687012, + "learning_rate": 0.00028999394452760413, + "loss": 3.3636, + "step": 3636 + }, + { + "epoch": 0.47, + "grad_norm": 0.6384075880050659, + "learning_rate": 0.000289891620520907, + "loss": 3.299, + "step": 3637 + }, + { + "epoch": 0.47, + "grad_norm": 0.6537858247756958, + "learning_rate": 0.0002897892896563777, + "loss": 3.4557, + "step": 3638 + }, + { + "epoch": 0.47, + "grad_norm": 0.5836814641952515, + "learning_rate": 0.00028968695195160824, + "loss": 3.4368, + "step": 3639 + }, + { + "epoch": 0.47, + "grad_norm": 0.6711363792419434, + "learning_rate": 0.0002895846074241917, + "loss": 3.4193, + "step": 3640 + }, + { + "epoch": 0.47, + "grad_norm": 0.5976700782775879, + "learning_rate": 0.00028948225609172224, + "loss": 3.2866, + "step": 3641 + }, + { + "epoch": 0.47, + "grad_norm": 0.6466230750083923, + "learning_rate": 0.0002893798979717952, + "loss": 3.4144, + "step": 3642 + }, + { + "epoch": 0.47, + "grad_norm": 0.6126176714897156, + "learning_rate": 0.0002892775330820072, + "loss": 3.3428, + "step": 3643 + }, + { + "epoch": 0.47, + "grad_norm": 0.6454432606697083, + "learning_rate": 0.00028917516143995594, + "loss": 3.4041, + "step": 3644 + }, + { + "epoch": 0.47, + "grad_norm": 0.5797054171562195, + "learning_rate": 0.00028907278306324025, + "loss": 3.3597, + "step": 3645 + }, + { + "epoch": 0.47, + "grad_norm": 0.6614933013916016, + "learning_rate": 0.0002889703979694602, + "loss": 3.3663, + "step": 3646 + }, + { + "epoch": 0.47, + "grad_norm": 0.6284923553466797, + "learning_rate": 0.0002888680061762169, + "loss": 3.3435, + "step": 3647 + }, + { + "epoch": 0.47, + "grad_norm": 0.633361279964447, + "learning_rate": 0.00028876560770111276, + "loss": 3.3691, + "step": 3648 + }, + { + "epoch": 0.47, + "grad_norm": 0.6137580871582031, + "learning_rate": 0.00028866320256175126, + "loss": 3.4468, + "step": 3649 + }, + { + "epoch": 0.47, + "grad_norm": 0.6484012603759766, + "learning_rate": 0.00028856079077573704, + "loss": 3.3441, + "step": 3650 + }, + { + "epoch": 0.47, + "grad_norm": 0.6236936450004578, + "learning_rate": 0.00028845837236067583, + "loss": 3.2894, + "step": 3651 + }, + { + "epoch": 0.47, + "grad_norm": 0.6233891248703003, + "learning_rate": 0.00028835594733417455, + "loss": 3.4351, + "step": 3652 + }, + { + "epoch": 0.47, + "grad_norm": 0.6302180886268616, + "learning_rate": 0.0002882535157138413, + "loss": 3.3731, + "step": 3653 + }, + { + "epoch": 0.47, + "grad_norm": 0.6548006534576416, + "learning_rate": 0.0002881510775172851, + "loss": 3.4157, + "step": 3654 + }, + { + "epoch": 0.47, + "grad_norm": 0.5937785506248474, + "learning_rate": 0.0002880486327621166, + "loss": 3.4101, + "step": 3655 + }, + { + "epoch": 0.47, + "grad_norm": 0.5982134938240051, + "learning_rate": 0.00028794618146594684, + "loss": 3.4519, + "step": 3656 + }, + { + "epoch": 0.47, + "grad_norm": 0.6334812045097351, + "learning_rate": 0.0002878437236463887, + "loss": 3.3228, + "step": 3657 + }, + { + "epoch": 0.47, + "grad_norm": 0.5932255983352661, + "learning_rate": 0.00028774125932105567, + "loss": 3.4541, + "step": 3658 + }, + { + "epoch": 0.47, + "grad_norm": 0.6145234704017639, + "learning_rate": 0.0002876387885075627, + "loss": 3.3212, + "step": 3659 + }, + { + "epoch": 0.47, + "grad_norm": 0.6053762435913086, + "learning_rate": 0.00028753631122352565, + "loss": 3.3865, + "step": 3660 + }, + { + "epoch": 0.47, + "grad_norm": 0.6523486375808716, + "learning_rate": 0.00028743382748656145, + "loss": 3.4043, + "step": 3661 + }, + { + "epoch": 0.47, + "grad_norm": 0.6297605037689209, + "learning_rate": 0.00028733133731428836, + "loss": 3.3975, + "step": 3662 + }, + { + "epoch": 0.47, + "grad_norm": 0.6141480803489685, + "learning_rate": 0.0002872288407243256, + "loss": 3.4173, + "step": 3663 + }, + { + "epoch": 0.47, + "grad_norm": 0.5871593952178955, + "learning_rate": 0.00028712633773429356, + "loss": 3.2913, + "step": 3664 + }, + { + "epoch": 0.47, + "grad_norm": 0.6223506331443787, + "learning_rate": 0.0002870238283618136, + "loss": 3.4494, + "step": 3665 + }, + { + "epoch": 0.47, + "grad_norm": 0.591840386390686, + "learning_rate": 0.0002869213126245084, + "loss": 3.3138, + "step": 3666 + }, + { + "epoch": 0.47, + "grad_norm": 0.5490025877952576, + "learning_rate": 0.0002868187905400015, + "loss": 3.2517, + "step": 3667 + }, + { + "epoch": 0.47, + "grad_norm": 0.5901401042938232, + "learning_rate": 0.0002867162621259177, + "loss": 3.3777, + "step": 3668 + }, + { + "epoch": 0.47, + "grad_norm": 0.5850833058357239, + "learning_rate": 0.00028661372739988275, + "loss": 3.4323, + "step": 3669 + }, + { + "epoch": 0.47, + "grad_norm": 0.6089956164360046, + "learning_rate": 0.00028651118637952356, + "loss": 3.3793, + "step": 3670 + }, + { + "epoch": 0.47, + "grad_norm": 0.6123780608177185, + "learning_rate": 0.00028640863908246825, + "loss": 3.3569, + "step": 3671 + }, + { + "epoch": 0.47, + "grad_norm": 0.6722397208213806, + "learning_rate": 0.00028630608552634567, + "loss": 3.4235, + "step": 3672 + }, + { + "epoch": 0.47, + "grad_norm": 0.6255530714988708, + "learning_rate": 0.0002862035257287862, + "loss": 3.3247, + "step": 3673 + }, + { + "epoch": 0.47, + "grad_norm": 0.6261695623397827, + "learning_rate": 0.00028610095970742097, + "loss": 3.3805, + "step": 3674 + }, + { + "epoch": 0.47, + "grad_norm": 0.6323671936988831, + "learning_rate": 0.00028599838747988215, + "loss": 3.3112, + "step": 3675 + }, + { + "epoch": 0.47, + "grad_norm": 0.6552383303642273, + "learning_rate": 0.00028589580906380324, + "loss": 3.4024, + "step": 3676 + }, + { + "epoch": 0.47, + "grad_norm": 0.6854276657104492, + "learning_rate": 0.0002857932244768186, + "loss": 3.5205, + "step": 3677 + }, + { + "epoch": 0.47, + "grad_norm": 0.6248034834861755, + "learning_rate": 0.0002856906337365638, + "loss": 3.3893, + "step": 3678 + }, + { + "epoch": 0.47, + "grad_norm": 0.6380196809768677, + "learning_rate": 0.0002855880368606752, + "loss": 3.2897, + "step": 3679 + }, + { + "epoch": 0.47, + "grad_norm": 0.6448647379875183, + "learning_rate": 0.0002854854338667905, + "loss": 3.4351, + "step": 3680 + }, + { + "epoch": 0.47, + "grad_norm": 0.668107271194458, + "learning_rate": 0.0002853828247725484, + "loss": 3.3766, + "step": 3681 + }, + { + "epoch": 0.47, + "grad_norm": 0.6216070055961609, + "learning_rate": 0.0002852802095955885, + "loss": 3.4826, + "step": 3682 + }, + { + "epoch": 0.47, + "grad_norm": 0.6218451857566833, + "learning_rate": 0.0002851775883535515, + "loss": 3.3688, + "step": 3683 + }, + { + "epoch": 0.47, + "grad_norm": 0.6319736242294312, + "learning_rate": 0.0002850749610640793, + "loss": 3.3736, + "step": 3684 + }, + { + "epoch": 0.47, + "grad_norm": 0.6772602200508118, + "learning_rate": 0.00028497232774481467, + "loss": 3.4899, + "step": 3685 + }, + { + "epoch": 0.47, + "grad_norm": 0.6682306528091431, + "learning_rate": 0.0002848696884134016, + "loss": 3.4507, + "step": 3686 + }, + { + "epoch": 0.47, + "grad_norm": 0.6152041554450989, + "learning_rate": 0.00028476704308748465, + "loss": 3.2639, + "step": 3687 + }, + { + "epoch": 0.47, + "grad_norm": 0.6305086612701416, + "learning_rate": 0.00028466439178471, + "loss": 3.2784, + "step": 3688 + }, + { + "epoch": 0.47, + "grad_norm": 0.6088946461677551, + "learning_rate": 0.00028456173452272456, + "loss": 3.3655, + "step": 3689 + }, + { + "epoch": 0.47, + "grad_norm": 0.6327491402626038, + "learning_rate": 0.0002844590713191762, + "loss": 3.3668, + "step": 3690 + }, + { + "epoch": 0.47, + "grad_norm": 0.5848463177680969, + "learning_rate": 0.0002843564021917141, + "loss": 3.3243, + "step": 3691 + }, + { + "epoch": 0.47, + "grad_norm": 0.664267361164093, + "learning_rate": 0.00028425372715798816, + "loss": 3.5031, + "step": 3692 + }, + { + "epoch": 0.47, + "grad_norm": 0.6153045296669006, + "learning_rate": 0.0002841510462356494, + "loss": 3.3878, + "step": 3693 + }, + { + "epoch": 0.47, + "grad_norm": 0.6261101365089417, + "learning_rate": 0.0002840483594423498, + "loss": 3.4501, + "step": 3694 + }, + { + "epoch": 0.47, + "grad_norm": 0.6328792572021484, + "learning_rate": 0.00028394566679574256, + "loss": 3.4624, + "step": 3695 + }, + { + "epoch": 0.47, + "grad_norm": 0.6128214001655579, + "learning_rate": 0.0002838429683134817, + "loss": 3.2798, + "step": 3696 + }, + { + "epoch": 0.47, + "grad_norm": 0.6675438284873962, + "learning_rate": 0.00028374026401322224, + "loss": 3.3174, + "step": 3697 + }, + { + "epoch": 0.47, + "grad_norm": 0.6769039630889893, + "learning_rate": 0.00028363755391262014, + "loss": 3.2651, + "step": 3698 + }, + { + "epoch": 0.47, + "grad_norm": 0.6832557916641235, + "learning_rate": 0.00028353483802933267, + "loss": 3.4916, + "step": 3699 + }, + { + "epoch": 0.47, + "grad_norm": 0.6211268901824951, + "learning_rate": 0.0002834321163810178, + "loss": 3.2836, + "step": 3700 + }, + { + "epoch": 0.47, + "grad_norm": 0.6350568532943726, + "learning_rate": 0.0002833293889853345, + "loss": 3.2902, + "step": 3701 + }, + { + "epoch": 0.47, + "grad_norm": 0.631552517414093, + "learning_rate": 0.0002832266558599428, + "loss": 3.293, + "step": 3702 + }, + { + "epoch": 0.47, + "grad_norm": 0.6681779026985168, + "learning_rate": 0.0002831239170225038, + "loss": 3.1951, + "step": 3703 + }, + { + "epoch": 0.47, + "grad_norm": 0.6160247325897217, + "learning_rate": 0.0002830211724906794, + "loss": 3.4099, + "step": 3704 + }, + { + "epoch": 0.47, + "grad_norm": 0.5667205452919006, + "learning_rate": 0.0002829184222821327, + "loss": 3.2883, + "step": 3705 + }, + { + "epoch": 0.47, + "grad_norm": 0.6098401546478271, + "learning_rate": 0.00028281566641452763, + "loss": 3.3686, + "step": 3706 + }, + { + "epoch": 0.47, + "grad_norm": 0.6125772595405579, + "learning_rate": 0.000282712904905529, + "loss": 3.3227, + "step": 3707 + }, + { + "epoch": 0.47, + "grad_norm": 0.6355741024017334, + "learning_rate": 0.00028261013777280275, + "loss": 3.4768, + "step": 3708 + }, + { + "epoch": 0.47, + "grad_norm": 0.5942621231079102, + "learning_rate": 0.00028250736503401586, + "loss": 3.3359, + "step": 3709 + }, + { + "epoch": 0.47, + "grad_norm": 0.6091006994247437, + "learning_rate": 0.000282404586706836, + "loss": 3.1985, + "step": 3710 + }, + { + "epoch": 0.48, + "grad_norm": 0.60381019115448, + "learning_rate": 0.000282301802808932, + "loss": 3.4062, + "step": 3711 + }, + { + "epoch": 0.48, + "grad_norm": 0.5802066922187805, + "learning_rate": 0.00028219901335797365, + "loss": 3.4317, + "step": 3712 + }, + { + "epoch": 0.48, + "grad_norm": 0.6418296098709106, + "learning_rate": 0.00028209621837163167, + "loss": 3.4678, + "step": 3713 + }, + { + "epoch": 0.48, + "grad_norm": 0.6086550354957581, + "learning_rate": 0.00028199341786757764, + "loss": 3.2207, + "step": 3714 + }, + { + "epoch": 0.48, + "grad_norm": 0.8619450926780701, + "learning_rate": 0.0002818906118634841, + "loss": 3.5654, + "step": 3715 + }, + { + "epoch": 0.48, + "grad_norm": 0.6018812656402588, + "learning_rate": 0.0002817878003770247, + "loss": 3.4463, + "step": 3716 + }, + { + "epoch": 0.48, + "grad_norm": 0.6410269737243652, + "learning_rate": 0.000281684983425874, + "loss": 3.3419, + "step": 3717 + }, + { + "epoch": 0.48, + "grad_norm": 0.5836138129234314, + "learning_rate": 0.0002815821610277073, + "loss": 3.3654, + "step": 3718 + }, + { + "epoch": 0.48, + "grad_norm": 0.5505456328392029, + "learning_rate": 0.00028147933320020084, + "loss": 3.3156, + "step": 3719 + }, + { + "epoch": 0.48, + "grad_norm": 0.6228007674217224, + "learning_rate": 0.0002813764999610322, + "loss": 3.3534, + "step": 3720 + }, + { + "epoch": 0.48, + "grad_norm": 0.6110643744468689, + "learning_rate": 0.0002812736613278794, + "loss": 3.3265, + "step": 3721 + }, + { + "epoch": 0.48, + "grad_norm": 0.6205087304115295, + "learning_rate": 0.0002811708173184217, + "loss": 3.3997, + "step": 3722 + }, + { + "epoch": 0.48, + "grad_norm": 0.605631411075592, + "learning_rate": 0.00028106796795033916, + "loss": 3.3011, + "step": 3723 + }, + { + "epoch": 0.48, + "grad_norm": 0.6018489599227905, + "learning_rate": 0.00028096511324131275, + "loss": 3.3225, + "step": 3724 + }, + { + "epoch": 0.48, + "grad_norm": 0.6378231048583984, + "learning_rate": 0.0002808622532090244, + "loss": 3.4259, + "step": 3725 + }, + { + "epoch": 0.48, + "grad_norm": 0.6305454969406128, + "learning_rate": 0.00028075938787115685, + "loss": 3.2959, + "step": 3726 + }, + { + "epoch": 0.48, + "grad_norm": 0.6639338135719299, + "learning_rate": 0.00028065651724539404, + "loss": 3.3189, + "step": 3727 + }, + { + "epoch": 0.48, + "grad_norm": 0.6513733863830566, + "learning_rate": 0.00028055364134942047, + "loss": 3.3276, + "step": 3728 + }, + { + "epoch": 0.48, + "grad_norm": 0.5964550971984863, + "learning_rate": 0.00028045076020092176, + "loss": 3.3355, + "step": 3729 + }, + { + "epoch": 0.48, + "grad_norm": 0.6000546216964722, + "learning_rate": 0.00028034787381758433, + "loss": 3.3898, + "step": 3730 + }, + { + "epoch": 0.48, + "grad_norm": 0.601989209651947, + "learning_rate": 0.00028024498221709555, + "loss": 3.2769, + "step": 3731 + }, + { + "epoch": 0.48, + "grad_norm": 0.6208813786506653, + "learning_rate": 0.0002801420854171438, + "loss": 3.3688, + "step": 3732 + }, + { + "epoch": 0.48, + "grad_norm": 0.5917370915412903, + "learning_rate": 0.000280039183435418, + "loss": 3.4519, + "step": 3733 + }, + { + "epoch": 0.48, + "grad_norm": 0.6015068888664246, + "learning_rate": 0.0002799362762896084, + "loss": 3.3931, + "step": 3734 + }, + { + "epoch": 0.48, + "grad_norm": 0.5841699242591858, + "learning_rate": 0.00027983336399740584, + "loss": 3.4089, + "step": 3735 + }, + { + "epoch": 0.48, + "grad_norm": 0.6215444803237915, + "learning_rate": 0.00027973044657650213, + "loss": 3.3379, + "step": 3736 + }, + { + "epoch": 0.48, + "grad_norm": 0.6177813410758972, + "learning_rate": 0.00027962752404459, + "loss": 3.3657, + "step": 3737 + }, + { + "epoch": 0.48, + "grad_norm": 0.6230924129486084, + "learning_rate": 0.0002795245964193631, + "loss": 3.3429, + "step": 3738 + }, + { + "epoch": 0.48, + "grad_norm": 0.6166138052940369, + "learning_rate": 0.00027942166371851566, + "loss": 3.4291, + "step": 3739 + }, + { + "epoch": 0.48, + "grad_norm": 0.5767776370048523, + "learning_rate": 0.0002793187259597432, + "loss": 3.3826, + "step": 3740 + }, + { + "epoch": 0.48, + "grad_norm": 0.6381042003631592, + "learning_rate": 0.0002792157831607418, + "loss": 3.3958, + "step": 3741 + }, + { + "epoch": 0.48, + "grad_norm": 0.6046695113182068, + "learning_rate": 0.00027911283533920865, + "loss": 3.358, + "step": 3742 + }, + { + "epoch": 0.48, + "grad_norm": 0.6162915825843811, + "learning_rate": 0.0002790098825128416, + "loss": 3.3353, + "step": 3743 + }, + { + "epoch": 0.48, + "grad_norm": 0.5994752645492554, + "learning_rate": 0.0002789069246993394, + "loss": 3.2982, + "step": 3744 + }, + { + "epoch": 0.48, + "grad_norm": 0.6190649271011353, + "learning_rate": 0.0002788039619164017, + "loss": 3.4497, + "step": 3745 + }, + { + "epoch": 0.48, + "grad_norm": 0.6186133027076721, + "learning_rate": 0.0002787009941817291, + "loss": 3.464, + "step": 3746 + }, + { + "epoch": 0.48, + "grad_norm": 0.6915348172187805, + "learning_rate": 0.0002785980215130228, + "loss": 3.3655, + "step": 3747 + }, + { + "epoch": 0.48, + "grad_norm": 0.613624095916748, + "learning_rate": 0.0002784950439279852, + "loss": 3.4417, + "step": 3748 + }, + { + "epoch": 0.48, + "grad_norm": 0.6338350772857666, + "learning_rate": 0.00027839206144431906, + "loss": 3.3845, + "step": 3749 + }, + { + "epoch": 0.48, + "grad_norm": 0.6367514729499817, + "learning_rate": 0.0002782890740797285, + "loss": 3.4026, + "step": 3750 + }, + { + "epoch": 0.48, + "grad_norm": 0.6197525262832642, + "learning_rate": 0.0002781860818519181, + "loss": 3.4527, + "step": 3751 + }, + { + "epoch": 0.48, + "grad_norm": 0.6463971734046936, + "learning_rate": 0.0002780830847785935, + "loss": 3.4799, + "step": 3752 + }, + { + "epoch": 0.48, + "grad_norm": 0.6383652687072754, + "learning_rate": 0.00027798008287746096, + "loss": 3.3539, + "step": 3753 + }, + { + "epoch": 0.48, + "grad_norm": 0.6330506205558777, + "learning_rate": 0.0002778770761662279, + "loss": 3.4025, + "step": 3754 + }, + { + "epoch": 0.48, + "grad_norm": 0.6345783472061157, + "learning_rate": 0.0002777740646626022, + "loss": 3.3139, + "step": 3755 + }, + { + "epoch": 0.48, + "grad_norm": 0.6289472579956055, + "learning_rate": 0.00027767104838429283, + "loss": 3.4035, + "step": 3756 + }, + { + "epoch": 0.48, + "grad_norm": 0.6420667767524719, + "learning_rate": 0.00027756802734900957, + "loss": 3.3938, + "step": 3757 + }, + { + "epoch": 0.48, + "grad_norm": 0.7068736553192139, + "learning_rate": 0.0002774650015744626, + "loss": 3.3751, + "step": 3758 + }, + { + "epoch": 0.48, + "grad_norm": 0.6125309467315674, + "learning_rate": 0.00027736197107836356, + "loss": 3.3893, + "step": 3759 + }, + { + "epoch": 0.48, + "grad_norm": 0.6480317115783691, + "learning_rate": 0.00027725893587842457, + "loss": 3.3795, + "step": 3760 + }, + { + "epoch": 0.48, + "grad_norm": 0.6217477917671204, + "learning_rate": 0.00027715589599235834, + "loss": 3.404, + "step": 3761 + }, + { + "epoch": 0.48, + "grad_norm": 0.6079472303390503, + "learning_rate": 0.0002770528514378788, + "loss": 3.4794, + "step": 3762 + }, + { + "epoch": 0.48, + "grad_norm": 0.6414199471473694, + "learning_rate": 0.00027694980223270054, + "loss": 3.3759, + "step": 3763 + }, + { + "epoch": 0.48, + "grad_norm": 0.6436086297035217, + "learning_rate": 0.00027684674839453886, + "loss": 3.5266, + "step": 3764 + }, + { + "epoch": 0.48, + "grad_norm": 0.594540536403656, + "learning_rate": 0.0002767436899411098, + "loss": 3.3593, + "step": 3765 + }, + { + "epoch": 0.48, + "grad_norm": 0.6494881510734558, + "learning_rate": 0.00027664062689013055, + "loss": 3.3226, + "step": 3766 + }, + { + "epoch": 0.48, + "grad_norm": 0.6552349925041199, + "learning_rate": 0.0002765375592593186, + "loss": 3.3331, + "step": 3767 + }, + { + "epoch": 0.48, + "grad_norm": 0.6475743651390076, + "learning_rate": 0.0002764344870663925, + "loss": 3.3533, + "step": 3768 + }, + { + "epoch": 0.48, + "grad_norm": 0.6364050507545471, + "learning_rate": 0.00027633141032907166, + "loss": 3.497, + "step": 3769 + }, + { + "epoch": 0.48, + "grad_norm": 0.5534500479698181, + "learning_rate": 0.00027622832906507624, + "loss": 3.2724, + "step": 3770 + }, + { + "epoch": 0.48, + "grad_norm": 0.6304947137832642, + "learning_rate": 0.00027612524329212685, + "loss": 3.3515, + "step": 3771 + }, + { + "epoch": 0.48, + "grad_norm": 0.6687121987342834, + "learning_rate": 0.0002760221530279453, + "loss": 3.4175, + "step": 3772 + }, + { + "epoch": 0.48, + "grad_norm": 0.618787407875061, + "learning_rate": 0.000275919058290254, + "loss": 3.5104, + "step": 3773 + }, + { + "epoch": 0.48, + "grad_norm": 0.6386235356330872, + "learning_rate": 0.00027581595909677607, + "loss": 3.3924, + "step": 3774 + }, + { + "epoch": 0.48, + "grad_norm": 0.6412214040756226, + "learning_rate": 0.00027571285546523555, + "loss": 3.2804, + "step": 3775 + }, + { + "epoch": 0.48, + "grad_norm": 0.6224966049194336, + "learning_rate": 0.00027560974741335696, + "loss": 3.3536, + "step": 3776 + }, + { + "epoch": 0.48, + "grad_norm": 0.6877733469009399, + "learning_rate": 0.000275506634958866, + "loss": 3.4325, + "step": 3777 + }, + { + "epoch": 0.48, + "grad_norm": 0.6142483949661255, + "learning_rate": 0.0002754035181194888, + "loss": 3.4013, + "step": 3778 + }, + { + "epoch": 0.48, + "grad_norm": 0.6085824966430664, + "learning_rate": 0.00027530039691295224, + "loss": 3.3554, + "step": 3779 + }, + { + "epoch": 0.48, + "grad_norm": 0.5958852171897888, + "learning_rate": 0.0002751972713569842, + "loss": 3.3051, + "step": 3780 + }, + { + "epoch": 0.48, + "grad_norm": 0.5997690558433533, + "learning_rate": 0.00027509414146931313, + "loss": 3.3604, + "step": 3781 + }, + { + "epoch": 0.48, + "grad_norm": 0.6032097339630127, + "learning_rate": 0.00027499100726766817, + "loss": 3.3303, + "step": 3782 + }, + { + "epoch": 0.48, + "grad_norm": 0.5886170268058777, + "learning_rate": 0.0002748878687697794, + "loss": 3.324, + "step": 3783 + }, + { + "epoch": 0.48, + "grad_norm": 0.5861546397209167, + "learning_rate": 0.0002747847259933774, + "loss": 3.4258, + "step": 3784 + }, + { + "epoch": 0.48, + "grad_norm": 0.6559439301490784, + "learning_rate": 0.0002746815789561938, + "loss": 3.4873, + "step": 3785 + }, + { + "epoch": 0.48, + "grad_norm": 0.6742950081825256, + "learning_rate": 0.0002745784276759605, + "loss": 3.3851, + "step": 3786 + }, + { + "epoch": 0.48, + "grad_norm": 0.6020086407661438, + "learning_rate": 0.00027447527217041055, + "loss": 3.4026, + "step": 3787 + }, + { + "epoch": 0.48, + "grad_norm": 0.624190092086792, + "learning_rate": 0.00027437211245727764, + "loss": 3.4239, + "step": 3788 + }, + { + "epoch": 0.48, + "grad_norm": 0.5968867540359497, + "learning_rate": 0.0002742689485542961, + "loss": 3.4466, + "step": 3789 + }, + { + "epoch": 0.49, + "grad_norm": 0.6325806975364685, + "learning_rate": 0.0002741657804792008, + "loss": 3.2536, + "step": 3790 + }, + { + "epoch": 0.49, + "grad_norm": 0.595104992389679, + "learning_rate": 0.0002740626082497277, + "loss": 3.2624, + "step": 3791 + }, + { + "epoch": 0.49, + "grad_norm": 0.6479557752609253, + "learning_rate": 0.00027395943188361345, + "loss": 3.4156, + "step": 3792 + }, + { + "epoch": 0.49, + "grad_norm": 0.6428397297859192, + "learning_rate": 0.000273856251398595, + "loss": 3.4348, + "step": 3793 + }, + { + "epoch": 0.49, + "grad_norm": 0.571712851524353, + "learning_rate": 0.0002737530668124104, + "loss": 3.407, + "step": 3794 + }, + { + "epoch": 0.49, + "grad_norm": 0.5848454236984253, + "learning_rate": 0.0002736498781427981, + "loss": 3.2116, + "step": 3795 + }, + { + "epoch": 0.49, + "grad_norm": 0.6058680415153503, + "learning_rate": 0.00027354668540749774, + "loss": 3.3176, + "step": 3796 + }, + { + "epoch": 0.49, + "grad_norm": 0.6119828820228577, + "learning_rate": 0.0002734434886242491, + "loss": 3.3076, + "step": 3797 + }, + { + "epoch": 0.49, + "grad_norm": 0.6115705370903015, + "learning_rate": 0.000273340287810793, + "loss": 3.2607, + "step": 3798 + }, + { + "epoch": 0.49, + "grad_norm": 0.6166749596595764, + "learning_rate": 0.00027323708298487094, + "loss": 3.3529, + "step": 3799 + }, + { + "epoch": 0.49, + "grad_norm": 0.6365509629249573, + "learning_rate": 0.0002731338741642248, + "loss": 3.3274, + "step": 3800 + }, + { + "epoch": 0.49, + "grad_norm": 0.6337140798568726, + "learning_rate": 0.0002730306613665975, + "loss": 3.3724, + "step": 3801 + }, + { + "epoch": 0.49, + "grad_norm": 0.6156590580940247, + "learning_rate": 0.00027292744460973256, + "loss": 3.2998, + "step": 3802 + }, + { + "epoch": 0.49, + "grad_norm": 0.6068218350410461, + "learning_rate": 0.0002728242239113741, + "loss": 3.3315, + "step": 3803 + }, + { + "epoch": 0.49, + "grad_norm": 0.5837210416793823, + "learning_rate": 0.00027272099928926695, + "loss": 3.3249, + "step": 3804 + }, + { + "epoch": 0.49, + "grad_norm": 0.6302131414413452, + "learning_rate": 0.00027261777076115657, + "loss": 3.3445, + "step": 3805 + }, + { + "epoch": 0.49, + "grad_norm": 0.5965090990066528, + "learning_rate": 0.0002725145383447893, + "loss": 3.4302, + "step": 3806 + }, + { + "epoch": 0.49, + "grad_norm": 0.5544455647468567, + "learning_rate": 0.0002724113020579118, + "loss": 3.2505, + "step": 3807 + }, + { + "epoch": 0.49, + "grad_norm": 0.6283453106880188, + "learning_rate": 0.00027230806191827164, + "loss": 3.3896, + "step": 3808 + }, + { + "epoch": 0.49, + "grad_norm": 0.6319368481636047, + "learning_rate": 0.00027220481794361704, + "loss": 3.372, + "step": 3809 + }, + { + "epoch": 0.49, + "grad_norm": 0.5720791220664978, + "learning_rate": 0.00027210157015169687, + "loss": 3.2416, + "step": 3810 + }, + { + "epoch": 0.49, + "grad_norm": 0.6046373844146729, + "learning_rate": 0.00027199831856026044, + "loss": 3.1444, + "step": 3811 + }, + { + "epoch": 0.49, + "grad_norm": 0.6764358282089233, + "learning_rate": 0.00027189506318705815, + "loss": 3.3285, + "step": 3812 + }, + { + "epoch": 0.49, + "grad_norm": 0.6244410872459412, + "learning_rate": 0.0002717918040498406, + "loss": 3.3421, + "step": 3813 + }, + { + "epoch": 0.49, + "grad_norm": 0.6250739693641663, + "learning_rate": 0.0002716885411663593, + "loss": 3.4526, + "step": 3814 + }, + { + "epoch": 0.49, + "grad_norm": 0.600410521030426, + "learning_rate": 0.0002715852745543663, + "loss": 3.4254, + "step": 3815 + }, + { + "epoch": 0.49, + "grad_norm": 0.6418642401695251, + "learning_rate": 0.0002714820042316145, + "loss": 3.1948, + "step": 3816 + }, + { + "epoch": 0.49, + "grad_norm": 0.639212429523468, + "learning_rate": 0.0002713787302158571, + "loss": 3.2942, + "step": 3817 + }, + { + "epoch": 0.49, + "grad_norm": 0.6001421809196472, + "learning_rate": 0.000271275452524848, + "loss": 3.3185, + "step": 3818 + }, + { + "epoch": 0.49, + "grad_norm": 0.6354403495788574, + "learning_rate": 0.00027117217117634206, + "loss": 3.3556, + "step": 3819 + }, + { + "epoch": 0.49, + "grad_norm": 0.6731590032577515, + "learning_rate": 0.0002710688861880945, + "loss": 3.4374, + "step": 3820 + }, + { + "epoch": 0.49, + "grad_norm": 0.6121606230735779, + "learning_rate": 0.0002709655975778611, + "loss": 3.3958, + "step": 3821 + }, + { + "epoch": 0.49, + "grad_norm": 0.6243732571601868, + "learning_rate": 0.0002708623053633984, + "loss": 3.3648, + "step": 3822 + }, + { + "epoch": 0.49, + "grad_norm": 0.6269816756248474, + "learning_rate": 0.00027075900956246353, + "loss": 3.3031, + "step": 3823 + }, + { + "epoch": 0.49, + "grad_norm": 0.6408427953720093, + "learning_rate": 0.0002706557101928143, + "loss": 3.2254, + "step": 3824 + }, + { + "epoch": 0.49, + "grad_norm": 0.629331648349762, + "learning_rate": 0.0002705524072722091, + "loss": 3.4062, + "step": 3825 + }, + { + "epoch": 0.49, + "grad_norm": 0.6464104652404785, + "learning_rate": 0.00027044910081840665, + "loss": 3.3696, + "step": 3826 + }, + { + "epoch": 0.49, + "grad_norm": 0.6373887658119202, + "learning_rate": 0.0002703457908491668, + "loss": 3.3367, + "step": 3827 + }, + { + "epoch": 0.49, + "grad_norm": 0.704435408115387, + "learning_rate": 0.0002702424773822497, + "loss": 3.3642, + "step": 3828 + }, + { + "epoch": 0.49, + "grad_norm": 0.634634792804718, + "learning_rate": 0.00027013916043541595, + "loss": 3.2776, + "step": 3829 + }, + { + "epoch": 0.49, + "grad_norm": 0.6082105040550232, + "learning_rate": 0.0002700358400264271, + "loss": 3.3809, + "step": 3830 + }, + { + "epoch": 0.49, + "grad_norm": 0.5796101689338684, + "learning_rate": 0.00026993251617304513, + "loss": 3.3239, + "step": 3831 + }, + { + "epoch": 0.49, + "grad_norm": 0.6139970421791077, + "learning_rate": 0.0002698291888930325, + "loss": 3.4051, + "step": 3832 + }, + { + "epoch": 0.49, + "grad_norm": 0.6833216547966003, + "learning_rate": 0.00026972585820415234, + "loss": 3.419, + "step": 3833 + }, + { + "epoch": 0.49, + "grad_norm": 0.6220802068710327, + "learning_rate": 0.0002696225241241686, + "loss": 3.358, + "step": 3834 + }, + { + "epoch": 0.49, + "grad_norm": 0.6287161707878113, + "learning_rate": 0.00026951918667084547, + "loss": 3.2347, + "step": 3835 + }, + { + "epoch": 0.49, + "grad_norm": 0.6108514070510864, + "learning_rate": 0.00026941584586194777, + "loss": 3.3711, + "step": 3836 + }, + { + "epoch": 0.49, + "grad_norm": 0.5812754034996033, + "learning_rate": 0.00026931250171524114, + "loss": 3.4014, + "step": 3837 + }, + { + "epoch": 0.49, + "grad_norm": 0.6192893981933594, + "learning_rate": 0.00026920915424849155, + "loss": 3.3689, + "step": 3838 + }, + { + "epoch": 0.49, + "grad_norm": 0.6789095401763916, + "learning_rate": 0.00026910580347946575, + "loss": 3.4586, + "step": 3839 + }, + { + "epoch": 0.49, + "grad_norm": 0.6124496459960938, + "learning_rate": 0.00026900244942593067, + "loss": 3.3106, + "step": 3840 + }, + { + "epoch": 0.49, + "grad_norm": 0.663774847984314, + "learning_rate": 0.00026889909210565434, + "loss": 3.4751, + "step": 3841 + }, + { + "epoch": 0.49, + "grad_norm": 0.6115865111351013, + "learning_rate": 0.00026879573153640497, + "loss": 3.4109, + "step": 3842 + }, + { + "epoch": 0.49, + "grad_norm": 0.5876237750053406, + "learning_rate": 0.00026869236773595145, + "loss": 3.3461, + "step": 3843 + }, + { + "epoch": 0.49, + "grad_norm": 0.6677536368370056, + "learning_rate": 0.0002685890007220632, + "loss": 3.3114, + "step": 3844 + }, + { + "epoch": 0.49, + "grad_norm": 0.6193956136703491, + "learning_rate": 0.0002684856305125103, + "loss": 3.3138, + "step": 3845 + }, + { + "epoch": 0.49, + "grad_norm": 0.5813229084014893, + "learning_rate": 0.0002683822571250631, + "loss": 3.3238, + "step": 3846 + }, + { + "epoch": 0.49, + "grad_norm": 0.7698580622673035, + "learning_rate": 0.00026827888057749275, + "loss": 3.2871, + "step": 3847 + }, + { + "epoch": 0.49, + "grad_norm": 0.6021171808242798, + "learning_rate": 0.0002681755008875711, + "loss": 3.2632, + "step": 3848 + }, + { + "epoch": 0.49, + "grad_norm": 0.590849757194519, + "learning_rate": 0.0002680721180730701, + "loss": 3.3846, + "step": 3849 + }, + { + "epoch": 0.49, + "grad_norm": 0.6216306686401367, + "learning_rate": 0.0002679687321517624, + "loss": 3.4654, + "step": 3850 + }, + { + "epoch": 0.49, + "grad_norm": 0.6439468860626221, + "learning_rate": 0.00026786534314142136, + "loss": 3.4023, + "step": 3851 + }, + { + "epoch": 0.49, + "grad_norm": 0.6653583645820618, + "learning_rate": 0.0002677619510598208, + "loss": 3.3591, + "step": 3852 + }, + { + "epoch": 0.49, + "grad_norm": 0.625581681728363, + "learning_rate": 0.0002676585559247349, + "loss": 3.2551, + "step": 3853 + }, + { + "epoch": 0.49, + "grad_norm": 0.6035146117210388, + "learning_rate": 0.0002675551577539384, + "loss": 3.222, + "step": 3854 + }, + { + "epoch": 0.49, + "grad_norm": 0.6973259449005127, + "learning_rate": 0.0002674517565652069, + "loss": 3.2016, + "step": 3855 + }, + { + "epoch": 0.49, + "grad_norm": 0.6433961391448975, + "learning_rate": 0.00026734835237631605, + "loss": 3.3871, + "step": 3856 + }, + { + "epoch": 0.49, + "grad_norm": 0.6088781356811523, + "learning_rate": 0.0002672449452050424, + "loss": 3.373, + "step": 3857 + }, + { + "epoch": 0.49, + "grad_norm": 0.6201007962226868, + "learning_rate": 0.0002671415350691627, + "loss": 3.3112, + "step": 3858 + }, + { + "epoch": 0.49, + "grad_norm": 0.5991825461387634, + "learning_rate": 0.0002670381219864544, + "loss": 3.3877, + "step": 3859 + }, + { + "epoch": 0.49, + "grad_norm": 0.5942227244377136, + "learning_rate": 0.0002669347059746954, + "loss": 3.3144, + "step": 3860 + }, + { + "epoch": 0.49, + "grad_norm": 0.6242623329162598, + "learning_rate": 0.00026683128705166416, + "loss": 3.2541, + "step": 3861 + }, + { + "epoch": 0.49, + "grad_norm": 0.6556063890457153, + "learning_rate": 0.00026672786523513947, + "loss": 3.3878, + "step": 3862 + }, + { + "epoch": 0.49, + "grad_norm": 0.6340212821960449, + "learning_rate": 0.00026662444054290085, + "loss": 3.3619, + "step": 3863 + }, + { + "epoch": 0.49, + "grad_norm": 0.6830992698669434, + "learning_rate": 0.0002665210129927282, + "loss": 3.4276, + "step": 3864 + }, + { + "epoch": 0.49, + "grad_norm": 0.578670084476471, + "learning_rate": 0.00026641758260240184, + "loss": 3.439, + "step": 3865 + }, + { + "epoch": 0.49, + "grad_norm": 0.6785685420036316, + "learning_rate": 0.0002663141493897028, + "loss": 3.4199, + "step": 3866 + }, + { + "epoch": 0.49, + "grad_norm": 0.6079252362251282, + "learning_rate": 0.00026621071337241226, + "loss": 3.3678, + "step": 3867 + }, + { + "epoch": 0.5, + "grad_norm": 0.6454574465751648, + "learning_rate": 0.00026610727456831217, + "loss": 3.3315, + "step": 3868 + }, + { + "epoch": 0.5, + "grad_norm": 0.614564836025238, + "learning_rate": 0.0002660038329951849, + "loss": 3.3012, + "step": 3869 + }, + { + "epoch": 0.5, + "grad_norm": 0.6483057141304016, + "learning_rate": 0.00026590038867081316, + "loss": 3.2884, + "step": 3870 + }, + { + "epoch": 0.5, + "grad_norm": 0.6164308190345764, + "learning_rate": 0.0002657969416129803, + "loss": 3.2987, + "step": 3871 + }, + { + "epoch": 0.5, + "grad_norm": 0.6032328605651855, + "learning_rate": 0.00026569349183947, + "loss": 3.299, + "step": 3872 + }, + { + "epoch": 0.5, + "grad_norm": 0.5902796387672424, + "learning_rate": 0.0002655900393680666, + "loss": 3.4066, + "step": 3873 + }, + { + "epoch": 0.5, + "grad_norm": 0.59278804063797, + "learning_rate": 0.00026548658421655464, + "loss": 3.414, + "step": 3874 + }, + { + "epoch": 0.5, + "grad_norm": 0.5973041653633118, + "learning_rate": 0.00026538312640271935, + "loss": 3.437, + "step": 3875 + }, + { + "epoch": 0.5, + "grad_norm": 0.608325183391571, + "learning_rate": 0.0002652796659443462, + "loss": 3.4709, + "step": 3876 + }, + { + "epoch": 0.5, + "grad_norm": 0.6026968359947205, + "learning_rate": 0.00026517620285922144, + "loss": 3.3073, + "step": 3877 + }, + { + "epoch": 0.5, + "grad_norm": 0.6208993196487427, + "learning_rate": 0.00026507273716513144, + "loss": 3.3494, + "step": 3878 + }, + { + "epoch": 0.5, + "grad_norm": 0.5892390012741089, + "learning_rate": 0.00026496926887986324, + "loss": 3.357, + "step": 3879 + }, + { + "epoch": 0.5, + "grad_norm": 0.6406593918800354, + "learning_rate": 0.00026486579802120406, + "loss": 3.3358, + "step": 3880 + }, + { + "epoch": 0.5, + "grad_norm": 0.6420329809188843, + "learning_rate": 0.00026476232460694195, + "loss": 3.3091, + "step": 3881 + }, + { + "epoch": 0.5, + "grad_norm": 0.5696841478347778, + "learning_rate": 0.0002646588486548651, + "loss": 3.3782, + "step": 3882 + }, + { + "epoch": 0.5, + "grad_norm": 0.6150279641151428, + "learning_rate": 0.0002645553701827621, + "loss": 3.3737, + "step": 3883 + }, + { + "epoch": 0.5, + "grad_norm": 0.6361640691757202, + "learning_rate": 0.0002644518892084224, + "loss": 3.4473, + "step": 3884 + }, + { + "epoch": 0.5, + "grad_norm": 0.6132729053497314, + "learning_rate": 0.0002643484057496353, + "loss": 3.324, + "step": 3885 + }, + { + "epoch": 0.5, + "grad_norm": 0.6346362829208374, + "learning_rate": 0.00026424491982419095, + "loss": 3.3577, + "step": 3886 + }, + { + "epoch": 0.5, + "grad_norm": 0.6215100884437561, + "learning_rate": 0.00026414143144987975, + "loss": 3.4401, + "step": 3887 + }, + { + "epoch": 0.5, + "grad_norm": 0.6370041370391846, + "learning_rate": 0.0002640379406444925, + "loss": 3.3201, + "step": 3888 + }, + { + "epoch": 0.5, + "grad_norm": 0.5833559036254883, + "learning_rate": 0.0002639344474258206, + "loss": 3.2599, + "step": 3889 + }, + { + "epoch": 0.5, + "grad_norm": 0.6063408255577087, + "learning_rate": 0.00026383095181165553, + "loss": 3.251, + "step": 3890 + }, + { + "epoch": 0.5, + "grad_norm": 0.6340535879135132, + "learning_rate": 0.0002637274538197896, + "loss": 3.3845, + "step": 3891 + }, + { + "epoch": 0.5, + "grad_norm": 0.6207410097122192, + "learning_rate": 0.0002636239534680151, + "loss": 3.2976, + "step": 3892 + }, + { + "epoch": 0.5, + "grad_norm": 0.6404011249542236, + "learning_rate": 0.00026352045077412516, + "loss": 3.4276, + "step": 3893 + }, + { + "epoch": 0.5, + "grad_norm": 0.614709198474884, + "learning_rate": 0.0002634169457559129, + "loss": 3.4475, + "step": 3894 + }, + { + "epoch": 0.5, + "grad_norm": 0.6316673159599304, + "learning_rate": 0.0002633134384311722, + "loss": 3.4259, + "step": 3895 + }, + { + "epoch": 0.5, + "grad_norm": 0.5817269086837769, + "learning_rate": 0.0002632099288176971, + "loss": 3.345, + "step": 3896 + }, + { + "epoch": 0.5, + "grad_norm": 0.624113142490387, + "learning_rate": 0.00026310641693328205, + "loss": 3.5048, + "step": 3897 + }, + { + "epoch": 0.5, + "grad_norm": 0.6005644798278809, + "learning_rate": 0.000263002902795722, + "loss": 3.3095, + "step": 3898 + }, + { + "epoch": 0.5, + "grad_norm": 0.6340106129646301, + "learning_rate": 0.0002628993864228122, + "loss": 3.404, + "step": 3899 + }, + { + "epoch": 0.5, + "grad_norm": 0.5678104162216187, + "learning_rate": 0.0002627958678323484, + "loss": 3.2185, + "step": 3900 + }, + { + "epoch": 0.5, + "grad_norm": 0.6310855746269226, + "learning_rate": 0.0002626923470421264, + "loss": 3.4734, + "step": 3901 + }, + { + "epoch": 0.5, + "grad_norm": 0.6362099647521973, + "learning_rate": 0.000262588824069943, + "loss": 3.4269, + "step": 3902 + }, + { + "epoch": 0.5, + "grad_norm": 0.63554447889328, + "learning_rate": 0.00026248529893359475, + "loss": 3.3321, + "step": 3903 + }, + { + "epoch": 0.5, + "grad_norm": 0.619417130947113, + "learning_rate": 0.0002623817716508788, + "loss": 3.4176, + "step": 3904 + }, + { + "epoch": 0.5, + "grad_norm": 0.6085559725761414, + "learning_rate": 0.00026227824223959287, + "loss": 3.362, + "step": 3905 + }, + { + "epoch": 0.5, + "grad_norm": 0.610848605632782, + "learning_rate": 0.00026217471071753463, + "loss": 3.2489, + "step": 3906 + }, + { + "epoch": 0.5, + "grad_norm": 0.572178840637207, + "learning_rate": 0.00026207117710250254, + "loss": 3.357, + "step": 3907 + }, + { + "epoch": 0.5, + "grad_norm": 0.6343911290168762, + "learning_rate": 0.0002619676414122952, + "loss": 3.4127, + "step": 3908 + }, + { + "epoch": 0.5, + "grad_norm": 0.6293900609016418, + "learning_rate": 0.0002618641036647115, + "loss": 3.3451, + "step": 3909 + }, + { + "epoch": 0.5, + "grad_norm": 0.6710907816886902, + "learning_rate": 0.0002617605638775509, + "loss": 3.3697, + "step": 3910 + }, + { + "epoch": 0.5, + "grad_norm": 0.5918707251548767, + "learning_rate": 0.000261657022068613, + "loss": 3.4208, + "step": 3911 + }, + { + "epoch": 0.5, + "grad_norm": 0.6613056063652039, + "learning_rate": 0.00026155347825569783, + "loss": 3.2565, + "step": 3912 + }, + { + "epoch": 0.5, + "grad_norm": 0.6077964305877686, + "learning_rate": 0.0002614499324566059, + "loss": 3.4051, + "step": 3913 + }, + { + "epoch": 0.5, + "grad_norm": 0.6586365699768066, + "learning_rate": 0.0002613463846891377, + "loss": 3.4608, + "step": 3914 + }, + { + "epoch": 0.5, + "grad_norm": 0.6008127927780151, + "learning_rate": 0.00026124283497109445, + "loss": 3.5188, + "step": 3915 + }, + { + "epoch": 0.5, + "grad_norm": 0.6196286678314209, + "learning_rate": 0.00026113928332027763, + "loss": 3.2877, + "step": 3916 + }, + { + "epoch": 0.5, + "grad_norm": 0.5656877756118774, + "learning_rate": 0.00026103572975448884, + "loss": 3.2677, + "step": 3917 + }, + { + "epoch": 0.5, + "grad_norm": 0.5755074620246887, + "learning_rate": 0.0002609321742915302, + "loss": 3.3962, + "step": 3918 + }, + { + "epoch": 0.5, + "grad_norm": 0.6107060313224792, + "learning_rate": 0.000260828616949204, + "loss": 3.2938, + "step": 3919 + }, + { + "epoch": 0.5, + "grad_norm": 0.6584084630012512, + "learning_rate": 0.00026072505774531304, + "loss": 3.3053, + "step": 3920 + }, + { + "epoch": 0.5, + "grad_norm": 0.5812655091285706, + "learning_rate": 0.0002606214966976603, + "loss": 3.4265, + "step": 3921 + }, + { + "epoch": 0.5, + "grad_norm": 0.6057932376861572, + "learning_rate": 0.00026051793382404916, + "loss": 3.442, + "step": 3922 + }, + { + "epoch": 0.5, + "grad_norm": 0.617680549621582, + "learning_rate": 0.0002604143691422833, + "loss": 3.2939, + "step": 3923 + }, + { + "epoch": 0.5, + "grad_norm": 0.616668164730072, + "learning_rate": 0.00026031080267016664, + "loss": 3.3975, + "step": 3924 + }, + { + "epoch": 0.5, + "grad_norm": 0.571333110332489, + "learning_rate": 0.0002602072344255034, + "loss": 3.289, + "step": 3925 + }, + { + "epoch": 0.5, + "grad_norm": 0.6522971391677856, + "learning_rate": 0.0002601036644260982, + "loss": 3.3841, + "step": 3926 + }, + { + "epoch": 0.5, + "grad_norm": 0.5657179951667786, + "learning_rate": 0.00026000009268975613, + "loss": 3.4224, + "step": 3927 + }, + { + "epoch": 0.5, + "grad_norm": 0.5910373330116272, + "learning_rate": 0.0002598965192342821, + "loss": 3.242, + "step": 3928 + }, + { + "epoch": 0.5, + "grad_norm": 0.6280388832092285, + "learning_rate": 0.0002597929440774816, + "loss": 3.5236, + "step": 3929 + }, + { + "epoch": 0.5, + "grad_norm": 0.6481741666793823, + "learning_rate": 0.0002596893672371605, + "loss": 3.2832, + "step": 3930 + }, + { + "epoch": 0.5, + "grad_norm": 0.6153259873390198, + "learning_rate": 0.00025958578873112496, + "loss": 3.459, + "step": 3931 + }, + { + "epoch": 0.5, + "grad_norm": 0.59522944688797, + "learning_rate": 0.0002594822085771812, + "loss": 3.249, + "step": 3932 + }, + { + "epoch": 0.5, + "grad_norm": 0.5696998834609985, + "learning_rate": 0.00025937862679313573, + "loss": 3.3491, + "step": 3933 + }, + { + "epoch": 0.5, + "grad_norm": 0.6377519369125366, + "learning_rate": 0.00025927504339679565, + "loss": 3.2992, + "step": 3934 + }, + { + "epoch": 0.5, + "grad_norm": 0.5743221044540405, + "learning_rate": 0.0002591714584059681, + "loss": 3.3483, + "step": 3935 + }, + { + "epoch": 0.5, + "grad_norm": 0.6047176122665405, + "learning_rate": 0.0002590678718384605, + "loss": 3.3561, + "step": 3936 + }, + { + "epoch": 0.5, + "grad_norm": 0.5904937982559204, + "learning_rate": 0.0002589642837120807, + "loss": 3.4039, + "step": 3937 + }, + { + "epoch": 0.5, + "grad_norm": 0.6137279868125916, + "learning_rate": 0.00025886069404463646, + "loss": 3.3784, + "step": 3938 + }, + { + "epoch": 0.5, + "grad_norm": 0.6516735553741455, + "learning_rate": 0.00025875710285393634, + "loss": 3.4415, + "step": 3939 + }, + { + "epoch": 0.5, + "grad_norm": 0.5683122277259827, + "learning_rate": 0.0002586535101577886, + "loss": 3.3377, + "step": 3940 + }, + { + "epoch": 0.5, + "grad_norm": 0.6334203481674194, + "learning_rate": 0.00025854991597400217, + "loss": 3.2492, + "step": 3941 + }, + { + "epoch": 0.5, + "grad_norm": 0.6036867499351501, + "learning_rate": 0.0002584463203203861, + "loss": 3.3008, + "step": 3942 + }, + { + "epoch": 0.5, + "grad_norm": 0.636132001876831, + "learning_rate": 0.00025834272321474964, + "loss": 3.3408, + "step": 3943 + }, + { + "epoch": 0.5, + "grad_norm": 0.6284753680229187, + "learning_rate": 0.0002582391246749023, + "loss": 3.4375, + "step": 3944 + }, + { + "epoch": 0.5, + "grad_norm": 0.6076215505599976, + "learning_rate": 0.000258135524718654, + "loss": 3.3355, + "step": 3945 + }, + { + "epoch": 0.51, + "grad_norm": 0.6466711163520813, + "learning_rate": 0.00025803192336381465, + "loss": 3.268, + "step": 3946 + }, + { + "epoch": 0.51, + "grad_norm": 0.6400593519210815, + "learning_rate": 0.0002579283206281945, + "loss": 3.4288, + "step": 3947 + }, + { + "epoch": 0.51, + "grad_norm": 0.6525185704231262, + "learning_rate": 0.0002578247165296041, + "loss": 3.3486, + "step": 3948 + }, + { + "epoch": 0.51, + "grad_norm": 0.6444108486175537, + "learning_rate": 0.0002577211110858543, + "loss": 3.518, + "step": 3949 + }, + { + "epoch": 0.51, + "grad_norm": 0.5654917359352112, + "learning_rate": 0.00025761750431475605, + "loss": 3.2278, + "step": 3950 + }, + { + "epoch": 0.51, + "grad_norm": 0.5556711554527283, + "learning_rate": 0.0002575138962341204, + "loss": 3.3468, + "step": 3951 + }, + { + "epoch": 0.51, + "grad_norm": 0.5785199999809265, + "learning_rate": 0.00025741028686175887, + "loss": 3.2947, + "step": 3952 + }, + { + "epoch": 0.51, + "grad_norm": 0.605494499206543, + "learning_rate": 0.0002573066762154832, + "loss": 3.382, + "step": 3953 + }, + { + "epoch": 0.51, + "grad_norm": 0.6059899926185608, + "learning_rate": 0.0002572030643131051, + "loss": 3.3576, + "step": 3954 + }, + { + "epoch": 0.51, + "grad_norm": 0.632280170917511, + "learning_rate": 0.00025709945117243676, + "loss": 3.2547, + "step": 3955 + }, + { + "epoch": 0.51, + "grad_norm": 0.5962053537368774, + "learning_rate": 0.0002569958368112905, + "loss": 3.3773, + "step": 3956 + }, + { + "epoch": 0.51, + "grad_norm": 0.604576826095581, + "learning_rate": 0.00025689222124747877, + "loss": 3.39, + "step": 3957 + }, + { + "epoch": 0.51, + "grad_norm": 0.5862025618553162, + "learning_rate": 0.00025678860449881427, + "loss": 3.2868, + "step": 3958 + }, + { + "epoch": 0.51, + "grad_norm": 0.6356115341186523, + "learning_rate": 0.00025668498658311, + "loss": 3.3923, + "step": 3959 + }, + { + "epoch": 0.51, + "grad_norm": 0.6119865775108337, + "learning_rate": 0.00025658136751817906, + "loss": 3.3857, + "step": 3960 + }, + { + "epoch": 0.51, + "grad_norm": 0.6909923553466797, + "learning_rate": 0.0002564777473218347, + "loss": 3.1973, + "step": 3961 + }, + { + "epoch": 0.51, + "grad_norm": 0.6727779507637024, + "learning_rate": 0.0002563741260118904, + "loss": 3.4471, + "step": 3962 + }, + { + "epoch": 0.51, + "grad_norm": 0.6961916089057922, + "learning_rate": 0.0002562705036061601, + "loss": 3.5349, + "step": 3963 + }, + { + "epoch": 0.51, + "grad_norm": 0.6283586621284485, + "learning_rate": 0.0002561668801224575, + "loss": 3.5474, + "step": 3964 + }, + { + "epoch": 0.51, + "grad_norm": 0.6693708896636963, + "learning_rate": 0.00025606325557859665, + "loss": 3.4094, + "step": 3965 + }, + { + "epoch": 0.51, + "grad_norm": 0.6046238541603088, + "learning_rate": 0.00025595962999239185, + "loss": 3.3631, + "step": 3966 + }, + { + "epoch": 0.51, + "grad_norm": 0.6080331206321716, + "learning_rate": 0.00025585600338165764, + "loss": 3.5612, + "step": 3967 + }, + { + "epoch": 0.51, + "grad_norm": 0.6154356598854065, + "learning_rate": 0.00025575237576420864, + "loss": 3.3989, + "step": 3968 + }, + { + "epoch": 0.51, + "grad_norm": 0.5944554209709167, + "learning_rate": 0.00025564874715785943, + "loss": 3.1934, + "step": 3969 + }, + { + "epoch": 0.51, + "grad_norm": 0.627717912197113, + "learning_rate": 0.00025554511758042514, + "loss": 3.154, + "step": 3970 + }, + { + "epoch": 0.51, + "grad_norm": 0.6271909475326538, + "learning_rate": 0.00025544148704972084, + "loss": 3.3213, + "step": 3971 + }, + { + "epoch": 0.51, + "grad_norm": 0.612829327583313, + "learning_rate": 0.00025533785558356175, + "loss": 3.2844, + "step": 3972 + }, + { + "epoch": 0.51, + "grad_norm": 0.5844332575798035, + "learning_rate": 0.00025523422319976354, + "loss": 3.4009, + "step": 3973 + }, + { + "epoch": 0.51, + "grad_norm": 0.6311678886413574, + "learning_rate": 0.00025513058991614166, + "loss": 3.4696, + "step": 3974 + }, + { + "epoch": 0.51, + "grad_norm": 0.5532718300819397, + "learning_rate": 0.0002550269557505118, + "loss": 3.2258, + "step": 3975 + }, + { + "epoch": 0.51, + "grad_norm": 0.6182968020439148, + "learning_rate": 0.00025492332072069, + "loss": 3.3138, + "step": 3976 + }, + { + "epoch": 0.51, + "grad_norm": 0.6046236753463745, + "learning_rate": 0.00025481968484449236, + "loss": 3.2735, + "step": 3977 + }, + { + "epoch": 0.51, + "grad_norm": 0.6428443789482117, + "learning_rate": 0.00025471604813973503, + "loss": 3.3787, + "step": 3978 + }, + { + "epoch": 0.51, + "grad_norm": 0.6348656415939331, + "learning_rate": 0.0002546124106242343, + "loss": 3.4603, + "step": 3979 + }, + { + "epoch": 0.51, + "grad_norm": 0.5930755734443665, + "learning_rate": 0.00025450877231580665, + "loss": 3.4622, + "step": 3980 + }, + { + "epoch": 0.51, + "grad_norm": 0.636692464351654, + "learning_rate": 0.0002544051332322689, + "loss": 3.3818, + "step": 3981 + }, + { + "epoch": 0.51, + "grad_norm": 0.5703644752502441, + "learning_rate": 0.0002543014933914376, + "loss": 3.3697, + "step": 3982 + }, + { + "epoch": 0.51, + "grad_norm": 0.6083183884620667, + "learning_rate": 0.0002541978528111297, + "loss": 3.3912, + "step": 3983 + }, + { + "epoch": 0.51, + "grad_norm": 0.6166694760322571, + "learning_rate": 0.00025409421150916235, + "loss": 3.2652, + "step": 3984 + }, + { + "epoch": 0.51, + "grad_norm": 0.6031867265701294, + "learning_rate": 0.0002539905695033524, + "loss": 3.3129, + "step": 3985 + }, + { + "epoch": 0.51, + "grad_norm": 0.6122192740440369, + "learning_rate": 0.0002538869268115175, + "loss": 3.3169, + "step": 3986 + }, + { + "epoch": 0.51, + "grad_norm": 0.6112638711929321, + "learning_rate": 0.0002537832834514747, + "loss": 3.2996, + "step": 3987 + }, + { + "epoch": 0.51, + "grad_norm": 0.6045041084289551, + "learning_rate": 0.0002536796394410416, + "loss": 3.3229, + "step": 3988 + }, + { + "epoch": 0.51, + "grad_norm": 0.6106022596359253, + "learning_rate": 0.00025357599479803586, + "loss": 3.364, + "step": 3989 + }, + { + "epoch": 0.51, + "grad_norm": 0.6055294275283813, + "learning_rate": 0.00025347234954027506, + "loss": 3.275, + "step": 3990 + }, + { + "epoch": 0.51, + "grad_norm": 0.5865367650985718, + "learning_rate": 0.0002533687036855772, + "loss": 3.3175, + "step": 3991 + }, + { + "epoch": 0.51, + "grad_norm": 0.6457139849662781, + "learning_rate": 0.0002532650572517602, + "loss": 3.3128, + "step": 3992 + }, + { + "epoch": 0.51, + "grad_norm": 0.6387559771537781, + "learning_rate": 0.00025316141025664193, + "loss": 3.2759, + "step": 3993 + }, + { + "epoch": 0.51, + "grad_norm": 0.6478233337402344, + "learning_rate": 0.0002530577627180405, + "loss": 3.4826, + "step": 3994 + }, + { + "epoch": 0.51, + "grad_norm": 0.6275167465209961, + "learning_rate": 0.0002529541146537743, + "loss": 3.3614, + "step": 3995 + }, + { + "epoch": 0.51, + "grad_norm": 0.5962046980857849, + "learning_rate": 0.00025285046608166165, + "loss": 3.4151, + "step": 3996 + }, + { + "epoch": 0.51, + "grad_norm": 0.6149052977561951, + "learning_rate": 0.0002527468170195207, + "loss": 3.2768, + "step": 3997 + }, + { + "epoch": 0.51, + "grad_norm": 0.607078492641449, + "learning_rate": 0.0002526431674851701, + "loss": 3.3428, + "step": 3998 + }, + { + "epoch": 0.51, + "grad_norm": 0.6066473126411438, + "learning_rate": 0.0002525395174964283, + "loss": 3.4584, + "step": 3999 + }, + { + "epoch": 0.51, + "grad_norm": 0.5871192812919617, + "learning_rate": 0.0002524358670711141, + "loss": 3.5256, + "step": 4000 + }, + { + "epoch": 0.51, + "grad_norm": 0.5904553532600403, + "learning_rate": 0.00025233221622704606, + "loss": 3.2749, + "step": 4001 + }, + { + "epoch": 0.51, + "grad_norm": 0.5975815057754517, + "learning_rate": 0.000252228564982043, + "loss": 3.4215, + "step": 4002 + }, + { + "epoch": 0.51, + "grad_norm": 0.6611238121986389, + "learning_rate": 0.0002521249133539238, + "loss": 3.3904, + "step": 4003 + }, + { + "epoch": 0.51, + "grad_norm": 0.6311246156692505, + "learning_rate": 0.0002520212613605074, + "loss": 3.2204, + "step": 4004 + }, + { + "epoch": 0.51, + "grad_norm": 0.5951711535453796, + "learning_rate": 0.0002519176090196127, + "loss": 3.3291, + "step": 4005 + }, + { + "epoch": 0.51, + "grad_norm": 0.6938546895980835, + "learning_rate": 0.0002518139563490588, + "loss": 3.3354, + "step": 4006 + }, + { + "epoch": 0.51, + "grad_norm": 0.5967162847518921, + "learning_rate": 0.0002517103033666648, + "loss": 3.2652, + "step": 4007 + }, + { + "epoch": 0.51, + "grad_norm": 0.6288487911224365, + "learning_rate": 0.0002516066500902497, + "loss": 3.1915, + "step": 4008 + }, + { + "epoch": 0.51, + "grad_norm": 0.6464089155197144, + "learning_rate": 0.0002515029965376329, + "loss": 3.2344, + "step": 4009 + }, + { + "epoch": 0.51, + "grad_norm": 0.6463294625282288, + "learning_rate": 0.0002513993427266336, + "loss": 3.4581, + "step": 4010 + }, + { + "epoch": 0.51, + "grad_norm": 0.617191731929779, + "learning_rate": 0.00025129568867507103, + "loss": 3.4698, + "step": 4011 + }, + { + "epoch": 0.51, + "grad_norm": 0.6267497539520264, + "learning_rate": 0.00025119203440076453, + "loss": 3.3611, + "step": 4012 + }, + { + "epoch": 0.51, + "grad_norm": 0.6050522327423096, + "learning_rate": 0.00025108837992153353, + "loss": 3.3997, + "step": 4013 + }, + { + "epoch": 0.51, + "grad_norm": 0.6314012408256531, + "learning_rate": 0.0002509847252551974, + "loss": 3.3887, + "step": 4014 + }, + { + "epoch": 0.51, + "grad_norm": 0.7093877792358398, + "learning_rate": 0.00025088107041957566, + "loss": 3.3945, + "step": 4015 + }, + { + "epoch": 0.51, + "grad_norm": 0.5881847739219666, + "learning_rate": 0.0002507774154324876, + "loss": 3.3512, + "step": 4016 + }, + { + "epoch": 0.51, + "grad_norm": 0.6320837736129761, + "learning_rate": 0.0002506737603117528, + "loss": 3.3946, + "step": 4017 + }, + { + "epoch": 0.51, + "grad_norm": 0.617286205291748, + "learning_rate": 0.0002505701050751909, + "loss": 3.2815, + "step": 4018 + }, + { + "epoch": 0.51, + "grad_norm": 0.6056340336799622, + "learning_rate": 0.00025046644974062123, + "loss": 3.3245, + "step": 4019 + }, + { + "epoch": 0.51, + "grad_norm": 0.6097458600997925, + "learning_rate": 0.0002503627943258635, + "loss": 3.2694, + "step": 4020 + }, + { + "epoch": 0.51, + "grad_norm": 0.5899346470832825, + "learning_rate": 0.00025025913884873727, + "loss": 3.3591, + "step": 4021 + }, + { + "epoch": 0.51, + "grad_norm": 0.5989256501197815, + "learning_rate": 0.0002501554833270619, + "loss": 3.4002, + "step": 4022 + }, + { + "epoch": 0.51, + "grad_norm": 0.6305152773857117, + "learning_rate": 0.00025005182777865725, + "loss": 3.2025, + "step": 4023 + }, + { + "epoch": 0.52, + "grad_norm": 0.6572516560554504, + "learning_rate": 0.00024994817222134276, + "loss": 3.3328, + "step": 4024 + }, + { + "epoch": 0.52, + "grad_norm": 0.6383114457130432, + "learning_rate": 0.0002498445166729381, + "loss": 3.2991, + "step": 4025 + }, + { + "epoch": 0.52, + "grad_norm": 0.6050015091896057, + "learning_rate": 0.0002497408611512628, + "loss": 3.3712, + "step": 4026 + }, + { + "epoch": 0.52, + "grad_norm": 0.6256352066993713, + "learning_rate": 0.0002496372056741365, + "loss": 3.4663, + "step": 4027 + }, + { + "epoch": 0.52, + "grad_norm": 0.6097555756568909, + "learning_rate": 0.0002495335502593788, + "loss": 3.2967, + "step": 4028 + }, + { + "epoch": 0.52, + "grad_norm": 0.6122956275939941, + "learning_rate": 0.00024942989492480913, + "loss": 3.3454, + "step": 4029 + }, + { + "epoch": 0.52, + "grad_norm": 0.6701467633247375, + "learning_rate": 0.00024932623968824724, + "loss": 3.2917, + "step": 4030 + }, + { + "epoch": 0.52, + "grad_norm": 0.5594620704650879, + "learning_rate": 0.00024922258456751246, + "loss": 3.3435, + "step": 4031 + }, + { + "epoch": 0.52, + "grad_norm": 0.5902441143989563, + "learning_rate": 0.0002491189295804244, + "loss": 3.3574, + "step": 4032 + }, + { + "epoch": 0.52, + "grad_norm": 0.6207271814346313, + "learning_rate": 0.0002490152747448026, + "loss": 3.4042, + "step": 4033 + }, + { + "epoch": 0.52, + "grad_norm": 0.5825251936912537, + "learning_rate": 0.0002489116200784664, + "loss": 3.2744, + "step": 4034 + }, + { + "epoch": 0.52, + "grad_norm": 0.5591505169868469, + "learning_rate": 0.0002488079655992355, + "loss": 3.3777, + "step": 4035 + }, + { + "epoch": 0.52, + "grad_norm": 0.6506365537643433, + "learning_rate": 0.00024870431132492904, + "loss": 3.3424, + "step": 4036 + }, + { + "epoch": 0.52, + "grad_norm": 0.5781224370002747, + "learning_rate": 0.00024860065727336646, + "loss": 3.4152, + "step": 4037 + }, + { + "epoch": 0.52, + "grad_norm": 0.6120777130126953, + "learning_rate": 0.00024849700346236714, + "loss": 3.3249, + "step": 4038 + }, + { + "epoch": 0.52, + "grad_norm": 0.6162705421447754, + "learning_rate": 0.0002483933499097504, + "loss": 3.3129, + "step": 4039 + }, + { + "epoch": 0.52, + "grad_norm": 0.6134864091873169, + "learning_rate": 0.00024828969663333533, + "loss": 3.3345, + "step": 4040 + }, + { + "epoch": 0.52, + "grad_norm": 0.6703715920448303, + "learning_rate": 0.0002481860436509413, + "loss": 3.5297, + "step": 4041 + }, + { + "epoch": 0.52, + "grad_norm": 0.7207913398742676, + "learning_rate": 0.00024808239098038744, + "loss": 3.4621, + "step": 4042 + }, + { + "epoch": 0.52, + "grad_norm": 0.9156608581542969, + "learning_rate": 0.00024797873863949266, + "loss": 3.3278, + "step": 4043 + }, + { + "epoch": 0.52, + "grad_norm": 0.6291712522506714, + "learning_rate": 0.0002478750866460762, + "loss": 3.2997, + "step": 4044 + }, + { + "epoch": 0.52, + "grad_norm": 0.5725845098495483, + "learning_rate": 0.000247771435017957, + "loss": 3.4005, + "step": 4045 + }, + { + "epoch": 0.52, + "grad_norm": 0.6406933069229126, + "learning_rate": 0.000247667783772954, + "loss": 3.3842, + "step": 4046 + }, + { + "epoch": 0.52, + "grad_norm": 0.633264422416687, + "learning_rate": 0.0002475641329288859, + "loss": 3.3048, + "step": 4047 + }, + { + "epoch": 0.52, + "grad_norm": 0.5785310864448547, + "learning_rate": 0.00024746048250357173, + "loss": 3.2927, + "step": 4048 + }, + { + "epoch": 0.52, + "grad_norm": 0.6772027015686035, + "learning_rate": 0.00024735683251483003, + "loss": 3.3779, + "step": 4049 + }, + { + "epoch": 0.52, + "grad_norm": 0.6316635608673096, + "learning_rate": 0.00024725318298047936, + "loss": 3.3251, + "step": 4050 + }, + { + "epoch": 0.52, + "grad_norm": 0.5939462780952454, + "learning_rate": 0.00024714953391833847, + "loss": 3.3048, + "step": 4051 + }, + { + "epoch": 0.52, + "grad_norm": 0.5867147445678711, + "learning_rate": 0.00024704588534622563, + "loss": 3.3868, + "step": 4052 + }, + { + "epoch": 0.52, + "grad_norm": 0.6310352087020874, + "learning_rate": 0.0002469422372819595, + "loss": 3.4479, + "step": 4053 + }, + { + "epoch": 0.52, + "grad_norm": 0.6316264867782593, + "learning_rate": 0.00024683858974335814, + "loss": 3.2812, + "step": 4054 + }, + { + "epoch": 0.52, + "grad_norm": 0.6058422327041626, + "learning_rate": 0.00024673494274823985, + "loss": 3.3781, + "step": 4055 + }, + { + "epoch": 0.52, + "grad_norm": 0.6459770202636719, + "learning_rate": 0.0002466312963144228, + "loss": 3.5122, + "step": 4056 + }, + { + "epoch": 0.52, + "grad_norm": 0.6532690525054932, + "learning_rate": 0.00024652765045972495, + "loss": 3.3902, + "step": 4057 + }, + { + "epoch": 0.52, + "grad_norm": 0.5904327034950256, + "learning_rate": 0.0002464240052019642, + "loss": 3.3848, + "step": 4058 + }, + { + "epoch": 0.52, + "grad_norm": 0.6357998847961426, + "learning_rate": 0.0002463203605589585, + "loss": 3.268, + "step": 4059 + }, + { + "epoch": 0.52, + "grad_norm": 0.5934481024742126, + "learning_rate": 0.00024621671654852543, + "loss": 3.2678, + "step": 4060 + }, + { + "epoch": 0.52, + "grad_norm": 0.6189697980880737, + "learning_rate": 0.00024611307318848257, + "loss": 3.5115, + "step": 4061 + }, + { + "epoch": 0.52, + "grad_norm": 0.6146605610847473, + "learning_rate": 0.00024600943049664755, + "loss": 3.1718, + "step": 4062 + }, + { + "epoch": 0.52, + "grad_norm": 0.667597234249115, + "learning_rate": 0.0002459057884908377, + "loss": 3.3191, + "step": 4063 + }, + { + "epoch": 0.52, + "grad_norm": 0.6192696690559387, + "learning_rate": 0.0002458021471888703, + "loss": 3.3508, + "step": 4064 + }, + { + "epoch": 0.52, + "grad_norm": 0.6276788711547852, + "learning_rate": 0.0002456985066085624, + "loss": 3.373, + "step": 4065 + }, + { + "epoch": 0.52, + "grad_norm": 0.6441596150398254, + "learning_rate": 0.0002455948667677312, + "loss": 3.3959, + "step": 4066 + }, + { + "epoch": 0.52, + "grad_norm": 0.541420042514801, + "learning_rate": 0.00024549122768419336, + "loss": 3.3434, + "step": 4067 + }, + { + "epoch": 0.52, + "grad_norm": 0.6219912171363831, + "learning_rate": 0.00024538758937576576, + "loss": 3.2958, + "step": 4068 + }, + { + "epoch": 0.52, + "grad_norm": 0.6137280464172363, + "learning_rate": 0.0002452839518602651, + "loss": 3.3196, + "step": 4069 + }, + { + "epoch": 0.52, + "grad_norm": 0.6087313890457153, + "learning_rate": 0.0002451803151555076, + "loss": 3.3415, + "step": 4070 + }, + { + "epoch": 0.52, + "grad_norm": 0.5690910220146179, + "learning_rate": 0.00024507667927930993, + "loss": 3.2481, + "step": 4071 + }, + { + "epoch": 0.52, + "grad_norm": 0.6003113985061646, + "learning_rate": 0.0002449730442494882, + "loss": 3.3616, + "step": 4072 + }, + { + "epoch": 0.52, + "grad_norm": 0.6427178978919983, + "learning_rate": 0.0002448694100838584, + "loss": 3.3604, + "step": 4073 + }, + { + "epoch": 0.52, + "grad_norm": 0.648370623588562, + "learning_rate": 0.0002447657768002365, + "loss": 3.3164, + "step": 4074 + }, + { + "epoch": 0.52, + "grad_norm": 0.6166640520095825, + "learning_rate": 0.00024466214441643826, + "loss": 3.3474, + "step": 4075 + }, + { + "epoch": 0.52, + "grad_norm": 0.5990052819252014, + "learning_rate": 0.00024455851295027923, + "loss": 3.4471, + "step": 4076 + }, + { + "epoch": 0.52, + "grad_norm": 0.61072838306427, + "learning_rate": 0.000244454882419575, + "loss": 3.2597, + "step": 4077 + }, + { + "epoch": 0.52, + "grad_norm": 0.5881423950195312, + "learning_rate": 0.0002443512528421407, + "loss": 3.3991, + "step": 4078 + }, + { + "epoch": 0.52, + "grad_norm": 0.6317446231842041, + "learning_rate": 0.0002442476242357915, + "loss": 3.3498, + "step": 4079 + }, + { + "epoch": 0.52, + "grad_norm": 0.6786907911300659, + "learning_rate": 0.0002441439966183423, + "loss": 3.3058, + "step": 4080 + }, + { + "epoch": 0.52, + "grad_norm": 0.652067244052887, + "learning_rate": 0.0002440403700076081, + "loss": 3.4007, + "step": 4081 + }, + { + "epoch": 0.52, + "grad_norm": 0.6863986253738403, + "learning_rate": 0.0002439367444214034, + "loss": 3.3557, + "step": 4082 + }, + { + "epoch": 0.52, + "grad_norm": 0.6483095288276672, + "learning_rate": 0.00024383311987754254, + "loss": 3.3917, + "step": 4083 + }, + { + "epoch": 0.52, + "grad_norm": 0.6491280794143677, + "learning_rate": 0.00024372949639383992, + "loss": 3.4275, + "step": 4084 + }, + { + "epoch": 0.52, + "grad_norm": 0.6307315230369568, + "learning_rate": 0.00024362587398810959, + "loss": 3.3075, + "step": 4085 + }, + { + "epoch": 0.52, + "grad_norm": 0.6512874364852905, + "learning_rate": 0.00024352225267816537, + "loss": 3.3041, + "step": 4086 + }, + { + "epoch": 0.52, + "grad_norm": 0.6538076400756836, + "learning_rate": 0.00024341863248182103, + "loss": 3.306, + "step": 4087 + }, + { + "epoch": 0.52, + "grad_norm": 0.6164134740829468, + "learning_rate": 0.00024331501341689007, + "loss": 3.3771, + "step": 4088 + }, + { + "epoch": 0.52, + "grad_norm": 0.6030963063240051, + "learning_rate": 0.00024321139550118572, + "loss": 3.2981, + "step": 4089 + }, + { + "epoch": 0.52, + "grad_norm": 0.5784734487533569, + "learning_rate": 0.00024310777875252127, + "loss": 3.2589, + "step": 4090 + }, + { + "epoch": 0.52, + "grad_norm": 0.6351543664932251, + "learning_rate": 0.0002430041631887095, + "loss": 3.2737, + "step": 4091 + }, + { + "epoch": 0.52, + "grad_norm": 0.6241032481193542, + "learning_rate": 0.00024290054882756325, + "loss": 3.2979, + "step": 4092 + }, + { + "epoch": 0.52, + "grad_norm": 0.6360937356948853, + "learning_rate": 0.00024279693568689495, + "loss": 3.4559, + "step": 4093 + }, + { + "epoch": 0.52, + "grad_norm": 0.571132481098175, + "learning_rate": 0.00024269332378451687, + "loss": 3.3975, + "step": 4094 + }, + { + "epoch": 0.52, + "grad_norm": 0.6465764045715332, + "learning_rate": 0.00024258971313824115, + "loss": 3.3546, + "step": 4095 + }, + { + "epoch": 0.52, + "grad_norm": 0.6149590611457825, + "learning_rate": 0.00024248610376587971, + "loss": 3.2858, + "step": 4096 + }, + { + "epoch": 0.52, + "grad_norm": 0.6058986186981201, + "learning_rate": 0.00024238249568524404, + "loss": 3.3587, + "step": 4097 + }, + { + "epoch": 0.52, + "grad_norm": 0.6025921106338501, + "learning_rate": 0.00024227888891414562, + "loss": 3.1882, + "step": 4098 + }, + { + "epoch": 0.52, + "grad_norm": 0.5954164266586304, + "learning_rate": 0.00024217528347039583, + "loss": 3.4369, + "step": 4099 + }, + { + "epoch": 0.52, + "grad_norm": 0.6313084363937378, + "learning_rate": 0.0002420716793718055, + "loss": 3.3698, + "step": 4100 + }, + { + "epoch": 0.52, + "grad_norm": 0.6320266127586365, + "learning_rate": 0.0002419680766361854, + "loss": 3.3298, + "step": 4101 + }, + { + "epoch": 0.53, + "grad_norm": 0.6496672034263611, + "learning_rate": 0.00024186447528134605, + "loss": 3.405, + "step": 4102 + }, + { + "epoch": 0.53, + "grad_norm": 0.6242969632148743, + "learning_rate": 0.0002417608753250977, + "loss": 3.4386, + "step": 4103 + }, + { + "epoch": 0.53, + "grad_norm": 0.6158754229545593, + "learning_rate": 0.0002416572767852504, + "loss": 3.3669, + "step": 4104 + }, + { + "epoch": 0.53, + "grad_norm": 0.6036068797111511, + "learning_rate": 0.00024155367967961395, + "loss": 3.3808, + "step": 4105 + }, + { + "epoch": 0.53, + "grad_norm": 0.6248922944068909, + "learning_rate": 0.0002414500840259979, + "loss": 3.3149, + "step": 4106 + }, + { + "epoch": 0.53, + "grad_norm": 0.6088889241218567, + "learning_rate": 0.0002413464898422114, + "loss": 3.1844, + "step": 4107 + }, + { + "epoch": 0.53, + "grad_norm": 0.6099678874015808, + "learning_rate": 0.00024124289714606368, + "loss": 3.3684, + "step": 4108 + }, + { + "epoch": 0.53, + "grad_norm": 0.6642912030220032, + "learning_rate": 0.00024113930595536353, + "loss": 3.4227, + "step": 4109 + }, + { + "epoch": 0.53, + "grad_norm": 0.6614413261413574, + "learning_rate": 0.00024103571628791937, + "loss": 3.3632, + "step": 4110 + }, + { + "epoch": 0.53, + "grad_norm": 0.5714897513389587, + "learning_rate": 0.00024093212816153953, + "loss": 3.445, + "step": 4111 + }, + { + "epoch": 0.53, + "grad_norm": 0.7094261050224304, + "learning_rate": 0.00024082854159403192, + "loss": 3.4828, + "step": 4112 + }, + { + "epoch": 0.53, + "grad_norm": 0.6622090935707092, + "learning_rate": 0.00024072495660320439, + "loss": 3.4051, + "step": 4113 + }, + { + "epoch": 0.53, + "grad_norm": 0.5947360396385193, + "learning_rate": 0.00024062137320686436, + "loss": 3.31, + "step": 4114 + }, + { + "epoch": 0.53, + "grad_norm": 0.5824505090713501, + "learning_rate": 0.00024051779142281892, + "loss": 3.348, + "step": 4115 + }, + { + "epoch": 0.53, + "grad_norm": 0.6096441149711609, + "learning_rate": 0.00024041421126887514, + "loss": 3.2762, + "step": 4116 + }, + { + "epoch": 0.53, + "grad_norm": 0.5826987624168396, + "learning_rate": 0.00024031063276283945, + "loss": 3.292, + "step": 4117 + }, + { + "epoch": 0.53, + "grad_norm": 0.6296156644821167, + "learning_rate": 0.00024020705592251842, + "loss": 3.3162, + "step": 4118 + }, + { + "epoch": 0.53, + "grad_norm": 0.6289445757865906, + "learning_rate": 0.00024010348076571798, + "loss": 3.2726, + "step": 4119 + }, + { + "epoch": 0.53, + "grad_norm": 0.6153998374938965, + "learning_rate": 0.00023999990731024396, + "loss": 3.3769, + "step": 4120 + }, + { + "epoch": 0.53, + "grad_norm": 0.6376085877418518, + "learning_rate": 0.0002398963355739018, + "loss": 3.2786, + "step": 4121 + }, + { + "epoch": 0.53, + "grad_norm": 0.5849770903587341, + "learning_rate": 0.00023979276557449663, + "loss": 3.2887, + "step": 4122 + }, + { + "epoch": 0.53, + "grad_norm": 0.638300895690918, + "learning_rate": 0.00023968919732983345, + "loss": 3.3049, + "step": 4123 + }, + { + "epoch": 0.53, + "grad_norm": 0.6314319968223572, + "learning_rate": 0.0002395856308577168, + "loss": 3.2331, + "step": 4124 + }, + { + "epoch": 0.53, + "grad_norm": 0.6285073757171631, + "learning_rate": 0.00023948206617595088, + "loss": 3.137, + "step": 4125 + }, + { + "epoch": 0.53, + "grad_norm": 0.6688522696495056, + "learning_rate": 0.00023937850330233966, + "loss": 3.3979, + "step": 4126 + }, + { + "epoch": 0.53, + "grad_norm": 0.649043619632721, + "learning_rate": 0.00023927494225468694, + "loss": 3.3499, + "step": 4127 + }, + { + "epoch": 0.53, + "grad_norm": 0.6987082958221436, + "learning_rate": 0.000239171383050796, + "loss": 3.4631, + "step": 4128 + }, + { + "epoch": 0.53, + "grad_norm": 0.6388396620750427, + "learning_rate": 0.0002390678257084698, + "loss": 3.2583, + "step": 4129 + }, + { + "epoch": 0.53, + "grad_norm": 0.6269456744194031, + "learning_rate": 0.00023896427024551115, + "loss": 3.422, + "step": 4130 + }, + { + "epoch": 0.53, + "grad_norm": 0.6069709062576294, + "learning_rate": 0.0002388607166797224, + "loss": 3.4043, + "step": 4131 + }, + { + "epoch": 0.53, + "grad_norm": 0.6211990118026733, + "learning_rate": 0.00023875716502890556, + "loss": 3.3723, + "step": 4132 + }, + { + "epoch": 0.53, + "grad_norm": 0.596342146396637, + "learning_rate": 0.00023865361531086234, + "loss": 3.4274, + "step": 4133 + }, + { + "epoch": 0.53, + "grad_norm": 0.5964327454566956, + "learning_rate": 0.00023855006754339424, + "loss": 3.3164, + "step": 4134 + }, + { + "epoch": 0.53, + "grad_norm": 0.5660306215286255, + "learning_rate": 0.00023844652174430218, + "loss": 3.4161, + "step": 4135 + }, + { + "epoch": 0.53, + "grad_norm": 0.6334840059280396, + "learning_rate": 0.00023834297793138708, + "loss": 3.3481, + "step": 4136 + }, + { + "epoch": 0.53, + "grad_norm": 0.5846516489982605, + "learning_rate": 0.00023823943612244914, + "loss": 3.4382, + "step": 4137 + }, + { + "epoch": 0.53, + "grad_norm": 0.6413622498512268, + "learning_rate": 0.00023813589633528854, + "loss": 3.3584, + "step": 4138 + }, + { + "epoch": 0.53, + "grad_norm": 0.6269566416740417, + "learning_rate": 0.00023803235858770489, + "loss": 3.3366, + "step": 4139 + }, + { + "epoch": 0.53, + "grad_norm": 0.609159529209137, + "learning_rate": 0.00023792882289749747, + "loss": 3.2685, + "step": 4140 + }, + { + "epoch": 0.53, + "grad_norm": 0.5730066895484924, + "learning_rate": 0.0002378252892824654, + "loss": 3.2286, + "step": 4141 + }, + { + "epoch": 0.53, + "grad_norm": 0.6422752737998962, + "learning_rate": 0.00023772175776040727, + "loss": 3.3926, + "step": 4142 + }, + { + "epoch": 0.53, + "grad_norm": 0.6044661402702332, + "learning_rate": 0.00023761822834912126, + "loss": 3.1972, + "step": 4143 + }, + { + "epoch": 0.53, + "grad_norm": 0.6210265159606934, + "learning_rate": 0.00023751470106640526, + "loss": 3.1652, + "step": 4144 + }, + { + "epoch": 0.53, + "grad_norm": 0.6290984153747559, + "learning_rate": 0.00023741117593005702, + "loss": 3.4232, + "step": 4145 + }, + { + "epoch": 0.53, + "grad_norm": 0.6228407621383667, + "learning_rate": 0.00023730765295787356, + "loss": 3.2965, + "step": 4146 + }, + { + "epoch": 0.53, + "grad_norm": 0.628711462020874, + "learning_rate": 0.00023720413216765166, + "loss": 3.4672, + "step": 4147 + }, + { + "epoch": 0.53, + "grad_norm": 0.6025664806365967, + "learning_rate": 0.00023710061357718783, + "loss": 3.3333, + "step": 4148 + }, + { + "epoch": 0.53, + "grad_norm": 0.613601803779602, + "learning_rate": 0.00023699709720427807, + "loss": 3.3516, + "step": 4149 + }, + { + "epoch": 0.53, + "grad_norm": 0.5990720987319946, + "learning_rate": 0.000236893583066718, + "loss": 3.3035, + "step": 4150 + }, + { + "epoch": 0.53, + "grad_norm": 0.589634358882904, + "learning_rate": 0.000236790071182303, + "loss": 3.4067, + "step": 4151 + }, + { + "epoch": 0.53, + "grad_norm": 0.5927738547325134, + "learning_rate": 0.00023668656156882787, + "loss": 3.4319, + "step": 4152 + }, + { + "epoch": 0.53, + "grad_norm": 0.6159043908119202, + "learning_rate": 0.00023658305424408718, + "loss": 3.4393, + "step": 4153 + }, + { + "epoch": 0.53, + "grad_norm": 0.6087044477462769, + "learning_rate": 0.0002364795492258749, + "loss": 3.3349, + "step": 4154 + }, + { + "epoch": 0.53, + "grad_norm": 0.6531714200973511, + "learning_rate": 0.0002363760465319849, + "loss": 3.3839, + "step": 4155 + }, + { + "epoch": 0.53, + "grad_norm": 0.6283435821533203, + "learning_rate": 0.00023627254618021048, + "loss": 3.337, + "step": 4156 + }, + { + "epoch": 0.53, + "grad_norm": 0.6097371578216553, + "learning_rate": 0.00023616904818834453, + "loss": 3.4198, + "step": 4157 + }, + { + "epoch": 0.53, + "grad_norm": 0.6434276700019836, + "learning_rate": 0.0002360655525741795, + "loss": 3.3535, + "step": 4158 + }, + { + "epoch": 0.53, + "grad_norm": 0.6356515288352966, + "learning_rate": 0.00023596205935550756, + "loss": 3.4611, + "step": 4159 + }, + { + "epoch": 0.53, + "grad_norm": 0.632684051990509, + "learning_rate": 0.00023585856855012037, + "loss": 3.412, + "step": 4160 + }, + { + "epoch": 0.53, + "grad_norm": 0.6770361065864563, + "learning_rate": 0.0002357550801758091, + "loss": 3.377, + "step": 4161 + }, + { + "epoch": 0.53, + "grad_norm": 0.5955840349197388, + "learning_rate": 0.0002356515942503648, + "loss": 3.2534, + "step": 4162 + }, + { + "epoch": 0.53, + "grad_norm": 0.6225008368492126, + "learning_rate": 0.00023554811079157763, + "loss": 3.375, + "step": 4163 + }, + { + "epoch": 0.53, + "grad_norm": 0.6350579261779785, + "learning_rate": 0.00023544462981723788, + "loss": 3.4148, + "step": 4164 + }, + { + "epoch": 0.53, + "grad_norm": 0.622137188911438, + "learning_rate": 0.00023534115134513496, + "loss": 3.3173, + "step": 4165 + }, + { + "epoch": 0.53, + "grad_norm": 0.6317146420478821, + "learning_rate": 0.0002352376753930581, + "loss": 3.2954, + "step": 4166 + }, + { + "epoch": 0.53, + "grad_norm": 0.6163128018379211, + "learning_rate": 0.00023513420197879598, + "loss": 3.2012, + "step": 4167 + }, + { + "epoch": 0.53, + "grad_norm": 0.6241321563720703, + "learning_rate": 0.00023503073112013685, + "loss": 3.3465, + "step": 4168 + }, + { + "epoch": 0.53, + "grad_norm": 0.7008969187736511, + "learning_rate": 0.00023492726283486862, + "loss": 3.5505, + "step": 4169 + }, + { + "epoch": 0.53, + "grad_norm": 0.6041575074195862, + "learning_rate": 0.00023482379714077865, + "loss": 3.4479, + "step": 4170 + }, + { + "epoch": 0.53, + "grad_norm": 0.6186978220939636, + "learning_rate": 0.00023472033405565388, + "loss": 3.275, + "step": 4171 + }, + { + "epoch": 0.53, + "grad_norm": 0.6767882108688354, + "learning_rate": 0.00023461687359728071, + "loss": 3.3668, + "step": 4172 + }, + { + "epoch": 0.53, + "grad_norm": 0.6426897048950195, + "learning_rate": 0.00023451341578344538, + "loss": 3.2808, + "step": 4173 + }, + { + "epoch": 0.53, + "grad_norm": 0.5685768723487854, + "learning_rate": 0.00023440996063193344, + "loss": 3.33, + "step": 4174 + }, + { + "epoch": 0.53, + "grad_norm": 0.6065342426300049, + "learning_rate": 0.00023430650816053, + "loss": 3.378, + "step": 4175 + }, + { + "epoch": 0.53, + "grad_norm": 0.6212944984436035, + "learning_rate": 0.00023420305838701971, + "loss": 3.496, + "step": 4176 + }, + { + "epoch": 0.53, + "grad_norm": 0.6290854811668396, + "learning_rate": 0.00023409961132918688, + "loss": 3.343, + "step": 4177 + }, + { + "epoch": 0.53, + "grad_norm": 0.6344352960586548, + "learning_rate": 0.00023399616700481518, + "loss": 3.4487, + "step": 4178 + }, + { + "epoch": 0.53, + "grad_norm": 0.6236302256584167, + "learning_rate": 0.00023389272543168784, + "loss": 3.4238, + "step": 4179 + }, + { + "epoch": 0.54, + "grad_norm": 0.6058031320571899, + "learning_rate": 0.0002337892866275878, + "loss": 3.3207, + "step": 4180 + }, + { + "epoch": 0.54, + "grad_norm": 0.5936098694801331, + "learning_rate": 0.00023368585061029723, + "loss": 3.3671, + "step": 4181 + }, + { + "epoch": 0.54, + "grad_norm": 0.5796845555305481, + "learning_rate": 0.00023358241739759815, + "loss": 3.2806, + "step": 4182 + }, + { + "epoch": 0.54, + "grad_norm": 0.6267536878585815, + "learning_rate": 0.0002334789870072718, + "loss": 3.3728, + "step": 4183 + }, + { + "epoch": 0.54, + "grad_norm": 0.6423101425170898, + "learning_rate": 0.00023337555945709916, + "loss": 3.281, + "step": 4184 + }, + { + "epoch": 0.54, + "grad_norm": 0.6725943684577942, + "learning_rate": 0.0002332721347648606, + "loss": 3.4129, + "step": 4185 + }, + { + "epoch": 0.54, + "grad_norm": 0.6199285984039307, + "learning_rate": 0.0002331687129483359, + "loss": 3.2585, + "step": 4186 + }, + { + "epoch": 0.54, + "grad_norm": 0.6867667436599731, + "learning_rate": 0.00023306529402530467, + "loss": 3.3677, + "step": 4187 + }, + { + "epoch": 0.54, + "grad_norm": 0.6624594926834106, + "learning_rate": 0.0002329618780135457, + "loss": 3.411, + "step": 4188 + }, + { + "epoch": 0.54, + "grad_norm": 0.6970866918563843, + "learning_rate": 0.00023285846493083736, + "loss": 3.3584, + "step": 4189 + }, + { + "epoch": 0.54, + "grad_norm": 0.6615217924118042, + "learning_rate": 0.00023275505479495768, + "loss": 3.4853, + "step": 4190 + }, + { + "epoch": 0.54, + "grad_norm": 0.6189938187599182, + "learning_rate": 0.0002326516476236839, + "loss": 3.2827, + "step": 4191 + }, + { + "epoch": 0.54, + "grad_norm": 0.6213204264640808, + "learning_rate": 0.00023254824343479314, + "loss": 3.4017, + "step": 4192 + }, + { + "epoch": 0.54, + "grad_norm": 0.6040611267089844, + "learning_rate": 0.0002324448422460616, + "loss": 3.4199, + "step": 4193 + }, + { + "epoch": 0.54, + "grad_norm": 0.6316700577735901, + "learning_rate": 0.0002323414440752652, + "loss": 3.3436, + "step": 4194 + }, + { + "epoch": 0.54, + "grad_norm": 0.5814554691314697, + "learning_rate": 0.0002322380489401793, + "loss": 3.2686, + "step": 4195 + }, + { + "epoch": 0.54, + "grad_norm": 0.6496289968490601, + "learning_rate": 0.0002321346568585787, + "loss": 3.3125, + "step": 4196 + }, + { + "epoch": 0.54, + "grad_norm": 0.5613424777984619, + "learning_rate": 0.00023203126784823765, + "loss": 3.2389, + "step": 4197 + }, + { + "epoch": 0.54, + "grad_norm": 0.6151560544967651, + "learning_rate": 0.00023192788192693002, + "loss": 3.3445, + "step": 4198 + }, + { + "epoch": 0.54, + "grad_norm": 0.6019925475120544, + "learning_rate": 0.000231824499112429, + "loss": 3.1763, + "step": 4199 + }, + { + "epoch": 0.54, + "grad_norm": 0.6324076056480408, + "learning_rate": 0.00023172111942250717, + "loss": 3.2282, + "step": 4200 + }, + { + "epoch": 0.54, + "grad_norm": 0.6262516379356384, + "learning_rate": 0.0002316177428749369, + "loss": 3.4438, + "step": 4201 + }, + { + "epoch": 0.54, + "grad_norm": 0.6653327345848083, + "learning_rate": 0.00023151436948748978, + "loss": 3.3371, + "step": 4202 + }, + { + "epoch": 0.54, + "grad_norm": 0.64223712682724, + "learning_rate": 0.00023141099927793683, + "loss": 3.4776, + "step": 4203 + }, + { + "epoch": 0.54, + "grad_norm": 0.6516994833946228, + "learning_rate": 0.00023130763226404858, + "loss": 3.4086, + "step": 4204 + }, + { + "epoch": 0.54, + "grad_norm": 0.6069992184638977, + "learning_rate": 0.00023120426846359507, + "loss": 3.3633, + "step": 4205 + }, + { + "epoch": 0.54, + "grad_norm": 0.6335489153862, + "learning_rate": 0.00023110090789434573, + "loss": 3.3145, + "step": 4206 + }, + { + "epoch": 0.54, + "grad_norm": 0.636854350566864, + "learning_rate": 0.00023099755057406934, + "loss": 3.3186, + "step": 4207 + }, + { + "epoch": 0.54, + "grad_norm": 0.7066942453384399, + "learning_rate": 0.0002308941965205344, + "loss": 3.4711, + "step": 4208 + }, + { + "epoch": 0.54, + "grad_norm": 0.6394109129905701, + "learning_rate": 0.00023079084575150844, + "loss": 3.2384, + "step": 4209 + }, + { + "epoch": 0.54, + "grad_norm": 0.6655170917510986, + "learning_rate": 0.00023068749828475887, + "loss": 3.3054, + "step": 4210 + }, + { + "epoch": 0.54, + "grad_norm": 0.603547215461731, + "learning_rate": 0.00023058415413805227, + "loss": 3.4559, + "step": 4211 + }, + { + "epoch": 0.54, + "grad_norm": 0.6678892970085144, + "learning_rate": 0.0002304808133291546, + "loss": 3.4117, + "step": 4212 + }, + { + "epoch": 0.54, + "grad_norm": 0.5886514186859131, + "learning_rate": 0.00023037747587583145, + "loss": 3.373, + "step": 4213 + }, + { + "epoch": 0.54, + "grad_norm": 0.615600049495697, + "learning_rate": 0.0002302741417958477, + "loss": 3.3118, + "step": 4214 + }, + { + "epoch": 0.54, + "grad_norm": 0.6184048652648926, + "learning_rate": 0.0002301708111069676, + "loss": 3.3858, + "step": 4215 + }, + { + "epoch": 0.54, + "grad_norm": 0.6221560835838318, + "learning_rate": 0.00023006748382695496, + "loss": 3.4005, + "step": 4216 + }, + { + "epoch": 0.54, + "grad_norm": 0.6305278539657593, + "learning_rate": 0.00022996415997357297, + "loss": 3.31, + "step": 4217 + }, + { + "epoch": 0.54, + "grad_norm": 0.6164804697036743, + "learning_rate": 0.00022986083956458403, + "loss": 3.2852, + "step": 4218 + }, + { + "epoch": 0.54, + "grad_norm": 0.6002761125564575, + "learning_rate": 0.0002297575226177503, + "loss": 3.3573, + "step": 4219 + }, + { + "epoch": 0.54, + "grad_norm": 0.6530587077140808, + "learning_rate": 0.0002296542091508332, + "loss": 3.4403, + "step": 4220 + }, + { + "epoch": 0.54, + "grad_norm": 0.6755840182304382, + "learning_rate": 0.00022955089918159333, + "loss": 3.3594, + "step": 4221 + }, + { + "epoch": 0.54, + "grad_norm": 0.6349380016326904, + "learning_rate": 0.00022944759272779098, + "loss": 3.3848, + "step": 4222 + }, + { + "epoch": 0.54, + "grad_norm": 0.5848196148872375, + "learning_rate": 0.00022934428980718571, + "loss": 3.2475, + "step": 4223 + }, + { + "epoch": 0.54, + "grad_norm": 0.6374369859695435, + "learning_rate": 0.00022924099043753648, + "loss": 3.5159, + "step": 4224 + }, + { + "epoch": 0.54, + "grad_norm": 0.6684466600418091, + "learning_rate": 0.00022913769463660163, + "loss": 3.3954, + "step": 4225 + }, + { + "epoch": 0.54, + "grad_norm": 0.6474562883377075, + "learning_rate": 0.000229034402422139, + "loss": 3.3546, + "step": 4226 + }, + { + "epoch": 0.54, + "grad_norm": 0.5929523706436157, + "learning_rate": 0.00022893111381190563, + "loss": 3.2398, + "step": 4227 + }, + { + "epoch": 0.54, + "grad_norm": 0.586125910282135, + "learning_rate": 0.0002288278288236579, + "loss": 3.2269, + "step": 4228 + }, + { + "epoch": 0.54, + "grad_norm": 0.5903507471084595, + "learning_rate": 0.00022872454747515197, + "loss": 3.2724, + "step": 4229 + }, + { + "epoch": 0.54, + "grad_norm": 0.6329900026321411, + "learning_rate": 0.00022862126978414295, + "loss": 3.3877, + "step": 4230 + }, + { + "epoch": 0.54, + "grad_norm": 0.6692850589752197, + "learning_rate": 0.00022851799576838552, + "loss": 3.3185, + "step": 4231 + }, + { + "epoch": 0.54, + "grad_norm": 0.5712085962295532, + "learning_rate": 0.00022841472544563368, + "loss": 3.2557, + "step": 4232 + }, + { + "epoch": 0.54, + "grad_norm": 0.6429996490478516, + "learning_rate": 0.0002283114588336407, + "loss": 3.2549, + "step": 4233 + }, + { + "epoch": 0.54, + "grad_norm": 0.621189296245575, + "learning_rate": 0.00022820819595015945, + "loss": 3.3142, + "step": 4234 + }, + { + "epoch": 0.54, + "grad_norm": 0.6361286044120789, + "learning_rate": 0.00022810493681294197, + "loss": 3.372, + "step": 4235 + }, + { + "epoch": 0.54, + "grad_norm": 0.6397047638893127, + "learning_rate": 0.0002280016814397396, + "loss": 3.3356, + "step": 4236 + }, + { + "epoch": 0.54, + "grad_norm": 0.5639482736587524, + "learning_rate": 0.00022789842984830317, + "loss": 3.3375, + "step": 4237 + }, + { + "epoch": 0.54, + "grad_norm": 0.5667389631271362, + "learning_rate": 0.00022779518205638297, + "loss": 3.2657, + "step": 4238 + }, + { + "epoch": 0.54, + "grad_norm": 0.6697838306427002, + "learning_rate": 0.00022769193808172843, + "loss": 3.4399, + "step": 4239 + }, + { + "epoch": 0.54, + "grad_norm": 0.634571373462677, + "learning_rate": 0.00022758869794208824, + "loss": 3.2491, + "step": 4240 + }, + { + "epoch": 0.54, + "grad_norm": 0.6573125123977661, + "learning_rate": 0.00022748546165521079, + "loss": 3.4672, + "step": 4241 + }, + { + "epoch": 0.54, + "grad_norm": 0.6508284211158752, + "learning_rate": 0.00022738222923884347, + "loss": 3.2154, + "step": 4242 + }, + { + "epoch": 0.54, + "grad_norm": 0.6569792628288269, + "learning_rate": 0.00022727900071073308, + "loss": 3.375, + "step": 4243 + }, + { + "epoch": 0.54, + "grad_norm": 0.6264458894729614, + "learning_rate": 0.00022717577608862596, + "loss": 3.4025, + "step": 4244 + }, + { + "epoch": 0.54, + "grad_norm": 0.6183469891548157, + "learning_rate": 0.00022707255539026753, + "loss": 3.4382, + "step": 4245 + }, + { + "epoch": 0.54, + "grad_norm": 0.6360318660736084, + "learning_rate": 0.0002269693386334025, + "loss": 3.3828, + "step": 4246 + }, + { + "epoch": 0.54, + "grad_norm": 0.5953824520111084, + "learning_rate": 0.00022686612583577525, + "loss": 3.3895, + "step": 4247 + }, + { + "epoch": 0.54, + "grad_norm": 0.6206636428833008, + "learning_rate": 0.00022676291701512912, + "loss": 3.3062, + "step": 4248 + }, + { + "epoch": 0.54, + "grad_norm": 0.6012082695960999, + "learning_rate": 0.000226659712189207, + "loss": 3.2807, + "step": 4249 + }, + { + "epoch": 0.54, + "grad_norm": 0.56572026014328, + "learning_rate": 0.00022655651137575095, + "loss": 3.3347, + "step": 4250 + }, + { + "epoch": 0.54, + "grad_norm": 0.6169419884681702, + "learning_rate": 0.00022645331459250233, + "loss": 3.4329, + "step": 4251 + }, + { + "epoch": 0.54, + "grad_norm": 0.6697790622711182, + "learning_rate": 0.00022635012185720193, + "loss": 3.3228, + "step": 4252 + }, + { + "epoch": 0.54, + "grad_norm": 0.5939697623252869, + "learning_rate": 0.00022624693318758977, + "loss": 3.4995, + "step": 4253 + }, + { + "epoch": 0.54, + "grad_norm": 0.6752704381942749, + "learning_rate": 0.00022614374860140511, + "loss": 3.3671, + "step": 4254 + }, + { + "epoch": 0.54, + "grad_norm": 0.5698899030685425, + "learning_rate": 0.00022604056811638656, + "loss": 3.3342, + "step": 4255 + }, + { + "epoch": 0.54, + "grad_norm": 0.6277320981025696, + "learning_rate": 0.00022593739175027222, + "loss": 3.4509, + "step": 4256 + }, + { + "epoch": 0.54, + "grad_norm": 0.6074933409690857, + "learning_rate": 0.00022583421952079925, + "loss": 3.3508, + "step": 4257 + }, + { + "epoch": 0.55, + "grad_norm": 0.6025525331497192, + "learning_rate": 0.000225731051445704, + "loss": 3.3106, + "step": 4258 + }, + { + "epoch": 0.55, + "grad_norm": 0.6493105292320251, + "learning_rate": 0.0002256278875427224, + "loss": 3.4314, + "step": 4259 + }, + { + "epoch": 0.55, + "grad_norm": 0.6410335302352905, + "learning_rate": 0.0002255247278295895, + "loss": 3.4247, + "step": 4260 + }, + { + "epoch": 0.55, + "grad_norm": 0.604914665222168, + "learning_rate": 0.00022542157232403957, + "loss": 3.3722, + "step": 4261 + }, + { + "epoch": 0.55, + "grad_norm": 0.6296116709709167, + "learning_rate": 0.00022531842104380633, + "loss": 3.316, + "step": 4262 + }, + { + "epoch": 0.55, + "grad_norm": 0.660125732421875, + "learning_rate": 0.00022521527400662267, + "loss": 3.3388, + "step": 4263 + }, + { + "epoch": 0.55, + "grad_norm": 0.5822790861129761, + "learning_rate": 0.00022511213123022067, + "loss": 3.2431, + "step": 4264 + }, + { + "epoch": 0.55, + "grad_norm": 0.6079729199409485, + "learning_rate": 0.00022500899273233184, + "loss": 3.4239, + "step": 4265 + }, + { + "epoch": 0.55, + "grad_norm": 0.6291740536689758, + "learning_rate": 0.00022490585853068688, + "loss": 3.3707, + "step": 4266 + }, + { + "epoch": 0.55, + "grad_norm": 0.6137354969978333, + "learning_rate": 0.00022480272864301582, + "loss": 3.3738, + "step": 4267 + }, + { + "epoch": 0.55, + "grad_norm": 0.6171749234199524, + "learning_rate": 0.0002246996030870478, + "loss": 3.5086, + "step": 4268 + }, + { + "epoch": 0.55, + "grad_norm": 0.6187096238136292, + "learning_rate": 0.00022459648188051127, + "loss": 3.3847, + "step": 4269 + }, + { + "epoch": 0.55, + "grad_norm": 0.5911290049552917, + "learning_rate": 0.00022449336504113405, + "loss": 3.3809, + "step": 4270 + }, + { + "epoch": 0.55, + "grad_norm": 0.6221075057983398, + "learning_rate": 0.0002243902525866431, + "loss": 3.3216, + "step": 4271 + }, + { + "epoch": 0.55, + "grad_norm": 0.6293851137161255, + "learning_rate": 0.00022428714453476457, + "loss": 3.3067, + "step": 4272 + }, + { + "epoch": 0.55, + "grad_norm": 0.603039562702179, + "learning_rate": 0.00022418404090322403, + "loss": 3.3561, + "step": 4273 + }, + { + "epoch": 0.55, + "grad_norm": 0.5857105255126953, + "learning_rate": 0.00022408094170974603, + "loss": 3.4122, + "step": 4274 + }, + { + "epoch": 0.55, + "grad_norm": 0.5771323442459106, + "learning_rate": 0.00022397784697205473, + "loss": 3.3669, + "step": 4275 + }, + { + "epoch": 0.55, + "grad_norm": 0.61953204870224, + "learning_rate": 0.00022387475670787317, + "loss": 3.1603, + "step": 4276 + }, + { + "epoch": 0.55, + "grad_norm": 0.6458093523979187, + "learning_rate": 0.00022377167093492385, + "loss": 3.4174, + "step": 4277 + }, + { + "epoch": 0.55, + "grad_norm": 0.6441712379455566, + "learning_rate": 0.00022366858967092835, + "loss": 3.3808, + "step": 4278 + }, + { + "epoch": 0.55, + "grad_norm": 0.6296495795249939, + "learning_rate": 0.0002235655129336075, + "loss": 3.3364, + "step": 4279 + }, + { + "epoch": 0.55, + "grad_norm": 0.6566582918167114, + "learning_rate": 0.0002234624407406815, + "loss": 3.3708, + "step": 4280 + }, + { + "epoch": 0.55, + "grad_norm": 0.5854193568229675, + "learning_rate": 0.0002233593731098696, + "loss": 3.1429, + "step": 4281 + }, + { + "epoch": 0.55, + "grad_norm": 0.9514264464378357, + "learning_rate": 0.00022325631005889023, + "loss": 3.3458, + "step": 4282 + }, + { + "epoch": 0.55, + "grad_norm": 0.584960401058197, + "learning_rate": 0.00022315325160546118, + "loss": 3.2647, + "step": 4283 + }, + { + "epoch": 0.55, + "grad_norm": 0.636075496673584, + "learning_rate": 0.00022305019776729942, + "loss": 3.2481, + "step": 4284 + }, + { + "epoch": 0.55, + "grad_norm": 0.6747125387191772, + "learning_rate": 0.00022294714856212116, + "loss": 3.3448, + "step": 4285 + }, + { + "epoch": 0.55, + "grad_norm": 0.6393190026283264, + "learning_rate": 0.0002228441040076417, + "loss": 3.3644, + "step": 4286 + }, + { + "epoch": 0.55, + "grad_norm": 0.6362060904502869, + "learning_rate": 0.00022274106412157552, + "loss": 3.389, + "step": 4287 + }, + { + "epoch": 0.55, + "grad_norm": 0.6304023265838623, + "learning_rate": 0.00022263802892163645, + "loss": 3.3914, + "step": 4288 + }, + { + "epoch": 0.55, + "grad_norm": 0.6571694016456604, + "learning_rate": 0.00022253499842553746, + "loss": 3.4138, + "step": 4289 + }, + { + "epoch": 0.55, + "grad_norm": 0.6255675554275513, + "learning_rate": 0.00022243197265099058, + "loss": 3.3106, + "step": 4290 + }, + { + "epoch": 0.55, + "grad_norm": 0.6484401822090149, + "learning_rate": 0.00022232895161570723, + "loss": 3.4181, + "step": 4291 + }, + { + "epoch": 0.55, + "grad_norm": 0.626513659954071, + "learning_rate": 0.0002222259353373978, + "loss": 3.3272, + "step": 4292 + }, + { + "epoch": 0.55, + "grad_norm": 0.6421791911125183, + "learning_rate": 0.00022212292383377215, + "loss": 3.2945, + "step": 4293 + }, + { + "epoch": 0.55, + "grad_norm": 0.64449143409729, + "learning_rate": 0.00022201991712253905, + "loss": 3.2933, + "step": 4294 + }, + { + "epoch": 0.55, + "grad_norm": 0.6221674084663391, + "learning_rate": 0.0002219169152214066, + "loss": 3.3385, + "step": 4295 + }, + { + "epoch": 0.55, + "grad_norm": 0.6388044953346252, + "learning_rate": 0.000221813918148082, + "loss": 3.3818, + "step": 4296 + }, + { + "epoch": 0.55, + "grad_norm": 0.6350899934768677, + "learning_rate": 0.00022171092592027157, + "loss": 3.3079, + "step": 4297 + }, + { + "epoch": 0.55, + "grad_norm": 0.6287100315093994, + "learning_rate": 0.00022160793855568098, + "loss": 3.4988, + "step": 4298 + }, + { + "epoch": 0.55, + "grad_norm": 0.6106211543083191, + "learning_rate": 0.00022150495607201493, + "loss": 3.3329, + "step": 4299 + }, + { + "epoch": 0.55, + "grad_norm": 0.5845887660980225, + "learning_rate": 0.00022140197848697718, + "loss": 3.2069, + "step": 4300 + }, + { + "epoch": 0.55, + "grad_norm": 0.6291304230690002, + "learning_rate": 0.00022129900581827094, + "loss": 3.3692, + "step": 4301 + }, + { + "epoch": 0.55, + "grad_norm": 0.5868272185325623, + "learning_rate": 0.00022119603808359823, + "loss": 3.3427, + "step": 4302 + }, + { + "epoch": 0.55, + "grad_norm": 0.6013039350509644, + "learning_rate": 0.00022109307530066062, + "loss": 3.444, + "step": 4303 + }, + { + "epoch": 0.55, + "grad_norm": 0.6117579340934753, + "learning_rate": 0.00022099011748715844, + "loss": 3.3554, + "step": 4304 + }, + { + "epoch": 0.55, + "grad_norm": 0.6590527296066284, + "learning_rate": 0.00022088716466079134, + "loss": 3.4342, + "step": 4305 + }, + { + "epoch": 0.55, + "grad_norm": 0.5997858643531799, + "learning_rate": 0.0002207842168392582, + "loss": 3.3164, + "step": 4306 + }, + { + "epoch": 0.55, + "grad_norm": 0.5858597755432129, + "learning_rate": 0.0002206812740402569, + "loss": 3.2571, + "step": 4307 + }, + { + "epoch": 0.55, + "grad_norm": 0.644085168838501, + "learning_rate": 0.0002205783362814844, + "loss": 3.2653, + "step": 4308 + }, + { + "epoch": 0.55, + "grad_norm": 0.6230582594871521, + "learning_rate": 0.00022047540358063707, + "loss": 3.3099, + "step": 4309 + }, + { + "epoch": 0.55, + "grad_norm": 0.6548083424568176, + "learning_rate": 0.0002203724759554101, + "loss": 3.2571, + "step": 4310 + }, + { + "epoch": 0.55, + "grad_norm": 0.6090371012687683, + "learning_rate": 0.00022026955342349788, + "loss": 3.314, + "step": 4311 + }, + { + "epoch": 0.55, + "grad_norm": 0.572574257850647, + "learning_rate": 0.00022016663600259417, + "loss": 3.4233, + "step": 4312 + }, + { + "epoch": 0.55, + "grad_norm": 0.6050710082054138, + "learning_rate": 0.00022006372371039163, + "loss": 3.3334, + "step": 4313 + }, + { + "epoch": 0.55, + "grad_norm": 0.6001432538032532, + "learning_rate": 0.00021996081656458204, + "loss": 3.3587, + "step": 4314 + }, + { + "epoch": 0.55, + "grad_norm": 0.5992408990859985, + "learning_rate": 0.00021985791458285626, + "loss": 3.4552, + "step": 4315 + }, + { + "epoch": 0.55, + "grad_norm": 0.6127637624740601, + "learning_rate": 0.00021975501778290446, + "loss": 3.3865, + "step": 4316 + }, + { + "epoch": 0.55, + "grad_norm": 0.6167628169059753, + "learning_rate": 0.00021965212618241576, + "loss": 3.3978, + "step": 4317 + }, + { + "epoch": 0.55, + "grad_norm": 0.6233075261116028, + "learning_rate": 0.0002195492397990783, + "loss": 3.4385, + "step": 4318 + }, + { + "epoch": 0.55, + "grad_norm": 0.5850648880004883, + "learning_rate": 0.0002194463586505796, + "loss": 3.2513, + "step": 4319 + }, + { + "epoch": 0.55, + "grad_norm": 0.6017858386039734, + "learning_rate": 0.00021934348275460597, + "loss": 3.3278, + "step": 4320 + }, + { + "epoch": 0.55, + "grad_norm": 0.6295514702796936, + "learning_rate": 0.00021924061212884313, + "loss": 3.3648, + "step": 4321 + }, + { + "epoch": 0.55, + "grad_norm": 0.6282049417495728, + "learning_rate": 0.00021913774679097568, + "loss": 3.3844, + "step": 4322 + }, + { + "epoch": 0.55, + "grad_norm": 0.5880584716796875, + "learning_rate": 0.00021903488675868726, + "loss": 3.2539, + "step": 4323 + }, + { + "epoch": 0.55, + "grad_norm": 0.6382170915603638, + "learning_rate": 0.00021893203204966088, + "loss": 3.4829, + "step": 4324 + }, + { + "epoch": 0.55, + "grad_norm": 0.6000595092773438, + "learning_rate": 0.00021882918268157834, + "loss": 3.3531, + "step": 4325 + }, + { + "epoch": 0.55, + "grad_norm": 0.6396212577819824, + "learning_rate": 0.0002187263386721206, + "loss": 3.3567, + "step": 4326 + }, + { + "epoch": 0.55, + "grad_norm": 0.6351012587547302, + "learning_rate": 0.00021862350003896787, + "loss": 3.2683, + "step": 4327 + }, + { + "epoch": 0.55, + "grad_norm": 0.6076213717460632, + "learning_rate": 0.00021852066679979923, + "loss": 3.3865, + "step": 4328 + }, + { + "epoch": 0.55, + "grad_norm": 0.5674775838851929, + "learning_rate": 0.00021841783897229278, + "loss": 3.2943, + "step": 4329 + }, + { + "epoch": 0.55, + "grad_norm": 0.6063193082809448, + "learning_rate": 0.000218315016574126, + "loss": 3.3139, + "step": 4330 + }, + { + "epoch": 0.55, + "grad_norm": 0.5930963754653931, + "learning_rate": 0.00021821219962297529, + "loss": 3.2054, + "step": 4331 + }, + { + "epoch": 0.55, + "grad_norm": 0.6369614601135254, + "learning_rate": 0.00021810938813651592, + "loss": 3.3401, + "step": 4332 + }, + { + "epoch": 0.55, + "grad_norm": 0.638029932975769, + "learning_rate": 0.00021800658213242243, + "loss": 3.3599, + "step": 4333 + }, + { + "epoch": 0.55, + "grad_norm": 0.6605929732322693, + "learning_rate": 0.00021790378162836837, + "loss": 3.3288, + "step": 4334 + }, + { + "epoch": 0.55, + "grad_norm": 0.6818827986717224, + "learning_rate": 0.0002178009866420264, + "loss": 3.3761, + "step": 4335 + }, + { + "epoch": 0.56, + "grad_norm": 0.60908442735672, + "learning_rate": 0.000217698197191068, + "loss": 3.3528, + "step": 4336 + }, + { + "epoch": 0.56, + "grad_norm": 0.594711422920227, + "learning_rate": 0.00021759541329316408, + "loss": 3.3716, + "step": 4337 + }, + { + "epoch": 0.56, + "grad_norm": 0.6263545155525208, + "learning_rate": 0.00021749263496598426, + "loss": 3.433, + "step": 4338 + }, + { + "epoch": 0.56, + "grad_norm": 0.6035946011543274, + "learning_rate": 0.00021738986222719723, + "loss": 3.3038, + "step": 4339 + }, + { + "epoch": 0.56, + "grad_norm": 0.6282978057861328, + "learning_rate": 0.00021728709509447102, + "loss": 3.3346, + "step": 4340 + }, + { + "epoch": 0.56, + "grad_norm": 0.5939093232154846, + "learning_rate": 0.0002171843335854724, + "loss": 3.2278, + "step": 4341 + }, + { + "epoch": 0.56, + "grad_norm": 0.6777181029319763, + "learning_rate": 0.00021708157771786732, + "loss": 3.4534, + "step": 4342 + }, + { + "epoch": 0.56, + "grad_norm": 0.6568247675895691, + "learning_rate": 0.00021697882750932064, + "loss": 3.4082, + "step": 4343 + }, + { + "epoch": 0.56, + "grad_norm": 0.5942575335502625, + "learning_rate": 0.00021687608297749625, + "loss": 3.2439, + "step": 4344 + }, + { + "epoch": 0.56, + "grad_norm": 0.6373928189277649, + "learning_rate": 0.0002167733441400573, + "loss": 3.2647, + "step": 4345 + }, + { + "epoch": 0.56, + "grad_norm": 0.6113847494125366, + "learning_rate": 0.00021667061101466565, + "loss": 3.3904, + "step": 4346 + }, + { + "epoch": 0.56, + "grad_norm": 0.6256155967712402, + "learning_rate": 0.0002165678836189823, + "loss": 3.2756, + "step": 4347 + }, + { + "epoch": 0.56, + "grad_norm": 0.6141061782836914, + "learning_rate": 0.0002164651619706673, + "loss": 3.3737, + "step": 4348 + }, + { + "epoch": 0.56, + "grad_norm": 0.5906838774681091, + "learning_rate": 0.00021636244608737982, + "loss": 3.2511, + "step": 4349 + }, + { + "epoch": 0.56, + "grad_norm": 0.6241053938865662, + "learning_rate": 0.00021625973598677785, + "loss": 3.3067, + "step": 4350 + }, + { + "epoch": 0.56, + "grad_norm": 0.6333268880844116, + "learning_rate": 0.00021615703168651832, + "loss": 3.4568, + "step": 4351 + }, + { + "epoch": 0.56, + "grad_norm": 0.6115931868553162, + "learning_rate": 0.00021605433320425743, + "loss": 3.321, + "step": 4352 + }, + { + "epoch": 0.56, + "grad_norm": 0.589478611946106, + "learning_rate": 0.00021595164055765022, + "loss": 3.2818, + "step": 4353 + }, + { + "epoch": 0.56, + "grad_norm": 0.6029682159423828, + "learning_rate": 0.00021584895376435068, + "loss": 3.2876, + "step": 4354 + }, + { + "epoch": 0.56, + "grad_norm": 0.5721759796142578, + "learning_rate": 0.00021574627284201193, + "loss": 3.3056, + "step": 4355 + }, + { + "epoch": 0.56, + "grad_norm": 0.6214892864227295, + "learning_rate": 0.00021564359780828598, + "loss": 3.2567, + "step": 4356 + }, + { + "epoch": 0.56, + "grad_norm": 0.6280972957611084, + "learning_rate": 0.0002155409286808238, + "loss": 3.446, + "step": 4357 + }, + { + "epoch": 0.56, + "grad_norm": 0.6329858303070068, + "learning_rate": 0.00021543826547727545, + "loss": 3.2714, + "step": 4358 + }, + { + "epoch": 0.56, + "grad_norm": 0.5748723745346069, + "learning_rate": 0.00021533560821529002, + "loss": 3.2944, + "step": 4359 + }, + { + "epoch": 0.56, + "grad_norm": 0.6321572065353394, + "learning_rate": 0.00021523295691251544, + "loss": 3.3506, + "step": 4360 + }, + { + "epoch": 0.56, + "grad_norm": 0.5913103818893433, + "learning_rate": 0.00021513031158659852, + "loss": 3.4111, + "step": 4361 + }, + { + "epoch": 0.56, + "grad_norm": 0.7028201818466187, + "learning_rate": 0.00021502767225518532, + "loss": 3.3399, + "step": 4362 + }, + { + "epoch": 0.56, + "grad_norm": 0.5699731707572937, + "learning_rate": 0.00021492503893592074, + "loss": 3.3424, + "step": 4363 + }, + { + "epoch": 0.56, + "grad_norm": 0.6355714201927185, + "learning_rate": 0.00021482241164644855, + "loss": 3.3267, + "step": 4364 + }, + { + "epoch": 0.56, + "grad_norm": 0.6434081196784973, + "learning_rate": 0.0002147197904044116, + "loss": 3.2562, + "step": 4365 + }, + { + "epoch": 0.56, + "grad_norm": 0.6165071725845337, + "learning_rate": 0.00021461717522745162, + "loss": 3.4498, + "step": 4366 + }, + { + "epoch": 0.56, + "grad_norm": 0.6426162123680115, + "learning_rate": 0.0002145145661332095, + "loss": 3.3448, + "step": 4367 + }, + { + "epoch": 0.56, + "grad_norm": 0.6269713044166565, + "learning_rate": 0.00021441196313932485, + "loss": 3.352, + "step": 4368 + }, + { + "epoch": 0.56, + "grad_norm": 0.5989927649497986, + "learning_rate": 0.00021430936626343626, + "loss": 3.2152, + "step": 4369 + }, + { + "epoch": 0.56, + "grad_norm": 0.586698055267334, + "learning_rate": 0.00021420677552318141, + "loss": 3.4072, + "step": 4370 + }, + { + "epoch": 0.56, + "grad_norm": 0.5867027640342712, + "learning_rate": 0.00021410419093619682, + "loss": 3.2013, + "step": 4371 + }, + { + "epoch": 0.56, + "grad_norm": 0.6716873049736023, + "learning_rate": 0.00021400161252011786, + "loss": 3.325, + "step": 4372 + }, + { + "epoch": 0.56, + "grad_norm": 0.6525076031684875, + "learning_rate": 0.00021389904029257912, + "loss": 3.4317, + "step": 4373 + }, + { + "epoch": 0.56, + "grad_norm": 0.6117497682571411, + "learning_rate": 0.00021379647427121387, + "loss": 3.3695, + "step": 4374 + }, + { + "epoch": 0.56, + "grad_norm": 0.6429139971733093, + "learning_rate": 0.00021369391447365437, + "loss": 3.3836, + "step": 4375 + }, + { + "epoch": 0.56, + "grad_norm": 0.5918415784835815, + "learning_rate": 0.00021359136091753176, + "loss": 3.2832, + "step": 4376 + }, + { + "epoch": 0.56, + "grad_norm": 0.6165708899497986, + "learning_rate": 0.00021348881362047643, + "loss": 3.2793, + "step": 4377 + }, + { + "epoch": 0.56, + "grad_norm": 0.6108337044715881, + "learning_rate": 0.00021338627260011732, + "loss": 3.297, + "step": 4378 + }, + { + "epoch": 0.56, + "grad_norm": 0.6316553354263306, + "learning_rate": 0.00021328373787408235, + "loss": 3.4442, + "step": 4379 + }, + { + "epoch": 0.56, + "grad_norm": 0.594231903553009, + "learning_rate": 0.00021318120945999853, + "loss": 3.3557, + "step": 4380 + }, + { + "epoch": 0.56, + "grad_norm": 0.6255188584327698, + "learning_rate": 0.00021307868737549166, + "loss": 3.2208, + "step": 4381 + }, + { + "epoch": 0.56, + "grad_norm": 0.5767228007316589, + "learning_rate": 0.00021297617163818639, + "loss": 3.2629, + "step": 4382 + }, + { + "epoch": 0.56, + "grad_norm": 0.6460601687431335, + "learning_rate": 0.0002128736622657065, + "loss": 3.2394, + "step": 4383 + }, + { + "epoch": 0.56, + "grad_norm": 0.6205249428749084, + "learning_rate": 0.00021277115927567446, + "loss": 3.2985, + "step": 4384 + }, + { + "epoch": 0.56, + "grad_norm": 0.6857031583786011, + "learning_rate": 0.00021266866268571168, + "loss": 3.3526, + "step": 4385 + }, + { + "epoch": 0.56, + "grad_norm": 0.6057483553886414, + "learning_rate": 0.00021256617251343862, + "loss": 3.361, + "step": 4386 + }, + { + "epoch": 0.56, + "grad_norm": 0.6175010800361633, + "learning_rate": 0.00021246368877647442, + "loss": 3.2765, + "step": 4387 + }, + { + "epoch": 0.56, + "grad_norm": 0.5943877696990967, + "learning_rate": 0.00021236121149243733, + "loss": 3.3107, + "step": 4388 + }, + { + "epoch": 0.56, + "grad_norm": 0.6185916066169739, + "learning_rate": 0.00021225874067894434, + "loss": 3.1669, + "step": 4389 + }, + { + "epoch": 0.56, + "grad_norm": 0.6013955473899841, + "learning_rate": 0.00021215627635361135, + "loss": 3.374, + "step": 4390 + }, + { + "epoch": 0.56, + "grad_norm": 0.5989227890968323, + "learning_rate": 0.00021205381853405317, + "loss": 3.4375, + "step": 4391 + }, + { + "epoch": 0.56, + "grad_norm": 0.6267585754394531, + "learning_rate": 0.0002119513672378835, + "loss": 3.2624, + "step": 4392 + }, + { + "epoch": 0.56, + "grad_norm": 0.635187029838562, + "learning_rate": 0.00021184892248271489, + "loss": 3.2997, + "step": 4393 + }, + { + "epoch": 0.56, + "grad_norm": 0.6231619715690613, + "learning_rate": 0.0002117464842861587, + "loss": 3.4194, + "step": 4394 + }, + { + "epoch": 0.56, + "grad_norm": 0.5973530411720276, + "learning_rate": 0.00021164405266582546, + "loss": 3.4433, + "step": 4395 + }, + { + "epoch": 0.56, + "grad_norm": 0.5918546319007874, + "learning_rate": 0.0002115416276393242, + "loss": 3.2967, + "step": 4396 + }, + { + "epoch": 0.56, + "grad_norm": 0.6279117465019226, + "learning_rate": 0.00021143920922426298, + "loss": 3.3549, + "step": 4397 + }, + { + "epoch": 0.56, + "grad_norm": 0.6497898101806641, + "learning_rate": 0.00021133679743824877, + "loss": 3.517, + "step": 4398 + }, + { + "epoch": 0.56, + "grad_norm": 0.6653177738189697, + "learning_rate": 0.0002112343922988873, + "loss": 3.369, + "step": 4399 + }, + { + "epoch": 0.56, + "grad_norm": 0.6002407670021057, + "learning_rate": 0.00021113199382378312, + "loss": 3.3906, + "step": 4400 + }, + { + "epoch": 0.56, + "grad_norm": 0.6072937846183777, + "learning_rate": 0.0002110296020305399, + "loss": 3.2641, + "step": 4401 + }, + { + "epoch": 0.56, + "grad_norm": 0.621131420135498, + "learning_rate": 0.00021092721693675984, + "loss": 3.3461, + "step": 4402 + }, + { + "epoch": 0.56, + "grad_norm": 0.6232835054397583, + "learning_rate": 0.00021082483856004405, + "loss": 3.4475, + "step": 4403 + }, + { + "epoch": 0.56, + "grad_norm": 0.6901021003723145, + "learning_rate": 0.0002107224669179928, + "loss": 3.3947, + "step": 4404 + }, + { + "epoch": 0.56, + "grad_norm": 0.5754798650741577, + "learning_rate": 0.00021062010202820477, + "loss": 3.2069, + "step": 4405 + }, + { + "epoch": 0.56, + "grad_norm": 0.6407782435417175, + "learning_rate": 0.00021051774390827777, + "loss": 3.26, + "step": 4406 + }, + { + "epoch": 0.56, + "grad_norm": 0.6157494187355042, + "learning_rate": 0.00021041539257580832, + "loss": 3.3776, + "step": 4407 + }, + { + "epoch": 0.56, + "grad_norm": 0.6090334057807922, + "learning_rate": 0.00021031304804839177, + "loss": 3.2972, + "step": 4408 + }, + { + "epoch": 0.56, + "grad_norm": 0.6406842470169067, + "learning_rate": 0.0002102107103436224, + "loss": 3.3999, + "step": 4409 + }, + { + "epoch": 0.56, + "grad_norm": 0.6397127509117126, + "learning_rate": 0.00021010837947909314, + "loss": 3.2272, + "step": 4410 + }, + { + "epoch": 0.56, + "grad_norm": 0.5834441184997559, + "learning_rate": 0.0002100060554723959, + "loss": 3.3716, + "step": 4411 + }, + { + "epoch": 0.56, + "grad_norm": 0.6270722150802612, + "learning_rate": 0.00020990373834112142, + "loss": 3.3023, + "step": 4412 + }, + { + "epoch": 0.56, + "grad_norm": 0.6306389570236206, + "learning_rate": 0.00020980142810285904, + "loss": 3.3696, + "step": 4413 + }, + { + "epoch": 0.56, + "grad_norm": 0.5938227772712708, + "learning_rate": 0.00020969912477519732, + "loss": 3.3412, + "step": 4414 + }, + { + "epoch": 0.57, + "grad_norm": 0.6692408323287964, + "learning_rate": 0.00020959682837572318, + "loss": 3.4345, + "step": 4415 + }, + { + "epoch": 0.57, + "grad_norm": 0.6993062496185303, + "learning_rate": 0.0002094945389220227, + "loss": 3.2873, + "step": 4416 + }, + { + "epoch": 0.57, + "grad_norm": 0.62058424949646, + "learning_rate": 0.00020939225643168055, + "loss": 3.2635, + "step": 4417 + }, + { + "epoch": 0.57, + "grad_norm": 0.6588623523712158, + "learning_rate": 0.00020928998092228023, + "loss": 3.2024, + "step": 4418 + }, + { + "epoch": 0.57, + "grad_norm": 0.6288304328918457, + "learning_rate": 0.00020918771241140423, + "loss": 3.3743, + "step": 4419 + }, + { + "epoch": 0.57, + "grad_norm": 0.6205514669418335, + "learning_rate": 0.00020908545091663356, + "loss": 3.3437, + "step": 4420 + }, + { + "epoch": 0.57, + "grad_norm": 0.5885340571403503, + "learning_rate": 0.00020898319645554816, + "loss": 3.2602, + "step": 4421 + }, + { + "epoch": 0.57, + "grad_norm": 0.5855981707572937, + "learning_rate": 0.0002088809490457268, + "loss": 3.2926, + "step": 4422 + }, + { + "epoch": 0.57, + "grad_norm": 0.623570442199707, + "learning_rate": 0.000208778708704747, + "loss": 3.2314, + "step": 4423 + }, + { + "epoch": 0.57, + "grad_norm": 0.6036929488182068, + "learning_rate": 0.0002086764754501851, + "loss": 3.4226, + "step": 4424 + }, + { + "epoch": 0.57, + "grad_norm": 0.6300530433654785, + "learning_rate": 0.00020857424929961613, + "loss": 3.3704, + "step": 4425 + }, + { + "epoch": 0.57, + "grad_norm": 0.5847938060760498, + "learning_rate": 0.0002084720302706139, + "loss": 3.3486, + "step": 4426 + }, + { + "epoch": 0.57, + "grad_norm": 0.6898181438446045, + "learning_rate": 0.00020836981838075113, + "loss": 3.4942, + "step": 4427 + }, + { + "epoch": 0.57, + "grad_norm": 0.5924224257469177, + "learning_rate": 0.00020826761364759925, + "loss": 3.3576, + "step": 4428 + }, + { + "epoch": 0.57, + "grad_norm": 0.6309162974357605, + "learning_rate": 0.0002081654160887283, + "loss": 3.3284, + "step": 4429 + }, + { + "epoch": 0.57, + "grad_norm": 0.5869278907775879, + "learning_rate": 0.0002080632257217074, + "loss": 3.3286, + "step": 4430 + }, + { + "epoch": 0.57, + "grad_norm": 0.6758648157119751, + "learning_rate": 0.000207961042564104, + "loss": 3.4192, + "step": 4431 + }, + { + "epoch": 0.57, + "grad_norm": 0.6503282189369202, + "learning_rate": 0.00020785886663348492, + "loss": 3.4772, + "step": 4432 + }, + { + "epoch": 0.57, + "grad_norm": 0.6114601492881775, + "learning_rate": 0.00020775669794741508, + "loss": 3.3737, + "step": 4433 + }, + { + "epoch": 0.57, + "grad_norm": 0.6376871466636658, + "learning_rate": 0.0002076545365234587, + "loss": 3.2028, + "step": 4434 + }, + { + "epoch": 0.57, + "grad_norm": 0.6289983987808228, + "learning_rate": 0.0002075523823791784, + "loss": 3.4294, + "step": 4435 + }, + { + "epoch": 0.57, + "grad_norm": 0.6352116465568542, + "learning_rate": 0.00020745023553213557, + "loss": 3.2161, + "step": 4436 + }, + { + "epoch": 0.57, + "grad_norm": 0.5930891633033752, + "learning_rate": 0.00020734809599989062, + "loss": 3.2802, + "step": 4437 + }, + { + "epoch": 0.57, + "grad_norm": 0.6394661664962769, + "learning_rate": 0.0002072459638000024, + "loss": 3.3941, + "step": 4438 + }, + { + "epoch": 0.57, + "grad_norm": 0.6554189920425415, + "learning_rate": 0.00020714383895002863, + "loss": 3.4242, + "step": 4439 + }, + { + "epoch": 0.57, + "grad_norm": 0.6214420795440674, + "learning_rate": 0.00020704172146752576, + "loss": 3.3585, + "step": 4440 + }, + { + "epoch": 0.57, + "grad_norm": 0.5942106246948242, + "learning_rate": 0.000206939611370049, + "loss": 3.3558, + "step": 4441 + }, + { + "epoch": 0.57, + "grad_norm": 0.5977739691734314, + "learning_rate": 0.00020683750867515226, + "loss": 3.343, + "step": 4442 + }, + { + "epoch": 0.57, + "grad_norm": 0.6262596249580383, + "learning_rate": 0.0002067354134003882, + "loss": 3.5039, + "step": 4443 + }, + { + "epoch": 0.57, + "grad_norm": 0.6008438467979431, + "learning_rate": 0.00020663332556330807, + "loss": 3.2965, + "step": 4444 + }, + { + "epoch": 0.57, + "grad_norm": 0.6184266209602356, + "learning_rate": 0.0002065312451814621, + "loss": 3.3596, + "step": 4445 + }, + { + "epoch": 0.57, + "grad_norm": 0.6176638603210449, + "learning_rate": 0.00020642917227239898, + "loss": 3.2312, + "step": 4446 + }, + { + "epoch": 0.57, + "grad_norm": 0.599486768245697, + "learning_rate": 0.00020632710685366623, + "loss": 3.3167, + "step": 4447 + }, + { + "epoch": 0.57, + "grad_norm": 0.5841536521911621, + "learning_rate": 0.00020622504894281018, + "loss": 3.4116, + "step": 4448 + }, + { + "epoch": 0.57, + "grad_norm": 0.62180495262146, + "learning_rate": 0.0002061229985573757, + "loss": 3.3403, + "step": 4449 + }, + { + "epoch": 0.57, + "grad_norm": 0.5866765975952148, + "learning_rate": 0.0002060209557149063, + "loss": 3.3545, + "step": 4450 + }, + { + "epoch": 0.57, + "grad_norm": 0.5751224160194397, + "learning_rate": 0.00020591892043294452, + "loss": 3.3138, + "step": 4451 + }, + { + "epoch": 0.57, + "grad_norm": 0.621759295463562, + "learning_rate": 0.00020581689272903143, + "loss": 3.4496, + "step": 4452 + }, + { + "epoch": 0.57, + "grad_norm": 0.6080735921859741, + "learning_rate": 0.00020571487262070664, + "loss": 3.2598, + "step": 4453 + }, + { + "epoch": 0.57, + "grad_norm": 0.6722729802131653, + "learning_rate": 0.00020561286012550864, + "loss": 3.3551, + "step": 4454 + }, + { + "epoch": 0.57, + "grad_norm": 0.6033018827438354, + "learning_rate": 0.0002055108552609746, + "loss": 3.2812, + "step": 4455 + }, + { + "epoch": 0.57, + "grad_norm": 0.6231434345245361, + "learning_rate": 0.00020540885804464033, + "loss": 3.3449, + "step": 4456 + }, + { + "epoch": 0.57, + "grad_norm": 0.5670124292373657, + "learning_rate": 0.0002053068684940402, + "loss": 3.1999, + "step": 4457 + }, + { + "epoch": 0.57, + "grad_norm": 0.6172345280647278, + "learning_rate": 0.0002052048866267076, + "loss": 3.2949, + "step": 4458 + }, + { + "epoch": 0.57, + "grad_norm": 0.6621881127357483, + "learning_rate": 0.00020510291246017415, + "loss": 3.4038, + "step": 4459 + }, + { + "epoch": 0.57, + "grad_norm": 0.5925076007843018, + "learning_rate": 0.0002050009460119707, + "loss": 3.3718, + "step": 4460 + }, + { + "epoch": 0.57, + "grad_norm": 0.6103377342224121, + "learning_rate": 0.00020489898729962627, + "loss": 3.455, + "step": 4461 + }, + { + "epoch": 0.57, + "grad_norm": 0.6944873332977295, + "learning_rate": 0.00020479703634066873, + "loss": 3.2554, + "step": 4462 + }, + { + "epoch": 0.57, + "grad_norm": 0.6063147783279419, + "learning_rate": 0.00020469509315262474, + "loss": 3.3391, + "step": 4463 + }, + { + "epoch": 0.57, + "grad_norm": 0.5826002359390259, + "learning_rate": 0.00020459315775301945, + "loss": 3.357, + "step": 4464 + }, + { + "epoch": 0.57, + "grad_norm": 0.5819864273071289, + "learning_rate": 0.0002044912301593767, + "loss": 3.3449, + "step": 4465 + }, + { + "epoch": 0.57, + "grad_norm": 0.6028755307197571, + "learning_rate": 0.00020438931038921913, + "loss": 3.4394, + "step": 4466 + }, + { + "epoch": 0.57, + "grad_norm": 0.597665548324585, + "learning_rate": 0.00020428739846006783, + "loss": 3.3657, + "step": 4467 + }, + { + "epoch": 0.57, + "grad_norm": 0.6453766822814941, + "learning_rate": 0.00020418549438944262, + "loss": 3.2768, + "step": 4468 + }, + { + "epoch": 0.57, + "grad_norm": 0.6111141443252563, + "learning_rate": 0.00020408359819486206, + "loss": 3.3182, + "step": 4469 + }, + { + "epoch": 0.57, + "grad_norm": 0.6410272717475891, + "learning_rate": 0.00020398170989384336, + "loss": 3.2668, + "step": 4470 + }, + { + "epoch": 0.57, + "grad_norm": 0.6027970314025879, + "learning_rate": 0.00020387982950390222, + "loss": 3.2543, + "step": 4471 + }, + { + "epoch": 0.57, + "grad_norm": 0.5735467076301575, + "learning_rate": 0.000203777957042553, + "loss": 3.2246, + "step": 4472 + }, + { + "epoch": 0.57, + "grad_norm": 0.6231405735015869, + "learning_rate": 0.00020367609252730886, + "loss": 3.5919, + "step": 4473 + }, + { + "epoch": 0.57, + "grad_norm": 0.6022270917892456, + "learning_rate": 0.00020357423597568147, + "loss": 3.3906, + "step": 4474 + }, + { + "epoch": 0.57, + "grad_norm": 0.6529914736747742, + "learning_rate": 0.00020347238740518107, + "loss": 3.2762, + "step": 4475 + }, + { + "epoch": 0.57, + "grad_norm": 0.5402228236198425, + "learning_rate": 0.00020337054683331672, + "loss": 3.2418, + "step": 4476 + }, + { + "epoch": 0.57, + "grad_norm": 0.6032045483589172, + "learning_rate": 0.00020326871427759583, + "loss": 3.2748, + "step": 4477 + }, + { + "epoch": 0.57, + "grad_norm": 0.6211556196212769, + "learning_rate": 0.00020316688975552483, + "loss": 3.2738, + "step": 4478 + }, + { + "epoch": 0.57, + "grad_norm": 0.587247908115387, + "learning_rate": 0.00020306507328460838, + "loss": 3.3226, + "step": 4479 + }, + { + "epoch": 0.57, + "grad_norm": 0.6140707731246948, + "learning_rate": 0.00020296326488234996, + "loss": 3.3092, + "step": 4480 + }, + { + "epoch": 0.57, + "grad_norm": 0.6291319131851196, + "learning_rate": 0.0002028614645662516, + "loss": 3.403, + "step": 4481 + }, + { + "epoch": 0.57, + "grad_norm": 0.6498053669929504, + "learning_rate": 0.00020275967235381398, + "loss": 3.3862, + "step": 4482 + }, + { + "epoch": 0.57, + "grad_norm": 0.5758363604545593, + "learning_rate": 0.00020265788826253627, + "loss": 3.3229, + "step": 4483 + }, + { + "epoch": 0.57, + "grad_norm": 0.6611259579658508, + "learning_rate": 0.0002025561123099165, + "loss": 3.2656, + "step": 4484 + }, + { + "epoch": 0.57, + "grad_norm": 0.6483119130134583, + "learning_rate": 0.00020245434451345102, + "loss": 3.274, + "step": 4485 + }, + { + "epoch": 0.57, + "grad_norm": 0.580224871635437, + "learning_rate": 0.00020235258489063486, + "loss": 3.3803, + "step": 4486 + }, + { + "epoch": 0.57, + "grad_norm": 0.6108076572418213, + "learning_rate": 0.00020225083345896163, + "loss": 3.33, + "step": 4487 + }, + { + "epoch": 0.57, + "grad_norm": 0.599166214466095, + "learning_rate": 0.00020214909023592387, + "loss": 3.2939, + "step": 4488 + }, + { + "epoch": 0.57, + "grad_norm": 0.6520143747329712, + "learning_rate": 0.00020204735523901218, + "loss": 3.3922, + "step": 4489 + }, + { + "epoch": 0.57, + "grad_norm": 0.6151868104934692, + "learning_rate": 0.000201945628485716, + "loss": 3.3768, + "step": 4490 + }, + { + "epoch": 0.57, + "grad_norm": 0.6231768131256104, + "learning_rate": 0.00020184390999352346, + "loss": 3.2957, + "step": 4491 + }, + { + "epoch": 0.57, + "grad_norm": 0.5855582356452942, + "learning_rate": 0.00020174219977992102, + "loss": 3.3735, + "step": 4492 + }, + { + "epoch": 0.58, + "grad_norm": 0.6440154314041138, + "learning_rate": 0.00020164049786239386, + "loss": 3.3244, + "step": 4493 + }, + { + "epoch": 0.58, + "grad_norm": 0.6473847031593323, + "learning_rate": 0.0002015388042584258, + "loss": 3.261, + "step": 4494 + }, + { + "epoch": 0.58, + "grad_norm": 0.6261402368545532, + "learning_rate": 0.0002014371189854991, + "loss": 3.3561, + "step": 4495 + }, + { + "epoch": 0.58, + "grad_norm": 0.6150282025337219, + "learning_rate": 0.0002013354420610945, + "loss": 3.2941, + "step": 4496 + }, + { + "epoch": 0.58, + "grad_norm": 0.6966652870178223, + "learning_rate": 0.00020123377350269176, + "loss": 3.4082, + "step": 4497 + }, + { + "epoch": 0.58, + "grad_norm": 0.6378828883171082, + "learning_rate": 0.0002011321133277686, + "loss": 3.465, + "step": 4498 + }, + { + "epoch": 0.58, + "grad_norm": 0.6646527051925659, + "learning_rate": 0.00020103046155380173, + "loss": 3.4109, + "step": 4499 + }, + { + "epoch": 0.58, + "grad_norm": 0.5820347666740417, + "learning_rate": 0.00020092881819826623, + "loss": 3.2323, + "step": 4500 + }, + { + "epoch": 0.58, + "grad_norm": 0.6171141266822815, + "learning_rate": 0.0002008271832786357, + "loss": 3.3112, + "step": 4501 + }, + { + "epoch": 0.58, + "grad_norm": 0.5929138660430908, + "learning_rate": 0.0002007255568123825, + "loss": 3.3975, + "step": 4502 + }, + { + "epoch": 0.58, + "grad_norm": 0.6141952276229858, + "learning_rate": 0.00020062393881697732, + "loss": 3.281, + "step": 4503 + }, + { + "epoch": 0.58, + "grad_norm": 0.5851192474365234, + "learning_rate": 0.0002005223293098894, + "loss": 3.4033, + "step": 4504 + }, + { + "epoch": 0.58, + "grad_norm": 0.6415019631385803, + "learning_rate": 0.00020042072830858663, + "loss": 3.3742, + "step": 4505 + }, + { + "epoch": 0.58, + "grad_norm": 0.5388271808624268, + "learning_rate": 0.0002003191358305355, + "loss": 3.4478, + "step": 4506 + }, + { + "epoch": 0.58, + "grad_norm": 0.6231642961502075, + "learning_rate": 0.00020021755189320096, + "loss": 3.3979, + "step": 4507 + }, + { + "epoch": 0.58, + "grad_norm": 0.726058304309845, + "learning_rate": 0.00020011597651404625, + "loss": 3.2798, + "step": 4508 + }, + { + "epoch": 0.58, + "grad_norm": 0.5680627822875977, + "learning_rate": 0.00020001440971053355, + "loss": 3.2948, + "step": 4509 + }, + { + "epoch": 0.58, + "grad_norm": 0.618209183216095, + "learning_rate": 0.00019991285150012332, + "loss": 3.3348, + "step": 4510 + }, + { + "epoch": 0.58, + "grad_norm": 0.5917927026748657, + "learning_rate": 0.00019981130190027452, + "loss": 3.3388, + "step": 4511 + }, + { + "epoch": 0.58, + "grad_norm": 0.6505729556083679, + "learning_rate": 0.0001997097609284448, + "loss": 3.263, + "step": 4512 + }, + { + "epoch": 0.58, + "grad_norm": 0.6598019003868103, + "learning_rate": 0.0001996082286020902, + "loss": 3.3134, + "step": 4513 + }, + { + "epoch": 0.58, + "grad_norm": 0.6504053473472595, + "learning_rate": 0.00019950670493866518, + "loss": 3.4686, + "step": 4514 + }, + { + "epoch": 0.58, + "grad_norm": 0.6098240613937378, + "learning_rate": 0.00019940518995562306, + "loss": 3.4063, + "step": 4515 + }, + { + "epoch": 0.58, + "grad_norm": 0.677740216255188, + "learning_rate": 0.0001993036836704153, + "loss": 3.3569, + "step": 4516 + }, + { + "epoch": 0.58, + "grad_norm": 0.5991215705871582, + "learning_rate": 0.00019920218610049205, + "loss": 3.277, + "step": 4517 + }, + { + "epoch": 0.58, + "grad_norm": 0.6191042065620422, + "learning_rate": 0.00019910069726330196, + "loss": 3.5172, + "step": 4518 + }, + { + "epoch": 0.58, + "grad_norm": 0.5997809171676636, + "learning_rate": 0.000198999217176292, + "loss": 3.3544, + "step": 4519 + }, + { + "epoch": 0.58, + "grad_norm": 0.6340522170066833, + "learning_rate": 0.00019889774585690794, + "loss": 3.2665, + "step": 4520 + }, + { + "epoch": 0.58, + "grad_norm": 0.5887564420700073, + "learning_rate": 0.00019879628332259376, + "loss": 3.2665, + "step": 4521 + }, + { + "epoch": 0.58, + "grad_norm": 0.6379936337471008, + "learning_rate": 0.00019869482959079205, + "loss": 3.3426, + "step": 4522 + }, + { + "epoch": 0.58, + "grad_norm": 0.6129528880119324, + "learning_rate": 0.00019859338467894395, + "loss": 3.3432, + "step": 4523 + }, + { + "epoch": 0.58, + "grad_norm": 0.6111044883728027, + "learning_rate": 0.00019849194860448887, + "loss": 3.3171, + "step": 4524 + }, + { + "epoch": 0.58, + "grad_norm": 0.6117994785308838, + "learning_rate": 0.00019839052138486508, + "loss": 3.3349, + "step": 4525 + }, + { + "epoch": 0.58, + "grad_norm": 0.6637835502624512, + "learning_rate": 0.0001982891030375089, + "loss": 3.2723, + "step": 4526 + }, + { + "epoch": 0.58, + "grad_norm": 0.6433278322219849, + "learning_rate": 0.00019818769357985547, + "loss": 3.2436, + "step": 4527 + }, + { + "epoch": 0.58, + "grad_norm": 0.6748256087303162, + "learning_rate": 0.00019808629302933817, + "loss": 3.4467, + "step": 4528 + }, + { + "epoch": 0.58, + "grad_norm": 0.6236575245857239, + "learning_rate": 0.00019798490140338887, + "loss": 3.3715, + "step": 4529 + }, + { + "epoch": 0.58, + "grad_norm": 0.6512519121170044, + "learning_rate": 0.0001978835187194381, + "loss": 3.4483, + "step": 4530 + }, + { + "epoch": 0.58, + "grad_norm": 0.5926584601402283, + "learning_rate": 0.00019778214499491462, + "loss": 3.319, + "step": 4531 + }, + { + "epoch": 0.58, + "grad_norm": 0.7325878739356995, + "learning_rate": 0.00019768078024724576, + "loss": 3.3067, + "step": 4532 + }, + { + "epoch": 0.58, + "grad_norm": 0.8272165060043335, + "learning_rate": 0.00019757942449385725, + "loss": 3.4562, + "step": 4533 + }, + { + "epoch": 0.58, + "grad_norm": 0.6049686670303345, + "learning_rate": 0.00019747807775217344, + "loss": 3.4144, + "step": 4534 + }, + { + "epoch": 0.58, + "grad_norm": 0.5436964631080627, + "learning_rate": 0.000197376740039617, + "loss": 3.347, + "step": 4535 + }, + { + "epoch": 0.58, + "grad_norm": 0.5583860874176025, + "learning_rate": 0.000197275411373609, + "loss": 3.3251, + "step": 4536 + }, + { + "epoch": 0.58, + "grad_norm": 0.5891790986061096, + "learning_rate": 0.00019717409177156893, + "loss": 3.262, + "step": 4537 + }, + { + "epoch": 0.58, + "grad_norm": 0.6078890562057495, + "learning_rate": 0.000197072781250915, + "loss": 3.2676, + "step": 4538 + }, + { + "epoch": 0.58, + "grad_norm": 0.5872478485107422, + "learning_rate": 0.00019697147982906355, + "loss": 3.332, + "step": 4539 + }, + { + "epoch": 0.58, + "grad_norm": 0.6866099238395691, + "learning_rate": 0.00019687018752342938, + "loss": 3.1723, + "step": 4540 + }, + { + "epoch": 0.58, + "grad_norm": 0.5853214263916016, + "learning_rate": 0.00019676890435142602, + "loss": 3.3885, + "step": 4541 + }, + { + "epoch": 0.58, + "grad_norm": 0.6210541129112244, + "learning_rate": 0.00019666763033046497, + "loss": 3.3431, + "step": 4542 + }, + { + "epoch": 0.58, + "grad_norm": 0.6124565601348877, + "learning_rate": 0.00019656636547795664, + "loss": 3.3899, + "step": 4543 + }, + { + "epoch": 0.58, + "grad_norm": 0.6130770444869995, + "learning_rate": 0.0001964651098113095, + "loss": 3.293, + "step": 4544 + }, + { + "epoch": 0.58, + "grad_norm": 0.6430736184120178, + "learning_rate": 0.0001963638633479307, + "loss": 3.3275, + "step": 4545 + }, + { + "epoch": 0.58, + "grad_norm": 0.5826516151428223, + "learning_rate": 0.00019626262610522558, + "loss": 3.3288, + "step": 4546 + }, + { + "epoch": 0.58, + "grad_norm": 0.6000692248344421, + "learning_rate": 0.00019616139810059793, + "loss": 3.4573, + "step": 4547 + }, + { + "epoch": 0.58, + "grad_norm": 0.6481806635856628, + "learning_rate": 0.00019606017935145018, + "loss": 3.2728, + "step": 4548 + }, + { + "epoch": 0.58, + "grad_norm": 0.6128011345863342, + "learning_rate": 0.00019595896987518292, + "loss": 3.4553, + "step": 4549 + }, + { + "epoch": 0.58, + "grad_norm": 0.6444360613822937, + "learning_rate": 0.00019585776968919516, + "loss": 3.284, + "step": 4550 + }, + { + "epoch": 0.58, + "grad_norm": 0.5891768336296082, + "learning_rate": 0.0001957565788108844, + "loss": 3.224, + "step": 4551 + }, + { + "epoch": 0.58, + "grad_norm": 0.6095583438873291, + "learning_rate": 0.0001956553972576467, + "loss": 3.3322, + "step": 4552 + }, + { + "epoch": 0.58, + "grad_norm": 0.5875852704048157, + "learning_rate": 0.00019555422504687625, + "loss": 3.3107, + "step": 4553 + }, + { + "epoch": 0.58, + "grad_norm": 0.6559893488883972, + "learning_rate": 0.00019545306219596564, + "loss": 3.391, + "step": 4554 + }, + { + "epoch": 0.58, + "grad_norm": 0.6290282011032104, + "learning_rate": 0.00019535190872230608, + "loss": 3.339, + "step": 4555 + }, + { + "epoch": 0.58, + "grad_norm": 0.5857229232788086, + "learning_rate": 0.0001952507646432869, + "loss": 3.4297, + "step": 4556 + }, + { + "epoch": 0.58, + "grad_norm": 0.6085899472236633, + "learning_rate": 0.00019514962997629602, + "loss": 3.2916, + "step": 4557 + }, + { + "epoch": 0.58, + "grad_norm": 0.6261692643165588, + "learning_rate": 0.00019504850473871954, + "loss": 3.2903, + "step": 4558 + }, + { + "epoch": 0.58, + "grad_norm": 0.8973466753959656, + "learning_rate": 0.00019494738894794222, + "loss": 3.3536, + "step": 4559 + }, + { + "epoch": 0.58, + "grad_norm": 0.6741258502006531, + "learning_rate": 0.00019484628262134696, + "loss": 3.4425, + "step": 4560 + }, + { + "epoch": 0.58, + "grad_norm": 0.6239340305328369, + "learning_rate": 0.00019474518577631503, + "loss": 3.2413, + "step": 4561 + }, + { + "epoch": 0.58, + "grad_norm": 0.6356580257415771, + "learning_rate": 0.00019464409843022627, + "loss": 3.366, + "step": 4562 + }, + { + "epoch": 0.58, + "grad_norm": 0.5412483811378479, + "learning_rate": 0.00019454302060045877, + "loss": 3.2558, + "step": 4563 + }, + { + "epoch": 0.58, + "grad_norm": 0.6215941309928894, + "learning_rate": 0.00019444195230438897, + "loss": 3.3711, + "step": 4564 + }, + { + "epoch": 0.58, + "grad_norm": 0.6361460089683533, + "learning_rate": 0.00019434089355939159, + "loss": 3.257, + "step": 4565 + }, + { + "epoch": 0.58, + "grad_norm": 0.6052807569503784, + "learning_rate": 0.0001942398443828399, + "loss": 3.3148, + "step": 4566 + }, + { + "epoch": 0.58, + "grad_norm": 0.5979077219963074, + "learning_rate": 0.00019413880479210538, + "loss": 3.3381, + "step": 4567 + }, + { + "epoch": 0.58, + "grad_norm": 0.631401777267456, + "learning_rate": 0.0001940377748045579, + "loss": 3.3188, + "step": 4568 + }, + { + "epoch": 0.58, + "grad_norm": 0.6828151345252991, + "learning_rate": 0.00019393675443756572, + "loss": 3.3572, + "step": 4569 + }, + { + "epoch": 0.58, + "grad_norm": 0.6513455510139465, + "learning_rate": 0.00019383574370849532, + "loss": 3.4064, + "step": 4570 + }, + { + "epoch": 0.59, + "grad_norm": 0.6232062578201294, + "learning_rate": 0.00019373474263471177, + "loss": 3.3425, + "step": 4571 + }, + { + "epoch": 0.59, + "grad_norm": 0.5913916230201721, + "learning_rate": 0.0001936337512335782, + "loss": 3.2361, + "step": 4572 + }, + { + "epoch": 0.59, + "grad_norm": 0.6242142915725708, + "learning_rate": 0.00019353276952245628, + "loss": 3.3271, + "step": 4573 + }, + { + "epoch": 0.59, + "grad_norm": 0.5948387980461121, + "learning_rate": 0.00019343179751870593, + "loss": 3.2691, + "step": 4574 + }, + { + "epoch": 0.59, + "grad_norm": 0.6828065514564514, + "learning_rate": 0.0001933308352396853, + "loss": 3.258, + "step": 4575 + }, + { + "epoch": 0.59, + "grad_norm": 0.5905175805091858, + "learning_rate": 0.00019322988270275115, + "loss": 3.2377, + "step": 4576 + }, + { + "epoch": 0.59, + "grad_norm": 0.6073306202888489, + "learning_rate": 0.00019312893992525827, + "loss": 3.3904, + "step": 4577 + }, + { + "epoch": 0.59, + "grad_norm": 0.6556875705718994, + "learning_rate": 0.00019302800692455995, + "loss": 3.3669, + "step": 4578 + }, + { + "epoch": 0.59, + "grad_norm": 0.609798014163971, + "learning_rate": 0.00019292708371800756, + "loss": 3.3693, + "step": 4579 + }, + { + "epoch": 0.59, + "grad_norm": 0.6277830600738525, + "learning_rate": 0.00019282617032295122, + "loss": 3.2681, + "step": 4580 + }, + { + "epoch": 0.59, + "grad_norm": 0.6359354853630066, + "learning_rate": 0.00019272526675673908, + "loss": 3.2995, + "step": 4581 + }, + { + "epoch": 0.59, + "grad_norm": 0.6259021759033203, + "learning_rate": 0.00019262437303671753, + "loss": 3.2703, + "step": 4582 + }, + { + "epoch": 0.59, + "grad_norm": 0.6437623500823975, + "learning_rate": 0.00019252348918023137, + "loss": 3.3741, + "step": 4583 + }, + { + "epoch": 0.59, + "grad_norm": 0.5862172245979309, + "learning_rate": 0.00019242261520462382, + "loss": 3.2279, + "step": 4584 + }, + { + "epoch": 0.59, + "grad_norm": 0.5978466868400574, + "learning_rate": 0.0001923217511272362, + "loss": 3.2782, + "step": 4585 + }, + { + "epoch": 0.59, + "grad_norm": 0.6125479936599731, + "learning_rate": 0.00019222089696540814, + "loss": 3.3482, + "step": 4586 + }, + { + "epoch": 0.59, + "grad_norm": 0.6221272945404053, + "learning_rate": 0.0001921200527364778, + "loss": 3.4018, + "step": 4587 + }, + { + "epoch": 0.59, + "grad_norm": 0.5960904359817505, + "learning_rate": 0.0001920192184577813, + "loss": 3.2547, + "step": 4588 + }, + { + "epoch": 0.59, + "grad_norm": 0.6198006272315979, + "learning_rate": 0.0001919183941466534, + "loss": 3.4055, + "step": 4589 + }, + { + "epoch": 0.59, + "grad_norm": 0.6370771527290344, + "learning_rate": 0.00019181757982042686, + "loss": 3.3125, + "step": 4590 + }, + { + "epoch": 0.59, + "grad_norm": 0.6158603429794312, + "learning_rate": 0.00019171677549643292, + "loss": 3.2231, + "step": 4591 + }, + { + "epoch": 0.59, + "grad_norm": 0.6988177299499512, + "learning_rate": 0.0001916159811920009, + "loss": 3.305, + "step": 4592 + }, + { + "epoch": 0.59, + "grad_norm": 0.5879431962966919, + "learning_rate": 0.00019151519692445858, + "loss": 3.3562, + "step": 4593 + }, + { + "epoch": 0.59, + "grad_norm": 0.6362794637680054, + "learning_rate": 0.0001914144227111319, + "loss": 3.2479, + "step": 4594 + }, + { + "epoch": 0.59, + "grad_norm": 0.6738634705543518, + "learning_rate": 0.0001913136585693452, + "loss": 3.2917, + "step": 4595 + }, + { + "epoch": 0.59, + "grad_norm": 0.6603161096572876, + "learning_rate": 0.0001912129045164209, + "loss": 3.3998, + "step": 4596 + }, + { + "epoch": 0.59, + "grad_norm": 0.6091164946556091, + "learning_rate": 0.0001911121605696798, + "loss": 3.3063, + "step": 4597 + }, + { + "epoch": 0.59, + "grad_norm": 0.6261993050575256, + "learning_rate": 0.0001910114267464409, + "loss": 3.3927, + "step": 4598 + }, + { + "epoch": 0.59, + "grad_norm": 0.6107027530670166, + "learning_rate": 0.00019091070306402174, + "loss": 3.2642, + "step": 4599 + }, + { + "epoch": 0.59, + "grad_norm": 0.6385813355445862, + "learning_rate": 0.0001908099895397377, + "loss": 3.3738, + "step": 4600 + }, + { + "epoch": 0.59, + "grad_norm": 0.6593822240829468, + "learning_rate": 0.00019070928619090266, + "loss": 3.3662, + "step": 4601 + }, + { + "epoch": 0.59, + "grad_norm": 0.6259279251098633, + "learning_rate": 0.0001906085930348287, + "loss": 3.3263, + "step": 4602 + }, + { + "epoch": 0.59, + "grad_norm": 0.6008145809173584, + "learning_rate": 0.00019050791008882613, + "loss": 3.1958, + "step": 4603 + }, + { + "epoch": 0.59, + "grad_norm": 0.6144819855690002, + "learning_rate": 0.00019040723737020343, + "loss": 3.2219, + "step": 4604 + }, + { + "epoch": 0.59, + "grad_norm": 0.6299683451652527, + "learning_rate": 0.00019030657489626753, + "loss": 3.2164, + "step": 4605 + }, + { + "epoch": 0.59, + "grad_norm": 0.5852930545806885, + "learning_rate": 0.00019020592268432346, + "loss": 3.2383, + "step": 4606 + }, + { + "epoch": 0.59, + "grad_norm": 0.6271327137947083, + "learning_rate": 0.0001901052807516744, + "loss": 3.5819, + "step": 4607 + }, + { + "epoch": 0.59, + "grad_norm": 0.6168107986450195, + "learning_rate": 0.00019000464911562192, + "loss": 3.2793, + "step": 4608 + }, + { + "epoch": 0.59, + "grad_norm": 0.6508430242538452, + "learning_rate": 0.0001899040277934659, + "loss": 3.2548, + "step": 4609 + }, + { + "epoch": 0.59, + "grad_norm": 0.5964395999908447, + "learning_rate": 0.00018980341680250413, + "loss": 3.335, + "step": 4610 + }, + { + "epoch": 0.59, + "grad_norm": 0.5980967879295349, + "learning_rate": 0.00018970281616003285, + "loss": 3.2761, + "step": 4611 + }, + { + "epoch": 0.59, + "grad_norm": 0.6565948724746704, + "learning_rate": 0.00018960222588334656, + "loss": 3.3041, + "step": 4612 + }, + { + "epoch": 0.59, + "grad_norm": 0.569970965385437, + "learning_rate": 0.00018950164598973782, + "loss": 3.2828, + "step": 4613 + }, + { + "epoch": 0.59, + "grad_norm": 0.6044802069664001, + "learning_rate": 0.00018940107649649743, + "loss": 3.3015, + "step": 4614 + }, + { + "epoch": 0.59, + "grad_norm": 0.6506989598274231, + "learning_rate": 0.0001893005174209146, + "loss": 3.3676, + "step": 4615 + }, + { + "epoch": 0.59, + "grad_norm": 0.5685774087905884, + "learning_rate": 0.00018919996878027635, + "loss": 3.3147, + "step": 4616 + }, + { + "epoch": 0.59, + "grad_norm": 0.6195859313011169, + "learning_rate": 0.00018909943059186845, + "loss": 3.4735, + "step": 4617 + }, + { + "epoch": 0.59, + "grad_norm": 0.6199584007263184, + "learning_rate": 0.0001889989028729745, + "loss": 3.3806, + "step": 4618 + }, + { + "epoch": 0.59, + "grad_norm": 0.6059188842773438, + "learning_rate": 0.00018889838564087623, + "loss": 3.3009, + "step": 4619 + }, + { + "epoch": 0.59, + "grad_norm": 0.6111743450164795, + "learning_rate": 0.0001887978789128539, + "loss": 3.2618, + "step": 4620 + }, + { + "epoch": 0.59, + "grad_norm": 0.6045678853988647, + "learning_rate": 0.00018869738270618566, + "loss": 3.3107, + "step": 4621 + }, + { + "epoch": 0.59, + "grad_norm": 0.6162798404693604, + "learning_rate": 0.00018859689703814797, + "loss": 3.3628, + "step": 4622 + }, + { + "epoch": 0.59, + "grad_norm": 0.5983816981315613, + "learning_rate": 0.0001884964219260156, + "loss": 3.3064, + "step": 4623 + }, + { + "epoch": 0.59, + "grad_norm": 0.6370041966438293, + "learning_rate": 0.0001883959573870613, + "loss": 3.491, + "step": 4624 + }, + { + "epoch": 0.59, + "grad_norm": 0.6961913704872131, + "learning_rate": 0.000188295503438556, + "loss": 3.3276, + "step": 4625 + }, + { + "epoch": 0.59, + "grad_norm": 0.6662081480026245, + "learning_rate": 0.00018819506009776904, + "loss": 3.3432, + "step": 4626 + }, + { + "epoch": 0.59, + "grad_norm": 0.589362621307373, + "learning_rate": 0.00018809462738196783, + "loss": 3.3602, + "step": 4627 + }, + { + "epoch": 0.59, + "grad_norm": 0.8106807470321655, + "learning_rate": 0.0001879942053084178, + "loss": 3.4204, + "step": 4628 + }, + { + "epoch": 0.59, + "grad_norm": 0.6647022366523743, + "learning_rate": 0.00018789379389438273, + "loss": 3.3072, + "step": 4629 + }, + { + "epoch": 0.59, + "grad_norm": 0.6003039479255676, + "learning_rate": 0.0001877933931571245, + "loss": 3.3546, + "step": 4630 + }, + { + "epoch": 0.59, + "grad_norm": 0.6438256502151489, + "learning_rate": 0.00018769300311390313, + "loss": 3.3693, + "step": 4631 + }, + { + "epoch": 0.59, + "grad_norm": 0.6219691038131714, + "learning_rate": 0.0001875926237819768, + "loss": 3.3282, + "step": 4632 + }, + { + "epoch": 0.59, + "grad_norm": 0.5815635323524475, + "learning_rate": 0.000187492255178602, + "loss": 3.4142, + "step": 4633 + }, + { + "epoch": 0.59, + "grad_norm": 0.6395319700241089, + "learning_rate": 0.00018739189732103317, + "loss": 3.2867, + "step": 4634 + }, + { + "epoch": 0.59, + "grad_norm": 0.6617757081985474, + "learning_rate": 0.00018729155022652292, + "loss": 3.3896, + "step": 4635 + }, + { + "epoch": 0.59, + "grad_norm": 0.5858076810836792, + "learning_rate": 0.00018719121391232225, + "loss": 3.3204, + "step": 4636 + }, + { + "epoch": 0.59, + "grad_norm": 0.660744845867157, + "learning_rate": 0.00018709088839568, + "loss": 3.2615, + "step": 4637 + }, + { + "epoch": 0.59, + "grad_norm": 0.6023468375205994, + "learning_rate": 0.00018699057369384343, + "loss": 3.2328, + "step": 4638 + }, + { + "epoch": 0.59, + "grad_norm": 0.593385636806488, + "learning_rate": 0.00018689026982405765, + "loss": 3.3062, + "step": 4639 + }, + { + "epoch": 0.59, + "grad_norm": 0.6836429238319397, + "learning_rate": 0.0001867899768035661, + "loss": 3.3262, + "step": 4640 + }, + { + "epoch": 0.59, + "grad_norm": 0.5973769426345825, + "learning_rate": 0.00018668969464961038, + "loss": 3.2149, + "step": 4641 + }, + { + "epoch": 0.59, + "grad_norm": 0.5919034481048584, + "learning_rate": 0.00018658942337943012, + "loss": 3.2584, + "step": 4642 + }, + { + "epoch": 0.59, + "grad_norm": 0.5940399169921875, + "learning_rate": 0.00018648916301026303, + "loss": 3.2865, + "step": 4643 + }, + { + "epoch": 0.59, + "grad_norm": 0.5559197664260864, + "learning_rate": 0.00018638891355934504, + "loss": 3.3067, + "step": 4644 + }, + { + "epoch": 0.59, + "grad_norm": 0.5843505263328552, + "learning_rate": 0.00018628867504391037, + "loss": 3.4005, + "step": 4645 + }, + { + "epoch": 0.59, + "grad_norm": 0.5966205596923828, + "learning_rate": 0.00018618844748119107, + "loss": 3.3431, + "step": 4646 + }, + { + "epoch": 0.59, + "grad_norm": 0.6731165647506714, + "learning_rate": 0.00018608823088841732, + "loss": 3.3499, + "step": 4647 + }, + { + "epoch": 0.59, + "grad_norm": 0.6583471298217773, + "learning_rate": 0.00018598802528281772, + "loss": 3.4808, + "step": 4648 + }, + { + "epoch": 0.6, + "grad_norm": 0.6014115214347839, + "learning_rate": 0.00018588783068161863, + "loss": 3.3479, + "step": 4649 + }, + { + "epoch": 0.6, + "grad_norm": 0.621559202671051, + "learning_rate": 0.00018578764710204467, + "loss": 3.451, + "step": 4650 + }, + { + "epoch": 0.6, + "grad_norm": 0.6271108984947205, + "learning_rate": 0.0001856874745613186, + "loss": 3.392, + "step": 4651 + }, + { + "epoch": 0.6, + "grad_norm": 0.6189903020858765, + "learning_rate": 0.00018558731307666127, + "loss": 3.3413, + "step": 4652 + }, + { + "epoch": 0.6, + "grad_norm": 0.6160193681716919, + "learning_rate": 0.00018548716266529141, + "loss": 3.171, + "step": 4653 + }, + { + "epoch": 0.6, + "grad_norm": 0.5830660462379456, + "learning_rate": 0.00018538702334442632, + "loss": 3.2925, + "step": 4654 + }, + { + "epoch": 0.6, + "grad_norm": 0.6012463569641113, + "learning_rate": 0.00018528689513128092, + "loss": 3.2814, + "step": 4655 + }, + { + "epoch": 0.6, + "grad_norm": 0.6324112415313721, + "learning_rate": 0.0001851867780430685, + "loss": 3.392, + "step": 4656 + }, + { + "epoch": 0.6, + "grad_norm": 0.5771598219871521, + "learning_rate": 0.00018508667209700034, + "loss": 3.368, + "step": 4657 + }, + { + "epoch": 0.6, + "grad_norm": 0.6254698634147644, + "learning_rate": 0.00018498657731028572, + "loss": 3.2715, + "step": 4658 + }, + { + "epoch": 0.6, + "grad_norm": 0.5659250617027283, + "learning_rate": 0.00018488649370013223, + "loss": 3.3505, + "step": 4659 + }, + { + "epoch": 0.6, + "grad_norm": 0.6227028369903564, + "learning_rate": 0.0001847864212837453, + "loss": 3.1114, + "step": 4660 + }, + { + "epoch": 0.6, + "grad_norm": 0.6223110556602478, + "learning_rate": 0.00018468636007832852, + "loss": 3.3056, + "step": 4661 + }, + { + "epoch": 0.6, + "grad_norm": 0.5822505354881287, + "learning_rate": 0.00018458631010108363, + "loss": 3.3205, + "step": 4662 + }, + { + "epoch": 0.6, + "grad_norm": 0.6166062951087952, + "learning_rate": 0.00018448627136921042, + "loss": 3.3337, + "step": 4663 + }, + { + "epoch": 0.6, + "grad_norm": 0.6271905303001404, + "learning_rate": 0.00018438624389990665, + "loss": 3.3123, + "step": 4664 + }, + { + "epoch": 0.6, + "grad_norm": 0.6071668863296509, + "learning_rate": 0.00018428622771036822, + "loss": 3.3749, + "step": 4665 + }, + { + "epoch": 0.6, + "grad_norm": 0.5966046452522278, + "learning_rate": 0.00018418622281778907, + "loss": 3.3569, + "step": 4666 + }, + { + "epoch": 0.6, + "grad_norm": 0.5757268667221069, + "learning_rate": 0.00018408622923936124, + "loss": 3.1887, + "step": 4667 + }, + { + "epoch": 0.6, + "grad_norm": 0.652225136756897, + "learning_rate": 0.00018398624699227467, + "loss": 3.3437, + "step": 4668 + }, + { + "epoch": 0.6, + "grad_norm": 0.6238378882408142, + "learning_rate": 0.00018388627609371757, + "loss": 3.4043, + "step": 4669 + }, + { + "epoch": 0.6, + "grad_norm": 0.6278160810470581, + "learning_rate": 0.00018378631656087608, + "loss": 3.3641, + "step": 4670 + }, + { + "epoch": 0.6, + "grad_norm": 0.6459201574325562, + "learning_rate": 0.00018368636841093434, + "loss": 3.4107, + "step": 4671 + }, + { + "epoch": 0.6, + "grad_norm": 0.6520346999168396, + "learning_rate": 0.00018358643166107463, + "loss": 3.3059, + "step": 4672 + }, + { + "epoch": 0.6, + "grad_norm": 0.5968659520149231, + "learning_rate": 0.00018348650632847726, + "loss": 3.2756, + "step": 4673 + }, + { + "epoch": 0.6, + "grad_norm": 0.6091008186340332, + "learning_rate": 0.00018338659243032063, + "loss": 3.2954, + "step": 4674 + }, + { + "epoch": 0.6, + "grad_norm": 0.6272988319396973, + "learning_rate": 0.00018328668998378095, + "loss": 3.2951, + "step": 4675 + }, + { + "epoch": 0.6, + "grad_norm": 0.6257034540176392, + "learning_rate": 0.00018318679900603265, + "loss": 3.4839, + "step": 4676 + }, + { + "epoch": 0.6, + "grad_norm": 0.6726770997047424, + "learning_rate": 0.00018308691951424822, + "loss": 3.3251, + "step": 4677 + }, + { + "epoch": 0.6, + "grad_norm": 0.6617605090141296, + "learning_rate": 0.000182987051525598, + "loss": 3.2709, + "step": 4678 + }, + { + "epoch": 0.6, + "grad_norm": 0.6106722354888916, + "learning_rate": 0.00018288719505725056, + "loss": 3.4031, + "step": 4679 + }, + { + "epoch": 0.6, + "grad_norm": 0.6694716811180115, + "learning_rate": 0.00018278735012637227, + "loss": 3.2723, + "step": 4680 + }, + { + "epoch": 0.6, + "grad_norm": 0.6016599535942078, + "learning_rate": 0.0001826875167501276, + "loss": 3.5272, + "step": 4681 + }, + { + "epoch": 0.6, + "grad_norm": 0.6514267921447754, + "learning_rate": 0.00018258769494567932, + "loss": 3.3313, + "step": 4682 + }, + { + "epoch": 0.6, + "grad_norm": 0.6131722331047058, + "learning_rate": 0.0001824878847301877, + "loss": 3.3894, + "step": 4683 + }, + { + "epoch": 0.6, + "grad_norm": 0.6339254379272461, + "learning_rate": 0.00018238808612081143, + "loss": 3.4352, + "step": 4684 + }, + { + "epoch": 0.6, + "grad_norm": 0.6149921417236328, + "learning_rate": 0.00018228829913470695, + "loss": 3.3429, + "step": 4685 + }, + { + "epoch": 0.6, + "grad_norm": 0.6236720085144043, + "learning_rate": 0.0001821885237890288, + "loss": 3.3032, + "step": 4686 + }, + { + "epoch": 0.6, + "grad_norm": 0.5988636612892151, + "learning_rate": 0.00018208876010092958, + "loss": 3.2897, + "step": 4687 + }, + { + "epoch": 0.6, + "grad_norm": 0.5963378548622131, + "learning_rate": 0.0001819890080875598, + "loss": 3.3148, + "step": 4688 + }, + { + "epoch": 0.6, + "grad_norm": 0.6665579676628113, + "learning_rate": 0.00018188926776606796, + "loss": 3.3134, + "step": 4689 + }, + { + "epoch": 0.6, + "grad_norm": 0.6786206960678101, + "learning_rate": 0.00018178953915360058, + "loss": 3.3366, + "step": 4690 + }, + { + "epoch": 0.6, + "grad_norm": 0.6171886324882507, + "learning_rate": 0.0001816898222673022, + "loss": 3.243, + "step": 4691 + }, + { + "epoch": 0.6, + "grad_norm": 0.6409314274787903, + "learning_rate": 0.00018159011712431527, + "loss": 3.3624, + "step": 4692 + }, + { + "epoch": 0.6, + "grad_norm": 0.60154789686203, + "learning_rate": 0.00018149042374178038, + "loss": 3.3147, + "step": 4693 + }, + { + "epoch": 0.6, + "grad_norm": 0.6252798438072205, + "learning_rate": 0.00018139074213683576, + "loss": 3.2986, + "step": 4694 + }, + { + "epoch": 0.6, + "grad_norm": 0.5952434539794922, + "learning_rate": 0.000181291072326618, + "loss": 3.2963, + "step": 4695 + }, + { + "epoch": 0.6, + "grad_norm": 0.6407507658004761, + "learning_rate": 0.0001811914143282615, + "loss": 3.5287, + "step": 4696 + }, + { + "epoch": 0.6, + "grad_norm": 0.5918746590614319, + "learning_rate": 0.0001810917681588985, + "loss": 3.4047, + "step": 4697 + }, + { + "epoch": 0.6, + "grad_norm": 0.658079206943512, + "learning_rate": 0.00018099213383565945, + "loss": 3.4384, + "step": 4698 + }, + { + "epoch": 0.6, + "grad_norm": 0.5900827646255493, + "learning_rate": 0.00018089251137567254, + "loss": 3.2725, + "step": 4699 + }, + { + "epoch": 0.6, + "grad_norm": 0.6569340825080872, + "learning_rate": 0.00018079290079606414, + "loss": 3.2884, + "step": 4700 + }, + { + "epoch": 0.6, + "grad_norm": 0.5840054750442505, + "learning_rate": 0.00018069330211395833, + "loss": 3.3072, + "step": 4701 + }, + { + "epoch": 0.6, + "grad_norm": 0.7141736149787903, + "learning_rate": 0.0001805937153464775, + "loss": 3.3235, + "step": 4702 + }, + { + "epoch": 0.6, + "grad_norm": 0.6071069240570068, + "learning_rate": 0.0001804941405107416, + "loss": 3.3522, + "step": 4703 + }, + { + "epoch": 0.6, + "grad_norm": 0.6229115128517151, + "learning_rate": 0.00018039457762386864, + "loss": 3.3637, + "step": 4704 + }, + { + "epoch": 0.6, + "grad_norm": 0.618442177772522, + "learning_rate": 0.00018029502670297479, + "loss": 3.3666, + "step": 4705 + }, + { + "epoch": 0.6, + "grad_norm": 0.606401801109314, + "learning_rate": 0.00018019548776517395, + "loss": 3.2345, + "step": 4706 + }, + { + "epoch": 0.6, + "grad_norm": 0.6161041855812073, + "learning_rate": 0.00018009596082757794, + "loss": 3.2574, + "step": 4707 + }, + { + "epoch": 0.6, + "grad_norm": 0.6366614699363708, + "learning_rate": 0.0001799964459072967, + "loss": 3.2993, + "step": 4708 + }, + { + "epoch": 0.6, + "grad_norm": 0.636702835559845, + "learning_rate": 0.00017989694302143788, + "loss": 3.4084, + "step": 4709 + }, + { + "epoch": 0.6, + "grad_norm": 0.6429193615913391, + "learning_rate": 0.00017979745218710735, + "loss": 3.3006, + "step": 4710 + }, + { + "epoch": 0.6, + "grad_norm": 0.6209059357643127, + "learning_rate": 0.00017969797342140868, + "loss": 3.2954, + "step": 4711 + }, + { + "epoch": 0.6, + "grad_norm": 0.634597897529602, + "learning_rate": 0.0001795985067414433, + "loss": 3.2651, + "step": 4712 + }, + { + "epoch": 0.6, + "grad_norm": 0.5833476781845093, + "learning_rate": 0.00017949905216431083, + "loss": 3.2958, + "step": 4713 + }, + { + "epoch": 0.6, + "grad_norm": 0.6205071210861206, + "learning_rate": 0.00017939960970710865, + "loss": 3.3498, + "step": 4714 + }, + { + "epoch": 0.6, + "grad_norm": 0.6410182118415833, + "learning_rate": 0.00017930017938693193, + "loss": 3.3563, + "step": 4715 + }, + { + "epoch": 0.6, + "grad_norm": 0.6970769166946411, + "learning_rate": 0.0001792007612208741, + "loss": 3.4069, + "step": 4716 + }, + { + "epoch": 0.6, + "grad_norm": 0.6904531717300415, + "learning_rate": 0.00017910135522602614, + "loss": 3.4065, + "step": 4717 + }, + { + "epoch": 0.6, + "grad_norm": 0.6469513773918152, + "learning_rate": 0.00017900196141947705, + "loss": 3.3064, + "step": 4718 + }, + { + "epoch": 0.6, + "grad_norm": 0.6153637766838074, + "learning_rate": 0.00017890257981831393, + "loss": 3.1413, + "step": 4719 + }, + { + "epoch": 0.6, + "grad_norm": 0.6310357451438904, + "learning_rate": 0.00017880321043962165, + "loss": 3.351, + "step": 4720 + }, + { + "epoch": 0.6, + "grad_norm": 0.6128477454185486, + "learning_rate": 0.00017870385330048284, + "loss": 3.3062, + "step": 4721 + }, + { + "epoch": 0.6, + "grad_norm": 0.6027767062187195, + "learning_rate": 0.00017860450841797814, + "loss": 3.2342, + "step": 4722 + }, + { + "epoch": 0.6, + "grad_norm": 0.6310712695121765, + "learning_rate": 0.0001785051758091862, + "loss": 3.3841, + "step": 4723 + }, + { + "epoch": 0.6, + "grad_norm": 0.6277130842208862, + "learning_rate": 0.00017840585549118337, + "loss": 3.3743, + "step": 4724 + }, + { + "epoch": 0.6, + "grad_norm": 0.6063037514686584, + "learning_rate": 0.0001783065474810439, + "loss": 3.1617, + "step": 4725 + }, + { + "epoch": 0.6, + "grad_norm": 0.6380765438079834, + "learning_rate": 0.00017820725179584014, + "loss": 3.4209, + "step": 4726 + }, + { + "epoch": 0.61, + "grad_norm": 0.6219593286514282, + "learning_rate": 0.000178107968452642, + "loss": 3.3852, + "step": 4727 + }, + { + "epoch": 0.61, + "grad_norm": 0.5906947255134583, + "learning_rate": 0.00017800869746851757, + "loss": 3.3816, + "step": 4728 + }, + { + "epoch": 0.61, + "grad_norm": 0.6144208312034607, + "learning_rate": 0.00017790943886053268, + "loss": 3.3553, + "step": 4729 + }, + { + "epoch": 0.61, + "grad_norm": 0.6082693934440613, + "learning_rate": 0.00017781019264575092, + "loss": 3.2825, + "step": 4730 + }, + { + "epoch": 0.61, + "grad_norm": 0.6370079517364502, + "learning_rate": 0.00017771095884123404, + "loss": 3.2945, + "step": 4731 + }, + { + "epoch": 0.61, + "grad_norm": 0.6162868738174438, + "learning_rate": 0.00017761173746404135, + "loss": 3.1309, + "step": 4732 + }, + { + "epoch": 0.61, + "grad_norm": 0.6330640912055969, + "learning_rate": 0.00017751252853123012, + "loss": 3.2824, + "step": 4733 + }, + { + "epoch": 0.61, + "grad_norm": 0.6064519882202148, + "learning_rate": 0.00017741333205985565, + "loss": 3.2224, + "step": 4734 + }, + { + "epoch": 0.61, + "grad_norm": 0.6307936310768127, + "learning_rate": 0.0001773141480669709, + "loss": 3.2816, + "step": 4735 + }, + { + "epoch": 0.61, + "grad_norm": 0.631202220916748, + "learning_rate": 0.00017721497656962665, + "loss": 3.338, + "step": 4736 + }, + { + "epoch": 0.61, + "grad_norm": 0.6187067031860352, + "learning_rate": 0.00017711581758487178, + "loss": 3.2692, + "step": 4737 + }, + { + "epoch": 0.61, + "grad_norm": 0.6404892206192017, + "learning_rate": 0.00017701667112975285, + "loss": 3.2894, + "step": 4738 + }, + { + "epoch": 0.61, + "grad_norm": 0.6452632546424866, + "learning_rate": 0.00017691753722131424, + "loss": 3.3684, + "step": 4739 + }, + { + "epoch": 0.61, + "grad_norm": 0.6127825379371643, + "learning_rate": 0.00017681841587659816, + "loss": 3.3527, + "step": 4740 + }, + { + "epoch": 0.61, + "grad_norm": 0.6171119809150696, + "learning_rate": 0.00017671930711264487, + "loss": 3.2473, + "step": 4741 + }, + { + "epoch": 0.61, + "grad_norm": 0.6060885190963745, + "learning_rate": 0.00017662021094649223, + "loss": 3.3261, + "step": 4742 + }, + { + "epoch": 0.61, + "grad_norm": 0.6263689398765564, + "learning_rate": 0.00017652112739517596, + "loss": 3.2627, + "step": 4743 + }, + { + "epoch": 0.61, + "grad_norm": 0.5872975587844849, + "learning_rate": 0.00017642205647572975, + "loss": 3.3252, + "step": 4744 + }, + { + "epoch": 0.61, + "grad_norm": 0.6522191762924194, + "learning_rate": 0.00017632299820518504, + "loss": 3.327, + "step": 4745 + }, + { + "epoch": 0.61, + "grad_norm": 0.6609612703323364, + "learning_rate": 0.000176223952600571, + "loss": 3.2529, + "step": 4746 + }, + { + "epoch": 0.61, + "grad_norm": 0.6605116128921509, + "learning_rate": 0.00017612491967891486, + "loss": 3.4559, + "step": 4747 + }, + { + "epoch": 0.61, + "grad_norm": 0.6365745663642883, + "learning_rate": 0.00017602589945724144, + "loss": 3.3159, + "step": 4748 + }, + { + "epoch": 0.61, + "grad_norm": 0.6035125255584717, + "learning_rate": 0.00017592689195257354, + "loss": 3.2803, + "step": 4749 + }, + { + "epoch": 0.61, + "grad_norm": 0.6476607918739319, + "learning_rate": 0.0001758278971819316, + "loss": 3.3746, + "step": 4750 + }, + { + "epoch": 0.61, + "grad_norm": 0.6447315812110901, + "learning_rate": 0.00017572891516233398, + "loss": 3.2185, + "step": 4751 + }, + { + "epoch": 0.61, + "grad_norm": 0.6167468428611755, + "learning_rate": 0.00017562994591079696, + "loss": 3.4329, + "step": 4752 + }, + { + "epoch": 0.61, + "grad_norm": 0.6049765944480896, + "learning_rate": 0.00017553098944433435, + "loss": 3.2662, + "step": 4753 + }, + { + "epoch": 0.61, + "grad_norm": 0.5891309380531311, + "learning_rate": 0.00017543204577995795, + "loss": 3.3656, + "step": 4754 + }, + { + "epoch": 0.61, + "grad_norm": 0.7036178708076477, + "learning_rate": 0.0001753331149346773, + "loss": 3.2953, + "step": 4755 + }, + { + "epoch": 0.61, + "grad_norm": 0.6219146847724915, + "learning_rate": 0.00017523419692549988, + "loss": 3.2968, + "step": 4756 + }, + { + "epoch": 0.61, + "grad_norm": 0.6210200190544128, + "learning_rate": 0.0001751352917694308, + "loss": 3.3618, + "step": 4757 + }, + { + "epoch": 0.61, + "grad_norm": 0.6193158626556396, + "learning_rate": 0.00017503639948347288, + "loss": 3.3273, + "step": 4758 + }, + { + "epoch": 0.61, + "grad_norm": 0.6548976302146912, + "learning_rate": 0.00017493752008462704, + "loss": 3.4085, + "step": 4759 + }, + { + "epoch": 0.61, + "grad_norm": 0.6289055943489075, + "learning_rate": 0.00017483865358989168, + "loss": 3.3661, + "step": 4760 + }, + { + "epoch": 0.61, + "grad_norm": 0.5951789021492004, + "learning_rate": 0.00017473980001626304, + "loss": 3.3544, + "step": 4761 + }, + { + "epoch": 0.61, + "grad_norm": 0.5678247213363647, + "learning_rate": 0.00017464095938073536, + "loss": 3.3182, + "step": 4762 + }, + { + "epoch": 0.61, + "grad_norm": 0.6432581543922424, + "learning_rate": 0.00017454213170030037, + "loss": 3.2518, + "step": 4763 + }, + { + "epoch": 0.61, + "grad_norm": 0.6164613366127014, + "learning_rate": 0.00017444331699194762, + "loss": 3.359, + "step": 4764 + }, + { + "epoch": 0.61, + "grad_norm": 0.6441646218299866, + "learning_rate": 0.00017434451527266465, + "loss": 3.3631, + "step": 4765 + }, + { + "epoch": 0.61, + "grad_norm": 0.5996881127357483, + "learning_rate": 0.00017424572655943665, + "loss": 3.2875, + "step": 4766 + }, + { + "epoch": 0.61, + "grad_norm": 0.6040083765983582, + "learning_rate": 0.00017414695086924648, + "loss": 3.3622, + "step": 4767 + }, + { + "epoch": 0.61, + "grad_norm": 0.630914568901062, + "learning_rate": 0.00017404818821907482, + "loss": 3.2048, + "step": 4768 + }, + { + "epoch": 0.61, + "grad_norm": 0.6699578762054443, + "learning_rate": 0.00017394943862590004, + "loss": 3.5253, + "step": 4769 + }, + { + "epoch": 0.61, + "grad_norm": 0.665324866771698, + "learning_rate": 0.0001738507021066985, + "loss": 3.2589, + "step": 4770 + }, + { + "epoch": 0.61, + "grad_norm": 0.6398959755897522, + "learning_rate": 0.00017375197867844401, + "loss": 3.2727, + "step": 4771 + }, + { + "epoch": 0.61, + "grad_norm": 0.697909951210022, + "learning_rate": 0.00017365326835810832, + "loss": 3.4117, + "step": 4772 + }, + { + "epoch": 0.61, + "grad_norm": 0.6068934798240662, + "learning_rate": 0.0001735545711626608, + "loss": 3.2946, + "step": 4773 + }, + { + "epoch": 0.61, + "grad_norm": 0.6205888390541077, + "learning_rate": 0.0001734558871090689, + "loss": 3.3062, + "step": 4774 + }, + { + "epoch": 0.61, + "grad_norm": 0.6135295629501343, + "learning_rate": 0.00017335721621429732, + "loss": 3.4012, + "step": 4775 + }, + { + "epoch": 0.61, + "grad_norm": 0.663394570350647, + "learning_rate": 0.00017325855849530876, + "loss": 3.2613, + "step": 4776 + }, + { + "epoch": 0.61, + "grad_norm": 0.6539154052734375, + "learning_rate": 0.00017315991396906372, + "loss": 3.3191, + "step": 4777 + }, + { + "epoch": 0.61, + "grad_norm": 0.6040697693824768, + "learning_rate": 0.00017306128265252025, + "loss": 3.2666, + "step": 4778 + }, + { + "epoch": 0.61, + "grad_norm": 0.6077384948730469, + "learning_rate": 0.0001729626645626342, + "loss": 3.3127, + "step": 4779 + }, + { + "epoch": 0.61, + "grad_norm": 0.6014003753662109, + "learning_rate": 0.00017286405971635928, + "loss": 3.3564, + "step": 4780 + }, + { + "epoch": 0.61, + "grad_norm": 0.5933483839035034, + "learning_rate": 0.0001727654681306467, + "loss": 3.2653, + "step": 4781 + }, + { + "epoch": 0.61, + "grad_norm": 0.6310820579528809, + "learning_rate": 0.0001726668898224455, + "loss": 3.3651, + "step": 4782 + }, + { + "epoch": 0.61, + "grad_norm": 0.6220866441726685, + "learning_rate": 0.00017256832480870237, + "loss": 3.3343, + "step": 4783 + }, + { + "epoch": 0.61, + "grad_norm": 0.6071862578392029, + "learning_rate": 0.00017246977310636201, + "loss": 3.3427, + "step": 4784 + }, + { + "epoch": 0.61, + "grad_norm": 0.5943565964698792, + "learning_rate": 0.00017237123473236643, + "loss": 3.2754, + "step": 4785 + }, + { + "epoch": 0.61, + "grad_norm": 0.6503081917762756, + "learning_rate": 0.00017227270970365555, + "loss": 3.4084, + "step": 4786 + }, + { + "epoch": 0.61, + "grad_norm": 0.6690330505371094, + "learning_rate": 0.00017217419803716697, + "loss": 3.3706, + "step": 4787 + }, + { + "epoch": 0.61, + "grad_norm": 0.6268274784088135, + "learning_rate": 0.000172075699749836, + "loss": 3.4523, + "step": 4788 + }, + { + "epoch": 0.61, + "grad_norm": 0.6357588768005371, + "learning_rate": 0.00017197721485859562, + "loss": 3.5234, + "step": 4789 + }, + { + "epoch": 0.61, + "grad_norm": 0.6383059620857239, + "learning_rate": 0.00017187874338037644, + "loss": 3.3371, + "step": 4790 + }, + { + "epoch": 0.61, + "grad_norm": 0.646907389163971, + "learning_rate": 0.00017178028533210705, + "loss": 3.3089, + "step": 4791 + }, + { + "epoch": 0.61, + "grad_norm": 0.5765965580940247, + "learning_rate": 0.00017168184073071324, + "loss": 3.3526, + "step": 4792 + }, + { + "epoch": 0.61, + "grad_norm": 0.5881819725036621, + "learning_rate": 0.0001715834095931191, + "loss": 3.286, + "step": 4793 + }, + { + "epoch": 0.61, + "grad_norm": 0.6026200652122498, + "learning_rate": 0.00017148499193624586, + "loss": 3.2993, + "step": 4794 + }, + { + "epoch": 0.61, + "grad_norm": 0.7471684813499451, + "learning_rate": 0.0001713865877770128, + "loss": 3.3015, + "step": 4795 + }, + { + "epoch": 0.61, + "grad_norm": 0.6499928832054138, + "learning_rate": 0.00017128819713233665, + "loss": 3.2953, + "step": 4796 + }, + { + "epoch": 0.61, + "grad_norm": 0.6148751378059387, + "learning_rate": 0.00017118982001913187, + "loss": 3.2791, + "step": 4797 + }, + { + "epoch": 0.61, + "grad_norm": 0.681559145450592, + "learning_rate": 0.00017109145645431074, + "loss": 3.3396, + "step": 4798 + }, + { + "epoch": 0.61, + "grad_norm": 0.6669842600822449, + "learning_rate": 0.000170993106454783, + "loss": 3.3768, + "step": 4799 + }, + { + "epoch": 0.61, + "grad_norm": 0.6817641258239746, + "learning_rate": 0.00017089477003745618, + "loss": 3.461, + "step": 4800 + }, + { + "epoch": 0.61, + "grad_norm": 0.6329324841499329, + "learning_rate": 0.00017079644721923538, + "loss": 3.3652, + "step": 4801 + }, + { + "epoch": 0.61, + "grad_norm": 0.6417033076286316, + "learning_rate": 0.00017069813801702362, + "loss": 3.4907, + "step": 4802 + }, + { + "epoch": 0.61, + "grad_norm": 0.7128257155418396, + "learning_rate": 0.00017059984244772124, + "loss": 3.3174, + "step": 4803 + }, + { + "epoch": 0.61, + "grad_norm": 0.6172081232070923, + "learning_rate": 0.0001705015605282264, + "loss": 3.3234, + "step": 4804 + }, + { + "epoch": 0.62, + "grad_norm": 0.6061226725578308, + "learning_rate": 0.000170403292275435, + "loss": 3.3656, + "step": 4805 + }, + { + "epoch": 0.62, + "grad_norm": 0.5999137163162231, + "learning_rate": 0.00017030503770624046, + "loss": 3.3599, + "step": 4806 + }, + { + "epoch": 0.62, + "grad_norm": 0.5816587805747986, + "learning_rate": 0.0001702067968375337, + "loss": 3.2664, + "step": 4807 + }, + { + "epoch": 0.62, + "grad_norm": 0.63521808385849, + "learning_rate": 0.00017010856968620373, + "loss": 3.4519, + "step": 4808 + }, + { + "epoch": 0.62, + "grad_norm": 0.6459574699401855, + "learning_rate": 0.00017001035626913678, + "loss": 3.258, + "step": 4809 + }, + { + "epoch": 0.62, + "grad_norm": 0.5902713537216187, + "learning_rate": 0.00016991215660321679, + "loss": 3.4486, + "step": 4810 + }, + { + "epoch": 0.62, + "grad_norm": 0.6274678707122803, + "learning_rate": 0.00016981397070532566, + "loss": 3.4292, + "step": 4811 + }, + { + "epoch": 0.62, + "grad_norm": 0.5549883246421814, + "learning_rate": 0.0001697157985923425, + "loss": 3.1933, + "step": 4812 + }, + { + "epoch": 0.62, + "grad_norm": 0.6459517478942871, + "learning_rate": 0.00016961764028114437, + "loss": 3.3454, + "step": 4813 + }, + { + "epoch": 0.62, + "grad_norm": 0.6086466908454895, + "learning_rate": 0.00016951949578860575, + "loss": 3.2549, + "step": 4814 + }, + { + "epoch": 0.62, + "grad_norm": 0.6149626970291138, + "learning_rate": 0.00016942136513159873, + "loss": 3.166, + "step": 4815 + }, + { + "epoch": 0.62, + "grad_norm": 0.6007905006408691, + "learning_rate": 0.00016932324832699325, + "loss": 3.3945, + "step": 4816 + }, + { + "epoch": 0.62, + "grad_norm": 0.6186566948890686, + "learning_rate": 0.0001692251453916567, + "loss": 3.376, + "step": 4817 + }, + { + "epoch": 0.62, + "grad_norm": 0.6166824102401733, + "learning_rate": 0.000169127056342454, + "loss": 3.3302, + "step": 4818 + }, + { + "epoch": 0.62, + "grad_norm": 0.6184083223342896, + "learning_rate": 0.00016902898119624794, + "loss": 3.251, + "step": 4819 + }, + { + "epoch": 0.62, + "grad_norm": 0.6125866770744324, + "learning_rate": 0.0001689309199698986, + "loss": 3.1848, + "step": 4820 + }, + { + "epoch": 0.62, + "grad_norm": 0.604458212852478, + "learning_rate": 0.00016883287268026404, + "loss": 3.3634, + "step": 4821 + }, + { + "epoch": 0.62, + "grad_norm": 0.6307944655418396, + "learning_rate": 0.00016873483934419959, + "loss": 3.3647, + "step": 4822 + }, + { + "epoch": 0.62, + "grad_norm": 0.5935248136520386, + "learning_rate": 0.00016863681997855844, + "loss": 3.3112, + "step": 4823 + }, + { + "epoch": 0.62, + "grad_norm": 0.6155415773391724, + "learning_rate": 0.00016853881460019115, + "loss": 3.3834, + "step": 4824 + }, + { + "epoch": 0.62, + "grad_norm": 0.6031137704849243, + "learning_rate": 0.00016844082322594597, + "loss": 3.2919, + "step": 4825 + }, + { + "epoch": 0.62, + "grad_norm": 0.713043212890625, + "learning_rate": 0.00016834284587266885, + "loss": 3.4577, + "step": 4826 + }, + { + "epoch": 0.62, + "grad_norm": 0.6130597591400146, + "learning_rate": 0.00016824488255720317, + "loss": 3.3312, + "step": 4827 + }, + { + "epoch": 0.62, + "grad_norm": 0.620797336101532, + "learning_rate": 0.00016814693329638992, + "loss": 3.1559, + "step": 4828 + }, + { + "epoch": 0.62, + "grad_norm": 0.6594494581222534, + "learning_rate": 0.00016804899810706774, + "loss": 3.3446, + "step": 4829 + }, + { + "epoch": 0.62, + "grad_norm": 0.6507055163383484, + "learning_rate": 0.00016795107700607286, + "loss": 3.3051, + "step": 4830 + }, + { + "epoch": 0.62, + "grad_norm": 0.6062747836112976, + "learning_rate": 0.00016785317001023905, + "loss": 3.2461, + "step": 4831 + }, + { + "epoch": 0.62, + "grad_norm": 0.6378657221794128, + "learning_rate": 0.0001677552771363977, + "loss": 3.4018, + "step": 4832 + }, + { + "epoch": 0.62, + "grad_norm": 0.6071016192436218, + "learning_rate": 0.00016765739840137757, + "loss": 3.1889, + "step": 4833 + }, + { + "epoch": 0.62, + "grad_norm": 0.5815123319625854, + "learning_rate": 0.00016755953382200533, + "loss": 3.3255, + "step": 4834 + }, + { + "epoch": 0.62, + "grad_norm": 0.5857453942298889, + "learning_rate": 0.00016746168341510499, + "loss": 3.3624, + "step": 4835 + }, + { + "epoch": 0.62, + "grad_norm": 0.6075772643089294, + "learning_rate": 0.00016736384719749804, + "loss": 3.2764, + "step": 4836 + }, + { + "epoch": 0.62, + "grad_norm": 0.6338539719581604, + "learning_rate": 0.00016726602518600382, + "loss": 3.2596, + "step": 4837 + }, + { + "epoch": 0.62, + "grad_norm": 0.6194092035293579, + "learning_rate": 0.00016716821739743887, + "loss": 3.4251, + "step": 4838 + }, + { + "epoch": 0.62, + "grad_norm": 0.6281957030296326, + "learning_rate": 0.00016707042384861775, + "loss": 3.2663, + "step": 4839 + }, + { + "epoch": 0.62, + "grad_norm": 0.583798348903656, + "learning_rate": 0.00016697264455635213, + "loss": 3.2086, + "step": 4840 + }, + { + "epoch": 0.62, + "grad_norm": 0.6021174788475037, + "learning_rate": 0.0001668748795374515, + "loss": 3.2542, + "step": 4841 + }, + { + "epoch": 0.62, + "grad_norm": 0.6187352538108826, + "learning_rate": 0.00016677712880872275, + "loss": 3.2407, + "step": 4842 + }, + { + "epoch": 0.62, + "grad_norm": 0.5813688039779663, + "learning_rate": 0.00016667939238697028, + "loss": 3.2851, + "step": 4843 + }, + { + "epoch": 0.62, + "grad_norm": 0.6295503377914429, + "learning_rate": 0.0001665816702889963, + "loss": 3.3658, + "step": 4844 + }, + { + "epoch": 0.62, + "grad_norm": 0.5846412777900696, + "learning_rate": 0.0001664839625316002, + "loss": 3.2269, + "step": 4845 + }, + { + "epoch": 0.62, + "grad_norm": 0.6589176058769226, + "learning_rate": 0.0001663862691315791, + "loss": 3.2868, + "step": 4846 + }, + { + "epoch": 0.62, + "grad_norm": 0.6424933671951294, + "learning_rate": 0.00016628859010572771, + "loss": 3.3598, + "step": 4847 + }, + { + "epoch": 0.62, + "grad_norm": 0.6255198121070862, + "learning_rate": 0.00016619092547083813, + "loss": 3.4012, + "step": 4848 + }, + { + "epoch": 0.62, + "grad_norm": 0.656611442565918, + "learning_rate": 0.00016609327524370012, + "loss": 3.2743, + "step": 4849 + }, + { + "epoch": 0.62, + "grad_norm": 0.6469491720199585, + "learning_rate": 0.0001659956394411008, + "loss": 3.2577, + "step": 4850 + }, + { + "epoch": 0.62, + "grad_norm": 0.6087647676467896, + "learning_rate": 0.00016589801807982487, + "loss": 3.3577, + "step": 4851 + }, + { + "epoch": 0.62, + "grad_norm": 0.6154561638832092, + "learning_rate": 0.00016580041117665467, + "loss": 3.3821, + "step": 4852 + }, + { + "epoch": 0.62, + "grad_norm": 0.6215868592262268, + "learning_rate": 0.00016570281874836996, + "loss": 3.2785, + "step": 4853 + }, + { + "epoch": 0.62, + "grad_norm": 0.6483801007270813, + "learning_rate": 0.0001656052408117479, + "loss": 3.3564, + "step": 4854 + }, + { + "epoch": 0.62, + "grad_norm": 0.6019678115844727, + "learning_rate": 0.00016550767738356337, + "loss": 3.2738, + "step": 4855 + }, + { + "epoch": 0.62, + "grad_norm": 0.626493513584137, + "learning_rate": 0.0001654101284805886, + "loss": 3.182, + "step": 4856 + }, + { + "epoch": 0.62, + "grad_norm": 0.6123741269111633, + "learning_rate": 0.00016531259411959332, + "loss": 3.3446, + "step": 4857 + }, + { + "epoch": 0.62, + "grad_norm": 0.6224625706672668, + "learning_rate": 0.00016521507431734492, + "loss": 3.3272, + "step": 4858 + }, + { + "epoch": 0.62, + "grad_norm": 0.6573577523231506, + "learning_rate": 0.0001651175690906082, + "loss": 3.325, + "step": 4859 + }, + { + "epoch": 0.62, + "grad_norm": 0.6397596001625061, + "learning_rate": 0.0001650200784561454, + "loss": 3.2862, + "step": 4860 + }, + { + "epoch": 0.62, + "grad_norm": 0.6268782019615173, + "learning_rate": 0.00016492260243071628, + "loss": 3.2584, + "step": 4861 + }, + { + "epoch": 0.62, + "grad_norm": 0.6043176054954529, + "learning_rate": 0.00016482514103107813, + "loss": 3.2901, + "step": 4862 + }, + { + "epoch": 0.62, + "grad_norm": 0.6334238648414612, + "learning_rate": 0.00016472769427398564, + "loss": 3.507, + "step": 4863 + }, + { + "epoch": 0.62, + "grad_norm": 0.6372676491737366, + "learning_rate": 0.00016463026217619105, + "loss": 3.2654, + "step": 4864 + }, + { + "epoch": 0.62, + "grad_norm": 0.6464550495147705, + "learning_rate": 0.00016453284475444413, + "loss": 3.2341, + "step": 4865 + }, + { + "epoch": 0.62, + "grad_norm": 0.6223431825637817, + "learning_rate": 0.00016443544202549189, + "loss": 3.3866, + "step": 4866 + }, + { + "epoch": 0.62, + "grad_norm": 0.5811198949813843, + "learning_rate": 0.00016433805400607924, + "loss": 3.2517, + "step": 4867 + }, + { + "epoch": 0.62, + "grad_norm": 0.6073024868965149, + "learning_rate": 0.00016424068071294818, + "loss": 3.3954, + "step": 4868 + }, + { + "epoch": 0.62, + "grad_norm": 0.6416819095611572, + "learning_rate": 0.00016414332216283827, + "loss": 3.3483, + "step": 4869 + }, + { + "epoch": 0.62, + "grad_norm": 0.6186531782150269, + "learning_rate": 0.00016404597837248663, + "loss": 3.3467, + "step": 4870 + }, + { + "epoch": 0.62, + "grad_norm": 0.622018039226532, + "learning_rate": 0.0001639486493586278, + "loss": 3.359, + "step": 4871 + }, + { + "epoch": 0.62, + "grad_norm": 0.6513142585754395, + "learning_rate": 0.00016385133513799368, + "loss": 3.2877, + "step": 4872 + }, + { + "epoch": 0.62, + "grad_norm": 0.6026149988174438, + "learning_rate": 0.00016375403572731385, + "loss": 3.3425, + "step": 4873 + }, + { + "epoch": 0.62, + "grad_norm": 0.5751915574073792, + "learning_rate": 0.00016365675114331512, + "loss": 3.4418, + "step": 4874 + }, + { + "epoch": 0.62, + "grad_norm": 0.6088400483131409, + "learning_rate": 0.0001635594814027217, + "loss": 3.2938, + "step": 4875 + }, + { + "epoch": 0.62, + "grad_norm": 0.6468400955200195, + "learning_rate": 0.0001634622265222556, + "loss": 3.3112, + "step": 4876 + }, + { + "epoch": 0.62, + "grad_norm": 0.6035825610160828, + "learning_rate": 0.00016336498651863609, + "loss": 3.3464, + "step": 4877 + }, + { + "epoch": 0.62, + "grad_norm": 0.5807467699050903, + "learning_rate": 0.0001632677614085797, + "loss": 3.3286, + "step": 4878 + }, + { + "epoch": 0.62, + "grad_norm": 0.6074662804603577, + "learning_rate": 0.00016317055120880058, + "loss": 3.36, + "step": 4879 + }, + { + "epoch": 0.62, + "grad_norm": 0.646324872970581, + "learning_rate": 0.00016307335593601036, + "loss": 3.491, + "step": 4880 + }, + { + "epoch": 0.62, + "grad_norm": 0.6242514848709106, + "learning_rate": 0.000162976175606918, + "loss": 3.5013, + "step": 4881 + }, + { + "epoch": 0.62, + "grad_norm": 0.6329924464225769, + "learning_rate": 0.00016287901023822988, + "loss": 3.3805, + "step": 4882 + }, + { + "epoch": 0.63, + "grad_norm": 0.6353751420974731, + "learning_rate": 0.00016278185984664996, + "loss": 3.3469, + "step": 4883 + }, + { + "epoch": 0.63, + "grad_norm": 0.6465364694595337, + "learning_rate": 0.00016268472444887932, + "loss": 3.384, + "step": 4884 + }, + { + "epoch": 0.63, + "grad_norm": 0.5814542770385742, + "learning_rate": 0.0001625876040616169, + "loss": 3.4397, + "step": 4885 + }, + { + "epoch": 0.63, + "grad_norm": 0.5991275310516357, + "learning_rate": 0.00016249049870155875, + "loss": 3.3694, + "step": 4886 + }, + { + "epoch": 0.63, + "grad_norm": 0.6458463072776794, + "learning_rate": 0.00016239340838539827, + "loss": 3.2171, + "step": 4887 + }, + { + "epoch": 0.63, + "grad_norm": 0.5924831628799438, + "learning_rate": 0.00016229633312982655, + "loss": 3.3777, + "step": 4888 + }, + { + "epoch": 0.63, + "grad_norm": 0.6339801549911499, + "learning_rate": 0.00016219927295153198, + "loss": 3.3329, + "step": 4889 + }, + { + "epoch": 0.63, + "grad_norm": 0.5737445950508118, + "learning_rate": 0.00016210222786720014, + "loss": 3.2879, + "step": 4890 + }, + { + "epoch": 0.63, + "grad_norm": 0.6337724328041077, + "learning_rate": 0.00016200519789351443, + "loss": 3.3842, + "step": 4891 + }, + { + "epoch": 0.63, + "grad_norm": 0.5654064416885376, + "learning_rate": 0.0001619081830471553, + "loss": 3.2704, + "step": 4892 + }, + { + "epoch": 0.63, + "grad_norm": 0.6272947788238525, + "learning_rate": 0.00016181118334480073, + "loss": 3.4049, + "step": 4893 + }, + { + "epoch": 0.63, + "grad_norm": 0.7220709919929504, + "learning_rate": 0.00016171419880312604, + "loss": 3.4469, + "step": 4894 + }, + { + "epoch": 0.63, + "grad_norm": 0.6143007278442383, + "learning_rate": 0.00016161722943880416, + "loss": 3.3387, + "step": 4895 + }, + { + "epoch": 0.63, + "grad_norm": 0.6261044144630432, + "learning_rate": 0.00016152027526850519, + "loss": 3.3093, + "step": 4896 + }, + { + "epoch": 0.63, + "grad_norm": 0.61616051197052, + "learning_rate": 0.0001614233363088966, + "loss": 3.3521, + "step": 4897 + }, + { + "epoch": 0.63, + "grad_norm": 0.6149140000343323, + "learning_rate": 0.0001613264125766434, + "loss": 3.2737, + "step": 4898 + }, + { + "epoch": 0.63, + "grad_norm": 0.616252601146698, + "learning_rate": 0.00016122950408840785, + "loss": 3.279, + "step": 4899 + }, + { + "epoch": 0.63, + "grad_norm": 0.6254578828811646, + "learning_rate": 0.00016113261086084962, + "loss": 3.4044, + "step": 4900 + }, + { + "epoch": 0.63, + "grad_norm": 0.6441753506660461, + "learning_rate": 0.0001610357329106259, + "loss": 3.4349, + "step": 4901 + }, + { + "epoch": 0.63, + "grad_norm": 0.6455859541893005, + "learning_rate": 0.00016093887025439105, + "loss": 3.3825, + "step": 4902 + }, + { + "epoch": 0.63, + "grad_norm": 0.632778525352478, + "learning_rate": 0.00016084202290879678, + "loss": 3.4236, + "step": 4903 + }, + { + "epoch": 0.63, + "grad_norm": 0.6407384872436523, + "learning_rate": 0.0001607451908904925, + "loss": 3.1993, + "step": 4904 + }, + { + "epoch": 0.63, + "grad_norm": 0.6580832004547119, + "learning_rate": 0.00016064837421612456, + "loss": 3.1839, + "step": 4905 + }, + { + "epoch": 0.63, + "grad_norm": 0.6400294899940491, + "learning_rate": 0.00016055157290233702, + "loss": 3.4126, + "step": 4906 + }, + { + "epoch": 0.63, + "grad_norm": 0.60333251953125, + "learning_rate": 0.00016045478696577104, + "loss": 3.3403, + "step": 4907 + }, + { + "epoch": 0.63, + "grad_norm": 0.6334813833236694, + "learning_rate": 0.00016035801642306526, + "loss": 3.3563, + "step": 4908 + }, + { + "epoch": 0.63, + "grad_norm": 0.6013305187225342, + "learning_rate": 0.00016026126129085568, + "loss": 3.2698, + "step": 4909 + }, + { + "epoch": 0.63, + "grad_norm": 0.5717803239822388, + "learning_rate": 0.00016016452158577565, + "loss": 3.4212, + "step": 4910 + }, + { + "epoch": 0.63, + "grad_norm": 0.6447166204452515, + "learning_rate": 0.00016006779732445582, + "loss": 3.2419, + "step": 4911 + }, + { + "epoch": 0.63, + "grad_norm": 0.6601414084434509, + "learning_rate": 0.0001599710885235241, + "loss": 3.4306, + "step": 4912 + }, + { + "epoch": 0.63, + "grad_norm": 0.6667841672897339, + "learning_rate": 0.00015987439519960607, + "loss": 3.2909, + "step": 4913 + }, + { + "epoch": 0.63, + "grad_norm": 0.6420342922210693, + "learning_rate": 0.00015977771736932434, + "loss": 3.1509, + "step": 4914 + }, + { + "epoch": 0.63, + "grad_norm": 0.6115997433662415, + "learning_rate": 0.00015968105504929892, + "loss": 3.3316, + "step": 4915 + }, + { + "epoch": 0.63, + "grad_norm": 0.5998908281326294, + "learning_rate": 0.00015958440825614723, + "loss": 3.2116, + "step": 4916 + }, + { + "epoch": 0.63, + "grad_norm": 0.654397189617157, + "learning_rate": 0.00015948777700648396, + "loss": 3.2198, + "step": 4917 + }, + { + "epoch": 0.63, + "grad_norm": 0.6434248685836792, + "learning_rate": 0.00015939116131692105, + "loss": 3.2118, + "step": 4918 + }, + { + "epoch": 0.63, + "grad_norm": 0.6479857563972473, + "learning_rate": 0.000159294561204068, + "loss": 3.384, + "step": 4919 + }, + { + "epoch": 0.63, + "grad_norm": 0.6829742193222046, + "learning_rate": 0.00015919797668453146, + "loss": 3.3759, + "step": 4920 + }, + { + "epoch": 0.63, + "grad_norm": 0.643349289894104, + "learning_rate": 0.00015910140777491527, + "loss": 3.3625, + "step": 4921 + }, + { + "epoch": 0.63, + "grad_norm": 0.6263314485549927, + "learning_rate": 0.00015900485449182094, + "loss": 3.3384, + "step": 4922 + }, + { + "epoch": 0.63, + "grad_norm": 0.6248134970664978, + "learning_rate": 0.00015890831685184704, + "loss": 3.4037, + "step": 4923 + }, + { + "epoch": 0.63, + "grad_norm": 0.6030375361442566, + "learning_rate": 0.00015881179487158952, + "loss": 3.1696, + "step": 4924 + }, + { + "epoch": 0.63, + "grad_norm": 0.6380594372749329, + "learning_rate": 0.00015871528856764163, + "loss": 3.3414, + "step": 4925 + }, + { + "epoch": 0.63, + "grad_norm": 0.6514424085617065, + "learning_rate": 0.00015861879795659378, + "loss": 3.2199, + "step": 4926 + }, + { + "epoch": 0.63, + "grad_norm": 0.6601086854934692, + "learning_rate": 0.00015852232305503406, + "loss": 3.3459, + "step": 4927 + }, + { + "epoch": 0.63, + "grad_norm": 0.663547933101654, + "learning_rate": 0.0001584258638795475, + "loss": 3.293, + "step": 4928 + }, + { + "epoch": 0.63, + "grad_norm": 0.6009930372238159, + "learning_rate": 0.00015832942044671647, + "loss": 3.2446, + "step": 4929 + }, + { + "epoch": 0.63, + "grad_norm": 0.6301782727241516, + "learning_rate": 0.00015823299277312086, + "loss": 3.2444, + "step": 4930 + }, + { + "epoch": 0.63, + "grad_norm": 0.6136788129806519, + "learning_rate": 0.00015813658087533757, + "loss": 3.4593, + "step": 4931 + }, + { + "epoch": 0.63, + "grad_norm": 0.5674062371253967, + "learning_rate": 0.0001580401847699411, + "loss": 3.2897, + "step": 4932 + }, + { + "epoch": 0.63, + "grad_norm": 0.5971000790596008, + "learning_rate": 0.00015794380447350288, + "loss": 3.3371, + "step": 4933 + }, + { + "epoch": 0.63, + "grad_norm": 0.6050028800964355, + "learning_rate": 0.00015784744000259195, + "loss": 3.3715, + "step": 4934 + }, + { + "epoch": 0.63, + "grad_norm": 0.6050082445144653, + "learning_rate": 0.0001577510913737744, + "loss": 3.2101, + "step": 4935 + }, + { + "epoch": 0.63, + "grad_norm": 0.6067320704460144, + "learning_rate": 0.00015765475860361365, + "loss": 3.3618, + "step": 4936 + }, + { + "epoch": 0.63, + "grad_norm": 0.685673713684082, + "learning_rate": 0.00015755844170867048, + "loss": 3.2557, + "step": 4937 + }, + { + "epoch": 0.63, + "grad_norm": 0.6283639669418335, + "learning_rate": 0.00015746214070550286, + "loss": 3.4148, + "step": 4938 + }, + { + "epoch": 0.63, + "grad_norm": 0.5921222567558289, + "learning_rate": 0.00015736585561066603, + "loss": 3.2689, + "step": 4939 + }, + { + "epoch": 0.63, + "grad_norm": 0.6454620361328125, + "learning_rate": 0.00015726958644071248, + "loss": 3.2757, + "step": 4940 + }, + { + "epoch": 0.63, + "grad_norm": 0.6581416726112366, + "learning_rate": 0.00015717333321219206, + "loss": 3.393, + "step": 4941 + }, + { + "epoch": 0.63, + "grad_norm": 0.608899712562561, + "learning_rate": 0.00015707709594165185, + "loss": 3.3873, + "step": 4942 + }, + { + "epoch": 0.63, + "grad_norm": 0.6669918298721313, + "learning_rate": 0.0001569808746456361, + "loss": 3.2282, + "step": 4943 + }, + { + "epoch": 0.63, + "grad_norm": 0.6613045334815979, + "learning_rate": 0.00015688466934068632, + "loss": 3.2838, + "step": 4944 + }, + { + "epoch": 0.63, + "grad_norm": 0.6391863822937012, + "learning_rate": 0.0001567884800433414, + "loss": 3.2624, + "step": 4945 + }, + { + "epoch": 0.63, + "grad_norm": 0.5971353650093079, + "learning_rate": 0.0001566923067701374, + "loss": 3.253, + "step": 4946 + }, + { + "epoch": 0.63, + "grad_norm": 0.608292818069458, + "learning_rate": 0.0001565961495376075, + "loss": 3.3042, + "step": 4947 + }, + { + "epoch": 0.63, + "grad_norm": 0.6255372762680054, + "learning_rate": 0.0001565000083622824, + "loss": 3.2865, + "step": 4948 + }, + { + "epoch": 0.63, + "grad_norm": 0.6835498213768005, + "learning_rate": 0.00015640388326068969, + "loss": 3.3385, + "step": 4949 + }, + { + "epoch": 0.63, + "grad_norm": 0.6317881941795349, + "learning_rate": 0.0001563077742493546, + "loss": 3.3869, + "step": 4950 + }, + { + "epoch": 0.63, + "grad_norm": 0.6487517952919006, + "learning_rate": 0.00015621168134479925, + "loss": 3.3362, + "step": 4951 + }, + { + "epoch": 0.63, + "grad_norm": 0.6169304847717285, + "learning_rate": 0.00015611560456354323, + "loss": 3.2885, + "step": 4952 + }, + { + "epoch": 0.63, + "grad_norm": 0.6524413228034973, + "learning_rate": 0.00015601954392210316, + "loss": 3.2434, + "step": 4953 + }, + { + "epoch": 0.63, + "grad_norm": 0.6151243448257446, + "learning_rate": 0.00015592349943699296, + "loss": 3.2181, + "step": 4954 + }, + { + "epoch": 0.63, + "grad_norm": 0.61757493019104, + "learning_rate": 0.0001558274711247239, + "loss": 3.374, + "step": 4955 + }, + { + "epoch": 0.63, + "grad_norm": 0.7145484089851379, + "learning_rate": 0.0001557314590018043, + "loss": 3.2289, + "step": 4956 + }, + { + "epoch": 0.63, + "grad_norm": 0.6981588006019592, + "learning_rate": 0.00015563546308473966, + "loss": 3.3561, + "step": 4957 + }, + { + "epoch": 0.63, + "grad_norm": 0.5915506482124329, + "learning_rate": 0.00015553948339003287, + "loss": 3.281, + "step": 4958 + }, + { + "epoch": 0.63, + "grad_norm": 0.6296604871749878, + "learning_rate": 0.00015544351993418404, + "loss": 3.404, + "step": 4959 + }, + { + "epoch": 0.63, + "grad_norm": 0.6014420986175537, + "learning_rate": 0.00015534757273369038, + "loss": 3.3068, + "step": 4960 + }, + { + "epoch": 0.64, + "grad_norm": 0.621677815914154, + "learning_rate": 0.00015525164180504622, + "loss": 3.3712, + "step": 4961 + }, + { + "epoch": 0.64, + "grad_norm": 0.6184865832328796, + "learning_rate": 0.00015515572716474322, + "loss": 3.3306, + "step": 4962 + }, + { + "epoch": 0.64, + "grad_norm": 0.6415227651596069, + "learning_rate": 0.00015505982882927028, + "loss": 3.2403, + "step": 4963 + }, + { + "epoch": 0.64, + "grad_norm": 0.6465979218482971, + "learning_rate": 0.00015496394681511344, + "loss": 3.1784, + "step": 4964 + }, + { + "epoch": 0.64, + "grad_norm": 0.6245262026786804, + "learning_rate": 0.00015486808113875582, + "loss": 3.3508, + "step": 4965 + }, + { + "epoch": 0.64, + "grad_norm": 0.6168780326843262, + "learning_rate": 0.00015477223181667796, + "loss": 3.2795, + "step": 4966 + }, + { + "epoch": 0.64, + "grad_norm": 0.6115142107009888, + "learning_rate": 0.00015467639886535746, + "loss": 3.2947, + "step": 4967 + }, + { + "epoch": 0.64, + "grad_norm": 0.5774526000022888, + "learning_rate": 0.00015458058230126892, + "loss": 3.4082, + "step": 4968 + }, + { + "epoch": 0.64, + "grad_norm": 0.5983077883720398, + "learning_rate": 0.00015448478214088456, + "loss": 3.4332, + "step": 4969 + }, + { + "epoch": 0.64, + "grad_norm": 0.5960877537727356, + "learning_rate": 0.00015438899840067356, + "loss": 3.2596, + "step": 4970 + }, + { + "epoch": 0.64, + "grad_norm": 0.601218581199646, + "learning_rate": 0.00015429323109710207, + "loss": 3.2533, + "step": 4971 + }, + { + "epoch": 0.64, + "grad_norm": 0.587386429309845, + "learning_rate": 0.0001541974802466337, + "loss": 3.3526, + "step": 4972 + }, + { + "epoch": 0.64, + "grad_norm": 0.6104817986488342, + "learning_rate": 0.00015410174586572912, + "loss": 3.2944, + "step": 4973 + }, + { + "epoch": 0.64, + "grad_norm": 0.6067850589752197, + "learning_rate": 0.0001540060279708462, + "loss": 3.361, + "step": 4974 + }, + { + "epoch": 0.64, + "grad_norm": 0.6144149303436279, + "learning_rate": 0.00015391032657843988, + "loss": 3.2056, + "step": 4975 + }, + { + "epoch": 0.64, + "grad_norm": 0.6430124640464783, + "learning_rate": 0.0001538146417049624, + "loss": 3.322, + "step": 4976 + }, + { + "epoch": 0.64, + "grad_norm": 0.6340519785881042, + "learning_rate": 0.00015371897336686303, + "loss": 3.2947, + "step": 4977 + }, + { + "epoch": 0.64, + "grad_norm": 0.6688194870948792, + "learning_rate": 0.0001536233215805884, + "loss": 3.2453, + "step": 4978 + }, + { + "epoch": 0.64, + "grad_norm": 0.7047507166862488, + "learning_rate": 0.000153527686362582, + "loss": 3.3035, + "step": 4979 + }, + { + "epoch": 0.64, + "grad_norm": 0.6342984437942505, + "learning_rate": 0.00015343206772928486, + "loss": 3.4161, + "step": 4980 + }, + { + "epoch": 0.64, + "grad_norm": 0.6949705481529236, + "learning_rate": 0.00015333646569713473, + "loss": 3.2528, + "step": 4981 + }, + { + "epoch": 0.64, + "grad_norm": 0.6137412786483765, + "learning_rate": 0.00015324088028256677, + "loss": 3.3913, + "step": 4982 + }, + { + "epoch": 0.64, + "grad_norm": 0.6763321757316589, + "learning_rate": 0.00015314531150201316, + "loss": 3.3005, + "step": 4983 + }, + { + "epoch": 0.64, + "grad_norm": 0.6877267956733704, + "learning_rate": 0.0001530497593719034, + "loss": 3.3418, + "step": 4984 + }, + { + "epoch": 0.64, + "grad_norm": 0.633033812046051, + "learning_rate": 0.00015295422390866398, + "loss": 3.2597, + "step": 4985 + }, + { + "epoch": 0.64, + "grad_norm": 0.6207406520843506, + "learning_rate": 0.00015285870512871835, + "loss": 3.3557, + "step": 4986 + }, + { + "epoch": 0.64, + "grad_norm": 0.6227355599403381, + "learning_rate": 0.00015276320304848757, + "loss": 3.2997, + "step": 4987 + }, + { + "epoch": 0.64, + "grad_norm": 0.6410094499588013, + "learning_rate": 0.00015266771768438948, + "loss": 3.3759, + "step": 4988 + }, + { + "epoch": 0.64, + "grad_norm": 0.6022902131080627, + "learning_rate": 0.00015257224905283913, + "loss": 3.2474, + "step": 4989 + }, + { + "epoch": 0.64, + "grad_norm": 0.6035560965538025, + "learning_rate": 0.00015247679717024854, + "loss": 3.3183, + "step": 4990 + }, + { + "epoch": 0.64, + "grad_norm": 0.6439880132675171, + "learning_rate": 0.0001523813620530272, + "loss": 3.3303, + "step": 4991 + }, + { + "epoch": 0.64, + "grad_norm": 0.6099724173545837, + "learning_rate": 0.00015228594371758137, + "loss": 3.223, + "step": 4992 + }, + { + "epoch": 0.64, + "grad_norm": 0.6340009570121765, + "learning_rate": 0.00015219054218031458, + "loss": 3.2488, + "step": 4993 + }, + { + "epoch": 0.64, + "grad_norm": 0.6087921261787415, + "learning_rate": 0.0001520951574576276, + "loss": 3.2731, + "step": 4994 + }, + { + "epoch": 0.64, + "grad_norm": 0.6423810124397278, + "learning_rate": 0.0001519997895659179, + "loss": 3.1672, + "step": 4995 + }, + { + "epoch": 0.64, + "grad_norm": 0.5887874960899353, + "learning_rate": 0.00015190443852158058, + "loss": 3.2284, + "step": 4996 + }, + { + "epoch": 0.64, + "grad_norm": 0.6974184513092041, + "learning_rate": 0.00015180910434100747, + "loss": 3.3678, + "step": 4997 + }, + { + "epoch": 0.64, + "grad_norm": 0.5889269113540649, + "learning_rate": 0.00015171378704058772, + "loss": 3.2248, + "step": 4998 + }, + { + "epoch": 0.64, + "grad_norm": 0.6352993249893188, + "learning_rate": 0.0001516184866367074, + "loss": 3.2472, + "step": 4999 + }, + { + "epoch": 0.64, + "grad_norm": 0.5974810719490051, + "learning_rate": 0.00015152320314574974, + "loss": 3.2709, + "step": 5000 + }, + { + "epoch": 0.64, + "grad_norm": 0.6414270997047424, + "learning_rate": 0.0001514279365840951, + "loss": 3.2805, + "step": 5001 + }, + { + "epoch": 0.64, + "grad_norm": 0.6457729339599609, + "learning_rate": 0.0001513326869681209, + "loss": 3.2465, + "step": 5002 + }, + { + "epoch": 0.64, + "grad_norm": 0.5834159851074219, + "learning_rate": 0.00015123745431420169, + "loss": 3.3469, + "step": 5003 + }, + { + "epoch": 0.64, + "grad_norm": 0.6347243189811707, + "learning_rate": 0.000151142238638709, + "loss": 3.2968, + "step": 5004 + }, + { + "epoch": 0.64, + "grad_norm": 0.6431541442871094, + "learning_rate": 0.00015104703995801145, + "loss": 3.2673, + "step": 5005 + }, + { + "epoch": 0.64, + "grad_norm": 0.6189345717430115, + "learning_rate": 0.000150951858288475, + "loss": 3.3624, + "step": 5006 + }, + { + "epoch": 0.64, + "grad_norm": 0.6066557168960571, + "learning_rate": 0.00015085669364646242, + "loss": 3.2837, + "step": 5007 + }, + { + "epoch": 0.64, + "grad_norm": 0.6095768809318542, + "learning_rate": 0.0001507615460483335, + "loss": 3.2273, + "step": 5008 + }, + { + "epoch": 0.64, + "grad_norm": 0.6279232501983643, + "learning_rate": 0.00015066641551044532, + "loss": 3.3132, + "step": 5009 + }, + { + "epoch": 0.64, + "grad_norm": 0.6557057499885559, + "learning_rate": 0.00015057130204915192, + "loss": 3.4269, + "step": 5010 + }, + { + "epoch": 0.64, + "grad_norm": 0.6350106596946716, + "learning_rate": 0.00015047620568080428, + "loss": 3.2787, + "step": 5011 + }, + { + "epoch": 0.64, + "grad_norm": 0.6168920993804932, + "learning_rate": 0.00015038112642175072, + "loss": 3.2072, + "step": 5012 + }, + { + "epoch": 0.64, + "grad_norm": 0.6138224005699158, + "learning_rate": 0.00015028606428833645, + "loss": 3.3762, + "step": 5013 + }, + { + "epoch": 0.64, + "grad_norm": 0.6427010893821716, + "learning_rate": 0.00015019101929690358, + "loss": 3.4177, + "step": 5014 + }, + { + "epoch": 0.64, + "grad_norm": 0.6091998815536499, + "learning_rate": 0.00015009599146379162, + "loss": 3.3719, + "step": 5015 + }, + { + "epoch": 0.64, + "grad_norm": 0.6215112209320068, + "learning_rate": 0.00015000098080533697, + "loss": 3.3247, + "step": 5016 + }, + { + "epoch": 0.64, + "grad_norm": 0.6604453325271606, + "learning_rate": 0.00014990598733787304, + "loss": 3.2659, + "step": 5017 + }, + { + "epoch": 0.64, + "grad_norm": 0.643018364906311, + "learning_rate": 0.0001498110110777302, + "loss": 3.1471, + "step": 5018 + }, + { + "epoch": 0.64, + "grad_norm": 0.6082643270492554, + "learning_rate": 0.00014971605204123608, + "loss": 3.2765, + "step": 5019 + }, + { + "epoch": 0.64, + "grad_norm": 0.6341992020606995, + "learning_rate": 0.00014962111024471522, + "loss": 3.4197, + "step": 5020 + }, + { + "epoch": 0.64, + "grad_norm": 0.6241186261177063, + "learning_rate": 0.00014952618570448923, + "loss": 3.3389, + "step": 5021 + }, + { + "epoch": 0.64, + "grad_norm": 0.6271200180053711, + "learning_rate": 0.00014943127843687658, + "loss": 3.2994, + "step": 5022 + }, + { + "epoch": 0.64, + "grad_norm": 0.6608973741531372, + "learning_rate": 0.0001493363884581931, + "loss": 3.4353, + "step": 5023 + }, + { + "epoch": 0.64, + "grad_norm": 0.6565616726875305, + "learning_rate": 0.0001492415157847515, + "loss": 3.3183, + "step": 5024 + }, + { + "epoch": 0.64, + "grad_norm": 0.5876268744468689, + "learning_rate": 0.0001491466604328614, + "loss": 3.3067, + "step": 5025 + }, + { + "epoch": 0.64, + "grad_norm": 0.5943182706832886, + "learning_rate": 0.00014905182241882955, + "loss": 3.276, + "step": 5026 + }, + { + "epoch": 0.64, + "grad_norm": 0.5989049673080444, + "learning_rate": 0.00014895700175895978, + "loss": 3.3227, + "step": 5027 + }, + { + "epoch": 0.64, + "grad_norm": 0.6210485696792603, + "learning_rate": 0.00014886219846955276, + "loss": 3.2309, + "step": 5028 + }, + { + "epoch": 0.64, + "grad_norm": 0.6408283114433289, + "learning_rate": 0.0001487674125669063, + "loss": 3.3571, + "step": 5029 + }, + { + "epoch": 0.64, + "grad_norm": 0.6775360107421875, + "learning_rate": 0.00014867264406731524, + "loss": 3.3817, + "step": 5030 + }, + { + "epoch": 0.64, + "grad_norm": 0.7827131748199463, + "learning_rate": 0.00014857789298707133, + "loss": 3.2939, + "step": 5031 + }, + { + "epoch": 0.64, + "grad_norm": 0.6316560506820679, + "learning_rate": 0.0001484831593424633, + "loss": 3.0717, + "step": 5032 + }, + { + "epoch": 0.64, + "grad_norm": 0.5866991877555847, + "learning_rate": 0.00014838844314977719, + "loss": 3.4829, + "step": 5033 + }, + { + "epoch": 0.64, + "grad_norm": 0.652138888835907, + "learning_rate": 0.00014829374442529563, + "loss": 3.3932, + "step": 5034 + }, + { + "epoch": 0.64, + "grad_norm": 0.5911225080490112, + "learning_rate": 0.00014819906318529858, + "loss": 3.3848, + "step": 5035 + }, + { + "epoch": 0.64, + "grad_norm": 0.6234878301620483, + "learning_rate": 0.00014810439944606263, + "loss": 3.2459, + "step": 5036 + }, + { + "epoch": 0.64, + "grad_norm": 0.6251317262649536, + "learning_rate": 0.00014800975322386175, + "loss": 3.2907, + "step": 5037 + }, + { + "epoch": 0.64, + "grad_norm": 0.6247319579124451, + "learning_rate": 0.00014791512453496669, + "loss": 3.3405, + "step": 5038 + }, + { + "epoch": 0.64, + "grad_norm": 0.6416252255439758, + "learning_rate": 0.00014782051339564512, + "loss": 3.3899, + "step": 5039 + }, + { + "epoch": 0.65, + "grad_norm": 0.6290828585624695, + "learning_rate": 0.00014772591982216193, + "loss": 3.3738, + "step": 5040 + }, + { + "epoch": 0.65, + "grad_norm": 0.5640152096748352, + "learning_rate": 0.00014763134383077875, + "loss": 3.3445, + "step": 5041 + }, + { + "epoch": 0.65, + "grad_norm": 0.6488463878631592, + "learning_rate": 0.00014753678543775428, + "loss": 3.3709, + "step": 5042 + }, + { + "epoch": 0.65, + "grad_norm": 0.6063884496688843, + "learning_rate": 0.0001474422446593443, + "loss": 3.4388, + "step": 5043 + }, + { + "epoch": 0.65, + "grad_norm": 0.6267374157905579, + "learning_rate": 0.0001473477215118014, + "loss": 3.2639, + "step": 5044 + }, + { + "epoch": 0.65, + "grad_norm": 0.6429750919342041, + "learning_rate": 0.00014725321601137526, + "loss": 3.3561, + "step": 5045 + }, + { + "epoch": 0.65, + "grad_norm": 0.6482391357421875, + "learning_rate": 0.00014715872817431242, + "loss": 3.2559, + "step": 5046 + }, + { + "epoch": 0.65, + "grad_norm": 0.6009674072265625, + "learning_rate": 0.0001470642580168564, + "loss": 3.3531, + "step": 5047 + }, + { + "epoch": 0.65, + "grad_norm": 0.6284033060073853, + "learning_rate": 0.0001469698055552478, + "loss": 3.323, + "step": 5048 + }, + { + "epoch": 0.65, + "grad_norm": 0.6527823209762573, + "learning_rate": 0.00014687537080572405, + "loss": 3.2579, + "step": 5049 + }, + { + "epoch": 0.65, + "grad_norm": 0.630158007144928, + "learning_rate": 0.00014678095378451955, + "loss": 3.4153, + "step": 5050 + }, + { + "epoch": 0.65, + "grad_norm": 0.631565272808075, + "learning_rate": 0.00014668655450786566, + "loss": 3.2825, + "step": 5051 + }, + { + "epoch": 0.65, + "grad_norm": 0.6248773336410522, + "learning_rate": 0.00014659217299199084, + "loss": 3.2931, + "step": 5052 + }, + { + "epoch": 0.65, + "grad_norm": 0.608366847038269, + "learning_rate": 0.00014649780925312034, + "loss": 3.3213, + "step": 5053 + }, + { + "epoch": 0.65, + "grad_norm": 0.6143530607223511, + "learning_rate": 0.00014640346330747623, + "loss": 3.3564, + "step": 5054 + }, + { + "epoch": 0.65, + "grad_norm": 0.6777532696723938, + "learning_rate": 0.00014630913517127786, + "loss": 3.3336, + "step": 5055 + }, + { + "epoch": 0.65, + "grad_norm": 0.5899236798286438, + "learning_rate": 0.00014621482486074121, + "loss": 3.1634, + "step": 5056 + }, + { + "epoch": 0.65, + "grad_norm": 0.6264376640319824, + "learning_rate": 0.00014612053239207928, + "loss": 3.2912, + "step": 5057 + }, + { + "epoch": 0.65, + "grad_norm": 0.5884184241294861, + "learning_rate": 0.0001460262577815022, + "loss": 3.1504, + "step": 5058 + }, + { + "epoch": 0.65, + "grad_norm": 0.6290861368179321, + "learning_rate": 0.0001459320010452167, + "loss": 3.2264, + "step": 5059 + }, + { + "epoch": 0.65, + "grad_norm": 0.648082435131073, + "learning_rate": 0.00014583776219942665, + "loss": 3.3968, + "step": 5060 + }, + { + "epoch": 0.65, + "grad_norm": 0.6225313544273376, + "learning_rate": 0.00014574354126033293, + "loss": 3.2704, + "step": 5061 + }, + { + "epoch": 0.65, + "grad_norm": 0.6109769344329834, + "learning_rate": 0.0001456493382441331, + "loss": 3.1933, + "step": 5062 + }, + { + "epoch": 0.65, + "grad_norm": 0.6114202737808228, + "learning_rate": 0.00014555515316702175, + "loss": 3.2708, + "step": 5063 + }, + { + "epoch": 0.65, + "grad_norm": 0.6244300603866577, + "learning_rate": 0.0001454609860451904, + "loss": 3.3708, + "step": 5064 + }, + { + "epoch": 0.65, + "grad_norm": 0.6992241740226746, + "learning_rate": 0.0001453668368948275, + "loss": 3.1895, + "step": 5065 + }, + { + "epoch": 0.65, + "grad_norm": 0.6371669769287109, + "learning_rate": 0.00014527270573211823, + "loss": 3.3398, + "step": 5066 + }, + { + "epoch": 0.65, + "grad_norm": 0.6179408431053162, + "learning_rate": 0.00014517859257324507, + "loss": 3.2487, + "step": 5067 + }, + { + "epoch": 0.65, + "grad_norm": 0.6645209193229675, + "learning_rate": 0.000145084497434387, + "loss": 3.313, + "step": 5068 + }, + { + "epoch": 0.65, + "grad_norm": 0.5824407339096069, + "learning_rate": 0.00014499042033172, + "loss": 3.2539, + "step": 5069 + }, + { + "epoch": 0.65, + "grad_norm": 0.6037798523902893, + "learning_rate": 0.00014489636128141726, + "loss": 3.2942, + "step": 5070 + }, + { + "epoch": 0.65, + "grad_norm": 0.6334280967712402, + "learning_rate": 0.00014480232029964849, + "loss": 3.3655, + "step": 5071 + }, + { + "epoch": 0.65, + "grad_norm": 0.5741010308265686, + "learning_rate": 0.0001447082974025804, + "loss": 3.2918, + "step": 5072 + }, + { + "epoch": 0.65, + "grad_norm": 0.6136914491653442, + "learning_rate": 0.0001446142926063766, + "loss": 3.2696, + "step": 5073 + }, + { + "epoch": 0.65, + "grad_norm": 0.6093493103981018, + "learning_rate": 0.00014452030592719756, + "loss": 3.1996, + "step": 5074 + }, + { + "epoch": 0.65, + "grad_norm": 0.6550522446632385, + "learning_rate": 0.00014442633738120087, + "loss": 3.3622, + "step": 5075 + }, + { + "epoch": 0.65, + "grad_norm": 0.6372076869010925, + "learning_rate": 0.0001443323869845407, + "loss": 3.3896, + "step": 5076 + }, + { + "epoch": 0.65, + "grad_norm": 0.6629884839057922, + "learning_rate": 0.0001442384547533682, + "loss": 3.348, + "step": 5077 + }, + { + "epoch": 0.65, + "grad_norm": 0.6217200756072998, + "learning_rate": 0.00014414454070383142, + "loss": 3.2738, + "step": 5078 + }, + { + "epoch": 0.65, + "grad_norm": 0.6206686496734619, + "learning_rate": 0.00014405064485207517, + "loss": 3.2477, + "step": 5079 + }, + { + "epoch": 0.65, + "grad_norm": 0.6034120321273804, + "learning_rate": 0.00014395676721424145, + "loss": 3.3615, + "step": 5080 + }, + { + "epoch": 0.65, + "grad_norm": 0.5921332836151123, + "learning_rate": 0.00014386290780646872, + "loss": 3.3844, + "step": 5081 + }, + { + "epoch": 0.65, + "grad_norm": 0.6570034027099609, + "learning_rate": 0.00014376906664489265, + "loss": 3.4145, + "step": 5082 + }, + { + "epoch": 0.65, + "grad_norm": 0.5640652179718018, + "learning_rate": 0.00014367524374564556, + "loss": 3.2098, + "step": 5083 + }, + { + "epoch": 0.65, + "grad_norm": 0.6425613760948181, + "learning_rate": 0.00014358143912485672, + "loss": 3.3405, + "step": 5084 + }, + { + "epoch": 0.65, + "grad_norm": 0.6288067698478699, + "learning_rate": 0.0001434876527986522, + "loss": 3.3572, + "step": 5085 + }, + { + "epoch": 0.65, + "grad_norm": 0.5896071195602417, + "learning_rate": 0.00014339388478315496, + "loss": 3.2689, + "step": 5086 + }, + { + "epoch": 0.65, + "grad_norm": 0.6366032361984253, + "learning_rate": 0.00014330013509448468, + "loss": 3.4776, + "step": 5087 + }, + { + "epoch": 0.65, + "grad_norm": 0.6185347437858582, + "learning_rate": 0.00014320640374875827, + "loss": 3.2119, + "step": 5088 + }, + { + "epoch": 0.65, + "grad_norm": 0.6984763145446777, + "learning_rate": 0.00014311269076208903, + "loss": 3.3938, + "step": 5089 + }, + { + "epoch": 0.65, + "grad_norm": 0.6636654734611511, + "learning_rate": 0.00014301899615058747, + "loss": 3.2879, + "step": 5090 + }, + { + "epoch": 0.65, + "grad_norm": 0.576370894908905, + "learning_rate": 0.0001429253199303607, + "loss": 3.3888, + "step": 5091 + }, + { + "epoch": 0.65, + "grad_norm": 0.6354531645774841, + "learning_rate": 0.00014283166211751276, + "loss": 3.2763, + "step": 5092 + }, + { + "epoch": 0.65, + "grad_norm": 0.6325899362564087, + "learning_rate": 0.0001427380227281445, + "loss": 3.0955, + "step": 5093 + }, + { + "epoch": 0.65, + "grad_norm": 0.6241953372955322, + "learning_rate": 0.00014264440177835363, + "loss": 3.2864, + "step": 5094 + }, + { + "epoch": 0.65, + "grad_norm": 0.6474228501319885, + "learning_rate": 0.00014255079928423455, + "loss": 3.2663, + "step": 5095 + }, + { + "epoch": 0.65, + "grad_norm": 0.6273337006568909, + "learning_rate": 0.00014245721526187882, + "loss": 3.4285, + "step": 5096 + }, + { + "epoch": 0.65, + "grad_norm": 0.5948605537414551, + "learning_rate": 0.00014236364972737447, + "loss": 3.2559, + "step": 5097 + }, + { + "epoch": 0.65, + "grad_norm": 0.6316484808921814, + "learning_rate": 0.00014227010269680663, + "loss": 3.297, + "step": 5098 + }, + { + "epoch": 0.65, + "grad_norm": 0.6476810574531555, + "learning_rate": 0.00014217657418625707, + "loss": 3.3244, + "step": 5099 + }, + { + "epoch": 0.65, + "grad_norm": 0.6473725438117981, + "learning_rate": 0.0001420830642118044, + "loss": 3.2591, + "step": 5100 + }, + { + "epoch": 0.65, + "grad_norm": 0.7240094542503357, + "learning_rate": 0.00014198957278952406, + "loss": 3.3884, + "step": 5101 + }, + { + "epoch": 0.65, + "grad_norm": 0.6182361245155334, + "learning_rate": 0.00014189609993548824, + "loss": 3.293, + "step": 5102 + }, + { + "epoch": 0.65, + "grad_norm": 0.6440767049789429, + "learning_rate": 0.00014180264566576617, + "loss": 3.3392, + "step": 5103 + }, + { + "epoch": 0.65, + "grad_norm": 0.6826469898223877, + "learning_rate": 0.0001417092099964236, + "loss": 3.2833, + "step": 5104 + }, + { + "epoch": 0.65, + "grad_norm": 0.6628642082214355, + "learning_rate": 0.00014161579294352333, + "loss": 3.3416, + "step": 5105 + }, + { + "epoch": 0.65, + "grad_norm": 0.6261721849441528, + "learning_rate": 0.0001415223945231246, + "loss": 3.3862, + "step": 5106 + }, + { + "epoch": 0.65, + "grad_norm": 0.5831178426742554, + "learning_rate": 0.00014142901475128395, + "loss": 3.272, + "step": 5107 + }, + { + "epoch": 0.65, + "grad_norm": 0.6141115427017212, + "learning_rate": 0.00014133565364405433, + "loss": 3.3945, + "step": 5108 + }, + { + "epoch": 0.65, + "grad_norm": 0.58644038438797, + "learning_rate": 0.0001412423112174856, + "loss": 3.2059, + "step": 5109 + }, + { + "epoch": 0.65, + "grad_norm": 0.5988340377807617, + "learning_rate": 0.0001411489874876243, + "loss": 3.3375, + "step": 5110 + }, + { + "epoch": 0.65, + "grad_norm": 0.6062052249908447, + "learning_rate": 0.00014105568247051403, + "loss": 3.3096, + "step": 5111 + }, + { + "epoch": 0.65, + "grad_norm": 0.6299193501472473, + "learning_rate": 0.00014096239618219492, + "loss": 3.3522, + "step": 5112 + }, + { + "epoch": 0.65, + "grad_norm": 0.6424022316932678, + "learning_rate": 0.00014086912863870403, + "loss": 3.1548, + "step": 5113 + }, + { + "epoch": 0.65, + "grad_norm": 0.6243067979812622, + "learning_rate": 0.00014077587985607504, + "loss": 3.3714, + "step": 5114 + }, + { + "epoch": 0.65, + "grad_norm": 0.6548874378204346, + "learning_rate": 0.00014068264985033857, + "loss": 3.354, + "step": 5115 + }, + { + "epoch": 0.65, + "grad_norm": 0.6031063199043274, + "learning_rate": 0.00014058943863752178, + "loss": 3.2291, + "step": 5116 + }, + { + "epoch": 0.65, + "grad_norm": 0.6349620223045349, + "learning_rate": 0.0001404962462336489, + "loss": 3.2758, + "step": 5117 + }, + { + "epoch": 0.66, + "grad_norm": 0.7188941836357117, + "learning_rate": 0.00014040307265474086, + "loss": 3.208, + "step": 5118 + }, + { + "epoch": 0.66, + "grad_norm": 0.6274533867835999, + "learning_rate": 0.00014030991791681518, + "loss": 3.3663, + "step": 5119 + }, + { + "epoch": 0.66, + "grad_norm": 0.6238290071487427, + "learning_rate": 0.00014021678203588627, + "loss": 3.3302, + "step": 5120 + }, + { + "epoch": 0.66, + "grad_norm": 0.6041349172592163, + "learning_rate": 0.00014012366502796526, + "loss": 3.2348, + "step": 5121 + }, + { + "epoch": 0.66, + "grad_norm": 0.6019901037216187, + "learning_rate": 0.00014003056690906, + "loss": 3.2653, + "step": 5122 + }, + { + "epoch": 0.66, + "grad_norm": 0.6081122756004333, + "learning_rate": 0.00013993748769517507, + "loss": 3.4407, + "step": 5123 + }, + { + "epoch": 0.66, + "grad_norm": 0.6346868872642517, + "learning_rate": 0.00013984442740231203, + "loss": 3.287, + "step": 5124 + }, + { + "epoch": 0.66, + "grad_norm": 0.587018609046936, + "learning_rate": 0.00013975138604646888, + "loss": 3.4186, + "step": 5125 + }, + { + "epoch": 0.66, + "grad_norm": 0.5837512612342834, + "learning_rate": 0.00013965836364364067, + "loss": 3.249, + "step": 5126 + }, + { + "epoch": 0.66, + "grad_norm": 0.608168363571167, + "learning_rate": 0.00013956536020981897, + "loss": 3.2367, + "step": 5127 + }, + { + "epoch": 0.66, + "grad_norm": 0.6023334264755249, + "learning_rate": 0.0001394723757609921, + "loss": 3.3814, + "step": 5128 + }, + { + "epoch": 0.66, + "grad_norm": 0.6250126361846924, + "learning_rate": 0.00013937941031314516, + "loss": 3.2889, + "step": 5129 + }, + { + "epoch": 0.66, + "grad_norm": 0.6232753992080688, + "learning_rate": 0.00013928646388226002, + "loss": 3.329, + "step": 5130 + }, + { + "epoch": 0.66, + "grad_norm": 0.6164801716804504, + "learning_rate": 0.00013919353648431516, + "loss": 3.297, + "step": 5131 + }, + { + "epoch": 0.66, + "grad_norm": 0.6190966963768005, + "learning_rate": 0.00013910062813528605, + "loss": 3.3271, + "step": 5132 + }, + { + "epoch": 0.66, + "grad_norm": 0.6317085027694702, + "learning_rate": 0.0001390077388511446, + "loss": 3.4416, + "step": 5133 + }, + { + "epoch": 0.66, + "grad_norm": 0.5752818584442139, + "learning_rate": 0.0001389148686478595, + "loss": 3.2217, + "step": 5134 + }, + { + "epoch": 0.66, + "grad_norm": 0.6402003169059753, + "learning_rate": 0.00013882201754139638, + "loss": 3.3543, + "step": 5135 + }, + { + "epoch": 0.66, + "grad_norm": 0.6417957544326782, + "learning_rate": 0.0001387291855477173, + "loss": 3.2738, + "step": 5136 + }, + { + "epoch": 0.66, + "grad_norm": 0.6363396644592285, + "learning_rate": 0.00013863637268278123, + "loss": 3.2036, + "step": 5137 + }, + { + "epoch": 0.66, + "grad_norm": 0.6913270950317383, + "learning_rate": 0.0001385435789625436, + "loss": 3.3316, + "step": 5138 + }, + { + "epoch": 0.66, + "grad_norm": 0.6306250691413879, + "learning_rate": 0.00013845080440295698, + "loss": 3.3967, + "step": 5139 + }, + { + "epoch": 0.66, + "grad_norm": 0.6392128467559814, + "learning_rate": 0.00013835804901997029, + "loss": 3.4899, + "step": 5140 + }, + { + "epoch": 0.66, + "grad_norm": 0.6294986605644226, + "learning_rate": 0.0001382653128295292, + "loss": 3.2473, + "step": 5141 + }, + { + "epoch": 0.66, + "grad_norm": 0.6198727488517761, + "learning_rate": 0.00013817259584757619, + "loss": 3.3429, + "step": 5142 + }, + { + "epoch": 0.66, + "grad_norm": 0.6190699338912964, + "learning_rate": 0.0001380798980900503, + "loss": 3.3584, + "step": 5143 + }, + { + "epoch": 0.66, + "grad_norm": 0.5895346999168396, + "learning_rate": 0.00013798721957288747, + "loss": 3.4016, + "step": 5144 + }, + { + "epoch": 0.66, + "grad_norm": 0.6533825993537903, + "learning_rate": 0.0001378945603120202, + "loss": 3.3099, + "step": 5145 + }, + { + "epoch": 0.66, + "grad_norm": 0.6693593263626099, + "learning_rate": 0.00013780192032337752, + "loss": 3.23, + "step": 5146 + }, + { + "epoch": 0.66, + "grad_norm": 0.6089479327201843, + "learning_rate": 0.00013770929962288552, + "loss": 3.3238, + "step": 5147 + }, + { + "epoch": 0.66, + "grad_norm": 0.644583523273468, + "learning_rate": 0.00013761669822646676, + "loss": 3.2007, + "step": 5148 + }, + { + "epoch": 0.66, + "grad_norm": 0.6629884243011475, + "learning_rate": 0.0001375241161500404, + "loss": 3.2607, + "step": 5149 + }, + { + "epoch": 0.66, + "grad_norm": 0.6261684894561768, + "learning_rate": 0.00013743155340952242, + "loss": 3.3117, + "step": 5150 + }, + { + "epoch": 0.66, + "grad_norm": 0.6273629665374756, + "learning_rate": 0.00013733901002082544, + "loss": 3.2865, + "step": 5151 + }, + { + "epoch": 0.66, + "grad_norm": 0.6482716202735901, + "learning_rate": 0.00013724648599985857, + "loss": 3.328, + "step": 5152 + }, + { + "epoch": 0.66, + "grad_norm": 0.6402632594108582, + "learning_rate": 0.00013715398136252794, + "loss": 3.3984, + "step": 5153 + }, + { + "epoch": 0.66, + "grad_norm": 0.6050584316253662, + "learning_rate": 0.0001370614961247362, + "loss": 3.4794, + "step": 5154 + }, + { + "epoch": 0.66, + "grad_norm": 0.6375922560691833, + "learning_rate": 0.00013696903030238262, + "loss": 3.2708, + "step": 5155 + }, + { + "epoch": 0.66, + "grad_norm": 0.6117022037506104, + "learning_rate": 0.00013687658391136305, + "loss": 3.4379, + "step": 5156 + }, + { + "epoch": 0.66, + "grad_norm": 0.6070494651794434, + "learning_rate": 0.00013678415696757016, + "loss": 3.2773, + "step": 5157 + }, + { + "epoch": 0.66, + "grad_norm": 0.5802207589149475, + "learning_rate": 0.00013669174948689318, + "loss": 3.2079, + "step": 5158 + }, + { + "epoch": 0.66, + "grad_norm": 0.5739595293998718, + "learning_rate": 0.000136599361485218, + "loss": 3.3533, + "step": 5159 + }, + { + "epoch": 0.66, + "grad_norm": 0.6401107907295227, + "learning_rate": 0.0001365069929784273, + "loss": 3.31, + "step": 5160 + }, + { + "epoch": 0.66, + "grad_norm": 0.6143526434898376, + "learning_rate": 0.00013641464398240021, + "loss": 3.3205, + "step": 5161 + }, + { + "epoch": 0.66, + "grad_norm": 0.6221523284912109, + "learning_rate": 0.00013632231451301256, + "loss": 3.3169, + "step": 5162 + }, + { + "epoch": 0.66, + "grad_norm": 0.6104118824005127, + "learning_rate": 0.000136230004586137, + "loss": 3.3427, + "step": 5163 + }, + { + "epoch": 0.66, + "grad_norm": 0.6160389184951782, + "learning_rate": 0.00013613771421764254, + "loss": 3.3304, + "step": 5164 + }, + { + "epoch": 0.66, + "grad_norm": 0.607818603515625, + "learning_rate": 0.00013604544342339506, + "loss": 3.2499, + "step": 5165 + }, + { + "epoch": 0.66, + "grad_norm": 0.64719557762146, + "learning_rate": 0.0001359531922192569, + "loss": 3.2432, + "step": 5166 + }, + { + "epoch": 0.66, + "grad_norm": 0.597808837890625, + "learning_rate": 0.0001358609606210871, + "loss": 3.3301, + "step": 5167 + }, + { + "epoch": 0.66, + "grad_norm": 0.6585919260978699, + "learning_rate": 0.00013576874864474142, + "loss": 3.3205, + "step": 5168 + }, + { + "epoch": 0.66, + "grad_norm": 0.6387555599212646, + "learning_rate": 0.0001356765563060721, + "loss": 3.3725, + "step": 5169 + }, + { + "epoch": 0.66, + "grad_norm": 0.6279529929161072, + "learning_rate": 0.00013558438362092816, + "loss": 3.3865, + "step": 5170 + }, + { + "epoch": 0.66, + "grad_norm": 0.6626907587051392, + "learning_rate": 0.00013549223060515503, + "loss": 3.2946, + "step": 5171 + }, + { + "epoch": 0.66, + "grad_norm": 0.6042270064353943, + "learning_rate": 0.000135400097274595, + "loss": 3.3273, + "step": 5172 + }, + { + "epoch": 0.66, + "grad_norm": 0.5902952551841736, + "learning_rate": 0.00013530798364508678, + "loss": 3.4654, + "step": 5173 + }, + { + "epoch": 0.66, + "grad_norm": 0.6035054326057434, + "learning_rate": 0.00013521588973246573, + "loss": 3.1904, + "step": 5174 + }, + { + "epoch": 0.66, + "grad_norm": 0.6701990962028503, + "learning_rate": 0.00013512381555256403, + "loss": 3.2674, + "step": 5175 + }, + { + "epoch": 0.66, + "grad_norm": 0.6053285598754883, + "learning_rate": 0.0001350317611212102, + "loss": 3.2791, + "step": 5176 + }, + { + "epoch": 0.66, + "grad_norm": 0.6039866805076599, + "learning_rate": 0.00013493972645422942, + "loss": 3.2819, + "step": 5177 + }, + { + "epoch": 0.66, + "grad_norm": 0.6498487591743469, + "learning_rate": 0.00013484771156744356, + "loss": 3.4138, + "step": 5178 + }, + { + "epoch": 0.66, + "grad_norm": 0.5941669940948486, + "learning_rate": 0.0001347557164766711, + "loss": 3.3125, + "step": 5179 + }, + { + "epoch": 0.66, + "grad_norm": 0.5859676599502563, + "learning_rate": 0.00013466374119772685, + "loss": 3.2442, + "step": 5180 + }, + { + "epoch": 0.66, + "grad_norm": 0.6273823976516724, + "learning_rate": 0.0001345717857464226, + "loss": 3.3161, + "step": 5181 + }, + { + "epoch": 0.66, + "grad_norm": 0.641112744808197, + "learning_rate": 0.00013447985013856665, + "loss": 3.3408, + "step": 5182 + }, + { + "epoch": 0.66, + "grad_norm": 0.6055769324302673, + "learning_rate": 0.00013438793438996365, + "loss": 3.2671, + "step": 5183 + }, + { + "epoch": 0.66, + "grad_norm": 0.6235153675079346, + "learning_rate": 0.00013429603851641505, + "loss": 3.2637, + "step": 5184 + }, + { + "epoch": 0.66, + "grad_norm": 0.6751651763916016, + "learning_rate": 0.0001342041625337188, + "loss": 3.1511, + "step": 5185 + }, + { + "epoch": 0.66, + "grad_norm": 0.6396401524543762, + "learning_rate": 0.00013411230645766936, + "loss": 3.2449, + "step": 5186 + }, + { + "epoch": 0.66, + "grad_norm": 0.623742938041687, + "learning_rate": 0.000134020470304058, + "loss": 3.3573, + "step": 5187 + }, + { + "epoch": 0.66, + "grad_norm": 0.6788823008537292, + "learning_rate": 0.00013392865408867223, + "loss": 3.3257, + "step": 5188 + }, + { + "epoch": 0.66, + "grad_norm": 0.6515249013900757, + "learning_rate": 0.0001338368578272965, + "loss": 3.3549, + "step": 5189 + }, + { + "epoch": 0.66, + "grad_norm": 0.624242901802063, + "learning_rate": 0.00013374508153571153, + "loss": 3.2207, + "step": 5190 + }, + { + "epoch": 0.66, + "grad_norm": 0.7381793260574341, + "learning_rate": 0.00013365332522969486, + "loss": 3.3485, + "step": 5191 + }, + { + "epoch": 0.66, + "grad_norm": 0.5954594016075134, + "learning_rate": 0.00013356158892502038, + "loss": 3.2959, + "step": 5192 + }, + { + "epoch": 0.66, + "grad_norm": 0.6428266763687134, + "learning_rate": 0.00013346987263745862, + "loss": 3.2255, + "step": 5193 + }, + { + "epoch": 0.66, + "grad_norm": 0.6748354434967041, + "learning_rate": 0.00013337817638277673, + "loss": 3.3791, + "step": 5194 + }, + { + "epoch": 0.66, + "grad_norm": 0.612354040145874, + "learning_rate": 0.0001332865001767382, + "loss": 3.3375, + "step": 5195 + }, + { + "epoch": 0.67, + "grad_norm": 0.6095655560493469, + "learning_rate": 0.00013319484403510345, + "loss": 3.2663, + "step": 5196 + }, + { + "epoch": 0.67, + "grad_norm": 0.5803064107894897, + "learning_rate": 0.00013310320797362915, + "loss": 3.3649, + "step": 5197 + }, + { + "epoch": 0.67, + "grad_norm": 0.6813079118728638, + "learning_rate": 0.00013301159200806856, + "loss": 3.3442, + "step": 5198 + }, + { + "epoch": 0.67, + "grad_norm": 0.6259683966636658, + "learning_rate": 0.00013291999615417147, + "loss": 3.4004, + "step": 5199 + }, + { + "epoch": 0.67, + "grad_norm": 0.5949110388755798, + "learning_rate": 0.00013282842042768446, + "loss": 3.2591, + "step": 5200 + }, + { + "epoch": 0.67, + "grad_norm": 0.641460120677948, + "learning_rate": 0.0001327368648443503, + "loss": 3.3612, + "step": 5201 + }, + { + "epoch": 0.67, + "grad_norm": 0.5888059735298157, + "learning_rate": 0.00013264532941990853, + "loss": 3.3093, + "step": 5202 + }, + { + "epoch": 0.67, + "grad_norm": 0.629054605960846, + "learning_rate": 0.00013255381417009502, + "loss": 3.2571, + "step": 5203 + }, + { + "epoch": 0.67, + "grad_norm": 0.6009963154792786, + "learning_rate": 0.0001324623191106425, + "loss": 3.2093, + "step": 5204 + }, + { + "epoch": 0.67, + "grad_norm": 0.6035163998603821, + "learning_rate": 0.00013237084425727995, + "loss": 3.346, + "step": 5205 + }, + { + "epoch": 0.67, + "grad_norm": 0.6143518090248108, + "learning_rate": 0.00013227938962573295, + "loss": 3.3584, + "step": 5206 + }, + { + "epoch": 0.67, + "grad_norm": 0.6170941591262817, + "learning_rate": 0.0001321879552317236, + "loss": 3.2979, + "step": 5207 + }, + { + "epoch": 0.67, + "grad_norm": 0.5544931888580322, + "learning_rate": 0.00013209654109097043, + "loss": 3.3117, + "step": 5208 + }, + { + "epoch": 0.67, + "grad_norm": 0.5998968482017517, + "learning_rate": 0.00013200514721918883, + "loss": 3.1606, + "step": 5209 + }, + { + "epoch": 0.67, + "grad_norm": 0.6752246618270874, + "learning_rate": 0.00013191377363209022, + "loss": 3.2952, + "step": 5210 + }, + { + "epoch": 0.67, + "grad_norm": 0.656449019908905, + "learning_rate": 0.000131822420345383, + "loss": 3.2877, + "step": 5211 + }, + { + "epoch": 0.67, + "grad_norm": 0.6037731170654297, + "learning_rate": 0.00013173108737477173, + "loss": 3.3158, + "step": 5212 + }, + { + "epoch": 0.67, + "grad_norm": 0.6379036903381348, + "learning_rate": 0.00013163977473595767, + "loss": 3.3287, + "step": 5213 + }, + { + "epoch": 0.67, + "grad_norm": 0.569927990436554, + "learning_rate": 0.00013154848244463846, + "loss": 3.297, + "step": 5214 + }, + { + "epoch": 0.67, + "grad_norm": 0.6360076665878296, + "learning_rate": 0.00013145721051650833, + "loss": 3.3574, + "step": 5215 + }, + { + "epoch": 0.67, + "grad_norm": 0.5665431618690491, + "learning_rate": 0.00013136595896725786, + "loss": 3.2449, + "step": 5216 + }, + { + "epoch": 0.67, + "grad_norm": 0.583529531955719, + "learning_rate": 0.00013127472781257439, + "loss": 3.2924, + "step": 5217 + }, + { + "epoch": 0.67, + "grad_norm": 0.5858355164527893, + "learning_rate": 0.0001311835170681417, + "loss": 3.2698, + "step": 5218 + }, + { + "epoch": 0.67, + "grad_norm": 0.6250311732292175, + "learning_rate": 0.00013109232674963982, + "loss": 3.3428, + "step": 5219 + }, + { + "epoch": 0.67, + "grad_norm": 0.6676336526870728, + "learning_rate": 0.0001310011568727455, + "loss": 3.27, + "step": 5220 + }, + { + "epoch": 0.67, + "grad_norm": 0.6124276518821716, + "learning_rate": 0.00013091000745313187, + "loss": 3.2371, + "step": 5221 + }, + { + "epoch": 0.67, + "grad_norm": 0.6961532235145569, + "learning_rate": 0.00013081887850646857, + "loss": 3.2868, + "step": 5222 + }, + { + "epoch": 0.67, + "grad_norm": 0.6264037489891052, + "learning_rate": 0.0001307277700484217, + "loss": 3.2977, + "step": 5223 + }, + { + "epoch": 0.67, + "grad_norm": 0.6278262138366699, + "learning_rate": 0.0001306366820946538, + "loss": 3.4374, + "step": 5224 + }, + { + "epoch": 0.67, + "grad_norm": 0.598902702331543, + "learning_rate": 0.00013054561466082411, + "loss": 3.2982, + "step": 5225 + }, + { + "epoch": 0.67, + "grad_norm": 0.6131187081336975, + "learning_rate": 0.00013045456776258812, + "loss": 3.502, + "step": 5226 + }, + { + "epoch": 0.67, + "grad_norm": 0.6919788718223572, + "learning_rate": 0.0001303635414155977, + "loss": 3.3583, + "step": 5227 + }, + { + "epoch": 0.67, + "grad_norm": 0.6344329714775085, + "learning_rate": 0.00013027253563550157, + "loss": 3.3588, + "step": 5228 + }, + { + "epoch": 0.67, + "grad_norm": 0.6490606069564819, + "learning_rate": 0.00013018155043794454, + "loss": 3.2091, + "step": 5229 + }, + { + "epoch": 0.67, + "grad_norm": 0.6615791320800781, + "learning_rate": 0.00013009058583856808, + "loss": 3.3525, + "step": 5230 + }, + { + "epoch": 0.67, + "grad_norm": 0.5891029834747314, + "learning_rate": 0.0001299996418530099, + "loss": 3.3269, + "step": 5231 + }, + { + "epoch": 0.67, + "grad_norm": 0.602630078792572, + "learning_rate": 0.00012990871849690455, + "loss": 3.3488, + "step": 5232 + }, + { + "epoch": 0.67, + "grad_norm": 0.6360301375389099, + "learning_rate": 0.0001298178157858827, + "loss": 3.3218, + "step": 5233 + }, + { + "epoch": 0.67, + "grad_norm": 0.5955541729927063, + "learning_rate": 0.0001297269337355716, + "loss": 3.2836, + "step": 5234 + }, + { + "epoch": 0.67, + "grad_norm": 0.5877676606178284, + "learning_rate": 0.0001296360723615949, + "loss": 3.3302, + "step": 5235 + }, + { + "epoch": 0.67, + "grad_norm": 0.5851530432701111, + "learning_rate": 0.00012954523167957267, + "loss": 3.4018, + "step": 5236 + }, + { + "epoch": 0.67, + "grad_norm": 0.6754768490791321, + "learning_rate": 0.0001294544117051216, + "loss": 3.3248, + "step": 5237 + }, + { + "epoch": 0.67, + "grad_norm": 0.6499289870262146, + "learning_rate": 0.00012936361245385457, + "loss": 3.271, + "step": 5238 + }, + { + "epoch": 0.67, + "grad_norm": 0.5828032493591309, + "learning_rate": 0.00012927283394138122, + "loss": 3.3106, + "step": 5239 + }, + { + "epoch": 0.67, + "grad_norm": 0.622321367263794, + "learning_rate": 0.0001291820761833073, + "loss": 3.2305, + "step": 5240 + }, + { + "epoch": 0.67, + "grad_norm": 0.635196328163147, + "learning_rate": 0.0001290913391952351, + "loss": 3.2854, + "step": 5241 + }, + { + "epoch": 0.67, + "grad_norm": 0.6155919432640076, + "learning_rate": 0.00012900062299276338, + "loss": 3.3467, + "step": 5242 + }, + { + "epoch": 0.67, + "grad_norm": 0.6434239149093628, + "learning_rate": 0.00012890992759148734, + "loss": 3.409, + "step": 5243 + }, + { + "epoch": 0.67, + "grad_norm": 0.6506776213645935, + "learning_rate": 0.00012881925300699853, + "loss": 3.3755, + "step": 5244 + }, + { + "epoch": 0.67, + "grad_norm": 0.6123660802841187, + "learning_rate": 0.00012872859925488488, + "loss": 3.2831, + "step": 5245 + }, + { + "epoch": 0.67, + "grad_norm": 0.5690043568611145, + "learning_rate": 0.0001286379663507309, + "loss": 3.1536, + "step": 5246 + }, + { + "epoch": 0.67, + "grad_norm": 0.6750291585922241, + "learning_rate": 0.00012854735431011758, + "loss": 3.3318, + "step": 5247 + }, + { + "epoch": 0.67, + "grad_norm": 0.647638738155365, + "learning_rate": 0.00012845676314862203, + "loss": 3.4185, + "step": 5248 + }, + { + "epoch": 0.67, + "grad_norm": 0.6368696093559265, + "learning_rate": 0.00012836619288181795, + "loss": 3.2747, + "step": 5249 + }, + { + "epoch": 0.67, + "grad_norm": 0.6043297052383423, + "learning_rate": 0.00012827564352527543, + "loss": 3.2943, + "step": 5250 + }, + { + "epoch": 0.67, + "grad_norm": 0.6385737061500549, + "learning_rate": 0.00012818511509456092, + "loss": 3.3061, + "step": 5251 + }, + { + "epoch": 0.67, + "grad_norm": 0.6400846838951111, + "learning_rate": 0.0001280946076052372, + "loss": 3.3456, + "step": 5252 + }, + { + "epoch": 0.67, + "grad_norm": 0.5763105154037476, + "learning_rate": 0.00012800412107286384, + "loss": 3.3069, + "step": 5253 + }, + { + "epoch": 0.67, + "grad_norm": 0.6130650639533997, + "learning_rate": 0.00012791365551299624, + "loss": 3.3633, + "step": 5254 + }, + { + "epoch": 0.67, + "grad_norm": 0.6135804057121277, + "learning_rate": 0.00012782321094118672, + "loss": 3.2839, + "step": 5255 + }, + { + "epoch": 0.67, + "grad_norm": 0.5776688456535339, + "learning_rate": 0.0001277327873729836, + "loss": 3.242, + "step": 5256 + }, + { + "epoch": 0.67, + "grad_norm": 0.6195380091667175, + "learning_rate": 0.0001276423848239318, + "loss": 3.2915, + "step": 5257 + }, + { + "epoch": 0.67, + "grad_norm": 0.5916157364845276, + "learning_rate": 0.0001275520033095725, + "loss": 3.2218, + "step": 5258 + }, + { + "epoch": 0.67, + "grad_norm": 0.6159975528717041, + "learning_rate": 0.00012746164284544332, + "loss": 3.2407, + "step": 5259 + }, + { + "epoch": 0.67, + "grad_norm": 0.6072033643722534, + "learning_rate": 0.00012737130344707843, + "loss": 3.4078, + "step": 5260 + }, + { + "epoch": 0.67, + "grad_norm": 0.6531447768211365, + "learning_rate": 0.00012728098513000805, + "loss": 3.145, + "step": 5261 + }, + { + "epoch": 0.67, + "grad_norm": 0.5923706889152527, + "learning_rate": 0.00012719068790975906, + "loss": 3.366, + "step": 5262 + }, + { + "epoch": 0.67, + "grad_norm": 0.6577826738357544, + "learning_rate": 0.0001271004118018545, + "loss": 3.3171, + "step": 5263 + }, + { + "epoch": 0.67, + "grad_norm": 0.6210270524024963, + "learning_rate": 0.00012701015682181385, + "loss": 3.3118, + "step": 5264 + }, + { + "epoch": 0.67, + "grad_norm": 0.758233904838562, + "learning_rate": 0.00012691992298515317, + "loss": 3.3728, + "step": 5265 + }, + { + "epoch": 0.67, + "grad_norm": 0.6270756125450134, + "learning_rate": 0.0001268297103073846, + "loss": 3.3101, + "step": 5266 + }, + { + "epoch": 0.67, + "grad_norm": 0.6202570199966431, + "learning_rate": 0.00012673951880401663, + "loss": 3.3395, + "step": 5267 + }, + { + "epoch": 0.67, + "grad_norm": 0.5952986478805542, + "learning_rate": 0.00012664934849055442, + "loss": 3.36, + "step": 5268 + }, + { + "epoch": 0.67, + "grad_norm": 0.6283809542655945, + "learning_rate": 0.00012655919938249922, + "loss": 3.1854, + "step": 5269 + }, + { + "epoch": 0.67, + "grad_norm": 0.6460493803024292, + "learning_rate": 0.0001264690714953487, + "loss": 3.3881, + "step": 5270 + }, + { + "epoch": 0.67, + "grad_norm": 0.6002459526062012, + "learning_rate": 0.00012637896484459687, + "loss": 3.276, + "step": 5271 + }, + { + "epoch": 0.67, + "grad_norm": 0.6768071055412292, + "learning_rate": 0.00012628887944573413, + "loss": 3.3012, + "step": 5272 + }, + { + "epoch": 0.67, + "grad_norm": 0.6223396062850952, + "learning_rate": 0.00012619881531424713, + "loss": 3.2877, + "step": 5273 + }, + { + "epoch": 0.68, + "grad_norm": 0.73650723695755, + "learning_rate": 0.00012610877246561897, + "loss": 3.3105, + "step": 5274 + }, + { + "epoch": 0.68, + "grad_norm": 0.6108227968215942, + "learning_rate": 0.0001260187509153292, + "loss": 3.166, + "step": 5275 + }, + { + "epoch": 0.68, + "grad_norm": 0.6116386651992798, + "learning_rate": 0.00012592875067885345, + "loss": 3.2562, + "step": 5276 + }, + { + "epoch": 0.68, + "grad_norm": 0.6081870198249817, + "learning_rate": 0.0001258387717716638, + "loss": 3.2611, + "step": 5277 + }, + { + "epoch": 0.68, + "grad_norm": 0.6102370023727417, + "learning_rate": 0.00012574881420922873, + "loss": 3.2183, + "step": 5278 + }, + { + "epoch": 0.68, + "grad_norm": 0.5910782217979431, + "learning_rate": 0.00012565887800701291, + "loss": 3.2597, + "step": 5279 + }, + { + "epoch": 0.68, + "grad_norm": 0.5789437294006348, + "learning_rate": 0.00012556896318047733, + "loss": 3.3493, + "step": 5280 + }, + { + "epoch": 0.68, + "grad_norm": 0.5884090065956116, + "learning_rate": 0.00012547906974507968, + "loss": 3.36, + "step": 5281 + }, + { + "epoch": 0.68, + "grad_norm": 0.6605008840560913, + "learning_rate": 0.00012538919771627334, + "loss": 3.3206, + "step": 5282 + }, + { + "epoch": 0.68, + "grad_norm": 0.6063432097434998, + "learning_rate": 0.00012529934710950864, + "loss": 3.3791, + "step": 5283 + }, + { + "epoch": 0.68, + "grad_norm": 0.6339197754859924, + "learning_rate": 0.00012520951794023184, + "loss": 3.3382, + "step": 5284 + }, + { + "epoch": 0.68, + "grad_norm": 0.6664906740188599, + "learning_rate": 0.00012511971022388557, + "loss": 3.2971, + "step": 5285 + }, + { + "epoch": 0.68, + "grad_norm": 0.6363011002540588, + "learning_rate": 0.0001250299239759089, + "loss": 3.4032, + "step": 5286 + }, + { + "epoch": 0.68, + "grad_norm": 0.6400793194770813, + "learning_rate": 0.00012494015921173704, + "loss": 3.2862, + "step": 5287 + }, + { + "epoch": 0.68, + "grad_norm": 0.5983986258506775, + "learning_rate": 0.00012485041594680155, + "loss": 3.2912, + "step": 5288 + }, + { + "epoch": 0.68, + "grad_norm": 0.5790104866027832, + "learning_rate": 0.0001247606941965305, + "loss": 3.3903, + "step": 5289 + }, + { + "epoch": 0.68, + "grad_norm": 0.6162522435188293, + "learning_rate": 0.00012467099397634802, + "loss": 3.2884, + "step": 5290 + }, + { + "epoch": 0.68, + "grad_norm": 0.5953441262245178, + "learning_rate": 0.00012458131530167452, + "loss": 3.3475, + "step": 5291 + }, + { + "epoch": 0.68, + "grad_norm": 0.6053186058998108, + "learning_rate": 0.000124491658187927, + "loss": 3.3778, + "step": 5292 + }, + { + "epoch": 0.68, + "grad_norm": 0.6237776875495911, + "learning_rate": 0.00012440202265051844, + "loss": 3.3701, + "step": 5293 + }, + { + "epoch": 0.68, + "grad_norm": 0.640582799911499, + "learning_rate": 0.00012431240870485824, + "loss": 3.3292, + "step": 5294 + }, + { + "epoch": 0.68, + "grad_norm": 0.6169292330741882, + "learning_rate": 0.00012422281636635202, + "loss": 3.3628, + "step": 5295 + }, + { + "epoch": 0.68, + "grad_norm": 0.6268018484115601, + "learning_rate": 0.00012413324565040186, + "loss": 3.3225, + "step": 5296 + }, + { + "epoch": 0.68, + "grad_norm": 0.5968641638755798, + "learning_rate": 0.00012404369657240596, + "loss": 3.3803, + "step": 5297 + }, + { + "epoch": 0.68, + "grad_norm": 0.6453279256820679, + "learning_rate": 0.0001239541691477588, + "loss": 3.3369, + "step": 5298 + }, + { + "epoch": 0.68, + "grad_norm": 0.6305952668190002, + "learning_rate": 0.00012386466339185125, + "loss": 3.5307, + "step": 5299 + }, + { + "epoch": 0.68, + "grad_norm": 0.6009957194328308, + "learning_rate": 0.00012377517932007033, + "loss": 3.3499, + "step": 5300 + }, + { + "epoch": 0.68, + "grad_norm": 0.5801147222518921, + "learning_rate": 0.00012368571694779934, + "loss": 3.3182, + "step": 5301 + }, + { + "epoch": 0.68, + "grad_norm": 0.6128845810890198, + "learning_rate": 0.00012359627629041805, + "loss": 3.2508, + "step": 5302 + }, + { + "epoch": 0.68, + "grad_norm": 0.6393544673919678, + "learning_rate": 0.00012350685736330216, + "loss": 3.3248, + "step": 5303 + }, + { + "epoch": 0.68, + "grad_norm": 0.632507860660553, + "learning_rate": 0.000123417460181824, + "loss": 3.2552, + "step": 5304 + }, + { + "epoch": 0.68, + "grad_norm": 0.64713054895401, + "learning_rate": 0.00012332808476135193, + "loss": 3.3147, + "step": 5305 + }, + { + "epoch": 0.68, + "grad_norm": 0.6159345507621765, + "learning_rate": 0.00012323873111725063, + "loss": 3.3826, + "step": 5306 + }, + { + "epoch": 0.68, + "grad_norm": 0.6228779554367065, + "learning_rate": 0.00012314939926488095, + "loss": 3.3276, + "step": 5307 + }, + { + "epoch": 0.68, + "grad_norm": 0.5892593860626221, + "learning_rate": 0.0001230600892196001, + "loss": 3.307, + "step": 5308 + }, + { + "epoch": 0.68, + "grad_norm": 0.6291082501411438, + "learning_rate": 0.00012297080099676146, + "loss": 3.3732, + "step": 5309 + }, + { + "epoch": 0.68, + "grad_norm": 0.5819165110588074, + "learning_rate": 0.0001228815346117148, + "loss": 3.2115, + "step": 5310 + }, + { + "epoch": 0.68, + "grad_norm": 0.6499049067497253, + "learning_rate": 0.00012279229007980605, + "loss": 3.416, + "step": 5311 + }, + { + "epoch": 0.68, + "grad_norm": 0.6292195320129395, + "learning_rate": 0.0001227030674163774, + "loss": 3.3248, + "step": 5312 + }, + { + "epoch": 0.68, + "grad_norm": 0.5752406120300293, + "learning_rate": 0.00012261386663676722, + "loss": 3.29, + "step": 5313 + }, + { + "epoch": 0.68, + "grad_norm": 0.5899255871772766, + "learning_rate": 0.00012252468775631012, + "loss": 3.264, + "step": 5314 + }, + { + "epoch": 0.68, + "grad_norm": 0.6782875657081604, + "learning_rate": 0.00012243553079033703, + "loss": 3.3667, + "step": 5315 + }, + { + "epoch": 0.68, + "grad_norm": 0.6177646517753601, + "learning_rate": 0.00012234639575417497, + "loss": 3.3603, + "step": 5316 + }, + { + "epoch": 0.68, + "grad_norm": 0.6094318628311157, + "learning_rate": 0.00012225728266314746, + "loss": 3.3114, + "step": 5317 + }, + { + "epoch": 0.68, + "grad_norm": 0.6286597847938538, + "learning_rate": 0.00012216819153257398, + "loss": 3.2324, + "step": 5318 + }, + { + "epoch": 0.68, + "grad_norm": 0.6156710982322693, + "learning_rate": 0.00012207912237777022, + "loss": 3.4317, + "step": 5319 + }, + { + "epoch": 0.68, + "grad_norm": 0.6111778020858765, + "learning_rate": 0.0001219900752140484, + "loss": 3.205, + "step": 5320 + }, + { + "epoch": 0.68, + "grad_norm": 0.6007053256034851, + "learning_rate": 0.0001219010500567167, + "loss": 3.3469, + "step": 5321 + }, + { + "epoch": 0.68, + "grad_norm": 0.5899903178215027, + "learning_rate": 0.00012181204692107952, + "loss": 3.3366, + "step": 5322 + }, + { + "epoch": 0.68, + "grad_norm": 0.636367678642273, + "learning_rate": 0.00012172306582243756, + "loss": 3.3244, + "step": 5323 + }, + { + "epoch": 0.68, + "grad_norm": 0.6870309710502625, + "learning_rate": 0.0001216341067760876, + "loss": 3.2732, + "step": 5324 + }, + { + "epoch": 0.68, + "grad_norm": 0.6357422471046448, + "learning_rate": 0.00012154516979732295, + "loss": 3.1816, + "step": 5325 + }, + { + "epoch": 0.68, + "grad_norm": 0.6212683916091919, + "learning_rate": 0.00012145625490143275, + "loss": 3.314, + "step": 5326 + }, + { + "epoch": 0.68, + "grad_norm": 0.5981928110122681, + "learning_rate": 0.00012136736210370255, + "loss": 3.1853, + "step": 5327 + }, + { + "epoch": 0.68, + "grad_norm": 0.636627733707428, + "learning_rate": 0.00012127849141941396, + "loss": 3.3097, + "step": 5328 + }, + { + "epoch": 0.68, + "grad_norm": 0.6140633821487427, + "learning_rate": 0.00012118964286384506, + "loss": 3.3549, + "step": 5329 + }, + { + "epoch": 0.68, + "grad_norm": 0.5917346477508545, + "learning_rate": 0.00012110081645226986, + "loss": 3.2242, + "step": 5330 + }, + { + "epoch": 0.68, + "grad_norm": 0.6547762155532837, + "learning_rate": 0.00012101201219995853, + "loss": 3.232, + "step": 5331 + }, + { + "epoch": 0.68, + "grad_norm": 0.6183657050132751, + "learning_rate": 0.00012092323012217774, + "loss": 3.3344, + "step": 5332 + }, + { + "epoch": 0.68, + "grad_norm": 0.6329478621482849, + "learning_rate": 0.00012083447023419009, + "loss": 3.3021, + "step": 5333 + }, + { + "epoch": 0.68, + "grad_norm": 0.6323261260986328, + "learning_rate": 0.00012074573255125442, + "loss": 3.2491, + "step": 5334 + }, + { + "epoch": 0.68, + "grad_norm": 0.6243185997009277, + "learning_rate": 0.00012065701708862578, + "loss": 3.3159, + "step": 5335 + }, + { + "epoch": 0.68, + "grad_norm": 0.6621860861778259, + "learning_rate": 0.00012056832386155536, + "loss": 3.2703, + "step": 5336 + }, + { + "epoch": 0.68, + "grad_norm": 0.6325708031654358, + "learning_rate": 0.0001204796528852905, + "loss": 3.2953, + "step": 5337 + }, + { + "epoch": 0.68, + "grad_norm": 0.6371250152587891, + "learning_rate": 0.0001203910041750749, + "loss": 3.4275, + "step": 5338 + }, + { + "epoch": 0.68, + "grad_norm": 0.6301656365394592, + "learning_rate": 0.00012030237774614816, + "loss": 3.4774, + "step": 5339 + }, + { + "epoch": 0.68, + "grad_norm": 0.6328499913215637, + "learning_rate": 0.00012021377361374636, + "loss": 3.2954, + "step": 5340 + }, + { + "epoch": 0.68, + "grad_norm": 0.6077415347099304, + "learning_rate": 0.0001201251917931015, + "loss": 3.3703, + "step": 5341 + }, + { + "epoch": 0.68, + "grad_norm": 0.6206401586532593, + "learning_rate": 0.00012003663229944178, + "loss": 3.3644, + "step": 5342 + }, + { + "epoch": 0.68, + "grad_norm": 0.6526326537132263, + "learning_rate": 0.00011994809514799166, + "loss": 3.3512, + "step": 5343 + }, + { + "epoch": 0.68, + "grad_norm": 0.6655504107475281, + "learning_rate": 0.00011985958035397169, + "loss": 3.2669, + "step": 5344 + }, + { + "epoch": 0.68, + "grad_norm": 0.5940502882003784, + "learning_rate": 0.00011977108793259845, + "loss": 3.2079, + "step": 5345 + }, + { + "epoch": 0.68, + "grad_norm": 0.5870606899261475, + "learning_rate": 0.00011968261789908502, + "loss": 3.2211, + "step": 5346 + }, + { + "epoch": 0.68, + "grad_norm": 0.6249194145202637, + "learning_rate": 0.00011959417026864025, + "loss": 3.3497, + "step": 5347 + }, + { + "epoch": 0.68, + "grad_norm": 0.6449898481369019, + "learning_rate": 0.00011950574505646952, + "loss": 3.4259, + "step": 5348 + }, + { + "epoch": 0.68, + "grad_norm": 0.659328281879425, + "learning_rate": 0.00011941734227777403, + "loss": 3.3543, + "step": 5349 + }, + { + "epoch": 0.68, + "grad_norm": 0.6789844632148743, + "learning_rate": 0.00011932896194775125, + "loss": 3.2992, + "step": 5350 + }, + { + "epoch": 0.68, + "grad_norm": 0.6676364541053772, + "learning_rate": 0.00011924060408159477, + "loss": 3.448, + "step": 5351 + }, + { + "epoch": 0.69, + "grad_norm": 0.6708555221557617, + "learning_rate": 0.00011915226869449425, + "loss": 3.3006, + "step": 5352 + }, + { + "epoch": 0.69, + "grad_norm": 0.6383585929870605, + "learning_rate": 0.00011906395580163576, + "loss": 3.2522, + "step": 5353 + }, + { + "epoch": 0.69, + "grad_norm": 0.6652035713195801, + "learning_rate": 0.0001189756654182012, + "loss": 3.3669, + "step": 5354 + }, + { + "epoch": 0.69, + "grad_norm": 0.6032835841178894, + "learning_rate": 0.00011888739755936873, + "loss": 3.4457, + "step": 5355 + }, + { + "epoch": 0.69, + "grad_norm": 0.6228095293045044, + "learning_rate": 0.00011879915224031249, + "loss": 3.4268, + "step": 5356 + }, + { + "epoch": 0.69, + "grad_norm": 0.6080935001373291, + "learning_rate": 0.0001187109294762031, + "loss": 3.2699, + "step": 5357 + }, + { + "epoch": 0.69, + "grad_norm": 0.6331431865692139, + "learning_rate": 0.00011862272928220696, + "loss": 3.3719, + "step": 5358 + }, + { + "epoch": 0.69, + "grad_norm": 0.6054500341415405, + "learning_rate": 0.00011853455167348673, + "loss": 3.1855, + "step": 5359 + }, + { + "epoch": 0.69, + "grad_norm": 0.6649967432022095, + "learning_rate": 0.00011844639666520105, + "loss": 3.3242, + "step": 5360 + }, + { + "epoch": 0.69, + "grad_norm": 0.6024637818336487, + "learning_rate": 0.00011835826427250496, + "loss": 3.3709, + "step": 5361 + }, + { + "epoch": 0.69, + "grad_norm": 0.654136061668396, + "learning_rate": 0.00011827015451054937, + "loss": 3.2647, + "step": 5362 + }, + { + "epoch": 0.69, + "grad_norm": 0.6690921783447266, + "learning_rate": 0.00011818206739448137, + "loss": 3.3242, + "step": 5363 + }, + { + "epoch": 0.69, + "grad_norm": 0.6012855768203735, + "learning_rate": 0.00011809400293944414, + "loss": 3.237, + "step": 5364 + }, + { + "epoch": 0.69, + "grad_norm": 0.6119986772537231, + "learning_rate": 0.00011800596116057688, + "loss": 3.2263, + "step": 5365 + }, + { + "epoch": 0.69, + "grad_norm": 0.6060097217559814, + "learning_rate": 0.00011791794207301524, + "loss": 3.375, + "step": 5366 + }, + { + "epoch": 0.69, + "grad_norm": 0.6539217233657837, + "learning_rate": 0.00011782994569189045, + "loss": 3.2729, + "step": 5367 + }, + { + "epoch": 0.69, + "grad_norm": 0.6135516166687012, + "learning_rate": 0.00011774197203233037, + "loss": 3.3101, + "step": 5368 + }, + { + "epoch": 0.69, + "grad_norm": 0.6137378811836243, + "learning_rate": 0.00011765402110945852, + "loss": 3.2379, + "step": 5369 + }, + { + "epoch": 0.69, + "grad_norm": 0.6105403900146484, + "learning_rate": 0.00011756609293839477, + "loss": 3.2976, + "step": 5370 + }, + { + "epoch": 0.69, + "grad_norm": 0.6659379005432129, + "learning_rate": 0.00011747818753425493, + "loss": 3.2012, + "step": 5371 + }, + { + "epoch": 0.69, + "grad_norm": 0.6903918385505676, + "learning_rate": 0.00011739030491215097, + "loss": 3.294, + "step": 5372 + }, + { + "epoch": 0.69, + "grad_norm": 0.6265830397605896, + "learning_rate": 0.00011730244508719087, + "loss": 3.3366, + "step": 5373 + }, + { + "epoch": 0.69, + "grad_norm": 0.5908456444740295, + "learning_rate": 0.00011721460807447889, + "loss": 3.2259, + "step": 5374 + }, + { + "epoch": 0.69, + "grad_norm": 0.6120139956474304, + "learning_rate": 0.0001171267938891151, + "loss": 3.2589, + "step": 5375 + }, + { + "epoch": 0.69, + "grad_norm": 0.5990294218063354, + "learning_rate": 0.0001170390025461959, + "loss": 3.299, + "step": 5376 + }, + { + "epoch": 0.69, + "grad_norm": 0.6606252193450928, + "learning_rate": 0.00011695123406081359, + "loss": 3.3959, + "step": 5377 + }, + { + "epoch": 0.69, + "grad_norm": 0.689970850944519, + "learning_rate": 0.00011686348844805659, + "loss": 3.3365, + "step": 5378 + }, + { + "epoch": 0.69, + "grad_norm": 0.7213213443756104, + "learning_rate": 0.00011677576572300935, + "loss": 3.3814, + "step": 5379 + }, + { + "epoch": 0.69, + "grad_norm": 0.6567277908325195, + "learning_rate": 0.00011668806590075248, + "loss": 3.3028, + "step": 5380 + }, + { + "epoch": 0.69, + "grad_norm": 0.636063277721405, + "learning_rate": 0.00011660038899636247, + "loss": 3.2835, + "step": 5381 + }, + { + "epoch": 0.69, + "grad_norm": 0.6282566785812378, + "learning_rate": 0.00011651273502491216, + "loss": 3.3486, + "step": 5382 + }, + { + "epoch": 0.69, + "grad_norm": 0.6272147297859192, + "learning_rate": 0.00011642510400147025, + "loss": 3.257, + "step": 5383 + }, + { + "epoch": 0.69, + "grad_norm": 0.5836976766586304, + "learning_rate": 0.00011633749594110139, + "loss": 3.2378, + "step": 5384 + }, + { + "epoch": 0.69, + "grad_norm": 0.6571261286735535, + "learning_rate": 0.00011624991085886661, + "loss": 3.288, + "step": 5385 + }, + { + "epoch": 0.69, + "grad_norm": 0.6557903289794922, + "learning_rate": 0.0001161623487698227, + "loss": 3.2989, + "step": 5386 + }, + { + "epoch": 0.69, + "grad_norm": 0.6257616877555847, + "learning_rate": 0.00011607480968902264, + "loss": 3.2491, + "step": 5387 + }, + { + "epoch": 0.69, + "grad_norm": 0.6553791761398315, + "learning_rate": 0.0001159872936315153, + "loss": 3.3515, + "step": 5388 + }, + { + "epoch": 0.69, + "grad_norm": 0.631804883480072, + "learning_rate": 0.00011589980061234587, + "loss": 3.1635, + "step": 5389 + }, + { + "epoch": 0.69, + "grad_norm": 0.6316229701042175, + "learning_rate": 0.00011581233064655536, + "loss": 3.347, + "step": 5390 + }, + { + "epoch": 0.69, + "grad_norm": 0.6077477335929871, + "learning_rate": 0.00011572488374918083, + "loss": 3.2972, + "step": 5391 + }, + { + "epoch": 0.69, + "grad_norm": 0.6585051417350769, + "learning_rate": 0.00011563745993525543, + "loss": 3.3662, + "step": 5392 + }, + { + "epoch": 0.69, + "grad_norm": 0.6453659534454346, + "learning_rate": 0.00011555005921980825, + "loss": 3.3, + "step": 5393 + }, + { + "epoch": 0.69, + "grad_norm": 0.6500974297523499, + "learning_rate": 0.00011546268161786466, + "loss": 3.2243, + "step": 5394 + }, + { + "epoch": 0.69, + "grad_norm": 0.6436439156532288, + "learning_rate": 0.0001153753271444458, + "loss": 3.3304, + "step": 5395 + }, + { + "epoch": 0.69, + "grad_norm": 0.6569267511367798, + "learning_rate": 0.00011528799581456878, + "loss": 3.2496, + "step": 5396 + }, + { + "epoch": 0.69, + "grad_norm": 0.6070961356163025, + "learning_rate": 0.00011520068764324712, + "loss": 3.4124, + "step": 5397 + }, + { + "epoch": 0.69, + "grad_norm": 0.597518801689148, + "learning_rate": 0.00011511340264548997, + "loss": 3.2643, + "step": 5398 + }, + { + "epoch": 0.69, + "grad_norm": 0.6514595150947571, + "learning_rate": 0.00011502614083630264, + "loss": 3.4758, + "step": 5399 + }, + { + "epoch": 0.69, + "grad_norm": 0.615273654460907, + "learning_rate": 0.00011493890223068646, + "loss": 3.4179, + "step": 5400 + }, + { + "epoch": 0.69, + "grad_norm": 0.6327627301216125, + "learning_rate": 0.00011485168684363876, + "loss": 3.3016, + "step": 5401 + }, + { + "epoch": 0.69, + "grad_norm": 0.6123901009559631, + "learning_rate": 0.00011476449469015276, + "loss": 3.3365, + "step": 5402 + }, + { + "epoch": 0.69, + "grad_norm": 0.6408461332321167, + "learning_rate": 0.0001146773257852179, + "loss": 3.2455, + "step": 5403 + }, + { + "epoch": 0.69, + "grad_norm": 0.606148362159729, + "learning_rate": 0.00011459018014381963, + "loss": 3.3178, + "step": 5404 + }, + { + "epoch": 0.69, + "grad_norm": 0.6506169438362122, + "learning_rate": 0.0001145030577809392, + "loss": 3.4223, + "step": 5405 + }, + { + "epoch": 0.69, + "grad_norm": 0.6217191219329834, + "learning_rate": 0.00011441595871155397, + "loss": 3.3234, + "step": 5406 + }, + { + "epoch": 0.69, + "grad_norm": 0.6534956693649292, + "learning_rate": 0.00011432888295063723, + "loss": 3.2476, + "step": 5407 + }, + { + "epoch": 0.69, + "grad_norm": 0.6531282663345337, + "learning_rate": 0.00011424183051315837, + "loss": 3.2446, + "step": 5408 + }, + { + "epoch": 0.69, + "grad_norm": 0.6264949440956116, + "learning_rate": 0.00011415480141408258, + "loss": 3.3105, + "step": 5409 + }, + { + "epoch": 0.69, + "grad_norm": 0.6574244499206543, + "learning_rate": 0.00011406779566837139, + "loss": 3.3587, + "step": 5410 + }, + { + "epoch": 0.69, + "grad_norm": 0.6147937178611755, + "learning_rate": 0.00011398081329098198, + "loss": 3.4396, + "step": 5411 + }, + { + "epoch": 0.69, + "grad_norm": 0.6084353923797607, + "learning_rate": 0.00011389385429686752, + "loss": 3.2026, + "step": 5412 + }, + { + "epoch": 0.69, + "grad_norm": 0.617586612701416, + "learning_rate": 0.0001138069187009775, + "loss": 3.341, + "step": 5413 + }, + { + "epoch": 0.69, + "grad_norm": 0.6375697255134583, + "learning_rate": 0.00011372000651825703, + "loss": 3.3281, + "step": 5414 + }, + { + "epoch": 0.69, + "grad_norm": 0.6307726502418518, + "learning_rate": 0.00011363311776364735, + "loss": 3.3064, + "step": 5415 + }, + { + "epoch": 0.69, + "grad_norm": 0.5901523232460022, + "learning_rate": 0.0001135462524520856, + "loss": 3.2324, + "step": 5416 + }, + { + "epoch": 0.69, + "grad_norm": 0.6218355298042297, + "learning_rate": 0.0001134594105985049, + "loss": 3.2884, + "step": 5417 + }, + { + "epoch": 0.69, + "grad_norm": 0.6189782023429871, + "learning_rate": 0.00011337259221783453, + "loss": 3.3595, + "step": 5418 + }, + { + "epoch": 0.69, + "grad_norm": 0.6642634272575378, + "learning_rate": 0.00011328579732499944, + "loss": 3.3093, + "step": 5419 + }, + { + "epoch": 0.69, + "grad_norm": 0.6610681414604187, + "learning_rate": 0.00011319902593492074, + "loss": 3.3262, + "step": 5420 + }, + { + "epoch": 0.69, + "grad_norm": 0.6154919266700745, + "learning_rate": 0.00011311227806251531, + "loss": 3.2144, + "step": 5421 + }, + { + "epoch": 0.69, + "grad_norm": 0.6145983338356018, + "learning_rate": 0.00011302555372269633, + "loss": 3.3124, + "step": 5422 + }, + { + "epoch": 0.69, + "grad_norm": 0.6004980206489563, + "learning_rate": 0.00011293885293037259, + "loss": 3.4122, + "step": 5423 + }, + { + "epoch": 0.69, + "grad_norm": 0.6144428253173828, + "learning_rate": 0.0001128521757004489, + "loss": 3.1873, + "step": 5424 + }, + { + "epoch": 0.69, + "grad_norm": 0.652301549911499, + "learning_rate": 0.00011276552204782625, + "loss": 3.4181, + "step": 5425 + }, + { + "epoch": 0.69, + "grad_norm": 0.5871480107307434, + "learning_rate": 0.00011267889198740131, + "loss": 3.2486, + "step": 5426 + }, + { + "epoch": 0.69, + "grad_norm": 0.5704430341720581, + "learning_rate": 0.0001125922855340668, + "loss": 3.1964, + "step": 5427 + }, + { + "epoch": 0.69, + "grad_norm": 0.6235558390617371, + "learning_rate": 0.0001125057027027114, + "loss": 3.0768, + "step": 5428 + }, + { + "epoch": 0.69, + "grad_norm": 0.6705940365791321, + "learning_rate": 0.00011241914350821967, + "loss": 3.4425, + "step": 5429 + }, + { + "epoch": 0.7, + "grad_norm": 0.6448222994804382, + "learning_rate": 0.00011233260796547201, + "loss": 3.3121, + "step": 5430 + }, + { + "epoch": 0.7, + "grad_norm": 0.6028103232383728, + "learning_rate": 0.00011224609608934505, + "loss": 3.3336, + "step": 5431 + }, + { + "epoch": 0.7, + "grad_norm": 0.6391410231590271, + "learning_rate": 0.00011215960789471125, + "loss": 3.2327, + "step": 5432 + }, + { + "epoch": 0.7, + "grad_norm": 0.673503577709198, + "learning_rate": 0.00011207314339643884, + "loss": 3.4889, + "step": 5433 + }, + { + "epoch": 0.7, + "grad_norm": 0.5765765309333801, + "learning_rate": 0.00011198670260939206, + "loss": 3.2223, + "step": 5434 + }, + { + "epoch": 0.7, + "grad_norm": 0.6331235766410828, + "learning_rate": 0.00011190028554843107, + "loss": 3.2458, + "step": 5435 + }, + { + "epoch": 0.7, + "grad_norm": 0.6514334082603455, + "learning_rate": 0.00011181389222841201, + "loss": 3.3787, + "step": 5436 + }, + { + "epoch": 0.7, + "grad_norm": 0.6331505179405212, + "learning_rate": 0.00011172752266418684, + "loss": 3.3041, + "step": 5437 + }, + { + "epoch": 0.7, + "grad_norm": 0.6335639953613281, + "learning_rate": 0.00011164117687060346, + "loss": 3.3845, + "step": 5438 + }, + { + "epoch": 0.7, + "grad_norm": 0.6040472388267517, + "learning_rate": 0.00011155485486250574, + "loss": 3.2573, + "step": 5439 + }, + { + "epoch": 0.7, + "grad_norm": 0.625055193901062, + "learning_rate": 0.00011146855665473355, + "loss": 3.3227, + "step": 5440 + }, + { + "epoch": 0.7, + "grad_norm": 0.6047588586807251, + "learning_rate": 0.00011138228226212249, + "loss": 3.3813, + "step": 5441 + }, + { + "epoch": 0.7, + "grad_norm": 0.6100949645042419, + "learning_rate": 0.00011129603169950409, + "loss": 3.4167, + "step": 5442 + }, + { + "epoch": 0.7, + "grad_norm": 0.6181287169456482, + "learning_rate": 0.00011120980498170583, + "loss": 3.344, + "step": 5443 + }, + { + "epoch": 0.7, + "grad_norm": 0.6023228764533997, + "learning_rate": 0.0001111236021235511, + "loss": 3.3657, + "step": 5444 + }, + { + "epoch": 0.7, + "grad_norm": 0.6248236894607544, + "learning_rate": 0.00011103742313985906, + "loss": 3.295, + "step": 5445 + }, + { + "epoch": 0.7, + "grad_norm": 0.5963073372840881, + "learning_rate": 0.00011095126804544505, + "loss": 3.1363, + "step": 5446 + }, + { + "epoch": 0.7, + "grad_norm": 0.6147260665893555, + "learning_rate": 0.0001108651368551201, + "loss": 3.4925, + "step": 5447 + }, + { + "epoch": 0.7, + "grad_norm": 0.630146861076355, + "learning_rate": 0.00011077902958369106, + "loss": 3.2746, + "step": 5448 + }, + { + "epoch": 0.7, + "grad_norm": 0.6169753074645996, + "learning_rate": 0.00011069294624596077, + "loss": 3.1804, + "step": 5449 + }, + { + "epoch": 0.7, + "grad_norm": 0.6223546266555786, + "learning_rate": 0.0001106068868567281, + "loss": 3.2929, + "step": 5450 + }, + { + "epoch": 0.7, + "grad_norm": 0.6127028465270996, + "learning_rate": 0.00011052085143078752, + "loss": 3.2841, + "step": 5451 + }, + { + "epoch": 0.7, + "grad_norm": 0.6518468260765076, + "learning_rate": 0.00011043483998292949, + "loss": 3.2391, + "step": 5452 + }, + { + "epoch": 0.7, + "grad_norm": 0.623802900314331, + "learning_rate": 0.00011034885252794056, + "loss": 3.2241, + "step": 5453 + }, + { + "epoch": 0.7, + "grad_norm": 0.5940724611282349, + "learning_rate": 0.00011026288908060284, + "loss": 3.1555, + "step": 5454 + }, + { + "epoch": 0.7, + "grad_norm": 0.6354788541793823, + "learning_rate": 0.00011017694965569447, + "loss": 3.3109, + "step": 5455 + }, + { + "epoch": 0.7, + "grad_norm": 0.5967780351638794, + "learning_rate": 0.00011009103426798939, + "loss": 3.3411, + "step": 5456 + }, + { + "epoch": 0.7, + "grad_norm": 0.6164358258247375, + "learning_rate": 0.0001100051429322575, + "loss": 3.33, + "step": 5457 + }, + { + "epoch": 0.7, + "grad_norm": 0.6327795386314392, + "learning_rate": 0.00010991927566326443, + "loss": 3.2257, + "step": 5458 + }, + { + "epoch": 0.7, + "grad_norm": 0.637921154499054, + "learning_rate": 0.00010983343247577187, + "loss": 3.2734, + "step": 5459 + }, + { + "epoch": 0.7, + "grad_norm": 0.61724853515625, + "learning_rate": 0.00010974761338453718, + "loss": 3.1948, + "step": 5460 + }, + { + "epoch": 0.7, + "grad_norm": 0.6523029208183289, + "learning_rate": 0.00010966181840431375, + "loss": 3.3381, + "step": 5461 + }, + { + "epoch": 0.7, + "grad_norm": 0.6485067009925842, + "learning_rate": 0.0001095760475498507, + "loss": 3.356, + "step": 5462 + }, + { + "epoch": 0.7, + "grad_norm": 0.6275986433029175, + "learning_rate": 0.000109490300835893, + "loss": 3.297, + "step": 5463 + }, + { + "epoch": 0.7, + "grad_norm": 0.6262179017066956, + "learning_rate": 0.00010940457827718151, + "loss": 3.2337, + "step": 5464 + }, + { + "epoch": 0.7, + "grad_norm": 0.6072761416435242, + "learning_rate": 0.00010931887988845294, + "loss": 3.3163, + "step": 5465 + }, + { + "epoch": 0.7, + "grad_norm": 0.6281195878982544, + "learning_rate": 0.00010923320568443972, + "loss": 3.3204, + "step": 5466 + }, + { + "epoch": 0.7, + "grad_norm": 0.6109669208526611, + "learning_rate": 0.0001091475556798704, + "loss": 3.3305, + "step": 5467 + }, + { + "epoch": 0.7, + "grad_norm": 0.660270094871521, + "learning_rate": 0.00010906192988946922, + "loss": 3.2994, + "step": 5468 + }, + { + "epoch": 0.7, + "grad_norm": 0.6059504151344299, + "learning_rate": 0.00010897632832795618, + "loss": 3.2588, + "step": 5469 + }, + { + "epoch": 0.7, + "grad_norm": 0.6001191735267639, + "learning_rate": 0.0001088907510100472, + "loss": 3.3192, + "step": 5470 + }, + { + "epoch": 0.7, + "grad_norm": 0.5829064846038818, + "learning_rate": 0.00010880519795045399, + "loss": 3.2155, + "step": 5471 + }, + { + "epoch": 0.7, + "grad_norm": 0.5633309483528137, + "learning_rate": 0.00010871966916388415, + "loss": 3.308, + "step": 5472 + }, + { + "epoch": 0.7, + "grad_norm": 0.5973705649375916, + "learning_rate": 0.00010863416466504092, + "loss": 3.3308, + "step": 5473 + }, + { + "epoch": 0.7, + "grad_norm": 0.6159566640853882, + "learning_rate": 0.00010854868446862373, + "loss": 3.4214, + "step": 5474 + }, + { + "epoch": 0.7, + "grad_norm": 0.654606282711029, + "learning_rate": 0.00010846322858932756, + "loss": 3.2694, + "step": 5475 + }, + { + "epoch": 0.7, + "grad_norm": 0.6203795671463013, + "learning_rate": 0.00010837779704184311, + "loss": 3.3132, + "step": 5476 + }, + { + "epoch": 0.7, + "grad_norm": 0.6621377468109131, + "learning_rate": 0.00010829238984085727, + "loss": 3.3296, + "step": 5477 + }, + { + "epoch": 0.7, + "grad_norm": 0.6005878448486328, + "learning_rate": 0.00010820700700105244, + "loss": 3.2034, + "step": 5478 + }, + { + "epoch": 0.7, + "grad_norm": 0.6048322916030884, + "learning_rate": 0.00010812164853710687, + "loss": 3.2214, + "step": 5479 + }, + { + "epoch": 0.7, + "grad_norm": 0.6573078632354736, + "learning_rate": 0.00010803631446369477, + "loss": 3.3038, + "step": 5480 + }, + { + "epoch": 0.7, + "grad_norm": 0.6411568522453308, + "learning_rate": 0.00010795100479548586, + "loss": 3.3197, + "step": 5481 + }, + { + "epoch": 0.7, + "grad_norm": 0.6481874585151672, + "learning_rate": 0.00010786571954714613, + "loss": 3.2614, + "step": 5482 + }, + { + "epoch": 0.7, + "grad_norm": 0.6276066303253174, + "learning_rate": 0.00010778045873333695, + "loss": 3.2463, + "step": 5483 + }, + { + "epoch": 0.7, + "grad_norm": 0.6247035264968872, + "learning_rate": 0.00010769522236871568, + "loss": 3.2256, + "step": 5484 + }, + { + "epoch": 0.7, + "grad_norm": 0.6492263078689575, + "learning_rate": 0.00010761001046793539, + "loss": 3.3523, + "step": 5485 + }, + { + "epoch": 0.7, + "grad_norm": 0.6784762144088745, + "learning_rate": 0.00010752482304564495, + "loss": 3.3337, + "step": 5486 + }, + { + "epoch": 0.7, + "grad_norm": 0.6118037104606628, + "learning_rate": 0.00010743966011648926, + "loss": 3.2237, + "step": 5487 + }, + { + "epoch": 0.7, + "grad_norm": 0.637303352355957, + "learning_rate": 0.0001073545216951086, + "loss": 3.2963, + "step": 5488 + }, + { + "epoch": 0.7, + "grad_norm": 0.6309385299682617, + "learning_rate": 0.00010726940779613942, + "loss": 3.2413, + "step": 5489 + }, + { + "epoch": 0.7, + "grad_norm": 0.5993750691413879, + "learning_rate": 0.0001071843184342137, + "loss": 3.281, + "step": 5490 + }, + { + "epoch": 0.7, + "grad_norm": 0.6420213580131531, + "learning_rate": 0.00010709925362395933, + "loss": 3.1684, + "step": 5491 + }, + { + "epoch": 0.7, + "grad_norm": 0.6415120959281921, + "learning_rate": 0.0001070142133799999, + "loss": 3.3344, + "step": 5492 + }, + { + "epoch": 0.7, + "grad_norm": 0.6192628145217896, + "learning_rate": 0.00010692919771695483, + "loss": 3.2524, + "step": 5493 + }, + { + "epoch": 0.7, + "grad_norm": 0.6713070273399353, + "learning_rate": 0.0001068442066494392, + "loss": 3.3718, + "step": 5494 + }, + { + "epoch": 0.7, + "grad_norm": 0.6320802569389343, + "learning_rate": 0.00010675924019206415, + "loss": 3.3318, + "step": 5495 + }, + { + "epoch": 0.7, + "grad_norm": 0.6208996772766113, + "learning_rate": 0.00010667429835943617, + "loss": 3.4287, + "step": 5496 + }, + { + "epoch": 0.7, + "grad_norm": 0.601486086845398, + "learning_rate": 0.00010658938116615802, + "loss": 3.3564, + "step": 5497 + }, + { + "epoch": 0.7, + "grad_norm": 0.6085739135742188, + "learning_rate": 0.00010650448862682777, + "loss": 3.4097, + "step": 5498 + }, + { + "epoch": 0.7, + "grad_norm": 0.6288726329803467, + "learning_rate": 0.00010641962075603948, + "loss": 3.2182, + "step": 5499 + }, + { + "epoch": 0.7, + "grad_norm": 0.6331077814102173, + "learning_rate": 0.00010633477756838292, + "loss": 3.2166, + "step": 5500 + }, + { + "epoch": 0.7, + "grad_norm": 0.6663286089897156, + "learning_rate": 0.0001062499590784436, + "loss": 3.3958, + "step": 5501 + }, + { + "epoch": 0.7, + "grad_norm": 0.6179254651069641, + "learning_rate": 0.00010616516530080269, + "loss": 3.3268, + "step": 5502 + }, + { + "epoch": 0.7, + "grad_norm": 0.5864897966384888, + "learning_rate": 0.00010608039625003746, + "loss": 3.2097, + "step": 5503 + }, + { + "epoch": 0.7, + "grad_norm": 0.670877993106842, + "learning_rate": 0.00010599565194072047, + "loss": 3.3851, + "step": 5504 + }, + { + "epoch": 0.7, + "grad_norm": 0.6081317663192749, + "learning_rate": 0.00010591093238742047, + "loss": 3.2122, + "step": 5505 + }, + { + "epoch": 0.7, + "grad_norm": 0.627242922782898, + "learning_rate": 0.00010582623760470159, + "loss": 3.3746, + "step": 5506 + }, + { + "epoch": 0.7, + "grad_norm": 0.6064150333404541, + "learning_rate": 0.00010574156760712389, + "loss": 3.2696, + "step": 5507 + }, + { + "epoch": 0.71, + "grad_norm": 0.6662350296974182, + "learning_rate": 0.00010565692240924307, + "loss": 3.3341, + "step": 5508 + }, + { + "epoch": 0.71, + "grad_norm": 0.606359601020813, + "learning_rate": 0.0001055723020256106, + "loss": 3.2804, + "step": 5509 + }, + { + "epoch": 0.71, + "grad_norm": 0.5908784866333008, + "learning_rate": 0.00010548770647077385, + "loss": 3.2003, + "step": 5510 + }, + { + "epoch": 0.71, + "grad_norm": 0.5890214443206787, + "learning_rate": 0.00010540313575927568, + "loss": 3.2922, + "step": 5511 + }, + { + "epoch": 0.71, + "grad_norm": 0.6119219660758972, + "learning_rate": 0.00010531858990565477, + "loss": 3.2116, + "step": 5512 + }, + { + "epoch": 0.71, + "grad_norm": 0.6429023146629333, + "learning_rate": 0.00010523406892444549, + "loss": 3.3435, + "step": 5513 + }, + { + "epoch": 0.71, + "grad_norm": 0.6105777621269226, + "learning_rate": 0.00010514957283017809, + "loss": 3.1645, + "step": 5514 + }, + { + "epoch": 0.71, + "grad_norm": 0.5838789939880371, + "learning_rate": 0.0001050651016373784, + "loss": 3.2575, + "step": 5515 + }, + { + "epoch": 0.71, + "grad_norm": 0.5827857255935669, + "learning_rate": 0.00010498065536056794, + "loss": 3.4108, + "step": 5516 + }, + { + "epoch": 0.71, + "grad_norm": 0.6028188467025757, + "learning_rate": 0.00010489623401426396, + "loss": 3.2953, + "step": 5517 + }, + { + "epoch": 0.71, + "grad_norm": 0.616176426410675, + "learning_rate": 0.00010481183761297961, + "loss": 3.3607, + "step": 5518 + }, + { + "epoch": 0.71, + "grad_norm": 0.652996301651001, + "learning_rate": 0.00010472746617122356, + "loss": 3.2498, + "step": 5519 + }, + { + "epoch": 0.71, + "grad_norm": 0.6454377174377441, + "learning_rate": 0.00010464311970350021, + "loss": 3.087, + "step": 5520 + }, + { + "epoch": 0.71, + "grad_norm": 0.6470353603363037, + "learning_rate": 0.00010455879822430969, + "loss": 3.2994, + "step": 5521 + }, + { + "epoch": 0.71, + "grad_norm": 0.6124841570854187, + "learning_rate": 0.00010447450174814787, + "loss": 3.3696, + "step": 5522 + }, + { + "epoch": 0.71, + "grad_norm": 0.6404425501823425, + "learning_rate": 0.0001043902302895062, + "loss": 3.2451, + "step": 5523 + }, + { + "epoch": 0.71, + "grad_norm": 0.6828598976135254, + "learning_rate": 0.00010430598386287199, + "loss": 3.1555, + "step": 5524 + }, + { + "epoch": 0.71, + "grad_norm": 0.6182655692100525, + "learning_rate": 0.00010422176248272825, + "loss": 3.2486, + "step": 5525 + }, + { + "epoch": 0.71, + "grad_norm": 0.6098328828811646, + "learning_rate": 0.00010413756616355358, + "loss": 3.3218, + "step": 5526 + }, + { + "epoch": 0.71, + "grad_norm": 0.6310797333717346, + "learning_rate": 0.00010405339491982224, + "loss": 3.2505, + "step": 5527 + }, + { + "epoch": 0.71, + "grad_norm": 0.6132204532623291, + "learning_rate": 0.00010396924876600428, + "loss": 3.44, + "step": 5528 + }, + { + "epoch": 0.71, + "grad_norm": 0.6618490219116211, + "learning_rate": 0.00010388512771656539, + "loss": 3.2299, + "step": 5529 + }, + { + "epoch": 0.71, + "grad_norm": 0.6412935853004456, + "learning_rate": 0.00010380103178596686, + "loss": 3.3538, + "step": 5530 + }, + { + "epoch": 0.71, + "grad_norm": 0.6475511193275452, + "learning_rate": 0.00010371696098866596, + "loss": 3.3645, + "step": 5531 + }, + { + "epoch": 0.71, + "grad_norm": 0.6022229790687561, + "learning_rate": 0.00010363291533911523, + "loss": 3.2734, + "step": 5532 + }, + { + "epoch": 0.71, + "grad_norm": 0.6337286829948425, + "learning_rate": 0.00010354889485176328, + "loss": 3.2877, + "step": 5533 + }, + { + "epoch": 0.71, + "grad_norm": 0.6246282458305359, + "learning_rate": 0.0001034648995410541, + "loss": 3.2926, + "step": 5534 + }, + { + "epoch": 0.71, + "grad_norm": 0.6323986053466797, + "learning_rate": 0.00010338092942142746, + "loss": 3.4426, + "step": 5535 + }, + { + "epoch": 0.71, + "grad_norm": 0.6178189516067505, + "learning_rate": 0.0001032969845073188, + "loss": 3.2257, + "step": 5536 + }, + { + "epoch": 0.71, + "grad_norm": 0.653200626373291, + "learning_rate": 0.00010321306481315926, + "loss": 3.3105, + "step": 5537 + }, + { + "epoch": 0.71, + "grad_norm": 0.6622167229652405, + "learning_rate": 0.00010312917035337546, + "loss": 3.2909, + "step": 5538 + }, + { + "epoch": 0.71, + "grad_norm": 0.5952175259590149, + "learning_rate": 0.00010304530114239008, + "loss": 3.2572, + "step": 5539 + }, + { + "epoch": 0.71, + "grad_norm": 0.5932444334030151, + "learning_rate": 0.00010296145719462105, + "loss": 3.2586, + "step": 5540 + }, + { + "epoch": 0.71, + "grad_norm": 0.6411198973655701, + "learning_rate": 0.00010287763852448207, + "loss": 3.2888, + "step": 5541 + }, + { + "epoch": 0.71, + "grad_norm": 0.6064850687980652, + "learning_rate": 0.00010279384514638268, + "loss": 3.1833, + "step": 5542 + }, + { + "epoch": 0.71, + "grad_norm": 0.6550402641296387, + "learning_rate": 0.00010271007707472788, + "loss": 3.3338, + "step": 5543 + }, + { + "epoch": 0.71, + "grad_norm": 0.6096448302268982, + "learning_rate": 0.00010262633432391838, + "loss": 3.2992, + "step": 5544 + }, + { + "epoch": 0.71, + "grad_norm": 0.6269118189811707, + "learning_rate": 0.0001025426169083504, + "loss": 3.2611, + "step": 5545 + }, + { + "epoch": 0.71, + "grad_norm": 0.6356010437011719, + "learning_rate": 0.00010245892484241615, + "loss": 3.2572, + "step": 5546 + }, + { + "epoch": 0.71, + "grad_norm": 0.6577345132827759, + "learning_rate": 0.00010237525814050316, + "loss": 3.2427, + "step": 5547 + }, + { + "epoch": 0.71, + "grad_norm": 0.6274149417877197, + "learning_rate": 0.0001022916168169947, + "loss": 3.2137, + "step": 5548 + }, + { + "epoch": 0.71, + "grad_norm": 0.6737575531005859, + "learning_rate": 0.00010220800088626969, + "loss": 3.2899, + "step": 5549 + }, + { + "epoch": 0.71, + "grad_norm": 0.6428675651550293, + "learning_rate": 0.00010212441036270271, + "loss": 3.3025, + "step": 5550 + }, + { + "epoch": 0.71, + "grad_norm": 0.6240596175193787, + "learning_rate": 0.0001020408452606638, + "loss": 3.2887, + "step": 5551 + }, + { + "epoch": 0.71, + "grad_norm": 0.6059290766716003, + "learning_rate": 0.00010195730559451893, + "loss": 3.2722, + "step": 5552 + }, + { + "epoch": 0.71, + "grad_norm": 0.5900526642799377, + "learning_rate": 0.00010187379137862945, + "loss": 3.2072, + "step": 5553 + }, + { + "epoch": 0.71, + "grad_norm": 0.5984786152839661, + "learning_rate": 0.00010179030262735254, + "loss": 3.284, + "step": 5554 + }, + { + "epoch": 0.71, + "grad_norm": 0.6868179440498352, + "learning_rate": 0.00010170683935504077, + "loss": 3.3633, + "step": 5555 + }, + { + "epoch": 0.71, + "grad_norm": 0.6394917964935303, + "learning_rate": 0.00010162340157604252, + "loss": 3.2281, + "step": 5556 + }, + { + "epoch": 0.71, + "grad_norm": 0.6540482640266418, + "learning_rate": 0.00010153998930470165, + "loss": 3.1823, + "step": 5557 + }, + { + "epoch": 0.71, + "grad_norm": 0.6355370879173279, + "learning_rate": 0.00010145660255535771, + "loss": 3.4014, + "step": 5558 + }, + { + "epoch": 0.71, + "grad_norm": 0.6422974467277527, + "learning_rate": 0.00010137324134234577, + "loss": 3.3509, + "step": 5559 + }, + { + "epoch": 0.71, + "grad_norm": 0.6201578974723816, + "learning_rate": 0.00010128990567999666, + "loss": 3.3104, + "step": 5560 + }, + { + "epoch": 0.71, + "grad_norm": 0.6185092926025391, + "learning_rate": 0.00010120659558263687, + "loss": 3.3656, + "step": 5561 + }, + { + "epoch": 0.71, + "grad_norm": 0.6379492878913879, + "learning_rate": 0.00010112331106458825, + "loss": 3.3114, + "step": 5562 + }, + { + "epoch": 0.71, + "grad_norm": 0.6754159331321716, + "learning_rate": 0.00010104005214016837, + "loss": 3.3098, + "step": 5563 + }, + { + "epoch": 0.71, + "grad_norm": 0.6179805994033813, + "learning_rate": 0.00010095681882369042, + "loss": 3.4452, + "step": 5564 + }, + { + "epoch": 0.71, + "grad_norm": 0.6396198272705078, + "learning_rate": 0.00010087361112946319, + "loss": 3.3147, + "step": 5565 + }, + { + "epoch": 0.71, + "grad_norm": 0.6521846652030945, + "learning_rate": 0.00010079042907179092, + "loss": 3.2994, + "step": 5566 + }, + { + "epoch": 0.71, + "grad_norm": 0.6344322562217712, + "learning_rate": 0.0001007072726649738, + "loss": 3.3356, + "step": 5567 + }, + { + "epoch": 0.71, + "grad_norm": 0.6148968935012817, + "learning_rate": 0.00010062414192330724, + "loss": 3.3081, + "step": 5568 + }, + { + "epoch": 0.71, + "grad_norm": 0.6300048828125, + "learning_rate": 0.00010054103686108229, + "loss": 3.3595, + "step": 5569 + }, + { + "epoch": 0.71, + "grad_norm": 0.6113799214363098, + "learning_rate": 0.0001004579574925859, + "loss": 3.2506, + "step": 5570 + }, + { + "epoch": 0.71, + "grad_norm": 0.6313521265983582, + "learning_rate": 0.00010037490383210024, + "loss": 3.3954, + "step": 5571 + }, + { + "epoch": 0.71, + "grad_norm": 0.6265667080879211, + "learning_rate": 0.0001002918758939032, + "loss": 3.4398, + "step": 5572 + }, + { + "epoch": 0.71, + "grad_norm": 0.67408686876297, + "learning_rate": 0.0001002088736922683, + "loss": 3.3008, + "step": 5573 + }, + { + "epoch": 0.71, + "grad_norm": 0.6352943778038025, + "learning_rate": 0.00010012589724146443, + "loss": 3.3582, + "step": 5574 + }, + { + "epoch": 0.71, + "grad_norm": 0.5675966739654541, + "learning_rate": 0.00010004294655575639, + "loss": 3.2766, + "step": 5575 + }, + { + "epoch": 0.71, + "grad_norm": 0.695729672908783, + "learning_rate": 9.996002164940429e-05, + "loss": 3.3943, + "step": 5576 + }, + { + "epoch": 0.71, + "grad_norm": 0.5746981501579285, + "learning_rate": 9.98771225366639e-05, + "loss": 3.3425, + "step": 5577 + }, + { + "epoch": 0.71, + "grad_norm": 0.5896180272102356, + "learning_rate": 9.979424923178643e-05, + "loss": 3.2756, + "step": 5578 + }, + { + "epoch": 0.71, + "grad_norm": 0.6412181854248047, + "learning_rate": 9.971140174901891e-05, + "loss": 3.267, + "step": 5579 + }, + { + "epoch": 0.71, + "grad_norm": 0.5925517082214355, + "learning_rate": 9.962858010260376e-05, + "loss": 3.2467, + "step": 5580 + }, + { + "epoch": 0.71, + "grad_norm": 0.6303708553314209, + "learning_rate": 9.954578430677882e-05, + "loss": 3.3277, + "step": 5581 + }, + { + "epoch": 0.71, + "grad_norm": 0.6667519807815552, + "learning_rate": 9.94630143757779e-05, + "loss": 3.4194, + "step": 5582 + }, + { + "epoch": 0.71, + "grad_norm": 0.6062800288200378, + "learning_rate": 9.938027032382996e-05, + "loss": 3.3892, + "step": 5583 + }, + { + "epoch": 0.71, + "grad_norm": 0.6656795144081116, + "learning_rate": 9.92975521651597e-05, + "loss": 3.3456, + "step": 5584 + }, + { + "epoch": 0.71, + "grad_norm": 0.6574938297271729, + "learning_rate": 9.92148599139873e-05, + "loss": 3.3215, + "step": 5585 + }, + { + "epoch": 0.72, + "grad_norm": 0.6665852069854736, + "learning_rate": 9.913219358452855e-05, + "loss": 3.3117, + "step": 5586 + }, + { + "epoch": 0.72, + "grad_norm": 0.640589714050293, + "learning_rate": 9.904955319099462e-05, + "loss": 3.1973, + "step": 5587 + }, + { + "epoch": 0.72, + "grad_norm": 0.615400493144989, + "learning_rate": 9.896693874759257e-05, + "loss": 3.3768, + "step": 5588 + }, + { + "epoch": 0.72, + "grad_norm": 0.6466597318649292, + "learning_rate": 9.888435026852458e-05, + "loss": 3.2447, + "step": 5589 + }, + { + "epoch": 0.72, + "grad_norm": 0.6233015060424805, + "learning_rate": 9.880178776798876e-05, + "loss": 3.305, + "step": 5590 + }, + { + "epoch": 0.72, + "grad_norm": 0.658562183380127, + "learning_rate": 9.871925126017845e-05, + "loss": 3.1554, + "step": 5591 + }, + { + "epoch": 0.72, + "grad_norm": 0.6282123327255249, + "learning_rate": 9.863674075928267e-05, + "loss": 3.1391, + "step": 5592 + }, + { + "epoch": 0.72, + "grad_norm": 0.6806384921073914, + "learning_rate": 9.855425627948587e-05, + "loss": 3.422, + "step": 5593 + }, + { + "epoch": 0.72, + "grad_norm": 0.6451425552368164, + "learning_rate": 9.847179783496815e-05, + "loss": 3.2525, + "step": 5594 + }, + { + "epoch": 0.72, + "grad_norm": 0.6309500932693481, + "learning_rate": 9.838936543990495e-05, + "loss": 3.3627, + "step": 5595 + }, + { + "epoch": 0.72, + "grad_norm": 0.641991138458252, + "learning_rate": 9.830695910846754e-05, + "loss": 3.2728, + "step": 5596 + }, + { + "epoch": 0.72, + "grad_norm": 0.6529327630996704, + "learning_rate": 9.822457885482237e-05, + "loss": 3.306, + "step": 5597 + }, + { + "epoch": 0.72, + "grad_norm": 0.6511504650115967, + "learning_rate": 9.814222469313166e-05, + "loss": 3.2857, + "step": 5598 + }, + { + "epoch": 0.72, + "grad_norm": 0.5973052978515625, + "learning_rate": 9.805989663755308e-05, + "loss": 3.3806, + "step": 5599 + }, + { + "epoch": 0.72, + "grad_norm": 0.6324095726013184, + "learning_rate": 9.797759470223966e-05, + "loss": 3.3219, + "step": 5600 + }, + { + "epoch": 0.72, + "grad_norm": 0.6935181021690369, + "learning_rate": 9.789531890134012e-05, + "loss": 3.3186, + "step": 5601 + }, + { + "epoch": 0.72, + "grad_norm": 0.6352891325950623, + "learning_rate": 9.781306924899852e-05, + "loss": 3.3261, + "step": 5602 + }, + { + "epoch": 0.72, + "grad_norm": 0.6081901788711548, + "learning_rate": 9.773084575935471e-05, + "loss": 3.2191, + "step": 5603 + }, + { + "epoch": 0.72, + "grad_norm": 0.6001847386360168, + "learning_rate": 9.764864844654379e-05, + "loss": 3.4207, + "step": 5604 + }, + { + "epoch": 0.72, + "grad_norm": 0.5756590962409973, + "learning_rate": 9.756647732469636e-05, + "loss": 3.1644, + "step": 5605 + }, + { + "epoch": 0.72, + "grad_norm": 0.6494952440261841, + "learning_rate": 9.748433240793858e-05, + "loss": 3.3798, + "step": 5606 + }, + { + "epoch": 0.72, + "grad_norm": 0.597460150718689, + "learning_rate": 9.740221371039226e-05, + "loss": 3.2834, + "step": 5607 + }, + { + "epoch": 0.72, + "grad_norm": 0.64914470911026, + "learning_rate": 9.732012124617449e-05, + "loss": 3.3143, + "step": 5608 + }, + { + "epoch": 0.72, + "grad_norm": 0.6491999626159668, + "learning_rate": 9.723805502939786e-05, + "loss": 3.3448, + "step": 5609 + }, + { + "epoch": 0.72, + "grad_norm": 0.6296149492263794, + "learning_rate": 9.715601507417046e-05, + "loss": 3.3611, + "step": 5610 + }, + { + "epoch": 0.72, + "grad_norm": 0.5787907838821411, + "learning_rate": 9.70740013945961e-05, + "loss": 3.154, + "step": 5611 + }, + { + "epoch": 0.72, + "grad_norm": 0.6179164052009583, + "learning_rate": 9.699201400477372e-05, + "loss": 3.2012, + "step": 5612 + }, + { + "epoch": 0.72, + "grad_norm": 0.6448367834091187, + "learning_rate": 9.691005291879801e-05, + "loss": 3.2615, + "step": 5613 + }, + { + "epoch": 0.72, + "grad_norm": 0.6444175839424133, + "learning_rate": 9.682811815075895e-05, + "loss": 3.1871, + "step": 5614 + }, + { + "epoch": 0.72, + "grad_norm": 0.6346524357795715, + "learning_rate": 9.674620971474202e-05, + "loss": 3.1654, + "step": 5615 + }, + { + "epoch": 0.72, + "grad_norm": 0.6192304491996765, + "learning_rate": 9.666432762482838e-05, + "loss": 3.2435, + "step": 5616 + }, + { + "epoch": 0.72, + "grad_norm": 0.6740929484367371, + "learning_rate": 9.658247189509436e-05, + "loss": 3.2574, + "step": 5617 + }, + { + "epoch": 0.72, + "grad_norm": 0.6283291578292847, + "learning_rate": 9.650064253961208e-05, + "loss": 3.4429, + "step": 5618 + }, + { + "epoch": 0.72, + "grad_norm": 0.6806941032409668, + "learning_rate": 9.641883957244887e-05, + "loss": 3.1937, + "step": 5619 + }, + { + "epoch": 0.72, + "grad_norm": 0.6300070881843567, + "learning_rate": 9.633706300766759e-05, + "loss": 3.4148, + "step": 5620 + }, + { + "epoch": 0.72, + "grad_norm": 0.68531334400177, + "learning_rate": 9.62553128593266e-05, + "loss": 3.3577, + "step": 5621 + }, + { + "epoch": 0.72, + "grad_norm": 0.6283920407295227, + "learning_rate": 9.617358914147969e-05, + "loss": 3.3075, + "step": 5622 + }, + { + "epoch": 0.72, + "grad_norm": 0.6550459265708923, + "learning_rate": 9.609189186817604e-05, + "loss": 3.2821, + "step": 5623 + }, + { + "epoch": 0.72, + "grad_norm": 0.596026599407196, + "learning_rate": 9.60102210534605e-05, + "loss": 3.4144, + "step": 5624 + }, + { + "epoch": 0.72, + "grad_norm": 0.6399941444396973, + "learning_rate": 9.59285767113731e-05, + "loss": 3.4119, + "step": 5625 + }, + { + "epoch": 0.72, + "grad_norm": 0.6629415154457092, + "learning_rate": 9.584695885594957e-05, + "loss": 3.3527, + "step": 5626 + }, + { + "epoch": 0.72, + "grad_norm": 0.5877668261528015, + "learning_rate": 9.576536750122094e-05, + "loss": 3.2068, + "step": 5627 + }, + { + "epoch": 0.72, + "grad_norm": 0.6166104674339294, + "learning_rate": 9.568380266121366e-05, + "loss": 3.196, + "step": 5628 + }, + { + "epoch": 0.72, + "grad_norm": 0.6430399417877197, + "learning_rate": 9.56022643499497e-05, + "loss": 3.2587, + "step": 5629 + }, + { + "epoch": 0.72, + "grad_norm": 0.6009620428085327, + "learning_rate": 9.55207525814464e-05, + "loss": 3.3185, + "step": 5630 + }, + { + "epoch": 0.72, + "grad_norm": 0.5799506902694702, + "learning_rate": 9.543926736971656e-05, + "loss": 3.2342, + "step": 5631 + }, + { + "epoch": 0.72, + "grad_norm": 0.6568311452865601, + "learning_rate": 9.535780872876857e-05, + "loss": 3.3809, + "step": 5632 + }, + { + "epoch": 0.72, + "grad_norm": 0.5879119038581848, + "learning_rate": 9.5276376672606e-05, + "loss": 3.2301, + "step": 5633 + }, + { + "epoch": 0.72, + "grad_norm": 0.6337886452674866, + "learning_rate": 9.519497121522791e-05, + "loss": 3.3523, + "step": 5634 + }, + { + "epoch": 0.72, + "grad_norm": 0.6023512482643127, + "learning_rate": 9.5113592370629e-05, + "loss": 3.3258, + "step": 5635 + }, + { + "epoch": 0.72, + "grad_norm": 0.6364913582801819, + "learning_rate": 9.503224015279916e-05, + "loss": 3.2814, + "step": 5636 + }, + { + "epoch": 0.72, + "grad_norm": 0.6315597295761108, + "learning_rate": 9.49509145757238e-05, + "loss": 3.2524, + "step": 5637 + }, + { + "epoch": 0.72, + "grad_norm": 0.6659747958183289, + "learning_rate": 9.48696156533836e-05, + "loss": 3.3089, + "step": 5638 + }, + { + "epoch": 0.72, + "grad_norm": 0.6412807703018188, + "learning_rate": 9.478834339975498e-05, + "loss": 3.3407, + "step": 5639 + }, + { + "epoch": 0.72, + "grad_norm": 0.654059112071991, + "learning_rate": 9.470709782880952e-05, + "loss": 3.2988, + "step": 5640 + }, + { + "epoch": 0.72, + "grad_norm": 0.6097717881202698, + "learning_rate": 9.462587895451424e-05, + "loss": 3.269, + "step": 5641 + }, + { + "epoch": 0.72, + "grad_norm": 0.6349428296089172, + "learning_rate": 9.454468679083161e-05, + "loss": 3.3455, + "step": 5642 + }, + { + "epoch": 0.72, + "grad_norm": 0.6521540880203247, + "learning_rate": 9.446352135171943e-05, + "loss": 3.1802, + "step": 5643 + }, + { + "epoch": 0.72, + "grad_norm": 0.596436083316803, + "learning_rate": 9.438238265113116e-05, + "loss": 3.2043, + "step": 5644 + }, + { + "epoch": 0.72, + "grad_norm": 0.6569159030914307, + "learning_rate": 9.43012707030153e-05, + "loss": 3.3805, + "step": 5645 + }, + { + "epoch": 0.72, + "grad_norm": 0.6213423013687134, + "learning_rate": 9.422018552131611e-05, + "loss": 3.3731, + "step": 5646 + }, + { + "epoch": 0.72, + "grad_norm": 0.6433305740356445, + "learning_rate": 9.413912711997297e-05, + "loss": 3.391, + "step": 5647 + }, + { + "epoch": 0.72, + "grad_norm": 0.6438114643096924, + "learning_rate": 9.405809551292077e-05, + "loss": 3.1735, + "step": 5648 + }, + { + "epoch": 0.72, + "grad_norm": 0.6398864984512329, + "learning_rate": 9.39770907140898e-05, + "loss": 3.4009, + "step": 5649 + }, + { + "epoch": 0.72, + "grad_norm": 0.6665001511573792, + "learning_rate": 9.38961127374057e-05, + "loss": 3.3683, + "step": 5650 + }, + { + "epoch": 0.72, + "grad_norm": 0.6613116264343262, + "learning_rate": 9.381516159678955e-05, + "loss": 3.2784, + "step": 5651 + }, + { + "epoch": 0.72, + "grad_norm": 0.6008492708206177, + "learning_rate": 9.373423730615766e-05, + "loss": 3.4032, + "step": 5652 + }, + { + "epoch": 0.72, + "grad_norm": 0.6395289301872253, + "learning_rate": 9.365333987942199e-05, + "loss": 3.2685, + "step": 5653 + }, + { + "epoch": 0.72, + "grad_norm": 0.6400224566459656, + "learning_rate": 9.357246933048977e-05, + "loss": 3.5009, + "step": 5654 + }, + { + "epoch": 0.72, + "grad_norm": 0.6652866005897522, + "learning_rate": 9.349162567326355e-05, + "loss": 3.2542, + "step": 5655 + }, + { + "epoch": 0.72, + "grad_norm": 0.6197553873062134, + "learning_rate": 9.34108089216413e-05, + "loss": 3.3471, + "step": 5656 + }, + { + "epoch": 0.72, + "grad_norm": 0.5985637903213501, + "learning_rate": 9.333001908951633e-05, + "loss": 3.2991, + "step": 5657 + }, + { + "epoch": 0.72, + "grad_norm": 0.5842049717903137, + "learning_rate": 9.324925619077732e-05, + "loss": 3.4584, + "step": 5658 + }, + { + "epoch": 0.72, + "grad_norm": 0.59958815574646, + "learning_rate": 9.316852023930832e-05, + "loss": 3.1319, + "step": 5659 + }, + { + "epoch": 0.72, + "grad_norm": 0.5971797108650208, + "learning_rate": 9.308781124898894e-05, + "loss": 3.2662, + "step": 5660 + }, + { + "epoch": 0.72, + "grad_norm": 0.5735284686088562, + "learning_rate": 9.300712923369387e-05, + "loss": 3.2252, + "step": 5661 + }, + { + "epoch": 0.72, + "grad_norm": 0.6035478711128235, + "learning_rate": 9.29264742072932e-05, + "loss": 3.1127, + "step": 5662 + }, + { + "epoch": 0.72, + "grad_norm": 0.6161136627197266, + "learning_rate": 9.284584618365266e-05, + "loss": 3.3258, + "step": 5663 + }, + { + "epoch": 0.72, + "grad_norm": 0.684057354927063, + "learning_rate": 9.276524517663306e-05, + "loss": 3.2352, + "step": 5664 + }, + { + "epoch": 0.73, + "grad_norm": 0.620516300201416, + "learning_rate": 9.268467120009063e-05, + "loss": 3.2965, + "step": 5665 + }, + { + "epoch": 0.73, + "grad_norm": 0.6349815130233765, + "learning_rate": 9.26041242678769e-05, + "loss": 3.2535, + "step": 5666 + }, + { + "epoch": 0.73, + "grad_norm": 0.6179966926574707, + "learning_rate": 9.252360439383897e-05, + "loss": 3.2433, + "step": 5667 + }, + { + "epoch": 0.73, + "grad_norm": 0.6889963150024414, + "learning_rate": 9.244311159181906e-05, + "loss": 3.3033, + "step": 5668 + }, + { + "epoch": 0.73, + "grad_norm": 0.6488670110702515, + "learning_rate": 9.236264587565485e-05, + "loss": 3.3602, + "step": 5669 + }, + { + "epoch": 0.73, + "grad_norm": 0.6509321331977844, + "learning_rate": 9.22822072591793e-05, + "loss": 3.2645, + "step": 5670 + }, + { + "epoch": 0.73, + "grad_norm": 0.6226929426193237, + "learning_rate": 9.220179575622065e-05, + "loss": 3.4073, + "step": 5671 + }, + { + "epoch": 0.73, + "grad_norm": 0.6354256868362427, + "learning_rate": 9.212141138060273e-05, + "loss": 3.3473, + "step": 5672 + }, + { + "epoch": 0.73, + "grad_norm": 0.6060647368431091, + "learning_rate": 9.20410541461445e-05, + "loss": 3.3694, + "step": 5673 + }, + { + "epoch": 0.73, + "grad_norm": 0.6140630841255188, + "learning_rate": 9.19607240666602e-05, + "loss": 3.2368, + "step": 5674 + }, + { + "epoch": 0.73, + "grad_norm": 0.6258276104927063, + "learning_rate": 9.188042115595966e-05, + "loss": 3.3254, + "step": 5675 + }, + { + "epoch": 0.73, + "grad_norm": 0.633005678653717, + "learning_rate": 9.180014542784779e-05, + "loss": 3.3143, + "step": 5676 + }, + { + "epoch": 0.73, + "grad_norm": 0.6406413316726685, + "learning_rate": 9.171989689612495e-05, + "loss": 3.3556, + "step": 5677 + }, + { + "epoch": 0.73, + "grad_norm": 0.6403318047523499, + "learning_rate": 9.163967557458675e-05, + "loss": 3.2525, + "step": 5678 + }, + { + "epoch": 0.73, + "grad_norm": 0.5798841714859009, + "learning_rate": 9.155948147702419e-05, + "loss": 3.1693, + "step": 5679 + }, + { + "epoch": 0.73, + "grad_norm": 0.5711493492126465, + "learning_rate": 9.14793146172235e-05, + "loss": 3.3399, + "step": 5680 + }, + { + "epoch": 0.73, + "grad_norm": 0.6775694489479065, + "learning_rate": 9.139917500896635e-05, + "loss": 3.3542, + "step": 5681 + }, + { + "epoch": 0.73, + "grad_norm": 0.6326046586036682, + "learning_rate": 9.131906266602977e-05, + "loss": 3.3323, + "step": 5682 + }, + { + "epoch": 0.73, + "grad_norm": 0.6392051577568054, + "learning_rate": 9.123897760218589e-05, + "loss": 3.3526, + "step": 5683 + }, + { + "epoch": 0.73, + "grad_norm": 0.6865871548652649, + "learning_rate": 9.115891983120228e-05, + "loss": 3.3018, + "step": 5684 + }, + { + "epoch": 0.73, + "grad_norm": 0.6464802622795105, + "learning_rate": 9.107888936684181e-05, + "loss": 3.2994, + "step": 5685 + }, + { + "epoch": 0.73, + "grad_norm": 0.6245539784431458, + "learning_rate": 9.099888622286262e-05, + "loss": 3.2927, + "step": 5686 + }, + { + "epoch": 0.73, + "grad_norm": 0.6925250291824341, + "learning_rate": 9.091891041301808e-05, + "loss": 3.3068, + "step": 5687 + }, + { + "epoch": 0.73, + "grad_norm": 0.62113356590271, + "learning_rate": 9.083896195105718e-05, + "loss": 3.2647, + "step": 5688 + }, + { + "epoch": 0.73, + "grad_norm": 0.6427499055862427, + "learning_rate": 9.075904085072375e-05, + "loss": 3.3155, + "step": 5689 + }, + { + "epoch": 0.73, + "grad_norm": 0.6446628570556641, + "learning_rate": 9.06791471257574e-05, + "loss": 3.35, + "step": 5690 + }, + { + "epoch": 0.73, + "grad_norm": 0.596994161605835, + "learning_rate": 9.059928078989266e-05, + "loss": 3.3549, + "step": 5691 + }, + { + "epoch": 0.73, + "grad_norm": 0.6257395148277283, + "learning_rate": 9.051944185685948e-05, + "loss": 3.3217, + "step": 5692 + }, + { + "epoch": 0.73, + "grad_norm": 0.6585277915000916, + "learning_rate": 9.04396303403831e-05, + "loss": 3.3329, + "step": 5693 + }, + { + "epoch": 0.73, + "grad_norm": 0.6199955344200134, + "learning_rate": 9.035984625418406e-05, + "loss": 3.3027, + "step": 5694 + }, + { + "epoch": 0.73, + "grad_norm": 0.6597419381141663, + "learning_rate": 9.028008961197803e-05, + "loss": 3.284, + "step": 5695 + }, + { + "epoch": 0.73, + "grad_norm": 0.6863325834274292, + "learning_rate": 9.020036042747637e-05, + "loss": 3.3692, + "step": 5696 + }, + { + "epoch": 0.73, + "grad_norm": 0.6006118059158325, + "learning_rate": 9.012065871438527e-05, + "loss": 3.374, + "step": 5697 + }, + { + "epoch": 0.73, + "grad_norm": 0.643882691860199, + "learning_rate": 9.004098448640643e-05, + "loss": 3.2162, + "step": 5698 + }, + { + "epoch": 0.73, + "grad_norm": 0.6381259560585022, + "learning_rate": 8.996133775723666e-05, + "loss": 3.3138, + "step": 5699 + }, + { + "epoch": 0.73, + "grad_norm": 0.7250969409942627, + "learning_rate": 8.988171854056837e-05, + "loss": 3.3412, + "step": 5700 + }, + { + "epoch": 0.73, + "grad_norm": 0.5688900351524353, + "learning_rate": 8.98021268500889e-05, + "loss": 3.2126, + "step": 5701 + }, + { + "epoch": 0.73, + "grad_norm": 0.6205479502677917, + "learning_rate": 8.97225626994809e-05, + "loss": 3.376, + "step": 5702 + }, + { + "epoch": 0.73, + "grad_norm": 0.6234270334243774, + "learning_rate": 8.964302610242256e-05, + "loss": 3.2247, + "step": 5703 + }, + { + "epoch": 0.73, + "grad_norm": 0.6274899244308472, + "learning_rate": 8.956351707258705e-05, + "loss": 3.2646, + "step": 5704 + }, + { + "epoch": 0.73, + "grad_norm": 0.6678943037986755, + "learning_rate": 8.948403562364291e-05, + "loss": 3.3395, + "step": 5705 + }, + { + "epoch": 0.73, + "grad_norm": 0.6532748341560364, + "learning_rate": 8.940458176925389e-05, + "loss": 3.3363, + "step": 5706 + }, + { + "epoch": 0.73, + "grad_norm": 0.6314519643783569, + "learning_rate": 8.932515552307904e-05, + "loss": 3.2931, + "step": 5707 + }, + { + "epoch": 0.73, + "grad_norm": 0.6630345582962036, + "learning_rate": 8.92457568987726e-05, + "loss": 3.1622, + "step": 5708 + }, + { + "epoch": 0.73, + "grad_norm": 0.6650614738464355, + "learning_rate": 8.916638590998425e-05, + "loss": 3.3558, + "step": 5709 + }, + { + "epoch": 0.73, + "grad_norm": 0.6153234243392944, + "learning_rate": 8.90870425703586e-05, + "loss": 3.2526, + "step": 5710 + }, + { + "epoch": 0.73, + "grad_norm": 0.650212824344635, + "learning_rate": 8.900772689353589e-05, + "loss": 3.3059, + "step": 5711 + }, + { + "epoch": 0.73, + "grad_norm": 0.6286440491676331, + "learning_rate": 8.892843889315133e-05, + "loss": 3.3715, + "step": 5712 + }, + { + "epoch": 0.73, + "grad_norm": 0.6217995285987854, + "learning_rate": 8.88491785828354e-05, + "loss": 3.3575, + "step": 5713 + }, + { + "epoch": 0.73, + "grad_norm": 0.6089176535606384, + "learning_rate": 8.876994597621391e-05, + "loss": 3.3028, + "step": 5714 + }, + { + "epoch": 0.73, + "grad_norm": 0.631025493144989, + "learning_rate": 8.869074108690783e-05, + "loss": 3.3126, + "step": 5715 + }, + { + "epoch": 0.73, + "grad_norm": 0.6296160817146301, + "learning_rate": 8.861156392853334e-05, + "loss": 3.344, + "step": 5716 + }, + { + "epoch": 0.73, + "grad_norm": 0.6157452464103699, + "learning_rate": 8.853241451470198e-05, + "loss": 3.1725, + "step": 5717 + }, + { + "epoch": 0.73, + "grad_norm": 0.6230807304382324, + "learning_rate": 8.845329285902054e-05, + "loss": 3.2362, + "step": 5718 + }, + { + "epoch": 0.73, + "grad_norm": 0.7034242749214172, + "learning_rate": 8.837419897509086e-05, + "loss": 3.3586, + "step": 5719 + }, + { + "epoch": 0.73, + "grad_norm": 0.6778315305709839, + "learning_rate": 8.829513287651011e-05, + "loss": 3.3762, + "step": 5720 + }, + { + "epoch": 0.73, + "grad_norm": 0.6450397372245789, + "learning_rate": 8.821609457687069e-05, + "loss": 3.2485, + "step": 5721 + }, + { + "epoch": 0.73, + "grad_norm": 0.6391881108283997, + "learning_rate": 8.813708408976015e-05, + "loss": 3.484, + "step": 5722 + }, + { + "epoch": 0.73, + "grad_norm": 0.6348711848258972, + "learning_rate": 8.805810142876123e-05, + "loss": 3.2173, + "step": 5723 + }, + { + "epoch": 0.73, + "grad_norm": 0.6497623920440674, + "learning_rate": 8.797914660745218e-05, + "loss": 3.29, + "step": 5724 + }, + { + "epoch": 0.73, + "grad_norm": 0.581919252872467, + "learning_rate": 8.790021963940612e-05, + "loss": 3.3053, + "step": 5725 + }, + { + "epoch": 0.73, + "grad_norm": 0.6217665672302246, + "learning_rate": 8.782132053819145e-05, + "loss": 3.2744, + "step": 5726 + }, + { + "epoch": 0.73, + "grad_norm": 0.645945131778717, + "learning_rate": 8.774244931737197e-05, + "loss": 3.2167, + "step": 5727 + }, + { + "epoch": 0.73, + "grad_norm": 0.6536757349967957, + "learning_rate": 8.766360599050654e-05, + "loss": 3.4259, + "step": 5728 + }, + { + "epoch": 0.73, + "grad_norm": 0.6132533550262451, + "learning_rate": 8.758479057114917e-05, + "loss": 3.2033, + "step": 5729 + }, + { + "epoch": 0.73, + "grad_norm": 0.6727334260940552, + "learning_rate": 8.750600307284922e-05, + "loss": 3.3885, + "step": 5730 + }, + { + "epoch": 0.73, + "grad_norm": 0.676634669303894, + "learning_rate": 8.742724350915102e-05, + "loss": 3.4119, + "step": 5731 + }, + { + "epoch": 0.73, + "grad_norm": 0.6278587579727173, + "learning_rate": 8.734851189359447e-05, + "loss": 3.3256, + "step": 5732 + }, + { + "epoch": 0.73, + "grad_norm": 0.623261034488678, + "learning_rate": 8.726980823971434e-05, + "loss": 3.3331, + "step": 5733 + }, + { + "epoch": 0.73, + "grad_norm": 0.6241022944450378, + "learning_rate": 8.719113256104069e-05, + "loss": 3.3075, + "step": 5734 + }, + { + "epoch": 0.73, + "grad_norm": 0.593539297580719, + "learning_rate": 8.71124848710988e-05, + "loss": 3.2942, + "step": 5735 + }, + { + "epoch": 0.73, + "grad_norm": 0.6482180953025818, + "learning_rate": 8.703386518340902e-05, + "loss": 3.3901, + "step": 5736 + }, + { + "epoch": 0.73, + "grad_norm": 0.6159878969192505, + "learning_rate": 8.695527351148719e-05, + "loss": 3.431, + "step": 5737 + }, + { + "epoch": 0.73, + "grad_norm": 0.6078242659568787, + "learning_rate": 8.687670986884394e-05, + "loss": 3.3015, + "step": 5738 + }, + { + "epoch": 0.73, + "grad_norm": 0.604847252368927, + "learning_rate": 8.67981742689854e-05, + "loss": 3.3483, + "step": 5739 + }, + { + "epoch": 0.73, + "grad_norm": 0.6486718058586121, + "learning_rate": 8.671966672541273e-05, + "loss": 3.378, + "step": 5740 + }, + { + "epoch": 0.73, + "grad_norm": 0.6555278897285461, + "learning_rate": 8.664118725162226e-05, + "loss": 3.2885, + "step": 5741 + }, + { + "epoch": 0.73, + "grad_norm": 0.6854482293128967, + "learning_rate": 8.656273586110549e-05, + "loss": 3.4235, + "step": 5742 + }, + { + "epoch": 0.74, + "grad_norm": 0.6202027797698975, + "learning_rate": 8.648431256734918e-05, + "loss": 3.2464, + "step": 5743 + }, + { + "epoch": 0.74, + "grad_norm": 0.6301712393760681, + "learning_rate": 8.640591738383507e-05, + "loss": 3.3117, + "step": 5744 + }, + { + "epoch": 0.74, + "grad_norm": 0.5973697304725647, + "learning_rate": 8.632755032404041e-05, + "loss": 3.2841, + "step": 5745 + }, + { + "epoch": 0.74, + "grad_norm": 0.601158082485199, + "learning_rate": 8.624921140143722e-05, + "loss": 3.1326, + "step": 5746 + }, + { + "epoch": 0.74, + "grad_norm": 0.5854448676109314, + "learning_rate": 8.617090062949303e-05, + "loss": 3.3146, + "step": 5747 + }, + { + "epoch": 0.74, + "grad_norm": 0.6340086460113525, + "learning_rate": 8.609261802167029e-05, + "loss": 3.3485, + "step": 5748 + }, + { + "epoch": 0.74, + "grad_norm": 0.634315013885498, + "learning_rate": 8.60143635914267e-05, + "loss": 3.2879, + "step": 5749 + }, + { + "epoch": 0.74, + "grad_norm": 0.639906644821167, + "learning_rate": 8.593613735221506e-05, + "loss": 3.3795, + "step": 5750 + }, + { + "epoch": 0.74, + "grad_norm": 0.6317019462585449, + "learning_rate": 8.585793931748343e-05, + "loss": 3.2666, + "step": 5751 + }, + { + "epoch": 0.74, + "grad_norm": 0.6535657048225403, + "learning_rate": 8.577976950067484e-05, + "loss": 3.3267, + "step": 5752 + }, + { + "epoch": 0.74, + "grad_norm": 0.6275309324264526, + "learning_rate": 8.570162791522776e-05, + "loss": 3.5098, + "step": 5753 + }, + { + "epoch": 0.74, + "grad_norm": 0.6426060199737549, + "learning_rate": 8.562351457457549e-05, + "loss": 3.4356, + "step": 5754 + }, + { + "epoch": 0.74, + "grad_norm": 0.6221879124641418, + "learning_rate": 8.554542949214672e-05, + "loss": 3.2748, + "step": 5755 + }, + { + "epoch": 0.74, + "grad_norm": 0.6630797982215881, + "learning_rate": 8.546737268136518e-05, + "loss": 3.4619, + "step": 5756 + }, + { + "epoch": 0.74, + "grad_norm": 0.6118805408477783, + "learning_rate": 8.538934415564966e-05, + "loss": 3.3449, + "step": 5757 + }, + { + "epoch": 0.74, + "grad_norm": 0.6531286239624023, + "learning_rate": 8.531134392841424e-05, + "loss": 3.3845, + "step": 5758 + }, + { + "epoch": 0.74, + "grad_norm": 0.6513779759407043, + "learning_rate": 8.523337201306796e-05, + "loss": 3.3936, + "step": 5759 + }, + { + "epoch": 0.74, + "grad_norm": 0.653174638748169, + "learning_rate": 8.515542842301524e-05, + "loss": 3.1839, + "step": 5760 + }, + { + "epoch": 0.74, + "grad_norm": 0.6509855389595032, + "learning_rate": 8.507751317165541e-05, + "loss": 3.3573, + "step": 5761 + }, + { + "epoch": 0.74, + "grad_norm": 0.640022873878479, + "learning_rate": 8.499962627238302e-05, + "loss": 3.35, + "step": 5762 + }, + { + "epoch": 0.74, + "grad_norm": 0.6861554384231567, + "learning_rate": 8.492176773858765e-05, + "loss": 3.3539, + "step": 5763 + }, + { + "epoch": 0.74, + "grad_norm": 0.6354894042015076, + "learning_rate": 8.484393758365422e-05, + "loss": 3.3855, + "step": 5764 + }, + { + "epoch": 0.74, + "grad_norm": 0.679643988609314, + "learning_rate": 8.476613582096257e-05, + "loss": 3.2839, + "step": 5765 + }, + { + "epoch": 0.74, + "grad_norm": 0.6623861789703369, + "learning_rate": 8.468836246388772e-05, + "loss": 3.3924, + "step": 5766 + }, + { + "epoch": 0.74, + "grad_norm": 0.6140841841697693, + "learning_rate": 8.461061752579976e-05, + "loss": 3.3562, + "step": 5767 + }, + { + "epoch": 0.74, + "grad_norm": 0.613014280796051, + "learning_rate": 8.453290102006408e-05, + "loss": 3.2021, + "step": 5768 + }, + { + "epoch": 0.74, + "grad_norm": 0.6353635191917419, + "learning_rate": 8.445521296004099e-05, + "loss": 3.2412, + "step": 5769 + }, + { + "epoch": 0.74, + "grad_norm": 0.6085248589515686, + "learning_rate": 8.437755335908592e-05, + "loss": 3.2289, + "step": 5770 + }, + { + "epoch": 0.74, + "grad_norm": 0.6465585231781006, + "learning_rate": 8.429992223054952e-05, + "loss": 3.4146, + "step": 5771 + }, + { + "epoch": 0.74, + "grad_norm": 0.6053239703178406, + "learning_rate": 8.422231958777743e-05, + "loss": 3.3973, + "step": 5772 + }, + { + "epoch": 0.74, + "grad_norm": 0.6017369627952576, + "learning_rate": 8.414474544411038e-05, + "loss": 3.3329, + "step": 5773 + }, + { + "epoch": 0.74, + "grad_norm": 0.6545090675354004, + "learning_rate": 8.406719981288436e-05, + "loss": 3.3531, + "step": 5774 + }, + { + "epoch": 0.74, + "grad_norm": 0.6039037704467773, + "learning_rate": 8.398968270743041e-05, + "loss": 3.3622, + "step": 5775 + }, + { + "epoch": 0.74, + "grad_norm": 0.6185946464538574, + "learning_rate": 8.391219414107456e-05, + "loss": 3.3219, + "step": 5776 + }, + { + "epoch": 0.74, + "grad_norm": 0.6226698756217957, + "learning_rate": 8.383473412713802e-05, + "loss": 3.292, + "step": 5777 + }, + { + "epoch": 0.74, + "grad_norm": 0.6575097441673279, + "learning_rate": 8.375730267893703e-05, + "loss": 3.1514, + "step": 5778 + }, + { + "epoch": 0.74, + "grad_norm": 0.6401361227035522, + "learning_rate": 8.367989980978294e-05, + "loss": 3.3897, + "step": 5779 + }, + { + "epoch": 0.74, + "grad_norm": 0.6216908097267151, + "learning_rate": 8.360252553298214e-05, + "loss": 3.2323, + "step": 5780 + }, + { + "epoch": 0.74, + "grad_norm": 0.5935060381889343, + "learning_rate": 8.352517986183636e-05, + "loss": 3.3945, + "step": 5781 + }, + { + "epoch": 0.74, + "grad_norm": 0.6383286118507385, + "learning_rate": 8.344786280964197e-05, + "loss": 3.3122, + "step": 5782 + }, + { + "epoch": 0.74, + "grad_norm": 0.6526837348937988, + "learning_rate": 8.337057438969092e-05, + "loss": 3.2942, + "step": 5783 + }, + { + "epoch": 0.74, + "grad_norm": 0.6335547566413879, + "learning_rate": 8.329331461526988e-05, + "loss": 3.3683, + "step": 5784 + }, + { + "epoch": 0.74, + "grad_norm": 0.619326114654541, + "learning_rate": 8.321608349966065e-05, + "loss": 3.3295, + "step": 5785 + }, + { + "epoch": 0.74, + "grad_norm": 0.6175655722618103, + "learning_rate": 8.31388810561402e-05, + "loss": 3.1881, + "step": 5786 + }, + { + "epoch": 0.74, + "grad_norm": 0.6244990825653076, + "learning_rate": 8.306170729798054e-05, + "loss": 3.3252, + "step": 5787 + }, + { + "epoch": 0.74, + "grad_norm": 0.6253812313079834, + "learning_rate": 8.29845622384486e-05, + "loss": 3.2042, + "step": 5788 + }, + { + "epoch": 0.74, + "grad_norm": 0.6498683094978333, + "learning_rate": 8.290744589080674e-05, + "loss": 3.3758, + "step": 5789 + }, + { + "epoch": 0.74, + "grad_norm": 0.6260440945625305, + "learning_rate": 8.283035826831201e-05, + "loss": 3.2997, + "step": 5790 + }, + { + "epoch": 0.74, + "grad_norm": 0.6759835481643677, + "learning_rate": 8.275329938421658e-05, + "loss": 3.3167, + "step": 5791 + }, + { + "epoch": 0.74, + "grad_norm": 0.6252330541610718, + "learning_rate": 8.2676269251768e-05, + "loss": 3.2903, + "step": 5792 + }, + { + "epoch": 0.74, + "grad_norm": 0.6032154560089111, + "learning_rate": 8.25992678842085e-05, + "loss": 3.189, + "step": 5793 + }, + { + "epoch": 0.74, + "grad_norm": 0.6188944578170776, + "learning_rate": 8.252229529477554e-05, + "loss": 3.165, + "step": 5794 + }, + { + "epoch": 0.74, + "grad_norm": 0.630266547203064, + "learning_rate": 8.244535149670148e-05, + "loss": 3.1759, + "step": 5795 + }, + { + "epoch": 0.74, + "grad_norm": 0.6536327600479126, + "learning_rate": 8.236843650321408e-05, + "loss": 3.2775, + "step": 5796 + }, + { + "epoch": 0.74, + "grad_norm": 0.6232385635375977, + "learning_rate": 8.229155032753577e-05, + "loss": 3.2553, + "step": 5797 + }, + { + "epoch": 0.74, + "grad_norm": 0.6598358750343323, + "learning_rate": 8.221469298288419e-05, + "loss": 3.392, + "step": 5798 + }, + { + "epoch": 0.74, + "grad_norm": 0.6670900583267212, + "learning_rate": 8.213786448247205e-05, + "loss": 3.3819, + "step": 5799 + }, + { + "epoch": 0.74, + "grad_norm": 0.6062140464782715, + "learning_rate": 8.206106483950693e-05, + "loss": 3.3811, + "step": 5800 + }, + { + "epoch": 0.74, + "grad_norm": 0.6420729160308838, + "learning_rate": 8.198429406719177e-05, + "loss": 3.196, + "step": 5801 + }, + { + "epoch": 0.74, + "grad_norm": 0.6605176329612732, + "learning_rate": 8.190755217872425e-05, + "loss": 3.4285, + "step": 5802 + }, + { + "epoch": 0.74, + "grad_norm": 0.6510549783706665, + "learning_rate": 8.183083918729713e-05, + "loss": 3.4052, + "step": 5803 + }, + { + "epoch": 0.74, + "grad_norm": 0.6223390698432922, + "learning_rate": 8.175415510609843e-05, + "loss": 3.257, + "step": 5804 + }, + { + "epoch": 0.74, + "grad_norm": 0.6070486903190613, + "learning_rate": 8.167749994831092e-05, + "loss": 3.2376, + "step": 5805 + }, + { + "epoch": 0.74, + "grad_norm": 0.6169131994247437, + "learning_rate": 8.160087372711256e-05, + "loss": 3.2959, + "step": 5806 + }, + { + "epoch": 0.74, + "grad_norm": 0.6251973509788513, + "learning_rate": 8.152427645567622e-05, + "loss": 3.3106, + "step": 5807 + }, + { + "epoch": 0.74, + "grad_norm": 0.629302442073822, + "learning_rate": 8.144770814716993e-05, + "loss": 3.1936, + "step": 5808 + }, + { + "epoch": 0.74, + "grad_norm": 0.6064233183860779, + "learning_rate": 8.137116881475653e-05, + "loss": 3.3217, + "step": 5809 + }, + { + "epoch": 0.74, + "grad_norm": 0.5733712315559387, + "learning_rate": 8.129465847159414e-05, + "loss": 3.2617, + "step": 5810 + }, + { + "epoch": 0.74, + "grad_norm": 0.6017459630966187, + "learning_rate": 8.121817713083584e-05, + "loss": 3.2549, + "step": 5811 + }, + { + "epoch": 0.74, + "grad_norm": 0.5999737977981567, + "learning_rate": 8.114172480562957e-05, + "loss": 3.3636, + "step": 5812 + }, + { + "epoch": 0.74, + "grad_norm": 0.6227502226829529, + "learning_rate": 8.106530150911837e-05, + "loss": 3.2725, + "step": 5813 + }, + { + "epoch": 0.74, + "grad_norm": 0.620937705039978, + "learning_rate": 8.098890725444033e-05, + "loss": 3.2942, + "step": 5814 + }, + { + "epoch": 0.74, + "grad_norm": 0.5800301432609558, + "learning_rate": 8.091254205472846e-05, + "loss": 3.2182, + "step": 5815 + }, + { + "epoch": 0.74, + "grad_norm": 0.6282032132148743, + "learning_rate": 8.083620592311075e-05, + "loss": 3.2537, + "step": 5816 + }, + { + "epoch": 0.74, + "grad_norm": 0.6425186991691589, + "learning_rate": 8.075989887271043e-05, + "loss": 3.3287, + "step": 5817 + }, + { + "epoch": 0.74, + "grad_norm": 0.6459394097328186, + "learning_rate": 8.068362091664552e-05, + "loss": 3.2264, + "step": 5818 + }, + { + "epoch": 0.74, + "grad_norm": 0.6475614905357361, + "learning_rate": 8.060737206802896e-05, + "loss": 3.2835, + "step": 5819 + }, + { + "epoch": 0.74, + "grad_norm": 0.6001794934272766, + "learning_rate": 8.0531152339969e-05, + "loss": 3.341, + "step": 5820 + }, + { + "epoch": 0.75, + "grad_norm": 0.6368500590324402, + "learning_rate": 8.04549617455686e-05, + "loss": 3.2757, + "step": 5821 + }, + { + "epoch": 0.75, + "grad_norm": 0.6508361101150513, + "learning_rate": 8.037880029792582e-05, + "loss": 3.3815, + "step": 5822 + }, + { + "epoch": 0.75, + "grad_norm": 0.5668452978134155, + "learning_rate": 8.030266801013366e-05, + "loss": 3.3402, + "step": 5823 + }, + { + "epoch": 0.75, + "grad_norm": 0.6491958498954773, + "learning_rate": 8.022656489528013e-05, + "loss": 3.4139, + "step": 5824 + }, + { + "epoch": 0.75, + "grad_norm": 0.6883427500724792, + "learning_rate": 8.015049096644833e-05, + "loss": 3.3844, + "step": 5825 + }, + { + "epoch": 0.75, + "grad_norm": 0.6037605404853821, + "learning_rate": 8.007444623671619e-05, + "loss": 3.3302, + "step": 5826 + }, + { + "epoch": 0.75, + "grad_norm": 0.6611215472221375, + "learning_rate": 7.999843071915671e-05, + "loss": 3.2731, + "step": 5827 + }, + { + "epoch": 0.75, + "grad_norm": 0.6562877893447876, + "learning_rate": 7.992244442683771e-05, + "loss": 3.3768, + "step": 5828 + }, + { + "epoch": 0.75, + "grad_norm": 0.6370260119438171, + "learning_rate": 7.984648737282232e-05, + "loss": 3.3764, + "step": 5829 + }, + { + "epoch": 0.75, + "grad_norm": 0.6698316931724548, + "learning_rate": 7.977055957016835e-05, + "loss": 3.2406, + "step": 5830 + }, + { + "epoch": 0.75, + "grad_norm": 0.6381048560142517, + "learning_rate": 7.969466103192858e-05, + "loss": 3.245, + "step": 5831 + }, + { + "epoch": 0.75, + "grad_norm": 0.6150162816047668, + "learning_rate": 7.961879177115097e-05, + "loss": 3.1812, + "step": 5832 + }, + { + "epoch": 0.75, + "grad_norm": 0.5887827277183533, + "learning_rate": 7.954295180087831e-05, + "loss": 3.2838, + "step": 5833 + }, + { + "epoch": 0.75, + "grad_norm": 0.5896087884902954, + "learning_rate": 7.946714113414836e-05, + "loss": 3.3149, + "step": 5834 + }, + { + "epoch": 0.75, + "grad_norm": 0.6396685838699341, + "learning_rate": 7.939135978399382e-05, + "loss": 3.3465, + "step": 5835 + }, + { + "epoch": 0.75, + "grad_norm": 0.5983510613441467, + "learning_rate": 7.931560776344238e-05, + "loss": 3.3145, + "step": 5836 + }, + { + "epoch": 0.75, + "grad_norm": 0.6477048397064209, + "learning_rate": 7.923988508551663e-05, + "loss": 3.3467, + "step": 5837 + }, + { + "epoch": 0.75, + "grad_norm": 0.6282901763916016, + "learning_rate": 7.916419176323428e-05, + "loss": 3.3294, + "step": 5838 + }, + { + "epoch": 0.75, + "grad_norm": 0.5895499587059021, + "learning_rate": 7.908852780960794e-05, + "loss": 3.3558, + "step": 5839 + }, + { + "epoch": 0.75, + "grad_norm": 0.6594005823135376, + "learning_rate": 7.901289323764502e-05, + "loss": 3.2958, + "step": 5840 + }, + { + "epoch": 0.75, + "grad_norm": 0.6194105744361877, + "learning_rate": 7.893728806034803e-05, + "loss": 3.08, + "step": 5841 + }, + { + "epoch": 0.75, + "grad_norm": 0.6449106931686401, + "learning_rate": 7.886171229071434e-05, + "loss": 3.3395, + "step": 5842 + }, + { + "epoch": 0.75, + "grad_norm": 0.6128101944923401, + "learning_rate": 7.878616594173632e-05, + "loss": 3.2356, + "step": 5843 + }, + { + "epoch": 0.75, + "grad_norm": 0.6564944386482239, + "learning_rate": 7.871064902640124e-05, + "loss": 3.2198, + "step": 5844 + }, + { + "epoch": 0.75, + "grad_norm": 0.650698721408844, + "learning_rate": 7.863516155769129e-05, + "loss": 3.2793, + "step": 5845 + }, + { + "epoch": 0.75, + "grad_norm": 0.6543959379196167, + "learning_rate": 7.855970354858378e-05, + "loss": 3.246, + "step": 5846 + }, + { + "epoch": 0.75, + "grad_norm": 0.6274881958961487, + "learning_rate": 7.848427501205064e-05, + "loss": 3.2797, + "step": 5847 + }, + { + "epoch": 0.75, + "grad_norm": 0.5793599486351013, + "learning_rate": 7.840887596105909e-05, + "loss": 3.2347, + "step": 5848 + }, + { + "epoch": 0.75, + "grad_norm": 0.6473877429962158, + "learning_rate": 7.833350640857101e-05, + "loss": 3.391, + "step": 5849 + }, + { + "epoch": 0.75, + "grad_norm": 0.6663014888763428, + "learning_rate": 7.825816636754333e-05, + "loss": 3.3715, + "step": 5850 + }, + { + "epoch": 0.75, + "grad_norm": 0.6312239170074463, + "learning_rate": 7.818285585092783e-05, + "loss": 3.2068, + "step": 5851 + }, + { + "epoch": 0.75, + "grad_norm": 0.622334361076355, + "learning_rate": 7.810757487167122e-05, + "loss": 3.1758, + "step": 5852 + }, + { + "epoch": 0.75, + "grad_norm": 0.6082350015640259, + "learning_rate": 7.803232344271532e-05, + "loss": 3.3347, + "step": 5853 + }, + { + "epoch": 0.75, + "grad_norm": 0.6351848244667053, + "learning_rate": 7.795710157699662e-05, + "loss": 3.1847, + "step": 5854 + }, + { + "epoch": 0.75, + "grad_norm": 0.6521065831184387, + "learning_rate": 7.788190928744668e-05, + "loss": 3.331, + "step": 5855 + }, + { + "epoch": 0.75, + "grad_norm": 0.6365451216697693, + "learning_rate": 7.78067465869918e-05, + "loss": 3.4912, + "step": 5856 + }, + { + "epoch": 0.75, + "grad_norm": 0.6078124046325684, + "learning_rate": 7.773161348855349e-05, + "loss": 3.2794, + "step": 5857 + }, + { + "epoch": 0.75, + "grad_norm": 0.6526720523834229, + "learning_rate": 7.765651000504795e-05, + "loss": 3.2953, + "step": 5858 + }, + { + "epoch": 0.75, + "grad_norm": 0.6162887811660767, + "learning_rate": 7.758143614938621e-05, + "loss": 3.3665, + "step": 5859 + }, + { + "epoch": 0.75, + "grad_norm": 0.6105762720108032, + "learning_rate": 7.750639193447454e-05, + "loss": 3.4606, + "step": 5860 + }, + { + "epoch": 0.75, + "grad_norm": 0.5999521017074585, + "learning_rate": 7.743137737321381e-05, + "loss": 3.2395, + "step": 5861 + }, + { + "epoch": 0.75, + "grad_norm": 0.6496285200119019, + "learning_rate": 7.73563924784999e-05, + "loss": 3.2687, + "step": 5862 + }, + { + "epoch": 0.75, + "grad_norm": 0.6636385321617126, + "learning_rate": 7.728143726322359e-05, + "loss": 3.3295, + "step": 5863 + }, + { + "epoch": 0.75, + "grad_norm": 0.648780882358551, + "learning_rate": 7.720651174027051e-05, + "loss": 3.2538, + "step": 5864 + }, + { + "epoch": 0.75, + "grad_norm": 0.6651748418807983, + "learning_rate": 7.713161592252121e-05, + "loss": 3.2258, + "step": 5865 + }, + { + "epoch": 0.75, + "grad_norm": 0.6317049264907837, + "learning_rate": 7.705674982285127e-05, + "loss": 3.2107, + "step": 5866 + }, + { + "epoch": 0.75, + "grad_norm": 0.6234962344169617, + "learning_rate": 7.698191345413086e-05, + "loss": 3.2145, + "step": 5867 + }, + { + "epoch": 0.75, + "grad_norm": 0.6090578436851501, + "learning_rate": 7.690710682922541e-05, + "loss": 3.3738, + "step": 5868 + }, + { + "epoch": 0.75, + "grad_norm": 0.6581776738166809, + "learning_rate": 7.683232996099498e-05, + "loss": 3.2474, + "step": 5869 + }, + { + "epoch": 0.75, + "grad_norm": 0.6431095004081726, + "learning_rate": 7.675758286229456e-05, + "loss": 3.2965, + "step": 5870 + }, + { + "epoch": 0.75, + "grad_norm": 0.648125171661377, + "learning_rate": 7.668286554597404e-05, + "loss": 3.2822, + "step": 5871 + }, + { + "epoch": 0.75, + "grad_norm": 0.6426742672920227, + "learning_rate": 7.660817802487819e-05, + "loss": 3.3757, + "step": 5872 + }, + { + "epoch": 0.75, + "grad_norm": 0.6470495462417603, + "learning_rate": 7.65335203118466e-05, + "loss": 3.3586, + "step": 5873 + }, + { + "epoch": 0.75, + "grad_norm": 0.6357805132865906, + "learning_rate": 7.645889241971384e-05, + "loss": 3.1924, + "step": 5874 + }, + { + "epoch": 0.75, + "grad_norm": 0.5984390377998352, + "learning_rate": 7.638429436130945e-05, + "loss": 3.2409, + "step": 5875 + }, + { + "epoch": 0.75, + "grad_norm": 0.6378467679023743, + "learning_rate": 7.630972614945756e-05, + "loss": 3.3597, + "step": 5876 + }, + { + "epoch": 0.75, + "grad_norm": 0.7053393721580505, + "learning_rate": 7.623518779697733e-05, + "loss": 3.4355, + "step": 5877 + }, + { + "epoch": 0.75, + "grad_norm": 0.673662006855011, + "learning_rate": 7.616067931668277e-05, + "loss": 3.3187, + "step": 5878 + }, + { + "epoch": 0.75, + "grad_norm": 0.6976150870323181, + "learning_rate": 7.608620072138278e-05, + "loss": 3.3573, + "step": 5879 + }, + { + "epoch": 0.75, + "grad_norm": 0.6496651768684387, + "learning_rate": 7.601175202388097e-05, + "loss": 3.2845, + "step": 5880 + }, + { + "epoch": 0.75, + "grad_norm": 0.6530563831329346, + "learning_rate": 7.593733323697613e-05, + "loss": 3.317, + "step": 5881 + }, + { + "epoch": 0.75, + "grad_norm": 0.6424029469490051, + "learning_rate": 7.586294437346158e-05, + "loss": 3.2376, + "step": 5882 + }, + { + "epoch": 0.75, + "grad_norm": 0.6783373355865479, + "learning_rate": 7.578858544612571e-05, + "loss": 3.2549, + "step": 5883 + }, + { + "epoch": 0.75, + "grad_norm": 0.6218166947364807, + "learning_rate": 7.571425646775151e-05, + "loss": 3.3064, + "step": 5884 + }, + { + "epoch": 0.75, + "grad_norm": 0.6839401125907898, + "learning_rate": 7.563995745111724e-05, + "loss": 3.295, + "step": 5885 + }, + { + "epoch": 0.75, + "grad_norm": 0.6496739983558655, + "learning_rate": 7.55656884089956e-05, + "loss": 3.3063, + "step": 5886 + }, + { + "epoch": 0.75, + "grad_norm": 0.6135286688804626, + "learning_rate": 7.549144935415434e-05, + "loss": 3.3386, + "step": 5887 + }, + { + "epoch": 0.75, + "grad_norm": 0.6319910883903503, + "learning_rate": 7.541724029935596e-05, + "loss": 3.269, + "step": 5888 + }, + { + "epoch": 0.75, + "grad_norm": 0.691398561000824, + "learning_rate": 7.534306125735796e-05, + "loss": 3.2814, + "step": 5889 + }, + { + "epoch": 0.75, + "grad_norm": 0.6186633706092834, + "learning_rate": 7.526891224091254e-05, + "loss": 3.2561, + "step": 5890 + }, + { + "epoch": 0.75, + "grad_norm": 0.6427898406982422, + "learning_rate": 7.519479326276677e-05, + "loss": 3.4017, + "step": 5891 + }, + { + "epoch": 0.75, + "grad_norm": 0.6385214328765869, + "learning_rate": 7.512070433566253e-05, + "loss": 3.404, + "step": 5892 + }, + { + "epoch": 0.75, + "grad_norm": 0.6215147972106934, + "learning_rate": 7.504664547233655e-05, + "loss": 3.3073, + "step": 5893 + }, + { + "epoch": 0.75, + "grad_norm": 0.6701691746711731, + "learning_rate": 7.497261668552049e-05, + "loss": 3.2871, + "step": 5894 + }, + { + "epoch": 0.75, + "grad_norm": 0.5981207489967346, + "learning_rate": 7.489861798794065e-05, + "loss": 3.2929, + "step": 5895 + }, + { + "epoch": 0.75, + "grad_norm": 0.6607575416564941, + "learning_rate": 7.482464939231842e-05, + "loss": 3.4491, + "step": 5896 + }, + { + "epoch": 0.75, + "grad_norm": 0.5619738101959229, + "learning_rate": 7.475071091136973e-05, + "loss": 3.2607, + "step": 5897 + }, + { + "epoch": 0.75, + "grad_norm": 0.6099634766578674, + "learning_rate": 7.467680255780555e-05, + "loss": 3.2743, + "step": 5898 + }, + { + "epoch": 0.76, + "grad_norm": 0.6769587397575378, + "learning_rate": 7.460292434433147e-05, + "loss": 3.3378, + "step": 5899 + }, + { + "epoch": 0.76, + "grad_norm": 0.6428564786911011, + "learning_rate": 7.45290762836481e-05, + "loss": 3.3223, + "step": 5900 + }, + { + "epoch": 0.76, + "grad_norm": 0.7414649128913879, + "learning_rate": 7.445525838845076e-05, + "loss": 3.2627, + "step": 5901 + }, + { + "epoch": 0.76, + "grad_norm": 0.6124725341796875, + "learning_rate": 7.43814706714295e-05, + "loss": 3.2105, + "step": 5902 + }, + { + "epoch": 0.76, + "grad_norm": 0.6249970197677612, + "learning_rate": 7.43077131452694e-05, + "loss": 3.4095, + "step": 5903 + }, + { + "epoch": 0.76, + "grad_norm": 0.7355794310569763, + "learning_rate": 7.423398582265026e-05, + "loss": 3.2868, + "step": 5904 + }, + { + "epoch": 0.76, + "grad_norm": 0.6775880455970764, + "learning_rate": 7.416028871624664e-05, + "loss": 3.2442, + "step": 5905 + }, + { + "epoch": 0.76, + "grad_norm": 0.6577247381210327, + "learning_rate": 7.408662183872786e-05, + "loss": 3.3793, + "step": 5906 + }, + { + "epoch": 0.76, + "grad_norm": 0.7472736239433289, + "learning_rate": 7.401298520275817e-05, + "loss": 3.2458, + "step": 5907 + }, + { + "epoch": 0.76, + "grad_norm": 0.6332255005836487, + "learning_rate": 7.393937882099656e-05, + "loss": 3.3351, + "step": 5908 + }, + { + "epoch": 0.76, + "grad_norm": 0.6670913696289062, + "learning_rate": 7.386580270609669e-05, + "loss": 3.267, + "step": 5909 + }, + { + "epoch": 0.76, + "grad_norm": 0.6957175731658936, + "learning_rate": 7.379225687070734e-05, + "loss": 3.381, + "step": 5910 + }, + { + "epoch": 0.76, + "grad_norm": 0.6249982118606567, + "learning_rate": 7.371874132747175e-05, + "loss": 3.3498, + "step": 5911 + }, + { + "epoch": 0.76, + "grad_norm": 0.6238232851028442, + "learning_rate": 7.364525608902823e-05, + "loss": 3.2145, + "step": 5912 + }, + { + "epoch": 0.76, + "grad_norm": 0.6316736936569214, + "learning_rate": 7.357180116800965e-05, + "loss": 3.2978, + "step": 5913 + }, + { + "epoch": 0.76, + "grad_norm": 0.559715211391449, + "learning_rate": 7.349837657704378e-05, + "loss": 3.328, + "step": 5914 + }, + { + "epoch": 0.76, + "grad_norm": 0.6490960717201233, + "learning_rate": 7.342498232875319e-05, + "loss": 3.3539, + "step": 5915 + }, + { + "epoch": 0.76, + "grad_norm": 0.6523640751838684, + "learning_rate": 7.335161843575505e-05, + "loss": 3.2819, + "step": 5916 + }, + { + "epoch": 0.76, + "grad_norm": 0.6588881015777588, + "learning_rate": 7.327828491066169e-05, + "loss": 3.3489, + "step": 5917 + }, + { + "epoch": 0.76, + "grad_norm": 0.6438858509063721, + "learning_rate": 7.32049817660799e-05, + "loss": 3.2901, + "step": 5918 + }, + { + "epoch": 0.76, + "grad_norm": 0.6138406991958618, + "learning_rate": 7.31317090146113e-05, + "loss": 3.1689, + "step": 5919 + }, + { + "epoch": 0.76, + "grad_norm": 0.5893608927726746, + "learning_rate": 7.305846666885236e-05, + "loss": 3.3019, + "step": 5920 + }, + { + "epoch": 0.76, + "grad_norm": 0.6357761025428772, + "learning_rate": 7.298525474139419e-05, + "loss": 3.3627, + "step": 5921 + }, + { + "epoch": 0.76, + "grad_norm": 0.6008619666099548, + "learning_rate": 7.291207324482296e-05, + "loss": 3.3415, + "step": 5922 + }, + { + "epoch": 0.76, + "grad_norm": 0.6560606956481934, + "learning_rate": 7.283892219171933e-05, + "loss": 3.2799, + "step": 5923 + }, + { + "epoch": 0.76, + "grad_norm": 0.5996524691581726, + "learning_rate": 7.27658015946587e-05, + "loss": 3.2154, + "step": 5924 + }, + { + "epoch": 0.76, + "grad_norm": 0.5937591195106506, + "learning_rate": 7.269271146621153e-05, + "loss": 3.2677, + "step": 5925 + }, + { + "epoch": 0.76, + "grad_norm": 0.6687285900115967, + "learning_rate": 7.26196518189428e-05, + "loss": 3.3694, + "step": 5926 + }, + { + "epoch": 0.76, + "grad_norm": 0.6421836614608765, + "learning_rate": 7.254662266541229e-05, + "loss": 3.2305, + "step": 5927 + }, + { + "epoch": 0.76, + "grad_norm": 0.6282448172569275, + "learning_rate": 7.247362401817456e-05, + "loss": 3.4225, + "step": 5928 + }, + { + "epoch": 0.76, + "grad_norm": 0.6458451747894287, + "learning_rate": 7.24006558897789e-05, + "loss": 3.3958, + "step": 5929 + }, + { + "epoch": 0.76, + "grad_norm": 0.6398035287857056, + "learning_rate": 7.232771829276935e-05, + "loss": 3.2902, + "step": 5930 + }, + { + "epoch": 0.76, + "grad_norm": 0.6559479832649231, + "learning_rate": 7.225481123968475e-05, + "loss": 3.3943, + "step": 5931 + }, + { + "epoch": 0.76, + "grad_norm": 0.6435176134109497, + "learning_rate": 7.218193474305881e-05, + "loss": 3.3508, + "step": 5932 + }, + { + "epoch": 0.76, + "grad_norm": 0.7084299921989441, + "learning_rate": 7.21090888154197e-05, + "loss": 3.3744, + "step": 5933 + }, + { + "epoch": 0.76, + "grad_norm": 0.6554321050643921, + "learning_rate": 7.203627346929053e-05, + "loss": 3.3302, + "step": 5934 + }, + { + "epoch": 0.76, + "grad_norm": 0.624515950679779, + "learning_rate": 7.196348871718907e-05, + "loss": 3.3803, + "step": 5935 + }, + { + "epoch": 0.76, + "grad_norm": 0.6426070928573608, + "learning_rate": 7.18907345716279e-05, + "loss": 3.2487, + "step": 5936 + }, + { + "epoch": 0.76, + "grad_norm": 0.6676350235939026, + "learning_rate": 7.181801104511417e-05, + "loss": 3.3991, + "step": 5937 + }, + { + "epoch": 0.76, + "grad_norm": 0.5947633981704712, + "learning_rate": 7.174531815015009e-05, + "loss": 3.2309, + "step": 5938 + }, + { + "epoch": 0.76, + "grad_norm": 0.6089289784431458, + "learning_rate": 7.167265589923228e-05, + "loss": 3.2029, + "step": 5939 + }, + { + "epoch": 0.76, + "grad_norm": 0.6404508352279663, + "learning_rate": 7.16000243048523e-05, + "loss": 3.3253, + "step": 5940 + }, + { + "epoch": 0.76, + "grad_norm": 0.6133842468261719, + "learning_rate": 7.152742337949636e-05, + "loss": 3.2687, + "step": 5941 + }, + { + "epoch": 0.76, + "grad_norm": 0.6085859537124634, + "learning_rate": 7.145485313564536e-05, + "loss": 3.327, + "step": 5942 + }, + { + "epoch": 0.76, + "grad_norm": 0.5930094122886658, + "learning_rate": 7.138231358577498e-05, + "loss": 3.1893, + "step": 5943 + }, + { + "epoch": 0.76, + "grad_norm": 0.6218498349189758, + "learning_rate": 7.13098047423556e-05, + "loss": 3.2663, + "step": 5944 + }, + { + "epoch": 0.76, + "grad_norm": 0.6165961623191833, + "learning_rate": 7.12373266178523e-05, + "loss": 3.3039, + "step": 5945 + }, + { + "epoch": 0.76, + "grad_norm": 0.5737981796264648, + "learning_rate": 7.116487922472498e-05, + "loss": 3.1787, + "step": 5946 + }, + { + "epoch": 0.76, + "grad_norm": 0.6428831815719604, + "learning_rate": 7.109246257542817e-05, + "loss": 3.3723, + "step": 5947 + }, + { + "epoch": 0.76, + "grad_norm": 0.6483456492424011, + "learning_rate": 7.102007668241107e-05, + "loss": 3.3862, + "step": 5948 + }, + { + "epoch": 0.76, + "grad_norm": 0.6161538362503052, + "learning_rate": 7.094772155811774e-05, + "loss": 3.356, + "step": 5949 + }, + { + "epoch": 0.76, + "grad_norm": 0.618384063243866, + "learning_rate": 7.087539721498684e-05, + "loss": 3.1761, + "step": 5950 + }, + { + "epoch": 0.76, + "grad_norm": 0.6284879446029663, + "learning_rate": 7.080310366545176e-05, + "loss": 3.2568, + "step": 5951 + }, + { + "epoch": 0.76, + "grad_norm": 0.6222849488258362, + "learning_rate": 7.073084092194049e-05, + "loss": 3.326, + "step": 5952 + }, + { + "epoch": 0.76, + "grad_norm": 0.7094422578811646, + "learning_rate": 7.065860899687604e-05, + "loss": 3.3456, + "step": 5953 + }, + { + "epoch": 0.76, + "grad_norm": 0.5881149768829346, + "learning_rate": 7.05864079026758e-05, + "loss": 3.2518, + "step": 5954 + }, + { + "epoch": 0.76, + "grad_norm": 0.5845149755477905, + "learning_rate": 7.051423765175194e-05, + "loss": 3.2955, + "step": 5955 + }, + { + "epoch": 0.76, + "grad_norm": 0.6444967985153198, + "learning_rate": 7.044209825651143e-05, + "loss": 3.3345, + "step": 5956 + }, + { + "epoch": 0.76, + "grad_norm": 0.5796284675598145, + "learning_rate": 7.036998972935585e-05, + "loss": 3.2418, + "step": 5957 + }, + { + "epoch": 0.76, + "grad_norm": 0.5962539911270142, + "learning_rate": 7.02979120826814e-05, + "loss": 3.3014, + "step": 5958 + }, + { + "epoch": 0.76, + "grad_norm": 0.6504797339439392, + "learning_rate": 7.022586532887926e-05, + "loss": 3.443, + "step": 5959 + }, + { + "epoch": 0.76, + "grad_norm": 0.6190051436424255, + "learning_rate": 7.015384948033487e-05, + "loss": 3.2198, + "step": 5960 + }, + { + "epoch": 0.76, + "grad_norm": 0.6398943662643433, + "learning_rate": 7.00818645494288e-05, + "loss": 3.3484, + "step": 5961 + }, + { + "epoch": 0.76, + "grad_norm": 0.6127356290817261, + "learning_rate": 7.0009910548536e-05, + "loss": 3.077, + "step": 5962 + }, + { + "epoch": 0.76, + "grad_norm": 0.6366421580314636, + "learning_rate": 6.993798749002622e-05, + "loss": 3.395, + "step": 5963 + }, + { + "epoch": 0.76, + "grad_norm": 0.606545090675354, + "learning_rate": 6.986609538626384e-05, + "loss": 3.325, + "step": 5964 + }, + { + "epoch": 0.76, + "grad_norm": 0.5849049091339111, + "learning_rate": 6.979423424960795e-05, + "loss": 3.2585, + "step": 5965 + }, + { + "epoch": 0.76, + "grad_norm": 0.6326369047164917, + "learning_rate": 6.972240409241224e-05, + "loss": 3.2424, + "step": 5966 + }, + { + "epoch": 0.76, + "grad_norm": 0.642951488494873, + "learning_rate": 6.965060492702525e-05, + "loss": 3.2929, + "step": 5967 + }, + { + "epoch": 0.76, + "grad_norm": 0.6404616236686707, + "learning_rate": 6.95788367657901e-05, + "loss": 3.3485, + "step": 5968 + }, + { + "epoch": 0.76, + "grad_norm": 0.5683895945549011, + "learning_rate": 6.950709962104454e-05, + "loss": 3.2801, + "step": 5969 + }, + { + "epoch": 0.76, + "grad_norm": 0.614601731300354, + "learning_rate": 6.943539350512101e-05, + "loss": 3.2453, + "step": 5970 + }, + { + "epoch": 0.76, + "grad_norm": 0.6009972095489502, + "learning_rate": 6.936371843034663e-05, + "loss": 3.3278, + "step": 5971 + }, + { + "epoch": 0.76, + "grad_norm": 0.6426170468330383, + "learning_rate": 6.929207440904318e-05, + "loss": 3.2078, + "step": 5972 + }, + { + "epoch": 0.76, + "grad_norm": 0.6109356880187988, + "learning_rate": 6.922046145352698e-05, + "loss": 3.2783, + "step": 5973 + }, + { + "epoch": 0.76, + "grad_norm": 0.5790854096412659, + "learning_rate": 6.91488795761093e-05, + "loss": 3.3016, + "step": 5974 + }, + { + "epoch": 0.76, + "grad_norm": 0.6267958283424377, + "learning_rate": 6.907732878909587e-05, + "loss": 3.3314, + "step": 5975 + }, + { + "epoch": 0.76, + "grad_norm": 0.6615608334541321, + "learning_rate": 6.900580910478693e-05, + "loss": 3.2845, + "step": 5976 + }, + { + "epoch": 0.77, + "grad_norm": 0.6061287522315979, + "learning_rate": 6.89343205354778e-05, + "loss": 3.2213, + "step": 5977 + }, + { + "epoch": 0.77, + "grad_norm": 0.6183081865310669, + "learning_rate": 6.886286309345801e-05, + "loss": 3.212, + "step": 5978 + }, + { + "epoch": 0.77, + "grad_norm": 0.615390419960022, + "learning_rate": 6.879143679101202e-05, + "loss": 3.3775, + "step": 5979 + }, + { + "epoch": 0.77, + "grad_norm": 0.6467695832252502, + "learning_rate": 6.872004164041878e-05, + "loss": 3.2776, + "step": 5980 + }, + { + "epoch": 0.77, + "grad_norm": 0.635640561580658, + "learning_rate": 6.864867765395188e-05, + "loss": 3.3567, + "step": 5981 + }, + { + "epoch": 0.77, + "grad_norm": 0.6547449231147766, + "learning_rate": 6.857734484387976e-05, + "loss": 3.2996, + "step": 5982 + }, + { + "epoch": 0.77, + "grad_norm": 0.6103988289833069, + "learning_rate": 6.850604322246532e-05, + "loss": 3.1834, + "step": 5983 + }, + { + "epoch": 0.77, + "grad_norm": 0.6807669401168823, + "learning_rate": 6.843477280196609e-05, + "loss": 3.3163, + "step": 5984 + }, + { + "epoch": 0.77, + "grad_norm": 0.5808712244033813, + "learning_rate": 6.836353359463424e-05, + "loss": 3.3389, + "step": 5985 + }, + { + "epoch": 0.77, + "grad_norm": 0.6186822056770325, + "learning_rate": 6.829232561271672e-05, + "loss": 3.3118, + "step": 5986 + }, + { + "epoch": 0.77, + "grad_norm": 0.6658785343170166, + "learning_rate": 6.822114886845498e-05, + "loss": 3.1867, + "step": 5987 + }, + { + "epoch": 0.77, + "grad_norm": 0.6463857889175415, + "learning_rate": 6.815000337408506e-05, + "loss": 3.2857, + "step": 5988 + }, + { + "epoch": 0.77, + "grad_norm": 0.6342170238494873, + "learning_rate": 6.80788891418378e-05, + "loss": 3.4287, + "step": 5989 + }, + { + "epoch": 0.77, + "grad_norm": 0.6567581295967102, + "learning_rate": 6.800780618393851e-05, + "loss": 3.3444, + "step": 5990 + }, + { + "epoch": 0.77, + "grad_norm": 0.6194071173667908, + "learning_rate": 6.793675451260717e-05, + "loss": 3.2548, + "step": 5991 + }, + { + "epoch": 0.77, + "grad_norm": 0.6007659435272217, + "learning_rate": 6.78657341400584e-05, + "loss": 3.2942, + "step": 5992 + }, + { + "epoch": 0.77, + "grad_norm": 0.6221792101860046, + "learning_rate": 6.779474507850139e-05, + "loss": 3.3124, + "step": 5993 + }, + { + "epoch": 0.77, + "grad_norm": 0.6629132032394409, + "learning_rate": 6.772378734013992e-05, + "loss": 3.2519, + "step": 5994 + }, + { + "epoch": 0.77, + "grad_norm": 0.6474263668060303, + "learning_rate": 6.765286093717265e-05, + "loss": 3.3981, + "step": 5995 + }, + { + "epoch": 0.77, + "grad_norm": 0.6070880889892578, + "learning_rate": 6.758196588179244e-05, + "loss": 3.3196, + "step": 5996 + }, + { + "epoch": 0.77, + "grad_norm": 0.5742640495300293, + "learning_rate": 6.751110218618714e-05, + "loss": 3.2057, + "step": 5997 + }, + { + "epoch": 0.77, + "grad_norm": 0.6586142778396606, + "learning_rate": 6.744026986253895e-05, + "loss": 3.3675, + "step": 5998 + }, + { + "epoch": 0.77, + "grad_norm": 0.5742660760879517, + "learning_rate": 6.736946892302481e-05, + "loss": 3.2621, + "step": 5999 + }, + { + "epoch": 0.77, + "grad_norm": 0.6078349351882935, + "learning_rate": 6.729869937981619e-05, + "loss": 3.2636, + "step": 6000 + }, + { + "epoch": 0.77, + "grad_norm": 0.5926090478897095, + "learning_rate": 6.72279612450792e-05, + "loss": 3.2122, + "step": 6001 + }, + { + "epoch": 0.77, + "grad_norm": 0.6195435523986816, + "learning_rate": 6.715725453097446e-05, + "loss": 3.3282, + "step": 6002 + }, + { + "epoch": 0.77, + "grad_norm": 0.699246883392334, + "learning_rate": 6.708657924965746e-05, + "loss": 3.3197, + "step": 6003 + }, + { + "epoch": 0.77, + "grad_norm": 0.6332224011421204, + "learning_rate": 6.701593541327792e-05, + "loss": 3.2629, + "step": 6004 + }, + { + "epoch": 0.77, + "grad_norm": 0.6227474212646484, + "learning_rate": 6.694532303398048e-05, + "loss": 3.3366, + "step": 6005 + }, + { + "epoch": 0.77, + "grad_norm": 0.6562210917472839, + "learning_rate": 6.687474212390418e-05, + "loss": 3.3181, + "step": 6006 + }, + { + "epoch": 0.77, + "grad_norm": 0.6172775030136108, + "learning_rate": 6.680419269518265e-05, + "loss": 3.3731, + "step": 6007 + }, + { + "epoch": 0.77, + "grad_norm": 0.6271465420722961, + "learning_rate": 6.673367475994421e-05, + "loss": 3.3334, + "step": 6008 + }, + { + "epoch": 0.77, + "grad_norm": 0.626900851726532, + "learning_rate": 6.66631883303116e-05, + "loss": 3.4734, + "step": 6009 + }, + { + "epoch": 0.77, + "grad_norm": 0.7117727398872375, + "learning_rate": 6.659273341840241e-05, + "loss": 3.3387, + "step": 6010 + }, + { + "epoch": 0.77, + "grad_norm": 0.6394992470741272, + "learning_rate": 6.652231003632858e-05, + "loss": 3.4417, + "step": 6011 + }, + { + "epoch": 0.77, + "grad_norm": 0.6169131398200989, + "learning_rate": 6.645191819619672e-05, + "loss": 3.3076, + "step": 6012 + }, + { + "epoch": 0.77, + "grad_norm": 0.6018803715705872, + "learning_rate": 6.638155791010791e-05, + "loss": 3.2318, + "step": 6013 + }, + { + "epoch": 0.77, + "grad_norm": 0.6905596256256104, + "learning_rate": 6.631122919015809e-05, + "loss": 3.2227, + "step": 6014 + }, + { + "epoch": 0.77, + "grad_norm": 0.5747807025909424, + "learning_rate": 6.624093204843746e-05, + "loss": 3.1994, + "step": 6015 + }, + { + "epoch": 0.77, + "grad_norm": 0.6372610926628113, + "learning_rate": 6.61706664970309e-05, + "loss": 3.337, + "step": 6016 + }, + { + "epoch": 0.77, + "grad_norm": 0.6507368087768555, + "learning_rate": 6.610043254801784e-05, + "loss": 3.3928, + "step": 6017 + }, + { + "epoch": 0.77, + "grad_norm": 0.6256568431854248, + "learning_rate": 6.603023021347246e-05, + "loss": 3.192, + "step": 6018 + }, + { + "epoch": 0.77, + "grad_norm": 0.6386218667030334, + "learning_rate": 6.596005950546327e-05, + "loss": 3.3401, + "step": 6019 + }, + { + "epoch": 0.77, + "grad_norm": 0.6473775506019592, + "learning_rate": 6.58899204360534e-05, + "loss": 3.1698, + "step": 6020 + }, + { + "epoch": 0.77, + "grad_norm": 0.6212315559387207, + "learning_rate": 6.58198130173006e-05, + "loss": 3.3463, + "step": 6021 + }, + { + "epoch": 0.77, + "grad_norm": 0.6566527485847473, + "learning_rate": 6.57497372612571e-05, + "loss": 3.3118, + "step": 6022 + }, + { + "epoch": 0.77, + "grad_norm": 0.5994009375572205, + "learning_rate": 6.567969317996982e-05, + "loss": 3.169, + "step": 6023 + }, + { + "epoch": 0.77, + "grad_norm": 0.6485239267349243, + "learning_rate": 6.560968078548005e-05, + "loss": 3.288, + "step": 6024 + }, + { + "epoch": 0.77, + "grad_norm": 0.606691837310791, + "learning_rate": 6.553970008982385e-05, + "loss": 3.1417, + "step": 6025 + }, + { + "epoch": 0.77, + "grad_norm": 0.6456628441810608, + "learning_rate": 6.546975110503164e-05, + "loss": 3.2006, + "step": 6026 + }, + { + "epoch": 0.77, + "grad_norm": 0.6619232296943665, + "learning_rate": 6.53998338431285e-05, + "loss": 3.3092, + "step": 6027 + }, + { + "epoch": 0.77, + "grad_norm": 0.6379346251487732, + "learning_rate": 6.532994831613398e-05, + "loss": 3.422, + "step": 6028 + }, + { + "epoch": 0.77, + "grad_norm": 0.6716301441192627, + "learning_rate": 6.526009453606224e-05, + "loss": 3.3259, + "step": 6029 + }, + { + "epoch": 0.77, + "grad_norm": 0.6789576411247253, + "learning_rate": 6.519027251492185e-05, + "loss": 3.3459, + "step": 6030 + }, + { + "epoch": 0.77, + "grad_norm": 0.6274928450584412, + "learning_rate": 6.512048226471617e-05, + "loss": 3.2663, + "step": 6031 + }, + { + "epoch": 0.77, + "grad_norm": 0.6633818745613098, + "learning_rate": 6.505072379744283e-05, + "loss": 3.3433, + "step": 6032 + }, + { + "epoch": 0.77, + "grad_norm": 0.6291759014129639, + "learning_rate": 6.498099712509428e-05, + "loss": 3.2835, + "step": 6033 + }, + { + "epoch": 0.77, + "grad_norm": 0.624902606010437, + "learning_rate": 6.491130225965722e-05, + "loss": 3.3141, + "step": 6034 + }, + { + "epoch": 0.77, + "grad_norm": 0.6179114580154419, + "learning_rate": 6.484163921311306e-05, + "loss": 3.2325, + "step": 6035 + }, + { + "epoch": 0.77, + "grad_norm": 0.6924793720245361, + "learning_rate": 6.477200799743766e-05, + "loss": 3.2907, + "step": 6036 + }, + { + "epoch": 0.77, + "grad_norm": 0.5994296073913574, + "learning_rate": 6.470240862460142e-05, + "loss": 3.1898, + "step": 6037 + }, + { + "epoch": 0.77, + "grad_norm": 0.6019535660743713, + "learning_rate": 6.46328411065692e-05, + "loss": 3.3504, + "step": 6038 + }, + { + "epoch": 0.77, + "grad_norm": 0.6485642790794373, + "learning_rate": 6.456330545530065e-05, + "loss": 3.4536, + "step": 6039 + }, + { + "epoch": 0.77, + "grad_norm": 0.6324289441108704, + "learning_rate": 6.449380168274965e-05, + "loss": 3.2148, + "step": 6040 + }, + { + "epoch": 0.77, + "grad_norm": 0.6326262354850769, + "learning_rate": 6.442432980086466e-05, + "loss": 3.317, + "step": 6041 + }, + { + "epoch": 0.77, + "grad_norm": 0.8014912009239197, + "learning_rate": 6.43548898215888e-05, + "loss": 3.2295, + "step": 6042 + }, + { + "epoch": 0.77, + "grad_norm": 0.7135244607925415, + "learning_rate": 6.428548175685958e-05, + "loss": 3.3388, + "step": 6043 + }, + { + "epoch": 0.77, + "grad_norm": 0.6073774099349976, + "learning_rate": 6.421610561860902e-05, + "loss": 3.2621, + "step": 6044 + }, + { + "epoch": 0.77, + "grad_norm": 0.6199702024459839, + "learning_rate": 6.414676141876363e-05, + "loss": 3.3099, + "step": 6045 + }, + { + "epoch": 0.77, + "grad_norm": 0.683590292930603, + "learning_rate": 6.407744916924463e-05, + "loss": 3.3776, + "step": 6046 + }, + { + "epoch": 0.77, + "grad_norm": 0.6720425486564636, + "learning_rate": 6.400816888196751e-05, + "loss": 3.1343, + "step": 6047 + }, + { + "epoch": 0.77, + "grad_norm": 0.6250442266464233, + "learning_rate": 6.393892056884234e-05, + "loss": 3.2088, + "step": 6048 + }, + { + "epoch": 0.77, + "grad_norm": 0.6554825305938721, + "learning_rate": 6.386970424177376e-05, + "loss": 3.3075, + "step": 6049 + }, + { + "epoch": 0.77, + "grad_norm": 0.6803299188613892, + "learning_rate": 6.380051991266075e-05, + "loss": 3.3917, + "step": 6050 + }, + { + "epoch": 0.77, + "grad_norm": 0.6328518390655518, + "learning_rate": 6.373136759339704e-05, + "loss": 3.3815, + "step": 6051 + }, + { + "epoch": 0.77, + "grad_norm": 0.6592798829078674, + "learning_rate": 6.366224729587067e-05, + "loss": 3.1769, + "step": 6052 + }, + { + "epoch": 0.77, + "grad_norm": 0.627619743347168, + "learning_rate": 6.359315903196411e-05, + "loss": 3.2093, + "step": 6053 + }, + { + "epoch": 0.77, + "grad_norm": 0.6786208152770996, + "learning_rate": 6.352410281355461e-05, + "loss": 3.2703, + "step": 6054 + }, + { + "epoch": 0.78, + "grad_norm": 0.6732574701309204, + "learning_rate": 6.345507865251366e-05, + "loss": 3.3439, + "step": 6055 + }, + { + "epoch": 0.78, + "grad_norm": 0.5812044143676758, + "learning_rate": 6.338608656070727e-05, + "loss": 3.2492, + "step": 6056 + }, + { + "epoch": 0.78, + "grad_norm": 0.6608644723892212, + "learning_rate": 6.331712654999602e-05, + "loss": 3.2478, + "step": 6057 + }, + { + "epoch": 0.78, + "grad_norm": 0.623185932636261, + "learning_rate": 6.324819863223497e-05, + "loss": 3.2782, + "step": 6058 + }, + { + "epoch": 0.78, + "grad_norm": 0.6139093637466431, + "learning_rate": 6.317930281927348e-05, + "loss": 3.3213, + "step": 6059 + }, + { + "epoch": 0.78, + "grad_norm": 0.6272095441818237, + "learning_rate": 6.311043912295563e-05, + "loss": 3.1978, + "step": 6060 + }, + { + "epoch": 0.78, + "grad_norm": 0.6622560620307922, + "learning_rate": 6.304160755512003e-05, + "loss": 3.3403, + "step": 6061 + }, + { + "epoch": 0.78, + "grad_norm": 0.6419727206230164, + "learning_rate": 6.297280812759945e-05, + "loss": 3.3154, + "step": 6062 + }, + { + "epoch": 0.78, + "grad_norm": 0.6320186853408813, + "learning_rate": 6.29040408522214e-05, + "loss": 3.2966, + "step": 6063 + }, + { + "epoch": 0.78, + "grad_norm": 0.6026878952980042, + "learning_rate": 6.28353057408077e-05, + "loss": 3.3277, + "step": 6064 + }, + { + "epoch": 0.78, + "grad_norm": 0.6492076516151428, + "learning_rate": 6.276660280517477e-05, + "loss": 3.2501, + "step": 6065 + }, + { + "epoch": 0.78, + "grad_norm": 0.6005073189735413, + "learning_rate": 6.269793205713331e-05, + "loss": 3.2953, + "step": 6066 + }, + { + "epoch": 0.78, + "grad_norm": 0.5965752005577087, + "learning_rate": 6.262929350848881e-05, + "loss": 3.2117, + "step": 6067 + }, + { + "epoch": 0.78, + "grad_norm": 0.6626747846603394, + "learning_rate": 6.256068717104093e-05, + "loss": 3.3177, + "step": 6068 + }, + { + "epoch": 0.78, + "grad_norm": 0.6364305019378662, + "learning_rate": 6.249211305658384e-05, + "loss": 3.1901, + "step": 6069 + }, + { + "epoch": 0.78, + "grad_norm": 0.6597917675971985, + "learning_rate": 6.242357117690639e-05, + "loss": 3.2494, + "step": 6070 + }, + { + "epoch": 0.78, + "grad_norm": 0.6223085522651672, + "learning_rate": 6.235506154379161e-05, + "loss": 3.2459, + "step": 6071 + }, + { + "epoch": 0.78, + "grad_norm": 0.5980460047721863, + "learning_rate": 6.228658416901711e-05, + "loss": 3.1854, + "step": 6072 + }, + { + "epoch": 0.78, + "grad_norm": 0.6632031798362732, + "learning_rate": 6.221813906435494e-05, + "loss": 3.2937, + "step": 6073 + }, + { + "epoch": 0.78, + "grad_norm": 0.6205509305000305, + "learning_rate": 6.214972624157159e-05, + "loss": 3.3785, + "step": 6074 + }, + { + "epoch": 0.78, + "grad_norm": 0.6477987766265869, + "learning_rate": 6.208134571242808e-05, + "loss": 3.1485, + "step": 6075 + }, + { + "epoch": 0.78, + "grad_norm": 0.6303578019142151, + "learning_rate": 6.20129974886798e-05, + "loss": 3.1722, + "step": 6076 + }, + { + "epoch": 0.78, + "grad_norm": 0.6248361468315125, + "learning_rate": 6.194468158207659e-05, + "loss": 3.263, + "step": 6077 + }, + { + "epoch": 0.78, + "grad_norm": 0.8012433648109436, + "learning_rate": 6.187639800436265e-05, + "loss": 3.2556, + "step": 6078 + }, + { + "epoch": 0.78, + "grad_norm": 0.6323918104171753, + "learning_rate": 6.18081467672769e-05, + "loss": 3.2504, + "step": 6079 + }, + { + "epoch": 0.78, + "grad_norm": 0.626590371131897, + "learning_rate": 6.17399278825524e-05, + "loss": 3.3973, + "step": 6080 + }, + { + "epoch": 0.78, + "grad_norm": 0.6283361911773682, + "learning_rate": 6.167174136191675e-05, + "loss": 3.2398, + "step": 6081 + }, + { + "epoch": 0.78, + "grad_norm": 0.636721134185791, + "learning_rate": 6.160358721709212e-05, + "loss": 3.2098, + "step": 6082 + }, + { + "epoch": 0.78, + "grad_norm": 0.6429245471954346, + "learning_rate": 6.15354654597949e-05, + "loss": 3.3347, + "step": 6083 + }, + { + "epoch": 0.78, + "grad_norm": 0.6272968649864197, + "learning_rate": 6.146737610173606e-05, + "loss": 3.3246, + "step": 6084 + }, + { + "epoch": 0.78, + "grad_norm": 0.6349840760231018, + "learning_rate": 6.13993191546209e-05, + "loss": 3.252, + "step": 6085 + }, + { + "epoch": 0.78, + "grad_norm": 0.6135838627815247, + "learning_rate": 6.133129463014924e-05, + "loss": 3.3089, + "step": 6086 + }, + { + "epoch": 0.78, + "grad_norm": 0.6382942199707031, + "learning_rate": 6.126330254001522e-05, + "loss": 3.2533, + "step": 6087 + }, + { + "epoch": 0.78, + "grad_norm": 0.690299391746521, + "learning_rate": 6.119534289590747e-05, + "loss": 3.091, + "step": 6088 + }, + { + "epoch": 0.78, + "grad_norm": 0.6160559058189392, + "learning_rate": 6.112741570950919e-05, + "loss": 3.2888, + "step": 6089 + }, + { + "epoch": 0.78, + "grad_norm": 0.6733001470565796, + "learning_rate": 6.105952099249776e-05, + "loss": 3.2529, + "step": 6090 + }, + { + "epoch": 0.78, + "grad_norm": 0.5812699794769287, + "learning_rate": 6.099165875654503e-05, + "loss": 3.3015, + "step": 6091 + }, + { + "epoch": 0.78, + "grad_norm": 0.6092056035995483, + "learning_rate": 6.092382901331733e-05, + "loss": 3.1526, + "step": 6092 + }, + { + "epoch": 0.78, + "grad_norm": 0.6755577921867371, + "learning_rate": 6.0856031774475407e-05, + "loss": 3.3499, + "step": 6093 + }, + { + "epoch": 0.78, + "grad_norm": 0.596678614616394, + "learning_rate": 6.078826705167437e-05, + "loss": 3.2249, + "step": 6094 + }, + { + "epoch": 0.78, + "grad_norm": 0.6615753173828125, + "learning_rate": 6.072053485656365e-05, + "loss": 3.3419, + "step": 6095 + }, + { + "epoch": 0.78, + "grad_norm": 0.6795618534088135, + "learning_rate": 6.065283520078732e-05, + "loss": 3.289, + "step": 6096 + }, + { + "epoch": 0.78, + "grad_norm": 0.6887994408607483, + "learning_rate": 6.05851680959838e-05, + "loss": 3.3519, + "step": 6097 + }, + { + "epoch": 0.78, + "grad_norm": 0.7297321557998657, + "learning_rate": 6.051753355378578e-05, + "loss": 3.1386, + "step": 6098 + }, + { + "epoch": 0.78, + "grad_norm": 0.6433356404304504, + "learning_rate": 6.044993158582038e-05, + "loss": 3.2025, + "step": 6099 + }, + { + "epoch": 0.78, + "grad_norm": 0.6871981024742126, + "learning_rate": 6.038236220370921e-05, + "loss": 3.3401, + "step": 6100 + }, + { + "epoch": 0.78, + "grad_norm": 0.6435525417327881, + "learning_rate": 6.031482541906821e-05, + "loss": 3.4581, + "step": 6101 + }, + { + "epoch": 0.78, + "grad_norm": 0.6278454661369324, + "learning_rate": 6.024732124350768e-05, + "loss": 3.3444, + "step": 6102 + }, + { + "epoch": 0.78, + "grad_norm": 0.6780098676681519, + "learning_rate": 6.017984968863249e-05, + "loss": 3.3399, + "step": 6103 + }, + { + "epoch": 0.78, + "grad_norm": 0.5871052742004395, + "learning_rate": 6.0112410766041735e-05, + "loss": 3.1699, + "step": 6104 + }, + { + "epoch": 0.78, + "grad_norm": 0.6621236205101013, + "learning_rate": 6.0045004487328917e-05, + "loss": 3.2539, + "step": 6105 + }, + { + "epoch": 0.78, + "grad_norm": 0.6626326441764832, + "learning_rate": 5.997763086408192e-05, + "loss": 3.273, + "step": 6106 + }, + { + "epoch": 0.78, + "grad_norm": 0.5612792372703552, + "learning_rate": 5.991028990788316e-05, + "loss": 3.2697, + "step": 6107 + }, + { + "epoch": 0.78, + "grad_norm": 0.6139649748802185, + "learning_rate": 5.984298163030929e-05, + "loss": 3.2308, + "step": 6108 + }, + { + "epoch": 0.78, + "grad_norm": 0.6269494891166687, + "learning_rate": 5.977570604293128e-05, + "loss": 3.3231, + "step": 6109 + }, + { + "epoch": 0.78, + "grad_norm": 0.6203356981277466, + "learning_rate": 5.9708463157314765e-05, + "loss": 3.3443, + "step": 6110 + }, + { + "epoch": 0.78, + "grad_norm": 0.6309711337089539, + "learning_rate": 5.964125298501946e-05, + "loss": 3.2053, + "step": 6111 + }, + { + "epoch": 0.78, + "grad_norm": 0.7104734182357788, + "learning_rate": 5.957407553759961e-05, + "loss": 3.2847, + "step": 6112 + }, + { + "epoch": 0.78, + "grad_norm": 0.6571914553642273, + "learning_rate": 5.950693082660377e-05, + "loss": 3.3378, + "step": 6113 + }, + { + "epoch": 0.78, + "grad_norm": 0.6446319818496704, + "learning_rate": 5.9439818863574913e-05, + "loss": 3.1806, + "step": 6114 + }, + { + "epoch": 0.78, + "grad_norm": 0.632908284664154, + "learning_rate": 5.937273966005028e-05, + "loss": 3.284, + "step": 6115 + }, + { + "epoch": 0.78, + "grad_norm": 0.6450886726379395, + "learning_rate": 5.9305693227561715e-05, + "loss": 3.2289, + "step": 6116 + }, + { + "epoch": 0.78, + "grad_norm": 0.6438875198364258, + "learning_rate": 5.923867957763512e-05, + "loss": 3.2964, + "step": 6117 + }, + { + "epoch": 0.78, + "grad_norm": 0.632624089717865, + "learning_rate": 5.917169872179109e-05, + "loss": 3.3564, + "step": 6118 + }, + { + "epoch": 0.78, + "grad_norm": 0.6316910982131958, + "learning_rate": 5.91047506715443e-05, + "loss": 3.3003, + "step": 6119 + }, + { + "epoch": 0.78, + "grad_norm": 0.6209931373596191, + "learning_rate": 5.903783543840393e-05, + "loss": 3.2094, + "step": 6120 + }, + { + "epoch": 0.78, + "grad_norm": 0.5965805053710938, + "learning_rate": 5.8970953033873484e-05, + "loss": 3.1804, + "step": 6121 + }, + { + "epoch": 0.78, + "grad_norm": 0.6775824427604675, + "learning_rate": 5.89041034694508e-05, + "loss": 3.3426, + "step": 6122 + }, + { + "epoch": 0.78, + "grad_norm": 0.6145557165145874, + "learning_rate": 5.8837286756628025e-05, + "loss": 3.2335, + "step": 6123 + }, + { + "epoch": 0.78, + "grad_norm": 0.6069118976593018, + "learning_rate": 5.877050290689182e-05, + "loss": 3.1969, + "step": 6124 + }, + { + "epoch": 0.78, + "grad_norm": 0.6316295266151428, + "learning_rate": 5.870375193172314e-05, + "loss": 3.3389, + "step": 6125 + }, + { + "epoch": 0.78, + "grad_norm": 0.654758870601654, + "learning_rate": 5.8637033842597224e-05, + "loss": 3.2345, + "step": 6126 + }, + { + "epoch": 0.78, + "grad_norm": 0.6531403064727783, + "learning_rate": 5.857034865098365e-05, + "loss": 3.2278, + "step": 6127 + }, + { + "epoch": 0.78, + "grad_norm": 0.6327181458473206, + "learning_rate": 5.8503696368346374e-05, + "loss": 3.1833, + "step": 6128 + }, + { + "epoch": 0.78, + "grad_norm": 0.6120930314064026, + "learning_rate": 5.84370770061437e-05, + "loss": 3.3167, + "step": 6129 + }, + { + "epoch": 0.78, + "grad_norm": 0.6629424095153809, + "learning_rate": 5.8370490575828216e-05, + "loss": 3.2911, + "step": 6130 + }, + { + "epoch": 0.78, + "grad_norm": 0.6397058367729187, + "learning_rate": 5.8303937088847045e-05, + "loss": 3.2963, + "step": 6131 + }, + { + "epoch": 0.78, + "grad_norm": 0.6212075352668762, + "learning_rate": 5.823741655664141e-05, + "loss": 3.4139, + "step": 6132 + }, + { + "epoch": 0.79, + "grad_norm": 0.635893702507019, + "learning_rate": 5.817092899064691e-05, + "loss": 3.398, + "step": 6133 + }, + { + "epoch": 0.79, + "grad_norm": 0.6343691945075989, + "learning_rate": 5.810447440229366e-05, + "loss": 3.2888, + "step": 6134 + }, + { + "epoch": 0.79, + "grad_norm": 0.6382713913917542, + "learning_rate": 5.8038052803005894e-05, + "loss": 3.263, + "step": 6135 + }, + { + "epoch": 0.79, + "grad_norm": 0.6321362257003784, + "learning_rate": 5.797166420420228e-05, + "loss": 3.327, + "step": 6136 + }, + { + "epoch": 0.79, + "grad_norm": 0.631368100643158, + "learning_rate": 5.790530861729576e-05, + "loss": 3.2032, + "step": 6137 + }, + { + "epoch": 0.79, + "grad_norm": 0.5903132557868958, + "learning_rate": 5.783898605369356e-05, + "loss": 3.2422, + "step": 6138 + }, + { + "epoch": 0.79, + "grad_norm": 0.6581616401672363, + "learning_rate": 5.777269652479747e-05, + "loss": 3.2814, + "step": 6139 + }, + { + "epoch": 0.79, + "grad_norm": 0.638910710811615, + "learning_rate": 5.770644004200332e-05, + "loss": 3.2615, + "step": 6140 + }, + { + "epoch": 0.79, + "grad_norm": 0.6400853991508484, + "learning_rate": 5.764021661670141e-05, + "loss": 3.2304, + "step": 6141 + }, + { + "epoch": 0.79, + "grad_norm": 0.6094925999641418, + "learning_rate": 5.757402626027625e-05, + "loss": 3.2817, + "step": 6142 + }, + { + "epoch": 0.79, + "grad_norm": 0.7051050066947937, + "learning_rate": 5.750786898410673e-05, + "loss": 3.2862, + "step": 6143 + }, + { + "epoch": 0.79, + "grad_norm": 0.6410810351371765, + "learning_rate": 5.744174479956615e-05, + "loss": 3.3413, + "step": 6144 + }, + { + "epoch": 0.79, + "grad_norm": 0.6460927724838257, + "learning_rate": 5.737565371802189e-05, + "loss": 3.2164, + "step": 6145 + }, + { + "epoch": 0.79, + "grad_norm": 0.6438145041465759, + "learning_rate": 5.730959575083594e-05, + "loss": 3.2533, + "step": 6146 + }, + { + "epoch": 0.79, + "grad_norm": 0.6696481108665466, + "learning_rate": 5.7243570909364326e-05, + "loss": 3.2115, + "step": 6147 + }, + { + "epoch": 0.79, + "grad_norm": 0.6192052960395813, + "learning_rate": 5.71775792049575e-05, + "loss": 3.2049, + "step": 6148 + }, + { + "epoch": 0.79, + "grad_norm": 0.5886045694351196, + "learning_rate": 5.711162064896019e-05, + "loss": 3.2451, + "step": 6149 + }, + { + "epoch": 0.79, + "grad_norm": 0.6182309985160828, + "learning_rate": 5.7045695252711475e-05, + "loss": 3.1899, + "step": 6150 + }, + { + "epoch": 0.79, + "grad_norm": 0.6360798478126526, + "learning_rate": 5.697980302754458e-05, + "loss": 3.3021, + "step": 6151 + }, + { + "epoch": 0.79, + "grad_norm": 0.6290764808654785, + "learning_rate": 5.691394398478727e-05, + "loss": 3.2406, + "step": 6152 + }, + { + "epoch": 0.79, + "grad_norm": 0.6354265213012695, + "learning_rate": 5.684811813576138e-05, + "loss": 3.3239, + "step": 6153 + }, + { + "epoch": 0.79, + "grad_norm": 0.659576952457428, + "learning_rate": 5.678232549178328e-05, + "loss": 3.4077, + "step": 6154 + }, + { + "epoch": 0.79, + "grad_norm": 0.6638350486755371, + "learning_rate": 5.671656606416337e-05, + "loss": 3.2351, + "step": 6155 + }, + { + "epoch": 0.79, + "grad_norm": 0.657824695110321, + "learning_rate": 5.6650839864206496e-05, + "loss": 3.371, + "step": 6156 + }, + { + "epoch": 0.79, + "grad_norm": 0.648253858089447, + "learning_rate": 5.658514690321176e-05, + "loss": 3.2927, + "step": 6157 + }, + { + "epoch": 0.79, + "grad_norm": 0.6625542044639587, + "learning_rate": 5.651948719247252e-05, + "loss": 3.1641, + "step": 6158 + }, + { + "epoch": 0.79, + "grad_norm": 0.6146804094314575, + "learning_rate": 5.645386074327638e-05, + "loss": 3.2208, + "step": 6159 + }, + { + "epoch": 0.79, + "grad_norm": 0.7196055054664612, + "learning_rate": 5.63882675669054e-05, + "loss": 3.2925, + "step": 6160 + }, + { + "epoch": 0.79, + "grad_norm": 0.6016031503677368, + "learning_rate": 5.6322707674635736e-05, + "loss": 3.3658, + "step": 6161 + }, + { + "epoch": 0.79, + "grad_norm": 0.6775092482566833, + "learning_rate": 5.6257181077737975e-05, + "loss": 3.2381, + "step": 6162 + }, + { + "epoch": 0.79, + "grad_norm": 0.6624186635017395, + "learning_rate": 5.619168778747685e-05, + "loss": 3.2365, + "step": 6163 + }, + { + "epoch": 0.79, + "grad_norm": 0.6462388634681702, + "learning_rate": 5.6126227815111425e-05, + "loss": 3.3843, + "step": 6164 + }, + { + "epoch": 0.79, + "grad_norm": 0.6163438558578491, + "learning_rate": 5.6060801171894996e-05, + "loss": 3.2424, + "step": 6165 + }, + { + "epoch": 0.79, + "grad_norm": 0.5882861018180847, + "learning_rate": 5.599540786907512e-05, + "loss": 3.446, + "step": 6166 + }, + { + "epoch": 0.79, + "grad_norm": 0.6447424292564392, + "learning_rate": 5.59300479178938e-05, + "loss": 3.2844, + "step": 6167 + }, + { + "epoch": 0.79, + "grad_norm": 0.5792222023010254, + "learning_rate": 5.5864721329587084e-05, + "loss": 3.1562, + "step": 6168 + }, + { + "epoch": 0.79, + "grad_norm": 0.6093016266822815, + "learning_rate": 5.579942811538538e-05, + "loss": 3.2774, + "step": 6169 + }, + { + "epoch": 0.79, + "grad_norm": 0.6209796071052551, + "learning_rate": 5.573416828651329e-05, + "loss": 3.4412, + "step": 6170 + }, + { + "epoch": 0.79, + "grad_norm": 0.62843257188797, + "learning_rate": 5.566894185418986e-05, + "loss": 3.3524, + "step": 6171 + }, + { + "epoch": 0.79, + "grad_norm": 0.6866121292114258, + "learning_rate": 5.560374882962821e-05, + "loss": 3.3448, + "step": 6172 + }, + { + "epoch": 0.79, + "grad_norm": 0.6563045382499695, + "learning_rate": 5.553858922403576e-05, + "loss": 3.2293, + "step": 6173 + }, + { + "epoch": 0.79, + "grad_norm": 0.6468909382820129, + "learning_rate": 5.5473463048614144e-05, + "loss": 3.3161, + "step": 6174 + }, + { + "epoch": 0.79, + "grad_norm": 0.6102748513221741, + "learning_rate": 5.540837031455945e-05, + "loss": 3.3122, + "step": 6175 + }, + { + "epoch": 0.79, + "grad_norm": 0.6245465278625488, + "learning_rate": 5.534331103306181e-05, + "loss": 3.2799, + "step": 6176 + }, + { + "epoch": 0.79, + "grad_norm": 0.6340338587760925, + "learning_rate": 5.527828521530562e-05, + "loss": 3.3966, + "step": 6177 + }, + { + "epoch": 0.79, + "grad_norm": 0.6553022265434265, + "learning_rate": 5.521329287246965e-05, + "loss": 3.1975, + "step": 6178 + }, + { + "epoch": 0.79, + "grad_norm": 0.6260964274406433, + "learning_rate": 5.5148334015726774e-05, + "loss": 3.181, + "step": 6179 + }, + { + "epoch": 0.79, + "grad_norm": 0.6341874003410339, + "learning_rate": 5.5083408656244146e-05, + "loss": 3.1921, + "step": 6180 + }, + { + "epoch": 0.79, + "grad_norm": 0.6693719029426575, + "learning_rate": 5.501851680518322e-05, + "loss": 3.3772, + "step": 6181 + }, + { + "epoch": 0.79, + "grad_norm": 0.6416365504264832, + "learning_rate": 5.4953658473699734e-05, + "loss": 3.3323, + "step": 6182 + }, + { + "epoch": 0.79, + "grad_norm": 0.6330123543739319, + "learning_rate": 5.488883367294353e-05, + "loss": 3.3311, + "step": 6183 + }, + { + "epoch": 0.79, + "grad_norm": 0.6906682252883911, + "learning_rate": 5.482404241405875e-05, + "loss": 3.2435, + "step": 6184 + }, + { + "epoch": 0.79, + "grad_norm": 0.6447123289108276, + "learning_rate": 5.4759284708183755e-05, + "loss": 3.2492, + "step": 6185 + }, + { + "epoch": 0.79, + "grad_norm": 0.6190230250358582, + "learning_rate": 5.469456056645114e-05, + "loss": 3.276, + "step": 6186 + }, + { + "epoch": 0.79, + "grad_norm": 0.60328209400177, + "learning_rate": 5.4629869999987675e-05, + "loss": 3.2909, + "step": 6187 + }, + { + "epoch": 0.79, + "grad_norm": 0.6737046837806702, + "learning_rate": 5.4565213019914544e-05, + "loss": 3.3374, + "step": 6188 + }, + { + "epoch": 0.79, + "grad_norm": 0.6252620816230774, + "learning_rate": 5.450058963734691e-05, + "loss": 3.3156, + "step": 6189 + }, + { + "epoch": 0.79, + "grad_norm": 0.6689559817314148, + "learning_rate": 5.44359998633944e-05, + "loss": 3.4685, + "step": 6190 + }, + { + "epoch": 0.79, + "grad_norm": 0.6267184019088745, + "learning_rate": 5.437144370916069e-05, + "loss": 3.3537, + "step": 6191 + }, + { + "epoch": 0.79, + "grad_norm": 0.6087378859519958, + "learning_rate": 5.430692118574374e-05, + "loss": 3.3843, + "step": 6192 + }, + { + "epoch": 0.79, + "grad_norm": 0.6335881352424622, + "learning_rate": 5.424243230423567e-05, + "loss": 3.2503, + "step": 6193 + }, + { + "epoch": 0.79, + "grad_norm": 0.6103877425193787, + "learning_rate": 5.417797707572294e-05, + "loss": 3.1548, + "step": 6194 + }, + { + "epoch": 0.79, + "grad_norm": 0.6789931058883667, + "learning_rate": 5.411355551128602e-05, + "loss": 3.2277, + "step": 6195 + }, + { + "epoch": 0.79, + "grad_norm": 0.6586472392082214, + "learning_rate": 5.404916762199991e-05, + "loss": 3.3497, + "step": 6196 + }, + { + "epoch": 0.79, + "grad_norm": 0.6357256174087524, + "learning_rate": 5.3984813418933545e-05, + "loss": 3.3711, + "step": 6197 + }, + { + "epoch": 0.79, + "grad_norm": 0.6573581695556641, + "learning_rate": 5.3920492913150074e-05, + "loss": 3.2539, + "step": 6198 + }, + { + "epoch": 0.79, + "grad_norm": 0.6482775211334229, + "learning_rate": 5.3856206115707077e-05, + "loss": 3.1406, + "step": 6199 + }, + { + "epoch": 0.79, + "grad_norm": 0.6598178148269653, + "learning_rate": 5.379195303765616e-05, + "loss": 3.2314, + "step": 6200 + }, + { + "epoch": 0.79, + "grad_norm": 0.6228412985801697, + "learning_rate": 5.372773369004314e-05, + "loss": 3.1364, + "step": 6201 + }, + { + "epoch": 0.79, + "grad_norm": 0.7382637858390808, + "learning_rate": 5.366354808390803e-05, + "loss": 3.236, + "step": 6202 + }, + { + "epoch": 0.79, + "grad_norm": 0.6506236791610718, + "learning_rate": 5.35993962302852e-05, + "loss": 3.1999, + "step": 6203 + }, + { + "epoch": 0.79, + "grad_norm": 0.6304946541786194, + "learning_rate": 5.353527814020301e-05, + "loss": 3.3298, + "step": 6204 + }, + { + "epoch": 0.79, + "grad_norm": 0.6180459260940552, + "learning_rate": 5.347119382468413e-05, + "loss": 3.2057, + "step": 6205 + }, + { + "epoch": 0.79, + "grad_norm": 0.6518542766571045, + "learning_rate": 5.340714329474541e-05, + "loss": 3.1527, + "step": 6206 + }, + { + "epoch": 0.79, + "grad_norm": 0.6927798390388489, + "learning_rate": 5.3343126561397756e-05, + "loss": 3.2682, + "step": 6207 + }, + { + "epoch": 0.79, + "grad_norm": 0.6122478246688843, + "learning_rate": 5.327914363564659e-05, + "loss": 3.3978, + "step": 6208 + }, + { + "epoch": 0.79, + "grad_norm": 0.607306182384491, + "learning_rate": 5.3215194528491186e-05, + "loss": 3.2244, + "step": 6209 + }, + { + "epoch": 0.79, + "grad_norm": 0.7095955610275269, + "learning_rate": 5.315127925092511e-05, + "loss": 3.3604, + "step": 6210 + }, + { + "epoch": 0.8, + "grad_norm": 0.6393181681632996, + "learning_rate": 5.308739781393629e-05, + "loss": 3.142, + "step": 6211 + }, + { + "epoch": 0.8, + "grad_norm": 0.7591974139213562, + "learning_rate": 5.302355022850655e-05, + "loss": 3.3069, + "step": 6212 + }, + { + "epoch": 0.8, + "grad_norm": 0.6667575836181641, + "learning_rate": 5.295973650561212e-05, + "loss": 3.3487, + "step": 6213 + }, + { + "epoch": 0.8, + "grad_norm": 0.6396272778511047, + "learning_rate": 5.289595665622324e-05, + "loss": 3.2997, + "step": 6214 + }, + { + "epoch": 0.8, + "grad_norm": 0.6105965375900269, + "learning_rate": 5.283221069130442e-05, + "loss": 3.4331, + "step": 6215 + }, + { + "epoch": 0.8, + "grad_norm": 0.6773115992546082, + "learning_rate": 5.276849862181432e-05, + "loss": 3.3996, + "step": 6216 + }, + { + "epoch": 0.8, + "grad_norm": 0.6334084868431091, + "learning_rate": 5.2704820458705785e-05, + "loss": 3.1817, + "step": 6217 + }, + { + "epoch": 0.8, + "grad_norm": 0.6564429402351379, + "learning_rate": 5.264117621292594e-05, + "loss": 3.1336, + "step": 6218 + }, + { + "epoch": 0.8, + "grad_norm": 0.631753146648407, + "learning_rate": 5.2577565895415876e-05, + "loss": 3.2437, + "step": 6219 + }, + { + "epoch": 0.8, + "grad_norm": 0.6693889498710632, + "learning_rate": 5.251398951711095e-05, + "loss": 3.3941, + "step": 6220 + }, + { + "epoch": 0.8, + "grad_norm": 0.6456241607666016, + "learning_rate": 5.245044708894067e-05, + "loss": 3.2437, + "step": 6221 + }, + { + "epoch": 0.8, + "grad_norm": 0.6354429125785828, + "learning_rate": 5.238693862182875e-05, + "loss": 3.1984, + "step": 6222 + }, + { + "epoch": 0.8, + "grad_norm": 0.6462532877922058, + "learning_rate": 5.232346412669292e-05, + "loss": 3.4046, + "step": 6223 + }, + { + "epoch": 0.8, + "grad_norm": 0.6371812224388123, + "learning_rate": 5.2260023614445344e-05, + "loss": 3.3915, + "step": 6224 + }, + { + "epoch": 0.8, + "grad_norm": 0.6222655773162842, + "learning_rate": 5.2196617095992103e-05, + "loss": 3.273, + "step": 6225 + }, + { + "epoch": 0.8, + "grad_norm": 0.618720531463623, + "learning_rate": 5.213324458223345e-05, + "loss": 3.3342, + "step": 6226 + }, + { + "epoch": 0.8, + "grad_norm": 0.6369234919548035, + "learning_rate": 5.2069906084064006e-05, + "loss": 3.3978, + "step": 6227 + }, + { + "epoch": 0.8, + "grad_norm": 0.6682491302490234, + "learning_rate": 5.200660161237228e-05, + "loss": 3.1945, + "step": 6228 + }, + { + "epoch": 0.8, + "grad_norm": 0.6570479273796082, + "learning_rate": 5.1943331178041094e-05, + "loss": 3.2701, + "step": 6229 + }, + { + "epoch": 0.8, + "grad_norm": 0.6652297973632812, + "learning_rate": 5.188009479194736e-05, + "loss": 3.3443, + "step": 6230 + }, + { + "epoch": 0.8, + "grad_norm": 0.6708425283432007, + "learning_rate": 5.1816892464962046e-05, + "loss": 3.3297, + "step": 6231 + }, + { + "epoch": 0.8, + "grad_norm": 0.653319776058197, + "learning_rate": 5.1753724207950517e-05, + "loss": 3.2837, + "step": 6232 + }, + { + "epoch": 0.8, + "grad_norm": 0.6871150732040405, + "learning_rate": 5.1690590031772066e-05, + "loss": 3.3362, + "step": 6233 + }, + { + "epoch": 0.8, + "grad_norm": 0.6416165828704834, + "learning_rate": 5.16274899472802e-05, + "loss": 3.2679, + "step": 6234 + }, + { + "epoch": 0.8, + "grad_norm": 0.6724268198013306, + "learning_rate": 5.156442396532246e-05, + "loss": 3.3087, + "step": 6235 + }, + { + "epoch": 0.8, + "grad_norm": 0.5732466578483582, + "learning_rate": 5.150139209674079e-05, + "loss": 3.2837, + "step": 6236 + }, + { + "epoch": 0.8, + "grad_norm": 0.6079490780830383, + "learning_rate": 5.1438394352370994e-05, + "loss": 3.3867, + "step": 6237 + }, + { + "epoch": 0.8, + "grad_norm": 0.6082061529159546, + "learning_rate": 5.137543074304307e-05, + "loss": 3.4159, + "step": 6238 + }, + { + "epoch": 0.8, + "grad_norm": 0.6031527519226074, + "learning_rate": 5.131250127958134e-05, + "loss": 3.2258, + "step": 6239 + }, + { + "epoch": 0.8, + "grad_norm": 0.68482905626297, + "learning_rate": 5.124960597280401e-05, + "loss": 3.3032, + "step": 6240 + }, + { + "epoch": 0.8, + "grad_norm": 0.6390005946159363, + "learning_rate": 5.1186744833523533e-05, + "loss": 3.3787, + "step": 6241 + }, + { + "epoch": 0.8, + "grad_norm": 0.6423435807228088, + "learning_rate": 5.112391787254645e-05, + "loss": 3.3118, + "step": 6242 + }, + { + "epoch": 0.8, + "grad_norm": 0.6239182353019714, + "learning_rate": 5.106112510067346e-05, + "loss": 3.25, + "step": 6243 + }, + { + "epoch": 0.8, + "grad_norm": 0.6259147524833679, + "learning_rate": 5.0998366528699266e-05, + "loss": 3.2261, + "step": 6244 + }, + { + "epoch": 0.8, + "grad_norm": 0.6671559810638428, + "learning_rate": 5.0935642167413e-05, + "loss": 3.5002, + "step": 6245 + }, + { + "epoch": 0.8, + "grad_norm": 0.6126053333282471, + "learning_rate": 5.087295202759751e-05, + "loss": 3.4132, + "step": 6246 + }, + { + "epoch": 0.8, + "grad_norm": 0.6345648765563965, + "learning_rate": 5.081029612003013e-05, + "loss": 3.3136, + "step": 6247 + }, + { + "epoch": 0.8, + "grad_norm": 0.6129223704338074, + "learning_rate": 5.074767445548204e-05, + "loss": 3.3363, + "step": 6248 + }, + { + "epoch": 0.8, + "grad_norm": 0.6308622360229492, + "learning_rate": 5.0685087044718635e-05, + "loss": 3.3067, + "step": 6249 + }, + { + "epoch": 0.8, + "grad_norm": 0.5842102766036987, + "learning_rate": 5.062253389849941e-05, + "loss": 3.1752, + "step": 6250 + }, + { + "epoch": 0.8, + "grad_norm": 0.62709641456604, + "learning_rate": 5.056001502757801e-05, + "loss": 3.3369, + "step": 6251 + }, + { + "epoch": 0.8, + "grad_norm": 0.6009126901626587, + "learning_rate": 5.0497530442702086e-05, + "loss": 3.2501, + "step": 6252 + }, + { + "epoch": 0.8, + "grad_norm": 0.6700008511543274, + "learning_rate": 5.043508015461354e-05, + "loss": 3.3443, + "step": 6253 + }, + { + "epoch": 0.8, + "grad_norm": 0.66363126039505, + "learning_rate": 5.037266417404823e-05, + "loss": 3.211, + "step": 6254 + }, + { + "epoch": 0.8, + "grad_norm": 0.669608473777771, + "learning_rate": 5.031028251173628e-05, + "loss": 3.1384, + "step": 6255 + }, + { + "epoch": 0.8, + "grad_norm": 0.6509547233581543, + "learning_rate": 5.024793517840176e-05, + "loss": 3.2467, + "step": 6256 + }, + { + "epoch": 0.8, + "grad_norm": 0.5959102511405945, + "learning_rate": 5.018562218476294e-05, + "loss": 3.2967, + "step": 6257 + }, + { + "epoch": 0.8, + "grad_norm": 0.650132954120636, + "learning_rate": 5.012334354153208e-05, + "loss": 3.2285, + "step": 6258 + }, + { + "epoch": 0.8, + "grad_norm": 0.5863680243492126, + "learning_rate": 5.00610992594156e-05, + "loss": 3.3313, + "step": 6259 + }, + { + "epoch": 0.8, + "grad_norm": 0.6746791005134583, + "learning_rate": 4.999888934911409e-05, + "loss": 3.2599, + "step": 6260 + }, + { + "epoch": 0.8, + "grad_norm": 0.6146058440208435, + "learning_rate": 4.993671382132212e-05, + "loss": 3.2519, + "step": 6261 + }, + { + "epoch": 0.8, + "grad_norm": 0.6945891976356506, + "learning_rate": 4.987457268672837e-05, + "loss": 3.3743, + "step": 6262 + }, + { + "epoch": 0.8, + "grad_norm": 0.6706831455230713, + "learning_rate": 4.981246595601555e-05, + "loss": 3.2523, + "step": 6263 + }, + { + "epoch": 0.8, + "grad_norm": 0.6371322870254517, + "learning_rate": 4.9750393639860694e-05, + "loss": 3.2625, + "step": 6264 + }, + { + "epoch": 0.8, + "grad_norm": 0.6765096187591553, + "learning_rate": 4.9688355748934654e-05, + "loss": 3.2962, + "step": 6265 + }, + { + "epoch": 0.8, + "grad_norm": 0.6514853835105896, + "learning_rate": 4.9626352293902474e-05, + "loss": 3.4608, + "step": 6266 + }, + { + "epoch": 0.8, + "grad_norm": 0.6448090076446533, + "learning_rate": 4.9564383285423204e-05, + "loss": 3.3251, + "step": 6267 + }, + { + "epoch": 0.8, + "grad_norm": 0.6782955527305603, + "learning_rate": 4.950244873415016e-05, + "loss": 3.2665, + "step": 6268 + }, + { + "epoch": 0.8, + "grad_norm": 0.6292037963867188, + "learning_rate": 4.9440548650730555e-05, + "loss": 3.3043, + "step": 6269 + }, + { + "epoch": 0.8, + "grad_norm": 0.6162392497062683, + "learning_rate": 4.93786830458057e-05, + "loss": 3.2856, + "step": 6270 + }, + { + "epoch": 0.8, + "grad_norm": 0.6886681914329529, + "learning_rate": 4.931685193001106e-05, + "loss": 3.2479, + "step": 6271 + }, + { + "epoch": 0.8, + "grad_norm": 0.6889098882675171, + "learning_rate": 4.9255055313976003e-05, + "loss": 3.4246, + "step": 6272 + }, + { + "epoch": 0.8, + "grad_norm": 0.6550235152244568, + "learning_rate": 4.9193293208324246e-05, + "loss": 3.3797, + "step": 6273 + }, + { + "epoch": 0.8, + "grad_norm": 0.6581763625144958, + "learning_rate": 4.9131565623673266e-05, + "loss": 3.3268, + "step": 6274 + }, + { + "epoch": 0.8, + "grad_norm": 0.6184753775596619, + "learning_rate": 4.906987257063489e-05, + "loss": 3.3674, + "step": 6275 + }, + { + "epoch": 0.8, + "grad_norm": 0.6205751299858093, + "learning_rate": 4.900821405981482e-05, + "loss": 3.3093, + "step": 6276 + }, + { + "epoch": 0.8, + "grad_norm": 0.6369295120239258, + "learning_rate": 4.894659010181282e-05, + "loss": 3.1849, + "step": 6277 + }, + { + "epoch": 0.8, + "grad_norm": 0.691066324710846, + "learning_rate": 4.888500070722282e-05, + "loss": 3.3285, + "step": 6278 + }, + { + "epoch": 0.8, + "grad_norm": 0.6090084314346313, + "learning_rate": 4.882344588663271e-05, + "loss": 3.2637, + "step": 6279 + }, + { + "epoch": 0.8, + "grad_norm": 0.6285091638565063, + "learning_rate": 4.87619256506244e-05, + "loss": 3.2199, + "step": 6280 + }, + { + "epoch": 0.8, + "grad_norm": 0.6004552245140076, + "learning_rate": 4.870044000977406e-05, + "loss": 3.2288, + "step": 6281 + }, + { + "epoch": 0.8, + "grad_norm": 0.6280658841133118, + "learning_rate": 4.86389889746518e-05, + "loss": 3.1535, + "step": 6282 + }, + { + "epoch": 0.8, + "grad_norm": 0.6477638483047485, + "learning_rate": 4.857757255582171e-05, + "loss": 3.2006, + "step": 6283 + }, + { + "epoch": 0.8, + "grad_norm": 0.6137582063674927, + "learning_rate": 4.851619076384201e-05, + "loss": 3.3103, + "step": 6284 + }, + { + "epoch": 0.8, + "grad_norm": 0.6008321642875671, + "learning_rate": 4.845484360926489e-05, + "loss": 3.2552, + "step": 6285 + }, + { + "epoch": 0.8, + "grad_norm": 0.6434144973754883, + "learning_rate": 4.8393531102636664e-05, + "loss": 3.3374, + "step": 6286 + }, + { + "epoch": 0.8, + "grad_norm": 0.6633194088935852, + "learning_rate": 4.8332253254497665e-05, + "loss": 3.3185, + "step": 6287 + }, + { + "epoch": 0.8, + "grad_norm": 0.7374275326728821, + "learning_rate": 4.8271010075382205e-05, + "loss": 3.1933, + "step": 6288 + }, + { + "epoch": 0.8, + "grad_norm": 0.6177479028701782, + "learning_rate": 4.8209801575818835e-05, + "loss": 3.1641, + "step": 6289 + }, + { + "epoch": 0.81, + "grad_norm": 0.6840469837188721, + "learning_rate": 4.8148627766329914e-05, + "loss": 3.4058, + "step": 6290 + }, + { + "epoch": 0.81, + "grad_norm": 0.6524780988693237, + "learning_rate": 4.808748865743187e-05, + "loss": 3.3234, + "step": 6291 + }, + { + "epoch": 0.81, + "grad_norm": 0.6769046783447266, + "learning_rate": 4.802638425963537e-05, + "loss": 3.3129, + "step": 6292 + }, + { + "epoch": 0.81, + "grad_norm": 0.6700226068496704, + "learning_rate": 4.796531458344491e-05, + "loss": 3.2718, + "step": 6293 + }, + { + "epoch": 0.81, + "grad_norm": 0.6332312226295471, + "learning_rate": 4.790427963935903e-05, + "loss": 3.314, + "step": 6294 + }, + { + "epoch": 0.81, + "grad_norm": 0.6040202379226685, + "learning_rate": 4.784327943787034e-05, + "loss": 3.2749, + "step": 6295 + }, + { + "epoch": 0.81, + "grad_norm": 0.6835915446281433, + "learning_rate": 4.7782313989465556e-05, + "loss": 3.3133, + "step": 6296 + }, + { + "epoch": 0.81, + "grad_norm": 0.6584433317184448, + "learning_rate": 4.772138330462533e-05, + "loss": 3.1339, + "step": 6297 + }, + { + "epoch": 0.81, + "grad_norm": 0.6218266487121582, + "learning_rate": 4.766048739382431e-05, + "loss": 3.2933, + "step": 6298 + }, + { + "epoch": 0.81, + "grad_norm": 0.6843163371086121, + "learning_rate": 4.7599626267531225e-05, + "loss": 3.3426, + "step": 6299 + }, + { + "epoch": 0.81, + "grad_norm": 0.6411788463592529, + "learning_rate": 4.753879993620877e-05, + "loss": 3.2508, + "step": 6300 + }, + { + "epoch": 0.81, + "grad_norm": 0.6439712643623352, + "learning_rate": 4.7478008410313775e-05, + "loss": 3.1786, + "step": 6301 + }, + { + "epoch": 0.81, + "grad_norm": 0.6473103761672974, + "learning_rate": 4.741725170029693e-05, + "loss": 3.3076, + "step": 6302 + }, + { + "epoch": 0.81, + "grad_norm": 0.6452195644378662, + "learning_rate": 4.73565298166031e-05, + "loss": 3.3539, + "step": 6303 + }, + { + "epoch": 0.81, + "grad_norm": 0.6400468945503235, + "learning_rate": 4.7295842769671053e-05, + "loss": 3.2893, + "step": 6304 + }, + { + "epoch": 0.81, + "grad_norm": 0.6331076622009277, + "learning_rate": 4.723519056993358e-05, + "loss": 3.2634, + "step": 6305 + }, + { + "epoch": 0.81, + "grad_norm": 0.6293200850486755, + "learning_rate": 4.717457322781749e-05, + "loss": 3.2414, + "step": 6306 + }, + { + "epoch": 0.81, + "grad_norm": 0.7003605365753174, + "learning_rate": 4.711399075374362e-05, + "loss": 3.203, + "step": 6307 + }, + { + "epoch": 0.81, + "grad_norm": 0.6010873317718506, + "learning_rate": 4.705344315812682e-05, + "loss": 3.2742, + "step": 6308 + }, + { + "epoch": 0.81, + "grad_norm": 0.6363465189933777, + "learning_rate": 4.699293045137582e-05, + "loss": 3.3849, + "step": 6309 + }, + { + "epoch": 0.81, + "grad_norm": 0.6699336767196655, + "learning_rate": 4.693245264389351e-05, + "loss": 3.36, + "step": 6310 + }, + { + "epoch": 0.81, + "grad_norm": 0.6721777319908142, + "learning_rate": 4.687200974607683e-05, + "loss": 3.4133, + "step": 6311 + }, + { + "epoch": 0.81, + "grad_norm": 0.6349652409553528, + "learning_rate": 4.6811601768316556e-05, + "loss": 3.4149, + "step": 6312 + }, + { + "epoch": 0.81, + "grad_norm": 0.6694733500480652, + "learning_rate": 4.675122872099749e-05, + "loss": 3.2205, + "step": 6313 + }, + { + "epoch": 0.81, + "grad_norm": 0.5913556814193726, + "learning_rate": 4.6690890614498466e-05, + "loss": 3.2009, + "step": 6314 + }, + { + "epoch": 0.81, + "grad_norm": 0.6164645552635193, + "learning_rate": 4.66305874591923e-05, + "loss": 3.3531, + "step": 6315 + }, + { + "epoch": 0.81, + "grad_norm": 0.6450039148330688, + "learning_rate": 4.657031926544575e-05, + "loss": 3.3254, + "step": 6316 + }, + { + "epoch": 0.81, + "grad_norm": 0.6537266373634338, + "learning_rate": 4.651008604361975e-05, + "loss": 3.2913, + "step": 6317 + }, + { + "epoch": 0.81, + "grad_norm": 0.65716952085495, + "learning_rate": 4.6449887804068944e-05, + "loss": 3.3379, + "step": 6318 + }, + { + "epoch": 0.81, + "grad_norm": 0.6729187369346619, + "learning_rate": 4.638972455714224e-05, + "loss": 3.2672, + "step": 6319 + }, + { + "epoch": 0.81, + "grad_norm": 0.6832636594772339, + "learning_rate": 4.632959631318234e-05, + "loss": 3.2275, + "step": 6320 + }, + { + "epoch": 0.81, + "grad_norm": 0.6566662192344666, + "learning_rate": 4.6269503082526e-05, + "loss": 3.2217, + "step": 6321 + }, + { + "epoch": 0.81, + "grad_norm": 0.671359658241272, + "learning_rate": 4.620944487550391e-05, + "loss": 3.4067, + "step": 6322 + }, + { + "epoch": 0.81, + "grad_norm": 0.677776038646698, + "learning_rate": 4.6149421702440743e-05, + "loss": 3.2207, + "step": 6323 + }, + { + "epoch": 0.81, + "grad_norm": 0.6239826679229736, + "learning_rate": 4.6089433573655276e-05, + "loss": 3.231, + "step": 6324 + }, + { + "epoch": 0.81, + "grad_norm": 0.6734199523925781, + "learning_rate": 4.6029480499460095e-05, + "loss": 3.268, + "step": 6325 + }, + { + "epoch": 0.81, + "grad_norm": 0.6146829128265381, + "learning_rate": 4.596956249016188e-05, + "loss": 3.1548, + "step": 6326 + }, + { + "epoch": 0.81, + "grad_norm": 0.6150223612785339, + "learning_rate": 4.590967955606115e-05, + "loss": 3.1679, + "step": 6327 + }, + { + "epoch": 0.81, + "grad_norm": 0.690636396408081, + "learning_rate": 4.5849831707452496e-05, + "loss": 3.2218, + "step": 6328 + }, + { + "epoch": 0.81, + "grad_norm": 0.6366089582443237, + "learning_rate": 4.579001895462453e-05, + "loss": 3.2026, + "step": 6329 + }, + { + "epoch": 0.81, + "grad_norm": 0.6616598963737488, + "learning_rate": 4.573024130785972e-05, + "loss": 3.2072, + "step": 6330 + }, + { + "epoch": 0.81, + "grad_norm": 0.6787472367286682, + "learning_rate": 4.5670498777434456e-05, + "loss": 3.2212, + "step": 6331 + }, + { + "epoch": 0.81, + "grad_norm": 0.6337720155715942, + "learning_rate": 4.561079137361932e-05, + "loss": 3.2873, + "step": 6332 + }, + { + "epoch": 0.81, + "grad_norm": 0.680767834186554, + "learning_rate": 4.555111910667861e-05, + "loss": 3.4622, + "step": 6333 + }, + { + "epoch": 0.81, + "grad_norm": 0.5902546048164368, + "learning_rate": 4.5491481986870726e-05, + "loss": 3.0807, + "step": 6334 + }, + { + "epoch": 0.81, + "grad_norm": 0.6405333280563354, + "learning_rate": 4.543188002444795e-05, + "loss": 3.3239, + "step": 6335 + }, + { + "epoch": 0.81, + "grad_norm": 0.6532682180404663, + "learning_rate": 4.537231322965654e-05, + "loss": 3.3386, + "step": 6336 + }, + { + "epoch": 0.81, + "grad_norm": 0.6565823554992676, + "learning_rate": 4.531278161273667e-05, + "loss": 3.2401, + "step": 6337 + }, + { + "epoch": 0.81, + "grad_norm": 0.6898584961891174, + "learning_rate": 4.5253285183922574e-05, + "loss": 3.4491, + "step": 6338 + }, + { + "epoch": 0.81, + "grad_norm": 0.6294739246368408, + "learning_rate": 4.519382395344246e-05, + "loss": 3.2714, + "step": 6339 + }, + { + "epoch": 0.81, + "grad_norm": 0.673034131526947, + "learning_rate": 4.513439793151833e-05, + "loss": 3.372, + "step": 6340 + }, + { + "epoch": 0.81, + "grad_norm": 0.6405984163284302, + "learning_rate": 4.507500712836621e-05, + "loss": 3.3182, + "step": 6341 + }, + { + "epoch": 0.81, + "grad_norm": 0.619519054889679, + "learning_rate": 4.5015651554196015e-05, + "loss": 3.3016, + "step": 6342 + }, + { + "epoch": 0.81, + "grad_norm": 0.5913485288619995, + "learning_rate": 4.495633121921175e-05, + "loss": 3.3034, + "step": 6343 + }, + { + "epoch": 0.81, + "grad_norm": 0.6833899021148682, + "learning_rate": 4.489704613361112e-05, + "loss": 3.401, + "step": 6344 + }, + { + "epoch": 0.81, + "grad_norm": 0.7165760397911072, + "learning_rate": 4.4837796307586085e-05, + "loss": 3.3631, + "step": 6345 + }, + { + "epoch": 0.81, + "grad_norm": 0.6703233122825623, + "learning_rate": 4.477858175132227e-05, + "loss": 3.1297, + "step": 6346 + }, + { + "epoch": 0.81, + "grad_norm": 0.6346496939659119, + "learning_rate": 4.4719402474999424e-05, + "loss": 3.2481, + "step": 6347 + }, + { + "epoch": 0.81, + "grad_norm": 0.6317062973976135, + "learning_rate": 4.4660258488791125e-05, + "loss": 3.2427, + "step": 6348 + }, + { + "epoch": 0.81, + "grad_norm": 0.6202000379562378, + "learning_rate": 4.4601149802864864e-05, + "loss": 3.3742, + "step": 6349 + }, + { + "epoch": 0.81, + "grad_norm": 0.655680775642395, + "learning_rate": 4.454207642738217e-05, + "loss": 3.212, + "step": 6350 + }, + { + "epoch": 0.81, + "grad_norm": 0.6488626599311829, + "learning_rate": 4.4483038372498397e-05, + "loss": 3.3244, + "step": 6351 + }, + { + "epoch": 0.81, + "grad_norm": 0.6419368386268616, + "learning_rate": 4.4424035648362836e-05, + "loss": 3.3423, + "step": 6352 + }, + { + "epoch": 0.81, + "grad_norm": 0.6373838186264038, + "learning_rate": 4.4365068265118825e-05, + "loss": 3.2937, + "step": 6353 + }, + { + "epoch": 0.81, + "grad_norm": 0.6217588186264038, + "learning_rate": 4.4306136232903524e-05, + "loss": 3.2487, + "step": 6354 + }, + { + "epoch": 0.81, + "grad_norm": 0.6756614446640015, + "learning_rate": 4.424723956184795e-05, + "loss": 3.2515, + "step": 6355 + }, + { + "epoch": 0.81, + "grad_norm": 0.641849935054779, + "learning_rate": 4.418837826207725e-05, + "loss": 3.3007, + "step": 6356 + }, + { + "epoch": 0.81, + "grad_norm": 0.6056436896324158, + "learning_rate": 4.41295523437103e-05, + "loss": 3.2913, + "step": 6357 + }, + { + "epoch": 0.81, + "grad_norm": 0.6423781514167786, + "learning_rate": 4.407076181685996e-05, + "loss": 3.2333, + "step": 6358 + }, + { + "epoch": 0.81, + "grad_norm": 0.6782287955284119, + "learning_rate": 4.401200669163291e-05, + "loss": 3.4017, + "step": 6359 + }, + { + "epoch": 0.81, + "grad_norm": 0.5897096395492554, + "learning_rate": 4.3953286978130005e-05, + "loss": 3.1897, + "step": 6360 + }, + { + "epoch": 0.81, + "grad_norm": 0.6132126450538635, + "learning_rate": 4.389460268644577e-05, + "loss": 3.3616, + "step": 6361 + }, + { + "epoch": 0.81, + "grad_norm": 0.6592351198196411, + "learning_rate": 4.383595382666872e-05, + "loss": 3.2439, + "step": 6362 + }, + { + "epoch": 0.81, + "grad_norm": 0.6139822602272034, + "learning_rate": 4.3777340408881263e-05, + "loss": 3.2914, + "step": 6363 + }, + { + "epoch": 0.81, + "grad_norm": 0.6376251578330994, + "learning_rate": 4.3718762443159725e-05, + "loss": 3.3195, + "step": 6364 + }, + { + "epoch": 0.81, + "grad_norm": 0.6536754965782166, + "learning_rate": 4.366021993957428e-05, + "loss": 3.314, + "step": 6365 + }, + { + "epoch": 0.81, + "grad_norm": 0.6610123515129089, + "learning_rate": 4.360171290818918e-05, + "loss": 3.3501, + "step": 6366 + }, + { + "epoch": 0.81, + "grad_norm": 0.6074190735816956, + "learning_rate": 4.354324135906235e-05, + "loss": 3.2911, + "step": 6367 + }, + { + "epoch": 0.82, + "grad_norm": 0.6405759453773499, + "learning_rate": 4.3484805302245826e-05, + "loss": 3.2542, + "step": 6368 + }, + { + "epoch": 0.82, + "grad_norm": 0.6312056183815002, + "learning_rate": 4.342640474778542e-05, + "loss": 3.2063, + "step": 6369 + }, + { + "epoch": 0.82, + "grad_norm": 0.6049206256866455, + "learning_rate": 4.3368039705720844e-05, + "loss": 3.3249, + "step": 6370 + }, + { + "epoch": 0.82, + "grad_norm": 0.6044682264328003, + "learning_rate": 4.33097101860857e-05, + "loss": 3.306, + "step": 6371 + }, + { + "epoch": 0.82, + "grad_norm": 0.6150915622711182, + "learning_rate": 4.3251416198907576e-05, + "loss": 3.3062, + "step": 6372 + }, + { + "epoch": 0.82, + "grad_norm": 0.6390911936759949, + "learning_rate": 4.319315775420776e-05, + "loss": 3.2433, + "step": 6373 + }, + { + "epoch": 0.82, + "grad_norm": 0.5915288329124451, + "learning_rate": 4.3134934862001624e-05, + "loss": 3.2095, + "step": 6374 + }, + { + "epoch": 0.82, + "grad_norm": 0.6328829526901245, + "learning_rate": 4.307674753229846e-05, + "loss": 3.4263, + "step": 6375 + }, + { + "epoch": 0.82, + "grad_norm": 0.6330004930496216, + "learning_rate": 4.301859577510123e-05, + "loss": 3.1894, + "step": 6376 + }, + { + "epoch": 0.82, + "grad_norm": 0.6698534488677979, + "learning_rate": 4.2960479600406917e-05, + "loss": 3.3107, + "step": 6377 + }, + { + "epoch": 0.82, + "grad_norm": 0.6564880609512329, + "learning_rate": 4.290239901820639e-05, + "loss": 3.346, + "step": 6378 + }, + { + "epoch": 0.82, + "grad_norm": 0.5670497417449951, + "learning_rate": 4.284435403848436e-05, + "loss": 3.2161, + "step": 6379 + }, + { + "epoch": 0.82, + "grad_norm": 0.5879086852073669, + "learning_rate": 4.2786344671219334e-05, + "loss": 3.2026, + "step": 6380 + }, + { + "epoch": 0.82, + "grad_norm": 0.6338858008384705, + "learning_rate": 4.2728370926383956e-05, + "loss": 3.22, + "step": 6381 + }, + { + "epoch": 0.82, + "grad_norm": 0.6421663761138916, + "learning_rate": 4.267043281394453e-05, + "loss": 3.3388, + "step": 6382 + }, + { + "epoch": 0.82, + "grad_norm": 0.6467295289039612, + "learning_rate": 4.26125303438612e-05, + "loss": 3.3544, + "step": 6383 + }, + { + "epoch": 0.82, + "grad_norm": 0.6295877695083618, + "learning_rate": 4.255466352608822e-05, + "loss": 3.2793, + "step": 6384 + }, + { + "epoch": 0.82, + "grad_norm": 0.6589932441711426, + "learning_rate": 4.2496832370573476e-05, + "loss": 3.2592, + "step": 6385 + }, + { + "epoch": 0.82, + "grad_norm": 0.6921527981758118, + "learning_rate": 4.2439036887258837e-05, + "loss": 3.1894, + "step": 6386 + }, + { + "epoch": 0.82, + "grad_norm": 0.6357716917991638, + "learning_rate": 4.2381277086080026e-05, + "loss": 3.2358, + "step": 6387 + }, + { + "epoch": 0.82, + "grad_norm": 0.6349096298217773, + "learning_rate": 4.2323552976966525e-05, + "loss": 3.3883, + "step": 6388 + }, + { + "epoch": 0.82, + "grad_norm": 0.6470059156417847, + "learning_rate": 4.226586456984191e-05, + "loss": 3.1993, + "step": 6389 + }, + { + "epoch": 0.82, + "grad_norm": 0.6028220653533936, + "learning_rate": 4.220821187462345e-05, + "loss": 3.1472, + "step": 6390 + }, + { + "epoch": 0.82, + "grad_norm": 0.6410771608352661, + "learning_rate": 4.2150594901222306e-05, + "loss": 3.2464, + "step": 6391 + }, + { + "epoch": 0.82, + "grad_norm": 0.6089470386505127, + "learning_rate": 4.209301365954343e-05, + "loss": 3.2082, + "step": 6392 + }, + { + "epoch": 0.82, + "grad_norm": 0.6100988984107971, + "learning_rate": 4.2035468159485835e-05, + "loss": 3.3006, + "step": 6393 + }, + { + "epoch": 0.82, + "grad_norm": 0.6550276875495911, + "learning_rate": 4.197795841094221e-05, + "loss": 3.2269, + "step": 6394 + }, + { + "epoch": 0.82, + "grad_norm": 0.6482765078544617, + "learning_rate": 4.192048442379903e-05, + "loss": 3.4126, + "step": 6395 + }, + { + "epoch": 0.82, + "grad_norm": 0.6182270646095276, + "learning_rate": 4.1863046207936934e-05, + "loss": 3.3065, + "step": 6396 + }, + { + "epoch": 0.82, + "grad_norm": 0.6417246460914612, + "learning_rate": 4.180564377323012e-05, + "loss": 3.2214, + "step": 6397 + }, + { + "epoch": 0.82, + "grad_norm": 0.6832561492919922, + "learning_rate": 4.1748277129546735e-05, + "loss": 3.441, + "step": 6398 + }, + { + "epoch": 0.82, + "grad_norm": 0.6649682521820068, + "learning_rate": 4.169094628674877e-05, + "loss": 3.3143, + "step": 6399 + }, + { + "epoch": 0.82, + "grad_norm": 0.6525314450263977, + "learning_rate": 4.163365125469207e-05, + "loss": 3.1577, + "step": 6400 + }, + { + "epoch": 0.82, + "grad_norm": 0.7238406538963318, + "learning_rate": 4.157639204322625e-05, + "loss": 3.3571, + "step": 6401 + }, + { + "epoch": 0.82, + "grad_norm": 0.6186272501945496, + "learning_rate": 4.1519168662194935e-05, + "loss": 3.277, + "step": 6402 + }, + { + "epoch": 0.82, + "grad_norm": 0.605095624923706, + "learning_rate": 4.14619811214354e-05, + "loss": 3.3446, + "step": 6403 + }, + { + "epoch": 0.82, + "grad_norm": 0.643415093421936, + "learning_rate": 4.140482943077895e-05, + "loss": 3.3538, + "step": 6404 + }, + { + "epoch": 0.82, + "grad_norm": 0.6669523119926453, + "learning_rate": 4.1347713600050547e-05, + "loss": 3.2147, + "step": 6405 + }, + { + "epoch": 0.82, + "grad_norm": 0.6525630354881287, + "learning_rate": 4.12906336390691e-05, + "loss": 3.2834, + "step": 6406 + }, + { + "epoch": 0.82, + "grad_norm": 0.6735588908195496, + "learning_rate": 4.123358955764728e-05, + "loss": 3.4095, + "step": 6407 + }, + { + "epoch": 0.82, + "grad_norm": 0.6865549087524414, + "learning_rate": 4.117658136559166e-05, + "loss": 3.3061, + "step": 6408 + }, + { + "epoch": 0.82, + "grad_norm": 0.6206440925598145, + "learning_rate": 4.1119609072702515e-05, + "loss": 3.2481, + "step": 6409 + }, + { + "epoch": 0.82, + "grad_norm": 0.6011683940887451, + "learning_rate": 4.10626726887742e-05, + "loss": 3.1444, + "step": 6410 + }, + { + "epoch": 0.82, + "grad_norm": 0.6887205243110657, + "learning_rate": 4.1005772223594583e-05, + "loss": 3.3052, + "step": 6411 + }, + { + "epoch": 0.82, + "grad_norm": 0.643696129322052, + "learning_rate": 4.0948907686945675e-05, + "loss": 3.326, + "step": 6412 + }, + { + "epoch": 0.82, + "grad_norm": 0.647750198841095, + "learning_rate": 4.089207908860304e-05, + "loss": 3.2997, + "step": 6413 + }, + { + "epoch": 0.82, + "grad_norm": 0.6211200952529907, + "learning_rate": 4.08352864383362e-05, + "loss": 3.3226, + "step": 6414 + }, + { + "epoch": 0.82, + "grad_norm": 0.6293831467628479, + "learning_rate": 4.077852974590846e-05, + "loss": 3.2841, + "step": 6415 + }, + { + "epoch": 0.82, + "grad_norm": 0.6573365330696106, + "learning_rate": 4.0721809021076916e-05, + "loss": 3.3569, + "step": 6416 + }, + { + "epoch": 0.82, + "grad_norm": 0.6426891088485718, + "learning_rate": 4.066512427359262e-05, + "loss": 3.2425, + "step": 6417 + }, + { + "epoch": 0.82, + "grad_norm": 0.6337445974349976, + "learning_rate": 4.060847551320027e-05, + "loss": 3.2817, + "step": 6418 + }, + { + "epoch": 0.82, + "grad_norm": 0.6924945116043091, + "learning_rate": 4.055186274963846e-05, + "loss": 3.263, + "step": 6419 + }, + { + "epoch": 0.82, + "grad_norm": 0.6352530717849731, + "learning_rate": 4.0495285992639494e-05, + "loss": 3.3007, + "step": 6420 + }, + { + "epoch": 0.82, + "grad_norm": 0.6801749467849731, + "learning_rate": 4.0438745251929756e-05, + "loss": 3.2665, + "step": 6421 + }, + { + "epoch": 0.82, + "grad_norm": 0.6209421753883362, + "learning_rate": 4.038224053722911e-05, + "loss": 3.2686, + "step": 6422 + }, + { + "epoch": 0.82, + "grad_norm": 0.6174101829528809, + "learning_rate": 4.0325771858251424e-05, + "loss": 3.1971, + "step": 6423 + }, + { + "epoch": 0.82, + "grad_norm": 0.6516974568367004, + "learning_rate": 4.026933922470427e-05, + "loss": 3.2522, + "step": 6424 + }, + { + "epoch": 0.82, + "grad_norm": 0.6284974813461304, + "learning_rate": 4.021294264628914e-05, + "loss": 3.3534, + "step": 6425 + }, + { + "epoch": 0.82, + "grad_norm": 0.6337777972221375, + "learning_rate": 4.015658213270126e-05, + "loss": 3.3124, + "step": 6426 + }, + { + "epoch": 0.82, + "grad_norm": 0.6753039956092834, + "learning_rate": 4.01002576936296e-05, + "loss": 3.2471, + "step": 6427 + }, + { + "epoch": 0.82, + "grad_norm": 0.6429099440574646, + "learning_rate": 4.004396933875701e-05, + "loss": 3.1807, + "step": 6428 + }, + { + "epoch": 0.82, + "grad_norm": 0.6312372088432312, + "learning_rate": 3.9987717077760054e-05, + "loss": 3.326, + "step": 6429 + }, + { + "epoch": 0.82, + "grad_norm": 0.6393996477127075, + "learning_rate": 3.993150092030928e-05, + "loss": 3.3049, + "step": 6430 + }, + { + "epoch": 0.82, + "grad_norm": 0.6306151151657104, + "learning_rate": 3.987532087606874e-05, + "loss": 3.3042, + "step": 6431 + }, + { + "epoch": 0.82, + "grad_norm": 0.6168709993362427, + "learning_rate": 3.981917695469658e-05, + "loss": 3.2459, + "step": 6432 + }, + { + "epoch": 0.82, + "grad_norm": 0.6085794568061829, + "learning_rate": 3.9763069165844546e-05, + "loss": 3.4072, + "step": 6433 + }, + { + "epoch": 0.82, + "grad_norm": 0.6100152730941772, + "learning_rate": 3.9706997519158185e-05, + "loss": 3.2662, + "step": 6434 + }, + { + "epoch": 0.82, + "grad_norm": 0.7263420820236206, + "learning_rate": 3.965096202427687e-05, + "loss": 3.1309, + "step": 6435 + }, + { + "epoch": 0.82, + "grad_norm": 0.6868110299110413, + "learning_rate": 3.959496269083376e-05, + "loss": 3.4455, + "step": 6436 + }, + { + "epoch": 0.82, + "grad_norm": 0.6217072606086731, + "learning_rate": 3.953899952845572e-05, + "loss": 3.2807, + "step": 6437 + }, + { + "epoch": 0.82, + "grad_norm": 0.6387251019477844, + "learning_rate": 3.9483072546763615e-05, + "loss": 3.2012, + "step": 6438 + }, + { + "epoch": 0.82, + "grad_norm": 0.6551047563552856, + "learning_rate": 3.9427181755371774e-05, + "loss": 3.2802, + "step": 6439 + }, + { + "epoch": 0.82, + "grad_norm": 0.626193642616272, + "learning_rate": 3.937132716388864e-05, + "loss": 3.2716, + "step": 6440 + }, + { + "epoch": 0.82, + "grad_norm": 0.6018974781036377, + "learning_rate": 3.931550878191617e-05, + "loss": 3.286, + "step": 6441 + }, + { + "epoch": 0.82, + "grad_norm": 0.6675443649291992, + "learning_rate": 3.9259726619050225e-05, + "loss": 3.2788, + "step": 6442 + }, + { + "epoch": 0.82, + "grad_norm": 0.6622194051742554, + "learning_rate": 3.920398068488037e-05, + "loss": 3.244, + "step": 6443 + }, + { + "epoch": 0.82, + "grad_norm": 0.6131778359413147, + "learning_rate": 3.914827098898999e-05, + "loss": 3.3079, + "step": 6444 + }, + { + "epoch": 0.82, + "grad_norm": 0.6365820169448853, + "learning_rate": 3.909259754095617e-05, + "loss": 3.3133, + "step": 6445 + }, + { + "epoch": 0.83, + "grad_norm": 0.618618369102478, + "learning_rate": 3.9036960350349954e-05, + "loss": 3.3683, + "step": 6446 + }, + { + "epoch": 0.83, + "grad_norm": 0.6009793877601624, + "learning_rate": 3.8981359426735945e-05, + "loss": 3.2897, + "step": 6447 + }, + { + "epoch": 0.83, + "grad_norm": 0.6588253378868103, + "learning_rate": 3.892579477967253e-05, + "loss": 3.3201, + "step": 6448 + }, + { + "epoch": 0.83, + "grad_norm": 0.6340826749801636, + "learning_rate": 3.887026641871203e-05, + "loss": 3.2024, + "step": 6449 + }, + { + "epoch": 0.83, + "grad_norm": 0.6288912892341614, + "learning_rate": 3.8814774353400335e-05, + "loss": 3.3785, + "step": 6450 + }, + { + "epoch": 0.83, + "grad_norm": 0.6233227252960205, + "learning_rate": 3.875931859327722e-05, + "loss": 3.3332, + "step": 6451 + }, + { + "epoch": 0.83, + "grad_norm": 0.6542572975158691, + "learning_rate": 3.870389914787609e-05, + "loss": 3.3217, + "step": 6452 + }, + { + "epoch": 0.83, + "grad_norm": 0.6089885234832764, + "learning_rate": 3.8648516026724314e-05, + "loss": 3.3098, + "step": 6453 + }, + { + "epoch": 0.83, + "grad_norm": 0.6347541213035583, + "learning_rate": 3.859316923934284e-05, + "loss": 3.2931, + "step": 6454 + }, + { + "epoch": 0.83, + "grad_norm": 0.6597635746002197, + "learning_rate": 3.8537858795246404e-05, + "loss": 3.2143, + "step": 6455 + }, + { + "epoch": 0.83, + "grad_norm": 0.6348395943641663, + "learning_rate": 3.84825847039435e-05, + "loss": 3.2582, + "step": 6456 + }, + { + "epoch": 0.83, + "grad_norm": 0.6304396390914917, + "learning_rate": 3.8427346974936355e-05, + "loss": 3.2717, + "step": 6457 + }, + { + "epoch": 0.83, + "grad_norm": 0.6483598351478577, + "learning_rate": 3.837214561772109e-05, + "loss": 3.3369, + "step": 6458 + }, + { + "epoch": 0.83, + "grad_norm": 0.6771075129508972, + "learning_rate": 3.83169806417874e-05, + "loss": 3.3749, + "step": 6459 + }, + { + "epoch": 0.83, + "grad_norm": 0.6419274806976318, + "learning_rate": 3.826185205661872e-05, + "loss": 3.2732, + "step": 6460 + }, + { + "epoch": 0.83, + "grad_norm": 0.6026401519775391, + "learning_rate": 3.820675987169239e-05, + "loss": 3.2216, + "step": 6461 + }, + { + "epoch": 0.83, + "grad_norm": 0.6403732299804688, + "learning_rate": 3.815170409647939e-05, + "loss": 3.3156, + "step": 6462 + }, + { + "epoch": 0.83, + "grad_norm": 0.6257762908935547, + "learning_rate": 3.8096684740444395e-05, + "loss": 3.2642, + "step": 6463 + }, + { + "epoch": 0.83, + "grad_norm": 0.6160749197006226, + "learning_rate": 3.804170181304587e-05, + "loss": 3.26, + "step": 6464 + }, + { + "epoch": 0.83, + "grad_norm": 0.6488245725631714, + "learning_rate": 3.798675532373605e-05, + "loss": 3.3118, + "step": 6465 + }, + { + "epoch": 0.83, + "grad_norm": 0.6134005188941956, + "learning_rate": 3.7931845281960807e-05, + "loss": 3.2695, + "step": 6466 + }, + { + "epoch": 0.83, + "grad_norm": 0.6293632388114929, + "learning_rate": 3.787697169715984e-05, + "loss": 3.3878, + "step": 6467 + }, + { + "epoch": 0.83, + "grad_norm": 0.6290237903594971, + "learning_rate": 3.7822134578766654e-05, + "loss": 3.3034, + "step": 6468 + }, + { + "epoch": 0.83, + "grad_norm": 0.642416775226593, + "learning_rate": 3.776733393620832e-05, + "loss": 3.2255, + "step": 6469 + }, + { + "epoch": 0.83, + "grad_norm": 0.6392046213150024, + "learning_rate": 3.771256977890569e-05, + "loss": 3.2484, + "step": 6470 + }, + { + "epoch": 0.83, + "grad_norm": 0.6874974370002747, + "learning_rate": 3.765784211627335e-05, + "loss": 3.3158, + "step": 6471 + }, + { + "epoch": 0.83, + "grad_norm": 0.61274653673172, + "learning_rate": 3.760315095771966e-05, + "loss": 3.2667, + "step": 6472 + }, + { + "epoch": 0.83, + "grad_norm": 0.6175403594970703, + "learning_rate": 3.754849631264653e-05, + "loss": 3.2756, + "step": 6473 + }, + { + "epoch": 0.83, + "grad_norm": 0.6124956011772156, + "learning_rate": 3.749387819044994e-05, + "loss": 3.3829, + "step": 6474 + }, + { + "epoch": 0.83, + "grad_norm": 0.6410250067710876, + "learning_rate": 3.743929660051923e-05, + "loss": 3.27, + "step": 6475 + }, + { + "epoch": 0.83, + "grad_norm": 0.6674247980117798, + "learning_rate": 3.738475155223761e-05, + "loss": 3.2707, + "step": 6476 + }, + { + "epoch": 0.83, + "grad_norm": 0.6086504459381104, + "learning_rate": 3.7330243054982116e-05, + "loss": 3.3489, + "step": 6477 + }, + { + "epoch": 0.83, + "grad_norm": 0.5963006615638733, + "learning_rate": 3.7275771118123305e-05, + "loss": 3.1831, + "step": 6478 + }, + { + "epoch": 0.83, + "grad_norm": 0.6756417155265808, + "learning_rate": 3.722133575102554e-05, + "loss": 3.3405, + "step": 6479 + }, + { + "epoch": 0.83, + "grad_norm": 0.5943170189857483, + "learning_rate": 3.7166936963046933e-05, + "loss": 3.2267, + "step": 6480 + }, + { + "epoch": 0.83, + "grad_norm": 0.5919336676597595, + "learning_rate": 3.711257476353916e-05, + "loss": 3.3203, + "step": 6481 + }, + { + "epoch": 0.83, + "grad_norm": 0.6363521218299866, + "learning_rate": 3.705824916184783e-05, + "loss": 3.2974, + "step": 6482 + }, + { + "epoch": 0.83, + "grad_norm": 0.5848808288574219, + "learning_rate": 3.700396016731214e-05, + "loss": 3.1555, + "step": 6483 + }, + { + "epoch": 0.83, + "grad_norm": 0.658295214176178, + "learning_rate": 3.694970778926493e-05, + "loss": 3.2425, + "step": 6484 + }, + { + "epoch": 0.83, + "grad_norm": 0.5989205241203308, + "learning_rate": 3.6895492037032766e-05, + "loss": 3.2431, + "step": 6485 + }, + { + "epoch": 0.83, + "grad_norm": 0.6400282382965088, + "learning_rate": 3.684131291993614e-05, + "loss": 3.3204, + "step": 6486 + }, + { + "epoch": 0.83, + "grad_norm": 0.6289013028144836, + "learning_rate": 3.678717044728894e-05, + "loss": 3.297, + "step": 6487 + }, + { + "epoch": 0.83, + "grad_norm": 0.6446638703346252, + "learning_rate": 3.6733064628398874e-05, + "loss": 3.2334, + "step": 6488 + }, + { + "epoch": 0.83, + "grad_norm": 0.6189967393875122, + "learning_rate": 3.667899547256745e-05, + "loss": 3.3623, + "step": 6489 + }, + { + "epoch": 0.83, + "grad_norm": 0.6518852710723877, + "learning_rate": 3.6624962989089736e-05, + "loss": 3.4634, + "step": 6490 + }, + { + "epoch": 0.83, + "grad_norm": 0.6169833540916443, + "learning_rate": 3.657096718725456e-05, + "loss": 3.3121, + "step": 6491 + }, + { + "epoch": 0.83, + "grad_norm": 0.6083911061286926, + "learning_rate": 3.651700807634439e-05, + "loss": 3.3423, + "step": 6492 + }, + { + "epoch": 0.83, + "grad_norm": 0.6496937870979309, + "learning_rate": 3.646308566563541e-05, + "loss": 3.3, + "step": 6493 + }, + { + "epoch": 0.83, + "grad_norm": 0.6268054842948914, + "learning_rate": 3.640919996439751e-05, + "loss": 3.4026, + "step": 6494 + }, + { + "epoch": 0.83, + "grad_norm": 0.6139602661132812, + "learning_rate": 3.63553509818943e-05, + "loss": 3.1487, + "step": 6495 + }, + { + "epoch": 0.83, + "grad_norm": 0.6239408254623413, + "learning_rate": 3.6301538727383094e-05, + "loss": 3.3932, + "step": 6496 + }, + { + "epoch": 0.83, + "grad_norm": 0.6659420728683472, + "learning_rate": 3.624776321011478e-05, + "loss": 3.2684, + "step": 6497 + }, + { + "epoch": 0.83, + "grad_norm": 0.6213231682777405, + "learning_rate": 3.619402443933398e-05, + "loss": 3.1686, + "step": 6498 + }, + { + "epoch": 0.83, + "grad_norm": 0.6232683658599854, + "learning_rate": 3.614032242427903e-05, + "loss": 3.3488, + "step": 6499 + }, + { + "epoch": 0.83, + "grad_norm": 0.604383111000061, + "learning_rate": 3.608665717418197e-05, + "loss": 3.2447, + "step": 6500 + }, + { + "epoch": 0.83, + "grad_norm": 0.5992885231971741, + "learning_rate": 3.603302869826841e-05, + "loss": 3.2639, + "step": 6501 + }, + { + "epoch": 0.83, + "grad_norm": 0.6617333292961121, + "learning_rate": 3.5979437005757674e-05, + "loss": 3.2442, + "step": 6502 + }, + { + "epoch": 0.83, + "grad_norm": 0.6387820243835449, + "learning_rate": 3.592588210586287e-05, + "loss": 3.2685, + "step": 6503 + }, + { + "epoch": 0.83, + "grad_norm": 0.6541626453399658, + "learning_rate": 3.587236400779073e-05, + "loss": 3.1938, + "step": 6504 + }, + { + "epoch": 0.83, + "grad_norm": 0.6782626509666443, + "learning_rate": 3.58188827207416e-05, + "loss": 3.3333, + "step": 6505 + }, + { + "epoch": 0.83, + "grad_norm": 0.6350370049476624, + "learning_rate": 3.576543825390954e-05, + "loss": 3.3704, + "step": 6506 + }, + { + "epoch": 0.83, + "grad_norm": 0.6781575083732605, + "learning_rate": 3.571203061648226e-05, + "loss": 3.3183, + "step": 6507 + }, + { + "epoch": 0.83, + "grad_norm": 0.6166051030158997, + "learning_rate": 3.565865981764116e-05, + "loss": 3.178, + "step": 6508 + }, + { + "epoch": 0.83, + "grad_norm": 0.6483914256095886, + "learning_rate": 3.560532586656126e-05, + "loss": 3.3736, + "step": 6509 + }, + { + "epoch": 0.83, + "grad_norm": 0.6345580220222473, + "learning_rate": 3.555202877241134e-05, + "loss": 3.1971, + "step": 6510 + }, + { + "epoch": 0.83, + "grad_norm": 0.6691543459892273, + "learning_rate": 3.54987685443538e-05, + "loss": 3.2858, + "step": 6511 + }, + { + "epoch": 0.83, + "grad_norm": 0.6558595895767212, + "learning_rate": 3.544554519154464e-05, + "loss": 3.2686, + "step": 6512 + }, + { + "epoch": 0.83, + "grad_norm": 0.6313934922218323, + "learning_rate": 3.539235872313354e-05, + "loss": 3.379, + "step": 6513 + }, + { + "epoch": 0.83, + "grad_norm": 0.6743571758270264, + "learning_rate": 3.533920914826397e-05, + "loss": 3.2874, + "step": 6514 + }, + { + "epoch": 0.83, + "grad_norm": 0.6316173672676086, + "learning_rate": 3.528609647607289e-05, + "loss": 3.2425, + "step": 6515 + }, + { + "epoch": 0.83, + "grad_norm": 0.6446629762649536, + "learning_rate": 3.523302071569098e-05, + "loss": 3.2588, + "step": 6516 + }, + { + "epoch": 0.83, + "grad_norm": 0.6505565047264099, + "learning_rate": 3.5179981876242634e-05, + "loss": 3.3203, + "step": 6517 + }, + { + "epoch": 0.83, + "grad_norm": 0.6769336462020874, + "learning_rate": 3.5126979966845826e-05, + "loss": 3.3582, + "step": 6518 + }, + { + "epoch": 0.83, + "grad_norm": 0.6479410529136658, + "learning_rate": 3.50740149966122e-05, + "loss": 3.353, + "step": 6519 + }, + { + "epoch": 0.83, + "grad_norm": 0.6441712975502014, + "learning_rate": 3.502108697464701e-05, + "loss": 3.2151, + "step": 6520 + }, + { + "epoch": 0.83, + "grad_norm": 0.6619863510131836, + "learning_rate": 3.496819591004921e-05, + "loss": 3.2887, + "step": 6521 + }, + { + "epoch": 0.83, + "grad_norm": 0.6329262256622314, + "learning_rate": 3.491534181191136e-05, + "loss": 3.2962, + "step": 6522 + }, + { + "epoch": 0.83, + "grad_norm": 0.6719884872436523, + "learning_rate": 3.4862524689319776e-05, + "loss": 3.3552, + "step": 6523 + }, + { + "epoch": 0.84, + "grad_norm": 0.6617701649665833, + "learning_rate": 3.480974455135422e-05, + "loss": 3.2037, + "step": 6524 + }, + { + "epoch": 0.84, + "grad_norm": 0.6485932469367981, + "learning_rate": 3.4757001407088346e-05, + "loss": 3.2314, + "step": 6525 + }, + { + "epoch": 0.84, + "grad_norm": 0.6022593975067139, + "learning_rate": 3.470429526558921e-05, + "loss": 3.1837, + "step": 6526 + }, + { + "epoch": 0.84, + "grad_norm": 0.6153969168663025, + "learning_rate": 3.465162613591769e-05, + "loss": 3.3104, + "step": 6527 + }, + { + "epoch": 0.84, + "grad_norm": 0.6110339164733887, + "learning_rate": 3.459899402712813e-05, + "loss": 3.2397, + "step": 6528 + }, + { + "epoch": 0.84, + "grad_norm": 0.620966911315918, + "learning_rate": 3.4546398948268665e-05, + "loss": 3.318, + "step": 6529 + }, + { + "epoch": 0.84, + "grad_norm": 0.6802229881286621, + "learning_rate": 3.4493840908380895e-05, + "loss": 3.3426, + "step": 6530 + }, + { + "epoch": 0.84, + "grad_norm": 0.6216890215873718, + "learning_rate": 3.444131991650024e-05, + "loss": 3.2002, + "step": 6531 + }, + { + "epoch": 0.84, + "grad_norm": 0.6214625239372253, + "learning_rate": 3.4388835981655737e-05, + "loss": 3.2711, + "step": 6532 + }, + { + "epoch": 0.84, + "grad_norm": 0.6334563493728638, + "learning_rate": 3.433638911286987e-05, + "loss": 3.2746, + "step": 6533 + }, + { + "epoch": 0.84, + "grad_norm": 0.6457828283309937, + "learning_rate": 3.428397931915894e-05, + "loss": 3.2577, + "step": 6534 + }, + { + "epoch": 0.84, + "grad_norm": 0.6323023438453674, + "learning_rate": 3.423160660953276e-05, + "loss": 3.1932, + "step": 6535 + }, + { + "epoch": 0.84, + "grad_norm": 0.6734870672225952, + "learning_rate": 3.417927099299478e-05, + "loss": 3.2799, + "step": 6536 + }, + { + "epoch": 0.84, + "grad_norm": 0.692999541759491, + "learning_rate": 3.4126972478542076e-05, + "loss": 3.3364, + "step": 6537 + }, + { + "epoch": 0.84, + "grad_norm": 0.6212725639343262, + "learning_rate": 3.407471107516549e-05, + "loss": 3.2772, + "step": 6538 + }, + { + "epoch": 0.84, + "grad_norm": 0.6048080325126648, + "learning_rate": 3.402248679184927e-05, + "loss": 3.2096, + "step": 6539 + }, + { + "epoch": 0.84, + "grad_norm": 0.6433602571487427, + "learning_rate": 3.397029963757134e-05, + "loss": 3.306, + "step": 6540 + }, + { + "epoch": 0.84, + "grad_norm": 0.6071273684501648, + "learning_rate": 3.391814962130341e-05, + "loss": 3.1467, + "step": 6541 + }, + { + "epoch": 0.84, + "grad_norm": 0.6584873795509338, + "learning_rate": 3.3866036752010585e-05, + "loss": 3.3005, + "step": 6542 + }, + { + "epoch": 0.84, + "grad_norm": 0.6271588206291199, + "learning_rate": 3.381396103865167e-05, + "loss": 3.1721, + "step": 6543 + }, + { + "epoch": 0.84, + "grad_norm": 0.6568840742111206, + "learning_rate": 3.376192249017912e-05, + "loss": 3.4128, + "step": 6544 + }, + { + "epoch": 0.84, + "grad_norm": 0.6104627847671509, + "learning_rate": 3.370992111553886e-05, + "loss": 3.1912, + "step": 6545 + }, + { + "epoch": 0.84, + "grad_norm": 0.6521439552307129, + "learning_rate": 3.365795692367069e-05, + "loss": 3.1236, + "step": 6546 + }, + { + "epoch": 0.84, + "grad_norm": 0.6480180621147156, + "learning_rate": 3.360602992350775e-05, + "loss": 3.3413, + "step": 6547 + }, + { + "epoch": 0.84, + "grad_norm": 0.7322046160697937, + "learning_rate": 3.3554140123976954e-05, + "loss": 3.3761, + "step": 6548 + }, + { + "epoch": 0.84, + "grad_norm": 0.6070480942726135, + "learning_rate": 3.35022875339987e-05, + "loss": 3.2662, + "step": 6549 + }, + { + "epoch": 0.84, + "grad_norm": 0.6356043219566345, + "learning_rate": 3.345047216248703e-05, + "loss": 3.2459, + "step": 6550 + }, + { + "epoch": 0.84, + "grad_norm": 0.6720722317695618, + "learning_rate": 3.3398694018349715e-05, + "loss": 3.2603, + "step": 6551 + }, + { + "epoch": 0.84, + "grad_norm": 0.6010866761207581, + "learning_rate": 3.334695311048788e-05, + "loss": 3.3378, + "step": 6552 + }, + { + "epoch": 0.84, + "grad_norm": 0.6258826851844788, + "learning_rate": 3.329524944779655e-05, + "loss": 3.2521, + "step": 6553 + }, + { + "epoch": 0.84, + "grad_norm": 0.6346193552017212, + "learning_rate": 3.3243583039164054e-05, + "loss": 3.3683, + "step": 6554 + }, + { + "epoch": 0.84, + "grad_norm": 0.6157397627830505, + "learning_rate": 3.319195389347251e-05, + "loss": 3.2757, + "step": 6555 + }, + { + "epoch": 0.84, + "grad_norm": 0.6648643612861633, + "learning_rate": 3.314036201959755e-05, + "loss": 3.3403, + "step": 6556 + }, + { + "epoch": 0.84, + "grad_norm": 0.6080839037895203, + "learning_rate": 3.3088807426408434e-05, + "loss": 3.1593, + "step": 6557 + }, + { + "epoch": 0.84, + "grad_norm": 0.6104198694229126, + "learning_rate": 3.3037290122767873e-05, + "loss": 3.2418, + "step": 6558 + }, + { + "epoch": 0.84, + "grad_norm": 0.614980161190033, + "learning_rate": 3.298581011753246e-05, + "loss": 3.321, + "step": 6559 + }, + { + "epoch": 0.84, + "grad_norm": 0.6636888980865479, + "learning_rate": 3.293436741955208e-05, + "loss": 3.2396, + "step": 6560 + }, + { + "epoch": 0.84, + "grad_norm": 0.6287356615066528, + "learning_rate": 3.288296203767044e-05, + "loss": 3.2945, + "step": 6561 + }, + { + "epoch": 0.84, + "grad_norm": 0.6043485999107361, + "learning_rate": 3.2831593980724664e-05, + "loss": 3.4645, + "step": 6562 + }, + { + "epoch": 0.84, + "grad_norm": 0.6181927919387817, + "learning_rate": 3.278026325754552e-05, + "loss": 3.2439, + "step": 6563 + }, + { + "epoch": 0.84, + "grad_norm": 0.6592283248901367, + "learning_rate": 3.272896987695734e-05, + "loss": 3.38, + "step": 6564 + }, + { + "epoch": 0.84, + "grad_norm": 0.6501290798187256, + "learning_rate": 3.267771384777804e-05, + "loss": 3.3721, + "step": 6565 + }, + { + "epoch": 0.84, + "grad_norm": 0.6183682680130005, + "learning_rate": 3.2626495178819134e-05, + "loss": 3.2261, + "step": 6566 + }, + { + "epoch": 0.84, + "grad_norm": 0.6056748032569885, + "learning_rate": 3.257531387888574e-05, + "loss": 3.3504, + "step": 6567 + }, + { + "epoch": 0.84, + "grad_norm": 0.6474144458770752, + "learning_rate": 3.252416995677646e-05, + "loss": 3.3172, + "step": 6568 + }, + { + "epoch": 0.84, + "grad_norm": 1.8973803520202637, + "learning_rate": 3.247306342128359e-05, + "loss": 3.1872, + "step": 6569 + }, + { + "epoch": 0.84, + "grad_norm": 0.6066573858261108, + "learning_rate": 3.2421994281192915e-05, + "loss": 3.3767, + "step": 6570 + }, + { + "epoch": 0.84, + "grad_norm": 0.628512442111969, + "learning_rate": 3.23709625452838e-05, + "loss": 3.2634, + "step": 6571 + }, + { + "epoch": 0.84, + "grad_norm": 0.6511738300323486, + "learning_rate": 3.2319968222329216e-05, + "loss": 3.2386, + "step": 6572 + }, + { + "epoch": 0.84, + "grad_norm": 0.6188576221466064, + "learning_rate": 3.226901132109558e-05, + "loss": 3.294, + "step": 6573 + }, + { + "epoch": 0.84, + "grad_norm": 0.6552718281745911, + "learning_rate": 3.221809185034311e-05, + "loss": 3.2761, + "step": 6574 + }, + { + "epoch": 0.84, + "grad_norm": 0.638786792755127, + "learning_rate": 3.21672098188254e-05, + "loss": 3.3197, + "step": 6575 + }, + { + "epoch": 0.84, + "grad_norm": 0.654994010925293, + "learning_rate": 3.211636523528966e-05, + "loss": 3.1878, + "step": 6576 + }, + { + "epoch": 0.84, + "grad_norm": 0.6370866298675537, + "learning_rate": 3.2065558108476615e-05, + "loss": 3.3911, + "step": 6577 + }, + { + "epoch": 0.84, + "grad_norm": 0.6311317086219788, + "learning_rate": 3.201478844712069e-05, + "loss": 3.3334, + "step": 6578 + }, + { + "epoch": 0.84, + "grad_norm": 0.6666600108146667, + "learning_rate": 3.196405625994972e-05, + "loss": 3.2714, + "step": 6579 + }, + { + "epoch": 0.84, + "grad_norm": 0.638438880443573, + "learning_rate": 3.1913361555685196e-05, + "loss": 3.2415, + "step": 6580 + }, + { + "epoch": 0.84, + "grad_norm": 0.6487619280815125, + "learning_rate": 3.1862704343042e-05, + "loss": 3.2731, + "step": 6581 + }, + { + "epoch": 0.84, + "grad_norm": 0.6603454947471619, + "learning_rate": 3.181208463072888e-05, + "loss": 3.355, + "step": 6582 + }, + { + "epoch": 0.84, + "grad_norm": 0.6582320332527161, + "learning_rate": 3.1761502427447855e-05, + "loss": 3.3843, + "step": 6583 + }, + { + "epoch": 0.84, + "grad_norm": 0.682377278804779, + "learning_rate": 3.1710957741894614e-05, + "loss": 3.2483, + "step": 6584 + }, + { + "epoch": 0.84, + "grad_norm": 0.6121364235877991, + "learning_rate": 3.166045058275835e-05, + "loss": 3.2538, + "step": 6585 + }, + { + "epoch": 0.84, + "grad_norm": 0.6380255222320557, + "learning_rate": 3.160998095872183e-05, + "loss": 3.319, + "step": 6586 + }, + { + "epoch": 0.84, + "grad_norm": 0.6388247013092041, + "learning_rate": 3.1559548878461325e-05, + "loss": 3.1455, + "step": 6587 + }, + { + "epoch": 0.84, + "grad_norm": 0.614047110080719, + "learning_rate": 3.1509154350646745e-05, + "loss": 3.2948, + "step": 6588 + }, + { + "epoch": 0.84, + "grad_norm": 0.6629258394241333, + "learning_rate": 3.145879738394156e-05, + "loss": 3.3795, + "step": 6589 + }, + { + "epoch": 0.84, + "grad_norm": 0.6300737857818604, + "learning_rate": 3.140847798700267e-05, + "loss": 3.1161, + "step": 6590 + }, + { + "epoch": 0.84, + "grad_norm": 0.6252375841140747, + "learning_rate": 3.1358196168480515e-05, + "loss": 3.2803, + "step": 6591 + }, + { + "epoch": 0.84, + "grad_norm": 0.6455172300338745, + "learning_rate": 3.130795193701916e-05, + "loss": 3.2143, + "step": 6592 + }, + { + "epoch": 0.84, + "grad_norm": 0.6555576920509338, + "learning_rate": 3.1257745301256165e-05, + "loss": 3.3123, + "step": 6593 + }, + { + "epoch": 0.84, + "grad_norm": 0.6197571158409119, + "learning_rate": 3.1207576269822566e-05, + "loss": 3.2767, + "step": 6594 + }, + { + "epoch": 0.84, + "grad_norm": 0.6083783507347107, + "learning_rate": 3.115744485134314e-05, + "loss": 3.1134, + "step": 6595 + }, + { + "epoch": 0.84, + "grad_norm": 0.6117764711380005, + "learning_rate": 3.1107351054435906e-05, + "loss": 3.3644, + "step": 6596 + }, + { + "epoch": 0.84, + "grad_norm": 0.6059695482254028, + "learning_rate": 3.105729488771272e-05, + "loss": 3.4464, + "step": 6597 + }, + { + "epoch": 0.84, + "grad_norm": 0.628901481628418, + "learning_rate": 3.100727635977873e-05, + "loss": 3.391, + "step": 6598 + }, + { + "epoch": 0.84, + "grad_norm": 0.6364163756370544, + "learning_rate": 3.095729547923273e-05, + "loss": 3.2195, + "step": 6599 + }, + { + "epoch": 0.84, + "grad_norm": 0.6437022089958191, + "learning_rate": 3.0907352254666985e-05, + "loss": 3.3625, + "step": 6600 + }, + { + "epoch": 0.84, + "grad_norm": 0.6143481135368347, + "learning_rate": 3.085744669466733e-05, + "loss": 3.1303, + "step": 6601 + }, + { + "epoch": 0.85, + "grad_norm": 0.7401031255722046, + "learning_rate": 3.080757880781307e-05, + "loss": 3.3805, + "step": 6602 + }, + { + "epoch": 0.85, + "grad_norm": 0.5818541049957275, + "learning_rate": 3.075774860267716e-05, + "loss": 3.276, + "step": 6603 + }, + { + "epoch": 0.85, + "grad_norm": 0.6768269538879395, + "learning_rate": 3.0707956087825923e-05, + "loss": 3.3174, + "step": 6604 + }, + { + "epoch": 0.85, + "grad_norm": 0.6687585115432739, + "learning_rate": 3.065820127181923e-05, + "loss": 3.2422, + "step": 6605 + }, + { + "epoch": 0.85, + "grad_norm": 0.6788862943649292, + "learning_rate": 3.060848416321063e-05, + "loss": 3.3239, + "step": 6606 + }, + { + "epoch": 0.85, + "grad_norm": 0.6286535263061523, + "learning_rate": 3.055880477054701e-05, + "loss": 3.2169, + "step": 6607 + }, + { + "epoch": 0.85, + "grad_norm": 0.6973392367362976, + "learning_rate": 3.0509163102368815e-05, + "loss": 3.3658, + "step": 6608 + }, + { + "epoch": 0.85, + "grad_norm": 0.6668422818183899, + "learning_rate": 3.0459559167209993e-05, + "loss": 3.4155, + "step": 6609 + }, + { + "epoch": 0.85, + "grad_norm": 0.6291588544845581, + "learning_rate": 3.0409992973598145e-05, + "loss": 3.2352, + "step": 6610 + }, + { + "epoch": 0.85, + "grad_norm": 0.6531401872634888, + "learning_rate": 3.03604645300542e-05, + "loss": 3.2694, + "step": 6611 + }, + { + "epoch": 0.85, + "grad_norm": 0.6205223202705383, + "learning_rate": 3.031097384509271e-05, + "loss": 3.3903, + "step": 6612 + }, + { + "epoch": 0.85, + "grad_norm": 0.662426769733429, + "learning_rate": 3.0261520927221647e-05, + "loss": 3.3381, + "step": 6613 + }, + { + "epoch": 0.85, + "grad_norm": 0.5979740023612976, + "learning_rate": 3.021210578494249e-05, + "loss": 3.3004, + "step": 6614 + }, + { + "epoch": 0.85, + "grad_norm": 0.6564514636993408, + "learning_rate": 3.016272842675044e-05, + "loss": 3.27, + "step": 6615 + }, + { + "epoch": 0.85, + "grad_norm": 0.683449923992157, + "learning_rate": 3.0113388861133907e-05, + "loss": 3.3458, + "step": 6616 + }, + { + "epoch": 0.85, + "grad_norm": 0.6408655047416687, + "learning_rate": 3.006408709657496e-05, + "loss": 3.0898, + "step": 6617 + }, + { + "epoch": 0.85, + "grad_norm": 0.6579421162605286, + "learning_rate": 3.0014823141549186e-05, + "loss": 3.2406, + "step": 6618 + }, + { + "epoch": 0.85, + "grad_norm": 0.6416338682174683, + "learning_rate": 2.9965597004525614e-05, + "loss": 3.2816, + "step": 6619 + }, + { + "epoch": 0.85, + "grad_norm": 5.375583171844482, + "learning_rate": 2.991640869396675e-05, + "loss": 3.33, + "step": 6620 + }, + { + "epoch": 0.85, + "grad_norm": 0.6254746913909912, + "learning_rate": 2.9867258218328668e-05, + "loss": 3.2705, + "step": 6621 + }, + { + "epoch": 0.85, + "grad_norm": 0.6137012839317322, + "learning_rate": 2.9818145586060912e-05, + "loss": 3.3699, + "step": 6622 + }, + { + "epoch": 0.85, + "grad_norm": 0.6229578852653503, + "learning_rate": 2.976907080560645e-05, + "loss": 3.0818, + "step": 6623 + }, + { + "epoch": 0.85, + "grad_norm": 0.6454364061355591, + "learning_rate": 2.9720033885401816e-05, + "loss": 3.3186, + "step": 6624 + }, + { + "epoch": 0.85, + "grad_norm": 0.6500315070152283, + "learning_rate": 2.9671034833877147e-05, + "loss": 3.1809, + "step": 6625 + }, + { + "epoch": 0.85, + "grad_norm": 0.6044383645057678, + "learning_rate": 2.962207365945585e-05, + "loss": 3.2998, + "step": 6626 + }, + { + "epoch": 0.85, + "grad_norm": 0.6520947813987732, + "learning_rate": 2.9573150370554942e-05, + "loss": 3.2195, + "step": 6627 + }, + { + "epoch": 0.85, + "grad_norm": 0.6258668303489685, + "learning_rate": 2.9524264975584887e-05, + "loss": 3.245, + "step": 6628 + }, + { + "epoch": 0.85, + "grad_norm": 0.6491183042526245, + "learning_rate": 2.9475417482949657e-05, + "loss": 3.282, + "step": 6629 + }, + { + "epoch": 0.85, + "grad_norm": 0.6811879277229309, + "learning_rate": 2.9426607901046622e-05, + "loss": 3.407, + "step": 6630 + }, + { + "epoch": 0.85, + "grad_norm": 0.5877832770347595, + "learning_rate": 2.937783623826687e-05, + "loss": 3.2315, + "step": 6631 + }, + { + "epoch": 0.85, + "grad_norm": 0.6555650234222412, + "learning_rate": 2.9329102502994753e-05, + "loss": 3.3984, + "step": 6632 + }, + { + "epoch": 0.85, + "grad_norm": 0.6136705875396729, + "learning_rate": 2.92804067036081e-05, + "loss": 3.356, + "step": 6633 + }, + { + "epoch": 0.85, + "grad_norm": 0.6061446666717529, + "learning_rate": 2.9231748848478373e-05, + "loss": 3.1808, + "step": 6634 + }, + { + "epoch": 0.85, + "grad_norm": 0.6542891263961792, + "learning_rate": 2.918312894597039e-05, + "loss": 3.2174, + "step": 6635 + }, + { + "epoch": 0.85, + "grad_norm": 0.6651108860969543, + "learning_rate": 2.9134547004442456e-05, + "loss": 3.301, + "step": 6636 + }, + { + "epoch": 0.85, + "grad_norm": 0.7036979794502258, + "learning_rate": 2.90860030322464e-05, + "loss": 3.3462, + "step": 6637 + }, + { + "epoch": 0.85, + "grad_norm": 0.6571182012557983, + "learning_rate": 2.903749703772743e-05, + "loss": 3.2802, + "step": 6638 + }, + { + "epoch": 0.85, + "grad_norm": 0.6431148648262024, + "learning_rate": 2.8989029029224374e-05, + "loss": 3.3584, + "step": 6639 + }, + { + "epoch": 0.85, + "grad_norm": 0.6809457540512085, + "learning_rate": 2.8940599015069403e-05, + "loss": 3.2448, + "step": 6640 + }, + { + "epoch": 0.85, + "grad_norm": 0.6790762543678284, + "learning_rate": 2.8892207003588218e-05, + "loss": 3.1304, + "step": 6641 + }, + { + "epoch": 0.85, + "grad_norm": 0.7012802958488464, + "learning_rate": 2.8843853003099885e-05, + "loss": 3.4279, + "step": 6642 + }, + { + "epoch": 0.85, + "grad_norm": 0.6387409567832947, + "learning_rate": 2.8795537021917144e-05, + "loss": 3.2683, + "step": 6643 + }, + { + "epoch": 0.85, + "grad_norm": 0.6141805052757263, + "learning_rate": 2.874725906834602e-05, + "loss": 3.2577, + "step": 6644 + }, + { + "epoch": 0.85, + "grad_norm": 0.6355705261230469, + "learning_rate": 2.8699019150685958e-05, + "loss": 3.2237, + "step": 6645 + }, + { + "epoch": 0.85, + "grad_norm": 0.6300574541091919, + "learning_rate": 2.8650817277230123e-05, + "loss": 3.2852, + "step": 6646 + }, + { + "epoch": 0.85, + "grad_norm": 0.6067410707473755, + "learning_rate": 2.8602653456264893e-05, + "loss": 3.1998, + "step": 6647 + }, + { + "epoch": 0.85, + "grad_norm": 0.5872777104377747, + "learning_rate": 2.85545276960702e-05, + "loss": 3.2829, + "step": 6648 + }, + { + "epoch": 0.85, + "grad_norm": 0.6420096158981323, + "learning_rate": 2.850644000491942e-05, + "loss": 3.2486, + "step": 6649 + }, + { + "epoch": 0.85, + "grad_norm": 0.6760454773902893, + "learning_rate": 2.8458390391079365e-05, + "loss": 3.3123, + "step": 6650 + }, + { + "epoch": 0.85, + "grad_norm": 0.6170526742935181, + "learning_rate": 2.8410378862810255e-05, + "loss": 3.2622, + "step": 6651 + }, + { + "epoch": 0.85, + "grad_norm": 0.8137573599815369, + "learning_rate": 2.836240542836599e-05, + "loss": 3.3784, + "step": 6652 + }, + { + "epoch": 0.85, + "grad_norm": 0.6231951117515564, + "learning_rate": 2.831447009599361e-05, + "loss": 3.1708, + "step": 6653 + }, + { + "epoch": 0.85, + "grad_norm": 0.642604410648346, + "learning_rate": 2.826657287393389e-05, + "loss": 3.2427, + "step": 6654 + }, + { + "epoch": 0.85, + "grad_norm": 0.6338666677474976, + "learning_rate": 2.8218713770420816e-05, + "loss": 3.3477, + "step": 6655 + }, + { + "epoch": 0.85, + "grad_norm": 0.656091570854187, + "learning_rate": 2.8170892793681984e-05, + "loss": 3.2722, + "step": 6656 + }, + { + "epoch": 0.85, + "grad_norm": 0.6734408736228943, + "learning_rate": 2.812310995193834e-05, + "loss": 3.3415, + "step": 6657 + }, + { + "epoch": 0.85, + "grad_norm": 0.636033833026886, + "learning_rate": 2.807536525340429e-05, + "loss": 3.2382, + "step": 6658 + }, + { + "epoch": 0.85, + "grad_norm": 0.633607029914856, + "learning_rate": 2.8027658706287677e-05, + "loss": 3.3441, + "step": 6659 + }, + { + "epoch": 0.85, + "grad_norm": 0.6181923151016235, + "learning_rate": 2.7979990318789922e-05, + "loss": 3.1268, + "step": 6660 + }, + { + "epoch": 0.85, + "grad_norm": 0.6916999220848083, + "learning_rate": 2.7932360099105658e-05, + "loss": 3.2991, + "step": 6661 + }, + { + "epoch": 0.85, + "grad_norm": 0.6611192226409912, + "learning_rate": 2.7884768055423172e-05, + "loss": 3.3341, + "step": 6662 + }, + { + "epoch": 0.85, + "grad_norm": 0.6485355496406555, + "learning_rate": 2.7837214195924027e-05, + "loss": 3.1795, + "step": 6663 + }, + { + "epoch": 0.85, + "grad_norm": 0.6241832375526428, + "learning_rate": 2.7789698528783323e-05, + "loss": 3.2311, + "step": 6664 + }, + { + "epoch": 0.85, + "grad_norm": 0.6207740902900696, + "learning_rate": 2.7742221062169505e-05, + "loss": 3.3232, + "step": 6665 + }, + { + "epoch": 0.85, + "grad_norm": 0.6482567191123962, + "learning_rate": 2.769478180424445e-05, + "loss": 3.3199, + "step": 6666 + }, + { + "epoch": 0.85, + "grad_norm": 0.6029701828956604, + "learning_rate": 2.7647380763163673e-05, + "loss": 3.2725, + "step": 6667 + }, + { + "epoch": 0.85, + "grad_norm": 0.6569728851318359, + "learning_rate": 2.760001794707584e-05, + "loss": 3.149, + "step": 6668 + }, + { + "epoch": 0.85, + "grad_norm": 0.7240564823150635, + "learning_rate": 2.755269336412322e-05, + "loss": 3.2414, + "step": 6669 + }, + { + "epoch": 0.85, + "grad_norm": 0.6327884793281555, + "learning_rate": 2.750540702244139e-05, + "loss": 3.2251, + "step": 6670 + }, + { + "epoch": 0.85, + "grad_norm": 0.6695579886436462, + "learning_rate": 2.7458158930159516e-05, + "loss": 3.2286, + "step": 6671 + }, + { + "epoch": 0.85, + "grad_norm": 0.6714457869529724, + "learning_rate": 2.7410949095400067e-05, + "loss": 3.1332, + "step": 6672 + }, + { + "epoch": 0.85, + "grad_norm": 0.6200273036956787, + "learning_rate": 2.7363777526278915e-05, + "loss": 3.2911, + "step": 6673 + }, + { + "epoch": 0.85, + "grad_norm": 0.6575629115104675, + "learning_rate": 2.731664423090541e-05, + "loss": 3.2926, + "step": 6674 + }, + { + "epoch": 0.85, + "grad_norm": 0.6633983850479126, + "learning_rate": 2.726954921738234e-05, + "loss": 3.3532, + "step": 6675 + }, + { + "epoch": 0.85, + "grad_norm": 0.6128534078598022, + "learning_rate": 2.72224924938059e-05, + "loss": 3.2628, + "step": 6676 + }, + { + "epoch": 0.85, + "grad_norm": 0.6728388071060181, + "learning_rate": 2.7175474068265648e-05, + "loss": 3.3687, + "step": 6677 + }, + { + "epoch": 0.85, + "grad_norm": 0.6939319968223572, + "learning_rate": 2.7128493948844617e-05, + "loss": 3.477, + "step": 6678 + }, + { + "epoch": 0.85, + "grad_norm": 0.7096392512321472, + "learning_rate": 2.708155214361918e-05, + "loss": 3.104, + "step": 6679 + }, + { + "epoch": 0.86, + "grad_norm": 0.6583896279335022, + "learning_rate": 2.7034648660659246e-05, + "loss": 3.3376, + "step": 6680 + }, + { + "epoch": 0.86, + "grad_norm": 0.6055124402046204, + "learning_rate": 2.6987783508028023e-05, + "loss": 3.2004, + "step": 6681 + }, + { + "epoch": 0.86, + "grad_norm": 0.6449880003929138, + "learning_rate": 2.6940956693782215e-05, + "loss": 3.2938, + "step": 6682 + }, + { + "epoch": 0.86, + "grad_norm": 0.5981688499450684, + "learning_rate": 2.6894168225971876e-05, + "loss": 3.267, + "step": 6683 + }, + { + "epoch": 0.86, + "grad_norm": 0.611633837223053, + "learning_rate": 2.684741811264052e-05, + "loss": 3.2029, + "step": 6684 + }, + { + "epoch": 0.86, + "grad_norm": 0.6699308156967163, + "learning_rate": 2.6800706361824966e-05, + "loss": 3.2592, + "step": 6685 + }, + { + "epoch": 0.86, + "grad_norm": 0.6041182279586792, + "learning_rate": 2.6754032981555544e-05, + "loss": 3.2229, + "step": 6686 + }, + { + "epoch": 0.86, + "grad_norm": 0.5983774662017822, + "learning_rate": 2.6707397979855885e-05, + "loss": 3.2502, + "step": 6687 + }, + { + "epoch": 0.86, + "grad_norm": 0.6428108811378479, + "learning_rate": 2.6660801364743138e-05, + "loss": 3.3587, + "step": 6688 + }, + { + "epoch": 0.86, + "grad_norm": 0.6479592323303223, + "learning_rate": 2.661424314422789e-05, + "loss": 3.2364, + "step": 6689 + }, + { + "epoch": 0.86, + "grad_norm": 0.6503172516822815, + "learning_rate": 2.656772332631391e-05, + "loss": 3.4001, + "step": 6690 + }, + { + "epoch": 0.86, + "grad_norm": 0.6411533951759338, + "learning_rate": 2.6521241918998572e-05, + "loss": 3.2245, + "step": 6691 + }, + { + "epoch": 0.86, + "grad_norm": 0.7209354043006897, + "learning_rate": 2.647479893027252e-05, + "loss": 3.1744, + "step": 6692 + }, + { + "epoch": 0.86, + "grad_norm": 0.6692107915878296, + "learning_rate": 2.642839436811986e-05, + "loss": 3.1625, + "step": 6693 + }, + { + "epoch": 0.86, + "grad_norm": 0.6229325532913208, + "learning_rate": 2.638202824051808e-05, + "loss": 3.3834, + "step": 6694 + }, + { + "epoch": 0.86, + "grad_norm": 0.6708949208259583, + "learning_rate": 2.633570055543802e-05, + "loss": 3.2889, + "step": 6695 + }, + { + "epoch": 0.86, + "grad_norm": 0.6829221844673157, + "learning_rate": 2.6289411320843974e-05, + "loss": 3.2398, + "step": 6696 + }, + { + "epoch": 0.86, + "grad_norm": 0.6486997008323669, + "learning_rate": 2.624316054469364e-05, + "loss": 3.2258, + "step": 6697 + }, + { + "epoch": 0.86, + "grad_norm": 0.6232972741127014, + "learning_rate": 2.6196948234937955e-05, + "loss": 3.2935, + "step": 6698 + }, + { + "epoch": 0.86, + "grad_norm": 0.6385177969932556, + "learning_rate": 2.615077439952146e-05, + "loss": 3.23, + "step": 6699 + }, + { + "epoch": 0.86, + "grad_norm": 0.6205930113792419, + "learning_rate": 2.6104639046381912e-05, + "loss": 3.3083, + "step": 6700 + }, + { + "epoch": 0.86, + "grad_norm": 0.7248706817626953, + "learning_rate": 2.605854218345055e-05, + "loss": 3.2182, + "step": 6701 + }, + { + "epoch": 0.86, + "grad_norm": 0.6649443507194519, + "learning_rate": 2.60124838186519e-05, + "loss": 3.2294, + "step": 6702 + }, + { + "epoch": 0.86, + "grad_norm": 0.7588315010070801, + "learning_rate": 2.5966463959904013e-05, + "loss": 3.3313, + "step": 6703 + }, + { + "epoch": 0.86, + "grad_norm": 0.6877144575119019, + "learning_rate": 2.5920482615118173e-05, + "loss": 3.365, + "step": 6704 + }, + { + "epoch": 0.86, + "grad_norm": 0.6987777948379517, + "learning_rate": 2.5874539792199142e-05, + "loss": 3.3193, + "step": 6705 + }, + { + "epoch": 0.86, + "grad_norm": 0.636402428150177, + "learning_rate": 2.582863549904499e-05, + "loss": 3.1899, + "step": 6706 + }, + { + "epoch": 0.86, + "grad_norm": 0.668968677520752, + "learning_rate": 2.578276974354718e-05, + "loss": 3.2208, + "step": 6707 + }, + { + "epoch": 0.86, + "grad_norm": 0.6446101069450378, + "learning_rate": 2.573694253359063e-05, + "loss": 3.1072, + "step": 6708 + }, + { + "epoch": 0.86, + "grad_norm": 0.6740022301673889, + "learning_rate": 2.5691153877053564e-05, + "loss": 3.2844, + "step": 6709 + }, + { + "epoch": 0.86, + "grad_norm": 0.6222270131111145, + "learning_rate": 2.5645403781807492e-05, + "loss": 3.3166, + "step": 6710 + }, + { + "epoch": 0.86, + "grad_norm": 0.688671350479126, + "learning_rate": 2.5599692255717512e-05, + "loss": 3.2215, + "step": 6711 + }, + { + "epoch": 0.86, + "grad_norm": 0.6682297587394714, + "learning_rate": 2.555401930664189e-05, + "loss": 3.3589, + "step": 6712 + }, + { + "epoch": 0.86, + "grad_norm": 0.6356056928634644, + "learning_rate": 2.550838494243235e-05, + "loss": 3.2856, + "step": 6713 + }, + { + "epoch": 0.86, + "grad_norm": 0.6925631165504456, + "learning_rate": 2.5462789170933976e-05, + "loss": 3.2134, + "step": 6714 + }, + { + "epoch": 0.86, + "grad_norm": 0.6714974641799927, + "learning_rate": 2.541723199998522e-05, + "loss": 3.4193, + "step": 6715 + }, + { + "epoch": 0.86, + "grad_norm": 0.6370111107826233, + "learning_rate": 2.5371713437417787e-05, + "loss": 3.1771, + "step": 6716 + }, + { + "epoch": 0.86, + "grad_norm": 0.6870541572570801, + "learning_rate": 2.5326233491056948e-05, + "loss": 3.3673, + "step": 6717 + }, + { + "epoch": 0.86, + "grad_norm": 0.6816743612289429, + "learning_rate": 2.5280792168721257e-05, + "loss": 3.3273, + "step": 6718 + }, + { + "epoch": 0.86, + "grad_norm": 0.6466469168663025, + "learning_rate": 2.523538947822257e-05, + "loss": 3.3984, + "step": 6719 + }, + { + "epoch": 0.86, + "grad_norm": 0.6508055925369263, + "learning_rate": 2.519002542736612e-05, + "loss": 3.1394, + "step": 6720 + }, + { + "epoch": 0.86, + "grad_norm": 0.6685908436775208, + "learning_rate": 2.5144700023950527e-05, + "loss": 3.2152, + "step": 6721 + }, + { + "epoch": 0.86, + "grad_norm": 0.6493694186210632, + "learning_rate": 2.509941327576773e-05, + "loss": 3.3361, + "step": 6722 + }, + { + "epoch": 0.86, + "grad_norm": 0.6266916990280151, + "learning_rate": 2.5054165190603022e-05, + "loss": 3.1545, + "step": 6723 + }, + { + "epoch": 0.86, + "grad_norm": 0.5824097394943237, + "learning_rate": 2.500895577623516e-05, + "loss": 3.1569, + "step": 6724 + }, + { + "epoch": 0.86, + "grad_norm": 0.6279321312904358, + "learning_rate": 2.4963785040436088e-05, + "loss": 3.4809, + "step": 6725 + }, + { + "epoch": 0.86, + "grad_norm": 0.6300782561302185, + "learning_rate": 2.4918652990971236e-05, + "loss": 3.2169, + "step": 6726 + }, + { + "epoch": 0.86, + "grad_norm": 0.5826421976089478, + "learning_rate": 2.4873559635599313e-05, + "loss": 3.2049, + "step": 6727 + }, + { + "epoch": 0.86, + "grad_norm": 0.6628600358963013, + "learning_rate": 2.4828504982072397e-05, + "loss": 3.2544, + "step": 6728 + }, + { + "epoch": 0.86, + "grad_norm": 0.5862376093864441, + "learning_rate": 2.4783489038135847e-05, + "loss": 3.2769, + "step": 6729 + }, + { + "epoch": 0.86, + "grad_norm": 0.6180055737495422, + "learning_rate": 2.47385118115285e-05, + "loss": 3.2885, + "step": 6730 + }, + { + "epoch": 0.86, + "grad_norm": 0.6752247214317322, + "learning_rate": 2.4693573309982342e-05, + "loss": 3.3776, + "step": 6731 + }, + { + "epoch": 0.86, + "grad_norm": 0.6708443760871887, + "learning_rate": 2.4648673541222994e-05, + "loss": 3.3513, + "step": 6732 + }, + { + "epoch": 0.86, + "grad_norm": 0.7268015742301941, + "learning_rate": 2.4603812512969142e-05, + "loss": 3.3318, + "step": 6733 + }, + { + "epoch": 0.86, + "grad_norm": 0.6242460012435913, + "learning_rate": 2.4558990232932955e-05, + "loss": 3.28, + "step": 6734 + }, + { + "epoch": 0.86, + "grad_norm": 0.6382979154586792, + "learning_rate": 2.451420670881982e-05, + "loss": 3.3254, + "step": 6735 + }, + { + "epoch": 0.86, + "grad_norm": 0.629286527633667, + "learning_rate": 2.4469461948328635e-05, + "loss": 3.1642, + "step": 6736 + }, + { + "epoch": 0.86, + "grad_norm": 0.6485285758972168, + "learning_rate": 2.4424755959151558e-05, + "loss": 3.2271, + "step": 6737 + }, + { + "epoch": 0.86, + "grad_norm": 0.6550916433334351, + "learning_rate": 2.4380088748973933e-05, + "loss": 3.2394, + "step": 6738 + }, + { + "epoch": 0.86, + "grad_norm": 0.6315405368804932, + "learning_rate": 2.4335460325474736e-05, + "loss": 3.2613, + "step": 6739 + }, + { + "epoch": 0.86, + "grad_norm": 0.6707183122634888, + "learning_rate": 2.4290870696326046e-05, + "loss": 3.4281, + "step": 6740 + }, + { + "epoch": 0.86, + "grad_norm": 0.6594843864440918, + "learning_rate": 2.424631986919332e-05, + "loss": 3.4797, + "step": 6741 + }, + { + "epoch": 0.86, + "grad_norm": 0.6843511462211609, + "learning_rate": 2.420180785173534e-05, + "loss": 3.3491, + "step": 6742 + }, + { + "epoch": 0.86, + "grad_norm": 0.6677063703536987, + "learning_rate": 2.4157334651604302e-05, + "loss": 3.2779, + "step": 6743 + }, + { + "epoch": 0.86, + "grad_norm": 0.6253190636634827, + "learning_rate": 2.411290027644558e-05, + "loss": 3.2955, + "step": 6744 + }, + { + "epoch": 0.86, + "grad_norm": 0.6013156175613403, + "learning_rate": 2.4068504733897988e-05, + "loss": 3.2331, + "step": 6745 + }, + { + "epoch": 0.86, + "grad_norm": 0.6724177598953247, + "learning_rate": 2.4024148031593723e-05, + "loss": 3.2156, + "step": 6746 + }, + { + "epoch": 0.86, + "grad_norm": 0.6198339462280273, + "learning_rate": 2.397983017715813e-05, + "loss": 3.2331, + "step": 6747 + }, + { + "epoch": 0.86, + "grad_norm": 0.6026395559310913, + "learning_rate": 2.3935551178210004e-05, + "loss": 3.1947, + "step": 6748 + }, + { + "epoch": 0.86, + "grad_norm": 0.6195902228355408, + "learning_rate": 2.3891311042361364e-05, + "loss": 3.2482, + "step": 6749 + }, + { + "epoch": 0.86, + "grad_norm": 0.635100245475769, + "learning_rate": 2.3847109777217658e-05, + "loss": 3.3386, + "step": 6750 + }, + { + "epoch": 0.86, + "grad_norm": 0.6043824553489685, + "learning_rate": 2.3802947390377554e-05, + "loss": 3.118, + "step": 6751 + }, + { + "epoch": 0.86, + "grad_norm": 0.6124756932258606, + "learning_rate": 2.375882388943307e-05, + "loss": 3.1976, + "step": 6752 + }, + { + "epoch": 0.86, + "grad_norm": 0.6314394474029541, + "learning_rate": 2.3714739281969545e-05, + "loss": 3.2971, + "step": 6753 + }, + { + "epoch": 0.86, + "grad_norm": 0.6392366886138916, + "learning_rate": 2.3670693575565726e-05, + "loss": 3.2638, + "step": 6754 + }, + { + "epoch": 0.86, + "grad_norm": 0.6057491898536682, + "learning_rate": 2.3626686777793503e-05, + "loss": 3.2018, + "step": 6755 + }, + { + "epoch": 0.86, + "grad_norm": 0.6680116057395935, + "learning_rate": 2.3582718896218185e-05, + "loss": 3.3387, + "step": 6756 + }, + { + "epoch": 0.86, + "grad_norm": 0.6639420986175537, + "learning_rate": 2.3538789938398335e-05, + "loss": 3.4412, + "step": 6757 + }, + { + "epoch": 0.87, + "grad_norm": 0.6201452016830444, + "learning_rate": 2.3494899911885857e-05, + "loss": 3.2545, + "step": 6758 + }, + { + "epoch": 0.87, + "grad_norm": 0.6092161536216736, + "learning_rate": 2.3451048824225912e-05, + "loss": 3.2522, + "step": 6759 + }, + { + "epoch": 0.87, + "grad_norm": 0.6514724493026733, + "learning_rate": 2.3407236682957106e-05, + "loss": 3.2481, + "step": 6760 + }, + { + "epoch": 0.87, + "grad_norm": 0.5797368884086609, + "learning_rate": 2.336346349561119e-05, + "loss": 3.1768, + "step": 6761 + }, + { + "epoch": 0.87, + "grad_norm": 0.661661684513092, + "learning_rate": 2.3319729269713263e-05, + "loss": 3.1709, + "step": 6762 + }, + { + "epoch": 0.87, + "grad_norm": 0.6890133619308472, + "learning_rate": 2.3276034012781803e-05, + "loss": 3.2111, + "step": 6763 + }, + { + "epoch": 0.87, + "grad_norm": 0.6165077090263367, + "learning_rate": 2.323237773232853e-05, + "loss": 3.2561, + "step": 6764 + }, + { + "epoch": 0.87, + "grad_norm": 0.6231985092163086, + "learning_rate": 2.3188760435858436e-05, + "loss": 3.4661, + "step": 6765 + }, + { + "epoch": 0.87, + "grad_norm": 0.6593947410583496, + "learning_rate": 2.3145182130869772e-05, + "loss": 3.3824, + "step": 6766 + }, + { + "epoch": 0.87, + "grad_norm": 0.6102452278137207, + "learning_rate": 2.3101642824854302e-05, + "loss": 3.3386, + "step": 6767 + }, + { + "epoch": 0.87, + "grad_norm": 0.6081363558769226, + "learning_rate": 2.3058142525296864e-05, + "loss": 3.1532, + "step": 6768 + }, + { + "epoch": 0.87, + "grad_norm": 0.6321461200714111, + "learning_rate": 2.301468123967565e-05, + "loss": 3.243, + "step": 6769 + }, + { + "epoch": 0.87, + "grad_norm": 0.6125012636184692, + "learning_rate": 2.297125897546215e-05, + "loss": 3.2151, + "step": 6770 + }, + { + "epoch": 0.87, + "grad_norm": 0.6407002806663513, + "learning_rate": 2.29278757401212e-05, + "loss": 3.1811, + "step": 6771 + }, + { + "epoch": 0.87, + "grad_norm": 0.6125441789627075, + "learning_rate": 2.288453154111081e-05, + "loss": 3.2838, + "step": 6772 + }, + { + "epoch": 0.87, + "grad_norm": 0.6470776796340942, + "learning_rate": 2.284122638588243e-05, + "loss": 3.485, + "step": 6773 + }, + { + "epoch": 0.87, + "grad_norm": 0.6441331505775452, + "learning_rate": 2.2797960281880664e-05, + "loss": 3.3061, + "step": 6774 + }, + { + "epoch": 0.87, + "grad_norm": 0.65076744556427, + "learning_rate": 2.2754733236543506e-05, + "loss": 3.283, + "step": 6775 + }, + { + "epoch": 0.87, + "grad_norm": 0.6396564841270447, + "learning_rate": 2.2711545257302152e-05, + "loss": 3.177, + "step": 6776 + }, + { + "epoch": 0.87, + "grad_norm": 0.6858227849006653, + "learning_rate": 2.2668396351581134e-05, + "loss": 3.3496, + "step": 6777 + }, + { + "epoch": 0.87, + "grad_norm": 0.5985760688781738, + "learning_rate": 2.262528652679824e-05, + "loss": 3.29, + "step": 6778 + }, + { + "epoch": 0.87, + "grad_norm": 0.6282758712768555, + "learning_rate": 2.2582215790364542e-05, + "loss": 3.4646, + "step": 6779 + }, + { + "epoch": 0.87, + "grad_norm": 0.6286015510559082, + "learning_rate": 2.2539184149684338e-05, + "loss": 3.2247, + "step": 6780 + }, + { + "epoch": 0.87, + "grad_norm": 0.6121549010276794, + "learning_rate": 2.2496191612155355e-05, + "loss": 3.3774, + "step": 6781 + }, + { + "epoch": 0.87, + "grad_norm": 0.6485132575035095, + "learning_rate": 2.2453238185168505e-05, + "loss": 3.3468, + "step": 6782 + }, + { + "epoch": 0.87, + "grad_norm": 0.6902052164077759, + "learning_rate": 2.2410323876107974e-05, + "loss": 3.3393, + "step": 6783 + }, + { + "epoch": 0.87, + "grad_norm": 0.5603453516960144, + "learning_rate": 2.2367448692351216e-05, + "loss": 3.1907, + "step": 6784 + }, + { + "epoch": 0.87, + "grad_norm": 0.6382895708084106, + "learning_rate": 2.2324612641268975e-05, + "loss": 3.2211, + "step": 6785 + }, + { + "epoch": 0.87, + "grad_norm": 0.6377125978469849, + "learning_rate": 2.2281815730225252e-05, + "loss": 3.335, + "step": 6786 + }, + { + "epoch": 0.87, + "grad_norm": 0.638985812664032, + "learning_rate": 2.2239057966577297e-05, + "loss": 3.2314, + "step": 6787 + }, + { + "epoch": 0.87, + "grad_norm": 0.6332682371139526, + "learning_rate": 2.219633935767576e-05, + "loss": 3.2651, + "step": 6788 + }, + { + "epoch": 0.87, + "grad_norm": 0.6259728074073792, + "learning_rate": 2.215365991086443e-05, + "loss": 3.211, + "step": 6789 + }, + { + "epoch": 0.87, + "grad_norm": 0.6506862640380859, + "learning_rate": 2.2111019633480306e-05, + "loss": 3.3028, + "step": 6790 + }, + { + "epoch": 0.87, + "grad_norm": 0.6298990845680237, + "learning_rate": 2.2068418532853878e-05, + "loss": 3.1886, + "step": 6791 + }, + { + "epoch": 0.87, + "grad_norm": 0.7053328156471252, + "learning_rate": 2.202585661630871e-05, + "loss": 3.1508, + "step": 6792 + }, + { + "epoch": 0.87, + "grad_norm": 0.6330047845840454, + "learning_rate": 2.198333389116172e-05, + "loss": 3.2599, + "step": 6793 + }, + { + "epoch": 0.87, + "grad_norm": 0.7028726935386658, + "learning_rate": 2.1940850364723014e-05, + "loss": 3.3423, + "step": 6794 + }, + { + "epoch": 0.87, + "grad_norm": 0.631902277469635, + "learning_rate": 2.1898406044295964e-05, + "loss": 3.2537, + "step": 6795 + }, + { + "epoch": 0.87, + "grad_norm": 0.5942831039428711, + "learning_rate": 2.185600093717735e-05, + "loss": 3.3153, + "step": 6796 + }, + { + "epoch": 0.87, + "grad_norm": 0.6552851796150208, + "learning_rate": 2.1813635050657032e-05, + "loss": 3.2042, + "step": 6797 + }, + { + "epoch": 0.87, + "grad_norm": 0.6387913823127747, + "learning_rate": 2.1771308392018213e-05, + "loss": 3.2958, + "step": 6798 + }, + { + "epoch": 0.87, + "grad_norm": 0.7131662368774414, + "learning_rate": 2.1729020968537296e-05, + "loss": 3.2817, + "step": 6799 + }, + { + "epoch": 0.87, + "grad_norm": 0.6315956711769104, + "learning_rate": 2.1686772787484072e-05, + "loss": 3.2904, + "step": 6800 + }, + { + "epoch": 0.87, + "grad_norm": 0.6526632905006409, + "learning_rate": 2.164456385612143e-05, + "loss": 3.2094, + "step": 6801 + }, + { + "epoch": 0.87, + "grad_norm": 0.6810610294342041, + "learning_rate": 2.1602394181705564e-05, + "loss": 3.3241, + "step": 6802 + }, + { + "epoch": 0.87, + "grad_norm": 0.6127844452857971, + "learning_rate": 2.1560263771485983e-05, + "loss": 3.3316, + "step": 6803 + }, + { + "epoch": 0.87, + "grad_norm": 0.7066926956176758, + "learning_rate": 2.1518172632705334e-05, + "loss": 3.3411, + "step": 6804 + }, + { + "epoch": 0.87, + "grad_norm": 0.6281033158302307, + "learning_rate": 2.1476120772599613e-05, + "loss": 3.2523, + "step": 6805 + }, + { + "epoch": 0.87, + "grad_norm": 0.6492352485656738, + "learning_rate": 2.1434108198398027e-05, + "loss": 3.2758, + "step": 6806 + }, + { + "epoch": 0.87, + "grad_norm": 0.6568070650100708, + "learning_rate": 2.1392134917322975e-05, + "loss": 3.2168, + "step": 6807 + }, + { + "epoch": 0.87, + "grad_norm": 0.6375867128372192, + "learning_rate": 2.1350200936590152e-05, + "loss": 3.2557, + "step": 6808 + }, + { + "epoch": 0.87, + "grad_norm": 0.5970162153244019, + "learning_rate": 2.130830626340857e-05, + "loss": 3.2336, + "step": 6809 + }, + { + "epoch": 0.87, + "grad_norm": 0.6471285820007324, + "learning_rate": 2.1266450904980335e-05, + "loss": 3.2764, + "step": 6810 + }, + { + "epoch": 0.87, + "grad_norm": 0.6247797608375549, + "learning_rate": 2.1224634868500902e-05, + "loss": 3.31, + "step": 6811 + }, + { + "epoch": 0.87, + "grad_norm": 0.6753485202789307, + "learning_rate": 2.1182858161158947e-05, + "loss": 3.3195, + "step": 6812 + }, + { + "epoch": 0.87, + "grad_norm": 0.6527683734893799, + "learning_rate": 2.114112079013636e-05, + "loss": 3.3438, + "step": 6813 + }, + { + "epoch": 0.87, + "grad_norm": 0.6041711568832397, + "learning_rate": 2.1099422762608262e-05, + "loss": 3.2407, + "step": 6814 + }, + { + "epoch": 0.87, + "grad_norm": 0.670981764793396, + "learning_rate": 2.105776408574303e-05, + "loss": 3.2142, + "step": 6815 + }, + { + "epoch": 0.87, + "grad_norm": 0.6561753153800964, + "learning_rate": 2.1016144766702217e-05, + "loss": 3.2738, + "step": 6816 + }, + { + "epoch": 0.87, + "grad_norm": 0.6334289312362671, + "learning_rate": 2.097456481264079e-05, + "loss": 3.2525, + "step": 6817 + }, + { + "epoch": 0.87, + "grad_norm": 0.6357218623161316, + "learning_rate": 2.0933024230706726e-05, + "loss": 3.2613, + "step": 6818 + }, + { + "epoch": 0.87, + "grad_norm": 0.639295220375061, + "learning_rate": 2.08915230280414e-05, + "loss": 3.3472, + "step": 6819 + }, + { + "epoch": 0.87, + "grad_norm": 0.6403231024742126, + "learning_rate": 2.085006121177932e-05, + "loss": 3.335, + "step": 6820 + }, + { + "epoch": 0.87, + "grad_norm": 0.6622771620750427, + "learning_rate": 2.0808638789048263e-05, + "loss": 3.1739, + "step": 6821 + }, + { + "epoch": 0.87, + "grad_norm": 0.708372175693512, + "learning_rate": 2.0767255766969195e-05, + "loss": 3.3605, + "step": 6822 + }, + { + "epoch": 0.87, + "grad_norm": 0.6363511681556702, + "learning_rate": 2.0725912152656317e-05, + "loss": 3.1817, + "step": 6823 + }, + { + "epoch": 0.87, + "grad_norm": 0.6383823752403259, + "learning_rate": 2.0684607953217164e-05, + "loss": 3.395, + "step": 6824 + }, + { + "epoch": 0.87, + "grad_norm": 0.6685783863067627, + "learning_rate": 2.0643343175752367e-05, + "loss": 3.2711, + "step": 6825 + }, + { + "epoch": 0.87, + "grad_norm": 0.6340213418006897, + "learning_rate": 2.060211782735577e-05, + "loss": 3.3396, + "step": 6826 + }, + { + "epoch": 0.87, + "grad_norm": 0.6279425621032715, + "learning_rate": 2.0560931915114518e-05, + "loss": 3.4247, + "step": 6827 + }, + { + "epoch": 0.87, + "grad_norm": 0.6689784526824951, + "learning_rate": 2.0519785446108996e-05, + "loss": 3.3333, + "step": 6828 + }, + { + "epoch": 0.87, + "grad_norm": 0.6482875347137451, + "learning_rate": 2.0478678427412718e-05, + "loss": 3.322, + "step": 6829 + }, + { + "epoch": 0.87, + "grad_norm": 0.6636989712715149, + "learning_rate": 2.0437610866092442e-05, + "loss": 3.2851, + "step": 6830 + }, + { + "epoch": 0.87, + "grad_norm": 0.6471637487411499, + "learning_rate": 2.0396582769208137e-05, + "loss": 3.3048, + "step": 6831 + }, + { + "epoch": 0.87, + "grad_norm": 0.6450260281562805, + "learning_rate": 2.0355594143813078e-05, + "loss": 3.2115, + "step": 6832 + }, + { + "epoch": 0.87, + "grad_norm": 0.6283779144287109, + "learning_rate": 2.031464499695368e-05, + "loss": 3.1822, + "step": 6833 + }, + { + "epoch": 0.87, + "grad_norm": 0.6070658564567566, + "learning_rate": 2.0273735335669536e-05, + "loss": 3.3539, + "step": 6834 + }, + { + "epoch": 0.87, + "grad_norm": 0.6264026165008545, + "learning_rate": 2.0232865166993492e-05, + "loss": 3.2446, + "step": 6835 + }, + { + "epoch": 0.88, + "grad_norm": 0.5890588760375977, + "learning_rate": 2.0192034497951566e-05, + "loss": 3.2071, + "step": 6836 + }, + { + "epoch": 0.88, + "grad_norm": 0.6800095438957214, + "learning_rate": 2.0151243335563146e-05, + "loss": 3.298, + "step": 6837 + }, + { + "epoch": 0.88, + "grad_norm": 0.6465356945991516, + "learning_rate": 2.0110491686840564e-05, + "loss": 3.3531, + "step": 6838 + }, + { + "epoch": 0.88, + "grad_norm": 0.6601016521453857, + "learning_rate": 2.006977955878961e-05, + "loss": 3.1851, + "step": 6839 + }, + { + "epoch": 0.88, + "grad_norm": 0.6479170322418213, + "learning_rate": 2.0029106958409148e-05, + "loss": 3.2531, + "step": 6840 + }, + { + "epoch": 0.88, + "grad_norm": 0.6518640518188477, + "learning_rate": 1.9988473892691235e-05, + "loss": 3.258, + "step": 6841 + }, + { + "epoch": 0.88, + "grad_norm": 0.6924657225608826, + "learning_rate": 1.994788036862119e-05, + "loss": 3.2473, + "step": 6842 + }, + { + "epoch": 0.88, + "grad_norm": 0.6231380701065063, + "learning_rate": 1.990732639317752e-05, + "loss": 3.2671, + "step": 6843 + }, + { + "epoch": 0.88, + "grad_norm": 0.6526206135749817, + "learning_rate": 1.9866811973331846e-05, + "loss": 3.3348, + "step": 6844 + }, + { + "epoch": 0.88, + "grad_norm": 0.6139849424362183, + "learning_rate": 1.982633711604917e-05, + "loss": 3.3864, + "step": 6845 + }, + { + "epoch": 0.88, + "grad_norm": 0.6477641463279724, + "learning_rate": 1.978590182828749e-05, + "loss": 3.3935, + "step": 6846 + }, + { + "epoch": 0.88, + "grad_norm": 0.6017580032348633, + "learning_rate": 1.9745506116998214e-05, + "loss": 3.1474, + "step": 6847 + }, + { + "epoch": 0.88, + "grad_norm": 0.5790743231773376, + "learning_rate": 1.970514998912576e-05, + "loss": 3.2047, + "step": 6848 + }, + { + "epoch": 0.88, + "grad_norm": 0.6539261937141418, + "learning_rate": 1.9664833451607856e-05, + "loss": 3.1933, + "step": 6849 + }, + { + "epoch": 0.88, + "grad_norm": 0.6438460946083069, + "learning_rate": 1.9624556511375347e-05, + "loss": 3.2391, + "step": 6850 + }, + { + "epoch": 0.88, + "grad_norm": 0.621233344078064, + "learning_rate": 1.9584319175352307e-05, + "loss": 3.1789, + "step": 6851 + }, + { + "epoch": 0.88, + "grad_norm": 0.676813542842865, + "learning_rate": 1.9544121450455976e-05, + "loss": 3.1908, + "step": 6852 + }, + { + "epoch": 0.88, + "grad_norm": 0.5853865146636963, + "learning_rate": 1.950396334359686e-05, + "loss": 3.3179, + "step": 6853 + }, + { + "epoch": 0.88, + "grad_norm": 0.6410489082336426, + "learning_rate": 1.9463844861678627e-05, + "loss": 3.2646, + "step": 6854 + }, + { + "epoch": 0.88, + "grad_norm": 0.6273015737533569, + "learning_rate": 1.942376601159798e-05, + "loss": 3.1283, + "step": 6855 + }, + { + "epoch": 0.88, + "grad_norm": 0.6359155178070068, + "learning_rate": 1.9383726800245083e-05, + "loss": 3.2482, + "step": 6856 + }, + { + "epoch": 0.88, + "grad_norm": 0.6588960886001587, + "learning_rate": 1.934372723450309e-05, + "loss": 3.2952, + "step": 6857 + }, + { + "epoch": 0.88, + "grad_norm": 0.6521344780921936, + "learning_rate": 1.9303767321248388e-05, + "loss": 3.1928, + "step": 6858 + }, + { + "epoch": 0.88, + "grad_norm": 0.6000865697860718, + "learning_rate": 1.926384706735049e-05, + "loss": 3.2937, + "step": 6859 + }, + { + "epoch": 0.88, + "grad_norm": 0.6228570938110352, + "learning_rate": 1.9223966479672255e-05, + "loss": 3.3858, + "step": 6860 + }, + { + "epoch": 0.88, + "grad_norm": 0.643111526966095, + "learning_rate": 1.9184125565069543e-05, + "loss": 3.2005, + "step": 6861 + }, + { + "epoch": 0.88, + "grad_norm": 0.6268308162689209, + "learning_rate": 1.914432433039151e-05, + "loss": 3.2329, + "step": 6862 + }, + { + "epoch": 0.88, + "grad_norm": 0.6840165257453918, + "learning_rate": 1.9104562782480434e-05, + "loss": 3.2345, + "step": 6863 + }, + { + "epoch": 0.88, + "grad_norm": 0.6919675469398499, + "learning_rate": 1.906484092817176e-05, + "loss": 3.2749, + "step": 6864 + }, + { + "epoch": 0.88, + "grad_norm": 0.6603330969810486, + "learning_rate": 1.902515877429417e-05, + "loss": 3.2708, + "step": 6865 + }, + { + "epoch": 0.88, + "grad_norm": 0.6835293173789978, + "learning_rate": 1.8985516327669512e-05, + "loss": 3.3131, + "step": 6866 + }, + { + "epoch": 0.88, + "grad_norm": 0.6213630437850952, + "learning_rate": 1.894591359511269e-05, + "loss": 3.2018, + "step": 6867 + }, + { + "epoch": 0.88, + "grad_norm": 0.6311686635017395, + "learning_rate": 1.890635058343196e-05, + "loss": 3.1391, + "step": 6868 + }, + { + "epoch": 0.88, + "grad_norm": 0.603550136089325, + "learning_rate": 1.886682729942865e-05, + "loss": 3.2515, + "step": 6869 + }, + { + "epoch": 0.88, + "grad_norm": 0.640389084815979, + "learning_rate": 1.8827343749897224e-05, + "loss": 3.2602, + "step": 6870 + }, + { + "epoch": 0.88, + "grad_norm": 0.6399551033973694, + "learning_rate": 1.8787899941625413e-05, + "loss": 3.2608, + "step": 6871 + }, + { + "epoch": 0.88, + "grad_norm": 0.6673776507377625, + "learning_rate": 1.8748495881394046e-05, + "loss": 3.2441, + "step": 6872 + }, + { + "epoch": 0.88, + "grad_norm": 0.5939326882362366, + "learning_rate": 1.870913157597709e-05, + "loss": 3.1888, + "step": 6873 + }, + { + "epoch": 0.88, + "grad_norm": 0.64223712682724, + "learning_rate": 1.866980703214177e-05, + "loss": 3.2668, + "step": 6874 + }, + { + "epoch": 0.88, + "grad_norm": 0.5804606080055237, + "learning_rate": 1.8630522256648463e-05, + "loss": 3.1411, + "step": 6875 + }, + { + "epoch": 0.88, + "grad_norm": 0.6221579909324646, + "learning_rate": 1.8591277256250648e-05, + "loss": 3.1402, + "step": 6876 + }, + { + "epoch": 0.88, + "grad_norm": 0.6562621593475342, + "learning_rate": 1.855207203769499e-05, + "loss": 3.3352, + "step": 6877 + }, + { + "epoch": 0.88, + "grad_norm": 0.654252827167511, + "learning_rate": 1.8512906607721342e-05, + "loss": 3.2961, + "step": 6878 + }, + { + "epoch": 0.88, + "grad_norm": 0.6618490219116211, + "learning_rate": 1.8473780973062655e-05, + "loss": 3.2589, + "step": 6879 + }, + { + "epoch": 0.88, + "grad_norm": 0.6368651390075684, + "learning_rate": 1.8434695140445074e-05, + "loss": 3.2181, + "step": 6880 + }, + { + "epoch": 0.88, + "grad_norm": 0.6331031322479248, + "learning_rate": 1.8395649116587974e-05, + "loss": 3.3327, + "step": 6881 + }, + { + "epoch": 0.88, + "grad_norm": 0.6702656149864197, + "learning_rate": 1.8356642908203767e-05, + "loss": 3.2164, + "step": 6882 + }, + { + "epoch": 0.88, + "grad_norm": 0.6424792408943176, + "learning_rate": 1.8317676521998033e-05, + "loss": 3.2294, + "step": 6883 + }, + { + "epoch": 0.88, + "grad_norm": 0.6082201600074768, + "learning_rate": 1.827874996466966e-05, + "loss": 3.3042, + "step": 6884 + }, + { + "epoch": 0.88, + "grad_norm": 0.6452752947807312, + "learning_rate": 1.823986324291052e-05, + "loss": 3.3743, + "step": 6885 + }, + { + "epoch": 0.88, + "grad_norm": 0.6559005379676819, + "learning_rate": 1.8201016363405653e-05, + "loss": 3.3308, + "step": 6886 + }, + { + "epoch": 0.88, + "grad_norm": 0.6568729877471924, + "learning_rate": 1.816220933283336e-05, + "loss": 3.2974, + "step": 6887 + }, + { + "epoch": 0.88, + "grad_norm": 0.7080766558647156, + "learning_rate": 1.8123442157864907e-05, + "loss": 3.1731, + "step": 6888 + }, + { + "epoch": 0.88, + "grad_norm": 0.6404166221618652, + "learning_rate": 1.8084714845164912e-05, + "loss": 3.3248, + "step": 6889 + }, + { + "epoch": 0.88, + "grad_norm": 0.664342999458313, + "learning_rate": 1.804602740139105e-05, + "loss": 3.3336, + "step": 6890 + }, + { + "epoch": 0.88, + "grad_norm": 0.6884958744049072, + "learning_rate": 1.8007379833194142e-05, + "loss": 3.1601, + "step": 6891 + }, + { + "epoch": 0.88, + "grad_norm": 0.7024304270744324, + "learning_rate": 1.7968772147218067e-05, + "loss": 3.2835, + "step": 6892 + }, + { + "epoch": 0.88, + "grad_norm": 0.670708954334259, + "learning_rate": 1.793020435010004e-05, + "loss": 3.3666, + "step": 6893 + }, + { + "epoch": 0.88, + "grad_norm": 0.6311020255088806, + "learning_rate": 1.7891676448470255e-05, + "loss": 3.2645, + "step": 6894 + }, + { + "epoch": 0.88, + "grad_norm": 0.7195715308189392, + "learning_rate": 1.785318844895209e-05, + "loss": 3.2093, + "step": 6895 + }, + { + "epoch": 0.88, + "grad_norm": 0.6251908540725708, + "learning_rate": 1.7814740358162136e-05, + "loss": 3.3117, + "step": 6896 + }, + { + "epoch": 0.88, + "grad_norm": 0.6668151617050171, + "learning_rate": 1.7776332182710047e-05, + "loss": 3.3754, + "step": 6897 + }, + { + "epoch": 0.88, + "grad_norm": 0.655120313167572, + "learning_rate": 1.773796392919863e-05, + "loss": 3.246, + "step": 6898 + }, + { + "epoch": 0.88, + "grad_norm": 0.6486814618110657, + "learning_rate": 1.769963560422383e-05, + "loss": 3.3588, + "step": 6899 + }, + { + "epoch": 0.88, + "grad_norm": 0.6619285941123962, + "learning_rate": 1.7661347214374706e-05, + "loss": 3.2986, + "step": 6900 + }, + { + "epoch": 0.88, + "grad_norm": 0.677575945854187, + "learning_rate": 1.762309876623347e-05, + "loss": 3.1905, + "step": 6901 + }, + { + "epoch": 0.88, + "grad_norm": 0.6310055255889893, + "learning_rate": 1.7584890266375552e-05, + "loss": 3.3665, + "step": 6902 + }, + { + "epoch": 0.88, + "grad_norm": 0.6316937804222107, + "learning_rate": 1.7546721721369314e-05, + "loss": 3.477, + "step": 6903 + }, + { + "epoch": 0.88, + "grad_norm": 0.6531652808189392, + "learning_rate": 1.7508593137776503e-05, + "loss": 3.3253, + "step": 6904 + }, + { + "epoch": 0.88, + "grad_norm": 0.6354236602783203, + "learning_rate": 1.7470504522151792e-05, + "loss": 3.3458, + "step": 6905 + }, + { + "epoch": 0.88, + "grad_norm": 0.6475649476051331, + "learning_rate": 1.7432455881043085e-05, + "loss": 3.3281, + "step": 6906 + }, + { + "epoch": 0.88, + "grad_norm": 0.6476089954376221, + "learning_rate": 1.7394447220991342e-05, + "loss": 3.426, + "step": 6907 + }, + { + "epoch": 0.88, + "grad_norm": 0.5931687951087952, + "learning_rate": 1.735647854853073e-05, + "loss": 3.1769, + "step": 6908 + }, + { + "epoch": 0.88, + "grad_norm": 0.6188100576400757, + "learning_rate": 1.7318549870188468e-05, + "loss": 3.2636, + "step": 6909 + }, + { + "epoch": 0.88, + "grad_norm": 0.6311696767807007, + "learning_rate": 1.728066119248492e-05, + "loss": 3.2662, + "step": 6910 + }, + { + "epoch": 0.88, + "grad_norm": 0.6498907804489136, + "learning_rate": 1.724281252193369e-05, + "loss": 3.2602, + "step": 6911 + }, + { + "epoch": 0.88, + "grad_norm": 0.6863357424736023, + "learning_rate": 1.7205003865041342e-05, + "loss": 3.2302, + "step": 6912 + }, + { + "epoch": 0.88, + "grad_norm": 0.6403094530105591, + "learning_rate": 1.7167235228307627e-05, + "loss": 3.2029, + "step": 6913 + }, + { + "epoch": 0.88, + "grad_norm": 0.6653391122817993, + "learning_rate": 1.7129506618225376e-05, + "loss": 3.3177, + "step": 6914 + }, + { + "epoch": 0.89, + "grad_norm": 0.616364598274231, + "learning_rate": 1.7091818041280626e-05, + "loss": 3.3601, + "step": 6915 + }, + { + "epoch": 0.89, + "grad_norm": 0.6449361443519592, + "learning_rate": 1.7054169503952415e-05, + "loss": 3.3162, + "step": 6916 + }, + { + "epoch": 0.89, + "grad_norm": 0.6562541127204895, + "learning_rate": 1.701656101271304e-05, + "loss": 3.3603, + "step": 6917 + }, + { + "epoch": 0.89, + "grad_norm": 0.6591519117355347, + "learning_rate": 1.6978992574027824e-05, + "loss": 3.3095, + "step": 6918 + }, + { + "epoch": 0.89, + "grad_norm": 0.627376914024353, + "learning_rate": 1.6941464194355188e-05, + "loss": 3.334, + "step": 6919 + }, + { + "epoch": 0.89, + "grad_norm": 0.6193311810493469, + "learning_rate": 1.6903975880146638e-05, + "loss": 3.2708, + "step": 6920 + }, + { + "epoch": 0.89, + "grad_norm": 0.6567133069038391, + "learning_rate": 1.686652763784699e-05, + "loss": 3.2111, + "step": 6921 + }, + { + "epoch": 0.89, + "grad_norm": 0.6930579543113708, + "learning_rate": 1.6829119473893927e-05, + "loss": 3.3558, + "step": 6922 + }, + { + "epoch": 0.89, + "grad_norm": 0.6540127992630005, + "learning_rate": 1.679175139471839e-05, + "loss": 3.2656, + "step": 6923 + }, + { + "epoch": 0.89, + "grad_norm": 0.5813849568367004, + "learning_rate": 1.6754423406744323e-05, + "loss": 3.3181, + "step": 6924 + }, + { + "epoch": 0.89, + "grad_norm": 0.6777455806732178, + "learning_rate": 1.6717135516388925e-05, + "loss": 3.3117, + "step": 6925 + }, + { + "epoch": 0.89, + "grad_norm": 0.657409131526947, + "learning_rate": 1.6679887730062404e-05, + "loss": 3.2066, + "step": 6926 + }, + { + "epoch": 0.89, + "grad_norm": 0.6671031713485718, + "learning_rate": 1.6642680054168026e-05, + "loss": 3.3455, + "step": 6927 + }, + { + "epoch": 0.89, + "grad_norm": 0.5967366695404053, + "learning_rate": 1.6605512495102282e-05, + "loss": 3.2231, + "step": 6928 + }, + { + "epoch": 0.89, + "grad_norm": 0.6444880366325378, + "learning_rate": 1.656838505925462e-05, + "loss": 3.2665, + "step": 6929 + }, + { + "epoch": 0.89, + "grad_norm": 0.6808575391769409, + "learning_rate": 1.6531297753007795e-05, + "loss": 3.2324, + "step": 6930 + }, + { + "epoch": 0.89, + "grad_norm": 0.6104139089584351, + "learning_rate": 1.649425058273743e-05, + "loss": 3.2293, + "step": 6931 + }, + { + "epoch": 0.89, + "grad_norm": 0.6585542559623718, + "learning_rate": 1.6457243554812486e-05, + "loss": 3.1955, + "step": 6932 + }, + { + "epoch": 0.89, + "grad_norm": 0.6599720120429993, + "learning_rate": 1.6420276675594814e-05, + "loss": 3.4477, + "step": 6933 + }, + { + "epoch": 0.89, + "grad_norm": 0.6509204506874084, + "learning_rate": 1.6383349951439475e-05, + "loss": 3.2605, + "step": 6934 + }, + { + "epoch": 0.89, + "grad_norm": 0.7017077207565308, + "learning_rate": 1.634646338869461e-05, + "loss": 3.2774, + "step": 6935 + }, + { + "epoch": 0.89, + "grad_norm": 0.6533510684967041, + "learning_rate": 1.6309616993701426e-05, + "loss": 3.3077, + "step": 6936 + }, + { + "epoch": 0.89, + "grad_norm": 0.6311295032501221, + "learning_rate": 1.6272810772794218e-05, + "loss": 3.2622, + "step": 6937 + }, + { + "epoch": 0.89, + "grad_norm": 0.6376855969429016, + "learning_rate": 1.623604473230042e-05, + "loss": 3.287, + "step": 6938 + }, + { + "epoch": 0.89, + "grad_norm": 0.683590829372406, + "learning_rate": 1.6199318878540593e-05, + "loss": 3.3705, + "step": 6939 + }, + { + "epoch": 0.89, + "grad_norm": 0.6251614689826965, + "learning_rate": 1.616263321782832e-05, + "loss": 3.295, + "step": 6940 + }, + { + "epoch": 0.89, + "grad_norm": 0.608283281326294, + "learning_rate": 1.6125987756470257e-05, + "loss": 3.1217, + "step": 6941 + }, + { + "epoch": 0.89, + "grad_norm": 0.6173918843269348, + "learning_rate": 1.6089382500766193e-05, + "loss": 3.299, + "step": 6942 + }, + { + "epoch": 0.89, + "grad_norm": 0.6270238161087036, + "learning_rate": 1.605281745700904e-05, + "loss": 3.2012, + "step": 6943 + }, + { + "epoch": 0.89, + "grad_norm": 0.670943021774292, + "learning_rate": 1.6016292631484684e-05, + "loss": 3.2473, + "step": 6944 + }, + { + "epoch": 0.89, + "grad_norm": 0.6275277137756348, + "learning_rate": 1.5979808030472164e-05, + "loss": 3.0922, + "step": 6945 + }, + { + "epoch": 0.89, + "grad_norm": 0.6165827512741089, + "learning_rate": 1.5943363660243655e-05, + "loss": 3.2648, + "step": 6946 + }, + { + "epoch": 0.89, + "grad_norm": 0.6089856028556824, + "learning_rate": 1.5906959527064334e-05, + "loss": 3.2855, + "step": 6947 + }, + { + "epoch": 0.89, + "grad_norm": 0.6459159255027771, + "learning_rate": 1.5870595637192535e-05, + "loss": 3.268, + "step": 6948 + }, + { + "epoch": 0.89, + "grad_norm": 0.669853150844574, + "learning_rate": 1.5834271996879644e-05, + "loss": 3.1253, + "step": 6949 + }, + { + "epoch": 0.89, + "grad_norm": 0.6710006594657898, + "learning_rate": 1.579798861237003e-05, + "loss": 3.1732, + "step": 6950 + }, + { + "epoch": 0.89, + "grad_norm": 0.6368153691291809, + "learning_rate": 1.5761745489901307e-05, + "loss": 3.28, + "step": 6951 + }, + { + "epoch": 0.89, + "grad_norm": 0.6223729848861694, + "learning_rate": 1.5725542635704026e-05, + "loss": 3.1827, + "step": 6952 + }, + { + "epoch": 0.89, + "grad_norm": 0.6653406023979187, + "learning_rate": 1.5689380056001927e-05, + "loss": 3.3634, + "step": 6953 + }, + { + "epoch": 0.89, + "grad_norm": 0.7627063393592834, + "learning_rate": 1.5653257757011763e-05, + "loss": 3.3921, + "step": 6954 + }, + { + "epoch": 0.89, + "grad_norm": 0.6704054474830627, + "learning_rate": 1.5617175744943368e-05, + "loss": 3.3215, + "step": 6955 + }, + { + "epoch": 0.89, + "grad_norm": 0.6059884428977966, + "learning_rate": 1.5581134025999644e-05, + "loss": 3.3057, + "step": 6956 + }, + { + "epoch": 0.89, + "grad_norm": 0.6775001287460327, + "learning_rate": 1.5545132606376605e-05, + "loss": 3.3801, + "step": 6957 + }, + { + "epoch": 0.89, + "grad_norm": 0.6409911513328552, + "learning_rate": 1.5509171492263302e-05, + "loss": 3.2189, + "step": 6958 + }, + { + "epoch": 0.89, + "grad_norm": 0.6561943888664246, + "learning_rate": 1.5473250689841843e-05, + "loss": 3.3577, + "step": 6959 + }, + { + "epoch": 0.89, + "grad_norm": 0.6522544026374817, + "learning_rate": 1.5437370205287515e-05, + "loss": 3.4143, + "step": 6960 + }, + { + "epoch": 0.89, + "grad_norm": 0.6523046493530273, + "learning_rate": 1.540153004476852e-05, + "loss": 3.1973, + "step": 6961 + }, + { + "epoch": 0.89, + "grad_norm": 0.6480640769004822, + "learning_rate": 1.5365730214446204e-05, + "loss": 3.1805, + "step": 6962 + }, + { + "epoch": 0.89, + "grad_norm": 0.6580466628074646, + "learning_rate": 1.5329970720474985e-05, + "loss": 3.3363, + "step": 6963 + }, + { + "epoch": 0.89, + "grad_norm": 0.6183872818946838, + "learning_rate": 1.52942515690023e-05, + "loss": 3.3039, + "step": 6964 + }, + { + "epoch": 0.89, + "grad_norm": 0.6598934531211853, + "learning_rate": 1.5258572766168738e-05, + "loss": 3.3041, + "step": 6965 + }, + { + "epoch": 0.89, + "grad_norm": 0.7518994808197021, + "learning_rate": 1.5222934318107839e-05, + "loss": 3.4231, + "step": 6966 + }, + { + "epoch": 0.89, + "grad_norm": 0.6579861640930176, + "learning_rate": 1.5187336230946285e-05, + "loss": 3.3033, + "step": 6967 + }, + { + "epoch": 0.89, + "grad_norm": 0.6535706520080566, + "learning_rate": 1.5151778510803877e-05, + "loss": 3.3467, + "step": 6968 + }, + { + "epoch": 0.89, + "grad_norm": 0.6427934765815735, + "learning_rate": 1.5116261163793332e-05, + "loss": 3.1088, + "step": 6969 + }, + { + "epoch": 0.89, + "grad_norm": 0.6205170750617981, + "learning_rate": 1.5080784196020491e-05, + "loss": 3.2518, + "step": 6970 + }, + { + "epoch": 0.89, + "grad_norm": 0.6465139389038086, + "learning_rate": 1.5045347613584253e-05, + "loss": 3.3603, + "step": 6971 + }, + { + "epoch": 0.89, + "grad_norm": 0.6643823385238647, + "learning_rate": 1.5009951422576607e-05, + "loss": 3.2904, + "step": 6972 + }, + { + "epoch": 0.89, + "grad_norm": 0.6664116382598877, + "learning_rate": 1.4974595629082488e-05, + "loss": 3.3235, + "step": 6973 + }, + { + "epoch": 0.89, + "grad_norm": 0.6498314142227173, + "learning_rate": 1.4939280239180091e-05, + "loss": 3.29, + "step": 6974 + }, + { + "epoch": 0.89, + "grad_norm": 0.6376214027404785, + "learning_rate": 1.4904005258940424e-05, + "loss": 3.3026, + "step": 6975 + }, + { + "epoch": 0.89, + "grad_norm": 0.6093677878379822, + "learning_rate": 1.4868770694427768e-05, + "loss": 3.2407, + "step": 6976 + }, + { + "epoch": 0.89, + "grad_norm": 0.6138312220573425, + "learning_rate": 1.4833576551699285e-05, + "loss": 3.258, + "step": 6977 + }, + { + "epoch": 0.89, + "grad_norm": 0.6545621752738953, + "learning_rate": 1.4798422836805298e-05, + "loss": 3.3727, + "step": 6978 + }, + { + "epoch": 0.89, + "grad_norm": 0.6456170678138733, + "learning_rate": 1.4763309555789111e-05, + "loss": 3.2684, + "step": 6979 + }, + { + "epoch": 0.89, + "grad_norm": 0.6681527495384216, + "learning_rate": 1.4728236714687066e-05, + "loss": 3.498, + "step": 6980 + }, + { + "epoch": 0.89, + "grad_norm": 0.644694447517395, + "learning_rate": 1.4693204319528696e-05, + "loss": 3.1514, + "step": 6981 + }, + { + "epoch": 0.89, + "grad_norm": 0.6642745137214661, + "learning_rate": 1.4658212376336384e-05, + "loss": 3.3157, + "step": 6982 + }, + { + "epoch": 0.89, + "grad_norm": 0.6468070149421692, + "learning_rate": 1.46232608911257e-05, + "loss": 3.3904, + "step": 6983 + }, + { + "epoch": 0.89, + "grad_norm": 0.6560948491096497, + "learning_rate": 1.4588349869905149e-05, + "loss": 3.3689, + "step": 6984 + }, + { + "epoch": 0.89, + "grad_norm": 0.6244742274284363, + "learning_rate": 1.4553479318676398e-05, + "loss": 3.3091, + "step": 6985 + }, + { + "epoch": 0.89, + "grad_norm": 0.6304239630699158, + "learning_rate": 1.45186492434341e-05, + "loss": 3.306, + "step": 6986 + }, + { + "epoch": 0.89, + "grad_norm": 0.6679442524909973, + "learning_rate": 1.4483859650165937e-05, + "loss": 3.3438, + "step": 6987 + }, + { + "epoch": 0.89, + "grad_norm": 0.6513766050338745, + "learning_rate": 1.4449110544852596e-05, + "loss": 3.266, + "step": 6988 + }, + { + "epoch": 0.89, + "grad_norm": 0.6204814910888672, + "learning_rate": 1.4414401933467907e-05, + "loss": 3.3811, + "step": 6989 + }, + { + "epoch": 0.89, + "grad_norm": 0.6290676593780518, + "learning_rate": 1.4379733821978686e-05, + "loss": 3.3356, + "step": 6990 + }, + { + "epoch": 0.89, + "grad_norm": 0.6321442127227783, + "learning_rate": 1.4345106216344772e-05, + "loss": 3.2128, + "step": 6991 + }, + { + "epoch": 0.89, + "grad_norm": 0.6475712656974792, + "learning_rate": 1.4310519122519045e-05, + "loss": 3.4042, + "step": 6992 + }, + { + "epoch": 0.9, + "grad_norm": 0.6464070677757263, + "learning_rate": 1.4275972546447412e-05, + "loss": 3.333, + "step": 6993 + }, + { + "epoch": 0.9, + "grad_norm": 0.6454011797904968, + "learning_rate": 1.4241466494068822e-05, + "loss": 3.1697, + "step": 6994 + }, + { + "epoch": 0.9, + "grad_norm": 0.6559916734695435, + "learning_rate": 1.4207000971315276e-05, + "loss": 3.2271, + "step": 6995 + }, + { + "epoch": 0.9, + "grad_norm": 0.7605160474777222, + "learning_rate": 1.4172575984111869e-05, + "loss": 3.2859, + "step": 6996 + }, + { + "epoch": 0.9, + "grad_norm": 0.6258525252342224, + "learning_rate": 1.4138191538376587e-05, + "loss": 3.3107, + "step": 6997 + }, + { + "epoch": 0.9, + "grad_norm": 0.6340823769569397, + "learning_rate": 1.4103847640020511e-05, + "loss": 3.2286, + "step": 6998 + }, + { + "epoch": 0.9, + "grad_norm": 0.6625699996948242, + "learning_rate": 1.4069544294947779e-05, + "loss": 3.1828, + "step": 6999 + }, + { + "epoch": 0.9, + "grad_norm": 0.640891432762146, + "learning_rate": 1.4035281509055531e-05, + "loss": 3.2585, + "step": 7000 + }, + { + "epoch": 0.9, + "grad_norm": 0.6654657125473022, + "learning_rate": 1.4001059288233892e-05, + "loss": 3.3142, + "step": 7001 + }, + { + "epoch": 0.9, + "grad_norm": 0.7290911674499512, + "learning_rate": 1.3966877638366127e-05, + "loss": 3.4767, + "step": 7002 + }, + { + "epoch": 0.9, + "grad_norm": 0.6988948583602905, + "learning_rate": 1.3932736565328396e-05, + "loss": 3.2548, + "step": 7003 + }, + { + "epoch": 0.9, + "grad_norm": 0.6619759202003479, + "learning_rate": 1.389863607498998e-05, + "loss": 3.3499, + "step": 7004 + }, + { + "epoch": 0.9, + "grad_norm": 0.6396152377128601, + "learning_rate": 1.3864576173213183e-05, + "loss": 3.2954, + "step": 7005 + }, + { + "epoch": 0.9, + "grad_norm": 0.6701951026916504, + "learning_rate": 1.3830556865853244e-05, + "loss": 3.2475, + "step": 7006 + }, + { + "epoch": 0.9, + "grad_norm": 0.6315203309059143, + "learning_rate": 1.3796578158758483e-05, + "loss": 3.243, + "step": 7007 + }, + { + "epoch": 0.9, + "grad_norm": 0.6182517409324646, + "learning_rate": 1.3762640057770253e-05, + "loss": 3.2879, + "step": 7008 + }, + { + "epoch": 0.9, + "grad_norm": 0.6174279451370239, + "learning_rate": 1.3728742568722864e-05, + "loss": 3.1597, + "step": 7009 + }, + { + "epoch": 0.9, + "grad_norm": 0.6286531686782837, + "learning_rate": 1.369488569744376e-05, + "loss": 3.2261, + "step": 7010 + }, + { + "epoch": 0.9, + "grad_norm": 0.6284041404724121, + "learning_rate": 1.366106944975326e-05, + "loss": 3.2638, + "step": 7011 + }, + { + "epoch": 0.9, + "grad_norm": 0.711616575717926, + "learning_rate": 1.3627293831464771e-05, + "loss": 3.3522, + "step": 7012 + }, + { + "epoch": 0.9, + "grad_norm": 0.649397075176239, + "learning_rate": 1.3593558848384785e-05, + "loss": 3.3519, + "step": 7013 + }, + { + "epoch": 0.9, + "grad_norm": 0.6184853315353394, + "learning_rate": 1.3559864506312691e-05, + "loss": 3.159, + "step": 7014 + }, + { + "epoch": 0.9, + "grad_norm": 0.6492748856544495, + "learning_rate": 1.352621081104094e-05, + "loss": 3.2205, + "step": 7015 + }, + { + "epoch": 0.9, + "grad_norm": 0.6151764988899231, + "learning_rate": 1.3492597768354959e-05, + "loss": 3.3327, + "step": 7016 + }, + { + "epoch": 0.9, + "grad_norm": 0.6303196549415588, + "learning_rate": 1.3459025384033264e-05, + "loss": 3.1337, + "step": 7017 + }, + { + "epoch": 0.9, + "grad_norm": 0.6831106543540955, + "learning_rate": 1.3425493663847349e-05, + "loss": 3.1622, + "step": 7018 + }, + { + "epoch": 0.9, + "grad_norm": 0.6606957912445068, + "learning_rate": 1.339200261356166e-05, + "loss": 3.1899, + "step": 7019 + }, + { + "epoch": 0.9, + "grad_norm": 0.6598381400108337, + "learning_rate": 1.335855223893373e-05, + "loss": 3.2937, + "step": 7020 + }, + { + "epoch": 0.9, + "grad_norm": 0.6386822462081909, + "learning_rate": 1.3325142545714014e-05, + "loss": 3.294, + "step": 7021 + }, + { + "epoch": 0.9, + "grad_norm": 0.6143596768379211, + "learning_rate": 1.3291773539646112e-05, + "loss": 3.1769, + "step": 7022 + }, + { + "epoch": 0.9, + "grad_norm": 0.6505967378616333, + "learning_rate": 1.3258445226466464e-05, + "loss": 3.2047, + "step": 7023 + }, + { + "epoch": 0.9, + "grad_norm": 0.6997924447059631, + "learning_rate": 1.3225157611904625e-05, + "loss": 3.4018, + "step": 7024 + }, + { + "epoch": 0.9, + "grad_norm": 0.6924464106559753, + "learning_rate": 1.3191910701683129e-05, + "loss": 3.3137, + "step": 7025 + }, + { + "epoch": 0.9, + "grad_norm": 0.6976944804191589, + "learning_rate": 1.3158704501517516e-05, + "loss": 3.2224, + "step": 7026 + }, + { + "epoch": 0.9, + "grad_norm": 0.6997489333152771, + "learning_rate": 1.31255390171163e-05, + "loss": 3.3012, + "step": 7027 + }, + { + "epoch": 0.9, + "grad_norm": 0.641872227191925, + "learning_rate": 1.3092414254181006e-05, + "loss": 3.2138, + "step": 7028 + }, + { + "epoch": 0.9, + "grad_norm": 0.6068212389945984, + "learning_rate": 1.3059330218406162e-05, + "loss": 3.2716, + "step": 7029 + }, + { + "epoch": 0.9, + "grad_norm": 0.6634566187858582, + "learning_rate": 1.3026286915479273e-05, + "loss": 3.3622, + "step": 7030 + }, + { + "epoch": 0.9, + "grad_norm": 0.6330064535140991, + "learning_rate": 1.2993284351080909e-05, + "loss": 3.2639, + "step": 7031 + }, + { + "epoch": 0.9, + "grad_norm": 0.6173626780509949, + "learning_rate": 1.296032253088461e-05, + "loss": 3.2587, + "step": 7032 + }, + { + "epoch": 0.9, + "grad_norm": 0.7010987401008606, + "learning_rate": 1.2927401460556876e-05, + "loss": 3.2308, + "step": 7033 + }, + { + "epoch": 0.9, + "grad_norm": 0.6823770999908447, + "learning_rate": 1.2894521145757205e-05, + "loss": 3.3011, + "step": 7034 + }, + { + "epoch": 0.9, + "grad_norm": 0.6275352239608765, + "learning_rate": 1.2861681592138103e-05, + "loss": 3.1642, + "step": 7035 + }, + { + "epoch": 0.9, + "grad_norm": 0.6606463193893433, + "learning_rate": 1.282888280534511e-05, + "loss": 3.3432, + "step": 7036 + }, + { + "epoch": 0.9, + "grad_norm": 0.5987416505813599, + "learning_rate": 1.2796124791016605e-05, + "loss": 3.1984, + "step": 7037 + }, + { + "epoch": 0.9, + "grad_norm": 0.6557232737541199, + "learning_rate": 1.2763407554784223e-05, + "loss": 3.2765, + "step": 7038 + }, + { + "epoch": 0.9, + "grad_norm": 0.7637114524841309, + "learning_rate": 1.2730731102272352e-05, + "loss": 3.3681, + "step": 7039 + }, + { + "epoch": 0.9, + "grad_norm": 0.6272114515304565, + "learning_rate": 1.2698095439098445e-05, + "loss": 3.1892, + "step": 7040 + }, + { + "epoch": 0.9, + "grad_norm": 0.6716676950454712, + "learning_rate": 1.2665500570872984e-05, + "loss": 3.3333, + "step": 7041 + }, + { + "epoch": 0.9, + "grad_norm": 0.7103142142295837, + "learning_rate": 1.2632946503199406e-05, + "loss": 3.2958, + "step": 7042 + }, + { + "epoch": 0.9, + "grad_norm": 0.6846051216125488, + "learning_rate": 1.260043324167412e-05, + "loss": 3.3182, + "step": 7043 + }, + { + "epoch": 0.9, + "grad_norm": 0.6335909366607666, + "learning_rate": 1.2567960791886518e-05, + "loss": 3.3162, + "step": 7044 + }, + { + "epoch": 0.9, + "grad_norm": 0.6781107783317566, + "learning_rate": 1.2535529159418968e-05, + "loss": 3.2821, + "step": 7045 + }, + { + "epoch": 0.9, + "grad_norm": 0.5994791984558105, + "learning_rate": 1.2503138349846926e-05, + "loss": 3.3098, + "step": 7046 + }, + { + "epoch": 0.9, + "grad_norm": 0.6859517693519592, + "learning_rate": 1.2470788368738717e-05, + "loss": 3.1433, + "step": 7047 + }, + { + "epoch": 0.9, + "grad_norm": 0.6935350298881531, + "learning_rate": 1.2438479221655641e-05, + "loss": 3.3322, + "step": 7048 + }, + { + "epoch": 0.9, + "grad_norm": 0.6661397218704224, + "learning_rate": 1.2406210914152005e-05, + "loss": 3.213, + "step": 7049 + }, + { + "epoch": 0.9, + "grad_norm": 0.6652267575263977, + "learning_rate": 1.2373983451775179e-05, + "loss": 3.1398, + "step": 7050 + }, + { + "epoch": 0.9, + "grad_norm": 0.652113139629364, + "learning_rate": 1.2341796840065366e-05, + "loss": 3.3114, + "step": 7051 + }, + { + "epoch": 0.9, + "grad_norm": 0.6672348976135254, + "learning_rate": 1.230965108455584e-05, + "loss": 3.364, + "step": 7052 + }, + { + "epoch": 0.9, + "grad_norm": 0.6418783664703369, + "learning_rate": 1.227754619077287e-05, + "loss": 3.3023, + "step": 7053 + }, + { + "epoch": 0.9, + "grad_norm": 0.637625515460968, + "learning_rate": 1.2245482164235627e-05, + "loss": 3.2907, + "step": 7054 + }, + { + "epoch": 0.9, + "grad_norm": 0.6356735825538635, + "learning_rate": 1.2213459010456285e-05, + "loss": 3.1777, + "step": 7055 + }, + { + "epoch": 0.9, + "grad_norm": 0.6418339014053345, + "learning_rate": 1.2181476734939968e-05, + "loss": 3.2971, + "step": 7056 + }, + { + "epoch": 0.9, + "grad_norm": 0.7009538412094116, + "learning_rate": 1.2149535343184858e-05, + "loss": 3.3479, + "step": 7057 + }, + { + "epoch": 0.9, + "grad_norm": 0.6452652812004089, + "learning_rate": 1.2117634840681984e-05, + "loss": 3.1921, + "step": 7058 + }, + { + "epoch": 0.9, + "grad_norm": 0.6060606837272644, + "learning_rate": 1.2085775232915485e-05, + "loss": 3.3612, + "step": 7059 + }, + { + "epoch": 0.9, + "grad_norm": 0.6534036993980408, + "learning_rate": 1.2053956525362314e-05, + "loss": 3.3251, + "step": 7060 + }, + { + "epoch": 0.9, + "grad_norm": 0.6830912828445435, + "learning_rate": 1.2022178723492566e-05, + "loss": 3.3881, + "step": 7061 + }, + { + "epoch": 0.9, + "grad_norm": 0.6198063492774963, + "learning_rate": 1.1990441832769178e-05, + "loss": 3.2956, + "step": 7062 + }, + { + "epoch": 0.9, + "grad_norm": 0.6613783240318298, + "learning_rate": 1.195874585864809e-05, + "loss": 3.2565, + "step": 7063 + }, + { + "epoch": 0.9, + "grad_norm": 0.6410684585571289, + "learning_rate": 1.1927090806578195e-05, + "loss": 3.2802, + "step": 7064 + }, + { + "epoch": 0.9, + "grad_norm": 0.6888789534568787, + "learning_rate": 1.189547668200136e-05, + "loss": 3.2388, + "step": 7065 + }, + { + "epoch": 0.9, + "grad_norm": 0.6376722455024719, + "learning_rate": 1.1863903490352379e-05, + "loss": 3.3282, + "step": 7066 + }, + { + "epoch": 0.9, + "grad_norm": 0.6477070450782776, + "learning_rate": 1.1832371237059158e-05, + "loss": 3.2316, + "step": 7067 + }, + { + "epoch": 0.9, + "grad_norm": 0.6830076575279236, + "learning_rate": 1.1800879927542335e-05, + "loss": 3.3049, + "step": 7068 + }, + { + "epoch": 0.9, + "grad_norm": 0.6432408690452576, + "learning_rate": 1.1769429567215773e-05, + "loss": 3.1772, + "step": 7069 + }, + { + "epoch": 0.9, + "grad_norm": 0.6343344449996948, + "learning_rate": 1.1738020161486035e-05, + "loss": 3.153, + "step": 7070 + }, + { + "epoch": 0.91, + "grad_norm": 0.6284183263778687, + "learning_rate": 1.1706651715752803e-05, + "loss": 3.2556, + "step": 7071 + }, + { + "epoch": 0.91, + "grad_norm": 0.6398439407348633, + "learning_rate": 1.1675324235408707e-05, + "loss": 3.3268, + "step": 7072 + }, + { + "epoch": 0.91, + "grad_norm": 0.6415259838104248, + "learning_rate": 1.164403772583919e-05, + "loss": 3.3415, + "step": 7073 + }, + { + "epoch": 0.91, + "grad_norm": 0.6458741426467896, + "learning_rate": 1.1612792192422922e-05, + "loss": 3.2342, + "step": 7074 + }, + { + "epoch": 0.91, + "grad_norm": 0.6145205497741699, + "learning_rate": 1.1581587640531272e-05, + "loss": 3.2521, + "step": 7075 + }, + { + "epoch": 0.91, + "grad_norm": 0.6816922426223755, + "learning_rate": 1.1550424075528697e-05, + "loss": 3.2076, + "step": 7076 + }, + { + "epoch": 0.91, + "grad_norm": 0.7199867367744446, + "learning_rate": 1.1519301502772556e-05, + "loss": 3.3487, + "step": 7077 + }, + { + "epoch": 0.91, + "grad_norm": 0.6625239253044128, + "learning_rate": 1.1488219927613202e-05, + "loss": 3.2905, + "step": 7078 + }, + { + "epoch": 0.91, + "grad_norm": 0.615695059299469, + "learning_rate": 1.145717935539392e-05, + "loss": 3.3327, + "step": 7079 + }, + { + "epoch": 0.91, + "grad_norm": 0.669804573059082, + "learning_rate": 1.1426179791450913e-05, + "loss": 3.3124, + "step": 7080 + }, + { + "epoch": 0.91, + "grad_norm": 0.6174100041389465, + "learning_rate": 1.1395221241113363e-05, + "loss": 3.2424, + "step": 7081 + }, + { + "epoch": 0.91, + "grad_norm": 0.6850603818893433, + "learning_rate": 1.1364303709703482e-05, + "loss": 3.3598, + "step": 7082 + }, + { + "epoch": 0.91, + "grad_norm": 0.6397016048431396, + "learning_rate": 1.1333427202536273e-05, + "loss": 3.2598, + "step": 7083 + }, + { + "epoch": 0.91, + "grad_norm": 0.6480134129524231, + "learning_rate": 1.1302591724919791e-05, + "loss": 3.3201, + "step": 7084 + }, + { + "epoch": 0.91, + "grad_norm": 0.6504730582237244, + "learning_rate": 1.1271797282154994e-05, + "loss": 3.28, + "step": 7085 + }, + { + "epoch": 0.91, + "grad_norm": 0.646886944770813, + "learning_rate": 1.1241043879535811e-05, + "loss": 3.1932, + "step": 7086 + }, + { + "epoch": 0.91, + "grad_norm": 0.6623911261558533, + "learning_rate": 1.1210331522349126e-05, + "loss": 3.2725, + "step": 7087 + }, + { + "epoch": 0.91, + "grad_norm": 0.6475020051002502, + "learning_rate": 1.1179660215874715e-05, + "loss": 3.2217, + "step": 7088 + }, + { + "epoch": 0.91, + "grad_norm": 0.6451123952865601, + "learning_rate": 1.1149029965385416e-05, + "loss": 3.1674, + "step": 7089 + }, + { + "epoch": 0.91, + "grad_norm": 0.7544106245040894, + "learning_rate": 1.1118440776146821e-05, + "loss": 3.206, + "step": 7090 + }, + { + "epoch": 0.91, + "grad_norm": 0.6707612872123718, + "learning_rate": 1.1087892653417642e-05, + "loss": 3.2419, + "step": 7091 + }, + { + "epoch": 0.91, + "grad_norm": 0.6471689343452454, + "learning_rate": 1.10573856024494e-05, + "loss": 3.1596, + "step": 7092 + }, + { + "epoch": 0.91, + "grad_norm": 0.7012263536453247, + "learning_rate": 1.1026919628486647e-05, + "loss": 3.2828, + "step": 7093 + }, + { + "epoch": 0.91, + "grad_norm": 0.6403650641441345, + "learning_rate": 1.0996494736766782e-05, + "loss": 3.2227, + "step": 7094 + }, + { + "epoch": 0.91, + "grad_norm": 0.6718841791152954, + "learning_rate": 1.0966110932520285e-05, + "loss": 3.1116, + "step": 7095 + }, + { + "epoch": 0.91, + "grad_norm": 0.5933142900466919, + "learning_rate": 1.0935768220970393e-05, + "loss": 3.1487, + "step": 7096 + }, + { + "epoch": 0.91, + "grad_norm": 0.6481373906135559, + "learning_rate": 1.0905466607333465e-05, + "loss": 3.2511, + "step": 7097 + }, + { + "epoch": 0.91, + "grad_norm": 0.6603955030441284, + "learning_rate": 1.0875206096818607e-05, + "loss": 3.1641, + "step": 7098 + }, + { + "epoch": 0.91, + "grad_norm": 0.6217958331108093, + "learning_rate": 1.0844986694628022e-05, + "loss": 3.2658, + "step": 7099 + }, + { + "epoch": 0.91, + "grad_norm": 0.6261694431304932, + "learning_rate": 1.081480840595675e-05, + "loss": 3.2886, + "step": 7100 + }, + { + "epoch": 0.91, + "grad_norm": 0.6567366123199463, + "learning_rate": 1.0784671235992777e-05, + "loss": 3.2645, + "step": 7101 + }, + { + "epoch": 0.91, + "grad_norm": 0.6423852443695068, + "learning_rate": 1.0754575189917015e-05, + "loss": 3.2489, + "step": 7102 + }, + { + "epoch": 0.91, + "grad_norm": 0.6532287001609802, + "learning_rate": 1.0724520272903382e-05, + "loss": 3.3499, + "step": 7103 + }, + { + "epoch": 0.91, + "grad_norm": 0.67435222864151, + "learning_rate": 1.0694506490118632e-05, + "loss": 3.1951, + "step": 7104 + }, + { + "epoch": 0.91, + "grad_norm": 0.6489763855934143, + "learning_rate": 1.0664533846722447e-05, + "loss": 3.2103, + "step": 7105 + }, + { + "epoch": 0.91, + "grad_norm": 0.6705418825149536, + "learning_rate": 1.0634602347867533e-05, + "loss": 3.2122, + "step": 7106 + }, + { + "epoch": 0.91, + "grad_norm": 0.6227537393569946, + "learning_rate": 1.0604711998699445e-05, + "loss": 3.133, + "step": 7107 + }, + { + "epoch": 0.91, + "grad_norm": 0.6649758219718933, + "learning_rate": 1.0574862804356683e-05, + "loss": 3.3684, + "step": 7108 + }, + { + "epoch": 0.91, + "grad_norm": 0.7490100860595703, + "learning_rate": 1.0545054769970614e-05, + "loss": 3.4781, + "step": 7109 + }, + { + "epoch": 0.91, + "grad_norm": 0.6477136611938477, + "learning_rate": 1.0515287900665666e-05, + "loss": 3.1671, + "step": 7110 + }, + { + "epoch": 0.91, + "grad_norm": 0.6387073397636414, + "learning_rate": 1.0485562201559079e-05, + "loss": 3.3198, + "step": 7111 + }, + { + "epoch": 0.91, + "grad_norm": 0.648479700088501, + "learning_rate": 1.0455877677761044e-05, + "loss": 3.2667, + "step": 7112 + }, + { + "epoch": 0.91, + "grad_norm": 0.6808549761772156, + "learning_rate": 1.0426234334374647e-05, + "loss": 3.2661, + "step": 7113 + }, + { + "epoch": 0.91, + "grad_norm": 0.6373841166496277, + "learning_rate": 1.0396632176495946e-05, + "loss": 3.2111, + "step": 7114 + }, + { + "epoch": 0.91, + "grad_norm": 0.6293783187866211, + "learning_rate": 1.0367071209213902e-05, + "loss": 3.3596, + "step": 7115 + }, + { + "epoch": 0.91, + "grad_norm": 0.6526897549629211, + "learning_rate": 1.0337551437610365e-05, + "loss": 3.3849, + "step": 7116 + }, + { + "epoch": 0.91, + "grad_norm": 0.6583970785140991, + "learning_rate": 1.0308072866760137e-05, + "loss": 3.2599, + "step": 7117 + }, + { + "epoch": 0.91, + "grad_norm": 0.6063961982727051, + "learning_rate": 1.027863550173097e-05, + "loss": 3.3776, + "step": 7118 + }, + { + "epoch": 0.91, + "grad_norm": 0.6646319627761841, + "learning_rate": 1.0249239347583428e-05, + "loss": 3.2485, + "step": 7119 + }, + { + "epoch": 0.91, + "grad_norm": 0.7042518854141235, + "learning_rate": 1.0219884409371077e-05, + "loss": 3.2221, + "step": 7120 + }, + { + "epoch": 0.91, + "grad_norm": 0.685055673122406, + "learning_rate": 1.0190570692140355e-05, + "loss": 3.2521, + "step": 7121 + }, + { + "epoch": 0.91, + "grad_norm": 0.6594273447990417, + "learning_rate": 1.0161298200930647e-05, + "loss": 3.3004, + "step": 7122 + }, + { + "epoch": 0.91, + "grad_norm": 0.61384117603302, + "learning_rate": 1.0132066940774203e-05, + "loss": 3.2558, + "step": 7123 + }, + { + "epoch": 0.91, + "grad_norm": 0.6446554660797119, + "learning_rate": 1.010287691669623e-05, + "loss": 3.265, + "step": 7124 + }, + { + "epoch": 0.91, + "grad_norm": 0.6155528426170349, + "learning_rate": 1.0073728133714877e-05, + "loss": 3.23, + "step": 7125 + }, + { + "epoch": 0.91, + "grad_norm": 0.5826809406280518, + "learning_rate": 1.0044620596841136e-05, + "loss": 3.171, + "step": 7126 + }, + { + "epoch": 0.91, + "grad_norm": 0.7251071333885193, + "learning_rate": 1.0015554311078895e-05, + "loss": 3.3523, + "step": 7127 + }, + { + "epoch": 0.91, + "grad_norm": 0.6796685457229614, + "learning_rate": 9.986529281425016e-06, + "loss": 3.3612, + "step": 7128 + }, + { + "epoch": 0.91, + "grad_norm": 0.6957389712333679, + "learning_rate": 9.957545512869231e-06, + "loss": 3.2444, + "step": 7129 + }, + { + "epoch": 0.91, + "grad_norm": 0.6471418738365173, + "learning_rate": 9.928603010394138e-06, + "loss": 3.2354, + "step": 7130 + }, + { + "epoch": 0.91, + "grad_norm": 0.6198047995567322, + "learning_rate": 9.899701778975395e-06, + "loss": 3.1523, + "step": 7131 + }, + { + "epoch": 0.91, + "grad_norm": 0.6490558981895447, + "learning_rate": 9.870841823581362e-06, + "loss": 3.191, + "step": 7132 + }, + { + "epoch": 0.91, + "grad_norm": 0.6650616526603699, + "learning_rate": 9.842023149173428e-06, + "loss": 3.3283, + "step": 7133 + }, + { + "epoch": 0.91, + "grad_norm": 0.6575379371643066, + "learning_rate": 9.813245760705886e-06, + "loss": 3.3364, + "step": 7134 + }, + { + "epoch": 0.91, + "grad_norm": 0.6252859830856323, + "learning_rate": 9.784509663125884e-06, + "loss": 3.3314, + "step": 7135 + }, + { + "epoch": 0.91, + "grad_norm": 0.6415067911148071, + "learning_rate": 9.755814861373502e-06, + "loss": 3.0821, + "step": 7136 + }, + { + "epoch": 0.91, + "grad_norm": 0.6143196225166321, + "learning_rate": 9.727161360381681e-06, + "loss": 3.3058, + "step": 7137 + }, + { + "epoch": 0.91, + "grad_norm": 0.6523025035858154, + "learning_rate": 9.69854916507626e-06, + "loss": 3.2572, + "step": 7138 + }, + { + "epoch": 0.91, + "grad_norm": 0.6369534134864807, + "learning_rate": 9.669978280376107e-06, + "loss": 3.1735, + "step": 7139 + }, + { + "epoch": 0.91, + "grad_norm": 0.6777133345603943, + "learning_rate": 9.641448711192796e-06, + "loss": 3.1831, + "step": 7140 + }, + { + "epoch": 0.91, + "grad_norm": 0.6214920878410339, + "learning_rate": 9.61296046243096e-06, + "loss": 3.211, + "step": 7141 + }, + { + "epoch": 0.91, + "grad_norm": 0.6660655736923218, + "learning_rate": 9.584513538987983e-06, + "loss": 3.2759, + "step": 7142 + }, + { + "epoch": 0.91, + "grad_norm": 0.6243155002593994, + "learning_rate": 9.556107945754316e-06, + "loss": 3.3113, + "step": 7143 + }, + { + "epoch": 0.91, + "grad_norm": 0.6599859595298767, + "learning_rate": 9.527743687613138e-06, + "loss": 3.2715, + "step": 7144 + }, + { + "epoch": 0.91, + "grad_norm": 0.6295204758644104, + "learning_rate": 9.499420769440576e-06, + "loss": 3.2068, + "step": 7145 + }, + { + "epoch": 0.91, + "grad_norm": 0.6949333548545837, + "learning_rate": 9.471139196105732e-06, + "loss": 3.3363, + "step": 7146 + }, + { + "epoch": 0.91, + "grad_norm": 0.6422164440155029, + "learning_rate": 9.442898972470526e-06, + "loss": 3.2773, + "step": 7147 + }, + { + "epoch": 0.91, + "grad_norm": 0.6916665434837341, + "learning_rate": 9.414700103389768e-06, + "loss": 3.2584, + "step": 7148 + }, + { + "epoch": 0.92, + "grad_norm": 0.722175121307373, + "learning_rate": 9.386542593711162e-06, + "loss": 3.3599, + "step": 7149 + }, + { + "epoch": 0.92, + "grad_norm": 0.6117916107177734, + "learning_rate": 9.358426448275309e-06, + "loss": 3.1653, + "step": 7150 + }, + { + "epoch": 0.92, + "grad_norm": 0.6152571439743042, + "learning_rate": 9.330351671915676e-06, + "loss": 3.3316, + "step": 7151 + }, + { + "epoch": 0.92, + "grad_norm": 0.6863966584205627, + "learning_rate": 9.302318269458682e-06, + "loss": 3.1733, + "step": 7152 + }, + { + "epoch": 0.92, + "grad_norm": 0.6250137686729431, + "learning_rate": 9.274326245723607e-06, + "loss": 3.1781, + "step": 7153 + }, + { + "epoch": 0.92, + "grad_norm": 0.6183449625968933, + "learning_rate": 9.246375605522578e-06, + "loss": 3.4088, + "step": 7154 + }, + { + "epoch": 0.92, + "grad_norm": 0.6490113735198975, + "learning_rate": 9.218466353660637e-06, + "loss": 3.2649, + "step": 7155 + }, + { + "epoch": 0.92, + "grad_norm": 0.6815605759620667, + "learning_rate": 9.190598494935726e-06, + "loss": 3.1625, + "step": 7156 + }, + { + "epoch": 0.92, + "grad_norm": 0.623877227306366, + "learning_rate": 9.162772034138623e-06, + "loss": 3.3315, + "step": 7157 + }, + { + "epoch": 0.92, + "grad_norm": 0.654930591583252, + "learning_rate": 9.134986976053028e-06, + "loss": 3.1119, + "step": 7158 + }, + { + "epoch": 0.92, + "grad_norm": 0.6280533075332642, + "learning_rate": 9.10724332545551e-06, + "loss": 3.3371, + "step": 7159 + }, + { + "epoch": 0.92, + "grad_norm": 0.6446752548217773, + "learning_rate": 9.079541087115506e-06, + "loss": 3.271, + "step": 7160 + }, + { + "epoch": 0.92, + "grad_norm": 0.6848363876342773, + "learning_rate": 9.051880265795426e-06, + "loss": 3.2908, + "step": 7161 + }, + { + "epoch": 0.92, + "grad_norm": 0.6229819655418396, + "learning_rate": 9.024260866250439e-06, + "loss": 3.2014, + "step": 7162 + }, + { + "epoch": 0.92, + "grad_norm": 0.6721832156181335, + "learning_rate": 8.996682893228609e-06, + "loss": 3.1534, + "step": 7163 + }, + { + "epoch": 0.92, + "grad_norm": 0.6590028405189514, + "learning_rate": 8.969146351470974e-06, + "loss": 3.3421, + "step": 7164 + }, + { + "epoch": 0.92, + "grad_norm": 0.6596424579620361, + "learning_rate": 8.941651245711336e-06, + "loss": 3.1913, + "step": 7165 + }, + { + "epoch": 0.92, + "grad_norm": 0.6490575075149536, + "learning_rate": 8.914197580676409e-06, + "loss": 3.2886, + "step": 7166 + }, + { + "epoch": 0.92, + "grad_norm": 0.643104612827301, + "learning_rate": 8.886785361085865e-06, + "loss": 3.2904, + "step": 7167 + }, + { + "epoch": 0.92, + "grad_norm": 0.633247971534729, + "learning_rate": 8.859414591652126e-06, + "loss": 3.1506, + "step": 7168 + }, + { + "epoch": 0.92, + "grad_norm": 0.6761441826820374, + "learning_rate": 8.832085277080571e-06, + "loss": 3.234, + "step": 7169 + }, + { + "epoch": 0.92, + "grad_norm": 0.6587713360786438, + "learning_rate": 8.804797422069383e-06, + "loss": 3.2586, + "step": 7170 + }, + { + "epoch": 0.92, + "grad_norm": 0.6706406474113464, + "learning_rate": 8.777551031309727e-06, + "loss": 3.2162, + "step": 7171 + }, + { + "epoch": 0.92, + "grad_norm": 0.6373216509819031, + "learning_rate": 8.750346109485525e-06, + "loss": 3.2112, + "step": 7172 + }, + { + "epoch": 0.92, + "grad_norm": 0.6625058650970459, + "learning_rate": 8.723182661273615e-06, + "loss": 3.3164, + "step": 7173 + }, + { + "epoch": 0.92, + "grad_norm": 0.6026141047477722, + "learning_rate": 8.696060691343765e-06, + "loss": 3.1936, + "step": 7174 + }, + { + "epoch": 0.92, + "grad_norm": 0.6548628211021423, + "learning_rate": 8.668980204358496e-06, + "loss": 3.3851, + "step": 7175 + }, + { + "epoch": 0.92, + "grad_norm": 0.6361516714096069, + "learning_rate": 8.641941204973274e-06, + "loss": 3.3051, + "step": 7176 + }, + { + "epoch": 0.92, + "grad_norm": 0.6361994743347168, + "learning_rate": 8.61494369783644e-06, + "loss": 3.3744, + "step": 7177 + }, + { + "epoch": 0.92, + "grad_norm": 0.6931582093238831, + "learning_rate": 8.587987687589172e-06, + "loss": 3.3163, + "step": 7178 + }, + { + "epoch": 0.92, + "grad_norm": 0.6600695252418518, + "learning_rate": 8.561073178865453e-06, + "loss": 3.3885, + "step": 7179 + }, + { + "epoch": 0.92, + "grad_norm": 0.6554650068283081, + "learning_rate": 8.534200176292305e-06, + "loss": 3.2736, + "step": 7180 + }, + { + "epoch": 0.92, + "grad_norm": 0.6693041920661926, + "learning_rate": 8.507368684489397e-06, + "loss": 3.1823, + "step": 7181 + }, + { + "epoch": 0.92, + "grad_norm": 0.6274213790893555, + "learning_rate": 8.48057870806951e-06, + "loss": 3.292, + "step": 7182 + }, + { + "epoch": 0.92, + "grad_norm": 0.6852020621299744, + "learning_rate": 8.45383025163804e-06, + "loss": 3.306, + "step": 7183 + }, + { + "epoch": 0.92, + "grad_norm": 0.6665197610855103, + "learning_rate": 8.427123319793395e-06, + "loss": 3.3709, + "step": 7184 + }, + { + "epoch": 0.92, + "grad_norm": 0.6755871772766113, + "learning_rate": 8.400457917126819e-06, + "loss": 3.1938, + "step": 7185 + }, + { + "epoch": 0.92, + "grad_norm": 0.6387326717376709, + "learning_rate": 8.373834048222394e-06, + "loss": 3.3682, + "step": 7186 + }, + { + "epoch": 0.92, + "grad_norm": 0.6742923259735107, + "learning_rate": 8.347251717657018e-06, + "loss": 3.2947, + "step": 7187 + }, + { + "epoch": 0.92, + "grad_norm": 0.6418575644493103, + "learning_rate": 8.320710930000586e-06, + "loss": 3.2402, + "step": 7188 + }, + { + "epoch": 0.92, + "grad_norm": 0.6079849600791931, + "learning_rate": 8.294211689815729e-06, + "loss": 3.3532, + "step": 7189 + }, + { + "epoch": 0.92, + "grad_norm": 0.6370933055877686, + "learning_rate": 8.267754001657969e-06, + "loss": 3.2438, + "step": 7190 + }, + { + "epoch": 0.92, + "grad_norm": 0.6640612483024597, + "learning_rate": 8.241337870075721e-06, + "loss": 3.308, + "step": 7191 + }, + { + "epoch": 0.92, + "grad_norm": 0.6398223042488098, + "learning_rate": 8.214963299610189e-06, + "loss": 3.4203, + "step": 7192 + }, + { + "epoch": 0.92, + "grad_norm": 0.6995474696159363, + "learning_rate": 8.188630294795469e-06, + "loss": 3.2194, + "step": 7193 + }, + { + "epoch": 0.92, + "grad_norm": 0.5878394842147827, + "learning_rate": 8.16233886015852e-06, + "loss": 3.23, + "step": 7194 + }, + { + "epoch": 0.92, + "grad_norm": 0.66357421875, + "learning_rate": 8.136089000219144e-06, + "loss": 3.2357, + "step": 7195 + }, + { + "epoch": 0.92, + "grad_norm": 0.6442654132843018, + "learning_rate": 8.10988071949001e-06, + "loss": 3.1958, + "step": 7196 + }, + { + "epoch": 0.92, + "grad_norm": 0.6443668007850647, + "learning_rate": 8.083714022476568e-06, + "loss": 3.2293, + "step": 7197 + }, + { + "epoch": 0.92, + "grad_norm": 0.6623659729957581, + "learning_rate": 8.057588913677277e-06, + "loss": 3.0882, + "step": 7198 + }, + { + "epoch": 0.92, + "grad_norm": 0.6151493787765503, + "learning_rate": 8.031505397583267e-06, + "loss": 3.2242, + "step": 7199 + }, + { + "epoch": 0.92, + "grad_norm": 0.5729960799217224, + "learning_rate": 8.005463478678615e-06, + "loss": 3.2981, + "step": 7200 + }, + { + "epoch": 0.92, + "grad_norm": 0.695530354976654, + "learning_rate": 7.979463161440242e-06, + "loss": 3.0923, + "step": 7201 + }, + { + "epoch": 0.92, + "grad_norm": 0.6356866359710693, + "learning_rate": 7.953504450337879e-06, + "loss": 3.2798, + "step": 7202 + }, + { + "epoch": 0.92, + "grad_norm": 0.7238433361053467, + "learning_rate": 7.927587349834148e-06, + "loss": 3.2346, + "step": 7203 + }, + { + "epoch": 0.92, + "grad_norm": 0.5977983474731445, + "learning_rate": 7.901711864384515e-06, + "loss": 3.2292, + "step": 7204 + }, + { + "epoch": 0.92, + "grad_norm": 0.6451910138130188, + "learning_rate": 7.875877998437226e-06, + "loss": 3.2024, + "step": 7205 + }, + { + "epoch": 0.92, + "grad_norm": 0.6787285208702087, + "learning_rate": 7.850085756433478e-06, + "loss": 3.2673, + "step": 7206 + }, + { + "epoch": 0.92, + "grad_norm": 0.6958125233650208, + "learning_rate": 7.824335142807198e-06, + "loss": 3.2676, + "step": 7207 + }, + { + "epoch": 0.92, + "grad_norm": 0.6704130172729492, + "learning_rate": 7.79862616198529e-06, + "loss": 3.2651, + "step": 7208 + }, + { + "epoch": 0.92, + "grad_norm": 0.6035682559013367, + "learning_rate": 7.772958818387326e-06, + "loss": 3.4288, + "step": 7209 + }, + { + "epoch": 0.92, + "grad_norm": 0.6599581241607666, + "learning_rate": 7.747333116425947e-06, + "loss": 3.3845, + "step": 7210 + }, + { + "epoch": 0.92, + "grad_norm": 0.6396064758300781, + "learning_rate": 7.721749060506406e-06, + "loss": 3.1495, + "step": 7211 + }, + { + "epoch": 0.92, + "grad_norm": 0.6582738757133484, + "learning_rate": 7.696206655026933e-06, + "loss": 3.3308, + "step": 7212 + }, + { + "epoch": 0.92, + "grad_norm": 0.6420777440071106, + "learning_rate": 7.670705904378572e-06, + "loss": 3.2123, + "step": 7213 + }, + { + "epoch": 0.92, + "grad_norm": 0.6579254865646362, + "learning_rate": 7.645246812945206e-06, + "loss": 3.3212, + "step": 7214 + }, + { + "epoch": 0.92, + "grad_norm": 0.6444306373596191, + "learning_rate": 7.6198293851034715e-06, + "loss": 3.121, + "step": 7215 + }, + { + "epoch": 0.92, + "grad_norm": 0.7798821926116943, + "learning_rate": 7.594453625223013e-06, + "loss": 3.1915, + "step": 7216 + }, + { + "epoch": 0.92, + "grad_norm": 0.6160184144973755, + "learning_rate": 7.569119537666175e-06, + "loss": 3.268, + "step": 7217 + }, + { + "epoch": 0.92, + "grad_norm": 0.7422967553138733, + "learning_rate": 7.543827126788194e-06, + "loss": 3.2423, + "step": 7218 + }, + { + "epoch": 0.92, + "grad_norm": 0.6377444863319397, + "learning_rate": 7.5185763969371215e-06, + "loss": 3.1848, + "step": 7219 + }, + { + "epoch": 0.92, + "grad_norm": 0.6461215615272522, + "learning_rate": 7.493367352453873e-06, + "loss": 3.2925, + "step": 7220 + }, + { + "epoch": 0.92, + "grad_norm": 0.6127544045448303, + "learning_rate": 7.468199997672148e-06, + "loss": 3.1988, + "step": 7221 + }, + { + "epoch": 0.92, + "grad_norm": 0.6574669480323792, + "learning_rate": 7.443074336918487e-06, + "loss": 3.2854, + "step": 7222 + }, + { + "epoch": 0.92, + "grad_norm": 0.6241658329963684, + "learning_rate": 7.417990374512296e-06, + "loss": 3.3113, + "step": 7223 + }, + { + "epoch": 0.92, + "grad_norm": 0.6702455282211304, + "learning_rate": 7.392948114765846e-06, + "loss": 3.2695, + "step": 7224 + }, + { + "epoch": 0.92, + "grad_norm": 0.6530889272689819, + "learning_rate": 7.36794756198414e-06, + "loss": 3.2646, + "step": 7225 + }, + { + "epoch": 0.92, + "grad_norm": 0.681586503982544, + "learning_rate": 7.3429887204650994e-06, + "loss": 3.2925, + "step": 7226 + }, + { + "epoch": 0.93, + "grad_norm": 0.6429810523986816, + "learning_rate": 7.318071594499403e-06, + "loss": 3.2615, + "step": 7227 + }, + { + "epoch": 0.93, + "grad_norm": 0.6187373399734497, + "learning_rate": 7.293196188370627e-06, + "loss": 3.2572, + "step": 7228 + }, + { + "epoch": 0.93, + "grad_norm": 0.6678377389907837, + "learning_rate": 7.268362506355125e-06, + "loss": 3.3227, + "step": 7229 + }, + { + "epoch": 0.93, + "grad_norm": 0.6457262635231018, + "learning_rate": 7.243570552722067e-06, + "loss": 3.1622, + "step": 7230 + }, + { + "epoch": 0.93, + "grad_norm": 0.6468750834465027, + "learning_rate": 7.2188203317335165e-06, + "loss": 3.2397, + "step": 7231 + }, + { + "epoch": 0.93, + "grad_norm": 0.6340529918670654, + "learning_rate": 7.194111847644347e-06, + "loss": 3.1485, + "step": 7232 + }, + { + "epoch": 0.93, + "grad_norm": 0.6241766214370728, + "learning_rate": 7.16944510470216e-06, + "loss": 3.2466, + "step": 7233 + }, + { + "epoch": 0.93, + "grad_norm": 0.6593299508094788, + "learning_rate": 7.144820107147482e-06, + "loss": 3.3613, + "step": 7234 + }, + { + "epoch": 0.93, + "grad_norm": 0.6926560997962952, + "learning_rate": 7.120236859213674e-06, + "loss": 3.3678, + "step": 7235 + }, + { + "epoch": 0.93, + "grad_norm": 0.6539257168769836, + "learning_rate": 7.095695365126858e-06, + "loss": 3.217, + "step": 7236 + }, + { + "epoch": 0.93, + "grad_norm": 0.6468613147735596, + "learning_rate": 7.071195629105992e-06, + "loss": 3.1489, + "step": 7237 + }, + { + "epoch": 0.93, + "grad_norm": 0.662559986114502, + "learning_rate": 7.046737655362845e-06, + "loss": 3.3925, + "step": 7238 + }, + { + "epoch": 0.93, + "grad_norm": 0.6389094591140747, + "learning_rate": 7.0223214481020535e-06, + "loss": 3.2455, + "step": 7239 + }, + { + "epoch": 0.93, + "grad_norm": 0.6499673128128052, + "learning_rate": 6.997947011521067e-06, + "loss": 3.3585, + "step": 7240 + }, + { + "epoch": 0.93, + "grad_norm": 0.6148930191993713, + "learning_rate": 6.973614349810115e-06, + "loss": 3.2433, + "step": 7241 + }, + { + "epoch": 0.93, + "grad_norm": 0.6275553703308105, + "learning_rate": 6.949323467152269e-06, + "loss": 3.2461, + "step": 7242 + }, + { + "epoch": 0.93, + "grad_norm": 0.7042184472084045, + "learning_rate": 6.925074367723383e-06, + "loss": 3.3104, + "step": 7243 + }, + { + "epoch": 0.93, + "grad_norm": 0.6173494458198547, + "learning_rate": 6.90086705569215e-06, + "loss": 3.2534, + "step": 7244 + }, + { + "epoch": 0.93, + "grad_norm": 0.6344504952430725, + "learning_rate": 6.876701535220131e-06, + "loss": 3.1419, + "step": 7245 + }, + { + "epoch": 0.93, + "grad_norm": 0.6360830664634705, + "learning_rate": 6.8525778104616685e-06, + "loss": 3.4543, + "step": 7246 + }, + { + "epoch": 0.93, + "grad_norm": 0.6810656189918518, + "learning_rate": 6.82849588556389e-06, + "loss": 3.1706, + "step": 7247 + }, + { + "epoch": 0.93, + "grad_norm": 0.6513180732727051, + "learning_rate": 6.804455764666733e-06, + "loss": 3.4029, + "step": 7248 + }, + { + "epoch": 0.93, + "grad_norm": 0.6299558877944946, + "learning_rate": 6.7804574519030325e-06, + "loss": 3.2123, + "step": 7249 + }, + { + "epoch": 0.93, + "grad_norm": 0.6327093839645386, + "learning_rate": 6.75650095139832e-06, + "loss": 3.322, + "step": 7250 + }, + { + "epoch": 0.93, + "grad_norm": 0.667273223400116, + "learning_rate": 6.732586267270968e-06, + "loss": 3.3847, + "step": 7251 + }, + { + "epoch": 0.93, + "grad_norm": 0.6162838935852051, + "learning_rate": 6.708713403632299e-06, + "loss": 3.2832, + "step": 7252 + }, + { + "epoch": 0.93, + "grad_norm": 0.6861327886581421, + "learning_rate": 6.684882364586226e-06, + "loss": 3.2907, + "step": 7253 + }, + { + "epoch": 0.93, + "grad_norm": 0.6113225221633911, + "learning_rate": 6.661093154229636e-06, + "loss": 3.2132, + "step": 7254 + }, + { + "epoch": 0.93, + "grad_norm": 0.6668545603752136, + "learning_rate": 6.637345776652176e-06, + "loss": 3.3351, + "step": 7255 + }, + { + "epoch": 0.93, + "grad_norm": 0.7097353339195251, + "learning_rate": 6.613640235936275e-06, + "loss": 3.4091, + "step": 7256 + }, + { + "epoch": 0.93, + "grad_norm": 0.6316132545471191, + "learning_rate": 6.589976536157199e-06, + "loss": 3.2332, + "step": 7257 + }, + { + "epoch": 0.93, + "grad_norm": 0.637789785861969, + "learning_rate": 6.566354681383002e-06, + "loss": 3.2688, + "step": 7258 + }, + { + "epoch": 0.93, + "grad_norm": 0.6592211723327637, + "learning_rate": 6.542774675674546e-06, + "loss": 3.2023, + "step": 7259 + }, + { + "epoch": 0.93, + "grad_norm": 0.6290187835693359, + "learning_rate": 6.519236523085531e-06, + "loss": 3.2602, + "step": 7260 + }, + { + "epoch": 0.93, + "grad_norm": 0.651838481426239, + "learning_rate": 6.495740227662445e-06, + "loss": 3.3902, + "step": 7261 + }, + { + "epoch": 0.93, + "grad_norm": 0.6233133673667908, + "learning_rate": 6.472285793444499e-06, + "loss": 3.1979, + "step": 7262 + }, + { + "epoch": 0.93, + "grad_norm": 0.6357033848762512, + "learning_rate": 6.448873224463914e-06, + "loss": 3.1953, + "step": 7263 + }, + { + "epoch": 0.93, + "grad_norm": 0.704164981842041, + "learning_rate": 6.4255025247454694e-06, + "loss": 3.1846, + "step": 7264 + }, + { + "epoch": 0.93, + "grad_norm": 0.6579105854034424, + "learning_rate": 6.402173698306924e-06, + "loss": 3.3871, + "step": 7265 + }, + { + "epoch": 0.93, + "grad_norm": 0.6748107075691223, + "learning_rate": 6.378886749158708e-06, + "loss": 3.2592, + "step": 7266 + }, + { + "epoch": 0.93, + "grad_norm": 0.6929618716239929, + "learning_rate": 6.355641681304175e-06, + "loss": 3.3654, + "step": 7267 + }, + { + "epoch": 0.93, + "grad_norm": 0.651310920715332, + "learning_rate": 6.332438498739435e-06, + "loss": 3.28, + "step": 7268 + }, + { + "epoch": 0.93, + "grad_norm": 0.639167845249176, + "learning_rate": 6.309277205453323e-06, + "loss": 3.1737, + "step": 7269 + }, + { + "epoch": 0.93, + "grad_norm": 0.6047765612602234, + "learning_rate": 6.2861578054276e-06, + "loss": 3.2351, + "step": 7270 + }, + { + "epoch": 0.93, + "grad_norm": 0.6707802414894104, + "learning_rate": 6.26308030263667e-06, + "loss": 3.3016, + "step": 7271 + }, + { + "epoch": 0.93, + "grad_norm": 0.6374141573905945, + "learning_rate": 6.240044701047915e-06, + "loss": 3.3161, + "step": 7272 + }, + { + "epoch": 0.93, + "grad_norm": 0.6345334649085999, + "learning_rate": 6.21705100462136e-06, + "loss": 3.356, + "step": 7273 + }, + { + "epoch": 0.93, + "grad_norm": 0.6492778658866882, + "learning_rate": 6.194099217309901e-06, + "loss": 3.2443, + "step": 7274 + }, + { + "epoch": 0.93, + "grad_norm": 0.642340898513794, + "learning_rate": 6.171189343059269e-06, + "loss": 3.255, + "step": 7275 + }, + { + "epoch": 0.93, + "grad_norm": 0.6277442574501038, + "learning_rate": 6.14832138580787e-06, + "loss": 3.3729, + "step": 7276 + }, + { + "epoch": 0.93, + "grad_norm": 0.6362481713294983, + "learning_rate": 6.125495349487003e-06, + "loss": 3.3552, + "step": 7277 + }, + { + "epoch": 0.93, + "grad_norm": 0.6513189673423767, + "learning_rate": 6.102711238020725e-06, + "loss": 3.1394, + "step": 7278 + }, + { + "epoch": 0.93, + "grad_norm": 0.6888836622238159, + "learning_rate": 6.079969055325901e-06, + "loss": 3.2644, + "step": 7279 + }, + { + "epoch": 0.93, + "grad_norm": 0.6480313539505005, + "learning_rate": 6.057268805312127e-06, + "loss": 3.1271, + "step": 7280 + }, + { + "epoch": 0.93, + "grad_norm": 0.6510853171348572, + "learning_rate": 6.0346104918818645e-06, + "loss": 3.3147, + "step": 7281 + }, + { + "epoch": 0.93, + "grad_norm": 0.6157244443893433, + "learning_rate": 6.011994118930358e-06, + "loss": 3.2939, + "step": 7282 + }, + { + "epoch": 0.93, + "grad_norm": 0.6814156770706177, + "learning_rate": 5.9894196903456376e-06, + "loss": 3.2736, + "step": 7283 + }, + { + "epoch": 0.93, + "grad_norm": 0.6630598306655884, + "learning_rate": 5.966887210008487e-06, + "loss": 3.2568, + "step": 7284 + }, + { + "epoch": 0.93, + "grad_norm": 0.6240792274475098, + "learning_rate": 5.944396681792474e-06, + "loss": 3.3374, + "step": 7285 + }, + { + "epoch": 0.93, + "grad_norm": 0.6431431174278259, + "learning_rate": 5.921948109564035e-06, + "loss": 3.3313, + "step": 7286 + }, + { + "epoch": 0.93, + "grad_norm": 0.5855844020843506, + "learning_rate": 5.899541497182276e-06, + "loss": 3.2376, + "step": 7287 + }, + { + "epoch": 0.93, + "grad_norm": 0.6474109888076782, + "learning_rate": 5.8771768484992e-06, + "loss": 3.2878, + "step": 7288 + }, + { + "epoch": 0.93, + "grad_norm": 0.7335505485534668, + "learning_rate": 5.854854167359564e-06, + "loss": 3.3169, + "step": 7289 + }, + { + "epoch": 0.93, + "grad_norm": 0.6480274200439453, + "learning_rate": 5.832573457600826e-06, + "loss": 3.2585, + "step": 7290 + }, + { + "epoch": 0.93, + "grad_norm": 0.640687882900238, + "learning_rate": 5.810334723053367e-06, + "loss": 3.3847, + "step": 7291 + }, + { + "epoch": 0.93, + "grad_norm": 0.64142245054245, + "learning_rate": 5.788137967540269e-06, + "loss": 3.1795, + "step": 7292 + }, + { + "epoch": 0.93, + "grad_norm": 0.6028220653533936, + "learning_rate": 5.765983194877394e-06, + "loss": 3.1765, + "step": 7293 + }, + { + "epoch": 0.93, + "grad_norm": 0.672534704208374, + "learning_rate": 5.743870408873419e-06, + "loss": 3.2826, + "step": 7294 + }, + { + "epoch": 0.93, + "grad_norm": 0.6606788039207458, + "learning_rate": 5.7217996133297476e-06, + "loss": 3.1391, + "step": 7295 + }, + { + "epoch": 0.93, + "grad_norm": 0.6740570664405823, + "learning_rate": 5.6997708120406775e-06, + "loss": 3.2925, + "step": 7296 + }, + { + "epoch": 0.93, + "grad_norm": 0.668215811252594, + "learning_rate": 5.677784008793152e-06, + "loss": 3.2491, + "step": 7297 + }, + { + "epoch": 0.93, + "grad_norm": 0.6743031144142151, + "learning_rate": 5.655839207367008e-06, + "loss": 3.3067, + "step": 7298 + }, + { + "epoch": 0.93, + "grad_norm": 0.6616876125335693, + "learning_rate": 5.633936411534729e-06, + "loss": 3.2424, + "step": 7299 + }, + { + "epoch": 0.93, + "grad_norm": 0.6089078783988953, + "learning_rate": 5.612075625061774e-06, + "loss": 3.1401, + "step": 7300 + }, + { + "epoch": 0.93, + "grad_norm": 0.6700180768966675, + "learning_rate": 5.590256851706193e-06, + "loss": 3.3681, + "step": 7301 + }, + { + "epoch": 0.93, + "grad_norm": 0.6170392632484436, + "learning_rate": 5.5684800952188755e-06, + "loss": 3.2534, + "step": 7302 + }, + { + "epoch": 0.93, + "grad_norm": 0.6896737813949585, + "learning_rate": 5.546745359343547e-06, + "loss": 3.1538, + "step": 7303 + }, + { + "epoch": 0.93, + "grad_norm": 0.6510875821113586, + "learning_rate": 5.525052647816636e-06, + "loss": 3.4175, + "step": 7304 + }, + { + "epoch": 0.94, + "grad_norm": 0.6070569157600403, + "learning_rate": 5.503401964367355e-06, + "loss": 3.3609, + "step": 7305 + }, + { + "epoch": 0.94, + "grad_norm": 0.6779491901397705, + "learning_rate": 5.481793312717753e-06, + "loss": 3.288, + "step": 7306 + }, + { + "epoch": 0.94, + "grad_norm": 0.6362120509147644, + "learning_rate": 5.460226696582555e-06, + "loss": 3.2862, + "step": 7307 + }, + { + "epoch": 0.94, + "grad_norm": 0.9496941566467285, + "learning_rate": 5.43870211966932e-06, + "loss": 3.2342, + "step": 7308 + }, + { + "epoch": 0.94, + "grad_norm": 0.7293429970741272, + "learning_rate": 5.417219585678423e-06, + "loss": 3.3709, + "step": 7309 + }, + { + "epoch": 0.94, + "grad_norm": 0.6373938322067261, + "learning_rate": 5.395779098302911e-06, + "loss": 3.3523, + "step": 7310 + }, + { + "epoch": 0.94, + "grad_norm": 0.6489766836166382, + "learning_rate": 5.374380661228667e-06, + "loss": 3.1637, + "step": 7311 + }, + { + "epoch": 0.94, + "grad_norm": 0.6648989915847778, + "learning_rate": 5.353024278134333e-06, + "loss": 3.2934, + "step": 7312 + }, + { + "epoch": 0.94, + "grad_norm": 0.6867431998252869, + "learning_rate": 5.331709952691333e-06, + "loss": 3.1268, + "step": 7313 + }, + { + "epoch": 0.94, + "grad_norm": 0.6261318922042847, + "learning_rate": 5.3104376885638185e-06, + "loss": 3.3558, + "step": 7314 + }, + { + "epoch": 0.94, + "grad_norm": 0.635741114616394, + "learning_rate": 5.289207489408754e-06, + "loss": 3.2383, + "step": 7315 + }, + { + "epoch": 0.94, + "grad_norm": 0.6128219962120056, + "learning_rate": 5.268019358875803e-06, + "loss": 3.3608, + "step": 7316 + }, + { + "epoch": 0.94, + "grad_norm": 0.6505506038665771, + "learning_rate": 5.246873300607552e-06, + "loss": 3.2795, + "step": 7317 + }, + { + "epoch": 0.94, + "grad_norm": 0.6969970464706421, + "learning_rate": 5.225769318239177e-06, + "loss": 3.3382, + "step": 7318 + }, + { + "epoch": 0.94, + "grad_norm": 0.6488733887672424, + "learning_rate": 5.2047074153987465e-06, + "loss": 3.1926, + "step": 7319 + }, + { + "epoch": 0.94, + "grad_norm": 0.6368139982223511, + "learning_rate": 5.183687595707032e-06, + "loss": 3.2677, + "step": 7320 + }, + { + "epoch": 0.94, + "grad_norm": 0.6244758367538452, + "learning_rate": 5.1627098627775594e-06, + "loss": 3.2698, + "step": 7321 + }, + { + "epoch": 0.94, + "grad_norm": 0.6960594654083252, + "learning_rate": 5.1417742202166665e-06, + "loss": 3.4216, + "step": 7322 + }, + { + "epoch": 0.94, + "grad_norm": 0.6352282762527466, + "learning_rate": 5.12088067162339e-06, + "loss": 3.2368, + "step": 7323 + }, + { + "epoch": 0.94, + "grad_norm": 0.6699858903884888, + "learning_rate": 5.100029220589636e-06, + "loss": 3.4182, + "step": 7324 + }, + { + "epoch": 0.94, + "grad_norm": 0.628677487373352, + "learning_rate": 5.07921987069998e-06, + "loss": 3.2176, + "step": 7325 + }, + { + "epoch": 0.94, + "grad_norm": 0.7023717164993286, + "learning_rate": 5.058452625531812e-06, + "loss": 3.3189, + "step": 7326 + }, + { + "epoch": 0.94, + "grad_norm": 0.6396088600158691, + "learning_rate": 5.037727488655192e-06, + "loss": 3.4466, + "step": 7327 + }, + { + "epoch": 0.94, + "grad_norm": 0.6315101385116577, + "learning_rate": 5.0170444636331025e-06, + "loss": 3.2949, + "step": 7328 + }, + { + "epoch": 0.94, + "grad_norm": 0.6428784728050232, + "learning_rate": 4.996403554021145e-06, + "loss": 3.2748, + "step": 7329 + }, + { + "epoch": 0.94, + "grad_norm": 0.6924943327903748, + "learning_rate": 4.975804763367758e-06, + "loss": 3.3769, + "step": 7330 + }, + { + "epoch": 0.94, + "grad_norm": 0.6694445610046387, + "learning_rate": 4.955248095214082e-06, + "loss": 3.1891, + "step": 7331 + }, + { + "epoch": 0.94, + "grad_norm": 0.6170761585235596, + "learning_rate": 4.934733553094068e-06, + "loss": 3.3613, + "step": 7332 + }, + { + "epoch": 0.94, + "grad_norm": 0.675621747970581, + "learning_rate": 4.914261140534393e-06, + "loss": 3.3318, + "step": 7333 + }, + { + "epoch": 0.94, + "grad_norm": 0.675338625907898, + "learning_rate": 4.8938308610544935e-06, + "loss": 3.2877, + "step": 7334 + }, + { + "epoch": 0.94, + "grad_norm": 0.6300745010375977, + "learning_rate": 4.8734427181666156e-06, + "loss": 3.1573, + "step": 7335 + }, + { + "epoch": 0.94, + "grad_norm": 0.6434711217880249, + "learning_rate": 4.853096715375649e-06, + "loss": 3.2582, + "step": 7336 + }, + { + "epoch": 0.94, + "grad_norm": 0.6582839488983154, + "learning_rate": 4.8327928561793525e-06, + "loss": 3.4553, + "step": 7337 + }, + { + "epoch": 0.94, + "grad_norm": 0.6042643785476685, + "learning_rate": 4.812531144068183e-06, + "loss": 3.1254, + "step": 7338 + }, + { + "epoch": 0.94, + "grad_norm": 0.6512244939804077, + "learning_rate": 4.792311582525383e-06, + "loss": 3.2403, + "step": 7339 + }, + { + "epoch": 0.94, + "grad_norm": 0.6681080460548401, + "learning_rate": 4.772134175026921e-06, + "loss": 3.2801, + "step": 7340 + }, + { + "epoch": 0.94, + "grad_norm": 0.6504518985748291, + "learning_rate": 4.751998925041495e-06, + "loss": 3.2723, + "step": 7341 + }, + { + "epoch": 0.94, + "grad_norm": 0.6400660872459412, + "learning_rate": 4.731905836030642e-06, + "loss": 3.3012, + "step": 7342 + }, + { + "epoch": 0.94, + "grad_norm": 0.6214766502380371, + "learning_rate": 4.711854911448543e-06, + "loss": 3.284, + "step": 7343 + }, + { + "epoch": 0.94, + "grad_norm": 0.6600997447967529, + "learning_rate": 4.691846154742191e-06, + "loss": 3.3809, + "step": 7344 + }, + { + "epoch": 0.94, + "grad_norm": 0.7025464773178101, + "learning_rate": 4.671879569351362e-06, + "loss": 3.2058, + "step": 7345 + }, + { + "epoch": 0.94, + "grad_norm": 0.6420849561691284, + "learning_rate": 4.651955158708532e-06, + "loss": 3.1747, + "step": 7346 + }, + { + "epoch": 0.94, + "grad_norm": 0.5835869908332825, + "learning_rate": 4.632072926238934e-06, + "loss": 3.2178, + "step": 7347 + }, + { + "epoch": 0.94, + "grad_norm": 0.6520231366157532, + "learning_rate": 4.612232875360528e-06, + "loss": 3.2645, + "step": 7348 + }, + { + "epoch": 0.94, + "grad_norm": 0.6406843066215515, + "learning_rate": 4.592435009484086e-06, + "loss": 3.439, + "step": 7349 + }, + { + "epoch": 0.94, + "grad_norm": 0.6760379076004028, + "learning_rate": 4.57267933201308e-06, + "loss": 3.4426, + "step": 7350 + }, + { + "epoch": 0.94, + "grad_norm": 0.5957873463630676, + "learning_rate": 4.552965846343709e-06, + "loss": 3.3527, + "step": 7351 + }, + { + "epoch": 0.94, + "grad_norm": 0.6975240111351013, + "learning_rate": 4.533294555864986e-06, + "loss": 3.3895, + "step": 7352 + }, + { + "epoch": 0.94, + "grad_norm": 0.699790894985199, + "learning_rate": 4.513665463958621e-06, + "loss": 3.2091, + "step": 7353 + }, + { + "epoch": 0.94, + "grad_norm": 0.6839869022369385, + "learning_rate": 4.494078573999111e-06, + "loss": 3.2557, + "step": 7354 + }, + { + "epoch": 0.94, + "grad_norm": 0.6462808847427368, + "learning_rate": 4.474533889353594e-06, + "loss": 3.23, + "step": 7355 + }, + { + "epoch": 0.94, + "grad_norm": 0.6232364773750305, + "learning_rate": 4.455031413382104e-06, + "loss": 3.2319, + "step": 7356 + }, + { + "epoch": 0.94, + "grad_norm": 0.8861619234085083, + "learning_rate": 4.435571149437323e-06, + "loss": 3.3775, + "step": 7357 + }, + { + "epoch": 0.94, + "grad_norm": 0.6382137537002563, + "learning_rate": 4.416153100864684e-06, + "loss": 3.2468, + "step": 7358 + }, + { + "epoch": 0.94, + "grad_norm": 0.6784188747406006, + "learning_rate": 4.396777271002378e-06, + "loss": 3.2836, + "step": 7359 + }, + { + "epoch": 0.94, + "grad_norm": 0.6564819812774658, + "learning_rate": 4.377443663181324e-06, + "loss": 3.3778, + "step": 7360 + }, + { + "epoch": 0.94, + "grad_norm": 0.6596285104751587, + "learning_rate": 4.358152280725225e-06, + "loss": 3.2579, + "step": 7361 + }, + { + "epoch": 0.94, + "grad_norm": 0.6444254517555237, + "learning_rate": 4.338903126950483e-06, + "loss": 3.2217, + "step": 7362 + }, + { + "epoch": 0.94, + "grad_norm": 0.6608470678329468, + "learning_rate": 4.3196962051662014e-06, + "loss": 3.0983, + "step": 7363 + }, + { + "epoch": 0.94, + "grad_norm": 0.6407880783081055, + "learning_rate": 4.300531518674322e-06, + "loss": 3.2262, + "step": 7364 + }, + { + "epoch": 0.94, + "grad_norm": 0.6535629034042358, + "learning_rate": 4.281409070769487e-06, + "loss": 3.3884, + "step": 7365 + }, + { + "epoch": 0.94, + "grad_norm": 1.063302755355835, + "learning_rate": 4.26232886473904e-06, + "loss": 3.2577, + "step": 7366 + }, + { + "epoch": 0.94, + "grad_norm": 0.6154136657714844, + "learning_rate": 4.2432909038630785e-06, + "loss": 3.1855, + "step": 7367 + }, + { + "epoch": 0.94, + "grad_norm": 0.5912913084030151, + "learning_rate": 4.224295191414512e-06, + "loss": 3.2211, + "step": 7368 + }, + { + "epoch": 0.94, + "grad_norm": 0.6943714618682861, + "learning_rate": 4.205341730658841e-06, + "loss": 3.3071, + "step": 7369 + }, + { + "epoch": 0.94, + "grad_norm": 0.6367833614349365, + "learning_rate": 4.186430524854429e-06, + "loss": 3.2245, + "step": 7370 + }, + { + "epoch": 0.94, + "grad_norm": 0.658357560634613, + "learning_rate": 4.167561577252343e-06, + "loss": 3.3038, + "step": 7371 + }, + { + "epoch": 0.94, + "grad_norm": 0.6754440665245056, + "learning_rate": 4.148734891096351e-06, + "loss": 3.3077, + "step": 7372 + }, + { + "epoch": 0.94, + "grad_norm": 0.6419435143470764, + "learning_rate": 4.129950469622945e-06, + "loss": 3.3789, + "step": 7373 + }, + { + "epoch": 0.94, + "grad_norm": 0.6624208092689514, + "learning_rate": 4.1112083160614326e-06, + "loss": 3.1705, + "step": 7374 + }, + { + "epoch": 0.94, + "grad_norm": 0.6808865666389465, + "learning_rate": 4.0925084336338455e-06, + "loss": 3.1439, + "step": 7375 + }, + { + "epoch": 0.94, + "grad_norm": 0.6722561120986938, + "learning_rate": 4.073850825554837e-06, + "loss": 3.3548, + "step": 7376 + }, + { + "epoch": 0.94, + "grad_norm": 0.6091050505638123, + "learning_rate": 4.0552354950318944e-06, + "loss": 3.1668, + "step": 7377 + }, + { + "epoch": 0.94, + "grad_norm": 0.6220724582672119, + "learning_rate": 4.03666244526521e-06, + "loss": 3.2689, + "step": 7378 + }, + { + "epoch": 0.94, + "grad_norm": 0.6284779906272888, + "learning_rate": 4.018131679447701e-06, + "loss": 3.2768, + "step": 7379 + }, + { + "epoch": 0.94, + "grad_norm": 0.6638051867485046, + "learning_rate": 3.999643200764985e-06, + "loss": 3.3569, + "step": 7380 + }, + { + "epoch": 0.94, + "grad_norm": 0.6347754001617432, + "learning_rate": 3.981197012395521e-06, + "loss": 3.2362, + "step": 7381 + }, + { + "epoch": 0.94, + "grad_norm": 0.7009361982345581, + "learning_rate": 3.962793117510383e-06, + "loss": 3.2804, + "step": 7382 + }, + { + "epoch": 0.95, + "grad_norm": 0.6505569219589233, + "learning_rate": 3.944431519273401e-06, + "loss": 3.3916, + "step": 7383 + }, + { + "epoch": 0.95, + "grad_norm": 0.6859057545661926, + "learning_rate": 3.926112220841188e-06, + "loss": 3.2671, + "step": 7384 + }, + { + "epoch": 0.95, + "grad_norm": 0.6320242285728455, + "learning_rate": 3.907835225363005e-06, + "loss": 3.1011, + "step": 7385 + }, + { + "epoch": 0.95, + "grad_norm": 0.6930665969848633, + "learning_rate": 3.889600535980892e-06, + "loss": 3.3667, + "step": 7386 + }, + { + "epoch": 0.95, + "grad_norm": 0.6389195919036865, + "learning_rate": 3.8714081558295925e-06, + "loss": 3.2719, + "step": 7387 + }, + { + "epoch": 0.95, + "grad_norm": 0.6377859711647034, + "learning_rate": 3.853258088036604e-06, + "loss": 3.2008, + "step": 7388 + }, + { + "epoch": 0.95, + "grad_norm": 0.6412763595581055, + "learning_rate": 3.835150335722154e-06, + "loss": 3.2153, + "step": 7389 + }, + { + "epoch": 0.95, + "grad_norm": 0.7105813026428223, + "learning_rate": 3.817084901999113e-06, + "loss": 3.299, + "step": 7390 + }, + { + "epoch": 0.95, + "grad_norm": 0.6779466867446899, + "learning_rate": 3.7990617899731904e-06, + "loss": 3.2306, + "step": 7391 + }, + { + "epoch": 0.95, + "grad_norm": 0.6576768159866333, + "learning_rate": 3.7810810027427424e-06, + "loss": 3.4647, + "step": 7392 + }, + { + "epoch": 0.95, + "grad_norm": 0.6411910653114319, + "learning_rate": 3.7631425433989062e-06, + "loss": 3.2822, + "step": 7393 + }, + { + "epoch": 0.95, + "grad_norm": 0.6772045493125916, + "learning_rate": 3.745246415025466e-06, + "loss": 3.4465, + "step": 7394 + }, + { + "epoch": 0.95, + "grad_norm": 0.6431482434272766, + "learning_rate": 3.727392620699016e-06, + "loss": 3.2325, + "step": 7395 + }, + { + "epoch": 0.95, + "grad_norm": 0.6533840298652649, + "learning_rate": 3.7095811634887956e-06, + "loss": 3.1974, + "step": 7396 + }, + { + "epoch": 0.95, + "grad_norm": 0.6330516338348389, + "learning_rate": 3.691812046456827e-06, + "loss": 3.2032, + "step": 7397 + }, + { + "epoch": 0.95, + "grad_norm": 0.617141604423523, + "learning_rate": 3.6740852726578067e-06, + "loss": 3.2139, + "step": 7398 + }, + { + "epoch": 0.95, + "grad_norm": 0.7009109258651733, + "learning_rate": 3.6564008451392127e-06, + "loss": 3.2776, + "step": 7399 + }, + { + "epoch": 0.95, + "grad_norm": 0.6232280731201172, + "learning_rate": 3.6387587669411416e-06, + "loss": 3.2027, + "step": 7400 + }, + { + "epoch": 0.95, + "grad_norm": 0.7022527456283569, + "learning_rate": 3.6211590410965e-06, + "loss": 3.3376, + "step": 7401 + }, + { + "epoch": 0.95, + "grad_norm": 0.6588423848152161, + "learning_rate": 3.603601670630896e-06, + "loss": 3.3283, + "step": 7402 + }, + { + "epoch": 0.95, + "grad_norm": 0.6571477055549622, + "learning_rate": 3.5860866585626373e-06, + "loss": 3.2783, + "step": 7403 + }, + { + "epoch": 0.95, + "grad_norm": 0.6520159244537354, + "learning_rate": 3.5686140079027598e-06, + "loss": 3.2508, + "step": 7404 + }, + { + "epoch": 0.95, + "grad_norm": 0.6331124901771545, + "learning_rate": 3.551183721655027e-06, + "loss": 3.2829, + "step": 7405 + }, + { + "epoch": 0.95, + "grad_norm": 0.6849514245986938, + "learning_rate": 3.5337958028158754e-06, + "loss": 3.188, + "step": 7406 + }, + { + "epoch": 0.95, + "grad_norm": 0.5790781378746033, + "learning_rate": 3.5164502543745257e-06, + "loss": 3.1526, + "step": 7407 + }, + { + "epoch": 0.95, + "grad_norm": 0.634568452835083, + "learning_rate": 3.499147079312842e-06, + "loss": 3.3117, + "step": 7408 + }, + { + "epoch": 0.95, + "grad_norm": 0.6393299102783203, + "learning_rate": 3.481886280605445e-06, + "loss": 3.3075, + "step": 7409 + }, + { + "epoch": 0.95, + "grad_norm": 0.6796958446502686, + "learning_rate": 3.4646678612196837e-06, + "loss": 3.2816, + "step": 7410 + }, + { + "epoch": 0.95, + "grad_norm": 0.6363698840141296, + "learning_rate": 3.447491824115606e-06, + "loss": 3.3547, + "step": 7411 + }, + { + "epoch": 0.95, + "grad_norm": 0.611214280128479, + "learning_rate": 3.43035817224599e-06, + "loss": 3.2755, + "step": 7412 + }, + { + "epoch": 0.95, + "grad_norm": 0.6416151523590088, + "learning_rate": 3.413266908556256e-06, + "loss": 3.3442, + "step": 7413 + }, + { + "epoch": 0.95, + "grad_norm": 0.6357517838478088, + "learning_rate": 3.3962180359846386e-06, + "loss": 3.308, + "step": 7414 + }, + { + "epoch": 0.95, + "grad_norm": 0.6159369945526123, + "learning_rate": 3.379211557462014e-06, + "loss": 3.2884, + "step": 7415 + }, + { + "epoch": 0.95, + "grad_norm": 0.6278047561645508, + "learning_rate": 3.362247475911989e-06, + "loss": 3.1813, + "step": 7416 + }, + { + "epoch": 0.95, + "grad_norm": 0.6572278141975403, + "learning_rate": 3.3453257942508972e-06, + "loss": 3.3234, + "step": 7417 + }, + { + "epoch": 0.95, + "grad_norm": 0.6664751172065735, + "learning_rate": 3.328446515387773e-06, + "loss": 3.3132, + "step": 7418 + }, + { + "epoch": 0.95, + "grad_norm": 0.6599653363227844, + "learning_rate": 3.3116096422243225e-06, + "loss": 3.2526, + "step": 7419 + }, + { + "epoch": 0.95, + "grad_norm": 0.654568612575531, + "learning_rate": 3.294815177655064e-06, + "loss": 3.207, + "step": 7420 + }, + { + "epoch": 0.95, + "grad_norm": 0.6287360191345215, + "learning_rate": 3.2780631245671043e-06, + "loss": 3.209, + "step": 7421 + }, + { + "epoch": 0.95, + "grad_norm": 0.6323820948600769, + "learning_rate": 3.2613534858403625e-06, + "loss": 3.2948, + "step": 7422 + }, + { + "epoch": 0.95, + "grad_norm": 0.6450842022895813, + "learning_rate": 3.244686264347374e-06, + "loss": 3.3435, + "step": 7423 + }, + { + "epoch": 0.95, + "grad_norm": 0.7002039551734924, + "learning_rate": 3.2280614629534853e-06, + "loss": 3.4343, + "step": 7424 + }, + { + "epoch": 0.95, + "grad_norm": 0.6468720436096191, + "learning_rate": 3.211479084516633e-06, + "loss": 3.3368, + "step": 7425 + }, + { + "epoch": 0.95, + "grad_norm": 0.7393363118171692, + "learning_rate": 3.194939131887564e-06, + "loss": 3.2048, + "step": 7426 + }, + { + "epoch": 0.95, + "grad_norm": 0.6139890551567078, + "learning_rate": 3.1784416079096433e-06, + "loss": 3.3042, + "step": 7427 + }, + { + "epoch": 0.95, + "grad_norm": 0.6313186883926392, + "learning_rate": 3.1619865154190186e-06, + "loss": 3.2822, + "step": 7428 + }, + { + "epoch": 0.95, + "grad_norm": 0.6245015263557434, + "learning_rate": 3.1455738572444824e-06, + "loss": 3.3669, + "step": 7429 + }, + { + "epoch": 0.95, + "grad_norm": 0.6937124729156494, + "learning_rate": 3.1292036362076117e-06, + "loss": 3.2656, + "step": 7430 + }, + { + "epoch": 0.95, + "grad_norm": 0.6102727055549622, + "learning_rate": 3.1128758551225988e-06, + "loss": 3.1886, + "step": 7431 + }, + { + "epoch": 0.95, + "grad_norm": 0.6378781795501709, + "learning_rate": 3.0965905167963935e-06, + "loss": 3.2937, + "step": 7432 + }, + { + "epoch": 0.95, + "grad_norm": 0.6180261969566345, + "learning_rate": 3.0803476240286178e-06, + "loss": 3.1587, + "step": 7433 + }, + { + "epoch": 0.95, + "grad_norm": 0.66991126537323, + "learning_rate": 3.064147179611648e-06, + "loss": 3.2151, + "step": 7434 + }, + { + "epoch": 0.95, + "grad_norm": 0.6227942705154419, + "learning_rate": 3.04798918633048e-06, + "loss": 3.2695, + "step": 7435 + }, + { + "epoch": 0.95, + "grad_norm": 0.670200765132904, + "learning_rate": 3.0318736469628906e-06, + "loss": 3.1475, + "step": 7436 + }, + { + "epoch": 0.95, + "grad_norm": 0.5999524593353271, + "learning_rate": 3.015800564279303e-06, + "loss": 3.2645, + "step": 7437 + }, + { + "epoch": 0.95, + "grad_norm": 0.6560450196266174, + "learning_rate": 2.9997699410428956e-06, + "loss": 3.2083, + "step": 7438 + }, + { + "epoch": 0.95, + "grad_norm": 0.6428449153900146, + "learning_rate": 2.9837817800095203e-06, + "loss": 3.4943, + "step": 7439 + }, + { + "epoch": 0.95, + "grad_norm": 0.6920236945152283, + "learning_rate": 2.9678360839277276e-06, + "loss": 3.2001, + "step": 7440 + }, + { + "epoch": 0.95, + "grad_norm": 0.606275200843811, + "learning_rate": 2.9519328555387417e-06, + "loss": 3.3291, + "step": 7441 + }, + { + "epoch": 0.95, + "grad_norm": 0.665324330329895, + "learning_rate": 2.9360720975765144e-06, + "loss": 3.2925, + "step": 7442 + }, + { + "epoch": 0.95, + "grad_norm": 0.6416666507720947, + "learning_rate": 2.920253812767698e-06, + "loss": 3.4125, + "step": 7443 + }, + { + "epoch": 0.95, + "grad_norm": 0.621583104133606, + "learning_rate": 2.9044780038316456e-06, + "loss": 3.2132, + "step": 7444 + }, + { + "epoch": 0.95, + "grad_norm": 0.6322230696678162, + "learning_rate": 2.888744673480437e-06, + "loss": 3.2776, + "step": 7445 + }, + { + "epoch": 0.95, + "grad_norm": 0.6622276306152344, + "learning_rate": 2.873053824418742e-06, + "loss": 3.3062, + "step": 7446 + }, + { + "epoch": 0.95, + "grad_norm": 0.5983544588088989, + "learning_rate": 2.8574054593440423e-06, + "loss": 3.2411, + "step": 7447 + }, + { + "epoch": 0.95, + "grad_norm": 0.6524794697761536, + "learning_rate": 2.841799580946464e-06, + "loss": 3.2322, + "step": 7448 + }, + { + "epoch": 0.95, + "grad_norm": 0.5979216694831848, + "learning_rate": 2.82623619190886e-06, + "loss": 3.1827, + "step": 7449 + }, + { + "epoch": 0.95, + "grad_norm": 0.6573725938796997, + "learning_rate": 2.8107152949067295e-06, + "loss": 3.4223, + "step": 7450 + }, + { + "epoch": 0.95, + "grad_norm": 0.6488075256347656, + "learning_rate": 2.795236892608327e-06, + "loss": 3.1364, + "step": 7451 + }, + { + "epoch": 0.95, + "grad_norm": 0.6476569771766663, + "learning_rate": 2.779800987674497e-06, + "loss": 3.2683, + "step": 7452 + }, + { + "epoch": 0.95, + "grad_norm": 0.5997454524040222, + "learning_rate": 2.764407582758921e-06, + "loss": 3.245, + "step": 7453 + }, + { + "epoch": 0.95, + "grad_norm": 0.6412538290023804, + "learning_rate": 2.7490566805078996e-06, + "loss": 3.2369, + "step": 7454 + }, + { + "epoch": 0.95, + "grad_norm": 0.6506205201148987, + "learning_rate": 2.7337482835604056e-06, + "loss": 3.3683, + "step": 7455 + }, + { + "epoch": 0.95, + "grad_norm": 0.6723852157592773, + "learning_rate": 2.7184823945481105e-06, + "loss": 3.2889, + "step": 7456 + }, + { + "epoch": 0.95, + "grad_norm": 0.6422088742256165, + "learning_rate": 2.7032590160954716e-06, + "loss": 3.2476, + "step": 7457 + }, + { + "epoch": 0.95, + "grad_norm": 0.6394736170768738, + "learning_rate": 2.6880781508194784e-06, + "loss": 3.2505, + "step": 7458 + }, + { + "epoch": 0.95, + "grad_norm": 0.6404505372047424, + "learning_rate": 2.67293980132996e-06, + "loss": 3.2505, + "step": 7459 + }, + { + "epoch": 0.95, + "grad_norm": 0.6839563846588135, + "learning_rate": 2.6578439702293344e-06, + "loss": 3.2388, + "step": 7460 + }, + { + "epoch": 0.96, + "grad_norm": 0.6375935673713684, + "learning_rate": 2.642790660112776e-06, + "loss": 3.1646, + "step": 7461 + }, + { + "epoch": 0.96, + "grad_norm": 0.6834440231323242, + "learning_rate": 2.6277798735681315e-06, + "loss": 3.1828, + "step": 7462 + }, + { + "epoch": 0.96, + "grad_norm": 0.6728748083114624, + "learning_rate": 2.6128116131758918e-06, + "loss": 3.2923, + "step": 7463 + }, + { + "epoch": 0.96, + "grad_norm": 0.619761049747467, + "learning_rate": 2.5978858815093045e-06, + "loss": 3.2659, + "step": 7464 + }, + { + "epoch": 0.96, + "grad_norm": 0.683583676815033, + "learning_rate": 2.5830026811342335e-06, + "loss": 3.2574, + "step": 7465 + }, + { + "epoch": 0.96, + "grad_norm": 0.6158591508865356, + "learning_rate": 2.5681620146093266e-06, + "loss": 3.3406, + "step": 7466 + }, + { + "epoch": 0.96, + "grad_norm": 0.6639404296875, + "learning_rate": 2.5533638844858486e-06, + "loss": 3.1972, + "step": 7467 + }, + { + "epoch": 0.96, + "grad_norm": 0.6213057637214661, + "learning_rate": 2.538608293307765e-06, + "loss": 3.2496, + "step": 7468 + }, + { + "epoch": 0.96, + "grad_norm": 0.6411742568016052, + "learning_rate": 2.52389524361174e-06, + "loss": 3.357, + "step": 7469 + }, + { + "epoch": 0.96, + "grad_norm": 0.664076566696167, + "learning_rate": 2.5092247379271126e-06, + "loss": 3.2863, + "step": 7470 + }, + { + "epoch": 0.96, + "grad_norm": 0.586796224117279, + "learning_rate": 2.494596778775893e-06, + "loss": 3.2527, + "step": 7471 + }, + { + "epoch": 0.96, + "grad_norm": 0.661478579044342, + "learning_rate": 2.4800113686728467e-06, + "loss": 3.345, + "step": 7472 + }, + { + "epoch": 0.96, + "grad_norm": 0.6605984568595886, + "learning_rate": 2.4654685101253014e-06, + "loss": 3.2981, + "step": 7473 + }, + { + "epoch": 0.96, + "grad_norm": 0.6903208494186401, + "learning_rate": 2.450968205633425e-06, + "loss": 3.3093, + "step": 7474 + }, + { + "epoch": 0.96, + "grad_norm": 0.7049779295921326, + "learning_rate": 2.436510457689917e-06, + "loss": 3.2576, + "step": 7475 + }, + { + "epoch": 0.96, + "grad_norm": 0.6223276853561401, + "learning_rate": 2.4220952687802887e-06, + "loss": 3.3374, + "step": 7476 + }, + { + "epoch": 0.96, + "grad_norm": 0.6552082896232605, + "learning_rate": 2.4077226413826426e-06, + "loss": 3.2579, + "step": 7477 + }, + { + "epoch": 0.96, + "grad_norm": 0.6227458119392395, + "learning_rate": 2.3933925779678346e-06, + "loss": 3.2856, + "step": 7478 + }, + { + "epoch": 0.96, + "grad_norm": 0.6535359025001526, + "learning_rate": 2.379105080999311e-06, + "loss": 3.2242, + "step": 7479 + }, + { + "epoch": 0.96, + "grad_norm": 0.6697153449058533, + "learning_rate": 2.3648601529333014e-06, + "loss": 3.2368, + "step": 7480 + }, + { + "epoch": 0.96, + "grad_norm": 0.7097249627113342, + "learning_rate": 2.35065779621868e-06, + "loss": 3.3781, + "step": 7481 + }, + { + "epoch": 0.96, + "grad_norm": 0.6668369770050049, + "learning_rate": 2.336498013296967e-06, + "loss": 3.272, + "step": 7482 + }, + { + "epoch": 0.96, + "grad_norm": 0.6335403323173523, + "learning_rate": 2.3223808066024086e-06, + "loss": 3.1968, + "step": 7483 + }, + { + "epoch": 0.96, + "grad_norm": 0.6395009756088257, + "learning_rate": 2.3083061785618977e-06, + "loss": 3.3057, + "step": 7484 + }, + { + "epoch": 0.96, + "grad_norm": 0.6572073101997375, + "learning_rate": 2.2942741315950534e-06, + "loss": 3.3194, + "step": 7485 + }, + { + "epoch": 0.96, + "grad_norm": 0.64683997631073, + "learning_rate": 2.2802846681141685e-06, + "loss": 3.1422, + "step": 7486 + }, + { + "epoch": 0.96, + "grad_norm": 0.6475039720535278, + "learning_rate": 2.2663377905241257e-06, + "loss": 3.2925, + "step": 7487 + }, + { + "epoch": 0.96, + "grad_norm": 0.6652488708496094, + "learning_rate": 2.2524335012225894e-06, + "loss": 3.1857, + "step": 7488 + }, + { + "epoch": 0.96, + "grad_norm": 0.6927341222763062, + "learning_rate": 2.238571802599898e-06, + "loss": 3.3621, + "step": 7489 + }, + { + "epoch": 0.96, + "grad_norm": 0.6448782086372375, + "learning_rate": 2.2247526970390064e-06, + "loss": 3.2789, + "step": 7490 + }, + { + "epoch": 0.96, + "grad_norm": 0.6733150482177734, + "learning_rate": 2.2109761869155697e-06, + "loss": 3.2261, + "step": 7491 + }, + { + "epoch": 0.96, + "grad_norm": 0.6071258187294006, + "learning_rate": 2.1972422745979436e-06, + "loss": 3.2484, + "step": 7492 + }, + { + "epoch": 0.96, + "grad_norm": 0.6346178650856018, + "learning_rate": 2.1835509624471562e-06, + "loss": 3.4007, + "step": 7493 + }, + { + "epoch": 0.96, + "grad_norm": 0.6324289441108704, + "learning_rate": 2.1699022528168797e-06, + "loss": 3.1415, + "step": 7494 + }, + { + "epoch": 0.96, + "grad_norm": 0.6148743033409119, + "learning_rate": 2.156296148053488e-06, + "loss": 3.3537, + "step": 7495 + }, + { + "epoch": 0.96, + "grad_norm": 0.6335322856903076, + "learning_rate": 2.142732650496082e-06, + "loss": 3.1739, + "step": 7496 + }, + { + "epoch": 0.96, + "grad_norm": 0.7053571343421936, + "learning_rate": 2.1292117624763243e-06, + "loss": 3.274, + "step": 7497 + }, + { + "epoch": 0.96, + "grad_norm": 0.6378585696220398, + "learning_rate": 2.115733486318605e-06, + "loss": 3.1187, + "step": 7498 + }, + { + "epoch": 0.96, + "grad_norm": 0.6480412483215332, + "learning_rate": 2.102297824340044e-06, + "loss": 3.2563, + "step": 7499 + }, + { + "epoch": 0.96, + "grad_norm": 0.6488012075424194, + "learning_rate": 2.0889047788503755e-06, + "loss": 3.2886, + "step": 7500 + }, + { + "epoch": 0.96, + "grad_norm": 0.6246366500854492, + "learning_rate": 2.0755543521519815e-06, + "loss": 3.2876, + "step": 7501 + }, + { + "epoch": 0.96, + "grad_norm": 0.702854573726654, + "learning_rate": 2.06224654653997e-06, + "loss": 3.3309, + "step": 7502 + }, + { + "epoch": 0.96, + "grad_norm": 0.630124568939209, + "learning_rate": 2.0489813643021493e-06, + "loss": 3.2226, + "step": 7503 + }, + { + "epoch": 0.96, + "grad_norm": 0.6372682452201843, + "learning_rate": 2.035758807718918e-06, + "loss": 3.2873, + "step": 7504 + }, + { + "epoch": 0.96, + "grad_norm": 0.6328387260437012, + "learning_rate": 2.0225788790633746e-06, + "loss": 3.227, + "step": 7505 + }, + { + "epoch": 0.96, + "grad_norm": 0.6260595917701721, + "learning_rate": 2.0094415806013454e-06, + "loss": 3.4956, + "step": 7506 + }, + { + "epoch": 0.96, + "grad_norm": 0.6554728746414185, + "learning_rate": 1.996346914591274e-06, + "loss": 3.2805, + "step": 7507 + }, + { + "epoch": 0.96, + "grad_norm": 0.6232674717903137, + "learning_rate": 1.983294883284248e-06, + "loss": 3.2885, + "step": 7508 + }, + { + "epoch": 0.96, + "grad_norm": 0.624148964881897, + "learning_rate": 1.970285488924084e-06, + "loss": 3.4587, + "step": 7509 + }, + { + "epoch": 0.96, + "grad_norm": 0.6816232800483704, + "learning_rate": 1.957318733747271e-06, + "loss": 3.3016, + "step": 7510 + }, + { + "epoch": 0.96, + "grad_norm": 0.7068300247192383, + "learning_rate": 1.944394619982942e-06, + "loss": 3.2474, + "step": 7511 + }, + { + "epoch": 0.96, + "grad_norm": 0.6904548406600952, + "learning_rate": 1.931513149852848e-06, + "loss": 3.1786, + "step": 7512 + }, + { + "epoch": 0.96, + "grad_norm": 0.6330033540725708, + "learning_rate": 1.9186743255714955e-06, + "loss": 3.2149, + "step": 7513 + }, + { + "epoch": 0.96, + "grad_norm": 0.6272470355033875, + "learning_rate": 1.905878149346063e-06, + "loss": 3.286, + "step": 7514 + }, + { + "epoch": 0.96, + "grad_norm": 0.6389195919036865, + "learning_rate": 1.893124623376319e-06, + "loss": 3.269, + "step": 7515 + }, + { + "epoch": 0.96, + "grad_norm": 0.6015783548355103, + "learning_rate": 1.8804137498547592e-06, + "loss": 3.2182, + "step": 7516 + }, + { + "epoch": 0.96, + "grad_norm": 0.6698102951049805, + "learning_rate": 1.8677455309664971e-06, + "loss": 3.2994, + "step": 7517 + }, + { + "epoch": 0.96, + "grad_norm": 0.654212474822998, + "learning_rate": 1.8551199688894016e-06, + "loss": 3.2943, + "step": 7518 + }, + { + "epoch": 0.96, + "grad_norm": 0.6759641766548157, + "learning_rate": 1.842537065793931e-06, + "loss": 3.2493, + "step": 7519 + }, + { + "epoch": 0.96, + "grad_norm": 0.6238897442817688, + "learning_rate": 1.8299968238432163e-06, + "loss": 3.2418, + "step": 7520 + }, + { + "epoch": 0.96, + "grad_norm": 0.6189209222793579, + "learning_rate": 1.8174992451930605e-06, + "loss": 3.2654, + "step": 7521 + }, + { + "epoch": 0.96, + "grad_norm": 0.6399158835411072, + "learning_rate": 1.805044331991995e-06, + "loss": 3.3872, + "step": 7522 + }, + { + "epoch": 0.96, + "grad_norm": 0.6976350545883179, + "learning_rate": 1.7926320863811129e-06, + "loss": 3.2379, + "step": 7523 + }, + { + "epoch": 0.96, + "grad_norm": 0.6221190094947815, + "learning_rate": 1.7802625104942627e-06, + "loss": 3.1995, + "step": 7524 + }, + { + "epoch": 0.96, + "grad_norm": 0.6344885230064392, + "learning_rate": 1.7679356064578821e-06, + "loss": 3.2165, + "step": 7525 + }, + { + "epoch": 0.96, + "grad_norm": 0.6899000406265259, + "learning_rate": 1.7556513763911096e-06, + "loss": 3.259, + "step": 7526 + }, + { + "epoch": 0.96, + "grad_norm": 0.6491447687149048, + "learning_rate": 1.7434098224057838e-06, + "loss": 3.2962, + "step": 7527 + }, + { + "epoch": 0.96, + "grad_norm": 0.6144093871116638, + "learning_rate": 1.7312109466063597e-06, + "loss": 3.1326, + "step": 7528 + }, + { + "epoch": 0.96, + "grad_norm": 0.6447715163230896, + "learning_rate": 1.71905475108991e-06, + "loss": 3.3746, + "step": 7529 + }, + { + "epoch": 0.96, + "grad_norm": 0.596442461013794, + "learning_rate": 1.7069412379462911e-06, + "loss": 3.1959, + "step": 7530 + }, + { + "epoch": 0.96, + "grad_norm": 0.6180348992347717, + "learning_rate": 1.6948704092579205e-06, + "loss": 3.2475, + "step": 7531 + }, + { + "epoch": 0.96, + "grad_norm": 0.6351748704910278, + "learning_rate": 1.6828422670999432e-06, + "loss": 3.2058, + "step": 7532 + }, + { + "epoch": 0.96, + "grad_norm": 0.642833411693573, + "learning_rate": 1.6708568135401225e-06, + "loss": 3.1984, + "step": 7533 + }, + { + "epoch": 0.96, + "grad_norm": 0.638550341129303, + "learning_rate": 1.6589140506388933e-06, + "loss": 3.2736, + "step": 7534 + }, + { + "epoch": 0.96, + "grad_norm": 0.6374490261077881, + "learning_rate": 1.6470139804493357e-06, + "loss": 3.3581, + "step": 7535 + }, + { + "epoch": 0.96, + "grad_norm": 0.6551685333251953, + "learning_rate": 1.6351566050172573e-06, + "loss": 3.2042, + "step": 7536 + }, + { + "epoch": 0.96, + "grad_norm": 0.6481794118881226, + "learning_rate": 1.6233419263810278e-06, + "loss": 3.2001, + "step": 7537 + }, + { + "epoch": 0.96, + "grad_norm": 0.6699073314666748, + "learning_rate": 1.611569946571745e-06, + "loss": 3.3134, + "step": 7538 + }, + { + "epoch": 0.96, + "grad_norm": 0.680874764919281, + "learning_rate": 1.5998406676131783e-06, + "loss": 3.3234, + "step": 7539 + }, + { + "epoch": 0.97, + "grad_norm": 0.6119950413703918, + "learning_rate": 1.5881540915216875e-06, + "loss": 3.078, + "step": 7540 + }, + { + "epoch": 0.97, + "grad_norm": 0.6424537897109985, + "learning_rate": 1.5765102203063596e-06, + "loss": 3.254, + "step": 7541 + }, + { + "epoch": 0.97, + "grad_norm": 0.6495968103408813, + "learning_rate": 1.5649090559688716e-06, + "loss": 3.2139, + "step": 7542 + }, + { + "epoch": 0.97, + "grad_norm": 0.6209322214126587, + "learning_rate": 1.5533506005036557e-06, + "loss": 3.3226, + "step": 7543 + }, + { + "epoch": 0.97, + "grad_norm": 0.6560537219047546, + "learning_rate": 1.5418348558977058e-06, + "loss": 3.2831, + "step": 7544 + }, + { + "epoch": 0.97, + "grad_norm": 0.6782567501068115, + "learning_rate": 1.5303618241306883e-06, + "loss": 3.2048, + "step": 7545 + }, + { + "epoch": 0.97, + "grad_norm": 0.648374617099762, + "learning_rate": 1.5189315071749977e-06, + "loss": 3.2015, + "step": 7546 + }, + { + "epoch": 0.97, + "grad_norm": 0.6472437381744385, + "learning_rate": 1.5075439069956453e-06, + "loss": 3.1896, + "step": 7547 + }, + { + "epoch": 0.97, + "grad_norm": 0.6487230062484741, + "learning_rate": 1.4961990255502323e-06, + "loss": 3.2939, + "step": 7548 + }, + { + "epoch": 0.97, + "grad_norm": 0.6969477534294128, + "learning_rate": 1.4848968647891148e-06, + "loss": 3.3404, + "step": 7549 + }, + { + "epoch": 0.97, + "grad_norm": 0.6641603708267212, + "learning_rate": 1.4736374266552943e-06, + "loss": 3.2283, + "step": 7550 + }, + { + "epoch": 0.97, + "grad_norm": 0.6257874965667725, + "learning_rate": 1.4624207130843336e-06, + "loss": 3.2594, + "step": 7551 + }, + { + "epoch": 0.97, + "grad_norm": 0.6400957703590393, + "learning_rate": 1.4512467260045514e-06, + "loss": 3.3074, + "step": 7552 + }, + { + "epoch": 0.97, + "grad_norm": 0.6433885097503662, + "learning_rate": 1.4401154673368833e-06, + "loss": 3.3894, + "step": 7553 + }, + { + "epoch": 0.97, + "grad_norm": 0.6520471572875977, + "learning_rate": 1.4290269389949095e-06, + "loss": 3.2277, + "step": 7554 + }, + { + "epoch": 0.97, + "grad_norm": 0.6392160058021545, + "learning_rate": 1.417981142884911e-06, + "loss": 3.2823, + "step": 7555 + }, + { + "epoch": 0.97, + "grad_norm": 0.637761116027832, + "learning_rate": 1.4069780809057575e-06, + "loss": 3.2336, + "step": 7556 + }, + { + "epoch": 0.97, + "grad_norm": 0.6488739252090454, + "learning_rate": 1.3960177549489917e-06, + "loss": 3.38, + "step": 7557 + }, + { + "epoch": 0.97, + "grad_norm": 0.6583778858184814, + "learning_rate": 1.3851001668988562e-06, + "loss": 3.2938, + "step": 7558 + }, + { + "epoch": 0.97, + "grad_norm": 0.6335779428482056, + "learning_rate": 1.3742253186321829e-06, + "loss": 3.2275, + "step": 7559 + }, + { + "epoch": 0.97, + "grad_norm": 0.6512758135795593, + "learning_rate": 1.3633932120184766e-06, + "loss": 3.3365, + "step": 7560 + }, + { + "epoch": 0.97, + "grad_norm": 0.6726590991020203, + "learning_rate": 1.3526038489199421e-06, + "loss": 3.3447, + "step": 7561 + }, + { + "epoch": 0.97, + "grad_norm": 0.6761517524719238, + "learning_rate": 1.3418572311913735e-06, + "loss": 3.3358, + "step": 7562 + }, + { + "epoch": 0.97, + "grad_norm": 0.5900736451148987, + "learning_rate": 1.3311533606802651e-06, + "loss": 3.2874, + "step": 7563 + }, + { + "epoch": 0.97, + "grad_norm": 0.6390765905380249, + "learning_rate": 1.3204922392266728e-06, + "loss": 3.211, + "step": 7564 + }, + { + "epoch": 0.97, + "grad_norm": 0.6341561079025269, + "learning_rate": 1.309873868663436e-06, + "loss": 3.3539, + "step": 7565 + }, + { + "epoch": 0.97, + "grad_norm": 0.6070687174797058, + "learning_rate": 1.2992982508159e-06, + "loss": 3.2892, + "step": 7566 + }, + { + "epoch": 0.97, + "grad_norm": 0.6941978931427002, + "learning_rate": 1.2887653875021944e-06, + "loss": 3.3742, + "step": 7567 + }, + { + "epoch": 0.97, + "grad_norm": 0.6048576235771179, + "learning_rate": 1.2782752805330366e-06, + "loss": 3.1947, + "step": 7568 + }, + { + "epoch": 0.97, + "grad_norm": 0.6762664318084717, + "learning_rate": 1.2678279317117903e-06, + "loss": 3.2131, + "step": 7569 + }, + { + "epoch": 0.97, + "grad_norm": 0.6474283337593079, + "learning_rate": 1.2574233428344905e-06, + "loss": 3.3567, + "step": 7570 + }, + { + "epoch": 0.97, + "grad_norm": 0.6327533721923828, + "learning_rate": 1.2470615156897624e-06, + "loss": 3.2799, + "step": 7571 + }, + { + "epoch": 0.97, + "grad_norm": 0.6646479368209839, + "learning_rate": 1.2367424520589588e-06, + "loss": 3.3822, + "step": 7572 + }, + { + "epoch": 0.97, + "grad_norm": 0.6395760178565979, + "learning_rate": 1.2264661537160492e-06, + "loss": 3.3372, + "step": 7573 + }, + { + "epoch": 0.97, + "grad_norm": 0.7232096195220947, + "learning_rate": 1.216232622427621e-06, + "loss": 3.3805, + "step": 7574 + }, + { + "epoch": 0.97, + "grad_norm": 0.6458327174186707, + "learning_rate": 1.206041859952961e-06, + "loss": 3.2589, + "step": 7575 + }, + { + "epoch": 0.97, + "grad_norm": 0.6875505447387695, + "learning_rate": 1.1958938680439736e-06, + "loss": 3.1891, + "step": 7576 + }, + { + "epoch": 0.97, + "grad_norm": 0.6274546384811401, + "learning_rate": 1.1857886484452073e-06, + "loss": 3.2288, + "step": 7577 + }, + { + "epoch": 0.97, + "grad_norm": 0.6373465657234192, + "learning_rate": 1.1757262028938842e-06, + "loss": 3.312, + "step": 7578 + }, + { + "epoch": 0.97, + "grad_norm": 0.624261736869812, + "learning_rate": 1.1657065331198425e-06, + "loss": 3.1332, + "step": 7579 + }, + { + "epoch": 0.97, + "grad_norm": 0.6104755997657776, + "learning_rate": 1.1557296408455932e-06, + "loss": 3.2463, + "step": 7580 + }, + { + "epoch": 0.97, + "grad_norm": 0.6304614543914795, + "learning_rate": 1.1457955277862641e-06, + "loss": 3.3576, + "step": 7581 + }, + { + "epoch": 0.97, + "grad_norm": 0.6486703157424927, + "learning_rate": 1.1359041956496286e-06, + "loss": 3.2021, + "step": 7582 + }, + { + "epoch": 0.97, + "grad_norm": 0.6310344934463501, + "learning_rate": 1.1260556461361592e-06, + "loss": 3.2776, + "step": 7583 + }, + { + "epoch": 0.97, + "grad_norm": 0.6773595213890076, + "learning_rate": 1.1162498809389188e-06, + "loss": 3.2133, + "step": 7584 + }, + { + "epoch": 0.97, + "grad_norm": 0.6367495059967041, + "learning_rate": 1.106486901743642e-06, + "loss": 3.2301, + "step": 7585 + }, + { + "epoch": 0.97, + "grad_norm": 0.6119555830955505, + "learning_rate": 1.0967667102286527e-06, + "loss": 3.2219, + "step": 7586 + }, + { + "epoch": 0.97, + "grad_norm": 0.6391817927360535, + "learning_rate": 1.0870893080650313e-06, + "loss": 3.1323, + "step": 7587 + }, + { + "epoch": 0.97, + "grad_norm": 0.6428825855255127, + "learning_rate": 1.0774546969163912e-06, + "loss": 3.2592, + "step": 7588 + }, + { + "epoch": 0.97, + "grad_norm": 0.6393899917602539, + "learning_rate": 1.0678628784390466e-06, + "loss": 3.3378, + "step": 7589 + }, + { + "epoch": 0.97, + "grad_norm": 0.6011338829994202, + "learning_rate": 1.0583138542819558e-06, + "loss": 3.3527, + "step": 7590 + }, + { + "epoch": 0.97, + "grad_norm": 0.6448006629943848, + "learning_rate": 1.0488076260866952e-06, + "loss": 3.2675, + "step": 7591 + }, + { + "epoch": 0.97, + "grad_norm": 0.6437724828720093, + "learning_rate": 1.0393441954874849e-06, + "loss": 3.3337, + "step": 7592 + }, + { + "epoch": 0.97, + "grad_norm": 0.6157968640327454, + "learning_rate": 1.0299235641111904e-06, + "loss": 3.2734, + "step": 7593 + }, + { + "epoch": 0.97, + "grad_norm": 0.6341884136199951, + "learning_rate": 1.0205457335773493e-06, + "loss": 3.2529, + "step": 7594 + }, + { + "epoch": 0.97, + "grad_norm": 0.8305909037590027, + "learning_rate": 1.0112107054981167e-06, + "loss": 3.2854, + "step": 7595 + }, + { + "epoch": 0.97, + "grad_norm": 0.61409991979599, + "learning_rate": 1.0019184814782923e-06, + "loss": 3.2995, + "step": 7596 + }, + { + "epoch": 0.97, + "grad_norm": 0.6470026969909668, + "learning_rate": 9.9266906311532e-07, + "loss": 3.3683, + "step": 7597 + }, + { + "epoch": 0.97, + "grad_norm": 0.6454616785049438, + "learning_rate": 9.834624519992897e-07, + "loss": 3.2503, + "step": 7598 + }, + { + "epoch": 0.97, + "grad_norm": 0.614826500415802, + "learning_rate": 9.742986497128792e-07, + "loss": 3.265, + "step": 7599 + }, + { + "epoch": 0.97, + "grad_norm": 0.647479772567749, + "learning_rate": 9.65177657831523e-07, + "loss": 3.1108, + "step": 7600 + }, + { + "epoch": 0.97, + "grad_norm": 0.689254641532898, + "learning_rate": 9.560994779231613e-07, + "loss": 3.2805, + "step": 7601 + }, + { + "epoch": 0.97, + "grad_norm": 0.6509751081466675, + "learning_rate": 9.470641115484624e-07, + "loss": 3.2505, + "step": 7602 + }, + { + "epoch": 0.97, + "grad_norm": 0.6674864292144775, + "learning_rate": 9.380715602607115e-07, + "loss": 3.2593, + "step": 7603 + }, + { + "epoch": 0.97, + "grad_norm": 0.6391346454620361, + "learning_rate": 9.291218256058387e-07, + "loss": 3.4132, + "step": 7604 + }, + { + "epoch": 0.97, + "grad_norm": 0.6764358878135681, + "learning_rate": 9.20214909122391e-07, + "loss": 3.0903, + "step": 7605 + }, + { + "epoch": 0.97, + "grad_norm": 0.6738482117652893, + "learning_rate": 9.113508123415881e-07, + "loss": 3.2611, + "step": 7606 + }, + { + "epoch": 0.97, + "grad_norm": 0.6696187853813171, + "learning_rate": 9.025295367872665e-07, + "loss": 3.3648, + "step": 7607 + }, + { + "epoch": 0.97, + "grad_norm": 0.6360906362533569, + "learning_rate": 8.937510839759078e-07, + "loss": 3.2088, + "step": 7608 + }, + { + "epoch": 0.97, + "grad_norm": 0.6381098031997681, + "learning_rate": 8.850154554166101e-07, + "loss": 3.326, + "step": 7609 + }, + { + "epoch": 0.97, + "grad_norm": 0.7050420045852661, + "learning_rate": 8.763226526111723e-07, + "loss": 3.2243, + "step": 7610 + }, + { + "epoch": 0.97, + "grad_norm": 0.664466381072998, + "learning_rate": 8.676726770539267e-07, + "loss": 3.3126, + "step": 7611 + }, + { + "epoch": 0.97, + "grad_norm": 0.6208410263061523, + "learning_rate": 8.590655302319616e-07, + "loss": 3.256, + "step": 7612 + }, + { + "epoch": 0.97, + "grad_norm": 0.6272687315940857, + "learning_rate": 8.505012136249268e-07, + "loss": 3.272, + "step": 7613 + }, + { + "epoch": 0.97, + "grad_norm": 0.6727380752563477, + "learning_rate": 8.41979728705089e-07, + "loss": 3.2288, + "step": 7614 + }, + { + "epoch": 0.97, + "grad_norm": 0.7296229600906372, + "learning_rate": 8.335010769374429e-07, + "loss": 3.3521, + "step": 7615 + }, + { + "epoch": 0.97, + "grad_norm": 0.6695315837860107, + "learning_rate": 8.25065259779545e-07, + "loss": 3.2912, + "step": 7616 + }, + { + "epoch": 0.97, + "grad_norm": 0.6159558892250061, + "learning_rate": 8.166722786816239e-07, + "loss": 3.2625, + "step": 7617 + }, + { + "epoch": 0.98, + "grad_norm": 0.639655590057373, + "learning_rate": 8.083221350865256e-07, + "loss": 3.2781, + "step": 7618 + }, + { + "epoch": 0.98, + "grad_norm": 0.6372049450874329, + "learning_rate": 8.000148304297128e-07, + "loss": 3.2502, + "step": 7619 + }, + { + "epoch": 0.98, + "grad_norm": 0.6868239641189575, + "learning_rate": 7.917503661393211e-07, + "loss": 3.2418, + "step": 7620 + }, + { + "epoch": 0.98, + "grad_norm": 0.6578603982925415, + "learning_rate": 7.835287436361305e-07, + "loss": 3.3421, + "step": 7621 + }, + { + "epoch": 0.98, + "grad_norm": 0.6327383518218994, + "learning_rate": 7.753499643334827e-07, + "loss": 3.2392, + "step": 7622 + }, + { + "epoch": 0.98, + "grad_norm": 0.6470942497253418, + "learning_rate": 7.672140296374475e-07, + "loss": 3.1826, + "step": 7623 + }, + { + "epoch": 0.98, + "grad_norm": 0.6480328440666199, + "learning_rate": 7.591209409466837e-07, + "loss": 3.34, + "step": 7624 + }, + { + "epoch": 0.98, + "grad_norm": 0.6624470949172974, + "learning_rate": 7.510706996524675e-07, + "loss": 3.2978, + "step": 7625 + }, + { + "epoch": 0.98, + "grad_norm": 0.6373721361160278, + "learning_rate": 7.430633071387749e-07, + "loss": 3.3639, + "step": 7626 + }, + { + "epoch": 0.98, + "grad_norm": 0.6560044288635254, + "learning_rate": 7.350987647820884e-07, + "loss": 3.3718, + "step": 7627 + }, + { + "epoch": 0.98, + "grad_norm": 0.6382562518119812, + "learning_rate": 7.271770739516737e-07, + "loss": 3.254, + "step": 7628 + }, + { + "epoch": 0.98, + "grad_norm": 0.6614659428596497, + "learning_rate": 7.19298236009358e-07, + "loss": 3.2579, + "step": 7629 + }, + { + "epoch": 0.98, + "grad_norm": 0.6337676048278809, + "learning_rate": 7.114622523095305e-07, + "loss": 3.099, + "step": 7630 + }, + { + "epoch": 0.98, + "grad_norm": 0.6617096066474915, + "learning_rate": 7.036691241993909e-07, + "loss": 3.2443, + "step": 7631 + }, + { + "epoch": 0.98, + "grad_norm": 0.6335844397544861, + "learning_rate": 6.9591885301859e-07, + "loss": 3.3422, + "step": 7632 + }, + { + "epoch": 0.98, + "grad_norm": 0.6168643236160278, + "learning_rate": 6.882114400995343e-07, + "loss": 3.253, + "step": 7633 + }, + { + "epoch": 0.98, + "grad_norm": 0.6648966073989868, + "learning_rate": 6.805468867672193e-07, + "loss": 3.162, + "step": 7634 + }, + { + "epoch": 0.98, + "grad_norm": 0.680281400680542, + "learning_rate": 6.729251943392301e-07, + "loss": 3.223, + "step": 7635 + }, + { + "epoch": 0.98, + "grad_norm": 0.6999646425247192, + "learning_rate": 6.653463641258517e-07, + "loss": 3.2631, + "step": 7636 + }, + { + "epoch": 0.98, + "grad_norm": 0.6539798974990845, + "learning_rate": 6.578103974299588e-07, + "loss": 3.28, + "step": 7637 + }, + { + "epoch": 0.98, + "grad_norm": 0.676738440990448, + "learning_rate": 6.503172955470982e-07, + "loss": 3.3316, + "step": 7638 + }, + { + "epoch": 0.98, + "grad_norm": 0.6027320623397827, + "learning_rate": 6.428670597654062e-07, + "loss": 3.2476, + "step": 7639 + }, + { + "epoch": 0.98, + "grad_norm": 0.5785861611366272, + "learning_rate": 6.354596913656363e-07, + "loss": 3.1649, + "step": 7640 + }, + { + "epoch": 0.98, + "grad_norm": 0.6677677631378174, + "learning_rate": 6.280951916212418e-07, + "loss": 3.2489, + "step": 7641 + }, + { + "epoch": 0.98, + "grad_norm": 0.611767590045929, + "learning_rate": 6.207735617982657e-07, + "loss": 3.3958, + "step": 7642 + }, + { + "epoch": 0.98, + "grad_norm": 0.7346791625022888, + "learning_rate": 6.134948031553678e-07, + "loss": 3.2172, + "step": 7643 + }, + { + "epoch": 0.98, + "grad_norm": 0.6609097719192505, + "learning_rate": 6.062589169438248e-07, + "loss": 3.3464, + "step": 7644 + }, + { + "epoch": 0.98, + "grad_norm": 0.6335502862930298, + "learning_rate": 5.990659044076141e-07, + "loss": 3.178, + "step": 7645 + }, + { + "epoch": 0.98, + "grad_norm": 0.6232840418815613, + "learning_rate": 5.919157667832464e-07, + "loss": 3.2284, + "step": 7646 + }, + { + "epoch": 0.98, + "grad_norm": 0.5935794115066528, + "learning_rate": 5.848085052999885e-07, + "loss": 3.2545, + "step": 7647 + }, + { + "epoch": 0.98, + "grad_norm": 0.6684829592704773, + "learning_rate": 5.777441211795853e-07, + "loss": 3.3676, + "step": 7648 + }, + { + "epoch": 0.98, + "grad_norm": 0.6543155908584595, + "learning_rate": 5.707226156365375e-07, + "loss": 3.219, + "step": 7649 + }, + { + "epoch": 0.98, + "grad_norm": 0.5975430607795715, + "learning_rate": 5.637439898779073e-07, + "loss": 3.2246, + "step": 7650 + }, + { + "epoch": 0.98, + "grad_norm": 0.6876344084739685, + "learning_rate": 5.56808245103374e-07, + "loss": 3.3489, + "step": 7651 + }, + { + "epoch": 0.98, + "grad_norm": 0.6838445067405701, + "learning_rate": 5.499153825053171e-07, + "loss": 3.3739, + "step": 7652 + }, + { + "epoch": 0.98, + "grad_norm": 0.6268404722213745, + "learning_rate": 5.430654032686777e-07, + "loss": 3.1471, + "step": 7653 + }, + { + "epoch": 0.98, + "grad_norm": 0.679879903793335, + "learning_rate": 5.362583085710416e-07, + "loss": 3.3478, + "step": 7654 + }, + { + "epoch": 0.98, + "grad_norm": 0.6650727391242981, + "learning_rate": 5.294940995826392e-07, + "loss": 3.3497, + "step": 7655 + }, + { + "epoch": 0.98, + "grad_norm": 0.6430428624153137, + "learning_rate": 5.227727774663182e-07, + "loss": 3.1272, + "step": 7656 + }, + { + "epoch": 0.98, + "grad_norm": 0.7259106636047363, + "learning_rate": 5.160943433775434e-07, + "loss": 3.2782, + "step": 7657 + }, + { + "epoch": 0.98, + "grad_norm": 0.6815977692604065, + "learning_rate": 5.094587984643962e-07, + "loss": 3.2856, + "step": 7658 + }, + { + "epoch": 0.98, + "grad_norm": 0.6110831499099731, + "learning_rate": 5.028661438676308e-07, + "loss": 3.354, + "step": 7659 + }, + { + "epoch": 0.98, + "grad_norm": 0.6547110080718994, + "learning_rate": 4.963163807205906e-07, + "loss": 3.2981, + "step": 7660 + }, + { + "epoch": 0.98, + "grad_norm": 0.6048975586891174, + "learning_rate": 4.898095101492916e-07, + "loss": 3.2398, + "step": 7661 + }, + { + "epoch": 0.98, + "grad_norm": 0.6170310378074646, + "learning_rate": 4.833455332722836e-07, + "loss": 3.306, + "step": 7662 + }, + { + "epoch": 0.98, + "grad_norm": 0.6575043797492981, + "learning_rate": 4.769244512008164e-07, + "loss": 3.2691, + "step": 7663 + }, + { + "epoch": 0.98, + "grad_norm": 0.6412174701690674, + "learning_rate": 4.7054626503878484e-07, + "loss": 3.4241, + "step": 7664 + }, + { + "epoch": 0.98, + "grad_norm": 0.6265059113502502, + "learning_rate": 4.642109758826174e-07, + "loss": 3.2871, + "step": 7665 + }, + { + "epoch": 0.98, + "grad_norm": 0.6097034215927124, + "learning_rate": 4.579185848214429e-07, + "loss": 3.2071, + "step": 7666 + }, + { + "epoch": 0.98, + "grad_norm": 0.6445879340171814, + "learning_rate": 4.5166909293703487e-07, + "loss": 3.285, + "step": 7667 + }, + { + "epoch": 0.98, + "grad_norm": 0.6280282735824585, + "learning_rate": 4.454625013037006e-07, + "loss": 3.1142, + "step": 7668 + }, + { + "epoch": 0.98, + "grad_norm": 0.644930362701416, + "learning_rate": 4.392988109884477e-07, + "loss": 3.415, + "step": 7669 + }, + { + "epoch": 0.98, + "grad_norm": 0.7137476205825806, + "learning_rate": 4.331780230509008e-07, + "loss": 3.3423, + "step": 7670 + }, + { + "epoch": 0.98, + "grad_norm": 0.6461427211761475, + "learning_rate": 4.271001385432738e-07, + "loss": 3.3923, + "step": 7671 + }, + { + "epoch": 0.98, + "grad_norm": 0.6740446090698242, + "learning_rate": 4.2106515851042524e-07, + "loss": 3.1089, + "step": 7672 + }, + { + "epoch": 0.98, + "grad_norm": 0.6676669120788574, + "learning_rate": 4.150730839898309e-07, + "loss": 3.315, + "step": 7673 + }, + { + "epoch": 0.98, + "grad_norm": 0.6839563846588135, + "learning_rate": 4.0912391601161115e-07, + "loss": 3.3375, + "step": 7674 + }, + { + "epoch": 0.98, + "grad_norm": 0.7062957286834717, + "learning_rate": 4.032176555985034e-07, + "loss": 3.2208, + "step": 7675 + }, + { + "epoch": 0.98, + "grad_norm": 0.6304193139076233, + "learning_rate": 3.9735430376586224e-07, + "loss": 3.2194, + "step": 7676 + }, + { + "epoch": 0.98, + "grad_norm": 0.6265396475791931, + "learning_rate": 3.9153386152165905e-07, + "loss": 3.1607, + "step": 7677 + }, + { + "epoch": 0.98, + "grad_norm": 0.6499339938163757, + "learning_rate": 3.8575632986648236e-07, + "loss": 3.1646, + "step": 7678 + }, + { + "epoch": 0.98, + "grad_norm": 0.6155187487602234, + "learning_rate": 3.800217097935932e-07, + "loss": 3.2666, + "step": 7679 + }, + { + "epoch": 0.98, + "grad_norm": 0.6204526424407959, + "learning_rate": 3.7433000228878635e-07, + "loss": 3.255, + "step": 7680 + }, + { + "epoch": 0.98, + "grad_norm": 0.5999842882156372, + "learning_rate": 3.6868120833055686e-07, + "loss": 3.1958, + "step": 7681 + }, + { + "epoch": 0.98, + "grad_norm": 0.6637298464775085, + "learning_rate": 3.630753288900446e-07, + "loss": 3.2996, + "step": 7682 + }, + { + "epoch": 0.98, + "grad_norm": 0.6675429344177246, + "learning_rate": 3.575123649308953e-07, + "loss": 3.387, + "step": 7683 + }, + { + "epoch": 0.98, + "grad_norm": 0.6250631809234619, + "learning_rate": 3.5199231740945525e-07, + "loss": 3.2661, + "step": 7684 + }, + { + "epoch": 0.98, + "grad_norm": 0.6430096626281738, + "learning_rate": 3.4651518727474294e-07, + "loss": 3.2979, + "step": 7685 + }, + { + "epoch": 0.98, + "grad_norm": 0.6707577109336853, + "learning_rate": 3.410809754682831e-07, + "loss": 3.3047, + "step": 7686 + }, + { + "epoch": 0.98, + "grad_norm": 0.6935476064682007, + "learning_rate": 3.356896829243006e-07, + "loss": 3.1893, + "step": 7687 + }, + { + "epoch": 0.98, + "grad_norm": 0.661466658115387, + "learning_rate": 3.303413105696096e-07, + "loss": 3.2446, + "step": 7688 + }, + { + "epoch": 0.98, + "grad_norm": 0.6921482682228088, + "learning_rate": 3.250358593236691e-07, + "loss": 3.3028, + "step": 7689 + }, + { + "epoch": 0.98, + "grad_norm": 0.6858127117156982, + "learning_rate": 3.19773330098555e-07, + "loss": 3.2903, + "step": 7690 + }, + { + "epoch": 0.98, + "grad_norm": 0.602769136428833, + "learning_rate": 3.1455372379893267e-07, + "loss": 3.0643, + "step": 7691 + }, + { + "epoch": 0.98, + "grad_norm": 0.6282801628112793, + "learning_rate": 3.0937704132213975e-07, + "loss": 3.2068, + "step": 7692 + }, + { + "epoch": 0.98, + "grad_norm": 0.6082367300987244, + "learning_rate": 3.0424328355810326e-07, + "loss": 3.3177, + "step": 7693 + }, + { + "epoch": 0.98, + "grad_norm": 0.6054982542991638, + "learning_rate": 2.9915245138933957e-07, + "loss": 3.3331, + "step": 7694 + }, + { + "epoch": 0.98, + "grad_norm": 0.6351670026779175, + "learning_rate": 2.9410454569106516e-07, + "loss": 3.3554, + "step": 7695 + }, + { + "epoch": 0.99, + "grad_norm": 0.6396512985229492, + "learning_rate": 2.8909956733105815e-07, + "loss": 3.2822, + "step": 7696 + }, + { + "epoch": 0.99, + "grad_norm": 0.6445683836936951, + "learning_rate": 2.841375171697413e-07, + "loss": 3.2697, + "step": 7697 + }, + { + "epoch": 0.99, + "grad_norm": 0.6352219581604004, + "learning_rate": 2.792183960601269e-07, + "loss": 3.1338, + "step": 7698 + }, + { + "epoch": 0.99, + "grad_norm": 0.6588396430015564, + "learning_rate": 2.743422048478994e-07, + "loss": 3.3444, + "step": 7699 + }, + { + "epoch": 0.99, + "grad_norm": 0.6465908288955688, + "learning_rate": 2.69508944371305e-07, + "loss": 3.2591, + "step": 7700 + }, + { + "epoch": 0.99, + "grad_norm": 0.6593931913375854, + "learning_rate": 2.6471861546123465e-07, + "loss": 3.1718, + "step": 7701 + }, + { + "epoch": 0.99, + "grad_norm": 0.6716919541358948, + "learning_rate": 2.59971218941224e-07, + "loss": 3.3324, + "step": 7702 + }, + { + "epoch": 0.99, + "grad_norm": 0.6339880228042603, + "learning_rate": 2.55266755627398e-07, + "loss": 3.2649, + "step": 7703 + }, + { + "epoch": 0.99, + "grad_norm": 0.641206681728363, + "learning_rate": 2.506052263284986e-07, + "loss": 3.3873, + "step": 7704 + }, + { + "epoch": 0.99, + "grad_norm": 0.6437599658966064, + "learning_rate": 2.4598663184591254e-07, + "loss": 3.3047, + "step": 7705 + }, + { + "epoch": 0.99, + "grad_norm": 0.6551904082298279, + "learning_rate": 2.414109729736158e-07, + "loss": 3.512, + "step": 7706 + }, + { + "epoch": 0.99, + "grad_norm": 0.6614031791687012, + "learning_rate": 2.368782504982292e-07, + "loss": 3.3369, + "step": 7707 + }, + { + "epoch": 0.99, + "grad_norm": 0.6629523038864136, + "learning_rate": 2.3238846519896274e-07, + "loss": 3.2633, + "step": 7708 + }, + { + "epoch": 0.99, + "grad_norm": 0.6999644041061401, + "learning_rate": 2.2794161784769896e-07, + "loss": 3.2459, + "step": 7709 + }, + { + "epoch": 0.99, + "grad_norm": 0.6301155686378479, + "learning_rate": 2.235377092088542e-07, + "loss": 3.3455, + "step": 7710 + }, + { + "epoch": 0.99, + "grad_norm": 0.6277669668197632, + "learning_rate": 2.1917674003954502e-07, + "loss": 3.355, + "step": 7711 + }, + { + "epoch": 0.99, + "grad_norm": 0.6536439061164856, + "learning_rate": 2.1485871108944955e-07, + "loss": 3.2062, + "step": 7712 + }, + { + "epoch": 0.99, + "grad_norm": 0.622828483581543, + "learning_rate": 2.1058362310091837e-07, + "loss": 3.2024, + "step": 7713 + }, + { + "epoch": 0.99, + "grad_norm": 0.6759048700332642, + "learning_rate": 2.0635147680886368e-07, + "loss": 3.2899, + "step": 7714 + }, + { + "epoch": 0.99, + "grad_norm": 0.6670531034469604, + "learning_rate": 2.0216227294084234e-07, + "loss": 3.3518, + "step": 7715 + }, + { + "epoch": 0.99, + "grad_norm": 0.6462798714637756, + "learning_rate": 1.9801601221702825e-07, + "loss": 3.2263, + "step": 7716 + }, + { + "epoch": 0.99, + "grad_norm": 0.6871739625930786, + "learning_rate": 1.939126953502124e-07, + "loss": 3.2723, + "step": 7717 + }, + { + "epoch": 0.99, + "grad_norm": 0.6767607927322388, + "learning_rate": 1.8985232304580268e-07, + "loss": 3.2441, + "step": 7718 + }, + { + "epoch": 0.99, + "grad_norm": 0.6675797700881958, + "learning_rate": 1.8583489600182413e-07, + "loss": 3.2221, + "step": 7719 + }, + { + "epoch": 0.99, + "grad_norm": 0.6533353328704834, + "learning_rate": 1.8186041490894646e-07, + "loss": 3.2238, + "step": 7720 + }, + { + "epoch": 0.99, + "grad_norm": 0.6005282402038574, + "learning_rate": 1.7792888045037315e-07, + "loss": 3.364, + "step": 7721 + }, + { + "epoch": 0.99, + "grad_norm": 0.6536868810653687, + "learning_rate": 1.7404029330203574e-07, + "loss": 3.2367, + "step": 7722 + }, + { + "epoch": 0.99, + "grad_norm": 0.6557430624961853, + "learning_rate": 1.7019465413239955e-07, + "loss": 3.3412, + "step": 7723 + }, + { + "epoch": 0.99, + "grad_norm": 0.6770214438438416, + "learning_rate": 1.6639196360257458e-07, + "loss": 3.2828, + "step": 7724 + }, + { + "epoch": 0.99, + "grad_norm": 0.6555509567260742, + "learning_rate": 1.6263222236628794e-07, + "loss": 3.2589, + "step": 7725 + }, + { + "epoch": 0.99, + "grad_norm": 0.6666334271430969, + "learning_rate": 1.589154310698837e-07, + "loss": 3.3113, + "step": 7726 + }, + { + "epoch": 0.99, + "grad_norm": 0.5893369317054749, + "learning_rate": 1.55241590352323e-07, + "loss": 3.2455, + "step": 7727 + }, + { + "epoch": 0.99, + "grad_norm": 0.7154968976974487, + "learning_rate": 1.5161070084518392e-07, + "loss": 3.325, + "step": 7728 + }, + { + "epoch": 0.99, + "grad_norm": 0.6864626407623291, + "learning_rate": 1.4802276317266162e-07, + "loss": 3.3101, + "step": 7729 + }, + { + "epoch": 0.99, + "grad_norm": 0.6598794460296631, + "learning_rate": 1.444777779515405e-07, + "loss": 3.2696, + "step": 7730 + }, + { + "epoch": 0.99, + "grad_norm": 0.6975576877593994, + "learning_rate": 1.4097574579127749e-07, + "loss": 3.2838, + "step": 7731 + }, + { + "epoch": 0.99, + "grad_norm": 0.6082096695899963, + "learning_rate": 1.37516667293891e-07, + "loss": 3.1761, + "step": 7732 + }, + { + "epoch": 0.99, + "grad_norm": 0.5750147104263306, + "learning_rate": 1.3410054305404428e-07, + "loss": 3.3239, + "step": 7733 + }, + { + "epoch": 0.99, + "grad_norm": 0.6296765208244324, + "learning_rate": 1.3072737365901755e-07, + "loss": 3.3048, + "step": 7734 + }, + { + "epoch": 0.99, + "grad_norm": 0.6614196300506592, + "learning_rate": 1.2739715968868028e-07, + "loss": 3.2068, + "step": 7735 + }, + { + "epoch": 0.99, + "grad_norm": 0.6254644393920898, + "learning_rate": 1.241099017155467e-07, + "loss": 3.2677, + "step": 7736 + }, + { + "epoch": 0.99, + "grad_norm": 0.6854985952377319, + "learning_rate": 1.2086560030474813e-07, + "loss": 3.3053, + "step": 7737 + }, + { + "epoch": 0.99, + "grad_norm": 0.6591229438781738, + "learning_rate": 1.1766425601397734e-07, + "loss": 3.2759, + "step": 7738 + }, + { + "epoch": 0.99, + "grad_norm": 0.6330887079238892, + "learning_rate": 1.1450586939362739e-07, + "loss": 3.3254, + "step": 7739 + }, + { + "epoch": 0.99, + "grad_norm": 0.645741879940033, + "learning_rate": 1.1139044098662509e-07, + "loss": 3.2391, + "step": 7740 + }, + { + "epoch": 0.99, + "grad_norm": 0.674430251121521, + "learning_rate": 1.0831797132854204e-07, + "loss": 3.3575, + "step": 7741 + }, + { + "epoch": 0.99, + "grad_norm": 0.6620994210243225, + "learning_rate": 1.0528846094762234e-07, + "loss": 3.3184, + "step": 7742 + }, + { + "epoch": 0.99, + "grad_norm": 0.6199248433113098, + "learning_rate": 1.0230191036464388e-07, + "loss": 3.2341, + "step": 7743 + }, + { + "epoch": 0.99, + "grad_norm": 0.6632201671600342, + "learning_rate": 9.93583200930015e-08, + "loss": 3.2446, + "step": 7744 + }, + { + "epoch": 0.99, + "grad_norm": 0.6452274918556213, + "learning_rate": 9.645769063879039e-08, + "loss": 3.3059, + "step": 7745 + }, + { + "epoch": 0.99, + "grad_norm": 0.6447122693061829, + "learning_rate": 9.360002250061172e-08, + "loss": 3.3207, + "step": 7746 + }, + { + "epoch": 0.99, + "grad_norm": 0.6201050281524658, + "learning_rate": 9.078531616976693e-08, + "loss": 3.3251, + "step": 7747 + }, + { + "epoch": 0.99, + "grad_norm": 0.663913905620575, + "learning_rate": 8.801357213011896e-08, + "loss": 3.2393, + "step": 7748 + }, + { + "epoch": 0.99, + "grad_norm": 0.6402385234832764, + "learning_rate": 8.528479085817554e-08, + "loss": 3.1977, + "step": 7749 + }, + { + "epoch": 0.99, + "grad_norm": 0.680420458316803, + "learning_rate": 8.259897282303363e-08, + "loss": 3.3444, + "step": 7750 + }, + { + "epoch": 0.99, + "grad_norm": 0.6284847855567932, + "learning_rate": 7.995611848640728e-08, + "loss": 3.2691, + "step": 7751 + }, + { + "epoch": 0.99, + "grad_norm": 0.6648733019828796, + "learning_rate": 7.735622830265521e-08, + "loss": 3.1771, + "step": 7752 + }, + { + "epoch": 0.99, + "grad_norm": 0.6077075600624084, + "learning_rate": 7.479930271869773e-08, + "loss": 3.2351, + "step": 7753 + }, + { + "epoch": 0.99, + "grad_norm": 0.6494194269180298, + "learning_rate": 7.228534217415539e-08, + "loss": 3.2726, + "step": 7754 + }, + { + "epoch": 0.99, + "grad_norm": 0.6784036755561829, + "learning_rate": 6.981434710115475e-08, + "loss": 3.2135, + "step": 7755 + }, + { + "epoch": 0.99, + "grad_norm": 0.6575074195861816, + "learning_rate": 6.738631792452266e-08, + "loss": 3.2102, + "step": 7756 + }, + { + "epoch": 0.99, + "grad_norm": 0.6784305572509766, + "learning_rate": 6.500125506161969e-08, + "loss": 3.3353, + "step": 7757 + }, + { + "epoch": 0.99, + "grad_norm": 0.6398900151252747, + "learning_rate": 6.265915892253448e-08, + "loss": 3.1519, + "step": 7758 + }, + { + "epoch": 0.99, + "grad_norm": 0.6406898498535156, + "learning_rate": 6.036002990983391e-08, + "loss": 3.236, + "step": 7759 + }, + { + "epoch": 0.99, + "grad_norm": 0.6392099857330322, + "learning_rate": 5.810386841878512e-08, + "loss": 3.2477, + "step": 7760 + }, + { + "epoch": 0.99, + "grad_norm": 0.6563682556152344, + "learning_rate": 5.5890674837272285e-08, + "loss": 3.3468, + "step": 7761 + }, + { + "epoch": 0.99, + "grad_norm": 0.6428896188735962, + "learning_rate": 5.3720449545768826e-08, + "loss": 3.3645, + "step": 7762 + }, + { + "epoch": 0.99, + "grad_norm": 0.6636173725128174, + "learning_rate": 5.159319291733744e-08, + "loss": 3.2402, + "step": 7763 + }, + { + "epoch": 0.99, + "grad_norm": 0.6179137229919434, + "learning_rate": 4.950890531765784e-08, + "loss": 3.2196, + "step": 7764 + }, + { + "epoch": 0.99, + "grad_norm": 0.6699796319007874, + "learning_rate": 4.746758710511001e-08, + "loss": 3.3711, + "step": 7765 + }, + { + "epoch": 0.99, + "grad_norm": 0.6111055016517639, + "learning_rate": 4.546923863055219e-08, + "loss": 3.271, + "step": 7766 + }, + { + "epoch": 0.99, + "grad_norm": 0.6459084153175354, + "learning_rate": 4.351386023757064e-08, + "loss": 3.3405, + "step": 7767 + }, + { + "epoch": 0.99, + "grad_norm": 0.6312695741653442, + "learning_rate": 4.1601452262313155e-08, + "loss": 3.3122, + "step": 7768 + }, + { + "epoch": 0.99, + "grad_norm": 0.5960135459899902, + "learning_rate": 3.973201503351675e-08, + "loss": 3.381, + "step": 7769 + }, + { + "epoch": 0.99, + "grad_norm": 0.6069164276123047, + "learning_rate": 3.790554887256326e-08, + "loss": 3.2818, + "step": 7770 + }, + { + "epoch": 0.99, + "grad_norm": 0.6660847663879395, + "learning_rate": 3.612205409347924e-08, + "loss": 3.2356, + "step": 7771 + }, + { + "epoch": 0.99, + "grad_norm": 0.7008808255195618, + "learning_rate": 3.438153100282504e-08, + "loss": 3.3235, + "step": 7772 + }, + { + "epoch": 0.99, + "grad_norm": 0.6775698065757751, + "learning_rate": 3.2683979899833514e-08, + "loss": 3.2186, + "step": 7773 + }, + { + "epoch": 1.0, + "grad_norm": 0.6351761817932129, + "learning_rate": 3.1029401076354546e-08, + "loss": 3.2799, + "step": 7774 + }, + { + "epoch": 1.0, + "grad_norm": 0.6113251447677612, + "learning_rate": 2.9417794816799515e-08, + "loss": 3.2026, + "step": 7775 + }, + { + "epoch": 1.0, + "grad_norm": 0.6366956233978271, + "learning_rate": 2.7849161398224575e-08, + "loss": 3.1522, + "step": 7776 + }, + { + "epoch": 1.0, + "grad_norm": 0.66960209608078, + "learning_rate": 2.632350109033066e-08, + "loss": 3.2107, + "step": 7777 + }, + { + "epoch": 1.0, + "grad_norm": 0.6164228916168213, + "learning_rate": 2.484081415535244e-08, + "loss": 3.1453, + "step": 7778 + }, + { + "epoch": 1.0, + "grad_norm": 0.6470010280609131, + "learning_rate": 2.3401100848197132e-08, + "loss": 3.2593, + "step": 7779 + }, + { + "epoch": 1.0, + "grad_norm": 0.5902237892150879, + "learning_rate": 2.2004361416361195e-08, + "loss": 3.2198, + "step": 7780 + }, + { + "epoch": 1.0, + "grad_norm": 0.6584739089012146, + "learning_rate": 2.0650596099985874e-08, + "loss": 3.3246, + "step": 7781 + }, + { + "epoch": 1.0, + "grad_norm": 0.6267509460449219, + "learning_rate": 1.9339805131773912e-08, + "loss": 3.3515, + "step": 7782 + }, + { + "epoch": 1.0, + "grad_norm": 0.6580419540405273, + "learning_rate": 1.8071988737100585e-08, + "loss": 3.2328, + "step": 7783 + }, + { + "epoch": 1.0, + "grad_norm": 0.6391124129295349, + "learning_rate": 1.6847147133847163e-08, + "loss": 3.2001, + "step": 7784 + }, + { + "epoch": 1.0, + "grad_norm": 0.6607205271720886, + "learning_rate": 1.5665280532650705e-08, + "loss": 3.2384, + "step": 7785 + }, + { + "epoch": 1.0, + "grad_norm": 0.6454439163208008, + "learning_rate": 1.4526389136654273e-08, + "loss": 3.2399, + "step": 7786 + }, + { + "epoch": 1.0, + "grad_norm": 0.6675859093666077, + "learning_rate": 1.3430473141645694e-08, + "loss": 3.1598, + "step": 7787 + }, + { + "epoch": 1.0, + "grad_norm": 0.6172164678573608, + "learning_rate": 1.2377532736057572e-08, + "loss": 3.3129, + "step": 7788 + }, + { + "epoch": 1.0, + "grad_norm": 0.6432853937149048, + "learning_rate": 1.1367568100856262e-08, + "loss": 3.2288, + "step": 7789 + }, + { + "epoch": 1.0, + "grad_norm": 0.6783219575881958, + "learning_rate": 1.0400579409680643e-08, + "loss": 3.2685, + "step": 7790 + }, + { + "epoch": 1.0, + "grad_norm": 0.6008654832839966, + "learning_rate": 9.476566828786615e-09, + "loss": 3.2806, + "step": 7791 + }, + { + "epoch": 1.0, + "grad_norm": 0.6340771317481995, + "learning_rate": 8.595530516991579e-09, + "loss": 3.2155, + "step": 7792 + }, + { + "epoch": 1.0, + "grad_norm": 0.636785626411438, + "learning_rate": 7.757470625785467e-09, + "loss": 3.2274, + "step": 7793 + }, + { + "epoch": 1.0, + "grad_norm": 0.6771584749221802, + "learning_rate": 6.962387299219719e-09, + "loss": 3.2748, + "step": 7794 + }, + { + "epoch": 1.0, + "grad_norm": 0.6428625583648682, + "learning_rate": 6.210280674018298e-09, + "loss": 3.2257, + "step": 7795 + }, + { + "epoch": 1.0, + "grad_norm": 0.6587328910827637, + "learning_rate": 5.501150879411165e-09, + "loss": 3.3533, + "step": 7796 + }, + { + "epoch": 1.0, + "grad_norm": 0.6639074683189392, + "learning_rate": 4.8349980373563195e-09, + "loss": 3.2496, + "step": 7797 + }, + { + "epoch": 1.0, + "grad_norm": 0.6632134318351746, + "learning_rate": 4.2118222623455105e-09, + "loss": 3.2102, + "step": 7798 + }, + { + "epoch": 1.0, + "grad_norm": 0.6787715554237366, + "learning_rate": 3.6316236615430154e-09, + "loss": 3.2833, + "step": 7799 + }, + { + "epoch": 1.0, + "grad_norm": 0.6268180012702942, + "learning_rate": 3.0944023346746175e-09, + "loss": 3.1667, + "step": 7800 + }, + { + "epoch": 1.0, + "grad_norm": 0.6189209222793579, + "learning_rate": 2.6001583740553615e-09, + "loss": 3.2903, + "step": 7801 + }, + { + "epoch": 1.0, + "grad_norm": 0.6033608913421631, + "learning_rate": 2.1488918647283307e-09, + "loss": 3.2652, + "step": 7802 + }, + { + "epoch": 1.0, + "grad_norm": 0.6378599405288696, + "learning_rate": 1.7406028842148481e-09, + "loss": 3.3589, + "step": 7803 + }, + { + "epoch": 1.0, + "grad_norm": 0.6148380637168884, + "learning_rate": 1.3752915027087642e-09, + "loss": 3.2446, + "step": 7804 + }, + { + "epoch": 1.0, + "grad_norm": 0.720740020275116, + "learning_rate": 1.0529577830209468e-09, + "loss": 3.3218, + "step": 7805 + }, + { + "epoch": 1.0, + "grad_norm": 0.6575115919113159, + "learning_rate": 7.736017805792805e-10, + "loss": 3.2913, + "step": 7806 + }, + { + "epoch": 1.0, + "grad_norm": 0.6172797679901123, + "learning_rate": 5.37223543400911e-10, + "loss": 3.1487, + "step": 7807 + }, + { + "epoch": 1.0, + "grad_norm": 0.6265156865119934, + "learning_rate": 3.4382311209224526e-10, + "loss": 3.3373, + "step": 7808 + }, + { + "epoch": 1.0, + "grad_norm": 0.676074743270874, + "learning_rate": 1.9340051995997422e-10, + "loss": 3.1338, + "step": 7809 + }, + { + "epoch": 1.0, + "grad_norm": 0.6446760296821594, + "learning_rate": 8.595579281678311e-11, + "loss": 3.2552, + "step": 7810 + }, + { + "epoch": 1.0, + "grad_norm": 0.6062079071998596, + "learning_rate": 2.1488949120129775e-11, + "loss": 3.1867, + "step": 7811 + }, + { + "epoch": 1.0, + "grad_norm": 0.6387574076652527, + "learning_rate": 0.0, + "loss": 3.367, + "step": 7812 + }, + { + "epoch": 1.0, + "step": 7812, + "total_flos": 9.9855816916258e+17, + "train_loss": 3.4287229562440533, + "train_runtime": 19633.4504, + "train_samples_per_second": 101.867, + "train_steps_per_second": 0.398 + } + ], + "logging_steps": 1.0, + "max_steps": 7812, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 24000, + "total_flos": 9.9855816916258e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}