diff --git "a/checkpoint-10000/trainer_state.json" "b/checkpoint-10000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-10000/trainer_state.json" @@ -0,0 +1,70033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5364806866952789, + "eval_steps": 500, + "global_step": 10000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 5.3648068669527894e-05, + "grad_norm": NaN, + "learning_rate": 0.0, + "loss": 2.4401, + "step": 1 + }, + { + "epoch": 0.00010729613733905579, + "grad_norm": NaN, + "learning_rate": 0.0, + "loss": 2.5072, + "step": 2 + }, + { + "epoch": 0.0001609442060085837, + "grad_norm": 0.6935932040214539, + "learning_rate": 8.92857142857143e-09, + "loss": 2.2394, + "step": 3 + }, + { + "epoch": 0.00021459227467811158, + "grad_norm": 0.677909791469574, + "learning_rate": 1.785714285714286e-08, + "loss": 2.5629, + "step": 4 + }, + { + "epoch": 0.0002682403433476395, + "grad_norm": 0.6475480198860168, + "learning_rate": 2.678571428571429e-08, + "loss": 2.8006, + "step": 5 + }, + { + "epoch": 0.0003218884120171674, + "grad_norm": 0.8326438665390015, + "learning_rate": 3.571428571428572e-08, + "loss": 2.4323, + "step": 6 + }, + { + "epoch": 0.0003755364806866953, + "grad_norm": 0.8124875426292419, + "learning_rate": 4.4642857142857145e-08, + "loss": 2.4991, + "step": 7 + }, + { + "epoch": 0.00042918454935622315, + "grad_norm": 1.1957017183303833, + "learning_rate": 5.357142857142858e-08, + "loss": 2.1047, + "step": 8 + }, + { + "epoch": 0.00048283261802575106, + "grad_norm": 0.759741485118866, + "learning_rate": 6.250000000000001e-08, + "loss": 2.3788, + "step": 9 + }, + { + "epoch": 0.000536480686695279, + "grad_norm": 1.6295050382614136, + "learning_rate": 7.142857142857144e-08, + "loss": 1.3629, + "step": 10 + }, + { + "epoch": 0.0005901287553648068, + "grad_norm": 0.6981147527694702, + "learning_rate": 8.035714285714285e-08, + "loss": 2.4387, + "step": 11 + }, + { + "epoch": 0.0006437768240343348, + "grad_norm": 0.8827914595603943, + "learning_rate": 8.928571428571429e-08, + "loss": 2.6157, + "step": 12 + }, + { + "epoch": 0.0006974248927038626, + "grad_norm": 0.7324496507644653, + "learning_rate": 9.821428571428572e-08, + "loss": 2.5207, + "step": 13 + }, + { + "epoch": 0.0007510729613733906, + "grad_norm": 0.8087125420570374, + "learning_rate": 1.0714285714285716e-07, + "loss": 2.3233, + "step": 14 + }, + { + "epoch": 0.0008047210300429185, + "grad_norm": 0.7666011452674866, + "learning_rate": 1.1607142857142858e-07, + "loss": 2.6354, + "step": 15 + }, + { + "epoch": 0.0008583690987124463, + "grad_norm": 0.6825481057167053, + "learning_rate": 1.2500000000000002e-07, + "loss": 2.1957, + "step": 16 + }, + { + "epoch": 0.0009120171673819743, + "grad_norm": 0.6773061156272888, + "learning_rate": 1.3392857142857142e-07, + "loss": 2.4252, + "step": 17 + }, + { + "epoch": 0.0009656652360515021, + "grad_norm": 0.8494783043861389, + "learning_rate": 1.4285714285714287e-07, + "loss": 2.2829, + "step": 18 + }, + { + "epoch": 0.00101931330472103, + "grad_norm": 0.8014193177223206, + "learning_rate": 1.517857142857143e-07, + "loss": 2.2007, + "step": 19 + }, + { + "epoch": 0.001072961373390558, + "grad_norm": 0.8662102222442627, + "learning_rate": 1.607142857142857e-07, + "loss": 2.4999, + "step": 20 + }, + { + "epoch": 0.0011266094420600858, + "grad_norm": 1.2300390005111694, + "learning_rate": 1.6964285714285715e-07, + "loss": 2.7978, + "step": 21 + }, + { + "epoch": 0.0011802575107296136, + "grad_norm": NaN, + "learning_rate": 1.6964285714285715e-07, + "loss": 2.4121, + "step": 22 + }, + { + "epoch": 0.0012339055793991417, + "grad_norm": 0.8139364123344421, + "learning_rate": 1.7857142857142858e-07, + "loss": 2.4236, + "step": 23 + }, + { + "epoch": 0.0012875536480686696, + "grad_norm": 0.8446831107139587, + "learning_rate": 1.875e-07, + "loss": 2.4883, + "step": 24 + }, + { + "epoch": 0.0013412017167381974, + "grad_norm": 0.7207334637641907, + "learning_rate": 1.9642857142857143e-07, + "loss": 2.1914, + "step": 25 + }, + { + "epoch": 0.0013948497854077253, + "grad_norm": 0.9601536989212036, + "learning_rate": 2.0535714285714288e-07, + "loss": 2.3311, + "step": 26 + }, + { + "epoch": 0.0014484978540772531, + "grad_norm": 1.0330086946487427, + "learning_rate": 2.142857142857143e-07, + "loss": 2.678, + "step": 27 + }, + { + "epoch": 0.0015021459227467812, + "grad_norm": 0.8670296669006348, + "learning_rate": 2.2321428571428574e-07, + "loss": 2.3593, + "step": 28 + }, + { + "epoch": 0.001555793991416309, + "grad_norm": 2.535274028778076, + "learning_rate": 2.3214285714285716e-07, + "loss": 2.3478, + "step": 29 + }, + { + "epoch": 0.001609442060085837, + "grad_norm": 0.9899892807006836, + "learning_rate": 2.410714285714286e-07, + "loss": 2.2422, + "step": 30 + }, + { + "epoch": 0.0016630901287553648, + "grad_norm": 1.1274137496948242, + "learning_rate": 2.5000000000000004e-07, + "loss": 2.6209, + "step": 31 + }, + { + "epoch": 0.0017167381974248926, + "grad_norm": 0.8845741748809814, + "learning_rate": 2.5892857142857147e-07, + "loss": 2.4619, + "step": 32 + }, + { + "epoch": 0.0017703862660944207, + "grad_norm": 1.4410101175308228, + "learning_rate": 2.6785714285714284e-07, + "loss": 2.6095, + "step": 33 + }, + { + "epoch": 0.0018240343347639485, + "grad_norm": 0.9437788724899292, + "learning_rate": 2.767857142857143e-07, + "loss": 2.6002, + "step": 34 + }, + { + "epoch": 0.0018776824034334764, + "grad_norm": 0.8635700345039368, + "learning_rate": 2.8571428571428575e-07, + "loss": 2.4766, + "step": 35 + }, + { + "epoch": 0.0019313304721030042, + "grad_norm": 0.8582034111022949, + "learning_rate": 2.9464285714285717e-07, + "loss": 2.8097, + "step": 36 + }, + { + "epoch": 0.0019849785407725323, + "grad_norm": 0.8454009890556335, + "learning_rate": 3.035714285714286e-07, + "loss": 2.442, + "step": 37 + }, + { + "epoch": 0.00203862660944206, + "grad_norm": 0.7810462117195129, + "learning_rate": 3.125e-07, + "loss": 2.5525, + "step": 38 + }, + { + "epoch": 0.002092274678111588, + "grad_norm": 0.6493550539016724, + "learning_rate": 3.214285714285714e-07, + "loss": 2.4719, + "step": 39 + }, + { + "epoch": 0.002145922746781116, + "grad_norm": 0.9213592410087585, + "learning_rate": 3.303571428571429e-07, + "loss": 2.4089, + "step": 40 + }, + { + "epoch": 0.0021995708154506437, + "grad_norm": 0.6836276054382324, + "learning_rate": 3.392857142857143e-07, + "loss": 2.5924, + "step": 41 + }, + { + "epoch": 0.0022532188841201716, + "grad_norm": 0.9182270169258118, + "learning_rate": 3.4821428571428573e-07, + "loss": 2.631, + "step": 42 + }, + { + "epoch": 0.0023068669527896994, + "grad_norm": 0.7277834415435791, + "learning_rate": 3.5714285714285716e-07, + "loss": 2.5004, + "step": 43 + }, + { + "epoch": 0.0023605150214592273, + "grad_norm": 1.9422338008880615, + "learning_rate": 3.6607142857142864e-07, + "loss": 2.4053, + "step": 44 + }, + { + "epoch": 0.0024141630901287556, + "grad_norm": 0.8444631099700928, + "learning_rate": 3.75e-07, + "loss": 2.5125, + "step": 45 + }, + { + "epoch": 0.0024678111587982834, + "grad_norm": NaN, + "learning_rate": 3.75e-07, + "loss": 2.7383, + "step": 46 + }, + { + "epoch": 0.0025214592274678113, + "grad_norm": 0.9670552015304565, + "learning_rate": 3.839285714285715e-07, + "loss": 2.6818, + "step": 47 + }, + { + "epoch": 0.002575107296137339, + "grad_norm": 1.7243165969848633, + "learning_rate": 3.9285714285714286e-07, + "loss": 2.3875, + "step": 48 + }, + { + "epoch": 0.002628755364806867, + "grad_norm": 0.9937056303024292, + "learning_rate": 4.0178571428571434e-07, + "loss": 2.5009, + "step": 49 + }, + { + "epoch": 0.002682403433476395, + "grad_norm": 0.9792752861976624, + "learning_rate": 4.1071428571428577e-07, + "loss": 2.4047, + "step": 50 + }, + { + "epoch": 0.0027360515021459227, + "grad_norm": 0.67503422498703, + "learning_rate": 4.1964285714285714e-07, + "loss": 2.2836, + "step": 51 + }, + { + "epoch": 0.0027896995708154505, + "grad_norm": 0.8637988567352295, + "learning_rate": 4.285714285714286e-07, + "loss": 2.7357, + "step": 52 + }, + { + "epoch": 0.0028433476394849784, + "grad_norm": 1.1493501663208008, + "learning_rate": 4.375e-07, + "loss": 2.017, + "step": 53 + }, + { + "epoch": 0.0028969957081545063, + "grad_norm": 1.1174843311309814, + "learning_rate": 4.4642857142857147e-07, + "loss": 2.0569, + "step": 54 + }, + { + "epoch": 0.0029506437768240345, + "grad_norm": 0.9512081146240234, + "learning_rate": 4.553571428571429e-07, + "loss": 2.4652, + "step": 55 + }, + { + "epoch": 0.0030042918454935624, + "grad_norm": 0.7697733044624329, + "learning_rate": 4.642857142857143e-07, + "loss": 2.6841, + "step": 56 + }, + { + "epoch": 0.0030579399141630902, + "grad_norm": 0.7791878581047058, + "learning_rate": 4.7321428571428575e-07, + "loss": 2.326, + "step": 57 + }, + { + "epoch": 0.003111587982832618, + "grad_norm": 1.6973649263381958, + "learning_rate": 4.821428571428572e-07, + "loss": 2.4609, + "step": 58 + }, + { + "epoch": 0.003165236051502146, + "grad_norm": 0.8930851221084595, + "learning_rate": 4.910714285714286e-07, + "loss": 2.3977, + "step": 59 + }, + { + "epoch": 0.003218884120171674, + "grad_norm": 1.0759652853012085, + "learning_rate": 5.000000000000001e-07, + "loss": 2.6328, + "step": 60 + }, + { + "epoch": 0.0032725321888412017, + "grad_norm": 0.760720431804657, + "learning_rate": 5.089285714285715e-07, + "loss": 2.4299, + "step": 61 + }, + { + "epoch": 0.0033261802575107295, + "grad_norm": 0.917052686214447, + "learning_rate": 5.178571428571429e-07, + "loss": 2.0909, + "step": 62 + }, + { + "epoch": 0.0033798283261802574, + "grad_norm": 0.8736194968223572, + "learning_rate": 5.267857142857143e-07, + "loss": 2.4152, + "step": 63 + }, + { + "epoch": 0.0034334763948497852, + "grad_norm": 1.0136430263519287, + "learning_rate": 5.357142857142857e-07, + "loss": 2.4125, + "step": 64 + }, + { + "epoch": 0.0034871244635193135, + "grad_norm": 1.9392871856689453, + "learning_rate": 5.446428571428572e-07, + "loss": 2.2764, + "step": 65 + }, + { + "epoch": 0.0035407725321888414, + "grad_norm": 1.4139771461486816, + "learning_rate": 5.535714285714286e-07, + "loss": 2.4301, + "step": 66 + }, + { + "epoch": 0.003594420600858369, + "grad_norm": 0.9242576360702515, + "learning_rate": 5.625e-07, + "loss": 2.4611, + "step": 67 + }, + { + "epoch": 0.003648068669527897, + "grad_norm": 0.7728210687637329, + "learning_rate": 5.714285714285715e-07, + "loss": 2.6612, + "step": 68 + }, + { + "epoch": 0.003701716738197425, + "grad_norm": 0.8032379150390625, + "learning_rate": 5.80357142857143e-07, + "loss": 2.3875, + "step": 69 + }, + { + "epoch": 0.0037553648068669528, + "grad_norm": 0.954206645488739, + "learning_rate": 5.892857142857143e-07, + "loss": 2.4059, + "step": 70 + }, + { + "epoch": 0.0038090128755364806, + "grad_norm": 0.7434251308441162, + "learning_rate": 5.982142857142858e-07, + "loss": 2.4556, + "step": 71 + }, + { + "epoch": 0.0038626609442060085, + "grad_norm": 0.9050096273422241, + "learning_rate": 6.071428571428572e-07, + "loss": 2.4441, + "step": 72 + }, + { + "epoch": 0.003916309012875536, + "grad_norm": 0.8813114166259766, + "learning_rate": 6.160714285714287e-07, + "loss": 2.4783, + "step": 73 + }, + { + "epoch": 0.003969957081545065, + "grad_norm": 0.8532451391220093, + "learning_rate": 6.25e-07, + "loss": 2.4637, + "step": 74 + }, + { + "epoch": 0.004023605150214592, + "grad_norm": 0.6587334275245667, + "learning_rate": 6.339285714285714e-07, + "loss": 2.4099, + "step": 75 + }, + { + "epoch": 0.00407725321888412, + "grad_norm": 3.630815267562866, + "learning_rate": 6.428571428571428e-07, + "loss": 2.5932, + "step": 76 + }, + { + "epoch": 0.004130901287553648, + "grad_norm": 1.041545033454895, + "learning_rate": 6.517857142857144e-07, + "loss": 2.0863, + "step": 77 + }, + { + "epoch": 0.004184549356223176, + "grad_norm": 0.7904495596885681, + "learning_rate": 6.607142857142858e-07, + "loss": 2.4064, + "step": 78 + }, + { + "epoch": 0.0042381974248927035, + "grad_norm": 0.8185285329818726, + "learning_rate": 6.696428571428571e-07, + "loss": 2.3364, + "step": 79 + }, + { + "epoch": 0.004291845493562232, + "grad_norm": 0.7770063877105713, + "learning_rate": 6.785714285714286e-07, + "loss": 2.4239, + "step": 80 + }, + { + "epoch": 0.00434549356223176, + "grad_norm": 0.9059091806411743, + "learning_rate": 6.875000000000001e-07, + "loss": 2.6458, + "step": 81 + }, + { + "epoch": 0.0043991416309012875, + "grad_norm": 0.7613852024078369, + "learning_rate": 6.964285714285715e-07, + "loss": 2.1675, + "step": 82 + }, + { + "epoch": 0.004452789699570816, + "grad_norm": 14.029939651489258, + "learning_rate": 7.053571428571429e-07, + "loss": 2.6587, + "step": 83 + }, + { + "epoch": 0.004506437768240343, + "grad_norm": 0.8541538715362549, + "learning_rate": 7.142857142857143e-07, + "loss": 2.612, + "step": 84 + }, + { + "epoch": 0.0045600858369098714, + "grad_norm": 2.270970582962036, + "learning_rate": 7.232142857142858e-07, + "loss": 2.4127, + "step": 85 + }, + { + "epoch": 0.004613733905579399, + "grad_norm": 1.2037321329116821, + "learning_rate": 7.321428571428573e-07, + "loss": 2.4513, + "step": 86 + }, + { + "epoch": 0.004667381974248927, + "grad_norm": 1.0241434574127197, + "learning_rate": 7.410714285714286e-07, + "loss": 2.7365, + "step": 87 + }, + { + "epoch": 0.004721030042918455, + "grad_norm": 0.6498371362686157, + "learning_rate": 7.5e-07, + "loss": 2.3593, + "step": 88 + }, + { + "epoch": 0.004774678111587983, + "grad_norm": 0.9677167534828186, + "learning_rate": 7.589285714285714e-07, + "loss": 2.5213, + "step": 89 + }, + { + "epoch": 0.004828326180257511, + "grad_norm": 0.8672310709953308, + "learning_rate": 7.67857142857143e-07, + "loss": 2.6857, + "step": 90 + }, + { + "epoch": 0.0048819742489270386, + "grad_norm": 0.9947278499603271, + "learning_rate": 7.767857142857144e-07, + "loss": 2.5638, + "step": 91 + }, + { + "epoch": 0.004935622317596567, + "grad_norm": 0.7824763059616089, + "learning_rate": 7.857142857142857e-07, + "loss": 2.3563, + "step": 92 + }, + { + "epoch": 0.004989270386266094, + "grad_norm": 0.759227991104126, + "learning_rate": 7.946428571428572e-07, + "loss": 2.4229, + "step": 93 + }, + { + "epoch": 0.0050429184549356226, + "grad_norm": 0.6327589750289917, + "learning_rate": 8.035714285714287e-07, + "loss": 2.3801, + "step": 94 + }, + { + "epoch": 0.00509656652360515, + "grad_norm": 0.7244407534599304, + "learning_rate": 8.125000000000001e-07, + "loss": 2.2058, + "step": 95 + }, + { + "epoch": 0.005150214592274678, + "grad_norm": 1.2423087358474731, + "learning_rate": 8.214285714285715e-07, + "loss": 2.6372, + "step": 96 + }, + { + "epoch": 0.005203862660944206, + "grad_norm": 1.712682843208313, + "learning_rate": 8.303571428571429e-07, + "loss": 2.5142, + "step": 97 + }, + { + "epoch": 0.005257510729613734, + "grad_norm": 0.9291441440582275, + "learning_rate": 8.392857142857143e-07, + "loss": 1.9431, + "step": 98 + }, + { + "epoch": 0.005311158798283261, + "grad_norm": 2.164389133453369, + "learning_rate": 8.482142857142859e-07, + "loss": 2.4759, + "step": 99 + }, + { + "epoch": 0.00536480686695279, + "grad_norm": 0.6145456433296204, + "learning_rate": 8.571428571428572e-07, + "loss": 2.3094, + "step": 100 + }, + { + "epoch": 0.005418454935622318, + "grad_norm": 0.9413978457450867, + "learning_rate": 8.660714285714286e-07, + "loss": 2.5706, + "step": 101 + }, + { + "epoch": 0.005472103004291845, + "grad_norm": 0.861832320690155, + "learning_rate": 8.75e-07, + "loss": 2.4966, + "step": 102 + }, + { + "epoch": 0.005525751072961374, + "grad_norm": 0.7913983464241028, + "learning_rate": 8.839285714285716e-07, + "loss": 2.5964, + "step": 103 + }, + { + "epoch": 0.005579399141630901, + "grad_norm": 0.9602859020233154, + "learning_rate": 8.928571428571429e-07, + "loss": 2.9216, + "step": 104 + }, + { + "epoch": 0.005633047210300429, + "grad_norm": 0.6429104804992676, + "learning_rate": 9.017857142857143e-07, + "loss": 2.4912, + "step": 105 + }, + { + "epoch": 0.005686695278969957, + "grad_norm": 0.8189711570739746, + "learning_rate": 9.107142857142858e-07, + "loss": 2.2812, + "step": 106 + }, + { + "epoch": 0.005740343347639485, + "grad_norm": 0.686623752117157, + "learning_rate": 9.196428571428573e-07, + "loss": 2.6057, + "step": 107 + }, + { + "epoch": 0.0057939914163090125, + "grad_norm": 0.99550861120224, + "learning_rate": 9.285714285714287e-07, + "loss": 2.6879, + "step": 108 + }, + { + "epoch": 0.005847639484978541, + "grad_norm": 1.379797339439392, + "learning_rate": 9.375000000000001e-07, + "loss": 2.5634, + "step": 109 + }, + { + "epoch": 0.005901287553648069, + "grad_norm": 0.9499379396438599, + "learning_rate": 9.464285714285715e-07, + "loss": 2.4607, + "step": 110 + }, + { + "epoch": 0.0059549356223175965, + "grad_norm": 1.0836615562438965, + "learning_rate": 9.553571428571429e-07, + "loss": 2.5644, + "step": 111 + }, + { + "epoch": 0.006008583690987125, + "grad_norm": 0.7682836651802063, + "learning_rate": 9.642857142857145e-07, + "loss": 2.5347, + "step": 112 + }, + { + "epoch": 0.006062231759656652, + "grad_norm": 1.3963618278503418, + "learning_rate": 9.732142857142858e-07, + "loss": 2.3035, + "step": 113 + }, + { + "epoch": 0.0061158798283261805, + "grad_norm": 0.8740382194519043, + "learning_rate": 9.821428571428572e-07, + "loss": 2.2037, + "step": 114 + }, + { + "epoch": 0.006169527896995708, + "grad_norm": 1.2923239469528198, + "learning_rate": 9.910714285714286e-07, + "loss": 2.3938, + "step": 115 + }, + { + "epoch": 0.006223175965665236, + "grad_norm": 0.9651234149932861, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.7977, + "step": 116 + }, + { + "epoch": 0.006276824034334764, + "grad_norm": 1.1392234563827515, + "learning_rate": 1.0089285714285715e-06, + "loss": 2.6388, + "step": 117 + }, + { + "epoch": 0.006330472103004292, + "grad_norm": 0.7700512409210205, + "learning_rate": 1.017857142857143e-06, + "loss": 2.6283, + "step": 118 + }, + { + "epoch": 0.006384120171673819, + "grad_norm": 0.7713974714279175, + "learning_rate": 1.0267857142857143e-06, + "loss": 2.5588, + "step": 119 + }, + { + "epoch": 0.006437768240343348, + "grad_norm": 0.9464917778968811, + "learning_rate": 1.0357142857142859e-06, + "loss": 2.6444, + "step": 120 + }, + { + "epoch": 0.006491416309012876, + "grad_norm": 0.7184650897979736, + "learning_rate": 1.0446428571428572e-06, + "loss": 2.0531, + "step": 121 + }, + { + "epoch": 0.006545064377682403, + "grad_norm": 0.7756190299987793, + "learning_rate": 1.0535714285714286e-06, + "loss": 2.4958, + "step": 122 + }, + { + "epoch": 0.006598712446351932, + "grad_norm": 0.6817174553871155, + "learning_rate": 1.0625e-06, + "loss": 2.4343, + "step": 123 + }, + { + "epoch": 0.006652360515021459, + "grad_norm": 0.8120045065879822, + "learning_rate": 1.0714285714285714e-06, + "loss": 2.6489, + "step": 124 + }, + { + "epoch": 0.006706008583690987, + "grad_norm": 1.4648245573043823, + "learning_rate": 1.080357142857143e-06, + "loss": 2.5889, + "step": 125 + }, + { + "epoch": 0.006759656652360515, + "grad_norm": 0.7509508728981018, + "learning_rate": 1.0892857142857143e-06, + "loss": 2.8112, + "step": 126 + }, + { + "epoch": 0.006813304721030043, + "grad_norm": 4.133321285247803, + "learning_rate": 1.0982142857142857e-06, + "loss": 2.2546, + "step": 127 + }, + { + "epoch": 0.0068669527896995704, + "grad_norm": 0.9913201332092285, + "learning_rate": 1.1071428571428573e-06, + "loss": 2.3405, + "step": 128 + }, + { + "epoch": 0.006920600858369099, + "grad_norm": 0.8450392484664917, + "learning_rate": 1.1160714285714287e-06, + "loss": 2.6864, + "step": 129 + }, + { + "epoch": 0.006974248927038627, + "grad_norm": 0.6946297883987427, + "learning_rate": 1.125e-06, + "loss": 2.5017, + "step": 130 + }, + { + "epoch": 0.0070278969957081544, + "grad_norm": 0.7169015407562256, + "learning_rate": 1.1339285714285716e-06, + "loss": 2.5261, + "step": 131 + }, + { + "epoch": 0.007081545064377683, + "grad_norm": 0.83310467004776, + "learning_rate": 1.142857142857143e-06, + "loss": 2.2022, + "step": 132 + }, + { + "epoch": 0.00713519313304721, + "grad_norm": 1.942430853843689, + "learning_rate": 1.1517857142857144e-06, + "loss": 2.4821, + "step": 133 + }, + { + "epoch": 0.007188841201716738, + "grad_norm": 0.6703306436538696, + "learning_rate": 1.160714285714286e-06, + "loss": 2.2026, + "step": 134 + }, + { + "epoch": 0.007242489270386266, + "grad_norm": 0.9137828350067139, + "learning_rate": 1.1696428571428573e-06, + "loss": 2.1623, + "step": 135 + }, + { + "epoch": 0.007296137339055794, + "grad_norm": 0.7661757469177246, + "learning_rate": 1.1785714285714287e-06, + "loss": 2.4106, + "step": 136 + }, + { + "epoch": 0.0073497854077253216, + "grad_norm": 0.835307240486145, + "learning_rate": 1.1875e-06, + "loss": 2.1197, + "step": 137 + }, + { + "epoch": 0.00740343347639485, + "grad_norm": 1.353977918624878, + "learning_rate": 1.1964285714285717e-06, + "loss": 2.5839, + "step": 138 + }, + { + "epoch": 0.007457081545064377, + "grad_norm": 0.7668522000312805, + "learning_rate": 1.205357142857143e-06, + "loss": 2.6026, + "step": 139 + }, + { + "epoch": 0.0075107296137339056, + "grad_norm": 0.8905326724052429, + "learning_rate": 1.2142857142857144e-06, + "loss": 2.5294, + "step": 140 + }, + { + "epoch": 0.007564377682403434, + "grad_norm": 0.9774091839790344, + "learning_rate": 1.2232142857142858e-06, + "loss": 2.501, + "step": 141 + }, + { + "epoch": 0.007618025751072961, + "grad_norm": 0.6075891852378845, + "learning_rate": 1.2321428571428574e-06, + "loss": 2.5342, + "step": 142 + }, + { + "epoch": 0.0076716738197424895, + "grad_norm": 1.0102717876434326, + "learning_rate": 1.2410714285714287e-06, + "loss": 2.0099, + "step": 143 + }, + { + "epoch": 0.007725321888412017, + "grad_norm": 0.7865061163902283, + "learning_rate": 1.25e-06, + "loss": 2.4802, + "step": 144 + }, + { + "epoch": 0.007778969957081545, + "grad_norm": 0.7139711976051331, + "learning_rate": 1.2589285714285715e-06, + "loss": 2.4817, + "step": 145 + }, + { + "epoch": 0.007832618025751073, + "grad_norm": 0.8458415865898132, + "learning_rate": 1.2678571428571428e-06, + "loss": 2.5749, + "step": 146 + }, + { + "epoch": 0.007886266094420601, + "grad_norm": 0.7011553645133972, + "learning_rate": 1.2767857142857142e-06, + "loss": 2.4348, + "step": 147 + }, + { + "epoch": 0.00793991416309013, + "grad_norm": 0.7118622064590454, + "learning_rate": 1.2857142857142856e-06, + "loss": 2.7591, + "step": 148 + }, + { + "epoch": 0.007993562231759656, + "grad_norm": 1.0853469371795654, + "learning_rate": 1.2946428571428574e-06, + "loss": 2.5427, + "step": 149 + }, + { + "epoch": 0.008047210300429184, + "grad_norm": 0.5898392200469971, + "learning_rate": 1.3035714285714288e-06, + "loss": 2.3806, + "step": 150 + }, + { + "epoch": 0.008100858369098712, + "grad_norm": 0.8301461338996887, + "learning_rate": 1.3125000000000001e-06, + "loss": 2.4623, + "step": 151 + }, + { + "epoch": 0.00815450643776824, + "grad_norm": 1.341497540473938, + "learning_rate": 1.3214285714285715e-06, + "loss": 2.4946, + "step": 152 + }, + { + "epoch": 0.008208154506437769, + "grad_norm": 1.0956140756607056, + "learning_rate": 1.3303571428571429e-06, + "loss": 2.5995, + "step": 153 + }, + { + "epoch": 0.008261802575107296, + "grad_norm": NaN, + "learning_rate": 1.3303571428571429e-06, + "loss": 2.5367, + "step": 154 + }, + { + "epoch": 0.008315450643776824, + "grad_norm": 0.6907609105110168, + "learning_rate": 1.3392857142857143e-06, + "loss": 2.3598, + "step": 155 + }, + { + "epoch": 0.008369098712446352, + "grad_norm": 0.7842445373535156, + "learning_rate": 1.3482142857142858e-06, + "loss": 2.2936, + "step": 156 + }, + { + "epoch": 0.00842274678111588, + "grad_norm": 0.7514459490776062, + "learning_rate": 1.3571428571428572e-06, + "loss": 2.6988, + "step": 157 + }, + { + "epoch": 0.008476394849785407, + "grad_norm": 0.8455466628074646, + "learning_rate": 1.3660714285714286e-06, + "loss": 2.5296, + "step": 158 + }, + { + "epoch": 0.008530042918454935, + "grad_norm": 0.913686990737915, + "learning_rate": 1.3750000000000002e-06, + "loss": 2.6835, + "step": 159 + }, + { + "epoch": 0.008583690987124463, + "grad_norm": 0.6005373001098633, + "learning_rate": 1.3839285714285715e-06, + "loss": 2.2973, + "step": 160 + }, + { + "epoch": 0.008637339055793992, + "grad_norm": 1.3129281997680664, + "learning_rate": 1.392857142857143e-06, + "loss": 2.2657, + "step": 161 + }, + { + "epoch": 0.00869098712446352, + "grad_norm": 0.7612324953079224, + "learning_rate": 1.4017857142857145e-06, + "loss": 2.5952, + "step": 162 + }, + { + "epoch": 0.008744635193133047, + "grad_norm": 0.8370961546897888, + "learning_rate": 1.4107142857142859e-06, + "loss": 2.477, + "step": 163 + }, + { + "epoch": 0.008798283261802575, + "grad_norm": 0.8533289432525635, + "learning_rate": 1.4196428571428573e-06, + "loss": 2.375, + "step": 164 + }, + { + "epoch": 0.008851931330472103, + "grad_norm": 0.6898089051246643, + "learning_rate": 1.4285714285714286e-06, + "loss": 2.6677, + "step": 165 + }, + { + "epoch": 0.008905579399141631, + "grad_norm": 0.7607659101486206, + "learning_rate": 1.4375e-06, + "loss": 2.4561, + "step": 166 + }, + { + "epoch": 0.008959227467811158, + "grad_norm": 0.725953221321106, + "learning_rate": 1.4464285714285716e-06, + "loss": 2.4179, + "step": 167 + }, + { + "epoch": 0.009012875536480686, + "grad_norm": 0.8308206796646118, + "learning_rate": 1.4553571428571432e-06, + "loss": 2.2374, + "step": 168 + }, + { + "epoch": 0.009066523605150215, + "grad_norm": 1.1526063680648804, + "learning_rate": 1.4642857142857145e-06, + "loss": 2.1913, + "step": 169 + }, + { + "epoch": 0.009120171673819743, + "grad_norm": 0.9452094435691833, + "learning_rate": 1.473214285714286e-06, + "loss": 2.4841, + "step": 170 + }, + { + "epoch": 0.009173819742489271, + "grad_norm": 0.9099540114402771, + "learning_rate": 1.4821428571428573e-06, + "loss": 2.2326, + "step": 171 + }, + { + "epoch": 0.009227467811158798, + "grad_norm": 1.041542410850525, + "learning_rate": 1.4910714285714287e-06, + "loss": 2.4508, + "step": 172 + }, + { + "epoch": 0.009281115879828326, + "grad_norm": 0.7449007630348206, + "learning_rate": 1.5e-06, + "loss": 2.1162, + "step": 173 + }, + { + "epoch": 0.009334763948497854, + "grad_norm": 0.8525586128234863, + "learning_rate": 1.5089285714285714e-06, + "loss": 2.3712, + "step": 174 + }, + { + "epoch": 0.009388412017167383, + "grad_norm": 0.9100508689880371, + "learning_rate": 1.5178571428571428e-06, + "loss": 2.7238, + "step": 175 + }, + { + "epoch": 0.00944206008583691, + "grad_norm": 0.8841806054115295, + "learning_rate": 1.5267857142857146e-06, + "loss": 2.5829, + "step": 176 + }, + { + "epoch": 0.009495708154506437, + "grad_norm": 0.6963426470756531, + "learning_rate": 1.535714285714286e-06, + "loss": 2.5081, + "step": 177 + }, + { + "epoch": 0.009549356223175966, + "grad_norm": 0.6730285882949829, + "learning_rate": 1.5446428571428573e-06, + "loss": 2.429, + "step": 178 + }, + { + "epoch": 0.009603004291845494, + "grad_norm": 0.6490024924278259, + "learning_rate": 1.5535714285714287e-06, + "loss": 2.4692, + "step": 179 + }, + { + "epoch": 0.009656652360515022, + "grad_norm": 0.6705166101455688, + "learning_rate": 1.5625e-06, + "loss": 2.3184, + "step": 180 + }, + { + "epoch": 0.009710300429184549, + "grad_norm": 0.6710647344589233, + "learning_rate": 1.5714285714285714e-06, + "loss": 2.216, + "step": 181 + }, + { + "epoch": 0.009763948497854077, + "grad_norm": 1.1906534433364868, + "learning_rate": 1.5803571428571428e-06, + "loss": 2.4681, + "step": 182 + }, + { + "epoch": 0.009817596566523605, + "grad_norm": 0.6579937934875488, + "learning_rate": 1.5892857142857144e-06, + "loss": 2.187, + "step": 183 + }, + { + "epoch": 0.009871244635193134, + "grad_norm": 0.704179584980011, + "learning_rate": 1.5982142857142858e-06, + "loss": 2.4643, + "step": 184 + }, + { + "epoch": 0.00992489270386266, + "grad_norm": 0.7342430353164673, + "learning_rate": 1.6071428571428574e-06, + "loss": 2.5128, + "step": 185 + }, + { + "epoch": 0.009978540772532189, + "grad_norm": 0.793552041053772, + "learning_rate": 1.6160714285714287e-06, + "loss": 2.3754, + "step": 186 + }, + { + "epoch": 0.010032188841201717, + "grad_norm": 1.0377492904663086, + "learning_rate": 1.6250000000000001e-06, + "loss": 2.4328, + "step": 187 + }, + { + "epoch": 0.010085836909871245, + "grad_norm": 0.6674951314926147, + "learning_rate": 1.6339285714285715e-06, + "loss": 2.4433, + "step": 188 + }, + { + "epoch": 0.010139484978540772, + "grad_norm": 2.0816256999969482, + "learning_rate": 1.642857142857143e-06, + "loss": 2.4011, + "step": 189 + }, + { + "epoch": 0.0101931330472103, + "grad_norm": 0.6879795789718628, + "learning_rate": 1.6517857142857144e-06, + "loss": 2.5995, + "step": 190 + }, + { + "epoch": 0.010246781115879828, + "grad_norm": 0.8282930850982666, + "learning_rate": 1.6607142857142858e-06, + "loss": 2.614, + "step": 191 + }, + { + "epoch": 0.010300429184549357, + "grad_norm": 0.661475658416748, + "learning_rate": 1.6696428571428572e-06, + "loss": 2.4919, + "step": 192 + }, + { + "epoch": 0.010354077253218885, + "grad_norm": 0.6563178896903992, + "learning_rate": 1.6785714285714286e-06, + "loss": 2.3542, + "step": 193 + }, + { + "epoch": 0.010407725321888411, + "grad_norm": 0.8732898831367493, + "learning_rate": 1.6875000000000001e-06, + "loss": 2.5474, + "step": 194 + }, + { + "epoch": 0.01046137339055794, + "grad_norm": 0.7637423276901245, + "learning_rate": 1.6964285714285717e-06, + "loss": 2.3512, + "step": 195 + }, + { + "epoch": 0.010515021459227468, + "grad_norm": 0.8342444896697998, + "learning_rate": 1.7053571428571431e-06, + "loss": 2.4901, + "step": 196 + }, + { + "epoch": 0.010568669527896996, + "grad_norm": 0.7179602384567261, + "learning_rate": 1.7142857142857145e-06, + "loss": 2.6228, + "step": 197 + }, + { + "epoch": 0.010622317596566523, + "grad_norm": 0.7071645259857178, + "learning_rate": 1.7232142857142859e-06, + "loss": 2.5812, + "step": 198 + }, + { + "epoch": 0.010675965665236051, + "grad_norm": 0.6539354920387268, + "learning_rate": 1.7321428571428572e-06, + "loss": 2.3245, + "step": 199 + }, + { + "epoch": 0.01072961373390558, + "grad_norm": 0.8995392322540283, + "learning_rate": 1.7410714285714286e-06, + "loss": 2.5033, + "step": 200 + }, + { + "epoch": 0.010783261802575108, + "grad_norm": 0.8255943059921265, + "learning_rate": 1.75e-06, + "loss": 2.3979, + "step": 201 + }, + { + "epoch": 0.010836909871244636, + "grad_norm": 1.028721809387207, + "learning_rate": 1.7589285714285718e-06, + "loss": 2.4931, + "step": 202 + }, + { + "epoch": 0.010890557939914162, + "grad_norm": 0.7333868741989136, + "learning_rate": 1.7678571428571431e-06, + "loss": 2.4265, + "step": 203 + }, + { + "epoch": 0.01094420600858369, + "grad_norm": 0.6806638836860657, + "learning_rate": 1.7767857142857145e-06, + "loss": 2.4698, + "step": 204 + }, + { + "epoch": 0.010997854077253219, + "grad_norm": 1.7733010053634644, + "learning_rate": 1.7857142857142859e-06, + "loss": 2.6086, + "step": 205 + }, + { + "epoch": 0.011051502145922747, + "grad_norm": 0.6890419125556946, + "learning_rate": 1.7946428571428573e-06, + "loss": 2.4403, + "step": 206 + }, + { + "epoch": 0.011105150214592274, + "grad_norm": 0.7711536884307861, + "learning_rate": 1.8035714285714286e-06, + "loss": 2.272, + "step": 207 + }, + { + "epoch": 0.011158798283261802, + "grad_norm": 0.7877945303916931, + "learning_rate": 1.8125e-06, + "loss": 2.2373, + "step": 208 + }, + { + "epoch": 0.01121244635193133, + "grad_norm": 0.9670550227165222, + "learning_rate": 1.8214285714285716e-06, + "loss": 2.319, + "step": 209 + }, + { + "epoch": 0.011266094420600859, + "grad_norm": 0.69708251953125, + "learning_rate": 1.830357142857143e-06, + "loss": 2.4331, + "step": 210 + }, + { + "epoch": 0.011319742489270387, + "grad_norm": 0.7773550748825073, + "learning_rate": 1.8392857142857146e-06, + "loss": 2.3893, + "step": 211 + }, + { + "epoch": 0.011373390557939914, + "grad_norm": 0.7683597207069397, + "learning_rate": 1.848214285714286e-06, + "loss": 2.5038, + "step": 212 + }, + { + "epoch": 0.011427038626609442, + "grad_norm": 3.5412344932556152, + "learning_rate": 1.8571428571428573e-06, + "loss": 2.0655, + "step": 213 + }, + { + "epoch": 0.01148068669527897, + "grad_norm": 0.6078819632530212, + "learning_rate": 1.8660714285714287e-06, + "loss": 2.5147, + "step": 214 + }, + { + "epoch": 0.011534334763948498, + "grad_norm": 0.8145774602890015, + "learning_rate": 1.8750000000000003e-06, + "loss": 2.4078, + "step": 215 + }, + { + "epoch": 0.011587982832618025, + "grad_norm": 1.0329651832580566, + "learning_rate": 1.8839285714285716e-06, + "loss": 2.3664, + "step": 216 + }, + { + "epoch": 0.011641630901287553, + "grad_norm": 0.6468517780303955, + "learning_rate": 1.892857142857143e-06, + "loss": 2.49, + "step": 217 + }, + { + "epoch": 0.011695278969957082, + "grad_norm": 0.9864062666893005, + "learning_rate": 1.9017857142857144e-06, + "loss": 2.4109, + "step": 218 + }, + { + "epoch": 0.01174892703862661, + "grad_norm": 0.6388240456581116, + "learning_rate": 1.9107142857142858e-06, + "loss": 2.5061, + "step": 219 + }, + { + "epoch": 0.011802575107296138, + "grad_norm": 0.680731475353241, + "learning_rate": 1.9196428571428573e-06, + "loss": 2.2591, + "step": 220 + }, + { + "epoch": 0.011856223175965665, + "grad_norm": 0.8310266137123108, + "learning_rate": 1.928571428571429e-06, + "loss": 2.3415, + "step": 221 + }, + { + "epoch": 0.011909871244635193, + "grad_norm": 0.6374121308326721, + "learning_rate": 1.9375e-06, + "loss": 2.2759, + "step": 222 + }, + { + "epoch": 0.011963519313304721, + "grad_norm": 0.5901023149490356, + "learning_rate": 1.9464285714285717e-06, + "loss": 2.3178, + "step": 223 + }, + { + "epoch": 0.01201716738197425, + "grad_norm": 0.7914515733718872, + "learning_rate": 1.955357142857143e-06, + "loss": 2.3164, + "step": 224 + }, + { + "epoch": 0.012070815450643776, + "grad_norm": 0.9243539571762085, + "learning_rate": 1.9642857142857144e-06, + "loss": 2.447, + "step": 225 + }, + { + "epoch": 0.012124463519313304, + "grad_norm": 0.7908656001091003, + "learning_rate": 1.973214285714286e-06, + "loss": 2.4447, + "step": 226 + }, + { + "epoch": 0.012178111587982833, + "grad_norm": 0.7527483105659485, + "learning_rate": 1.982142857142857e-06, + "loss": 2.6932, + "step": 227 + }, + { + "epoch": 0.012231759656652361, + "grad_norm": 0.7420084476470947, + "learning_rate": 1.9910714285714287e-06, + "loss": 2.5089, + "step": 228 + }, + { + "epoch": 0.012285407725321888, + "grad_norm": 0.6847890615463257, + "learning_rate": 2.0000000000000003e-06, + "loss": 2.4264, + "step": 229 + }, + { + "epoch": 0.012339055793991416, + "grad_norm": 0.7093695402145386, + "learning_rate": 2.0089285714285715e-06, + "loss": 2.1732, + "step": 230 + }, + { + "epoch": 0.012392703862660944, + "grad_norm": 0.8424200415611267, + "learning_rate": 2.017857142857143e-06, + "loss": 2.5147, + "step": 231 + }, + { + "epoch": 0.012446351931330472, + "grad_norm": 0.5956903696060181, + "learning_rate": 2.0267857142857147e-06, + "loss": 2.0074, + "step": 232 + }, + { + "epoch": 0.0125, + "grad_norm": 1.5747441053390503, + "learning_rate": 2.035714285714286e-06, + "loss": 2.5164, + "step": 233 + }, + { + "epoch": 0.012553648068669527, + "grad_norm": 0.6299461722373962, + "learning_rate": 2.0446428571428574e-06, + "loss": 2.3953, + "step": 234 + }, + { + "epoch": 0.012607296137339056, + "grad_norm": 0.6784403324127197, + "learning_rate": 2.0535714285714286e-06, + "loss": 2.4146, + "step": 235 + }, + { + "epoch": 0.012660944206008584, + "grad_norm": 0.6467978954315186, + "learning_rate": 2.0625e-06, + "loss": 2.397, + "step": 236 + }, + { + "epoch": 0.012714592274678112, + "grad_norm": 0.7506272792816162, + "learning_rate": 2.0714285714285717e-06, + "loss": 2.5601, + "step": 237 + }, + { + "epoch": 0.012768240343347639, + "grad_norm": 0.8178972601890564, + "learning_rate": 2.0803571428571433e-06, + "loss": 2.1411, + "step": 238 + }, + { + "epoch": 0.012821888412017167, + "grad_norm": 1.0078363418579102, + "learning_rate": 2.0892857142857145e-06, + "loss": 2.5521, + "step": 239 + }, + { + "epoch": 0.012875536480686695, + "grad_norm": 0.8121960759162903, + "learning_rate": 2.098214285714286e-06, + "loss": 2.5314, + "step": 240 + }, + { + "epoch": 0.012929184549356224, + "grad_norm": 0.8223477602005005, + "learning_rate": 2.1071428571428572e-06, + "loss": 2.3234, + "step": 241 + }, + { + "epoch": 0.012982832618025752, + "grad_norm": 0.6423619985580444, + "learning_rate": 2.116071428571429e-06, + "loss": 2.3738, + "step": 242 + }, + { + "epoch": 0.013036480686695278, + "grad_norm": 0.6700283885002136, + "learning_rate": 2.125e-06, + "loss": 2.3832, + "step": 243 + }, + { + "epoch": 0.013090128755364807, + "grad_norm": 0.6296292543411255, + "learning_rate": 2.1339285714285716e-06, + "loss": 2.359, + "step": 244 + }, + { + "epoch": 0.013143776824034335, + "grad_norm": 0.7367408871650696, + "learning_rate": 2.1428571428571427e-06, + "loss": 2.4386, + "step": 245 + }, + { + "epoch": 0.013197424892703863, + "grad_norm": 0.9185892939567566, + "learning_rate": 2.1517857142857147e-06, + "loss": 2.4242, + "step": 246 + }, + { + "epoch": 0.01325107296137339, + "grad_norm": 0.6843007206916809, + "learning_rate": 2.160714285714286e-06, + "loss": 2.0571, + "step": 247 + }, + { + "epoch": 0.013304721030042918, + "grad_norm": 0.9147751927375793, + "learning_rate": 2.1696428571428575e-06, + "loss": 1.8428, + "step": 248 + }, + { + "epoch": 0.013358369098712446, + "grad_norm": 1.2487143278121948, + "learning_rate": 2.1785714285714286e-06, + "loss": 2.3028, + "step": 249 + }, + { + "epoch": 0.013412017167381975, + "grad_norm": 0.7906087040901184, + "learning_rate": 2.1875000000000002e-06, + "loss": 2.4286, + "step": 250 + }, + { + "epoch": 0.013465665236051503, + "grad_norm": 0.8500818610191345, + "learning_rate": 2.1964285714285714e-06, + "loss": 2.5569, + "step": 251 + }, + { + "epoch": 0.01351931330472103, + "grad_norm": 0.697887659072876, + "learning_rate": 2.205357142857143e-06, + "loss": 2.4827, + "step": 252 + }, + { + "epoch": 0.013572961373390558, + "grad_norm": 0.7631711959838867, + "learning_rate": 2.2142857142857146e-06, + "loss": 2.4049, + "step": 253 + }, + { + "epoch": 0.013626609442060086, + "grad_norm": 0.6823691129684448, + "learning_rate": 2.2232142857142857e-06, + "loss": 2.2669, + "step": 254 + }, + { + "epoch": 0.013680257510729614, + "grad_norm": 0.68162602186203, + "learning_rate": 2.2321428571428573e-06, + "loss": 2.4426, + "step": 255 + }, + { + "epoch": 0.013733905579399141, + "grad_norm": 0.8350782990455627, + "learning_rate": 2.241071428571429e-06, + "loss": 2.5014, + "step": 256 + }, + { + "epoch": 0.01378755364806867, + "grad_norm": 0.6530932188034058, + "learning_rate": 2.25e-06, + "loss": 2.4346, + "step": 257 + }, + { + "epoch": 0.013841201716738197, + "grad_norm": 0.7705269455909729, + "learning_rate": 2.2589285714285716e-06, + "loss": 1.8935, + "step": 258 + }, + { + "epoch": 0.013894849785407726, + "grad_norm": 0.7379944920539856, + "learning_rate": 2.2678571428571432e-06, + "loss": 2.4101, + "step": 259 + }, + { + "epoch": 0.013948497854077254, + "grad_norm": 0.7072131633758545, + "learning_rate": 2.2767857142857144e-06, + "loss": 2.4687, + "step": 260 + }, + { + "epoch": 0.01400214592274678, + "grad_norm": 0.6634659767150879, + "learning_rate": 2.285714285714286e-06, + "loss": 2.3767, + "step": 261 + }, + { + "epoch": 0.014055793991416309, + "grad_norm": 0.9153283834457397, + "learning_rate": 2.294642857142857e-06, + "loss": 2.3362, + "step": 262 + }, + { + "epoch": 0.014109442060085837, + "grad_norm": 0.6105332970619202, + "learning_rate": 2.3035714285714287e-06, + "loss": 2.3203, + "step": 263 + }, + { + "epoch": 0.014163090128755365, + "grad_norm": 0.699830949306488, + "learning_rate": 2.3125000000000003e-06, + "loss": 2.4901, + "step": 264 + }, + { + "epoch": 0.014216738197424892, + "grad_norm": 0.7631047368049622, + "learning_rate": 2.321428571428572e-06, + "loss": 2.3894, + "step": 265 + }, + { + "epoch": 0.01427038626609442, + "grad_norm": 0.8344921469688416, + "learning_rate": 2.330357142857143e-06, + "loss": 2.5259, + "step": 266 + }, + { + "epoch": 0.014324034334763949, + "grad_norm": 0.7504924535751343, + "learning_rate": 2.3392857142857146e-06, + "loss": 2.6454, + "step": 267 + }, + { + "epoch": 0.014377682403433477, + "grad_norm": 0.7122287750244141, + "learning_rate": 2.348214285714286e-06, + "loss": 2.4852, + "step": 268 + }, + { + "epoch": 0.014431330472103005, + "grad_norm": 0.6561256647109985, + "learning_rate": 2.3571428571428574e-06, + "loss": 2.2046, + "step": 269 + }, + { + "epoch": 0.014484978540772532, + "grad_norm": 0.8987632393836975, + "learning_rate": 2.3660714285714285e-06, + "loss": 2.4976, + "step": 270 + }, + { + "epoch": 0.01453862660944206, + "grad_norm": 0.9650809168815613, + "learning_rate": 2.375e-06, + "loss": 1.9362, + "step": 271 + }, + { + "epoch": 0.014592274678111588, + "grad_norm": 0.6158315539360046, + "learning_rate": 2.3839285714285717e-06, + "loss": 2.0685, + "step": 272 + }, + { + "epoch": 0.014645922746781117, + "grad_norm": 0.6385294795036316, + "learning_rate": 2.3928571428571433e-06, + "loss": 2.462, + "step": 273 + }, + { + "epoch": 0.014699570815450643, + "grad_norm": 0.6893616318702698, + "learning_rate": 2.4017857142857145e-06, + "loss": 2.0943, + "step": 274 + }, + { + "epoch": 0.014753218884120171, + "grad_norm": 0.8767016530036926, + "learning_rate": 2.410714285714286e-06, + "loss": 2.3982, + "step": 275 + }, + { + "epoch": 0.0148068669527897, + "grad_norm": 0.7963337898254395, + "learning_rate": 2.419642857142857e-06, + "loss": 2.2558, + "step": 276 + }, + { + "epoch": 0.014860515021459228, + "grad_norm": 0.7312727570533752, + "learning_rate": 2.428571428571429e-06, + "loss": 2.4308, + "step": 277 + }, + { + "epoch": 0.014914163090128755, + "grad_norm": 0.8620723485946655, + "learning_rate": 2.4375e-06, + "loss": 2.3666, + "step": 278 + }, + { + "epoch": 0.014967811158798283, + "grad_norm": 0.7023136615753174, + "learning_rate": 2.4464285714285715e-06, + "loss": 2.6281, + "step": 279 + }, + { + "epoch": 0.015021459227467811, + "grad_norm": 0.724403440952301, + "learning_rate": 2.455357142857143e-06, + "loss": 2.5165, + "step": 280 + }, + { + "epoch": 0.01507510729613734, + "grad_norm": 0.6700321435928345, + "learning_rate": 2.4642857142857147e-06, + "loss": 2.0925, + "step": 281 + }, + { + "epoch": 0.015128755364806868, + "grad_norm": 0.8319527506828308, + "learning_rate": 2.473214285714286e-06, + "loss": 2.3042, + "step": 282 + }, + { + "epoch": 0.015182403433476394, + "grad_norm": 0.9960564374923706, + "learning_rate": 2.4821428571428575e-06, + "loss": 2.6683, + "step": 283 + }, + { + "epoch": 0.015236051502145923, + "grad_norm": 0.6869928240776062, + "learning_rate": 2.4910714285714286e-06, + "loss": 2.4533, + "step": 284 + }, + { + "epoch": 0.01528969957081545, + "grad_norm": 1.1115264892578125, + "learning_rate": 2.5e-06, + "loss": 2.4816, + "step": 285 + }, + { + "epoch": 0.015343347639484979, + "grad_norm": 5.782863616943359, + "learning_rate": 2.508928571428572e-06, + "loss": 2.5195, + "step": 286 + }, + { + "epoch": 0.015396995708154506, + "grad_norm": 0.8223388195037842, + "learning_rate": 2.517857142857143e-06, + "loss": 2.5119, + "step": 287 + }, + { + "epoch": 0.015450643776824034, + "grad_norm": 0.8225493431091309, + "learning_rate": 2.5267857142857145e-06, + "loss": 2.4015, + "step": 288 + }, + { + "epoch": 0.015504291845493562, + "grad_norm": 0.6869227290153503, + "learning_rate": 2.5357142857142857e-06, + "loss": 2.2017, + "step": 289 + }, + { + "epoch": 0.01555793991416309, + "grad_norm": 0.8585204482078552, + "learning_rate": 2.5446428571428573e-06, + "loss": 2.6262, + "step": 290 + }, + { + "epoch": 0.015611587982832619, + "grad_norm": 0.9561352133750916, + "learning_rate": 2.5535714285714284e-06, + "loss": 2.4293, + "step": 291 + }, + { + "epoch": 0.015665236051502145, + "grad_norm": 0.9420600533485413, + "learning_rate": 2.5625e-06, + "loss": 2.3199, + "step": 292 + }, + { + "epoch": 0.015718884120171674, + "grad_norm": 0.985571026802063, + "learning_rate": 2.571428571428571e-06, + "loss": 2.5742, + "step": 293 + }, + { + "epoch": 0.015772532188841202, + "grad_norm": 0.8489513397216797, + "learning_rate": 2.580357142857143e-06, + "loss": 2.1875, + "step": 294 + }, + { + "epoch": 0.01582618025751073, + "grad_norm": 0.7616540789604187, + "learning_rate": 2.5892857142857148e-06, + "loss": 2.0514, + "step": 295 + }, + { + "epoch": 0.01587982832618026, + "grad_norm": 0.7030709981918335, + "learning_rate": 2.598214285714286e-06, + "loss": 2.5914, + "step": 296 + }, + { + "epoch": 0.015933476394849787, + "grad_norm": 0.5586270689964294, + "learning_rate": 2.6071428571428575e-06, + "loss": 2.5231, + "step": 297 + }, + { + "epoch": 0.01598712446351931, + "grad_norm": 0.6314849853515625, + "learning_rate": 2.616071428571429e-06, + "loss": 2.1521, + "step": 298 + }, + { + "epoch": 0.01604077253218884, + "grad_norm": 2.3691253662109375, + "learning_rate": 2.6250000000000003e-06, + "loss": 1.6596, + "step": 299 + }, + { + "epoch": 0.016094420600858368, + "grad_norm": 0.9220977425575256, + "learning_rate": 2.633928571428572e-06, + "loss": 2.5476, + "step": 300 + }, + { + "epoch": 0.016148068669527896, + "grad_norm": 0.6786535382270813, + "learning_rate": 2.642857142857143e-06, + "loss": 1.7556, + "step": 301 + }, + { + "epoch": 0.016201716738197425, + "grad_norm": 0.7234790325164795, + "learning_rate": 2.6517857142857146e-06, + "loss": 2.6333, + "step": 302 + }, + { + "epoch": 0.016255364806866953, + "grad_norm": 0.9879781007766724, + "learning_rate": 2.6607142857142858e-06, + "loss": 2.3762, + "step": 303 + }, + { + "epoch": 0.01630901287553648, + "grad_norm": 0.8127362132072449, + "learning_rate": 2.6696428571428574e-06, + "loss": 2.4876, + "step": 304 + }, + { + "epoch": 0.01636266094420601, + "grad_norm": 0.8156941533088684, + "learning_rate": 2.6785714285714285e-06, + "loss": 2.3795, + "step": 305 + }, + { + "epoch": 0.016416309012875538, + "grad_norm": 0.5962357521057129, + "learning_rate": 2.6875e-06, + "loss": 2.3785, + "step": 306 + }, + { + "epoch": 0.016469957081545063, + "grad_norm": 0.7326784133911133, + "learning_rate": 2.6964285714285717e-06, + "loss": 2.352, + "step": 307 + }, + { + "epoch": 0.01652360515021459, + "grad_norm": 0.7983625531196594, + "learning_rate": 2.705357142857143e-06, + "loss": 2.3658, + "step": 308 + }, + { + "epoch": 0.01657725321888412, + "grad_norm": 1.134830355644226, + "learning_rate": 2.7142857142857144e-06, + "loss": 2.5088, + "step": 309 + }, + { + "epoch": 0.016630901287553648, + "grad_norm": 0.7062075734138489, + "learning_rate": 2.7232142857142856e-06, + "loss": 2.3007, + "step": 310 + }, + { + "epoch": 0.016684549356223176, + "grad_norm": 0.7864484786987305, + "learning_rate": 2.732142857142857e-06, + "loss": 2.1189, + "step": 311 + }, + { + "epoch": 0.016738197424892704, + "grad_norm": 0.9680244326591492, + "learning_rate": 2.741071428571429e-06, + "loss": 2.4341, + "step": 312 + }, + { + "epoch": 0.016791845493562232, + "grad_norm": 0.841587245464325, + "learning_rate": 2.7500000000000004e-06, + "loss": 2.6046, + "step": 313 + }, + { + "epoch": 0.01684549356223176, + "grad_norm": 0.7342112064361572, + "learning_rate": 2.758928571428572e-06, + "loss": 2.3676, + "step": 314 + }, + { + "epoch": 0.01689914163090129, + "grad_norm": 0.7875284552574158, + "learning_rate": 2.767857142857143e-06, + "loss": 2.6814, + "step": 315 + }, + { + "epoch": 0.016952789699570814, + "grad_norm": 0.8010267019271851, + "learning_rate": 2.7767857142857147e-06, + "loss": 2.3434, + "step": 316 + }, + { + "epoch": 0.017006437768240342, + "grad_norm": 0.7906016111373901, + "learning_rate": 2.785714285714286e-06, + "loss": 1.778, + "step": 317 + }, + { + "epoch": 0.01706008583690987, + "grad_norm": 0.8270081877708435, + "learning_rate": 2.7946428571428574e-06, + "loss": 2.476, + "step": 318 + }, + { + "epoch": 0.0171137339055794, + "grad_norm": 0.8687238097190857, + "learning_rate": 2.803571428571429e-06, + "loss": 2.5384, + "step": 319 + }, + { + "epoch": 0.017167381974248927, + "grad_norm": 0.7220746278762817, + "learning_rate": 2.8125e-06, + "loss": 2.3432, + "step": 320 + }, + { + "epoch": 0.017221030042918455, + "grad_norm": 0.8341050148010254, + "learning_rate": 2.8214285714285718e-06, + "loss": 2.5135, + "step": 321 + }, + { + "epoch": 0.017274678111587984, + "grad_norm": 0.8350667953491211, + "learning_rate": 2.830357142857143e-06, + "loss": 2.4291, + "step": 322 + }, + { + "epoch": 0.017328326180257512, + "grad_norm": 1.1586958169937134, + "learning_rate": 2.8392857142857145e-06, + "loss": 2.0637, + "step": 323 + }, + { + "epoch": 0.01738197424892704, + "grad_norm": 0.6313625574111938, + "learning_rate": 2.8482142857142857e-06, + "loss": 2.1824, + "step": 324 + }, + { + "epoch": 0.017435622317596565, + "grad_norm": 0.9125402569770813, + "learning_rate": 2.8571428571428573e-06, + "loss": 2.3423, + "step": 325 + }, + { + "epoch": 0.017489270386266093, + "grad_norm": 0.6458116173744202, + "learning_rate": 2.8660714285714284e-06, + "loss": 2.3292, + "step": 326 + }, + { + "epoch": 0.01754291845493562, + "grad_norm": 0.6161265969276428, + "learning_rate": 2.875e-06, + "loss": 2.0034, + "step": 327 + }, + { + "epoch": 0.01759656652360515, + "grad_norm": 3.8319132328033447, + "learning_rate": 2.8839285714285716e-06, + "loss": 2.276, + "step": 328 + }, + { + "epoch": 0.017650214592274678, + "grad_norm": 0.7079542875289917, + "learning_rate": 2.892857142857143e-06, + "loss": 2.0804, + "step": 329 + }, + { + "epoch": 0.017703862660944206, + "grad_norm": 1.677864909172058, + "learning_rate": 2.9017857142857148e-06, + "loss": 2.4735, + "step": 330 + }, + { + "epoch": 0.017757510729613735, + "grad_norm": 0.8613357543945312, + "learning_rate": 2.9107142857142863e-06, + "loss": 2.3187, + "step": 331 + }, + { + "epoch": 0.017811158798283263, + "grad_norm": 0.7053126096725464, + "learning_rate": 2.9196428571428575e-06, + "loss": 2.2529, + "step": 332 + }, + { + "epoch": 0.01786480686695279, + "grad_norm": 0.6988296508789062, + "learning_rate": 2.928571428571429e-06, + "loss": 2.3578, + "step": 333 + }, + { + "epoch": 0.017918454935622316, + "grad_norm": 0.7423781156539917, + "learning_rate": 2.9375000000000003e-06, + "loss": 2.2244, + "step": 334 + }, + { + "epoch": 0.017972103004291844, + "grad_norm": 0.6908703446388245, + "learning_rate": 2.946428571428572e-06, + "loss": 2.259, + "step": 335 + }, + { + "epoch": 0.018025751072961373, + "grad_norm": 0.8297222256660461, + "learning_rate": 2.955357142857143e-06, + "loss": 2.4594, + "step": 336 + }, + { + "epoch": 0.0180793991416309, + "grad_norm": 0.8613131642341614, + "learning_rate": 2.9642857142857146e-06, + "loss": 2.3921, + "step": 337 + }, + { + "epoch": 0.01813304721030043, + "grad_norm": 0.7114418148994446, + "learning_rate": 2.9732142857142857e-06, + "loss": 2.5854, + "step": 338 + }, + { + "epoch": 0.018186695278969957, + "grad_norm": 0.8707160949707031, + "learning_rate": 2.9821428571428573e-06, + "loss": 2.5622, + "step": 339 + }, + { + "epoch": 0.018240343347639486, + "grad_norm": 0.7687397599220276, + "learning_rate": 2.991071428571429e-06, + "loss": 2.3291, + "step": 340 + }, + { + "epoch": 0.018293991416309014, + "grad_norm": 0.8099189400672913, + "learning_rate": 3e-06, + "loss": 2.4883, + "step": 341 + }, + { + "epoch": 0.018347639484978542, + "grad_norm": 0.706312358379364, + "learning_rate": 3.0089285714285717e-06, + "loss": 2.2135, + "step": 342 + }, + { + "epoch": 0.018401287553648067, + "grad_norm": 0.6552898287773132, + "learning_rate": 3.017857142857143e-06, + "loss": 2.39, + "step": 343 + }, + { + "epoch": 0.018454935622317595, + "grad_norm": 0.6962871551513672, + "learning_rate": 3.0267857142857144e-06, + "loss": 2.4725, + "step": 344 + }, + { + "epoch": 0.018508583690987124, + "grad_norm": 0.9890510439872742, + "learning_rate": 3.0357142857142856e-06, + "loss": 2.5443, + "step": 345 + }, + { + "epoch": 0.018562231759656652, + "grad_norm": 0.8848845958709717, + "learning_rate": 3.044642857142857e-06, + "loss": 2.0176, + "step": 346 + }, + { + "epoch": 0.01861587982832618, + "grad_norm": 1.0035433769226074, + "learning_rate": 3.053571428571429e-06, + "loss": 2.2168, + "step": 347 + }, + { + "epoch": 0.01866952789699571, + "grad_norm": 0.9026788473129272, + "learning_rate": 3.0625000000000003e-06, + "loss": 2.5627, + "step": 348 + }, + { + "epoch": 0.018723175965665237, + "grad_norm": 0.9245330095291138, + "learning_rate": 3.071428571428572e-06, + "loss": 2.4319, + "step": 349 + }, + { + "epoch": 0.018776824034334765, + "grad_norm": 0.879179835319519, + "learning_rate": 3.080357142857143e-06, + "loss": 2.4178, + "step": 350 + }, + { + "epoch": 0.018830472103004293, + "grad_norm": 0.9510053396224976, + "learning_rate": 3.0892857142857147e-06, + "loss": 2.6461, + "step": 351 + }, + { + "epoch": 0.01888412017167382, + "grad_norm": 1.0390998125076294, + "learning_rate": 3.0982142857142862e-06, + "loss": 2.3377, + "step": 352 + }, + { + "epoch": 0.018937768240343347, + "grad_norm": 0.7219237089157104, + "learning_rate": 3.1071428571428574e-06, + "loss": 2.1921, + "step": 353 + }, + { + "epoch": 0.018991416309012875, + "grad_norm": 0.6656615138053894, + "learning_rate": 3.116071428571429e-06, + "loss": 2.2453, + "step": 354 + }, + { + "epoch": 0.019045064377682403, + "grad_norm": 0.7299067378044128, + "learning_rate": 3.125e-06, + "loss": 2.4842, + "step": 355 + }, + { + "epoch": 0.01909871244635193, + "grad_norm": 0.9124429225921631, + "learning_rate": 3.1339285714285717e-06, + "loss": 2.5188, + "step": 356 + }, + { + "epoch": 0.01915236051502146, + "grad_norm": 1.1448966264724731, + "learning_rate": 3.142857142857143e-06, + "loss": 2.4272, + "step": 357 + }, + { + "epoch": 0.019206008583690988, + "grad_norm": 0.8445751667022705, + "learning_rate": 3.1517857142857145e-06, + "loss": 2.5534, + "step": 358 + }, + { + "epoch": 0.019259656652360516, + "grad_norm": 0.7917039394378662, + "learning_rate": 3.1607142857142856e-06, + "loss": 2.3478, + "step": 359 + }, + { + "epoch": 0.019313304721030045, + "grad_norm": 1.1327301263809204, + "learning_rate": 3.1696428571428572e-06, + "loss": 2.4204, + "step": 360 + }, + { + "epoch": 0.01936695278969957, + "grad_norm": 0.7836429476737976, + "learning_rate": 3.178571428571429e-06, + "loss": 2.608, + "step": 361 + }, + { + "epoch": 0.019420600858369098, + "grad_norm": 0.7183333039283752, + "learning_rate": 3.1875e-06, + "loss": 2.2111, + "step": 362 + }, + { + "epoch": 0.019474248927038626, + "grad_norm": 0.9662913680076599, + "learning_rate": 3.1964285714285716e-06, + "loss": 2.4022, + "step": 363 + }, + { + "epoch": 0.019527896995708154, + "grad_norm": 0.8930501341819763, + "learning_rate": 3.2053571428571436e-06, + "loss": 2.6228, + "step": 364 + }, + { + "epoch": 0.019581545064377683, + "grad_norm": 0.7898169755935669, + "learning_rate": 3.2142857142857147e-06, + "loss": 2.3246, + "step": 365 + }, + { + "epoch": 0.01963519313304721, + "grad_norm": 0.9395961165428162, + "learning_rate": 3.2232142857142863e-06, + "loss": 2.322, + "step": 366 + }, + { + "epoch": 0.01968884120171674, + "grad_norm": 0.6955658793449402, + "learning_rate": 3.2321428571428575e-06, + "loss": 2.6313, + "step": 367 + }, + { + "epoch": 0.019742489270386267, + "grad_norm": 0.7964416742324829, + "learning_rate": 3.241071428571429e-06, + "loss": 2.4888, + "step": 368 + }, + { + "epoch": 0.019796137339055796, + "grad_norm": 1.114831805229187, + "learning_rate": 3.2500000000000002e-06, + "loss": 2.3438, + "step": 369 + }, + { + "epoch": 0.01984978540772532, + "grad_norm": 0.8184826374053955, + "learning_rate": 3.258928571428572e-06, + "loss": 2.5248, + "step": 370 + }, + { + "epoch": 0.01990343347639485, + "grad_norm": 0.7276002168655396, + "learning_rate": 3.267857142857143e-06, + "loss": 2.5176, + "step": 371 + }, + { + "epoch": 0.019957081545064377, + "grad_norm": 1.347496747970581, + "learning_rate": 3.2767857142857146e-06, + "loss": 2.3598, + "step": 372 + }, + { + "epoch": 0.020010729613733905, + "grad_norm": 0.754483163356781, + "learning_rate": 3.285714285714286e-06, + "loss": 2.4211, + "step": 373 + }, + { + "epoch": 0.020064377682403434, + "grad_norm": 0.7603384852409363, + "learning_rate": 3.2946428571428573e-06, + "loss": 2.4043, + "step": 374 + }, + { + "epoch": 0.020118025751072962, + "grad_norm": 0.6446270942687988, + "learning_rate": 3.303571428571429e-06, + "loss": 2.5342, + "step": 375 + }, + { + "epoch": 0.02017167381974249, + "grad_norm": 0.7562928795814514, + "learning_rate": 3.3125e-06, + "loss": 2.3929, + "step": 376 + }, + { + "epoch": 0.02022532188841202, + "grad_norm": 0.8165103197097778, + "learning_rate": 3.3214285714285716e-06, + "loss": 2.5336, + "step": 377 + }, + { + "epoch": 0.020278969957081543, + "grad_norm": 0.7981828451156616, + "learning_rate": 3.330357142857143e-06, + "loss": 2.3266, + "step": 378 + }, + { + "epoch": 0.02033261802575107, + "grad_norm": 0.7669143676757812, + "learning_rate": 3.3392857142857144e-06, + "loss": 2.1537, + "step": 379 + }, + { + "epoch": 0.0203862660944206, + "grad_norm": 0.7440754771232605, + "learning_rate": 3.3482142857142855e-06, + "loss": 2.5524, + "step": 380 + }, + { + "epoch": 0.020439914163090128, + "grad_norm": 3.4954633712768555, + "learning_rate": 3.357142857142857e-06, + "loss": 1.7568, + "step": 381 + }, + { + "epoch": 0.020493562231759656, + "grad_norm": 2.1664199829101562, + "learning_rate": 3.366071428571429e-06, + "loss": 2.4094, + "step": 382 + }, + { + "epoch": 0.020547210300429185, + "grad_norm": 0.6689859628677368, + "learning_rate": 3.3750000000000003e-06, + "loss": 2.5279, + "step": 383 + }, + { + "epoch": 0.020600858369098713, + "grad_norm": 0.9399182200431824, + "learning_rate": 3.383928571428572e-06, + "loss": 2.3638, + "step": 384 + }, + { + "epoch": 0.02065450643776824, + "grad_norm": 0.7546661496162415, + "learning_rate": 3.3928571428571435e-06, + "loss": 2.3668, + "step": 385 + }, + { + "epoch": 0.02070815450643777, + "grad_norm": 1.46944260597229, + "learning_rate": 3.4017857142857146e-06, + "loss": 2.5717, + "step": 386 + }, + { + "epoch": 0.020761802575107294, + "grad_norm": 0.7063437104225159, + "learning_rate": 3.4107142857142862e-06, + "loss": 2.3311, + "step": 387 + }, + { + "epoch": 0.020815450643776823, + "grad_norm": 0.6529530882835388, + "learning_rate": 3.4196428571428574e-06, + "loss": 2.5047, + "step": 388 + }, + { + "epoch": 0.02086909871244635, + "grad_norm": 1.0409495830535889, + "learning_rate": 3.428571428571429e-06, + "loss": 2.5642, + "step": 389 + }, + { + "epoch": 0.02092274678111588, + "grad_norm": 0.7223802208900452, + "learning_rate": 3.4375e-06, + "loss": 2.1276, + "step": 390 + }, + { + "epoch": 0.020976394849785408, + "grad_norm": 1.925168752670288, + "learning_rate": 3.4464285714285717e-06, + "loss": 2.4447, + "step": 391 + }, + { + "epoch": 0.021030042918454936, + "grad_norm": 0.7264248132705688, + "learning_rate": 3.455357142857143e-06, + "loss": 2.5937, + "step": 392 + }, + { + "epoch": 0.021083690987124464, + "grad_norm": 0.8342792987823486, + "learning_rate": 3.4642857142857145e-06, + "loss": 2.694, + "step": 393 + }, + { + "epoch": 0.021137339055793992, + "grad_norm": 0.736733078956604, + "learning_rate": 3.473214285714286e-06, + "loss": 2.3066, + "step": 394 + }, + { + "epoch": 0.02119098712446352, + "grad_norm": 0.8179200887680054, + "learning_rate": 3.482142857142857e-06, + "loss": 2.839, + "step": 395 + }, + { + "epoch": 0.021244635193133046, + "grad_norm": 5.501429557800293, + "learning_rate": 3.4910714285714288e-06, + "loss": 2.5711, + "step": 396 + }, + { + "epoch": 0.021298283261802574, + "grad_norm": 0.892386257648468, + "learning_rate": 3.5e-06, + "loss": 2.3015, + "step": 397 + }, + { + "epoch": 0.021351931330472102, + "grad_norm": 0.9194245338439941, + "learning_rate": 3.5089285714285715e-06, + "loss": 2.4064, + "step": 398 + }, + { + "epoch": 0.02140557939914163, + "grad_norm": 1.5715312957763672, + "learning_rate": 3.5178571428571435e-06, + "loss": 1.5237, + "step": 399 + }, + { + "epoch": 0.02145922746781116, + "grad_norm": 0.933725893497467, + "learning_rate": 3.5267857142857147e-06, + "loss": 2.3794, + "step": 400 + }, + { + "epoch": 0.021512875536480687, + "grad_norm": 0.7138448357582092, + "learning_rate": 3.5357142857142863e-06, + "loss": 2.5775, + "step": 401 + }, + { + "epoch": 0.021566523605150215, + "grad_norm": 1.017191767692566, + "learning_rate": 3.5446428571428574e-06, + "loss": 2.5537, + "step": 402 + }, + { + "epoch": 0.021620171673819744, + "grad_norm": 0.8227158784866333, + "learning_rate": 3.553571428571429e-06, + "loss": 2.3265, + "step": 403 + }, + { + "epoch": 0.021673819742489272, + "grad_norm": 1.0108470916748047, + "learning_rate": 3.5625e-06, + "loss": 1.9004, + "step": 404 + }, + { + "epoch": 0.021727467811158797, + "grad_norm": 0.8835887312889099, + "learning_rate": 3.5714285714285718e-06, + "loss": 2.4506, + "step": 405 + }, + { + "epoch": 0.021781115879828325, + "grad_norm": 3.7320992946624756, + "learning_rate": 3.5803571428571434e-06, + "loss": 2.4304, + "step": 406 + }, + { + "epoch": 0.021834763948497853, + "grad_norm": 0.8696883916854858, + "learning_rate": 3.5892857142857145e-06, + "loss": 1.7832, + "step": 407 + }, + { + "epoch": 0.02188841201716738, + "grad_norm": 0.7369847893714905, + "learning_rate": 3.598214285714286e-06, + "loss": 2.2157, + "step": 408 + }, + { + "epoch": 0.02194206008583691, + "grad_norm": 0.7057186961174011, + "learning_rate": 3.6071428571428573e-06, + "loss": 2.1928, + "step": 409 + }, + { + "epoch": 0.021995708154506438, + "grad_norm": 0.6958415508270264, + "learning_rate": 3.616071428571429e-06, + "loss": 2.3975, + "step": 410 + }, + { + "epoch": 0.022049356223175966, + "grad_norm": 0.7458484768867493, + "learning_rate": 3.625e-06, + "loss": 2.4592, + "step": 411 + }, + { + "epoch": 0.022103004291845495, + "grad_norm": 0.6370704770088196, + "learning_rate": 3.6339285714285716e-06, + "loss": 2.4053, + "step": 412 + }, + { + "epoch": 0.022156652360515023, + "grad_norm": 0.7654312252998352, + "learning_rate": 3.642857142857143e-06, + "loss": 2.1924, + "step": 413 + }, + { + "epoch": 0.022210300429184548, + "grad_norm": 0.8381941914558411, + "learning_rate": 3.6517857142857144e-06, + "loss": 2.3798, + "step": 414 + }, + { + "epoch": 0.022263948497854076, + "grad_norm": 0.7348666191101074, + "learning_rate": 3.660714285714286e-06, + "loss": 2.2003, + "step": 415 + }, + { + "epoch": 0.022317596566523604, + "grad_norm": 0.8701876401901245, + "learning_rate": 3.669642857142857e-06, + "loss": 2.5562, + "step": 416 + }, + { + "epoch": 0.022371244635193133, + "grad_norm": 1.1198087930679321, + "learning_rate": 3.678571428571429e-06, + "loss": 1.6158, + "step": 417 + }, + { + "epoch": 0.02242489270386266, + "grad_norm": 0.6843016743659973, + "learning_rate": 3.6875000000000007e-06, + "loss": 2.4437, + "step": 418 + }, + { + "epoch": 0.02247854077253219, + "grad_norm": 0.6474888324737549, + "learning_rate": 3.696428571428572e-06, + "loss": 2.1288, + "step": 419 + }, + { + "epoch": 0.022532188841201718, + "grad_norm": 0.9589895606040955, + "learning_rate": 3.7053571428571434e-06, + "loss": 1.8437, + "step": 420 + }, + { + "epoch": 0.022585836909871246, + "grad_norm": 0.8810008764266968, + "learning_rate": 3.7142857142857146e-06, + "loss": 2.0788, + "step": 421 + }, + { + "epoch": 0.022639484978540774, + "grad_norm": 0.7050797939300537, + "learning_rate": 3.723214285714286e-06, + "loss": 2.4073, + "step": 422 + }, + { + "epoch": 0.0226931330472103, + "grad_norm": 0.8251096606254578, + "learning_rate": 3.7321428571428573e-06, + "loss": 2.2869, + "step": 423 + }, + { + "epoch": 0.022746781115879827, + "grad_norm": 1.02393639087677, + "learning_rate": 3.741071428571429e-06, + "loss": 2.5954, + "step": 424 + }, + { + "epoch": 0.022800429184549355, + "grad_norm": 0.8207212090492249, + "learning_rate": 3.7500000000000005e-06, + "loss": 2.2162, + "step": 425 + }, + { + "epoch": 0.022854077253218884, + "grad_norm": 0.7464378476142883, + "learning_rate": 3.7589285714285717e-06, + "loss": 2.0438, + "step": 426 + }, + { + "epoch": 0.022907725321888412, + "grad_norm": 0.8199927806854248, + "learning_rate": 3.7678571428571433e-06, + "loss": 2.3657, + "step": 427 + }, + { + "epoch": 0.02296137339055794, + "grad_norm": 0.931246817111969, + "learning_rate": 3.7767857142857144e-06, + "loss": 2.4942, + "step": 428 + }, + { + "epoch": 0.02301502145922747, + "grad_norm": 1.1280642747879028, + "learning_rate": 3.785714285714286e-06, + "loss": 2.0957, + "step": 429 + }, + { + "epoch": 0.023068669527896997, + "grad_norm": 0.7328938841819763, + "learning_rate": 3.794642857142857e-06, + "loss": 2.3172, + "step": 430 + }, + { + "epoch": 0.023122317596566525, + "grad_norm": 0.911868691444397, + "learning_rate": 3.8035714285714288e-06, + "loss": 2.134, + "step": 431 + }, + { + "epoch": 0.02317596566523605, + "grad_norm": 0.6746270060539246, + "learning_rate": 3.8125e-06, + "loss": 2.5609, + "step": 432 + }, + { + "epoch": 0.02322961373390558, + "grad_norm": 3.7394657135009766, + "learning_rate": 3.8214285714285715e-06, + "loss": 2.2045, + "step": 433 + }, + { + "epoch": 0.023283261802575107, + "grad_norm": 0.9176698327064514, + "learning_rate": 3.8303571428571435e-06, + "loss": 2.4058, + "step": 434 + }, + { + "epoch": 0.023336909871244635, + "grad_norm": 1.428479552268982, + "learning_rate": 3.839285714285715e-06, + "loss": 2.5748, + "step": 435 + }, + { + "epoch": 0.023390557939914163, + "grad_norm": 0.7161766886711121, + "learning_rate": 3.848214285714286e-06, + "loss": 2.2265, + "step": 436 + }, + { + "epoch": 0.02344420600858369, + "grad_norm": 0.7266635894775391, + "learning_rate": 3.857142857142858e-06, + "loss": 2.3507, + "step": 437 + }, + { + "epoch": 0.02349785407725322, + "grad_norm": 0.7059489488601685, + "learning_rate": 3.866071428571429e-06, + "loss": 2.5323, + "step": 438 + }, + { + "epoch": 0.023551502145922748, + "grad_norm": 0.7215156555175781, + "learning_rate": 3.875e-06, + "loss": 2.5228, + "step": 439 + }, + { + "epoch": 0.023605150214592276, + "grad_norm": 0.7039859890937805, + "learning_rate": 3.883928571428572e-06, + "loss": 2.341, + "step": 440 + }, + { + "epoch": 0.0236587982832618, + "grad_norm": 0.6339973211288452, + "learning_rate": 3.892857142857143e-06, + "loss": 2.1507, + "step": 441 + }, + { + "epoch": 0.02371244635193133, + "grad_norm": 0.840636134147644, + "learning_rate": 3.9017857142857145e-06, + "loss": 2.4195, + "step": 442 + }, + { + "epoch": 0.023766094420600858, + "grad_norm": 0.814447283744812, + "learning_rate": 3.910714285714286e-06, + "loss": 2.1937, + "step": 443 + }, + { + "epoch": 0.023819742489270386, + "grad_norm": 1.3763039112091064, + "learning_rate": 3.919642857142858e-06, + "loss": 2.4544, + "step": 444 + }, + { + "epoch": 0.023873390557939914, + "grad_norm": 0.7267353534698486, + "learning_rate": 3.928571428571429e-06, + "loss": 2.2495, + "step": 445 + }, + { + "epoch": 0.023927038626609443, + "grad_norm": 0.854361891746521, + "learning_rate": 3.9375e-06, + "loss": 2.3906, + "step": 446 + }, + { + "epoch": 0.02398068669527897, + "grad_norm": 0.760546863079071, + "learning_rate": 3.946428571428572e-06, + "loss": 2.6315, + "step": 447 + }, + { + "epoch": 0.0240343347639485, + "grad_norm": 1.6090713739395142, + "learning_rate": 3.955357142857143e-06, + "loss": 2.3172, + "step": 448 + }, + { + "epoch": 0.024087982832618027, + "grad_norm": 0.6143981218338013, + "learning_rate": 3.964285714285714e-06, + "loss": 2.3899, + "step": 449 + }, + { + "epoch": 0.024141630901287552, + "grad_norm": 0.8334725499153137, + "learning_rate": 3.9732142857142855e-06, + "loss": 2.4754, + "step": 450 + }, + { + "epoch": 0.02419527896995708, + "grad_norm": 0.8154469728469849, + "learning_rate": 3.9821428571428575e-06, + "loss": 2.2967, + "step": 451 + }, + { + "epoch": 0.02424892703862661, + "grad_norm": 2.8558404445648193, + "learning_rate": 3.9910714285714295e-06, + "loss": 2.3997, + "step": 452 + }, + { + "epoch": 0.024302575107296137, + "grad_norm": 0.8113605380058289, + "learning_rate": 4.000000000000001e-06, + "loss": 2.3864, + "step": 453 + }, + { + "epoch": 0.024356223175965665, + "grad_norm": 0.7968435883522034, + "learning_rate": 4.008928571428572e-06, + "loss": 2.2888, + "step": 454 + }, + { + "epoch": 0.024409871244635194, + "grad_norm": 0.8200784921646118, + "learning_rate": 4.017857142857143e-06, + "loss": 2.6836, + "step": 455 + }, + { + "epoch": 0.024463519313304722, + "grad_norm": 0.7798553705215454, + "learning_rate": 4.026785714285715e-06, + "loss": 2.327, + "step": 456 + }, + { + "epoch": 0.02451716738197425, + "grad_norm": 1.0361378192901611, + "learning_rate": 4.035714285714286e-06, + "loss": 2.7211, + "step": 457 + }, + { + "epoch": 0.024570815450643775, + "grad_norm": 0.8387232422828674, + "learning_rate": 4.044642857142857e-06, + "loss": 2.3965, + "step": 458 + }, + { + "epoch": 0.024624463519313303, + "grad_norm": 0.9912132024765015, + "learning_rate": 4.053571428571429e-06, + "loss": 2.3776, + "step": 459 + }, + { + "epoch": 0.02467811158798283, + "grad_norm": 0.8059343695640564, + "learning_rate": 4.0625000000000005e-06, + "loss": 2.2162, + "step": 460 + }, + { + "epoch": 0.02473175965665236, + "grad_norm": 1.1003913879394531, + "learning_rate": 4.071428571428572e-06, + "loss": 2.2931, + "step": 461 + }, + { + "epoch": 0.024785407725321888, + "grad_norm": 0.7562921643257141, + "learning_rate": 4.080357142857143e-06, + "loss": 2.202, + "step": 462 + }, + { + "epoch": 0.024839055793991417, + "grad_norm": 2.8305306434631348, + "learning_rate": 4.089285714285715e-06, + "loss": 2.3429, + "step": 463 + }, + { + "epoch": 0.024892703862660945, + "grad_norm": 1.0779688358306885, + "learning_rate": 4.098214285714286e-06, + "loss": 2.4483, + "step": 464 + }, + { + "epoch": 0.024946351931330473, + "grad_norm": 0.817470908164978, + "learning_rate": 4.107142857142857e-06, + "loss": 2.4476, + "step": 465 + }, + { + "epoch": 0.025, + "grad_norm": 0.740354597568512, + "learning_rate": 4.116071428571428e-06, + "loss": 2.4089, + "step": 466 + }, + { + "epoch": 0.025053648068669526, + "grad_norm": 0.7255138754844666, + "learning_rate": 4.125e-06, + "loss": 2.2781, + "step": 467 + }, + { + "epoch": 0.025107296137339054, + "grad_norm": 1.036693811416626, + "learning_rate": 4.1339285714285715e-06, + "loss": 2.4642, + "step": 468 + }, + { + "epoch": 0.025160944206008583, + "grad_norm": 0.7903293967247009, + "learning_rate": 4.1428571428571435e-06, + "loss": 2.4157, + "step": 469 + }, + { + "epoch": 0.02521459227467811, + "grad_norm": 1.0276495218276978, + "learning_rate": 4.151785714285715e-06, + "loss": 2.4765, + "step": 470 + }, + { + "epoch": 0.02526824034334764, + "grad_norm": 0.7276985049247742, + "learning_rate": 4.160714285714287e-06, + "loss": 2.123, + "step": 471 + }, + { + "epoch": 0.025321888412017168, + "grad_norm": 1.0035864114761353, + "learning_rate": 4.169642857142858e-06, + "loss": 2.4313, + "step": 472 + }, + { + "epoch": 0.025375536480686696, + "grad_norm": 0.7579031586647034, + "learning_rate": 4.178571428571429e-06, + "loss": 2.3478, + "step": 473 + }, + { + "epoch": 0.025429184549356224, + "grad_norm": 1.0906922817230225, + "learning_rate": 4.1875e-06, + "loss": 2.4483, + "step": 474 + }, + { + "epoch": 0.025482832618025753, + "grad_norm": 0.9213836193084717, + "learning_rate": 4.196428571428572e-06, + "loss": 2.4349, + "step": 475 + }, + { + "epoch": 0.025536480686695277, + "grad_norm": 0.8189167380332947, + "learning_rate": 4.205357142857143e-06, + "loss": 2.3574, + "step": 476 + }, + { + "epoch": 0.025590128755364806, + "grad_norm": 0.976218044757843, + "learning_rate": 4.2142857142857145e-06, + "loss": 2.5658, + "step": 477 + }, + { + "epoch": 0.025643776824034334, + "grad_norm": 0.7440518140792847, + "learning_rate": 4.223214285714286e-06, + "loss": 1.7217, + "step": 478 + }, + { + "epoch": 0.025697424892703862, + "grad_norm": 0.7372002601623535, + "learning_rate": 4.232142857142858e-06, + "loss": 2.279, + "step": 479 + }, + { + "epoch": 0.02575107296137339, + "grad_norm": 0.7228379845619202, + "learning_rate": 4.241071428571429e-06, + "loss": 2.4355, + "step": 480 + }, + { + "epoch": 0.02580472103004292, + "grad_norm": 0.9598197340965271, + "learning_rate": 4.25e-06, + "loss": 2.4328, + "step": 481 + }, + { + "epoch": 0.025858369098712447, + "grad_norm": 0.7715316414833069, + "learning_rate": 4.258928571428572e-06, + "loss": 2.3871, + "step": 482 + }, + { + "epoch": 0.025912017167381975, + "grad_norm": 0.746380627155304, + "learning_rate": 4.267857142857143e-06, + "loss": 2.3437, + "step": 483 + }, + { + "epoch": 0.025965665236051504, + "grad_norm": 0.6639009714126587, + "learning_rate": 4.276785714285714e-06, + "loss": 2.0351, + "step": 484 + }, + { + "epoch": 0.02601931330472103, + "grad_norm": 0.8016340732574463, + "learning_rate": 4.2857142857142855e-06, + "loss": 2.4322, + "step": 485 + }, + { + "epoch": 0.026072961373390557, + "grad_norm": 0.9691901803016663, + "learning_rate": 4.2946428571428575e-06, + "loss": 2.4124, + "step": 486 + }, + { + "epoch": 0.026126609442060085, + "grad_norm": 0.9508801698684692, + "learning_rate": 4.3035714285714295e-06, + "loss": 2.5039, + "step": 487 + }, + { + "epoch": 0.026180257510729613, + "grad_norm": 0.7650073766708374, + "learning_rate": 4.312500000000001e-06, + "loss": 2.01, + "step": 488 + }, + { + "epoch": 0.02623390557939914, + "grad_norm": 0.992106556892395, + "learning_rate": 4.321428571428572e-06, + "loss": 2.233, + "step": 489 + }, + { + "epoch": 0.02628755364806867, + "grad_norm": 0.7837042808532715, + "learning_rate": 4.330357142857143e-06, + "loss": 2.2642, + "step": 490 + }, + { + "epoch": 0.026341201716738198, + "grad_norm": 0.7892453074455261, + "learning_rate": 4.339285714285715e-06, + "loss": 2.6229, + "step": 491 + }, + { + "epoch": 0.026394849785407726, + "grad_norm": 0.9260952472686768, + "learning_rate": 4.348214285714286e-06, + "loss": 2.1562, + "step": 492 + }, + { + "epoch": 0.026448497854077255, + "grad_norm": 0.8462100625038147, + "learning_rate": 4.357142857142857e-06, + "loss": 2.2956, + "step": 493 + }, + { + "epoch": 0.02650214592274678, + "grad_norm": 0.7706511616706848, + "learning_rate": 4.366071428571429e-06, + "loss": 2.4379, + "step": 494 + }, + { + "epoch": 0.026555793991416308, + "grad_norm": 0.8740862607955933, + "learning_rate": 4.3750000000000005e-06, + "loss": 2.6446, + "step": 495 + }, + { + "epoch": 0.026609442060085836, + "grad_norm": 1.0557152032852173, + "learning_rate": 4.383928571428572e-06, + "loss": 2.5206, + "step": 496 + }, + { + "epoch": 0.026663090128755364, + "grad_norm": 0.756197988986969, + "learning_rate": 4.392857142857143e-06, + "loss": 2.6486, + "step": 497 + }, + { + "epoch": 0.026716738197424893, + "grad_norm": 0.9108513593673706, + "learning_rate": 4.401785714285715e-06, + "loss": 2.217, + "step": 498 + }, + { + "epoch": 0.02677038626609442, + "grad_norm": 0.7217069268226624, + "learning_rate": 4.410714285714286e-06, + "loss": 2.3605, + "step": 499 + }, + { + "epoch": 0.02682403433476395, + "grad_norm": 0.7229094505310059, + "learning_rate": 4.419642857142857e-06, + "loss": 2.3754, + "step": 500 + }, + { + "epoch": 0.026877682403433478, + "grad_norm": 0.7486718893051147, + "learning_rate": 4.428571428571429e-06, + "loss": 2.5013, + "step": 501 + }, + { + "epoch": 0.026931330472103006, + "grad_norm": 0.7304054498672485, + "learning_rate": 4.4375e-06, + "loss": 2.3725, + "step": 502 + }, + { + "epoch": 0.02698497854077253, + "grad_norm": 0.8499715328216553, + "learning_rate": 4.4464285714285715e-06, + "loss": 2.5746, + "step": 503 + }, + { + "epoch": 0.02703862660944206, + "grad_norm": 0.9253300428390503, + "learning_rate": 4.4553571428571435e-06, + "loss": 2.4701, + "step": 504 + }, + { + "epoch": 0.027092274678111587, + "grad_norm": 1.1182117462158203, + "learning_rate": 4.464285714285715e-06, + "loss": 2.543, + "step": 505 + }, + { + "epoch": 0.027145922746781116, + "grad_norm": 0.9392046332359314, + "learning_rate": 4.473214285714287e-06, + "loss": 2.3858, + "step": 506 + }, + { + "epoch": 0.027199570815450644, + "grad_norm": 0.8048560619354248, + "learning_rate": 4.482142857142858e-06, + "loss": 2.3558, + "step": 507 + }, + { + "epoch": 0.027253218884120172, + "grad_norm": 0.8537608981132507, + "learning_rate": 4.491071428571429e-06, + "loss": 2.5773, + "step": 508 + }, + { + "epoch": 0.0273068669527897, + "grad_norm": 0.9082659482955933, + "learning_rate": 4.5e-06, + "loss": 2.0719, + "step": 509 + }, + { + "epoch": 0.02736051502145923, + "grad_norm": 0.8176069259643555, + "learning_rate": 4.508928571428572e-06, + "loss": 2.4638, + "step": 510 + }, + { + "epoch": 0.027414163090128757, + "grad_norm": 0.828382670879364, + "learning_rate": 4.517857142857143e-06, + "loss": 2.1148, + "step": 511 + }, + { + "epoch": 0.027467811158798282, + "grad_norm": 0.7014255523681641, + "learning_rate": 4.5267857142857144e-06, + "loss": 1.8971, + "step": 512 + }, + { + "epoch": 0.02752145922746781, + "grad_norm": 0.8343459963798523, + "learning_rate": 4.5357142857142865e-06, + "loss": 2.4437, + "step": 513 + }, + { + "epoch": 0.02757510729613734, + "grad_norm": 0.9934781193733215, + "learning_rate": 4.544642857142858e-06, + "loss": 2.4134, + "step": 514 + }, + { + "epoch": 0.027628755364806867, + "grad_norm": 0.8685812950134277, + "learning_rate": 4.553571428571429e-06, + "loss": 2.3418, + "step": 515 + }, + { + "epoch": 0.027682403433476395, + "grad_norm": 0.7411813735961914, + "learning_rate": 4.5625e-06, + "loss": 2.3009, + "step": 516 + }, + { + "epoch": 0.027736051502145923, + "grad_norm": 0.8920450806617737, + "learning_rate": 4.571428571428572e-06, + "loss": 2.3818, + "step": 517 + }, + { + "epoch": 0.02778969957081545, + "grad_norm": 0.9877768754959106, + "learning_rate": 4.580357142857143e-06, + "loss": 1.8392, + "step": 518 + }, + { + "epoch": 0.02784334763948498, + "grad_norm": 0.7175792455673218, + "learning_rate": 4.589285714285714e-06, + "loss": 2.2744, + "step": 519 + }, + { + "epoch": 0.027896995708154508, + "grad_norm": 0.8334026336669922, + "learning_rate": 4.5982142857142854e-06, + "loss": 2.3306, + "step": 520 + }, + { + "epoch": 0.027950643776824033, + "grad_norm": 0.840488612651825, + "learning_rate": 4.6071428571428574e-06, + "loss": 2.5156, + "step": 521 + }, + { + "epoch": 0.02800429184549356, + "grad_norm": 0.8102824687957764, + "learning_rate": 4.6160714285714294e-06, + "loss": 2.3334, + "step": 522 + }, + { + "epoch": 0.02805793991416309, + "grad_norm": 0.8264137506484985, + "learning_rate": 4.625000000000001e-06, + "loss": 2.4241, + "step": 523 + }, + { + "epoch": 0.028111587982832618, + "grad_norm": 1.9726167917251587, + "learning_rate": 4.633928571428572e-06, + "loss": 2.421, + "step": 524 + }, + { + "epoch": 0.028165236051502146, + "grad_norm": 0.9991806745529175, + "learning_rate": 4.642857142857144e-06, + "loss": 2.4648, + "step": 525 + }, + { + "epoch": 0.028218884120171674, + "grad_norm": 0.8271647691726685, + "learning_rate": 4.651785714285715e-06, + "loss": 2.3448, + "step": 526 + }, + { + "epoch": 0.028272532188841203, + "grad_norm": 0.7760920524597168, + "learning_rate": 4.660714285714286e-06, + "loss": 2.0153, + "step": 527 + }, + { + "epoch": 0.02832618025751073, + "grad_norm": 0.9586369395256042, + "learning_rate": 4.669642857142857e-06, + "loss": 2.2707, + "step": 528 + }, + { + "epoch": 0.02837982832618026, + "grad_norm": 1.0405757427215576, + "learning_rate": 4.678571428571429e-06, + "loss": 2.3195, + "step": 529 + }, + { + "epoch": 0.028433476394849784, + "grad_norm": 1.0947896242141724, + "learning_rate": 4.6875000000000004e-06, + "loss": 2.0498, + "step": 530 + }, + { + "epoch": 0.028487124463519312, + "grad_norm": 0.808911144733429, + "learning_rate": 4.696428571428572e-06, + "loss": 2.2677, + "step": 531 + }, + { + "epoch": 0.02854077253218884, + "grad_norm": 1.011614203453064, + "learning_rate": 4.705357142857143e-06, + "loss": 2.4923, + "step": 532 + }, + { + "epoch": 0.02859442060085837, + "grad_norm": 0.8231817483901978, + "learning_rate": 4.714285714285715e-06, + "loss": 2.5659, + "step": 533 + }, + { + "epoch": 0.028648068669527897, + "grad_norm": 0.7618065476417542, + "learning_rate": 4.723214285714286e-06, + "loss": 2.2326, + "step": 534 + }, + { + "epoch": 0.028701716738197425, + "grad_norm": 0.9810335636138916, + "learning_rate": 4.732142857142857e-06, + "loss": 1.5702, + "step": 535 + }, + { + "epoch": 0.028755364806866954, + "grad_norm": 0.9005271792411804, + "learning_rate": 4.741071428571429e-06, + "loss": 2.4064, + "step": 536 + }, + { + "epoch": 0.028809012875536482, + "grad_norm": 1.0362120866775513, + "learning_rate": 4.75e-06, + "loss": 2.4831, + "step": 537 + }, + { + "epoch": 0.02886266094420601, + "grad_norm": 1.1122186183929443, + "learning_rate": 4.758928571428571e-06, + "loss": 2.3297, + "step": 538 + }, + { + "epoch": 0.028916309012875535, + "grad_norm": 0.9291356801986694, + "learning_rate": 4.7678571428571434e-06, + "loss": 2.3028, + "step": 539 + }, + { + "epoch": 0.028969957081545063, + "grad_norm": 0.9159222841262817, + "learning_rate": 4.776785714285715e-06, + "loss": 2.3823, + "step": 540 + }, + { + "epoch": 0.02902360515021459, + "grad_norm": 0.7951038479804993, + "learning_rate": 4.785714285714287e-06, + "loss": 2.3624, + "step": 541 + }, + { + "epoch": 0.02907725321888412, + "grad_norm": 0.9676268100738525, + "learning_rate": 4.794642857142858e-06, + "loss": 2.4777, + "step": 542 + }, + { + "epoch": 0.02913090128755365, + "grad_norm": 0.8129324316978455, + "learning_rate": 4.803571428571429e-06, + "loss": 2.4551, + "step": 543 + }, + { + "epoch": 0.029184549356223177, + "grad_norm": 2.837127447128296, + "learning_rate": 4.8125e-06, + "loss": 2.2939, + "step": 544 + }, + { + "epoch": 0.029238197424892705, + "grad_norm": 0.7300359010696411, + "learning_rate": 4.821428571428572e-06, + "loss": 2.2, + "step": 545 + }, + { + "epoch": 0.029291845493562233, + "grad_norm": 0.7897707223892212, + "learning_rate": 4.830357142857143e-06, + "loss": 2.3829, + "step": 546 + }, + { + "epoch": 0.029345493562231758, + "grad_norm": 0.8719412684440613, + "learning_rate": 4.839285714285714e-06, + "loss": 2.5864, + "step": 547 + }, + { + "epoch": 0.029399141630901286, + "grad_norm": 0.9915910959243774, + "learning_rate": 4.8482142857142864e-06, + "loss": 2.5142, + "step": 548 + }, + { + "epoch": 0.029452789699570815, + "grad_norm": 1.4836807250976562, + "learning_rate": 4.857142857142858e-06, + "loss": 2.3387, + "step": 549 + }, + { + "epoch": 0.029506437768240343, + "grad_norm": 1.0107911825180054, + "learning_rate": 4.866071428571429e-06, + "loss": 2.033, + "step": 550 + }, + { + "epoch": 0.02956008583690987, + "grad_norm": 1.0573559999465942, + "learning_rate": 4.875e-06, + "loss": 2.6196, + "step": 551 + }, + { + "epoch": 0.0296137339055794, + "grad_norm": 1.354737401008606, + "learning_rate": 4.883928571428572e-06, + "loss": 2.3285, + "step": 552 + }, + { + "epoch": 0.029667381974248928, + "grad_norm": 0.6840674877166748, + "learning_rate": 4.892857142857143e-06, + "loss": 1.9089, + "step": 553 + }, + { + "epoch": 0.029721030042918456, + "grad_norm": 0.9670092463493347, + "learning_rate": 4.901785714285714e-06, + "loss": 2.3398, + "step": 554 + }, + { + "epoch": 0.029774678111587984, + "grad_norm": 0.9707775115966797, + "learning_rate": 4.910714285714286e-06, + "loss": 2.279, + "step": 555 + }, + { + "epoch": 0.02982832618025751, + "grad_norm": 0.8296954035758972, + "learning_rate": 4.919642857142857e-06, + "loss": 2.7126, + "step": 556 + }, + { + "epoch": 0.029881974248927037, + "grad_norm": 0.8749581575393677, + "learning_rate": 4.928571428571429e-06, + "loss": 2.6278, + "step": 557 + }, + { + "epoch": 0.029935622317596566, + "grad_norm": 1.1528538465499878, + "learning_rate": 4.937500000000001e-06, + "loss": 2.3957, + "step": 558 + }, + { + "epoch": 0.029989270386266094, + "grad_norm": 0.9784524440765381, + "learning_rate": 4.946428571428572e-06, + "loss": 2.2382, + "step": 559 + }, + { + "epoch": 0.030042918454935622, + "grad_norm": 0.7259788513183594, + "learning_rate": 4.955357142857144e-06, + "loss": 2.3094, + "step": 560 + }, + { + "epoch": 0.03009656652360515, + "grad_norm": 0.7123528718948364, + "learning_rate": 4.964285714285715e-06, + "loss": 2.0797, + "step": 561 + }, + { + "epoch": 0.03015021459227468, + "grad_norm": 0.8238666653633118, + "learning_rate": 4.973214285714286e-06, + "loss": 2.2912, + "step": 562 + }, + { + "epoch": 0.030203862660944207, + "grad_norm": 0.8915418386459351, + "learning_rate": 4.982142857142857e-06, + "loss": 2.3714, + "step": 563 + }, + { + "epoch": 0.030257510729613735, + "grad_norm": 1.8866664171218872, + "learning_rate": 4.991071428571429e-06, + "loss": 2.396, + "step": 564 + }, + { + "epoch": 0.03031115879828326, + "grad_norm": 1.5602302551269531, + "learning_rate": 5e-06, + "loss": 2.1339, + "step": 565 + }, + { + "epoch": 0.03036480686695279, + "grad_norm": 0.8363792300224304, + "learning_rate": 4.999999962259043e-06, + "loss": 2.3554, + "step": 566 + }, + { + "epoch": 0.030418454935622317, + "grad_norm": 0.8323478102684021, + "learning_rate": 4.999999849036174e-06, + "loss": 2.4786, + "step": 567 + }, + { + "epoch": 0.030472103004291845, + "grad_norm": 0.9940460920333862, + "learning_rate": 4.999999660331397e-06, + "loss": 2.2116, + "step": 568 + }, + { + "epoch": 0.030525751072961373, + "grad_norm": 0.7168157696723938, + "learning_rate": 4.999999396144716e-06, + "loss": 2.2887, + "step": 569 + }, + { + "epoch": 0.0305793991416309, + "grad_norm": 0.8743246793746948, + "learning_rate": 4.99999905647614e-06, + "loss": 2.5274, + "step": 570 + }, + { + "epoch": 0.03063304721030043, + "grad_norm": 0.6311125159263611, + "learning_rate": 4.999998641325678e-06, + "loss": 2.2784, + "step": 571 + }, + { + "epoch": 0.030686695278969958, + "grad_norm": 0.8848361372947693, + "learning_rate": 4.999998150693345e-06, + "loss": 2.3122, + "step": 572 + }, + { + "epoch": 0.030740343347639486, + "grad_norm": 0.6999986171722412, + "learning_rate": 4.999997584579154e-06, + "loss": 1.9438, + "step": 573 + }, + { + "epoch": 0.03079399141630901, + "grad_norm": 0.8877727389335632, + "learning_rate": 4.999996942983122e-06, + "loss": 2.2672, + "step": 574 + }, + { + "epoch": 0.03084763948497854, + "grad_norm": 0.9267008304595947, + "learning_rate": 4.99999622590527e-06, + "loss": 2.4836, + "step": 575 + }, + { + "epoch": 0.030901287553648068, + "grad_norm": 1.1035044193267822, + "learning_rate": 4.999995433345618e-06, + "loss": 2.3607, + "step": 576 + }, + { + "epoch": 0.030954935622317596, + "grad_norm": 0.8054925799369812, + "learning_rate": 4.999994565304191e-06, + "loss": 2.4194, + "step": 577 + }, + { + "epoch": 0.031008583690987124, + "grad_norm": 0.8115206956863403, + "learning_rate": 4.999993621781013e-06, + "loss": 2.1981, + "step": 578 + }, + { + "epoch": 0.031062231759656653, + "grad_norm": 0.8305758833885193, + "learning_rate": 4.999992602776116e-06, + "loss": 2.2348, + "step": 579 + }, + { + "epoch": 0.03111587982832618, + "grad_norm": 0.8097905516624451, + "learning_rate": 4.999991508289527e-06, + "loss": 2.4876, + "step": 580 + }, + { + "epoch": 0.03116952789699571, + "grad_norm": 0.946451723575592, + "learning_rate": 4.999990338321282e-06, + "loss": 2.4051, + "step": 581 + }, + { + "epoch": 0.031223175965665238, + "grad_norm": 0.7696050405502319, + "learning_rate": 4.999989092871416e-06, + "loss": 2.5064, + "step": 582 + }, + { + "epoch": 0.031276824034334766, + "grad_norm": 0.8154787421226501, + "learning_rate": 4.9999877719399645e-06, + "loss": 2.2645, + "step": 583 + }, + { + "epoch": 0.03133047210300429, + "grad_norm": 0.9122997522354126, + "learning_rate": 4.99998637552697e-06, + "loss": 2.5181, + "step": 584 + }, + { + "epoch": 0.03138412017167382, + "grad_norm": 0.8365505933761597, + "learning_rate": 4.999984903632473e-06, + "loss": 2.6447, + "step": 585 + }, + { + "epoch": 0.03143776824034335, + "grad_norm": 0.8556792736053467, + "learning_rate": 4.999983356256519e-06, + "loss": 2.5029, + "step": 586 + }, + { + "epoch": 0.03149141630901287, + "grad_norm": 0.828436017036438, + "learning_rate": 4.9999817333991525e-06, + "loss": 2.6751, + "step": 587 + }, + { + "epoch": 0.031545064377682404, + "grad_norm": 0.9811944365501404, + "learning_rate": 4.999980035060426e-06, + "loss": 2.2857, + "step": 588 + }, + { + "epoch": 0.03159871244635193, + "grad_norm": 0.8002449870109558, + "learning_rate": 4.999978261240388e-06, + "loss": 2.2137, + "step": 589 + }, + { + "epoch": 0.03165236051502146, + "grad_norm": 0.7398699522018433, + "learning_rate": 4.999976411939092e-06, + "loss": 2.4169, + "step": 590 + }, + { + "epoch": 0.031706008583690985, + "grad_norm": 1.1704624891281128, + "learning_rate": 4.999974487156597e-06, + "loss": 1.8738, + "step": 591 + }, + { + "epoch": 0.03175965665236052, + "grad_norm": 0.617539644241333, + "learning_rate": 4.9999724868929564e-06, + "loss": 2.1428, + "step": 592 + }, + { + "epoch": 0.03181330472103004, + "grad_norm": 0.9764933586120605, + "learning_rate": 4.999970411148235e-06, + "loss": 2.3223, + "step": 593 + }, + { + "epoch": 0.031866952789699574, + "grad_norm": 0.7986540198326111, + "learning_rate": 4.999968259922493e-06, + "loss": 1.9895, + "step": 594 + }, + { + "epoch": 0.0319206008583691, + "grad_norm": 0.8393023610115051, + "learning_rate": 4.9999660332157955e-06, + "loss": 2.2815, + "step": 595 + }, + { + "epoch": 0.03197424892703862, + "grad_norm": 0.8658771514892578, + "learning_rate": 4.99996373102821e-06, + "loss": 2.3395, + "step": 596 + }, + { + "epoch": 0.032027896995708155, + "grad_norm": 0.8444042801856995, + "learning_rate": 4.999961353359807e-06, + "loss": 2.5247, + "step": 597 + }, + { + "epoch": 0.03208154506437768, + "grad_norm": 0.9189513921737671, + "learning_rate": 4.999958900210657e-06, + "loss": 2.4342, + "step": 598 + }, + { + "epoch": 0.03213519313304721, + "grad_norm": 0.7856061458587646, + "learning_rate": 4.999956371580834e-06, + "loss": 2.2446, + "step": 599 + }, + { + "epoch": 0.032188841201716736, + "grad_norm": 1.0161802768707275, + "learning_rate": 4.999953767470416e-06, + "loss": 1.842, + "step": 600 + }, + { + "epoch": 0.03224248927038627, + "grad_norm": 1.0062010288238525, + "learning_rate": 4.99995108787948e-06, + "loss": 2.6268, + "step": 601 + }, + { + "epoch": 0.03229613733905579, + "grad_norm": 0.9725028872489929, + "learning_rate": 4.9999483328081065e-06, + "loss": 2.4455, + "step": 602 + }, + { + "epoch": 0.032349785407725325, + "grad_norm": 0.8319478034973145, + "learning_rate": 4.99994550225638e-06, + "loss": 2.4594, + "step": 603 + }, + { + "epoch": 0.03240343347639485, + "grad_norm": 0.7323631048202515, + "learning_rate": 4.999942596224387e-06, + "loss": 2.1539, + "step": 604 + }, + { + "epoch": 0.032457081545064374, + "grad_norm": 0.8496477007865906, + "learning_rate": 4.999939614712212e-06, + "loss": 2.3917, + "step": 605 + }, + { + "epoch": 0.032510729613733906, + "grad_norm": 0.7720670700073242, + "learning_rate": 4.999936557719948e-06, + "loss": 2.4219, + "step": 606 + }, + { + "epoch": 0.03256437768240343, + "grad_norm": 0.7344219088554382, + "learning_rate": 4.999933425247685e-06, + "loss": 1.7383, + "step": 607 + }, + { + "epoch": 0.03261802575107296, + "grad_norm": 0.9333812594413757, + "learning_rate": 4.999930217295519e-06, + "loss": 1.6582, + "step": 608 + }, + { + "epoch": 0.03267167381974249, + "grad_norm": 0.8230965733528137, + "learning_rate": 4.999926933863546e-06, + "loss": 2.4792, + "step": 609 + }, + { + "epoch": 0.03272532188841202, + "grad_norm": 4.041313648223877, + "learning_rate": 4.999923574951866e-06, + "loss": 2.5668, + "step": 610 + }, + { + "epoch": 0.032778969957081544, + "grad_norm": 0.7684834599494934, + "learning_rate": 4.999920140560582e-06, + "loss": 2.1755, + "step": 611 + }, + { + "epoch": 0.032832618025751076, + "grad_norm": 1.4816479682922363, + "learning_rate": 4.999916630689793e-06, + "loss": 2.3972, + "step": 612 + }, + { + "epoch": 0.0328862660944206, + "grad_norm": 1.0189032554626465, + "learning_rate": 4.9999130453396085e-06, + "loss": 2.5772, + "step": 613 + }, + { + "epoch": 0.032939914163090125, + "grad_norm": 1.0986303091049194, + "learning_rate": 4.999909384510136e-06, + "loss": 2.4164, + "step": 614 + }, + { + "epoch": 0.03299356223175966, + "grad_norm": 0.9175761342048645, + "learning_rate": 4.9999056482014875e-06, + "loss": 2.3117, + "step": 615 + }, + { + "epoch": 0.03304721030042918, + "grad_norm": 0.8070825934410095, + "learning_rate": 4.999901836413773e-06, + "loss": 2.3548, + "step": 616 + }, + { + "epoch": 0.033100858369098714, + "grad_norm": 0.956683337688446, + "learning_rate": 4.999897949147109e-06, + "loss": 2.4175, + "step": 617 + }, + { + "epoch": 0.03315450643776824, + "grad_norm": 1.2388218641281128, + "learning_rate": 4.999893986401612e-06, + "loss": 1.8928, + "step": 618 + }, + { + "epoch": 0.03320815450643777, + "grad_norm": 0.7838587164878845, + "learning_rate": 4.999889948177404e-06, + "loss": 2.2175, + "step": 619 + }, + { + "epoch": 0.033261802575107295, + "grad_norm": 1.0134375095367432, + "learning_rate": 4.999885834474605e-06, + "loss": 2.3261, + "step": 620 + }, + { + "epoch": 0.03331545064377683, + "grad_norm": 1.7147351503372192, + "learning_rate": 4.999881645293339e-06, + "loss": 2.751, + "step": 621 + }, + { + "epoch": 0.03336909871244635, + "grad_norm": 2.4965734481811523, + "learning_rate": 4.999877380633733e-06, + "loss": 2.3369, + "step": 622 + }, + { + "epoch": 0.03342274678111588, + "grad_norm": 0.8227230310440063, + "learning_rate": 4.999873040495916e-06, + "loss": 2.4026, + "step": 623 + }, + { + "epoch": 0.03347639484978541, + "grad_norm": 0.8249292373657227, + "learning_rate": 4.999868624880019e-06, + "loss": 2.1575, + "step": 624 + }, + { + "epoch": 0.03353004291845493, + "grad_norm": 0.9761359095573425, + "learning_rate": 4.999864133786175e-06, + "loss": 2.4239, + "step": 625 + }, + { + "epoch": 0.033583690987124465, + "grad_norm": 1.0189385414123535, + "learning_rate": 4.99985956721452e-06, + "loss": 2.3626, + "step": 626 + }, + { + "epoch": 0.03363733905579399, + "grad_norm": 0.8175013661384583, + "learning_rate": 4.999854925165192e-06, + "loss": 2.2103, + "step": 627 + }, + { + "epoch": 0.03369098712446352, + "grad_norm": 0.9679737091064453, + "learning_rate": 4.99985020763833e-06, + "loss": 2.5721, + "step": 628 + }, + { + "epoch": 0.033744635193133046, + "grad_norm": 0.8842343688011169, + "learning_rate": 4.999845414634077e-06, + "loss": 2.2962, + "step": 629 + }, + { + "epoch": 0.03379828326180258, + "grad_norm": 1.3311737775802612, + "learning_rate": 4.999840546152579e-06, + "loss": 2.4183, + "step": 630 + }, + { + "epoch": 0.0338519313304721, + "grad_norm": 0.7848086953163147, + "learning_rate": 4.999835602193981e-06, + "loss": 2.3146, + "step": 631 + }, + { + "epoch": 0.03390557939914163, + "grad_norm": 0.8788207769393921, + "learning_rate": 4.9998305827584335e-06, + "loss": 2.471, + "step": 632 + }, + { + "epoch": 0.03395922746781116, + "grad_norm": 2.2514638900756836, + "learning_rate": 4.999825487846088e-06, + "loss": 2.3052, + "step": 633 + }, + { + "epoch": 0.034012875536480684, + "grad_norm": 0.9558051824569702, + "learning_rate": 4.999820317457098e-06, + "loss": 2.3038, + "step": 634 + }, + { + "epoch": 0.034066523605150216, + "grad_norm": 7.700989246368408, + "learning_rate": 4.99981507159162e-06, + "loss": 2.2401, + "step": 635 + }, + { + "epoch": 0.03412017167381974, + "grad_norm": 0.8497698307037354, + "learning_rate": 4.999809750249811e-06, + "loss": 2.2652, + "step": 636 + }, + { + "epoch": 0.03417381974248927, + "grad_norm": 0.8105354309082031, + "learning_rate": 4.999804353431834e-06, + "loss": 2.4668, + "step": 637 + }, + { + "epoch": 0.0342274678111588, + "grad_norm": 0.7997203469276428, + "learning_rate": 4.99979888113785e-06, + "loss": 2.2942, + "step": 638 + }, + { + "epoch": 0.03428111587982833, + "grad_norm": 0.8552649617195129, + "learning_rate": 4.999793333368025e-06, + "loss": 2.3791, + "step": 639 + }, + { + "epoch": 0.034334763948497854, + "grad_norm": 0.9041031002998352, + "learning_rate": 4.999787710122526e-06, + "loss": 2.4304, + "step": 640 + }, + { + "epoch": 0.03438841201716738, + "grad_norm": 1.0379509925842285, + "learning_rate": 4.999782011401524e-06, + "loss": 2.2387, + "step": 641 + }, + { + "epoch": 0.03444206008583691, + "grad_norm": 1.0014477968215942, + "learning_rate": 4.99977623720519e-06, + "loss": 2.3903, + "step": 642 + }, + { + "epoch": 0.034495708154506435, + "grad_norm": 0.818027138710022, + "learning_rate": 4.999770387533699e-06, + "loss": 2.4231, + "step": 643 + }, + { + "epoch": 0.03454935622317597, + "grad_norm": 0.9278150796890259, + "learning_rate": 4.999764462387227e-06, + "loss": 2.3367, + "step": 644 + }, + { + "epoch": 0.03460300429184549, + "grad_norm": 0.8519514799118042, + "learning_rate": 4.999758461765953e-06, + "loss": 2.3704, + "step": 645 + }, + { + "epoch": 0.034656652360515024, + "grad_norm": 2.329301595687866, + "learning_rate": 4.999752385670058e-06, + "loss": 2.3162, + "step": 646 + }, + { + "epoch": 0.03471030042918455, + "grad_norm": 0.7948412895202637, + "learning_rate": 4.999746234099726e-06, + "loss": 2.3477, + "step": 647 + }, + { + "epoch": 0.03476394849785408, + "grad_norm": 0.9377961754798889, + "learning_rate": 4.999740007055142e-06, + "loss": 2.219, + "step": 648 + }, + { + "epoch": 0.034817596566523605, + "grad_norm": 1.2502127885818481, + "learning_rate": 4.9997337045364955e-06, + "loss": 2.226, + "step": 649 + }, + { + "epoch": 0.03487124463519313, + "grad_norm": 0.7637403607368469, + "learning_rate": 4.9997273265439756e-06, + "loss": 2.2265, + "step": 650 + }, + { + "epoch": 0.03492489270386266, + "grad_norm": 1.3224856853485107, + "learning_rate": 4.999720873077775e-06, + "loss": 1.6977, + "step": 651 + }, + { + "epoch": 0.034978540772532186, + "grad_norm": 0.7775270938873291, + "learning_rate": 4.999714344138088e-06, + "loss": 2.3436, + "step": 652 + }, + { + "epoch": 0.03503218884120172, + "grad_norm": 0.8279511332511902, + "learning_rate": 4.999707739725113e-06, + "loss": 2.283, + "step": 653 + }, + { + "epoch": 0.03508583690987124, + "grad_norm": 1.0360100269317627, + "learning_rate": 4.999701059839048e-06, + "loss": 2.3208, + "step": 654 + }, + { + "epoch": 0.035139484978540775, + "grad_norm": 1.0353562831878662, + "learning_rate": 4.999694304480096e-06, + "loss": 2.297, + "step": 655 + }, + { + "epoch": 0.0351931330472103, + "grad_norm": 1.0269654989242554, + "learning_rate": 4.99968747364846e-06, + "loss": 2.4866, + "step": 656 + }, + { + "epoch": 0.03524678111587983, + "grad_norm": 0.8758061528205872, + "learning_rate": 4.999680567344347e-06, + "loss": 2.4508, + "step": 657 + }, + { + "epoch": 0.035300429184549356, + "grad_norm": 0.8079335689544678, + "learning_rate": 4.999673585567964e-06, + "loss": 2.3333, + "step": 658 + }, + { + "epoch": 0.03535407725321888, + "grad_norm": 0.8654548525810242, + "learning_rate": 4.999666528319525e-06, + "loss": 2.3997, + "step": 659 + }, + { + "epoch": 0.03540772532188841, + "grad_norm": 0.898048996925354, + "learning_rate": 4.999659395599239e-06, + "loss": 2.4132, + "step": 660 + }, + { + "epoch": 0.03546137339055794, + "grad_norm": 0.706701934337616, + "learning_rate": 4.999652187407325e-06, + "loss": 2.4879, + "step": 661 + }, + { + "epoch": 0.03551502145922747, + "grad_norm": 1.273048758506775, + "learning_rate": 4.999644903743998e-06, + "loss": 2.2613, + "step": 662 + }, + { + "epoch": 0.035568669527896994, + "grad_norm": 1.7957842350006104, + "learning_rate": 4.999637544609479e-06, + "loss": 2.1644, + "step": 663 + }, + { + "epoch": 0.035622317596566526, + "grad_norm": 1.0425153970718384, + "learning_rate": 4.99963011000399e-06, + "loss": 2.425, + "step": 664 + }, + { + "epoch": 0.03567596566523605, + "grad_norm": 0.8255715370178223, + "learning_rate": 4.999622599927757e-06, + "loss": 2.5267, + "step": 665 + }, + { + "epoch": 0.03572961373390558, + "grad_norm": 1.2199499607086182, + "learning_rate": 4.999615014381004e-06, + "loss": 2.365, + "step": 666 + }, + { + "epoch": 0.03578326180257511, + "grad_norm": 0.8043991327285767, + "learning_rate": 4.999607353363961e-06, + "loss": 2.407, + "step": 667 + }, + { + "epoch": 0.03583690987124463, + "grad_norm": 0.8119394779205322, + "learning_rate": 4.999599616876861e-06, + "loss": 2.2399, + "step": 668 + }, + { + "epoch": 0.035890557939914164, + "grad_norm": 0.8173143863677979, + "learning_rate": 4.999591804919935e-06, + "loss": 2.3591, + "step": 669 + }, + { + "epoch": 0.03594420600858369, + "grad_norm": 0.7705435752868652, + "learning_rate": 4.999583917493421e-06, + "loss": 2.5671, + "step": 670 + }, + { + "epoch": 0.03599785407725322, + "grad_norm": 0.8844898343086243, + "learning_rate": 4.999575954597556e-06, + "loss": 2.4353, + "step": 671 + }, + { + "epoch": 0.036051502145922745, + "grad_norm": 1.1428945064544678, + "learning_rate": 4.9995679162325805e-06, + "loss": 2.2943, + "step": 672 + }, + { + "epoch": 0.03610515021459228, + "grad_norm": 0.8979571461677551, + "learning_rate": 4.999559802398737e-06, + "loss": 2.4669, + "step": 673 + }, + { + "epoch": 0.0361587982832618, + "grad_norm": 1.020084261894226, + "learning_rate": 4.999551613096272e-06, + "loss": 2.5004, + "step": 674 + }, + { + "epoch": 0.036212446351931334, + "grad_norm": 1.0453612804412842, + "learning_rate": 4.99954334832543e-06, + "loss": 2.6331, + "step": 675 + }, + { + "epoch": 0.03626609442060086, + "grad_norm": 0.8666693568229675, + "learning_rate": 4.999535008086464e-06, + "loss": 2.1928, + "step": 676 + }, + { + "epoch": 0.03631974248927038, + "grad_norm": 0.9827839732170105, + "learning_rate": 4.999526592379623e-06, + "loss": 2.4469, + "step": 677 + }, + { + "epoch": 0.036373390557939915, + "grad_norm": 0.8393266201019287, + "learning_rate": 4.999518101205162e-06, + "loss": 2.4985, + "step": 678 + }, + { + "epoch": 0.03642703862660944, + "grad_norm": 1.0076488256454468, + "learning_rate": 4.999509534563338e-06, + "loss": 2.4406, + "step": 679 + }, + { + "epoch": 0.03648068669527897, + "grad_norm": 1.0385128259658813, + "learning_rate": 4.999500892454409e-06, + "loss": 2.4823, + "step": 680 + }, + { + "epoch": 0.036534334763948496, + "grad_norm": 4.658494472503662, + "learning_rate": 4.9994921748786365e-06, + "loss": 2.3569, + "step": 681 + }, + { + "epoch": 0.03658798283261803, + "grad_norm": 0.8108393549919128, + "learning_rate": 4.999483381836282e-06, + "loss": 2.2848, + "step": 682 + }, + { + "epoch": 0.03664163090128755, + "grad_norm": 0.7995207905769348, + "learning_rate": 4.999474513327612e-06, + "loss": 2.2727, + "step": 683 + }, + { + "epoch": 0.036695278969957085, + "grad_norm": 1.0003660917282104, + "learning_rate": 4.999465569352896e-06, + "loss": 2.4904, + "step": 684 + }, + { + "epoch": 0.03674892703862661, + "grad_norm": 1.129424810409546, + "learning_rate": 4.999456549912402e-06, + "loss": 2.3447, + "step": 685 + }, + { + "epoch": 0.036802575107296134, + "grad_norm": 0.8258870244026184, + "learning_rate": 4.999447455006402e-06, + "loss": 2.4406, + "step": 686 + }, + { + "epoch": 0.036856223175965666, + "grad_norm": 0.8014585971832275, + "learning_rate": 4.9994382846351725e-06, + "loss": 2.3744, + "step": 687 + }, + { + "epoch": 0.03690987124463519, + "grad_norm": 0.7724493741989136, + "learning_rate": 4.999429038798989e-06, + "loss": 1.7458, + "step": 688 + }, + { + "epoch": 0.03696351931330472, + "grad_norm": 0.9295950531959534, + "learning_rate": 4.999419717498131e-06, + "loss": 2.4803, + "step": 689 + }, + { + "epoch": 0.03701716738197425, + "grad_norm": 1.2042477130889893, + "learning_rate": 4.999410320732879e-06, + "loss": 2.3137, + "step": 690 + }, + { + "epoch": 0.03707081545064378, + "grad_norm": 0.9284137487411499, + "learning_rate": 4.999400848503519e-06, + "loss": 2.4265, + "step": 691 + }, + { + "epoch": 0.037124463519313304, + "grad_norm": 0.8523659110069275, + "learning_rate": 4.999391300810335e-06, + "loss": 2.4381, + "step": 692 + }, + { + "epoch": 0.037178111587982836, + "grad_norm": 1.2447504997253418, + "learning_rate": 4.999381677653616e-06, + "loss": 2.4701, + "step": 693 + }, + { + "epoch": 0.03723175965665236, + "grad_norm": 0.8053240776062012, + "learning_rate": 4.999371979033653e-06, + "loss": 2.4038, + "step": 694 + }, + { + "epoch": 0.037285407725321885, + "grad_norm": 0.8771582841873169, + "learning_rate": 4.999362204950737e-06, + "loss": 2.2925, + "step": 695 + }, + { + "epoch": 0.03733905579399142, + "grad_norm": 0.9519544243812561, + "learning_rate": 4.999352355405165e-06, + "loss": 2.3953, + "step": 696 + }, + { + "epoch": 0.03739270386266094, + "grad_norm": 0.820660412311554, + "learning_rate": 4.9993424303972336e-06, + "loss": 2.4248, + "step": 697 + }, + { + "epoch": 0.037446351931330474, + "grad_norm": 0.8565362095832825, + "learning_rate": 4.999332429927243e-06, + "loss": 2.3761, + "step": 698 + }, + { + "epoch": 0.0375, + "grad_norm": 0.8427897691726685, + "learning_rate": 4.999322353995495e-06, + "loss": 2.3386, + "step": 699 + }, + { + "epoch": 0.03755364806866953, + "grad_norm": 0.8358487486839294, + "learning_rate": 4.9993122026022935e-06, + "loss": 2.3625, + "step": 700 + }, + { + "epoch": 0.037607296137339055, + "grad_norm": 10.18463134765625, + "learning_rate": 4.999301975747944e-06, + "loss": 2.2802, + "step": 701 + }, + { + "epoch": 0.03766094420600859, + "grad_norm": 1.2484214305877686, + "learning_rate": 4.999291673432758e-06, + "loss": 2.3287, + "step": 702 + }, + { + "epoch": 0.03771459227467811, + "grad_norm": 1.0467242002487183, + "learning_rate": 4.999281295657044e-06, + "loss": 2.4922, + "step": 703 + }, + { + "epoch": 0.03776824034334764, + "grad_norm": 0.9561108350753784, + "learning_rate": 4.9992708424211164e-06, + "loss": 2.309, + "step": 704 + }, + { + "epoch": 0.03782188841201717, + "grad_norm": 0.9839465618133545, + "learning_rate": 4.999260313725291e-06, + "loss": 2.3399, + "step": 705 + }, + { + "epoch": 0.03787553648068669, + "grad_norm": 0.9964559078216553, + "learning_rate": 4.999249709569885e-06, + "loss": 2.1475, + "step": 706 + }, + { + "epoch": 0.037929184549356225, + "grad_norm": 0.9006378054618835, + "learning_rate": 4.999239029955219e-06, + "loss": 2.2803, + "step": 707 + }, + { + "epoch": 0.03798283261802575, + "grad_norm": 0.8732361793518066, + "learning_rate": 4.999228274881615e-06, + "loss": 2.2114, + "step": 708 + }, + { + "epoch": 0.03803648068669528, + "grad_norm": 1.0166932344436646, + "learning_rate": 4.999217444349398e-06, + "loss": 2.4329, + "step": 709 + }, + { + "epoch": 0.038090128755364806, + "grad_norm": 0.8104544878005981, + "learning_rate": 4.999206538358896e-06, + "loss": 2.0427, + "step": 710 + }, + { + "epoch": 0.03814377682403434, + "grad_norm": 1.361675500869751, + "learning_rate": 4.999195556910437e-06, + "loss": 2.2213, + "step": 711 + }, + { + "epoch": 0.03819742489270386, + "grad_norm": 0.9806872010231018, + "learning_rate": 4.9991845000043525e-06, + "loss": 2.4211, + "step": 712 + }, + { + "epoch": 0.03825107296137339, + "grad_norm": 0.8754968643188477, + "learning_rate": 4.999173367640977e-06, + "loss": 2.4261, + "step": 713 + }, + { + "epoch": 0.03830472103004292, + "grad_norm": 0.8951733708381653, + "learning_rate": 4.999162159820646e-06, + "loss": 2.1626, + "step": 714 + }, + { + "epoch": 0.038358369098712444, + "grad_norm": 0.876552939414978, + "learning_rate": 4.999150876543699e-06, + "loss": 2.3987, + "step": 715 + }, + { + "epoch": 0.038412017167381976, + "grad_norm": 0.8129255771636963, + "learning_rate": 4.999139517810476e-06, + "loss": 2.5997, + "step": 716 + }, + { + "epoch": 0.0384656652360515, + "grad_norm": 0.7739890217781067, + "learning_rate": 4.99912808362132e-06, + "loss": 2.0723, + "step": 717 + }, + { + "epoch": 0.03851931330472103, + "grad_norm": 0.8814245462417603, + "learning_rate": 4.999116573976576e-06, + "loss": 2.3901, + "step": 718 + }, + { + "epoch": 0.03857296137339056, + "grad_norm": 0.847317099571228, + "learning_rate": 4.999104988876592e-06, + "loss": 2.2523, + "step": 719 + }, + { + "epoch": 0.03862660944206009, + "grad_norm": 0.7911797165870667, + "learning_rate": 4.999093328321717e-06, + "loss": 2.2159, + "step": 720 + }, + { + "epoch": 0.038680257510729614, + "grad_norm": 0.8942217826843262, + "learning_rate": 4.999081592312303e-06, + "loss": 2.5269, + "step": 721 + }, + { + "epoch": 0.03873390557939914, + "grad_norm": 0.9118621349334717, + "learning_rate": 4.999069780848705e-06, + "loss": 2.5405, + "step": 722 + }, + { + "epoch": 0.03878755364806867, + "grad_norm": 0.806627094745636, + "learning_rate": 4.99905789393128e-06, + "loss": 2.4317, + "step": 723 + }, + { + "epoch": 0.038841201716738195, + "grad_norm": 1.011579990386963, + "learning_rate": 4.999045931560387e-06, + "loss": 2.3841, + "step": 724 + }, + { + "epoch": 0.03889484978540773, + "grad_norm": 0.8012880682945251, + "learning_rate": 4.999033893736386e-06, + "loss": 2.2811, + "step": 725 + }, + { + "epoch": 0.03894849785407725, + "grad_norm": 0.8141025900840759, + "learning_rate": 4.9990217804596405e-06, + "loss": 2.354, + "step": 726 + }, + { + "epoch": 0.039002145922746784, + "grad_norm": 0.8222667574882507, + "learning_rate": 4.999009591730517e-06, + "loss": 2.5948, + "step": 727 + }, + { + "epoch": 0.03905579399141631, + "grad_norm": 0.8395331501960754, + "learning_rate": 4.998997327549383e-06, + "loss": 2.327, + "step": 728 + }, + { + "epoch": 0.03910944206008584, + "grad_norm": 0.9304442405700684, + "learning_rate": 4.99898498791661e-06, + "loss": 2.1368, + "step": 729 + }, + { + "epoch": 0.039163090128755365, + "grad_norm": 1.8233082294464111, + "learning_rate": 4.998972572832568e-06, + "loss": 2.2995, + "step": 730 + }, + { + "epoch": 0.03921673819742489, + "grad_norm": 0.8477112650871277, + "learning_rate": 4.998960082297634e-06, + "loss": 2.0283, + "step": 731 + }, + { + "epoch": 0.03927038626609442, + "grad_norm": 0.8308610320091248, + "learning_rate": 4.9989475163121855e-06, + "loss": 2.2226, + "step": 732 + }, + { + "epoch": 0.039324034334763946, + "grad_norm": 0.953279435634613, + "learning_rate": 4.998934874876601e-06, + "loss": 2.2687, + "step": 733 + }, + { + "epoch": 0.03937768240343348, + "grad_norm": 0.847504734992981, + "learning_rate": 4.998922157991261e-06, + "loss": 2.4557, + "step": 734 + }, + { + "epoch": 0.039431330472103, + "grad_norm": 0.8619157671928406, + "learning_rate": 4.998909365656551e-06, + "loss": 2.3774, + "step": 735 + }, + { + "epoch": 0.039484978540772535, + "grad_norm": 0.8861187100410461, + "learning_rate": 4.998896497872857e-06, + "loss": 2.3729, + "step": 736 + }, + { + "epoch": 0.03953862660944206, + "grad_norm": 4.534422397613525, + "learning_rate": 4.998883554640568e-06, + "loss": 2.146, + "step": 737 + }, + { + "epoch": 0.03959227467811159, + "grad_norm": 0.8690135478973389, + "learning_rate": 4.998870535960074e-06, + "loss": 1.9468, + "step": 738 + }, + { + "epoch": 0.039645922746781116, + "grad_norm": 0.9731127619743347, + "learning_rate": 4.9988574418317675e-06, + "loss": 2.1205, + "step": 739 + }, + { + "epoch": 0.03969957081545064, + "grad_norm": 0.8011561036109924, + "learning_rate": 4.9988442722560445e-06, + "loss": 2.365, + "step": 740 + }, + { + "epoch": 0.03975321888412017, + "grad_norm": 0.9566690921783447, + "learning_rate": 4.998831027233304e-06, + "loss": 1.9065, + "step": 741 + }, + { + "epoch": 0.0398068669527897, + "grad_norm": 1.4914478063583374, + "learning_rate": 4.998817706763943e-06, + "loss": 2.4801, + "step": 742 + }, + { + "epoch": 0.03986051502145923, + "grad_norm": 0.8158642649650574, + "learning_rate": 4.998804310848366e-06, + "loss": 2.3732, + "step": 743 + }, + { + "epoch": 0.039914163090128754, + "grad_norm": 1.379916787147522, + "learning_rate": 4.998790839486977e-06, + "loss": 2.3872, + "step": 744 + }, + { + "epoch": 0.039967811158798286, + "grad_norm": 2.303086757659912, + "learning_rate": 4.998777292680182e-06, + "loss": 1.9339, + "step": 745 + }, + { + "epoch": 0.04002145922746781, + "grad_norm": 0.9438580870628357, + "learning_rate": 4.998763670428391e-06, + "loss": 2.5294, + "step": 746 + }, + { + "epoch": 0.04007510729613734, + "grad_norm": 0.7845317721366882, + "learning_rate": 4.998749972732014e-06, + "loss": 2.1711, + "step": 747 + }, + { + "epoch": 0.04012875536480687, + "grad_norm": 0.8746833205223083, + "learning_rate": 4.998736199591466e-06, + "loss": 2.1271, + "step": 748 + }, + { + "epoch": 0.04018240343347639, + "grad_norm": 0.8327506184577942, + "learning_rate": 4.998722351007163e-06, + "loss": 2.5124, + "step": 749 + }, + { + "epoch": 0.040236051502145924, + "grad_norm": 0.8333725929260254, + "learning_rate": 4.998708426979521e-06, + "loss": 2.3701, + "step": 750 + }, + { + "epoch": 0.04028969957081545, + "grad_norm": 0.8267025947570801, + "learning_rate": 4.998694427508962e-06, + "loss": 2.4137, + "step": 751 + }, + { + "epoch": 0.04034334763948498, + "grad_norm": 0.8505074381828308, + "learning_rate": 4.99868035259591e-06, + "loss": 2.3332, + "step": 752 + }, + { + "epoch": 0.040396995708154505, + "grad_norm": 0.8915570974349976, + "learning_rate": 4.998666202240786e-06, + "loss": 2.1964, + "step": 753 + }, + { + "epoch": 0.04045064377682404, + "grad_norm": 1.062445878982544, + "learning_rate": 4.998651976444021e-06, + "loss": 2.5681, + "step": 754 + }, + { + "epoch": 0.04050429184549356, + "grad_norm": 1.759235143661499, + "learning_rate": 4.998637675206043e-06, + "loss": 2.0857, + "step": 755 + }, + { + "epoch": 0.04055793991416309, + "grad_norm": 0.9714831709861755, + "learning_rate": 4.998623298527283e-06, + "loss": 2.5067, + "step": 756 + }, + { + "epoch": 0.04061158798283262, + "grad_norm": 0.8594189882278442, + "learning_rate": 4.9986088464081775e-06, + "loss": 2.3668, + "step": 757 + }, + { + "epoch": 0.04066523605150214, + "grad_norm": 1.1636526584625244, + "learning_rate": 4.9985943188491605e-06, + "loss": 2.1719, + "step": 758 + }, + { + "epoch": 0.040718884120171675, + "grad_norm": 0.7689273357391357, + "learning_rate": 4.998579715850671e-06, + "loss": 2.317, + "step": 759 + }, + { + "epoch": 0.0407725321888412, + "grad_norm": 0.7637854218482971, + "learning_rate": 4.998565037413151e-06, + "loss": 2.164, + "step": 760 + }, + { + "epoch": 0.04082618025751073, + "grad_norm": 1.039669156074524, + "learning_rate": 4.9985502835370435e-06, + "loss": 2.341, + "step": 761 + }, + { + "epoch": 0.040879828326180256, + "grad_norm": 0.8856495022773743, + "learning_rate": 4.998535454222793e-06, + "loss": 2.186, + "step": 762 + }, + { + "epoch": 0.04093347639484979, + "grad_norm": 0.7140936851501465, + "learning_rate": 4.998520549470847e-06, + "loss": 2.175, + "step": 763 + }, + { + "epoch": 0.04098712446351931, + "grad_norm": 0.8625261783599854, + "learning_rate": 4.998505569281659e-06, + "loss": 2.3984, + "step": 764 + }, + { + "epoch": 0.04104077253218884, + "grad_norm": 0.9290820360183716, + "learning_rate": 4.998490513655676e-06, + "loss": 2.364, + "step": 765 + }, + { + "epoch": 0.04109442060085837, + "grad_norm": 0.8107749819755554, + "learning_rate": 4.998475382593356e-06, + "loss": 2.5272, + "step": 766 + }, + { + "epoch": 0.041148068669527894, + "grad_norm": 1.0411086082458496, + "learning_rate": 4.998460176095155e-06, + "loss": 1.7349, + "step": 767 + }, + { + "epoch": 0.041201716738197426, + "grad_norm": NaN, + "learning_rate": 4.998460176095155e-06, + "loss": 2.3429, + "step": 768 + }, + { + "epoch": 0.04125536480686695, + "grad_norm": 0.8158135414123535, + "learning_rate": 4.9984448941615324e-06, + "loss": 1.8537, + "step": 769 + }, + { + "epoch": 0.04130901287553648, + "grad_norm": 1.0948500633239746, + "learning_rate": 4.998429536792949e-06, + "loss": 2.2529, + "step": 770 + }, + { + "epoch": 0.04136266094420601, + "grad_norm": 0.8820666074752808, + "learning_rate": 4.998414103989868e-06, + "loss": 2.3287, + "step": 771 + }, + { + "epoch": 0.04141630901287554, + "grad_norm": 0.8852021098136902, + "learning_rate": 4.998398595752757e-06, + "loss": 2.3479, + "step": 772 + }, + { + "epoch": 0.041469957081545064, + "grad_norm": 0.7983525395393372, + "learning_rate": 4.998383012082084e-06, + "loss": 2.3484, + "step": 773 + }, + { + "epoch": 0.04152360515021459, + "grad_norm": 0.917819619178772, + "learning_rate": 4.998367352978318e-06, + "loss": 2.3576, + "step": 774 + }, + { + "epoch": 0.04157725321888412, + "grad_norm": 0.848444938659668, + "learning_rate": 4.998351618441933e-06, + "loss": 2.2624, + "step": 775 + }, + { + "epoch": 0.041630901287553645, + "grad_norm": 1.1796140670776367, + "learning_rate": 4.998335808473404e-06, + "loss": 2.0746, + "step": 776 + }, + { + "epoch": 0.04168454935622318, + "grad_norm": 0.8889231085777283, + "learning_rate": 4.998319923073209e-06, + "loss": 2.304, + "step": 777 + }, + { + "epoch": 0.0417381974248927, + "grad_norm": 0.9986532330513, + "learning_rate": 4.998303962241825e-06, + "loss": 2.3869, + "step": 778 + }, + { + "epoch": 0.041791845493562234, + "grad_norm": 0.8982643485069275, + "learning_rate": 4.998287925979735e-06, + "loss": 2.2363, + "step": 779 + }, + { + "epoch": 0.04184549356223176, + "grad_norm": 0.8382365703582764, + "learning_rate": 4.998271814287427e-06, + "loss": 2.453, + "step": 780 + }, + { + "epoch": 0.04189914163090129, + "grad_norm": 0.8186877965927124, + "learning_rate": 4.998255627165382e-06, + "loss": 2.1865, + "step": 781 + }, + { + "epoch": 0.041952789699570815, + "grad_norm": 0.9079015254974365, + "learning_rate": 4.998239364614091e-06, + "loss": 2.4476, + "step": 782 + }, + { + "epoch": 0.04200643776824034, + "grad_norm": 0.936324417591095, + "learning_rate": 4.998223026634046e-06, + "loss": 2.4837, + "step": 783 + }, + { + "epoch": 0.04206008583690987, + "grad_norm": 0.734960675239563, + "learning_rate": 4.998206613225739e-06, + "loss": 2.2193, + "step": 784 + }, + { + "epoch": 0.0421137339055794, + "grad_norm": 0.8517699241638184, + "learning_rate": 4.998190124389666e-06, + "loss": 2.3412, + "step": 785 + }, + { + "epoch": 0.04216738197424893, + "grad_norm": 0.8668347597122192, + "learning_rate": 4.998173560126324e-06, + "loss": 2.2964, + "step": 786 + }, + { + "epoch": 0.04222103004291845, + "grad_norm": 0.9091010689735413, + "learning_rate": 4.998156920436215e-06, + "loss": 2.2286, + "step": 787 + }, + { + "epoch": 0.042274678111587985, + "grad_norm": 0.9950007796287537, + "learning_rate": 4.998140205319839e-06, + "loss": 2.5874, + "step": 788 + }, + { + "epoch": 0.04232832618025751, + "grad_norm": 0.9189279079437256, + "learning_rate": 4.9981234147777025e-06, + "loss": 2.3062, + "step": 789 + }, + { + "epoch": 0.04238197424892704, + "grad_norm": 1.0291131734848022, + "learning_rate": 4.998106548810312e-06, + "loss": 2.3356, + "step": 790 + }, + { + "epoch": 0.042435622317596566, + "grad_norm": 0.8761991858482361, + "learning_rate": 4.9980896074181765e-06, + "loss": 2.3992, + "step": 791 + }, + { + "epoch": 0.04248927038626609, + "grad_norm": 0.892407238483429, + "learning_rate": 4.998072590601808e-06, + "loss": 2.4764, + "step": 792 + }, + { + "epoch": 0.04254291845493562, + "grad_norm": 0.9847999215126038, + "learning_rate": 4.998055498361719e-06, + "loss": 2.4833, + "step": 793 + }, + { + "epoch": 0.04259656652360515, + "grad_norm": 1.0667805671691895, + "learning_rate": 4.998038330698427e-06, + "loss": 2.3346, + "step": 794 + }, + { + "epoch": 0.04265021459227468, + "grad_norm": 0.9136879444122314, + "learning_rate": 4.99802108761245e-06, + "loss": 2.4017, + "step": 795 + }, + { + "epoch": 0.042703862660944204, + "grad_norm": 0.9691357016563416, + "learning_rate": 4.9980037691043086e-06, + "loss": 2.2941, + "step": 796 + }, + { + "epoch": 0.042757510729613736, + "grad_norm": 1.5764594078063965, + "learning_rate": 4.997986375174525e-06, + "loss": 2.5367, + "step": 797 + }, + { + "epoch": 0.04281115879828326, + "grad_norm": 2.486959457397461, + "learning_rate": 4.997968905823626e-06, + "loss": 2.4594, + "step": 798 + }, + { + "epoch": 0.04286480686695279, + "grad_norm": 0.9328217506408691, + "learning_rate": 4.997951361052136e-06, + "loss": 2.4032, + "step": 799 + }, + { + "epoch": 0.04291845493562232, + "grad_norm": 0.9318724274635315, + "learning_rate": 4.997933740860588e-06, + "loss": 2.249, + "step": 800 + }, + { + "epoch": 0.04297210300429184, + "grad_norm": 0.8338583111763, + "learning_rate": 4.997916045249513e-06, + "loss": 2.3779, + "step": 801 + }, + { + "epoch": 0.043025751072961374, + "grad_norm": 1.0567046403884888, + "learning_rate": 4.997898274219444e-06, + "loss": 2.3838, + "step": 802 + }, + { + "epoch": 0.0430793991416309, + "grad_norm": 0.87156742811203, + "learning_rate": 4.997880427770918e-06, + "loss": 2.6135, + "step": 803 + }, + { + "epoch": 0.04313304721030043, + "grad_norm": 1.1943604946136475, + "learning_rate": 4.997862505904475e-06, + "loss": 2.5741, + "step": 804 + }, + { + "epoch": 0.043186695278969955, + "grad_norm": 6.526900291442871, + "learning_rate": 4.9978445086206565e-06, + "loss": 2.3296, + "step": 805 + }, + { + "epoch": 0.04324034334763949, + "grad_norm": 3.29752254486084, + "learning_rate": 4.997826435920003e-06, + "loss": 2.3092, + "step": 806 + }, + { + "epoch": 0.04329399141630901, + "grad_norm": 0.8057052493095398, + "learning_rate": 4.997808287803063e-06, + "loss": 2.2616, + "step": 807 + }, + { + "epoch": 0.043347639484978544, + "grad_norm": 1.2297816276550293, + "learning_rate": 4.997790064270383e-06, + "loss": 2.4965, + "step": 808 + }, + { + "epoch": 0.04340128755364807, + "grad_norm": 0.9467470049858093, + "learning_rate": 4.997771765322515e-06, + "loss": 2.2229, + "step": 809 + }, + { + "epoch": 0.04345493562231759, + "grad_norm": 1.096414566040039, + "learning_rate": 4.997753390960009e-06, + "loss": 2.2314, + "step": 810 + }, + { + "epoch": 0.043508583690987125, + "grad_norm": 0.8888903260231018, + "learning_rate": 4.997734941183421e-06, + "loss": 2.3368, + "step": 811 + }, + { + "epoch": 0.04356223175965665, + "grad_norm": 0.7792751789093018, + "learning_rate": 4.997716415993308e-06, + "loss": 2.2053, + "step": 812 + }, + { + "epoch": 0.04361587982832618, + "grad_norm": 0.8904644846916199, + "learning_rate": 4.99769781539023e-06, + "loss": 2.4439, + "step": 813 + }, + { + "epoch": 0.043669527896995707, + "grad_norm": 0.8041107058525085, + "learning_rate": 4.997679139374747e-06, + "loss": 1.7341, + "step": 814 + }, + { + "epoch": 0.04372317596566524, + "grad_norm": 0.8451853394508362, + "learning_rate": 4.997660387947424e-06, + "loss": 2.2739, + "step": 815 + }, + { + "epoch": 0.04377682403433476, + "grad_norm": 0.9418163895606995, + "learning_rate": 4.997641561108827e-06, + "loss": 2.4701, + "step": 816 + }, + { + "epoch": 0.043830472103004295, + "grad_norm": 0.9716716408729553, + "learning_rate": 4.997622658859524e-06, + "loss": 2.5012, + "step": 817 + }, + { + "epoch": 0.04388412017167382, + "grad_norm": 0.9453468918800354, + "learning_rate": 4.997603681200087e-06, + "loss": 2.3483, + "step": 818 + }, + { + "epoch": 0.043937768240343344, + "grad_norm": 0.935333251953125, + "learning_rate": 4.997584628131088e-06, + "loss": 2.4222, + "step": 819 + }, + { + "epoch": 0.043991416309012876, + "grad_norm": 0.9922914505004883, + "learning_rate": 4.997565499653101e-06, + "loss": 2.4547, + "step": 820 + }, + { + "epoch": 0.0440450643776824, + "grad_norm": 0.7321531772613525, + "learning_rate": 4.997546295766706e-06, + "loss": 2.0147, + "step": 821 + }, + { + "epoch": 0.04409871244635193, + "grad_norm": 1.4188416004180908, + "learning_rate": 4.9975270164724815e-06, + "loss": 2.2163, + "step": 822 + }, + { + "epoch": 0.04415236051502146, + "grad_norm": 0.8140658736228943, + "learning_rate": 4.99750766177101e-06, + "loss": 2.223, + "step": 823 + }, + { + "epoch": 0.04420600858369099, + "grad_norm": 0.833301842212677, + "learning_rate": 4.9974882316628755e-06, + "loss": 2.291, + "step": 824 + }, + { + "epoch": 0.044259656652360514, + "grad_norm": 0.869111180305481, + "learning_rate": 4.997468726148664e-06, + "loss": 2.2221, + "step": 825 + }, + { + "epoch": 0.044313304721030046, + "grad_norm": 0.9753746390342712, + "learning_rate": 4.997449145228966e-06, + "loss": 2.225, + "step": 826 + }, + { + "epoch": 0.04436695278969957, + "grad_norm": 10.13404369354248, + "learning_rate": 4.997429488904373e-06, + "loss": 2.5246, + "step": 827 + }, + { + "epoch": 0.044420600858369096, + "grad_norm": 1.0190917253494263, + "learning_rate": 4.997409757175476e-06, + "loss": 2.2721, + "step": 828 + }, + { + "epoch": 0.04447424892703863, + "grad_norm": 0.941078782081604, + "learning_rate": 4.9973899500428725e-06, + "loss": 2.2688, + "step": 829 + }, + { + "epoch": 0.04452789699570815, + "grad_norm": 1.8942211866378784, + "learning_rate": 4.99737006750716e-06, + "loss": 2.4701, + "step": 830 + }, + { + "epoch": 0.044581545064377684, + "grad_norm": 1.5602954626083374, + "learning_rate": 4.99735010956894e-06, + "loss": 2.3694, + "step": 831 + }, + { + "epoch": 0.04463519313304721, + "grad_norm": 0.8522498607635498, + "learning_rate": 4.997330076228814e-06, + "loss": 2.3173, + "step": 832 + }, + { + "epoch": 0.04468884120171674, + "grad_norm": 0.8480938673019409, + "learning_rate": 4.997309967487386e-06, + "loss": 2.2045, + "step": 833 + }, + { + "epoch": 0.044742489270386265, + "grad_norm": 1.0573030710220337, + "learning_rate": 4.997289783345264e-06, + "loss": 2.2847, + "step": 834 + }, + { + "epoch": 0.0447961373390558, + "grad_norm": 0.7354035377502441, + "learning_rate": 4.997269523803058e-06, + "loss": 2.1653, + "step": 835 + }, + { + "epoch": 0.04484978540772532, + "grad_norm": 0.9692280888557434, + "learning_rate": 4.9972491888613795e-06, + "loss": 2.3415, + "step": 836 + }, + { + "epoch": 0.04490343347639485, + "grad_norm": 1.0519038438796997, + "learning_rate": 4.997228778520842e-06, + "loss": 2.3051, + "step": 837 + }, + { + "epoch": 0.04495708154506438, + "grad_norm": 0.989500880241394, + "learning_rate": 4.9972082927820615e-06, + "loss": 2.3875, + "step": 838 + }, + { + "epoch": 0.0450107296137339, + "grad_norm": 1.0554471015930176, + "learning_rate": 4.9971877316456575e-06, + "loss": 2.2299, + "step": 839 + }, + { + "epoch": 0.045064377682403435, + "grad_norm": 1.1355721950531006, + "learning_rate": 4.99716709511225e-06, + "loss": 2.7322, + "step": 840 + }, + { + "epoch": 0.04511802575107296, + "grad_norm": 0.9610007405281067, + "learning_rate": 4.997146383182462e-06, + "loss": 2.5791, + "step": 841 + }, + { + "epoch": 0.04517167381974249, + "grad_norm": 0.9329342246055603, + "learning_rate": 4.99712559585692e-06, + "loss": 2.2301, + "step": 842 + }, + { + "epoch": 0.045225321888412016, + "grad_norm": 0.9036456942558289, + "learning_rate": 4.99710473313625e-06, + "loss": 2.4383, + "step": 843 + }, + { + "epoch": 0.04527896995708155, + "grad_norm": 0.9212629795074463, + "learning_rate": 4.997083795021083e-06, + "loss": 2.264, + "step": 844 + }, + { + "epoch": 0.04533261802575107, + "grad_norm": 1.1060104370117188, + "learning_rate": 4.997062781512051e-06, + "loss": 2.3024, + "step": 845 + }, + { + "epoch": 0.0453862660944206, + "grad_norm": 0.8168416023254395, + "learning_rate": 4.997041692609789e-06, + "loss": 2.0985, + "step": 846 + }, + { + "epoch": 0.04543991416309013, + "grad_norm": 0.9614957571029663, + "learning_rate": 4.997020528314932e-06, + "loss": 2.3652, + "step": 847 + }, + { + "epoch": 0.045493562231759654, + "grad_norm": 0.9241411089897156, + "learning_rate": 4.9969992886281195e-06, + "loss": 2.2806, + "step": 848 + }, + { + "epoch": 0.045547210300429186, + "grad_norm": 1.316644310951233, + "learning_rate": 4.996977973549995e-06, + "loss": 2.1456, + "step": 849 + }, + { + "epoch": 0.04560085836909871, + "grad_norm": 0.8849570155143738, + "learning_rate": 4.9969565830811995e-06, + "loss": 2.2959, + "step": 850 + }, + { + "epoch": 0.04565450643776824, + "grad_norm": 0.8720526099205017, + "learning_rate": 4.99693511722238e-06, + "loss": 2.1651, + "step": 851 + }, + { + "epoch": 0.04570815450643777, + "grad_norm": 0.9494382739067078, + "learning_rate": 4.996913575974184e-06, + "loss": 2.3033, + "step": 852 + }, + { + "epoch": 0.0457618025751073, + "grad_norm": 0.9361014366149902, + "learning_rate": 4.996891959337263e-06, + "loss": 1.6821, + "step": 853 + }, + { + "epoch": 0.045815450643776824, + "grad_norm": 0.9310194253921509, + "learning_rate": 4.996870267312268e-06, + "loss": 2.3073, + "step": 854 + }, + { + "epoch": 0.04586909871244635, + "grad_norm": 1.0022679567337036, + "learning_rate": 4.996848499899856e-06, + "loss": 2.2257, + "step": 855 + }, + { + "epoch": 0.04592274678111588, + "grad_norm": 0.8023546934127808, + "learning_rate": 4.9968266571006815e-06, + "loss": 2.0871, + "step": 856 + }, + { + "epoch": 0.045976394849785406, + "grad_norm": 0.8610460758209229, + "learning_rate": 4.996804738915407e-06, + "loss": 2.4909, + "step": 857 + }, + { + "epoch": 0.04603004291845494, + "grad_norm": 0.8212977051734924, + "learning_rate": 4.996782745344693e-06, + "loss": 2.3139, + "step": 858 + }, + { + "epoch": 0.04608369098712446, + "grad_norm": 0.9452611207962036, + "learning_rate": 4.9967606763892026e-06, + "loss": 2.2239, + "step": 859 + }, + { + "epoch": 0.046137339055793994, + "grad_norm": 0.9727702140808105, + "learning_rate": 4.9967385320496035e-06, + "loss": 2.2971, + "step": 860 + }, + { + "epoch": 0.04619098712446352, + "grad_norm": 0.9022993445396423, + "learning_rate": 4.996716312326562e-06, + "loss": 2.4018, + "step": 861 + }, + { + "epoch": 0.04624463519313305, + "grad_norm": 1.4464820623397827, + "learning_rate": 4.996694017220753e-06, + "loss": 2.3533, + "step": 862 + }, + { + "epoch": 0.046298283261802575, + "grad_norm": 0.8852490186691284, + "learning_rate": 4.996671646732846e-06, + "loss": 2.3145, + "step": 863 + }, + { + "epoch": 0.0463519313304721, + "grad_norm": 0.8469889760017395, + "learning_rate": 4.996649200863518e-06, + "loss": 2.3308, + "step": 864 + }, + { + "epoch": 0.04640557939914163, + "grad_norm": 1.1048692464828491, + "learning_rate": 4.996626679613446e-06, + "loss": 2.2454, + "step": 865 + }, + { + "epoch": 0.04645922746781116, + "grad_norm": 0.9867213368415833, + "learning_rate": 4.996604082983312e-06, + "loss": 2.4458, + "step": 866 + }, + { + "epoch": 0.04651287553648069, + "grad_norm": 1.1414556503295898, + "learning_rate": 4.9965814109737955e-06, + "loss": 2.6875, + "step": 867 + }, + { + "epoch": 0.04656652360515021, + "grad_norm": 1.112567663192749, + "learning_rate": 4.996558663585583e-06, + "loss": 2.4132, + "step": 868 + }, + { + "epoch": 0.046620171673819745, + "grad_norm": 0.8955932855606079, + "learning_rate": 4.9965358408193595e-06, + "loss": 2.0656, + "step": 869 + }, + { + "epoch": 0.04667381974248927, + "grad_norm": 0.8922957181930542, + "learning_rate": 4.996512942675816e-06, + "loss": 2.5367, + "step": 870 + }, + { + "epoch": 0.0467274678111588, + "grad_norm": 0.9885995984077454, + "learning_rate": 4.996489969155644e-06, + "loss": 2.4662, + "step": 871 + }, + { + "epoch": 0.046781115879828326, + "grad_norm": 0.8768265843391418, + "learning_rate": 4.996466920259534e-06, + "loss": 2.2089, + "step": 872 + }, + { + "epoch": 0.04683476394849785, + "grad_norm": 1.0764960050582886, + "learning_rate": 4.996443795988185e-06, + "loss": 2.2871, + "step": 873 + }, + { + "epoch": 0.04688841201716738, + "grad_norm": 0.8354368209838867, + "learning_rate": 4.996420596342294e-06, + "loss": 2.2896, + "step": 874 + }, + { + "epoch": 0.04694206008583691, + "grad_norm": 1.151351809501648, + "learning_rate": 4.996397321322561e-06, + "loss": 2.578, + "step": 875 + }, + { + "epoch": 0.04699570815450644, + "grad_norm": 0.9126418232917786, + "learning_rate": 4.996373970929691e-06, + "loss": 2.6343, + "step": 876 + }, + { + "epoch": 0.047049356223175964, + "grad_norm": 0.8948343396186829, + "learning_rate": 4.996350545164387e-06, + "loss": 2.1238, + "step": 877 + }, + { + "epoch": 0.047103004291845496, + "grad_norm": 0.8587777018547058, + "learning_rate": 4.996327044027356e-06, + "loss": 2.357, + "step": 878 + }, + { + "epoch": 0.04715665236051502, + "grad_norm": 1.0722131729125977, + "learning_rate": 4.996303467519309e-06, + "loss": 2.4987, + "step": 879 + }, + { + "epoch": 0.04721030042918455, + "grad_norm": 0.9111208319664001, + "learning_rate": 4.996279815640957e-06, + "loss": 2.1724, + "step": 880 + }, + { + "epoch": 0.04726394849785408, + "grad_norm": 0.9679865837097168, + "learning_rate": 4.996256088393013e-06, + "loss": 2.2889, + "step": 881 + }, + { + "epoch": 0.0473175965665236, + "grad_norm": 3.3809311389923096, + "learning_rate": 4.996232285776195e-06, + "loss": 2.5273, + "step": 882 + }, + { + "epoch": 0.047371244635193134, + "grad_norm": 5.054480075836182, + "learning_rate": 4.996208407791223e-06, + "loss": 2.34, + "step": 883 + }, + { + "epoch": 0.04742489270386266, + "grad_norm": 0.9699863791465759, + "learning_rate": 4.996184454438815e-06, + "loss": 2.342, + "step": 884 + }, + { + "epoch": 0.04747854077253219, + "grad_norm": 0.8573706150054932, + "learning_rate": 4.996160425719696e-06, + "loss": 2.3838, + "step": 885 + }, + { + "epoch": 0.047532188841201715, + "grad_norm": 0.9323153495788574, + "learning_rate": 4.9961363216345906e-06, + "loss": 2.4422, + "step": 886 + }, + { + "epoch": 0.04758583690987125, + "grad_norm": 0.9933872222900391, + "learning_rate": 4.996112142184227e-06, + "loss": 2.3562, + "step": 887 + }, + { + "epoch": 0.04763948497854077, + "grad_norm": 2.8429462909698486, + "learning_rate": 4.996087887369335e-06, + "loss": 2.1378, + "step": 888 + }, + { + "epoch": 0.047693133047210304, + "grad_norm": 0.882140040397644, + "learning_rate": 4.996063557190647e-06, + "loss": 2.1772, + "step": 889 + }, + { + "epoch": 0.04774678111587983, + "grad_norm": 0.9161127209663391, + "learning_rate": 4.996039151648898e-06, + "loss": 2.2012, + "step": 890 + }, + { + "epoch": 0.04780042918454935, + "grad_norm": 0.8706679940223694, + "learning_rate": 4.996014670744824e-06, + "loss": 2.1204, + "step": 891 + }, + { + "epoch": 0.047854077253218885, + "grad_norm": 0.9027878046035767, + "learning_rate": 4.995990114479165e-06, + "loss": 2.1609, + "step": 892 + }, + { + "epoch": 0.04790772532188841, + "grad_norm": 1.4998260736465454, + "learning_rate": 4.995965482852662e-06, + "loss": 2.5109, + "step": 893 + }, + { + "epoch": 0.04796137339055794, + "grad_norm": 0.9449037313461304, + "learning_rate": 4.995940775866059e-06, + "loss": 2.373, + "step": 894 + }, + { + "epoch": 0.04801502145922747, + "grad_norm": 1.0213640928268433, + "learning_rate": 4.9959159935201026e-06, + "loss": 2.3769, + "step": 895 + }, + { + "epoch": 0.048068669527897, + "grad_norm": 0.8521072268486023, + "learning_rate": 4.995891135815539e-06, + "loss": 2.2695, + "step": 896 + }, + { + "epoch": 0.04812231759656652, + "grad_norm": 1.093001127243042, + "learning_rate": 4.9958662027531215e-06, + "loss": 2.3793, + "step": 897 + }, + { + "epoch": 0.048175965665236055, + "grad_norm": 0.8218960762023926, + "learning_rate": 4.9958411943335995e-06, + "loss": 2.368, + "step": 898 + }, + { + "epoch": 0.04822961373390558, + "grad_norm": 0.7762258648872375, + "learning_rate": 4.99581611055773e-06, + "loss": 2.0708, + "step": 899 + }, + { + "epoch": 0.048283261802575105, + "grad_norm": 1.0556535720825195, + "learning_rate": 4.995790951426272e-06, + "loss": 2.3648, + "step": 900 + }, + { + "epoch": 0.048336909871244636, + "grad_norm": 0.8436540961265564, + "learning_rate": 4.995765716939982e-06, + "loss": 2.1907, + "step": 901 + }, + { + "epoch": 0.04839055793991416, + "grad_norm": 1.234887719154358, + "learning_rate": 4.995740407099624e-06, + "loss": 2.115, + "step": 902 + }, + { + "epoch": 0.04844420600858369, + "grad_norm": 3.1939470767974854, + "learning_rate": 4.9957150219059615e-06, + "loss": 2.2276, + "step": 903 + }, + { + "epoch": 0.04849785407725322, + "grad_norm": 0.9288572072982788, + "learning_rate": 4.99568956135976e-06, + "loss": 2.368, + "step": 904 + }, + { + "epoch": 0.04855150214592275, + "grad_norm": 1.2513632774353027, + "learning_rate": 4.99566402546179e-06, + "loss": 2.4524, + "step": 905 + }, + { + "epoch": 0.048605150214592274, + "grad_norm": 0.9631871581077576, + "learning_rate": 4.995638414212821e-06, + "loss": 2.4557, + "step": 906 + }, + { + "epoch": 0.048658798283261806, + "grad_norm": 1.0282517671585083, + "learning_rate": 4.995612727613628e-06, + "loss": 2.4711, + "step": 907 + }, + { + "epoch": 0.04871244635193133, + "grad_norm": 0.9872168302536011, + "learning_rate": 4.9955869656649845e-06, + "loss": 2.3447, + "step": 908 + }, + { + "epoch": 0.048766094420600856, + "grad_norm": 0.8003728985786438, + "learning_rate": 4.9955611283676696e-06, + "loss": 2.1906, + "step": 909 + }, + { + "epoch": 0.04881974248927039, + "grad_norm": 0.8998580574989319, + "learning_rate": 4.9955352157224634e-06, + "loss": 2.4697, + "step": 910 + }, + { + "epoch": 0.04887339055793991, + "grad_norm": 0.8387194275856018, + "learning_rate": 4.995509227730148e-06, + "loss": 2.2342, + "step": 911 + }, + { + "epoch": 0.048927038626609444, + "grad_norm": 1.3213353157043457, + "learning_rate": 4.995483164391507e-06, + "loss": 2.3268, + "step": 912 + }, + { + "epoch": 0.04898068669527897, + "grad_norm": 0.9710744023323059, + "learning_rate": 4.995457025707329e-06, + "loss": 2.3417, + "step": 913 + }, + { + "epoch": 0.0490343347639485, + "grad_norm": 0.8901402354240417, + "learning_rate": 4.995430811678403e-06, + "loss": 2.2967, + "step": 914 + }, + { + "epoch": 0.049087982832618025, + "grad_norm": 0.836480975151062, + "learning_rate": 4.9954045223055194e-06, + "loss": 2.5677, + "step": 915 + }, + { + "epoch": 0.04914163090128755, + "grad_norm": 0.8585495352745056, + "learning_rate": 4.995378157589473e-06, + "loss": 2.2087, + "step": 916 + }, + { + "epoch": 0.04919527896995708, + "grad_norm": 0.9633775949478149, + "learning_rate": 4.995351717531059e-06, + "loss": 2.2611, + "step": 917 + }, + { + "epoch": 0.04924892703862661, + "grad_norm": 1.202440619468689, + "learning_rate": 4.995325202131076e-06, + "loss": 2.1993, + "step": 918 + }, + { + "epoch": 0.04930257510729614, + "grad_norm": 0.9721028208732605, + "learning_rate": 4.995298611390325e-06, + "loss": 2.0807, + "step": 919 + }, + { + "epoch": 0.04935622317596566, + "grad_norm": 0.9465093612670898, + "learning_rate": 4.995271945309609e-06, + "loss": 1.4496, + "step": 920 + }, + { + "epoch": 0.049409871244635195, + "grad_norm": 0.9208647012710571, + "learning_rate": 4.995245203889732e-06, + "loss": 2.1962, + "step": 921 + }, + { + "epoch": 0.04946351931330472, + "grad_norm": 1.8951324224472046, + "learning_rate": 4.995218387131502e-06, + "loss": 1.8659, + "step": 922 + }, + { + "epoch": 0.04951716738197425, + "grad_norm": 1.0991795063018799, + "learning_rate": 4.995191495035728e-06, + "loss": 2.3649, + "step": 923 + }, + { + "epoch": 0.049570815450643776, + "grad_norm": 0.9312379360198975, + "learning_rate": 4.995164527603224e-06, + "loss": 2.4272, + "step": 924 + }, + { + "epoch": 0.0496244635193133, + "grad_norm": 0.9193178415298462, + "learning_rate": 4.9951374848348025e-06, + "loss": 2.0204, + "step": 925 + }, + { + "epoch": 0.04967811158798283, + "grad_norm": 0.9467421174049377, + "learning_rate": 4.9951103667312795e-06, + "loss": 2.3051, + "step": 926 + }, + { + "epoch": 0.04973175965665236, + "grad_norm": 0.9656423926353455, + "learning_rate": 4.995083173293475e-06, + "loss": 2.4439, + "step": 927 + }, + { + "epoch": 0.04978540772532189, + "grad_norm": 0.9064692258834839, + "learning_rate": 4.995055904522211e-06, + "loss": 2.148, + "step": 928 + }, + { + "epoch": 0.049839055793991414, + "grad_norm": 0.9935110211372375, + "learning_rate": 4.995028560418308e-06, + "loss": 2.1648, + "step": 929 + }, + { + "epoch": 0.049892703862660946, + "grad_norm": 0.9635987281799316, + "learning_rate": 4.995001140982594e-06, + "loss": 2.0844, + "step": 930 + }, + { + "epoch": 0.04994635193133047, + "grad_norm": 1.7002750635147095, + "learning_rate": 4.994973646215895e-06, + "loss": 2.3084, + "step": 931 + }, + { + "epoch": 0.05, + "grad_norm": 1.072045922279358, + "learning_rate": 4.994946076119043e-06, + "loss": 2.3338, + "step": 932 + }, + { + "epoch": 0.05005364806866953, + "grad_norm": 0.8942899703979492, + "learning_rate": 4.994918430692869e-06, + "loss": 2.3021, + "step": 933 + }, + { + "epoch": 0.05010729613733905, + "grad_norm": 0.9237831234931946, + "learning_rate": 4.994890709938208e-06, + "loss": 1.9531, + "step": 934 + }, + { + "epoch": 0.050160944206008584, + "grad_norm": 0.9621050953865051, + "learning_rate": 4.994862913855898e-06, + "loss": 2.4625, + "step": 935 + }, + { + "epoch": 0.05021459227467811, + "grad_norm": 0.9398587346076965, + "learning_rate": 4.9948350424467776e-06, + "loss": 2.3661, + "step": 936 + }, + { + "epoch": 0.05026824034334764, + "grad_norm": 0.9487491250038147, + "learning_rate": 4.9948070957116864e-06, + "loss": 2.2331, + "step": 937 + }, + { + "epoch": 0.050321888412017166, + "grad_norm": 1.0199170112609863, + "learning_rate": 4.9947790736514715e-06, + "loss": 2.3274, + "step": 938 + }, + { + "epoch": 0.0503755364806867, + "grad_norm": 1.0984852313995361, + "learning_rate": 4.994750976266977e-06, + "loss": 1.4854, + "step": 939 + }, + { + "epoch": 0.05042918454935622, + "grad_norm": 0.8496457934379578, + "learning_rate": 4.9947228035590515e-06, + "loss": 2.0712, + "step": 940 + }, + { + "epoch": 0.050482832618025754, + "grad_norm": 0.9884358048439026, + "learning_rate": 4.9946945555285465e-06, + "loss": 2.5152, + "step": 941 + }, + { + "epoch": 0.05053648068669528, + "grad_norm": 0.933678388595581, + "learning_rate": 4.9946662321763126e-06, + "loss": 2.2755, + "step": 942 + }, + { + "epoch": 0.050590128755364804, + "grad_norm": 0.9292495846748352, + "learning_rate": 4.994637833503208e-06, + "loss": 2.5286, + "step": 943 + }, + { + "epoch": 0.050643776824034335, + "grad_norm": 2.4656083583831787, + "learning_rate": 4.994609359510088e-06, + "loss": 2.3116, + "step": 944 + }, + { + "epoch": 0.05069742489270386, + "grad_norm": 1.0257818698883057, + "learning_rate": 4.994580810197813e-06, + "loss": 2.4318, + "step": 945 + }, + { + "epoch": 0.05075107296137339, + "grad_norm": 0.9891431331634521, + "learning_rate": 4.9945521855672445e-06, + "loss": 1.9486, + "step": 946 + }, + { + "epoch": 0.05080472103004292, + "grad_norm": 0.8486548066139221, + "learning_rate": 4.994523485619248e-06, + "loss": 2.1794, + "step": 947 + }, + { + "epoch": 0.05085836909871245, + "grad_norm": 0.8849818706512451, + "learning_rate": 4.994494710354688e-06, + "loss": 2.3697, + "step": 948 + }, + { + "epoch": 0.05091201716738197, + "grad_norm": 1.0209705829620361, + "learning_rate": 4.994465859774436e-06, + "loss": 2.3055, + "step": 949 + }, + { + "epoch": 0.050965665236051505, + "grad_norm": 0.9417099356651306, + "learning_rate": 4.99443693387936e-06, + "loss": 2.4874, + "step": 950 + }, + { + "epoch": 0.05101931330472103, + "grad_norm": 1.1745842695236206, + "learning_rate": 4.994407932670336e-06, + "loss": 2.3623, + "step": 951 + }, + { + "epoch": 0.051072961373390555, + "grad_norm": 0.8378251791000366, + "learning_rate": 4.994378856148238e-06, + "loss": 2.1624, + "step": 952 + }, + { + "epoch": 0.051126609442060086, + "grad_norm": 0.9222257733345032, + "learning_rate": 4.994349704313945e-06, + "loss": 2.0363, + "step": 953 + }, + { + "epoch": 0.05118025751072961, + "grad_norm": 1.2677849531173706, + "learning_rate": 4.994320477168336e-06, + "loss": 2.417, + "step": 954 + }, + { + "epoch": 0.05123390557939914, + "grad_norm": 1.0328000783920288, + "learning_rate": 4.994291174712295e-06, + "loss": 2.302, + "step": 955 + }, + { + "epoch": 0.05128755364806867, + "grad_norm": 1.087328553199768, + "learning_rate": 4.994261796946705e-06, + "loss": 2.3318, + "step": 956 + }, + { + "epoch": 0.0513412017167382, + "grad_norm": 0.939976692199707, + "learning_rate": 4.994232343872454e-06, + "loss": 2.2508, + "step": 957 + }, + { + "epoch": 0.051394849785407724, + "grad_norm": 0.8842934966087341, + "learning_rate": 4.994202815490431e-06, + "loss": 2.2574, + "step": 958 + }, + { + "epoch": 0.051448497854077256, + "grad_norm": 1.062280297279358, + "learning_rate": 4.994173211801528e-06, + "loss": 2.4438, + "step": 959 + }, + { + "epoch": 0.05150214592274678, + "grad_norm": 0.9384779334068298, + "learning_rate": 4.994143532806638e-06, + "loss": 2.3252, + "step": 960 + }, + { + "epoch": 0.051555793991416306, + "grad_norm": 0.9565473198890686, + "learning_rate": 4.994113778506658e-06, + "loss": 2.3258, + "step": 961 + }, + { + "epoch": 0.05160944206008584, + "grad_norm": 0.8754107356071472, + "learning_rate": 4.994083948902486e-06, + "loss": 2.493, + "step": 962 + }, + { + "epoch": 0.05166309012875536, + "grad_norm": 0.7576799392700195, + "learning_rate": 4.994054043995022e-06, + "loss": 2.0661, + "step": 963 + }, + { + "epoch": 0.051716738197424894, + "grad_norm": 0.798824667930603, + "learning_rate": 4.99402406378517e-06, + "loss": 2.3011, + "step": 964 + }, + { + "epoch": 0.05177038626609442, + "grad_norm": 0.9284719824790955, + "learning_rate": 4.993994008273833e-06, + "loss": 2.2128, + "step": 965 + }, + { + "epoch": 0.05182403433476395, + "grad_norm": 0.9260801672935486, + "learning_rate": 4.993963877461922e-06, + "loss": 2.3133, + "step": 966 + }, + { + "epoch": 0.051877682403433475, + "grad_norm": 0.8682747483253479, + "learning_rate": 4.993933671350344e-06, + "loss": 2.2938, + "step": 967 + }, + { + "epoch": 0.05193133047210301, + "grad_norm": 1.0938524007797241, + "learning_rate": 4.993903389940013e-06, + "loss": 2.1396, + "step": 968 + }, + { + "epoch": 0.05198497854077253, + "grad_norm": 0.8743252754211426, + "learning_rate": 4.99387303323184e-06, + "loss": 2.2928, + "step": 969 + }, + { + "epoch": 0.05203862660944206, + "grad_norm": 0.9667371511459351, + "learning_rate": 4.993842601226745e-06, + "loss": 2.0934, + "step": 970 + }, + { + "epoch": 0.05209227467811159, + "grad_norm": 0.9245797991752625, + "learning_rate": 4.993812093925645e-06, + "loss": 2.1062, + "step": 971 + }, + { + "epoch": 0.05214592274678111, + "grad_norm": 0.9028712511062622, + "learning_rate": 4.993781511329462e-06, + "loss": 2.2794, + "step": 972 + }, + { + "epoch": 0.052199570815450645, + "grad_norm": 0.9455599784851074, + "learning_rate": 4.993750853439119e-06, + "loss": 2.1582, + "step": 973 + }, + { + "epoch": 0.05225321888412017, + "grad_norm": 1.1080763339996338, + "learning_rate": 4.993720120255541e-06, + "loss": 2.4664, + "step": 974 + }, + { + "epoch": 0.0523068669527897, + "grad_norm": 0.9499462842941284, + "learning_rate": 4.993689311779657e-06, + "loss": 1.9394, + "step": 975 + }, + { + "epoch": 0.05236051502145923, + "grad_norm": 0.9132592082023621, + "learning_rate": 4.993658428012397e-06, + "loss": 2.1273, + "step": 976 + }, + { + "epoch": 0.05241416309012876, + "grad_norm": 0.9829801917076111, + "learning_rate": 4.993627468954692e-06, + "loss": 2.3914, + "step": 977 + }, + { + "epoch": 0.05246781115879828, + "grad_norm": 0.8828199505805969, + "learning_rate": 4.993596434607479e-06, + "loss": 2.2594, + "step": 978 + }, + { + "epoch": 0.05252145922746781, + "grad_norm": 0.8871847987174988, + "learning_rate": 4.993565324971693e-06, + "loss": 2.2615, + "step": 979 + }, + { + "epoch": 0.05257510729613734, + "grad_norm": 0.9502004981040955, + "learning_rate": 4.993534140048275e-06, + "loss": 2.4219, + "step": 980 + }, + { + "epoch": 0.052628755364806865, + "grad_norm": 0.8645646572113037, + "learning_rate": 4.993502879838166e-06, + "loss": 2.1623, + "step": 981 + }, + { + "epoch": 0.052682403433476396, + "grad_norm": 0.973790168762207, + "learning_rate": 4.9934715443423096e-06, + "loss": 2.1278, + "step": 982 + }, + { + "epoch": 0.05273605150214592, + "grad_norm": 1.0120927095413208, + "learning_rate": 4.993440133561651e-06, + "loss": 2.1849, + "step": 983 + }, + { + "epoch": 0.05278969957081545, + "grad_norm": 1.1030818223953247, + "learning_rate": 4.99340864749714e-06, + "loss": 2.2806, + "step": 984 + }, + { + "epoch": 0.05284334763948498, + "grad_norm": 0.9871490001678467, + "learning_rate": 4.993377086149726e-06, + "loss": 2.4908, + "step": 985 + }, + { + "epoch": 0.05289699570815451, + "grad_norm": 0.9324192404747009, + "learning_rate": 4.993345449520365e-06, + "loss": 2.2333, + "step": 986 + }, + { + "epoch": 0.052950643776824034, + "grad_norm": 1.0024852752685547, + "learning_rate": 4.993313737610008e-06, + "loss": 2.2568, + "step": 987 + }, + { + "epoch": 0.05300429184549356, + "grad_norm": 0.883892834186554, + "learning_rate": 4.993281950419614e-06, + "loss": 1.8877, + "step": 988 + }, + { + "epoch": 0.05305793991416309, + "grad_norm": 1.057392954826355, + "learning_rate": 4.993250087950145e-06, + "loss": 2.6095, + "step": 989 + }, + { + "epoch": 0.053111587982832616, + "grad_norm": 1.0020055770874023, + "learning_rate": 4.993218150202559e-06, + "loss": 2.3811, + "step": 990 + }, + { + "epoch": 0.05316523605150215, + "grad_norm": 0.98125159740448, + "learning_rate": 4.9931861371778235e-06, + "loss": 2.4358, + "step": 991 + }, + { + "epoch": 0.05321888412017167, + "grad_norm": 1.0553079843521118, + "learning_rate": 4.993154048876904e-06, + "loss": 2.4128, + "step": 992 + }, + { + "epoch": 0.053272532188841204, + "grad_norm": 1.234278917312622, + "learning_rate": 4.993121885300769e-06, + "loss": 2.2179, + "step": 993 + }, + { + "epoch": 0.05332618025751073, + "grad_norm": 0.9815464019775391, + "learning_rate": 4.99308964645039e-06, + "loss": 2.328, + "step": 994 + }, + { + "epoch": 0.05337982832618026, + "grad_norm": 1.090070366859436, + "learning_rate": 4.99305733232674e-06, + "loss": 2.2669, + "step": 995 + }, + { + "epoch": 0.053433476394849785, + "grad_norm": 0.9559518694877625, + "learning_rate": 4.993024942930794e-06, + "loss": 2.379, + "step": 996 + }, + { + "epoch": 0.05348712446351931, + "grad_norm": 1.1252349615097046, + "learning_rate": 4.992992478263533e-06, + "loss": 2.3017, + "step": 997 + }, + { + "epoch": 0.05354077253218884, + "grad_norm": 1.450621247291565, + "learning_rate": 4.992959938325933e-06, + "loss": 2.4054, + "step": 998 + }, + { + "epoch": 0.05359442060085837, + "grad_norm": 0.9781585931777954, + "learning_rate": 4.99292732311898e-06, + "loss": 2.5405, + "step": 999 + }, + { + "epoch": 0.0536480686695279, + "grad_norm": 0.8734198212623596, + "learning_rate": 4.992894632643657e-06, + "loss": 2.2508, + "step": 1000 + }, + { + "epoch": 0.05370171673819742, + "grad_norm": 0.9765375852584839, + "learning_rate": 4.992861866900951e-06, + "loss": 2.2035, + "step": 1001 + }, + { + "epoch": 0.053755364806866955, + "grad_norm": 0.941493809223175, + "learning_rate": 4.992829025891851e-06, + "loss": 2.3465, + "step": 1002 + }, + { + "epoch": 0.05380901287553648, + "grad_norm": 0.83364337682724, + "learning_rate": 4.99279610961735e-06, + "loss": 2.2142, + "step": 1003 + }, + { + "epoch": 0.05386266094420601, + "grad_norm": 0.9412592649459839, + "learning_rate": 4.9927631180784405e-06, + "loss": 2.2616, + "step": 1004 + }, + { + "epoch": 0.053916309012875537, + "grad_norm": 1.1956979036331177, + "learning_rate": 4.992730051276119e-06, + "loss": 2.4217, + "step": 1005 + }, + { + "epoch": 0.05396995708154506, + "grad_norm": 0.8592537045478821, + "learning_rate": 4.992696909211384e-06, + "loss": 2.1793, + "step": 1006 + }, + { + "epoch": 0.05402360515021459, + "grad_norm": 0.9625216126441956, + "learning_rate": 4.992663691885237e-06, + "loss": 2.0678, + "step": 1007 + }, + { + "epoch": 0.05407725321888412, + "grad_norm": 0.8811043500900269, + "learning_rate": 4.992630399298679e-06, + "loss": 2.2504, + "step": 1008 + }, + { + "epoch": 0.05413090128755365, + "grad_norm": 0.8211297392845154, + "learning_rate": 4.9925970314527164e-06, + "loss": 2.365, + "step": 1009 + }, + { + "epoch": 0.054184549356223174, + "grad_norm": 1.2517651319503784, + "learning_rate": 4.992563588348357e-06, + "loss": 2.2074, + "step": 1010 + }, + { + "epoch": 0.054238197424892706, + "grad_norm": 4.096930027008057, + "learning_rate": 4.992530069986609e-06, + "loss": 2.4019, + "step": 1011 + }, + { + "epoch": 0.05429184549356223, + "grad_norm": 0.8329586982727051, + "learning_rate": 4.992496476368486e-06, + "loss": 2.2887, + "step": 1012 + }, + { + "epoch": 0.05434549356223176, + "grad_norm": 1.034111499786377, + "learning_rate": 4.992462807495002e-06, + "loss": 2.5454, + "step": 1013 + }, + { + "epoch": 0.05439914163090129, + "grad_norm": 0.8501675128936768, + "learning_rate": 4.992429063367173e-06, + "loss": 2.068, + "step": 1014 + }, + { + "epoch": 0.05445278969957081, + "grad_norm": 1.1050831079483032, + "learning_rate": 4.992395243986018e-06, + "loss": 2.4044, + "step": 1015 + }, + { + "epoch": 0.054506437768240344, + "grad_norm": 1.5501147508621216, + "learning_rate": 4.992361349352558e-06, + "loss": 2.1468, + "step": 1016 + }, + { + "epoch": 0.05456008583690987, + "grad_norm": 1.1415302753448486, + "learning_rate": 4.992327379467817e-06, + "loss": 2.3387, + "step": 1017 + }, + { + "epoch": 0.0546137339055794, + "grad_norm": 0.968172013759613, + "learning_rate": 4.992293334332821e-06, + "loss": 2.1255, + "step": 1018 + }, + { + "epoch": 0.054667381974248926, + "grad_norm": 0.9503417611122131, + "learning_rate": 4.992259213948596e-06, + "loss": 2.2578, + "step": 1019 + }, + { + "epoch": 0.05472103004291846, + "grad_norm": 0.9569436311721802, + "learning_rate": 4.992225018316173e-06, + "loss": 2.4178, + "step": 1020 + }, + { + "epoch": 0.05477467811158798, + "grad_norm": 0.8677228689193726, + "learning_rate": 4.992190747436585e-06, + "loss": 2.4738, + "step": 1021 + }, + { + "epoch": 0.054828326180257514, + "grad_norm": 0.978649914264679, + "learning_rate": 4.992156401310867e-06, + "loss": 2.4001, + "step": 1022 + }, + { + "epoch": 0.05488197424892704, + "grad_norm": 1.0024888515472412, + "learning_rate": 4.992121979940055e-06, + "loss": 2.3634, + "step": 1023 + }, + { + "epoch": 0.054935622317596564, + "grad_norm": 0.9719534516334534, + "learning_rate": 4.99208748332519e-06, + "loss": 2.2777, + "step": 1024 + }, + { + "epoch": 0.054989270386266095, + "grad_norm": 1.948480486869812, + "learning_rate": 4.9920529114673115e-06, + "loss": 2.5464, + "step": 1025 + }, + { + "epoch": 0.05504291845493562, + "grad_norm": 1.2344721555709839, + "learning_rate": 4.992018264367464e-06, + "loss": 2.5148, + "step": 1026 + }, + { + "epoch": 0.05509656652360515, + "grad_norm": 0.8931727409362793, + "learning_rate": 4.991983542026694e-06, + "loss": 2.2271, + "step": 1027 + }, + { + "epoch": 0.05515021459227468, + "grad_norm": 1.0193794965744019, + "learning_rate": 4.9919487444460495e-06, + "loss": 2.2034, + "step": 1028 + }, + { + "epoch": 0.05520386266094421, + "grad_norm": 1.0055073499679565, + "learning_rate": 4.991913871626581e-06, + "loss": 2.3504, + "step": 1029 + }, + { + "epoch": 0.05525751072961373, + "grad_norm": 1.1442534923553467, + "learning_rate": 4.991878923569342e-06, + "loss": 2.4353, + "step": 1030 + }, + { + "epoch": 0.055311158798283265, + "grad_norm": 0.9306530952453613, + "learning_rate": 4.991843900275388e-06, + "loss": 2.3237, + "step": 1031 + }, + { + "epoch": 0.05536480686695279, + "grad_norm": 0.9179642796516418, + "learning_rate": 4.991808801745775e-06, + "loss": 2.2958, + "step": 1032 + }, + { + "epoch": 0.055418454935622315, + "grad_norm": 1.267077088356018, + "learning_rate": 4.991773627981563e-06, + "loss": 2.2479, + "step": 1033 + }, + { + "epoch": 0.055472103004291846, + "grad_norm": 1.0775796175003052, + "learning_rate": 4.991738378983816e-06, + "loss": 2.5535, + "step": 1034 + }, + { + "epoch": 0.05552575107296137, + "grad_norm": 0.8064876794815063, + "learning_rate": 4.991703054753596e-06, + "loss": 2.2049, + "step": 1035 + }, + { + "epoch": 0.0555793991416309, + "grad_norm": 1.4190459251403809, + "learning_rate": 4.99166765529197e-06, + "loss": 2.1659, + "step": 1036 + }, + { + "epoch": 0.05563304721030043, + "grad_norm": 1.1851986646652222, + "learning_rate": 4.991632180600008e-06, + "loss": 2.4983, + "step": 1037 + }, + { + "epoch": 0.05568669527896996, + "grad_norm": 0.9262078404426575, + "learning_rate": 4.991596630678779e-06, + "loss": 2.2609, + "step": 1038 + }, + { + "epoch": 0.055740343347639484, + "grad_norm": 0.9049752354621887, + "learning_rate": 4.991561005529358e-06, + "loss": 2.3509, + "step": 1039 + }, + { + "epoch": 0.055793991416309016, + "grad_norm": 0.9508761763572693, + "learning_rate": 4.99152530515282e-06, + "loss": 2.316, + "step": 1040 + }, + { + "epoch": 0.05584763948497854, + "grad_norm": 2.8029944896698, + "learning_rate": 4.991489529550244e-06, + "loss": 2.5277, + "step": 1041 + }, + { + "epoch": 0.055901287553648066, + "grad_norm": 0.9777684211730957, + "learning_rate": 4.991453678722708e-06, + "loss": 2.2536, + "step": 1042 + }, + { + "epoch": 0.0559549356223176, + "grad_norm": 0.9265131950378418, + "learning_rate": 4.991417752671296e-06, + "loss": 2.2791, + "step": 1043 + }, + { + "epoch": 0.05600858369098712, + "grad_norm": 0.9356545805931091, + "learning_rate": 4.991381751397093e-06, + "loss": 2.325, + "step": 1044 + }, + { + "epoch": 0.056062231759656654, + "grad_norm": 1.0545375347137451, + "learning_rate": 4.991345674901185e-06, + "loss": 2.2737, + "step": 1045 + }, + { + "epoch": 0.05611587982832618, + "grad_norm": 1.0303090810775757, + "learning_rate": 4.991309523184661e-06, + "loss": 2.3858, + "step": 1046 + }, + { + "epoch": 0.05616952789699571, + "grad_norm": 1.0034732818603516, + "learning_rate": 4.991273296248614e-06, + "loss": 2.184, + "step": 1047 + }, + { + "epoch": 0.056223175965665236, + "grad_norm": 1.0190399885177612, + "learning_rate": 4.991236994094137e-06, + "loss": 2.479, + "step": 1048 + }, + { + "epoch": 0.05627682403433477, + "grad_norm": 0.9503771662712097, + "learning_rate": 4.9912006167223246e-06, + "loss": 2.2572, + "step": 1049 + }, + { + "epoch": 0.05633047210300429, + "grad_norm": 1.3302894830703735, + "learning_rate": 4.991164164134278e-06, + "loss": 2.0203, + "step": 1050 + }, + { + "epoch": 0.05638412017167382, + "grad_norm": 0.9612857103347778, + "learning_rate": 4.991127636331095e-06, + "loss": 2.4065, + "step": 1051 + }, + { + "epoch": 0.05643776824034335, + "grad_norm": 1.0276907682418823, + "learning_rate": 4.99109103331388e-06, + "loss": 2.2189, + "step": 1052 + }, + { + "epoch": 0.056491416309012873, + "grad_norm": 0.879024088382721, + "learning_rate": 4.9910543550837394e-06, + "loss": 2.0793, + "step": 1053 + }, + { + "epoch": 0.056545064377682405, + "grad_norm": 8.64822769165039, + "learning_rate": 4.991017601641777e-06, + "loss": 2.3557, + "step": 1054 + }, + { + "epoch": 0.05659871244635193, + "grad_norm": 1.0720820426940918, + "learning_rate": 4.990980772989107e-06, + "loss": 2.1938, + "step": 1055 + }, + { + "epoch": 0.05665236051502146, + "grad_norm": 1.3528854846954346, + "learning_rate": 4.990943869126838e-06, + "loss": 2.3726, + "step": 1056 + }, + { + "epoch": 0.05670600858369099, + "grad_norm": 0.9055727124214172, + "learning_rate": 4.990906890056084e-06, + "loss": 1.9999, + "step": 1057 + }, + { + "epoch": 0.05675965665236052, + "grad_norm": 0.9075988531112671, + "learning_rate": 4.990869835777964e-06, + "loss": 2.557, + "step": 1058 + }, + { + "epoch": 0.05681330472103004, + "grad_norm": 1.028501272201538, + "learning_rate": 4.990832706293596e-06, + "loss": 2.186, + "step": 1059 + }, + { + "epoch": 0.05686695278969957, + "grad_norm": 1.1125520467758179, + "learning_rate": 4.990795501604099e-06, + "loss": 2.4308, + "step": 1060 + }, + { + "epoch": 0.0569206008583691, + "grad_norm": 0.8248952031135559, + "learning_rate": 4.9907582217105985e-06, + "loss": 2.186, + "step": 1061 + }, + { + "epoch": 0.056974248927038625, + "grad_norm": 0.9849532842636108, + "learning_rate": 4.99072086661422e-06, + "loss": 2.3525, + "step": 1062 + }, + { + "epoch": 0.057027896995708156, + "grad_norm": 0.9275630116462708, + "learning_rate": 4.990683436316091e-06, + "loss": 2.2078, + "step": 1063 + }, + { + "epoch": 0.05708154506437768, + "grad_norm": 0.8727232813835144, + "learning_rate": 4.99064593081734e-06, + "loss": 2.1465, + "step": 1064 + }, + { + "epoch": 0.05713519313304721, + "grad_norm": 1.146936058998108, + "learning_rate": 4.9906083501191014e-06, + "loss": 2.4233, + "step": 1065 + }, + { + "epoch": 0.05718884120171674, + "grad_norm": 0.9624524116516113, + "learning_rate": 4.9905706942225095e-06, + "loss": 2.3894, + "step": 1066 + }, + { + "epoch": 0.05724248927038627, + "grad_norm": 1.0129231214523315, + "learning_rate": 4.990532963128701e-06, + "loss": 2.5176, + "step": 1067 + }, + { + "epoch": 0.057296137339055794, + "grad_norm": 1.0094066858291626, + "learning_rate": 4.990495156838815e-06, + "loss": 2.377, + "step": 1068 + }, + { + "epoch": 0.05734978540772532, + "grad_norm": 0.8860799074172974, + "learning_rate": 4.990457275353993e-06, + "loss": 2.2352, + "step": 1069 + }, + { + "epoch": 0.05740343347639485, + "grad_norm": 0.9399589896202087, + "learning_rate": 4.990419318675379e-06, + "loss": 2.1897, + "step": 1070 + }, + { + "epoch": 0.057457081545064376, + "grad_norm": 0.8149527907371521, + "learning_rate": 4.990381286804119e-06, + "loss": 2.0189, + "step": 1071 + }, + { + "epoch": 0.05751072961373391, + "grad_norm": 0.9436704516410828, + "learning_rate": 4.99034317974136e-06, + "loss": 2.0401, + "step": 1072 + }, + { + "epoch": 0.05756437768240343, + "grad_norm": 1.4298005104064941, + "learning_rate": 4.990304997488256e-06, + "loss": 2.4895, + "step": 1073 + }, + { + "epoch": 0.057618025751072964, + "grad_norm": 1.0987980365753174, + "learning_rate": 4.990266740045955e-06, + "loss": 1.6837, + "step": 1074 + }, + { + "epoch": 0.05767167381974249, + "grad_norm": 0.9572760462760925, + "learning_rate": 4.990228407415616e-06, + "loss": 2.2359, + "step": 1075 + }, + { + "epoch": 0.05772532188841202, + "grad_norm": 1.0550768375396729, + "learning_rate": 4.990189999598395e-06, + "loss": 2.1813, + "step": 1076 + }, + { + "epoch": 0.057778969957081545, + "grad_norm": 0.8116125464439392, + "learning_rate": 4.9901515165954514e-06, + "loss": 2.2302, + "step": 1077 + }, + { + "epoch": 0.05783261802575107, + "grad_norm": 0.8613325953483582, + "learning_rate": 4.990112958407947e-06, + "loss": 2.3111, + "step": 1078 + }, + { + "epoch": 0.0578862660944206, + "grad_norm": 1.0418342351913452, + "learning_rate": 4.9900743250370465e-06, + "loss": 2.3226, + "step": 1079 + }, + { + "epoch": 0.05793991416309013, + "grad_norm": 0.9930688142776489, + "learning_rate": 4.990035616483917e-06, + "loss": 2.4795, + "step": 1080 + }, + { + "epoch": 0.05799356223175966, + "grad_norm": 0.9675402641296387, + "learning_rate": 4.989996832749725e-06, + "loss": 2.1391, + "step": 1081 + }, + { + "epoch": 0.05804721030042918, + "grad_norm": 0.8295792937278748, + "learning_rate": 4.989957973835644e-06, + "loss": 2.1439, + "step": 1082 + }, + { + "epoch": 0.058100858369098715, + "grad_norm": 38.84608459472656, + "learning_rate": 4.989919039742846e-06, + "loss": 2.149, + "step": 1083 + }, + { + "epoch": 0.05815450643776824, + "grad_norm": 1.051656723022461, + "learning_rate": 4.9898800304725055e-06, + "loss": 2.2328, + "step": 1084 + }, + { + "epoch": 0.058208154506437765, + "grad_norm": 1.083837866783142, + "learning_rate": 4.9898409460258025e-06, + "loss": 2.3645, + "step": 1085 + }, + { + "epoch": 0.0582618025751073, + "grad_norm": 1.4535505771636963, + "learning_rate": 4.989801786403916e-06, + "loss": 2.4859, + "step": 1086 + }, + { + "epoch": 0.05831545064377682, + "grad_norm": 1.1062170267105103, + "learning_rate": 4.9897625516080285e-06, + "loss": 2.1956, + "step": 1087 + }, + { + "epoch": 0.05836909871244635, + "grad_norm": 0.9202542304992676, + "learning_rate": 4.989723241639324e-06, + "loss": 2.5209, + "step": 1088 + }, + { + "epoch": 0.05842274678111588, + "grad_norm": 0.9678305387496948, + "learning_rate": 4.989683856498989e-06, + "loss": 2.1728, + "step": 1089 + }, + { + "epoch": 0.05847639484978541, + "grad_norm": 0.9611050486564636, + "learning_rate": 4.989644396188214e-06, + "loss": 2.4742, + "step": 1090 + }, + { + "epoch": 0.058530042918454935, + "grad_norm": 0.9415873885154724, + "learning_rate": 4.9896048607081914e-06, + "loss": 2.2895, + "step": 1091 + }, + { + "epoch": 0.058583690987124466, + "grad_norm": 1.0929332971572876, + "learning_rate": 4.989565250060112e-06, + "loss": 2.0405, + "step": 1092 + }, + { + "epoch": 0.05863733905579399, + "grad_norm": 0.9018646478652954, + "learning_rate": 4.989525564245173e-06, + "loss": 2.5565, + "step": 1093 + }, + { + "epoch": 0.058690987124463516, + "grad_norm": 0.899653971195221, + "learning_rate": 4.989485803264575e-06, + "loss": 2.2524, + "step": 1094 + }, + { + "epoch": 0.05874463519313305, + "grad_norm": 1.0592292547225952, + "learning_rate": 4.989445967119514e-06, + "loss": 1.9871, + "step": 1095 + }, + { + "epoch": 0.05879828326180257, + "grad_norm": 0.9516764283180237, + "learning_rate": 4.9894060558111955e-06, + "loss": 2.4744, + "step": 1096 + }, + { + "epoch": 0.058851931330472104, + "grad_norm": 0.9141608476638794, + "learning_rate": 4.989366069340824e-06, + "loss": 2.5211, + "step": 1097 + }, + { + "epoch": 0.05890557939914163, + "grad_norm": 2.9615654945373535, + "learning_rate": 4.989326007709606e-06, + "loss": 2.4658, + "step": 1098 + }, + { + "epoch": 0.05895922746781116, + "grad_norm": 1.3978153467178345, + "learning_rate": 4.989285870918753e-06, + "loss": 2.2703, + "step": 1099 + }, + { + "epoch": 0.059012875536480686, + "grad_norm": 0.9596801400184631, + "learning_rate": 4.989245658969476e-06, + "loss": 2.5142, + "step": 1100 + }, + { + "epoch": 0.05906652360515022, + "grad_norm": 1.232269287109375, + "learning_rate": 4.989205371862988e-06, + "loss": 2.1275, + "step": 1101 + }, + { + "epoch": 0.05912017167381974, + "grad_norm": 1.0570298433303833, + "learning_rate": 4.9891650096005074e-06, + "loss": 2.4595, + "step": 1102 + }, + { + "epoch": 0.05917381974248927, + "grad_norm": 1.0608618259429932, + "learning_rate": 4.989124572183251e-06, + "loss": 1.9981, + "step": 1103 + }, + { + "epoch": 0.0592274678111588, + "grad_norm": 0.9068223834037781, + "learning_rate": 4.9890840596124405e-06, + "loss": 2.3069, + "step": 1104 + }, + { + "epoch": 0.059281115879828324, + "grad_norm": 0.9125115275382996, + "learning_rate": 4.9890434718893e-06, + "loss": 2.0232, + "step": 1105 + }, + { + "epoch": 0.059334763948497855, + "grad_norm": 0.9936618804931641, + "learning_rate": 4.989002809015052e-06, + "loss": 2.431, + "step": 1106 + }, + { + "epoch": 0.05938841201716738, + "grad_norm": 0.9919398427009583, + "learning_rate": 4.988962070990928e-06, + "loss": 2.3961, + "step": 1107 + }, + { + "epoch": 0.05944206008583691, + "grad_norm": 1.279747486114502, + "learning_rate": 4.988921257818154e-06, + "loss": 1.4218, + "step": 1108 + }, + { + "epoch": 0.05949570815450644, + "grad_norm": 0.9825503826141357, + "learning_rate": 4.988880369497967e-06, + "loss": 2.4498, + "step": 1109 + }, + { + "epoch": 0.05954935622317597, + "grad_norm": 1.653053641319275, + "learning_rate": 4.988839406031597e-06, + "loss": 2.4965, + "step": 1110 + }, + { + "epoch": 0.05960300429184549, + "grad_norm": 0.9315429329872131, + "learning_rate": 4.988798367420284e-06, + "loss": 2.3924, + "step": 1111 + }, + { + "epoch": 0.05965665236051502, + "grad_norm": 1.2868900299072266, + "learning_rate": 4.988757253665266e-06, + "loss": 2.3973, + "step": 1112 + }, + { + "epoch": 0.05971030042918455, + "grad_norm": 1.0158864259719849, + "learning_rate": 4.988716064767784e-06, + "loss": 2.4751, + "step": 1113 + }, + { + "epoch": 0.059763948497854075, + "grad_norm": 1.1413277387619019, + "learning_rate": 4.9886748007290805e-06, + "loss": 2.3294, + "step": 1114 + }, + { + "epoch": 0.059817596566523606, + "grad_norm": 0.8365909457206726, + "learning_rate": 4.988633461550404e-06, + "loss": 2.1325, + "step": 1115 + }, + { + "epoch": 0.05987124463519313, + "grad_norm": 1.0290286540985107, + "learning_rate": 4.988592047233001e-06, + "loss": 2.4654, + "step": 1116 + }, + { + "epoch": 0.05992489270386266, + "grad_norm": 0.9697268009185791, + "learning_rate": 4.988550557778123e-06, + "loss": 2.4101, + "step": 1117 + }, + { + "epoch": 0.05997854077253219, + "grad_norm": 0.9565639495849609, + "learning_rate": 4.98850899318702e-06, + "loss": 2.251, + "step": 1118 + }, + { + "epoch": 0.06003218884120172, + "grad_norm": 0.8121193051338196, + "learning_rate": 4.9884673534609505e-06, + "loss": 2.0026, + "step": 1119 + }, + { + "epoch": 0.060085836909871244, + "grad_norm": 1.6524022817611694, + "learning_rate": 4.9884256386011685e-06, + "loss": 2.1174, + "step": 1120 + }, + { + "epoch": 0.06013948497854077, + "grad_norm": 2.40254807472229, + "learning_rate": 4.9883838486089365e-06, + "loss": 2.2118, + "step": 1121 + }, + { + "epoch": 0.0601931330472103, + "grad_norm": 0.9734964966773987, + "learning_rate": 4.988341983485515e-06, + "loss": 2.2518, + "step": 1122 + }, + { + "epoch": 0.060246781115879826, + "grad_norm": 0.9296643733978271, + "learning_rate": 4.988300043232167e-06, + "loss": 2.187, + "step": 1123 + }, + { + "epoch": 0.06030042918454936, + "grad_norm": 1.008583903312683, + "learning_rate": 4.98825802785016e-06, + "loss": 2.4545, + "step": 1124 + }, + { + "epoch": 0.06035407725321888, + "grad_norm": 0.995166540145874, + "learning_rate": 4.988215937340762e-06, + "loss": 2.4791, + "step": 1125 + }, + { + "epoch": 0.060407725321888414, + "grad_norm": 0.8931052684783936, + "learning_rate": 4.988173771705244e-06, + "loss": 2.5079, + "step": 1126 + }, + { + "epoch": 0.06046137339055794, + "grad_norm": 0.9702426791191101, + "learning_rate": 4.988131530944879e-06, + "loss": 2.5299, + "step": 1127 + }, + { + "epoch": 0.06051502145922747, + "grad_norm": 1.0601887702941895, + "learning_rate": 4.9880892150609424e-06, + "loss": 2.4213, + "step": 1128 + }, + { + "epoch": 0.060568669527896996, + "grad_norm": 1.0227653980255127, + "learning_rate": 4.988046824054713e-06, + "loss": 2.2857, + "step": 1129 + }, + { + "epoch": 0.06062231759656652, + "grad_norm": 0.926737904548645, + "learning_rate": 4.988004357927468e-06, + "loss": 2.3232, + "step": 1130 + }, + { + "epoch": 0.06067596566523605, + "grad_norm": 0.8746935725212097, + "learning_rate": 4.987961816680493e-06, + "loss": 2.2568, + "step": 1131 + }, + { + "epoch": 0.06072961373390558, + "grad_norm": 0.8764224648475647, + "learning_rate": 4.987919200315069e-06, + "loss": 2.427, + "step": 1132 + }, + { + "epoch": 0.06078326180257511, + "grad_norm": 0.9093576073646545, + "learning_rate": 4.987876508832485e-06, + "loss": 2.3657, + "step": 1133 + }, + { + "epoch": 0.060836909871244634, + "grad_norm": 0.9841018319129944, + "learning_rate": 4.98783374223403e-06, + "loss": 2.2756, + "step": 1134 + }, + { + "epoch": 0.060890557939914165, + "grad_norm": 1.6909751892089844, + "learning_rate": 4.987790900520993e-06, + "loss": 2.3376, + "step": 1135 + }, + { + "epoch": 0.06094420600858369, + "grad_norm": 0.9900720715522766, + "learning_rate": 4.987747983694671e-06, + "loss": 2.3721, + "step": 1136 + }, + { + "epoch": 0.06099785407725322, + "grad_norm": 0.9804003238677979, + "learning_rate": 4.987704991756356e-06, + "loss": 2.3909, + "step": 1137 + }, + { + "epoch": 0.06105150214592275, + "grad_norm": 1.2065123319625854, + "learning_rate": 4.987661924707349e-06, + "loss": 2.5192, + "step": 1138 + }, + { + "epoch": 0.06110515021459227, + "grad_norm": 0.9317047595977783, + "learning_rate": 4.9876187825489486e-06, + "loss": 2.4687, + "step": 1139 + }, + { + "epoch": 0.0611587982832618, + "grad_norm": 1.1988011598587036, + "learning_rate": 4.987575565282459e-06, + "loss": 2.3379, + "step": 1140 + }, + { + "epoch": 0.06121244635193133, + "grad_norm": 1.0506364107131958, + "learning_rate": 4.987532272909183e-06, + "loss": 2.178, + "step": 1141 + }, + { + "epoch": 0.06126609442060086, + "grad_norm": 1.0984842777252197, + "learning_rate": 4.987488905430429e-06, + "loss": 2.2426, + "step": 1142 + }, + { + "epoch": 0.061319742489270385, + "grad_norm": 1.0500613451004028, + "learning_rate": 4.987445462847506e-06, + "loss": 2.624, + "step": 1143 + }, + { + "epoch": 0.061373390557939916, + "grad_norm": 1.079162359237671, + "learning_rate": 4.987401945161726e-06, + "loss": 2.1887, + "step": 1144 + }, + { + "epoch": 0.06142703862660944, + "grad_norm": 0.9197587966918945, + "learning_rate": 4.9873583523744025e-06, + "loss": 2.2531, + "step": 1145 + }, + { + "epoch": 0.06148068669527897, + "grad_norm": 0.8580393195152283, + "learning_rate": 4.9873146844868525e-06, + "loss": 2.2592, + "step": 1146 + }, + { + "epoch": 0.0615343347639485, + "grad_norm": 1.0936763286590576, + "learning_rate": 4.987270941500393e-06, + "loss": 2.427, + "step": 1147 + }, + { + "epoch": 0.06158798283261802, + "grad_norm": 0.9202926754951477, + "learning_rate": 4.987227123416346e-06, + "loss": 2.2279, + "step": 1148 + }, + { + "epoch": 0.061641630901287554, + "grad_norm": 0.8660621047019958, + "learning_rate": 4.987183230236034e-06, + "loss": 2.1709, + "step": 1149 + }, + { + "epoch": 0.06169527896995708, + "grad_norm": 0.9105360507965088, + "learning_rate": 4.987139261960782e-06, + "loss": 2.3055, + "step": 1150 + }, + { + "epoch": 0.06174892703862661, + "grad_norm": 1.1920312643051147, + "learning_rate": 4.987095218591919e-06, + "loss": 2.3814, + "step": 1151 + }, + { + "epoch": 0.061802575107296136, + "grad_norm": 0.9380356073379517, + "learning_rate": 4.987051100130772e-06, + "loss": 2.4826, + "step": 1152 + }, + { + "epoch": 0.06185622317596567, + "grad_norm": 0.9523796439170837, + "learning_rate": 4.987006906578676e-06, + "loss": 2.137, + "step": 1153 + }, + { + "epoch": 0.06190987124463519, + "grad_norm": 1.0403015613555908, + "learning_rate": 4.986962637936963e-06, + "loss": 2.102, + "step": 1154 + }, + { + "epoch": 0.061963519313304724, + "grad_norm": 1.1150732040405273, + "learning_rate": 4.986918294206972e-06, + "loss": 2.1992, + "step": 1155 + }, + { + "epoch": 0.06201716738197425, + "grad_norm": 1.2442398071289062, + "learning_rate": 4.986873875390039e-06, + "loss": 2.328, + "step": 1156 + }, + { + "epoch": 0.062070815450643774, + "grad_norm": 1.0945496559143066, + "learning_rate": 4.9868293814875064e-06, + "loss": 2.5701, + "step": 1157 + }, + { + "epoch": 0.062124463519313305, + "grad_norm": 0.9511393308639526, + "learning_rate": 4.986784812500719e-06, + "loss": 2.6029, + "step": 1158 + }, + { + "epoch": 0.06217811158798283, + "grad_norm": 0.9506328105926514, + "learning_rate": 4.98674016843102e-06, + "loss": 2.4729, + "step": 1159 + }, + { + "epoch": 0.06223175965665236, + "grad_norm": 0.9968279004096985, + "learning_rate": 4.98669544927976e-06, + "loss": 2.3917, + "step": 1160 + }, + { + "epoch": 0.06228540772532189, + "grad_norm": 0.9288352131843567, + "learning_rate": 4.986650655048287e-06, + "loss": 2.2191, + "step": 1161 + }, + { + "epoch": 0.06233905579399142, + "grad_norm": 1.005597710609436, + "learning_rate": 4.9866057857379545e-06, + "loss": 2.272, + "step": 1162 + }, + { + "epoch": 0.06239270386266094, + "grad_norm": 2.23844575881958, + "learning_rate": 4.986560841350116e-06, + "loss": 1.4605, + "step": 1163 + }, + { + "epoch": 0.062446351931330475, + "grad_norm": 0.93501216173172, + "learning_rate": 4.986515821886131e-06, + "loss": 2.5551, + "step": 1164 + }, + { + "epoch": 0.0625, + "grad_norm": 1.2463313341140747, + "learning_rate": 4.986470727347356e-06, + "loss": 2.0815, + "step": 1165 + }, + { + "epoch": 0.06255364806866953, + "grad_norm": 0.9285088181495667, + "learning_rate": 4.986425557735154e-06, + "loss": 2.2435, + "step": 1166 + }, + { + "epoch": 0.06260729613733905, + "grad_norm": 1.2001177072525024, + "learning_rate": 4.986380313050889e-06, + "loss": 2.2972, + "step": 1167 + }, + { + "epoch": 0.06266094420600858, + "grad_norm": 1.4643712043762207, + "learning_rate": 4.9863349932959256e-06, + "loss": 2.3177, + "step": 1168 + }, + { + "epoch": 0.06271459227467811, + "grad_norm": 0.9649955630302429, + "learning_rate": 4.9862895984716345e-06, + "loss": 2.1132, + "step": 1169 + }, + { + "epoch": 0.06276824034334764, + "grad_norm": 1.084575891494751, + "learning_rate": 4.986244128579384e-06, + "loss": 2.372, + "step": 1170 + }, + { + "epoch": 0.06282188841201716, + "grad_norm": 0.9651587605476379, + "learning_rate": 4.9861985836205485e-06, + "loss": 2.3409, + "step": 1171 + }, + { + "epoch": 0.0628755364806867, + "grad_norm": 0.9547308087348938, + "learning_rate": 4.986152963596502e-06, + "loss": 2.382, + "step": 1172 + }, + { + "epoch": 0.06292918454935623, + "grad_norm": 1.0556073188781738, + "learning_rate": 4.986107268508622e-06, + "loss": 2.1951, + "step": 1173 + }, + { + "epoch": 0.06298283261802574, + "grad_norm": 1.031585693359375, + "learning_rate": 4.98606149835829e-06, + "loss": 1.8858, + "step": 1174 + }, + { + "epoch": 0.06303648068669528, + "grad_norm": 0.875927209854126, + "learning_rate": 4.986015653146885e-06, + "loss": 2.2312, + "step": 1175 + }, + { + "epoch": 0.06309012875536481, + "grad_norm": 0.9850233793258667, + "learning_rate": 4.985969732875794e-06, + "loss": 2.3268, + "step": 1176 + }, + { + "epoch": 0.06314377682403434, + "grad_norm": 1.0025520324707031, + "learning_rate": 4.985923737546401e-06, + "loss": 2.4564, + "step": 1177 + }, + { + "epoch": 0.06319742489270386, + "grad_norm": 1.0855391025543213, + "learning_rate": 4.985877667160096e-06, + "loss": 2.3502, + "step": 1178 + }, + { + "epoch": 0.06325107296137339, + "grad_norm": 0.8323860168457031, + "learning_rate": 4.9858315217182705e-06, + "loss": 2.2287, + "step": 1179 + }, + { + "epoch": 0.06330472103004292, + "grad_norm": 1.0464768409729004, + "learning_rate": 4.985785301222317e-06, + "loss": 2.3254, + "step": 1180 + }, + { + "epoch": 0.06335836909871245, + "grad_norm": 0.8289233446121216, + "learning_rate": 4.985739005673631e-06, + "loss": 1.9804, + "step": 1181 + }, + { + "epoch": 0.06341201716738197, + "grad_norm": 1.034662127494812, + "learning_rate": 4.98569263507361e-06, + "loss": 2.3365, + "step": 1182 + }, + { + "epoch": 0.0634656652360515, + "grad_norm": 0.9792503118515015, + "learning_rate": 4.985646189423655e-06, + "loss": 2.1938, + "step": 1183 + }, + { + "epoch": 0.06351931330472103, + "grad_norm": 1.6392652988433838, + "learning_rate": 4.985599668725168e-06, + "loss": 2.4219, + "step": 1184 + }, + { + "epoch": 0.06357296137339055, + "grad_norm": 0.9966708421707153, + "learning_rate": 4.985553072979553e-06, + "loss": 2.4656, + "step": 1185 + }, + { + "epoch": 0.06362660944206008, + "grad_norm": 1.1187785863876343, + "learning_rate": 4.985506402188217e-06, + "loss": 2.3186, + "step": 1186 + }, + { + "epoch": 0.06368025751072962, + "grad_norm": 2.5858256816864014, + "learning_rate": 4.98545965635257e-06, + "loss": 2.1474, + "step": 1187 + }, + { + "epoch": 0.06373390557939915, + "grad_norm": 1.1409443616867065, + "learning_rate": 4.985412835474023e-06, + "loss": 2.1916, + "step": 1188 + }, + { + "epoch": 0.06378755364806867, + "grad_norm": 1.0248135328292847, + "learning_rate": 4.985365939553989e-06, + "loss": 2.3183, + "step": 1189 + }, + { + "epoch": 0.0638412017167382, + "grad_norm": 0.9299795627593994, + "learning_rate": 4.985318968593884e-06, + "loss": 2.2232, + "step": 1190 + }, + { + "epoch": 0.06389484978540773, + "grad_norm": 1.0572593212127686, + "learning_rate": 4.985271922595127e-06, + "loss": 1.9589, + "step": 1191 + }, + { + "epoch": 0.06394849785407725, + "grad_norm": 0.9500041604042053, + "learning_rate": 4.985224801559137e-06, + "loss": 2.2186, + "step": 1192 + }, + { + "epoch": 0.06400214592274678, + "grad_norm": 1.4783669710159302, + "learning_rate": 4.985177605487339e-06, + "loss": 2.4164, + "step": 1193 + }, + { + "epoch": 0.06405579399141631, + "grad_norm": 1.0400598049163818, + "learning_rate": 4.985130334381156e-06, + "loss": 2.2274, + "step": 1194 + }, + { + "epoch": 0.06410944206008584, + "grad_norm": 0.9037993550300598, + "learning_rate": 4.985082988242017e-06, + "loss": 2.0441, + "step": 1195 + }, + { + "epoch": 0.06416309012875536, + "grad_norm": 1.4019476175308228, + "learning_rate": 4.9850355670713495e-06, + "loss": 2.3994, + "step": 1196 + }, + { + "epoch": 0.06421673819742489, + "grad_norm": 3.3732125759124756, + "learning_rate": 4.984988070870586e-06, + "loss": 2.5683, + "step": 1197 + }, + { + "epoch": 0.06427038626609442, + "grad_norm": 0.9091217517852783, + "learning_rate": 4.984940499641161e-06, + "loss": 2.2068, + "step": 1198 + }, + { + "epoch": 0.06432403433476395, + "grad_norm": 0.9493944644927979, + "learning_rate": 4.984892853384511e-06, + "loss": 2.3488, + "step": 1199 + }, + { + "epoch": 0.06437768240343347, + "grad_norm": 1.0054129362106323, + "learning_rate": 4.984845132102073e-06, + "loss": 2.3602, + "step": 1200 + }, + { + "epoch": 0.064431330472103, + "grad_norm": 1.0504363775253296, + "learning_rate": 4.98479733579529e-06, + "loss": 2.3265, + "step": 1201 + }, + { + "epoch": 0.06448497854077254, + "grad_norm": 1.0254640579223633, + "learning_rate": 4.984749464465604e-06, + "loss": 2.156, + "step": 1202 + }, + { + "epoch": 0.06453862660944205, + "grad_norm": 1.132319450378418, + "learning_rate": 4.98470151811446e-06, + "loss": 2.3433, + "step": 1203 + }, + { + "epoch": 0.06459227467811159, + "grad_norm": 1.229435682296753, + "learning_rate": 4.984653496743306e-06, + "loss": 2.3781, + "step": 1204 + }, + { + "epoch": 0.06464592274678112, + "grad_norm": 0.8927853107452393, + "learning_rate": 4.984605400353591e-06, + "loss": 2.2183, + "step": 1205 + }, + { + "epoch": 0.06469957081545065, + "grad_norm": 0.999556839466095, + "learning_rate": 4.98455722894677e-06, + "loss": 2.231, + "step": 1206 + }, + { + "epoch": 0.06475321888412017, + "grad_norm": 1.519758701324463, + "learning_rate": 4.984508982524295e-06, + "loss": 2.1922, + "step": 1207 + }, + { + "epoch": 0.0648068669527897, + "grad_norm": 0.8093305826187134, + "learning_rate": 4.984460661087623e-06, + "loss": 1.9918, + "step": 1208 + }, + { + "epoch": 0.06486051502145923, + "grad_norm": 1.0123575925827026, + "learning_rate": 4.984412264638213e-06, + "loss": 2.3328, + "step": 1209 + }, + { + "epoch": 0.06491416309012875, + "grad_norm": 0.9538084864616394, + "learning_rate": 4.984363793177527e-06, + "loss": 2.2817, + "step": 1210 + }, + { + "epoch": 0.06496781115879828, + "grad_norm": 1.329424262046814, + "learning_rate": 4.984315246707027e-06, + "loss": 2.4669, + "step": 1211 + }, + { + "epoch": 0.06502145922746781, + "grad_norm": 0.9524489045143127, + "learning_rate": 4.98426662522818e-06, + "loss": 2.2738, + "step": 1212 + }, + { + "epoch": 0.06507510729613734, + "grad_norm": 1.0620826482772827, + "learning_rate": 4.984217928742454e-06, + "loss": 2.1967, + "step": 1213 + }, + { + "epoch": 0.06512875536480686, + "grad_norm": 1.919254183769226, + "learning_rate": 4.984169157251319e-06, + "loss": 2.5653, + "step": 1214 + }, + { + "epoch": 0.0651824034334764, + "grad_norm": 1.1565606594085693, + "learning_rate": 4.9841203107562476e-06, + "loss": 2.3789, + "step": 1215 + }, + { + "epoch": 0.06523605150214593, + "grad_norm": 1.0749967098236084, + "learning_rate": 4.984071389258714e-06, + "loss": 2.4212, + "step": 1216 + }, + { + "epoch": 0.06528969957081546, + "grad_norm": 0.951046347618103, + "learning_rate": 4.984022392760196e-06, + "loss": 2.1944, + "step": 1217 + }, + { + "epoch": 0.06534334763948497, + "grad_norm": 1.1085155010223389, + "learning_rate": 4.983973321262173e-06, + "loss": 2.4214, + "step": 1218 + }, + { + "epoch": 0.0653969957081545, + "grad_norm": 0.9239466190338135, + "learning_rate": 4.983924174766126e-06, + "loss": 1.6983, + "step": 1219 + }, + { + "epoch": 0.06545064377682404, + "grad_norm": 1.2276287078857422, + "learning_rate": 4.98387495327354e-06, + "loss": 1.6961, + "step": 1220 + }, + { + "epoch": 0.06550429184549356, + "grad_norm": 1.0233432054519653, + "learning_rate": 4.983825656785899e-06, + "loss": 2.2611, + "step": 1221 + }, + { + "epoch": 0.06555793991416309, + "grad_norm": 0.8634902834892273, + "learning_rate": 4.983776285304694e-06, + "loss": 2.0637, + "step": 1222 + }, + { + "epoch": 0.06561158798283262, + "grad_norm": 1.2154033184051514, + "learning_rate": 4.983726838831413e-06, + "loss": 2.0678, + "step": 1223 + }, + { + "epoch": 0.06566523605150215, + "grad_norm": 0.8645827174186707, + "learning_rate": 4.983677317367551e-06, + "loss": 2.4238, + "step": 1224 + }, + { + "epoch": 0.06571888412017167, + "grad_norm": 1.046346664428711, + "learning_rate": 4.983627720914603e-06, + "loss": 2.4546, + "step": 1225 + }, + { + "epoch": 0.0657725321888412, + "grad_norm": 1.1527621746063232, + "learning_rate": 4.983578049474066e-06, + "loss": 2.1418, + "step": 1226 + }, + { + "epoch": 0.06582618025751073, + "grad_norm": 1.0037026405334473, + "learning_rate": 4.9835283030474394e-06, + "loss": 2.2632, + "step": 1227 + }, + { + "epoch": 0.06587982832618025, + "grad_norm": 0.9696345329284668, + "learning_rate": 4.983478481636225e-06, + "loss": 2.1971, + "step": 1228 + }, + { + "epoch": 0.06593347639484978, + "grad_norm": 1.151303768157959, + "learning_rate": 4.983428585241928e-06, + "loss": 2.4703, + "step": 1229 + }, + { + "epoch": 0.06598712446351931, + "grad_norm": 0.9496086835861206, + "learning_rate": 4.983378613866055e-06, + "loss": 2.3845, + "step": 1230 + }, + { + "epoch": 0.06604077253218885, + "grad_norm": 1.027260184288025, + "learning_rate": 4.983328567510113e-06, + "loss": 2.5167, + "step": 1231 + }, + { + "epoch": 0.06609442060085836, + "grad_norm": 1.0592864751815796, + "learning_rate": 4.983278446175615e-06, + "loss": 2.1535, + "step": 1232 + }, + { + "epoch": 0.0661480686695279, + "grad_norm": 1.2062712907791138, + "learning_rate": 4.983228249864073e-06, + "loss": 2.2636, + "step": 1233 + }, + { + "epoch": 0.06620171673819743, + "grad_norm": 0.9164139628410339, + "learning_rate": 4.9831779785770034e-06, + "loss": 2.1669, + "step": 1234 + }, + { + "epoch": 0.06625536480686696, + "grad_norm": 0.9456780552864075, + "learning_rate": 4.983127632315924e-06, + "loss": 2.2491, + "step": 1235 + }, + { + "epoch": 0.06630901287553648, + "grad_norm": 0.9188715219497681, + "learning_rate": 4.983077211082355e-06, + "loss": 2.3742, + "step": 1236 + }, + { + "epoch": 0.06636266094420601, + "grad_norm": 1.416751742362976, + "learning_rate": 4.983026714877816e-06, + "loss": 2.6262, + "step": 1237 + }, + { + "epoch": 0.06641630901287554, + "grad_norm": 1.0176416635513306, + "learning_rate": 4.982976143703837e-06, + "loss": 2.3965, + "step": 1238 + }, + { + "epoch": 0.06646995708154506, + "grad_norm": 0.9725081324577332, + "learning_rate": 4.98292549756194e-06, + "loss": 2.482, + "step": 1239 + }, + { + "epoch": 0.06652360515021459, + "grad_norm": 1.0572484731674194, + "learning_rate": 4.982874776453657e-06, + "loss": 2.1112, + "step": 1240 + }, + { + "epoch": 0.06657725321888412, + "grad_norm": 1.1000754833221436, + "learning_rate": 4.982823980380518e-06, + "loss": 2.4197, + "step": 1241 + }, + { + "epoch": 0.06663090128755365, + "grad_norm": 0.9540502429008484, + "learning_rate": 4.982773109344058e-06, + "loss": 1.6994, + "step": 1242 + }, + { + "epoch": 0.06668454935622317, + "grad_norm": 1.0147403478622437, + "learning_rate": 4.98272216334581e-06, + "loss": 2.3514, + "step": 1243 + }, + { + "epoch": 0.0667381974248927, + "grad_norm": 1.1577200889587402, + "learning_rate": 4.982671142387316e-06, + "loss": 2.2504, + "step": 1244 + }, + { + "epoch": 0.06679184549356224, + "grad_norm": 0.9553468823432922, + "learning_rate": 4.982620046470115e-06, + "loss": 2.3619, + "step": 1245 + }, + { + "epoch": 0.06684549356223175, + "grad_norm": 1.835162878036499, + "learning_rate": 4.982568875595748e-06, + "loss": 2.2313, + "step": 1246 + }, + { + "epoch": 0.06689914163090128, + "grad_norm": 0.8699524402618408, + "learning_rate": 4.982517629765762e-06, + "loss": 2.1041, + "step": 1247 + }, + { + "epoch": 0.06695278969957082, + "grad_norm": 1.0222312211990356, + "learning_rate": 4.982466308981704e-06, + "loss": 2.3486, + "step": 1248 + }, + { + "epoch": 0.06700643776824035, + "grad_norm": 1.0730657577514648, + "learning_rate": 4.982414913245123e-06, + "loss": 2.4572, + "step": 1249 + }, + { + "epoch": 0.06706008583690987, + "grad_norm": 1.5996443033218384, + "learning_rate": 4.982363442557571e-06, + "loss": 2.2972, + "step": 1250 + }, + { + "epoch": 0.0671137339055794, + "grad_norm": 0.9231743216514587, + "learning_rate": 4.982311896920602e-06, + "loss": 2.2365, + "step": 1251 + }, + { + "epoch": 0.06716738197424893, + "grad_norm": 0.9723864793777466, + "learning_rate": 4.982260276335772e-06, + "loss": 2.3674, + "step": 1252 + }, + { + "epoch": 0.06722103004291846, + "grad_norm": 1.8816993236541748, + "learning_rate": 4.98220858080464e-06, + "loss": 2.2334, + "step": 1253 + }, + { + "epoch": 0.06727467811158798, + "grad_norm": 1.128426432609558, + "learning_rate": 4.9821568103287675e-06, + "loss": 2.3304, + "step": 1254 + }, + { + "epoch": 0.06732832618025751, + "grad_norm": 0.9466338157653809, + "learning_rate": 4.982104964909717e-06, + "loss": 2.448, + "step": 1255 + }, + { + "epoch": 0.06738197424892704, + "grad_norm": 0.9969996213912964, + "learning_rate": 4.982053044549053e-06, + "loss": 2.3047, + "step": 1256 + }, + { + "epoch": 0.06743562231759656, + "grad_norm": 1.0799095630645752, + "learning_rate": 4.982001049248344e-06, + "loss": 2.3733, + "step": 1257 + }, + { + "epoch": 0.06748927038626609, + "grad_norm": 1.7526142597198486, + "learning_rate": 4.981948979009159e-06, + "loss": 2.161, + "step": 1258 + }, + { + "epoch": 0.06754291845493562, + "grad_norm": 1.012143611907959, + "learning_rate": 4.981896833833072e-06, + "loss": 2.4371, + "step": 1259 + }, + { + "epoch": 0.06759656652360516, + "grad_norm": 0.9565131664276123, + "learning_rate": 4.9818446137216545e-06, + "loss": 2.0336, + "step": 1260 + }, + { + "epoch": 0.06765021459227467, + "grad_norm": 1.119325041770935, + "learning_rate": 4.981792318676487e-06, + "loss": 2.2727, + "step": 1261 + }, + { + "epoch": 0.0677038626609442, + "grad_norm": 0.8959728479385376, + "learning_rate": 4.981739948699145e-06, + "loss": 2.2803, + "step": 1262 + }, + { + "epoch": 0.06775751072961374, + "grad_norm": 1.1167850494384766, + "learning_rate": 4.981687503791211e-06, + "loss": 2.2523, + "step": 1263 + }, + { + "epoch": 0.06781115879828326, + "grad_norm": 1.2471730709075928, + "learning_rate": 4.981634983954268e-06, + "loss": 2.6639, + "step": 1264 + }, + { + "epoch": 0.06786480686695279, + "grad_norm": 1.421600341796875, + "learning_rate": 4.981582389189904e-06, + "loss": 2.2953, + "step": 1265 + }, + { + "epoch": 0.06791845493562232, + "grad_norm": 1.0473721027374268, + "learning_rate": 4.981529719499704e-06, + "loss": 2.2799, + "step": 1266 + }, + { + "epoch": 0.06797210300429185, + "grad_norm": 0.9871096611022949, + "learning_rate": 4.98147697488526e-06, + "loss": 2.3155, + "step": 1267 + }, + { + "epoch": 0.06802575107296137, + "grad_norm": 1.0446598529815674, + "learning_rate": 4.9814241553481635e-06, + "loss": 2.0745, + "step": 1268 + }, + { + "epoch": 0.0680793991416309, + "grad_norm": 1.3474087715148926, + "learning_rate": 4.98137126089001e-06, + "loss": 2.428, + "step": 1269 + }, + { + "epoch": 0.06813304721030043, + "grad_norm": 1.1699421405792236, + "learning_rate": 4.981318291512396e-06, + "loss": 2.5069, + "step": 1270 + }, + { + "epoch": 0.06818669527896996, + "grad_norm": 0.9889071583747864, + "learning_rate": 4.981265247216921e-06, + "loss": 2.4165, + "step": 1271 + }, + { + "epoch": 0.06824034334763948, + "grad_norm": 5.230331897735596, + "learning_rate": 4.981212128005187e-06, + "loss": 2.0656, + "step": 1272 + }, + { + "epoch": 0.06829399141630901, + "grad_norm": 0.8876602649688721, + "learning_rate": 4.9811589338787965e-06, + "loss": 2.1492, + "step": 1273 + }, + { + "epoch": 0.06834763948497855, + "grad_norm": 1.0285910367965698, + "learning_rate": 4.981105664839358e-06, + "loss": 2.1517, + "step": 1274 + }, + { + "epoch": 0.06840128755364806, + "grad_norm": 1.3014415502548218, + "learning_rate": 4.981052320888476e-06, + "loss": 1.9139, + "step": 1275 + }, + { + "epoch": 0.0684549356223176, + "grad_norm": 0.9376952648162842, + "learning_rate": 4.980998902027765e-06, + "loss": 2.2074, + "step": 1276 + }, + { + "epoch": 0.06850858369098713, + "grad_norm": 0.94576096534729, + "learning_rate": 4.980945408258836e-06, + "loss": 2.2472, + "step": 1277 + }, + { + "epoch": 0.06856223175965666, + "grad_norm": 0.9819731116294861, + "learning_rate": 4.980891839583305e-06, + "loss": 1.6485, + "step": 1278 + }, + { + "epoch": 0.06861587982832618, + "grad_norm": 0.9782082438468933, + "learning_rate": 4.980838196002788e-06, + "loss": 2.3349, + "step": 1279 + }, + { + "epoch": 0.06866952789699571, + "grad_norm": 0.9362853765487671, + "learning_rate": 4.980784477518906e-06, + "loss": 2.2468, + "step": 1280 + }, + { + "epoch": 0.06872317596566524, + "grad_norm": 1.3191250562667847, + "learning_rate": 4.98073068413328e-06, + "loss": 2.4428, + "step": 1281 + }, + { + "epoch": 0.06877682403433476, + "grad_norm": 1.1156028509140015, + "learning_rate": 4.980676815847534e-06, + "loss": 2.427, + "step": 1282 + }, + { + "epoch": 0.06883047210300429, + "grad_norm": 7.49153995513916, + "learning_rate": 4.980622872663296e-06, + "loss": 2.2113, + "step": 1283 + }, + { + "epoch": 0.06888412017167382, + "grad_norm": 1.0203325748443604, + "learning_rate": 4.980568854582193e-06, + "loss": 2.2779, + "step": 1284 + }, + { + "epoch": 0.06893776824034335, + "grad_norm": 3.292227268218994, + "learning_rate": 4.980514761605857e-06, + "loss": 2.2174, + "step": 1285 + }, + { + "epoch": 0.06899141630901287, + "grad_norm": 0.9186006188392639, + "learning_rate": 4.9804605937359205e-06, + "loss": 2.2696, + "step": 1286 + }, + { + "epoch": 0.0690450643776824, + "grad_norm": 1.049167513847351, + "learning_rate": 4.9804063509740194e-06, + "loss": 2.3914, + "step": 1287 + }, + { + "epoch": 0.06909871244635193, + "grad_norm": 0.8847677111625671, + "learning_rate": 4.980352033321792e-06, + "loss": 2.101, + "step": 1288 + }, + { + "epoch": 0.06915236051502147, + "grad_norm": 1.3655288219451904, + "learning_rate": 4.980297640780876e-06, + "loss": 2.4535, + "step": 1289 + }, + { + "epoch": 0.06920600858369098, + "grad_norm": 1.1724376678466797, + "learning_rate": 4.980243173352916e-06, + "loss": 2.1485, + "step": 1290 + }, + { + "epoch": 0.06925965665236052, + "grad_norm": 1.0108832120895386, + "learning_rate": 4.980188631039557e-06, + "loss": 2.2599, + "step": 1291 + }, + { + "epoch": 0.06931330472103005, + "grad_norm": 1.0784542560577393, + "learning_rate": 4.9801340138424425e-06, + "loss": 2.0973, + "step": 1292 + }, + { + "epoch": 0.06936695278969957, + "grad_norm": 1.2918726205825806, + "learning_rate": 4.980079321763225e-06, + "loss": 1.662, + "step": 1293 + }, + { + "epoch": 0.0694206008583691, + "grad_norm": 1.0497658252716064, + "learning_rate": 4.980024554803554e-06, + "loss": 2.2273, + "step": 1294 + }, + { + "epoch": 0.06947424892703863, + "grad_norm": 1.080718994140625, + "learning_rate": 4.979969712965084e-06, + "loss": 2.2664, + "step": 1295 + }, + { + "epoch": 0.06952789699570816, + "grad_norm": 2.7959582805633545, + "learning_rate": 4.97991479624947e-06, + "loss": 1.6957, + "step": 1296 + }, + { + "epoch": 0.06958154506437768, + "grad_norm": 1.1994774341583252, + "learning_rate": 4.979859804658371e-06, + "loss": 2.5288, + "step": 1297 + }, + { + "epoch": 0.06963519313304721, + "grad_norm": 0.9972768425941467, + "learning_rate": 4.9798047381934464e-06, + "loss": 2.2021, + "step": 1298 + }, + { + "epoch": 0.06968884120171674, + "grad_norm": 1.1177918910980225, + "learning_rate": 4.9797495968563595e-06, + "loss": 2.3284, + "step": 1299 + }, + { + "epoch": 0.06974248927038626, + "grad_norm": 1.208223581314087, + "learning_rate": 4.979694380648774e-06, + "loss": 2.6212, + "step": 1300 + }, + { + "epoch": 0.06979613733905579, + "grad_norm": 1.0205754041671753, + "learning_rate": 4.9796390895723575e-06, + "loss": 2.2783, + "step": 1301 + }, + { + "epoch": 0.06984978540772532, + "grad_norm": 1.001619577407837, + "learning_rate": 4.9795837236287814e-06, + "loss": 2.0492, + "step": 1302 + }, + { + "epoch": 0.06990343347639486, + "grad_norm": 0.8463701605796814, + "learning_rate": 4.979528282819715e-06, + "loss": 2.1374, + "step": 1303 + }, + { + "epoch": 0.06995708154506437, + "grad_norm": 1.023215413093567, + "learning_rate": 4.979472767146833e-06, + "loss": 2.3688, + "step": 1304 + }, + { + "epoch": 0.0700107296137339, + "grad_norm": 1.1979265213012695, + "learning_rate": 4.979417176611811e-06, + "loss": 2.3346, + "step": 1305 + }, + { + "epoch": 0.07006437768240344, + "grad_norm": 1.452760100364685, + "learning_rate": 4.979361511216329e-06, + "loss": 2.7218, + "step": 1306 + }, + { + "epoch": 0.07011802575107297, + "grad_norm": 0.979247510433197, + "learning_rate": 4.979305770962065e-06, + "loss": 2.1939, + "step": 1307 + }, + { + "epoch": 0.07017167381974249, + "grad_norm": 1.400586485862732, + "learning_rate": 4.9792499558507054e-06, + "loss": 2.333, + "step": 1308 + }, + { + "epoch": 0.07022532188841202, + "grad_norm": 0.9813389778137207, + "learning_rate": 4.979194065883932e-06, + "loss": 2.3749, + "step": 1309 + }, + { + "epoch": 0.07027896995708155, + "grad_norm": 0.9473187327384949, + "learning_rate": 4.9791381010634355e-06, + "loss": 2.4122, + "step": 1310 + }, + { + "epoch": 0.07033261802575107, + "grad_norm": 2.2443084716796875, + "learning_rate": 4.979082061390903e-06, + "loss": 2.1777, + "step": 1311 + }, + { + "epoch": 0.0703862660944206, + "grad_norm": 0.936434805393219, + "learning_rate": 4.9790259468680275e-06, + "loss": 2.3809, + "step": 1312 + }, + { + "epoch": 0.07043991416309013, + "grad_norm": 1.2129138708114624, + "learning_rate": 4.978969757496503e-06, + "loss": 2.3822, + "step": 1313 + }, + { + "epoch": 0.07049356223175966, + "grad_norm": 1.1773587465286255, + "learning_rate": 4.978913493278027e-06, + "loss": 2.5653, + "step": 1314 + }, + { + "epoch": 0.07054721030042918, + "grad_norm": 0.9448917508125305, + "learning_rate": 4.978857154214297e-06, + "loss": 2.171, + "step": 1315 + }, + { + "epoch": 0.07060085836909871, + "grad_norm": 1.2923929691314697, + "learning_rate": 4.978800740307015e-06, + "loss": 2.2974, + "step": 1316 + }, + { + "epoch": 0.07065450643776824, + "grad_norm": 1.136430263519287, + "learning_rate": 4.978744251557884e-06, + "loss": 1.6862, + "step": 1317 + }, + { + "epoch": 0.07070815450643776, + "grad_norm": 1.0014485120773315, + "learning_rate": 4.978687687968609e-06, + "loss": 2.45, + "step": 1318 + }, + { + "epoch": 0.0707618025751073, + "grad_norm": 1.0103057622909546, + "learning_rate": 4.978631049540898e-06, + "loss": 2.351, + "step": 1319 + }, + { + "epoch": 0.07081545064377683, + "grad_norm": 1.5005868673324585, + "learning_rate": 4.9785743362764615e-06, + "loss": 2.2677, + "step": 1320 + }, + { + "epoch": 0.07086909871244636, + "grad_norm": 1.5612396001815796, + "learning_rate": 4.978517548177012e-06, + "loss": 1.3249, + "step": 1321 + }, + { + "epoch": 0.07092274678111588, + "grad_norm": 1.2271084785461426, + "learning_rate": 4.9784606852442626e-06, + "loss": 2.2995, + "step": 1322 + }, + { + "epoch": 0.0709763948497854, + "grad_norm": 1.063094973564148, + "learning_rate": 4.978403747479933e-06, + "loss": 2.3148, + "step": 1323 + }, + { + "epoch": 0.07103004291845494, + "grad_norm": 0.9872644543647766, + "learning_rate": 4.9783467348857396e-06, + "loss": 2.0725, + "step": 1324 + }, + { + "epoch": 0.07108369098712447, + "grad_norm": 1.2497276067733765, + "learning_rate": 4.978289647463405e-06, + "loss": 2.2094, + "step": 1325 + }, + { + "epoch": 0.07113733905579399, + "grad_norm": 1.0279934406280518, + "learning_rate": 4.978232485214652e-06, + "loss": 2.3647, + "step": 1326 + }, + { + "epoch": 0.07119098712446352, + "grad_norm": 0.89280766248703, + "learning_rate": 4.978175248141207e-06, + "loss": 2.2508, + "step": 1327 + }, + { + "epoch": 0.07124463519313305, + "grad_norm": 1.3410364389419556, + "learning_rate": 4.978117936244799e-06, + "loss": 2.2822, + "step": 1328 + }, + { + "epoch": 0.07129828326180257, + "grad_norm": 1.0980582237243652, + "learning_rate": 4.9780605495271575e-06, + "loss": 2.3824, + "step": 1329 + }, + { + "epoch": 0.0713519313304721, + "grad_norm": 1.0156686305999756, + "learning_rate": 4.978003087990014e-06, + "loss": 2.2999, + "step": 1330 + }, + { + "epoch": 0.07140557939914163, + "grad_norm": 4.5208940505981445, + "learning_rate": 4.977945551635106e-06, + "loss": 2.2908, + "step": 1331 + }, + { + "epoch": 0.07145922746781116, + "grad_norm": 1.0811820030212402, + "learning_rate": 4.977887940464169e-06, + "loss": 2.278, + "step": 1332 + }, + { + "epoch": 0.07151287553648068, + "grad_norm": 1.040591835975647, + "learning_rate": 4.977830254478943e-06, + "loss": 2.3086, + "step": 1333 + }, + { + "epoch": 0.07156652360515021, + "grad_norm": 1.2930958271026611, + "learning_rate": 4.9777724936811696e-06, + "loss": 2.1737, + "step": 1334 + }, + { + "epoch": 0.07162017167381975, + "grad_norm": 1.0584529638290405, + "learning_rate": 4.977714658072592e-06, + "loss": 2.3511, + "step": 1335 + }, + { + "epoch": 0.07167381974248926, + "grad_norm": 1.032886266708374, + "learning_rate": 4.977656747654958e-06, + "loss": 2.3503, + "step": 1336 + }, + { + "epoch": 0.0717274678111588, + "grad_norm": 1.0458582639694214, + "learning_rate": 4.977598762430015e-06, + "loss": 2.1372, + "step": 1337 + }, + { + "epoch": 0.07178111587982833, + "grad_norm": 0.995593786239624, + "learning_rate": 4.977540702399513e-06, + "loss": 2.2492, + "step": 1338 + }, + { + "epoch": 0.07183476394849786, + "grad_norm": 1.0863360166549683, + "learning_rate": 4.977482567565207e-06, + "loss": 2.3414, + "step": 1339 + }, + { + "epoch": 0.07188841201716738, + "grad_norm": 1.0745601654052734, + "learning_rate": 4.9774243579288505e-06, + "loss": 2.3559, + "step": 1340 + }, + { + "epoch": 0.07194206008583691, + "grad_norm": 1.1351815462112427, + "learning_rate": 4.977366073492202e-06, + "loss": 2.379, + "step": 1341 + }, + { + "epoch": 0.07199570815450644, + "grad_norm": 0.9772175550460815, + "learning_rate": 4.977307714257021e-06, + "loss": 2.2358, + "step": 1342 + }, + { + "epoch": 0.07204935622317596, + "grad_norm": 1.0001784563064575, + "learning_rate": 4.977249280225068e-06, + "loss": 2.1606, + "step": 1343 + }, + { + "epoch": 0.07210300429184549, + "grad_norm": 1.0162519216537476, + "learning_rate": 4.977190771398111e-06, + "loss": 2.3239, + "step": 1344 + }, + { + "epoch": 0.07215665236051502, + "grad_norm": 1.2197778224945068, + "learning_rate": 4.977132187777912e-06, + "loss": 2.5, + "step": 1345 + }, + { + "epoch": 0.07221030042918455, + "grad_norm": 0.8744888305664062, + "learning_rate": 4.977073529366244e-06, + "loss": 2.0576, + "step": 1346 + }, + { + "epoch": 0.07226394849785407, + "grad_norm": 1.0663422346115112, + "learning_rate": 4.977014796164875e-06, + "loss": 2.3018, + "step": 1347 + }, + { + "epoch": 0.0723175965665236, + "grad_norm": 1.0181113481521606, + "learning_rate": 4.97695598817558e-06, + "loss": 2.3851, + "step": 1348 + }, + { + "epoch": 0.07237124463519314, + "grad_norm": 1.0137766599655151, + "learning_rate": 4.976897105400134e-06, + "loss": 1.9353, + "step": 1349 + }, + { + "epoch": 0.07242489270386267, + "grad_norm": 1.0173087120056152, + "learning_rate": 4.976838147840314e-06, + "loss": 1.9168, + "step": 1350 + }, + { + "epoch": 0.07247854077253219, + "grad_norm": 0.9286210536956787, + "learning_rate": 4.976779115497901e-06, + "loss": 2.2792, + "step": 1351 + }, + { + "epoch": 0.07253218884120172, + "grad_norm": 1.0620357990264893, + "learning_rate": 4.976720008374679e-06, + "loss": 2.5175, + "step": 1352 + }, + { + "epoch": 0.07258583690987125, + "grad_norm": 1.0661815404891968, + "learning_rate": 4.976660826472429e-06, + "loss": 2.1318, + "step": 1353 + }, + { + "epoch": 0.07263948497854077, + "grad_norm": 1.3985929489135742, + "learning_rate": 4.9766015697929414e-06, + "loss": 2.2953, + "step": 1354 + }, + { + "epoch": 0.0726931330472103, + "grad_norm": 1.268054723739624, + "learning_rate": 4.976542238338003e-06, + "loss": 2.4896, + "step": 1355 + }, + { + "epoch": 0.07274678111587983, + "grad_norm": 0.9607091546058655, + "learning_rate": 4.976482832109406e-06, + "loss": 2.2571, + "step": 1356 + }, + { + "epoch": 0.07280042918454936, + "grad_norm": 1.0794057846069336, + "learning_rate": 4.976423351108943e-06, + "loss": 2.3569, + "step": 1357 + }, + { + "epoch": 0.07285407725321888, + "grad_norm": 1.0689151287078857, + "learning_rate": 4.976363795338412e-06, + "loss": 2.0614, + "step": 1358 + }, + { + "epoch": 0.07290772532188841, + "grad_norm": 0.9918772578239441, + "learning_rate": 4.97630416479961e-06, + "loss": 2.4995, + "step": 1359 + }, + { + "epoch": 0.07296137339055794, + "grad_norm": 1.0323249101638794, + "learning_rate": 4.976244459494336e-06, + "loss": 2.4073, + "step": 1360 + }, + { + "epoch": 0.07301502145922746, + "grad_norm": 1.168944239616394, + "learning_rate": 4.976184679424395e-06, + "loss": 2.4763, + "step": 1361 + }, + { + "epoch": 0.07306866952789699, + "grad_norm": 1.1609365940093994, + "learning_rate": 4.9761248245915915e-06, + "loss": 2.418, + "step": 1362 + }, + { + "epoch": 0.07312231759656652, + "grad_norm": 1.0638235807418823, + "learning_rate": 4.9760648949977316e-06, + "loss": 2.4265, + "step": 1363 + }, + { + "epoch": 0.07317596566523606, + "grad_norm": 1.0396552085876465, + "learning_rate": 4.976004890644625e-06, + "loss": 2.4424, + "step": 1364 + }, + { + "epoch": 0.07322961373390557, + "grad_norm": 0.8740794062614441, + "learning_rate": 4.975944811534084e-06, + "loss": 1.9612, + "step": 1365 + }, + { + "epoch": 0.0732832618025751, + "grad_norm": 2.43509840965271, + "learning_rate": 4.975884657667922e-06, + "loss": 2.5256, + "step": 1366 + }, + { + "epoch": 0.07333690987124464, + "grad_norm": 1.0254740715026855, + "learning_rate": 4.975824429047956e-06, + "loss": 1.6288, + "step": 1367 + }, + { + "epoch": 0.07339055793991417, + "grad_norm": 0.9655486345291138, + "learning_rate": 4.9757641256760035e-06, + "loss": 1.6648, + "step": 1368 + }, + { + "epoch": 0.07344420600858369, + "grad_norm": 1.0243462324142456, + "learning_rate": 4.9757037475538865e-06, + "loss": 2.3519, + "step": 1369 + }, + { + "epoch": 0.07349785407725322, + "grad_norm": 1.113011121749878, + "learning_rate": 4.975643294683426e-06, + "loss": 2.4679, + "step": 1370 + }, + { + "epoch": 0.07355150214592275, + "grad_norm": 1.161157488822937, + "learning_rate": 4.975582767066449e-06, + "loss": 2.3772, + "step": 1371 + }, + { + "epoch": 0.07360515021459227, + "grad_norm": 1.1884695291519165, + "learning_rate": 4.975522164704782e-06, + "loss": 2.6519, + "step": 1372 + }, + { + "epoch": 0.0736587982832618, + "grad_norm": 0.9771613478660583, + "learning_rate": 4.975461487600255e-06, + "loss": 2.2491, + "step": 1373 + }, + { + "epoch": 0.07371244635193133, + "grad_norm": 1.149491310119629, + "learning_rate": 4.975400735754701e-06, + "loss": 2.3346, + "step": 1374 + }, + { + "epoch": 0.07376609442060086, + "grad_norm": 1.282455325126648, + "learning_rate": 4.975339909169952e-06, + "loss": 2.351, + "step": 1375 + }, + { + "epoch": 0.07381974248927038, + "grad_norm": 0.9327060580253601, + "learning_rate": 4.975279007847847e-06, + "loss": 1.8217, + "step": 1376 + }, + { + "epoch": 0.07387339055793991, + "grad_norm": 1.1702831983566284, + "learning_rate": 4.975218031790223e-06, + "loss": 2.2291, + "step": 1377 + }, + { + "epoch": 0.07392703862660945, + "grad_norm": 1.1040705442428589, + "learning_rate": 4.9751569809989225e-06, + "loss": 2.523, + "step": 1378 + }, + { + "epoch": 0.07398068669527896, + "grad_norm": 1.0453336238861084, + "learning_rate": 4.975095855475788e-06, + "loss": 2.413, + "step": 1379 + }, + { + "epoch": 0.0740343347639485, + "grad_norm": 1.1374396085739136, + "learning_rate": 4.9750346552226645e-06, + "loss": 2.2, + "step": 1380 + }, + { + "epoch": 0.07408798283261803, + "grad_norm": 1.015295386314392, + "learning_rate": 4.9749733802414e-06, + "loss": 2.5627, + "step": 1381 + }, + { + "epoch": 0.07414163090128756, + "grad_norm": 0.9965130686759949, + "learning_rate": 4.974912030533846e-06, + "loss": 2.4583, + "step": 1382 + }, + { + "epoch": 0.07419527896995708, + "grad_norm": 1.0276066064834595, + "learning_rate": 4.974850606101854e-06, + "loss": 2.0452, + "step": 1383 + }, + { + "epoch": 0.07424892703862661, + "grad_norm": 0.9480687975883484, + "learning_rate": 4.974789106947278e-06, + "loss": 2.2192, + "step": 1384 + }, + { + "epoch": 0.07430257510729614, + "grad_norm": 3.404052257537842, + "learning_rate": 4.974727533071975e-06, + "loss": 2.3217, + "step": 1385 + }, + { + "epoch": 0.07435622317596567, + "grad_norm": 1.065765380859375, + "learning_rate": 4.974665884477803e-06, + "loss": 2.3553, + "step": 1386 + }, + { + "epoch": 0.07440987124463519, + "grad_norm": 0.9770095944404602, + "learning_rate": 4.9746041611666266e-06, + "loss": 2.4147, + "step": 1387 + }, + { + "epoch": 0.07446351931330472, + "grad_norm": 1.0051023960113525, + "learning_rate": 4.974542363140306e-06, + "loss": 2.2474, + "step": 1388 + }, + { + "epoch": 0.07451716738197425, + "grad_norm": 1.0955405235290527, + "learning_rate": 4.974480490400709e-06, + "loss": 2.2348, + "step": 1389 + }, + { + "epoch": 0.07457081545064377, + "grad_norm": 1.1394176483154297, + "learning_rate": 4.974418542949703e-06, + "loss": 2.1132, + "step": 1390 + }, + { + "epoch": 0.0746244635193133, + "grad_norm": 0.9789254069328308, + "learning_rate": 4.974356520789159e-06, + "loss": 2.2965, + "step": 1391 + }, + { + "epoch": 0.07467811158798283, + "grad_norm": 1.038822889328003, + "learning_rate": 4.974294423920949e-06, + "loss": 2.2101, + "step": 1392 + }, + { + "epoch": 0.07473175965665237, + "grad_norm": 0.9869319796562195, + "learning_rate": 4.9742322523469475e-06, + "loss": 2.4068, + "step": 1393 + }, + { + "epoch": 0.07478540772532188, + "grad_norm": 0.9765912890434265, + "learning_rate": 4.974170006069032e-06, + "loss": 2.2668, + "step": 1394 + }, + { + "epoch": 0.07483905579399142, + "grad_norm": 1.6876389980316162, + "learning_rate": 4.974107685089083e-06, + "loss": 2.3885, + "step": 1395 + }, + { + "epoch": 0.07489270386266095, + "grad_norm": 1.2938416004180908, + "learning_rate": 4.974045289408981e-06, + "loss": 2.2538, + "step": 1396 + }, + { + "epoch": 0.07494635193133047, + "grad_norm": 0.9978740215301514, + "learning_rate": 4.97398281903061e-06, + "loss": 2.4949, + "step": 1397 + }, + { + "epoch": 0.075, + "grad_norm": 0.977267324924469, + "learning_rate": 4.973920273955855e-06, + "loss": 2.3806, + "step": 1398 + }, + { + "epoch": 0.07505364806866953, + "grad_norm": 1.0162580013275146, + "learning_rate": 4.973857654186607e-06, + "loss": 2.2677, + "step": 1399 + }, + { + "epoch": 0.07510729613733906, + "grad_norm": 0.880977213382721, + "learning_rate": 4.973794959724755e-06, + "loss": 2.2722, + "step": 1400 + }, + { + "epoch": 0.07516094420600858, + "grad_norm": 0.9718684554100037, + "learning_rate": 4.9737321905721915e-06, + "loss": 2.1507, + "step": 1401 + }, + { + "epoch": 0.07521459227467811, + "grad_norm": 1.0685368776321411, + "learning_rate": 4.973669346730813e-06, + "loss": 2.1471, + "step": 1402 + }, + { + "epoch": 0.07526824034334764, + "grad_norm": 1.0438332557678223, + "learning_rate": 4.973606428202516e-06, + "loss": 2.204, + "step": 1403 + }, + { + "epoch": 0.07532188841201717, + "grad_norm": 1.1545586585998535, + "learning_rate": 4.973543434989201e-06, + "loss": 2.545, + "step": 1404 + }, + { + "epoch": 0.07537553648068669, + "grad_norm": 2.0891075134277344, + "learning_rate": 4.973480367092769e-06, + "loss": 2.4578, + "step": 1405 + }, + { + "epoch": 0.07542918454935622, + "grad_norm": 1.1711297035217285, + "learning_rate": 4.973417224515126e-06, + "loss": 2.4318, + "step": 1406 + }, + { + "epoch": 0.07548283261802576, + "grad_norm": 1.000867486000061, + "learning_rate": 4.973354007258175e-06, + "loss": 2.2352, + "step": 1407 + }, + { + "epoch": 0.07553648068669527, + "grad_norm": 0.9216778874397278, + "learning_rate": 4.973290715323829e-06, + "loss": 2.0343, + "step": 1408 + }, + { + "epoch": 0.0755901287553648, + "grad_norm": 0.8999906778335571, + "learning_rate": 4.973227348713995e-06, + "loss": 2.0836, + "step": 1409 + }, + { + "epoch": 0.07564377682403434, + "grad_norm": 1.1049591302871704, + "learning_rate": 4.973163907430588e-06, + "loss": 2.3079, + "step": 1410 + }, + { + "epoch": 0.07569742489270387, + "grad_norm": 1.0142604112625122, + "learning_rate": 4.973100391475524e-06, + "loss": 2.1857, + "step": 1411 + }, + { + "epoch": 0.07575107296137339, + "grad_norm": 0.9644139409065247, + "learning_rate": 4.97303680085072e-06, + "loss": 2.1685, + "step": 1412 + }, + { + "epoch": 0.07580472103004292, + "grad_norm": 1.0186855792999268, + "learning_rate": 4.972973135558097e-06, + "loss": 2.2221, + "step": 1413 + }, + { + "epoch": 0.07585836909871245, + "grad_norm": 1.1210620403289795, + "learning_rate": 4.972909395599575e-06, + "loss": 2.1043, + "step": 1414 + }, + { + "epoch": 0.07591201716738197, + "grad_norm": 1.0287531614303589, + "learning_rate": 4.97284558097708e-06, + "loss": 2.3634, + "step": 1415 + }, + { + "epoch": 0.0759656652360515, + "grad_norm": 1.0717772245407104, + "learning_rate": 4.97278169169254e-06, + "loss": 2.3914, + "step": 1416 + }, + { + "epoch": 0.07601931330472103, + "grad_norm": 1.0899219512939453, + "learning_rate": 4.972717727747881e-06, + "loss": 2.5441, + "step": 1417 + }, + { + "epoch": 0.07607296137339056, + "grad_norm": 1.3384462594985962, + "learning_rate": 4.9726536891450365e-06, + "loss": 2.3993, + "step": 1418 + }, + { + "epoch": 0.07612660944206008, + "grad_norm": 0.8986116051673889, + "learning_rate": 4.972589575885939e-06, + "loss": 2.3021, + "step": 1419 + }, + { + "epoch": 0.07618025751072961, + "grad_norm": 0.8991843461990356, + "learning_rate": 4.972525387972525e-06, + "loss": 2.3461, + "step": 1420 + }, + { + "epoch": 0.07623390557939914, + "grad_norm": 1.085938572883606, + "learning_rate": 4.972461125406732e-06, + "loss": 2.4223, + "step": 1421 + }, + { + "epoch": 0.07628755364806868, + "grad_norm": 1.02108633518219, + "learning_rate": 4.972396788190501e-06, + "loss": 2.2198, + "step": 1422 + }, + { + "epoch": 0.0763412017167382, + "grad_norm": 1.1626222133636475, + "learning_rate": 4.972332376325773e-06, + "loss": 1.7834, + "step": 1423 + }, + { + "epoch": 0.07639484978540773, + "grad_norm": 1.1069952249526978, + "learning_rate": 4.972267889814494e-06, + "loss": 2.3768, + "step": 1424 + }, + { + "epoch": 0.07644849785407726, + "grad_norm": 1.2132041454315186, + "learning_rate": 4.97220332865861e-06, + "loss": 2.4191, + "step": 1425 + }, + { + "epoch": 0.07650214592274678, + "grad_norm": 1.1763595342636108, + "learning_rate": 4.972138692860072e-06, + "loss": 2.5987, + "step": 1426 + }, + { + "epoch": 0.07655579399141631, + "grad_norm": 1.1088312864303589, + "learning_rate": 4.97207398242083e-06, + "loss": 2.339, + "step": 1427 + }, + { + "epoch": 0.07660944206008584, + "grad_norm": 1.031510829925537, + "learning_rate": 4.9720091973428385e-06, + "loss": 2.1076, + "step": 1428 + }, + { + "epoch": 0.07666309012875537, + "grad_norm": 0.9312414526939392, + "learning_rate": 4.971944337628053e-06, + "loss": 2.2307, + "step": 1429 + }, + { + "epoch": 0.07671673819742489, + "grad_norm": 1.063948631286621, + "learning_rate": 4.971879403278432e-06, + "loss": 2.209, + "step": 1430 + }, + { + "epoch": 0.07677038626609442, + "grad_norm": 0.9697802662849426, + "learning_rate": 4.971814394295936e-06, + "loss": 2.2513, + "step": 1431 + }, + { + "epoch": 0.07682403433476395, + "grad_norm": 8.346556663513184, + "learning_rate": 4.971749310682529e-06, + "loss": 2.2531, + "step": 1432 + }, + { + "epoch": 0.07687768240343347, + "grad_norm": 0.9593003392219543, + "learning_rate": 4.9716841524401745e-06, + "loss": 2.3754, + "step": 1433 + }, + { + "epoch": 0.076931330472103, + "grad_norm": 1.1474682092666626, + "learning_rate": 4.97161891957084e-06, + "loss": 2.451, + "step": 1434 + }, + { + "epoch": 0.07698497854077253, + "grad_norm": 0.9977197647094727, + "learning_rate": 4.971553612076495e-06, + "loss": 2.373, + "step": 1435 + }, + { + "epoch": 0.07703862660944207, + "grad_norm": 1.1065374612808228, + "learning_rate": 4.9714882299591125e-06, + "loss": 2.2546, + "step": 1436 + }, + { + "epoch": 0.07709227467811158, + "grad_norm": 1.0937299728393555, + "learning_rate": 4.971422773220666e-06, + "loss": 2.3658, + "step": 1437 + }, + { + "epoch": 0.07714592274678111, + "grad_norm": 0.9734066128730774, + "learning_rate": 4.971357241863131e-06, + "loss": 1.9396, + "step": 1438 + }, + { + "epoch": 0.07719957081545065, + "grad_norm": 0.8513569831848145, + "learning_rate": 4.971291635888487e-06, + "loss": 2.3663, + "step": 1439 + }, + { + "epoch": 0.07725321888412018, + "grad_norm": 1.2299129962921143, + "learning_rate": 4.971225955298714e-06, + "loss": 2.3839, + "step": 1440 + }, + { + "epoch": 0.0773068669527897, + "grad_norm": 1.3104857206344604, + "learning_rate": 4.9711602000957964e-06, + "loss": 2.3399, + "step": 1441 + }, + { + "epoch": 0.07736051502145923, + "grad_norm": 0.9283754825592041, + "learning_rate": 4.971094370281718e-06, + "loss": 2.2779, + "step": 1442 + }, + { + "epoch": 0.07741416309012876, + "grad_norm": 1.1517002582550049, + "learning_rate": 4.971028465858468e-06, + "loss": 2.4216, + "step": 1443 + }, + { + "epoch": 0.07746781115879828, + "grad_norm": 1.0854661464691162, + "learning_rate": 4.970962486828034e-06, + "loss": 2.2931, + "step": 1444 + }, + { + "epoch": 0.07752145922746781, + "grad_norm": 1.1306616067886353, + "learning_rate": 4.9708964331924105e-06, + "loss": 2.4358, + "step": 1445 + }, + { + "epoch": 0.07757510729613734, + "grad_norm": 1.0557206869125366, + "learning_rate": 4.97083030495359e-06, + "loss": 2.419, + "step": 1446 + }, + { + "epoch": 0.07762875536480687, + "grad_norm": 1.566414475440979, + "learning_rate": 4.97076410211357e-06, + "loss": 1.942, + "step": 1447 + }, + { + "epoch": 0.07768240343347639, + "grad_norm": 7.868310928344727, + "learning_rate": 4.9706978246743495e-06, + "loss": 2.4146, + "step": 1448 + }, + { + "epoch": 0.07773605150214592, + "grad_norm": 0.9003645777702332, + "learning_rate": 4.970631472637929e-06, + "loss": 2.0977, + "step": 1449 + }, + { + "epoch": 0.07778969957081545, + "grad_norm": 0.9081676006317139, + "learning_rate": 4.970565046006312e-06, + "loss": 2.4292, + "step": 1450 + }, + { + "epoch": 0.07784334763948497, + "grad_norm": 0.9508426785469055, + "learning_rate": 4.970498544781505e-06, + "loss": 2.1994, + "step": 1451 + }, + { + "epoch": 0.0778969957081545, + "grad_norm": 1.0169605016708374, + "learning_rate": 4.970431968965515e-06, + "loss": 2.6188, + "step": 1452 + }, + { + "epoch": 0.07795064377682404, + "grad_norm": 1.2397761344909668, + "learning_rate": 4.970365318560351e-06, + "loss": 2.2672, + "step": 1453 + }, + { + "epoch": 0.07800429184549357, + "grad_norm": 1.0348191261291504, + "learning_rate": 4.970298593568027e-06, + "loss": 2.3372, + "step": 1454 + }, + { + "epoch": 0.07805793991416309, + "grad_norm": 1.3939028978347778, + "learning_rate": 4.9702317939905575e-06, + "loss": 1.3533, + "step": 1455 + }, + { + "epoch": 0.07811158798283262, + "grad_norm": 1.0024003982543945, + "learning_rate": 4.97016491982996e-06, + "loss": 2.2791, + "step": 1456 + }, + { + "epoch": 0.07816523605150215, + "grad_norm": 1.1704720258712769, + "learning_rate": 4.970097971088251e-06, + "loss": 2.3451, + "step": 1457 + }, + { + "epoch": 0.07821888412017168, + "grad_norm": 1.1526343822479248, + "learning_rate": 4.970030947767455e-06, + "loss": 1.9494, + "step": 1458 + }, + { + "epoch": 0.0782725321888412, + "grad_norm": 3.485053539276123, + "learning_rate": 4.969963849869593e-06, + "loss": 1.8429, + "step": 1459 + }, + { + "epoch": 0.07832618025751073, + "grad_norm": 1.1040962934494019, + "learning_rate": 4.969896677396693e-06, + "loss": 2.4239, + "step": 1460 + }, + { + "epoch": 0.07837982832618026, + "grad_norm": 1.2002966403961182, + "learning_rate": 4.969829430350781e-06, + "loss": 2.3071, + "step": 1461 + }, + { + "epoch": 0.07843347639484978, + "grad_norm": 1.0308129787445068, + "learning_rate": 4.969762108733889e-06, + "loss": 2.3301, + "step": 1462 + }, + { + "epoch": 0.07848712446351931, + "grad_norm": 1.0292377471923828, + "learning_rate": 4.969694712548049e-06, + "loss": 2.3967, + "step": 1463 + }, + { + "epoch": 0.07854077253218884, + "grad_norm": 1.1803817749023438, + "learning_rate": 4.969627241795297e-06, + "loss": 2.4276, + "step": 1464 + }, + { + "epoch": 0.07859442060085838, + "grad_norm": 0.984775960445404, + "learning_rate": 4.969559696477668e-06, + "loss": 2.4163, + "step": 1465 + }, + { + "epoch": 0.07864806866952789, + "grad_norm": 1.029131531715393, + "learning_rate": 4.969492076597203e-06, + "loss": 2.2875, + "step": 1466 + }, + { + "epoch": 0.07870171673819742, + "grad_norm": 1.2231156826019287, + "learning_rate": 4.969424382155943e-06, + "loss": 2.4018, + "step": 1467 + }, + { + "epoch": 0.07875536480686696, + "grad_norm": 1.2237415313720703, + "learning_rate": 4.969356613155932e-06, + "loss": 2.2569, + "step": 1468 + }, + { + "epoch": 0.07880901287553647, + "grad_norm": 1.0500431060791016, + "learning_rate": 4.969288769599217e-06, + "loss": 2.2986, + "step": 1469 + }, + { + "epoch": 0.078862660944206, + "grad_norm": 0.9005617499351501, + "learning_rate": 4.9692208514878445e-06, + "loss": 2.0929, + "step": 1470 + }, + { + "epoch": 0.07891630901287554, + "grad_norm": 0.9856646656990051, + "learning_rate": 4.969152858823867e-06, + "loss": 2.1493, + "step": 1471 + }, + { + "epoch": 0.07896995708154507, + "grad_norm": 1.258678674697876, + "learning_rate": 4.969084791609336e-06, + "loss": 2.4362, + "step": 1472 + }, + { + "epoch": 0.07902360515021459, + "grad_norm": 1.1616809368133545, + "learning_rate": 4.969016649846308e-06, + "loss": 2.0765, + "step": 1473 + }, + { + "epoch": 0.07907725321888412, + "grad_norm": 1.0906788110733032, + "learning_rate": 4.968948433536839e-06, + "loss": 2.4387, + "step": 1474 + }, + { + "epoch": 0.07913090128755365, + "grad_norm": 1.141611933708191, + "learning_rate": 4.968880142682988e-06, + "loss": 2.3865, + "step": 1475 + }, + { + "epoch": 0.07918454935622318, + "grad_norm": 1.0800232887268066, + "learning_rate": 4.96881177728682e-06, + "loss": 2.2554, + "step": 1476 + }, + { + "epoch": 0.0792381974248927, + "grad_norm": 1.5412439107894897, + "learning_rate": 4.968743337350397e-06, + "loss": 2.2166, + "step": 1477 + }, + { + "epoch": 0.07929184549356223, + "grad_norm": 1.1633332967758179, + "learning_rate": 4.968674822875785e-06, + "loss": 2.3943, + "step": 1478 + }, + { + "epoch": 0.07934549356223176, + "grad_norm": 1.0530400276184082, + "learning_rate": 4.968606233865054e-06, + "loss": 2.4994, + "step": 1479 + }, + { + "epoch": 0.07939914163090128, + "grad_norm": 1.0710266828536987, + "learning_rate": 4.968537570320274e-06, + "loss": 2.1843, + "step": 1480 + }, + { + "epoch": 0.07945278969957081, + "grad_norm": 0.9292663335800171, + "learning_rate": 4.968468832243518e-06, + "loss": 2.3688, + "step": 1481 + }, + { + "epoch": 0.07950643776824035, + "grad_norm": 0.9833860993385315, + "learning_rate": 4.968400019636862e-06, + "loss": 2.1584, + "step": 1482 + }, + { + "epoch": 0.07956008583690988, + "grad_norm": 1.1426249742507935, + "learning_rate": 4.968331132502383e-06, + "loss": 2.1803, + "step": 1483 + }, + { + "epoch": 0.0796137339055794, + "grad_norm": 1.9897398948669434, + "learning_rate": 4.968262170842162e-06, + "loss": 2.1976, + "step": 1484 + }, + { + "epoch": 0.07966738197424893, + "grad_norm": 1.028342604637146, + "learning_rate": 4.968193134658279e-06, + "loss": 2.245, + "step": 1485 + }, + { + "epoch": 0.07972103004291846, + "grad_norm": 1.072307825088501, + "learning_rate": 4.968124023952822e-06, + "loss": 2.189, + "step": 1486 + }, + { + "epoch": 0.07977467811158798, + "grad_norm": 1.0869473218917847, + "learning_rate": 4.968054838727874e-06, + "loss": 2.2567, + "step": 1487 + }, + { + "epoch": 0.07982832618025751, + "grad_norm": 1.1065728664398193, + "learning_rate": 4.967985578985525e-06, + "loss": 2.2626, + "step": 1488 + }, + { + "epoch": 0.07988197424892704, + "grad_norm": 1.05400550365448, + "learning_rate": 4.967916244727868e-06, + "loss": 2.3589, + "step": 1489 + }, + { + "epoch": 0.07993562231759657, + "grad_norm": 1.0734738111495972, + "learning_rate": 4.967846835956993e-06, + "loss": 2.0856, + "step": 1490 + }, + { + "epoch": 0.07998927038626609, + "grad_norm": 1.056257724761963, + "learning_rate": 4.967777352674999e-06, + "loss": 2.3669, + "step": 1491 + }, + { + "epoch": 0.08004291845493562, + "grad_norm": 1.1005325317382812, + "learning_rate": 4.967707794883982e-06, + "loss": 2.3447, + "step": 1492 + }, + { + "epoch": 0.08009656652360515, + "grad_norm": 0.9405511617660522, + "learning_rate": 4.9676381625860424e-06, + "loss": 2.2176, + "step": 1493 + }, + { + "epoch": 0.08015021459227469, + "grad_norm": 1.039394497871399, + "learning_rate": 4.967568455783283e-06, + "loss": 2.3169, + "step": 1494 + }, + { + "epoch": 0.0802038626609442, + "grad_norm": 1.0795094966888428, + "learning_rate": 4.967498674477807e-06, + "loss": 2.3251, + "step": 1495 + }, + { + "epoch": 0.08025751072961373, + "grad_norm": 0.9734029173851013, + "learning_rate": 4.9674288186717244e-06, + "loss": 2.2366, + "step": 1496 + }, + { + "epoch": 0.08031115879828327, + "grad_norm": 0.8638395071029663, + "learning_rate": 4.967358888367141e-06, + "loss": 2.3378, + "step": 1497 + }, + { + "epoch": 0.08036480686695278, + "grad_norm": 1.0712440013885498, + "learning_rate": 4.967288883566171e-06, + "loss": 2.465, + "step": 1498 + }, + { + "epoch": 0.08041845493562232, + "grad_norm": 1.140532374382019, + "learning_rate": 4.967218804270926e-06, + "loss": 2.4646, + "step": 1499 + }, + { + "epoch": 0.08047210300429185, + "grad_norm": 1.0432919263839722, + "learning_rate": 4.967148650483522e-06, + "loss": 2.5469, + "step": 1500 + }, + { + "epoch": 0.08052575107296138, + "grad_norm": 1.0415621995925903, + "learning_rate": 4.967078422206077e-06, + "loss": 2.1109, + "step": 1501 + }, + { + "epoch": 0.0805793991416309, + "grad_norm": 1.091618537902832, + "learning_rate": 4.967008119440714e-06, + "loss": 2.1967, + "step": 1502 + }, + { + "epoch": 0.08063304721030043, + "grad_norm": 1.1757445335388184, + "learning_rate": 4.966937742189553e-06, + "loss": 2.3264, + "step": 1503 + }, + { + "epoch": 0.08068669527896996, + "grad_norm": 1.1083757877349854, + "learning_rate": 4.966867290454719e-06, + "loss": 2.5584, + "step": 1504 + }, + { + "epoch": 0.08074034334763948, + "grad_norm": 0.9487075209617615, + "learning_rate": 4.96679676423834e-06, + "loss": 2.2323, + "step": 1505 + }, + { + "epoch": 0.08079399141630901, + "grad_norm": 0.9605690240859985, + "learning_rate": 4.966726163542545e-06, + "loss": 2.2459, + "step": 1506 + }, + { + "epoch": 0.08084763948497854, + "grad_norm": 1.1166452169418335, + "learning_rate": 4.966655488369466e-06, + "loss": 2.2793, + "step": 1507 + }, + { + "epoch": 0.08090128755364807, + "grad_norm": 0.9237282872200012, + "learning_rate": 4.966584738721236e-06, + "loss": 2.4728, + "step": 1508 + }, + { + "epoch": 0.08095493562231759, + "grad_norm": 1.134324312210083, + "learning_rate": 4.966513914599993e-06, + "loss": 2.1329, + "step": 1509 + }, + { + "epoch": 0.08100858369098712, + "grad_norm": 1.199369192123413, + "learning_rate": 4.966443016007873e-06, + "loss": 2.3196, + "step": 1510 + }, + { + "epoch": 0.08106223175965666, + "grad_norm": 1.2491835355758667, + "learning_rate": 4.966372042947018e-06, + "loss": 2.4594, + "step": 1511 + }, + { + "epoch": 0.08111587982832617, + "grad_norm": 2.467543125152588, + "learning_rate": 4.966300995419571e-06, + "loss": 2.2045, + "step": 1512 + }, + { + "epoch": 0.0811695278969957, + "grad_norm": 1.171787977218628, + "learning_rate": 4.966229873427676e-06, + "loss": 2.3926, + "step": 1513 + }, + { + "epoch": 0.08122317596566524, + "grad_norm": 0.9942606687545776, + "learning_rate": 4.966158676973482e-06, + "loss": 2.4775, + "step": 1514 + }, + { + "epoch": 0.08127682403433477, + "grad_norm": 1.0077043771743774, + "learning_rate": 4.9660874060591365e-06, + "loss": 2.4135, + "step": 1515 + }, + { + "epoch": 0.08133047210300429, + "grad_norm": 1.1150081157684326, + "learning_rate": 4.966016060686794e-06, + "loss": 2.1191, + "step": 1516 + }, + { + "epoch": 0.08138412017167382, + "grad_norm": 3.168137311935425, + "learning_rate": 4.965944640858606e-06, + "loss": 2.3586, + "step": 1517 + }, + { + "epoch": 0.08143776824034335, + "grad_norm": 1.1107186079025269, + "learning_rate": 4.96587314657673e-06, + "loss": 2.3359, + "step": 1518 + }, + { + "epoch": 0.08149141630901288, + "grad_norm": 1.3944618701934814, + "learning_rate": 4.9658015778433256e-06, + "loss": 2.2309, + "step": 1519 + }, + { + "epoch": 0.0815450643776824, + "grad_norm": 1.0401256084442139, + "learning_rate": 4.965729934660553e-06, + "loss": 2.3432, + "step": 1520 + }, + { + "epoch": 0.08159871244635193, + "grad_norm": 0.9296973943710327, + "learning_rate": 4.965658217030574e-06, + "loss": 2.2907, + "step": 1521 + }, + { + "epoch": 0.08165236051502146, + "grad_norm": 0.9325800538063049, + "learning_rate": 4.965586424955555e-06, + "loss": 2.2639, + "step": 1522 + }, + { + "epoch": 0.08170600858369098, + "grad_norm": 1.0755035877227783, + "learning_rate": 4.965514558437664e-06, + "loss": 2.4326, + "step": 1523 + }, + { + "epoch": 0.08175965665236051, + "grad_norm": 1.4959969520568848, + "learning_rate": 4.965442617479071e-06, + "loss": 2.0137, + "step": 1524 + }, + { + "epoch": 0.08181330472103004, + "grad_norm": 1.0266350507736206, + "learning_rate": 4.965370602081946e-06, + "loss": 2.2494, + "step": 1525 + }, + { + "epoch": 0.08186695278969958, + "grad_norm": 0.9980213642120361, + "learning_rate": 4.965298512248466e-06, + "loss": 2.4622, + "step": 1526 + }, + { + "epoch": 0.0819206008583691, + "grad_norm": 1.038623332977295, + "learning_rate": 4.965226347980806e-06, + "loss": 2.3288, + "step": 1527 + }, + { + "epoch": 0.08197424892703863, + "grad_norm": 1.8606276512145996, + "learning_rate": 4.965154109281145e-06, + "loss": 2.3771, + "step": 1528 + }, + { + "epoch": 0.08202789699570816, + "grad_norm": 1.4848978519439697, + "learning_rate": 4.965081796151665e-06, + "loss": 2.2242, + "step": 1529 + }, + { + "epoch": 0.08208154506437768, + "grad_norm": 0.9233113527297974, + "learning_rate": 4.965009408594549e-06, + "loss": 2.2002, + "step": 1530 + }, + { + "epoch": 0.08213519313304721, + "grad_norm": 1.1974916458129883, + "learning_rate": 4.964936946611983e-06, + "loss": 2.6403, + "step": 1531 + }, + { + "epoch": 0.08218884120171674, + "grad_norm": 1.0628743171691895, + "learning_rate": 4.964864410206152e-06, + "loss": 2.4027, + "step": 1532 + }, + { + "epoch": 0.08224248927038627, + "grad_norm": 0.9386650323867798, + "learning_rate": 4.9647917993792496e-06, + "loss": 2.25, + "step": 1533 + }, + { + "epoch": 0.08229613733905579, + "grad_norm": 1.109528660774231, + "learning_rate": 4.964719114133466e-06, + "loss": 2.325, + "step": 1534 + }, + { + "epoch": 0.08234978540772532, + "grad_norm": 0.9987028241157532, + "learning_rate": 4.964646354470997e-06, + "loss": 2.1007, + "step": 1535 + }, + { + "epoch": 0.08240343347639485, + "grad_norm": 1.5527077913284302, + "learning_rate": 4.964573520394038e-06, + "loss": 2.3758, + "step": 1536 + }, + { + "epoch": 0.08245708154506438, + "grad_norm": 1.2129688262939453, + "learning_rate": 4.964500611904791e-06, + "loss": 2.2112, + "step": 1537 + }, + { + "epoch": 0.0825107296137339, + "grad_norm": 1.1084346771240234, + "learning_rate": 4.964427629005454e-06, + "loss": 2.28, + "step": 1538 + }, + { + "epoch": 0.08256437768240343, + "grad_norm": 0.9709970951080322, + "learning_rate": 4.964354571698231e-06, + "loss": 2.2874, + "step": 1539 + }, + { + "epoch": 0.08261802575107297, + "grad_norm": 1.2608736753463745, + "learning_rate": 4.96428143998533e-06, + "loss": 1.2996, + "step": 1540 + }, + { + "epoch": 0.08267167381974248, + "grad_norm": 1.2486788034439087, + "learning_rate": 4.964208233868957e-06, + "loss": 2.2147, + "step": 1541 + }, + { + "epoch": 0.08272532188841202, + "grad_norm": 1.1228344440460205, + "learning_rate": 4.9641349533513235e-06, + "loss": 2.3057, + "step": 1542 + }, + { + "epoch": 0.08277896995708155, + "grad_norm": 1.055301308631897, + "learning_rate": 4.964061598434641e-06, + "loss": 2.3837, + "step": 1543 + }, + { + "epoch": 0.08283261802575108, + "grad_norm": 0.8575541377067566, + "learning_rate": 4.963988169121125e-06, + "loss": 2.237, + "step": 1544 + }, + { + "epoch": 0.0828862660944206, + "grad_norm": 1.2017253637313843, + "learning_rate": 4.963914665412992e-06, + "loss": 2.4651, + "step": 1545 + }, + { + "epoch": 0.08293991416309013, + "grad_norm": 1.2050693035125732, + "learning_rate": 4.963841087312462e-06, + "loss": 2.6758, + "step": 1546 + }, + { + "epoch": 0.08299356223175966, + "grad_norm": 1.0124294757843018, + "learning_rate": 4.963767434821757e-06, + "loss": 2.5102, + "step": 1547 + }, + { + "epoch": 0.08304721030042918, + "grad_norm": 1.078139305114746, + "learning_rate": 4.963693707943099e-06, + "loss": 2.4136, + "step": 1548 + }, + { + "epoch": 0.08310085836909871, + "grad_norm": 1.0511324405670166, + "learning_rate": 4.963619906678715e-06, + "loss": 2.1272, + "step": 1549 + }, + { + "epoch": 0.08315450643776824, + "grad_norm": 1.0642032623291016, + "learning_rate": 4.9635460310308335e-06, + "loss": 2.1734, + "step": 1550 + }, + { + "epoch": 0.08320815450643777, + "grad_norm": 0.9766706228256226, + "learning_rate": 4.963472081001684e-06, + "loss": 2.4462, + "step": 1551 + }, + { + "epoch": 0.08326180257510729, + "grad_norm": 1.5284464359283447, + "learning_rate": 4.9633980565935e-06, + "loss": 1.8817, + "step": 1552 + }, + { + "epoch": 0.08331545064377682, + "grad_norm": 0.9357942938804626, + "learning_rate": 4.9633239578085175e-06, + "loss": 2.0977, + "step": 1553 + }, + { + "epoch": 0.08336909871244635, + "grad_norm": 1.0891377925872803, + "learning_rate": 4.963249784648972e-06, + "loss": 2.2126, + "step": 1554 + }, + { + "epoch": 0.08342274678111589, + "grad_norm": 1.7222974300384521, + "learning_rate": 4.963175537117104e-06, + "loss": 2.5116, + "step": 1555 + }, + { + "epoch": 0.0834763948497854, + "grad_norm": 0.9710643887519836, + "learning_rate": 4.963101215215155e-06, + "loss": 2.3834, + "step": 1556 + }, + { + "epoch": 0.08353004291845494, + "grad_norm": 2.442746877670288, + "learning_rate": 4.963026818945369e-06, + "loss": 2.5579, + "step": 1557 + }, + { + "epoch": 0.08358369098712447, + "grad_norm": 1.247110366821289, + "learning_rate": 4.962952348309991e-06, + "loss": 2.4129, + "step": 1558 + }, + { + "epoch": 0.08363733905579399, + "grad_norm": 1.1250485181808472, + "learning_rate": 4.9628778033112715e-06, + "loss": 2.4015, + "step": 1559 + }, + { + "epoch": 0.08369098712446352, + "grad_norm": 1.0553501844406128, + "learning_rate": 4.962803183951461e-06, + "loss": 2.4529, + "step": 1560 + }, + { + "epoch": 0.08374463519313305, + "grad_norm": 1.24652099609375, + "learning_rate": 4.962728490232811e-06, + "loss": 2.2815, + "step": 1561 + }, + { + "epoch": 0.08379828326180258, + "grad_norm": 1.631174087524414, + "learning_rate": 4.962653722157577e-06, + "loss": 2.1128, + "step": 1562 + }, + { + "epoch": 0.0838519313304721, + "grad_norm": 1.0755717754364014, + "learning_rate": 4.962578879728018e-06, + "loss": 2.4024, + "step": 1563 + }, + { + "epoch": 0.08390557939914163, + "grad_norm": 0.9437095522880554, + "learning_rate": 4.962503962946393e-06, + "loss": 2.2104, + "step": 1564 + }, + { + "epoch": 0.08395922746781116, + "grad_norm": 1.0856975317001343, + "learning_rate": 4.962428971814963e-06, + "loss": 2.3054, + "step": 1565 + }, + { + "epoch": 0.08401287553648068, + "grad_norm": 0.9859164953231812, + "learning_rate": 4.962353906335993e-06, + "loss": 2.1173, + "step": 1566 + }, + { + "epoch": 0.08406652360515021, + "grad_norm": 1.337965965270996, + "learning_rate": 4.962278766511749e-06, + "loss": 2.3045, + "step": 1567 + }, + { + "epoch": 0.08412017167381974, + "grad_norm": 1.0456466674804688, + "learning_rate": 4.9622035523445e-06, + "loss": 2.6629, + "step": 1568 + }, + { + "epoch": 0.08417381974248928, + "grad_norm": 5.553250312805176, + "learning_rate": 4.962128263836518e-06, + "loss": 2.2184, + "step": 1569 + }, + { + "epoch": 0.0842274678111588, + "grad_norm": 1.127160906791687, + "learning_rate": 4.962052900990073e-06, + "loss": 2.4818, + "step": 1570 + }, + { + "epoch": 0.08428111587982832, + "grad_norm": 1.0883961915969849, + "learning_rate": 4.961977463807444e-06, + "loss": 2.3773, + "step": 1571 + }, + { + "epoch": 0.08433476394849786, + "grad_norm": 0.972382128238678, + "learning_rate": 4.961901952290906e-06, + "loss": 2.2293, + "step": 1572 + }, + { + "epoch": 0.08438841201716739, + "grad_norm": 0.8943543434143066, + "learning_rate": 4.9618263664427405e-06, + "loss": 1.9719, + "step": 1573 + }, + { + "epoch": 0.0844420600858369, + "grad_norm": 1.0477933883666992, + "learning_rate": 4.961750706265229e-06, + "loss": 2.3782, + "step": 1574 + }, + { + "epoch": 0.08449570815450644, + "grad_norm": 1.7300524711608887, + "learning_rate": 4.961674971760657e-06, + "loss": 2.4484, + "step": 1575 + }, + { + "epoch": 0.08454935622317597, + "grad_norm": 1.2619285583496094, + "learning_rate": 4.9615991629313095e-06, + "loss": 2.431, + "step": 1576 + }, + { + "epoch": 0.08460300429184549, + "grad_norm": 1.0125261545181274, + "learning_rate": 4.961523279779475e-06, + "loss": 2.2616, + "step": 1577 + }, + { + "epoch": 0.08465665236051502, + "grad_norm": 1.1749234199523926, + "learning_rate": 4.9614473223074475e-06, + "loss": 2.3122, + "step": 1578 + }, + { + "epoch": 0.08471030042918455, + "grad_norm": 0.9323041439056396, + "learning_rate": 4.961371290517518e-06, + "loss": 1.7698, + "step": 1579 + }, + { + "epoch": 0.08476394849785408, + "grad_norm": 1.215304970741272, + "learning_rate": 4.961295184411981e-06, + "loss": 2.2894, + "step": 1580 + }, + { + "epoch": 0.0848175965665236, + "grad_norm": 1.7376598119735718, + "learning_rate": 4.961219003993138e-06, + "loss": 2.1761, + "step": 1581 + }, + { + "epoch": 0.08487124463519313, + "grad_norm": 1.0124722719192505, + "learning_rate": 4.961142749263287e-06, + "loss": 2.0947, + "step": 1582 + }, + { + "epoch": 0.08492489270386266, + "grad_norm": 1.1788444519042969, + "learning_rate": 4.961066420224729e-06, + "loss": 2.3621, + "step": 1583 + }, + { + "epoch": 0.08497854077253218, + "grad_norm": 1.0883941650390625, + "learning_rate": 4.960990016879771e-06, + "loss": 2.6198, + "step": 1584 + }, + { + "epoch": 0.08503218884120171, + "grad_norm": 1.1471115350723267, + "learning_rate": 4.960913539230719e-06, + "loss": 2.3472, + "step": 1585 + }, + { + "epoch": 0.08508583690987125, + "grad_norm": 1.0778883695602417, + "learning_rate": 4.960836987279881e-06, + "loss": 2.5777, + "step": 1586 + }, + { + "epoch": 0.08513948497854078, + "grad_norm": 1.1139392852783203, + "learning_rate": 4.9607603610295704e-06, + "loss": 2.3872, + "step": 1587 + }, + { + "epoch": 0.0851931330472103, + "grad_norm": 0.9776027202606201, + "learning_rate": 4.960683660482099e-06, + "loss": 2.1972, + "step": 1588 + }, + { + "epoch": 0.08524678111587983, + "grad_norm": 1.1770435571670532, + "learning_rate": 4.960606885639784e-06, + "loss": 2.5403, + "step": 1589 + }, + { + "epoch": 0.08530042918454936, + "grad_norm": 0.9870885610580444, + "learning_rate": 4.960530036504942e-06, + "loss": 2.206, + "step": 1590 + }, + { + "epoch": 0.08535407725321889, + "grad_norm": 1.0582643747329712, + "learning_rate": 4.960453113079894e-06, + "loss": 2.373, + "step": 1591 + }, + { + "epoch": 0.08540772532188841, + "grad_norm": 1.0486334562301636, + "learning_rate": 4.9603761153669625e-06, + "loss": 2.4604, + "step": 1592 + }, + { + "epoch": 0.08546137339055794, + "grad_norm": 1.0280712842941284, + "learning_rate": 4.960299043368472e-06, + "loss": 2.2057, + "step": 1593 + }, + { + "epoch": 0.08551502145922747, + "grad_norm": 1.0404623746871948, + "learning_rate": 4.96022189708675e-06, + "loss": 2.3821, + "step": 1594 + }, + { + "epoch": 0.08556866952789699, + "grad_norm": 1.085313320159912, + "learning_rate": 4.9601446765241245e-06, + "loss": 2.1799, + "step": 1595 + }, + { + "epoch": 0.08562231759656652, + "grad_norm": 1.0947303771972656, + "learning_rate": 4.960067381682929e-06, + "loss": 2.2951, + "step": 1596 + }, + { + "epoch": 0.08567596566523605, + "grad_norm": 1.0661115646362305, + "learning_rate": 4.959990012565497e-06, + "loss": 2.2234, + "step": 1597 + }, + { + "epoch": 0.08572961373390559, + "grad_norm": 2.8304338455200195, + "learning_rate": 4.959912569174163e-06, + "loss": 2.4343, + "step": 1598 + }, + { + "epoch": 0.0857832618025751, + "grad_norm": 1.0539566278457642, + "learning_rate": 4.959835051511265e-06, + "loss": 2.3118, + "step": 1599 + }, + { + "epoch": 0.08583690987124463, + "grad_norm": 0.9918525815010071, + "learning_rate": 4.9597574595791455e-06, + "loss": 2.3641, + "step": 1600 + }, + { + "epoch": 0.08589055793991417, + "grad_norm": 1.1240812540054321, + "learning_rate": 4.959679793380146e-06, + "loss": 2.0454, + "step": 1601 + }, + { + "epoch": 0.08594420600858368, + "grad_norm": 2.01190185546875, + "learning_rate": 4.959602052916612e-06, + "loss": 2.2489, + "step": 1602 + }, + { + "epoch": 0.08599785407725322, + "grad_norm": 1.1215431690216064, + "learning_rate": 4.95952423819089e-06, + "loss": 2.3604, + "step": 1603 + }, + { + "epoch": 0.08605150214592275, + "grad_norm": 1.0536242723464966, + "learning_rate": 4.959446349205329e-06, + "loss": 2.3864, + "step": 1604 + }, + { + "epoch": 0.08610515021459228, + "grad_norm": 10.067190170288086, + "learning_rate": 4.959368385962282e-06, + "loss": 2.4603, + "step": 1605 + }, + { + "epoch": 0.0861587982832618, + "grad_norm": 1.2379369735717773, + "learning_rate": 4.959290348464103e-06, + "loss": 1.5238, + "step": 1606 + }, + { + "epoch": 0.08621244635193133, + "grad_norm": 1.1681138277053833, + "learning_rate": 4.959212236713147e-06, + "loss": 1.8978, + "step": 1607 + }, + { + "epoch": 0.08626609442060086, + "grad_norm": 0.9545812606811523, + "learning_rate": 4.959134050711774e-06, + "loss": 2.1748, + "step": 1608 + }, + { + "epoch": 0.08631974248927039, + "grad_norm": 0.9710696935653687, + "learning_rate": 4.959055790462342e-06, + "loss": 2.343, + "step": 1609 + }, + { + "epoch": 0.08637339055793991, + "grad_norm": 1.082421064376831, + "learning_rate": 4.958977455967216e-06, + "loss": 2.3228, + "step": 1610 + }, + { + "epoch": 0.08642703862660944, + "grad_norm": 1.0215240716934204, + "learning_rate": 4.958899047228761e-06, + "loss": 2.2326, + "step": 1611 + }, + { + "epoch": 0.08648068669527897, + "grad_norm": 1.154728889465332, + "learning_rate": 4.958820564249344e-06, + "loss": 2.3295, + "step": 1612 + }, + { + "epoch": 0.08653433476394849, + "grad_norm": 1.0604885816574097, + "learning_rate": 4.958742007031334e-06, + "loss": 2.5871, + "step": 1613 + }, + { + "epoch": 0.08658798283261802, + "grad_norm": 1.0388280153274536, + "learning_rate": 4.958663375577104e-06, + "loss": 2.1844, + "step": 1614 + }, + { + "epoch": 0.08664163090128756, + "grad_norm": 1.0665380954742432, + "learning_rate": 4.958584669889028e-06, + "loss": 2.4078, + "step": 1615 + }, + { + "epoch": 0.08669527896995709, + "grad_norm": 1.0188113451004028, + "learning_rate": 4.9585058899694814e-06, + "loss": 2.3011, + "step": 1616 + }, + { + "epoch": 0.0867489270386266, + "grad_norm": 0.9401970505714417, + "learning_rate": 4.958427035820843e-06, + "loss": 2.3958, + "step": 1617 + }, + { + "epoch": 0.08680257510729614, + "grad_norm": 1.1738519668579102, + "learning_rate": 4.958348107445494e-06, + "loss": 2.3002, + "step": 1618 + }, + { + "epoch": 0.08685622317596567, + "grad_norm": 1.0160599946975708, + "learning_rate": 4.958269104845818e-06, + "loss": 2.3413, + "step": 1619 + }, + { + "epoch": 0.08690987124463519, + "grad_norm": 1.0429478883743286, + "learning_rate": 4.958190028024199e-06, + "loss": 2.2245, + "step": 1620 + }, + { + "epoch": 0.08696351931330472, + "grad_norm": 0.7835602164268494, + "learning_rate": 4.958110876983026e-06, + "loss": 1.6608, + "step": 1621 + }, + { + "epoch": 0.08701716738197425, + "grad_norm": 1.061514973640442, + "learning_rate": 4.958031651724687e-06, + "loss": 2.3192, + "step": 1622 + }, + { + "epoch": 0.08707081545064378, + "grad_norm": 1.1347829103469849, + "learning_rate": 4.957952352251576e-06, + "loss": 2.3736, + "step": 1623 + }, + { + "epoch": 0.0871244635193133, + "grad_norm": 1.0647541284561157, + "learning_rate": 4.957872978566087e-06, + "loss": 2.1625, + "step": 1624 + }, + { + "epoch": 0.08717811158798283, + "grad_norm": 0.9151790738105774, + "learning_rate": 4.957793530670615e-06, + "loss": 2.2361, + "step": 1625 + }, + { + "epoch": 0.08723175965665236, + "grad_norm": 1.250340461730957, + "learning_rate": 4.957714008567559e-06, + "loss": 2.3458, + "step": 1626 + }, + { + "epoch": 0.0872854077253219, + "grad_norm": 1.1014546155929565, + "learning_rate": 4.957634412259321e-06, + "loss": 2.6657, + "step": 1627 + }, + { + "epoch": 0.08733905579399141, + "grad_norm": 0.9861351847648621, + "learning_rate": 4.957554741748305e-06, + "loss": 2.4369, + "step": 1628 + }, + { + "epoch": 0.08739270386266094, + "grad_norm": 0.9996386766433716, + "learning_rate": 4.957474997036914e-06, + "loss": 2.5287, + "step": 1629 + }, + { + "epoch": 0.08744635193133048, + "grad_norm": 0.9537687301635742, + "learning_rate": 4.9573951781275575e-06, + "loss": 2.348, + "step": 1630 + }, + { + "epoch": 0.0875, + "grad_norm": 1.0618668794631958, + "learning_rate": 4.957315285022645e-06, + "loss": 2.0845, + "step": 1631 + }, + { + "epoch": 0.08755364806866953, + "grad_norm": 1.057807207107544, + "learning_rate": 4.957235317724588e-06, + "loss": 2.1804, + "step": 1632 + }, + { + "epoch": 0.08760729613733906, + "grad_norm": 0.9793893694877625, + "learning_rate": 4.957155276235802e-06, + "loss": 2.1999, + "step": 1633 + }, + { + "epoch": 0.08766094420600859, + "grad_norm": 1.0972487926483154, + "learning_rate": 4.957075160558704e-06, + "loss": 2.3085, + "step": 1634 + }, + { + "epoch": 0.08771459227467811, + "grad_norm": 2.310936689376831, + "learning_rate": 4.956994970695712e-06, + "loss": 2.2562, + "step": 1635 + }, + { + "epoch": 0.08776824034334764, + "grad_norm": 1.8515785932540894, + "learning_rate": 4.956914706649246e-06, + "loss": 2.3215, + "step": 1636 + }, + { + "epoch": 0.08782188841201717, + "grad_norm": 0.9569388628005981, + "learning_rate": 4.9568343684217325e-06, + "loss": 2.2949, + "step": 1637 + }, + { + "epoch": 0.08787553648068669, + "grad_norm": 1.075890302658081, + "learning_rate": 4.956753956015594e-06, + "loss": 1.5326, + "step": 1638 + }, + { + "epoch": 0.08792918454935622, + "grad_norm": 0.9892435669898987, + "learning_rate": 4.9566734694332604e-06, + "loss": 2.3118, + "step": 1639 + }, + { + "epoch": 0.08798283261802575, + "grad_norm": 1.394315481185913, + "learning_rate": 4.9565929086771616e-06, + "loss": 2.2008, + "step": 1640 + }, + { + "epoch": 0.08803648068669528, + "grad_norm": 1.2070680856704712, + "learning_rate": 4.956512273749728e-06, + "loss": 2.5041, + "step": 1641 + }, + { + "epoch": 0.0880901287553648, + "grad_norm": 1.5568751096725464, + "learning_rate": 4.956431564653398e-06, + "loss": 2.3386, + "step": 1642 + }, + { + "epoch": 0.08814377682403433, + "grad_norm": 0.926261305809021, + "learning_rate": 4.956350781390604e-06, + "loss": 2.2702, + "step": 1643 + }, + { + "epoch": 0.08819742489270387, + "grad_norm": 1.0699083805084229, + "learning_rate": 4.956269923963788e-06, + "loss": 2.4138, + "step": 1644 + }, + { + "epoch": 0.0882510729613734, + "grad_norm": 0.9699431657791138, + "learning_rate": 4.9561889923753906e-06, + "loss": 2.3312, + "step": 1645 + }, + { + "epoch": 0.08830472103004292, + "grad_norm": 1.1991177797317505, + "learning_rate": 4.956107986627855e-06, + "loss": 2.3921, + "step": 1646 + }, + { + "epoch": 0.08835836909871245, + "grad_norm": 1.1849983930587769, + "learning_rate": 4.9560269067236275e-06, + "loss": 1.8947, + "step": 1647 + }, + { + "epoch": 0.08841201716738198, + "grad_norm": 0.9640688896179199, + "learning_rate": 4.9559457526651566e-06, + "loss": 2.193, + "step": 1648 + }, + { + "epoch": 0.0884656652360515, + "grad_norm": 1.068498134613037, + "learning_rate": 4.9558645244548905e-06, + "loss": 2.381, + "step": 1649 + }, + { + "epoch": 0.08851931330472103, + "grad_norm": 0.9630306363105774, + "learning_rate": 4.955783222095284e-06, + "loss": 2.2937, + "step": 1650 + }, + { + "epoch": 0.08857296137339056, + "grad_norm": 0.9491444826126099, + "learning_rate": 4.955701845588791e-06, + "loss": 2.2244, + "step": 1651 + }, + { + "epoch": 0.08862660944206009, + "grad_norm": 1.047086238861084, + "learning_rate": 4.955620394937868e-06, + "loss": 2.5038, + "step": 1652 + }, + { + "epoch": 0.08868025751072961, + "grad_norm": 1.1465414762496948, + "learning_rate": 4.955538870144974e-06, + "loss": 2.1733, + "step": 1653 + }, + { + "epoch": 0.08873390557939914, + "grad_norm": 1.1305711269378662, + "learning_rate": 4.955457271212571e-06, + "loss": 2.2205, + "step": 1654 + }, + { + "epoch": 0.08878755364806867, + "grad_norm": 10.776774406433105, + "learning_rate": 4.955375598143124e-06, + "loss": 2.407, + "step": 1655 + }, + { + "epoch": 0.08884120171673819, + "grad_norm": 1.152079463005066, + "learning_rate": 4.955293850939096e-06, + "loss": 2.2355, + "step": 1656 + }, + { + "epoch": 0.08889484978540772, + "grad_norm": 1.032520055770874, + "learning_rate": 4.955212029602959e-06, + "loss": 2.4639, + "step": 1657 + }, + { + "epoch": 0.08894849785407725, + "grad_norm": 1.1614201068878174, + "learning_rate": 4.955130134137179e-06, + "loss": 2.2825, + "step": 1658 + }, + { + "epoch": 0.08900214592274679, + "grad_norm": 9.3829984664917, + "learning_rate": 4.955048164544232e-06, + "loss": 2.3736, + "step": 1659 + }, + { + "epoch": 0.0890557939914163, + "grad_norm": 0.9842097163200378, + "learning_rate": 4.954966120826592e-06, + "loss": 2.1657, + "step": 1660 + }, + { + "epoch": 0.08910944206008584, + "grad_norm": 1.2840720415115356, + "learning_rate": 4.954884002986737e-06, + "loss": 2.0849, + "step": 1661 + }, + { + "epoch": 0.08916309012875537, + "grad_norm": 1.6244661808013916, + "learning_rate": 4.954801811027143e-06, + "loss": 2.3069, + "step": 1662 + }, + { + "epoch": 0.0892167381974249, + "grad_norm": 1.2511205673217773, + "learning_rate": 4.954719544950295e-06, + "loss": 2.3526, + "step": 1663 + }, + { + "epoch": 0.08927038626609442, + "grad_norm": 1.004133701324463, + "learning_rate": 4.954637204758675e-06, + "loss": 2.2934, + "step": 1664 + }, + { + "epoch": 0.08932403433476395, + "grad_norm": 0.9755863547325134, + "learning_rate": 4.954554790454771e-06, + "loss": 2.3493, + "step": 1665 + }, + { + "epoch": 0.08937768240343348, + "grad_norm": 1.164946436882019, + "learning_rate": 4.9544723020410695e-06, + "loss": 2.2953, + "step": 1666 + }, + { + "epoch": 0.089431330472103, + "grad_norm": 1.0282413959503174, + "learning_rate": 4.954389739520062e-06, + "loss": 2.193, + "step": 1667 + }, + { + "epoch": 0.08948497854077253, + "grad_norm": 1.0469257831573486, + "learning_rate": 4.9543071028942405e-06, + "loss": 2.3295, + "step": 1668 + }, + { + "epoch": 0.08953862660944206, + "grad_norm": 1.0027525424957275, + "learning_rate": 4.954224392166101e-06, + "loss": 1.7702, + "step": 1669 + }, + { + "epoch": 0.0895922746781116, + "grad_norm": 1.0576199293136597, + "learning_rate": 4.95414160733814e-06, + "loss": 2.3216, + "step": 1670 + }, + { + "epoch": 0.08964592274678111, + "grad_norm": 1.091983437538147, + "learning_rate": 4.954058748412858e-06, + "loss": 2.3048, + "step": 1671 + }, + { + "epoch": 0.08969957081545064, + "grad_norm": 1.375794768333435, + "learning_rate": 4.953975815392755e-06, + "loss": 2.2796, + "step": 1672 + }, + { + "epoch": 0.08975321888412018, + "grad_norm": 1.1275479793548584, + "learning_rate": 4.953892808280336e-06, + "loss": 2.3907, + "step": 1673 + }, + { + "epoch": 0.0898068669527897, + "grad_norm": 0.9899151921272278, + "learning_rate": 4.953809727078108e-06, + "loss": 2.5191, + "step": 1674 + }, + { + "epoch": 0.08986051502145923, + "grad_norm": 1.0902293920516968, + "learning_rate": 4.953726571788578e-06, + "loss": 2.4469, + "step": 1675 + }, + { + "epoch": 0.08991416309012876, + "grad_norm": 1.140784502029419, + "learning_rate": 4.953643342414257e-06, + "loss": 2.3083, + "step": 1676 + }, + { + "epoch": 0.08996781115879829, + "grad_norm": 1.101956844329834, + "learning_rate": 4.953560038957659e-06, + "loss": 2.3227, + "step": 1677 + }, + { + "epoch": 0.0900214592274678, + "grad_norm": 1.2096384763717651, + "learning_rate": 4.953476661421299e-06, + "loss": 2.191, + "step": 1678 + }, + { + "epoch": 0.09007510729613734, + "grad_norm": 1.170034408569336, + "learning_rate": 4.9533932098076935e-06, + "loss": 2.4302, + "step": 1679 + }, + { + "epoch": 0.09012875536480687, + "grad_norm": 1.2253456115722656, + "learning_rate": 4.953309684119362e-06, + "loss": 2.2372, + "step": 1680 + }, + { + "epoch": 0.09018240343347639, + "grad_norm": 1.0534791946411133, + "learning_rate": 4.9532260843588265e-06, + "loss": 2.2459, + "step": 1681 + }, + { + "epoch": 0.09023605150214592, + "grad_norm": 0.9997828602790833, + "learning_rate": 4.953142410528612e-06, + "loss": 2.0874, + "step": 1682 + }, + { + "epoch": 0.09028969957081545, + "grad_norm": 1.0342515707015991, + "learning_rate": 4.953058662631244e-06, + "loss": 2.2832, + "step": 1683 + }, + { + "epoch": 0.09034334763948498, + "grad_norm": 1.0536489486694336, + "learning_rate": 4.952974840669251e-06, + "loss": 2.316, + "step": 1684 + }, + { + "epoch": 0.0903969957081545, + "grad_norm": 1.1113282442092896, + "learning_rate": 4.952890944645165e-06, + "loss": 2.1653, + "step": 1685 + }, + { + "epoch": 0.09045064377682403, + "grad_norm": 0.96253901720047, + "learning_rate": 4.952806974561518e-06, + "loss": 2.2952, + "step": 1686 + }, + { + "epoch": 0.09050429184549356, + "grad_norm": 1.232043743133545, + "learning_rate": 4.952722930420846e-06, + "loss": 2.2718, + "step": 1687 + }, + { + "epoch": 0.0905579399141631, + "grad_norm": 1.027339220046997, + "learning_rate": 4.9526388122256856e-06, + "loss": 2.6271, + "step": 1688 + }, + { + "epoch": 0.09061158798283261, + "grad_norm": 1.070208191871643, + "learning_rate": 4.952554619978577e-06, + "loss": 2.1693, + "step": 1689 + }, + { + "epoch": 0.09066523605150215, + "grad_norm": 1.5996330976486206, + "learning_rate": 4.952470353682061e-06, + "loss": 2.4926, + "step": 1690 + }, + { + "epoch": 0.09071888412017168, + "grad_norm": 1.0628859996795654, + "learning_rate": 4.952386013338685e-06, + "loss": 2.1305, + "step": 1691 + }, + { + "epoch": 0.0907725321888412, + "grad_norm": 1.0918407440185547, + "learning_rate": 4.952301598950993e-06, + "loss": 2.1313, + "step": 1692 + }, + { + "epoch": 0.09082618025751073, + "grad_norm": 1.1832183599472046, + "learning_rate": 4.952217110521534e-06, + "loss": 2.4062, + "step": 1693 + }, + { + "epoch": 0.09087982832618026, + "grad_norm": 0.9799538254737854, + "learning_rate": 4.952132548052859e-06, + "loss": 2.0866, + "step": 1694 + }, + { + "epoch": 0.09093347639484979, + "grad_norm": 1.0728163719177246, + "learning_rate": 4.952047911547522e-06, + "loss": 2.4577, + "step": 1695 + }, + { + "epoch": 0.09098712446351931, + "grad_norm": 1.0580471754074097, + "learning_rate": 4.9519632010080765e-06, + "loss": 2.0425, + "step": 1696 + }, + { + "epoch": 0.09104077253218884, + "grad_norm": 1.1906814575195312, + "learning_rate": 4.951878416437082e-06, + "loss": 2.4083, + "step": 1697 + }, + { + "epoch": 0.09109442060085837, + "grad_norm": 1.040137767791748, + "learning_rate": 4.951793557837098e-06, + "loss": 2.262, + "step": 1698 + }, + { + "epoch": 0.09114806866952789, + "grad_norm": 1.469665765762329, + "learning_rate": 4.951708625210686e-06, + "loss": 2.2536, + "step": 1699 + }, + { + "epoch": 0.09120171673819742, + "grad_norm": 1.0200997591018677, + "learning_rate": 4.9516236185604115e-06, + "loss": 2.2682, + "step": 1700 + }, + { + "epoch": 0.09125536480686695, + "grad_norm": 1.045822262763977, + "learning_rate": 4.951538537888839e-06, + "loss": 2.392, + "step": 1701 + }, + { + "epoch": 0.09130901287553649, + "grad_norm": 0.9028156995773315, + "learning_rate": 4.95145338319854e-06, + "loss": 2.0184, + "step": 1702 + }, + { + "epoch": 0.091362660944206, + "grad_norm": 0.9939229488372803, + "learning_rate": 4.951368154492083e-06, + "loss": 2.3039, + "step": 1703 + }, + { + "epoch": 0.09141630901287554, + "grad_norm": 1.1573246717453003, + "learning_rate": 4.9512828517720435e-06, + "loss": 2.3283, + "step": 1704 + }, + { + "epoch": 0.09146995708154507, + "grad_norm": 1.1356160640716553, + "learning_rate": 4.951197475040996e-06, + "loss": 2.4557, + "step": 1705 + }, + { + "epoch": 0.0915236051502146, + "grad_norm": 1.3603020906448364, + "learning_rate": 4.951112024301518e-06, + "loss": 2.2512, + "step": 1706 + }, + { + "epoch": 0.09157725321888412, + "grad_norm": 1.5410436391830444, + "learning_rate": 4.95102649955619e-06, + "loss": 2.3796, + "step": 1707 + }, + { + "epoch": 0.09163090128755365, + "grad_norm": 1.0808476209640503, + "learning_rate": 4.9509409008075934e-06, + "loss": 2.2622, + "step": 1708 + }, + { + "epoch": 0.09168454935622318, + "grad_norm": 1.0631842613220215, + "learning_rate": 4.950855228058313e-06, + "loss": 2.1981, + "step": 1709 + }, + { + "epoch": 0.0917381974248927, + "grad_norm": 1.033604621887207, + "learning_rate": 4.950769481310936e-06, + "loss": 2.2875, + "step": 1710 + }, + { + "epoch": 0.09179184549356223, + "grad_norm": 1.089732050895691, + "learning_rate": 4.950683660568052e-06, + "loss": 2.4262, + "step": 1711 + }, + { + "epoch": 0.09184549356223176, + "grad_norm": 1.0868736505508423, + "learning_rate": 4.9505977658322504e-06, + "loss": 2.4652, + "step": 1712 + }, + { + "epoch": 0.0918991416309013, + "grad_norm": 1.0220608711242676, + "learning_rate": 4.950511797106125e-06, + "loss": 2.3041, + "step": 1713 + }, + { + "epoch": 0.09195278969957081, + "grad_norm": 0.9999760389328003, + "learning_rate": 4.950425754392274e-06, + "loss": 2.3992, + "step": 1714 + }, + { + "epoch": 0.09200643776824034, + "grad_norm": 1.0831005573272705, + "learning_rate": 4.950339637693291e-06, + "loss": 2.3228, + "step": 1715 + }, + { + "epoch": 0.09206008583690987, + "grad_norm": 1.117891788482666, + "learning_rate": 4.950253447011779e-06, + "loss": 2.1657, + "step": 1716 + }, + { + "epoch": 0.09211373390557939, + "grad_norm": 1.136479139328003, + "learning_rate": 4.95016718235034e-06, + "loss": 2.3328, + "step": 1717 + }, + { + "epoch": 0.09216738197424892, + "grad_norm": 1.0672374963760376, + "learning_rate": 4.9500808437115775e-06, + "loss": 2.3971, + "step": 1718 + }, + { + "epoch": 0.09222103004291846, + "grad_norm": 1.0088444948196411, + "learning_rate": 4.949994431098099e-06, + "loss": 2.0962, + "step": 1719 + }, + { + "epoch": 0.09227467811158799, + "grad_norm": 1.1689600944519043, + "learning_rate": 4.949907944512514e-06, + "loss": 2.3995, + "step": 1720 + }, + { + "epoch": 0.0923283261802575, + "grad_norm": 1.145004391670227, + "learning_rate": 4.9498213839574325e-06, + "loss": 2.3243, + "step": 1721 + }, + { + "epoch": 0.09238197424892704, + "grad_norm": 1.1102570295333862, + "learning_rate": 4.94973474943547e-06, + "loss": 2.3647, + "step": 1722 + }, + { + "epoch": 0.09243562231759657, + "grad_norm": 1.1830644607543945, + "learning_rate": 4.94964804094924e-06, + "loss": 2.2818, + "step": 1723 + }, + { + "epoch": 0.0924892703862661, + "grad_norm": 0.9842121601104736, + "learning_rate": 4.949561258501361e-06, + "loss": 2.2592, + "step": 1724 + }, + { + "epoch": 0.09254291845493562, + "grad_norm": 1.0975236892700195, + "learning_rate": 4.949474402094454e-06, + "loss": 2.4808, + "step": 1725 + }, + { + "epoch": 0.09259656652360515, + "grad_norm": 1.018229603767395, + "learning_rate": 4.949387471731142e-06, + "loss": 2.3101, + "step": 1726 + }, + { + "epoch": 0.09265021459227468, + "grad_norm": 1.0171138048171997, + "learning_rate": 4.949300467414049e-06, + "loss": 2.3446, + "step": 1727 + }, + { + "epoch": 0.0927038626609442, + "grad_norm": 1.1310986280441284, + "learning_rate": 4.949213389145801e-06, + "loss": 2.4478, + "step": 1728 + }, + { + "epoch": 0.09275751072961373, + "grad_norm": 0.9898760914802551, + "learning_rate": 4.949126236929027e-06, + "loss": 2.3096, + "step": 1729 + }, + { + "epoch": 0.09281115879828326, + "grad_norm": 1.0996556282043457, + "learning_rate": 4.94903901076636e-06, + "loss": 2.4148, + "step": 1730 + }, + { + "epoch": 0.0928648068669528, + "grad_norm": 1.2774583101272583, + "learning_rate": 4.948951710660432e-06, + "loss": 1.6594, + "step": 1731 + }, + { + "epoch": 0.09291845493562231, + "grad_norm": 1.0694501399993896, + "learning_rate": 4.948864336613881e-06, + "loss": 2.1715, + "step": 1732 + }, + { + "epoch": 0.09297210300429185, + "grad_norm": 1.0541152954101562, + "learning_rate": 4.948776888629343e-06, + "loss": 2.4923, + "step": 1733 + }, + { + "epoch": 0.09302575107296138, + "grad_norm": 0.9381615519523621, + "learning_rate": 4.948689366709459e-06, + "loss": 2.19, + "step": 1734 + }, + { + "epoch": 0.0930793991416309, + "grad_norm": 1.3243545293807983, + "learning_rate": 4.948601770856871e-06, + "loss": 2.3973, + "step": 1735 + }, + { + "epoch": 0.09313304721030043, + "grad_norm": 1.1936115026474, + "learning_rate": 4.948514101074225e-06, + "loss": 2.134, + "step": 1736 + }, + { + "epoch": 0.09318669527896996, + "grad_norm": 1.2958829402923584, + "learning_rate": 4.948426357364166e-06, + "loss": 2.2175, + "step": 1737 + }, + { + "epoch": 0.09324034334763949, + "grad_norm": 1.3500028848648071, + "learning_rate": 4.948338539729346e-06, + "loss": 2.3002, + "step": 1738 + }, + { + "epoch": 0.09329399141630901, + "grad_norm": 0.9945364594459534, + "learning_rate": 4.948250648172415e-06, + "loss": 2.2178, + "step": 1739 + }, + { + "epoch": 0.09334763948497854, + "grad_norm": 1.2498462200164795, + "learning_rate": 4.948162682696026e-06, + "loss": 2.357, + "step": 1740 + }, + { + "epoch": 0.09340128755364807, + "grad_norm": 1.1835029125213623, + "learning_rate": 4.948074643302835e-06, + "loss": 1.5793, + "step": 1741 + }, + { + "epoch": 0.0934549356223176, + "grad_norm": 0.9677000641822815, + "learning_rate": 4.947986529995501e-06, + "loss": 1.4786, + "step": 1742 + }, + { + "epoch": 0.09350858369098712, + "grad_norm": 1.1027590036392212, + "learning_rate": 4.947898342776685e-06, + "loss": 2.3803, + "step": 1743 + }, + { + "epoch": 0.09356223175965665, + "grad_norm": 1.6241215467453003, + "learning_rate": 4.947810081649048e-06, + "loss": 2.2293, + "step": 1744 + }, + { + "epoch": 0.09361587982832618, + "grad_norm": 1.0730262994766235, + "learning_rate": 4.947721746615256e-06, + "loss": 2.555, + "step": 1745 + }, + { + "epoch": 0.0936695278969957, + "grad_norm": 6.772916793823242, + "learning_rate": 4.9476333376779745e-06, + "loss": 2.334, + "step": 1746 + }, + { + "epoch": 0.09372317596566523, + "grad_norm": 1.1642218828201294, + "learning_rate": 4.947544854839875e-06, + "loss": 2.283, + "step": 1747 + }, + { + "epoch": 0.09377682403433477, + "grad_norm": 1.220048427581787, + "learning_rate": 4.947456298103627e-06, + "loss": 2.6901, + "step": 1748 + }, + { + "epoch": 0.0938304721030043, + "grad_norm": 1.0343363285064697, + "learning_rate": 4.947367667471907e-06, + "loss": 2.2687, + "step": 1749 + }, + { + "epoch": 0.09388412017167382, + "grad_norm": 1.0181132555007935, + "learning_rate": 4.9472789629473875e-06, + "loss": 2.1069, + "step": 1750 + }, + { + "epoch": 0.09393776824034335, + "grad_norm": 0.9533340930938721, + "learning_rate": 4.947190184532749e-06, + "loss": 2.2887, + "step": 1751 + }, + { + "epoch": 0.09399141630901288, + "grad_norm": 1.0199658870697021, + "learning_rate": 4.947101332230671e-06, + "loss": 2.2671, + "step": 1752 + }, + { + "epoch": 0.0940450643776824, + "grad_norm": 1.2576285600662231, + "learning_rate": 4.947012406043837e-06, + "loss": 2.5089, + "step": 1753 + }, + { + "epoch": 0.09409871244635193, + "grad_norm": 1.522181510925293, + "learning_rate": 4.9469234059749325e-06, + "loss": 2.3635, + "step": 1754 + }, + { + "epoch": 0.09415236051502146, + "grad_norm": 1.002143144607544, + "learning_rate": 4.946834332026643e-06, + "loss": 2.3087, + "step": 1755 + }, + { + "epoch": 0.09420600858369099, + "grad_norm": 1.1013399362564087, + "learning_rate": 4.946745184201659e-06, + "loss": 2.2812, + "step": 1756 + }, + { + "epoch": 0.09425965665236051, + "grad_norm": 0.9937084317207336, + "learning_rate": 4.946655962502671e-06, + "loss": 2.2395, + "step": 1757 + }, + { + "epoch": 0.09431330472103004, + "grad_norm": 1.0199317932128906, + "learning_rate": 4.946566666932374e-06, + "loss": 2.1548, + "step": 1758 + }, + { + "epoch": 0.09436695278969957, + "grad_norm": 1.004987120628357, + "learning_rate": 4.946477297493464e-06, + "loss": 2.1739, + "step": 1759 + }, + { + "epoch": 0.0944206008583691, + "grad_norm": 1.0001802444458008, + "learning_rate": 4.946387854188638e-06, + "loss": 2.2052, + "step": 1760 + }, + { + "epoch": 0.09447424892703862, + "grad_norm": 1.0096222162246704, + "learning_rate": 4.946298337020599e-06, + "loss": 2.0537, + "step": 1761 + }, + { + "epoch": 0.09452789699570815, + "grad_norm": 1.0804417133331299, + "learning_rate": 4.9462087459920476e-06, + "loss": 2.2358, + "step": 1762 + }, + { + "epoch": 0.09458154506437769, + "grad_norm": 0.9764360785484314, + "learning_rate": 4.9461190811056886e-06, + "loss": 2.1773, + "step": 1763 + }, + { + "epoch": 0.0946351931330472, + "grad_norm": 1.0284818410873413, + "learning_rate": 4.9460293423642315e-06, + "loss": 2.0455, + "step": 1764 + }, + { + "epoch": 0.09468884120171674, + "grad_norm": 1.0948991775512695, + "learning_rate": 4.945939529770383e-06, + "loss": 2.2671, + "step": 1765 + }, + { + "epoch": 0.09474248927038627, + "grad_norm": 0.9731478095054626, + "learning_rate": 4.945849643326857e-06, + "loss": 2.2466, + "step": 1766 + }, + { + "epoch": 0.0947961373390558, + "grad_norm": 1.0813003778457642, + "learning_rate": 4.945759683036367e-06, + "loss": 2.0396, + "step": 1767 + }, + { + "epoch": 0.09484978540772532, + "grad_norm": 1.5196201801300049, + "learning_rate": 4.945669648901628e-06, + "loss": 2.034, + "step": 1768 + }, + { + "epoch": 0.09490343347639485, + "grad_norm": 1.1343860626220703, + "learning_rate": 4.945579540925359e-06, + "loss": 2.1557, + "step": 1769 + }, + { + "epoch": 0.09495708154506438, + "grad_norm": 1.2273834943771362, + "learning_rate": 4.9454893591102815e-06, + "loss": 2.5308, + "step": 1770 + }, + { + "epoch": 0.0950107296137339, + "grad_norm": 1.316330909729004, + "learning_rate": 4.945399103459117e-06, + "loss": 2.7067, + "step": 1771 + }, + { + "epoch": 0.09506437768240343, + "grad_norm": 4.036048889160156, + "learning_rate": 4.945308773974592e-06, + "loss": 2.2812, + "step": 1772 + }, + { + "epoch": 0.09511802575107296, + "grad_norm": 1.1160069704055786, + "learning_rate": 4.945218370659433e-06, + "loss": 2.3493, + "step": 1773 + }, + { + "epoch": 0.0951716738197425, + "grad_norm": 0.9403936266899109, + "learning_rate": 4.945127893516368e-06, + "loss": 2.2586, + "step": 1774 + }, + { + "epoch": 0.09522532188841201, + "grad_norm": 1.2135741710662842, + "learning_rate": 4.945037342548131e-06, + "loss": 2.41, + "step": 1775 + }, + { + "epoch": 0.09527896995708154, + "grad_norm": 0.9958726763725281, + "learning_rate": 4.944946717757455e-06, + "loss": 2.1145, + "step": 1776 + }, + { + "epoch": 0.09533261802575108, + "grad_norm": 0.9738193154335022, + "learning_rate": 4.944856019147078e-06, + "loss": 2.2265, + "step": 1777 + }, + { + "epoch": 0.09538626609442061, + "grad_norm": 0.9824439287185669, + "learning_rate": 4.944765246719735e-06, + "loss": 1.6814, + "step": 1778 + }, + { + "epoch": 0.09543991416309013, + "grad_norm": 1.069852590560913, + "learning_rate": 4.9446744004781685e-06, + "loss": 2.2942, + "step": 1779 + }, + { + "epoch": 0.09549356223175966, + "grad_norm": 1.1017260551452637, + "learning_rate": 4.944583480425123e-06, + "loss": 2.2711, + "step": 1780 + }, + { + "epoch": 0.09554721030042919, + "grad_norm": 1.0956183671951294, + "learning_rate": 4.94449248656334e-06, + "loss": 2.2792, + "step": 1781 + }, + { + "epoch": 0.0956008583690987, + "grad_norm": 1.3162893056869507, + "learning_rate": 4.94440141889557e-06, + "loss": 2.439, + "step": 1782 + }, + { + "epoch": 0.09565450643776824, + "grad_norm": 1.1342337131500244, + "learning_rate": 4.944310277424562e-06, + "loss": 2.2786, + "step": 1783 + }, + { + "epoch": 0.09570815450643777, + "grad_norm": 1.0669126510620117, + "learning_rate": 4.944219062153066e-06, + "loss": 2.3417, + "step": 1784 + }, + { + "epoch": 0.0957618025751073, + "grad_norm": 0.9310864210128784, + "learning_rate": 4.944127773083839e-06, + "loss": 2.3123, + "step": 1785 + }, + { + "epoch": 0.09581545064377682, + "grad_norm": 0.9087014198303223, + "learning_rate": 4.944036410219635e-06, + "loss": 2.283, + "step": 1786 + }, + { + "epoch": 0.09586909871244635, + "grad_norm": 1.1928151845932007, + "learning_rate": 4.943944973563212e-06, + "loss": 2.3887, + "step": 1787 + }, + { + "epoch": 0.09592274678111588, + "grad_norm": 1.0142813920974731, + "learning_rate": 4.943853463117333e-06, + "loss": 1.9903, + "step": 1788 + }, + { + "epoch": 0.0959763948497854, + "grad_norm": 0.969001293182373, + "learning_rate": 4.94376187888476e-06, + "loss": 2.6434, + "step": 1789 + }, + { + "epoch": 0.09603004291845493, + "grad_norm": 1.0098637342453003, + "learning_rate": 4.943670220868258e-06, + "loss": 2.3612, + "step": 1790 + }, + { + "epoch": 0.09608369098712446, + "grad_norm": 1.0917390584945679, + "learning_rate": 4.943578489070593e-06, + "loss": 2.3741, + "step": 1791 + }, + { + "epoch": 0.096137339055794, + "grad_norm": 1.5210977792739868, + "learning_rate": 4.943486683494537e-06, + "loss": 2.422, + "step": 1792 + }, + { + "epoch": 0.09619098712446351, + "grad_norm": 1.3402745723724365, + "learning_rate": 4.943394804142861e-06, + "loss": 2.3609, + "step": 1793 + }, + { + "epoch": 0.09624463519313305, + "grad_norm": 1.1966967582702637, + "learning_rate": 4.943302851018339e-06, + "loss": 2.1588, + "step": 1794 + }, + { + "epoch": 0.09629828326180258, + "grad_norm": 1.168697476387024, + "learning_rate": 4.943210824123746e-06, + "loss": 2.3092, + "step": 1795 + }, + { + "epoch": 0.09635193133047211, + "grad_norm": 2.0943479537963867, + "learning_rate": 4.943118723461864e-06, + "loss": 2.3025, + "step": 1796 + }, + { + "epoch": 0.09640557939914163, + "grad_norm": 1.0429822206497192, + "learning_rate": 4.94302654903547e-06, + "loss": 2.3494, + "step": 1797 + }, + { + "epoch": 0.09645922746781116, + "grad_norm": 1.0660500526428223, + "learning_rate": 4.94293430084735e-06, + "loss": 2.1772, + "step": 1798 + }, + { + "epoch": 0.09651287553648069, + "grad_norm": 0.9309167861938477, + "learning_rate": 4.942841978900287e-06, + "loss": 2.3166, + "step": 1799 + }, + { + "epoch": 0.09656652360515021, + "grad_norm": 1.264136791229248, + "learning_rate": 4.942749583197069e-06, + "loss": 2.3415, + "step": 1800 + }, + { + "epoch": 0.09662017167381974, + "grad_norm": 1.0385442972183228, + "learning_rate": 4.942657113740486e-06, + "loss": 2.2775, + "step": 1801 + }, + { + "epoch": 0.09667381974248927, + "grad_norm": 0.9580645561218262, + "learning_rate": 4.94256457053333e-06, + "loss": 2.062, + "step": 1802 + }, + { + "epoch": 0.0967274678111588, + "grad_norm": 0.9761534333229065, + "learning_rate": 4.942471953578396e-06, + "loss": 2.2797, + "step": 1803 + }, + { + "epoch": 0.09678111587982832, + "grad_norm": 1.1233505010604858, + "learning_rate": 4.942379262878478e-06, + "loss": 2.3796, + "step": 1804 + }, + { + "epoch": 0.09683476394849785, + "grad_norm": 1.0904690027236938, + "learning_rate": 4.942286498436377e-06, + "loss": 2.1946, + "step": 1805 + }, + { + "epoch": 0.09688841201716739, + "grad_norm": 1.2603479623794556, + "learning_rate": 4.942193660254893e-06, + "loss": 2.1587, + "step": 1806 + }, + { + "epoch": 0.0969420600858369, + "grad_norm": 1.2668986320495605, + "learning_rate": 4.942100748336828e-06, + "loss": 2.3472, + "step": 1807 + }, + { + "epoch": 0.09699570815450644, + "grad_norm": 1.2179588079452515, + "learning_rate": 4.942007762684988e-06, + "loss": 2.6664, + "step": 1808 + }, + { + "epoch": 0.09704935622317597, + "grad_norm": 3.2700676918029785, + "learning_rate": 4.941914703302181e-06, + "loss": 2.4886, + "step": 1809 + }, + { + "epoch": 0.0971030042918455, + "grad_norm": 1.0173689126968384, + "learning_rate": 4.941821570191217e-06, + "loss": 2.4749, + "step": 1810 + }, + { + "epoch": 0.09715665236051502, + "grad_norm": 1.240060806274414, + "learning_rate": 4.941728363354906e-06, + "loss": 2.2142, + "step": 1811 + }, + { + "epoch": 0.09721030042918455, + "grad_norm": 1.0010547637939453, + "learning_rate": 4.941635082796065e-06, + "loss": 2.2299, + "step": 1812 + }, + { + "epoch": 0.09726394849785408, + "grad_norm": 1.383365273475647, + "learning_rate": 4.941541728517507e-06, + "loss": 2.4874, + "step": 1813 + }, + { + "epoch": 0.09731759656652361, + "grad_norm": 1.5537402629852295, + "learning_rate": 4.941448300522054e-06, + "loss": 2.2708, + "step": 1814 + }, + { + "epoch": 0.09737124463519313, + "grad_norm": 1.1356617212295532, + "learning_rate": 4.941354798812524e-06, + "loss": 2.2645, + "step": 1815 + }, + { + "epoch": 0.09742489270386266, + "grad_norm": 1.0561602115631104, + "learning_rate": 4.941261223391743e-06, + "loss": 2.2893, + "step": 1816 + }, + { + "epoch": 0.0974785407725322, + "grad_norm": 1.0381650924682617, + "learning_rate": 4.941167574262533e-06, + "loss": 2.2233, + "step": 1817 + }, + { + "epoch": 0.09753218884120171, + "grad_norm": 1.0750852823257446, + "learning_rate": 4.941073851427725e-06, + "loss": 2.0981, + "step": 1818 + }, + { + "epoch": 0.09758583690987124, + "grad_norm": 1.0661476850509644, + "learning_rate": 4.940980054890146e-06, + "loss": 2.3254, + "step": 1819 + }, + { + "epoch": 0.09763948497854077, + "grad_norm": 2.4237892627716064, + "learning_rate": 4.940886184652628e-06, + "loss": 2.1913, + "step": 1820 + }, + { + "epoch": 0.0976931330472103, + "grad_norm": 1.1221319437026978, + "learning_rate": 4.940792240718007e-06, + "loss": 2.2936, + "step": 1821 + }, + { + "epoch": 0.09774678111587982, + "grad_norm": 1.2436867952346802, + "learning_rate": 4.940698223089119e-06, + "loss": 2.2268, + "step": 1822 + }, + { + "epoch": 0.09780042918454936, + "grad_norm": 1.1578826904296875, + "learning_rate": 4.9406041317688014e-06, + "loss": 2.1855, + "step": 1823 + }, + { + "epoch": 0.09785407725321889, + "grad_norm": 1.3151758909225464, + "learning_rate": 4.9405099667598965e-06, + "loss": 2.417, + "step": 1824 + }, + { + "epoch": 0.0979077253218884, + "grad_norm": 1.1388425827026367, + "learning_rate": 4.9404157280652466e-06, + "loss": 2.5327, + "step": 1825 + }, + { + "epoch": 0.09796137339055794, + "grad_norm": 0.9641751646995544, + "learning_rate": 4.940321415687697e-06, + "loss": 2.1574, + "step": 1826 + }, + { + "epoch": 0.09801502145922747, + "grad_norm": 1.0384544134140015, + "learning_rate": 4.940227029630096e-06, + "loss": 2.4034, + "step": 1827 + }, + { + "epoch": 0.098068669527897, + "grad_norm": 1.4834786653518677, + "learning_rate": 4.940132569895292e-06, + "loss": 2.4003, + "step": 1828 + }, + { + "epoch": 0.09812231759656652, + "grad_norm": 1.011629581451416, + "learning_rate": 4.940038036486139e-06, + "loss": 2.2569, + "step": 1829 + }, + { + "epoch": 0.09817596566523605, + "grad_norm": 1.2222284078598022, + "learning_rate": 4.93994342940549e-06, + "loss": 2.3708, + "step": 1830 + }, + { + "epoch": 0.09822961373390558, + "grad_norm": 1.1567062139511108, + "learning_rate": 4.939848748656201e-06, + "loss": 2.4804, + "step": 1831 + }, + { + "epoch": 0.0982832618025751, + "grad_norm": 1.0491682291030884, + "learning_rate": 4.93975399424113e-06, + "loss": 2.2564, + "step": 1832 + }, + { + "epoch": 0.09833690987124463, + "grad_norm": 1.2224682569503784, + "learning_rate": 4.939659166163141e-06, + "loss": 2.8533, + "step": 1833 + }, + { + "epoch": 0.09839055793991416, + "grad_norm": 1.135331630706787, + "learning_rate": 4.939564264425094e-06, + "loss": 2.6133, + "step": 1834 + }, + { + "epoch": 0.0984442060085837, + "grad_norm": 1.3518239259719849, + "learning_rate": 4.939469289029857e-06, + "loss": 2.508, + "step": 1835 + }, + { + "epoch": 0.09849785407725321, + "grad_norm": 1.1037099361419678, + "learning_rate": 4.939374239980295e-06, + "loss": 2.3498, + "step": 1836 + }, + { + "epoch": 0.09855150214592275, + "grad_norm": 1.0083069801330566, + "learning_rate": 4.9392791172792784e-06, + "loss": 2.5604, + "step": 1837 + }, + { + "epoch": 0.09860515021459228, + "grad_norm": 0.9932397603988647, + "learning_rate": 4.939183920929681e-06, + "loss": 2.118, + "step": 1838 + }, + { + "epoch": 0.09865879828326181, + "grad_norm": 1.0175913572311401, + "learning_rate": 4.9390886509343756e-06, + "loss": 2.3314, + "step": 1839 + }, + { + "epoch": 0.09871244635193133, + "grad_norm": 1.057472586631775, + "learning_rate": 4.938993307296238e-06, + "loss": 2.1229, + "step": 1840 + }, + { + "epoch": 0.09876609442060086, + "grad_norm": 1.3059980869293213, + "learning_rate": 4.9388978900181486e-06, + "loss": 2.4538, + "step": 1841 + }, + { + "epoch": 0.09881974248927039, + "grad_norm": 0.9771423935890198, + "learning_rate": 4.938802399102987e-06, + "loss": 2.3827, + "step": 1842 + }, + { + "epoch": 0.09887339055793991, + "grad_norm": 0.9960698485374451, + "learning_rate": 4.938706834553637e-06, + "loss": 2.2894, + "step": 1843 + }, + { + "epoch": 0.09892703862660944, + "grad_norm": 1.1310608386993408, + "learning_rate": 4.938611196372984e-06, + "loss": 2.1061, + "step": 1844 + }, + { + "epoch": 0.09898068669527897, + "grad_norm": 0.985736608505249, + "learning_rate": 4.938515484563915e-06, + "loss": 2.3323, + "step": 1845 + }, + { + "epoch": 0.0990343347639485, + "grad_norm": 1.3505454063415527, + "learning_rate": 4.93841969912932e-06, + "loss": 2.3304, + "step": 1846 + }, + { + "epoch": 0.09908798283261802, + "grad_norm": 1.1021395921707153, + "learning_rate": 4.9383238400720925e-06, + "loss": 2.378, + "step": 1847 + }, + { + "epoch": 0.09914163090128755, + "grad_norm": 1.4619157314300537, + "learning_rate": 4.938227907395123e-06, + "loss": 2.2652, + "step": 1848 + }, + { + "epoch": 0.09919527896995708, + "grad_norm": 1.2272729873657227, + "learning_rate": 4.9381319011013125e-06, + "loss": 2.4661, + "step": 1849 + }, + { + "epoch": 0.0992489270386266, + "grad_norm": 1.2520153522491455, + "learning_rate": 4.9380358211935566e-06, + "loss": 2.4801, + "step": 1850 + }, + { + "epoch": 0.09930257510729613, + "grad_norm": 2.0115866661071777, + "learning_rate": 4.937939667674757e-06, + "loss": 2.348, + "step": 1851 + }, + { + "epoch": 0.09935622317596567, + "grad_norm": 1.3581775426864624, + "learning_rate": 4.937843440547818e-06, + "loss": 2.2388, + "step": 1852 + }, + { + "epoch": 0.0994098712446352, + "grad_norm": 1.0846054553985596, + "learning_rate": 4.937747139815643e-06, + "loss": 2.4461, + "step": 1853 + }, + { + "epoch": 0.09946351931330472, + "grad_norm": 1.0130946636199951, + "learning_rate": 4.937650765481141e-06, + "loss": 2.3485, + "step": 1854 + }, + { + "epoch": 0.09951716738197425, + "grad_norm": 1.190429925918579, + "learning_rate": 4.9375543175472216e-06, + "loss": 2.338, + "step": 1855 + }, + { + "epoch": 0.09957081545064378, + "grad_norm": 1.199324369430542, + "learning_rate": 4.937457796016797e-06, + "loss": 2.424, + "step": 1856 + }, + { + "epoch": 0.09962446351931331, + "grad_norm": 0.9469640254974365, + "learning_rate": 4.93736120089278e-06, + "loss": 1.8118, + "step": 1857 + }, + { + "epoch": 0.09967811158798283, + "grad_norm": 1.030418038368225, + "learning_rate": 4.9372645321780885e-06, + "loss": 2.2803, + "step": 1858 + }, + { + "epoch": 0.09973175965665236, + "grad_norm": 1.013992428779602, + "learning_rate": 4.937167789875641e-06, + "loss": 2.41, + "step": 1859 + }, + { + "epoch": 0.09978540772532189, + "grad_norm": 0.9556620121002197, + "learning_rate": 4.937070973988358e-06, + "loss": 2.3248, + "step": 1860 + }, + { + "epoch": 0.09983905579399141, + "grad_norm": 1.1075009107589722, + "learning_rate": 4.936974084519164e-06, + "loss": 2.4167, + "step": 1861 + }, + { + "epoch": 0.09989270386266094, + "grad_norm": 1.0543617010116577, + "learning_rate": 4.936877121470982e-06, + "loss": 2.1338, + "step": 1862 + }, + { + "epoch": 0.09994635193133047, + "grad_norm": 1.0831676721572876, + "learning_rate": 4.936780084846741e-06, + "loss": 2.3586, + "step": 1863 + }, + { + "epoch": 0.1, + "grad_norm": 1.0894944667816162, + "learning_rate": 4.936682974649371e-06, + "loss": 2.3999, + "step": 1864 + }, + { + "epoch": 0.10005364806866952, + "grad_norm": 1.329052209854126, + "learning_rate": 4.9365857908818034e-06, + "loss": 2.4621, + "step": 1865 + }, + { + "epoch": 0.10010729613733906, + "grad_norm": 0.9984211325645447, + "learning_rate": 4.936488533546974e-06, + "loss": 2.4173, + "step": 1866 + }, + { + "epoch": 0.10016094420600859, + "grad_norm": 1.033639907836914, + "learning_rate": 4.936391202647816e-06, + "loss": 2.1486, + "step": 1867 + }, + { + "epoch": 0.1002145922746781, + "grad_norm": 1.1027116775512695, + "learning_rate": 4.936293798187272e-06, + "loss": 2.1526, + "step": 1868 + }, + { + "epoch": 0.10026824034334764, + "grad_norm": 1.2518665790557861, + "learning_rate": 4.93619632016828e-06, + "loss": 2.1665, + "step": 1869 + }, + { + "epoch": 0.10032188841201717, + "grad_norm": 1.0929193496704102, + "learning_rate": 4.9360987685937835e-06, + "loss": 2.302, + "step": 1870 + }, + { + "epoch": 0.1003755364806867, + "grad_norm": 1.1017314195632935, + "learning_rate": 4.936001143466729e-06, + "loss": 2.0748, + "step": 1871 + }, + { + "epoch": 0.10042918454935622, + "grad_norm": 1.0110530853271484, + "learning_rate": 4.935903444790063e-06, + "loss": 2.3387, + "step": 1872 + }, + { + "epoch": 0.10048283261802575, + "grad_norm": 1.0448200702667236, + "learning_rate": 4.935805672566737e-06, + "loss": 2.2271, + "step": 1873 + }, + { + "epoch": 0.10053648068669528, + "grad_norm": 1.0159937143325806, + "learning_rate": 4.935707826799702e-06, + "loss": 2.4846, + "step": 1874 + }, + { + "epoch": 0.10059012875536481, + "grad_norm": 1.311103343963623, + "learning_rate": 4.93560990749191e-06, + "loss": 2.1506, + "step": 1875 + }, + { + "epoch": 0.10064377682403433, + "grad_norm": 1.0946451425552368, + "learning_rate": 4.935511914646321e-06, + "loss": 2.2024, + "step": 1876 + }, + { + "epoch": 0.10069742489270386, + "grad_norm": 1.112795352935791, + "learning_rate": 4.9354138482658934e-06, + "loss": 2.3616, + "step": 1877 + }, + { + "epoch": 0.1007510729613734, + "grad_norm": 3.6566877365112305, + "learning_rate": 4.935315708353585e-06, + "loss": 2.2085, + "step": 1878 + }, + { + "epoch": 0.10080472103004291, + "grad_norm": 1.154276967048645, + "learning_rate": 4.935217494912362e-06, + "loss": 2.4051, + "step": 1879 + }, + { + "epoch": 0.10085836909871244, + "grad_norm": 1.085336685180664, + "learning_rate": 4.935119207945188e-06, + "loss": 2.204, + "step": 1880 + }, + { + "epoch": 0.10091201716738198, + "grad_norm": 1.0241410732269287, + "learning_rate": 4.935020847455031e-06, + "loss": 2.7957, + "step": 1881 + }, + { + "epoch": 0.10096566523605151, + "grad_norm": 1.4265036582946777, + "learning_rate": 4.9349224134448616e-06, + "loss": 1.8139, + "step": 1882 + }, + { + "epoch": 0.10101931330472103, + "grad_norm": 1.04610013961792, + "learning_rate": 4.93482390591765e-06, + "loss": 2.465, + "step": 1883 + }, + { + "epoch": 0.10107296137339056, + "grad_norm": 0.9944731593132019, + "learning_rate": 4.934725324876372e-06, + "loss": 2.3437, + "step": 1884 + }, + { + "epoch": 0.10112660944206009, + "grad_norm": 1.1450241804122925, + "learning_rate": 4.934626670324003e-06, + "loss": 2.3373, + "step": 1885 + }, + { + "epoch": 0.10118025751072961, + "grad_norm": 0.9581221342086792, + "learning_rate": 4.934527942263524e-06, + "loss": 2.1985, + "step": 1886 + }, + { + "epoch": 0.10123390557939914, + "grad_norm": 1.0833932161331177, + "learning_rate": 4.934429140697913e-06, + "loss": 2.3517, + "step": 1887 + }, + { + "epoch": 0.10128755364806867, + "grad_norm": 1.4144988059997559, + "learning_rate": 4.934330265630153e-06, + "loss": 2.4213, + "step": 1888 + }, + { + "epoch": 0.1013412017167382, + "grad_norm": 1.1171783208847046, + "learning_rate": 4.934231317063232e-06, + "loss": 2.2352, + "step": 1889 + }, + { + "epoch": 0.10139484978540772, + "grad_norm": 4.830934524536133, + "learning_rate": 4.934132295000136e-06, + "loss": 2.4924, + "step": 1890 + }, + { + "epoch": 0.10144849785407725, + "grad_norm": 1.2319527864456177, + "learning_rate": 4.934033199443854e-06, + "loss": 2.3649, + "step": 1891 + }, + { + "epoch": 0.10150214592274678, + "grad_norm": 1.0994936227798462, + "learning_rate": 4.933934030397379e-06, + "loss": 2.6161, + "step": 1892 + }, + { + "epoch": 0.10155579399141632, + "grad_norm": 1.039217472076416, + "learning_rate": 4.9338347878637044e-06, + "loss": 2.3718, + "step": 1893 + }, + { + "epoch": 0.10160944206008583, + "grad_norm": 1.1979058980941772, + "learning_rate": 4.933735471845827e-06, + "loss": 1.7425, + "step": 1894 + }, + { + "epoch": 0.10166309012875537, + "grad_norm": 1.2812947034835815, + "learning_rate": 4.933636082346746e-06, + "loss": 2.371, + "step": 1895 + }, + { + "epoch": 0.1017167381974249, + "grad_norm": 1.1178780794143677, + "learning_rate": 4.9335366193694625e-06, + "loss": 1.9442, + "step": 1896 + }, + { + "epoch": 0.10177038626609441, + "grad_norm": 1.063353419303894, + "learning_rate": 4.933437082916979e-06, + "loss": 2.2253, + "step": 1897 + }, + { + "epoch": 0.10182403433476395, + "grad_norm": 1.1093535423278809, + "learning_rate": 4.933337472992299e-06, + "loss": 2.1623, + "step": 1898 + }, + { + "epoch": 0.10187768240343348, + "grad_norm": 0.9737566709518433, + "learning_rate": 4.933237789598433e-06, + "loss": 2.2017, + "step": 1899 + }, + { + "epoch": 0.10193133047210301, + "grad_norm": 1.0472688674926758, + "learning_rate": 4.933138032738388e-06, + "loss": 2.3247, + "step": 1900 + }, + { + "epoch": 0.10198497854077253, + "grad_norm": 1.1580147743225098, + "learning_rate": 4.9330382024151795e-06, + "loss": 2.3406, + "step": 1901 + }, + { + "epoch": 0.10203862660944206, + "grad_norm": 0.9940545558929443, + "learning_rate": 4.932938298631818e-06, + "loss": 2.4166, + "step": 1902 + }, + { + "epoch": 0.10209227467811159, + "grad_norm": 1.0722543001174927, + "learning_rate": 4.932838321391321e-06, + "loss": 2.2958, + "step": 1903 + }, + { + "epoch": 0.10214592274678111, + "grad_norm": 1.3596563339233398, + "learning_rate": 4.932738270696708e-06, + "loss": 2.3119, + "step": 1904 + }, + { + "epoch": 0.10219957081545064, + "grad_norm": 2.106581687927246, + "learning_rate": 4.932638146550998e-06, + "loss": 2.4007, + "step": 1905 + }, + { + "epoch": 0.10225321888412017, + "grad_norm": 1.0810281038284302, + "learning_rate": 4.9325379489572165e-06, + "loss": 2.4845, + "step": 1906 + }, + { + "epoch": 0.1023068669527897, + "grad_norm": 0.9764095544815063, + "learning_rate": 4.932437677918387e-06, + "loss": 2.2538, + "step": 1907 + }, + { + "epoch": 0.10236051502145922, + "grad_norm": 1.1815199851989746, + "learning_rate": 4.932337333437538e-06, + "loss": 2.1681, + "step": 1908 + }, + { + "epoch": 0.10241416309012875, + "grad_norm": 1.2199777364730835, + "learning_rate": 4.932236915517697e-06, + "loss": 2.055, + "step": 1909 + }, + { + "epoch": 0.10246781115879829, + "grad_norm": 1.1175533533096313, + "learning_rate": 4.9321364241619e-06, + "loss": 2.4388, + "step": 1910 + }, + { + "epoch": 0.10252145922746782, + "grad_norm": 1.08524489402771, + "learning_rate": 4.932035859373177e-06, + "loss": 2.2443, + "step": 1911 + }, + { + "epoch": 0.10257510729613734, + "grad_norm": 1.004044532775879, + "learning_rate": 4.931935221154566e-06, + "loss": 2.1838, + "step": 1912 + }, + { + "epoch": 0.10262875536480687, + "grad_norm": 0.9978200793266296, + "learning_rate": 4.931834509509105e-06, + "loss": 2.4456, + "step": 1913 + }, + { + "epoch": 0.1026824034334764, + "grad_norm": 1.1172590255737305, + "learning_rate": 4.931733724439837e-06, + "loss": 2.2105, + "step": 1914 + }, + { + "epoch": 0.10273605150214592, + "grad_norm": 1.3385688066482544, + "learning_rate": 4.931632865949801e-06, + "loss": 2.1631, + "step": 1915 + }, + { + "epoch": 0.10278969957081545, + "grad_norm": 0.9395245909690857, + "learning_rate": 4.931531934042047e-06, + "loss": 2.3465, + "step": 1916 + }, + { + "epoch": 0.10284334763948498, + "grad_norm": 1.017091989517212, + "learning_rate": 4.9314309287196175e-06, + "loss": 2.2796, + "step": 1917 + }, + { + "epoch": 0.10289699570815451, + "grad_norm": 1.107131838798523, + "learning_rate": 4.931329849985566e-06, + "loss": 2.4611, + "step": 1918 + }, + { + "epoch": 0.10295064377682403, + "grad_norm": 1.0355887413024902, + "learning_rate": 4.9312286978429415e-06, + "loss": 2.2878, + "step": 1919 + }, + { + "epoch": 0.10300429184549356, + "grad_norm": 1.1842552423477173, + "learning_rate": 4.9311274722948e-06, + "loss": 2.1973, + "step": 1920 + }, + { + "epoch": 0.1030579399141631, + "grad_norm": 1.0049989223480225, + "learning_rate": 4.931026173344198e-06, + "loss": 2.2489, + "step": 1921 + }, + { + "epoch": 0.10311158798283261, + "grad_norm": 1.1443963050842285, + "learning_rate": 4.930924800994192e-06, + "loss": 2.4495, + "step": 1922 + }, + { + "epoch": 0.10316523605150214, + "grad_norm": 1.4274924993515015, + "learning_rate": 4.930823355247844e-06, + "loss": 2.212, + "step": 1923 + }, + { + "epoch": 0.10321888412017168, + "grad_norm": 7.714043140411377, + "learning_rate": 4.930721836108217e-06, + "loss": 2.2976, + "step": 1924 + }, + { + "epoch": 0.1032725321888412, + "grad_norm": 1.056976318359375, + "learning_rate": 4.930620243578376e-06, + "loss": 2.2925, + "step": 1925 + }, + { + "epoch": 0.10332618025751072, + "grad_norm": 0.9453772902488708, + "learning_rate": 4.930518577661388e-06, + "loss": 2.0442, + "step": 1926 + }, + { + "epoch": 0.10337982832618026, + "grad_norm": 1.0958707332611084, + "learning_rate": 4.930416838360323e-06, + "loss": 2.2092, + "step": 1927 + }, + { + "epoch": 0.10343347639484979, + "grad_norm": 1.2005515098571777, + "learning_rate": 4.930315025678253e-06, + "loss": 2.35, + "step": 1928 + }, + { + "epoch": 0.10348712446351932, + "grad_norm": 1.1472572088241577, + "learning_rate": 4.930213139618252e-06, + "loss": 2.2942, + "step": 1929 + }, + { + "epoch": 0.10354077253218884, + "grad_norm": 1.045853853225708, + "learning_rate": 4.930111180183395e-06, + "loss": 2.1626, + "step": 1930 + }, + { + "epoch": 0.10359442060085837, + "grad_norm": 1.09354567527771, + "learning_rate": 4.930009147376762e-06, + "loss": 2.3185, + "step": 1931 + }, + { + "epoch": 0.1036480686695279, + "grad_norm": 2.403632879257202, + "learning_rate": 4.9299070412014325e-06, + "loss": 2.2059, + "step": 1932 + }, + { + "epoch": 0.10370171673819742, + "grad_norm": 1.2445405721664429, + "learning_rate": 4.9298048616604896e-06, + "loss": 2.4014, + "step": 1933 + }, + { + "epoch": 0.10375536480686695, + "grad_norm": 1.114967703819275, + "learning_rate": 4.929702608757019e-06, + "loss": 2.5393, + "step": 1934 + }, + { + "epoch": 0.10380901287553648, + "grad_norm": 1.0352331399917603, + "learning_rate": 4.929600282494108e-06, + "loss": 2.4063, + "step": 1935 + }, + { + "epoch": 0.10386266094420601, + "grad_norm": 0.9813822507858276, + "learning_rate": 4.929497882874845e-06, + "loss": 2.1798, + "step": 1936 + }, + { + "epoch": 0.10391630901287553, + "grad_norm": 1.2553036212921143, + "learning_rate": 4.929395409902323e-06, + "loss": 2.5353, + "step": 1937 + }, + { + "epoch": 0.10396995708154506, + "grad_norm": 1.0444730520248413, + "learning_rate": 4.929292863579635e-06, + "loss": 2.2866, + "step": 1938 + }, + { + "epoch": 0.1040236051502146, + "grad_norm": 1.1428064107894897, + "learning_rate": 4.929190243909878e-06, + "loss": 2.3918, + "step": 1939 + }, + { + "epoch": 0.10407725321888411, + "grad_norm": 1.1962543725967407, + "learning_rate": 4.92908755089615e-06, + "loss": 2.2525, + "step": 1940 + }, + { + "epoch": 0.10413090128755365, + "grad_norm": 1.271498441696167, + "learning_rate": 4.928984784541552e-06, + "loss": 2.478, + "step": 1941 + }, + { + "epoch": 0.10418454935622318, + "grad_norm": 1.0881752967834473, + "learning_rate": 4.928881944849185e-06, + "loss": 2.5222, + "step": 1942 + }, + { + "epoch": 0.10423819742489271, + "grad_norm": 1.0832735300064087, + "learning_rate": 4.9287790318221564e-06, + "loss": 2.366, + "step": 1943 + }, + { + "epoch": 0.10429184549356223, + "grad_norm": 3.5342414379119873, + "learning_rate": 4.928676045463572e-06, + "loss": 2.3721, + "step": 1944 + }, + { + "epoch": 0.10434549356223176, + "grad_norm": 1.0153589248657227, + "learning_rate": 4.9285729857765415e-06, + "loss": 2.2232, + "step": 1945 + }, + { + "epoch": 0.10439914163090129, + "grad_norm": 1.118100881576538, + "learning_rate": 4.928469852764176e-06, + "loss": 2.2672, + "step": 1946 + }, + { + "epoch": 0.10445278969957082, + "grad_norm": 1.0871555805206299, + "learning_rate": 4.928366646429591e-06, + "loss": 2.3697, + "step": 1947 + }, + { + "epoch": 0.10450643776824034, + "grad_norm": 1.074065923690796, + "learning_rate": 4.928263366775902e-06, + "loss": 2.0967, + "step": 1948 + }, + { + "epoch": 0.10456008583690987, + "grad_norm": 1.1531176567077637, + "learning_rate": 4.928160013806226e-06, + "loss": 2.2788, + "step": 1949 + }, + { + "epoch": 0.1046137339055794, + "grad_norm": 1.138756275177002, + "learning_rate": 4.9280565875236845e-06, + "loss": 2.2864, + "step": 1950 + }, + { + "epoch": 0.10466738197424892, + "grad_norm": 1.1232471466064453, + "learning_rate": 4.927953087931401e-06, + "loss": 2.2341, + "step": 1951 + }, + { + "epoch": 0.10472103004291845, + "grad_norm": 1.1913272142410278, + "learning_rate": 4.927849515032499e-06, + "loss": 2.3101, + "step": 1952 + }, + { + "epoch": 0.10477467811158798, + "grad_norm": 1.4608371257781982, + "learning_rate": 4.927745868830105e-06, + "loss": 2.2592, + "step": 1953 + }, + { + "epoch": 0.10482832618025752, + "grad_norm": 1.4135850667953491, + "learning_rate": 4.9276421493273515e-06, + "loss": 2.2051, + "step": 1954 + }, + { + "epoch": 0.10488197424892703, + "grad_norm": 0.9767780303955078, + "learning_rate": 4.9275383565273674e-06, + "loss": 2.036, + "step": 1955 + }, + { + "epoch": 0.10493562231759657, + "grad_norm": 1.2478094100952148, + "learning_rate": 4.927434490433288e-06, + "loss": 2.5222, + "step": 1956 + }, + { + "epoch": 0.1049892703862661, + "grad_norm": 1.1460052728652954, + "learning_rate": 4.927330551048248e-06, + "loss": 2.357, + "step": 1957 + }, + { + "epoch": 0.10504291845493562, + "grad_norm": 1.182121992111206, + "learning_rate": 4.927226538375386e-06, + "loss": 2.3932, + "step": 1958 + }, + { + "epoch": 0.10509656652360515, + "grad_norm": 1.0653690099716187, + "learning_rate": 4.9271224524178426e-06, + "loss": 2.1304, + "step": 1959 + }, + { + "epoch": 0.10515021459227468, + "grad_norm": 1.0397404432296753, + "learning_rate": 4.927018293178761e-06, + "loss": 2.2088, + "step": 1960 + }, + { + "epoch": 0.10520386266094421, + "grad_norm": 0.9816562533378601, + "learning_rate": 4.926914060661285e-06, + "loss": 2.2125, + "step": 1961 + }, + { + "epoch": 0.10525751072961373, + "grad_norm": 1.1414966583251953, + "learning_rate": 4.926809754868562e-06, + "loss": 2.4095, + "step": 1962 + }, + { + "epoch": 0.10531115879828326, + "grad_norm": 1.167049527168274, + "learning_rate": 4.926705375803742e-06, + "loss": 2.1286, + "step": 1963 + }, + { + "epoch": 0.10536480686695279, + "grad_norm": 1.0355244874954224, + "learning_rate": 4.926600923469977e-06, + "loss": 2.2912, + "step": 1964 + }, + { + "epoch": 0.10541845493562232, + "grad_norm": 1.4339945316314697, + "learning_rate": 4.926496397870418e-06, + "loss": 2.5648, + "step": 1965 + }, + { + "epoch": 0.10547210300429184, + "grad_norm": 1.3985776901245117, + "learning_rate": 4.926391799008223e-06, + "loss": 2.4167, + "step": 1966 + }, + { + "epoch": 0.10552575107296137, + "grad_norm": 1.1102783679962158, + "learning_rate": 4.926287126886551e-06, + "loss": 2.2605, + "step": 1967 + }, + { + "epoch": 0.1055793991416309, + "grad_norm": 1.1157299280166626, + "learning_rate": 4.926182381508559e-06, + "loss": 2.4972, + "step": 1968 + }, + { + "epoch": 0.10563304721030042, + "grad_norm": 1.1610889434814453, + "learning_rate": 4.926077562877413e-06, + "loss": 2.4844, + "step": 1969 + }, + { + "epoch": 0.10568669527896996, + "grad_norm": 0.9723529815673828, + "learning_rate": 4.925972670996276e-06, + "loss": 2.2345, + "step": 1970 + }, + { + "epoch": 0.10574034334763949, + "grad_norm": 1.03019380569458, + "learning_rate": 4.925867705868316e-06, + "loss": 1.9923, + "step": 1971 + }, + { + "epoch": 0.10579399141630902, + "grad_norm": 1.1678401231765747, + "learning_rate": 4.925762667496701e-06, + "loss": 2.1057, + "step": 1972 + }, + { + "epoch": 0.10584763948497854, + "grad_norm": 1.1231210231781006, + "learning_rate": 4.925657555884603e-06, + "loss": 2.0989, + "step": 1973 + }, + { + "epoch": 0.10590128755364807, + "grad_norm": 1.1615835428237915, + "learning_rate": 4.925552371035195e-06, + "loss": 2.4159, + "step": 1974 + }, + { + "epoch": 0.1059549356223176, + "grad_norm": 1.2831733226776123, + "learning_rate": 4.925447112951654e-06, + "loss": 2.4731, + "step": 1975 + }, + { + "epoch": 0.10600858369098712, + "grad_norm": 0.980715811252594, + "learning_rate": 4.925341781637158e-06, + "loss": 2.2235, + "step": 1976 + }, + { + "epoch": 0.10606223175965665, + "grad_norm": 1.0056291818618774, + "learning_rate": 4.9252363770948855e-06, + "loss": 2.125, + "step": 1977 + }, + { + "epoch": 0.10611587982832618, + "grad_norm": 1.4800682067871094, + "learning_rate": 4.92513089932802e-06, + "loss": 2.2584, + "step": 1978 + }, + { + "epoch": 0.10616952789699571, + "grad_norm": 1.5279347896575928, + "learning_rate": 4.925025348339747e-06, + "loss": 2.0517, + "step": 1979 + }, + { + "epoch": 0.10622317596566523, + "grad_norm": 1.0320872068405151, + "learning_rate": 4.924919724133253e-06, + "loss": 2.5768, + "step": 1980 + }, + { + "epoch": 0.10627682403433476, + "grad_norm": 1.095285415649414, + "learning_rate": 4.924814026711726e-06, + "loss": 2.2437, + "step": 1981 + }, + { + "epoch": 0.1063304721030043, + "grad_norm": 0.9614364504814148, + "learning_rate": 4.924708256078358e-06, + "loss": 2.0973, + "step": 1982 + }, + { + "epoch": 0.10638412017167383, + "grad_norm": 1.0558069944381714, + "learning_rate": 4.924602412236343e-06, + "loss": 2.4361, + "step": 1983 + }, + { + "epoch": 0.10643776824034334, + "grad_norm": 0.8254480957984924, + "learning_rate": 4.924496495188875e-06, + "loss": 1.8443, + "step": 1984 + }, + { + "epoch": 0.10649141630901288, + "grad_norm": 1.9961565732955933, + "learning_rate": 4.924390504939155e-06, + "loss": 2.2969, + "step": 1985 + }, + { + "epoch": 0.10654506437768241, + "grad_norm": 0.976161539554596, + "learning_rate": 4.92428444149038e-06, + "loss": 1.6461, + "step": 1986 + }, + { + "epoch": 0.10659871244635193, + "grad_norm": 0.8726969957351685, + "learning_rate": 4.924178304845755e-06, + "loss": 1.9044, + "step": 1987 + }, + { + "epoch": 0.10665236051502146, + "grad_norm": 1.0752164125442505, + "learning_rate": 4.924072095008482e-06, + "loss": 2.0086, + "step": 1988 + }, + { + "epoch": 0.10670600858369099, + "grad_norm": 1.0471471548080444, + "learning_rate": 4.92396581198177e-06, + "loss": 2.0153, + "step": 1989 + }, + { + "epoch": 0.10675965665236052, + "grad_norm": 0.928613543510437, + "learning_rate": 4.923859455768826e-06, + "loss": 2.1906, + "step": 1990 + }, + { + "epoch": 0.10681330472103004, + "grad_norm": 1.2925628423690796, + "learning_rate": 4.923753026372863e-06, + "loss": 2.5508, + "step": 1991 + }, + { + "epoch": 0.10686695278969957, + "grad_norm": 1.3501696586608887, + "learning_rate": 4.923646523797093e-06, + "loss": 2.4359, + "step": 1992 + }, + { + "epoch": 0.1069206008583691, + "grad_norm": 1.0489870309829712, + "learning_rate": 4.923539948044732e-06, + "loss": 2.3111, + "step": 1993 + }, + { + "epoch": 0.10697424892703862, + "grad_norm": 1.2146090269088745, + "learning_rate": 4.923433299119e-06, + "loss": 2.2358, + "step": 1994 + }, + { + "epoch": 0.10702789699570815, + "grad_norm": 1.218310832977295, + "learning_rate": 4.923326577023112e-06, + "loss": 2.3723, + "step": 1995 + }, + { + "epoch": 0.10708154506437768, + "grad_norm": 1.011652946472168, + "learning_rate": 4.923219781760295e-06, + "loss": 2.1674, + "step": 1996 + }, + { + "epoch": 0.10713519313304722, + "grad_norm": 1.1200813055038452, + "learning_rate": 4.923112913333771e-06, + "loss": 2.5299, + "step": 1997 + }, + { + "epoch": 0.10718884120171673, + "grad_norm": 1.0726518630981445, + "learning_rate": 4.923005971746768e-06, + "loss": 2.6529, + "step": 1998 + }, + { + "epoch": 0.10724248927038627, + "grad_norm": 1.5433592796325684, + "learning_rate": 4.922898957002514e-06, + "loss": 2.1025, + "step": 1999 + }, + { + "epoch": 0.1072961373390558, + "grad_norm": 1.0829018354415894, + "learning_rate": 4.922791869104241e-06, + "loss": 2.3683, + "step": 2000 + }, + { + "epoch": 0.10734978540772531, + "grad_norm": 1.2019517421722412, + "learning_rate": 4.9226847080551794e-06, + "loss": 2.0909, + "step": 2001 + }, + { + "epoch": 0.10740343347639485, + "grad_norm": 2.1196649074554443, + "learning_rate": 4.922577473858569e-06, + "loss": 1.503, + "step": 2002 + }, + { + "epoch": 0.10745708154506438, + "grad_norm": 1.0655595064163208, + "learning_rate": 4.922470166517644e-06, + "loss": 2.2892, + "step": 2003 + }, + { + "epoch": 0.10751072961373391, + "grad_norm": 1.2319308519363403, + "learning_rate": 4.922362786035647e-06, + "loss": 2.0564, + "step": 2004 + }, + { + "epoch": 0.10756437768240343, + "grad_norm": 1.0345441102981567, + "learning_rate": 4.922255332415818e-06, + "loss": 2.1837, + "step": 2005 + }, + { + "epoch": 0.10761802575107296, + "grad_norm": 1.1513289213180542, + "learning_rate": 4.922147805661402e-06, + "loss": 2.2578, + "step": 2006 + }, + { + "epoch": 0.10767167381974249, + "grad_norm": 13.099942207336426, + "learning_rate": 4.9220402057756455e-06, + "loss": 2.244, + "step": 2007 + }, + { + "epoch": 0.10772532188841202, + "grad_norm": 1.1152081489562988, + "learning_rate": 4.921932532761798e-06, + "loss": 2.3677, + "step": 2008 + }, + { + "epoch": 0.10777896995708154, + "grad_norm": 1.0645225048065186, + "learning_rate": 4.92182478662311e-06, + "loss": 2.4963, + "step": 2009 + }, + { + "epoch": 0.10783261802575107, + "grad_norm": 1.2022515535354614, + "learning_rate": 4.9217169673628326e-06, + "loss": 2.5965, + "step": 2010 + }, + { + "epoch": 0.1078862660944206, + "grad_norm": 1.4064871072769165, + "learning_rate": 4.921609074984225e-06, + "loss": 2.2841, + "step": 2011 + }, + { + "epoch": 0.10793991416309012, + "grad_norm": 1.1635982990264893, + "learning_rate": 4.921501109490542e-06, + "loss": 2.2971, + "step": 2012 + }, + { + "epoch": 0.10799356223175965, + "grad_norm": 1.1491824388504028, + "learning_rate": 4.921393070885044e-06, + "loss": 2.3114, + "step": 2013 + }, + { + "epoch": 0.10804721030042919, + "grad_norm": 1.0140197277069092, + "learning_rate": 4.921284959170994e-06, + "loss": 2.3269, + "step": 2014 + }, + { + "epoch": 0.10810085836909872, + "grad_norm": 1.2919663190841675, + "learning_rate": 4.9211767743516544e-06, + "loss": 2.357, + "step": 2015 + }, + { + "epoch": 0.10815450643776824, + "grad_norm": 1.280945897102356, + "learning_rate": 4.921068516430293e-06, + "loss": 2.51, + "step": 2016 + }, + { + "epoch": 0.10820815450643777, + "grad_norm": 1.1772642135620117, + "learning_rate": 4.920960185410178e-06, + "loss": 2.5031, + "step": 2017 + }, + { + "epoch": 0.1082618025751073, + "grad_norm": 1.0220706462860107, + "learning_rate": 4.92085178129458e-06, + "loss": 2.182, + "step": 2018 + }, + { + "epoch": 0.10831545064377682, + "grad_norm": 1.0696314573287964, + "learning_rate": 4.920743304086772e-06, + "loss": 2.2457, + "step": 2019 + }, + { + "epoch": 0.10836909871244635, + "grad_norm": 1.0832674503326416, + "learning_rate": 4.92063475379003e-06, + "loss": 2.4338, + "step": 2020 + }, + { + "epoch": 0.10842274678111588, + "grad_norm": 1.7219007015228271, + "learning_rate": 4.92052613040763e-06, + "loss": 2.2127, + "step": 2021 + }, + { + "epoch": 0.10847639484978541, + "grad_norm": 1.0856860876083374, + "learning_rate": 4.920417433942853e-06, + "loss": 2.1214, + "step": 2022 + }, + { + "epoch": 0.10853004291845493, + "grad_norm": 1.1084561347961426, + "learning_rate": 4.920308664398981e-06, + "loss": 2.2878, + "step": 2023 + }, + { + "epoch": 0.10858369098712446, + "grad_norm": 1.4390372037887573, + "learning_rate": 4.920199821779297e-06, + "loss": 1.9212, + "step": 2024 + }, + { + "epoch": 0.108637339055794, + "grad_norm": 1.0123540163040161, + "learning_rate": 4.920090906087086e-06, + "loss": 2.2565, + "step": 2025 + }, + { + "epoch": 0.10869098712446353, + "grad_norm": 1.1613882780075073, + "learning_rate": 4.91998191732564e-06, + "loss": 2.3559, + "step": 2026 + }, + { + "epoch": 0.10874463519313304, + "grad_norm": 1.4327887296676636, + "learning_rate": 4.919872855498247e-06, + "loss": 2.2019, + "step": 2027 + }, + { + "epoch": 0.10879828326180258, + "grad_norm": 6.1417131423950195, + "learning_rate": 4.919763720608201e-06, + "loss": 2.5622, + "step": 2028 + }, + { + "epoch": 0.10885193133047211, + "grad_norm": 1.6055347919464111, + "learning_rate": 4.919654512658797e-06, + "loss": 2.355, + "step": 2029 + }, + { + "epoch": 0.10890557939914162, + "grad_norm": 1.0879734754562378, + "learning_rate": 4.919545231653331e-06, + "loss": 2.3648, + "step": 2030 + }, + { + "epoch": 0.10895922746781116, + "grad_norm": 1.0652059316635132, + "learning_rate": 4.9194358775951045e-06, + "loss": 2.4187, + "step": 2031 + }, + { + "epoch": 0.10901287553648069, + "grad_norm": 1.1899642944335938, + "learning_rate": 4.919326450487418e-06, + "loss": 2.2256, + "step": 2032 + }, + { + "epoch": 0.10906652360515022, + "grad_norm": 1.866661548614502, + "learning_rate": 4.919216950333576e-06, + "loss": 2.2514, + "step": 2033 + }, + { + "epoch": 0.10912017167381974, + "grad_norm": 1.2340679168701172, + "learning_rate": 4.919107377136884e-06, + "loss": 1.7516, + "step": 2034 + }, + { + "epoch": 0.10917381974248927, + "grad_norm": 1.1162368059158325, + "learning_rate": 4.91899773090065e-06, + "loss": 2.2946, + "step": 2035 + }, + { + "epoch": 0.1092274678111588, + "grad_norm": 1.0872057676315308, + "learning_rate": 4.918888011628185e-06, + "loss": 2.4856, + "step": 2036 + }, + { + "epoch": 0.10928111587982832, + "grad_norm": 0.8604046106338501, + "learning_rate": 4.918778219322803e-06, + "loss": 1.9526, + "step": 2037 + }, + { + "epoch": 0.10933476394849785, + "grad_norm": 1.4402927160263062, + "learning_rate": 4.9186683539878165e-06, + "loss": 2.5663, + "step": 2038 + }, + { + "epoch": 0.10938841201716738, + "grad_norm": 1.3104445934295654, + "learning_rate": 4.918558415626545e-06, + "loss": 1.9119, + "step": 2039 + }, + { + "epoch": 0.10944206008583691, + "grad_norm": 1.0582126379013062, + "learning_rate": 4.918448404242306e-06, + "loss": 1.9362, + "step": 2040 + }, + { + "epoch": 0.10949570815450643, + "grad_norm": 1.1518861055374146, + "learning_rate": 4.918338319838422e-06, + "loss": 2.275, + "step": 2041 + }, + { + "epoch": 0.10954935622317596, + "grad_norm": 1.5217136144638062, + "learning_rate": 4.918228162418216e-06, + "loss": 2.3499, + "step": 2042 + }, + { + "epoch": 0.1096030042918455, + "grad_norm": 1.1799073219299316, + "learning_rate": 4.918117931985015e-06, + "loss": 2.0551, + "step": 2043 + }, + { + "epoch": 0.10965665236051503, + "grad_norm": 1.0288593769073486, + "learning_rate": 4.918007628542147e-06, + "loss": 2.4489, + "step": 2044 + }, + { + "epoch": 0.10971030042918455, + "grad_norm": 1.0439658164978027, + "learning_rate": 4.917897252092941e-06, + "loss": 2.2469, + "step": 2045 + }, + { + "epoch": 0.10976394849785408, + "grad_norm": 1.2418016195297241, + "learning_rate": 4.917786802640731e-06, + "loss": 2.4485, + "step": 2046 + }, + { + "epoch": 0.10981759656652361, + "grad_norm": 1.3407255411148071, + "learning_rate": 4.917676280188853e-06, + "loss": 2.3523, + "step": 2047 + }, + { + "epoch": 0.10987124463519313, + "grad_norm": 0.9720165133476257, + "learning_rate": 4.91756568474064e-06, + "loss": 1.9392, + "step": 2048 + }, + { + "epoch": 0.10992489270386266, + "grad_norm": 1.1267201900482178, + "learning_rate": 4.917455016299435e-06, + "loss": 2.3207, + "step": 2049 + }, + { + "epoch": 0.10997854077253219, + "grad_norm": 1.117612600326538, + "learning_rate": 4.917344274868577e-06, + "loss": 2.5463, + "step": 2050 + }, + { + "epoch": 0.11003218884120172, + "grad_norm": 1.2730518579483032, + "learning_rate": 4.917233460451411e-06, + "loss": 2.4726, + "step": 2051 + }, + { + "epoch": 0.11008583690987124, + "grad_norm": 1.255043387413025, + "learning_rate": 4.917122573051282e-06, + "loss": 2.5161, + "step": 2052 + }, + { + "epoch": 0.11013948497854077, + "grad_norm": 1.0561366081237793, + "learning_rate": 4.917011612671539e-06, + "loss": 2.1806, + "step": 2053 + }, + { + "epoch": 0.1101931330472103, + "grad_norm": 1.0609501600265503, + "learning_rate": 4.916900579315531e-06, + "loss": 2.3887, + "step": 2054 + }, + { + "epoch": 0.11024678111587982, + "grad_norm": 1.147754192352295, + "learning_rate": 4.9167894729866104e-06, + "loss": 2.458, + "step": 2055 + }, + { + "epoch": 0.11030042918454935, + "grad_norm": 1.0064215660095215, + "learning_rate": 4.916678293688133e-06, + "loss": 1.9657, + "step": 2056 + }, + { + "epoch": 0.11035407725321889, + "grad_norm": 0.9739856719970703, + "learning_rate": 4.9165670414234545e-06, + "loss": 2.2862, + "step": 2057 + }, + { + "epoch": 0.11040772532188842, + "grad_norm": 6.9045820236206055, + "learning_rate": 4.916455716195935e-06, + "loss": 2.1785, + "step": 2058 + }, + { + "epoch": 0.11046137339055793, + "grad_norm": 1.451264500617981, + "learning_rate": 4.916344318008934e-06, + "loss": 2.4807, + "step": 2059 + }, + { + "epoch": 0.11051502145922747, + "grad_norm": 1.047815203666687, + "learning_rate": 4.916232846865817e-06, + "loss": 2.2352, + "step": 2060 + }, + { + "epoch": 0.110568669527897, + "grad_norm": 1.1995114088058472, + "learning_rate": 4.9161213027699485e-06, + "loss": 2.1995, + "step": 2061 + }, + { + "epoch": 0.11062231759656653, + "grad_norm": 0.8995116949081421, + "learning_rate": 4.9160096857246955e-06, + "loss": 2.009, + "step": 2062 + }, + { + "epoch": 0.11067596566523605, + "grad_norm": 3.1704225540161133, + "learning_rate": 4.91589799573343e-06, + "loss": 2.1819, + "step": 2063 + }, + { + "epoch": 0.11072961373390558, + "grad_norm": 1.0004647970199585, + "learning_rate": 4.915786232799522e-06, + "loss": 2.3666, + "step": 2064 + }, + { + "epoch": 0.11078326180257511, + "grad_norm": 1.0436476469039917, + "learning_rate": 4.9156743969263475e-06, + "loss": 2.1743, + "step": 2065 + }, + { + "epoch": 0.11083690987124463, + "grad_norm": 1.4021981954574585, + "learning_rate": 4.915562488117284e-06, + "loss": 2.3301, + "step": 2066 + }, + { + "epoch": 0.11089055793991416, + "grad_norm": 1.48274827003479, + "learning_rate": 4.915450506375708e-06, + "loss": 2.3904, + "step": 2067 + }, + { + "epoch": 0.11094420600858369, + "grad_norm": 1.0304043292999268, + "learning_rate": 4.915338451705001e-06, + "loss": 2.1186, + "step": 2068 + }, + { + "epoch": 0.11099785407725322, + "grad_norm": 1.1636834144592285, + "learning_rate": 4.9152263241085474e-06, + "loss": 2.1914, + "step": 2069 + }, + { + "epoch": 0.11105150214592274, + "grad_norm": 0.9261377453804016, + "learning_rate": 4.9151141235897326e-06, + "loss": 2.1626, + "step": 2070 + }, + { + "epoch": 0.11110515021459227, + "grad_norm": 5.439774990081787, + "learning_rate": 4.915001850151943e-06, + "loss": 2.2354, + "step": 2071 + }, + { + "epoch": 0.1111587982832618, + "grad_norm": 1.1202877759933472, + "learning_rate": 4.91488950379857e-06, + "loss": 2.2875, + "step": 2072 + }, + { + "epoch": 0.11121244635193132, + "grad_norm": 1.1542850732803345, + "learning_rate": 4.914777084533004e-06, + "loss": 2.3737, + "step": 2073 + }, + { + "epoch": 0.11126609442060086, + "grad_norm": 0.9751764535903931, + "learning_rate": 4.9146645923586406e-06, + "loss": 2.4268, + "step": 2074 + }, + { + "epoch": 0.11131974248927039, + "grad_norm": 1.25663423538208, + "learning_rate": 4.9145520272788755e-06, + "loss": 1.3381, + "step": 2075 + }, + { + "epoch": 0.11137339055793992, + "grad_norm": 1.1756258010864258, + "learning_rate": 4.914439389297107e-06, + "loss": 1.3796, + "step": 2076 + }, + { + "epoch": 0.11142703862660944, + "grad_norm": 1.1819255352020264, + "learning_rate": 4.914326678416738e-06, + "loss": 2.2776, + "step": 2077 + }, + { + "epoch": 0.11148068669527897, + "grad_norm": 1.0208954811096191, + "learning_rate": 4.9142138946411685e-06, + "loss": 2.3087, + "step": 2078 + }, + { + "epoch": 0.1115343347639485, + "grad_norm": 1.1422512531280518, + "learning_rate": 4.914101037973806e-06, + "loss": 2.144, + "step": 2079 + }, + { + "epoch": 0.11158798283261803, + "grad_norm": 1.0211634635925293, + "learning_rate": 4.913988108418058e-06, + "loss": 2.3984, + "step": 2080 + }, + { + "epoch": 0.11164163090128755, + "grad_norm": 1.12263023853302, + "learning_rate": 4.913875105977332e-06, + "loss": 2.3935, + "step": 2081 + }, + { + "epoch": 0.11169527896995708, + "grad_norm": 1.1522924900054932, + "learning_rate": 4.913762030655043e-06, + "loss": 2.1781, + "step": 2082 + }, + { + "epoch": 0.11174892703862661, + "grad_norm": 1.1698349714279175, + "learning_rate": 4.9136488824546025e-06, + "loss": 2.2696, + "step": 2083 + }, + { + "epoch": 0.11180257510729613, + "grad_norm": 1.1896405220031738, + "learning_rate": 4.913535661379427e-06, + "loss": 2.1026, + "step": 2084 + }, + { + "epoch": 0.11185622317596566, + "grad_norm": 1.0642669200897217, + "learning_rate": 4.913422367432937e-06, + "loss": 2.0953, + "step": 2085 + }, + { + "epoch": 0.1119098712446352, + "grad_norm": 1.0581564903259277, + "learning_rate": 4.9133090006185505e-06, + "loss": 2.4164, + "step": 2086 + }, + { + "epoch": 0.11196351931330473, + "grad_norm": 1.449578046798706, + "learning_rate": 4.913195560939692e-06, + "loss": 2.2158, + "step": 2087 + }, + { + "epoch": 0.11201716738197424, + "grad_norm": 1.1908867359161377, + "learning_rate": 4.9130820483997865e-06, + "loss": 2.3059, + "step": 2088 + }, + { + "epoch": 0.11207081545064378, + "grad_norm": 1.2181179523468018, + "learning_rate": 4.91296846300226e-06, + "loss": 2.2419, + "step": 2089 + }, + { + "epoch": 0.11212446351931331, + "grad_norm": 1.125166893005371, + "learning_rate": 4.912854804750544e-06, + "loss": 2.3256, + "step": 2090 + }, + { + "epoch": 0.11217811158798283, + "grad_norm": 1.1511597633361816, + "learning_rate": 4.912741073648068e-06, + "loss": 2.3099, + "step": 2091 + }, + { + "epoch": 0.11223175965665236, + "grad_norm": 1.1430375576019287, + "learning_rate": 4.912627269698268e-06, + "loss": 2.2793, + "step": 2092 + }, + { + "epoch": 0.11228540772532189, + "grad_norm": 1.1949678659439087, + "learning_rate": 4.912513392904577e-06, + "loss": 2.3914, + "step": 2093 + }, + { + "epoch": 0.11233905579399142, + "grad_norm": 1.2153464555740356, + "learning_rate": 4.912399443270437e-06, + "loss": 2.3425, + "step": 2094 + }, + { + "epoch": 0.11239270386266094, + "grad_norm": 1.229775309562683, + "learning_rate": 4.912285420799286e-06, + "loss": 1.4476, + "step": 2095 + }, + { + "epoch": 0.11244635193133047, + "grad_norm": 1.1002280712127686, + "learning_rate": 4.912171325494568e-06, + "loss": 2.34, + "step": 2096 + }, + { + "epoch": 0.1125, + "grad_norm": 1.1302978992462158, + "learning_rate": 4.912057157359727e-06, + "loss": 2.2485, + "step": 2097 + }, + { + "epoch": 0.11255364806866953, + "grad_norm": 1.0015721321105957, + "learning_rate": 4.9119429163982094e-06, + "loss": 2.3147, + "step": 2098 + }, + { + "epoch": 0.11260729613733905, + "grad_norm": 1.0378457307815552, + "learning_rate": 4.911828602613466e-06, + "loss": 2.1815, + "step": 2099 + }, + { + "epoch": 0.11266094420600858, + "grad_norm": 1.0669654607772827, + "learning_rate": 4.911714216008946e-06, + "loss": 2.0762, + "step": 2100 + }, + { + "epoch": 0.11271459227467812, + "grad_norm": 1.1532959938049316, + "learning_rate": 4.911599756588106e-06, + "loss": 1.9917, + "step": 2101 + }, + { + "epoch": 0.11276824034334763, + "grad_norm": 1.1765762567520142, + "learning_rate": 4.9114852243543995e-06, + "loss": 2.0649, + "step": 2102 + }, + { + "epoch": 0.11282188841201717, + "grad_norm": 1.1193914413452148, + "learning_rate": 4.911370619311287e-06, + "loss": 2.3835, + "step": 2103 + }, + { + "epoch": 0.1128755364806867, + "grad_norm": 1.1819320917129517, + "learning_rate": 4.911255941462225e-06, + "loss": 2.5395, + "step": 2104 + }, + { + "epoch": 0.11292918454935623, + "grad_norm": 1.0073168277740479, + "learning_rate": 4.91114119081068e-06, + "loss": 2.321, + "step": 2105 + }, + { + "epoch": 0.11298283261802575, + "grad_norm": 1.103088617324829, + "learning_rate": 4.911026367360114e-06, + "loss": 2.4836, + "step": 2106 + }, + { + "epoch": 0.11303648068669528, + "grad_norm": 1.6039079427719116, + "learning_rate": 4.910911471113994e-06, + "loss": 2.4099, + "step": 2107 + }, + { + "epoch": 0.11309012875536481, + "grad_norm": 1.127637505531311, + "learning_rate": 4.910796502075791e-06, + "loss": 2.2958, + "step": 2108 + }, + { + "epoch": 0.11314377682403433, + "grad_norm": 1.2135496139526367, + "learning_rate": 4.910681460248974e-06, + "loss": 2.289, + "step": 2109 + }, + { + "epoch": 0.11319742489270386, + "grad_norm": 1.2150956392288208, + "learning_rate": 4.910566345637017e-06, + "loss": 2.3871, + "step": 2110 + }, + { + "epoch": 0.11325107296137339, + "grad_norm": 1.0421477556228638, + "learning_rate": 4.9104511582433965e-06, + "loss": 2.1875, + "step": 2111 + }, + { + "epoch": 0.11330472103004292, + "grad_norm": 1.0551294088363647, + "learning_rate": 4.91033589807159e-06, + "loss": 2.0893, + "step": 2112 + }, + { + "epoch": 0.11335836909871244, + "grad_norm": 1.499182105064392, + "learning_rate": 4.910220565125077e-06, + "loss": 2.2931, + "step": 2113 + }, + { + "epoch": 0.11341201716738197, + "grad_norm": 1.1058008670806885, + "learning_rate": 4.910105159407339e-06, + "loss": 2.2322, + "step": 2114 + }, + { + "epoch": 0.1134656652360515, + "grad_norm": 1.0842797756195068, + "learning_rate": 4.909989680921863e-06, + "loss": 2.6319, + "step": 2115 + }, + { + "epoch": 0.11351931330472104, + "grad_norm": 1.4017088413238525, + "learning_rate": 4.909874129672133e-06, + "loss": 2.5882, + "step": 2116 + }, + { + "epoch": 0.11357296137339055, + "grad_norm": 1.0638715028762817, + "learning_rate": 4.909758505661639e-06, + "loss": 2.3841, + "step": 2117 + }, + { + "epoch": 0.11362660944206009, + "grad_norm": 1.2103101015090942, + "learning_rate": 4.909642808893873e-06, + "loss": 2.444, + "step": 2118 + }, + { + "epoch": 0.11368025751072962, + "grad_norm": 1.1628270149230957, + "learning_rate": 4.909527039372326e-06, + "loss": 2.2928, + "step": 2119 + }, + { + "epoch": 0.11373390557939914, + "grad_norm": 1.0771673917770386, + "learning_rate": 4.909411197100494e-06, + "loss": 2.4213, + "step": 2120 + }, + { + "epoch": 0.11378755364806867, + "grad_norm": 1.1699645519256592, + "learning_rate": 4.909295282081876e-06, + "loss": 2.4298, + "step": 2121 + }, + { + "epoch": 0.1138412017167382, + "grad_norm": 1.004538893699646, + "learning_rate": 4.90917929431997e-06, + "loss": 2.0826, + "step": 2122 + }, + { + "epoch": 0.11389484978540773, + "grad_norm": 1.2406861782073975, + "learning_rate": 4.90906323381828e-06, + "loss": 2.6382, + "step": 2123 + }, + { + "epoch": 0.11394849785407725, + "grad_norm": 1.0982295274734497, + "learning_rate": 4.908947100580308e-06, + "loss": 1.5078, + "step": 2124 + }, + { + "epoch": 0.11400214592274678, + "grad_norm": 1.0639511346817017, + "learning_rate": 4.908830894609562e-06, + "loss": 2.271, + "step": 2125 + }, + { + "epoch": 0.11405579399141631, + "grad_norm": 1.044068455696106, + "learning_rate": 4.90871461590955e-06, + "loss": 2.2488, + "step": 2126 + }, + { + "epoch": 0.11410944206008583, + "grad_norm": 0.899344801902771, + "learning_rate": 4.908598264483782e-06, + "loss": 1.935, + "step": 2127 + }, + { + "epoch": 0.11416309012875536, + "grad_norm": 1.2748914957046509, + "learning_rate": 4.908481840335772e-06, + "loss": 2.3807, + "step": 2128 + }, + { + "epoch": 0.1142167381974249, + "grad_norm": 0.9995961785316467, + "learning_rate": 4.908365343469036e-06, + "loss": 1.9777, + "step": 2129 + }, + { + "epoch": 0.11427038626609443, + "grad_norm": 1.1364281177520752, + "learning_rate": 4.908248773887089e-06, + "loss": 2.3212, + "step": 2130 + }, + { + "epoch": 0.11432403433476394, + "grad_norm": 1.0163472890853882, + "learning_rate": 4.908132131593453e-06, + "loss": 2.288, + "step": 2131 + }, + { + "epoch": 0.11437768240343348, + "grad_norm": 1.072706937789917, + "learning_rate": 4.908015416591647e-06, + "loss": 2.5196, + "step": 2132 + }, + { + "epoch": 0.11443133047210301, + "grad_norm": 1.2132407426834106, + "learning_rate": 4.9078986288851974e-06, + "loss": 2.4029, + "step": 2133 + }, + { + "epoch": 0.11448497854077254, + "grad_norm": 1.2211484909057617, + "learning_rate": 4.90778176847763e-06, + "loss": 2.3065, + "step": 2134 + }, + { + "epoch": 0.11453862660944206, + "grad_norm": 1.230224847793579, + "learning_rate": 4.907664835372472e-06, + "loss": 2.3547, + "step": 2135 + }, + { + "epoch": 0.11459227467811159, + "grad_norm": 1.2703454494476318, + "learning_rate": 4.907547829573254e-06, + "loss": 2.3336, + "step": 2136 + }, + { + "epoch": 0.11464592274678112, + "grad_norm": 1.0291352272033691, + "learning_rate": 4.907430751083511e-06, + "loss": 2.0963, + "step": 2137 + }, + { + "epoch": 0.11469957081545064, + "grad_norm": 1.1068578958511353, + "learning_rate": 4.9073135999067745e-06, + "loss": 2.5326, + "step": 2138 + }, + { + "epoch": 0.11475321888412017, + "grad_norm": 1.4264963865280151, + "learning_rate": 4.907196376046583e-06, + "loss": 2.3075, + "step": 2139 + }, + { + "epoch": 0.1148068669527897, + "grad_norm": 1.0852543115615845, + "learning_rate": 4.907079079506478e-06, + "loss": 2.1412, + "step": 2140 + }, + { + "epoch": 0.11486051502145923, + "grad_norm": 0.9450492858886719, + "learning_rate": 4.906961710289998e-06, + "loss": 2.1248, + "step": 2141 + }, + { + "epoch": 0.11491416309012875, + "grad_norm": 1.37924325466156, + "learning_rate": 4.9068442684006875e-06, + "loss": 2.4428, + "step": 2142 + }, + { + "epoch": 0.11496781115879828, + "grad_norm": 1.090120792388916, + "learning_rate": 4.906726753842094e-06, + "loss": 2.169, + "step": 2143 + }, + { + "epoch": 0.11502145922746781, + "grad_norm": 1.117771863937378, + "learning_rate": 4.906609166617763e-06, + "loss": 2.2899, + "step": 2144 + }, + { + "epoch": 0.11507510729613733, + "grad_norm": 1.1384176015853882, + "learning_rate": 4.906491506731247e-06, + "loss": 2.2955, + "step": 2145 + }, + { + "epoch": 0.11512875536480686, + "grad_norm": 2.3992228507995605, + "learning_rate": 4.9063737741860975e-06, + "loss": 2.3496, + "step": 2146 + }, + { + "epoch": 0.1151824034334764, + "grad_norm": 1.0887489318847656, + "learning_rate": 4.906255968985869e-06, + "loss": 2.2459, + "step": 2147 + }, + { + "epoch": 0.11523605150214593, + "grad_norm": 1.25492525100708, + "learning_rate": 4.906138091134118e-06, + "loss": 2.5008, + "step": 2148 + }, + { + "epoch": 0.11528969957081545, + "grad_norm": 1.120599627494812, + "learning_rate": 4.9060201406344056e-06, + "loss": 1.8314, + "step": 2149 + }, + { + "epoch": 0.11534334763948498, + "grad_norm": 0.9818553328514099, + "learning_rate": 4.905902117490291e-06, + "loss": 2.2602, + "step": 2150 + }, + { + "epoch": 0.11539699570815451, + "grad_norm": 5.510129928588867, + "learning_rate": 4.905784021705338e-06, + "loss": 2.3903, + "step": 2151 + }, + { + "epoch": 0.11545064377682404, + "grad_norm": 1.0270910263061523, + "learning_rate": 4.905665853283112e-06, + "loss": 2.2123, + "step": 2152 + }, + { + "epoch": 0.11550429184549356, + "grad_norm": 1.265606164932251, + "learning_rate": 4.905547612227183e-06, + "loss": 2.0716, + "step": 2153 + }, + { + "epoch": 0.11555793991416309, + "grad_norm": 1.1506266593933105, + "learning_rate": 4.9054292985411175e-06, + "loss": 2.4673, + "step": 2154 + }, + { + "epoch": 0.11561158798283262, + "grad_norm": 1.0741430521011353, + "learning_rate": 4.905310912228491e-06, + "loss": 2.0951, + "step": 2155 + }, + { + "epoch": 0.11566523605150214, + "grad_norm": 1.1328495740890503, + "learning_rate": 4.905192453292876e-06, + "loss": 2.3399, + "step": 2156 + }, + { + "epoch": 0.11571888412017167, + "grad_norm": 1.1842769384384155, + "learning_rate": 4.90507392173785e-06, + "loss": 2.4153, + "step": 2157 + }, + { + "epoch": 0.1157725321888412, + "grad_norm": 1.2189619541168213, + "learning_rate": 4.90495531756699e-06, + "loss": 2.4993, + "step": 2158 + }, + { + "epoch": 0.11582618025751074, + "grad_norm": 0.9801350235939026, + "learning_rate": 4.90483664078388e-06, + "loss": 2.021, + "step": 2159 + }, + { + "epoch": 0.11587982832618025, + "grad_norm": 1.056208610534668, + "learning_rate": 4.9047178913921005e-06, + "loss": 2.2384, + "step": 2160 + }, + { + "epoch": 0.11593347639484979, + "grad_norm": 0.9219304323196411, + "learning_rate": 4.904599069395239e-06, + "loss": 2.0414, + "step": 2161 + }, + { + "epoch": 0.11598712446351932, + "grad_norm": 2.476566791534424, + "learning_rate": 4.904480174796881e-06, + "loss": 2.3725, + "step": 2162 + }, + { + "epoch": 0.11604077253218884, + "grad_norm": 1.02821946144104, + "learning_rate": 4.904361207600618e-06, + "loss": 2.2382, + "step": 2163 + }, + { + "epoch": 0.11609442060085837, + "grad_norm": 1.128381609916687, + "learning_rate": 4.90424216781004e-06, + "loss": 2.4118, + "step": 2164 + }, + { + "epoch": 0.1161480686695279, + "grad_norm": 1.044130563735962, + "learning_rate": 4.904123055428743e-06, + "loss": 2.2685, + "step": 2165 + }, + { + "epoch": 0.11620171673819743, + "grad_norm": 1.3193451166152954, + "learning_rate": 4.904003870460323e-06, + "loss": 2.4698, + "step": 2166 + }, + { + "epoch": 0.11625536480686695, + "grad_norm": 1.1072287559509277, + "learning_rate": 4.903884612908379e-06, + "loss": 2.0913, + "step": 2167 + }, + { + "epoch": 0.11630901287553648, + "grad_norm": 0.9757217764854431, + "learning_rate": 4.903765282776509e-06, + "loss": 2.1555, + "step": 2168 + }, + { + "epoch": 0.11636266094420601, + "grad_norm": 1.0752724409103394, + "learning_rate": 4.903645880068319e-06, + "loss": 2.2628, + "step": 2169 + }, + { + "epoch": 0.11641630901287553, + "grad_norm": 0.9949609637260437, + "learning_rate": 4.903526404787412e-06, + "loss": 2.2473, + "step": 2170 + }, + { + "epoch": 0.11646995708154506, + "grad_norm": 1.220971941947937, + "learning_rate": 4.903406856937397e-06, + "loss": 2.1583, + "step": 2171 + }, + { + "epoch": 0.1165236051502146, + "grad_norm": 1.1386816501617432, + "learning_rate": 4.903287236521883e-06, + "loss": 2.5579, + "step": 2172 + }, + { + "epoch": 0.11657725321888412, + "grad_norm": 1.3955374956130981, + "learning_rate": 4.903167543544481e-06, + "loss": 1.4899, + "step": 2173 + }, + { + "epoch": 0.11663090128755364, + "grad_norm": 0.9502013325691223, + "learning_rate": 4.903047778008805e-06, + "loss": 1.9321, + "step": 2174 + }, + { + "epoch": 0.11668454935622317, + "grad_norm": 0.9913385510444641, + "learning_rate": 4.902927939918472e-06, + "loss": 2.0106, + "step": 2175 + }, + { + "epoch": 0.1167381974248927, + "grad_norm": 1.1221914291381836, + "learning_rate": 4.9028080292770985e-06, + "loss": 2.4941, + "step": 2176 + }, + { + "epoch": 0.11679184549356224, + "grad_norm": 1.1127504110336304, + "learning_rate": 4.902688046088306e-06, + "loss": 2.3154, + "step": 2177 + }, + { + "epoch": 0.11684549356223176, + "grad_norm": 1.0930111408233643, + "learning_rate": 4.902567990355718e-06, + "loss": 2.3043, + "step": 2178 + }, + { + "epoch": 0.11689914163090129, + "grad_norm": 1.0564309358596802, + "learning_rate": 4.902447862082958e-06, + "loss": 2.2588, + "step": 2179 + }, + { + "epoch": 0.11695278969957082, + "grad_norm": 1.0894969701766968, + "learning_rate": 4.902327661273653e-06, + "loss": 2.4006, + "step": 2180 + }, + { + "epoch": 0.11700643776824034, + "grad_norm": 2.0762178897857666, + "learning_rate": 4.902207387931433e-06, + "loss": 2.4005, + "step": 2181 + }, + { + "epoch": 0.11706008583690987, + "grad_norm": 1.3048710823059082, + "learning_rate": 4.902087042059929e-06, + "loss": 2.4738, + "step": 2182 + }, + { + "epoch": 0.1171137339055794, + "grad_norm": 1.0240036249160767, + "learning_rate": 4.901966623662774e-06, + "loss": 2.1186, + "step": 2183 + }, + { + "epoch": 0.11716738197424893, + "grad_norm": 1.124778151512146, + "learning_rate": 4.9018461327436054e-06, + "loss": 2.2246, + "step": 2184 + }, + { + "epoch": 0.11722103004291845, + "grad_norm": 1.0641335248947144, + "learning_rate": 4.901725569306059e-06, + "loss": 1.9817, + "step": 2185 + }, + { + "epoch": 0.11727467811158798, + "grad_norm": 1.028509497642517, + "learning_rate": 4.901604933353777e-06, + "loss": 2.334, + "step": 2186 + }, + { + "epoch": 0.11732832618025751, + "grad_norm": 1.2249325513839722, + "learning_rate": 4.901484224890399e-06, + "loss": 2.422, + "step": 2187 + }, + { + "epoch": 0.11738197424892703, + "grad_norm": 1.0359843969345093, + "learning_rate": 4.901363443919573e-06, + "loss": 2.2407, + "step": 2188 + }, + { + "epoch": 0.11743562231759656, + "grad_norm": 1.117618441581726, + "learning_rate": 4.901242590444943e-06, + "loss": 2.1807, + "step": 2189 + }, + { + "epoch": 0.1174892703862661, + "grad_norm": 1.1504007577896118, + "learning_rate": 4.901121664470159e-06, + "loss": 2.1702, + "step": 2190 + }, + { + "epoch": 0.11754291845493563, + "grad_norm": 1.1611334085464478, + "learning_rate": 4.901000665998873e-06, + "loss": 2.4329, + "step": 2191 + }, + { + "epoch": 0.11759656652360514, + "grad_norm": 1.0668236017227173, + "learning_rate": 4.9008795950347366e-06, + "loss": 2.0801, + "step": 2192 + }, + { + "epoch": 0.11765021459227468, + "grad_norm": 1.1863116025924683, + "learning_rate": 4.900758451581406e-06, + "loss": 2.1948, + "step": 2193 + }, + { + "epoch": 0.11770386266094421, + "grad_norm": 1.2821661233901978, + "learning_rate": 4.900637235642538e-06, + "loss": 2.4351, + "step": 2194 + }, + { + "epoch": 0.11775751072961374, + "grad_norm": 1.008506178855896, + "learning_rate": 4.900515947221794e-06, + "loss": 2.3231, + "step": 2195 + }, + { + "epoch": 0.11781115879828326, + "grad_norm": 1.093316912651062, + "learning_rate": 4.900394586322835e-06, + "loss": 2.3706, + "step": 2196 + }, + { + "epoch": 0.11786480686695279, + "grad_norm": 1.0420522689819336, + "learning_rate": 4.900273152949326e-06, + "loss": 2.4209, + "step": 2197 + }, + { + "epoch": 0.11791845493562232, + "grad_norm": 1.185957670211792, + "learning_rate": 4.900151647104933e-06, + "loss": 2.3107, + "step": 2198 + }, + { + "epoch": 0.11797210300429184, + "grad_norm": 1.2284326553344727, + "learning_rate": 4.900030068793323e-06, + "loss": 2.2998, + "step": 2199 + }, + { + "epoch": 0.11802575107296137, + "grad_norm": 1.2085764408111572, + "learning_rate": 4.899908418018169e-06, + "loss": 2.5662, + "step": 2200 + }, + { + "epoch": 0.1180793991416309, + "grad_norm": 5.531564712524414, + "learning_rate": 4.899786694783144e-06, + "loss": 2.4013, + "step": 2201 + }, + { + "epoch": 0.11813304721030043, + "grad_norm": 1.008955717086792, + "learning_rate": 4.899664899091921e-06, + "loss": 2.3071, + "step": 2202 + }, + { + "epoch": 0.11818669527896995, + "grad_norm": 1.2425086498260498, + "learning_rate": 4.89954303094818e-06, + "loss": 1.7144, + "step": 2203 + }, + { + "epoch": 0.11824034334763948, + "grad_norm": 1.084505558013916, + "learning_rate": 4.899421090355599e-06, + "loss": 2.1824, + "step": 2204 + }, + { + "epoch": 0.11829399141630902, + "grad_norm": 1.035610556602478, + "learning_rate": 4.89929907731786e-06, + "loss": 2.1197, + "step": 2205 + }, + { + "epoch": 0.11834763948497853, + "grad_norm": 1.176472783088684, + "learning_rate": 4.899176991838647e-06, + "loss": 2.4991, + "step": 2206 + }, + { + "epoch": 0.11840128755364807, + "grad_norm": 1.188096284866333, + "learning_rate": 4.899054833921646e-06, + "loss": 2.324, + "step": 2207 + }, + { + "epoch": 0.1184549356223176, + "grad_norm": 0.9822206497192383, + "learning_rate": 4.898932603570544e-06, + "loss": 2.0403, + "step": 2208 + }, + { + "epoch": 0.11850858369098713, + "grad_norm": 1.2326061725616455, + "learning_rate": 4.898810300789034e-06, + "loss": 2.3191, + "step": 2209 + }, + { + "epoch": 0.11856223175965665, + "grad_norm": 1.0185041427612305, + "learning_rate": 4.8986879255808075e-06, + "loss": 2.2197, + "step": 2210 + }, + { + "epoch": 0.11861587982832618, + "grad_norm": 1.6082539558410645, + "learning_rate": 4.8985654779495595e-06, + "loss": 2.1429, + "step": 2211 + }, + { + "epoch": 0.11866952789699571, + "grad_norm": 1.3185955286026, + "learning_rate": 4.898442957898986e-06, + "loss": 2.3797, + "step": 2212 + }, + { + "epoch": 0.11872317596566524, + "grad_norm": 1.179489016532898, + "learning_rate": 4.8983203654327875e-06, + "loss": 2.3496, + "step": 2213 + }, + { + "epoch": 0.11877682403433476, + "grad_norm": 1.1040499210357666, + "learning_rate": 4.898197700554665e-06, + "loss": 2.0856, + "step": 2214 + }, + { + "epoch": 0.11883047210300429, + "grad_norm": 1.1592415571212769, + "learning_rate": 4.898074963268322e-06, + "loss": 2.2846, + "step": 2215 + }, + { + "epoch": 0.11888412017167382, + "grad_norm": 1.2647626399993896, + "learning_rate": 4.897952153577464e-06, + "loss": 2.364, + "step": 2216 + }, + { + "epoch": 0.11893776824034334, + "grad_norm": 1.2367607355117798, + "learning_rate": 4.897829271485799e-06, + "loss": 2.0552, + "step": 2217 + }, + { + "epoch": 0.11899141630901287, + "grad_norm": 1.0942049026489258, + "learning_rate": 4.897706316997038e-06, + "loss": 2.3617, + "step": 2218 + }, + { + "epoch": 0.1190450643776824, + "grad_norm": 1.1610749959945679, + "learning_rate": 4.897583290114893e-06, + "loss": 2.3401, + "step": 2219 + }, + { + "epoch": 0.11909871244635194, + "grad_norm": 1.0944725275039673, + "learning_rate": 4.897460190843077e-06, + "loss": 2.2566, + "step": 2220 + }, + { + "epoch": 0.11915236051502145, + "grad_norm": 1.1467441320419312, + "learning_rate": 4.89733701918531e-06, + "loss": 2.1874, + "step": 2221 + }, + { + "epoch": 0.11920600858369099, + "grad_norm": 1.3802417516708374, + "learning_rate": 4.897213775145308e-06, + "loss": 2.2388, + "step": 2222 + }, + { + "epoch": 0.11925965665236052, + "grad_norm": 1.7257260084152222, + "learning_rate": 4.897090458726792e-06, + "loss": 1.6935, + "step": 2223 + }, + { + "epoch": 0.11931330472103004, + "grad_norm": 1.4928561449050903, + "learning_rate": 4.896967069933487e-06, + "loss": 2.4323, + "step": 2224 + }, + { + "epoch": 0.11936695278969957, + "grad_norm": 1.3938133716583252, + "learning_rate": 4.896843608769117e-06, + "loss": 2.0414, + "step": 2225 + }, + { + "epoch": 0.1194206008583691, + "grad_norm": 1.1266670227050781, + "learning_rate": 4.896720075237411e-06, + "loss": 2.5809, + "step": 2226 + }, + { + "epoch": 0.11947424892703863, + "grad_norm": 1.175878882408142, + "learning_rate": 4.8965964693420985e-06, + "loss": 2.2741, + "step": 2227 + }, + { + "epoch": 0.11952789699570815, + "grad_norm": 1.298793911933899, + "learning_rate": 4.89647279108691e-06, + "loss": 2.0195, + "step": 2228 + }, + { + "epoch": 0.11958154506437768, + "grad_norm": 1.122757911682129, + "learning_rate": 4.896349040475582e-06, + "loss": 2.2946, + "step": 2229 + }, + { + "epoch": 0.11963519313304721, + "grad_norm": 0.9356628656387329, + "learning_rate": 4.8962252175118494e-06, + "loss": 2.127, + "step": 2230 + }, + { + "epoch": 0.11968884120171674, + "grad_norm": 1.2855839729309082, + "learning_rate": 4.896101322199451e-06, + "loss": 2.0342, + "step": 2231 + }, + { + "epoch": 0.11974248927038626, + "grad_norm": 1.0085489749908447, + "learning_rate": 4.895977354542127e-06, + "loss": 2.2659, + "step": 2232 + }, + { + "epoch": 0.1197961373390558, + "grad_norm": 1.6022034883499146, + "learning_rate": 4.895853314543621e-06, + "loss": 2.2293, + "step": 2233 + }, + { + "epoch": 0.11984978540772533, + "grad_norm": 0.9767213463783264, + "learning_rate": 4.895729202207678e-06, + "loss": 2.121, + "step": 2234 + }, + { + "epoch": 0.11990343347639484, + "grad_norm": 1.094233751296997, + "learning_rate": 4.895605017538046e-06, + "loss": 2.3498, + "step": 2235 + }, + { + "epoch": 0.11995708154506438, + "grad_norm": 1.0703954696655273, + "learning_rate": 4.895480760538474e-06, + "loss": 2.3645, + "step": 2236 + }, + { + "epoch": 0.12001072961373391, + "grad_norm": 1.4306122064590454, + "learning_rate": 4.895356431212713e-06, + "loss": 2.3706, + "step": 2237 + }, + { + "epoch": 0.12006437768240344, + "grad_norm": 1.7209537029266357, + "learning_rate": 4.895232029564517e-06, + "loss": 2.361, + "step": 2238 + }, + { + "epoch": 0.12011802575107296, + "grad_norm": 1.271087408065796, + "learning_rate": 4.895107555597642e-06, + "loss": 2.7223, + "step": 2239 + }, + { + "epoch": 0.12017167381974249, + "grad_norm": 1.2082319259643555, + "learning_rate": 4.894983009315848e-06, + "loss": 2.3118, + "step": 2240 + }, + { + "epoch": 0.12022532188841202, + "grad_norm": 1.6960113048553467, + "learning_rate": 4.894858390722893e-06, + "loss": 2.3407, + "step": 2241 + }, + { + "epoch": 0.12027896995708154, + "grad_norm": 1.1810345649719238, + "learning_rate": 4.8947336998225404e-06, + "loss": 2.3015, + "step": 2242 + }, + { + "epoch": 0.12033261802575107, + "grad_norm": 1.2832180261611938, + "learning_rate": 4.894608936618556e-06, + "loss": 2.4866, + "step": 2243 + }, + { + "epoch": 0.1203862660944206, + "grad_norm": 1.0896553993225098, + "learning_rate": 4.8944841011147045e-06, + "loss": 2.4626, + "step": 2244 + }, + { + "epoch": 0.12043991416309013, + "grad_norm": 1.222801685333252, + "learning_rate": 4.894359193314757e-06, + "loss": 2.3645, + "step": 2245 + }, + { + "epoch": 0.12049356223175965, + "grad_norm": 1.9235016107559204, + "learning_rate": 4.894234213222484e-06, + "loss": 2.4034, + "step": 2246 + }, + { + "epoch": 0.12054721030042918, + "grad_norm": 1.4074989557266235, + "learning_rate": 4.894109160841659e-06, + "loss": 2.5836, + "step": 2247 + }, + { + "epoch": 0.12060085836909872, + "grad_norm": 1.1286439895629883, + "learning_rate": 4.8939840361760585e-06, + "loss": 2.0861, + "step": 2248 + }, + { + "epoch": 0.12065450643776825, + "grad_norm": 1.1284736394882202, + "learning_rate": 4.89385883922946e-06, + "loss": 1.6527, + "step": 2249 + }, + { + "epoch": 0.12070815450643776, + "grad_norm": 1.1600624322891235, + "learning_rate": 4.893733570005642e-06, + "loss": 2.3329, + "step": 2250 + }, + { + "epoch": 0.1207618025751073, + "grad_norm": 1.0814099311828613, + "learning_rate": 4.893608228508389e-06, + "loss": 2.5917, + "step": 2251 + }, + { + "epoch": 0.12081545064377683, + "grad_norm": 1.070959448814392, + "learning_rate": 4.893482814741484e-06, + "loss": 2.3385, + "step": 2252 + }, + { + "epoch": 0.12086909871244635, + "grad_norm": 1.0625431537628174, + "learning_rate": 4.893357328708713e-06, + "loss": 2.2465, + "step": 2253 + }, + { + "epoch": 0.12092274678111588, + "grad_norm": 1.0816937685012817, + "learning_rate": 4.893231770413867e-06, + "loss": 2.2662, + "step": 2254 + }, + { + "epoch": 0.12097639484978541, + "grad_norm": 1.9735479354858398, + "learning_rate": 4.8931061398607355e-06, + "loss": 2.1403, + "step": 2255 + }, + { + "epoch": 0.12103004291845494, + "grad_norm": 1.1743444204330444, + "learning_rate": 4.892980437053112e-06, + "loss": 2.2858, + "step": 2256 + }, + { + "epoch": 0.12108369098712446, + "grad_norm": 1.1207444667816162, + "learning_rate": 4.892854661994791e-06, + "loss": 2.4183, + "step": 2257 + }, + { + "epoch": 0.12113733905579399, + "grad_norm": 1.1353235244750977, + "learning_rate": 4.8927288146895715e-06, + "loss": 2.1396, + "step": 2258 + }, + { + "epoch": 0.12119098712446352, + "grad_norm": 1.5006572008132935, + "learning_rate": 4.892602895141252e-06, + "loss": 2.2046, + "step": 2259 + }, + { + "epoch": 0.12124463519313304, + "grad_norm": 1.0503261089324951, + "learning_rate": 4.892476903353634e-06, + "loss": 2.3842, + "step": 2260 + }, + { + "epoch": 0.12129828326180257, + "grad_norm": 1.1151103973388672, + "learning_rate": 4.8923508393305224e-06, + "loss": 2.4577, + "step": 2261 + }, + { + "epoch": 0.1213519313304721, + "grad_norm": 8.964838027954102, + "learning_rate": 4.892224703075724e-06, + "loss": 2.3432, + "step": 2262 + }, + { + "epoch": 0.12140557939914164, + "grad_norm": 1.1781694889068604, + "learning_rate": 4.892098494593046e-06, + "loss": 2.3841, + "step": 2263 + }, + { + "epoch": 0.12145922746781115, + "grad_norm": 1.463942050933838, + "learning_rate": 4.891972213886299e-06, + "loss": 2.5706, + "step": 2264 + }, + { + "epoch": 0.12151287553648069, + "grad_norm": 0.9793860912322998, + "learning_rate": 4.891845860959296e-06, + "loss": 2.303, + "step": 2265 + }, + { + "epoch": 0.12156652360515022, + "grad_norm": 3.1655125617980957, + "learning_rate": 4.891719435815854e-06, + "loss": 2.5299, + "step": 2266 + }, + { + "epoch": 0.12162017167381975, + "grad_norm": 1.0490312576293945, + "learning_rate": 4.891592938459786e-06, + "loss": 2.0666, + "step": 2267 + }, + { + "epoch": 0.12167381974248927, + "grad_norm": 1.081045150756836, + "learning_rate": 4.8914663688949145e-06, + "loss": 2.2976, + "step": 2268 + }, + { + "epoch": 0.1217274678111588, + "grad_norm": 1.1728190183639526, + "learning_rate": 4.891339727125061e-06, + "loss": 2.2653, + "step": 2269 + }, + { + "epoch": 0.12178111587982833, + "grad_norm": 1.086280107498169, + "learning_rate": 4.891213013154047e-06, + "loss": 2.526, + "step": 2270 + }, + { + "epoch": 0.12183476394849785, + "grad_norm": 1.153801441192627, + "learning_rate": 4.891086226985699e-06, + "loss": 2.3797, + "step": 2271 + }, + { + "epoch": 0.12188841201716738, + "grad_norm": 1.327539324760437, + "learning_rate": 4.890959368623847e-06, + "loss": 2.43, + "step": 2272 + }, + { + "epoch": 0.12194206008583691, + "grad_norm": 2.161428689956665, + "learning_rate": 4.89083243807232e-06, + "loss": 2.2985, + "step": 2273 + }, + { + "epoch": 0.12199570815450644, + "grad_norm": 1.1817656755447388, + "learning_rate": 4.890705435334948e-06, + "loss": 2.2754, + "step": 2274 + }, + { + "epoch": 0.12204935622317596, + "grad_norm": 1.1818692684173584, + "learning_rate": 4.89057836041557e-06, + "loss": 2.2402, + "step": 2275 + }, + { + "epoch": 0.1221030042918455, + "grad_norm": 1.019187331199646, + "learning_rate": 4.890451213318019e-06, + "loss": 2.5817, + "step": 2276 + }, + { + "epoch": 0.12215665236051503, + "grad_norm": 1.0185984373092651, + "learning_rate": 4.890323994046136e-06, + "loss": 2.0667, + "step": 2277 + }, + { + "epoch": 0.12221030042918454, + "grad_norm": 1.385988473892212, + "learning_rate": 4.890196702603762e-06, + "loss": 2.4344, + "step": 2278 + }, + { + "epoch": 0.12226394849785407, + "grad_norm": 1.0223299264907837, + "learning_rate": 4.890069338994738e-06, + "loss": 1.9043, + "step": 2279 + }, + { + "epoch": 0.1223175965665236, + "grad_norm": 1.1395995616912842, + "learning_rate": 4.889941903222913e-06, + "loss": 2.3864, + "step": 2280 + }, + { + "epoch": 0.12237124463519314, + "grad_norm": 1.0651013851165771, + "learning_rate": 4.889814395292133e-06, + "loss": 2.2772, + "step": 2281 + }, + { + "epoch": 0.12242489270386266, + "grad_norm": 1.121873378753662, + "learning_rate": 4.889686815206247e-06, + "loss": 2.2094, + "step": 2282 + }, + { + "epoch": 0.12247854077253219, + "grad_norm": 2.237943172454834, + "learning_rate": 4.889559162969108e-06, + "loss": 2.4287, + "step": 2283 + }, + { + "epoch": 0.12253218884120172, + "grad_norm": 1.2616499662399292, + "learning_rate": 4.88943143858457e-06, + "loss": 2.3118, + "step": 2284 + }, + { + "epoch": 0.12258583690987125, + "grad_norm": 0.9820063710212708, + "learning_rate": 4.889303642056488e-06, + "loss": 1.9891, + "step": 2285 + }, + { + "epoch": 0.12263948497854077, + "grad_norm": 1.0452661514282227, + "learning_rate": 4.889175773388723e-06, + "loss": 2.2443, + "step": 2286 + }, + { + "epoch": 0.1226931330472103, + "grad_norm": 1.1051013469696045, + "learning_rate": 4.889047832585134e-06, + "loss": 2.4115, + "step": 2287 + }, + { + "epoch": 0.12274678111587983, + "grad_norm": 1.1486003398895264, + "learning_rate": 4.888919819649584e-06, + "loss": 2.3251, + "step": 2288 + }, + { + "epoch": 0.12280042918454935, + "grad_norm": 1.3496659994125366, + "learning_rate": 4.88879173458594e-06, + "loss": 2.3622, + "step": 2289 + }, + { + "epoch": 0.12285407725321888, + "grad_norm": 1.135419487953186, + "learning_rate": 4.8886635773980655e-06, + "loss": 2.487, + "step": 2290 + }, + { + "epoch": 0.12290772532188841, + "grad_norm": 1.3515251874923706, + "learning_rate": 4.888535348089833e-06, + "loss": 2.3393, + "step": 2291 + }, + { + "epoch": 0.12296137339055795, + "grad_norm": 0.967898964881897, + "learning_rate": 4.888407046665113e-06, + "loss": 2.1267, + "step": 2292 + }, + { + "epoch": 0.12301502145922746, + "grad_norm": 1.1497321128845215, + "learning_rate": 4.8882786731277795e-06, + "loss": 2.2572, + "step": 2293 + }, + { + "epoch": 0.123068669527897, + "grad_norm": 1.1062813997268677, + "learning_rate": 4.8881502274817085e-06, + "loss": 2.2211, + "step": 2294 + }, + { + "epoch": 0.12312231759656653, + "grad_norm": 1.4057420492172241, + "learning_rate": 4.888021709730779e-06, + "loss": 2.1616, + "step": 2295 + }, + { + "epoch": 0.12317596566523605, + "grad_norm": 1.0357105731964111, + "learning_rate": 4.887893119878869e-06, + "loss": 2.3083, + "step": 2296 + }, + { + "epoch": 0.12322961373390558, + "grad_norm": 2.976337194442749, + "learning_rate": 4.887764457929862e-06, + "loss": 2.3017, + "step": 2297 + }, + { + "epoch": 0.12328326180257511, + "grad_norm": 1.1037198305130005, + "learning_rate": 4.8876357238876445e-06, + "loss": 2.4369, + "step": 2298 + }, + { + "epoch": 0.12333690987124464, + "grad_norm": 1.0907936096191406, + "learning_rate": 4.8875069177561e-06, + "loss": 2.2435, + "step": 2299 + }, + { + "epoch": 0.12339055793991416, + "grad_norm": 1.07779860496521, + "learning_rate": 4.887378039539121e-06, + "loss": 2.0807, + "step": 2300 + }, + { + "epoch": 0.12344420600858369, + "grad_norm": 1.0141977071762085, + "learning_rate": 4.887249089240596e-06, + "loss": 2.2059, + "step": 2301 + }, + { + "epoch": 0.12349785407725322, + "grad_norm": 1.072801947593689, + "learning_rate": 4.88712006686442e-06, + "loss": 1.958, + "step": 2302 + }, + { + "epoch": 0.12355150214592275, + "grad_norm": 1.3614814281463623, + "learning_rate": 4.886990972414488e-06, + "loss": 2.2468, + "step": 2303 + }, + { + "epoch": 0.12360515021459227, + "grad_norm": 1.0918394327163696, + "learning_rate": 4.886861805894697e-06, + "loss": 2.4399, + "step": 2304 + }, + { + "epoch": 0.1236587982832618, + "grad_norm": 1.2425581216812134, + "learning_rate": 4.886732567308948e-06, + "loss": 2.4537, + "step": 2305 + }, + { + "epoch": 0.12371244635193133, + "grad_norm": 2.0526070594787598, + "learning_rate": 4.886603256661142e-06, + "loss": 2.1689, + "step": 2306 + }, + { + "epoch": 0.12376609442060085, + "grad_norm": 2.4389543533325195, + "learning_rate": 4.886473873955185e-06, + "loss": 2.6921, + "step": 2307 + }, + { + "epoch": 0.12381974248927038, + "grad_norm": 1.3708523511886597, + "learning_rate": 4.8863444191949815e-06, + "loss": 2.4885, + "step": 2308 + }, + { + "epoch": 0.12387339055793992, + "grad_norm": 1.2254393100738525, + "learning_rate": 4.886214892384441e-06, + "loss": 2.3443, + "step": 2309 + }, + { + "epoch": 0.12392703862660945, + "grad_norm": 1.4461462497711182, + "learning_rate": 4.886085293527474e-06, + "loss": 1.3828, + "step": 2310 + }, + { + "epoch": 0.12398068669527897, + "grad_norm": 1.1960307359695435, + "learning_rate": 4.885955622627993e-06, + "loss": 2.4103, + "step": 2311 + }, + { + "epoch": 0.1240343347639485, + "grad_norm": 1.1315014362335205, + "learning_rate": 4.885825879689915e-06, + "loss": 2.3024, + "step": 2312 + }, + { + "epoch": 0.12408798283261803, + "grad_norm": 0.9529332518577576, + "learning_rate": 4.885696064717155e-06, + "loss": 2.0116, + "step": 2313 + }, + { + "epoch": 0.12414163090128755, + "grad_norm": 1.4346996545791626, + "learning_rate": 4.8855661777136345e-06, + "loss": 2.2411, + "step": 2314 + }, + { + "epoch": 0.12419527896995708, + "grad_norm": 1.1109012365341187, + "learning_rate": 4.885436218683272e-06, + "loss": 2.1995, + "step": 2315 + }, + { + "epoch": 0.12424892703862661, + "grad_norm": 1.0108405351638794, + "learning_rate": 4.885306187629995e-06, + "loss": 2.3431, + "step": 2316 + }, + { + "epoch": 0.12430257510729614, + "grad_norm": 1.3428287506103516, + "learning_rate": 4.885176084557729e-06, + "loss": 2.3377, + "step": 2317 + }, + { + "epoch": 0.12435622317596566, + "grad_norm": 1.252379298210144, + "learning_rate": 4.8850459094704e-06, + "loss": 2.3753, + "step": 2318 + }, + { + "epoch": 0.12440987124463519, + "grad_norm": 1.072121262550354, + "learning_rate": 4.88491566237194e-06, + "loss": 2.2639, + "step": 2319 + }, + { + "epoch": 0.12446351931330472, + "grad_norm": 1.1638325452804565, + "learning_rate": 4.88478534326628e-06, + "loss": 2.3239, + "step": 2320 + }, + { + "epoch": 0.12451716738197426, + "grad_norm": 0.9053388237953186, + "learning_rate": 4.884654952157356e-06, + "loss": 2.0564, + "step": 2321 + }, + { + "epoch": 0.12457081545064377, + "grad_norm": 1.1814097166061401, + "learning_rate": 4.884524489049105e-06, + "loss": 2.4508, + "step": 2322 + }, + { + "epoch": 0.1246244635193133, + "grad_norm": 1.1180146932601929, + "learning_rate": 4.8843939539454656e-06, + "loss": 2.4311, + "step": 2323 + }, + { + "epoch": 0.12467811158798284, + "grad_norm": 1.2142528295516968, + "learning_rate": 4.8842633468503785e-06, + "loss": 2.3506, + "step": 2324 + }, + { + "epoch": 0.12473175965665236, + "grad_norm": 0.9260704517364502, + "learning_rate": 4.884132667767788e-06, + "loss": 2.19, + "step": 2325 + }, + { + "epoch": 0.12478540772532189, + "grad_norm": 1.0858399868011475, + "learning_rate": 4.884001916701639e-06, + "loss": 2.4385, + "step": 2326 + }, + { + "epoch": 0.12483905579399142, + "grad_norm": 1.2187477350234985, + "learning_rate": 4.88387109365588e-06, + "loss": 2.3422, + "step": 2327 + }, + { + "epoch": 0.12489270386266095, + "grad_norm": 0.896597683429718, + "learning_rate": 4.883740198634459e-06, + "loss": 1.8902, + "step": 2328 + }, + { + "epoch": 0.12494635193133047, + "grad_norm": 1.2404420375823975, + "learning_rate": 4.88360923164133e-06, + "loss": 2.2073, + "step": 2329 + }, + { + "epoch": 0.125, + "grad_norm": 3.5138399600982666, + "learning_rate": 4.883478192680448e-06, + "loss": 2.2371, + "step": 2330 + }, + { + "epoch": 0.12505364806866953, + "grad_norm": 0.9927777051925659, + "learning_rate": 4.883347081755768e-06, + "loss": 2.0641, + "step": 2331 + }, + { + "epoch": 0.12510729613733906, + "grad_norm": 1.074389934539795, + "learning_rate": 4.883215898871248e-06, + "loss": 2.2161, + "step": 2332 + }, + { + "epoch": 0.1251609442060086, + "grad_norm": 1.1682851314544678, + "learning_rate": 4.88308464403085e-06, + "loss": 2.1986, + "step": 2333 + }, + { + "epoch": 0.1252145922746781, + "grad_norm": 1.182250738143921, + "learning_rate": 4.882953317238536e-06, + "loss": 2.4657, + "step": 2334 + }, + { + "epoch": 0.12526824034334763, + "grad_norm": 0.8748573064804077, + "learning_rate": 4.8828219184982715e-06, + "loss": 2.2611, + "step": 2335 + }, + { + "epoch": 0.12532188841201716, + "grad_norm": 1.1007317304611206, + "learning_rate": 4.8826904478140245e-06, + "loss": 2.3644, + "step": 2336 + }, + { + "epoch": 0.1253755364806867, + "grad_norm": 1.2679766416549683, + "learning_rate": 4.882558905189764e-06, + "loss": 2.194, + "step": 2337 + }, + { + "epoch": 0.12542918454935623, + "grad_norm": 1.1054491996765137, + "learning_rate": 4.88242729062946e-06, + "loss": 2.4034, + "step": 2338 + }, + { + "epoch": 0.12548283261802576, + "grad_norm": 1.0838422775268555, + "learning_rate": 4.882295604137089e-06, + "loss": 2.185, + "step": 2339 + }, + { + "epoch": 0.1255364806866953, + "grad_norm": 1.37325119972229, + "learning_rate": 4.882163845716627e-06, + "loss": 2.3019, + "step": 2340 + }, + { + "epoch": 0.1255901287553648, + "grad_norm": 1.1444363594055176, + "learning_rate": 4.882032015372049e-06, + "loss": 2.3911, + "step": 2341 + }, + { + "epoch": 0.12564377682403433, + "grad_norm": 1.3782764673233032, + "learning_rate": 4.881900113107338e-06, + "loss": 2.0179, + "step": 2342 + }, + { + "epoch": 0.12569742489270386, + "grad_norm": 1.0805110931396484, + "learning_rate": 4.881768138926476e-06, + "loss": 2.0892, + "step": 2343 + }, + { + "epoch": 0.1257510729613734, + "grad_norm": 1.0971037149429321, + "learning_rate": 4.8816360928334474e-06, + "loss": 2.3799, + "step": 2344 + }, + { + "epoch": 0.12580472103004292, + "grad_norm": 1.356163740158081, + "learning_rate": 4.881503974832239e-06, + "loss": 2.2796, + "step": 2345 + }, + { + "epoch": 0.12585836909871245, + "grad_norm": 11.280003547668457, + "learning_rate": 4.881371784926839e-06, + "loss": 2.1939, + "step": 2346 + }, + { + "epoch": 0.12591201716738198, + "grad_norm": 1.1361794471740723, + "learning_rate": 4.88123952312124e-06, + "loss": 2.4777, + "step": 2347 + }, + { + "epoch": 0.1259656652360515, + "grad_norm": 1.1215752363204956, + "learning_rate": 4.881107189419435e-06, + "loss": 2.2627, + "step": 2348 + }, + { + "epoch": 0.12601931330472102, + "grad_norm": 1.053443431854248, + "learning_rate": 4.880974783825419e-06, + "loss": 2.1952, + "step": 2349 + }, + { + "epoch": 0.12607296137339055, + "grad_norm": 1.1118744611740112, + "learning_rate": 4.8808423063431905e-06, + "loss": 2.1625, + "step": 2350 + }, + { + "epoch": 0.12612660944206008, + "grad_norm": 1.3315709829330444, + "learning_rate": 4.880709756976749e-06, + "loss": 2.1729, + "step": 2351 + }, + { + "epoch": 0.12618025751072962, + "grad_norm": 1.4719502925872803, + "learning_rate": 4.880577135730096e-06, + "loss": 2.2937, + "step": 2352 + }, + { + "epoch": 0.12623390557939915, + "grad_norm": 1.437761664390564, + "learning_rate": 4.880444442607236e-06, + "loss": 2.5292, + "step": 2353 + }, + { + "epoch": 0.12628755364806868, + "grad_norm": 1.2620387077331543, + "learning_rate": 4.8803116776121765e-06, + "loss": 2.2312, + "step": 2354 + }, + { + "epoch": 0.1263412017167382, + "grad_norm": 1.1695735454559326, + "learning_rate": 4.8801788407489234e-06, + "loss": 2.1354, + "step": 2355 + }, + { + "epoch": 0.12639484978540771, + "grad_norm": 1.1577333211898804, + "learning_rate": 4.88004593202149e-06, + "loss": 2.4819, + "step": 2356 + }, + { + "epoch": 0.12644849785407725, + "grad_norm": 1.3224793672561646, + "learning_rate": 4.879912951433888e-06, + "loss": 2.3364, + "step": 2357 + }, + { + "epoch": 0.12650214592274678, + "grad_norm": 1.1379387378692627, + "learning_rate": 4.879779898990132e-06, + "loss": 2.133, + "step": 2358 + }, + { + "epoch": 0.1265557939914163, + "grad_norm": 1.2142889499664307, + "learning_rate": 4.879646774694241e-06, + "loss": 2.2295, + "step": 2359 + }, + { + "epoch": 0.12660944206008584, + "grad_norm": 1.0073366165161133, + "learning_rate": 4.879513578550234e-06, + "loss": 2.3677, + "step": 2360 + }, + { + "epoch": 0.12666309012875537, + "grad_norm": 1.1449031829833984, + "learning_rate": 4.879380310562129e-06, + "loss": 2.3376, + "step": 2361 + }, + { + "epoch": 0.1267167381974249, + "grad_norm": 1.1996792554855347, + "learning_rate": 4.8792469707339555e-06, + "loss": 2.2721, + "step": 2362 + }, + { + "epoch": 0.1267703862660944, + "grad_norm": 2.8425180912017822, + "learning_rate": 4.879113559069735e-06, + "loss": 2.0843, + "step": 2363 + }, + { + "epoch": 0.12682403433476394, + "grad_norm": 1.099564790725708, + "learning_rate": 4.878980075573497e-06, + "loss": 2.321, + "step": 2364 + }, + { + "epoch": 0.12687768240343347, + "grad_norm": 1.306047797203064, + "learning_rate": 4.878846520249272e-06, + "loss": 2.1976, + "step": 2365 + }, + { + "epoch": 0.126931330472103, + "grad_norm": 1.669678807258606, + "learning_rate": 4.878712893101092e-06, + "loss": 2.2334, + "step": 2366 + }, + { + "epoch": 0.12698497854077254, + "grad_norm": 1.132728099822998, + "learning_rate": 4.878579194132992e-06, + "loss": 2.095, + "step": 2367 + }, + { + "epoch": 0.12703862660944207, + "grad_norm": 1.1475120782852173, + "learning_rate": 4.878445423349009e-06, + "loss": 2.1269, + "step": 2368 + }, + { + "epoch": 0.1270922746781116, + "grad_norm": 1.0702564716339111, + "learning_rate": 4.8783115807531805e-06, + "loss": 2.1523, + "step": 2369 + }, + { + "epoch": 0.1271459227467811, + "grad_norm": 1.7012202739715576, + "learning_rate": 4.878177666349549e-06, + "loss": 2.2699, + "step": 2370 + }, + { + "epoch": 0.12719957081545064, + "grad_norm": 1.4421552419662476, + "learning_rate": 4.878043680142157e-06, + "loss": 2.3407, + "step": 2371 + }, + { + "epoch": 0.12725321888412017, + "grad_norm": 3.920339584350586, + "learning_rate": 4.87790962213505e-06, + "loss": 2.3075, + "step": 2372 + }, + { + "epoch": 0.1273068669527897, + "grad_norm": 0.9351228475570679, + "learning_rate": 4.877775492332276e-06, + "loss": 1.9369, + "step": 2373 + }, + { + "epoch": 0.12736051502145923, + "grad_norm": 1.1334078311920166, + "learning_rate": 4.8776412907378845e-06, + "loss": 2.2697, + "step": 2374 + }, + { + "epoch": 0.12741416309012876, + "grad_norm": 1.269491195678711, + "learning_rate": 4.877507017355927e-06, + "loss": 2.3136, + "step": 2375 + }, + { + "epoch": 0.1274678111587983, + "grad_norm": 1.1269466876983643, + "learning_rate": 4.877372672190458e-06, + "loss": 2.3255, + "step": 2376 + }, + { + "epoch": 0.1275214592274678, + "grad_norm": 1.7106226682662964, + "learning_rate": 4.8772382552455346e-06, + "loss": 1.8209, + "step": 2377 + }, + { + "epoch": 0.12757510729613733, + "grad_norm": 1.1557451486587524, + "learning_rate": 4.877103766525213e-06, + "loss": 2.4351, + "step": 2378 + }, + { + "epoch": 0.12762875536480686, + "grad_norm": 1.0685505867004395, + "learning_rate": 4.8769692060335564e-06, + "loss": 2.3428, + "step": 2379 + }, + { + "epoch": 0.1276824034334764, + "grad_norm": 1.2688835859298706, + "learning_rate": 4.876834573774626e-06, + "loss": 2.3144, + "step": 2380 + }, + { + "epoch": 0.12773605150214593, + "grad_norm": 1.2049007415771484, + "learning_rate": 4.876699869752486e-06, + "loss": 2.5566, + "step": 2381 + }, + { + "epoch": 0.12778969957081546, + "grad_norm": 0.9294794201850891, + "learning_rate": 4.876565093971205e-06, + "loss": 2.111, + "step": 2382 + }, + { + "epoch": 0.127843347639485, + "grad_norm": 1.2053269147872925, + "learning_rate": 4.8764302464348526e-06, + "loss": 2.1491, + "step": 2383 + }, + { + "epoch": 0.1278969957081545, + "grad_norm": 1.4618021249771118, + "learning_rate": 4.876295327147498e-06, + "loss": 2.261, + "step": 2384 + }, + { + "epoch": 0.12795064377682402, + "grad_norm": 1.1170001029968262, + "learning_rate": 4.876160336113217e-06, + "loss": 2.1095, + "step": 2385 + }, + { + "epoch": 0.12800429184549356, + "grad_norm": 0.9864555597305298, + "learning_rate": 4.876025273336085e-06, + "loss": 2.3188, + "step": 2386 + }, + { + "epoch": 0.1280579399141631, + "grad_norm": 1.1883249282836914, + "learning_rate": 4.875890138820178e-06, + "loss": 2.3047, + "step": 2387 + }, + { + "epoch": 0.12811158798283262, + "grad_norm": 1.2885873317718506, + "learning_rate": 4.875754932569578e-06, + "loss": 2.2965, + "step": 2388 + }, + { + "epoch": 0.12816523605150215, + "grad_norm": 1.0598763227462769, + "learning_rate": 4.875619654588368e-06, + "loss": 2.3065, + "step": 2389 + }, + { + "epoch": 0.12821888412017168, + "grad_norm": 1.1310160160064697, + "learning_rate": 4.87548430488063e-06, + "loss": 2.4561, + "step": 2390 + }, + { + "epoch": 0.12827253218884122, + "grad_norm": 1.2494921684265137, + "learning_rate": 4.875348883450452e-06, + "loss": 2.3162, + "step": 2391 + }, + { + "epoch": 0.12832618025751072, + "grad_norm": 0.9802578687667847, + "learning_rate": 4.875213390301922e-06, + "loss": 1.8146, + "step": 2392 + }, + { + "epoch": 0.12837982832618025, + "grad_norm": 1.1088906526565552, + "learning_rate": 4.875077825439132e-06, + "loss": 2.1672, + "step": 2393 + }, + { + "epoch": 0.12843347639484978, + "grad_norm": 1.2378369569778442, + "learning_rate": 4.874942188866175e-06, + "loss": 2.1707, + "step": 2394 + }, + { + "epoch": 0.12848712446351931, + "grad_norm": 1.4250819683074951, + "learning_rate": 4.874806480587145e-06, + "loss": 2.0099, + "step": 2395 + }, + { + "epoch": 0.12854077253218885, + "grad_norm": 2.238598585128784, + "learning_rate": 4.87467070060614e-06, + "loss": 2.2453, + "step": 2396 + }, + { + "epoch": 0.12859442060085838, + "grad_norm": 1.098453164100647, + "learning_rate": 4.8745348489272595e-06, + "loss": 2.2903, + "step": 2397 + }, + { + "epoch": 0.1286480686695279, + "grad_norm": 1.0525413751602173, + "learning_rate": 4.874398925554606e-06, + "loss": 2.3805, + "step": 2398 + }, + { + "epoch": 0.1287017167381974, + "grad_norm": 1.0820671319961548, + "learning_rate": 4.874262930492283e-06, + "loss": 2.3547, + "step": 2399 + }, + { + "epoch": 0.12875536480686695, + "grad_norm": 1.0449117422103882, + "learning_rate": 4.8741268637443965e-06, + "loss": 2.2154, + "step": 2400 + }, + { + "epoch": 0.12880901287553648, + "grad_norm": 1.172614574432373, + "learning_rate": 4.873990725315054e-06, + "loss": 2.2757, + "step": 2401 + }, + { + "epoch": 0.128862660944206, + "grad_norm": 1.2390937805175781, + "learning_rate": 4.873854515208367e-06, + "loss": 2.2868, + "step": 2402 + }, + { + "epoch": 0.12891630901287554, + "grad_norm": 1.4107798337936401, + "learning_rate": 4.873718233428448e-06, + "loss": 2.3966, + "step": 2403 + }, + { + "epoch": 0.12896995708154507, + "grad_norm": 1.0762255191802979, + "learning_rate": 4.87358187997941e-06, + "loss": 2.3602, + "step": 2404 + }, + { + "epoch": 0.1290236051502146, + "grad_norm": 1.3310017585754395, + "learning_rate": 4.873445454865373e-06, + "loss": 2.291, + "step": 2405 + }, + { + "epoch": 0.1290772532188841, + "grad_norm": 1.1308265924453735, + "learning_rate": 4.873308958090453e-06, + "loss": 2.2995, + "step": 2406 + }, + { + "epoch": 0.12913090128755364, + "grad_norm": 1.0790859460830688, + "learning_rate": 4.873172389658772e-06, + "loss": 2.2592, + "step": 2407 + }, + { + "epoch": 0.12918454935622317, + "grad_norm": 1.11284339427948, + "learning_rate": 4.873035749574455e-06, + "loss": 2.1322, + "step": 2408 + }, + { + "epoch": 0.1292381974248927, + "grad_norm": 1.120193362236023, + "learning_rate": 4.8728990378416265e-06, + "loss": 2.359, + "step": 2409 + }, + { + "epoch": 0.12929184549356224, + "grad_norm": 1.189876914024353, + "learning_rate": 4.872762254464414e-06, + "loss": 2.1342, + "step": 2410 + }, + { + "epoch": 0.12934549356223177, + "grad_norm": 0.8777509927749634, + "learning_rate": 4.872625399446947e-06, + "loss": 2.1866, + "step": 2411 + }, + { + "epoch": 0.1293991416309013, + "grad_norm": 1.6556050777435303, + "learning_rate": 4.872488472793358e-06, + "loss": 1.7068, + "step": 2412 + }, + { + "epoch": 0.1294527896995708, + "grad_norm": 1.1721910238265991, + "learning_rate": 4.872351474507782e-06, + "loss": 2.3557, + "step": 2413 + }, + { + "epoch": 0.12950643776824033, + "grad_norm": 1.1171882152557373, + "learning_rate": 4.8722144045943536e-06, + "loss": 2.4369, + "step": 2414 + }, + { + "epoch": 0.12956008583690987, + "grad_norm": 0.9666391015052795, + "learning_rate": 4.8720772630572125e-06, + "loss": 2.0302, + "step": 2415 + }, + { + "epoch": 0.1296137339055794, + "grad_norm": 1.036158800125122, + "learning_rate": 4.8719400499005e-06, + "loss": 2.2398, + "step": 2416 + }, + { + "epoch": 0.12966738197424893, + "grad_norm": 0.9993540048599243, + "learning_rate": 4.871802765128357e-06, + "loss": 2.0954, + "step": 2417 + }, + { + "epoch": 0.12972103004291846, + "grad_norm": 1.0844297409057617, + "learning_rate": 4.87166540874493e-06, + "loss": 2.3367, + "step": 2418 + }, + { + "epoch": 0.129774678111588, + "grad_norm": 1.746978998184204, + "learning_rate": 4.871527980754366e-06, + "loss": 2.054, + "step": 2419 + }, + { + "epoch": 0.1298283261802575, + "grad_norm": 1.1036478281021118, + "learning_rate": 4.8713904811608144e-06, + "loss": 2.205, + "step": 2420 + }, + { + "epoch": 0.12988197424892703, + "grad_norm": 1.1024922132492065, + "learning_rate": 4.871252909968426e-06, + "loss": 2.3058, + "step": 2421 + }, + { + "epoch": 0.12993562231759656, + "grad_norm": 0.8724871277809143, + "learning_rate": 4.871115267181354e-06, + "loss": 2.0378, + "step": 2422 + }, + { + "epoch": 0.1299892703862661, + "grad_norm": 1.0958431959152222, + "learning_rate": 4.870977552803757e-06, + "loss": 2.3123, + "step": 2423 + }, + { + "epoch": 0.13004291845493562, + "grad_norm": 2.1479625701904297, + "learning_rate": 4.87083976683979e-06, + "loss": 1.6012, + "step": 2424 + }, + { + "epoch": 0.13009656652360516, + "grad_norm": 1.3289028406143188, + "learning_rate": 4.870701909293614e-06, + "loss": 2.5407, + "step": 2425 + }, + { + "epoch": 0.1301502145922747, + "grad_norm": 1.3962678909301758, + "learning_rate": 4.870563980169392e-06, + "loss": 2.2489, + "step": 2426 + }, + { + "epoch": 0.13020386266094422, + "grad_norm": 1.4806437492370605, + "learning_rate": 4.870425979471287e-06, + "loss": 2.5066, + "step": 2427 + }, + { + "epoch": 0.13025751072961372, + "grad_norm": 1.2365700006484985, + "learning_rate": 4.8702879072034675e-06, + "loss": 2.1762, + "step": 2428 + }, + { + "epoch": 0.13031115879828326, + "grad_norm": 1.4384307861328125, + "learning_rate": 4.870149763370101e-06, + "loss": 1.8681, + "step": 2429 + }, + { + "epoch": 0.1303648068669528, + "grad_norm": 1.5055028200149536, + "learning_rate": 4.87001154797536e-06, + "loss": 2.2961, + "step": 2430 + }, + { + "epoch": 0.13041845493562232, + "grad_norm": 4.938503742218018, + "learning_rate": 4.869873261023415e-06, + "loss": 2.2399, + "step": 2431 + }, + { + "epoch": 0.13047210300429185, + "grad_norm": 1.151028037071228, + "learning_rate": 4.8697349025184425e-06, + "loss": 2.2664, + "step": 2432 + }, + { + "epoch": 0.13052575107296138, + "grad_norm": 1.0099657773971558, + "learning_rate": 4.869596472464621e-06, + "loss": 2.2019, + "step": 2433 + }, + { + "epoch": 0.13057939914163091, + "grad_norm": 1.1457583904266357, + "learning_rate": 4.869457970866129e-06, + "loss": 2.4809, + "step": 2434 + }, + { + "epoch": 0.13063304721030042, + "grad_norm": 1.1080389022827148, + "learning_rate": 4.869319397727148e-06, + "loss": 2.3689, + "step": 2435 + }, + { + "epoch": 0.13068669527896995, + "grad_norm": 1.1928656101226807, + "learning_rate": 4.869180753051863e-06, + "loss": 2.1303, + "step": 2436 + }, + { + "epoch": 0.13074034334763948, + "grad_norm": 1.382012128829956, + "learning_rate": 4.869042036844459e-06, + "loss": 2.5368, + "step": 2437 + }, + { + "epoch": 0.130793991416309, + "grad_norm": 1.0681489706039429, + "learning_rate": 4.8689032491091244e-06, + "loss": 2.1482, + "step": 2438 + }, + { + "epoch": 0.13084763948497855, + "grad_norm": 1.0849847793579102, + "learning_rate": 4.86876438985005e-06, + "loss": 2.4194, + "step": 2439 + }, + { + "epoch": 0.13090128755364808, + "grad_norm": 1.0854945182800293, + "learning_rate": 4.868625459071429e-06, + "loss": 2.3238, + "step": 2440 + }, + { + "epoch": 0.1309549356223176, + "grad_norm": 1.238879680633545, + "learning_rate": 4.868486456777455e-06, + "loss": 2.1457, + "step": 2441 + }, + { + "epoch": 0.1310085836909871, + "grad_norm": 1.228428602218628, + "learning_rate": 4.868347382972324e-06, + "loss": 2.3667, + "step": 2442 + }, + { + "epoch": 0.13106223175965664, + "grad_norm": 1.0824220180511475, + "learning_rate": 4.868208237660237e-06, + "loss": 2.1545, + "step": 2443 + }, + { + "epoch": 0.13111587982832618, + "grad_norm": 1.081310510635376, + "learning_rate": 4.868069020845394e-06, + "loss": 2.4431, + "step": 2444 + }, + { + "epoch": 0.1311695278969957, + "grad_norm": 1.4071342945098877, + "learning_rate": 4.867929732532e-06, + "loss": 2.243, + "step": 2445 + }, + { + "epoch": 0.13122317596566524, + "grad_norm": 1.2823927402496338, + "learning_rate": 4.867790372724257e-06, + "loss": 2.4881, + "step": 2446 + }, + { + "epoch": 0.13127682403433477, + "grad_norm": 1.1484392881393433, + "learning_rate": 4.867650941426376e-06, + "loss": 2.427, + "step": 2447 + }, + { + "epoch": 0.1313304721030043, + "grad_norm": 1.1392104625701904, + "learning_rate": 4.867511438642566e-06, + "loss": 2.3135, + "step": 2448 + }, + { + "epoch": 0.1313841201716738, + "grad_norm": 1.181867003440857, + "learning_rate": 4.867371864377039e-06, + "loss": 2.2542, + "step": 2449 + }, + { + "epoch": 0.13143776824034334, + "grad_norm": 1.058600902557373, + "learning_rate": 4.867232218634007e-06, + "loss": 1.5266, + "step": 2450 + }, + { + "epoch": 0.13149141630901287, + "grad_norm": 1.1246685981750488, + "learning_rate": 4.86709250141769e-06, + "loss": 2.3004, + "step": 2451 + }, + { + "epoch": 0.1315450643776824, + "grad_norm": 1.1211568117141724, + "learning_rate": 4.866952712732303e-06, + "loss": 2.3389, + "step": 2452 + }, + { + "epoch": 0.13159871244635193, + "grad_norm": 0.8965975046157837, + "learning_rate": 4.866812852582069e-06, + "loss": 2.1468, + "step": 2453 + }, + { + "epoch": 0.13165236051502147, + "grad_norm": 1.2314492464065552, + "learning_rate": 4.866672920971209e-06, + "loss": 2.44, + "step": 2454 + }, + { + "epoch": 0.131706008583691, + "grad_norm": 1.1346197128295898, + "learning_rate": 4.86653291790395e-06, + "loss": 2.2447, + "step": 2455 + }, + { + "epoch": 0.1317596566523605, + "grad_norm": 1.2393659353256226, + "learning_rate": 4.866392843384517e-06, + "loss": 2.3841, + "step": 2456 + }, + { + "epoch": 0.13181330472103003, + "grad_norm": 1.2449826002120972, + "learning_rate": 4.86625269741714e-06, + "loss": 2.3687, + "step": 2457 + }, + { + "epoch": 0.13186695278969957, + "grad_norm": 1.5085512399673462, + "learning_rate": 4.8661124800060515e-06, + "loss": 2.2284, + "step": 2458 + }, + { + "epoch": 0.1319206008583691, + "grad_norm": 0.972308337688446, + "learning_rate": 4.865972191155483e-06, + "loss": 1.9395, + "step": 2459 + }, + { + "epoch": 0.13197424892703863, + "grad_norm": 2.0145492553710938, + "learning_rate": 4.865831830869671e-06, + "loss": 2.3798, + "step": 2460 + }, + { + "epoch": 0.13202789699570816, + "grad_norm": 1.1628353595733643, + "learning_rate": 4.865691399152855e-06, + "loss": 2.4381, + "step": 2461 + }, + { + "epoch": 0.1320815450643777, + "grad_norm": 1.1165425777435303, + "learning_rate": 4.8655508960092735e-06, + "loss": 2.1846, + "step": 2462 + }, + { + "epoch": 0.13213519313304722, + "grad_norm": 1.2204065322875977, + "learning_rate": 4.865410321443168e-06, + "loss": 2.2777, + "step": 2463 + }, + { + "epoch": 0.13218884120171673, + "grad_norm": 1.274996280670166, + "learning_rate": 4.865269675458784e-06, + "loss": 2.36, + "step": 2464 + }, + { + "epoch": 0.13224248927038626, + "grad_norm": 4.397604465484619, + "learning_rate": 4.865128958060368e-06, + "loss": 2.33, + "step": 2465 + }, + { + "epoch": 0.1322961373390558, + "grad_norm": 1.5007342100143433, + "learning_rate": 4.864988169252168e-06, + "loss": 2.2989, + "step": 2466 + }, + { + "epoch": 0.13234978540772532, + "grad_norm": 1.2448261976242065, + "learning_rate": 4.864847309038435e-06, + "loss": 2.225, + "step": 2467 + }, + { + "epoch": 0.13240343347639486, + "grad_norm": 1.3271067142486572, + "learning_rate": 4.864706377423422e-06, + "loss": 2.5981, + "step": 2468 + }, + { + "epoch": 0.1324570815450644, + "grad_norm": 1.155749797821045, + "learning_rate": 4.864565374411384e-06, + "loss": 2.3886, + "step": 2469 + }, + { + "epoch": 0.13251072961373392, + "grad_norm": 1.0514700412750244, + "learning_rate": 4.86442430000658e-06, + "loss": 2.2313, + "step": 2470 + }, + { + "epoch": 0.13256437768240342, + "grad_norm": 1.0524930953979492, + "learning_rate": 4.864283154213267e-06, + "loss": 2.1224, + "step": 2471 + }, + { + "epoch": 0.13261802575107295, + "grad_norm": 1.065729022026062, + "learning_rate": 4.864141937035707e-06, + "loss": 2.0881, + "step": 2472 + }, + { + "epoch": 0.13267167381974249, + "grad_norm": 1.6914567947387695, + "learning_rate": 4.864000648478164e-06, + "loss": 2.2843, + "step": 2473 + }, + { + "epoch": 0.13272532188841202, + "grad_norm": 1.1357675790786743, + "learning_rate": 4.863859288544905e-06, + "loss": 2.2392, + "step": 2474 + }, + { + "epoch": 0.13277896995708155, + "grad_norm": 0.8924603462219238, + "learning_rate": 4.863717857240196e-06, + "loss": 2.0255, + "step": 2475 + }, + { + "epoch": 0.13283261802575108, + "grad_norm": 1.3204772472381592, + "learning_rate": 4.863576354568309e-06, + "loss": 2.4899, + "step": 2476 + }, + { + "epoch": 0.1328862660944206, + "grad_norm": 1.1214940547943115, + "learning_rate": 4.863434780533516e-06, + "loss": 2.3656, + "step": 2477 + }, + { + "epoch": 0.13293991416309012, + "grad_norm": 1.12124502658844, + "learning_rate": 4.863293135140091e-06, + "loss": 2.3022, + "step": 2478 + }, + { + "epoch": 0.13299356223175965, + "grad_norm": 1.40201735496521, + "learning_rate": 4.86315141839231e-06, + "loss": 2.3249, + "step": 2479 + }, + { + "epoch": 0.13304721030042918, + "grad_norm": 1.4091248512268066, + "learning_rate": 4.863009630294454e-06, + "loss": 1.5086, + "step": 2480 + }, + { + "epoch": 0.1331008583690987, + "grad_norm": 1.0625252723693848, + "learning_rate": 4.8628677708508025e-06, + "loss": 2.3041, + "step": 2481 + }, + { + "epoch": 0.13315450643776824, + "grad_norm": 1.1291453838348389, + "learning_rate": 4.862725840065639e-06, + "loss": 2.2612, + "step": 2482 + }, + { + "epoch": 0.13320815450643778, + "grad_norm": 1.226914405822754, + "learning_rate": 4.862583837943247e-06, + "loss": 2.3971, + "step": 2483 + }, + { + "epoch": 0.1332618025751073, + "grad_norm": 1.140817403793335, + "learning_rate": 4.862441764487917e-06, + "loss": 2.3165, + "step": 2484 + }, + { + "epoch": 0.1333154506437768, + "grad_norm": 2.9969699382781982, + "learning_rate": 4.8622996197039365e-06, + "loss": 2.3322, + "step": 2485 + }, + { + "epoch": 0.13336909871244634, + "grad_norm": 0.9679015278816223, + "learning_rate": 4.862157403595598e-06, + "loss": 2.0663, + "step": 2486 + }, + { + "epoch": 0.13342274678111588, + "grad_norm": 1.1280068159103394, + "learning_rate": 4.862015116167195e-06, + "loss": 2.1408, + "step": 2487 + }, + { + "epoch": 0.1334763948497854, + "grad_norm": 1.0866833925247192, + "learning_rate": 4.8618727574230245e-06, + "loss": 2.0908, + "step": 2488 + }, + { + "epoch": 0.13353004291845494, + "grad_norm": 1.1361885070800781, + "learning_rate": 4.8617303273673836e-06, + "loss": 2.3246, + "step": 2489 + }, + { + "epoch": 0.13358369098712447, + "grad_norm": 0.907956063747406, + "learning_rate": 4.861587826004574e-06, + "loss": 1.9674, + "step": 2490 + }, + { + "epoch": 0.133637339055794, + "grad_norm": 1.3571093082427979, + "learning_rate": 4.861445253338896e-06, + "loss": 2.1823, + "step": 2491 + }, + { + "epoch": 0.1336909871244635, + "grad_norm": 1.0538620948791504, + "learning_rate": 4.861302609374655e-06, + "loss": 2.2006, + "step": 2492 + }, + { + "epoch": 0.13374463519313304, + "grad_norm": 1.1479865312576294, + "learning_rate": 4.86115989411616e-06, + "loss": 1.7962, + "step": 2493 + }, + { + "epoch": 0.13379828326180257, + "grad_norm": 0.9883248805999756, + "learning_rate": 4.861017107567718e-06, + "loss": 2.1793, + "step": 2494 + }, + { + "epoch": 0.1338519313304721, + "grad_norm": 1.5355045795440674, + "learning_rate": 4.860874249733639e-06, + "loss": 2.3631, + "step": 2495 + }, + { + "epoch": 0.13390557939914163, + "grad_norm": 1.2603999376296997, + "learning_rate": 4.860731320618239e-06, + "loss": 2.5657, + "step": 2496 + }, + { + "epoch": 0.13395922746781116, + "grad_norm": 1.3818358182907104, + "learning_rate": 4.860588320225832e-06, + "loss": 2.4626, + "step": 2497 + }, + { + "epoch": 0.1340128755364807, + "grad_norm": 1.1364777088165283, + "learning_rate": 4.8604452485607366e-06, + "loss": 2.4896, + "step": 2498 + }, + { + "epoch": 0.1340665236051502, + "grad_norm": 1.0966848134994507, + "learning_rate": 4.8603021056272704e-06, + "loss": 2.1367, + "step": 2499 + }, + { + "epoch": 0.13412017167381973, + "grad_norm": 0.9862625002861023, + "learning_rate": 4.860158891429757e-06, + "loss": 2.1855, + "step": 2500 + }, + { + "epoch": 0.13417381974248926, + "grad_norm": 1.0815471410751343, + "learning_rate": 4.86001560597252e-06, + "loss": 2.2777, + "step": 2501 + }, + { + "epoch": 0.1342274678111588, + "grad_norm": 0.9311325550079346, + "learning_rate": 4.859872249259885e-06, + "loss": 1.8387, + "step": 2502 + }, + { + "epoch": 0.13428111587982833, + "grad_norm": 1.5817748308181763, + "learning_rate": 4.859728821296182e-06, + "loss": 2.3448, + "step": 2503 + }, + { + "epoch": 0.13433476394849786, + "grad_norm": 1.1083400249481201, + "learning_rate": 4.859585322085739e-06, + "loss": 2.2072, + "step": 2504 + }, + { + "epoch": 0.1343884120171674, + "grad_norm": 1.167738437652588, + "learning_rate": 4.85944175163289e-06, + "loss": 2.0642, + "step": 2505 + }, + { + "epoch": 0.13444206008583692, + "grad_norm": 1.3435335159301758, + "learning_rate": 4.859298109941971e-06, + "loss": 2.6457, + "step": 2506 + }, + { + "epoch": 0.13449570815450643, + "grad_norm": 1.1121413707733154, + "learning_rate": 4.859154397017317e-06, + "loss": 2.447, + "step": 2507 + }, + { + "epoch": 0.13454935622317596, + "grad_norm": 1.3497041463851929, + "learning_rate": 4.859010612863268e-06, + "loss": 2.3638, + "step": 2508 + }, + { + "epoch": 0.1346030042918455, + "grad_norm": 1.252178430557251, + "learning_rate": 4.8588667574841655e-06, + "loss": 2.5052, + "step": 2509 + }, + { + "epoch": 0.13465665236051502, + "grad_norm": 1.6020605564117432, + "learning_rate": 4.858722830884351e-06, + "loss": 1.4687, + "step": 2510 + }, + { + "epoch": 0.13471030042918455, + "grad_norm": 1.2267513275146484, + "learning_rate": 4.858578833068174e-06, + "loss": 2.0935, + "step": 2511 + }, + { + "epoch": 0.13476394849785409, + "grad_norm": 1.0221701860427856, + "learning_rate": 4.858434764039978e-06, + "loss": 1.9946, + "step": 2512 + }, + { + "epoch": 0.13481759656652362, + "grad_norm": 1.3191696405410767, + "learning_rate": 4.858290623804115e-06, + "loss": 2.5346, + "step": 2513 + }, + { + "epoch": 0.13487124463519312, + "grad_norm": 3.7783210277557373, + "learning_rate": 4.858146412364936e-06, + "loss": 2.3083, + "step": 2514 + }, + { + "epoch": 0.13492489270386265, + "grad_norm": 1.6394140720367432, + "learning_rate": 4.858002129726795e-06, + "loss": 2.4281, + "step": 2515 + }, + { + "epoch": 0.13497854077253219, + "grad_norm": 1.2089977264404297, + "learning_rate": 4.85785777589405e-06, + "loss": 2.1703, + "step": 2516 + }, + { + "epoch": 0.13503218884120172, + "grad_norm": 1.1010644435882568, + "learning_rate": 4.8577133508710595e-06, + "loss": 2.1802, + "step": 2517 + }, + { + "epoch": 0.13508583690987125, + "grad_norm": 1.081972360610962, + "learning_rate": 4.857568854662181e-06, + "loss": 2.1328, + "step": 2518 + }, + { + "epoch": 0.13513948497854078, + "grad_norm": 1.1794577836990356, + "learning_rate": 4.857424287271781e-06, + "loss": 2.4268, + "step": 2519 + }, + { + "epoch": 0.1351931330472103, + "grad_norm": 1.1605967283248901, + "learning_rate": 4.8572796487042214e-06, + "loss": 2.258, + "step": 2520 + }, + { + "epoch": 0.13524678111587982, + "grad_norm": 0.9454315900802612, + "learning_rate": 4.857134938963871e-06, + "loss": 2.3309, + "step": 2521 + }, + { + "epoch": 0.13530042918454935, + "grad_norm": 1.4776242971420288, + "learning_rate": 4.856990158055098e-06, + "loss": 2.3293, + "step": 2522 + }, + { + "epoch": 0.13535407725321888, + "grad_norm": 1.7410770654678345, + "learning_rate": 4.856845305982275e-06, + "loss": 2.6291, + "step": 2523 + }, + { + "epoch": 0.1354077253218884, + "grad_norm": 1.1397978067398071, + "learning_rate": 4.856700382749774e-06, + "loss": 2.261, + "step": 2524 + }, + { + "epoch": 0.13546137339055794, + "grad_norm": 5.739822864532471, + "learning_rate": 4.856555388361971e-06, + "loss": 2.3138, + "step": 2525 + }, + { + "epoch": 0.13551502145922747, + "grad_norm": 1.137895107269287, + "learning_rate": 4.856410322823245e-06, + "loss": 2.4795, + "step": 2526 + }, + { + "epoch": 0.135568669527897, + "grad_norm": 1.1866455078125, + "learning_rate": 4.856265186137974e-06, + "loss": 2.2584, + "step": 2527 + }, + { + "epoch": 0.1356223175965665, + "grad_norm": 1.3064160346984863, + "learning_rate": 4.856119978310542e-06, + "loss": 2.4068, + "step": 2528 + }, + { + "epoch": 0.13567596566523604, + "grad_norm": 1.1309518814086914, + "learning_rate": 4.8559746993453315e-06, + "loss": 2.4698, + "step": 2529 + }, + { + "epoch": 0.13572961373390557, + "grad_norm": 1.1686393022537231, + "learning_rate": 4.85582934924673e-06, + "loss": 2.3136, + "step": 2530 + }, + { + "epoch": 0.1357832618025751, + "grad_norm": 1.2027391195297241, + "learning_rate": 4.855683928019126e-06, + "loss": 2.2064, + "step": 2531 + }, + { + "epoch": 0.13583690987124464, + "grad_norm": 1.2543373107910156, + "learning_rate": 4.85553843566691e-06, + "loss": 2.3928, + "step": 2532 + }, + { + "epoch": 0.13589055793991417, + "grad_norm": 1.2000174522399902, + "learning_rate": 4.855392872194474e-06, + "loss": 2.3225, + "step": 2533 + }, + { + "epoch": 0.1359442060085837, + "grad_norm": 1.0030899047851562, + "learning_rate": 4.8552472376062145e-06, + "loss": 2.0126, + "step": 2534 + }, + { + "epoch": 0.1359978540772532, + "grad_norm": 1.1059125661849976, + "learning_rate": 4.855101531906528e-06, + "loss": 2.3652, + "step": 2535 + }, + { + "epoch": 0.13605150214592274, + "grad_norm": 1.0687627792358398, + "learning_rate": 4.8549557550998125e-06, + "loss": 2.3538, + "step": 2536 + }, + { + "epoch": 0.13610515021459227, + "grad_norm": 1.2166259288787842, + "learning_rate": 4.854809907190471e-06, + "loss": 2.1124, + "step": 2537 + }, + { + "epoch": 0.1361587982832618, + "grad_norm": 1.20870840549469, + "learning_rate": 4.854663988182906e-06, + "loss": 2.3066, + "step": 2538 + }, + { + "epoch": 0.13621244635193133, + "grad_norm": 1.304471492767334, + "learning_rate": 4.8545179980815245e-06, + "loss": 2.1111, + "step": 2539 + }, + { + "epoch": 0.13626609442060086, + "grad_norm": 1.3361451625823975, + "learning_rate": 4.8543719368907325e-06, + "loss": 2.3056, + "step": 2540 + }, + { + "epoch": 0.1363197424892704, + "grad_norm": 1.1520822048187256, + "learning_rate": 4.854225804614943e-06, + "loss": 2.3928, + "step": 2541 + }, + { + "epoch": 0.13637339055793993, + "grad_norm": 1.1784589290618896, + "learning_rate": 4.854079601258564e-06, + "loss": 2.3025, + "step": 2542 + }, + { + "epoch": 0.13642703862660943, + "grad_norm": 1.1539127826690674, + "learning_rate": 4.853933326826012e-06, + "loss": 2.3394, + "step": 2543 + }, + { + "epoch": 0.13648068669527896, + "grad_norm": 1.067237377166748, + "learning_rate": 4.853786981321705e-06, + "loss": 2.2082, + "step": 2544 + }, + { + "epoch": 0.1365343347639485, + "grad_norm": 1.1302145719528198, + "learning_rate": 4.853640564750059e-06, + "loss": 2.2704, + "step": 2545 + }, + { + "epoch": 0.13658798283261803, + "grad_norm": 1.4070969820022583, + "learning_rate": 4.853494077115496e-06, + "loss": 2.4938, + "step": 2546 + }, + { + "epoch": 0.13664163090128756, + "grad_norm": 1.1975455284118652, + "learning_rate": 4.853347518422439e-06, + "loss": 2.2266, + "step": 2547 + }, + { + "epoch": 0.1366952789699571, + "grad_norm": 1.2218058109283447, + "learning_rate": 4.853200888675312e-06, + "loss": 2.5746, + "step": 2548 + }, + { + "epoch": 0.13674892703862662, + "grad_norm": 1.2944599390029907, + "learning_rate": 4.853054187878542e-06, + "loss": 2.1689, + "step": 2549 + }, + { + "epoch": 0.13680257510729613, + "grad_norm": 1.2318395376205444, + "learning_rate": 4.852907416036559e-06, + "loss": 2.3028, + "step": 2550 + }, + { + "epoch": 0.13685622317596566, + "grad_norm": 1.3986480236053467, + "learning_rate": 4.852760573153795e-06, + "loss": 2.0448, + "step": 2551 + }, + { + "epoch": 0.1369098712446352, + "grad_norm": 1.106857180595398, + "learning_rate": 4.852613659234684e-06, + "loss": 2.3573, + "step": 2552 + }, + { + "epoch": 0.13696351931330472, + "grad_norm": 1.1128196716308594, + "learning_rate": 4.852466674283659e-06, + "loss": 2.4467, + "step": 2553 + }, + { + "epoch": 0.13701716738197425, + "grad_norm": 1.1432217359542847, + "learning_rate": 4.8523196183051605e-06, + "loss": 2.144, + "step": 2554 + }, + { + "epoch": 0.13707081545064378, + "grad_norm": 1.1968839168548584, + "learning_rate": 4.8521724913036284e-06, + "loss": 2.3312, + "step": 2555 + }, + { + "epoch": 0.13712446351931332, + "grad_norm": 1.1269241571426392, + "learning_rate": 4.852025293283503e-06, + "loss": 2.3689, + "step": 2556 + }, + { + "epoch": 0.13717811158798282, + "grad_norm": 1.1425331830978394, + "learning_rate": 4.85187802424923e-06, + "loss": 2.3685, + "step": 2557 + }, + { + "epoch": 0.13723175965665235, + "grad_norm": 1.2258234024047852, + "learning_rate": 4.851730684205256e-06, + "loss": 2.3266, + "step": 2558 + }, + { + "epoch": 0.13728540772532188, + "grad_norm": 1.7194783687591553, + "learning_rate": 4.8515832731560294e-06, + "loss": 2.3063, + "step": 2559 + }, + { + "epoch": 0.13733905579399142, + "grad_norm": 1.1475189924240112, + "learning_rate": 4.851435791106001e-06, + "loss": 2.3058, + "step": 2560 + }, + { + "epoch": 0.13739270386266095, + "grad_norm": 1.0955013036727905, + "learning_rate": 4.851288238059623e-06, + "loss": 2.3928, + "step": 2561 + }, + { + "epoch": 0.13744635193133048, + "grad_norm": 2.7226550579071045, + "learning_rate": 4.851140614021351e-06, + "loss": 2.2506, + "step": 2562 + }, + { + "epoch": 0.1375, + "grad_norm": 1.1289459466934204, + "learning_rate": 4.850992918995642e-06, + "loss": 2.2048, + "step": 2563 + }, + { + "epoch": 0.13755364806866952, + "grad_norm": 1.1512473821640015, + "learning_rate": 4.850845152986956e-06, + "loss": 2.2902, + "step": 2564 + }, + { + "epoch": 0.13760729613733905, + "grad_norm": 1.2312180995941162, + "learning_rate": 4.850697315999753e-06, + "loss": 2.2745, + "step": 2565 + }, + { + "epoch": 0.13766094420600858, + "grad_norm": 0.935968279838562, + "learning_rate": 4.850549408038498e-06, + "loss": 2.2815, + "step": 2566 + }, + { + "epoch": 0.1377145922746781, + "grad_norm": 1.1151764392852783, + "learning_rate": 4.850401429107656e-06, + "loss": 2.316, + "step": 2567 + }, + { + "epoch": 0.13776824034334764, + "grad_norm": 0.9631773233413696, + "learning_rate": 4.850253379211696e-06, + "loss": 2.2872, + "step": 2568 + }, + { + "epoch": 0.13782188841201717, + "grad_norm": 1.9466556310653687, + "learning_rate": 4.850105258355085e-06, + "loss": 2.3534, + "step": 2569 + }, + { + "epoch": 0.1378755364806867, + "grad_norm": 1.1101152896881104, + "learning_rate": 4.8499570665423e-06, + "loss": 2.2318, + "step": 2570 + }, + { + "epoch": 0.1379291845493562, + "grad_norm": 1.044542670249939, + "learning_rate": 4.849808803777811e-06, + "loss": 2.1414, + "step": 2571 + }, + { + "epoch": 0.13798283261802574, + "grad_norm": 1.2001663446426392, + "learning_rate": 4.849660470066097e-06, + "loss": 2.2717, + "step": 2572 + }, + { + "epoch": 0.13803648068669527, + "grad_norm": 1.1531293392181396, + "learning_rate": 4.849512065411636e-06, + "loss": 2.2932, + "step": 2573 + }, + { + "epoch": 0.1380901287553648, + "grad_norm": 1.1364675760269165, + "learning_rate": 4.849363589818908e-06, + "loss": 2.0914, + "step": 2574 + }, + { + "epoch": 0.13814377682403434, + "grad_norm": 1.1344733238220215, + "learning_rate": 4.8492150432923955e-06, + "loss": 2.1728, + "step": 2575 + }, + { + "epoch": 0.13819742489270387, + "grad_norm": 1.2180049419403076, + "learning_rate": 4.849066425836585e-06, + "loss": 2.3907, + "step": 2576 + }, + { + "epoch": 0.1382510729613734, + "grad_norm": 1.3955883979797363, + "learning_rate": 4.8489177374559635e-06, + "loss": 2.0869, + "step": 2577 + }, + { + "epoch": 0.13830472103004293, + "grad_norm": 1.2396379709243774, + "learning_rate": 4.84876897815502e-06, + "loss": 2.2996, + "step": 2578 + }, + { + "epoch": 0.13835836909871244, + "grad_norm": 0.9525055289268494, + "learning_rate": 4.848620147938245e-06, + "loss": 1.9263, + "step": 2579 + }, + { + "epoch": 0.13841201716738197, + "grad_norm": 6.341739654541016, + "learning_rate": 4.848471246810134e-06, + "loss": 2.3896, + "step": 2580 + }, + { + "epoch": 0.1384656652360515, + "grad_norm": 1.4433153867721558, + "learning_rate": 4.84832227477518e-06, + "loss": 2.1809, + "step": 2581 + }, + { + "epoch": 0.13851931330472103, + "grad_norm": 1.1104830503463745, + "learning_rate": 4.848173231837884e-06, + "loss": 2.2601, + "step": 2582 + }, + { + "epoch": 0.13857296137339056, + "grad_norm": 1.1456726789474487, + "learning_rate": 4.848024118002744e-06, + "loss": 2.2313, + "step": 2583 + }, + { + "epoch": 0.1386266094420601, + "grad_norm": 1.78724205493927, + "learning_rate": 4.8478749332742636e-06, + "loss": 2.1399, + "step": 2584 + }, + { + "epoch": 0.13868025751072963, + "grad_norm": 1.2041760683059692, + "learning_rate": 4.847725677656944e-06, + "loss": 2.3602, + "step": 2585 + }, + { + "epoch": 0.13873390557939913, + "grad_norm": 1.205826759338379, + "learning_rate": 4.847576351155296e-06, + "loss": 2.4175, + "step": 2586 + }, + { + "epoch": 0.13878755364806866, + "grad_norm": 1.2007580995559692, + "learning_rate": 4.847426953773826e-06, + "loss": 2.471, + "step": 2587 + }, + { + "epoch": 0.1388412017167382, + "grad_norm": 1.0631814002990723, + "learning_rate": 4.847277485517045e-06, + "loss": 2.4685, + "step": 2588 + }, + { + "epoch": 0.13889484978540773, + "grad_norm": 1.238420009613037, + "learning_rate": 4.847127946389464e-06, + "loss": 2.2702, + "step": 2589 + }, + { + "epoch": 0.13894849785407726, + "grad_norm": 1.376084327697754, + "learning_rate": 4.8469783363956005e-06, + "loss": 2.4519, + "step": 2590 + }, + { + "epoch": 0.1390021459227468, + "grad_norm": 1.245323896408081, + "learning_rate": 4.846828655539971e-06, + "loss": 2.6047, + "step": 2591 + }, + { + "epoch": 0.13905579399141632, + "grad_norm": 1.3367661237716675, + "learning_rate": 4.8466789038270946e-06, + "loss": 2.3491, + "step": 2592 + }, + { + "epoch": 0.13910944206008583, + "grad_norm": 1.2830106019973755, + "learning_rate": 4.846529081261493e-06, + "loss": 2.4756, + "step": 2593 + }, + { + "epoch": 0.13916309012875536, + "grad_norm": 1.070207118988037, + "learning_rate": 4.8463791878476884e-06, + "loss": 2.2129, + "step": 2594 + }, + { + "epoch": 0.1392167381974249, + "grad_norm": 1.1964598894119263, + "learning_rate": 4.8462292235902076e-06, + "loss": 2.2409, + "step": 2595 + }, + { + "epoch": 0.13927038626609442, + "grad_norm": 1.0617563724517822, + "learning_rate": 4.846079188493578e-06, + "loss": 2.5477, + "step": 2596 + }, + { + "epoch": 0.13932403433476395, + "grad_norm": 1.1844786405563354, + "learning_rate": 4.8459290825623315e-06, + "loss": 2.1219, + "step": 2597 + }, + { + "epoch": 0.13937768240343348, + "grad_norm": 1.202764630317688, + "learning_rate": 4.845778905800998e-06, + "loss": 2.3712, + "step": 2598 + }, + { + "epoch": 0.13943133047210302, + "grad_norm": 1.2007936239242554, + "learning_rate": 4.845628658214112e-06, + "loss": 2.29, + "step": 2599 + }, + { + "epoch": 0.13948497854077252, + "grad_norm": 1.4583313465118408, + "learning_rate": 4.845478339806211e-06, + "loss": 2.1552, + "step": 2600 + }, + { + "epoch": 0.13953862660944205, + "grad_norm": 1.2071523666381836, + "learning_rate": 4.845327950581832e-06, + "loss": 2.3038, + "step": 2601 + }, + { + "epoch": 0.13959227467811158, + "grad_norm": 1.0191982984542847, + "learning_rate": 4.845177490545516e-06, + "loss": 1.5545, + "step": 2602 + }, + { + "epoch": 0.13964592274678111, + "grad_norm": 1.1065293550491333, + "learning_rate": 4.845026959701808e-06, + "loss": 2.1539, + "step": 2603 + }, + { + "epoch": 0.13969957081545065, + "grad_norm": 1.3406102657318115, + "learning_rate": 4.84487635805525e-06, + "loss": 2.4177, + "step": 2604 + }, + { + "epoch": 0.13975321888412018, + "grad_norm": 1.3943607807159424, + "learning_rate": 4.844725685610391e-06, + "loss": 2.2079, + "step": 2605 + }, + { + "epoch": 0.1398068669527897, + "grad_norm": 1.083919882774353, + "learning_rate": 4.844574942371779e-06, + "loss": 2.5449, + "step": 2606 + }, + { + "epoch": 0.13986051502145921, + "grad_norm": 1.276545763015747, + "learning_rate": 4.8444241283439665e-06, + "loss": 2.1997, + "step": 2607 + }, + { + "epoch": 0.13991416309012875, + "grad_norm": 1.0600335597991943, + "learning_rate": 4.8442732435315055e-06, + "loss": 2.2708, + "step": 2608 + }, + { + "epoch": 0.13996781115879828, + "grad_norm": 1.265777349472046, + "learning_rate": 4.844122287938953e-06, + "loss": 2.4247, + "step": 2609 + }, + { + "epoch": 0.1400214592274678, + "grad_norm": 1.1178034543991089, + "learning_rate": 4.843971261570866e-06, + "loss": 1.9211, + "step": 2610 + }, + { + "epoch": 0.14007510729613734, + "grad_norm": 1.1789339780807495, + "learning_rate": 4.843820164431805e-06, + "loss": 2.3098, + "step": 2611 + }, + { + "epoch": 0.14012875536480687, + "grad_norm": 1.1659685373306274, + "learning_rate": 4.8436689965263315e-06, + "loss": 2.4917, + "step": 2612 + }, + { + "epoch": 0.1401824034334764, + "grad_norm": 0.9890972971916199, + "learning_rate": 4.84351775785901e-06, + "loss": 2.192, + "step": 2613 + }, + { + "epoch": 0.14023605150214594, + "grad_norm": 1.6201070547103882, + "learning_rate": 4.843366448434407e-06, + "loss": 2.3915, + "step": 2614 + }, + { + "epoch": 0.14028969957081544, + "grad_norm": 4.082869052886963, + "learning_rate": 4.84321506825709e-06, + "loss": 2.3106, + "step": 2615 + }, + { + "epoch": 0.14034334763948497, + "grad_norm": 1.1288623809814453, + "learning_rate": 4.843063617331631e-06, + "loss": 2.3839, + "step": 2616 + }, + { + "epoch": 0.1403969957081545, + "grad_norm": 1.1509149074554443, + "learning_rate": 4.842912095662602e-06, + "loss": 2.6458, + "step": 2617 + }, + { + "epoch": 0.14045064377682404, + "grad_norm": 0.9668889045715332, + "learning_rate": 4.842760503254577e-06, + "loss": 2.0297, + "step": 2618 + }, + { + "epoch": 0.14050429184549357, + "grad_norm": 1.011601209640503, + "learning_rate": 4.842608840112134e-06, + "loss": 2.2746, + "step": 2619 + }, + { + "epoch": 0.1405579399141631, + "grad_norm": 1.1727242469787598, + "learning_rate": 4.8424571062398525e-06, + "loss": 2.355, + "step": 2620 + }, + { + "epoch": 0.14061158798283263, + "grad_norm": 1.046802282333374, + "learning_rate": 4.8423053016423126e-06, + "loss": 2.2547, + "step": 2621 + }, + { + "epoch": 0.14066523605150213, + "grad_norm": 1.787411093711853, + "learning_rate": 4.842153426324099e-06, + "loss": 2.3398, + "step": 2622 + }, + { + "epoch": 0.14071888412017167, + "grad_norm": 1.2201378345489502, + "learning_rate": 4.842001480289796e-06, + "loss": 2.4591, + "step": 2623 + }, + { + "epoch": 0.1407725321888412, + "grad_norm": 1.4829744100570679, + "learning_rate": 4.841849463543992e-06, + "loss": 2.3791, + "step": 2624 + }, + { + "epoch": 0.14082618025751073, + "grad_norm": 1.2268778085708618, + "learning_rate": 4.841697376091277e-06, + "loss": 2.4836, + "step": 2625 + }, + { + "epoch": 0.14087982832618026, + "grad_norm": 1.33174729347229, + "learning_rate": 4.841545217936242e-06, + "loss": 2.3294, + "step": 2626 + }, + { + "epoch": 0.1409334763948498, + "grad_norm": 1.284349799156189, + "learning_rate": 4.841392989083482e-06, + "loss": 1.2711, + "step": 2627 + }, + { + "epoch": 0.14098712446351933, + "grad_norm": 1.1619863510131836, + "learning_rate": 4.8412406895375915e-06, + "loss": 2.0605, + "step": 2628 + }, + { + "epoch": 0.14104077253218883, + "grad_norm": 1.1974586248397827, + "learning_rate": 4.841088319303172e-06, + "loss": 2.1339, + "step": 2629 + }, + { + "epoch": 0.14109442060085836, + "grad_norm": 0.9850566983222961, + "learning_rate": 4.840935878384821e-06, + "loss": 1.9712, + "step": 2630 + }, + { + "epoch": 0.1411480686695279, + "grad_norm": 1.0427888631820679, + "learning_rate": 4.840783366787143e-06, + "loss": 2.2419, + "step": 2631 + }, + { + "epoch": 0.14120171673819742, + "grad_norm": 1.1892296075820923, + "learning_rate": 4.8406307845147425e-06, + "loss": 2.2298, + "step": 2632 + }, + { + "epoch": 0.14125536480686696, + "grad_norm": 1.3453247547149658, + "learning_rate": 4.840478131572225e-06, + "loss": 2.3003, + "step": 2633 + }, + { + "epoch": 0.1413090128755365, + "grad_norm": 1.0352658033370972, + "learning_rate": 4.8403254079642e-06, + "loss": 2.2433, + "step": 2634 + }, + { + "epoch": 0.14136266094420602, + "grad_norm": 0.9283222556114197, + "learning_rate": 4.8401726136952806e-06, + "loss": 1.7908, + "step": 2635 + }, + { + "epoch": 0.14141630901287552, + "grad_norm": 1.2222533226013184, + "learning_rate": 4.840019748770077e-06, + "loss": 2.4824, + "step": 2636 + }, + { + "epoch": 0.14146995708154506, + "grad_norm": 1.1536953449249268, + "learning_rate": 4.839866813193207e-06, + "loss": 1.9876, + "step": 2637 + }, + { + "epoch": 0.1415236051502146, + "grad_norm": 2.367110252380371, + "learning_rate": 4.839713806969288e-06, + "loss": 2.6035, + "step": 2638 + }, + { + "epoch": 0.14157725321888412, + "grad_norm": 1.019415020942688, + "learning_rate": 4.839560730102938e-06, + "loss": 1.9779, + "step": 2639 + }, + { + "epoch": 0.14163090128755365, + "grad_norm": 1.4380708932876587, + "learning_rate": 4.83940758259878e-06, + "loss": 2.1803, + "step": 2640 + }, + { + "epoch": 0.14168454935622318, + "grad_norm": 1.1637959480285645, + "learning_rate": 4.839254364461438e-06, + "loss": 2.2428, + "step": 2641 + }, + { + "epoch": 0.14173819742489271, + "grad_norm": 0.9171174764633179, + "learning_rate": 4.8391010756955375e-06, + "loss": 2.2194, + "step": 2642 + }, + { + "epoch": 0.14179184549356222, + "grad_norm": 1.1895849704742432, + "learning_rate": 4.838947716305707e-06, + "loss": 2.2276, + "step": 2643 + }, + { + "epoch": 0.14184549356223175, + "grad_norm": 1.0951863527297974, + "learning_rate": 4.8387942862965775e-06, + "loss": 2.2233, + "step": 2644 + }, + { + "epoch": 0.14189914163090128, + "grad_norm": 1.1402037143707275, + "learning_rate": 4.83864078567278e-06, + "loss": 2.4188, + "step": 2645 + }, + { + "epoch": 0.1419527896995708, + "grad_norm": 1.107161521911621, + "learning_rate": 4.83848721443895e-06, + "loss": 2.3839, + "step": 2646 + }, + { + "epoch": 0.14200643776824035, + "grad_norm": 1.1461299657821655, + "learning_rate": 4.838333572599726e-06, + "loss": 2.0215, + "step": 2647 + }, + { + "epoch": 0.14206008583690988, + "grad_norm": 1.1573461294174194, + "learning_rate": 4.8381798601597435e-06, + "loss": 2.1784, + "step": 2648 + }, + { + "epoch": 0.1421137339055794, + "grad_norm": 1.5497785806655884, + "learning_rate": 4.838026077123645e-06, + "loss": 2.0427, + "step": 2649 + }, + { + "epoch": 0.14216738197424894, + "grad_norm": 1.2822668552398682, + "learning_rate": 4.837872223496075e-06, + "loss": 2.0872, + "step": 2650 + }, + { + "epoch": 0.14222103004291844, + "grad_norm": 1.1856153011322021, + "learning_rate": 4.8377182992816764e-06, + "loss": 2.4139, + "step": 2651 + }, + { + "epoch": 0.14227467811158798, + "grad_norm": 1.19161856174469, + "learning_rate": 4.837564304485098e-06, + "loss": 2.3316, + "step": 2652 + }, + { + "epoch": 0.1423283261802575, + "grad_norm": 1.301226019859314, + "learning_rate": 4.837410239110989e-06, + "loss": 2.2512, + "step": 2653 + }, + { + "epoch": 0.14238197424892704, + "grad_norm": 1.5653005838394165, + "learning_rate": 4.837256103164001e-06, + "loss": 2.2565, + "step": 2654 + }, + { + "epoch": 0.14243562231759657, + "grad_norm": 1.1987214088439941, + "learning_rate": 4.837101896648787e-06, + "loss": 2.3294, + "step": 2655 + }, + { + "epoch": 0.1424892703862661, + "grad_norm": 1.2392146587371826, + "learning_rate": 4.836947619570005e-06, + "loss": 2.3291, + "step": 2656 + }, + { + "epoch": 0.14254291845493564, + "grad_norm": 1.1895787715911865, + "learning_rate": 4.836793271932312e-06, + "loss": 2.3372, + "step": 2657 + }, + { + "epoch": 0.14259656652360514, + "grad_norm": 1.6794284582138062, + "learning_rate": 4.8366388537403676e-06, + "loss": 2.3439, + "step": 2658 + }, + { + "epoch": 0.14265021459227467, + "grad_norm": 1.1115394830703735, + "learning_rate": 4.836484364998835e-06, + "loss": 2.2436, + "step": 2659 + }, + { + "epoch": 0.1427038626609442, + "grad_norm": 1.0934815406799316, + "learning_rate": 4.836329805712378e-06, + "loss": 2.2752, + "step": 2660 + }, + { + "epoch": 0.14275751072961373, + "grad_norm": 0.968967080116272, + "learning_rate": 4.8361751758856625e-06, + "loss": 2.1936, + "step": 2661 + }, + { + "epoch": 0.14281115879828327, + "grad_norm": 1.3964102268218994, + "learning_rate": 4.8360204755233586e-06, + "loss": 2.1108, + "step": 2662 + }, + { + "epoch": 0.1428648068669528, + "grad_norm": 1.295291543006897, + "learning_rate": 4.835865704630137e-06, + "loss": 2.463, + "step": 2663 + }, + { + "epoch": 0.14291845493562233, + "grad_norm": 1.1287435293197632, + "learning_rate": 4.83571086321067e-06, + "loss": 2.2689, + "step": 2664 + }, + { + "epoch": 0.14297210300429183, + "grad_norm": 1.3540992736816406, + "learning_rate": 4.8355559512696325e-06, + "loss": 2.451, + "step": 2665 + }, + { + "epoch": 0.14302575107296137, + "grad_norm": 1.1927950382232666, + "learning_rate": 4.835400968811703e-06, + "loss": 2.4272, + "step": 2666 + }, + { + "epoch": 0.1430793991416309, + "grad_norm": 1.7891123294830322, + "learning_rate": 4.835245915841559e-06, + "loss": 2.5212, + "step": 2667 + }, + { + "epoch": 0.14313304721030043, + "grad_norm": 1.2342780828475952, + "learning_rate": 4.835090792363884e-06, + "loss": 2.4043, + "step": 2668 + }, + { + "epoch": 0.14318669527896996, + "grad_norm": 1.2914084196090698, + "learning_rate": 4.83493559838336e-06, + "loss": 2.5127, + "step": 2669 + }, + { + "epoch": 0.1432403433476395, + "grad_norm": 1.14341139793396, + "learning_rate": 4.834780333904674e-06, + "loss": 2.078, + "step": 2670 + }, + { + "epoch": 0.14329399141630902, + "grad_norm": 1.143470287322998, + "learning_rate": 4.834624998932513e-06, + "loss": 2.3962, + "step": 2671 + }, + { + "epoch": 0.14334763948497853, + "grad_norm": 1.2366676330566406, + "learning_rate": 4.8344695934715654e-06, + "loss": 2.4763, + "step": 2672 + }, + { + "epoch": 0.14340128755364806, + "grad_norm": 1.0483859777450562, + "learning_rate": 4.8343141175265274e-06, + "loss": 2.0984, + "step": 2673 + }, + { + "epoch": 0.1434549356223176, + "grad_norm": 1.587472915649414, + "learning_rate": 4.83415857110209e-06, + "loss": 2.4762, + "step": 2674 + }, + { + "epoch": 0.14350858369098712, + "grad_norm": 1.443004846572876, + "learning_rate": 4.834002954202951e-06, + "loss": 2.4255, + "step": 2675 + }, + { + "epoch": 0.14356223175965666, + "grad_norm": 1.2059435844421387, + "learning_rate": 4.833847266833808e-06, + "loss": 2.401, + "step": 2676 + }, + { + "epoch": 0.1436158798283262, + "grad_norm": 1.1221987009048462, + "learning_rate": 4.833691508999362e-06, + "loss": 2.2795, + "step": 2677 + }, + { + "epoch": 0.14366952789699572, + "grad_norm": 1.1567884683609009, + "learning_rate": 4.833535680704317e-06, + "loss": 2.1773, + "step": 2678 + }, + { + "epoch": 0.14372317596566522, + "grad_norm": 1.020630121231079, + "learning_rate": 4.833379781953375e-06, + "loss": 2.3416, + "step": 2679 + }, + { + "epoch": 0.14377682403433475, + "grad_norm": 1.044460415840149, + "learning_rate": 4.8332238127512455e-06, + "loss": 2.1921, + "step": 2680 + }, + { + "epoch": 0.1438304721030043, + "grad_norm": 1.2360183000564575, + "learning_rate": 4.833067773102637e-06, + "loss": 2.2792, + "step": 2681 + }, + { + "epoch": 0.14388412017167382, + "grad_norm": 1.1277679204940796, + "learning_rate": 4.83291166301226e-06, + "loss": 2.3468, + "step": 2682 + }, + { + "epoch": 0.14393776824034335, + "grad_norm": 1.0997039079666138, + "learning_rate": 4.83275548248483e-06, + "loss": 2.2527, + "step": 2683 + }, + { + "epoch": 0.14399141630901288, + "grad_norm": 1.3994145393371582, + "learning_rate": 4.832599231525059e-06, + "loss": 2.4244, + "step": 2684 + }, + { + "epoch": 0.1440450643776824, + "grad_norm": 1.2338752746582031, + "learning_rate": 4.832442910137669e-06, + "loss": 2.2713, + "step": 2685 + }, + { + "epoch": 0.14409871244635192, + "grad_norm": 0.9556673765182495, + "learning_rate": 4.832286518327376e-06, + "loss": 2.1539, + "step": 2686 + }, + { + "epoch": 0.14415236051502145, + "grad_norm": 1.1534291505813599, + "learning_rate": 4.8321300560989034e-06, + "loss": 2.3384, + "step": 2687 + }, + { + "epoch": 0.14420600858369098, + "grad_norm": 1.063785433769226, + "learning_rate": 4.831973523456975e-06, + "loss": 2.2127, + "step": 2688 + }, + { + "epoch": 0.1442596566523605, + "grad_norm": 1.2987428903579712, + "learning_rate": 4.831816920406318e-06, + "loss": 2.621, + "step": 2689 + }, + { + "epoch": 0.14431330472103004, + "grad_norm": 1.241766095161438, + "learning_rate": 4.83166024695166e-06, + "loss": 2.1739, + "step": 2690 + }, + { + "epoch": 0.14436695278969958, + "grad_norm": 1.3829827308654785, + "learning_rate": 4.831503503097731e-06, + "loss": 2.3216, + "step": 2691 + }, + { + "epoch": 0.1444206008583691, + "grad_norm": 1.428987741470337, + "learning_rate": 4.831346688849263e-06, + "loss": 2.4716, + "step": 2692 + }, + { + "epoch": 0.14447424892703864, + "grad_norm": 0.9514102339744568, + "learning_rate": 4.831189804210992e-06, + "loss": 2.2495, + "step": 2693 + }, + { + "epoch": 0.14452789699570814, + "grad_norm": 1.0548843145370483, + "learning_rate": 4.831032849187654e-06, + "loss": 2.1617, + "step": 2694 + }, + { + "epoch": 0.14458154506437768, + "grad_norm": 1.2221304178237915, + "learning_rate": 4.830875823783989e-06, + "loss": 2.3313, + "step": 2695 + }, + { + "epoch": 0.1446351931330472, + "grad_norm": 1.1903204917907715, + "learning_rate": 4.830718728004736e-06, + "loss": 2.4718, + "step": 2696 + }, + { + "epoch": 0.14468884120171674, + "grad_norm": 1.1820785999298096, + "learning_rate": 4.83056156185464e-06, + "loss": 2.3183, + "step": 2697 + }, + { + "epoch": 0.14474248927038627, + "grad_norm": 1.166542410850525, + "learning_rate": 4.8304043253384454e-06, + "loss": 2.3058, + "step": 2698 + }, + { + "epoch": 0.1447961373390558, + "grad_norm": 1.6744433641433716, + "learning_rate": 4.8302470184609e-06, + "loss": 1.9108, + "step": 2699 + }, + { + "epoch": 0.14484978540772533, + "grad_norm": 1.3729876279830933, + "learning_rate": 4.830089641226753e-06, + "loss": 1.4932, + "step": 2700 + }, + { + "epoch": 0.14490343347639484, + "grad_norm": 1.3682063817977905, + "learning_rate": 4.829932193640756e-06, + "loss": 2.1562, + "step": 2701 + }, + { + "epoch": 0.14495708154506437, + "grad_norm": 1.3546043634414673, + "learning_rate": 4.8297746757076635e-06, + "loss": 2.6705, + "step": 2702 + }, + { + "epoch": 0.1450107296137339, + "grad_norm": 1.306232213973999, + "learning_rate": 4.82961708743223e-06, + "loss": 2.2309, + "step": 2703 + }, + { + "epoch": 0.14506437768240343, + "grad_norm": 1.5348254442214966, + "learning_rate": 4.829459428819215e-06, + "loss": 2.2823, + "step": 2704 + }, + { + "epoch": 0.14511802575107297, + "grad_norm": 1.1961326599121094, + "learning_rate": 4.829301699873377e-06, + "loss": 2.4985, + "step": 2705 + }, + { + "epoch": 0.1451716738197425, + "grad_norm": 1.248276710510254, + "learning_rate": 4.829143900599481e-06, + "loss": 2.3922, + "step": 2706 + }, + { + "epoch": 0.14522532188841203, + "grad_norm": 1.1918928623199463, + "learning_rate": 4.828986031002289e-06, + "loss": 2.1168, + "step": 2707 + }, + { + "epoch": 0.14527896995708153, + "grad_norm": 1.1401969194412231, + "learning_rate": 4.828828091086568e-06, + "loss": 2.4647, + "step": 2708 + }, + { + "epoch": 0.14533261802575106, + "grad_norm": 1.1937243938446045, + "learning_rate": 4.828670080857087e-06, + "loss": 2.1161, + "step": 2709 + }, + { + "epoch": 0.1453862660944206, + "grad_norm": 1.3287962675094604, + "learning_rate": 4.828512000318617e-06, + "loss": 2.4256, + "step": 2710 + }, + { + "epoch": 0.14543991416309013, + "grad_norm": 3.6277103424072266, + "learning_rate": 4.82835384947593e-06, + "loss": 2.3067, + "step": 2711 + }, + { + "epoch": 0.14549356223175966, + "grad_norm": 1.1979738473892212, + "learning_rate": 4.828195628333802e-06, + "loss": 2.1231, + "step": 2712 + }, + { + "epoch": 0.1455472103004292, + "grad_norm": 1.2800036668777466, + "learning_rate": 4.828037336897009e-06, + "loss": 2.2545, + "step": 2713 + }, + { + "epoch": 0.14560085836909872, + "grad_norm": 0.9817940592765808, + "learning_rate": 4.827878975170331e-06, + "loss": 2.2189, + "step": 2714 + }, + { + "epoch": 0.14565450643776823, + "grad_norm": 1.2749505043029785, + "learning_rate": 4.82772054315855e-06, + "loss": 2.2535, + "step": 2715 + }, + { + "epoch": 0.14570815450643776, + "grad_norm": 1.1903676986694336, + "learning_rate": 4.827562040866448e-06, + "loss": 2.467, + "step": 2716 + }, + { + "epoch": 0.1457618025751073, + "grad_norm": 1.4522608518600464, + "learning_rate": 4.827403468298812e-06, + "loss": 2.3076, + "step": 2717 + }, + { + "epoch": 0.14581545064377682, + "grad_norm": 1.3967574834823608, + "learning_rate": 4.82724482546043e-06, + "loss": 2.3934, + "step": 2718 + }, + { + "epoch": 0.14586909871244635, + "grad_norm": 1.1825729608535767, + "learning_rate": 4.82708611235609e-06, + "loss": 2.2789, + "step": 2719 + }, + { + "epoch": 0.1459227467811159, + "grad_norm": 1.2014063596725464, + "learning_rate": 4.826927328990585e-06, + "loss": 2.3136, + "step": 2720 + }, + { + "epoch": 0.14597639484978542, + "grad_norm": 1.0787264108657837, + "learning_rate": 4.8267684753687086e-06, + "loss": 2.1486, + "step": 2721 + }, + { + "epoch": 0.14603004291845492, + "grad_norm": 1.1232953071594238, + "learning_rate": 4.826609551495259e-06, + "loss": 2.3815, + "step": 2722 + }, + { + "epoch": 0.14608369098712445, + "grad_norm": 1.70075261592865, + "learning_rate": 4.826450557375032e-06, + "loss": 2.2075, + "step": 2723 + }, + { + "epoch": 0.14613733905579399, + "grad_norm": 1.1785184144973755, + "learning_rate": 4.826291493012829e-06, + "loss": 2.0765, + "step": 2724 + }, + { + "epoch": 0.14619098712446352, + "grad_norm": 1.2155002355575562, + "learning_rate": 4.826132358413453e-06, + "loss": 2.3931, + "step": 2725 + }, + { + "epoch": 0.14624463519313305, + "grad_norm": 1.232975959777832, + "learning_rate": 4.825973153581709e-06, + "loss": 2.6078, + "step": 2726 + }, + { + "epoch": 0.14629828326180258, + "grad_norm": 1.1197890043258667, + "learning_rate": 4.825813878522403e-06, + "loss": 2.2705, + "step": 2727 + }, + { + "epoch": 0.1463519313304721, + "grad_norm": 1.2148678302764893, + "learning_rate": 4.825654533240345e-06, + "loss": 2.3436, + "step": 2728 + }, + { + "epoch": 0.14640557939914164, + "grad_norm": 1.0417633056640625, + "learning_rate": 4.825495117740344e-06, + "loss": 2.1067, + "step": 2729 + }, + { + "epoch": 0.14645922746781115, + "grad_norm": 1.2266037464141846, + "learning_rate": 4.825335632027216e-06, + "loss": 2.3441, + "step": 2730 + }, + { + "epoch": 0.14651287553648068, + "grad_norm": 1.3694885969161987, + "learning_rate": 4.825176076105773e-06, + "loss": 2.1666, + "step": 2731 + }, + { + "epoch": 0.1465665236051502, + "grad_norm": 1.0907225608825684, + "learning_rate": 4.8250164499808364e-06, + "loss": 2.2457, + "step": 2732 + }, + { + "epoch": 0.14662017167381974, + "grad_norm": 1.1504307985305786, + "learning_rate": 4.8248567536572225e-06, + "loss": 2.4923, + "step": 2733 + }, + { + "epoch": 0.14667381974248928, + "grad_norm": 1.1006940603256226, + "learning_rate": 4.8246969871397544e-06, + "loss": 2.2881, + "step": 2734 + }, + { + "epoch": 0.1467274678111588, + "grad_norm": 2.4016170501708984, + "learning_rate": 4.824537150433255e-06, + "loss": 2.6694, + "step": 2735 + }, + { + "epoch": 0.14678111587982834, + "grad_norm": 1.1272023916244507, + "learning_rate": 4.824377243542551e-06, + "loss": 2.0996, + "step": 2736 + }, + { + "epoch": 0.14683476394849784, + "grad_norm": 0.9871650338172913, + "learning_rate": 4.824217266472472e-06, + "loss": 2.2405, + "step": 2737 + }, + { + "epoch": 0.14688841201716737, + "grad_norm": 1.145375370979309, + "learning_rate": 4.824057219227844e-06, + "loss": 2.5092, + "step": 2738 + }, + { + "epoch": 0.1469420600858369, + "grad_norm": 1.9291049242019653, + "learning_rate": 4.823897101813503e-06, + "loss": 2.3695, + "step": 2739 + }, + { + "epoch": 0.14699570815450644, + "grad_norm": 1.111106514930725, + "learning_rate": 4.8237369142342825e-06, + "loss": 2.1868, + "step": 2740 + }, + { + "epoch": 0.14704935622317597, + "grad_norm": 1.0375750064849854, + "learning_rate": 4.823576656495018e-06, + "loss": 2.3434, + "step": 2741 + }, + { + "epoch": 0.1471030042918455, + "grad_norm": 1.190987467765808, + "learning_rate": 4.823416328600548e-06, + "loss": 2.398, + "step": 2742 + }, + { + "epoch": 0.14715665236051503, + "grad_norm": 1.3128366470336914, + "learning_rate": 4.823255930555715e-06, + "loss": 2.2623, + "step": 2743 + }, + { + "epoch": 0.14721030042918454, + "grad_norm": 1.173422932624817, + "learning_rate": 4.82309546236536e-06, + "loss": 2.384, + "step": 2744 + }, + { + "epoch": 0.14726394849785407, + "grad_norm": 1.2136934995651245, + "learning_rate": 4.8229349240343295e-06, + "loss": 2.4563, + "step": 2745 + }, + { + "epoch": 0.1473175965665236, + "grad_norm": 1.2677202224731445, + "learning_rate": 4.822774315567469e-06, + "loss": 2.3236, + "step": 2746 + }, + { + "epoch": 0.14737124463519313, + "grad_norm": 1.568453073501587, + "learning_rate": 4.822613636969629e-06, + "loss": 2.5263, + "step": 2747 + }, + { + "epoch": 0.14742489270386266, + "grad_norm": 1.0851116180419922, + "learning_rate": 4.822452888245661e-06, + "loss": 2.2211, + "step": 2748 + }, + { + "epoch": 0.1474785407725322, + "grad_norm": 1.0469335317611694, + "learning_rate": 4.822292069400417e-06, + "loss": 2.1514, + "step": 2749 + }, + { + "epoch": 0.14753218884120173, + "grad_norm": 1.0660486221313477, + "learning_rate": 4.822131180438754e-06, + "loss": 2.1633, + "step": 2750 + }, + { + "epoch": 0.14758583690987123, + "grad_norm": 1.3153481483459473, + "learning_rate": 4.821970221365529e-06, + "loss": 2.2428, + "step": 2751 + }, + { + "epoch": 0.14763948497854076, + "grad_norm": 1.3628638982772827, + "learning_rate": 4.821809192185602e-06, + "loss": 2.3404, + "step": 2752 + }, + { + "epoch": 0.1476931330472103, + "grad_norm": 1.3208353519439697, + "learning_rate": 4.821648092903834e-06, + "loss": 2.2749, + "step": 2753 + }, + { + "epoch": 0.14774678111587983, + "grad_norm": 1.1756128072738647, + "learning_rate": 4.821486923525091e-06, + "loss": 2.5399, + "step": 2754 + }, + { + "epoch": 0.14780042918454936, + "grad_norm": 1.1231908798217773, + "learning_rate": 4.8213256840542375e-06, + "loss": 1.9654, + "step": 2755 + }, + { + "epoch": 0.1478540772532189, + "grad_norm": 1.257382869720459, + "learning_rate": 4.821164374496143e-06, + "loss": 2.3145, + "step": 2756 + }, + { + "epoch": 0.14790772532188842, + "grad_norm": 1.0137689113616943, + "learning_rate": 4.821002994855677e-06, + "loss": 2.188, + "step": 2757 + }, + { + "epoch": 0.14796137339055793, + "grad_norm": 1.242363691329956, + "learning_rate": 4.820841545137711e-06, + "loss": 2.2597, + "step": 2758 + }, + { + "epoch": 0.14801502145922746, + "grad_norm": 1.6212775707244873, + "learning_rate": 4.820680025347122e-06, + "loss": 2.2932, + "step": 2759 + }, + { + "epoch": 0.148068669527897, + "grad_norm": 1.1316384077072144, + "learning_rate": 4.8205184354887845e-06, + "loss": 2.3879, + "step": 2760 + }, + { + "epoch": 0.14812231759656652, + "grad_norm": 1.2950024604797363, + "learning_rate": 4.820356775567579e-06, + "loss": 2.5604, + "step": 2761 + }, + { + "epoch": 0.14817596566523605, + "grad_norm": 1.0231729745864868, + "learning_rate": 4.820195045588386e-06, + "loss": 2.1551, + "step": 2762 + }, + { + "epoch": 0.14822961373390559, + "grad_norm": 1.1873359680175781, + "learning_rate": 4.820033245556088e-06, + "loss": 2.269, + "step": 2763 + }, + { + "epoch": 0.14828326180257512, + "grad_norm": 1.729657530784607, + "learning_rate": 4.81987137547557e-06, + "loss": 2.2263, + "step": 2764 + }, + { + "epoch": 0.14833690987124465, + "grad_norm": 1.1941564083099365, + "learning_rate": 4.819709435351721e-06, + "loss": 2.2716, + "step": 2765 + }, + { + "epoch": 0.14839055793991415, + "grad_norm": 1.1694934368133545, + "learning_rate": 4.819547425189429e-06, + "loss": 2.3421, + "step": 2766 + }, + { + "epoch": 0.14844420600858368, + "grad_norm": 0.9682570695877075, + "learning_rate": 4.819385344993586e-06, + "loss": 2.4654, + "step": 2767 + }, + { + "epoch": 0.14849785407725322, + "grad_norm": 1.313369870185852, + "learning_rate": 4.819223194769085e-06, + "loss": 2.1191, + "step": 2768 + }, + { + "epoch": 0.14855150214592275, + "grad_norm": 1.1320074796676636, + "learning_rate": 4.819060974520821e-06, + "loss": 2.0982, + "step": 2769 + }, + { + "epoch": 0.14860515021459228, + "grad_norm": 1.023849606513977, + "learning_rate": 4.818898684253695e-06, + "loss": 2.0897, + "step": 2770 + }, + { + "epoch": 0.1486587982832618, + "grad_norm": 1.3047869205474854, + "learning_rate": 4.818736323972605e-06, + "loss": 2.2591, + "step": 2771 + }, + { + "epoch": 0.14871244635193134, + "grad_norm": 1.1748082637786865, + "learning_rate": 4.8185738936824536e-06, + "loss": 2.225, + "step": 2772 + }, + { + "epoch": 0.14876609442060085, + "grad_norm": 1.1303651332855225, + "learning_rate": 4.818411393388144e-06, + "loss": 2.4475, + "step": 2773 + }, + { + "epoch": 0.14881974248927038, + "grad_norm": 1.0879414081573486, + "learning_rate": 4.818248823094583e-06, + "loss": 2.203, + "step": 2774 + }, + { + "epoch": 0.1488733905579399, + "grad_norm": 1.073878288269043, + "learning_rate": 4.818086182806679e-06, + "loss": 2.1514, + "step": 2775 + }, + { + "epoch": 0.14892703862660944, + "grad_norm": 1.2259843349456787, + "learning_rate": 4.817923472529344e-06, + "loss": 2.4409, + "step": 2776 + }, + { + "epoch": 0.14898068669527897, + "grad_norm": 1.2612895965576172, + "learning_rate": 4.817760692267488e-06, + "loss": 2.5617, + "step": 2777 + }, + { + "epoch": 0.1490343347639485, + "grad_norm": 1.1186659336090088, + "learning_rate": 4.8175978420260275e-06, + "loss": 2.4181, + "step": 2778 + }, + { + "epoch": 0.14908798283261804, + "grad_norm": 1.0115100145339966, + "learning_rate": 4.817434921809879e-06, + "loss": 2.0459, + "step": 2779 + }, + { + "epoch": 0.14914163090128754, + "grad_norm": 1.2720636129379272, + "learning_rate": 4.817271931623962e-06, + "loss": 2.2539, + "step": 2780 + }, + { + "epoch": 0.14919527896995707, + "grad_norm": 1.054483413696289, + "learning_rate": 4.817108871473198e-06, + "loss": 2.3351, + "step": 2781 + }, + { + "epoch": 0.1492489270386266, + "grad_norm": 1.2497326135635376, + "learning_rate": 4.816945741362508e-06, + "loss": 2.2135, + "step": 2782 + }, + { + "epoch": 0.14930257510729614, + "grad_norm": 1.3859692811965942, + "learning_rate": 4.816782541296821e-06, + "loss": 2.2562, + "step": 2783 + }, + { + "epoch": 0.14935622317596567, + "grad_norm": 1.254451870918274, + "learning_rate": 4.8166192712810615e-06, + "loss": 2.4086, + "step": 2784 + }, + { + "epoch": 0.1494098712446352, + "grad_norm": 1.0867466926574707, + "learning_rate": 4.816455931320159e-06, + "loss": 2.0221, + "step": 2785 + }, + { + "epoch": 0.14946351931330473, + "grad_norm": 1.1539472341537476, + "learning_rate": 4.816292521419046e-06, + "loss": 2.4382, + "step": 2786 + }, + { + "epoch": 0.14951716738197424, + "grad_norm": 1.2179774045944214, + "learning_rate": 4.816129041582658e-06, + "loss": 2.5761, + "step": 2787 + }, + { + "epoch": 0.14957081545064377, + "grad_norm": 1.195760726928711, + "learning_rate": 4.815965491815929e-06, + "loss": 1.7838, + "step": 2788 + }, + { + "epoch": 0.1496244635193133, + "grad_norm": 1.1972638368606567, + "learning_rate": 4.815801872123795e-06, + "loss": 2.3568, + "step": 2789 + }, + { + "epoch": 0.14967811158798283, + "grad_norm": 1.3246418237686157, + "learning_rate": 4.815638182511201e-06, + "loss": 2.216, + "step": 2790 + }, + { + "epoch": 0.14973175965665236, + "grad_norm": 1.1295033693313599, + "learning_rate": 4.8154744229830856e-06, + "loss": 2.2183, + "step": 2791 + }, + { + "epoch": 0.1497854077253219, + "grad_norm": 1.429836630821228, + "learning_rate": 4.815310593544394e-06, + "loss": 2.3083, + "step": 2792 + }, + { + "epoch": 0.14983905579399143, + "grad_norm": 0.9989696145057678, + "learning_rate": 4.8151466942000735e-06, + "loss": 2.2779, + "step": 2793 + }, + { + "epoch": 0.14989270386266093, + "grad_norm": 1.1302649974822998, + "learning_rate": 4.814982724955072e-06, + "loss": 2.4467, + "step": 2794 + }, + { + "epoch": 0.14994635193133046, + "grad_norm": 1.0692965984344482, + "learning_rate": 4.81481868581434e-06, + "loss": 2.2643, + "step": 2795 + }, + { + "epoch": 0.15, + "grad_norm": 1.1162033081054688, + "learning_rate": 4.81465457678283e-06, + "loss": 2.3903, + "step": 2796 + }, + { + "epoch": 0.15005364806866953, + "grad_norm": 1.286103367805481, + "learning_rate": 4.814490397865499e-06, + "loss": 2.565, + "step": 2797 + }, + { + "epoch": 0.15010729613733906, + "grad_norm": 1.2736555337905884, + "learning_rate": 4.814326149067301e-06, + "loss": 2.2514, + "step": 2798 + }, + { + "epoch": 0.1501609442060086, + "grad_norm": 1.3678972721099854, + "learning_rate": 4.814161830393197e-06, + "loss": 2.4858, + "step": 2799 + }, + { + "epoch": 0.15021459227467812, + "grad_norm": 1.3407158851623535, + "learning_rate": 4.813997441848148e-06, + "loss": 2.4033, + "step": 2800 + }, + { + "epoch": 0.15026824034334765, + "grad_norm": 1.9348576068878174, + "learning_rate": 4.8138329834371176e-06, + "loss": 1.3548, + "step": 2801 + }, + { + "epoch": 0.15032188841201716, + "grad_norm": 1.1590474843978882, + "learning_rate": 4.81366845516507e-06, + "loss": 2.0311, + "step": 2802 + }, + { + "epoch": 0.1503755364806867, + "grad_norm": 1.1259260177612305, + "learning_rate": 4.813503857036974e-06, + "loss": 2.4137, + "step": 2803 + }, + { + "epoch": 0.15042918454935622, + "grad_norm": 1.2435444593429565, + "learning_rate": 4.8133391890578e-06, + "loss": 2.2315, + "step": 2804 + }, + { + "epoch": 0.15048283261802575, + "grad_norm": 0.9682391285896301, + "learning_rate": 4.813174451232517e-06, + "loss": 2.1987, + "step": 2805 + }, + { + "epoch": 0.15053648068669528, + "grad_norm": 1.1212704181671143, + "learning_rate": 4.813009643566101e-06, + "loss": 2.2888, + "step": 2806 + }, + { + "epoch": 0.15059012875536482, + "grad_norm": 1.0823678970336914, + "learning_rate": 4.812844766063528e-06, + "loss": 2.5039, + "step": 2807 + }, + { + "epoch": 0.15064377682403435, + "grad_norm": 1.1434557437896729, + "learning_rate": 4.812679818729776e-06, + "loss": 2.1472, + "step": 2808 + }, + { + "epoch": 0.15069742489270385, + "grad_norm": 1.2345346212387085, + "learning_rate": 4.8125148015698235e-06, + "loss": 2.1529, + "step": 2809 + }, + { + "epoch": 0.15075107296137338, + "grad_norm": 1.1005955934524536, + "learning_rate": 4.812349714588654e-06, + "loss": 2.1996, + "step": 2810 + }, + { + "epoch": 0.15080472103004292, + "grad_norm": 1.015892505645752, + "learning_rate": 4.812184557791254e-06, + "loss": 2.0645, + "step": 2811 + }, + { + "epoch": 0.15085836909871245, + "grad_norm": 1.39345121383667, + "learning_rate": 4.8120193311826065e-06, + "loss": 2.0786, + "step": 2812 + }, + { + "epoch": 0.15091201716738198, + "grad_norm": 1.3849331140518188, + "learning_rate": 4.811854034767703e-06, + "loss": 2.2318, + "step": 2813 + }, + { + "epoch": 0.1509656652360515, + "grad_norm": 1.222198486328125, + "learning_rate": 4.811688668551533e-06, + "loss": 2.1871, + "step": 2814 + }, + { + "epoch": 0.15101931330472104, + "grad_norm": 1.1931946277618408, + "learning_rate": 4.811523232539089e-06, + "loss": 2.0753, + "step": 2815 + }, + { + "epoch": 0.15107296137339055, + "grad_norm": 1.2346850633621216, + "learning_rate": 4.8113577267353664e-06, + "loss": 2.435, + "step": 2816 + }, + { + "epoch": 0.15112660944206008, + "grad_norm": 2.9288177490234375, + "learning_rate": 4.811192151145362e-06, + "loss": 2.2436, + "step": 2817 + }, + { + "epoch": 0.1511802575107296, + "grad_norm": 1.1076375246047974, + "learning_rate": 4.8110265057740755e-06, + "loss": 2.0994, + "step": 2818 + }, + { + "epoch": 0.15123390557939914, + "grad_norm": 1.2407209873199463, + "learning_rate": 4.810860790626508e-06, + "loss": 1.3745, + "step": 2819 + }, + { + "epoch": 0.15128755364806867, + "grad_norm": 1.0911478996276855, + "learning_rate": 4.810695005707663e-06, + "loss": 2.2423, + "step": 2820 + }, + { + "epoch": 0.1513412017167382, + "grad_norm": 1.8436505794525146, + "learning_rate": 4.810529151022546e-06, + "loss": 2.3215, + "step": 2821 + }, + { + "epoch": 0.15139484978540774, + "grad_norm": 1.2501153945922852, + "learning_rate": 4.810363226576164e-06, + "loss": 2.4171, + "step": 2822 + }, + { + "epoch": 0.15144849785407724, + "grad_norm": 1.1345223188400269, + "learning_rate": 4.8101972323735265e-06, + "loss": 2.2344, + "step": 2823 + }, + { + "epoch": 0.15150214592274677, + "grad_norm": 1.1944572925567627, + "learning_rate": 4.810031168419647e-06, + "loss": 2.2793, + "step": 2824 + }, + { + "epoch": 0.1515557939914163, + "grad_norm": 1.3816077709197998, + "learning_rate": 4.809865034719539e-06, + "loss": 2.5368, + "step": 2825 + }, + { + "epoch": 0.15160944206008584, + "grad_norm": 1.3680733442306519, + "learning_rate": 4.809698831278217e-06, + "loss": 2.1798, + "step": 2826 + }, + { + "epoch": 0.15166309012875537, + "grad_norm": 1.2031583786010742, + "learning_rate": 4.8095325581007e-06, + "loss": 2.3402, + "step": 2827 + }, + { + "epoch": 0.1517167381974249, + "grad_norm": 1.217227816581726, + "learning_rate": 4.80936621519201e-06, + "loss": 2.1276, + "step": 2828 + }, + { + "epoch": 0.15177038626609443, + "grad_norm": 1.1827473640441895, + "learning_rate": 4.809199802557166e-06, + "loss": 2.2513, + "step": 2829 + }, + { + "epoch": 0.15182403433476394, + "grad_norm": 1.2194539308547974, + "learning_rate": 4.8090333202011955e-06, + "loss": 2.2915, + "step": 2830 + }, + { + "epoch": 0.15187768240343347, + "grad_norm": 1.2167216539382935, + "learning_rate": 4.808866768129122e-06, + "loss": 2.2094, + "step": 2831 + }, + { + "epoch": 0.151931330472103, + "grad_norm": 1.0072201490402222, + "learning_rate": 4.808700146345977e-06, + "loss": 2.1026, + "step": 2832 + }, + { + "epoch": 0.15198497854077253, + "grad_norm": 1.1857491731643677, + "learning_rate": 4.80853345485679e-06, + "loss": 2.2986, + "step": 2833 + }, + { + "epoch": 0.15203862660944206, + "grad_norm": 1.2352328300476074, + "learning_rate": 4.808366693666594e-06, + "loss": 2.23, + "step": 2834 + }, + { + "epoch": 0.1520922746781116, + "grad_norm": 0.9971647262573242, + "learning_rate": 4.808199862780425e-06, + "loss": 2.1973, + "step": 2835 + }, + { + "epoch": 0.15214592274678113, + "grad_norm": 1.0250343084335327, + "learning_rate": 4.808032962203319e-06, + "loss": 2.1758, + "step": 2836 + }, + { + "epoch": 0.15219957081545063, + "grad_norm": 1.3263059854507446, + "learning_rate": 4.8078659919403134e-06, + "loss": 2.1637, + "step": 2837 + }, + { + "epoch": 0.15225321888412016, + "grad_norm": 1.5190186500549316, + "learning_rate": 4.807698951996453e-06, + "loss": 2.3645, + "step": 2838 + }, + { + "epoch": 0.1523068669527897, + "grad_norm": 1.33429753780365, + "learning_rate": 4.807531842376779e-06, + "loss": 2.2151, + "step": 2839 + }, + { + "epoch": 0.15236051502145923, + "grad_norm": 1.1994478702545166, + "learning_rate": 4.807364663086338e-06, + "loss": 2.334, + "step": 2840 + }, + { + "epoch": 0.15241416309012876, + "grad_norm": 1.367074966430664, + "learning_rate": 4.807197414130177e-06, + "loss": 2.3755, + "step": 2841 + }, + { + "epoch": 0.1524678111587983, + "grad_norm": 1.1799287796020508, + "learning_rate": 4.8070300955133455e-06, + "loss": 2.4564, + "step": 2842 + }, + { + "epoch": 0.15252145922746782, + "grad_norm": 1.0291837453842163, + "learning_rate": 4.806862707240896e-06, + "loss": 2.1146, + "step": 2843 + }, + { + "epoch": 0.15257510729613735, + "grad_norm": 1.1938090324401855, + "learning_rate": 4.806695249317881e-06, + "loss": 2.3382, + "step": 2844 + }, + { + "epoch": 0.15262875536480686, + "grad_norm": 1.1887747049331665, + "learning_rate": 4.806527721749359e-06, + "loss": 2.4546, + "step": 2845 + }, + { + "epoch": 0.1526824034334764, + "grad_norm": 1.1367768049240112, + "learning_rate": 4.806360124540386e-06, + "loss": 2.4913, + "step": 2846 + }, + { + "epoch": 0.15273605150214592, + "grad_norm": 1.1260086297988892, + "learning_rate": 4.806192457696024e-06, + "loss": 2.1192, + "step": 2847 + }, + { + "epoch": 0.15278969957081545, + "grad_norm": 1.0587607622146606, + "learning_rate": 4.806024721221333e-06, + "loss": 2.4204, + "step": 2848 + }, + { + "epoch": 0.15284334763948498, + "grad_norm": 1.30533766746521, + "learning_rate": 4.805856915121379e-06, + "loss": 2.3008, + "step": 2849 + }, + { + "epoch": 0.15289699570815452, + "grad_norm": 1.5247464179992676, + "learning_rate": 4.805689039401228e-06, + "loss": 2.2667, + "step": 2850 + }, + { + "epoch": 0.15295064377682405, + "grad_norm": 1.1710468530654907, + "learning_rate": 4.805521094065949e-06, + "loss": 2.3488, + "step": 2851 + }, + { + "epoch": 0.15300429184549355, + "grad_norm": 1.6017030477523804, + "learning_rate": 4.805353079120612e-06, + "loss": 2.2721, + "step": 2852 + }, + { + "epoch": 0.15305793991416308, + "grad_norm": 1.4532430171966553, + "learning_rate": 4.805184994570291e-06, + "loss": 2.0727, + "step": 2853 + }, + { + "epoch": 0.15311158798283261, + "grad_norm": 1.1741102933883667, + "learning_rate": 4.80501684042006e-06, + "loss": 2.0903, + "step": 2854 + }, + { + "epoch": 0.15316523605150215, + "grad_norm": 1.1631425619125366, + "learning_rate": 4.804848616674997e-06, + "loss": 2.3127, + "step": 2855 + }, + { + "epoch": 0.15321888412017168, + "grad_norm": 1.1531758308410645, + "learning_rate": 4.80468032334018e-06, + "loss": 2.1923, + "step": 2856 + }, + { + "epoch": 0.1532725321888412, + "grad_norm": 1.1437016725540161, + "learning_rate": 4.804511960420691e-06, + "loss": 2.2679, + "step": 2857 + }, + { + "epoch": 0.15332618025751074, + "grad_norm": 1.3112766742706299, + "learning_rate": 4.804343527921613e-06, + "loss": 2.1199, + "step": 2858 + }, + { + "epoch": 0.15337982832618025, + "grad_norm": 1.145240306854248, + "learning_rate": 4.804175025848032e-06, + "loss": 2.3508, + "step": 2859 + }, + { + "epoch": 0.15343347639484978, + "grad_norm": 1.6694886684417725, + "learning_rate": 4.8040064542050345e-06, + "loss": 2.5529, + "step": 2860 + }, + { + "epoch": 0.1534871244635193, + "grad_norm": 1.0723209381103516, + "learning_rate": 4.80383781299771e-06, + "loss": 2.3574, + "step": 2861 + }, + { + "epoch": 0.15354077253218884, + "grad_norm": 1.242425799369812, + "learning_rate": 4.803669102231152e-06, + "loss": 2.4926, + "step": 2862 + }, + { + "epoch": 0.15359442060085837, + "grad_norm": 1.1655932664871216, + "learning_rate": 4.803500321910453e-06, + "loss": 2.1249, + "step": 2863 + }, + { + "epoch": 0.1536480686695279, + "grad_norm": 1.310933232307434, + "learning_rate": 4.80333147204071e-06, + "loss": 2.1902, + "step": 2864 + }, + { + "epoch": 0.15370171673819744, + "grad_norm": 1.14324152469635, + "learning_rate": 4.80316255262702e-06, + "loss": 2.259, + "step": 2865 + }, + { + "epoch": 0.15375536480686694, + "grad_norm": 1.2345800399780273, + "learning_rate": 4.802993563674483e-06, + "loss": 2.2076, + "step": 2866 + }, + { + "epoch": 0.15380901287553647, + "grad_norm": 1.1824792623519897, + "learning_rate": 4.802824505188202e-06, + "loss": 2.2537, + "step": 2867 + }, + { + "epoch": 0.153862660944206, + "grad_norm": 1.1872730255126953, + "learning_rate": 4.802655377173281e-06, + "loss": 2.2983, + "step": 2868 + }, + { + "epoch": 0.15391630901287554, + "grad_norm": 1.2097169160842896, + "learning_rate": 4.802486179634827e-06, + "loss": 2.2547, + "step": 2869 + }, + { + "epoch": 0.15396995708154507, + "grad_norm": 1.0807543992996216, + "learning_rate": 4.802316912577947e-06, + "loss": 2.268, + "step": 2870 + }, + { + "epoch": 0.1540236051502146, + "grad_norm": 1.1962196826934814, + "learning_rate": 4.802147576007753e-06, + "loss": 2.3631, + "step": 2871 + }, + { + "epoch": 0.15407725321888413, + "grad_norm": 1.0260673761367798, + "learning_rate": 4.801978169929357e-06, + "loss": 2.0644, + "step": 2872 + }, + { + "epoch": 0.15413090128755363, + "grad_norm": 1.2221707105636597, + "learning_rate": 4.801808694347875e-06, + "loss": 2.2612, + "step": 2873 + }, + { + "epoch": 0.15418454935622317, + "grad_norm": 1.1478830575942993, + "learning_rate": 4.8016391492684226e-06, + "loss": 2.2952, + "step": 2874 + }, + { + "epoch": 0.1542381974248927, + "grad_norm": 1.1226656436920166, + "learning_rate": 4.80146953469612e-06, + "loss": 2.2966, + "step": 2875 + }, + { + "epoch": 0.15429184549356223, + "grad_norm": 1.188783884048462, + "learning_rate": 4.801299850636087e-06, + "loss": 1.9943, + "step": 2876 + }, + { + "epoch": 0.15434549356223176, + "grad_norm": 1.115997076034546, + "learning_rate": 4.801130097093448e-06, + "loss": 2.3906, + "step": 2877 + }, + { + "epoch": 0.1543991416309013, + "grad_norm": 1.3525177240371704, + "learning_rate": 4.8009602740733284e-06, + "loss": 2.1624, + "step": 2878 + }, + { + "epoch": 0.15445278969957082, + "grad_norm": 1.3838391304016113, + "learning_rate": 4.800790381580854e-06, + "loss": 2.3437, + "step": 2879 + }, + { + "epoch": 0.15450643776824036, + "grad_norm": 1.6384971141815186, + "learning_rate": 4.800620419621156e-06, + "loss": 2.5869, + "step": 2880 + }, + { + "epoch": 0.15456008583690986, + "grad_norm": 1.213053822517395, + "learning_rate": 4.800450388199366e-06, + "loss": 2.5044, + "step": 2881 + }, + { + "epoch": 0.1546137339055794, + "grad_norm": 1.0747272968292236, + "learning_rate": 4.800280287320617e-06, + "loss": 2.1819, + "step": 2882 + }, + { + "epoch": 0.15466738197424892, + "grad_norm": 1.104613184928894, + "learning_rate": 4.800110116990044e-06, + "loss": 2.321, + "step": 2883 + }, + { + "epoch": 0.15472103004291846, + "grad_norm": 1.3164948225021362, + "learning_rate": 4.7999398772127875e-06, + "loss": 2.4182, + "step": 2884 + }, + { + "epoch": 0.154774678111588, + "grad_norm": 1.271101951599121, + "learning_rate": 4.799769567993985e-06, + "loss": 2.4235, + "step": 2885 + }, + { + "epoch": 0.15482832618025752, + "grad_norm": 1.2206676006317139, + "learning_rate": 4.799599189338779e-06, + "loss": 2.3002, + "step": 2886 + }, + { + "epoch": 0.15488197424892705, + "grad_norm": 1.2204235792160034, + "learning_rate": 4.799428741252315e-06, + "loss": 2.0242, + "step": 2887 + }, + { + "epoch": 0.15493562231759656, + "grad_norm": 1.106865644454956, + "learning_rate": 4.799258223739739e-06, + "loss": 1.9556, + "step": 2888 + }, + { + "epoch": 0.1549892703862661, + "grad_norm": 1.1434762477874756, + "learning_rate": 4.799087636806199e-06, + "loss": 2.2644, + "step": 2889 + }, + { + "epoch": 0.15504291845493562, + "grad_norm": 1.2054431438446045, + "learning_rate": 4.798916980456845e-06, + "loss": 2.4804, + "step": 2890 + }, + { + "epoch": 0.15509656652360515, + "grad_norm": 1.1112333536148071, + "learning_rate": 4.798746254696829e-06, + "loss": 2.3868, + "step": 2891 + }, + { + "epoch": 0.15515021459227468, + "grad_norm": 1.7496609687805176, + "learning_rate": 4.7985754595313075e-06, + "loss": 2.243, + "step": 2892 + }, + { + "epoch": 0.15520386266094421, + "grad_norm": 1.0955934524536133, + "learning_rate": 4.798404594965437e-06, + "loss": 2.2166, + "step": 2893 + }, + { + "epoch": 0.15525751072961375, + "grad_norm": 1.0978455543518066, + "learning_rate": 4.798233661004376e-06, + "loss": 2.3146, + "step": 2894 + }, + { + "epoch": 0.15531115879828325, + "grad_norm": 1.270133137702942, + "learning_rate": 4.798062657653284e-06, + "loss": 2.3247, + "step": 2895 + }, + { + "epoch": 0.15536480686695278, + "grad_norm": 1.2818571329116821, + "learning_rate": 4.797891584917326e-06, + "loss": 1.7678, + "step": 2896 + }, + { + "epoch": 0.1554184549356223, + "grad_norm": 1.1788274049758911, + "learning_rate": 4.797720442801667e-06, + "loss": 2.325, + "step": 2897 + }, + { + "epoch": 0.15547210300429185, + "grad_norm": 1.1679904460906982, + "learning_rate": 4.797549231311473e-06, + "loss": 2.1724, + "step": 2898 + }, + { + "epoch": 0.15552575107296138, + "grad_norm": 1.1923301219940186, + "learning_rate": 4.797377950451915e-06, + "loss": 2.0791, + "step": 2899 + }, + { + "epoch": 0.1555793991416309, + "grad_norm": 1.2958101034164429, + "learning_rate": 4.7972066002281625e-06, + "loss": 2.1796, + "step": 2900 + }, + { + "epoch": 0.15563304721030044, + "grad_norm": 1.2919819355010986, + "learning_rate": 4.797035180645391e-06, + "loss": 2.3949, + "step": 2901 + }, + { + "epoch": 0.15568669527896994, + "grad_norm": 1.2203866243362427, + "learning_rate": 4.7968636917087754e-06, + "loss": 2.2881, + "step": 2902 + }, + { + "epoch": 0.15574034334763948, + "grad_norm": 0.9369720220565796, + "learning_rate": 4.796692133423493e-06, + "loss": 2.136, + "step": 2903 + }, + { + "epoch": 0.155793991416309, + "grad_norm": 1.5433340072631836, + "learning_rate": 4.7965205057947235e-06, + "loss": 2.2993, + "step": 2904 + }, + { + "epoch": 0.15584763948497854, + "grad_norm": 1.4709161520004272, + "learning_rate": 4.79634880882765e-06, + "loss": 2.2933, + "step": 2905 + }, + { + "epoch": 0.15590128755364807, + "grad_norm": 1.2268426418304443, + "learning_rate": 4.7961770425274545e-06, + "loss": 2.4902, + "step": 2906 + }, + { + "epoch": 0.1559549356223176, + "grad_norm": 1.2769966125488281, + "learning_rate": 4.796005206899325e-06, + "loss": 2.2954, + "step": 2907 + }, + { + "epoch": 0.15600858369098713, + "grad_norm": 1.116747498512268, + "learning_rate": 4.795833301948449e-06, + "loss": 2.0409, + "step": 2908 + }, + { + "epoch": 0.15606223175965664, + "grad_norm": 1.101408839225769, + "learning_rate": 4.795661327680017e-06, + "loss": 2.0644, + "step": 2909 + }, + { + "epoch": 0.15611587982832617, + "grad_norm": 1.1973319053649902, + "learning_rate": 4.795489284099221e-06, + "loss": 2.2783, + "step": 2910 + }, + { + "epoch": 0.1561695278969957, + "grad_norm": 1.2317484617233276, + "learning_rate": 4.795317171211255e-06, + "loss": 2.0144, + "step": 2911 + }, + { + "epoch": 0.15622317596566523, + "grad_norm": 1.0992565155029297, + "learning_rate": 4.795144989021318e-06, + "loss": 2.3465, + "step": 2912 + }, + { + "epoch": 0.15627682403433477, + "grad_norm": 1.1181533336639404, + "learning_rate": 4.794972737534605e-06, + "loss": 2.1508, + "step": 2913 + }, + { + "epoch": 0.1563304721030043, + "grad_norm": 2.796945571899414, + "learning_rate": 4.79480041675632e-06, + "loss": 2.2248, + "step": 2914 + }, + { + "epoch": 0.15638412017167383, + "grad_norm": 1.2749654054641724, + "learning_rate": 4.794628026691663e-06, + "loss": 2.4683, + "step": 2915 + }, + { + "epoch": 0.15643776824034336, + "grad_norm": 1.3407409191131592, + "learning_rate": 4.794455567345842e-06, + "loss": 2.287, + "step": 2916 + }, + { + "epoch": 0.15649141630901287, + "grad_norm": 1.2482330799102783, + "learning_rate": 4.794283038724061e-06, + "loss": 2.4043, + "step": 2917 + }, + { + "epoch": 0.1565450643776824, + "grad_norm": 1.1665592193603516, + "learning_rate": 4.794110440831532e-06, + "loss": 1.9848, + "step": 2918 + }, + { + "epoch": 0.15659871244635193, + "grad_norm": 1.169379711151123, + "learning_rate": 4.793937773673464e-06, + "loss": 2.1467, + "step": 2919 + }, + { + "epoch": 0.15665236051502146, + "grad_norm": 0.976087749004364, + "learning_rate": 4.79376503725507e-06, + "loss": 2.4747, + "step": 2920 + }, + { + "epoch": 0.156706008583691, + "grad_norm": 1.2011499404907227, + "learning_rate": 4.793592231581568e-06, + "loss": 2.1088, + "step": 2921 + }, + { + "epoch": 0.15675965665236052, + "grad_norm": 1.0975022315979004, + "learning_rate": 4.7934193566581734e-06, + "loss": 2.2717, + "step": 2922 + }, + { + "epoch": 0.15681330472103006, + "grad_norm": 1.1524206399917603, + "learning_rate": 4.793246412490106e-06, + "loss": 2.3371, + "step": 2923 + }, + { + "epoch": 0.15686695278969956, + "grad_norm": 1.3166700601577759, + "learning_rate": 4.793073399082589e-06, + "loss": 2.1306, + "step": 2924 + }, + { + "epoch": 0.1569206008583691, + "grad_norm": 1.2136192321777344, + "learning_rate": 4.792900316440843e-06, + "loss": 2.417, + "step": 2925 + }, + { + "epoch": 0.15697424892703862, + "grad_norm": 1.1122221946716309, + "learning_rate": 4.792727164570097e-06, + "loss": 2.2707, + "step": 2926 + }, + { + "epoch": 0.15702789699570815, + "grad_norm": 1.1029934883117676, + "learning_rate": 4.792553943475576e-06, + "loss": 2.0887, + "step": 2927 + }, + { + "epoch": 0.1570815450643777, + "grad_norm": 1.1776975393295288, + "learning_rate": 4.7923806531625135e-06, + "loss": 2.2628, + "step": 2928 + }, + { + "epoch": 0.15713519313304722, + "grad_norm": 1.1428911685943604, + "learning_rate": 4.79220729363614e-06, + "loss": 2.2121, + "step": 2929 + }, + { + "epoch": 0.15718884120171675, + "grad_norm": 1.2312090396881104, + "learning_rate": 4.792033864901688e-06, + "loss": 2.2553, + "step": 2930 + }, + { + "epoch": 0.15724248927038625, + "grad_norm": 1.3499360084533691, + "learning_rate": 4.7918603669643955e-06, + "loss": 2.3146, + "step": 2931 + }, + { + "epoch": 0.15729613733905579, + "grad_norm": 1.2122966051101685, + "learning_rate": 4.791686799829502e-06, + "loss": 2.2008, + "step": 2932 + }, + { + "epoch": 0.15734978540772532, + "grad_norm": 1.2988256216049194, + "learning_rate": 4.791513163502246e-06, + "loss": 2.2869, + "step": 2933 + }, + { + "epoch": 0.15740343347639485, + "grad_norm": 1.2200340032577515, + "learning_rate": 4.7913394579878704e-06, + "loss": 2.4279, + "step": 2934 + }, + { + "epoch": 0.15745708154506438, + "grad_norm": 1.1703461408615112, + "learning_rate": 4.79116568329162e-06, + "loss": 2.1508, + "step": 2935 + }, + { + "epoch": 0.1575107296137339, + "grad_norm": 1.604644775390625, + "learning_rate": 4.790991839418743e-06, + "loss": 2.3619, + "step": 2936 + }, + { + "epoch": 0.15756437768240344, + "grad_norm": 1.2847343683242798, + "learning_rate": 4.790817926374486e-06, + "loss": 2.3993, + "step": 2937 + }, + { + "epoch": 0.15761802575107295, + "grad_norm": 1.1218241453170776, + "learning_rate": 4.7906439441641004e-06, + "loss": 2.1724, + "step": 2938 + }, + { + "epoch": 0.15767167381974248, + "grad_norm": 1.0674391984939575, + "learning_rate": 4.79046989279284e-06, + "loss": 2.3832, + "step": 2939 + }, + { + "epoch": 0.157725321888412, + "grad_norm": 1.1682212352752686, + "learning_rate": 4.79029577226596e-06, + "loss": 2.4606, + "step": 2940 + }, + { + "epoch": 0.15777896995708154, + "grad_norm": 1.290135383605957, + "learning_rate": 4.790121582588717e-06, + "loss": 2.2585, + "step": 2941 + }, + { + "epoch": 0.15783261802575108, + "grad_norm": 3.6530585289001465, + "learning_rate": 4.7899473237663695e-06, + "loss": 2.2772, + "step": 2942 + }, + { + "epoch": 0.1578862660944206, + "grad_norm": 1.1133934259414673, + "learning_rate": 4.789772995804181e-06, + "loss": 2.0838, + "step": 2943 + }, + { + "epoch": 0.15793991416309014, + "grad_norm": 1.2653542757034302, + "learning_rate": 4.789598598707413e-06, + "loss": 2.3966, + "step": 2944 + }, + { + "epoch": 0.15799356223175964, + "grad_norm": 1.090238094329834, + "learning_rate": 4.789424132481332e-06, + "loss": 2.3327, + "step": 2945 + }, + { + "epoch": 0.15804721030042918, + "grad_norm": 1.1340293884277344, + "learning_rate": 4.7892495971312055e-06, + "loss": 2.4089, + "step": 2946 + }, + { + "epoch": 0.1581008583690987, + "grad_norm": 1.0783169269561768, + "learning_rate": 4.789074992662302e-06, + "loss": 2.3024, + "step": 2947 + }, + { + "epoch": 0.15815450643776824, + "grad_norm": 1.1634693145751953, + "learning_rate": 4.7889003190798955e-06, + "loss": 2.3763, + "step": 2948 + }, + { + "epoch": 0.15820815450643777, + "grad_norm": 1.2150582075119019, + "learning_rate": 4.788725576389259e-06, + "loss": 2.3629, + "step": 2949 + }, + { + "epoch": 0.1582618025751073, + "grad_norm": 1.2875847816467285, + "learning_rate": 4.788550764595667e-06, + "loss": 2.3886, + "step": 2950 + }, + { + "epoch": 0.15831545064377683, + "grad_norm": 1.313193678855896, + "learning_rate": 4.7883758837044e-06, + "loss": 2.4221, + "step": 2951 + }, + { + "epoch": 0.15836909871244637, + "grad_norm": 1.2829747200012207, + "learning_rate": 4.788200933720736e-06, + "loss": 2.2852, + "step": 2952 + }, + { + "epoch": 0.15842274678111587, + "grad_norm": 1.48868727684021, + "learning_rate": 4.788025914649958e-06, + "loss": 2.4975, + "step": 2953 + }, + { + "epoch": 0.1584763948497854, + "grad_norm": 1.1242103576660156, + "learning_rate": 4.787850826497351e-06, + "loss": 2.2012, + "step": 2954 + }, + { + "epoch": 0.15853004291845493, + "grad_norm": 1.2643870115280151, + "learning_rate": 4.7876756692682e-06, + "loss": 2.2487, + "step": 2955 + }, + { + "epoch": 0.15858369098712446, + "grad_norm": 1.1525098085403442, + "learning_rate": 4.787500442967795e-06, + "loss": 2.4075, + "step": 2956 + }, + { + "epoch": 0.158637339055794, + "grad_norm": 1.1928962469100952, + "learning_rate": 4.787325147601426e-06, + "loss": 2.4639, + "step": 2957 + }, + { + "epoch": 0.15869098712446353, + "grad_norm": 1.706312894821167, + "learning_rate": 4.787149783174385e-06, + "loss": 2.3303, + "step": 2958 + }, + { + "epoch": 0.15874463519313306, + "grad_norm": 1.1195869445800781, + "learning_rate": 4.7869743496919676e-06, + "loss": 2.4713, + "step": 2959 + }, + { + "epoch": 0.15879828326180256, + "grad_norm": 1.1594929695129395, + "learning_rate": 4.786798847159469e-06, + "loss": 2.3332, + "step": 2960 + }, + { + "epoch": 0.1588519313304721, + "grad_norm": 1.0888879299163818, + "learning_rate": 4.786623275582191e-06, + "loss": 2.1355, + "step": 2961 + }, + { + "epoch": 0.15890557939914163, + "grad_norm": 1.4115545749664307, + "learning_rate": 4.786447634965432e-06, + "loss": 2.2819, + "step": 2962 + }, + { + "epoch": 0.15895922746781116, + "grad_norm": 1.5182048082351685, + "learning_rate": 4.786271925314497e-06, + "loss": 2.4817, + "step": 2963 + }, + { + "epoch": 0.1590128755364807, + "grad_norm": 1.0676771402359009, + "learning_rate": 4.78609614663469e-06, + "loss": 2.2765, + "step": 2964 + }, + { + "epoch": 0.15906652360515022, + "grad_norm": 1.1651480197906494, + "learning_rate": 4.785920298931318e-06, + "loss": 2.2432, + "step": 2965 + }, + { + "epoch": 0.15912017167381975, + "grad_norm": 1.4434301853179932, + "learning_rate": 4.785744382209691e-06, + "loss": 2.3743, + "step": 2966 + }, + { + "epoch": 0.15917381974248926, + "grad_norm": 1.3972140550613403, + "learning_rate": 4.78556839647512e-06, + "loss": 2.0996, + "step": 2967 + }, + { + "epoch": 0.1592274678111588, + "grad_norm": 1.1602033376693726, + "learning_rate": 4.785392341732918e-06, + "loss": 2.231, + "step": 2968 + }, + { + "epoch": 0.15928111587982832, + "grad_norm": 1.2217235565185547, + "learning_rate": 4.785216217988402e-06, + "loss": 2.2636, + "step": 2969 + }, + { + "epoch": 0.15933476394849785, + "grad_norm": 1.2225987911224365, + "learning_rate": 4.7850400252468895e-06, + "loss": 1.7281, + "step": 2970 + }, + { + "epoch": 0.15938841201716739, + "grad_norm": 1.0378024578094482, + "learning_rate": 4.784863763513699e-06, + "loss": 1.9749, + "step": 2971 + }, + { + "epoch": 0.15944206008583692, + "grad_norm": 1.298031210899353, + "learning_rate": 4.784687432794153e-06, + "loss": 2.2678, + "step": 2972 + }, + { + "epoch": 0.15949570815450645, + "grad_norm": 1.3964534997940063, + "learning_rate": 4.784511033093575e-06, + "loss": 2.3751, + "step": 2973 + }, + { + "epoch": 0.15954935622317595, + "grad_norm": 1.145098090171814, + "learning_rate": 4.784334564417292e-06, + "loss": 2.1854, + "step": 2974 + }, + { + "epoch": 0.15960300429184548, + "grad_norm": 1.25832998752594, + "learning_rate": 4.784158026770631e-06, + "loss": 2.1541, + "step": 2975 + }, + { + "epoch": 0.15965665236051502, + "grad_norm": 1.383490800857544, + "learning_rate": 4.783981420158924e-06, + "loss": 2.7172, + "step": 2976 + }, + { + "epoch": 0.15971030042918455, + "grad_norm": 4.170907020568848, + "learning_rate": 4.7838047445875005e-06, + "loss": 2.4101, + "step": 2977 + }, + { + "epoch": 0.15976394849785408, + "grad_norm": 1.2681913375854492, + "learning_rate": 4.783628000061696e-06, + "loss": 2.2121, + "step": 2978 + }, + { + "epoch": 0.1598175965665236, + "grad_norm": 1.331875205039978, + "learning_rate": 4.783451186586848e-06, + "loss": 2.6903, + "step": 2979 + }, + { + "epoch": 0.15987124463519314, + "grad_norm": 1.641340732574463, + "learning_rate": 4.783274304168294e-06, + "loss": 2.5118, + "step": 2980 + }, + { + "epoch": 0.15992489270386265, + "grad_norm": 1.1730107069015503, + "learning_rate": 4.783097352811374e-06, + "loss": 2.3505, + "step": 2981 + }, + { + "epoch": 0.15997854077253218, + "grad_norm": 1.1710654497146606, + "learning_rate": 4.782920332521433e-06, + "loss": 2.3055, + "step": 2982 + }, + { + "epoch": 0.1600321888412017, + "grad_norm": 1.148520827293396, + "learning_rate": 4.7827432433038115e-06, + "loss": 2.0943, + "step": 2983 + }, + { + "epoch": 0.16008583690987124, + "grad_norm": 1.396323323249817, + "learning_rate": 4.7825660851638605e-06, + "loss": 2.297, + "step": 2984 + }, + { + "epoch": 0.16013948497854077, + "grad_norm": 1.2834547758102417, + "learning_rate": 4.782388858106926e-06, + "loss": 2.3529, + "step": 2985 + }, + { + "epoch": 0.1601931330472103, + "grad_norm": 1.0846941471099854, + "learning_rate": 4.782211562138363e-06, + "loss": 1.6139, + "step": 2986 + }, + { + "epoch": 0.16024678111587984, + "grad_norm": 1.0871044397354126, + "learning_rate": 4.782034197263519e-06, + "loss": 2.1872, + "step": 2987 + }, + { + "epoch": 0.16030042918454937, + "grad_norm": 1.2371833324432373, + "learning_rate": 4.781856763487754e-06, + "loss": 2.4931, + "step": 2988 + }, + { + "epoch": 0.16035407725321887, + "grad_norm": 1.4174323081970215, + "learning_rate": 4.781679260816421e-06, + "loss": 2.2155, + "step": 2989 + }, + { + "epoch": 0.1604077253218884, + "grad_norm": 1.049787163734436, + "learning_rate": 4.781501689254883e-06, + "loss": 2.2278, + "step": 2990 + }, + { + "epoch": 0.16046137339055794, + "grad_norm": 1.183361530303955, + "learning_rate": 4.781324048808499e-06, + "loss": 2.1097, + "step": 2991 + }, + { + "epoch": 0.16051502145922747, + "grad_norm": 1.1320747137069702, + "learning_rate": 4.781146339482634e-06, + "loss": 2.5177, + "step": 2992 + }, + { + "epoch": 0.160568669527897, + "grad_norm": 0.9171345829963684, + "learning_rate": 4.780968561282652e-06, + "loss": 2.0298, + "step": 2993 + }, + { + "epoch": 0.16062231759656653, + "grad_norm": 1.3874672651290894, + "learning_rate": 4.7807907142139224e-06, + "loss": 2.1983, + "step": 2994 + }, + { + "epoch": 0.16067596566523606, + "grad_norm": 1.2509644031524658, + "learning_rate": 4.780612798281813e-06, + "loss": 2.1635, + "step": 2995 + }, + { + "epoch": 0.16072961373390557, + "grad_norm": 1.2035802602767944, + "learning_rate": 4.780434813491697e-06, + "loss": 2.3582, + "step": 2996 + }, + { + "epoch": 0.1607832618025751, + "grad_norm": 1.1673996448516846, + "learning_rate": 4.7802567598489466e-06, + "loss": 2.2278, + "step": 2997 + }, + { + "epoch": 0.16083690987124463, + "grad_norm": 7.547418594360352, + "learning_rate": 4.78007863735894e-06, + "loss": 2.4681, + "step": 2998 + }, + { + "epoch": 0.16089055793991416, + "grad_norm": 1.2003828287124634, + "learning_rate": 4.779900446027054e-06, + "loss": 2.3656, + "step": 2999 + }, + { + "epoch": 0.1609442060085837, + "grad_norm": 1.1799448728561401, + "learning_rate": 4.779722185858669e-06, + "loss": 1.7391, + "step": 3000 + }, + { + "epoch": 0.16099785407725323, + "grad_norm": 1.1270782947540283, + "learning_rate": 4.779543856859167e-06, + "loss": 2.4289, + "step": 3001 + }, + { + "epoch": 0.16105150214592276, + "grad_norm": 1.0644357204437256, + "learning_rate": 4.779365459033931e-06, + "loss": 2.147, + "step": 3002 + }, + { + "epoch": 0.16110515021459226, + "grad_norm": 1.154708981513977, + "learning_rate": 4.7791869923883495e-06, + "loss": 2.3507, + "step": 3003 + }, + { + "epoch": 0.1611587982832618, + "grad_norm": 1.1787158250808716, + "learning_rate": 4.77900845692781e-06, + "loss": 2.3193, + "step": 3004 + }, + { + "epoch": 0.16121244635193133, + "grad_norm": 1.1646615266799927, + "learning_rate": 4.778829852657703e-06, + "loss": 2.3467, + "step": 3005 + }, + { + "epoch": 0.16126609442060086, + "grad_norm": 1.3145935535430908, + "learning_rate": 4.77865117958342e-06, + "loss": 2.3776, + "step": 3006 + }, + { + "epoch": 0.1613197424892704, + "grad_norm": 1.2228314876556396, + "learning_rate": 4.778472437710357e-06, + "loss": 2.2197, + "step": 3007 + }, + { + "epoch": 0.16137339055793992, + "grad_norm": 4.620388984680176, + "learning_rate": 4.7782936270439105e-06, + "loss": 1.9011, + "step": 3008 + }, + { + "epoch": 0.16142703862660945, + "grad_norm": 1.3254187107086182, + "learning_rate": 4.778114747589479e-06, + "loss": 2.2061, + "step": 3009 + }, + { + "epoch": 0.16148068669527896, + "grad_norm": 1.2507721185684204, + "learning_rate": 4.777935799352464e-06, + "loss": 1.342, + "step": 3010 + }, + { + "epoch": 0.1615343347639485, + "grad_norm": 1.1259554624557495, + "learning_rate": 4.777756782338267e-06, + "loss": 2.2219, + "step": 3011 + }, + { + "epoch": 0.16158798283261802, + "grad_norm": 1.2501468658447266, + "learning_rate": 4.7775776965522945e-06, + "loss": 2.4516, + "step": 3012 + }, + { + "epoch": 0.16164163090128755, + "grad_norm": 1.2197118997573853, + "learning_rate": 4.777398541999954e-06, + "loss": 2.2677, + "step": 3013 + }, + { + "epoch": 0.16169527896995708, + "grad_norm": 1.248966097831726, + "learning_rate": 4.777219318686652e-06, + "loss": 2.1611, + "step": 3014 + }, + { + "epoch": 0.16174892703862662, + "grad_norm": 1.2267180681228638, + "learning_rate": 4.777040026617802e-06, + "loss": 2.2973, + "step": 3015 + }, + { + "epoch": 0.16180257510729615, + "grad_norm": 1.3521398305892944, + "learning_rate": 4.776860665798816e-06, + "loss": 2.1558, + "step": 3016 + }, + { + "epoch": 0.16185622317596565, + "grad_norm": 13.133729934692383, + "learning_rate": 4.776681236235111e-06, + "loss": 2.4099, + "step": 3017 + }, + { + "epoch": 0.16190987124463518, + "grad_norm": 1.2674521207809448, + "learning_rate": 4.776501737932104e-06, + "loss": 2.2721, + "step": 3018 + }, + { + "epoch": 0.16196351931330472, + "grad_norm": 1.169963002204895, + "learning_rate": 4.776322170895213e-06, + "loss": 1.3111, + "step": 3019 + }, + { + "epoch": 0.16201716738197425, + "grad_norm": 1.1726515293121338, + "learning_rate": 4.776142535129862e-06, + "loss": 2.3495, + "step": 3020 + }, + { + "epoch": 0.16207081545064378, + "grad_norm": 1.153148889541626, + "learning_rate": 4.775962830641473e-06, + "loss": 2.1606, + "step": 3021 + }, + { + "epoch": 0.1621244635193133, + "grad_norm": 1.3693723678588867, + "learning_rate": 4.775783057435472e-06, + "loss": 2.3615, + "step": 3022 + }, + { + "epoch": 0.16217811158798284, + "grad_norm": 1.3577522039413452, + "learning_rate": 4.775603215517287e-06, + "loss": 2.4706, + "step": 3023 + }, + { + "epoch": 0.16223175965665235, + "grad_norm": 1.1257344484329224, + "learning_rate": 4.775423304892349e-06, + "loss": 2.2385, + "step": 3024 + }, + { + "epoch": 0.16228540772532188, + "grad_norm": 1.2729793787002563, + "learning_rate": 4.775243325566088e-06, + "loss": 2.253, + "step": 3025 + }, + { + "epoch": 0.1623390557939914, + "grad_norm": 1.1944845914840698, + "learning_rate": 4.77506327754394e-06, + "loss": 2.2492, + "step": 3026 + }, + { + "epoch": 0.16239270386266094, + "grad_norm": 1.116951584815979, + "learning_rate": 4.77488316083134e-06, + "loss": 2.0661, + "step": 3027 + }, + { + "epoch": 0.16244635193133047, + "grad_norm": 1.4719529151916504, + "learning_rate": 4.774702975433726e-06, + "loss": 2.4788, + "step": 3028 + }, + { + "epoch": 0.1625, + "grad_norm": 1.0310859680175781, + "learning_rate": 4.774522721356539e-06, + "loss": 2.2345, + "step": 3029 + }, + { + "epoch": 0.16255364806866954, + "grad_norm": 1.2536746263504028, + "learning_rate": 4.774342398605222e-06, + "loss": 2.2693, + "step": 3030 + }, + { + "epoch": 0.16260729613733907, + "grad_norm": NaN, + "learning_rate": 4.774342398605222e-06, + "loss": 2.2686, + "step": 3031 + }, + { + "epoch": 0.16266094420600857, + "grad_norm": 1.1777926683425903, + "learning_rate": 4.774162007185217e-06, + "loss": 2.3642, + "step": 3032 + }, + { + "epoch": 0.1627145922746781, + "grad_norm": 1.1644583940505981, + "learning_rate": 4.773981547101973e-06, + "loss": 2.1838, + "step": 3033 + }, + { + "epoch": 0.16276824034334764, + "grad_norm": 1.145784854888916, + "learning_rate": 4.7738010183609386e-06, + "loss": 2.1274, + "step": 3034 + }, + { + "epoch": 0.16282188841201717, + "grad_norm": 1.1470249891281128, + "learning_rate": 4.773620420967562e-06, + "loss": 2.1254, + "step": 3035 + }, + { + "epoch": 0.1628755364806867, + "grad_norm": 1.4423848390579224, + "learning_rate": 4.7734397549272985e-06, + "loss": 2.4984, + "step": 3036 + }, + { + "epoch": 0.16292918454935623, + "grad_norm": 1.1550347805023193, + "learning_rate": 4.773259020245601e-06, + "loss": 2.4852, + "step": 3037 + }, + { + "epoch": 0.16298283261802576, + "grad_norm": 1.0661749839782715, + "learning_rate": 4.773078216927929e-06, + "loss": 2.2356, + "step": 3038 + }, + { + "epoch": 0.16303648068669527, + "grad_norm": 1.095867395401001, + "learning_rate": 4.772897344979738e-06, + "loss": 2.2554, + "step": 3039 + }, + { + "epoch": 0.1630901287553648, + "grad_norm": 1.2106332778930664, + "learning_rate": 4.772716404406491e-06, + "loss": 2.4065, + "step": 3040 + }, + { + "epoch": 0.16314377682403433, + "grad_norm": 1.3573211431503296, + "learning_rate": 4.772535395213651e-06, + "loss": 2.4073, + "step": 3041 + }, + { + "epoch": 0.16319742489270386, + "grad_norm": 1.451893925666809, + "learning_rate": 4.772354317406683e-06, + "loss": 2.4717, + "step": 3042 + }, + { + "epoch": 0.1632510729613734, + "grad_norm": 3.816734552383423, + "learning_rate": 4.772173170991055e-06, + "loss": 2.1656, + "step": 3043 + }, + { + "epoch": 0.16330472103004293, + "grad_norm": 1.2051477432250977, + "learning_rate": 4.771991955972235e-06, + "loss": 2.3746, + "step": 3044 + }, + { + "epoch": 0.16335836909871246, + "grad_norm": 1.3530343770980835, + "learning_rate": 4.771810672355696e-06, + "loss": 2.2762, + "step": 3045 + }, + { + "epoch": 0.16341201716738196, + "grad_norm": 1.1844065189361572, + "learning_rate": 4.771629320146909e-06, + "loss": 2.2306, + "step": 3046 + }, + { + "epoch": 0.1634656652360515, + "grad_norm": 1.5023545026779175, + "learning_rate": 4.771447899351351e-06, + "loss": 2.6556, + "step": 3047 + }, + { + "epoch": 0.16351931330472103, + "grad_norm": 1.3368052244186401, + "learning_rate": 4.7712664099744995e-06, + "loss": 2.5056, + "step": 3048 + }, + { + "epoch": 0.16357296137339056, + "grad_norm": 1.0757620334625244, + "learning_rate": 4.771084852021835e-06, + "loss": 2.2066, + "step": 3049 + }, + { + "epoch": 0.1636266094420601, + "grad_norm": 1.2819583415985107, + "learning_rate": 4.7709032254988385e-06, + "loss": 2.2341, + "step": 3050 + }, + { + "epoch": 0.16368025751072962, + "grad_norm": 1.229966402053833, + "learning_rate": 4.770721530410993e-06, + "loss": 2.278, + "step": 3051 + }, + { + "epoch": 0.16373390557939915, + "grad_norm": 1.3077037334442139, + "learning_rate": 4.770539766763786e-06, + "loss": 2.4204, + "step": 3052 + }, + { + "epoch": 0.16378755364806866, + "grad_norm": 1.126503348350525, + "learning_rate": 4.770357934562704e-06, + "loss": 2.0961, + "step": 3053 + }, + { + "epoch": 0.1638412017167382, + "grad_norm": 1.6038146018981934, + "learning_rate": 4.7701760338132375e-06, + "loss": 2.1024, + "step": 3054 + }, + { + "epoch": 0.16389484978540772, + "grad_norm": 1.1988927125930786, + "learning_rate": 4.769994064520878e-06, + "loss": 2.3868, + "step": 3055 + }, + { + "epoch": 0.16394849785407725, + "grad_norm": 2.023129940032959, + "learning_rate": 4.769812026691122e-06, + "loss": 2.1872, + "step": 3056 + }, + { + "epoch": 0.16400214592274678, + "grad_norm": 1.0118769407272339, + "learning_rate": 4.769629920329463e-06, + "loss": 2.3441, + "step": 3057 + }, + { + "epoch": 0.16405579399141632, + "grad_norm": 1.0740152597427368, + "learning_rate": 4.7694477454414e-06, + "loss": 2.1031, + "step": 3058 + }, + { + "epoch": 0.16410944206008585, + "grad_norm": 1.088633418083191, + "learning_rate": 4.769265502032434e-06, + "loss": 2.2978, + "step": 3059 + }, + { + "epoch": 0.16416309012875535, + "grad_norm": 2.8807919025421143, + "learning_rate": 4.769083190108068e-06, + "loss": 2.2027, + "step": 3060 + }, + { + "epoch": 0.16421673819742488, + "grad_norm": 1.247018575668335, + "learning_rate": 4.7689008096738055e-06, + "loss": 2.4035, + "step": 3061 + }, + { + "epoch": 0.16427038626609441, + "grad_norm": 1.2167555093765259, + "learning_rate": 4.768718360735152e-06, + "loss": 2.3281, + "step": 3062 + }, + { + "epoch": 0.16432403433476395, + "grad_norm": 1.3849979639053345, + "learning_rate": 4.7685358432976184e-06, + "loss": 2.4508, + "step": 3063 + }, + { + "epoch": 0.16437768240343348, + "grad_norm": 1.4860491752624512, + "learning_rate": 4.768353257366714e-06, + "loss": 2.3818, + "step": 3064 + }, + { + "epoch": 0.164431330472103, + "grad_norm": 1.1544153690338135, + "learning_rate": 4.768170602947952e-06, + "loss": 2.405, + "step": 3065 + }, + { + "epoch": 0.16448497854077254, + "grad_norm": 1.0194731950759888, + "learning_rate": 4.767987880046847e-06, + "loss": 2.2358, + "step": 3066 + }, + { + "epoch": 0.16453862660944207, + "grad_norm": 1.1071710586547852, + "learning_rate": 4.7678050886689165e-06, + "loss": 2.2741, + "step": 3067 + }, + { + "epoch": 0.16459227467811158, + "grad_norm": 1.2307766675949097, + "learning_rate": 4.7676222288196785e-06, + "loss": 2.0242, + "step": 3068 + }, + { + "epoch": 0.1646459227467811, + "grad_norm": 1.3019862174987793, + "learning_rate": 4.767439300504655e-06, + "loss": 2.1365, + "step": 3069 + }, + { + "epoch": 0.16469957081545064, + "grad_norm": 1.2495993375778198, + "learning_rate": 4.767256303729369e-06, + "loss": 2.3628, + "step": 3070 + }, + { + "epoch": 0.16475321888412017, + "grad_norm": 1.18794846534729, + "learning_rate": 4.767073238499345e-06, + "loss": 2.3674, + "step": 3071 + }, + { + "epoch": 0.1648068669527897, + "grad_norm": 1.3019988536834717, + "learning_rate": 4.766890104820111e-06, + "loss": 2.2879, + "step": 3072 + }, + { + "epoch": 0.16486051502145924, + "grad_norm": 1.1134647130966187, + "learning_rate": 4.766706902697195e-06, + "loss": 2.1944, + "step": 3073 + }, + { + "epoch": 0.16491416309012877, + "grad_norm": 1.3174816370010376, + "learning_rate": 4.76652363213613e-06, + "loss": 2.5073, + "step": 3074 + }, + { + "epoch": 0.16496781115879827, + "grad_norm": 1.7657816410064697, + "learning_rate": 4.766340293142449e-06, + "loss": 1.5533, + "step": 3075 + }, + { + "epoch": 0.1650214592274678, + "grad_norm": 1.1561741828918457, + "learning_rate": 4.766156885721687e-06, + "loss": 2.1049, + "step": 3076 + }, + { + "epoch": 0.16507510729613734, + "grad_norm": 1.2976888418197632, + "learning_rate": 4.765973409879382e-06, + "loss": 2.2399, + "step": 3077 + }, + { + "epoch": 0.16512875536480687, + "grad_norm": 1.0463858842849731, + "learning_rate": 4.7657898656210735e-06, + "loss": 2.1704, + "step": 3078 + }, + { + "epoch": 0.1651824034334764, + "grad_norm": 1.2705919742584229, + "learning_rate": 4.765606252952303e-06, + "loss": 2.3077, + "step": 3079 + }, + { + "epoch": 0.16523605150214593, + "grad_norm": 1.166109323501587, + "learning_rate": 4.765422571878615e-06, + "loss": 2.2453, + "step": 3080 + }, + { + "epoch": 0.16528969957081546, + "grad_norm": 1.4696775674819946, + "learning_rate": 4.7652388224055544e-06, + "loss": 2.1702, + "step": 3081 + }, + { + "epoch": 0.16534334763948497, + "grad_norm": 1.4241284132003784, + "learning_rate": 4.76505500453867e-06, + "loss": 2.1835, + "step": 3082 + }, + { + "epoch": 0.1653969957081545, + "grad_norm": 1.5942964553833008, + "learning_rate": 4.764871118283511e-06, + "loss": 2.3967, + "step": 3083 + }, + { + "epoch": 0.16545064377682403, + "grad_norm": 1.3306474685668945, + "learning_rate": 4.764687163645629e-06, + "loss": 2.1367, + "step": 3084 + }, + { + "epoch": 0.16550429184549356, + "grad_norm": 1.1766239404678345, + "learning_rate": 4.7645031406305794e-06, + "loss": 2.4697, + "step": 3085 + }, + { + "epoch": 0.1655579399141631, + "grad_norm": 1.2496379613876343, + "learning_rate": 4.764319049243919e-06, + "loss": 2.0348, + "step": 3086 + }, + { + "epoch": 0.16561158798283263, + "grad_norm": 1.3229255676269531, + "learning_rate": 4.764134889491203e-06, + "loss": 2.2484, + "step": 3087 + }, + { + "epoch": 0.16566523605150216, + "grad_norm": 1.3015755414962769, + "learning_rate": 4.763950661377994e-06, + "loss": 2.331, + "step": 3088 + }, + { + "epoch": 0.16571888412017166, + "grad_norm": 1.1605817079544067, + "learning_rate": 4.763766364909854e-06, + "loss": 2.3112, + "step": 3089 + }, + { + "epoch": 0.1657725321888412, + "grad_norm": 1.31203293800354, + "learning_rate": 4.763582000092348e-06, + "loss": 2.2961, + "step": 3090 + }, + { + "epoch": 0.16582618025751072, + "grad_norm": 1.231998324394226, + "learning_rate": 4.7633975669310415e-06, + "loss": 2.1397, + "step": 3091 + }, + { + "epoch": 0.16587982832618026, + "grad_norm": 1.1606056690216064, + "learning_rate": 4.763213065431502e-06, + "loss": 2.3584, + "step": 3092 + }, + { + "epoch": 0.1659334763948498, + "grad_norm": 1.199363350868225, + "learning_rate": 4.7630284955993034e-06, + "loss": 2.0288, + "step": 3093 + }, + { + "epoch": 0.16598712446351932, + "grad_norm": 8.525489807128906, + "learning_rate": 4.7628438574400155e-06, + "loss": 2.5801, + "step": 3094 + }, + { + "epoch": 0.16604077253218885, + "grad_norm": 1.3252573013305664, + "learning_rate": 4.7626591509592136e-06, + "loss": 2.5557, + "step": 3095 + }, + { + "epoch": 0.16609442060085836, + "grad_norm": 1.1889395713806152, + "learning_rate": 4.762474376162476e-06, + "loss": 1.8008, + "step": 3096 + }, + { + "epoch": 0.1661480686695279, + "grad_norm": 1.0889924764633179, + "learning_rate": 4.762289533055379e-06, + "loss": 2.3091, + "step": 3097 + }, + { + "epoch": 0.16620171673819742, + "grad_norm": 1.2526229619979858, + "learning_rate": 4.7621046216435064e-06, + "loss": 2.3572, + "step": 3098 + }, + { + "epoch": 0.16625536480686695, + "grad_norm": 1.374398112297058, + "learning_rate": 4.761919641932439e-06, + "loss": 2.5932, + "step": 3099 + }, + { + "epoch": 0.16630901287553648, + "grad_norm": 1.0881431102752686, + "learning_rate": 4.761734593927762e-06, + "loss": 2.4145, + "step": 3100 + }, + { + "epoch": 0.16636266094420601, + "grad_norm": 3.0205113887786865, + "learning_rate": 4.761549477635064e-06, + "loss": 2.3989, + "step": 3101 + }, + { + "epoch": 0.16641630901287555, + "grad_norm": 1.146669626235962, + "learning_rate": 4.7613642930599325e-06, + "loss": 2.3158, + "step": 3102 + }, + { + "epoch": 0.16646995708154508, + "grad_norm": 1.3177757263183594, + "learning_rate": 4.76117904020796e-06, + "loss": 2.4463, + "step": 3103 + }, + { + "epoch": 0.16652360515021458, + "grad_norm": 1.9498062133789062, + "learning_rate": 4.7609937190847385e-06, + "loss": 2.3583, + "step": 3104 + }, + { + "epoch": 0.1665772532188841, + "grad_norm": 1.0625829696655273, + "learning_rate": 4.760808329695865e-06, + "loss": 2.1716, + "step": 3105 + }, + { + "epoch": 0.16663090128755365, + "grad_norm": 1.2703917026519775, + "learning_rate": 4.760622872046936e-06, + "loss": 2.2931, + "step": 3106 + }, + { + "epoch": 0.16668454935622318, + "grad_norm": 1.15384840965271, + "learning_rate": 4.760437346143551e-06, + "loss": 2.2393, + "step": 3107 + }, + { + "epoch": 0.1667381974248927, + "grad_norm": 1.2875674962997437, + "learning_rate": 4.7602517519913114e-06, + "loss": 2.1301, + "step": 3108 + }, + { + "epoch": 0.16679184549356224, + "grad_norm": 1.3640247583389282, + "learning_rate": 4.760066089595821e-06, + "loss": 2.3167, + "step": 3109 + }, + { + "epoch": 0.16684549356223177, + "grad_norm": 1.2401844263076782, + "learning_rate": 4.759880358962686e-06, + "loss": 2.2239, + "step": 3110 + }, + { + "epoch": 0.16689914163090128, + "grad_norm": 1.1503642797470093, + "learning_rate": 4.759694560097513e-06, + "loss": 2.221, + "step": 3111 + }, + { + "epoch": 0.1669527896995708, + "grad_norm": 1.1801371574401855, + "learning_rate": 4.759508693005912e-06, + "loss": 2.2508, + "step": 3112 + }, + { + "epoch": 0.16700643776824034, + "grad_norm": 1.1720728874206543, + "learning_rate": 4.759322757693496e-06, + "loss": 2.3798, + "step": 3113 + }, + { + "epoch": 0.16706008583690987, + "grad_norm": 1.2267084121704102, + "learning_rate": 4.759136754165878e-06, + "loss": 2.1778, + "step": 3114 + }, + { + "epoch": 0.1671137339055794, + "grad_norm": 1.259080171585083, + "learning_rate": 4.758950682428673e-06, + "loss": 2.3467, + "step": 3115 + }, + { + "epoch": 0.16716738197424894, + "grad_norm": 1.1435894966125488, + "learning_rate": 4.758764542487502e-06, + "loss": 2.3278, + "step": 3116 + }, + { + "epoch": 0.16722103004291847, + "grad_norm": 1.2031466960906982, + "learning_rate": 4.7585783343479815e-06, + "loss": 2.2606, + "step": 3117 + }, + { + "epoch": 0.16727467811158797, + "grad_norm": 1.1896131038665771, + "learning_rate": 4.758392058015736e-06, + "loss": 2.2353, + "step": 3118 + }, + { + "epoch": 0.1673283261802575, + "grad_norm": 1.0234004259109497, + "learning_rate": 4.758205713496389e-06, + "loss": 2.1916, + "step": 3119 + }, + { + "epoch": 0.16738197424892703, + "grad_norm": 1.194617509841919, + "learning_rate": 4.758019300795566e-06, + "loss": 1.8711, + "step": 3120 + }, + { + "epoch": 0.16743562231759657, + "grad_norm": 3.112551689147949, + "learning_rate": 4.757832819918897e-06, + "loss": 2.1692, + "step": 3121 + }, + { + "epoch": 0.1674892703862661, + "grad_norm": 1.2056938409805298, + "learning_rate": 4.757646270872011e-06, + "loss": 2.3087, + "step": 3122 + }, + { + "epoch": 0.16754291845493563, + "grad_norm": 1.231048345565796, + "learning_rate": 4.757459653660541e-06, + "loss": 2.313, + "step": 3123 + }, + { + "epoch": 0.16759656652360516, + "grad_norm": 1.3102327585220337, + "learning_rate": 4.757272968290121e-06, + "loss": 2.3616, + "step": 3124 + }, + { + "epoch": 0.16765021459227467, + "grad_norm": 1.1878371238708496, + "learning_rate": 4.7570862147663885e-06, + "loss": 2.125, + "step": 3125 + }, + { + "epoch": 0.1677038626609442, + "grad_norm": 1.353148341178894, + "learning_rate": 4.75689939309498e-06, + "loss": 2.1381, + "step": 3126 + }, + { + "epoch": 0.16775751072961373, + "grad_norm": 1.318827748298645, + "learning_rate": 4.75671250328154e-06, + "loss": 2.2238, + "step": 3127 + }, + { + "epoch": 0.16781115879828326, + "grad_norm": 1.5133788585662842, + "learning_rate": 4.756525545331707e-06, + "loss": 2.1805, + "step": 3128 + }, + { + "epoch": 0.1678648068669528, + "grad_norm": 1.187117099761963, + "learning_rate": 4.756338519251129e-06, + "loss": 1.9503, + "step": 3129 + }, + { + "epoch": 0.16791845493562232, + "grad_norm": 1.1511939764022827, + "learning_rate": 4.756151425045451e-06, + "loss": 2.2253, + "step": 3130 + }, + { + "epoch": 0.16797210300429186, + "grad_norm": 1.2244433164596558, + "learning_rate": 4.755964262720322e-06, + "loss": 1.6768, + "step": 3131 + }, + { + "epoch": 0.16802575107296136, + "grad_norm": 1.6896213293075562, + "learning_rate": 4.755777032281394e-06, + "loss": 2.4076, + "step": 3132 + }, + { + "epoch": 0.1680793991416309, + "grad_norm": 1.1307308673858643, + "learning_rate": 4.755589733734319e-06, + "loss": 2.1278, + "step": 3133 + }, + { + "epoch": 0.16813304721030042, + "grad_norm": 1.2302844524383545, + "learning_rate": 4.7554023670847525e-06, + "loss": 2.1178, + "step": 3134 + }, + { + "epoch": 0.16818669527896996, + "grad_norm": 1.1671125888824463, + "learning_rate": 4.755214932338351e-06, + "loss": 2.1651, + "step": 3135 + }, + { + "epoch": 0.1682403433476395, + "grad_norm": 1.1945048570632935, + "learning_rate": 4.755027429500775e-06, + "loss": 2.1785, + "step": 3136 + }, + { + "epoch": 0.16829399141630902, + "grad_norm": 7.30875301361084, + "learning_rate": 4.754839858577684e-06, + "loss": 2.301, + "step": 3137 + }, + { + "epoch": 0.16834763948497855, + "grad_norm": 0.9955490827560425, + "learning_rate": 4.754652219574743e-06, + "loss": 2.2015, + "step": 3138 + }, + { + "epoch": 0.16840128755364808, + "grad_norm": 1.3531932830810547, + "learning_rate": 4.754464512497617e-06, + "loss": 2.3451, + "step": 3139 + }, + { + "epoch": 0.1684549356223176, + "grad_norm": 1.1007150411605835, + "learning_rate": 4.754276737351972e-06, + "loss": 2.3068, + "step": 3140 + }, + { + "epoch": 0.16850858369098712, + "grad_norm": 1.1884078979492188, + "learning_rate": 4.754088894143479e-06, + "loss": 2.2487, + "step": 3141 + }, + { + "epoch": 0.16856223175965665, + "grad_norm": 1.2803208827972412, + "learning_rate": 4.753900982877808e-06, + "loss": 2.6239, + "step": 3142 + }, + { + "epoch": 0.16861587982832618, + "grad_norm": 0.9828450679779053, + "learning_rate": 4.753713003560634e-06, + "loss": 2.2564, + "step": 3143 + }, + { + "epoch": 0.1686695278969957, + "grad_norm": 1.1587157249450684, + "learning_rate": 4.753524956197632e-06, + "loss": 2.1354, + "step": 3144 + }, + { + "epoch": 0.16872317596566525, + "grad_norm": 1.295487403869629, + "learning_rate": 4.75333684079448e-06, + "loss": 2.3235, + "step": 3145 + }, + { + "epoch": 0.16877682403433478, + "grad_norm": 1.0606836080551147, + "learning_rate": 4.753148657356858e-06, + "loss": 2.2281, + "step": 3146 + }, + { + "epoch": 0.16883047210300428, + "grad_norm": 1.3008947372436523, + "learning_rate": 4.752960405890446e-06, + "loss": 2.3033, + "step": 3147 + }, + { + "epoch": 0.1688841201716738, + "grad_norm": 1.1323285102844238, + "learning_rate": 4.75277208640093e-06, + "loss": 2.0796, + "step": 3148 + }, + { + "epoch": 0.16893776824034334, + "grad_norm": 1.187278151512146, + "learning_rate": 4.752583698893994e-06, + "loss": 2.1824, + "step": 3149 + }, + { + "epoch": 0.16899141630901288, + "grad_norm": 1.2143573760986328, + "learning_rate": 4.752395243375328e-06, + "loss": 2.4814, + "step": 3150 + }, + { + "epoch": 0.1690450643776824, + "grad_norm": 1.1394156217575073, + "learning_rate": 4.7522067198506205e-06, + "loss": 2.494, + "step": 3151 + }, + { + "epoch": 0.16909871244635194, + "grad_norm": 1.220416784286499, + "learning_rate": 4.752018128325564e-06, + "loss": 2.2103, + "step": 3152 + }, + { + "epoch": 0.16915236051502147, + "grad_norm": 1.153308629989624, + "learning_rate": 4.7518294688058525e-06, + "loss": 2.2301, + "step": 3153 + }, + { + "epoch": 0.16920600858369098, + "grad_norm": 1.179295301437378, + "learning_rate": 4.751640741297182e-06, + "loss": 2.4303, + "step": 3154 + }, + { + "epoch": 0.1692596566523605, + "grad_norm": 1.6544982194900513, + "learning_rate": 4.751451945805251e-06, + "loss": 2.3373, + "step": 3155 + }, + { + "epoch": 0.16931330472103004, + "grad_norm": 1.1035844087600708, + "learning_rate": 4.75126308233576e-06, + "loss": 2.5454, + "step": 3156 + }, + { + "epoch": 0.16936695278969957, + "grad_norm": 1.2545452117919922, + "learning_rate": 4.7510741508944115e-06, + "loss": 2.3327, + "step": 3157 + }, + { + "epoch": 0.1694206008583691, + "grad_norm": 1.1166460514068604, + "learning_rate": 4.750885151486908e-06, + "loss": 2.5852, + "step": 3158 + }, + { + "epoch": 0.16947424892703863, + "grad_norm": 1.177878499031067, + "learning_rate": 4.750696084118957e-06, + "loss": 2.1074, + "step": 3159 + }, + { + "epoch": 0.16952789699570817, + "grad_norm": 1.3506262302398682, + "learning_rate": 4.750506948796269e-06, + "loss": 2.1321, + "step": 3160 + }, + { + "epoch": 0.16958154506437767, + "grad_norm": 1.4831058979034424, + "learning_rate": 4.750317745524552e-06, + "loss": 1.9844, + "step": 3161 + }, + { + "epoch": 0.1696351931330472, + "grad_norm": 1.1205027103424072, + "learning_rate": 4.750128474309519e-06, + "loss": 2.2063, + "step": 3162 + }, + { + "epoch": 0.16968884120171673, + "grad_norm": 1.0629470348358154, + "learning_rate": 4.749939135156885e-06, + "loss": 2.115, + "step": 3163 + }, + { + "epoch": 0.16974248927038627, + "grad_norm": 1.3046075105667114, + "learning_rate": 4.749749728072367e-06, + "loss": 2.2085, + "step": 3164 + }, + { + "epoch": 0.1697961373390558, + "grad_norm": 1.3103086948394775, + "learning_rate": 4.749560253061683e-06, + "loss": 2.2326, + "step": 3165 + }, + { + "epoch": 0.16984978540772533, + "grad_norm": 1.8780930042266846, + "learning_rate": 4.7493707101305545e-06, + "loss": 2.2217, + "step": 3166 + }, + { + "epoch": 0.16990343347639486, + "grad_norm": 1.2620376348495483, + "learning_rate": 4.749181099284703e-06, + "loss": 2.4682, + "step": 3167 + }, + { + "epoch": 0.16995708154506436, + "grad_norm": 1.7832976579666138, + "learning_rate": 4.748991420529855e-06, + "loss": 2.4631, + "step": 3168 + }, + { + "epoch": 0.1700107296137339, + "grad_norm": 1.217355728149414, + "learning_rate": 4.7488016738717364e-06, + "loss": 2.1716, + "step": 3169 + }, + { + "epoch": 0.17006437768240343, + "grad_norm": 1.3644405603408813, + "learning_rate": 4.748611859316077e-06, + "loss": 2.3477, + "step": 3170 + }, + { + "epoch": 0.17011802575107296, + "grad_norm": 1.2777938842773438, + "learning_rate": 4.748421976868607e-06, + "loss": 2.3159, + "step": 3171 + }, + { + "epoch": 0.1701716738197425, + "grad_norm": 1.2640750408172607, + "learning_rate": 4.74823202653506e-06, + "loss": 2.6411, + "step": 3172 + }, + { + "epoch": 0.17022532188841202, + "grad_norm": 1.2655059099197388, + "learning_rate": 4.748042008321171e-06, + "loss": 2.6659, + "step": 3173 + }, + { + "epoch": 0.17027896995708156, + "grad_norm": 1.2147634029388428, + "learning_rate": 4.747851922232677e-06, + "loss": 2.3692, + "step": 3174 + }, + { + "epoch": 0.17033261802575106, + "grad_norm": 1.2011181116104126, + "learning_rate": 4.747661768275317e-06, + "loss": 1.9574, + "step": 3175 + }, + { + "epoch": 0.1703862660944206, + "grad_norm": 1.1849104166030884, + "learning_rate": 4.747471546454833e-06, + "loss": 2.4824, + "step": 3176 + }, + { + "epoch": 0.17043991416309012, + "grad_norm": 1.3223013877868652, + "learning_rate": 4.747281256776968e-06, + "loss": 2.1885, + "step": 3177 + }, + { + "epoch": 0.17049356223175965, + "grad_norm": 1.719274878501892, + "learning_rate": 4.7470908992474666e-06, + "loss": 1.5951, + "step": 3178 + }, + { + "epoch": 0.1705472103004292, + "grad_norm": 1.1803390979766846, + "learning_rate": 4.746900473872077e-06, + "loss": 2.1442, + "step": 3179 + }, + { + "epoch": 0.17060085836909872, + "grad_norm": 1.1858339309692383, + "learning_rate": 4.74670998065655e-06, + "loss": 2.2745, + "step": 3180 + }, + { + "epoch": 0.17065450643776825, + "grad_norm": 2.791250467300415, + "learning_rate": 4.746519419606634e-06, + "loss": 2.2112, + "step": 3181 + }, + { + "epoch": 0.17070815450643778, + "grad_norm": 1.2418444156646729, + "learning_rate": 4.746328790728085e-06, + "loss": 2.2181, + "step": 3182 + }, + { + "epoch": 0.17076180257510729, + "grad_norm": 1.1856694221496582, + "learning_rate": 4.7461380940266575e-06, + "loss": 2.0887, + "step": 3183 + }, + { + "epoch": 0.17081545064377682, + "grad_norm": 1.26639723777771, + "learning_rate": 4.74594732950811e-06, + "loss": 2.451, + "step": 3184 + }, + { + "epoch": 0.17086909871244635, + "grad_norm": 2.0401604175567627, + "learning_rate": 4.745756497178203e-06, + "loss": 2.1046, + "step": 3185 + }, + { + "epoch": 0.17092274678111588, + "grad_norm": 0.9376734495162964, + "learning_rate": 4.745565597042695e-06, + "loss": 1.8043, + "step": 3186 + }, + { + "epoch": 0.1709763948497854, + "grad_norm": 1.1713671684265137, + "learning_rate": 4.745374629107352e-06, + "loss": 2.4176, + "step": 3187 + }, + { + "epoch": 0.17103004291845494, + "grad_norm": 1.4067981243133545, + "learning_rate": 4.74518359337794e-06, + "loss": 2.3567, + "step": 3188 + }, + { + "epoch": 0.17108369098712448, + "grad_norm": 1.4748116731643677, + "learning_rate": 4.744992489860228e-06, + "loss": 1.6612, + "step": 3189 + }, + { + "epoch": 0.17113733905579398, + "grad_norm": 1.3666428327560425, + "learning_rate": 4.744801318559983e-06, + "loss": 2.3399, + "step": 3190 + }, + { + "epoch": 0.1711909871244635, + "grad_norm": 1.7921096086502075, + "learning_rate": 4.744610079482978e-06, + "loss": 2.1985, + "step": 3191 + }, + { + "epoch": 0.17124463519313304, + "grad_norm": 1.3345067501068115, + "learning_rate": 4.744418772634989e-06, + "loss": 2.0749, + "step": 3192 + }, + { + "epoch": 0.17129828326180258, + "grad_norm": 1.2852028608322144, + "learning_rate": 4.74422739802179e-06, + "loss": 2.3781, + "step": 3193 + }, + { + "epoch": 0.1713519313304721, + "grad_norm": 3.393296003341675, + "learning_rate": 4.744035955649159e-06, + "loss": 2.1957, + "step": 3194 + }, + { + "epoch": 0.17140557939914164, + "grad_norm": 1.4399561882019043, + "learning_rate": 4.743844445522878e-06, + "loss": 2.4077, + "step": 3195 + }, + { + "epoch": 0.17145922746781117, + "grad_norm": 1.1978611946105957, + "learning_rate": 4.743652867648727e-06, + "loss": 2.2498, + "step": 3196 + }, + { + "epoch": 0.17151287553648067, + "grad_norm": 1.1651824712753296, + "learning_rate": 4.743461222032493e-06, + "loss": 2.4215, + "step": 3197 + }, + { + "epoch": 0.1715665236051502, + "grad_norm": 1.1520835161209106, + "learning_rate": 4.74326950867996e-06, + "loss": 2.3138, + "step": 3198 + }, + { + "epoch": 0.17162017167381974, + "grad_norm": 1.280731201171875, + "learning_rate": 4.7430777275969166e-06, + "loss": 2.2012, + "step": 3199 + }, + { + "epoch": 0.17167381974248927, + "grad_norm": 1.108737826347351, + "learning_rate": 4.742885878789154e-06, + "loss": 2.2787, + "step": 3200 + }, + { + "epoch": 0.1717274678111588, + "grad_norm": 1.325556993484497, + "learning_rate": 4.7426939622624644e-06, + "loss": 2.5599, + "step": 3201 + }, + { + "epoch": 0.17178111587982833, + "grad_norm": 1.585776925086975, + "learning_rate": 4.742501978022641e-06, + "loss": 2.1294, + "step": 3202 + }, + { + "epoch": 0.17183476394849787, + "grad_norm": 2.8806309700012207, + "learning_rate": 4.7423099260754835e-06, + "loss": 2.4791, + "step": 3203 + }, + { + "epoch": 0.17188841201716737, + "grad_norm": 1.2806979417800903, + "learning_rate": 4.742117806426787e-06, + "loss": 2.3911, + "step": 3204 + }, + { + "epoch": 0.1719420600858369, + "grad_norm": 2.4306485652923584, + "learning_rate": 4.741925619082355e-06, + "loss": 2.1856, + "step": 3205 + }, + { + "epoch": 0.17199570815450643, + "grad_norm": 1.1184712648391724, + "learning_rate": 4.741733364047988e-06, + "loss": 2.1767, + "step": 3206 + }, + { + "epoch": 0.17204935622317596, + "grad_norm": 1.054630160331726, + "learning_rate": 4.741541041329492e-06, + "loss": 2.4273, + "step": 3207 + }, + { + "epoch": 0.1721030042918455, + "grad_norm": 1.0737639665603638, + "learning_rate": 4.741348650932673e-06, + "loss": 2.3062, + "step": 3208 + }, + { + "epoch": 0.17215665236051503, + "grad_norm": 1.1330751180648804, + "learning_rate": 4.74115619286334e-06, + "loss": 2.2726, + "step": 3209 + }, + { + "epoch": 0.17221030042918456, + "grad_norm": 1.470331072807312, + "learning_rate": 4.7409636671273044e-06, + "loss": 2.2496, + "step": 3210 + }, + { + "epoch": 0.17226394849785406, + "grad_norm": 1.2213854789733887, + "learning_rate": 4.740771073730378e-06, + "loss": 2.3438, + "step": 3211 + }, + { + "epoch": 0.1723175965665236, + "grad_norm": 1.3786942958831787, + "learning_rate": 4.740578412678376e-06, + "loss": 2.3053, + "step": 3212 + }, + { + "epoch": 0.17237124463519313, + "grad_norm": 1.107563853263855, + "learning_rate": 4.740385683977116e-06, + "loss": 2.3227, + "step": 3213 + }, + { + "epoch": 0.17242489270386266, + "grad_norm": 1.3693963289260864, + "learning_rate": 4.740192887632417e-06, + "loss": 2.2196, + "step": 3214 + }, + { + "epoch": 0.1724785407725322, + "grad_norm": 1.2904671430587769, + "learning_rate": 4.7400000236501e-06, + "loss": 2.1594, + "step": 3215 + }, + { + "epoch": 0.17253218884120172, + "grad_norm": 1.6878256797790527, + "learning_rate": 4.739807092035988e-06, + "loss": 1.3786, + "step": 3216 + }, + { + "epoch": 0.17258583690987125, + "grad_norm": 1.2700577974319458, + "learning_rate": 4.739614092795905e-06, + "loss": 2.3412, + "step": 3217 + }, + { + "epoch": 0.17263948497854079, + "grad_norm": 1.1570180654525757, + "learning_rate": 4.73942102593568e-06, + "loss": 2.3784, + "step": 3218 + }, + { + "epoch": 0.1726931330472103, + "grad_norm": 1.2291889190673828, + "learning_rate": 4.739227891461141e-06, + "loss": 2.1832, + "step": 3219 + }, + { + "epoch": 0.17274678111587982, + "grad_norm": 1.2705426216125488, + "learning_rate": 4.7390346893781195e-06, + "loss": 2.2552, + "step": 3220 + }, + { + "epoch": 0.17280042918454935, + "grad_norm": 1.1856234073638916, + "learning_rate": 4.738841419692449e-06, + "loss": 2.3594, + "step": 3221 + }, + { + "epoch": 0.17285407725321889, + "grad_norm": 1.0419477224349976, + "learning_rate": 4.738648082409966e-06, + "loss": 2.2121, + "step": 3222 + }, + { + "epoch": 0.17290772532188842, + "grad_norm": 1.0261335372924805, + "learning_rate": 4.738454677536506e-06, + "loss": 1.6921, + "step": 3223 + }, + { + "epoch": 0.17296137339055795, + "grad_norm": 1.2106951475143433, + "learning_rate": 4.738261205077909e-06, + "loss": 2.2426, + "step": 3224 + }, + { + "epoch": 0.17301502145922748, + "grad_norm": 1.5593748092651367, + "learning_rate": 4.738067665040016e-06, + "loss": 2.0907, + "step": 3225 + }, + { + "epoch": 0.17306866952789698, + "grad_norm": 1.157471776008606, + "learning_rate": 4.737874057428672e-06, + "loss": 2.2245, + "step": 3226 + }, + { + "epoch": 0.17312231759656652, + "grad_norm": 1.1568232774734497, + "learning_rate": 4.737680382249721e-06, + "loss": 2.2577, + "step": 3227 + }, + { + "epoch": 0.17317596566523605, + "grad_norm": 1.202784538269043, + "learning_rate": 4.737486639509012e-06, + "loss": 2.1883, + "step": 3228 + }, + { + "epoch": 0.17322961373390558, + "grad_norm": 1.5098192691802979, + "learning_rate": 4.737292829212393e-06, + "loss": 1.538, + "step": 3229 + }, + { + "epoch": 0.1732832618025751, + "grad_norm": 1.1531174182891846, + "learning_rate": 4.7370989513657175e-06, + "loss": 2.3101, + "step": 3230 + }, + { + "epoch": 0.17333690987124464, + "grad_norm": 1.3116086721420288, + "learning_rate": 4.736905005974838e-06, + "loss": 2.3045, + "step": 3231 + }, + { + "epoch": 0.17339055793991417, + "grad_norm": 1.1999833583831787, + "learning_rate": 4.7367109930456105e-06, + "loss": 2.3477, + "step": 3232 + }, + { + "epoch": 0.17344420600858368, + "grad_norm": 1.261141061782837, + "learning_rate": 4.736516912583893e-06, + "loss": 2.4318, + "step": 3233 + }, + { + "epoch": 0.1734978540772532, + "grad_norm": 2.061005115509033, + "learning_rate": 4.7363227645955445e-06, + "loss": 1.6611, + "step": 3234 + }, + { + "epoch": 0.17355150214592274, + "grad_norm": 1.1197830438613892, + "learning_rate": 4.736128549086428e-06, + "loss": 2.2545, + "step": 3235 + }, + { + "epoch": 0.17360515021459227, + "grad_norm": 1.1702910661697388, + "learning_rate": 4.735934266062406e-06, + "loss": 2.2606, + "step": 3236 + }, + { + "epoch": 0.1736587982832618, + "grad_norm": 1.2458657026290894, + "learning_rate": 4.735739915529346e-06, + "loss": 2.4577, + "step": 3237 + }, + { + "epoch": 0.17371244635193134, + "grad_norm": 1.2247769832611084, + "learning_rate": 4.7355454974931155e-06, + "loss": 2.1624, + "step": 3238 + }, + { + "epoch": 0.17376609442060087, + "grad_norm": 1.384115219116211, + "learning_rate": 4.735351011959585e-06, + "loss": 2.4937, + "step": 3239 + }, + { + "epoch": 0.17381974248927037, + "grad_norm": 1.249359130859375, + "learning_rate": 4.735156458934624e-06, + "loss": 1.6334, + "step": 3240 + }, + { + "epoch": 0.1738733905579399, + "grad_norm": 1.2073140144348145, + "learning_rate": 4.7349618384241105e-06, + "loss": 2.2672, + "step": 3241 + }, + { + "epoch": 0.17392703862660944, + "grad_norm": 1.5026147365570068, + "learning_rate": 4.734767150433917e-06, + "loss": 2.4554, + "step": 3242 + }, + { + "epoch": 0.17398068669527897, + "grad_norm": 1.1649084091186523, + "learning_rate": 4.734572394969924e-06, + "loss": 2.254, + "step": 3243 + }, + { + "epoch": 0.1740343347639485, + "grad_norm": 1.2507749795913696, + "learning_rate": 4.734377572038011e-06, + "loss": 2.2807, + "step": 3244 + }, + { + "epoch": 0.17408798283261803, + "grad_norm": 1.1135432720184326, + "learning_rate": 4.73418268164406e-06, + "loss": 1.9718, + "step": 3245 + }, + { + "epoch": 0.17414163090128756, + "grad_norm": 1.1996616125106812, + "learning_rate": 4.733987723793956e-06, + "loss": 2.5147, + "step": 3246 + }, + { + "epoch": 0.17419527896995707, + "grad_norm": 1.3393299579620361, + "learning_rate": 4.733792698493584e-06, + "loss": 2.4986, + "step": 3247 + }, + { + "epoch": 0.1742489270386266, + "grad_norm": 1.5479871034622192, + "learning_rate": 4.7335976057488334e-06, + "loss": 2.5229, + "step": 3248 + }, + { + "epoch": 0.17430257510729613, + "grad_norm": 2.0576059818267822, + "learning_rate": 4.733402445565595e-06, + "loss": 2.63, + "step": 3249 + }, + { + "epoch": 0.17435622317596566, + "grad_norm": 1.1637901067733765, + "learning_rate": 4.73320721794976e-06, + "loss": 2.2106, + "step": 3250 + }, + { + "epoch": 0.1744098712446352, + "grad_norm": 1.1971057653427124, + "learning_rate": 4.733011922907223e-06, + "loss": 2.3141, + "step": 3251 + }, + { + "epoch": 0.17446351931330473, + "grad_norm": 1.2143582105636597, + "learning_rate": 4.732816560443882e-06, + "loss": 2.304, + "step": 3252 + }, + { + "epoch": 0.17451716738197426, + "grad_norm": 1.1354238986968994, + "learning_rate": 4.732621130565635e-06, + "loss": 1.7097, + "step": 3253 + }, + { + "epoch": 0.1745708154506438, + "grad_norm": 2.366018295288086, + "learning_rate": 4.73242563327838e-06, + "loss": 2.4709, + "step": 3254 + }, + { + "epoch": 0.1746244635193133, + "grad_norm": 1.1619905233383179, + "learning_rate": 4.732230068588023e-06, + "loss": 2.0292, + "step": 3255 + }, + { + "epoch": 0.17467811158798283, + "grad_norm": 1.2585099935531616, + "learning_rate": 4.7320344365004675e-06, + "loss": 2.2169, + "step": 3256 + }, + { + "epoch": 0.17473175965665236, + "grad_norm": 1.2061822414398193, + "learning_rate": 4.73183873702162e-06, + "loss": 2.0107, + "step": 3257 + }, + { + "epoch": 0.1747854077253219, + "grad_norm": 4.927656173706055, + "learning_rate": 4.731642970157388e-06, + "loss": 2.183, + "step": 3258 + }, + { + "epoch": 0.17483905579399142, + "grad_norm": 1.194763422012329, + "learning_rate": 4.731447135913685e-06, + "loss": 2.1675, + "step": 3259 + }, + { + "epoch": 0.17489270386266095, + "grad_norm": 1.1798185110092163, + "learning_rate": 4.7312512342964216e-06, + "loss": 2.1874, + "step": 3260 + }, + { + "epoch": 0.17494635193133048, + "grad_norm": 1.3313912153244019, + "learning_rate": 4.731055265311513e-06, + "loss": 2.0975, + "step": 3261 + }, + { + "epoch": 0.175, + "grad_norm": 1.367790699005127, + "learning_rate": 4.730859228964876e-06, + "loss": 2.2871, + "step": 3262 + }, + { + "epoch": 0.17505364806866952, + "grad_norm": 1.1406912803649902, + "learning_rate": 4.7306631252624305e-06, + "loss": 2.2646, + "step": 3263 + }, + { + "epoch": 0.17510729613733905, + "grad_norm": 0.9723648428916931, + "learning_rate": 4.7304669542100956e-06, + "loss": 2.2166, + "step": 3264 + }, + { + "epoch": 0.17516094420600858, + "grad_norm": 1.37990140914917, + "learning_rate": 4.730270715813795e-06, + "loss": 2.0626, + "step": 3265 + }, + { + "epoch": 0.17521459227467812, + "grad_norm": 1.228124737739563, + "learning_rate": 4.730074410079455e-06, + "loss": 2.0347, + "step": 3266 + }, + { + "epoch": 0.17526824034334765, + "grad_norm": 1.2506194114685059, + "learning_rate": 4.729878037013001e-06, + "loss": 2.2231, + "step": 3267 + }, + { + "epoch": 0.17532188841201718, + "grad_norm": 1.1909507513046265, + "learning_rate": 4.729681596620364e-06, + "loss": 2.1528, + "step": 3268 + }, + { + "epoch": 0.17537553648068668, + "grad_norm": 1.2300488948822021, + "learning_rate": 4.7294850889074725e-06, + "loss": 2.3664, + "step": 3269 + }, + { + "epoch": 0.17542918454935622, + "grad_norm": 1.264133095741272, + "learning_rate": 4.729288513880261e-06, + "loss": 2.387, + "step": 3270 + }, + { + "epoch": 0.17548283261802575, + "grad_norm": 1.0399789810180664, + "learning_rate": 4.729091871544665e-06, + "loss": 2.2649, + "step": 3271 + }, + { + "epoch": 0.17553648068669528, + "grad_norm": 1.1306984424591064, + "learning_rate": 4.72889516190662e-06, + "loss": 2.2579, + "step": 3272 + }, + { + "epoch": 0.1755901287553648, + "grad_norm": 2.114661455154419, + "learning_rate": 4.728698384972068e-06, + "loss": 2.3864, + "step": 3273 + }, + { + "epoch": 0.17564377682403434, + "grad_norm": 1.2984557151794434, + "learning_rate": 4.728501540746947e-06, + "loss": 2.2162, + "step": 3274 + }, + { + "epoch": 0.17569742489270387, + "grad_norm": 1.2220731973648071, + "learning_rate": 4.728304629237203e-06, + "loss": 1.9408, + "step": 3275 + }, + { + "epoch": 0.17575107296137338, + "grad_norm": 1.0379122495651245, + "learning_rate": 4.728107650448779e-06, + "loss": 1.8653, + "step": 3276 + }, + { + "epoch": 0.1758047210300429, + "grad_norm": 1.3365097045898438, + "learning_rate": 4.727910604387624e-06, + "loss": 2.4822, + "step": 3277 + }, + { + "epoch": 0.17585836909871244, + "grad_norm": 1.1737916469573975, + "learning_rate": 4.7277134910596874e-06, + "loss": 2.2346, + "step": 3278 + }, + { + "epoch": 0.17591201716738197, + "grad_norm": 1.288511872291565, + "learning_rate": 4.72751631047092e-06, + "loss": 2.2343, + "step": 3279 + }, + { + "epoch": 0.1759656652360515, + "grad_norm": 1.3324507474899292, + "learning_rate": 4.727319062627275e-06, + "loss": 1.9334, + "step": 3280 + }, + { + "epoch": 0.17601931330472104, + "grad_norm": 1.0309637784957886, + "learning_rate": 4.7271217475347084e-06, + "loss": 2.0175, + "step": 3281 + }, + { + "epoch": 0.17607296137339057, + "grad_norm": 1.1180663108825684, + "learning_rate": 4.726924365199177e-06, + "loss": 2.2359, + "step": 3282 + }, + { + "epoch": 0.17612660944206007, + "grad_norm": 1.177488923072815, + "learning_rate": 4.726726915626641e-06, + "loss": 2.2526, + "step": 3283 + }, + { + "epoch": 0.1761802575107296, + "grad_norm": 1.2432887554168701, + "learning_rate": 4.726529398823062e-06, + "loss": 2.2397, + "step": 3284 + }, + { + "epoch": 0.17623390557939914, + "grad_norm": 1.3605650663375854, + "learning_rate": 4.726331814794403e-06, + "loss": 2.1839, + "step": 3285 + }, + { + "epoch": 0.17628755364806867, + "grad_norm": 1.228833556175232, + "learning_rate": 4.726134163546629e-06, + "loss": 2.227, + "step": 3286 + }, + { + "epoch": 0.1763412017167382, + "grad_norm": 1.3469996452331543, + "learning_rate": 4.7259364450857095e-06, + "loss": 2.3346, + "step": 3287 + }, + { + "epoch": 0.17639484978540773, + "grad_norm": 1.1270191669464111, + "learning_rate": 4.7257386594176136e-06, + "loss": 2.1275, + "step": 3288 + }, + { + "epoch": 0.17644849785407726, + "grad_norm": 1.1712634563446045, + "learning_rate": 4.7255408065483114e-06, + "loss": 2.3546, + "step": 3289 + }, + { + "epoch": 0.1765021459227468, + "grad_norm": 1.326046109199524, + "learning_rate": 4.725342886483779e-06, + "loss": 2.4395, + "step": 3290 + }, + { + "epoch": 0.1765557939914163, + "grad_norm": 1.148240327835083, + "learning_rate": 4.72514489922999e-06, + "loss": 2.3489, + "step": 3291 + }, + { + "epoch": 0.17660944206008583, + "grad_norm": 1.3469010591506958, + "learning_rate": 4.724946844792924e-06, + "loss": 2.2099, + "step": 3292 + }, + { + "epoch": 0.17666309012875536, + "grad_norm": 1.2217059135437012, + "learning_rate": 4.724748723178559e-06, + "loss": 2.3139, + "step": 3293 + }, + { + "epoch": 0.1767167381974249, + "grad_norm": 1.2934917211532593, + "learning_rate": 4.724550534392878e-06, + "loss": 2.4144, + "step": 3294 + }, + { + "epoch": 0.17677038626609443, + "grad_norm": 1.1159981489181519, + "learning_rate": 4.724352278441866e-06, + "loss": 2.2201, + "step": 3295 + }, + { + "epoch": 0.17682403433476396, + "grad_norm": 1.337834119796753, + "learning_rate": 4.724153955331506e-06, + "loss": 1.5219, + "step": 3296 + }, + { + "epoch": 0.1768776824034335, + "grad_norm": 1.1232203245162964, + "learning_rate": 4.7239555650677885e-06, + "loss": 1.8611, + "step": 3297 + }, + { + "epoch": 0.176931330472103, + "grad_norm": 2.8807718753814697, + "learning_rate": 4.723757107656702e-06, + "loss": 2.4662, + "step": 3298 + }, + { + "epoch": 0.17698497854077253, + "grad_norm": 1.1551040410995483, + "learning_rate": 4.723558583104239e-06, + "loss": 2.621, + "step": 3299 + }, + { + "epoch": 0.17703862660944206, + "grad_norm": 1.349138855934143, + "learning_rate": 4.723359991416393e-06, + "loss": 2.2498, + "step": 3300 + }, + { + "epoch": 0.1770922746781116, + "grad_norm": 1.2441036701202393, + "learning_rate": 4.723161332599161e-06, + "loss": 2.1276, + "step": 3301 + }, + { + "epoch": 0.17714592274678112, + "grad_norm": 1.3120198249816895, + "learning_rate": 4.722962606658541e-06, + "loss": 2.3223, + "step": 3302 + }, + { + "epoch": 0.17719957081545065, + "grad_norm": 1.2230521440505981, + "learning_rate": 4.722763813600532e-06, + "loss": 2.4337, + "step": 3303 + }, + { + "epoch": 0.17725321888412018, + "grad_norm": 1.1807341575622559, + "learning_rate": 4.722564953431136e-06, + "loss": 2.4287, + "step": 3304 + }, + { + "epoch": 0.1773068669527897, + "grad_norm": 1.1501522064208984, + "learning_rate": 4.722366026156359e-06, + "loss": 2.303, + "step": 3305 + }, + { + "epoch": 0.17736051502145922, + "grad_norm": 1.2249243259429932, + "learning_rate": 4.722167031782205e-06, + "loss": 2.0456, + "step": 3306 + }, + { + "epoch": 0.17741416309012875, + "grad_norm": 1.2970484495162964, + "learning_rate": 4.721967970314684e-06, + "loss": 2.5582, + "step": 3307 + }, + { + "epoch": 0.17746781115879828, + "grad_norm": 1.161226749420166, + "learning_rate": 4.721768841759805e-06, + "loss": 2.3432, + "step": 3308 + }, + { + "epoch": 0.17752145922746781, + "grad_norm": 1.4384127855300903, + "learning_rate": 4.7215696461235805e-06, + "loss": 2.3857, + "step": 3309 + }, + { + "epoch": 0.17757510729613735, + "grad_norm": 1.1120959520339966, + "learning_rate": 4.721370383412026e-06, + "loss": 2.2757, + "step": 3310 + }, + { + "epoch": 0.17762875536480688, + "grad_norm": 1.0684009790420532, + "learning_rate": 4.721171053631155e-06, + "loss": 2.1949, + "step": 3311 + }, + { + "epoch": 0.17768240343347638, + "grad_norm": 1.4089354276657104, + "learning_rate": 4.720971656786989e-06, + "loss": 1.3821, + "step": 3312 + }, + { + "epoch": 0.17773605150214591, + "grad_norm": 1.3002897500991821, + "learning_rate": 4.720772192885546e-06, + "loss": 2.5057, + "step": 3313 + }, + { + "epoch": 0.17778969957081545, + "grad_norm": 1.231972575187683, + "learning_rate": 4.72057266193285e-06, + "loss": 2.4774, + "step": 3314 + }, + { + "epoch": 0.17784334763948498, + "grad_norm": 1.224912405014038, + "learning_rate": 4.7203730639349244e-06, + "loss": 2.3102, + "step": 3315 + }, + { + "epoch": 0.1778969957081545, + "grad_norm": 1.188181757926941, + "learning_rate": 4.720173398897796e-06, + "loss": 2.2008, + "step": 3316 + }, + { + "epoch": 0.17795064377682404, + "grad_norm": 1.1265875101089478, + "learning_rate": 4.719973666827492e-06, + "loss": 2.2237, + "step": 3317 + }, + { + "epoch": 0.17800429184549357, + "grad_norm": 1.2247153520584106, + "learning_rate": 4.719773867730045e-06, + "loss": 2.2614, + "step": 3318 + }, + { + "epoch": 0.17805793991416308, + "grad_norm": 1.621139645576477, + "learning_rate": 4.719574001611486e-06, + "loss": 2.4858, + "step": 3319 + }, + { + "epoch": 0.1781115879828326, + "grad_norm": 1.1965810060501099, + "learning_rate": 4.719374068477851e-06, + "loss": 2.2683, + "step": 3320 + }, + { + "epoch": 0.17816523605150214, + "grad_norm": 1.0128332376480103, + "learning_rate": 4.719174068335175e-06, + "loss": 2.3727, + "step": 3321 + }, + { + "epoch": 0.17821888412017167, + "grad_norm": 1.3548953533172607, + "learning_rate": 4.718974001189497e-06, + "loss": 2.2177, + "step": 3322 + }, + { + "epoch": 0.1782725321888412, + "grad_norm": 1.2157686948776245, + "learning_rate": 4.718773867046857e-06, + "loss": 2.359, + "step": 3323 + }, + { + "epoch": 0.17832618025751074, + "grad_norm": 1.5162028074264526, + "learning_rate": 4.7185736659132986e-06, + "loss": 2.4443, + "step": 3324 + }, + { + "epoch": 0.17837982832618027, + "grad_norm": 1.2259628772735596, + "learning_rate": 4.718373397794866e-06, + "loss": 2.5212, + "step": 3325 + }, + { + "epoch": 0.1784334763948498, + "grad_norm": 1.2535587549209595, + "learning_rate": 4.718173062697606e-06, + "loss": 2.5036, + "step": 3326 + }, + { + "epoch": 0.1784871244635193, + "grad_norm": 1.3460325002670288, + "learning_rate": 4.7179726606275675e-06, + "loss": 2.2034, + "step": 3327 + }, + { + "epoch": 0.17854077253218884, + "grad_norm": 1.2065768241882324, + "learning_rate": 4.7177721915908e-06, + "loss": 2.3978, + "step": 3328 + }, + { + "epoch": 0.17859442060085837, + "grad_norm": 1.334222435951233, + "learning_rate": 4.717571655593358e-06, + "loss": 2.6093, + "step": 3329 + }, + { + "epoch": 0.1786480686695279, + "grad_norm": 1.507074236869812, + "learning_rate": 4.717371052641295e-06, + "loss": 2.3004, + "step": 3330 + }, + { + "epoch": 0.17870171673819743, + "grad_norm": 1.1185156106948853, + "learning_rate": 4.7171703827406675e-06, + "loss": 2.325, + "step": 3331 + }, + { + "epoch": 0.17875536480686696, + "grad_norm": 1.3980834484100342, + "learning_rate": 4.716969645897535e-06, + "loss": 2.1809, + "step": 3332 + }, + { + "epoch": 0.1788090128755365, + "grad_norm": 1.3183598518371582, + "learning_rate": 4.716768842117959e-06, + "loss": 2.387, + "step": 3333 + }, + { + "epoch": 0.178862660944206, + "grad_norm": 1.194647192955017, + "learning_rate": 4.716567971408001e-06, + "loss": 2.2648, + "step": 3334 + }, + { + "epoch": 0.17891630901287553, + "grad_norm": 1.2399290800094604, + "learning_rate": 4.716367033773727e-06, + "loss": 2.4572, + "step": 3335 + }, + { + "epoch": 0.17896995708154506, + "grad_norm": 1.072323203086853, + "learning_rate": 4.716166029221202e-06, + "loss": 2.2076, + "step": 3336 + }, + { + "epoch": 0.1790236051502146, + "grad_norm": 1.5378801822662354, + "learning_rate": 4.715964957756497e-06, + "loss": 2.1768, + "step": 3337 + }, + { + "epoch": 0.17907725321888412, + "grad_norm": 1.2414758205413818, + "learning_rate": 4.715763819385681e-06, + "loss": 2.1381, + "step": 3338 + }, + { + "epoch": 0.17913090128755366, + "grad_norm": 1.085838794708252, + "learning_rate": 4.715562614114829e-06, + "loss": 1.5521, + "step": 3339 + }, + { + "epoch": 0.1791845493562232, + "grad_norm": 1.1544448137283325, + "learning_rate": 4.715361341950014e-06, + "loss": 2.0903, + "step": 3340 + }, + { + "epoch": 0.1792381974248927, + "grad_norm": 1.0183314085006714, + "learning_rate": 4.715160002897314e-06, + "loss": 2.2667, + "step": 3341 + }, + { + "epoch": 0.17929184549356222, + "grad_norm": 1.5667051076889038, + "learning_rate": 4.714958596962809e-06, + "loss": 2.4978, + "step": 3342 + }, + { + "epoch": 0.17934549356223176, + "grad_norm": 1.3831385374069214, + "learning_rate": 4.714757124152577e-06, + "loss": 2.2888, + "step": 3343 + }, + { + "epoch": 0.1793991416309013, + "grad_norm": 1.332710862159729, + "learning_rate": 4.714555584472703e-06, + "loss": 2.5978, + "step": 3344 + }, + { + "epoch": 0.17945278969957082, + "grad_norm": 2.2025465965270996, + "learning_rate": 4.714353977929273e-06, + "loss": 2.2199, + "step": 3345 + }, + { + "epoch": 0.17950643776824035, + "grad_norm": 1.3338243961334229, + "learning_rate": 4.714152304528372e-06, + "loss": 2.3915, + "step": 3346 + }, + { + "epoch": 0.17956008583690988, + "grad_norm": 1.2486987113952637, + "learning_rate": 4.713950564276091e-06, + "loss": 2.3264, + "step": 3347 + }, + { + "epoch": 0.1796137339055794, + "grad_norm": 1.2230762243270874, + "learning_rate": 4.713748757178519e-06, + "loss": 2.2531, + "step": 3348 + }, + { + "epoch": 0.17966738197424892, + "grad_norm": 1.2375510931015015, + "learning_rate": 4.713546883241751e-06, + "loss": 1.5783, + "step": 3349 + }, + { + "epoch": 0.17972103004291845, + "grad_norm": 1.2244757413864136, + "learning_rate": 4.71334494247188e-06, + "loss": 2.3195, + "step": 3350 + }, + { + "epoch": 0.17977467811158798, + "grad_norm": 1.173211693763733, + "learning_rate": 4.713142934875006e-06, + "loss": 2.1888, + "step": 3351 + }, + { + "epoch": 0.17982832618025751, + "grad_norm": 1.324818730354309, + "learning_rate": 4.712940860457226e-06, + "loss": 2.4913, + "step": 3352 + }, + { + "epoch": 0.17988197424892705, + "grad_norm": 1.1883020401000977, + "learning_rate": 4.712738719224641e-06, + "loss": 2.2073, + "step": 3353 + }, + { + "epoch": 0.17993562231759658, + "grad_norm": 1.4092051982879639, + "learning_rate": 4.7125365111833566e-06, + "loss": 2.0478, + "step": 3354 + }, + { + "epoch": 0.17998927038626608, + "grad_norm": 1.5741856098175049, + "learning_rate": 4.712334236339475e-06, + "loss": 2.3589, + "step": 3355 + }, + { + "epoch": 0.1800429184549356, + "grad_norm": 1.0563687086105347, + "learning_rate": 4.7121318946991054e-06, + "loss": 2.3048, + "step": 3356 + }, + { + "epoch": 0.18009656652360514, + "grad_norm": 1.4690524339675903, + "learning_rate": 4.711929486268357e-06, + "loss": 2.2334, + "step": 3357 + }, + { + "epoch": 0.18015021459227468, + "grad_norm": 1.1756932735443115, + "learning_rate": 4.711727011053341e-06, + "loss": 2.4008, + "step": 3358 + }, + { + "epoch": 0.1802038626609442, + "grad_norm": 1.1118922233581543, + "learning_rate": 4.711524469060169e-06, + "loss": 2.4789, + "step": 3359 + }, + { + "epoch": 0.18025751072961374, + "grad_norm": 1.358298420906067, + "learning_rate": 4.711321860294958e-06, + "loss": 2.2671, + "step": 3360 + }, + { + "epoch": 0.18031115879828327, + "grad_norm": 1.274778127670288, + "learning_rate": 4.711119184763826e-06, + "loss": 2.1856, + "step": 3361 + }, + { + "epoch": 0.18036480686695278, + "grad_norm": 1.380314826965332, + "learning_rate": 4.71091644247289e-06, + "loss": 2.332, + "step": 3362 + }, + { + "epoch": 0.1804184549356223, + "grad_norm": 1.3315237760543823, + "learning_rate": 4.710713633428273e-06, + "loss": 2.5123, + "step": 3363 + }, + { + "epoch": 0.18047210300429184, + "grad_norm": 1.219577431678772, + "learning_rate": 4.710510757636099e-06, + "loss": 2.2406, + "step": 3364 + }, + { + "epoch": 0.18052575107296137, + "grad_norm": 1.1883876323699951, + "learning_rate": 4.7103078151024915e-06, + "loss": 2.0716, + "step": 3365 + }, + { + "epoch": 0.1805793991416309, + "grad_norm": 1.304682970046997, + "learning_rate": 4.710104805833578e-06, + "loss": 2.5246, + "step": 3366 + }, + { + "epoch": 0.18063304721030043, + "grad_norm": 1.2353423833847046, + "learning_rate": 4.70990172983549e-06, + "loss": 2.3099, + "step": 3367 + }, + { + "epoch": 0.18068669527896997, + "grad_norm": 1.6269780397415161, + "learning_rate": 4.709698587114356e-06, + "loss": 2.2344, + "step": 3368 + }, + { + "epoch": 0.1807403433476395, + "grad_norm": 1.420514464378357, + "learning_rate": 4.709495377676313e-06, + "loss": 2.2495, + "step": 3369 + }, + { + "epoch": 0.180793991416309, + "grad_norm": 1.3217394351959229, + "learning_rate": 4.709292101527493e-06, + "loss": 2.3213, + "step": 3370 + }, + { + "epoch": 0.18084763948497853, + "grad_norm": 1.446742296218872, + "learning_rate": 4.709088758674035e-06, + "loss": 2.113, + "step": 3371 + }, + { + "epoch": 0.18090128755364807, + "grad_norm": 1.0756535530090332, + "learning_rate": 4.708885349122079e-06, + "loss": 2.3469, + "step": 3372 + }, + { + "epoch": 0.1809549356223176, + "grad_norm": 1.3259756565093994, + "learning_rate": 4.7086818728777665e-06, + "loss": 2.4762, + "step": 3373 + }, + { + "epoch": 0.18100858369098713, + "grad_norm": 1.30990731716156, + "learning_rate": 4.708478329947239e-06, + "loss": 2.4072, + "step": 3374 + }, + { + "epoch": 0.18106223175965666, + "grad_norm": 1.1396689414978027, + "learning_rate": 4.708274720336644e-06, + "loss": 2.3697, + "step": 3375 + }, + { + "epoch": 0.1811158798283262, + "grad_norm": 3.3345210552215576, + "learning_rate": 4.708071044052129e-06, + "loss": 2.1913, + "step": 3376 + }, + { + "epoch": 0.1811695278969957, + "grad_norm": 1.2438654899597168, + "learning_rate": 4.7078673010998425e-06, + "loss": 2.223, + "step": 3377 + }, + { + "epoch": 0.18122317596566523, + "grad_norm": 6.911498069763184, + "learning_rate": 4.707663491485937e-06, + "loss": 2.4596, + "step": 3378 + }, + { + "epoch": 0.18127682403433476, + "grad_norm": 1.4207910299301147, + "learning_rate": 4.707459615216565e-06, + "loss": 2.3155, + "step": 3379 + }, + { + "epoch": 0.1813304721030043, + "grad_norm": 1.2656573057174683, + "learning_rate": 4.707255672297884e-06, + "loss": 2.2084, + "step": 3380 + }, + { + "epoch": 0.18138412017167382, + "grad_norm": 1.2000253200531006, + "learning_rate": 4.70705166273605e-06, + "loss": 2.4801, + "step": 3381 + }, + { + "epoch": 0.18143776824034336, + "grad_norm": 2.6334891319274902, + "learning_rate": 4.706847586537222e-06, + "loss": 1.7144, + "step": 3382 + }, + { + "epoch": 0.1814914163090129, + "grad_norm": 1.3687636852264404, + "learning_rate": 4.706643443707564e-06, + "loss": 2.2753, + "step": 3383 + }, + { + "epoch": 0.1815450643776824, + "grad_norm": 1.229845643043518, + "learning_rate": 4.706439234253238e-06, + "loss": 2.4462, + "step": 3384 + }, + { + "epoch": 0.18159871244635192, + "grad_norm": 1.1514933109283447, + "learning_rate": 4.706234958180409e-06, + "loss": 2.1255, + "step": 3385 + }, + { + "epoch": 0.18165236051502145, + "grad_norm": 1.210893988609314, + "learning_rate": 4.706030615495246e-06, + "loss": 2.2539, + "step": 3386 + }, + { + "epoch": 0.181706008583691, + "grad_norm": 1.2623544931411743, + "learning_rate": 4.705826206203918e-06, + "loss": 2.2176, + "step": 3387 + }, + { + "epoch": 0.18175965665236052, + "grad_norm": 1.4160205125808716, + "learning_rate": 4.705621730312598e-06, + "loss": 2.403, + "step": 3388 + }, + { + "epoch": 0.18181330472103005, + "grad_norm": 1.2037442922592163, + "learning_rate": 4.705417187827458e-06, + "loss": 2.3909, + "step": 3389 + }, + { + "epoch": 0.18186695278969958, + "grad_norm": 1.2460377216339111, + "learning_rate": 4.705212578754674e-06, + "loss": 2.284, + "step": 3390 + }, + { + "epoch": 0.18192060085836909, + "grad_norm": 1.33826744556427, + "learning_rate": 4.7050079031004245e-06, + "loss": 2.177, + "step": 3391 + }, + { + "epoch": 0.18197424892703862, + "grad_norm": 1.2302066087722778, + "learning_rate": 4.704803160870888e-06, + "loss": 2.2362, + "step": 3392 + }, + { + "epoch": 0.18202789699570815, + "grad_norm": 1.6322983503341675, + "learning_rate": 4.7045983520722474e-06, + "loss": 2.2396, + "step": 3393 + }, + { + "epoch": 0.18208154506437768, + "grad_norm": 1.4127452373504639, + "learning_rate": 4.704393476710686e-06, + "loss": 2.2892, + "step": 3394 + }, + { + "epoch": 0.1821351931330472, + "grad_norm": 1.2116230726242065, + "learning_rate": 4.70418853479239e-06, + "loss": 1.834, + "step": 3395 + }, + { + "epoch": 0.18218884120171674, + "grad_norm": 1.4258627891540527, + "learning_rate": 4.703983526323546e-06, + "loss": 2.4251, + "step": 3396 + }, + { + "epoch": 0.18224248927038628, + "grad_norm": 1.1221263408660889, + "learning_rate": 4.703778451310345e-06, + "loss": 2.5463, + "step": 3397 + }, + { + "epoch": 0.18229613733905578, + "grad_norm": 1.4210299253463745, + "learning_rate": 4.703573309758979e-06, + "loss": 2.4913, + "step": 3398 + }, + { + "epoch": 0.1823497854077253, + "grad_norm": 1.373189091682434, + "learning_rate": 4.703368101675639e-06, + "loss": 2.3256, + "step": 3399 + }, + { + "epoch": 0.18240343347639484, + "grad_norm": 1.5056707859039307, + "learning_rate": 4.703162827066525e-06, + "loss": 2.4424, + "step": 3400 + }, + { + "epoch": 0.18245708154506438, + "grad_norm": 0.9617934226989746, + "learning_rate": 4.702957485937832e-06, + "loss": 2.0519, + "step": 3401 + }, + { + "epoch": 0.1825107296137339, + "grad_norm": 1.2561590671539307, + "learning_rate": 4.70275207829576e-06, + "loss": 2.2882, + "step": 3402 + }, + { + "epoch": 0.18256437768240344, + "grad_norm": 1.3570400476455688, + "learning_rate": 4.702546604146512e-06, + "loss": 2.0244, + "step": 3403 + }, + { + "epoch": 0.18261802575107297, + "grad_norm": 1.0024769306182861, + "learning_rate": 4.70234106349629e-06, + "loss": 2.2204, + "step": 3404 + }, + { + "epoch": 0.1826716738197425, + "grad_norm": 1.1964446306228638, + "learning_rate": 4.702135456351304e-06, + "loss": 2.3599, + "step": 3405 + }, + { + "epoch": 0.182725321888412, + "grad_norm": 1.2723007202148438, + "learning_rate": 4.701929782717756e-06, + "loss": 2.403, + "step": 3406 + }, + { + "epoch": 0.18277896995708154, + "grad_norm": 1.2454720735549927, + "learning_rate": 4.701724042601859e-06, + "loss": 2.3579, + "step": 3407 + }, + { + "epoch": 0.18283261802575107, + "grad_norm": 1.4295283555984497, + "learning_rate": 4.701518236009826e-06, + "loss": 2.5784, + "step": 3408 + }, + { + "epoch": 0.1828862660944206, + "grad_norm": 1.2187504768371582, + "learning_rate": 4.701312362947869e-06, + "loss": 2.3321, + "step": 3409 + }, + { + "epoch": 0.18293991416309013, + "grad_norm": 1.2741873264312744, + "learning_rate": 4.7011064234222034e-06, + "loss": 2.0788, + "step": 3410 + }, + { + "epoch": 0.18299356223175967, + "grad_norm": 1.2787679433822632, + "learning_rate": 4.700900417439048e-06, + "loss": 2.0806, + "step": 3411 + }, + { + "epoch": 0.1830472103004292, + "grad_norm": 1.1745655536651611, + "learning_rate": 4.700694345004624e-06, + "loss": 2.1937, + "step": 3412 + }, + { + "epoch": 0.1831008583690987, + "grad_norm": 1.3520039319992065, + "learning_rate": 4.700488206125151e-06, + "loss": 2.3336, + "step": 3413 + }, + { + "epoch": 0.18315450643776823, + "grad_norm": 1.2563878297805786, + "learning_rate": 4.7002820008068536e-06, + "loss": 2.3818, + "step": 3414 + }, + { + "epoch": 0.18320815450643776, + "grad_norm": 1.1884040832519531, + "learning_rate": 4.700075729055959e-06, + "loss": 2.3169, + "step": 3415 + }, + { + "epoch": 0.1832618025751073, + "grad_norm": 1.2662967443466187, + "learning_rate": 4.699869390878694e-06, + "loss": 2.5083, + "step": 3416 + }, + { + "epoch": 0.18331545064377683, + "grad_norm": 1.2281732559204102, + "learning_rate": 4.699662986281288e-06, + "loss": 2.0786, + "step": 3417 + }, + { + "epoch": 0.18336909871244636, + "grad_norm": 1.0945271253585815, + "learning_rate": 4.6994565152699735e-06, + "loss": 2.2816, + "step": 3418 + }, + { + "epoch": 0.1834227467811159, + "grad_norm": 1.219077706336975, + "learning_rate": 4.699249977850985e-06, + "loss": 2.3161, + "step": 3419 + }, + { + "epoch": 0.1834763948497854, + "grad_norm": 1.332869291305542, + "learning_rate": 4.699043374030559e-06, + "loss": 2.1112, + "step": 3420 + }, + { + "epoch": 0.18353004291845493, + "grad_norm": 1.2747352123260498, + "learning_rate": 4.698836703814931e-06, + "loss": 2.3782, + "step": 3421 + }, + { + "epoch": 0.18358369098712446, + "grad_norm": 1.5129599571228027, + "learning_rate": 4.698629967210342e-06, + "loss": 2.365, + "step": 3422 + }, + { + "epoch": 0.183637339055794, + "grad_norm": 1.2279963493347168, + "learning_rate": 4.698423164223035e-06, + "loss": 2.4457, + "step": 3423 + }, + { + "epoch": 0.18369098712446352, + "grad_norm": 1.420676589012146, + "learning_rate": 4.6982162948592525e-06, + "loss": 2.5913, + "step": 3424 + }, + { + "epoch": 0.18374463519313305, + "grad_norm": 1.133131980895996, + "learning_rate": 4.698009359125242e-06, + "loss": 2.4099, + "step": 3425 + }, + { + "epoch": 0.1837982832618026, + "grad_norm": 1.3151047229766846, + "learning_rate": 4.69780235702725e-06, + "loss": 2.277, + "step": 3426 + }, + { + "epoch": 0.1838519313304721, + "grad_norm": 1.2477298974990845, + "learning_rate": 4.697595288571528e-06, + "loss": 2.3004, + "step": 3427 + }, + { + "epoch": 0.18390557939914162, + "grad_norm": 1.343800663948059, + "learning_rate": 4.697388153764327e-06, + "loss": 2.3294, + "step": 3428 + }, + { + "epoch": 0.18395922746781115, + "grad_norm": 1.8435895442962646, + "learning_rate": 4.697180952611901e-06, + "loss": 2.3312, + "step": 3429 + }, + { + "epoch": 0.18401287553648069, + "grad_norm": 1.1881822347640991, + "learning_rate": 4.696973685120505e-06, + "loss": 2.4325, + "step": 3430 + }, + { + "epoch": 0.18406652360515022, + "grad_norm": 1.2352685928344727, + "learning_rate": 4.696766351296399e-06, + "loss": 2.4006, + "step": 3431 + }, + { + "epoch": 0.18412017167381975, + "grad_norm": 1.2703675031661987, + "learning_rate": 4.696558951145841e-06, + "loss": 2.1605, + "step": 3432 + }, + { + "epoch": 0.18417381974248928, + "grad_norm": 1.151642084121704, + "learning_rate": 4.696351484675095e-06, + "loss": 2.1601, + "step": 3433 + }, + { + "epoch": 0.18422746781115878, + "grad_norm": 1.0776824951171875, + "learning_rate": 4.696143951890424e-06, + "loss": 1.9581, + "step": 3434 + }, + { + "epoch": 0.18428111587982832, + "grad_norm": 1.2898123264312744, + "learning_rate": 4.695936352798093e-06, + "loss": 2.3018, + "step": 3435 + }, + { + "epoch": 0.18433476394849785, + "grad_norm": 1.1914597749710083, + "learning_rate": 4.695728687404372e-06, + "loss": 2.2753, + "step": 3436 + }, + { + "epoch": 0.18438841201716738, + "grad_norm": 1.2146872282028198, + "learning_rate": 4.695520955715529e-06, + "loss": 2.2167, + "step": 3437 + }, + { + "epoch": 0.1844420600858369, + "grad_norm": 1.1418417692184448, + "learning_rate": 4.695313157737838e-06, + "loss": 2.4233, + "step": 3438 + }, + { + "epoch": 0.18449570815450644, + "grad_norm": 1.7218014001846313, + "learning_rate": 4.695105293477571e-06, + "loss": 2.223, + "step": 3439 + }, + { + "epoch": 0.18454935622317598, + "grad_norm": 1.265100359916687, + "learning_rate": 4.6948973629410054e-06, + "loss": 2.3062, + "step": 3440 + }, + { + "epoch": 0.1846030042918455, + "grad_norm": 1.4422928094863892, + "learning_rate": 4.694689366134418e-06, + "loss": 2.3269, + "step": 3441 + }, + { + "epoch": 0.184656652360515, + "grad_norm": 1.192766547203064, + "learning_rate": 4.694481303064091e-06, + "loss": 2.2656, + "step": 3442 + }, + { + "epoch": 0.18471030042918454, + "grad_norm": 1.4258579015731812, + "learning_rate": 4.694273173736304e-06, + "loss": 2.1993, + "step": 3443 + }, + { + "epoch": 0.18476394849785407, + "grad_norm": 1.4929676055908203, + "learning_rate": 4.694064978157342e-06, + "loss": 2.3098, + "step": 3444 + }, + { + "epoch": 0.1848175965665236, + "grad_norm": 1.0989540815353394, + "learning_rate": 4.693856716333491e-06, + "loss": 2.1882, + "step": 3445 + }, + { + "epoch": 0.18487124463519314, + "grad_norm": 1.138278603553772, + "learning_rate": 4.693648388271038e-06, + "loss": 2.2724, + "step": 3446 + }, + { + "epoch": 0.18492489270386267, + "grad_norm": 2.4815783500671387, + "learning_rate": 4.6934399939762745e-06, + "loss": 2.4955, + "step": 3447 + }, + { + "epoch": 0.1849785407725322, + "grad_norm": 1.5761370658874512, + "learning_rate": 4.693231533455492e-06, + "loss": 2.2712, + "step": 3448 + }, + { + "epoch": 0.1850321888412017, + "grad_norm": 1.2584961652755737, + "learning_rate": 4.693023006714985e-06, + "loss": 2.1644, + "step": 3449 + }, + { + "epoch": 0.18508583690987124, + "grad_norm": 1.0721272230148315, + "learning_rate": 4.692814413761049e-06, + "loss": 1.9745, + "step": 3450 + }, + { + "epoch": 0.18513948497854077, + "grad_norm": 1.045979380607605, + "learning_rate": 4.692605754599981e-06, + "loss": 2.2408, + "step": 3451 + }, + { + "epoch": 0.1851931330472103, + "grad_norm": 1.1396970748901367, + "learning_rate": 4.692397029238082e-06, + "loss": 2.1158, + "step": 3452 + }, + { + "epoch": 0.18524678111587983, + "grad_norm": 1.2885175943374634, + "learning_rate": 4.692188237681654e-06, + "loss": 1.5418, + "step": 3453 + }, + { + "epoch": 0.18530042918454936, + "grad_norm": 1.3672146797180176, + "learning_rate": 4.691979379937001e-06, + "loss": 2.331, + "step": 3454 + }, + { + "epoch": 0.1853540772532189, + "grad_norm": 1.2950081825256348, + "learning_rate": 4.69177045601043e-06, + "loss": 1.842, + "step": 3455 + }, + { + "epoch": 0.1854077253218884, + "grad_norm": 1.2495976686477661, + "learning_rate": 4.691561465908247e-06, + "loss": 2.4723, + "step": 3456 + }, + { + "epoch": 0.18546137339055793, + "grad_norm": 1.1697078943252563, + "learning_rate": 4.691352409636762e-06, + "loss": 2.0623, + "step": 3457 + }, + { + "epoch": 0.18551502145922746, + "grad_norm": 1.2757004499435425, + "learning_rate": 4.691143287202289e-06, + "loss": 2.4493, + "step": 3458 + }, + { + "epoch": 0.185568669527897, + "grad_norm": 1.1258955001831055, + "learning_rate": 4.6909340986111394e-06, + "loss": 2.1992, + "step": 3459 + }, + { + "epoch": 0.18562231759656653, + "grad_norm": 1.3226019144058228, + "learning_rate": 4.690724843869632e-06, + "loss": 2.2444, + "step": 3460 + }, + { + "epoch": 0.18567596566523606, + "grad_norm": 1.264570951461792, + "learning_rate": 4.690515522984084e-06, + "loss": 2.2884, + "step": 3461 + }, + { + "epoch": 0.1857296137339056, + "grad_norm": 1.5125712156295776, + "learning_rate": 4.690306135960814e-06, + "loss": 2.4681, + "step": 3462 + }, + { + "epoch": 0.1857832618025751, + "grad_norm": 1.176947832107544, + "learning_rate": 4.690096682806144e-06, + "loss": 2.3058, + "step": 3463 + }, + { + "epoch": 0.18583690987124463, + "grad_norm": 1.1605802774429321, + "learning_rate": 4.6898871635263995e-06, + "loss": 2.2749, + "step": 3464 + }, + { + "epoch": 0.18589055793991416, + "grad_norm": 1.2823446989059448, + "learning_rate": 4.689677578127905e-06, + "loss": 2.3429, + "step": 3465 + }, + { + "epoch": 0.1859442060085837, + "grad_norm": 1.2924188375473022, + "learning_rate": 4.68946792661699e-06, + "loss": 2.4486, + "step": 3466 + }, + { + "epoch": 0.18599785407725322, + "grad_norm": 1.221558690071106, + "learning_rate": 4.6892582089999835e-06, + "loss": 1.8809, + "step": 3467 + }, + { + "epoch": 0.18605150214592275, + "grad_norm": 1.2156959772109985, + "learning_rate": 4.689048425283217e-06, + "loss": 2.4651, + "step": 3468 + }, + { + "epoch": 0.18610515021459229, + "grad_norm": 1.2166577577590942, + "learning_rate": 4.688838575473026e-06, + "loss": 2.2087, + "step": 3469 + }, + { + "epoch": 0.1861587982832618, + "grad_norm": 1.5724996328353882, + "learning_rate": 4.6886286595757446e-06, + "loss": 2.1372, + "step": 3470 + }, + { + "epoch": 0.18621244635193132, + "grad_norm": 1.1953818798065186, + "learning_rate": 4.688418677597712e-06, + "loss": 2.2605, + "step": 3471 + }, + { + "epoch": 0.18626609442060085, + "grad_norm": 1.2076828479766846, + "learning_rate": 4.6882086295452674e-06, + "loss": 2.3637, + "step": 3472 + }, + { + "epoch": 0.18631974248927038, + "grad_norm": 1.1043329238891602, + "learning_rate": 4.687998515424754e-06, + "loss": 1.8293, + "step": 3473 + }, + { + "epoch": 0.18637339055793992, + "grad_norm": 1.359475016593933, + "learning_rate": 4.687788335242513e-06, + "loss": 2.2794, + "step": 3474 + }, + { + "epoch": 0.18642703862660945, + "grad_norm": 1.1955811977386475, + "learning_rate": 4.6875780890048936e-06, + "loss": 2.2667, + "step": 3475 + }, + { + "epoch": 0.18648068669527898, + "grad_norm": 1.4142004251480103, + "learning_rate": 4.687367776718243e-06, + "loss": 2.5091, + "step": 3476 + }, + { + "epoch": 0.1865343347639485, + "grad_norm": 1.1294684410095215, + "learning_rate": 4.687157398388909e-06, + "loss": 2.3863, + "step": 3477 + }, + { + "epoch": 0.18658798283261802, + "grad_norm": 1.1605569124221802, + "learning_rate": 4.686946954023245e-06, + "loss": 2.3036, + "step": 3478 + }, + { + "epoch": 0.18664163090128755, + "grad_norm": 1.2896679639816284, + "learning_rate": 4.6867364436276045e-06, + "loss": 2.3141, + "step": 3479 + }, + { + "epoch": 0.18669527896995708, + "grad_norm": 1.1368482112884521, + "learning_rate": 4.6865258672083445e-06, + "loss": 2.2539, + "step": 3480 + }, + { + "epoch": 0.1867489270386266, + "grad_norm": 1.1594871282577515, + "learning_rate": 4.686315224771821e-06, + "loss": 2.578, + "step": 3481 + }, + { + "epoch": 0.18680257510729614, + "grad_norm": 2.3387207984924316, + "learning_rate": 4.686104516324396e-06, + "loss": 2.3763, + "step": 3482 + }, + { + "epoch": 0.18685622317596567, + "grad_norm": 1.3232580423355103, + "learning_rate": 4.685893741872429e-06, + "loss": 2.5792, + "step": 3483 + }, + { + "epoch": 0.1869098712446352, + "grad_norm": 1.279096245765686, + "learning_rate": 4.685682901422286e-06, + "loss": 2.2118, + "step": 3484 + }, + { + "epoch": 0.1869635193133047, + "grad_norm": 1.150031566619873, + "learning_rate": 4.6854719949803315e-06, + "loss": 2.3647, + "step": 3485 + }, + { + "epoch": 0.18701716738197424, + "grad_norm": 1.2252492904663086, + "learning_rate": 4.685261022552935e-06, + "loss": 2.2716, + "step": 3486 + }, + { + "epoch": 0.18707081545064377, + "grad_norm": 1.3255879878997803, + "learning_rate": 4.685049984146464e-06, + "loss": 2.2407, + "step": 3487 + }, + { + "epoch": 0.1871244635193133, + "grad_norm": 1.780170202255249, + "learning_rate": 4.684838879767291e-06, + "loss": 2.0399, + "step": 3488 + }, + { + "epoch": 0.18717811158798284, + "grad_norm": 1.2680577039718628, + "learning_rate": 4.6846277094217915e-06, + "loss": 2.2847, + "step": 3489 + }, + { + "epoch": 0.18723175965665237, + "grad_norm": 1.1650372743606567, + "learning_rate": 4.68441647311634e-06, + "loss": 2.3024, + "step": 3490 + }, + { + "epoch": 0.1872854077253219, + "grad_norm": 1.2956602573394775, + "learning_rate": 4.684205170857315e-06, + "loss": 2.441, + "step": 3491 + }, + { + "epoch": 0.1873390557939914, + "grad_norm": 1.2169373035430908, + "learning_rate": 4.6839938026510945e-06, + "loss": 2.1749, + "step": 3492 + }, + { + "epoch": 0.18739270386266094, + "grad_norm": 1.308790683746338, + "learning_rate": 4.683782368504063e-06, + "loss": 2.2875, + "step": 3493 + }, + { + "epoch": 0.18744635193133047, + "grad_norm": 1.277746558189392, + "learning_rate": 4.683570868422602e-06, + "loss": 2.4709, + "step": 3494 + }, + { + "epoch": 0.1875, + "grad_norm": 1.8639328479766846, + "learning_rate": 4.683359302413098e-06, + "loss": 2.1424, + "step": 3495 + }, + { + "epoch": 0.18755364806866953, + "grad_norm": 1.271832823753357, + "learning_rate": 4.683147670481939e-06, + "loss": 2.2323, + "step": 3496 + }, + { + "epoch": 0.18760729613733906, + "grad_norm": 2.4120025634765625, + "learning_rate": 4.6829359726355146e-06, + "loss": 2.2998, + "step": 3497 + }, + { + "epoch": 0.1876609442060086, + "grad_norm": 2.5653023719787598, + "learning_rate": 4.682724208880216e-06, + "loss": 2.2429, + "step": 3498 + }, + { + "epoch": 0.1877145922746781, + "grad_norm": 1.347082495689392, + "learning_rate": 4.682512379222438e-06, + "loss": 2.2919, + "step": 3499 + }, + { + "epoch": 0.18776824034334763, + "grad_norm": 1.876242995262146, + "learning_rate": 4.682300483668577e-06, + "loss": 2.5072, + "step": 3500 + }, + { + "epoch": 0.18782188841201716, + "grad_norm": 1.8499375581741333, + "learning_rate": 4.682088522225028e-06, + "loss": 2.2409, + "step": 3501 + }, + { + "epoch": 0.1878755364806867, + "grad_norm": 1.176655888557434, + "learning_rate": 4.6818764948981926e-06, + "loss": 2.2344, + "step": 3502 + }, + { + "epoch": 0.18792918454935623, + "grad_norm": 1.1528737545013428, + "learning_rate": 4.6816644016944725e-06, + "loss": 2.2553, + "step": 3503 + }, + { + "epoch": 0.18798283261802576, + "grad_norm": 1.6741968393325806, + "learning_rate": 4.6814522426202705e-06, + "loss": 1.5094, + "step": 3504 + }, + { + "epoch": 0.1880364806866953, + "grad_norm": 1.2595505714416504, + "learning_rate": 4.681240017681994e-06, + "loss": 2.4149, + "step": 3505 + }, + { + "epoch": 0.1880901287553648, + "grad_norm": 1.366628885269165, + "learning_rate": 4.681027726886048e-06, + "loss": 2.2867, + "step": 3506 + }, + { + "epoch": 0.18814377682403433, + "grad_norm": 1.6810380220413208, + "learning_rate": 4.680815370238844e-06, + "loss": 2.3519, + "step": 3507 + }, + { + "epoch": 0.18819742489270386, + "grad_norm": 1.2418636083602905, + "learning_rate": 4.680602947746793e-06, + "loss": 2.255, + "step": 3508 + }, + { + "epoch": 0.1882510729613734, + "grad_norm": 1.3633884191513062, + "learning_rate": 4.680390459416309e-06, + "loss": 2.1862, + "step": 3509 + }, + { + "epoch": 0.18830472103004292, + "grad_norm": 1.6661665439605713, + "learning_rate": 4.680177905253808e-06, + "loss": 2.517, + "step": 3510 + }, + { + "epoch": 0.18835836909871245, + "grad_norm": 1.3546892404556274, + "learning_rate": 4.679965285265706e-06, + "loss": 2.4987, + "step": 3511 + }, + { + "epoch": 0.18841201716738198, + "grad_norm": 1.1302149295806885, + "learning_rate": 4.6797525994584246e-06, + "loss": 2.3166, + "step": 3512 + }, + { + "epoch": 0.1884656652360515, + "grad_norm": 1.3671995401382446, + "learning_rate": 4.679539847838385e-06, + "loss": 2.0621, + "step": 3513 + }, + { + "epoch": 0.18851931330472102, + "grad_norm": 1.1462236642837524, + "learning_rate": 4.679327030412009e-06, + "loss": 2.2146, + "step": 3514 + }, + { + "epoch": 0.18857296137339055, + "grad_norm": 1.8315908908843994, + "learning_rate": 4.679114147185723e-06, + "loss": 2.2718, + "step": 3515 + }, + { + "epoch": 0.18862660944206008, + "grad_norm": 1.0322917699813843, + "learning_rate": 4.6789011981659555e-06, + "loss": 1.9576, + "step": 3516 + }, + { + "epoch": 0.18868025751072962, + "grad_norm": 1.3153645992279053, + "learning_rate": 4.678688183359136e-06, + "loss": 2.2188, + "step": 3517 + }, + { + "epoch": 0.18873390557939915, + "grad_norm": 1.4579088687896729, + "learning_rate": 4.678475102771694e-06, + "loss": 2.3862, + "step": 3518 + }, + { + "epoch": 0.18878755364806868, + "grad_norm": 1.331779956817627, + "learning_rate": 4.678261956410065e-06, + "loss": 2.2316, + "step": 3519 + }, + { + "epoch": 0.1888412017167382, + "grad_norm": 1.311563491821289, + "learning_rate": 4.678048744280684e-06, + "loss": 2.4204, + "step": 3520 + }, + { + "epoch": 0.18889484978540771, + "grad_norm": 1.0316811800003052, + "learning_rate": 4.677835466389988e-06, + "loss": 2.1278, + "step": 3521 + }, + { + "epoch": 0.18894849785407725, + "grad_norm": 1.2204980850219727, + "learning_rate": 4.677622122744418e-06, + "loss": 2.2966, + "step": 3522 + }, + { + "epoch": 0.18900214592274678, + "grad_norm": 1.2745643854141235, + "learning_rate": 4.677408713350412e-06, + "loss": 2.2574, + "step": 3523 + }, + { + "epoch": 0.1890557939914163, + "grad_norm": 1.0805141925811768, + "learning_rate": 4.677195238214417e-06, + "loss": 2.2976, + "step": 3524 + }, + { + "epoch": 0.18910944206008584, + "grad_norm": 1.4277303218841553, + "learning_rate": 4.6769816973428775e-06, + "loss": 2.0568, + "step": 3525 + }, + { + "epoch": 0.18916309012875537, + "grad_norm": 1.3099827766418457, + "learning_rate": 4.676768090742239e-06, + "loss": 2.2685, + "step": 3526 + }, + { + "epoch": 0.1892167381974249, + "grad_norm": 1.3297253847122192, + "learning_rate": 4.676554418418953e-06, + "loss": 2.3646, + "step": 3527 + }, + { + "epoch": 0.1892703862660944, + "grad_norm": 1.1348944902420044, + "learning_rate": 4.6763406803794705e-06, + "loss": 2.0507, + "step": 3528 + }, + { + "epoch": 0.18932403433476394, + "grad_norm": 1.2010785341262817, + "learning_rate": 4.676126876630244e-06, + "loss": 2.3294, + "step": 3529 + }, + { + "epoch": 0.18937768240343347, + "grad_norm": 1.2042897939682007, + "learning_rate": 4.675913007177729e-06, + "loss": 2.4938, + "step": 3530 + }, + { + "epoch": 0.189431330472103, + "grad_norm": 1.1656131744384766, + "learning_rate": 4.675699072028383e-06, + "loss": 2.1307, + "step": 3531 + }, + { + "epoch": 0.18948497854077254, + "grad_norm": 1.298869013786316, + "learning_rate": 4.6754850711886665e-06, + "loss": 2.5398, + "step": 3532 + }, + { + "epoch": 0.18953862660944207, + "grad_norm": 1.3676540851593018, + "learning_rate": 4.67527100466504e-06, + "loss": 2.1475, + "step": 3533 + }, + { + "epoch": 0.1895922746781116, + "grad_norm": 1.240015983581543, + "learning_rate": 4.675056872463965e-06, + "loss": 2.2761, + "step": 3534 + }, + { + "epoch": 0.1896459227467811, + "grad_norm": 1.3519331216812134, + "learning_rate": 4.6748426745919084e-06, + "loss": 2.3267, + "step": 3535 + }, + { + "epoch": 0.18969957081545064, + "grad_norm": 1.279384732246399, + "learning_rate": 4.674628411055338e-06, + "loss": 2.2055, + "step": 3536 + }, + { + "epoch": 0.18975321888412017, + "grad_norm": 1.1871554851531982, + "learning_rate": 4.674414081860722e-06, + "loss": 2.3028, + "step": 3537 + }, + { + "epoch": 0.1898068669527897, + "grad_norm": 1.193011999130249, + "learning_rate": 4.674199687014531e-06, + "loss": 2.3093, + "step": 3538 + }, + { + "epoch": 0.18986051502145923, + "grad_norm": 1.0564366579055786, + "learning_rate": 4.67398522652324e-06, + "loss": 2.3846, + "step": 3539 + }, + { + "epoch": 0.18991416309012876, + "grad_norm": 1.7341244220733643, + "learning_rate": 4.673770700393323e-06, + "loss": 2.536, + "step": 3540 + }, + { + "epoch": 0.1899678111587983, + "grad_norm": 1.2732852697372437, + "learning_rate": 4.673556108631257e-06, + "loss": 2.6109, + "step": 3541 + }, + { + "epoch": 0.1900214592274678, + "grad_norm": 1.182658076286316, + "learning_rate": 4.6733414512435215e-06, + "loss": 2.2112, + "step": 3542 + }, + { + "epoch": 0.19007510729613733, + "grad_norm": 1.420609474182129, + "learning_rate": 4.673126728236598e-06, + "loss": 2.3941, + "step": 3543 + }, + { + "epoch": 0.19012875536480686, + "grad_norm": 1.200242042541504, + "learning_rate": 4.672911939616968e-06, + "loss": 2.3224, + "step": 3544 + }, + { + "epoch": 0.1901824034334764, + "grad_norm": 1.2135869264602661, + "learning_rate": 4.672697085391118e-06, + "loss": 2.3561, + "step": 3545 + }, + { + "epoch": 0.19023605150214593, + "grad_norm": 5.383236408233643, + "learning_rate": 4.672482165565535e-06, + "loss": 2.3288, + "step": 3546 + }, + { + "epoch": 0.19028969957081546, + "grad_norm": 1.2978312969207764, + "learning_rate": 4.672267180146708e-06, + "loss": 2.5017, + "step": 3547 + }, + { + "epoch": 0.190343347639485, + "grad_norm": 1.1624550819396973, + "learning_rate": 4.672052129141127e-06, + "loss": 2.3361, + "step": 3548 + }, + { + "epoch": 0.1903969957081545, + "grad_norm": 1.1709518432617188, + "learning_rate": 4.671837012555286e-06, + "loss": 2.2725, + "step": 3549 + }, + { + "epoch": 0.19045064377682402, + "grad_norm": 1.2186332941055298, + "learning_rate": 4.67162183039568e-06, + "loss": 2.494, + "step": 3550 + }, + { + "epoch": 0.19050429184549356, + "grad_norm": 1.4235016107559204, + "learning_rate": 4.671406582668805e-06, + "loss": 2.29, + "step": 3551 + }, + { + "epoch": 0.1905579399141631, + "grad_norm": 1.33878493309021, + "learning_rate": 4.671191269381161e-06, + "loss": 2.4113, + "step": 3552 + }, + { + "epoch": 0.19061158798283262, + "grad_norm": 1.3717186450958252, + "learning_rate": 4.6709758905392485e-06, + "loss": 2.2685, + "step": 3553 + }, + { + "epoch": 0.19066523605150215, + "grad_norm": 1.1963114738464355, + "learning_rate": 4.67076044614957e-06, + "loss": 2.328, + "step": 3554 + }, + { + "epoch": 0.19071888412017168, + "grad_norm": 1.1665699481964111, + "learning_rate": 4.67054493621863e-06, + "loss": 2.2526, + "step": 3555 + }, + { + "epoch": 0.19077253218884122, + "grad_norm": 1.3046523332595825, + "learning_rate": 4.6703293607529375e-06, + "loss": 2.1855, + "step": 3556 + }, + { + "epoch": 0.19082618025751072, + "grad_norm": 1.2440407276153564, + "learning_rate": 4.670113719759e-06, + "loss": 2.1408, + "step": 3557 + }, + { + "epoch": 0.19087982832618025, + "grad_norm": 1.2993977069854736, + "learning_rate": 4.669898013243327e-06, + "loss": 2.2726, + "step": 3558 + }, + { + "epoch": 0.19093347639484978, + "grad_norm": 1.2645074129104614, + "learning_rate": 4.669682241212432e-06, + "loss": 2.1848, + "step": 3559 + }, + { + "epoch": 0.19098712446351931, + "grad_norm": 1.3097604513168335, + "learning_rate": 4.669466403672831e-06, + "loss": 2.3236, + "step": 3560 + }, + { + "epoch": 0.19104077253218885, + "grad_norm": 1.4354110956192017, + "learning_rate": 4.66925050063104e-06, + "loss": 2.2474, + "step": 3561 + }, + { + "epoch": 0.19109442060085838, + "grad_norm": 1.2526880502700806, + "learning_rate": 4.6690345320935785e-06, + "loss": 1.751, + "step": 3562 + }, + { + "epoch": 0.1911480686695279, + "grad_norm": 1.134473443031311, + "learning_rate": 4.668818498066964e-06, + "loss": 2.0988, + "step": 3563 + }, + { + "epoch": 0.1912017167381974, + "grad_norm": 1.1148837804794312, + "learning_rate": 4.668602398557723e-06, + "loss": 2.2885, + "step": 3564 + }, + { + "epoch": 0.19125536480686695, + "grad_norm": 1.2057620286941528, + "learning_rate": 4.668386233572379e-06, + "loss": 2.2864, + "step": 3565 + }, + { + "epoch": 0.19130901287553648, + "grad_norm": 1.6614640951156616, + "learning_rate": 4.668170003117457e-06, + "loss": 2.1159, + "step": 3566 + }, + { + "epoch": 0.191362660944206, + "grad_norm": 1.0109710693359375, + "learning_rate": 4.667953707199487e-06, + "loss": 1.8667, + "step": 3567 + }, + { + "epoch": 0.19141630901287554, + "grad_norm": 1.339402437210083, + "learning_rate": 4.667737345825e-06, + "loss": 2.4122, + "step": 3568 + }, + { + "epoch": 0.19146995708154507, + "grad_norm": 4.490293025970459, + "learning_rate": 4.667520919000528e-06, + "loss": 2.2701, + "step": 3569 + }, + { + "epoch": 0.1915236051502146, + "grad_norm": 1.3692729473114014, + "learning_rate": 4.667304426732605e-06, + "loss": 2.1961, + "step": 3570 + }, + { + "epoch": 0.1915772532188841, + "grad_norm": 1.2795538902282715, + "learning_rate": 4.667087869027768e-06, + "loss": 2.3986, + "step": 3571 + }, + { + "epoch": 0.19163090128755364, + "grad_norm": 21.75772476196289, + "learning_rate": 4.666871245892556e-06, + "loss": 2.1832, + "step": 3572 + }, + { + "epoch": 0.19168454935622317, + "grad_norm": 33.11798095703125, + "learning_rate": 4.666654557333509e-06, + "loss": 2.0066, + "step": 3573 + }, + { + "epoch": 0.1917381974248927, + "grad_norm": 1.18890380859375, + "learning_rate": 4.666437803357168e-06, + "loss": 2.3515, + "step": 3574 + }, + { + "epoch": 0.19179184549356224, + "grad_norm": 1.2441178560256958, + "learning_rate": 4.6662209839700805e-06, + "loss": 2.2006, + "step": 3575 + }, + { + "epoch": 0.19184549356223177, + "grad_norm": 1.101897954940796, + "learning_rate": 4.666004099178789e-06, + "loss": 2.279, + "step": 3576 + }, + { + "epoch": 0.1918991416309013, + "grad_norm": 5.953339576721191, + "learning_rate": 4.665787148989845e-06, + "loss": 2.3313, + "step": 3577 + }, + { + "epoch": 0.1919527896995708, + "grad_norm": 2.2388110160827637, + "learning_rate": 4.665570133409799e-06, + "loss": 2.408, + "step": 3578 + }, + { + "epoch": 0.19200643776824033, + "grad_norm": 1.207289457321167, + "learning_rate": 4.6653530524452e-06, + "loss": 2.3075, + "step": 3579 + }, + { + "epoch": 0.19206008583690987, + "grad_norm": 1.3266401290893555, + "learning_rate": 4.665135906102606e-06, + "loss": 2.186, + "step": 3580 + }, + { + "epoch": 0.1921137339055794, + "grad_norm": 1.1596747636795044, + "learning_rate": 4.664918694388571e-06, + "loss": 2.1192, + "step": 3581 + }, + { + "epoch": 0.19216738197424893, + "grad_norm": 1.2349233627319336, + "learning_rate": 4.664701417309653e-06, + "loss": 2.4396, + "step": 3582 + }, + { + "epoch": 0.19222103004291846, + "grad_norm": 1.1601699590682983, + "learning_rate": 4.664484074872414e-06, + "loss": 2.3772, + "step": 3583 + }, + { + "epoch": 0.192274678111588, + "grad_norm": 1.5814498662948608, + "learning_rate": 4.664266667083416e-06, + "loss": 2.2161, + "step": 3584 + }, + { + "epoch": 0.1923283261802575, + "grad_norm": 1.3311710357666016, + "learning_rate": 4.6640491939492205e-06, + "loss": 2.4009, + "step": 3585 + }, + { + "epoch": 0.19238197424892703, + "grad_norm": 1.2327566146850586, + "learning_rate": 4.663831655476396e-06, + "loss": 2.2926, + "step": 3586 + }, + { + "epoch": 0.19243562231759656, + "grad_norm": 1.0314773321151733, + "learning_rate": 4.663614051671511e-06, + "loss": 2.1918, + "step": 3587 + }, + { + "epoch": 0.1924892703862661, + "grad_norm": 1.332322120666504, + "learning_rate": 4.663396382541133e-06, + "loss": 2.0701, + "step": 3588 + }, + { + "epoch": 0.19254291845493562, + "grad_norm": 2.494562864303589, + "learning_rate": 4.663178648091837e-06, + "loss": 2.1177, + "step": 3589 + }, + { + "epoch": 0.19259656652360516, + "grad_norm": 1.0249983072280884, + "learning_rate": 4.662960848330195e-06, + "loss": 1.9666, + "step": 3590 + }, + { + "epoch": 0.1926502145922747, + "grad_norm": 1.0773414373397827, + "learning_rate": 4.6627429832627835e-06, + "loss": 2.0498, + "step": 3591 + }, + { + "epoch": 0.19270386266094422, + "grad_norm": 1.2529000043869019, + "learning_rate": 4.662525052896182e-06, + "loss": 2.2055, + "step": 3592 + }, + { + "epoch": 0.19275751072961372, + "grad_norm": 1.3041211366653442, + "learning_rate": 4.6623070572369675e-06, + "loss": 2.151, + "step": 3593 + }, + { + "epoch": 0.19281115879828326, + "grad_norm": 1.2334107160568237, + "learning_rate": 4.662088996291723e-06, + "loss": 2.008, + "step": 3594 + }, + { + "epoch": 0.1928648068669528, + "grad_norm": 1.0371265411376953, + "learning_rate": 4.661870870067033e-06, + "loss": 1.8776, + "step": 3595 + }, + { + "epoch": 0.19291845493562232, + "grad_norm": 1.1485832929611206, + "learning_rate": 4.661652678569483e-06, + "loss": 2.125, + "step": 3596 + }, + { + "epoch": 0.19297210300429185, + "grad_norm": 1.4199310541152954, + "learning_rate": 4.661434421805662e-06, + "loss": 2.3762, + "step": 3597 + }, + { + "epoch": 0.19302575107296138, + "grad_norm": 1.1334068775177002, + "learning_rate": 4.661216099782159e-06, + "loss": 2.4145, + "step": 3598 + }, + { + "epoch": 0.19307939914163091, + "grad_norm": 2.2847933769226074, + "learning_rate": 4.6609977125055636e-06, + "loss": 2.3188, + "step": 3599 + }, + { + "epoch": 0.19313304721030042, + "grad_norm": 1.1959174871444702, + "learning_rate": 4.6607792599824716e-06, + "loss": 2.4547, + "step": 3600 + }, + { + "epoch": 0.19318669527896995, + "grad_norm": 1.3965126276016235, + "learning_rate": 4.66056074221948e-06, + "loss": 2.3331, + "step": 3601 + }, + { + "epoch": 0.19324034334763948, + "grad_norm": 1.177359938621521, + "learning_rate": 4.660342159223183e-06, + "loss": 2.3647, + "step": 3602 + }, + { + "epoch": 0.193293991416309, + "grad_norm": 1.110926866531372, + "learning_rate": 4.660123511000183e-06, + "loss": 2.3028, + "step": 3603 + }, + { + "epoch": 0.19334763948497855, + "grad_norm": 1.036074161529541, + "learning_rate": 4.659904797557081e-06, + "loss": 2.249, + "step": 3604 + }, + { + "epoch": 0.19340128755364808, + "grad_norm": 1.1773415803909302, + "learning_rate": 4.659686018900479e-06, + "loss": 2.2608, + "step": 3605 + }, + { + "epoch": 0.1934549356223176, + "grad_norm": 1.4209520816802979, + "learning_rate": 4.659467175036985e-06, + "loss": 1.9854, + "step": 3606 + }, + { + "epoch": 0.1935085836909871, + "grad_norm": 1.2295329570770264, + "learning_rate": 4.6592482659732056e-06, + "loss": 2.1525, + "step": 3607 + }, + { + "epoch": 0.19356223175965664, + "grad_norm": 1.2311673164367676, + "learning_rate": 4.659029291715749e-06, + "loss": 2.0152, + "step": 3608 + }, + { + "epoch": 0.19361587982832618, + "grad_norm": 1.1897872686386108, + "learning_rate": 4.658810252271227e-06, + "loss": 2.3974, + "step": 3609 + }, + { + "epoch": 0.1936695278969957, + "grad_norm": 1.2521212100982666, + "learning_rate": 4.658591147646254e-06, + "loss": 2.2654, + "step": 3610 + }, + { + "epoch": 0.19372317596566524, + "grad_norm": 1.3074500560760498, + "learning_rate": 4.658371977847446e-06, + "loss": 2.363, + "step": 3611 + }, + { + "epoch": 0.19377682403433477, + "grad_norm": 1.2901840209960938, + "learning_rate": 4.658152742881419e-06, + "loss": 2.2049, + "step": 3612 + }, + { + "epoch": 0.1938304721030043, + "grad_norm": 3.313418388366699, + "learning_rate": 4.657933442754791e-06, + "loss": 2.3749, + "step": 3613 + }, + { + "epoch": 0.1938841201716738, + "grad_norm": 1.3726825714111328, + "learning_rate": 4.6577140774741864e-06, + "loss": 2.0487, + "step": 3614 + }, + { + "epoch": 0.19393776824034334, + "grad_norm": 1.1878173351287842, + "learning_rate": 4.6574946470462264e-06, + "loss": 2.3139, + "step": 3615 + }, + { + "epoch": 0.19399141630901287, + "grad_norm": 1.2353484630584717, + "learning_rate": 4.657275151477537e-06, + "loss": 2.1272, + "step": 3616 + }, + { + "epoch": 0.1940450643776824, + "grad_norm": 1.3373301029205322, + "learning_rate": 4.657055590774745e-06, + "loss": 2.1868, + "step": 3617 + }, + { + "epoch": 0.19409871244635193, + "grad_norm": 1.3088189363479614, + "learning_rate": 4.65683596494448e-06, + "loss": 2.115, + "step": 3618 + }, + { + "epoch": 0.19415236051502147, + "grad_norm": 1.3089812994003296, + "learning_rate": 4.656616273993372e-06, + "loss": 2.292, + "step": 3619 + }, + { + "epoch": 0.194206008583691, + "grad_norm": 1.2452466487884521, + "learning_rate": 4.656396517928056e-06, + "loss": 2.052, + "step": 3620 + }, + { + "epoch": 0.1942596566523605, + "grad_norm": 1.3931708335876465, + "learning_rate": 4.656176696755164e-06, + "loss": 2.306, + "step": 3621 + }, + { + "epoch": 0.19431330472103003, + "grad_norm": 7.421163558959961, + "learning_rate": 4.6559568104813364e-06, + "loss": 2.4714, + "step": 3622 + }, + { + "epoch": 0.19436695278969957, + "grad_norm": 1.3557257652282715, + "learning_rate": 4.655736859113211e-06, + "loss": 2.3254, + "step": 3623 + }, + { + "epoch": 0.1944206008583691, + "grad_norm": 1.1688828468322754, + "learning_rate": 4.6555168426574285e-06, + "loss": 2.1758, + "step": 3624 + }, + { + "epoch": 0.19447424892703863, + "grad_norm": 1.2879763841629028, + "learning_rate": 4.655296761120632e-06, + "loss": 2.3733, + "step": 3625 + }, + { + "epoch": 0.19452789699570816, + "grad_norm": 1.2558258771896362, + "learning_rate": 4.655076614509464e-06, + "loss": 2.166, + "step": 3626 + }, + { + "epoch": 0.1945815450643777, + "grad_norm": 1.2599647045135498, + "learning_rate": 4.654856402830575e-06, + "loss": 2.3186, + "step": 3627 + }, + { + "epoch": 0.19463519313304722, + "grad_norm": 1.3302412033081055, + "learning_rate": 4.654636126090612e-06, + "loss": 2.2258, + "step": 3628 + }, + { + "epoch": 0.19468884120171673, + "grad_norm": 1.495296597480774, + "learning_rate": 4.6544157842962256e-06, + "loss": 2.4904, + "step": 3629 + }, + { + "epoch": 0.19474248927038626, + "grad_norm": 1.4503979682922363, + "learning_rate": 4.65419537745407e-06, + "loss": 2.2142, + "step": 3630 + }, + { + "epoch": 0.1947961373390558, + "grad_norm": 1.1210914850234985, + "learning_rate": 4.6539749055707975e-06, + "loss": 2.168, + "step": 3631 + }, + { + "epoch": 0.19484978540772532, + "grad_norm": 0.9265280365943909, + "learning_rate": 4.6537543686530665e-06, + "loss": 1.9762, + "step": 3632 + }, + { + "epoch": 0.19490343347639486, + "grad_norm": 1.0267680883407593, + "learning_rate": 4.6535337667075345e-06, + "loss": 1.9292, + "step": 3633 + }, + { + "epoch": 0.1949570815450644, + "grad_norm": 1.2618517875671387, + "learning_rate": 4.6533130997408635e-06, + "loss": 2.303, + "step": 3634 + }, + { + "epoch": 0.19501072961373392, + "grad_norm": 1.208715796470642, + "learning_rate": 4.653092367759716e-06, + "loss": 2.1315, + "step": 3635 + }, + { + "epoch": 0.19506437768240342, + "grad_norm": 1.345659852027893, + "learning_rate": 4.652871570770755e-06, + "loss": 2.2968, + "step": 3636 + }, + { + "epoch": 0.19511802575107295, + "grad_norm": 1.0620254278182983, + "learning_rate": 4.652650708780648e-06, + "loss": 2.0906, + "step": 3637 + }, + { + "epoch": 0.19517167381974249, + "grad_norm": 1.495103120803833, + "learning_rate": 4.652429781796064e-06, + "loss": 2.7103, + "step": 3638 + }, + { + "epoch": 0.19522532188841202, + "grad_norm": 1.3775628805160522, + "learning_rate": 4.6522087898236715e-06, + "loss": 2.0486, + "step": 3639 + }, + { + "epoch": 0.19527896995708155, + "grad_norm": 1.167701244354248, + "learning_rate": 4.651987732870144e-06, + "loss": 2.1937, + "step": 3640 + }, + { + "epoch": 0.19533261802575108, + "grad_norm": 1.382943868637085, + "learning_rate": 4.651766610942156e-06, + "loss": 2.0779, + "step": 3641 + }, + { + "epoch": 0.1953862660944206, + "grad_norm": 1.7146567106246948, + "learning_rate": 4.651545424046384e-06, + "loss": 2.2874, + "step": 3642 + }, + { + "epoch": 0.19543991416309012, + "grad_norm": 1.3333243131637573, + "learning_rate": 4.651324172189506e-06, + "loss": 2.2309, + "step": 3643 + }, + { + "epoch": 0.19549356223175965, + "grad_norm": 1.3862327337265015, + "learning_rate": 4.651102855378201e-06, + "loss": 2.2788, + "step": 3644 + }, + { + "epoch": 0.19554721030042918, + "grad_norm": 2.3496007919311523, + "learning_rate": 4.650881473619153e-06, + "loss": 2.4064, + "step": 3645 + }, + { + "epoch": 0.1956008583690987, + "grad_norm": 1.3489352464675903, + "learning_rate": 4.650660026919046e-06, + "loss": 2.1096, + "step": 3646 + }, + { + "epoch": 0.19565450643776824, + "grad_norm": 3.5695536136627197, + "learning_rate": 4.6504385152845646e-06, + "loss": 2.1552, + "step": 3647 + }, + { + "epoch": 0.19570815450643778, + "grad_norm": 0.9061922430992126, + "learning_rate": 4.650216938722397e-06, + "loss": 2.01, + "step": 3648 + }, + { + "epoch": 0.1957618025751073, + "grad_norm": 31.034299850463867, + "learning_rate": 4.649995297239235e-06, + "loss": 2.315, + "step": 3649 + }, + { + "epoch": 0.1958154506437768, + "grad_norm": 1.4523001909255981, + "learning_rate": 4.649773590841769e-06, + "loss": 2.4729, + "step": 3650 + }, + { + "epoch": 0.19586909871244634, + "grad_norm": 1.2226629257202148, + "learning_rate": 4.649551819536694e-06, + "loss": 2.1775, + "step": 3651 + }, + { + "epoch": 0.19592274678111588, + "grad_norm": 1.250909686088562, + "learning_rate": 4.649329983330703e-06, + "loss": 2.5323, + "step": 3652 + }, + { + "epoch": 0.1959763948497854, + "grad_norm": 2.626577138900757, + "learning_rate": 4.649108082230499e-06, + "loss": 2.1885, + "step": 3653 + }, + { + "epoch": 0.19603004291845494, + "grad_norm": 1.2613577842712402, + "learning_rate": 4.648886116242778e-06, + "loss": 2.4616, + "step": 3654 + }, + { + "epoch": 0.19608369098712447, + "grad_norm": 1.1361145973205566, + "learning_rate": 4.648664085374242e-06, + "loss": 2.2283, + "step": 3655 + }, + { + "epoch": 0.196137339055794, + "grad_norm": 1.3440237045288086, + "learning_rate": 4.648441989631596e-06, + "loss": 2.1975, + "step": 3656 + }, + { + "epoch": 0.1961909871244635, + "grad_norm": 1.123000144958496, + "learning_rate": 4.648219829021546e-06, + "loss": 2.388, + "step": 3657 + }, + { + "epoch": 0.19624463519313304, + "grad_norm": 1.0534039735794067, + "learning_rate": 4.647997603550797e-06, + "loss": 2.0849, + "step": 3658 + }, + { + "epoch": 0.19629828326180257, + "grad_norm": 1.122397541999817, + "learning_rate": 4.647775313226061e-06, + "loss": 2.3112, + "step": 3659 + }, + { + "epoch": 0.1963519313304721, + "grad_norm": 1.2134307622909546, + "learning_rate": 4.647552958054049e-06, + "loss": 2.314, + "step": 3660 + }, + { + "epoch": 0.19640557939914163, + "grad_norm": 1.2176368236541748, + "learning_rate": 4.647330538041475e-06, + "loss": 2.1336, + "step": 3661 + }, + { + "epoch": 0.19645922746781116, + "grad_norm": 1.1325048208236694, + "learning_rate": 4.647108053195053e-06, + "loss": 2.129, + "step": 3662 + }, + { + "epoch": 0.1965128755364807, + "grad_norm": 3.9090774059295654, + "learning_rate": 4.646885503521501e-06, + "loss": 2.5249, + "step": 3663 + }, + { + "epoch": 0.1965665236051502, + "grad_norm": 1.244073748588562, + "learning_rate": 4.64666288902754e-06, + "loss": 2.1236, + "step": 3664 + }, + { + "epoch": 0.19662017167381973, + "grad_norm": 1.3762214183807373, + "learning_rate": 4.646440209719889e-06, + "loss": 2.2692, + "step": 3665 + }, + { + "epoch": 0.19667381974248926, + "grad_norm": 1.1827284097671509, + "learning_rate": 4.646217465605273e-06, + "loss": 2.2464, + "step": 3666 + }, + { + "epoch": 0.1967274678111588, + "grad_norm": 1.244786024093628, + "learning_rate": 4.645994656690417e-06, + "loss": 2.2322, + "step": 3667 + }, + { + "epoch": 0.19678111587982833, + "grad_norm": 1.2837826013565063, + "learning_rate": 4.645771782982047e-06, + "loss": 2.3547, + "step": 3668 + }, + { + "epoch": 0.19683476394849786, + "grad_norm": 1.0728979110717773, + "learning_rate": 4.6455488444868936e-06, + "loss": 2.3378, + "step": 3669 + }, + { + "epoch": 0.1968884120171674, + "grad_norm": 1.4267597198486328, + "learning_rate": 4.645325841211687e-06, + "loss": 2.453, + "step": 3670 + }, + { + "epoch": 0.19694206008583692, + "grad_norm": 1.0697736740112305, + "learning_rate": 4.64510277316316e-06, + "loss": 1.9464, + "step": 3671 + }, + { + "epoch": 0.19699570815450643, + "grad_norm": 1.169786810874939, + "learning_rate": 4.64487964034805e-06, + "loss": 2.3621, + "step": 3672 + }, + { + "epoch": 0.19704935622317596, + "grad_norm": 1.407608985900879, + "learning_rate": 4.644656442773091e-06, + "loss": 2.4342, + "step": 3673 + }, + { + "epoch": 0.1971030042918455, + "grad_norm": 1.292242407798767, + "learning_rate": 4.644433180445024e-06, + "loss": 2.3593, + "step": 3674 + }, + { + "epoch": 0.19715665236051502, + "grad_norm": 1.0376532077789307, + "learning_rate": 4.644209853370588e-06, + "loss": 2.1894, + "step": 3675 + }, + { + "epoch": 0.19721030042918455, + "grad_norm": 1.3240770101547241, + "learning_rate": 4.643986461556528e-06, + "loss": 2.3868, + "step": 3676 + }, + { + "epoch": 0.19726394849785409, + "grad_norm": 1.2710820436477661, + "learning_rate": 4.643763005009587e-06, + "loss": 2.2331, + "step": 3677 + }, + { + "epoch": 0.19731759656652362, + "grad_norm": 1.3726979494094849, + "learning_rate": 4.643539483736514e-06, + "loss": 2.4804, + "step": 3678 + }, + { + "epoch": 0.19737124463519312, + "grad_norm": 1.2373720407485962, + "learning_rate": 4.643315897744055e-06, + "loss": 2.3456, + "step": 3679 + }, + { + "epoch": 0.19742489270386265, + "grad_norm": 1.1499115228652954, + "learning_rate": 4.643092247038963e-06, + "loss": 2.1092, + "step": 3680 + }, + { + "epoch": 0.19747854077253219, + "grad_norm": 1.1649826765060425, + "learning_rate": 4.642868531627989e-06, + "loss": 1.9586, + "step": 3681 + }, + { + "epoch": 0.19753218884120172, + "grad_norm": 1.3701539039611816, + "learning_rate": 4.642644751517888e-06, + "loss": 2.5484, + "step": 3682 + }, + { + "epoch": 0.19758583690987125, + "grad_norm": 1.452637791633606, + "learning_rate": 4.642420906715417e-06, + "loss": 2.073, + "step": 3683 + }, + { + "epoch": 0.19763948497854078, + "grad_norm": 1.1668627262115479, + "learning_rate": 4.642196997227334e-06, + "loss": 2.1953, + "step": 3684 + }, + { + "epoch": 0.1976931330472103, + "grad_norm": 1.2274147272109985, + "learning_rate": 4.6419730230604e-06, + "loss": 2.2747, + "step": 3685 + }, + { + "epoch": 0.19774678111587982, + "grad_norm": 1.2802897691726685, + "learning_rate": 4.641748984221377e-06, + "loss": 2.3477, + "step": 3686 + }, + { + "epoch": 0.19780042918454935, + "grad_norm": 9.779991149902344, + "learning_rate": 4.6415248807170295e-06, + "loss": 2.4089, + "step": 3687 + }, + { + "epoch": 0.19785407725321888, + "grad_norm": 1.1982390880584717, + "learning_rate": 4.641300712554125e-06, + "loss": 2.2691, + "step": 3688 + }, + { + "epoch": 0.1979077253218884, + "grad_norm": 1.0289463996887207, + "learning_rate": 4.641076479739429e-06, + "loss": 2.2247, + "step": 3689 + }, + { + "epoch": 0.19796137339055794, + "grad_norm": 1.2301536798477173, + "learning_rate": 4.6408521822797134e-06, + "loss": 1.5722, + "step": 3690 + }, + { + "epoch": 0.19801502145922747, + "grad_norm": 1.6804865598678589, + "learning_rate": 4.64062782018175e-06, + "loss": 2.2831, + "step": 3691 + }, + { + "epoch": 0.198068669527897, + "grad_norm": 1.0852324962615967, + "learning_rate": 4.6404033934523135e-06, + "loss": 2.1961, + "step": 3692 + }, + { + "epoch": 0.1981223175965665, + "grad_norm": 1.1962088346481323, + "learning_rate": 4.640178902098179e-06, + "loss": 2.0727, + "step": 3693 + }, + { + "epoch": 0.19817596566523604, + "grad_norm": 1.4886806011199951, + "learning_rate": 4.6399543461261256e-06, + "loss": 2.4385, + "step": 3694 + }, + { + "epoch": 0.19822961373390557, + "grad_norm": 1.2767034769058228, + "learning_rate": 4.639729725542933e-06, + "loss": 2.172, + "step": 3695 + }, + { + "epoch": 0.1982832618025751, + "grad_norm": 1.4468457698822021, + "learning_rate": 4.639505040355383e-06, + "loss": 2.2391, + "step": 3696 + }, + { + "epoch": 0.19833690987124464, + "grad_norm": 1.7686817646026611, + "learning_rate": 4.639280290570258e-06, + "loss": 1.7449, + "step": 3697 + }, + { + "epoch": 0.19839055793991417, + "grad_norm": 1.1469653844833374, + "learning_rate": 4.639055476194347e-06, + "loss": 2.1473, + "step": 3698 + }, + { + "epoch": 0.1984442060085837, + "grad_norm": 1.5095516443252563, + "learning_rate": 4.638830597234434e-06, + "loss": 2.2474, + "step": 3699 + }, + { + "epoch": 0.1984978540772532, + "grad_norm": 1.4449721574783325, + "learning_rate": 4.638605653697311e-06, + "loss": 2.2742, + "step": 3700 + }, + { + "epoch": 0.19855150214592274, + "grad_norm": 1.3423619270324707, + "learning_rate": 4.638380645589771e-06, + "loss": 2.0988, + "step": 3701 + }, + { + "epoch": 0.19860515021459227, + "grad_norm": 1.5285801887512207, + "learning_rate": 4.638155572918604e-06, + "loss": 2.1954, + "step": 3702 + }, + { + "epoch": 0.1986587982832618, + "grad_norm": 1.3817273378372192, + "learning_rate": 4.6379304356906085e-06, + "loss": 2.4622, + "step": 3703 + }, + { + "epoch": 0.19871244635193133, + "grad_norm": 1.3577519655227661, + "learning_rate": 4.637705233912581e-06, + "loss": 2.091, + "step": 3704 + }, + { + "epoch": 0.19876609442060086, + "grad_norm": 1.1404502391815186, + "learning_rate": 4.63747996759132e-06, + "loss": 1.8069, + "step": 3705 + }, + { + "epoch": 0.1988197424892704, + "grad_norm": 1.7578426599502563, + "learning_rate": 4.637254636733628e-06, + "loss": 2.5767, + "step": 3706 + }, + { + "epoch": 0.19887339055793993, + "grad_norm": 1.1518572568893433, + "learning_rate": 4.637029241346309e-06, + "loss": 2.272, + "step": 3707 + }, + { + "epoch": 0.19892703862660943, + "grad_norm": 2.8595566749572754, + "learning_rate": 4.636803781436168e-06, + "loss": 2.3504, + "step": 3708 + }, + { + "epoch": 0.19898068669527896, + "grad_norm": 1.6838815212249756, + "learning_rate": 4.63657825701001e-06, + "loss": 2.2386, + "step": 3709 + }, + { + "epoch": 0.1990343347639485, + "grad_norm": 1.2520842552185059, + "learning_rate": 4.636352668074647e-06, + "loss": 2.474, + "step": 3710 + }, + { + "epoch": 0.19908798283261803, + "grad_norm": 1.386107087135315, + "learning_rate": 4.6361270146368895e-06, + "loss": 2.2791, + "step": 3711 + }, + { + "epoch": 0.19914163090128756, + "grad_norm": 1.2178764343261719, + "learning_rate": 4.63590129670355e-06, + "loss": 2.5027, + "step": 3712 + }, + { + "epoch": 0.1991952789699571, + "grad_norm": 1.1601449251174927, + "learning_rate": 4.635675514281444e-06, + "loss": 2.3535, + "step": 3713 + }, + { + "epoch": 0.19924892703862662, + "grad_norm": 1.4203969240188599, + "learning_rate": 4.6354496673773875e-06, + "loss": 1.5301, + "step": 3714 + }, + { + "epoch": 0.19930257510729613, + "grad_norm": 1.1996502876281738, + "learning_rate": 4.635223755998201e-06, + "loss": 2.3105, + "step": 3715 + }, + { + "epoch": 0.19935622317596566, + "grad_norm": 1.408998966217041, + "learning_rate": 4.634997780150705e-06, + "loss": 2.3694, + "step": 3716 + }, + { + "epoch": 0.1994098712446352, + "grad_norm": 1.4139456748962402, + "learning_rate": 4.6347717398417205e-06, + "loss": 2.1043, + "step": 3717 + }, + { + "epoch": 0.19946351931330472, + "grad_norm": 1.4600870609283447, + "learning_rate": 4.634545635078075e-06, + "loss": 2.4164, + "step": 3718 + }, + { + "epoch": 0.19951716738197425, + "grad_norm": 1.1632405519485474, + "learning_rate": 4.634319465866594e-06, + "loss": 2.1359, + "step": 3719 + }, + { + "epoch": 0.19957081545064378, + "grad_norm": 1.261743426322937, + "learning_rate": 4.634093232214106e-06, + "loss": 2.1436, + "step": 3720 + }, + { + "epoch": 0.19962446351931332, + "grad_norm": 1.2200193405151367, + "learning_rate": 4.6338669341274415e-06, + "loss": 2.1167, + "step": 3721 + }, + { + "epoch": 0.19967811158798282, + "grad_norm": 1.2095427513122559, + "learning_rate": 4.633640571613432e-06, + "loss": 2.2836, + "step": 3722 + }, + { + "epoch": 0.19973175965665235, + "grad_norm": 1.4819310903549194, + "learning_rate": 4.6334141446789156e-06, + "loss": 2.461, + "step": 3723 + }, + { + "epoch": 0.19978540772532188, + "grad_norm": 1.3756663799285889, + "learning_rate": 4.633187653330725e-06, + "loss": 2.2174, + "step": 3724 + }, + { + "epoch": 0.19983905579399142, + "grad_norm": 1.2319761514663696, + "learning_rate": 4.632961097575701e-06, + "loss": 2.442, + "step": 3725 + }, + { + "epoch": 0.19989270386266095, + "grad_norm": 1.2945011854171753, + "learning_rate": 4.6327344774206825e-06, + "loss": 2.3501, + "step": 3726 + }, + { + "epoch": 0.19994635193133048, + "grad_norm": 1.3036407232284546, + "learning_rate": 4.632507792872514e-06, + "loss": 2.2354, + "step": 3727 + }, + { + "epoch": 0.2, + "grad_norm": 1.1577447652816772, + "learning_rate": 4.632281043938036e-06, + "loss": 2.2633, + "step": 3728 + }, + { + "epoch": 0.20005364806866952, + "grad_norm": 1.2133798599243164, + "learning_rate": 4.6320542306240975e-06, + "loss": 2.2232, + "step": 3729 + }, + { + "epoch": 0.20010729613733905, + "grad_norm": 1.6016851663589478, + "learning_rate": 4.631827352937546e-06, + "loss": 2.2656, + "step": 3730 + }, + { + "epoch": 0.20016094420600858, + "grad_norm": 1.3274916410446167, + "learning_rate": 4.631600410885231e-06, + "loss": 2.5082, + "step": 3731 + }, + { + "epoch": 0.2002145922746781, + "grad_norm": 1.3525985479354858, + "learning_rate": 4.6313734044740055e-06, + "loss": 2.4353, + "step": 3732 + }, + { + "epoch": 0.20026824034334764, + "grad_norm": 1.674757480621338, + "learning_rate": 4.631146333710722e-06, + "loss": 2.3104, + "step": 3733 + }, + { + "epoch": 0.20032188841201717, + "grad_norm": 1.3631049394607544, + "learning_rate": 4.630919198602238e-06, + "loss": 2.1577, + "step": 3734 + }, + { + "epoch": 0.2003755364806867, + "grad_norm": 1.2909578084945679, + "learning_rate": 4.630691999155411e-06, + "loss": 2.0932, + "step": 3735 + }, + { + "epoch": 0.2004291845493562, + "grad_norm": 1.3406130075454712, + "learning_rate": 4.6304647353770995e-06, + "loss": 2.5019, + "step": 3736 + }, + { + "epoch": 0.20048283261802574, + "grad_norm": 1.590498924255371, + "learning_rate": 4.630237407274166e-06, + "loss": 1.967, + "step": 3737 + }, + { + "epoch": 0.20053648068669527, + "grad_norm": 1.4649293422698975, + "learning_rate": 4.630010014853475e-06, + "loss": 2.2236, + "step": 3738 + }, + { + "epoch": 0.2005901287553648, + "grad_norm": 1.3511090278625488, + "learning_rate": 4.629782558121891e-06, + "loss": 2.2423, + "step": 3739 + }, + { + "epoch": 0.20064377682403434, + "grad_norm": 1.9356008768081665, + "learning_rate": 4.629555037086283e-06, + "loss": 2.3868, + "step": 3740 + }, + { + "epoch": 0.20069742489270387, + "grad_norm": 1.466711163520813, + "learning_rate": 4.6293274517535184e-06, + "loss": 1.9777, + "step": 3741 + }, + { + "epoch": 0.2007510729613734, + "grad_norm": 8.2410888671875, + "learning_rate": 4.62909980213047e-06, + "loss": 2.3371, + "step": 3742 + }, + { + "epoch": 0.20080472103004293, + "grad_norm": 1.122536540031433, + "learning_rate": 4.62887208822401e-06, + "loss": 2.1531, + "step": 3743 + }, + { + "epoch": 0.20085836909871244, + "grad_norm": 1.2790428400039673, + "learning_rate": 4.628644310041016e-06, + "loss": 2.3046, + "step": 3744 + }, + { + "epoch": 0.20091201716738197, + "grad_norm": 1.2917968034744263, + "learning_rate": 4.628416467588363e-06, + "loss": 2.4666, + "step": 3745 + }, + { + "epoch": 0.2009656652360515, + "grad_norm": 1.4038149118423462, + "learning_rate": 4.62818856087293e-06, + "loss": 2.3096, + "step": 3746 + }, + { + "epoch": 0.20101931330472103, + "grad_norm": 1.3286137580871582, + "learning_rate": 4.627960589901601e-06, + "loss": 2.2549, + "step": 3747 + }, + { + "epoch": 0.20107296137339056, + "grad_norm": 2.239736557006836, + "learning_rate": 4.627732554681256e-06, + "loss": 2.2566, + "step": 3748 + }, + { + "epoch": 0.2011266094420601, + "grad_norm": 1.275478720664978, + "learning_rate": 4.627504455218782e-06, + "loss": 2.3135, + "step": 3749 + }, + { + "epoch": 0.20118025751072963, + "grad_norm": 1.3192983865737915, + "learning_rate": 4.627276291521064e-06, + "loss": 2.1942, + "step": 3750 + }, + { + "epoch": 0.20123390557939913, + "grad_norm": 1.3997247219085693, + "learning_rate": 4.6270480635949935e-06, + "loss": 2.1885, + "step": 3751 + }, + { + "epoch": 0.20128755364806866, + "grad_norm": 1.3167312145233154, + "learning_rate": 4.626819771447459e-06, + "loss": 2.3016, + "step": 3752 + }, + { + "epoch": 0.2013412017167382, + "grad_norm": 1.4369189739227295, + "learning_rate": 4.6265914150853544e-06, + "loss": 2.4939, + "step": 3753 + }, + { + "epoch": 0.20139484978540773, + "grad_norm": 1.2732185125350952, + "learning_rate": 4.626362994515574e-06, + "loss": 2.4472, + "step": 3754 + }, + { + "epoch": 0.20144849785407726, + "grad_norm": 1.285101294517517, + "learning_rate": 4.626134509745015e-06, + "loss": 2.2287, + "step": 3755 + }, + { + "epoch": 0.2015021459227468, + "grad_norm": 1.1932889223098755, + "learning_rate": 4.625905960780575e-06, + "loss": 2.4468, + "step": 3756 + }, + { + "epoch": 0.20155579399141632, + "grad_norm": 1.3584980964660645, + "learning_rate": 4.625677347629156e-06, + "loss": 2.4328, + "step": 3757 + }, + { + "epoch": 0.20160944206008583, + "grad_norm": 1.3915753364562988, + "learning_rate": 4.625448670297659e-06, + "loss": 2.4468, + "step": 3758 + }, + { + "epoch": 0.20166309012875536, + "grad_norm": 1.041346549987793, + "learning_rate": 4.6252199287929885e-06, + "loss": 2.3521, + "step": 3759 + }, + { + "epoch": 0.2017167381974249, + "grad_norm": 1.2436233758926392, + "learning_rate": 4.624991123122052e-06, + "loss": 2.2022, + "step": 3760 + }, + { + "epoch": 0.20177038626609442, + "grad_norm": 1.1524702310562134, + "learning_rate": 4.624762253291758e-06, + "loss": 2.4753, + "step": 3761 + }, + { + "epoch": 0.20182403433476395, + "grad_norm": 2.212296485900879, + "learning_rate": 4.624533319309014e-06, + "loss": 1.3085, + "step": 3762 + }, + { + "epoch": 0.20187768240343348, + "grad_norm": 1.178541898727417, + "learning_rate": 4.6243043211807355e-06, + "loss": 2.3574, + "step": 3763 + }, + { + "epoch": 0.20193133047210302, + "grad_norm": 2.4395627975463867, + "learning_rate": 4.624075258913834e-06, + "loss": 1.9602, + "step": 3764 + }, + { + "epoch": 0.20198497854077252, + "grad_norm": 1.1168386936187744, + "learning_rate": 4.623846132515227e-06, + "loss": 2.402, + "step": 3765 + }, + { + "epoch": 0.20203862660944205, + "grad_norm": 1.3017778396606445, + "learning_rate": 4.623616941991833e-06, + "loss": 2.3957, + "step": 3766 + }, + { + "epoch": 0.20209227467811158, + "grad_norm": 1.089753270149231, + "learning_rate": 4.6233876873505696e-06, + "loss": 2.2387, + "step": 3767 + }, + { + "epoch": 0.20214592274678111, + "grad_norm": 1.197166085243225, + "learning_rate": 4.623158368598361e-06, + "loss": 2.2796, + "step": 3768 + }, + { + "epoch": 0.20219957081545065, + "grad_norm": 1.3133816719055176, + "learning_rate": 4.622928985742129e-06, + "loss": 2.3475, + "step": 3769 + }, + { + "epoch": 0.20225321888412018, + "grad_norm": 2.7644522190093994, + "learning_rate": 4.622699538788801e-06, + "loss": 2.2082, + "step": 3770 + }, + { + "epoch": 0.2023068669527897, + "grad_norm": 1.2636334896087646, + "learning_rate": 4.622470027745303e-06, + "loss": 2.0564, + "step": 3771 + }, + { + "epoch": 0.20236051502145921, + "grad_norm": 1.1485320329666138, + "learning_rate": 4.622240452618566e-06, + "loss": 2.2279, + "step": 3772 + }, + { + "epoch": 0.20241416309012875, + "grad_norm": 1.2882862091064453, + "learning_rate": 4.622010813415521e-06, + "loss": 2.4148, + "step": 3773 + }, + { + "epoch": 0.20246781115879828, + "grad_norm": 1.1709342002868652, + "learning_rate": 4.621781110143101e-06, + "loss": 2.3785, + "step": 3774 + }, + { + "epoch": 0.2025214592274678, + "grad_norm": 1.4074974060058594, + "learning_rate": 4.621551342808243e-06, + "loss": 2.3115, + "step": 3775 + }, + { + "epoch": 0.20257510729613734, + "grad_norm": 1.4677814245224, + "learning_rate": 4.621321511417881e-06, + "loss": 2.3513, + "step": 3776 + }, + { + "epoch": 0.20262875536480687, + "grad_norm": 1.1087291240692139, + "learning_rate": 4.621091615978958e-06, + "loss": 2.1139, + "step": 3777 + }, + { + "epoch": 0.2026824034334764, + "grad_norm": 1.2703778743743896, + "learning_rate": 4.620861656498414e-06, + "loss": 2.3672, + "step": 3778 + }, + { + "epoch": 0.20273605150214594, + "grad_norm": 1.620661735534668, + "learning_rate": 4.62063163298319e-06, + "loss": 2.1721, + "step": 3779 + }, + { + "epoch": 0.20278969957081544, + "grad_norm": 1.1637080907821655, + "learning_rate": 4.6204015454402325e-06, + "loss": 2.2978, + "step": 3780 + }, + { + "epoch": 0.20284334763948497, + "grad_norm": 1.4243382215499878, + "learning_rate": 4.62017139387649e-06, + "loss": 2.2406, + "step": 3781 + }, + { + "epoch": 0.2028969957081545, + "grad_norm": 2.8579137325286865, + "learning_rate": 4.61994117829891e-06, + "loss": 2.1685, + "step": 3782 + }, + { + "epoch": 0.20295064377682404, + "grad_norm": 1.1859360933303833, + "learning_rate": 4.619710898714442e-06, + "loss": 2.2979, + "step": 3783 + }, + { + "epoch": 0.20300429184549357, + "grad_norm": 1.1524977684020996, + "learning_rate": 4.619480555130042e-06, + "loss": 2.1892, + "step": 3784 + }, + { + "epoch": 0.2030579399141631, + "grad_norm": 3.5306761264801025, + "learning_rate": 4.6192501475526615e-06, + "loss": 2.4009, + "step": 3785 + }, + { + "epoch": 0.20311158798283263, + "grad_norm": 1.1591706275939941, + "learning_rate": 4.61901967598926e-06, + "loss": 2.0782, + "step": 3786 + }, + { + "epoch": 0.20316523605150213, + "grad_norm": 1.1073267459869385, + "learning_rate": 4.6187891404467935e-06, + "loss": 2.2546, + "step": 3787 + }, + { + "epoch": 0.20321888412017167, + "grad_norm": 1.3203403949737549, + "learning_rate": 4.618558540932224e-06, + "loss": 2.4296, + "step": 3788 + }, + { + "epoch": 0.2032725321888412, + "grad_norm": 1.1785733699798584, + "learning_rate": 4.6183278774525135e-06, + "loss": 2.1377, + "step": 3789 + }, + { + "epoch": 0.20332618025751073, + "grad_norm": 1.0508460998535156, + "learning_rate": 4.6180971500146265e-06, + "loss": 2.0465, + "step": 3790 + }, + { + "epoch": 0.20337982832618026, + "grad_norm": 1.310874104499817, + "learning_rate": 4.617866358625529e-06, + "loss": 2.1938, + "step": 3791 + }, + { + "epoch": 0.2034334763948498, + "grad_norm": 1.2831422090530396, + "learning_rate": 4.617635503292189e-06, + "loss": 2.2748, + "step": 3792 + }, + { + "epoch": 0.20348712446351933, + "grad_norm": 1.2381889820098877, + "learning_rate": 4.617404584021579e-06, + "loss": 2.3735, + "step": 3793 + }, + { + "epoch": 0.20354077253218883, + "grad_norm": 1.1694697141647339, + "learning_rate": 4.617173600820667e-06, + "loss": 2.1043, + "step": 3794 + }, + { + "epoch": 0.20359442060085836, + "grad_norm": 1.1172897815704346, + "learning_rate": 4.616942553696431e-06, + "loss": 2.2191, + "step": 3795 + }, + { + "epoch": 0.2036480686695279, + "grad_norm": 1.2368327379226685, + "learning_rate": 4.616711442655843e-06, + "loss": 2.5827, + "step": 3796 + }, + { + "epoch": 0.20370171673819742, + "grad_norm": 1.1593455076217651, + "learning_rate": 4.616480267705885e-06, + "loss": 2.4757, + "step": 3797 + }, + { + "epoch": 0.20375536480686696, + "grad_norm": 1.147629976272583, + "learning_rate": 4.616249028853534e-06, + "loss": 2.3198, + "step": 3798 + }, + { + "epoch": 0.2038090128755365, + "grad_norm": 1.1638574600219727, + "learning_rate": 4.616017726105773e-06, + "loss": 2.2984, + "step": 3799 + }, + { + "epoch": 0.20386266094420602, + "grad_norm": 1.3640450239181519, + "learning_rate": 4.615786359469585e-06, + "loss": 1.9309, + "step": 3800 + }, + { + "epoch": 0.20391630901287552, + "grad_norm": 1.2929766178131104, + "learning_rate": 4.615554928951956e-06, + "loss": 2.2077, + "step": 3801 + }, + { + "epoch": 0.20396995708154506, + "grad_norm": 1.323526382446289, + "learning_rate": 4.615323434559874e-06, + "loss": 2.392, + "step": 3802 + }, + { + "epoch": 0.2040236051502146, + "grad_norm": 1.1255865097045898, + "learning_rate": 4.615091876300327e-06, + "loss": 2.2897, + "step": 3803 + }, + { + "epoch": 0.20407725321888412, + "grad_norm": 1.4588115215301514, + "learning_rate": 4.6148602541803076e-06, + "loss": 2.4303, + "step": 3804 + }, + { + "epoch": 0.20413090128755365, + "grad_norm": 1.2456674575805664, + "learning_rate": 4.614628568206809e-06, + "loss": 2.5019, + "step": 3805 + }, + { + "epoch": 0.20418454935622318, + "grad_norm": 1.07060706615448, + "learning_rate": 4.614396818386826e-06, + "loss": 2.0887, + "step": 3806 + }, + { + "epoch": 0.20423819742489271, + "grad_norm": 1.2038031816482544, + "learning_rate": 4.614165004727356e-06, + "loss": 1.6497, + "step": 3807 + }, + { + "epoch": 0.20429184549356222, + "grad_norm": 1.4909688234329224, + "learning_rate": 4.6139331272353985e-06, + "loss": 2.4389, + "step": 3808 + }, + { + "epoch": 0.20434549356223175, + "grad_norm": 1.328690767288208, + "learning_rate": 4.613701185917954e-06, + "loss": 2.3768, + "step": 3809 + }, + { + "epoch": 0.20439914163090128, + "grad_norm": 1.305776596069336, + "learning_rate": 4.613469180782024e-06, + "loss": 2.4612, + "step": 3810 + }, + { + "epoch": 0.2044527896995708, + "grad_norm": 1.2762460708618164, + "learning_rate": 4.613237111834616e-06, + "loss": 2.629, + "step": 3811 + }, + { + "epoch": 0.20450643776824035, + "grad_norm": 1.0543373823165894, + "learning_rate": 4.6130049790827366e-06, + "loss": 2.3366, + "step": 3812 + }, + { + "epoch": 0.20456008583690988, + "grad_norm": 1.3244009017944336, + "learning_rate": 4.612772782533393e-06, + "loss": 2.3761, + "step": 3813 + }, + { + "epoch": 0.2046137339055794, + "grad_norm": 1.328403115272522, + "learning_rate": 4.612540522193596e-06, + "loss": 2.2789, + "step": 3814 + }, + { + "epoch": 0.20466738197424894, + "grad_norm": 1.163795828819275, + "learning_rate": 4.612308198070359e-06, + "loss": 2.2677, + "step": 3815 + }, + { + "epoch": 0.20472103004291844, + "grad_norm": 1.2112303972244263, + "learning_rate": 4.612075810170696e-06, + "loss": 1.5704, + "step": 3816 + }, + { + "epoch": 0.20477467811158798, + "grad_norm": 1.2815124988555908, + "learning_rate": 4.611843358501624e-06, + "loss": 2.2972, + "step": 3817 + }, + { + "epoch": 0.2048283261802575, + "grad_norm": 1.0280054807662964, + "learning_rate": 4.611610843070161e-06, + "loss": 2.301, + "step": 3818 + }, + { + "epoch": 0.20488197424892704, + "grad_norm": 1.268127202987671, + "learning_rate": 4.6113782638833275e-06, + "loss": 2.4349, + "step": 3819 + }, + { + "epoch": 0.20493562231759657, + "grad_norm": 1.2431765794754028, + "learning_rate": 4.611145620948145e-06, + "loss": 2.3797, + "step": 3820 + }, + { + "epoch": 0.2049892703862661, + "grad_norm": 1.271441102027893, + "learning_rate": 4.610912914271638e-06, + "loss": 2.431, + "step": 3821 + }, + { + "epoch": 0.20504291845493564, + "grad_norm": 1.0640727281570435, + "learning_rate": 4.610680143860833e-06, + "loss": 2.0019, + "step": 3822 + }, + { + "epoch": 0.20509656652360514, + "grad_norm": 1.3055484294891357, + "learning_rate": 4.6104473097227575e-06, + "loss": 1.9814, + "step": 3823 + }, + { + "epoch": 0.20515021459227467, + "grad_norm": 1.217063069343567, + "learning_rate": 4.610214411864441e-06, + "loss": 2.2498, + "step": 3824 + }, + { + "epoch": 0.2052038626609442, + "grad_norm": 1.3911885023117065, + "learning_rate": 4.609981450292917e-06, + "loss": 2.3122, + "step": 3825 + }, + { + "epoch": 0.20525751072961373, + "grad_norm": 1.2633899450302124, + "learning_rate": 4.609748425015218e-06, + "loss": 2.3219, + "step": 3826 + }, + { + "epoch": 0.20531115879828327, + "grad_norm": 1.3123701810836792, + "learning_rate": 4.609515336038379e-06, + "loss": 2.5282, + "step": 3827 + }, + { + "epoch": 0.2053648068669528, + "grad_norm": 1.2486709356307983, + "learning_rate": 4.609282183369439e-06, + "loss": 2.5164, + "step": 3828 + }, + { + "epoch": 0.20541845493562233, + "grad_norm": 1.298445224761963, + "learning_rate": 4.609048967015437e-06, + "loss": 1.9444, + "step": 3829 + }, + { + "epoch": 0.20547210300429183, + "grad_norm": 1.1438865661621094, + "learning_rate": 4.608815686983415e-06, + "loss": 2.3673, + "step": 3830 + }, + { + "epoch": 0.20552575107296137, + "grad_norm": 1.3315881490707397, + "learning_rate": 4.608582343280415e-06, + "loss": 2.5481, + "step": 3831 + }, + { + "epoch": 0.2055793991416309, + "grad_norm": 1.4122023582458496, + "learning_rate": 4.608348935913482e-06, + "loss": 2.4238, + "step": 3832 + }, + { + "epoch": 0.20563304721030043, + "grad_norm": 1.2805945873260498, + "learning_rate": 4.608115464889665e-06, + "loss": 2.3446, + "step": 3833 + }, + { + "epoch": 0.20568669527896996, + "grad_norm": 1.0847457647323608, + "learning_rate": 4.607881930216014e-06, + "loss": 2.1845, + "step": 3834 + }, + { + "epoch": 0.2057403433476395, + "grad_norm": 1.2080551385879517, + "learning_rate": 4.607648331899577e-06, + "loss": 2.496, + "step": 3835 + }, + { + "epoch": 0.20579399141630902, + "grad_norm": NaN, + "learning_rate": 4.607648331899577e-06, + "loss": 2.4192, + "step": 3836 + }, + { + "epoch": 0.20584763948497853, + "grad_norm": 1.3334401845932007, + "learning_rate": 4.607414669947408e-06, + "loss": 2.3548, + "step": 3837 + }, + { + "epoch": 0.20590128755364806, + "grad_norm": 1.3470743894577026, + "learning_rate": 4.607180944366563e-06, + "loss": 2.1089, + "step": 3838 + }, + { + "epoch": 0.2059549356223176, + "grad_norm": 1.382663607597351, + "learning_rate": 4.606947155164098e-06, + "loss": 1.9065, + "step": 3839 + }, + { + "epoch": 0.20600858369098712, + "grad_norm": 7.55432653427124, + "learning_rate": 4.606713302347072e-06, + "loss": 2.115, + "step": 3840 + }, + { + "epoch": 0.20606223175965666, + "grad_norm": 1.340131402015686, + "learning_rate": 4.606479385922547e-06, + "loss": 2.3421, + "step": 3841 + }, + { + "epoch": 0.2061158798283262, + "grad_norm": 1.2830229997634888, + "learning_rate": 4.606245405897582e-06, + "loss": 2.2986, + "step": 3842 + }, + { + "epoch": 0.20616952789699572, + "grad_norm": 1.2333353757858276, + "learning_rate": 4.606011362279245e-06, + "loss": 2.3825, + "step": 3843 + }, + { + "epoch": 0.20622317596566522, + "grad_norm": 1.1420345306396484, + "learning_rate": 4.6057772550746e-06, + "loss": 2.2984, + "step": 3844 + }, + { + "epoch": 0.20627682403433475, + "grad_norm": 1.297447919845581, + "learning_rate": 4.605543084290716e-06, + "loss": 2.2389, + "step": 3845 + }, + { + "epoch": 0.2063304721030043, + "grad_norm": 1.4369782209396362, + "learning_rate": 4.605308849934665e-06, + "loss": 2.108, + "step": 3846 + }, + { + "epoch": 0.20638412017167382, + "grad_norm": 1.1779539585113525, + "learning_rate": 4.605074552013518e-06, + "loss": 2.1522, + "step": 3847 + }, + { + "epoch": 0.20643776824034335, + "grad_norm": 1.4043254852294922, + "learning_rate": 4.604840190534349e-06, + "loss": 1.788, + "step": 3848 + }, + { + "epoch": 0.20649141630901288, + "grad_norm": 1.6444863080978394, + "learning_rate": 4.604605765504233e-06, + "loss": 2.448, + "step": 3849 + }, + { + "epoch": 0.2065450643776824, + "grad_norm": 1.5335686206817627, + "learning_rate": 4.60437127693025e-06, + "loss": 2.4546, + "step": 3850 + }, + { + "epoch": 0.20659871244635192, + "grad_norm": 7.86378812789917, + "learning_rate": 4.604136724819478e-06, + "loss": 2.2802, + "step": 3851 + }, + { + "epoch": 0.20665236051502145, + "grad_norm": 1.3682823181152344, + "learning_rate": 4.603902109178999e-06, + "loss": 2.3158, + "step": 3852 + }, + { + "epoch": 0.20670600858369098, + "grad_norm": 1.0754303932189941, + "learning_rate": 4.603667430015898e-06, + "loss": 2.1266, + "step": 3853 + }, + { + "epoch": 0.2067596566523605, + "grad_norm": 1.199903130531311, + "learning_rate": 4.60343268733726e-06, + "loss": 2.1111, + "step": 3854 + }, + { + "epoch": 0.20681330472103004, + "grad_norm": 1.3824491500854492, + "learning_rate": 4.6031978811501735e-06, + "loss": 2.3213, + "step": 3855 + }, + { + "epoch": 0.20686695278969958, + "grad_norm": 0.9867307543754578, + "learning_rate": 4.602963011461725e-06, + "loss": 2.0806, + "step": 3856 + }, + { + "epoch": 0.2069206008583691, + "grad_norm": 1.4526727199554443, + "learning_rate": 4.602728078279009e-06, + "loss": 1.7057, + "step": 3857 + }, + { + "epoch": 0.20697424892703864, + "grad_norm": 1.1399047374725342, + "learning_rate": 4.602493081609116e-06, + "loss": 1.7174, + "step": 3858 + }, + { + "epoch": 0.20702789699570814, + "grad_norm": 1.4496724605560303, + "learning_rate": 4.6022580214591436e-06, + "loss": 2.3054, + "step": 3859 + }, + { + "epoch": 0.20708154506437768, + "grad_norm": 1.2448503971099854, + "learning_rate": 4.602022897836189e-06, + "loss": 2.3576, + "step": 3860 + }, + { + "epoch": 0.2071351931330472, + "grad_norm": 1.2560955286026, + "learning_rate": 4.601787710747348e-06, + "loss": 2.246, + "step": 3861 + }, + { + "epoch": 0.20718884120171674, + "grad_norm": 1.3090347051620483, + "learning_rate": 4.601552460199726e-06, + "loss": 2.3664, + "step": 3862 + }, + { + "epoch": 0.20724248927038627, + "grad_norm": 1.337153673171997, + "learning_rate": 4.601317146200423e-06, + "loss": 2.2555, + "step": 3863 + }, + { + "epoch": 0.2072961373390558, + "grad_norm": 1.1006747484207153, + "learning_rate": 4.601081768756544e-06, + "loss": 2.1088, + "step": 3864 + }, + { + "epoch": 0.20734978540772533, + "grad_norm": 1.5067309141159058, + "learning_rate": 4.600846327875196e-06, + "loss": 2.2439, + "step": 3865 + }, + { + "epoch": 0.20740343347639484, + "grad_norm": 1.183552622795105, + "learning_rate": 4.600610823563488e-06, + "loss": 2.3315, + "step": 3866 + }, + { + "epoch": 0.20745708154506437, + "grad_norm": 1.1141819953918457, + "learning_rate": 4.600375255828531e-06, + "loss": 2.0202, + "step": 3867 + }, + { + "epoch": 0.2075107296137339, + "grad_norm": 1.1734802722930908, + "learning_rate": 4.600139624677436e-06, + "loss": 2.2592, + "step": 3868 + }, + { + "epoch": 0.20756437768240343, + "grad_norm": 1.159720778465271, + "learning_rate": 4.5999039301173186e-06, + "loss": 2.4654, + "step": 3869 + }, + { + "epoch": 0.20761802575107297, + "grad_norm": 1.1812273263931274, + "learning_rate": 4.599668172155294e-06, + "loss": 2.2638, + "step": 3870 + }, + { + "epoch": 0.2076716738197425, + "grad_norm": 1.181543231010437, + "learning_rate": 4.59943235079848e-06, + "loss": 2.3209, + "step": 3871 + }, + { + "epoch": 0.20772532188841203, + "grad_norm": 1.4954465627670288, + "learning_rate": 4.599196466054e-06, + "loss": 2.176, + "step": 3872 + }, + { + "epoch": 0.20777896995708153, + "grad_norm": 1.0538361072540283, + "learning_rate": 4.598960517928972e-06, + "loss": 2.1719, + "step": 3873 + }, + { + "epoch": 0.20783261802575106, + "grad_norm": 1.0606739521026611, + "learning_rate": 4.598724506430522e-06, + "loss": 2.0146, + "step": 3874 + }, + { + "epoch": 0.2078862660944206, + "grad_norm": 1.1746855974197388, + "learning_rate": 4.598488431565775e-06, + "loss": 2.0647, + "step": 3875 + }, + { + "epoch": 0.20793991416309013, + "grad_norm": 1.2457624673843384, + "learning_rate": 4.598252293341859e-06, + "loss": 2.2483, + "step": 3876 + }, + { + "epoch": 0.20799356223175966, + "grad_norm": 1.2829456329345703, + "learning_rate": 4.598016091765905e-06, + "loss": 2.3703, + "step": 3877 + }, + { + "epoch": 0.2080472103004292, + "grad_norm": 1.4245387315750122, + "learning_rate": 4.597779826845043e-06, + "loss": 2.1345, + "step": 3878 + }, + { + "epoch": 0.20810085836909872, + "grad_norm": 1.543312668800354, + "learning_rate": 4.5975434985864065e-06, + "loss": 2.2061, + "step": 3879 + }, + { + "epoch": 0.20815450643776823, + "grad_norm": 1.225606083869934, + "learning_rate": 4.597307106997132e-06, + "loss": 1.9766, + "step": 3880 + }, + { + "epoch": 0.20820815450643776, + "grad_norm": 1.234779715538025, + "learning_rate": 4.597070652084355e-06, + "loss": 2.4288, + "step": 3881 + }, + { + "epoch": 0.2082618025751073, + "grad_norm": 1.204129695892334, + "learning_rate": 4.596834133855217e-06, + "loss": 2.3069, + "step": 3882 + }, + { + "epoch": 0.20831545064377682, + "grad_norm": 1.2821471691131592, + "learning_rate": 4.596597552316857e-06, + "loss": 2.5146, + "step": 3883 + }, + { + "epoch": 0.20836909871244635, + "grad_norm": 1.440686583518982, + "learning_rate": 4.596360907476419e-06, + "loss": 2.3657, + "step": 3884 + }, + { + "epoch": 0.2084227467811159, + "grad_norm": 1.1899305582046509, + "learning_rate": 4.596124199341049e-06, + "loss": 2.2572, + "step": 3885 + }, + { + "epoch": 0.20847639484978542, + "grad_norm": 1.1728804111480713, + "learning_rate": 4.595887427917892e-06, + "loss": 2.1545, + "step": 3886 + }, + { + "epoch": 0.20853004291845492, + "grad_norm": 1.5531076192855835, + "learning_rate": 4.595650593214098e-06, + "loss": 2.1804, + "step": 3887 + }, + { + "epoch": 0.20858369098712445, + "grad_norm": 1.746886968612671, + "learning_rate": 4.595413695236818e-06, + "loss": 2.3064, + "step": 3888 + }, + { + "epoch": 0.20863733905579399, + "grad_norm": 1.1693010330200195, + "learning_rate": 4.595176733993203e-06, + "loss": 2.2953, + "step": 3889 + }, + { + "epoch": 0.20869098712446352, + "grad_norm": 1.231632113456726, + "learning_rate": 4.594939709490409e-06, + "loss": 2.5384, + "step": 3890 + }, + { + "epoch": 0.20874463519313305, + "grad_norm": 1.355269193649292, + "learning_rate": 4.594702621735592e-06, + "loss": 2.4947, + "step": 3891 + }, + { + "epoch": 0.20879828326180258, + "grad_norm": 1.3049167394638062, + "learning_rate": 4.594465470735909e-06, + "loss": 2.3633, + "step": 3892 + }, + { + "epoch": 0.2088519313304721, + "grad_norm": 1.1722811460494995, + "learning_rate": 4.594228256498522e-06, + "loss": 2.2012, + "step": 3893 + }, + { + "epoch": 0.20890557939914164, + "grad_norm": 1.2136625051498413, + "learning_rate": 4.593990979030594e-06, + "loss": 2.3655, + "step": 3894 + }, + { + "epoch": 0.20895922746781115, + "grad_norm": 1.6654685735702515, + "learning_rate": 4.593753638339286e-06, + "loss": 2.4418, + "step": 3895 + }, + { + "epoch": 0.20901287553648068, + "grad_norm": 1.2885582447052002, + "learning_rate": 4.593516234431766e-06, + "loss": 2.3383, + "step": 3896 + }, + { + "epoch": 0.2090665236051502, + "grad_norm": 1.777189016342163, + "learning_rate": 4.5932787673152024e-06, + "loss": 2.2115, + "step": 3897 + }, + { + "epoch": 0.20912017167381974, + "grad_norm": 1.3131041526794434, + "learning_rate": 4.593041236996764e-06, + "loss": 2.4638, + "step": 3898 + }, + { + "epoch": 0.20917381974248928, + "grad_norm": 1.3513163328170776, + "learning_rate": 4.592803643483622e-06, + "loss": 2.4049, + "step": 3899 + }, + { + "epoch": 0.2092274678111588, + "grad_norm": 1.1137490272521973, + "learning_rate": 4.592565986782951e-06, + "loss": 1.9909, + "step": 3900 + }, + { + "epoch": 0.20928111587982834, + "grad_norm": 1.3984042406082153, + "learning_rate": 4.592328266901926e-06, + "loss": 2.1516, + "step": 3901 + }, + { + "epoch": 0.20933476394849784, + "grad_norm": 1.1658276319503784, + "learning_rate": 4.592090483847724e-06, + "loss": 1.9071, + "step": 3902 + }, + { + "epoch": 0.20938841201716737, + "grad_norm": 1.154640555381775, + "learning_rate": 4.591852637627526e-06, + "loss": 2.0924, + "step": 3903 + }, + { + "epoch": 0.2094420600858369, + "grad_norm": 1.141451120376587, + "learning_rate": 4.591614728248512e-06, + "loss": 2.3463, + "step": 3904 + }, + { + "epoch": 0.20949570815450644, + "grad_norm": 1.219144344329834, + "learning_rate": 4.591376755717865e-06, + "loss": 2.4222, + "step": 3905 + }, + { + "epoch": 0.20954935622317597, + "grad_norm": 15.583785057067871, + "learning_rate": 4.591138720042771e-06, + "loss": 2.1326, + "step": 3906 + }, + { + "epoch": 0.2096030042918455, + "grad_norm": 1.3150886297225952, + "learning_rate": 4.590900621230414e-06, + "loss": 2.453, + "step": 3907 + }, + { + "epoch": 0.20965665236051503, + "grad_norm": 1.159549593925476, + "learning_rate": 4.590662459287987e-06, + "loss": 2.314, + "step": 3908 + }, + { + "epoch": 0.20971030042918454, + "grad_norm": 1.4379063844680786, + "learning_rate": 4.590424234222679e-06, + "loss": 2.3219, + "step": 3909 + }, + { + "epoch": 0.20976394849785407, + "grad_norm": 1.0443286895751953, + "learning_rate": 4.590185946041682e-06, + "loss": 1.974, + "step": 3910 + }, + { + "epoch": 0.2098175965665236, + "grad_norm": 1.286314845085144, + "learning_rate": 4.589947594752192e-06, + "loss": 2.2488, + "step": 3911 + }, + { + "epoch": 0.20987124463519313, + "grad_norm": 1.2732988595962524, + "learning_rate": 4.589709180361404e-06, + "loss": 2.261, + "step": 3912 + }, + { + "epoch": 0.20992489270386266, + "grad_norm": 2.4995880126953125, + "learning_rate": 4.589470702876516e-06, + "loss": 2.376, + "step": 3913 + }, + { + "epoch": 0.2099785407725322, + "grad_norm": 1.1933046579360962, + "learning_rate": 4.589232162304731e-06, + "loss": 2.2997, + "step": 3914 + }, + { + "epoch": 0.21003218884120173, + "grad_norm": 1.2445906400680542, + "learning_rate": 4.588993558653249e-06, + "loss": 2.3104, + "step": 3915 + }, + { + "epoch": 0.21008583690987123, + "grad_norm": 1.328859567642212, + "learning_rate": 4.588754891929274e-06, + "loss": 2.2768, + "step": 3916 + }, + { + "epoch": 0.21013948497854076, + "grad_norm": 1.3306444883346558, + "learning_rate": 4.588516162140013e-06, + "loss": 2.014, + "step": 3917 + }, + { + "epoch": 0.2101931330472103, + "grad_norm": 1.1950582265853882, + "learning_rate": 4.588277369292674e-06, + "loss": 2.3039, + "step": 3918 + }, + { + "epoch": 0.21024678111587983, + "grad_norm": 1.2974456548690796, + "learning_rate": 4.588038513394466e-06, + "loss": 2.3134, + "step": 3919 + }, + { + "epoch": 0.21030042918454936, + "grad_norm": 1.0932424068450928, + "learning_rate": 4.587799594452601e-06, + "loss": 2.0922, + "step": 3920 + }, + { + "epoch": 0.2103540772532189, + "grad_norm": 1.1918118000030518, + "learning_rate": 4.587560612474293e-06, + "loss": 1.8309, + "step": 3921 + }, + { + "epoch": 0.21040772532188842, + "grad_norm": 1.271555781364441, + "learning_rate": 4.587321567466757e-06, + "loss": 2.2584, + "step": 3922 + }, + { + "epoch": 0.21046137339055793, + "grad_norm": 1.3273849487304688, + "learning_rate": 4.587082459437211e-06, + "loss": 2.3067, + "step": 3923 + }, + { + "epoch": 0.21051502145922746, + "grad_norm": 4.311246871948242, + "learning_rate": 4.5868432883928735e-06, + "loss": 2.4169, + "step": 3924 + }, + { + "epoch": 0.210568669527897, + "grad_norm": 1.3034133911132812, + "learning_rate": 4.586604054340967e-06, + "loss": 2.1625, + "step": 3925 + }, + { + "epoch": 0.21062231759656652, + "grad_norm": 1.2499034404754639, + "learning_rate": 4.5863647572887135e-06, + "loss": 2.2653, + "step": 3926 + }, + { + "epoch": 0.21067596566523605, + "grad_norm": 1.2024236917495728, + "learning_rate": 4.586125397243339e-06, + "loss": 2.3765, + "step": 3927 + }, + { + "epoch": 0.21072961373390559, + "grad_norm": 1.4244794845581055, + "learning_rate": 4.585885974212069e-06, + "loss": 2.4115, + "step": 3928 + }, + { + "epoch": 0.21078326180257512, + "grad_norm": 1.4384560585021973, + "learning_rate": 4.585646488202133e-06, + "loss": 2.2855, + "step": 3929 + }, + { + "epoch": 0.21083690987124465, + "grad_norm": 1.6139925718307495, + "learning_rate": 4.585406939220762e-06, + "loss": 2.2331, + "step": 3930 + }, + { + "epoch": 0.21089055793991415, + "grad_norm": 5.379709243774414, + "learning_rate": 4.5851673272751895e-06, + "loss": 2.3719, + "step": 3931 + }, + { + "epoch": 0.21094420600858368, + "grad_norm": 1.4132283926010132, + "learning_rate": 4.584927652372649e-06, + "loss": 2.2759, + "step": 3932 + }, + { + "epoch": 0.21099785407725322, + "grad_norm": 1.1269378662109375, + "learning_rate": 4.584687914520376e-06, + "loss": 2.4771, + "step": 3933 + }, + { + "epoch": 0.21105150214592275, + "grad_norm": 1.5550537109375, + "learning_rate": 4.58444811372561e-06, + "loss": 2.3895, + "step": 3934 + }, + { + "epoch": 0.21110515021459228, + "grad_norm": 1.555764079093933, + "learning_rate": 4.584208249995592e-06, + "loss": 1.6229, + "step": 3935 + }, + { + "epoch": 0.2111587982832618, + "grad_norm": 1.2932902574539185, + "learning_rate": 4.583968323337563e-06, + "loss": 2.378, + "step": 3936 + }, + { + "epoch": 0.21121244635193134, + "grad_norm": 5.404799938201904, + "learning_rate": 4.583728333758768e-06, + "loss": 2.447, + "step": 3937 + }, + { + "epoch": 0.21126609442060085, + "grad_norm": 1.048024296760559, + "learning_rate": 4.5834882812664516e-06, + "loss": 1.7508, + "step": 3938 + }, + { + "epoch": 0.21131974248927038, + "grad_norm": 1.3353197574615479, + "learning_rate": 4.583248165867863e-06, + "loss": 2.2567, + "step": 3939 + }, + { + "epoch": 0.2113733905579399, + "grad_norm": 1.126205325126648, + "learning_rate": 4.583007987570251e-06, + "loss": 2.0048, + "step": 3940 + }, + { + "epoch": 0.21142703862660944, + "grad_norm": 1.4274523258209229, + "learning_rate": 4.582767746380868e-06, + "loss": 2.2367, + "step": 3941 + }, + { + "epoch": 0.21148068669527897, + "grad_norm": 1.800157070159912, + "learning_rate": 4.582527442306966e-06, + "loss": 2.3545, + "step": 3942 + }, + { + "epoch": 0.2115343347639485, + "grad_norm": 1.1164541244506836, + "learning_rate": 4.582287075355802e-06, + "loss": 1.9742, + "step": 3943 + }, + { + "epoch": 0.21158798283261804, + "grad_norm": 1.2985765933990479, + "learning_rate": 4.582046645534634e-06, + "loss": 2.1946, + "step": 3944 + }, + { + "epoch": 0.21164163090128754, + "grad_norm": 1.176155686378479, + "learning_rate": 4.581806152850719e-06, + "loss": 2.341, + "step": 3945 + }, + { + "epoch": 0.21169527896995707, + "grad_norm": 1.2346646785736084, + "learning_rate": 4.581565597311319e-06, + "loss": 2.1096, + "step": 3946 + }, + { + "epoch": 0.2117489270386266, + "grad_norm": 1.3905377388000488, + "learning_rate": 4.581324978923698e-06, + "loss": 1.9976, + "step": 3947 + }, + { + "epoch": 0.21180257510729614, + "grad_norm": 1.2381666898727417, + "learning_rate": 4.58108429769512e-06, + "loss": 2.0116, + "step": 3948 + }, + { + "epoch": 0.21185622317596567, + "grad_norm": 1.3181166648864746, + "learning_rate": 4.580843553632852e-06, + "loss": 2.3192, + "step": 3949 + }, + { + "epoch": 0.2119098712446352, + "grad_norm": 1.3104408979415894, + "learning_rate": 4.580602746744164e-06, + "loss": 2.1794, + "step": 3950 + }, + { + "epoch": 0.21196351931330473, + "grad_norm": 1.2503306865692139, + "learning_rate": 4.580361877036325e-06, + "loss": 2.5402, + "step": 3951 + }, + { + "epoch": 0.21201716738197424, + "grad_norm": 1.2229423522949219, + "learning_rate": 4.580120944516606e-06, + "loss": 1.534, + "step": 3952 + }, + { + "epoch": 0.21207081545064377, + "grad_norm": 1.1410666704177856, + "learning_rate": 4.579879949192286e-06, + "loss": 2.1214, + "step": 3953 + }, + { + "epoch": 0.2121244635193133, + "grad_norm": 1.078779935836792, + "learning_rate": 4.5796388910706365e-06, + "loss": 2.1281, + "step": 3954 + }, + { + "epoch": 0.21217811158798283, + "grad_norm": 1.8580636978149414, + "learning_rate": 4.57939777015894e-06, + "loss": 1.9122, + "step": 3955 + }, + { + "epoch": 0.21223175965665236, + "grad_norm": 1.2122997045516968, + "learning_rate": 4.579156586464474e-06, + "loss": 2.4173, + "step": 3956 + }, + { + "epoch": 0.2122854077253219, + "grad_norm": 1.3841835260391235, + "learning_rate": 4.578915339994519e-06, + "loss": 2.2999, + "step": 3957 + }, + { + "epoch": 0.21233905579399143, + "grad_norm": 1.2687430381774902, + "learning_rate": 4.578674030756364e-06, + "loss": 2.2156, + "step": 3958 + }, + { + "epoch": 0.21239270386266093, + "grad_norm": 1.6709665060043335, + "learning_rate": 4.5784326587572896e-06, + "loss": 2.1817, + "step": 3959 + }, + { + "epoch": 0.21244635193133046, + "grad_norm": 1.3587028980255127, + "learning_rate": 4.578191224004587e-06, + "loss": 2.2847, + "step": 3960 + }, + { + "epoch": 0.2125, + "grad_norm": 1.2953519821166992, + "learning_rate": 4.577949726505543e-06, + "loss": 2.368, + "step": 3961 + }, + { + "epoch": 0.21255364806866953, + "grad_norm": 1.3547508716583252, + "learning_rate": 4.577708166267451e-06, + "loss": 2.363, + "step": 3962 + }, + { + "epoch": 0.21260729613733906, + "grad_norm": 1.3005449771881104, + "learning_rate": 4.577466543297604e-06, + "loss": 2.4499, + "step": 3963 + }, + { + "epoch": 0.2126609442060086, + "grad_norm": 0.9699686765670776, + "learning_rate": 4.577224857603297e-06, + "loss": 1.8542, + "step": 3964 + }, + { + "epoch": 0.21271459227467812, + "grad_norm": 1.3157836198806763, + "learning_rate": 4.576983109191827e-06, + "loss": 2.3713, + "step": 3965 + }, + { + "epoch": 0.21276824034334765, + "grad_norm": 1.4870609045028687, + "learning_rate": 4.576741298070494e-06, + "loss": 2.2177, + "step": 3966 + }, + { + "epoch": 0.21282188841201716, + "grad_norm": 1.211416482925415, + "learning_rate": 4.576499424246597e-06, + "loss": 2.1745, + "step": 3967 + }, + { + "epoch": 0.2128755364806867, + "grad_norm": 1.3217746019363403, + "learning_rate": 4.576257487727442e-06, + "loss": 2.3184, + "step": 3968 + }, + { + "epoch": 0.21292918454935622, + "grad_norm": 1.1396137475967407, + "learning_rate": 4.576015488520331e-06, + "loss": 2.3097, + "step": 3969 + }, + { + "epoch": 0.21298283261802575, + "grad_norm": 3.3452138900756836, + "learning_rate": 4.575773426632571e-06, + "loss": 2.0746, + "step": 3970 + }, + { + "epoch": 0.21303648068669528, + "grad_norm": 1.2655305862426758, + "learning_rate": 4.575531302071471e-06, + "loss": 2.2915, + "step": 3971 + }, + { + "epoch": 0.21309012875536482, + "grad_norm": 1.1992863416671753, + "learning_rate": 4.575289114844342e-06, + "loss": 2.2304, + "step": 3972 + }, + { + "epoch": 0.21314377682403435, + "grad_norm": 1.462810754776001, + "learning_rate": 4.575046864958496e-06, + "loss": 2.3149, + "step": 3973 + }, + { + "epoch": 0.21319742489270385, + "grad_norm": 1.4043545722961426, + "learning_rate": 4.574804552421245e-06, + "loss": 2.5286, + "step": 3974 + }, + { + "epoch": 0.21325107296137338, + "grad_norm": 1.3590774536132812, + "learning_rate": 4.574562177239908e-06, + "loss": 2.1506, + "step": 3975 + }, + { + "epoch": 0.21330472103004292, + "grad_norm": 1.0810760259628296, + "learning_rate": 4.574319739421803e-06, + "loss": 1.8158, + "step": 3976 + }, + { + "epoch": 0.21335836909871245, + "grad_norm": 1.0326933860778809, + "learning_rate": 4.574077238974248e-06, + "loss": 1.8437, + "step": 3977 + }, + { + "epoch": 0.21341201716738198, + "grad_norm": 1.369327187538147, + "learning_rate": 4.573834675904565e-06, + "loss": 2.3493, + "step": 3978 + }, + { + "epoch": 0.2134656652360515, + "grad_norm": 1.1811996698379517, + "learning_rate": 4.573592050220079e-06, + "loss": 2.2746, + "step": 3979 + }, + { + "epoch": 0.21351931330472104, + "grad_norm": 1.385120153427124, + "learning_rate": 4.573349361928114e-06, + "loss": 2.2755, + "step": 3980 + }, + { + "epoch": 0.21357296137339055, + "grad_norm": 1.3449207544326782, + "learning_rate": 4.573106611035999e-06, + "loss": 2.3078, + "step": 3981 + }, + { + "epoch": 0.21362660944206008, + "grad_norm": 1.352372169494629, + "learning_rate": 4.5728637975510624e-06, + "loss": 2.0526, + "step": 3982 + }, + { + "epoch": 0.2136802575107296, + "grad_norm": 1.1233474016189575, + "learning_rate": 4.572620921480635e-06, + "loss": 2.647, + "step": 3983 + }, + { + "epoch": 0.21373390557939914, + "grad_norm": 1.350175142288208, + "learning_rate": 4.572377982832051e-06, + "loss": 2.4006, + "step": 3984 + }, + { + "epoch": 0.21378755364806867, + "grad_norm": 1.3102589845657349, + "learning_rate": 4.572134981612645e-06, + "loss": 2.4524, + "step": 3985 + }, + { + "epoch": 0.2138412017167382, + "grad_norm": 1.2548129558563232, + "learning_rate": 4.571891917829753e-06, + "loss": 2.2626, + "step": 3986 + }, + { + "epoch": 0.21389484978540774, + "grad_norm": 1.5825259685516357, + "learning_rate": 4.571648791490715e-06, + "loss": 2.1542, + "step": 3987 + }, + { + "epoch": 0.21394849785407724, + "grad_norm": 1.796368956565857, + "learning_rate": 4.571405602602871e-06, + "loss": 2.1625, + "step": 3988 + }, + { + "epoch": 0.21400214592274677, + "grad_norm": 1.0182774066925049, + "learning_rate": 4.571162351173564e-06, + "loss": 1.9383, + "step": 3989 + }, + { + "epoch": 0.2140557939914163, + "grad_norm": 1.5573290586471558, + "learning_rate": 4.570919037210137e-06, + "loss": 2.3621, + "step": 3990 + }, + { + "epoch": 0.21410944206008584, + "grad_norm": 1.459327220916748, + "learning_rate": 4.570675660719938e-06, + "loss": 2.2882, + "step": 3991 + }, + { + "epoch": 0.21416309012875537, + "grad_norm": 1.4889427423477173, + "learning_rate": 4.570432221710314e-06, + "loss": 2.2649, + "step": 3992 + }, + { + "epoch": 0.2142167381974249, + "grad_norm": 1.2046356201171875, + "learning_rate": 4.570188720188618e-06, + "loss": 2.3584, + "step": 3993 + }, + { + "epoch": 0.21427038626609443, + "grad_norm": 3.9545633792877197, + "learning_rate": 4.5699451561621975e-06, + "loss": 2.3694, + "step": 3994 + }, + { + "epoch": 0.21432403433476394, + "grad_norm": 1.2446953058242798, + "learning_rate": 4.569701529638409e-06, + "loss": 2.413, + "step": 3995 + }, + { + "epoch": 0.21437768240343347, + "grad_norm": 1.3979307413101196, + "learning_rate": 4.5694578406246084e-06, + "loss": 2.2909, + "step": 3996 + }, + { + "epoch": 0.214431330472103, + "grad_norm": 1.2423509359359741, + "learning_rate": 4.569214089128152e-06, + "loss": 2.4144, + "step": 3997 + }, + { + "epoch": 0.21448497854077253, + "grad_norm": 1.5049792528152466, + "learning_rate": 4.5689702751564e-06, + "loss": 2.3176, + "step": 3998 + }, + { + "epoch": 0.21453862660944206, + "grad_norm": 1.3934131860733032, + "learning_rate": 4.5687263987167155e-06, + "loss": 2.2734, + "step": 3999 + }, + { + "epoch": 0.2145922746781116, + "grad_norm": 1.35462486743927, + "learning_rate": 4.568482459816459e-06, + "loss": 2.2667, + "step": 4000 + }, + { + "epoch": 0.21464592274678113, + "grad_norm": 1.8288527727127075, + "learning_rate": 4.568238458462997e-06, + "loss": 2.2054, + "step": 4001 + }, + { + "epoch": 0.21469957081545063, + "grad_norm": 1.321295142173767, + "learning_rate": 4.567994394663696e-06, + "loss": 1.8727, + "step": 4002 + }, + { + "epoch": 0.21475321888412016, + "grad_norm": 1.4664661884307861, + "learning_rate": 4.567750268425926e-06, + "loss": 2.2386, + "step": 4003 + }, + { + "epoch": 0.2148068669527897, + "grad_norm": 2.1229934692382812, + "learning_rate": 4.567506079757058e-06, + "loss": 2.1308, + "step": 4004 + }, + { + "epoch": 0.21486051502145923, + "grad_norm": 1.3522522449493408, + "learning_rate": 4.567261828664463e-06, + "loss": 2.3031, + "step": 4005 + }, + { + "epoch": 0.21491416309012876, + "grad_norm": 1.3768761157989502, + "learning_rate": 4.5670175151555175e-06, + "loss": 2.3866, + "step": 4006 + }, + { + "epoch": 0.2149678111587983, + "grad_norm": 1.1307841539382935, + "learning_rate": 4.566773139237597e-06, + "loss": 1.6956, + "step": 4007 + }, + { + "epoch": 0.21502145922746782, + "grad_norm": 1.3322261571884155, + "learning_rate": 4.56652870091808e-06, + "loss": 2.1476, + "step": 4008 + }, + { + "epoch": 0.21507510729613735, + "grad_norm": 1.3683487176895142, + "learning_rate": 4.566284200204346e-06, + "loss": 2.249, + "step": 4009 + }, + { + "epoch": 0.21512875536480686, + "grad_norm": 1.8226114511489868, + "learning_rate": 4.566039637103779e-06, + "loss": 2.5514, + "step": 4010 + }, + { + "epoch": 0.2151824034334764, + "grad_norm": 1.3784879446029663, + "learning_rate": 4.565795011623761e-06, + "loss": 2.0704, + "step": 4011 + }, + { + "epoch": 0.21523605150214592, + "grad_norm": 1.294020652770996, + "learning_rate": 4.56555032377168e-06, + "loss": 2.2321, + "step": 4012 + }, + { + "epoch": 0.21528969957081545, + "grad_norm": 1.5056620836257935, + "learning_rate": 4.5653055735549226e-06, + "loss": 2.4999, + "step": 4013 + }, + { + "epoch": 0.21534334763948498, + "grad_norm": 1.7543549537658691, + "learning_rate": 4.565060760980878e-06, + "loss": 1.4555, + "step": 4014 + }, + { + "epoch": 0.21539699570815452, + "grad_norm": 1.2629188299179077, + "learning_rate": 4.564815886056939e-06, + "loss": 2.108, + "step": 4015 + }, + { + "epoch": 0.21545064377682405, + "grad_norm": 1.2696492671966553, + "learning_rate": 4.564570948790497e-06, + "loss": 2.126, + "step": 4016 + }, + { + "epoch": 0.21550429184549355, + "grad_norm": 1.2824338674545288, + "learning_rate": 4.564325949188951e-06, + "loss": 2.4887, + "step": 4017 + }, + { + "epoch": 0.21555793991416308, + "grad_norm": 1.234084129333496, + "learning_rate": 4.564080887259694e-06, + "loss": 2.4033, + "step": 4018 + }, + { + "epoch": 0.21561158798283261, + "grad_norm": 1.3155438899993896, + "learning_rate": 4.5638357630101285e-06, + "loss": 1.4194, + "step": 4019 + }, + { + "epoch": 0.21566523605150215, + "grad_norm": 1.1690928936004639, + "learning_rate": 4.563590576447654e-06, + "loss": 2.1331, + "step": 4020 + }, + { + "epoch": 0.21571888412017168, + "grad_norm": 1.456256628036499, + "learning_rate": 4.563345327579673e-06, + "loss": 2.3429, + "step": 4021 + }, + { + "epoch": 0.2157725321888412, + "grad_norm": 1.5942866802215576, + "learning_rate": 4.56310001641359e-06, + "loss": 2.3064, + "step": 4022 + }, + { + "epoch": 0.21582618025751074, + "grad_norm": 1.300215721130371, + "learning_rate": 4.562854642956813e-06, + "loss": 2.0929, + "step": 4023 + }, + { + "epoch": 0.21587982832618025, + "grad_norm": 2.921764612197876, + "learning_rate": 4.562609207216749e-06, + "loss": 2.2021, + "step": 4024 + }, + { + "epoch": 0.21593347639484978, + "grad_norm": 1.3887743949890137, + "learning_rate": 4.56236370920081e-06, + "loss": 2.1226, + "step": 4025 + }, + { + "epoch": 0.2159871244635193, + "grad_norm": 1.290189504623413, + "learning_rate": 4.562118148916408e-06, + "loss": 2.5229, + "step": 4026 + }, + { + "epoch": 0.21604077253218884, + "grad_norm": 1.1281436681747437, + "learning_rate": 4.5618725263709555e-06, + "loss": 2.3205, + "step": 4027 + }, + { + "epoch": 0.21609442060085837, + "grad_norm": 1.1982078552246094, + "learning_rate": 4.561626841571869e-06, + "loss": 2.093, + "step": 4028 + }, + { + "epoch": 0.2161480686695279, + "grad_norm": 1.3428905010223389, + "learning_rate": 4.561381094526568e-06, + "loss": 2.3555, + "step": 4029 + }, + { + "epoch": 0.21620171673819744, + "grad_norm": 1.1932132244110107, + "learning_rate": 4.561135285242472e-06, + "loss": 2.3759, + "step": 4030 + }, + { + "epoch": 0.21625536480686694, + "grad_norm": 1.5166715383529663, + "learning_rate": 4.560889413727e-06, + "loss": 2.3652, + "step": 4031 + }, + { + "epoch": 0.21630901287553647, + "grad_norm": 1.1639577150344849, + "learning_rate": 4.560643479987579e-06, + "loss": 2.3519, + "step": 4032 + }, + { + "epoch": 0.216362660944206, + "grad_norm": 1.2389296293258667, + "learning_rate": 4.560397484031633e-06, + "loss": 2.3184, + "step": 4033 + }, + { + "epoch": 0.21641630901287554, + "grad_norm": 1.2735469341278076, + "learning_rate": 4.560151425866588e-06, + "loss": 1.5577, + "step": 4034 + }, + { + "epoch": 0.21646995708154507, + "grad_norm": 1.2459306716918945, + "learning_rate": 4.559905305499875e-06, + "loss": 2.0357, + "step": 4035 + }, + { + "epoch": 0.2165236051502146, + "grad_norm": 1.2834945917129517, + "learning_rate": 4.5596591229389245e-06, + "loss": 2.3119, + "step": 4036 + }, + { + "epoch": 0.21657725321888413, + "grad_norm": 1.2545154094696045, + "learning_rate": 4.55941287819117e-06, + "loss": 2.1155, + "step": 4037 + }, + { + "epoch": 0.21663090128755363, + "grad_norm": 1.2819812297821045, + "learning_rate": 4.559166571264045e-06, + "loss": 2.3184, + "step": 4038 + }, + { + "epoch": 0.21668454935622317, + "grad_norm": 1.3518935441970825, + "learning_rate": 4.558920202164987e-06, + "loss": 2.2692, + "step": 4039 + }, + { + "epoch": 0.2167381974248927, + "grad_norm": 1.2696197032928467, + "learning_rate": 4.558673770901434e-06, + "loss": 2.13, + "step": 4040 + }, + { + "epoch": 0.21679184549356223, + "grad_norm": 1.2872360944747925, + "learning_rate": 4.558427277480827e-06, + "loss": 2.3448, + "step": 4041 + }, + { + "epoch": 0.21684549356223176, + "grad_norm": 1.2360516786575317, + "learning_rate": 4.558180721910609e-06, + "loss": 2.2813, + "step": 4042 + }, + { + "epoch": 0.2168991416309013, + "grad_norm": 1.4598301649093628, + "learning_rate": 4.557934104198223e-06, + "loss": 2.5138, + "step": 4043 + }, + { + "epoch": 0.21695278969957082, + "grad_norm": 1.3264082670211792, + "learning_rate": 4.557687424351115e-06, + "loss": 2.4326, + "step": 4044 + }, + { + "epoch": 0.21700643776824036, + "grad_norm": 2.1275906562805176, + "learning_rate": 4.557440682376734e-06, + "loss": 2.3182, + "step": 4045 + }, + { + "epoch": 0.21706008583690986, + "grad_norm": 2.638984203338623, + "learning_rate": 4.557193878282529e-06, + "loss": 2.4731, + "step": 4046 + }, + { + "epoch": 0.2171137339055794, + "grad_norm": 1.2371875047683716, + "learning_rate": 4.55694701207595e-06, + "loss": 2.3305, + "step": 4047 + }, + { + "epoch": 0.21716738197424892, + "grad_norm": 1.2358791828155518, + "learning_rate": 4.556700083764455e-06, + "loss": 2.5184, + "step": 4048 + }, + { + "epoch": 0.21722103004291846, + "grad_norm": 1.2942252159118652, + "learning_rate": 4.556453093355497e-06, + "loss": 2.3306, + "step": 4049 + }, + { + "epoch": 0.217274678111588, + "grad_norm": 1.5318509340286255, + "learning_rate": 4.556206040856532e-06, + "loss": 2.1764, + "step": 4050 + }, + { + "epoch": 0.21732832618025752, + "grad_norm": 1.396066665649414, + "learning_rate": 4.555958926275022e-06, + "loss": 2.2244, + "step": 4051 + }, + { + "epoch": 0.21738197424892705, + "grad_norm": 1.4724102020263672, + "learning_rate": 4.555711749618424e-06, + "loss": 2.4819, + "step": 4052 + }, + { + "epoch": 0.21743562231759656, + "grad_norm": 1.2758296728134155, + "learning_rate": 4.555464510894205e-06, + "loss": 2.2461, + "step": 4053 + }, + { + "epoch": 0.2174892703862661, + "grad_norm": 2.528101921081543, + "learning_rate": 4.555217210109829e-06, + "loss": 2.5992, + "step": 4054 + }, + { + "epoch": 0.21754291845493562, + "grad_norm": 1.2149527072906494, + "learning_rate": 4.554969847272761e-06, + "loss": 2.4921, + "step": 4055 + }, + { + "epoch": 0.21759656652360515, + "grad_norm": 2.0609002113342285, + "learning_rate": 4.554722422390471e-06, + "loss": 2.195, + "step": 4056 + }, + { + "epoch": 0.21765021459227468, + "grad_norm": 1.2150212526321411, + "learning_rate": 4.554474935470429e-06, + "loss": 2.2701, + "step": 4057 + }, + { + "epoch": 0.21770386266094421, + "grad_norm": 5.598989009857178, + "learning_rate": 4.554227386520107e-06, + "loss": 2.0309, + "step": 4058 + }, + { + "epoch": 0.21775751072961375, + "grad_norm": 1.238718867301941, + "learning_rate": 4.55397977554698e-06, + "loss": 1.3881, + "step": 4059 + }, + { + "epoch": 0.21781115879828325, + "grad_norm": 1.2481614351272583, + "learning_rate": 4.553732102558523e-06, + "loss": 1.9875, + "step": 4060 + }, + { + "epoch": 0.21786480686695278, + "grad_norm": 1.3086787462234497, + "learning_rate": 4.553484367562215e-06, + "loss": 2.3285, + "step": 4061 + }, + { + "epoch": 0.2179184549356223, + "grad_norm": 1.5588326454162598, + "learning_rate": 4.553236570565535e-06, + "loss": 2.2036, + "step": 4062 + }, + { + "epoch": 0.21797210300429185, + "grad_norm": 1.3315690755844116, + "learning_rate": 4.552988711575965e-06, + "loss": 2.3599, + "step": 4063 + }, + { + "epoch": 0.21802575107296138, + "grad_norm": 1.2024998664855957, + "learning_rate": 4.552740790600989e-06, + "loss": 1.6921, + "step": 4064 + }, + { + "epoch": 0.2180793991416309, + "grad_norm": 2.979360580444336, + "learning_rate": 4.552492807648091e-06, + "loss": 2.2833, + "step": 4065 + }, + { + "epoch": 0.21813304721030044, + "grad_norm": 1.2613452672958374, + "learning_rate": 4.55224476272476e-06, + "loss": 2.4071, + "step": 4066 + }, + { + "epoch": 0.21818669527896994, + "grad_norm": 1.4730923175811768, + "learning_rate": 4.551996655838484e-06, + "loss": 2.2736, + "step": 4067 + }, + { + "epoch": 0.21824034334763948, + "grad_norm": 1.6711945533752441, + "learning_rate": 4.551748486996755e-06, + "loss": 2.2327, + "step": 4068 + }, + { + "epoch": 0.218293991416309, + "grad_norm": 1.295769214630127, + "learning_rate": 4.551500256207065e-06, + "loss": 2.1913, + "step": 4069 + }, + { + "epoch": 0.21834763948497854, + "grad_norm": 1.2700793743133545, + "learning_rate": 4.5512519634769095e-06, + "loss": 2.3327, + "step": 4070 + }, + { + "epoch": 0.21840128755364807, + "grad_norm": 1.387874960899353, + "learning_rate": 4.551003608813784e-06, + "loss": 2.6176, + "step": 4071 + }, + { + "epoch": 0.2184549356223176, + "grad_norm": 1.2596790790557861, + "learning_rate": 4.5507551922251886e-06, + "loss": 2.1466, + "step": 4072 + }, + { + "epoch": 0.21850858369098713, + "grad_norm": 1.1596606969833374, + "learning_rate": 4.550506713718622e-06, + "loss": 1.9931, + "step": 4073 + }, + { + "epoch": 0.21856223175965664, + "grad_norm": 1.500030279159546, + "learning_rate": 4.550258173301588e-06, + "loss": 2.2285, + "step": 4074 + }, + { + "epoch": 0.21861587982832617, + "grad_norm": 1.5016918182373047, + "learning_rate": 4.55000957098159e-06, + "loss": 2.4667, + "step": 4075 + }, + { + "epoch": 0.2186695278969957, + "grad_norm": 0.9242588877677917, + "learning_rate": 4.549760906766134e-06, + "loss": 2.0743, + "step": 4076 + }, + { + "epoch": 0.21872317596566523, + "grad_norm": 1.260085105895996, + "learning_rate": 4.549512180662727e-06, + "loss": 2.3511, + "step": 4077 + }, + { + "epoch": 0.21877682403433477, + "grad_norm": 1.0691243410110474, + "learning_rate": 4.549263392678881e-06, + "loss": 2.0043, + "step": 4078 + }, + { + "epoch": 0.2188304721030043, + "grad_norm": 1.4263142347335815, + "learning_rate": 4.549014542822105e-06, + "loss": 2.1288, + "step": 4079 + }, + { + "epoch": 0.21888412017167383, + "grad_norm": 1.1576175689697266, + "learning_rate": 4.548765631099914e-06, + "loss": 2.1177, + "step": 4080 + }, + { + "epoch": 0.21893776824034336, + "grad_norm": 1.2837655544281006, + "learning_rate": 4.548516657519824e-06, + "loss": 2.405, + "step": 4081 + }, + { + "epoch": 0.21899141630901287, + "grad_norm": 1.2756690979003906, + "learning_rate": 4.54826762208935e-06, + "loss": 2.4133, + "step": 4082 + }, + { + "epoch": 0.2190450643776824, + "grad_norm": 0.9815880060195923, + "learning_rate": 4.548018524816013e-06, + "loss": 2.2179, + "step": 4083 + }, + { + "epoch": 0.21909871244635193, + "grad_norm": 1.3628289699554443, + "learning_rate": 4.547769365707333e-06, + "loss": 2.4824, + "step": 4084 + }, + { + "epoch": 0.21915236051502146, + "grad_norm": 1.2855581045150757, + "learning_rate": 4.5475201447708325e-06, + "loss": 2.2325, + "step": 4085 + }, + { + "epoch": 0.219206008583691, + "grad_norm": 1.3037744760513306, + "learning_rate": 4.547270862014037e-06, + "loss": 2.2047, + "step": 4086 + }, + { + "epoch": 0.21925965665236052, + "grad_norm": 1.1918920278549194, + "learning_rate": 4.547021517444473e-06, + "loss": 1.9462, + "step": 4087 + }, + { + "epoch": 0.21931330472103006, + "grad_norm": 1.4301564693450928, + "learning_rate": 4.546772111069669e-06, + "loss": 2.1486, + "step": 4088 + }, + { + "epoch": 0.21936695278969956, + "grad_norm": 1.0981155633926392, + "learning_rate": 4.546522642897155e-06, + "loss": 2.0675, + "step": 4089 + }, + { + "epoch": 0.2194206008583691, + "grad_norm": 1.3486417531967163, + "learning_rate": 4.546273112934462e-06, + "loss": 2.4478, + "step": 4090 + }, + { + "epoch": 0.21947424892703862, + "grad_norm": 1.2009557485580444, + "learning_rate": 4.546023521189125e-06, + "loss": 2.1854, + "step": 4091 + }, + { + "epoch": 0.21952789699570815, + "grad_norm": 1.3875855207443237, + "learning_rate": 4.54577386766868e-06, + "loss": 2.3317, + "step": 4092 + }, + { + "epoch": 0.2195815450643777, + "grad_norm": 1.4080275297164917, + "learning_rate": 4.545524152380665e-06, + "loss": 2.2101, + "step": 4093 + }, + { + "epoch": 0.21963519313304722, + "grad_norm": 1.6188087463378906, + "learning_rate": 4.54527437533262e-06, + "loss": 2.4661, + "step": 4094 + }, + { + "epoch": 0.21968884120171675, + "grad_norm": 1.0407153367996216, + "learning_rate": 4.545024536532085e-06, + "loss": 2.1353, + "step": 4095 + }, + { + "epoch": 0.21974248927038625, + "grad_norm": 1.2516001462936401, + "learning_rate": 4.544774635986603e-06, + "loss": 2.1715, + "step": 4096 + }, + { + "epoch": 0.21979613733905579, + "grad_norm": 1.339664101600647, + "learning_rate": 4.544524673703721e-06, + "loss": 2.4846, + "step": 4097 + }, + { + "epoch": 0.21984978540772532, + "grad_norm": 1.0787155628204346, + "learning_rate": 4.5442746496909854e-06, + "loss": 2.4245, + "step": 4098 + }, + { + "epoch": 0.21990343347639485, + "grad_norm": 1.3437243700027466, + "learning_rate": 4.544024563955946e-06, + "loss": 2.3708, + "step": 4099 + }, + { + "epoch": 0.21995708154506438, + "grad_norm": 1.0187792778015137, + "learning_rate": 4.5437744165061504e-06, + "loss": 2.2779, + "step": 4100 + }, + { + "epoch": 0.2200107296137339, + "grad_norm": 1.6815812587738037, + "learning_rate": 4.543524207349155e-06, + "loss": 2.3876, + "step": 4101 + }, + { + "epoch": 0.22006437768240344, + "grad_norm": 1.2402664422988892, + "learning_rate": 4.543273936492511e-06, + "loss": 2.2162, + "step": 4102 + }, + { + "epoch": 0.22011802575107295, + "grad_norm": 1.2237123250961304, + "learning_rate": 4.543023603943778e-06, + "loss": 2.1896, + "step": 4103 + }, + { + "epoch": 0.22017167381974248, + "grad_norm": 1.2247097492218018, + "learning_rate": 4.542773209710513e-06, + "loss": 2.1518, + "step": 4104 + }, + { + "epoch": 0.220225321888412, + "grad_norm": 1.2794774770736694, + "learning_rate": 4.542522753800275e-06, + "loss": 2.1795, + "step": 4105 + }, + { + "epoch": 0.22027896995708154, + "grad_norm": 1.0907092094421387, + "learning_rate": 4.542272236220626e-06, + "loss": 2.2592, + "step": 4106 + }, + { + "epoch": 0.22033261802575108, + "grad_norm": 1.3120157718658447, + "learning_rate": 4.542021656979132e-06, + "loss": 2.1254, + "step": 4107 + }, + { + "epoch": 0.2203862660944206, + "grad_norm": 1.1971904039382935, + "learning_rate": 4.541771016083356e-06, + "loss": 2.2797, + "step": 4108 + }, + { + "epoch": 0.22043991416309014, + "grad_norm": 1.1954035758972168, + "learning_rate": 4.541520313540867e-06, + "loss": 2.0989, + "step": 4109 + }, + { + "epoch": 0.22049356223175964, + "grad_norm": 1.2555776834487915, + "learning_rate": 4.541269549359235e-06, + "loss": 2.3772, + "step": 4110 + }, + { + "epoch": 0.22054721030042918, + "grad_norm": 1.2451598644256592, + "learning_rate": 4.541018723546029e-06, + "loss": 2.1273, + "step": 4111 + }, + { + "epoch": 0.2206008583690987, + "grad_norm": 1.6648187637329102, + "learning_rate": 4.5407678361088255e-06, + "loss": 2.4718, + "step": 4112 + }, + { + "epoch": 0.22065450643776824, + "grad_norm": 1.4406028985977173, + "learning_rate": 4.540516887055196e-06, + "loss": 2.2285, + "step": 4113 + }, + { + "epoch": 0.22070815450643777, + "grad_norm": 1.1565648317337036, + "learning_rate": 4.54026587639272e-06, + "loss": 2.107, + "step": 4114 + }, + { + "epoch": 0.2207618025751073, + "grad_norm": 1.4181222915649414, + "learning_rate": 4.540014804128974e-06, + "loss": 2.3274, + "step": 4115 + }, + { + "epoch": 0.22081545064377683, + "grad_norm": 1.2595993280410767, + "learning_rate": 4.539763670271541e-06, + "loss": 2.2474, + "step": 4116 + }, + { + "epoch": 0.22086909871244637, + "grad_norm": 1.216870665550232, + "learning_rate": 4.539512474828001e-06, + "loss": 2.2145, + "step": 4117 + }, + { + "epoch": 0.22092274678111587, + "grad_norm": 1.519698143005371, + "learning_rate": 4.53926121780594e-06, + "loss": 2.1149, + "step": 4118 + }, + { + "epoch": 0.2209763948497854, + "grad_norm": 1.309591293334961, + "learning_rate": 4.539009899212943e-06, + "loss": 2.3481, + "step": 4119 + }, + { + "epoch": 0.22103004291845493, + "grad_norm": 1.5477757453918457, + "learning_rate": 4.538758519056598e-06, + "loss": 1.3498, + "step": 4120 + }, + { + "epoch": 0.22108369098712446, + "grad_norm": 1.23820161819458, + "learning_rate": 4.538507077344498e-06, + "loss": 2.329, + "step": 4121 + }, + { + "epoch": 0.221137339055794, + "grad_norm": 1.4710888862609863, + "learning_rate": 4.538255574084229e-06, + "loss": 2.2683, + "step": 4122 + }, + { + "epoch": 0.22119098712446353, + "grad_norm": 1.0721662044525146, + "learning_rate": 4.53800400928339e-06, + "loss": 1.9682, + "step": 4123 + }, + { + "epoch": 0.22124463519313306, + "grad_norm": 1.225087285041809, + "learning_rate": 4.5377523829495725e-06, + "loss": 2.1572, + "step": 4124 + }, + { + "epoch": 0.22129828326180256, + "grad_norm": 1.3383469581604004, + "learning_rate": 4.5375006950903766e-06, + "loss": 2.1066, + "step": 4125 + }, + { + "epoch": 0.2213519313304721, + "grad_norm": 1.2422738075256348, + "learning_rate": 4.5372489457133995e-06, + "loss": 2.3146, + "step": 4126 + }, + { + "epoch": 0.22140557939914163, + "grad_norm": 1.0540372133255005, + "learning_rate": 4.536997134826244e-06, + "loss": 2.2023, + "step": 4127 + }, + { + "epoch": 0.22145922746781116, + "grad_norm": 1.0352305173873901, + "learning_rate": 4.53674526243651e-06, + "loss": 1.8288, + "step": 4128 + }, + { + "epoch": 0.2215128755364807, + "grad_norm": 1.194169282913208, + "learning_rate": 4.5364933285518064e-06, + "loss": 2.4203, + "step": 4129 + }, + { + "epoch": 0.22156652360515022, + "grad_norm": 1.3234004974365234, + "learning_rate": 4.536241333179736e-06, + "loss": 2.3028, + "step": 4130 + }, + { + "epoch": 0.22162017167381975, + "grad_norm": 1.195081114768982, + "learning_rate": 4.53598927632791e-06, + "loss": 2.3004, + "step": 4131 + }, + { + "epoch": 0.22167381974248926, + "grad_norm": 1.2846063375473022, + "learning_rate": 4.535737158003937e-06, + "loss": 2.2635, + "step": 4132 + }, + { + "epoch": 0.2217274678111588, + "grad_norm": 4.552309513092041, + "learning_rate": 4.535484978215429e-06, + "loss": 2.1417, + "step": 4133 + }, + { + "epoch": 0.22178111587982832, + "grad_norm": 1.1983071565628052, + "learning_rate": 4.535232736970001e-06, + "loss": 2.2581, + "step": 4134 + }, + { + "epoch": 0.22183476394849785, + "grad_norm": 1.5103843212127686, + "learning_rate": 4.534980434275269e-06, + "loss": 2.4994, + "step": 4135 + }, + { + "epoch": 0.22188841201716739, + "grad_norm": 1.4699982404708862, + "learning_rate": 4.53472807013885e-06, + "loss": 2.2979, + "step": 4136 + }, + { + "epoch": 0.22194206008583692, + "grad_norm": 1.1811516284942627, + "learning_rate": 4.534475644568364e-06, + "loss": 2.2803, + "step": 4137 + }, + { + "epoch": 0.22199570815450645, + "grad_norm": 1.2863566875457764, + "learning_rate": 4.534223157571432e-06, + "loss": 2.593, + "step": 4138 + }, + { + "epoch": 0.22204935622317595, + "grad_norm": 1.4187575578689575, + "learning_rate": 4.533970609155678e-06, + "loss": 2.3258, + "step": 4139 + }, + { + "epoch": 0.22210300429184548, + "grad_norm": 1.1688623428344727, + "learning_rate": 4.533717999328725e-06, + "loss": 2.1834, + "step": 4140 + }, + { + "epoch": 0.22215665236051502, + "grad_norm": 1.2112675905227661, + "learning_rate": 4.533465328098204e-06, + "loss": 2.4451, + "step": 4141 + }, + { + "epoch": 0.22221030042918455, + "grad_norm": 1.245354413986206, + "learning_rate": 4.533212595471739e-06, + "loss": 2.2461, + "step": 4142 + }, + { + "epoch": 0.22226394849785408, + "grad_norm": 1.238709807395935, + "learning_rate": 4.532959801456964e-06, + "loss": 2.5543, + "step": 4143 + }, + { + "epoch": 0.2223175965665236, + "grad_norm": 1.3161052465438843, + "learning_rate": 4.532706946061512e-06, + "loss": 2.2503, + "step": 4144 + }, + { + "epoch": 0.22237124463519314, + "grad_norm": 1.3506826162338257, + "learning_rate": 4.532454029293014e-06, + "loss": 2.2801, + "step": 4145 + }, + { + "epoch": 0.22242489270386265, + "grad_norm": 2.1322338581085205, + "learning_rate": 4.53220105115911e-06, + "loss": 2.2402, + "step": 4146 + }, + { + "epoch": 0.22247854077253218, + "grad_norm": 3.018296480178833, + "learning_rate": 4.531948011667436e-06, + "loss": 2.322, + "step": 4147 + }, + { + "epoch": 0.2225321888412017, + "grad_norm": 1.3827488422393799, + "learning_rate": 4.531694910825632e-06, + "loss": 2.3907, + "step": 4148 + }, + { + "epoch": 0.22258583690987124, + "grad_norm": 1.122594952583313, + "learning_rate": 4.531441748641342e-06, + "loss": 2.2283, + "step": 4149 + }, + { + "epoch": 0.22263948497854077, + "grad_norm": 1.3123664855957031, + "learning_rate": 4.531188525122206e-06, + "loss": 2.428, + "step": 4150 + }, + { + "epoch": 0.2226931330472103, + "grad_norm": 1.3037792444229126, + "learning_rate": 4.530935240275872e-06, + "loss": 1.9642, + "step": 4151 + }, + { + "epoch": 0.22274678111587984, + "grad_norm": 1.2568076848983765, + "learning_rate": 4.5306818941099875e-06, + "loss": 2.1195, + "step": 4152 + }, + { + "epoch": 0.22280042918454937, + "grad_norm": 1.2873857021331787, + "learning_rate": 4.5304284866322e-06, + "loss": 2.3372, + "step": 4153 + }, + { + "epoch": 0.22285407725321887, + "grad_norm": 1.130839228630066, + "learning_rate": 4.5301750178501625e-06, + "loss": 2.0725, + "step": 4154 + }, + { + "epoch": 0.2229077253218884, + "grad_norm": 1.2262253761291504, + "learning_rate": 4.529921487771527e-06, + "loss": 2.4333, + "step": 4155 + }, + { + "epoch": 0.22296137339055794, + "grad_norm": 1.2814700603485107, + "learning_rate": 4.529667896403948e-06, + "loss": 2.3227, + "step": 4156 + }, + { + "epoch": 0.22301502145922747, + "grad_norm": 1.300201416015625, + "learning_rate": 4.529414243755083e-06, + "loss": 2.0794, + "step": 4157 + }, + { + "epoch": 0.223068669527897, + "grad_norm": 1.2454097270965576, + "learning_rate": 4.5291605298325885e-06, + "loss": 2.2754, + "step": 4158 + }, + { + "epoch": 0.22312231759656653, + "grad_norm": 4.251690864562988, + "learning_rate": 4.528906754644127e-06, + "loss": 1.8464, + "step": 4159 + }, + { + "epoch": 0.22317596566523606, + "grad_norm": 1.262721300125122, + "learning_rate": 4.52865291819736e-06, + "loss": 2.206, + "step": 4160 + }, + { + "epoch": 0.22322961373390557, + "grad_norm": 1.213037371635437, + "learning_rate": 4.528399020499952e-06, + "loss": 2.3047, + "step": 4161 + }, + { + "epoch": 0.2232832618025751, + "grad_norm": 1.457229495048523, + "learning_rate": 4.528145061559567e-06, + "loss": 2.3602, + "step": 4162 + }, + { + "epoch": 0.22333690987124463, + "grad_norm": 1.2745708227157593, + "learning_rate": 4.527891041383875e-06, + "loss": 2.245, + "step": 4163 + }, + { + "epoch": 0.22339055793991416, + "grad_norm": 1.3804290294647217, + "learning_rate": 4.527636959980544e-06, + "loss": 2.1101, + "step": 4164 + }, + { + "epoch": 0.2234442060085837, + "grad_norm": 1.3190988302230835, + "learning_rate": 4.527382817357246e-06, + "loss": 2.2078, + "step": 4165 + }, + { + "epoch": 0.22349785407725323, + "grad_norm": 1.2664481401443481, + "learning_rate": 4.527128613521655e-06, + "loss": 2.131, + "step": 4166 + }, + { + "epoch": 0.22355150214592276, + "grad_norm": 1.161913275718689, + "learning_rate": 4.526874348481445e-06, + "loss": 2.1368, + "step": 4167 + }, + { + "epoch": 0.22360515021459226, + "grad_norm": 1.365136981010437, + "learning_rate": 4.526620022244294e-06, + "loss": 2.363, + "step": 4168 + }, + { + "epoch": 0.2236587982832618, + "grad_norm": 1.2922248840332031, + "learning_rate": 4.526365634817879e-06, + "loss": 2.3746, + "step": 4169 + }, + { + "epoch": 0.22371244635193133, + "grad_norm": 1.430645227432251, + "learning_rate": 4.526111186209882e-06, + "loss": 2.2982, + "step": 4170 + }, + { + "epoch": 0.22376609442060086, + "grad_norm": 1.1746705770492554, + "learning_rate": 4.525856676427986e-06, + "loss": 2.3061, + "step": 4171 + }, + { + "epoch": 0.2238197424892704, + "grad_norm": 1.4003663063049316, + "learning_rate": 4.5256021054798745e-06, + "loss": 1.9149, + "step": 4172 + }, + { + "epoch": 0.22387339055793992, + "grad_norm": 1.1928813457489014, + "learning_rate": 4.525347473373234e-06, + "loss": 2.1898, + "step": 4173 + }, + { + "epoch": 0.22392703862660945, + "grad_norm": 1.1385349035263062, + "learning_rate": 4.525092780115753e-06, + "loss": 2.135, + "step": 4174 + }, + { + "epoch": 0.22398068669527896, + "grad_norm": 1.2306265830993652, + "learning_rate": 4.52483802571512e-06, + "loss": 2.5247, + "step": 4175 + }, + { + "epoch": 0.2240343347639485, + "grad_norm": 1.2577638626098633, + "learning_rate": 4.524583210179028e-06, + "loss": 2.1187, + "step": 4176 + }, + { + "epoch": 0.22408798283261802, + "grad_norm": 1.3242615461349487, + "learning_rate": 4.52432833351517e-06, + "loss": 2.064, + "step": 4177 + }, + { + "epoch": 0.22414163090128755, + "grad_norm": 1.3201019763946533, + "learning_rate": 4.524073395731241e-06, + "loss": 2.2134, + "step": 4178 + }, + { + "epoch": 0.22419527896995708, + "grad_norm": 1.216232180595398, + "learning_rate": 4.523818396834939e-06, + "loss": 2.3239, + "step": 4179 + }, + { + "epoch": 0.22424892703862662, + "grad_norm": 1.3973580598831177, + "learning_rate": 4.523563336833964e-06, + "loss": 2.601, + "step": 4180 + }, + { + "epoch": 0.22430257510729615, + "grad_norm": 1.2858775854110718, + "learning_rate": 4.523308215736015e-06, + "loss": 2.0889, + "step": 4181 + }, + { + "epoch": 0.22435622317596565, + "grad_norm": 1.1894402503967285, + "learning_rate": 4.523053033548796e-06, + "loss": 2.3127, + "step": 4182 + }, + { + "epoch": 0.22440987124463518, + "grad_norm": 1.3871452808380127, + "learning_rate": 4.522797790280012e-06, + "loss": 2.2172, + "step": 4183 + }, + { + "epoch": 0.22446351931330472, + "grad_norm": 1.1873234510421753, + "learning_rate": 4.522542485937369e-06, + "loss": 2.2501, + "step": 4184 + }, + { + "epoch": 0.22451716738197425, + "grad_norm": 1.7398942708969116, + "learning_rate": 4.5222871205285746e-06, + "loss": 2.4169, + "step": 4185 + }, + { + "epoch": 0.22457081545064378, + "grad_norm": 1.3806498050689697, + "learning_rate": 4.5220316940613405e-06, + "loss": 2.3102, + "step": 4186 + }, + { + "epoch": 0.2246244635193133, + "grad_norm": 1.0575028657913208, + "learning_rate": 4.521776206543377e-06, + "loss": 2.1878, + "step": 4187 + }, + { + "epoch": 0.22467811158798284, + "grad_norm": 1.7962061166763306, + "learning_rate": 4.521520657982399e-06, + "loss": 2.1237, + "step": 4188 + }, + { + "epoch": 0.22473175965665235, + "grad_norm": 1.3157016038894653, + "learning_rate": 4.521265048386122e-06, + "loss": 2.1789, + "step": 4189 + }, + { + "epoch": 0.22478540772532188, + "grad_norm": 1.5334219932556152, + "learning_rate": 4.5210093777622645e-06, + "loss": 2.1087, + "step": 4190 + }, + { + "epoch": 0.2248390557939914, + "grad_norm": 1.396945595741272, + "learning_rate": 4.5207536461185446e-06, + "loss": 2.3926, + "step": 4191 + }, + { + "epoch": 0.22489270386266094, + "grad_norm": 1.693298578262329, + "learning_rate": 4.520497853462684e-06, + "loss": 2.5151, + "step": 4192 + }, + { + "epoch": 0.22494635193133047, + "grad_norm": 1.8134039640426636, + "learning_rate": 4.520241999802405e-06, + "loss": 2.0415, + "step": 4193 + }, + { + "epoch": 0.225, + "grad_norm": 1.1591259241104126, + "learning_rate": 4.519986085145435e-06, + "loss": 2.316, + "step": 4194 + }, + { + "epoch": 0.22505364806866954, + "grad_norm": 1.1595256328582764, + "learning_rate": 4.519730109499497e-06, + "loss": 1.5389, + "step": 4195 + }, + { + "epoch": 0.22510729613733907, + "grad_norm": 1.2893412113189697, + "learning_rate": 4.519474072872323e-06, + "loss": 2.4779, + "step": 4196 + }, + { + "epoch": 0.22516094420600857, + "grad_norm": 1.5012093782424927, + "learning_rate": 4.519217975271642e-06, + "loss": 2.3136, + "step": 4197 + }, + { + "epoch": 0.2252145922746781, + "grad_norm": 1.4367676973342896, + "learning_rate": 4.518961816705187e-06, + "loss": 2.2793, + "step": 4198 + }, + { + "epoch": 0.22526824034334764, + "grad_norm": 1.3509337902069092, + "learning_rate": 4.51870559718069e-06, + "loss": 2.3964, + "step": 4199 + }, + { + "epoch": 0.22532188841201717, + "grad_norm": 0.9810132384300232, + "learning_rate": 4.5184493167058905e-06, + "loss": 1.8963, + "step": 4200 + }, + { + "epoch": 0.2253755364806867, + "grad_norm": 1.268875241279602, + "learning_rate": 4.518192975288524e-06, + "loss": 2.1802, + "step": 4201 + }, + { + "epoch": 0.22542918454935623, + "grad_norm": 1.781166434288025, + "learning_rate": 4.5179365729363285e-06, + "loss": 2.3234, + "step": 4202 + }, + { + "epoch": 0.22548283261802576, + "grad_norm": 1.4591174125671387, + "learning_rate": 4.51768010965705e-06, + "loss": 2.329, + "step": 4203 + }, + { + "epoch": 0.22553648068669527, + "grad_norm": 1.3794262409210205, + "learning_rate": 4.517423585458428e-06, + "loss": 2.2176, + "step": 4204 + }, + { + "epoch": 0.2255901287553648, + "grad_norm": 1.1388839483261108, + "learning_rate": 4.51716700034821e-06, + "loss": 2.2514, + "step": 4205 + }, + { + "epoch": 0.22564377682403433, + "grad_norm": 1.401643991470337, + "learning_rate": 4.516910354334141e-06, + "loss": 2.0769, + "step": 4206 + }, + { + "epoch": 0.22569742489270386, + "grad_norm": 1.2006672620773315, + "learning_rate": 4.516653647423972e-06, + "loss": 2.0907, + "step": 4207 + }, + { + "epoch": 0.2257510729613734, + "grad_norm": 1.060868263244629, + "learning_rate": 4.5163968796254515e-06, + "loss": 1.9721, + "step": 4208 + }, + { + "epoch": 0.22580472103004293, + "grad_norm": 1.0899029970169067, + "learning_rate": 4.516140050946334e-06, + "loss": 2.1945, + "step": 4209 + }, + { + "epoch": 0.22585836909871246, + "grad_norm": 1.2583327293395996, + "learning_rate": 4.515883161394372e-06, + "loss": 2.3642, + "step": 4210 + }, + { + "epoch": 0.22591201716738196, + "grad_norm": 1.1657730340957642, + "learning_rate": 4.515626210977323e-06, + "loss": 2.2709, + "step": 4211 + }, + { + "epoch": 0.2259656652360515, + "grad_norm": 1.3851717710494995, + "learning_rate": 4.515369199702946e-06, + "loss": 2.482, + "step": 4212 + }, + { + "epoch": 0.22601931330472103, + "grad_norm": 1.3141083717346191, + "learning_rate": 4.5151121275789985e-06, + "loss": 2.3812, + "step": 4213 + }, + { + "epoch": 0.22607296137339056, + "grad_norm": 1.3523257970809937, + "learning_rate": 4.514854994613244e-06, + "loss": 2.2384, + "step": 4214 + }, + { + "epoch": 0.2261266094420601, + "grad_norm": 1.6801468133926392, + "learning_rate": 4.514597800813444e-06, + "loss": 2.1687, + "step": 4215 + }, + { + "epoch": 0.22618025751072962, + "grad_norm": 1.14146089553833, + "learning_rate": 4.514340546187367e-06, + "loss": 2.2149, + "step": 4216 + }, + { + "epoch": 0.22623390557939915, + "grad_norm": 1.1192948818206787, + "learning_rate": 4.514083230742778e-06, + "loss": 2.1961, + "step": 4217 + }, + { + "epoch": 0.22628755364806866, + "grad_norm": 1.266595721244812, + "learning_rate": 4.513825854487446e-06, + "loss": 2.3144, + "step": 4218 + }, + { + "epoch": 0.2263412017167382, + "grad_norm": 1.2343413829803467, + "learning_rate": 4.513568417429143e-06, + "loss": 2.5629, + "step": 4219 + }, + { + "epoch": 0.22639484978540772, + "grad_norm": 1.368905782699585, + "learning_rate": 4.513310919575641e-06, + "loss": 2.2909, + "step": 4220 + }, + { + "epoch": 0.22644849785407725, + "grad_norm": 1.2427005767822266, + "learning_rate": 4.513053360934715e-06, + "loss": 2.2753, + "step": 4221 + }, + { + "epoch": 0.22650214592274678, + "grad_norm": 1.0451421737670898, + "learning_rate": 4.5127957415141404e-06, + "loss": 2.0216, + "step": 4222 + }, + { + "epoch": 0.22655579399141632, + "grad_norm": 1.269049882888794, + "learning_rate": 4.5125380613216975e-06, + "loss": 2.258, + "step": 4223 + }, + { + "epoch": 0.22660944206008585, + "grad_norm": 1.429309606552124, + "learning_rate": 4.512280320365164e-06, + "loss": 2.2943, + "step": 4224 + }, + { + "epoch": 0.22666309012875535, + "grad_norm": 1.4048737287521362, + "learning_rate": 4.512022518652324e-06, + "loss": 2.2503, + "step": 4225 + }, + { + "epoch": 0.22671673819742488, + "grad_norm": 1.8769124746322632, + "learning_rate": 4.511764656190959e-06, + "loss": 1.7067, + "step": 4226 + }, + { + "epoch": 0.22677038626609441, + "grad_norm": 1.3965413570404053, + "learning_rate": 4.511506732988857e-06, + "loss": 1.3666, + "step": 4227 + }, + { + "epoch": 0.22682403433476395, + "grad_norm": 1.4848700761795044, + "learning_rate": 4.511248749053803e-06, + "loss": 1.5891, + "step": 4228 + }, + { + "epoch": 0.22687768240343348, + "grad_norm": 1.4053491353988647, + "learning_rate": 4.510990704393589e-06, + "loss": 1.3795, + "step": 4229 + }, + { + "epoch": 0.226931330472103, + "grad_norm": 1.3739265203475952, + "learning_rate": 4.510732599016003e-06, + "loss": 2.2727, + "step": 4230 + }, + { + "epoch": 0.22698497854077254, + "grad_norm": 1.3298473358154297, + "learning_rate": 4.51047443292884e-06, + "loss": 2.1384, + "step": 4231 + }, + { + "epoch": 0.22703862660944207, + "grad_norm": 1.1832493543624878, + "learning_rate": 4.5102162061398944e-06, + "loss": 2.0473, + "step": 4232 + }, + { + "epoch": 0.22709227467811158, + "grad_norm": 1.3594063520431519, + "learning_rate": 4.509957918656962e-06, + "loss": 2.378, + "step": 4233 + }, + { + "epoch": 0.2271459227467811, + "grad_norm": 1.0908880233764648, + "learning_rate": 4.509699570487842e-06, + "loss": 2.2078, + "step": 4234 + }, + { + "epoch": 0.22719957081545064, + "grad_norm": 1.4295746088027954, + "learning_rate": 4.509441161640334e-06, + "loss": 2.4149, + "step": 4235 + }, + { + "epoch": 0.22725321888412017, + "grad_norm": 1.4791003465652466, + "learning_rate": 4.5091826921222415e-06, + "loss": 2.2749, + "step": 4236 + }, + { + "epoch": 0.2273068669527897, + "grad_norm": 1.304463267326355, + "learning_rate": 4.508924161941367e-06, + "loss": 2.1569, + "step": 4237 + }, + { + "epoch": 0.22736051502145924, + "grad_norm": 1.2860138416290283, + "learning_rate": 4.508665571105517e-06, + "loss": 2.3785, + "step": 4238 + }, + { + "epoch": 0.22741416309012877, + "grad_norm": 1.928805947303772, + "learning_rate": 4.508406919622498e-06, + "loss": 2.2349, + "step": 4239 + }, + { + "epoch": 0.22746781115879827, + "grad_norm": 1.218626856803894, + "learning_rate": 4.508148207500121e-06, + "loss": 2.2695, + "step": 4240 + }, + { + "epoch": 0.2275214592274678, + "grad_norm": 1.2615572214126587, + "learning_rate": 4.507889434746195e-06, + "loss": 2.3562, + "step": 4241 + }, + { + "epoch": 0.22757510729613734, + "grad_norm": 1.3275768756866455, + "learning_rate": 4.507630601368535e-06, + "loss": 2.3078, + "step": 4242 + }, + { + "epoch": 0.22762875536480687, + "grad_norm": 1.3431469202041626, + "learning_rate": 4.5073717073749554e-06, + "loss": 2.2589, + "step": 4243 + }, + { + "epoch": 0.2276824034334764, + "grad_norm": 1.3678847551345825, + "learning_rate": 4.507112752773273e-06, + "loss": 1.9702, + "step": 4244 + }, + { + "epoch": 0.22773605150214593, + "grad_norm": 2.9123919010162354, + "learning_rate": 4.506853737571306e-06, + "loss": 1.8705, + "step": 4245 + }, + { + "epoch": 0.22778969957081546, + "grad_norm": 1.277217984199524, + "learning_rate": 4.506594661776875e-06, + "loss": 2.2051, + "step": 4246 + }, + { + "epoch": 0.22784334763948497, + "grad_norm": 1.0994324684143066, + "learning_rate": 4.506335525397803e-06, + "loss": 2.0259, + "step": 4247 + }, + { + "epoch": 0.2278969957081545, + "grad_norm": 1.2504770755767822, + "learning_rate": 4.506076328441912e-06, + "loss": 2.0129, + "step": 4248 + }, + { + "epoch": 0.22795064377682403, + "grad_norm": 1.184155821800232, + "learning_rate": 4.505817070917029e-06, + "loss": 2.4234, + "step": 4249 + }, + { + "epoch": 0.22800429184549356, + "grad_norm": 1.2529385089874268, + "learning_rate": 4.505557752830982e-06, + "loss": 2.3375, + "step": 4250 + }, + { + "epoch": 0.2280579399141631, + "grad_norm": 1.2555314302444458, + "learning_rate": 4.5052983741916015e-06, + "loss": 2.1866, + "step": 4251 + }, + { + "epoch": 0.22811158798283263, + "grad_norm": 1.4596494436264038, + "learning_rate": 4.505038935006717e-06, + "loss": 2.4552, + "step": 4252 + }, + { + "epoch": 0.22816523605150216, + "grad_norm": 1.0968877077102661, + "learning_rate": 4.504779435284162e-06, + "loss": 2.0658, + "step": 4253 + }, + { + "epoch": 0.22821888412017166, + "grad_norm": 1.2761653661727905, + "learning_rate": 4.504519875031772e-06, + "loss": 2.2119, + "step": 4254 + }, + { + "epoch": 0.2282725321888412, + "grad_norm": 1.3791866302490234, + "learning_rate": 4.504260254257384e-06, + "loss": 2.0385, + "step": 4255 + }, + { + "epoch": 0.22832618025751072, + "grad_norm": 1.2698209285736084, + "learning_rate": 4.5040005729688375e-06, + "loss": 2.0538, + "step": 4256 + }, + { + "epoch": 0.22837982832618026, + "grad_norm": 1.2167953252792358, + "learning_rate": 4.5037408311739706e-06, + "loss": 2.3168, + "step": 4257 + }, + { + "epoch": 0.2284334763948498, + "grad_norm": 1.2450553178787231, + "learning_rate": 4.503481028880627e-06, + "loss": 2.4873, + "step": 4258 + }, + { + "epoch": 0.22848712446351932, + "grad_norm": 1.2709585428237915, + "learning_rate": 4.503221166096651e-06, + "loss": 2.3325, + "step": 4259 + }, + { + "epoch": 0.22854077253218885, + "grad_norm": 1.2812600135803223, + "learning_rate": 4.502961242829889e-06, + "loss": 2.362, + "step": 4260 + }, + { + "epoch": 0.22859442060085836, + "grad_norm": 1.6778184175491333, + "learning_rate": 4.502701259088186e-06, + "loss": 2.1352, + "step": 4261 + }, + { + "epoch": 0.2286480686695279, + "grad_norm": 3.1060702800750732, + "learning_rate": 4.5024412148793965e-06, + "loss": 2.464, + "step": 4262 + }, + { + "epoch": 0.22870171673819742, + "grad_norm": 1.412001609802246, + "learning_rate": 4.502181110211367e-06, + "loss": 2.5053, + "step": 4263 + }, + { + "epoch": 0.22875536480686695, + "grad_norm": 1.2603927850723267, + "learning_rate": 4.501920945091954e-06, + "loss": 2.1347, + "step": 4264 + }, + { + "epoch": 0.22880901287553648, + "grad_norm": 1.1803603172302246, + "learning_rate": 4.501660719529012e-06, + "loss": 2.3415, + "step": 4265 + }, + { + "epoch": 0.22886266094420601, + "grad_norm": 1.2559101581573486, + "learning_rate": 4.501400433530398e-06, + "loss": 2.3335, + "step": 4266 + }, + { + "epoch": 0.22891630901287555, + "grad_norm": 1.179594874382019, + "learning_rate": 4.5011400871039685e-06, + "loss": 2.1712, + "step": 4267 + }, + { + "epoch": 0.22896995708154508, + "grad_norm": 1.2721190452575684, + "learning_rate": 4.500879680257587e-06, + "loss": 2.429, + "step": 4268 + }, + { + "epoch": 0.22902360515021458, + "grad_norm": 1.3187835216522217, + "learning_rate": 4.500619212999115e-06, + "loss": 2.4277, + "step": 4269 + }, + { + "epoch": 0.2290772532188841, + "grad_norm": 1.4782633781433105, + "learning_rate": 4.500358685336415e-06, + "loss": 2.1177, + "step": 4270 + }, + { + "epoch": 0.22913090128755365, + "grad_norm": 1.3469854593276978, + "learning_rate": 4.500098097277356e-06, + "loss": 2.2742, + "step": 4271 + }, + { + "epoch": 0.22918454935622318, + "grad_norm": 1.2353256940841675, + "learning_rate": 4.499837448829804e-06, + "loss": 2.1407, + "step": 4272 + }, + { + "epoch": 0.2292381974248927, + "grad_norm": 1.243424892425537, + "learning_rate": 4.49957674000163e-06, + "loss": 2.1023, + "step": 4273 + }, + { + "epoch": 0.22929184549356224, + "grad_norm": 1.249490737915039, + "learning_rate": 4.499315970800704e-06, + "loss": 2.0584, + "step": 4274 + }, + { + "epoch": 0.22934549356223177, + "grad_norm": 1.520580768585205, + "learning_rate": 4.4990551412349005e-06, + "loss": 2.3413, + "step": 4275 + }, + { + "epoch": 0.22939914163090128, + "grad_norm": 1.3006083965301514, + "learning_rate": 4.498794251312093e-06, + "loss": 2.3671, + "step": 4276 + }, + { + "epoch": 0.2294527896995708, + "grad_norm": 1.059979796409607, + "learning_rate": 4.49853330104016e-06, + "loss": 1.5822, + "step": 4277 + }, + { + "epoch": 0.22950643776824034, + "grad_norm": 2.240868091583252, + "learning_rate": 4.498272290426981e-06, + "loss": 2.4461, + "step": 4278 + }, + { + "epoch": 0.22956008583690987, + "grad_norm": 1.3489139080047607, + "learning_rate": 4.498011219480435e-06, + "loss": 2.4347, + "step": 4279 + }, + { + "epoch": 0.2296137339055794, + "grad_norm": 1.267393708229065, + "learning_rate": 4.4977500882084046e-06, + "loss": 2.3078, + "step": 4280 + }, + { + "epoch": 0.22966738197424894, + "grad_norm": 1.1420133113861084, + "learning_rate": 4.497488896618775e-06, + "loss": 2.1731, + "step": 4281 + }, + { + "epoch": 0.22972103004291847, + "grad_norm": 1.2444493770599365, + "learning_rate": 4.4972276447194315e-06, + "loss": 2.1263, + "step": 4282 + }, + { + "epoch": 0.22977467811158797, + "grad_norm": 1.1850425004959106, + "learning_rate": 4.496966332518262e-06, + "loss": 2.0798, + "step": 4283 + }, + { + "epoch": 0.2298283261802575, + "grad_norm": 1.1372735500335693, + "learning_rate": 4.496704960023158e-06, + "loss": 2.0609, + "step": 4284 + }, + { + "epoch": 0.22988197424892703, + "grad_norm": 1.4855576753616333, + "learning_rate": 4.496443527242008e-06, + "loss": 2.4227, + "step": 4285 + }, + { + "epoch": 0.22993562231759657, + "grad_norm": 1.21796452999115, + "learning_rate": 4.496182034182708e-06, + "loss": 2.3481, + "step": 4286 + }, + { + "epoch": 0.2299892703862661, + "grad_norm": 1.2259410619735718, + "learning_rate": 4.495920480853152e-06, + "loss": 2.367, + "step": 4287 + }, + { + "epoch": 0.23004291845493563, + "grad_norm": 1.323128342628479, + "learning_rate": 4.495658867261238e-06, + "loss": 2.0459, + "step": 4288 + }, + { + "epoch": 0.23009656652360516, + "grad_norm": 1.424511194229126, + "learning_rate": 4.495397193414863e-06, + "loss": 2.2416, + "step": 4289 + }, + { + "epoch": 0.23015021459227467, + "grad_norm": 2.6157166957855225, + "learning_rate": 4.495135459321929e-06, + "loss": 1.6254, + "step": 4290 + }, + { + "epoch": 0.2302038626609442, + "grad_norm": 1.2663716077804565, + "learning_rate": 4.494873664990338e-06, + "loss": 2.3249, + "step": 4291 + }, + { + "epoch": 0.23025751072961373, + "grad_norm": 1.3946375846862793, + "learning_rate": 4.494611810427994e-06, + "loss": 2.2294, + "step": 4292 + }, + { + "epoch": 0.23031115879828326, + "grad_norm": 1.1045866012573242, + "learning_rate": 4.494349895642805e-06, + "loss": 2.2008, + "step": 4293 + }, + { + "epoch": 0.2303648068669528, + "grad_norm": 1.3333927392959595, + "learning_rate": 4.494087920642677e-06, + "loss": 2.3405, + "step": 4294 + }, + { + "epoch": 0.23041845493562232, + "grad_norm": 1.3146411180496216, + "learning_rate": 4.49382588543552e-06, + "loss": 2.1977, + "step": 4295 + }, + { + "epoch": 0.23047210300429186, + "grad_norm": 1.1399245262145996, + "learning_rate": 4.493563790029246e-06, + "loss": 2.1651, + "step": 4296 + }, + { + "epoch": 0.23052575107296136, + "grad_norm": 1.7175068855285645, + "learning_rate": 4.493301634431768e-06, + "loss": 2.4077, + "step": 4297 + }, + { + "epoch": 0.2305793991416309, + "grad_norm": 1.3462729454040527, + "learning_rate": 4.493039418651002e-06, + "loss": 2.1692, + "step": 4298 + }, + { + "epoch": 0.23063304721030042, + "grad_norm": 1.1895768642425537, + "learning_rate": 4.492777142694864e-06, + "loss": 2.4913, + "step": 4299 + }, + { + "epoch": 0.23068669527896996, + "grad_norm": 1.4366636276245117, + "learning_rate": 4.492514806571274e-06, + "loss": 2.2918, + "step": 4300 + }, + { + "epoch": 0.2307403433476395, + "grad_norm": 1.1893635988235474, + "learning_rate": 4.4922524102881506e-06, + "loss": 2.1577, + "step": 4301 + }, + { + "epoch": 0.23079399141630902, + "grad_norm": 1.1347347497940063, + "learning_rate": 4.491989953853419e-06, + "loss": 2.4243, + "step": 4302 + }, + { + "epoch": 0.23084763948497855, + "grad_norm": 1.7646615505218506, + "learning_rate": 4.491727437275002e-06, + "loss": 2.2572, + "step": 4303 + }, + { + "epoch": 0.23090128755364808, + "grad_norm": 1.369956612586975, + "learning_rate": 4.491464860560825e-06, + "loss": 2.3425, + "step": 4304 + }, + { + "epoch": 0.2309549356223176, + "grad_norm": 1.3222109079360962, + "learning_rate": 4.491202223718817e-06, + "loss": 2.044, + "step": 4305 + }, + { + "epoch": 0.23100858369098712, + "grad_norm": 1.1932661533355713, + "learning_rate": 4.490939526756908e-06, + "loss": 1.9169, + "step": 4306 + }, + { + "epoch": 0.23106223175965665, + "grad_norm": 1.085843563079834, + "learning_rate": 4.490676769683029e-06, + "loss": 2.2004, + "step": 4307 + }, + { + "epoch": 0.23111587982832618, + "grad_norm": 1.292127251625061, + "learning_rate": 4.490413952505113e-06, + "loss": 2.2762, + "step": 4308 + }, + { + "epoch": 0.2311695278969957, + "grad_norm": 1.2490174770355225, + "learning_rate": 4.4901510752310955e-06, + "loss": 2.3325, + "step": 4309 + }, + { + "epoch": 0.23122317596566525, + "grad_norm": 1.2808234691619873, + "learning_rate": 4.489888137868913e-06, + "loss": 2.4659, + "step": 4310 + }, + { + "epoch": 0.23127682403433478, + "grad_norm": 1.35246741771698, + "learning_rate": 4.489625140426506e-06, + "loss": 2.1575, + "step": 4311 + }, + { + "epoch": 0.23133047210300428, + "grad_norm": 1.3591811656951904, + "learning_rate": 4.489362082911813e-06, + "loss": 2.3124, + "step": 4312 + }, + { + "epoch": 0.2313841201716738, + "grad_norm": 1.6490336656570435, + "learning_rate": 4.4890989653327775e-06, + "loss": 2.2335, + "step": 4313 + }, + { + "epoch": 0.23143776824034334, + "grad_norm": 1.025185465812683, + "learning_rate": 4.488835787697344e-06, + "loss": 2.0727, + "step": 4314 + }, + { + "epoch": 0.23149141630901288, + "grad_norm": 1.6137906312942505, + "learning_rate": 4.488572550013459e-06, + "loss": 2.4679, + "step": 4315 + }, + { + "epoch": 0.2315450643776824, + "grad_norm": 1.5684468746185303, + "learning_rate": 4.488309252289068e-06, + "loss": 2.1716, + "step": 4316 + }, + { + "epoch": 0.23159871244635194, + "grad_norm": 1.582435131072998, + "learning_rate": 4.488045894532124e-06, + "loss": 2.1633, + "step": 4317 + }, + { + "epoch": 0.23165236051502147, + "grad_norm": 1.3677560091018677, + "learning_rate": 4.487782476750575e-06, + "loss": 2.1925, + "step": 4318 + }, + { + "epoch": 0.23170600858369098, + "grad_norm": 1.6632845401763916, + "learning_rate": 4.487518998952377e-06, + "loss": 2.1189, + "step": 4319 + }, + { + "epoch": 0.2317596566523605, + "grad_norm": 1.7089923620224, + "learning_rate": 4.487255461145484e-06, + "loss": 2.5497, + "step": 4320 + }, + { + "epoch": 0.23181330472103004, + "grad_norm": 1.6139802932739258, + "learning_rate": 4.486991863337854e-06, + "loss": 2.3029, + "step": 4321 + }, + { + "epoch": 0.23186695278969957, + "grad_norm": 1.3656519651412964, + "learning_rate": 4.4867282055374436e-06, + "loss": 2.2448, + "step": 4322 + }, + { + "epoch": 0.2319206008583691, + "grad_norm": 1.3967156410217285, + "learning_rate": 4.486464487752215e-06, + "loss": 2.3275, + "step": 4323 + }, + { + "epoch": 0.23197424892703863, + "grad_norm": 1.3365110158920288, + "learning_rate": 4.486200709990131e-06, + "loss": 2.4546, + "step": 4324 + }, + { + "epoch": 0.23202789699570817, + "grad_norm": 1.2156864404678345, + "learning_rate": 4.485936872259154e-06, + "loss": 2.2811, + "step": 4325 + }, + { + "epoch": 0.23208154506437767, + "grad_norm": 1.1513874530792236, + "learning_rate": 4.485672974567251e-06, + "loss": 2.3228, + "step": 4326 + }, + { + "epoch": 0.2321351931330472, + "grad_norm": 1.2503345012664795, + "learning_rate": 4.485409016922391e-06, + "loss": 2.2855, + "step": 4327 + }, + { + "epoch": 0.23218884120171673, + "grad_norm": 1.2678992748260498, + "learning_rate": 4.485144999332542e-06, + "loss": 1.8002, + "step": 4328 + }, + { + "epoch": 0.23224248927038627, + "grad_norm": 1.2359492778778076, + "learning_rate": 4.484880921805675e-06, + "loss": 2.4737, + "step": 4329 + }, + { + "epoch": 0.2322961373390558, + "grad_norm": 1.317336082458496, + "learning_rate": 4.484616784349764e-06, + "loss": 2.0856, + "step": 4330 + }, + { + "epoch": 0.23234978540772533, + "grad_norm": 1.2037147283554077, + "learning_rate": 4.484352586972785e-06, + "loss": 2.2583, + "step": 4331 + }, + { + "epoch": 0.23240343347639486, + "grad_norm": 1.4038766622543335, + "learning_rate": 4.484088329682713e-06, + "loss": 2.2584, + "step": 4332 + }, + { + "epoch": 0.23245708154506436, + "grad_norm": 1.3828651905059814, + "learning_rate": 4.483824012487528e-06, + "loss": 2.4642, + "step": 4333 + }, + { + "epoch": 0.2325107296137339, + "grad_norm": 1.3854117393493652, + "learning_rate": 4.48355963539521e-06, + "loss": 2.0562, + "step": 4334 + }, + { + "epoch": 0.23256437768240343, + "grad_norm": 1.2055209875106812, + "learning_rate": 4.483295198413742e-06, + "loss": 2.2791, + "step": 4335 + }, + { + "epoch": 0.23261802575107296, + "grad_norm": 2.0389962196350098, + "learning_rate": 4.483030701551106e-06, + "loss": 2.4317, + "step": 4336 + }, + { + "epoch": 0.2326716738197425, + "grad_norm": 1.344920039176941, + "learning_rate": 4.48276614481529e-06, + "loss": 2.3788, + "step": 4337 + }, + { + "epoch": 0.23272532188841202, + "grad_norm": 3.3896846771240234, + "learning_rate": 4.482501528214282e-06, + "loss": 2.3134, + "step": 4338 + }, + { + "epoch": 0.23277896995708156, + "grad_norm": 1.3181533813476562, + "learning_rate": 4.48223685175607e-06, + "loss": 2.2062, + "step": 4339 + }, + { + "epoch": 0.23283261802575106, + "grad_norm": 1.219758152961731, + "learning_rate": 4.481972115448645e-06, + "loss": 2.165, + "step": 4340 + }, + { + "epoch": 0.2328862660944206, + "grad_norm": 1.302364468574524, + "learning_rate": 4.481707319300002e-06, + "loss": 2.3643, + "step": 4341 + }, + { + "epoch": 0.23293991416309012, + "grad_norm": 1.1491588354110718, + "learning_rate": 4.481442463318134e-06, + "loss": 2.3329, + "step": 4342 + }, + { + "epoch": 0.23299356223175965, + "grad_norm": 1.1963340044021606, + "learning_rate": 4.481177547511039e-06, + "loss": 2.2786, + "step": 4343 + }, + { + "epoch": 0.2330472103004292, + "grad_norm": 1.243759274482727, + "learning_rate": 4.480912571886715e-06, + "loss": 1.8893, + "step": 4344 + }, + { + "epoch": 0.23310085836909872, + "grad_norm": 1.0996675491333008, + "learning_rate": 4.480647536453163e-06, + "loss": 1.9837, + "step": 4345 + }, + { + "epoch": 0.23315450643776825, + "grad_norm": 1.3433884382247925, + "learning_rate": 4.480382441218385e-06, + "loss": 2.3764, + "step": 4346 + }, + { + "epoch": 0.23320815450643778, + "grad_norm": 1.4097157716751099, + "learning_rate": 4.480117286190383e-06, + "loss": 2.2825, + "step": 4347 + }, + { + "epoch": 0.23326180257510729, + "grad_norm": 1.3240216970443726, + "learning_rate": 4.4798520713771655e-06, + "loss": 2.371, + "step": 4348 + }, + { + "epoch": 0.23331545064377682, + "grad_norm": 1.1156622171401978, + "learning_rate": 4.479586796786739e-06, + "loss": 2.1688, + "step": 4349 + }, + { + "epoch": 0.23336909871244635, + "grad_norm": 1.1675187349319458, + "learning_rate": 4.479321462427113e-06, + "loss": 2.3362, + "step": 4350 + }, + { + "epoch": 0.23342274678111588, + "grad_norm": 1.2747483253479004, + "learning_rate": 4.479056068306298e-06, + "loss": 2.4472, + "step": 4351 + }, + { + "epoch": 0.2334763948497854, + "grad_norm": 1.4827978610992432, + "learning_rate": 4.478790614432308e-06, + "loss": 2.6247, + "step": 4352 + }, + { + "epoch": 0.23353004291845494, + "grad_norm": 1.4637144804000854, + "learning_rate": 4.478525100813157e-06, + "loss": 2.1677, + "step": 4353 + }, + { + "epoch": 0.23358369098712448, + "grad_norm": 1.3094490766525269, + "learning_rate": 4.478259527456861e-06, + "loss": 2.29, + "step": 4354 + }, + { + "epoch": 0.23363733905579398, + "grad_norm": 2.6860947608947754, + "learning_rate": 4.477993894371441e-06, + "loss": 2.3369, + "step": 4355 + }, + { + "epoch": 0.2336909871244635, + "grad_norm": 1.3211926221847534, + "learning_rate": 4.477728201564914e-06, + "loss": 2.2245, + "step": 4356 + }, + { + "epoch": 0.23374463519313304, + "grad_norm": 1.1329327821731567, + "learning_rate": 4.477462449045304e-06, + "loss": 1.9609, + "step": 4357 + }, + { + "epoch": 0.23379828326180258, + "grad_norm": 1.2474039793014526, + "learning_rate": 4.4771966368206345e-06, + "loss": 2.3174, + "step": 4358 + }, + { + "epoch": 0.2338519313304721, + "grad_norm": 1.4257066249847412, + "learning_rate": 4.47693076489893e-06, + "loss": 2.3253, + "step": 4359 + }, + { + "epoch": 0.23390557939914164, + "grad_norm": 1.3322510719299316, + "learning_rate": 4.4766648332882196e-06, + "loss": 2.4728, + "step": 4360 + }, + { + "epoch": 0.23395922746781117, + "grad_norm": 1.4649618864059448, + "learning_rate": 4.476398841996531e-06, + "loss": 2.0959, + "step": 4361 + }, + { + "epoch": 0.23401287553648067, + "grad_norm": 1.4653068780899048, + "learning_rate": 4.476132791031896e-06, + "loss": 2.4589, + "step": 4362 + }, + { + "epoch": 0.2340665236051502, + "grad_norm": 1.5275148153305054, + "learning_rate": 4.475866680402348e-06, + "loss": 2.3053, + "step": 4363 + }, + { + "epoch": 0.23412017167381974, + "grad_norm": 1.2020846605300903, + "learning_rate": 4.47560051011592e-06, + "loss": 2.2285, + "step": 4364 + }, + { + "epoch": 0.23417381974248927, + "grad_norm": 1.1556189060211182, + "learning_rate": 4.475334280180651e-06, + "loss": 2.1854, + "step": 4365 + }, + { + "epoch": 0.2342274678111588, + "grad_norm": 1.5416728258132935, + "learning_rate": 4.475067990604576e-06, + "loss": 2.3941, + "step": 4366 + }, + { + "epoch": 0.23428111587982833, + "grad_norm": 1.3037092685699463, + "learning_rate": 4.4748016413957374e-06, + "loss": 2.6139, + "step": 4367 + }, + { + "epoch": 0.23433476394849787, + "grad_norm": 1.2425020933151245, + "learning_rate": 4.474535232562176e-06, + "loss": 2.1224, + "step": 4368 + }, + { + "epoch": 0.23438841201716737, + "grad_norm": 1.3612395524978638, + "learning_rate": 4.474268764111936e-06, + "loss": 2.2361, + "step": 4369 + }, + { + "epoch": 0.2344420600858369, + "grad_norm": 1.1600407361984253, + "learning_rate": 4.4740022360530615e-06, + "loss": 2.4195, + "step": 4370 + }, + { + "epoch": 0.23449570815450643, + "grad_norm": 1.028662919998169, + "learning_rate": 4.4737356483936e-06, + "loss": 2.2251, + "step": 4371 + }, + { + "epoch": 0.23454935622317596, + "grad_norm": 1.3009066581726074, + "learning_rate": 4.473469001141603e-06, + "loss": 2.3201, + "step": 4372 + }, + { + "epoch": 0.2346030042918455, + "grad_norm": 1.190610408782959, + "learning_rate": 4.473202294305118e-06, + "loss": 2.365, + "step": 4373 + }, + { + "epoch": 0.23465665236051503, + "grad_norm": 1.2243157625198364, + "learning_rate": 4.472935527892201e-06, + "loss": 2.2485, + "step": 4374 + }, + { + "epoch": 0.23471030042918456, + "grad_norm": 4.586282253265381, + "learning_rate": 4.472668701910903e-06, + "loss": 2.1639, + "step": 4375 + }, + { + "epoch": 0.23476394849785406, + "grad_norm": 1.1305216550827026, + "learning_rate": 4.472401816369281e-06, + "loss": 2.1462, + "step": 4376 + }, + { + "epoch": 0.2348175965665236, + "grad_norm": 1.3253856897354126, + "learning_rate": 4.472134871275396e-06, + "loss": 2.4035, + "step": 4377 + }, + { + "epoch": 0.23487124463519313, + "grad_norm": 1.3884618282318115, + "learning_rate": 4.471867866637304e-06, + "loss": 2.4239, + "step": 4378 + }, + { + "epoch": 0.23492489270386266, + "grad_norm": 1.4257464408874512, + "learning_rate": 4.4716008024630685e-06, + "loss": 2.3109, + "step": 4379 + }, + { + "epoch": 0.2349785407725322, + "grad_norm": 1.3328720331192017, + "learning_rate": 4.471333678760753e-06, + "loss": 2.372, + "step": 4380 + }, + { + "epoch": 0.23503218884120172, + "grad_norm": 1.2006334066390991, + "learning_rate": 4.471066495538422e-06, + "loss": 2.273, + "step": 4381 + }, + { + "epoch": 0.23508583690987125, + "grad_norm": 1.5074840784072876, + "learning_rate": 4.4707992528041425e-06, + "loss": 2.0787, + "step": 4382 + }, + { + "epoch": 0.23513948497854079, + "grad_norm": 1.2722506523132324, + "learning_rate": 4.470531950565984e-06, + "loss": 2.0986, + "step": 4383 + }, + { + "epoch": 0.2351931330472103, + "grad_norm": 3.313344717025757, + "learning_rate": 4.470264588832016e-06, + "loss": 2.1327, + "step": 4384 + }, + { + "epoch": 0.23524678111587982, + "grad_norm": 1.385342001914978, + "learning_rate": 4.469997167610312e-06, + "loss": 2.2611, + "step": 4385 + }, + { + "epoch": 0.23530042918454935, + "grad_norm": 1.1020311117172241, + "learning_rate": 4.469729686908946e-06, + "loss": 2.1116, + "step": 4386 + }, + { + "epoch": 0.23535407725321889, + "grad_norm": 1.9942830801010132, + "learning_rate": 4.4694621467359935e-06, + "loss": 2.24, + "step": 4387 + }, + { + "epoch": 0.23540772532188842, + "grad_norm": 1.301674723625183, + "learning_rate": 4.4691945470995324e-06, + "loss": 2.2243, + "step": 4388 + }, + { + "epoch": 0.23546137339055795, + "grad_norm": 1.2264516353607178, + "learning_rate": 4.468926888007643e-06, + "loss": 2.2468, + "step": 4389 + }, + { + "epoch": 0.23551502145922748, + "grad_norm": 1.7258033752441406, + "learning_rate": 4.468659169468405e-06, + "loss": 2.5116, + "step": 4390 + }, + { + "epoch": 0.23556866952789698, + "grad_norm": 1.3218050003051758, + "learning_rate": 4.468391391489904e-06, + "loss": 2.3141, + "step": 4391 + }, + { + "epoch": 0.23562231759656652, + "grad_norm": 1.3355748653411865, + "learning_rate": 4.468123554080222e-06, + "loss": 2.125, + "step": 4392 + }, + { + "epoch": 0.23567596566523605, + "grad_norm": 1.254394769668579, + "learning_rate": 4.4678556572474476e-06, + "loss": 2.437, + "step": 4393 + }, + { + "epoch": 0.23572961373390558, + "grad_norm": 1.5354855060577393, + "learning_rate": 4.46758770099967e-06, + "loss": 2.2614, + "step": 4394 + }, + { + "epoch": 0.2357832618025751, + "grad_norm": 1.289841890335083, + "learning_rate": 4.4673196853449784e-06, + "loss": 2.0953, + "step": 4395 + }, + { + "epoch": 0.23583690987124464, + "grad_norm": 1.4153512716293335, + "learning_rate": 4.4670516102914644e-06, + "loss": 2.2044, + "step": 4396 + }, + { + "epoch": 0.23589055793991417, + "grad_norm": 5.185482025146484, + "learning_rate": 4.466783475847223e-06, + "loss": 1.9168, + "step": 4397 + }, + { + "epoch": 0.23594420600858368, + "grad_norm": 1.2395782470703125, + "learning_rate": 4.46651528202035e-06, + "loss": 2.2345, + "step": 4398 + }, + { + "epoch": 0.2359978540772532, + "grad_norm": 1.4227306842803955, + "learning_rate": 4.4662470288189416e-06, + "loss": 2.2456, + "step": 4399 + }, + { + "epoch": 0.23605150214592274, + "grad_norm": 1.7872899770736694, + "learning_rate": 4.465978716251099e-06, + "loss": 2.3803, + "step": 4400 + }, + { + "epoch": 0.23610515021459227, + "grad_norm": 1.2728022336959839, + "learning_rate": 4.465710344324922e-06, + "loss": 2.2034, + "step": 4401 + }, + { + "epoch": 0.2361587982832618, + "grad_norm": 1.3185375928878784, + "learning_rate": 4.465441913048514e-06, + "loss": 2.3164, + "step": 4402 + }, + { + "epoch": 0.23621244635193134, + "grad_norm": 1.0915955305099487, + "learning_rate": 4.465173422429979e-06, + "loss": 2.0263, + "step": 4403 + }, + { + "epoch": 0.23626609442060087, + "grad_norm": 1.2610116004943848, + "learning_rate": 4.464904872477424e-06, + "loss": 2.3237, + "step": 4404 + }, + { + "epoch": 0.23631974248927037, + "grad_norm": 1.2174705266952515, + "learning_rate": 4.464636263198958e-06, + "loss": 2.3939, + "step": 4405 + }, + { + "epoch": 0.2363733905579399, + "grad_norm": 1.0720505714416504, + "learning_rate": 4.46436759460269e-06, + "loss": 2.0931, + "step": 4406 + }, + { + "epoch": 0.23642703862660944, + "grad_norm": 1.1911568641662598, + "learning_rate": 4.464098866696731e-06, + "loss": 2.1718, + "step": 4407 + }, + { + "epoch": 0.23648068669527897, + "grad_norm": 1.3971198797225952, + "learning_rate": 4.463830079489196e-06, + "loss": 2.3897, + "step": 4408 + }, + { + "epoch": 0.2365343347639485, + "grad_norm": 1.2622727155685425, + "learning_rate": 4.463561232988202e-06, + "loss": 2.3187, + "step": 4409 + }, + { + "epoch": 0.23658798283261803, + "grad_norm": 1.3098440170288086, + "learning_rate": 4.463292327201862e-06, + "loss": 2.1166, + "step": 4410 + }, + { + "epoch": 0.23664163090128756, + "grad_norm": 2.8953349590301514, + "learning_rate": 4.463023362138299e-06, + "loss": 2.2667, + "step": 4411 + }, + { + "epoch": 0.23669527896995707, + "grad_norm": 1.3173998594284058, + "learning_rate": 4.462754337805632e-06, + "loss": 2.2823, + "step": 4412 + }, + { + "epoch": 0.2367489270386266, + "grad_norm": 0.999905526638031, + "learning_rate": 4.462485254211984e-06, + "loss": 2.1859, + "step": 4413 + }, + { + "epoch": 0.23680257510729613, + "grad_norm": 1.131600260734558, + "learning_rate": 4.4622161113654785e-06, + "loss": 1.5712, + "step": 4414 + }, + { + "epoch": 0.23685622317596566, + "grad_norm": 1.4501051902770996, + "learning_rate": 4.461946909274243e-06, + "loss": 2.2202, + "step": 4415 + }, + { + "epoch": 0.2369098712446352, + "grad_norm": 1.3024439811706543, + "learning_rate": 4.461677647946404e-06, + "loss": 2.3345, + "step": 4416 + }, + { + "epoch": 0.23696351931330473, + "grad_norm": 1.052730917930603, + "learning_rate": 4.461408327390093e-06, + "loss": 1.9873, + "step": 4417 + }, + { + "epoch": 0.23701716738197426, + "grad_norm": 1.3660541772842407, + "learning_rate": 4.46113894761344e-06, + "loss": 2.4109, + "step": 4418 + }, + { + "epoch": 0.2370708154506438, + "grad_norm": 1.177076816558838, + "learning_rate": 4.460869508624579e-06, + "loss": 2.2619, + "step": 4419 + }, + { + "epoch": 0.2371244635193133, + "grad_norm": 2.34683895111084, + "learning_rate": 4.4606000104316445e-06, + "loss": 1.8681, + "step": 4420 + }, + { + "epoch": 0.23717811158798283, + "grad_norm": 1.3673999309539795, + "learning_rate": 4.460330453042775e-06, + "loss": 2.286, + "step": 4421 + }, + { + "epoch": 0.23723175965665236, + "grad_norm": 1.1394463777542114, + "learning_rate": 4.4600608364661075e-06, + "loss": 2.2922, + "step": 4422 + }, + { + "epoch": 0.2372854077253219, + "grad_norm": 1.2011947631835938, + "learning_rate": 4.459791160709783e-06, + "loss": 2.0132, + "step": 4423 + }, + { + "epoch": 0.23733905579399142, + "grad_norm": 1.259818434715271, + "learning_rate": 4.459521425781944e-06, + "loss": 2.5137, + "step": 4424 + }, + { + "epoch": 0.23739270386266095, + "grad_norm": 1.3820463418960571, + "learning_rate": 4.459251631690734e-06, + "loss": 2.3213, + "step": 4425 + }, + { + "epoch": 0.23744635193133048, + "grad_norm": 1.5029082298278809, + "learning_rate": 4.4589817784443e-06, + "loss": 2.2627, + "step": 4426 + }, + { + "epoch": 0.2375, + "grad_norm": 1.3817216157913208, + "learning_rate": 4.458711866050788e-06, + "loss": 2.3783, + "step": 4427 + }, + { + "epoch": 0.23755364806866952, + "grad_norm": 1.3477693796157837, + "learning_rate": 4.458441894518349e-06, + "loss": 2.2988, + "step": 4428 + }, + { + "epoch": 0.23760729613733905, + "grad_norm": 1.3004342317581177, + "learning_rate": 4.458171863855132e-06, + "loss": 2.1336, + "step": 4429 + }, + { + "epoch": 0.23766094420600858, + "grad_norm": 1.254935622215271, + "learning_rate": 4.457901774069292e-06, + "loss": 2.1327, + "step": 4430 + }, + { + "epoch": 0.23771459227467812, + "grad_norm": 1.3051347732543945, + "learning_rate": 4.457631625168984e-06, + "loss": 2.1168, + "step": 4431 + }, + { + "epoch": 0.23776824034334765, + "grad_norm": 1.1955232620239258, + "learning_rate": 4.457361417162363e-06, + "loss": 2.2607, + "step": 4432 + }, + { + "epoch": 0.23782188841201718, + "grad_norm": 1.2513070106506348, + "learning_rate": 4.4570911500575884e-06, + "loss": 2.3601, + "step": 4433 + }, + { + "epoch": 0.23787553648068668, + "grad_norm": 5.513226509094238, + "learning_rate": 4.45682082386282e-06, + "loss": 1.9382, + "step": 4434 + }, + { + "epoch": 0.23792918454935622, + "grad_norm": 4.670441150665283, + "learning_rate": 4.456550438586219e-06, + "loss": 2.1586, + "step": 4435 + }, + { + "epoch": 0.23798283261802575, + "grad_norm": 1.3263832330703735, + "learning_rate": 4.4562799942359496e-06, + "loss": 2.4818, + "step": 4436 + }, + { + "epoch": 0.23803648068669528, + "grad_norm": 1.0644826889038086, + "learning_rate": 4.4560094908201774e-06, + "loss": 2.0589, + "step": 4437 + }, + { + "epoch": 0.2380901287553648, + "grad_norm": 1.24978768825531, + "learning_rate": 4.45573892834707e-06, + "loss": 2.4433, + "step": 4438 + }, + { + "epoch": 0.23814377682403434, + "grad_norm": 1.2844618558883667, + "learning_rate": 4.455468306824796e-06, + "loss": 2.373, + "step": 4439 + }, + { + "epoch": 0.23819742489270387, + "grad_norm": 1.337622046470642, + "learning_rate": 4.455197626261526e-06, + "loss": 2.1959, + "step": 4440 + }, + { + "epoch": 0.23825107296137338, + "grad_norm": 1.2497349977493286, + "learning_rate": 4.454926886665433e-06, + "loss": 2.4419, + "step": 4441 + }, + { + "epoch": 0.2383047210300429, + "grad_norm": 1.2595387697219849, + "learning_rate": 4.45465608804469e-06, + "loss": 1.76, + "step": 4442 + }, + { + "epoch": 0.23835836909871244, + "grad_norm": 1.1460535526275635, + "learning_rate": 4.4543852304074754e-06, + "loss": 2.2597, + "step": 4443 + }, + { + "epoch": 0.23841201716738197, + "grad_norm": 1.2883442640304565, + "learning_rate": 4.454114313761967e-06, + "loss": 2.1038, + "step": 4444 + }, + { + "epoch": 0.2384656652360515, + "grad_norm": 2.1312105655670166, + "learning_rate": 4.453843338116342e-06, + "loss": 2.2773, + "step": 4445 + }, + { + "epoch": 0.23851931330472104, + "grad_norm": 2.400259256362915, + "learning_rate": 4.4535723034787846e-06, + "loss": 2.1665, + "step": 4446 + }, + { + "epoch": 0.23857296137339057, + "grad_norm": 1.160089373588562, + "learning_rate": 4.453301209857477e-06, + "loss": 2.3976, + "step": 4447 + }, + { + "epoch": 0.23862660944206007, + "grad_norm": 1.2763224840164185, + "learning_rate": 4.453030057260604e-06, + "loss": 2.2842, + "step": 4448 + }, + { + "epoch": 0.2386802575107296, + "grad_norm": 1.2928688526153564, + "learning_rate": 4.452758845696352e-06, + "loss": 1.8337, + "step": 4449 + }, + { + "epoch": 0.23873390557939914, + "grad_norm": 1.3449517488479614, + "learning_rate": 4.452487575172912e-06, + "loss": 2.2574, + "step": 4450 + }, + { + "epoch": 0.23878755364806867, + "grad_norm": 1.2839900255203247, + "learning_rate": 4.452216245698472e-06, + "loss": 2.1149, + "step": 4451 + }, + { + "epoch": 0.2388412017167382, + "grad_norm": 1.1692938804626465, + "learning_rate": 4.451944857281224e-06, + "loss": 2.6247, + "step": 4452 + }, + { + "epoch": 0.23889484978540773, + "grad_norm": 1.3871676921844482, + "learning_rate": 4.451673409929364e-06, + "loss": 2.7501, + "step": 4453 + }, + { + "epoch": 0.23894849785407726, + "grad_norm": 1.3463504314422607, + "learning_rate": 4.451401903651086e-06, + "loss": 2.2473, + "step": 4454 + }, + { + "epoch": 0.2390021459227468, + "grad_norm": 1.229349970817566, + "learning_rate": 4.451130338454589e-06, + "loss": 2.4, + "step": 4455 + }, + { + "epoch": 0.2390557939914163, + "grad_norm": 1.0824062824249268, + "learning_rate": 4.450858714348071e-06, + "loss": 2.2561, + "step": 4456 + }, + { + "epoch": 0.23910944206008583, + "grad_norm": 1.3383007049560547, + "learning_rate": 4.450587031339733e-06, + "loss": 2.2438, + "step": 4457 + }, + { + "epoch": 0.23916309012875536, + "grad_norm": 1.2562716007232666, + "learning_rate": 4.45031528943778e-06, + "loss": 2.1992, + "step": 4458 + }, + { + "epoch": 0.2392167381974249, + "grad_norm": 1.37105393409729, + "learning_rate": 4.450043488650414e-06, + "loss": 2.2005, + "step": 4459 + }, + { + "epoch": 0.23927038626609443, + "grad_norm": 1.1020804643630981, + "learning_rate": 4.449771628985843e-06, + "loss": 2.3056, + "step": 4460 + }, + { + "epoch": 0.23932403433476396, + "grad_norm": 1.3139532804489136, + "learning_rate": 4.4494997104522744e-06, + "loss": 2.295, + "step": 4461 + }, + { + "epoch": 0.2393776824034335, + "grad_norm": 2.4014010429382324, + "learning_rate": 4.449227733057918e-06, + "loss": 2.2242, + "step": 4462 + }, + { + "epoch": 0.239431330472103, + "grad_norm": 1.3368598222732544, + "learning_rate": 4.448955696810986e-06, + "loss": 2.2216, + "step": 4463 + }, + { + "epoch": 0.23948497854077253, + "grad_norm": 1.3003777265548706, + "learning_rate": 4.448683601719693e-06, + "loss": 2.1007, + "step": 4464 + }, + { + "epoch": 0.23953862660944206, + "grad_norm": 1.6509233713150024, + "learning_rate": 4.448411447792252e-06, + "loss": 2.4757, + "step": 4465 + }, + { + "epoch": 0.2395922746781116, + "grad_norm": 1.3373866081237793, + "learning_rate": 4.448139235036882e-06, + "loss": 2.3199, + "step": 4466 + }, + { + "epoch": 0.23964592274678112, + "grad_norm": 1.5324915647506714, + "learning_rate": 4.4478669634618014e-06, + "loss": 2.2907, + "step": 4467 + }, + { + "epoch": 0.23969957081545065, + "grad_norm": 1.2022651433944702, + "learning_rate": 4.447594633075231e-06, + "loss": 2.2585, + "step": 4468 + }, + { + "epoch": 0.23975321888412018, + "grad_norm": 1.0487884283065796, + "learning_rate": 4.447322243885392e-06, + "loss": 2.301, + "step": 4469 + }, + { + "epoch": 0.2398068669527897, + "grad_norm": 1.202965259552002, + "learning_rate": 4.447049795900509e-06, + "loss": 2.4235, + "step": 4470 + }, + { + "epoch": 0.23986051502145922, + "grad_norm": 1.6093404293060303, + "learning_rate": 4.446777289128809e-06, + "loss": 2.1577, + "step": 4471 + }, + { + "epoch": 0.23991416309012875, + "grad_norm": 1.2030073404312134, + "learning_rate": 4.446504723578519e-06, + "loss": 2.2348, + "step": 4472 + }, + { + "epoch": 0.23996781115879828, + "grad_norm": 1.235249400138855, + "learning_rate": 4.4462320992578686e-06, + "loss": 2.0572, + "step": 4473 + }, + { + "epoch": 0.24002145922746781, + "grad_norm": 2.1595802307128906, + "learning_rate": 4.445959416175089e-06, + "loss": 2.4489, + "step": 4474 + }, + { + "epoch": 0.24007510729613735, + "grad_norm": 1.0707886219024658, + "learning_rate": 4.445686674338414e-06, + "loss": 1.933, + "step": 4475 + }, + { + "epoch": 0.24012875536480688, + "grad_norm": 1.4789921045303345, + "learning_rate": 4.445413873756077e-06, + "loss": 2.431, + "step": 4476 + }, + { + "epoch": 0.24018240343347638, + "grad_norm": 1.1800998449325562, + "learning_rate": 4.445141014436315e-06, + "loss": 1.9389, + "step": 4477 + }, + { + "epoch": 0.24023605150214591, + "grad_norm": 1.6010503768920898, + "learning_rate": 4.4448680963873675e-06, + "loss": 2.1769, + "step": 4478 + }, + { + "epoch": 0.24028969957081545, + "grad_norm": 1.322901964187622, + "learning_rate": 4.444595119617474e-06, + "loss": 2.2427, + "step": 4479 + }, + { + "epoch": 0.24034334763948498, + "grad_norm": 1.3009332418441772, + "learning_rate": 4.444322084134876e-06, + "loss": 2.3964, + "step": 4480 + }, + { + "epoch": 0.2403969957081545, + "grad_norm": 1.313167929649353, + "learning_rate": 4.444048989947817e-06, + "loss": 2.3602, + "step": 4481 + }, + { + "epoch": 0.24045064377682404, + "grad_norm": 1.351949691772461, + "learning_rate": 4.443775837064544e-06, + "loss": 2.2882, + "step": 4482 + }, + { + "epoch": 0.24050429184549357, + "grad_norm": 0.9932847619056702, + "learning_rate": 4.443502625493302e-06, + "loss": 2.2312, + "step": 4483 + }, + { + "epoch": 0.24055793991416308, + "grad_norm": 1.1095845699310303, + "learning_rate": 4.443229355242342e-06, + "loss": 1.9933, + "step": 4484 + }, + { + "epoch": 0.2406115879828326, + "grad_norm": 1.3095425367355347, + "learning_rate": 4.442956026319914e-06, + "loss": 2.4814, + "step": 4485 + }, + { + "epoch": 0.24066523605150214, + "grad_norm": 1.4396170377731323, + "learning_rate": 4.4426826387342714e-06, + "loss": 2.4941, + "step": 4486 + }, + { + "epoch": 0.24071888412017167, + "grad_norm": 1.1779533624649048, + "learning_rate": 4.442409192493667e-06, + "loss": 2.1354, + "step": 4487 + }, + { + "epoch": 0.2407725321888412, + "grad_norm": 1.4943889379501343, + "learning_rate": 4.442135687606357e-06, + "loss": 2.224, + "step": 4488 + }, + { + "epoch": 0.24082618025751074, + "grad_norm": 1.1185665130615234, + "learning_rate": 4.4418621240806e-06, + "loss": 2.4317, + "step": 4489 + }, + { + "epoch": 0.24087982832618027, + "grad_norm": 3.1897494792938232, + "learning_rate": 4.441588501924656e-06, + "loss": 2.2963, + "step": 4490 + }, + { + "epoch": 0.2409334763948498, + "grad_norm": 1.219915509223938, + "learning_rate": 4.441314821146786e-06, + "loss": 2.3182, + "step": 4491 + }, + { + "epoch": 0.2409871244635193, + "grad_norm": 1.3797706365585327, + "learning_rate": 4.441041081755253e-06, + "loss": 2.1294, + "step": 4492 + }, + { + "epoch": 0.24104077253218884, + "grad_norm": 1.5421326160430908, + "learning_rate": 4.440767283758322e-06, + "loss": 2.1266, + "step": 4493 + }, + { + "epoch": 0.24109442060085837, + "grad_norm": 1.2671245336532593, + "learning_rate": 4.44049342716426e-06, + "loss": 1.9543, + "step": 4494 + }, + { + "epoch": 0.2411480686695279, + "grad_norm": 1.2562769651412964, + "learning_rate": 4.4402195119813355e-06, + "loss": 2.3559, + "step": 4495 + }, + { + "epoch": 0.24120171673819743, + "grad_norm": 1.2040386199951172, + "learning_rate": 4.439945538217818e-06, + "loss": 2.1811, + "step": 4496 + }, + { + "epoch": 0.24125536480686696, + "grad_norm": 1.549121618270874, + "learning_rate": 4.439671505881979e-06, + "loss": 2.1909, + "step": 4497 + }, + { + "epoch": 0.2413090128755365, + "grad_norm": 1.91903817653656, + "learning_rate": 4.439397414982095e-06, + "loss": 2.46, + "step": 4498 + }, + { + "epoch": 0.241362660944206, + "grad_norm": 1.3668568134307861, + "learning_rate": 4.439123265526439e-06, + "loss": 2.2894, + "step": 4499 + }, + { + "epoch": 0.24141630901287553, + "grad_norm": 1.8568788766860962, + "learning_rate": 4.43884905752329e-06, + "loss": 2.149, + "step": 4500 + }, + { + "epoch": 0.24146995708154506, + "grad_norm": 1.31496262550354, + "learning_rate": 4.438574790980926e-06, + "loss": 2.36, + "step": 4501 + }, + { + "epoch": 0.2415236051502146, + "grad_norm": 1.4341189861297607, + "learning_rate": 4.438300465907628e-06, + "loss": 2.3401, + "step": 4502 + }, + { + "epoch": 0.24157725321888412, + "grad_norm": 1.259193778038025, + "learning_rate": 4.438026082311679e-06, + "loss": 2.0051, + "step": 4503 + }, + { + "epoch": 0.24163090128755366, + "grad_norm": 1.3366167545318604, + "learning_rate": 4.4377516402013635e-06, + "loss": 2.4982, + "step": 4504 + }, + { + "epoch": 0.2416845493562232, + "grad_norm": 1.426495909690857, + "learning_rate": 4.437477139584967e-06, + "loss": 2.311, + "step": 4505 + }, + { + "epoch": 0.2417381974248927, + "grad_norm": 1.4083887338638306, + "learning_rate": 4.437202580470778e-06, + "loss": 2.3344, + "step": 4506 + }, + { + "epoch": 0.24179184549356222, + "grad_norm": 2.990960121154785, + "learning_rate": 4.436927962867086e-06, + "loss": 2.3491, + "step": 4507 + }, + { + "epoch": 0.24184549356223176, + "grad_norm": 1.2106231451034546, + "learning_rate": 4.436653286782182e-06, + "loss": 2.1099, + "step": 4508 + }, + { + "epoch": 0.2418991416309013, + "grad_norm": 1.4857693910598755, + "learning_rate": 4.436378552224359e-06, + "loss": 2.2296, + "step": 4509 + }, + { + "epoch": 0.24195278969957082, + "grad_norm": 1.2006309032440186, + "learning_rate": 4.436103759201914e-06, + "loss": 2.3279, + "step": 4510 + }, + { + "epoch": 0.24200643776824035, + "grad_norm": 1.4846787452697754, + "learning_rate": 4.435828907723143e-06, + "loss": 2.656, + "step": 4511 + }, + { + "epoch": 0.24206008583690988, + "grad_norm": 1.3082877397537231, + "learning_rate": 4.435553997796343e-06, + "loss": 2.5256, + "step": 4512 + }, + { + "epoch": 0.2421137339055794, + "grad_norm": 1.1711370944976807, + "learning_rate": 4.435279029429816e-06, + "loss": 2.3154, + "step": 4513 + }, + { + "epoch": 0.24216738197424892, + "grad_norm": 1.2798247337341309, + "learning_rate": 4.435004002631863e-06, + "loss": 2.2007, + "step": 4514 + }, + { + "epoch": 0.24222103004291845, + "grad_norm": 1.2884875535964966, + "learning_rate": 4.434728917410788e-06, + "loss": 2.239, + "step": 4515 + }, + { + "epoch": 0.24227467811158798, + "grad_norm": 1.2101253271102905, + "learning_rate": 4.434453773774897e-06, + "loss": 2.3283, + "step": 4516 + }, + { + "epoch": 0.24232832618025751, + "grad_norm": 1.259056568145752, + "learning_rate": 4.4341785717324964e-06, + "loss": 2.2467, + "step": 4517 + }, + { + "epoch": 0.24238197424892705, + "grad_norm": 1.1883078813552856, + "learning_rate": 4.433903311291897e-06, + "loss": 2.2291, + "step": 4518 + }, + { + "epoch": 0.24243562231759658, + "grad_norm": 1.4221304655075073, + "learning_rate": 4.433627992461408e-06, + "loss": 2.2965, + "step": 4519 + }, + { + "epoch": 0.24248927038626608, + "grad_norm": 1.4277164936065674, + "learning_rate": 4.433352615249343e-06, + "loss": 2.1561, + "step": 4520 + }, + { + "epoch": 0.2425429184549356, + "grad_norm": 1.2651458978652954, + "learning_rate": 4.433077179664016e-06, + "loss": 2.1477, + "step": 4521 + }, + { + "epoch": 0.24259656652360514, + "grad_norm": 1.084426760673523, + "learning_rate": 4.432801685713743e-06, + "loss": 2.1495, + "step": 4522 + }, + { + "epoch": 0.24265021459227468, + "grad_norm": 1.350829839706421, + "learning_rate": 4.432526133406843e-06, + "loss": 2.3291, + "step": 4523 + }, + { + "epoch": 0.2427038626609442, + "grad_norm": 1.2623040676116943, + "learning_rate": 4.432250522751634e-06, + "loss": 2.2698, + "step": 4524 + }, + { + "epoch": 0.24275751072961374, + "grad_norm": 1.2594927549362183, + "learning_rate": 4.431974853756439e-06, + "loss": 2.0316, + "step": 4525 + }, + { + "epoch": 0.24281115879828327, + "grad_norm": 2.7029483318328857, + "learning_rate": 4.43169912642958e-06, + "loss": 2.3451, + "step": 4526 + }, + { + "epoch": 0.24286480686695278, + "grad_norm": 1.1853396892547607, + "learning_rate": 4.431423340779383e-06, + "loss": 2.1373, + "step": 4527 + }, + { + "epoch": 0.2429184549356223, + "grad_norm": 13.509042739868164, + "learning_rate": 4.431147496814174e-06, + "loss": 2.3131, + "step": 4528 + }, + { + "epoch": 0.24297210300429184, + "grad_norm": 1.1108297109603882, + "learning_rate": 4.430871594542282e-06, + "loss": 2.19, + "step": 4529 + }, + { + "epoch": 0.24302575107296137, + "grad_norm": 1.2682673931121826, + "learning_rate": 4.4305956339720365e-06, + "loss": 1.9399, + "step": 4530 + }, + { + "epoch": 0.2430793991416309, + "grad_norm": 1.4001171588897705, + "learning_rate": 4.430319615111771e-06, + "loss": 2.3747, + "step": 4531 + }, + { + "epoch": 0.24313304721030043, + "grad_norm": 1.026219367980957, + "learning_rate": 4.430043537969818e-06, + "loss": 1.9686, + "step": 4532 + }, + { + "epoch": 0.24318669527896997, + "grad_norm": 1.3281149864196777, + "learning_rate": 4.429767402554512e-06, + "loss": 2.3697, + "step": 4533 + }, + { + "epoch": 0.2432403433476395, + "grad_norm": 1.5367693901062012, + "learning_rate": 4.429491208874192e-06, + "loss": 2.1622, + "step": 4534 + }, + { + "epoch": 0.243293991416309, + "grad_norm": 1.443192720413208, + "learning_rate": 4.429214956937197e-06, + "loss": 2.2481, + "step": 4535 + }, + { + "epoch": 0.24334763948497853, + "grad_norm": 1.2638553380966187, + "learning_rate": 4.4289386467518695e-06, + "loss": 2.3543, + "step": 4536 + }, + { + "epoch": 0.24340128755364807, + "grad_norm": 1.6390571594238281, + "learning_rate": 4.4286622783265475e-06, + "loss": 2.3619, + "step": 4537 + }, + { + "epoch": 0.2434549356223176, + "grad_norm": 1.1614885330200195, + "learning_rate": 4.428385851669579e-06, + "loss": 2.2292, + "step": 4538 + }, + { + "epoch": 0.24350858369098713, + "grad_norm": 1.8747397661209106, + "learning_rate": 4.428109366789308e-06, + "loss": 2.2066, + "step": 4539 + }, + { + "epoch": 0.24356223175965666, + "grad_norm": 1.3619917631149292, + "learning_rate": 4.427832823694084e-06, + "loss": 2.4707, + "step": 4540 + }, + { + "epoch": 0.2436158798283262, + "grad_norm": 1.32420814037323, + "learning_rate": 4.4275562223922554e-06, + "loss": 2.3491, + "step": 4541 + }, + { + "epoch": 0.2436695278969957, + "grad_norm": 1.4588814973831177, + "learning_rate": 4.427279562892175e-06, + "loss": 2.0129, + "step": 4542 + }, + { + "epoch": 0.24372317596566523, + "grad_norm": 1.2479084730148315, + "learning_rate": 4.427002845202194e-06, + "loss": 2.2295, + "step": 4543 + }, + { + "epoch": 0.24377682403433476, + "grad_norm": 1.2809967994689941, + "learning_rate": 4.426726069330669e-06, + "loss": 2.5221, + "step": 4544 + }, + { + "epoch": 0.2438304721030043, + "grad_norm": 1.3349099159240723, + "learning_rate": 4.426449235285955e-06, + "loss": 2.344, + "step": 4545 + }, + { + "epoch": 0.24388412017167382, + "grad_norm": 2.370455503463745, + "learning_rate": 4.426172343076411e-06, + "loss": 2.3652, + "step": 4546 + }, + { + "epoch": 0.24393776824034336, + "grad_norm": 1.2130486965179443, + "learning_rate": 4.4258953927103986e-06, + "loss": 2.4758, + "step": 4547 + }, + { + "epoch": 0.2439914163090129, + "grad_norm": 1.3224835395812988, + "learning_rate": 4.425618384196278e-06, + "loss": 2.1518, + "step": 4548 + }, + { + "epoch": 0.2440450643776824, + "grad_norm": 1.318089485168457, + "learning_rate": 4.425341317542413e-06, + "loss": 2.3941, + "step": 4549 + }, + { + "epoch": 0.24409871244635192, + "grad_norm": 1.3781025409698486, + "learning_rate": 4.4250641927571694e-06, + "loss": 2.2321, + "step": 4550 + }, + { + "epoch": 0.24415236051502145, + "grad_norm": 1.340971827507019, + "learning_rate": 4.424787009848915e-06, + "loss": 2.397, + "step": 4551 + }, + { + "epoch": 0.244206008583691, + "grad_norm": 1.2438377141952515, + "learning_rate": 4.424509768826018e-06, + "loss": 2.0433, + "step": 4552 + }, + { + "epoch": 0.24425965665236052, + "grad_norm": 1.366862177848816, + "learning_rate": 4.424232469696849e-06, + "loss": 2.5268, + "step": 4553 + }, + { + "epoch": 0.24431330472103005, + "grad_norm": 1.2236778736114502, + "learning_rate": 4.42395511246978e-06, + "loss": 2.1351, + "step": 4554 + }, + { + "epoch": 0.24436695278969958, + "grad_norm": 1.2367098331451416, + "learning_rate": 4.4236776971531855e-06, + "loss": 2.386, + "step": 4555 + }, + { + "epoch": 0.24442060085836909, + "grad_norm": 1.3691462278366089, + "learning_rate": 4.423400223755442e-06, + "loss": 2.2196, + "step": 4556 + }, + { + "epoch": 0.24447424892703862, + "grad_norm": 2.451730728149414, + "learning_rate": 4.423122692284927e-06, + "loss": 2.0526, + "step": 4557 + }, + { + "epoch": 0.24452789699570815, + "grad_norm": 1.3873506784439087, + "learning_rate": 4.4228451027500195e-06, + "loss": 2.0436, + "step": 4558 + }, + { + "epoch": 0.24458154506437768, + "grad_norm": 1.315555214881897, + "learning_rate": 4.422567455159102e-06, + "loss": 2.3298, + "step": 4559 + }, + { + "epoch": 0.2446351931330472, + "grad_norm": 1.2035634517669678, + "learning_rate": 4.422289749520555e-06, + "loss": 2.1559, + "step": 4560 + }, + { + "epoch": 0.24468884120171674, + "grad_norm": 1.6766892671585083, + "learning_rate": 4.4220119858427655e-06, + "loss": 2.0134, + "step": 4561 + }, + { + "epoch": 0.24474248927038628, + "grad_norm": 1.4351340532302856, + "learning_rate": 4.4217341641341186e-06, + "loss": 2.0823, + "step": 4562 + }, + { + "epoch": 0.24479613733905578, + "grad_norm": 1.620651125907898, + "learning_rate": 4.421456284403003e-06, + "loss": 2.0864, + "step": 4563 + }, + { + "epoch": 0.2448497854077253, + "grad_norm": 1.7919038534164429, + "learning_rate": 4.421178346657809e-06, + "loss": 2.3626, + "step": 4564 + }, + { + "epoch": 0.24490343347639484, + "grad_norm": 1.7062321901321411, + "learning_rate": 4.4209003509069284e-06, + "loss": 2.2628, + "step": 4565 + }, + { + "epoch": 0.24495708154506438, + "grad_norm": 1.3591548204421997, + "learning_rate": 4.420622297158753e-06, + "loss": 2.2979, + "step": 4566 + }, + { + "epoch": 0.2450107296137339, + "grad_norm": 1.263174057006836, + "learning_rate": 4.42034418542168e-06, + "loss": 2.2619, + "step": 4567 + }, + { + "epoch": 0.24506437768240344, + "grad_norm": 1.3969866037368774, + "learning_rate": 4.420066015704105e-06, + "loss": 2.3592, + "step": 4568 + }, + { + "epoch": 0.24511802575107297, + "grad_norm": 1.2820513248443604, + "learning_rate": 4.419787788014428e-06, + "loss": 2.2871, + "step": 4569 + }, + { + "epoch": 0.2451716738197425, + "grad_norm": 1.3511970043182373, + "learning_rate": 4.419509502361049e-06, + "loss": 2.4405, + "step": 4570 + }, + { + "epoch": 0.245225321888412, + "grad_norm": 1.2979071140289307, + "learning_rate": 4.419231158752369e-06, + "loss": 2.3302, + "step": 4571 + }, + { + "epoch": 0.24527896995708154, + "grad_norm": 1.3349720239639282, + "learning_rate": 4.418952757196794e-06, + "loss": 2.341, + "step": 4572 + }, + { + "epoch": 0.24533261802575107, + "grad_norm": 1.2666716575622559, + "learning_rate": 4.418674297702728e-06, + "loss": 2.3219, + "step": 4573 + }, + { + "epoch": 0.2453862660944206, + "grad_norm": 2.173933267593384, + "learning_rate": 4.418395780278579e-06, + "loss": 2.3568, + "step": 4574 + }, + { + "epoch": 0.24543991416309013, + "grad_norm": 1.1442227363586426, + "learning_rate": 4.418117204932757e-06, + "loss": 2.2588, + "step": 4575 + }, + { + "epoch": 0.24549356223175967, + "grad_norm": 1.2916347980499268, + "learning_rate": 4.417838571673671e-06, + "loss": 2.603, + "step": 4576 + }, + { + "epoch": 0.2455472103004292, + "grad_norm": 1.3924412727355957, + "learning_rate": 4.417559880509736e-06, + "loss": 2.4006, + "step": 4577 + }, + { + "epoch": 0.2456008583690987, + "grad_norm": 1.260448932647705, + "learning_rate": 4.417281131449366e-06, + "loss": 2.1945, + "step": 4578 + }, + { + "epoch": 0.24565450643776823, + "grad_norm": 1.2147328853607178, + "learning_rate": 4.417002324500976e-06, + "loss": 2.4357, + "step": 4579 + }, + { + "epoch": 0.24570815450643776, + "grad_norm": 1.6764822006225586, + "learning_rate": 4.416723459672985e-06, + "loss": 2.1154, + "step": 4580 + }, + { + "epoch": 0.2457618025751073, + "grad_norm": 1.2308107614517212, + "learning_rate": 4.416444536973811e-06, + "loss": 2.3505, + "step": 4581 + }, + { + "epoch": 0.24581545064377683, + "grad_norm": 1.3282898664474487, + "learning_rate": 4.4161655564118776e-06, + "loss": 2.2371, + "step": 4582 + }, + { + "epoch": 0.24586909871244636, + "grad_norm": 1.3079829216003418, + "learning_rate": 4.415886517995608e-06, + "loss": 2.2487, + "step": 4583 + }, + { + "epoch": 0.2459227467811159, + "grad_norm": 1.3554892539978027, + "learning_rate": 4.415607421733425e-06, + "loss": 2.3208, + "step": 4584 + }, + { + "epoch": 0.2459763948497854, + "grad_norm": 1.480136513710022, + "learning_rate": 4.415328267633757e-06, + "loss": 1.3268, + "step": 4585 + }, + { + "epoch": 0.24603004291845493, + "grad_norm": 1.1656535863876343, + "learning_rate": 4.415049055705032e-06, + "loss": 2.1294, + "step": 4586 + }, + { + "epoch": 0.24608369098712446, + "grad_norm": 1.221509575843811, + "learning_rate": 4.414769785955681e-06, + "loss": 2.1645, + "step": 4587 + }, + { + "epoch": 0.246137339055794, + "grad_norm": 1.19186532497406, + "learning_rate": 4.414490458394134e-06, + "loss": 2.2249, + "step": 4588 + }, + { + "epoch": 0.24619098712446352, + "grad_norm": 1.2068843841552734, + "learning_rate": 4.414211073028826e-06, + "loss": 1.8834, + "step": 4589 + }, + { + "epoch": 0.24624463519313305, + "grad_norm": 1.4786144495010376, + "learning_rate": 4.413931629868192e-06, + "loss": 2.2732, + "step": 4590 + }, + { + "epoch": 0.2462982832618026, + "grad_norm": 1.1416915655136108, + "learning_rate": 4.41365212892067e-06, + "loss": 2.257, + "step": 4591 + }, + { + "epoch": 0.2463519313304721, + "grad_norm": 1.228311538696289, + "learning_rate": 4.413372570194698e-06, + "loss": 2.3626, + "step": 4592 + }, + { + "epoch": 0.24640557939914162, + "grad_norm": 1.7493356466293335, + "learning_rate": 4.413092953698718e-06, + "loss": 2.141, + "step": 4593 + }, + { + "epoch": 0.24645922746781115, + "grad_norm": 1.8018168210983276, + "learning_rate": 4.412813279441169e-06, + "loss": 1.9612, + "step": 4594 + }, + { + "epoch": 0.24651287553648069, + "grad_norm": 1.1696953773498535, + "learning_rate": 4.4125335474305e-06, + "loss": 2.338, + "step": 4595 + }, + { + "epoch": 0.24656652360515022, + "grad_norm": 1.3282105922698975, + "learning_rate": 4.412253757675152e-06, + "loss": 2.0995, + "step": 4596 + }, + { + "epoch": 0.24662017167381975, + "grad_norm": 1.65318763256073, + "learning_rate": 4.4119739101835765e-06, + "loss": 2.4615, + "step": 4597 + }, + { + "epoch": 0.24667381974248928, + "grad_norm": 1.3003625869750977, + "learning_rate": 4.411694004964221e-06, + "loss": 2.2152, + "step": 4598 + }, + { + "epoch": 0.24672746781115878, + "grad_norm": 1.2720067501068115, + "learning_rate": 4.411414042025537e-06, + "loss": 2.2008, + "step": 4599 + }, + { + "epoch": 0.24678111587982832, + "grad_norm": 1.4852077960968018, + "learning_rate": 4.411134021375978e-06, + "loss": 2.4577, + "step": 4600 + }, + { + "epoch": 0.24683476394849785, + "grad_norm": 1.2844146490097046, + "learning_rate": 4.410853943023996e-06, + "loss": 2.3396, + "step": 4601 + }, + { + "epoch": 0.24688841201716738, + "grad_norm": 1.377570629119873, + "learning_rate": 4.410573806978051e-06, + "loss": 2.2999, + "step": 4602 + }, + { + "epoch": 0.2469420600858369, + "grad_norm": 0.9887990951538086, + "learning_rate": 4.410293613246599e-06, + "loss": 1.9186, + "step": 4603 + }, + { + "epoch": 0.24699570815450644, + "grad_norm": 1.2459425926208496, + "learning_rate": 4.4100133618381e-06, + "loss": 2.1994, + "step": 4604 + }, + { + "epoch": 0.24704935622317598, + "grad_norm": 1.4360429048538208, + "learning_rate": 4.4097330527610146e-06, + "loss": 2.3733, + "step": 4605 + }, + { + "epoch": 0.2471030042918455, + "grad_norm": 1.3495270013809204, + "learning_rate": 4.409452686023809e-06, + "loss": 2.3477, + "step": 4606 + }, + { + "epoch": 0.247156652360515, + "grad_norm": 1.353315830230713, + "learning_rate": 4.409172261634945e-06, + "loss": 2.384, + "step": 4607 + }, + { + "epoch": 0.24721030042918454, + "grad_norm": 1.401780366897583, + "learning_rate": 4.408891779602892e-06, + "loss": 2.3432, + "step": 4608 + }, + { + "epoch": 0.24726394849785407, + "grad_norm": 2.026226282119751, + "learning_rate": 4.408611239936117e-06, + "loss": 2.198, + "step": 4609 + }, + { + "epoch": 0.2473175965665236, + "grad_norm": 1.4425803422927856, + "learning_rate": 4.408330642643091e-06, + "loss": 1.5518, + "step": 4610 + }, + { + "epoch": 0.24737124463519314, + "grad_norm": 1.5362894535064697, + "learning_rate": 4.4080499877322855e-06, + "loss": 2.34, + "step": 4611 + }, + { + "epoch": 0.24742489270386267, + "grad_norm": 1.3754531145095825, + "learning_rate": 4.407769275212173e-06, + "loss": 2.0786, + "step": 4612 + }, + { + "epoch": 0.2474785407725322, + "grad_norm": 1.3047900199890137, + "learning_rate": 4.407488505091232e-06, + "loss": 2.3881, + "step": 4613 + }, + { + "epoch": 0.2475321888412017, + "grad_norm": 1.1291013956069946, + "learning_rate": 4.407207677377938e-06, + "loss": 2.0419, + "step": 4614 + }, + { + "epoch": 0.24758583690987124, + "grad_norm": 1.3228893280029297, + "learning_rate": 4.40692679208077e-06, + "loss": 2.3093, + "step": 4615 + }, + { + "epoch": 0.24763948497854077, + "grad_norm": 1.3220711946487427, + "learning_rate": 4.406645849208208e-06, + "loss": 2.456, + "step": 4616 + }, + { + "epoch": 0.2476931330472103, + "grad_norm": 1.175918698310852, + "learning_rate": 4.406364848768737e-06, + "loss": 2.0964, + "step": 4617 + }, + { + "epoch": 0.24774678111587983, + "grad_norm": 1.2820569276809692, + "learning_rate": 4.406083790770838e-06, + "loss": 2.1899, + "step": 4618 + }, + { + "epoch": 0.24780042918454936, + "grad_norm": 1.3543072938919067, + "learning_rate": 4.405802675222999e-06, + "loss": 2.246, + "step": 4619 + }, + { + "epoch": 0.2478540772532189, + "grad_norm": 1.2152683734893799, + "learning_rate": 4.405521502133706e-06, + "loss": 2.2243, + "step": 4620 + }, + { + "epoch": 0.2479077253218884, + "grad_norm": 1.4656566381454468, + "learning_rate": 4.40524027151145e-06, + "loss": 2.1674, + "step": 4621 + }, + { + "epoch": 0.24796137339055793, + "grad_norm": 2.225377082824707, + "learning_rate": 4.404958983364722e-06, + "loss": 1.9598, + "step": 4622 + }, + { + "epoch": 0.24801502145922746, + "grad_norm": 1.6383512020111084, + "learning_rate": 4.404677637702014e-06, + "loss": 2.1319, + "step": 4623 + }, + { + "epoch": 0.248068669527897, + "grad_norm": 1.3523112535476685, + "learning_rate": 4.40439623453182e-06, + "loss": 2.1963, + "step": 4624 + }, + { + "epoch": 0.24812231759656653, + "grad_norm": 1.373656988143921, + "learning_rate": 4.404114773862639e-06, + "loss": 2.1036, + "step": 4625 + }, + { + "epoch": 0.24817596566523606, + "grad_norm": 1.0224401950836182, + "learning_rate": 4.403833255702966e-06, + "loss": 1.8302, + "step": 4626 + }, + { + "epoch": 0.2482296137339056, + "grad_norm": 1.3691993951797485, + "learning_rate": 4.403551680061303e-06, + "loss": 2.4441, + "step": 4627 + }, + { + "epoch": 0.2482832618025751, + "grad_norm": 1.32490873336792, + "learning_rate": 4.403270046946151e-06, + "loss": 2.362, + "step": 4628 + }, + { + "epoch": 0.24833690987124463, + "grad_norm": 1.4081957340240479, + "learning_rate": 4.402988356366013e-06, + "loss": 1.9213, + "step": 4629 + }, + { + "epoch": 0.24839055793991416, + "grad_norm": 1.402766466140747, + "learning_rate": 4.402706608329394e-06, + "loss": 2.3816, + "step": 4630 + }, + { + "epoch": 0.2484442060085837, + "grad_norm": 9.932598114013672, + "learning_rate": 4.4024248028448005e-06, + "loss": 1.4311, + "step": 4631 + }, + { + "epoch": 0.24849785407725322, + "grad_norm": 1.2272510528564453, + "learning_rate": 4.402142939920741e-06, + "loss": 2.2296, + "step": 4632 + }, + { + "epoch": 0.24855150214592275, + "grad_norm": 1.2913213968276978, + "learning_rate": 4.401861019565726e-06, + "loss": 2.282, + "step": 4633 + }, + { + "epoch": 0.24860515021459229, + "grad_norm": 1.4505927562713623, + "learning_rate": 4.401579041788268e-06, + "loss": 2.2803, + "step": 4634 + }, + { + "epoch": 0.2486587982832618, + "grad_norm": 1.4837590456008911, + "learning_rate": 4.4012970065968805e-06, + "loss": 2.2837, + "step": 4635 + }, + { + "epoch": 0.24871244635193132, + "grad_norm": 1.3483814001083374, + "learning_rate": 4.401014914000078e-06, + "loss": 2.4499, + "step": 4636 + }, + { + "epoch": 0.24876609442060085, + "grad_norm": 1.0692014694213867, + "learning_rate": 4.400732764006378e-06, + "loss": 2.1507, + "step": 4637 + }, + { + "epoch": 0.24881974248927038, + "grad_norm": 1.3181699514389038, + "learning_rate": 4.4004505566243e-06, + "loss": 2.2928, + "step": 4638 + }, + { + "epoch": 0.24887339055793992, + "grad_norm": 1.6523408889770508, + "learning_rate": 4.400168291862364e-06, + "loss": 2.1778, + "step": 4639 + }, + { + "epoch": 0.24892703862660945, + "grad_norm": 1.372593641281128, + "learning_rate": 4.399885969729093e-06, + "loss": 2.255, + "step": 4640 + }, + { + "epoch": 0.24898068669527898, + "grad_norm": 1.036649227142334, + "learning_rate": 4.39960359023301e-06, + "loss": 2.2627, + "step": 4641 + }, + { + "epoch": 0.2490343347639485, + "grad_norm": 1.2437238693237305, + "learning_rate": 4.399321153382643e-06, + "loss": 1.6998, + "step": 4642 + }, + { + "epoch": 0.24908798283261802, + "grad_norm": 1.4217860698699951, + "learning_rate": 4.399038659186517e-06, + "loss": 2.3597, + "step": 4643 + }, + { + "epoch": 0.24914163090128755, + "grad_norm": 1.2888492345809937, + "learning_rate": 4.398756107653163e-06, + "loss": 2.2926, + "step": 4644 + }, + { + "epoch": 0.24919527896995708, + "grad_norm": 1.3465036153793335, + "learning_rate": 4.398473498791111e-06, + "loss": 2.4397, + "step": 4645 + }, + { + "epoch": 0.2492489270386266, + "grad_norm": 1.8906538486480713, + "learning_rate": 4.398190832608893e-06, + "loss": 2.1739, + "step": 4646 + }, + { + "epoch": 0.24930257510729614, + "grad_norm": 1.3516991138458252, + "learning_rate": 4.397908109115045e-06, + "loss": 2.2859, + "step": 4647 + }, + { + "epoch": 0.24935622317596567, + "grad_norm": 1.7208471298217773, + "learning_rate": 4.397625328318104e-06, + "loss": 2.3517, + "step": 4648 + }, + { + "epoch": 0.2494098712446352, + "grad_norm": 1.1863105297088623, + "learning_rate": 4.397342490226606e-06, + "loss": 2.3567, + "step": 4649 + }, + { + "epoch": 0.2494635193133047, + "grad_norm": 1.0315488576889038, + "learning_rate": 4.397059594849092e-06, + "loss": 1.8324, + "step": 4650 + }, + { + "epoch": 0.24951716738197424, + "grad_norm": 4.127085208892822, + "learning_rate": 4.396776642194102e-06, + "loss": 2.0841, + "step": 4651 + }, + { + "epoch": 0.24957081545064377, + "grad_norm": 3.1701173782348633, + "learning_rate": 4.39649363227018e-06, + "loss": 2.2888, + "step": 4652 + }, + { + "epoch": 0.2496244635193133, + "grad_norm": 1.468549370765686, + "learning_rate": 4.39621056508587e-06, + "loss": 2.0649, + "step": 4653 + }, + { + "epoch": 0.24967811158798284, + "grad_norm": 1.358656883239746, + "learning_rate": 4.39592744064972e-06, + "loss": 2.2778, + "step": 4654 + }, + { + "epoch": 0.24973175965665237, + "grad_norm": 1.4928723573684692, + "learning_rate": 4.395644258970279e-06, + "loss": 2.4162, + "step": 4655 + }, + { + "epoch": 0.2497854077253219, + "grad_norm": 1.3940508365631104, + "learning_rate": 4.395361020056094e-06, + "loss": 2.4959, + "step": 4656 + }, + { + "epoch": 0.2498390557939914, + "grad_norm": 1.3029959201812744, + "learning_rate": 4.395077723915718e-06, + "loss": 2.4055, + "step": 4657 + }, + { + "epoch": 0.24989270386266094, + "grad_norm": 1.213126540184021, + "learning_rate": 4.394794370557706e-06, + "loss": 2.2629, + "step": 4658 + }, + { + "epoch": 0.24994635193133047, + "grad_norm": 1.28671395778656, + "learning_rate": 4.394510959990612e-06, + "loss": 2.4889, + "step": 4659 + }, + { + "epoch": 0.25, + "grad_norm": 1.1249020099639893, + "learning_rate": 4.394227492222994e-06, + "loss": 2.5089, + "step": 4660 + }, + { + "epoch": 0.25005364806866953, + "grad_norm": 1.3175601959228516, + "learning_rate": 4.393943967263408e-06, + "loss": 2.1537, + "step": 4661 + }, + { + "epoch": 0.25010729613733906, + "grad_norm": 1.3439412117004395, + "learning_rate": 4.393660385120417e-06, + "loss": 2.4132, + "step": 4662 + }, + { + "epoch": 0.2501609442060086, + "grad_norm": 1.2446285486221313, + "learning_rate": 4.393376745802582e-06, + "loss": 2.14, + "step": 4663 + }, + { + "epoch": 0.2502145922746781, + "grad_norm": 1.2032802104949951, + "learning_rate": 4.3930930493184675e-06, + "loss": 2.5206, + "step": 4664 + }, + { + "epoch": 0.25026824034334766, + "grad_norm": 1.272829532623291, + "learning_rate": 4.392809295676639e-06, + "loss": 2.3303, + "step": 4665 + }, + { + "epoch": 0.2503218884120172, + "grad_norm": 5.965005397796631, + "learning_rate": 4.392525484885663e-06, + "loss": 2.2117, + "step": 4666 + }, + { + "epoch": 0.2503755364806867, + "grad_norm": 1.2145698070526123, + "learning_rate": 4.39224161695411e-06, + "loss": 2.3229, + "step": 4667 + }, + { + "epoch": 0.2504291845493562, + "grad_norm": 1.269827961921692, + "learning_rate": 4.39195769189055e-06, + "loss": 2.2984, + "step": 4668 + }, + { + "epoch": 0.25048283261802573, + "grad_norm": 1.3151859045028687, + "learning_rate": 4.391673709703553e-06, + "loss": 2.0711, + "step": 4669 + }, + { + "epoch": 0.25053648068669526, + "grad_norm": 1.1792100667953491, + "learning_rate": 4.391389670401698e-06, + "loss": 2.3515, + "step": 4670 + }, + { + "epoch": 0.2505901287553648, + "grad_norm": 1.3208152055740356, + "learning_rate": 4.391105573993557e-06, + "loss": 2.0223, + "step": 4671 + }, + { + "epoch": 0.2506437768240343, + "grad_norm": 1.4561305046081543, + "learning_rate": 4.3908214204877094e-06, + "loss": 2.1993, + "step": 4672 + }, + { + "epoch": 0.25069742489270386, + "grad_norm": 1.6801458597183228, + "learning_rate": 4.390537209892735e-06, + "loss": 2.0056, + "step": 4673 + }, + { + "epoch": 0.2507510729613734, + "grad_norm": 1.3332023620605469, + "learning_rate": 4.390252942217214e-06, + "loss": 2.3001, + "step": 4674 + }, + { + "epoch": 0.2508047210300429, + "grad_norm": 1.06626558303833, + "learning_rate": 4.389968617469728e-06, + "loss": 1.678, + "step": 4675 + }, + { + "epoch": 0.25085836909871245, + "grad_norm": 1.14922297000885, + "learning_rate": 4.389684235658866e-06, + "loss": 2.3139, + "step": 4676 + }, + { + "epoch": 0.250912017167382, + "grad_norm": 1.2489769458770752, + "learning_rate": 4.389399796793209e-06, + "loss": 2.3458, + "step": 4677 + }, + { + "epoch": 0.2509656652360515, + "grad_norm": 1.3004792928695679, + "learning_rate": 4.389115300881347e-06, + "loss": 2.273, + "step": 4678 + }, + { + "epoch": 0.25101931330472105, + "grad_norm": 1.2159384489059448, + "learning_rate": 4.388830747931871e-06, + "loss": 2.2974, + "step": 4679 + }, + { + "epoch": 0.2510729613733906, + "grad_norm": 1.4995081424713135, + "learning_rate": 4.3885461379533715e-06, + "loss": 2.1989, + "step": 4680 + }, + { + "epoch": 0.2511266094420601, + "grad_norm": 1.3097503185272217, + "learning_rate": 4.388261470954441e-06, + "loss": 2.2224, + "step": 4681 + }, + { + "epoch": 0.2511802575107296, + "grad_norm": 1.3637512922286987, + "learning_rate": 4.3879767469436755e-06, + "loss": 1.6261, + "step": 4682 + }, + { + "epoch": 0.2512339055793991, + "grad_norm": 1.4383403062820435, + "learning_rate": 4.38769196592967e-06, + "loss": 2.1897, + "step": 4683 + }, + { + "epoch": 0.25128755364806865, + "grad_norm": 1.5342657566070557, + "learning_rate": 4.387407127921025e-06, + "loss": 2.2836, + "step": 4684 + }, + { + "epoch": 0.2513412017167382, + "grad_norm": 1.454198956489563, + "learning_rate": 4.387122232926338e-06, + "loss": 1.9238, + "step": 4685 + }, + { + "epoch": 0.2513948497854077, + "grad_norm": 1.2759591341018677, + "learning_rate": 4.386837280954214e-06, + "loss": 2.0071, + "step": 4686 + }, + { + "epoch": 0.25144849785407725, + "grad_norm": 1.6622095108032227, + "learning_rate": 4.386552272013254e-06, + "loss": 2.5093, + "step": 4687 + }, + { + "epoch": 0.2515021459227468, + "grad_norm": 1.4091709852218628, + "learning_rate": 4.386267206112064e-06, + "loss": 2.4049, + "step": 4688 + }, + { + "epoch": 0.2515557939914163, + "grad_norm": 1.3885761499404907, + "learning_rate": 4.38598208325925e-06, + "loss": 2.2699, + "step": 4689 + }, + { + "epoch": 0.25160944206008584, + "grad_norm": 1.3105758428573608, + "learning_rate": 4.385696903463422e-06, + "loss": 1.9874, + "step": 4690 + }, + { + "epoch": 0.2516630901287554, + "grad_norm": 1.6249362230300903, + "learning_rate": 4.38541166673319e-06, + "loss": 2.2224, + "step": 4691 + }, + { + "epoch": 0.2517167381974249, + "grad_norm": 1.2358310222625732, + "learning_rate": 4.385126373077167e-06, + "loss": 2.2253, + "step": 4692 + }, + { + "epoch": 0.25177038626609444, + "grad_norm": 1.5247166156768799, + "learning_rate": 4.384841022503964e-06, + "loss": 1.2933, + "step": 4693 + }, + { + "epoch": 0.25182403433476397, + "grad_norm": 1.4652175903320312, + "learning_rate": 4.384555615022199e-06, + "loss": 2.2749, + "step": 4694 + }, + { + "epoch": 0.2518776824034335, + "grad_norm": 1.1321232318878174, + "learning_rate": 4.384270150640488e-06, + "loss": 2.1815, + "step": 4695 + }, + { + "epoch": 0.251931330472103, + "grad_norm": 1.2901922464370728, + "learning_rate": 4.383984629367451e-06, + "loss": 2.2997, + "step": 4696 + }, + { + "epoch": 0.2519849785407725, + "grad_norm": 1.2474418878555298, + "learning_rate": 4.383699051211708e-06, + "loss": 2.2561, + "step": 4697 + }, + { + "epoch": 0.25203862660944204, + "grad_norm": 1.359551191329956, + "learning_rate": 4.383413416181882e-06, + "loss": 2.5746, + "step": 4698 + }, + { + "epoch": 0.25209227467811157, + "grad_norm": 1.6498521566390991, + "learning_rate": 4.383127724286596e-06, + "loss": 2.2727, + "step": 4699 + }, + { + "epoch": 0.2521459227467811, + "grad_norm": 1.4336018562316895, + "learning_rate": 4.382841975534476e-06, + "loss": 2.416, + "step": 4700 + }, + { + "epoch": 0.25219957081545064, + "grad_norm": 1.3909361362457275, + "learning_rate": 4.382556169934151e-06, + "loss": 1.9366, + "step": 4701 + }, + { + "epoch": 0.25225321888412017, + "grad_norm": 1.1623016595840454, + "learning_rate": 4.382270307494249e-06, + "loss": 2.0354, + "step": 4702 + }, + { + "epoch": 0.2523068669527897, + "grad_norm": 1.2541495561599731, + "learning_rate": 4.381984388223402e-06, + "loss": 2.1368, + "step": 4703 + }, + { + "epoch": 0.25236051502145923, + "grad_norm": 1.213975429534912, + "learning_rate": 4.381698412130241e-06, + "loss": 1.7543, + "step": 4704 + }, + { + "epoch": 0.25241416309012876, + "grad_norm": 1.3815008401870728, + "learning_rate": 4.381412379223401e-06, + "loss": 2.3144, + "step": 4705 + }, + { + "epoch": 0.2524678111587983, + "grad_norm": 1.5426162481307983, + "learning_rate": 4.381126289511519e-06, + "loss": 2.3306, + "step": 4706 + }, + { + "epoch": 0.2525214592274678, + "grad_norm": 1.4915313720703125, + "learning_rate": 4.380840143003232e-06, + "loss": 2.1037, + "step": 4707 + }, + { + "epoch": 0.25257510729613736, + "grad_norm": 1.4083093404769897, + "learning_rate": 4.380553939707181e-06, + "loss": 2.42, + "step": 4708 + }, + { + "epoch": 0.2526287553648069, + "grad_norm": 1.288374423980713, + "learning_rate": 4.380267679632004e-06, + "loss": 2.1754, + "step": 4709 + }, + { + "epoch": 0.2526824034334764, + "grad_norm": 2.345243453979492, + "learning_rate": 4.379981362786348e-06, + "loss": 2.3276, + "step": 4710 + }, + { + "epoch": 0.2527360515021459, + "grad_norm": 1.269438624382019, + "learning_rate": 4.379694989178855e-06, + "loss": 2.1133, + "step": 4711 + }, + { + "epoch": 0.25278969957081543, + "grad_norm": 1.1539441347122192, + "learning_rate": 4.379408558818173e-06, + "loss": 2.2667, + "step": 4712 + }, + { + "epoch": 0.25284334763948496, + "grad_norm": 1.351917028427124, + "learning_rate": 4.379122071712949e-06, + "loss": 1.3903, + "step": 4713 + }, + { + "epoch": 0.2528969957081545, + "grad_norm": 1.2214667797088623, + "learning_rate": 4.3788355278718325e-06, + "loss": 2.6417, + "step": 4714 + }, + { + "epoch": 0.252950643776824, + "grad_norm": 1.4418749809265137, + "learning_rate": 4.3785489273034765e-06, + "loss": 2.4021, + "step": 4715 + }, + { + "epoch": 0.25300429184549356, + "grad_norm": 1.436390995979309, + "learning_rate": 4.378262270016533e-06, + "loss": 2.2285, + "step": 4716 + }, + { + "epoch": 0.2530579399141631, + "grad_norm": 1.3132411241531372, + "learning_rate": 4.377975556019658e-06, + "loss": 2.1544, + "step": 4717 + }, + { + "epoch": 0.2531115879828326, + "grad_norm": 1.0356918573379517, + "learning_rate": 4.377688785321507e-06, + "loss": 2.1527, + "step": 4718 + }, + { + "epoch": 0.25316523605150215, + "grad_norm": 1.6169227361679077, + "learning_rate": 4.37740195793074e-06, + "loss": 2.1745, + "step": 4719 + }, + { + "epoch": 0.2532188841201717, + "grad_norm": 1.1928679943084717, + "learning_rate": 4.377115073856016e-06, + "loss": 1.9766, + "step": 4720 + }, + { + "epoch": 0.2532725321888412, + "grad_norm": 1.1258528232574463, + "learning_rate": 4.376828133105996e-06, + "loss": 2.0901, + "step": 4721 + }, + { + "epoch": 0.25332618025751075, + "grad_norm": 1.106016993522644, + "learning_rate": 4.376541135689345e-06, + "loss": 2.1961, + "step": 4722 + }, + { + "epoch": 0.2533798283261803, + "grad_norm": 2.232729434967041, + "learning_rate": 4.376254081614728e-06, + "loss": 1.7107, + "step": 4723 + }, + { + "epoch": 0.2534334763948498, + "grad_norm": 1.2458021640777588, + "learning_rate": 4.375966970890812e-06, + "loss": 2.4819, + "step": 4724 + }, + { + "epoch": 0.2534871244635193, + "grad_norm": 1.1124881505966187, + "learning_rate": 4.375679803526266e-06, + "loss": 1.9711, + "step": 4725 + }, + { + "epoch": 0.2535407725321888, + "grad_norm": 1.2916507720947266, + "learning_rate": 4.375392579529759e-06, + "loss": 2.2706, + "step": 4726 + }, + { + "epoch": 0.25359442060085835, + "grad_norm": 1.2368227243423462, + "learning_rate": 4.375105298909963e-06, + "loss": 2.0921, + "step": 4727 + }, + { + "epoch": 0.2536480686695279, + "grad_norm": 1.4035682678222656, + "learning_rate": 4.374817961675554e-06, + "loss": 2.3576, + "step": 4728 + }, + { + "epoch": 0.2537017167381974, + "grad_norm": 1.6265597343444824, + "learning_rate": 4.374530567835206e-06, + "loss": 2.3354, + "step": 4729 + }, + { + "epoch": 0.25375536480686695, + "grad_norm": 1.3577080965042114, + "learning_rate": 4.3742431173975956e-06, + "loss": 2.0817, + "step": 4730 + }, + { + "epoch": 0.2538090128755365, + "grad_norm": 1.3620152473449707, + "learning_rate": 4.373955610371403e-06, + "loss": 2.4497, + "step": 4731 + }, + { + "epoch": 0.253862660944206, + "grad_norm": 1.1478400230407715, + "learning_rate": 4.373668046765308e-06, + "loss": 1.8957, + "step": 4732 + }, + { + "epoch": 0.25391630901287554, + "grad_norm": 1.1924848556518555, + "learning_rate": 4.3733804265879945e-06, + "loss": 2.0451, + "step": 4733 + }, + { + "epoch": 0.2539699570815451, + "grad_norm": 1.3766313791275024, + "learning_rate": 4.373092749848145e-06, + "loss": 1.9761, + "step": 4734 + }, + { + "epoch": 0.2540236051502146, + "grad_norm": 1.173679232597351, + "learning_rate": 4.372805016554444e-06, + "loss": 2.2394, + "step": 4735 + }, + { + "epoch": 0.25407725321888414, + "grad_norm": 1.3310160636901855, + "learning_rate": 4.372517226715582e-06, + "loss": 2.5217, + "step": 4736 + }, + { + "epoch": 0.25413090128755367, + "grad_norm": 1.086255669593811, + "learning_rate": 4.372229380340247e-06, + "loss": 1.9712, + "step": 4737 + }, + { + "epoch": 0.2541845493562232, + "grad_norm": 1.2872341871261597, + "learning_rate": 4.371941477437128e-06, + "loss": 2.5924, + "step": 4738 + }, + { + "epoch": 0.25423819742489273, + "grad_norm": 1.2670516967773438, + "learning_rate": 4.3716535180149215e-06, + "loss": 2.1307, + "step": 4739 + }, + { + "epoch": 0.2542918454935622, + "grad_norm": 1.1846305131912231, + "learning_rate": 4.371365502082318e-06, + "loss": 2.2837, + "step": 4740 + }, + { + "epoch": 0.25434549356223174, + "grad_norm": 1.1272854804992676, + "learning_rate": 4.371077429648015e-06, + "loss": 1.8231, + "step": 4741 + }, + { + "epoch": 0.25439914163090127, + "grad_norm": 1.2442699670791626, + "learning_rate": 4.37078930072071e-06, + "loss": 2.1641, + "step": 4742 + }, + { + "epoch": 0.2544527896995708, + "grad_norm": 1.3901772499084473, + "learning_rate": 4.370501115309103e-06, + "loss": 2.1756, + "step": 4743 + }, + { + "epoch": 0.25450643776824033, + "grad_norm": 1.3693444728851318, + "learning_rate": 4.370212873421895e-06, + "loss": 2.2055, + "step": 4744 + }, + { + "epoch": 0.25456008583690987, + "grad_norm": 1.129073143005371, + "learning_rate": 4.369924575067788e-06, + "loss": 2.0782, + "step": 4745 + }, + { + "epoch": 0.2546137339055794, + "grad_norm": 1.4586212635040283, + "learning_rate": 4.369636220255487e-06, + "loss": 2.2177, + "step": 4746 + }, + { + "epoch": 0.25466738197424893, + "grad_norm": 1.190265417098999, + "learning_rate": 4.369347808993699e-06, + "loss": 2.2529, + "step": 4747 + }, + { + "epoch": 0.25472103004291846, + "grad_norm": 1.3085672855377197, + "learning_rate": 4.369059341291131e-06, + "loss": 2.2911, + "step": 4748 + }, + { + "epoch": 0.254774678111588, + "grad_norm": 1.2679541110992432, + "learning_rate": 4.368770817156493e-06, + "loss": 2.42, + "step": 4749 + }, + { + "epoch": 0.2548283261802575, + "grad_norm": 1.1705948114395142, + "learning_rate": 4.368482236598495e-06, + "loss": 2.1182, + "step": 4750 + }, + { + "epoch": 0.25488197424892706, + "grad_norm": 1.5130914449691772, + "learning_rate": 4.3681935996258524e-06, + "loss": 2.3419, + "step": 4751 + }, + { + "epoch": 0.2549356223175966, + "grad_norm": 1.3825738430023193, + "learning_rate": 4.3679049062472785e-06, + "loss": 2.3169, + "step": 4752 + }, + { + "epoch": 0.2549892703862661, + "grad_norm": 1.3916094303131104, + "learning_rate": 4.36761615647149e-06, + "loss": 2.2963, + "step": 4753 + }, + { + "epoch": 0.2550429184549356, + "grad_norm": 1.3945997953414917, + "learning_rate": 4.367327350307205e-06, + "loss": 2.3649, + "step": 4754 + }, + { + "epoch": 0.25509656652360513, + "grad_norm": 1.6407297849655151, + "learning_rate": 4.367038487763143e-06, + "loss": 2.2635, + "step": 4755 + }, + { + "epoch": 0.25515021459227466, + "grad_norm": 1.2859954833984375, + "learning_rate": 4.366749568848027e-06, + "loss": 2.1939, + "step": 4756 + }, + { + "epoch": 0.2552038626609442, + "grad_norm": 1.234242558479309, + "learning_rate": 4.366460593570579e-06, + "loss": 2.2253, + "step": 4757 + }, + { + "epoch": 0.2552575107296137, + "grad_norm": 1.9538100957870483, + "learning_rate": 4.366171561939524e-06, + "loss": 2.3194, + "step": 4758 + }, + { + "epoch": 0.25531115879828326, + "grad_norm": 1.3371589183807373, + "learning_rate": 4.365882473963588e-06, + "loss": 2.5109, + "step": 4759 + }, + { + "epoch": 0.2553648068669528, + "grad_norm": 1.3439959287643433, + "learning_rate": 4.365593329651502e-06, + "loss": 2.4364, + "step": 4760 + }, + { + "epoch": 0.2554184549356223, + "grad_norm": 3.110311985015869, + "learning_rate": 4.3653041290119936e-06, + "loss": 2.3144, + "step": 4761 + }, + { + "epoch": 0.25547210300429185, + "grad_norm": 1.2985212802886963, + "learning_rate": 4.365014872053795e-06, + "loss": 2.2498, + "step": 4762 + }, + { + "epoch": 0.2555257510729614, + "grad_norm": 1.2323898077011108, + "learning_rate": 4.3647255587856405e-06, + "loss": 2.2981, + "step": 4763 + }, + { + "epoch": 0.2555793991416309, + "grad_norm": 3.187609910964966, + "learning_rate": 4.364436189216264e-06, + "loss": 2.3934, + "step": 4764 + }, + { + "epoch": 0.25563304721030045, + "grad_norm": 1.452471375465393, + "learning_rate": 4.364146763354404e-06, + "loss": 2.1343, + "step": 4765 + }, + { + "epoch": 0.25568669527897, + "grad_norm": 1.3680399656295776, + "learning_rate": 4.363857281208797e-06, + "loss": 2.2963, + "step": 4766 + }, + { + "epoch": 0.2557403433476395, + "grad_norm": 1.295642375946045, + "learning_rate": 4.363567742788186e-06, + "loss": 2.3311, + "step": 4767 + }, + { + "epoch": 0.255793991416309, + "grad_norm": 1.542189359664917, + "learning_rate": 4.363278148101311e-06, + "loss": 2.284, + "step": 4768 + }, + { + "epoch": 0.2558476394849785, + "grad_norm": 1.2334336042404175, + "learning_rate": 4.362988497156915e-06, + "loss": 2.2288, + "step": 4769 + }, + { + "epoch": 0.25590128755364805, + "grad_norm": 1.2505131959915161, + "learning_rate": 4.362698789963745e-06, + "loss": 2.2876, + "step": 4770 + }, + { + "epoch": 0.2559549356223176, + "grad_norm": 1.258476972579956, + "learning_rate": 4.362409026530548e-06, + "loss": 2.1297, + "step": 4771 + }, + { + "epoch": 0.2560085836909871, + "grad_norm": 1.3791723251342773, + "learning_rate": 4.362119206866071e-06, + "loss": 2.157, + "step": 4772 + }, + { + "epoch": 0.25606223175965664, + "grad_norm": 1.2837340831756592, + "learning_rate": 4.361829330979067e-06, + "loss": 2.3408, + "step": 4773 + }, + { + "epoch": 0.2561158798283262, + "grad_norm": 1.2569209337234497, + "learning_rate": 4.361539398878286e-06, + "loss": 2.3955, + "step": 4774 + }, + { + "epoch": 0.2561695278969957, + "grad_norm": 1.3036630153656006, + "learning_rate": 4.3612494105724835e-06, + "loss": 2.1031, + "step": 4775 + }, + { + "epoch": 0.25622317596566524, + "grad_norm": 1.3954561948776245, + "learning_rate": 4.360959366070414e-06, + "loss": 2.4999, + "step": 4776 + }, + { + "epoch": 0.25627682403433477, + "grad_norm": 1.5732146501541138, + "learning_rate": 4.360669265380835e-06, + "loss": 2.2499, + "step": 4777 + }, + { + "epoch": 0.2563304721030043, + "grad_norm": 1.2605928182601929, + "learning_rate": 4.360379108512505e-06, + "loss": 2.2825, + "step": 4778 + }, + { + "epoch": 0.25638412017167383, + "grad_norm": 1.430039644241333, + "learning_rate": 4.360088895474185e-06, + "loss": 2.0969, + "step": 4779 + }, + { + "epoch": 0.25643776824034337, + "grad_norm": 1.4888466596603394, + "learning_rate": 4.359798626274639e-06, + "loss": 2.4221, + "step": 4780 + }, + { + "epoch": 0.2564914163090129, + "grad_norm": 1.2765727043151855, + "learning_rate": 4.359508300922628e-06, + "loss": 2.3994, + "step": 4781 + }, + { + "epoch": 0.25654506437768243, + "grad_norm": 1.730627179145813, + "learning_rate": 4.359217919426919e-06, + "loss": 2.3335, + "step": 4782 + }, + { + "epoch": 0.2565987124463519, + "grad_norm": 1.6021413803100586, + "learning_rate": 4.35892748179628e-06, + "loss": 2.335, + "step": 4783 + }, + { + "epoch": 0.25665236051502144, + "grad_norm": 2.3130877017974854, + "learning_rate": 4.358636988039479e-06, + "loss": 2.2119, + "step": 4784 + }, + { + "epoch": 0.25670600858369097, + "grad_norm": 1.1009377241134644, + "learning_rate": 4.358346438165289e-06, + "loss": 2.171, + "step": 4785 + }, + { + "epoch": 0.2567596566523605, + "grad_norm": 1.1690815687179565, + "learning_rate": 4.358055832182479e-06, + "loss": 2.4706, + "step": 4786 + }, + { + "epoch": 0.25681330472103003, + "grad_norm": 1.5092735290527344, + "learning_rate": 4.357765170099828e-06, + "loss": 2.5165, + "step": 4787 + }, + { + "epoch": 0.25686695278969957, + "grad_norm": 1.144112467765808, + "learning_rate": 4.357474451926107e-06, + "loss": 2.153, + "step": 4788 + }, + { + "epoch": 0.2569206008583691, + "grad_norm": 1.3317633867263794, + "learning_rate": 4.357183677670097e-06, + "loss": 2.1146, + "step": 4789 + }, + { + "epoch": 0.25697424892703863, + "grad_norm": 1.3423964977264404, + "learning_rate": 4.356892847340575e-06, + "loss": 2.5807, + "step": 4790 + }, + { + "epoch": 0.25702789699570816, + "grad_norm": 1.280236840248108, + "learning_rate": 4.356601960946322e-06, + "loss": 2.4304, + "step": 4791 + }, + { + "epoch": 0.2570815450643777, + "grad_norm": 1.5411491394042969, + "learning_rate": 4.356311018496124e-06, + "loss": 2.2148, + "step": 4792 + }, + { + "epoch": 0.2571351931330472, + "grad_norm": 1.3796104192733765, + "learning_rate": 4.356020019998761e-06, + "loss": 2.3774, + "step": 4793 + }, + { + "epoch": 0.25718884120171676, + "grad_norm": 1.3540996313095093, + "learning_rate": 4.355728965463023e-06, + "loss": 2.5229, + "step": 4794 + }, + { + "epoch": 0.2572424892703863, + "grad_norm": 2.3865864276885986, + "learning_rate": 4.355437854897694e-06, + "loss": 2.2571, + "step": 4795 + }, + { + "epoch": 0.2572961373390558, + "grad_norm": 1.2192660570144653, + "learning_rate": 4.355146688311566e-06, + "loss": 2.2241, + "step": 4796 + }, + { + "epoch": 0.2573497854077253, + "grad_norm": 1.3640340566635132, + "learning_rate": 4.354855465713429e-06, + "loss": 2.2832, + "step": 4797 + }, + { + "epoch": 0.2574034334763948, + "grad_norm": 1.5324517488479614, + "learning_rate": 4.3545641871120764e-06, + "loss": 2.3925, + "step": 4798 + }, + { + "epoch": 0.25745708154506436, + "grad_norm": 1.3580207824707031, + "learning_rate": 4.354272852516302e-06, + "loss": 2.4024, + "step": 4799 + }, + { + "epoch": 0.2575107296137339, + "grad_norm": 1.262824296951294, + "learning_rate": 4.353981461934903e-06, + "loss": 2.2451, + "step": 4800 + }, + { + "epoch": 0.2575643776824034, + "grad_norm": 22.03597640991211, + "learning_rate": 4.353690015376677e-06, + "loss": 2.2809, + "step": 4801 + }, + { + "epoch": 0.25761802575107295, + "grad_norm": 1.4872517585754395, + "learning_rate": 4.353398512850422e-06, + "loss": 2.2963, + "step": 4802 + }, + { + "epoch": 0.2576716738197425, + "grad_norm": 1.7533700466156006, + "learning_rate": 4.353106954364942e-06, + "loss": 2.1917, + "step": 4803 + }, + { + "epoch": 0.257725321888412, + "grad_norm": 1.1470719575881958, + "learning_rate": 4.3528153399290375e-06, + "loss": 1.9893, + "step": 4804 + }, + { + "epoch": 0.25777896995708155, + "grad_norm": 1.649128794670105, + "learning_rate": 4.352523669551515e-06, + "loss": 1.6331, + "step": 4805 + }, + { + "epoch": 0.2578326180257511, + "grad_norm": 1.27980375289917, + "learning_rate": 4.352231943241179e-06, + "loss": 2.1409, + "step": 4806 + }, + { + "epoch": 0.2578862660944206, + "grad_norm": 1.337688684463501, + "learning_rate": 4.35194016100684e-06, + "loss": 2.5873, + "step": 4807 + }, + { + "epoch": 0.25793991416309014, + "grad_norm": 1.3720512390136719, + "learning_rate": 4.351648322857305e-06, + "loss": 2.2802, + "step": 4808 + }, + { + "epoch": 0.2579935622317597, + "grad_norm": 2.1326920986175537, + "learning_rate": 4.351356428801387e-06, + "loss": 2.3616, + "step": 4809 + }, + { + "epoch": 0.2580472103004292, + "grad_norm": 1.1424493789672852, + "learning_rate": 4.3510644788478984e-06, + "loss": 1.5801, + "step": 4810 + }, + { + "epoch": 0.25810085836909874, + "grad_norm": 1.3555235862731934, + "learning_rate": 4.350772473005655e-06, + "loss": 2.1498, + "step": 4811 + }, + { + "epoch": 0.2581545064377682, + "grad_norm": 1.408324122428894, + "learning_rate": 4.350480411283472e-06, + "loss": 1.809, + "step": 4812 + }, + { + "epoch": 0.25820815450643775, + "grad_norm": 1.331098198890686, + "learning_rate": 4.3501882936901684e-06, + "loss": 2.0249, + "step": 4813 + }, + { + "epoch": 0.2582618025751073, + "grad_norm": 1.2944880723953247, + "learning_rate": 4.349896120234564e-06, + "loss": 2.2023, + "step": 4814 + }, + { + "epoch": 0.2583154506437768, + "grad_norm": 1.7539501190185547, + "learning_rate": 4.349603890925479e-06, + "loss": 2.4232, + "step": 4815 + }, + { + "epoch": 0.25836909871244634, + "grad_norm": 1.835427165031433, + "learning_rate": 4.349311605771739e-06, + "loss": 2.289, + "step": 4816 + }, + { + "epoch": 0.2584227467811159, + "grad_norm": 1.4379550218582153, + "learning_rate": 4.349019264782167e-06, + "loss": 2.138, + "step": 4817 + }, + { + "epoch": 0.2584763948497854, + "grad_norm": 1.2745985984802246, + "learning_rate": 4.348726867965591e-06, + "loss": 2.4098, + "step": 4818 + }, + { + "epoch": 0.25853004291845494, + "grad_norm": 1.4098281860351562, + "learning_rate": 4.348434415330838e-06, + "loss": 2.3017, + "step": 4819 + }, + { + "epoch": 0.25858369098712447, + "grad_norm": 2.8383677005767822, + "learning_rate": 4.348141906886738e-06, + "loss": 2.3425, + "step": 4820 + }, + { + "epoch": 0.258637339055794, + "grad_norm": 1.1675755977630615, + "learning_rate": 4.347849342642123e-06, + "loss": 2.2059, + "step": 4821 + }, + { + "epoch": 0.25869098712446353, + "grad_norm": 1.7232476472854614, + "learning_rate": 4.347556722605827e-06, + "loss": 2.3554, + "step": 4822 + }, + { + "epoch": 0.25874463519313307, + "grad_norm": 1.950708031654358, + "learning_rate": 4.347264046786684e-06, + "loss": 2.3196, + "step": 4823 + }, + { + "epoch": 0.2587982832618026, + "grad_norm": 1.338625431060791, + "learning_rate": 4.34697131519353e-06, + "loss": 2.3636, + "step": 4824 + }, + { + "epoch": 0.25885193133047213, + "grad_norm": 1.305640459060669, + "learning_rate": 4.346678527835207e-06, + "loss": 2.2867, + "step": 4825 + }, + { + "epoch": 0.2589055793991416, + "grad_norm": 1.4617409706115723, + "learning_rate": 4.3463856847205495e-06, + "loss": 2.3148, + "step": 4826 + }, + { + "epoch": 0.25895922746781114, + "grad_norm": 1.5991469621658325, + "learning_rate": 4.3460927858584045e-06, + "loss": 2.2579, + "step": 4827 + }, + { + "epoch": 0.25901287553648067, + "grad_norm": 1.3048694133758545, + "learning_rate": 4.345799831257612e-06, + "loss": 2.3459, + "step": 4828 + }, + { + "epoch": 0.2590665236051502, + "grad_norm": 1.165088176727295, + "learning_rate": 4.345506820927019e-06, + "loss": 1.4534, + "step": 4829 + }, + { + "epoch": 0.25912017167381973, + "grad_norm": 1.4183557033538818, + "learning_rate": 4.345213754875471e-06, + "loss": 2.0545, + "step": 4830 + }, + { + "epoch": 0.25917381974248926, + "grad_norm": 1.3080843687057495, + "learning_rate": 4.344920633111818e-06, + "loss": 2.364, + "step": 4831 + }, + { + "epoch": 0.2592274678111588, + "grad_norm": 1.7696107625961304, + "learning_rate": 4.344627455644908e-06, + "loss": 2.2423, + "step": 4832 + }, + { + "epoch": 0.2592811158798283, + "grad_norm": 1.2515175342559814, + "learning_rate": 4.344334222483595e-06, + "loss": 2.3407, + "step": 4833 + }, + { + "epoch": 0.25933476394849786, + "grad_norm": 1.464719533920288, + "learning_rate": 4.34404093363673e-06, + "loss": 2.3535, + "step": 4834 + }, + { + "epoch": 0.2593884120171674, + "grad_norm": 1.2068192958831787, + "learning_rate": 4.343747589113171e-06, + "loss": 2.1454, + "step": 4835 + }, + { + "epoch": 0.2594420600858369, + "grad_norm": 1.422254204750061, + "learning_rate": 4.343454188921774e-06, + "loss": 2.3287, + "step": 4836 + }, + { + "epoch": 0.25949570815450645, + "grad_norm": 1.320586085319519, + "learning_rate": 4.343160733071396e-06, + "loss": 2.1164, + "step": 4837 + }, + { + "epoch": 0.259549356223176, + "grad_norm": 1.4359910488128662, + "learning_rate": 4.3428672215709e-06, + "loss": 2.3546, + "step": 4838 + }, + { + "epoch": 0.2596030042918455, + "grad_norm": 1.2519913911819458, + "learning_rate": 4.342573654429146e-06, + "loss": 2.0381, + "step": 4839 + }, + { + "epoch": 0.259656652360515, + "grad_norm": 1.3789550065994263, + "learning_rate": 4.342280031654997e-06, + "loss": 2.4307, + "step": 4840 + }, + { + "epoch": 0.2597103004291845, + "grad_norm": 1.4557098150253296, + "learning_rate": 4.34198635325732e-06, + "loss": 2.176, + "step": 4841 + }, + { + "epoch": 0.25976394849785406, + "grad_norm": 1.2449498176574707, + "learning_rate": 4.341692619244981e-06, + "loss": 2.3798, + "step": 4842 + }, + { + "epoch": 0.2598175965665236, + "grad_norm": 1.8051986694335938, + "learning_rate": 4.34139882962685e-06, + "loss": 2.2467, + "step": 4843 + }, + { + "epoch": 0.2598712446351931, + "grad_norm": 1.278777003288269, + "learning_rate": 4.3411049844117945e-06, + "loss": 2.2439, + "step": 4844 + }, + { + "epoch": 0.25992489270386265, + "grad_norm": 1.264844536781311, + "learning_rate": 4.3408110836086886e-06, + "loss": 2.0835, + "step": 4845 + }, + { + "epoch": 0.2599785407725322, + "grad_norm": 1.421617031097412, + "learning_rate": 4.340517127226405e-06, + "loss": 2.4834, + "step": 4846 + }, + { + "epoch": 0.2600321888412017, + "grad_norm": 1.0610733032226562, + "learning_rate": 4.340223115273822e-06, + "loss": 2.1221, + "step": 4847 + }, + { + "epoch": 0.26008583690987125, + "grad_norm": 1.3463908433914185, + "learning_rate": 4.339929047759812e-06, + "loss": 2.3249, + "step": 4848 + }, + { + "epoch": 0.2601394849785408, + "grad_norm": 1.3777925968170166, + "learning_rate": 4.339634924693257e-06, + "loss": 2.2485, + "step": 4849 + }, + { + "epoch": 0.2601931330472103, + "grad_norm": 2.166346549987793, + "learning_rate": 4.339340746083036e-06, + "loss": 2.0172, + "step": 4850 + }, + { + "epoch": 0.26024678111587984, + "grad_norm": 1.351366639137268, + "learning_rate": 4.339046511938031e-06, + "loss": 2.2608, + "step": 4851 + }, + { + "epoch": 0.2603004291845494, + "grad_norm": 1.4497090578079224, + "learning_rate": 4.338752222267127e-06, + "loss": 2.2863, + "step": 4852 + }, + { + "epoch": 0.2603540772532189, + "grad_norm": 1.5247631072998047, + "learning_rate": 4.338457877079208e-06, + "loss": 2.1968, + "step": 4853 + }, + { + "epoch": 0.26040772532188844, + "grad_norm": 1.801844835281372, + "learning_rate": 4.3381634763831624e-06, + "loss": 2.1077, + "step": 4854 + }, + { + "epoch": 0.2604613733905579, + "grad_norm": 5.149079322814941, + "learning_rate": 4.337869020187877e-06, + "loss": 2.2419, + "step": 4855 + }, + { + "epoch": 0.26051502145922745, + "grad_norm": 1.2535032033920288, + "learning_rate": 4.337574508502245e-06, + "loss": 1.6337, + "step": 4856 + }, + { + "epoch": 0.260568669527897, + "grad_norm": 1.5266965627670288, + "learning_rate": 4.337279941335156e-06, + "loss": 2.0453, + "step": 4857 + }, + { + "epoch": 0.2606223175965665, + "grad_norm": 1.3953857421875, + "learning_rate": 4.336985318695505e-06, + "loss": 2.503, + "step": 4858 + }, + { + "epoch": 0.26067596566523604, + "grad_norm": 1.5420866012573242, + "learning_rate": 4.336690640592187e-06, + "loss": 2.1028, + "step": 4859 + }, + { + "epoch": 0.2607296137339056, + "grad_norm": 1.4026782512664795, + "learning_rate": 4.3363959070341e-06, + "loss": 2.2399, + "step": 4860 + }, + { + "epoch": 0.2607832618025751, + "grad_norm": 1.2996913194656372, + "learning_rate": 4.336101118030142e-06, + "loss": 2.5274, + "step": 4861 + }, + { + "epoch": 0.26083690987124464, + "grad_norm": 1.4705771207809448, + "learning_rate": 4.335806273589214e-06, + "loss": 2.3678, + "step": 4862 + }, + { + "epoch": 0.26089055793991417, + "grad_norm": 1.2812087535858154, + "learning_rate": 4.3355113737202185e-06, + "loss": 2.1542, + "step": 4863 + }, + { + "epoch": 0.2609442060085837, + "grad_norm": 1.1715989112854004, + "learning_rate": 4.335216418432058e-06, + "loss": 2.281, + "step": 4864 + }, + { + "epoch": 0.26099785407725323, + "grad_norm": 1.594001054763794, + "learning_rate": 4.334921407733638e-06, + "loss": 2.4147, + "step": 4865 + }, + { + "epoch": 0.26105150214592276, + "grad_norm": 1.1099270582199097, + "learning_rate": 4.3346263416338675e-06, + "loss": 2.2587, + "step": 4866 + }, + { + "epoch": 0.2611051502145923, + "grad_norm": 1.278590440750122, + "learning_rate": 4.334331220141654e-06, + "loss": 2.3601, + "step": 4867 + }, + { + "epoch": 0.26115879828326183, + "grad_norm": 1.6035581827163696, + "learning_rate": 4.334036043265909e-06, + "loss": 2.2898, + "step": 4868 + }, + { + "epoch": 0.2612124463519313, + "grad_norm": 1.3233224153518677, + "learning_rate": 4.3337408110155435e-06, + "loss": 1.9059, + "step": 4869 + }, + { + "epoch": 0.26126609442060084, + "grad_norm": 1.1429449319839478, + "learning_rate": 4.333445523399472e-06, + "loss": 2.0881, + "step": 4870 + }, + { + "epoch": 0.26131974248927037, + "grad_norm": 1.2607967853546143, + "learning_rate": 4.33315018042661e-06, + "loss": 2.4109, + "step": 4871 + }, + { + "epoch": 0.2613733905579399, + "grad_norm": 1.6226083040237427, + "learning_rate": 4.332854782105875e-06, + "loss": 2.3343, + "step": 4872 + }, + { + "epoch": 0.26142703862660943, + "grad_norm": 1.4444100856781006, + "learning_rate": 4.332559328446185e-06, + "loss": 2.1567, + "step": 4873 + }, + { + "epoch": 0.26148068669527896, + "grad_norm": 1.2583414316177368, + "learning_rate": 4.332263819456461e-06, + "loss": 2.2296, + "step": 4874 + }, + { + "epoch": 0.2615343347639485, + "grad_norm": 1.3421982526779175, + "learning_rate": 4.331968255145627e-06, + "loss": 2.4243, + "step": 4875 + }, + { + "epoch": 0.261587982832618, + "grad_norm": 1.3730186223983765, + "learning_rate": 4.3316726355226036e-06, + "loss": 2.3544, + "step": 4876 + }, + { + "epoch": 0.26164163090128756, + "grad_norm": 1.0839687585830688, + "learning_rate": 4.331376960596319e-06, + "loss": 1.9948, + "step": 4877 + }, + { + "epoch": 0.2616952789699571, + "grad_norm": 1.4229644536972046, + "learning_rate": 4.3310812303756996e-06, + "loss": 1.2112, + "step": 4878 + }, + { + "epoch": 0.2617489270386266, + "grad_norm": 1.0040287971496582, + "learning_rate": 4.330785444869674e-06, + "loss": 2.2117, + "step": 4879 + }, + { + "epoch": 0.26180257510729615, + "grad_norm": 1.7786331176757812, + "learning_rate": 4.330489604087173e-06, + "loss": 2.3371, + "step": 4880 + }, + { + "epoch": 0.2618562231759657, + "grad_norm": 1.5912742614746094, + "learning_rate": 4.330193708037129e-06, + "loss": 2.3051, + "step": 4881 + }, + { + "epoch": 0.2619098712446352, + "grad_norm": 1.294698715209961, + "learning_rate": 4.329897756728475e-06, + "loss": 2.3297, + "step": 4882 + }, + { + "epoch": 0.2619635193133047, + "grad_norm": 1.3701165914535522, + "learning_rate": 4.329601750170148e-06, + "loss": 2.2171, + "step": 4883 + }, + { + "epoch": 0.2620171673819742, + "grad_norm": 1.3847661018371582, + "learning_rate": 4.329305688371085e-06, + "loss": 2.3853, + "step": 4884 + }, + { + "epoch": 0.26207081545064376, + "grad_norm": 1.2282342910766602, + "learning_rate": 4.329009571340225e-06, + "loss": 2.1656, + "step": 4885 + }, + { + "epoch": 0.2621244635193133, + "grad_norm": 1.2801765203475952, + "learning_rate": 4.328713399086507e-06, + "loss": 2.4815, + "step": 4886 + }, + { + "epoch": 0.2621781115879828, + "grad_norm": 1.4987655878067017, + "learning_rate": 4.328417171618875e-06, + "loss": 2.1133, + "step": 4887 + }, + { + "epoch": 0.26223175965665235, + "grad_norm": 1.4141063690185547, + "learning_rate": 4.328120888946272e-06, + "loss": 2.3888, + "step": 4888 + }, + { + "epoch": 0.2622854077253219, + "grad_norm": 1.2283570766448975, + "learning_rate": 4.327824551077644e-06, + "loss": 2.2234, + "step": 4889 + }, + { + "epoch": 0.2623390557939914, + "grad_norm": 1.3510206937789917, + "learning_rate": 4.3275281580219385e-06, + "loss": 2.2786, + "step": 4890 + }, + { + "epoch": 0.26239270386266095, + "grad_norm": 1.5663013458251953, + "learning_rate": 4.327231709788104e-06, + "loss": 2.3512, + "step": 4891 + }, + { + "epoch": 0.2624463519313305, + "grad_norm": 1.3485668897628784, + "learning_rate": 4.326935206385091e-06, + "loss": 2.3295, + "step": 4892 + }, + { + "epoch": 0.2625, + "grad_norm": 1.4898134469985962, + "learning_rate": 4.326638647821853e-06, + "loss": 2.3525, + "step": 4893 + }, + { + "epoch": 0.26255364806866954, + "grad_norm": 1.3081623315811157, + "learning_rate": 4.326342034107341e-06, + "loss": 2.4056, + "step": 4894 + }, + { + "epoch": 0.2626072961373391, + "grad_norm": 1.5296578407287598, + "learning_rate": 4.326045365250514e-06, + "loss": 2.2507, + "step": 4895 + }, + { + "epoch": 0.2626609442060086, + "grad_norm": 1.4058868885040283, + "learning_rate": 4.325748641260327e-06, + "loss": 2.49, + "step": 4896 + }, + { + "epoch": 0.26271459227467814, + "grad_norm": 1.4289765357971191, + "learning_rate": 4.32545186214574e-06, + "loss": 2.3249, + "step": 4897 + }, + { + "epoch": 0.2627682403433476, + "grad_norm": 1.4563928842544556, + "learning_rate": 4.325155027915713e-06, + "loss": 2.356, + "step": 4898 + }, + { + "epoch": 0.26282188841201715, + "grad_norm": 1.817145824432373, + "learning_rate": 4.324858138579209e-06, + "loss": 2.1805, + "step": 4899 + }, + { + "epoch": 0.2628755364806867, + "grad_norm": 1.4374120235443115, + "learning_rate": 4.324561194145191e-06, + "loss": 2.155, + "step": 4900 + }, + { + "epoch": 0.2629291845493562, + "grad_norm": 1.4012548923492432, + "learning_rate": 4.324264194622625e-06, + "loss": 2.2936, + "step": 4901 + }, + { + "epoch": 0.26298283261802574, + "grad_norm": 1.2875539064407349, + "learning_rate": 4.323967140020479e-06, + "loss": 2.1496, + "step": 4902 + }, + { + "epoch": 0.2630364806866953, + "grad_norm": 1.320745825767517, + "learning_rate": 4.3236700303477194e-06, + "loss": 2.4174, + "step": 4903 + }, + { + "epoch": 0.2630901287553648, + "grad_norm": 1.3613402843475342, + "learning_rate": 4.32337286561332e-06, + "loss": 2.3793, + "step": 4904 + }, + { + "epoch": 0.26314377682403434, + "grad_norm": 1.2410712242126465, + "learning_rate": 4.323075645826251e-06, + "loss": 2.2341, + "step": 4905 + }, + { + "epoch": 0.26319742489270387, + "grad_norm": 1.3350324630737305, + "learning_rate": 4.322778370995486e-06, + "loss": 1.4241, + "step": 4906 + }, + { + "epoch": 0.2632510729613734, + "grad_norm": 1.5607573986053467, + "learning_rate": 4.3224810411300025e-06, + "loss": 2.1927, + "step": 4907 + }, + { + "epoch": 0.26330472103004293, + "grad_norm": 1.2957297563552856, + "learning_rate": 4.322183656238776e-06, + "loss": 2.2543, + "step": 4908 + }, + { + "epoch": 0.26335836909871246, + "grad_norm": 1.5021214485168457, + "learning_rate": 4.321886216330786e-06, + "loss": 2.3117, + "step": 4909 + }, + { + "epoch": 0.263412017167382, + "grad_norm": 1.5452467203140259, + "learning_rate": 4.3215887214150134e-06, + "loss": 2.6166, + "step": 4910 + }, + { + "epoch": 0.2634656652360515, + "grad_norm": 1.277463436126709, + "learning_rate": 4.321291171500439e-06, + "loss": 2.2833, + "step": 4911 + }, + { + "epoch": 0.263519313304721, + "grad_norm": 1.3175767660140991, + "learning_rate": 4.320993566596048e-06, + "loss": 2.0734, + "step": 4912 + }, + { + "epoch": 0.26357296137339054, + "grad_norm": 2.131577253341675, + "learning_rate": 4.320695906710826e-06, + "loss": 2.3244, + "step": 4913 + }, + { + "epoch": 0.26362660944206007, + "grad_norm": 1.467145323753357, + "learning_rate": 4.320398191853759e-06, + "loss": 2.1019, + "step": 4914 + }, + { + "epoch": 0.2636802575107296, + "grad_norm": 1.3435848951339722, + "learning_rate": 4.320100422033837e-06, + "loss": 2.2832, + "step": 4915 + }, + { + "epoch": 0.26373390557939913, + "grad_norm": 1.356048822402954, + "learning_rate": 4.31980259726005e-06, + "loss": 2.3017, + "step": 4916 + }, + { + "epoch": 0.26378755364806866, + "grad_norm": 1.2676447629928589, + "learning_rate": 4.3195047175413895e-06, + "loss": 2.4564, + "step": 4917 + }, + { + "epoch": 0.2638412017167382, + "grad_norm": 1.3053126335144043, + "learning_rate": 4.319206782886851e-06, + "loss": 2.3608, + "step": 4918 + }, + { + "epoch": 0.2638948497854077, + "grad_norm": 1.6386467218399048, + "learning_rate": 4.318908793305428e-06, + "loss": 2.4616, + "step": 4919 + }, + { + "epoch": 0.26394849785407726, + "grad_norm": 1.3883670568466187, + "learning_rate": 4.318610748806119e-06, + "loss": 2.053, + "step": 4920 + }, + { + "epoch": 0.2640021459227468, + "grad_norm": 1.5829551219940186, + "learning_rate": 4.318312649397922e-06, + "loss": 2.2877, + "step": 4921 + }, + { + "epoch": 0.2640557939914163, + "grad_norm": 1.3653408288955688, + "learning_rate": 4.318014495089839e-06, + "loss": 2.2338, + "step": 4922 + }, + { + "epoch": 0.26410944206008585, + "grad_norm": 1.5707958936691284, + "learning_rate": 4.317716285890869e-06, + "loss": 2.332, + "step": 4923 + }, + { + "epoch": 0.2641630901287554, + "grad_norm": 1.2048958539962769, + "learning_rate": 4.317418021810019e-06, + "loss": 2.1324, + "step": 4924 + }, + { + "epoch": 0.2642167381974249, + "grad_norm": 1.7018972635269165, + "learning_rate": 4.317119702856293e-06, + "loss": 2.1641, + "step": 4925 + }, + { + "epoch": 0.26427038626609445, + "grad_norm": 1.3550385236740112, + "learning_rate": 4.316821329038698e-06, + "loss": 2.3762, + "step": 4926 + }, + { + "epoch": 0.2643240343347639, + "grad_norm": 6.832102298736572, + "learning_rate": 4.316522900366243e-06, + "loss": 2.4016, + "step": 4927 + }, + { + "epoch": 0.26437768240343346, + "grad_norm": 1.1516891717910767, + "learning_rate": 4.316224416847939e-06, + "loss": 2.1816, + "step": 4928 + }, + { + "epoch": 0.264431330472103, + "grad_norm": 1.2342513799667358, + "learning_rate": 4.315925878492797e-06, + "loss": 2.2748, + "step": 4929 + }, + { + "epoch": 0.2644849785407725, + "grad_norm": 1.268660545349121, + "learning_rate": 4.31562728530983e-06, + "loss": 2.4396, + "step": 4930 + }, + { + "epoch": 0.26453862660944205, + "grad_norm": 1.3737273216247559, + "learning_rate": 4.315328637308055e-06, + "loss": 2.5176, + "step": 4931 + }, + { + "epoch": 0.2645922746781116, + "grad_norm": 1.526369571685791, + "learning_rate": 4.315029934496489e-06, + "loss": 2.2788, + "step": 4932 + }, + { + "epoch": 0.2646459227467811, + "grad_norm": 1.3185139894485474, + "learning_rate": 4.314731176884149e-06, + "loss": 2.4294, + "step": 4933 + }, + { + "epoch": 0.26469957081545065, + "grad_norm": 1.7181143760681152, + "learning_rate": 4.314432364480057e-06, + "loss": 2.1363, + "step": 4934 + }, + { + "epoch": 0.2647532188841202, + "grad_norm": 1.3996254205703735, + "learning_rate": 4.314133497293233e-06, + "loss": 2.2663, + "step": 4935 + }, + { + "epoch": 0.2648068669527897, + "grad_norm": 1.4128931760787964, + "learning_rate": 4.313834575332703e-06, + "loss": 2.2088, + "step": 4936 + }, + { + "epoch": 0.26486051502145924, + "grad_norm": 1.2946629524230957, + "learning_rate": 4.313535598607492e-06, + "loss": 2.4011, + "step": 4937 + }, + { + "epoch": 0.2649141630901288, + "grad_norm": 1.3024029731750488, + "learning_rate": 4.313236567126626e-06, + "loss": 2.2391, + "step": 4938 + }, + { + "epoch": 0.2649678111587983, + "grad_norm": 1.2250474691390991, + "learning_rate": 4.312937480899133e-06, + "loss": 2.283, + "step": 4939 + }, + { + "epoch": 0.26502145922746784, + "grad_norm": 1.2232544422149658, + "learning_rate": 4.312638339934045e-06, + "loss": 2.406, + "step": 4940 + }, + { + "epoch": 0.2650751072961373, + "grad_norm": 2.940908908843994, + "learning_rate": 4.312339144240392e-06, + "loss": 1.367, + "step": 4941 + }, + { + "epoch": 0.26512875536480685, + "grad_norm": 1.3842527866363525, + "learning_rate": 4.312039893827209e-06, + "loss": 2.2589, + "step": 4942 + }, + { + "epoch": 0.2651824034334764, + "grad_norm": 1.2307825088500977, + "learning_rate": 4.31174058870353e-06, + "loss": 2.261, + "step": 4943 + }, + { + "epoch": 0.2652360515021459, + "grad_norm": 1.4354695081710815, + "learning_rate": 4.3114412288783935e-06, + "loss": 2.5203, + "step": 4944 + }, + { + "epoch": 0.26528969957081544, + "grad_norm": 1.2472885847091675, + "learning_rate": 4.311141814360836e-06, + "loss": 2.1669, + "step": 4945 + }, + { + "epoch": 0.26534334763948497, + "grad_norm": 1.2918510437011719, + "learning_rate": 4.310842345159899e-06, + "loss": 2.2223, + "step": 4946 + }, + { + "epoch": 0.2653969957081545, + "grad_norm": 1.1528593301773071, + "learning_rate": 4.310542821284625e-06, + "loss": 2.3968, + "step": 4947 + }, + { + "epoch": 0.26545064377682404, + "grad_norm": 1.2348899841308594, + "learning_rate": 4.310243242744055e-06, + "loss": 2.4933, + "step": 4948 + }, + { + "epoch": 0.26550429184549357, + "grad_norm": 1.3857501745224, + "learning_rate": 4.309943609547236e-06, + "loss": 2.1121, + "step": 4949 + }, + { + "epoch": 0.2655579399141631, + "grad_norm": 1.2734688520431519, + "learning_rate": 4.309643921703214e-06, + "loss": 2.1693, + "step": 4950 + }, + { + "epoch": 0.26561158798283263, + "grad_norm": 1.5599111318588257, + "learning_rate": 4.309344179221038e-06, + "loss": 2.3264, + "step": 4951 + }, + { + "epoch": 0.26566523605150216, + "grad_norm": 1.0367002487182617, + "learning_rate": 4.309044382109757e-06, + "loss": 2.2652, + "step": 4952 + }, + { + "epoch": 0.2657188841201717, + "grad_norm": 1.4513957500457764, + "learning_rate": 4.308744530378425e-06, + "loss": 2.4025, + "step": 4953 + }, + { + "epoch": 0.2657725321888412, + "grad_norm": 1.3689954280853271, + "learning_rate": 4.308444624036092e-06, + "loss": 2.2069, + "step": 4954 + }, + { + "epoch": 0.2658261802575107, + "grad_norm": 1.2546120882034302, + "learning_rate": 4.308144663091816e-06, + "loss": 2.1249, + "step": 4955 + }, + { + "epoch": 0.26587982832618023, + "grad_norm": 1.6062411069869995, + "learning_rate": 4.307844647554652e-06, + "loss": 2.3476, + "step": 4956 + }, + { + "epoch": 0.26593347639484977, + "grad_norm": 1.5228404998779297, + "learning_rate": 4.30754457743366e-06, + "loss": 2.3796, + "step": 4957 + }, + { + "epoch": 0.2659871244635193, + "grad_norm": 1.2062166929244995, + "learning_rate": 4.307244452737897e-06, + "loss": 2.2948, + "step": 4958 + }, + { + "epoch": 0.26604077253218883, + "grad_norm": 1.3289083242416382, + "learning_rate": 4.306944273476427e-06, + "loss": 2.5736, + "step": 4959 + }, + { + "epoch": 0.26609442060085836, + "grad_norm": 1.5624699592590332, + "learning_rate": 4.306644039658312e-06, + "loss": 2.4482, + "step": 4960 + }, + { + "epoch": 0.2661480686695279, + "grad_norm": 1.241837739944458, + "learning_rate": 4.3063437512926174e-06, + "loss": 1.801, + "step": 4961 + }, + { + "epoch": 0.2662017167381974, + "grad_norm": 2.51621675491333, + "learning_rate": 4.30604340838841e-06, + "loss": 2.3302, + "step": 4962 + }, + { + "epoch": 0.26625536480686696, + "grad_norm": 1.3444901704788208, + "learning_rate": 4.305743010954758e-06, + "loss": 2.2618, + "step": 4963 + }, + { + "epoch": 0.2663090128755365, + "grad_norm": 1.248565435409546, + "learning_rate": 4.305442559000731e-06, + "loss": 2.264, + "step": 4964 + }, + { + "epoch": 0.266362660944206, + "grad_norm": 1.2382774353027344, + "learning_rate": 4.3051420525354e-06, + "loss": 2.1906, + "step": 4965 + }, + { + "epoch": 0.26641630901287555, + "grad_norm": 1.802303671836853, + "learning_rate": 4.304841491567839e-06, + "loss": 2.3677, + "step": 4966 + }, + { + "epoch": 0.2664699570815451, + "grad_norm": 1.4669591188430786, + "learning_rate": 4.304540876107121e-06, + "loss": 2.3834, + "step": 4967 + }, + { + "epoch": 0.2665236051502146, + "grad_norm": 1.670754313468933, + "learning_rate": 4.304240206162326e-06, + "loss": 2.2617, + "step": 4968 + }, + { + "epoch": 0.26657725321888415, + "grad_norm": 1.2535589933395386, + "learning_rate": 4.3039394817425285e-06, + "loss": 2.0165, + "step": 4969 + }, + { + "epoch": 0.2666309012875536, + "grad_norm": 1.2884461879730225, + "learning_rate": 4.303638702856809e-06, + "loss": 2.2083, + "step": 4970 + }, + { + "epoch": 0.26668454935622316, + "grad_norm": 1.2634315490722656, + "learning_rate": 4.303337869514251e-06, + "loss": 2.5143, + "step": 4971 + }, + { + "epoch": 0.2667381974248927, + "grad_norm": 1.457110047340393, + "learning_rate": 4.303036981723934e-06, + "loss": 2.2281, + "step": 4972 + }, + { + "epoch": 0.2667918454935622, + "grad_norm": 2.2297818660736084, + "learning_rate": 4.302736039494945e-06, + "loss": 2.2218, + "step": 4973 + }, + { + "epoch": 0.26684549356223175, + "grad_norm": 0.9956889152526855, + "learning_rate": 4.30243504283637e-06, + "loss": 1.9755, + "step": 4974 + }, + { + "epoch": 0.2668991416309013, + "grad_norm": 1.256490707397461, + "learning_rate": 4.3021339917572975e-06, + "loss": 2.3296, + "step": 4975 + }, + { + "epoch": 0.2669527896995708, + "grad_norm": 1.417594075202942, + "learning_rate": 4.3018328862668155e-06, + "loss": 2.4423, + "step": 4976 + }, + { + "epoch": 0.26700643776824035, + "grad_norm": 1.1783504486083984, + "learning_rate": 4.301531726374015e-06, + "loss": 2.3927, + "step": 4977 + }, + { + "epoch": 0.2670600858369099, + "grad_norm": 2.6332640647888184, + "learning_rate": 4.301230512087992e-06, + "loss": 2.2166, + "step": 4978 + }, + { + "epoch": 0.2671137339055794, + "grad_norm": 1.3823796510696411, + "learning_rate": 4.300929243417837e-06, + "loss": 2.4813, + "step": 4979 + }, + { + "epoch": 0.26716738197424894, + "grad_norm": 1.3533602952957153, + "learning_rate": 4.300627920372648e-06, + "loss": 2.1912, + "step": 4980 + }, + { + "epoch": 0.2672210300429185, + "grad_norm": 1.3954432010650635, + "learning_rate": 4.300326542961523e-06, + "loss": 2.3207, + "step": 4981 + }, + { + "epoch": 0.267274678111588, + "grad_norm": 1.5588010549545288, + "learning_rate": 4.300025111193561e-06, + "loss": 2.2725, + "step": 4982 + }, + { + "epoch": 0.26732832618025754, + "grad_norm": 1.4824800491333008, + "learning_rate": 4.299723625077864e-06, + "loss": 2.0438, + "step": 4983 + }, + { + "epoch": 0.267381974248927, + "grad_norm": 1.352290153503418, + "learning_rate": 4.299422084623532e-06, + "loss": 2.2173, + "step": 4984 + }, + { + "epoch": 0.26743562231759654, + "grad_norm": 1.3440053462982178, + "learning_rate": 4.299120489839673e-06, + "loss": 2.3153, + "step": 4985 + }, + { + "epoch": 0.2674892703862661, + "grad_norm": 1.2835479974746704, + "learning_rate": 4.298818840735391e-06, + "loss": 2.4139, + "step": 4986 + }, + { + "epoch": 0.2675429184549356, + "grad_norm": 1.3719205856323242, + "learning_rate": 4.298517137319793e-06, + "loss": 2.1684, + "step": 4987 + }, + { + "epoch": 0.26759656652360514, + "grad_norm": 1.3065173625946045, + "learning_rate": 4.2982153796019895e-06, + "loss": 2.5497, + "step": 4988 + }, + { + "epoch": 0.26765021459227467, + "grad_norm": 1.5087333917617798, + "learning_rate": 4.297913567591091e-06, + "loss": 2.4513, + "step": 4989 + }, + { + "epoch": 0.2677038626609442, + "grad_norm": 1.418238878250122, + "learning_rate": 4.29761170129621e-06, + "loss": 2.1645, + "step": 4990 + }, + { + "epoch": 0.26775751072961373, + "grad_norm": 3.0693984031677246, + "learning_rate": 4.29730978072646e-06, + "loss": 2.5868, + "step": 4991 + }, + { + "epoch": 0.26781115879828327, + "grad_norm": 1.3223739862442017, + "learning_rate": 4.297007805890958e-06, + "loss": 2.3448, + "step": 4992 + }, + { + "epoch": 0.2678648068669528, + "grad_norm": 1.527571201324463, + "learning_rate": 4.296705776798822e-06, + "loss": 2.5465, + "step": 4993 + }, + { + "epoch": 0.26791845493562233, + "grad_norm": 1.3328962326049805, + "learning_rate": 4.296403693459169e-06, + "loss": 2.2605, + "step": 4994 + }, + { + "epoch": 0.26797210300429186, + "grad_norm": 1.3360021114349365, + "learning_rate": 4.29610155588112e-06, + "loss": 2.3782, + "step": 4995 + }, + { + "epoch": 0.2680257510729614, + "grad_norm": 1.4117408990859985, + "learning_rate": 4.2957993640738e-06, + "loss": 2.3566, + "step": 4996 + }, + { + "epoch": 0.2680793991416309, + "grad_norm": 1.9808634519577026, + "learning_rate": 4.29549711804633e-06, + "loss": 1.7897, + "step": 4997 + }, + { + "epoch": 0.2681330472103004, + "grad_norm": 1.3581477403640747, + "learning_rate": 4.295194817807837e-06, + "loss": 2.1979, + "step": 4998 + }, + { + "epoch": 0.26818669527896993, + "grad_norm": 1.3385461568832397, + "learning_rate": 4.294892463367448e-06, + "loss": 2.2826, + "step": 4999 + }, + { + "epoch": 0.26824034334763946, + "grad_norm": 1.4239351749420166, + "learning_rate": 4.294590054734292e-06, + "loss": 2.4289, + "step": 5000 + }, + { + "epoch": 0.268293991416309, + "grad_norm": 1.520639181137085, + "learning_rate": 4.294287591917499e-06, + "loss": 2.3832, + "step": 5001 + }, + { + "epoch": 0.26834763948497853, + "grad_norm": 1.3181447982788086, + "learning_rate": 4.293985074926202e-06, + "loss": 2.0431, + "step": 5002 + }, + { + "epoch": 0.26840128755364806, + "grad_norm": 1.2739182710647583, + "learning_rate": 4.293682503769535e-06, + "loss": 2.2145, + "step": 5003 + }, + { + "epoch": 0.2684549356223176, + "grad_norm": 1.3622190952301025, + "learning_rate": 4.293379878456634e-06, + "loss": 2.0938, + "step": 5004 + }, + { + "epoch": 0.2685085836909871, + "grad_norm": 1.2822149991989136, + "learning_rate": 4.2930771989966335e-06, + "loss": 2.2214, + "step": 5005 + }, + { + "epoch": 0.26856223175965666, + "grad_norm": 1.3744333982467651, + "learning_rate": 4.2927744653986745e-06, + "loss": 2.2385, + "step": 5006 + }, + { + "epoch": 0.2686158798283262, + "grad_norm": 1.2973759174346924, + "learning_rate": 4.292471677671897e-06, + "loss": 1.5525, + "step": 5007 + }, + { + "epoch": 0.2686695278969957, + "grad_norm": 1.5829969644546509, + "learning_rate": 4.292168835825442e-06, + "loss": 2.2589, + "step": 5008 + }, + { + "epoch": 0.26872317596566525, + "grad_norm": 1.2086747884750366, + "learning_rate": 4.291865939868454e-06, + "loss": 2.0988, + "step": 5009 + }, + { + "epoch": 0.2687768240343348, + "grad_norm": 2.0807058811187744, + "learning_rate": 4.291562989810079e-06, + "loss": 2.2114, + "step": 5010 + }, + { + "epoch": 0.2688304721030043, + "grad_norm": 1.4548686742782593, + "learning_rate": 4.291259985659462e-06, + "loss": 1.9167, + "step": 5011 + }, + { + "epoch": 0.26888412017167385, + "grad_norm": 1.3382760286331177, + "learning_rate": 4.290956927425753e-06, + "loss": 2.3412, + "step": 5012 + }, + { + "epoch": 0.2689377682403433, + "grad_norm": 1.5678690671920776, + "learning_rate": 4.290653815118101e-06, + "loss": 2.3155, + "step": 5013 + }, + { + "epoch": 0.26899141630901285, + "grad_norm": 1.2697694301605225, + "learning_rate": 4.290350648745659e-06, + "loss": 2.0214, + "step": 5014 + }, + { + "epoch": 0.2690450643776824, + "grad_norm": 1.3039318323135376, + "learning_rate": 4.290047428317581e-06, + "loss": 2.233, + "step": 5015 + }, + { + "epoch": 0.2690987124463519, + "grad_norm": 1.314761757850647, + "learning_rate": 4.28974415384302e-06, + "loss": 2.3192, + "step": 5016 + }, + { + "epoch": 0.26915236051502145, + "grad_norm": 1.351046085357666, + "learning_rate": 4.289440825331133e-06, + "loss": 2.3464, + "step": 5017 + }, + { + "epoch": 0.269206008583691, + "grad_norm": 1.1469202041625977, + "learning_rate": 4.28913744279108e-06, + "loss": 2.1461, + "step": 5018 + }, + { + "epoch": 0.2692596566523605, + "grad_norm": 2.1836276054382324, + "learning_rate": 4.28883400623202e-06, + "loss": 2.1814, + "step": 5019 + }, + { + "epoch": 0.26931330472103004, + "grad_norm": 1.353124976158142, + "learning_rate": 4.288530515663114e-06, + "loss": 2.0215, + "step": 5020 + }, + { + "epoch": 0.2693669527896996, + "grad_norm": 1.345713496208191, + "learning_rate": 4.288226971093526e-06, + "loss": 2.2631, + "step": 5021 + }, + { + "epoch": 0.2694206008583691, + "grad_norm": 1.3977351188659668, + "learning_rate": 4.287923372532421e-06, + "loss": 2.4116, + "step": 5022 + }, + { + "epoch": 0.26947424892703864, + "grad_norm": 1.544020175933838, + "learning_rate": 4.287619719988965e-06, + "loss": 2.5256, + "step": 5023 + }, + { + "epoch": 0.26952789699570817, + "grad_norm": 1.587257981300354, + "learning_rate": 4.287316013472326e-06, + "loss": 2.2676, + "step": 5024 + }, + { + "epoch": 0.2695815450643777, + "grad_norm": 1.2156556844711304, + "learning_rate": 4.287012252991674e-06, + "loss": 2.427, + "step": 5025 + }, + { + "epoch": 0.26963519313304724, + "grad_norm": 1.4727579355239868, + "learning_rate": 4.28670843855618e-06, + "loss": 2.558, + "step": 5026 + }, + { + "epoch": 0.2696888412017167, + "grad_norm": 1.4053401947021484, + "learning_rate": 4.286404570175018e-06, + "loss": 2.2256, + "step": 5027 + }, + { + "epoch": 0.26974248927038624, + "grad_norm": 1.5847079753875732, + "learning_rate": 4.2861006478573615e-06, + "loss": 2.3742, + "step": 5028 + }, + { + "epoch": 0.2697961373390558, + "grad_norm": 1.9975436925888062, + "learning_rate": 4.285796671612388e-06, + "loss": 2.2896, + "step": 5029 + }, + { + "epoch": 0.2698497854077253, + "grad_norm": 1.4838610887527466, + "learning_rate": 4.285492641449274e-06, + "loss": 2.2441, + "step": 5030 + }, + { + "epoch": 0.26990343347639484, + "grad_norm": 1.2856327295303345, + "learning_rate": 4.285188557377199e-06, + "loss": 2.3861, + "step": 5031 + }, + { + "epoch": 0.26995708154506437, + "grad_norm": 1.2092539072036743, + "learning_rate": 4.284884419405346e-06, + "loss": 2.3101, + "step": 5032 + }, + { + "epoch": 0.2700107296137339, + "grad_norm": 1.4489800930023193, + "learning_rate": 4.2845802275428964e-06, + "loss": 2.0137, + "step": 5033 + }, + { + "epoch": 0.27006437768240343, + "grad_norm": 1.3279945850372314, + "learning_rate": 4.284275981799033e-06, + "loss": 2.4818, + "step": 5034 + }, + { + "epoch": 0.27011802575107297, + "grad_norm": 1.4883575439453125, + "learning_rate": 4.283971682182945e-06, + "loss": 2.3936, + "step": 5035 + }, + { + "epoch": 0.2701716738197425, + "grad_norm": 1.171967625617981, + "learning_rate": 4.2836673287038185e-06, + "loss": 2.198, + "step": 5036 + }, + { + "epoch": 0.27022532188841203, + "grad_norm": 1.435179591178894, + "learning_rate": 4.283362921370843e-06, + "loss": 2.3111, + "step": 5037 + }, + { + "epoch": 0.27027896995708156, + "grad_norm": 1.2800894975662231, + "learning_rate": 4.283058460193209e-06, + "loss": 2.481, + "step": 5038 + }, + { + "epoch": 0.2703326180257511, + "grad_norm": 2.8953981399536133, + "learning_rate": 4.282753945180109e-06, + "loss": 2.1909, + "step": 5039 + }, + { + "epoch": 0.2703862660944206, + "grad_norm": 1.5334526300430298, + "learning_rate": 4.282449376340738e-06, + "loss": 2.2883, + "step": 5040 + }, + { + "epoch": 0.27043991416309016, + "grad_norm": 1.5062311887741089, + "learning_rate": 4.282144753684291e-06, + "loss": 2.4197, + "step": 5041 + }, + { + "epoch": 0.27049356223175963, + "grad_norm": 1.4818061590194702, + "learning_rate": 4.281840077219965e-06, + "loss": 2.3177, + "step": 5042 + }, + { + "epoch": 0.27054721030042916, + "grad_norm": 1.3199596405029297, + "learning_rate": 4.2815353469569606e-06, + "loss": 2.3908, + "step": 5043 + }, + { + "epoch": 0.2706008583690987, + "grad_norm": 1.455367922782898, + "learning_rate": 4.281230562904477e-06, + "loss": 2.1342, + "step": 5044 + }, + { + "epoch": 0.2706545064377682, + "grad_norm": 1.3164163827896118, + "learning_rate": 4.280925725071717e-06, + "loss": 1.8652, + "step": 5045 + }, + { + "epoch": 0.27070815450643776, + "grad_norm": 1.4694806337356567, + "learning_rate": 4.280620833467884e-06, + "loss": 2.6928, + "step": 5046 + }, + { + "epoch": 0.2707618025751073, + "grad_norm": 1.1250041723251343, + "learning_rate": 4.280315888102185e-06, + "loss": 2.112, + "step": 5047 + }, + { + "epoch": 0.2708154506437768, + "grad_norm": 1.379387378692627, + "learning_rate": 4.280010888983825e-06, + "loss": 2.5605, + "step": 5048 + }, + { + "epoch": 0.27086909871244635, + "grad_norm": 1.2103480100631714, + "learning_rate": 4.2797058361220145e-06, + "loss": 2.0291, + "step": 5049 + }, + { + "epoch": 0.2709227467811159, + "grad_norm": 1.1501247882843018, + "learning_rate": 4.279400729525963e-06, + "loss": 2.2633, + "step": 5050 + }, + { + "epoch": 0.2709763948497854, + "grad_norm": 1.0144981145858765, + "learning_rate": 4.279095569204883e-06, + "loss": 2.0668, + "step": 5051 + }, + { + "epoch": 0.27103004291845495, + "grad_norm": 1.4243783950805664, + "learning_rate": 4.278790355167988e-06, + "loss": 2.2037, + "step": 5052 + }, + { + "epoch": 0.2710836909871245, + "grad_norm": 1.429612636566162, + "learning_rate": 4.278485087424492e-06, + "loss": 2.3269, + "step": 5053 + }, + { + "epoch": 0.271137339055794, + "grad_norm": 1.616772174835205, + "learning_rate": 4.278179765983615e-06, + "loss": 2.298, + "step": 5054 + }, + { + "epoch": 0.27119098712446355, + "grad_norm": 4.538635730743408, + "learning_rate": 4.277874390854572e-06, + "loss": 1.8707, + "step": 5055 + }, + { + "epoch": 0.271244635193133, + "grad_norm": 1.6581392288208008, + "learning_rate": 4.277568962046585e-06, + "loss": 2.189, + "step": 5056 + }, + { + "epoch": 0.27129828326180255, + "grad_norm": 1.2346731424331665, + "learning_rate": 4.2772634795688765e-06, + "loss": 2.1664, + "step": 5057 + }, + { + "epoch": 0.2713519313304721, + "grad_norm": 1.1846965551376343, + "learning_rate": 4.2769579434306674e-06, + "loss": 2.4093, + "step": 5058 + }, + { + "epoch": 0.2714055793991416, + "grad_norm": 1.3121274709701538, + "learning_rate": 4.276652353641185e-06, + "loss": 2.0601, + "step": 5059 + }, + { + "epoch": 0.27145922746781115, + "grad_norm": 1.4552000761032104, + "learning_rate": 4.2763467102096545e-06, + "loss": 2.1054, + "step": 5060 + }, + { + "epoch": 0.2715128755364807, + "grad_norm": 1.27945876121521, + "learning_rate": 4.276041013145305e-06, + "loss": 2.4143, + "step": 5061 + }, + { + "epoch": 0.2715665236051502, + "grad_norm": 1.5154036283493042, + "learning_rate": 4.275735262457365e-06, + "loss": 2.3033, + "step": 5062 + }, + { + "epoch": 0.27162017167381974, + "grad_norm": 1.378668189048767, + "learning_rate": 4.275429458155068e-06, + "loss": 2.4515, + "step": 5063 + }, + { + "epoch": 0.2716738197424893, + "grad_norm": 1.4732182025909424, + "learning_rate": 4.2751236002476445e-06, + "loss": 2.3508, + "step": 5064 + }, + { + "epoch": 0.2717274678111588, + "grad_norm": 1.4523944854736328, + "learning_rate": 4.274817688744332e-06, + "loss": 2.1809, + "step": 5065 + }, + { + "epoch": 0.27178111587982834, + "grad_norm": 1.3190053701400757, + "learning_rate": 4.274511723654365e-06, + "loss": 2.285, + "step": 5066 + }, + { + "epoch": 0.27183476394849787, + "grad_norm": 1.5244964361190796, + "learning_rate": 4.2742057049869815e-06, + "loss": 2.3753, + "step": 5067 + }, + { + "epoch": 0.2718884120171674, + "grad_norm": 1.3714141845703125, + "learning_rate": 4.273899632751422e-06, + "loss": 2.3331, + "step": 5068 + }, + { + "epoch": 0.27194206008583693, + "grad_norm": 1.3573110103607178, + "learning_rate": 4.273593506956927e-06, + "loss": 2.2788, + "step": 5069 + }, + { + "epoch": 0.2719957081545064, + "grad_norm": 1.4939484596252441, + "learning_rate": 4.273287327612739e-06, + "loss": 2.4674, + "step": 5070 + }, + { + "epoch": 0.27204935622317594, + "grad_norm": 1.6218711137771606, + "learning_rate": 4.272981094728103e-06, + "loss": 2.1943, + "step": 5071 + }, + { + "epoch": 0.2721030042918455, + "grad_norm": 1.68217933177948, + "learning_rate": 4.2726748083122645e-06, + "loss": 2.2304, + "step": 5072 + }, + { + "epoch": 0.272156652360515, + "grad_norm": 1.9088689088821411, + "learning_rate": 4.272368468374473e-06, + "loss": 2.2055, + "step": 5073 + }, + { + "epoch": 0.27221030042918454, + "grad_norm": 1.1622698307037354, + "learning_rate": 4.2720620749239745e-06, + "loss": 2.0815, + "step": 5074 + }, + { + "epoch": 0.27226394849785407, + "grad_norm": 1.3695124387741089, + "learning_rate": 4.2717556279700225e-06, + "loss": 2.3711, + "step": 5075 + }, + { + "epoch": 0.2723175965665236, + "grad_norm": 1.3187259435653687, + "learning_rate": 4.2714491275218675e-06, + "loss": 2.3433, + "step": 5076 + }, + { + "epoch": 0.27237124463519313, + "grad_norm": 1.5475512742996216, + "learning_rate": 4.271142573588766e-06, + "loss": 2.1077, + "step": 5077 + }, + { + "epoch": 0.27242489270386266, + "grad_norm": 1.3945552110671997, + "learning_rate": 4.270835966179972e-06, + "loss": 2.5996, + "step": 5078 + }, + { + "epoch": 0.2724785407725322, + "grad_norm": 1.7153942584991455, + "learning_rate": 4.270529305304741e-06, + "loss": 2.3499, + "step": 5079 + }, + { + "epoch": 0.27253218884120173, + "grad_norm": 1.2384716272354126, + "learning_rate": 4.2702225909723364e-06, + "loss": 2.1649, + "step": 5080 + }, + { + "epoch": 0.27258583690987126, + "grad_norm": 1.304511547088623, + "learning_rate": 4.269915823192016e-06, + "loss": 2.3514, + "step": 5081 + }, + { + "epoch": 0.2726394849785408, + "grad_norm": 1.1688787937164307, + "learning_rate": 4.269609001973043e-06, + "loss": 2.3012, + "step": 5082 + }, + { + "epoch": 0.2726931330472103, + "grad_norm": 1.440639615058899, + "learning_rate": 4.26930212732468e-06, + "loss": 2.3446, + "step": 5083 + }, + { + "epoch": 0.27274678111587985, + "grad_norm": 1.2931896448135376, + "learning_rate": 4.268995199256193e-06, + "loss": 1.9701, + "step": 5084 + }, + { + "epoch": 0.27280042918454933, + "grad_norm": 1.1630518436431885, + "learning_rate": 4.268688217776849e-06, + "loss": 2.4506, + "step": 5085 + }, + { + "epoch": 0.27285407725321886, + "grad_norm": 1.4432328939437866, + "learning_rate": 4.268381182895916e-06, + "loss": 2.2178, + "step": 5086 + }, + { + "epoch": 0.2729077253218884, + "grad_norm": 3.370286226272583, + "learning_rate": 4.268074094622666e-06, + "loss": 2.6975, + "step": 5087 + }, + { + "epoch": 0.2729613733905579, + "grad_norm": 1.330358862876892, + "learning_rate": 4.267766952966369e-06, + "loss": 2.2636, + "step": 5088 + }, + { + "epoch": 0.27301502145922746, + "grad_norm": 2.4076426029205322, + "learning_rate": 4.2674597579363e-06, + "loss": 2.362, + "step": 5089 + }, + { + "epoch": 0.273068669527897, + "grad_norm": 1.1698631048202515, + "learning_rate": 4.267152509541732e-06, + "loss": 2.2228, + "step": 5090 + }, + { + "epoch": 0.2731223175965665, + "grad_norm": 1.3573989868164062, + "learning_rate": 4.266845207791944e-06, + "loss": 2.2644, + "step": 5091 + }, + { + "epoch": 0.27317596566523605, + "grad_norm": 1.3935761451721191, + "learning_rate": 4.266537852696213e-06, + "loss": 2.5705, + "step": 5092 + }, + { + "epoch": 0.2732296137339056, + "grad_norm": 1.5292274951934814, + "learning_rate": 4.266230444263818e-06, + "loss": 2.3717, + "step": 5093 + }, + { + "epoch": 0.2732832618025751, + "grad_norm": 1.39475679397583, + "learning_rate": 4.265922982504043e-06, + "loss": 2.3837, + "step": 5094 + }, + { + "epoch": 0.27333690987124465, + "grad_norm": 1.8508994579315186, + "learning_rate": 4.26561546742617e-06, + "loss": 2.4516, + "step": 5095 + }, + { + "epoch": 0.2733905579399142, + "grad_norm": 1.290399193763733, + "learning_rate": 4.265307899039482e-06, + "loss": 2.2106, + "step": 5096 + }, + { + "epoch": 0.2734442060085837, + "grad_norm": 1.650404691696167, + "learning_rate": 4.265000277353268e-06, + "loss": 2.2012, + "step": 5097 + }, + { + "epoch": 0.27349785407725324, + "grad_norm": 1.3637722730636597, + "learning_rate": 4.264692602376814e-06, + "loss": 2.2205, + "step": 5098 + }, + { + "epoch": 0.2735515021459227, + "grad_norm": 1.1829214096069336, + "learning_rate": 4.264384874119411e-06, + "loss": 2.1888, + "step": 5099 + }, + { + "epoch": 0.27360515021459225, + "grad_norm": 1.4340940713882446, + "learning_rate": 4.264077092590349e-06, + "loss": 2.3793, + "step": 5100 + }, + { + "epoch": 0.2736587982832618, + "grad_norm": 1.3910068273544312, + "learning_rate": 4.263769257798921e-06, + "loss": 2.2972, + "step": 5101 + }, + { + "epoch": 0.2737124463519313, + "grad_norm": 1.41350257396698, + "learning_rate": 4.263461369754422e-06, + "loss": 2.3141, + "step": 5102 + }, + { + "epoch": 0.27376609442060085, + "grad_norm": 1.2730504274368286, + "learning_rate": 4.263153428466147e-06, + "loss": 2.2056, + "step": 5103 + }, + { + "epoch": 0.2738197424892704, + "grad_norm": 1.2497092485427856, + "learning_rate": 4.262845433943396e-06, + "loss": 2.3358, + "step": 5104 + }, + { + "epoch": 0.2738733905579399, + "grad_norm": 1.3817212581634521, + "learning_rate": 4.262537386195464e-06, + "loss": 2.3971, + "step": 5105 + }, + { + "epoch": 0.27392703862660944, + "grad_norm": 1.5090575218200684, + "learning_rate": 4.262229285231656e-06, + "loss": 2.2748, + "step": 5106 + }, + { + "epoch": 0.273980686695279, + "grad_norm": 2.461193084716797, + "learning_rate": 4.261921131061272e-06, + "loss": 2.2437, + "step": 5107 + }, + { + "epoch": 0.2740343347639485, + "grad_norm": 1.3447057008743286, + "learning_rate": 4.261612923693617e-06, + "loss": 2.0417, + "step": 5108 + }, + { + "epoch": 0.27408798283261804, + "grad_norm": 1.2058742046356201, + "learning_rate": 4.261304663137996e-06, + "loss": 2.1474, + "step": 5109 + }, + { + "epoch": 0.27414163090128757, + "grad_norm": 1.2519980669021606, + "learning_rate": 4.260996349403717e-06, + "loss": 2.089, + "step": 5110 + }, + { + "epoch": 0.2741952789699571, + "grad_norm": 1.1811704635620117, + "learning_rate": 4.260687982500088e-06, + "loss": 1.9808, + "step": 5111 + }, + { + "epoch": 0.27424892703862663, + "grad_norm": 1.392079472541809, + "learning_rate": 4.26037956243642e-06, + "loss": 2.2366, + "step": 5112 + }, + { + "epoch": 0.27430257510729616, + "grad_norm": 2.0677363872528076, + "learning_rate": 4.260071089222026e-06, + "loss": 2.2034, + "step": 5113 + }, + { + "epoch": 0.27435622317596564, + "grad_norm": 1.342209815979004, + "learning_rate": 4.259762562866216e-06, + "loss": 2.5117, + "step": 5114 + }, + { + "epoch": 0.2744098712446352, + "grad_norm": 1.2299880981445312, + "learning_rate": 4.2594539833783096e-06, + "loss": 2.0789, + "step": 5115 + }, + { + "epoch": 0.2744635193133047, + "grad_norm": 1.32047700881958, + "learning_rate": 4.259145350767622e-06, + "loss": 2.4119, + "step": 5116 + }, + { + "epoch": 0.27451716738197424, + "grad_norm": 1.419252634048462, + "learning_rate": 4.2588366650434715e-06, + "loss": 2.1327, + "step": 5117 + }, + { + "epoch": 0.27457081545064377, + "grad_norm": 1.4397735595703125, + "learning_rate": 4.2585279262151775e-06, + "loss": 2.3717, + "step": 5118 + }, + { + "epoch": 0.2746244635193133, + "grad_norm": 1.302578091621399, + "learning_rate": 4.258219134292064e-06, + "loss": 2.0624, + "step": 5119 + }, + { + "epoch": 0.27467811158798283, + "grad_norm": 1.517033576965332, + "learning_rate": 4.257910289283451e-06, + "loss": 2.3818, + "step": 5120 + }, + { + "epoch": 0.27473175965665236, + "grad_norm": 1.2350488901138306, + "learning_rate": 4.257601391198666e-06, + "loss": 2.2122, + "step": 5121 + }, + { + "epoch": 0.2747854077253219, + "grad_norm": 1.6916656494140625, + "learning_rate": 4.257292440047035e-06, + "loss": 2.2313, + "step": 5122 + }, + { + "epoch": 0.2748390557939914, + "grad_norm": 1.1269254684448242, + "learning_rate": 4.256983435837884e-06, + "loss": 2.1087, + "step": 5123 + }, + { + "epoch": 0.27489270386266096, + "grad_norm": 1.3345727920532227, + "learning_rate": 4.256674378580546e-06, + "loss": 2.1793, + "step": 5124 + }, + { + "epoch": 0.2749463519313305, + "grad_norm": 1.3503037691116333, + "learning_rate": 4.25636526828435e-06, + "loss": 2.2907, + "step": 5125 + }, + { + "epoch": 0.275, + "grad_norm": 1.3210887908935547, + "learning_rate": 4.25605610495863e-06, + "loss": 2.2399, + "step": 5126 + }, + { + "epoch": 0.27505364806866955, + "grad_norm": 1.5001415014266968, + "learning_rate": 4.2557468886127195e-06, + "loss": 2.3589, + "step": 5127 + }, + { + "epoch": 0.27510729613733903, + "grad_norm": 1.2709132432937622, + "learning_rate": 4.255437619255956e-06, + "loss": 2.3611, + "step": 5128 + }, + { + "epoch": 0.27516094420600856, + "grad_norm": 1.336848497390747, + "learning_rate": 4.255128296897675e-06, + "loss": 2.3212, + "step": 5129 + }, + { + "epoch": 0.2752145922746781, + "grad_norm": 1.238572597503662, + "learning_rate": 4.254818921547218e-06, + "loss": 2.2065, + "step": 5130 + }, + { + "epoch": 0.2752682403433476, + "grad_norm": 1.2732969522476196, + "learning_rate": 4.254509493213925e-06, + "loss": 2.06, + "step": 5131 + }, + { + "epoch": 0.27532188841201716, + "grad_norm": 1.3019180297851562, + "learning_rate": 4.254200011907138e-06, + "loss": 2.2509, + "step": 5132 + }, + { + "epoch": 0.2753755364806867, + "grad_norm": 1.3336728811264038, + "learning_rate": 4.253890477636202e-06, + "loss": 2.0353, + "step": 5133 + }, + { + "epoch": 0.2754291845493562, + "grad_norm": 1.1933029890060425, + "learning_rate": 4.253580890410462e-06, + "loss": 2.1334, + "step": 5134 + }, + { + "epoch": 0.27548283261802575, + "grad_norm": 1.5035765171051025, + "learning_rate": 4.2532712502392674e-06, + "loss": 2.4081, + "step": 5135 + }, + { + "epoch": 0.2755364806866953, + "grad_norm": 1.0912796258926392, + "learning_rate": 4.2529615571319635e-06, + "loss": 2.3314, + "step": 5136 + }, + { + "epoch": 0.2755901287553648, + "grad_norm": 1.7212821245193481, + "learning_rate": 4.252651811097903e-06, + "loss": 1.748, + "step": 5137 + }, + { + "epoch": 0.27564377682403435, + "grad_norm": 1.5574736595153809, + "learning_rate": 4.252342012146438e-06, + "loss": 2.3123, + "step": 5138 + }, + { + "epoch": 0.2756974248927039, + "grad_norm": 1.2778644561767578, + "learning_rate": 4.252032160286921e-06, + "loss": 2.4569, + "step": 5139 + }, + { + "epoch": 0.2757510729613734, + "grad_norm": 1.5819686651229858, + "learning_rate": 4.251722255528709e-06, + "loss": 2.3121, + "step": 5140 + }, + { + "epoch": 0.27580472103004294, + "grad_norm": 1.4555691480636597, + "learning_rate": 4.251412297881157e-06, + "loss": 2.1226, + "step": 5141 + }, + { + "epoch": 0.2758583690987124, + "grad_norm": 1.233445167541504, + "learning_rate": 4.2511022873536254e-06, + "loss": 2.1901, + "step": 5142 + }, + { + "epoch": 0.27591201716738195, + "grad_norm": 1.24208664894104, + "learning_rate": 4.250792223955472e-06, + "loss": 2.0571, + "step": 5143 + }, + { + "epoch": 0.2759656652360515, + "grad_norm": 1.2599092721939087, + "learning_rate": 4.250482107696061e-06, + "loss": 2.3582, + "step": 5144 + }, + { + "epoch": 0.276019313304721, + "grad_norm": 1.55085289478302, + "learning_rate": 4.250171938584754e-06, + "loss": 2.334, + "step": 5145 + }, + { + "epoch": 0.27607296137339055, + "grad_norm": 1.3603636026382446, + "learning_rate": 4.249861716630916e-06, + "loss": 2.3599, + "step": 5146 + }, + { + "epoch": 0.2761266094420601, + "grad_norm": 1.0400370359420776, + "learning_rate": 4.2495514418439145e-06, + "loss": 2.0434, + "step": 5147 + }, + { + "epoch": 0.2761802575107296, + "grad_norm": 2.799269437789917, + "learning_rate": 4.249241114233117e-06, + "loss": 2.0889, + "step": 5148 + }, + { + "epoch": 0.27623390557939914, + "grad_norm": 1.2948399782180786, + "learning_rate": 4.248930733807892e-06, + "loss": 2.5173, + "step": 5149 + }, + { + "epoch": 0.2762875536480687, + "grad_norm": 1.5789997577667236, + "learning_rate": 4.248620300577612e-06, + "loss": 2.2491, + "step": 5150 + }, + { + "epoch": 0.2763412017167382, + "grad_norm": 1.6369529962539673, + "learning_rate": 4.2483098145516496e-06, + "loss": 2.4179, + "step": 5151 + }, + { + "epoch": 0.27639484978540774, + "grad_norm": 1.317929983139038, + "learning_rate": 4.247999275739379e-06, + "loss": 2.3318, + "step": 5152 + }, + { + "epoch": 0.27644849785407727, + "grad_norm": 1.0702420473098755, + "learning_rate": 4.247688684150177e-06, + "loss": 2.1316, + "step": 5153 + }, + { + "epoch": 0.2765021459227468, + "grad_norm": 1.9283409118652344, + "learning_rate": 4.247378039793421e-06, + "loss": 2.3501, + "step": 5154 + }, + { + "epoch": 0.27655579399141633, + "grad_norm": 1.3748856782913208, + "learning_rate": 4.2470673426784885e-06, + "loss": 2.1949, + "step": 5155 + }, + { + "epoch": 0.27660944206008586, + "grad_norm": 1.4990483522415161, + "learning_rate": 4.2467565928147625e-06, + "loss": 2.2239, + "step": 5156 + }, + { + "epoch": 0.27666309012875534, + "grad_norm": 1.33983314037323, + "learning_rate": 4.246445790211624e-06, + "loss": 2.4572, + "step": 5157 + }, + { + "epoch": 0.27671673819742487, + "grad_norm": 1.379253625869751, + "learning_rate": 4.2461349348784585e-06, + "loss": 2.5133, + "step": 5158 + }, + { + "epoch": 0.2767703862660944, + "grad_norm": 1.4265518188476562, + "learning_rate": 4.24582402682465e-06, + "loss": 2.3075, + "step": 5159 + }, + { + "epoch": 0.27682403433476394, + "grad_norm": 1.2879512310028076, + "learning_rate": 4.245513066059586e-06, + "loss": 2.5126, + "step": 5160 + }, + { + "epoch": 0.27687768240343347, + "grad_norm": 1.2859792709350586, + "learning_rate": 4.245202052592656e-06, + "loss": 2.1888, + "step": 5161 + }, + { + "epoch": 0.276931330472103, + "grad_norm": 1.2637146711349487, + "learning_rate": 4.24489098643325e-06, + "loss": 2.3043, + "step": 5162 + }, + { + "epoch": 0.27698497854077253, + "grad_norm": 1.3754401206970215, + "learning_rate": 4.244579867590759e-06, + "loss": 2.2659, + "step": 5163 + }, + { + "epoch": 0.27703862660944206, + "grad_norm": 1.4319254159927368, + "learning_rate": 4.244268696074578e-06, + "loss": 2.1874, + "step": 5164 + }, + { + "epoch": 0.2770922746781116, + "grad_norm": 1.6389193534851074, + "learning_rate": 4.243957471894102e-06, + "loss": 2.2896, + "step": 5165 + }, + { + "epoch": 0.2771459227467811, + "grad_norm": 1.3648602962493896, + "learning_rate": 4.243646195058727e-06, + "loss": 1.7477, + "step": 5166 + }, + { + "epoch": 0.27719957081545066, + "grad_norm": 1.385986089706421, + "learning_rate": 4.243334865577851e-06, + "loss": 2.3667, + "step": 5167 + }, + { + "epoch": 0.2772532188841202, + "grad_norm": 1.3868513107299805, + "learning_rate": 4.243023483460875e-06, + "loss": 2.0209, + "step": 5168 + }, + { + "epoch": 0.2773068669527897, + "grad_norm": 2.375197410583496, + "learning_rate": 4.2427120487171994e-06, + "loss": 2.4041, + "step": 5169 + }, + { + "epoch": 0.27736051502145925, + "grad_norm": 1.3337339162826538, + "learning_rate": 4.242400561356228e-06, + "loss": 2.1708, + "step": 5170 + }, + { + "epoch": 0.27741416309012873, + "grad_norm": 1.3357185125350952, + "learning_rate": 4.242089021387365e-06, + "loss": 2.2146, + "step": 5171 + }, + { + "epoch": 0.27746781115879826, + "grad_norm": 1.4766290187835693, + "learning_rate": 4.241777428820018e-06, + "loss": 2.6462, + "step": 5172 + }, + { + "epoch": 0.2775214592274678, + "grad_norm": 1.3067364692687988, + "learning_rate": 4.241465783663593e-06, + "loss": 2.1218, + "step": 5173 + }, + { + "epoch": 0.2775751072961373, + "grad_norm": 1.459026575088501, + "learning_rate": 4.241154085927501e-06, + "loss": 2.108, + "step": 5174 + }, + { + "epoch": 0.27762875536480686, + "grad_norm": 1.3283309936523438, + "learning_rate": 4.240842335621151e-06, + "loss": 2.1638, + "step": 5175 + }, + { + "epoch": 0.2776824034334764, + "grad_norm": 1.347233533859253, + "learning_rate": 4.240530532753957e-06, + "loss": 2.3347, + "step": 5176 + }, + { + "epoch": 0.2777360515021459, + "grad_norm": 1.2273359298706055, + "learning_rate": 4.240218677335334e-06, + "loss": 2.1825, + "step": 5177 + }, + { + "epoch": 0.27778969957081545, + "grad_norm": 1.5048913955688477, + "learning_rate": 4.239906769374695e-06, + "loss": 2.2483, + "step": 5178 + }, + { + "epoch": 0.277843347639485, + "grad_norm": 1.2917808294296265, + "learning_rate": 4.239594808881461e-06, + "loss": 2.3458, + "step": 5179 + }, + { + "epoch": 0.2778969957081545, + "grad_norm": 1.6750189065933228, + "learning_rate": 4.239282795865048e-06, + "loss": 2.1208, + "step": 5180 + }, + { + "epoch": 0.27795064377682405, + "grad_norm": 1.3258097171783447, + "learning_rate": 4.238970730334879e-06, + "loss": 2.358, + "step": 5181 + }, + { + "epoch": 0.2780042918454936, + "grad_norm": 1.3858402967453003, + "learning_rate": 4.238658612300374e-06, + "loss": 2.3103, + "step": 5182 + }, + { + "epoch": 0.2780579399141631, + "grad_norm": 1.375516414642334, + "learning_rate": 4.238346441770957e-06, + "loss": 2.2269, + "step": 5183 + }, + { + "epoch": 0.27811158798283264, + "grad_norm": 1.4884830713272095, + "learning_rate": 4.238034218756054e-06, + "loss": 2.2517, + "step": 5184 + }, + { + "epoch": 0.2781652360515021, + "grad_norm": 1.2289118766784668, + "learning_rate": 4.2377219432650914e-06, + "loss": 2.4535, + "step": 5185 + }, + { + "epoch": 0.27821888412017165, + "grad_norm": 1.5096640586853027, + "learning_rate": 4.2374096153074985e-06, + "loss": 2.6197, + "step": 5186 + }, + { + "epoch": 0.2782725321888412, + "grad_norm": 2.6392154693603516, + "learning_rate": 4.2370972348927045e-06, + "loss": 2.1384, + "step": 5187 + }, + { + "epoch": 0.2783261802575107, + "grad_norm": 1.350480556488037, + "learning_rate": 4.236784802030141e-06, + "loss": 2.2861, + "step": 5188 + }, + { + "epoch": 0.27837982832618025, + "grad_norm": 1.3361363410949707, + "learning_rate": 4.236472316729242e-06, + "loss": 2.1934, + "step": 5189 + }, + { + "epoch": 0.2784334763948498, + "grad_norm": 1.334849238395691, + "learning_rate": 4.236159778999441e-06, + "loss": 1.4582, + "step": 5190 + }, + { + "epoch": 0.2784871244635193, + "grad_norm": 1.3450567722320557, + "learning_rate": 4.235847188850175e-06, + "loss": 2.3503, + "step": 5191 + }, + { + "epoch": 0.27854077253218884, + "grad_norm": 1.4672859907150269, + "learning_rate": 4.235534546290884e-06, + "loss": 2.1708, + "step": 5192 + }, + { + "epoch": 0.2785944206008584, + "grad_norm": 1.4257906675338745, + "learning_rate": 4.235221851331004e-06, + "loss": 2.4392, + "step": 5193 + }, + { + "epoch": 0.2786480686695279, + "grad_norm": 1.4506680965423584, + "learning_rate": 4.2349091039799786e-06, + "loss": 2.2303, + "step": 5194 + }, + { + "epoch": 0.27870171673819744, + "grad_norm": 1.1998234987258911, + "learning_rate": 4.234596304247249e-06, + "loss": 2.1107, + "step": 5195 + }, + { + "epoch": 0.27875536480686697, + "grad_norm": 1.4508569240570068, + "learning_rate": 4.234283452142261e-06, + "loss": 2.0759, + "step": 5196 + }, + { + "epoch": 0.2788090128755365, + "grad_norm": 1.427050232887268, + "learning_rate": 4.233970547674459e-06, + "loss": 2.1628, + "step": 5197 + }, + { + "epoch": 0.27886266094420603, + "grad_norm": 1.17997145652771, + "learning_rate": 4.23365759085329e-06, + "loss": 2.3434, + "step": 5198 + }, + { + "epoch": 0.27891630901287556, + "grad_norm": 1.4046803712844849, + "learning_rate": 4.233344581688205e-06, + "loss": 2.1836, + "step": 5199 + }, + { + "epoch": 0.27896995708154504, + "grad_norm": 1.3837255239486694, + "learning_rate": 4.233031520188655e-06, + "loss": 1.778, + "step": 5200 + }, + { + "epoch": 0.27902360515021457, + "grad_norm": 1.5713597536087036, + "learning_rate": 4.2327184063640905e-06, + "loss": 2.3894, + "step": 5201 + }, + { + "epoch": 0.2790772532188841, + "grad_norm": 1.529776692390442, + "learning_rate": 4.232405240223965e-06, + "loss": 2.0, + "step": 5202 + }, + { + "epoch": 0.27913090128755363, + "grad_norm": 1.4573729038238525, + "learning_rate": 4.232092021777734e-06, + "loss": 2.2926, + "step": 5203 + }, + { + "epoch": 0.27918454935622317, + "grad_norm": 1.515673279762268, + "learning_rate": 4.231778751034857e-06, + "loss": 2.3154, + "step": 5204 + }, + { + "epoch": 0.2792381974248927, + "grad_norm": 1.1519880294799805, + "learning_rate": 4.231465428004789e-06, + "loss": 2.2016, + "step": 5205 + }, + { + "epoch": 0.27929184549356223, + "grad_norm": 1.5241012573242188, + "learning_rate": 4.231152052696992e-06, + "loss": 2.1189, + "step": 5206 + }, + { + "epoch": 0.27934549356223176, + "grad_norm": 1.5358985662460327, + "learning_rate": 4.230838625120927e-06, + "loss": 2.2295, + "step": 5207 + }, + { + "epoch": 0.2793991416309013, + "grad_norm": 1.3408899307250977, + "learning_rate": 4.230525145286057e-06, + "loss": 2.2084, + "step": 5208 + }, + { + "epoch": 0.2794527896995708, + "grad_norm": 1.3194150924682617, + "learning_rate": 4.230211613201848e-06, + "loss": 2.1152, + "step": 5209 + }, + { + "epoch": 0.27950643776824036, + "grad_norm": 1.537580132484436, + "learning_rate": 4.229898028877767e-06, + "loss": 2.2257, + "step": 5210 + }, + { + "epoch": 0.2795600858369099, + "grad_norm": 1.4625517129898071, + "learning_rate": 4.229584392323279e-06, + "loss": 2.3727, + "step": 5211 + }, + { + "epoch": 0.2796137339055794, + "grad_norm": 1.3874555826187134, + "learning_rate": 4.2292707035478554e-06, + "loss": 2.5725, + "step": 5212 + }, + { + "epoch": 0.27966738197424895, + "grad_norm": 1.3671960830688477, + "learning_rate": 4.228956962560967e-06, + "loss": 2.3113, + "step": 5213 + }, + { + "epoch": 0.27972103004291843, + "grad_norm": 1.3213077783584595, + "learning_rate": 4.228643169372088e-06, + "loss": 2.1765, + "step": 5214 + }, + { + "epoch": 0.27977467811158796, + "grad_norm": 1.6054065227508545, + "learning_rate": 4.228329323990691e-06, + "loss": 2.2497, + "step": 5215 + }, + { + "epoch": 0.2798283261802575, + "grad_norm": 1.1613774299621582, + "learning_rate": 4.228015426426251e-06, + "loss": 2.2912, + "step": 5216 + }, + { + "epoch": 0.279881974248927, + "grad_norm": 1.2451120615005493, + "learning_rate": 4.227701476688247e-06, + "loss": 2.0478, + "step": 5217 + }, + { + "epoch": 0.27993562231759656, + "grad_norm": 1.8184071779251099, + "learning_rate": 4.227387474786159e-06, + "loss": 2.0967, + "step": 5218 + }, + { + "epoch": 0.2799892703862661, + "grad_norm": 1.329810619354248, + "learning_rate": 4.227073420729466e-06, + "loss": 2.2447, + "step": 5219 + }, + { + "epoch": 0.2800429184549356, + "grad_norm": 1.1265616416931152, + "learning_rate": 4.226759314527649e-06, + "loss": 2.2877, + "step": 5220 + }, + { + "epoch": 0.28009656652360515, + "grad_norm": 1.2927178144454956, + "learning_rate": 4.226445156190194e-06, + "loss": 2.0988, + "step": 5221 + }, + { + "epoch": 0.2801502145922747, + "grad_norm": 1.538016676902771, + "learning_rate": 4.226130945726586e-06, + "loss": 2.228, + "step": 5222 + }, + { + "epoch": 0.2802038626609442, + "grad_norm": 1.3824162483215332, + "learning_rate": 4.225816683146311e-06, + "loss": 2.4655, + "step": 5223 + }, + { + "epoch": 0.28025751072961375, + "grad_norm": 1.3595017194747925, + "learning_rate": 4.225502368458858e-06, + "loss": 2.3088, + "step": 5224 + }, + { + "epoch": 0.2803111587982833, + "grad_norm": 4.487486362457275, + "learning_rate": 4.225188001673717e-06, + "loss": 2.1442, + "step": 5225 + }, + { + "epoch": 0.2803648068669528, + "grad_norm": 1.4007480144500732, + "learning_rate": 4.2248735828003785e-06, + "loss": 2.3056, + "step": 5226 + }, + { + "epoch": 0.28041845493562234, + "grad_norm": 1.380864143371582, + "learning_rate": 4.224559111848337e-06, + "loss": 2.4918, + "step": 5227 + }, + { + "epoch": 0.2804721030042919, + "grad_norm": 1.363326072692871, + "learning_rate": 4.2242445888270875e-06, + "loss": 2.4162, + "step": 5228 + }, + { + "epoch": 0.28052575107296135, + "grad_norm": 1.3385776281356812, + "learning_rate": 4.223930013746126e-06, + "loss": 2.3903, + "step": 5229 + }, + { + "epoch": 0.2805793991416309, + "grad_norm": 1.3226299285888672, + "learning_rate": 4.223615386614949e-06, + "loss": 2.2943, + "step": 5230 + }, + { + "epoch": 0.2806330472103004, + "grad_norm": 1.1901867389678955, + "learning_rate": 4.223300707443057e-06, + "loss": 2.2905, + "step": 5231 + }, + { + "epoch": 0.28068669527896994, + "grad_norm": 1.403824806213379, + "learning_rate": 4.222985976239952e-06, + "loss": 2.278, + "step": 5232 + }, + { + "epoch": 0.2807403433476395, + "grad_norm": 1.6184874773025513, + "learning_rate": 4.222671193015135e-06, + "loss": 2.3745, + "step": 5233 + }, + { + "epoch": 0.280793991416309, + "grad_norm": 1.4539847373962402, + "learning_rate": 4.222356357778111e-06, + "loss": 2.1762, + "step": 5234 + }, + { + "epoch": 0.28084763948497854, + "grad_norm": 1.3271501064300537, + "learning_rate": 4.2220414705383854e-06, + "loss": 2.1548, + "step": 5235 + }, + { + "epoch": 0.28090128755364807, + "grad_norm": 3.267444133758545, + "learning_rate": 4.221726531305467e-06, + "loss": 2.3781, + "step": 5236 + }, + { + "epoch": 0.2809549356223176, + "grad_norm": 1.451798439025879, + "learning_rate": 4.221411540088862e-06, + "loss": 2.3694, + "step": 5237 + }, + { + "epoch": 0.28100858369098713, + "grad_norm": 1.3437318801879883, + "learning_rate": 4.221096496898083e-06, + "loss": 2.5644, + "step": 5238 + }, + { + "epoch": 0.28106223175965667, + "grad_norm": 3.569720983505249, + "learning_rate": 4.2207814017426405e-06, + "loss": 2.5217, + "step": 5239 + }, + { + "epoch": 0.2811158798283262, + "grad_norm": 1.3779711723327637, + "learning_rate": 4.2204662546320495e-06, + "loss": 2.3492, + "step": 5240 + }, + { + "epoch": 0.28116952789699573, + "grad_norm": 1.6914650201797485, + "learning_rate": 4.220151055575824e-06, + "loss": 2.2538, + "step": 5241 + }, + { + "epoch": 0.28122317596566526, + "grad_norm": 2.0960283279418945, + "learning_rate": 4.219835804583482e-06, + "loss": 2.3322, + "step": 5242 + }, + { + "epoch": 0.28127682403433474, + "grad_norm": 1.5461546182632446, + "learning_rate": 4.219520501664541e-06, + "loss": 2.0799, + "step": 5243 + }, + { + "epoch": 0.28133047210300427, + "grad_norm": 1.374873161315918, + "learning_rate": 4.219205146828521e-06, + "loss": 2.253, + "step": 5244 + }, + { + "epoch": 0.2813841201716738, + "grad_norm": 1.4015312194824219, + "learning_rate": 4.218889740084943e-06, + "loss": 2.0663, + "step": 5245 + }, + { + "epoch": 0.28143776824034333, + "grad_norm": 1.4393256902694702, + "learning_rate": 4.21857428144333e-06, + "loss": 2.3305, + "step": 5246 + }, + { + "epoch": 0.28149141630901287, + "grad_norm": 1.3401038646697998, + "learning_rate": 4.218258770913208e-06, + "loss": 2.2562, + "step": 5247 + }, + { + "epoch": 0.2815450643776824, + "grad_norm": 1.2800744771957397, + "learning_rate": 4.217943208504101e-06, + "loss": 2.1185, + "step": 5248 + }, + { + "epoch": 0.28159871244635193, + "grad_norm": 1.2058789730072021, + "learning_rate": 4.217627594225539e-06, + "loss": 2.1808, + "step": 5249 + }, + { + "epoch": 0.28165236051502146, + "grad_norm": 1.4832321405410767, + "learning_rate": 4.21731192808705e-06, + "loss": 2.1897, + "step": 5250 + }, + { + "epoch": 0.281706008583691, + "grad_norm": 1.4130202531814575, + "learning_rate": 4.216996210098164e-06, + "loss": 2.3083, + "step": 5251 + }, + { + "epoch": 0.2817596566523605, + "grad_norm": 1.553207278251648, + "learning_rate": 4.2166804402684146e-06, + "loss": 2.3918, + "step": 5252 + }, + { + "epoch": 0.28181330472103006, + "grad_norm": 1.388093113899231, + "learning_rate": 4.2163646186073355e-06, + "loss": 2.4909, + "step": 5253 + }, + { + "epoch": 0.2818669527896996, + "grad_norm": 1.3600637912750244, + "learning_rate": 4.216048745124462e-06, + "loss": 2.3412, + "step": 5254 + }, + { + "epoch": 0.2819206008583691, + "grad_norm": 1.1618337631225586, + "learning_rate": 4.215732819829332e-06, + "loss": 2.3037, + "step": 5255 + }, + { + "epoch": 0.28197424892703865, + "grad_norm": 1.4654827117919922, + "learning_rate": 4.215416842731483e-06, + "loss": 2.1177, + "step": 5256 + }, + { + "epoch": 0.2820278969957081, + "grad_norm": 1.3337420225143433, + "learning_rate": 4.215100813840456e-06, + "loss": 1.9635, + "step": 5257 + }, + { + "epoch": 0.28208154506437766, + "grad_norm": 1.477061152458191, + "learning_rate": 4.214784733165793e-06, + "loss": 2.3152, + "step": 5258 + }, + { + "epoch": 0.2821351931330472, + "grad_norm": 1.428373098373413, + "learning_rate": 4.214468600717036e-06, + "loss": 2.266, + "step": 5259 + }, + { + "epoch": 0.2821888412017167, + "grad_norm": 1.5135383605957031, + "learning_rate": 4.214152416503731e-06, + "loss": 2.1245, + "step": 5260 + }, + { + "epoch": 0.28224248927038625, + "grad_norm": 1.271544337272644, + "learning_rate": 4.2138361805354245e-06, + "loss": 1.8188, + "step": 5261 + }, + { + "epoch": 0.2822961373390558, + "grad_norm": 1.3638372421264648, + "learning_rate": 4.213519892821664e-06, + "loss": 2.1851, + "step": 5262 + }, + { + "epoch": 0.2823497854077253, + "grad_norm": 1.2949248552322388, + "learning_rate": 4.213203553372e-06, + "loss": 2.3053, + "step": 5263 + }, + { + "epoch": 0.28240343347639485, + "grad_norm": 1.8513284921646118, + "learning_rate": 4.212887162195983e-06, + "loss": 2.5473, + "step": 5264 + }, + { + "epoch": 0.2824570815450644, + "grad_norm": 1.424858570098877, + "learning_rate": 4.212570719303165e-06, + "loss": 2.4483, + "step": 5265 + }, + { + "epoch": 0.2825107296137339, + "grad_norm": 1.3986140489578247, + "learning_rate": 4.212254224703102e-06, + "loss": 2.7014, + "step": 5266 + }, + { + "epoch": 0.28256437768240344, + "grad_norm": 4.689304828643799, + "learning_rate": 4.211937678405349e-06, + "loss": 2.1426, + "step": 5267 + }, + { + "epoch": 0.282618025751073, + "grad_norm": 1.404821515083313, + "learning_rate": 4.211621080419463e-06, + "loss": 2.465, + "step": 5268 + }, + { + "epoch": 0.2826716738197425, + "grad_norm": 1.3127071857452393, + "learning_rate": 4.211304430755004e-06, + "loss": 2.4535, + "step": 5269 + }, + { + "epoch": 0.28272532188841204, + "grad_norm": 3.073326349258423, + "learning_rate": 4.21098772942153e-06, + "loss": 2.2756, + "step": 5270 + }, + { + "epoch": 0.28277896995708157, + "grad_norm": 1.4246277809143066, + "learning_rate": 4.210670976428606e-06, + "loss": 2.2483, + "step": 5271 + }, + { + "epoch": 0.28283261802575105, + "grad_norm": 1.2855678796768188, + "learning_rate": 4.210354171785795e-06, + "loss": 2.207, + "step": 5272 + }, + { + "epoch": 0.2828862660944206, + "grad_norm": 1.4680513143539429, + "learning_rate": 4.210037315502662e-06, + "loss": 2.3489, + "step": 5273 + }, + { + "epoch": 0.2829399141630901, + "grad_norm": 11.044355392456055, + "learning_rate": 4.209720407588773e-06, + "loss": 2.2732, + "step": 5274 + }, + { + "epoch": 0.28299356223175964, + "grad_norm": 1.3397061824798584, + "learning_rate": 4.209403448053697e-06, + "loss": 2.1888, + "step": 5275 + }, + { + "epoch": 0.2830472103004292, + "grad_norm": 1.3031245470046997, + "learning_rate": 4.209086436907004e-06, + "loss": 2.5797, + "step": 5276 + }, + { + "epoch": 0.2831008583690987, + "grad_norm": 1.389979600906372, + "learning_rate": 4.208769374158265e-06, + "loss": 2.2829, + "step": 5277 + }, + { + "epoch": 0.28315450643776824, + "grad_norm": 1.2984424829483032, + "learning_rate": 4.208452259817053e-06, + "loss": 2.4046, + "step": 5278 + }, + { + "epoch": 0.28320815450643777, + "grad_norm": 1.474424123764038, + "learning_rate": 4.208135093892943e-06, + "loss": 2.2292, + "step": 5279 + }, + { + "epoch": 0.2832618025751073, + "grad_norm": 1.3610624074935913, + "learning_rate": 4.207817876395511e-06, + "loss": 2.1958, + "step": 5280 + }, + { + "epoch": 0.28331545064377683, + "grad_norm": 1.2142666578292847, + "learning_rate": 4.207500607334334e-06, + "loss": 2.342, + "step": 5281 + }, + { + "epoch": 0.28336909871244637, + "grad_norm": 1.3932498693466187, + "learning_rate": 4.207183286718993e-06, + "loss": 2.3067, + "step": 5282 + }, + { + "epoch": 0.2834227467811159, + "grad_norm": 1.506768822669983, + "learning_rate": 4.2068659145590664e-06, + "loss": 2.5669, + "step": 5283 + }, + { + "epoch": 0.28347639484978543, + "grad_norm": 1.2575539350509644, + "learning_rate": 4.206548490864138e-06, + "loss": 2.2593, + "step": 5284 + }, + { + "epoch": 0.28353004291845496, + "grad_norm": 1.4685211181640625, + "learning_rate": 4.2062310156437925e-06, + "loss": 2.2098, + "step": 5285 + }, + { + "epoch": 0.28358369098712444, + "grad_norm": 1.5002377033233643, + "learning_rate": 4.205913488907612e-06, + "loss": 2.2377, + "step": 5286 + }, + { + "epoch": 0.28363733905579397, + "grad_norm": 1.477756142616272, + "learning_rate": 4.2055959106651875e-06, + "loss": 2.2644, + "step": 5287 + }, + { + "epoch": 0.2836909871244635, + "grad_norm": 1.3902099132537842, + "learning_rate": 4.205278280926106e-06, + "loss": 1.9639, + "step": 5288 + }, + { + "epoch": 0.28374463519313303, + "grad_norm": 1.3058732748031616, + "learning_rate": 4.204960599699957e-06, + "loss": 2.3112, + "step": 5289 + }, + { + "epoch": 0.28379828326180256, + "grad_norm": 1.3058775663375854, + "learning_rate": 4.204642866996333e-06, + "loss": 2.4107, + "step": 5290 + }, + { + "epoch": 0.2838519313304721, + "grad_norm": 1.5248193740844727, + "learning_rate": 4.204325082824826e-06, + "loss": 2.1995, + "step": 5291 + }, + { + "epoch": 0.2839055793991416, + "grad_norm": 1.132330060005188, + "learning_rate": 4.204007247195032e-06, + "loss": 2.2376, + "step": 5292 + }, + { + "epoch": 0.28395922746781116, + "grad_norm": 1.9655956029891968, + "learning_rate": 4.203689360116547e-06, + "loss": 2.3217, + "step": 5293 + }, + { + "epoch": 0.2840128755364807, + "grad_norm": 3.632291793823242, + "learning_rate": 4.203371421598971e-06, + "loss": 2.3623, + "step": 5294 + }, + { + "epoch": 0.2840665236051502, + "grad_norm": 1.584941029548645, + "learning_rate": 4.203053431651899e-06, + "loss": 2.1194, + "step": 5295 + }, + { + "epoch": 0.28412017167381975, + "grad_norm": 1.450371265411377, + "learning_rate": 4.2027353902849345e-06, + "loss": 2.3279, + "step": 5296 + }, + { + "epoch": 0.2841738197424893, + "grad_norm": 1.3237791061401367, + "learning_rate": 4.20241729750768e-06, + "loss": 2.2213, + "step": 5297 + }, + { + "epoch": 0.2842274678111588, + "grad_norm": 1.0109926462173462, + "learning_rate": 4.20209915332974e-06, + "loss": 2.0049, + "step": 5298 + }, + { + "epoch": 0.28428111587982835, + "grad_norm": 1.4023473262786865, + "learning_rate": 4.201780957760719e-06, + "loss": 2.4096, + "step": 5299 + }, + { + "epoch": 0.2843347639484979, + "grad_norm": 1.713315725326538, + "learning_rate": 4.201462710810226e-06, + "loss": 2.3561, + "step": 5300 + }, + { + "epoch": 0.28438841201716736, + "grad_norm": 1.3906464576721191, + "learning_rate": 4.201144412487867e-06, + "loss": 2.3711, + "step": 5301 + }, + { + "epoch": 0.2844420600858369, + "grad_norm": 1.397920846939087, + "learning_rate": 4.200826062803255e-06, + "loss": 2.0266, + "step": 5302 + }, + { + "epoch": 0.2844957081545064, + "grad_norm": 1.3822640180587769, + "learning_rate": 4.200507661766e-06, + "loss": 2.0637, + "step": 5303 + }, + { + "epoch": 0.28454935622317595, + "grad_norm": 1.3081847429275513, + "learning_rate": 4.200189209385717e-06, + "loss": 2.4536, + "step": 5304 + }, + { + "epoch": 0.2846030042918455, + "grad_norm": 1.361376166343689, + "learning_rate": 4.199870705672019e-06, + "loss": 2.2802, + "step": 5305 + }, + { + "epoch": 0.284656652360515, + "grad_norm": 1.2919461727142334, + "learning_rate": 4.199552150634525e-06, + "loss": 2.1394, + "step": 5306 + }, + { + "epoch": 0.28471030042918455, + "grad_norm": 1.2919390201568604, + "learning_rate": 4.1992335442828515e-06, + "loss": 2.145, + "step": 5307 + }, + { + "epoch": 0.2847639484978541, + "grad_norm": 1.3241201639175415, + "learning_rate": 4.198914886626617e-06, + "loss": 2.2636, + "step": 5308 + }, + { + "epoch": 0.2848175965665236, + "grad_norm": 1.6842128038406372, + "learning_rate": 4.198596177675444e-06, + "loss": 2.2742, + "step": 5309 + }, + { + "epoch": 0.28487124463519314, + "grad_norm": 1.2363498210906982, + "learning_rate": 4.198277417438958e-06, + "loss": 2.188, + "step": 5310 + }, + { + "epoch": 0.2849248927038627, + "grad_norm": 1.2418339252471924, + "learning_rate": 4.197958605926778e-06, + "loss": 2.5077, + "step": 5311 + }, + { + "epoch": 0.2849785407725322, + "grad_norm": 1.5520706176757812, + "learning_rate": 4.1976397431485324e-06, + "loss": 1.5068, + "step": 5312 + }, + { + "epoch": 0.28503218884120174, + "grad_norm": 1.3444104194641113, + "learning_rate": 4.197320829113848e-06, + "loss": 2.2559, + "step": 5313 + }, + { + "epoch": 0.28508583690987127, + "grad_norm": 3.049741506576538, + "learning_rate": 4.197001863832355e-06, + "loss": 2.081, + "step": 5314 + }, + { + "epoch": 0.28513948497854075, + "grad_norm": 1.255085825920105, + "learning_rate": 4.196682847313682e-06, + "loss": 2.479, + "step": 5315 + }, + { + "epoch": 0.2851931330472103, + "grad_norm": 1.3506109714508057, + "learning_rate": 4.196363779567463e-06, + "loss": 2.2521, + "step": 5316 + }, + { + "epoch": 0.2852467811158798, + "grad_norm": 1.6188067197799683, + "learning_rate": 4.19604466060333e-06, + "loss": 2.3477, + "step": 5317 + }, + { + "epoch": 0.28530042918454934, + "grad_norm": 1.34382963180542, + "learning_rate": 4.195725490430917e-06, + "loss": 2.2101, + "step": 5318 + }, + { + "epoch": 0.2853540772532189, + "grad_norm": 1.3928169012069702, + "learning_rate": 4.195406269059864e-06, + "loss": 2.5289, + "step": 5319 + }, + { + "epoch": 0.2854077253218884, + "grad_norm": 1.352113127708435, + "learning_rate": 4.195086996499807e-06, + "loss": 2.2547, + "step": 5320 + }, + { + "epoch": 0.28546137339055794, + "grad_norm": 1.263211727142334, + "learning_rate": 4.194767672760386e-06, + "loss": 2.041, + "step": 5321 + }, + { + "epoch": 0.28551502145922747, + "grad_norm": 2.1483588218688965, + "learning_rate": 4.194448297851242e-06, + "loss": 2.2279, + "step": 5322 + }, + { + "epoch": 0.285568669527897, + "grad_norm": 1.4854096174240112, + "learning_rate": 4.1941288717820185e-06, + "loss": 2.2566, + "step": 5323 + }, + { + "epoch": 0.28562231759656653, + "grad_norm": 1.0366175174713135, + "learning_rate": 4.193809394562359e-06, + "loss": 2.1325, + "step": 5324 + }, + { + "epoch": 0.28567596566523606, + "grad_norm": 1.1006108522415161, + "learning_rate": 4.19348986620191e-06, + "loss": 2.593, + "step": 5325 + }, + { + "epoch": 0.2857296137339056, + "grad_norm": 2.207791805267334, + "learning_rate": 4.193170286710319e-06, + "loss": 2.5034, + "step": 5326 + }, + { + "epoch": 0.28578326180257513, + "grad_norm": 1.1080607175827026, + "learning_rate": 4.192850656097235e-06, + "loss": 1.8544, + "step": 5327 + }, + { + "epoch": 0.28583690987124466, + "grad_norm": 1.2526116371154785, + "learning_rate": 4.192530974372307e-06, + "loss": 2.3295, + "step": 5328 + }, + { + "epoch": 0.28589055793991414, + "grad_norm": 1.2348859310150146, + "learning_rate": 4.19221124154519e-06, + "loss": 2.3825, + "step": 5329 + }, + { + "epoch": 0.28594420600858367, + "grad_norm": 2.4628984928131104, + "learning_rate": 4.191891457625536e-06, + "loss": 2.3813, + "step": 5330 + }, + { + "epoch": 0.2859978540772532, + "grad_norm": 1.2056008577346802, + "learning_rate": 4.191571622623e-06, + "loss": 2.052, + "step": 5331 + }, + { + "epoch": 0.28605150214592273, + "grad_norm": 1.5246846675872803, + "learning_rate": 4.191251736547239e-06, + "loss": 2.2614, + "step": 5332 + }, + { + "epoch": 0.28610515021459226, + "grad_norm": 1.1969069242477417, + "learning_rate": 4.1909317994079105e-06, + "loss": 2.3389, + "step": 5333 + }, + { + "epoch": 0.2861587982832618, + "grad_norm": 1.4049445390701294, + "learning_rate": 4.1906118112146745e-06, + "loss": 2.2399, + "step": 5334 + }, + { + "epoch": 0.2862124463519313, + "grad_norm": 1.3644524812698364, + "learning_rate": 4.190291771977194e-06, + "loss": 2.3909, + "step": 5335 + }, + { + "epoch": 0.28626609442060086, + "grad_norm": 1.5685845613479614, + "learning_rate": 4.189971681705129e-06, + "loss": 2.025, + "step": 5336 + }, + { + "epoch": 0.2863197424892704, + "grad_norm": 1.0965675115585327, + "learning_rate": 4.189651540408147e-06, + "loss": 2.2816, + "step": 5337 + }, + { + "epoch": 0.2863733905579399, + "grad_norm": 1.2093197107315063, + "learning_rate": 4.189331348095913e-06, + "loss": 2.2792, + "step": 5338 + }, + { + "epoch": 0.28642703862660945, + "grad_norm": 1.3440860509872437, + "learning_rate": 4.189011104778093e-06, + "loss": 2.3266, + "step": 5339 + }, + { + "epoch": 0.286480686695279, + "grad_norm": 1.5248643159866333, + "learning_rate": 4.188690810464357e-06, + "loss": 2.2411, + "step": 5340 + }, + { + "epoch": 0.2865343347639485, + "grad_norm": 1.5997084379196167, + "learning_rate": 4.1883704651643755e-06, + "loss": 2.5441, + "step": 5341 + }, + { + "epoch": 0.28658798283261805, + "grad_norm": 1.5267788171768188, + "learning_rate": 4.188050068887821e-06, + "loss": 2.2699, + "step": 5342 + }, + { + "epoch": 0.2866416309012876, + "grad_norm": 1.2479997873306274, + "learning_rate": 4.187729621644367e-06, + "loss": 2.1689, + "step": 5343 + }, + { + "epoch": 0.28669527896995706, + "grad_norm": 1.556512475013733, + "learning_rate": 4.187409123443688e-06, + "loss": 2.236, + "step": 5344 + }, + { + "epoch": 0.2867489270386266, + "grad_norm": 1.225488305091858, + "learning_rate": 4.1870885742954616e-06, + "loss": 2.2203, + "step": 5345 + }, + { + "epoch": 0.2868025751072961, + "grad_norm": 1.3815730810165405, + "learning_rate": 4.186767974209366e-06, + "loss": 2.1789, + "step": 5346 + }, + { + "epoch": 0.28685622317596565, + "grad_norm": 1.9201236963272095, + "learning_rate": 4.18644732319508e-06, + "loss": 2.3858, + "step": 5347 + }, + { + "epoch": 0.2869098712446352, + "grad_norm": 1.3880109786987305, + "learning_rate": 4.186126621262287e-06, + "loss": 2.4339, + "step": 5348 + }, + { + "epoch": 0.2869635193133047, + "grad_norm": 1.6802955865859985, + "learning_rate": 4.185805868420667e-06, + "loss": 2.5212, + "step": 5349 + }, + { + "epoch": 0.28701716738197425, + "grad_norm": 1.2921512126922607, + "learning_rate": 4.185485064679906e-06, + "loss": 2.0959, + "step": 5350 + }, + { + "epoch": 0.2870708154506438, + "grad_norm": 1.7332086563110352, + "learning_rate": 4.185164210049692e-06, + "loss": 2.3404, + "step": 5351 + }, + { + "epoch": 0.2871244635193133, + "grad_norm": 1.5740838050842285, + "learning_rate": 4.184843304539708e-06, + "loss": 2.4439, + "step": 5352 + }, + { + "epoch": 0.28717811158798284, + "grad_norm": 1.5215038061141968, + "learning_rate": 4.184522348159647e-06, + "loss": 2.3105, + "step": 5353 + }, + { + "epoch": 0.2872317596566524, + "grad_norm": 1.27907395362854, + "learning_rate": 4.184201340919196e-06, + "loss": 2.4787, + "step": 5354 + }, + { + "epoch": 0.2872854077253219, + "grad_norm": 1.4168368577957153, + "learning_rate": 4.183880282828051e-06, + "loss": 2.1961, + "step": 5355 + }, + { + "epoch": 0.28733905579399144, + "grad_norm": 1.2337470054626465, + "learning_rate": 4.1835591738959034e-06, + "loss": 2.3609, + "step": 5356 + }, + { + "epoch": 0.28739270386266097, + "grad_norm": 1.3632909059524536, + "learning_rate": 4.183238014132448e-06, + "loss": 2.2623, + "step": 5357 + }, + { + "epoch": 0.28744635193133045, + "grad_norm": 1.2759157419204712, + "learning_rate": 4.182916803547383e-06, + "loss": 2.3638, + "step": 5358 + }, + { + "epoch": 0.2875, + "grad_norm": 1.753441333770752, + "learning_rate": 4.182595542150405e-06, + "loss": 2.6432, + "step": 5359 + }, + { + "epoch": 0.2875536480686695, + "grad_norm": 1.3974738121032715, + "learning_rate": 4.182274229951216e-06, + "loss": 2.5123, + "step": 5360 + }, + { + "epoch": 0.28760729613733904, + "grad_norm": 1.307824730873108, + "learning_rate": 4.1819528669595145e-06, + "loss": 2.3211, + "step": 5361 + }, + { + "epoch": 0.2876609442060086, + "grad_norm": 1.2060304880142212, + "learning_rate": 4.181631453185004e-06, + "loss": 2.1793, + "step": 5362 + }, + { + "epoch": 0.2877145922746781, + "grad_norm": 1.4520443677902222, + "learning_rate": 4.181309988637392e-06, + "loss": 2.1481, + "step": 5363 + }, + { + "epoch": 0.28776824034334764, + "grad_norm": 1.2506394386291504, + "learning_rate": 4.180988473326381e-06, + "loss": 2.3618, + "step": 5364 + }, + { + "epoch": 0.28782188841201717, + "grad_norm": 1.5998327732086182, + "learning_rate": 4.180666907261678e-06, + "loss": 2.3626, + "step": 5365 + }, + { + "epoch": 0.2878755364806867, + "grad_norm": 1.5224560499191284, + "learning_rate": 4.180345290452995e-06, + "loss": 2.6139, + "step": 5366 + }, + { + "epoch": 0.28792918454935623, + "grad_norm": 1.516493558883667, + "learning_rate": 4.1800236229100405e-06, + "loss": 2.0148, + "step": 5367 + }, + { + "epoch": 0.28798283261802576, + "grad_norm": 1.220171570777893, + "learning_rate": 4.179701904642527e-06, + "loss": 2.5074, + "step": 5368 + }, + { + "epoch": 0.2880364806866953, + "grad_norm": 1.3300745487213135, + "learning_rate": 4.179380135660168e-06, + "loss": 2.3006, + "step": 5369 + }, + { + "epoch": 0.2880901287553648, + "grad_norm": 1.3369560241699219, + "learning_rate": 4.1790583159726784e-06, + "loss": 2.4395, + "step": 5370 + }, + { + "epoch": 0.28814377682403436, + "grad_norm": 1.3817059993743896, + "learning_rate": 4.178736445589775e-06, + "loss": 2.1106, + "step": 5371 + }, + { + "epoch": 0.28819742489270384, + "grad_norm": 1.8524466753005981, + "learning_rate": 4.178414524521176e-06, + "loss": 2.141, + "step": 5372 + }, + { + "epoch": 0.28825107296137337, + "grad_norm": 1.3546829223632812, + "learning_rate": 4.178092552776602e-06, + "loss": 2.1298, + "step": 5373 + }, + { + "epoch": 0.2883047210300429, + "grad_norm": 1.8453757762908936, + "learning_rate": 4.177770530365772e-06, + "loss": 2.2003, + "step": 5374 + }, + { + "epoch": 0.28835836909871243, + "grad_norm": 0.9543159604072571, + "learning_rate": 4.1774484572984105e-06, + "loss": 2.0692, + "step": 5375 + }, + { + "epoch": 0.28841201716738196, + "grad_norm": 1.4383676052093506, + "learning_rate": 4.177126333584242e-06, + "loss": 2.2244, + "step": 5376 + }, + { + "epoch": 0.2884656652360515, + "grad_norm": 1.3789979219436646, + "learning_rate": 4.176804159232991e-06, + "loss": 2.2943, + "step": 5377 + }, + { + "epoch": 0.288519313304721, + "grad_norm": 1.4091039896011353, + "learning_rate": 4.1764819342543865e-06, + "loss": 2.3177, + "step": 5378 + }, + { + "epoch": 0.28857296137339056, + "grad_norm": 1.451812744140625, + "learning_rate": 4.176159658658155e-06, + "loss": 2.3543, + "step": 5379 + }, + { + "epoch": 0.2886266094420601, + "grad_norm": 1.1658954620361328, + "learning_rate": 4.175837332454028e-06, + "loss": 2.082, + "step": 5380 + }, + { + "epoch": 0.2886802575107296, + "grad_norm": 1.1515681743621826, + "learning_rate": 4.175514955651738e-06, + "loss": 2.1439, + "step": 5381 + }, + { + "epoch": 0.28873390557939915, + "grad_norm": 1.314581036567688, + "learning_rate": 4.1751925282610186e-06, + "loss": 1.9241, + "step": 5382 + }, + { + "epoch": 0.2887875536480687, + "grad_norm": 1.286803126335144, + "learning_rate": 4.174870050291604e-06, + "loss": 2.3384, + "step": 5383 + }, + { + "epoch": 0.2888412017167382, + "grad_norm": 1.384809970855713, + "learning_rate": 4.17454752175323e-06, + "loss": 2.146, + "step": 5384 + }, + { + "epoch": 0.28889484978540775, + "grad_norm": 1.2953877449035645, + "learning_rate": 4.174224942655637e-06, + "loss": 2.2489, + "step": 5385 + }, + { + "epoch": 0.2889484978540773, + "grad_norm": 1.2511601448059082, + "learning_rate": 4.173902313008563e-06, + "loss": 2.2609, + "step": 5386 + }, + { + "epoch": 0.28900214592274676, + "grad_norm": 2.22845196723938, + "learning_rate": 4.173579632821748e-06, + "loss": 2.2789, + "step": 5387 + }, + { + "epoch": 0.2890557939914163, + "grad_norm": 1.3235803842544556, + "learning_rate": 4.1732569021049364e-06, + "loss": 2.2925, + "step": 5388 + }, + { + "epoch": 0.2891094420600858, + "grad_norm": 1.4318894147872925, + "learning_rate": 4.172934120867873e-06, + "loss": 2.5096, + "step": 5389 + }, + { + "epoch": 0.28916309012875535, + "grad_norm": 0.9978567957878113, + "learning_rate": 4.172611289120301e-06, + "loss": 2.1511, + "step": 5390 + }, + { + "epoch": 0.2892167381974249, + "grad_norm": 1.469740390777588, + "learning_rate": 4.172288406871969e-06, + "loss": 2.4367, + "step": 5391 + }, + { + "epoch": 0.2892703862660944, + "grad_norm": 1.4108291864395142, + "learning_rate": 4.171965474132626e-06, + "loss": 2.0867, + "step": 5392 + }, + { + "epoch": 0.28932403433476395, + "grad_norm": 18.777679443359375, + "learning_rate": 4.1716424909120205e-06, + "loss": 2.2992, + "step": 5393 + }, + { + "epoch": 0.2893776824034335, + "grad_norm": 1.5034455060958862, + "learning_rate": 4.171319457219906e-06, + "loss": 2.1985, + "step": 5394 + }, + { + "epoch": 0.289431330472103, + "grad_norm": 1.2798768281936646, + "learning_rate": 4.170996373066036e-06, + "loss": 2.1909, + "step": 5395 + }, + { + "epoch": 0.28948497854077254, + "grad_norm": 1.0494613647460938, + "learning_rate": 4.170673238460164e-06, + "loss": 2.0639, + "step": 5396 + }, + { + "epoch": 0.2895386266094421, + "grad_norm": 1.5054503679275513, + "learning_rate": 4.170350053412047e-06, + "loss": 2.3725, + "step": 5397 + }, + { + "epoch": 0.2895922746781116, + "grad_norm": 1.4547404050827026, + "learning_rate": 4.170026817931443e-06, + "loss": 2.1085, + "step": 5398 + }, + { + "epoch": 0.28964592274678114, + "grad_norm": 1.322479009628296, + "learning_rate": 4.169703532028111e-06, + "loss": 2.4654, + "step": 5399 + }, + { + "epoch": 0.28969957081545067, + "grad_norm": 1.2573456764221191, + "learning_rate": 4.1693801957118114e-06, + "loss": 2.222, + "step": 5400 + }, + { + "epoch": 0.28975321888412015, + "grad_norm": 1.107547640800476, + "learning_rate": 4.169056808992308e-06, + "loss": 1.9905, + "step": 5401 + }, + { + "epoch": 0.2898068669527897, + "grad_norm": 1.4555102586746216, + "learning_rate": 4.168733371879363e-06, + "loss": 1.3308, + "step": 5402 + }, + { + "epoch": 0.2898605150214592, + "grad_norm": 1.5119762420654297, + "learning_rate": 4.168409884382744e-06, + "loss": 2.1869, + "step": 5403 + }, + { + "epoch": 0.28991416309012874, + "grad_norm": 1.3459663391113281, + "learning_rate": 4.168086346512217e-06, + "loss": 2.2154, + "step": 5404 + }, + { + "epoch": 0.28996781115879827, + "grad_norm": 1.3628828525543213, + "learning_rate": 4.167762758277549e-06, + "loss": 2.3355, + "step": 5405 + }, + { + "epoch": 0.2900214592274678, + "grad_norm": 1.5307854413986206, + "learning_rate": 4.167439119688513e-06, + "loss": 2.2843, + "step": 5406 + }, + { + "epoch": 0.29007510729613734, + "grad_norm": 1.4077099561691284, + "learning_rate": 4.167115430754878e-06, + "loss": 2.3023, + "step": 5407 + }, + { + "epoch": 0.29012875536480687, + "grad_norm": 1.2115776538848877, + "learning_rate": 4.166791691486417e-06, + "loss": 2.2053, + "step": 5408 + }, + { + "epoch": 0.2901824034334764, + "grad_norm": 1.3680387735366821, + "learning_rate": 4.166467901892908e-06, + "loss": 2.2163, + "step": 5409 + }, + { + "epoch": 0.29023605150214593, + "grad_norm": 1.3409061431884766, + "learning_rate": 4.166144061984122e-06, + "loss": 2.3362, + "step": 5410 + }, + { + "epoch": 0.29028969957081546, + "grad_norm": 1.473103642463684, + "learning_rate": 4.165820171769841e-06, + "loss": 2.227, + "step": 5411 + }, + { + "epoch": 0.290343347639485, + "grad_norm": 1.437228798866272, + "learning_rate": 4.1654962312598415e-06, + "loss": 2.2966, + "step": 5412 + }, + { + "epoch": 0.2903969957081545, + "grad_norm": 1.5756202936172485, + "learning_rate": 4.165172240463906e-06, + "loss": 2.4544, + "step": 5413 + }, + { + "epoch": 0.29045064377682406, + "grad_norm": 1.3767938613891602, + "learning_rate": 4.164848199391815e-06, + "loss": 1.9737, + "step": 5414 + }, + { + "epoch": 0.2905042918454936, + "grad_norm": 1.4427515268325806, + "learning_rate": 4.164524108053352e-06, + "loss": 2.3114, + "step": 5415 + }, + { + "epoch": 0.29055793991416307, + "grad_norm": 1.2109973430633545, + "learning_rate": 4.164199966458306e-06, + "loss": 2.1445, + "step": 5416 + }, + { + "epoch": 0.2906115879828326, + "grad_norm": 1.5473718643188477, + "learning_rate": 4.163875774616458e-06, + "loss": 2.1708, + "step": 5417 + }, + { + "epoch": 0.29066523605150213, + "grad_norm": 1.359774112701416, + "learning_rate": 4.163551532537601e-06, + "loss": 2.4773, + "step": 5418 + }, + { + "epoch": 0.29071888412017166, + "grad_norm": 1.350265622138977, + "learning_rate": 4.163227240231522e-06, + "loss": 1.7653, + "step": 5419 + }, + { + "epoch": 0.2907725321888412, + "grad_norm": 1.2648168802261353, + "learning_rate": 4.162902897708013e-06, + "loss": 2.1717, + "step": 5420 + }, + { + "epoch": 0.2908261802575107, + "grad_norm": 1.5107903480529785, + "learning_rate": 4.1625785049768676e-06, + "loss": 2.4216, + "step": 5421 + }, + { + "epoch": 0.29087982832618026, + "grad_norm": 1.7196375131607056, + "learning_rate": 4.16225406204788e-06, + "loss": 2.2097, + "step": 5422 + }, + { + "epoch": 0.2909334763948498, + "grad_norm": 2.7263107299804688, + "learning_rate": 4.161929568930845e-06, + "loss": 2.0942, + "step": 5423 + }, + { + "epoch": 0.2909871244635193, + "grad_norm": 1.483261227607727, + "learning_rate": 4.16160502563556e-06, + "loss": 2.3738, + "step": 5424 + }, + { + "epoch": 0.29104077253218885, + "grad_norm": 1.586222529411316, + "learning_rate": 4.1612804321718245e-06, + "loss": 2.4265, + "step": 5425 + }, + { + "epoch": 0.2910944206008584, + "grad_norm": 1.3538148403167725, + "learning_rate": 4.160955788549439e-06, + "loss": 2.4221, + "step": 5426 + }, + { + "epoch": 0.2911480686695279, + "grad_norm": 1.3709430694580078, + "learning_rate": 4.160631094778205e-06, + "loss": 2.3299, + "step": 5427 + }, + { + "epoch": 0.29120171673819745, + "grad_norm": 1.3107454776763916, + "learning_rate": 4.160306350867925e-06, + "loss": 2.1783, + "step": 5428 + }, + { + "epoch": 0.291255364806867, + "grad_norm": 1.3892253637313843, + "learning_rate": 4.159981556828406e-06, + "loss": 2.436, + "step": 5429 + }, + { + "epoch": 0.29130901287553645, + "grad_norm": 1.2698709964752197, + "learning_rate": 4.159656712669454e-06, + "loss": 2.1219, + "step": 5430 + }, + { + "epoch": 0.291362660944206, + "grad_norm": 1.5008832216262817, + "learning_rate": 4.1593318184008754e-06, + "loss": 2.1726, + "step": 5431 + }, + { + "epoch": 0.2914163090128755, + "grad_norm": 1.3508555889129639, + "learning_rate": 4.159006874032481e-06, + "loss": 1.9522, + "step": 5432 + }, + { + "epoch": 0.29146995708154505, + "grad_norm": 1.0924934148788452, + "learning_rate": 4.1586818795740805e-06, + "loss": 2.0924, + "step": 5433 + }, + { + "epoch": 0.2915236051502146, + "grad_norm": 1.383457064628601, + "learning_rate": 4.158356835035487e-06, + "loss": 2.277, + "step": 5434 + }, + { + "epoch": 0.2915772532188841, + "grad_norm": 1.3724820613861084, + "learning_rate": 4.158031740426516e-06, + "loss": 2.4474, + "step": 5435 + }, + { + "epoch": 0.29163090128755365, + "grad_norm": 1.2691468000411987, + "learning_rate": 4.157706595756981e-06, + "loss": 2.0072, + "step": 5436 + }, + { + "epoch": 0.2916845493562232, + "grad_norm": 1.3697035312652588, + "learning_rate": 4.157381401036699e-06, + "loss": 2.2813, + "step": 5437 + }, + { + "epoch": 0.2917381974248927, + "grad_norm": 1.6200684309005737, + "learning_rate": 4.157056156275491e-06, + "loss": 2.207, + "step": 5438 + }, + { + "epoch": 0.29179184549356224, + "grad_norm": 1.3832786083221436, + "learning_rate": 4.156730861483173e-06, + "loss": 2.1304, + "step": 5439 + }, + { + "epoch": 0.2918454935622318, + "grad_norm": 1.407540202140808, + "learning_rate": 4.156405516669571e-06, + "loss": 2.2722, + "step": 5440 + }, + { + "epoch": 0.2918991416309013, + "grad_norm": 1.516169786453247, + "learning_rate": 4.156080121844505e-06, + "loss": 2.4005, + "step": 5441 + }, + { + "epoch": 0.29195278969957084, + "grad_norm": 1.3275007009506226, + "learning_rate": 4.1557546770178e-06, + "loss": 2.2359, + "step": 5442 + }, + { + "epoch": 0.29200643776824037, + "grad_norm": 1.27895987033844, + "learning_rate": 4.155429182199283e-06, + "loss": 2.2503, + "step": 5443 + }, + { + "epoch": 0.29206008583690984, + "grad_norm": 1.5146820545196533, + "learning_rate": 4.155103637398781e-06, + "loss": 2.3884, + "step": 5444 + }, + { + "epoch": 0.2921137339055794, + "grad_norm": 3.543250560760498, + "learning_rate": 4.1547780426261236e-06, + "loss": 2.4858, + "step": 5445 + }, + { + "epoch": 0.2921673819742489, + "grad_norm": 1.4312770366668701, + "learning_rate": 4.15445239789114e-06, + "loss": 2.1749, + "step": 5446 + }, + { + "epoch": 0.29222103004291844, + "grad_norm": 1.1570500135421753, + "learning_rate": 4.1541267032036646e-06, + "loss": 2.2639, + "step": 5447 + }, + { + "epoch": 0.29227467811158797, + "grad_norm": 1.38362455368042, + "learning_rate": 4.15380095857353e-06, + "loss": 2.1978, + "step": 5448 + }, + { + "epoch": 0.2923283261802575, + "grad_norm": 1.2602527141571045, + "learning_rate": 4.1534751640105695e-06, + "loss": 2.1675, + "step": 5449 + }, + { + "epoch": 0.29238197424892703, + "grad_norm": 1.471513032913208, + "learning_rate": 4.153149319524622e-06, + "loss": 2.4253, + "step": 5450 + }, + { + "epoch": 0.29243562231759657, + "grad_norm": 1.2623225450515747, + "learning_rate": 4.152823425125525e-06, + "loss": 2.3233, + "step": 5451 + }, + { + "epoch": 0.2924892703862661, + "grad_norm": 1.3304659128189087, + "learning_rate": 4.1524974808231185e-06, + "loss": 2.1656, + "step": 5452 + }, + { + "epoch": 0.29254291845493563, + "grad_norm": 1.5253896713256836, + "learning_rate": 4.152171486627243e-06, + "loss": 2.2016, + "step": 5453 + }, + { + "epoch": 0.29259656652360516, + "grad_norm": 1.3610948324203491, + "learning_rate": 4.1518454425477416e-06, + "loss": 2.2689, + "step": 5454 + }, + { + "epoch": 0.2926502145922747, + "grad_norm": 1.2831079959869385, + "learning_rate": 4.151519348594458e-06, + "loss": 2.1196, + "step": 5455 + }, + { + "epoch": 0.2927038626609442, + "grad_norm": 1.3628156185150146, + "learning_rate": 4.151193204777239e-06, + "loss": 2.4229, + "step": 5456 + }, + { + "epoch": 0.29275751072961376, + "grad_norm": 1.3526813983917236, + "learning_rate": 4.150867011105931e-06, + "loss": 2.1983, + "step": 5457 + }, + { + "epoch": 0.2928111587982833, + "grad_norm": 1.2210673093795776, + "learning_rate": 4.150540767590382e-06, + "loss": 2.4525, + "step": 5458 + }, + { + "epoch": 0.29286480686695276, + "grad_norm": 1.2776470184326172, + "learning_rate": 4.1502144742404425e-06, + "loss": 2.1108, + "step": 5459 + }, + { + "epoch": 0.2929184549356223, + "grad_norm": 1.371298909187317, + "learning_rate": 4.149888131065965e-06, + "loss": 2.4026, + "step": 5460 + }, + { + "epoch": 0.29297210300429183, + "grad_norm": 1.9349470138549805, + "learning_rate": 4.149561738076803e-06, + "loss": 2.3504, + "step": 5461 + }, + { + "epoch": 0.29302575107296136, + "grad_norm": 1.3360164165496826, + "learning_rate": 4.14923529528281e-06, + "loss": 2.2479, + "step": 5462 + }, + { + "epoch": 0.2930793991416309, + "grad_norm": 1.4595962762832642, + "learning_rate": 4.148908802693842e-06, + "loss": 1.8572, + "step": 5463 + }, + { + "epoch": 0.2931330472103004, + "grad_norm": 1.2996351718902588, + "learning_rate": 4.1485822603197575e-06, + "loss": 2.1091, + "step": 5464 + }, + { + "epoch": 0.29318669527896996, + "grad_norm": 1.5048496723175049, + "learning_rate": 4.1482556681704165e-06, + "loss": 2.208, + "step": 5465 + }, + { + "epoch": 0.2932403433476395, + "grad_norm": 6.0797648429870605, + "learning_rate": 4.147929026255678e-06, + "loss": 2.1844, + "step": 5466 + }, + { + "epoch": 0.293293991416309, + "grad_norm": 1.573850154876709, + "learning_rate": 4.147602334585406e-06, + "loss": 2.3568, + "step": 5467 + }, + { + "epoch": 0.29334763948497855, + "grad_norm": 1.3477632999420166, + "learning_rate": 4.147275593169463e-06, + "loss": 1.8916, + "step": 5468 + }, + { + "epoch": 0.2934012875536481, + "grad_norm": 1.3363653421401978, + "learning_rate": 4.146948802017714e-06, + "loss": 2.2046, + "step": 5469 + }, + { + "epoch": 0.2934549356223176, + "grad_norm": 1.156680703163147, + "learning_rate": 4.146621961140027e-06, + "loss": 1.6412, + "step": 5470 + }, + { + "epoch": 0.29350858369098715, + "grad_norm": 1.2708503007888794, + "learning_rate": 4.146295070546269e-06, + "loss": 2.1703, + "step": 5471 + }, + { + "epoch": 0.2935622317596567, + "grad_norm": 1.3210761547088623, + "learning_rate": 4.14596813024631e-06, + "loss": 1.3418, + "step": 5472 + }, + { + "epoch": 0.29361587982832615, + "grad_norm": 1.3715096712112427, + "learning_rate": 4.145641140250022e-06, + "loss": 2.4924, + "step": 5473 + }, + { + "epoch": 0.2936695278969957, + "grad_norm": 1.352448582649231, + "learning_rate": 4.145314100567277e-06, + "loss": 2.2436, + "step": 5474 + }, + { + "epoch": 0.2937231759656652, + "grad_norm": 1.3864436149597168, + "learning_rate": 4.14498701120795e-06, + "loss": 2.161, + "step": 5475 + }, + { + "epoch": 0.29377682403433475, + "grad_norm": 1.336189866065979, + "learning_rate": 4.144659872181915e-06, + "loss": 2.2272, + "step": 5476 + }, + { + "epoch": 0.2938304721030043, + "grad_norm": 1.1542164087295532, + "learning_rate": 4.14433268349905e-06, + "loss": 2.0433, + "step": 5477 + }, + { + "epoch": 0.2938841201716738, + "grad_norm": 1.3596775531768799, + "learning_rate": 4.144005445169236e-06, + "loss": 2.2832, + "step": 5478 + }, + { + "epoch": 0.29393776824034334, + "grad_norm": 1.318446397781372, + "learning_rate": 4.1436781572023495e-06, + "loss": 2.2989, + "step": 5479 + }, + { + "epoch": 0.2939914163090129, + "grad_norm": 1.592558741569519, + "learning_rate": 4.143350819608275e-06, + "loss": 2.1669, + "step": 5480 + }, + { + "epoch": 0.2940450643776824, + "grad_norm": 1.4947926998138428, + "learning_rate": 4.1430234323968945e-06, + "loss": 2.4697, + "step": 5481 + }, + { + "epoch": 0.29409871244635194, + "grad_norm": 1.40901780128479, + "learning_rate": 4.142695995578093e-06, + "loss": 2.4416, + "step": 5482 + }, + { + "epoch": 0.29415236051502147, + "grad_norm": 1.3132877349853516, + "learning_rate": 4.142368509161757e-06, + "loss": 2.0916, + "step": 5483 + }, + { + "epoch": 0.294206008583691, + "grad_norm": 1.4300315380096436, + "learning_rate": 4.142040973157774e-06, + "loss": 2.1508, + "step": 5484 + }, + { + "epoch": 0.29425965665236054, + "grad_norm": 1.382888913154602, + "learning_rate": 4.141713387576033e-06, + "loss": 2.202, + "step": 5485 + }, + { + "epoch": 0.29431330472103007, + "grad_norm": 1.4399235248565674, + "learning_rate": 4.141385752426425e-06, + "loss": 2.386, + "step": 5486 + }, + { + "epoch": 0.2943669527896996, + "grad_norm": 1.379339575767517, + "learning_rate": 4.141058067718843e-06, + "loss": 2.3622, + "step": 5487 + }, + { + "epoch": 0.2944206008583691, + "grad_norm": 1.3388197422027588, + "learning_rate": 4.140730333463179e-06, + "loss": 2.395, + "step": 5488 + }, + { + "epoch": 0.2944742489270386, + "grad_norm": 1.34584641456604, + "learning_rate": 4.140402549669328e-06, + "loss": 2.3075, + "step": 5489 + }, + { + "epoch": 0.29452789699570814, + "grad_norm": 1.431449294090271, + "learning_rate": 4.140074716347189e-06, + "loss": 2.2123, + "step": 5490 + }, + { + "epoch": 0.29458154506437767, + "grad_norm": 1.5522383451461792, + "learning_rate": 4.139746833506658e-06, + "loss": 1.9923, + "step": 5491 + }, + { + "epoch": 0.2946351931330472, + "grad_norm": 1.4277317523956299, + "learning_rate": 4.139418901157637e-06, + "loss": 2.4398, + "step": 5492 + }, + { + "epoch": 0.29468884120171673, + "grad_norm": 8.361177444458008, + "learning_rate": 4.139090919310024e-06, + "loss": 2.3307, + "step": 5493 + }, + { + "epoch": 0.29474248927038627, + "grad_norm": 1.3900518417358398, + "learning_rate": 4.138762887973724e-06, + "loss": 2.4518, + "step": 5494 + }, + { + "epoch": 0.2947961373390558, + "grad_norm": 2.169848918914795, + "learning_rate": 4.1384348071586414e-06, + "loss": 2.1848, + "step": 5495 + }, + { + "epoch": 0.29484978540772533, + "grad_norm": 1.4290153980255127, + "learning_rate": 4.13810667687468e-06, + "loss": 2.1925, + "step": 5496 + }, + { + "epoch": 0.29490343347639486, + "grad_norm": 1.2133549451828003, + "learning_rate": 4.137778497131749e-06, + "loss": 1.9577, + "step": 5497 + }, + { + "epoch": 0.2949570815450644, + "grad_norm": 1.3028990030288696, + "learning_rate": 4.137450267939757e-06, + "loss": 2.1313, + "step": 5498 + }, + { + "epoch": 0.2950107296137339, + "grad_norm": 2.771085500717163, + "learning_rate": 4.137121989308611e-06, + "loss": 2.1381, + "step": 5499 + }, + { + "epoch": 0.29506437768240346, + "grad_norm": 1.4173152446746826, + "learning_rate": 4.136793661248226e-06, + "loss": 2.3273, + "step": 5500 + }, + { + "epoch": 0.295118025751073, + "grad_norm": 1.3525866270065308, + "learning_rate": 4.136465283768515e-06, + "loss": 2.1766, + "step": 5501 + }, + { + "epoch": 0.29517167381974246, + "grad_norm": 4.122291088104248, + "learning_rate": 4.13613685687939e-06, + "loss": 2.3909, + "step": 5502 + }, + { + "epoch": 0.295225321888412, + "grad_norm": 3.094316005706787, + "learning_rate": 4.13580838059077e-06, + "loss": 2.42, + "step": 5503 + }, + { + "epoch": 0.2952789699570815, + "grad_norm": 1.4016180038452148, + "learning_rate": 4.13547985491257e-06, + "loss": 2.0915, + "step": 5504 + }, + { + "epoch": 0.29533261802575106, + "grad_norm": 1.0442498922348022, + "learning_rate": 4.135151279854712e-06, + "loss": 1.9975, + "step": 5505 + }, + { + "epoch": 0.2953862660944206, + "grad_norm": 1.3738951683044434, + "learning_rate": 4.134822655427114e-06, + "loss": 2.1144, + "step": 5506 + }, + { + "epoch": 0.2954399141630901, + "grad_norm": 1.3903402090072632, + "learning_rate": 4.1344939816397e-06, + "loss": 2.3676, + "step": 5507 + }, + { + "epoch": 0.29549356223175965, + "grad_norm": 1.3205171823501587, + "learning_rate": 4.1341652585023915e-06, + "loss": 2.136, + "step": 5508 + }, + { + "epoch": 0.2955472103004292, + "grad_norm": 1.2775673866271973, + "learning_rate": 4.133836486025116e-06, + "loss": 2.2517, + "step": 5509 + }, + { + "epoch": 0.2956008583690987, + "grad_norm": 1.4848219156265259, + "learning_rate": 4.133507664217799e-06, + "loss": 2.3374, + "step": 5510 + }, + { + "epoch": 0.29565450643776825, + "grad_norm": 1.4249656200408936, + "learning_rate": 4.1331787930903675e-06, + "loss": 2.1873, + "step": 5511 + }, + { + "epoch": 0.2957081545064378, + "grad_norm": 1.3103734254837036, + "learning_rate": 4.132849872652752e-06, + "loss": 2.1855, + "step": 5512 + }, + { + "epoch": 0.2957618025751073, + "grad_norm": 1.5762317180633545, + "learning_rate": 4.132520902914883e-06, + "loss": 2.5405, + "step": 5513 + }, + { + "epoch": 0.29581545064377684, + "grad_norm": 1.0975292921066284, + "learning_rate": 4.132191883886695e-06, + "loss": 2.0021, + "step": 5514 + }, + { + "epoch": 0.2958690987124464, + "grad_norm": 1.4672718048095703, + "learning_rate": 4.131862815578119e-06, + "loss": 2.2733, + "step": 5515 + }, + { + "epoch": 0.29592274678111585, + "grad_norm": 1.577335000038147, + "learning_rate": 4.131533697999092e-06, + "loss": 2.4397, + "step": 5516 + }, + { + "epoch": 0.2959763948497854, + "grad_norm": 1.348673939704895, + "learning_rate": 4.131204531159551e-06, + "loss": 2.3214, + "step": 5517 + }, + { + "epoch": 0.2960300429184549, + "grad_norm": 1.358231782913208, + "learning_rate": 4.130875315069435e-06, + "loss": 1.6412, + "step": 5518 + }, + { + "epoch": 0.29608369098712445, + "grad_norm": 1.302847146987915, + "learning_rate": 4.130546049738683e-06, + "loss": 2.2186, + "step": 5519 + }, + { + "epoch": 0.296137339055794, + "grad_norm": 1.35374116897583, + "learning_rate": 4.130216735177236e-06, + "loss": 2.2339, + "step": 5520 + }, + { + "epoch": 0.2961909871244635, + "grad_norm": 1.3235708475112915, + "learning_rate": 4.1298873713950385e-06, + "loss": 2.2808, + "step": 5521 + }, + { + "epoch": 0.29624463519313304, + "grad_norm": 1.3238112926483154, + "learning_rate": 4.129557958402034e-06, + "loss": 2.2647, + "step": 5522 + }, + { + "epoch": 0.2962982832618026, + "grad_norm": 1.48179292678833, + "learning_rate": 4.129228496208167e-06, + "loss": 2.4143, + "step": 5523 + }, + { + "epoch": 0.2963519313304721, + "grad_norm": 1.447256088256836, + "learning_rate": 4.128898984823388e-06, + "loss": 2.4077, + "step": 5524 + }, + { + "epoch": 0.29640557939914164, + "grad_norm": 1.3127540349960327, + "learning_rate": 4.1285694242576435e-06, + "loss": 2.2085, + "step": 5525 + }, + { + "epoch": 0.29645922746781117, + "grad_norm": 1.1671324968338013, + "learning_rate": 4.128239814520885e-06, + "loss": 2.1891, + "step": 5526 + }, + { + "epoch": 0.2965128755364807, + "grad_norm": 2.3061258792877197, + "learning_rate": 4.127910155623063e-06, + "loss": 2.1351, + "step": 5527 + }, + { + "epoch": 0.29656652360515023, + "grad_norm": 1.352087378501892, + "learning_rate": 4.127580447574132e-06, + "loss": 2.2649, + "step": 5528 + }, + { + "epoch": 0.29662017167381977, + "grad_norm": 1.1235475540161133, + "learning_rate": 4.1272506903840455e-06, + "loss": 2.1934, + "step": 5529 + }, + { + "epoch": 0.2966738197424893, + "grad_norm": 1.4440765380859375, + "learning_rate": 4.126920884062762e-06, + "loss": 2.1241, + "step": 5530 + }, + { + "epoch": 0.2967274678111588, + "grad_norm": 2.906920909881592, + "learning_rate": 4.126591028620238e-06, + "loss": 2.3175, + "step": 5531 + }, + { + "epoch": 0.2967811158798283, + "grad_norm": 1.2527577877044678, + "learning_rate": 4.126261124066432e-06, + "loss": 2.2168, + "step": 5532 + }, + { + "epoch": 0.29683476394849784, + "grad_norm": 2.1743595600128174, + "learning_rate": 4.125931170411306e-06, + "loss": 2.4366, + "step": 5533 + }, + { + "epoch": 0.29688841201716737, + "grad_norm": 2.08651065826416, + "learning_rate": 4.125601167664821e-06, + "loss": 1.9772, + "step": 5534 + }, + { + "epoch": 0.2969420600858369, + "grad_norm": 1.332914113998413, + "learning_rate": 4.125271115836942e-06, + "loss": 2.0151, + "step": 5535 + }, + { + "epoch": 0.29699570815450643, + "grad_norm": 1.1251566410064697, + "learning_rate": 4.124941014937633e-06, + "loss": 1.9115, + "step": 5536 + }, + { + "epoch": 0.29704935622317596, + "grad_norm": 1.3311747312545776, + "learning_rate": 4.124610864976862e-06, + "loss": 2.4709, + "step": 5537 + }, + { + "epoch": 0.2971030042918455, + "grad_norm": 1.4849967956542969, + "learning_rate": 4.124280665964596e-06, + "loss": 2.4196, + "step": 5538 + }, + { + "epoch": 0.29715665236051503, + "grad_norm": 1.4229122400283813, + "learning_rate": 4.123950417910805e-06, + "loss": 2.2207, + "step": 5539 + }, + { + "epoch": 0.29721030042918456, + "grad_norm": 3.0427515506744385, + "learning_rate": 4.123620120825459e-06, + "loss": 2.2786, + "step": 5540 + }, + { + "epoch": 0.2972639484978541, + "grad_norm": 1.2019755840301514, + "learning_rate": 4.123289774718533e-06, + "loss": 2.0226, + "step": 5541 + }, + { + "epoch": 0.2973175965665236, + "grad_norm": 1.4078935384750366, + "learning_rate": 4.1229593795999995e-06, + "loss": 2.2484, + "step": 5542 + }, + { + "epoch": 0.29737124463519315, + "grad_norm": 2.459399461746216, + "learning_rate": 4.122628935479834e-06, + "loss": 2.2822, + "step": 5543 + }, + { + "epoch": 0.2974248927038627, + "grad_norm": 1.3620423078536987, + "learning_rate": 4.122298442368013e-06, + "loss": 2.2028, + "step": 5544 + }, + { + "epoch": 0.29747854077253216, + "grad_norm": 1.2279924154281616, + "learning_rate": 4.121967900274517e-06, + "loss": 2.3272, + "step": 5545 + }, + { + "epoch": 0.2975321888412017, + "grad_norm": 1.4786262512207031, + "learning_rate": 4.121637309209325e-06, + "loss": 2.1837, + "step": 5546 + }, + { + "epoch": 0.2975858369098712, + "grad_norm": 1.2879210710525513, + "learning_rate": 4.121306669182418e-06, + "loss": 2.2476, + "step": 5547 + }, + { + "epoch": 0.29763948497854076, + "grad_norm": 2.52043080329895, + "learning_rate": 4.120975980203778e-06, + "loss": 2.2546, + "step": 5548 + }, + { + "epoch": 0.2976931330472103, + "grad_norm": 11.33411979675293, + "learning_rate": 4.1206452422833905e-06, + "loss": 2.0358, + "step": 5549 + }, + { + "epoch": 0.2977467811158798, + "grad_norm": 1.234311819076538, + "learning_rate": 4.120314455431243e-06, + "loss": 2.3189, + "step": 5550 + }, + { + "epoch": 0.29780042918454935, + "grad_norm": 5.4598774909973145, + "learning_rate": 4.119983619657321e-06, + "loss": 2.4003, + "step": 5551 + }, + { + "epoch": 0.2978540772532189, + "grad_norm": 1.5048677921295166, + "learning_rate": 4.119652734971613e-06, + "loss": 2.4339, + "step": 5552 + }, + { + "epoch": 0.2979077253218884, + "grad_norm": 1.3824656009674072, + "learning_rate": 4.11932180138411e-06, + "loss": 2.3368, + "step": 5553 + }, + { + "epoch": 0.29796137339055795, + "grad_norm": 1.3250491619110107, + "learning_rate": 4.118990818904804e-06, + "loss": 2.302, + "step": 5554 + }, + { + "epoch": 0.2980150214592275, + "grad_norm": 1.2715595960617065, + "learning_rate": 4.118659787543688e-06, + "loss": 1.304, + "step": 5555 + }, + { + "epoch": 0.298068669527897, + "grad_norm": 1.4455374479293823, + "learning_rate": 4.118328707310758e-06, + "loss": 2.3055, + "step": 5556 + }, + { + "epoch": 0.29812231759656654, + "grad_norm": 1.0137486457824707, + "learning_rate": 4.1179975782160075e-06, + "loss": 2.2124, + "step": 5557 + }, + { + "epoch": 0.2981759656652361, + "grad_norm": 1.2701963186264038, + "learning_rate": 4.117666400269436e-06, + "loss": 2.0658, + "step": 5558 + }, + { + "epoch": 0.29822961373390555, + "grad_norm": 1.3263640403747559, + "learning_rate": 4.117335173481043e-06, + "loss": 2.0259, + "step": 5559 + }, + { + "epoch": 0.2982832618025751, + "grad_norm": 1.365533709526062, + "learning_rate": 4.117003897860828e-06, + "loss": 2.1972, + "step": 5560 + }, + { + "epoch": 0.2983369098712446, + "grad_norm": 1.567028522491455, + "learning_rate": 4.116672573418794e-06, + "loss": 2.5564, + "step": 5561 + }, + { + "epoch": 0.29839055793991415, + "grad_norm": 1.4813100099563599, + "learning_rate": 4.116341200164945e-06, + "loss": 2.4755, + "step": 5562 + }, + { + "epoch": 0.2984442060085837, + "grad_norm": 1.3501322269439697, + "learning_rate": 4.116009778109285e-06, + "loss": 2.3238, + "step": 5563 + }, + { + "epoch": 0.2984978540772532, + "grad_norm": 1.5173017978668213, + "learning_rate": 4.11567830726182e-06, + "loss": 2.0767, + "step": 5564 + }, + { + "epoch": 0.29855150214592274, + "grad_norm": 1.5466724634170532, + "learning_rate": 4.115346787632559e-06, + "loss": 2.2998, + "step": 5565 + }, + { + "epoch": 0.2986051502145923, + "grad_norm": 1.7835558652877808, + "learning_rate": 4.1150152192315126e-06, + "loss": 2.2688, + "step": 5566 + }, + { + "epoch": 0.2986587982832618, + "grad_norm": 1.5680902004241943, + "learning_rate": 4.11468360206869e-06, + "loss": 2.5543, + "step": 5567 + }, + { + "epoch": 0.29871244635193134, + "grad_norm": 1.4262161254882812, + "learning_rate": 4.114351936154104e-06, + "loss": 2.2553, + "step": 5568 + }, + { + "epoch": 0.29876609442060087, + "grad_norm": 1.4250355958938599, + "learning_rate": 4.114020221497769e-06, + "loss": 2.1431, + "step": 5569 + }, + { + "epoch": 0.2988197424892704, + "grad_norm": 19.17849349975586, + "learning_rate": 4.113688458109701e-06, + "loss": 2.0824, + "step": 5570 + }, + { + "epoch": 0.29887339055793993, + "grad_norm": 1.4456100463867188, + "learning_rate": 4.113356645999915e-06, + "loss": 2.2673, + "step": 5571 + }, + { + "epoch": 0.29892703862660946, + "grad_norm": 1.8032821416854858, + "learning_rate": 4.113024785178431e-06, + "loss": 2.1544, + "step": 5572 + }, + { + "epoch": 0.298980686695279, + "grad_norm": 1.4733537435531616, + "learning_rate": 4.112692875655267e-06, + "loss": 2.4135, + "step": 5573 + }, + { + "epoch": 0.2990343347639485, + "grad_norm": 1.0533534288406372, + "learning_rate": 4.112360917440446e-06, + "loss": 2.0788, + "step": 5574 + }, + { + "epoch": 0.299087982832618, + "grad_norm": 1.3447593450546265, + "learning_rate": 4.1120289105439915e-06, + "loss": 2.3493, + "step": 5575 + }, + { + "epoch": 0.29914163090128754, + "grad_norm": 1.3794361352920532, + "learning_rate": 4.111696854975925e-06, + "loss": 2.1253, + "step": 5576 + }, + { + "epoch": 0.29919527896995707, + "grad_norm": 1.4568120241165161, + "learning_rate": 4.111364750746274e-06, + "loss": 2.352, + "step": 5577 + }, + { + "epoch": 0.2992489270386266, + "grad_norm": 1.777111291885376, + "learning_rate": 4.111032597865064e-06, + "loss": 2.4308, + "step": 5578 + }, + { + "epoch": 0.29930257510729613, + "grad_norm": 2.0574002265930176, + "learning_rate": 4.110700396342326e-06, + "loss": 2.4526, + "step": 5579 + }, + { + "epoch": 0.29935622317596566, + "grad_norm": 1.5751008987426758, + "learning_rate": 4.110368146188089e-06, + "loss": 2.3752, + "step": 5580 + }, + { + "epoch": 0.2994098712446352, + "grad_norm": 1.5123980045318604, + "learning_rate": 4.110035847412384e-06, + "loss": 2.3418, + "step": 5581 + }, + { + "epoch": 0.2994635193133047, + "grad_norm": 1.3599052429199219, + "learning_rate": 4.1097035000252435e-06, + "loss": 2.3918, + "step": 5582 + }, + { + "epoch": 0.29951716738197426, + "grad_norm": 1.4699546098709106, + "learning_rate": 4.109371104036704e-06, + "loss": 2.0838, + "step": 5583 + }, + { + "epoch": 0.2995708154506438, + "grad_norm": 1.4142591953277588, + "learning_rate": 4.1090386594568e-06, + "loss": 2.1239, + "step": 5584 + }, + { + "epoch": 0.2996244635193133, + "grad_norm": 1.4878312349319458, + "learning_rate": 4.1087061662955695e-06, + "loss": 2.2839, + "step": 5585 + }, + { + "epoch": 0.29967811158798285, + "grad_norm": 1.315213680267334, + "learning_rate": 4.108373624563051e-06, + "loss": 2.4394, + "step": 5586 + }, + { + "epoch": 0.2997317596566524, + "grad_norm": 1.390446662902832, + "learning_rate": 4.108041034269285e-06, + "loss": 2.3345, + "step": 5587 + }, + { + "epoch": 0.29978540772532186, + "grad_norm": 1.3917582035064697, + "learning_rate": 4.107708395424313e-06, + "loss": 2.2578, + "step": 5588 + }, + { + "epoch": 0.2998390557939914, + "grad_norm": 1.3119382858276367, + "learning_rate": 4.10737570803818e-06, + "loss": 2.3176, + "step": 5589 + }, + { + "epoch": 0.2998927038626609, + "grad_norm": 1.401854395866394, + "learning_rate": 4.107042972120928e-06, + "loss": 2.1115, + "step": 5590 + }, + { + "epoch": 0.29994635193133046, + "grad_norm": 1.549887776374817, + "learning_rate": 4.106710187682606e-06, + "loss": 2.1775, + "step": 5591 + }, + { + "epoch": 0.3, + "grad_norm": 1.294798493385315, + "learning_rate": 4.1063773547332595e-06, + "loss": 2.3325, + "step": 5592 + }, + { + "epoch": 0.3000536480686695, + "grad_norm": 1.6942462921142578, + "learning_rate": 4.106044473282938e-06, + "loss": 2.2098, + "step": 5593 + }, + { + "epoch": 0.30010729613733905, + "grad_norm": 1.583571195602417, + "learning_rate": 4.105711543341693e-06, + "loss": 2.3167, + "step": 5594 + }, + { + "epoch": 0.3001609442060086, + "grad_norm": 1.2708642482757568, + "learning_rate": 4.105378564919576e-06, + "loss": 2.0877, + "step": 5595 + }, + { + "epoch": 0.3002145922746781, + "grad_norm": 1.3037384748458862, + "learning_rate": 4.10504553802664e-06, + "loss": 2.3993, + "step": 5596 + }, + { + "epoch": 0.30026824034334765, + "grad_norm": 1.4961433410644531, + "learning_rate": 4.104712462672942e-06, + "loss": 2.2336, + "step": 5597 + }, + { + "epoch": 0.3003218884120172, + "grad_norm": 1.4493472576141357, + "learning_rate": 4.104379338868537e-06, + "loss": 2.1949, + "step": 5598 + }, + { + "epoch": 0.3003755364806867, + "grad_norm": 2.085373878479004, + "learning_rate": 4.104046166623482e-06, + "loss": 2.3853, + "step": 5599 + }, + { + "epoch": 0.30042918454935624, + "grad_norm": 1.252465844154358, + "learning_rate": 4.103712945947838e-06, + "loss": 2.219, + "step": 5600 + }, + { + "epoch": 0.3004828326180258, + "grad_norm": 1.1790684461593628, + "learning_rate": 4.103379676851665e-06, + "loss": 2.1969, + "step": 5601 + }, + { + "epoch": 0.3005364806866953, + "grad_norm": 1.3727481365203857, + "learning_rate": 4.103046359345027e-06, + "loss": 2.2906, + "step": 5602 + }, + { + "epoch": 0.3005901287553648, + "grad_norm": 1.2875772714614868, + "learning_rate": 4.102712993437985e-06, + "loss": 2.1451, + "step": 5603 + }, + { + "epoch": 0.3006437768240343, + "grad_norm": 1.1621007919311523, + "learning_rate": 4.102379579140605e-06, + "loss": 2.1774, + "step": 5604 + }, + { + "epoch": 0.30069742489270385, + "grad_norm": 1.370139479637146, + "learning_rate": 4.102046116462956e-06, + "loss": 2.2189, + "step": 5605 + }, + { + "epoch": 0.3007510729613734, + "grad_norm": 1.528881549835205, + "learning_rate": 4.101712605415104e-06, + "loss": 2.5882, + "step": 5606 + }, + { + "epoch": 0.3008047210300429, + "grad_norm": 1.4151527881622314, + "learning_rate": 4.101379046007119e-06, + "loss": 2.3956, + "step": 5607 + }, + { + "epoch": 0.30085836909871244, + "grad_norm": 1.4075419902801514, + "learning_rate": 4.101045438249072e-06, + "loss": 2.4344, + "step": 5608 + }, + { + "epoch": 0.300912017167382, + "grad_norm": 2.9130892753601074, + "learning_rate": 4.100711782151036e-06, + "loss": 2.2648, + "step": 5609 + }, + { + "epoch": 0.3009656652360515, + "grad_norm": 3.1979682445526123, + "learning_rate": 4.100378077723085e-06, + "loss": 2.2498, + "step": 5610 + }, + { + "epoch": 0.30101931330472104, + "grad_norm": 1.4726369380950928, + "learning_rate": 4.100044324975293e-06, + "loss": 2.3154, + "step": 5611 + }, + { + "epoch": 0.30107296137339057, + "grad_norm": 1.4932900667190552, + "learning_rate": 4.099710523917739e-06, + "loss": 2.3832, + "step": 5612 + }, + { + "epoch": 0.3011266094420601, + "grad_norm": 1.3942238092422485, + "learning_rate": 4.099376674560501e-06, + "loss": 2.3381, + "step": 5613 + }, + { + "epoch": 0.30118025751072963, + "grad_norm": 1.4566380977630615, + "learning_rate": 4.099042776913657e-06, + "loss": 2.0724, + "step": 5614 + }, + { + "epoch": 0.30123390557939916, + "grad_norm": 1.2954537868499756, + "learning_rate": 4.098708830987292e-06, + "loss": 2.2774, + "step": 5615 + }, + { + "epoch": 0.3012875536480687, + "grad_norm": 1.1416816711425781, + "learning_rate": 4.098374836791485e-06, + "loss": 2.0354, + "step": 5616 + }, + { + "epoch": 0.30134120171673817, + "grad_norm": 1.3953015804290771, + "learning_rate": 4.098040794336321e-06, + "loss": 2.1247, + "step": 5617 + }, + { + "epoch": 0.3013948497854077, + "grad_norm": 1.424910545349121, + "learning_rate": 4.097706703631887e-06, + "loss": 2.4224, + "step": 5618 + }, + { + "epoch": 0.30144849785407724, + "grad_norm": 1.1311378479003906, + "learning_rate": 4.097372564688268e-06, + "loss": 2.0884, + "step": 5619 + }, + { + "epoch": 0.30150214592274677, + "grad_norm": 1.741468071937561, + "learning_rate": 4.097038377515556e-06, + "loss": 2.4264, + "step": 5620 + }, + { + "epoch": 0.3015557939914163, + "grad_norm": 1.346289873123169, + "learning_rate": 4.096704142123837e-06, + "loss": 2.1627, + "step": 5621 + }, + { + "epoch": 0.30160944206008583, + "grad_norm": 1.2272098064422607, + "learning_rate": 4.096369858523206e-06, + "loss": 1.9485, + "step": 5622 + }, + { + "epoch": 0.30166309012875536, + "grad_norm": 4.005715847015381, + "learning_rate": 4.096035526723754e-06, + "loss": 2.4012, + "step": 5623 + }, + { + "epoch": 0.3017167381974249, + "grad_norm": 1.4263533353805542, + "learning_rate": 4.0957011467355754e-06, + "loss": 2.226, + "step": 5624 + }, + { + "epoch": 0.3017703862660944, + "grad_norm": 1.3845363855361938, + "learning_rate": 4.095366718568767e-06, + "loss": 2.1966, + "step": 5625 + }, + { + "epoch": 0.30182403433476396, + "grad_norm": 1.2800418138504028, + "learning_rate": 4.095032242233426e-06, + "loss": 2.1478, + "step": 5626 + }, + { + "epoch": 0.3018776824034335, + "grad_norm": 1.5190095901489258, + "learning_rate": 4.0946977177396495e-06, + "loss": 2.2061, + "step": 5627 + }, + { + "epoch": 0.301931330472103, + "grad_norm": 1.6948390007019043, + "learning_rate": 4.09436314509754e-06, + "loss": 2.2222, + "step": 5628 + }, + { + "epoch": 0.30198497854077255, + "grad_norm": 1.5232542753219604, + "learning_rate": 4.0940285243171976e-06, + "loss": 2.3214, + "step": 5629 + }, + { + "epoch": 0.3020386266094421, + "grad_norm": 1.216957926750183, + "learning_rate": 4.093693855408726e-06, + "loss": 2.0701, + "step": 5630 + }, + { + "epoch": 0.30209227467811156, + "grad_norm": 1.2994657754898071, + "learning_rate": 4.09335913838223e-06, + "loss": 2.2043, + "step": 5631 + }, + { + "epoch": 0.3021459227467811, + "grad_norm": 1.6342687606811523, + "learning_rate": 4.093024373247815e-06, + "loss": 2.1501, + "step": 5632 + }, + { + "epoch": 0.3021995708154506, + "grad_norm": 1.2513360977172852, + "learning_rate": 4.092689560015589e-06, + "loss": 2.3583, + "step": 5633 + }, + { + "epoch": 0.30225321888412016, + "grad_norm": 2.216447353363037, + "learning_rate": 4.092354698695662e-06, + "loss": 2.3443, + "step": 5634 + }, + { + "epoch": 0.3023068669527897, + "grad_norm": 1.1440874338150024, + "learning_rate": 4.092019789298142e-06, + "loss": 2.2956, + "step": 5635 + }, + { + "epoch": 0.3023605150214592, + "grad_norm": 1.878041386604309, + "learning_rate": 4.091684831833142e-06, + "loss": 2.2072, + "step": 5636 + }, + { + "epoch": 0.30241416309012875, + "grad_norm": 1.443677306175232, + "learning_rate": 4.091349826310776e-06, + "loss": 2.4089, + "step": 5637 + }, + { + "epoch": 0.3024678111587983, + "grad_norm": 1.5547406673431396, + "learning_rate": 4.091014772741157e-06, + "loss": 2.3047, + "step": 5638 + }, + { + "epoch": 0.3025214592274678, + "grad_norm": 1.1168632507324219, + "learning_rate": 4.090679671134403e-06, + "loss": 2.0631, + "step": 5639 + }, + { + "epoch": 0.30257510729613735, + "grad_norm": 1.4455931186676025, + "learning_rate": 4.090344521500631e-06, + "loss": 2.3121, + "step": 5640 + }, + { + "epoch": 0.3026287553648069, + "grad_norm": 1.3152016401290894, + "learning_rate": 4.09000932384996e-06, + "loss": 2.3966, + "step": 5641 + }, + { + "epoch": 0.3026824034334764, + "grad_norm": 1.356750249862671, + "learning_rate": 4.0896740781925105e-06, + "loss": 2.3248, + "step": 5642 + }, + { + "epoch": 0.30273605150214594, + "grad_norm": 1.4216879606246948, + "learning_rate": 4.089338784538405e-06, + "loss": 2.2088, + "step": 5643 + }, + { + "epoch": 0.3027896995708155, + "grad_norm": 1.228576898574829, + "learning_rate": 4.089003442897766e-06, + "loss": 2.2392, + "step": 5644 + }, + { + "epoch": 0.302843347639485, + "grad_norm": 1.3207995891571045, + "learning_rate": 4.088668053280718e-06, + "loss": 2.1491, + "step": 5645 + }, + { + "epoch": 0.3028969957081545, + "grad_norm": 2.3389368057250977, + "learning_rate": 4.08833261569739e-06, + "loss": 2.2898, + "step": 5646 + }, + { + "epoch": 0.302950643776824, + "grad_norm": 2.9603917598724365, + "learning_rate": 4.087997130157907e-06, + "loss": 2.3582, + "step": 5647 + }, + { + "epoch": 0.30300429184549355, + "grad_norm": 1.287766933441162, + "learning_rate": 4.087661596672398e-06, + "loss": 2.1757, + "step": 5648 + }, + { + "epoch": 0.3030579399141631, + "grad_norm": 1.5170120000839233, + "learning_rate": 4.087326015250998e-06, + "loss": 2.3347, + "step": 5649 + }, + { + "epoch": 0.3031115879828326, + "grad_norm": 1.5668004751205444, + "learning_rate": 4.0869903859038335e-06, + "loss": 2.4146, + "step": 5650 + }, + { + "epoch": 0.30316523605150214, + "grad_norm": 1.5377253293991089, + "learning_rate": 4.086654708641042e-06, + "loss": 2.386, + "step": 5651 + }, + { + "epoch": 0.3032188841201717, + "grad_norm": 1.759872555732727, + "learning_rate": 4.086318983472756e-06, + "loss": 2.3794, + "step": 5652 + }, + { + "epoch": 0.3032725321888412, + "grad_norm": 1.3844619989395142, + "learning_rate": 4.085983210409114e-06, + "loss": 2.3369, + "step": 5653 + }, + { + "epoch": 0.30332618025751074, + "grad_norm": 3.7982306480407715, + "learning_rate": 4.085647389460253e-06, + "loss": 2.1267, + "step": 5654 + }, + { + "epoch": 0.30337982832618027, + "grad_norm": 1.312081217765808, + "learning_rate": 4.085311520636312e-06, + "loss": 2.241, + "step": 5655 + }, + { + "epoch": 0.3034334763948498, + "grad_norm": 1.5679099559783936, + "learning_rate": 4.084975603947433e-06, + "loss": 1.5233, + "step": 5656 + }, + { + "epoch": 0.30348712446351933, + "grad_norm": 1.2718433141708374, + "learning_rate": 4.084639639403757e-06, + "loss": 2.2847, + "step": 5657 + }, + { + "epoch": 0.30354077253218886, + "grad_norm": 1.1442599296569824, + "learning_rate": 4.084303627015428e-06, + "loss": 2.4316, + "step": 5658 + }, + { + "epoch": 0.3035944206008584, + "grad_norm": 1.3210723400115967, + "learning_rate": 4.083967566792591e-06, + "loss": 2.1727, + "step": 5659 + }, + { + "epoch": 0.30364806866952787, + "grad_norm": 1.2769922018051147, + "learning_rate": 4.083631458745394e-06, + "loss": 2.4704, + "step": 5660 + }, + { + "epoch": 0.3037017167381974, + "grad_norm": 1.55484938621521, + "learning_rate": 4.0832953028839835e-06, + "loss": 2.0157, + "step": 5661 + }, + { + "epoch": 0.30375536480686693, + "grad_norm": 1.3804036378860474, + "learning_rate": 4.082959099218509e-06, + "loss": 1.9013, + "step": 5662 + }, + { + "epoch": 0.30380901287553647, + "grad_norm": 1.5432641506195068, + "learning_rate": 4.082622847759122e-06, + "loss": 2.2115, + "step": 5663 + }, + { + "epoch": 0.303862660944206, + "grad_norm": 1.5573573112487793, + "learning_rate": 4.082286548515975e-06, + "loss": 2.3231, + "step": 5664 + }, + { + "epoch": 0.30391630901287553, + "grad_norm": 1.5517879724502563, + "learning_rate": 4.081950201499221e-06, + "loss": 2.4124, + "step": 5665 + }, + { + "epoch": 0.30396995708154506, + "grad_norm": 1.4378842115402222, + "learning_rate": 4.081613806719016e-06, + "loss": 2.1228, + "step": 5666 + }, + { + "epoch": 0.3040236051502146, + "grad_norm": 1.2293237447738647, + "learning_rate": 4.081277364185516e-06, + "loss": 2.2994, + "step": 5667 + }, + { + "epoch": 0.3040772532188841, + "grad_norm": 1.4004319906234741, + "learning_rate": 4.0809408739088804e-06, + "loss": 1.8178, + "step": 5668 + }, + { + "epoch": 0.30413090128755366, + "grad_norm": 1.2678675651550293, + "learning_rate": 4.0806043358992675e-06, + "loss": 2.1868, + "step": 5669 + }, + { + "epoch": 0.3041845493562232, + "grad_norm": 1.473337173461914, + "learning_rate": 4.080267750166839e-06, + "loss": 2.288, + "step": 5670 + }, + { + "epoch": 0.3042381974248927, + "grad_norm": 1.3962126970291138, + "learning_rate": 4.079931116721757e-06, + "loss": 2.3113, + "step": 5671 + }, + { + "epoch": 0.30429184549356225, + "grad_norm": 1.3885467052459717, + "learning_rate": 4.079594435574186e-06, + "loss": 2.1857, + "step": 5672 + }, + { + "epoch": 0.3043454935622318, + "grad_norm": 1.428124189376831, + "learning_rate": 4.07925770673429e-06, + "loss": 2.2443, + "step": 5673 + }, + { + "epoch": 0.30439914163090126, + "grad_norm": 1.6052621603012085, + "learning_rate": 4.078920930212237e-06, + "loss": 2.2199, + "step": 5674 + }, + { + "epoch": 0.3044527896995708, + "grad_norm": 1.308228850364685, + "learning_rate": 4.078584106018194e-06, + "loss": 2.422, + "step": 5675 + }, + { + "epoch": 0.3045064377682403, + "grad_norm": 1.3035264015197754, + "learning_rate": 4.078247234162332e-06, + "loss": 2.2886, + "step": 5676 + }, + { + "epoch": 0.30456008583690986, + "grad_norm": 1.3524413108825684, + "learning_rate": 4.077910314654822e-06, + "loss": 2.382, + "step": 5677 + }, + { + "epoch": 0.3046137339055794, + "grad_norm": 1.377758502960205, + "learning_rate": 4.077573347505837e-06, + "loss": 2.4755, + "step": 5678 + }, + { + "epoch": 0.3046673819742489, + "grad_norm": 1.3649576902389526, + "learning_rate": 4.077236332725548e-06, + "loss": 2.3581, + "step": 5679 + }, + { + "epoch": 0.30472103004291845, + "grad_norm": 1.0945258140563965, + "learning_rate": 4.076899270324133e-06, + "loss": 2.3318, + "step": 5680 + }, + { + "epoch": 0.304774678111588, + "grad_norm": 1.6673932075500488, + "learning_rate": 4.076562160311769e-06, + "loss": 2.4576, + "step": 5681 + }, + { + "epoch": 0.3048283261802575, + "grad_norm": 1.3988454341888428, + "learning_rate": 4.0762250026986335e-06, + "loss": 2.2234, + "step": 5682 + }, + { + "epoch": 0.30488197424892705, + "grad_norm": 1.5207840204238892, + "learning_rate": 4.075887797494906e-06, + "loss": 2.4083, + "step": 5683 + }, + { + "epoch": 0.3049356223175966, + "grad_norm": 1.3344876766204834, + "learning_rate": 4.075550544710768e-06, + "loss": 2.052, + "step": 5684 + }, + { + "epoch": 0.3049892703862661, + "grad_norm": 1.51613187789917, + "learning_rate": 4.075213244356402e-06, + "loss": 2.3433, + "step": 5685 + }, + { + "epoch": 0.30504291845493564, + "grad_norm": 1.394096851348877, + "learning_rate": 4.074875896441992e-06, + "loss": 2.2466, + "step": 5686 + }, + { + "epoch": 0.3050965665236052, + "grad_norm": 1.5377808809280396, + "learning_rate": 4.074538500977725e-06, + "loss": 2.1811, + "step": 5687 + }, + { + "epoch": 0.3051502145922747, + "grad_norm": 1.4722511768341064, + "learning_rate": 4.074201057973785e-06, + "loss": 2.1854, + "step": 5688 + }, + { + "epoch": 0.3052038626609442, + "grad_norm": 1.3650801181793213, + "learning_rate": 4.073863567440363e-06, + "loss": 2.3581, + "step": 5689 + }, + { + "epoch": 0.3052575107296137, + "grad_norm": 1.5257960557937622, + "learning_rate": 4.073526029387646e-06, + "loss": 2.496, + "step": 5690 + }, + { + "epoch": 0.30531115879828324, + "grad_norm": 1.2375541925430298, + "learning_rate": 4.073188443825828e-06, + "loss": 2.3067, + "step": 5691 + }, + { + "epoch": 0.3053648068669528, + "grad_norm": 1.2095866203308105, + "learning_rate": 4.0728508107651e-06, + "loss": 2.1031, + "step": 5692 + }, + { + "epoch": 0.3054184549356223, + "grad_norm": 1.3011871576309204, + "learning_rate": 4.072513130215656e-06, + "loss": 2.1826, + "step": 5693 + }, + { + "epoch": 0.30547210300429184, + "grad_norm": 1.6190072298049927, + "learning_rate": 4.072175402187693e-06, + "loss": 2.2151, + "step": 5694 + }, + { + "epoch": 0.30552575107296137, + "grad_norm": 1.4324859380722046, + "learning_rate": 4.071837626691407e-06, + "loss": 2.2205, + "step": 5695 + }, + { + "epoch": 0.3055793991416309, + "grad_norm": 1.6129558086395264, + "learning_rate": 4.071499803736996e-06, + "loss": 2.0829, + "step": 5696 + }, + { + "epoch": 0.30563304721030043, + "grad_norm": 1.468011736869812, + "learning_rate": 4.071161933334659e-06, + "loss": 2.1771, + "step": 5697 + }, + { + "epoch": 0.30568669527896997, + "grad_norm": 1.4367543458938599, + "learning_rate": 4.0708240154946e-06, + "loss": 2.3685, + "step": 5698 + }, + { + "epoch": 0.3057403433476395, + "grad_norm": 1.3240309953689575, + "learning_rate": 4.0704860502270195e-06, + "loss": 2.1819, + "step": 5699 + }, + { + "epoch": 0.30579399141630903, + "grad_norm": 1.4401029348373413, + "learning_rate": 4.070148037542123e-06, + "loss": 2.5753, + "step": 5700 + }, + { + "epoch": 0.30584763948497856, + "grad_norm": 1.3353359699249268, + "learning_rate": 4.069809977450115e-06, + "loss": 2.2036, + "step": 5701 + }, + { + "epoch": 0.3059012875536481, + "grad_norm": 8.862578392028809, + "learning_rate": 4.069471869961201e-06, + "loss": 2.4889, + "step": 5702 + }, + { + "epoch": 0.30595493562231757, + "grad_norm": 1.4887027740478516, + "learning_rate": 4.069133715085592e-06, + "loss": 2.3862, + "step": 5703 + }, + { + "epoch": 0.3060085836909871, + "grad_norm": 1.4874505996704102, + "learning_rate": 4.068795512833498e-06, + "loss": 2.2409, + "step": 5704 + }, + { + "epoch": 0.30606223175965663, + "grad_norm": 1.4580141305923462, + "learning_rate": 4.0684572632151275e-06, + "loss": 2.1604, + "step": 5705 + }, + { + "epoch": 0.30611587982832617, + "grad_norm": 1.524873971939087, + "learning_rate": 4.068118966240696e-06, + "loss": 2.269, + "step": 5706 + }, + { + "epoch": 0.3061695278969957, + "grad_norm": 1.241546630859375, + "learning_rate": 4.067780621920416e-06, + "loss": 2.2097, + "step": 5707 + }, + { + "epoch": 0.30622317596566523, + "grad_norm": 1.2779959440231323, + "learning_rate": 4.067442230264503e-06, + "loss": 2.1691, + "step": 5708 + }, + { + "epoch": 0.30627682403433476, + "grad_norm": 1.238029956817627, + "learning_rate": 4.067103791283175e-06, + "loss": 2.2345, + "step": 5709 + }, + { + "epoch": 0.3063304721030043, + "grad_norm": 3.3578290939331055, + "learning_rate": 4.06676530498665e-06, + "loss": 2.2526, + "step": 5710 + }, + { + "epoch": 0.3063841201716738, + "grad_norm": 1.8842111825942993, + "learning_rate": 4.066426771385149e-06, + "loss": 2.26, + "step": 5711 + }, + { + "epoch": 0.30643776824034336, + "grad_norm": 1.3308213949203491, + "learning_rate": 4.06608819048889e-06, + "loss": 1.9113, + "step": 5712 + }, + { + "epoch": 0.3064914163090129, + "grad_norm": 1.3513343334197998, + "learning_rate": 4.0657495623081e-06, + "loss": 2.4674, + "step": 5713 + }, + { + "epoch": 0.3065450643776824, + "grad_norm": 1.437351107597351, + "learning_rate": 4.0654108868529986e-06, + "loss": 2.0762, + "step": 5714 + }, + { + "epoch": 0.30659871244635195, + "grad_norm": 1.49918794631958, + "learning_rate": 4.065072164133815e-06, + "loss": 2.2571, + "step": 5715 + }, + { + "epoch": 0.3066523605150215, + "grad_norm": 1.4321484565734863, + "learning_rate": 4.064733394160774e-06, + "loss": 2.4531, + "step": 5716 + }, + { + "epoch": 0.306706008583691, + "grad_norm": 1.316144585609436, + "learning_rate": 4.064394576944105e-06, + "loss": 2.3245, + "step": 5717 + }, + { + "epoch": 0.3067596566523605, + "grad_norm": 1.4634536504745483, + "learning_rate": 4.064055712494038e-06, + "loss": 2.4015, + "step": 5718 + }, + { + "epoch": 0.30681330472103, + "grad_norm": 1.3908226490020752, + "learning_rate": 4.0637168008208026e-06, + "loss": 2.2876, + "step": 5719 + }, + { + "epoch": 0.30686695278969955, + "grad_norm": 1.3373477458953857, + "learning_rate": 4.063377841934633e-06, + "loss": 2.2032, + "step": 5720 + }, + { + "epoch": 0.3069206008583691, + "grad_norm": 1.3989357948303223, + "learning_rate": 4.063038835845764e-06, + "loss": 2.2226, + "step": 5721 + }, + { + "epoch": 0.3069742489270386, + "grad_norm": 1.3497830629348755, + "learning_rate": 4.062699782564431e-06, + "loss": 2.4327, + "step": 5722 + }, + { + "epoch": 0.30702789699570815, + "grad_norm": 2.25803279876709, + "learning_rate": 4.062360682100869e-06, + "loss": 2.4143, + "step": 5723 + }, + { + "epoch": 0.3070815450643777, + "grad_norm": 1.391983985900879, + "learning_rate": 4.0620215344653165e-06, + "loss": 2.1658, + "step": 5724 + }, + { + "epoch": 0.3071351931330472, + "grad_norm": 1.2796801328659058, + "learning_rate": 4.061682339668016e-06, + "loss": 2.1647, + "step": 5725 + }, + { + "epoch": 0.30718884120171674, + "grad_norm": 1.456968903541565, + "learning_rate": 4.061343097719206e-06, + "loss": 2.3031, + "step": 5726 + }, + { + "epoch": 0.3072424892703863, + "grad_norm": 1.5105525255203247, + "learning_rate": 4.06100380862913e-06, + "loss": 2.4157, + "step": 5727 + }, + { + "epoch": 0.3072961373390558, + "grad_norm": 1.7286176681518555, + "learning_rate": 4.060664472408033e-06, + "loss": 2.3589, + "step": 5728 + }, + { + "epoch": 0.30734978540772534, + "grad_norm": 1.6165162324905396, + "learning_rate": 4.06032508906616e-06, + "loss": 2.5343, + "step": 5729 + }, + { + "epoch": 0.30740343347639487, + "grad_norm": 1.4612947702407837, + "learning_rate": 4.059985658613757e-06, + "loss": 2.2766, + "step": 5730 + }, + { + "epoch": 0.3074570815450644, + "grad_norm": 1.519787311553955, + "learning_rate": 4.059646181061073e-06, + "loss": 2.7198, + "step": 5731 + }, + { + "epoch": 0.3075107296137339, + "grad_norm": 1.405494213104248, + "learning_rate": 4.059306656418359e-06, + "loss": 2.501, + "step": 5732 + }, + { + "epoch": 0.3075643776824034, + "grad_norm": 1.4190503358840942, + "learning_rate": 4.058967084695864e-06, + "loss": 2.3325, + "step": 5733 + }, + { + "epoch": 0.30761802575107294, + "grad_norm": 1.240964412689209, + "learning_rate": 4.058627465903841e-06, + "loss": 1.9851, + "step": 5734 + }, + { + "epoch": 0.3076716738197425, + "grad_norm": 1.3014048337936401, + "learning_rate": 4.058287800052546e-06, + "loss": 2.064, + "step": 5735 + }, + { + "epoch": 0.307725321888412, + "grad_norm": 1.541025161743164, + "learning_rate": 4.057948087152232e-06, + "loss": 2.2684, + "step": 5736 + }, + { + "epoch": 0.30777896995708154, + "grad_norm": 1.2527133226394653, + "learning_rate": 4.057608327213157e-06, + "loss": 2.1079, + "step": 5737 + }, + { + "epoch": 0.30783261802575107, + "grad_norm": 1.2668806314468384, + "learning_rate": 4.05726852024558e-06, + "loss": 2.2924, + "step": 5738 + }, + { + "epoch": 0.3078862660944206, + "grad_norm": 1.5482497215270996, + "learning_rate": 4.056928666259759e-06, + "loss": 2.2238, + "step": 5739 + }, + { + "epoch": 0.30793991416309013, + "grad_norm": 1.470178484916687, + "learning_rate": 4.056588765265957e-06, + "loss": 2.3004, + "step": 5740 + }, + { + "epoch": 0.30799356223175967, + "grad_norm": 1.5532933473587036, + "learning_rate": 4.056248817274435e-06, + "loss": 2.1986, + "step": 5741 + }, + { + "epoch": 0.3080472103004292, + "grad_norm": 1.2718422412872314, + "learning_rate": 4.055908822295458e-06, + "loss": 1.9929, + "step": 5742 + }, + { + "epoch": 0.30810085836909873, + "grad_norm": 1.403761625289917, + "learning_rate": 4.0555687803392905e-06, + "loss": 2.0681, + "step": 5743 + }, + { + "epoch": 0.30815450643776826, + "grad_norm": 1.355272889137268, + "learning_rate": 4.055228691416201e-06, + "loss": 2.4478, + "step": 5744 + }, + { + "epoch": 0.3082081545064378, + "grad_norm": 1.4263219833374023, + "learning_rate": 4.054888555536456e-06, + "loss": 2.161, + "step": 5745 + }, + { + "epoch": 0.30826180257510727, + "grad_norm": 1.6973152160644531, + "learning_rate": 4.054548372710325e-06, + "loss": 2.0335, + "step": 5746 + }, + { + "epoch": 0.3083154506437768, + "grad_norm": 1.5490440130233765, + "learning_rate": 4.05420814294808e-06, + "loss": 2.2939, + "step": 5747 + }, + { + "epoch": 0.30836909871244633, + "grad_norm": 1.657361626625061, + "learning_rate": 4.053867866259994e-06, + "loss": 2.3126, + "step": 5748 + }, + { + "epoch": 0.30842274678111586, + "grad_norm": 1.3809483051300049, + "learning_rate": 4.053527542656339e-06, + "loss": 2.1374, + "step": 5749 + }, + { + "epoch": 0.3084763948497854, + "grad_norm": 1.4597104787826538, + "learning_rate": 4.053187172147391e-06, + "loss": 2.2426, + "step": 5750 + }, + { + "epoch": 0.3085300429184549, + "grad_norm": 1.2546296119689941, + "learning_rate": 4.0528467547434285e-06, + "loss": 2.1674, + "step": 5751 + }, + { + "epoch": 0.30858369098712446, + "grad_norm": 1.3623522520065308, + "learning_rate": 4.052506290454728e-06, + "loss": 2.3892, + "step": 5752 + }, + { + "epoch": 0.308637339055794, + "grad_norm": 1.3201828002929688, + "learning_rate": 4.0521657792915695e-06, + "loss": 2.4609, + "step": 5753 + }, + { + "epoch": 0.3086909871244635, + "grad_norm": 1.5300309658050537, + "learning_rate": 4.051825221264233e-06, + "loss": 2.293, + "step": 5754 + }, + { + "epoch": 0.30874463519313305, + "grad_norm": 1.4522902965545654, + "learning_rate": 4.051484616383002e-06, + "loss": 1.4267, + "step": 5755 + }, + { + "epoch": 0.3087982832618026, + "grad_norm": 1.3857709169387817, + "learning_rate": 4.0511439646581606e-06, + "loss": 2.3072, + "step": 5756 + }, + { + "epoch": 0.3088519313304721, + "grad_norm": 1.4904454946517944, + "learning_rate": 4.050803266099993e-06, + "loss": 2.3607, + "step": 5757 + }, + { + "epoch": 0.30890557939914165, + "grad_norm": 1.4489847421646118, + "learning_rate": 4.050462520718786e-06, + "loss": 2.0942, + "step": 5758 + }, + { + "epoch": 0.3089592274678112, + "grad_norm": 1.3150558471679688, + "learning_rate": 4.050121728524829e-06, + "loss": 2.0241, + "step": 5759 + }, + { + "epoch": 0.3090128755364807, + "grad_norm": 1.3363829851150513, + "learning_rate": 4.049780889528408e-06, + "loss": 2.3521, + "step": 5760 + }, + { + "epoch": 0.3090665236051502, + "grad_norm": 1.5047639608383179, + "learning_rate": 4.049440003739818e-06, + "loss": 2.2508, + "step": 5761 + }, + { + "epoch": 0.3091201716738197, + "grad_norm": 1.5026495456695557, + "learning_rate": 4.04909907116935e-06, + "loss": 2.473, + "step": 5762 + }, + { + "epoch": 0.30917381974248925, + "grad_norm": 12.407692909240723, + "learning_rate": 4.048758091827296e-06, + "loss": 1.6303, + "step": 5763 + }, + { + "epoch": 0.3092274678111588, + "grad_norm": 1.3768272399902344, + "learning_rate": 4.048417065723953e-06, + "loss": 2.0844, + "step": 5764 + }, + { + "epoch": 0.3092811158798283, + "grad_norm": 1.3312957286834717, + "learning_rate": 4.048075992869617e-06, + "loss": 2.2548, + "step": 5765 + }, + { + "epoch": 0.30933476394849785, + "grad_norm": 1.3554630279541016, + "learning_rate": 4.047734873274586e-06, + "loss": 2.2757, + "step": 5766 + }, + { + "epoch": 0.3093884120171674, + "grad_norm": 1.4638768434524536, + "learning_rate": 4.047393706949157e-06, + "loss": 2.3577, + "step": 5767 + }, + { + "epoch": 0.3094420600858369, + "grad_norm": 1.2715502977371216, + "learning_rate": 4.047052493903635e-06, + "loss": 2.1725, + "step": 5768 + }, + { + "epoch": 0.30949570815450644, + "grad_norm": 1.6652662754058838, + "learning_rate": 4.046711234148319e-06, + "loss": 2.3677, + "step": 5769 + }, + { + "epoch": 0.309549356223176, + "grad_norm": 1.4805527925491333, + "learning_rate": 4.046369927693516e-06, + "loss": 2.3588, + "step": 5770 + }, + { + "epoch": 0.3096030042918455, + "grad_norm": 1.184088110923767, + "learning_rate": 4.046028574549526e-06, + "loss": 2.046, + "step": 5771 + }, + { + "epoch": 0.30965665236051504, + "grad_norm": 1.2991359233856201, + "learning_rate": 4.045687174726659e-06, + "loss": 2.4311, + "step": 5772 + }, + { + "epoch": 0.30971030042918457, + "grad_norm": 1.672582983970642, + "learning_rate": 4.045345728235221e-06, + "loss": 2.1534, + "step": 5773 + }, + { + "epoch": 0.3097639484978541, + "grad_norm": 1.4412033557891846, + "learning_rate": 4.045004235085522e-06, + "loss": 2.2528, + "step": 5774 + }, + { + "epoch": 0.3098175965665236, + "grad_norm": 1.339589238166809, + "learning_rate": 4.044662695287874e-06, + "loss": 2.2901, + "step": 5775 + }, + { + "epoch": 0.3098712446351931, + "grad_norm": 1.4800071716308594, + "learning_rate": 4.044321108852587e-06, + "loss": 2.3993, + "step": 5776 + }, + { + "epoch": 0.30992489270386264, + "grad_norm": 1.31625235080719, + "learning_rate": 4.043979475789974e-06, + "loss": 1.965, + "step": 5777 + }, + { + "epoch": 0.3099785407725322, + "grad_norm": 1.567817211151123, + "learning_rate": 4.043637796110352e-06, + "loss": 2.3428, + "step": 5778 + }, + { + "epoch": 0.3100321888412017, + "grad_norm": 1.4252607822418213, + "learning_rate": 4.043296069824037e-06, + "loss": 2.2329, + "step": 5779 + }, + { + "epoch": 0.31008583690987124, + "grad_norm": 1.2999069690704346, + "learning_rate": 4.042954296941345e-06, + "loss": 2.3689, + "step": 5780 + }, + { + "epoch": 0.31013948497854077, + "grad_norm": 1.5831931829452515, + "learning_rate": 4.0426124774725956e-06, + "loss": 2.6068, + "step": 5781 + }, + { + "epoch": 0.3101931330472103, + "grad_norm": 1.4944424629211426, + "learning_rate": 4.0422706114281096e-06, + "loss": 2.2937, + "step": 5782 + }, + { + "epoch": 0.31024678111587983, + "grad_norm": 1.3514009714126587, + "learning_rate": 4.041928698818209e-06, + "loss": 2.3681, + "step": 5783 + }, + { + "epoch": 0.31030042918454936, + "grad_norm": 1.5957690477371216, + "learning_rate": 4.041586739653218e-06, + "loss": 2.262, + "step": 5784 + }, + { + "epoch": 0.3103540772532189, + "grad_norm": 1.2952711582183838, + "learning_rate": 4.04124473394346e-06, + "loss": 2.6662, + "step": 5785 + }, + { + "epoch": 0.31040772532188843, + "grad_norm": 1.2392678260803223, + "learning_rate": 4.04090268169926e-06, + "loss": 2.1188, + "step": 5786 + }, + { + "epoch": 0.31046137339055796, + "grad_norm": 1.2619109153747559, + "learning_rate": 4.040560582930948e-06, + "loss": 2.1334, + "step": 5787 + }, + { + "epoch": 0.3105150214592275, + "grad_norm": 1.4918471574783325, + "learning_rate": 4.0402184376488515e-06, + "loss": 2.3952, + "step": 5788 + }, + { + "epoch": 0.310568669527897, + "grad_norm": 1.4112993478775024, + "learning_rate": 4.039876245863302e-06, + "loss": 2.1965, + "step": 5789 + }, + { + "epoch": 0.3106223175965665, + "grad_norm": 1.2389898300170898, + "learning_rate": 4.039534007584629e-06, + "loss": 1.968, + "step": 5790 + }, + { + "epoch": 0.31067596566523603, + "grad_norm": 1.4373563528060913, + "learning_rate": 4.039191722823167e-06, + "loss": 2.4376, + "step": 5791 + }, + { + "epoch": 0.31072961373390556, + "grad_norm": 1.296429991722107, + "learning_rate": 4.0388493915892515e-06, + "loss": 2.4461, + "step": 5792 + }, + { + "epoch": 0.3107832618025751, + "grad_norm": 1.6524708271026611, + "learning_rate": 4.038507013893217e-06, + "loss": 2.3492, + "step": 5793 + }, + { + "epoch": 0.3108369098712446, + "grad_norm": 1.7744450569152832, + "learning_rate": 4.038164589745401e-06, + "loss": 2.1254, + "step": 5794 + }, + { + "epoch": 0.31089055793991416, + "grad_norm": 1.22013521194458, + "learning_rate": 4.037822119156142e-06, + "loss": 1.9388, + "step": 5795 + }, + { + "epoch": 0.3109442060085837, + "grad_norm": 1.4393178224563599, + "learning_rate": 4.037479602135781e-06, + "loss": 2.5525, + "step": 5796 + }, + { + "epoch": 0.3109978540772532, + "grad_norm": 1.7135732173919678, + "learning_rate": 4.03713703869466e-06, + "loss": 2.4624, + "step": 5797 + }, + { + "epoch": 0.31105150214592275, + "grad_norm": 1.4786226749420166, + "learning_rate": 4.036794428843119e-06, + "loss": 2.5189, + "step": 5798 + }, + { + "epoch": 0.3111051502145923, + "grad_norm": 1.265604853630066, + "learning_rate": 4.036451772591506e-06, + "loss": 2.0573, + "step": 5799 + }, + { + "epoch": 0.3111587982832618, + "grad_norm": 1.4307684898376465, + "learning_rate": 4.036109069950165e-06, + "loss": 2.1028, + "step": 5800 + }, + { + "epoch": 0.31121244635193135, + "grad_norm": 1.3537698984146118, + "learning_rate": 4.035766320929443e-06, + "loss": 2.1349, + "step": 5801 + }, + { + "epoch": 0.3112660944206009, + "grad_norm": 1.3925491571426392, + "learning_rate": 4.035423525539689e-06, + "loss": 2.2663, + "step": 5802 + }, + { + "epoch": 0.3113197424892704, + "grad_norm": 1.3359932899475098, + "learning_rate": 4.0350806837912525e-06, + "loss": 2.2205, + "step": 5803 + }, + { + "epoch": 0.3113733905579399, + "grad_norm": 1.4050886631011963, + "learning_rate": 4.034737795694485e-06, + "loss": 2.2089, + "step": 5804 + }, + { + "epoch": 0.3114270386266094, + "grad_norm": 1.6069879531860352, + "learning_rate": 4.03439486125974e-06, + "loss": 2.3198, + "step": 5805 + }, + { + "epoch": 0.31148068669527895, + "grad_norm": 1.3600398302078247, + "learning_rate": 4.03405188049737e-06, + "loss": 2.3991, + "step": 5806 + }, + { + "epoch": 0.3115343347639485, + "grad_norm": 1.471459150314331, + "learning_rate": 4.033708853417733e-06, + "loss": 2.0859, + "step": 5807 + }, + { + "epoch": 0.311587982832618, + "grad_norm": 1.3663729429244995, + "learning_rate": 4.033365780031183e-06, + "loss": 2.1692, + "step": 5808 + }, + { + "epoch": 0.31164163090128755, + "grad_norm": 1.4209940433502197, + "learning_rate": 4.03302266034808e-06, + "loss": 2.3524, + "step": 5809 + }, + { + "epoch": 0.3116952789699571, + "grad_norm": 1.255976915359497, + "learning_rate": 4.032679494378784e-06, + "loss": 2.2791, + "step": 5810 + }, + { + "epoch": 0.3117489270386266, + "grad_norm": 1.2183526754379272, + "learning_rate": 4.0323362821336555e-06, + "loss": 2.5359, + "step": 5811 + }, + { + "epoch": 0.31180257510729614, + "grad_norm": 1.2164422273635864, + "learning_rate": 4.0319930236230566e-06, + "loss": 2.1737, + "step": 5812 + }, + { + "epoch": 0.3118562231759657, + "grad_norm": 1.1526315212249756, + "learning_rate": 4.031649718857352e-06, + "loss": 1.872, + "step": 5813 + }, + { + "epoch": 0.3119098712446352, + "grad_norm": 1.5897020101547241, + "learning_rate": 4.031306367846908e-06, + "loss": 2.3555, + "step": 5814 + }, + { + "epoch": 0.31196351931330474, + "grad_norm": 1.4817225933074951, + "learning_rate": 4.030962970602089e-06, + "loss": 2.1792, + "step": 5815 + }, + { + "epoch": 0.31201716738197427, + "grad_norm": 2.483764410018921, + "learning_rate": 4.030619527133265e-06, + "loss": 2.1867, + "step": 5816 + }, + { + "epoch": 0.3120708154506438, + "grad_norm": 1.7333160638809204, + "learning_rate": 4.030276037450804e-06, + "loss": 2.2054, + "step": 5817 + }, + { + "epoch": 0.3121244635193133, + "grad_norm": 1.3606500625610352, + "learning_rate": 4.029932501565078e-06, + "loss": 2.0804, + "step": 5818 + }, + { + "epoch": 0.3121781115879828, + "grad_norm": 1.3906664848327637, + "learning_rate": 4.029588919486459e-06, + "loss": 2.2304, + "step": 5819 + }, + { + "epoch": 0.31223175965665234, + "grad_norm": 1.3337547779083252, + "learning_rate": 4.02924529122532e-06, + "loss": 2.253, + "step": 5820 + }, + { + "epoch": 0.3122854077253219, + "grad_norm": 1.2399014234542847, + "learning_rate": 4.028901616792037e-06, + "loss": 2.2987, + "step": 5821 + }, + { + "epoch": 0.3123390557939914, + "grad_norm": 1.1811914443969727, + "learning_rate": 4.028557896196986e-06, + "loss": 2.1655, + "step": 5822 + }, + { + "epoch": 0.31239270386266094, + "grad_norm": 1.4651488065719604, + "learning_rate": 4.028214129450546e-06, + "loss": 2.1211, + "step": 5823 + }, + { + "epoch": 0.31244635193133047, + "grad_norm": 1.2192994356155396, + "learning_rate": 4.027870316563095e-06, + "loss": 2.182, + "step": 5824 + }, + { + "epoch": 0.3125, + "grad_norm": 1.2199244499206543, + "learning_rate": 4.027526457545015e-06, + "loss": 2.269, + "step": 5825 + }, + { + "epoch": 0.31255364806866953, + "grad_norm": 1.5545547008514404, + "learning_rate": 4.0271825524066855e-06, + "loss": 2.4972, + "step": 5826 + }, + { + "epoch": 0.31260729613733906, + "grad_norm": 1.2338107824325562, + "learning_rate": 4.026838601158492e-06, + "loss": 2.2039, + "step": 5827 + }, + { + "epoch": 0.3126609442060086, + "grad_norm": 2.1778085231781006, + "learning_rate": 4.026494603810818e-06, + "loss": 2.2947, + "step": 5828 + }, + { + "epoch": 0.3127145922746781, + "grad_norm": 1.0893511772155762, + "learning_rate": 4.026150560374052e-06, + "loss": 2.2279, + "step": 5829 + }, + { + "epoch": 0.31276824034334766, + "grad_norm": 1.3846956491470337, + "learning_rate": 4.02580647085858e-06, + "loss": 2.4881, + "step": 5830 + }, + { + "epoch": 0.3128218884120172, + "grad_norm": 1.4905753135681152, + "learning_rate": 4.02546233527479e-06, + "loss": 2.4911, + "step": 5831 + }, + { + "epoch": 0.3128755364806867, + "grad_norm": 1.3309577703475952, + "learning_rate": 4.025118153633075e-06, + "loss": 2.2064, + "step": 5832 + }, + { + "epoch": 0.3129291845493562, + "grad_norm": 1.569792628288269, + "learning_rate": 4.024773925943825e-06, + "loss": 2.3787, + "step": 5833 + }, + { + "epoch": 0.31298283261802573, + "grad_norm": 1.3819103240966797, + "learning_rate": 4.0244296522174346e-06, + "loss": 2.4355, + "step": 5834 + }, + { + "epoch": 0.31303648068669526, + "grad_norm": 1.5207468271255493, + "learning_rate": 4.024085332464296e-06, + "loss": 2.2475, + "step": 5835 + }, + { + "epoch": 0.3130901287553648, + "grad_norm": 1.3282800912857056, + "learning_rate": 4.023740966694807e-06, + "loss": 2.2047, + "step": 5836 + }, + { + "epoch": 0.3131437768240343, + "grad_norm": 1.4877768754959106, + "learning_rate": 4.023396554919364e-06, + "loss": 2.2875, + "step": 5837 + }, + { + "epoch": 0.31319742489270386, + "grad_norm": 1.2206835746765137, + "learning_rate": 4.023052097148367e-06, + "loss": 2.1213, + "step": 5838 + }, + { + "epoch": 0.3132510729613734, + "grad_norm": 1.5885003805160522, + "learning_rate": 4.022707593392216e-06, + "loss": 1.9955, + "step": 5839 + }, + { + "epoch": 0.3133047210300429, + "grad_norm": 1.4760076999664307, + "learning_rate": 4.0223630436613105e-06, + "loss": 2.223, + "step": 5840 + }, + { + "epoch": 0.31335836909871245, + "grad_norm": 1.4097520112991333, + "learning_rate": 4.022018447966056e-06, + "loss": 2.3827, + "step": 5841 + }, + { + "epoch": 0.313412017167382, + "grad_norm": 1.3420275449752808, + "learning_rate": 4.021673806316855e-06, + "loss": 2.3218, + "step": 5842 + }, + { + "epoch": 0.3134656652360515, + "grad_norm": 1.2954903841018677, + "learning_rate": 4.021329118724113e-06, + "loss": 2.1752, + "step": 5843 + }, + { + "epoch": 0.31351931330472105, + "grad_norm": 1.3203480243682861, + "learning_rate": 4.0209843851982384e-06, + "loss": 2.4538, + "step": 5844 + }, + { + "epoch": 0.3135729613733906, + "grad_norm": 1.5920377969741821, + "learning_rate": 4.02063960574964e-06, + "loss": 2.1174, + "step": 5845 + }, + { + "epoch": 0.3136266094420601, + "grad_norm": 1.431911587715149, + "learning_rate": 4.020294780388726e-06, + "loss": 2.4879, + "step": 5846 + }, + { + "epoch": 0.3136802575107296, + "grad_norm": 1.5101070404052734, + "learning_rate": 4.019949909125909e-06, + "loss": 2.2364, + "step": 5847 + }, + { + "epoch": 0.3137339055793991, + "grad_norm": 1.3755563497543335, + "learning_rate": 4.019604991971601e-06, + "loss": 2.3251, + "step": 5848 + }, + { + "epoch": 0.31378755364806865, + "grad_norm": 1.0476672649383545, + "learning_rate": 4.0192600289362146e-06, + "loss": 2.0875, + "step": 5849 + }, + { + "epoch": 0.3138412017167382, + "grad_norm": 1.3277875185012817, + "learning_rate": 4.018915020030168e-06, + "loss": 1.8944, + "step": 5850 + }, + { + "epoch": 0.3138948497854077, + "grad_norm": 1.2977263927459717, + "learning_rate": 4.018569965263876e-06, + "loss": 2.1835, + "step": 5851 + }, + { + "epoch": 0.31394849785407725, + "grad_norm": 1.2878985404968262, + "learning_rate": 4.018224864647759e-06, + "loss": 2.3485, + "step": 5852 + }, + { + "epoch": 0.3140021459227468, + "grad_norm": 1.526236653327942, + "learning_rate": 4.017879718192232e-06, + "loss": 2.2063, + "step": 5853 + }, + { + "epoch": 0.3140557939914163, + "grad_norm": 1.3673596382141113, + "learning_rate": 4.017534525907721e-06, + "loss": 2.3833, + "step": 5854 + }, + { + "epoch": 0.31410944206008584, + "grad_norm": 1.2280751466751099, + "learning_rate": 4.017189287804646e-06, + "loss": 2.0168, + "step": 5855 + }, + { + "epoch": 0.3141630901287554, + "grad_norm": 2.0710325241088867, + "learning_rate": 4.01684400389343e-06, + "loss": 2.3255, + "step": 5856 + }, + { + "epoch": 0.3142167381974249, + "grad_norm": 1.266101360321045, + "learning_rate": 4.0164986741844995e-06, + "loss": 2.253, + "step": 5857 + }, + { + "epoch": 0.31427038626609444, + "grad_norm": 1.335493803024292, + "learning_rate": 4.01615329868828e-06, + "loss": 2.3138, + "step": 5858 + }, + { + "epoch": 0.31432403433476397, + "grad_norm": 1.2489944696426392, + "learning_rate": 4.0158078774152e-06, + "loss": 2.084, + "step": 5859 + }, + { + "epoch": 0.3143776824034335, + "grad_norm": 1.51082444190979, + "learning_rate": 4.015462410375688e-06, + "loss": 1.9733, + "step": 5860 + }, + { + "epoch": 0.314431330472103, + "grad_norm": 1.3965479135513306, + "learning_rate": 4.015116897580175e-06, + "loss": 2.2292, + "step": 5861 + }, + { + "epoch": 0.3144849785407725, + "grad_norm": 1.4490561485290527, + "learning_rate": 4.014771339039093e-06, + "loss": 2.3188, + "step": 5862 + }, + { + "epoch": 0.31453862660944204, + "grad_norm": 1.3344054222106934, + "learning_rate": 4.014425734762876e-06, + "loss": 2.0236, + "step": 5863 + }, + { + "epoch": 0.31459227467811157, + "grad_norm": 2.1611578464508057, + "learning_rate": 4.014080084761957e-06, + "loss": 2.104, + "step": 5864 + }, + { + "epoch": 0.3146459227467811, + "grad_norm": 1.3506325483322144, + "learning_rate": 4.013734389046774e-06, + "loss": 2.3767, + "step": 5865 + }, + { + "epoch": 0.31469957081545064, + "grad_norm": 1.3832772970199585, + "learning_rate": 4.013388647627764e-06, + "loss": 2.2306, + "step": 5866 + }, + { + "epoch": 0.31475321888412017, + "grad_norm": 1.505828619003296, + "learning_rate": 4.013042860515365e-06, + "loss": 2.3248, + "step": 5867 + }, + { + "epoch": 0.3148068669527897, + "grad_norm": 1.0858372449874878, + "learning_rate": 4.012697027720019e-06, + "loss": 2.2485, + "step": 5868 + }, + { + "epoch": 0.31486051502145923, + "grad_norm": 1.3615769147872925, + "learning_rate": 4.0123511492521656e-06, + "loss": 2.2551, + "step": 5869 + }, + { + "epoch": 0.31491416309012876, + "grad_norm": 1.3243950605392456, + "learning_rate": 4.0120052251222485e-06, + "loss": 2.1893, + "step": 5870 + }, + { + "epoch": 0.3149678111587983, + "grad_norm": 1.4365261793136597, + "learning_rate": 4.011659255340713e-06, + "loss": 2.2847, + "step": 5871 + }, + { + "epoch": 0.3150214592274678, + "grad_norm": 1.362168550491333, + "learning_rate": 4.011313239918005e-06, + "loss": 2.2393, + "step": 5872 + }, + { + "epoch": 0.31507510729613736, + "grad_norm": 1.286145567893982, + "learning_rate": 4.01096717886457e-06, + "loss": 2.187, + "step": 5873 + }, + { + "epoch": 0.3151287553648069, + "grad_norm": 1.3495314121246338, + "learning_rate": 4.010621072190858e-06, + "loss": 2.2984, + "step": 5874 + }, + { + "epoch": 0.3151824034334764, + "grad_norm": 1.3690388202667236, + "learning_rate": 4.010274919907318e-06, + "loss": 2.0026, + "step": 5875 + }, + { + "epoch": 0.3152360515021459, + "grad_norm": 1.5466806888580322, + "learning_rate": 4.0099287220244015e-06, + "loss": 1.9951, + "step": 5876 + }, + { + "epoch": 0.31528969957081543, + "grad_norm": 1.2617849111557007, + "learning_rate": 4.009582478552562e-06, + "loss": 2.0967, + "step": 5877 + }, + { + "epoch": 0.31534334763948496, + "grad_norm": 1.631239414215088, + "learning_rate": 4.009236189502253e-06, + "loss": 2.5626, + "step": 5878 + }, + { + "epoch": 0.3153969957081545, + "grad_norm": 1.104596495628357, + "learning_rate": 4.0088898548839285e-06, + "loss": 2.3171, + "step": 5879 + }, + { + "epoch": 0.315450643776824, + "grad_norm": 1.8215866088867188, + "learning_rate": 4.008543474708047e-06, + "loss": 2.3845, + "step": 5880 + }, + { + "epoch": 0.31550429184549356, + "grad_norm": 1.3353911638259888, + "learning_rate": 4.008197048985068e-06, + "loss": 2.3051, + "step": 5881 + }, + { + "epoch": 0.3155579399141631, + "grad_norm": 1.2554665803909302, + "learning_rate": 4.007850577725448e-06, + "loss": 2.1435, + "step": 5882 + }, + { + "epoch": 0.3156115879828326, + "grad_norm": 1.4089579582214355, + "learning_rate": 4.00750406093965e-06, + "loss": 1.8129, + "step": 5883 + }, + { + "epoch": 0.31566523605150215, + "grad_norm": 1.7016485929489136, + "learning_rate": 4.007157498638135e-06, + "loss": 2.1971, + "step": 5884 + }, + { + "epoch": 0.3157188841201717, + "grad_norm": 1.4625771045684814, + "learning_rate": 4.006810890831368e-06, + "loss": 2.0427, + "step": 5885 + }, + { + "epoch": 0.3157725321888412, + "grad_norm": 1.4263584613800049, + "learning_rate": 4.006464237529813e-06, + "loss": 2.2841, + "step": 5886 + }, + { + "epoch": 0.31582618025751075, + "grad_norm": 1.1856904029846191, + "learning_rate": 4.006117538743937e-06, + "loss": 2.0482, + "step": 5887 + }, + { + "epoch": 0.3158798283261803, + "grad_norm": 1.423359751701355, + "learning_rate": 4.005770794484206e-06, + "loss": 2.5615, + "step": 5888 + }, + { + "epoch": 0.3159334763948498, + "grad_norm": 1.1739780902862549, + "learning_rate": 4.005424004761092e-06, + "loss": 2.3036, + "step": 5889 + }, + { + "epoch": 0.3159871244635193, + "grad_norm": 1.1342941522598267, + "learning_rate": 4.005077169585064e-06, + "loss": 2.0499, + "step": 5890 + }, + { + "epoch": 0.3160407725321888, + "grad_norm": 1.464175820350647, + "learning_rate": 4.004730288966595e-06, + "loss": 2.2428, + "step": 5891 + }, + { + "epoch": 0.31609442060085835, + "grad_norm": 1.4919548034667969, + "learning_rate": 4.0043833629161565e-06, + "loss": 2.2819, + "step": 5892 + }, + { + "epoch": 0.3161480686695279, + "grad_norm": 1.2593588829040527, + "learning_rate": 4.0040363914442236e-06, + "loss": 2.3932, + "step": 5893 + }, + { + "epoch": 0.3162017167381974, + "grad_norm": 1.234706997871399, + "learning_rate": 4.003689374561274e-06, + "loss": 2.3346, + "step": 5894 + }, + { + "epoch": 0.31625536480686695, + "grad_norm": 1.384163498878479, + "learning_rate": 4.003342312277784e-06, + "loss": 2.3707, + "step": 5895 + }, + { + "epoch": 0.3163090128755365, + "grad_norm": 1.386602759361267, + "learning_rate": 4.002995204604231e-06, + "loss": 2.2301, + "step": 5896 + }, + { + "epoch": 0.316362660944206, + "grad_norm": 1.43043851852417, + "learning_rate": 4.002648051551098e-06, + "loss": 2.5513, + "step": 5897 + }, + { + "epoch": 0.31641630901287554, + "grad_norm": 1.6308279037475586, + "learning_rate": 4.002300853128864e-06, + "loss": 2.2549, + "step": 5898 + }, + { + "epoch": 0.3164699570815451, + "grad_norm": 1.3895539045333862, + "learning_rate": 4.0019536093480125e-06, + "loss": 2.2416, + "step": 5899 + }, + { + "epoch": 0.3165236051502146, + "grad_norm": 1.700250506401062, + "learning_rate": 4.001606320219028e-06, + "loss": 2.5458, + "step": 5900 + }, + { + "epoch": 0.31657725321888414, + "grad_norm": 1.4658989906311035, + "learning_rate": 4.001258985752397e-06, + "loss": 2.3321, + "step": 5901 + }, + { + "epoch": 0.31663090128755367, + "grad_norm": 1.748357892036438, + "learning_rate": 4.000911605958606e-06, + "loss": 2.2126, + "step": 5902 + }, + { + "epoch": 0.3166845493562232, + "grad_norm": 1.5841739177703857, + "learning_rate": 4.000564180848143e-06, + "loss": 2.2074, + "step": 5903 + }, + { + "epoch": 0.31673819742489273, + "grad_norm": 1.4803937673568726, + "learning_rate": 4.000216710431497e-06, + "loss": 2.269, + "step": 5904 + }, + { + "epoch": 0.3167918454935622, + "grad_norm": 1.2654374837875366, + "learning_rate": 3.99986919471916e-06, + "loss": 2.2899, + "step": 5905 + }, + { + "epoch": 0.31684549356223174, + "grad_norm": 1.568416714668274, + "learning_rate": 3.999521633721624e-06, + "loss": 2.4387, + "step": 5906 + }, + { + "epoch": 0.31689914163090127, + "grad_norm": 1.3928263187408447, + "learning_rate": 3.999174027449384e-06, + "loss": 2.2375, + "step": 5907 + }, + { + "epoch": 0.3169527896995708, + "grad_norm": 2.359147787094116, + "learning_rate": 3.998826375912934e-06, + "loss": 2.2673, + "step": 5908 + }, + { + "epoch": 0.31700643776824033, + "grad_norm": 1.3019367456436157, + "learning_rate": 3.998478679122771e-06, + "loss": 2.1931, + "step": 5909 + }, + { + "epoch": 0.31706008583690987, + "grad_norm": 1.2517238855361938, + "learning_rate": 3.998130937089393e-06, + "loss": 2.2614, + "step": 5910 + }, + { + "epoch": 0.3171137339055794, + "grad_norm": 1.3992232084274292, + "learning_rate": 3.997783149823298e-06, + "loss": 2.3909, + "step": 5911 + }, + { + "epoch": 0.31716738197424893, + "grad_norm": 1.3836698532104492, + "learning_rate": 3.997435317334989e-06, + "loss": 2.2474, + "step": 5912 + }, + { + "epoch": 0.31722103004291846, + "grad_norm": 1.3613141775131226, + "learning_rate": 3.9970874396349665e-06, + "loss": 2.3584, + "step": 5913 + }, + { + "epoch": 0.317274678111588, + "grad_norm": 1.322877287864685, + "learning_rate": 3.996739516733734e-06, + "loss": 2.3809, + "step": 5914 + }, + { + "epoch": 0.3173283261802575, + "grad_norm": 1.3105590343475342, + "learning_rate": 3.996391548641797e-06, + "loss": 2.2514, + "step": 5915 + }, + { + "epoch": 0.31738197424892706, + "grad_norm": 1.760127305984497, + "learning_rate": 3.99604353536966e-06, + "loss": 2.1715, + "step": 5916 + }, + { + "epoch": 0.3174356223175966, + "grad_norm": 1.4145069122314453, + "learning_rate": 3.995695476927833e-06, + "loss": 2.2401, + "step": 5917 + }, + { + "epoch": 0.3174892703862661, + "grad_norm": 1.537939190864563, + "learning_rate": 3.995347373326822e-06, + "loss": 2.12, + "step": 5918 + }, + { + "epoch": 0.3175429184549356, + "grad_norm": 1.5429917573928833, + "learning_rate": 3.99499922457714e-06, + "loss": 2.4453, + "step": 5919 + }, + { + "epoch": 0.31759656652360513, + "grad_norm": 1.3955916166305542, + "learning_rate": 3.9946510306892964e-06, + "loss": 2.2396, + "step": 5920 + }, + { + "epoch": 0.31765021459227466, + "grad_norm": 1.3247120380401611, + "learning_rate": 3.994302791673805e-06, + "loss": 2.1409, + "step": 5921 + }, + { + "epoch": 0.3177038626609442, + "grad_norm": 1.3299760818481445, + "learning_rate": 3.99395450754118e-06, + "loss": 2.3507, + "step": 5922 + }, + { + "epoch": 0.3177575107296137, + "grad_norm": 1.4263391494750977, + "learning_rate": 3.993606178301937e-06, + "loss": 2.0503, + "step": 5923 + }, + { + "epoch": 0.31781115879828326, + "grad_norm": 1.4661478996276855, + "learning_rate": 3.993257803966593e-06, + "loss": 2.2498, + "step": 5924 + }, + { + "epoch": 0.3178648068669528, + "grad_norm": 1.4244416952133179, + "learning_rate": 3.992909384545667e-06, + "loss": 2.142, + "step": 5925 + }, + { + "epoch": 0.3179184549356223, + "grad_norm": 1.4430783987045288, + "learning_rate": 3.992560920049679e-06, + "loss": 2.1494, + "step": 5926 + }, + { + "epoch": 0.31797210300429185, + "grad_norm": 1.3963559865951538, + "learning_rate": 3.9922124104891475e-06, + "loss": 2.1628, + "step": 5927 + }, + { + "epoch": 0.3180257510729614, + "grad_norm": 1.4241660833358765, + "learning_rate": 3.991863855874597e-06, + "loss": 2.2889, + "step": 5928 + }, + { + "epoch": 0.3180793991416309, + "grad_norm": 1.7290140390396118, + "learning_rate": 3.9915152562165525e-06, + "loss": 2.1768, + "step": 5929 + }, + { + "epoch": 0.31813304721030045, + "grad_norm": 1.344428300857544, + "learning_rate": 3.991166611525537e-06, + "loss": 2.0385, + "step": 5930 + }, + { + "epoch": 0.31818669527897, + "grad_norm": 1.4090670347213745, + "learning_rate": 3.990817921812078e-06, + "loss": 2.2898, + "step": 5931 + }, + { + "epoch": 0.3182403433476395, + "grad_norm": 1.1183990240097046, + "learning_rate": 3.990469187086703e-06, + "loss": 2.1362, + "step": 5932 + }, + { + "epoch": 0.318293991416309, + "grad_norm": 1.093375325202942, + "learning_rate": 3.990120407359942e-06, + "loss": 1.8282, + "step": 5933 + }, + { + "epoch": 0.3183476394849785, + "grad_norm": 1.215208649635315, + "learning_rate": 3.989771582642325e-06, + "loss": 2.1466, + "step": 5934 + }, + { + "epoch": 0.31840128755364805, + "grad_norm": 1.393731951713562, + "learning_rate": 3.989422712944384e-06, + "loss": 2.2016, + "step": 5935 + }, + { + "epoch": 0.3184549356223176, + "grad_norm": 1.3831143379211426, + "learning_rate": 3.9890737982766525e-06, + "loss": 1.4871, + "step": 5936 + }, + { + "epoch": 0.3185085836909871, + "grad_norm": 1.4719438552856445, + "learning_rate": 3.988724838649666e-06, + "loss": 2.2988, + "step": 5937 + }, + { + "epoch": 0.31856223175965664, + "grad_norm": 1.4514520168304443, + "learning_rate": 3.988375834073959e-06, + "loss": 2.1588, + "step": 5938 + }, + { + "epoch": 0.3186158798283262, + "grad_norm": 1.4457788467407227, + "learning_rate": 3.9880267845600696e-06, + "loss": 2.2785, + "step": 5939 + }, + { + "epoch": 0.3186695278969957, + "grad_norm": 1.3554222583770752, + "learning_rate": 3.9876776901185365e-06, + "loss": 2.1552, + "step": 5940 + }, + { + "epoch": 0.31872317596566524, + "grad_norm": 1.1484888792037964, + "learning_rate": 3.9873285507599e-06, + "loss": 2.1054, + "step": 5941 + }, + { + "epoch": 0.31877682403433477, + "grad_norm": 1.4687474966049194, + "learning_rate": 3.986979366494702e-06, + "loss": 2.2963, + "step": 5942 + }, + { + "epoch": 0.3188304721030043, + "grad_norm": 1.3335825204849243, + "learning_rate": 3.986630137333485e-06, + "loss": 2.4212, + "step": 5943 + }, + { + "epoch": 0.31888412017167383, + "grad_norm": 1.3364695310592651, + "learning_rate": 3.986280863286792e-06, + "loss": 2.4562, + "step": 5944 + }, + { + "epoch": 0.31893776824034337, + "grad_norm": 1.2788304090499878, + "learning_rate": 3.985931544365171e-06, + "loss": 1.8692, + "step": 5945 + }, + { + "epoch": 0.3189914163090129, + "grad_norm": 1.3509128093719482, + "learning_rate": 3.985582180579167e-06, + "loss": 2.2456, + "step": 5946 + }, + { + "epoch": 0.31904506437768243, + "grad_norm": 1.3948979377746582, + "learning_rate": 3.985232771939329e-06, + "loss": 2.3266, + "step": 5947 + }, + { + "epoch": 0.3190987124463519, + "grad_norm": 1.5525155067443848, + "learning_rate": 3.984883318456206e-06, + "loss": 2.3045, + "step": 5948 + }, + { + "epoch": 0.31915236051502144, + "grad_norm": 1.5647971630096436, + "learning_rate": 3.984533820140349e-06, + "loss": 2.3289, + "step": 5949 + }, + { + "epoch": 0.31920600858369097, + "grad_norm": 1.2367948293685913, + "learning_rate": 3.984184277002311e-06, + "loss": 2.4104, + "step": 5950 + }, + { + "epoch": 0.3192596566523605, + "grad_norm": 1.6153628826141357, + "learning_rate": 3.983834689052646e-06, + "loss": 2.2495, + "step": 5951 + }, + { + "epoch": 0.31931330472103003, + "grad_norm": 1.1676959991455078, + "learning_rate": 3.983485056301908e-06, + "loss": 2.3106, + "step": 5952 + }, + { + "epoch": 0.31936695278969957, + "grad_norm": 1.6218992471694946, + "learning_rate": 3.983135378760654e-06, + "loss": 1.8685, + "step": 5953 + }, + { + "epoch": 0.3194206008583691, + "grad_norm": 1.3009065389633179, + "learning_rate": 3.9827856564394406e-06, + "loss": 2.2434, + "step": 5954 + }, + { + "epoch": 0.31947424892703863, + "grad_norm": 1.1720595359802246, + "learning_rate": 3.982435889348829e-06, + "loss": 2.0982, + "step": 5955 + }, + { + "epoch": 0.31952789699570816, + "grad_norm": 1.3721423149108887, + "learning_rate": 3.982086077499377e-06, + "loss": 2.0411, + "step": 5956 + }, + { + "epoch": 0.3195815450643777, + "grad_norm": 1.6543272733688354, + "learning_rate": 3.98173622090165e-06, + "loss": 1.4223, + "step": 5957 + }, + { + "epoch": 0.3196351931330472, + "grad_norm": 1.3494998216629028, + "learning_rate": 3.981386319566207e-06, + "loss": 2.2903, + "step": 5958 + }, + { + "epoch": 0.31968884120171676, + "grad_norm": 1.5931499004364014, + "learning_rate": 3.981036373503615e-06, + "loss": 2.4323, + "step": 5959 + }, + { + "epoch": 0.3197424892703863, + "grad_norm": 1.39492666721344, + "learning_rate": 3.98068638272444e-06, + "loss": 2.2765, + "step": 5960 + }, + { + "epoch": 0.3197961373390558, + "grad_norm": 1.4700298309326172, + "learning_rate": 3.980336347239247e-06, + "loss": 2.2041, + "step": 5961 + }, + { + "epoch": 0.3198497854077253, + "grad_norm": 1.210188627243042, + "learning_rate": 3.9799862670586065e-06, + "loss": 2.1588, + "step": 5962 + }, + { + "epoch": 0.3199034334763948, + "grad_norm": 1.3887741565704346, + "learning_rate": 3.979636142193089e-06, + "loss": 2.2998, + "step": 5963 + }, + { + "epoch": 0.31995708154506436, + "grad_norm": 1.7014334201812744, + "learning_rate": 3.979285972653263e-06, + "loss": 2.4327, + "step": 5964 + }, + { + "epoch": 0.3200107296137339, + "grad_norm": 1.5565305948257446, + "learning_rate": 3.978935758449704e-06, + "loss": 2.3908, + "step": 5965 + }, + { + "epoch": 0.3200643776824034, + "grad_norm": 1.344342827796936, + "learning_rate": 3.978585499592984e-06, + "loss": 2.2646, + "step": 5966 + }, + { + "epoch": 0.32011802575107295, + "grad_norm": 1.5224277973175049, + "learning_rate": 3.978235196093679e-06, + "loss": 2.2691, + "step": 5967 + }, + { + "epoch": 0.3201716738197425, + "grad_norm": 1.2822117805480957, + "learning_rate": 3.977884847962365e-06, + "loss": 2.1627, + "step": 5968 + }, + { + "epoch": 0.320225321888412, + "grad_norm": 1.4916346073150635, + "learning_rate": 3.977534455209623e-06, + "loss": 2.0867, + "step": 5969 + }, + { + "epoch": 0.32027896995708155, + "grad_norm": 2.9462192058563232, + "learning_rate": 3.977184017846027e-06, + "loss": 2.2506, + "step": 5970 + }, + { + "epoch": 0.3203326180257511, + "grad_norm": 1.5056499242782593, + "learning_rate": 3.976833535882162e-06, + "loss": 2.2465, + "step": 5971 + }, + { + "epoch": 0.3203862660944206, + "grad_norm": 1.2833329439163208, + "learning_rate": 3.976483009328609e-06, + "loss": 2.1806, + "step": 5972 + }, + { + "epoch": 0.32043991416309014, + "grad_norm": 1.072334885597229, + "learning_rate": 3.9761324381959484e-06, + "loss": 2.0992, + "step": 5973 + }, + { + "epoch": 0.3204935622317597, + "grad_norm": 1.174496054649353, + "learning_rate": 3.97578182249477e-06, + "loss": 2.3454, + "step": 5974 + }, + { + "epoch": 0.3205472103004292, + "grad_norm": 1.4426332712173462, + "learning_rate": 3.975431162235657e-06, + "loss": 2.1584, + "step": 5975 + }, + { + "epoch": 0.32060085836909874, + "grad_norm": 1.2138456106185913, + "learning_rate": 3.975080457429196e-06, + "loss": 2.2159, + "step": 5976 + }, + { + "epoch": 0.3206545064377682, + "grad_norm": 1.39769446849823, + "learning_rate": 3.974729708085978e-06, + "loss": 2.263, + "step": 5977 + }, + { + "epoch": 0.32070815450643775, + "grad_norm": 6.508936882019043, + "learning_rate": 3.974378914216591e-06, + "loss": 2.257, + "step": 5978 + }, + { + "epoch": 0.3207618025751073, + "grad_norm": 1.3737374544143677, + "learning_rate": 3.974028075831629e-06, + "loss": 2.3974, + "step": 5979 + }, + { + "epoch": 0.3208154506437768, + "grad_norm": 1.4077364206314087, + "learning_rate": 3.973677192941681e-06, + "loss": 2.1801, + "step": 5980 + }, + { + "epoch": 0.32086909871244634, + "grad_norm": 1.397320032119751, + "learning_rate": 3.973326265557344e-06, + "loss": 2.2765, + "step": 5981 + }, + { + "epoch": 0.3209227467811159, + "grad_norm": 1.2147523164749146, + "learning_rate": 3.972975293689214e-06, + "loss": 1.7739, + "step": 5982 + }, + { + "epoch": 0.3209763948497854, + "grad_norm": 1.3358607292175293, + "learning_rate": 3.9726242773478855e-06, + "loss": 2.1526, + "step": 5983 + }, + { + "epoch": 0.32103004291845494, + "grad_norm": 1.2810086011886597, + "learning_rate": 3.972273216543957e-06, + "loss": 2.0737, + "step": 5984 + }, + { + "epoch": 0.32108369098712447, + "grad_norm": 1.3313672542572021, + "learning_rate": 3.97192211128803e-06, + "loss": 2.1775, + "step": 5985 + }, + { + "epoch": 0.321137339055794, + "grad_norm": 1.25983464717865, + "learning_rate": 3.971570961590703e-06, + "loss": 1.6317, + "step": 5986 + }, + { + "epoch": 0.32119098712446353, + "grad_norm": 1.349513053894043, + "learning_rate": 3.97121976746258e-06, + "loss": 2.442, + "step": 5987 + }, + { + "epoch": 0.32124463519313307, + "grad_norm": 1.2588133811950684, + "learning_rate": 3.970868528914264e-06, + "loss": 2.21, + "step": 5988 + }, + { + "epoch": 0.3212982832618026, + "grad_norm": 1.3079249858856201, + "learning_rate": 3.970517245956359e-06, + "loss": 2.4018, + "step": 5989 + }, + { + "epoch": 0.32135193133047213, + "grad_norm": 1.2168998718261719, + "learning_rate": 3.970165918599472e-06, + "loss": 2.219, + "step": 5990 + }, + { + "epoch": 0.3214055793991416, + "grad_norm": 1.21315336227417, + "learning_rate": 3.96981454685421e-06, + "loss": 1.9759, + "step": 5991 + }, + { + "epoch": 0.32145922746781114, + "grad_norm": 1.4610745906829834, + "learning_rate": 3.969463130731183e-06, + "loss": 2.3923, + "step": 5992 + }, + { + "epoch": 0.32151287553648067, + "grad_norm": 1.3758206367492676, + "learning_rate": 3.969111670241e-06, + "loss": 2.3409, + "step": 5993 + }, + { + "epoch": 0.3215665236051502, + "grad_norm": 1.7068089246749878, + "learning_rate": 3.968760165394274e-06, + "loss": 2.3335, + "step": 5994 + }, + { + "epoch": 0.32162017167381973, + "grad_norm": 1.2771812677383423, + "learning_rate": 3.968408616201616e-06, + "loss": 2.0821, + "step": 5995 + }, + { + "epoch": 0.32167381974248926, + "grad_norm": 1.4000864028930664, + "learning_rate": 3.968057022673641e-06, + "loss": 2.2664, + "step": 5996 + }, + { + "epoch": 0.3217274678111588, + "grad_norm": 1.061402440071106, + "learning_rate": 3.967705384820966e-06, + "loss": 1.9429, + "step": 5997 + }, + { + "epoch": 0.3217811158798283, + "grad_norm": 1.3093785047531128, + "learning_rate": 3.967353702654205e-06, + "loss": 2.0585, + "step": 5998 + }, + { + "epoch": 0.32183476394849786, + "grad_norm": 1.4890022277832031, + "learning_rate": 3.967001976183979e-06, + "loss": 2.3033, + "step": 5999 + }, + { + "epoch": 0.3218884120171674, + "grad_norm": 1.4709620475769043, + "learning_rate": 3.966650205420906e-06, + "loss": 2.2161, + "step": 6000 + }, + { + "epoch": 0.3219420600858369, + "grad_norm": 1.114967703819275, + "learning_rate": 3.966298390375608e-06, + "loss": 1.8822, + "step": 6001 + }, + { + "epoch": 0.32199570815450645, + "grad_norm": 1.370835781097412, + "learning_rate": 3.965946531058706e-06, + "loss": 2.2148, + "step": 6002 + }, + { + "epoch": 0.322049356223176, + "grad_norm": 1.8968689441680908, + "learning_rate": 3.965594627480826e-06, + "loss": 2.2095, + "step": 6003 + }, + { + "epoch": 0.3221030042918455, + "grad_norm": 1.792375922203064, + "learning_rate": 3.96524267965259e-06, + "loss": 2.2442, + "step": 6004 + }, + { + "epoch": 0.322156652360515, + "grad_norm": 1.5790334939956665, + "learning_rate": 3.964890687584625e-06, + "loss": 1.8557, + "step": 6005 + }, + { + "epoch": 0.3222103004291845, + "grad_norm": 1.4072262048721313, + "learning_rate": 3.96453865128756e-06, + "loss": 2.2488, + "step": 6006 + }, + { + "epoch": 0.32226394849785406, + "grad_norm": 1.2357879877090454, + "learning_rate": 3.9641865707720225e-06, + "loss": 1.4946, + "step": 6007 + }, + { + "epoch": 0.3223175965665236, + "grad_norm": 1.3748326301574707, + "learning_rate": 3.963834446048644e-06, + "loss": 2.2664, + "step": 6008 + }, + { + "epoch": 0.3223712446351931, + "grad_norm": 1.2854689359664917, + "learning_rate": 3.9634822771280555e-06, + "loss": 2.1939, + "step": 6009 + }, + { + "epoch": 0.32242489270386265, + "grad_norm": 1.5735291242599487, + "learning_rate": 3.96313006402089e-06, + "loss": 2.3443, + "step": 6010 + }, + { + "epoch": 0.3224785407725322, + "grad_norm": 1.0736336708068848, + "learning_rate": 3.962777806737782e-06, + "loss": 2.1405, + "step": 6011 + }, + { + "epoch": 0.3225321888412017, + "grad_norm": 1.546279788017273, + "learning_rate": 3.962425505289367e-06, + "loss": 2.4377, + "step": 6012 + }, + { + "epoch": 0.32258583690987125, + "grad_norm": 1.437889575958252, + "learning_rate": 3.962073159686282e-06, + "loss": 2.3994, + "step": 6013 + }, + { + "epoch": 0.3226394849785408, + "grad_norm": 1.1342790126800537, + "learning_rate": 3.961720769939164e-06, + "loss": 2.2459, + "step": 6014 + }, + { + "epoch": 0.3226931330472103, + "grad_norm": 1.433527946472168, + "learning_rate": 3.961368336058654e-06, + "loss": 2.1904, + "step": 6015 + }, + { + "epoch": 0.32274678111587984, + "grad_norm": 1.1936719417572021, + "learning_rate": 3.961015858055394e-06, + "loss": 2.4727, + "step": 6016 + }, + { + "epoch": 0.3228004291845494, + "grad_norm": 1.4497169256210327, + "learning_rate": 3.960663335940023e-06, + "loss": 2.4817, + "step": 6017 + }, + { + "epoch": 0.3228540772532189, + "grad_norm": 1.4518686532974243, + "learning_rate": 3.960310769723189e-06, + "loss": 2.5448, + "step": 6018 + }, + { + "epoch": 0.32290772532188844, + "grad_norm": 1.275664210319519, + "learning_rate": 3.959958159415534e-06, + "loss": 2.2682, + "step": 6019 + }, + { + "epoch": 0.3229613733905579, + "grad_norm": 1.3790339231491089, + "learning_rate": 3.959605505027703e-06, + "loss": 2.3242, + "step": 6020 + }, + { + "epoch": 0.32301502145922745, + "grad_norm": 2.1742212772369385, + "learning_rate": 3.959252806570347e-06, + "loss": 2.3142, + "step": 6021 + }, + { + "epoch": 0.323068669527897, + "grad_norm": 1.497609257698059, + "learning_rate": 3.958900064054113e-06, + "loss": 2.2982, + "step": 6022 + }, + { + "epoch": 0.3231223175965665, + "grad_norm": 1.3400509357452393, + "learning_rate": 3.958547277489652e-06, + "loss": 2.2492, + "step": 6023 + }, + { + "epoch": 0.32317596566523604, + "grad_norm": 1.2320431470870972, + "learning_rate": 3.958194446887614e-06, + "loss": 1.9777, + "step": 6024 + }, + { + "epoch": 0.3232296137339056, + "grad_norm": 1.3914586305618286, + "learning_rate": 3.957841572258655e-06, + "loss": 2.3415, + "step": 6025 + }, + { + "epoch": 0.3232832618025751, + "grad_norm": 1.3490352630615234, + "learning_rate": 3.957488653613426e-06, + "loss": 2.2377, + "step": 6026 + }, + { + "epoch": 0.32333690987124464, + "grad_norm": 1.3950414657592773, + "learning_rate": 3.9571356909625855e-06, + "loss": 2.3698, + "step": 6027 + }, + { + "epoch": 0.32339055793991417, + "grad_norm": 1.2156002521514893, + "learning_rate": 3.956782684316788e-06, + "loss": 2.4396, + "step": 6028 + }, + { + "epoch": 0.3234442060085837, + "grad_norm": 1.4551489353179932, + "learning_rate": 3.956429633686692e-06, + "loss": 2.1922, + "step": 6029 + }, + { + "epoch": 0.32349785407725323, + "grad_norm": 1.346534252166748, + "learning_rate": 3.956076539082959e-06, + "loss": 2.0592, + "step": 6030 + }, + { + "epoch": 0.32355150214592276, + "grad_norm": 1.2834972143173218, + "learning_rate": 3.955723400516249e-06, + "loss": 2.3596, + "step": 6031 + }, + { + "epoch": 0.3236051502145923, + "grad_norm": 1.686742901802063, + "learning_rate": 3.955370217997223e-06, + "loss": 2.5758, + "step": 6032 + }, + { + "epoch": 0.32365879828326183, + "grad_norm": 1.078207015991211, + "learning_rate": 3.955016991536547e-06, + "loss": 1.7023, + "step": 6033 + }, + { + "epoch": 0.3237124463519313, + "grad_norm": 1.601454734802246, + "learning_rate": 3.954663721144882e-06, + "loss": 2.3891, + "step": 6034 + }, + { + "epoch": 0.32376609442060084, + "grad_norm": 1.1917636394500732, + "learning_rate": 3.954310406832899e-06, + "loss": 1.9687, + "step": 6035 + }, + { + "epoch": 0.32381974248927037, + "grad_norm": 1.2045788764953613, + "learning_rate": 3.953957048611261e-06, + "loss": 1.9409, + "step": 6036 + }, + { + "epoch": 0.3238733905579399, + "grad_norm": 1.3625417947769165, + "learning_rate": 3.95360364649064e-06, + "loss": 2.4779, + "step": 6037 + }, + { + "epoch": 0.32392703862660943, + "grad_norm": 1.2555559873580933, + "learning_rate": 3.953250200481706e-06, + "loss": 2.1016, + "step": 6038 + }, + { + "epoch": 0.32398068669527896, + "grad_norm": 1.3457425832748413, + "learning_rate": 3.952896710595128e-06, + "loss": 2.1972, + "step": 6039 + }, + { + "epoch": 0.3240343347639485, + "grad_norm": 1.431816816329956, + "learning_rate": 3.952543176841581e-06, + "loss": 2.3284, + "step": 6040 + }, + { + "epoch": 0.324087982832618, + "grad_norm": 1.427112340927124, + "learning_rate": 3.952189599231739e-06, + "loss": 2.3155, + "step": 6041 + }, + { + "epoch": 0.32414163090128756, + "grad_norm": 1.3288233280181885, + "learning_rate": 3.951835977776277e-06, + "loss": 2.4971, + "step": 6042 + }, + { + "epoch": 0.3241952789699571, + "grad_norm": 1.4562783241271973, + "learning_rate": 3.951482312485873e-06, + "loss": 2.2919, + "step": 6043 + }, + { + "epoch": 0.3242489270386266, + "grad_norm": 1.679364562034607, + "learning_rate": 3.951128603371203e-06, + "loss": 2.1984, + "step": 6044 + }, + { + "epoch": 0.32430257510729615, + "grad_norm": 1.2773252725601196, + "learning_rate": 3.950774850442948e-06, + "loss": 2.5288, + "step": 6045 + }, + { + "epoch": 0.3243562231759657, + "grad_norm": 1.7305341958999634, + "learning_rate": 3.9504210537117876e-06, + "loss": 2.3859, + "step": 6046 + }, + { + "epoch": 0.3244098712446352, + "grad_norm": 1.4356937408447266, + "learning_rate": 3.950067213188406e-06, + "loss": 2.2023, + "step": 6047 + }, + { + "epoch": 0.3244635193133047, + "grad_norm": 1.3703795671463013, + "learning_rate": 3.949713328883483e-06, + "loss": 2.302, + "step": 6048 + }, + { + "epoch": 0.3245171673819742, + "grad_norm": 1.8170459270477295, + "learning_rate": 3.949359400807708e-06, + "loss": 2.1231, + "step": 6049 + }, + { + "epoch": 0.32457081545064376, + "grad_norm": 1.2621817588806152, + "learning_rate": 3.949005428971764e-06, + "loss": 2.234, + "step": 6050 + }, + { + "epoch": 0.3246244635193133, + "grad_norm": 1.4735603332519531, + "learning_rate": 3.948651413386338e-06, + "loss": 1.3249, + "step": 6051 + }, + { + "epoch": 0.3246781115879828, + "grad_norm": 1.481858253479004, + "learning_rate": 3.948297354062121e-06, + "loss": 2.4274, + "step": 6052 + }, + { + "epoch": 0.32473175965665235, + "grad_norm": 1.396786093711853, + "learning_rate": 3.947943251009801e-06, + "loss": 2.2951, + "step": 6053 + }, + { + "epoch": 0.3247854077253219, + "grad_norm": 1.5417286157608032, + "learning_rate": 3.947589104240071e-06, + "loss": 2.3383, + "step": 6054 + }, + { + "epoch": 0.3248390557939914, + "grad_norm": 1.420938491821289, + "learning_rate": 3.947234913763622e-06, + "loss": 2.36, + "step": 6055 + }, + { + "epoch": 0.32489270386266095, + "grad_norm": 1.43526029586792, + "learning_rate": 3.946880679591149e-06, + "loss": 2.2142, + "step": 6056 + }, + { + "epoch": 0.3249463519313305, + "grad_norm": 1.3256782293319702, + "learning_rate": 3.946526401733347e-06, + "loss": 2.3969, + "step": 6057 + }, + { + "epoch": 0.325, + "grad_norm": 1.354140043258667, + "learning_rate": 3.946172080200914e-06, + "loss": 2.1779, + "step": 6058 + }, + { + "epoch": 0.32505364806866954, + "grad_norm": 1.3182255029678345, + "learning_rate": 3.945817715004545e-06, + "loss": 2.4171, + "step": 6059 + }, + { + "epoch": 0.3251072961373391, + "grad_norm": 1.3270312547683716, + "learning_rate": 3.945463306154942e-06, + "loss": 1.2547, + "step": 6060 + }, + { + "epoch": 0.3251609442060086, + "grad_norm": 1.8841599225997925, + "learning_rate": 3.9451088536628045e-06, + "loss": 2.4142, + "step": 6061 + }, + { + "epoch": 0.32521459227467814, + "grad_norm": 1.2304511070251465, + "learning_rate": 3.944754357538834e-06, + "loss": 1.939, + "step": 6062 + }, + { + "epoch": 0.3252682403433476, + "grad_norm": 1.3385754823684692, + "learning_rate": 3.944399817793735e-06, + "loss": 2.3151, + "step": 6063 + }, + { + "epoch": 0.32532188841201715, + "grad_norm": 2.2281248569488525, + "learning_rate": 3.944045234438211e-06, + "loss": 2.4194, + "step": 6064 + }, + { + "epoch": 0.3253755364806867, + "grad_norm": 1.3934074640274048, + "learning_rate": 3.9436906074829675e-06, + "loss": 2.4607, + "step": 6065 + }, + { + "epoch": 0.3254291845493562, + "grad_norm": 1.275821566581726, + "learning_rate": 3.943335936938712e-06, + "loss": 2.0042, + "step": 6066 + }, + { + "epoch": 0.32548283261802574, + "grad_norm": 1.275038242340088, + "learning_rate": 3.9429812228161536e-06, + "loss": 2.2418, + "step": 6067 + }, + { + "epoch": 0.3255364806866953, + "grad_norm": 1.1992568969726562, + "learning_rate": 3.942626465126001e-06, + "loss": 2.1752, + "step": 6068 + }, + { + "epoch": 0.3255901287553648, + "grad_norm": 1.9551094770431519, + "learning_rate": 3.942271663878967e-06, + "loss": 2.2624, + "step": 6069 + }, + { + "epoch": 0.32564377682403434, + "grad_norm": 1.3997042179107666, + "learning_rate": 3.941916819085763e-06, + "loss": 2.4454, + "step": 6070 + }, + { + "epoch": 0.32569742489270387, + "grad_norm": 1.5217247009277344, + "learning_rate": 3.941561930757103e-06, + "loss": 2.4578, + "step": 6071 + }, + { + "epoch": 0.3257510729613734, + "grad_norm": 1.3834190368652344, + "learning_rate": 3.941206998903702e-06, + "loss": 2.0932, + "step": 6072 + }, + { + "epoch": 0.32580472103004293, + "grad_norm": 1.5070440769195557, + "learning_rate": 3.940852023536276e-06, + "loss": 2.3952, + "step": 6073 + }, + { + "epoch": 0.32585836909871246, + "grad_norm": 1.8680914640426636, + "learning_rate": 3.940497004665542e-06, + "loss": 2.2919, + "step": 6074 + }, + { + "epoch": 0.325912017167382, + "grad_norm": 1.1562632322311401, + "learning_rate": 3.940141942302221e-06, + "loss": 1.7379, + "step": 6075 + }, + { + "epoch": 0.3259656652360515, + "grad_norm": 1.385358452796936, + "learning_rate": 3.939786836457031e-06, + "loss": 2.405, + "step": 6076 + }, + { + "epoch": 0.326019313304721, + "grad_norm": 1.44832444190979, + "learning_rate": 3.939431687140697e-06, + "loss": 2.201, + "step": 6077 + }, + { + "epoch": 0.32607296137339054, + "grad_norm": 1.5171477794647217, + "learning_rate": 3.9390764943639385e-06, + "loss": 2.2833, + "step": 6078 + }, + { + "epoch": 0.32612660944206007, + "grad_norm": 1.483892798423767, + "learning_rate": 3.938721258137481e-06, + "loss": 2.3082, + "step": 6079 + }, + { + "epoch": 0.3261802575107296, + "grad_norm": 1.1166363954544067, + "learning_rate": 3.938365978472049e-06, + "loss": 2.2239, + "step": 6080 + }, + { + "epoch": 0.32623390557939913, + "grad_norm": 1.1954537630081177, + "learning_rate": 3.938010655378373e-06, + "loss": 2.0663, + "step": 6081 + }, + { + "epoch": 0.32628755364806866, + "grad_norm": 1.4657039642333984, + "learning_rate": 3.937655288867176e-06, + "loss": 2.285, + "step": 6082 + }, + { + "epoch": 0.3263412017167382, + "grad_norm": 1.2901153564453125, + "learning_rate": 3.937299878949192e-06, + "loss": 2.2492, + "step": 6083 + }, + { + "epoch": 0.3263948497854077, + "grad_norm": 2.5172698497772217, + "learning_rate": 3.93694442563515e-06, + "loss": 2.3059, + "step": 6084 + }, + { + "epoch": 0.32644849785407726, + "grad_norm": 1.4450727701187134, + "learning_rate": 3.936588928935782e-06, + "loss": 2.6347, + "step": 6085 + }, + { + "epoch": 0.3265021459227468, + "grad_norm": 1.3544381856918335, + "learning_rate": 3.9362333888618205e-06, + "loss": 2.2077, + "step": 6086 + }, + { + "epoch": 0.3265557939914163, + "grad_norm": 1.411892294883728, + "learning_rate": 3.935877805424002e-06, + "loss": 2.2257, + "step": 6087 + }, + { + "epoch": 0.32660944206008585, + "grad_norm": 1.2639573812484741, + "learning_rate": 3.935522178633062e-06, + "loss": 2.3819, + "step": 6088 + }, + { + "epoch": 0.3266630901287554, + "grad_norm": 1.342213749885559, + "learning_rate": 3.9351665084997385e-06, + "loss": 2.3355, + "step": 6089 + }, + { + "epoch": 0.3267167381974249, + "grad_norm": 1.3847976922988892, + "learning_rate": 3.934810795034768e-06, + "loss": 2.1159, + "step": 6090 + }, + { + "epoch": 0.32677038626609445, + "grad_norm": 1.5427281856536865, + "learning_rate": 3.934455038248892e-06, + "loss": 2.2456, + "step": 6091 + }, + { + "epoch": 0.3268240343347639, + "grad_norm": 1.4805700778961182, + "learning_rate": 3.934099238152852e-06, + "loss": 2.1685, + "step": 6092 + }, + { + "epoch": 0.32687768240343346, + "grad_norm": 1.2999088764190674, + "learning_rate": 3.93374339475739e-06, + "loss": 2.35, + "step": 6093 + }, + { + "epoch": 0.326931330472103, + "grad_norm": 1.4747238159179688, + "learning_rate": 3.93338750807325e-06, + "loss": 2.5144, + "step": 6094 + }, + { + "epoch": 0.3269849785407725, + "grad_norm": 1.6348778009414673, + "learning_rate": 3.933031578111178e-06, + "loss": 2.4186, + "step": 6095 + }, + { + "epoch": 0.32703862660944205, + "grad_norm": 1.3757802248001099, + "learning_rate": 3.9326756048819185e-06, + "loss": 2.097, + "step": 6096 + }, + { + "epoch": 0.3270922746781116, + "grad_norm": 1.5057902336120605, + "learning_rate": 3.932319588396222e-06, + "loss": 2.3883, + "step": 6097 + }, + { + "epoch": 0.3271459227467811, + "grad_norm": 1.3176623582839966, + "learning_rate": 3.931963528664835e-06, + "loss": 2.2477, + "step": 6098 + }, + { + "epoch": 0.32719957081545065, + "grad_norm": 1.336725115776062, + "learning_rate": 3.931607425698509e-06, + "loss": 2.0607, + "step": 6099 + }, + { + "epoch": 0.3272532188841202, + "grad_norm": 1.3595011234283447, + "learning_rate": 3.931251279507997e-06, + "loss": 2.278, + "step": 6100 + }, + { + "epoch": 0.3273068669527897, + "grad_norm": 1.2989863157272339, + "learning_rate": 3.930895090104051e-06, + "loss": 1.9171, + "step": 6101 + }, + { + "epoch": 0.32736051502145924, + "grad_norm": 1.3096462488174438, + "learning_rate": 3.930538857497425e-06, + "loss": 2.4511, + "step": 6102 + }, + { + "epoch": 0.3274141630901288, + "grad_norm": 1.4978630542755127, + "learning_rate": 3.9301825816988745e-06, + "loss": 2.3789, + "step": 6103 + }, + { + "epoch": 0.3274678111587983, + "grad_norm": 1.2900049686431885, + "learning_rate": 3.929826262719157e-06, + "loss": 2.0967, + "step": 6104 + }, + { + "epoch": 0.32752145922746784, + "grad_norm": 2.135716676712036, + "learning_rate": 3.929469900569031e-06, + "loss": 2.3693, + "step": 6105 + }, + { + "epoch": 0.3275751072961373, + "grad_norm": 1.1791282892227173, + "learning_rate": 3.929113495259256e-06, + "loss": 2.317, + "step": 6106 + }, + { + "epoch": 0.32762875536480685, + "grad_norm": 1.8430759906768799, + "learning_rate": 3.928757046800592e-06, + "loss": 2.3382, + "step": 6107 + }, + { + "epoch": 0.3276824034334764, + "grad_norm": 1.3359018564224243, + "learning_rate": 3.928400555203801e-06, + "loss": 2.3332, + "step": 6108 + }, + { + "epoch": 0.3277360515021459, + "grad_norm": 1.2495310306549072, + "learning_rate": 3.928044020479648e-06, + "loss": 2.2252, + "step": 6109 + }, + { + "epoch": 0.32778969957081544, + "grad_norm": 1.1953222751617432, + "learning_rate": 3.927687442638898e-06, + "loss": 1.7769, + "step": 6110 + }, + { + "epoch": 0.32784334763948497, + "grad_norm": 1.9285470247268677, + "learning_rate": 3.927330821692314e-06, + "loss": 2.3296, + "step": 6111 + }, + { + "epoch": 0.3278969957081545, + "grad_norm": 1.590352177619934, + "learning_rate": 3.926974157650667e-06, + "loss": 1.9597, + "step": 6112 + }, + { + "epoch": 0.32795064377682404, + "grad_norm": 1.5460946559906006, + "learning_rate": 3.926617450524723e-06, + "loss": 2.2454, + "step": 6113 + }, + { + "epoch": 0.32800429184549357, + "grad_norm": 1.4370816946029663, + "learning_rate": 3.926260700325253e-06, + "loss": 2.6543, + "step": 6114 + }, + { + "epoch": 0.3280579399141631, + "grad_norm": 1.683282494544983, + "learning_rate": 3.925903907063029e-06, + "loss": 2.1752, + "step": 6115 + }, + { + "epoch": 0.32811158798283263, + "grad_norm": 1.509598970413208, + "learning_rate": 3.925547070748823e-06, + "loss": 2.4556, + "step": 6116 + }, + { + "epoch": 0.32816523605150216, + "grad_norm": 1.4513237476348877, + "learning_rate": 3.9251901913934085e-06, + "loss": 2.4274, + "step": 6117 + }, + { + "epoch": 0.3282188841201717, + "grad_norm": 1.145928144454956, + "learning_rate": 3.924833269007561e-06, + "loss": 1.8133, + "step": 6118 + }, + { + "epoch": 0.3282725321888412, + "grad_norm": 1.4371678829193115, + "learning_rate": 3.924476303602057e-06, + "loss": 2.2211, + "step": 6119 + }, + { + "epoch": 0.3283261802575107, + "grad_norm": 1.4568145275115967, + "learning_rate": 3.924119295187674e-06, + "loss": 1.9191, + "step": 6120 + }, + { + "epoch": 0.32837982832618023, + "grad_norm": 1.4051222801208496, + "learning_rate": 3.923762243775191e-06, + "loss": 2.3824, + "step": 6121 + }, + { + "epoch": 0.32843347639484977, + "grad_norm": 1.222388505935669, + "learning_rate": 3.923405149375389e-06, + "loss": 2.3378, + "step": 6122 + }, + { + "epoch": 0.3284871244635193, + "grad_norm": 1.412110447883606, + "learning_rate": 3.923048011999048e-06, + "loss": 2.0175, + "step": 6123 + }, + { + "epoch": 0.32854077253218883, + "grad_norm": 1.184849500656128, + "learning_rate": 3.922690831656955e-06, + "loss": 2.0519, + "step": 6124 + }, + { + "epoch": 0.32859442060085836, + "grad_norm": 1.3993688821792603, + "learning_rate": 3.92233360835989e-06, + "loss": 2.3503, + "step": 6125 + }, + { + "epoch": 0.3286480686695279, + "grad_norm": 1.3627541065216064, + "learning_rate": 3.92197634211864e-06, + "loss": 2.1212, + "step": 6126 + }, + { + "epoch": 0.3287017167381974, + "grad_norm": 1.4704824686050415, + "learning_rate": 3.921619032943991e-06, + "loss": 2.1972, + "step": 6127 + }, + { + "epoch": 0.32875536480686696, + "grad_norm": 1.3918232917785645, + "learning_rate": 3.921261680846735e-06, + "loss": 2.3194, + "step": 6128 + }, + { + "epoch": 0.3288090128755365, + "grad_norm": 1.3148014545440674, + "learning_rate": 3.9209042858376565e-06, + "loss": 2.2309, + "step": 6129 + }, + { + "epoch": 0.328862660944206, + "grad_norm": 1.313732624053955, + "learning_rate": 3.920546847927548e-06, + "loss": 2.3268, + "step": 6130 + }, + { + "epoch": 0.32891630901287555, + "grad_norm": 1.5773123502731323, + "learning_rate": 3.920189367127203e-06, + "loss": 2.0662, + "step": 6131 + }, + { + "epoch": 0.3289699570815451, + "grad_norm": 1.6550101041793823, + "learning_rate": 3.919831843447413e-06, + "loss": 2.0586, + "step": 6132 + }, + { + "epoch": 0.3290236051502146, + "grad_norm": 1.2425153255462646, + "learning_rate": 3.919474276898973e-06, + "loss": 2.3804, + "step": 6133 + }, + { + "epoch": 0.32907725321888415, + "grad_norm": 1.5558452606201172, + "learning_rate": 3.919116667492679e-06, + "loss": 1.7642, + "step": 6134 + }, + { + "epoch": 0.3291309012875536, + "grad_norm": 1.2681199312210083, + "learning_rate": 3.91875901523933e-06, + "loss": 2.3253, + "step": 6135 + }, + { + "epoch": 0.32918454935622316, + "grad_norm": 1.473028540611267, + "learning_rate": 3.918401320149721e-06, + "loss": 2.1959, + "step": 6136 + }, + { + "epoch": 0.3292381974248927, + "grad_norm": 1.1942203044891357, + "learning_rate": 3.9180435822346556e-06, + "loss": 2.5101, + "step": 6137 + }, + { + "epoch": 0.3292918454935622, + "grad_norm": 1.9621222019195557, + "learning_rate": 3.917685801504931e-06, + "loss": 2.1579, + "step": 6138 + }, + { + "epoch": 0.32934549356223175, + "grad_norm": 1.2312260866165161, + "learning_rate": 3.9173279779713536e-06, + "loss": 2.2531, + "step": 6139 + }, + { + "epoch": 0.3293991416309013, + "grad_norm": 1.6946525573730469, + "learning_rate": 3.916970111644724e-06, + "loss": 2.4702, + "step": 6140 + }, + { + "epoch": 0.3294527896995708, + "grad_norm": 1.2287325859069824, + "learning_rate": 3.916612202535849e-06, + "loss": 2.2436, + "step": 6141 + }, + { + "epoch": 0.32950643776824035, + "grad_norm": 1.547756314277649, + "learning_rate": 3.916254250655533e-06, + "loss": 2.2697, + "step": 6142 + }, + { + "epoch": 0.3295600858369099, + "grad_norm": 1.5706462860107422, + "learning_rate": 3.915896256014586e-06, + "loss": 2.4285, + "step": 6143 + }, + { + "epoch": 0.3296137339055794, + "grad_norm": 1.3637423515319824, + "learning_rate": 3.915538218623815e-06, + "loss": 2.4787, + "step": 6144 + }, + { + "epoch": 0.32966738197424894, + "grad_norm": 1.3997255563735962, + "learning_rate": 3.91518013849403e-06, + "loss": 2.2612, + "step": 6145 + }, + { + "epoch": 0.3297210300429185, + "grad_norm": 1.3788686990737915, + "learning_rate": 3.914822015636044e-06, + "loss": 2.3273, + "step": 6146 + }, + { + "epoch": 0.329774678111588, + "grad_norm": 1.2675588130950928, + "learning_rate": 3.914463850060669e-06, + "loss": 2.2375, + "step": 6147 + }, + { + "epoch": 0.32982832618025754, + "grad_norm": 1.4323813915252686, + "learning_rate": 3.914105641778718e-06, + "loss": 2.1725, + "step": 6148 + }, + { + "epoch": 0.329881974248927, + "grad_norm": 1.7634832859039307, + "learning_rate": 3.913747390801008e-06, + "loss": 2.3609, + "step": 6149 + }, + { + "epoch": 0.32993562231759654, + "grad_norm": 1.2724339962005615, + "learning_rate": 3.913389097138355e-06, + "loss": 2.3371, + "step": 6150 + }, + { + "epoch": 0.3299892703862661, + "grad_norm": 1.348560094833374, + "learning_rate": 3.913030760801576e-06, + "loss": 2.2894, + "step": 6151 + }, + { + "epoch": 0.3300429184549356, + "grad_norm": 1.1399492025375366, + "learning_rate": 3.912672381801491e-06, + "loss": 2.1527, + "step": 6152 + }, + { + "epoch": 0.33009656652360514, + "grad_norm": 1.3422009944915771, + "learning_rate": 3.912313960148919e-06, + "loss": 2.2081, + "step": 6153 + }, + { + "epoch": 0.33015021459227467, + "grad_norm": 1.3480740785598755, + "learning_rate": 3.911955495854684e-06, + "loss": 2.4374, + "step": 6154 + }, + { + "epoch": 0.3302038626609442, + "grad_norm": 1.3796093463897705, + "learning_rate": 3.911596988929608e-06, + "loss": 2.4003, + "step": 6155 + }, + { + "epoch": 0.33025751072961373, + "grad_norm": 1.4262079000473022, + "learning_rate": 3.911238439384516e-06, + "loss": 2.5162, + "step": 6156 + }, + { + "epoch": 0.33031115879828327, + "grad_norm": 1.4422751665115356, + "learning_rate": 3.910879847230232e-06, + "loss": 2.327, + "step": 6157 + }, + { + "epoch": 0.3303648068669528, + "grad_norm": 1.5098599195480347, + "learning_rate": 3.910521212477585e-06, + "loss": 2.3299, + "step": 6158 + }, + { + "epoch": 0.33041845493562233, + "grad_norm": 1.3790377378463745, + "learning_rate": 3.910162535137401e-06, + "loss": 1.9831, + "step": 6159 + }, + { + "epoch": 0.33047210300429186, + "grad_norm": 1.3973348140716553, + "learning_rate": 3.90980381522051e-06, + "loss": 2.2478, + "step": 6160 + }, + { + "epoch": 0.3305257510729614, + "grad_norm": 1.4692519903182983, + "learning_rate": 3.909445052737743e-06, + "loss": 2.2643, + "step": 6161 + }, + { + "epoch": 0.3305793991416309, + "grad_norm": 1.253674030303955, + "learning_rate": 3.909086247699934e-06, + "loss": 2.0738, + "step": 6162 + }, + { + "epoch": 0.3306330472103004, + "grad_norm": 1.5052000284194946, + "learning_rate": 3.908727400117914e-06, + "loss": 2.0963, + "step": 6163 + }, + { + "epoch": 0.33068669527896993, + "grad_norm": 1.4422839879989624, + "learning_rate": 3.9083685100025174e-06, + "loss": 2.1919, + "step": 6164 + }, + { + "epoch": 0.33074034334763946, + "grad_norm": 1.2151066064834595, + "learning_rate": 3.908009577364581e-06, + "loss": 2.3305, + "step": 6165 + }, + { + "epoch": 0.330793991416309, + "grad_norm": 7.133994102478027, + "learning_rate": 3.907650602214942e-06, + "loss": 1.8944, + "step": 6166 + }, + { + "epoch": 0.33084763948497853, + "grad_norm": 1.3245015144348145, + "learning_rate": 3.907291584564438e-06, + "loss": 2.5197, + "step": 6167 + }, + { + "epoch": 0.33090128755364806, + "grad_norm": 1.709640622138977, + "learning_rate": 3.90693252442391e-06, + "loss": 2.427, + "step": 6168 + }, + { + "epoch": 0.3309549356223176, + "grad_norm": 1.6875680685043335, + "learning_rate": 3.9065734218041985e-06, + "loss": 2.1355, + "step": 6169 + }, + { + "epoch": 0.3310085836909871, + "grad_norm": 1.0845019817352295, + "learning_rate": 3.906214276716146e-06, + "loss": 2.2375, + "step": 6170 + }, + { + "epoch": 0.33106223175965666, + "grad_norm": 1.5290286540985107, + "learning_rate": 3.905855089170595e-06, + "loss": 2.3366, + "step": 6171 + }, + { + "epoch": 0.3311158798283262, + "grad_norm": 2.6789357662200928, + "learning_rate": 3.905495859178391e-06, + "loss": 2.1533, + "step": 6172 + }, + { + "epoch": 0.3311695278969957, + "grad_norm": 1.2964061498641968, + "learning_rate": 3.905136586750381e-06, + "loss": 2.2306, + "step": 6173 + }, + { + "epoch": 0.33122317596566525, + "grad_norm": 1.2299610376358032, + "learning_rate": 3.904777271897412e-06, + "loss": 2.0901, + "step": 6174 + }, + { + "epoch": 0.3312768240343348, + "grad_norm": 1.8139643669128418, + "learning_rate": 3.904417914630331e-06, + "loss": 2.4271, + "step": 6175 + }, + { + "epoch": 0.3313304721030043, + "grad_norm": 20.939542770385742, + "learning_rate": 3.90405851495999e-06, + "loss": 2.1395, + "step": 6176 + }, + { + "epoch": 0.33138412017167385, + "grad_norm": 1.2558252811431885, + "learning_rate": 3.903699072897241e-06, + "loss": 2.1525, + "step": 6177 + }, + { + "epoch": 0.3314377682403433, + "grad_norm": 1.1869101524353027, + "learning_rate": 3.903339588452934e-06, + "loss": 2.0879, + "step": 6178 + }, + { + "epoch": 0.33149141630901285, + "grad_norm": 1.4712185859680176, + "learning_rate": 3.902980061637924e-06, + "loss": 2.514, + "step": 6179 + }, + { + "epoch": 0.3315450643776824, + "grad_norm": 1.760266900062561, + "learning_rate": 3.902620492463067e-06, + "loss": 2.4253, + "step": 6180 + }, + { + "epoch": 0.3315987124463519, + "grad_norm": 1.5084344148635864, + "learning_rate": 3.902260880939218e-06, + "loss": 2.1095, + "step": 6181 + }, + { + "epoch": 0.33165236051502145, + "grad_norm": 1.2221487760543823, + "learning_rate": 3.901901227077236e-06, + "loss": 2.2559, + "step": 6182 + }, + { + "epoch": 0.331706008583691, + "grad_norm": 1.5954703092575073, + "learning_rate": 3.901541530887978e-06, + "loss": 2.1608, + "step": 6183 + }, + { + "epoch": 0.3317596566523605, + "grad_norm": 1.9287298917770386, + "learning_rate": 3.901181792382307e-06, + "loss": 2.1139, + "step": 6184 + }, + { + "epoch": 0.33181330472103004, + "grad_norm": 1.298027753829956, + "learning_rate": 3.900822011571082e-06, + "loss": 2.4116, + "step": 6185 + }, + { + "epoch": 0.3318669527896996, + "grad_norm": 1.515547513961792, + "learning_rate": 3.900462188465166e-06, + "loss": 2.1221, + "step": 6186 + }, + { + "epoch": 0.3319206008583691, + "grad_norm": 1.429242491722107, + "learning_rate": 3.900102323075424e-06, + "loss": 2.2721, + "step": 6187 + }, + { + "epoch": 0.33197424892703864, + "grad_norm": 1.3975281715393066, + "learning_rate": 3.899742415412721e-06, + "loss": 2.1329, + "step": 6188 + }, + { + "epoch": 0.33202789699570817, + "grad_norm": 1.3420729637145996, + "learning_rate": 3.8993824654879246e-06, + "loss": 2.4304, + "step": 6189 + }, + { + "epoch": 0.3320815450643777, + "grad_norm": 1.3511033058166504, + "learning_rate": 3.899022473311902e-06, + "loss": 2.3615, + "step": 6190 + }, + { + "epoch": 0.33213519313304724, + "grad_norm": 1.4234850406646729, + "learning_rate": 3.8986624388955204e-06, + "loss": 2.2546, + "step": 6191 + }, + { + "epoch": 0.3321888412017167, + "grad_norm": 1.4310261011123657, + "learning_rate": 3.898302362249653e-06, + "loss": 2.5358, + "step": 6192 + }, + { + "epoch": 0.33224248927038624, + "grad_norm": 1.4003154039382935, + "learning_rate": 3.897942243385171e-06, + "loss": 2.5654, + "step": 6193 + }, + { + "epoch": 0.3322961373390558, + "grad_norm": 1.2333399057388306, + "learning_rate": 3.897582082312946e-06, + "loss": 2.0386, + "step": 6194 + }, + { + "epoch": 0.3323497854077253, + "grad_norm": 1.390762209892273, + "learning_rate": 3.8972218790438534e-06, + "loss": 2.2853, + "step": 6195 + }, + { + "epoch": 0.33240343347639484, + "grad_norm": 1.3688921928405762, + "learning_rate": 3.896861633588769e-06, + "loss": 2.2396, + "step": 6196 + }, + { + "epoch": 0.33245708154506437, + "grad_norm": 1.3292900323867798, + "learning_rate": 3.896501345958568e-06, + "loss": 2.2643, + "step": 6197 + }, + { + "epoch": 0.3325107296137339, + "grad_norm": 1.093149185180664, + "learning_rate": 3.89614101616413e-06, + "loss": 1.8045, + "step": 6198 + }, + { + "epoch": 0.33256437768240343, + "grad_norm": 1.3612083196640015, + "learning_rate": 3.895780644216334e-06, + "loss": 2.2514, + "step": 6199 + }, + { + "epoch": 0.33261802575107297, + "grad_norm": 1.2811156511306763, + "learning_rate": 3.8954202301260604e-06, + "loss": 2.2128, + "step": 6200 + }, + { + "epoch": 0.3326716738197425, + "grad_norm": 1.4927057027816772, + "learning_rate": 3.895059773904191e-06, + "loss": 2.2884, + "step": 6201 + }, + { + "epoch": 0.33272532188841203, + "grad_norm": 2.843698024749756, + "learning_rate": 3.89469927556161e-06, + "loss": 2.2053, + "step": 6202 + }, + { + "epoch": 0.33277896995708156, + "grad_norm": 1.2479279041290283, + "learning_rate": 3.8943387351092e-06, + "loss": 2.2261, + "step": 6203 + }, + { + "epoch": 0.3328326180257511, + "grad_norm": 1.4643946886062622, + "learning_rate": 3.893978152557848e-06, + "loss": 2.0665, + "step": 6204 + }, + { + "epoch": 0.3328862660944206, + "grad_norm": 4.834478855133057, + "learning_rate": 3.89361752791844e-06, + "loss": 2.3752, + "step": 6205 + }, + { + "epoch": 0.33293991416309016, + "grad_norm": 1.5285042524337769, + "learning_rate": 3.893256861201866e-06, + "loss": 2.297, + "step": 6206 + }, + { + "epoch": 0.33299356223175963, + "grad_norm": 1.4932730197906494, + "learning_rate": 3.892896152419015e-06, + "loss": 2.4936, + "step": 6207 + }, + { + "epoch": 0.33304721030042916, + "grad_norm": 1.4053982496261597, + "learning_rate": 3.892535401580776e-06, + "loss": 2.3502, + "step": 6208 + }, + { + "epoch": 0.3331008583690987, + "grad_norm": 1.487870693206787, + "learning_rate": 3.892174608698043e-06, + "loss": 2.4418, + "step": 6209 + }, + { + "epoch": 0.3331545064377682, + "grad_norm": 1.460911512374878, + "learning_rate": 3.891813773781709e-06, + "loss": 2.065, + "step": 6210 + }, + { + "epoch": 0.33320815450643776, + "grad_norm": 1.3891305923461914, + "learning_rate": 3.8914528968426675e-06, + "loss": 2.197, + "step": 6211 + }, + { + "epoch": 0.3332618025751073, + "grad_norm": 2.612617254257202, + "learning_rate": 3.891091977891816e-06, + "loss": 2.0207, + "step": 6212 + }, + { + "epoch": 0.3333154506437768, + "grad_norm": 1.4658137559890747, + "learning_rate": 3.8907310169400504e-06, + "loss": 2.2046, + "step": 6213 + }, + { + "epoch": 0.33336909871244635, + "grad_norm": 1.368478775024414, + "learning_rate": 3.8903700139982694e-06, + "loss": 2.4189, + "step": 6214 + }, + { + "epoch": 0.3334227467811159, + "grad_norm": 1.3689693212509155, + "learning_rate": 3.890008969077373e-06, + "loss": 2.1177, + "step": 6215 + }, + { + "epoch": 0.3334763948497854, + "grad_norm": 1.3291702270507812, + "learning_rate": 3.889647882188262e-06, + "loss": 2.1557, + "step": 6216 + }, + { + "epoch": 0.33353004291845495, + "grad_norm": 1.1518481969833374, + "learning_rate": 3.889286753341839e-06, + "loss": 2.0293, + "step": 6217 + }, + { + "epoch": 0.3335836909871245, + "grad_norm": 1.3207160234451294, + "learning_rate": 3.888925582549006e-06, + "loss": 2.1145, + "step": 6218 + }, + { + "epoch": 0.333637339055794, + "grad_norm": 1.4499911069869995, + "learning_rate": 3.888564369820668e-06, + "loss": 2.2499, + "step": 6219 + }, + { + "epoch": 0.33369098712446355, + "grad_norm": 1.7353564500808716, + "learning_rate": 3.888203115167734e-06, + "loss": 2.3088, + "step": 6220 + }, + { + "epoch": 0.333744635193133, + "grad_norm": 1.3910596370697021, + "learning_rate": 3.887841818601109e-06, + "loss": 2.3797, + "step": 6221 + }, + { + "epoch": 0.33379828326180255, + "grad_norm": 1.4821583032608032, + "learning_rate": 3.8874804801317e-06, + "loss": 2.4414, + "step": 6222 + }, + { + "epoch": 0.3338519313304721, + "grad_norm": 1.3632432222366333, + "learning_rate": 3.887119099770419e-06, + "loss": 2.3133, + "step": 6223 + }, + { + "epoch": 0.3339055793991416, + "grad_norm": 1.2548344135284424, + "learning_rate": 3.886757677528177e-06, + "loss": 2.2768, + "step": 6224 + }, + { + "epoch": 0.33395922746781115, + "grad_norm": 1.578744888305664, + "learning_rate": 3.886396213415885e-06, + "loss": 2.2462, + "step": 6225 + }, + { + "epoch": 0.3340128755364807, + "grad_norm": 1.8347245454788208, + "learning_rate": 3.886034707444458e-06, + "loss": 2.4606, + "step": 6226 + }, + { + "epoch": 0.3340665236051502, + "grad_norm": 1.621164083480835, + "learning_rate": 3.88567315962481e-06, + "loss": 2.2667, + "step": 6227 + }, + { + "epoch": 0.33412017167381974, + "grad_norm": 1.462547779083252, + "learning_rate": 3.885311569967858e-06, + "loss": 2.1606, + "step": 6228 + }, + { + "epoch": 0.3341738197424893, + "grad_norm": 2.373406410217285, + "learning_rate": 3.884949938484518e-06, + "loss": 2.2402, + "step": 6229 + }, + { + "epoch": 0.3342274678111588, + "grad_norm": 1.353695034980774, + "learning_rate": 3.88458826518571e-06, + "loss": 2.3318, + "step": 6230 + }, + { + "epoch": 0.33428111587982834, + "grad_norm": 1.1312167644500732, + "learning_rate": 3.884226550082353e-06, + "loss": 2.247, + "step": 6231 + }, + { + "epoch": 0.33433476394849787, + "grad_norm": 1.2877676486968994, + "learning_rate": 3.883864793185369e-06, + "loss": 2.2421, + "step": 6232 + }, + { + "epoch": 0.3343884120171674, + "grad_norm": 1.3866691589355469, + "learning_rate": 3.883502994505679e-06, + "loss": 2.2845, + "step": 6233 + }, + { + "epoch": 0.33444206008583693, + "grad_norm": 1.2166835069656372, + "learning_rate": 3.883141154054209e-06, + "loss": 2.1121, + "step": 6234 + }, + { + "epoch": 0.3344957081545064, + "grad_norm": 1.2426835298538208, + "learning_rate": 3.882779271841882e-06, + "loss": 2.1945, + "step": 6235 + }, + { + "epoch": 0.33454935622317594, + "grad_norm": 1.4466413259506226, + "learning_rate": 3.882417347879626e-06, + "loss": 2.5207, + "step": 6236 + }, + { + "epoch": 0.3346030042918455, + "grad_norm": 1.38124680519104, + "learning_rate": 3.8820553821783654e-06, + "loss": 2.4518, + "step": 6237 + }, + { + "epoch": 0.334656652360515, + "grad_norm": 1.5244996547698975, + "learning_rate": 3.8816933747490324e-06, + "loss": 2.2807, + "step": 6238 + }, + { + "epoch": 0.33471030042918454, + "grad_norm": 1.5065275430679321, + "learning_rate": 3.881331325602555e-06, + "loss": 2.2626, + "step": 6239 + }, + { + "epoch": 0.33476394849785407, + "grad_norm": 1.1953340768814087, + "learning_rate": 3.880969234749865e-06, + "loss": 2.0719, + "step": 6240 + }, + { + "epoch": 0.3348175965665236, + "grad_norm": 1.4182006120681763, + "learning_rate": 3.880607102201895e-06, + "loss": 2.1797, + "step": 6241 + }, + { + "epoch": 0.33487124463519313, + "grad_norm": 1.1812732219696045, + "learning_rate": 3.880244927969579e-06, + "loss": 2.0714, + "step": 6242 + }, + { + "epoch": 0.33492489270386266, + "grad_norm": 1.3137438297271729, + "learning_rate": 3.879882712063852e-06, + "loss": 2.49, + "step": 6243 + }, + { + "epoch": 0.3349785407725322, + "grad_norm": 1.5808615684509277, + "learning_rate": 3.879520454495649e-06, + "loss": 2.285, + "step": 6244 + }, + { + "epoch": 0.33503218884120173, + "grad_norm": 1.425258755683899, + "learning_rate": 3.879158155275908e-06, + "loss": 2.3269, + "step": 6245 + }, + { + "epoch": 0.33508583690987126, + "grad_norm": 2.0387580394744873, + "learning_rate": 3.87879581441557e-06, + "loss": 2.1928, + "step": 6246 + }, + { + "epoch": 0.3351394849785408, + "grad_norm": 1.5940487384796143, + "learning_rate": 3.878433431925573e-06, + "loss": 2.2831, + "step": 6247 + }, + { + "epoch": 0.3351931330472103, + "grad_norm": 1.5654547214508057, + "learning_rate": 3.87807100781686e-06, + "loss": 2.3818, + "step": 6248 + }, + { + "epoch": 0.33524678111587985, + "grad_norm": 1.7217055559158325, + "learning_rate": 3.87770854210037e-06, + "loss": 2.1975, + "step": 6249 + }, + { + "epoch": 0.33530042918454933, + "grad_norm": 1.6022995710372925, + "learning_rate": 3.87734603478705e-06, + "loss": 2.1598, + "step": 6250 + }, + { + "epoch": 0.33535407725321886, + "grad_norm": 1.3201349973678589, + "learning_rate": 3.876983485887845e-06, + "loss": 2.3062, + "step": 6251 + }, + { + "epoch": 0.3354077253218884, + "grad_norm": 1.4664616584777832, + "learning_rate": 3.876620895413701e-06, + "loss": 2.4803, + "step": 6252 + }, + { + "epoch": 0.3354613733905579, + "grad_norm": 1.4684245586395264, + "learning_rate": 3.8762582633755655e-06, + "loss": 2.306, + "step": 6253 + }, + { + "epoch": 0.33551502145922746, + "grad_norm": 2.029207706451416, + "learning_rate": 3.875895589784385e-06, + "loss": 2.5199, + "step": 6254 + }, + { + "epoch": 0.335568669527897, + "grad_norm": 1.455013632774353, + "learning_rate": 3.875532874651113e-06, + "loss": 2.1579, + "step": 6255 + }, + { + "epoch": 0.3356223175965665, + "grad_norm": 1.3100916147232056, + "learning_rate": 3.875170117986701e-06, + "loss": 2.0489, + "step": 6256 + }, + { + "epoch": 0.33567596566523605, + "grad_norm": 1.7600421905517578, + "learning_rate": 3.874807319802099e-06, + "loss": 2.331, + "step": 6257 + }, + { + "epoch": 0.3357296137339056, + "grad_norm": 1.5150431394577026, + "learning_rate": 3.874444480108263e-06, + "loss": 2.0184, + "step": 6258 + }, + { + "epoch": 0.3357832618025751, + "grad_norm": 1.5189645290374756, + "learning_rate": 3.874081598916146e-06, + "loss": 2.4013, + "step": 6259 + }, + { + "epoch": 0.33583690987124465, + "grad_norm": 1.4468756914138794, + "learning_rate": 3.873718676236707e-06, + "loss": 1.901, + "step": 6260 + }, + { + "epoch": 0.3358905579399142, + "grad_norm": 1.3467559814453125, + "learning_rate": 3.873355712080902e-06, + "loss": 2.3753, + "step": 6261 + }, + { + "epoch": 0.3359442060085837, + "grad_norm": 1.5808312892913818, + "learning_rate": 3.8729927064596895e-06, + "loss": 2.4022, + "step": 6262 + }, + { + "epoch": 0.33599785407725324, + "grad_norm": 1.5847537517547607, + "learning_rate": 3.8726296593840304e-06, + "loss": 2.1149, + "step": 6263 + }, + { + "epoch": 0.3360515021459227, + "grad_norm": 1.6160452365875244, + "learning_rate": 3.8722665708648865e-06, + "loss": 2.276, + "step": 6264 + }, + { + "epoch": 0.33610515021459225, + "grad_norm": 1.9760268926620483, + "learning_rate": 3.87190344091322e-06, + "loss": 1.0606, + "step": 6265 + }, + { + "epoch": 0.3361587982832618, + "grad_norm": 1.4468380212783813, + "learning_rate": 3.871540269539995e-06, + "loss": 2.3597, + "step": 6266 + }, + { + "epoch": 0.3362124463519313, + "grad_norm": 2.4414610862731934, + "learning_rate": 3.871177056756176e-06, + "loss": 2.2864, + "step": 6267 + }, + { + "epoch": 0.33626609442060085, + "grad_norm": 1.4311810731887817, + "learning_rate": 3.87081380257273e-06, + "loss": 2.3276, + "step": 6268 + }, + { + "epoch": 0.3363197424892704, + "grad_norm": 1.529992938041687, + "learning_rate": 3.870450507000625e-06, + "loss": 2.1435, + "step": 6269 + }, + { + "epoch": 0.3363733905579399, + "grad_norm": 1.3982223272323608, + "learning_rate": 3.870087170050828e-06, + "loss": 2.4015, + "step": 6270 + }, + { + "epoch": 0.33642703862660944, + "grad_norm": 1.6170828342437744, + "learning_rate": 3.869723791734312e-06, + "loss": 2.2164, + "step": 6271 + }, + { + "epoch": 0.336480686695279, + "grad_norm": 1.5131142139434814, + "learning_rate": 3.8693603720620464e-06, + "loss": 2.4421, + "step": 6272 + }, + { + "epoch": 0.3365343347639485, + "grad_norm": 1.5544012784957886, + "learning_rate": 3.868996911045005e-06, + "loss": 2.1549, + "step": 6273 + }, + { + "epoch": 0.33658798283261804, + "grad_norm": 1.5486555099487305, + "learning_rate": 3.86863340869416e-06, + "loss": 2.3841, + "step": 6274 + }, + { + "epoch": 0.33664163090128757, + "grad_norm": 1.0354562997817993, + "learning_rate": 3.868269865020489e-06, + "loss": 1.8543, + "step": 6275 + }, + { + "epoch": 0.3366952789699571, + "grad_norm": 1.5357552766799927, + "learning_rate": 3.867906280034965e-06, + "loss": 2.4234, + "step": 6276 + }, + { + "epoch": 0.33674892703862663, + "grad_norm": 1.188955545425415, + "learning_rate": 3.867542653748569e-06, + "loss": 2.1093, + "step": 6277 + }, + { + "epoch": 0.33680257510729616, + "grad_norm": 1.236797571182251, + "learning_rate": 3.867178986172279e-06, + "loss": 2.1325, + "step": 6278 + }, + { + "epoch": 0.33685622317596564, + "grad_norm": 1.275349736213684, + "learning_rate": 3.8668152773170735e-06, + "loss": 1.9005, + "step": 6279 + }, + { + "epoch": 0.3369098712446352, + "grad_norm": 1.6370301246643066, + "learning_rate": 3.866451527193936e-06, + "loss": 2.3332, + "step": 6280 + }, + { + "epoch": 0.3369635193133047, + "grad_norm": 1.4016544818878174, + "learning_rate": 3.866087735813847e-06, + "loss": 2.0545, + "step": 6281 + }, + { + "epoch": 0.33701716738197424, + "grad_norm": 1.6162351369857788, + "learning_rate": 3.865723903187792e-06, + "loss": 2.4077, + "step": 6282 + }, + { + "epoch": 0.33707081545064377, + "grad_norm": 1.3306087255477905, + "learning_rate": 3.8653600293267565e-06, + "loss": 2.0037, + "step": 6283 + }, + { + "epoch": 0.3371244635193133, + "grad_norm": 4.7715373039245605, + "learning_rate": 3.864996114241725e-06, + "loss": 2.5435, + "step": 6284 + }, + { + "epoch": 0.33717811158798283, + "grad_norm": 1.3337410688400269, + "learning_rate": 3.864632157943686e-06, + "loss": 1.9702, + "step": 6285 + }, + { + "epoch": 0.33723175965665236, + "grad_norm": 1.4679689407348633, + "learning_rate": 3.864268160443629e-06, + "loss": 2.087, + "step": 6286 + }, + { + "epoch": 0.3372854077253219, + "grad_norm": 1.351992130279541, + "learning_rate": 3.863904121752544e-06, + "loss": 2.3647, + "step": 6287 + }, + { + "epoch": 0.3373390557939914, + "grad_norm": 1.418066382408142, + "learning_rate": 3.863540041881422e-06, + "loss": 2.0129, + "step": 6288 + }, + { + "epoch": 0.33739270386266096, + "grad_norm": 1.500707745552063, + "learning_rate": 3.863175920841254e-06, + "loss": 2.3083, + "step": 6289 + }, + { + "epoch": 0.3374463519313305, + "grad_norm": 6.493648529052734, + "learning_rate": 3.862811758643035e-06, + "loss": 2.2717, + "step": 6290 + }, + { + "epoch": 0.3375, + "grad_norm": 1.411453127861023, + "learning_rate": 3.862447555297763e-06, + "loss": 1.8747, + "step": 6291 + }, + { + "epoch": 0.33755364806866955, + "grad_norm": 1.385439395904541, + "learning_rate": 3.862083310816429e-06, + "loss": 2.2166, + "step": 6292 + }, + { + "epoch": 0.33760729613733903, + "grad_norm": 2.579744815826416, + "learning_rate": 3.861719025210034e-06, + "loss": 2.1951, + "step": 6293 + }, + { + "epoch": 0.33766094420600856, + "grad_norm": 1.724898099899292, + "learning_rate": 3.861354698489576e-06, + "loss": 2.2621, + "step": 6294 + }, + { + "epoch": 0.3377145922746781, + "grad_norm": 1.5882360935211182, + "learning_rate": 3.860990330666055e-06, + "loss": 2.3171, + "step": 6295 + }, + { + "epoch": 0.3377682403433476, + "grad_norm": 1.3877136707305908, + "learning_rate": 3.860625921750474e-06, + "loss": 2.2365, + "step": 6296 + }, + { + "epoch": 0.33782188841201716, + "grad_norm": 1.9223088026046753, + "learning_rate": 3.860261471753832e-06, + "loss": 2.3241, + "step": 6297 + }, + { + "epoch": 0.3378755364806867, + "grad_norm": 1.8392900228500366, + "learning_rate": 3.859896980687135e-06, + "loss": 2.6306, + "step": 6298 + }, + { + "epoch": 0.3379291845493562, + "grad_norm": 1.5349417924880981, + "learning_rate": 3.859532448561386e-06, + "loss": 2.2588, + "step": 6299 + }, + { + "epoch": 0.33798283261802575, + "grad_norm": 1.1491702795028687, + "learning_rate": 3.859167875387595e-06, + "loss": 2.0725, + "step": 6300 + }, + { + "epoch": 0.3380364806866953, + "grad_norm": 10.733426094055176, + "learning_rate": 3.8588032611767665e-06, + "loss": 2.2605, + "step": 6301 + }, + { + "epoch": 0.3380901287553648, + "grad_norm": 1.5075838565826416, + "learning_rate": 3.85843860593991e-06, + "loss": 2.3296, + "step": 6302 + }, + { + "epoch": 0.33814377682403435, + "grad_norm": 2.6896042823791504, + "learning_rate": 3.858073909688035e-06, + "loss": 2.2037, + "step": 6303 + }, + { + "epoch": 0.3381974248927039, + "grad_norm": 1.6419333219528198, + "learning_rate": 3.857709172432153e-06, + "loss": 2.4771, + "step": 6304 + }, + { + "epoch": 0.3382510729613734, + "grad_norm": 1.5565507411956787, + "learning_rate": 3.857344394183277e-06, + "loss": 2.3742, + "step": 6305 + }, + { + "epoch": 0.33830472103004294, + "grad_norm": 1.4735013246536255, + "learning_rate": 3.856979574952419e-06, + "loss": 2.2568, + "step": 6306 + }, + { + "epoch": 0.3383583690987124, + "grad_norm": 1.1989693641662598, + "learning_rate": 3.856614714750597e-06, + "loss": 1.9973, + "step": 6307 + }, + { + "epoch": 0.33841201716738195, + "grad_norm": 4.490205764770508, + "learning_rate": 3.856249813588824e-06, + "loss": 2.6266, + "step": 6308 + }, + { + "epoch": 0.3384656652360515, + "grad_norm": 1.2256189584732056, + "learning_rate": 3.855884871478119e-06, + "loss": 2.087, + "step": 6309 + }, + { + "epoch": 0.338519313304721, + "grad_norm": 1.9927901029586792, + "learning_rate": 3.855519888429501e-06, + "loss": 1.38, + "step": 6310 + }, + { + "epoch": 0.33857296137339055, + "grad_norm": 1.394152283668518, + "learning_rate": 3.855154864453987e-06, + "loss": 2.2335, + "step": 6311 + }, + { + "epoch": 0.3386266094420601, + "grad_norm": 3.090458631515503, + "learning_rate": 3.8547897995626025e-06, + "loss": 2.2872, + "step": 6312 + }, + { + "epoch": 0.3386802575107296, + "grad_norm": 1.6368776559829712, + "learning_rate": 3.854424693766366e-06, + "loss": 2.3794, + "step": 6313 + }, + { + "epoch": 0.33873390557939914, + "grad_norm": 2.3276658058166504, + "learning_rate": 3.854059547076303e-06, + "loss": 2.2994, + "step": 6314 + }, + { + "epoch": 0.3387875536480687, + "grad_norm": 1.46492338180542, + "learning_rate": 3.853694359503437e-06, + "loss": 1.9603, + "step": 6315 + }, + { + "epoch": 0.3388412017167382, + "grad_norm": 1.3829314708709717, + "learning_rate": 3.853329131058796e-06, + "loss": 2.5207, + "step": 6316 + }, + { + "epoch": 0.33889484978540774, + "grad_norm": 1.3139957189559937, + "learning_rate": 3.852963861753406e-06, + "loss": 2.2318, + "step": 6317 + }, + { + "epoch": 0.33894849785407727, + "grad_norm": 1.1784919500350952, + "learning_rate": 3.852598551598295e-06, + "loss": 2.0703, + "step": 6318 + }, + { + "epoch": 0.3390021459227468, + "grad_norm": 1.3850423097610474, + "learning_rate": 3.852233200604493e-06, + "loss": 2.211, + "step": 6319 + }, + { + "epoch": 0.33905579399141633, + "grad_norm": 1.731902003288269, + "learning_rate": 3.851867808783032e-06, + "loss": 2.3995, + "step": 6320 + }, + { + "epoch": 0.33910944206008586, + "grad_norm": 1.530334711074829, + "learning_rate": 3.851502376144943e-06, + "loss": 2.2175, + "step": 6321 + }, + { + "epoch": 0.33916309012875534, + "grad_norm": 1.220383882522583, + "learning_rate": 3.85113690270126e-06, + "loss": 1.9523, + "step": 6322 + }, + { + "epoch": 0.33921673819742487, + "grad_norm": 1.4292024374008179, + "learning_rate": 3.8507713884630174e-06, + "loss": 2.0751, + "step": 6323 + }, + { + "epoch": 0.3392703862660944, + "grad_norm": 1.2505770921707153, + "learning_rate": 3.850405833441251e-06, + "loss": 2.4885, + "step": 6324 + }, + { + "epoch": 0.33932403433476394, + "grad_norm": 1.3056080341339111, + "learning_rate": 3.850040237646998e-06, + "loss": 2.3672, + "step": 6325 + }, + { + "epoch": 0.33937768240343347, + "grad_norm": 1.8368221521377563, + "learning_rate": 3.849674601091298e-06, + "loss": 2.3235, + "step": 6326 + }, + { + "epoch": 0.339431330472103, + "grad_norm": 1.2423819303512573, + "learning_rate": 3.8493089237851885e-06, + "loss": 2.3204, + "step": 6327 + }, + { + "epoch": 0.33948497854077253, + "grad_norm": 1.3403520584106445, + "learning_rate": 3.848943205739711e-06, + "loss": 2.3677, + "step": 6328 + }, + { + "epoch": 0.33953862660944206, + "grad_norm": 1.676029920578003, + "learning_rate": 3.8485774469659085e-06, + "loss": 2.6419, + "step": 6329 + }, + { + "epoch": 0.3395922746781116, + "grad_norm": 1.326041579246521, + "learning_rate": 3.848211647474824e-06, + "loss": 1.9162, + "step": 6330 + }, + { + "epoch": 0.3396459227467811, + "grad_norm": 1.682722806930542, + "learning_rate": 3.847845807277501e-06, + "loss": 2.063, + "step": 6331 + }, + { + "epoch": 0.33969957081545066, + "grad_norm": 1.4190996885299683, + "learning_rate": 3.847479926384986e-06, + "loss": 2.0268, + "step": 6332 + }, + { + "epoch": 0.3397532188841202, + "grad_norm": 1.3999545574188232, + "learning_rate": 3.847114004808326e-06, + "loss": 2.2723, + "step": 6333 + }, + { + "epoch": 0.3398068669527897, + "grad_norm": 1.412208914756775, + "learning_rate": 3.846748042558568e-06, + "loss": 2.1635, + "step": 6334 + }, + { + "epoch": 0.33986051502145925, + "grad_norm": 1.3530380725860596, + "learning_rate": 3.846382039646762e-06, + "loss": 2.3856, + "step": 6335 + }, + { + "epoch": 0.33991416309012873, + "grad_norm": 1.8071779012680054, + "learning_rate": 3.8460159960839606e-06, + "loss": 2.2597, + "step": 6336 + }, + { + "epoch": 0.33996781115879826, + "grad_norm": 1.0220457315444946, + "learning_rate": 3.845649911881213e-06, + "loss": 1.7605, + "step": 6337 + }, + { + "epoch": 0.3400214592274678, + "grad_norm": 1.3356634378433228, + "learning_rate": 3.8452837870495735e-06, + "loss": 2.0646, + "step": 6338 + }, + { + "epoch": 0.3400751072961373, + "grad_norm": 1.6045458316802979, + "learning_rate": 3.844917621600096e-06, + "loss": 2.4612, + "step": 6339 + }, + { + "epoch": 0.34012875536480686, + "grad_norm": 1.407333493232727, + "learning_rate": 3.844551415543837e-06, + "loss": 2.1062, + "step": 6340 + }, + { + "epoch": 0.3401824034334764, + "grad_norm": 1.5741393566131592, + "learning_rate": 3.844185168891852e-06, + "loss": 2.329, + "step": 6341 + }, + { + "epoch": 0.3402360515021459, + "grad_norm": 3.266425848007202, + "learning_rate": 3.8438188816552e-06, + "loss": 2.4804, + "step": 6342 + }, + { + "epoch": 0.34028969957081545, + "grad_norm": 1.6010602712631226, + "learning_rate": 3.84345255384494e-06, + "loss": 2.3787, + "step": 6343 + }, + { + "epoch": 0.340343347639485, + "grad_norm": 1.3751670122146606, + "learning_rate": 3.843086185472131e-06, + "loss": 2.3774, + "step": 6344 + }, + { + "epoch": 0.3403969957081545, + "grad_norm": 1.2498719692230225, + "learning_rate": 3.8427197765478374e-06, + "loss": 2.2361, + "step": 6345 + }, + { + "epoch": 0.34045064377682405, + "grad_norm": 1.2906341552734375, + "learning_rate": 3.84235332708312e-06, + "loss": 1.7297, + "step": 6346 + }, + { + "epoch": 0.3405042918454936, + "grad_norm": 2.3686916828155518, + "learning_rate": 3.841986837089044e-06, + "loss": 2.3461, + "step": 6347 + }, + { + "epoch": 0.3405579399141631, + "grad_norm": 1.2460500001907349, + "learning_rate": 3.841620306576673e-06, + "loss": 2.417, + "step": 6348 + }, + { + "epoch": 0.34061158798283264, + "grad_norm": 1.5867910385131836, + "learning_rate": 3.841253735557077e-06, + "loss": 2.3582, + "step": 6349 + }, + { + "epoch": 0.3406652360515021, + "grad_norm": 1.4070888757705688, + "learning_rate": 3.840887124041319e-06, + "loss": 2.1522, + "step": 6350 + }, + { + "epoch": 0.34071888412017165, + "grad_norm": 1.452125906944275, + "learning_rate": 3.8405204720404726e-06, + "loss": 2.3827, + "step": 6351 + }, + { + "epoch": 0.3407725321888412, + "grad_norm": 2.207505941390991, + "learning_rate": 3.840153779565606e-06, + "loss": 2.4492, + "step": 6352 + }, + { + "epoch": 0.3408261802575107, + "grad_norm": 1.6600888967514038, + "learning_rate": 3.839787046627791e-06, + "loss": 2.1536, + "step": 6353 + }, + { + "epoch": 0.34087982832618025, + "grad_norm": 1.3883662223815918, + "learning_rate": 3.8394202732380995e-06, + "loss": 2.3022, + "step": 6354 + }, + { + "epoch": 0.3409334763948498, + "grad_norm": 1.3842953443527222, + "learning_rate": 3.839053459407606e-06, + "loss": 2.3947, + "step": 6355 + }, + { + "epoch": 0.3409871244635193, + "grad_norm": 1.5438265800476074, + "learning_rate": 3.838686605147384e-06, + "loss": 2.0738, + "step": 6356 + }, + { + "epoch": 0.34104077253218884, + "grad_norm": 2.9977214336395264, + "learning_rate": 3.838319710468513e-06, + "loss": 2.3778, + "step": 6357 + }, + { + "epoch": 0.3410944206008584, + "grad_norm": 1.5673528909683228, + "learning_rate": 3.83795277538207e-06, + "loss": 2.3898, + "step": 6358 + }, + { + "epoch": 0.3411480686695279, + "grad_norm": 1.9175660610198975, + "learning_rate": 3.8375857998991316e-06, + "loss": 2.3069, + "step": 6359 + }, + { + "epoch": 0.34120171673819744, + "grad_norm": 1.7557395696640015, + "learning_rate": 3.837218784030779e-06, + "loss": 2.1854, + "step": 6360 + }, + { + "epoch": 0.34125536480686697, + "grad_norm": 1.3138290643692017, + "learning_rate": 3.836851727788094e-06, + "loss": 2.2552, + "step": 6361 + }, + { + "epoch": 0.3413090128755365, + "grad_norm": 1.7046599388122559, + "learning_rate": 3.836484631182158e-06, + "loss": 2.3675, + "step": 6362 + }, + { + "epoch": 0.34136266094420603, + "grad_norm": 1.361918568611145, + "learning_rate": 3.836117494224055e-06, + "loss": 2.1814, + "step": 6363 + }, + { + "epoch": 0.34141630901287556, + "grad_norm": 1.7055100202560425, + "learning_rate": 3.83575031692487e-06, + "loss": 2.3144, + "step": 6364 + }, + { + "epoch": 0.34146995708154504, + "grad_norm": 1.6225265264511108, + "learning_rate": 3.835383099295689e-06, + "loss": 2.5762, + "step": 6365 + }, + { + "epoch": 0.34152360515021457, + "grad_norm": 1.2997620105743408, + "learning_rate": 3.8350158413476e-06, + "loss": 2.2533, + "step": 6366 + }, + { + "epoch": 0.3415772532188841, + "grad_norm": 1.475014328956604, + "learning_rate": 3.834648543091691e-06, + "loss": 2.4562, + "step": 6367 + }, + { + "epoch": 0.34163090128755363, + "grad_norm": 1.3553208112716675, + "learning_rate": 3.834281204539051e-06, + "loss": 2.2014, + "step": 6368 + }, + { + "epoch": 0.34168454935622317, + "grad_norm": 1.4530366659164429, + "learning_rate": 3.833913825700772e-06, + "loss": 2.3747, + "step": 6369 + }, + { + "epoch": 0.3417381974248927, + "grad_norm": 1.41780686378479, + "learning_rate": 3.833546406587946e-06, + "loss": 2.5931, + "step": 6370 + }, + { + "epoch": 0.34179184549356223, + "grad_norm": 1.554456114768982, + "learning_rate": 3.833178947211667e-06, + "loss": 2.3523, + "step": 6371 + }, + { + "epoch": 0.34184549356223176, + "grad_norm": 1.2901263236999512, + "learning_rate": 3.832811447583027e-06, + "loss": 2.234, + "step": 6372 + }, + { + "epoch": 0.3418991416309013, + "grad_norm": 1.5654377937316895, + "learning_rate": 3.832443907713124e-06, + "loss": 2.334, + "step": 6373 + }, + { + "epoch": 0.3419527896995708, + "grad_norm": 1.3794069290161133, + "learning_rate": 3.832076327613056e-06, + "loss": 2.196, + "step": 6374 + }, + { + "epoch": 0.34200643776824036, + "grad_norm": 4.975091934204102, + "learning_rate": 3.831708707293919e-06, + "loss": 2.2829, + "step": 6375 + }, + { + "epoch": 0.3420600858369099, + "grad_norm": 2.074338436126709, + "learning_rate": 3.831341046766814e-06, + "loss": 2.2461, + "step": 6376 + }, + { + "epoch": 0.3421137339055794, + "grad_norm": 1.386121392250061, + "learning_rate": 3.830973346042841e-06, + "loss": 2.2264, + "step": 6377 + }, + { + "epoch": 0.34216738197424895, + "grad_norm": 1.2878543138504028, + "learning_rate": 3.830605605133102e-06, + "loss": 2.0824, + "step": 6378 + }, + { + "epoch": 0.34222103004291843, + "grad_norm": 1.2218302488327026, + "learning_rate": 3.8302378240486995e-06, + "loss": 2.0873, + "step": 6379 + }, + { + "epoch": 0.34227467811158796, + "grad_norm": 1.6448884010314941, + "learning_rate": 3.82987000280074e-06, + "loss": 2.1941, + "step": 6380 + }, + { + "epoch": 0.3423283261802575, + "grad_norm": 1.417178750038147, + "learning_rate": 3.829502141400327e-06, + "loss": 2.5649, + "step": 6381 + }, + { + "epoch": 0.342381974248927, + "grad_norm": 1.361562728881836, + "learning_rate": 3.829134239858567e-06, + "loss": 2.4576, + "step": 6382 + }, + { + "epoch": 0.34243562231759656, + "grad_norm": 1.1940594911575317, + "learning_rate": 3.828766298186569e-06, + "loss": 1.8028, + "step": 6383 + }, + { + "epoch": 0.3424892703862661, + "grad_norm": 1.3963851928710938, + "learning_rate": 3.828398316395442e-06, + "loss": 2.1841, + "step": 6384 + }, + { + "epoch": 0.3425429184549356, + "grad_norm": 1.6720631122589111, + "learning_rate": 3.8280302944962965e-06, + "loss": 2.4733, + "step": 6385 + }, + { + "epoch": 0.34259656652360515, + "grad_norm": 1.6624995470046997, + "learning_rate": 3.827662232500244e-06, + "loss": 2.5324, + "step": 6386 + }, + { + "epoch": 0.3426502145922747, + "grad_norm": 1.3570040464401245, + "learning_rate": 3.827294130418397e-06, + "loss": 2.2124, + "step": 6387 + }, + { + "epoch": 0.3427038626609442, + "grad_norm": 1.4931702613830566, + "learning_rate": 3.82692598826187e-06, + "loss": 2.283, + "step": 6388 + }, + { + "epoch": 0.34275751072961375, + "grad_norm": 1.3648391962051392, + "learning_rate": 3.826557806041779e-06, + "loss": 2.1575, + "step": 6389 + }, + { + "epoch": 0.3428111587982833, + "grad_norm": 1.4024423360824585, + "learning_rate": 3.826189583769237e-06, + "loss": 2.522, + "step": 6390 + }, + { + "epoch": 0.3428648068669528, + "grad_norm": 1.498680830001831, + "learning_rate": 3.825821321455365e-06, + "loss": 2.2815, + "step": 6391 + }, + { + "epoch": 0.34291845493562234, + "grad_norm": 1.3559666872024536, + "learning_rate": 3.8254530191112815e-06, + "loss": 2.0928, + "step": 6392 + }, + { + "epoch": 0.3429721030042919, + "grad_norm": 1.2270348072052002, + "learning_rate": 3.825084676748106e-06, + "loss": 1.9737, + "step": 6393 + }, + { + "epoch": 0.34302575107296135, + "grad_norm": 1.5853955745697021, + "learning_rate": 3.82471629437696e-06, + "loss": 2.4952, + "step": 6394 + }, + { + "epoch": 0.3430793991416309, + "grad_norm": 1.324788212776184, + "learning_rate": 3.824347872008966e-06, + "loss": 2.4061, + "step": 6395 + }, + { + "epoch": 0.3431330472103004, + "grad_norm": 1.1054415702819824, + "learning_rate": 3.8239794096552455e-06, + "loss": 2.1777, + "step": 6396 + }, + { + "epoch": 0.34318669527896994, + "grad_norm": 1.6014986038208008, + "learning_rate": 3.823610907326927e-06, + "loss": 2.2555, + "step": 6397 + }, + { + "epoch": 0.3432403433476395, + "grad_norm": 1.4888561964035034, + "learning_rate": 3.8232423650351344e-06, + "loss": 2.3023, + "step": 6398 + }, + { + "epoch": 0.343293991416309, + "grad_norm": 1.466916799545288, + "learning_rate": 3.822873782790996e-06, + "loss": 2.4145, + "step": 6399 + }, + { + "epoch": 0.34334763948497854, + "grad_norm": 1.4280308485031128, + "learning_rate": 3.82250516060564e-06, + "loss": 2.2655, + "step": 6400 + }, + { + "epoch": 0.34340128755364807, + "grad_norm": 1.5005484819412231, + "learning_rate": 3.822136498490195e-06, + "loss": 2.1201, + "step": 6401 + }, + { + "epoch": 0.3434549356223176, + "grad_norm": 1.4878922700881958, + "learning_rate": 3.821767796455793e-06, + "loss": 2.2387, + "step": 6402 + }, + { + "epoch": 0.34350858369098713, + "grad_norm": 1.4106266498565674, + "learning_rate": 3.821399054513567e-06, + "loss": 2.2957, + "step": 6403 + }, + { + "epoch": 0.34356223175965667, + "grad_norm": 1.4952090978622437, + "learning_rate": 3.821030272674648e-06, + "loss": 2.2203, + "step": 6404 + }, + { + "epoch": 0.3436158798283262, + "grad_norm": 1.2765313386917114, + "learning_rate": 3.820661450950173e-06, + "loss": 2.0189, + "step": 6405 + }, + { + "epoch": 0.34366952789699573, + "grad_norm": 1.2692092657089233, + "learning_rate": 3.820292589351276e-06, + "loss": 2.298, + "step": 6406 + }, + { + "epoch": 0.34372317596566526, + "grad_norm": 2.080418109893799, + "learning_rate": 3.8199236878890954e-06, + "loss": 2.1477, + "step": 6407 + }, + { + "epoch": 0.34377682403433474, + "grad_norm": 1.4335898160934448, + "learning_rate": 3.819554746574768e-06, + "loss": 2.3106, + "step": 6408 + }, + { + "epoch": 0.34383047210300427, + "grad_norm": 1.4354840517044067, + "learning_rate": 3.819185765419435e-06, + "loss": 2.276, + "step": 6409 + }, + { + "epoch": 0.3438841201716738, + "grad_norm": 1.292160153388977, + "learning_rate": 3.818816744434235e-06, + "loss": 2.0759, + "step": 6410 + }, + { + "epoch": 0.34393776824034333, + "grad_norm": 1.348283052444458, + "learning_rate": 3.818447683630311e-06, + "loss": 2.2615, + "step": 6411 + }, + { + "epoch": 0.34399141630901287, + "grad_norm": 1.5178455114364624, + "learning_rate": 3.818078583018804e-06, + "loss": 2.3794, + "step": 6412 + }, + { + "epoch": 0.3440450643776824, + "grad_norm": 1.3080862760543823, + "learning_rate": 3.81770944261086e-06, + "loss": 2.436, + "step": 6413 + }, + { + "epoch": 0.34409871244635193, + "grad_norm": 1.71969735622406, + "learning_rate": 3.817340262417624e-06, + "loss": 2.3075, + "step": 6414 + }, + { + "epoch": 0.34415236051502146, + "grad_norm": 1.4548228979110718, + "learning_rate": 3.8169710424502436e-06, + "loss": 2.3396, + "step": 6415 + }, + { + "epoch": 0.344206008583691, + "grad_norm": 1.3521168231964111, + "learning_rate": 3.816601782719864e-06, + "loss": 2.3306, + "step": 6416 + }, + { + "epoch": 0.3442596566523605, + "grad_norm": 1.2059763669967651, + "learning_rate": 3.816232483237638e-06, + "loss": 2.3689, + "step": 6417 + }, + { + "epoch": 0.34431330472103006, + "grad_norm": 1.4981037378311157, + "learning_rate": 3.815863144014711e-06, + "loss": 2.2768, + "step": 6418 + }, + { + "epoch": 0.3443669527896996, + "grad_norm": 3.38205885887146, + "learning_rate": 3.815493765062238e-06, + "loss": 2.3711, + "step": 6419 + }, + { + "epoch": 0.3444206008583691, + "grad_norm": 1.3896703720092773, + "learning_rate": 3.81512434639137e-06, + "loss": 2.47, + "step": 6420 + }, + { + "epoch": 0.34447424892703865, + "grad_norm": 1.3766453266143799, + "learning_rate": 3.814754888013261e-06, + "loss": 2.3083, + "step": 6421 + }, + { + "epoch": 0.3445278969957081, + "grad_norm": 1.3648570775985718, + "learning_rate": 3.814385389939067e-06, + "loss": 2.1336, + "step": 6422 + }, + { + "epoch": 0.34458154506437766, + "grad_norm": 1.4411721229553223, + "learning_rate": 3.814015852179943e-06, + "loss": 2.3438, + "step": 6423 + }, + { + "epoch": 0.3446351931330472, + "grad_norm": 1.2792072296142578, + "learning_rate": 3.813646274747046e-06, + "loss": 2.411, + "step": 6424 + }, + { + "epoch": 0.3446888412017167, + "grad_norm": 1.1754347085952759, + "learning_rate": 3.8132766576515355e-06, + "loss": 2.3705, + "step": 6425 + }, + { + "epoch": 0.34474248927038625, + "grad_norm": 1.3197308778762817, + "learning_rate": 3.8129070009045723e-06, + "loss": 2.2439, + "step": 6426 + }, + { + "epoch": 0.3447961373390558, + "grad_norm": 1.4153891801834106, + "learning_rate": 3.812537304517314e-06, + "loss": 2.1182, + "step": 6427 + }, + { + "epoch": 0.3448497854077253, + "grad_norm": 1.389613389968872, + "learning_rate": 3.812167568500927e-06, + "loss": 2.4778, + "step": 6428 + }, + { + "epoch": 0.34490343347639485, + "grad_norm": 1.248624563217163, + "learning_rate": 3.8117977928665707e-06, + "loss": 2.047, + "step": 6429 + }, + { + "epoch": 0.3449570815450644, + "grad_norm": 2.0799033641815186, + "learning_rate": 3.8114279776254125e-06, + "loss": 2.1583, + "step": 6430 + }, + { + "epoch": 0.3450107296137339, + "grad_norm": 2.5087671279907227, + "learning_rate": 3.8110581227886166e-06, + "loss": 2.2325, + "step": 6431 + }, + { + "epoch": 0.34506437768240344, + "grad_norm": 1.8247483968734741, + "learning_rate": 3.810688228367351e-06, + "loss": 2.1702, + "step": 6432 + }, + { + "epoch": 0.345118025751073, + "grad_norm": 1.5115059614181519, + "learning_rate": 3.8103182943727824e-06, + "loss": 2.371, + "step": 6433 + }, + { + "epoch": 0.3451716738197425, + "grad_norm": 1.4052335023880005, + "learning_rate": 3.8099483208160816e-06, + "loss": 2.4365, + "step": 6434 + }, + { + "epoch": 0.34522532188841204, + "grad_norm": 1.3167140483856201, + "learning_rate": 3.8095783077084182e-06, + "loss": 2.2013, + "step": 6435 + }, + { + "epoch": 0.34527896995708157, + "grad_norm": 1.432983636856079, + "learning_rate": 3.809208255060964e-06, + "loss": 1.7112, + "step": 6436 + }, + { + "epoch": 0.34533261802575105, + "grad_norm": 1.6556049585342407, + "learning_rate": 3.8088381628848924e-06, + "loss": 2.4737, + "step": 6437 + }, + { + "epoch": 0.3453862660944206, + "grad_norm": 1.4849791526794434, + "learning_rate": 3.808468031191378e-06, + "loss": 2.1427, + "step": 6438 + }, + { + "epoch": 0.3454399141630901, + "grad_norm": 1.1858291625976562, + "learning_rate": 3.808097859991594e-06, + "loss": 1.9224, + "step": 6439 + }, + { + "epoch": 0.34549356223175964, + "grad_norm": 1.423277497291565, + "learning_rate": 3.8077276492967193e-06, + "loss": 2.379, + "step": 6440 + }, + { + "epoch": 0.3455472103004292, + "grad_norm": 1.275699257850647, + "learning_rate": 3.80735739911793e-06, + "loss": 2.1037, + "step": 6441 + }, + { + "epoch": 0.3456008583690987, + "grad_norm": 1.4181658029556274, + "learning_rate": 3.806987109466406e-06, + "loss": 2.3455, + "step": 6442 + }, + { + "epoch": 0.34565450643776824, + "grad_norm": 1.2438486814498901, + "learning_rate": 3.8066167803533262e-06, + "loss": 2.2367, + "step": 6443 + }, + { + "epoch": 0.34570815450643777, + "grad_norm": 1.846069574356079, + "learning_rate": 3.806246411789872e-06, + "loss": 2.383, + "step": 6444 + }, + { + "epoch": 0.3457618025751073, + "grad_norm": 1.7154114246368408, + "learning_rate": 3.805876003787228e-06, + "loss": 2.3927, + "step": 6445 + }, + { + "epoch": 0.34581545064377683, + "grad_norm": 1.4271003007888794, + "learning_rate": 3.805505556356575e-06, + "loss": 2.3706, + "step": 6446 + }, + { + "epoch": 0.34586909871244637, + "grad_norm": 1.3748867511749268, + "learning_rate": 3.8051350695090993e-06, + "loss": 2.2433, + "step": 6447 + }, + { + "epoch": 0.3459227467811159, + "grad_norm": 1.6252453327178955, + "learning_rate": 3.804764543255987e-06, + "loss": 2.3209, + "step": 6448 + }, + { + "epoch": 0.34597639484978543, + "grad_norm": 1.3945691585540771, + "learning_rate": 3.804393977608425e-06, + "loss": 2.3845, + "step": 6449 + }, + { + "epoch": 0.34603004291845496, + "grad_norm": 1.7789602279663086, + "learning_rate": 3.804023372577602e-06, + "loss": 2.0685, + "step": 6450 + }, + { + "epoch": 0.34608369098712444, + "grad_norm": 1.509685754776001, + "learning_rate": 3.8036527281747066e-06, + "loss": 1.5483, + "step": 6451 + }, + { + "epoch": 0.34613733905579397, + "grad_norm": 1.3482797145843506, + "learning_rate": 3.8032820444109297e-06, + "loss": 2.1116, + "step": 6452 + }, + { + "epoch": 0.3461909871244635, + "grad_norm": 1.3718268871307373, + "learning_rate": 3.8029113212974643e-06, + "loss": 2.3645, + "step": 6453 + }, + { + "epoch": 0.34624463519313303, + "grad_norm": 1.4623862504959106, + "learning_rate": 3.802540558845504e-06, + "loss": 2.3864, + "step": 6454 + }, + { + "epoch": 0.34629828326180256, + "grad_norm": 7.074548721313477, + "learning_rate": 3.802169757066242e-06, + "loss": 2.2557, + "step": 6455 + }, + { + "epoch": 0.3463519313304721, + "grad_norm": 1.4995418787002563, + "learning_rate": 3.8017989159708736e-06, + "loss": 2.3866, + "step": 6456 + }, + { + "epoch": 0.3464055793991416, + "grad_norm": 1.2509571313858032, + "learning_rate": 3.8014280355705956e-06, + "loss": 2.3144, + "step": 6457 + }, + { + "epoch": 0.34645922746781116, + "grad_norm": 1.3468259572982788, + "learning_rate": 3.801057115876606e-06, + "loss": 2.2096, + "step": 6458 + }, + { + "epoch": 0.3465128755364807, + "grad_norm": 1.34392511844635, + "learning_rate": 3.8006861569001064e-06, + "loss": 2.1673, + "step": 6459 + }, + { + "epoch": 0.3465665236051502, + "grad_norm": 1.473832607269287, + "learning_rate": 3.800315158652293e-06, + "loss": 2.2973, + "step": 6460 + }, + { + "epoch": 0.34662017167381975, + "grad_norm": 1.558215856552124, + "learning_rate": 3.7999441211443692e-06, + "loss": 2.1039, + "step": 6461 + }, + { + "epoch": 0.3466738197424893, + "grad_norm": 1.5232497453689575, + "learning_rate": 3.799573044387538e-06, + "loss": 2.5628, + "step": 6462 + }, + { + "epoch": 0.3467274678111588, + "grad_norm": 1.4501172304153442, + "learning_rate": 3.7992019283930027e-06, + "loss": 2.4402, + "step": 6463 + }, + { + "epoch": 0.34678111587982835, + "grad_norm": 1.6622258424758911, + "learning_rate": 3.798830773171968e-06, + "loss": 2.3627, + "step": 6464 + }, + { + "epoch": 0.3468347639484979, + "grad_norm": 2.0232038497924805, + "learning_rate": 3.798459578735641e-06, + "loss": 2.1348, + "step": 6465 + }, + { + "epoch": 0.34688841201716736, + "grad_norm": 1.473836064338684, + "learning_rate": 3.798088345095228e-06, + "loss": 2.2965, + "step": 6466 + }, + { + "epoch": 0.3469420600858369, + "grad_norm": 1.4368524551391602, + "learning_rate": 3.7977170722619388e-06, + "loss": 2.3215, + "step": 6467 + }, + { + "epoch": 0.3469957081545064, + "grad_norm": 1.573657512664795, + "learning_rate": 3.7973457602469825e-06, + "loss": 2.2803, + "step": 6468 + }, + { + "epoch": 0.34704935622317595, + "grad_norm": 1.6372370719909668, + "learning_rate": 3.79697440906157e-06, + "loss": 2.1696, + "step": 6469 + }, + { + "epoch": 0.3471030042918455, + "grad_norm": 1.3895238637924194, + "learning_rate": 3.7966030187169134e-06, + "loss": 2.1239, + "step": 6470 + }, + { + "epoch": 0.347156652360515, + "grad_norm": 1.3323166370391846, + "learning_rate": 3.7962315892242264e-06, + "loss": 2.5336, + "step": 6471 + }, + { + "epoch": 0.34721030042918455, + "grad_norm": 1.4975160360336304, + "learning_rate": 3.795860120594723e-06, + "loss": 2.6107, + "step": 6472 + }, + { + "epoch": 0.3472639484978541, + "grad_norm": 1.3187066316604614, + "learning_rate": 3.7954886128396186e-06, + "loss": 2.1704, + "step": 6473 + }, + { + "epoch": 0.3473175965665236, + "grad_norm": 1.4303984642028809, + "learning_rate": 3.7951170659701304e-06, + "loss": 2.2342, + "step": 6474 + }, + { + "epoch": 0.34737124463519314, + "grad_norm": 1.5964797735214233, + "learning_rate": 3.794745479997477e-06, + "loss": 2.315, + "step": 6475 + }, + { + "epoch": 0.3474248927038627, + "grad_norm": 1.3571821451187134, + "learning_rate": 3.7943738549328766e-06, + "loss": 2.1722, + "step": 6476 + }, + { + "epoch": 0.3474785407725322, + "grad_norm": 1.3883659839630127, + "learning_rate": 3.7940021907875515e-06, + "loss": 2.2113, + "step": 6477 + }, + { + "epoch": 0.34753218884120174, + "grad_norm": 1.8634958267211914, + "learning_rate": 3.79363048757272e-06, + "loss": 2.259, + "step": 6478 + }, + { + "epoch": 0.34758583690987127, + "grad_norm": 1.660954236984253, + "learning_rate": 3.793258745299608e-06, + "loss": 2.2335, + "step": 6479 + }, + { + "epoch": 0.34763948497854075, + "grad_norm": 1.4764294624328613, + "learning_rate": 3.7928869639794373e-06, + "loss": 2.1591, + "step": 6480 + }, + { + "epoch": 0.3476931330472103, + "grad_norm": 1.234781265258789, + "learning_rate": 3.7925151436234346e-06, + "loss": 2.2239, + "step": 6481 + }, + { + "epoch": 0.3477467811158798, + "grad_norm": 1.5989177227020264, + "learning_rate": 3.7921432842428253e-06, + "loss": 2.3294, + "step": 6482 + }, + { + "epoch": 0.34780042918454934, + "grad_norm": 1.498311161994934, + "learning_rate": 3.791771385848837e-06, + "loss": 2.4025, + "step": 6483 + }, + { + "epoch": 0.3478540772532189, + "grad_norm": 1.346754550933838, + "learning_rate": 3.791399448452698e-06, + "loss": 2.2927, + "step": 6484 + }, + { + "epoch": 0.3479077253218884, + "grad_norm": 1.2382075786590576, + "learning_rate": 3.791027472065638e-06, + "loss": 1.945, + "step": 6485 + }, + { + "epoch": 0.34796137339055794, + "grad_norm": 1.1924548149108887, + "learning_rate": 3.7906554566988896e-06, + "loss": 1.9653, + "step": 6486 + }, + { + "epoch": 0.34801502145922747, + "grad_norm": 1.8447448015213013, + "learning_rate": 3.790283402363683e-06, + "loss": 2.4031, + "step": 6487 + }, + { + "epoch": 0.348068669527897, + "grad_norm": 1.4024606943130493, + "learning_rate": 3.7899113090712526e-06, + "loss": 2.333, + "step": 6488 + }, + { + "epoch": 0.34812231759656653, + "grad_norm": 1.3806215524673462, + "learning_rate": 3.7895391768328326e-06, + "loss": 2.2055, + "step": 6489 + }, + { + "epoch": 0.34817596566523606, + "grad_norm": 1.5431177616119385, + "learning_rate": 3.7891670056596597e-06, + "loss": 2.3228, + "step": 6490 + }, + { + "epoch": 0.3482296137339056, + "grad_norm": 1.1286429166793823, + "learning_rate": 3.788794795562969e-06, + "loss": 2.2692, + "step": 6491 + }, + { + "epoch": 0.34828326180257513, + "grad_norm": 1.5850770473480225, + "learning_rate": 3.788422546553999e-06, + "loss": 2.1192, + "step": 6492 + }, + { + "epoch": 0.34833690987124466, + "grad_norm": 1.4153685569763184, + "learning_rate": 3.7880502586439907e-06, + "loss": 2.4499, + "step": 6493 + }, + { + "epoch": 0.34839055793991414, + "grad_norm": 1.3940708637237549, + "learning_rate": 3.7876779318441825e-06, + "loss": 2.5393, + "step": 6494 + }, + { + "epoch": 0.34844420600858367, + "grad_norm": 12.740437507629395, + "learning_rate": 3.7873055661658167e-06, + "loss": 2.0997, + "step": 6495 + }, + { + "epoch": 0.3484978540772532, + "grad_norm": 1.6795686483383179, + "learning_rate": 3.7869331616201353e-06, + "loss": 2.2039, + "step": 6496 + }, + { + "epoch": 0.34855150214592273, + "grad_norm": 1.560244083404541, + "learning_rate": 3.786560718218383e-06, + "loss": 2.2864, + "step": 6497 + }, + { + "epoch": 0.34860515021459226, + "grad_norm": 1.711350917816162, + "learning_rate": 3.7861882359718056e-06, + "loss": 2.327, + "step": 6498 + }, + { + "epoch": 0.3486587982832618, + "grad_norm": 1.5140128135681152, + "learning_rate": 3.7858157148916486e-06, + "loss": 2.155, + "step": 6499 + }, + { + "epoch": 0.3487124463519313, + "grad_norm": 1.5251073837280273, + "learning_rate": 3.785443154989159e-06, + "loss": 2.4763, + "step": 6500 + }, + { + "epoch": 0.34876609442060086, + "grad_norm": 1.4040336608886719, + "learning_rate": 3.7850705562755863e-06, + "loss": 2.0466, + "step": 6501 + }, + { + "epoch": 0.3488197424892704, + "grad_norm": 1.3360854387283325, + "learning_rate": 3.784697918762179e-06, + "loss": 2.2351, + "step": 6502 + }, + { + "epoch": 0.3488733905579399, + "grad_norm": 1.268568515777588, + "learning_rate": 3.7843252424601894e-06, + "loss": 2.024, + "step": 6503 + }, + { + "epoch": 0.34892703862660945, + "grad_norm": 4.018975734710693, + "learning_rate": 3.7839525273808687e-06, + "loss": 2.3989, + "step": 6504 + }, + { + "epoch": 0.348980686695279, + "grad_norm": 3.112670421600342, + "learning_rate": 3.783579773535471e-06, + "loss": 2.1396, + "step": 6505 + }, + { + "epoch": 0.3490343347639485, + "grad_norm": 1.532141089439392, + "learning_rate": 3.78320698093525e-06, + "loss": 2.3221, + "step": 6506 + }, + { + "epoch": 0.34908798283261805, + "grad_norm": 1.5496803522109985, + "learning_rate": 3.782834149591462e-06, + "loss": 2.2668, + "step": 6507 + }, + { + "epoch": 0.3491416309012876, + "grad_norm": 1.5894904136657715, + "learning_rate": 3.7824612795153637e-06, + "loss": 2.3867, + "step": 6508 + }, + { + "epoch": 0.34919527896995706, + "grad_norm": 1.3832825422286987, + "learning_rate": 3.7820883707182125e-06, + "loss": 2.2556, + "step": 6509 + }, + { + "epoch": 0.3492489270386266, + "grad_norm": 1.2357467412948608, + "learning_rate": 3.7817154232112685e-06, + "loss": 2.2771, + "step": 6510 + }, + { + "epoch": 0.3493025751072961, + "grad_norm": 1.801103949546814, + "learning_rate": 3.7813424370057905e-06, + "loss": 2.1852, + "step": 6511 + }, + { + "epoch": 0.34935622317596565, + "grad_norm": 1.547884225845337, + "learning_rate": 3.7809694121130424e-06, + "loss": 2.2618, + "step": 6512 + }, + { + "epoch": 0.3494098712446352, + "grad_norm": 1.5542031526565552, + "learning_rate": 3.780596348544284e-06, + "loss": 2.1811, + "step": 6513 + }, + { + "epoch": 0.3494635193133047, + "grad_norm": 1.2121013402938843, + "learning_rate": 3.780223246310781e-06, + "loss": 2.0923, + "step": 6514 + }, + { + "epoch": 0.34951716738197425, + "grad_norm": 1.4558266401290894, + "learning_rate": 3.779850105423798e-06, + "loss": 2.3698, + "step": 6515 + }, + { + "epoch": 0.3495708154506438, + "grad_norm": 1.5886198282241821, + "learning_rate": 3.779476925894601e-06, + "loss": 2.2415, + "step": 6516 + }, + { + "epoch": 0.3496244635193133, + "grad_norm": 1.2989619970321655, + "learning_rate": 3.779103707734458e-06, + "loss": 2.2875, + "step": 6517 + }, + { + "epoch": 0.34967811158798284, + "grad_norm": 2.1490824222564697, + "learning_rate": 3.7787304509546363e-06, + "loss": 2.3416, + "step": 6518 + }, + { + "epoch": 0.3497317596566524, + "grad_norm": 1.3594331741333008, + "learning_rate": 3.7783571555664057e-06, + "loss": 2.2823, + "step": 6519 + }, + { + "epoch": 0.3497854077253219, + "grad_norm": 1.5159779787063599, + "learning_rate": 3.7779838215810372e-06, + "loss": 2.2803, + "step": 6520 + }, + { + "epoch": 0.34983905579399144, + "grad_norm": 1.4452751874923706, + "learning_rate": 3.777610449009804e-06, + "loss": 2.2157, + "step": 6521 + }, + { + "epoch": 0.34989270386266097, + "grad_norm": 1.5698561668395996, + "learning_rate": 3.7772370378639783e-06, + "loss": 2.5746, + "step": 6522 + }, + { + "epoch": 0.34994635193133045, + "grad_norm": 1.5496207475662231, + "learning_rate": 3.7768635881548345e-06, + "loss": 2.1182, + "step": 6523 + }, + { + "epoch": 0.35, + "grad_norm": 1.606806755065918, + "learning_rate": 3.7764900998936472e-06, + "loss": 2.2539, + "step": 6524 + }, + { + "epoch": 0.3500536480686695, + "grad_norm": 1.6851266622543335, + "learning_rate": 3.7761165730916943e-06, + "loss": 2.3249, + "step": 6525 + }, + { + "epoch": 0.35010729613733904, + "grad_norm": 1.4876527786254883, + "learning_rate": 3.775743007760253e-06, + "loss": 2.446, + "step": 6526 + }, + { + "epoch": 0.3501609442060086, + "grad_norm": 1.332470178604126, + "learning_rate": 3.7753694039106027e-06, + "loss": 2.3228, + "step": 6527 + }, + { + "epoch": 0.3502145922746781, + "grad_norm": 12.597851753234863, + "learning_rate": 3.7749957615540223e-06, + "loss": 2.309, + "step": 6528 + }, + { + "epoch": 0.35026824034334764, + "grad_norm": 1.5044827461242676, + "learning_rate": 3.774622080701795e-06, + "loss": 2.1586, + "step": 6529 + }, + { + "epoch": 0.35032188841201717, + "grad_norm": 1.5167250633239746, + "learning_rate": 3.7742483613652014e-06, + "loss": 2.3533, + "step": 6530 + }, + { + "epoch": 0.3503755364806867, + "grad_norm": 1.3608652353286743, + "learning_rate": 3.7738746035555273e-06, + "loss": 2.1015, + "step": 6531 + }, + { + "epoch": 0.35042918454935623, + "grad_norm": 1.3641568422317505, + "learning_rate": 3.773500807284055e-06, + "loss": 2.162, + "step": 6532 + }, + { + "epoch": 0.35048283261802576, + "grad_norm": 1.808520793914795, + "learning_rate": 3.7731269725620724e-06, + "loss": 2.305, + "step": 6533 + }, + { + "epoch": 0.3505364806866953, + "grad_norm": 1.4053006172180176, + "learning_rate": 3.772753099400866e-06, + "loss": 2.2012, + "step": 6534 + }, + { + "epoch": 0.3505901287553648, + "grad_norm": 1.5163025856018066, + "learning_rate": 3.772379187811723e-06, + "loss": 2.4868, + "step": 6535 + }, + { + "epoch": 0.35064377682403436, + "grad_norm": 1.625240445137024, + "learning_rate": 3.772005237805934e-06, + "loss": 2.3189, + "step": 6536 + }, + { + "epoch": 0.35069742489270384, + "grad_norm": 1.4170719385147095, + "learning_rate": 3.771631249394789e-06, + "loss": 2.1219, + "step": 6537 + }, + { + "epoch": 0.35075107296137337, + "grad_norm": 1.3178287744522095, + "learning_rate": 3.771257222589581e-06, + "loss": 2.1938, + "step": 6538 + }, + { + "epoch": 0.3508047210300429, + "grad_norm": 1.4973037242889404, + "learning_rate": 3.770883157401602e-06, + "loss": 2.26, + "step": 6539 + }, + { + "epoch": 0.35085836909871243, + "grad_norm": 1.3881757259368896, + "learning_rate": 3.770509053842145e-06, + "loss": 2.2395, + "step": 6540 + }, + { + "epoch": 0.35091201716738196, + "grad_norm": 1.459144115447998, + "learning_rate": 3.770134911922507e-06, + "loss": 2.2435, + "step": 6541 + }, + { + "epoch": 0.3509656652360515, + "grad_norm": 1.6259008646011353, + "learning_rate": 3.769760731653983e-06, + "loss": 2.1279, + "step": 6542 + }, + { + "epoch": 0.351019313304721, + "grad_norm": 1.2764596939086914, + "learning_rate": 3.7693865130478726e-06, + "loss": 2.4613, + "step": 6543 + }, + { + "epoch": 0.35107296137339056, + "grad_norm": 1.365822196006775, + "learning_rate": 3.769012256115471e-06, + "loss": 2.3471, + "step": 6544 + }, + { + "epoch": 0.3511266094420601, + "grad_norm": 1.5906635522842407, + "learning_rate": 3.768637960868081e-06, + "loss": 2.461, + "step": 6545 + }, + { + "epoch": 0.3511802575107296, + "grad_norm": 1.3141703605651855, + "learning_rate": 3.7682636273170036e-06, + "loss": 2.2131, + "step": 6546 + }, + { + "epoch": 0.35123390557939915, + "grad_norm": 1.1806355714797974, + "learning_rate": 3.7678892554735393e-06, + "loss": 2.1516, + "step": 6547 + }, + { + "epoch": 0.3512875536480687, + "grad_norm": 1.504241943359375, + "learning_rate": 3.767514845348992e-06, + "loss": 2.2113, + "step": 6548 + }, + { + "epoch": 0.3513412017167382, + "grad_norm": 1.3619860410690308, + "learning_rate": 3.767140396954666e-06, + "loss": 2.1074, + "step": 6549 + }, + { + "epoch": 0.35139484978540775, + "grad_norm": 1.3303145170211792, + "learning_rate": 3.7667659103018684e-06, + "loss": 2.357, + "step": 6550 + }, + { + "epoch": 0.3514484978540773, + "grad_norm": 1.4275656938552856, + "learning_rate": 3.766391385401904e-06, + "loss": 1.3928, + "step": 6551 + }, + { + "epoch": 0.35150214592274676, + "grad_norm": 1.4495702981948853, + "learning_rate": 3.766016822266083e-06, + "loss": 2.2864, + "step": 6552 + }, + { + "epoch": 0.3515557939914163, + "grad_norm": 1.4562325477600098, + "learning_rate": 3.7656422209057114e-06, + "loss": 2.4558, + "step": 6553 + }, + { + "epoch": 0.3516094420600858, + "grad_norm": 1.4686837196350098, + "learning_rate": 3.7652675813321015e-06, + "loss": 2.3544, + "step": 6554 + }, + { + "epoch": 0.35166309012875535, + "grad_norm": 1.379331111907959, + "learning_rate": 3.764892903556565e-06, + "loss": 2.2322, + "step": 6555 + }, + { + "epoch": 0.3517167381974249, + "grad_norm": 1.4196382761001587, + "learning_rate": 3.7645181875904147e-06, + "loss": 2.4504, + "step": 6556 + }, + { + "epoch": 0.3517703862660944, + "grad_norm": 1.491058111190796, + "learning_rate": 3.764143433444962e-06, + "loss": 2.3122, + "step": 6557 + }, + { + "epoch": 0.35182403433476395, + "grad_norm": 1.7267078161239624, + "learning_rate": 3.763768641131524e-06, + "loss": 2.216, + "step": 6558 + }, + { + "epoch": 0.3518776824034335, + "grad_norm": 1.344507098197937, + "learning_rate": 3.763393810661415e-06, + "loss": 1.9567, + "step": 6559 + }, + { + "epoch": 0.351931330472103, + "grad_norm": 1.4446678161621094, + "learning_rate": 3.7630189420459538e-06, + "loss": 2.2571, + "step": 6560 + }, + { + "epoch": 0.35198497854077254, + "grad_norm": 1.5421905517578125, + "learning_rate": 3.762644035296458e-06, + "loss": 2.1988, + "step": 6561 + }, + { + "epoch": 0.3520386266094421, + "grad_norm": 1.5585570335388184, + "learning_rate": 3.7622690904242477e-06, + "loss": 2.3004, + "step": 6562 + }, + { + "epoch": 0.3520922746781116, + "grad_norm": 1.6440478563308716, + "learning_rate": 3.7618941074406416e-06, + "loss": 2.5218, + "step": 6563 + }, + { + "epoch": 0.35214592274678114, + "grad_norm": 1.5944221019744873, + "learning_rate": 3.761519086356964e-06, + "loss": 2.2458, + "step": 6564 + }, + { + "epoch": 0.35219957081545067, + "grad_norm": 1.5017720460891724, + "learning_rate": 3.7611440271845355e-06, + "loss": 2.3227, + "step": 6565 + }, + { + "epoch": 0.35225321888412015, + "grad_norm": 1.4933990240097046, + "learning_rate": 3.760768929934682e-06, + "loss": 2.3836, + "step": 6566 + }, + { + "epoch": 0.3523068669527897, + "grad_norm": 1.4331042766571045, + "learning_rate": 3.760393794618728e-06, + "loss": 2.3265, + "step": 6567 + }, + { + "epoch": 0.3523605150214592, + "grad_norm": 1.2496833801269531, + "learning_rate": 3.760018621248e-06, + "loss": 2.1754, + "step": 6568 + }, + { + "epoch": 0.35241416309012874, + "grad_norm": 1.764426589012146, + "learning_rate": 3.7596434098338254e-06, + "loss": 2.095, + "step": 6569 + }, + { + "epoch": 0.35246781115879827, + "grad_norm": 1.4962037801742554, + "learning_rate": 3.7592681603875327e-06, + "loss": 2.1821, + "step": 6570 + }, + { + "epoch": 0.3525214592274678, + "grad_norm": 1.3773409128189087, + "learning_rate": 3.7588928729204518e-06, + "loss": 2.236, + "step": 6571 + }, + { + "epoch": 0.35257510729613734, + "grad_norm": 1.6125526428222656, + "learning_rate": 3.7585175474439138e-06, + "loss": 2.1853, + "step": 6572 + }, + { + "epoch": 0.35262875536480687, + "grad_norm": 1.4170609712600708, + "learning_rate": 3.758142183969251e-06, + "loss": 2.4158, + "step": 6573 + }, + { + "epoch": 0.3526824034334764, + "grad_norm": 2.5453779697418213, + "learning_rate": 3.757766782507797e-06, + "loss": 2.0136, + "step": 6574 + }, + { + "epoch": 0.35273605150214593, + "grad_norm": 2.0930137634277344, + "learning_rate": 3.7573913430708842e-06, + "loss": 2.3899, + "step": 6575 + }, + { + "epoch": 0.35278969957081546, + "grad_norm": 1.356724500656128, + "learning_rate": 3.7570158656698502e-06, + "loss": 2.1273, + "step": 6576 + }, + { + "epoch": 0.352843347639485, + "grad_norm": 1.380228042602539, + "learning_rate": 3.7566403503160307e-06, + "loss": 2.2915, + "step": 6577 + }, + { + "epoch": 0.3528969957081545, + "grad_norm": 1.5100390911102295, + "learning_rate": 3.7562647970207655e-06, + "loss": 2.3706, + "step": 6578 + }, + { + "epoch": 0.35295064377682406, + "grad_norm": 1.3702104091644287, + "learning_rate": 3.755889205795391e-06, + "loss": 2.2149, + "step": 6579 + }, + { + "epoch": 0.3530042918454936, + "grad_norm": 1.5654704570770264, + "learning_rate": 3.7555135766512485e-06, + "loss": 2.3817, + "step": 6580 + }, + { + "epoch": 0.35305793991416307, + "grad_norm": 1.3199925422668457, + "learning_rate": 3.7551379095996786e-06, + "loss": 2.2301, + "step": 6581 + }, + { + "epoch": 0.3531115879828326, + "grad_norm": 1.3605579137802124, + "learning_rate": 3.754762204652025e-06, + "loss": 2.33, + "step": 6582 + }, + { + "epoch": 0.35316523605150213, + "grad_norm": 1.3631550073623657, + "learning_rate": 3.754386461819631e-06, + "loss": 2.3052, + "step": 6583 + }, + { + "epoch": 0.35321888412017166, + "grad_norm": 2.251589059829712, + "learning_rate": 3.75401068111384e-06, + "loss": 2.3813, + "step": 6584 + }, + { + "epoch": 0.3532725321888412, + "grad_norm": 1.4251147508621216, + "learning_rate": 3.753634862545999e-06, + "loss": 2.1525, + "step": 6585 + }, + { + "epoch": 0.3533261802575107, + "grad_norm": 1.395853042602539, + "learning_rate": 3.753259006127454e-06, + "loss": 2.2129, + "step": 6586 + }, + { + "epoch": 0.35337982832618026, + "grad_norm": 1.4284335374832153, + "learning_rate": 3.752883111869555e-06, + "loss": 2.2612, + "step": 6587 + }, + { + "epoch": 0.3534334763948498, + "grad_norm": 1.450382947921753, + "learning_rate": 3.75250717978365e-06, + "loss": 2.3187, + "step": 6588 + }, + { + "epoch": 0.3534871244635193, + "grad_norm": 1.8490978479385376, + "learning_rate": 3.7521312098810892e-06, + "loss": 2.4455, + "step": 6589 + }, + { + "epoch": 0.35354077253218885, + "grad_norm": 1.4225797653198242, + "learning_rate": 3.751755202173225e-06, + "loss": 2.376, + "step": 6590 + }, + { + "epoch": 0.3535944206008584, + "grad_norm": 1.6729384660720825, + "learning_rate": 3.7513791566714095e-06, + "loss": 2.2732, + "step": 6591 + }, + { + "epoch": 0.3536480686695279, + "grad_norm": 1.4528577327728271, + "learning_rate": 3.751003073386997e-06, + "loss": 2.1542, + "step": 6592 + }, + { + "epoch": 0.35370171673819745, + "grad_norm": 1.513875961303711, + "learning_rate": 3.7506269523313416e-06, + "loss": 2.153, + "step": 6593 + }, + { + "epoch": 0.353755364806867, + "grad_norm": 1.139407753944397, + "learning_rate": 3.7502507935158007e-06, + "loss": 1.7784, + "step": 6594 + }, + { + "epoch": 0.35380901287553645, + "grad_norm": 1.2833248376846313, + "learning_rate": 3.74987459695173e-06, + "loss": 2.2604, + "step": 6595 + }, + { + "epoch": 0.353862660944206, + "grad_norm": 1.3791223764419556, + "learning_rate": 3.7494983626504904e-06, + "loss": 2.4118, + "step": 6596 + }, + { + "epoch": 0.3539163090128755, + "grad_norm": 1.3431297540664673, + "learning_rate": 3.749122090623439e-06, + "loss": 2.1886, + "step": 6597 + }, + { + "epoch": 0.35396995708154505, + "grad_norm": 1.2889246940612793, + "learning_rate": 3.7487457808819373e-06, + "loss": 2.0641, + "step": 6598 + }, + { + "epoch": 0.3540236051502146, + "grad_norm": 1.401997685432434, + "learning_rate": 3.7483694334373468e-06, + "loss": 2.1035, + "step": 6599 + }, + { + "epoch": 0.3540772532188841, + "grad_norm": 1.291137933731079, + "learning_rate": 3.747993048301032e-06, + "loss": 2.0514, + "step": 6600 + }, + { + "epoch": 0.35413090128755365, + "grad_norm": 1.6282728910446167, + "learning_rate": 3.7476166254843554e-06, + "loss": 2.3886, + "step": 6601 + }, + { + "epoch": 0.3541845493562232, + "grad_norm": 1.4343492984771729, + "learning_rate": 3.7472401649986827e-06, + "loss": 2.3306, + "step": 6602 + }, + { + "epoch": 0.3542381974248927, + "grad_norm": 1.341374158859253, + "learning_rate": 3.7468636668553802e-06, + "loss": 2.5083, + "step": 6603 + }, + { + "epoch": 0.35429184549356224, + "grad_norm": 1.6882625818252563, + "learning_rate": 3.7464871310658158e-06, + "loss": 2.5983, + "step": 6604 + }, + { + "epoch": 0.3543454935622318, + "grad_norm": 1.4449682235717773, + "learning_rate": 3.746110557641358e-06, + "loss": 2.379, + "step": 6605 + }, + { + "epoch": 0.3543991416309013, + "grad_norm": 1.6613953113555908, + "learning_rate": 3.7457339465933763e-06, + "loss": 2.3636, + "step": 6606 + }, + { + "epoch": 0.35445278969957084, + "grad_norm": 1.4868648052215576, + "learning_rate": 3.745357297933242e-06, + "loss": 2.4766, + "step": 6607 + }, + { + "epoch": 0.35450643776824037, + "grad_norm": 1.143654227256775, + "learning_rate": 3.744980611672327e-06, + "loss": 2.0181, + "step": 6608 + }, + { + "epoch": 0.35456008583690984, + "grad_norm": 1.4482556581497192, + "learning_rate": 3.7446038878220047e-06, + "loss": 2.4058, + "step": 6609 + }, + { + "epoch": 0.3546137339055794, + "grad_norm": 1.4268049001693726, + "learning_rate": 3.7442271263936484e-06, + "loss": 2.3481, + "step": 6610 + }, + { + "epoch": 0.3546673819742489, + "grad_norm": 1.5326968431472778, + "learning_rate": 3.7438503273986355e-06, + "loss": 2.1853, + "step": 6611 + }, + { + "epoch": 0.35472103004291844, + "grad_norm": 1.333457350730896, + "learning_rate": 3.7434734908483403e-06, + "loss": 2.0731, + "step": 6612 + }, + { + "epoch": 0.35477467811158797, + "grad_norm": 1.5015615224838257, + "learning_rate": 3.743096616754143e-06, + "loss": 2.3451, + "step": 6613 + }, + { + "epoch": 0.3548283261802575, + "grad_norm": 1.4556254148483276, + "learning_rate": 3.7427197051274204e-06, + "loss": 2.5605, + "step": 6614 + }, + { + "epoch": 0.35488197424892703, + "grad_norm": 1.5108942985534668, + "learning_rate": 3.7423427559795537e-06, + "loss": 2.3066, + "step": 6615 + }, + { + "epoch": 0.35493562231759657, + "grad_norm": 1.4643949270248413, + "learning_rate": 3.7419657693219235e-06, + "loss": 2.2745, + "step": 6616 + }, + { + "epoch": 0.3549892703862661, + "grad_norm": 1.5058305263519287, + "learning_rate": 3.7415887451659126e-06, + "loss": 2.362, + "step": 6617 + }, + { + "epoch": 0.35504291845493563, + "grad_norm": 1.2907005548477173, + "learning_rate": 3.741211683522904e-06, + "loss": 2.1113, + "step": 6618 + }, + { + "epoch": 0.35509656652360516, + "grad_norm": 1.5062997341156006, + "learning_rate": 3.7408345844042824e-06, + "loss": 2.1385, + "step": 6619 + }, + { + "epoch": 0.3551502145922747, + "grad_norm": 1.4324803352355957, + "learning_rate": 3.740457447821433e-06, + "loss": 2.6555, + "step": 6620 + }, + { + "epoch": 0.3552038626609442, + "grad_norm": 1.3735454082489014, + "learning_rate": 3.7400802737857424e-06, + "loss": 2.4194, + "step": 6621 + }, + { + "epoch": 0.35525751072961376, + "grad_norm": 1.47373366355896, + "learning_rate": 3.7397030623086e-06, + "loss": 2.1549, + "step": 6622 + }, + { + "epoch": 0.3553111587982833, + "grad_norm": 1.4648815393447876, + "learning_rate": 3.739325813401393e-06, + "loss": 2.1372, + "step": 6623 + }, + { + "epoch": 0.35536480686695276, + "grad_norm": 1.5120002031326294, + "learning_rate": 3.7389485270755133e-06, + "loss": 2.3248, + "step": 6624 + }, + { + "epoch": 0.3554184549356223, + "grad_norm": 1.6829265356063843, + "learning_rate": 3.738571203342351e-06, + "loss": 2.3449, + "step": 6625 + }, + { + "epoch": 0.35547210300429183, + "grad_norm": 1.573442816734314, + "learning_rate": 3.738193842213299e-06, + "loss": 2.2695, + "step": 6626 + }, + { + "epoch": 0.35552575107296136, + "grad_norm": 2.455378770828247, + "learning_rate": 3.737816443699751e-06, + "loss": 2.2618, + "step": 6627 + }, + { + "epoch": 0.3555793991416309, + "grad_norm": 1.3237136602401733, + "learning_rate": 3.737439007813102e-06, + "loss": 2.3026, + "step": 6628 + }, + { + "epoch": 0.3556330472103004, + "grad_norm": 1.3945207595825195, + "learning_rate": 3.7370615345647467e-06, + "loss": 2.4051, + "step": 6629 + }, + { + "epoch": 0.35568669527896996, + "grad_norm": 2.220625877380371, + "learning_rate": 3.736684023966083e-06, + "loss": 2.4594, + "step": 6630 + }, + { + "epoch": 0.3557403433476395, + "grad_norm": 1.6476198434829712, + "learning_rate": 3.7363064760285085e-06, + "loss": 2.3087, + "step": 6631 + }, + { + "epoch": 0.355793991416309, + "grad_norm": 1.3521995544433594, + "learning_rate": 3.735928890763423e-06, + "loss": 2.2069, + "step": 6632 + }, + { + "epoch": 0.35584763948497855, + "grad_norm": 1.3622008562088013, + "learning_rate": 3.7355512681822257e-06, + "loss": 2.1454, + "step": 6633 + }, + { + "epoch": 0.3559012875536481, + "grad_norm": 1.295922040939331, + "learning_rate": 3.73517360829632e-06, + "loss": 2.3974, + "step": 6634 + }, + { + "epoch": 0.3559549356223176, + "grad_norm": 1.055612564086914, + "learning_rate": 3.734795911117106e-06, + "loss": 1.8825, + "step": 6635 + }, + { + "epoch": 0.35600858369098715, + "grad_norm": 1.5313750505447388, + "learning_rate": 3.7344181766559907e-06, + "loss": 2.3619, + "step": 6636 + }, + { + "epoch": 0.3560622317596567, + "grad_norm": 1.401395320892334, + "learning_rate": 3.734040404924375e-06, + "loss": 1.9803, + "step": 6637 + }, + { + "epoch": 0.35611587982832615, + "grad_norm": 1.5373891592025757, + "learning_rate": 3.733662595933668e-06, + "loss": 2.2507, + "step": 6638 + }, + { + "epoch": 0.3561695278969957, + "grad_norm": 1.458549976348877, + "learning_rate": 3.733284749695275e-06, + "loss": 2.2296, + "step": 6639 + }, + { + "epoch": 0.3562231759656652, + "grad_norm": 1.132603645324707, + "learning_rate": 3.7329068662206063e-06, + "loss": 1.9069, + "step": 6640 + }, + { + "epoch": 0.35627682403433475, + "grad_norm": 1.3780393600463867, + "learning_rate": 3.732528945521069e-06, + "loss": 1.7875, + "step": 6641 + }, + { + "epoch": 0.3563304721030043, + "grad_norm": 1.1471532583236694, + "learning_rate": 3.732150987608074e-06, + "loss": 2.0561, + "step": 6642 + }, + { + "epoch": 0.3563841201716738, + "grad_norm": 1.4746462106704712, + "learning_rate": 3.7317729924930336e-06, + "loss": 2.212, + "step": 6643 + }, + { + "epoch": 0.35643776824034334, + "grad_norm": 1.4513486623764038, + "learning_rate": 3.731394960187361e-06, + "loss": 2.3664, + "step": 6644 + }, + { + "epoch": 0.3564914163090129, + "grad_norm": 1.4443669319152832, + "learning_rate": 3.731016890702469e-06, + "loss": 1.8198, + "step": 6645 + }, + { + "epoch": 0.3565450643776824, + "grad_norm": 1.33649480342865, + "learning_rate": 3.7306387840497725e-06, + "loss": 1.9347, + "step": 6646 + }, + { + "epoch": 0.35659871244635194, + "grad_norm": 1.5294508934020996, + "learning_rate": 3.7302606402406883e-06, + "loss": 2.1931, + "step": 6647 + }, + { + "epoch": 0.35665236051502147, + "grad_norm": 1.4922329187393188, + "learning_rate": 3.729882459286632e-06, + "loss": 2.319, + "step": 6648 + }, + { + "epoch": 0.356706008583691, + "grad_norm": 1.42587149143219, + "learning_rate": 3.7295042411990244e-06, + "loss": 2.2049, + "step": 6649 + }, + { + "epoch": 0.35675965665236054, + "grad_norm": 1.2980842590332031, + "learning_rate": 3.729125985989284e-06, + "loss": 2.2009, + "step": 6650 + }, + { + "epoch": 0.35681330472103007, + "grad_norm": 1.4580068588256836, + "learning_rate": 3.7287476936688304e-06, + "loss": 2.3917, + "step": 6651 + }, + { + "epoch": 0.3568669527896996, + "grad_norm": 1.6395008563995361, + "learning_rate": 3.7283693642490857e-06, + "loss": 2.3794, + "step": 6652 + }, + { + "epoch": 0.3569206008583691, + "grad_norm": 1.542049527168274, + "learning_rate": 3.727990997741475e-06, + "loss": 2.0676, + "step": 6653 + }, + { + "epoch": 0.3569742489270386, + "grad_norm": 1.4703807830810547, + "learning_rate": 3.7276125941574175e-06, + "loss": 2.3181, + "step": 6654 + }, + { + "epoch": 0.35702789699570814, + "grad_norm": 1.1955410242080688, + "learning_rate": 3.7272341535083424e-06, + "loss": 1.9606, + "step": 6655 + }, + { + "epoch": 0.35708154506437767, + "grad_norm": 1.4196454286575317, + "learning_rate": 3.7268556758056745e-06, + "loss": 2.2369, + "step": 6656 + }, + { + "epoch": 0.3571351931330472, + "grad_norm": 2.3844175338745117, + "learning_rate": 3.726477161060841e-06, + "loss": 2.3356, + "step": 6657 + }, + { + "epoch": 0.35718884120171673, + "grad_norm": 1.536426305770874, + "learning_rate": 3.7260986092852702e-06, + "loss": 2.2285, + "step": 6658 + }, + { + "epoch": 0.35724248927038627, + "grad_norm": 1.4843966960906982, + "learning_rate": 3.7257200204903922e-06, + "loss": 2.1687, + "step": 6659 + }, + { + "epoch": 0.3572961373390558, + "grad_norm": 1.3564784526824951, + "learning_rate": 3.725341394687636e-06, + "loss": 2.2515, + "step": 6660 + }, + { + "epoch": 0.35734978540772533, + "grad_norm": 1.223218560218811, + "learning_rate": 3.724962731888435e-06, + "loss": 2.1149, + "step": 6661 + }, + { + "epoch": 0.35740343347639486, + "grad_norm": 1.4613856077194214, + "learning_rate": 3.7245840321042227e-06, + "loss": 2.3158, + "step": 6662 + }, + { + "epoch": 0.3574570815450644, + "grad_norm": 1.3984965085983276, + "learning_rate": 3.7242052953464315e-06, + "loss": 2.2171, + "step": 6663 + }, + { + "epoch": 0.3575107296137339, + "grad_norm": 1.6760406494140625, + "learning_rate": 3.7238265216264967e-06, + "loss": 2.623, + "step": 6664 + }, + { + "epoch": 0.35756437768240346, + "grad_norm": 1.4236608743667603, + "learning_rate": 3.7234477109558554e-06, + "loss": 2.2819, + "step": 6665 + }, + { + "epoch": 0.357618025751073, + "grad_norm": 1.493051290512085, + "learning_rate": 3.7230688633459433e-06, + "loss": 2.2927, + "step": 6666 + }, + { + "epoch": 0.35767167381974246, + "grad_norm": 2.045302629470825, + "learning_rate": 3.722689978808202e-06, + "loss": 1.9657, + "step": 6667 + }, + { + "epoch": 0.357725321888412, + "grad_norm": 1.2258179187774658, + "learning_rate": 3.722311057354067e-06, + "loss": 2.2727, + "step": 6668 + }, + { + "epoch": 0.3577789699570815, + "grad_norm": 1.5414364337921143, + "learning_rate": 3.721932098994982e-06, + "loss": 2.1241, + "step": 6669 + }, + { + "epoch": 0.35783261802575106, + "grad_norm": 1.3087472915649414, + "learning_rate": 3.721553103742388e-06, + "loss": 2.3775, + "step": 6670 + }, + { + "epoch": 0.3578862660944206, + "grad_norm": 1.2137852907180786, + "learning_rate": 3.721174071607727e-06, + "loss": 2.303, + "step": 6671 + }, + { + "epoch": 0.3579399141630901, + "grad_norm": 1.7760053873062134, + "learning_rate": 3.720795002602444e-06, + "loss": 2.0448, + "step": 6672 + }, + { + "epoch": 0.35799356223175965, + "grad_norm": 1.3637800216674805, + "learning_rate": 3.7204158967379843e-06, + "loss": 2.3091, + "step": 6673 + }, + { + "epoch": 0.3580472103004292, + "grad_norm": 1.4893972873687744, + "learning_rate": 3.7200367540257944e-06, + "loss": 2.1514, + "step": 6674 + }, + { + "epoch": 0.3581008583690987, + "grad_norm": 1.5987217426300049, + "learning_rate": 3.7196575744773206e-06, + "loss": 2.5454, + "step": 6675 + }, + { + "epoch": 0.35815450643776825, + "grad_norm": 1.4753153324127197, + "learning_rate": 3.7192783581040107e-06, + "loss": 2.545, + "step": 6676 + }, + { + "epoch": 0.3582081545064378, + "grad_norm": 4.044870376586914, + "learning_rate": 3.7188991049173166e-06, + "loss": 2.2521, + "step": 6677 + }, + { + "epoch": 0.3582618025751073, + "grad_norm": 1.3998456001281738, + "learning_rate": 3.718519814928687e-06, + "loss": 2.2686, + "step": 6678 + }, + { + "epoch": 0.35831545064377684, + "grad_norm": 1.1236392259597778, + "learning_rate": 3.7181404881495754e-06, + "loss": 2.1889, + "step": 6679 + }, + { + "epoch": 0.3583690987124464, + "grad_norm": 1.8922197818756104, + "learning_rate": 3.717761124591434e-06, + "loss": 2.0636, + "step": 6680 + }, + { + "epoch": 0.35842274678111585, + "grad_norm": 2.179311752319336, + "learning_rate": 3.7173817242657163e-06, + "loss": 2.2419, + "step": 6681 + }, + { + "epoch": 0.3584763948497854, + "grad_norm": 1.638905644416809, + "learning_rate": 3.7170022871838774e-06, + "loss": 1.9922, + "step": 6682 + }, + { + "epoch": 0.3585300429184549, + "grad_norm": 1.7547186613082886, + "learning_rate": 3.716622813357374e-06, + "loss": 2.2132, + "step": 6683 + }, + { + "epoch": 0.35858369098712445, + "grad_norm": 1.6068952083587646, + "learning_rate": 3.716243302797664e-06, + "loss": 2.0259, + "step": 6684 + }, + { + "epoch": 0.358637339055794, + "grad_norm": 1.4059895277023315, + "learning_rate": 3.715863755516206e-06, + "loss": 2.2773, + "step": 6685 + }, + { + "epoch": 0.3586909871244635, + "grad_norm": 1.4693502187728882, + "learning_rate": 3.715484171524458e-06, + "loss": 2.1565, + "step": 6686 + }, + { + "epoch": 0.35874463519313304, + "grad_norm": 1.6236411333084106, + "learning_rate": 3.715104550833881e-06, + "loss": 2.3183, + "step": 6687 + }, + { + "epoch": 0.3587982832618026, + "grad_norm": 1.5535759925842285, + "learning_rate": 3.714724893455938e-06, + "loss": 2.1789, + "step": 6688 + }, + { + "epoch": 0.3588519313304721, + "grad_norm": 1.7081072330474854, + "learning_rate": 3.714345199402092e-06, + "loss": 2.3719, + "step": 6689 + }, + { + "epoch": 0.35890557939914164, + "grad_norm": 1.389893651008606, + "learning_rate": 3.713965468683805e-06, + "loss": 2.1556, + "step": 6690 + }, + { + "epoch": 0.35895922746781117, + "grad_norm": 1.53445303440094, + "learning_rate": 3.713585701312544e-06, + "loss": 2.234, + "step": 6691 + }, + { + "epoch": 0.3590128755364807, + "grad_norm": 1.0967592000961304, + "learning_rate": 3.7132058972997755e-06, + "loss": 1.8402, + "step": 6692 + }, + { + "epoch": 0.35906652360515023, + "grad_norm": 1.2785816192626953, + "learning_rate": 3.712826056656965e-06, + "loss": 2.3217, + "step": 6693 + }, + { + "epoch": 0.35912017167381977, + "grad_norm": 1.1523802280426025, + "learning_rate": 3.7124461793955823e-06, + "loss": 2.1174, + "step": 6694 + }, + { + "epoch": 0.3591738197424893, + "grad_norm": 1.1022987365722656, + "learning_rate": 3.7120662655270962e-06, + "loss": 2.1636, + "step": 6695 + }, + { + "epoch": 0.3592274678111588, + "grad_norm": 1.4156585931777954, + "learning_rate": 3.7116863150629777e-06, + "loss": 2.2559, + "step": 6696 + }, + { + "epoch": 0.3592811158798283, + "grad_norm": 1.524519443511963, + "learning_rate": 3.7113063280147e-06, + "loss": 2.3959, + "step": 6697 + }, + { + "epoch": 0.35933476394849784, + "grad_norm": 1.2044224739074707, + "learning_rate": 3.710926304393733e-06, + "loss": 2.03, + "step": 6698 + }, + { + "epoch": 0.35938841201716737, + "grad_norm": 1.5784095525741577, + "learning_rate": 3.7105462442115523e-06, + "loss": 2.2924, + "step": 6699 + }, + { + "epoch": 0.3594420600858369, + "grad_norm": 19.89160919189453, + "learning_rate": 3.7101661474796334e-06, + "loss": 2.0855, + "step": 6700 + }, + { + "epoch": 0.35949570815450643, + "grad_norm": 1.4523333311080933, + "learning_rate": 3.709786014209452e-06, + "loss": 2.3909, + "step": 6701 + }, + { + "epoch": 0.35954935622317596, + "grad_norm": 1.5998201370239258, + "learning_rate": 3.709405844412486e-06, + "loss": 2.2069, + "step": 6702 + }, + { + "epoch": 0.3596030042918455, + "grad_norm": 1.2623927593231201, + "learning_rate": 3.709025638100212e-06, + "loss": 2.0329, + "step": 6703 + }, + { + "epoch": 0.35965665236051503, + "grad_norm": 1.6840022802352905, + "learning_rate": 3.7086453952841105e-06, + "loss": 2.4661, + "step": 6704 + }, + { + "epoch": 0.35971030042918456, + "grad_norm": 1.374257206916809, + "learning_rate": 3.708265115975662e-06, + "loss": 2.1712, + "step": 6705 + }, + { + "epoch": 0.3597639484978541, + "grad_norm": 1.0839450359344482, + "learning_rate": 3.70788480018635e-06, + "loss": 1.9553, + "step": 6706 + }, + { + "epoch": 0.3598175965665236, + "grad_norm": 1.4740309715270996, + "learning_rate": 3.7075044479276546e-06, + "loss": 2.3199, + "step": 6707 + }, + { + "epoch": 0.35987124463519315, + "grad_norm": 1.897629737854004, + "learning_rate": 3.70712405921106e-06, + "loss": 2.1191, + "step": 6708 + }, + { + "epoch": 0.3599248927038627, + "grad_norm": 1.3602482080459595, + "learning_rate": 3.7067436340480527e-06, + "loss": 1.9532, + "step": 6709 + }, + { + "epoch": 0.35997854077253216, + "grad_norm": 1.3458384275436401, + "learning_rate": 3.706363172450118e-06, + "loss": 1.9426, + "step": 6710 + }, + { + "epoch": 0.3600321888412017, + "grad_norm": 1.4998124837875366, + "learning_rate": 3.705982674428743e-06, + "loss": 2.3541, + "step": 6711 + }, + { + "epoch": 0.3600858369098712, + "grad_norm": 1.7312755584716797, + "learning_rate": 3.705602139995416e-06, + "loss": 2.0907, + "step": 6712 + }, + { + "epoch": 0.36013948497854076, + "grad_norm": 1.5956013202667236, + "learning_rate": 3.7052215691616265e-06, + "loss": 2.5547, + "step": 6713 + }, + { + "epoch": 0.3601931330472103, + "grad_norm": 1.5591791868209839, + "learning_rate": 3.704840961938865e-06, + "loss": 2.4149, + "step": 6714 + }, + { + "epoch": 0.3602467811158798, + "grad_norm": 1.7081432342529297, + "learning_rate": 3.7044603183386236e-06, + "loss": 2.3418, + "step": 6715 + }, + { + "epoch": 0.36030042918454935, + "grad_norm": 1.433266282081604, + "learning_rate": 3.7040796383723932e-06, + "loss": 2.3768, + "step": 6716 + }, + { + "epoch": 0.3603540772532189, + "grad_norm": 1.3361419439315796, + "learning_rate": 3.7036989220516693e-06, + "loss": 2.1583, + "step": 6717 + }, + { + "epoch": 0.3604077253218884, + "grad_norm": 1.5159611701965332, + "learning_rate": 3.7033181693879465e-06, + "loss": 2.2247, + "step": 6718 + }, + { + "epoch": 0.36046137339055795, + "grad_norm": 3.2116525173187256, + "learning_rate": 3.702937380392721e-06, + "loss": 2.4461, + "step": 6719 + }, + { + "epoch": 0.3605150214592275, + "grad_norm": 1.4045124053955078, + "learning_rate": 3.702556555077489e-06, + "loss": 2.1831, + "step": 6720 + }, + { + "epoch": 0.360568669527897, + "grad_norm": 1.4803547859191895, + "learning_rate": 3.702175693453749e-06, + "loss": 2.358, + "step": 6721 + }, + { + "epoch": 0.36062231759656654, + "grad_norm": 1.4186421632766724, + "learning_rate": 3.701794795533e-06, + "loss": 2.2797, + "step": 6722 + }, + { + "epoch": 0.3606759656652361, + "grad_norm": 1.045519471168518, + "learning_rate": 3.7014138613267426e-06, + "loss": 2.3163, + "step": 6723 + }, + { + "epoch": 0.36072961373390555, + "grad_norm": 1.3641313314437866, + "learning_rate": 3.7010328908464797e-06, + "loss": 2.3711, + "step": 6724 + }, + { + "epoch": 0.3607832618025751, + "grad_norm": 1.5399200916290283, + "learning_rate": 3.7006518841037115e-06, + "loss": 2.4711, + "step": 6725 + }, + { + "epoch": 0.3608369098712446, + "grad_norm": 1.8634132146835327, + "learning_rate": 3.700270841109943e-06, + "loss": 2.1998, + "step": 6726 + }, + { + "epoch": 0.36089055793991415, + "grad_norm": 1.5799874067306519, + "learning_rate": 3.699889761876678e-06, + "loss": 2.3976, + "step": 6727 + }, + { + "epoch": 0.3609442060085837, + "grad_norm": 1.3075252771377563, + "learning_rate": 3.699508646415424e-06, + "loss": 2.1319, + "step": 6728 + }, + { + "epoch": 0.3609978540772532, + "grad_norm": 1.3944813013076782, + "learning_rate": 3.699127494737686e-06, + "loss": 2.3668, + "step": 6729 + }, + { + "epoch": 0.36105150214592274, + "grad_norm": 1.500064492225647, + "learning_rate": 3.6987463068549733e-06, + "loss": 2.377, + "step": 6730 + }, + { + "epoch": 0.3611051502145923, + "grad_norm": 1.4405593872070312, + "learning_rate": 3.698365082778794e-06, + "loss": 2.2957, + "step": 6731 + }, + { + "epoch": 0.3611587982832618, + "grad_norm": 1.5679404735565186, + "learning_rate": 3.69798382252066e-06, + "loss": 2.2549, + "step": 6732 + }, + { + "epoch": 0.36121244635193134, + "grad_norm": 1.450005054473877, + "learning_rate": 3.697602526092081e-06, + "loss": 2.343, + "step": 6733 + }, + { + "epoch": 0.36126609442060087, + "grad_norm": 1.1235257387161255, + "learning_rate": 3.69722119350457e-06, + "loss": 1.8235, + "step": 6734 + }, + { + "epoch": 0.3613197424892704, + "grad_norm": 1.513170599937439, + "learning_rate": 3.6968398247696403e-06, + "loss": 2.095, + "step": 6735 + }, + { + "epoch": 0.36137339055793993, + "grad_norm": 1.6262638568878174, + "learning_rate": 3.6964584198988063e-06, + "loss": 2.6005, + "step": 6736 + }, + { + "epoch": 0.36142703862660946, + "grad_norm": 1.578515887260437, + "learning_rate": 3.696076978903585e-06, + "loss": 2.5138, + "step": 6737 + }, + { + "epoch": 0.361480686695279, + "grad_norm": 1.1626454591751099, + "learning_rate": 3.695695501795491e-06, + "loss": 2.2324, + "step": 6738 + }, + { + "epoch": 0.3615343347639485, + "grad_norm": 1.5618401765823364, + "learning_rate": 3.6953139885860434e-06, + "loss": 2.0941, + "step": 6739 + }, + { + "epoch": 0.361587982832618, + "grad_norm": 1.2795178890228271, + "learning_rate": 3.6949324392867613e-06, + "loss": 1.9593, + "step": 6740 + }, + { + "epoch": 0.36164163090128754, + "grad_norm": 1.4061262607574463, + "learning_rate": 3.6945508539091646e-06, + "loss": 2.3797, + "step": 6741 + }, + { + "epoch": 0.36169527896995707, + "grad_norm": 1.3596842288970947, + "learning_rate": 3.6941692324647745e-06, + "loss": 2.0739, + "step": 6742 + }, + { + "epoch": 0.3617489270386266, + "grad_norm": 1.2067079544067383, + "learning_rate": 3.6937875749651124e-06, + "loss": 1.7993, + "step": 6743 + }, + { + "epoch": 0.36180257510729613, + "grad_norm": 1.7225133180618286, + "learning_rate": 3.693405881421702e-06, + "loss": 2.4491, + "step": 6744 + }, + { + "epoch": 0.36185622317596566, + "grad_norm": 1.5937293767929077, + "learning_rate": 3.693024151846068e-06, + "loss": 2.4386, + "step": 6745 + }, + { + "epoch": 0.3619098712446352, + "grad_norm": 2.0825037956237793, + "learning_rate": 3.692642386249736e-06, + "loss": 2.42, + "step": 6746 + }, + { + "epoch": 0.3619635193133047, + "grad_norm": 1.6783479452133179, + "learning_rate": 3.692260584644232e-06, + "loss": 2.3639, + "step": 6747 + }, + { + "epoch": 0.36201716738197426, + "grad_norm": 1.468515396118164, + "learning_rate": 3.6918787470410843e-06, + "loss": 2.6342, + "step": 6748 + }, + { + "epoch": 0.3620708154506438, + "grad_norm": 1.5987814664840698, + "learning_rate": 3.6914968734518206e-06, + "loss": 2.4742, + "step": 6749 + }, + { + "epoch": 0.3621244635193133, + "grad_norm": 1.378782868385315, + "learning_rate": 3.6911149638879716e-06, + "loss": 2.0713, + "step": 6750 + }, + { + "epoch": 0.36217811158798285, + "grad_norm": 1.4599446058273315, + "learning_rate": 3.6907330183610683e-06, + "loss": 2.2949, + "step": 6751 + }, + { + "epoch": 0.3622317596566524, + "grad_norm": 1.3555574417114258, + "learning_rate": 3.6903510368826423e-06, + "loss": 2.3088, + "step": 6752 + }, + { + "epoch": 0.36228540772532186, + "grad_norm": 1.4014731645584106, + "learning_rate": 3.6899690194642268e-06, + "loss": 1.531, + "step": 6753 + }, + { + "epoch": 0.3623390557939914, + "grad_norm": 1.4295086860656738, + "learning_rate": 3.6895869661173557e-06, + "loss": 2.4664, + "step": 6754 + }, + { + "epoch": 0.3623927038626609, + "grad_norm": 1.475127100944519, + "learning_rate": 3.6892048768535645e-06, + "loss": 1.5732, + "step": 6755 + }, + { + "epoch": 0.36244635193133046, + "grad_norm": 2.00888729095459, + "learning_rate": 3.688822751684389e-06, + "loss": 2.7214, + "step": 6756 + }, + { + "epoch": 0.3625, + "grad_norm": 1.4122873544692993, + "learning_rate": 3.688440590621368e-06, + "loss": 2.2632, + "step": 6757 + }, + { + "epoch": 0.3625536480686695, + "grad_norm": 1.7156052589416504, + "learning_rate": 3.6880583936760382e-06, + "loss": 2.3246, + "step": 6758 + }, + { + "epoch": 0.36260729613733905, + "grad_norm": 1.0643112659454346, + "learning_rate": 3.687676160859941e-06, + "loss": 2.0624, + "step": 6759 + }, + { + "epoch": 0.3626609442060086, + "grad_norm": 3.131361722946167, + "learning_rate": 3.6872938921846156e-06, + "loss": 2.2606, + "step": 6760 + }, + { + "epoch": 0.3627145922746781, + "grad_norm": 1.4648895263671875, + "learning_rate": 3.686911587661604e-06, + "loss": 2.0999, + "step": 6761 + }, + { + "epoch": 0.36276824034334765, + "grad_norm": 1.6864657402038574, + "learning_rate": 3.6865292473024493e-06, + "loss": 2.3604, + "step": 6762 + }, + { + "epoch": 0.3628218884120172, + "grad_norm": 1.2881242036819458, + "learning_rate": 3.686146871118696e-06, + "loss": 2.2524, + "step": 6763 + }, + { + "epoch": 0.3628755364806867, + "grad_norm": 1.6156197786331177, + "learning_rate": 3.6857644591218887e-06, + "loss": 2.5401, + "step": 6764 + }, + { + "epoch": 0.36292918454935624, + "grad_norm": 1.3883087635040283, + "learning_rate": 3.6853820113235733e-06, + "loss": 2.363, + "step": 6765 + }, + { + "epoch": 0.3629828326180258, + "grad_norm": 1.3813550472259521, + "learning_rate": 3.684999527735297e-06, + "loss": 2.1832, + "step": 6766 + }, + { + "epoch": 0.3630364806866953, + "grad_norm": 1.2527031898498535, + "learning_rate": 3.684617008368607e-06, + "loss": 1.6917, + "step": 6767 + }, + { + "epoch": 0.3630901287553648, + "grad_norm": 1.2466684579849243, + "learning_rate": 3.6842344532350544e-06, + "loss": 2.3387, + "step": 6768 + }, + { + "epoch": 0.3631437768240343, + "grad_norm": 1.424993634223938, + "learning_rate": 3.6838518623461884e-06, + "loss": 2.2903, + "step": 6769 + }, + { + "epoch": 0.36319742489270385, + "grad_norm": 1.4554150104522705, + "learning_rate": 3.6834692357135616e-06, + "loss": 2.0703, + "step": 6770 + }, + { + "epoch": 0.3632510729613734, + "grad_norm": 1.4774442911148071, + "learning_rate": 3.6830865733487254e-06, + "loss": 2.2747, + "step": 6771 + }, + { + "epoch": 0.3633047210300429, + "grad_norm": 1.557076334953308, + "learning_rate": 3.682703875263234e-06, + "loss": 2.2133, + "step": 6772 + }, + { + "epoch": 0.36335836909871244, + "grad_norm": 1.3116300106048584, + "learning_rate": 3.682321141468641e-06, + "loss": 2.2983, + "step": 6773 + }, + { + "epoch": 0.363412017167382, + "grad_norm": 1.4958542585372925, + "learning_rate": 3.6819383719765047e-06, + "loss": 2.3215, + "step": 6774 + }, + { + "epoch": 0.3634656652360515, + "grad_norm": 1.3648865222930908, + "learning_rate": 3.6815555667983794e-06, + "loss": 2.3078, + "step": 6775 + }, + { + "epoch": 0.36351931330472104, + "grad_norm": 1.227803349494934, + "learning_rate": 3.6811727259458246e-06, + "loss": 1.9665, + "step": 6776 + }, + { + "epoch": 0.36357296137339057, + "grad_norm": 1.435510277748108, + "learning_rate": 3.680789849430399e-06, + "loss": 2.402, + "step": 6777 + }, + { + "epoch": 0.3636266094420601, + "grad_norm": 1.4004079103469849, + "learning_rate": 3.680406937263662e-06, + "loss": 2.0934, + "step": 6778 + }, + { + "epoch": 0.36368025751072963, + "grad_norm": 1.4324290752410889, + "learning_rate": 3.6800239894571755e-06, + "loss": 2.1421, + "step": 6779 + }, + { + "epoch": 0.36373390557939916, + "grad_norm": 1.6849584579467773, + "learning_rate": 3.679641006022502e-06, + "loss": 2.1863, + "step": 6780 + }, + { + "epoch": 0.3637875536480687, + "grad_norm": 1.4318037033081055, + "learning_rate": 3.679257986971204e-06, + "loss": 2.246, + "step": 6781 + }, + { + "epoch": 0.36384120171673817, + "grad_norm": 1.4949102401733398, + "learning_rate": 3.6788749323148466e-06, + "loss": 2.1641, + "step": 6782 + }, + { + "epoch": 0.3638948497854077, + "grad_norm": 1.395939826965332, + "learning_rate": 3.6784918420649952e-06, + "loss": 1.9773, + "step": 6783 + }, + { + "epoch": 0.36394849785407724, + "grad_norm": 1.4222697019577026, + "learning_rate": 3.6781087162332147e-06, + "loss": 1.9691, + "step": 6784 + }, + { + "epoch": 0.36400214592274677, + "grad_norm": 1.402303695678711, + "learning_rate": 3.6777255548310754e-06, + "loss": 2.3065, + "step": 6785 + }, + { + "epoch": 0.3640557939914163, + "grad_norm": 1.2365678548812866, + "learning_rate": 3.6773423578701444e-06, + "loss": 2.3253, + "step": 6786 + }, + { + "epoch": 0.36410944206008583, + "grad_norm": 1.4924107789993286, + "learning_rate": 3.6769591253619918e-06, + "loss": 2.5181, + "step": 6787 + }, + { + "epoch": 0.36416309012875536, + "grad_norm": 1.6608140468597412, + "learning_rate": 3.676575857318189e-06, + "loss": 2.1145, + "step": 6788 + }, + { + "epoch": 0.3642167381974249, + "grad_norm": 1.5805091857910156, + "learning_rate": 3.676192553750307e-06, + "loss": 2.2098, + "step": 6789 + }, + { + "epoch": 0.3642703862660944, + "grad_norm": 1.438055157661438, + "learning_rate": 3.6758092146699186e-06, + "loss": 2.5328, + "step": 6790 + }, + { + "epoch": 0.36432403433476396, + "grad_norm": 1.2695201635360718, + "learning_rate": 3.6754258400885994e-06, + "loss": 2.3172, + "step": 6791 + }, + { + "epoch": 0.3643776824034335, + "grad_norm": 2.7563259601593018, + "learning_rate": 3.675042430017923e-06, + "loss": 2.4003, + "step": 6792 + }, + { + "epoch": 0.364431330472103, + "grad_norm": 1.5600240230560303, + "learning_rate": 3.6746589844694668e-06, + "loss": 2.5272, + "step": 6793 + }, + { + "epoch": 0.36448497854077255, + "grad_norm": 1.5785208940505981, + "learning_rate": 3.674275503454807e-06, + "loss": 2.312, + "step": 6794 + }, + { + "epoch": 0.3645386266094421, + "grad_norm": 1.3453141450881958, + "learning_rate": 3.673891986985523e-06, + "loss": 2.4105, + "step": 6795 + }, + { + "epoch": 0.36459227467811156, + "grad_norm": 1.3453423976898193, + "learning_rate": 3.6735084350731932e-06, + "loss": 2.2795, + "step": 6796 + }, + { + "epoch": 0.3646459227467811, + "grad_norm": 1.6896252632141113, + "learning_rate": 3.6731248477293983e-06, + "loss": 2.4838, + "step": 6797 + }, + { + "epoch": 0.3646995708154506, + "grad_norm": 1.3720932006835938, + "learning_rate": 3.6727412249657205e-06, + "loss": 2.448, + "step": 6798 + }, + { + "epoch": 0.36475321888412016, + "grad_norm": 1.177620768547058, + "learning_rate": 3.672357566793743e-06, + "loss": 2.1832, + "step": 6799 + }, + { + "epoch": 0.3648068669527897, + "grad_norm": 1.4273402690887451, + "learning_rate": 3.6719738732250466e-06, + "loss": 2.3127, + "step": 6800 + }, + { + "epoch": 0.3648605150214592, + "grad_norm": 1.8332197666168213, + "learning_rate": 3.6715901442712195e-06, + "loss": 1.9608, + "step": 6801 + }, + { + "epoch": 0.36491416309012875, + "grad_norm": 1.588000774383545, + "learning_rate": 3.671206379943845e-06, + "loss": 2.1193, + "step": 6802 + }, + { + "epoch": 0.3649678111587983, + "grad_norm": 1.4746553897857666, + "learning_rate": 3.670822580254512e-06, + "loss": 2.1209, + "step": 6803 + }, + { + "epoch": 0.3650214592274678, + "grad_norm": 1.5035122632980347, + "learning_rate": 3.670438745214808e-06, + "loss": 2.1923, + "step": 6804 + }, + { + "epoch": 0.36507510729613735, + "grad_norm": 3.075547695159912, + "learning_rate": 3.6700548748363207e-06, + "loss": 2.3129, + "step": 6805 + }, + { + "epoch": 0.3651287553648069, + "grad_norm": 1.5556223392486572, + "learning_rate": 3.6696709691306403e-06, + "loss": 2.272, + "step": 6806 + }, + { + "epoch": 0.3651824034334764, + "grad_norm": 1.6063143014907837, + "learning_rate": 3.6692870281093597e-06, + "loss": 2.3109, + "step": 6807 + }, + { + "epoch": 0.36523605150214594, + "grad_norm": 1.35938560962677, + "learning_rate": 3.6689030517840708e-06, + "loss": 2.225, + "step": 6808 + }, + { + "epoch": 0.3652896995708155, + "grad_norm": 1.3573801517486572, + "learning_rate": 3.6685190401663655e-06, + "loss": 2.2546, + "step": 6809 + }, + { + "epoch": 0.365343347639485, + "grad_norm": 1.3583049774169922, + "learning_rate": 3.6681349932678393e-06, + "loss": 2.2875, + "step": 6810 + }, + { + "epoch": 0.3653969957081545, + "grad_norm": 1.5916417837142944, + "learning_rate": 3.6677509111000877e-06, + "loss": 2.3707, + "step": 6811 + }, + { + "epoch": 0.365450643776824, + "grad_norm": 1.6052608489990234, + "learning_rate": 3.6673667936747057e-06, + "loss": 2.2314, + "step": 6812 + }, + { + "epoch": 0.36550429184549355, + "grad_norm": 1.4410148859024048, + "learning_rate": 3.666982641003293e-06, + "loss": 2.1898, + "step": 6813 + }, + { + "epoch": 0.3655579399141631, + "grad_norm": 1.4078929424285889, + "learning_rate": 3.6665984530974473e-06, + "loss": 2.2166, + "step": 6814 + }, + { + "epoch": 0.3656115879828326, + "grad_norm": 1.4683173894882202, + "learning_rate": 3.6662142299687677e-06, + "loss": 2.2877, + "step": 6815 + }, + { + "epoch": 0.36566523605150214, + "grad_norm": 2.2232651710510254, + "learning_rate": 3.665829971628856e-06, + "loss": 2.0726, + "step": 6816 + }, + { + "epoch": 0.3657188841201717, + "grad_norm": 1.5164231061935425, + "learning_rate": 3.665445678089313e-06, + "loss": 2.2809, + "step": 6817 + }, + { + "epoch": 0.3657725321888412, + "grad_norm": 1.5016347169876099, + "learning_rate": 3.665061349361742e-06, + "loss": 2.0188, + "step": 6818 + }, + { + "epoch": 0.36582618025751074, + "grad_norm": 1.5709737539291382, + "learning_rate": 3.6646769854577473e-06, + "loss": 2.3083, + "step": 6819 + }, + { + "epoch": 0.36587982832618027, + "grad_norm": 1.384810209274292, + "learning_rate": 3.664292586388933e-06, + "loss": 2.2406, + "step": 6820 + }, + { + "epoch": 0.3659334763948498, + "grad_norm": 1.4167635440826416, + "learning_rate": 3.663908152166907e-06, + "loss": 2.3789, + "step": 6821 + }, + { + "epoch": 0.36598712446351933, + "grad_norm": 1.4145112037658691, + "learning_rate": 3.6635236828032755e-06, + "loss": 2.1885, + "step": 6822 + }, + { + "epoch": 0.36604077253218886, + "grad_norm": 1.5206773281097412, + "learning_rate": 3.663139178309645e-06, + "loss": 2.342, + "step": 6823 + }, + { + "epoch": 0.3660944206008584, + "grad_norm": 1.1240845918655396, + "learning_rate": 3.6627546386976272e-06, + "loss": 1.9656, + "step": 6824 + }, + { + "epoch": 0.36614806866952787, + "grad_norm": 1.505140781402588, + "learning_rate": 3.662370063978831e-06, + "loss": 2.0789, + "step": 6825 + }, + { + "epoch": 0.3662017167381974, + "grad_norm": 1.6888195276260376, + "learning_rate": 3.6619854541648684e-06, + "loss": 2.4072, + "step": 6826 + }, + { + "epoch": 0.36625536480686693, + "grad_norm": 1.3788481950759888, + "learning_rate": 3.6616008092673516e-06, + "loss": 2.2954, + "step": 6827 + }, + { + "epoch": 0.36630901287553647, + "grad_norm": 1.3583757877349854, + "learning_rate": 3.6612161292978944e-06, + "loss": 2.1292, + "step": 6828 + }, + { + "epoch": 0.366362660944206, + "grad_norm": 1.6252539157867432, + "learning_rate": 3.66083141426811e-06, + "loss": 2.0617, + "step": 6829 + }, + { + "epoch": 0.36641630901287553, + "grad_norm": 1.5195623636245728, + "learning_rate": 3.6604466641896164e-06, + "loss": 2.2816, + "step": 6830 + }, + { + "epoch": 0.36646995708154506, + "grad_norm": 1.8866890668869019, + "learning_rate": 3.6600618790740284e-06, + "loss": 2.3094, + "step": 6831 + }, + { + "epoch": 0.3665236051502146, + "grad_norm": 1.5910669565200806, + "learning_rate": 3.659677058932964e-06, + "loss": 2.0151, + "step": 6832 + }, + { + "epoch": 0.3665772532188841, + "grad_norm": 1.3843241930007935, + "learning_rate": 3.6592922037780426e-06, + "loss": 1.4233, + "step": 6833 + }, + { + "epoch": 0.36663090128755366, + "grad_norm": 1.4499326944351196, + "learning_rate": 3.6589073136208836e-06, + "loss": 2.2121, + "step": 6834 + }, + { + "epoch": 0.3666845493562232, + "grad_norm": 1.4325186014175415, + "learning_rate": 3.6585223884731082e-06, + "loss": 2.2491, + "step": 6835 + }, + { + "epoch": 0.3667381974248927, + "grad_norm": 1.553390622138977, + "learning_rate": 3.658137428346338e-06, + "loss": 2.4493, + "step": 6836 + }, + { + "epoch": 0.36679184549356225, + "grad_norm": 1.4223438501358032, + "learning_rate": 3.6577524332521957e-06, + "loss": 2.4186, + "step": 6837 + }, + { + "epoch": 0.3668454935622318, + "grad_norm": 1.4701305627822876, + "learning_rate": 3.657367403202306e-06, + "loss": 2.3575, + "step": 6838 + }, + { + "epoch": 0.36689914163090126, + "grad_norm": 1.6330044269561768, + "learning_rate": 3.6569823382082943e-06, + "loss": 2.2911, + "step": 6839 + }, + { + "epoch": 0.3669527896995708, + "grad_norm": 1.2264593839645386, + "learning_rate": 3.656597238281786e-06, + "loss": 2.0298, + "step": 6840 + }, + { + "epoch": 0.3670064377682403, + "grad_norm": 1.4444235563278198, + "learning_rate": 3.656212103434409e-06, + "loss": 2.4068, + "step": 6841 + }, + { + "epoch": 0.36706008583690986, + "grad_norm": 1.504629135131836, + "learning_rate": 3.6558269336777907e-06, + "loss": 2.2973, + "step": 6842 + }, + { + "epoch": 0.3671137339055794, + "grad_norm": 1.4674381017684937, + "learning_rate": 3.655441729023562e-06, + "loss": 2.5995, + "step": 6843 + }, + { + "epoch": 0.3671673819742489, + "grad_norm": 1.6665573120117188, + "learning_rate": 3.6550564894833517e-06, + "loss": 2.2996, + "step": 6844 + }, + { + "epoch": 0.36722103004291845, + "grad_norm": 1.6044554710388184, + "learning_rate": 3.654671215068791e-06, + "loss": 2.4232, + "step": 6845 + }, + { + "epoch": 0.367274678111588, + "grad_norm": 1.3795610666275024, + "learning_rate": 3.6542859057915137e-06, + "loss": 2.2728, + "step": 6846 + }, + { + "epoch": 0.3673283261802575, + "grad_norm": 1.3931607007980347, + "learning_rate": 3.6539005616631536e-06, + "loss": 2.3354, + "step": 6847 + }, + { + "epoch": 0.36738197424892705, + "grad_norm": 1.693136215209961, + "learning_rate": 3.6535151826953442e-06, + "loss": 2.6807, + "step": 6848 + }, + { + "epoch": 0.3674356223175966, + "grad_norm": 1.4154067039489746, + "learning_rate": 3.6531297688997216e-06, + "loss": 2.4145, + "step": 6849 + }, + { + "epoch": 0.3674892703862661, + "grad_norm": 1.4483230113983154, + "learning_rate": 3.652744320287922e-06, + "loss": 2.2514, + "step": 6850 + }, + { + "epoch": 0.36754291845493564, + "grad_norm": 1.6498268842697144, + "learning_rate": 3.652358836871584e-06, + "loss": 2.3393, + "step": 6851 + }, + { + "epoch": 0.3675965665236052, + "grad_norm": 1.251517653465271, + "learning_rate": 3.6519733186623462e-06, + "loss": 2.4335, + "step": 6852 + }, + { + "epoch": 0.3676502145922747, + "grad_norm": 1.4911175966262817, + "learning_rate": 3.6515877656718478e-06, + "loss": 2.1564, + "step": 6853 + }, + { + "epoch": 0.3677038626609442, + "grad_norm": 1.343172311782837, + "learning_rate": 3.6512021779117306e-06, + "loss": 2.0746, + "step": 6854 + }, + { + "epoch": 0.3677575107296137, + "grad_norm": 1.2593491077423096, + "learning_rate": 3.650816555393636e-06, + "loss": 2.4783, + "step": 6855 + }, + { + "epoch": 0.36781115879828324, + "grad_norm": 1.2271156311035156, + "learning_rate": 3.650430898129207e-06, + "loss": 1.8904, + "step": 6856 + }, + { + "epoch": 0.3678648068669528, + "grad_norm": 1.3670172691345215, + "learning_rate": 3.6500452061300882e-06, + "loss": 2.5242, + "step": 6857 + }, + { + "epoch": 0.3679184549356223, + "grad_norm": 1.407310128211975, + "learning_rate": 3.6496594794079243e-06, + "loss": 2.4337, + "step": 6858 + }, + { + "epoch": 0.36797210300429184, + "grad_norm": 1.4597244262695312, + "learning_rate": 3.649273717974361e-06, + "loss": 2.2194, + "step": 6859 + }, + { + "epoch": 0.36802575107296137, + "grad_norm": 1.4699345827102661, + "learning_rate": 3.648887921841046e-06, + "loss": 2.2459, + "step": 6860 + }, + { + "epoch": 0.3680793991416309, + "grad_norm": 1.3794256448745728, + "learning_rate": 3.648502091019629e-06, + "loss": 2.1371, + "step": 6861 + }, + { + "epoch": 0.36813304721030043, + "grad_norm": 1.5203105211257935, + "learning_rate": 3.6481162255217564e-06, + "loss": 2.2867, + "step": 6862 + }, + { + "epoch": 0.36818669527896997, + "grad_norm": 1.7307788133621216, + "learning_rate": 3.6477303253590797e-06, + "loss": 2.4946, + "step": 6863 + }, + { + "epoch": 0.3682403433476395, + "grad_norm": 1.2765541076660156, + "learning_rate": 3.647344390543251e-06, + "loss": 2.1812, + "step": 6864 + }, + { + "epoch": 0.36829399141630903, + "grad_norm": 1.4848891496658325, + "learning_rate": 3.6469584210859233e-06, + "loss": 2.1088, + "step": 6865 + }, + { + "epoch": 0.36834763948497856, + "grad_norm": 1.5605049133300781, + "learning_rate": 3.6465724169987482e-06, + "loss": 2.3615, + "step": 6866 + }, + { + "epoch": 0.3684012875536481, + "grad_norm": 1.3556714057922363, + "learning_rate": 3.646186378293381e-06, + "loss": 2.3591, + "step": 6867 + }, + { + "epoch": 0.36845493562231757, + "grad_norm": 1.2964011430740356, + "learning_rate": 3.645800304981477e-06, + "loss": 2.3558, + "step": 6868 + }, + { + "epoch": 0.3685085836909871, + "grad_norm": 1.1354954242706299, + "learning_rate": 3.6454141970746943e-06, + "loss": 2.1562, + "step": 6869 + }, + { + "epoch": 0.36856223175965663, + "grad_norm": 2.5650525093078613, + "learning_rate": 3.6450280545846896e-06, + "loss": 2.089, + "step": 6870 + }, + { + "epoch": 0.36861587982832617, + "grad_norm": 1.9419045448303223, + "learning_rate": 3.6446418775231208e-06, + "loss": 2.3741, + "step": 6871 + }, + { + "epoch": 0.3686695278969957, + "grad_norm": 1.4953351020812988, + "learning_rate": 3.644255665901648e-06, + "loss": 2.4009, + "step": 6872 + }, + { + "epoch": 0.36872317596566523, + "grad_norm": 1.3456047773361206, + "learning_rate": 3.6438694197319335e-06, + "loss": 2.3481, + "step": 6873 + }, + { + "epoch": 0.36877682403433476, + "grad_norm": 1.766541838645935, + "learning_rate": 3.6434831390256374e-06, + "loss": 2.1215, + "step": 6874 + }, + { + "epoch": 0.3688304721030043, + "grad_norm": 1.5959115028381348, + "learning_rate": 3.6430968237944232e-06, + "loss": 2.205, + "step": 6875 + }, + { + "epoch": 0.3688841201716738, + "grad_norm": 1.3228532075881958, + "learning_rate": 3.6427104740499546e-06, + "loss": 2.4712, + "step": 6876 + }, + { + "epoch": 0.36893776824034336, + "grad_norm": 1.445631504058838, + "learning_rate": 3.642324089803897e-06, + "loss": 2.5512, + "step": 6877 + }, + { + "epoch": 0.3689914163090129, + "grad_norm": 1.5391074419021606, + "learning_rate": 3.641937671067916e-06, + "loss": 2.2152, + "step": 6878 + }, + { + "epoch": 0.3690450643776824, + "grad_norm": 1.803919792175293, + "learning_rate": 3.6415512178536796e-06, + "loss": 2.1563, + "step": 6879 + }, + { + "epoch": 0.36909871244635195, + "grad_norm": 1.4672173261642456, + "learning_rate": 3.641164730172854e-06, + "loss": 2.1143, + "step": 6880 + }, + { + "epoch": 0.3691523605150215, + "grad_norm": 1.3631654977798462, + "learning_rate": 3.6407782080371103e-06, + "loss": 2.1819, + "step": 6881 + }, + { + "epoch": 0.369206008583691, + "grad_norm": 1.3743414878845215, + "learning_rate": 3.640391651458117e-06, + "loss": 2.4696, + "step": 6882 + }, + { + "epoch": 0.3692596566523605, + "grad_norm": 1.5828720331192017, + "learning_rate": 3.6400050604475472e-06, + "loss": 2.4038, + "step": 6883 + }, + { + "epoch": 0.36931330472103, + "grad_norm": 2.0156710147857666, + "learning_rate": 3.6396184350170714e-06, + "loss": 2.2102, + "step": 6884 + }, + { + "epoch": 0.36936695278969955, + "grad_norm": 1.6584254503250122, + "learning_rate": 3.6392317751783635e-06, + "loss": 2.3173, + "step": 6885 + }, + { + "epoch": 0.3694206008583691, + "grad_norm": 1.5411570072174072, + "learning_rate": 3.6388450809430986e-06, + "loss": 1.9116, + "step": 6886 + }, + { + "epoch": 0.3694742489270386, + "grad_norm": 1.405226707458496, + "learning_rate": 3.638458352322951e-06, + "loss": 2.0492, + "step": 6887 + }, + { + "epoch": 0.36952789699570815, + "grad_norm": 1.5013080835342407, + "learning_rate": 3.6380715893295976e-06, + "loss": 2.5872, + "step": 6888 + }, + { + "epoch": 0.3695815450643777, + "grad_norm": 1.377221941947937, + "learning_rate": 3.6376847919747154e-06, + "loss": 2.2964, + "step": 6889 + }, + { + "epoch": 0.3696351931330472, + "grad_norm": 1.8704475164413452, + "learning_rate": 3.637297960269984e-06, + "loss": 2.0524, + "step": 6890 + }, + { + "epoch": 0.36968884120171674, + "grad_norm": 1.4416236877441406, + "learning_rate": 3.6369110942270803e-06, + "loss": 2.2946, + "step": 6891 + }, + { + "epoch": 0.3697424892703863, + "grad_norm": 1.2296833992004395, + "learning_rate": 3.636524193857689e-06, + "loss": 2.1448, + "step": 6892 + }, + { + "epoch": 0.3697961373390558, + "grad_norm": 1.3616312742233276, + "learning_rate": 3.636137259173488e-06, + "loss": 1.7696, + "step": 6893 + }, + { + "epoch": 0.36984978540772534, + "grad_norm": 1.3784282207489014, + "learning_rate": 3.635750290186162e-06, + "loss": 2.2658, + "step": 6894 + }, + { + "epoch": 0.36990343347639487, + "grad_norm": 1.4476991891860962, + "learning_rate": 3.635363286907393e-06, + "loss": 2.3323, + "step": 6895 + }, + { + "epoch": 0.3699570815450644, + "grad_norm": 1.310617446899414, + "learning_rate": 3.634976249348867e-06, + "loss": 2.2243, + "step": 6896 + }, + { + "epoch": 0.3700107296137339, + "grad_norm": 1.5402742624282837, + "learning_rate": 3.63458917752227e-06, + "loss": 2.25, + "step": 6897 + }, + { + "epoch": 0.3700643776824034, + "grad_norm": 1.245415210723877, + "learning_rate": 3.6342020714392877e-06, + "loss": 2.0873, + "step": 6898 + }, + { + "epoch": 0.37011802575107294, + "grad_norm": 1.5287516117095947, + "learning_rate": 3.6338149311116088e-06, + "loss": 2.2273, + "step": 6899 + }, + { + "epoch": 0.3701716738197425, + "grad_norm": 1.4434535503387451, + "learning_rate": 3.6334277565509218e-06, + "loss": 2.3083, + "step": 6900 + }, + { + "epoch": 0.370225321888412, + "grad_norm": 1.476590871810913, + "learning_rate": 3.6330405477689158e-06, + "loss": 2.0422, + "step": 6901 + }, + { + "epoch": 0.37027896995708154, + "grad_norm": 1.3976579904556274, + "learning_rate": 3.6326533047772827e-06, + "loss": 2.2208, + "step": 6902 + }, + { + "epoch": 0.37033261802575107, + "grad_norm": 1.3993518352508545, + "learning_rate": 3.6322660275877143e-06, + "loss": 2.2454, + "step": 6903 + }, + { + "epoch": 0.3703862660944206, + "grad_norm": 1.279201865196228, + "learning_rate": 3.631878716211903e-06, + "loss": 2.1404, + "step": 6904 + }, + { + "epoch": 0.37043991416309013, + "grad_norm": 1.2452553510665894, + "learning_rate": 3.631491370661544e-06, + "loss": 2.2937, + "step": 6905 + }, + { + "epoch": 0.37049356223175967, + "grad_norm": 1.3386310338974, + "learning_rate": 3.631103990948331e-06, + "loss": 2.2521, + "step": 6906 + }, + { + "epoch": 0.3705472103004292, + "grad_norm": 1.6687031984329224, + "learning_rate": 3.6307165770839597e-06, + "loss": 2.4997, + "step": 6907 + }, + { + "epoch": 0.37060085836909873, + "grad_norm": 1.729781150817871, + "learning_rate": 3.630329129080129e-06, + "loss": 2.4854, + "step": 6908 + }, + { + "epoch": 0.37065450643776826, + "grad_norm": 1.4928096532821655, + "learning_rate": 3.6299416469485366e-06, + "loss": 2.2308, + "step": 6909 + }, + { + "epoch": 0.3707081545064378, + "grad_norm": 1.532011866569519, + "learning_rate": 3.629554130700881e-06, + "loss": 2.5423, + "step": 6910 + }, + { + "epoch": 0.37076180257510727, + "grad_norm": 1.5617808103561401, + "learning_rate": 3.6291665803488618e-06, + "loss": 2.0088, + "step": 6911 + }, + { + "epoch": 0.3708154506437768, + "grad_norm": 1.3810231685638428, + "learning_rate": 3.628778995904182e-06, + "loss": 2.2385, + "step": 6912 + }, + { + "epoch": 0.37086909871244633, + "grad_norm": 3.196739673614502, + "learning_rate": 3.628391377378542e-06, + "loss": 2.35, + "step": 6913 + }, + { + "epoch": 0.37092274678111586, + "grad_norm": 1.5975474119186401, + "learning_rate": 3.6280037247836463e-06, + "loss": 2.3936, + "step": 6914 + }, + { + "epoch": 0.3709763948497854, + "grad_norm": 1.4685429334640503, + "learning_rate": 3.6276160381311988e-06, + "loss": 2.1783, + "step": 6915 + }, + { + "epoch": 0.3710300429184549, + "grad_norm": 1.5978342294692993, + "learning_rate": 3.6272283174329044e-06, + "loss": 2.2382, + "step": 6916 + }, + { + "epoch": 0.37108369098712446, + "grad_norm": 1.5717930793762207, + "learning_rate": 3.62684056270047e-06, + "loss": 2.4332, + "step": 6917 + }, + { + "epoch": 0.371137339055794, + "grad_norm": 1.413562297821045, + "learning_rate": 3.626452773945603e-06, + "loss": 2.3936, + "step": 6918 + }, + { + "epoch": 0.3711909871244635, + "grad_norm": 1.2497109174728394, + "learning_rate": 3.6260649511800118e-06, + "loss": 2.2301, + "step": 6919 + }, + { + "epoch": 0.37124463519313305, + "grad_norm": 1.279589056968689, + "learning_rate": 3.6256770944154057e-06, + "loss": 2.1729, + "step": 6920 + }, + { + "epoch": 0.3712982832618026, + "grad_norm": 1.5696780681610107, + "learning_rate": 3.625289203663495e-06, + "loss": 2.3711, + "step": 6921 + }, + { + "epoch": 0.3713519313304721, + "grad_norm": 1.3760271072387695, + "learning_rate": 3.6249012789359917e-06, + "loss": 1.9865, + "step": 6922 + }, + { + "epoch": 0.37140557939914165, + "grad_norm": 1.4253889322280884, + "learning_rate": 3.6245133202446085e-06, + "loss": 2.1821, + "step": 6923 + }, + { + "epoch": 0.3714592274678112, + "grad_norm": 1.4957057237625122, + "learning_rate": 3.6241253276010578e-06, + "loss": 2.1461, + "step": 6924 + }, + { + "epoch": 0.3715128755364807, + "grad_norm": 1.7063466310501099, + "learning_rate": 3.6237373010170552e-06, + "loss": 2.2886, + "step": 6925 + }, + { + "epoch": 0.3715665236051502, + "grad_norm": 1.2812485694885254, + "learning_rate": 3.6233492405043154e-06, + "loss": 2.3354, + "step": 6926 + }, + { + "epoch": 0.3716201716738197, + "grad_norm": 1.4430510997772217, + "learning_rate": 3.622961146074556e-06, + "loss": 2.1885, + "step": 6927 + }, + { + "epoch": 0.37167381974248925, + "grad_norm": 1.370879054069519, + "learning_rate": 3.6225730177394946e-06, + "loss": 2.3542, + "step": 6928 + }, + { + "epoch": 0.3717274678111588, + "grad_norm": 1.437784194946289, + "learning_rate": 3.6221848555108486e-06, + "loss": 2.3931, + "step": 6929 + }, + { + "epoch": 0.3717811158798283, + "grad_norm": 1.6398218870162964, + "learning_rate": 3.6217966594003383e-06, + "loss": 2.3929, + "step": 6930 + }, + { + "epoch": 0.37183476394849785, + "grad_norm": 1.5635154247283936, + "learning_rate": 3.621408429419686e-06, + "loss": 2.5612, + "step": 6931 + }, + { + "epoch": 0.3718884120171674, + "grad_norm": 1.545665979385376, + "learning_rate": 3.6210201655806114e-06, + "loss": 2.2541, + "step": 6932 + }, + { + "epoch": 0.3719420600858369, + "grad_norm": 1.6116752624511719, + "learning_rate": 3.6206318678948384e-06, + "loss": 2.246, + "step": 6933 + }, + { + "epoch": 0.37199570815450644, + "grad_norm": 1.532895565032959, + "learning_rate": 3.6202435363740896e-06, + "loss": 2.3675, + "step": 6934 + }, + { + "epoch": 0.372049356223176, + "grad_norm": 2.339257001876831, + "learning_rate": 3.6198551710300904e-06, + "loss": 2.3341, + "step": 6935 + }, + { + "epoch": 0.3721030042918455, + "grad_norm": 3.9742565155029297, + "learning_rate": 3.6194667718745675e-06, + "loss": 2.176, + "step": 6936 + }, + { + "epoch": 0.37215665236051504, + "grad_norm": 1.6383799314498901, + "learning_rate": 3.619078338919247e-06, + "loss": 2.3127, + "step": 6937 + }, + { + "epoch": 0.37221030042918457, + "grad_norm": 1.3526800870895386, + "learning_rate": 3.618689872175856e-06, + "loss": 1.9454, + "step": 6938 + }, + { + "epoch": 0.3722639484978541, + "grad_norm": 1.5295674800872803, + "learning_rate": 3.618301371656125e-06, + "loss": 2.2802, + "step": 6939 + }, + { + "epoch": 0.3723175965665236, + "grad_norm": 1.4326975345611572, + "learning_rate": 3.617912837371783e-06, + "loss": 2.5191, + "step": 6940 + }, + { + "epoch": 0.3723712446351931, + "grad_norm": 0.9981538653373718, + "learning_rate": 3.617524269334561e-06, + "loss": 1.9115, + "step": 6941 + }, + { + "epoch": 0.37242489270386264, + "grad_norm": 1.4271115064620972, + "learning_rate": 3.61713566755619e-06, + "loss": 2.1921, + "step": 6942 + }, + { + "epoch": 0.3724785407725322, + "grad_norm": 2.7249596118927, + "learning_rate": 3.616747032048405e-06, + "loss": 2.2199, + "step": 6943 + }, + { + "epoch": 0.3725321888412017, + "grad_norm": 1.286529779434204, + "learning_rate": 3.616358362822939e-06, + "loss": 2.4087, + "step": 6944 + }, + { + "epoch": 0.37258583690987124, + "grad_norm": 1.3982230424880981, + "learning_rate": 3.6159696598915267e-06, + "loss": 2.3678, + "step": 6945 + }, + { + "epoch": 0.37263948497854077, + "grad_norm": 1.4010636806488037, + "learning_rate": 3.6155809232659032e-06, + "loss": 2.0546, + "step": 6946 + }, + { + "epoch": 0.3726931330472103, + "grad_norm": 1.3854717016220093, + "learning_rate": 3.6151921529578075e-06, + "loss": 2.3902, + "step": 6947 + }, + { + "epoch": 0.37274678111587983, + "grad_norm": 1.5202025175094604, + "learning_rate": 3.614803348978977e-06, + "loss": 2.268, + "step": 6948 + }, + { + "epoch": 0.37280042918454936, + "grad_norm": 1.3080168962478638, + "learning_rate": 3.614414511341151e-06, + "loss": 2.1116, + "step": 6949 + }, + { + "epoch": 0.3728540772532189, + "grad_norm": 1.8063571453094482, + "learning_rate": 3.614025640056068e-06, + "loss": 2.7415, + "step": 6950 + }, + { + "epoch": 0.37290772532188843, + "grad_norm": 1.5825618505477905, + "learning_rate": 3.613636735135471e-06, + "loss": 2.3957, + "step": 6951 + }, + { + "epoch": 0.37296137339055796, + "grad_norm": 1.7007830142974854, + "learning_rate": 3.6132477965911012e-06, + "loss": 2.0377, + "step": 6952 + }, + { + "epoch": 0.3730150214592275, + "grad_norm": 1.6261029243469238, + "learning_rate": 3.612858824434702e-06, + "loss": 2.1265, + "step": 6953 + }, + { + "epoch": 0.373068669527897, + "grad_norm": 1.2923657894134521, + "learning_rate": 3.6124698186780166e-06, + "loss": 1.8762, + "step": 6954 + }, + { + "epoch": 0.3731223175965665, + "grad_norm": 1.3947232961654663, + "learning_rate": 3.6120807793327916e-06, + "loss": 2.2414, + "step": 6955 + }, + { + "epoch": 0.37317596566523603, + "grad_norm": 1.4074935913085938, + "learning_rate": 3.6116917064107727e-06, + "loss": 2.1697, + "step": 6956 + }, + { + "epoch": 0.37322961373390556, + "grad_norm": 1.7191129922866821, + "learning_rate": 3.6113025999237066e-06, + "loss": 2.2763, + "step": 6957 + }, + { + "epoch": 0.3732832618025751, + "grad_norm": 1.5107213258743286, + "learning_rate": 3.610913459883342e-06, + "loss": 2.485, + "step": 6958 + }, + { + "epoch": 0.3733369098712446, + "grad_norm": 1.3712161779403687, + "learning_rate": 3.6105242863014283e-06, + "loss": 2.3317, + "step": 6959 + }, + { + "epoch": 0.37339055793991416, + "grad_norm": 1.3459231853485107, + "learning_rate": 3.610135079189715e-06, + "loss": 2.3004, + "step": 6960 + }, + { + "epoch": 0.3734442060085837, + "grad_norm": 1.3892192840576172, + "learning_rate": 3.6097458385599536e-06, + "loss": 2.1997, + "step": 6961 + }, + { + "epoch": 0.3734978540772532, + "grad_norm": 1.4331645965576172, + "learning_rate": 3.609356564423897e-06, + "loss": 2.2221, + "step": 6962 + }, + { + "epoch": 0.37355150214592275, + "grad_norm": 1.3679664134979248, + "learning_rate": 3.6089672567932975e-06, + "loss": 2.1773, + "step": 6963 + }, + { + "epoch": 0.3736051502145923, + "grad_norm": 1.4237314462661743, + "learning_rate": 3.60857791567991e-06, + "loss": 2.1092, + "step": 6964 + }, + { + "epoch": 0.3736587982832618, + "grad_norm": 1.1443086862564087, + "learning_rate": 3.6081885410954897e-06, + "loss": 1.6066, + "step": 6965 + }, + { + "epoch": 0.37371244635193135, + "grad_norm": 1.3594821691513062, + "learning_rate": 3.6077991330517924e-06, + "loss": 2.352, + "step": 6966 + }, + { + "epoch": 0.3737660944206009, + "grad_norm": 1.2889430522918701, + "learning_rate": 3.6074096915605766e-06, + "loss": 2.191, + "step": 6967 + }, + { + "epoch": 0.3738197424892704, + "grad_norm": 1.7121226787567139, + "learning_rate": 3.6070202166335993e-06, + "loss": 2.3185, + "step": 6968 + }, + { + "epoch": 0.3738733905579399, + "grad_norm": 2.0185775756835938, + "learning_rate": 3.60663070828262e-06, + "loss": 2.0863, + "step": 6969 + }, + { + "epoch": 0.3739270386266094, + "grad_norm": 3.4433956146240234, + "learning_rate": 3.6062411665193997e-06, + "loss": 2.1854, + "step": 6970 + }, + { + "epoch": 0.37398068669527895, + "grad_norm": 1.4340267181396484, + "learning_rate": 3.6058515913556995e-06, + "loss": 1.6767, + "step": 6971 + }, + { + "epoch": 0.3740343347639485, + "grad_norm": 1.3661003112792969, + "learning_rate": 3.605461982803282e-06, + "loss": 2.1054, + "step": 6972 + }, + { + "epoch": 0.374087982832618, + "grad_norm": 1.246264934539795, + "learning_rate": 3.6050723408739098e-06, + "loss": 2.2377, + "step": 6973 + }, + { + "epoch": 0.37414163090128755, + "grad_norm": 1.5288461446762085, + "learning_rate": 3.6046826655793478e-06, + "loss": 2.2282, + "step": 6974 + }, + { + "epoch": 0.3741952789699571, + "grad_norm": 1.2349251508712769, + "learning_rate": 3.604292956931361e-06, + "loss": 2.2199, + "step": 6975 + }, + { + "epoch": 0.3742489270386266, + "grad_norm": 1.402969479560852, + "learning_rate": 3.6039032149417163e-06, + "loss": 2.0833, + "step": 6976 + }, + { + "epoch": 0.37430257510729614, + "grad_norm": 1.541727066040039, + "learning_rate": 3.6035134396221805e-06, + "loss": 1.8017, + "step": 6977 + }, + { + "epoch": 0.3743562231759657, + "grad_norm": 1.3445076942443848, + "learning_rate": 3.603123630984523e-06, + "loss": 2.2417, + "step": 6978 + }, + { + "epoch": 0.3744098712446352, + "grad_norm": 1.457708716392517, + "learning_rate": 3.602733789040512e-06, + "loss": 2.474, + "step": 6979 + }, + { + "epoch": 0.37446351931330474, + "grad_norm": 1.412330985069275, + "learning_rate": 3.602343913801919e-06, + "loss": 2.1986, + "step": 6980 + }, + { + "epoch": 0.37451716738197427, + "grad_norm": 1.4456124305725098, + "learning_rate": 3.6019540052805148e-06, + "loss": 2.3536, + "step": 6981 + }, + { + "epoch": 0.3745708154506438, + "grad_norm": 1.448690414428711, + "learning_rate": 3.601564063488071e-06, + "loss": 2.3614, + "step": 6982 + }, + { + "epoch": 0.3746244635193133, + "grad_norm": 1.4970598220825195, + "learning_rate": 3.6011740884363625e-06, + "loss": 2.3799, + "step": 6983 + }, + { + "epoch": 0.3746781115879828, + "grad_norm": 1.4095937013626099, + "learning_rate": 3.6007840801371636e-06, + "loss": 2.1805, + "step": 6984 + }, + { + "epoch": 0.37473175965665234, + "grad_norm": 1.596618890762329, + "learning_rate": 3.6003940386022485e-06, + "loss": 2.1495, + "step": 6985 + }, + { + "epoch": 0.3747854077253219, + "grad_norm": 1.5304063558578491, + "learning_rate": 3.6000039638433944e-06, + "loss": 2.1859, + "step": 6986 + }, + { + "epoch": 0.3748390557939914, + "grad_norm": 1.3699692487716675, + "learning_rate": 3.5996138558723793e-06, + "loss": 2.2755, + "step": 6987 + }, + { + "epoch": 0.37489270386266094, + "grad_norm": 1.3054895401000977, + "learning_rate": 3.59922371470098e-06, + "loss": 2.3412, + "step": 6988 + }, + { + "epoch": 0.37494635193133047, + "grad_norm": 1.673695683479309, + "learning_rate": 3.5988335403409785e-06, + "loss": 2.2001, + "step": 6989 + }, + { + "epoch": 0.375, + "grad_norm": 1.5906991958618164, + "learning_rate": 3.598443332804153e-06, + "loss": 2.443, + "step": 6990 + }, + { + "epoch": 0.37505364806866953, + "grad_norm": 1.3376318216323853, + "learning_rate": 3.5980530921022856e-06, + "loss": 2.334, + "step": 6991 + }, + { + "epoch": 0.37510729613733906, + "grad_norm": 1.4782027006149292, + "learning_rate": 3.5976628182471586e-06, + "loss": 2.5055, + "step": 6992 + }, + { + "epoch": 0.3751609442060086, + "grad_norm": 1.1901473999023438, + "learning_rate": 3.5972725112505563e-06, + "loss": 2.1497, + "step": 6993 + }, + { + "epoch": 0.3752145922746781, + "grad_norm": 1.4590404033660889, + "learning_rate": 3.596882171124262e-06, + "loss": 2.1396, + "step": 6994 + }, + { + "epoch": 0.37526824034334766, + "grad_norm": 1.4019227027893066, + "learning_rate": 3.5964917978800617e-06, + "loss": 2.5411, + "step": 6995 + }, + { + "epoch": 0.3753218884120172, + "grad_norm": 1.332124948501587, + "learning_rate": 3.5961013915297423e-06, + "loss": 1.3992, + "step": 6996 + }, + { + "epoch": 0.3753755364806867, + "grad_norm": 3.0119991302490234, + "learning_rate": 3.5957109520850905e-06, + "loss": 2.2746, + "step": 6997 + }, + { + "epoch": 0.3754291845493562, + "grad_norm": 1.5826221704483032, + "learning_rate": 3.5953204795578944e-06, + "loss": 1.3994, + "step": 6998 + }, + { + "epoch": 0.37548283261802573, + "grad_norm": 1.4329360723495483, + "learning_rate": 3.5949299739599453e-06, + "loss": 2.2318, + "step": 6999 + }, + { + "epoch": 0.37553648068669526, + "grad_norm": 1.5063965320587158, + "learning_rate": 3.5945394353030316e-06, + "loss": 2.2713, + "step": 7000 + }, + { + "epoch": 0.3755901287553648, + "grad_norm": 1.1498984098434448, + "learning_rate": 3.5941488635989454e-06, + "loss": 2.3212, + "step": 7001 + }, + { + "epoch": 0.3756437768240343, + "grad_norm": 1.262702465057373, + "learning_rate": 3.5937582588594795e-06, + "loss": 2.2687, + "step": 7002 + }, + { + "epoch": 0.37569742489270386, + "grad_norm": 1.4734928607940674, + "learning_rate": 3.5933676210964274e-06, + "loss": 2.2718, + "step": 7003 + }, + { + "epoch": 0.3757510729613734, + "grad_norm": 1.3860937356948853, + "learning_rate": 3.5929769503215826e-06, + "loss": 2.2316, + "step": 7004 + }, + { + "epoch": 0.3758047210300429, + "grad_norm": 1.3602614402770996, + "learning_rate": 3.592586246546742e-06, + "loss": 2.5799, + "step": 7005 + }, + { + "epoch": 0.37585836909871245, + "grad_norm": 1.2427059412002563, + "learning_rate": 3.5921955097837014e-06, + "loss": 2.3534, + "step": 7006 + }, + { + "epoch": 0.375912017167382, + "grad_norm": 1.2597901821136475, + "learning_rate": 3.5918047400442574e-06, + "loss": 1.9979, + "step": 7007 + }, + { + "epoch": 0.3759656652360515, + "grad_norm": 1.4652866125106812, + "learning_rate": 3.5914139373402083e-06, + "loss": 2.2636, + "step": 7008 + }, + { + "epoch": 0.37601931330472105, + "grad_norm": 1.5378305912017822, + "learning_rate": 3.591023101683355e-06, + "loss": 2.2195, + "step": 7009 + }, + { + "epoch": 0.3760729613733906, + "grad_norm": 1.57584810256958, + "learning_rate": 3.5906322330854977e-06, + "loss": 2.3939, + "step": 7010 + }, + { + "epoch": 0.3761266094420601, + "grad_norm": 1.2963011264801025, + "learning_rate": 3.5902413315584374e-06, + "loss": 2.2136, + "step": 7011 + }, + { + "epoch": 0.3761802575107296, + "grad_norm": 1.4453976154327393, + "learning_rate": 3.589850397113976e-06, + "loss": 2.296, + "step": 7012 + }, + { + "epoch": 0.3762339055793991, + "grad_norm": 1.4831602573394775, + "learning_rate": 3.5894594297639168e-06, + "loss": 2.3703, + "step": 7013 + }, + { + "epoch": 0.37628755364806865, + "grad_norm": 1.3723182678222656, + "learning_rate": 3.5890684295200645e-06, + "loss": 2.3522, + "step": 7014 + }, + { + "epoch": 0.3763412017167382, + "grad_norm": 1.3233827352523804, + "learning_rate": 3.5886773963942254e-06, + "loss": 2.2667, + "step": 7015 + }, + { + "epoch": 0.3763948497854077, + "grad_norm": 1.3848850727081299, + "learning_rate": 3.588286330398205e-06, + "loss": 2.226, + "step": 7016 + }, + { + "epoch": 0.37644849785407725, + "grad_norm": 1.5313208103179932, + "learning_rate": 3.587895231543811e-06, + "loss": 2.3367, + "step": 7017 + }, + { + "epoch": 0.3765021459227468, + "grad_norm": 2.033557176589966, + "learning_rate": 3.5875040998428513e-06, + "loss": 2.2574, + "step": 7018 + }, + { + "epoch": 0.3765557939914163, + "grad_norm": 1.627288579940796, + "learning_rate": 3.587112935307135e-06, + "loss": 2.1418, + "step": 7019 + }, + { + "epoch": 0.37660944206008584, + "grad_norm": 1.3068797588348389, + "learning_rate": 3.586721737948473e-06, + "loss": 2.4104, + "step": 7020 + }, + { + "epoch": 0.3766630901287554, + "grad_norm": 1.748645544052124, + "learning_rate": 3.586330507778677e-06, + "loss": 2.0801, + "step": 7021 + }, + { + "epoch": 0.3767167381974249, + "grad_norm": 1.4364442825317383, + "learning_rate": 3.585939244809559e-06, + "loss": 2.2361, + "step": 7022 + }, + { + "epoch": 0.37677038626609444, + "grad_norm": 1.4079698324203491, + "learning_rate": 3.5855479490529315e-06, + "loss": 2.3473, + "step": 7023 + }, + { + "epoch": 0.37682403433476397, + "grad_norm": 1.6047422885894775, + "learning_rate": 3.5851566205206108e-06, + "loss": 2.2015, + "step": 7024 + }, + { + "epoch": 0.3768776824034335, + "grad_norm": 1.6668063402175903, + "learning_rate": 3.5847652592244094e-06, + "loss": 2.3627, + "step": 7025 + }, + { + "epoch": 0.376931330472103, + "grad_norm": 1.4821275472640991, + "learning_rate": 3.584373865176145e-06, + "loss": 2.0908, + "step": 7026 + }, + { + "epoch": 0.3769849785407725, + "grad_norm": 1.6452339887619019, + "learning_rate": 3.583982438387636e-06, + "loss": 2.3016, + "step": 7027 + }, + { + "epoch": 0.37703862660944204, + "grad_norm": 1.2783164978027344, + "learning_rate": 3.583590978870699e-06, + "loss": 2.1881, + "step": 7028 + }, + { + "epoch": 0.37709227467811157, + "grad_norm": 1.3644497394561768, + "learning_rate": 3.5831994866371546e-06, + "loss": 2.222, + "step": 7029 + }, + { + "epoch": 0.3771459227467811, + "grad_norm": 1.5567418336868286, + "learning_rate": 3.582807961698821e-06, + "loss": 2.3035, + "step": 7030 + }, + { + "epoch": 0.37719957081545064, + "grad_norm": 2.014706611633301, + "learning_rate": 3.582416404067521e-06, + "loss": 1.7097, + "step": 7031 + }, + { + "epoch": 0.37725321888412017, + "grad_norm": 6.547465801239014, + "learning_rate": 3.582024813755077e-06, + "loss": 2.1854, + "step": 7032 + }, + { + "epoch": 0.3773068669527897, + "grad_norm": 2.5202903747558594, + "learning_rate": 3.5816331907733115e-06, + "loss": 2.1922, + "step": 7033 + }, + { + "epoch": 0.37736051502145923, + "grad_norm": 1.6390420198440552, + "learning_rate": 3.5812415351340486e-06, + "loss": 2.265, + "step": 7034 + }, + { + "epoch": 0.37741416309012876, + "grad_norm": 1.2423211336135864, + "learning_rate": 3.5808498468491135e-06, + "loss": 1.865, + "step": 7035 + }, + { + "epoch": 0.3774678111587983, + "grad_norm": 1.7429546117782593, + "learning_rate": 3.5804581259303334e-06, + "loss": 2.2402, + "step": 7036 + }, + { + "epoch": 0.3775214592274678, + "grad_norm": 1.4788603782653809, + "learning_rate": 3.580066372389534e-06, + "loss": 2.1695, + "step": 7037 + }, + { + "epoch": 0.37757510729613736, + "grad_norm": 1.5205669403076172, + "learning_rate": 3.579674586238544e-06, + "loss": 2.265, + "step": 7038 + }, + { + "epoch": 0.3776287553648069, + "grad_norm": 1.344826340675354, + "learning_rate": 3.579282767489193e-06, + "loss": 2.0102, + "step": 7039 + }, + { + "epoch": 0.3776824034334764, + "grad_norm": 1.8502384424209595, + "learning_rate": 3.5788909161533102e-06, + "loss": 2.2037, + "step": 7040 + }, + { + "epoch": 0.3777360515021459, + "grad_norm": 1.556571364402771, + "learning_rate": 3.5784990322427275e-06, + "loss": 2.0699, + "step": 7041 + }, + { + "epoch": 0.37778969957081543, + "grad_norm": 1.8016893863677979, + "learning_rate": 3.5781071157692763e-06, + "loss": 2.2543, + "step": 7042 + }, + { + "epoch": 0.37784334763948496, + "grad_norm": 1.5110739469528198, + "learning_rate": 3.57771516674479e-06, + "loss": 2.3401, + "step": 7043 + }, + { + "epoch": 0.3778969957081545, + "grad_norm": 1.4274299144744873, + "learning_rate": 3.577323185181103e-06, + "loss": 2.3794, + "step": 7044 + }, + { + "epoch": 0.377950643776824, + "grad_norm": 1.4881749153137207, + "learning_rate": 3.576931171090049e-06, + "loss": 2.3749, + "step": 7045 + }, + { + "epoch": 0.37800429184549356, + "grad_norm": 1.957499384880066, + "learning_rate": 3.5765391244834657e-06, + "loss": 2.4022, + "step": 7046 + }, + { + "epoch": 0.3780579399141631, + "grad_norm": 1.3438087701797485, + "learning_rate": 3.5761470453731886e-06, + "loss": 2.2271, + "step": 7047 + }, + { + "epoch": 0.3781115879828326, + "grad_norm": 1.544123649597168, + "learning_rate": 3.5757549337710562e-06, + "loss": 2.5698, + "step": 7048 + }, + { + "epoch": 0.37816523605150215, + "grad_norm": 1.542656660079956, + "learning_rate": 3.5753627896889077e-06, + "loss": 2.2716, + "step": 7049 + }, + { + "epoch": 0.3782188841201717, + "grad_norm": 1.3072032928466797, + "learning_rate": 3.5749706131385823e-06, + "loss": 2.3631, + "step": 7050 + }, + { + "epoch": 0.3782725321888412, + "grad_norm": 1.6364070177078247, + "learning_rate": 3.5745784041319233e-06, + "loss": 2.2716, + "step": 7051 + }, + { + "epoch": 0.37832618025751075, + "grad_norm": 1.4905989170074463, + "learning_rate": 3.574186162680769e-06, + "loss": 1.5382, + "step": 7052 + }, + { + "epoch": 0.3783798283261803, + "grad_norm": 1.6142522096633911, + "learning_rate": 3.573793888796964e-06, + "loss": 2.2595, + "step": 7053 + }, + { + "epoch": 0.3784334763948498, + "grad_norm": 1.1961321830749512, + "learning_rate": 3.573401582492352e-06, + "loss": 1.9678, + "step": 7054 + }, + { + "epoch": 0.3784871244635193, + "grad_norm": 1.6450841426849365, + "learning_rate": 3.5730092437787783e-06, + "loss": 2.5005, + "step": 7055 + }, + { + "epoch": 0.3785407725321888, + "grad_norm": 1.6296801567077637, + "learning_rate": 3.572616872668088e-06, + "loss": 2.5189, + "step": 7056 + }, + { + "epoch": 0.37859442060085835, + "grad_norm": 1.6453884840011597, + "learning_rate": 3.5722244691721285e-06, + "loss": 2.4327, + "step": 7057 + }, + { + "epoch": 0.3786480686695279, + "grad_norm": 1.3600528240203857, + "learning_rate": 3.571832033302747e-06, + "loss": 2.2271, + "step": 7058 + }, + { + "epoch": 0.3787017167381974, + "grad_norm": 1.6638065576553345, + "learning_rate": 3.571439565071792e-06, + "loss": 2.1244, + "step": 7059 + }, + { + "epoch": 0.37875536480686695, + "grad_norm": 1.3679696321487427, + "learning_rate": 3.571047064491114e-06, + "loss": 1.977, + "step": 7060 + }, + { + "epoch": 0.3788090128755365, + "grad_norm": 1.4558689594268799, + "learning_rate": 3.5706545315725632e-06, + "loss": 2.1901, + "step": 7061 + }, + { + "epoch": 0.378862660944206, + "grad_norm": 1.3570740222930908, + "learning_rate": 3.5702619663279913e-06, + "loss": 2.3027, + "step": 7062 + }, + { + "epoch": 0.37891630901287554, + "grad_norm": 1.4842164516448975, + "learning_rate": 3.5698693687692514e-06, + "loss": 2.0243, + "step": 7063 + }, + { + "epoch": 0.3789699570815451, + "grad_norm": 1.305939793586731, + "learning_rate": 3.5694767389081963e-06, + "loss": 2.3075, + "step": 7064 + }, + { + "epoch": 0.3790236051502146, + "grad_norm": 1.5054433345794678, + "learning_rate": 3.569084076756681e-06, + "loss": 1.5378, + "step": 7065 + }, + { + "epoch": 0.37907725321888414, + "grad_norm": 1.3943922519683838, + "learning_rate": 3.5686913823265614e-06, + "loss": 2.1983, + "step": 7066 + }, + { + "epoch": 0.37913090128755367, + "grad_norm": 1.4126089811325073, + "learning_rate": 3.568298655629693e-06, + "loss": 2.1204, + "step": 7067 + }, + { + "epoch": 0.3791845493562232, + "grad_norm": 1.2911114692687988, + "learning_rate": 3.5679058966779346e-06, + "loss": 2.1754, + "step": 7068 + }, + { + "epoch": 0.37923819742489273, + "grad_norm": 1.2536755800247192, + "learning_rate": 3.5675131054831435e-06, + "loss": 2.0383, + "step": 7069 + }, + { + "epoch": 0.3792918454935622, + "grad_norm": 1.4357402324676514, + "learning_rate": 3.567120282057179e-06, + "loss": 2.3079, + "step": 7070 + }, + { + "epoch": 0.37934549356223174, + "grad_norm": 1.3642776012420654, + "learning_rate": 3.5667274264119035e-06, + "loss": 1.9418, + "step": 7071 + }, + { + "epoch": 0.37939914163090127, + "grad_norm": 1.721253514289856, + "learning_rate": 3.566334538559176e-06, + "loss": 2.2938, + "step": 7072 + }, + { + "epoch": 0.3794527896995708, + "grad_norm": 1.5342763662338257, + "learning_rate": 3.5659416185108613e-06, + "loss": 2.3504, + "step": 7073 + }, + { + "epoch": 0.37950643776824033, + "grad_norm": 1.4710346460342407, + "learning_rate": 3.565548666278821e-06, + "loss": 2.2452, + "step": 7074 + }, + { + "epoch": 0.37956008583690987, + "grad_norm": 1.6790549755096436, + "learning_rate": 3.5651556818749196e-06, + "loss": 2.1834, + "step": 7075 + }, + { + "epoch": 0.3796137339055794, + "grad_norm": 2.2838780879974365, + "learning_rate": 3.5647626653110217e-06, + "loss": 2.1486, + "step": 7076 + }, + { + "epoch": 0.37966738197424893, + "grad_norm": 1.7085174322128296, + "learning_rate": 3.564369616598996e-06, + "loss": 2.3059, + "step": 7077 + }, + { + "epoch": 0.37972103004291846, + "grad_norm": 1.20753812789917, + "learning_rate": 3.5639765357507077e-06, + "loss": 2.2915, + "step": 7078 + }, + { + "epoch": 0.379774678111588, + "grad_norm": 1.3393698930740356, + "learning_rate": 3.5635834227780255e-06, + "loss": 2.2948, + "step": 7079 + }, + { + "epoch": 0.3798283261802575, + "grad_norm": 2.077826499938965, + "learning_rate": 3.5631902776928186e-06, + "loss": 1.9167, + "step": 7080 + }, + { + "epoch": 0.37988197424892706, + "grad_norm": 1.7156116962432861, + "learning_rate": 3.562797100506957e-06, + "loss": 2.0467, + "step": 7081 + }, + { + "epoch": 0.3799356223175966, + "grad_norm": 1.4461619853973389, + "learning_rate": 3.5624038912323113e-06, + "loss": 2.1931, + "step": 7082 + }, + { + "epoch": 0.3799892703862661, + "grad_norm": 1.5770859718322754, + "learning_rate": 3.562010649880755e-06, + "loss": 2.1917, + "step": 7083 + }, + { + "epoch": 0.3800429184549356, + "grad_norm": 4.459570407867432, + "learning_rate": 3.56161737646416e-06, + "loss": 1.8911, + "step": 7084 + }, + { + "epoch": 0.38009656652360513, + "grad_norm": 1.1969562768936157, + "learning_rate": 3.5612240709944008e-06, + "loss": 2.0294, + "step": 7085 + }, + { + "epoch": 0.38015021459227466, + "grad_norm": 1.3461261987686157, + "learning_rate": 3.5608307334833524e-06, + "loss": 2.0202, + "step": 7086 + }, + { + "epoch": 0.3802038626609442, + "grad_norm": 1.5598872900009155, + "learning_rate": 3.5604373639428903e-06, + "loss": 2.1215, + "step": 7087 + }, + { + "epoch": 0.3802575107296137, + "grad_norm": 1.3970175981521606, + "learning_rate": 3.5600439623848916e-06, + "loss": 2.1213, + "step": 7088 + }, + { + "epoch": 0.38031115879828326, + "grad_norm": 1.3933061361312866, + "learning_rate": 3.559650528821234e-06, + "loss": 2.2406, + "step": 7089 + }, + { + "epoch": 0.3803648068669528, + "grad_norm": 1.1037049293518066, + "learning_rate": 3.5592570632637976e-06, + "loss": 2.2624, + "step": 7090 + }, + { + "epoch": 0.3804184549356223, + "grad_norm": 1.8765888214111328, + "learning_rate": 3.5588635657244607e-06, + "loss": 2.1129, + "step": 7091 + }, + { + "epoch": 0.38047210300429185, + "grad_norm": 1.4874075651168823, + "learning_rate": 3.558470036215104e-06, + "loss": 2.2892, + "step": 7092 + }, + { + "epoch": 0.3805257510729614, + "grad_norm": 1.5440664291381836, + "learning_rate": 3.5580764747476106e-06, + "loss": 2.4283, + "step": 7093 + }, + { + "epoch": 0.3805793991416309, + "grad_norm": 1.4877080917358398, + "learning_rate": 3.5576828813338617e-06, + "loss": 2.38, + "step": 7094 + }, + { + "epoch": 0.38063304721030045, + "grad_norm": 2.0485153198242188, + "learning_rate": 3.557289255985743e-06, + "loss": 2.1523, + "step": 7095 + }, + { + "epoch": 0.38068669527897, + "grad_norm": 1.4862865209579468, + "learning_rate": 3.5568955987151373e-06, + "loss": 2.4894, + "step": 7096 + }, + { + "epoch": 0.3807403433476395, + "grad_norm": 3.8561813831329346, + "learning_rate": 3.5565019095339304e-06, + "loss": 2.4942, + "step": 7097 + }, + { + "epoch": 0.380793991416309, + "grad_norm": 1.4156289100646973, + "learning_rate": 3.5561081884540094e-06, + "loss": 2.346, + "step": 7098 + }, + { + "epoch": 0.3808476394849785, + "grad_norm": 1.4625810384750366, + "learning_rate": 3.5557144354872613e-06, + "loss": 2.2899, + "step": 7099 + }, + { + "epoch": 0.38090128755364805, + "grad_norm": 1.4686527252197266, + "learning_rate": 3.5553206506455757e-06, + "loss": 2.2372, + "step": 7100 + }, + { + "epoch": 0.3809549356223176, + "grad_norm": 1.2918366193771362, + "learning_rate": 3.554926833940841e-06, + "loss": 1.9015, + "step": 7101 + }, + { + "epoch": 0.3810085836909871, + "grad_norm": 1.5385547876358032, + "learning_rate": 3.5545329853849474e-06, + "loss": 2.4159, + "step": 7102 + }, + { + "epoch": 0.38106223175965664, + "grad_norm": 1.6376628875732422, + "learning_rate": 3.5541391049897876e-06, + "loss": 2.4628, + "step": 7103 + }, + { + "epoch": 0.3811158798283262, + "grad_norm": 1.4753223657608032, + "learning_rate": 3.553745192767253e-06, + "loss": 2.2805, + "step": 7104 + }, + { + "epoch": 0.3811695278969957, + "grad_norm": 2.0424320697784424, + "learning_rate": 3.5533512487292364e-06, + "loss": 2.4933, + "step": 7105 + }, + { + "epoch": 0.38122317596566524, + "grad_norm": 1.2379062175750732, + "learning_rate": 3.552957272887634e-06, + "loss": 2.0805, + "step": 7106 + }, + { + "epoch": 0.38127682403433477, + "grad_norm": 1.2036736011505127, + "learning_rate": 3.5525632652543383e-06, + "loss": 2.2715, + "step": 7107 + }, + { + "epoch": 0.3813304721030043, + "grad_norm": 1.443591594696045, + "learning_rate": 3.5521692258412483e-06, + "loss": 2.3116, + "step": 7108 + }, + { + "epoch": 0.38138412017167383, + "grad_norm": 1.4672949314117432, + "learning_rate": 3.5517751546602587e-06, + "loss": 2.3167, + "step": 7109 + }, + { + "epoch": 0.38143776824034337, + "grad_norm": 1.4518742561340332, + "learning_rate": 3.5513810517232683e-06, + "loss": 2.2632, + "step": 7110 + }, + { + "epoch": 0.3814914163090129, + "grad_norm": 1.4921389818191528, + "learning_rate": 3.5509869170421774e-06, + "loss": 2.4078, + "step": 7111 + }, + { + "epoch": 0.38154506437768243, + "grad_norm": 1.5881885290145874, + "learning_rate": 3.5505927506288852e-06, + "loss": 2.1948, + "step": 7112 + }, + { + "epoch": 0.3815987124463519, + "grad_norm": 1.615777611732483, + "learning_rate": 3.550198552495292e-06, + "loss": 2.4147, + "step": 7113 + }, + { + "epoch": 0.38165236051502144, + "grad_norm": 1.4781193733215332, + "learning_rate": 3.549804322653301e-06, + "loss": 2.3769, + "step": 7114 + }, + { + "epoch": 0.38170600858369097, + "grad_norm": 1.2851500511169434, + "learning_rate": 3.549410061114813e-06, + "loss": 2.2328, + "step": 7115 + }, + { + "epoch": 0.3817596566523605, + "grad_norm": 1.3613777160644531, + "learning_rate": 3.549015767891734e-06, + "loss": 2.0962, + "step": 7116 + }, + { + "epoch": 0.38181330472103003, + "grad_norm": 1.131056785583496, + "learning_rate": 3.5486214429959687e-06, + "loss": 2.1467, + "step": 7117 + }, + { + "epoch": 0.38186695278969957, + "grad_norm": 1.5438752174377441, + "learning_rate": 3.548227086439422e-06, + "loss": 2.2707, + "step": 7118 + }, + { + "epoch": 0.3819206008583691, + "grad_norm": 1.6165395975112915, + "learning_rate": 3.5478326982340004e-06, + "loss": 2.2705, + "step": 7119 + }, + { + "epoch": 0.38197424892703863, + "grad_norm": 1.7686961889266968, + "learning_rate": 3.5474382783916124e-06, + "loss": 2.3064, + "step": 7120 + }, + { + "epoch": 0.38202789699570816, + "grad_norm": 1.5419442653656006, + "learning_rate": 3.547043826924166e-06, + "loss": 2.0073, + "step": 7121 + }, + { + "epoch": 0.3820815450643777, + "grad_norm": 1.4060407876968384, + "learning_rate": 3.5466493438435707e-06, + "loss": 2.1571, + "step": 7122 + }, + { + "epoch": 0.3821351931330472, + "grad_norm": 2.1350767612457275, + "learning_rate": 3.5462548291617377e-06, + "loss": 2.3748, + "step": 7123 + }, + { + "epoch": 0.38218884120171676, + "grad_norm": 1.5835888385772705, + "learning_rate": 3.545860282890578e-06, + "loss": 2.3195, + "step": 7124 + }, + { + "epoch": 0.3822424892703863, + "grad_norm": 1.5308562517166138, + "learning_rate": 3.5454657050420037e-06, + "loss": 2.138, + "step": 7125 + }, + { + "epoch": 0.3822961373390558, + "grad_norm": 1.4792020320892334, + "learning_rate": 3.5450710956279293e-06, + "loss": 2.4436, + "step": 7126 + }, + { + "epoch": 0.3823497854077253, + "grad_norm": 1.5629016160964966, + "learning_rate": 3.5446764546602686e-06, + "loss": 2.2837, + "step": 7127 + }, + { + "epoch": 0.3824034334763948, + "grad_norm": 1.732651710510254, + "learning_rate": 3.544281782150937e-06, + "loss": 2.2712, + "step": 7128 + }, + { + "epoch": 0.38245708154506436, + "grad_norm": 1.771511197090149, + "learning_rate": 3.5438870781118503e-06, + "loss": 2.2454, + "step": 7129 + }, + { + "epoch": 0.3825107296137339, + "grad_norm": 1.492269515991211, + "learning_rate": 3.543492342554926e-06, + "loss": 2.102, + "step": 7130 + }, + { + "epoch": 0.3825643776824034, + "grad_norm": 1.2588258981704712, + "learning_rate": 3.5430975754920823e-06, + "loss": 2.4598, + "step": 7131 + }, + { + "epoch": 0.38261802575107295, + "grad_norm": 1.4006578922271729, + "learning_rate": 3.5427027769352373e-06, + "loss": 2.2336, + "step": 7132 + }, + { + "epoch": 0.3826716738197425, + "grad_norm": 1.5897362232208252, + "learning_rate": 3.5423079468963127e-06, + "loss": 2.2819, + "step": 7133 + }, + { + "epoch": 0.382725321888412, + "grad_norm": 1.5214604139328003, + "learning_rate": 3.54191308538723e-06, + "loss": 2.4479, + "step": 7134 + }, + { + "epoch": 0.38277896995708155, + "grad_norm": 1.700340747833252, + "learning_rate": 3.5415181924199083e-06, + "loss": 2.3102, + "step": 7135 + }, + { + "epoch": 0.3828326180257511, + "grad_norm": 1.4812581539154053, + "learning_rate": 3.541123268006273e-06, + "loss": 2.4669, + "step": 7136 + }, + { + "epoch": 0.3828862660944206, + "grad_norm": 1.2273448705673218, + "learning_rate": 3.540728312158247e-06, + "loss": 2.0661, + "step": 7137 + }, + { + "epoch": 0.38293991416309014, + "grad_norm": 1.6681246757507324, + "learning_rate": 3.540333324887755e-06, + "loss": 2.1888, + "step": 7138 + }, + { + "epoch": 0.3829935622317597, + "grad_norm": 1.4309213161468506, + "learning_rate": 3.5399383062067235e-06, + "loss": 2.3372, + "step": 7139 + }, + { + "epoch": 0.3830472103004292, + "grad_norm": 1.289438247680664, + "learning_rate": 3.539543256127078e-06, + "loss": 2.5661, + "step": 7140 + }, + { + "epoch": 0.38310085836909874, + "grad_norm": 1.4819854497909546, + "learning_rate": 3.5391481746607477e-06, + "loss": 2.1952, + "step": 7141 + }, + { + "epoch": 0.3831545064377682, + "grad_norm": 1.5064640045166016, + "learning_rate": 3.5387530618196596e-06, + "loss": 2.2221, + "step": 7142 + }, + { + "epoch": 0.38320815450643775, + "grad_norm": 1.4701539278030396, + "learning_rate": 3.538357917615745e-06, + "loss": 2.2065, + "step": 7143 + }, + { + "epoch": 0.3832618025751073, + "grad_norm": 1.266905426979065, + "learning_rate": 3.537962742060933e-06, + "loss": 2.5487, + "step": 7144 + }, + { + "epoch": 0.3833154506437768, + "grad_norm": 1.1653016805648804, + "learning_rate": 3.537567535167155e-06, + "loss": 1.8619, + "step": 7145 + }, + { + "epoch": 0.38336909871244634, + "grad_norm": 1.3341618776321411, + "learning_rate": 3.537172296946344e-06, + "loss": 2.398, + "step": 7146 + }, + { + "epoch": 0.3834227467811159, + "grad_norm": 1.3650214672088623, + "learning_rate": 3.536777027410434e-06, + "loss": 2.1405, + "step": 7147 + }, + { + "epoch": 0.3834763948497854, + "grad_norm": 1.6660027503967285, + "learning_rate": 3.5363817265713577e-06, + "loss": 1.7816, + "step": 7148 + }, + { + "epoch": 0.38353004291845494, + "grad_norm": 1.1885535717010498, + "learning_rate": 3.535986394441051e-06, + "loss": 1.8798, + "step": 7149 + }, + { + "epoch": 0.38358369098712447, + "grad_norm": 1.63679039478302, + "learning_rate": 3.5355910310314506e-06, + "loss": 2.5674, + "step": 7150 + }, + { + "epoch": 0.383637339055794, + "grad_norm": 1.574468731880188, + "learning_rate": 3.5351956363544933e-06, + "loss": 2.3473, + "step": 7151 + }, + { + "epoch": 0.38369098712446353, + "grad_norm": 1.3876655101776123, + "learning_rate": 3.534800210422117e-06, + "loss": 2.2927, + "step": 7152 + }, + { + "epoch": 0.38374463519313307, + "grad_norm": 1.7459757328033447, + "learning_rate": 3.5344047532462606e-06, + "loss": 2.3925, + "step": 7153 + }, + { + "epoch": 0.3837982832618026, + "grad_norm": 1.502570629119873, + "learning_rate": 3.5340092648388643e-06, + "loss": 2.403, + "step": 7154 + }, + { + "epoch": 0.38385193133047213, + "grad_norm": 1.263372540473938, + "learning_rate": 3.533613745211868e-06, + "loss": 2.4749, + "step": 7155 + }, + { + "epoch": 0.3839055793991416, + "grad_norm": 1.4296120405197144, + "learning_rate": 3.533218194377216e-06, + "loss": 2.1824, + "step": 7156 + }, + { + "epoch": 0.38395922746781114, + "grad_norm": 1.2555186748504639, + "learning_rate": 3.5328226123468483e-06, + "loss": 2.1465, + "step": 7157 + }, + { + "epoch": 0.38401287553648067, + "grad_norm": 1.55518639087677, + "learning_rate": 3.5324269991327098e-06, + "loss": 2.4283, + "step": 7158 + }, + { + "epoch": 0.3840665236051502, + "grad_norm": 1.265496015548706, + "learning_rate": 3.5320313547467453e-06, + "loss": 2.1509, + "step": 7159 + }, + { + "epoch": 0.38412017167381973, + "grad_norm": 1.3166147470474243, + "learning_rate": 3.531635679200901e-06, + "loss": 1.8173, + "step": 7160 + }, + { + "epoch": 0.38417381974248926, + "grad_norm": 1.4758870601654053, + "learning_rate": 3.5312399725071223e-06, + "loss": 2.3758, + "step": 7161 + }, + { + "epoch": 0.3842274678111588, + "grad_norm": 1.6287429332733154, + "learning_rate": 3.5308442346773565e-06, + "loss": 2.1174, + "step": 7162 + }, + { + "epoch": 0.3842811158798283, + "grad_norm": 1.4377782344818115, + "learning_rate": 3.5304484657235525e-06, + "loss": 2.1342, + "step": 7163 + }, + { + "epoch": 0.38433476394849786, + "grad_norm": 1.4711322784423828, + "learning_rate": 3.5300526656576605e-06, + "loss": 2.2619, + "step": 7164 + }, + { + "epoch": 0.3843884120171674, + "grad_norm": 1.323542833328247, + "learning_rate": 3.52965683449163e-06, + "loss": 2.2356, + "step": 7165 + }, + { + "epoch": 0.3844420600858369, + "grad_norm": 1.6299479007720947, + "learning_rate": 3.529260972237412e-06, + "loss": 2.2418, + "step": 7166 + }, + { + "epoch": 0.38449570815450645, + "grad_norm": 1.2885947227478027, + "learning_rate": 3.5288650789069588e-06, + "loss": 2.3141, + "step": 7167 + }, + { + "epoch": 0.384549356223176, + "grad_norm": 1.266083836555481, + "learning_rate": 3.528469154512224e-06, + "loss": 1.5786, + "step": 7168 + }, + { + "epoch": 0.3846030042918455, + "grad_norm": 1.5370007753372192, + "learning_rate": 3.528073199065162e-06, + "loss": 2.1887, + "step": 7169 + }, + { + "epoch": 0.384656652360515, + "grad_norm": 1.4732719659805298, + "learning_rate": 3.5276772125777265e-06, + "loss": 2.6335, + "step": 7170 + }, + { + "epoch": 0.3847103004291845, + "grad_norm": 1.597367763519287, + "learning_rate": 3.5272811950618734e-06, + "loss": 2.1465, + "step": 7171 + }, + { + "epoch": 0.38476394849785406, + "grad_norm": 1.3470137119293213, + "learning_rate": 3.5268851465295605e-06, + "loss": 2.2067, + "step": 7172 + }, + { + "epoch": 0.3848175965665236, + "grad_norm": 1.5376290082931519, + "learning_rate": 3.5264890669927455e-06, + "loss": 2.2775, + "step": 7173 + }, + { + "epoch": 0.3848712446351931, + "grad_norm": 1.2760653495788574, + "learning_rate": 3.526092956463388e-06, + "loss": 2.2944, + "step": 7174 + }, + { + "epoch": 0.38492489270386265, + "grad_norm": 1.4818849563598633, + "learning_rate": 3.5256968149534453e-06, + "loss": 2.3467, + "step": 7175 + }, + { + "epoch": 0.3849785407725322, + "grad_norm": 1.5066486597061157, + "learning_rate": 3.5253006424748796e-06, + "loss": 2.07, + "step": 7176 + }, + { + "epoch": 0.3850321888412017, + "grad_norm": 1.6796036958694458, + "learning_rate": 3.5249044390396518e-06, + "loss": 2.1107, + "step": 7177 + }, + { + "epoch": 0.38508583690987125, + "grad_norm": 1.6663833856582642, + "learning_rate": 3.524508204659725e-06, + "loss": 2.239, + "step": 7178 + }, + { + "epoch": 0.3851394849785408, + "grad_norm": 1.1909596920013428, + "learning_rate": 3.5241119393470633e-06, + "loss": 2.1014, + "step": 7179 + }, + { + "epoch": 0.3851931330472103, + "grad_norm": 2.04338002204895, + "learning_rate": 3.5237156431136294e-06, + "loss": 1.9767, + "step": 7180 + }, + { + "epoch": 0.38524678111587984, + "grad_norm": 1.365208387374878, + "learning_rate": 3.523319315971389e-06, + "loss": 2.3764, + "step": 7181 + }, + { + "epoch": 0.3853004291845494, + "grad_norm": 1.4043782949447632, + "learning_rate": 3.5229229579323088e-06, + "loss": 2.4325, + "step": 7182 + }, + { + "epoch": 0.3853540772532189, + "grad_norm": 1.4276247024536133, + "learning_rate": 3.522526569008356e-06, + "loss": 2.6211, + "step": 7183 + }, + { + "epoch": 0.38540772532188844, + "grad_norm": 1.2876906394958496, + "learning_rate": 3.5221301492114983e-06, + "loss": 2.0073, + "step": 7184 + }, + { + "epoch": 0.3854613733905579, + "grad_norm": 1.452094554901123, + "learning_rate": 3.5217336985537047e-06, + "loss": 2.2104, + "step": 7185 + }, + { + "epoch": 0.38551502145922745, + "grad_norm": 1.4212895631790161, + "learning_rate": 3.521337217046945e-06, + "loss": 2.0683, + "step": 7186 + }, + { + "epoch": 0.385568669527897, + "grad_norm": 1.4591859579086304, + "learning_rate": 3.5209407047031914e-06, + "loss": 2.4194, + "step": 7187 + }, + { + "epoch": 0.3856223175965665, + "grad_norm": 1.4990204572677612, + "learning_rate": 3.5205441615344134e-06, + "loss": 2.1608, + "step": 7188 + }, + { + "epoch": 0.38567596566523604, + "grad_norm": 1.4680249691009521, + "learning_rate": 3.520147587552585e-06, + "loss": 2.3746, + "step": 7189 + }, + { + "epoch": 0.3857296137339056, + "grad_norm": 1.3927541971206665, + "learning_rate": 3.5197509827696806e-06, + "loss": 2.3607, + "step": 7190 + }, + { + "epoch": 0.3857832618025751, + "grad_norm": 1.6681116819381714, + "learning_rate": 3.5193543471976744e-06, + "loss": 2.262, + "step": 7191 + }, + { + "epoch": 0.38583690987124464, + "grad_norm": 1.6615896224975586, + "learning_rate": 3.518957680848541e-06, + "loss": 2.595, + "step": 7192 + }, + { + "epoch": 0.38589055793991417, + "grad_norm": 1.352702260017395, + "learning_rate": 3.518560983734258e-06, + "loss": 2.1418, + "step": 7193 + }, + { + "epoch": 0.3859442060085837, + "grad_norm": 1.3523303270339966, + "learning_rate": 3.5181642558668007e-06, + "loss": 2.0652, + "step": 7194 + }, + { + "epoch": 0.38599785407725323, + "grad_norm": 1.5282734632492065, + "learning_rate": 3.51776749725815e-06, + "loss": 2.247, + "step": 7195 + }, + { + "epoch": 0.38605150214592276, + "grad_norm": 1.5017155408859253, + "learning_rate": 3.517370707920284e-06, + "loss": 2.1643, + "step": 7196 + }, + { + "epoch": 0.3861051502145923, + "grad_norm": 1.3667994737625122, + "learning_rate": 3.5169738878651825e-06, + "loss": 2.2816, + "step": 7197 + }, + { + "epoch": 0.38615879828326183, + "grad_norm": 1.5541408061981201, + "learning_rate": 3.516577037104828e-06, + "loss": 2.2457, + "step": 7198 + }, + { + "epoch": 0.3862124463519313, + "grad_norm": 1.3501417636871338, + "learning_rate": 3.516180155651201e-06, + "loss": 2.3359, + "step": 7199 + }, + { + "epoch": 0.38626609442060084, + "grad_norm": 1.5077298879623413, + "learning_rate": 3.515783243516284e-06, + "loss": 2.5775, + "step": 7200 + }, + { + "epoch": 0.38631974248927037, + "grad_norm": 1.6984909772872925, + "learning_rate": 3.515386300712063e-06, + "loss": 2.2992, + "step": 7201 + }, + { + "epoch": 0.3863733905579399, + "grad_norm": 1.3329027891159058, + "learning_rate": 3.5149893272505216e-06, + "loss": 2.3561, + "step": 7202 + }, + { + "epoch": 0.38642703862660943, + "grad_norm": 1.478976845741272, + "learning_rate": 3.5145923231436453e-06, + "loss": 2.3378, + "step": 7203 + }, + { + "epoch": 0.38648068669527896, + "grad_norm": 1.4096879959106445, + "learning_rate": 3.5141952884034215e-06, + "loss": 2.2807, + "step": 7204 + }, + { + "epoch": 0.3865343347639485, + "grad_norm": 1.4988951683044434, + "learning_rate": 3.513798223041837e-06, + "loss": 2.5366, + "step": 7205 + }, + { + "epoch": 0.386587982832618, + "grad_norm": 1.443189024925232, + "learning_rate": 3.51340112707088e-06, + "loss": 2.3536, + "step": 7206 + }, + { + "epoch": 0.38664163090128756, + "grad_norm": 1.542603850364685, + "learning_rate": 3.513004000502541e-06, + "loss": 2.2514, + "step": 7207 + }, + { + "epoch": 0.3866952789699571, + "grad_norm": 2.2659595012664795, + "learning_rate": 3.5126068433488103e-06, + "loss": 2.0757, + "step": 7208 + }, + { + "epoch": 0.3867489270386266, + "grad_norm": 1.441951870918274, + "learning_rate": 3.512209655621679e-06, + "loss": 2.345, + "step": 7209 + }, + { + "epoch": 0.38680257510729615, + "grad_norm": 1.6946419477462769, + "learning_rate": 3.511812437333138e-06, + "loss": 2.2524, + "step": 7210 + }, + { + "epoch": 0.3868562231759657, + "grad_norm": 1.3067548274993896, + "learning_rate": 3.511415188495182e-06, + "loss": 2.3312, + "step": 7211 + }, + { + "epoch": 0.3869098712446352, + "grad_norm": 1.514825701713562, + "learning_rate": 3.5110179091198048e-06, + "loss": 2.2418, + "step": 7212 + }, + { + "epoch": 0.3869635193133047, + "grad_norm": 1.4664649963378906, + "learning_rate": 3.5106205992190014e-06, + "loss": 2.1749, + "step": 7213 + }, + { + "epoch": 0.3870171673819742, + "grad_norm": 1.5010216236114502, + "learning_rate": 3.510223258804767e-06, + "loss": 2.2713, + "step": 7214 + }, + { + "epoch": 0.38707081545064376, + "grad_norm": 1.3301386833190918, + "learning_rate": 3.5098258878890985e-06, + "loss": 1.9829, + "step": 7215 + }, + { + "epoch": 0.3871244635193133, + "grad_norm": 1.6319094896316528, + "learning_rate": 3.509428486483994e-06, + "loss": 1.9147, + "step": 7216 + }, + { + "epoch": 0.3871781115879828, + "grad_norm": 1.3890591859817505, + "learning_rate": 3.509031054601452e-06, + "loss": 2.2246, + "step": 7217 + }, + { + "epoch": 0.38723175965665235, + "grad_norm": 1.246338963508606, + "learning_rate": 3.508633592253472e-06, + "loss": 2.1182, + "step": 7218 + }, + { + "epoch": 0.3872854077253219, + "grad_norm": 1.4997339248657227, + "learning_rate": 3.5082360994520546e-06, + "loss": 2.4062, + "step": 7219 + }, + { + "epoch": 0.3873390557939914, + "grad_norm": 2.2339086532592773, + "learning_rate": 3.507838576209202e-06, + "loss": 1.497, + "step": 7220 + }, + { + "epoch": 0.38739270386266095, + "grad_norm": 1.509867787361145, + "learning_rate": 3.5074410225369147e-06, + "loss": 2.308, + "step": 7221 + }, + { + "epoch": 0.3874463519313305, + "grad_norm": 1.3964465856552124, + "learning_rate": 3.507043438447198e-06, + "loss": 1.9849, + "step": 7222 + }, + { + "epoch": 0.3875, + "grad_norm": 1.2551723718643188, + "learning_rate": 3.5066458239520544e-06, + "loss": 2.068, + "step": 7223 + }, + { + "epoch": 0.38755364806866954, + "grad_norm": 1.3833973407745361, + "learning_rate": 3.5062481790634893e-06, + "loss": 2.2026, + "step": 7224 + }, + { + "epoch": 0.3876072961373391, + "grad_norm": 1.4364162683486938, + "learning_rate": 3.5058505037935097e-06, + "loss": 1.5923, + "step": 7225 + }, + { + "epoch": 0.3876609442060086, + "grad_norm": 1.4871810674667358, + "learning_rate": 3.505452798154122e-06, + "loss": 2.2423, + "step": 7226 + }, + { + "epoch": 0.38771459227467814, + "grad_norm": 1.545515775680542, + "learning_rate": 3.5050550621573334e-06, + "loss": 2.5672, + "step": 7227 + }, + { + "epoch": 0.3877682403433476, + "grad_norm": 1.383879542350769, + "learning_rate": 3.504657295815153e-06, + "loss": 2.1959, + "step": 7228 + }, + { + "epoch": 0.38782188841201715, + "grad_norm": 1.5071160793304443, + "learning_rate": 3.5042594991395912e-06, + "loss": 2.445, + "step": 7229 + }, + { + "epoch": 0.3878755364806867, + "grad_norm": 1.374474287033081, + "learning_rate": 3.5038616721426576e-06, + "loss": 2.4563, + "step": 7230 + }, + { + "epoch": 0.3879291845493562, + "grad_norm": 1.5588550567626953, + "learning_rate": 3.5034638148363644e-06, + "loss": 2.3704, + "step": 7231 + }, + { + "epoch": 0.38798283261802574, + "grad_norm": 1.360715389251709, + "learning_rate": 3.5030659272327233e-06, + "loss": 2.2062, + "step": 7232 + }, + { + "epoch": 0.3880364806866953, + "grad_norm": 1.4977775812149048, + "learning_rate": 3.502668009343748e-06, + "loss": 2.2087, + "step": 7233 + }, + { + "epoch": 0.3880901287553648, + "grad_norm": 1.2632594108581543, + "learning_rate": 3.5022700611814527e-06, + "loss": 2.5256, + "step": 7234 + }, + { + "epoch": 0.38814377682403434, + "grad_norm": 1.2800676822662354, + "learning_rate": 3.5018720827578523e-06, + "loss": 1.7638, + "step": 7235 + }, + { + "epoch": 0.38819742489270387, + "grad_norm": 1.99589204788208, + "learning_rate": 3.5014740740849646e-06, + "loss": 2.2583, + "step": 7236 + }, + { + "epoch": 0.3882510729613734, + "grad_norm": 1.4568816423416138, + "learning_rate": 3.501076035174804e-06, + "loss": 2.2913, + "step": 7237 + }, + { + "epoch": 0.38830472103004293, + "grad_norm": 1.5416792631149292, + "learning_rate": 3.5006779660393897e-06, + "loss": 2.2883, + "step": 7238 + }, + { + "epoch": 0.38835836909871246, + "grad_norm": 1.7652993202209473, + "learning_rate": 3.50027986669074e-06, + "loss": 2.315, + "step": 7239 + }, + { + "epoch": 0.388412017167382, + "grad_norm": 1.3815497159957886, + "learning_rate": 3.4998817371408754e-06, + "loss": 2.339, + "step": 7240 + }, + { + "epoch": 0.3884656652360515, + "grad_norm": 1.4199881553649902, + "learning_rate": 3.4994835774018165e-06, + "loss": 2.2901, + "step": 7241 + }, + { + "epoch": 0.388519313304721, + "grad_norm": 1.554677963256836, + "learning_rate": 3.4990853874855842e-06, + "loss": 2.3366, + "step": 7242 + }, + { + "epoch": 0.38857296137339054, + "grad_norm": 1.6515088081359863, + "learning_rate": 3.498687167404201e-06, + "loss": 2.5932, + "step": 7243 + }, + { + "epoch": 0.38862660944206007, + "grad_norm": 1.4661296606063843, + "learning_rate": 3.4982889171696903e-06, + "loss": 1.6514, + "step": 7244 + }, + { + "epoch": 0.3886802575107296, + "grad_norm": 1.488137125968933, + "learning_rate": 3.4978906367940768e-06, + "loss": 1.508, + "step": 7245 + }, + { + "epoch": 0.38873390557939913, + "grad_norm": 1.5268150568008423, + "learning_rate": 3.4974923262893857e-06, + "loss": 2.2312, + "step": 7246 + }, + { + "epoch": 0.38878755364806866, + "grad_norm": 1.6108039617538452, + "learning_rate": 3.497093985667643e-06, + "loss": 2.4126, + "step": 7247 + }, + { + "epoch": 0.3888412017167382, + "grad_norm": 1.3999958038330078, + "learning_rate": 3.4966956149408748e-06, + "loss": 2.1625, + "step": 7248 + }, + { + "epoch": 0.3888948497854077, + "grad_norm": 1.388348937034607, + "learning_rate": 3.4962972141211106e-06, + "loss": 2.4347, + "step": 7249 + }, + { + "epoch": 0.38894849785407726, + "grad_norm": 1.5000842809677124, + "learning_rate": 3.4958987832203774e-06, + "loss": 2.2605, + "step": 7250 + }, + { + "epoch": 0.3890021459227468, + "grad_norm": 1.256534457206726, + "learning_rate": 3.495500322250707e-06, + "loss": 2.1248, + "step": 7251 + }, + { + "epoch": 0.3890557939914163, + "grad_norm": 1.5634568929672241, + "learning_rate": 3.495101831224128e-06, + "loss": 2.2581, + "step": 7252 + }, + { + "epoch": 0.38910944206008585, + "grad_norm": 1.349122166633606, + "learning_rate": 3.4947033101526727e-06, + "loss": 2.0345, + "step": 7253 + }, + { + "epoch": 0.3891630901287554, + "grad_norm": 1.4243550300598145, + "learning_rate": 3.4943047590483755e-06, + "loss": 2.204, + "step": 7254 + }, + { + "epoch": 0.3892167381974249, + "grad_norm": 1.5093629360198975, + "learning_rate": 3.4939061779232663e-06, + "loss": 2.4168, + "step": 7255 + }, + { + "epoch": 0.38927038626609445, + "grad_norm": 1.3291574716567993, + "learning_rate": 3.4935075667893816e-06, + "loss": 2.0442, + "step": 7256 + }, + { + "epoch": 0.3893240343347639, + "grad_norm": 1.3363105058670044, + "learning_rate": 3.493108925658756e-06, + "loss": 2.1932, + "step": 7257 + }, + { + "epoch": 0.38937768240343346, + "grad_norm": 1.3224880695343018, + "learning_rate": 3.4927102545434256e-06, + "loss": 2.576, + "step": 7258 + }, + { + "epoch": 0.389431330472103, + "grad_norm": 1.2582480907440186, + "learning_rate": 3.4923115534554275e-06, + "loss": 2.0481, + "step": 7259 + }, + { + "epoch": 0.3894849785407725, + "grad_norm": 1.524138331413269, + "learning_rate": 3.4919128224067995e-06, + "loss": 2.2017, + "step": 7260 + }, + { + "epoch": 0.38953862660944205, + "grad_norm": 1.4756494760513306, + "learning_rate": 3.4915140614095807e-06, + "loss": 2.1927, + "step": 7261 + }, + { + "epoch": 0.3895922746781116, + "grad_norm": 1.1993285417556763, + "learning_rate": 3.49111527047581e-06, + "loss": 2.2146, + "step": 7262 + }, + { + "epoch": 0.3896459227467811, + "grad_norm": 2.3309826850891113, + "learning_rate": 3.4907164496175283e-06, + "loss": 2.0685, + "step": 7263 + }, + { + "epoch": 0.38969957081545065, + "grad_norm": 1.5239611864089966, + "learning_rate": 3.490317598846778e-06, + "loss": 2.3345, + "step": 7264 + }, + { + "epoch": 0.3897532188841202, + "grad_norm": 1.3662230968475342, + "learning_rate": 3.4899187181756e-06, + "loss": 2.2815, + "step": 7265 + }, + { + "epoch": 0.3898068669527897, + "grad_norm": 1.4243156909942627, + "learning_rate": 3.489519807616039e-06, + "loss": 2.2812, + "step": 7266 + }, + { + "epoch": 0.38986051502145924, + "grad_norm": 4.9857258796691895, + "learning_rate": 3.4891208671801387e-06, + "loss": 2.1517, + "step": 7267 + }, + { + "epoch": 0.3899141630901288, + "grad_norm": 1.3959686756134033, + "learning_rate": 3.4887218968799435e-06, + "loss": 2.1058, + "step": 7268 + }, + { + "epoch": 0.3899678111587983, + "grad_norm": 1.592218279838562, + "learning_rate": 3.4883228967275007e-06, + "loss": 2.0753, + "step": 7269 + }, + { + "epoch": 0.39002145922746784, + "grad_norm": 1.253032922744751, + "learning_rate": 3.487923866734856e-06, + "loss": 2.0366, + "step": 7270 + }, + { + "epoch": 0.3900751072961373, + "grad_norm": 1.5743896961212158, + "learning_rate": 3.4875248069140587e-06, + "loss": 2.2142, + "step": 7271 + }, + { + "epoch": 0.39012875536480685, + "grad_norm": 1.3225570917129517, + "learning_rate": 3.4871257172771556e-06, + "loss": 2.2577, + "step": 7272 + }, + { + "epoch": 0.3901824034334764, + "grad_norm": 1.554538607597351, + "learning_rate": 3.486726597836198e-06, + "loss": 2.2862, + "step": 7273 + }, + { + "epoch": 0.3902360515021459, + "grad_norm": 3.265653371810913, + "learning_rate": 3.4863274486032357e-06, + "loss": 2.3828, + "step": 7274 + }, + { + "epoch": 0.39028969957081544, + "grad_norm": 1.182265043258667, + "learning_rate": 3.4859282695903196e-06, + "loss": 2.2216, + "step": 7275 + }, + { + "epoch": 0.39034334763948497, + "grad_norm": 1.571900486946106, + "learning_rate": 3.4855290608095038e-06, + "loss": 2.1231, + "step": 7276 + }, + { + "epoch": 0.3903969957081545, + "grad_norm": 1.6021851301193237, + "learning_rate": 3.4851298222728393e-06, + "loss": 2.297, + "step": 7277 + }, + { + "epoch": 0.39045064377682404, + "grad_norm": 1.4508349895477295, + "learning_rate": 3.484730553992381e-06, + "loss": 2.492, + "step": 7278 + }, + { + "epoch": 0.39050429184549357, + "grad_norm": 1.3954308032989502, + "learning_rate": 3.4843312559801846e-06, + "loss": 1.1299, + "step": 7279 + }, + { + "epoch": 0.3905579399141631, + "grad_norm": 1.303883671760559, + "learning_rate": 3.483931928248306e-06, + "loss": 2.1559, + "step": 7280 + }, + { + "epoch": 0.39061158798283263, + "grad_norm": 1.4395354986190796, + "learning_rate": 3.483532570808801e-06, + "loss": 2.2512, + "step": 7281 + }, + { + "epoch": 0.39066523605150216, + "grad_norm": 1.2383042573928833, + "learning_rate": 3.4831331836737274e-06, + "loss": 2.4151, + "step": 7282 + }, + { + "epoch": 0.3907188841201717, + "grad_norm": 1.428794503211975, + "learning_rate": 3.4827337668551454e-06, + "loss": 2.4215, + "step": 7283 + }, + { + "epoch": 0.3907725321888412, + "grad_norm": 2.6186208724975586, + "learning_rate": 3.4823343203651124e-06, + "loss": 2.1471, + "step": 7284 + }, + { + "epoch": 0.3908261802575107, + "grad_norm": 1.2708736658096313, + "learning_rate": 3.4819348442156904e-06, + "loss": 2.0617, + "step": 7285 + }, + { + "epoch": 0.39087982832618023, + "grad_norm": 1.4908573627471924, + "learning_rate": 3.4815353384189392e-06, + "loss": 2.2002, + "step": 7286 + }, + { + "epoch": 0.39093347639484977, + "grad_norm": 3.7129218578338623, + "learning_rate": 3.4811358029869226e-06, + "loss": 2.2982, + "step": 7287 + }, + { + "epoch": 0.3909871244635193, + "grad_norm": 1.3782280683517456, + "learning_rate": 3.480736237931703e-06, + "loss": 2.3759, + "step": 7288 + }, + { + "epoch": 0.39104077253218883, + "grad_norm": 1.4626001119613647, + "learning_rate": 3.4803366432653437e-06, + "loss": 2.1363, + "step": 7289 + }, + { + "epoch": 0.39109442060085836, + "grad_norm": 2.7958905696868896, + "learning_rate": 3.4799370189999103e-06, + "loss": 2.303, + "step": 7290 + }, + { + "epoch": 0.3911480686695279, + "grad_norm": 1.4497624635696411, + "learning_rate": 3.4795373651474682e-06, + "loss": 2.1752, + "step": 7291 + }, + { + "epoch": 0.3912017167381974, + "grad_norm": 2.1683902740478516, + "learning_rate": 3.4791376817200852e-06, + "loss": 2.2695, + "step": 7292 + }, + { + "epoch": 0.39125536480686696, + "grad_norm": 1.5292627811431885, + "learning_rate": 3.4787379687298272e-06, + "loss": 2.0207, + "step": 7293 + }, + { + "epoch": 0.3913090128755365, + "grad_norm": 1.548232078552246, + "learning_rate": 3.478338226188764e-06, + "loss": 2.4827, + "step": 7294 + }, + { + "epoch": 0.391362660944206, + "grad_norm": 1.8815807104110718, + "learning_rate": 3.477938454108963e-06, + "loss": 2.2881, + "step": 7295 + }, + { + "epoch": 0.39141630901287555, + "grad_norm": 1.5939007997512817, + "learning_rate": 3.477538652502496e-06, + "loss": 2.3479, + "step": 7296 + }, + { + "epoch": 0.3914699570815451, + "grad_norm": 1.4147379398345947, + "learning_rate": 3.477138821381435e-06, + "loss": 2.1518, + "step": 7297 + }, + { + "epoch": 0.3915236051502146, + "grad_norm": 1.8029663562774658, + "learning_rate": 3.4767389607578505e-06, + "loss": 2.2396, + "step": 7298 + }, + { + "epoch": 0.39157725321888415, + "grad_norm": 1.4443937540054321, + "learning_rate": 3.476339070643815e-06, + "loss": 2.5083, + "step": 7299 + }, + { + "epoch": 0.3916309012875536, + "grad_norm": 1.5967910289764404, + "learning_rate": 3.4759391510514033e-06, + "loss": 2.3777, + "step": 7300 + }, + { + "epoch": 0.39168454935622316, + "grad_norm": 1.6211133003234863, + "learning_rate": 3.47553920199269e-06, + "loss": 1.2919, + "step": 7301 + }, + { + "epoch": 0.3917381974248927, + "grad_norm": 1.2137641906738281, + "learning_rate": 3.4751392234797502e-06, + "loss": 2.0943, + "step": 7302 + }, + { + "epoch": 0.3917918454935622, + "grad_norm": 1.4714586734771729, + "learning_rate": 3.474739215524661e-06, + "loss": 2.3574, + "step": 7303 + }, + { + "epoch": 0.39184549356223175, + "grad_norm": 1.5231130123138428, + "learning_rate": 3.474339178139499e-06, + "loss": 2.3111, + "step": 7304 + }, + { + "epoch": 0.3918991416309013, + "grad_norm": 1.4691166877746582, + "learning_rate": 3.473939111336343e-06, + "loss": 2.3927, + "step": 7305 + }, + { + "epoch": 0.3919527896995708, + "grad_norm": 1.3635526895523071, + "learning_rate": 3.473539015127272e-06, + "loss": 2.2552, + "step": 7306 + }, + { + "epoch": 0.39200643776824035, + "grad_norm": 1.2967535257339478, + "learning_rate": 3.4731388895243657e-06, + "loss": 2.3011, + "step": 7307 + }, + { + "epoch": 0.3920600858369099, + "grad_norm": 1.4698660373687744, + "learning_rate": 3.4727387345397056e-06, + "loss": 2.4466, + "step": 7308 + }, + { + "epoch": 0.3921137339055794, + "grad_norm": 1.3985228538513184, + "learning_rate": 3.472338550185373e-06, + "loss": 2.2651, + "step": 7309 + }, + { + "epoch": 0.39216738197424894, + "grad_norm": 1.4881538152694702, + "learning_rate": 3.4719383364734507e-06, + "loss": 2.3039, + "step": 7310 + }, + { + "epoch": 0.3922210300429185, + "grad_norm": 1.590765357017517, + "learning_rate": 3.471538093416022e-06, + "loss": 2.2636, + "step": 7311 + }, + { + "epoch": 0.392274678111588, + "grad_norm": 1.441636085510254, + "learning_rate": 3.4711378210251717e-06, + "loss": 2.257, + "step": 7312 + }, + { + "epoch": 0.39232832618025754, + "grad_norm": 1.5992588996887207, + "learning_rate": 3.470737519312985e-06, + "loss": 2.3429, + "step": 7313 + }, + { + "epoch": 0.392381974248927, + "grad_norm": 1.486020565032959, + "learning_rate": 3.470337188291548e-06, + "loss": 2.3073, + "step": 7314 + }, + { + "epoch": 0.39243562231759654, + "grad_norm": 1.2677648067474365, + "learning_rate": 3.469936827972949e-06, + "loss": 1.8606, + "step": 7315 + }, + { + "epoch": 0.3924892703862661, + "grad_norm": 1.332586646080017, + "learning_rate": 3.4695364383692744e-06, + "loss": 1.7889, + "step": 7316 + }, + { + "epoch": 0.3925429184549356, + "grad_norm": 1.4693282842636108, + "learning_rate": 3.4691360194926126e-06, + "loss": 2.4753, + "step": 7317 + }, + { + "epoch": 0.39259656652360514, + "grad_norm": 1.2926071882247925, + "learning_rate": 3.4687355713550554e-06, + "loss": 2.3775, + "step": 7318 + }, + { + "epoch": 0.39265021459227467, + "grad_norm": 1.4810293912887573, + "learning_rate": 3.4683350939686915e-06, + "loss": 2.2395, + "step": 7319 + }, + { + "epoch": 0.3927038626609442, + "grad_norm": 1.529138207435608, + "learning_rate": 3.4679345873456145e-06, + "loss": 2.3016, + "step": 7320 + }, + { + "epoch": 0.39275751072961373, + "grad_norm": 1.41195809841156, + "learning_rate": 3.467534051497915e-06, + "loss": 2.4252, + "step": 7321 + }, + { + "epoch": 0.39281115879828327, + "grad_norm": 1.3483872413635254, + "learning_rate": 3.4671334864376873e-06, + "loss": 2.1969, + "step": 7322 + }, + { + "epoch": 0.3928648068669528, + "grad_norm": 1.344193458557129, + "learning_rate": 3.4667328921770245e-06, + "loss": 2.2849, + "step": 7323 + }, + { + "epoch": 0.39291845493562233, + "grad_norm": 1.4456509351730347, + "learning_rate": 3.4663322687280226e-06, + "loss": 2.4312, + "step": 7324 + }, + { + "epoch": 0.39297210300429186, + "grad_norm": 1.5548014640808105, + "learning_rate": 3.4659316161027773e-06, + "loss": 2.4829, + "step": 7325 + }, + { + "epoch": 0.3930257510729614, + "grad_norm": 1.5027053356170654, + "learning_rate": 3.465530934313385e-06, + "loss": 2.2352, + "step": 7326 + }, + { + "epoch": 0.3930793991416309, + "grad_norm": 3.839831829071045, + "learning_rate": 3.465130223371944e-06, + "loss": 2.1163, + "step": 7327 + }, + { + "epoch": 0.3931330472103004, + "grad_norm": 1.6617968082427979, + "learning_rate": 3.464729483290553e-06, + "loss": 2.4682, + "step": 7328 + }, + { + "epoch": 0.39318669527896993, + "grad_norm": 1.496253252029419, + "learning_rate": 3.4643287140813103e-06, + "loss": 2.1059, + "step": 7329 + }, + { + "epoch": 0.39324034334763946, + "grad_norm": 1.4874473810195923, + "learning_rate": 3.4639279157563175e-06, + "loss": 1.3296, + "step": 7330 + }, + { + "epoch": 0.393293991416309, + "grad_norm": 1.3531092405319214, + "learning_rate": 3.463527088327675e-06, + "loss": 2.0903, + "step": 7331 + }, + { + "epoch": 0.39334763948497853, + "grad_norm": 1.355104923248291, + "learning_rate": 3.463126231807486e-06, + "loss": 2.3062, + "step": 7332 + }, + { + "epoch": 0.39340128755364806, + "grad_norm": 1.432108759880066, + "learning_rate": 3.4627253462078523e-06, + "loss": 2.4558, + "step": 7333 + }, + { + "epoch": 0.3934549356223176, + "grad_norm": 1.5918198823928833, + "learning_rate": 3.4623244315408776e-06, + "loss": 2.3895, + "step": 7334 + }, + { + "epoch": 0.3935085836909871, + "grad_norm": 1.4952020645141602, + "learning_rate": 3.4619234878186675e-06, + "loss": 2.322, + "step": 7335 + }, + { + "epoch": 0.39356223175965666, + "grad_norm": 1.8664504289627075, + "learning_rate": 3.4615225150533273e-06, + "loss": 2.0878, + "step": 7336 + }, + { + "epoch": 0.3936158798283262, + "grad_norm": 1.5161851644515991, + "learning_rate": 3.4611215132569637e-06, + "loss": 2.3659, + "step": 7337 + }, + { + "epoch": 0.3936695278969957, + "grad_norm": 1.5376198291778564, + "learning_rate": 3.460720482441683e-06, + "loss": 2.1271, + "step": 7338 + }, + { + "epoch": 0.39372317596566525, + "grad_norm": 1.447488784790039, + "learning_rate": 3.4603194226195945e-06, + "loss": 2.2242, + "step": 7339 + }, + { + "epoch": 0.3937768240343348, + "grad_norm": 1.2295221090316772, + "learning_rate": 3.459918333802807e-06, + "loss": 2.0151, + "step": 7340 + }, + { + "epoch": 0.3938304721030043, + "grad_norm": 1.3935548067092896, + "learning_rate": 3.4595172160034303e-06, + "loss": 2.2307, + "step": 7341 + }, + { + "epoch": 0.39388412017167385, + "grad_norm": 9.271126747131348, + "learning_rate": 3.4591160692335763e-06, + "loss": 2.4594, + "step": 7342 + }, + { + "epoch": 0.3939377682403433, + "grad_norm": 1.426344633102417, + "learning_rate": 3.458714893505355e-06, + "loss": 2.2749, + "step": 7343 + }, + { + "epoch": 0.39399141630901285, + "grad_norm": 1.4990466833114624, + "learning_rate": 3.45831368883088e-06, + "loss": 2.3526, + "step": 7344 + }, + { + "epoch": 0.3940450643776824, + "grad_norm": 1.3017239570617676, + "learning_rate": 3.4579124552222648e-06, + "loss": 2.0441, + "step": 7345 + }, + { + "epoch": 0.3940987124463519, + "grad_norm": 1.6340274810791016, + "learning_rate": 3.457511192691624e-06, + "loss": 2.3608, + "step": 7346 + }, + { + "epoch": 0.39415236051502145, + "grad_norm": 1.5077391862869263, + "learning_rate": 3.4571099012510717e-06, + "loss": 2.223, + "step": 7347 + }, + { + "epoch": 0.394206008583691, + "grad_norm": 1.3759905099868774, + "learning_rate": 3.4567085809127247e-06, + "loss": 2.2412, + "step": 7348 + }, + { + "epoch": 0.3942596566523605, + "grad_norm": 1.531803011894226, + "learning_rate": 3.4563072316887004e-06, + "loss": 2.151, + "step": 7349 + }, + { + "epoch": 0.39431330472103004, + "grad_norm": 1.6430257558822632, + "learning_rate": 3.455905853591116e-06, + "loss": 2.2179, + "step": 7350 + }, + { + "epoch": 0.3943669527896996, + "grad_norm": 1.3189276456832886, + "learning_rate": 3.4555044466320907e-06, + "loss": 2.2175, + "step": 7351 + }, + { + "epoch": 0.3944206008583691, + "grad_norm": 1.5562074184417725, + "learning_rate": 3.4551030108237436e-06, + "loss": 2.2703, + "step": 7352 + }, + { + "epoch": 0.39447424892703864, + "grad_norm": 1.5531412363052368, + "learning_rate": 3.454701546178195e-06, + "loss": 2.24, + "step": 7353 + }, + { + "epoch": 0.39452789699570817, + "grad_norm": 2.4246249198913574, + "learning_rate": 3.4543000527075676e-06, + "loss": 2.4466, + "step": 7354 + }, + { + "epoch": 0.3945815450643777, + "grad_norm": 1.5119608640670776, + "learning_rate": 3.4538985304239824e-06, + "loss": 2.081, + "step": 7355 + }, + { + "epoch": 0.39463519313304724, + "grad_norm": 1.3951529264450073, + "learning_rate": 3.453496979339563e-06, + "loss": 2.3607, + "step": 7356 + }, + { + "epoch": 0.3946888412017167, + "grad_norm": 1.4397295713424683, + "learning_rate": 3.4530953994664318e-06, + "loss": 2.468, + "step": 7357 + }, + { + "epoch": 0.39474248927038624, + "grad_norm": 1.3314844369888306, + "learning_rate": 3.4526937908167152e-06, + "loss": 2.2991, + "step": 7358 + }, + { + "epoch": 0.3947961373390558, + "grad_norm": 1.6063385009765625, + "learning_rate": 3.452292153402539e-06, + "loss": 2.352, + "step": 7359 + }, + { + "epoch": 0.3948497854077253, + "grad_norm": 1.3501405715942383, + "learning_rate": 3.4518904872360294e-06, + "loss": 1.5004, + "step": 7360 + }, + { + "epoch": 0.39490343347639484, + "grad_norm": 1.5808645486831665, + "learning_rate": 3.451488792329313e-06, + "loss": 2.3556, + "step": 7361 + }, + { + "epoch": 0.39495708154506437, + "grad_norm": 1.4042527675628662, + "learning_rate": 3.4510870686945196e-06, + "loss": 2.1705, + "step": 7362 + }, + { + "epoch": 0.3950107296137339, + "grad_norm": 3.3535189628601074, + "learning_rate": 3.4506853163437763e-06, + "loss": 2.0799, + "step": 7363 + }, + { + "epoch": 0.39506437768240343, + "grad_norm": 2.187772035598755, + "learning_rate": 3.4502835352892155e-06, + "loss": 2.1775, + "step": 7364 + }, + { + "epoch": 0.39511802575107297, + "grad_norm": 5.594254493713379, + "learning_rate": 3.449881725542966e-06, + "loss": 2.498, + "step": 7365 + }, + { + "epoch": 0.3951716738197425, + "grad_norm": 1.5532408952713013, + "learning_rate": 3.449479887117161e-06, + "loss": 2.1131, + "step": 7366 + }, + { + "epoch": 0.39522532188841203, + "grad_norm": 1.3053195476531982, + "learning_rate": 3.4490780200239325e-06, + "loss": 2.2678, + "step": 7367 + }, + { + "epoch": 0.39527896995708156, + "grad_norm": 1.3898929357528687, + "learning_rate": 3.448676124275414e-06, + "loss": 2.3206, + "step": 7368 + }, + { + "epoch": 0.3953326180257511, + "grad_norm": 1.4463444948196411, + "learning_rate": 3.4482741998837393e-06, + "loss": 2.4136, + "step": 7369 + }, + { + "epoch": 0.3953862660944206, + "grad_norm": 1.3392775058746338, + "learning_rate": 3.447872246861045e-06, + "loss": 2.2684, + "step": 7370 + }, + { + "epoch": 0.39543991416309016, + "grad_norm": 1.6284980773925781, + "learning_rate": 3.447470265219466e-06, + "loss": 1.6364, + "step": 7371 + }, + { + "epoch": 0.39549356223175963, + "grad_norm": 1.4342169761657715, + "learning_rate": 3.447068254971139e-06, + "loss": 2.4039, + "step": 7372 + }, + { + "epoch": 0.39554721030042916, + "grad_norm": 1.5769048929214478, + "learning_rate": 3.4466662161282033e-06, + "loss": 2.5615, + "step": 7373 + }, + { + "epoch": 0.3956008583690987, + "grad_norm": 1.2215602397918701, + "learning_rate": 3.4462641487027963e-06, + "loss": 2.2283, + "step": 7374 + }, + { + "epoch": 0.3956545064377682, + "grad_norm": 1.3421862125396729, + "learning_rate": 3.445862052707058e-06, + "loss": 2.4942, + "step": 7375 + }, + { + "epoch": 0.39570815450643776, + "grad_norm": 1.596993088722229, + "learning_rate": 3.4454599281531285e-06, + "loss": 2.4582, + "step": 7376 + }, + { + "epoch": 0.3957618025751073, + "grad_norm": 1.7352863550186157, + "learning_rate": 3.44505777505315e-06, + "loss": 1.9542, + "step": 7377 + }, + { + "epoch": 0.3958154506437768, + "grad_norm": 1.6325647830963135, + "learning_rate": 3.4446555934192627e-06, + "loss": 2.187, + "step": 7378 + }, + { + "epoch": 0.39586909871244635, + "grad_norm": 1.735583782196045, + "learning_rate": 3.4442533832636104e-06, + "loss": 2.1249, + "step": 7379 + }, + { + "epoch": 0.3959227467811159, + "grad_norm": 1.5095182657241821, + "learning_rate": 3.4438511445983386e-06, + "loss": 2.2085, + "step": 7380 + }, + { + "epoch": 0.3959763948497854, + "grad_norm": 1.5327881574630737, + "learning_rate": 3.44344887743559e-06, + "loss": 2.2829, + "step": 7381 + }, + { + "epoch": 0.39603004291845495, + "grad_norm": 1.359908938407898, + "learning_rate": 3.4430465817875113e-06, + "loss": 2.2368, + "step": 7382 + }, + { + "epoch": 0.3960836909871245, + "grad_norm": 1.4082549810409546, + "learning_rate": 3.442644257666248e-06, + "loss": 2.3483, + "step": 7383 + }, + { + "epoch": 0.396137339055794, + "grad_norm": 1.375734567642212, + "learning_rate": 3.442241905083948e-06, + "loss": 2.2401, + "step": 7384 + }, + { + "epoch": 0.39619098712446355, + "grad_norm": 1.3262296915054321, + "learning_rate": 3.441839524052759e-06, + "loss": 2.053, + "step": 7385 + }, + { + "epoch": 0.396244635193133, + "grad_norm": 1.6107803583145142, + "learning_rate": 3.44143711458483e-06, + "loss": 2.1631, + "step": 7386 + }, + { + "epoch": 0.39629828326180255, + "grad_norm": 1.3062028884887695, + "learning_rate": 3.4410346766923115e-06, + "loss": 2.3476, + "step": 7387 + }, + { + "epoch": 0.3963519313304721, + "grad_norm": 1.3485711812973022, + "learning_rate": 3.4406322103873536e-06, + "loss": 2.2416, + "step": 7388 + }, + { + "epoch": 0.3964055793991416, + "grad_norm": 2.5527873039245605, + "learning_rate": 3.4402297156821084e-06, + "loss": 2.0929, + "step": 7389 + }, + { + "epoch": 0.39645922746781115, + "grad_norm": 3.13289737701416, + "learning_rate": 3.4398271925887273e-06, + "loss": 2.2625, + "step": 7390 + }, + { + "epoch": 0.3965128755364807, + "grad_norm": 1.4152448177337646, + "learning_rate": 3.439424641119365e-06, + "loss": 2.2119, + "step": 7391 + }, + { + "epoch": 0.3965665236051502, + "grad_norm": 1.6828807592391968, + "learning_rate": 3.4390220612861746e-06, + "loss": 2.3819, + "step": 7392 + }, + { + "epoch": 0.39662017167381974, + "grad_norm": 1.5212030410766602, + "learning_rate": 3.4386194531013116e-06, + "loss": 2.1818, + "step": 7393 + }, + { + "epoch": 0.3966738197424893, + "grad_norm": 1.529909610748291, + "learning_rate": 3.4382168165769315e-06, + "loss": 2.3011, + "step": 7394 + }, + { + "epoch": 0.3967274678111588, + "grad_norm": 1.5092853307724, + "learning_rate": 3.4378141517251916e-06, + "loss": 2.4296, + "step": 7395 + }, + { + "epoch": 0.39678111587982834, + "grad_norm": 3.7360005378723145, + "learning_rate": 3.4374114585582487e-06, + "loss": 2.1996, + "step": 7396 + }, + { + "epoch": 0.39683476394849787, + "grad_norm": 1.490601658821106, + "learning_rate": 3.4370087370882614e-06, + "loss": 2.1051, + "step": 7397 + }, + { + "epoch": 0.3968884120171674, + "grad_norm": 1.4878822565078735, + "learning_rate": 3.4366059873273893e-06, + "loss": 2.2666, + "step": 7398 + }, + { + "epoch": 0.39694206008583693, + "grad_norm": 1.2487239837646484, + "learning_rate": 3.4362032092877933e-06, + "loss": 2.4256, + "step": 7399 + }, + { + "epoch": 0.3969957081545064, + "grad_norm": 1.3302043676376343, + "learning_rate": 3.4358004029816326e-06, + "loss": 1.6087, + "step": 7400 + }, + { + "epoch": 0.39704935622317594, + "grad_norm": 2.5361077785491943, + "learning_rate": 3.43539756842107e-06, + "loss": 2.4354, + "step": 7401 + }, + { + "epoch": 0.3971030042918455, + "grad_norm": 2.808790922164917, + "learning_rate": 3.4349947056182675e-06, + "loss": 1.9906, + "step": 7402 + }, + { + "epoch": 0.397156652360515, + "grad_norm": 1.4232245683670044, + "learning_rate": 3.4345918145853894e-06, + "loss": 1.883, + "step": 7403 + }, + { + "epoch": 0.39721030042918454, + "grad_norm": 1.3380852937698364, + "learning_rate": 3.4341888953346015e-06, + "loss": 2.4043, + "step": 7404 + }, + { + "epoch": 0.39726394849785407, + "grad_norm": 1.4475769996643066, + "learning_rate": 3.433785947878066e-06, + "loss": 2.3098, + "step": 7405 + }, + { + "epoch": 0.3973175965665236, + "grad_norm": 1.607069492340088, + "learning_rate": 3.433382972227951e-06, + "loss": 2.534, + "step": 7406 + }, + { + "epoch": 0.39737124463519313, + "grad_norm": 1.307146430015564, + "learning_rate": 3.4329799683964226e-06, + "loss": 2.3063, + "step": 7407 + }, + { + "epoch": 0.39742489270386266, + "grad_norm": 1.4113075733184814, + "learning_rate": 3.4325769363956484e-06, + "loss": 2.1433, + "step": 7408 + }, + { + "epoch": 0.3974785407725322, + "grad_norm": 1.3985209465026855, + "learning_rate": 3.4321738762377986e-06, + "loss": 2.031, + "step": 7409 + }, + { + "epoch": 0.39753218884120173, + "grad_norm": 1.2658993005752563, + "learning_rate": 3.4317707879350413e-06, + "loss": 1.8225, + "step": 7410 + }, + { + "epoch": 0.39758583690987126, + "grad_norm": 1.510581374168396, + "learning_rate": 3.431367671499547e-06, + "loss": 2.3328, + "step": 7411 + }, + { + "epoch": 0.3976394849785408, + "grad_norm": 1.2582894563674927, + "learning_rate": 3.4309645269434877e-06, + "loss": 1.9172, + "step": 7412 + }, + { + "epoch": 0.3976931330472103, + "grad_norm": 1.4866788387298584, + "learning_rate": 3.430561354279034e-06, + "loss": 2.3411, + "step": 7413 + }, + { + "epoch": 0.39774678111587985, + "grad_norm": 1.5098133087158203, + "learning_rate": 3.4301581535183604e-06, + "loss": 2.2161, + "step": 7414 + }, + { + "epoch": 0.39780042918454933, + "grad_norm": 1.553486704826355, + "learning_rate": 3.4297549246736395e-06, + "loss": 2.3822, + "step": 7415 + }, + { + "epoch": 0.39785407725321886, + "grad_norm": 1.484671950340271, + "learning_rate": 3.4293516677570465e-06, + "loss": 2.0605, + "step": 7416 + }, + { + "epoch": 0.3979077253218884, + "grad_norm": 1.3867870569229126, + "learning_rate": 3.4289483827807576e-06, + "loss": 2.1675, + "step": 7417 + }, + { + "epoch": 0.3979613733905579, + "grad_norm": 1.152915596961975, + "learning_rate": 3.4285450697569465e-06, + "loss": 2.1363, + "step": 7418 + }, + { + "epoch": 0.39801502145922746, + "grad_norm": 1.4852628707885742, + "learning_rate": 3.4281417286977925e-06, + "loss": 2.317, + "step": 7419 + }, + { + "epoch": 0.398068669527897, + "grad_norm": 1.347629427909851, + "learning_rate": 3.4277383596154733e-06, + "loss": 2.105, + "step": 7420 + }, + { + "epoch": 0.3981223175965665, + "grad_norm": 1.7537912130355835, + "learning_rate": 3.427334962522168e-06, + "loss": 2.0554, + "step": 7421 + }, + { + "epoch": 0.39817596566523605, + "grad_norm": 1.1652344465255737, + "learning_rate": 3.426931537430055e-06, + "loss": 2.1169, + "step": 7422 + }, + { + "epoch": 0.3982296137339056, + "grad_norm": 1.3884693384170532, + "learning_rate": 3.426528084351316e-06, + "loss": 2.274, + "step": 7423 + }, + { + "epoch": 0.3982832618025751, + "grad_norm": 1.4728822708129883, + "learning_rate": 3.4261246032981316e-06, + "loss": 2.3763, + "step": 7424 + }, + { + "epoch": 0.39833690987124465, + "grad_norm": 1.677062749862671, + "learning_rate": 3.4257210942826834e-06, + "loss": 2.3619, + "step": 7425 + }, + { + "epoch": 0.3983905579399142, + "grad_norm": 3.6332273483276367, + "learning_rate": 3.425317557317157e-06, + "loss": 1.7408, + "step": 7426 + }, + { + "epoch": 0.3984442060085837, + "grad_norm": 1.4751925468444824, + "learning_rate": 3.424913992413734e-06, + "loss": 2.0613, + "step": 7427 + }, + { + "epoch": 0.39849785407725324, + "grad_norm": 1.5289349555969238, + "learning_rate": 3.4245103995846007e-06, + "loss": 2.2094, + "step": 7428 + }, + { + "epoch": 0.3985515021459227, + "grad_norm": 1.27696692943573, + "learning_rate": 3.424106778841941e-06, + "loss": 2.1362, + "step": 7429 + }, + { + "epoch": 0.39860515021459225, + "grad_norm": 1.4952375888824463, + "learning_rate": 3.4237031301979423e-06, + "loss": 2.1734, + "step": 7430 + }, + { + "epoch": 0.3986587982832618, + "grad_norm": 1.5706937313079834, + "learning_rate": 3.4232994536647916e-06, + "loss": 2.5475, + "step": 7431 + }, + { + "epoch": 0.3987124463519313, + "grad_norm": 1.3397984504699707, + "learning_rate": 3.422895749254677e-06, + "loss": 2.3701, + "step": 7432 + }, + { + "epoch": 0.39876609442060085, + "grad_norm": 1.4047380685806274, + "learning_rate": 3.422492016979788e-06, + "loss": 2.2498, + "step": 7433 + }, + { + "epoch": 0.3988197424892704, + "grad_norm": 1.2038965225219727, + "learning_rate": 3.4220882568523138e-06, + "loss": 1.7256, + "step": 7434 + }, + { + "epoch": 0.3988733905579399, + "grad_norm": 1.409570336341858, + "learning_rate": 3.4216844688844453e-06, + "loss": 2.096, + "step": 7435 + }, + { + "epoch": 0.39892703862660944, + "grad_norm": 2.046546220779419, + "learning_rate": 3.421280653088374e-06, + "loss": 2.2333, + "step": 7436 + }, + { + "epoch": 0.398980686695279, + "grad_norm": 1.4572309255599976, + "learning_rate": 3.420876809476292e-06, + "loss": 2.1674, + "step": 7437 + }, + { + "epoch": 0.3990343347639485, + "grad_norm": 2.4748358726501465, + "learning_rate": 3.420472938060392e-06, + "loss": 2.296, + "step": 7438 + }, + { + "epoch": 0.39908798283261804, + "grad_norm": 1.4636505842208862, + "learning_rate": 3.42006903885287e-06, + "loss": 2.4913, + "step": 7439 + }, + { + "epoch": 0.39914163090128757, + "grad_norm": 1.2301338911056519, + "learning_rate": 3.4196651118659186e-06, + "loss": 2.151, + "step": 7440 + }, + { + "epoch": 0.3991952789699571, + "grad_norm": 1.458539366722107, + "learning_rate": 3.4192611571117334e-06, + "loss": 2.2372, + "step": 7441 + }, + { + "epoch": 0.39924892703862663, + "grad_norm": 1.2747623920440674, + "learning_rate": 3.418857174602512e-06, + "loss": 2.2278, + "step": 7442 + }, + { + "epoch": 0.39930257510729616, + "grad_norm": 1.5987961292266846, + "learning_rate": 3.4184531643504526e-06, + "loss": 2.4647, + "step": 7443 + }, + { + "epoch": 0.39935622317596564, + "grad_norm": 1.1756842136383057, + "learning_rate": 3.418049126367752e-06, + "loss": 2.0855, + "step": 7444 + }, + { + "epoch": 0.3994098712446352, + "grad_norm": 1.3360310792922974, + "learning_rate": 3.417645060666609e-06, + "loss": 2.2232, + "step": 7445 + }, + { + "epoch": 0.3994635193133047, + "grad_norm": 1.211004614830017, + "learning_rate": 3.4172409672592244e-06, + "loss": 2.0577, + "step": 7446 + }, + { + "epoch": 0.39951716738197424, + "grad_norm": 1.227432370185852, + "learning_rate": 3.4168368461577977e-06, + "loss": 2.0818, + "step": 7447 + }, + { + "epoch": 0.39957081545064377, + "grad_norm": 1.5853517055511475, + "learning_rate": 3.4164326973745325e-06, + "loss": 2.0604, + "step": 7448 + }, + { + "epoch": 0.3996244635193133, + "grad_norm": 1.4175564050674438, + "learning_rate": 3.416028520921629e-06, + "loss": 2.3429, + "step": 7449 + }, + { + "epoch": 0.39967811158798283, + "grad_norm": 1.5144730806350708, + "learning_rate": 3.4156243168112916e-06, + "loss": 2.3601, + "step": 7450 + }, + { + "epoch": 0.39973175965665236, + "grad_norm": 1.4915977716445923, + "learning_rate": 3.4152200850557244e-06, + "loss": 2.4009, + "step": 7451 + }, + { + "epoch": 0.3997854077253219, + "grad_norm": 1.5717787742614746, + "learning_rate": 3.4148158256671316e-06, + "loss": 2.4007, + "step": 7452 + }, + { + "epoch": 0.3998390557939914, + "grad_norm": 1.5570484399795532, + "learning_rate": 3.4144115386577185e-06, + "loss": 2.1811, + "step": 7453 + }, + { + "epoch": 0.39989270386266096, + "grad_norm": 1.4607923030853271, + "learning_rate": 3.414007224039693e-06, + "loss": 2.0619, + "step": 7454 + }, + { + "epoch": 0.3999463519313305, + "grad_norm": 1.3961716890335083, + "learning_rate": 3.413602881825262e-06, + "loss": 2.5495, + "step": 7455 + }, + { + "epoch": 0.4, + "grad_norm": 1.533799409866333, + "learning_rate": 3.4131985120266336e-06, + "loss": 2.0894, + "step": 7456 + }, + { + "epoch": 0.40005364806866955, + "grad_norm": 1.5276317596435547, + "learning_rate": 3.4127941146560175e-06, + "loss": 2.4487, + "step": 7457 + }, + { + "epoch": 0.40010729613733903, + "grad_norm": 1.4449228048324585, + "learning_rate": 3.412389689725621e-06, + "loss": 2.1612, + "step": 7458 + }, + { + "epoch": 0.40016094420600856, + "grad_norm": 1.5516074895858765, + "learning_rate": 3.4119852372476574e-06, + "loss": 2.3363, + "step": 7459 + }, + { + "epoch": 0.4002145922746781, + "grad_norm": 1.56174898147583, + "learning_rate": 3.411580757234338e-06, + "loss": 2.6522, + "step": 7460 + }, + { + "epoch": 0.4002682403433476, + "grad_norm": 1.8166316747665405, + "learning_rate": 3.4111762496978753e-06, + "loss": 2.26, + "step": 7461 + }, + { + "epoch": 0.40032188841201716, + "grad_norm": 1.4163415431976318, + "learning_rate": 3.4107717146504803e-06, + "loss": 2.1689, + "step": 7462 + }, + { + "epoch": 0.4003755364806867, + "grad_norm": 1.4427788257598877, + "learning_rate": 3.4103671521043696e-06, + "loss": 1.9962, + "step": 7463 + }, + { + "epoch": 0.4004291845493562, + "grad_norm": 1.3007259368896484, + "learning_rate": 3.4099625620717563e-06, + "loss": 2.422, + "step": 7464 + }, + { + "epoch": 0.40048283261802575, + "grad_norm": 1.2962175607681274, + "learning_rate": 3.409557944564858e-06, + "loss": 2.3569, + "step": 7465 + }, + { + "epoch": 0.4005364806866953, + "grad_norm": 1.464599847793579, + "learning_rate": 3.4091532995958894e-06, + "loss": 2.354, + "step": 7466 + }, + { + "epoch": 0.4005901287553648, + "grad_norm": 1.4053866863250732, + "learning_rate": 3.408748627177069e-06, + "loss": 2.2893, + "step": 7467 + }, + { + "epoch": 0.40064377682403435, + "grad_norm": 1.105279803276062, + "learning_rate": 3.4083439273206137e-06, + "loss": 1.7354, + "step": 7468 + }, + { + "epoch": 0.4006974248927039, + "grad_norm": 1.471537470817566, + "learning_rate": 3.4079392000387436e-06, + "loss": 2.2226, + "step": 7469 + }, + { + "epoch": 0.4007510729613734, + "grad_norm": 1.4939988851547241, + "learning_rate": 3.407534445343679e-06, + "loss": 2.1306, + "step": 7470 + }, + { + "epoch": 0.40080472103004294, + "grad_norm": 1.616015076637268, + "learning_rate": 3.407129663247639e-06, + "loss": 2.2524, + "step": 7471 + }, + { + "epoch": 0.4008583690987124, + "grad_norm": 1.290217399597168, + "learning_rate": 3.406724853762846e-06, + "loss": 2.382, + "step": 7472 + }, + { + "epoch": 0.40091201716738195, + "grad_norm": 1.5881755352020264, + "learning_rate": 3.4063200169015227e-06, + "loss": 2.406, + "step": 7473 + }, + { + "epoch": 0.4009656652360515, + "grad_norm": 1.1372253894805908, + "learning_rate": 3.4059151526758914e-06, + "loss": 1.936, + "step": 7474 + }, + { + "epoch": 0.401019313304721, + "grad_norm": 1.8968067169189453, + "learning_rate": 3.4055102610981767e-06, + "loss": 2.2963, + "step": 7475 + }, + { + "epoch": 0.40107296137339055, + "grad_norm": 1.5871440172195435, + "learning_rate": 3.405105342180603e-06, + "loss": 2.3009, + "step": 7476 + }, + { + "epoch": 0.4011266094420601, + "grad_norm": 1.1983741521835327, + "learning_rate": 3.404700395935396e-06, + "loss": 2.4279, + "step": 7477 + }, + { + "epoch": 0.4011802575107296, + "grad_norm": 1.5429797172546387, + "learning_rate": 3.404295422374782e-06, + "loss": 2.4014, + "step": 7478 + }, + { + "epoch": 0.40123390557939914, + "grad_norm": 1.3577243089675903, + "learning_rate": 3.4038904215109896e-06, + "loss": 2.1768, + "step": 7479 + }, + { + "epoch": 0.4012875536480687, + "grad_norm": 1.332977056503296, + "learning_rate": 3.4034853933562446e-06, + "loss": 2.1774, + "step": 7480 + }, + { + "epoch": 0.4013412017167382, + "grad_norm": 1.4717929363250732, + "learning_rate": 3.4030803379227774e-06, + "loss": 2.2646, + "step": 7481 + }, + { + "epoch": 0.40139484978540774, + "grad_norm": 1.477283239364624, + "learning_rate": 3.4026752552228177e-06, + "loss": 2.3591, + "step": 7482 + }, + { + "epoch": 0.40144849785407727, + "grad_norm": 1.546702265739441, + "learning_rate": 3.402270145268596e-06, + "loss": 2.2853, + "step": 7483 + }, + { + "epoch": 0.4015021459227468, + "grad_norm": 1.509535551071167, + "learning_rate": 3.401865008072343e-06, + "loss": 2.1788, + "step": 7484 + }, + { + "epoch": 0.40155579399141633, + "grad_norm": 1.5168423652648926, + "learning_rate": 3.401459843646292e-06, + "loss": 2.1981, + "step": 7485 + }, + { + "epoch": 0.40160944206008586, + "grad_norm": 1.4292051792144775, + "learning_rate": 3.401054652002675e-06, + "loss": 1.8678, + "step": 7486 + }, + { + "epoch": 0.40166309012875534, + "grad_norm": 1.4719046354293823, + "learning_rate": 3.4006494331537252e-06, + "loss": 2.2016, + "step": 7487 + }, + { + "epoch": 0.40171673819742487, + "grad_norm": 1.6790062189102173, + "learning_rate": 3.4002441871116804e-06, + "loss": 2.2358, + "step": 7488 + }, + { + "epoch": 0.4017703862660944, + "grad_norm": 1.405478835105896, + "learning_rate": 3.399838913888773e-06, + "loss": 2.1205, + "step": 7489 + }, + { + "epoch": 0.40182403433476394, + "grad_norm": 1.6698098182678223, + "learning_rate": 3.3994336134972412e-06, + "loss": 2.0567, + "step": 7490 + }, + { + "epoch": 0.40187768240343347, + "grad_norm": 1.4686812162399292, + "learning_rate": 3.3990282859493206e-06, + "loss": 2.2226, + "step": 7491 + }, + { + "epoch": 0.401931330472103, + "grad_norm": 1.5111911296844482, + "learning_rate": 3.39862293125725e-06, + "loss": 2.5223, + "step": 7492 + }, + { + "epoch": 0.40198497854077253, + "grad_norm": 4.578327655792236, + "learning_rate": 3.398217549433268e-06, + "loss": 2.2462, + "step": 7493 + }, + { + "epoch": 0.40203862660944206, + "grad_norm": 1.5742160081863403, + "learning_rate": 3.3978121404896146e-06, + "loss": 2.3701, + "step": 7494 + }, + { + "epoch": 0.4020922746781116, + "grad_norm": 1.4247606992721558, + "learning_rate": 3.39740670443853e-06, + "loss": 2.4602, + "step": 7495 + }, + { + "epoch": 0.4021459227467811, + "grad_norm": 1.529578447341919, + "learning_rate": 3.3970012412922556e-06, + "loss": 2.1743, + "step": 7496 + }, + { + "epoch": 0.40219957081545066, + "grad_norm": 1.650627613067627, + "learning_rate": 3.396595751063032e-06, + "loss": 1.6569, + "step": 7497 + }, + { + "epoch": 0.4022532188841202, + "grad_norm": 1.386601209640503, + "learning_rate": 3.396190233763104e-06, + "loss": 2.1663, + "step": 7498 + }, + { + "epoch": 0.4023068669527897, + "grad_norm": 1.3780403137207031, + "learning_rate": 3.395784689404714e-06, + "loss": 2.154, + "step": 7499 + }, + { + "epoch": 0.40236051502145925, + "grad_norm": 1.818891167640686, + "learning_rate": 3.3953791180001084e-06, + "loss": 2.3669, + "step": 7500 + }, + { + "epoch": 0.40241416309012873, + "grad_norm": 1.827765703201294, + "learning_rate": 3.3949735195615306e-06, + "loss": 2.3535, + "step": 7501 + }, + { + "epoch": 0.40246781115879826, + "grad_norm": 1.3586678504943848, + "learning_rate": 3.394567894101227e-06, + "loss": 2.1786, + "step": 7502 + }, + { + "epoch": 0.4025214592274678, + "grad_norm": 1.4847569465637207, + "learning_rate": 3.3941622416314446e-06, + "loss": 2.0661, + "step": 7503 + }, + { + "epoch": 0.4025751072961373, + "grad_norm": 1.4962503910064697, + "learning_rate": 3.3937565621644313e-06, + "loss": 2.1146, + "step": 7504 + }, + { + "epoch": 0.40262875536480686, + "grad_norm": 1.504928708076477, + "learning_rate": 3.3933508557124374e-06, + "loss": 2.5062, + "step": 7505 + }, + { + "epoch": 0.4026824034334764, + "grad_norm": 2.125558853149414, + "learning_rate": 3.3929451222877093e-06, + "loss": 2.0391, + "step": 7506 + }, + { + "epoch": 0.4027360515021459, + "grad_norm": 1.6432536840438843, + "learning_rate": 3.392539361902499e-06, + "loss": 2.2809, + "step": 7507 + }, + { + "epoch": 0.40278969957081545, + "grad_norm": 1.2858083248138428, + "learning_rate": 3.392133574569057e-06, + "loss": 1.9944, + "step": 7508 + }, + { + "epoch": 0.402843347639485, + "grad_norm": 1.3511453866958618, + "learning_rate": 3.3917277602996345e-06, + "loss": 2.485, + "step": 7509 + }, + { + "epoch": 0.4028969957081545, + "grad_norm": 1.3478076457977295, + "learning_rate": 3.3913219191064865e-06, + "loss": 2.083, + "step": 7510 + }, + { + "epoch": 0.40295064377682405, + "grad_norm": 1.4356874227523804, + "learning_rate": 3.3909160510018646e-06, + "loss": 2.3533, + "step": 7511 + }, + { + "epoch": 0.4030042918454936, + "grad_norm": 1.6104116439819336, + "learning_rate": 3.390510155998023e-06, + "loss": 2.1983, + "step": 7512 + }, + { + "epoch": 0.4030579399141631, + "grad_norm": 1.0795238018035889, + "learning_rate": 3.3901042341072177e-06, + "loss": 1.9892, + "step": 7513 + }, + { + "epoch": 0.40311158798283264, + "grad_norm": 1.231153130531311, + "learning_rate": 3.3896982853417036e-06, + "loss": 2.2433, + "step": 7514 + }, + { + "epoch": 0.4031652360515021, + "grad_norm": 1.5930166244506836, + "learning_rate": 3.389292309713738e-06, + "loss": 2.425, + "step": 7515 + }, + { + "epoch": 0.40321888412017165, + "grad_norm": 1.5709879398345947, + "learning_rate": 3.3888863072355783e-06, + "loss": 2.2685, + "step": 7516 + }, + { + "epoch": 0.4032725321888412, + "grad_norm": 1.2763164043426514, + "learning_rate": 3.3884802779194832e-06, + "loss": 2.5411, + "step": 7517 + }, + { + "epoch": 0.4033261802575107, + "grad_norm": 2.128333330154419, + "learning_rate": 3.388074221777712e-06, + "loss": 1.3069, + "step": 7518 + }, + { + "epoch": 0.40337982832618025, + "grad_norm": 1.97826087474823, + "learning_rate": 3.3876681388225236e-06, + "loss": 2.4523, + "step": 7519 + }, + { + "epoch": 0.4034334763948498, + "grad_norm": 1.7515946626663208, + "learning_rate": 3.3872620290661782e-06, + "loss": 2.2728, + "step": 7520 + }, + { + "epoch": 0.4034871244635193, + "grad_norm": 1.4875638484954834, + "learning_rate": 3.38685589252094e-06, + "loss": 2.2886, + "step": 7521 + }, + { + "epoch": 0.40354077253218884, + "grad_norm": 1.4930124282836914, + "learning_rate": 3.3864497291990694e-06, + "loss": 2.306, + "step": 7522 + }, + { + "epoch": 0.4035944206008584, + "grad_norm": 1.3744115829467773, + "learning_rate": 3.386043539112831e-06, + "loss": 2.2947, + "step": 7523 + }, + { + "epoch": 0.4036480686695279, + "grad_norm": 2.2550251483917236, + "learning_rate": 3.385637322274487e-06, + "loss": 2.3288, + "step": 7524 + }, + { + "epoch": 0.40370171673819744, + "grad_norm": 1.306441307067871, + "learning_rate": 3.385231078696304e-06, + "loss": 2.338, + "step": 7525 + }, + { + "epoch": 0.40375536480686697, + "grad_norm": 2.4246315956115723, + "learning_rate": 3.384824808390545e-06, + "loss": 2.2098, + "step": 7526 + }, + { + "epoch": 0.4038090128755365, + "grad_norm": 1.5084997415542603, + "learning_rate": 3.38441851136948e-06, + "loss": 2.1542, + "step": 7527 + }, + { + "epoch": 0.40386266094420603, + "grad_norm": 1.4134807586669922, + "learning_rate": 3.3840121876453734e-06, + "loss": 2.3378, + "step": 7528 + }, + { + "epoch": 0.40391630901287556, + "grad_norm": 1.4803353548049927, + "learning_rate": 3.3836058372304946e-06, + "loss": 2.3679, + "step": 7529 + }, + { + "epoch": 0.40396995708154504, + "grad_norm": 1.522688627243042, + "learning_rate": 3.383199460137112e-06, + "loss": 2.1995, + "step": 7530 + }, + { + "epoch": 0.40402360515021457, + "grad_norm": 1.442228078842163, + "learning_rate": 3.3827930563774953e-06, + "loss": 2.1904, + "step": 7531 + }, + { + "epoch": 0.4040772532188841, + "grad_norm": 1.3704488277435303, + "learning_rate": 3.3823866259639147e-06, + "loss": 2.2515, + "step": 7532 + }, + { + "epoch": 0.40413090128755363, + "grad_norm": 1.6657888889312744, + "learning_rate": 3.3819801689086423e-06, + "loss": 2.2476, + "step": 7533 + }, + { + "epoch": 0.40418454935622317, + "grad_norm": 1.3897005319595337, + "learning_rate": 3.3815736852239493e-06, + "loss": 2.3177, + "step": 7534 + }, + { + "epoch": 0.4042381974248927, + "grad_norm": 1.48407781124115, + "learning_rate": 3.3811671749221085e-06, + "loss": 2.0496, + "step": 7535 + }, + { + "epoch": 0.40429184549356223, + "grad_norm": 1.4731385707855225, + "learning_rate": 3.3807606380153944e-06, + "loss": 2.1636, + "step": 7536 + }, + { + "epoch": 0.40434549356223176, + "grad_norm": 1.4066170454025269, + "learning_rate": 3.3803540745160814e-06, + "loss": 2.0581, + "step": 7537 + }, + { + "epoch": 0.4043991416309013, + "grad_norm": 1.328715205192566, + "learning_rate": 3.379947484436444e-06, + "loss": 2.0066, + "step": 7538 + }, + { + "epoch": 0.4044527896995708, + "grad_norm": 1.4667611122131348, + "learning_rate": 3.3795408677887585e-06, + "loss": 1.6995, + "step": 7539 + }, + { + "epoch": 0.40450643776824036, + "grad_norm": 1.391870379447937, + "learning_rate": 3.379134224585302e-06, + "loss": 2.2674, + "step": 7540 + }, + { + "epoch": 0.4045600858369099, + "grad_norm": 1.5465041399002075, + "learning_rate": 3.3787275548383523e-06, + "loss": 2.17, + "step": 7541 + }, + { + "epoch": 0.4046137339055794, + "grad_norm": 3.9669153690338135, + "learning_rate": 3.3783208585601878e-06, + "loss": 2.1799, + "step": 7542 + }, + { + "epoch": 0.40466738197424895, + "grad_norm": 1.6726735830307007, + "learning_rate": 3.3779141357630873e-06, + "loss": 2.4418, + "step": 7543 + }, + { + "epoch": 0.40472103004291843, + "grad_norm": 1.379836082458496, + "learning_rate": 3.3775073864593317e-06, + "loss": 2.3445, + "step": 7544 + }, + { + "epoch": 0.40477467811158796, + "grad_norm": 1.3272500038146973, + "learning_rate": 3.3771006106612012e-06, + "loss": 1.9567, + "step": 7545 + }, + { + "epoch": 0.4048283261802575, + "grad_norm": 1.461116075515747, + "learning_rate": 3.3766938083809787e-06, + "loss": 2.224, + "step": 7546 + }, + { + "epoch": 0.404881974248927, + "grad_norm": 1.4799610376358032, + "learning_rate": 3.3762869796309443e-06, + "loss": 2.2339, + "step": 7547 + }, + { + "epoch": 0.40493562231759656, + "grad_norm": 1.823141098022461, + "learning_rate": 3.375880124423383e-06, + "loss": 2.081, + "step": 7548 + }, + { + "epoch": 0.4049892703862661, + "grad_norm": 1.5919840335845947, + "learning_rate": 3.3754732427705796e-06, + "loss": 2.3284, + "step": 7549 + }, + { + "epoch": 0.4050429184549356, + "grad_norm": 3.5296430587768555, + "learning_rate": 3.3750663346848174e-06, + "loss": 2.2281, + "step": 7550 + }, + { + "epoch": 0.40509656652360515, + "grad_norm": 1.6460238695144653, + "learning_rate": 3.3746594001783823e-06, + "loss": 2.2486, + "step": 7551 + }, + { + "epoch": 0.4051502145922747, + "grad_norm": 1.3393707275390625, + "learning_rate": 3.374252439263562e-06, + "loss": 2.25, + "step": 7552 + }, + { + "epoch": 0.4052038626609442, + "grad_norm": 1.4936153888702393, + "learning_rate": 3.373845451952642e-06, + "loss": 2.4204, + "step": 7553 + }, + { + "epoch": 0.40525751072961375, + "grad_norm": 1.5342942476272583, + "learning_rate": 3.3734384382579114e-06, + "loss": 2.3366, + "step": 7554 + }, + { + "epoch": 0.4053111587982833, + "grad_norm": 1.2577030658721924, + "learning_rate": 3.3730313981916597e-06, + "loss": 2.412, + "step": 7555 + }, + { + "epoch": 0.4053648068669528, + "grad_norm": 1.5333271026611328, + "learning_rate": 3.3726243317661756e-06, + "loss": 2.4456, + "step": 7556 + }, + { + "epoch": 0.40541845493562234, + "grad_norm": 1.4399313926696777, + "learning_rate": 3.372217238993749e-06, + "loss": 2.3234, + "step": 7557 + }, + { + "epoch": 0.4054721030042919, + "grad_norm": 1.4879390001296997, + "learning_rate": 3.3718101198866733e-06, + "loss": 2.0245, + "step": 7558 + }, + { + "epoch": 0.40552575107296135, + "grad_norm": 1.3000102043151855, + "learning_rate": 3.371402974457238e-06, + "loss": 2.1341, + "step": 7559 + }, + { + "epoch": 0.4055793991416309, + "grad_norm": 1.5306812524795532, + "learning_rate": 3.3709958027177374e-06, + "loss": 2.2741, + "step": 7560 + }, + { + "epoch": 0.4056330472103004, + "grad_norm": 1.4936784505844116, + "learning_rate": 3.3705886046804653e-06, + "loss": 2.2515, + "step": 7561 + }, + { + "epoch": 0.40568669527896994, + "grad_norm": 1.37185800075531, + "learning_rate": 3.3701813803577153e-06, + "loss": 2.2651, + "step": 7562 + }, + { + "epoch": 0.4057403433476395, + "grad_norm": 1.6048569679260254, + "learning_rate": 3.369774129761784e-06, + "loss": 2.3734, + "step": 7563 + }, + { + "epoch": 0.405793991416309, + "grad_norm": 1.4744781255722046, + "learning_rate": 3.369366852904966e-06, + "loss": 2.236, + "step": 7564 + }, + { + "epoch": 0.40584763948497854, + "grad_norm": 2.2808406352996826, + "learning_rate": 3.3689595497995576e-06, + "loss": 2.2058, + "step": 7565 + }, + { + "epoch": 0.40590128755364807, + "grad_norm": 2.0545501708984375, + "learning_rate": 3.368552220457858e-06, + "loss": 1.5444, + "step": 7566 + }, + { + "epoch": 0.4059549356223176, + "grad_norm": 2.65175199508667, + "learning_rate": 3.368144864892166e-06, + "loss": 2.3523, + "step": 7567 + }, + { + "epoch": 0.40600858369098713, + "grad_norm": 1.4936425685882568, + "learning_rate": 3.367737483114779e-06, + "loss": 2.6231, + "step": 7568 + }, + { + "epoch": 0.40606223175965667, + "grad_norm": 1.5421500205993652, + "learning_rate": 3.3673300751379976e-06, + "loss": 2.4131, + "step": 7569 + }, + { + "epoch": 0.4061158798283262, + "grad_norm": 1.409574270248413, + "learning_rate": 3.366922640974123e-06, + "loss": 2.2847, + "step": 7570 + }, + { + "epoch": 0.40616952789699573, + "grad_norm": 1.5558013916015625, + "learning_rate": 3.3665151806354563e-06, + "loss": 2.237, + "step": 7571 + }, + { + "epoch": 0.40622317596566526, + "grad_norm": 1.345863938331604, + "learning_rate": 3.3661076941343e-06, + "loss": 2.3032, + "step": 7572 + }, + { + "epoch": 0.40627682403433474, + "grad_norm": 1.4900178909301758, + "learning_rate": 3.365700181482958e-06, + "loss": 2.1733, + "step": 7573 + }, + { + "epoch": 0.40633047210300427, + "grad_norm": 1.6071497201919556, + "learning_rate": 3.3652926426937327e-06, + "loss": 2.2481, + "step": 7574 + }, + { + "epoch": 0.4063841201716738, + "grad_norm": 1.5230313539505005, + "learning_rate": 3.36488507777893e-06, + "loss": 1.4653, + "step": 7575 + }, + { + "epoch": 0.40643776824034333, + "grad_norm": 1.5364099740982056, + "learning_rate": 3.364477486750855e-06, + "loss": 2.1149, + "step": 7576 + }, + { + "epoch": 0.40649141630901287, + "grad_norm": 1.730255126953125, + "learning_rate": 3.3640698696218145e-06, + "loss": 2.0467, + "step": 7577 + }, + { + "epoch": 0.4065450643776824, + "grad_norm": 1.4820847511291504, + "learning_rate": 3.363662226404115e-06, + "loss": 2.5828, + "step": 7578 + }, + { + "epoch": 0.40659871244635193, + "grad_norm": 1.4426065683364868, + "learning_rate": 3.3632545571100637e-06, + "loss": 2.2528, + "step": 7579 + }, + { + "epoch": 0.40665236051502146, + "grad_norm": 1.2756426334381104, + "learning_rate": 3.3628468617519716e-06, + "loss": 2.1995, + "step": 7580 + }, + { + "epoch": 0.406706008583691, + "grad_norm": 1.5102970600128174, + "learning_rate": 3.3624391403421457e-06, + "loss": 2.213, + "step": 7581 + }, + { + "epoch": 0.4067596566523605, + "grad_norm": 1.8728218078613281, + "learning_rate": 3.3620313928928974e-06, + "loss": 2.416, + "step": 7582 + }, + { + "epoch": 0.40681330472103006, + "grad_norm": 1.3752219676971436, + "learning_rate": 3.361623619416537e-06, + "loss": 2.1636, + "step": 7583 + }, + { + "epoch": 0.4068669527896996, + "grad_norm": 1.741518497467041, + "learning_rate": 3.3612158199253776e-06, + "loss": 2.3236, + "step": 7584 + }, + { + "epoch": 0.4069206008583691, + "grad_norm": 1.4976341724395752, + "learning_rate": 3.3608079944317306e-06, + "loss": 1.9596, + "step": 7585 + }, + { + "epoch": 0.40697424892703865, + "grad_norm": 1.6879020929336548, + "learning_rate": 3.36040014294791e-06, + "loss": 2.2726, + "step": 7586 + }, + { + "epoch": 0.4070278969957081, + "grad_norm": 1.4302918910980225, + "learning_rate": 3.3599922654862298e-06, + "loss": 2.2602, + "step": 7587 + }, + { + "epoch": 0.40708154506437766, + "grad_norm": 1.1676193475723267, + "learning_rate": 3.359584362059004e-06, + "loss": 2.2247, + "step": 7588 + }, + { + "epoch": 0.4071351931330472, + "grad_norm": 1.8063963651657104, + "learning_rate": 3.359176432678551e-06, + "loss": 2.3396, + "step": 7589 + }, + { + "epoch": 0.4071888412017167, + "grad_norm": 1.8815983533859253, + "learning_rate": 3.3587684773571843e-06, + "loss": 2.332, + "step": 7590 + }, + { + "epoch": 0.40724248927038625, + "grad_norm": 1.3747330904006958, + "learning_rate": 3.3583604961072227e-06, + "loss": 2.0059, + "step": 7591 + }, + { + "epoch": 0.4072961373390558, + "grad_norm": 1.6225625276565552, + "learning_rate": 3.357952488940984e-06, + "loss": 2.3789, + "step": 7592 + }, + { + "epoch": 0.4073497854077253, + "grad_norm": 1.3701133728027344, + "learning_rate": 3.357544455870787e-06, + "loss": 2.4795, + "step": 7593 + }, + { + "epoch": 0.40740343347639485, + "grad_norm": 1.9506821632385254, + "learning_rate": 3.3571363969089522e-06, + "loss": 1.8975, + "step": 7594 + }, + { + "epoch": 0.4074570815450644, + "grad_norm": 1.4759584665298462, + "learning_rate": 3.3567283120677984e-06, + "loss": 2.2944, + "step": 7595 + }, + { + "epoch": 0.4075107296137339, + "grad_norm": 1.2033581733703613, + "learning_rate": 3.356320201359648e-06, + "loss": 2.0849, + "step": 7596 + }, + { + "epoch": 0.40756437768240344, + "grad_norm": 1.4414206743240356, + "learning_rate": 3.3559120647968226e-06, + "loss": 2.1415, + "step": 7597 + }, + { + "epoch": 0.407618025751073, + "grad_norm": 1.4615751504898071, + "learning_rate": 3.3555039023916454e-06, + "loss": 2.5205, + "step": 7598 + }, + { + "epoch": 0.4076716738197425, + "grad_norm": 1.431201457977295, + "learning_rate": 3.3550957141564387e-06, + "loss": 2.4952, + "step": 7599 + }, + { + "epoch": 0.40772532188841204, + "grad_norm": 1.5177161693572998, + "learning_rate": 3.354687500103529e-06, + "loss": 2.2898, + "step": 7600 + }, + { + "epoch": 0.40777896995708157, + "grad_norm": 1.3599438667297363, + "learning_rate": 3.3542792602452397e-06, + "loss": 2.0673, + "step": 7601 + }, + { + "epoch": 0.40783261802575105, + "grad_norm": 1.3038997650146484, + "learning_rate": 3.3538709945938974e-06, + "loss": 2.2432, + "step": 7602 + }, + { + "epoch": 0.4078862660944206, + "grad_norm": 1.5227564573287964, + "learning_rate": 3.353462703161828e-06, + "loss": 2.5368, + "step": 7603 + }, + { + "epoch": 0.4079399141630901, + "grad_norm": 1.3575537204742432, + "learning_rate": 3.353054385961359e-06, + "loss": 2.2436, + "step": 7604 + }, + { + "epoch": 0.40799356223175964, + "grad_norm": 1.6602410078048706, + "learning_rate": 3.3526460430048198e-06, + "loss": 2.1551, + "step": 7605 + }, + { + "epoch": 0.4080472103004292, + "grad_norm": 1.9108518362045288, + "learning_rate": 3.3522376743045383e-06, + "loss": 2.1708, + "step": 7606 + }, + { + "epoch": 0.4081008583690987, + "grad_norm": 1.7338933944702148, + "learning_rate": 3.3518292798728453e-06, + "loss": 2.1241, + "step": 7607 + }, + { + "epoch": 0.40815450643776824, + "grad_norm": 1.503735065460205, + "learning_rate": 3.3514208597220704e-06, + "loss": 2.3068, + "step": 7608 + }, + { + "epoch": 0.40820815450643777, + "grad_norm": 1.5750504732131958, + "learning_rate": 3.351012413864545e-06, + "loss": 2.3482, + "step": 7609 + }, + { + "epoch": 0.4082618025751073, + "grad_norm": 1.4124572277069092, + "learning_rate": 3.350603942312601e-06, + "loss": 2.2778, + "step": 7610 + }, + { + "epoch": 0.40831545064377683, + "grad_norm": 1.5788168907165527, + "learning_rate": 3.350195445078573e-06, + "loss": 2.2266, + "step": 7611 + }, + { + "epoch": 0.40836909871244637, + "grad_norm": 21.42499542236328, + "learning_rate": 3.3497869221747923e-06, + "loss": 2.2024, + "step": 7612 + }, + { + "epoch": 0.4084227467811159, + "grad_norm": 1.6068240404129028, + "learning_rate": 3.3493783736135956e-06, + "loss": 2.268, + "step": 7613 + }, + { + "epoch": 0.40847639484978543, + "grad_norm": 1.790477991104126, + "learning_rate": 3.348969799407316e-06, + "loss": 2.3287, + "step": 7614 + }, + { + "epoch": 0.40853004291845496, + "grad_norm": 1.7950087785720825, + "learning_rate": 3.348561199568291e-06, + "loss": 2.4743, + "step": 7615 + }, + { + "epoch": 0.40858369098712444, + "grad_norm": 1.197148323059082, + "learning_rate": 3.348152574108856e-06, + "loss": 2.212, + "step": 7616 + }, + { + "epoch": 0.40863733905579397, + "grad_norm": 4.587645530700684, + "learning_rate": 3.34774392304135e-06, + "loss": 2.2847, + "step": 7617 + }, + { + "epoch": 0.4086909871244635, + "grad_norm": 1.5636261701583862, + "learning_rate": 3.3473352463781107e-06, + "loss": 2.1171, + "step": 7618 + }, + { + "epoch": 0.40874463519313303, + "grad_norm": 1.6445213556289673, + "learning_rate": 3.3469265441314767e-06, + "loss": 2.2712, + "step": 7619 + }, + { + "epoch": 0.40879828326180256, + "grad_norm": 1.5757290124893188, + "learning_rate": 3.346517816313789e-06, + "loss": 2.4348, + "step": 7620 + }, + { + "epoch": 0.4088519313304721, + "grad_norm": 4.324021339416504, + "learning_rate": 3.3461090629373865e-06, + "loss": 2.2598, + "step": 7621 + }, + { + "epoch": 0.4089055793991416, + "grad_norm": 1.5825327634811401, + "learning_rate": 3.3457002840146124e-06, + "loss": 2.442, + "step": 7622 + }, + { + "epoch": 0.40895922746781116, + "grad_norm": 1.608406901359558, + "learning_rate": 3.3452914795578073e-06, + "loss": 2.3365, + "step": 7623 + }, + { + "epoch": 0.4090128755364807, + "grad_norm": 1.5684659481048584, + "learning_rate": 3.3448826495793163e-06, + "loss": 2.4942, + "step": 7624 + }, + { + "epoch": 0.4090665236051502, + "grad_norm": 1.5089131593704224, + "learning_rate": 3.3444737940914808e-06, + "loss": 2.2078, + "step": 7625 + }, + { + "epoch": 0.40912017167381975, + "grad_norm": 1.660137414932251, + "learning_rate": 3.344064913106646e-06, + "loss": 2.213, + "step": 7626 + }, + { + "epoch": 0.4091738197424893, + "grad_norm": 1.410142183303833, + "learning_rate": 3.343656006637157e-06, + "loss": 2.2405, + "step": 7627 + }, + { + "epoch": 0.4092274678111588, + "grad_norm": 7.652499675750732, + "learning_rate": 3.3432470746953606e-06, + "loss": 2.2594, + "step": 7628 + }, + { + "epoch": 0.40928111587982835, + "grad_norm": 1.4337314367294312, + "learning_rate": 3.342838117293603e-06, + "loss": 2.3777, + "step": 7629 + }, + { + "epoch": 0.4093347639484979, + "grad_norm": 1.6034066677093506, + "learning_rate": 3.3424291344442323e-06, + "loss": 2.1999, + "step": 7630 + }, + { + "epoch": 0.40938841201716736, + "grad_norm": 1.5601093769073486, + "learning_rate": 3.3420201261595966e-06, + "loss": 2.2189, + "step": 7631 + }, + { + "epoch": 0.4094420600858369, + "grad_norm": 1.513320803642273, + "learning_rate": 3.341611092452044e-06, + "loss": 2.3696, + "step": 7632 + }, + { + "epoch": 0.4094957081545064, + "grad_norm": 1.5924516916275024, + "learning_rate": 3.3412020333339257e-06, + "loss": 2.5343, + "step": 7633 + }, + { + "epoch": 0.40954935622317595, + "grad_norm": 1.4033737182617188, + "learning_rate": 3.340792948817591e-06, + "loss": 2.3526, + "step": 7634 + }, + { + "epoch": 0.4096030042918455, + "grad_norm": 6.051878452301025, + "learning_rate": 3.340383838915393e-06, + "loss": 2.1724, + "step": 7635 + }, + { + "epoch": 0.409656652360515, + "grad_norm": 1.3296301364898682, + "learning_rate": 3.3399747036396823e-06, + "loss": 2.2591, + "step": 7636 + }, + { + "epoch": 0.40971030042918455, + "grad_norm": 1.4115715026855469, + "learning_rate": 3.339565543002813e-06, + "loss": 2.4193, + "step": 7637 + }, + { + "epoch": 0.4097639484978541, + "grad_norm": 1.5323166847229004, + "learning_rate": 3.3391563570171377e-06, + "loss": 2.2571, + "step": 7638 + }, + { + "epoch": 0.4098175965665236, + "grad_norm": 1.2917309999465942, + "learning_rate": 3.3387471456950114e-06, + "loss": 2.2571, + "step": 7639 + }, + { + "epoch": 0.40987124463519314, + "grad_norm": 1.4500638246536255, + "learning_rate": 3.33833790904879e-06, + "loss": 2.1974, + "step": 7640 + }, + { + "epoch": 0.4099248927038627, + "grad_norm": 1.7297306060791016, + "learning_rate": 3.3379286470908283e-06, + "loss": 2.2067, + "step": 7641 + }, + { + "epoch": 0.4099785407725322, + "grad_norm": 1.4964649677276611, + "learning_rate": 3.3375193598334845e-06, + "loss": 2.2848, + "step": 7642 + }, + { + "epoch": 0.41003218884120174, + "grad_norm": 1.371924877166748, + "learning_rate": 3.337110047289114e-06, + "loss": 2.0639, + "step": 7643 + }, + { + "epoch": 0.41008583690987127, + "grad_norm": 1.6524094343185425, + "learning_rate": 3.336700709470076e-06, + "loss": 2.0946, + "step": 7644 + }, + { + "epoch": 0.41013948497854075, + "grad_norm": 1.3813105821609497, + "learning_rate": 3.3362913463887304e-06, + "loss": 1.9839, + "step": 7645 + }, + { + "epoch": 0.4101931330472103, + "grad_norm": 2.5401880741119385, + "learning_rate": 3.335881958057437e-06, + "loss": 2.419, + "step": 7646 + }, + { + "epoch": 0.4102467811158798, + "grad_norm": 1.513715386390686, + "learning_rate": 3.335472544488555e-06, + "loss": 2.2673, + "step": 7647 + }, + { + "epoch": 0.41030042918454934, + "grad_norm": 1.659090280532837, + "learning_rate": 3.335063105694447e-06, + "loss": 2.0995, + "step": 7648 + }, + { + "epoch": 0.4103540772532189, + "grad_norm": 1.4371672868728638, + "learning_rate": 3.3346536416874743e-06, + "loss": 2.1828, + "step": 7649 + }, + { + "epoch": 0.4104077253218884, + "grad_norm": 1.3145729303359985, + "learning_rate": 3.3342441524799994e-06, + "loss": 2.3104, + "step": 7650 + }, + { + "epoch": 0.41046137339055794, + "grad_norm": 1.3534489870071411, + "learning_rate": 3.3338346380843876e-06, + "loss": 2.0501, + "step": 7651 + }, + { + "epoch": 0.41051502145922747, + "grad_norm": 1.5098953247070312, + "learning_rate": 3.3334250985130016e-06, + "loss": 2.3761, + "step": 7652 + }, + { + "epoch": 0.410568669527897, + "grad_norm": 1.4107753038406372, + "learning_rate": 3.3330155337782077e-06, + "loss": 2.3035, + "step": 7653 + }, + { + "epoch": 0.41062231759656653, + "grad_norm": 1.3889501094818115, + "learning_rate": 3.332605943892371e-06, + "loss": 2.1937, + "step": 7654 + }, + { + "epoch": 0.41067596566523606, + "grad_norm": 1.4140701293945312, + "learning_rate": 3.3321963288678575e-06, + "loss": 2.1378, + "step": 7655 + }, + { + "epoch": 0.4107296137339056, + "grad_norm": 1.585524082183838, + "learning_rate": 3.331786688717037e-06, + "loss": 2.3295, + "step": 7656 + }, + { + "epoch": 0.41078326180257513, + "grad_norm": 1.439028024673462, + "learning_rate": 3.331377023452275e-06, + "loss": 2.1335, + "step": 7657 + }, + { + "epoch": 0.41083690987124466, + "grad_norm": 1.8102279901504517, + "learning_rate": 3.330967333085942e-06, + "loss": 2.5123, + "step": 7658 + }, + { + "epoch": 0.41089055793991414, + "grad_norm": 1.5894087553024292, + "learning_rate": 3.330557617630407e-06, + "loss": 2.2753, + "step": 7659 + }, + { + "epoch": 0.41094420600858367, + "grad_norm": 1.4370474815368652, + "learning_rate": 3.330147877098041e-06, + "loss": 2.0705, + "step": 7660 + }, + { + "epoch": 0.4109978540772532, + "grad_norm": 1.595379114151001, + "learning_rate": 3.329738111501215e-06, + "loss": 2.2054, + "step": 7661 + }, + { + "epoch": 0.41105150214592273, + "grad_norm": 1.5062137842178345, + "learning_rate": 3.3293283208523013e-06, + "loss": 2.3801, + "step": 7662 + }, + { + "epoch": 0.41110515021459226, + "grad_norm": 1.554824709892273, + "learning_rate": 3.328918505163672e-06, + "loss": 2.2148, + "step": 7663 + }, + { + "epoch": 0.4111587982832618, + "grad_norm": 1.4676251411437988, + "learning_rate": 3.3285086644477006e-06, + "loss": 2.0226, + "step": 7664 + }, + { + "epoch": 0.4112124463519313, + "grad_norm": 1.5175234079360962, + "learning_rate": 3.3280987987167614e-06, + "loss": 2.0365, + "step": 7665 + }, + { + "epoch": 0.41126609442060086, + "grad_norm": 1.5501763820648193, + "learning_rate": 3.327688907983229e-06, + "loss": 2.058, + "step": 7666 + }, + { + "epoch": 0.4113197424892704, + "grad_norm": 1.4869688749313354, + "learning_rate": 3.32727899225948e-06, + "loss": 1.7755, + "step": 7667 + }, + { + "epoch": 0.4113733905579399, + "grad_norm": 1.2625278234481812, + "learning_rate": 3.3268690515578906e-06, + "loss": 2.2693, + "step": 7668 + }, + { + "epoch": 0.41142703862660945, + "grad_norm": 1.309733510017395, + "learning_rate": 3.3264590858908385e-06, + "loss": 2.281, + "step": 7669 + }, + { + "epoch": 0.411480686695279, + "grad_norm": 1.8899695873260498, + "learning_rate": 3.3260490952707e-06, + "loss": 2.1583, + "step": 7670 + }, + { + "epoch": 0.4115343347639485, + "grad_norm": 1.524415135383606, + "learning_rate": 3.325639079709856e-06, + "loss": 2.2088, + "step": 7671 + }, + { + "epoch": 0.41158798283261805, + "grad_norm": 1.2210280895233154, + "learning_rate": 3.325229039220684e-06, + "loss": 2.2391, + "step": 7672 + }, + { + "epoch": 0.4116416309012876, + "grad_norm": 1.4491066932678223, + "learning_rate": 3.3248189738155665e-06, + "loss": 2.3732, + "step": 7673 + }, + { + "epoch": 0.41169527896995706, + "grad_norm": 1.5008517503738403, + "learning_rate": 3.3244088835068828e-06, + "loss": 2.1939, + "step": 7674 + }, + { + "epoch": 0.4117489270386266, + "grad_norm": 2.0484981536865234, + "learning_rate": 3.3239987683070148e-06, + "loss": 2.3287, + "step": 7675 + }, + { + "epoch": 0.4118025751072961, + "grad_norm": 1.4082878828048706, + "learning_rate": 3.3235886282283453e-06, + "loss": 1.8208, + "step": 7676 + }, + { + "epoch": 0.41185622317596565, + "grad_norm": 1.5031461715698242, + "learning_rate": 3.3231784632832577e-06, + "loss": 2.2845, + "step": 7677 + }, + { + "epoch": 0.4119098712446352, + "grad_norm": 1.425111174583435, + "learning_rate": 3.322768273484136e-06, + "loss": 2.2127, + "step": 7678 + }, + { + "epoch": 0.4119635193133047, + "grad_norm": 1.3848754167556763, + "learning_rate": 3.3223580588433647e-06, + "loss": 2.252, + "step": 7679 + }, + { + "epoch": 0.41201716738197425, + "grad_norm": 1.5462273359298706, + "learning_rate": 3.321947819373329e-06, + "loss": 2.1153, + "step": 7680 + }, + { + "epoch": 0.4120708154506438, + "grad_norm": 1.656982660293579, + "learning_rate": 3.3215375550864167e-06, + "loss": 2.4652, + "step": 7681 + }, + { + "epoch": 0.4121244635193133, + "grad_norm": 3.2872684001922607, + "learning_rate": 3.3211272659950134e-06, + "loss": 2.3127, + "step": 7682 + }, + { + "epoch": 0.41217811158798284, + "grad_norm": 1.6049373149871826, + "learning_rate": 3.3207169521115067e-06, + "loss": 2.537, + "step": 7683 + }, + { + "epoch": 0.4122317596566524, + "grad_norm": 2.4169623851776123, + "learning_rate": 3.3203066134482858e-06, + "loss": 2.28, + "step": 7684 + }, + { + "epoch": 0.4122854077253219, + "grad_norm": 1.7265690565109253, + "learning_rate": 3.31989625001774e-06, + "loss": 2.2933, + "step": 7685 + }, + { + "epoch": 0.41233905579399144, + "grad_norm": 1.3791300058364868, + "learning_rate": 3.3194858618322594e-06, + "loss": 2.2507, + "step": 7686 + }, + { + "epoch": 0.41239270386266097, + "grad_norm": 1.504041314125061, + "learning_rate": 3.3190754489042343e-06, + "loss": 2.3248, + "step": 7687 + }, + { + "epoch": 0.41244635193133045, + "grad_norm": 1.3633394241333008, + "learning_rate": 3.318665011246056e-06, + "loss": 2.1004, + "step": 7688 + }, + { + "epoch": 0.4125, + "grad_norm": 1.4642304182052612, + "learning_rate": 3.3182545488701166e-06, + "loss": 2.2754, + "step": 7689 + }, + { + "epoch": 0.4125536480686695, + "grad_norm": 1.3593698740005493, + "learning_rate": 3.3178440617888107e-06, + "loss": 2.3427, + "step": 7690 + }, + { + "epoch": 0.41260729613733904, + "grad_norm": 1.8055020570755005, + "learning_rate": 3.317433550014531e-06, + "loss": 2.1643, + "step": 7691 + }, + { + "epoch": 0.4126609442060086, + "grad_norm": 1.67972993850708, + "learning_rate": 3.3170230135596716e-06, + "loss": 2.1946, + "step": 7692 + }, + { + "epoch": 0.4127145922746781, + "grad_norm": 1.2972562313079834, + "learning_rate": 3.3166124524366277e-06, + "loss": 2.0103, + "step": 7693 + }, + { + "epoch": 0.41276824034334764, + "grad_norm": 1.4033565521240234, + "learning_rate": 3.316201866657796e-06, + "loss": 2.2899, + "step": 7694 + }, + { + "epoch": 0.41282188841201717, + "grad_norm": 1.3700413703918457, + "learning_rate": 3.3157912562355727e-06, + "loss": 2.1939, + "step": 7695 + }, + { + "epoch": 0.4128755364806867, + "grad_norm": 2.2402901649475098, + "learning_rate": 3.315380621182355e-06, + "loss": 2.2149, + "step": 7696 + }, + { + "epoch": 0.41292918454935623, + "grad_norm": 1.8647050857543945, + "learning_rate": 3.3149699615105426e-06, + "loss": 2.2979, + "step": 7697 + }, + { + "epoch": 0.41298283261802576, + "grad_norm": 3.0614712238311768, + "learning_rate": 3.3145592772325325e-06, + "loss": 2.2751, + "step": 7698 + }, + { + "epoch": 0.4130364806866953, + "grad_norm": 1.5294445753097534, + "learning_rate": 3.3141485683607257e-06, + "loss": 2.2675, + "step": 7699 + }, + { + "epoch": 0.4130901287553648, + "grad_norm": 1.3794969320297241, + "learning_rate": 3.3137378349075216e-06, + "loss": 2.1819, + "step": 7700 + }, + { + "epoch": 0.41314377682403436, + "grad_norm": 1.4785715341567993, + "learning_rate": 3.313327076885322e-06, + "loss": 2.2108, + "step": 7701 + }, + { + "epoch": 0.41319742489270384, + "grad_norm": 1.1518113613128662, + "learning_rate": 3.3129162943065295e-06, + "loss": 2.4634, + "step": 7702 + }, + { + "epoch": 0.41325107296137337, + "grad_norm": 1.3408408164978027, + "learning_rate": 3.312505487183546e-06, + "loss": 1.8173, + "step": 7703 + }, + { + "epoch": 0.4133047210300429, + "grad_norm": 2.9200832843780518, + "learning_rate": 3.312094655528775e-06, + "loss": 1.993, + "step": 7704 + }, + { + "epoch": 0.41335836909871243, + "grad_norm": 1.6003128290176392, + "learning_rate": 3.31168379935462e-06, + "loss": 2.4641, + "step": 7705 + }, + { + "epoch": 0.41341201716738196, + "grad_norm": 1.362621545791626, + "learning_rate": 3.3112729186734867e-06, + "loss": 1.9007, + "step": 7706 + }, + { + "epoch": 0.4134656652360515, + "grad_norm": 2.11230206489563, + "learning_rate": 3.31086201349778e-06, + "loss": 2.3789, + "step": 7707 + }, + { + "epoch": 0.413519313304721, + "grad_norm": 1.3423364162445068, + "learning_rate": 3.3104510838399086e-06, + "loss": 2.5287, + "step": 7708 + }, + { + "epoch": 0.41357296137339056, + "grad_norm": 1.474397897720337, + "learning_rate": 3.310040129712276e-06, + "loss": 2.2101, + "step": 7709 + }, + { + "epoch": 0.4136266094420601, + "grad_norm": 4.157703876495361, + "learning_rate": 3.3096291511272925e-06, + "loss": 2.0639, + "step": 7710 + }, + { + "epoch": 0.4136802575107296, + "grad_norm": 1.535360336303711, + "learning_rate": 3.309218148097366e-06, + "loss": 2.2064, + "step": 7711 + }, + { + "epoch": 0.41373390557939915, + "grad_norm": 1.4798983335494995, + "learning_rate": 3.3088071206349054e-06, + "loss": 2.524, + "step": 7712 + }, + { + "epoch": 0.4137875536480687, + "grad_norm": 1.5787886381149292, + "learning_rate": 3.308396068752322e-06, + "loss": 2.4834, + "step": 7713 + }, + { + "epoch": 0.4138412017167382, + "grad_norm": 1.4520328044891357, + "learning_rate": 3.307984992462025e-06, + "loss": 2.2446, + "step": 7714 + }, + { + "epoch": 0.41389484978540775, + "grad_norm": 1.4544165134429932, + "learning_rate": 3.307573891776427e-06, + "loss": 2.1951, + "step": 7715 + }, + { + "epoch": 0.4139484978540773, + "grad_norm": 1.497046947479248, + "learning_rate": 3.3071627667079397e-06, + "loss": 2.3597, + "step": 7716 + }, + { + "epoch": 0.41400214592274676, + "grad_norm": 1.3673096895217896, + "learning_rate": 3.3067516172689774e-06, + "loss": 2.3663, + "step": 7717 + }, + { + "epoch": 0.4140557939914163, + "grad_norm": 1.243415355682373, + "learning_rate": 3.3063404434719514e-06, + "loss": 2.2649, + "step": 7718 + }, + { + "epoch": 0.4141094420600858, + "grad_norm": 1.2941721677780151, + "learning_rate": 3.3059292453292786e-06, + "loss": 2.2449, + "step": 7719 + }, + { + "epoch": 0.41416309012875535, + "grad_norm": 1.5604037046432495, + "learning_rate": 3.305518022853373e-06, + "loss": 2.2249, + "step": 7720 + }, + { + "epoch": 0.4142167381974249, + "grad_norm": 1.3808099031448364, + "learning_rate": 3.3051067760566508e-06, + "loss": 2.1405, + "step": 7721 + }, + { + "epoch": 0.4142703862660944, + "grad_norm": 1.4102287292480469, + "learning_rate": 3.3046955049515277e-06, + "loss": 2.328, + "step": 7722 + }, + { + "epoch": 0.41432403433476395, + "grad_norm": 1.4910821914672852, + "learning_rate": 3.304284209550423e-06, + "loss": 2.3551, + "step": 7723 + }, + { + "epoch": 0.4143776824034335, + "grad_norm": 1.461292028427124, + "learning_rate": 3.3038728898657537e-06, + "loss": 2.2384, + "step": 7724 + }, + { + "epoch": 0.414431330472103, + "grad_norm": 1.4318722486495972, + "learning_rate": 3.3034615459099382e-06, + "loss": 2.1462, + "step": 7725 + }, + { + "epoch": 0.41448497854077254, + "grad_norm": 1.4123327732086182, + "learning_rate": 3.303050177695398e-06, + "loss": 2.247, + "step": 7726 + }, + { + "epoch": 0.4145386266094421, + "grad_norm": 1.4885132312774658, + "learning_rate": 3.3026387852345514e-06, + "loss": 2.4336, + "step": 7727 + }, + { + "epoch": 0.4145922746781116, + "grad_norm": 1.2341228723526, + "learning_rate": 3.3022273685398197e-06, + "loss": 2.1397, + "step": 7728 + }, + { + "epoch": 0.41464592274678114, + "grad_norm": 1.4334890842437744, + "learning_rate": 3.301815927623626e-06, + "loss": 2.3826, + "step": 7729 + }, + { + "epoch": 0.41469957081545067, + "grad_norm": 1.5745042562484741, + "learning_rate": 3.301404462498393e-06, + "loss": 2.3447, + "step": 7730 + }, + { + "epoch": 0.41475321888412015, + "grad_norm": 2.0712692737579346, + "learning_rate": 3.300992973176542e-06, + "loss": 2.487, + "step": 7731 + }, + { + "epoch": 0.4148068669527897, + "grad_norm": 1.6920208930969238, + "learning_rate": 3.300581459670499e-06, + "loss": 2.4384, + "step": 7732 + }, + { + "epoch": 0.4148605150214592, + "grad_norm": 1.6578826904296875, + "learning_rate": 3.300169921992687e-06, + "loss": 1.9578, + "step": 7733 + }, + { + "epoch": 0.41491416309012874, + "grad_norm": 1.1579794883728027, + "learning_rate": 3.2997583601555316e-06, + "loss": 1.8301, + "step": 7734 + }, + { + "epoch": 0.41496781115879827, + "grad_norm": 1.5124093294143677, + "learning_rate": 3.2993467741714614e-06, + "loss": 2.3299, + "step": 7735 + }, + { + "epoch": 0.4150214592274678, + "grad_norm": 1.4099968671798706, + "learning_rate": 3.2989351640529005e-06, + "loss": 2.2197, + "step": 7736 + }, + { + "epoch": 0.41507510729613734, + "grad_norm": 1.5257041454315186, + "learning_rate": 3.2985235298122775e-06, + "loss": 2.3155, + "step": 7737 + }, + { + "epoch": 0.41512875536480687, + "grad_norm": 1.9985848665237427, + "learning_rate": 3.298111871462022e-06, + "loss": 2.2488, + "step": 7738 + }, + { + "epoch": 0.4151824034334764, + "grad_norm": 1.3784886598587036, + "learning_rate": 3.297700189014561e-06, + "loss": 2.3404, + "step": 7739 + }, + { + "epoch": 0.41523605150214593, + "grad_norm": 1.3754932880401611, + "learning_rate": 3.297288482482326e-06, + "loss": 2.2711, + "step": 7740 + }, + { + "epoch": 0.41528969957081546, + "grad_norm": 1.5652151107788086, + "learning_rate": 3.296876751877746e-06, + "loss": 2.2985, + "step": 7741 + }, + { + "epoch": 0.415343347639485, + "grad_norm": 2.085951566696167, + "learning_rate": 3.2964649972132538e-06, + "loss": 2.1501, + "step": 7742 + }, + { + "epoch": 0.4153969957081545, + "grad_norm": 1.341373085975647, + "learning_rate": 3.2960532185012813e-06, + "loss": 2.2645, + "step": 7743 + }, + { + "epoch": 0.41545064377682406, + "grad_norm": 1.632558822631836, + "learning_rate": 3.2956414157542606e-06, + "loss": 2.3246, + "step": 7744 + }, + { + "epoch": 0.4155042918454936, + "grad_norm": 1.695114016532898, + "learning_rate": 3.2952295889846245e-06, + "loss": 2.0647, + "step": 7745 + }, + { + "epoch": 0.41555793991416307, + "grad_norm": 1.7110008001327515, + "learning_rate": 3.2948177382048087e-06, + "loss": 2.3843, + "step": 7746 + }, + { + "epoch": 0.4156115879828326, + "grad_norm": 1.5528291463851929, + "learning_rate": 3.2944058634272468e-06, + "loss": 2.0997, + "step": 7747 + }, + { + "epoch": 0.41566523605150213, + "grad_norm": 1.5603028535842896, + "learning_rate": 3.2939939646643765e-06, + "loss": 2.0321, + "step": 7748 + }, + { + "epoch": 0.41571888412017166, + "grad_norm": 1.9983553886413574, + "learning_rate": 3.293582041928631e-06, + "loss": 2.0535, + "step": 7749 + }, + { + "epoch": 0.4157725321888412, + "grad_norm": 1.2104387283325195, + "learning_rate": 3.29317009523245e-06, + "loss": 2.0906, + "step": 7750 + }, + { + "epoch": 0.4158261802575107, + "grad_norm": 1.4224672317504883, + "learning_rate": 3.29275812458827e-06, + "loss": 2.3921, + "step": 7751 + }, + { + "epoch": 0.41587982832618026, + "grad_norm": 1.2007229328155518, + "learning_rate": 3.292346130008531e-06, + "loss": 2.1674, + "step": 7752 + }, + { + "epoch": 0.4159334763948498, + "grad_norm": 1.2037780284881592, + "learning_rate": 3.29193411150567e-06, + "loss": 2.1386, + "step": 7753 + }, + { + "epoch": 0.4159871244635193, + "grad_norm": 1.6458520889282227, + "learning_rate": 3.291522069092129e-06, + "loss": 2.3652, + "step": 7754 + }, + { + "epoch": 0.41604077253218885, + "grad_norm": 1.6782970428466797, + "learning_rate": 3.291110002780348e-06, + "loss": 2.362, + "step": 7755 + }, + { + "epoch": 0.4160944206008584, + "grad_norm": 1.1856120824813843, + "learning_rate": 3.290697912582767e-06, + "loss": 2.3452, + "step": 7756 + }, + { + "epoch": 0.4161480686695279, + "grad_norm": 1.649328589439392, + "learning_rate": 3.2902857985118307e-06, + "loss": 2.0589, + "step": 7757 + }, + { + "epoch": 0.41620171673819745, + "grad_norm": 1.5288405418395996, + "learning_rate": 3.28987366057998e-06, + "loss": 2.0065, + "step": 7758 + }, + { + "epoch": 0.416255364806867, + "grad_norm": 1.528908133506775, + "learning_rate": 3.28946149879966e-06, + "loss": 2.22, + "step": 7759 + }, + { + "epoch": 0.41630901287553645, + "grad_norm": 1.542487382888794, + "learning_rate": 3.2890493131833134e-06, + "loss": 2.3366, + "step": 7760 + }, + { + "epoch": 0.416362660944206, + "grad_norm": 1.2885318994522095, + "learning_rate": 3.2886371037433864e-06, + "loss": 2.3052, + "step": 7761 + }, + { + "epoch": 0.4164163090128755, + "grad_norm": 1.6253385543823242, + "learning_rate": 3.288224870492324e-06, + "loss": 2.3811, + "step": 7762 + }, + { + "epoch": 0.41646995708154505, + "grad_norm": 1.9480631351470947, + "learning_rate": 3.287812613442573e-06, + "loss": 2.1665, + "step": 7763 + }, + { + "epoch": 0.4165236051502146, + "grad_norm": 1.5220069885253906, + "learning_rate": 3.287400332606581e-06, + "loss": 2.3517, + "step": 7764 + }, + { + "epoch": 0.4165772532188841, + "grad_norm": 1.7980835437774658, + "learning_rate": 3.286988027996796e-06, + "loss": 2.2494, + "step": 7765 + }, + { + "epoch": 0.41663090128755365, + "grad_norm": 1.5568333864212036, + "learning_rate": 3.286575699625666e-06, + "loss": 2.2163, + "step": 7766 + }, + { + "epoch": 0.4166845493562232, + "grad_norm": 1.4396002292633057, + "learning_rate": 3.2861633475056394e-06, + "loss": 2.4083, + "step": 7767 + }, + { + "epoch": 0.4167381974248927, + "grad_norm": 1.6428143978118896, + "learning_rate": 3.2857509716491674e-06, + "loss": 2.1568, + "step": 7768 + }, + { + "epoch": 0.41679184549356224, + "grad_norm": 1.3327813148498535, + "learning_rate": 3.2853385720687014e-06, + "loss": 2.1578, + "step": 7769 + }, + { + "epoch": 0.4168454935622318, + "grad_norm": 1.347579836845398, + "learning_rate": 3.2849261487766924e-06, + "loss": 2.1586, + "step": 7770 + }, + { + "epoch": 0.4168991416309013, + "grad_norm": 1.3679218292236328, + "learning_rate": 3.2845137017855922e-06, + "loss": 2.55, + "step": 7771 + }, + { + "epoch": 0.41695278969957084, + "grad_norm": 1.4560644626617432, + "learning_rate": 3.2841012311078535e-06, + "loss": 2.4655, + "step": 7772 + }, + { + "epoch": 0.41700643776824037, + "grad_norm": 1.4604018926620483, + "learning_rate": 3.28368873675593e-06, + "loss": 1.4193, + "step": 7773 + }, + { + "epoch": 0.41706008583690984, + "grad_norm": 1.4886481761932373, + "learning_rate": 3.2832762187422773e-06, + "loss": 2.5451, + "step": 7774 + }, + { + "epoch": 0.4171137339055794, + "grad_norm": 1.378043293952942, + "learning_rate": 3.2828636770793486e-06, + "loss": 2.0915, + "step": 7775 + }, + { + "epoch": 0.4171673819742489, + "grad_norm": 1.4138685464859009, + "learning_rate": 3.2824511117796008e-06, + "loss": 2.4767, + "step": 7776 + }, + { + "epoch": 0.41722103004291844, + "grad_norm": 1.6628249883651733, + "learning_rate": 3.282038522855491e-06, + "loss": 2.3041, + "step": 7777 + }, + { + "epoch": 0.41727467811158797, + "grad_norm": 1.6029062271118164, + "learning_rate": 3.2816259103194744e-06, + "loss": 2.4599, + "step": 7778 + }, + { + "epoch": 0.4173283261802575, + "grad_norm": 1.327297568321228, + "learning_rate": 3.281213274184011e-06, + "loss": 2.2524, + "step": 7779 + }, + { + "epoch": 0.41738197424892703, + "grad_norm": 1.69856595993042, + "learning_rate": 3.280800614461558e-06, + "loss": 2.3199, + "step": 7780 + }, + { + "epoch": 0.41743562231759657, + "grad_norm": 1.4067317247390747, + "learning_rate": 3.2803879311645746e-06, + "loss": 2.3444, + "step": 7781 + }, + { + "epoch": 0.4174892703862661, + "grad_norm": 1.5898317098617554, + "learning_rate": 3.279975224305523e-06, + "loss": 2.3337, + "step": 7782 + }, + { + "epoch": 0.41754291845493563, + "grad_norm": 1.410117268562317, + "learning_rate": 3.279562493896862e-06, + "loss": 2.3231, + "step": 7783 + }, + { + "epoch": 0.41759656652360516, + "grad_norm": 1.4565143585205078, + "learning_rate": 3.2791497399510526e-06, + "loss": 2.2541, + "step": 7784 + }, + { + "epoch": 0.4176502145922747, + "grad_norm": 1.497341513633728, + "learning_rate": 3.2787369624805587e-06, + "loss": 2.2778, + "step": 7785 + }, + { + "epoch": 0.4177038626609442, + "grad_norm": 1.709040641784668, + "learning_rate": 3.2783241614978422e-06, + "loss": 2.2261, + "step": 7786 + }, + { + "epoch": 0.41775751072961376, + "grad_norm": 1.6227948665618896, + "learning_rate": 3.2779113370153672e-06, + "loss": 2.3951, + "step": 7787 + }, + { + "epoch": 0.4178111587982833, + "grad_norm": 1.5859805345535278, + "learning_rate": 3.2774984890455974e-06, + "loss": 2.253, + "step": 7788 + }, + { + "epoch": 0.41786480686695276, + "grad_norm": 1.8413357734680176, + "learning_rate": 3.2770856176009984e-06, + "loss": 2.4231, + "step": 7789 + }, + { + "epoch": 0.4179184549356223, + "grad_norm": 1.4284173250198364, + "learning_rate": 3.2766727226940353e-06, + "loss": 2.1314, + "step": 7790 + }, + { + "epoch": 0.41797210300429183, + "grad_norm": 2.8858158588409424, + "learning_rate": 3.276259804337175e-06, + "loss": 2.3343, + "step": 7791 + }, + { + "epoch": 0.41802575107296136, + "grad_norm": 1.443858027458191, + "learning_rate": 3.275846862542885e-06, + "loss": 2.3148, + "step": 7792 + }, + { + "epoch": 0.4180793991416309, + "grad_norm": 1.266008734703064, + "learning_rate": 3.2754338973236327e-06, + "loss": 2.351, + "step": 7793 + }, + { + "epoch": 0.4181330472103004, + "grad_norm": 1.4309877157211304, + "learning_rate": 3.275020908691886e-06, + "loss": 2.1861, + "step": 7794 + }, + { + "epoch": 0.41818669527896996, + "grad_norm": 1.5683045387268066, + "learning_rate": 3.274607896660116e-06, + "loss": 2.3614, + "step": 7795 + }, + { + "epoch": 0.4182403433476395, + "grad_norm": 1.391222357749939, + "learning_rate": 3.2741948612407897e-06, + "loss": 2.2754, + "step": 7796 + }, + { + "epoch": 0.418293991416309, + "grad_norm": 1.4452954530715942, + "learning_rate": 3.2737818024463814e-06, + "loss": 2.2975, + "step": 7797 + }, + { + "epoch": 0.41834763948497855, + "grad_norm": 1.4382071495056152, + "learning_rate": 3.27336872028936e-06, + "loss": 2.1436, + "step": 7798 + }, + { + "epoch": 0.4184012875536481, + "grad_norm": 1.617903709411621, + "learning_rate": 3.2729556147821985e-06, + "loss": 2.3021, + "step": 7799 + }, + { + "epoch": 0.4184549356223176, + "grad_norm": 1.5259101390838623, + "learning_rate": 3.272542485937369e-06, + "loss": 2.2873, + "step": 7800 + }, + { + "epoch": 0.41850858369098715, + "grad_norm": 1.5197595357894897, + "learning_rate": 3.2721293337673453e-06, + "loss": 2.1139, + "step": 7801 + }, + { + "epoch": 0.4185622317596567, + "grad_norm": 1.5092004537582397, + "learning_rate": 3.2717161582846026e-06, + "loss": 2.2144, + "step": 7802 + }, + { + "epoch": 0.41861587982832615, + "grad_norm": 1.5234017372131348, + "learning_rate": 3.2713029595016144e-06, + "loss": 1.8783, + "step": 7803 + }, + { + "epoch": 0.4186695278969957, + "grad_norm": 1.0969997644424438, + "learning_rate": 3.2708897374308575e-06, + "loss": 1.8838, + "step": 7804 + }, + { + "epoch": 0.4187231759656652, + "grad_norm": 1.4225842952728271, + "learning_rate": 3.2704764920848076e-06, + "loss": 2.298, + "step": 7805 + }, + { + "epoch": 0.41877682403433475, + "grad_norm": 1.5698463916778564, + "learning_rate": 3.270063223475941e-06, + "loss": 2.4708, + "step": 7806 + }, + { + "epoch": 0.4188304721030043, + "grad_norm": 1.4315531253814697, + "learning_rate": 3.269649931616737e-06, + "loss": 2.2172, + "step": 7807 + }, + { + "epoch": 0.4188841201716738, + "grad_norm": 2.142765760421753, + "learning_rate": 3.2692366165196727e-06, + "loss": 2.2147, + "step": 7808 + }, + { + "epoch": 0.41893776824034334, + "grad_norm": 1.9436603784561157, + "learning_rate": 3.2688232781972277e-06, + "loss": 2.3732, + "step": 7809 + }, + { + "epoch": 0.4189914163090129, + "grad_norm": 1.2748655080795288, + "learning_rate": 3.268409916661883e-06, + "loss": 2.0878, + "step": 7810 + }, + { + "epoch": 0.4190450643776824, + "grad_norm": 1.4809191226959229, + "learning_rate": 3.2679965319261163e-06, + "loss": 2.1225, + "step": 7811 + }, + { + "epoch": 0.41909871244635194, + "grad_norm": 1.8515655994415283, + "learning_rate": 3.2675831240024107e-06, + "loss": 2.2128, + "step": 7812 + }, + { + "epoch": 0.41915236051502147, + "grad_norm": 1.2729538679122925, + "learning_rate": 3.267169692903249e-06, + "loss": 2.0295, + "step": 7813 + }, + { + "epoch": 0.419206008583691, + "grad_norm": 1.3161540031433105, + "learning_rate": 3.266756238641112e-06, + "loss": 2.1675, + "step": 7814 + }, + { + "epoch": 0.41925965665236054, + "grad_norm": 1.487657070159912, + "learning_rate": 3.266342761228485e-06, + "loss": 2.363, + "step": 7815 + }, + { + "epoch": 0.41931330472103007, + "grad_norm": 1.5200635194778442, + "learning_rate": 3.2659292606778493e-06, + "loss": 2.212, + "step": 7816 + }, + { + "epoch": 0.4193669527896996, + "grad_norm": 1.5101183652877808, + "learning_rate": 3.2655157370016917e-06, + "loss": 2.1993, + "step": 7817 + }, + { + "epoch": 0.4194206008583691, + "grad_norm": 1.3317903280258179, + "learning_rate": 3.265102190212497e-06, + "loss": 2.299, + "step": 7818 + }, + { + "epoch": 0.4194742489270386, + "grad_norm": 1.3444364070892334, + "learning_rate": 3.2646886203227514e-06, + "loss": 2.2268, + "step": 7819 + }, + { + "epoch": 0.41952789699570814, + "grad_norm": 4.0640082359313965, + "learning_rate": 3.264275027344942e-06, + "loss": 2.0611, + "step": 7820 + }, + { + "epoch": 0.41958154506437767, + "grad_norm": 2.1207520961761475, + "learning_rate": 3.2638614112915556e-06, + "loss": 2.2295, + "step": 7821 + }, + { + "epoch": 0.4196351931330472, + "grad_norm": 1.3426662683486938, + "learning_rate": 3.2634477721750813e-06, + "loss": 1.5738, + "step": 7822 + }, + { + "epoch": 0.41968884120171673, + "grad_norm": 1.4305676221847534, + "learning_rate": 3.263034110008007e-06, + "loss": 2.0941, + "step": 7823 + }, + { + "epoch": 0.41974248927038627, + "grad_norm": 2.2561490535736084, + "learning_rate": 3.262620424802823e-06, + "loss": 2.2761, + "step": 7824 + }, + { + "epoch": 0.4197961373390558, + "grad_norm": 1.4732009172439575, + "learning_rate": 3.2622067165720197e-06, + "loss": 2.2975, + "step": 7825 + }, + { + "epoch": 0.41984978540772533, + "grad_norm": 1.3603531122207642, + "learning_rate": 3.2617929853280877e-06, + "loss": 2.0626, + "step": 7826 + }, + { + "epoch": 0.41990343347639486, + "grad_norm": 1.3362019062042236, + "learning_rate": 3.261379231083519e-06, + "loss": 2.4214, + "step": 7827 + }, + { + "epoch": 0.4199570815450644, + "grad_norm": 1.5428283214569092, + "learning_rate": 3.260965453850806e-06, + "loss": 2.3404, + "step": 7828 + }, + { + "epoch": 0.4200107296137339, + "grad_norm": 1.4473384618759155, + "learning_rate": 3.260551653642441e-06, + "loss": 2.1902, + "step": 7829 + }, + { + "epoch": 0.42006437768240346, + "grad_norm": 1.462601661682129, + "learning_rate": 3.2601378304709187e-06, + "loss": 2.2708, + "step": 7830 + }, + { + "epoch": 0.420118025751073, + "grad_norm": 1.3779417276382446, + "learning_rate": 3.259723984348733e-06, + "loss": 1.6928, + "step": 7831 + }, + { + "epoch": 0.42017167381974246, + "grad_norm": 1.43612539768219, + "learning_rate": 3.25931011528838e-06, + "loss": 2.1442, + "step": 7832 + }, + { + "epoch": 0.420225321888412, + "grad_norm": 1.3620563745498657, + "learning_rate": 3.258896223302354e-06, + "loss": 2.2084, + "step": 7833 + }, + { + "epoch": 0.4202789699570815, + "grad_norm": 1.5250532627105713, + "learning_rate": 3.258482308403153e-06, + "loss": 2.4341, + "step": 7834 + }, + { + "epoch": 0.42033261802575106, + "grad_norm": 2.6247522830963135, + "learning_rate": 3.258068370603273e-06, + "loss": 2.2749, + "step": 7835 + }, + { + "epoch": 0.4203862660944206, + "grad_norm": 1.4167430400848389, + "learning_rate": 3.257654409915213e-06, + "loss": 2.227, + "step": 7836 + }, + { + "epoch": 0.4204399141630901, + "grad_norm": 1.2815651893615723, + "learning_rate": 3.257240426351471e-06, + "loss": 2.1727, + "step": 7837 + }, + { + "epoch": 0.42049356223175965, + "grad_norm": 1.3838121891021729, + "learning_rate": 3.256826419924547e-06, + "loss": 2.2096, + "step": 7838 + }, + { + "epoch": 0.4205472103004292, + "grad_norm": 1.2218873500823975, + "learning_rate": 3.2564123906469397e-06, + "loss": 2.2765, + "step": 7839 + }, + { + "epoch": 0.4206008583690987, + "grad_norm": 1.7815532684326172, + "learning_rate": 3.255998338531151e-06, + "loss": 2.4157, + "step": 7840 + }, + { + "epoch": 0.42065450643776825, + "grad_norm": 1.449270248413086, + "learning_rate": 3.255584263589682e-06, + "loss": 2.3282, + "step": 7841 + }, + { + "epoch": 0.4207081545064378, + "grad_norm": 1.115543007850647, + "learning_rate": 3.255170165835034e-06, + "loss": 1.9232, + "step": 7842 + }, + { + "epoch": 0.4207618025751073, + "grad_norm": 1.740797758102417, + "learning_rate": 3.2547560452797113e-06, + "loss": 2.3043, + "step": 7843 + }, + { + "epoch": 0.42081545064377684, + "grad_norm": 1.7855161428451538, + "learning_rate": 3.2543419019362155e-06, + "loss": 2.3906, + "step": 7844 + }, + { + "epoch": 0.4208690987124464, + "grad_norm": 1.693674087524414, + "learning_rate": 3.2539277358170524e-06, + "loss": 2.3956, + "step": 7845 + }, + { + "epoch": 0.42092274678111585, + "grad_norm": 1.2826745510101318, + "learning_rate": 3.253513546934725e-06, + "loss": 2.2256, + "step": 7846 + }, + { + "epoch": 0.4209763948497854, + "grad_norm": 1.374570369720459, + "learning_rate": 3.253099335301741e-06, + "loss": 2.1395, + "step": 7847 + }, + { + "epoch": 0.4210300429184549, + "grad_norm": 1.4869433641433716, + "learning_rate": 3.252685100930605e-06, + "loss": 2.3593, + "step": 7848 + }, + { + "epoch": 0.42108369098712445, + "grad_norm": 1.5450023412704468, + "learning_rate": 3.2522708438338247e-06, + "loss": 2.354, + "step": 7849 + }, + { + "epoch": 0.421137339055794, + "grad_norm": 1.4715144634246826, + "learning_rate": 3.251856564023907e-06, + "loss": 2.1362, + "step": 7850 + }, + { + "epoch": 0.4211909871244635, + "grad_norm": 1.3273662328720093, + "learning_rate": 3.2514422615133606e-06, + "loss": 2.3618, + "step": 7851 + }, + { + "epoch": 0.42124463519313304, + "grad_norm": 1.654830813407898, + "learning_rate": 3.2510279363146935e-06, + "loss": 2.3231, + "step": 7852 + }, + { + "epoch": 0.4212982832618026, + "grad_norm": 1.4779833555221558, + "learning_rate": 3.2506135884404165e-06, + "loss": 2.1846, + "step": 7853 + }, + { + "epoch": 0.4213519313304721, + "grad_norm": 2.04868483543396, + "learning_rate": 3.2501992179030407e-06, + "loss": 1.9854, + "step": 7854 + }, + { + "epoch": 0.42140557939914164, + "grad_norm": 1.6368457078933716, + "learning_rate": 3.2497848247150756e-06, + "loss": 2.277, + "step": 7855 + }, + { + "epoch": 0.42145922746781117, + "grad_norm": 1.4832419157028198, + "learning_rate": 3.2493704088890322e-06, + "loss": 2.2009, + "step": 7856 + }, + { + "epoch": 0.4215128755364807, + "grad_norm": 1.4578932523727417, + "learning_rate": 3.2489559704374246e-06, + "loss": 2.311, + "step": 7857 + }, + { + "epoch": 0.42156652360515023, + "grad_norm": 1.209839940071106, + "learning_rate": 3.2485415093727644e-06, + "loss": 1.9859, + "step": 7858 + }, + { + "epoch": 0.42162017167381977, + "grad_norm": 1.3903636932373047, + "learning_rate": 3.248127025707567e-06, + "loss": 2.2505, + "step": 7859 + }, + { + "epoch": 0.4216738197424893, + "grad_norm": 1.4310063123703003, + "learning_rate": 3.2477125194543455e-06, + "loss": 2.2085, + "step": 7860 + }, + { + "epoch": 0.4217274678111588, + "grad_norm": 1.6068100929260254, + "learning_rate": 3.247297990625615e-06, + "loss": 2.2471, + "step": 7861 + }, + { + "epoch": 0.4217811158798283, + "grad_norm": 1.6659319400787354, + "learning_rate": 3.246883439233892e-06, + "loss": 2.2696, + "step": 7862 + }, + { + "epoch": 0.42183476394849784, + "grad_norm": 1.5508545637130737, + "learning_rate": 3.2464688652916925e-06, + "loss": 2.0537, + "step": 7863 + }, + { + "epoch": 0.42188841201716737, + "grad_norm": 1.5647097826004028, + "learning_rate": 3.2460542688115336e-06, + "loss": 2.2793, + "step": 7864 + }, + { + "epoch": 0.4219420600858369, + "grad_norm": 1.505354642868042, + "learning_rate": 3.2456396498059333e-06, + "loss": 2.2549, + "step": 7865 + }, + { + "epoch": 0.42199570815450643, + "grad_norm": 1.646005392074585, + "learning_rate": 3.2452250082874097e-06, + "loss": 2.3178, + "step": 7866 + }, + { + "epoch": 0.42204935622317596, + "grad_norm": 1.848036527633667, + "learning_rate": 3.2448103442684833e-06, + "loss": 2.2551, + "step": 7867 + }, + { + "epoch": 0.4221030042918455, + "grad_norm": 1.2453501224517822, + "learning_rate": 3.2443956577616715e-06, + "loss": 2.4876, + "step": 7868 + }, + { + "epoch": 0.42215665236051503, + "grad_norm": 1.494310736656189, + "learning_rate": 3.243980948779497e-06, + "loss": 2.3788, + "step": 7869 + }, + { + "epoch": 0.42221030042918456, + "grad_norm": 1.5477315187454224, + "learning_rate": 3.24356621733448e-06, + "loss": 2.5244, + "step": 7870 + }, + { + "epoch": 0.4222639484978541, + "grad_norm": 1.6888422966003418, + "learning_rate": 3.2431514634391433e-06, + "loss": 2.0477, + "step": 7871 + }, + { + "epoch": 0.4223175965665236, + "grad_norm": 1.505963921546936, + "learning_rate": 3.2427366871060084e-06, + "loss": 2.3202, + "step": 7872 + }, + { + "epoch": 0.42237124463519315, + "grad_norm": 1.9772863388061523, + "learning_rate": 3.2423218883475995e-06, + "loss": 2.0451, + "step": 7873 + }, + { + "epoch": 0.4224248927038627, + "grad_norm": 1.5468308925628662, + "learning_rate": 3.2419070671764384e-06, + "loss": 2.429, + "step": 7874 + }, + { + "epoch": 0.42247854077253216, + "grad_norm": 14.570969581604004, + "learning_rate": 3.2414922236050526e-06, + "loss": 2.2608, + "step": 7875 + }, + { + "epoch": 0.4225321888412017, + "grad_norm": 1.220957636833191, + "learning_rate": 3.2410773576459665e-06, + "loss": 2.1449, + "step": 7876 + }, + { + "epoch": 0.4225858369098712, + "grad_norm": 1.4347996711730957, + "learning_rate": 3.240662469311705e-06, + "loss": 2.3518, + "step": 7877 + }, + { + "epoch": 0.42263948497854076, + "grad_norm": 1.6182658672332764, + "learning_rate": 3.2402475586147954e-06, + "loss": 2.2215, + "step": 7878 + }, + { + "epoch": 0.4226931330472103, + "grad_norm": 1.5469698905944824, + "learning_rate": 3.239832625567765e-06, + "loss": 1.8205, + "step": 7879 + }, + { + "epoch": 0.4227467811158798, + "grad_norm": 1.7662376165390015, + "learning_rate": 3.2394176701831414e-06, + "loss": 2.2882, + "step": 7880 + }, + { + "epoch": 0.42280042918454935, + "grad_norm": 1.5604840517044067, + "learning_rate": 3.2390026924734536e-06, + "loss": 2.199, + "step": 7881 + }, + { + "epoch": 0.4228540772532189, + "grad_norm": 1.6685742139816284, + "learning_rate": 3.238587692451231e-06, + "loss": 2.0147, + "step": 7882 + }, + { + "epoch": 0.4229077253218884, + "grad_norm": 1.3347595930099487, + "learning_rate": 3.238172670129004e-06, + "loss": 2.0398, + "step": 7883 + }, + { + "epoch": 0.42296137339055795, + "grad_norm": 1.6309458017349243, + "learning_rate": 3.237757625519302e-06, + "loss": 2.3035, + "step": 7884 + }, + { + "epoch": 0.4230150214592275, + "grad_norm": 1.3982563018798828, + "learning_rate": 3.2373425586346576e-06, + "loss": 2.1135, + "step": 7885 + }, + { + "epoch": 0.423068669527897, + "grad_norm": 1.4071725606918335, + "learning_rate": 3.236927469487602e-06, + "loss": 1.2593, + "step": 7886 + }, + { + "epoch": 0.42312231759656654, + "grad_norm": 1.4695463180541992, + "learning_rate": 3.236512358090668e-06, + "loss": 2.4353, + "step": 7887 + }, + { + "epoch": 0.4231759656652361, + "grad_norm": 1.6847201585769653, + "learning_rate": 3.2360972244563897e-06, + "loss": 2.2788, + "step": 7888 + }, + { + "epoch": 0.42322961373390555, + "grad_norm": 1.1999304294586182, + "learning_rate": 3.2356820685973013e-06, + "loss": 2.0888, + "step": 7889 + }, + { + "epoch": 0.4232832618025751, + "grad_norm": 1.598694086074829, + "learning_rate": 3.235266890525936e-06, + "loss": 2.6165, + "step": 7890 + }, + { + "epoch": 0.4233369098712446, + "grad_norm": 1.770013451576233, + "learning_rate": 3.23485169025483e-06, + "loss": 2.2585, + "step": 7891 + }, + { + "epoch": 0.42339055793991415, + "grad_norm": 1.5648910999298096, + "learning_rate": 3.234436467796519e-06, + "loss": 2.3142, + "step": 7892 + }, + { + "epoch": 0.4234442060085837, + "grad_norm": 1.5265071392059326, + "learning_rate": 3.2340212231635403e-06, + "loss": 2.3901, + "step": 7893 + }, + { + "epoch": 0.4234978540772532, + "grad_norm": 1.6010373830795288, + "learning_rate": 3.233605956368432e-06, + "loss": 2.4443, + "step": 7894 + }, + { + "epoch": 0.42355150214592274, + "grad_norm": 1.4509798288345337, + "learning_rate": 3.2331906674237303e-06, + "loss": 2.2205, + "step": 7895 + }, + { + "epoch": 0.4236051502145923, + "grad_norm": 1.3754351139068604, + "learning_rate": 3.2327753563419745e-06, + "loss": 2.2816, + "step": 7896 + }, + { + "epoch": 0.4236587982832618, + "grad_norm": 1.5160404443740845, + "learning_rate": 3.2323600231357045e-06, + "loss": 2.4327, + "step": 7897 + }, + { + "epoch": 0.42371244635193134, + "grad_norm": 2.018413782119751, + "learning_rate": 3.2319446678174603e-06, + "loss": 2.3792, + "step": 7898 + }, + { + "epoch": 0.42376609442060087, + "grad_norm": 13.057031631469727, + "learning_rate": 3.231529290399783e-06, + "loss": 1.9806, + "step": 7899 + }, + { + "epoch": 0.4238197424892704, + "grad_norm": 1.4720447063446045, + "learning_rate": 3.2311138908952127e-06, + "loss": 2.2819, + "step": 7900 + }, + { + "epoch": 0.42387339055793993, + "grad_norm": 1.475183367729187, + "learning_rate": 3.2306984693162924e-06, + "loss": 2.4205, + "step": 7901 + }, + { + "epoch": 0.42392703862660946, + "grad_norm": 1.7435846328735352, + "learning_rate": 3.230283025675565e-06, + "loss": 2.4245, + "step": 7902 + }, + { + "epoch": 0.423980686695279, + "grad_norm": 1.8998814821243286, + "learning_rate": 3.229867559985573e-06, + "loss": 1.7364, + "step": 7903 + }, + { + "epoch": 0.4240343347639485, + "grad_norm": 1.5368393659591675, + "learning_rate": 3.2294520722588617e-06, + "loss": 2.2615, + "step": 7904 + }, + { + "epoch": 0.424087982832618, + "grad_norm": 1.66306471824646, + "learning_rate": 3.2290365625079743e-06, + "loss": 2.2009, + "step": 7905 + }, + { + "epoch": 0.42414163090128754, + "grad_norm": 1.5488022565841675, + "learning_rate": 3.2286210307454578e-06, + "loss": 2.1071, + "step": 7906 + }, + { + "epoch": 0.42419527896995707, + "grad_norm": 1.4632186889648438, + "learning_rate": 3.2282054769838574e-06, + "loss": 2.1968, + "step": 7907 + }, + { + "epoch": 0.4242489270386266, + "grad_norm": 1.6395875215530396, + "learning_rate": 3.2277899012357195e-06, + "loss": 2.714, + "step": 7908 + }, + { + "epoch": 0.42430257510729613, + "grad_norm": 2.312563180923462, + "learning_rate": 3.2273743035135924e-06, + "loss": 2.1572, + "step": 7909 + }, + { + "epoch": 0.42435622317596566, + "grad_norm": 1.6313565969467163, + "learning_rate": 3.226958683830023e-06, + "loss": 2.2211, + "step": 7910 + }, + { + "epoch": 0.4244098712446352, + "grad_norm": 1.3234349489212036, + "learning_rate": 3.226543042197561e-06, + "loss": 2.3296, + "step": 7911 + }, + { + "epoch": 0.4244635193133047, + "grad_norm": 1.2776800394058228, + "learning_rate": 3.2261273786287563e-06, + "loss": 2.1084, + "step": 7912 + }, + { + "epoch": 0.42451716738197426, + "grad_norm": 1.4412834644317627, + "learning_rate": 3.225711693136156e-06, + "loss": 2.446, + "step": 7913 + }, + { + "epoch": 0.4245708154506438, + "grad_norm": 1.4672930240631104, + "learning_rate": 3.225295985732314e-06, + "loss": 2.3424, + "step": 7914 + }, + { + "epoch": 0.4246244635193133, + "grad_norm": 1.4091933965682983, + "learning_rate": 3.2248802564297805e-06, + "loss": 2.5654, + "step": 7915 + }, + { + "epoch": 0.42467811158798285, + "grad_norm": 1.3048096895217896, + "learning_rate": 3.2244645052411076e-06, + "loss": 1.5963, + "step": 7916 + }, + { + "epoch": 0.4247317596566524, + "grad_norm": 1.6387568712234497, + "learning_rate": 3.2240487321788478e-06, + "loss": 1.845, + "step": 7917 + }, + { + "epoch": 0.42478540772532186, + "grad_norm": 1.5015496015548706, + "learning_rate": 3.223632937255554e-06, + "loss": 2.2718, + "step": 7918 + }, + { + "epoch": 0.4248390557939914, + "grad_norm": 1.491241455078125, + "learning_rate": 3.2232171204837813e-06, + "loss": 2.2188, + "step": 7919 + }, + { + "epoch": 0.4248927038626609, + "grad_norm": 1.4673899412155151, + "learning_rate": 3.2228012818760834e-06, + "loss": 2.2987, + "step": 7920 + }, + { + "epoch": 0.42494635193133046, + "grad_norm": 1.4714573621749878, + "learning_rate": 3.222385421445016e-06, + "loss": 2.3212, + "step": 7921 + }, + { + "epoch": 0.425, + "grad_norm": 1.4303195476531982, + "learning_rate": 3.2219695392031354e-06, + "loss": 2.3302, + "step": 7922 + }, + { + "epoch": 0.4250536480686695, + "grad_norm": 1.594266653060913, + "learning_rate": 3.2215536351629978e-06, + "loss": 2.3327, + "step": 7923 + }, + { + "epoch": 0.42510729613733905, + "grad_norm": 1.4561924934387207, + "learning_rate": 3.2211377093371605e-06, + "loss": 2.309, + "step": 7924 + }, + { + "epoch": 0.4251609442060086, + "grad_norm": 1.4469399452209473, + "learning_rate": 3.2207217617381816e-06, + "loss": 1.7639, + "step": 7925 + }, + { + "epoch": 0.4252145922746781, + "grad_norm": 1.7967528104782104, + "learning_rate": 3.2203057923786196e-06, + "loss": 2.1646, + "step": 7926 + }, + { + "epoch": 0.42526824034334765, + "grad_norm": 1.5870404243469238, + "learning_rate": 3.2198898012710332e-06, + "loss": 2.1662, + "step": 7927 + }, + { + "epoch": 0.4253218884120172, + "grad_norm": 2.0270278453826904, + "learning_rate": 3.2194737884279838e-06, + "loss": 2.3048, + "step": 7928 + }, + { + "epoch": 0.4253755364806867, + "grad_norm": 1.5702199935913086, + "learning_rate": 3.2190577538620315e-06, + "loss": 2.4056, + "step": 7929 + }, + { + "epoch": 0.42542918454935624, + "grad_norm": 1.6694493293762207, + "learning_rate": 3.218641697585736e-06, + "loss": 2.4031, + "step": 7930 + }, + { + "epoch": 0.4254828326180258, + "grad_norm": 1.6185957193374634, + "learning_rate": 3.2182256196116614e-06, + "loss": 2.4621, + "step": 7931 + }, + { + "epoch": 0.4255364806866953, + "grad_norm": 1.3104718923568726, + "learning_rate": 3.2178095199523685e-06, + "loss": 2.2408, + "step": 7932 + }, + { + "epoch": 0.4255901287553648, + "grad_norm": 1.5669573545455933, + "learning_rate": 3.2173933986204214e-06, + "loss": 1.6082, + "step": 7933 + }, + { + "epoch": 0.4256437768240343, + "grad_norm": 1.6291731595993042, + "learning_rate": 3.216977255628384e-06, + "loss": 2.2555, + "step": 7934 + }, + { + "epoch": 0.42569742489270385, + "grad_norm": 2.208658218383789, + "learning_rate": 3.2165610909888203e-06, + "loss": 2.2541, + "step": 7935 + }, + { + "epoch": 0.4257510729613734, + "grad_norm": 1.6828049421310425, + "learning_rate": 3.2161449047142947e-06, + "loss": 2.4421, + "step": 7936 + }, + { + "epoch": 0.4258047210300429, + "grad_norm": 1.5687333345413208, + "learning_rate": 3.2157286968173752e-06, + "loss": 2.1733, + "step": 7937 + }, + { + "epoch": 0.42585836909871244, + "grad_norm": 1.4879565238952637, + "learning_rate": 3.2153124673106273e-06, + "loss": 2.4444, + "step": 7938 + }, + { + "epoch": 0.425912017167382, + "grad_norm": 2.1739695072174072, + "learning_rate": 3.2148962162066178e-06, + "loss": 2.3723, + "step": 7939 + }, + { + "epoch": 0.4259656652360515, + "grad_norm": 1.2147653102874756, + "learning_rate": 3.214479943517914e-06, + "loss": 2.073, + "step": 7940 + }, + { + "epoch": 0.42601931330472104, + "grad_norm": 2.1297762393951416, + "learning_rate": 3.2140636492570855e-06, + "loss": 2.2192, + "step": 7941 + }, + { + "epoch": 0.42607296137339057, + "grad_norm": 1.8674724102020264, + "learning_rate": 3.2136473334367002e-06, + "loss": 2.1992, + "step": 7942 + }, + { + "epoch": 0.4261266094420601, + "grad_norm": 1.4309717416763306, + "learning_rate": 3.2132309960693286e-06, + "loss": 2.3467, + "step": 7943 + }, + { + "epoch": 0.42618025751072963, + "grad_norm": 1.186553955078125, + "learning_rate": 3.2128146371675406e-06, + "loss": 2.0302, + "step": 7944 + }, + { + "epoch": 0.42623390557939916, + "grad_norm": 3.238542318344116, + "learning_rate": 3.2123982567439083e-06, + "loss": 2.2807, + "step": 7945 + }, + { + "epoch": 0.4262875536480687, + "grad_norm": 1.7200255393981934, + "learning_rate": 3.2119818548110014e-06, + "loss": 2.3654, + "step": 7946 + }, + { + "epoch": 0.42634120171673817, + "grad_norm": 1.668556571006775, + "learning_rate": 3.2115654313813947e-06, + "loss": 2.3138, + "step": 7947 + }, + { + "epoch": 0.4263948497854077, + "grad_norm": 1.6898459196090698, + "learning_rate": 3.2111489864676593e-06, + "loss": 2.22, + "step": 7948 + }, + { + "epoch": 0.42644849785407724, + "grad_norm": 1.4441487789154053, + "learning_rate": 3.210732520082369e-06, + "loss": 2.2212, + "step": 7949 + }, + { + "epoch": 0.42650214592274677, + "grad_norm": 1.7992478609085083, + "learning_rate": 3.2103160322380994e-06, + "loss": 2.295, + "step": 7950 + }, + { + "epoch": 0.4265557939914163, + "grad_norm": 3.8099753856658936, + "learning_rate": 3.2098995229474243e-06, + "loss": 2.2366, + "step": 7951 + }, + { + "epoch": 0.42660944206008583, + "grad_norm": 1.5444141626358032, + "learning_rate": 3.2094829922229192e-06, + "loss": 2.2688, + "step": 7952 + }, + { + "epoch": 0.42666309012875536, + "grad_norm": 1.6627299785614014, + "learning_rate": 3.2090664400771606e-06, + "loss": 1.4601, + "step": 7953 + }, + { + "epoch": 0.4267167381974249, + "grad_norm": 1.5748521089553833, + "learning_rate": 3.208649866522725e-06, + "loss": 2.4799, + "step": 7954 + }, + { + "epoch": 0.4267703862660944, + "grad_norm": 1.4269745349884033, + "learning_rate": 3.208233271572191e-06, + "loss": 1.9462, + "step": 7955 + }, + { + "epoch": 0.42682403433476396, + "grad_norm": 1.6973297595977783, + "learning_rate": 3.2078166552381357e-06, + "loss": 2.0892, + "step": 7956 + }, + { + "epoch": 0.4268776824034335, + "grad_norm": 1.4444034099578857, + "learning_rate": 3.207400017533138e-06, + "loss": 2.2103, + "step": 7957 + }, + { + "epoch": 0.426931330472103, + "grad_norm": 1.4388043880462646, + "learning_rate": 3.206983358469778e-06, + "loss": 2.12, + "step": 7958 + }, + { + "epoch": 0.42698497854077255, + "grad_norm": 1.2842705249786377, + "learning_rate": 3.206566678060634e-06, + "loss": 1.9788, + "step": 7959 + }, + { + "epoch": 0.4270386266094421, + "grad_norm": 1.4425103664398193, + "learning_rate": 3.20614997631829e-06, + "loss": 2.5978, + "step": 7960 + }, + { + "epoch": 0.42709227467811156, + "grad_norm": 3.031583547592163, + "learning_rate": 3.2057332532553244e-06, + "loss": 2.114, + "step": 7961 + }, + { + "epoch": 0.4271459227467811, + "grad_norm": 1.9036043882369995, + "learning_rate": 3.2053165088843204e-06, + "loss": 2.215, + "step": 7962 + }, + { + "epoch": 0.4271995708154506, + "grad_norm": 1.4143015146255493, + "learning_rate": 3.2048997432178607e-06, + "loss": 2.2827, + "step": 7963 + }, + { + "epoch": 0.42725321888412016, + "grad_norm": 1.1381113529205322, + "learning_rate": 3.204482956268528e-06, + "loss": 2.1706, + "step": 7964 + }, + { + "epoch": 0.4273068669527897, + "grad_norm": 1.4916390180587769, + "learning_rate": 3.2040661480489072e-06, + "loss": 2.1738, + "step": 7965 + }, + { + "epoch": 0.4273605150214592, + "grad_norm": 1.4070395231246948, + "learning_rate": 3.203649318571582e-06, + "loss": 2.3459, + "step": 7966 + }, + { + "epoch": 0.42741416309012875, + "grad_norm": 1.7049872875213623, + "learning_rate": 3.2032324678491385e-06, + "loss": 2.0688, + "step": 7967 + }, + { + "epoch": 0.4274678111587983, + "grad_norm": 1.3313552141189575, + "learning_rate": 3.2028155958941614e-06, + "loss": 2.285, + "step": 7968 + }, + { + "epoch": 0.4275214592274678, + "grad_norm": 1.6493017673492432, + "learning_rate": 3.2023987027192383e-06, + "loss": 2.1024, + "step": 7969 + }, + { + "epoch": 0.42757510729613735, + "grad_norm": 1.3663749694824219, + "learning_rate": 3.2019817883369565e-06, + "loss": 2.422, + "step": 7970 + }, + { + "epoch": 0.4276287553648069, + "grad_norm": 1.2619808912277222, + "learning_rate": 3.2015648527599024e-06, + "loss": 1.9022, + "step": 7971 + }, + { + "epoch": 0.4276824034334764, + "grad_norm": 1.6836705207824707, + "learning_rate": 3.201147896000666e-06, + "loss": 2.3715, + "step": 7972 + }, + { + "epoch": 0.42773605150214594, + "grad_norm": 1.832632303237915, + "learning_rate": 3.2007309180718354e-06, + "loss": 2.5257, + "step": 7973 + }, + { + "epoch": 0.4277896995708155, + "grad_norm": 1.4698035717010498, + "learning_rate": 3.200313918986001e-06, + "loss": 2.2253, + "step": 7974 + }, + { + "epoch": 0.427843347639485, + "grad_norm": 1.4707525968551636, + "learning_rate": 3.1998968987557516e-06, + "loss": 2.3533, + "step": 7975 + }, + { + "epoch": 0.4278969957081545, + "grad_norm": 1.6735259294509888, + "learning_rate": 3.1994798573936803e-06, + "loss": 2.2455, + "step": 7976 + }, + { + "epoch": 0.427950643776824, + "grad_norm": 1.4980757236480713, + "learning_rate": 3.1990627949123777e-06, + "loss": 2.1231, + "step": 7977 + }, + { + "epoch": 0.42800429184549355, + "grad_norm": 1.5043152570724487, + "learning_rate": 3.198645711324436e-06, + "loss": 2.4233, + "step": 7978 + }, + { + "epoch": 0.4280579399141631, + "grad_norm": 1.4911528825759888, + "learning_rate": 3.1982286066424486e-06, + "loss": 2.1688, + "step": 7979 + }, + { + "epoch": 0.4281115879828326, + "grad_norm": 1.5213937759399414, + "learning_rate": 3.1978114808790083e-06, + "loss": 2.3515, + "step": 7980 + }, + { + "epoch": 0.42816523605150214, + "grad_norm": 6.6434736251831055, + "learning_rate": 3.1973943340467097e-06, + "loss": 2.4487, + "step": 7981 + }, + { + "epoch": 0.4282188841201717, + "grad_norm": 1.594068169593811, + "learning_rate": 3.196977166158148e-06, + "loss": 2.5707, + "step": 7982 + }, + { + "epoch": 0.4282725321888412, + "grad_norm": 1.7088065147399902, + "learning_rate": 3.1965599772259177e-06, + "loss": 2.2715, + "step": 7983 + }, + { + "epoch": 0.42832618025751074, + "grad_norm": 1.621443748474121, + "learning_rate": 3.1961427672626154e-06, + "loss": 2.2576, + "step": 7984 + }, + { + "epoch": 0.42837982832618027, + "grad_norm": 1.5756698846817017, + "learning_rate": 3.195725536280839e-06, + "loss": 2.325, + "step": 7985 + }, + { + "epoch": 0.4284334763948498, + "grad_norm": 1.4848440885543823, + "learning_rate": 3.1953082842931836e-06, + "loss": 2.3485, + "step": 7986 + }, + { + "epoch": 0.42848712446351933, + "grad_norm": 1.435317873954773, + "learning_rate": 3.1948910113122483e-06, + "loss": 2.3878, + "step": 7987 + }, + { + "epoch": 0.42854077253218886, + "grad_norm": 4.105795860290527, + "learning_rate": 3.1944737173506326e-06, + "loss": 2.2217, + "step": 7988 + }, + { + "epoch": 0.4285944206008584, + "grad_norm": 1.3488889932632446, + "learning_rate": 3.1940564024209344e-06, + "loss": 2.3052, + "step": 7989 + }, + { + "epoch": 0.42864806866952787, + "grad_norm": 1.4940874576568604, + "learning_rate": 3.1936390665357537e-06, + "loss": 2.1783, + "step": 7990 + }, + { + "epoch": 0.4287017167381974, + "grad_norm": 1.4665088653564453, + "learning_rate": 3.1932217097076923e-06, + "loss": 2.3841, + "step": 7991 + }, + { + "epoch": 0.42875536480686693, + "grad_norm": 1.5305167436599731, + "learning_rate": 3.1928043319493498e-06, + "loss": 2.126, + "step": 7992 + }, + { + "epoch": 0.42880901287553647, + "grad_norm": 1.5216519832611084, + "learning_rate": 3.1923869332733283e-06, + "loss": 2.158, + "step": 7993 + }, + { + "epoch": 0.428862660944206, + "grad_norm": 1.3009028434753418, + "learning_rate": 3.1919695136922313e-06, + "loss": 2.323, + "step": 7994 + }, + { + "epoch": 0.42891630901287553, + "grad_norm": 1.466546893119812, + "learning_rate": 3.1915520732186613e-06, + "loss": 2.5036, + "step": 7995 + }, + { + "epoch": 0.42896995708154506, + "grad_norm": 1.498923897743225, + "learning_rate": 3.1911346118652214e-06, + "loss": 2.3576, + "step": 7996 + }, + { + "epoch": 0.4290236051502146, + "grad_norm": 1.6452248096466064, + "learning_rate": 3.1907171296445163e-06, + "loss": 2.6337, + "step": 7997 + }, + { + "epoch": 0.4290772532188841, + "grad_norm": 1.4706716537475586, + "learning_rate": 3.1902996265691505e-06, + "loss": 1.9872, + "step": 7998 + }, + { + "epoch": 0.42913090128755366, + "grad_norm": 11.947999954223633, + "learning_rate": 3.1898821026517307e-06, + "loss": 2.3268, + "step": 7999 + }, + { + "epoch": 0.4291845493562232, + "grad_norm": 1.314054012298584, + "learning_rate": 3.1894645579048626e-06, + "loss": 2.2234, + "step": 8000 + }, + { + "epoch": 0.4292381974248927, + "grad_norm": 1.4083292484283447, + "learning_rate": 3.1890469923411528e-06, + "loss": 2.2501, + "step": 8001 + }, + { + "epoch": 0.42929184549356225, + "grad_norm": 1.369756817817688, + "learning_rate": 3.1886294059732088e-06, + "loss": 2.1918, + "step": 8002 + }, + { + "epoch": 0.4293454935622318, + "grad_norm": 1.4302895069122314, + "learning_rate": 3.1882117988136387e-06, + "loss": 1.8147, + "step": 8003 + }, + { + "epoch": 0.42939914163090126, + "grad_norm": 1.4577451944351196, + "learning_rate": 3.1877941708750503e-06, + "loss": 2.2223, + "step": 8004 + }, + { + "epoch": 0.4294527896995708, + "grad_norm": 1.8986947536468506, + "learning_rate": 3.187376522170055e-06, + "loss": 2.2903, + "step": 8005 + }, + { + "epoch": 0.4295064377682403, + "grad_norm": 2.042292594909668, + "learning_rate": 3.1869588527112616e-06, + "loss": 2.4951, + "step": 8006 + }, + { + "epoch": 0.42956008583690986, + "grad_norm": 1.350043773651123, + "learning_rate": 3.18654116251128e-06, + "loss": 2.2498, + "step": 8007 + }, + { + "epoch": 0.4296137339055794, + "grad_norm": 1.6261391639709473, + "learning_rate": 3.1861234515827226e-06, + "loss": 2.3849, + "step": 8008 + }, + { + "epoch": 0.4296673819742489, + "grad_norm": 1.575932264328003, + "learning_rate": 3.185705719938201e-06, + "loss": 2.2317, + "step": 8009 + }, + { + "epoch": 0.42972103004291845, + "grad_norm": 1.3975958824157715, + "learning_rate": 3.1852879675903276e-06, + "loss": 1.9177, + "step": 8010 + }, + { + "epoch": 0.429774678111588, + "grad_norm": 1.4269347190856934, + "learning_rate": 3.1848701945517148e-06, + "loss": 2.0137, + "step": 8011 + }, + { + "epoch": 0.4298283261802575, + "grad_norm": 1.3697351217269897, + "learning_rate": 3.1844524008349774e-06, + "loss": 2.2294, + "step": 8012 + }, + { + "epoch": 0.42988197424892705, + "grad_norm": 1.399917721748352, + "learning_rate": 3.1840345864527296e-06, + "loss": 1.5899, + "step": 8013 + }, + { + "epoch": 0.4299356223175966, + "grad_norm": 1.3451400995254517, + "learning_rate": 3.1836167514175853e-06, + "loss": 2.2491, + "step": 8014 + }, + { + "epoch": 0.4299892703862661, + "grad_norm": 1.4327765703201294, + "learning_rate": 3.1831988957421607e-06, + "loss": 2.0152, + "step": 8015 + }, + { + "epoch": 0.43004291845493564, + "grad_norm": 1.386083960533142, + "learning_rate": 3.1827810194390724e-06, + "loss": 2.2569, + "step": 8016 + }, + { + "epoch": 0.4300965665236052, + "grad_norm": 1.4172297716140747, + "learning_rate": 3.1823631225209376e-06, + "loss": 2.4062, + "step": 8017 + }, + { + "epoch": 0.4301502145922747, + "grad_norm": 1.2792010307312012, + "learning_rate": 3.181945205000373e-06, + "loss": 2.1647, + "step": 8018 + }, + { + "epoch": 0.4302038626609442, + "grad_norm": 1.509142279624939, + "learning_rate": 3.181527266889996e-06, + "loss": 2.1551, + "step": 8019 + }, + { + "epoch": 0.4302575107296137, + "grad_norm": 1.5188751220703125, + "learning_rate": 3.181109308202427e-06, + "loss": 2.3228, + "step": 8020 + }, + { + "epoch": 0.43031115879828324, + "grad_norm": 1.6845483779907227, + "learning_rate": 3.1806913289502835e-06, + "loss": 2.0341, + "step": 8021 + }, + { + "epoch": 0.4303648068669528, + "grad_norm": 1.729885220527649, + "learning_rate": 3.1802733291461873e-06, + "loss": 2.4676, + "step": 8022 + }, + { + "epoch": 0.4304184549356223, + "grad_norm": 1.4221220016479492, + "learning_rate": 3.179855308802758e-06, + "loss": 2.0917, + "step": 8023 + }, + { + "epoch": 0.43047210300429184, + "grad_norm": 1.7407376766204834, + "learning_rate": 3.1794372679326165e-06, + "loss": 1.9121, + "step": 8024 + }, + { + "epoch": 0.43052575107296137, + "grad_norm": 1.5793811082839966, + "learning_rate": 3.1790192065483848e-06, + "loss": 2.2189, + "step": 8025 + }, + { + "epoch": 0.4305793991416309, + "grad_norm": 1.5141501426696777, + "learning_rate": 3.1786011246626858e-06, + "loss": 2.2607, + "step": 8026 + }, + { + "epoch": 0.43063304721030043, + "grad_norm": 1.2994771003723145, + "learning_rate": 3.1781830222881428e-06, + "loss": 1.9633, + "step": 8027 + }, + { + "epoch": 0.43068669527896997, + "grad_norm": 1.6522879600524902, + "learning_rate": 3.1777648994373783e-06, + "loss": 2.4687, + "step": 8028 + }, + { + "epoch": 0.4307403433476395, + "grad_norm": 1.5024014711380005, + "learning_rate": 3.1773467561230174e-06, + "loss": 2.2261, + "step": 8029 + }, + { + "epoch": 0.43079399141630903, + "grad_norm": 1.7797702550888062, + "learning_rate": 3.1769285923576855e-06, + "loss": 2.2542, + "step": 8030 + }, + { + "epoch": 0.43084763948497856, + "grad_norm": 1.5361511707305908, + "learning_rate": 3.176510408154006e-06, + "loss": 2.2267, + "step": 8031 + }, + { + "epoch": 0.4309012875536481, + "grad_norm": 1.4598067998886108, + "learning_rate": 3.176092203524607e-06, + "loss": 2.4629, + "step": 8032 + }, + { + "epoch": 0.43095493562231757, + "grad_norm": 1.435252070426941, + "learning_rate": 3.175673978482115e-06, + "loss": 2.2541, + "step": 8033 + }, + { + "epoch": 0.4310085836909871, + "grad_norm": 1.5211689472198486, + "learning_rate": 3.1752557330391572e-06, + "loss": 2.0924, + "step": 8034 + }, + { + "epoch": 0.43106223175965663, + "grad_norm": 1.5999082326889038, + "learning_rate": 3.1748374672083614e-06, + "loss": 2.3852, + "step": 8035 + }, + { + "epoch": 0.43111587982832617, + "grad_norm": 1.5071624517440796, + "learning_rate": 3.1744191810023565e-06, + "loss": 2.3892, + "step": 8036 + }, + { + "epoch": 0.4311695278969957, + "grad_norm": 1.739762306213379, + "learning_rate": 3.1740008744337707e-06, + "loss": 2.2346, + "step": 8037 + }, + { + "epoch": 0.43122317596566523, + "grad_norm": 1.350911021232605, + "learning_rate": 3.1735825475152356e-06, + "loss": 2.4741, + "step": 8038 + }, + { + "epoch": 0.43127682403433476, + "grad_norm": 1.494175910949707, + "learning_rate": 3.1731642002593802e-06, + "loss": 2.3974, + "step": 8039 + }, + { + "epoch": 0.4313304721030043, + "grad_norm": 1.1515116691589355, + "learning_rate": 3.1727458326788363e-06, + "loss": 2.3932, + "step": 8040 + }, + { + "epoch": 0.4313841201716738, + "grad_norm": 1.5171592235565186, + "learning_rate": 3.172327444786235e-06, + "loss": 2.2811, + "step": 8041 + }, + { + "epoch": 0.43143776824034336, + "grad_norm": 1.3103886842727661, + "learning_rate": 3.1719090365942095e-06, + "loss": 2.0276, + "step": 8042 + }, + { + "epoch": 0.4314914163090129, + "grad_norm": 1.386548399925232, + "learning_rate": 3.1714906081153907e-06, + "loss": 2.3667, + "step": 8043 + }, + { + "epoch": 0.4315450643776824, + "grad_norm": 1.517061471939087, + "learning_rate": 3.171072159362415e-06, + "loss": 2.2647, + "step": 8044 + }, + { + "epoch": 0.43159871244635195, + "grad_norm": 1.529171347618103, + "learning_rate": 3.1706536903479145e-06, + "loss": 2.1917, + "step": 8045 + }, + { + "epoch": 0.4316523605150215, + "grad_norm": 1.81204354763031, + "learning_rate": 3.170235201084525e-06, + "loss": 2.1401, + "step": 8046 + }, + { + "epoch": 0.431706008583691, + "grad_norm": 1.5169239044189453, + "learning_rate": 3.1698166915848803e-06, + "loss": 2.3555, + "step": 8047 + }, + { + "epoch": 0.4317596566523605, + "grad_norm": 5.320855617523193, + "learning_rate": 3.169398161861618e-06, + "loss": 2.5015, + "step": 8048 + }, + { + "epoch": 0.43181330472103, + "grad_norm": 1.3677281141281128, + "learning_rate": 3.168979611927374e-06, + "loss": 2.216, + "step": 8049 + }, + { + "epoch": 0.43186695278969955, + "grad_norm": NaN, + "learning_rate": 3.168979611927374e-06, + "loss": 2.3939, + "step": 8050 + }, + { + "epoch": 0.4319206008583691, + "grad_norm": 1.5220837593078613, + "learning_rate": 3.168561041794786e-06, + "loss": 2.2553, + "step": 8051 + }, + { + "epoch": 0.4319742489270386, + "grad_norm": 2.643423080444336, + "learning_rate": 3.168142451476491e-06, + "loss": 2.2729, + "step": 8052 + }, + { + "epoch": 0.43202789699570815, + "grad_norm": 1.608709692955017, + "learning_rate": 3.1677238409851275e-06, + "loss": 2.1935, + "step": 8053 + }, + { + "epoch": 0.4320815450643777, + "grad_norm": 1.6616311073303223, + "learning_rate": 3.1673052103333356e-06, + "loss": 2.5221, + "step": 8054 + }, + { + "epoch": 0.4321351931330472, + "grad_norm": 1.3503551483154297, + "learning_rate": 3.166886559533753e-06, + "loss": 2.3642, + "step": 8055 + }, + { + "epoch": 0.43218884120171674, + "grad_norm": 1.468050241470337, + "learning_rate": 3.166467888599021e-06, + "loss": 2.3, + "step": 8056 + }, + { + "epoch": 0.4322424892703863, + "grad_norm": 2.0955734252929688, + "learning_rate": 3.166049197541781e-06, + "loss": 2.4092, + "step": 8057 + }, + { + "epoch": 0.4322961373390558, + "grad_norm": 2.0736489295959473, + "learning_rate": 3.1656304863746732e-06, + "loss": 2.3789, + "step": 8058 + }, + { + "epoch": 0.43234978540772534, + "grad_norm": 1.6692017316818237, + "learning_rate": 3.165211755110341e-06, + "loss": 2.4718, + "step": 8059 + }, + { + "epoch": 0.43240343347639487, + "grad_norm": 1.5809335708618164, + "learning_rate": 3.164793003761426e-06, + "loss": 2.0613, + "step": 8060 + }, + { + "epoch": 0.4324570815450644, + "grad_norm": 1.4521249532699585, + "learning_rate": 3.1643742323405715e-06, + "loss": 2.3367, + "step": 8061 + }, + { + "epoch": 0.4325107296137339, + "grad_norm": 1.6124294996261597, + "learning_rate": 3.1639554408604227e-06, + "loss": 2.2778, + "step": 8062 + }, + { + "epoch": 0.4325643776824034, + "grad_norm": 1.4774208068847656, + "learning_rate": 3.163536629333622e-06, + "loss": 2.315, + "step": 8063 + }, + { + "epoch": 0.43261802575107294, + "grad_norm": 1.2863355875015259, + "learning_rate": 3.1631177977728167e-06, + "loss": 2.0677, + "step": 8064 + }, + { + "epoch": 0.4326716738197425, + "grad_norm": 1.4312041997909546, + "learning_rate": 3.1626989461906506e-06, + "loss": 2.2243, + "step": 8065 + }, + { + "epoch": 0.432725321888412, + "grad_norm": 1.246522307395935, + "learning_rate": 3.1622800745997706e-06, + "loss": 2.2944, + "step": 8066 + }, + { + "epoch": 0.43277896995708154, + "grad_norm": 1.9669450521469116, + "learning_rate": 3.1618611830128233e-06, + "loss": 2.2726, + "step": 8067 + }, + { + "epoch": 0.43283261802575107, + "grad_norm": 2.2055838108062744, + "learning_rate": 3.1614422714424575e-06, + "loss": 2.4171, + "step": 8068 + }, + { + "epoch": 0.4328862660944206, + "grad_norm": 1.502489447593689, + "learning_rate": 3.1610233399013197e-06, + "loss": 2.0701, + "step": 8069 + }, + { + "epoch": 0.43293991416309013, + "grad_norm": 1.6288349628448486, + "learning_rate": 3.16060438840206e-06, + "loss": 2.239, + "step": 8070 + }, + { + "epoch": 0.43299356223175967, + "grad_norm": 1.597580909729004, + "learning_rate": 3.1601854169573267e-06, + "loss": 2.4971, + "step": 8071 + }, + { + "epoch": 0.4330472103004292, + "grad_norm": 1.6797171831130981, + "learning_rate": 3.1597664255797694e-06, + "loss": 2.3954, + "step": 8072 + }, + { + "epoch": 0.43310085836909873, + "grad_norm": 1.3544557094573975, + "learning_rate": 3.1593474142820398e-06, + "loss": 2.1815, + "step": 8073 + }, + { + "epoch": 0.43315450643776826, + "grad_norm": 1.5037949085235596, + "learning_rate": 3.158928383076788e-06, + "loss": 2.2685, + "step": 8074 + }, + { + "epoch": 0.4332081545064378, + "grad_norm": 1.5078909397125244, + "learning_rate": 3.1585093319766667e-06, + "loss": 2.3395, + "step": 8075 + }, + { + "epoch": 0.43326180257510727, + "grad_norm": 1.4763667583465576, + "learning_rate": 3.1580902609943283e-06, + "loss": 2.1566, + "step": 8076 + }, + { + "epoch": 0.4333154506437768, + "grad_norm": 1.6447101831436157, + "learning_rate": 3.1576711701424236e-06, + "loss": 2.2202, + "step": 8077 + }, + { + "epoch": 0.43336909871244633, + "grad_norm": 1.4241979122161865, + "learning_rate": 3.1572520594336077e-06, + "loss": 2.3252, + "step": 8078 + }, + { + "epoch": 0.43342274678111586, + "grad_norm": 1.454370379447937, + "learning_rate": 3.1568329288805357e-06, + "loss": 2.5236, + "step": 8079 + }, + { + "epoch": 0.4334763948497854, + "grad_norm": 1.8107396364212036, + "learning_rate": 3.1564137784958605e-06, + "loss": 2.4759, + "step": 8080 + }, + { + "epoch": 0.4335300429184549, + "grad_norm": 1.4618521928787231, + "learning_rate": 3.155994608292238e-06, + "loss": 2.2265, + "step": 8081 + }, + { + "epoch": 0.43358369098712446, + "grad_norm": 1.4625614881515503, + "learning_rate": 3.1555754182823244e-06, + "loss": 2.0686, + "step": 8082 + }, + { + "epoch": 0.433637339055794, + "grad_norm": 1.5175803899765015, + "learning_rate": 3.1551562084787763e-06, + "loss": 2.2795, + "step": 8083 + }, + { + "epoch": 0.4336909871244635, + "grad_norm": 1.345065712928772, + "learning_rate": 3.15473697889425e-06, + "loss": 2.1742, + "step": 8084 + }, + { + "epoch": 0.43374463519313305, + "grad_norm": 2.5773701667785645, + "learning_rate": 3.154317729541404e-06, + "loss": 2.3493, + "step": 8085 + }, + { + "epoch": 0.4337982832618026, + "grad_norm": 1.4570146799087524, + "learning_rate": 3.153898460432896e-06, + "loss": 2.1545, + "step": 8086 + }, + { + "epoch": 0.4338519313304721, + "grad_norm": 1.5041435956954956, + "learning_rate": 3.1534791715813855e-06, + "loss": 2.332, + "step": 8087 + }, + { + "epoch": 0.43390557939914165, + "grad_norm": 1.3628169298171997, + "learning_rate": 3.1530598629995317e-06, + "loss": 1.7667, + "step": 8088 + }, + { + "epoch": 0.4339592274678112, + "grad_norm": 1.6008864641189575, + "learning_rate": 3.1526405346999944e-06, + "loss": 2.1514, + "step": 8089 + }, + { + "epoch": 0.4340128755364807, + "grad_norm": 1.260610580444336, + "learning_rate": 3.152221186695435e-06, + "loss": 1.9267, + "step": 8090 + }, + { + "epoch": 0.4340665236051502, + "grad_norm": 1.7011919021606445, + "learning_rate": 3.1518018189985143e-06, + "loss": 2.1639, + "step": 8091 + }, + { + "epoch": 0.4341201716738197, + "grad_norm": 1.5462454557418823, + "learning_rate": 3.1513824316218936e-06, + "loss": 2.295, + "step": 8092 + }, + { + "epoch": 0.43417381974248925, + "grad_norm": 1.3211286067962646, + "learning_rate": 3.1509630245782373e-06, + "loss": 2.3366, + "step": 8093 + }, + { + "epoch": 0.4342274678111588, + "grad_norm": 1.3216708898544312, + "learning_rate": 3.1505435978802055e-06, + "loss": 2.3539, + "step": 8094 + }, + { + "epoch": 0.4342811158798283, + "grad_norm": 1.608099102973938, + "learning_rate": 3.1501241515404644e-06, + "loss": 2.4397, + "step": 8095 + }, + { + "epoch": 0.43433476394849785, + "grad_norm": 1.4908567667007446, + "learning_rate": 3.1497046855716774e-06, + "loss": 2.1291, + "step": 8096 + }, + { + "epoch": 0.4343884120171674, + "grad_norm": 1.3830418586730957, + "learning_rate": 3.1492851999865086e-06, + "loss": 2.1671, + "step": 8097 + }, + { + "epoch": 0.4344420600858369, + "grad_norm": 1.9300270080566406, + "learning_rate": 3.1488656947976254e-06, + "loss": 2.3351, + "step": 8098 + }, + { + "epoch": 0.43449570815450644, + "grad_norm": 1.5431162118911743, + "learning_rate": 3.1484461700176917e-06, + "loss": 2.2713, + "step": 8099 + }, + { + "epoch": 0.434549356223176, + "grad_norm": 1.7396215200424194, + "learning_rate": 3.1480266256593746e-06, + "loss": 2.3222, + "step": 8100 + }, + { + "epoch": 0.4346030042918455, + "grad_norm": 1.5062544345855713, + "learning_rate": 3.1476070617353417e-06, + "loss": 2.1556, + "step": 8101 + }, + { + "epoch": 0.43465665236051504, + "grad_norm": 1.6098475456237793, + "learning_rate": 3.147187478258262e-06, + "loss": 2.3548, + "step": 8102 + }, + { + "epoch": 0.43471030042918457, + "grad_norm": 1.5512802600860596, + "learning_rate": 3.1467678752408014e-06, + "loss": 2.2661, + "step": 8103 + }, + { + "epoch": 0.4347639484978541, + "grad_norm": 1.2750555276870728, + "learning_rate": 3.14634825269563e-06, + "loss": 2.1776, + "step": 8104 + }, + { + "epoch": 0.4348175965665236, + "grad_norm": 1.4480499029159546, + "learning_rate": 3.145928610635418e-06, + "loss": 2.2483, + "step": 8105 + }, + { + "epoch": 0.4348712446351931, + "grad_norm": 1.4347401857376099, + "learning_rate": 3.145508949072834e-06, + "loss": 2.2934, + "step": 8106 + }, + { + "epoch": 0.43492489270386264, + "grad_norm": 1.3863723278045654, + "learning_rate": 3.1450892680205513e-06, + "loss": 2.2066, + "step": 8107 + }, + { + "epoch": 0.4349785407725322, + "grad_norm": 2.47845196723938, + "learning_rate": 3.144669567491239e-06, + "loss": 2.0564, + "step": 8108 + }, + { + "epoch": 0.4350321888412017, + "grad_norm": 1.4054067134857178, + "learning_rate": 3.1442498474975696e-06, + "loss": 2.2403, + "step": 8109 + }, + { + "epoch": 0.43508583690987124, + "grad_norm": 1.6642202138900757, + "learning_rate": 3.143830108052216e-06, + "loss": 2.3803, + "step": 8110 + }, + { + "epoch": 0.43513948497854077, + "grad_norm": 1.278454303741455, + "learning_rate": 3.14341034916785e-06, + "loss": 1.9629, + "step": 8111 + }, + { + "epoch": 0.4351931330472103, + "grad_norm": 1.4403215646743774, + "learning_rate": 3.1429905708571475e-06, + "loss": 2.2399, + "step": 8112 + }, + { + "epoch": 0.43524678111587983, + "grad_norm": 1.8239011764526367, + "learning_rate": 3.1425707731327813e-06, + "loss": 2.1202, + "step": 8113 + }, + { + "epoch": 0.43530042918454936, + "grad_norm": 1.6787456274032593, + "learning_rate": 3.1421509560074265e-06, + "loss": 2.2645, + "step": 8114 + }, + { + "epoch": 0.4353540772532189, + "grad_norm": 1.646278977394104, + "learning_rate": 3.141731119493759e-06, + "loss": 2.0362, + "step": 8115 + }, + { + "epoch": 0.43540772532188843, + "grad_norm": 1.567460060119629, + "learning_rate": 3.1413112636044535e-06, + "loss": 2.1989, + "step": 8116 + }, + { + "epoch": 0.43546137339055796, + "grad_norm": 1.5351523160934448, + "learning_rate": 3.1408913883521874e-06, + "loss": 2.3242, + "step": 8117 + }, + { + "epoch": 0.4355150214592275, + "grad_norm": 1.2965872287750244, + "learning_rate": 3.1404714937496382e-06, + "loss": 1.5869, + "step": 8118 + }, + { + "epoch": 0.435568669527897, + "grad_norm": 1.3965492248535156, + "learning_rate": 3.140051579809484e-06, + "loss": 2.4989, + "step": 8119 + }, + { + "epoch": 0.4356223175965665, + "grad_norm": 1.5008741617202759, + "learning_rate": 3.1396316465444027e-06, + "loss": 2.3443, + "step": 8120 + }, + { + "epoch": 0.43567596566523603, + "grad_norm": 1.6376490592956543, + "learning_rate": 3.1392116939670727e-06, + "loss": 2.2139, + "step": 8121 + }, + { + "epoch": 0.43572961373390556, + "grad_norm": 2.0533387660980225, + "learning_rate": 3.1387917220901744e-06, + "loss": 2.3848, + "step": 8122 + }, + { + "epoch": 0.4357832618025751, + "grad_norm": 2.3343279361724854, + "learning_rate": 3.138371730926386e-06, + "loss": 2.2626, + "step": 8123 + }, + { + "epoch": 0.4358369098712446, + "grad_norm": 1.4971846342086792, + "learning_rate": 3.137951720488391e-06, + "loss": 2.3251, + "step": 8124 + }, + { + "epoch": 0.43589055793991416, + "grad_norm": 1.5860618352890015, + "learning_rate": 3.1375316907888697e-06, + "loss": 2.3519, + "step": 8125 + }, + { + "epoch": 0.4359442060085837, + "grad_norm": 1.4059075117111206, + "learning_rate": 3.1371116418405034e-06, + "loss": 2.0373, + "step": 8126 + }, + { + "epoch": 0.4359978540772532, + "grad_norm": 1.4690818786621094, + "learning_rate": 3.1366915736559743e-06, + "loss": 2.3117, + "step": 8127 + }, + { + "epoch": 0.43605150214592275, + "grad_norm": 1.5833725929260254, + "learning_rate": 3.1362714862479664e-06, + "loss": 2.4472, + "step": 8128 + }, + { + "epoch": 0.4361051502145923, + "grad_norm": 1.43063485622406, + "learning_rate": 3.1358513796291628e-06, + "loss": 2.4208, + "step": 8129 + }, + { + "epoch": 0.4361587982832618, + "grad_norm": 1.1802074909210205, + "learning_rate": 3.1354312538122473e-06, + "loss": 1.9499, + "step": 8130 + }, + { + "epoch": 0.43621244635193135, + "grad_norm": 1.7703646421432495, + "learning_rate": 3.1350111088099055e-06, + "loss": 1.9931, + "step": 8131 + }, + { + "epoch": 0.4362660944206009, + "grad_norm": 1.1617642641067505, + "learning_rate": 3.134590944634822e-06, + "loss": 2.2516, + "step": 8132 + }, + { + "epoch": 0.4363197424892704, + "grad_norm": 1.6314432621002197, + "learning_rate": 3.134170761299683e-06, + "loss": 2.0382, + "step": 8133 + }, + { + "epoch": 0.4363733905579399, + "grad_norm": 1.4624756574630737, + "learning_rate": 3.133750558817175e-06, + "loss": 2.0633, + "step": 8134 + }, + { + "epoch": 0.4364270386266094, + "grad_norm": 1.4284498691558838, + "learning_rate": 3.1333303371999853e-06, + "loss": 2.1656, + "step": 8135 + }, + { + "epoch": 0.43648068669527895, + "grad_norm": 1.480573296546936, + "learning_rate": 3.132910096460801e-06, + "loss": 2.3533, + "step": 8136 + }, + { + "epoch": 0.4365343347639485, + "grad_norm": 1.7662222385406494, + "learning_rate": 3.1324898366123113e-06, + "loss": 2.2829, + "step": 8137 + }, + { + "epoch": 0.436587982832618, + "grad_norm": 1.3800550699234009, + "learning_rate": 3.132069557667204e-06, + "loss": 2.231, + "step": 8138 + }, + { + "epoch": 0.43664163090128755, + "grad_norm": 1.5631026029586792, + "learning_rate": 3.131649259638168e-06, + "loss": 1.9991, + "step": 8139 + }, + { + "epoch": 0.4366952789699571, + "grad_norm": 1.7115808725357056, + "learning_rate": 3.131228942537895e-06, + "loss": 2.3624, + "step": 8140 + }, + { + "epoch": 0.4367489270386266, + "grad_norm": 1.4468218088150024, + "learning_rate": 3.130808606379074e-06, + "loss": 2.2342, + "step": 8141 + }, + { + "epoch": 0.43680257510729614, + "grad_norm": 1.5920908451080322, + "learning_rate": 3.130388251174398e-06, + "loss": 1.9133, + "step": 8142 + }, + { + "epoch": 0.4368562231759657, + "grad_norm": 1.3319932222366333, + "learning_rate": 3.129967876936557e-06, + "loss": 2.3474, + "step": 8143 + }, + { + "epoch": 0.4369098712446352, + "grad_norm": 1.5074255466461182, + "learning_rate": 3.1295474836782437e-06, + "loss": 2.3461, + "step": 8144 + }, + { + "epoch": 0.43696351931330474, + "grad_norm": 1.222482681274414, + "learning_rate": 3.1291270714121496e-06, + "loss": 2.2175, + "step": 8145 + }, + { + "epoch": 0.43701716738197427, + "grad_norm": 1.6612634658813477, + "learning_rate": 3.1287066401509715e-06, + "loss": 2.3266, + "step": 8146 + }, + { + "epoch": 0.4370708154506438, + "grad_norm": 1.5026041269302368, + "learning_rate": 3.1282861899074e-06, + "loss": 2.2376, + "step": 8147 + }, + { + "epoch": 0.4371244635193133, + "grad_norm": 1.8113747835159302, + "learning_rate": 3.1278657206941315e-06, + "loss": 2.2188, + "step": 8148 + }, + { + "epoch": 0.4371781115879828, + "grad_norm": 1.6892218589782715, + "learning_rate": 3.1274452325238603e-06, + "loss": 2.2495, + "step": 8149 + }, + { + "epoch": 0.43723175965665234, + "grad_norm": 1.2566713094711304, + "learning_rate": 3.127024725409282e-06, + "loss": 2.2401, + "step": 8150 + }, + { + "epoch": 0.4372854077253219, + "grad_norm": 1.5468930006027222, + "learning_rate": 3.126604199363094e-06, + "loss": 2.3404, + "step": 8151 + }, + { + "epoch": 0.4373390557939914, + "grad_norm": 1.6254000663757324, + "learning_rate": 3.126183654397993e-06, + "loss": 2.3083, + "step": 8152 + }, + { + "epoch": 0.43739270386266094, + "grad_norm": 1.5898528099060059, + "learning_rate": 3.1257630905266744e-06, + "loss": 2.1933, + "step": 8153 + }, + { + "epoch": 0.43744635193133047, + "grad_norm": 1.3524144887924194, + "learning_rate": 3.125342507761839e-06, + "loss": 2.2312, + "step": 8154 + }, + { + "epoch": 0.4375, + "grad_norm": 1.4741265773773193, + "learning_rate": 3.1249219061161833e-06, + "loss": 2.2563, + "step": 8155 + }, + { + "epoch": 0.43755364806866953, + "grad_norm": 1.7285051345825195, + "learning_rate": 3.1245012856024067e-06, + "loss": 2.0761, + "step": 8156 + }, + { + "epoch": 0.43760729613733906, + "grad_norm": 1.4238667488098145, + "learning_rate": 3.1240806462332095e-06, + "loss": 2.3635, + "step": 8157 + }, + { + "epoch": 0.4376609442060086, + "grad_norm": 1.7835544347763062, + "learning_rate": 3.1236599880212927e-06, + "loss": 2.0593, + "step": 8158 + }, + { + "epoch": 0.4377145922746781, + "grad_norm": 2.772679090499878, + "learning_rate": 3.1232393109793557e-06, + "loss": 2.4023, + "step": 8159 + }, + { + "epoch": 0.43776824034334766, + "grad_norm": 1.3941822052001953, + "learning_rate": 3.1228186151201002e-06, + "loss": 2.3574, + "step": 8160 + }, + { + "epoch": 0.4378218884120172, + "grad_norm": 1.5843664407730103, + "learning_rate": 3.122397900456229e-06, + "loss": 2.0416, + "step": 8161 + }, + { + "epoch": 0.4378755364806867, + "grad_norm": 1.649236798286438, + "learning_rate": 3.1219771670004424e-06, + "loss": 2.226, + "step": 8162 + }, + { + "epoch": 0.4379291845493562, + "grad_norm": 1.515203833580017, + "learning_rate": 3.1215564147654463e-06, + "loss": 2.4362, + "step": 8163 + }, + { + "epoch": 0.43798283261802573, + "grad_norm": 1.2670848369598389, + "learning_rate": 3.121135643763944e-06, + "loss": 2.0972, + "step": 8164 + }, + { + "epoch": 0.43803648068669526, + "grad_norm": 1.3961238861083984, + "learning_rate": 3.120714854008638e-06, + "loss": 2.3723, + "step": 8165 + }, + { + "epoch": 0.4380901287553648, + "grad_norm": 1.5744242668151855, + "learning_rate": 3.1202940455122338e-06, + "loss": 2.5309, + "step": 8166 + }, + { + "epoch": 0.4381437768240343, + "grad_norm": 2.784168004989624, + "learning_rate": 3.1198732182874377e-06, + "loss": 1.2818, + "step": 8167 + }, + { + "epoch": 0.43819742489270386, + "grad_norm": 1.720723271369934, + "learning_rate": 3.1194523723469544e-06, + "loss": 2.3037, + "step": 8168 + }, + { + "epoch": 0.4382510729613734, + "grad_norm": 1.2177618741989136, + "learning_rate": 3.119031507703491e-06, + "loss": 2.2377, + "step": 8169 + }, + { + "epoch": 0.4383047210300429, + "grad_norm": 1.4818427562713623, + "learning_rate": 3.118610624369755e-06, + "loss": 2.3897, + "step": 8170 + }, + { + "epoch": 0.43835836909871245, + "grad_norm": 1.4474354982376099, + "learning_rate": 3.1181897223584536e-06, + "loss": 2.2499, + "step": 8171 + }, + { + "epoch": 0.438412017167382, + "grad_norm": 1.7727051973342896, + "learning_rate": 3.117768801682294e-06, + "loss": 2.0793, + "step": 8172 + }, + { + "epoch": 0.4384656652360515, + "grad_norm": 1.4510897397994995, + "learning_rate": 3.1173478623539865e-06, + "loss": 1.8723, + "step": 8173 + }, + { + "epoch": 0.43851931330472105, + "grad_norm": 1.7089896202087402, + "learning_rate": 3.1169269043862398e-06, + "loss": 2.2021, + "step": 8174 + }, + { + "epoch": 0.4385729613733906, + "grad_norm": 1.617608666419983, + "learning_rate": 3.116505927791764e-06, + "loss": 2.4983, + "step": 8175 + }, + { + "epoch": 0.4386266094420601, + "grad_norm": 1.6654052734375, + "learning_rate": 3.11608493258327e-06, + "loss": 2.2967, + "step": 8176 + }, + { + "epoch": 0.4386802575107296, + "grad_norm": 1.2543256282806396, + "learning_rate": 3.1156639187734676e-06, + "loss": 2.1484, + "step": 8177 + }, + { + "epoch": 0.4387339055793991, + "grad_norm": 1.460953712463379, + "learning_rate": 3.1152428863750685e-06, + "loss": 2.4599, + "step": 8178 + }, + { + "epoch": 0.43878755364806865, + "grad_norm": 1.8255698680877686, + "learning_rate": 3.1148218354007854e-06, + "loss": 2.4074, + "step": 8179 + }, + { + "epoch": 0.4388412017167382, + "grad_norm": 1.685549020767212, + "learning_rate": 3.1144007658633315e-06, + "loss": 2.526, + "step": 8180 + }, + { + "epoch": 0.4388948497854077, + "grad_norm": 1.4316269159317017, + "learning_rate": 3.113979677775419e-06, + "loss": 1.9304, + "step": 8181 + }, + { + "epoch": 0.43894849785407725, + "grad_norm": 1.190027117729187, + "learning_rate": 3.1135585711497625e-06, + "loss": 2.173, + "step": 8182 + }, + { + "epoch": 0.4390021459227468, + "grad_norm": 1.6234550476074219, + "learning_rate": 3.113137445999076e-06, + "loss": 2.1919, + "step": 8183 + }, + { + "epoch": 0.4390557939914163, + "grad_norm": 2.176079511642456, + "learning_rate": 3.1127163023360742e-06, + "loss": 2.3384, + "step": 8184 + }, + { + "epoch": 0.43910944206008584, + "grad_norm": 1.5399919748306274, + "learning_rate": 3.112295140173472e-06, + "loss": 2.1702, + "step": 8185 + }, + { + "epoch": 0.4391630901287554, + "grad_norm": 1.4991408586502075, + "learning_rate": 3.1118739595239876e-06, + "loss": 1.8594, + "step": 8186 + }, + { + "epoch": 0.4392167381974249, + "grad_norm": 1.6419779062271118, + "learning_rate": 3.1114527604003357e-06, + "loss": 2.2471, + "step": 8187 + }, + { + "epoch": 0.43927038626609444, + "grad_norm": 1.5755397081375122, + "learning_rate": 3.1110315428152343e-06, + "loss": 2.3301, + "step": 8188 + }, + { + "epoch": 0.43932403433476397, + "grad_norm": 1.213074803352356, + "learning_rate": 3.110610306781401e-06, + "loss": 2.0927, + "step": 8189 + }, + { + "epoch": 0.4393776824034335, + "grad_norm": 1.6937772035598755, + "learning_rate": 3.110189052311554e-06, + "loss": 2.4114, + "step": 8190 + }, + { + "epoch": 0.439431330472103, + "grad_norm": 2.1828291416168213, + "learning_rate": 3.109767779418412e-06, + "loss": 2.2254, + "step": 8191 + }, + { + "epoch": 0.4394849785407725, + "grad_norm": 1.5207009315490723, + "learning_rate": 3.109346488114694e-06, + "loss": 1.9683, + "step": 8192 + }, + { + "epoch": 0.43953862660944204, + "grad_norm": 1.5282800197601318, + "learning_rate": 3.108925178413121e-06, + "loss": 2.2185, + "step": 8193 + }, + { + "epoch": 0.43959227467811157, + "grad_norm": 1.6110286712646484, + "learning_rate": 3.108503850326413e-06, + "loss": 1.9903, + "step": 8194 + }, + { + "epoch": 0.4396459227467811, + "grad_norm": 1.481516718864441, + "learning_rate": 3.1080825038672907e-06, + "loss": 2.132, + "step": 8195 + }, + { + "epoch": 0.43969957081545064, + "grad_norm": 1.3939735889434814, + "learning_rate": 3.107661139048476e-06, + "loss": 2.2092, + "step": 8196 + }, + { + "epoch": 0.43975321888412017, + "grad_norm": 1.3236570358276367, + "learning_rate": 3.1072397558826917e-06, + "loss": 2.0993, + "step": 8197 + }, + { + "epoch": 0.4398068669527897, + "grad_norm": 1.4255019426345825, + "learning_rate": 3.106818354382659e-06, + "loss": 2.2557, + "step": 8198 + }, + { + "epoch": 0.43986051502145923, + "grad_norm": 1.408014178276062, + "learning_rate": 3.1063969345611035e-06, + "loss": 2.242, + "step": 8199 + }, + { + "epoch": 0.43991416309012876, + "grad_norm": 1.807202696800232, + "learning_rate": 3.1059754964307472e-06, + "loss": 2.7263, + "step": 8200 + }, + { + "epoch": 0.4399678111587983, + "grad_norm": 1.6142090559005737, + "learning_rate": 3.1055540400043136e-06, + "loss": 2.2654, + "step": 8201 + }, + { + "epoch": 0.4400214592274678, + "grad_norm": 1.8548439741134644, + "learning_rate": 3.10513256529453e-06, + "loss": 2.196, + "step": 8202 + }, + { + "epoch": 0.44007510729613736, + "grad_norm": 1.5082253217697144, + "learning_rate": 3.1047110723141205e-06, + "loss": 2.3218, + "step": 8203 + }, + { + "epoch": 0.4401287553648069, + "grad_norm": 1.5474853515625, + "learning_rate": 3.104289561075812e-06, + "loss": 2.0221, + "step": 8204 + }, + { + "epoch": 0.4401824034334764, + "grad_norm": 1.13668954372406, + "learning_rate": 3.1038680315923304e-06, + "loss": 2.2238, + "step": 8205 + }, + { + "epoch": 0.4402360515021459, + "grad_norm": 1.66643226146698, + "learning_rate": 3.103446483876403e-06, + "loss": 2.212, + "step": 8206 + }, + { + "epoch": 0.44028969957081543, + "grad_norm": 2.5605428218841553, + "learning_rate": 3.103024917940757e-06, + "loss": 1.9839, + "step": 8207 + }, + { + "epoch": 0.44034334763948496, + "grad_norm": 1.487849473953247, + "learning_rate": 3.102603333798122e-06, + "loss": 2.5901, + "step": 8208 + }, + { + "epoch": 0.4403969957081545, + "grad_norm": 1.513083815574646, + "learning_rate": 3.102181731461225e-06, + "loss": 2.298, + "step": 8209 + }, + { + "epoch": 0.440450643776824, + "grad_norm": 1.324217438697815, + "learning_rate": 3.101760110942797e-06, + "loss": 2.1099, + "step": 8210 + }, + { + "epoch": 0.44050429184549356, + "grad_norm": 1.588037133216858, + "learning_rate": 3.101338472255567e-06, + "loss": 2.4378, + "step": 8211 + }, + { + "epoch": 0.4405579399141631, + "grad_norm": 1.5031795501708984, + "learning_rate": 3.1009168154122653e-06, + "loss": 2.1952, + "step": 8212 + }, + { + "epoch": 0.4406115879828326, + "grad_norm": 1.1891772747039795, + "learning_rate": 3.1004951404256234e-06, + "loss": 2.106, + "step": 8213 + }, + { + "epoch": 0.44066523605150215, + "grad_norm": 1.560293436050415, + "learning_rate": 3.1000734473083726e-06, + "loss": 1.3742, + "step": 8214 + }, + { + "epoch": 0.4407188841201717, + "grad_norm": 1.2879817485809326, + "learning_rate": 3.0996517360732446e-06, + "loss": 2.2655, + "step": 8215 + }, + { + "epoch": 0.4407725321888412, + "grad_norm": 1.6570684909820557, + "learning_rate": 3.0992300067329733e-06, + "loss": 2.2067, + "step": 8216 + }, + { + "epoch": 0.44082618025751075, + "grad_norm": 1.518264889717102, + "learning_rate": 3.0988082593002913e-06, + "loss": 2.3532, + "step": 8217 + }, + { + "epoch": 0.4408798283261803, + "grad_norm": 1.4387354850769043, + "learning_rate": 3.0983864937879302e-06, + "loss": 2.2927, + "step": 8218 + }, + { + "epoch": 0.4409334763948498, + "grad_norm": 1.6947021484375, + "learning_rate": 3.0979647102086275e-06, + "loss": 2.1638, + "step": 8219 + }, + { + "epoch": 0.4409871244635193, + "grad_norm": 4.488561630249023, + "learning_rate": 3.097542908575116e-06, + "loss": 2.3416, + "step": 8220 + }, + { + "epoch": 0.4410407725321888, + "grad_norm": 1.3499075174331665, + "learning_rate": 3.0971210889001324e-06, + "loss": 1.9877, + "step": 8221 + }, + { + "epoch": 0.44109442060085835, + "grad_norm": 1.5126768350601196, + "learning_rate": 3.0966992511964116e-06, + "loss": 2.0616, + "step": 8222 + }, + { + "epoch": 0.4411480686695279, + "grad_norm": 1.5287882089614868, + "learning_rate": 3.09627739547669e-06, + "loss": 2.2459, + "step": 8223 + }, + { + "epoch": 0.4412017167381974, + "grad_norm": 1.5800319910049438, + "learning_rate": 3.0958555217537045e-06, + "loss": 2.2457, + "step": 8224 + }, + { + "epoch": 0.44125536480686695, + "grad_norm": 1.558396816253662, + "learning_rate": 3.0954336300401937e-06, + "loss": 2.2346, + "step": 8225 + }, + { + "epoch": 0.4413090128755365, + "grad_norm": 1.383270263671875, + "learning_rate": 3.0950117203488956e-06, + "loss": 2.3448, + "step": 8226 + }, + { + "epoch": 0.441362660944206, + "grad_norm": 1.4690966606140137, + "learning_rate": 3.094589792692547e-06, + "loss": 2.3377, + "step": 8227 + }, + { + "epoch": 0.44141630901287554, + "grad_norm": 1.6433228254318237, + "learning_rate": 3.0941678470838888e-06, + "loss": 2.0412, + "step": 8228 + }, + { + "epoch": 0.4414699570815451, + "grad_norm": 2.8719236850738525, + "learning_rate": 3.0937458835356605e-06, + "loss": 2.2157, + "step": 8229 + }, + { + "epoch": 0.4415236051502146, + "grad_norm": 1.7792938947677612, + "learning_rate": 3.0933239020606016e-06, + "loss": 2.0918, + "step": 8230 + }, + { + "epoch": 0.44157725321888414, + "grad_norm": 1.5176101922988892, + "learning_rate": 3.0929019026714536e-06, + "loss": 2.297, + "step": 8231 + }, + { + "epoch": 0.44163090128755367, + "grad_norm": 1.650693655014038, + "learning_rate": 3.0924798853809575e-06, + "loss": 2.3403, + "step": 8232 + }, + { + "epoch": 0.4416845493562232, + "grad_norm": 6.125771522521973, + "learning_rate": 3.092057850201855e-06, + "loss": 2.4215, + "step": 8233 + }, + { + "epoch": 0.44173819742489273, + "grad_norm": 1.7382993698120117, + "learning_rate": 3.091635797146889e-06, + "loss": 2.1856, + "step": 8234 + }, + { + "epoch": 0.4417918454935622, + "grad_norm": 1.4512192010879517, + "learning_rate": 3.0912137262288024e-06, + "loss": 2.3139, + "step": 8235 + }, + { + "epoch": 0.44184549356223174, + "grad_norm": 1.848416805267334, + "learning_rate": 3.090791637460338e-06, + "loss": 2.4269, + "step": 8236 + }, + { + "epoch": 0.44189914163090127, + "grad_norm": 1.493782877922058, + "learning_rate": 3.0903695308542407e-06, + "loss": 2.307, + "step": 8237 + }, + { + "epoch": 0.4419527896995708, + "grad_norm": 1.5734100341796875, + "learning_rate": 3.089947406423255e-06, + "loss": 2.1927, + "step": 8238 + }, + { + "epoch": 0.44200643776824033, + "grad_norm": 10.580697059631348, + "learning_rate": 3.0895252641801253e-06, + "loss": 2.3309, + "step": 8239 + }, + { + "epoch": 0.44206008583690987, + "grad_norm": 1.2737406492233276, + "learning_rate": 3.0891031041375967e-06, + "loss": 1.9363, + "step": 8240 + }, + { + "epoch": 0.4421137339055794, + "grad_norm": 1.6707308292388916, + "learning_rate": 3.088680926308417e-06, + "loss": 2.3056, + "step": 8241 + }, + { + "epoch": 0.44216738197424893, + "grad_norm": 1.506313443183899, + "learning_rate": 3.088258730705333e-06, + "loss": 2.3989, + "step": 8242 + }, + { + "epoch": 0.44222103004291846, + "grad_norm": 1.5361547470092773, + "learning_rate": 3.0878365173410905e-06, + "loss": 2.203, + "step": 8243 + }, + { + "epoch": 0.442274678111588, + "grad_norm": 1.5956224203109741, + "learning_rate": 3.0874142862284382e-06, + "loss": 1.3172, + "step": 8244 + }, + { + "epoch": 0.4423283261802575, + "grad_norm": 1.7355424165725708, + "learning_rate": 3.0869920373801243e-06, + "loss": 2.2466, + "step": 8245 + }, + { + "epoch": 0.44238197424892706, + "grad_norm": 1.4972761869430542, + "learning_rate": 3.0865697708088966e-06, + "loss": 1.9851, + "step": 8246 + }, + { + "epoch": 0.4424356223175966, + "grad_norm": 1.4504144191741943, + "learning_rate": 3.086147486527506e-06, + "loss": 2.2216, + "step": 8247 + }, + { + "epoch": 0.4424892703862661, + "grad_norm": 1.382952332496643, + "learning_rate": 3.0857251845487023e-06, + "loss": 2.18, + "step": 8248 + }, + { + "epoch": 0.4425429184549356, + "grad_norm": 1.6413203477859497, + "learning_rate": 3.085302864885235e-06, + "loss": 2.3876, + "step": 8249 + }, + { + "epoch": 0.44259656652360513, + "grad_norm": 1.427801251411438, + "learning_rate": 3.084880527549856e-06, + "loss": 2.2107, + "step": 8250 + }, + { + "epoch": 0.44265021459227466, + "grad_norm": 1.4216923713684082, + "learning_rate": 3.0844581725553162e-06, + "loss": 2.259, + "step": 8251 + }, + { + "epoch": 0.4427038626609442, + "grad_norm": 1.5792412757873535, + "learning_rate": 3.084035799914368e-06, + "loss": 2.3866, + "step": 8252 + }, + { + "epoch": 0.4427575107296137, + "grad_norm": 1.2935481071472168, + "learning_rate": 3.0836134096397642e-06, + "loss": 2.1847, + "step": 8253 + }, + { + "epoch": 0.44281115879828326, + "grad_norm": 1.4485241174697876, + "learning_rate": 3.0831910017442568e-06, + "loss": 2.2906, + "step": 8254 + }, + { + "epoch": 0.4428648068669528, + "grad_norm": 1.4937679767608643, + "learning_rate": 3.0827685762406013e-06, + "loss": 2.2389, + "step": 8255 + }, + { + "epoch": 0.4429184549356223, + "grad_norm": 1.6575536727905273, + "learning_rate": 3.0823461331415507e-06, + "loss": 2.408, + "step": 8256 + }, + { + "epoch": 0.44297210300429185, + "grad_norm": 1.5308891534805298, + "learning_rate": 3.0819236724598593e-06, + "loss": 2.2719, + "step": 8257 + }, + { + "epoch": 0.4430257510729614, + "grad_norm": 1.6676307916641235, + "learning_rate": 3.0815011942082832e-06, + "loss": 2.2977, + "step": 8258 + }, + { + "epoch": 0.4430793991416309, + "grad_norm": 1.7521926164627075, + "learning_rate": 3.081078698399579e-06, + "loss": 2.4725, + "step": 8259 + }, + { + "epoch": 0.44313304721030045, + "grad_norm": 1.4197114706039429, + "learning_rate": 3.0806561850465006e-06, + "loss": 2.2243, + "step": 8260 + }, + { + "epoch": 0.44318669527897, + "grad_norm": 2.563281774520874, + "learning_rate": 3.080233654161808e-06, + "loss": 2.2353, + "step": 8261 + }, + { + "epoch": 0.4432403433476395, + "grad_norm": 1.2640773057937622, + "learning_rate": 3.079811105758255e-06, + "loss": 2.4221, + "step": 8262 + }, + { + "epoch": 0.443293991416309, + "grad_norm": 1.3549326658248901, + "learning_rate": 3.079388539848602e-06, + "loss": 2.1391, + "step": 8263 + }, + { + "epoch": 0.4433476394849785, + "grad_norm": 1.514380931854248, + "learning_rate": 3.0789659564456065e-06, + "loss": 2.2257, + "step": 8264 + }, + { + "epoch": 0.44340128755364805, + "grad_norm": NaN, + "learning_rate": 3.0789659564456065e-06, + "loss": 2.1802, + "step": 8265 + }, + { + "epoch": 0.4434549356223176, + "grad_norm": 1.6377195119857788, + "learning_rate": 3.0785433555620285e-06, + "loss": 2.1655, + "step": 8266 + }, + { + "epoch": 0.4435085836909871, + "grad_norm": 1.4744887351989746, + "learning_rate": 3.0781207372106266e-06, + "loss": 2.3602, + "step": 8267 + }, + { + "epoch": 0.44356223175965664, + "grad_norm": 1.4504430294036865, + "learning_rate": 3.07769810140416e-06, + "loss": 2.2435, + "step": 8268 + }, + { + "epoch": 0.4436158798283262, + "grad_norm": 1.4906703233718872, + "learning_rate": 3.077275448155391e-06, + "loss": 1.9925, + "step": 8269 + }, + { + "epoch": 0.4436695278969957, + "grad_norm": 1.4686421155929565, + "learning_rate": 3.076852777477079e-06, + "loss": 2.3366, + "step": 8270 + }, + { + "epoch": 0.44372317596566524, + "grad_norm": 1.4906855821609497, + "learning_rate": 3.076430089381988e-06, + "loss": 2.2395, + "step": 8271 + }, + { + "epoch": 0.44377682403433477, + "grad_norm": 1.5335112810134888, + "learning_rate": 3.076007383882877e-06, + "loss": 2.3485, + "step": 8272 + }, + { + "epoch": 0.4438304721030043, + "grad_norm": 2.6739532947540283, + "learning_rate": 3.075584660992511e-06, + "loss": 2.3781, + "step": 8273 + }, + { + "epoch": 0.44388412017167383, + "grad_norm": 1.4965986013412476, + "learning_rate": 3.075161920723652e-06, + "loss": 2.2345, + "step": 8274 + }, + { + "epoch": 0.44393776824034337, + "grad_norm": 1.4792897701263428, + "learning_rate": 3.0747391630890645e-06, + "loss": 2.1944, + "step": 8275 + }, + { + "epoch": 0.4439914163090129, + "grad_norm": 2.3615171909332275, + "learning_rate": 3.074316388101512e-06, + "loss": 2.2655, + "step": 8276 + }, + { + "epoch": 0.44404506437768243, + "grad_norm": 1.48591947555542, + "learning_rate": 3.073893595773759e-06, + "loss": 2.2675, + "step": 8277 + }, + { + "epoch": 0.4440987124463519, + "grad_norm": 1.4812761545181274, + "learning_rate": 3.073470786118572e-06, + "loss": 2.3064, + "step": 8278 + }, + { + "epoch": 0.44415236051502144, + "grad_norm": 1.895809531211853, + "learning_rate": 3.073047959148716e-06, + "loss": 2.4668, + "step": 8279 + }, + { + "epoch": 0.44420600858369097, + "grad_norm": 1.5534019470214844, + "learning_rate": 3.072625114876958e-06, + "loss": 2.3322, + "step": 8280 + }, + { + "epoch": 0.4442596566523605, + "grad_norm": 1.3481273651123047, + "learning_rate": 3.072202253316063e-06, + "loss": 2.3632, + "step": 8281 + }, + { + "epoch": 0.44431330472103003, + "grad_norm": 1.531535029411316, + "learning_rate": 3.0717793744788005e-06, + "loss": 2.1744, + "step": 8282 + }, + { + "epoch": 0.44436695278969957, + "grad_norm": 1.3982070684432983, + "learning_rate": 3.0713564783779374e-06, + "loss": 2.4201, + "step": 8283 + }, + { + "epoch": 0.4444206008583691, + "grad_norm": 2.411639928817749, + "learning_rate": 3.070933565026243e-06, + "loss": 2.668, + "step": 8284 + }, + { + "epoch": 0.44447424892703863, + "grad_norm": 1.345180869102478, + "learning_rate": 3.0705106344364844e-06, + "loss": 2.2405, + "step": 8285 + }, + { + "epoch": 0.44452789699570816, + "grad_norm": 1.6579338312149048, + "learning_rate": 3.0700876866214326e-06, + "loss": 2.2398, + "step": 8286 + }, + { + "epoch": 0.4445815450643777, + "grad_norm": 1.5954296588897705, + "learning_rate": 3.069664721593856e-06, + "loss": 2.4042, + "step": 8287 + }, + { + "epoch": 0.4446351931330472, + "grad_norm": 1.6287651062011719, + "learning_rate": 3.0692417393665273e-06, + "loss": 2.1765, + "step": 8288 + }, + { + "epoch": 0.44468884120171676, + "grad_norm": 1.4368798732757568, + "learning_rate": 3.068818739952216e-06, + "loss": 2.2431, + "step": 8289 + }, + { + "epoch": 0.4447424892703863, + "grad_norm": 1.4785022735595703, + "learning_rate": 3.0683957233636935e-06, + "loss": 2.1899, + "step": 8290 + }, + { + "epoch": 0.4447961373390558, + "grad_norm": 1.3623108863830566, + "learning_rate": 3.0679726896137326e-06, + "loss": 2.2127, + "step": 8291 + }, + { + "epoch": 0.4448497854077253, + "grad_norm": 1.5241988897323608, + "learning_rate": 3.0675496387151056e-06, + "loss": 2.2567, + "step": 8292 + }, + { + "epoch": 0.4449034334763948, + "grad_norm": 1.4753203392028809, + "learning_rate": 3.0671265706805853e-06, + "loss": 2.1745, + "step": 8293 + }, + { + "epoch": 0.44495708154506436, + "grad_norm": 1.4884871244430542, + "learning_rate": 3.066703485522946e-06, + "loss": 2.3754, + "step": 8294 + }, + { + "epoch": 0.4450107296137339, + "grad_norm": 1.4054796695709229, + "learning_rate": 3.066280383254961e-06, + "loss": 2.0384, + "step": 8295 + }, + { + "epoch": 0.4450643776824034, + "grad_norm": 1.393243432044983, + "learning_rate": 3.065857263889405e-06, + "loss": 2.1675, + "step": 8296 + }, + { + "epoch": 0.44511802575107295, + "grad_norm": 1.6074496507644653, + "learning_rate": 3.0654341274390537e-06, + "loss": 2.1456, + "step": 8297 + }, + { + "epoch": 0.4451716738197425, + "grad_norm": 1.4409321546554565, + "learning_rate": 3.065010973916682e-06, + "loss": 2.1281, + "step": 8298 + }, + { + "epoch": 0.445225321888412, + "grad_norm": 1.3262722492218018, + "learning_rate": 3.0645878033350674e-06, + "loss": 2.3932, + "step": 8299 + }, + { + "epoch": 0.44527896995708155, + "grad_norm": 1.6073384284973145, + "learning_rate": 3.0641646157069853e-06, + "loss": 2.4955, + "step": 8300 + }, + { + "epoch": 0.4453326180257511, + "grad_norm": 1.4306575059890747, + "learning_rate": 3.0637414110452133e-06, + "loss": 2.3725, + "step": 8301 + }, + { + "epoch": 0.4453862660944206, + "grad_norm": 1.609908938407898, + "learning_rate": 3.06331818936253e-06, + "loss": 2.064, + "step": 8302 + }, + { + "epoch": 0.44543991416309014, + "grad_norm": 1.4838604927062988, + "learning_rate": 3.062894950671711e-06, + "loss": 1.9926, + "step": 8303 + }, + { + "epoch": 0.4454935622317597, + "grad_norm": 1.68695867061615, + "learning_rate": 3.062471694985538e-06, + "loss": 2.5147, + "step": 8304 + }, + { + "epoch": 0.4455472103004292, + "grad_norm": 1.5027607679367065, + "learning_rate": 3.062048422316789e-06, + "loss": 2.5396, + "step": 8305 + }, + { + "epoch": 0.44560085836909874, + "grad_norm": 1.3888792991638184, + "learning_rate": 3.0616251326782444e-06, + "loss": 2.0674, + "step": 8306 + }, + { + "epoch": 0.4456545064377682, + "grad_norm": 1.5818082094192505, + "learning_rate": 3.061201826082683e-06, + "loss": 2.1237, + "step": 8307 + }, + { + "epoch": 0.44570815450643775, + "grad_norm": 1.7295055389404297, + "learning_rate": 3.0607785025428864e-06, + "loss": 2.2148, + "step": 8308 + }, + { + "epoch": 0.4457618025751073, + "grad_norm": 1.4651027917861938, + "learning_rate": 3.0603551620716367e-06, + "loss": 2.3378, + "step": 8309 + }, + { + "epoch": 0.4458154506437768, + "grad_norm": 1.4915000200271606, + "learning_rate": 3.0599318046817144e-06, + "loss": 2.5233, + "step": 8310 + }, + { + "epoch": 0.44586909871244634, + "grad_norm": 2.272883415222168, + "learning_rate": 3.0595084303859035e-06, + "loss": 2.3003, + "step": 8311 + }, + { + "epoch": 0.4459227467811159, + "grad_norm": 1.7647143602371216, + "learning_rate": 3.0590850391969852e-06, + "loss": 2.1015, + "step": 8312 + }, + { + "epoch": 0.4459763948497854, + "grad_norm": 1.5829981565475464, + "learning_rate": 3.058661631127744e-06, + "loss": 2.3407, + "step": 8313 + }, + { + "epoch": 0.44603004291845494, + "grad_norm": 1.1261963844299316, + "learning_rate": 3.058238206190963e-06, + "loss": 1.7852, + "step": 8314 + }, + { + "epoch": 0.44608369098712447, + "grad_norm": 1.497343897819519, + "learning_rate": 3.057814764399426e-06, + "loss": 2.1921, + "step": 8315 + }, + { + "epoch": 0.446137339055794, + "grad_norm": 3.772108316421509, + "learning_rate": 3.0573913057659192e-06, + "loss": 2.1725, + "step": 8316 + }, + { + "epoch": 0.44619098712446353, + "grad_norm": 1.1935025453567505, + "learning_rate": 3.056967830303228e-06, + "loss": 2.1884, + "step": 8317 + }, + { + "epoch": 0.44624463519313307, + "grad_norm": 1.5357714891433716, + "learning_rate": 3.056544338024137e-06, + "loss": 2.0309, + "step": 8318 + }, + { + "epoch": 0.4462982832618026, + "grad_norm": 1.6772332191467285, + "learning_rate": 3.0561208289414348e-06, + "loss": 2.5322, + "step": 8319 + }, + { + "epoch": 0.44635193133047213, + "grad_norm": 1.532554268836975, + "learning_rate": 3.0556973030679057e-06, + "loss": 2.4001, + "step": 8320 + }, + { + "epoch": 0.4464055793991416, + "grad_norm": 1.464996576309204, + "learning_rate": 3.0552737604163378e-06, + "loss": 2.3761, + "step": 8321 + }, + { + "epoch": 0.44645922746781114, + "grad_norm": 1.6790658235549927, + "learning_rate": 3.054850200999521e-06, + "loss": 2.3794, + "step": 8322 + }, + { + "epoch": 0.44651287553648067, + "grad_norm": 1.5258294343948364, + "learning_rate": 3.0544266248302413e-06, + "loss": 2.2081, + "step": 8323 + }, + { + "epoch": 0.4465665236051502, + "grad_norm": 1.5874114036560059, + "learning_rate": 3.0540030319212893e-06, + "loss": 2.5136, + "step": 8324 + }, + { + "epoch": 0.44662017167381973, + "grad_norm": 1.613317847251892, + "learning_rate": 3.053579422285453e-06, + "loss": 2.257, + "step": 8325 + }, + { + "epoch": 0.44667381974248926, + "grad_norm": 1.2938765287399292, + "learning_rate": 3.053155795935523e-06, + "loss": 2.0894, + "step": 8326 + }, + { + "epoch": 0.4467274678111588, + "grad_norm": 1.749281883239746, + "learning_rate": 3.0527321528842903e-06, + "loss": 2.1368, + "step": 8327 + }, + { + "epoch": 0.4467811158798283, + "grad_norm": 3.324453592300415, + "learning_rate": 3.0523084931445455e-06, + "loss": 1.934, + "step": 8328 + }, + { + "epoch": 0.44683476394849786, + "grad_norm": 1.2827945947647095, + "learning_rate": 3.0518848167290797e-06, + "loss": 1.997, + "step": 8329 + }, + { + "epoch": 0.4468884120171674, + "grad_norm": 1.374488353729248, + "learning_rate": 3.0514611236506852e-06, + "loss": 2.2073, + "step": 8330 + }, + { + "epoch": 0.4469420600858369, + "grad_norm": 1.491152048110962, + "learning_rate": 3.0510374139221544e-06, + "loss": 2.1684, + "step": 8331 + }, + { + "epoch": 0.44699570815450645, + "grad_norm": 1.5504097938537598, + "learning_rate": 3.05061368755628e-06, + "loss": 2.4527, + "step": 8332 + }, + { + "epoch": 0.447049356223176, + "grad_norm": 2.132828712463379, + "learning_rate": 3.0501899445658565e-06, + "loss": 1.5796, + "step": 8333 + }, + { + "epoch": 0.4471030042918455, + "grad_norm": 1.471354365348816, + "learning_rate": 3.0497661849636763e-06, + "loss": 2.2633, + "step": 8334 + }, + { + "epoch": 0.447156652360515, + "grad_norm": 1.654012680053711, + "learning_rate": 3.049342408762535e-06, + "loss": 2.1841, + "step": 8335 + }, + { + "epoch": 0.4472103004291845, + "grad_norm": 1.5079448223114014, + "learning_rate": 3.048918615975227e-06, + "loss": 2.2614, + "step": 8336 + }, + { + "epoch": 0.44726394849785406, + "grad_norm": 1.3787786960601807, + "learning_rate": 3.0484948066145488e-06, + "loss": 2.2009, + "step": 8337 + }, + { + "epoch": 0.4473175965665236, + "grad_norm": 1.2477898597717285, + "learning_rate": 3.0480709806932946e-06, + "loss": 1.945, + "step": 8338 + }, + { + "epoch": 0.4473712446351931, + "grad_norm": 1.5061390399932861, + "learning_rate": 3.047647138224262e-06, + "loss": 2.1678, + "step": 8339 + }, + { + "epoch": 0.44742489270386265, + "grad_norm": 1.5806396007537842, + "learning_rate": 3.047223279220248e-06, + "loss": 2.2645, + "step": 8340 + }, + { + "epoch": 0.4474785407725322, + "grad_norm": 1.6012747287750244, + "learning_rate": 3.0467994036940514e-06, + "loss": 2.3839, + "step": 8341 + }, + { + "epoch": 0.4475321888412017, + "grad_norm": 1.52213716506958, + "learning_rate": 3.0463755116584664e-06, + "loss": 2.1082, + "step": 8342 + }, + { + "epoch": 0.44758583690987125, + "grad_norm": 1.4776591062545776, + "learning_rate": 3.0459516031262948e-06, + "loss": 2.3441, + "step": 8343 + }, + { + "epoch": 0.4476394849785408, + "grad_norm": 1.4764235019683838, + "learning_rate": 3.0455276781103342e-06, + "loss": 2.3234, + "step": 8344 + }, + { + "epoch": 0.4476931330472103, + "grad_norm": 1.6458574533462524, + "learning_rate": 3.0451037366233848e-06, + "loss": 2.4767, + "step": 8345 + }, + { + "epoch": 0.44774678111587984, + "grad_norm": 1.470445990562439, + "learning_rate": 3.0446797786782468e-06, + "loss": 2.2755, + "step": 8346 + }, + { + "epoch": 0.4478004291845494, + "grad_norm": 1.5310581922531128, + "learning_rate": 3.044255804287719e-06, + "loss": 2.1678, + "step": 8347 + }, + { + "epoch": 0.4478540772532189, + "grad_norm": 1.6813807487487793, + "learning_rate": 3.043831813464604e-06, + "loss": 2.4914, + "step": 8348 + }, + { + "epoch": 0.44790772532188844, + "grad_norm": 3.0713131427764893, + "learning_rate": 3.043407806221702e-06, + "loss": 2.3088, + "step": 8349 + }, + { + "epoch": 0.4479613733905579, + "grad_norm": 1.46730637550354, + "learning_rate": 3.042983782571816e-06, + "loss": 2.2534, + "step": 8350 + }, + { + "epoch": 0.44801502145922745, + "grad_norm": 1.3291233777999878, + "learning_rate": 3.0425597425277483e-06, + "loss": 1.8821, + "step": 8351 + }, + { + "epoch": 0.448068669527897, + "grad_norm": 1.6874921321868896, + "learning_rate": 3.0421356861023014e-06, + "loss": 2.111, + "step": 8352 + }, + { + "epoch": 0.4481223175965665, + "grad_norm": 1.367336630821228, + "learning_rate": 3.041711613308279e-06, + "loss": 2.2811, + "step": 8353 + }, + { + "epoch": 0.44817596566523604, + "grad_norm": 1.4838038682937622, + "learning_rate": 3.041287524158485e-06, + "loss": 2.409, + "step": 8354 + }, + { + "epoch": 0.4482296137339056, + "grad_norm": 1.3762383460998535, + "learning_rate": 3.040863418665723e-06, + "loss": 2.1504, + "step": 8355 + }, + { + "epoch": 0.4482832618025751, + "grad_norm": 1.6596686840057373, + "learning_rate": 3.0404392968428e-06, + "loss": 2.0329, + "step": 8356 + }, + { + "epoch": 0.44833690987124464, + "grad_norm": 1.5268166065216064, + "learning_rate": 3.040015158702519e-06, + "loss": 2.4341, + "step": 8357 + }, + { + "epoch": 0.44839055793991417, + "grad_norm": 1.5599701404571533, + "learning_rate": 3.039591004257688e-06, + "loss": 2.3794, + "step": 8358 + }, + { + "epoch": 0.4484442060085837, + "grad_norm": 1.5318559408187866, + "learning_rate": 3.0391668335211115e-06, + "loss": 2.2394, + "step": 8359 + }, + { + "epoch": 0.44849785407725323, + "grad_norm": 1.2533109188079834, + "learning_rate": 3.0387426465055975e-06, + "loss": 2.0829, + "step": 8360 + }, + { + "epoch": 0.44855150214592276, + "grad_norm": 1.1616547107696533, + "learning_rate": 3.0383184432239533e-06, + "loss": 2.087, + "step": 8361 + }, + { + "epoch": 0.4486051502145923, + "grad_norm": 1.4279353618621826, + "learning_rate": 3.037894223688987e-06, + "loss": 2.2361, + "step": 8362 + }, + { + "epoch": 0.44865879828326183, + "grad_norm": 1.5710012912750244, + "learning_rate": 3.037469987913506e-06, + "loss": 2.1986, + "step": 8363 + }, + { + "epoch": 0.4487124463519313, + "grad_norm": 1.4687856435775757, + "learning_rate": 3.0370457359103206e-06, + "loss": 2.1426, + "step": 8364 + }, + { + "epoch": 0.44876609442060084, + "grad_norm": 15.368971824645996, + "learning_rate": 3.0366214676922384e-06, + "loss": 2.3267, + "step": 8365 + }, + { + "epoch": 0.44881974248927037, + "grad_norm": 1.4098690748214722, + "learning_rate": 3.0361971832720707e-06, + "loss": 2.3099, + "step": 8366 + }, + { + "epoch": 0.4488733905579399, + "grad_norm": 2.3673200607299805, + "learning_rate": 3.035772882662627e-06, + "loss": 2.2911, + "step": 8367 + }, + { + "epoch": 0.44892703862660943, + "grad_norm": 1.2059561014175415, + "learning_rate": 3.035348565876719e-06, + "loss": 2.3213, + "step": 8368 + }, + { + "epoch": 0.44898068669527896, + "grad_norm": 1.4826250076293945, + "learning_rate": 3.0349242329271565e-06, + "loss": 2.5921, + "step": 8369 + }, + { + "epoch": 0.4490343347639485, + "grad_norm": 1.9130384922027588, + "learning_rate": 3.0344998838267525e-06, + "loss": 2.2156, + "step": 8370 + }, + { + "epoch": 0.449087982832618, + "grad_norm": 1.495926022529602, + "learning_rate": 3.034075518588319e-06, + "loss": 2.1921, + "step": 8371 + }, + { + "epoch": 0.44914163090128756, + "grad_norm": 1.9312334060668945, + "learning_rate": 3.0336511372246687e-06, + "loss": 2.4442, + "step": 8372 + }, + { + "epoch": 0.4491952789699571, + "grad_norm": 1.3913463354110718, + "learning_rate": 3.0332267397486146e-06, + "loss": 2.3377, + "step": 8373 + }, + { + "epoch": 0.4492489270386266, + "grad_norm": 1.6376678943634033, + "learning_rate": 3.032802326172971e-06, + "loss": 2.5002, + "step": 8374 + }, + { + "epoch": 0.44930257510729615, + "grad_norm": 1.5088374614715576, + "learning_rate": 3.0323778965105515e-06, + "loss": 1.3384, + "step": 8375 + }, + { + "epoch": 0.4493562231759657, + "grad_norm": 1.500156044960022, + "learning_rate": 3.031953450774171e-06, + "loss": 2.366, + "step": 8376 + }, + { + "epoch": 0.4494098712446352, + "grad_norm": 1.5189099311828613, + "learning_rate": 3.031528988976645e-06, + "loss": 2.2237, + "step": 8377 + }, + { + "epoch": 0.4494635193133047, + "grad_norm": 1.5585887432098389, + "learning_rate": 3.0311045111307887e-06, + "loss": 2.2638, + "step": 8378 + }, + { + "epoch": 0.4495171673819742, + "grad_norm": 1.7162344455718994, + "learning_rate": 3.030680017249419e-06, + "loss": 2.093, + "step": 8379 + }, + { + "epoch": 0.44957081545064376, + "grad_norm": 4.107916831970215, + "learning_rate": 3.0302555073453515e-06, + "loss": 2.3684, + "step": 8380 + }, + { + "epoch": 0.4496244635193133, + "grad_norm": 1.2709906101226807, + "learning_rate": 3.0298309814314043e-06, + "loss": 2.0755, + "step": 8381 + }, + { + "epoch": 0.4496781115879828, + "grad_norm": 2.6721363067626953, + "learning_rate": 3.029406439520394e-06, + "loss": 2.3498, + "step": 8382 + }, + { + "epoch": 0.44973175965665235, + "grad_norm": 1.5264629125595093, + "learning_rate": 3.02898188162514e-06, + "loss": 2.403, + "step": 8383 + }, + { + "epoch": 0.4497854077253219, + "grad_norm": 1.4518808126449585, + "learning_rate": 3.0285573077584595e-06, + "loss": 2.1511, + "step": 8384 + }, + { + "epoch": 0.4498390557939914, + "grad_norm": 1.4041424989700317, + "learning_rate": 3.0281327179331727e-06, + "loss": 1.9915, + "step": 8385 + }, + { + "epoch": 0.44989270386266095, + "grad_norm": 3.619110345840454, + "learning_rate": 3.027708112162099e-06, + "loss": 2.2578, + "step": 8386 + }, + { + "epoch": 0.4499463519313305, + "grad_norm": 1.5727635622024536, + "learning_rate": 3.027283490458058e-06, + "loss": 2.4731, + "step": 8387 + }, + { + "epoch": 0.45, + "grad_norm": 1.8106545209884644, + "learning_rate": 3.026858852833869e-06, + "loss": 2.0926, + "step": 8388 + }, + { + "epoch": 0.45005364806866954, + "grad_norm": 1.4549144506454468, + "learning_rate": 3.0264341993023548e-06, + "loss": 2.2191, + "step": 8389 + }, + { + "epoch": 0.4501072961373391, + "grad_norm": 1.72709321975708, + "learning_rate": 3.0260095298763374e-06, + "loss": 2.4125, + "step": 8390 + }, + { + "epoch": 0.4501609442060086, + "grad_norm": 1.6845989227294922, + "learning_rate": 3.025584844568637e-06, + "loss": 2.4015, + "step": 8391 + }, + { + "epoch": 0.45021459227467814, + "grad_norm": 1.3616329431533813, + "learning_rate": 3.025160143392077e-06, + "loss": 1.9455, + "step": 8392 + }, + { + "epoch": 0.4502682403433476, + "grad_norm": 1.5099834203720093, + "learning_rate": 3.0247354263594795e-06, + "loss": 2.2006, + "step": 8393 + }, + { + "epoch": 0.45032188841201715, + "grad_norm": 1.424399733543396, + "learning_rate": 3.0243106934836687e-06, + "loss": 2.388, + "step": 8394 + }, + { + "epoch": 0.4503755364806867, + "grad_norm": 1.6219607591629028, + "learning_rate": 3.023885944777468e-06, + "loss": 2.3365, + "step": 8395 + }, + { + "epoch": 0.4504291845493562, + "grad_norm": 1.2697198390960693, + "learning_rate": 3.023461180253702e-06, + "loss": 2.2057, + "step": 8396 + }, + { + "epoch": 0.45048283261802574, + "grad_norm": 1.580161452293396, + "learning_rate": 3.0230363999251956e-06, + "loss": 2.4524, + "step": 8397 + }, + { + "epoch": 0.4505364806866953, + "grad_norm": 1.4261723756790161, + "learning_rate": 3.0226116038047736e-06, + "loss": 2.0529, + "step": 8398 + }, + { + "epoch": 0.4505901287553648, + "grad_norm": 1.65313720703125, + "learning_rate": 3.0221867919052623e-06, + "loss": 2.16, + "step": 8399 + }, + { + "epoch": 0.45064377682403434, + "grad_norm": 1.5621830224990845, + "learning_rate": 3.0217619642394872e-06, + "loss": 2.2236, + "step": 8400 + }, + { + "epoch": 0.45069742489270387, + "grad_norm": 1.4613940715789795, + "learning_rate": 3.0213371208202758e-06, + "loss": 2.4128, + "step": 8401 + }, + { + "epoch": 0.4507510729613734, + "grad_norm": 1.373001217842102, + "learning_rate": 3.0209122616604554e-06, + "loss": 2.0888, + "step": 8402 + }, + { + "epoch": 0.45080472103004293, + "grad_norm": 1.541587233543396, + "learning_rate": 3.0204873867728535e-06, + "loss": 2.2537, + "step": 8403 + }, + { + "epoch": 0.45085836909871246, + "grad_norm": 1.1452425718307495, + "learning_rate": 3.0200624961702973e-06, + "loss": 2.1618, + "step": 8404 + }, + { + "epoch": 0.450912017167382, + "grad_norm": 1.2700392007827759, + "learning_rate": 3.019637589865616e-06, + "loss": 2.5287, + "step": 8405 + }, + { + "epoch": 0.4509656652360515, + "grad_norm": 2.276151418685913, + "learning_rate": 3.0192126678716394e-06, + "loss": 2.2773, + "step": 8406 + }, + { + "epoch": 0.451019313304721, + "grad_norm": 1.7869900465011597, + "learning_rate": 3.018787730201197e-06, + "loss": 2.6724, + "step": 8407 + }, + { + "epoch": 0.45107296137339054, + "grad_norm": 1.6125644445419312, + "learning_rate": 3.018362776867118e-06, + "loss": 2.0515, + "step": 8408 + }, + { + "epoch": 0.45112660944206007, + "grad_norm": 1.419479489326477, + "learning_rate": 3.0179378078822335e-06, + "loss": 2.2518, + "step": 8409 + }, + { + "epoch": 0.4511802575107296, + "grad_norm": 1.7026028633117676, + "learning_rate": 3.0175128232593733e-06, + "loss": 2.5887, + "step": 8410 + }, + { + "epoch": 0.45123390557939913, + "grad_norm": 1.7513402700424194, + "learning_rate": 3.0170878230113704e-06, + "loss": 2.3004, + "step": 8411 + }, + { + "epoch": 0.45128755364806866, + "grad_norm": 1.5148708820343018, + "learning_rate": 3.0166628071510566e-06, + "loss": 2.3711, + "step": 8412 + }, + { + "epoch": 0.4513412017167382, + "grad_norm": 1.5965909957885742, + "learning_rate": 3.0162377756912644e-06, + "loss": 2.2139, + "step": 8413 + }, + { + "epoch": 0.4513948497854077, + "grad_norm": 1.2514725923538208, + "learning_rate": 3.0158127286448246e-06, + "loss": 1.8648, + "step": 8414 + }, + { + "epoch": 0.45144849785407726, + "grad_norm": 1.648686408996582, + "learning_rate": 3.0153876660245736e-06, + "loss": 2.3274, + "step": 8415 + }, + { + "epoch": 0.4515021459227468, + "grad_norm": 1.6019989252090454, + "learning_rate": 3.0149625878433427e-06, + "loss": 2.2335, + "step": 8416 + }, + { + "epoch": 0.4515557939914163, + "grad_norm": 1.273440957069397, + "learning_rate": 3.014537494113968e-06, + "loss": 2.6745, + "step": 8417 + }, + { + "epoch": 0.45160944206008585, + "grad_norm": 2.335068464279175, + "learning_rate": 3.0141123848492828e-06, + "loss": 2.2905, + "step": 8418 + }, + { + "epoch": 0.4516630901287554, + "grad_norm": 1.6905550956726074, + "learning_rate": 3.0136872600621227e-06, + "loss": 2.2254, + "step": 8419 + }, + { + "epoch": 0.4517167381974249, + "grad_norm": 1.4539273977279663, + "learning_rate": 3.0132621197653243e-06, + "loss": 2.0929, + "step": 8420 + }, + { + "epoch": 0.45177038626609445, + "grad_norm": 1.6236536502838135, + "learning_rate": 3.012836963971723e-06, + "loss": 2.1649, + "step": 8421 + }, + { + "epoch": 0.4518240343347639, + "grad_norm": 1.2660270929336548, + "learning_rate": 3.0124117926941555e-06, + "loss": 1.2364, + "step": 8422 + }, + { + "epoch": 0.45187768240343346, + "grad_norm": 2.2493247985839844, + "learning_rate": 3.0119866059454596e-06, + "loss": 2.2925, + "step": 8423 + }, + { + "epoch": 0.451931330472103, + "grad_norm": 1.9846365451812744, + "learning_rate": 3.0115614037384712e-06, + "loss": 2.5132, + "step": 8424 + }, + { + "epoch": 0.4519849785407725, + "grad_norm": 1.7736921310424805, + "learning_rate": 3.0111361860860306e-06, + "loss": 2.451, + "step": 8425 + }, + { + "epoch": 0.45203862660944205, + "grad_norm": 1.1813709735870361, + "learning_rate": 3.010710953000974e-06, + "loss": 2.4631, + "step": 8426 + }, + { + "epoch": 0.4520922746781116, + "grad_norm": 1.7993085384368896, + "learning_rate": 3.0102857044961415e-06, + "loss": 2.3617, + "step": 8427 + }, + { + "epoch": 0.4521459227467811, + "grad_norm": 1.5055296421051025, + "learning_rate": 3.009860440584373e-06, + "loss": 2.386, + "step": 8428 + }, + { + "epoch": 0.45219957081545065, + "grad_norm": 1.4176268577575684, + "learning_rate": 3.0094351612785073e-06, + "loss": 2.1476, + "step": 8429 + }, + { + "epoch": 0.4522532188841202, + "grad_norm": 1.594693660736084, + "learning_rate": 3.009009866591386e-06, + "loss": 2.256, + "step": 8430 + }, + { + "epoch": 0.4523068669527897, + "grad_norm": 1.270418643951416, + "learning_rate": 3.008584556535849e-06, + "loss": 1.9326, + "step": 8431 + }, + { + "epoch": 0.45236051502145924, + "grad_norm": 1.724379539489746, + "learning_rate": 3.0081592311247378e-06, + "loss": 2.1613, + "step": 8432 + }, + { + "epoch": 0.4524141630901288, + "grad_norm": 1.8563522100448608, + "learning_rate": 3.007733890370893e-06, + "loss": 2.1769, + "step": 8433 + }, + { + "epoch": 0.4524678111587983, + "grad_norm": 1.1668548583984375, + "learning_rate": 3.0073085342871592e-06, + "loss": 2.1736, + "step": 8434 + }, + { + "epoch": 0.45252145922746784, + "grad_norm": 1.3384054899215698, + "learning_rate": 3.006883162886378e-06, + "loss": 1.9235, + "step": 8435 + }, + { + "epoch": 0.4525751072961373, + "grad_norm": 1.6458275318145752, + "learning_rate": 3.0064577761813918e-06, + "loss": 2.3726, + "step": 8436 + }, + { + "epoch": 0.45262875536480685, + "grad_norm": 1.6826012134552002, + "learning_rate": 3.006032374185045e-06, + "loss": 2.3856, + "step": 8437 + }, + { + "epoch": 0.4526824034334764, + "grad_norm": 1.4322094917297363, + "learning_rate": 3.005606956910182e-06, + "loss": 2.3857, + "step": 8438 + }, + { + "epoch": 0.4527360515021459, + "grad_norm": 1.5496189594268799, + "learning_rate": 3.005181524369646e-06, + "loss": 2.1697, + "step": 8439 + }, + { + "epoch": 0.45278969957081544, + "grad_norm": 1.4682562351226807, + "learning_rate": 3.004756076576283e-06, + "loss": 2.3134, + "step": 8440 + }, + { + "epoch": 0.45284334763948497, + "grad_norm": 4.379953861236572, + "learning_rate": 3.0043306135429385e-06, + "loss": 2.2165, + "step": 8441 + }, + { + "epoch": 0.4528969957081545, + "grad_norm": 1.689222812652588, + "learning_rate": 3.003905135282458e-06, + "loss": 2.7127, + "step": 8442 + }, + { + "epoch": 0.45295064377682404, + "grad_norm": 1.7560914754867554, + "learning_rate": 3.0034796418076873e-06, + "loss": 2.2294, + "step": 8443 + }, + { + "epoch": 0.45300429184549357, + "grad_norm": 1.2820957899093628, + "learning_rate": 3.0030541331314744e-06, + "loss": 2.2987, + "step": 8444 + }, + { + "epoch": 0.4530579399141631, + "grad_norm": 1.5750411748886108, + "learning_rate": 3.002628609266666e-06, + "loss": 2.0561, + "step": 8445 + }, + { + "epoch": 0.45311158798283263, + "grad_norm": 1.54265558719635, + "learning_rate": 3.0022030702261105e-06, + "loss": 2.1988, + "step": 8446 + }, + { + "epoch": 0.45316523605150216, + "grad_norm": 1.257224440574646, + "learning_rate": 3.001777516022655e-06, + "loss": 1.9803, + "step": 8447 + }, + { + "epoch": 0.4532188841201717, + "grad_norm": 1.606240153312683, + "learning_rate": 3.0013519466691494e-06, + "loss": 2.1825, + "step": 8448 + }, + { + "epoch": 0.4532725321888412, + "grad_norm": 1.8263992071151733, + "learning_rate": 3.000926362178441e-06, + "loss": 2.3032, + "step": 8449 + }, + { + "epoch": 0.4533261802575107, + "grad_norm": 1.6938695907592773, + "learning_rate": 3.000500762563381e-06, + "loss": 2.3816, + "step": 8450 + }, + { + "epoch": 0.45337982832618023, + "grad_norm": 1.8769086599349976, + "learning_rate": 3.000075147836819e-06, + "loss": 2.1767, + "step": 8451 + }, + { + "epoch": 0.45343347639484977, + "grad_norm": 1.5968912839889526, + "learning_rate": 2.999649518011606e-06, + "loss": 2.3889, + "step": 8452 + }, + { + "epoch": 0.4534871244635193, + "grad_norm": 1.4536869525909424, + "learning_rate": 2.999223873100591e-06, + "loss": 2.4211, + "step": 8453 + }, + { + "epoch": 0.45354077253218883, + "grad_norm": 1.8839291334152222, + "learning_rate": 2.9987982131166277e-06, + "loss": 2.4117, + "step": 8454 + }, + { + "epoch": 0.45359442060085836, + "grad_norm": 1.4150179624557495, + "learning_rate": 2.9983725380725667e-06, + "loss": 2.2021, + "step": 8455 + }, + { + "epoch": 0.4536480686695279, + "grad_norm": 1.3256953954696655, + "learning_rate": 2.9979468479812605e-06, + "loss": 2.2295, + "step": 8456 + }, + { + "epoch": 0.4537017167381974, + "grad_norm": 1.4684526920318604, + "learning_rate": 2.9975211428555616e-06, + "loss": 2.4334, + "step": 8457 + }, + { + "epoch": 0.45375536480686696, + "grad_norm": 1.6525191068649292, + "learning_rate": 2.9970954227083243e-06, + "loss": 2.2956, + "step": 8458 + }, + { + "epoch": 0.4538090128755365, + "grad_norm": 1.5435229539871216, + "learning_rate": 2.9966696875524013e-06, + "loss": 2.0874, + "step": 8459 + }, + { + "epoch": 0.453862660944206, + "grad_norm": 1.4367908239364624, + "learning_rate": 2.996243937400647e-06, + "loss": 2.2319, + "step": 8460 + }, + { + "epoch": 0.45391630901287555, + "grad_norm": 1.4570984840393066, + "learning_rate": 2.995818172265916e-06, + "loss": 2.3163, + "step": 8461 + }, + { + "epoch": 0.4539699570815451, + "grad_norm": 1.8435587882995605, + "learning_rate": 2.995392392161063e-06, + "loss": 2.1066, + "step": 8462 + }, + { + "epoch": 0.4540236051502146, + "grad_norm": 1.5223029851913452, + "learning_rate": 2.994966597098944e-06, + "loss": 2.2742, + "step": 8463 + }, + { + "epoch": 0.45407725321888415, + "grad_norm": 1.3966004848480225, + "learning_rate": 2.9945407870924144e-06, + "loss": 1.9397, + "step": 8464 + }, + { + "epoch": 0.4541309012875536, + "grad_norm": 1.3498709201812744, + "learning_rate": 2.994114962154331e-06, + "loss": 2.1454, + "step": 8465 + }, + { + "epoch": 0.45418454935622316, + "grad_norm": 1.2302484512329102, + "learning_rate": 2.99368912229755e-06, + "loss": 1.9917, + "step": 8466 + }, + { + "epoch": 0.4542381974248927, + "grad_norm": 1.4776437282562256, + "learning_rate": 2.9932632675349293e-06, + "loss": 2.269, + "step": 8467 + }, + { + "epoch": 0.4542918454935622, + "grad_norm": 1.8276113271713257, + "learning_rate": 2.992837397879327e-06, + "loss": 2.1754, + "step": 8468 + }, + { + "epoch": 0.45434549356223175, + "grad_norm": 1.7410334348678589, + "learning_rate": 2.992411513343601e-06, + "loss": 2.2366, + "step": 8469 + }, + { + "epoch": 0.4543991416309013, + "grad_norm": 7.015322685241699, + "learning_rate": 2.9919856139406095e-06, + "loss": 2.1833, + "step": 8470 + }, + { + "epoch": 0.4544527896995708, + "grad_norm": 1.6317375898361206, + "learning_rate": 2.991559699683211e-06, + "loss": 2.5238, + "step": 8471 + }, + { + "epoch": 0.45450643776824035, + "grad_norm": 1.4562642574310303, + "learning_rate": 2.9911337705842663e-06, + "loss": 2.2142, + "step": 8472 + }, + { + "epoch": 0.4545600858369099, + "grad_norm": 1.4991774559020996, + "learning_rate": 2.9907078266566354e-06, + "loss": 2.227, + "step": 8473 + }, + { + "epoch": 0.4546137339055794, + "grad_norm": 1.1926480531692505, + "learning_rate": 2.9902818679131777e-06, + "loss": 1.9053, + "step": 8474 + }, + { + "epoch": 0.45466738197424894, + "grad_norm": 1.8449056148529053, + "learning_rate": 2.989855894366755e-06, + "loss": 2.1115, + "step": 8475 + }, + { + "epoch": 0.4547210300429185, + "grad_norm": 1.6305173635482788, + "learning_rate": 2.989429906030228e-06, + "loss": 2.0652, + "step": 8476 + }, + { + "epoch": 0.454774678111588, + "grad_norm": 2.176271438598633, + "learning_rate": 2.989003902916459e-06, + "loss": 2.4027, + "step": 8477 + }, + { + "epoch": 0.45482832618025754, + "grad_norm": 1.9448421001434326, + "learning_rate": 2.988577885038309e-06, + "loss": 2.4766, + "step": 8478 + }, + { + "epoch": 0.454881974248927, + "grad_norm": 1.7932624816894531, + "learning_rate": 2.9881518524086424e-06, + "loss": 2.1715, + "step": 8479 + }, + { + "epoch": 0.45493562231759654, + "grad_norm": 10.770556449890137, + "learning_rate": 2.9877258050403214e-06, + "loss": 2.3573, + "step": 8480 + }, + { + "epoch": 0.4549892703862661, + "grad_norm": 1.63186776638031, + "learning_rate": 2.9872997429462094e-06, + "loss": 2.3591, + "step": 8481 + }, + { + "epoch": 0.4550429184549356, + "grad_norm": 1.1950782537460327, + "learning_rate": 2.98687366613917e-06, + "loss": 2.3434, + "step": 8482 + }, + { + "epoch": 0.45509656652360514, + "grad_norm": 1.3373425006866455, + "learning_rate": 2.9864475746320686e-06, + "loss": 2.545, + "step": 8483 + }, + { + "epoch": 0.45515021459227467, + "grad_norm": 1.394322395324707, + "learning_rate": 2.98602146843777e-06, + "loss": 2.2799, + "step": 8484 + }, + { + "epoch": 0.4552038626609442, + "grad_norm": 1.4521267414093018, + "learning_rate": 2.9855953475691392e-06, + "loss": 2.0951, + "step": 8485 + }, + { + "epoch": 0.45525751072961373, + "grad_norm": 1.4906973838806152, + "learning_rate": 2.985169212039042e-06, + "loss": 2.3616, + "step": 8486 + }, + { + "epoch": 0.45531115879828327, + "grad_norm": 1.6045711040496826, + "learning_rate": 2.984743061860345e-06, + "loss": 1.5049, + "step": 8487 + }, + { + "epoch": 0.4553648068669528, + "grad_norm": 1.4417780637741089, + "learning_rate": 2.984316897045914e-06, + "loss": 2.2572, + "step": 8488 + }, + { + "epoch": 0.45541845493562233, + "grad_norm": 1.7175252437591553, + "learning_rate": 2.983890717608616e-06, + "loss": 2.3871, + "step": 8489 + }, + { + "epoch": 0.45547210300429186, + "grad_norm": 1.256291151046753, + "learning_rate": 2.9834645235613203e-06, + "loss": 2.188, + "step": 8490 + }, + { + "epoch": 0.4555257510729614, + "grad_norm": 1.4453824758529663, + "learning_rate": 2.9830383149168928e-06, + "loss": 2.1571, + "step": 8491 + }, + { + "epoch": 0.4555793991416309, + "grad_norm": 1.9120060205459595, + "learning_rate": 2.9826120916882032e-06, + "loss": 1.9187, + "step": 8492 + }, + { + "epoch": 0.4556330472103004, + "grad_norm": 12.029306411743164, + "learning_rate": 2.9821858538881202e-06, + "loss": 2.2685, + "step": 8493 + }, + { + "epoch": 0.45568669527896993, + "grad_norm": 1.6247375011444092, + "learning_rate": 2.9817596015295123e-06, + "loss": 2.3132, + "step": 8494 + }, + { + "epoch": 0.45574034334763946, + "grad_norm": 1.4291731119155884, + "learning_rate": 2.98133333462525e-06, + "loss": 2.3177, + "step": 8495 + }, + { + "epoch": 0.455793991416309, + "grad_norm": 1.3954130411148071, + "learning_rate": 2.9809070531882033e-06, + "loss": 2.0905, + "step": 8496 + }, + { + "epoch": 0.45584763948497853, + "grad_norm": 1.6046708822250366, + "learning_rate": 2.9804807572312426e-06, + "loss": 2.4053, + "step": 8497 + }, + { + "epoch": 0.45590128755364806, + "grad_norm": 1.531225323677063, + "learning_rate": 2.9800544467672393e-06, + "loss": 2.3909, + "step": 8498 + }, + { + "epoch": 0.4559549356223176, + "grad_norm": 1.5712811946868896, + "learning_rate": 2.979628121809065e-06, + "loss": 2.3359, + "step": 8499 + }, + { + "epoch": 0.4560085836909871, + "grad_norm": 1.5735650062561035, + "learning_rate": 2.9792017823695907e-06, + "loss": 2.3643, + "step": 8500 + }, + { + "epoch": 0.45606223175965666, + "grad_norm": 1.3569775819778442, + "learning_rate": 2.9787754284616897e-06, + "loss": 2.3336, + "step": 8501 + }, + { + "epoch": 0.4561158798283262, + "grad_norm": 1.2899388074874878, + "learning_rate": 2.9783490600982347e-06, + "loss": 1.9689, + "step": 8502 + }, + { + "epoch": 0.4561695278969957, + "grad_norm": 1.206030011177063, + "learning_rate": 2.9779226772920988e-06, + "loss": 2.1694, + "step": 8503 + }, + { + "epoch": 0.45622317596566525, + "grad_norm": 1.4632991552352905, + "learning_rate": 2.9774962800561557e-06, + "loss": 2.2774, + "step": 8504 + }, + { + "epoch": 0.4562768240343348, + "grad_norm": 1.6464836597442627, + "learning_rate": 2.977069868403279e-06, + "loss": 2.3012, + "step": 8505 + }, + { + "epoch": 0.4563304721030043, + "grad_norm": 1.7580739259719849, + "learning_rate": 2.976643442346344e-06, + "loss": 2.2888, + "step": 8506 + }, + { + "epoch": 0.45638412017167385, + "grad_norm": 1.3476372957229614, + "learning_rate": 2.9762170018982252e-06, + "loss": 2.4795, + "step": 8507 + }, + { + "epoch": 0.4564377682403433, + "grad_norm": 1.4373703002929688, + "learning_rate": 2.9757905470717984e-06, + "loss": 2.4154, + "step": 8508 + }, + { + "epoch": 0.45649141630901285, + "grad_norm": 1.5947518348693848, + "learning_rate": 2.9753640778799397e-06, + "loss": 2.2297, + "step": 8509 + }, + { + "epoch": 0.4565450643776824, + "grad_norm": 1.3559566736221313, + "learning_rate": 2.9749375943355245e-06, + "loss": 2.1611, + "step": 8510 + }, + { + "epoch": 0.4565987124463519, + "grad_norm": 1.8397332429885864, + "learning_rate": 2.974511096451429e-06, + "loss": 2.2645, + "step": 8511 + }, + { + "epoch": 0.45665236051502145, + "grad_norm": 1.4102994203567505, + "learning_rate": 2.9740845842405324e-06, + "loss": 1.6931, + "step": 8512 + }, + { + "epoch": 0.456706008583691, + "grad_norm": 1.553154706954956, + "learning_rate": 2.973658057715711e-06, + "loss": 2.1984, + "step": 8513 + }, + { + "epoch": 0.4567596566523605, + "grad_norm": 1.5377551317214966, + "learning_rate": 2.9732315168898434e-06, + "loss": 1.742, + "step": 8514 + }, + { + "epoch": 0.45681330472103004, + "grad_norm": 1.5819438695907593, + "learning_rate": 2.972804961775807e-06, + "loss": 2.1434, + "step": 8515 + }, + { + "epoch": 0.4568669527896996, + "grad_norm": 1.6798235177993774, + "learning_rate": 2.9723783923864817e-06, + "loss": 2.292, + "step": 8516 + }, + { + "epoch": 0.4569206008583691, + "grad_norm": 1.621486783027649, + "learning_rate": 2.9719518087347466e-06, + "loss": 2.0562, + "step": 8517 + }, + { + "epoch": 0.45697424892703864, + "grad_norm": 1.8811949491500854, + "learning_rate": 2.971525210833481e-06, + "loss": 2.2396, + "step": 8518 + }, + { + "epoch": 0.45702789699570817, + "grad_norm": 1.7225056886672974, + "learning_rate": 2.971098598695565e-06, + "loss": 2.2998, + "step": 8519 + }, + { + "epoch": 0.4570815450643777, + "grad_norm": 1.5601576566696167, + "learning_rate": 2.9706719723338794e-06, + "loss": 2.2689, + "step": 8520 + }, + { + "epoch": 0.45713519313304724, + "grad_norm": 1.9287315607070923, + "learning_rate": 2.970245331761306e-06, + "loss": 2.3883, + "step": 8521 + }, + { + "epoch": 0.4571888412017167, + "grad_norm": 2.0466485023498535, + "learning_rate": 2.9698186769907256e-06, + "loss": 2.2285, + "step": 8522 + }, + { + "epoch": 0.45724248927038624, + "grad_norm": 1.4725170135498047, + "learning_rate": 2.96939200803502e-06, + "loss": 2.1667, + "step": 8523 + }, + { + "epoch": 0.4572961373390558, + "grad_norm": 1.5707913637161255, + "learning_rate": 2.9689653249070718e-06, + "loss": 2.1919, + "step": 8524 + }, + { + "epoch": 0.4573497854077253, + "grad_norm": 2.3382134437561035, + "learning_rate": 2.968538627619763e-06, + "loss": 2.3481, + "step": 8525 + }, + { + "epoch": 0.45740343347639484, + "grad_norm": 1.4404429197311401, + "learning_rate": 2.968111916185978e-06, + "loss": 2.1253, + "step": 8526 + }, + { + "epoch": 0.45745708154506437, + "grad_norm": 1.9075185060501099, + "learning_rate": 2.9676851906186e-06, + "loss": 2.4885, + "step": 8527 + }, + { + "epoch": 0.4575107296137339, + "grad_norm": 1.8546538352966309, + "learning_rate": 2.9672584509305112e-06, + "loss": 2.2658, + "step": 8528 + }, + { + "epoch": 0.45756437768240343, + "grad_norm": 1.3345239162445068, + "learning_rate": 2.966831697134599e-06, + "loss": 2.2342, + "step": 8529 + }, + { + "epoch": 0.45761802575107297, + "grad_norm": 1.530107855796814, + "learning_rate": 2.9664049292437465e-06, + "loss": 2.3791, + "step": 8530 + }, + { + "epoch": 0.4576716738197425, + "grad_norm": 1.5487178564071655, + "learning_rate": 2.9659781472708397e-06, + "loss": 2.3136, + "step": 8531 + }, + { + "epoch": 0.45772532188841203, + "grad_norm": 1.5166698694229126, + "learning_rate": 2.9655513512287644e-06, + "loss": 1.3992, + "step": 8532 + }, + { + "epoch": 0.45777896995708156, + "grad_norm": 1.5956025123596191, + "learning_rate": 2.965124541130405e-06, + "loss": 2.6105, + "step": 8533 + }, + { + "epoch": 0.4578326180257511, + "grad_norm": 1.2321346998214722, + "learning_rate": 2.9646977169886504e-06, + "loss": 2.0284, + "step": 8534 + }, + { + "epoch": 0.4578862660944206, + "grad_norm": 1.803460717201233, + "learning_rate": 2.9642708788163866e-06, + "loss": 2.257, + "step": 8535 + }, + { + "epoch": 0.45793991416309016, + "grad_norm": 1.379502534866333, + "learning_rate": 2.9638440266265013e-06, + "loss": 2.3683, + "step": 8536 + }, + { + "epoch": 0.45799356223175963, + "grad_norm": 1.459061622619629, + "learning_rate": 2.9634171604318816e-06, + "loss": 2.4166, + "step": 8537 + }, + { + "epoch": 0.45804721030042916, + "grad_norm": 1.634279727935791, + "learning_rate": 2.962990280245417e-06, + "loss": 2.2485, + "step": 8538 + }, + { + "epoch": 0.4581008583690987, + "grad_norm": 1.5042693614959717, + "learning_rate": 2.9625633860799952e-06, + "loss": 1.7802, + "step": 8539 + }, + { + "epoch": 0.4581545064377682, + "grad_norm": 1.5282906293869019, + "learning_rate": 2.962136477948505e-06, + "loss": 2.5708, + "step": 8540 + }, + { + "epoch": 0.45820815450643776, + "grad_norm": 1.4006789922714233, + "learning_rate": 2.9617095558638376e-06, + "loss": 2.3098, + "step": 8541 + }, + { + "epoch": 0.4582618025751073, + "grad_norm": 1.3141058683395386, + "learning_rate": 2.961282619838881e-06, + "loss": 1.487, + "step": 8542 + }, + { + "epoch": 0.4583154506437768, + "grad_norm": 1.4388060569763184, + "learning_rate": 2.960855669886527e-06, + "loss": 2.4214, + "step": 8543 + }, + { + "epoch": 0.45836909871244635, + "grad_norm": 1.1679131984710693, + "learning_rate": 2.9604287060196658e-06, + "loss": 2.0097, + "step": 8544 + }, + { + "epoch": 0.4584227467811159, + "grad_norm": 1.5676844120025635, + "learning_rate": 2.9600017282511884e-06, + "loss": 2.2814, + "step": 8545 + }, + { + "epoch": 0.4584763948497854, + "grad_norm": 1.4812867641448975, + "learning_rate": 2.9595747365939868e-06, + "loss": 2.0938, + "step": 8546 + }, + { + "epoch": 0.45853004291845495, + "grad_norm": 1.4551677703857422, + "learning_rate": 2.9591477310609535e-06, + "loss": 2.1318, + "step": 8547 + }, + { + "epoch": 0.4585836909871245, + "grad_norm": 1.5939582586288452, + "learning_rate": 2.95872071166498e-06, + "loss": 2.0913, + "step": 8548 + }, + { + "epoch": 0.458637339055794, + "grad_norm": 1.651092290878296, + "learning_rate": 2.958293678418961e-06, + "loss": 2.3077, + "step": 8549 + }, + { + "epoch": 0.45869098712446355, + "grad_norm": 1.4225586652755737, + "learning_rate": 2.9578666313357866e-06, + "loss": 2.3416, + "step": 8550 + }, + { + "epoch": 0.458744635193133, + "grad_norm": 1.5088672637939453, + "learning_rate": 2.957439570428353e-06, + "loss": 1.9287, + "step": 8551 + }, + { + "epoch": 0.45879828326180255, + "grad_norm": 1.6284856796264648, + "learning_rate": 2.9570124957095543e-06, + "loss": 2.1222, + "step": 8552 + }, + { + "epoch": 0.4588519313304721, + "grad_norm": 2.3220691680908203, + "learning_rate": 2.9565854071922844e-06, + "loss": 2.2327, + "step": 8553 + }, + { + "epoch": 0.4589055793991416, + "grad_norm": 1.8562283515930176, + "learning_rate": 2.956158304889438e-06, + "loss": 2.3095, + "step": 8554 + }, + { + "epoch": 0.45895922746781115, + "grad_norm": 1.5497322082519531, + "learning_rate": 2.955731188813912e-06, + "loss": 2.2761, + "step": 8555 + }, + { + "epoch": 0.4590128755364807, + "grad_norm": 1.8241865634918213, + "learning_rate": 2.9553040589785996e-06, + "loss": 2.2406, + "step": 8556 + }, + { + "epoch": 0.4590665236051502, + "grad_norm": 1.5430917739868164, + "learning_rate": 2.9548769153963992e-06, + "loss": 2.1873, + "step": 8557 + }, + { + "epoch": 0.45912017167381974, + "grad_norm": 1.4275166988372803, + "learning_rate": 2.954449758080207e-06, + "loss": 2.1043, + "step": 8558 + }, + { + "epoch": 0.4591738197424893, + "grad_norm": 1.185896635055542, + "learning_rate": 2.9540225870429203e-06, + "loss": 2.1244, + "step": 8559 + }, + { + "epoch": 0.4592274678111588, + "grad_norm": 1.4922709465026855, + "learning_rate": 2.9535954022974356e-06, + "loss": 2.4319, + "step": 8560 + }, + { + "epoch": 0.45928111587982834, + "grad_norm": 1.3636101484298706, + "learning_rate": 2.9531682038566518e-06, + "loss": 2.1703, + "step": 8561 + }, + { + "epoch": 0.45933476394849787, + "grad_norm": 1.2504459619522095, + "learning_rate": 2.952740991733466e-06, + "loss": 2.1038, + "step": 8562 + }, + { + "epoch": 0.4593884120171674, + "grad_norm": 1.730621576309204, + "learning_rate": 2.952313765940778e-06, + "loss": 2.2456, + "step": 8563 + }, + { + "epoch": 0.45944206008583693, + "grad_norm": 1.3464761972427368, + "learning_rate": 2.951886526491487e-06, + "loss": 1.3683, + "step": 8564 + }, + { + "epoch": 0.4594957081545064, + "grad_norm": 1.1516870260238647, + "learning_rate": 2.9514592733984913e-06, + "loss": 2.0833, + "step": 8565 + }, + { + "epoch": 0.45954935622317594, + "grad_norm": 1.4696297645568848, + "learning_rate": 2.9510320066746923e-06, + "loss": 2.317, + "step": 8566 + }, + { + "epoch": 0.4596030042918455, + "grad_norm": 1.2848308086395264, + "learning_rate": 2.9506047263329896e-06, + "loss": 1.5934, + "step": 8567 + }, + { + "epoch": 0.459656652360515, + "grad_norm": 1.293823003768921, + "learning_rate": 2.9501774323862843e-06, + "loss": 2.2631, + "step": 8568 + }, + { + "epoch": 0.45971030042918454, + "grad_norm": 1.4550652503967285, + "learning_rate": 2.9497501248474777e-06, + "loss": 2.1655, + "step": 8569 + }, + { + "epoch": 0.45976394849785407, + "grad_norm": 1.6155637502670288, + "learning_rate": 2.9493228037294704e-06, + "loss": 2.4671, + "step": 8570 + }, + { + "epoch": 0.4598175965665236, + "grad_norm": 1.4410358667373657, + "learning_rate": 2.948895469045166e-06, + "loss": 2.2247, + "step": 8571 + }, + { + "epoch": 0.45987124463519313, + "grad_norm": 2.0347273349761963, + "learning_rate": 2.948468120807465e-06, + "loss": 2.2182, + "step": 8572 + }, + { + "epoch": 0.45992489270386266, + "grad_norm": 1.4325189590454102, + "learning_rate": 2.9480407590292715e-06, + "loss": 2.2845, + "step": 8573 + }, + { + "epoch": 0.4599785407725322, + "grad_norm": 1.379155158996582, + "learning_rate": 2.947613383723489e-06, + "loss": 2.0368, + "step": 8574 + }, + { + "epoch": 0.46003218884120173, + "grad_norm": 1.6581634283065796, + "learning_rate": 2.9471859949030203e-06, + "loss": 2.3072, + "step": 8575 + }, + { + "epoch": 0.46008583690987126, + "grad_norm": 1.3969151973724365, + "learning_rate": 2.9467585925807694e-06, + "loss": 2.2391, + "step": 8576 + }, + { + "epoch": 0.4601394849785408, + "grad_norm": 2.3076860904693604, + "learning_rate": 2.9463311767696417e-06, + "loss": 2.0673, + "step": 8577 + }, + { + "epoch": 0.4601931330472103, + "grad_norm": 1.572458028793335, + "learning_rate": 2.9459037474825413e-06, + "loss": 2.3045, + "step": 8578 + }, + { + "epoch": 0.46024678111587985, + "grad_norm": 1.3093385696411133, + "learning_rate": 2.945476304732373e-06, + "loss": 2.0533, + "step": 8579 + }, + { + "epoch": 0.46030042918454933, + "grad_norm": 1.6649353504180908, + "learning_rate": 2.945048848532045e-06, + "loss": 2.1801, + "step": 8580 + }, + { + "epoch": 0.46035407725321886, + "grad_norm": 1.5760871171951294, + "learning_rate": 2.94462137889446e-06, + "loss": 2.2153, + "step": 8581 + }, + { + "epoch": 0.4604077253218884, + "grad_norm": 1.373711109161377, + "learning_rate": 2.9441938958325268e-06, + "loss": 2.1486, + "step": 8582 + }, + { + "epoch": 0.4604613733905579, + "grad_norm": 1.5877070426940918, + "learning_rate": 2.9437663993591507e-06, + "loss": 2.1426, + "step": 8583 + }, + { + "epoch": 0.46051502145922746, + "grad_norm": 1.3515058755874634, + "learning_rate": 2.9433388894872407e-06, + "loss": 2.2806, + "step": 8584 + }, + { + "epoch": 0.460568669527897, + "grad_norm": 1.5057801008224487, + "learning_rate": 2.9429113662297032e-06, + "loss": 2.5324, + "step": 8585 + }, + { + "epoch": 0.4606223175965665, + "grad_norm": 1.3781347274780273, + "learning_rate": 2.9424838295994464e-06, + "loss": 2.2196, + "step": 8586 + }, + { + "epoch": 0.46067596566523605, + "grad_norm": 1.4883724451065063, + "learning_rate": 2.9420562796093797e-06, + "loss": 2.155, + "step": 8587 + }, + { + "epoch": 0.4607296137339056, + "grad_norm": 1.2386267185211182, + "learning_rate": 2.941628716272411e-06, + "loss": 2.2897, + "step": 8588 + }, + { + "epoch": 0.4607832618025751, + "grad_norm": 1.4904791116714478, + "learning_rate": 2.9412011396014513e-06, + "loss": 2.4764, + "step": 8589 + }, + { + "epoch": 0.46083690987124465, + "grad_norm": 1.4377118349075317, + "learning_rate": 2.9407735496094075e-06, + "loss": 2.2607, + "step": 8590 + }, + { + "epoch": 0.4608905579399142, + "grad_norm": 1.3032056093215942, + "learning_rate": 2.940345946309192e-06, + "loss": 2.0865, + "step": 8591 + }, + { + "epoch": 0.4609442060085837, + "grad_norm": 1.522532343864441, + "learning_rate": 2.9399183297137145e-06, + "loss": 2.3798, + "step": 8592 + }, + { + "epoch": 0.46099785407725324, + "grad_norm": 1.4652634859085083, + "learning_rate": 2.939490699835887e-06, + "loss": 2.3291, + "step": 8593 + }, + { + "epoch": 0.4610515021459227, + "grad_norm": 1.6100796461105347, + "learning_rate": 2.9390630566886193e-06, + "loss": 2.1847, + "step": 8594 + }, + { + "epoch": 0.46110515021459225, + "grad_norm": 1.753370761871338, + "learning_rate": 2.9386354002848235e-06, + "loss": 2.2256, + "step": 8595 + }, + { + "epoch": 0.4611587982832618, + "grad_norm": 1.5946520566940308, + "learning_rate": 2.9382077306374113e-06, + "loss": 2.0785, + "step": 8596 + }, + { + "epoch": 0.4612124463519313, + "grad_norm": 1.5514724254608154, + "learning_rate": 2.937780047759298e-06, + "loss": 2.1683, + "step": 8597 + }, + { + "epoch": 0.46126609442060085, + "grad_norm": 1.5482949018478394, + "learning_rate": 2.937352351663393e-06, + "loss": 2.1543, + "step": 8598 + }, + { + "epoch": 0.4613197424892704, + "grad_norm": 1.443645715713501, + "learning_rate": 2.9369246423626106e-06, + "loss": 2.4103, + "step": 8599 + }, + { + "epoch": 0.4613733905579399, + "grad_norm": 1.5088039636611938, + "learning_rate": 2.9364969198698657e-06, + "loss": 2.1707, + "step": 8600 + }, + { + "epoch": 0.46142703862660944, + "grad_norm": 1.4190442562103271, + "learning_rate": 2.9360691841980716e-06, + "loss": 2.3823, + "step": 8601 + }, + { + "epoch": 0.461480686695279, + "grad_norm": 1.3274551630020142, + "learning_rate": 2.935641435360143e-06, + "loss": 2.3038, + "step": 8602 + }, + { + "epoch": 0.4615343347639485, + "grad_norm": 1.3520207405090332, + "learning_rate": 2.935213673368995e-06, + "loss": 2.1934, + "step": 8603 + }, + { + "epoch": 0.46158798283261804, + "grad_norm": 1.4037753343582153, + "learning_rate": 2.934785898237542e-06, + "loss": 2.2718, + "step": 8604 + }, + { + "epoch": 0.46164163090128757, + "grad_norm": 1.4440394639968872, + "learning_rate": 2.934358109978701e-06, + "loss": 2.3357, + "step": 8605 + }, + { + "epoch": 0.4616952789699571, + "grad_norm": 1.549751877784729, + "learning_rate": 2.9339303086053874e-06, + "loss": 2.465, + "step": 8606 + }, + { + "epoch": 0.46174892703862663, + "grad_norm": 1.5106244087219238, + "learning_rate": 2.933502494130518e-06, + "loss": 2.2389, + "step": 8607 + }, + { + "epoch": 0.46180257510729616, + "grad_norm": 1.5870270729064941, + "learning_rate": 2.933074666567009e-06, + "loss": 2.4564, + "step": 8608 + }, + { + "epoch": 0.46185622317596564, + "grad_norm": 1.3741997480392456, + "learning_rate": 2.9326468259277785e-06, + "loss": 2.4158, + "step": 8609 + }, + { + "epoch": 0.4619098712446352, + "grad_norm": 1.366355299949646, + "learning_rate": 2.932218972225744e-06, + "loss": 2.3828, + "step": 8610 + }, + { + "epoch": 0.4619635193133047, + "grad_norm": 1.3372726440429688, + "learning_rate": 2.9317911054738233e-06, + "loss": 2.0448, + "step": 8611 + }, + { + "epoch": 0.46201716738197424, + "grad_norm": 1.5391608476638794, + "learning_rate": 2.9313632256849345e-06, + "loss": 2.2436, + "step": 8612 + }, + { + "epoch": 0.46207081545064377, + "grad_norm": 6.101585388183594, + "learning_rate": 2.930935332871998e-06, + "loss": 2.0812, + "step": 8613 + }, + { + "epoch": 0.4621244635193133, + "grad_norm": 1.460872769355774, + "learning_rate": 2.9305074270479316e-06, + "loss": 2.2393, + "step": 8614 + }, + { + "epoch": 0.46217811158798283, + "grad_norm": 1.494729995727539, + "learning_rate": 2.9300795082256563e-06, + "loss": 2.2091, + "step": 8615 + }, + { + "epoch": 0.46223175965665236, + "grad_norm": 1.878673791885376, + "learning_rate": 2.9296515764180906e-06, + "loss": 2.3955, + "step": 8616 + }, + { + "epoch": 0.4622854077253219, + "grad_norm": 2.029452323913574, + "learning_rate": 2.9292236316381556e-06, + "loss": 2.5529, + "step": 8617 + }, + { + "epoch": 0.4623390557939914, + "grad_norm": 1.150478482246399, + "learning_rate": 2.9287956738987724e-06, + "loss": 2.1422, + "step": 8618 + }, + { + "epoch": 0.46239270386266096, + "grad_norm": 1.4028666019439697, + "learning_rate": 2.928367703212861e-06, + "loss": 2.4561, + "step": 8619 + }, + { + "epoch": 0.4624463519313305, + "grad_norm": 1.6906061172485352, + "learning_rate": 2.9279397195933455e-06, + "loss": 2.4867, + "step": 8620 + }, + { + "epoch": 0.4625, + "grad_norm": 1.3437143564224243, + "learning_rate": 2.9275117230531456e-06, + "loss": 1.4013, + "step": 8621 + }, + { + "epoch": 0.46255364806866955, + "grad_norm": 1.4869227409362793, + "learning_rate": 2.927083713605185e-06, + "loss": 2.3245, + "step": 8622 + }, + { + "epoch": 0.46260729613733903, + "grad_norm": 1.497109055519104, + "learning_rate": 2.926655691262386e-06, + "loss": 1.7254, + "step": 8623 + }, + { + "epoch": 0.46266094420600856, + "grad_norm": 1.449254035949707, + "learning_rate": 2.9262276560376718e-06, + "loss": 2.338, + "step": 8624 + }, + { + "epoch": 0.4627145922746781, + "grad_norm": 1.7465767860412598, + "learning_rate": 2.925799607943966e-06, + "loss": 2.2274, + "step": 8625 + }, + { + "epoch": 0.4627682403433476, + "grad_norm": 1.452717661857605, + "learning_rate": 2.9253715469941925e-06, + "loss": 2.425, + "step": 8626 + }, + { + "epoch": 0.46282188841201716, + "grad_norm": 1.5312227010726929, + "learning_rate": 2.9249434732012756e-06, + "loss": 2.4143, + "step": 8627 + }, + { + "epoch": 0.4628755364806867, + "grad_norm": 2.1188220977783203, + "learning_rate": 2.9245153865781405e-06, + "loss": 2.1353, + "step": 8628 + }, + { + "epoch": 0.4629291845493562, + "grad_norm": 2.0828983783721924, + "learning_rate": 2.9240872871377113e-06, + "loss": 1.5826, + "step": 8629 + }, + { + "epoch": 0.46298283261802575, + "grad_norm": 1.6123303174972534, + "learning_rate": 2.9236591748929146e-06, + "loss": 2.3103, + "step": 8630 + }, + { + "epoch": 0.4630364806866953, + "grad_norm": 1.686381459236145, + "learning_rate": 2.9232310498566753e-06, + "loss": 2.2044, + "step": 8631 + }, + { + "epoch": 0.4630901287553648, + "grad_norm": 1.6949212551116943, + "learning_rate": 2.9228029120419203e-06, + "loss": 2.0441, + "step": 8632 + }, + { + "epoch": 0.46314377682403435, + "grad_norm": 1.3553091287612915, + "learning_rate": 2.922374761461577e-06, + "loss": 2.1772, + "step": 8633 + }, + { + "epoch": 0.4631974248927039, + "grad_norm": 1.3776499032974243, + "learning_rate": 2.9219465981285717e-06, + "loss": 1.8162, + "step": 8634 + }, + { + "epoch": 0.4632510729613734, + "grad_norm": 1.6897424459457397, + "learning_rate": 2.92151842205583e-06, + "loss": 2.2447, + "step": 8635 + }, + { + "epoch": 0.46330472103004294, + "grad_norm": 1.9304746389389038, + "learning_rate": 2.921090233256283e-06, + "loss": 2.0532, + "step": 8636 + }, + { + "epoch": 0.4633583690987124, + "grad_norm": 1.3839534521102905, + "learning_rate": 2.9206620317428575e-06, + "loss": 2.1848, + "step": 8637 + }, + { + "epoch": 0.46341201716738195, + "grad_norm": 1.5131298303604126, + "learning_rate": 2.920233817528482e-06, + "loss": 2.3252, + "step": 8638 + }, + { + "epoch": 0.4634656652360515, + "grad_norm": 1.5242598056793213, + "learning_rate": 2.9198055906260845e-06, + "loss": 2.2893, + "step": 8639 + }, + { + "epoch": 0.463519313304721, + "grad_norm": 1.6620253324508667, + "learning_rate": 2.9193773510485956e-06, + "loss": 2.5412, + "step": 8640 + }, + { + "epoch": 0.46357296137339055, + "grad_norm": 1.5967803001403809, + "learning_rate": 2.9189490988089446e-06, + "loss": 2.3532, + "step": 8641 + }, + { + "epoch": 0.4636266094420601, + "grad_norm": 1.5300050973892212, + "learning_rate": 2.9185208339200626e-06, + "loss": 2.3611, + "step": 8642 + }, + { + "epoch": 0.4636802575107296, + "grad_norm": 1.5453264713287354, + "learning_rate": 2.9180925563948785e-06, + "loss": 2.3246, + "step": 8643 + }, + { + "epoch": 0.46373390557939914, + "grad_norm": 1.261572003364563, + "learning_rate": 2.9176642662463246e-06, + "loss": 2.3386, + "step": 8644 + }, + { + "epoch": 0.4637875536480687, + "grad_norm": 1.558885097503662, + "learning_rate": 2.917235963487331e-06, + "loss": 2.2925, + "step": 8645 + }, + { + "epoch": 0.4638412017167382, + "grad_norm": 2.4942638874053955, + "learning_rate": 2.91680764813083e-06, + "loss": 1.9772, + "step": 8646 + }, + { + "epoch": 0.46389484978540774, + "grad_norm": 2.209242343902588, + "learning_rate": 2.9163793201897533e-06, + "loss": 2.1999, + "step": 8647 + }, + { + "epoch": 0.46394849785407727, + "grad_norm": 1.5592516660690308, + "learning_rate": 2.9159509796770337e-06, + "loss": 2.1857, + "step": 8648 + }, + { + "epoch": 0.4640021459227468, + "grad_norm": 1.420048713684082, + "learning_rate": 2.9155226266056036e-06, + "loss": 2.2871, + "step": 8649 + }, + { + "epoch": 0.46405579399141633, + "grad_norm": 1.483932614326477, + "learning_rate": 2.915094260988397e-06, + "loss": 2.1078, + "step": 8650 + }, + { + "epoch": 0.46410944206008586, + "grad_norm": 3.7491140365600586, + "learning_rate": 2.914665882838346e-06, + "loss": 2.2628, + "step": 8651 + }, + { + "epoch": 0.46416309012875534, + "grad_norm": 1.4560986757278442, + "learning_rate": 2.914237492168385e-06, + "loss": 2.119, + "step": 8652 + }, + { + "epoch": 0.46421673819742487, + "grad_norm": 1.6182823181152344, + "learning_rate": 2.9138090889914494e-06, + "loss": 2.2389, + "step": 8653 + }, + { + "epoch": 0.4642703862660944, + "grad_norm": 1.5108447074890137, + "learning_rate": 2.913380673320473e-06, + "loss": 2.1869, + "step": 8654 + }, + { + "epoch": 0.46432403433476394, + "grad_norm": 1.5729776620864868, + "learning_rate": 2.9129522451683908e-06, + "loss": 2.3399, + "step": 8655 + }, + { + "epoch": 0.46437768240343347, + "grad_norm": 1.3003201484680176, + "learning_rate": 2.912523804548138e-06, + "loss": 2.1891, + "step": 8656 + }, + { + "epoch": 0.464431330472103, + "grad_norm": 1.9030699729919434, + "learning_rate": 2.9120953514726514e-06, + "loss": 1.9747, + "step": 8657 + }, + { + "epoch": 0.46448497854077253, + "grad_norm": 1.6089155673980713, + "learning_rate": 2.9116668859548654e-06, + "loss": 2.1264, + "step": 8658 + }, + { + "epoch": 0.46453862660944206, + "grad_norm": 1.6051081418991089, + "learning_rate": 2.9112384080077188e-06, + "loss": 2.3704, + "step": 8659 + }, + { + "epoch": 0.4645922746781116, + "grad_norm": 1.540601372718811, + "learning_rate": 2.9108099176441474e-06, + "loss": 2.368, + "step": 8660 + }, + { + "epoch": 0.4646459227467811, + "grad_norm": 1.5638774633407593, + "learning_rate": 2.910381414877088e-06, + "loss": 2.0505, + "step": 8661 + }, + { + "epoch": 0.46469957081545066, + "grad_norm": 1.3605488538742065, + "learning_rate": 2.909952899719479e-06, + "loss": 2.2401, + "step": 8662 + }, + { + "epoch": 0.4647532188841202, + "grad_norm": 1.3443291187286377, + "learning_rate": 2.909524372184258e-06, + "loss": 2.3059, + "step": 8663 + }, + { + "epoch": 0.4648068669527897, + "grad_norm": 1.6558818817138672, + "learning_rate": 2.909095832284364e-06, + "loss": 2.4014, + "step": 8664 + }, + { + "epoch": 0.46486051502145925, + "grad_norm": 1.3967549800872803, + "learning_rate": 2.908667280032736e-06, + "loss": 2.3106, + "step": 8665 + }, + { + "epoch": 0.46491416309012873, + "grad_norm": 1.4061247110366821, + "learning_rate": 2.9082387154423125e-06, + "loss": 2.2405, + "step": 8666 + }, + { + "epoch": 0.46496781115879826, + "grad_norm": 1.536965012550354, + "learning_rate": 2.9078101385260325e-06, + "loss": 2.2668, + "step": 8667 + }, + { + "epoch": 0.4650214592274678, + "grad_norm": 1.2946280241012573, + "learning_rate": 2.907381549296838e-06, + "loss": 2.1449, + "step": 8668 + }, + { + "epoch": 0.4650751072961373, + "grad_norm": 1.3246421813964844, + "learning_rate": 2.9069529477676668e-06, + "loss": 2.0355, + "step": 8669 + }, + { + "epoch": 0.46512875536480686, + "grad_norm": 1.7573357820510864, + "learning_rate": 2.9065243339514608e-06, + "loss": 2.1433, + "step": 8670 + }, + { + "epoch": 0.4651824034334764, + "grad_norm": 1.416779637336731, + "learning_rate": 2.9060957078611612e-06, + "loss": 2.2811, + "step": 8671 + }, + { + "epoch": 0.4652360515021459, + "grad_norm": 1.5243133306503296, + "learning_rate": 2.9056670695097095e-06, + "loss": 2.4216, + "step": 8672 + }, + { + "epoch": 0.46528969957081545, + "grad_norm": 1.7805235385894775, + "learning_rate": 2.9052384189100475e-06, + "loss": 2.4551, + "step": 8673 + }, + { + "epoch": 0.465343347639485, + "grad_norm": 1.3638023138046265, + "learning_rate": 2.904809756075116e-06, + "loss": 2.1842, + "step": 8674 + }, + { + "epoch": 0.4653969957081545, + "grad_norm": 1.462346076965332, + "learning_rate": 2.9043810810178584e-06, + "loss": 2.3362, + "step": 8675 + }, + { + "epoch": 0.46545064377682405, + "grad_norm": 2.0286059379577637, + "learning_rate": 2.9039523937512188e-06, + "loss": 2.1448, + "step": 8676 + }, + { + "epoch": 0.4655042918454936, + "grad_norm": 1.6750671863555908, + "learning_rate": 2.903523694288139e-06, + "loss": 2.3505, + "step": 8677 + }, + { + "epoch": 0.4655579399141631, + "grad_norm": 1.516391396522522, + "learning_rate": 2.9030949826415627e-06, + "loss": 2.2469, + "step": 8678 + }, + { + "epoch": 0.46561158798283264, + "grad_norm": 1.520341396331787, + "learning_rate": 2.902666258824434e-06, + "loss": 2.2545, + "step": 8679 + }, + { + "epoch": 0.4656652360515021, + "grad_norm": 1.3803044557571411, + "learning_rate": 2.9022375228496976e-06, + "loss": 2.2257, + "step": 8680 + }, + { + "epoch": 0.46571888412017165, + "grad_norm": 1.2606937885284424, + "learning_rate": 2.901808774730299e-06, + "loss": 2.0515, + "step": 8681 + }, + { + "epoch": 0.4657725321888412, + "grad_norm": 1.1595121622085571, + "learning_rate": 2.901380014479181e-06, + "loss": 1.9049, + "step": 8682 + }, + { + "epoch": 0.4658261802575107, + "grad_norm": 1.517127513885498, + "learning_rate": 2.9009512421092908e-06, + "loss": 2.1073, + "step": 8683 + }, + { + "epoch": 0.46587982832618025, + "grad_norm": 1.1169227361679077, + "learning_rate": 2.900522457633574e-06, + "loss": 1.7129, + "step": 8684 + }, + { + "epoch": 0.4659334763948498, + "grad_norm": 1.3964556455612183, + "learning_rate": 2.9000936610649765e-06, + "loss": 2.1957, + "step": 8685 + }, + { + "epoch": 0.4659871244635193, + "grad_norm": 1.5305267572402954, + "learning_rate": 2.899664852416445e-06, + "loss": 2.2726, + "step": 8686 + }, + { + "epoch": 0.46604077253218884, + "grad_norm": 1.6547919511795044, + "learning_rate": 2.899236031700926e-06, + "loss": 1.212, + "step": 8687 + }, + { + "epoch": 0.4660944206008584, + "grad_norm": 1.6392898559570312, + "learning_rate": 2.8988071989313675e-06, + "loss": 2.2753, + "step": 8688 + }, + { + "epoch": 0.4661480686695279, + "grad_norm": 2.154069662094116, + "learning_rate": 2.898378354120717e-06, + "loss": 2.1064, + "step": 8689 + }, + { + "epoch": 0.46620171673819744, + "grad_norm": 1.7749816179275513, + "learning_rate": 2.8979494972819227e-06, + "loss": 2.4604, + "step": 8690 + }, + { + "epoch": 0.46625536480686697, + "grad_norm": 1.6755925416946411, + "learning_rate": 2.8975206284279317e-06, + "loss": 2.3814, + "step": 8691 + }, + { + "epoch": 0.4663090128755365, + "grad_norm": 1.415834665298462, + "learning_rate": 2.8970917475716934e-06, + "loss": 2.3479, + "step": 8692 + }, + { + "epoch": 0.46636266094420603, + "grad_norm": 1.5434625148773193, + "learning_rate": 2.8966628547261583e-06, + "loss": 2.1391, + "step": 8693 + }, + { + "epoch": 0.46641630901287556, + "grad_norm": 2.2316274642944336, + "learning_rate": 2.8962339499042743e-06, + "loss": 2.3123, + "step": 8694 + }, + { + "epoch": 0.46646995708154504, + "grad_norm": 1.645414113998413, + "learning_rate": 2.895805033118992e-06, + "loss": 2.1213, + "step": 8695 + }, + { + "epoch": 0.46652360515021457, + "grad_norm": 1.5352023839950562, + "learning_rate": 2.89537610438326e-06, + "loss": 2.121, + "step": 8696 + }, + { + "epoch": 0.4665772532188841, + "grad_norm": 1.1556278467178345, + "learning_rate": 2.8949471637100306e-06, + "loss": 1.6984, + "step": 8697 + }, + { + "epoch": 0.46663090128755363, + "grad_norm": 1.551558017730713, + "learning_rate": 2.894518211112254e-06, + "loss": 2.356, + "step": 8698 + }, + { + "epoch": 0.46668454935622317, + "grad_norm": 1.4236552715301514, + "learning_rate": 2.8940892466028826e-06, + "loss": 2.3746, + "step": 8699 + }, + { + "epoch": 0.4667381974248927, + "grad_norm": 1.5360392332077026, + "learning_rate": 2.8936602701948663e-06, + "loss": 2.194, + "step": 8700 + }, + { + "epoch": 0.46679184549356223, + "grad_norm": 1.6751341819763184, + "learning_rate": 2.8932312819011575e-06, + "loss": 2.3297, + "step": 8701 + }, + { + "epoch": 0.46684549356223176, + "grad_norm": 1.5664892196655273, + "learning_rate": 2.8928022817347094e-06, + "loss": 2.4849, + "step": 8702 + }, + { + "epoch": 0.4668991416309013, + "grad_norm": 1.1969434022903442, + "learning_rate": 2.892373269708474e-06, + "loss": 2.1359, + "step": 8703 + }, + { + "epoch": 0.4669527896995708, + "grad_norm": 1.4725085496902466, + "learning_rate": 2.8919442458354045e-06, + "loss": 2.1877, + "step": 8704 + }, + { + "epoch": 0.46700643776824036, + "grad_norm": 1.634915828704834, + "learning_rate": 2.8915152101284543e-06, + "loss": 2.087, + "step": 8705 + }, + { + "epoch": 0.4670600858369099, + "grad_norm": 1.660628080368042, + "learning_rate": 2.8910861626005774e-06, + "loss": 2.2858, + "step": 8706 + }, + { + "epoch": 0.4671137339055794, + "grad_norm": 1.5098588466644287, + "learning_rate": 2.890657103264728e-06, + "loss": 2.2454, + "step": 8707 + }, + { + "epoch": 0.46716738197424895, + "grad_norm": 1.5933469533920288, + "learning_rate": 2.89022803213386e-06, + "loss": 2.2685, + "step": 8708 + }, + { + "epoch": 0.46722103004291843, + "grad_norm": 1.6651493310928345, + "learning_rate": 2.8897989492209284e-06, + "loss": 2.3441, + "step": 8709 + }, + { + "epoch": 0.46727467811158796, + "grad_norm": 1.4059581756591797, + "learning_rate": 2.8893698545388887e-06, + "loss": 2.3582, + "step": 8710 + }, + { + "epoch": 0.4673283261802575, + "grad_norm": 1.2797795534133911, + "learning_rate": 2.8889407481006965e-06, + "loss": 2.0689, + "step": 8711 + }, + { + "epoch": 0.467381974248927, + "grad_norm": 1.2863273620605469, + "learning_rate": 2.888511629919308e-06, + "loss": 2.2923, + "step": 8712 + }, + { + "epoch": 0.46743562231759656, + "grad_norm": 1.692063570022583, + "learning_rate": 2.8880825000076783e-06, + "loss": 1.8891, + "step": 8713 + }, + { + "epoch": 0.4674892703862661, + "grad_norm": 1.4936134815216064, + "learning_rate": 2.8876533583787647e-06, + "loss": 2.1646, + "step": 8714 + }, + { + "epoch": 0.4675429184549356, + "grad_norm": 1.4489961862564087, + "learning_rate": 2.887224205045524e-06, + "loss": 2.1789, + "step": 8715 + }, + { + "epoch": 0.46759656652360515, + "grad_norm": 1.4647313356399536, + "learning_rate": 2.8867950400209144e-06, + "loss": 2.2339, + "step": 8716 + }, + { + "epoch": 0.4676502145922747, + "grad_norm": 1.127768635749817, + "learning_rate": 2.886365863317893e-06, + "loss": 1.9685, + "step": 8717 + }, + { + "epoch": 0.4677038626609442, + "grad_norm": 1.3859171867370605, + "learning_rate": 2.8859366749494177e-06, + "loss": 2.1387, + "step": 8718 + }, + { + "epoch": 0.46775751072961375, + "grad_norm": 1.4734423160552979, + "learning_rate": 2.8855074749284467e-06, + "loss": 2.1398, + "step": 8719 + }, + { + "epoch": 0.4678111587982833, + "grad_norm": 1.656827449798584, + "learning_rate": 2.885078263267938e-06, + "loss": 2.2954, + "step": 8720 + }, + { + "epoch": 0.4678648068669528, + "grad_norm": 1.44058358669281, + "learning_rate": 2.8846490399808536e-06, + "loss": 2.1781, + "step": 8721 + }, + { + "epoch": 0.46791845493562234, + "grad_norm": 1.692589521408081, + "learning_rate": 2.8842198050801497e-06, + "loss": 2.2205, + "step": 8722 + }, + { + "epoch": 0.4679721030042919, + "grad_norm": 2.3053834438323975, + "learning_rate": 2.8837905585787874e-06, + "loss": 2.2834, + "step": 8723 + }, + { + "epoch": 0.46802575107296135, + "grad_norm": 1.628146767616272, + "learning_rate": 2.883361300489727e-06, + "loss": 2.3003, + "step": 8724 + }, + { + "epoch": 0.4680793991416309, + "grad_norm": 1.5001779794692993, + "learning_rate": 2.882932030825929e-06, + "loss": 2.2552, + "step": 8725 + }, + { + "epoch": 0.4681330472103004, + "grad_norm": 1.6262531280517578, + "learning_rate": 2.8825027496003537e-06, + "loss": 2.2523, + "step": 8726 + }, + { + "epoch": 0.46818669527896994, + "grad_norm": 1.4689451456069946, + "learning_rate": 2.8820734568259628e-06, + "loss": 2.3113, + "step": 8727 + }, + { + "epoch": 0.4682403433476395, + "grad_norm": 1.3572964668273926, + "learning_rate": 2.881644152515718e-06, + "loss": 2.0873, + "step": 8728 + }, + { + "epoch": 0.468293991416309, + "grad_norm": 1.5128618478775024, + "learning_rate": 2.8812148366825803e-06, + "loss": 1.5788, + "step": 8729 + }, + { + "epoch": 0.46834763948497854, + "grad_norm": 1.389311671257019, + "learning_rate": 2.8807855093395127e-06, + "loss": 2.281, + "step": 8730 + }, + { + "epoch": 0.46840128755364807, + "grad_norm": 1.5825908184051514, + "learning_rate": 2.8803561704994776e-06, + "loss": 1.6758, + "step": 8731 + }, + { + "epoch": 0.4684549356223176, + "grad_norm": 1.2260459661483765, + "learning_rate": 2.8799268201754384e-06, + "loss": 2.2887, + "step": 8732 + }, + { + "epoch": 0.46850858369098713, + "grad_norm": 1.5919737815856934, + "learning_rate": 2.879497458380357e-06, + "loss": 2.468, + "step": 8733 + }, + { + "epoch": 0.46856223175965667, + "grad_norm": 1.8838616609573364, + "learning_rate": 2.8790680851271984e-06, + "loss": 2.3252, + "step": 8734 + }, + { + "epoch": 0.4686158798283262, + "grad_norm": 1.3054375648498535, + "learning_rate": 2.878638700428926e-06, + "loss": 2.0288, + "step": 8735 + }, + { + "epoch": 0.46866952789699573, + "grad_norm": 1.4662950038909912, + "learning_rate": 2.878209304298504e-06, + "loss": 2.2287, + "step": 8736 + }, + { + "epoch": 0.46872317596566526, + "grad_norm": 1.4993001222610474, + "learning_rate": 2.8777798967488967e-06, + "loss": 2.4388, + "step": 8737 + }, + { + "epoch": 0.46877682403433474, + "grad_norm": 1.526002287864685, + "learning_rate": 2.8773504777930706e-06, + "loss": 2.382, + "step": 8738 + }, + { + "epoch": 0.46883047210300427, + "grad_norm": 1.4430526494979858, + "learning_rate": 2.8769210474439894e-06, + "loss": 2.1872, + "step": 8739 + }, + { + "epoch": 0.4688841201716738, + "grad_norm": 1.9911226034164429, + "learning_rate": 2.8764916057146204e-06, + "loss": 2.2212, + "step": 8740 + }, + { + "epoch": 0.46893776824034333, + "grad_norm": 2.3215792179107666, + "learning_rate": 2.8760621526179277e-06, + "loss": 2.1391, + "step": 8741 + }, + { + "epoch": 0.46899141630901287, + "grad_norm": 1.6616603136062622, + "learning_rate": 2.8756326881668782e-06, + "loss": 2.4166, + "step": 8742 + }, + { + "epoch": 0.4690450643776824, + "grad_norm": 1.2166460752487183, + "learning_rate": 2.8752032123744405e-06, + "loss": 2.2508, + "step": 8743 + }, + { + "epoch": 0.46909871244635193, + "grad_norm": 1.222510576248169, + "learning_rate": 2.8747737252535795e-06, + "loss": 2.1736, + "step": 8744 + }, + { + "epoch": 0.46915236051502146, + "grad_norm": 1.7752948999404907, + "learning_rate": 2.8743442268172634e-06, + "loss": 2.6517, + "step": 8745 + }, + { + "epoch": 0.469206008583691, + "grad_norm": 2.3726463317871094, + "learning_rate": 2.87391471707846e-06, + "loss": 2.1872, + "step": 8746 + }, + { + "epoch": 0.4692596566523605, + "grad_norm": 1.544026494026184, + "learning_rate": 2.873485196050137e-06, + "loss": 2.0554, + "step": 8747 + }, + { + "epoch": 0.46931330472103006, + "grad_norm": 3.7983357906341553, + "learning_rate": 2.8730556637452635e-06, + "loss": 2.2431, + "step": 8748 + }, + { + "epoch": 0.4693669527896996, + "grad_norm": 1.818713665008545, + "learning_rate": 2.8726261201768072e-06, + "loss": 2.1298, + "step": 8749 + }, + { + "epoch": 0.4694206008583691, + "grad_norm": 1.3660212755203247, + "learning_rate": 2.872196565357739e-06, + "loss": 2.1774, + "step": 8750 + }, + { + "epoch": 0.46947424892703865, + "grad_norm": 1.5622730255126953, + "learning_rate": 2.871766999301026e-06, + "loss": 2.351, + "step": 8751 + }, + { + "epoch": 0.4695278969957081, + "grad_norm": 1.5049030780792236, + "learning_rate": 2.8713374220196405e-06, + "loss": 1.5987, + "step": 8752 + }, + { + "epoch": 0.46958154506437766, + "grad_norm": 1.532483458518982, + "learning_rate": 2.87090783352655e-06, + "loss": 2.1389, + "step": 8753 + }, + { + "epoch": 0.4696351931330472, + "grad_norm": 1.3773518800735474, + "learning_rate": 2.870478233834727e-06, + "loss": 2.1874, + "step": 8754 + }, + { + "epoch": 0.4696888412017167, + "grad_norm": 1.5514678955078125, + "learning_rate": 2.8700486229571418e-06, + "loss": 2.3467, + "step": 8755 + }, + { + "epoch": 0.46974248927038625, + "grad_norm": 1.7093992233276367, + "learning_rate": 2.869619000906765e-06, + "loss": 2.2698, + "step": 8756 + }, + { + "epoch": 0.4697961373390558, + "grad_norm": 1.608385682106018, + "learning_rate": 2.8691893676965686e-06, + "loss": 2.4887, + "step": 8757 + }, + { + "epoch": 0.4698497854077253, + "grad_norm": 2.152785539627075, + "learning_rate": 2.8687597233395248e-06, + "loss": 2.2742, + "step": 8758 + }, + { + "epoch": 0.46990343347639485, + "grad_norm": 1.5938125848770142, + "learning_rate": 2.868330067848604e-06, + "loss": 2.2145, + "step": 8759 + }, + { + "epoch": 0.4699570815450644, + "grad_norm": 1.486371397972107, + "learning_rate": 2.86790040123678e-06, + "loss": 2.4331, + "step": 8760 + }, + { + "epoch": 0.4700107296137339, + "grad_norm": 1.4976825714111328, + "learning_rate": 2.8674707235170267e-06, + "loss": 2.189, + "step": 8761 + }, + { + "epoch": 0.47006437768240344, + "grad_norm": 1.5330055952072144, + "learning_rate": 2.8670410347023155e-06, + "loss": 1.6894, + "step": 8762 + }, + { + "epoch": 0.470118025751073, + "grad_norm": 1.5683008432388306, + "learning_rate": 2.8666113348056203e-06, + "loss": 2.3833, + "step": 8763 + }, + { + "epoch": 0.4701716738197425, + "grad_norm": 2.0393154621124268, + "learning_rate": 2.866181623839915e-06, + "loss": 1.8356, + "step": 8764 + }, + { + "epoch": 0.47022532188841204, + "grad_norm": 1.4300284385681152, + "learning_rate": 2.865751901818174e-06, + "loss": 2.5069, + "step": 8765 + }, + { + "epoch": 0.47027896995708157, + "grad_norm": 1.562686800956726, + "learning_rate": 2.8653221687533717e-06, + "loss": 2.5863, + "step": 8766 + }, + { + "epoch": 0.47033261802575105, + "grad_norm": 1.5223720073699951, + "learning_rate": 2.8648924246584825e-06, + "loss": 2.3767, + "step": 8767 + }, + { + "epoch": 0.4703862660944206, + "grad_norm": 1.5432535409927368, + "learning_rate": 2.8644626695464822e-06, + "loss": 2.1819, + "step": 8768 + }, + { + "epoch": 0.4704399141630901, + "grad_norm": 1.576320767402649, + "learning_rate": 2.864032903430346e-06, + "loss": 2.2208, + "step": 8769 + }, + { + "epoch": 0.47049356223175964, + "grad_norm": 2.0509603023529053, + "learning_rate": 2.86360312632305e-06, + "loss": 2.4089, + "step": 8770 + }, + { + "epoch": 0.4705472103004292, + "grad_norm": 1.4318910837173462, + "learning_rate": 2.86317333823757e-06, + "loss": 1.6282, + "step": 8771 + }, + { + "epoch": 0.4706008583690987, + "grad_norm": 1.5750792026519775, + "learning_rate": 2.8627435391868824e-06, + "loss": 2.6642, + "step": 8772 + }, + { + "epoch": 0.47065450643776824, + "grad_norm": 1.5801860094070435, + "learning_rate": 2.8623137291839644e-06, + "loss": 2.3348, + "step": 8773 + }, + { + "epoch": 0.47070815450643777, + "grad_norm": 1.3092886209487915, + "learning_rate": 2.8618839082417933e-06, + "loss": 2.3444, + "step": 8774 + }, + { + "epoch": 0.4707618025751073, + "grad_norm": 1.4167194366455078, + "learning_rate": 2.8614540763733458e-06, + "loss": 2.155, + "step": 8775 + }, + { + "epoch": 0.47081545064377683, + "grad_norm": 1.647464394569397, + "learning_rate": 2.8610242335916e-06, + "loss": 2.3044, + "step": 8776 + }, + { + "epoch": 0.47086909871244637, + "grad_norm": 1.601934552192688, + "learning_rate": 2.860594379909534e-06, + "loss": 2.558, + "step": 8777 + }, + { + "epoch": 0.4709227467811159, + "grad_norm": 2.133376121520996, + "learning_rate": 2.860164515340128e-06, + "loss": 2.3468, + "step": 8778 + }, + { + "epoch": 0.47097639484978543, + "grad_norm": 1.3909473419189453, + "learning_rate": 2.859734639896358e-06, + "loss": 2.2438, + "step": 8779 + }, + { + "epoch": 0.47103004291845496, + "grad_norm": 1.5002249479293823, + "learning_rate": 2.859304753591205e-06, + "loss": 2.4177, + "step": 8780 + }, + { + "epoch": 0.47108369098712444, + "grad_norm": 1.458820104598999, + "learning_rate": 2.8588748564376476e-06, + "loss": 2.3114, + "step": 8781 + }, + { + "epoch": 0.47113733905579397, + "grad_norm": 1.4542187452316284, + "learning_rate": 2.8584449484486653e-06, + "loss": 2.0717, + "step": 8782 + }, + { + "epoch": 0.4711909871244635, + "grad_norm": 1.7220187187194824, + "learning_rate": 2.85801502963724e-06, + "loss": 2.4181, + "step": 8783 + }, + { + "epoch": 0.47124463519313303, + "grad_norm": 1.6844394207000732, + "learning_rate": 2.85758510001635e-06, + "loss": 2.3089, + "step": 8784 + }, + { + "epoch": 0.47129828326180256, + "grad_norm": 1.2064539194107056, + "learning_rate": 2.8571551595989777e-06, + "loss": 2.1646, + "step": 8785 + }, + { + "epoch": 0.4713519313304721, + "grad_norm": 1.494829535484314, + "learning_rate": 2.856725208398103e-06, + "loss": 2.3079, + "step": 8786 + }, + { + "epoch": 0.4714055793991416, + "grad_norm": 1.7061448097229004, + "learning_rate": 2.8562952464267072e-06, + "loss": 2.3701, + "step": 8787 + }, + { + "epoch": 0.47145922746781116, + "grad_norm": 1.464516282081604, + "learning_rate": 2.8558652736977736e-06, + "loss": 2.4156, + "step": 8788 + }, + { + "epoch": 0.4715128755364807, + "grad_norm": 1.6697046756744385, + "learning_rate": 2.8554352902242834e-06, + "loss": 2.1213, + "step": 8789 + }, + { + "epoch": 0.4715665236051502, + "grad_norm": 1.671735405921936, + "learning_rate": 2.8550052960192183e-06, + "loss": 2.1573, + "step": 8790 + }, + { + "epoch": 0.47162017167381975, + "grad_norm": 1.565163493156433, + "learning_rate": 2.8545752910955614e-06, + "loss": 2.2746, + "step": 8791 + }, + { + "epoch": 0.4716738197424893, + "grad_norm": 1.35075843334198, + "learning_rate": 2.8541452754662962e-06, + "loss": 2.1723, + "step": 8792 + }, + { + "epoch": 0.4717274678111588, + "grad_norm": 1.8176615238189697, + "learning_rate": 2.853715249144406e-06, + "loss": 2.261, + "step": 8793 + }, + { + "epoch": 0.47178111587982835, + "grad_norm": 1.4380602836608887, + "learning_rate": 2.8532852121428737e-06, + "loss": 2.0495, + "step": 8794 + }, + { + "epoch": 0.4718347639484979, + "grad_norm": 1.5254578590393066, + "learning_rate": 2.8528551644746843e-06, + "loss": 2.0143, + "step": 8795 + }, + { + "epoch": 0.47188841201716736, + "grad_norm": 1.4941892623901367, + "learning_rate": 2.8524251061528223e-06, + "loss": 2.2213, + "step": 8796 + }, + { + "epoch": 0.4719420600858369, + "grad_norm": 1.7161790132522583, + "learning_rate": 2.851995037190271e-06, + "loss": 2.2483, + "step": 8797 + }, + { + "epoch": 0.4719957081545064, + "grad_norm": 1.5198132991790771, + "learning_rate": 2.851564957600016e-06, + "loss": 2.3127, + "step": 8798 + }, + { + "epoch": 0.47204935622317595, + "grad_norm": 1.6025968790054321, + "learning_rate": 2.8511348673950427e-06, + "loss": 2.3982, + "step": 8799 + }, + { + "epoch": 0.4721030042918455, + "grad_norm": 1.6611047983169556, + "learning_rate": 2.850704766588337e-06, + "loss": 2.113, + "step": 8800 + }, + { + "epoch": 0.472156652360515, + "grad_norm": 1.607743740081787, + "learning_rate": 2.8502746551928856e-06, + "loss": 2.3078, + "step": 8801 + }, + { + "epoch": 0.47221030042918455, + "grad_norm": 1.559463381767273, + "learning_rate": 2.8498445332216722e-06, + "loss": 2.4356, + "step": 8802 + }, + { + "epoch": 0.4722639484978541, + "grad_norm": 1.5264345407485962, + "learning_rate": 2.8494144006876856e-06, + "loss": 2.3213, + "step": 8803 + }, + { + "epoch": 0.4723175965665236, + "grad_norm": 1.5662035942077637, + "learning_rate": 2.8489842576039113e-06, + "loss": 2.4327, + "step": 8804 + }, + { + "epoch": 0.47237124463519314, + "grad_norm": 1.472456455230713, + "learning_rate": 2.8485541039833385e-06, + "loss": 2.3605, + "step": 8805 + }, + { + "epoch": 0.4724248927038627, + "grad_norm": 1.22515869140625, + "learning_rate": 2.8481239398389527e-06, + "loss": 2.0377, + "step": 8806 + }, + { + "epoch": 0.4724785407725322, + "grad_norm": 1.3950101137161255, + "learning_rate": 2.847693765183742e-06, + "loss": 2.2572, + "step": 8807 + }, + { + "epoch": 0.47253218884120174, + "grad_norm": 1.3040159940719604, + "learning_rate": 2.847263580030696e-06, + "loss": 2.1446, + "step": 8808 + }, + { + "epoch": 0.47258583690987127, + "grad_norm": 2.098146438598633, + "learning_rate": 2.846833384392801e-06, + "loss": 2.3409, + "step": 8809 + }, + { + "epoch": 0.47263948497854075, + "grad_norm": 1.4579347372055054, + "learning_rate": 2.8464031782830478e-06, + "loss": 2.4382, + "step": 8810 + }, + { + "epoch": 0.4726931330472103, + "grad_norm": 2.1598060131073, + "learning_rate": 2.8459729617144244e-06, + "loss": 2.2078, + "step": 8811 + }, + { + "epoch": 0.4727467811158798, + "grad_norm": 2.032485008239746, + "learning_rate": 2.845542734699921e-06, + "loss": 2.2029, + "step": 8812 + }, + { + "epoch": 0.47280042918454934, + "grad_norm": 1.6484804153442383, + "learning_rate": 2.8451124972525264e-06, + "loss": 2.2021, + "step": 8813 + }, + { + "epoch": 0.4728540772532189, + "grad_norm": 1.6102609634399414, + "learning_rate": 2.844682249385232e-06, + "loss": 2.2136, + "step": 8814 + }, + { + "epoch": 0.4729077253218884, + "grad_norm": 1.5924869775772095, + "learning_rate": 2.844251991111026e-06, + "loss": 2.1781, + "step": 8815 + }, + { + "epoch": 0.47296137339055794, + "grad_norm": 1.608641266822815, + "learning_rate": 2.843821722442901e-06, + "loss": 2.2758, + "step": 8816 + }, + { + "epoch": 0.47301502145922747, + "grad_norm": 1.6108611822128296, + "learning_rate": 2.8433914433938476e-06, + "loss": 2.2648, + "step": 8817 + }, + { + "epoch": 0.473068669527897, + "grad_norm": 1.3044204711914062, + "learning_rate": 2.842961153976857e-06, + "loss": 2.1406, + "step": 8818 + }, + { + "epoch": 0.47312231759656653, + "grad_norm": 1.4838858842849731, + "learning_rate": 2.8425308542049208e-06, + "loss": 2.2268, + "step": 8819 + }, + { + "epoch": 0.47317596566523606, + "grad_norm": 1.7163608074188232, + "learning_rate": 2.8421005440910303e-06, + "loss": 2.1793, + "step": 8820 + }, + { + "epoch": 0.4732296137339056, + "grad_norm": 1.5169044733047485, + "learning_rate": 2.8416702236481788e-06, + "loss": 2.0739, + "step": 8821 + }, + { + "epoch": 0.47328326180257513, + "grad_norm": 1.6210829019546509, + "learning_rate": 2.841239892889358e-06, + "loss": 2.4448, + "step": 8822 + }, + { + "epoch": 0.47333690987124466, + "grad_norm": 1.5922006368637085, + "learning_rate": 2.8408095518275618e-06, + "loss": 2.2195, + "step": 8823 + }, + { + "epoch": 0.47339055793991414, + "grad_norm": 1.3026093244552612, + "learning_rate": 2.8403792004757825e-06, + "loss": 2.2338, + "step": 8824 + }, + { + "epoch": 0.47344420600858367, + "grad_norm": 1.286510944366455, + "learning_rate": 2.8399488388470135e-06, + "loss": 2.087, + "step": 8825 + }, + { + "epoch": 0.4734978540772532, + "grad_norm": 1.403841495513916, + "learning_rate": 2.83951846695425e-06, + "loss": 2.181, + "step": 8826 + }, + { + "epoch": 0.47355150214592273, + "grad_norm": 1.445796012878418, + "learning_rate": 2.839088084810484e-06, + "loss": 2.3694, + "step": 8827 + }, + { + "epoch": 0.47360515021459226, + "grad_norm": 1.5195858478546143, + "learning_rate": 2.838657692428711e-06, + "loss": 2.4497, + "step": 8828 + }, + { + "epoch": 0.4736587982832618, + "grad_norm": 1.400526523590088, + "learning_rate": 2.8382272898219265e-06, + "loss": 2.4275, + "step": 8829 + }, + { + "epoch": 0.4737124463519313, + "grad_norm": 33.00614547729492, + "learning_rate": 2.837796877003124e-06, + "loss": 2.1807, + "step": 8830 + }, + { + "epoch": 0.47376609442060086, + "grad_norm": 1.5564216375350952, + "learning_rate": 2.8373664539853004e-06, + "loss": 1.8329, + "step": 8831 + }, + { + "epoch": 0.4738197424892704, + "grad_norm": 1.605162501335144, + "learning_rate": 2.83693602078145e-06, + "loss": 2.6715, + "step": 8832 + }, + { + "epoch": 0.4738733905579399, + "grad_norm": 1.3806949853897095, + "learning_rate": 2.8365055774045697e-06, + "loss": 2.1328, + "step": 8833 + }, + { + "epoch": 0.47392703862660945, + "grad_norm": 1.6662801504135132, + "learning_rate": 2.8360751238676556e-06, + "loss": 2.3286, + "step": 8834 + }, + { + "epoch": 0.473980686695279, + "grad_norm": 2.146822690963745, + "learning_rate": 2.835644660183704e-06, + "loss": 2.2439, + "step": 8835 + }, + { + "epoch": 0.4740343347639485, + "grad_norm": 2.253026008605957, + "learning_rate": 2.8352141863657124e-06, + "loss": 2.1889, + "step": 8836 + }, + { + "epoch": 0.47408798283261805, + "grad_norm": 1.3843916654586792, + "learning_rate": 2.8347837024266762e-06, + "loss": 2.2292, + "step": 8837 + }, + { + "epoch": 0.4741416309012876, + "grad_norm": 1.6961302757263184, + "learning_rate": 2.834353208379595e-06, + "loss": 1.9881, + "step": 8838 + }, + { + "epoch": 0.47419527896995706, + "grad_norm": 1.6997240781784058, + "learning_rate": 2.833922704237466e-06, + "loss": 2.3941, + "step": 8839 + }, + { + "epoch": 0.4742489270386266, + "grad_norm": 1.6909266710281372, + "learning_rate": 2.8334921900132873e-06, + "loss": 2.519, + "step": 8840 + }, + { + "epoch": 0.4743025751072961, + "grad_norm": 1.4950566291809082, + "learning_rate": 2.833061665720057e-06, + "loss": 2.334, + "step": 8841 + }, + { + "epoch": 0.47435622317596565, + "grad_norm": 1.4293179512023926, + "learning_rate": 2.832631131370774e-06, + "loss": 2.1653, + "step": 8842 + }, + { + "epoch": 0.4744098712446352, + "grad_norm": 1.2603390216827393, + "learning_rate": 2.8322005869784365e-06, + "loss": 1.8489, + "step": 8843 + }, + { + "epoch": 0.4744635193133047, + "grad_norm": 1.5083627700805664, + "learning_rate": 2.831770032556045e-06, + "loss": 2.3249, + "step": 8844 + }, + { + "epoch": 0.47451716738197425, + "grad_norm": 1.635743498802185, + "learning_rate": 2.8313394681165995e-06, + "loss": 1.5205, + "step": 8845 + }, + { + "epoch": 0.4745708154506438, + "grad_norm": 1.2488460540771484, + "learning_rate": 2.830908893673099e-06, + "loss": 2.0163, + "step": 8846 + }, + { + "epoch": 0.4746244635193133, + "grad_norm": 1.2960431575775146, + "learning_rate": 2.8304783092385434e-06, + "loss": 2.3248, + "step": 8847 + }, + { + "epoch": 0.47467811158798284, + "grad_norm": 1.870769739151001, + "learning_rate": 2.8300477148259343e-06, + "loss": 2.291, + "step": 8848 + }, + { + "epoch": 0.4747317596566524, + "grad_norm": 1.5541409254074097, + "learning_rate": 2.829617110448271e-06, + "loss": 2.3267, + "step": 8849 + }, + { + "epoch": 0.4747854077253219, + "grad_norm": 1.4272797107696533, + "learning_rate": 2.8291864961185568e-06, + "loss": 1.9472, + "step": 8850 + }, + { + "epoch": 0.47483905579399144, + "grad_norm": 2.2618398666381836, + "learning_rate": 2.828755871849791e-06, + "loss": 2.3179, + "step": 8851 + }, + { + "epoch": 0.47489270386266097, + "grad_norm": 1.2882750034332275, + "learning_rate": 2.8283252376549768e-06, + "loss": 1.9578, + "step": 8852 + }, + { + "epoch": 0.47494635193133045, + "grad_norm": 1.452199935913086, + "learning_rate": 2.827894593547116e-06, + "loss": 2.266, + "step": 8853 + }, + { + "epoch": 0.475, + "grad_norm": 1.9915714263916016, + "learning_rate": 2.82746393953921e-06, + "loss": 1.7196, + "step": 8854 + }, + { + "epoch": 0.4750536480686695, + "grad_norm": 1.2976012229919434, + "learning_rate": 2.8270332756442625e-06, + "loss": 2.3119, + "step": 8855 + }, + { + "epoch": 0.47510729613733904, + "grad_norm": 1.3560115098953247, + "learning_rate": 2.826602601875276e-06, + "loss": 2.25, + "step": 8856 + }, + { + "epoch": 0.4751609442060086, + "grad_norm": 1.514209270477295, + "learning_rate": 2.826171918245254e-06, + "loss": 2.4648, + "step": 8857 + }, + { + "epoch": 0.4752145922746781, + "grad_norm": 1.2063382863998413, + "learning_rate": 2.8257412247672e-06, + "loss": 1.7753, + "step": 8858 + }, + { + "epoch": 0.47526824034334764, + "grad_norm": 1.6160645484924316, + "learning_rate": 2.825310521454117e-06, + "loss": 2.3329, + "step": 8859 + }, + { + "epoch": 0.47532188841201717, + "grad_norm": 1.3871656656265259, + "learning_rate": 2.8248798083190094e-06, + "loss": 2.1917, + "step": 8860 + }, + { + "epoch": 0.4753755364806867, + "grad_norm": 1.5894215106964111, + "learning_rate": 2.8244490853748825e-06, + "loss": 1.6683, + "step": 8861 + }, + { + "epoch": 0.47542918454935623, + "grad_norm": 1.6411014795303345, + "learning_rate": 2.8240183526347407e-06, + "loss": 2.1639, + "step": 8862 + }, + { + "epoch": 0.47548283261802576, + "grad_norm": 1.426587700843811, + "learning_rate": 2.823587610111589e-06, + "loss": 2.1797, + "step": 8863 + }, + { + "epoch": 0.4755364806866953, + "grad_norm": 1.52129065990448, + "learning_rate": 2.823156857818432e-06, + "loss": 2.1611, + "step": 8864 + }, + { + "epoch": 0.4755901287553648, + "grad_norm": 1.6154028177261353, + "learning_rate": 2.8227260957682762e-06, + "loss": 2.0479, + "step": 8865 + }, + { + "epoch": 0.47564377682403436, + "grad_norm": 1.5890657901763916, + "learning_rate": 2.822295323974126e-06, + "loss": 2.369, + "step": 8866 + }, + { + "epoch": 0.47569742489270384, + "grad_norm": 1.6348934173583984, + "learning_rate": 2.8218645424489905e-06, + "loss": 2.3142, + "step": 8867 + }, + { + "epoch": 0.47575107296137337, + "grad_norm": 1.297574520111084, + "learning_rate": 2.821433751205873e-06, + "loss": 2.1842, + "step": 8868 + }, + { + "epoch": 0.4758047210300429, + "grad_norm": 1.5140312910079956, + "learning_rate": 2.8210029502577823e-06, + "loss": 2.369, + "step": 8869 + }, + { + "epoch": 0.47585836909871243, + "grad_norm": 1.846339225769043, + "learning_rate": 2.820572139617725e-06, + "loss": 2.4282, + "step": 8870 + }, + { + "epoch": 0.47591201716738196, + "grad_norm": 1.489786148071289, + "learning_rate": 2.8201413192987074e-06, + "loss": 2.0705, + "step": 8871 + }, + { + "epoch": 0.4759656652360515, + "grad_norm": 3.9669110774993896, + "learning_rate": 2.819710489313739e-06, + "loss": 2.2154, + "step": 8872 + }, + { + "epoch": 0.476019313304721, + "grad_norm": 1.6892238855361938, + "learning_rate": 2.8192796496758264e-06, + "loss": 1.5037, + "step": 8873 + }, + { + "epoch": 0.47607296137339056, + "grad_norm": 1.5178956985473633, + "learning_rate": 2.8188488003979784e-06, + "loss": 2.2819, + "step": 8874 + }, + { + "epoch": 0.4761266094420601, + "grad_norm": 1.601321816444397, + "learning_rate": 2.8184179414932033e-06, + "loss": 2.3852, + "step": 8875 + }, + { + "epoch": 0.4761802575107296, + "grad_norm": 1.4785367250442505, + "learning_rate": 2.8179870729745098e-06, + "loss": 1.9122, + "step": 8876 + }, + { + "epoch": 0.47623390557939915, + "grad_norm": 1.303305745124817, + "learning_rate": 2.8175561948549074e-06, + "loss": 2.2577, + "step": 8877 + }, + { + "epoch": 0.4762875536480687, + "grad_norm": 1.3841509819030762, + "learning_rate": 2.8171253071474054e-06, + "loss": 2.1967, + "step": 8878 + }, + { + "epoch": 0.4763412017167382, + "grad_norm": 1.918471336364746, + "learning_rate": 2.8166944098650133e-06, + "loss": 2.3016, + "step": 8879 + }, + { + "epoch": 0.47639484978540775, + "grad_norm": 1.6406028270721436, + "learning_rate": 2.816263503020742e-06, + "loss": 2.2267, + "step": 8880 + }, + { + "epoch": 0.4764484978540773, + "grad_norm": 1.772044062614441, + "learning_rate": 2.8158325866276e-06, + "loss": 2.3316, + "step": 8881 + }, + { + "epoch": 0.47650214592274676, + "grad_norm": 1.4125466346740723, + "learning_rate": 2.815401660698599e-06, + "loss": 2.3971, + "step": 8882 + }, + { + "epoch": 0.4765557939914163, + "grad_norm": 1.2635571956634521, + "learning_rate": 2.814970725246749e-06, + "loss": 2.2637, + "step": 8883 + }, + { + "epoch": 0.4766094420600858, + "grad_norm": 1.617551326751709, + "learning_rate": 2.8145397802850632e-06, + "loss": 2.1588, + "step": 8884 + }, + { + "epoch": 0.47666309012875535, + "grad_norm": 1.658085823059082, + "learning_rate": 2.814108825826551e-06, + "loss": 2.2265, + "step": 8885 + }, + { + "epoch": 0.4767167381974249, + "grad_norm": 1.3719912767410278, + "learning_rate": 2.8136778618842244e-06, + "loss": 2.3369, + "step": 8886 + }, + { + "epoch": 0.4767703862660944, + "grad_norm": 2.5332133769989014, + "learning_rate": 2.8132468884710954e-06, + "loss": 2.4061, + "step": 8887 + }, + { + "epoch": 0.47682403433476395, + "grad_norm": 1.7036287784576416, + "learning_rate": 2.8128159056001775e-06, + "loss": 2.2528, + "step": 8888 + }, + { + "epoch": 0.4768776824034335, + "grad_norm": 1.4502954483032227, + "learning_rate": 2.812384913284482e-06, + "loss": 2.4191, + "step": 8889 + }, + { + "epoch": 0.476931330472103, + "grad_norm": 1.5565773248672485, + "learning_rate": 2.811953911537022e-06, + "loss": 2.2027, + "step": 8890 + }, + { + "epoch": 0.47698497854077254, + "grad_norm": 1.5994676351547241, + "learning_rate": 2.8115229003708113e-06, + "loss": 2.3236, + "step": 8891 + }, + { + "epoch": 0.4770386266094421, + "grad_norm": 1.189470648765564, + "learning_rate": 2.8110918797988617e-06, + "loss": 2.1088, + "step": 8892 + }, + { + "epoch": 0.4770922746781116, + "grad_norm": 1.2351588010787964, + "learning_rate": 2.8106608498341888e-06, + "loss": 2.2733, + "step": 8893 + }, + { + "epoch": 0.47714592274678114, + "grad_norm": 1.5159430503845215, + "learning_rate": 2.810229810489805e-06, + "loss": 2.2875, + "step": 8894 + }, + { + "epoch": 0.47719957081545067, + "grad_norm": 1.6399673223495483, + "learning_rate": 2.809798761778726e-06, + "loss": 2.1413, + "step": 8895 + }, + { + "epoch": 0.47725321888412015, + "grad_norm": 1.6026382446289062, + "learning_rate": 2.8093677037139653e-06, + "loss": 2.2006, + "step": 8896 + }, + { + "epoch": 0.4773068669527897, + "grad_norm": 2.167872905731201, + "learning_rate": 2.8089366363085383e-06, + "loss": 2.125, + "step": 8897 + }, + { + "epoch": 0.4773605150214592, + "grad_norm": 1.7616389989852905, + "learning_rate": 2.808505559575461e-06, + "loss": 2.2351, + "step": 8898 + }, + { + "epoch": 0.47741416309012874, + "grad_norm": 1.4975261688232422, + "learning_rate": 2.808074473527746e-06, + "loss": 2.2256, + "step": 8899 + }, + { + "epoch": 0.47746781115879827, + "grad_norm": 1.5858818292617798, + "learning_rate": 2.8076433781784114e-06, + "loss": 2.4562, + "step": 8900 + }, + { + "epoch": 0.4775214592274678, + "grad_norm": 1.3994598388671875, + "learning_rate": 2.807212273540472e-06, + "loss": 1.9964, + "step": 8901 + }, + { + "epoch": 0.47757510729613734, + "grad_norm": 1.4704415798187256, + "learning_rate": 2.8067811596269457e-06, + "loss": 2.2198, + "step": 8902 + }, + { + "epoch": 0.47762875536480687, + "grad_norm": 1.3326730728149414, + "learning_rate": 2.8063500364508472e-06, + "loss": 1.9395, + "step": 8903 + }, + { + "epoch": 0.4776824034334764, + "grad_norm": 1.6807875633239746, + "learning_rate": 2.805918904025194e-06, + "loss": 2.2836, + "step": 8904 + }, + { + "epoch": 0.47773605150214593, + "grad_norm": 1.4377540349960327, + "learning_rate": 2.8054877623630023e-06, + "loss": 1.9255, + "step": 8905 + }, + { + "epoch": 0.47778969957081546, + "grad_norm": 1.6070091724395752, + "learning_rate": 2.8050566114772915e-06, + "loss": 2.2973, + "step": 8906 + }, + { + "epoch": 0.477843347639485, + "grad_norm": 1.3569538593292236, + "learning_rate": 2.804625451381077e-06, + "loss": 2.3132, + "step": 8907 + }, + { + "epoch": 0.4778969957081545, + "grad_norm": 1.5673879384994507, + "learning_rate": 2.8041942820873786e-06, + "loss": 2.3249, + "step": 8908 + }, + { + "epoch": 0.47795064377682406, + "grad_norm": 3.996368646621704, + "learning_rate": 2.8037631036092126e-06, + "loss": 2.296, + "step": 8909 + }, + { + "epoch": 0.4780042918454936, + "grad_norm": 1.7302461862564087, + "learning_rate": 2.803331915959599e-06, + "loss": 2.5054, + "step": 8910 + }, + { + "epoch": 0.47805793991416307, + "grad_norm": 1.5297995805740356, + "learning_rate": 2.8029007191515563e-06, + "loss": 2.3738, + "step": 8911 + }, + { + "epoch": 0.4781115879828326, + "grad_norm": 1.488141417503357, + "learning_rate": 2.802469513198103e-06, + "loss": 2.4166, + "step": 8912 + }, + { + "epoch": 0.47816523605150213, + "grad_norm": 5.598122596740723, + "learning_rate": 2.8020382981122584e-06, + "loss": 2.3495, + "step": 8913 + }, + { + "epoch": 0.47821888412017166, + "grad_norm": 1.3804762363433838, + "learning_rate": 2.801607073907043e-06, + "loss": 2.0326, + "step": 8914 + }, + { + "epoch": 0.4782725321888412, + "grad_norm": 1.224878191947937, + "learning_rate": 2.801175840595476e-06, + "loss": 2.128, + "step": 8915 + }, + { + "epoch": 0.4783261802575107, + "grad_norm": 2.6265907287597656, + "learning_rate": 2.800744598190577e-06, + "loss": 2.32, + "step": 8916 + }, + { + "epoch": 0.47837982832618026, + "grad_norm": 1.4148718118667603, + "learning_rate": 2.8003133467053667e-06, + "loss": 1.4934, + "step": 8917 + }, + { + "epoch": 0.4784334763948498, + "grad_norm": 1.46062433719635, + "learning_rate": 2.799882086152866e-06, + "loss": 2.1831, + "step": 8918 + }, + { + "epoch": 0.4784871244635193, + "grad_norm": 1.5437538623809814, + "learning_rate": 2.799450816546096e-06, + "loss": 2.3151, + "step": 8919 + }, + { + "epoch": 0.47854077253218885, + "grad_norm": 1.6745342016220093, + "learning_rate": 2.7990195378980784e-06, + "loss": 2.3402, + "step": 8920 + }, + { + "epoch": 0.4785944206008584, + "grad_norm": 1.5794379711151123, + "learning_rate": 2.798588250221834e-06, + "loss": 2.3494, + "step": 8921 + }, + { + "epoch": 0.4786480686695279, + "grad_norm": 1.5488311052322388, + "learning_rate": 2.798156953530383e-06, + "loss": 2.3256, + "step": 8922 + }, + { + "epoch": 0.47870171673819745, + "grad_norm": 1.4549928903579712, + "learning_rate": 2.7977256478367503e-06, + "loss": 2.3846, + "step": 8923 + }, + { + "epoch": 0.478755364806867, + "grad_norm": 1.6181161403656006, + "learning_rate": 2.7972943331539575e-06, + "loss": 2.2652, + "step": 8924 + }, + { + "epoch": 0.47880901287553645, + "grad_norm": 1.553434133529663, + "learning_rate": 2.796863009495026e-06, + "loss": 2.363, + "step": 8925 + }, + { + "epoch": 0.478862660944206, + "grad_norm": 1.4348256587982178, + "learning_rate": 2.7964316768729794e-06, + "loss": 2.269, + "step": 8926 + }, + { + "epoch": 0.4789163090128755, + "grad_norm": 1.578824758529663, + "learning_rate": 2.7960003353008407e-06, + "loss": 2.2336, + "step": 8927 + }, + { + "epoch": 0.47896995708154505, + "grad_norm": 1.518700122833252, + "learning_rate": 2.795568984791633e-06, + "loss": 2.2372, + "step": 8928 + }, + { + "epoch": 0.4790236051502146, + "grad_norm": 1.5398374795913696, + "learning_rate": 2.795137625358381e-06, + "loss": 2.3806, + "step": 8929 + }, + { + "epoch": 0.4790772532188841, + "grad_norm": 1.2304531335830688, + "learning_rate": 2.7947062570141076e-06, + "loss": 1.5394, + "step": 8930 + }, + { + "epoch": 0.47913090128755365, + "grad_norm": 1.4782686233520508, + "learning_rate": 2.7942748797718373e-06, + "loss": 2.3855, + "step": 8931 + }, + { + "epoch": 0.4791845493562232, + "grad_norm": 1.6692990064620972, + "learning_rate": 2.7938434936445946e-06, + "loss": 2.2208, + "step": 8932 + }, + { + "epoch": 0.4792381974248927, + "grad_norm": 1.7755703926086426, + "learning_rate": 2.793412098645404e-06, + "loss": 2.2429, + "step": 8933 + }, + { + "epoch": 0.47929184549356224, + "grad_norm": 1.4642422199249268, + "learning_rate": 2.792980694787291e-06, + "loss": 2.1458, + "step": 8934 + }, + { + "epoch": 0.4793454935622318, + "grad_norm": 1.579666018486023, + "learning_rate": 2.7925492820832805e-06, + "loss": 2.328, + "step": 8935 + }, + { + "epoch": 0.4793991416309013, + "grad_norm": 1.3535549640655518, + "learning_rate": 2.7921178605463984e-06, + "loss": 1.7937, + "step": 8936 + }, + { + "epoch": 0.47945278969957084, + "grad_norm": 1.5165374279022217, + "learning_rate": 2.7916864301896706e-06, + "loss": 2.3351, + "step": 8937 + }, + { + "epoch": 0.47950643776824037, + "grad_norm": 1.60256826877594, + "learning_rate": 2.791254991026122e-06, + "loss": 2.3595, + "step": 8938 + }, + { + "epoch": 0.47956008583690984, + "grad_norm": 1.3840500116348267, + "learning_rate": 2.7908235430687803e-06, + "loss": 1.9477, + "step": 8939 + }, + { + "epoch": 0.4796137339055794, + "grad_norm": 1.5251604318618774, + "learning_rate": 2.790392086330671e-06, + "loss": 2.4322, + "step": 8940 + }, + { + "epoch": 0.4796673819742489, + "grad_norm": 1.3965842723846436, + "learning_rate": 2.789960620824822e-06, + "loss": 2.2214, + "step": 8941 + }, + { + "epoch": 0.47972103004291844, + "grad_norm": 1.5188148021697998, + "learning_rate": 2.789529146564261e-06, + "loss": 2.3437, + "step": 8942 + }, + { + "epoch": 0.47977467811158797, + "grad_norm": 1.463356375694275, + "learning_rate": 2.7890976635620133e-06, + "loss": 2.2805, + "step": 8943 + }, + { + "epoch": 0.4798283261802575, + "grad_norm": 1.457000970840454, + "learning_rate": 2.7886661718311078e-06, + "loss": 2.2104, + "step": 8944 + }, + { + "epoch": 0.47988197424892703, + "grad_norm": 1.5414018630981445, + "learning_rate": 2.788234671384572e-06, + "loss": 2.314, + "step": 8945 + }, + { + "epoch": 0.47993562231759657, + "grad_norm": 1.5408220291137695, + "learning_rate": 2.787803162235435e-06, + "loss": 2.1895, + "step": 8946 + }, + { + "epoch": 0.4799892703862661, + "grad_norm": 1.1234506368637085, + "learning_rate": 2.7873716443967246e-06, + "loss": 2.2703, + "step": 8947 + }, + { + "epoch": 0.48004291845493563, + "grad_norm": 1.1135640144348145, + "learning_rate": 2.786940117881469e-06, + "loss": 2.2334, + "step": 8948 + }, + { + "epoch": 0.48009656652360516, + "grad_norm": 14.829164505004883, + "learning_rate": 2.7865085827026985e-06, + "loss": 2.2228, + "step": 8949 + }, + { + "epoch": 0.4801502145922747, + "grad_norm": 1.7137683629989624, + "learning_rate": 2.786077038873441e-06, + "loss": 2.16, + "step": 8950 + }, + { + "epoch": 0.4802038626609442, + "grad_norm": 1.6388914585113525, + "learning_rate": 2.7856454864067267e-06, + "loss": 2.2918, + "step": 8951 + }, + { + "epoch": 0.48025751072961376, + "grad_norm": 1.6464343070983887, + "learning_rate": 2.785213925315586e-06, + "loss": 2.0918, + "step": 8952 + }, + { + "epoch": 0.4803111587982833, + "grad_norm": 1.4921010732650757, + "learning_rate": 2.784782355613047e-06, + "loss": 2.3577, + "step": 8953 + }, + { + "epoch": 0.48036480686695276, + "grad_norm": 2.1390810012817383, + "learning_rate": 2.784350777312142e-06, + "loss": 2.32, + "step": 8954 + }, + { + "epoch": 0.4804184549356223, + "grad_norm": 1.6033371686935425, + "learning_rate": 2.7839191904259e-06, + "loss": 2.2353, + "step": 8955 + }, + { + "epoch": 0.48047210300429183, + "grad_norm": 1.5695960521697998, + "learning_rate": 2.783487594967353e-06, + "loss": 2.0859, + "step": 8956 + }, + { + "epoch": 0.48052575107296136, + "grad_norm": 1.5054091215133667, + "learning_rate": 2.7830559909495315e-06, + "loss": 2.4245, + "step": 8957 + }, + { + "epoch": 0.4805793991416309, + "grad_norm": 1.7826502323150635, + "learning_rate": 2.782624378385467e-06, + "loss": 2.2756, + "step": 8958 + }, + { + "epoch": 0.4806330472103004, + "grad_norm": 1.5057251453399658, + "learning_rate": 2.782192757288191e-06, + "loss": 2.1906, + "step": 8959 + }, + { + "epoch": 0.48068669527896996, + "grad_norm": 1.7478843927383423, + "learning_rate": 2.7817611276707352e-06, + "loss": 2.1953, + "step": 8960 + }, + { + "epoch": 0.4807403433476395, + "grad_norm": 1.6383897066116333, + "learning_rate": 2.7813294895461314e-06, + "loss": 2.217, + "step": 8961 + }, + { + "epoch": 0.480793991416309, + "grad_norm": 1.5980000495910645, + "learning_rate": 2.7808978429274125e-06, + "loss": 2.2528, + "step": 8962 + }, + { + "epoch": 0.48084763948497855, + "grad_norm": 1.6045914888381958, + "learning_rate": 2.7804661878276106e-06, + "loss": 2.3063, + "step": 8963 + }, + { + "epoch": 0.4809012875536481, + "grad_norm": 1.508978009223938, + "learning_rate": 2.78003452425976e-06, + "loss": 2.322, + "step": 8964 + }, + { + "epoch": 0.4809549356223176, + "grad_norm": 1.5248689651489258, + "learning_rate": 2.7796028522368916e-06, + "loss": 2.258, + "step": 8965 + }, + { + "epoch": 0.48100858369098715, + "grad_norm": 1.218807339668274, + "learning_rate": 2.7791711717720406e-06, + "loss": 2.0386, + "step": 8966 + }, + { + "epoch": 0.4810622317596567, + "grad_norm": 1.689613938331604, + "learning_rate": 2.7787394828782394e-06, + "loss": 2.3499, + "step": 8967 + }, + { + "epoch": 0.48111587982832615, + "grad_norm": 1.6863222122192383, + "learning_rate": 2.778307785568523e-06, + "loss": 2.3181, + "step": 8968 + }, + { + "epoch": 0.4811695278969957, + "grad_norm": 1.5043702125549316, + "learning_rate": 2.7778760798559245e-06, + "loss": 2.1693, + "step": 8969 + }, + { + "epoch": 0.4812231759656652, + "grad_norm": 1.499253511428833, + "learning_rate": 2.7774443657534788e-06, + "loss": 2.2921, + "step": 8970 + }, + { + "epoch": 0.48127682403433475, + "grad_norm": 1.3414020538330078, + "learning_rate": 2.7770126432742206e-06, + "loss": 2.3062, + "step": 8971 + }, + { + "epoch": 0.4813304721030043, + "grad_norm": 1.4700849056243896, + "learning_rate": 2.7765809124311843e-06, + "loss": 2.0059, + "step": 8972 + }, + { + "epoch": 0.4813841201716738, + "grad_norm": 2.5273256301879883, + "learning_rate": 2.776149173237406e-06, + "loss": 2.6381, + "step": 8973 + }, + { + "epoch": 0.48143776824034334, + "grad_norm": 1.3304486274719238, + "learning_rate": 2.7757174257059198e-06, + "loss": 2.1828, + "step": 8974 + }, + { + "epoch": 0.4814914163090129, + "grad_norm": 1.5221681594848633, + "learning_rate": 2.775285669849762e-06, + "loss": 2.47, + "step": 8975 + }, + { + "epoch": 0.4815450643776824, + "grad_norm": 1.5196255445480347, + "learning_rate": 2.774853905681969e-06, + "loss": 2.1993, + "step": 8976 + }, + { + "epoch": 0.48159871244635194, + "grad_norm": 1.5688096284866333, + "learning_rate": 2.7744221332155773e-06, + "loss": 2.2728, + "step": 8977 + }, + { + "epoch": 0.48165236051502147, + "grad_norm": 1.510208249092102, + "learning_rate": 2.7739903524636207e-06, + "loss": 2.2782, + "step": 8978 + }, + { + "epoch": 0.481706008583691, + "grad_norm": 2.2671821117401123, + "learning_rate": 2.773558563439139e-06, + "loss": 2.6029, + "step": 8979 + }, + { + "epoch": 0.48175965665236054, + "grad_norm": 1.389789342880249, + "learning_rate": 2.7731267661551676e-06, + "loss": 2.4644, + "step": 8980 + }, + { + "epoch": 0.48181330472103007, + "grad_norm": 1.5788774490356445, + "learning_rate": 2.772694960624744e-06, + "loss": 2.3696, + "step": 8981 + }, + { + "epoch": 0.4818669527896996, + "grad_norm": 1.607273817062378, + "learning_rate": 2.7722631468609053e-06, + "loss": 2.3809, + "step": 8982 + }, + { + "epoch": 0.4819206008583691, + "grad_norm": 1.5881706476211548, + "learning_rate": 2.771831324876689e-06, + "loss": 2.267, + "step": 8983 + }, + { + "epoch": 0.4819742489270386, + "grad_norm": 1.4951869249343872, + "learning_rate": 2.7713994946851327e-06, + "loss": 2.3928, + "step": 8984 + }, + { + "epoch": 0.48202789699570814, + "grad_norm": 1.4855356216430664, + "learning_rate": 2.770967656299276e-06, + "loss": 2.2687, + "step": 8985 + }, + { + "epoch": 0.48208154506437767, + "grad_norm": 1.4998747110366821, + "learning_rate": 2.770535809732157e-06, + "loss": 2.045, + "step": 8986 + }, + { + "epoch": 0.4821351931330472, + "grad_norm": 1.4095348119735718, + "learning_rate": 2.770103954996813e-06, + "loss": 2.2392, + "step": 8987 + }, + { + "epoch": 0.48218884120171673, + "grad_norm": 1.4756535291671753, + "learning_rate": 2.769672092106283e-06, + "loss": 2.3007, + "step": 8988 + }, + { + "epoch": 0.48224248927038627, + "grad_norm": 1.4501311779022217, + "learning_rate": 2.7692402210736082e-06, + "loss": 2.5102, + "step": 8989 + }, + { + "epoch": 0.4822961373390558, + "grad_norm": 1.5690712928771973, + "learning_rate": 2.7688083419118256e-06, + "loss": 2.1616, + "step": 8990 + }, + { + "epoch": 0.48234978540772533, + "grad_norm": 1.5623574256896973, + "learning_rate": 2.7683764546339758e-06, + "loss": 2.2846, + "step": 8991 + }, + { + "epoch": 0.48240343347639486, + "grad_norm": 1.5768859386444092, + "learning_rate": 2.767944559253099e-06, + "loss": 2.337, + "step": 8992 + }, + { + "epoch": 0.4824570815450644, + "grad_norm": 1.405672311782837, + "learning_rate": 2.767512655782235e-06, + "loss": 2.299, + "step": 8993 + }, + { + "epoch": 0.4825107296137339, + "grad_norm": 1.607465386390686, + "learning_rate": 2.7670807442344238e-06, + "loss": 2.4042, + "step": 8994 + }, + { + "epoch": 0.48256437768240346, + "grad_norm": 1.3457920551300049, + "learning_rate": 2.766648824622707e-06, + "loss": 2.2359, + "step": 8995 + }, + { + "epoch": 0.482618025751073, + "grad_norm": 1.4138245582580566, + "learning_rate": 2.7662168969601243e-06, + "loss": 2.2017, + "step": 8996 + }, + { + "epoch": 0.48267167381974246, + "grad_norm": 1.444812297821045, + "learning_rate": 2.7657849612597175e-06, + "loss": 2.2829, + "step": 8997 + }, + { + "epoch": 0.482725321888412, + "grad_norm": 29.67660140991211, + "learning_rate": 2.7653530175345277e-06, + "loss": 2.1371, + "step": 8998 + }, + { + "epoch": 0.4827789699570815, + "grad_norm": 1.6261683702468872, + "learning_rate": 2.7649210657975968e-06, + "loss": 2.1247, + "step": 8999 + }, + { + "epoch": 0.48283261802575106, + "grad_norm": 1.6914910078048706, + "learning_rate": 2.7644891060619656e-06, + "loss": 2.3816, + "step": 9000 + }, + { + "epoch": 0.4828862660944206, + "grad_norm": 1.8096294403076172, + "learning_rate": 2.764057138340677e-06, + "loss": 2.1692, + "step": 9001 + }, + { + "epoch": 0.4829399141630901, + "grad_norm": 3.984257698059082, + "learning_rate": 2.7636251626467737e-06, + "loss": 1.8544, + "step": 9002 + }, + { + "epoch": 0.48299356223175965, + "grad_norm": 1.3397808074951172, + "learning_rate": 2.7631931789932974e-06, + "loss": 1.8292, + "step": 9003 + }, + { + "epoch": 0.4830472103004292, + "grad_norm": 1.5114164352416992, + "learning_rate": 2.7627611873932918e-06, + "loss": 2.342, + "step": 9004 + }, + { + "epoch": 0.4831008583690987, + "grad_norm": 1.4545506238937378, + "learning_rate": 2.7623291878597985e-06, + "loss": 2.2525, + "step": 9005 + }, + { + "epoch": 0.48315450643776825, + "grad_norm": 1.206678032875061, + "learning_rate": 2.7618971804058615e-06, + "loss": 1.6677, + "step": 9006 + }, + { + "epoch": 0.4832081545064378, + "grad_norm": 1.5255200862884521, + "learning_rate": 2.7614651650445246e-06, + "loss": 2.2751, + "step": 9007 + }, + { + "epoch": 0.4832618025751073, + "grad_norm": 1.8004666566848755, + "learning_rate": 2.7610331417888326e-06, + "loss": 2.1468, + "step": 9008 + }, + { + "epoch": 0.48331545064377684, + "grad_norm": 1.453015923500061, + "learning_rate": 2.7606011106518265e-06, + "loss": 2.4146, + "step": 9009 + }, + { + "epoch": 0.4833690987124464, + "grad_norm": 1.559418797492981, + "learning_rate": 2.760169071646553e-06, + "loss": 2.3258, + "step": 9010 + }, + { + "epoch": 0.48342274678111585, + "grad_norm": 1.4192191362380981, + "learning_rate": 2.7597370247860555e-06, + "loss": 1.9731, + "step": 9011 + }, + { + "epoch": 0.4834763948497854, + "grad_norm": 1.3098634481430054, + "learning_rate": 2.759304970083379e-06, + "loss": 1.6949, + "step": 9012 + }, + { + "epoch": 0.4835300429184549, + "grad_norm": 1.5987271070480347, + "learning_rate": 2.7588729075515684e-06, + "loss": 2.2813, + "step": 9013 + }, + { + "epoch": 0.48358369098712445, + "grad_norm": 1.6089681386947632, + "learning_rate": 2.7584408372036686e-06, + "loss": 2.2464, + "step": 9014 + }, + { + "epoch": 0.483637339055794, + "grad_norm": 4.821911811828613, + "learning_rate": 2.7580087590527255e-06, + "loss": 2.1611, + "step": 9015 + }, + { + "epoch": 0.4836909871244635, + "grad_norm": 1.6386127471923828, + "learning_rate": 2.7575766731117847e-06, + "loss": 2.4109, + "step": 9016 + }, + { + "epoch": 0.48374463519313304, + "grad_norm": 1.7992515563964844, + "learning_rate": 2.7571445793938918e-06, + "loss": 2.256, + "step": 9017 + }, + { + "epoch": 0.4837982832618026, + "grad_norm": 1.4227594137191772, + "learning_rate": 2.7567124779120923e-06, + "loss": 2.3658, + "step": 9018 + }, + { + "epoch": 0.4838519313304721, + "grad_norm": 1.4255247116088867, + "learning_rate": 2.7562803686794336e-06, + "loss": 2.2811, + "step": 9019 + }, + { + "epoch": 0.48390557939914164, + "grad_norm": 1.4857426881790161, + "learning_rate": 2.7558482517089617e-06, + "loss": 2.3236, + "step": 9020 + }, + { + "epoch": 0.48395922746781117, + "grad_norm": 1.496901035308838, + "learning_rate": 2.7554161270137245e-06, + "loss": 2.1652, + "step": 9021 + }, + { + "epoch": 0.4840128755364807, + "grad_norm": 2.024207353591919, + "learning_rate": 2.7549839946067674e-06, + "loss": 2.2708, + "step": 9022 + }, + { + "epoch": 0.48406652360515023, + "grad_norm": 1.6691854000091553, + "learning_rate": 2.754551854501138e-06, + "loss": 2.2049, + "step": 9023 + }, + { + "epoch": 0.48412017167381977, + "grad_norm": 1.5731093883514404, + "learning_rate": 2.7541197067098845e-06, + "loss": 2.2865, + "step": 9024 + }, + { + "epoch": 0.4841738197424893, + "grad_norm": 2.002371072769165, + "learning_rate": 2.7536875512460544e-06, + "loss": 2.3933, + "step": 9025 + }, + { + "epoch": 0.4842274678111588, + "grad_norm": 2.111229658126831, + "learning_rate": 2.753255388122696e-06, + "loss": 2.0371, + "step": 9026 + }, + { + "epoch": 0.4842811158798283, + "grad_norm": 1.5404088497161865, + "learning_rate": 2.752823217352856e-06, + "loss": 2.4084, + "step": 9027 + }, + { + "epoch": 0.48433476394849784, + "grad_norm": 1.4556753635406494, + "learning_rate": 2.752391038949585e-06, + "loss": 2.1953, + "step": 9028 + }, + { + "epoch": 0.48438841201716737, + "grad_norm": 1.3977620601654053, + "learning_rate": 2.7519588529259293e-06, + "loss": 2.2056, + "step": 9029 + }, + { + "epoch": 0.4844420600858369, + "grad_norm": 1.5625306367874146, + "learning_rate": 2.7515266592949408e-06, + "loss": 2.1023, + "step": 9030 + }, + { + "epoch": 0.48449570815450643, + "grad_norm": 1.7019789218902588, + "learning_rate": 2.7510944580696657e-06, + "loss": 2.3734, + "step": 9031 + }, + { + "epoch": 0.48454935622317596, + "grad_norm": 1.6218317747116089, + "learning_rate": 2.7506622492631553e-06, + "loss": 2.4475, + "step": 9032 + }, + { + "epoch": 0.4846030042918455, + "grad_norm": 2.478410005569458, + "learning_rate": 2.750230032888458e-06, + "loss": 2.2026, + "step": 9033 + }, + { + "epoch": 0.48465665236051503, + "grad_norm": 1.5342580080032349, + "learning_rate": 2.7497978089586236e-06, + "loss": 2.2378, + "step": 9034 + }, + { + "epoch": 0.48471030042918456, + "grad_norm": 1.3621281385421753, + "learning_rate": 2.749365577486703e-06, + "loss": 2.1397, + "step": 9035 + }, + { + "epoch": 0.4847639484978541, + "grad_norm": 1.7174782752990723, + "learning_rate": 2.748933338485746e-06, + "loss": 2.4274, + "step": 9036 + }, + { + "epoch": 0.4848175965665236, + "grad_norm": 1.5859864950180054, + "learning_rate": 2.748501091968803e-06, + "loss": 2.3072, + "step": 9037 + }, + { + "epoch": 0.48487124463519315, + "grad_norm": 4.69979190826416, + "learning_rate": 2.748068837948925e-06, + "loss": 2.2135, + "step": 9038 + }, + { + "epoch": 0.4849248927038627, + "grad_norm": 1.4628430604934692, + "learning_rate": 2.7476365764391627e-06, + "loss": 2.393, + "step": 9039 + }, + { + "epoch": 0.48497854077253216, + "grad_norm": 1.5995533466339111, + "learning_rate": 2.7472043074525674e-06, + "loss": 2.3211, + "step": 9040 + }, + { + "epoch": 0.4850321888412017, + "grad_norm": 1.3286219835281372, + "learning_rate": 2.7467720310021904e-06, + "loss": 2.2994, + "step": 9041 + }, + { + "epoch": 0.4850858369098712, + "grad_norm": 1.663124918937683, + "learning_rate": 2.7463397471010834e-06, + "loss": 2.0337, + "step": 9042 + }, + { + "epoch": 0.48513948497854076, + "grad_norm": 1.466835856437683, + "learning_rate": 2.745907455762299e-06, + "loss": 2.2435, + "step": 9043 + }, + { + "epoch": 0.4851931330472103, + "grad_norm": 1.3988091945648193, + "learning_rate": 2.7454751569988876e-06, + "loss": 2.1577, + "step": 9044 + }, + { + "epoch": 0.4852467811158798, + "grad_norm": 1.1710726022720337, + "learning_rate": 2.7450428508239024e-06, + "loss": 2.1263, + "step": 9045 + }, + { + "epoch": 0.48530042918454935, + "grad_norm": 1.6160788536071777, + "learning_rate": 2.744610537250395e-06, + "loss": 2.165, + "step": 9046 + }, + { + "epoch": 0.4853540772532189, + "grad_norm": 1.625352144241333, + "learning_rate": 2.74417821629142e-06, + "loss": 1.4018, + "step": 9047 + }, + { + "epoch": 0.4854077253218884, + "grad_norm": 1.6725915670394897, + "learning_rate": 2.74374588796003e-06, + "loss": 2.5094, + "step": 9048 + }, + { + "epoch": 0.48546137339055795, + "grad_norm": 1.6625337600708008, + "learning_rate": 2.7433135522692774e-06, + "loss": 2.2316, + "step": 9049 + }, + { + "epoch": 0.4855150214592275, + "grad_norm": 1.4455866813659668, + "learning_rate": 2.742881209232215e-06, + "loss": 2.1172, + "step": 9050 + }, + { + "epoch": 0.485568669527897, + "grad_norm": 1.7053568363189697, + "learning_rate": 2.742448858861898e-06, + "loss": 2.217, + "step": 9051 + }, + { + "epoch": 0.48562231759656654, + "grad_norm": 1.5962988138198853, + "learning_rate": 2.742016501171379e-06, + "loss": 2.414, + "step": 9052 + }, + { + "epoch": 0.4856759656652361, + "grad_norm": 1.477253794670105, + "learning_rate": 2.7415841361737127e-06, + "loss": 2.1674, + "step": 9053 + }, + { + "epoch": 0.48572961373390555, + "grad_norm": 1.1854312419891357, + "learning_rate": 2.741151763881953e-06, + "loss": 2.1908, + "step": 9054 + }, + { + "epoch": 0.4857832618025751, + "grad_norm": 1.5955545902252197, + "learning_rate": 2.740719384309155e-06, + "loss": 2.4942, + "step": 9055 + }, + { + "epoch": 0.4858369098712446, + "grad_norm": 2.1162283420562744, + "learning_rate": 2.7402869974683732e-06, + "loss": 2.2484, + "step": 9056 + }, + { + "epoch": 0.48589055793991415, + "grad_norm": 1.4689736366271973, + "learning_rate": 2.7398546033726627e-06, + "loss": 2.1949, + "step": 9057 + }, + { + "epoch": 0.4859442060085837, + "grad_norm": 1.7181146144866943, + "learning_rate": 2.7394222020350776e-06, + "loss": 2.2199, + "step": 9058 + }, + { + "epoch": 0.4859978540772532, + "grad_norm": 1.7094826698303223, + "learning_rate": 2.738989793468675e-06, + "loss": 1.7889, + "step": 9059 + }, + { + "epoch": 0.48605150214592274, + "grad_norm": 1.5525280237197876, + "learning_rate": 2.7385573776865086e-06, + "loss": 2.2053, + "step": 9060 + }, + { + "epoch": 0.4861051502145923, + "grad_norm": 1.5426701307296753, + "learning_rate": 2.738124954701637e-06, + "loss": 2.0344, + "step": 9061 + }, + { + "epoch": 0.4861587982832618, + "grad_norm": 1.6070570945739746, + "learning_rate": 2.7376925245271125e-06, + "loss": 2.2654, + "step": 9062 + }, + { + "epoch": 0.48621244635193134, + "grad_norm": 1.4222930669784546, + "learning_rate": 2.7372600871759946e-06, + "loss": 2.2696, + "step": 9063 + }, + { + "epoch": 0.48626609442060087, + "grad_norm": 2.448643445968628, + "learning_rate": 2.7368276426613382e-06, + "loss": 2.2963, + "step": 9064 + }, + { + "epoch": 0.4863197424892704, + "grad_norm": 2.547900676727295, + "learning_rate": 2.7363951909962004e-06, + "loss": 2.445, + "step": 9065 + }, + { + "epoch": 0.48637339055793993, + "grad_norm": 1.5344934463500977, + "learning_rate": 2.735962732193638e-06, + "loss": 1.3117, + "step": 9066 + }, + { + "epoch": 0.48642703862660946, + "grad_norm": 1.5250906944274902, + "learning_rate": 2.735530266266709e-06, + "loss": 2.0335, + "step": 9067 + }, + { + "epoch": 0.486480686695279, + "grad_norm": 1.3329155445098877, + "learning_rate": 2.7350977932284683e-06, + "loss": 2.0765, + "step": 9068 + }, + { + "epoch": 0.4865343347639485, + "grad_norm": 1.584048867225647, + "learning_rate": 2.734665313091976e-06, + "loss": 1.9923, + "step": 9069 + }, + { + "epoch": 0.486587982832618, + "grad_norm": 1.6812756061553955, + "learning_rate": 2.7342328258702895e-06, + "loss": 2.4094, + "step": 9070 + }, + { + "epoch": 0.48664163090128754, + "grad_norm": 1.4697275161743164, + "learning_rate": 2.7338003315764657e-06, + "loss": 2.2063, + "step": 9071 + }, + { + "epoch": 0.48669527896995707, + "grad_norm": 1.451852560043335, + "learning_rate": 2.7333678302235633e-06, + "loss": 1.9976, + "step": 9072 + }, + { + "epoch": 0.4867489270386266, + "grad_norm": 1.7532079219818115, + "learning_rate": 2.7329353218246406e-06, + "loss": 2.2268, + "step": 9073 + }, + { + "epoch": 0.48680257510729613, + "grad_norm": 1.7579280138015747, + "learning_rate": 2.732502806392757e-06, + "loss": 1.0603, + "step": 9074 + }, + { + "epoch": 0.48685622317596566, + "grad_norm": 1.597919225692749, + "learning_rate": 2.73207028394097e-06, + "loss": 2.3277, + "step": 9075 + }, + { + "epoch": 0.4869098712446352, + "grad_norm": 2.3047313690185547, + "learning_rate": 2.7316377544823397e-06, + "loss": 2.4279, + "step": 9076 + }, + { + "epoch": 0.4869635193133047, + "grad_norm": 1.3574410676956177, + "learning_rate": 2.7312052180299255e-06, + "loss": 2.1659, + "step": 9077 + }, + { + "epoch": 0.48701716738197426, + "grad_norm": 1.6300498247146606, + "learning_rate": 2.7307726745967855e-06, + "loss": 2.3555, + "step": 9078 + }, + { + "epoch": 0.4870708154506438, + "grad_norm": 1.5106104612350464, + "learning_rate": 2.7303401241959808e-06, + "loss": 1.9596, + "step": 9079 + }, + { + "epoch": 0.4871244635193133, + "grad_norm": 1.5785129070281982, + "learning_rate": 2.7299075668405705e-06, + "loss": 2.0656, + "step": 9080 + }, + { + "epoch": 0.48717811158798285, + "grad_norm": 1.371991515159607, + "learning_rate": 2.729475002543615e-06, + "loss": 2.5199, + "step": 9081 + }, + { + "epoch": 0.4872317596566524, + "grad_norm": 1.3164485692977905, + "learning_rate": 2.729042431318175e-06, + "loss": 1.4357, + "step": 9082 + }, + { + "epoch": 0.48728540772532186, + "grad_norm": 1.61985445022583, + "learning_rate": 2.7286098531773108e-06, + "loss": 2.0732, + "step": 9083 + }, + { + "epoch": 0.4873390557939914, + "grad_norm": 1.3886834383010864, + "learning_rate": 2.7281772681340825e-06, + "loss": 2.189, + "step": 9084 + }, + { + "epoch": 0.4873927038626609, + "grad_norm": 1.2886035442352295, + "learning_rate": 2.7277446762015507e-06, + "loss": 2.2001, + "step": 9085 + }, + { + "epoch": 0.48744635193133046, + "grad_norm": 1.6329926252365112, + "learning_rate": 2.7273120773927787e-06, + "loss": 2.1963, + "step": 9086 + }, + { + "epoch": 0.4875, + "grad_norm": 1.6188676357269287, + "learning_rate": 2.726879471720826e-06, + "loss": 2.3599, + "step": 9087 + }, + { + "epoch": 0.4875536480686695, + "grad_norm": 1.4992201328277588, + "learning_rate": 2.7264468591987547e-06, + "loss": 2.3332, + "step": 9088 + }, + { + "epoch": 0.48760729613733905, + "grad_norm": 1.145732045173645, + "learning_rate": 2.726014239839626e-06, + "loss": 2.0989, + "step": 9089 + }, + { + "epoch": 0.4876609442060086, + "grad_norm": 1.593786597251892, + "learning_rate": 2.725581613656503e-06, + "loss": 2.441, + "step": 9090 + }, + { + "epoch": 0.4877145922746781, + "grad_norm": 1.510446548461914, + "learning_rate": 2.7251489806624464e-06, + "loss": 2.3633, + "step": 9091 + }, + { + "epoch": 0.48776824034334765, + "grad_norm": 1.3848161697387695, + "learning_rate": 2.7247163408705207e-06, + "loss": 2.1, + "step": 9092 + }, + { + "epoch": 0.4878218884120172, + "grad_norm": 1.844834566116333, + "learning_rate": 2.724283694293787e-06, + "loss": 2.3375, + "step": 9093 + }, + { + "epoch": 0.4878755364806867, + "grad_norm": 1.544724941253662, + "learning_rate": 2.723851040945307e-06, + "loss": 2.3484, + "step": 9094 + }, + { + "epoch": 0.48792918454935624, + "grad_norm": 1.618008017539978, + "learning_rate": 2.7234183808381465e-06, + "loss": 2.2897, + "step": 9095 + }, + { + "epoch": 0.4879828326180258, + "grad_norm": 1.4622361660003662, + "learning_rate": 2.7229857139853667e-06, + "loss": 2.1992, + "step": 9096 + }, + { + "epoch": 0.4880364806866953, + "grad_norm": 1.5573172569274902, + "learning_rate": 2.7225530404000315e-06, + "loss": 2.2466, + "step": 9097 + }, + { + "epoch": 0.4880901287553648, + "grad_norm": 1.3403315544128418, + "learning_rate": 2.7221203600952047e-06, + "loss": 2.2952, + "step": 9098 + }, + { + "epoch": 0.4881437768240343, + "grad_norm": 1.4099754095077515, + "learning_rate": 2.7216876730839496e-06, + "loss": 2.3296, + "step": 9099 + }, + { + "epoch": 0.48819742489270385, + "grad_norm": 1.9543249607086182, + "learning_rate": 2.7212549793793314e-06, + "loss": 2.4887, + "step": 9100 + }, + { + "epoch": 0.4882510729613734, + "grad_norm": 1.461338758468628, + "learning_rate": 2.720822278994413e-06, + "loss": 2.2317, + "step": 9101 + }, + { + "epoch": 0.4883047210300429, + "grad_norm": 1.5632555484771729, + "learning_rate": 2.720389571942259e-06, + "loss": 2.3278, + "step": 9102 + }, + { + "epoch": 0.48835836909871244, + "grad_norm": 1.4802947044372559, + "learning_rate": 2.7199568582359353e-06, + "loss": 2.2475, + "step": 9103 + }, + { + "epoch": 0.488412017167382, + "grad_norm": 1.21487295627594, + "learning_rate": 2.719524137888505e-06, + "loss": 2.3433, + "step": 9104 + }, + { + "epoch": 0.4884656652360515, + "grad_norm": 1.3538087606430054, + "learning_rate": 2.719091410913035e-06, + "loss": 2.1629, + "step": 9105 + }, + { + "epoch": 0.48851931330472104, + "grad_norm": 1.3067601919174194, + "learning_rate": 2.7186586773225885e-06, + "loss": 2.4024, + "step": 9106 + }, + { + "epoch": 0.48857296137339057, + "grad_norm": 1.7064461708068848, + "learning_rate": 2.718225937130231e-06, + "loss": 2.3548, + "step": 9107 + }, + { + "epoch": 0.4886266094420601, + "grad_norm": 1.386432409286499, + "learning_rate": 2.7177931903490302e-06, + "loss": 2.2386, + "step": 9108 + }, + { + "epoch": 0.48868025751072963, + "grad_norm": 1.6739431619644165, + "learning_rate": 2.71736043699205e-06, + "loss": 2.3093, + "step": 9109 + }, + { + "epoch": 0.48873390557939916, + "grad_norm": 1.6766189336776733, + "learning_rate": 2.7169276770723583e-06, + "loss": 1.4548, + "step": 9110 + }, + { + "epoch": 0.4887875536480687, + "grad_norm": 1.227871060371399, + "learning_rate": 2.71649491060302e-06, + "loss": 2.3274, + "step": 9111 + }, + { + "epoch": 0.48884120171673817, + "grad_norm": 1.6201472282409668, + "learning_rate": 2.716062137597101e-06, + "loss": 2.107, + "step": 9112 + }, + { + "epoch": 0.4888948497854077, + "grad_norm": 1.1380677223205566, + "learning_rate": 2.715629358067668e-06, + "loss": 1.9569, + "step": 9113 + }, + { + "epoch": 0.48894849785407724, + "grad_norm": 1.4030513763427734, + "learning_rate": 2.7151965720277896e-06, + "loss": 2.1758, + "step": 9114 + }, + { + "epoch": 0.48900214592274677, + "grad_norm": 1.6671311855316162, + "learning_rate": 2.7147637794905314e-06, + "loss": 2.2968, + "step": 9115 + }, + { + "epoch": 0.4890557939914163, + "grad_norm": 1.3747804164886475, + "learning_rate": 2.71433098046896e-06, + "loss": 2.2659, + "step": 9116 + }, + { + "epoch": 0.48910944206008583, + "grad_norm": 1.6506941318511963, + "learning_rate": 2.713898174976144e-06, + "loss": 2.2335, + "step": 9117 + }, + { + "epoch": 0.48916309012875536, + "grad_norm": 1.731046438217163, + "learning_rate": 2.7134653630251507e-06, + "loss": 2.5644, + "step": 9118 + }, + { + "epoch": 0.4892167381974249, + "grad_norm": 1.6567022800445557, + "learning_rate": 2.7130325446290478e-06, + "loss": 2.0528, + "step": 9119 + }, + { + "epoch": 0.4892703862660944, + "grad_norm": 1.5768059492111206, + "learning_rate": 2.712599719800903e-06, + "loss": 2.1695, + "step": 9120 + }, + { + "epoch": 0.48932403433476396, + "grad_norm": 5.442168712615967, + "learning_rate": 2.712166888553785e-06, + "loss": 2.1099, + "step": 9121 + }, + { + "epoch": 0.4893776824034335, + "grad_norm": 1.5135952234268188, + "learning_rate": 2.711734050900762e-06, + "loss": 2.4411, + "step": 9122 + }, + { + "epoch": 0.489431330472103, + "grad_norm": 1.5388524532318115, + "learning_rate": 2.711301206854903e-06, + "loss": 2.3784, + "step": 9123 + }, + { + "epoch": 0.48948497854077255, + "grad_norm": 1.6463637351989746, + "learning_rate": 2.7108683564292748e-06, + "loss": 2.5638, + "step": 9124 + }, + { + "epoch": 0.4895386266094421, + "grad_norm": 1.363834261894226, + "learning_rate": 2.7104354996369485e-06, + "loss": 2.1773, + "step": 9125 + }, + { + "epoch": 0.48959227467811156, + "grad_norm": 1.7313225269317627, + "learning_rate": 2.7100026364909925e-06, + "loss": 2.2144, + "step": 9126 + }, + { + "epoch": 0.4896459227467811, + "grad_norm": 1.8723465204238892, + "learning_rate": 2.709569767004477e-06, + "loss": 2.3345, + "step": 9127 + }, + { + "epoch": 0.4896995708154506, + "grad_norm": 1.553295612335205, + "learning_rate": 2.70913689119047e-06, + "loss": 2.3272, + "step": 9128 + }, + { + "epoch": 0.48975321888412016, + "grad_norm": 1.6897462606430054, + "learning_rate": 2.708704009062042e-06, + "loss": 2.3893, + "step": 9129 + }, + { + "epoch": 0.4898068669527897, + "grad_norm": 1.6052806377410889, + "learning_rate": 2.708271120632262e-06, + "loss": 2.3921, + "step": 9130 + }, + { + "epoch": 0.4898605150214592, + "grad_norm": 1.5988221168518066, + "learning_rate": 2.7078382259142016e-06, + "loss": 2.2585, + "step": 9131 + }, + { + "epoch": 0.48991416309012875, + "grad_norm": 1.4534022808074951, + "learning_rate": 2.707405324920931e-06, + "loss": 2.1875, + "step": 9132 + }, + { + "epoch": 0.4899678111587983, + "grad_norm": 3.7328360080718994, + "learning_rate": 2.706972417665519e-06, + "loss": 2.2221, + "step": 9133 + }, + { + "epoch": 0.4900214592274678, + "grad_norm": 1.7136904001235962, + "learning_rate": 2.7065395041610383e-06, + "loss": 2.4679, + "step": 9134 + }, + { + "epoch": 0.49007510729613735, + "grad_norm": 1.6412007808685303, + "learning_rate": 2.7061065844205586e-06, + "loss": 2.1881, + "step": 9135 + }, + { + "epoch": 0.4901287553648069, + "grad_norm": 1.6069873571395874, + "learning_rate": 2.7056736584571506e-06, + "loss": 2.067, + "step": 9136 + }, + { + "epoch": 0.4901824034334764, + "grad_norm": 1.6538954973220825, + "learning_rate": 2.7052407262838866e-06, + "loss": 2.4129, + "step": 9137 + }, + { + "epoch": 0.49023605150214594, + "grad_norm": 1.816689133644104, + "learning_rate": 2.7048077879138375e-06, + "loss": 2.3484, + "step": 9138 + }, + { + "epoch": 0.4902896995708155, + "grad_norm": 1.6590938568115234, + "learning_rate": 2.7043748433600748e-06, + "loss": 2.4113, + "step": 9139 + }, + { + "epoch": 0.490343347639485, + "grad_norm": 1.6081421375274658, + "learning_rate": 2.7039418926356703e-06, + "loss": 2.2077, + "step": 9140 + }, + { + "epoch": 0.4903969957081545, + "grad_norm": 1.5239319801330566, + "learning_rate": 2.7035089357536958e-06, + "loss": 2.3405, + "step": 9141 + }, + { + "epoch": 0.490450643776824, + "grad_norm": 1.6159523725509644, + "learning_rate": 2.7030759727272245e-06, + "loss": 2.3003, + "step": 9142 + }, + { + "epoch": 0.49050429184549355, + "grad_norm": 1.6769232749938965, + "learning_rate": 2.7026430035693273e-06, + "loss": 2.3197, + "step": 9143 + }, + { + "epoch": 0.4905579399141631, + "grad_norm": 2.1692943572998047, + "learning_rate": 2.702210028293078e-06, + "loss": 2.1786, + "step": 9144 + }, + { + "epoch": 0.4906115879828326, + "grad_norm": 1.4223120212554932, + "learning_rate": 2.701777046911549e-06, + "loss": 2.0939, + "step": 9145 + }, + { + "epoch": 0.49066523605150214, + "grad_norm": 1.6400643587112427, + "learning_rate": 2.701344059437812e-06, + "loss": 2.3415, + "step": 9146 + }, + { + "epoch": 0.4907188841201717, + "grad_norm": 3.0558621883392334, + "learning_rate": 2.7009110658849415e-06, + "loss": 2.3018, + "step": 9147 + }, + { + "epoch": 0.4907725321888412, + "grad_norm": 1.5213627815246582, + "learning_rate": 2.7004780662660104e-06, + "loss": 2.5183, + "step": 9148 + }, + { + "epoch": 0.49082618025751074, + "grad_norm": 2.090266466140747, + "learning_rate": 2.7000450605940924e-06, + "loss": 2.3968, + "step": 9149 + }, + { + "epoch": 0.49087982832618027, + "grad_norm": 1.4736955165863037, + "learning_rate": 2.6996120488822602e-06, + "loss": 1.8968, + "step": 9150 + }, + { + "epoch": 0.4909334763948498, + "grad_norm": 1.5239720344543457, + "learning_rate": 2.699179031143589e-06, + "loss": 2.1433, + "step": 9151 + }, + { + "epoch": 0.49098712446351933, + "grad_norm": 1.4817049503326416, + "learning_rate": 2.6987460073911516e-06, + "loss": 2.4249, + "step": 9152 + }, + { + "epoch": 0.49104077253218886, + "grad_norm": 1.5078792572021484, + "learning_rate": 2.6983129776380217e-06, + "loss": 2.4017, + "step": 9153 + }, + { + "epoch": 0.4910944206008584, + "grad_norm": 1.4571313858032227, + "learning_rate": 2.697879941897276e-06, + "loss": 2.5346, + "step": 9154 + }, + { + "epoch": 0.49114806866952787, + "grad_norm": 1.616053581237793, + "learning_rate": 2.697446900181987e-06, + "loss": 2.389, + "step": 9155 + }, + { + "epoch": 0.4912017167381974, + "grad_norm": 2.026289224624634, + "learning_rate": 2.69701385250523e-06, + "loss": 2.3012, + "step": 9156 + }, + { + "epoch": 0.49125536480686693, + "grad_norm": 1.9737440347671509, + "learning_rate": 2.6965807988800805e-06, + "loss": 1.3164, + "step": 9157 + }, + { + "epoch": 0.49130901287553647, + "grad_norm": 1.7062712907791138, + "learning_rate": 2.696147739319613e-06, + "loss": 2.2185, + "step": 9158 + }, + { + "epoch": 0.491362660944206, + "grad_norm": 1.5037875175476074, + "learning_rate": 2.695714673836902e-06, + "loss": 2.1502, + "step": 9159 + }, + { + "epoch": 0.49141630901287553, + "grad_norm": 1.5991556644439697, + "learning_rate": 2.6952816024450246e-06, + "loss": 2.1954, + "step": 9160 + }, + { + "epoch": 0.49146995708154506, + "grad_norm": 1.803530216217041, + "learning_rate": 2.6948485251570553e-06, + "loss": 2.496, + "step": 9161 + }, + { + "epoch": 0.4915236051502146, + "grad_norm": 1.6792042255401611, + "learning_rate": 2.694415441986071e-06, + "loss": 2.2274, + "step": 9162 + }, + { + "epoch": 0.4915772532188841, + "grad_norm": 1.4707893133163452, + "learning_rate": 2.6939823529451455e-06, + "loss": 1.8714, + "step": 9163 + }, + { + "epoch": 0.49163090128755366, + "grad_norm": 1.6090956926345825, + "learning_rate": 2.693549258047357e-06, + "loss": 2.409, + "step": 9164 + }, + { + "epoch": 0.4916845493562232, + "grad_norm": 1.8623753786087036, + "learning_rate": 2.693116157305781e-06, + "loss": 2.3412, + "step": 9165 + }, + { + "epoch": 0.4917381974248927, + "grad_norm": 1.7044168710708618, + "learning_rate": 2.6926830507334943e-06, + "loss": 2.6004, + "step": 9166 + }, + { + "epoch": 0.49179184549356225, + "grad_norm": 1.4472486972808838, + "learning_rate": 2.6922499383435743e-06, + "loss": 2.0356, + "step": 9167 + }, + { + "epoch": 0.4918454935622318, + "grad_norm": 1.6207995414733887, + "learning_rate": 2.691816820149096e-06, + "loss": 2.2958, + "step": 9168 + }, + { + "epoch": 0.49189914163090126, + "grad_norm": 1.6419016122817993, + "learning_rate": 2.6913836961631368e-06, + "loss": 2.0466, + "step": 9169 + }, + { + "epoch": 0.4919527896995708, + "grad_norm": 1.5072258710861206, + "learning_rate": 2.6909505663987757e-06, + "loss": 2.2189, + "step": 9170 + }, + { + "epoch": 0.4920064377682403, + "grad_norm": 2.0649914741516113, + "learning_rate": 2.690517430869089e-06, + "loss": 2.2355, + "step": 9171 + }, + { + "epoch": 0.49206008583690986, + "grad_norm": 1.536091685295105, + "learning_rate": 2.690084289587154e-06, + "loss": 2.2772, + "step": 9172 + }, + { + "epoch": 0.4921137339055794, + "grad_norm": 1.7106235027313232, + "learning_rate": 2.6896511425660483e-06, + "loss": 2.2502, + "step": 9173 + }, + { + "epoch": 0.4921673819742489, + "grad_norm": 1.7535786628723145, + "learning_rate": 2.6892179898188504e-06, + "loss": 2.0655, + "step": 9174 + }, + { + "epoch": 0.49222103004291845, + "grad_norm": 1.5067572593688965, + "learning_rate": 2.6887848313586373e-06, + "loss": 1.7163, + "step": 9175 + }, + { + "epoch": 0.492274678111588, + "grad_norm": 1.811293601989746, + "learning_rate": 2.6883516671984895e-06, + "loss": 2.3295, + "step": 9176 + }, + { + "epoch": 0.4923283261802575, + "grad_norm": 1.3210101127624512, + "learning_rate": 2.6879184973514833e-06, + "loss": 2.3064, + "step": 9177 + }, + { + "epoch": 0.49238197424892705, + "grad_norm": 1.7063063383102417, + "learning_rate": 2.6874853218306985e-06, + "loss": 2.1531, + "step": 9178 + }, + { + "epoch": 0.4924356223175966, + "grad_norm": 1.6465473175048828, + "learning_rate": 2.6870521406492127e-06, + "loss": 2.2447, + "step": 9179 + }, + { + "epoch": 0.4924892703862661, + "grad_norm": 1.9008513689041138, + "learning_rate": 2.686618953820106e-06, + "loss": 2.1531, + "step": 9180 + }, + { + "epoch": 0.49254291845493564, + "grad_norm": 1.4297319650650024, + "learning_rate": 2.686185761356456e-06, + "loss": 2.1813, + "step": 9181 + }, + { + "epoch": 0.4925965665236052, + "grad_norm": 1.5347731113433838, + "learning_rate": 2.685752563271344e-06, + "loss": 2.2345, + "step": 9182 + }, + { + "epoch": 0.4926502145922747, + "grad_norm": 1.2003395557403564, + "learning_rate": 2.6853193595778484e-06, + "loss": 2.3288, + "step": 9183 + }, + { + "epoch": 0.4927038626609442, + "grad_norm": 1.4716925621032715, + "learning_rate": 2.684886150289049e-06, + "loss": 1.6755, + "step": 9184 + }, + { + "epoch": 0.4927575107296137, + "grad_norm": 1.56732177734375, + "learning_rate": 2.6844529354180255e-06, + "loss": 2.4739, + "step": 9185 + }, + { + "epoch": 0.49281115879828324, + "grad_norm": 1.690914273262024, + "learning_rate": 2.684019714977857e-06, + "loss": 2.3828, + "step": 9186 + }, + { + "epoch": 0.4928648068669528, + "grad_norm": 1.4978433847427368, + "learning_rate": 2.6835864889816245e-06, + "loss": 2.4127, + "step": 9187 + }, + { + "epoch": 0.4929184549356223, + "grad_norm": 1.5619347095489502, + "learning_rate": 2.6831532574424084e-06, + "loss": 2.4474, + "step": 9188 + }, + { + "epoch": 0.49297210300429184, + "grad_norm": 2.1275243759155273, + "learning_rate": 2.68272002037329e-06, + "loss": 2.4285, + "step": 9189 + }, + { + "epoch": 0.49302575107296137, + "grad_norm": 1.4582114219665527, + "learning_rate": 2.682286777787348e-06, + "loss": 2.1967, + "step": 9190 + }, + { + "epoch": 0.4930793991416309, + "grad_norm": 1.4533578157424927, + "learning_rate": 2.6818535296976638e-06, + "loss": 1.9971, + "step": 9191 + }, + { + "epoch": 0.49313304721030043, + "grad_norm": 1.2893530130386353, + "learning_rate": 2.6814202761173184e-06, + "loss": 2.3457, + "step": 9192 + }, + { + "epoch": 0.49318669527896997, + "grad_norm": 1.6546639204025269, + "learning_rate": 2.6809870170593944e-06, + "loss": 2.2263, + "step": 9193 + }, + { + "epoch": 0.4932403433476395, + "grad_norm": 2.1782190799713135, + "learning_rate": 2.680553752536971e-06, + "loss": 2.3337, + "step": 9194 + }, + { + "epoch": 0.49329399141630903, + "grad_norm": 1.386637568473816, + "learning_rate": 2.680120482563131e-06, + "loss": 2.3253, + "step": 9195 + }, + { + "epoch": 0.49334763948497856, + "grad_norm": 1.650132417678833, + "learning_rate": 2.679687207150955e-06, + "loss": 2.2504, + "step": 9196 + }, + { + "epoch": 0.4934012875536481, + "grad_norm": 1.2144780158996582, + "learning_rate": 2.6792539263135253e-06, + "loss": 1.9278, + "step": 9197 + }, + { + "epoch": 0.49345493562231757, + "grad_norm": 1.3294143676757812, + "learning_rate": 2.6788206400639243e-06, + "loss": 2.1254, + "step": 9198 + }, + { + "epoch": 0.4935085836909871, + "grad_norm": 1.4440969228744507, + "learning_rate": 2.6783873484152332e-06, + "loss": 2.1053, + "step": 9199 + }, + { + "epoch": 0.49356223175965663, + "grad_norm": 1.5957558155059814, + "learning_rate": 2.6779540513805354e-06, + "loss": 2.3297, + "step": 9200 + }, + { + "epoch": 0.49361587982832617, + "grad_norm": 1.4901102781295776, + "learning_rate": 2.6775207489729123e-06, + "loss": 2.2298, + "step": 9201 + }, + { + "epoch": 0.4936695278969957, + "grad_norm": 1.4887256622314453, + "learning_rate": 2.6770874412054472e-06, + "loss": 2.3704, + "step": 9202 + }, + { + "epoch": 0.49372317596566523, + "grad_norm": 1.4285328388214111, + "learning_rate": 2.676654128091222e-06, + "loss": 2.4133, + "step": 9203 + }, + { + "epoch": 0.49377682403433476, + "grad_norm": 1.3463314771652222, + "learning_rate": 2.67622080964332e-06, + "loss": 2.0478, + "step": 9204 + }, + { + "epoch": 0.4938304721030043, + "grad_norm": 2.5579488277435303, + "learning_rate": 2.675787485874825e-06, + "loss": 2.2851, + "step": 9205 + }, + { + "epoch": 0.4938841201716738, + "grad_norm": 1.5274778604507446, + "learning_rate": 2.6753541567988194e-06, + "loss": 1.9359, + "step": 9206 + }, + { + "epoch": 0.49393776824034336, + "grad_norm": 1.687256932258606, + "learning_rate": 2.6749208224283873e-06, + "loss": 2.2595, + "step": 9207 + }, + { + "epoch": 0.4939914163090129, + "grad_norm": 1.4733461141586304, + "learning_rate": 2.6744874827766116e-06, + "loss": 2.2597, + "step": 9208 + }, + { + "epoch": 0.4940450643776824, + "grad_norm": 1.4140058755874634, + "learning_rate": 2.674054137856576e-06, + "loss": 2.3853, + "step": 9209 + }, + { + "epoch": 0.49409871244635195, + "grad_norm": 1.4855026006698608, + "learning_rate": 2.6736207876813646e-06, + "loss": 2.1346, + "step": 9210 + }, + { + "epoch": 0.4941523605150215, + "grad_norm": 1.496519923210144, + "learning_rate": 2.6731874322640628e-06, + "loss": 2.3103, + "step": 9211 + }, + { + "epoch": 0.494206008583691, + "grad_norm": 1.7072017192840576, + "learning_rate": 2.6727540716177517e-06, + "loss": 2.2911, + "step": 9212 + }, + { + "epoch": 0.4942596566523605, + "grad_norm": 1.592624306678772, + "learning_rate": 2.6723207057555185e-06, + "loss": 2.3401, + "step": 9213 + }, + { + "epoch": 0.49431330472103, + "grad_norm": 1.476975440979004, + "learning_rate": 2.671887334690447e-06, + "loss": 1.9712, + "step": 9214 + }, + { + "epoch": 0.49436695278969955, + "grad_norm": 2.0052835941314697, + "learning_rate": 2.6714539584356204e-06, + "loss": 2.0328, + "step": 9215 + }, + { + "epoch": 0.4944206008583691, + "grad_norm": 1.3355295658111572, + "learning_rate": 2.6710205770041254e-06, + "loss": 1.854, + "step": 9216 + }, + { + "epoch": 0.4944742489270386, + "grad_norm": 1.2718628644943237, + "learning_rate": 2.670587190409046e-06, + "loss": 2.1859, + "step": 9217 + }, + { + "epoch": 0.49452789699570815, + "grad_norm": 1.926589012145996, + "learning_rate": 2.6701537986634675e-06, + "loss": 2.1222, + "step": 9218 + }, + { + "epoch": 0.4945815450643777, + "grad_norm": 1.1267212629318237, + "learning_rate": 2.669720401780475e-06, + "loss": 1.7602, + "step": 9219 + }, + { + "epoch": 0.4946351931330472, + "grad_norm": 1.6535011529922485, + "learning_rate": 2.6692869997731547e-06, + "loss": 2.1572, + "step": 9220 + }, + { + "epoch": 0.49468884120171674, + "grad_norm": 1.5858726501464844, + "learning_rate": 2.6688535926545915e-06, + "loss": 2.2927, + "step": 9221 + }, + { + "epoch": 0.4947424892703863, + "grad_norm": 1.6230530738830566, + "learning_rate": 2.6684201804378716e-06, + "loss": 2.2651, + "step": 9222 + }, + { + "epoch": 0.4947961373390558, + "grad_norm": 1.6020499467849731, + "learning_rate": 2.6679867631360806e-06, + "loss": 2.3995, + "step": 9223 + }, + { + "epoch": 0.49484978540772534, + "grad_norm": 1.6170706748962402, + "learning_rate": 2.6675533407623058e-06, + "loss": 2.2902, + "step": 9224 + }, + { + "epoch": 0.49490343347639487, + "grad_norm": 1.5128365755081177, + "learning_rate": 2.6671199133296305e-06, + "loss": 2.2474, + "step": 9225 + }, + { + "epoch": 0.4949570815450644, + "grad_norm": 1.5529385805130005, + "learning_rate": 2.6666864808511435e-06, + "loss": 2.3881, + "step": 9226 + }, + { + "epoch": 0.4950107296137339, + "grad_norm": 1.6686440706253052, + "learning_rate": 2.6662530433399306e-06, + "loss": 2.1172, + "step": 9227 + }, + { + "epoch": 0.4950643776824034, + "grad_norm": 1.7143278121948242, + "learning_rate": 2.665819600809079e-06, + "loss": 2.4529, + "step": 9228 + }, + { + "epoch": 0.49511802575107294, + "grad_norm": 1.3620227575302124, + "learning_rate": 2.6653861532716752e-06, + "loss": 2.168, + "step": 9229 + }, + { + "epoch": 0.4951716738197425, + "grad_norm": 1.5119584798812866, + "learning_rate": 2.664952700740806e-06, + "loss": 2.1649, + "step": 9230 + }, + { + "epoch": 0.495225321888412, + "grad_norm": NaN, + "learning_rate": 2.664952700740806e-06, + "loss": 2.2063, + "step": 9231 + }, + { + "epoch": 0.49527896995708154, + "grad_norm": 1.4864317178726196, + "learning_rate": 2.6645192432295575e-06, + "loss": 2.3918, + "step": 9232 + }, + { + "epoch": 0.49533261802575107, + "grad_norm": 1.4728572368621826, + "learning_rate": 2.664085780751019e-06, + "loss": 1.9528, + "step": 9233 + }, + { + "epoch": 0.4953862660944206, + "grad_norm": 1.48419189453125, + "learning_rate": 2.663652313318278e-06, + "loss": 1.9344, + "step": 9234 + }, + { + "epoch": 0.49543991416309013, + "grad_norm": 1.9512670040130615, + "learning_rate": 2.66321884094442e-06, + "loss": 2.2995, + "step": 9235 + }, + { + "epoch": 0.49549356223175967, + "grad_norm": 1.588759183883667, + "learning_rate": 2.662785363642534e-06, + "loss": 2.2928, + "step": 9236 + }, + { + "epoch": 0.4955472103004292, + "grad_norm": 1.7768653631210327, + "learning_rate": 2.6623518814257073e-06, + "loss": 2.4037, + "step": 9237 + }, + { + "epoch": 0.49560085836909873, + "grad_norm": 1.7050732374191284, + "learning_rate": 2.661918394307028e-06, + "loss": 2.2319, + "step": 9238 + }, + { + "epoch": 0.49565450643776826, + "grad_norm": 1.6108057498931885, + "learning_rate": 2.6614849022995857e-06, + "loss": 2.2974, + "step": 9239 + }, + { + "epoch": 0.4957081545064378, + "grad_norm": 1.8353748321533203, + "learning_rate": 2.661051405416467e-06, + "loss": 2.2277, + "step": 9240 + }, + { + "epoch": 0.49576180257510727, + "grad_norm": 1.5699474811553955, + "learning_rate": 2.6606179036707614e-06, + "loss": 2.4554, + "step": 9241 + }, + { + "epoch": 0.4958154506437768, + "grad_norm": 1.2580887079238892, + "learning_rate": 2.660184397075557e-06, + "loss": 2.2325, + "step": 9242 + }, + { + "epoch": 0.49586909871244633, + "grad_norm": 1.8054271936416626, + "learning_rate": 2.6597508856439424e-06, + "loss": 2.1656, + "step": 9243 + }, + { + "epoch": 0.49592274678111586, + "grad_norm": 1.5650728940963745, + "learning_rate": 2.6593173693890074e-06, + "loss": 1.9765, + "step": 9244 + }, + { + "epoch": 0.4959763948497854, + "grad_norm": 1.4243245124816895, + "learning_rate": 2.65888384832384e-06, + "loss": 2.1868, + "step": 9245 + }, + { + "epoch": 0.4960300429184549, + "grad_norm": 1.5118536949157715, + "learning_rate": 2.65845032246153e-06, + "loss": 2.3483, + "step": 9246 + }, + { + "epoch": 0.49608369098712446, + "grad_norm": 1.6177427768707275, + "learning_rate": 2.6580167918151665e-06, + "loss": 2.1472, + "step": 9247 + }, + { + "epoch": 0.496137339055794, + "grad_norm": 1.566419005393982, + "learning_rate": 2.6575832563978395e-06, + "loss": 2.4248, + "step": 9248 + }, + { + "epoch": 0.4961909871244635, + "grad_norm": 1.2978357076644897, + "learning_rate": 2.6571497162226365e-06, + "loss": 2.0985, + "step": 9249 + }, + { + "epoch": 0.49624463519313305, + "grad_norm": 1.6484462022781372, + "learning_rate": 2.6567161713026506e-06, + "loss": 2.1223, + "step": 9250 + }, + { + "epoch": 0.4962982832618026, + "grad_norm": 1.6576454639434814, + "learning_rate": 2.6562826216509696e-06, + "loss": 2.2468, + "step": 9251 + }, + { + "epoch": 0.4963519313304721, + "grad_norm": 1.781693935394287, + "learning_rate": 2.6558490672806854e-06, + "loss": 2.1429, + "step": 9252 + }, + { + "epoch": 0.49640557939914165, + "grad_norm": 1.4604313373565674, + "learning_rate": 2.6554155082048854e-06, + "loss": 2.0628, + "step": 9253 + }, + { + "epoch": 0.4964592274678112, + "grad_norm": 1.3581655025482178, + "learning_rate": 2.654981944436662e-06, + "loss": 1.8715, + "step": 9254 + }, + { + "epoch": 0.4965128755364807, + "grad_norm": 1.5426658391952515, + "learning_rate": 2.6545483759891037e-06, + "loss": 2.2995, + "step": 9255 + }, + { + "epoch": 0.4965665236051502, + "grad_norm": 1.7887117862701416, + "learning_rate": 2.6541148028753046e-06, + "loss": 2.3047, + "step": 9256 + }, + { + "epoch": 0.4966201716738197, + "grad_norm": 1.687641978263855, + "learning_rate": 2.6536812251083525e-06, + "loss": 2.3091, + "step": 9257 + }, + { + "epoch": 0.49667381974248925, + "grad_norm": 1.6305876970291138, + "learning_rate": 2.653247642701339e-06, + "loss": 2.4412, + "step": 9258 + }, + { + "epoch": 0.4967274678111588, + "grad_norm": 1.575639009475708, + "learning_rate": 2.6528140556673558e-06, + "loss": 2.2905, + "step": 9259 + }, + { + "epoch": 0.4967811158798283, + "grad_norm": 1.4382482767105103, + "learning_rate": 2.6523804640194937e-06, + "loss": 2.0476, + "step": 9260 + }, + { + "epoch": 0.49683476394849785, + "grad_norm": 1.415600299835205, + "learning_rate": 2.651946867770844e-06, + "loss": 2.1305, + "step": 9261 + }, + { + "epoch": 0.4968884120171674, + "grad_norm": 1.5099796056747437, + "learning_rate": 2.651513266934498e-06, + "loss": 2.0486, + "step": 9262 + }, + { + "epoch": 0.4969420600858369, + "grad_norm": 1.4680988788604736, + "learning_rate": 2.6510796615235474e-06, + "loss": 2.2786, + "step": 9263 + }, + { + "epoch": 0.49699570815450644, + "grad_norm": 1.4176087379455566, + "learning_rate": 2.650646051551084e-06, + "loss": 2.2984, + "step": 9264 + }, + { + "epoch": 0.497049356223176, + "grad_norm": 1.5463110208511353, + "learning_rate": 2.6502124370301994e-06, + "loss": 2.1603, + "step": 9265 + }, + { + "epoch": 0.4971030042918455, + "grad_norm": 1.6988277435302734, + "learning_rate": 2.649778817973987e-06, + "loss": 2.3629, + "step": 9266 + }, + { + "epoch": 0.49715665236051504, + "grad_norm": 1.437639594078064, + "learning_rate": 2.6493451943955373e-06, + "loss": 2.2753, + "step": 9267 + }, + { + "epoch": 0.49721030042918457, + "grad_norm": 1.5123748779296875, + "learning_rate": 2.648911566307943e-06, + "loss": 2.4084, + "step": 9268 + }, + { + "epoch": 0.4972639484978541, + "grad_norm": 1.8173130750656128, + "learning_rate": 2.648477933724297e-06, + "loss": 2.379, + "step": 9269 + }, + { + "epoch": 0.4973175965665236, + "grad_norm": 1.338335633277893, + "learning_rate": 2.648044296657692e-06, + "loss": 2.2444, + "step": 9270 + }, + { + "epoch": 0.4973712446351931, + "grad_norm": 1.765030860900879, + "learning_rate": 2.647610655121219e-06, + "loss": 2.3715, + "step": 9271 + }, + { + "epoch": 0.49742489270386264, + "grad_norm": 1.4405921697616577, + "learning_rate": 2.6471770091279725e-06, + "loss": 2.1446, + "step": 9272 + }, + { + "epoch": 0.4974785407725322, + "grad_norm": 1.5691782236099243, + "learning_rate": 2.646743358691046e-06, + "loss": 2.3446, + "step": 9273 + }, + { + "epoch": 0.4975321888412017, + "grad_norm": 1.4336931705474854, + "learning_rate": 2.6463097038235313e-06, + "loss": 2.2462, + "step": 9274 + }, + { + "epoch": 0.49758583690987124, + "grad_norm": 1.4622300863265991, + "learning_rate": 2.645876044538522e-06, + "loss": 2.2098, + "step": 9275 + }, + { + "epoch": 0.49763948497854077, + "grad_norm": 1.6325781345367432, + "learning_rate": 2.6454423808491113e-06, + "loss": 2.2744, + "step": 9276 + }, + { + "epoch": 0.4976931330472103, + "grad_norm": 1.5795032978057861, + "learning_rate": 2.6450087127683926e-06, + "loss": 2.2969, + "step": 9277 + }, + { + "epoch": 0.49774678111587983, + "grad_norm": 1.4038335084915161, + "learning_rate": 2.6445750403094607e-06, + "loss": 2.3, + "step": 9278 + }, + { + "epoch": 0.49780042918454936, + "grad_norm": 1.9932184219360352, + "learning_rate": 2.644141363485408e-06, + "loss": 2.1185, + "step": 9279 + }, + { + "epoch": 0.4978540772532189, + "grad_norm": 2.3069956302642822, + "learning_rate": 2.6437076823093296e-06, + "loss": 2.4745, + "step": 9280 + }, + { + "epoch": 0.49790772532188843, + "grad_norm": 1.5081312656402588, + "learning_rate": 2.6432739967943177e-06, + "loss": 2.2111, + "step": 9281 + }, + { + "epoch": 0.49796137339055796, + "grad_norm": 3.48547625541687, + "learning_rate": 2.642840306953469e-06, + "loss": 2.2264, + "step": 9282 + }, + { + "epoch": 0.4980150214592275, + "grad_norm": 1.5419590473175049, + "learning_rate": 2.642406612799875e-06, + "loss": 2.3665, + "step": 9283 + }, + { + "epoch": 0.498068669527897, + "grad_norm": 1.4918646812438965, + "learning_rate": 2.6419729143466322e-06, + "loss": 2.3357, + "step": 9284 + }, + { + "epoch": 0.4981223175965665, + "grad_norm": 1.5226820707321167, + "learning_rate": 2.6415392116068344e-06, + "loss": 2.1732, + "step": 9285 + }, + { + "epoch": 0.49817596566523603, + "grad_norm": 1.6888201236724854, + "learning_rate": 2.641105504593577e-06, + "loss": 2.3623, + "step": 9286 + }, + { + "epoch": 0.49822961373390556, + "grad_norm": 1.7468839883804321, + "learning_rate": 2.640671793319954e-06, + "loss": 2.1003, + "step": 9287 + }, + { + "epoch": 0.4982832618025751, + "grad_norm": 1.6799430847167969, + "learning_rate": 2.64023807779906e-06, + "loss": 2.1981, + "step": 9288 + }, + { + "epoch": 0.4983369098712446, + "grad_norm": 1.5961467027664185, + "learning_rate": 2.6398043580439907e-06, + "loss": 2.329, + "step": 9289 + }, + { + "epoch": 0.49839055793991416, + "grad_norm": 1.5417066812515259, + "learning_rate": 2.6393706340678415e-06, + "loss": 2.2293, + "step": 9290 + }, + { + "epoch": 0.4984442060085837, + "grad_norm": 2.0722239017486572, + "learning_rate": 2.6389369058837076e-06, + "loss": 2.5078, + "step": 9291 + }, + { + "epoch": 0.4984978540772532, + "grad_norm": 1.7002400159835815, + "learning_rate": 2.638503173504684e-06, + "loss": 2.2021, + "step": 9292 + }, + { + "epoch": 0.49855150214592275, + "grad_norm": 1.686484456062317, + "learning_rate": 2.638069436943867e-06, + "loss": 2.2846, + "step": 9293 + }, + { + "epoch": 0.4986051502145923, + "grad_norm": 1.580237865447998, + "learning_rate": 2.637635696214351e-06, + "loss": 2.2234, + "step": 9294 + }, + { + "epoch": 0.4986587982832618, + "grad_norm": 1.306093454360962, + "learning_rate": 2.637201951329233e-06, + "loss": 2.1903, + "step": 9295 + }, + { + "epoch": 0.49871244635193135, + "grad_norm": 1.5610854625701904, + "learning_rate": 2.6367682023016093e-06, + "loss": 2.1959, + "step": 9296 + }, + { + "epoch": 0.4987660944206009, + "grad_norm": 2.079503059387207, + "learning_rate": 2.6363344491445754e-06, + "loss": 2.2346, + "step": 9297 + }, + { + "epoch": 0.4988197424892704, + "grad_norm": 1.8659729957580566, + "learning_rate": 2.6359006918712273e-06, + "loss": 2.1353, + "step": 9298 + }, + { + "epoch": 0.4988733905579399, + "grad_norm": 2.5973072052001953, + "learning_rate": 2.635466930494661e-06, + "loss": 2.1689, + "step": 9299 + }, + { + "epoch": 0.4989270386266094, + "grad_norm": 1.5783244371414185, + "learning_rate": 2.635033165027974e-06, + "loss": 1.8083, + "step": 9300 + }, + { + "epoch": 0.49898068669527895, + "grad_norm": 1.5894012451171875, + "learning_rate": 2.634599395484262e-06, + "loss": 2.2644, + "step": 9301 + }, + { + "epoch": 0.4990343347639485, + "grad_norm": 1.7999049425125122, + "learning_rate": 2.634165621876622e-06, + "loss": 2.2648, + "step": 9302 + }, + { + "epoch": 0.499087982832618, + "grad_norm": 1.6926331520080566, + "learning_rate": 2.633731844218151e-06, + "loss": 2.2256, + "step": 9303 + }, + { + "epoch": 0.49914163090128755, + "grad_norm": 1.6586172580718994, + "learning_rate": 2.6332980625219457e-06, + "loss": 2.3104, + "step": 9304 + }, + { + "epoch": 0.4991952789699571, + "grad_norm": 1.7066971063613892, + "learning_rate": 2.6328642768011033e-06, + "loss": 2.3723, + "step": 9305 + }, + { + "epoch": 0.4992489270386266, + "grad_norm": 1.4443484544754028, + "learning_rate": 2.6324304870687208e-06, + "loss": 2.1615, + "step": 9306 + }, + { + "epoch": 0.49930257510729614, + "grad_norm": 1.389529824256897, + "learning_rate": 2.631996693337896e-06, + "loss": 1.7456, + "step": 9307 + }, + { + "epoch": 0.4993562231759657, + "grad_norm": 1.5105700492858887, + "learning_rate": 2.6315628956217253e-06, + "loss": 1.8024, + "step": 9308 + }, + { + "epoch": 0.4994098712446352, + "grad_norm": 1.512529730796814, + "learning_rate": 2.6311290939333084e-06, + "loss": 2.2155, + "step": 9309 + }, + { + "epoch": 0.49946351931330474, + "grad_norm": 1.6697055101394653, + "learning_rate": 2.63069528828574e-06, + "loss": 2.5712, + "step": 9310 + }, + { + "epoch": 0.49951716738197427, + "grad_norm": 1.534751296043396, + "learning_rate": 2.6302614786921206e-06, + "loss": 2.4064, + "step": 9311 + }, + { + "epoch": 0.4995708154506438, + "grad_norm": 1.5884662866592407, + "learning_rate": 2.629827665165546e-06, + "loss": 2.3474, + "step": 9312 + }, + { + "epoch": 0.4996244635193133, + "grad_norm": 2.2350728511810303, + "learning_rate": 2.6293938477191157e-06, + "loss": 2.3112, + "step": 9313 + }, + { + "epoch": 0.4996781115879828, + "grad_norm": 1.7637885808944702, + "learning_rate": 2.6289600263659278e-06, + "loss": 2.281, + "step": 9314 + }, + { + "epoch": 0.49973175965665234, + "grad_norm": 1.401785135269165, + "learning_rate": 2.6285262011190793e-06, + "loss": 2.2397, + "step": 9315 + }, + { + "epoch": 0.4997854077253219, + "grad_norm": 1.2865015268325806, + "learning_rate": 2.6280923719916696e-06, + "loss": 2.1139, + "step": 9316 + }, + { + "epoch": 0.4998390557939914, + "grad_norm": 1.5077418088912964, + "learning_rate": 2.6276585389967964e-06, + "loss": 2.4542, + "step": 9317 + }, + { + "epoch": 0.49989270386266094, + "grad_norm": 1.5955067873001099, + "learning_rate": 2.62722470214756e-06, + "loss": 2.0926, + "step": 9318 + }, + { + "epoch": 0.49994635193133047, + "grad_norm": 1.262532114982605, + "learning_rate": 2.6267908614570575e-06, + "loss": 1.9878, + "step": 9319 + }, + { + "epoch": 0.5, + "grad_norm": 1.7104852199554443, + "learning_rate": 2.626357016938388e-06, + "loss": 2.3244, + "step": 9320 + }, + { + "epoch": 0.5000536480686695, + "grad_norm": 1.5729069709777832, + "learning_rate": 2.625923168604651e-06, + "loss": 2.2671, + "step": 9321 + }, + { + "epoch": 0.5001072961373391, + "grad_norm": 1.2886667251586914, + "learning_rate": 2.6254893164689453e-06, + "loss": 2.1155, + "step": 9322 + }, + { + "epoch": 0.5001609442060085, + "grad_norm": 1.491030216217041, + "learning_rate": 2.62505546054437e-06, + "loss": 2.4687, + "step": 9323 + }, + { + "epoch": 0.5002145922746781, + "grad_norm": 1.698343276977539, + "learning_rate": 2.6246216008440245e-06, + "loss": 2.474, + "step": 9324 + }, + { + "epoch": 0.5002682403433476, + "grad_norm": 1.6345332860946655, + "learning_rate": 2.6241877373810082e-06, + "loss": 2.4763, + "step": 9325 + }, + { + "epoch": 0.5003218884120172, + "grad_norm": 1.7429673671722412, + "learning_rate": 2.623753870168421e-06, + "loss": 2.0485, + "step": 9326 + }, + { + "epoch": 0.5003755364806867, + "grad_norm": 1.7975529432296753, + "learning_rate": 2.623319999219362e-06, + "loss": 1.92, + "step": 9327 + }, + { + "epoch": 0.5004291845493563, + "grad_norm": 1.2865033149719238, + "learning_rate": 2.6228861245469313e-06, + "loss": 2.2807, + "step": 9328 + }, + { + "epoch": 0.5004828326180257, + "grad_norm": 1.3098294734954834, + "learning_rate": 2.6224522461642288e-06, + "loss": 2.0721, + "step": 9329 + }, + { + "epoch": 0.5005364806866953, + "grad_norm": 1.4888266324996948, + "learning_rate": 2.6220183640843536e-06, + "loss": 2.3881, + "step": 9330 + }, + { + "epoch": 0.5005901287553648, + "grad_norm": 1.620855450630188, + "learning_rate": 2.6215844783204083e-06, + "loss": 2.2865, + "step": 9331 + }, + { + "epoch": 0.5006437768240344, + "grad_norm": 1.6032512187957764, + "learning_rate": 2.62115058888549e-06, + "loss": 2.1822, + "step": 9332 + }, + { + "epoch": 0.5006974248927039, + "grad_norm": 1.7138921022415161, + "learning_rate": 2.6207166957927e-06, + "loss": 2.25, + "step": 9333 + }, + { + "epoch": 0.5007510729613734, + "grad_norm": 1.4407072067260742, + "learning_rate": 2.62028279905514e-06, + "loss": 2.3563, + "step": 9334 + }, + { + "epoch": 0.5008047210300429, + "grad_norm": 1.2342724800109863, + "learning_rate": 2.6198488986859095e-06, + "loss": 2.0582, + "step": 9335 + }, + { + "epoch": 0.5008583690987124, + "grad_norm": 1.4230976104736328, + "learning_rate": 2.6194149946981096e-06, + "loss": 2.4373, + "step": 9336 + }, + { + "epoch": 0.500912017167382, + "grad_norm": 1.1044255495071411, + "learning_rate": 2.6189810871048406e-06, + "loss": 2.1594, + "step": 9337 + }, + { + "epoch": 0.5009656652360515, + "grad_norm": 1.6123310327529907, + "learning_rate": 2.6185471759192033e-06, + "loss": 2.345, + "step": 9338 + }, + { + "epoch": 0.501019313304721, + "grad_norm": 1.689627766609192, + "learning_rate": 2.618113261154298e-06, + "loss": 2.4886, + "step": 9339 + }, + { + "epoch": 0.5010729613733905, + "grad_norm": 1.4243237972259521, + "learning_rate": 2.617679342823229e-06, + "loss": 2.2299, + "step": 9340 + }, + { + "epoch": 0.5011266094420601, + "grad_norm": 1.5934579372406006, + "learning_rate": 2.6172454209390937e-06, + "loss": 2.3311, + "step": 9341 + }, + { + "epoch": 0.5011802575107296, + "grad_norm": 1.5327214002609253, + "learning_rate": 2.6168114955149947e-06, + "loss": 2.2324, + "step": 9342 + }, + { + "epoch": 0.5012339055793992, + "grad_norm": 1.4858577251434326, + "learning_rate": 2.6163775665640344e-06, + "loss": 2.2287, + "step": 9343 + }, + { + "epoch": 0.5012875536480687, + "grad_norm": 1.8768996000289917, + "learning_rate": 2.6159436340993132e-06, + "loss": 2.287, + "step": 9344 + }, + { + "epoch": 0.5013412017167382, + "grad_norm": 1.4463534355163574, + "learning_rate": 2.6155096981339333e-06, + "loss": 2.2099, + "step": 9345 + }, + { + "epoch": 0.5013948497854077, + "grad_norm": 1.4638453722000122, + "learning_rate": 2.615075758680996e-06, + "loss": 2.4124, + "step": 9346 + }, + { + "epoch": 0.5014484978540773, + "grad_norm": 1.4665781259536743, + "learning_rate": 2.614641815753603e-06, + "loss": 2.4085, + "step": 9347 + }, + { + "epoch": 0.5015021459227468, + "grad_norm": 1.7976751327514648, + "learning_rate": 2.614207869364857e-06, + "loss": 2.3419, + "step": 9348 + }, + { + "epoch": 0.5015557939914163, + "grad_norm": 1.6300972700119019, + "learning_rate": 2.61377391952786e-06, + "loss": 2.3122, + "step": 9349 + }, + { + "epoch": 0.5016094420600858, + "grad_norm": 1.5531550645828247, + "learning_rate": 2.6133399662557124e-06, + "loss": 2.3084, + "step": 9350 + }, + { + "epoch": 0.5016630901287553, + "grad_norm": 1.3532021045684814, + "learning_rate": 2.612906009561519e-06, + "loss": 2.0687, + "step": 9351 + }, + { + "epoch": 0.5017167381974249, + "grad_norm": 1.425213098526001, + "learning_rate": 2.6124720494583805e-06, + "loss": 2.0973, + "step": 9352 + }, + { + "epoch": 0.5017703862660944, + "grad_norm": 1.1199367046356201, + "learning_rate": 2.6120380859594e-06, + "loss": 1.9336, + "step": 9353 + }, + { + "epoch": 0.501824034334764, + "grad_norm": 1.7510643005371094, + "learning_rate": 2.6116041190776796e-06, + "loss": 2.3328, + "step": 9354 + }, + { + "epoch": 0.5018776824034334, + "grad_norm": 1.6097170114517212, + "learning_rate": 2.6111701488263224e-06, + "loss": 2.4687, + "step": 9355 + }, + { + "epoch": 0.501931330472103, + "grad_norm": 1.7225233316421509, + "learning_rate": 2.6107361752184306e-06, + "loss": 2.2843, + "step": 9356 + }, + { + "epoch": 0.5019849785407725, + "grad_norm": 2.722534418106079, + "learning_rate": 2.610302198267107e-06, + "loss": 2.3391, + "step": 9357 + }, + { + "epoch": 0.5020386266094421, + "grad_norm": 1.578925371170044, + "learning_rate": 2.6098682179854568e-06, + "loss": 1.8834, + "step": 9358 + }, + { + "epoch": 0.5020922746781116, + "grad_norm": 1.6461807489395142, + "learning_rate": 2.60943423438658e-06, + "loss": 2.2096, + "step": 9359 + }, + { + "epoch": 0.5021459227467812, + "grad_norm": 1.802487850189209, + "learning_rate": 2.6090002474835814e-06, + "loss": 2.335, + "step": 9360 + }, + { + "epoch": 0.5021995708154506, + "grad_norm": 1.6711472272872925, + "learning_rate": 2.6085662572895635e-06, + "loss": 2.295, + "step": 9361 + }, + { + "epoch": 0.5022532188841202, + "grad_norm": 1.9528995752334595, + "learning_rate": 2.608132263817631e-06, + "loss": 2.1156, + "step": 9362 + }, + { + "epoch": 0.5023068669527897, + "grad_norm": 1.4589587450027466, + "learning_rate": 2.6076982670808855e-06, + "loss": 2.6136, + "step": 9363 + }, + { + "epoch": 0.5023605150214592, + "grad_norm": 1.1623284816741943, + "learning_rate": 2.6072642670924313e-06, + "loss": 2.2079, + "step": 9364 + }, + { + "epoch": 0.5024141630901288, + "grad_norm": 1.6035655736923218, + "learning_rate": 2.6068302638653733e-06, + "loss": 2.2268, + "step": 9365 + }, + { + "epoch": 0.5024678111587982, + "grad_norm": 1.5371960401535034, + "learning_rate": 2.6063962574128134e-06, + "loss": 2.3968, + "step": 9366 + }, + { + "epoch": 0.5025214592274678, + "grad_norm": 1.5626661777496338, + "learning_rate": 2.6059622477478565e-06, + "loss": 2.3325, + "step": 9367 + }, + { + "epoch": 0.5025751072961373, + "grad_norm": 1.6579841375350952, + "learning_rate": 2.6055282348836066e-06, + "loss": 2.0495, + "step": 9368 + }, + { + "epoch": 0.5026287553648069, + "grad_norm": 1.217958927154541, + "learning_rate": 2.605094218833167e-06, + "loss": 2.1449, + "step": 9369 + }, + { + "epoch": 0.5026824034334764, + "grad_norm": 1.4526989459991455, + "learning_rate": 2.604660199609642e-06, + "loss": 2.3172, + "step": 9370 + }, + { + "epoch": 0.502736051502146, + "grad_norm": 1.6100527048110962, + "learning_rate": 2.6042261772261374e-06, + "loss": 2.3136, + "step": 9371 + }, + { + "epoch": 0.5027896995708154, + "grad_norm": 1.474117398262024, + "learning_rate": 2.603792151695755e-06, + "loss": 2.134, + "step": 9372 + }, + { + "epoch": 0.502843347639485, + "grad_norm": 2.913107395172119, + "learning_rate": 2.603358123031601e-06, + "loss": 2.2692, + "step": 9373 + }, + { + "epoch": 0.5028969957081545, + "grad_norm": 1.6982510089874268, + "learning_rate": 2.6029240912467796e-06, + "loss": 2.0886, + "step": 9374 + }, + { + "epoch": 0.5029506437768241, + "grad_norm": 1.5845898389816284, + "learning_rate": 2.602490056354395e-06, + "loss": 2.6704, + "step": 9375 + }, + { + "epoch": 0.5030042918454936, + "grad_norm": 1.3999850749969482, + "learning_rate": 2.6020560183675525e-06, + "loss": 2.1252, + "step": 9376 + }, + { + "epoch": 0.5030579399141631, + "grad_norm": 1.651443362236023, + "learning_rate": 2.601621977299357e-06, + "loss": 2.259, + "step": 9377 + }, + { + "epoch": 0.5031115879828326, + "grad_norm": 1.664521336555481, + "learning_rate": 2.6011879331629115e-06, + "loss": 2.2526, + "step": 9378 + }, + { + "epoch": 0.5031652360515021, + "grad_norm": 1.6495311260223389, + "learning_rate": 2.6007538859713232e-06, + "loss": 2.3481, + "step": 9379 + }, + { + "epoch": 0.5032188841201717, + "grad_norm": 1.4169909954071045, + "learning_rate": 2.600319835737697e-06, + "loss": 2.0702, + "step": 9380 + }, + { + "epoch": 0.5032725321888412, + "grad_norm": 1.5524128675460815, + "learning_rate": 2.5998857824751374e-06, + "loss": 2.0256, + "step": 9381 + }, + { + "epoch": 0.5033261802575107, + "grad_norm": 1.7552313804626465, + "learning_rate": 2.5994517261967495e-06, + "loss": 2.3193, + "step": 9382 + }, + { + "epoch": 0.5033798283261802, + "grad_norm": 1.4797323942184448, + "learning_rate": 2.5990176669156387e-06, + "loss": 1.9561, + "step": 9383 + }, + { + "epoch": 0.5034334763948498, + "grad_norm": 1.4735983610153198, + "learning_rate": 2.5985836046449116e-06, + "loss": 2.3202, + "step": 9384 + }, + { + "epoch": 0.5034871244635193, + "grad_norm": 1.565852403640747, + "learning_rate": 2.5981495393976718e-06, + "loss": 2.2881, + "step": 9385 + }, + { + "epoch": 0.5035407725321889, + "grad_norm": 1.5347890853881836, + "learning_rate": 2.5977154711870266e-06, + "loss": 2.3288, + "step": 9386 + }, + { + "epoch": 0.5035944206008584, + "grad_norm": 1.6477727890014648, + "learning_rate": 2.597281400026081e-06, + "loss": 2.3425, + "step": 9387 + }, + { + "epoch": 0.5036480686695279, + "grad_norm": 1.3589400053024292, + "learning_rate": 2.596847325927941e-06, + "loss": 1.9909, + "step": 9388 + }, + { + "epoch": 0.5037017167381974, + "grad_norm": 1.4890276193618774, + "learning_rate": 2.596413248905712e-06, + "loss": 2.7348, + "step": 9389 + }, + { + "epoch": 0.503755364806867, + "grad_norm": 1.5172982215881348, + "learning_rate": 2.5959791689725006e-06, + "loss": 2.1097, + "step": 9390 + }, + { + "epoch": 0.5038090128755365, + "grad_norm": 1.8276323080062866, + "learning_rate": 2.595545086141413e-06, + "loss": 2.2491, + "step": 9391 + }, + { + "epoch": 0.503862660944206, + "grad_norm": 1.7309107780456543, + "learning_rate": 2.595111000425555e-06, + "loss": 2.4613, + "step": 9392 + }, + { + "epoch": 0.5039163090128755, + "grad_norm": 1.6014606952667236, + "learning_rate": 2.594676911838033e-06, + "loss": 2.1212, + "step": 9393 + }, + { + "epoch": 0.503969957081545, + "grad_norm": 1.6282418966293335, + "learning_rate": 2.594242820391953e-06, + "loss": 2.2076, + "step": 9394 + }, + { + "epoch": 0.5040236051502146, + "grad_norm": 1.4968764781951904, + "learning_rate": 2.593808726100421e-06, + "loss": 2.0424, + "step": 9395 + }, + { + "epoch": 0.5040772532188841, + "grad_norm": 1.4420112371444702, + "learning_rate": 2.593374628976544e-06, + "loss": 2.257, + "step": 9396 + }, + { + "epoch": 0.5041309012875537, + "grad_norm": 1.4719082117080688, + "learning_rate": 2.5929405290334304e-06, + "loss": 2.3053, + "step": 9397 + }, + { + "epoch": 0.5041845493562231, + "grad_norm": 1.7280449867248535, + "learning_rate": 2.592506426284184e-06, + "loss": 1.4178, + "step": 9398 + }, + { + "epoch": 0.5042381974248927, + "grad_norm": 1.608132004737854, + "learning_rate": 2.5920723207419137e-06, + "loss": 2.2993, + "step": 9399 + }, + { + "epoch": 0.5042918454935622, + "grad_norm": 1.6596757173538208, + "learning_rate": 2.5916382124197243e-06, + "loss": 2.266, + "step": 9400 + }, + { + "epoch": 0.5043454935622318, + "grad_norm": 1.5012609958648682, + "learning_rate": 2.591204101330724e-06, + "loss": 2.3793, + "step": 9401 + }, + { + "epoch": 0.5043991416309013, + "grad_norm": 2.601703643798828, + "learning_rate": 2.5907699874880205e-06, + "loss": 2.3522, + "step": 9402 + }, + { + "epoch": 0.5044527896995709, + "grad_norm": 1.8509043455123901, + "learning_rate": 2.5903358709047196e-06, + "loss": 2.055, + "step": 9403 + }, + { + "epoch": 0.5045064377682403, + "grad_norm": 1.7408945560455322, + "learning_rate": 2.5899017515939286e-06, + "loss": 2.3729, + "step": 9404 + }, + { + "epoch": 0.5045600858369099, + "grad_norm": 1.9090639352798462, + "learning_rate": 2.5894676295687552e-06, + "loss": 2.0667, + "step": 9405 + }, + { + "epoch": 0.5046137339055794, + "grad_norm": 1.5563626289367676, + "learning_rate": 2.589033504842307e-06, + "loss": 2.2061, + "step": 9406 + }, + { + "epoch": 0.5046673819742489, + "grad_norm": 1.3424087762832642, + "learning_rate": 2.588599377427691e-06, + "loss": 2.0387, + "step": 9407 + }, + { + "epoch": 0.5047210300429185, + "grad_norm": 1.6319630146026611, + "learning_rate": 2.5881652473380143e-06, + "loss": 2.445, + "step": 9408 + }, + { + "epoch": 0.5047746781115879, + "grad_norm": 1.3911845684051514, + "learning_rate": 2.5877311145863855e-06, + "loss": 2.1557, + "step": 9409 + }, + { + "epoch": 0.5048283261802575, + "grad_norm": 10.802081108093262, + "learning_rate": 2.5872969791859114e-06, + "loss": 2.3086, + "step": 9410 + }, + { + "epoch": 0.504881974248927, + "grad_norm": 1.5060126781463623, + "learning_rate": 2.586862841149701e-06, + "loss": 2.3437, + "step": 9411 + }, + { + "epoch": 0.5049356223175966, + "grad_norm": 1.4075313806533813, + "learning_rate": 2.58642870049086e-06, + "loss": 2.1262, + "step": 9412 + }, + { + "epoch": 0.5049892703862661, + "grad_norm": 1.454740047454834, + "learning_rate": 2.585994557222498e-06, + "loss": 2.2647, + "step": 9413 + }, + { + "epoch": 0.5050429184549357, + "grad_norm": 1.542368769645691, + "learning_rate": 2.5855604113577227e-06, + "loss": 2.172, + "step": 9414 + }, + { + "epoch": 0.5050965665236051, + "grad_norm": 1.4417036771774292, + "learning_rate": 2.5851262629096425e-06, + "loss": 1.7734, + "step": 9415 + }, + { + "epoch": 0.5051502145922747, + "grad_norm": 1.405246376991272, + "learning_rate": 2.5846921118913644e-06, + "loss": 2.3349, + "step": 9416 + }, + { + "epoch": 0.5052038626609442, + "grad_norm": 1.4091581106185913, + "learning_rate": 2.5842579583159964e-06, + "loss": 2.3826, + "step": 9417 + }, + { + "epoch": 0.5052575107296138, + "grad_norm": 2.431933879852295, + "learning_rate": 2.5838238021966483e-06, + "loss": 2.4949, + "step": 9418 + }, + { + "epoch": 0.5053111587982833, + "grad_norm": 1.5329569578170776, + "learning_rate": 2.5833896435464285e-06, + "loss": 2.0723, + "step": 9419 + }, + { + "epoch": 0.5053648068669528, + "grad_norm": 1.5452736616134644, + "learning_rate": 2.5829554823784443e-06, + "loss": 2.2233, + "step": 9420 + }, + { + "epoch": 0.5054184549356223, + "grad_norm": 1.5487651824951172, + "learning_rate": 2.5825213187058045e-06, + "loss": 2.232, + "step": 9421 + }, + { + "epoch": 0.5054721030042918, + "grad_norm": 1.7064512968063354, + "learning_rate": 2.582087152541618e-06, + "loss": 2.2111, + "step": 9422 + }, + { + "epoch": 0.5055257510729614, + "grad_norm": 1.3615854978561401, + "learning_rate": 2.581652983898993e-06, + "loss": 2.0555, + "step": 9423 + }, + { + "epoch": 0.5055793991416309, + "grad_norm": 1.3554266691207886, + "learning_rate": 2.5812188127910394e-06, + "loss": 2.2484, + "step": 9424 + }, + { + "epoch": 0.5056330472103004, + "grad_norm": 1.6895246505737305, + "learning_rate": 2.5807846392308645e-06, + "loss": 2.3988, + "step": 9425 + }, + { + "epoch": 0.5056866952789699, + "grad_norm": 1.5537031888961792, + "learning_rate": 2.580350463231578e-06, + "loss": 2.2399, + "step": 9426 + }, + { + "epoch": 0.5057403433476395, + "grad_norm": 1.815228819847107, + "learning_rate": 2.5799162848062892e-06, + "loss": 2.0789, + "step": 9427 + }, + { + "epoch": 0.505793991416309, + "grad_norm": 2.3890814781188965, + "learning_rate": 2.5794821039681066e-06, + "loss": 2.3622, + "step": 9428 + }, + { + "epoch": 0.5058476394849786, + "grad_norm": 1.3941744565963745, + "learning_rate": 2.5790479207301394e-06, + "loss": 2.1288, + "step": 9429 + }, + { + "epoch": 0.505901287553648, + "grad_norm": 1.711916208267212, + "learning_rate": 2.578613735105497e-06, + "loss": 2.2975, + "step": 9430 + }, + { + "epoch": 0.5059549356223176, + "grad_norm": 1.9849133491516113, + "learning_rate": 2.5781795471072883e-06, + "loss": 2.3745, + "step": 9431 + }, + { + "epoch": 0.5060085836909871, + "grad_norm": 1.2631137371063232, + "learning_rate": 2.577745356748623e-06, + "loss": 1.7764, + "step": 9432 + }, + { + "epoch": 0.5060622317596567, + "grad_norm": 1.3256394863128662, + "learning_rate": 2.5773111640426114e-06, + "loss": 1.8179, + "step": 9433 + }, + { + "epoch": 0.5061158798283262, + "grad_norm": 1.7008658647537231, + "learning_rate": 2.5768769690023603e-06, + "loss": 2.1078, + "step": 9434 + }, + { + "epoch": 0.5061695278969958, + "grad_norm": 1.4502544403076172, + "learning_rate": 2.576442771640982e-06, + "loss": 2.0189, + "step": 9435 + }, + { + "epoch": 0.5062231759656652, + "grad_norm": 1.5658434629440308, + "learning_rate": 2.5760085719715843e-06, + "loss": 2.228, + "step": 9436 + }, + { + "epoch": 0.5062768240343347, + "grad_norm": 1.4472483396530151, + "learning_rate": 2.575574370007278e-06, + "loss": 2.0867, + "step": 9437 + }, + { + "epoch": 0.5063304721030043, + "grad_norm": 1.9228260517120361, + "learning_rate": 2.575140165761173e-06, + "loss": 2.2714, + "step": 9438 + }, + { + "epoch": 0.5063841201716738, + "grad_norm": 1.641157865524292, + "learning_rate": 2.574705959246378e-06, + "loss": 2.3182, + "step": 9439 + }, + { + "epoch": 0.5064377682403434, + "grad_norm": 1.6303422451019287, + "learning_rate": 2.5742717504760027e-06, + "loss": 2.3313, + "step": 9440 + }, + { + "epoch": 0.5064914163090128, + "grad_norm": 1.5266828536987305, + "learning_rate": 2.5738375394631595e-06, + "loss": 2.2863, + "step": 9441 + }, + { + "epoch": 0.5065450643776824, + "grad_norm": 1.4662190675735474, + "learning_rate": 2.5734033262209564e-06, + "loss": 2.3116, + "step": 9442 + }, + { + "epoch": 0.5065987124463519, + "grad_norm": 1.6350542306900024, + "learning_rate": 2.572969110762503e-06, + "loss": 2.473, + "step": 9443 + }, + { + "epoch": 0.5066523605150215, + "grad_norm": 1.4716527462005615, + "learning_rate": 2.572534893100911e-06, + "loss": 2.4461, + "step": 9444 + }, + { + "epoch": 0.506706008583691, + "grad_norm": 1.5017297267913818, + "learning_rate": 2.57210067324929e-06, + "loss": 1.8656, + "step": 9445 + }, + { + "epoch": 0.5067596566523606, + "grad_norm": 1.7272151708602905, + "learning_rate": 2.57166645122075e-06, + "loss": 2.2699, + "step": 9446 + }, + { + "epoch": 0.50681330472103, + "grad_norm": 1.4392763376235962, + "learning_rate": 2.5712322270284016e-06, + "loss": 2.4994, + "step": 9447 + }, + { + "epoch": 0.5068669527896996, + "grad_norm": 1.3279788494110107, + "learning_rate": 2.570798000685356e-06, + "loss": 2.1779, + "step": 9448 + }, + { + "epoch": 0.5069206008583691, + "grad_norm": 1.7304906845092773, + "learning_rate": 2.5703637722047226e-06, + "loss": 2.3758, + "step": 9449 + }, + { + "epoch": 0.5069742489270386, + "grad_norm": 1.623830795288086, + "learning_rate": 2.569929541599612e-06, + "loss": 2.4404, + "step": 9450 + }, + { + "epoch": 0.5070278969957082, + "grad_norm": 1.8032795190811157, + "learning_rate": 2.5694953088831352e-06, + "loss": 2.386, + "step": 9451 + }, + { + "epoch": 0.5070815450643776, + "grad_norm": 1.212575912475586, + "learning_rate": 2.569061074068403e-06, + "loss": 2.0072, + "step": 9452 + }, + { + "epoch": 0.5071351931330472, + "grad_norm": 1.542898416519165, + "learning_rate": 2.568626837168526e-06, + "loss": 2.3726, + "step": 9453 + }, + { + "epoch": 0.5071888412017167, + "grad_norm": 1.4899331331253052, + "learning_rate": 2.568192598196615e-06, + "loss": 2.2663, + "step": 9454 + }, + { + "epoch": 0.5072424892703863, + "grad_norm": 1.3249403238296509, + "learning_rate": 2.5677583571657815e-06, + "loss": 2.4415, + "step": 9455 + }, + { + "epoch": 0.5072961373390558, + "grad_norm": 1.4623125791549683, + "learning_rate": 2.5673241140891354e-06, + "loss": 2.2273, + "step": 9456 + }, + { + "epoch": 0.5073497854077254, + "grad_norm": 2.1498472690582275, + "learning_rate": 2.566889868979787e-06, + "loss": 2.459, + "step": 9457 + }, + { + "epoch": 0.5074034334763948, + "grad_norm": 1.4772511720657349, + "learning_rate": 2.5664556218508494e-06, + "loss": 2.2688, + "step": 9458 + }, + { + "epoch": 0.5074570815450644, + "grad_norm": 1.3467870950698853, + "learning_rate": 2.5660213727154338e-06, + "loss": 1.6172, + "step": 9459 + }, + { + "epoch": 0.5075107296137339, + "grad_norm": 1.7252204418182373, + "learning_rate": 2.5655871215866498e-06, + "loss": 2.3039, + "step": 9460 + }, + { + "epoch": 0.5075643776824035, + "grad_norm": 1.6843510866165161, + "learning_rate": 2.5651528684776086e-06, + "loss": 2.2094, + "step": 9461 + }, + { + "epoch": 0.507618025751073, + "grad_norm": 1.627050757408142, + "learning_rate": 2.5647186134014223e-06, + "loss": 1.6657, + "step": 9462 + }, + { + "epoch": 0.5076716738197425, + "grad_norm": 1.6009646654129028, + "learning_rate": 2.5642843563712018e-06, + "loss": 2.3497, + "step": 9463 + }, + { + "epoch": 0.507725321888412, + "grad_norm": 1.5823663473129272, + "learning_rate": 2.5638500974000594e-06, + "loss": 2.3457, + "step": 9464 + }, + { + "epoch": 0.5077789699570815, + "grad_norm": 1.151563048362732, + "learning_rate": 2.5634158365011057e-06, + "loss": 1.8672, + "step": 9465 + }, + { + "epoch": 0.5078326180257511, + "grad_norm": 1.7289451360702515, + "learning_rate": 2.5629815736874526e-06, + "loss": 2.1454, + "step": 9466 + }, + { + "epoch": 0.5078862660944206, + "grad_norm": 1.565757155418396, + "learning_rate": 2.562547308972212e-06, + "loss": 2.4562, + "step": 9467 + }, + { + "epoch": 0.5079399141630901, + "grad_norm": 1.5268651247024536, + "learning_rate": 2.562113042368494e-06, + "loss": 2.3665, + "step": 9468 + }, + { + "epoch": 0.5079935622317596, + "grad_norm": 1.791067361831665, + "learning_rate": 2.561678773889413e-06, + "loss": 2.3854, + "step": 9469 + }, + { + "epoch": 0.5080472103004292, + "grad_norm": 1.4726518392562866, + "learning_rate": 2.561244503548078e-06, + "loss": 2.253, + "step": 9470 + }, + { + "epoch": 0.5081008583690987, + "grad_norm": 1.7609559297561646, + "learning_rate": 2.5608102313576026e-06, + "loss": 2.3068, + "step": 9471 + }, + { + "epoch": 0.5081545064377683, + "grad_norm": 1.1377410888671875, + "learning_rate": 2.560375957331099e-06, + "loss": 2.1197, + "step": 9472 + }, + { + "epoch": 0.5082081545064377, + "grad_norm": 1.6886955499649048, + "learning_rate": 2.559941681481677e-06, + "loss": 2.4691, + "step": 9473 + }, + { + "epoch": 0.5082618025751073, + "grad_norm": 1.4149153232574463, + "learning_rate": 2.55950740382245e-06, + "loss": 1.9899, + "step": 9474 + }, + { + "epoch": 0.5083154506437768, + "grad_norm": 1.6024386882781982, + "learning_rate": 2.55907312436653e-06, + "loss": 2.2556, + "step": 9475 + }, + { + "epoch": 0.5083690987124464, + "grad_norm": 1.603256344795227, + "learning_rate": 2.55863884312703e-06, + "loss": 2.1937, + "step": 9476 + }, + { + "epoch": 0.5084227467811159, + "grad_norm": 1.5117782354354858, + "learning_rate": 2.5582045601170607e-06, + "loss": 2.5248, + "step": 9477 + }, + { + "epoch": 0.5084763948497855, + "grad_norm": 1.657288908958435, + "learning_rate": 2.5577702753497346e-06, + "loss": 2.2885, + "step": 9478 + }, + { + "epoch": 0.5085300429184549, + "grad_norm": 1.3567848205566406, + "learning_rate": 2.557335988838164e-06, + "loss": 2.3271, + "step": 9479 + }, + { + "epoch": 0.5085836909871244, + "grad_norm": 1.5797635316848755, + "learning_rate": 2.5569017005954606e-06, + "loss": 1.1133, + "step": 9480 + }, + { + "epoch": 0.508637339055794, + "grad_norm": 1.3255115747451782, + "learning_rate": 2.5564674106347387e-06, + "loss": 2.1882, + "step": 9481 + }, + { + "epoch": 0.5086909871244635, + "grad_norm": 1.659878134727478, + "learning_rate": 2.556033118969109e-06, + "loss": 2.3075, + "step": 9482 + }, + { + "epoch": 0.5087446351931331, + "grad_norm": 1.5313823223114014, + "learning_rate": 2.555598825611685e-06, + "loss": 2.3147, + "step": 9483 + }, + { + "epoch": 0.5087982832618025, + "grad_norm": 1.7321680784225464, + "learning_rate": 2.555164530575578e-06, + "loss": 2.4467, + "step": 9484 + }, + { + "epoch": 0.5088519313304721, + "grad_norm": 1.2987053394317627, + "learning_rate": 2.5547302338739014e-06, + "loss": 1.8434, + "step": 9485 + }, + { + "epoch": 0.5089055793991416, + "grad_norm": 1.2799197435379028, + "learning_rate": 2.5542959355197682e-06, + "loss": 1.8048, + "step": 9486 + }, + { + "epoch": 0.5089592274678112, + "grad_norm": 1.805335521697998, + "learning_rate": 2.55386163552629e-06, + "loss": 2.2967, + "step": 9487 + }, + { + "epoch": 0.5090128755364807, + "grad_norm": 1.6726912260055542, + "learning_rate": 2.55342733390658e-06, + "loss": 2.4095, + "step": 9488 + }, + { + "epoch": 0.5090665236051503, + "grad_norm": 2.057925224304199, + "learning_rate": 2.5529930306737515e-06, + "loss": 2.1239, + "step": 9489 + }, + { + "epoch": 0.5091201716738197, + "grad_norm": 1.5278518199920654, + "learning_rate": 2.552558725840917e-06, + "loss": 2.4986, + "step": 9490 + }, + { + "epoch": 0.5091738197424893, + "grad_norm": 1.5034375190734863, + "learning_rate": 2.5521244194211887e-06, + "loss": 1.4623, + "step": 9491 + }, + { + "epoch": 0.5092274678111588, + "grad_norm": 1.7262731790542603, + "learning_rate": 2.5516901114276804e-06, + "loss": 2.3225, + "step": 9492 + }, + { + "epoch": 0.5092811158798283, + "grad_norm": 1.485005259513855, + "learning_rate": 2.5512558018735045e-06, + "loss": 2.3233, + "step": 9493 + }, + { + "epoch": 0.5093347639484979, + "grad_norm": 1.6057273149490356, + "learning_rate": 2.5508214907717745e-06, + "loss": 2.1888, + "step": 9494 + }, + { + "epoch": 0.5093884120171673, + "grad_norm": 1.5371129512786865, + "learning_rate": 2.5503871781356032e-06, + "loss": 2.4984, + "step": 9495 + }, + { + "epoch": 0.5094420600858369, + "grad_norm": 1.5448060035705566, + "learning_rate": 2.5499528639781028e-06, + "loss": 2.2899, + "step": 9496 + }, + { + "epoch": 0.5094957081545064, + "grad_norm": 3.773012399673462, + "learning_rate": 2.5495185483123873e-06, + "loss": 2.2505, + "step": 9497 + }, + { + "epoch": 0.509549356223176, + "grad_norm": 1.8801054954528809, + "learning_rate": 2.5490842311515706e-06, + "loss": 2.174, + "step": 9498 + }, + { + "epoch": 0.5096030042918455, + "grad_norm": 1.6884026527404785, + "learning_rate": 2.548649912508766e-06, + "loss": 2.2561, + "step": 9499 + }, + { + "epoch": 0.509656652360515, + "grad_norm": 1.2083035707473755, + "learning_rate": 2.5482155923970846e-06, + "loss": 2.2494, + "step": 9500 + }, + { + "epoch": 0.5097103004291845, + "grad_norm": 1.477763295173645, + "learning_rate": 2.5477812708296417e-06, + "loss": 2.5407, + "step": 9501 + }, + { + "epoch": 0.5097639484978541, + "grad_norm": 1.5377873182296753, + "learning_rate": 2.5473469478195496e-06, + "loss": 2.2704, + "step": 9502 + }, + { + "epoch": 0.5098175965665236, + "grad_norm": 1.3925145864486694, + "learning_rate": 2.546912623379923e-06, + "loss": 2.1226, + "step": 9503 + }, + { + "epoch": 0.5098712446351932, + "grad_norm": 1.3373255729675293, + "learning_rate": 2.5464782975238742e-06, + "loss": 2.2223, + "step": 9504 + }, + { + "epoch": 0.5099248927038627, + "grad_norm": 1.7015550136566162, + "learning_rate": 2.5460439702645173e-06, + "loss": 2.3434, + "step": 9505 + }, + { + "epoch": 0.5099785407725322, + "grad_norm": 1.5901564359664917, + "learning_rate": 2.545609641614965e-06, + "loss": 2.2585, + "step": 9506 + }, + { + "epoch": 0.5100321888412017, + "grad_norm": 1.5013775825500488, + "learning_rate": 2.5451753115883323e-06, + "loss": 1.9678, + "step": 9507 + }, + { + "epoch": 0.5100858369098712, + "grad_norm": 1.5309944152832031, + "learning_rate": 2.5447409801977313e-06, + "loss": 2.2897, + "step": 9508 + }, + { + "epoch": 0.5101394849785408, + "grad_norm": 1.6582218408584595, + "learning_rate": 2.5443066474562768e-06, + "loss": 2.1395, + "step": 9509 + }, + { + "epoch": 0.5101931330472103, + "grad_norm": 1.5725189447402954, + "learning_rate": 2.543872313377082e-06, + "loss": 2.2553, + "step": 9510 + }, + { + "epoch": 0.5102467811158798, + "grad_norm": 1.5769237279891968, + "learning_rate": 2.5434379779732604e-06, + "loss": 2.4199, + "step": 9511 + }, + { + "epoch": 0.5103004291845493, + "grad_norm": 1.7797818183898926, + "learning_rate": 2.5430036412579275e-06, + "loss": 2.1073, + "step": 9512 + }, + { + "epoch": 0.5103540772532189, + "grad_norm": 1.6143420934677124, + "learning_rate": 2.5425693032441934e-06, + "loss": 1.95, + "step": 9513 + }, + { + "epoch": 0.5104077253218884, + "grad_norm": 1.5795440673828125, + "learning_rate": 2.5421349639451758e-06, + "loss": 2.0791, + "step": 9514 + }, + { + "epoch": 0.510461373390558, + "grad_norm": 1.3700891733169556, + "learning_rate": 2.5417006233739866e-06, + "loss": 2.2518, + "step": 9515 + }, + { + "epoch": 0.5105150214592274, + "grad_norm": 1.4906792640686035, + "learning_rate": 2.5412662815437406e-06, + "loss": 2.3087, + "step": 9516 + }, + { + "epoch": 0.510568669527897, + "grad_norm": 1.7680227756500244, + "learning_rate": 2.5408319384675524e-06, + "loss": 2.4155, + "step": 9517 + }, + { + "epoch": 0.5106223175965665, + "grad_norm": 1.4928674697875977, + "learning_rate": 2.540397594158534e-06, + "loss": 2.3116, + "step": 9518 + }, + { + "epoch": 0.5106759656652361, + "grad_norm": 1.3741556406021118, + "learning_rate": 2.5399632486298e-06, + "loss": 1.4275, + "step": 9519 + }, + { + "epoch": 0.5107296137339056, + "grad_norm": 1.8492858409881592, + "learning_rate": 2.5395289018944652e-06, + "loss": 2.2569, + "step": 9520 + }, + { + "epoch": 0.5107832618025752, + "grad_norm": 2.0046145915985107, + "learning_rate": 2.5390945539656447e-06, + "loss": 2.3411, + "step": 9521 + }, + { + "epoch": 0.5108369098712446, + "grad_norm": 1.5931086540222168, + "learning_rate": 2.5386602048564507e-06, + "loss": 2.1966, + "step": 9522 + }, + { + "epoch": 0.5108905579399141, + "grad_norm": 1.4627892971038818, + "learning_rate": 2.5382258545799983e-06, + "loss": 2.1346, + "step": 9523 + }, + { + "epoch": 0.5109442060085837, + "grad_norm": 1.8850005865097046, + "learning_rate": 2.5377915031494016e-06, + "loss": 2.1656, + "step": 9524 + }, + { + "epoch": 0.5109978540772532, + "grad_norm": 1.5171480178833008, + "learning_rate": 2.537357150577775e-06, + "loss": 2.1279, + "step": 9525 + }, + { + "epoch": 0.5110515021459228, + "grad_norm": 1.5702807903289795, + "learning_rate": 2.5369227968782325e-06, + "loss": 2.4185, + "step": 9526 + }, + { + "epoch": 0.5111051502145922, + "grad_norm": 1.1952322721481323, + "learning_rate": 2.5364884420638887e-06, + "loss": 2.0288, + "step": 9527 + }, + { + "epoch": 0.5111587982832618, + "grad_norm": 1.3326495885849, + "learning_rate": 2.536054086147858e-06, + "loss": 2.32, + "step": 9528 + }, + { + "epoch": 0.5112124463519313, + "grad_norm": 1.6189101934432983, + "learning_rate": 2.5356197291432542e-06, + "loss": 2.4468, + "step": 9529 + }, + { + "epoch": 0.5112660944206009, + "grad_norm": 1.8788411617279053, + "learning_rate": 2.5351853710631928e-06, + "loss": 2.3076, + "step": 9530 + }, + { + "epoch": 0.5113197424892704, + "grad_norm": 1.7133491039276123, + "learning_rate": 2.534751011920788e-06, + "loss": 2.5343, + "step": 9531 + }, + { + "epoch": 0.51137339055794, + "grad_norm": 1.4381481409072876, + "learning_rate": 2.534316651729154e-06, + "loss": 2.2623, + "step": 9532 + }, + { + "epoch": 0.5114270386266094, + "grad_norm": 1.5206329822540283, + "learning_rate": 2.533882290501405e-06, + "loss": 2.1995, + "step": 9533 + }, + { + "epoch": 0.511480686695279, + "grad_norm": 3.2986814975738525, + "learning_rate": 2.533447928250657e-06, + "loss": 2.2513, + "step": 9534 + }, + { + "epoch": 0.5115343347639485, + "grad_norm": 1.7371692657470703, + "learning_rate": 2.5330135649900216e-06, + "loss": 1.9098, + "step": 9535 + }, + { + "epoch": 0.511587982832618, + "grad_norm": 1.5207747220993042, + "learning_rate": 2.5325792007326166e-06, + "loss": 2.2625, + "step": 9536 + }, + { + "epoch": 0.5116416309012876, + "grad_norm": 1.7023423910140991, + "learning_rate": 2.5321448354915556e-06, + "loss": 2.3443, + "step": 9537 + }, + { + "epoch": 0.511695278969957, + "grad_norm": 2.4953463077545166, + "learning_rate": 2.531710469279953e-06, + "loss": 2.2273, + "step": 9538 + }, + { + "epoch": 0.5117489270386266, + "grad_norm": 1.5633251667022705, + "learning_rate": 2.5312761021109238e-06, + "loss": 2.3161, + "step": 9539 + }, + { + "epoch": 0.5118025751072961, + "grad_norm": 1.581200361251831, + "learning_rate": 2.530841733997582e-06, + "loss": 2.3806, + "step": 9540 + }, + { + "epoch": 0.5118562231759657, + "grad_norm": 1.5788437128067017, + "learning_rate": 2.530407364953043e-06, + "loss": 2.3432, + "step": 9541 + }, + { + "epoch": 0.5119098712446352, + "grad_norm": 1.524128794670105, + "learning_rate": 2.5299729949904216e-06, + "loss": 2.1009, + "step": 9542 + }, + { + "epoch": 0.5119635193133047, + "grad_norm": 1.5115550756454468, + "learning_rate": 2.529538624122833e-06, + "loss": 2.2431, + "step": 9543 + }, + { + "epoch": 0.5120171673819742, + "grad_norm": 1.4476664066314697, + "learning_rate": 2.5291042523633918e-06, + "loss": 2.2635, + "step": 9544 + }, + { + "epoch": 0.5120708154506438, + "grad_norm": 1.5971295833587646, + "learning_rate": 2.5286698797252124e-06, + "loss": 2.2085, + "step": 9545 + }, + { + "epoch": 0.5121244635193133, + "grad_norm": 1.872375249862671, + "learning_rate": 2.52823550622141e-06, + "loss": 2.461, + "step": 9546 + }, + { + "epoch": 0.5121781115879829, + "grad_norm": 1.6290335655212402, + "learning_rate": 2.5278011318651e-06, + "loss": 2.3947, + "step": 9547 + }, + { + "epoch": 0.5122317596566524, + "grad_norm": 1.622754454612732, + "learning_rate": 2.527366756669396e-06, + "loss": 2.1562, + "step": 9548 + }, + { + "epoch": 0.5122854077253219, + "grad_norm": 1.702386736869812, + "learning_rate": 2.5269323806474144e-06, + "loss": 2.2255, + "step": 9549 + }, + { + "epoch": 0.5123390557939914, + "grad_norm": 1.4793012142181396, + "learning_rate": 2.52649800381227e-06, + "loss": 2.2809, + "step": 9550 + }, + { + "epoch": 0.5123927038626609, + "grad_norm": 1.3450281620025635, + "learning_rate": 2.5260636261770776e-06, + "loss": 2.401, + "step": 9551 + }, + { + "epoch": 0.5124463519313305, + "grad_norm": 1.2541580200195312, + "learning_rate": 2.525629247754952e-06, + "loss": 2.3198, + "step": 9552 + }, + { + "epoch": 0.5125, + "grad_norm": 1.566861867904663, + "learning_rate": 2.5251948685590083e-06, + "loss": 2.2171, + "step": 9553 + }, + { + "epoch": 0.5125536480686695, + "grad_norm": 1.6227333545684814, + "learning_rate": 2.5247604886023626e-06, + "loss": 2.1123, + "step": 9554 + }, + { + "epoch": 0.512607296137339, + "grad_norm": 2.6618330478668213, + "learning_rate": 2.5243261078981286e-06, + "loss": 2.5189, + "step": 9555 + }, + { + "epoch": 0.5126609442060086, + "grad_norm": 1.6128482818603516, + "learning_rate": 2.523891726459423e-06, + "loss": 2.3192, + "step": 9556 + }, + { + "epoch": 0.5127145922746781, + "grad_norm": 6.024653911590576, + "learning_rate": 2.523457344299359e-06, + "loss": 2.4574, + "step": 9557 + }, + { + "epoch": 0.5127682403433477, + "grad_norm": 1.5417386293411255, + "learning_rate": 2.523022961431053e-06, + "loss": 2.4953, + "step": 9558 + }, + { + "epoch": 0.5128218884120171, + "grad_norm": 3.5255517959594727, + "learning_rate": 2.5225885778676207e-06, + "loss": 2.4055, + "step": 9559 + }, + { + "epoch": 0.5128755364806867, + "grad_norm": 1.5199623107910156, + "learning_rate": 2.5221541936221765e-06, + "loss": 2.1653, + "step": 9560 + }, + { + "epoch": 0.5129291845493562, + "grad_norm": 1.600111484527588, + "learning_rate": 2.5217198087078364e-06, + "loss": 2.2954, + "step": 9561 + }, + { + "epoch": 0.5129828326180258, + "grad_norm": 1.5912209749221802, + "learning_rate": 2.521285423137715e-06, + "loss": 2.3096, + "step": 9562 + }, + { + "epoch": 0.5130364806866953, + "grad_norm": 1.4835960865020752, + "learning_rate": 2.520851036924928e-06, + "loss": 1.8502, + "step": 9563 + }, + { + "epoch": 0.5130901287553649, + "grad_norm": 1.592501163482666, + "learning_rate": 2.520416650082589e-06, + "loss": 2.1177, + "step": 9564 + }, + { + "epoch": 0.5131437768240343, + "grad_norm": 1.5598506927490234, + "learning_rate": 2.5199822626238167e-06, + "loss": 2.1997, + "step": 9565 + }, + { + "epoch": 0.5131974248927038, + "grad_norm": 1.4307540655136108, + "learning_rate": 2.5195478745617243e-06, + "loss": 2.1743, + "step": 9566 + }, + { + "epoch": 0.5132510729613734, + "grad_norm": 2.029982566833496, + "learning_rate": 2.519113485909427e-06, + "loss": 2.2759, + "step": 9567 + }, + { + "epoch": 0.5133047210300429, + "grad_norm": 2.45451283454895, + "learning_rate": 2.5186790966800414e-06, + "loss": 2.4713, + "step": 9568 + }, + { + "epoch": 0.5133583690987125, + "grad_norm": 1.5989588499069214, + "learning_rate": 2.518244706886681e-06, + "loss": 1.9459, + "step": 9569 + }, + { + "epoch": 0.5134120171673819, + "grad_norm": 1.3568593263626099, + "learning_rate": 2.517810316542463e-06, + "loss": 2.3738, + "step": 9570 + }, + { + "epoch": 0.5134656652360515, + "grad_norm": 1.653891682624817, + "learning_rate": 2.5173759256605028e-06, + "loss": 2.2811, + "step": 9571 + }, + { + "epoch": 0.513519313304721, + "grad_norm": 1.616196632385254, + "learning_rate": 2.516941534253915e-06, + "loss": 2.1252, + "step": 9572 + }, + { + "epoch": 0.5135729613733906, + "grad_norm": 1.328762173652649, + "learning_rate": 2.5165071423358158e-06, + "loss": 2.15, + "step": 9573 + }, + { + "epoch": 0.5136266094420601, + "grad_norm": 1.4799013137817383, + "learning_rate": 2.51607274991932e-06, + "loss": 2.0877, + "step": 9574 + }, + { + "epoch": 0.5136802575107297, + "grad_norm": 2.1118979454040527, + "learning_rate": 2.515638357017543e-06, + "loss": 2.35, + "step": 9575 + }, + { + "epoch": 0.5137339055793991, + "grad_norm": 1.4561400413513184, + "learning_rate": 2.5152039636436008e-06, + "loss": 2.0953, + "step": 9576 + }, + { + "epoch": 0.5137875536480687, + "grad_norm": 1.6208657026290894, + "learning_rate": 2.5147695698106093e-06, + "loss": 2.5074, + "step": 9577 + }, + { + "epoch": 0.5138412017167382, + "grad_norm": 1.4914045333862305, + "learning_rate": 2.5143351755316847e-06, + "loss": 2.0941, + "step": 9578 + }, + { + "epoch": 0.5138948497854077, + "grad_norm": 1.4601587057113647, + "learning_rate": 2.5139007808199402e-06, + "loss": 2.2674, + "step": 9579 + }, + { + "epoch": 0.5139484978540773, + "grad_norm": 1.3922638893127441, + "learning_rate": 2.5134663856884926e-06, + "loss": 2.2624, + "step": 9580 + }, + { + "epoch": 0.5140021459227467, + "grad_norm": 1.6004934310913086, + "learning_rate": 2.5130319901504573e-06, + "loss": 2.4411, + "step": 9581 + }, + { + "epoch": 0.5140557939914163, + "grad_norm": 1.3655898571014404, + "learning_rate": 2.5125975942189507e-06, + "loss": 1.699, + "step": 9582 + }, + { + "epoch": 0.5141094420600858, + "grad_norm": 1.7025952339172363, + "learning_rate": 2.512163197907089e-06, + "loss": 2.067, + "step": 9583 + }, + { + "epoch": 0.5141630901287554, + "grad_norm": 1.553683876991272, + "learning_rate": 2.5117288012279855e-06, + "loss": 2.3284, + "step": 9584 + }, + { + "epoch": 0.5142167381974249, + "grad_norm": 1.4016145467758179, + "learning_rate": 2.5112944041947566e-06, + "loss": 2.325, + "step": 9585 + }, + { + "epoch": 0.5142703862660944, + "grad_norm": 1.4820910692214966, + "learning_rate": 2.5108600068205195e-06, + "loss": 2.0361, + "step": 9586 + }, + { + "epoch": 0.5143240343347639, + "grad_norm": 1.5624785423278809, + "learning_rate": 2.5104256091183883e-06, + "loss": 2.1892, + "step": 9587 + }, + { + "epoch": 0.5143776824034335, + "grad_norm": 1.2946652173995972, + "learning_rate": 2.509991211101479e-06, + "loss": 2.1366, + "step": 9588 + }, + { + "epoch": 0.514431330472103, + "grad_norm": 1.941336989402771, + "learning_rate": 2.509556812782907e-06, + "loss": 2.1995, + "step": 9589 + }, + { + "epoch": 0.5144849785407726, + "grad_norm": 1.2235581874847412, + "learning_rate": 2.509122414175789e-06, + "loss": 2.3221, + "step": 9590 + }, + { + "epoch": 0.514538626609442, + "grad_norm": 1.4589020013809204, + "learning_rate": 2.5086880152932403e-06, + "loss": 2.3056, + "step": 9591 + }, + { + "epoch": 0.5145922746781116, + "grad_norm": 1.599081039428711, + "learning_rate": 2.508253616148376e-06, + "loss": 2.2127, + "step": 9592 + }, + { + "epoch": 0.5146459227467811, + "grad_norm": 1.6361404657363892, + "learning_rate": 2.5078192167543127e-06, + "loss": 2.4998, + "step": 9593 + }, + { + "epoch": 0.5146995708154506, + "grad_norm": 1.4749610424041748, + "learning_rate": 2.507384817124165e-06, + "loss": 2.322, + "step": 9594 + }, + { + "epoch": 0.5147532188841202, + "grad_norm": 1.5237866640090942, + "learning_rate": 2.5069504172710496e-06, + "loss": 2.3737, + "step": 9595 + }, + { + "epoch": 0.5148068669527897, + "grad_norm": 1.6884499788284302, + "learning_rate": 2.506516017208082e-06, + "loss": 2.436, + "step": 9596 + }, + { + "epoch": 0.5148605150214592, + "grad_norm": 1.476137399673462, + "learning_rate": 2.506081616948377e-06, + "loss": 1.7687, + "step": 9597 + }, + { + "epoch": 0.5149141630901287, + "grad_norm": 1.4475433826446533, + "learning_rate": 2.5056472165050514e-06, + "loss": 2.0841, + "step": 9598 + }, + { + "epoch": 0.5149678111587983, + "grad_norm": 1.768786907196045, + "learning_rate": 2.5052128158912216e-06, + "loss": 2.3692, + "step": 9599 + }, + { + "epoch": 0.5150214592274678, + "grad_norm": 1.3260886669158936, + "learning_rate": 2.504778415120002e-06, + "loss": 2.183, + "step": 9600 + }, + { + "epoch": 0.5150751072961374, + "grad_norm": 2.1755878925323486, + "learning_rate": 2.504344014204509e-06, + "loss": 2.336, + "step": 9601 + }, + { + "epoch": 0.5151287553648068, + "grad_norm": 1.6680762767791748, + "learning_rate": 2.5039096131578585e-06, + "loss": 2.3285, + "step": 9602 + }, + { + "epoch": 0.5151824034334764, + "grad_norm": 1.7270575761795044, + "learning_rate": 2.503475211993164e-06, + "loss": 2.2569, + "step": 9603 + }, + { + "epoch": 0.5152360515021459, + "grad_norm": 1.384505033493042, + "learning_rate": 2.503040810723545e-06, + "loss": 2.5888, + "step": 9604 + }, + { + "epoch": 0.5152896995708155, + "grad_norm": 1.7739157676696777, + "learning_rate": 2.5026064093621157e-06, + "loss": 2.3808, + "step": 9605 + }, + { + "epoch": 0.515343347639485, + "grad_norm": 1.6307008266448975, + "learning_rate": 2.5021720079219913e-06, + "loss": 2.3503, + "step": 9606 + }, + { + "epoch": 0.5153969957081546, + "grad_norm": 1.9977308511734009, + "learning_rate": 2.501737606416288e-06, + "loss": 2.4389, + "step": 9607 + }, + { + "epoch": 0.515450643776824, + "grad_norm": 1.501369595527649, + "learning_rate": 2.501303204858121e-06, + "loss": 2.3321, + "step": 9608 + }, + { + "epoch": 0.5155042918454935, + "grad_norm": 1.5063793659210205, + "learning_rate": 2.500868803260607e-06, + "loss": 2.4591, + "step": 9609 + }, + { + "epoch": 0.5155579399141631, + "grad_norm": 1.5581926107406616, + "learning_rate": 2.5004344016368616e-06, + "loss": 2.4172, + "step": 9610 + }, + { + "epoch": 0.5156115879828326, + "grad_norm": 1.5553542375564575, + "learning_rate": 2.5e-06, + "loss": 2.2424, + "step": 9611 + }, + { + "epoch": 0.5156652360515022, + "grad_norm": 2.923837423324585, + "learning_rate": 2.499565598363139e-06, + "loss": 2.2986, + "step": 9612 + }, + { + "epoch": 0.5157188841201716, + "grad_norm": 4.4027838706970215, + "learning_rate": 2.4991311967393937e-06, + "loss": 2.3345, + "step": 9613 + }, + { + "epoch": 0.5157725321888412, + "grad_norm": 1.8025678396224976, + "learning_rate": 2.4986967951418795e-06, + "loss": 2.2918, + "step": 9614 + }, + { + "epoch": 0.5158261802575107, + "grad_norm": 1.4770300388336182, + "learning_rate": 2.4982623935837126e-06, + "loss": 2.4222, + "step": 9615 + }, + { + "epoch": 0.5158798283261803, + "grad_norm": 1.4561740159988403, + "learning_rate": 2.49782799207801e-06, + "loss": 2.4631, + "step": 9616 + }, + { + "epoch": 0.5159334763948498, + "grad_norm": 1.3746614456176758, + "learning_rate": 2.4973935906378855e-06, + "loss": 2.2644, + "step": 9617 + }, + { + "epoch": 0.5159871244635194, + "grad_norm": 1.653701901435852, + "learning_rate": 2.4969591892764555e-06, + "loss": 2.4652, + "step": 9618 + }, + { + "epoch": 0.5160407725321888, + "grad_norm": 1.688780665397644, + "learning_rate": 2.4965247880068363e-06, + "loss": 2.1872, + "step": 9619 + }, + { + "epoch": 0.5160944206008584, + "grad_norm": 1.6097456216812134, + "learning_rate": 2.496090386842143e-06, + "loss": 2.4221, + "step": 9620 + }, + { + "epoch": 0.5161480686695279, + "grad_norm": 1.6143680810928345, + "learning_rate": 2.4956559857954914e-06, + "loss": 2.2124, + "step": 9621 + }, + { + "epoch": 0.5162017167381975, + "grad_norm": 1.3323038816452026, + "learning_rate": 2.4952215848799984e-06, + "loss": 2.3923, + "step": 9622 + }, + { + "epoch": 0.516255364806867, + "grad_norm": 1.9085757732391357, + "learning_rate": 2.494787184108779e-06, + "loss": 2.1916, + "step": 9623 + }, + { + "epoch": 0.5163090128755364, + "grad_norm": 1.1856188774108887, + "learning_rate": 2.494352783494948e-06, + "loss": 2.4853, + "step": 9624 + }, + { + "epoch": 0.516362660944206, + "grad_norm": 1.6108092069625854, + "learning_rate": 2.4939183830516235e-06, + "loss": 2.3095, + "step": 9625 + }, + { + "epoch": 0.5164163090128755, + "grad_norm": 1.609169602394104, + "learning_rate": 2.493483982791919e-06, + "loss": 2.2434, + "step": 9626 + }, + { + "epoch": 0.5164699570815451, + "grad_norm": 1.417149543762207, + "learning_rate": 2.4930495827289512e-06, + "loss": 2.2729, + "step": 9627 + }, + { + "epoch": 0.5165236051502146, + "grad_norm": 1.7957649230957031, + "learning_rate": 2.4926151828758358e-06, + "loss": 2.2747, + "step": 9628 + }, + { + "epoch": 0.5165772532188841, + "grad_norm": 3.403876543045044, + "learning_rate": 2.492180783245688e-06, + "loss": 2.222, + "step": 9629 + }, + { + "epoch": 0.5166309012875536, + "grad_norm": 1.5992838144302368, + "learning_rate": 2.4917463838516247e-06, + "loss": 2.1075, + "step": 9630 + }, + { + "epoch": 0.5166845493562232, + "grad_norm": 1.2243341207504272, + "learning_rate": 2.4913119847067605e-06, + "loss": 2.0427, + "step": 9631 + }, + { + "epoch": 0.5167381974248927, + "grad_norm": 1.6887693405151367, + "learning_rate": 2.4908775858242105e-06, + "loss": 2.2263, + "step": 9632 + }, + { + "epoch": 0.5167918454935623, + "grad_norm": 1.2971817255020142, + "learning_rate": 2.4904431872170924e-06, + "loss": 2.0115, + "step": 9633 + }, + { + "epoch": 0.5168454935622318, + "grad_norm": 1.4999419450759888, + "learning_rate": 2.4900087888985224e-06, + "loss": 2.1475, + "step": 9634 + }, + { + "epoch": 0.5168991416309013, + "grad_norm": 1.9111789464950562, + "learning_rate": 2.489574390881613e-06, + "loss": 2.5304, + "step": 9635 + }, + { + "epoch": 0.5169527896995708, + "grad_norm": 1.4756755828857422, + "learning_rate": 2.4891399931794813e-06, + "loss": 2.3749, + "step": 9636 + }, + { + "epoch": 0.5170064377682403, + "grad_norm": 1.600265622138977, + "learning_rate": 2.488705595805244e-06, + "loss": 2.2673, + "step": 9637 + }, + { + "epoch": 0.5170600858369099, + "grad_norm": 1.4647659063339233, + "learning_rate": 2.4882711987720154e-06, + "loss": 2.3906, + "step": 9638 + }, + { + "epoch": 0.5171137339055794, + "grad_norm": 1.6222003698349, + "learning_rate": 2.487836802092912e-06, + "loss": 2.2881, + "step": 9639 + }, + { + "epoch": 0.5171673819742489, + "grad_norm": 1.4544073343276978, + "learning_rate": 2.4874024057810493e-06, + "loss": 2.3568, + "step": 9640 + }, + { + "epoch": 0.5172210300429184, + "grad_norm": 2.775313138961792, + "learning_rate": 2.4869680098495427e-06, + "loss": 2.1056, + "step": 9641 + }, + { + "epoch": 0.517274678111588, + "grad_norm": 1.3598438501358032, + "learning_rate": 2.4865336143115086e-06, + "loss": 1.8313, + "step": 9642 + }, + { + "epoch": 0.5173283261802575, + "grad_norm": 1.635754108428955, + "learning_rate": 2.486099219180061e-06, + "loss": 2.0876, + "step": 9643 + }, + { + "epoch": 0.5173819742489271, + "grad_norm": 1.369410514831543, + "learning_rate": 2.485664824468317e-06, + "loss": 2.3465, + "step": 9644 + }, + { + "epoch": 0.5174356223175965, + "grad_norm": 1.551515817642212, + "learning_rate": 2.485230430189391e-06, + "loss": 2.3621, + "step": 9645 + }, + { + "epoch": 0.5174892703862661, + "grad_norm": 1.5911792516708374, + "learning_rate": 2.4847960363563996e-06, + "loss": 2.03, + "step": 9646 + }, + { + "epoch": 0.5175429184549356, + "grad_norm": 1.5923548936843872, + "learning_rate": 2.4843616429824577e-06, + "loss": 2.4821, + "step": 9647 + }, + { + "epoch": 0.5175965665236052, + "grad_norm": 1.5245214700698853, + "learning_rate": 2.483927250080681e-06, + "loss": 2.1691, + "step": 9648 + }, + { + "epoch": 0.5176502145922747, + "grad_norm": 1.6270971298217773, + "learning_rate": 2.4834928576641846e-06, + "loss": 2.2847, + "step": 9649 + }, + { + "epoch": 0.5177038626609443, + "grad_norm": 1.6680573225021362, + "learning_rate": 2.4830584657460852e-06, + "loss": 2.3794, + "step": 9650 + }, + { + "epoch": 0.5177575107296137, + "grad_norm": 1.6640452146530151, + "learning_rate": 2.4826240743394985e-06, + "loss": 2.2994, + "step": 9651 + }, + { + "epoch": 0.5178111587982832, + "grad_norm": 1.576786756515503, + "learning_rate": 2.482189683457538e-06, + "loss": 2.1743, + "step": 9652 + }, + { + "epoch": 0.5178648068669528, + "grad_norm": 1.9678094387054443, + "learning_rate": 2.4817552931133195e-06, + "loss": 2.4447, + "step": 9653 + }, + { + "epoch": 0.5179184549356223, + "grad_norm": 1.5628901720046997, + "learning_rate": 2.48132090331996e-06, + "loss": 2.1492, + "step": 9654 + }, + { + "epoch": 0.5179721030042919, + "grad_norm": 1.6818790435791016, + "learning_rate": 2.4808865140905735e-06, + "loss": 2.2523, + "step": 9655 + }, + { + "epoch": 0.5180257510729613, + "grad_norm": 1.558101773262024, + "learning_rate": 2.4804521254382765e-06, + "loss": 2.1409, + "step": 9656 + }, + { + "epoch": 0.5180793991416309, + "grad_norm": 1.3217257261276245, + "learning_rate": 2.4800177373761837e-06, + "loss": 2.6076, + "step": 9657 + }, + { + "epoch": 0.5181330472103004, + "grad_norm": 1.4312920570373535, + "learning_rate": 2.479583349917411e-06, + "loss": 2.4338, + "step": 9658 + }, + { + "epoch": 0.51818669527897, + "grad_norm": 1.84848153591156, + "learning_rate": 2.479148963075073e-06, + "loss": 2.4287, + "step": 9659 + }, + { + "epoch": 0.5182403433476395, + "grad_norm": 1.546366810798645, + "learning_rate": 2.478714576862286e-06, + "loss": 2.3746, + "step": 9660 + }, + { + "epoch": 0.518293991416309, + "grad_norm": 1.5893722772598267, + "learning_rate": 2.4782801912921644e-06, + "loss": 2.0261, + "step": 9661 + }, + { + "epoch": 0.5183476394849785, + "grad_norm": 1.5034621953964233, + "learning_rate": 2.477845806377824e-06, + "loss": 1.5283, + "step": 9662 + }, + { + "epoch": 0.5184012875536481, + "grad_norm": 1.5757497549057007, + "learning_rate": 2.47741142213238e-06, + "loss": 2.4137, + "step": 9663 + }, + { + "epoch": 0.5184549356223176, + "grad_norm": 1.4522570371627808, + "learning_rate": 2.4769770385689475e-06, + "loss": 2.2991, + "step": 9664 + }, + { + "epoch": 0.5185085836909872, + "grad_norm": 1.9090070724487305, + "learning_rate": 2.4765426557006413e-06, + "loss": 2.4281, + "step": 9665 + }, + { + "epoch": 0.5185622317596567, + "grad_norm": 1.4936554431915283, + "learning_rate": 2.476108273540578e-06, + "loss": 2.065, + "step": 9666 + }, + { + "epoch": 0.5186158798283261, + "grad_norm": 3.593622922897339, + "learning_rate": 2.475673892101872e-06, + "loss": 2.2239, + "step": 9667 + }, + { + "epoch": 0.5186695278969957, + "grad_norm": 1.4530092477798462, + "learning_rate": 2.475239511397638e-06, + "loss": 2.0853, + "step": 9668 + }, + { + "epoch": 0.5187231759656652, + "grad_norm": 1.2249287366867065, + "learning_rate": 2.474805131440992e-06, + "loss": 1.871, + "step": 9669 + }, + { + "epoch": 0.5187768240343348, + "grad_norm": 1.5036066770553589, + "learning_rate": 2.474370752245049e-06, + "loss": 1.9737, + "step": 9670 + }, + { + "epoch": 0.5188304721030043, + "grad_norm": 1.5948525667190552, + "learning_rate": 2.4739363738229233e-06, + "loss": 2.3048, + "step": 9671 + }, + { + "epoch": 0.5188841201716738, + "grad_norm": 1.5109959840774536, + "learning_rate": 2.4735019961877306e-06, + "loss": 2.3035, + "step": 9672 + }, + { + "epoch": 0.5189377682403433, + "grad_norm": 1.6494393348693848, + "learning_rate": 2.473067619352586e-06, + "loss": 2.3481, + "step": 9673 + }, + { + "epoch": 0.5189914163090129, + "grad_norm": 1.9646680355072021, + "learning_rate": 2.472633243330605e-06, + "loss": 2.2453, + "step": 9674 + }, + { + "epoch": 0.5190450643776824, + "grad_norm": 1.5722788572311401, + "learning_rate": 2.4721988681349014e-06, + "loss": 2.4735, + "step": 9675 + }, + { + "epoch": 0.519098712446352, + "grad_norm": 1.652121663093567, + "learning_rate": 2.4717644937785906e-06, + "loss": 2.2712, + "step": 9676 + }, + { + "epoch": 0.5191523605150214, + "grad_norm": 1.6201772689819336, + "learning_rate": 2.4713301202747876e-06, + "loss": 2.1986, + "step": 9677 + }, + { + "epoch": 0.519206008583691, + "grad_norm": 1.821562647819519, + "learning_rate": 2.4708957476366095e-06, + "loss": 2.4725, + "step": 9678 + }, + { + "epoch": 0.5192596566523605, + "grad_norm": 2.1833207607269287, + "learning_rate": 2.470461375877168e-06, + "loss": 2.1025, + "step": 9679 + }, + { + "epoch": 0.51931330472103, + "grad_norm": 1.4056572914123535, + "learning_rate": 2.470027005009579e-06, + "loss": 2.1792, + "step": 9680 + }, + { + "epoch": 0.5193669527896996, + "grad_norm": 1.2415049076080322, + "learning_rate": 2.469592635046958e-06, + "loss": 2.0338, + "step": 9681 + }, + { + "epoch": 0.519420600858369, + "grad_norm": 1.5852965116500854, + "learning_rate": 2.4691582660024187e-06, + "loss": 2.2814, + "step": 9682 + }, + { + "epoch": 0.5194742489270386, + "grad_norm": 1.5123838186264038, + "learning_rate": 2.468723897889077e-06, + "loss": 2.4214, + "step": 9683 + }, + { + "epoch": 0.5195278969957081, + "grad_norm": 1.535053014755249, + "learning_rate": 2.4682895307200475e-06, + "loss": 2.2912, + "step": 9684 + }, + { + "epoch": 0.5195815450643777, + "grad_norm": 1.5153706073760986, + "learning_rate": 2.4678551645084448e-06, + "loss": 2.2759, + "step": 9685 + }, + { + "epoch": 0.5196351931330472, + "grad_norm": 1.605078935623169, + "learning_rate": 2.4674207992673834e-06, + "loss": 2.4759, + "step": 9686 + }, + { + "epoch": 0.5196888412017168, + "grad_norm": 1.2608284950256348, + "learning_rate": 2.4669864350099788e-06, + "loss": 1.8762, + "step": 9687 + }, + { + "epoch": 0.5197424892703862, + "grad_norm": 1.6317566633224487, + "learning_rate": 2.4665520717493443e-06, + "loss": 2.5124, + "step": 9688 + }, + { + "epoch": 0.5197961373390558, + "grad_norm": 1.7608729600906372, + "learning_rate": 2.4661177094985955e-06, + "loss": 2.3143, + "step": 9689 + }, + { + "epoch": 0.5198497854077253, + "grad_norm": 2.0697593688964844, + "learning_rate": 2.4656833482708465e-06, + "loss": 2.4913, + "step": 9690 + }, + { + "epoch": 0.5199034334763949, + "grad_norm": 1.184171199798584, + "learning_rate": 2.4652489880792128e-06, + "loss": 2.0531, + "step": 9691 + }, + { + "epoch": 0.5199570815450644, + "grad_norm": 1.4613450765609741, + "learning_rate": 2.4648146289368077e-06, + "loss": 2.1194, + "step": 9692 + }, + { + "epoch": 0.520010729613734, + "grad_norm": 1.3956979513168335, + "learning_rate": 2.464380270856746e-06, + "loss": 2.3156, + "step": 9693 + }, + { + "epoch": 0.5200643776824034, + "grad_norm": 1.658992886543274, + "learning_rate": 2.4639459138521425e-06, + "loss": 2.3121, + "step": 9694 + }, + { + "epoch": 0.5201180257510729, + "grad_norm": 1.434004783630371, + "learning_rate": 2.4635115579361125e-06, + "loss": 2.2989, + "step": 9695 + }, + { + "epoch": 0.5201716738197425, + "grad_norm": 1.6551181077957153, + "learning_rate": 2.463077203121769e-06, + "loss": 2.2332, + "step": 9696 + }, + { + "epoch": 0.520225321888412, + "grad_norm": 1.331092357635498, + "learning_rate": 2.4626428494222267e-06, + "loss": 2.2475, + "step": 9697 + }, + { + "epoch": 0.5202789699570816, + "grad_norm": 1.643966555595398, + "learning_rate": 2.462208496850599e-06, + "loss": 2.2966, + "step": 9698 + }, + { + "epoch": 0.520332618025751, + "grad_norm": 1.561035394668579, + "learning_rate": 2.4617741454200026e-06, + "loss": 2.1973, + "step": 9699 + }, + { + "epoch": 0.5203862660944206, + "grad_norm": 3.4552266597747803, + "learning_rate": 2.46133979514355e-06, + "loss": 1.4885, + "step": 9700 + }, + { + "epoch": 0.5204399141630901, + "grad_norm": 1.6493990421295166, + "learning_rate": 2.460905446034356e-06, + "loss": 2.2642, + "step": 9701 + }, + { + "epoch": 0.5204935622317597, + "grad_norm": 1.5534117221832275, + "learning_rate": 2.4604710981055348e-06, + "loss": 2.1881, + "step": 9702 + }, + { + "epoch": 0.5205472103004292, + "grad_norm": 1.6329491138458252, + "learning_rate": 2.4600367513702e-06, + "loss": 2.2386, + "step": 9703 + }, + { + "epoch": 0.5206008583690988, + "grad_norm": 1.6095008850097656, + "learning_rate": 2.4596024058414674e-06, + "loss": 2.2179, + "step": 9704 + }, + { + "epoch": 0.5206545064377682, + "grad_norm": 1.7537890672683716, + "learning_rate": 2.4591680615324493e-06, + "loss": 2.1922, + "step": 9705 + }, + { + "epoch": 0.5207081545064378, + "grad_norm": 1.9739524126052856, + "learning_rate": 2.4587337184562598e-06, + "loss": 2.3094, + "step": 9706 + }, + { + "epoch": 0.5207618025751073, + "grad_norm": 1.511179804801941, + "learning_rate": 2.4582993766260138e-06, + "loss": 2.4409, + "step": 9707 + }, + { + "epoch": 0.5208154506437769, + "grad_norm": 1.5623152256011963, + "learning_rate": 2.457865036054825e-06, + "loss": 2.2865, + "step": 9708 + }, + { + "epoch": 0.5208690987124464, + "grad_norm": 1.6876882314682007, + "learning_rate": 2.457430696755807e-06, + "loss": 2.1871, + "step": 9709 + }, + { + "epoch": 0.5209227467811158, + "grad_norm": 1.5745306015014648, + "learning_rate": 2.4569963587420738e-06, + "loss": 2.181, + "step": 9710 + }, + { + "epoch": 0.5209763948497854, + "grad_norm": 1.4491225481033325, + "learning_rate": 2.4565620220267396e-06, + "loss": 2.4284, + "step": 9711 + }, + { + "epoch": 0.5210300429184549, + "grad_norm": 1.3392448425292969, + "learning_rate": 2.4561276866229185e-06, + "loss": 2.4016, + "step": 9712 + }, + { + "epoch": 0.5210836909871245, + "grad_norm": 1.2452610731124878, + "learning_rate": 2.4556933525437245e-06, + "loss": 2.1647, + "step": 9713 + }, + { + "epoch": 0.521137339055794, + "grad_norm": 1.4346636533737183, + "learning_rate": 2.45525901980227e-06, + "loss": 2.2871, + "step": 9714 + }, + { + "epoch": 0.5211909871244635, + "grad_norm": 1.585094928741455, + "learning_rate": 2.4548246884116686e-06, + "loss": 2.2843, + "step": 9715 + }, + { + "epoch": 0.521244635193133, + "grad_norm": 1.4704527854919434, + "learning_rate": 2.4543903583850355e-06, + "loss": 1.9305, + "step": 9716 + }, + { + "epoch": 0.5212982832618026, + "grad_norm": 1.8618366718292236, + "learning_rate": 2.4539560297354836e-06, + "loss": 2.268, + "step": 9717 + }, + { + "epoch": 0.5213519313304721, + "grad_norm": 1.4819940328598022, + "learning_rate": 2.453521702476126e-06, + "loss": 2.2612, + "step": 9718 + }, + { + "epoch": 0.5214055793991417, + "grad_norm": 1.4518449306488037, + "learning_rate": 2.4530873766200775e-06, + "loss": 2.1968, + "step": 9719 + }, + { + "epoch": 0.5214592274678111, + "grad_norm": 1.5152201652526855, + "learning_rate": 2.452653052180451e-06, + "loss": 1.9411, + "step": 9720 + }, + { + "epoch": 0.5215128755364807, + "grad_norm": 1.5661042928695679, + "learning_rate": 2.4522187291703587e-06, + "loss": 2.1689, + "step": 9721 + }, + { + "epoch": 0.5215665236051502, + "grad_norm": 1.4090567827224731, + "learning_rate": 2.451784407602916e-06, + "loss": 2.1509, + "step": 9722 + }, + { + "epoch": 0.5216201716738197, + "grad_norm": 1.5194065570831299, + "learning_rate": 2.4513500874912353e-06, + "loss": 2.4101, + "step": 9723 + }, + { + "epoch": 0.5216738197424893, + "grad_norm": 1.5935362577438354, + "learning_rate": 2.45091576884843e-06, + "loss": 2.1304, + "step": 9724 + }, + { + "epoch": 0.5217274678111588, + "grad_norm": 1.7381033897399902, + "learning_rate": 2.450481451687613e-06, + "loss": 2.2078, + "step": 9725 + }, + { + "epoch": 0.5217811158798283, + "grad_norm": 1.5339064598083496, + "learning_rate": 2.450047136021898e-06, + "loss": 2.1791, + "step": 9726 + }, + { + "epoch": 0.5218347639484978, + "grad_norm": 1.322548270225525, + "learning_rate": 2.4496128218643976e-06, + "loss": 2.2028, + "step": 9727 + }, + { + "epoch": 0.5218884120171674, + "grad_norm": 1.6987981796264648, + "learning_rate": 2.449178509228226e-06, + "loss": 2.4733, + "step": 9728 + }, + { + "epoch": 0.5219420600858369, + "grad_norm": 1.5402007102966309, + "learning_rate": 2.4487441981264955e-06, + "loss": 2.1016, + "step": 9729 + }, + { + "epoch": 0.5219957081545065, + "grad_norm": 1.499603271484375, + "learning_rate": 2.44830988857232e-06, + "loss": 2.2541, + "step": 9730 + }, + { + "epoch": 0.5220493562231759, + "grad_norm": 1.5691012144088745, + "learning_rate": 2.447875580578812e-06, + "loss": 1.429, + "step": 9731 + }, + { + "epoch": 0.5221030042918455, + "grad_norm": 1.2811968326568604, + "learning_rate": 2.447441274159084e-06, + "loss": 2.2587, + "step": 9732 + }, + { + "epoch": 0.522156652360515, + "grad_norm": 1.4356921911239624, + "learning_rate": 2.447006969326249e-06, + "loss": 2.2805, + "step": 9733 + }, + { + "epoch": 0.5222103004291846, + "grad_norm": 1.6899813413619995, + "learning_rate": 2.4465726660934203e-06, + "loss": 2.3365, + "step": 9734 + }, + { + "epoch": 0.5222639484978541, + "grad_norm": 1.8181899785995483, + "learning_rate": 2.4461383644737104e-06, + "loss": 2.4161, + "step": 9735 + }, + { + "epoch": 0.5223175965665237, + "grad_norm": 1.648056983947754, + "learning_rate": 2.4457040644802326e-06, + "loss": 2.2765, + "step": 9736 + }, + { + "epoch": 0.5223712446351931, + "grad_norm": 1.5701541900634766, + "learning_rate": 2.445269766126099e-06, + "loss": 2.1291, + "step": 9737 + }, + { + "epoch": 0.5224248927038626, + "grad_norm": 1.684285283088684, + "learning_rate": 2.4448354694244225e-06, + "loss": 2.1515, + "step": 9738 + }, + { + "epoch": 0.5224785407725322, + "grad_norm": 1.756799578666687, + "learning_rate": 2.444401174388315e-06, + "loss": 2.3229, + "step": 9739 + }, + { + "epoch": 0.5225321888412017, + "grad_norm": 1.489188313484192, + "learning_rate": 2.443966881030892e-06, + "loss": 2.182, + "step": 9740 + }, + { + "epoch": 0.5225858369098713, + "grad_norm": 1.885585904121399, + "learning_rate": 2.443532589365262e-06, + "loss": 2.1012, + "step": 9741 + }, + { + "epoch": 0.5226394849785407, + "grad_norm": 1.7143566608428955, + "learning_rate": 2.4430982994045402e-06, + "loss": 2.4894, + "step": 9742 + }, + { + "epoch": 0.5226931330472103, + "grad_norm": 1.2306407690048218, + "learning_rate": 2.442664011161837e-06, + "loss": 2.116, + "step": 9743 + }, + { + "epoch": 0.5227467811158798, + "grad_norm": 2.4058837890625, + "learning_rate": 2.4422297246502663e-06, + "loss": 2.1899, + "step": 9744 + }, + { + "epoch": 0.5228004291845494, + "grad_norm": 1.7514617443084717, + "learning_rate": 2.44179543988294e-06, + "loss": 1.9331, + "step": 9745 + }, + { + "epoch": 0.5228540772532189, + "grad_norm": 1.6922224760055542, + "learning_rate": 2.4413611568729705e-06, + "loss": 2.1875, + "step": 9746 + }, + { + "epoch": 0.5229077253218885, + "grad_norm": 1.6257402896881104, + "learning_rate": 2.4409268756334697e-06, + "loss": 2.2671, + "step": 9747 + }, + { + "epoch": 0.5229613733905579, + "grad_norm": 1.7832056283950806, + "learning_rate": 2.4404925961775504e-06, + "loss": 2.2666, + "step": 9748 + }, + { + "epoch": 0.5230150214592275, + "grad_norm": 1.3588719367980957, + "learning_rate": 2.440058318518324e-06, + "loss": 2.4787, + "step": 9749 + }, + { + "epoch": 0.523068669527897, + "grad_norm": 1.7460546493530273, + "learning_rate": 2.439624042668902e-06, + "loss": 2.1002, + "step": 9750 + }, + { + "epoch": 0.5231223175965666, + "grad_norm": 1.2867724895477295, + "learning_rate": 2.439189768642398e-06, + "loss": 2.3529, + "step": 9751 + }, + { + "epoch": 0.523175965665236, + "grad_norm": 1.4527522325515747, + "learning_rate": 2.4387554964519223e-06, + "loss": 2.1881, + "step": 9752 + }, + { + "epoch": 0.5232296137339055, + "grad_norm": 1.621613621711731, + "learning_rate": 2.438321226110588e-06, + "loss": 2.2694, + "step": 9753 + }, + { + "epoch": 0.5232832618025751, + "grad_norm": 1.5440576076507568, + "learning_rate": 2.4378869576315063e-06, + "loss": 1.9252, + "step": 9754 + }, + { + "epoch": 0.5233369098712446, + "grad_norm": 1.8236671686172485, + "learning_rate": 2.437452691027789e-06, + "loss": 2.2908, + "step": 9755 + }, + { + "epoch": 0.5233905579399142, + "grad_norm": 1.6313265562057495, + "learning_rate": 2.4370184263125474e-06, + "loss": 2.2331, + "step": 9756 + }, + { + "epoch": 0.5234442060085837, + "grad_norm": 1.5128180980682373, + "learning_rate": 2.4365841634988956e-06, + "loss": 2.2945, + "step": 9757 + }, + { + "epoch": 0.5234978540772532, + "grad_norm": 1.6240179538726807, + "learning_rate": 2.436149902599942e-06, + "loss": 2.355, + "step": 9758 + }, + { + "epoch": 0.5235515021459227, + "grad_norm": 1.6296993494033813, + "learning_rate": 2.4357156436287995e-06, + "loss": 2.2915, + "step": 9759 + }, + { + "epoch": 0.5236051502145923, + "grad_norm": 1.5697522163391113, + "learning_rate": 2.4352813865985785e-06, + "loss": 2.2247, + "step": 9760 + }, + { + "epoch": 0.5236587982832618, + "grad_norm": 1.5460034608840942, + "learning_rate": 2.4348471315223923e-06, + "loss": 2.2499, + "step": 9761 + }, + { + "epoch": 0.5237124463519314, + "grad_norm": 1.3447622060775757, + "learning_rate": 2.4344128784133515e-06, + "loss": 1.7818, + "step": 9762 + }, + { + "epoch": 0.5237660944206008, + "grad_norm": 1.6427818536758423, + "learning_rate": 2.433978627284567e-06, + "loss": 2.0192, + "step": 9763 + }, + { + "epoch": 0.5238197424892704, + "grad_norm": 1.7930079698562622, + "learning_rate": 2.43354437814915e-06, + "loss": 2.187, + "step": 9764 + }, + { + "epoch": 0.5238733905579399, + "grad_norm": 1.3613404035568237, + "learning_rate": 2.433110131020213e-06, + "loss": 2.0356, + "step": 9765 + }, + { + "epoch": 0.5239270386266094, + "grad_norm": 1.729349970817566, + "learning_rate": 2.432675885910866e-06, + "loss": 2.2305, + "step": 9766 + }, + { + "epoch": 0.523980686695279, + "grad_norm": 1.523099422454834, + "learning_rate": 2.4322416428342197e-06, + "loss": 2.5277, + "step": 9767 + }, + { + "epoch": 0.5240343347639485, + "grad_norm": 1.5763788223266602, + "learning_rate": 2.4318074018033856e-06, + "loss": 2.04, + "step": 9768 + }, + { + "epoch": 0.524087982832618, + "grad_norm": 1.5635384321212769, + "learning_rate": 2.4313731628314746e-06, + "loss": 2.2231, + "step": 9769 + }, + { + "epoch": 0.5241416309012875, + "grad_norm": 1.5492242574691772, + "learning_rate": 2.4309389259315973e-06, + "loss": 2.0496, + "step": 9770 + }, + { + "epoch": 0.5241952789699571, + "grad_norm": 4.780927658081055, + "learning_rate": 2.430504691116865e-06, + "loss": 2.3737, + "step": 9771 + }, + { + "epoch": 0.5242489270386266, + "grad_norm": 1.7463847398757935, + "learning_rate": 2.4300704584003883e-06, + "loss": 1.8717, + "step": 9772 + }, + { + "epoch": 0.5243025751072962, + "grad_norm": 1.5545408725738525, + "learning_rate": 2.4296362277952778e-06, + "loss": 2.2483, + "step": 9773 + }, + { + "epoch": 0.5243562231759656, + "grad_norm": 1.5446425676345825, + "learning_rate": 2.4292019993146445e-06, + "loss": 2.4052, + "step": 9774 + }, + { + "epoch": 0.5244098712446352, + "grad_norm": 1.613356590270996, + "learning_rate": 2.4287677729715992e-06, + "loss": 2.2867, + "step": 9775 + }, + { + "epoch": 0.5244635193133047, + "grad_norm": 1.5137661695480347, + "learning_rate": 2.428333548779251e-06, + "loss": 2.1947, + "step": 9776 + }, + { + "epoch": 0.5245171673819743, + "grad_norm": 1.547343134880066, + "learning_rate": 2.4278993267507104e-06, + "loss": 2.1384, + "step": 9777 + }, + { + "epoch": 0.5245708154506438, + "grad_norm": 1.6261996030807495, + "learning_rate": 2.4274651068990894e-06, + "loss": 2.2906, + "step": 9778 + }, + { + "epoch": 0.5246244635193134, + "grad_norm": 1.5851014852523804, + "learning_rate": 2.4270308892374974e-06, + "loss": 2.233, + "step": 9779 + }, + { + "epoch": 0.5246781115879828, + "grad_norm": 1.5763484239578247, + "learning_rate": 2.426596673779045e-06, + "loss": 2.3982, + "step": 9780 + }, + { + "epoch": 0.5247317596566523, + "grad_norm": 1.673654317855835, + "learning_rate": 2.426162460536841e-06, + "loss": 2.2261, + "step": 9781 + }, + { + "epoch": 0.5247854077253219, + "grad_norm": 1.475342869758606, + "learning_rate": 2.4257282495239972e-06, + "loss": 2.3889, + "step": 9782 + }, + { + "epoch": 0.5248390557939914, + "grad_norm": 1.7657757997512817, + "learning_rate": 2.425294040753623e-06, + "loss": 2.2526, + "step": 9783 + }, + { + "epoch": 0.524892703862661, + "grad_norm": 1.5121018886566162, + "learning_rate": 2.424859834238828e-06, + "loss": 2.2056, + "step": 9784 + }, + { + "epoch": 0.5249463519313304, + "grad_norm": 1.5009384155273438, + "learning_rate": 2.4244256299927228e-06, + "loss": 2.3934, + "step": 9785 + }, + { + "epoch": 0.525, + "grad_norm": 1.6311734914779663, + "learning_rate": 2.4239914280284165e-06, + "loss": 2.2347, + "step": 9786 + }, + { + "epoch": 0.5250536480686695, + "grad_norm": 1.4427019357681274, + "learning_rate": 2.4235572283590194e-06, + "loss": 2.0414, + "step": 9787 + }, + { + "epoch": 0.5251072961373391, + "grad_norm": 1.584061861038208, + "learning_rate": 2.4231230309976405e-06, + "loss": 2.2259, + "step": 9788 + }, + { + "epoch": 0.5251609442060086, + "grad_norm": 1.521130919456482, + "learning_rate": 2.42268883595739e-06, + "loss": 2.4714, + "step": 9789 + }, + { + "epoch": 0.5252145922746781, + "grad_norm": 1.4948731660842896, + "learning_rate": 2.422254643251377e-06, + "loss": 2.16, + "step": 9790 + }, + { + "epoch": 0.5252682403433476, + "grad_norm": 1.6798971891403198, + "learning_rate": 2.4218204528927116e-06, + "loss": 2.5826, + "step": 9791 + }, + { + "epoch": 0.5253218884120172, + "grad_norm": 13.613481521606445, + "learning_rate": 2.4213862648945044e-06, + "loss": 2.3323, + "step": 9792 + }, + { + "epoch": 0.5253755364806867, + "grad_norm": 1.6202540397644043, + "learning_rate": 2.4209520792698614e-06, + "loss": 2.2212, + "step": 9793 + }, + { + "epoch": 0.5254291845493563, + "grad_norm": 1.5739237070083618, + "learning_rate": 2.4205178960318942e-06, + "loss": 2.2416, + "step": 9794 + }, + { + "epoch": 0.5254828326180258, + "grad_norm": 1.492950201034546, + "learning_rate": 2.4200837151937116e-06, + "loss": 2.1916, + "step": 9795 + }, + { + "epoch": 0.5255364806866952, + "grad_norm": 1.7529628276824951, + "learning_rate": 2.4196495367684226e-06, + "loss": 2.4257, + "step": 9796 + }, + { + "epoch": 0.5255901287553648, + "grad_norm": 1.5241831541061401, + "learning_rate": 2.4192153607691363e-06, + "loss": 2.2001, + "step": 9797 + }, + { + "epoch": 0.5256437768240343, + "grad_norm": 1.4280928373336792, + "learning_rate": 2.418781187208962e-06, + "loss": 2.1287, + "step": 9798 + }, + { + "epoch": 0.5256974248927039, + "grad_norm": 1.7071176767349243, + "learning_rate": 2.4183470161010075e-06, + "loss": 2.3798, + "step": 9799 + }, + { + "epoch": 0.5257510729613734, + "grad_norm": 1.7373212575912476, + "learning_rate": 2.4179128474583828e-06, + "loss": 2.2386, + "step": 9800 + }, + { + "epoch": 0.5258047210300429, + "grad_norm": 1.5686688423156738, + "learning_rate": 2.4174786812941968e-06, + "loss": 2.1983, + "step": 9801 + }, + { + "epoch": 0.5258583690987124, + "grad_norm": 1.5391547679901123, + "learning_rate": 2.4170445176215574e-06, + "loss": 2.2235, + "step": 9802 + }, + { + "epoch": 0.525912017167382, + "grad_norm": 1.4957104921340942, + "learning_rate": 2.4166103564535728e-06, + "loss": 1.9004, + "step": 9803 + }, + { + "epoch": 0.5259656652360515, + "grad_norm": 1.3903528451919556, + "learning_rate": 2.4161761978033525e-06, + "loss": 1.8303, + "step": 9804 + }, + { + "epoch": 0.5260193133047211, + "grad_norm": 1.7903767824172974, + "learning_rate": 2.415742041684004e-06, + "loss": 2.445, + "step": 9805 + }, + { + "epoch": 0.5260729613733905, + "grad_norm": 1.4098598957061768, + "learning_rate": 2.415307888108637e-06, + "loss": 2.2311, + "step": 9806 + }, + { + "epoch": 0.5261266094420601, + "grad_norm": 1.8885935544967651, + "learning_rate": 2.4148737370903584e-06, + "loss": 2.206, + "step": 9807 + }, + { + "epoch": 0.5261802575107296, + "grad_norm": 1.5358929634094238, + "learning_rate": 2.4144395886422777e-06, + "loss": 2.29, + "step": 9808 + }, + { + "epoch": 0.5262339055793992, + "grad_norm": 1.8806475400924683, + "learning_rate": 2.4140054427775023e-06, + "loss": 2.1573, + "step": 9809 + }, + { + "epoch": 0.5262875536480687, + "grad_norm": 1.8045859336853027, + "learning_rate": 2.4135712995091407e-06, + "loss": 2.2898, + "step": 9810 + }, + { + "epoch": 0.5263412017167381, + "grad_norm": 2.085477828979492, + "learning_rate": 2.4131371588503e-06, + "loss": 2.3124, + "step": 9811 + }, + { + "epoch": 0.5263948497854077, + "grad_norm": 1.6184645891189575, + "learning_rate": 2.412703020814089e-06, + "loss": 2.5721, + "step": 9812 + }, + { + "epoch": 0.5264484978540772, + "grad_norm": 1.5723474025726318, + "learning_rate": 2.4122688854136154e-06, + "loss": 2.1714, + "step": 9813 + }, + { + "epoch": 0.5265021459227468, + "grad_norm": 1.8893853425979614, + "learning_rate": 2.411834752661986e-06, + "loss": 2.506, + "step": 9814 + }, + { + "epoch": 0.5265557939914163, + "grad_norm": 1.6397137641906738, + "learning_rate": 2.41140062257231e-06, + "loss": 2.2731, + "step": 9815 + }, + { + "epoch": 0.5266094420600859, + "grad_norm": 1.840777039527893, + "learning_rate": 2.410966495157694e-06, + "loss": 2.2121, + "step": 9816 + }, + { + "epoch": 0.5266630901287553, + "grad_norm": 1.702829122543335, + "learning_rate": 2.4105323704312456e-06, + "loss": 2.3608, + "step": 9817 + }, + { + "epoch": 0.5267167381974249, + "grad_norm": 1.7482885122299194, + "learning_rate": 2.410098248406072e-06, + "loss": 2.4142, + "step": 9818 + }, + { + "epoch": 0.5267703862660944, + "grad_norm": 1.7688953876495361, + "learning_rate": 2.409664129095282e-06, + "loss": 2.0551, + "step": 9819 + }, + { + "epoch": 0.526824034334764, + "grad_norm": 1.9494273662567139, + "learning_rate": 2.409230012511981e-06, + "loss": 2.1994, + "step": 9820 + }, + { + "epoch": 0.5268776824034335, + "grad_norm": 1.4894942045211792, + "learning_rate": 2.408795898669277e-06, + "loss": 2.4726, + "step": 9821 + }, + { + "epoch": 0.526931330472103, + "grad_norm": 1.6455023288726807, + "learning_rate": 2.4083617875802765e-06, + "loss": 2.3514, + "step": 9822 + }, + { + "epoch": 0.5269849785407725, + "grad_norm": 1.5976306200027466, + "learning_rate": 2.4079276792580875e-06, + "loss": 2.3635, + "step": 9823 + }, + { + "epoch": 0.527038626609442, + "grad_norm": 1.6638463735580444, + "learning_rate": 2.4074935737158162e-06, + "loss": 2.2159, + "step": 9824 + }, + { + "epoch": 0.5270922746781116, + "grad_norm": 1.4125336408615112, + "learning_rate": 2.4070594709665704e-06, + "loss": 1.7305, + "step": 9825 + }, + { + "epoch": 0.5271459227467811, + "grad_norm": 1.714720606803894, + "learning_rate": 2.406625371023456e-06, + "loss": 2.2382, + "step": 9826 + }, + { + "epoch": 0.5271995708154507, + "grad_norm": 1.5010114908218384, + "learning_rate": 2.4061912738995794e-06, + "loss": 2.212, + "step": 9827 + }, + { + "epoch": 0.5272532188841201, + "grad_norm": 1.7332149744033813, + "learning_rate": 2.4057571796080482e-06, + "loss": 2.2192, + "step": 9828 + }, + { + "epoch": 0.5273068669527897, + "grad_norm": 1.523167610168457, + "learning_rate": 2.4053230881619683e-06, + "loss": 2.058, + "step": 9829 + }, + { + "epoch": 0.5273605150214592, + "grad_norm": 1.33027184009552, + "learning_rate": 2.404888999574446e-06, + "loss": 2.1961, + "step": 9830 + }, + { + "epoch": 0.5274141630901288, + "grad_norm": 1.5720146894454956, + "learning_rate": 2.4044549138585876e-06, + "loss": 2.2347, + "step": 9831 + }, + { + "epoch": 0.5274678111587983, + "grad_norm": 12.6752347946167, + "learning_rate": 2.4040208310275e-06, + "loss": 2.0709, + "step": 9832 + }, + { + "epoch": 0.5275214592274678, + "grad_norm": 1.5185182094573975, + "learning_rate": 2.403586751094289e-06, + "loss": 1.6202, + "step": 9833 + }, + { + "epoch": 0.5275751072961373, + "grad_norm": 2.0892531871795654, + "learning_rate": 2.4031526740720594e-06, + "loss": 2.2351, + "step": 9834 + }, + { + "epoch": 0.5276287553648069, + "grad_norm": 1.5425934791564941, + "learning_rate": 2.402718599973919e-06, + "loss": 2.3053, + "step": 9835 + }, + { + "epoch": 0.5276824034334764, + "grad_norm": 1.6076172590255737, + "learning_rate": 2.402284528812974e-06, + "loss": 2.2886, + "step": 9836 + }, + { + "epoch": 0.527736051502146, + "grad_norm": 1.464673638343811, + "learning_rate": 2.4018504606023295e-06, + "loss": 2.0368, + "step": 9837 + }, + { + "epoch": 0.5277896995708155, + "grad_norm": 1.6507831811904907, + "learning_rate": 2.4014163953550896e-06, + "loss": 2.5882, + "step": 9838 + }, + { + "epoch": 0.5278433476394849, + "grad_norm": 1.5668706893920898, + "learning_rate": 2.4009823330843617e-06, + "loss": 2.4434, + "step": 9839 + }, + { + "epoch": 0.5278969957081545, + "grad_norm": 1.6623281240463257, + "learning_rate": 2.4005482738032513e-06, + "loss": 2.5304, + "step": 9840 + }, + { + "epoch": 0.527950643776824, + "grad_norm": 1.4376311302185059, + "learning_rate": 2.4001142175248635e-06, + "loss": 1.9226, + "step": 9841 + }, + { + "epoch": 0.5280042918454936, + "grad_norm": 1.5862345695495605, + "learning_rate": 2.3996801642623034e-06, + "loss": 2.1939, + "step": 9842 + }, + { + "epoch": 0.528057939914163, + "grad_norm": 1.523652195930481, + "learning_rate": 2.3992461140286768e-06, + "loss": 2.2853, + "step": 9843 + }, + { + "epoch": 0.5281115879828326, + "grad_norm": 1.6302077770233154, + "learning_rate": 2.3988120668370885e-06, + "loss": 2.3652, + "step": 9844 + }, + { + "epoch": 0.5281652360515021, + "grad_norm": 1.470528483390808, + "learning_rate": 2.3983780227006448e-06, + "loss": 2.1881, + "step": 9845 + }, + { + "epoch": 0.5282188841201717, + "grad_norm": 1.5728909969329834, + "learning_rate": 2.3979439816324483e-06, + "loss": 2.309, + "step": 9846 + }, + { + "epoch": 0.5282725321888412, + "grad_norm": 1.580057978630066, + "learning_rate": 2.397509943645606e-06, + "loss": 2.1343, + "step": 9847 + }, + { + "epoch": 0.5283261802575108, + "grad_norm": 1.7060189247131348, + "learning_rate": 2.3970759087532212e-06, + "loss": 2.464, + "step": 9848 + }, + { + "epoch": 0.5283798283261802, + "grad_norm": 1.4693644046783447, + "learning_rate": 2.3966418769684e-06, + "loss": 2.4324, + "step": 9849 + }, + { + "epoch": 0.5284334763948498, + "grad_norm": 1.705837607383728, + "learning_rate": 2.396207848304246e-06, + "loss": 2.3326, + "step": 9850 + }, + { + "epoch": 0.5284871244635193, + "grad_norm": 1.4325743913650513, + "learning_rate": 2.3957738227738634e-06, + "loss": 2.1839, + "step": 9851 + }, + { + "epoch": 0.5285407725321889, + "grad_norm": 2.8524506092071533, + "learning_rate": 2.3953398003903582e-06, + "loss": 2.1965, + "step": 9852 + }, + { + "epoch": 0.5285944206008584, + "grad_norm": 1.4626145362854004, + "learning_rate": 2.394905781166833e-06, + "loss": 2.4147, + "step": 9853 + }, + { + "epoch": 0.5286480686695278, + "grad_norm": 2.840224266052246, + "learning_rate": 2.394471765116395e-06, + "loss": 2.364, + "step": 9854 + }, + { + "epoch": 0.5287017167381974, + "grad_norm": 1.6912890672683716, + "learning_rate": 2.3940377522521444e-06, + "loss": 2.1253, + "step": 9855 + }, + { + "epoch": 0.5287553648068669, + "grad_norm": 1.4663865566253662, + "learning_rate": 2.3936037425871874e-06, + "loss": 2.0371, + "step": 9856 + }, + { + "epoch": 0.5288090128755365, + "grad_norm": 1.569517731666565, + "learning_rate": 2.3931697361346276e-06, + "loss": 2.0952, + "step": 9857 + }, + { + "epoch": 0.528862660944206, + "grad_norm": 1.4899473190307617, + "learning_rate": 2.392735732907569e-06, + "loss": 2.2891, + "step": 9858 + }, + { + "epoch": 0.5289163090128756, + "grad_norm": 1.4453237056732178, + "learning_rate": 2.3923017329191153e-06, + "loss": 2.1715, + "step": 9859 + }, + { + "epoch": 0.528969957081545, + "grad_norm": 2.0323243141174316, + "learning_rate": 2.39186773618237e-06, + "loss": 2.4097, + "step": 9860 + }, + { + "epoch": 0.5290236051502146, + "grad_norm": 1.5833925008773804, + "learning_rate": 2.391433742710437e-06, + "loss": 2.2256, + "step": 9861 + }, + { + "epoch": 0.5290772532188841, + "grad_norm": 1.4000332355499268, + "learning_rate": 2.390999752516419e-06, + "loss": 1.7792, + "step": 9862 + }, + { + "epoch": 0.5291309012875537, + "grad_norm": 1.7721426486968994, + "learning_rate": 2.390565765613421e-06, + "loss": 2.2449, + "step": 9863 + }, + { + "epoch": 0.5291845493562232, + "grad_norm": 1.652698040008545, + "learning_rate": 2.3901317820145445e-06, + "loss": 2.287, + "step": 9864 + }, + { + "epoch": 0.5292381974248928, + "grad_norm": 1.4919543266296387, + "learning_rate": 2.3896978017328933e-06, + "loss": 2.0411, + "step": 9865 + }, + { + "epoch": 0.5292918454935622, + "grad_norm": 4.63834285736084, + "learning_rate": 2.3892638247815702e-06, + "loss": 1.8647, + "step": 9866 + }, + { + "epoch": 0.5293454935622317, + "grad_norm": 1.602766990661621, + "learning_rate": 2.388829851173678e-06, + "loss": 2.0897, + "step": 9867 + }, + { + "epoch": 0.5293991416309013, + "grad_norm": 1.5670119524002075, + "learning_rate": 2.388395880922321e-06, + "loss": 2.1107, + "step": 9868 + }, + { + "epoch": 0.5294527896995708, + "grad_norm": 1.770255208015442, + "learning_rate": 2.3879619140406004e-06, + "loss": 2.3156, + "step": 9869 + }, + { + "epoch": 0.5295064377682404, + "grad_norm": 1.4064948558807373, + "learning_rate": 2.38752795054162e-06, + "loss": 2.4075, + "step": 9870 + }, + { + "epoch": 0.5295600858369098, + "grad_norm": 1.603394627571106, + "learning_rate": 2.3870939904384814e-06, + "loss": 2.0764, + "step": 9871 + }, + { + "epoch": 0.5296137339055794, + "grad_norm": 2.5762624740600586, + "learning_rate": 2.386660033744288e-06, + "loss": 2.246, + "step": 9872 + }, + { + "epoch": 0.5296673819742489, + "grad_norm": 1.528285026550293, + "learning_rate": 2.386226080472141e-06, + "loss": 2.1821, + "step": 9873 + }, + { + "epoch": 0.5297210300429185, + "grad_norm": 1.552636981010437, + "learning_rate": 2.385792130635144e-06, + "loss": 2.1383, + "step": 9874 + }, + { + "epoch": 0.529774678111588, + "grad_norm": 1.2809522151947021, + "learning_rate": 2.3853581842463976e-06, + "loss": 2.1879, + "step": 9875 + }, + { + "epoch": 0.5298283261802575, + "grad_norm": 1.669471025466919, + "learning_rate": 2.384924241319005e-06, + "loss": 2.5068, + "step": 9876 + }, + { + "epoch": 0.529881974248927, + "grad_norm": 1.8080077171325684, + "learning_rate": 2.3844903018660675e-06, + "loss": 2.4966, + "step": 9877 + }, + { + "epoch": 0.5299356223175966, + "grad_norm": 1.585368275642395, + "learning_rate": 2.3840563659006876e-06, + "loss": 2.3698, + "step": 9878 + }, + { + "epoch": 0.5299892703862661, + "grad_norm": 1.468673825263977, + "learning_rate": 2.3836224334359656e-06, + "loss": 2.1652, + "step": 9879 + }, + { + "epoch": 0.5300429184549357, + "grad_norm": 1.477782964706421, + "learning_rate": 2.383188504485005e-06, + "loss": 2.2411, + "step": 9880 + }, + { + "epoch": 0.5300965665236052, + "grad_norm": 1.560173749923706, + "learning_rate": 2.3827545790609076e-06, + "loss": 2.4442, + "step": 9881 + }, + { + "epoch": 0.5301502145922746, + "grad_norm": 1.4431943893432617, + "learning_rate": 2.3823206571767727e-06, + "loss": 2.1339, + "step": 9882 + }, + { + "epoch": 0.5302038626609442, + "grad_norm": 1.6166421175003052, + "learning_rate": 2.3818867388457022e-06, + "loss": 2.2949, + "step": 9883 + }, + { + "epoch": 0.5302575107296137, + "grad_norm": 1.6132936477661133, + "learning_rate": 2.3814528240807976e-06, + "loss": 2.2677, + "step": 9884 + }, + { + "epoch": 0.5303111587982833, + "grad_norm": 1.452797532081604, + "learning_rate": 2.3810189128951602e-06, + "loss": 2.238, + "step": 9885 + }, + { + "epoch": 0.5303648068669528, + "grad_norm": 1.6409848928451538, + "learning_rate": 2.3805850053018913e-06, + "loss": 2.2247, + "step": 9886 + }, + { + "epoch": 0.5304184549356223, + "grad_norm": 1.4423987865447998, + "learning_rate": 2.380151101314091e-06, + "loss": 2.4885, + "step": 9887 + }, + { + "epoch": 0.5304721030042918, + "grad_norm": 1.5377237796783447, + "learning_rate": 2.37971720094486e-06, + "loss": 2.2397, + "step": 9888 + }, + { + "epoch": 0.5305257510729614, + "grad_norm": 1.2562400102615356, + "learning_rate": 2.3792833042073e-06, + "loss": 1.94, + "step": 9889 + }, + { + "epoch": 0.5305793991416309, + "grad_norm": 1.5612424612045288, + "learning_rate": 2.378849411114511e-06, + "loss": 2.3536, + "step": 9890 + }, + { + "epoch": 0.5306330472103005, + "grad_norm": 1.3966624736785889, + "learning_rate": 2.378415521679593e-06, + "loss": 2.2732, + "step": 9891 + }, + { + "epoch": 0.5306866952789699, + "grad_norm": 1.330570936203003, + "learning_rate": 2.3779816359156468e-06, + "loss": 2.2063, + "step": 9892 + }, + { + "epoch": 0.5307403433476395, + "grad_norm": 1.5985651016235352, + "learning_rate": 2.377547753835772e-06, + "loss": 2.3569, + "step": 9893 + }, + { + "epoch": 0.530793991416309, + "grad_norm": 1.6127429008483887, + "learning_rate": 2.377113875453069e-06, + "loss": 2.655, + "step": 9894 + }, + { + "epoch": 0.5308476394849786, + "grad_norm": 1.3685266971588135, + "learning_rate": 2.3766800007806386e-06, + "loss": 2.1245, + "step": 9895 + }, + { + "epoch": 0.5309012875536481, + "grad_norm": 1.5515294075012207, + "learning_rate": 2.3762461298315794e-06, + "loss": 2.2948, + "step": 9896 + }, + { + "epoch": 0.5309549356223175, + "grad_norm": 1.7435694932937622, + "learning_rate": 2.3758122626189918e-06, + "loss": 2.1486, + "step": 9897 + }, + { + "epoch": 0.5310085836909871, + "grad_norm": 1.5091745853424072, + "learning_rate": 2.3753783991559767e-06, + "loss": 2.3308, + "step": 9898 + }, + { + "epoch": 0.5310622317596566, + "grad_norm": 1.698147177696228, + "learning_rate": 2.374944539455631e-06, + "loss": 2.3053, + "step": 9899 + }, + { + "epoch": 0.5311158798283262, + "grad_norm": 1.732433795928955, + "learning_rate": 2.3745106835310556e-06, + "loss": 2.0404, + "step": 9900 + }, + { + "epoch": 0.5311695278969957, + "grad_norm": 1.6016517877578735, + "learning_rate": 2.37407683139535e-06, + "loss": 2.2419, + "step": 9901 + }, + { + "epoch": 0.5312231759656653, + "grad_norm": 38.30702590942383, + "learning_rate": 2.3736429830616125e-06, + "loss": 2.4347, + "step": 9902 + }, + { + "epoch": 0.5312768240343347, + "grad_norm": 1.5343971252441406, + "learning_rate": 2.3732091385429434e-06, + "loss": 2.2056, + "step": 9903 + }, + { + "epoch": 0.5313304721030043, + "grad_norm": 1.3900381326675415, + "learning_rate": 2.372775297852441e-06, + "loss": 2.3031, + "step": 9904 + }, + { + "epoch": 0.5313841201716738, + "grad_norm": 1.6319479942321777, + "learning_rate": 2.3723414610032036e-06, + "loss": 2.3724, + "step": 9905 + }, + { + "epoch": 0.5314377682403434, + "grad_norm": 1.8185173273086548, + "learning_rate": 2.3719076280083312e-06, + "loss": 2.1229, + "step": 9906 + }, + { + "epoch": 0.5314914163090129, + "grad_norm": 1.7179539203643799, + "learning_rate": 2.371473798880922e-06, + "loss": 2.1476, + "step": 9907 + }, + { + "epoch": 0.5315450643776825, + "grad_norm": 1.3503012657165527, + "learning_rate": 2.3710399736340735e-06, + "loss": 2.2064, + "step": 9908 + }, + { + "epoch": 0.5315987124463519, + "grad_norm": 1.9153404235839844, + "learning_rate": 2.370606152280885e-06, + "loss": 2.329, + "step": 9909 + }, + { + "epoch": 0.5316523605150214, + "grad_norm": 1.7838555574417114, + "learning_rate": 2.3701723348344545e-06, + "loss": 2.1132, + "step": 9910 + }, + { + "epoch": 0.531706008583691, + "grad_norm": 1.770398497581482, + "learning_rate": 2.3697385213078806e-06, + "loss": 1.9938, + "step": 9911 + }, + { + "epoch": 0.5317596566523605, + "grad_norm": 1.9058297872543335, + "learning_rate": 2.3693047117142607e-06, + "loss": 1.9916, + "step": 9912 + }, + { + "epoch": 0.5318133047210301, + "grad_norm": 1.3665268421173096, + "learning_rate": 2.3688709060666924e-06, + "loss": 2.2777, + "step": 9913 + }, + { + "epoch": 0.5318669527896995, + "grad_norm": 1.5618607997894287, + "learning_rate": 2.3684371043782743e-06, + "loss": 2.1629, + "step": 9914 + }, + { + "epoch": 0.5319206008583691, + "grad_norm": 4.14001989364624, + "learning_rate": 2.3680033066621044e-06, + "loss": 2.3347, + "step": 9915 + }, + { + "epoch": 0.5319742489270386, + "grad_norm": 1.6934125423431396, + "learning_rate": 2.3675695129312805e-06, + "loss": 2.037, + "step": 9916 + }, + { + "epoch": 0.5320278969957082, + "grad_norm": 1.7567788362503052, + "learning_rate": 2.3671357231988976e-06, + "loss": 2.5638, + "step": 9917 + }, + { + "epoch": 0.5320815450643777, + "grad_norm": 1.603188157081604, + "learning_rate": 2.3667019374780555e-06, + "loss": 2.0939, + "step": 9918 + }, + { + "epoch": 0.5321351931330472, + "grad_norm": 1.7259434461593628, + "learning_rate": 2.3662681557818503e-06, + "loss": 2.2439, + "step": 9919 + }, + { + "epoch": 0.5321888412017167, + "grad_norm": 1.638920545578003, + "learning_rate": 2.365834378123379e-06, + "loss": 2.3912, + "step": 9920 + }, + { + "epoch": 0.5322424892703863, + "grad_norm": 1.624302625656128, + "learning_rate": 2.365400604515739e-06, + "loss": 2.2384, + "step": 9921 + }, + { + "epoch": 0.5322961373390558, + "grad_norm": 1.5538018941879272, + "learning_rate": 2.364966834972027e-06, + "loss": 2.233, + "step": 9922 + }, + { + "epoch": 0.5323497854077254, + "grad_norm": 1.541866660118103, + "learning_rate": 2.36453306950534e-06, + "loss": 2.4691, + "step": 9923 + }, + { + "epoch": 0.5324034334763948, + "grad_norm": 1.1815329790115356, + "learning_rate": 2.3640993081287735e-06, + "loss": 1.6699, + "step": 9924 + }, + { + "epoch": 0.5324570815450643, + "grad_norm": 1.6691166162490845, + "learning_rate": 2.363665550855426e-06, + "loss": 2.3043, + "step": 9925 + }, + { + "epoch": 0.5325107296137339, + "grad_norm": 1.266883373260498, + "learning_rate": 2.3632317976983915e-06, + "loss": 2.3559, + "step": 9926 + }, + { + "epoch": 0.5325643776824034, + "grad_norm": 1.4726964235305786, + "learning_rate": 2.3627980486707673e-06, + "loss": 2.405, + "step": 9927 + }, + { + "epoch": 0.532618025751073, + "grad_norm": 1.6657620668411255, + "learning_rate": 2.3623643037856496e-06, + "loss": 2.2119, + "step": 9928 + }, + { + "epoch": 0.5326716738197425, + "grad_norm": 1.3162952661514282, + "learning_rate": 2.3619305630561335e-06, + "loss": 1.8136, + "step": 9929 + }, + { + "epoch": 0.532725321888412, + "grad_norm": 1.5811350345611572, + "learning_rate": 2.3614968264953164e-06, + "loss": 2.1324, + "step": 9930 + }, + { + "epoch": 0.5327789699570815, + "grad_norm": 1.4440209865570068, + "learning_rate": 2.361063094116293e-06, + "loss": 1.8679, + "step": 9931 + }, + { + "epoch": 0.5328326180257511, + "grad_norm": 1.4991178512573242, + "learning_rate": 2.360629365932159e-06, + "loss": 2.2323, + "step": 9932 + }, + { + "epoch": 0.5328862660944206, + "grad_norm": 1.71378493309021, + "learning_rate": 2.3601956419560097e-06, + "loss": 2.3815, + "step": 9933 + }, + { + "epoch": 0.5329399141630902, + "grad_norm": 1.7565964460372925, + "learning_rate": 2.359761922200941e-06, + "loss": 2.1902, + "step": 9934 + }, + { + "epoch": 0.5329935622317596, + "grad_norm": 1.3029837608337402, + "learning_rate": 2.3593282066800472e-06, + "loss": 2.4122, + "step": 9935 + }, + { + "epoch": 0.5330472103004292, + "grad_norm": 1.8883960247039795, + "learning_rate": 2.3588944954064242e-06, + "loss": 2.151, + "step": 9936 + }, + { + "epoch": 0.5331008583690987, + "grad_norm": 1.4607362747192383, + "learning_rate": 2.358460788393166e-06, + "loss": 1.6393, + "step": 9937 + }, + { + "epoch": 0.5331545064377683, + "grad_norm": 1.8744761943817139, + "learning_rate": 2.3580270856533686e-06, + "loss": 2.2858, + "step": 9938 + }, + { + "epoch": 0.5332081545064378, + "grad_norm": 1.750215768814087, + "learning_rate": 2.3575933872001254e-06, + "loss": 2.5287, + "step": 9939 + }, + { + "epoch": 0.5332618025751072, + "grad_norm": 1.4855417013168335, + "learning_rate": 2.3571596930465325e-06, + "loss": 2.4561, + "step": 9940 + }, + { + "epoch": 0.5333154506437768, + "grad_norm": 1.3889539241790771, + "learning_rate": 2.3567260032056823e-06, + "loss": 2.3971, + "step": 9941 + }, + { + "epoch": 0.5333690987124463, + "grad_norm": 1.537111520767212, + "learning_rate": 2.356292317690671e-06, + "loss": 2.229, + "step": 9942 + }, + { + "epoch": 0.5334227467811159, + "grad_norm": 1.608125925064087, + "learning_rate": 2.3558586365145933e-06, + "loss": 2.2572, + "step": 9943 + }, + { + "epoch": 0.5334763948497854, + "grad_norm": 1.6394675970077515, + "learning_rate": 2.3554249596905406e-06, + "loss": 2.262, + "step": 9944 + }, + { + "epoch": 0.533530042918455, + "grad_norm": 1.6826980113983154, + "learning_rate": 2.354991287231608e-06, + "loss": 2.3067, + "step": 9945 + }, + { + "epoch": 0.5335836909871244, + "grad_norm": 2.138603687286377, + "learning_rate": 2.3545576191508896e-06, + "loss": 2.3284, + "step": 9946 + }, + { + "epoch": 0.533637339055794, + "grad_norm": 1.6956843137741089, + "learning_rate": 2.354123955461479e-06, + "loss": 2.1129, + "step": 9947 + }, + { + "epoch": 0.5336909871244635, + "grad_norm": 1.3171837329864502, + "learning_rate": 2.353690296176469e-06, + "loss": 2.3097, + "step": 9948 + }, + { + "epoch": 0.5337446351931331, + "grad_norm": 1.4657152891159058, + "learning_rate": 2.3532566413089546e-06, + "loss": 2.1207, + "step": 9949 + }, + { + "epoch": 0.5337982832618026, + "grad_norm": 1.5334924459457397, + "learning_rate": 2.3528229908720275e-06, + "loss": 2.243, + "step": 9950 + }, + { + "epoch": 0.5338519313304722, + "grad_norm": 1.6140252351760864, + "learning_rate": 2.352389344878782e-06, + "loss": 2.2244, + "step": 9951 + }, + { + "epoch": 0.5339055793991416, + "grad_norm": 1.5079131126403809, + "learning_rate": 2.3519557033423098e-06, + "loss": 2.156, + "step": 9952 + }, + { + "epoch": 0.5339592274678111, + "grad_norm": 1.5228936672210693, + "learning_rate": 2.351522066275704e-06, + "loss": 2.203, + "step": 9953 + }, + { + "epoch": 0.5340128755364807, + "grad_norm": 1.8686819076538086, + "learning_rate": 2.3510884336920578e-06, + "loss": 2.1642, + "step": 9954 + }, + { + "epoch": 0.5340665236051502, + "grad_norm": 1.5363759994506836, + "learning_rate": 2.3506548056044635e-06, + "loss": 2.257, + "step": 9955 + }, + { + "epoch": 0.5341201716738198, + "grad_norm": 2.110614538192749, + "learning_rate": 2.350221182026014e-06, + "loss": 2.1741, + "step": 9956 + }, + { + "epoch": 0.5341738197424892, + "grad_norm": 1.4755345582962036, + "learning_rate": 2.349787562969801e-06, + "loss": 2.277, + "step": 9957 + }, + { + "epoch": 0.5342274678111588, + "grad_norm": 1.3690636157989502, + "learning_rate": 2.349353948448916e-06, + "loss": 2.1633, + "step": 9958 + }, + { + "epoch": 0.5342811158798283, + "grad_norm": 1.3737905025482178, + "learning_rate": 2.3489203384764526e-06, + "loss": 2.3156, + "step": 9959 + }, + { + "epoch": 0.5343347639484979, + "grad_norm": 2.1844749450683594, + "learning_rate": 2.3484867330655033e-06, + "loss": 2.3682, + "step": 9960 + }, + { + "epoch": 0.5343884120171674, + "grad_norm": 2.424933671951294, + "learning_rate": 2.3480531322291574e-06, + "loss": 2.1579, + "step": 9961 + }, + { + "epoch": 0.534442060085837, + "grad_norm": 1.7702921628952026, + "learning_rate": 2.347619535980507e-06, + "loss": 2.4495, + "step": 9962 + }, + { + "epoch": 0.5344957081545064, + "grad_norm": 1.6546452045440674, + "learning_rate": 2.3471859443326447e-06, + "loss": 2.2382, + "step": 9963 + }, + { + "epoch": 0.534549356223176, + "grad_norm": 1.5194532871246338, + "learning_rate": 2.3467523572986613e-06, + "loss": 2.242, + "step": 9964 + }, + { + "epoch": 0.5346030042918455, + "grad_norm": 1.6928621530532837, + "learning_rate": 2.346318774891648e-06, + "loss": 2.1817, + "step": 9965 + }, + { + "epoch": 0.5346566523605151, + "grad_norm": 1.2962805032730103, + "learning_rate": 2.345885197124696e-06, + "loss": 2.3168, + "step": 9966 + }, + { + "epoch": 0.5347103004291845, + "grad_norm": 1.420584797859192, + "learning_rate": 2.345451624010896e-06, + "loss": 2.3188, + "step": 9967 + }, + { + "epoch": 0.534763948497854, + "grad_norm": 1.420862078666687, + "learning_rate": 2.345018055563339e-06, + "loss": 2.249, + "step": 9968 + }, + { + "epoch": 0.5348175965665236, + "grad_norm": 1.4125969409942627, + "learning_rate": 2.344584491795116e-06, + "loss": 2.0716, + "step": 9969 + }, + { + "epoch": 0.5348712446351931, + "grad_norm": 1.5518994331359863, + "learning_rate": 2.3441509327193163e-06, + "loss": 2.337, + "step": 9970 + }, + { + "epoch": 0.5349248927038627, + "grad_norm": 1.4646022319793701, + "learning_rate": 2.3437173783490308e-06, + "loss": 2.0725, + "step": 9971 + }, + { + "epoch": 0.5349785407725322, + "grad_norm": 2.147535800933838, + "learning_rate": 2.34328382869735e-06, + "loss": 2.3055, + "step": 9972 + }, + { + "epoch": 0.5350321888412017, + "grad_norm": 1.497818946838379, + "learning_rate": 2.342850283777364e-06, + "loss": 2.2983, + "step": 9973 + }, + { + "epoch": 0.5350858369098712, + "grad_norm": 2.3196511268615723, + "learning_rate": 2.3424167436021613e-06, + "loss": 2.3762, + "step": 9974 + }, + { + "epoch": 0.5351394849785408, + "grad_norm": 2.0881574153900146, + "learning_rate": 2.341983208184834e-06, + "loss": 2.1381, + "step": 9975 + }, + { + "epoch": 0.5351931330472103, + "grad_norm": 1.5661368370056152, + "learning_rate": 2.3415496775384703e-06, + "loss": 2.3655, + "step": 9976 + }, + { + "epoch": 0.5352467811158799, + "grad_norm": 1.5450718402862549, + "learning_rate": 2.3411161516761603e-06, + "loss": 2.3405, + "step": 9977 + }, + { + "epoch": 0.5353004291845493, + "grad_norm": 1.7335892915725708, + "learning_rate": 2.3406826306109943e-06, + "loss": 2.437, + "step": 9978 + }, + { + "epoch": 0.5353540772532189, + "grad_norm": 1.7983849048614502, + "learning_rate": 2.340249114356058e-06, + "loss": 2.3524, + "step": 9979 + }, + { + "epoch": 0.5354077253218884, + "grad_norm": 1.7174283266067505, + "learning_rate": 2.339815602924444e-06, + "loss": 2.3309, + "step": 9980 + }, + { + "epoch": 0.535461373390558, + "grad_norm": 1.4748578071594238, + "learning_rate": 2.3393820963292394e-06, + "loss": 2.1894, + "step": 9981 + }, + { + "epoch": 0.5355150214592275, + "grad_norm": 1.5007877349853516, + "learning_rate": 2.3389485945835338e-06, + "loss": 2.2573, + "step": 9982 + }, + { + "epoch": 0.535568669527897, + "grad_norm": 1.6987147331237793, + "learning_rate": 2.338515097700415e-06, + "loss": 2.3567, + "step": 9983 + }, + { + "epoch": 0.5356223175965665, + "grad_norm": 1.809004306793213, + "learning_rate": 2.3380816056929722e-06, + "loss": 2.2612, + "step": 9984 + }, + { + "epoch": 0.535675965665236, + "grad_norm": 1.5915794372558594, + "learning_rate": 2.3376481185742936e-06, + "loss": 1.7296, + "step": 9985 + }, + { + "epoch": 0.5357296137339056, + "grad_norm": 1.6517136096954346, + "learning_rate": 2.337214636357467e-06, + "loss": 2.2542, + "step": 9986 + }, + { + "epoch": 0.5357832618025751, + "grad_norm": 1.6078455448150635, + "learning_rate": 2.3367811590555816e-06, + "loss": 1.9567, + "step": 9987 + }, + { + "epoch": 0.5358369098712447, + "grad_norm": 1.5999228954315186, + "learning_rate": 2.336347686681724e-06, + "loss": 2.3231, + "step": 9988 + }, + { + "epoch": 0.5358905579399141, + "grad_norm": 1.88495934009552, + "learning_rate": 2.335914219248982e-06, + "loss": 2.2677, + "step": 9989 + }, + { + "epoch": 0.5359442060085837, + "grad_norm": 1.7952759265899658, + "learning_rate": 2.335480756770443e-06, + "loss": 2.1805, + "step": 9990 + }, + { + "epoch": 0.5359978540772532, + "grad_norm": 3.4221155643463135, + "learning_rate": 2.335047299259195e-06, + "loss": 2.3242, + "step": 9991 + }, + { + "epoch": 0.5360515021459228, + "grad_norm": 1.6455450057983398, + "learning_rate": 2.334613846728325e-06, + "loss": 1.8124, + "step": 9992 + }, + { + "epoch": 0.5361051502145923, + "grad_norm": 1.6366456747055054, + "learning_rate": 2.3341803991909214e-06, + "loss": 2.344, + "step": 9993 + }, + { + "epoch": 0.5361587982832619, + "grad_norm": 1.6642056703567505, + "learning_rate": 2.3337469566600693e-06, + "loss": 2.3396, + "step": 9994 + }, + { + "epoch": 0.5362124463519313, + "grad_norm": 1.6702319383621216, + "learning_rate": 2.3333135191488565e-06, + "loss": 2.1777, + "step": 9995 + }, + { + "epoch": 0.5362660944206008, + "grad_norm": 1.77571702003479, + "learning_rate": 2.3328800866703704e-06, + "loss": 2.5534, + "step": 9996 + }, + { + "epoch": 0.5363197424892704, + "grad_norm": 1.5636972188949585, + "learning_rate": 2.3324466592376955e-06, + "loss": 2.2347, + "step": 9997 + }, + { + "epoch": 0.5363733905579399, + "grad_norm": 1.5426229238510132, + "learning_rate": 2.33201323686392e-06, + "loss": 2.4899, + "step": 9998 + }, + { + "epoch": 0.5364270386266095, + "grad_norm": 1.3353322744369507, + "learning_rate": 2.3315798195621288e-06, + "loss": 2.1458, + "step": 9999 + }, + { + "epoch": 0.5364806866952789, + "grad_norm": 1.6489787101745605, + "learning_rate": 2.331146407345409e-06, + "loss": 2.3675, + "step": 10000 + } + ], + "logging_steps": 1, + "max_steps": 18640, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.20694989914112e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}