diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,39731 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.0, + "eval_steps": 500, + "global_step": 5604, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 1443.5744044969551, + "learning_rate": 5.91715976331361e-08, + "loss": 12.2344, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 1534.8176448250863, + "learning_rate": 1.183431952662722e-07, + "loss": 11.7344, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 1326.3321019398716, + "learning_rate": 1.775147928994083e-07, + "loss": 11.6875, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 1288.3465989587996, + "learning_rate": 2.366863905325444e-07, + "loss": 11.0938, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 1341.804033668729, + "learning_rate": 2.958579881656805e-07, + "loss": 12.3125, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 1329.3145585197879, + "learning_rate": 3.550295857988166e-07, + "loss": 11.0938, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 1130.227729481188, + "learning_rate": 4.1420118343195276e-07, + "loss": 11.375, + "step": 7 + }, + { + "epoch": 0.01, + "grad_norm": 1160.6261885441586, + "learning_rate": 4.733727810650888e-07, + "loss": 9.8906, + "step": 8 + }, + { + "epoch": 0.01, + "grad_norm": 1039.7040454148366, + "learning_rate": 5.32544378698225e-07, + "loss": 9.8984, + "step": 9 + }, + { + "epoch": 0.01, + "grad_norm": 835.4908181410407, + "learning_rate": 5.91715976331361e-07, + "loss": 8.6406, + "step": 10 + }, + { + "epoch": 0.01, + "grad_norm": 687.6534892135122, + "learning_rate": 6.50887573964497e-07, + "loss": 7.5391, + "step": 11 + }, + { + "epoch": 0.01, + "grad_norm": 354.312170952273, + "learning_rate": 7.100591715976332e-07, + "loss": 7.4141, + "step": 12 + }, + { + "epoch": 0.01, + "grad_norm": 365.8199705384397, + "learning_rate": 7.692307692307694e-07, + "loss": 7.0469, + "step": 13 + }, + { + "epoch": 0.01, + "grad_norm": 1216.9260067540185, + "learning_rate": 8.284023668639055e-07, + "loss": 8.3516, + "step": 14 + }, + { + "epoch": 0.01, + "grad_norm": 1474.4218345737838, + "learning_rate": 8.875739644970415e-07, + "loss": 8.1016, + "step": 15 + }, + { + "epoch": 0.01, + "grad_norm": 1393.9506087028988, + "learning_rate": 9.467455621301776e-07, + "loss": 8.0, + "step": 16 + }, + { + "epoch": 0.01, + "grad_norm": 1475.954883625004, + "learning_rate": 1.0059171597633138e-06, + "loss": 8.7891, + "step": 17 + }, + { + "epoch": 0.01, + "grad_norm": 1219.677012756364, + "learning_rate": 1.06508875739645e-06, + "loss": 6.4844, + "step": 18 + }, + { + "epoch": 0.01, + "grad_norm": 938.9090098183609, + "learning_rate": 1.1242603550295859e-06, + "loss": 4.9531, + "step": 19 + }, + { + "epoch": 0.01, + "grad_norm": 732.6809380140464, + "learning_rate": 1.183431952662722e-06, + "loss": 3.7578, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 312.90545925368025, + "learning_rate": 1.242603550295858e-06, + "loss": 2.3359, + "step": 21 + }, + { + "epoch": 0.02, + "grad_norm": 131.42857798633014, + "learning_rate": 1.301775147928994e-06, + "loss": 2.5195, + "step": 22 + }, + { + "epoch": 0.02, + "grad_norm": 204.75611392809807, + "learning_rate": 1.3609467455621303e-06, + "loss": 2.1074, + "step": 23 + }, + { + "epoch": 0.02, + "grad_norm": 281.53999585238597, + "learning_rate": 1.4201183431952664e-06, + "loss": 2.3008, + "step": 24 + }, + { + "epoch": 0.02, + "grad_norm": 407.1340542086892, + "learning_rate": 1.4792899408284026e-06, + "loss": 2.3633, + "step": 25 + }, + { + "epoch": 0.02, + "grad_norm": 513.0873413511963, + "learning_rate": 1.5384615384615387e-06, + "loss": 2.457, + "step": 26 + }, + { + "epoch": 0.02, + "grad_norm": 461.55622845849354, + "learning_rate": 1.5976331360946749e-06, + "loss": 2.1602, + "step": 27 + }, + { + "epoch": 0.02, + "grad_norm": 478.542794042002, + "learning_rate": 1.656804733727811e-06, + "loss": 2.1387, + "step": 28 + }, + { + "epoch": 0.02, + "grad_norm": 344.6282465781462, + "learning_rate": 1.7159763313609468e-06, + "loss": 1.7637, + "step": 29 + }, + { + "epoch": 0.02, + "grad_norm": 362.57393430326465, + "learning_rate": 1.775147928994083e-06, + "loss": 1.6602, + "step": 30 + }, + { + "epoch": 0.02, + "grad_norm": 168.67532989078214, + "learning_rate": 1.834319526627219e-06, + "loss": 1.3105, + "step": 31 + }, + { + "epoch": 0.02, + "grad_norm": 78.2335154608893, + "learning_rate": 1.8934911242603552e-06, + "loss": 1.0537, + "step": 32 + }, + { + "epoch": 0.02, + "grad_norm": 74.86312041211706, + "learning_rate": 1.952662721893491e-06, + "loss": 0.9717, + "step": 33 + }, + { + "epoch": 0.02, + "grad_norm": 173.72567437330426, + "learning_rate": 2.0118343195266275e-06, + "loss": 1.0166, + "step": 34 + }, + { + "epoch": 0.02, + "grad_norm": 244.678248816213, + "learning_rate": 2.0710059171597635e-06, + "loss": 1.1279, + "step": 35 + }, + { + "epoch": 0.03, + "grad_norm": 289.70307970146496, + "learning_rate": 2.1301775147929e-06, + "loss": 1.1523, + "step": 36 + }, + { + "epoch": 0.03, + "grad_norm": 261.404070412537, + "learning_rate": 2.1893491124260358e-06, + "loss": 1.0615, + "step": 37 + }, + { + "epoch": 0.03, + "grad_norm": 248.72543936315603, + "learning_rate": 2.2485207100591717e-06, + "loss": 0.96, + "step": 38 + }, + { + "epoch": 0.03, + "grad_norm": 175.19387912539813, + "learning_rate": 2.307692307692308e-06, + "loss": 0.8223, + "step": 39 + }, + { + "epoch": 0.03, + "grad_norm": 129.57230819430404, + "learning_rate": 2.366863905325444e-06, + "loss": 0.7617, + "step": 40 + }, + { + "epoch": 0.03, + "grad_norm": 64.75648312193796, + "learning_rate": 2.42603550295858e-06, + "loss": 0.6279, + "step": 41 + }, + { + "epoch": 0.03, + "grad_norm": 40.345496542665984, + "learning_rate": 2.485207100591716e-06, + "loss": 0.6426, + "step": 42 + }, + { + "epoch": 0.03, + "grad_norm": 112.96546034948359, + "learning_rate": 2.5443786982248527e-06, + "loss": 0.6133, + "step": 43 + }, + { + "epoch": 0.03, + "grad_norm": 113.11724928693071, + "learning_rate": 2.603550295857988e-06, + "loss": 0.6172, + "step": 44 + }, + { + "epoch": 0.03, + "grad_norm": 170.2583256015434, + "learning_rate": 2.6627218934911246e-06, + "loss": 0.6826, + "step": 45 + }, + { + "epoch": 0.03, + "grad_norm": 181.13824595705816, + "learning_rate": 2.7218934911242605e-06, + "loss": 0.6807, + "step": 46 + }, + { + "epoch": 0.03, + "grad_norm": 127.83994169123808, + "learning_rate": 2.7810650887573965e-06, + "loss": 0.5532, + "step": 47 + }, + { + "epoch": 0.03, + "grad_norm": 160.92696853722236, + "learning_rate": 2.840236686390533e-06, + "loss": 0.6377, + "step": 48 + }, + { + "epoch": 0.03, + "grad_norm": 83.85759914507452, + "learning_rate": 2.8994082840236688e-06, + "loss": 0.5171, + "step": 49 + }, + { + "epoch": 0.04, + "grad_norm": 34.82212793694129, + "learning_rate": 2.958579881656805e-06, + "loss": 0.4097, + "step": 50 + }, + { + "epoch": 0.04, + "grad_norm": 25.956713751406728, + "learning_rate": 3.017751479289941e-06, + "loss": 0.4434, + "step": 51 + }, + { + "epoch": 0.04, + "grad_norm": 78.75487723337257, + "learning_rate": 3.0769230769230774e-06, + "loss": 0.394, + "step": 52 + }, + { + "epoch": 0.04, + "grad_norm": 154.51220671292447, + "learning_rate": 3.1360946745562134e-06, + "loss": 0.5396, + "step": 53 + }, + { + "epoch": 0.04, + "grad_norm": 167.45386551318256, + "learning_rate": 3.1952662721893497e-06, + "loss": 0.5225, + "step": 54 + }, + { + "epoch": 0.04, + "grad_norm": 153.7596657181405, + "learning_rate": 3.2544378698224853e-06, + "loss": 0.4897, + "step": 55 + }, + { + "epoch": 0.04, + "grad_norm": 104.90584941846187, + "learning_rate": 3.313609467455622e-06, + "loss": 0.4272, + "step": 56 + }, + { + "epoch": 0.04, + "grad_norm": 80.49574128984705, + "learning_rate": 3.3727810650887576e-06, + "loss": 0.395, + "step": 57 + }, + { + "epoch": 0.04, + "grad_norm": 44.08211560352312, + "learning_rate": 3.4319526627218935e-06, + "loss": 0.3936, + "step": 58 + }, + { + "epoch": 0.04, + "grad_norm": 74.39313269747478, + "learning_rate": 3.49112426035503e-06, + "loss": 0.4253, + "step": 59 + }, + { + "epoch": 0.04, + "grad_norm": 69.8169714038155, + "learning_rate": 3.550295857988166e-06, + "loss": 0.3569, + "step": 60 + }, + { + "epoch": 0.04, + "grad_norm": 89.54284154509011, + "learning_rate": 3.609467455621302e-06, + "loss": 0.3887, + "step": 61 + }, + { + "epoch": 0.04, + "grad_norm": 103.65697090523831, + "learning_rate": 3.668639053254438e-06, + "loss": 0.3853, + "step": 62 + }, + { + "epoch": 0.04, + "grad_norm": 77.98627320346968, + "learning_rate": 3.7278106508875745e-06, + "loss": 0.3628, + "step": 63 + }, + { + "epoch": 0.05, + "grad_norm": 42.66416290101905, + "learning_rate": 3.7869822485207104e-06, + "loss": 0.3237, + "step": 64 + }, + { + "epoch": 0.05, + "grad_norm": 33.27388124719927, + "learning_rate": 3.846153846153847e-06, + "loss": 0.3032, + "step": 65 + }, + { + "epoch": 0.05, + "grad_norm": 36.86492361076572, + "learning_rate": 3.905325443786982e-06, + "loss": 0.3418, + "step": 66 + }, + { + "epoch": 0.05, + "grad_norm": 66.58133496209655, + "learning_rate": 3.964497041420119e-06, + "loss": 0.3433, + "step": 67 + }, + { + "epoch": 0.05, + "grad_norm": 34.40834590692991, + "learning_rate": 4.023668639053255e-06, + "loss": 0.3169, + "step": 68 + }, + { + "epoch": 0.05, + "grad_norm": 89.33579913199095, + "learning_rate": 4.0828402366863906e-06, + "loss": 0.3481, + "step": 69 + }, + { + "epoch": 0.05, + "grad_norm": 75.01768722175284, + "learning_rate": 4.142011834319527e-06, + "loss": 0.3252, + "step": 70 + }, + { + "epoch": 0.05, + "grad_norm": 16.173648614704415, + "learning_rate": 4.201183431952663e-06, + "loss": 0.2759, + "step": 71 + }, + { + "epoch": 0.05, + "grad_norm": 18.990275484108082, + "learning_rate": 4.2603550295858e-06, + "loss": 0.2705, + "step": 72 + }, + { + "epoch": 0.05, + "grad_norm": 38.76375650821992, + "learning_rate": 4.319526627218935e-06, + "loss": 0.2842, + "step": 73 + }, + { + "epoch": 0.05, + "grad_norm": 56.67502763908607, + "learning_rate": 4.3786982248520715e-06, + "loss": 0.3154, + "step": 74 + }, + { + "epoch": 0.05, + "grad_norm": 66.38202402483932, + "learning_rate": 4.437869822485207e-06, + "loss": 0.334, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 26.659837428204344, + "learning_rate": 4.497041420118343e-06, + "loss": 0.2437, + "step": 76 + }, + { + "epoch": 0.05, + "grad_norm": 49.89366358764825, + "learning_rate": 4.55621301775148e-06, + "loss": 0.2893, + "step": 77 + }, + { + "epoch": 0.06, + "grad_norm": 22.724969702099184, + "learning_rate": 4.615384615384616e-06, + "loss": 0.2725, + "step": 78 + }, + { + "epoch": 0.06, + "grad_norm": 28.80561993848226, + "learning_rate": 4.674556213017752e-06, + "loss": 0.3044, + "step": 79 + }, + { + "epoch": 0.06, + "grad_norm": 29.286530669940298, + "learning_rate": 4.733727810650888e-06, + "loss": 0.2842, + "step": 80 + }, + { + "epoch": 0.06, + "grad_norm": 14.16660585102508, + "learning_rate": 4.792899408284024e-06, + "loss": 0.2437, + "step": 81 + }, + { + "epoch": 0.06, + "grad_norm": 73.9926353523121, + "learning_rate": 4.85207100591716e-06, + "loss": 0.2827, + "step": 82 + }, + { + "epoch": 0.06, + "grad_norm": 29.878951527569956, + "learning_rate": 4.911242603550296e-06, + "loss": 0.2458, + "step": 83 + }, + { + "epoch": 0.06, + "grad_norm": 11.484475177918583, + "learning_rate": 4.970414201183432e-06, + "loss": 0.2476, + "step": 84 + }, + { + "epoch": 0.06, + "grad_norm": 55.59592780267639, + "learning_rate": 5.029585798816569e-06, + "loss": 0.2793, + "step": 85 + }, + { + "epoch": 0.06, + "grad_norm": 50.21555063218247, + "learning_rate": 5.088757396449705e-06, + "loss": 0.2581, + "step": 86 + }, + { + "epoch": 0.06, + "grad_norm": 13.528834407068818, + "learning_rate": 5.14792899408284e-06, + "loss": 0.239, + "step": 87 + }, + { + "epoch": 0.06, + "grad_norm": 13.041769271462492, + "learning_rate": 5.207100591715976e-06, + "loss": 0.2402, + "step": 88 + }, + { + "epoch": 0.06, + "grad_norm": 21.52430533549206, + "learning_rate": 5.266272189349113e-06, + "loss": 0.2417, + "step": 89 + }, + { + "epoch": 0.06, + "grad_norm": 13.677037672727517, + "learning_rate": 5.325443786982249e-06, + "loss": 0.2317, + "step": 90 + }, + { + "epoch": 0.06, + "grad_norm": 38.59157227506834, + "learning_rate": 5.384615384615385e-06, + "loss": 0.2336, + "step": 91 + }, + { + "epoch": 0.07, + "grad_norm": 50.86665965832706, + "learning_rate": 5.443786982248521e-06, + "loss": 0.2283, + "step": 92 + }, + { + "epoch": 0.07, + "grad_norm": 13.565157782160531, + "learning_rate": 5.502958579881657e-06, + "loss": 0.229, + "step": 93 + }, + { + "epoch": 0.07, + "grad_norm": 9.612894516114126, + "learning_rate": 5.562130177514793e-06, + "loss": 0.2034, + "step": 94 + }, + { + "epoch": 0.07, + "grad_norm": 41.99675072884753, + "learning_rate": 5.621301775147929e-06, + "loss": 0.218, + "step": 95 + }, + { + "epoch": 0.07, + "grad_norm": 21.606516634808887, + "learning_rate": 5.680473372781066e-06, + "loss": 0.238, + "step": 96 + }, + { + "epoch": 0.07, + "grad_norm": 38.612734423443584, + "learning_rate": 5.739644970414202e-06, + "loss": 0.2727, + "step": 97 + }, + { + "epoch": 0.07, + "grad_norm": 10.14173463955849, + "learning_rate": 5.7988165680473375e-06, + "loss": 0.2144, + "step": 98 + }, + { + "epoch": 0.07, + "grad_norm": 11.726835683501431, + "learning_rate": 5.857988165680474e-06, + "loss": 0.2217, + "step": 99 + }, + { + "epoch": 0.07, + "grad_norm": 33.271941153238885, + "learning_rate": 5.91715976331361e-06, + "loss": 0.2334, + "step": 100 + }, + { + "epoch": 0.07, + "grad_norm": 28.312522667414854, + "learning_rate": 5.976331360946747e-06, + "loss": 0.229, + "step": 101 + }, + { + "epoch": 0.07, + "grad_norm": 39.47150040026952, + "learning_rate": 6.035502958579882e-06, + "loss": 0.2241, + "step": 102 + }, + { + "epoch": 0.07, + "grad_norm": 10.387184326679442, + "learning_rate": 6.0946745562130185e-06, + "loss": 0.1914, + "step": 103 + }, + { + "epoch": 0.07, + "grad_norm": 9.393251230339942, + "learning_rate": 6.153846153846155e-06, + "loss": 0.1992, + "step": 104 + }, + { + "epoch": 0.07, + "grad_norm": 43.36911762706155, + "learning_rate": 6.21301775147929e-06, + "loss": 0.2244, + "step": 105 + }, + { + "epoch": 0.08, + "grad_norm": 54.08032944347241, + "learning_rate": 6.272189349112427e-06, + "loss": 0.2598, + "step": 106 + }, + { + "epoch": 0.08, + "grad_norm": 34.12457211661539, + "learning_rate": 6.331360946745563e-06, + "loss": 0.2126, + "step": 107 + }, + { + "epoch": 0.08, + "grad_norm": 10.87841011317557, + "learning_rate": 6.3905325443786995e-06, + "loss": 0.1987, + "step": 108 + }, + { + "epoch": 0.08, + "grad_norm": 45.90938123438191, + "learning_rate": 6.449704142011834e-06, + "loss": 0.2549, + "step": 109 + }, + { + "epoch": 0.08, + "grad_norm": 43.25371651313902, + "learning_rate": 6.5088757396449705e-06, + "loss": 0.239, + "step": 110 + }, + { + "epoch": 0.08, + "grad_norm": 37.74000118926859, + "learning_rate": 6.568047337278107e-06, + "loss": 0.2085, + "step": 111 + }, + { + "epoch": 0.08, + "grad_norm": 8.833634104200893, + "learning_rate": 6.627218934911244e-06, + "loss": 0.1685, + "step": 112 + }, + { + "epoch": 0.08, + "grad_norm": 36.89708866713719, + "learning_rate": 6.686390532544379e-06, + "loss": 0.2686, + "step": 113 + }, + { + "epoch": 0.08, + "grad_norm": 50.89388673898367, + "learning_rate": 6.745562130177515e-06, + "loss": 0.2107, + "step": 114 + }, + { + "epoch": 0.08, + "grad_norm": 44.94360423791429, + "learning_rate": 6.8047337278106515e-06, + "loss": 0.2383, + "step": 115 + }, + { + "epoch": 0.08, + "grad_norm": 12.889886449107443, + "learning_rate": 6.863905325443787e-06, + "loss": 0.2217, + "step": 116 + }, + { + "epoch": 0.08, + "grad_norm": 51.161996812885185, + "learning_rate": 6.923076923076923e-06, + "loss": 0.1895, + "step": 117 + }, + { + "epoch": 0.08, + "grad_norm": 48.48659614955822, + "learning_rate": 6.98224852071006e-06, + "loss": 0.2227, + "step": 118 + }, + { + "epoch": 0.08, + "grad_norm": 11.885655124875624, + "learning_rate": 7.041420118343196e-06, + "loss": 0.1958, + "step": 119 + }, + { + "epoch": 0.09, + "grad_norm": 27.132063430841765, + "learning_rate": 7.100591715976332e-06, + "loss": 0.2239, + "step": 120 + }, + { + "epoch": 0.09, + "grad_norm": 18.86057434855715, + "learning_rate": 7.159763313609468e-06, + "loss": 0.1902, + "step": 121 + }, + { + "epoch": 0.09, + "grad_norm": 12.637608877658806, + "learning_rate": 7.218934911242604e-06, + "loss": 0.2109, + "step": 122 + }, + { + "epoch": 0.09, + "grad_norm": 10.047033973462321, + "learning_rate": 7.278106508875741e-06, + "loss": 0.1904, + "step": 123 + }, + { + "epoch": 0.09, + "grad_norm": 11.709619680883495, + "learning_rate": 7.337278106508876e-06, + "loss": 0.1729, + "step": 124 + }, + { + "epoch": 0.09, + "grad_norm": 44.025278094830405, + "learning_rate": 7.396449704142013e-06, + "loss": 0.2012, + "step": 125 + }, + { + "epoch": 0.09, + "grad_norm": 5.579611221108369, + "learning_rate": 7.455621301775149e-06, + "loss": 0.1807, + "step": 126 + }, + { + "epoch": 0.09, + "grad_norm": 9.178080204391332, + "learning_rate": 7.5147928994082845e-06, + "loss": 0.1792, + "step": 127 + }, + { + "epoch": 0.09, + "grad_norm": 10.70187830067804, + "learning_rate": 7.573964497041421e-06, + "loss": 0.1648, + "step": 128 + }, + { + "epoch": 0.09, + "grad_norm": 41.67469230519406, + "learning_rate": 7.633136094674556e-06, + "loss": 0.189, + "step": 129 + }, + { + "epoch": 0.09, + "grad_norm": 56.145533909909716, + "learning_rate": 7.692307692307694e-06, + "loss": 0.1772, + "step": 130 + }, + { + "epoch": 0.09, + "grad_norm": 12.886508074747201, + "learning_rate": 7.751479289940829e-06, + "loss": 0.2095, + "step": 131 + }, + { + "epoch": 0.09, + "grad_norm": 92.16611785910685, + "learning_rate": 7.810650887573965e-06, + "loss": 0.2383, + "step": 132 + }, + { + "epoch": 0.09, + "grad_norm": 40.64519507288005, + "learning_rate": 7.869822485207102e-06, + "loss": 0.2139, + "step": 133 + }, + { + "epoch": 0.1, + "grad_norm": 14.527350282989591, + "learning_rate": 7.928994082840237e-06, + "loss": 0.2041, + "step": 134 + }, + { + "epoch": 0.1, + "grad_norm": 82.46586621777837, + "learning_rate": 7.988165680473373e-06, + "loss": 0.2083, + "step": 135 + }, + { + "epoch": 0.1, + "grad_norm": 60.35861295571972, + "learning_rate": 8.04733727810651e-06, + "loss": 0.2388, + "step": 136 + }, + { + "epoch": 0.1, + "grad_norm": 11.976620441569606, + "learning_rate": 8.106508875739646e-06, + "loss": 0.1814, + "step": 137 + }, + { + "epoch": 0.1, + "grad_norm": 76.28662037424316, + "learning_rate": 8.165680473372781e-06, + "loss": 0.218, + "step": 138 + }, + { + "epoch": 0.1, + "grad_norm": 49.990057565286385, + "learning_rate": 8.224852071005918e-06, + "loss": 0.2263, + "step": 139 + }, + { + "epoch": 0.1, + "grad_norm": 9.12483970210935, + "learning_rate": 8.284023668639054e-06, + "loss": 0.1628, + "step": 140 + }, + { + "epoch": 0.1, + "grad_norm": 34.40581343675325, + "learning_rate": 8.343195266272191e-06, + "loss": 0.2046, + "step": 141 + }, + { + "epoch": 0.1, + "grad_norm": 86.73910967064404, + "learning_rate": 8.402366863905327e-06, + "loss": 0.2832, + "step": 142 + }, + { + "epoch": 0.1, + "grad_norm": 44.998603587663474, + "learning_rate": 8.461538461538462e-06, + "loss": 0.2, + "step": 143 + }, + { + "epoch": 0.1, + "grad_norm": 60.632795608186285, + "learning_rate": 8.5207100591716e-06, + "loss": 0.163, + "step": 144 + }, + { + "epoch": 0.1, + "grad_norm": 139.66966654303818, + "learning_rate": 8.579881656804735e-06, + "loss": 0.356, + "step": 145 + }, + { + "epoch": 0.1, + "grad_norm": 21.720375790615257, + "learning_rate": 8.63905325443787e-06, + "loss": 0.1716, + "step": 146 + }, + { + "epoch": 0.1, + "grad_norm": 93.50974348316393, + "learning_rate": 8.698224852071006e-06, + "loss": 0.2695, + "step": 147 + }, + { + "epoch": 0.11, + "grad_norm": 77.34688239840453, + "learning_rate": 8.757396449704143e-06, + "loss": 0.2104, + "step": 148 + }, + { + "epoch": 0.11, + "grad_norm": 38.11609599313387, + "learning_rate": 8.816568047337279e-06, + "loss": 0.1985, + "step": 149 + }, + { + "epoch": 0.11, + "grad_norm": 34.022306259789644, + "learning_rate": 8.875739644970414e-06, + "loss": 0.1743, + "step": 150 + }, + { + "epoch": 0.11, + "grad_norm": 86.92363217035724, + "learning_rate": 8.934911242603551e-06, + "loss": 0.2617, + "step": 151 + }, + { + "epoch": 0.11, + "grad_norm": 43.71673888744464, + "learning_rate": 8.994082840236687e-06, + "loss": 0.2148, + "step": 152 + }, + { + "epoch": 0.11, + "grad_norm": 31.570603625815373, + "learning_rate": 9.053254437869822e-06, + "loss": 0.1858, + "step": 153 + }, + { + "epoch": 0.11, + "grad_norm": 61.077459467399706, + "learning_rate": 9.11242603550296e-06, + "loss": 0.2427, + "step": 154 + }, + { + "epoch": 0.11, + "grad_norm": 50.10622818300115, + "learning_rate": 9.171597633136095e-06, + "loss": 0.2407, + "step": 155 + }, + { + "epoch": 0.11, + "grad_norm": 29.673041634897242, + "learning_rate": 9.230769230769232e-06, + "loss": 0.1736, + "step": 156 + }, + { + "epoch": 0.11, + "grad_norm": 34.729480106635634, + "learning_rate": 9.289940828402368e-06, + "loss": 0.1782, + "step": 157 + }, + { + "epoch": 0.11, + "grad_norm": 44.993817096293725, + "learning_rate": 9.349112426035503e-06, + "loss": 0.1809, + "step": 158 + }, + { + "epoch": 0.11, + "grad_norm": 8.899448788867884, + "learning_rate": 9.40828402366864e-06, + "loss": 0.1951, + "step": 159 + }, + { + "epoch": 0.11, + "grad_norm": 18.286850457168683, + "learning_rate": 9.467455621301776e-06, + "loss": 0.1606, + "step": 160 + }, + { + "epoch": 0.11, + "grad_norm": 56.53072371599443, + "learning_rate": 9.526627218934912e-06, + "loss": 0.2, + "step": 161 + }, + { + "epoch": 0.12, + "grad_norm": 5.264038479497928, + "learning_rate": 9.585798816568049e-06, + "loss": 0.1448, + "step": 162 + }, + { + "epoch": 0.12, + "grad_norm": 43.211370059501874, + "learning_rate": 9.644970414201184e-06, + "loss": 0.1975, + "step": 163 + }, + { + "epoch": 0.12, + "grad_norm": 14.273967210767667, + "learning_rate": 9.70414201183432e-06, + "loss": 0.1809, + "step": 164 + }, + { + "epoch": 0.12, + "grad_norm": 32.98648840761662, + "learning_rate": 9.763313609467457e-06, + "loss": 0.1685, + "step": 165 + }, + { + "epoch": 0.12, + "grad_norm": 33.93062311124414, + "learning_rate": 9.822485207100593e-06, + "loss": 0.1956, + "step": 166 + }, + { + "epoch": 0.12, + "grad_norm": 51.5363090967256, + "learning_rate": 9.88165680473373e-06, + "loss": 0.187, + "step": 167 + }, + { + "epoch": 0.12, + "grad_norm": 9.60396802401412, + "learning_rate": 9.940828402366864e-06, + "loss": 0.1592, + "step": 168 + }, + { + "epoch": 0.12, + "grad_norm": 25.22452245463198, + "learning_rate": 1e-05, + "loss": 0.1738, + "step": 169 + }, + { + "epoch": 0.12, + "grad_norm": 57.51781109179427, + "learning_rate": 9.999999164703534e-06, + "loss": 0.21, + "step": 170 + }, + { + "epoch": 0.12, + "grad_norm": 19.002582848688764, + "learning_rate": 9.999996658814406e-06, + "loss": 0.1428, + "step": 171 + }, + { + "epoch": 0.12, + "grad_norm": 12.276224778176067, + "learning_rate": 9.999992482333461e-06, + "loss": 0.1465, + "step": 172 + }, + { + "epoch": 0.12, + "grad_norm": 44.678917780683705, + "learning_rate": 9.99998663526209e-06, + "loss": 0.1617, + "step": 173 + }, + { + "epoch": 0.12, + "grad_norm": 6.960442853339805, + "learning_rate": 9.99997911760225e-06, + "loss": 0.1443, + "step": 174 + }, + { + "epoch": 0.12, + "grad_norm": 14.261742691312454, + "learning_rate": 9.99996992935645e-06, + "loss": 0.1682, + "step": 175 + }, + { + "epoch": 0.13, + "grad_norm": 24.032339010668686, + "learning_rate": 9.99995907052776e-06, + "loss": 0.1469, + "step": 176 + }, + { + "epoch": 0.13, + "grad_norm": 58.31377133614243, + "learning_rate": 9.99994654111981e-06, + "loss": 0.2073, + "step": 177 + }, + { + "epoch": 0.13, + "grad_norm": 23.70246515992443, + "learning_rate": 9.999932341136785e-06, + "loss": 0.1718, + "step": 178 + }, + { + "epoch": 0.13, + "grad_norm": 26.891031514237994, + "learning_rate": 9.999916470583429e-06, + "loss": 0.1554, + "step": 179 + }, + { + "epoch": 0.13, + "grad_norm": 57.563572942900166, + "learning_rate": 9.999898929465047e-06, + "loss": 0.1926, + "step": 180 + }, + { + "epoch": 0.13, + "grad_norm": 36.34968696505108, + "learning_rate": 9.999879717787495e-06, + "loss": 0.1558, + "step": 181 + }, + { + "epoch": 0.13, + "grad_norm": 7.794023955932686, + "learning_rate": 9.999858835557197e-06, + "loss": 0.145, + "step": 182 + }, + { + "epoch": 0.13, + "grad_norm": 47.23120098027718, + "learning_rate": 9.999836282781128e-06, + "loss": 0.178, + "step": 183 + }, + { + "epoch": 0.13, + "grad_norm": 40.114505438540625, + "learning_rate": 9.999812059466825e-06, + "loss": 0.2019, + "step": 184 + }, + { + "epoch": 0.13, + "grad_norm": 25.380586865046148, + "learning_rate": 9.999786165622379e-06, + "loss": 0.1559, + "step": 185 + }, + { + "epoch": 0.13, + "grad_norm": 29.25336121583889, + "learning_rate": 9.999758601256441e-06, + "loss": 0.1888, + "step": 186 + }, + { + "epoch": 0.13, + "grad_norm": 64.84408243732132, + "learning_rate": 9.999729366378224e-06, + "loss": 0.1921, + "step": 187 + }, + { + "epoch": 0.13, + "grad_norm": 22.31665172260842, + "learning_rate": 9.999698460997493e-06, + "loss": 0.1838, + "step": 188 + }, + { + "epoch": 0.13, + "grad_norm": 18.476404539169845, + "learning_rate": 9.999665885124577e-06, + "loss": 0.1499, + "step": 189 + }, + { + "epoch": 0.14, + "grad_norm": 41.418357640800615, + "learning_rate": 9.99963163877036e-06, + "loss": 0.2021, + "step": 190 + }, + { + "epoch": 0.14, + "grad_norm": 31.53664585439554, + "learning_rate": 9.99959572194628e-06, + "loss": 0.2122, + "step": 191 + }, + { + "epoch": 0.14, + "grad_norm": 21.575514536412193, + "learning_rate": 9.999558134664342e-06, + "loss": 0.1829, + "step": 192 + }, + { + "epoch": 0.14, + "grad_norm": 39.02562270046022, + "learning_rate": 9.999518876937102e-06, + "loss": 0.1566, + "step": 193 + }, + { + "epoch": 0.14, + "grad_norm": 33.24695587894434, + "learning_rate": 9.999477948777678e-06, + "loss": 0.1599, + "step": 194 + }, + { + "epoch": 0.14, + "grad_norm": 45.78127128268334, + "learning_rate": 9.999435350199745e-06, + "loss": 0.1709, + "step": 195 + }, + { + "epoch": 0.14, + "grad_norm": 42.42344263018681, + "learning_rate": 9.999391081217536e-06, + "loss": 0.1919, + "step": 196 + }, + { + "epoch": 0.14, + "grad_norm": 13.73579636286334, + "learning_rate": 9.999345141845842e-06, + "loss": 0.1562, + "step": 197 + }, + { + "epoch": 0.14, + "grad_norm": 42.85891427323893, + "learning_rate": 9.99929753210001e-06, + "loss": 0.1689, + "step": 198 + }, + { + "epoch": 0.14, + "grad_norm": 6.259144765611043, + "learning_rate": 9.999248251995951e-06, + "loss": 0.1653, + "step": 199 + }, + { + "epoch": 0.14, + "grad_norm": 24.27283819335801, + "learning_rate": 9.999197301550127e-06, + "loss": 0.1798, + "step": 200 + }, + { + "epoch": 0.14, + "grad_norm": 27.899669266770783, + "learning_rate": 9.999144680779564e-06, + "loss": 0.1731, + "step": 201 + }, + { + "epoch": 0.14, + "grad_norm": 11.179927364455224, + "learning_rate": 9.999090389701844e-06, + "loss": 0.1702, + "step": 202 + }, + { + "epoch": 0.14, + "grad_norm": 19.847681296179072, + "learning_rate": 9.999034428335103e-06, + "loss": 0.1591, + "step": 203 + }, + { + "epoch": 0.15, + "grad_norm": 11.752832888535757, + "learning_rate": 9.998976796698043e-06, + "loss": 0.1545, + "step": 204 + }, + { + "epoch": 0.15, + "grad_norm": 20.029510235909346, + "learning_rate": 9.998917494809917e-06, + "loss": 0.1603, + "step": 205 + }, + { + "epoch": 0.15, + "grad_norm": 15.214337375526133, + "learning_rate": 9.998856522690538e-06, + "loss": 0.1699, + "step": 206 + }, + { + "epoch": 0.15, + "grad_norm": 12.852230970076132, + "learning_rate": 9.998793880360283e-06, + "loss": 0.1493, + "step": 207 + }, + { + "epoch": 0.15, + "grad_norm": 42.16352485230185, + "learning_rate": 9.998729567840077e-06, + "loss": 0.2166, + "step": 208 + }, + { + "epoch": 0.15, + "grad_norm": 43.29824172685643, + "learning_rate": 9.998663585151409e-06, + "loss": 0.1897, + "step": 209 + }, + { + "epoch": 0.15, + "grad_norm": 42.33267438448777, + "learning_rate": 9.998595932316327e-06, + "loss": 0.1721, + "step": 210 + }, + { + "epoch": 0.15, + "grad_norm": 8.107364224481227, + "learning_rate": 9.998526609357432e-06, + "loss": 0.1514, + "step": 211 + }, + { + "epoch": 0.15, + "grad_norm": 40.03181460372598, + "learning_rate": 9.998455616297889e-06, + "loss": 0.1746, + "step": 212 + }, + { + "epoch": 0.15, + "grad_norm": 49.846724341530525, + "learning_rate": 9.998382953161417e-06, + "loss": 0.2153, + "step": 213 + }, + { + "epoch": 0.15, + "grad_norm": 23.55333318015724, + "learning_rate": 9.998308619972292e-06, + "loss": 0.1775, + "step": 214 + }, + { + "epoch": 0.15, + "grad_norm": 49.65058025675812, + "learning_rate": 9.998232616755354e-06, + "loss": 0.1721, + "step": 215 + }, + { + "epoch": 0.15, + "grad_norm": 30.06267959273612, + "learning_rate": 9.998154943535996e-06, + "loss": 0.1575, + "step": 216 + }, + { + "epoch": 0.15, + "grad_norm": 31.9421224193554, + "learning_rate": 9.998075600340166e-06, + "loss": 0.179, + "step": 217 + }, + { + "epoch": 0.16, + "grad_norm": 44.71443339581383, + "learning_rate": 9.997994587194381e-06, + "loss": 0.1708, + "step": 218 + }, + { + "epoch": 0.16, + "grad_norm": 5.825036612048357, + "learning_rate": 9.997911904125704e-06, + "loss": 0.141, + "step": 219 + }, + { + "epoch": 0.16, + "grad_norm": 9.105912112927012, + "learning_rate": 9.997827551161762e-06, + "loss": 0.1552, + "step": 220 + }, + { + "epoch": 0.16, + "grad_norm": 30.337684796583403, + "learning_rate": 9.997741528330739e-06, + "loss": 0.1497, + "step": 221 + }, + { + "epoch": 0.16, + "grad_norm": 5.256728997034248, + "learning_rate": 9.997653835661376e-06, + "loss": 0.1792, + "step": 222 + }, + { + "epoch": 0.16, + "grad_norm": 11.710837732666267, + "learning_rate": 9.997564473182976e-06, + "loss": 0.1416, + "step": 223 + }, + { + "epoch": 0.16, + "grad_norm": 6.138467739217358, + "learning_rate": 9.997473440925394e-06, + "loss": 0.1589, + "step": 224 + }, + { + "epoch": 0.16, + "grad_norm": 10.011674393770035, + "learning_rate": 9.997380738919045e-06, + "loss": 0.1641, + "step": 225 + }, + { + "epoch": 0.16, + "grad_norm": 45.558916966443206, + "learning_rate": 9.997286367194903e-06, + "loss": 0.1786, + "step": 226 + }, + { + "epoch": 0.16, + "grad_norm": 22.828341098912837, + "learning_rate": 9.9971903257845e-06, + "loss": 0.1472, + "step": 227 + }, + { + "epoch": 0.16, + "grad_norm": 32.44780904932061, + "learning_rate": 9.997092614719926e-06, + "loss": 0.1599, + "step": 228 + }, + { + "epoch": 0.16, + "grad_norm": 37.937074029999266, + "learning_rate": 9.996993234033826e-06, + "loss": 0.1521, + "step": 229 + }, + { + "epoch": 0.16, + "grad_norm": 22.707824607414384, + "learning_rate": 9.996892183759407e-06, + "loss": 0.1743, + "step": 230 + }, + { + "epoch": 0.16, + "grad_norm": 46.45915188550997, + "learning_rate": 9.99678946393043e-06, + "loss": 0.1892, + "step": 231 + }, + { + "epoch": 0.17, + "grad_norm": 24.319677361782876, + "learning_rate": 9.996685074581216e-06, + "loss": 0.1489, + "step": 232 + }, + { + "epoch": 0.17, + "grad_norm": 18.661367787669693, + "learning_rate": 9.996579015746645e-06, + "loss": 0.1721, + "step": 233 + }, + { + "epoch": 0.17, + "grad_norm": 57.942764629266655, + "learning_rate": 9.996471287462151e-06, + "loss": 0.1649, + "step": 234 + }, + { + "epoch": 0.17, + "grad_norm": 13.048230837396387, + "learning_rate": 9.99636188976373e-06, + "loss": 0.1602, + "step": 235 + }, + { + "epoch": 0.17, + "grad_norm": 8.709658285204258, + "learning_rate": 9.996250822687932e-06, + "loss": 0.1438, + "step": 236 + }, + { + "epoch": 0.17, + "grad_norm": 31.617100347623733, + "learning_rate": 9.996138086271869e-06, + "loss": 0.1556, + "step": 237 + }, + { + "epoch": 0.17, + "grad_norm": 35.27416734620715, + "learning_rate": 9.996023680553204e-06, + "loss": 0.1575, + "step": 238 + }, + { + "epoch": 0.17, + "grad_norm": 9.153491233670788, + "learning_rate": 9.995907605570167e-06, + "loss": 0.1797, + "step": 239 + }, + { + "epoch": 0.17, + "grad_norm": 7.987526278742785, + "learning_rate": 9.995789861361538e-06, + "loss": 0.2065, + "step": 240 + }, + { + "epoch": 0.17, + "grad_norm": 64.32372496086278, + "learning_rate": 9.995670447966658e-06, + "loss": 0.1824, + "step": 241 + }, + { + "epoch": 0.17, + "grad_norm": 15.645118470787404, + "learning_rate": 9.995549365425426e-06, + "loss": 0.1591, + "step": 242 + }, + { + "epoch": 0.17, + "grad_norm": 4.7785990929647815, + "learning_rate": 9.995426613778297e-06, + "loss": 0.1615, + "step": 243 + }, + { + "epoch": 0.17, + "grad_norm": 44.643088379285764, + "learning_rate": 9.995302193066286e-06, + "loss": 0.1528, + "step": 244 + }, + { + "epoch": 0.17, + "grad_norm": 29.669669631908086, + "learning_rate": 9.995176103330962e-06, + "loss": 0.1643, + "step": 245 + }, + { + "epoch": 0.18, + "grad_norm": 5.187868389208368, + "learning_rate": 9.995048344614455e-06, + "loss": 0.1855, + "step": 246 + }, + { + "epoch": 0.18, + "grad_norm": 64.34845710145298, + "learning_rate": 9.994918916959453e-06, + "loss": 0.2019, + "step": 247 + }, + { + "epoch": 0.18, + "grad_norm": 27.545718441791433, + "learning_rate": 9.994787820409198e-06, + "loss": 0.1816, + "step": 248 + }, + { + "epoch": 0.18, + "grad_norm": 16.691064146642706, + "learning_rate": 9.994655055007491e-06, + "loss": 0.1548, + "step": 249 + }, + { + "epoch": 0.18, + "grad_norm": 32.47918718068585, + "learning_rate": 9.994520620798696e-06, + "loss": 0.1423, + "step": 250 + }, + { + "epoch": 0.18, + "grad_norm": 62.24279005895381, + "learning_rate": 9.994384517827726e-06, + "loss": 0.1908, + "step": 251 + }, + { + "epoch": 0.18, + "grad_norm": 6.306470473367477, + "learning_rate": 9.994246746140057e-06, + "loss": 0.1655, + "step": 252 + }, + { + "epoch": 0.18, + "grad_norm": 69.9581859530375, + "learning_rate": 9.99410730578172e-06, + "loss": 0.2007, + "step": 253 + }, + { + "epoch": 0.18, + "grad_norm": 35.84402462268306, + "learning_rate": 9.993966196799304e-06, + "loss": 0.1494, + "step": 254 + }, + { + "epoch": 0.18, + "grad_norm": 6.728361882638175, + "learning_rate": 9.993823419239959e-06, + "loss": 0.1531, + "step": 255 + }, + { + "epoch": 0.18, + "grad_norm": 30.744672622096342, + "learning_rate": 9.993678973151388e-06, + "loss": 0.1378, + "step": 256 + }, + { + "epoch": 0.18, + "grad_norm": 72.68699910576366, + "learning_rate": 9.993532858581853e-06, + "loss": 0.2354, + "step": 257 + }, + { + "epoch": 0.18, + "grad_norm": 10.87452116844784, + "learning_rate": 9.993385075580173e-06, + "loss": 0.1495, + "step": 258 + }, + { + "epoch": 0.18, + "grad_norm": 27.98767655605202, + "learning_rate": 9.993235624195728e-06, + "loss": 0.1709, + "step": 259 + }, + { + "epoch": 0.19, + "grad_norm": 33.83809754783617, + "learning_rate": 9.993084504478448e-06, + "loss": 0.167, + "step": 260 + }, + { + "epoch": 0.19, + "grad_norm": 5.335493128269599, + "learning_rate": 9.99293171647883e-06, + "loss": 0.1222, + "step": 261 + }, + { + "epoch": 0.19, + "grad_norm": 13.010955660808273, + "learning_rate": 9.992777260247916e-06, + "loss": 0.1418, + "step": 262 + }, + { + "epoch": 0.19, + "grad_norm": 5.84935039014272, + "learning_rate": 9.99262113583732e-06, + "loss": 0.1448, + "step": 263 + }, + { + "epoch": 0.19, + "grad_norm": 15.275558317087714, + "learning_rate": 9.992463343299203e-06, + "loss": 0.1398, + "step": 264 + }, + { + "epoch": 0.19, + "grad_norm": 11.577795034557614, + "learning_rate": 9.992303882686288e-06, + "loss": 0.1459, + "step": 265 + }, + { + "epoch": 0.19, + "grad_norm": 21.96507681305609, + "learning_rate": 9.99214275405185e-06, + "loss": 0.1466, + "step": 266 + }, + { + "epoch": 0.19, + "grad_norm": 6.113512224090927, + "learning_rate": 9.991979957449729e-06, + "loss": 0.149, + "step": 267 + }, + { + "epoch": 0.19, + "grad_norm": 7.810162297246139, + "learning_rate": 9.991815492934318e-06, + "loss": 0.1479, + "step": 268 + }, + { + "epoch": 0.19, + "grad_norm": 32.60107223287011, + "learning_rate": 9.991649360560565e-06, + "loss": 0.1899, + "step": 269 + }, + { + "epoch": 0.19, + "grad_norm": 29.946520202273987, + "learning_rate": 9.99148156038398e-06, + "loss": 0.1804, + "step": 270 + }, + { + "epoch": 0.19, + "grad_norm": 10.367035744107707, + "learning_rate": 9.991312092460626e-06, + "loss": 0.1296, + "step": 271 + }, + { + "epoch": 0.19, + "grad_norm": 62.72671197950166, + "learning_rate": 9.991140956847128e-06, + "loss": 0.207, + "step": 272 + }, + { + "epoch": 0.19, + "grad_norm": 4.62480621517589, + "learning_rate": 9.990968153600664e-06, + "loss": 0.1626, + "step": 273 + }, + { + "epoch": 0.2, + "grad_norm": 19.419784824468074, + "learning_rate": 9.990793682778973e-06, + "loss": 0.139, + "step": 274 + }, + { + "epoch": 0.2, + "grad_norm": 52.135212954885354, + "learning_rate": 9.990617544440346e-06, + "loss": 0.1566, + "step": 275 + }, + { + "epoch": 0.2, + "grad_norm": 20.1662562272831, + "learning_rate": 9.990439738643635e-06, + "loss": 0.1516, + "step": 276 + }, + { + "epoch": 0.2, + "grad_norm": 21.742228599596476, + "learning_rate": 9.99026026544825e-06, + "loss": 0.1516, + "step": 277 + }, + { + "epoch": 0.2, + "grad_norm": 47.92258723772546, + "learning_rate": 9.990079124914156e-06, + "loss": 0.1448, + "step": 278 + }, + { + "epoch": 0.2, + "grad_norm": 30.997197668797394, + "learning_rate": 9.989896317101873e-06, + "loss": 0.1375, + "step": 279 + }, + { + "epoch": 0.2, + "grad_norm": 31.062475355277208, + "learning_rate": 9.989711842072482e-06, + "loss": 0.1689, + "step": 280 + }, + { + "epoch": 0.2, + "grad_norm": 53.09032787145096, + "learning_rate": 9.989525699887619e-06, + "loss": 0.1543, + "step": 281 + }, + { + "epoch": 0.2, + "grad_norm": 31.435016992194903, + "learning_rate": 9.989337890609478e-06, + "loss": 0.1792, + "step": 282 + }, + { + "epoch": 0.2, + "grad_norm": 17.590958097222614, + "learning_rate": 9.98914841430081e-06, + "loss": 0.1298, + "step": 283 + }, + { + "epoch": 0.2, + "grad_norm": 7.770018952416595, + "learning_rate": 9.988957271024922e-06, + "loss": 0.119, + "step": 284 + }, + { + "epoch": 0.2, + "grad_norm": 67.99377764408035, + "learning_rate": 9.988764460845676e-06, + "loss": 0.2058, + "step": 285 + }, + { + "epoch": 0.2, + "grad_norm": 37.91613195879691, + "learning_rate": 9.9885699838275e-06, + "loss": 0.1306, + "step": 286 + }, + { + "epoch": 0.2, + "grad_norm": 26.261219214046495, + "learning_rate": 9.988373840035366e-06, + "loss": 0.1443, + "step": 287 + }, + { + "epoch": 0.21, + "grad_norm": 42.836761940466495, + "learning_rate": 9.988176029534814e-06, + "loss": 0.161, + "step": 288 + }, + { + "epoch": 0.21, + "grad_norm": 39.39654040488338, + "learning_rate": 9.987976552391933e-06, + "loss": 0.1567, + "step": 289 + }, + { + "epoch": 0.21, + "grad_norm": 11.790620806136728, + "learning_rate": 9.987775408673373e-06, + "loss": 0.1569, + "step": 290 + }, + { + "epoch": 0.21, + "grad_norm": 53.31062556612756, + "learning_rate": 9.987572598446337e-06, + "loss": 0.1945, + "step": 291 + }, + { + "epoch": 0.21, + "grad_norm": 56.490600010392015, + "learning_rate": 9.987368121778594e-06, + "loss": 0.1736, + "step": 292 + }, + { + "epoch": 0.21, + "grad_norm": 11.122995442967113, + "learning_rate": 9.98716197873846e-06, + "loss": 0.1388, + "step": 293 + }, + { + "epoch": 0.21, + "grad_norm": 92.30833385479315, + "learning_rate": 9.98695416939481e-06, + "loss": 0.231, + "step": 294 + }, + { + "epoch": 0.21, + "grad_norm": 35.983480070160525, + "learning_rate": 9.986744693817077e-06, + "loss": 0.1768, + "step": 295 + }, + { + "epoch": 0.21, + "grad_norm": 16.767014422656075, + "learning_rate": 9.986533552075252e-06, + "loss": 0.1654, + "step": 296 + }, + { + "epoch": 0.21, + "grad_norm": 34.16924846592324, + "learning_rate": 9.986320744239883e-06, + "loss": 0.1589, + "step": 297 + }, + { + "epoch": 0.21, + "grad_norm": 51.06145849537146, + "learning_rate": 9.98610627038207e-06, + "loss": 0.1792, + "step": 298 + }, + { + "epoch": 0.21, + "grad_norm": 14.179575987397332, + "learning_rate": 9.985890130573474e-06, + "loss": 0.1523, + "step": 299 + }, + { + "epoch": 0.21, + "grad_norm": 32.87174916074473, + "learning_rate": 9.98567232488631e-06, + "loss": 0.1482, + "step": 300 + }, + { + "epoch": 0.21, + "grad_norm": 20.716840238877516, + "learning_rate": 9.985452853393353e-06, + "loss": 0.1276, + "step": 301 + }, + { + "epoch": 0.22, + "grad_norm": 39.212614000189724, + "learning_rate": 9.985231716167933e-06, + "loss": 0.15, + "step": 302 + }, + { + "epoch": 0.22, + "grad_norm": 14.977736600228983, + "learning_rate": 9.985008913283933e-06, + "loss": 0.1562, + "step": 303 + }, + { + "epoch": 0.22, + "grad_norm": 36.10332759742872, + "learning_rate": 9.984784444815799e-06, + "loss": 0.191, + "step": 304 + }, + { + "epoch": 0.22, + "grad_norm": 8.909269570291388, + "learning_rate": 9.984558310838528e-06, + "loss": 0.1589, + "step": 305 + }, + { + "epoch": 0.22, + "grad_norm": 28.923562213086704, + "learning_rate": 9.984330511427676e-06, + "loss": 0.1799, + "step": 306 + }, + { + "epoch": 0.22, + "grad_norm": 13.490403752500722, + "learning_rate": 9.984101046659353e-06, + "loss": 0.1479, + "step": 307 + }, + { + "epoch": 0.22, + "grad_norm": 30.913895702455253, + "learning_rate": 9.983869916610232e-06, + "loss": 0.1377, + "step": 308 + }, + { + "epoch": 0.22, + "grad_norm": 53.35070194289101, + "learning_rate": 9.983637121357534e-06, + "loss": 0.1831, + "step": 309 + }, + { + "epoch": 0.22, + "grad_norm": 19.191550606900922, + "learning_rate": 9.983402660979042e-06, + "loss": 0.1614, + "step": 310 + }, + { + "epoch": 0.22, + "grad_norm": 46.927298296283055, + "learning_rate": 9.983166535553093e-06, + "loss": 0.1902, + "step": 311 + }, + { + "epoch": 0.22, + "grad_norm": 13.604114451062268, + "learning_rate": 9.98292874515858e-06, + "loss": 0.1169, + "step": 312 + }, + { + "epoch": 0.22, + "grad_norm": 9.658236921460949, + "learning_rate": 9.982689289874956e-06, + "loss": 0.1569, + "step": 313 + }, + { + "epoch": 0.22, + "grad_norm": 6.620979122118501, + "learning_rate": 9.982448169782226e-06, + "loss": 0.1575, + "step": 314 + }, + { + "epoch": 0.22, + "grad_norm": 9.20134743823668, + "learning_rate": 9.98220538496095e-06, + "loss": 0.1302, + "step": 315 + }, + { + "epoch": 0.23, + "grad_norm": 21.83470224282846, + "learning_rate": 9.98196093549225e-06, + "loss": 0.1381, + "step": 316 + }, + { + "epoch": 0.23, + "grad_norm": 5.019438949112746, + "learning_rate": 9.9817148214578e-06, + "loss": 0.1637, + "step": 317 + }, + { + "epoch": 0.23, + "grad_norm": 18.531774403521805, + "learning_rate": 9.981467042939833e-06, + "loss": 0.1531, + "step": 318 + }, + { + "epoch": 0.23, + "grad_norm": 6.772842664240888, + "learning_rate": 9.981217600021133e-06, + "loss": 0.1455, + "step": 319 + }, + { + "epoch": 0.23, + "grad_norm": 4.688710490868121, + "learning_rate": 9.980966492785048e-06, + "loss": 0.1639, + "step": 320 + }, + { + "epoch": 0.23, + "grad_norm": 12.259163694801227, + "learning_rate": 9.980713721315473e-06, + "loss": 0.1166, + "step": 321 + }, + { + "epoch": 0.23, + "grad_norm": 7.099541659387623, + "learning_rate": 9.98045928569687e-06, + "loss": 0.1406, + "step": 322 + }, + { + "epoch": 0.23, + "grad_norm": 12.661009836373966, + "learning_rate": 9.98020318601424e-06, + "loss": 0.1086, + "step": 323 + }, + { + "epoch": 0.23, + "grad_norm": 7.038778597302965, + "learning_rate": 9.97994542235316e-06, + "loss": 0.1447, + "step": 324 + }, + { + "epoch": 0.23, + "grad_norm": 21.75909194513975, + "learning_rate": 9.979685994799753e-06, + "loss": 0.1561, + "step": 325 + }, + { + "epoch": 0.23, + "grad_norm": 35.89676316531792, + "learning_rate": 9.979424903440695e-06, + "loss": 0.1526, + "step": 326 + }, + { + "epoch": 0.23, + "grad_norm": 26.47012119449328, + "learning_rate": 9.979162148363222e-06, + "loss": 0.1331, + "step": 327 + }, + { + "epoch": 0.23, + "grad_norm": 41.47116585357957, + "learning_rate": 9.978897729655127e-06, + "loss": 0.1527, + "step": 328 + }, + { + "epoch": 0.23, + "grad_norm": 68.50658423771155, + "learning_rate": 9.978631647404755e-06, + "loss": 0.2024, + "step": 329 + }, + { + "epoch": 0.24, + "grad_norm": 5.536910352068493, + "learning_rate": 9.97836390170101e-06, + "loss": 0.1448, + "step": 330 + }, + { + "epoch": 0.24, + "grad_norm": 62.646920155000565, + "learning_rate": 9.978094492633353e-06, + "loss": 0.1959, + "step": 331 + }, + { + "epoch": 0.24, + "grad_norm": 74.9129334802522, + "learning_rate": 9.977823420291796e-06, + "loss": 0.2213, + "step": 332 + }, + { + "epoch": 0.24, + "grad_norm": 4.926863527790253, + "learning_rate": 9.97755068476691e-06, + "loss": 0.1501, + "step": 333 + }, + { + "epoch": 0.24, + "grad_norm": 29.900030068841634, + "learning_rate": 9.977276286149821e-06, + "loss": 0.1589, + "step": 334 + }, + { + "epoch": 0.24, + "grad_norm": 64.98822420575073, + "learning_rate": 9.977000224532211e-06, + "loss": 0.1938, + "step": 335 + }, + { + "epoch": 0.24, + "grad_norm": 43.849118052159554, + "learning_rate": 9.976722500006318e-06, + "loss": 0.1895, + "step": 336 + }, + { + "epoch": 0.24, + "grad_norm": 16.878076037266382, + "learning_rate": 9.976443112664932e-06, + "loss": 0.1444, + "step": 337 + }, + { + "epoch": 0.24, + "grad_norm": 58.54099348004542, + "learning_rate": 9.976162062601407e-06, + "loss": 0.1805, + "step": 338 + }, + { + "epoch": 0.24, + "grad_norm": 37.890609096404795, + "learning_rate": 9.97587934990964e-06, + "loss": 0.1713, + "step": 339 + }, + { + "epoch": 0.24, + "grad_norm": 5.701668469320554, + "learning_rate": 9.975594974684096e-06, + "loss": 0.1388, + "step": 340 + }, + { + "epoch": 0.24, + "grad_norm": 7.179582917571072, + "learning_rate": 9.975308937019787e-06, + "loss": 0.1361, + "step": 341 + }, + { + "epoch": 0.24, + "grad_norm": 38.612872861365716, + "learning_rate": 9.975021237012286e-06, + "loss": 0.1589, + "step": 342 + }, + { + "epoch": 0.24, + "grad_norm": 28.653520911802577, + "learning_rate": 9.974731874757717e-06, + "loss": 0.1484, + "step": 343 + }, + { + "epoch": 0.25, + "grad_norm": 28.38881703138808, + "learning_rate": 9.974440850352762e-06, + "loss": 0.1755, + "step": 344 + }, + { + "epoch": 0.25, + "grad_norm": 48.04652377386355, + "learning_rate": 9.974148163894658e-06, + "loss": 0.1395, + "step": 345 + }, + { + "epoch": 0.25, + "grad_norm": 28.398568336832877, + "learning_rate": 9.973853815481196e-06, + "loss": 0.1409, + "step": 346 + }, + { + "epoch": 0.25, + "grad_norm": 11.919991106134674, + "learning_rate": 9.973557805210724e-06, + "loss": 0.1555, + "step": 347 + }, + { + "epoch": 0.25, + "grad_norm": 9.837518594770932, + "learning_rate": 9.973260133182145e-06, + "loss": 0.1455, + "step": 348 + }, + { + "epoch": 0.25, + "grad_norm": 27.31277900154392, + "learning_rate": 9.972960799494915e-06, + "loss": 0.1361, + "step": 349 + }, + { + "epoch": 0.25, + "grad_norm": 12.083930952779552, + "learning_rate": 9.972659804249047e-06, + "loss": 0.1295, + "step": 350 + }, + { + "epoch": 0.25, + "grad_norm": 7.435839166347306, + "learning_rate": 9.972357147545113e-06, + "loss": 0.1345, + "step": 351 + }, + { + "epoch": 0.25, + "grad_norm": 28.605070082539534, + "learning_rate": 9.972052829484231e-06, + "loss": 0.1387, + "step": 352 + }, + { + "epoch": 0.25, + "grad_norm": 13.819008184513397, + "learning_rate": 9.971746850168084e-06, + "loss": 0.1255, + "step": 353 + }, + { + "epoch": 0.25, + "grad_norm": 15.558085688257078, + "learning_rate": 9.971439209698902e-06, + "loss": 0.1755, + "step": 354 + }, + { + "epoch": 0.25, + "grad_norm": 14.892786074375907, + "learning_rate": 9.971129908179474e-06, + "loss": 0.1552, + "step": 355 + }, + { + "epoch": 0.25, + "grad_norm": 9.814580259790407, + "learning_rate": 9.970818945713145e-06, + "loss": 0.1426, + "step": 356 + }, + { + "epoch": 0.25, + "grad_norm": 8.246334439430278, + "learning_rate": 9.970506322403813e-06, + "loss": 0.1237, + "step": 357 + }, + { + "epoch": 0.26, + "grad_norm": 36.7630773678966, + "learning_rate": 9.970192038355928e-06, + "loss": 0.1527, + "step": 358 + }, + { + "epoch": 0.26, + "grad_norm": 18.698463504899674, + "learning_rate": 9.969876093674502e-06, + "loss": 0.1565, + "step": 359 + }, + { + "epoch": 0.26, + "grad_norm": 34.419554162278324, + "learning_rate": 9.969558488465097e-06, + "loss": 0.1506, + "step": 360 + }, + { + "epoch": 0.26, + "grad_norm": 36.93690244776614, + "learning_rate": 9.969239222833829e-06, + "loss": 0.1531, + "step": 361 + }, + { + "epoch": 0.26, + "grad_norm": 33.49368454947981, + "learning_rate": 9.968918296887374e-06, + "loss": 0.1509, + "step": 362 + }, + { + "epoch": 0.26, + "grad_norm": 25.28539142873406, + "learning_rate": 9.968595710732955e-06, + "loss": 0.1499, + "step": 363 + }, + { + "epoch": 0.26, + "grad_norm": 23.43222923122621, + "learning_rate": 9.968271464478357e-06, + "loss": 0.1312, + "step": 364 + }, + { + "epoch": 0.26, + "grad_norm": 19.51822930107335, + "learning_rate": 9.967945558231917e-06, + "loss": 0.144, + "step": 365 + }, + { + "epoch": 0.26, + "grad_norm": 41.83722751889746, + "learning_rate": 9.967617992102526e-06, + "loss": 0.1533, + "step": 366 + }, + { + "epoch": 0.26, + "grad_norm": 30.973390532695422, + "learning_rate": 9.967288766199628e-06, + "loss": 0.13, + "step": 367 + }, + { + "epoch": 0.26, + "grad_norm": 9.81797488242334, + "learning_rate": 9.966957880633225e-06, + "loss": 0.1371, + "step": 368 + }, + { + "epoch": 0.26, + "grad_norm": 18.097159112396604, + "learning_rate": 9.966625335513873e-06, + "loss": 0.1356, + "step": 369 + }, + { + "epoch": 0.26, + "grad_norm": 5.8534602258826025, + "learning_rate": 9.96629113095268e-06, + "loss": 0.1406, + "step": 370 + }, + { + "epoch": 0.26, + "grad_norm": 12.63522722168294, + "learning_rate": 9.965955267061309e-06, + "loss": 0.1616, + "step": 371 + }, + { + "epoch": 0.27, + "grad_norm": 12.281840014620311, + "learning_rate": 9.965617743951982e-06, + "loss": 0.1528, + "step": 372 + }, + { + "epoch": 0.27, + "grad_norm": 11.010291018078059, + "learning_rate": 9.965278561737466e-06, + "loss": 0.1039, + "step": 373 + }, + { + "epoch": 0.27, + "grad_norm": 23.593676742558646, + "learning_rate": 9.964937720531094e-06, + "loss": 0.1334, + "step": 374 + }, + { + "epoch": 0.27, + "grad_norm": 11.379963157519395, + "learning_rate": 9.964595220446744e-06, + "loss": 0.1658, + "step": 375 + }, + { + "epoch": 0.27, + "grad_norm": 8.612895660033113, + "learning_rate": 9.964251061598853e-06, + "loss": 0.1724, + "step": 376 + }, + { + "epoch": 0.27, + "grad_norm": 10.169065570952082, + "learning_rate": 9.96390524410241e-06, + "loss": 0.1475, + "step": 377 + }, + { + "epoch": 0.27, + "grad_norm": 24.25698722300466, + "learning_rate": 9.96355776807296e-06, + "loss": 0.1232, + "step": 378 + }, + { + "epoch": 0.27, + "grad_norm": 17.4894072996316, + "learning_rate": 9.9632086336266e-06, + "loss": 0.1412, + "step": 379 + }, + { + "epoch": 0.27, + "grad_norm": 14.429277148705365, + "learning_rate": 9.962857840879983e-06, + "loss": 0.1322, + "step": 380 + }, + { + "epoch": 0.27, + "grad_norm": 21.967245963743874, + "learning_rate": 9.962505389950317e-06, + "loss": 0.1565, + "step": 381 + }, + { + "epoch": 0.27, + "grad_norm": 13.836611609961958, + "learning_rate": 9.962151280955359e-06, + "loss": 0.1473, + "step": 382 + }, + { + "epoch": 0.27, + "grad_norm": 6.777348182737883, + "learning_rate": 9.961795514013424e-06, + "loss": 0.1934, + "step": 383 + }, + { + "epoch": 0.27, + "grad_norm": 14.417467203134215, + "learning_rate": 9.961438089243384e-06, + "loss": 0.1414, + "step": 384 + }, + { + "epoch": 0.27, + "grad_norm": 6.704688741271175, + "learning_rate": 9.961079006764659e-06, + "loss": 0.198, + "step": 385 + }, + { + "epoch": 0.28, + "grad_norm": 6.379293993035889, + "learning_rate": 9.960718266697223e-06, + "loss": 0.115, + "step": 386 + }, + { + "epoch": 0.28, + "grad_norm": 13.477558643143759, + "learning_rate": 9.960355869161609e-06, + "loss": 0.1284, + "step": 387 + }, + { + "epoch": 0.28, + "grad_norm": 5.698091583062664, + "learning_rate": 9.959991814278898e-06, + "loss": 0.1287, + "step": 388 + }, + { + "epoch": 0.28, + "grad_norm": 22.68559034944762, + "learning_rate": 9.95962610217073e-06, + "loss": 0.1202, + "step": 389 + }, + { + "epoch": 0.28, + "grad_norm": 11.617129910996631, + "learning_rate": 9.959258732959296e-06, + "loss": 0.1614, + "step": 390 + }, + { + "epoch": 0.28, + "grad_norm": 22.012768496639392, + "learning_rate": 9.958889706767341e-06, + "loss": 0.1481, + "step": 391 + }, + { + "epoch": 0.28, + "grad_norm": 42.213262359287036, + "learning_rate": 9.95851902371816e-06, + "loss": 0.1747, + "step": 392 + }, + { + "epoch": 0.28, + "grad_norm": 10.688717536140965, + "learning_rate": 9.95814668393561e-06, + "loss": 0.1383, + "step": 393 + }, + { + "epoch": 0.28, + "grad_norm": 41.26203920787363, + "learning_rate": 9.957772687544094e-06, + "loss": 0.1768, + "step": 394 + }, + { + "epoch": 0.28, + "grad_norm": 10.068392998011415, + "learning_rate": 9.95739703466857e-06, + "loss": 0.136, + "step": 395 + }, + { + "epoch": 0.28, + "grad_norm": 7.825594261580451, + "learning_rate": 9.957019725434554e-06, + "loss": 0.1346, + "step": 396 + }, + { + "epoch": 0.28, + "grad_norm": 14.922584492469332, + "learning_rate": 9.956640759968111e-06, + "loss": 0.1091, + "step": 397 + }, + { + "epoch": 0.28, + "grad_norm": 15.362761377543631, + "learning_rate": 9.956260138395857e-06, + "loss": 0.1241, + "step": 398 + }, + { + "epoch": 0.28, + "grad_norm": 14.478039428734272, + "learning_rate": 9.955877860844969e-06, + "loss": 0.1665, + "step": 399 + }, + { + "epoch": 0.29, + "grad_norm": 15.342154188172143, + "learning_rate": 9.955493927443171e-06, + "loss": 0.1294, + "step": 400 + }, + { + "epoch": 0.29, + "grad_norm": 16.72367814770503, + "learning_rate": 9.955108338318743e-06, + "loss": 0.1521, + "step": 401 + }, + { + "epoch": 0.29, + "grad_norm": 12.948093169878645, + "learning_rate": 9.954721093600517e-06, + "loss": 0.1439, + "step": 402 + }, + { + "epoch": 0.29, + "grad_norm": 4.973524690035784, + "learning_rate": 9.95433219341788e-06, + "loss": 0.1324, + "step": 403 + }, + { + "epoch": 0.29, + "grad_norm": 15.47228950899974, + "learning_rate": 9.953941637900769e-06, + "loss": 0.1686, + "step": 404 + }, + { + "epoch": 0.29, + "grad_norm": 10.160262840433568, + "learning_rate": 9.953549427179676e-06, + "loss": 0.1477, + "step": 405 + }, + { + "epoch": 0.29, + "grad_norm": 25.931016059983655, + "learning_rate": 9.953155561385646e-06, + "loss": 0.1356, + "step": 406 + }, + { + "epoch": 0.29, + "grad_norm": 5.62503524150755, + "learning_rate": 9.952760040650278e-06, + "loss": 0.1508, + "step": 407 + }, + { + "epoch": 0.29, + "grad_norm": 6.370817286061716, + "learning_rate": 9.95236286510572e-06, + "loss": 0.1111, + "step": 408 + }, + { + "epoch": 0.29, + "grad_norm": 17.969755002141245, + "learning_rate": 9.95196403488468e-06, + "loss": 0.1555, + "step": 409 + }, + { + "epoch": 0.29, + "grad_norm": 15.602862431905702, + "learning_rate": 9.951563550120412e-06, + "loss": 0.1444, + "step": 410 + }, + { + "epoch": 0.29, + "grad_norm": 24.892905576485063, + "learning_rate": 9.951161410946725e-06, + "loss": 0.142, + "step": 411 + }, + { + "epoch": 0.29, + "grad_norm": 26.823371334885348, + "learning_rate": 9.950757617497983e-06, + "loss": 0.1376, + "step": 412 + }, + { + "epoch": 0.29, + "grad_norm": 42.11109194980369, + "learning_rate": 9.950352169909101e-06, + "loss": 0.1213, + "step": 413 + }, + { + "epoch": 0.3, + "grad_norm": 11.958217327562295, + "learning_rate": 9.949945068315544e-06, + "loss": 0.1626, + "step": 414 + }, + { + "epoch": 0.3, + "grad_norm": 17.71107942350547, + "learning_rate": 9.949536312853334e-06, + "loss": 0.166, + "step": 415 + }, + { + "epoch": 0.3, + "grad_norm": 33.36805721946892, + "learning_rate": 9.949125903659042e-06, + "loss": 0.1525, + "step": 416 + }, + { + "epoch": 0.3, + "grad_norm": 10.084727395074816, + "learning_rate": 9.948713840869797e-06, + "loss": 0.1426, + "step": 417 + }, + { + "epoch": 0.3, + "grad_norm": 29.756453469589164, + "learning_rate": 9.948300124623274e-06, + "loss": 0.1035, + "step": 418 + }, + { + "epoch": 0.3, + "grad_norm": 5.824683629550345, + "learning_rate": 9.947884755057703e-06, + "loss": 0.14, + "step": 419 + }, + { + "epoch": 0.3, + "grad_norm": 11.852603046561496, + "learning_rate": 9.947467732311868e-06, + "loss": 0.1642, + "step": 420 + }, + { + "epoch": 0.3, + "grad_norm": 14.091641894723889, + "learning_rate": 9.947049056525104e-06, + "loss": 0.1337, + "step": 421 + }, + { + "epoch": 0.3, + "grad_norm": 11.772867686006098, + "learning_rate": 9.9466287278373e-06, + "loss": 0.1405, + "step": 422 + }, + { + "epoch": 0.3, + "grad_norm": 20.227967360512032, + "learning_rate": 9.946206746388892e-06, + "loss": 0.1464, + "step": 423 + }, + { + "epoch": 0.3, + "grad_norm": 28.120732815939572, + "learning_rate": 9.94578311232087e-06, + "loss": 0.1587, + "step": 424 + }, + { + "epoch": 0.3, + "grad_norm": 6.836326247915467, + "learning_rate": 9.945357825774786e-06, + "loss": 0.1632, + "step": 425 + }, + { + "epoch": 0.3, + "grad_norm": 38.068508076771145, + "learning_rate": 9.944930886892731e-06, + "loss": 0.1635, + "step": 426 + }, + { + "epoch": 0.3, + "grad_norm": 17.53039371409448, + "learning_rate": 9.944502295817353e-06, + "loss": 0.1393, + "step": 427 + }, + { + "epoch": 0.31, + "grad_norm": 5.744624734916593, + "learning_rate": 9.944072052691853e-06, + "loss": 0.1299, + "step": 428 + }, + { + "epoch": 0.31, + "grad_norm": 36.21266814484719, + "learning_rate": 9.943640157659984e-06, + "loss": 0.1241, + "step": 429 + }, + { + "epoch": 0.31, + "grad_norm": 27.99729719398013, + "learning_rate": 9.94320661086605e-06, + "loss": 0.1172, + "step": 430 + }, + { + "epoch": 0.31, + "grad_norm": 32.61398458459067, + "learning_rate": 9.942771412454906e-06, + "loss": 0.1688, + "step": 431 + }, + { + "epoch": 0.31, + "grad_norm": 29.53139637962103, + "learning_rate": 9.942334562571961e-06, + "loss": 0.14, + "step": 432 + }, + { + "epoch": 0.31, + "grad_norm": 70.15413954052487, + "learning_rate": 9.941896061363173e-06, + "loss": 0.1909, + "step": 433 + }, + { + "epoch": 0.31, + "grad_norm": 41.57928064536469, + "learning_rate": 9.941455908975054e-06, + "loss": 0.1348, + "step": 434 + }, + { + "epoch": 0.31, + "grad_norm": 38.37885664713142, + "learning_rate": 9.941014105554668e-06, + "loss": 0.1616, + "step": 435 + }, + { + "epoch": 0.31, + "grad_norm": 49.537343514648235, + "learning_rate": 9.94057065124963e-06, + "loss": 0.1626, + "step": 436 + }, + { + "epoch": 0.31, + "grad_norm": 54.24518982500366, + "learning_rate": 9.940125546208107e-06, + "loss": 0.1528, + "step": 437 + }, + { + "epoch": 0.31, + "grad_norm": 5.439592562127047, + "learning_rate": 9.939678790578813e-06, + "loss": 0.1382, + "step": 438 + }, + { + "epoch": 0.31, + "grad_norm": 50.401662662406025, + "learning_rate": 9.93923038451102e-06, + "loss": 0.1444, + "step": 439 + }, + { + "epoch": 0.31, + "grad_norm": 27.565998455504264, + "learning_rate": 9.938780328154549e-06, + "loss": 0.1638, + "step": 440 + }, + { + "epoch": 0.31, + "grad_norm": 29.80949294384711, + "learning_rate": 9.938328621659775e-06, + "loss": 0.177, + "step": 441 + }, + { + "epoch": 0.32, + "grad_norm": 39.42630244801424, + "learning_rate": 9.937875265177615e-06, + "loss": 0.1831, + "step": 442 + }, + { + "epoch": 0.32, + "grad_norm": 5.22622431582067, + "learning_rate": 9.937420258859547e-06, + "loss": 0.1394, + "step": 443 + }, + { + "epoch": 0.32, + "grad_norm": 32.328178133199046, + "learning_rate": 9.9369636028576e-06, + "loss": 0.1349, + "step": 444 + }, + { + "epoch": 0.32, + "grad_norm": 6.504231110559711, + "learning_rate": 9.936505297324346e-06, + "loss": 0.1211, + "step": 445 + }, + { + "epoch": 0.32, + "grad_norm": 20.781251334768122, + "learning_rate": 9.936045342412917e-06, + "loss": 0.1482, + "step": 446 + }, + { + "epoch": 0.32, + "grad_norm": 21.21248632778865, + "learning_rate": 9.93558373827699e-06, + "loss": 0.1327, + "step": 447 + }, + { + "epoch": 0.32, + "grad_norm": 12.707782785702369, + "learning_rate": 9.935120485070799e-06, + "loss": 0.1494, + "step": 448 + }, + { + "epoch": 0.32, + "grad_norm": 9.673210776385321, + "learning_rate": 9.934655582949123e-06, + "loss": 0.0964, + "step": 449 + }, + { + "epoch": 0.32, + "grad_norm": 9.888011994441523, + "learning_rate": 9.934189032067296e-06, + "loss": 0.1555, + "step": 450 + }, + { + "epoch": 0.32, + "grad_norm": 32.12516155863512, + "learning_rate": 9.933720832581197e-06, + "loss": 0.1355, + "step": 451 + }, + { + "epoch": 0.32, + "grad_norm": 22.322386591262855, + "learning_rate": 9.933250984647266e-06, + "loss": 0.1368, + "step": 452 + }, + { + "epoch": 0.32, + "grad_norm": 15.223180132034958, + "learning_rate": 9.932779488422484e-06, + "loss": 0.1383, + "step": 453 + }, + { + "epoch": 0.32, + "grad_norm": 52.31779619507051, + "learning_rate": 9.93230634406439e-06, + "loss": 0.1522, + "step": 454 + }, + { + "epoch": 0.32, + "grad_norm": 29.017829224877428, + "learning_rate": 9.931831551731067e-06, + "loss": 0.1445, + "step": 455 + }, + { + "epoch": 0.33, + "grad_norm": 18.997292779869753, + "learning_rate": 9.931355111581154e-06, + "loss": 0.1554, + "step": 456 + }, + { + "epoch": 0.33, + "grad_norm": 34.735932438290625, + "learning_rate": 9.930877023773837e-06, + "loss": 0.134, + "step": 457 + }, + { + "epoch": 0.33, + "grad_norm": 45.54908316077474, + "learning_rate": 9.930397288468853e-06, + "loss": 0.1663, + "step": 458 + }, + { + "epoch": 0.33, + "grad_norm": 9.033708570010118, + "learning_rate": 9.929915905826494e-06, + "loss": 0.1084, + "step": 459 + }, + { + "epoch": 0.33, + "grad_norm": 4.323441132876292, + "learning_rate": 9.9294328760076e-06, + "loss": 0.1165, + "step": 460 + }, + { + "epoch": 0.33, + "grad_norm": 47.919864734101694, + "learning_rate": 9.928948199173552e-06, + "loss": 0.1626, + "step": 461 + }, + { + "epoch": 0.33, + "grad_norm": 4.461706134664676, + "learning_rate": 9.928461875486297e-06, + "loss": 0.1068, + "step": 462 + }, + { + "epoch": 0.33, + "grad_norm": 15.03570267608453, + "learning_rate": 9.927973905108323e-06, + "loss": 0.1066, + "step": 463 + }, + { + "epoch": 0.33, + "grad_norm": 29.02078498555869, + "learning_rate": 9.927484288202671e-06, + "loss": 0.1425, + "step": 464 + }, + { + "epoch": 0.33, + "grad_norm": 25.201159295770108, + "learning_rate": 9.926993024932929e-06, + "loss": 0.1377, + "step": 465 + }, + { + "epoch": 0.33, + "grad_norm": 5.395550286499605, + "learning_rate": 9.926500115463238e-06, + "loss": 0.1176, + "step": 466 + }, + { + "epoch": 0.33, + "grad_norm": 11.158696809668983, + "learning_rate": 9.926005559958287e-06, + "loss": 0.1361, + "step": 467 + }, + { + "epoch": 0.33, + "grad_norm": 25.515101194445457, + "learning_rate": 9.925509358583319e-06, + "loss": 0.1162, + "step": 468 + }, + { + "epoch": 0.33, + "grad_norm": 18.28360522188909, + "learning_rate": 9.92501151150412e-06, + "loss": 0.1367, + "step": 469 + }, + { + "epoch": 0.34, + "grad_norm": 31.778596429481162, + "learning_rate": 9.924512018887036e-06, + "loss": 0.1229, + "step": 470 + }, + { + "epoch": 0.34, + "grad_norm": 13.711321378139074, + "learning_rate": 9.924010880898952e-06, + "loss": 0.1389, + "step": 471 + }, + { + "epoch": 0.34, + "grad_norm": 5.3412870602780185, + "learning_rate": 9.923508097707306e-06, + "loss": 0.1394, + "step": 472 + }, + { + "epoch": 0.34, + "grad_norm": 39.662327865375396, + "learning_rate": 9.923003669480094e-06, + "loss": 0.187, + "step": 473 + }, + { + "epoch": 0.34, + "grad_norm": 6.585151391933716, + "learning_rate": 9.922497596385848e-06, + "loss": 0.1266, + "step": 474 + }, + { + "epoch": 0.34, + "grad_norm": 25.903061542242025, + "learning_rate": 9.92198987859366e-06, + "loss": 0.0936, + "step": 475 + }, + { + "epoch": 0.34, + "grad_norm": 3.982867596376823, + "learning_rate": 9.921480516273168e-06, + "loss": 0.1238, + "step": 476 + }, + { + "epoch": 0.34, + "grad_norm": 13.564020531973936, + "learning_rate": 9.920969509594558e-06, + "loss": 0.126, + "step": 477 + }, + { + "epoch": 0.34, + "grad_norm": 10.136096874644808, + "learning_rate": 9.920456858728567e-06, + "loss": 0.1329, + "step": 478 + }, + { + "epoch": 0.34, + "grad_norm": 4.415457519145033, + "learning_rate": 9.919942563846482e-06, + "loss": 0.1044, + "step": 479 + }, + { + "epoch": 0.34, + "grad_norm": 6.035743042678794, + "learning_rate": 9.919426625120137e-06, + "loss": 0.1689, + "step": 480 + }, + { + "epoch": 0.34, + "grad_norm": 10.45092970947201, + "learning_rate": 9.918909042721918e-06, + "loss": 0.136, + "step": 481 + }, + { + "epoch": 0.34, + "grad_norm": 4.68056597961224, + "learning_rate": 9.918389816824759e-06, + "loss": 0.1423, + "step": 482 + }, + { + "epoch": 0.34, + "grad_norm": 20.55146614949313, + "learning_rate": 9.917868947602144e-06, + "loss": 0.1532, + "step": 483 + }, + { + "epoch": 0.35, + "grad_norm": 34.77111147030742, + "learning_rate": 9.917346435228102e-06, + "loss": 0.1746, + "step": 484 + }, + { + "epoch": 0.35, + "grad_norm": 5.0196084233144695, + "learning_rate": 9.916822279877217e-06, + "loss": 0.1279, + "step": 485 + }, + { + "epoch": 0.35, + "grad_norm": 3.941526827167951, + "learning_rate": 9.91629648172462e-06, + "loss": 0.1146, + "step": 486 + }, + { + "epoch": 0.35, + "grad_norm": 12.890997507039701, + "learning_rate": 9.915769040945984e-06, + "loss": 0.1028, + "step": 487 + }, + { + "epoch": 0.35, + "grad_norm": 27.88496903893723, + "learning_rate": 9.915239957717542e-06, + "loss": 0.1274, + "step": 488 + }, + { + "epoch": 0.35, + "grad_norm": 9.00778567508186, + "learning_rate": 9.91470923221607e-06, + "loss": 0.1411, + "step": 489 + }, + { + "epoch": 0.35, + "grad_norm": 23.484883782999354, + "learning_rate": 9.914176864618891e-06, + "loss": 0.1384, + "step": 490 + }, + { + "epoch": 0.35, + "grad_norm": 4.929422863937659, + "learning_rate": 9.913642855103881e-06, + "loss": 0.0734, + "step": 491 + }, + { + "epoch": 0.35, + "grad_norm": 5.36947027897346, + "learning_rate": 9.913107203849464e-06, + "loss": 0.1497, + "step": 492 + }, + { + "epoch": 0.35, + "grad_norm": 24.750798415291204, + "learning_rate": 9.912569911034607e-06, + "loss": 0.1672, + "step": 493 + }, + { + "epoch": 0.35, + "grad_norm": 6.881614535806723, + "learning_rate": 9.912030976838832e-06, + "loss": 0.1456, + "step": 494 + }, + { + "epoch": 0.35, + "grad_norm": 33.54932127569477, + "learning_rate": 9.911490401442205e-06, + "loss": 0.1489, + "step": 495 + }, + { + "epoch": 0.35, + "grad_norm": 20.00784172694422, + "learning_rate": 9.910948185025345e-06, + "loss": 0.1536, + "step": 496 + }, + { + "epoch": 0.35, + "grad_norm": 5.238510328306569, + "learning_rate": 9.910404327769414e-06, + "loss": 0.1207, + "step": 497 + }, + { + "epoch": 0.36, + "grad_norm": 30.054761593942306, + "learning_rate": 9.909858829856127e-06, + "loss": 0.1364, + "step": 498 + }, + { + "epoch": 0.36, + "grad_norm": 7.592134090075237, + "learning_rate": 9.909311691467744e-06, + "loss": 0.1006, + "step": 499 + }, + { + "epoch": 0.36, + "grad_norm": 5.936467444451623, + "learning_rate": 9.908762912787073e-06, + "loss": 0.1512, + "step": 500 + }, + { + "epoch": 0.36, + "eval_avg_AUC": 0.8148136168882592, + "eval_avg_Accuracy": 0.7284897214854111, + "eval_avg_Accuracy-right": 0.8730272596843616, + "eval_avg_Accuracy-wrong": 0.4764612235615192, + "eval_avg_Num questions with both labels": 523, + "eval_avg_Question-wise AUC": 0.6962655285457768, + "eval_last_AUC": 0.8285519781961206, + "eval_last_Accuracy": 0.7570457559681698, + "eval_last_Accuracy-right": 0.8302465110212599, + "eval_last_Accuracy-wrong": 0.6294064134637253, + "eval_last_Num questions with both labels": 523, + "eval_last_Question-wise AUC": 0.6999852156206944, + "eval_max_AUC": 0.7665118487150447, + "eval_max_Accuracy": 0.6521054376657824, + "eval_max_Accuracy-right": 0.9773053345506717, + "eval_max_Accuracy-wrong": 0.08505799408687742, + "eval_max_Num questions with both labels": 523, + "eval_max_Question-wise AUC": 0.6174539818202015, + "eval_min_AUC": 0.8243709198076318, + "eval_min_Accuracy": 0.7511604774535809, + "eval_min_Accuracy-right": 0.7600104343289422, + "eval_min_Accuracy-wrong": 0.735728906072322, + "eval_min_Num questions with both labels": 523, + "eval_min_Question-wise AUC": 0.6964298045694274, + "eval_prod_AUC": 0.8209843050796547, + "eval_prod_Accuracy": 0.6548408488063661, + "eval_prod_Accuracy-right": 0.49582626842311206, + "eval_prod_Accuracy-wrong": 0.9321128041846714, + "eval_prod_Num questions with both labels": 523, + "eval_prod_Question-wise AUC": 0.6870361522901595, + "eval_runtime": 248.2297, + "eval_samples_per_second": 97.2, + "eval_steps_per_second": 3.038, + "eval_sum_AUC": 0.6857568992387502, + "eval_sum_Accuracy": 0.6382211538461539, + "eval_sum_Accuracy-right": 0.9962827703143342, + "eval_sum_Accuracy-wrong": 0.013873095292244713, + "eval_sum_Num questions with both labels": 523, + "eval_sum_Question-wise AUC": 0.6648248225648956, + "step": 500 + }, + { + "epoch": 0.36, + "grad_norm": 7.521136475767055, + "learning_rate": 9.908212493997473e-06, + "loss": 0.1351, + "step": 501 + }, + { + "epoch": 0.36, + "grad_norm": 24.8215182959066, + "learning_rate": 9.90766043528285e-06, + "loss": 0.1265, + "step": 502 + }, + { + "epoch": 0.36, + "grad_norm": 20.262220801212766, + "learning_rate": 9.907106736827654e-06, + "loss": 0.1382, + "step": 503 + }, + { + "epoch": 0.36, + "grad_norm": 6.612067632756019, + "learning_rate": 9.906551398816886e-06, + "loss": 0.1176, + "step": 504 + }, + { + "epoch": 0.36, + "grad_norm": 10.684225952657133, + "learning_rate": 9.9059944214361e-06, + "loss": 0.1636, + "step": 505 + }, + { + "epoch": 0.36, + "grad_norm": 12.984066186445835, + "learning_rate": 9.905435804871387e-06, + "loss": 0.1366, + "step": 506 + }, + { + "epoch": 0.36, + "grad_norm": 24.763096461281147, + "learning_rate": 9.904875549309391e-06, + "loss": 0.1584, + "step": 507 + }, + { + "epoch": 0.36, + "grad_norm": 5.7311046700749975, + "learning_rate": 9.904313654937308e-06, + "loss": 0.1486, + "step": 508 + }, + { + "epoch": 0.36, + "grad_norm": 51.33110066179473, + "learning_rate": 9.903750121942873e-06, + "loss": 0.1875, + "step": 509 + }, + { + "epoch": 0.36, + "grad_norm": 12.408291940217502, + "learning_rate": 9.903184950514378e-06, + "loss": 0.1373, + "step": 510 + }, + { + "epoch": 0.36, + "grad_norm": 7.255602958809023, + "learning_rate": 9.90261814084065e-06, + "loss": 0.1256, + "step": 511 + }, + { + "epoch": 0.37, + "grad_norm": 17.357961967373914, + "learning_rate": 9.902049693111077e-06, + "loss": 0.1616, + "step": 512 + }, + { + "epoch": 0.37, + "grad_norm": 28.352859541701022, + "learning_rate": 9.901479607515587e-06, + "loss": 0.1322, + "step": 513 + }, + { + "epoch": 0.37, + "grad_norm": 18.349020721142626, + "learning_rate": 9.900907884244654e-06, + "loss": 0.1311, + "step": 514 + }, + { + "epoch": 0.37, + "grad_norm": 7.109150918544185, + "learning_rate": 9.900334523489303e-06, + "loss": 0.1604, + "step": 515 + }, + { + "epoch": 0.37, + "grad_norm": 29.842240993629268, + "learning_rate": 9.899759525441101e-06, + "loss": 0.1586, + "step": 516 + }, + { + "epoch": 0.37, + "grad_norm": 32.448571178210294, + "learning_rate": 9.899182890292171e-06, + "loss": 0.1516, + "step": 517 + }, + { + "epoch": 0.37, + "grad_norm": 37.541295217424135, + "learning_rate": 9.898604618235175e-06, + "loss": 0.1541, + "step": 518 + }, + { + "epoch": 0.37, + "grad_norm": 59.74203765524549, + "learning_rate": 9.898024709463322e-06, + "loss": 0.1914, + "step": 519 + }, + { + "epoch": 0.37, + "grad_norm": 51.553681345435216, + "learning_rate": 9.897443164170375e-06, + "loss": 0.1547, + "step": 520 + }, + { + "epoch": 0.37, + "grad_norm": 17.65120220241073, + "learning_rate": 9.896859982550636e-06, + "loss": 0.1357, + "step": 521 + }, + { + "epoch": 0.37, + "grad_norm": 4.5369157168625405, + "learning_rate": 9.89627516479896e-06, + "loss": 0.1232, + "step": 522 + }, + { + "epoch": 0.37, + "grad_norm": 28.586892727146566, + "learning_rate": 9.895688711110739e-06, + "loss": 0.1234, + "step": 523 + }, + { + "epoch": 0.37, + "grad_norm": 4.714799632443614, + "learning_rate": 9.895100621681923e-06, + "loss": 0.1794, + "step": 524 + }, + { + "epoch": 0.37, + "grad_norm": 10.894818046361022, + "learning_rate": 9.894510896709003e-06, + "loss": 0.1145, + "step": 525 + }, + { + "epoch": 0.38, + "grad_norm": 11.020267897450193, + "learning_rate": 9.893919536389017e-06, + "loss": 0.1331, + "step": 526 + }, + { + "epoch": 0.38, + "grad_norm": 6.603140469816235, + "learning_rate": 9.89332654091955e-06, + "loss": 0.1317, + "step": 527 + }, + { + "epoch": 0.38, + "grad_norm": 5.317710681474739, + "learning_rate": 9.892731910498731e-06, + "loss": 0.134, + "step": 528 + }, + { + "epoch": 0.38, + "grad_norm": 18.48919116376951, + "learning_rate": 9.892135645325238e-06, + "loss": 0.127, + "step": 529 + }, + { + "epoch": 0.38, + "grad_norm": 15.110852007766283, + "learning_rate": 9.891537745598293e-06, + "loss": 0.1333, + "step": 530 + }, + { + "epoch": 0.38, + "grad_norm": 25.48093356108608, + "learning_rate": 9.89093821151767e-06, + "loss": 0.1425, + "step": 531 + }, + { + "epoch": 0.38, + "grad_norm": 14.211704092158119, + "learning_rate": 9.89033704328368e-06, + "loss": 0.1261, + "step": 532 + }, + { + "epoch": 0.38, + "grad_norm": 12.49842219441739, + "learning_rate": 9.889734241097186e-06, + "loss": 0.1227, + "step": 533 + }, + { + "epoch": 0.38, + "grad_norm": 6.538899092333007, + "learning_rate": 9.889129805159595e-06, + "loss": 0.1333, + "step": 534 + }, + { + "epoch": 0.38, + "grad_norm": 11.229548690859028, + "learning_rate": 9.888523735672861e-06, + "loss": 0.1207, + "step": 535 + }, + { + "epoch": 0.38, + "grad_norm": 14.706186292987425, + "learning_rate": 9.887916032839482e-06, + "loss": 0.1376, + "step": 536 + }, + { + "epoch": 0.38, + "grad_norm": 15.889794280637537, + "learning_rate": 9.887306696862504e-06, + "loss": 0.1122, + "step": 537 + }, + { + "epoch": 0.38, + "grad_norm": 9.453535010562696, + "learning_rate": 9.886695727945515e-06, + "loss": 0.131, + "step": 538 + }, + { + "epoch": 0.38, + "grad_norm": 11.612128252341902, + "learning_rate": 9.886083126292655e-06, + "loss": 0.1492, + "step": 539 + }, + { + "epoch": 0.39, + "grad_norm": 7.075171549803063, + "learning_rate": 9.885468892108603e-06, + "loss": 0.1272, + "step": 540 + }, + { + "epoch": 0.39, + "grad_norm": 26.914960079687482, + "learning_rate": 9.884853025598587e-06, + "loss": 0.1514, + "step": 541 + }, + { + "epoch": 0.39, + "grad_norm": 43.779863637748186, + "learning_rate": 9.884235526968377e-06, + "loss": 0.1256, + "step": 542 + }, + { + "epoch": 0.39, + "grad_norm": 37.73960304494291, + "learning_rate": 9.883616396424294e-06, + "loss": 0.1553, + "step": 543 + }, + { + "epoch": 0.39, + "grad_norm": 12.454022843924934, + "learning_rate": 9.8829956341732e-06, + "loss": 0.1204, + "step": 544 + }, + { + "epoch": 0.39, + "grad_norm": 43.19321310165527, + "learning_rate": 9.882373240422503e-06, + "loss": 0.1389, + "step": 545 + }, + { + "epoch": 0.39, + "grad_norm": 48.79232289830731, + "learning_rate": 9.881749215380156e-06, + "loss": 0.13, + "step": 546 + }, + { + "epoch": 0.39, + "grad_norm": 18.76530769424551, + "learning_rate": 9.881123559254658e-06, + "loss": 0.1158, + "step": 547 + }, + { + "epoch": 0.39, + "grad_norm": 62.57220924147817, + "learning_rate": 9.880496272255053e-06, + "loss": 0.158, + "step": 548 + }, + { + "epoch": 0.39, + "grad_norm": 12.500417488061052, + "learning_rate": 9.879867354590926e-06, + "loss": 0.1196, + "step": 549 + }, + { + "epoch": 0.39, + "grad_norm": 15.875141788148147, + "learning_rate": 9.879236806472414e-06, + "loss": 0.1333, + "step": 550 + }, + { + "epoch": 0.39, + "grad_norm": 7.301629680806376, + "learning_rate": 9.878604628110194e-06, + "loss": 0.1799, + "step": 551 + }, + { + "epoch": 0.39, + "grad_norm": 6.264465519965361, + "learning_rate": 9.877970819715485e-06, + "loss": 0.1427, + "step": 552 + }, + { + "epoch": 0.39, + "grad_norm": 30.949313988770307, + "learning_rate": 9.87733538150006e-06, + "loss": 0.142, + "step": 553 + }, + { + "epoch": 0.4, + "grad_norm": 6.380549224768007, + "learning_rate": 9.876698313676225e-06, + "loss": 0.1191, + "step": 554 + }, + { + "epoch": 0.4, + "grad_norm": 10.66859233407463, + "learning_rate": 9.876059616456842e-06, + "loss": 0.1405, + "step": 555 + }, + { + "epoch": 0.4, + "grad_norm": 20.946002565023058, + "learning_rate": 9.875419290055305e-06, + "loss": 0.1102, + "step": 556 + }, + { + "epoch": 0.4, + "grad_norm": 30.897928782214787, + "learning_rate": 9.874777334685565e-06, + "loss": 0.1711, + "step": 557 + }, + { + "epoch": 0.4, + "grad_norm": 13.033106438202273, + "learning_rate": 9.874133750562108e-06, + "loss": 0.1622, + "step": 558 + }, + { + "epoch": 0.4, + "grad_norm": 40.36685994127496, + "learning_rate": 9.873488537899967e-06, + "loss": 0.2061, + "step": 559 + }, + { + "epoch": 0.4, + "grad_norm": 19.50274995202192, + "learning_rate": 9.872841696914721e-06, + "loss": 0.1294, + "step": 560 + }, + { + "epoch": 0.4, + "grad_norm": 11.932353294948314, + "learning_rate": 9.872193227822492e-06, + "loss": 0.1265, + "step": 561 + }, + { + "epoch": 0.4, + "grad_norm": 20.43294471193903, + "learning_rate": 9.871543130839944e-06, + "loss": 0.1456, + "step": 562 + }, + { + "epoch": 0.4, + "grad_norm": 42.946806904632204, + "learning_rate": 9.870891406184288e-06, + "loss": 0.1626, + "step": 563 + }, + { + "epoch": 0.4, + "grad_norm": 33.149765129877956, + "learning_rate": 9.870238054073275e-06, + "loss": 0.1593, + "step": 564 + }, + { + "epoch": 0.4, + "grad_norm": 5.929277127548148, + "learning_rate": 9.869583074725206e-06, + "loss": 0.1477, + "step": 565 + }, + { + "epoch": 0.4, + "grad_norm": 39.416395744901585, + "learning_rate": 9.868926468358919e-06, + "loss": 0.1432, + "step": 566 + }, + { + "epoch": 0.4, + "grad_norm": 43.22729095180558, + "learning_rate": 9.868268235193796e-06, + "loss": 0.1406, + "step": 567 + }, + { + "epoch": 0.41, + "grad_norm": 15.377542148018398, + "learning_rate": 9.867608375449772e-06, + "loss": 0.1235, + "step": 568 + }, + { + "epoch": 0.41, + "grad_norm": 23.10191212472918, + "learning_rate": 9.866946889347311e-06, + "loss": 0.127, + "step": 569 + }, + { + "epoch": 0.41, + "grad_norm": 32.3797801287966, + "learning_rate": 9.866283777107432e-06, + "loss": 0.1323, + "step": 570 + }, + { + "epoch": 0.41, + "grad_norm": 6.095686964503419, + "learning_rate": 9.865619038951692e-06, + "loss": 0.1375, + "step": 571 + }, + { + "epoch": 0.41, + "grad_norm": 9.775807625906909, + "learning_rate": 9.864952675102193e-06, + "loss": 0.1379, + "step": 572 + }, + { + "epoch": 0.41, + "grad_norm": 6.286767820247743, + "learning_rate": 9.864284685781578e-06, + "loss": 0.1425, + "step": 573 + }, + { + "epoch": 0.41, + "grad_norm": 6.147344431329742, + "learning_rate": 9.863615071213036e-06, + "loss": 0.1304, + "step": 574 + }, + { + "epoch": 0.41, + "grad_norm": 28.412228671248563, + "learning_rate": 9.862943831620298e-06, + "loss": 0.1273, + "step": 575 + }, + { + "epoch": 0.41, + "grad_norm": 30.96935582463328, + "learning_rate": 9.862270967227636e-06, + "loss": 0.1459, + "step": 576 + }, + { + "epoch": 0.41, + "grad_norm": 4.987197903594031, + "learning_rate": 9.861596478259869e-06, + "loss": 0.139, + "step": 577 + }, + { + "epoch": 0.41, + "grad_norm": 63.33976410883062, + "learning_rate": 9.860920364942353e-06, + "loss": 0.1904, + "step": 578 + }, + { + "epoch": 0.41, + "grad_norm": 4.66543675696079, + "learning_rate": 9.860242627500994e-06, + "loss": 0.1125, + "step": 579 + }, + { + "epoch": 0.41, + "grad_norm": 11.0806252525076, + "learning_rate": 9.859563266162231e-06, + "loss": 0.1321, + "step": 580 + }, + { + "epoch": 0.41, + "grad_norm": 5.080418185339296, + "learning_rate": 9.858882281153058e-06, + "loss": 0.1157, + "step": 581 + }, + { + "epoch": 0.42, + "grad_norm": 23.03682096474836, + "learning_rate": 9.858199672701e-06, + "loss": 0.1392, + "step": 582 + }, + { + "epoch": 0.42, + "grad_norm": 31.28197071942006, + "learning_rate": 9.85751544103413e-06, + "loss": 0.1129, + "step": 583 + }, + { + "epoch": 0.42, + "grad_norm": 9.68760436913685, + "learning_rate": 9.856829586381065e-06, + "loss": 0.1071, + "step": 584 + }, + { + "epoch": 0.42, + "grad_norm": 73.28546422256532, + "learning_rate": 9.856142108970958e-06, + "loss": 0.1958, + "step": 585 + }, + { + "epoch": 0.42, + "grad_norm": 18.720122616928183, + "learning_rate": 9.855453009033512e-06, + "loss": 0.1326, + "step": 586 + }, + { + "epoch": 0.42, + "grad_norm": 15.463997560094482, + "learning_rate": 9.854762286798965e-06, + "loss": 0.1453, + "step": 587 + }, + { + "epoch": 0.42, + "grad_norm": 7.391237164066708, + "learning_rate": 9.854069942498102e-06, + "loss": 0.1965, + "step": 588 + }, + { + "epoch": 0.42, + "grad_norm": 8.599884051273923, + "learning_rate": 9.853375976362245e-06, + "loss": 0.1635, + "step": 589 + }, + { + "epoch": 0.42, + "grad_norm": 4.441491117087743, + "learning_rate": 9.852680388623266e-06, + "loss": 0.1158, + "step": 590 + }, + { + "epoch": 0.42, + "grad_norm": 34.51436948860912, + "learning_rate": 9.85198317951357e-06, + "loss": 0.1531, + "step": 591 + }, + { + "epoch": 0.42, + "grad_norm": 17.99454842623697, + "learning_rate": 9.851284349266107e-06, + "loss": 0.1305, + "step": 592 + }, + { + "epoch": 0.42, + "grad_norm": 16.123249191913267, + "learning_rate": 9.850583898114372e-06, + "loss": 0.1575, + "step": 593 + }, + { + "epoch": 0.42, + "grad_norm": 48.69893321264437, + "learning_rate": 9.849881826292399e-06, + "loss": 0.1558, + "step": 594 + }, + { + "epoch": 0.42, + "grad_norm": 34.46486038497999, + "learning_rate": 9.84917813403476e-06, + "loss": 0.1367, + "step": 595 + }, + { + "epoch": 0.43, + "grad_norm": 9.597468969380602, + "learning_rate": 9.848472821576572e-06, + "loss": 0.1439, + "step": 596 + }, + { + "epoch": 0.43, + "grad_norm": 52.9719490703774, + "learning_rate": 9.847765889153497e-06, + "loss": 0.1819, + "step": 597 + }, + { + "epoch": 0.43, + "grad_norm": 36.233987937356446, + "learning_rate": 9.847057337001731e-06, + "loss": 0.1594, + "step": 598 + }, + { + "epoch": 0.43, + "grad_norm": 20.295048431141648, + "learning_rate": 9.846347165358014e-06, + "loss": 0.1284, + "step": 599 + }, + { + "epoch": 0.43, + "grad_norm": 42.84524320177862, + "learning_rate": 9.84563537445963e-06, + "loss": 0.1504, + "step": 600 + }, + { + "epoch": 0.43, + "grad_norm": 62.1195121067531, + "learning_rate": 9.844921964544398e-06, + "loss": 0.1758, + "step": 601 + }, + { + "epoch": 0.43, + "grad_norm": 47.2653880369296, + "learning_rate": 9.844206935850687e-06, + "loss": 0.1831, + "step": 602 + }, + { + "epoch": 0.43, + "grad_norm": 29.4931297961525, + "learning_rate": 9.843490288617397e-06, + "loss": 0.1008, + "step": 603 + }, + { + "epoch": 0.43, + "grad_norm": 78.26510483270128, + "learning_rate": 9.842772023083972e-06, + "loss": 0.1829, + "step": 604 + }, + { + "epoch": 0.43, + "grad_norm": 72.53305356196587, + "learning_rate": 9.842052139490403e-06, + "loss": 0.2009, + "step": 605 + }, + { + "epoch": 0.43, + "grad_norm": 23.74689339670262, + "learning_rate": 9.841330638077213e-06, + "loss": 0.139, + "step": 606 + }, + { + "epoch": 0.43, + "grad_norm": 58.871630884771356, + "learning_rate": 9.840607519085467e-06, + "loss": 0.1533, + "step": 607 + }, + { + "epoch": 0.43, + "grad_norm": 60.04727742225944, + "learning_rate": 9.839882782756778e-06, + "loss": 0.1533, + "step": 608 + }, + { + "epoch": 0.43, + "grad_norm": 34.76813691419715, + "learning_rate": 9.839156429333291e-06, + "loss": 0.1517, + "step": 609 + }, + { + "epoch": 0.44, + "grad_norm": 39.487093806576866, + "learning_rate": 9.838428459057694e-06, + "loss": 0.1306, + "step": 610 + }, + { + "epoch": 0.44, + "grad_norm": 71.43206314131315, + "learning_rate": 9.837698872173214e-06, + "loss": 0.166, + "step": 611 + }, + { + "epoch": 0.44, + "grad_norm": 49.09265681558195, + "learning_rate": 9.836967668923623e-06, + "loss": 0.1689, + "step": 612 + }, + { + "epoch": 0.44, + "grad_norm": 6.462674294507499, + "learning_rate": 9.836234849553228e-06, + "loss": 0.1088, + "step": 613 + }, + { + "epoch": 0.44, + "grad_norm": 70.19656687365818, + "learning_rate": 9.835500414306875e-06, + "loss": 0.2119, + "step": 614 + }, + { + "epoch": 0.44, + "grad_norm": 62.173880695304916, + "learning_rate": 9.834764363429956e-06, + "loss": 0.1681, + "step": 615 + }, + { + "epoch": 0.44, + "grad_norm": 43.48786940185642, + "learning_rate": 9.8340266971684e-06, + "loss": 0.172, + "step": 616 + }, + { + "epoch": 0.44, + "grad_norm": 26.124867896320307, + "learning_rate": 9.83328741576867e-06, + "loss": 0.1342, + "step": 617 + }, + { + "epoch": 0.44, + "grad_norm": 47.911183471370855, + "learning_rate": 9.832546519477778e-06, + "loss": 0.179, + "step": 618 + }, + { + "epoch": 0.44, + "grad_norm": 89.25182761050898, + "learning_rate": 9.831804008543271e-06, + "loss": 0.2285, + "step": 619 + }, + { + "epoch": 0.44, + "grad_norm": 23.042941448066955, + "learning_rate": 9.831059883213234e-06, + "loss": 0.1616, + "step": 620 + }, + { + "epoch": 0.44, + "grad_norm": 58.09372311927322, + "learning_rate": 9.830314143736292e-06, + "loss": 0.1641, + "step": 621 + }, + { + "epoch": 0.44, + "grad_norm": 28.97753294613849, + "learning_rate": 9.829566790361615e-06, + "loss": 0.1344, + "step": 622 + }, + { + "epoch": 0.44, + "grad_norm": 49.28980917332516, + "learning_rate": 9.828817823338903e-06, + "loss": 0.1614, + "step": 623 + }, + { + "epoch": 0.45, + "grad_norm": 32.04413626104762, + "learning_rate": 9.828067242918402e-06, + "loss": 0.1404, + "step": 624 + }, + { + "epoch": 0.45, + "grad_norm": 4.78476590401764, + "learning_rate": 9.827315049350895e-06, + "loss": 0.1169, + "step": 625 + }, + { + "epoch": 0.45, + "grad_norm": 81.63675673285175, + "learning_rate": 9.826561242887704e-06, + "loss": 0.1953, + "step": 626 + }, + { + "epoch": 0.45, + "grad_norm": 48.11606730737452, + "learning_rate": 9.825805823780687e-06, + "loss": 0.1619, + "step": 627 + }, + { + "epoch": 0.45, + "grad_norm": 3.907109159794875, + "learning_rate": 9.825048792282247e-06, + "loss": 0.131, + "step": 628 + }, + { + "epoch": 0.45, + "grad_norm": 11.103553566829753, + "learning_rate": 9.824290148645322e-06, + "loss": 0.1139, + "step": 629 + }, + { + "epoch": 0.45, + "grad_norm": 51.12278081641119, + "learning_rate": 9.823529893123384e-06, + "loss": 0.1527, + "step": 630 + }, + { + "epoch": 0.45, + "grad_norm": 64.29064695958792, + "learning_rate": 9.822768025970456e-06, + "loss": 0.1838, + "step": 631 + }, + { + "epoch": 0.45, + "grad_norm": 22.424660906876873, + "learning_rate": 9.822004547441088e-06, + "loss": 0.1398, + "step": 632 + }, + { + "epoch": 0.45, + "grad_norm": 5.706055561297325, + "learning_rate": 9.821239457790373e-06, + "loss": 0.1428, + "step": 633 + }, + { + "epoch": 0.45, + "grad_norm": 59.7429449871699, + "learning_rate": 9.82047275727394e-06, + "loss": 0.176, + "step": 634 + }, + { + "epoch": 0.45, + "grad_norm": 67.77135894876831, + "learning_rate": 9.81970444614796e-06, + "loss": 0.1714, + "step": 635 + }, + { + "epoch": 0.45, + "grad_norm": 10.668865071226387, + "learning_rate": 9.81893452466914e-06, + "loss": 0.1267, + "step": 636 + }, + { + "epoch": 0.45, + "grad_norm": 44.76001772659544, + "learning_rate": 9.818162993094724e-06, + "loss": 0.1423, + "step": 637 + }, + { + "epoch": 0.46, + "grad_norm": 53.39779453147049, + "learning_rate": 9.817389851682494e-06, + "loss": 0.1842, + "step": 638 + }, + { + "epoch": 0.46, + "grad_norm": 24.28617527330543, + "learning_rate": 9.816615100690773e-06, + "loss": 0.1235, + "step": 639 + }, + { + "epoch": 0.46, + "grad_norm": 19.154914114241624, + "learning_rate": 9.81583874037842e-06, + "loss": 0.1217, + "step": 640 + }, + { + "epoch": 0.46, + "grad_norm": 10.613120002801404, + "learning_rate": 9.815060771004831e-06, + "loss": 0.1311, + "step": 641 + }, + { + "epoch": 0.46, + "grad_norm": 27.240227333671324, + "learning_rate": 9.81428119282994e-06, + "loss": 0.1553, + "step": 642 + }, + { + "epoch": 0.46, + "grad_norm": 21.0473186526304, + "learning_rate": 9.813500006114216e-06, + "loss": 0.1239, + "step": 643 + }, + { + "epoch": 0.46, + "grad_norm": 15.104115765634024, + "learning_rate": 9.812717211118673e-06, + "loss": 0.1184, + "step": 644 + }, + { + "epoch": 0.46, + "grad_norm": 12.249235905236901, + "learning_rate": 9.811932808104852e-06, + "loss": 0.1505, + "step": 645 + }, + { + "epoch": 0.46, + "grad_norm": 5.4945638837559105, + "learning_rate": 9.811146797334838e-06, + "loss": 0.1547, + "step": 646 + }, + { + "epoch": 0.46, + "grad_norm": 15.362754513773162, + "learning_rate": 9.810359179071255e-06, + "loss": 0.1442, + "step": 647 + }, + { + "epoch": 0.46, + "grad_norm": 19.255632442668595, + "learning_rate": 9.809569953577258e-06, + "loss": 0.0961, + "step": 648 + }, + { + "epoch": 0.46, + "grad_norm": 8.391685162385883, + "learning_rate": 9.808779121116542e-06, + "loss": 0.1421, + "step": 649 + }, + { + "epoch": 0.46, + "grad_norm": 15.810940924831158, + "learning_rate": 9.807986681953341e-06, + "loss": 0.1198, + "step": 650 + }, + { + "epoch": 0.46, + "grad_norm": 9.376506844728723, + "learning_rate": 9.807192636352422e-06, + "loss": 0.1383, + "step": 651 + }, + { + "epoch": 0.47, + "grad_norm": 17.64775555566848, + "learning_rate": 9.80639698457909e-06, + "loss": 0.108, + "step": 652 + }, + { + "epoch": 0.47, + "grad_norm": 19.559096284384445, + "learning_rate": 9.805599726899188e-06, + "loss": 0.1316, + "step": 653 + }, + { + "epoch": 0.47, + "grad_norm": 9.461717264545886, + "learning_rate": 9.804800863579094e-06, + "loss": 0.1169, + "step": 654 + }, + { + "epoch": 0.47, + "grad_norm": 20.446164874079532, + "learning_rate": 9.804000394885723e-06, + "loss": 0.13, + "step": 655 + }, + { + "epoch": 0.47, + "grad_norm": 51.40477529762973, + "learning_rate": 9.803198321086527e-06, + "loss": 0.2056, + "step": 656 + }, + { + "epoch": 0.47, + "grad_norm": 7.068665322892394, + "learning_rate": 9.802394642449494e-06, + "loss": 0.1346, + "step": 657 + }, + { + "epoch": 0.47, + "grad_norm": 10.538054958638275, + "learning_rate": 9.801589359243147e-06, + "loss": 0.1362, + "step": 658 + }, + { + "epoch": 0.47, + "grad_norm": 35.66887445030199, + "learning_rate": 9.800782471736547e-06, + "loss": 0.135, + "step": 659 + }, + { + "epoch": 0.47, + "grad_norm": 31.297567953445206, + "learning_rate": 9.799973980199288e-06, + "loss": 0.1344, + "step": 660 + }, + { + "epoch": 0.47, + "grad_norm": 22.996584888633382, + "learning_rate": 9.799163884901506e-06, + "loss": 0.1359, + "step": 661 + }, + { + "epoch": 0.47, + "grad_norm": 38.59277267402653, + "learning_rate": 9.798352186113867e-06, + "loss": 0.1353, + "step": 662 + }, + { + "epoch": 0.47, + "grad_norm": 43.973533584689015, + "learning_rate": 9.797538884107574e-06, + "loss": 0.1582, + "step": 663 + }, + { + "epoch": 0.47, + "grad_norm": 32.03919442027451, + "learning_rate": 9.796723979154366e-06, + "loss": 0.1212, + "step": 664 + }, + { + "epoch": 0.47, + "grad_norm": 5.076183766829495, + "learning_rate": 9.795907471526518e-06, + "loss": 0.1411, + "step": 665 + }, + { + "epoch": 0.48, + "grad_norm": 53.124967231457354, + "learning_rate": 9.79508936149684e-06, + "loss": 0.1311, + "step": 666 + }, + { + "epoch": 0.48, + "grad_norm": 39.74430808850389, + "learning_rate": 9.79426964933868e-06, + "loss": 0.1692, + "step": 667 + }, + { + "epoch": 0.48, + "grad_norm": 9.484329880517071, + "learning_rate": 9.793448335325919e-06, + "loss": 0.1332, + "step": 668 + }, + { + "epoch": 0.48, + "grad_norm": 13.266390261252008, + "learning_rate": 9.792625419732969e-06, + "loss": 0.121, + "step": 669 + }, + { + "epoch": 0.48, + "grad_norm": 22.195451826375955, + "learning_rate": 9.791800902834787e-06, + "loss": 0.0991, + "step": 670 + }, + { + "epoch": 0.48, + "grad_norm": 28.240346235800146, + "learning_rate": 9.790974784906855e-06, + "loss": 0.1233, + "step": 671 + }, + { + "epoch": 0.48, + "grad_norm": 5.289549119474038, + "learning_rate": 9.790147066225198e-06, + "loss": 0.1588, + "step": 672 + }, + { + "epoch": 0.48, + "grad_norm": 40.07083822351059, + "learning_rate": 9.789317747066369e-06, + "loss": 0.1315, + "step": 673 + }, + { + "epoch": 0.48, + "grad_norm": 17.30108556214769, + "learning_rate": 9.788486827707462e-06, + "loss": 0.1672, + "step": 674 + }, + { + "epoch": 0.48, + "grad_norm": 14.124723292613249, + "learning_rate": 9.7876543084261e-06, + "loss": 0.103, + "step": 675 + }, + { + "epoch": 0.48, + "grad_norm": 27.255940377336177, + "learning_rate": 9.786820189500443e-06, + "loss": 0.1493, + "step": 676 + }, + { + "epoch": 0.48, + "grad_norm": 32.00415533490613, + "learning_rate": 9.785984471209186e-06, + "loss": 0.1235, + "step": 677 + }, + { + "epoch": 0.48, + "grad_norm": 3.6327392186328873, + "learning_rate": 9.785147153831562e-06, + "loss": 0.1182, + "step": 678 + }, + { + "epoch": 0.48, + "grad_norm": 4.6193887433702985, + "learning_rate": 9.784308237647329e-06, + "loss": 0.1451, + "step": 679 + }, + { + "epoch": 0.49, + "grad_norm": 9.595692250193478, + "learning_rate": 9.783467722936786e-06, + "loss": 0.1777, + "step": 680 + }, + { + "epoch": 0.49, + "grad_norm": 37.73689869043474, + "learning_rate": 9.782625609980767e-06, + "loss": 0.1315, + "step": 681 + }, + { + "epoch": 0.49, + "grad_norm": 52.16154314059323, + "learning_rate": 9.781781899060635e-06, + "loss": 0.1628, + "step": 682 + }, + { + "epoch": 0.49, + "grad_norm": 4.752406905653005, + "learning_rate": 9.78093659045829e-06, + "loss": 0.1372, + "step": 683 + }, + { + "epoch": 0.49, + "grad_norm": 32.17012777708828, + "learning_rate": 9.780089684456164e-06, + "loss": 0.1354, + "step": 684 + }, + { + "epoch": 0.49, + "grad_norm": 30.835260778351316, + "learning_rate": 9.779241181337228e-06, + "loss": 0.1133, + "step": 685 + }, + { + "epoch": 0.49, + "grad_norm": 32.630866103987174, + "learning_rate": 9.778391081384979e-06, + "loss": 0.1271, + "step": 686 + }, + { + "epoch": 0.49, + "grad_norm": 23.691240269860604, + "learning_rate": 9.777539384883453e-06, + "loss": 0.1061, + "step": 687 + }, + { + "epoch": 0.49, + "grad_norm": 10.821205705393695, + "learning_rate": 9.776686092117216e-06, + "loss": 0.1611, + "step": 688 + }, + { + "epoch": 0.49, + "grad_norm": 28.128381007348157, + "learning_rate": 9.775831203371371e-06, + "loss": 0.1252, + "step": 689 + }, + { + "epoch": 0.49, + "grad_norm": 62.024712894378, + "learning_rate": 9.774974718931551e-06, + "loss": 0.2048, + "step": 690 + }, + { + "epoch": 0.49, + "grad_norm": 30.29155638874213, + "learning_rate": 9.774116639083923e-06, + "loss": 0.1371, + "step": 691 + }, + { + "epoch": 0.49, + "grad_norm": 17.014556677775154, + "learning_rate": 9.773256964115189e-06, + "loss": 0.0955, + "step": 692 + }, + { + "epoch": 0.49, + "grad_norm": 55.463045257964794, + "learning_rate": 9.772395694312583e-06, + "loss": 0.1831, + "step": 693 + }, + { + "epoch": 0.5, + "grad_norm": 24.022624545256317, + "learning_rate": 9.771532829963865e-06, + "loss": 0.1633, + "step": 694 + }, + { + "epoch": 0.5, + "grad_norm": 20.571501004321036, + "learning_rate": 9.770668371357344e-06, + "loss": 0.1271, + "step": 695 + }, + { + "epoch": 0.5, + "grad_norm": 30.701880427913235, + "learning_rate": 9.769802318781842e-06, + "loss": 0.1296, + "step": 696 + }, + { + "epoch": 0.5, + "grad_norm": 25.625148935174835, + "learning_rate": 9.76893467252673e-06, + "loss": 0.1271, + "step": 697 + }, + { + "epoch": 0.5, + "grad_norm": 5.240286750024976, + "learning_rate": 9.768065432881903e-06, + "loss": 0.1227, + "step": 698 + }, + { + "epoch": 0.5, + "grad_norm": 19.49975018445689, + "learning_rate": 9.767194600137789e-06, + "loss": 0.1124, + "step": 699 + }, + { + "epoch": 0.5, + "grad_norm": 7.416082012842279, + "learning_rate": 9.766322174585347e-06, + "loss": 0.1313, + "step": 700 + }, + { + "epoch": 0.5, + "grad_norm": 15.26643688822226, + "learning_rate": 9.765448156516077e-06, + "loss": 0.1049, + "step": 701 + }, + { + "epoch": 0.5, + "grad_norm": 23.46621543879915, + "learning_rate": 9.764572546222e-06, + "loss": 0.1229, + "step": 702 + }, + { + "epoch": 0.5, + "grad_norm": 20.60346076669723, + "learning_rate": 9.763695343995674e-06, + "loss": 0.1364, + "step": 703 + }, + { + "epoch": 0.5, + "grad_norm": 14.594095858724636, + "learning_rate": 9.762816550130192e-06, + "loss": 0.0992, + "step": 704 + }, + { + "epoch": 0.5, + "grad_norm": 21.631405632050786, + "learning_rate": 9.76193616491917e-06, + "loss": 0.1521, + "step": 705 + }, + { + "epoch": 0.5, + "grad_norm": 8.107810318183903, + "learning_rate": 9.761054188656766e-06, + "loss": 0.1497, + "step": 706 + }, + { + "epoch": 0.5, + "grad_norm": 13.5375760091303, + "learning_rate": 9.760170621637661e-06, + "loss": 0.1255, + "step": 707 + }, + { + "epoch": 0.51, + "grad_norm": 10.731462678449901, + "learning_rate": 9.759285464157073e-06, + "loss": 0.1245, + "step": 708 + }, + { + "epoch": 0.51, + "grad_norm": 3.6574842453813687, + "learning_rate": 9.758398716510751e-06, + "loss": 0.1086, + "step": 709 + }, + { + "epoch": 0.51, + "grad_norm": 30.727691788456823, + "learning_rate": 9.75751037899497e-06, + "loss": 0.1281, + "step": 710 + }, + { + "epoch": 0.51, + "grad_norm": 27.202375481436885, + "learning_rate": 9.756620451906543e-06, + "loss": 0.1276, + "step": 711 + }, + { + "epoch": 0.51, + "grad_norm": 18.595561761908904, + "learning_rate": 9.75572893554281e-06, + "loss": 0.1384, + "step": 712 + }, + { + "epoch": 0.51, + "grad_norm": 14.144175158912619, + "learning_rate": 9.754835830201645e-06, + "loss": 0.1586, + "step": 713 + }, + { + "epoch": 0.51, + "grad_norm": 37.849196456539325, + "learning_rate": 9.753941136181448e-06, + "loss": 0.145, + "step": 714 + }, + { + "epoch": 0.51, + "grad_norm": 35.69633971283892, + "learning_rate": 9.753044853781155e-06, + "loss": 0.1268, + "step": 715 + }, + { + "epoch": 0.51, + "grad_norm": 7.152193957134935, + "learning_rate": 9.75214698330023e-06, + "loss": 0.1831, + "step": 716 + }, + { + "epoch": 0.51, + "grad_norm": 27.653182844039453, + "learning_rate": 9.751247525038669e-06, + "loss": 0.1306, + "step": 717 + }, + { + "epoch": 0.51, + "grad_norm": 27.593230490233363, + "learning_rate": 9.750346479296998e-06, + "loss": 0.1471, + "step": 718 + }, + { + "epoch": 0.51, + "grad_norm": 4.305679091229352, + "learning_rate": 9.74944384637627e-06, + "loss": 0.1173, + "step": 719 + }, + { + "epoch": 0.51, + "grad_norm": 27.29753578820685, + "learning_rate": 9.748539626578076e-06, + "loss": 0.1168, + "step": 720 + }, + { + "epoch": 0.51, + "grad_norm": 5.465815777296448, + "learning_rate": 9.747633820204527e-06, + "loss": 0.1176, + "step": 721 + }, + { + "epoch": 0.52, + "grad_norm": 12.977436523645396, + "learning_rate": 9.746726427558276e-06, + "loss": 0.1294, + "step": 722 + }, + { + "epoch": 0.52, + "grad_norm": 8.50325075384258, + "learning_rate": 9.745817448942496e-06, + "loss": 0.1541, + "step": 723 + }, + { + "epoch": 0.52, + "grad_norm": 25.299637458129375, + "learning_rate": 9.744906884660894e-06, + "loss": 0.146, + "step": 724 + }, + { + "epoch": 0.52, + "grad_norm": 29.392294563570957, + "learning_rate": 9.743994735017708e-06, + "loss": 0.1144, + "step": 725 + }, + { + "epoch": 0.52, + "grad_norm": 9.051587116954162, + "learning_rate": 9.743081000317703e-06, + "loss": 0.1433, + "step": 726 + }, + { + "epoch": 0.52, + "grad_norm": 38.587545048429, + "learning_rate": 9.742165680866173e-06, + "loss": 0.1388, + "step": 727 + }, + { + "epoch": 0.52, + "grad_norm": 23.864218562404943, + "learning_rate": 9.741248776968947e-06, + "loss": 0.1458, + "step": 728 + }, + { + "epoch": 0.52, + "grad_norm": 17.858968253120967, + "learning_rate": 9.740330288932379e-06, + "loss": 0.1136, + "step": 729 + }, + { + "epoch": 0.52, + "grad_norm": 5.168166530378955, + "learning_rate": 9.73941021706335e-06, + "loss": 0.1274, + "step": 730 + }, + { + "epoch": 0.52, + "grad_norm": 20.348246500543265, + "learning_rate": 9.738488561669272e-06, + "loss": 0.1316, + "step": 731 + }, + { + "epoch": 0.52, + "grad_norm": 31.053717210440414, + "learning_rate": 9.737565323058094e-06, + "loss": 0.1594, + "step": 732 + }, + { + "epoch": 0.52, + "grad_norm": 21.28339041051843, + "learning_rate": 9.736640501538281e-06, + "loss": 0.1228, + "step": 733 + }, + { + "epoch": 0.52, + "grad_norm": 15.286309070528286, + "learning_rate": 9.735714097418835e-06, + "loss": 0.1377, + "step": 734 + }, + { + "epoch": 0.52, + "grad_norm": 27.796625488655682, + "learning_rate": 9.734786111009287e-06, + "loss": 0.1254, + "step": 735 + }, + { + "epoch": 0.53, + "grad_norm": 7.5056897186412606, + "learning_rate": 9.73385654261969e-06, + "loss": 0.0974, + "step": 736 + }, + { + "epoch": 0.53, + "grad_norm": 4.791484316668564, + "learning_rate": 9.732925392560634e-06, + "loss": 0.131, + "step": 737 + }, + { + "epoch": 0.53, + "grad_norm": 14.411536482139912, + "learning_rate": 9.731992661143233e-06, + "loss": 0.1107, + "step": 738 + }, + { + "epoch": 0.53, + "grad_norm": 15.29447975137697, + "learning_rate": 9.731058348679128e-06, + "loss": 0.1244, + "step": 739 + }, + { + "epoch": 0.53, + "grad_norm": 26.753002422549987, + "learning_rate": 9.73012245548049e-06, + "loss": 0.1729, + "step": 740 + }, + { + "epoch": 0.53, + "grad_norm": 15.269880803641675, + "learning_rate": 9.729184981860023e-06, + "loss": 0.1367, + "step": 741 + }, + { + "epoch": 0.53, + "grad_norm": 23.286568445497416, + "learning_rate": 9.728245928130949e-06, + "loss": 0.1204, + "step": 742 + }, + { + "epoch": 0.53, + "grad_norm": 12.124695582884476, + "learning_rate": 9.727305294607024e-06, + "loss": 0.1174, + "step": 743 + }, + { + "epoch": 0.53, + "grad_norm": 5.340039718599179, + "learning_rate": 9.726363081602532e-06, + "loss": 0.1272, + "step": 744 + }, + { + "epoch": 0.53, + "grad_norm": 8.03978216102685, + "learning_rate": 9.725419289432287e-06, + "loss": 0.1349, + "step": 745 + }, + { + "epoch": 0.53, + "grad_norm": 12.900890675498434, + "learning_rate": 9.724473918411624e-06, + "loss": 0.1295, + "step": 746 + }, + { + "epoch": 0.53, + "grad_norm": 12.021350602091163, + "learning_rate": 9.723526968856408e-06, + "loss": 0.1057, + "step": 747 + }, + { + "epoch": 0.53, + "grad_norm": 10.989225561921891, + "learning_rate": 9.722578441083035e-06, + "loss": 0.0867, + "step": 748 + }, + { + "epoch": 0.53, + "grad_norm": 17.269981813172723, + "learning_rate": 9.721628335408423e-06, + "loss": 0.1116, + "step": 749 + }, + { + "epoch": 0.54, + "grad_norm": 30.291178256649264, + "learning_rate": 9.720676652150025e-06, + "loss": 0.1224, + "step": 750 + }, + { + "epoch": 0.54, + "grad_norm": 6.188545883869058, + "learning_rate": 9.719723391625813e-06, + "loss": 0.1566, + "step": 751 + }, + { + "epoch": 0.54, + "grad_norm": 11.183215999542815, + "learning_rate": 9.718768554154287e-06, + "loss": 0.1754, + "step": 752 + }, + { + "epoch": 0.54, + "grad_norm": 12.300039794356254, + "learning_rate": 9.717812140054479e-06, + "loss": 0.1091, + "step": 753 + }, + { + "epoch": 0.54, + "grad_norm": 43.092405088706, + "learning_rate": 9.716854149645945e-06, + "loss": 0.1327, + "step": 754 + }, + { + "epoch": 0.54, + "grad_norm": 14.989995368482804, + "learning_rate": 9.715894583248764e-06, + "loss": 0.1329, + "step": 755 + }, + { + "epoch": 0.54, + "grad_norm": 4.450637838006689, + "learning_rate": 9.714933441183549e-06, + "loss": 0.1046, + "step": 756 + }, + { + "epoch": 0.54, + "grad_norm": 4.370431682700567, + "learning_rate": 9.713970723771432e-06, + "loss": 0.094, + "step": 757 + }, + { + "epoch": 0.54, + "grad_norm": 5.439106294822273, + "learning_rate": 9.713006431334076e-06, + "loss": 0.1075, + "step": 758 + }, + { + "epoch": 0.54, + "grad_norm": 6.568446270079447, + "learning_rate": 9.71204056419367e-06, + "loss": 0.1646, + "step": 759 + }, + { + "epoch": 0.54, + "grad_norm": 12.719463921354748, + "learning_rate": 9.711073122672928e-06, + "loss": 0.1296, + "step": 760 + }, + { + "epoch": 0.54, + "grad_norm": 32.803026706716224, + "learning_rate": 9.71010410709509e-06, + "loss": 0.1307, + "step": 761 + }, + { + "epoch": 0.54, + "grad_norm": 9.96453423011384, + "learning_rate": 9.70913351778392e-06, + "loss": 0.1228, + "step": 762 + }, + { + "epoch": 0.54, + "grad_norm": 20.833929487014423, + "learning_rate": 9.708161355063714e-06, + "loss": 0.1479, + "step": 763 + }, + { + "epoch": 0.55, + "grad_norm": 39.33943975858833, + "learning_rate": 9.707187619259286e-06, + "loss": 0.123, + "step": 764 + }, + { + "epoch": 0.55, + "grad_norm": 7.376268195351014, + "learning_rate": 9.706212310695981e-06, + "loss": 0.125, + "step": 765 + }, + { + "epoch": 0.55, + "grad_norm": 9.440271953630097, + "learning_rate": 9.705235429699666e-06, + "loss": 0.1115, + "step": 766 + }, + { + "epoch": 0.55, + "grad_norm": 20.249022741934898, + "learning_rate": 9.704256976596737e-06, + "loss": 0.1263, + "step": 767 + }, + { + "epoch": 0.55, + "grad_norm": 26.41981337531156, + "learning_rate": 9.703276951714114e-06, + "loss": 0.1115, + "step": 768 + }, + { + "epoch": 0.55, + "grad_norm": 36.217352696254, + "learning_rate": 9.70229535537924e-06, + "loss": 0.0952, + "step": 769 + }, + { + "epoch": 0.55, + "grad_norm": 14.397083314980746, + "learning_rate": 9.701312187920084e-06, + "loss": 0.1769, + "step": 770 + }, + { + "epoch": 0.55, + "grad_norm": 26.0325827243847, + "learning_rate": 9.700327449665143e-06, + "loss": 0.1141, + "step": 771 + }, + { + "epoch": 0.55, + "grad_norm": 36.351045967305154, + "learning_rate": 9.699341140943434e-06, + "loss": 0.1384, + "step": 772 + }, + { + "epoch": 0.55, + "grad_norm": 23.25267147893353, + "learning_rate": 9.698353262084501e-06, + "loss": 0.1324, + "step": 773 + }, + { + "epoch": 0.55, + "grad_norm": 8.482877435007008, + "learning_rate": 9.697363813418414e-06, + "loss": 0.1206, + "step": 774 + }, + { + "epoch": 0.55, + "grad_norm": 12.560594535886262, + "learning_rate": 9.696372795275766e-06, + "loss": 0.1587, + "step": 775 + }, + { + "epoch": 0.55, + "grad_norm": 14.141509443672007, + "learning_rate": 9.695380207987675e-06, + "loss": 0.0968, + "step": 776 + }, + { + "epoch": 0.55, + "grad_norm": 23.18138524562932, + "learning_rate": 9.69438605188578e-06, + "loss": 0.1436, + "step": 777 + }, + { + "epoch": 0.56, + "grad_norm": 18.79428105301295, + "learning_rate": 9.69339032730225e-06, + "loss": 0.1552, + "step": 778 + }, + { + "epoch": 0.56, + "grad_norm": 30.44553819976487, + "learning_rate": 9.692393034569776e-06, + "loss": 0.1146, + "step": 779 + }, + { + "epoch": 0.56, + "grad_norm": 6.399140231533663, + "learning_rate": 9.69139417402157e-06, + "loss": 0.1072, + "step": 780 + }, + { + "epoch": 0.56, + "grad_norm": 19.39038831015829, + "learning_rate": 9.690393745991368e-06, + "loss": 0.1361, + "step": 781 + }, + { + "epoch": 0.56, + "grad_norm": 16.807652464179398, + "learning_rate": 9.689391750813436e-06, + "loss": 0.1516, + "step": 782 + }, + { + "epoch": 0.56, + "grad_norm": 12.595719018779906, + "learning_rate": 9.688388188822556e-06, + "loss": 0.1456, + "step": 783 + }, + { + "epoch": 0.56, + "grad_norm": 78.16077311228811, + "learning_rate": 9.687383060354038e-06, + "loss": 0.2327, + "step": 784 + }, + { + "epoch": 0.56, + "grad_norm": 12.822682377103986, + "learning_rate": 9.686376365743714e-06, + "loss": 0.1251, + "step": 785 + }, + { + "epoch": 0.56, + "grad_norm": 35.69822535004326, + "learning_rate": 9.685368105327938e-06, + "loss": 0.1688, + "step": 786 + }, + { + "epoch": 0.56, + "grad_norm": 27.540323241637438, + "learning_rate": 9.684358279443593e-06, + "loss": 0.1223, + "step": 787 + }, + { + "epoch": 0.56, + "grad_norm": 52.99880117052681, + "learning_rate": 9.683346888428074e-06, + "loss": 0.1387, + "step": 788 + }, + { + "epoch": 0.56, + "grad_norm": 38.310273349733706, + "learning_rate": 9.68233393261931e-06, + "loss": 0.156, + "step": 789 + }, + { + "epoch": 0.56, + "grad_norm": 34.726834784529174, + "learning_rate": 9.681319412355748e-06, + "loss": 0.0992, + "step": 790 + }, + { + "epoch": 0.56, + "grad_norm": 68.29196974086027, + "learning_rate": 9.680303327976356e-06, + "loss": 0.1697, + "step": 791 + }, + { + "epoch": 0.57, + "grad_norm": 44.244078950175414, + "learning_rate": 9.679285679820628e-06, + "loss": 0.1471, + "step": 792 + }, + { + "epoch": 0.57, + "grad_norm": 10.851951807112226, + "learning_rate": 9.67826646822858e-06, + "loss": 0.1339, + "step": 793 + }, + { + "epoch": 0.57, + "grad_norm": 49.15989066226097, + "learning_rate": 9.677245693540749e-06, + "loss": 0.1746, + "step": 794 + }, + { + "epoch": 0.57, + "grad_norm": 29.52416076684413, + "learning_rate": 9.676223356098194e-06, + "loss": 0.1154, + "step": 795 + }, + { + "epoch": 0.57, + "grad_norm": 9.38163332741748, + "learning_rate": 9.675199456242499e-06, + "loss": 0.1305, + "step": 796 + }, + { + "epoch": 0.57, + "grad_norm": 10.925248900398602, + "learning_rate": 9.674173994315764e-06, + "loss": 0.1724, + "step": 797 + }, + { + "epoch": 0.57, + "grad_norm": 17.954529179430356, + "learning_rate": 9.67314697066062e-06, + "loss": 0.1324, + "step": 798 + }, + { + "epoch": 0.57, + "grad_norm": 19.573255779518618, + "learning_rate": 9.672118385620209e-06, + "loss": 0.1199, + "step": 799 + }, + { + "epoch": 0.57, + "grad_norm": 45.10260866334031, + "learning_rate": 9.671088239538204e-06, + "loss": 0.168, + "step": 800 + }, + { + "epoch": 0.57, + "grad_norm": 4.096128376621521, + "learning_rate": 9.670056532758798e-06, + "loss": 0.113, + "step": 801 + }, + { + "epoch": 0.57, + "grad_norm": 41.288709677166764, + "learning_rate": 9.669023265626698e-06, + "loss": 0.1699, + "step": 802 + }, + { + "epoch": 0.57, + "grad_norm": 50.206876725438384, + "learning_rate": 9.66798843848714e-06, + "loss": 0.1154, + "step": 803 + }, + { + "epoch": 0.57, + "grad_norm": 18.766537968172486, + "learning_rate": 9.666952051685882e-06, + "loss": 0.1078, + "step": 804 + }, + { + "epoch": 0.57, + "grad_norm": 27.654720064496317, + "learning_rate": 9.665914105569196e-06, + "loss": 0.1472, + "step": 805 + }, + { + "epoch": 0.58, + "grad_norm": 30.465348594344608, + "learning_rate": 9.664874600483883e-06, + "loss": 0.1125, + "step": 806 + }, + { + "epoch": 0.58, + "grad_norm": 35.45942332075495, + "learning_rate": 9.663833536777256e-06, + "loss": 0.1239, + "step": 807 + }, + { + "epoch": 0.58, + "grad_norm": 4.15194949775956, + "learning_rate": 9.662790914797158e-06, + "loss": 0.1382, + "step": 808 + }, + { + "epoch": 0.58, + "grad_norm": 5.445521315197055, + "learning_rate": 9.661746734891947e-06, + "loss": 0.1438, + "step": 809 + }, + { + "epoch": 0.58, + "grad_norm": 18.926741667241487, + "learning_rate": 9.6607009974105e-06, + "loss": 0.1327, + "step": 810 + }, + { + "epoch": 0.58, + "grad_norm": 34.07683918961295, + "learning_rate": 9.659653702702223e-06, + "loss": 0.1337, + "step": 811 + }, + { + "epoch": 0.58, + "grad_norm": 31.666273539929072, + "learning_rate": 9.658604851117032e-06, + "loss": 0.1421, + "step": 812 + }, + { + "epoch": 0.58, + "grad_norm": 7.290269985502701, + "learning_rate": 9.65755444300537e-06, + "loss": 0.1259, + "step": 813 + }, + { + "epoch": 0.58, + "grad_norm": 20.748229179136143, + "learning_rate": 9.656502478718197e-06, + "loss": 0.1207, + "step": 814 + }, + { + "epoch": 0.58, + "grad_norm": 18.76253063384707, + "learning_rate": 9.655448958606994e-06, + "loss": 0.1289, + "step": 815 + }, + { + "epoch": 0.58, + "grad_norm": 47.068157538168485, + "learning_rate": 9.654393883023763e-06, + "loss": 0.1449, + "step": 816 + }, + { + "epoch": 0.58, + "grad_norm": 16.508111889804976, + "learning_rate": 9.653337252321023e-06, + "loss": 0.137, + "step": 817 + }, + { + "epoch": 0.58, + "grad_norm": 6.161976884102493, + "learning_rate": 9.652279066851811e-06, + "loss": 0.126, + "step": 818 + }, + { + "epoch": 0.58, + "grad_norm": 69.48129884052561, + "learning_rate": 9.651219326969694e-06, + "loss": 0.179, + "step": 819 + }, + { + "epoch": 0.59, + "grad_norm": 25.548064760384744, + "learning_rate": 9.650158033028743e-06, + "loss": 0.1292, + "step": 820 + }, + { + "epoch": 0.59, + "grad_norm": 22.848051753734424, + "learning_rate": 9.64909518538356e-06, + "loss": 0.1185, + "step": 821 + }, + { + "epoch": 0.59, + "grad_norm": 38.528184137095906, + "learning_rate": 9.648030784389264e-06, + "loss": 0.1333, + "step": 822 + }, + { + "epoch": 0.59, + "grad_norm": 40.289153016277766, + "learning_rate": 9.646964830401487e-06, + "loss": 0.1868, + "step": 823 + }, + { + "epoch": 0.59, + "grad_norm": 28.63000932230696, + "learning_rate": 9.645897323776386e-06, + "loss": 0.1309, + "step": 824 + }, + { + "epoch": 0.59, + "grad_norm": 29.390175324133896, + "learning_rate": 9.644828264870634e-06, + "loss": 0.1494, + "step": 825 + }, + { + "epoch": 0.59, + "grad_norm": 29.520098412687467, + "learning_rate": 9.643757654041423e-06, + "loss": 0.1147, + "step": 826 + }, + { + "epoch": 0.59, + "grad_norm": 30.864194139048347, + "learning_rate": 9.642685491646467e-06, + "loss": 0.1078, + "step": 827 + }, + { + "epoch": 0.59, + "grad_norm": 42.21262734452298, + "learning_rate": 9.641611778043992e-06, + "loss": 0.1384, + "step": 828 + }, + { + "epoch": 0.59, + "grad_norm": 12.587684898717356, + "learning_rate": 9.64053651359275e-06, + "loss": 0.1387, + "step": 829 + }, + { + "epoch": 0.59, + "grad_norm": 36.28856657814182, + "learning_rate": 9.639459698652e-06, + "loss": 0.1475, + "step": 830 + }, + { + "epoch": 0.59, + "grad_norm": 40.3795735401778, + "learning_rate": 9.63838133358153e-06, + "loss": 0.1454, + "step": 831 + }, + { + "epoch": 0.59, + "grad_norm": 56.58394981425765, + "learning_rate": 9.637301418741643e-06, + "loss": 0.1733, + "step": 832 + }, + { + "epoch": 0.59, + "grad_norm": 9.372522640119524, + "learning_rate": 9.636219954493157e-06, + "loss": 0.1172, + "step": 833 + }, + { + "epoch": 0.6, + "grad_norm": 5.10785762912234, + "learning_rate": 9.635136941197409e-06, + "loss": 0.1244, + "step": 834 + }, + { + "epoch": 0.6, + "grad_norm": 47.76352747100944, + "learning_rate": 9.634052379216256e-06, + "loss": 0.1663, + "step": 835 + }, + { + "epoch": 0.6, + "grad_norm": 27.543918826169143, + "learning_rate": 9.632966268912067e-06, + "loss": 0.1227, + "step": 836 + }, + { + "epoch": 0.6, + "grad_norm": 27.936130596657357, + "learning_rate": 9.631878610647734e-06, + "loss": 0.14, + "step": 837 + }, + { + "epoch": 0.6, + "grad_norm": 35.17792432965117, + "learning_rate": 9.630789404786664e-06, + "loss": 0.156, + "step": 838 + }, + { + "epoch": 0.6, + "grad_norm": 28.829265986491016, + "learning_rate": 9.629698651692779e-06, + "loss": 0.1443, + "step": 839 + }, + { + "epoch": 0.6, + "grad_norm": 24.10923790760327, + "learning_rate": 9.62860635173052e-06, + "loss": 0.1364, + "step": 840 + }, + { + "epoch": 0.6, + "grad_norm": 4.678265188180967, + "learning_rate": 9.627512505264847e-06, + "loss": 0.1251, + "step": 841 + }, + { + "epoch": 0.6, + "grad_norm": 30.518872167549794, + "learning_rate": 9.626417112661233e-06, + "loss": 0.1193, + "step": 842 + }, + { + "epoch": 0.6, + "grad_norm": 4.30248553766188, + "learning_rate": 9.62532017428567e-06, + "loss": 0.1294, + "step": 843 + }, + { + "epoch": 0.6, + "grad_norm": 8.95928112006393, + "learning_rate": 9.624221690504663e-06, + "loss": 0.1318, + "step": 844 + }, + { + "epoch": 0.6, + "grad_norm": 9.975229344522406, + "learning_rate": 9.623121661685239e-06, + "loss": 0.1351, + "step": 845 + }, + { + "epoch": 0.6, + "grad_norm": 5.673008949146638, + "learning_rate": 9.622020088194934e-06, + "loss": 0.1339, + "step": 846 + }, + { + "epoch": 0.6, + "grad_norm": 5.7187307082075485, + "learning_rate": 9.62091697040181e-06, + "loss": 0.109, + "step": 847 + }, + { + "epoch": 0.61, + "grad_norm": 19.086793156437807, + "learning_rate": 9.619812308674434e-06, + "loss": 0.1528, + "step": 848 + }, + { + "epoch": 0.61, + "grad_norm": 15.448285601263763, + "learning_rate": 9.618706103381896e-06, + "loss": 0.137, + "step": 849 + }, + { + "epoch": 0.61, + "grad_norm": 8.43458261032492, + "learning_rate": 9.6175983548938e-06, + "loss": 0.1368, + "step": 850 + }, + { + "epoch": 0.61, + "grad_norm": 6.695331340739746, + "learning_rate": 9.616489063580265e-06, + "loss": 0.1494, + "step": 851 + }, + { + "epoch": 0.61, + "grad_norm": 16.36884007886458, + "learning_rate": 9.615378229811927e-06, + "loss": 0.098, + "step": 852 + }, + { + "epoch": 0.61, + "grad_norm": 6.709638349793122, + "learning_rate": 9.614265853959935e-06, + "loss": 0.1013, + "step": 853 + }, + { + "epoch": 0.61, + "grad_norm": 13.96540985996891, + "learning_rate": 9.613151936395952e-06, + "loss": 0.1692, + "step": 854 + }, + { + "epoch": 0.61, + "grad_norm": 4.1809794075417095, + "learning_rate": 9.612036477492163e-06, + "loss": 0.1151, + "step": 855 + }, + { + "epoch": 0.61, + "grad_norm": 31.864538660439695, + "learning_rate": 9.610919477621262e-06, + "loss": 0.094, + "step": 856 + }, + { + "epoch": 0.61, + "grad_norm": 11.028949180354989, + "learning_rate": 9.609800937156459e-06, + "loss": 0.1671, + "step": 857 + }, + { + "epoch": 0.61, + "grad_norm": 13.125900295808705, + "learning_rate": 9.60868085647148e-06, + "loss": 0.1697, + "step": 858 + }, + { + "epoch": 0.61, + "grad_norm": 26.03409453086144, + "learning_rate": 9.607559235940562e-06, + "loss": 0.1094, + "step": 859 + }, + { + "epoch": 0.61, + "grad_norm": 14.356106276553628, + "learning_rate": 9.60643607593846e-06, + "loss": 0.1176, + "step": 860 + }, + { + "epoch": 0.61, + "grad_norm": 10.48182805841283, + "learning_rate": 9.605311376840446e-06, + "loss": 0.1034, + "step": 861 + }, + { + "epoch": 0.62, + "grad_norm": 4.737169065727766, + "learning_rate": 9.604185139022302e-06, + "loss": 0.1119, + "step": 862 + }, + { + "epoch": 0.62, + "grad_norm": 11.543502918761671, + "learning_rate": 9.603057362860323e-06, + "loss": 0.1512, + "step": 863 + }, + { + "epoch": 0.62, + "grad_norm": 13.137753210423396, + "learning_rate": 9.60192804873132e-06, + "loss": 0.0869, + "step": 864 + }, + { + "epoch": 0.62, + "grad_norm": 8.238531279627713, + "learning_rate": 9.60079719701262e-06, + "loss": 0.1251, + "step": 865 + }, + { + "epoch": 0.62, + "grad_norm": 4.442828257491122, + "learning_rate": 9.599664808082058e-06, + "loss": 0.1073, + "step": 866 + }, + { + "epoch": 0.62, + "grad_norm": 45.80139113041787, + "learning_rate": 9.598530882317992e-06, + "loss": 0.1849, + "step": 867 + }, + { + "epoch": 0.62, + "grad_norm": 5.013274249831894, + "learning_rate": 9.59739542009928e-06, + "loss": 0.1652, + "step": 868 + }, + { + "epoch": 0.62, + "grad_norm": 20.94648720784535, + "learning_rate": 9.596258421805306e-06, + "loss": 0.1349, + "step": 869 + }, + { + "epoch": 0.62, + "grad_norm": 20.308753874252467, + "learning_rate": 9.595119887815962e-06, + "loss": 0.1345, + "step": 870 + }, + { + "epoch": 0.62, + "grad_norm": 30.38587830318456, + "learning_rate": 9.593979818511655e-06, + "loss": 0.1326, + "step": 871 + }, + { + "epoch": 0.62, + "grad_norm": 7.113951857489782, + "learning_rate": 9.592838214273298e-06, + "loss": 0.1516, + "step": 872 + }, + { + "epoch": 0.62, + "grad_norm": 4.08267552970721, + "learning_rate": 9.591695075482326e-06, + "loss": 0.1118, + "step": 873 + }, + { + "epoch": 0.62, + "grad_norm": 21.290502900984546, + "learning_rate": 9.590550402520683e-06, + "loss": 0.1262, + "step": 874 + }, + { + "epoch": 0.62, + "grad_norm": 40.747754501062765, + "learning_rate": 9.589404195770821e-06, + "loss": 0.145, + "step": 875 + }, + { + "epoch": 0.63, + "grad_norm": 5.363040561381, + "learning_rate": 9.588256455615716e-06, + "loss": 0.1309, + "step": 876 + }, + { + "epoch": 0.63, + "grad_norm": 9.164698662773475, + "learning_rate": 9.587107182438846e-06, + "loss": 0.1272, + "step": 877 + }, + { + "epoch": 0.63, + "grad_norm": 46.7965238441854, + "learning_rate": 9.585956376624204e-06, + "loss": 0.1318, + "step": 878 + }, + { + "epoch": 0.63, + "grad_norm": 8.889294742395169, + "learning_rate": 9.584804038556297e-06, + "loss": 0.1427, + "step": 879 + }, + { + "epoch": 0.63, + "grad_norm": 8.060406285399305, + "learning_rate": 9.58365016862014e-06, + "loss": 0.1256, + "step": 880 + }, + { + "epoch": 0.63, + "grad_norm": 40.00127209719933, + "learning_rate": 9.582494767201265e-06, + "loss": 0.1309, + "step": 881 + }, + { + "epoch": 0.63, + "grad_norm": 13.84245575000966, + "learning_rate": 9.581337834685713e-06, + "loss": 0.104, + "step": 882 + }, + { + "epoch": 0.63, + "grad_norm": 23.402757032695074, + "learning_rate": 9.580179371460034e-06, + "loss": 0.1289, + "step": 883 + }, + { + "epoch": 0.63, + "grad_norm": 24.43621959954114, + "learning_rate": 9.579019377911296e-06, + "loss": 0.1401, + "step": 884 + }, + { + "epoch": 0.63, + "grad_norm": 9.753446651852203, + "learning_rate": 9.57785785442707e-06, + "loss": 0.1206, + "step": 885 + }, + { + "epoch": 0.63, + "grad_norm": 9.301431458818122, + "learning_rate": 9.576694801395447e-06, + "loss": 0.1083, + "step": 886 + }, + { + "epoch": 0.63, + "grad_norm": 15.99251554227728, + "learning_rate": 9.57553021920502e-06, + "loss": 0.1708, + "step": 887 + }, + { + "epoch": 0.63, + "grad_norm": 27.354550811963918, + "learning_rate": 9.574364108244903e-06, + "loss": 0.1095, + "step": 888 + }, + { + "epoch": 0.63, + "grad_norm": 14.453620875280626, + "learning_rate": 9.573196468904711e-06, + "loss": 0.1439, + "step": 889 + }, + { + "epoch": 0.64, + "grad_norm": 23.906822227072045, + "learning_rate": 9.572027301574576e-06, + "loss": 0.177, + "step": 890 + }, + { + "epoch": 0.64, + "grad_norm": 40.74283625046122, + "learning_rate": 9.570856606645139e-06, + "loss": 0.1349, + "step": 891 + }, + { + "epoch": 0.64, + "grad_norm": 32.77881193498861, + "learning_rate": 9.569684384507547e-06, + "loss": 0.1088, + "step": 892 + }, + { + "epoch": 0.64, + "grad_norm": 9.788598652893294, + "learning_rate": 9.568510635553466e-06, + "loss": 0.1145, + "step": 893 + }, + { + "epoch": 0.64, + "grad_norm": 5.486909948066521, + "learning_rate": 9.567335360175065e-06, + "loss": 0.131, + "step": 894 + }, + { + "epoch": 0.64, + "grad_norm": 6.250156911609917, + "learning_rate": 9.566158558765026e-06, + "loss": 0.1603, + "step": 895 + }, + { + "epoch": 0.64, + "grad_norm": 23.236140333378174, + "learning_rate": 9.564980231716541e-06, + "loss": 0.1176, + "step": 896 + }, + { + "epoch": 0.64, + "grad_norm": 37.16329810519581, + "learning_rate": 9.56380037942331e-06, + "loss": 0.1315, + "step": 897 + }, + { + "epoch": 0.64, + "grad_norm": 8.545747912275482, + "learning_rate": 9.562619002279541e-06, + "loss": 0.1576, + "step": 898 + }, + { + "epoch": 0.64, + "grad_norm": 4.7428484359296625, + "learning_rate": 9.561436100679959e-06, + "loss": 0.1204, + "step": 899 + }, + { + "epoch": 0.64, + "grad_norm": 34.822828382532826, + "learning_rate": 9.56025167501979e-06, + "loss": 0.1407, + "step": 900 + }, + { + "epoch": 0.64, + "grad_norm": 4.274669158416529, + "learning_rate": 9.559065725694775e-06, + "loss": 0.1305, + "step": 901 + }, + { + "epoch": 0.64, + "grad_norm": 5.2730681558009245, + "learning_rate": 9.55787825310116e-06, + "loss": 0.1417, + "step": 902 + }, + { + "epoch": 0.64, + "grad_norm": 4.919028484956142, + "learning_rate": 9.5566892576357e-06, + "loss": 0.0978, + "step": 903 + }, + { + "epoch": 0.65, + "grad_norm": 8.4710721096915, + "learning_rate": 9.555498739695665e-06, + "loss": 0.1519, + "step": 904 + }, + { + "epoch": 0.65, + "grad_norm": 7.993053552415446, + "learning_rate": 9.554306699678827e-06, + "loss": 0.1193, + "step": 905 + }, + { + "epoch": 0.65, + "grad_norm": 29.810998416607553, + "learning_rate": 9.553113137983467e-06, + "loss": 0.1377, + "step": 906 + }, + { + "epoch": 0.65, + "grad_norm": 10.703199156948056, + "learning_rate": 9.551918055008378e-06, + "loss": 0.125, + "step": 907 + }, + { + "epoch": 0.65, + "grad_norm": 9.792361535663362, + "learning_rate": 9.55072145115286e-06, + "loss": 0.1246, + "step": 908 + }, + { + "epoch": 0.65, + "grad_norm": 38.43165453677999, + "learning_rate": 9.54952332681672e-06, + "loss": 0.1543, + "step": 909 + }, + { + "epoch": 0.65, + "grad_norm": 20.810549271538743, + "learning_rate": 9.54832368240027e-06, + "loss": 0.1169, + "step": 910 + }, + { + "epoch": 0.65, + "grad_norm": 14.216738798712827, + "learning_rate": 9.54712251830434e-06, + "loss": 0.1195, + "step": 911 + }, + { + "epoch": 0.65, + "grad_norm": 37.7812086135415, + "learning_rate": 9.545919834930257e-06, + "loss": 0.1229, + "step": 912 + }, + { + "epoch": 0.65, + "grad_norm": 16.088887693282086, + "learning_rate": 9.54471563267986e-06, + "loss": 0.1521, + "step": 913 + }, + { + "epoch": 0.65, + "grad_norm": 6.086877374953741, + "learning_rate": 9.543509911955497e-06, + "loss": 0.1245, + "step": 914 + }, + { + "epoch": 0.65, + "grad_norm": 6.884647790510014, + "learning_rate": 9.542302673160021e-06, + "loss": 0.1477, + "step": 915 + }, + { + "epoch": 0.65, + "grad_norm": 13.471052689354279, + "learning_rate": 9.541093916696793e-06, + "loss": 0.1655, + "step": 916 + }, + { + "epoch": 0.65, + "grad_norm": 5.860831191922153, + "learning_rate": 9.539883642969681e-06, + "loss": 0.0962, + "step": 917 + }, + { + "epoch": 0.66, + "grad_norm": 9.950951525429048, + "learning_rate": 9.53867185238306e-06, + "loss": 0.1212, + "step": 918 + }, + { + "epoch": 0.66, + "grad_norm": 29.397346489598785, + "learning_rate": 9.53745854534181e-06, + "loss": 0.1531, + "step": 919 + }, + { + "epoch": 0.66, + "grad_norm": 14.700115456334249, + "learning_rate": 9.536243722251321e-06, + "loss": 0.1633, + "step": 920 + }, + { + "epoch": 0.66, + "grad_norm": 31.465682765345253, + "learning_rate": 9.53502738351749e-06, + "loss": 0.1273, + "step": 921 + }, + { + "epoch": 0.66, + "grad_norm": 13.342265225292849, + "learning_rate": 9.533809529546716e-06, + "loss": 0.0986, + "step": 922 + }, + { + "epoch": 0.66, + "grad_norm": 8.06223070740461, + "learning_rate": 9.532590160745906e-06, + "loss": 0.1138, + "step": 923 + }, + { + "epoch": 0.66, + "grad_norm": 5.891965791948293, + "learning_rate": 9.531369277522475e-06, + "loss": 0.1008, + "step": 924 + }, + { + "epoch": 0.66, + "grad_norm": 14.385809821534846, + "learning_rate": 9.530146880284343e-06, + "loss": 0.1107, + "step": 925 + }, + { + "epoch": 0.66, + "grad_norm": 5.594656018466403, + "learning_rate": 9.528922969439935e-06, + "loss": 0.1097, + "step": 926 + }, + { + "epoch": 0.66, + "grad_norm": 23.884572371985932, + "learning_rate": 9.527697545398183e-06, + "loss": 0.1483, + "step": 927 + }, + { + "epoch": 0.66, + "grad_norm": 5.235374187689162, + "learning_rate": 9.526470608568521e-06, + "loss": 0.1179, + "step": 928 + }, + { + "epoch": 0.66, + "grad_norm": 9.498044658058308, + "learning_rate": 9.525242159360897e-06, + "loss": 0.1262, + "step": 929 + }, + { + "epoch": 0.66, + "grad_norm": 10.326025536830267, + "learning_rate": 9.524012198185755e-06, + "loss": 0.1978, + "step": 930 + }, + { + "epoch": 0.66, + "grad_norm": 15.950579961915821, + "learning_rate": 9.522780725454048e-06, + "loss": 0.1472, + "step": 931 + }, + { + "epoch": 0.67, + "grad_norm": 16.911939075772374, + "learning_rate": 9.521547741577232e-06, + "loss": 0.1405, + "step": 932 + }, + { + "epoch": 0.67, + "grad_norm": 8.840723626936537, + "learning_rate": 9.520313246967277e-06, + "loss": 0.1378, + "step": 933 + }, + { + "epoch": 0.67, + "grad_norm": 10.194781971670784, + "learning_rate": 9.519077242036643e-06, + "loss": 0.1351, + "step": 934 + }, + { + "epoch": 0.67, + "grad_norm": 26.571594739816774, + "learning_rate": 9.517839727198306e-06, + "loss": 0.1461, + "step": 935 + }, + { + "epoch": 0.67, + "grad_norm": 25.488817918137755, + "learning_rate": 9.516600702865742e-06, + "loss": 0.1245, + "step": 936 + }, + { + "epoch": 0.67, + "grad_norm": 9.428264723234935, + "learning_rate": 9.51536016945293e-06, + "loss": 0.1256, + "step": 937 + }, + { + "epoch": 0.67, + "grad_norm": 9.398832656295712, + "learning_rate": 9.514118127374358e-06, + "loss": 0.118, + "step": 938 + }, + { + "epoch": 0.67, + "grad_norm": 28.08087002312531, + "learning_rate": 9.512874577045016e-06, + "loss": 0.1302, + "step": 939 + }, + { + "epoch": 0.67, + "grad_norm": 20.624474099676195, + "learning_rate": 9.511629518880394e-06, + "loss": 0.1273, + "step": 940 + }, + { + "epoch": 0.67, + "grad_norm": 12.613846962609959, + "learning_rate": 9.510382953296492e-06, + "loss": 0.0986, + "step": 941 + }, + { + "epoch": 0.67, + "grad_norm": 35.918107603359275, + "learning_rate": 9.50913488070981e-06, + "loss": 0.1638, + "step": 942 + }, + { + "epoch": 0.67, + "grad_norm": 26.49689819650897, + "learning_rate": 9.50788530153735e-06, + "loss": 0.1265, + "step": 943 + }, + { + "epoch": 0.67, + "grad_norm": 16.996031473631124, + "learning_rate": 9.506634216196621e-06, + "loss": 0.1117, + "step": 944 + }, + { + "epoch": 0.67, + "grad_norm": 29.06015069003101, + "learning_rate": 9.505381625105636e-06, + "loss": 0.1605, + "step": 945 + }, + { + "epoch": 0.68, + "grad_norm": 30.453905417068526, + "learning_rate": 9.504127528682907e-06, + "loss": 0.1602, + "step": 946 + }, + { + "epoch": 0.68, + "grad_norm": 11.628167915371439, + "learning_rate": 9.502871927347452e-06, + "loss": 0.101, + "step": 947 + }, + { + "epoch": 0.68, + "grad_norm": 10.321842222130279, + "learning_rate": 9.501614821518789e-06, + "loss": 0.1279, + "step": 948 + }, + { + "epoch": 0.68, + "grad_norm": 4.367606237846145, + "learning_rate": 9.500356211616941e-06, + "loss": 0.1444, + "step": 949 + }, + { + "epoch": 0.68, + "grad_norm": 52.609164817487056, + "learning_rate": 9.499096098062435e-06, + "loss": 0.1614, + "step": 950 + }, + { + "epoch": 0.68, + "grad_norm": 4.849007077901183, + "learning_rate": 9.497834481276293e-06, + "loss": 0.1133, + "step": 951 + }, + { + "epoch": 0.68, + "grad_norm": 6.628839587514296, + "learning_rate": 9.496571361680052e-06, + "loss": 0.1362, + "step": 952 + }, + { + "epoch": 0.68, + "grad_norm": 22.70492961795664, + "learning_rate": 9.495306739695738e-06, + "loss": 0.1348, + "step": 953 + }, + { + "epoch": 0.68, + "grad_norm": 23.438670549885423, + "learning_rate": 9.494040615745887e-06, + "loss": 0.1139, + "step": 954 + }, + { + "epoch": 0.68, + "grad_norm": 3.777828080048308, + "learning_rate": 9.492772990253535e-06, + "loss": 0.1149, + "step": 955 + }, + { + "epoch": 0.68, + "grad_norm": 17.01356783506953, + "learning_rate": 9.49150386364222e-06, + "loss": 0.1173, + "step": 956 + }, + { + "epoch": 0.68, + "grad_norm": 20.86435129792081, + "learning_rate": 9.490233236335977e-06, + "loss": 0.1227, + "step": 957 + }, + { + "epoch": 0.68, + "grad_norm": 21.168010427297975, + "learning_rate": 9.488961108759349e-06, + "loss": 0.0913, + "step": 958 + }, + { + "epoch": 0.68, + "grad_norm": 12.351204194124849, + "learning_rate": 9.487687481337377e-06, + "loss": 0.1135, + "step": 959 + }, + { + "epoch": 0.69, + "grad_norm": 4.269203529612833, + "learning_rate": 9.486412354495605e-06, + "loss": 0.1016, + "step": 960 + }, + { + "epoch": 0.69, + "grad_norm": 5.879287993430144, + "learning_rate": 9.485135728660073e-06, + "loss": 0.1355, + "step": 961 + }, + { + "epoch": 0.69, + "grad_norm": 19.29718867284448, + "learning_rate": 9.48385760425733e-06, + "loss": 0.1233, + "step": 962 + }, + { + "epoch": 0.69, + "grad_norm": 10.109327624294433, + "learning_rate": 9.482577981714417e-06, + "loss": 0.1221, + "step": 963 + }, + { + "epoch": 0.69, + "grad_norm": 11.843077711835422, + "learning_rate": 9.481296861458881e-06, + "loss": 0.1158, + "step": 964 + }, + { + "epoch": 0.69, + "grad_norm": 6.528475309427824, + "learning_rate": 9.480014243918769e-06, + "loss": 0.168, + "step": 965 + }, + { + "epoch": 0.69, + "grad_norm": 15.294376985491054, + "learning_rate": 9.478730129522627e-06, + "loss": 0.1021, + "step": 966 + }, + { + "epoch": 0.69, + "grad_norm": 23.53373087484837, + "learning_rate": 9.477444518699501e-06, + "loss": 0.156, + "step": 967 + }, + { + "epoch": 0.69, + "grad_norm": 12.345322636300342, + "learning_rate": 9.476157411878937e-06, + "loss": 0.1395, + "step": 968 + }, + { + "epoch": 0.69, + "grad_norm": 29.858200916606965, + "learning_rate": 9.474868809490984e-06, + "loss": 0.1115, + "step": 969 + }, + { + "epoch": 0.69, + "grad_norm": 4.955084258234279, + "learning_rate": 9.473578711966185e-06, + "loss": 0.1597, + "step": 970 + }, + { + "epoch": 0.69, + "grad_norm": 11.600246069094517, + "learning_rate": 9.472287119735588e-06, + "loss": 0.1439, + "step": 971 + }, + { + "epoch": 0.69, + "grad_norm": 10.893917106674843, + "learning_rate": 9.470994033230735e-06, + "loss": 0.1113, + "step": 972 + }, + { + "epoch": 0.69, + "grad_norm": 5.074697713863284, + "learning_rate": 9.469699452883672e-06, + "loss": 0.1354, + "step": 973 + }, + { + "epoch": 0.7, + "grad_norm": 10.174208973947078, + "learning_rate": 9.468403379126943e-06, + "loss": 0.1121, + "step": 974 + }, + { + "epoch": 0.7, + "grad_norm": 18.846130115274406, + "learning_rate": 9.46710581239359e-06, + "loss": 0.163, + "step": 975 + }, + { + "epoch": 0.7, + "grad_norm": 4.592869078244344, + "learning_rate": 9.465806753117153e-06, + "loss": 0.1381, + "step": 976 + }, + { + "epoch": 0.7, + "grad_norm": 3.7149921395013608, + "learning_rate": 9.464506201731674e-06, + "loss": 0.0979, + "step": 977 + }, + { + "epoch": 0.7, + "grad_norm": 5.5795453248401605, + "learning_rate": 9.463204158671687e-06, + "loss": 0.1036, + "step": 978 + }, + { + "epoch": 0.7, + "grad_norm": 9.827533012507683, + "learning_rate": 9.461900624372233e-06, + "loss": 0.1218, + "step": 979 + }, + { + "epoch": 0.7, + "grad_norm": 9.285235509914184, + "learning_rate": 9.460595599268848e-06, + "loss": 0.1506, + "step": 980 + }, + { + "epoch": 0.7, + "grad_norm": 6.469405449404304, + "learning_rate": 9.45928908379756e-06, + "loss": 0.0923, + "step": 981 + }, + { + "epoch": 0.7, + "grad_norm": 13.088900389735285, + "learning_rate": 9.457981078394905e-06, + "loss": 0.1519, + "step": 982 + }, + { + "epoch": 0.7, + "grad_norm": 11.416521356456194, + "learning_rate": 9.45667158349791e-06, + "loss": 0.1038, + "step": 983 + }, + { + "epoch": 0.7, + "grad_norm": 7.000029309311584, + "learning_rate": 9.4553605995441e-06, + "loss": 0.1067, + "step": 984 + }, + { + "epoch": 0.7, + "grad_norm": 6.179225960011602, + "learning_rate": 9.4540481269715e-06, + "loss": 0.1224, + "step": 985 + }, + { + "epoch": 0.7, + "grad_norm": 12.010485857421125, + "learning_rate": 9.452734166218635e-06, + "loss": 0.1298, + "step": 986 + }, + { + "epoch": 0.7, + "grad_norm": 28.699516768299603, + "learning_rate": 9.451418717724518e-06, + "loss": 0.1122, + "step": 987 + }, + { + "epoch": 0.71, + "grad_norm": 17.47564053882029, + "learning_rate": 9.45010178192867e-06, + "loss": 0.105, + "step": 988 + }, + { + "epoch": 0.71, + "grad_norm": 22.22446380391295, + "learning_rate": 9.448783359271102e-06, + "loss": 0.1224, + "step": 989 + }, + { + "epoch": 0.71, + "grad_norm": 8.196080255624146, + "learning_rate": 9.44746345019232e-06, + "loss": 0.1234, + "step": 990 + }, + { + "epoch": 0.71, + "grad_norm": 23.353760929453095, + "learning_rate": 9.446142055133333e-06, + "loss": 0.1925, + "step": 991 + }, + { + "epoch": 0.71, + "grad_norm": 12.684837863530596, + "learning_rate": 9.444819174535647e-06, + "loss": 0.1752, + "step": 992 + }, + { + "epoch": 0.71, + "grad_norm": 41.91167980648941, + "learning_rate": 9.443494808841255e-06, + "loss": 0.1741, + "step": 993 + }, + { + "epoch": 0.71, + "grad_norm": 25.021333305393867, + "learning_rate": 9.442168958492657e-06, + "loss": 0.1377, + "step": 994 + }, + { + "epoch": 0.71, + "grad_norm": 17.303187659047722, + "learning_rate": 9.44084162393284e-06, + "loss": 0.1587, + "step": 995 + }, + { + "epoch": 0.71, + "grad_norm": 13.881013346273633, + "learning_rate": 9.439512805605294e-06, + "loss": 0.1152, + "step": 996 + }, + { + "epoch": 0.71, + "grad_norm": 5.679523342858859, + "learning_rate": 9.438182503954002e-06, + "loss": 0.1433, + "step": 997 + }, + { + "epoch": 0.71, + "grad_norm": 5.809722363046939, + "learning_rate": 9.43685071942344e-06, + "loss": 0.1212, + "step": 998 + }, + { + "epoch": 0.71, + "grad_norm": 16.300528267719702, + "learning_rate": 9.435517452458584e-06, + "loss": 0.1003, + "step": 999 + }, + { + "epoch": 0.71, + "grad_norm": 7.142200136073567, + "learning_rate": 9.434182703504904e-06, + "loss": 0.1167, + "step": 1000 + }, + { + "epoch": 0.71, + "eval_avg_AUC": 0.8306426631221906, + "eval_avg_Accuracy": 0.697198275862069, + "eval_avg_Accuracy-right": 0.9357636624494587, + "eval_avg_Accuracy-wrong": 0.2812144644075506, + "eval_avg_Num questions with both labels": 523, + "eval_avg_Question-wise AUC": 0.7007286233364818, + "eval_last_AUC": 0.8499919642378029, + "eval_last_Accuracy": 0.7737483421750663, + "eval_last_Accuracy-right": 0.8576366244945872, + "eval_last_Accuracy-wrong": 0.6274732772344781, + "eval_last_Num questions with both labels": 523, + "eval_last_Question-wise AUC": 0.7122224635862333, + "eval_max_AUC": 0.7712377534346357, + "eval_max_Accuracy": 0.6421170424403183, + "eval_max_Accuracy-right": 0.9876744489370027, + "eval_max_Accuracy-wrong": 0.03957243575164885, + "eval_max_Num questions with both labels": 523, + "eval_max_Question-wise AUC": 0.6283448927147767, + "eval_min_AUC": 0.8459536118200042, + "eval_min_Accuracy": 0.7756962864721485, + "eval_min_Accuracy-right": 0.8238554845441503, + "eval_min_Accuracy-wrong": 0.6917216283829885, + "eval_min_Num questions with both labels": 523, + "eval_min_Question-wise AUC": 0.7067690279119097, + "eval_prod_AUC": 0.8340437720336431, + "eval_prod_Accuracy": 0.7153929045092838, + "eval_prod_Accuracy-right": 0.5985391939480892, + "eval_prod_Accuracy-wrong": 0.9191494200591313, + "eval_prod_Num questions with both labels": 523, + "eval_prod_Question-wise AUC": 0.6925029632488553, + "eval_runtime": 246.6967, + "eval_samples_per_second": 97.804, + "eval_steps_per_second": 3.056, + "eval_sum_AUC": 0.6696619705759399, + "eval_sum_Accuracy": 0.6358173076923077, + "eval_sum_Accuracy-right": 0.9999347854441111, + "eval_sum_Accuracy-wrong": 0.0009097111667045713, + "eval_sum_Num questions with both labels": 523, + "eval_sum_Question-wise AUC": 0.6722441685783858, + "step": 1000 + }, + { + "epoch": 0.71, + "grad_norm": 26.90301939765578, + "learning_rate": 9.432846473008363e-06, + "loss": 0.1073, + "step": 1001 + }, + { + "epoch": 0.72, + "grad_norm": 22.473750563787483, + "learning_rate": 9.431508761415422e-06, + "loss": 0.1453, + "step": 1002 + }, + { + "epoch": 0.72, + "grad_norm": 8.035685660671625, + "learning_rate": 9.430169569173034e-06, + "loss": 0.1307, + "step": 1003 + }, + { + "epoch": 0.72, + "grad_norm": 11.372775149242225, + "learning_rate": 9.428828896728645e-06, + "loss": 0.1189, + "step": 1004 + }, + { + "epoch": 0.72, + "grad_norm": 14.367906249168973, + "learning_rate": 9.427486744530205e-06, + "loss": 0.1089, + "step": 1005 + }, + { + "epoch": 0.72, + "grad_norm": 21.80296312308416, + "learning_rate": 9.426143113026147e-06, + "loss": 0.1641, + "step": 1006 + }, + { + "epoch": 0.72, + "grad_norm": 12.689136775090986, + "learning_rate": 9.424798002665405e-06, + "loss": 0.1456, + "step": 1007 + }, + { + "epoch": 0.72, + "grad_norm": 19.914315829482117, + "learning_rate": 9.423451413897406e-06, + "loss": 0.1381, + "step": 1008 + }, + { + "epoch": 0.72, + "grad_norm": 17.487013924724515, + "learning_rate": 9.42210334717207e-06, + "loss": 0.1842, + "step": 1009 + }, + { + "epoch": 0.72, + "grad_norm": 41.591369148501364, + "learning_rate": 9.42075380293981e-06, + "loss": 0.1582, + "step": 1010 + }, + { + "epoch": 0.72, + "grad_norm": 21.325164316590822, + "learning_rate": 9.419402781651537e-06, + "loss": 0.1091, + "step": 1011 + }, + { + "epoch": 0.72, + "grad_norm": 27.610236895046608, + "learning_rate": 9.418050283758647e-06, + "loss": 0.1791, + "step": 1012 + }, + { + "epoch": 0.72, + "grad_norm": 48.91961383487278, + "learning_rate": 9.416696309713038e-06, + "loss": 0.1597, + "step": 1013 + }, + { + "epoch": 0.72, + "grad_norm": 6.314404735669536, + "learning_rate": 9.415340859967099e-06, + "loss": 0.1124, + "step": 1014 + }, + { + "epoch": 0.72, + "grad_norm": 4.467559964579186, + "learning_rate": 9.413983934973709e-06, + "loss": 0.1421, + "step": 1015 + }, + { + "epoch": 0.73, + "grad_norm": 39.483948098381305, + "learning_rate": 9.412625535186242e-06, + "loss": 0.1479, + "step": 1016 + }, + { + "epoch": 0.73, + "grad_norm": 25.24978912021175, + "learning_rate": 9.411265661058565e-06, + "loss": 0.1482, + "step": 1017 + }, + { + "epoch": 0.73, + "grad_norm": 13.0958518186262, + "learning_rate": 9.409904313045038e-06, + "loss": 0.1525, + "step": 1018 + }, + { + "epoch": 0.73, + "grad_norm": 15.169402604256561, + "learning_rate": 9.408541491600511e-06, + "loss": 0.1279, + "step": 1019 + }, + { + "epoch": 0.73, + "grad_norm": 65.13621930141626, + "learning_rate": 9.407177197180328e-06, + "loss": 0.1594, + "step": 1020 + }, + { + "epoch": 0.73, + "grad_norm": 47.31199672873694, + "learning_rate": 9.405811430240329e-06, + "loss": 0.1588, + "step": 1021 + }, + { + "epoch": 0.73, + "grad_norm": 15.929281558822629, + "learning_rate": 9.404444191236837e-06, + "loss": 0.1355, + "step": 1022 + }, + { + "epoch": 0.73, + "grad_norm": 43.31429781114566, + "learning_rate": 9.403075480626674e-06, + "loss": 0.1611, + "step": 1023 + }, + { + "epoch": 0.73, + "grad_norm": 50.56085481452662, + "learning_rate": 9.401705298867151e-06, + "loss": 0.1643, + "step": 1024 + }, + { + "epoch": 0.73, + "grad_norm": 36.11556641873227, + "learning_rate": 9.400333646416073e-06, + "loss": 0.1428, + "step": 1025 + }, + { + "epoch": 0.73, + "grad_norm": 6.0767580228440075, + "learning_rate": 9.398960523731735e-06, + "loss": 0.1222, + "step": 1026 + }, + { + "epoch": 0.73, + "grad_norm": 48.47371243487692, + "learning_rate": 9.397585931272919e-06, + "loss": 0.1434, + "step": 1027 + }, + { + "epoch": 0.73, + "grad_norm": 47.87986732416734, + "learning_rate": 9.396209869498905e-06, + "loss": 0.1599, + "step": 1028 + }, + { + "epoch": 0.73, + "grad_norm": 39.188166800682154, + "learning_rate": 9.39483233886946e-06, + "loss": 0.1154, + "step": 1029 + }, + { + "epoch": 0.74, + "grad_norm": 10.175502788260841, + "learning_rate": 9.393453339844842e-06, + "loss": 0.1465, + "step": 1030 + }, + { + "epoch": 0.74, + "grad_norm": 15.843982783240834, + "learning_rate": 9.392072872885802e-06, + "loss": 0.1418, + "step": 1031 + }, + { + "epoch": 0.74, + "grad_norm": 51.04924891166709, + "learning_rate": 9.39069093845358e-06, + "loss": 0.1541, + "step": 1032 + }, + { + "epoch": 0.74, + "grad_norm": 28.092328125663045, + "learning_rate": 9.389307537009902e-06, + "loss": 0.1537, + "step": 1033 + }, + { + "epoch": 0.74, + "grad_norm": 30.374904328612487, + "learning_rate": 9.387922669016992e-06, + "loss": 0.1475, + "step": 1034 + }, + { + "epoch": 0.74, + "grad_norm": 16.81604856905053, + "learning_rate": 9.386536334937557e-06, + "loss": 0.1382, + "step": 1035 + }, + { + "epoch": 0.74, + "grad_norm": 37.54277803011268, + "learning_rate": 9.385148535234799e-06, + "loss": 0.1389, + "step": 1036 + }, + { + "epoch": 0.74, + "grad_norm": 37.991731569487285, + "learning_rate": 9.383759270372408e-06, + "loss": 0.1583, + "step": 1037 + }, + { + "epoch": 0.74, + "grad_norm": 43.38772303094576, + "learning_rate": 9.382368540814563e-06, + "loss": 0.1724, + "step": 1038 + }, + { + "epoch": 0.74, + "grad_norm": 17.689671523576397, + "learning_rate": 9.380976347025932e-06, + "loss": 0.1157, + "step": 1039 + }, + { + "epoch": 0.74, + "grad_norm": 44.41572273988692, + "learning_rate": 9.379582689471671e-06, + "loss": 0.1479, + "step": 1040 + }, + { + "epoch": 0.74, + "grad_norm": 24.499163894929794, + "learning_rate": 9.378187568617431e-06, + "loss": 0.1245, + "step": 1041 + }, + { + "epoch": 0.74, + "grad_norm": 34.745047025595184, + "learning_rate": 9.376790984929348e-06, + "loss": 0.1395, + "step": 1042 + }, + { + "epoch": 0.74, + "grad_norm": 18.610150565591297, + "learning_rate": 9.37539293887404e-06, + "loss": 0.1469, + "step": 1043 + }, + { + "epoch": 0.75, + "grad_norm": 29.510494891674, + "learning_rate": 9.373993430918626e-06, + "loss": 0.1155, + "step": 1044 + }, + { + "epoch": 0.75, + "grad_norm": 26.89748864189952, + "learning_rate": 9.372592461530708e-06, + "loss": 0.1505, + "step": 1045 + }, + { + "epoch": 0.75, + "grad_norm": 10.407826158103616, + "learning_rate": 9.371190031178372e-06, + "loss": 0.1257, + "step": 1046 + }, + { + "epoch": 0.75, + "grad_norm": 20.146410557268332, + "learning_rate": 9.369786140330198e-06, + "loss": 0.1201, + "step": 1047 + }, + { + "epoch": 0.75, + "grad_norm": 4.649809459235866, + "learning_rate": 9.368380789455251e-06, + "loss": 0.1188, + "step": 1048 + }, + { + "epoch": 0.75, + "grad_norm": 20.935495774130015, + "learning_rate": 9.36697397902309e-06, + "loss": 0.1434, + "step": 1049 + }, + { + "epoch": 0.75, + "grad_norm": 25.501227504620445, + "learning_rate": 9.365565709503748e-06, + "loss": 0.1395, + "step": 1050 + }, + { + "epoch": 0.75, + "grad_norm": 7.080004066515803, + "learning_rate": 9.364155981367761e-06, + "loss": 0.0883, + "step": 1051 + }, + { + "epoch": 0.75, + "grad_norm": 10.423934524437326, + "learning_rate": 9.36274479508614e-06, + "loss": 0.1187, + "step": 1052 + }, + { + "epoch": 0.75, + "grad_norm": 34.962167370390645, + "learning_rate": 9.361332151130396e-06, + "loss": 0.12, + "step": 1053 + }, + { + "epoch": 0.75, + "grad_norm": 21.079610427912726, + "learning_rate": 9.359918049972512e-06, + "loss": 0.1295, + "step": 1054 + }, + { + "epoch": 0.75, + "grad_norm": 28.618833107684583, + "learning_rate": 9.358502492084969e-06, + "loss": 0.1395, + "step": 1055 + }, + { + "epoch": 0.75, + "grad_norm": 5.8759001732284855, + "learning_rate": 9.35708547794073e-06, + "loss": 0.1281, + "step": 1056 + }, + { + "epoch": 0.75, + "grad_norm": 37.51265565785163, + "learning_rate": 9.355667008013249e-06, + "loss": 0.1451, + "step": 1057 + }, + { + "epoch": 0.76, + "grad_norm": 63.60202101689444, + "learning_rate": 9.354247082776459e-06, + "loss": 0.1753, + "step": 1058 + }, + { + "epoch": 0.76, + "grad_norm": 23.159854031756225, + "learning_rate": 9.352825702704784e-06, + "loss": 0.132, + "step": 1059 + }, + { + "epoch": 0.76, + "grad_norm": 46.848693056166454, + "learning_rate": 9.351402868273136e-06, + "loss": 0.1409, + "step": 1060 + }, + { + "epoch": 0.76, + "grad_norm": 35.31714858575622, + "learning_rate": 9.349978579956908e-06, + "loss": 0.1536, + "step": 1061 + }, + { + "epoch": 0.76, + "grad_norm": 39.11227629802165, + "learning_rate": 9.348552838231983e-06, + "loss": 0.1378, + "step": 1062 + }, + { + "epoch": 0.76, + "grad_norm": 14.341752586653023, + "learning_rate": 9.347125643574726e-06, + "loss": 0.1119, + "step": 1063 + }, + { + "epoch": 0.76, + "grad_norm": 31.512222693182753, + "learning_rate": 9.345696996461992e-06, + "loss": 0.1544, + "step": 1064 + }, + { + "epoch": 0.76, + "grad_norm": 44.78047175611448, + "learning_rate": 9.344266897371114e-06, + "loss": 0.1526, + "step": 1065 + }, + { + "epoch": 0.76, + "grad_norm": 13.860702506152542, + "learning_rate": 9.34283534677992e-06, + "loss": 0.132, + "step": 1066 + }, + { + "epoch": 0.76, + "grad_norm": 9.914053857460699, + "learning_rate": 9.341402345166714e-06, + "loss": 0.146, + "step": 1067 + }, + { + "epoch": 0.76, + "grad_norm": 5.244521794867931, + "learning_rate": 9.33996789301029e-06, + "loss": 0.1675, + "step": 1068 + }, + { + "epoch": 0.76, + "grad_norm": 30.55515143164621, + "learning_rate": 9.338531990789926e-06, + "loss": 0.1207, + "step": 1069 + }, + { + "epoch": 0.76, + "grad_norm": 11.578710866057671, + "learning_rate": 9.33709463898538e-06, + "loss": 0.1359, + "step": 1070 + }, + { + "epoch": 0.76, + "grad_norm": 7.390861205292069, + "learning_rate": 9.335655838076902e-06, + "loss": 0.0799, + "step": 1071 + }, + { + "epoch": 0.77, + "grad_norm": 4.0312126352617454, + "learning_rate": 9.33421558854522e-06, + "loss": 0.1311, + "step": 1072 + }, + { + "epoch": 0.77, + "grad_norm": 16.82866199649571, + "learning_rate": 9.332773890871548e-06, + "loss": 0.1306, + "step": 1073 + }, + { + "epoch": 0.77, + "grad_norm": 23.34045634748492, + "learning_rate": 9.331330745537586e-06, + "loss": 0.1274, + "step": 1074 + }, + { + "epoch": 0.77, + "grad_norm": 21.331301913161205, + "learning_rate": 9.329886153025513e-06, + "loss": 0.1211, + "step": 1075 + }, + { + "epoch": 0.77, + "grad_norm": 11.711724010388526, + "learning_rate": 9.328440113817995e-06, + "loss": 0.1337, + "step": 1076 + }, + { + "epoch": 0.77, + "grad_norm": 22.866056035390898, + "learning_rate": 9.326992628398182e-06, + "loss": 0.1196, + "step": 1077 + }, + { + "epoch": 0.77, + "grad_norm": 6.234289721717951, + "learning_rate": 9.325543697249706e-06, + "loss": 0.1526, + "step": 1078 + }, + { + "epoch": 0.77, + "grad_norm": 29.859928319090212, + "learning_rate": 9.324093320856679e-06, + "loss": 0.137, + "step": 1079 + }, + { + "epoch": 0.77, + "grad_norm": 15.250096335579789, + "learning_rate": 9.3226414997037e-06, + "loss": 0.116, + "step": 1080 + }, + { + "epoch": 0.77, + "grad_norm": 13.223197640483088, + "learning_rate": 9.32118823427585e-06, + "loss": 0.1439, + "step": 1081 + }, + { + "epoch": 0.77, + "grad_norm": 7.3596389314819675, + "learning_rate": 9.319733525058694e-06, + "loss": 0.1238, + "step": 1082 + }, + { + "epoch": 0.77, + "grad_norm": 5.286656373449186, + "learning_rate": 9.318277372538274e-06, + "loss": 0.1317, + "step": 1083 + }, + { + "epoch": 0.77, + "grad_norm": 14.109356967510104, + "learning_rate": 9.316819777201119e-06, + "loss": 0.1257, + "step": 1084 + }, + { + "epoch": 0.77, + "grad_norm": 10.223027875056157, + "learning_rate": 9.315360739534235e-06, + "loss": 0.1648, + "step": 1085 + }, + { + "epoch": 0.78, + "grad_norm": 4.052529390869049, + "learning_rate": 9.313900260025121e-06, + "loss": 0.1232, + "step": 1086 + }, + { + "epoch": 0.78, + "grad_norm": 14.816538587879734, + "learning_rate": 9.312438339161746e-06, + "loss": 0.1198, + "step": 1087 + }, + { + "epoch": 0.78, + "grad_norm": 5.283898279259918, + "learning_rate": 9.310974977432565e-06, + "loss": 0.1261, + "step": 1088 + }, + { + "epoch": 0.78, + "grad_norm": 19.36116992418242, + "learning_rate": 9.309510175326515e-06, + "loss": 0.1118, + "step": 1089 + }, + { + "epoch": 0.78, + "grad_norm": 5.860358397763082, + "learning_rate": 9.308043933333012e-06, + "loss": 0.1211, + "step": 1090 + }, + { + "epoch": 0.78, + "grad_norm": 12.600425083069407, + "learning_rate": 9.306576251941957e-06, + "loss": 0.1387, + "step": 1091 + }, + { + "epoch": 0.78, + "grad_norm": 49.923681618239925, + "learning_rate": 9.305107131643729e-06, + "loss": 0.1647, + "step": 1092 + }, + { + "epoch": 0.78, + "grad_norm": 9.41603756614823, + "learning_rate": 9.303636572929188e-06, + "loss": 0.1338, + "step": 1093 + }, + { + "epoch": 0.78, + "grad_norm": 15.05350452195723, + "learning_rate": 9.302164576289674e-06, + "loss": 0.1469, + "step": 1094 + }, + { + "epoch": 0.78, + "grad_norm": 28.33566516662762, + "learning_rate": 9.30069114221701e-06, + "loss": 0.153, + "step": 1095 + }, + { + "epoch": 0.78, + "grad_norm": 23.243076526598593, + "learning_rate": 9.299216271203498e-06, + "loss": 0.1603, + "step": 1096 + }, + { + "epoch": 0.78, + "grad_norm": 10.582545901960945, + "learning_rate": 9.297739963741918e-06, + "loss": 0.1294, + "step": 1097 + }, + { + "epoch": 0.78, + "grad_norm": 11.667560754149541, + "learning_rate": 9.296262220325535e-06, + "loss": 0.11, + "step": 1098 + }, + { + "epoch": 0.78, + "grad_norm": 10.168707735076364, + "learning_rate": 9.294783041448088e-06, + "loss": 0.127, + "step": 1099 + }, + { + "epoch": 0.79, + "grad_norm": 12.57179862351945, + "learning_rate": 9.293302427603796e-06, + "loss": 0.1414, + "step": 1100 + }, + { + "epoch": 0.79, + "grad_norm": 26.656118865639158, + "learning_rate": 9.291820379287364e-06, + "loss": 0.1296, + "step": 1101 + }, + { + "epoch": 0.79, + "grad_norm": 5.172589483208834, + "learning_rate": 9.29033689699397e-06, + "loss": 0.1492, + "step": 1102 + }, + { + "epoch": 0.79, + "grad_norm": 14.01791519086498, + "learning_rate": 9.288851981219273e-06, + "loss": 0.1327, + "step": 1103 + }, + { + "epoch": 0.79, + "grad_norm": 33.859914601995875, + "learning_rate": 9.28736563245941e-06, + "loss": 0.1571, + "step": 1104 + }, + { + "epoch": 0.79, + "grad_norm": 21.956222104782483, + "learning_rate": 9.285877851210999e-06, + "loss": 0.1, + "step": 1105 + }, + { + "epoch": 0.79, + "grad_norm": 18.10448242341278, + "learning_rate": 9.284388637971136e-06, + "loss": 0.1234, + "step": 1106 + }, + { + "epoch": 0.79, + "grad_norm": 30.84897554982973, + "learning_rate": 9.282897993237392e-06, + "loss": 0.1714, + "step": 1107 + }, + { + "epoch": 0.79, + "grad_norm": 37.9856161444381, + "learning_rate": 9.281405917507824e-06, + "loss": 0.1244, + "step": 1108 + }, + { + "epoch": 0.79, + "grad_norm": 15.563367998567477, + "learning_rate": 9.279912411280958e-06, + "loss": 0.1194, + "step": 1109 + }, + { + "epoch": 0.79, + "grad_norm": 12.245185274327808, + "learning_rate": 9.278417475055803e-06, + "loss": 0.1119, + "step": 1110 + }, + { + "epoch": 0.79, + "grad_norm": 12.455063691988519, + "learning_rate": 9.276921109331845e-06, + "loss": 0.1165, + "step": 1111 + }, + { + "epoch": 0.79, + "grad_norm": 25.8213778871432, + "learning_rate": 9.275423314609049e-06, + "loss": 0.152, + "step": 1112 + }, + { + "epoch": 0.79, + "grad_norm": 13.869919626594394, + "learning_rate": 9.273924091387855e-06, + "loss": 0.115, + "step": 1113 + }, + { + "epoch": 0.8, + "grad_norm": 4.074864485432894, + "learning_rate": 9.272423440169181e-06, + "loss": 0.1091, + "step": 1114 + }, + { + "epoch": 0.8, + "grad_norm": 5.656458241700619, + "learning_rate": 9.270921361454424e-06, + "loss": 0.0948, + "step": 1115 + }, + { + "epoch": 0.8, + "grad_norm": 37.09019385402894, + "learning_rate": 9.269417855745453e-06, + "loss": 0.1337, + "step": 1116 + }, + { + "epoch": 0.8, + "grad_norm": 33.08524762310668, + "learning_rate": 9.267912923544621e-06, + "loss": 0.1455, + "step": 1117 + }, + { + "epoch": 0.8, + "grad_norm": 8.26175581668237, + "learning_rate": 9.266406565354753e-06, + "loss": 0.1221, + "step": 1118 + }, + { + "epoch": 0.8, + "grad_norm": 23.067596741868993, + "learning_rate": 9.26489878167915e-06, + "loss": 0.1331, + "step": 1119 + }, + { + "epoch": 0.8, + "grad_norm": 28.961300839321403, + "learning_rate": 9.263389573021592e-06, + "loss": 0.1156, + "step": 1120 + }, + { + "epoch": 0.8, + "grad_norm": 36.58260181012153, + "learning_rate": 9.261878939886332e-06, + "loss": 0.1499, + "step": 1121 + }, + { + "epoch": 0.8, + "grad_norm": 15.19087787168311, + "learning_rate": 9.2603668827781e-06, + "loss": 0.1367, + "step": 1122 + }, + { + "epoch": 0.8, + "grad_norm": 20.099032625961726, + "learning_rate": 9.258853402202106e-06, + "loss": 0.1182, + "step": 1123 + }, + { + "epoch": 0.8, + "grad_norm": 33.55489255269871, + "learning_rate": 9.25733849866403e-06, + "loss": 0.1436, + "step": 1124 + }, + { + "epoch": 0.8, + "grad_norm": 22.799341496962658, + "learning_rate": 9.255822172670028e-06, + "loss": 0.119, + "step": 1125 + }, + { + "epoch": 0.8, + "grad_norm": 10.031052291448315, + "learning_rate": 9.254304424726734e-06, + "loss": 0.1013, + "step": 1126 + }, + { + "epoch": 0.8, + "grad_norm": 6.390605015029899, + "learning_rate": 9.252785255341256e-06, + "loss": 0.1116, + "step": 1127 + }, + { + "epoch": 0.81, + "grad_norm": 8.188962911089087, + "learning_rate": 9.251264665021178e-06, + "loss": 0.1255, + "step": 1128 + }, + { + "epoch": 0.81, + "grad_norm": 28.64616641930042, + "learning_rate": 9.249742654274554e-06, + "loss": 0.1237, + "step": 1129 + }, + { + "epoch": 0.81, + "grad_norm": 9.921072334184103, + "learning_rate": 9.24821922360992e-06, + "loss": 0.1051, + "step": 1130 + }, + { + "epoch": 0.81, + "grad_norm": 22.18891754085567, + "learning_rate": 9.246694373536277e-06, + "loss": 0.1155, + "step": 1131 + }, + { + "epoch": 0.81, + "grad_norm": 22.768049432903226, + "learning_rate": 9.245168104563112e-06, + "loss": 0.1561, + "step": 1132 + }, + { + "epoch": 0.81, + "grad_norm": 4.694267206038536, + "learning_rate": 9.243640417200376e-06, + "loss": 0.1177, + "step": 1133 + }, + { + "epoch": 0.81, + "grad_norm": 19.472208799074128, + "learning_rate": 9.242111311958502e-06, + "loss": 0.1261, + "step": 1134 + }, + { + "epoch": 0.81, + "grad_norm": 7.137828446129272, + "learning_rate": 9.240580789348385e-06, + "loss": 0.1493, + "step": 1135 + }, + { + "epoch": 0.81, + "grad_norm": 17.02804572717195, + "learning_rate": 9.23904884988141e-06, + "loss": 0.1364, + "step": 1136 + }, + { + "epoch": 0.81, + "grad_norm": 7.782199051410428, + "learning_rate": 9.237515494069417e-06, + "loss": 0.1243, + "step": 1137 + }, + { + "epoch": 0.81, + "grad_norm": 5.708252427463937, + "learning_rate": 9.235980722424737e-06, + "loss": 0.1117, + "step": 1138 + }, + { + "epoch": 0.81, + "grad_norm": 4.366900740290782, + "learning_rate": 9.234444535460161e-06, + "loss": 0.1058, + "step": 1139 + }, + { + "epoch": 0.81, + "grad_norm": 7.735023185477729, + "learning_rate": 9.232906933688959e-06, + "loss": 0.1042, + "step": 1140 + }, + { + "epoch": 0.81, + "grad_norm": 17.878356014797415, + "learning_rate": 9.231367917624872e-06, + "loss": 0.1143, + "step": 1141 + }, + { + "epoch": 0.82, + "grad_norm": 13.500635215475153, + "learning_rate": 9.229827487782115e-06, + "loss": 0.1472, + "step": 1142 + }, + { + "epoch": 0.82, + "grad_norm": 28.914775597777798, + "learning_rate": 9.228285644675372e-06, + "loss": 0.1571, + "step": 1143 + }, + { + "epoch": 0.82, + "grad_norm": 7.679656527001652, + "learning_rate": 9.226742388819804e-06, + "loss": 0.1243, + "step": 1144 + }, + { + "epoch": 0.82, + "grad_norm": 42.56589275402226, + "learning_rate": 9.225197720731039e-06, + "loss": 0.1748, + "step": 1145 + }, + { + "epoch": 0.82, + "grad_norm": 33.46124830306206, + "learning_rate": 9.223651640925181e-06, + "loss": 0.1602, + "step": 1146 + }, + { + "epoch": 0.82, + "grad_norm": 11.354447656553011, + "learning_rate": 9.222104149918804e-06, + "loss": 0.1125, + "step": 1147 + }, + { + "epoch": 0.82, + "grad_norm": 4.943626062124622, + "learning_rate": 9.220555248228954e-06, + "loss": 0.1055, + "step": 1148 + }, + { + "epoch": 0.82, + "grad_norm": 22.06199413082995, + "learning_rate": 9.219004936373146e-06, + "loss": 0.1202, + "step": 1149 + }, + { + "epoch": 0.82, + "grad_norm": 78.80684998307608, + "learning_rate": 9.217453214869368e-06, + "loss": 0.2026, + "step": 1150 + }, + { + "epoch": 0.82, + "grad_norm": 33.723692179365536, + "learning_rate": 9.21590008423608e-06, + "loss": 0.1237, + "step": 1151 + }, + { + "epoch": 0.82, + "grad_norm": 20.8665304253155, + "learning_rate": 9.214345544992214e-06, + "loss": 0.136, + "step": 1152 + }, + { + "epoch": 0.82, + "grad_norm": 66.63596872046561, + "learning_rate": 9.212789597657167e-06, + "loss": 0.1742, + "step": 1153 + }, + { + "epoch": 0.82, + "grad_norm": 39.21810406123508, + "learning_rate": 9.21123224275081e-06, + "loss": 0.1675, + "step": 1154 + }, + { + "epoch": 0.82, + "grad_norm": 23.06081056803674, + "learning_rate": 9.209673480793486e-06, + "loss": 0.1375, + "step": 1155 + }, + { + "epoch": 0.83, + "grad_norm": 25.325897571173787, + "learning_rate": 9.208113312306006e-06, + "loss": 0.1434, + "step": 1156 + }, + { + "epoch": 0.83, + "grad_norm": 52.68852183497776, + "learning_rate": 9.206551737809653e-06, + "loss": 0.1707, + "step": 1157 + }, + { + "epoch": 0.83, + "grad_norm": 4.831358394803084, + "learning_rate": 9.204988757826173e-06, + "loss": 0.1233, + "step": 1158 + }, + { + "epoch": 0.83, + "grad_norm": 16.788606151895603, + "learning_rate": 9.203424372877791e-06, + "loss": 0.116, + "step": 1159 + }, + { + "epoch": 0.83, + "grad_norm": 10.314075233780184, + "learning_rate": 9.201858583487195e-06, + "loss": 0.1305, + "step": 1160 + }, + { + "epoch": 0.83, + "grad_norm": 24.77340581749608, + "learning_rate": 9.200291390177546e-06, + "loss": 0.1249, + "step": 1161 + }, + { + "epoch": 0.83, + "grad_norm": 13.838543735694156, + "learning_rate": 9.198722793472471e-06, + "loss": 0.0964, + "step": 1162 + }, + { + "epoch": 0.83, + "grad_norm": 4.489793512918954, + "learning_rate": 9.197152793896068e-06, + "loss": 0.1027, + "step": 1163 + }, + { + "epoch": 0.83, + "grad_norm": 17.370296763714304, + "learning_rate": 9.195581391972903e-06, + "loss": 0.1373, + "step": 1164 + }, + { + "epoch": 0.83, + "grad_norm": 8.574790093828888, + "learning_rate": 9.194008588228011e-06, + "loss": 0.1179, + "step": 1165 + }, + { + "epoch": 0.83, + "grad_norm": 10.243110764589119, + "learning_rate": 9.192434383186894e-06, + "loss": 0.1274, + "step": 1166 + }, + { + "epoch": 0.83, + "grad_norm": 7.676977628339888, + "learning_rate": 9.190858777375523e-06, + "loss": 0.1256, + "step": 1167 + }, + { + "epoch": 0.83, + "grad_norm": 19.690069281828507, + "learning_rate": 9.18928177132034e-06, + "loss": 0.1165, + "step": 1168 + }, + { + "epoch": 0.83, + "grad_norm": 7.038428559464359, + "learning_rate": 9.187703365548248e-06, + "loss": 0.1094, + "step": 1169 + }, + { + "epoch": 0.84, + "grad_norm": 26.547275502331654, + "learning_rate": 9.186123560586623e-06, + "loss": 0.2321, + "step": 1170 + }, + { + "epoch": 0.84, + "grad_norm": 13.193929639465265, + "learning_rate": 9.18454235696331e-06, + "loss": 0.1295, + "step": 1171 + }, + { + "epoch": 0.84, + "grad_norm": 26.734089366689073, + "learning_rate": 9.182959755206613e-06, + "loss": 0.1381, + "step": 1172 + }, + { + "epoch": 0.84, + "grad_norm": 6.039923169051226, + "learning_rate": 9.181375755845314e-06, + "loss": 0.1196, + "step": 1173 + }, + { + "epoch": 0.84, + "grad_norm": 5.543614416837632, + "learning_rate": 9.179790359408655e-06, + "loss": 0.1206, + "step": 1174 + }, + { + "epoch": 0.84, + "grad_norm": 7.105282043598741, + "learning_rate": 9.178203566426344e-06, + "loss": 0.1093, + "step": 1175 + }, + { + "epoch": 0.84, + "grad_norm": 20.186482641410777, + "learning_rate": 9.176615377428563e-06, + "loss": 0.1453, + "step": 1176 + }, + { + "epoch": 0.84, + "grad_norm": 4.872272725289261, + "learning_rate": 9.175025792945951e-06, + "loss": 0.0986, + "step": 1177 + }, + { + "epoch": 0.84, + "grad_norm": 8.738372660616186, + "learning_rate": 9.173434813509618e-06, + "loss": 0.0973, + "step": 1178 + }, + { + "epoch": 0.84, + "grad_norm": 13.83532907369495, + "learning_rate": 9.171842439651143e-06, + "loss": 0.1072, + "step": 1179 + }, + { + "epoch": 0.84, + "grad_norm": 6.269615641040455, + "learning_rate": 9.170248671902565e-06, + "loss": 0.106, + "step": 1180 + }, + { + "epoch": 0.84, + "grad_norm": 19.59295084898656, + "learning_rate": 9.168653510796392e-06, + "loss": 0.1466, + "step": 1181 + }, + { + "epoch": 0.84, + "grad_norm": 7.073455582971626, + "learning_rate": 9.167056956865596e-06, + "loss": 0.1572, + "step": 1182 + }, + { + "epoch": 0.84, + "grad_norm": 11.573875087408759, + "learning_rate": 9.165459010643618e-06, + "loss": 0.1233, + "step": 1183 + }, + { + "epoch": 0.85, + "grad_norm": 4.690591395107708, + "learning_rate": 9.16385967266436e-06, + "loss": 0.111, + "step": 1184 + }, + { + "epoch": 0.85, + "grad_norm": 5.771275364512612, + "learning_rate": 9.16225894346219e-06, + "loss": 0.1333, + "step": 1185 + }, + { + "epoch": 0.85, + "grad_norm": 24.721033576277165, + "learning_rate": 9.160656823571942e-06, + "loss": 0.1638, + "step": 1186 + }, + { + "epoch": 0.85, + "grad_norm": 15.57598172877004, + "learning_rate": 9.159053313528913e-06, + "loss": 0.1168, + "step": 1187 + }, + { + "epoch": 0.85, + "grad_norm": 26.636854607871165, + "learning_rate": 9.15744841386887e-06, + "loss": 0.1198, + "step": 1188 + }, + { + "epoch": 0.85, + "grad_norm": 3.98440431085169, + "learning_rate": 9.155842125128033e-06, + "loss": 0.1067, + "step": 1189 + }, + { + "epoch": 0.85, + "grad_norm": 31.917676167530633, + "learning_rate": 9.154234447843098e-06, + "loss": 0.1306, + "step": 1190 + }, + { + "epoch": 0.85, + "grad_norm": 6.019593781975672, + "learning_rate": 9.152625382551217e-06, + "loss": 0.0909, + "step": 1191 + }, + { + "epoch": 0.85, + "grad_norm": 8.806640969390639, + "learning_rate": 9.15101492979001e-06, + "loss": 0.1046, + "step": 1192 + }, + { + "epoch": 0.85, + "grad_norm": 28.841021287717343, + "learning_rate": 9.149403090097557e-06, + "loss": 0.1171, + "step": 1193 + }, + { + "epoch": 0.85, + "grad_norm": 16.465965710527545, + "learning_rate": 9.147789864012408e-06, + "loss": 0.1447, + "step": 1194 + }, + { + "epoch": 0.85, + "grad_norm": 24.666832031038798, + "learning_rate": 9.146175252073568e-06, + "loss": 0.1456, + "step": 1195 + }, + { + "epoch": 0.85, + "grad_norm": 19.683015165025946, + "learning_rate": 9.144559254820511e-06, + "loss": 0.1213, + "step": 1196 + }, + { + "epoch": 0.85, + "grad_norm": 5.122237884315906, + "learning_rate": 9.14294187279317e-06, + "loss": 0.1357, + "step": 1197 + }, + { + "epoch": 0.86, + "grad_norm": 6.419084999748459, + "learning_rate": 9.141323106531943e-06, + "loss": 0.1353, + "step": 1198 + }, + { + "epoch": 0.86, + "grad_norm": 33.4219861921213, + "learning_rate": 9.139702956577693e-06, + "loss": 0.1405, + "step": 1199 + }, + { + "epoch": 0.86, + "grad_norm": 23.625021627700484, + "learning_rate": 9.138081423471736e-06, + "loss": 0.1046, + "step": 1200 + }, + { + "epoch": 0.86, + "grad_norm": 11.839297028784905, + "learning_rate": 9.136458507755862e-06, + "loss": 0.1106, + "step": 1201 + }, + { + "epoch": 0.86, + "grad_norm": 12.004864533298464, + "learning_rate": 9.134834209972314e-06, + "loss": 0.104, + "step": 1202 + }, + { + "epoch": 0.86, + "grad_norm": 34.08869419046535, + "learning_rate": 9.133208530663801e-06, + "loss": 0.1288, + "step": 1203 + }, + { + "epoch": 0.86, + "grad_norm": 26.979912078853985, + "learning_rate": 9.131581470373495e-06, + "loss": 0.1449, + "step": 1204 + }, + { + "epoch": 0.86, + "grad_norm": 26.374270596425987, + "learning_rate": 9.129953029645022e-06, + "loss": 0.1167, + "step": 1205 + }, + { + "epoch": 0.86, + "grad_norm": 3.895140545361581, + "learning_rate": 9.128323209022478e-06, + "loss": 0.1267, + "step": 1206 + }, + { + "epoch": 0.86, + "grad_norm": 51.67643816360633, + "learning_rate": 9.126692009050415e-06, + "loss": 0.1334, + "step": 1207 + }, + { + "epoch": 0.86, + "grad_norm": 6.7195350284212845, + "learning_rate": 9.125059430273848e-06, + "loss": 0.1033, + "step": 1208 + }, + { + "epoch": 0.86, + "grad_norm": 4.68146047091333, + "learning_rate": 9.123425473238253e-06, + "loss": 0.1194, + "step": 1209 + }, + { + "epoch": 0.86, + "grad_norm": 10.479614467784797, + "learning_rate": 9.121790138489564e-06, + "loss": 0.1483, + "step": 1210 + }, + { + "epoch": 0.86, + "grad_norm": 8.88151888755877, + "learning_rate": 9.120153426574177e-06, + "loss": 0.1454, + "step": 1211 + }, + { + "epoch": 0.87, + "grad_norm": 21.761167483206385, + "learning_rate": 9.118515338038947e-06, + "loss": 0.1039, + "step": 1212 + }, + { + "epoch": 0.87, + "grad_norm": 15.054311022200379, + "learning_rate": 9.11687587343119e-06, + "loss": 0.127, + "step": 1213 + }, + { + "epoch": 0.87, + "grad_norm": 4.715530741583724, + "learning_rate": 9.115235033298682e-06, + "loss": 0.1182, + "step": 1214 + }, + { + "epoch": 0.87, + "grad_norm": 6.72131217945236, + "learning_rate": 9.113592818189661e-06, + "loss": 0.1331, + "step": 1215 + }, + { + "epoch": 0.87, + "grad_norm": 7.593220467655973, + "learning_rate": 9.111949228652816e-06, + "loss": 0.1128, + "step": 1216 + }, + { + "epoch": 0.87, + "grad_norm": 4.570725755965095, + "learning_rate": 9.110304265237304e-06, + "loss": 0.1183, + "step": 1217 + }, + { + "epoch": 0.87, + "grad_norm": 17.867725431027747, + "learning_rate": 9.10865792849274e-06, + "loss": 0.1313, + "step": 1218 + }, + { + "epoch": 0.87, + "grad_norm": 41.74139211185614, + "learning_rate": 9.107010218969191e-06, + "loss": 0.1473, + "step": 1219 + }, + { + "epoch": 0.87, + "grad_norm": 13.698548574340123, + "learning_rate": 9.10536113721719e-06, + "loss": 0.1401, + "step": 1220 + }, + { + "epoch": 0.87, + "grad_norm": 56.22024108773497, + "learning_rate": 9.103710683787728e-06, + "loss": 0.129, + "step": 1221 + }, + { + "epoch": 0.87, + "grad_norm": 10.143381456660101, + "learning_rate": 9.102058859232247e-06, + "loss": 0.1459, + "step": 1222 + }, + { + "epoch": 0.87, + "grad_norm": 8.58414656315145, + "learning_rate": 9.100405664102656e-06, + "loss": 0.1403, + "step": 1223 + }, + { + "epoch": 0.87, + "grad_norm": 33.73692819416245, + "learning_rate": 9.098751098951317e-06, + "loss": 0.1604, + "step": 1224 + }, + { + "epoch": 0.87, + "grad_norm": 17.199990598047965, + "learning_rate": 9.09709516433105e-06, + "loss": 0.1375, + "step": 1225 + }, + { + "epoch": 0.88, + "grad_norm": 15.42145961364144, + "learning_rate": 9.095437860795138e-06, + "loss": 0.1305, + "step": 1226 + }, + { + "epoch": 0.88, + "grad_norm": 36.82138829966084, + "learning_rate": 9.09377918889731e-06, + "loss": 0.1387, + "step": 1227 + }, + { + "epoch": 0.88, + "grad_norm": 4.248193826270128, + "learning_rate": 9.092119149191765e-06, + "loss": 0.1298, + "step": 1228 + }, + { + "epoch": 0.88, + "grad_norm": 7.168241999958529, + "learning_rate": 9.090457742233152e-06, + "loss": 0.0919, + "step": 1229 + }, + { + "epoch": 0.88, + "grad_norm": 4.413448398731609, + "learning_rate": 9.088794968576575e-06, + "loss": 0.1368, + "step": 1230 + }, + { + "epoch": 0.88, + "grad_norm": 10.297298848033256, + "learning_rate": 9.087130828777598e-06, + "loss": 0.1672, + "step": 1231 + }, + { + "epoch": 0.88, + "grad_norm": 3.995001961847258, + "learning_rate": 9.085465323392243e-06, + "loss": 0.1167, + "step": 1232 + }, + { + "epoch": 0.88, + "grad_norm": 5.999725996925589, + "learning_rate": 9.083798452976988e-06, + "loss": 0.1295, + "step": 1233 + }, + { + "epoch": 0.88, + "grad_norm": 20.375505744587326, + "learning_rate": 9.082130218088762e-06, + "loss": 0.1641, + "step": 1234 + }, + { + "epoch": 0.88, + "grad_norm": 3.932899862057339, + "learning_rate": 9.080460619284954e-06, + "loss": 0.1132, + "step": 1235 + }, + { + "epoch": 0.88, + "grad_norm": 27.645263230888336, + "learning_rate": 9.07878965712341e-06, + "loss": 0.1514, + "step": 1236 + }, + { + "epoch": 0.88, + "grad_norm": 10.21802883875594, + "learning_rate": 9.077117332162427e-06, + "loss": 0.1427, + "step": 1237 + }, + { + "epoch": 0.88, + "grad_norm": 18.322663661950923, + "learning_rate": 9.075443644960761e-06, + "loss": 0.1166, + "step": 1238 + }, + { + "epoch": 0.88, + "grad_norm": 4.316841214679732, + "learning_rate": 9.07376859607762e-06, + "loss": 0.0997, + "step": 1239 + }, + { + "epoch": 0.89, + "grad_norm": 6.244511662257982, + "learning_rate": 9.072092186072675e-06, + "loss": 0.1416, + "step": 1240 + }, + { + "epoch": 0.89, + "grad_norm": 19.860679127095636, + "learning_rate": 9.070414415506038e-06, + "loss": 0.1356, + "step": 1241 + }, + { + "epoch": 0.89, + "grad_norm": 10.605522811610166, + "learning_rate": 9.068735284938288e-06, + "loss": 0.1052, + "step": 1242 + }, + { + "epoch": 0.89, + "grad_norm": 17.842863169949126, + "learning_rate": 9.067054794930452e-06, + "loss": 0.1169, + "step": 1243 + }, + { + "epoch": 0.89, + "grad_norm": 8.718865016595677, + "learning_rate": 9.065372946044014e-06, + "loss": 0.1428, + "step": 1244 + }, + { + "epoch": 0.89, + "grad_norm": 20.576896857972773, + "learning_rate": 9.063689738840911e-06, + "loss": 0.1407, + "step": 1245 + }, + { + "epoch": 0.89, + "grad_norm": 7.3907765645746935, + "learning_rate": 9.06200517388353e-06, + "loss": 0.1586, + "step": 1246 + }, + { + "epoch": 0.89, + "grad_norm": 4.412682061788436, + "learning_rate": 9.060319251734723e-06, + "loss": 0.1168, + "step": 1247 + }, + { + "epoch": 0.89, + "grad_norm": 9.216748606683582, + "learning_rate": 9.058631972957783e-06, + "loss": 0.179, + "step": 1248 + }, + { + "epoch": 0.89, + "grad_norm": 13.251080346982283, + "learning_rate": 9.056943338116461e-06, + "loss": 0.1057, + "step": 1249 + }, + { + "epoch": 0.89, + "grad_norm": 13.556345489969091, + "learning_rate": 9.055253347774961e-06, + "loss": 0.1252, + "step": 1250 + }, + { + "epoch": 0.89, + "grad_norm": 8.70070366178762, + "learning_rate": 9.053562002497943e-06, + "loss": 0.121, + "step": 1251 + }, + { + "epoch": 0.89, + "grad_norm": 4.542057911871553, + "learning_rate": 9.051869302850515e-06, + "loss": 0.0962, + "step": 1252 + }, + { + "epoch": 0.89, + "grad_norm": 16.718582711323283, + "learning_rate": 9.05017524939824e-06, + "loss": 0.1232, + "step": 1253 + }, + { + "epoch": 0.9, + "grad_norm": 13.719584747865103, + "learning_rate": 9.048479842707132e-06, + "loss": 0.1292, + "step": 1254 + }, + { + "epoch": 0.9, + "grad_norm": 3.570019714838347, + "learning_rate": 9.046783083343657e-06, + "loss": 0.0828, + "step": 1255 + }, + { + "epoch": 0.9, + "grad_norm": 8.629289413960565, + "learning_rate": 9.045084971874738e-06, + "loss": 0.1039, + "step": 1256 + }, + { + "epoch": 0.9, + "grad_norm": 6.7497642996638705, + "learning_rate": 9.043385508867741e-06, + "loss": 0.0898, + "step": 1257 + }, + { + "epoch": 0.9, + "grad_norm": 20.039832970250274, + "learning_rate": 9.041684694890492e-06, + "loss": 0.1107, + "step": 1258 + }, + { + "epoch": 0.9, + "grad_norm": 15.178772636850542, + "learning_rate": 9.03998253051126e-06, + "loss": 0.1654, + "step": 1259 + }, + { + "epoch": 0.9, + "grad_norm": 27.037718152114405, + "learning_rate": 9.038279016298773e-06, + "loss": 0.1035, + "step": 1260 + }, + { + "epoch": 0.9, + "grad_norm": 28.49390797177503, + "learning_rate": 9.036574152822206e-06, + "loss": 0.1362, + "step": 1261 + }, + { + "epoch": 0.9, + "grad_norm": 8.695671686306866, + "learning_rate": 9.034867940651186e-06, + "loss": 0.1486, + "step": 1262 + }, + { + "epoch": 0.9, + "grad_norm": 7.332210434139537, + "learning_rate": 9.033160380355789e-06, + "loss": 0.1077, + "step": 1263 + }, + { + "epoch": 0.9, + "grad_norm": 4.95118182352966, + "learning_rate": 9.031451472506544e-06, + "loss": 0.1095, + "step": 1264 + }, + { + "epoch": 0.9, + "grad_norm": 8.219397289040563, + "learning_rate": 9.029741217674428e-06, + "loss": 0.1373, + "step": 1265 + }, + { + "epoch": 0.9, + "grad_norm": 28.958428037574063, + "learning_rate": 9.02802961643087e-06, + "loss": 0.1364, + "step": 1266 + }, + { + "epoch": 0.9, + "grad_norm": 27.25383513627309, + "learning_rate": 9.026316669347747e-06, + "loss": 0.1401, + "step": 1267 + }, + { + "epoch": 0.91, + "grad_norm": 7.0655709620800975, + "learning_rate": 9.024602376997387e-06, + "loss": 0.1539, + "step": 1268 + }, + { + "epoch": 0.91, + "grad_norm": 45.88384945997455, + "learning_rate": 9.022886739952565e-06, + "loss": 0.1285, + "step": 1269 + }, + { + "epoch": 0.91, + "grad_norm": 26.66446887230687, + "learning_rate": 9.02116975878651e-06, + "loss": 0.1138, + "step": 1270 + }, + { + "epoch": 0.91, + "grad_norm": 9.862633449690026, + "learning_rate": 9.019451434072894e-06, + "loss": 0.1599, + "step": 1271 + }, + { + "epoch": 0.91, + "grad_norm": 12.445395355203088, + "learning_rate": 9.017731766385844e-06, + "loss": 0.1327, + "step": 1272 + }, + { + "epoch": 0.91, + "grad_norm": 9.717224738562036, + "learning_rate": 9.016010756299934e-06, + "loss": 0.1062, + "step": 1273 + }, + { + "epoch": 0.91, + "grad_norm": 33.84566034930815, + "learning_rate": 9.014288404390182e-06, + "loss": 0.1376, + "step": 1274 + }, + { + "epoch": 0.91, + "grad_norm": 17.609479593872077, + "learning_rate": 9.012564711232059e-06, + "loss": 0.1116, + "step": 1275 + }, + { + "epoch": 0.91, + "grad_norm": 5.705629267795132, + "learning_rate": 9.010839677401484e-06, + "loss": 0.1307, + "step": 1276 + }, + { + "epoch": 0.91, + "grad_norm": 3.4202600275574486, + "learning_rate": 9.009113303474822e-06, + "loss": 0.0847, + "step": 1277 + }, + { + "epoch": 0.91, + "grad_norm": 27.46790309202585, + "learning_rate": 9.007385590028887e-06, + "loss": 0.1199, + "step": 1278 + }, + { + "epoch": 0.91, + "grad_norm": 19.134070635982145, + "learning_rate": 9.005656537640942e-06, + "loss": 0.1385, + "step": 1279 + }, + { + "epoch": 0.91, + "grad_norm": 4.867626401898946, + "learning_rate": 9.003926146888691e-06, + "loss": 0.098, + "step": 1280 + }, + { + "epoch": 0.91, + "grad_norm": 12.301604393023416, + "learning_rate": 9.002194418350291e-06, + "loss": 0.1766, + "step": 1281 + }, + { + "epoch": 0.92, + "grad_norm": 13.47168882178934, + "learning_rate": 9.000461352604349e-06, + "loss": 0.1528, + "step": 1282 + }, + { + "epoch": 0.92, + "grad_norm": 20.94266276844876, + "learning_rate": 8.99872695022991e-06, + "loss": 0.13, + "step": 1283 + }, + { + "epoch": 0.92, + "grad_norm": 7.568146188671397, + "learning_rate": 8.996991211806471e-06, + "loss": 0.1041, + "step": 1284 + }, + { + "epoch": 0.92, + "grad_norm": 19.767793790726582, + "learning_rate": 8.995254137913977e-06, + "loss": 0.1321, + "step": 1285 + }, + { + "epoch": 0.92, + "grad_norm": 5.925767904518081, + "learning_rate": 8.99351572913281e-06, + "loss": 0.1204, + "step": 1286 + }, + { + "epoch": 0.92, + "grad_norm": 17.665286809508974, + "learning_rate": 8.991775986043814e-06, + "loss": 0.0851, + "step": 1287 + }, + { + "epoch": 0.92, + "grad_norm": 15.691280452120386, + "learning_rate": 8.990034909228262e-06, + "loss": 0.1456, + "step": 1288 + }, + { + "epoch": 0.92, + "grad_norm": 10.619640637451463, + "learning_rate": 8.988292499267885e-06, + "loss": 0.0916, + "step": 1289 + }, + { + "epoch": 0.92, + "grad_norm": 39.48387258198995, + "learning_rate": 8.986548756744852e-06, + "loss": 0.1195, + "step": 1290 + }, + { + "epoch": 0.92, + "grad_norm": 8.63610318578367, + "learning_rate": 8.98480368224178e-06, + "loss": 0.1492, + "step": 1291 + }, + { + "epoch": 0.92, + "grad_norm": 20.03898408603859, + "learning_rate": 8.98305727634173e-06, + "loss": 0.1219, + "step": 1292 + }, + { + "epoch": 0.92, + "grad_norm": 7.857773381391331, + "learning_rate": 8.981309539628212e-06, + "loss": 0.132, + "step": 1293 + }, + { + "epoch": 0.92, + "grad_norm": 9.80086758659809, + "learning_rate": 8.979560472685174e-06, + "loss": 0.1019, + "step": 1294 + }, + { + "epoch": 0.92, + "grad_norm": 6.467420571204822, + "learning_rate": 8.977810076097013e-06, + "loss": 0.0927, + "step": 1295 + }, + { + "epoch": 0.93, + "grad_norm": 10.798830013093253, + "learning_rate": 8.97605835044857e-06, + "loss": 0.1462, + "step": 1296 + }, + { + "epoch": 0.93, + "grad_norm": 3.3782928073977945, + "learning_rate": 8.974305296325125e-06, + "loss": 0.0842, + "step": 1297 + }, + { + "epoch": 0.93, + "grad_norm": 8.272978733091367, + "learning_rate": 8.97255091431241e-06, + "loss": 0.1271, + "step": 1298 + }, + { + "epoch": 0.93, + "grad_norm": 16.81966484700589, + "learning_rate": 8.970795204996597e-06, + "loss": 0.1229, + "step": 1299 + }, + { + "epoch": 0.93, + "grad_norm": 8.739832847910312, + "learning_rate": 8.969038168964298e-06, + "loss": 0.1266, + "step": 1300 + }, + { + "epoch": 0.93, + "grad_norm": 9.173928247267657, + "learning_rate": 8.967279806802576e-06, + "loss": 0.1152, + "step": 1301 + }, + { + "epoch": 0.93, + "grad_norm": 22.14994655091031, + "learning_rate": 8.965520119098926e-06, + "loss": 0.1097, + "step": 1302 + }, + { + "epoch": 0.93, + "grad_norm": 54.5742266393221, + "learning_rate": 8.9637591064413e-06, + "loss": 0.1621, + "step": 1303 + }, + { + "epoch": 0.93, + "grad_norm": 12.6238048135336, + "learning_rate": 8.961996769418077e-06, + "loss": 0.0958, + "step": 1304 + }, + { + "epoch": 0.93, + "grad_norm": 29.088793120301332, + "learning_rate": 8.960233108618092e-06, + "loss": 0.1401, + "step": 1305 + }, + { + "epoch": 0.93, + "grad_norm": 51.52087793146708, + "learning_rate": 8.958468124630617e-06, + "loss": 0.1482, + "step": 1306 + }, + { + "epoch": 0.93, + "grad_norm": 26.746196577981273, + "learning_rate": 8.956701818045363e-06, + "loss": 0.1227, + "step": 1307 + }, + { + "epoch": 0.93, + "grad_norm": 35.98626766193302, + "learning_rate": 8.954934189452489e-06, + "loss": 0.1271, + "step": 1308 + }, + { + "epoch": 0.93, + "grad_norm": 10.706810349285421, + "learning_rate": 8.953165239442589e-06, + "loss": 0.1436, + "step": 1309 + }, + { + "epoch": 0.94, + "grad_norm": 17.689232861773036, + "learning_rate": 8.951394968606704e-06, + "loss": 0.1155, + "step": 1310 + }, + { + "epoch": 0.94, + "grad_norm": 7.4873562096415744, + "learning_rate": 8.949623377536314e-06, + "loss": 0.1095, + "step": 1311 + }, + { + "epoch": 0.94, + "grad_norm": 8.977404966444077, + "learning_rate": 8.947850466823343e-06, + "loss": 0.0917, + "step": 1312 + }, + { + "epoch": 0.94, + "grad_norm": 8.150139134114927, + "learning_rate": 8.946076237060148e-06, + "loss": 0.1312, + "step": 1313 + }, + { + "epoch": 0.94, + "grad_norm": 9.547066209117817, + "learning_rate": 8.944300688839538e-06, + "loss": 0.1211, + "step": 1314 + }, + { + "epoch": 0.94, + "grad_norm": 6.574430114106702, + "learning_rate": 8.942523822754751e-06, + "loss": 0.1184, + "step": 1315 + }, + { + "epoch": 0.94, + "grad_norm": 13.767237626071879, + "learning_rate": 8.940745639399477e-06, + "loss": 0.1439, + "step": 1316 + }, + { + "epoch": 0.94, + "grad_norm": 18.217321006016, + "learning_rate": 8.938966139367837e-06, + "loss": 0.124, + "step": 1317 + }, + { + "epoch": 0.94, + "grad_norm": 5.12123329875439, + "learning_rate": 8.937185323254395e-06, + "loss": 0.0945, + "step": 1318 + }, + { + "epoch": 0.94, + "grad_norm": 20.992000622397523, + "learning_rate": 8.935403191654155e-06, + "loss": 0.0821, + "step": 1319 + }, + { + "epoch": 0.94, + "grad_norm": 20.121514370471925, + "learning_rate": 8.933619745162559e-06, + "loss": 0.1375, + "step": 1320 + }, + { + "epoch": 0.94, + "grad_norm": 19.672716362914535, + "learning_rate": 8.931834984375492e-06, + "loss": 0.1133, + "step": 1321 + }, + { + "epoch": 0.94, + "grad_norm": 8.854511996956678, + "learning_rate": 8.930048909889272e-06, + "loss": 0.1428, + "step": 1322 + }, + { + "epoch": 0.94, + "grad_norm": 11.981569840262182, + "learning_rate": 8.928261522300665e-06, + "loss": 0.0844, + "step": 1323 + }, + { + "epoch": 0.95, + "grad_norm": 20.95789494137274, + "learning_rate": 8.926472822206869e-06, + "loss": 0.1074, + "step": 1324 + }, + { + "epoch": 0.95, + "grad_norm": 9.085896564148001, + "learning_rate": 8.924682810205519e-06, + "loss": 0.1525, + "step": 1325 + }, + { + "epoch": 0.95, + "grad_norm": 5.93521143384227, + "learning_rate": 8.922891486894692e-06, + "loss": 0.1149, + "step": 1326 + }, + { + "epoch": 0.95, + "grad_norm": 9.576240348621916, + "learning_rate": 8.921098852872904e-06, + "loss": 0.123, + "step": 1327 + }, + { + "epoch": 0.95, + "grad_norm": 6.371219220183408, + "learning_rate": 8.919304908739106e-06, + "loss": 0.1293, + "step": 1328 + }, + { + "epoch": 0.95, + "grad_norm": 6.67584580080675, + "learning_rate": 8.917509655092691e-06, + "loss": 0.1666, + "step": 1329 + }, + { + "epoch": 0.95, + "grad_norm": 4.493405850966914, + "learning_rate": 8.915713092533483e-06, + "loss": 0.1056, + "step": 1330 + }, + { + "epoch": 0.95, + "grad_norm": 13.499498266925373, + "learning_rate": 8.913915221661748e-06, + "loss": 0.1012, + "step": 1331 + }, + { + "epoch": 0.95, + "grad_norm": 30.9937070736886, + "learning_rate": 8.912116043078188e-06, + "loss": 0.1466, + "step": 1332 + }, + { + "epoch": 0.95, + "grad_norm": 14.4905788898255, + "learning_rate": 8.910315557383944e-06, + "loss": 0.1387, + "step": 1333 + }, + { + "epoch": 0.95, + "grad_norm": 8.442340648313488, + "learning_rate": 8.90851376518059e-06, + "loss": 0.1321, + "step": 1334 + }, + { + "epoch": 0.95, + "grad_norm": 19.692706543472003, + "learning_rate": 8.906710667070136e-06, + "loss": 0.1663, + "step": 1335 + }, + { + "epoch": 0.95, + "grad_norm": 7.236348097631177, + "learning_rate": 8.904906263655036e-06, + "loss": 0.1521, + "step": 1336 + }, + { + "epoch": 0.95, + "grad_norm": 12.189125432366566, + "learning_rate": 8.903100555538169e-06, + "loss": 0.1282, + "step": 1337 + }, + { + "epoch": 0.96, + "grad_norm": 10.711104406960347, + "learning_rate": 8.90129354332286e-06, + "loss": 0.146, + "step": 1338 + }, + { + "epoch": 0.96, + "grad_norm": 33.786095997393936, + "learning_rate": 8.899485227612865e-06, + "loss": 0.1194, + "step": 1339 + }, + { + "epoch": 0.96, + "grad_norm": 5.881201085589049, + "learning_rate": 8.897675609012372e-06, + "loss": 0.1199, + "step": 1340 + }, + { + "epoch": 0.96, + "grad_norm": 5.300747945263061, + "learning_rate": 8.895864688126013e-06, + "loss": 0.0984, + "step": 1341 + }, + { + "epoch": 0.96, + "grad_norm": 19.177339508083687, + "learning_rate": 8.894052465558846e-06, + "loss": 0.0996, + "step": 1342 + }, + { + "epoch": 0.96, + "grad_norm": 10.531710013911395, + "learning_rate": 8.892238941916372e-06, + "loss": 0.1389, + "step": 1343 + }, + { + "epoch": 0.96, + "grad_norm": 8.131493938719796, + "learning_rate": 8.890424117804522e-06, + "loss": 0.1129, + "step": 1344 + }, + { + "epoch": 0.96, + "grad_norm": 12.104920527712904, + "learning_rate": 8.88860799382966e-06, + "loss": 0.1772, + "step": 1345 + }, + { + "epoch": 0.96, + "grad_norm": 22.119517879117527, + "learning_rate": 8.88679057059859e-06, + "loss": 0.1145, + "step": 1346 + }, + { + "epoch": 0.96, + "grad_norm": 4.741445859808426, + "learning_rate": 8.884971848718544e-06, + "loss": 0.1284, + "step": 1347 + }, + { + "epoch": 0.96, + "grad_norm": 4.598276011199355, + "learning_rate": 8.883151828797194e-06, + "loss": 0.1213, + "step": 1348 + }, + { + "epoch": 0.96, + "grad_norm": 19.888338261433105, + "learning_rate": 8.88133051144264e-06, + "loss": 0.1331, + "step": 1349 + }, + { + "epoch": 0.96, + "grad_norm": 26.720524373083247, + "learning_rate": 8.87950789726342e-06, + "loss": 0.1427, + "step": 1350 + }, + { + "epoch": 0.96, + "grad_norm": 9.296969295372449, + "learning_rate": 8.8776839868685e-06, + "loss": 0.1285, + "step": 1351 + }, + { + "epoch": 0.97, + "grad_norm": 6.519639013034927, + "learning_rate": 8.875858780867286e-06, + "loss": 0.1084, + "step": 1352 + }, + { + "epoch": 0.97, + "grad_norm": 5.140053963785225, + "learning_rate": 8.87403227986961e-06, + "loss": 0.1273, + "step": 1353 + }, + { + "epoch": 0.97, + "grad_norm": 6.666680845030289, + "learning_rate": 8.872204484485743e-06, + "loss": 0.1301, + "step": 1354 + }, + { + "epoch": 0.97, + "grad_norm": 32.518052200093564, + "learning_rate": 8.870375395326384e-06, + "loss": 0.1344, + "step": 1355 + }, + { + "epoch": 0.97, + "grad_norm": 24.980724053437154, + "learning_rate": 8.868545013002665e-06, + "loss": 0.1077, + "step": 1356 + }, + { + "epoch": 0.97, + "grad_norm": 15.549275682764785, + "learning_rate": 8.866713338126152e-06, + "loss": 0.13, + "step": 1357 + }, + { + "epoch": 0.97, + "grad_norm": 10.581867589291187, + "learning_rate": 8.86488037130884e-06, + "loss": 0.13, + "step": 1358 + }, + { + "epoch": 0.97, + "grad_norm": 13.682883003986115, + "learning_rate": 8.863046113163158e-06, + "loss": 0.0698, + "step": 1359 + }, + { + "epoch": 0.97, + "grad_norm": 67.34606591529005, + "learning_rate": 8.861210564301967e-06, + "loss": 0.2075, + "step": 1360 + }, + { + "epoch": 0.97, + "grad_norm": 16.000577042347064, + "learning_rate": 8.859373725338558e-06, + "loss": 0.1465, + "step": 1361 + }, + { + "epoch": 0.97, + "grad_norm": 5.734984778309666, + "learning_rate": 8.857535596886652e-06, + "loss": 0.1035, + "step": 1362 + }, + { + "epoch": 0.97, + "grad_norm": 19.14624542885837, + "learning_rate": 8.855696179560402e-06, + "loss": 0.1437, + "step": 1363 + }, + { + "epoch": 0.97, + "grad_norm": 72.76158246694781, + "learning_rate": 8.85385547397439e-06, + "loss": 0.1649, + "step": 1364 + }, + { + "epoch": 0.97, + "grad_norm": 6.2260099295276845, + "learning_rate": 8.852013480743632e-06, + "loss": 0.1321, + "step": 1365 + }, + { + "epoch": 0.98, + "grad_norm": 9.131312974812547, + "learning_rate": 8.850170200483573e-06, + "loss": 0.1362, + "step": 1366 + }, + { + "epoch": 0.98, + "grad_norm": 11.780427057540104, + "learning_rate": 8.848325633810083e-06, + "loss": 0.0975, + "step": 1367 + }, + { + "epoch": 0.98, + "grad_norm": 38.41930656679251, + "learning_rate": 8.84647978133947e-06, + "loss": 0.1438, + "step": 1368 + }, + { + "epoch": 0.98, + "grad_norm": 31.457034993977672, + "learning_rate": 8.844632643688467e-06, + "loss": 0.1525, + "step": 1369 + }, + { + "epoch": 0.98, + "grad_norm": 9.017217356990798, + "learning_rate": 8.842784221474237e-06, + "loss": 0.1115, + "step": 1370 + }, + { + "epoch": 0.98, + "grad_norm": 15.961687118243042, + "learning_rate": 8.840934515314372e-06, + "loss": 0.173, + "step": 1371 + }, + { + "epoch": 0.98, + "grad_norm": 35.13831841169741, + "learning_rate": 8.839083525826893e-06, + "loss": 0.1478, + "step": 1372 + }, + { + "epoch": 0.98, + "grad_norm": 56.57243122107551, + "learning_rate": 8.837231253630247e-06, + "loss": 0.1528, + "step": 1373 + }, + { + "epoch": 0.98, + "grad_norm": 6.078243066957646, + "learning_rate": 8.835377699343318e-06, + "loss": 0.1129, + "step": 1374 + }, + { + "epoch": 0.98, + "grad_norm": 4.098116188214989, + "learning_rate": 8.83352286358541e-06, + "loss": 0.1077, + "step": 1375 + }, + { + "epoch": 0.98, + "grad_norm": 27.118006043141744, + "learning_rate": 8.83166674697626e-06, + "loss": 0.1112, + "step": 1376 + }, + { + "epoch": 0.98, + "grad_norm": 35.911096298188546, + "learning_rate": 8.829809350136027e-06, + "loss": 0.1365, + "step": 1377 + }, + { + "epoch": 0.98, + "grad_norm": 25.339967860287878, + "learning_rate": 8.827950673685306e-06, + "loss": 0.1319, + "step": 1378 + }, + { + "epoch": 0.98, + "grad_norm": 3.559736283305232, + "learning_rate": 8.826090718245112e-06, + "loss": 0.1271, + "step": 1379 + }, + { + "epoch": 0.99, + "grad_norm": 12.481495463900032, + "learning_rate": 8.824229484436894e-06, + "loss": 0.1123, + "step": 1380 + }, + { + "epoch": 0.99, + "grad_norm": 6.710361054334273, + "learning_rate": 8.822366972882523e-06, + "loss": 0.1602, + "step": 1381 + }, + { + "epoch": 0.99, + "grad_norm": 46.045622161560836, + "learning_rate": 8.820503184204299e-06, + "loss": 0.1102, + "step": 1382 + }, + { + "epoch": 0.99, + "grad_norm": 57.381890629821335, + "learning_rate": 8.818638119024949e-06, + "loss": 0.1418, + "step": 1383 + }, + { + "epoch": 0.99, + "grad_norm": 32.56557318410978, + "learning_rate": 8.816771777967623e-06, + "loss": 0.1338, + "step": 1384 + }, + { + "epoch": 0.99, + "grad_norm": 59.30261725642934, + "learning_rate": 8.814904161655904e-06, + "loss": 0.1843, + "step": 1385 + }, + { + "epoch": 0.99, + "grad_norm": 55.152942998068376, + "learning_rate": 8.813035270713796e-06, + "loss": 0.144, + "step": 1386 + }, + { + "epoch": 0.99, + "grad_norm": 54.60066261642631, + "learning_rate": 8.811165105765732e-06, + "loss": 0.14, + "step": 1387 + }, + { + "epoch": 0.99, + "grad_norm": 5.534246320273303, + "learning_rate": 8.809293667436565e-06, + "loss": 0.124, + "step": 1388 + }, + { + "epoch": 0.99, + "grad_norm": 25.51275813460598, + "learning_rate": 8.80742095635158e-06, + "loss": 0.1329, + "step": 1389 + }, + { + "epoch": 0.99, + "grad_norm": 41.23937073867847, + "learning_rate": 8.805546973136481e-06, + "loss": 0.139, + "step": 1390 + }, + { + "epoch": 0.99, + "grad_norm": 44.99322436803483, + "learning_rate": 8.803671718417407e-06, + "loss": 0.1451, + "step": 1391 + }, + { + "epoch": 0.99, + "grad_norm": 19.302534314720663, + "learning_rate": 8.80179519282091e-06, + "loss": 0.1173, + "step": 1392 + }, + { + "epoch": 0.99, + "grad_norm": 4.692753538243162, + "learning_rate": 8.799917396973976e-06, + "loss": 0.0931, + "step": 1393 + }, + { + "epoch": 1.0, + "grad_norm": 55.478799678782806, + "learning_rate": 8.798038331504008e-06, + "loss": 0.1469, + "step": 1394 + }, + { + "epoch": 1.0, + "grad_norm": 51.060808138084866, + "learning_rate": 8.79615799703884e-06, + "loss": 0.1503, + "step": 1395 + }, + { + "epoch": 1.0, + "grad_norm": 18.71241704458466, + "learning_rate": 8.794276394206722e-06, + "loss": 0.1154, + "step": 1396 + }, + { + "epoch": 1.0, + "grad_norm": 20.47077129185746, + "learning_rate": 8.792393523636337e-06, + "loss": 0.1069, + "step": 1397 + }, + { + "epoch": 1.0, + "grad_norm": 50.377448768013004, + "learning_rate": 8.790509385956784e-06, + "loss": 0.1875, + "step": 1398 + }, + { + "epoch": 1.0, + "grad_norm": 24.38834863041375, + "learning_rate": 8.788623981797592e-06, + "loss": 0.145, + "step": 1399 + }, + { + "epoch": 1.0, + "grad_norm": 18.459905174452402, + "learning_rate": 8.786737311788708e-06, + "loss": 0.1196, + "step": 1400 + }, + { + "epoch": 1.0, + "grad_norm": 3.4253124318053634, + "learning_rate": 8.784849376560503e-06, + "loss": 0.0878, + "step": 1401 + }, + { + "epoch": 1.0, + "grad_norm": 32.20871321310459, + "learning_rate": 8.78296017674377e-06, + "loss": 0.0911, + "step": 1402 + }, + { + "epoch": 1.0, + "grad_norm": 29.868559521129765, + "learning_rate": 8.781069712969726e-06, + "loss": 0.0909, + "step": 1403 + }, + { + "epoch": 1.0, + "grad_norm": 28.47627393710747, + "learning_rate": 8.779177985870012e-06, + "loss": 0.0869, + "step": 1404 + }, + { + "epoch": 1.0, + "grad_norm": 4.353515032854893, + "learning_rate": 8.77728499607669e-06, + "loss": 0.0629, + "step": 1405 + }, + { + "epoch": 1.0, + "grad_norm": 23.98867948652184, + "learning_rate": 8.775390744222238e-06, + "loss": 0.1105, + "step": 1406 + }, + { + "epoch": 1.0, + "grad_norm": 20.72490090675031, + "learning_rate": 8.773495230939567e-06, + "loss": 0.0758, + "step": 1407 + }, + { + "epoch": 1.0, + "grad_norm": 28.331372328309556, + "learning_rate": 8.771598456861998e-06, + "loss": 0.0789, + "step": 1408 + }, + { + "epoch": 1.01, + "grad_norm": 6.026063050859235, + "learning_rate": 8.769700422623283e-06, + "loss": 0.0461, + "step": 1409 + }, + { + "epoch": 1.01, + "grad_norm": 26.292659193113025, + "learning_rate": 8.767801128857588e-06, + "loss": 0.1025, + "step": 1410 + }, + { + "epoch": 1.01, + "grad_norm": 7.794861855473636, + "learning_rate": 8.765900576199502e-06, + "loss": 0.0879, + "step": 1411 + }, + { + "epoch": 1.01, + "grad_norm": 18.682338368358316, + "learning_rate": 8.763998765284036e-06, + "loss": 0.0944, + "step": 1412 + }, + { + "epoch": 1.01, + "grad_norm": 11.412070873783431, + "learning_rate": 8.76209569674662e-06, + "loss": 0.0685, + "step": 1413 + }, + { + "epoch": 1.01, + "grad_norm": 10.884099960069042, + "learning_rate": 8.760191371223104e-06, + "loss": 0.0643, + "step": 1414 + }, + { + "epoch": 1.01, + "grad_norm": 9.034014941112474, + "learning_rate": 8.758285789349759e-06, + "loss": 0.1039, + "step": 1415 + }, + { + "epoch": 1.01, + "grad_norm": 7.709293674256186, + "learning_rate": 8.756378951763277e-06, + "loss": 0.093, + "step": 1416 + }, + { + "epoch": 1.01, + "grad_norm": 4.1454163736142275, + "learning_rate": 8.754470859100765e-06, + "loss": 0.0861, + "step": 1417 + }, + { + "epoch": 1.01, + "grad_norm": 17.634562346783152, + "learning_rate": 8.752561511999754e-06, + "loss": 0.0907, + "step": 1418 + }, + { + "epoch": 1.01, + "grad_norm": 5.054823596253129, + "learning_rate": 8.750650911098193e-06, + "loss": 0.0757, + "step": 1419 + }, + { + "epoch": 1.01, + "grad_norm": 10.241226207922915, + "learning_rate": 8.748739057034447e-06, + "loss": 0.081, + "step": 1420 + }, + { + "epoch": 1.01, + "grad_norm": 4.495722997870084, + "learning_rate": 8.746825950447302e-06, + "loss": 0.0734, + "step": 1421 + }, + { + "epoch": 1.01, + "grad_norm": 17.82904323610015, + "learning_rate": 8.744911591975967e-06, + "loss": 0.0651, + "step": 1422 + }, + { + "epoch": 1.02, + "grad_norm": 17.484241379617817, + "learning_rate": 8.742995982260059e-06, + "loss": 0.0819, + "step": 1423 + }, + { + "epoch": 1.02, + "grad_norm": 18.69316674886684, + "learning_rate": 8.741079121939621e-06, + "loss": 0.0981, + "step": 1424 + }, + { + "epoch": 1.02, + "grad_norm": 4.329418925647879, + "learning_rate": 8.739161011655113e-06, + "loss": 0.087, + "step": 1425 + }, + { + "epoch": 1.02, + "grad_norm": 4.751184813489952, + "learning_rate": 8.737241652047408e-06, + "loss": 0.086, + "step": 1426 + }, + { + "epoch": 1.02, + "grad_norm": 6.326549122631243, + "learning_rate": 8.735321043757805e-06, + "loss": 0.0751, + "step": 1427 + }, + { + "epoch": 1.02, + "grad_norm": 10.632712593121198, + "learning_rate": 8.73339918742801e-06, + "loss": 0.0657, + "step": 1428 + }, + { + "epoch": 1.02, + "grad_norm": 15.207265708492715, + "learning_rate": 8.731476083700154e-06, + "loss": 0.117, + "step": 1429 + }, + { + "epoch": 1.02, + "grad_norm": 11.850694276149138, + "learning_rate": 8.729551733216779e-06, + "loss": 0.0742, + "step": 1430 + }, + { + "epoch": 1.02, + "grad_norm": 12.467249641043043, + "learning_rate": 8.727626136620848e-06, + "loss": 0.0967, + "step": 1431 + }, + { + "epoch": 1.02, + "grad_norm": 16.67115883838518, + "learning_rate": 8.725699294555739e-06, + "loss": 0.0746, + "step": 1432 + }, + { + "epoch": 1.02, + "grad_norm": 21.221164427537694, + "learning_rate": 8.723771207665245e-06, + "loss": 0.0936, + "step": 1433 + }, + { + "epoch": 1.02, + "grad_norm": 3.9808876084963707, + "learning_rate": 8.721841876593576e-06, + "loss": 0.0742, + "step": 1434 + }, + { + "epoch": 1.02, + "grad_norm": 13.177543099730642, + "learning_rate": 8.719911301985355e-06, + "loss": 0.0726, + "step": 1435 + }, + { + "epoch": 1.02, + "grad_norm": 5.383309194526871, + "learning_rate": 8.717979484485628e-06, + "loss": 0.0803, + "step": 1436 + }, + { + "epoch": 1.03, + "grad_norm": 8.67535542961095, + "learning_rate": 8.716046424739845e-06, + "loss": 0.0964, + "step": 1437 + }, + { + "epoch": 1.03, + "grad_norm": 23.546000226981334, + "learning_rate": 8.714112123393882e-06, + "loss": 0.0906, + "step": 1438 + }, + { + "epoch": 1.03, + "grad_norm": 5.478616913562366, + "learning_rate": 8.712176581094025e-06, + "loss": 0.1, + "step": 1439 + }, + { + "epoch": 1.03, + "grad_norm": 12.072295746434312, + "learning_rate": 8.710239798486972e-06, + "loss": 0.1013, + "step": 1440 + }, + { + "epoch": 1.03, + "grad_norm": 6.156588542394663, + "learning_rate": 8.708301776219838e-06, + "loss": 0.0717, + "step": 1441 + }, + { + "epoch": 1.03, + "grad_norm": 7.03749840440953, + "learning_rate": 8.706362514940153e-06, + "loss": 0.071, + "step": 1442 + }, + { + "epoch": 1.03, + "grad_norm": 21.908593717607186, + "learning_rate": 8.704422015295861e-06, + "loss": 0.0908, + "step": 1443 + }, + { + "epoch": 1.03, + "grad_norm": 12.743381450833912, + "learning_rate": 8.702480277935319e-06, + "loss": 0.093, + "step": 1444 + }, + { + "epoch": 1.03, + "grad_norm": 12.237240562039307, + "learning_rate": 8.700537303507298e-06, + "loss": 0.0611, + "step": 1445 + }, + { + "epoch": 1.03, + "grad_norm": 31.205512369091654, + "learning_rate": 8.69859309266098e-06, + "loss": 0.1085, + "step": 1446 + }, + { + "epoch": 1.03, + "grad_norm": 6.508224635069914, + "learning_rate": 8.696647646045962e-06, + "loss": 0.0961, + "step": 1447 + }, + { + "epoch": 1.03, + "grad_norm": 6.020337116432563, + "learning_rate": 8.694700964312257e-06, + "loss": 0.1194, + "step": 1448 + }, + { + "epoch": 1.03, + "grad_norm": 11.85315501709236, + "learning_rate": 8.692753048110283e-06, + "loss": 0.1057, + "step": 1449 + }, + { + "epoch": 1.03, + "grad_norm": 16.76312648328666, + "learning_rate": 8.690803898090878e-06, + "loss": 0.0859, + "step": 1450 + }, + { + "epoch": 1.04, + "grad_norm": 18.181912184082726, + "learning_rate": 8.68885351490529e-06, + "loss": 0.0883, + "step": 1451 + }, + { + "epoch": 1.04, + "grad_norm": 15.561501662544323, + "learning_rate": 8.686901899205177e-06, + "loss": 0.0615, + "step": 1452 + }, + { + "epoch": 1.04, + "grad_norm": 10.141158740015651, + "learning_rate": 8.684949051642609e-06, + "loss": 0.0827, + "step": 1453 + }, + { + "epoch": 1.04, + "grad_norm": 5.131395886113488, + "learning_rate": 8.68299497287007e-06, + "loss": 0.0664, + "step": 1454 + }, + { + "epoch": 1.04, + "grad_norm": 35.18287088578419, + "learning_rate": 8.681039663540454e-06, + "loss": 0.0862, + "step": 1455 + }, + { + "epoch": 1.04, + "grad_norm": 6.000912763873758, + "learning_rate": 8.679083124307064e-06, + "loss": 0.0785, + "step": 1456 + }, + { + "epoch": 1.04, + "grad_norm": 3.954545314588621, + "learning_rate": 8.67712535582362e-06, + "loss": 0.0663, + "step": 1457 + }, + { + "epoch": 1.04, + "grad_norm": 16.363053312350825, + "learning_rate": 8.675166358744247e-06, + "loss": 0.0945, + "step": 1458 + }, + { + "epoch": 1.04, + "grad_norm": 32.23364016473821, + "learning_rate": 8.67320613372348e-06, + "loss": 0.1357, + "step": 1459 + }, + { + "epoch": 1.04, + "grad_norm": 19.06812434731879, + "learning_rate": 8.67124468141627e-06, + "loss": 0.0738, + "step": 1460 + }, + { + "epoch": 1.04, + "grad_norm": 9.336167974534424, + "learning_rate": 8.669282002477975e-06, + "loss": 0.116, + "step": 1461 + }, + { + "epoch": 1.04, + "grad_norm": 30.216375689397136, + "learning_rate": 8.66731809756436e-06, + "loss": 0.0871, + "step": 1462 + }, + { + "epoch": 1.04, + "grad_norm": 9.454651839939022, + "learning_rate": 8.665352967331604e-06, + "loss": 0.0981, + "step": 1463 + }, + { + "epoch": 1.04, + "grad_norm": 5.880572461820449, + "learning_rate": 8.66338661243629e-06, + "loss": 0.0784, + "step": 1464 + }, + { + "epoch": 1.05, + "grad_norm": 48.047477471559795, + "learning_rate": 8.661419033535419e-06, + "loss": 0.1255, + "step": 1465 + }, + { + "epoch": 1.05, + "grad_norm": 5.296796150731295, + "learning_rate": 8.659450231286392e-06, + "loss": 0.0761, + "step": 1466 + }, + { + "epoch": 1.05, + "grad_norm": 23.57114189695642, + "learning_rate": 8.657480206347024e-06, + "loss": 0.0864, + "step": 1467 + }, + { + "epoch": 1.05, + "grad_norm": 25.506822238745904, + "learning_rate": 8.655508959375536e-06, + "loss": 0.0881, + "step": 1468 + }, + { + "epoch": 1.05, + "grad_norm": 8.686210935110786, + "learning_rate": 8.653536491030559e-06, + "loss": 0.0715, + "step": 1469 + }, + { + "epoch": 1.05, + "grad_norm": 7.926723814269391, + "learning_rate": 8.651562801971131e-06, + "loss": 0.0723, + "step": 1470 + }, + { + "epoch": 1.05, + "grad_norm": 11.287008911087467, + "learning_rate": 8.649587892856698e-06, + "loss": 0.0819, + "step": 1471 + }, + { + "epoch": 1.05, + "grad_norm": 17.18659129208043, + "learning_rate": 8.647611764347114e-06, + "loss": 0.0782, + "step": 1472 + }, + { + "epoch": 1.05, + "grad_norm": 7.819540078505, + "learning_rate": 8.64563441710264e-06, + "loss": 0.1422, + "step": 1473 + }, + { + "epoch": 1.05, + "grad_norm": 17.558856807362183, + "learning_rate": 8.643655851783947e-06, + "loss": 0.0733, + "step": 1474 + }, + { + "epoch": 1.05, + "grad_norm": 16.51338622173928, + "learning_rate": 8.641676069052104e-06, + "loss": 0.0606, + "step": 1475 + }, + { + "epoch": 1.05, + "grad_norm": 7.477740828151368, + "learning_rate": 8.639695069568602e-06, + "loss": 0.0682, + "step": 1476 + }, + { + "epoch": 1.05, + "grad_norm": 14.973985830755504, + "learning_rate": 8.637712853995324e-06, + "loss": 0.105, + "step": 1477 + }, + { + "epoch": 1.05, + "grad_norm": 15.603054833838772, + "learning_rate": 8.635729422994566e-06, + "loss": 0.087, + "step": 1478 + }, + { + "epoch": 1.06, + "grad_norm": 6.290026085929057, + "learning_rate": 8.633744777229029e-06, + "loss": 0.0738, + "step": 1479 + }, + { + "epoch": 1.06, + "grad_norm": 3.7390029716415563, + "learning_rate": 8.63175891736182e-06, + "loss": 0.0673, + "step": 1480 + }, + { + "epoch": 1.06, + "grad_norm": 5.473545590164696, + "learning_rate": 8.629771844056452e-06, + "loss": 0.0686, + "step": 1481 + }, + { + "epoch": 1.06, + "grad_norm": 14.272289047677544, + "learning_rate": 8.627783557976846e-06, + "loss": 0.0627, + "step": 1482 + }, + { + "epoch": 1.06, + "grad_norm": 12.905709838650964, + "learning_rate": 8.62579405978732e-06, + "loss": 0.0811, + "step": 1483 + }, + { + "epoch": 1.06, + "grad_norm": 13.0467547255787, + "learning_rate": 8.623803350152606e-06, + "loss": 0.0734, + "step": 1484 + }, + { + "epoch": 1.06, + "grad_norm": 9.523481106503748, + "learning_rate": 8.621811429737837e-06, + "loss": 0.0906, + "step": 1485 + }, + { + "epoch": 1.06, + "grad_norm": 26.36033723371387, + "learning_rate": 8.619818299208548e-06, + "loss": 0.0687, + "step": 1486 + }, + { + "epoch": 1.06, + "grad_norm": 13.671703283276997, + "learning_rate": 8.617823959230683e-06, + "loss": 0.0829, + "step": 1487 + }, + { + "epoch": 1.06, + "grad_norm": 13.44761997098398, + "learning_rate": 8.615828410470589e-06, + "loss": 0.0809, + "step": 1488 + }, + { + "epoch": 1.06, + "grad_norm": 12.38686214081961, + "learning_rate": 8.613831653595013e-06, + "loss": 0.1403, + "step": 1489 + }, + { + "epoch": 1.06, + "grad_norm": 29.543403014919598, + "learning_rate": 8.61183368927111e-06, + "loss": 0.0684, + "step": 1490 + }, + { + "epoch": 1.06, + "grad_norm": 9.143382134106462, + "learning_rate": 8.609834518166439e-06, + "loss": 0.0809, + "step": 1491 + }, + { + "epoch": 1.06, + "grad_norm": 14.121857063675124, + "learning_rate": 8.607834140948958e-06, + "loss": 0.0721, + "step": 1492 + }, + { + "epoch": 1.07, + "grad_norm": 14.681563500765165, + "learning_rate": 8.60583255828703e-06, + "loss": 0.0598, + "step": 1493 + }, + { + "epoch": 1.07, + "grad_norm": 11.402342240873446, + "learning_rate": 8.603829770849421e-06, + "loss": 0.0837, + "step": 1494 + }, + { + "epoch": 1.07, + "grad_norm": 34.80956350140246, + "learning_rate": 8.601825779305302e-06, + "loss": 0.0812, + "step": 1495 + }, + { + "epoch": 1.07, + "grad_norm": 25.664482199439714, + "learning_rate": 8.59982058432424e-06, + "loss": 0.0872, + "step": 1496 + }, + { + "epoch": 1.07, + "grad_norm": 4.942475728354198, + "learning_rate": 8.597814186576212e-06, + "loss": 0.0752, + "step": 1497 + }, + { + "epoch": 1.07, + "grad_norm": 21.889841226015804, + "learning_rate": 8.595806586731589e-06, + "loss": 0.0869, + "step": 1498 + }, + { + "epoch": 1.07, + "grad_norm": 47.39501041858811, + "learning_rate": 8.59379778546115e-06, + "loss": 0.1328, + "step": 1499 + }, + { + "epoch": 1.07, + "grad_norm": 23.843981585654273, + "learning_rate": 8.591787783436073e-06, + "loss": 0.0833, + "step": 1500 + }, + { + "epoch": 1.07, + "eval_avg_AUC": 0.8187337540188822, + "eval_avg_Accuracy": 0.7427470159151194, + "eval_avg_Accuracy-right": 0.8404199817399244, + "eval_avg_Accuracy-wrong": 0.5724357516488515, + "eval_avg_Num questions with both labels": 523, + "eval_avg_Question-wise AUC": 0.6880919420787215, + "eval_last_AUC": 0.8358783930516435, + "eval_last_Accuracy": 0.7628066976127321, + "eval_last_Accuracy-right": 0.8261379940002609, + "eval_last_Accuracy-wrong": 0.6523766204230157, + "eval_last_Num questions with both labels": 523, + "eval_last_Question-wise AUC": 0.7015684504151366, + "eval_max_AUC": 0.7842336113536985, + "eval_max_Accuracy": 0.658363726790451, + "eval_max_Accuracy-right": 0.9557845311073432, + "eval_max_Accuracy-wrong": 0.13975437798498977, + "eval_max_Num questions with both labels": 523, + "eval_max_Question-wise AUC": 0.6534520933484, + "eval_min_AUC": 0.822015164482916, + "eval_min_Accuracy": 0.7524452917771883, + "eval_min_Accuracy-right": 0.7425981479066127, + "eval_min_Accuracy-wrong": 0.7696156470320673, + "eval_min_Num questions with both labels": 523, + "eval_min_Question-wise AUC": 0.6912215159082531, + "eval_prod_AUC": 0.819309531656854, + "eval_prod_Accuracy": 0.5770888594164456, + "eval_prod_Accuracy-right": 0.35946263205947565, + "eval_prod_Accuracy-wrong": 0.9565612917898567, + "eval_prod_Num questions with both labels": 523, + "eval_prod_Question-wise AUC": 0.6874273712080468, + "eval_runtime": 248.3319, + "eval_samples_per_second": 97.16, + "eval_steps_per_second": 3.036, + "eval_sum_AUC": 0.7137200687510031, + "eval_sum_Accuracy": 0.64253149867374, + "eval_sum_Accuracy-right": 0.9941959045258901, + "eval_sum_Accuracy-wrong": 0.029338185126222424, + "eval_sum_Num questions with both labels": 523, + "eval_sum_Question-wise AUC": 0.6771763816044019, + "step": 1500 + }, + { + "epoch": 1.07, + "grad_norm": 22.34310628383116, + "learning_rate": 8.589776581327936e-06, + "loss": 0.1263, + "step": 1501 + }, + { + "epoch": 1.07, + "grad_norm": 22.26638647085685, + "learning_rate": 8.587764179808716e-06, + "loss": 0.0922, + "step": 1502 + }, + { + "epoch": 1.07, + "grad_norm": 34.88623829684697, + "learning_rate": 8.5857505795508e-06, + "loss": 0.0751, + "step": 1503 + }, + { + "epoch": 1.07, + "grad_norm": 19.362303625609805, + "learning_rate": 8.583735781226964e-06, + "loss": 0.0897, + "step": 1504 + }, + { + "epoch": 1.07, + "grad_norm": 15.607349323647277, + "learning_rate": 8.581719785510391e-06, + "loss": 0.0867, + "step": 1505 + }, + { + "epoch": 1.07, + "grad_norm": 11.194617495715056, + "learning_rate": 8.579702593074666e-06, + "loss": 0.0877, + "step": 1506 + }, + { + "epoch": 1.08, + "grad_norm": 19.404643890863266, + "learning_rate": 8.577684204593767e-06, + "loss": 0.0745, + "step": 1507 + }, + { + "epoch": 1.08, + "grad_norm": 12.482102245943857, + "learning_rate": 8.575664620742073e-06, + "loss": 0.0892, + "step": 1508 + }, + { + "epoch": 1.08, + "grad_norm": 17.587811648576377, + "learning_rate": 8.57364384219437e-06, + "loss": 0.0742, + "step": 1509 + }, + { + "epoch": 1.08, + "grad_norm": 19.655816441505596, + "learning_rate": 8.571621869625835e-06, + "loss": 0.093, + "step": 1510 + }, + { + "epoch": 1.08, + "grad_norm": 30.03043984591091, + "learning_rate": 8.569598703712045e-06, + "loss": 0.0831, + "step": 1511 + }, + { + "epoch": 1.08, + "grad_norm": 21.842875103984415, + "learning_rate": 8.56757434512898e-06, + "loss": 0.0834, + "step": 1512 + }, + { + "epoch": 1.08, + "grad_norm": 12.39324221817927, + "learning_rate": 8.565548794553016e-06, + "loss": 0.0818, + "step": 1513 + }, + { + "epoch": 1.08, + "grad_norm": 43.49885915979903, + "learning_rate": 8.563522052660925e-06, + "loss": 0.1049, + "step": 1514 + }, + { + "epoch": 1.08, + "grad_norm": 6.834088679375381, + "learning_rate": 8.561494120129878e-06, + "loss": 0.1014, + "step": 1515 + }, + { + "epoch": 1.08, + "grad_norm": 10.466927403243657, + "learning_rate": 8.55946499763745e-06, + "loss": 0.0749, + "step": 1516 + }, + { + "epoch": 1.08, + "grad_norm": 10.911200002806018, + "learning_rate": 8.557434685861604e-06, + "loss": 0.0834, + "step": 1517 + }, + { + "epoch": 1.08, + "grad_norm": 33.65082017177862, + "learning_rate": 8.555403185480706e-06, + "loss": 0.0867, + "step": 1518 + }, + { + "epoch": 1.08, + "grad_norm": 12.496700495977283, + "learning_rate": 8.553370497173518e-06, + "loss": 0.0687, + "step": 1519 + }, + { + "epoch": 1.08, + "grad_norm": 13.198970575902443, + "learning_rate": 8.551336621619202e-06, + "loss": 0.1044, + "step": 1520 + }, + { + "epoch": 1.09, + "grad_norm": 6.252606767967176, + "learning_rate": 8.549301559497309e-06, + "loss": 0.0756, + "step": 1521 + }, + { + "epoch": 1.09, + "grad_norm": 19.71387444708082, + "learning_rate": 8.547265311487794e-06, + "loss": 0.0796, + "step": 1522 + }, + { + "epoch": 1.09, + "grad_norm": 13.494983831348298, + "learning_rate": 8.545227878271004e-06, + "loss": 0.0994, + "step": 1523 + }, + { + "epoch": 1.09, + "grad_norm": 31.544098364929408, + "learning_rate": 8.543189260527685e-06, + "loss": 0.0847, + "step": 1524 + }, + { + "epoch": 1.09, + "grad_norm": 7.203150409865368, + "learning_rate": 8.541149458938972e-06, + "loss": 0.0718, + "step": 1525 + }, + { + "epoch": 1.09, + "grad_norm": 6.754617592326723, + "learning_rate": 8.539108474186408e-06, + "loss": 0.0932, + "step": 1526 + }, + { + "epoch": 1.09, + "grad_norm": 28.549577345821078, + "learning_rate": 8.53706630695192e-06, + "loss": 0.0979, + "step": 1527 + }, + { + "epoch": 1.09, + "grad_norm": 22.622866020467605, + "learning_rate": 8.535022957917833e-06, + "loss": 0.0777, + "step": 1528 + }, + { + "epoch": 1.09, + "grad_norm": 12.70378295713489, + "learning_rate": 8.53297842776687e-06, + "loss": 0.1194, + "step": 1529 + }, + { + "epoch": 1.09, + "grad_norm": 9.841315864137869, + "learning_rate": 8.530932717182148e-06, + "loss": 0.1196, + "step": 1530 + }, + { + "epoch": 1.09, + "grad_norm": 38.579351262870325, + "learning_rate": 8.528885826847173e-06, + "loss": 0.0726, + "step": 1531 + }, + { + "epoch": 1.09, + "grad_norm": 29.95101519238097, + "learning_rate": 8.52683775744585e-06, + "loss": 0.0677, + "step": 1532 + }, + { + "epoch": 1.09, + "grad_norm": 18.422090044359035, + "learning_rate": 8.524788509662478e-06, + "loss": 0.069, + "step": 1533 + }, + { + "epoch": 1.09, + "grad_norm": 18.93769711941636, + "learning_rate": 8.522738084181749e-06, + "loss": 0.0963, + "step": 1534 + }, + { + "epoch": 1.1, + "grad_norm": 22.851640489831993, + "learning_rate": 8.52068648168875e-06, + "loss": 0.0773, + "step": 1535 + }, + { + "epoch": 1.1, + "grad_norm": 51.74572885384836, + "learning_rate": 8.518633702868955e-06, + "loss": 0.1013, + "step": 1536 + }, + { + "epoch": 1.1, + "grad_norm": 12.047738013956764, + "learning_rate": 8.516579748408237e-06, + "loss": 0.0697, + "step": 1537 + }, + { + "epoch": 1.1, + "grad_norm": 8.15886762195816, + "learning_rate": 8.514524618992864e-06, + "loss": 0.0896, + "step": 1538 + }, + { + "epoch": 1.1, + "grad_norm": 47.185535045680865, + "learning_rate": 8.51246831530949e-06, + "loss": 0.1111, + "step": 1539 + }, + { + "epoch": 1.1, + "grad_norm": 56.966239713258744, + "learning_rate": 8.510410838045165e-06, + "loss": 0.1395, + "step": 1540 + }, + { + "epoch": 1.1, + "grad_norm": 13.443718359824823, + "learning_rate": 8.508352187887329e-06, + "loss": 0.0826, + "step": 1541 + }, + { + "epoch": 1.1, + "grad_norm": 21.80564065604766, + "learning_rate": 8.506292365523816e-06, + "loss": 0.0814, + "step": 1542 + }, + { + "epoch": 1.1, + "grad_norm": 24.864346346179897, + "learning_rate": 8.504231371642852e-06, + "loss": 0.1256, + "step": 1543 + }, + { + "epoch": 1.1, + "grad_norm": 54.020150916277935, + "learning_rate": 8.502169206933053e-06, + "loss": 0.1006, + "step": 1544 + }, + { + "epoch": 1.1, + "grad_norm": 25.153860020715037, + "learning_rate": 8.500105872083424e-06, + "loss": 0.0862, + "step": 1545 + }, + { + "epoch": 1.1, + "grad_norm": 8.72803160180219, + "learning_rate": 8.498041367783367e-06, + "loss": 0.0757, + "step": 1546 + }, + { + "epoch": 1.1, + "grad_norm": 44.857111526137345, + "learning_rate": 8.49597569472267e-06, + "loss": 0.1027, + "step": 1547 + }, + { + "epoch": 1.1, + "grad_norm": 39.333400063483836, + "learning_rate": 8.493908853591515e-06, + "loss": 0.0902, + "step": 1548 + }, + { + "epoch": 1.11, + "grad_norm": 18.48890358440609, + "learning_rate": 8.491840845080467e-06, + "loss": 0.0837, + "step": 1549 + }, + { + "epoch": 1.11, + "grad_norm": 5.104802433608296, + "learning_rate": 8.489771669880489e-06, + "loss": 0.0821, + "step": 1550 + }, + { + "epoch": 1.11, + "grad_norm": 17.233922736828326, + "learning_rate": 8.487701328682932e-06, + "loss": 0.0866, + "step": 1551 + }, + { + "epoch": 1.11, + "grad_norm": 31.87602778903619, + "learning_rate": 8.485629822179533e-06, + "loss": 0.084, + "step": 1552 + }, + { + "epoch": 1.11, + "grad_norm": 34.96613218097158, + "learning_rate": 8.483557151062423e-06, + "loss": 0.0948, + "step": 1553 + }, + { + "epoch": 1.11, + "grad_norm": 5.749975742639682, + "learning_rate": 8.481483316024117e-06, + "loss": 0.0853, + "step": 1554 + }, + { + "epoch": 1.11, + "grad_norm": 12.11538066802083, + "learning_rate": 8.479408317757525e-06, + "loss": 0.11, + "step": 1555 + }, + { + "epoch": 1.11, + "grad_norm": 35.93167259883927, + "learning_rate": 8.477332156955942e-06, + "loss": 0.0968, + "step": 1556 + }, + { + "epoch": 1.11, + "grad_norm": 45.13111946912449, + "learning_rate": 8.475254834313051e-06, + "loss": 0.1176, + "step": 1557 + }, + { + "epoch": 1.11, + "grad_norm": 8.099342716236617, + "learning_rate": 8.473176350522925e-06, + "loss": 0.0784, + "step": 1558 + }, + { + "epoch": 1.11, + "grad_norm": 6.454912943269717, + "learning_rate": 8.471096706280022e-06, + "loss": 0.1095, + "step": 1559 + }, + { + "epoch": 1.11, + "grad_norm": 27.021317013221832, + "learning_rate": 8.469015902279191e-06, + "loss": 0.0663, + "step": 1560 + }, + { + "epoch": 1.11, + "grad_norm": 34.78660476793632, + "learning_rate": 8.466933939215669e-06, + "loss": 0.1093, + "step": 1561 + }, + { + "epoch": 1.11, + "grad_norm": 21.9565790223612, + "learning_rate": 8.464850817785075e-06, + "loss": 0.0702, + "step": 1562 + }, + { + "epoch": 1.12, + "grad_norm": 11.065785710336767, + "learning_rate": 8.462766538683422e-06, + "loss": 0.0822, + "step": 1563 + }, + { + "epoch": 1.12, + "grad_norm": 27.319906992202778, + "learning_rate": 8.460681102607106e-06, + "loss": 0.0803, + "step": 1564 + }, + { + "epoch": 1.12, + "grad_norm": 12.384719292234578, + "learning_rate": 8.45859451025291e-06, + "loss": 0.0873, + "step": 1565 + }, + { + "epoch": 1.12, + "grad_norm": 8.592013770399037, + "learning_rate": 8.456506762317998e-06, + "loss": 0.1086, + "step": 1566 + }, + { + "epoch": 1.12, + "grad_norm": 15.454204756646234, + "learning_rate": 8.454417859499932e-06, + "loss": 0.1068, + "step": 1567 + }, + { + "epoch": 1.12, + "grad_norm": 13.97423432264866, + "learning_rate": 8.45232780249665e-06, + "loss": 0.0629, + "step": 1568 + }, + { + "epoch": 1.12, + "grad_norm": 8.11051784415923, + "learning_rate": 8.450236592006481e-06, + "loss": 0.0693, + "step": 1569 + }, + { + "epoch": 1.12, + "grad_norm": 10.52568011914339, + "learning_rate": 8.448144228728135e-06, + "loss": 0.0842, + "step": 1570 + }, + { + "epoch": 1.12, + "grad_norm": 13.957701644591483, + "learning_rate": 8.446050713360711e-06, + "loss": 0.0625, + "step": 1571 + }, + { + "epoch": 1.12, + "grad_norm": 8.651605410737629, + "learning_rate": 8.443956046603692e-06, + "loss": 0.0853, + "step": 1572 + }, + { + "epoch": 1.12, + "grad_norm": 11.215401871180575, + "learning_rate": 8.441860229156944e-06, + "loss": 0.0886, + "step": 1573 + }, + { + "epoch": 1.12, + "grad_norm": 16.2620319467978, + "learning_rate": 8.439763261720716e-06, + "loss": 0.1127, + "step": 1574 + }, + { + "epoch": 1.12, + "grad_norm": 19.65795515380451, + "learning_rate": 8.43766514499565e-06, + "loss": 0.0867, + "step": 1575 + }, + { + "epoch": 1.12, + "grad_norm": 15.569766631054197, + "learning_rate": 8.435565879682759e-06, + "loss": 0.0986, + "step": 1576 + }, + { + "epoch": 1.13, + "grad_norm": 4.803571295203503, + "learning_rate": 8.433465466483452e-06, + "loss": 0.0811, + "step": 1577 + }, + { + "epoch": 1.13, + "grad_norm": 8.29721737997988, + "learning_rate": 8.431363906099513e-06, + "loss": 0.0776, + "step": 1578 + }, + { + "epoch": 1.13, + "grad_norm": 15.709180796487498, + "learning_rate": 8.429261199233114e-06, + "loss": 0.0936, + "step": 1579 + }, + { + "epoch": 1.13, + "grad_norm": 9.65559442729195, + "learning_rate": 8.427157346586807e-06, + "loss": 0.0811, + "step": 1580 + }, + { + "epoch": 1.13, + "grad_norm": 9.08245662725313, + "learning_rate": 8.42505234886353e-06, + "loss": 0.1066, + "step": 1581 + }, + { + "epoch": 1.13, + "grad_norm": 15.963511956163712, + "learning_rate": 8.422946206766598e-06, + "loss": 0.0867, + "step": 1582 + }, + { + "epoch": 1.13, + "grad_norm": 16.28827683392989, + "learning_rate": 8.420838920999718e-06, + "loss": 0.0611, + "step": 1583 + }, + { + "epoch": 1.13, + "grad_norm": 28.81251875839677, + "learning_rate": 8.418730492266968e-06, + "loss": 0.086, + "step": 1584 + }, + { + "epoch": 1.13, + "grad_norm": 10.509715363597547, + "learning_rate": 8.416620921272818e-06, + "loss": 0.074, + "step": 1585 + }, + { + "epoch": 1.13, + "grad_norm": 21.04769017279374, + "learning_rate": 8.414510208722111e-06, + "loss": 0.0928, + "step": 1586 + }, + { + "epoch": 1.13, + "grad_norm": 29.427287736707218, + "learning_rate": 8.412398355320078e-06, + "loss": 0.0986, + "step": 1587 + }, + { + "epoch": 1.13, + "grad_norm": 12.45398042864309, + "learning_rate": 8.410285361772328e-06, + "loss": 0.0876, + "step": 1588 + }, + { + "epoch": 1.13, + "grad_norm": 14.68115002988725, + "learning_rate": 8.408171228784847e-06, + "loss": 0.0646, + "step": 1589 + }, + { + "epoch": 1.13, + "grad_norm": 23.379480357894426, + "learning_rate": 8.406055957064014e-06, + "loss": 0.0698, + "step": 1590 + }, + { + "epoch": 1.14, + "grad_norm": 33.42786888545678, + "learning_rate": 8.403939547316576e-06, + "loss": 0.1045, + "step": 1591 + }, + { + "epoch": 1.14, + "grad_norm": 18.63204059915201, + "learning_rate": 8.401822000249661e-06, + "loss": 0.0681, + "step": 1592 + }, + { + "epoch": 1.14, + "grad_norm": 34.22901610610889, + "learning_rate": 8.399703316570788e-06, + "loss": 0.0841, + "step": 1593 + }, + { + "epoch": 1.14, + "grad_norm": 7.427652450081075, + "learning_rate": 8.397583496987846e-06, + "loss": 0.0715, + "step": 1594 + }, + { + "epoch": 1.14, + "grad_norm": 10.923600709443539, + "learning_rate": 8.395462542209106e-06, + "loss": 0.0789, + "step": 1595 + }, + { + "epoch": 1.14, + "grad_norm": 12.337513283657069, + "learning_rate": 8.393340452943219e-06, + "loss": 0.0902, + "step": 1596 + }, + { + "epoch": 1.14, + "grad_norm": 27.489704010210236, + "learning_rate": 8.391217229899211e-06, + "loss": 0.1047, + "step": 1597 + }, + { + "epoch": 1.14, + "grad_norm": 21.02825485654168, + "learning_rate": 8.389092873786495e-06, + "loss": 0.0721, + "step": 1598 + }, + { + "epoch": 1.14, + "grad_norm": 7.16992364078436, + "learning_rate": 8.386967385314857e-06, + "loss": 0.0811, + "step": 1599 + }, + { + "epoch": 1.14, + "grad_norm": 12.091998028936846, + "learning_rate": 8.384840765194458e-06, + "loss": 0.0624, + "step": 1600 + }, + { + "epoch": 1.14, + "grad_norm": 51.589968347407535, + "learning_rate": 8.382713014135846e-06, + "loss": 0.1481, + "step": 1601 + }, + { + "epoch": 1.14, + "grad_norm": 23.84886638204971, + "learning_rate": 8.38058413284994e-06, + "loss": 0.0939, + "step": 1602 + }, + { + "epoch": 1.14, + "grad_norm": 21.203344409082234, + "learning_rate": 8.37845412204804e-06, + "loss": 0.0793, + "step": 1603 + }, + { + "epoch": 1.14, + "grad_norm": 24.550507200743613, + "learning_rate": 8.376322982441821e-06, + "loss": 0.0908, + "step": 1604 + }, + { + "epoch": 1.15, + "grad_norm": 19.77879329537523, + "learning_rate": 8.374190714743338e-06, + "loss": 0.0679, + "step": 1605 + }, + { + "epoch": 1.15, + "grad_norm": 29.6562862247157, + "learning_rate": 8.37205731966502e-06, + "loss": 0.0901, + "step": 1606 + }, + { + "epoch": 1.15, + "grad_norm": 5.814058600851218, + "learning_rate": 8.369922797919672e-06, + "loss": 0.0811, + "step": 1607 + }, + { + "epoch": 1.15, + "grad_norm": 39.8862320984172, + "learning_rate": 8.367787150220481e-06, + "loss": 0.0906, + "step": 1608 + }, + { + "epoch": 1.15, + "grad_norm": 28.666925523395125, + "learning_rate": 8.365650377281004e-06, + "loss": 0.0891, + "step": 1609 + }, + { + "epoch": 1.15, + "grad_norm": 21.206415015810865, + "learning_rate": 8.36351247981518e-06, + "loss": 0.0583, + "step": 1610 + }, + { + "epoch": 1.15, + "grad_norm": 8.871052189203208, + "learning_rate": 8.361373458537316e-06, + "loss": 0.0898, + "step": 1611 + }, + { + "epoch": 1.15, + "grad_norm": 32.08612248377674, + "learning_rate": 8.359233314162102e-06, + "loss": 0.1099, + "step": 1612 + }, + { + "epoch": 1.15, + "grad_norm": 28.62885152393508, + "learning_rate": 8.357092047404598e-06, + "loss": 0.0684, + "step": 1613 + }, + { + "epoch": 1.15, + "grad_norm": 12.122663530621193, + "learning_rate": 8.354949658980243e-06, + "loss": 0.0867, + "step": 1614 + }, + { + "epoch": 1.15, + "grad_norm": 10.300802431022989, + "learning_rate": 8.352806149604847e-06, + "loss": 0.0674, + "step": 1615 + }, + { + "epoch": 1.15, + "grad_norm": 14.64308369502829, + "learning_rate": 8.350661519994596e-06, + "loss": 0.1304, + "step": 1616 + }, + { + "epoch": 1.15, + "grad_norm": 14.480196587169626, + "learning_rate": 8.348515770866051e-06, + "loss": 0.1102, + "step": 1617 + }, + { + "epoch": 1.15, + "grad_norm": 25.760410586184832, + "learning_rate": 8.346368902936149e-06, + "loss": 0.1083, + "step": 1618 + }, + { + "epoch": 1.16, + "grad_norm": 5.429983401294056, + "learning_rate": 8.344220916922195e-06, + "loss": 0.0852, + "step": 1619 + }, + { + "epoch": 1.16, + "grad_norm": 13.695217928576449, + "learning_rate": 8.342071813541873e-06, + "loss": 0.0719, + "step": 1620 + }, + { + "epoch": 1.16, + "grad_norm": 18.550814397047013, + "learning_rate": 8.339921593513239e-06, + "loss": 0.1259, + "step": 1621 + }, + { + "epoch": 1.16, + "grad_norm": 5.205918109284469, + "learning_rate": 8.337770257554721e-06, + "loss": 0.0732, + "step": 1622 + }, + { + "epoch": 1.16, + "grad_norm": 4.675401155226301, + "learning_rate": 8.335617806385119e-06, + "loss": 0.0649, + "step": 1623 + }, + { + "epoch": 1.16, + "grad_norm": 5.349145931438254, + "learning_rate": 8.333464240723608e-06, + "loss": 0.0719, + "step": 1624 + }, + { + "epoch": 1.16, + "grad_norm": 21.629273939026348, + "learning_rate": 8.331309561289734e-06, + "loss": 0.089, + "step": 1625 + }, + { + "epoch": 1.16, + "grad_norm": 13.33043717619918, + "learning_rate": 8.329153768803415e-06, + "loss": 0.0852, + "step": 1626 + }, + { + "epoch": 1.16, + "grad_norm": 22.20088070759164, + "learning_rate": 8.326996863984942e-06, + "loss": 0.1255, + "step": 1627 + }, + { + "epoch": 1.16, + "grad_norm": 14.843059951898912, + "learning_rate": 8.324838847554976e-06, + "loss": 0.1042, + "step": 1628 + }, + { + "epoch": 1.16, + "grad_norm": 28.36135952406115, + "learning_rate": 8.322679720234553e-06, + "loss": 0.0717, + "step": 1629 + }, + { + "epoch": 1.16, + "grad_norm": 21.469435660824352, + "learning_rate": 8.320519482745076e-06, + "loss": 0.0778, + "step": 1630 + }, + { + "epoch": 1.16, + "grad_norm": 6.953286758260457, + "learning_rate": 8.31835813580832e-06, + "loss": 0.0997, + "step": 1631 + }, + { + "epoch": 1.16, + "grad_norm": 45.73029621305888, + "learning_rate": 8.316195680146431e-06, + "loss": 0.1168, + "step": 1632 + }, + { + "epoch": 1.17, + "grad_norm": 27.95351189790114, + "learning_rate": 8.314032116481927e-06, + "loss": 0.1123, + "step": 1633 + }, + { + "epoch": 1.17, + "grad_norm": 19.796349764504168, + "learning_rate": 8.311867445537694e-06, + "loss": 0.0734, + "step": 1634 + }, + { + "epoch": 1.17, + "grad_norm": 30.2548744067372, + "learning_rate": 8.30970166803699e-06, + "loss": 0.1013, + "step": 1635 + }, + { + "epoch": 1.17, + "grad_norm": 32.50299572337184, + "learning_rate": 8.307534784703438e-06, + "loss": 0.0886, + "step": 1636 + }, + { + "epoch": 1.17, + "grad_norm": 23.322609551734118, + "learning_rate": 8.305366796261036e-06, + "loss": 0.0909, + "step": 1637 + }, + { + "epoch": 1.17, + "grad_norm": 18.271103305323894, + "learning_rate": 8.303197703434151e-06, + "loss": 0.111, + "step": 1638 + }, + { + "epoch": 1.17, + "grad_norm": 37.2120104735074, + "learning_rate": 8.301027506947516e-06, + "loss": 0.1141, + "step": 1639 + }, + { + "epoch": 1.17, + "grad_norm": 26.100003029295397, + "learning_rate": 8.298856207526234e-06, + "loss": 0.0931, + "step": 1640 + }, + { + "epoch": 1.17, + "grad_norm": 13.681595047192792, + "learning_rate": 8.296683805895777e-06, + "loss": 0.0827, + "step": 1641 + }, + { + "epoch": 1.17, + "grad_norm": 4.15509856357399, + "learning_rate": 8.294510302781984e-06, + "loss": 0.0536, + "step": 1642 + }, + { + "epoch": 1.17, + "grad_norm": 26.6273672687637, + "learning_rate": 8.29233569891106e-06, + "loss": 0.082, + "step": 1643 + }, + { + "epoch": 1.17, + "grad_norm": 24.756619530194843, + "learning_rate": 8.290159995009586e-06, + "loss": 0.1047, + "step": 1644 + }, + { + "epoch": 1.17, + "grad_norm": 23.963631520040913, + "learning_rate": 8.2879831918045e-06, + "loss": 0.0957, + "step": 1645 + }, + { + "epoch": 1.17, + "grad_norm": 14.18439751234935, + "learning_rate": 8.285805290023119e-06, + "loss": 0.1011, + "step": 1646 + }, + { + "epoch": 1.18, + "grad_norm": 42.43033780709877, + "learning_rate": 8.283626290393112e-06, + "loss": 0.1035, + "step": 1647 + }, + { + "epoch": 1.18, + "grad_norm": 50.888225111603596, + "learning_rate": 8.28144619364253e-06, + "loss": 0.0992, + "step": 1648 + }, + { + "epoch": 1.18, + "grad_norm": 6.112792402279644, + "learning_rate": 8.279265000499783e-06, + "loss": 0.076, + "step": 1649 + }, + { + "epoch": 1.18, + "grad_norm": 22.063075048957163, + "learning_rate": 8.277082711693645e-06, + "loss": 0.0963, + "step": 1650 + }, + { + "epoch": 1.18, + "grad_norm": 15.773608935822898, + "learning_rate": 8.274899327953261e-06, + "loss": 0.1035, + "step": 1651 + }, + { + "epoch": 1.18, + "grad_norm": 11.5617974343692, + "learning_rate": 8.272714850008142e-06, + "loss": 0.1187, + "step": 1652 + }, + { + "epoch": 1.18, + "grad_norm": 15.775555813001663, + "learning_rate": 8.270529278588158e-06, + "loss": 0.1015, + "step": 1653 + }, + { + "epoch": 1.18, + "grad_norm": 14.11119372001958, + "learning_rate": 8.268342614423553e-06, + "loss": 0.0741, + "step": 1654 + }, + { + "epoch": 1.18, + "grad_norm": 22.852005917074994, + "learning_rate": 8.26615485824493e-06, + "loss": 0.0905, + "step": 1655 + }, + { + "epoch": 1.18, + "grad_norm": 10.383090597471917, + "learning_rate": 8.263966010783259e-06, + "loss": 0.0772, + "step": 1656 + }, + { + "epoch": 1.18, + "grad_norm": 20.778806982750876, + "learning_rate": 8.261776072769878e-06, + "loss": 0.0751, + "step": 1657 + }, + { + "epoch": 1.18, + "grad_norm": 32.605084582643634, + "learning_rate": 8.259585044936484e-06, + "loss": 0.0916, + "step": 1658 + }, + { + "epoch": 1.18, + "grad_norm": 16.061701345538165, + "learning_rate": 8.257392928015138e-06, + "loss": 0.0689, + "step": 1659 + }, + { + "epoch": 1.18, + "grad_norm": 5.880742039534995, + "learning_rate": 8.25519972273827e-06, + "loss": 0.0938, + "step": 1660 + }, + { + "epoch": 1.19, + "grad_norm": 38.374981943860945, + "learning_rate": 8.253005429838667e-06, + "loss": 0.0822, + "step": 1661 + }, + { + "epoch": 1.19, + "grad_norm": 31.098642690273483, + "learning_rate": 8.250810050049488e-06, + "loss": 0.0938, + "step": 1662 + }, + { + "epoch": 1.19, + "grad_norm": 19.79092470836297, + "learning_rate": 8.248613584104245e-06, + "loss": 0.073, + "step": 1663 + }, + { + "epoch": 1.19, + "grad_norm": 10.879540641520684, + "learning_rate": 8.246416032736824e-06, + "loss": 0.0814, + "step": 1664 + }, + { + "epoch": 1.19, + "grad_norm": 35.35301070268229, + "learning_rate": 8.244217396681461e-06, + "loss": 0.0746, + "step": 1665 + }, + { + "epoch": 1.19, + "grad_norm": 32.156642571249634, + "learning_rate": 8.242017676672766e-06, + "loss": 0.1055, + "step": 1666 + }, + { + "epoch": 1.19, + "grad_norm": 5.761163190326076, + "learning_rate": 8.239816873445705e-06, + "loss": 0.0907, + "step": 1667 + }, + { + "epoch": 1.19, + "grad_norm": 7.04210178955987, + "learning_rate": 8.237614987735607e-06, + "loss": 0.0601, + "step": 1668 + }, + { + "epoch": 1.19, + "grad_norm": 18.6708744832657, + "learning_rate": 8.235412020278164e-06, + "loss": 0.0577, + "step": 1669 + }, + { + "epoch": 1.19, + "grad_norm": 18.924197204027514, + "learning_rate": 8.233207971809427e-06, + "loss": 0.0748, + "step": 1670 + }, + { + "epoch": 1.19, + "grad_norm": 6.057484238524624, + "learning_rate": 8.23100284306581e-06, + "loss": 0.0565, + "step": 1671 + }, + { + "epoch": 1.19, + "grad_norm": 8.302716030736626, + "learning_rate": 8.228796634784086e-06, + "loss": 0.0578, + "step": 1672 + }, + { + "epoch": 1.19, + "grad_norm": 7.599559113192868, + "learning_rate": 8.226589347701396e-06, + "loss": 0.0682, + "step": 1673 + }, + { + "epoch": 1.19, + "grad_norm": 7.509062555162927, + "learning_rate": 8.224380982555226e-06, + "loss": 0.0937, + "step": 1674 + }, + { + "epoch": 1.2, + "grad_norm": 15.548280441790565, + "learning_rate": 8.222171540083442e-06, + "loss": 0.1221, + "step": 1675 + }, + { + "epoch": 1.2, + "grad_norm": 19.248751048696665, + "learning_rate": 8.219961021024251e-06, + "loss": 0.0949, + "step": 1676 + }, + { + "epoch": 1.2, + "grad_norm": 12.733356442809999, + "learning_rate": 8.217749426116238e-06, + "loss": 0.0925, + "step": 1677 + }, + { + "epoch": 1.2, + "grad_norm": 19.53707190829396, + "learning_rate": 8.215536756098327e-06, + "loss": 0.0745, + "step": 1678 + }, + { + "epoch": 1.2, + "grad_norm": 5.487045075606509, + "learning_rate": 8.21332301170982e-06, + "loss": 0.0776, + "step": 1679 + }, + { + "epoch": 1.2, + "grad_norm": 7.500739390496608, + "learning_rate": 8.211108193690369e-06, + "loss": 0.1046, + "step": 1680 + }, + { + "epoch": 1.2, + "grad_norm": 7.048956581103097, + "learning_rate": 8.208892302779982e-06, + "loss": 0.0927, + "step": 1681 + }, + { + "epoch": 1.2, + "grad_norm": 13.805405676241328, + "learning_rate": 8.206675339719034e-06, + "loss": 0.0771, + "step": 1682 + }, + { + "epoch": 1.2, + "grad_norm": 15.463386393474316, + "learning_rate": 8.204457305248253e-06, + "loss": 0.0728, + "step": 1683 + }, + { + "epoch": 1.2, + "grad_norm": 15.88977529722202, + "learning_rate": 8.202238200108721e-06, + "loss": 0.0798, + "step": 1684 + }, + { + "epoch": 1.2, + "grad_norm": 17.967914215750977, + "learning_rate": 8.200018025041887e-06, + "loss": 0.1217, + "step": 1685 + }, + { + "epoch": 1.2, + "grad_norm": 28.123029815824335, + "learning_rate": 8.19779678078955e-06, + "loss": 0.084, + "step": 1686 + }, + { + "epoch": 1.2, + "grad_norm": 34.442573623512985, + "learning_rate": 8.195574468093872e-06, + "loss": 0.1146, + "step": 1687 + }, + { + "epoch": 1.2, + "grad_norm": 13.908280626151955, + "learning_rate": 8.193351087697366e-06, + "loss": 0.0895, + "step": 1688 + }, + { + "epoch": 1.21, + "grad_norm": 17.926660993388946, + "learning_rate": 8.191126640342906e-06, + "loss": 0.0702, + "step": 1689 + }, + { + "epoch": 1.21, + "grad_norm": 18.323048704617886, + "learning_rate": 8.18890112677372e-06, + "loss": 0.083, + "step": 1690 + }, + { + "epoch": 1.21, + "grad_norm": 15.46406615699352, + "learning_rate": 8.186674547733398e-06, + "loss": 0.0956, + "step": 1691 + }, + { + "epoch": 1.21, + "grad_norm": 12.626608111415358, + "learning_rate": 8.184446903965875e-06, + "loss": 0.1058, + "step": 1692 + }, + { + "epoch": 1.21, + "grad_norm": 14.08504431889377, + "learning_rate": 8.182218196215452e-06, + "loss": 0.1021, + "step": 1693 + }, + { + "epoch": 1.21, + "grad_norm": 18.58510270452204, + "learning_rate": 8.17998842522678e-06, + "loss": 0.0598, + "step": 1694 + }, + { + "epoch": 1.21, + "grad_norm": 10.819894273976141, + "learning_rate": 8.17775759174487e-06, + "loss": 0.1018, + "step": 1695 + }, + { + "epoch": 1.21, + "grad_norm": 26.40052974707958, + "learning_rate": 8.17552569651508e-06, + "loss": 0.0984, + "step": 1696 + }, + { + "epoch": 1.21, + "grad_norm": 6.349053416270252, + "learning_rate": 8.173292740283135e-06, + "loss": 0.0953, + "step": 1697 + }, + { + "epoch": 1.21, + "grad_norm": 19.342874409960686, + "learning_rate": 8.171058723795097e-06, + "loss": 0.0953, + "step": 1698 + }, + { + "epoch": 1.21, + "grad_norm": 27.925603947809098, + "learning_rate": 8.168823647797401e-06, + "loss": 0.1146, + "step": 1699 + }, + { + "epoch": 1.21, + "grad_norm": 77.29392142046062, + "learning_rate": 8.166587513036826e-06, + "loss": 0.1232, + "step": 1700 + }, + { + "epoch": 1.21, + "grad_norm": 16.95185229227817, + "learning_rate": 8.164350320260502e-06, + "loss": 0.0662, + "step": 1701 + }, + { + "epoch": 1.21, + "grad_norm": 21.974085417487288, + "learning_rate": 8.16211207021592e-06, + "loss": 0.0947, + "step": 1702 + }, + { + "epoch": 1.22, + "grad_norm": 7.92675942999132, + "learning_rate": 8.15987276365092e-06, + "loss": 0.1031, + "step": 1703 + }, + { + "epoch": 1.22, + "grad_norm": 8.05360437061264, + "learning_rate": 8.157632401313696e-06, + "loss": 0.1014, + "step": 1704 + }, + { + "epoch": 1.22, + "grad_norm": 7.27119929232412, + "learning_rate": 8.155390983952795e-06, + "loss": 0.0781, + "step": 1705 + }, + { + "epoch": 1.22, + "grad_norm": 4.3328835898750695, + "learning_rate": 8.153148512317117e-06, + "loss": 0.0669, + "step": 1706 + }, + { + "epoch": 1.22, + "grad_norm": 11.753037962222542, + "learning_rate": 8.150904987155911e-06, + "loss": 0.0864, + "step": 1707 + }, + { + "epoch": 1.22, + "grad_norm": 16.976159095803407, + "learning_rate": 8.148660409218786e-06, + "loss": 0.1355, + "step": 1708 + }, + { + "epoch": 1.22, + "grad_norm": 5.360819721353881, + "learning_rate": 8.146414779255689e-06, + "loss": 0.1117, + "step": 1709 + }, + { + "epoch": 1.22, + "grad_norm": 17.94580268964115, + "learning_rate": 8.144168098016933e-06, + "loss": 0.071, + "step": 1710 + }, + { + "epoch": 1.22, + "grad_norm": 22.882975417855626, + "learning_rate": 8.141920366253173e-06, + "loss": 0.089, + "step": 1711 + }, + { + "epoch": 1.22, + "grad_norm": 8.388813141318186, + "learning_rate": 8.139671584715419e-06, + "loss": 0.088, + "step": 1712 + }, + { + "epoch": 1.22, + "grad_norm": 19.66518390061167, + "learning_rate": 8.137421754155031e-06, + "loss": 0.1162, + "step": 1713 + }, + { + "epoch": 1.22, + "grad_norm": 5.009113369388294, + "learning_rate": 8.13517087532372e-06, + "loss": 0.0576, + "step": 1714 + }, + { + "epoch": 1.22, + "grad_norm": 14.062847580323941, + "learning_rate": 8.132918948973543e-06, + "loss": 0.0834, + "step": 1715 + }, + { + "epoch": 1.22, + "grad_norm": 23.85559125219598, + "learning_rate": 8.130665975856913e-06, + "loss": 0.0881, + "step": 1716 + }, + { + "epoch": 1.23, + "grad_norm": 25.11264401301826, + "learning_rate": 8.128411956726592e-06, + "loss": 0.1072, + "step": 1717 + }, + { + "epoch": 1.23, + "grad_norm": 9.641948331236978, + "learning_rate": 8.126156892335686e-06, + "loss": 0.0957, + "step": 1718 + }, + { + "epoch": 1.23, + "grad_norm": 11.36919292282907, + "learning_rate": 8.123900783437655e-06, + "loss": 0.1229, + "step": 1719 + }, + { + "epoch": 1.23, + "grad_norm": 7.6768793485009965, + "learning_rate": 8.121643630786308e-06, + "loss": 0.1084, + "step": 1720 + }, + { + "epoch": 1.23, + "grad_norm": 11.056594907308932, + "learning_rate": 8.1193854351358e-06, + "loss": 0.08, + "step": 1721 + }, + { + "epoch": 1.23, + "grad_norm": 13.02410509965871, + "learning_rate": 8.11712619724064e-06, + "loss": 0.0721, + "step": 1722 + }, + { + "epoch": 1.23, + "grad_norm": 8.593130940430317, + "learning_rate": 8.114865917855676e-06, + "loss": 0.0872, + "step": 1723 + }, + { + "epoch": 1.23, + "grad_norm": 5.9240949988331035, + "learning_rate": 8.112604597736113e-06, + "loss": 0.0928, + "step": 1724 + }, + { + "epoch": 1.23, + "grad_norm": 22.20784644695516, + "learning_rate": 8.110342237637501e-06, + "loss": 0.0628, + "step": 1725 + }, + { + "epoch": 1.23, + "grad_norm": 10.921566844245627, + "learning_rate": 8.108078838315732e-06, + "loss": 0.0618, + "step": 1726 + }, + { + "epoch": 1.23, + "grad_norm": 8.111574295790442, + "learning_rate": 8.105814400527052e-06, + "loss": 0.0785, + "step": 1727 + }, + { + "epoch": 1.23, + "grad_norm": 8.289683871686902, + "learning_rate": 8.103548925028054e-06, + "loss": 0.1143, + "step": 1728 + }, + { + "epoch": 1.23, + "grad_norm": 14.203313022838092, + "learning_rate": 8.101282412575673e-06, + "loss": 0.098, + "step": 1729 + }, + { + "epoch": 1.23, + "grad_norm": 36.75773831067622, + "learning_rate": 8.099014863927192e-06, + "loss": 0.0892, + "step": 1730 + }, + { + "epoch": 1.24, + "grad_norm": 11.032969099373302, + "learning_rate": 8.096746279840245e-06, + "loss": 0.0819, + "step": 1731 + }, + { + "epoch": 1.24, + "grad_norm": 27.902605396219545, + "learning_rate": 8.094476661072806e-06, + "loss": 0.0928, + "step": 1732 + }, + { + "epoch": 1.24, + "grad_norm": 30.50428948295198, + "learning_rate": 8.092206008383195e-06, + "loss": 0.0852, + "step": 1733 + }, + { + "epoch": 1.24, + "grad_norm": 26.430089450059356, + "learning_rate": 8.089934322530082e-06, + "loss": 0.1184, + "step": 1734 + }, + { + "epoch": 1.24, + "grad_norm": 21.899501892048658, + "learning_rate": 8.087661604272477e-06, + "loss": 0.0836, + "step": 1735 + }, + { + "epoch": 1.24, + "grad_norm": 20.60542568709364, + "learning_rate": 8.08538785436974e-06, + "loss": 0.097, + "step": 1736 + }, + { + "epoch": 1.24, + "grad_norm": 22.941207777328003, + "learning_rate": 8.08311307358157e-06, + "loss": 0.1018, + "step": 1737 + }, + { + "epoch": 1.24, + "grad_norm": 6.3384979167538775, + "learning_rate": 8.080837262668017e-06, + "loss": 0.0703, + "step": 1738 + }, + { + "epoch": 1.24, + "grad_norm": 6.570398124978654, + "learning_rate": 8.078560422389472e-06, + "loss": 0.0669, + "step": 1739 + }, + { + "epoch": 1.24, + "grad_norm": 21.414339602584793, + "learning_rate": 8.076282553506664e-06, + "loss": 0.0767, + "step": 1740 + }, + { + "epoch": 1.24, + "grad_norm": 18.95372832984155, + "learning_rate": 8.074003656780678e-06, + "loss": 0.0938, + "step": 1741 + }, + { + "epoch": 1.24, + "grad_norm": 10.549353249338639, + "learning_rate": 8.071723732972933e-06, + "loss": 0.0778, + "step": 1742 + }, + { + "epoch": 1.24, + "grad_norm": 16.938439415836026, + "learning_rate": 8.069442782845191e-06, + "loss": 0.1041, + "step": 1743 + }, + { + "epoch": 1.24, + "grad_norm": 27.528040440849797, + "learning_rate": 8.067160807159566e-06, + "loss": 0.1102, + "step": 1744 + }, + { + "epoch": 1.25, + "grad_norm": 34.857238335191894, + "learning_rate": 8.064877806678504e-06, + "loss": 0.1146, + "step": 1745 + }, + { + "epoch": 1.25, + "grad_norm": 19.588923727022934, + "learning_rate": 8.062593782164798e-06, + "loss": 0.1074, + "step": 1746 + }, + { + "epoch": 1.25, + "grad_norm": 11.262300732118138, + "learning_rate": 8.060308734381585e-06, + "loss": 0.0928, + "step": 1747 + }, + { + "epoch": 1.25, + "grad_norm": 21.63069752918576, + "learning_rate": 8.05802266409234e-06, + "loss": 0.0915, + "step": 1748 + }, + { + "epoch": 1.25, + "grad_norm": 17.730732468468556, + "learning_rate": 8.055735572060883e-06, + "loss": 0.0682, + "step": 1749 + }, + { + "epoch": 1.25, + "grad_norm": 15.174261533303634, + "learning_rate": 8.053447459051374e-06, + "loss": 0.0715, + "step": 1750 + }, + { + "epoch": 1.25, + "grad_norm": 14.744003542025283, + "learning_rate": 8.051158325828315e-06, + "loss": 0.0828, + "step": 1751 + }, + { + "epoch": 1.25, + "grad_norm": 16.141517917244126, + "learning_rate": 8.048868173156546e-06, + "loss": 0.0897, + "step": 1752 + }, + { + "epoch": 1.25, + "grad_norm": 27.216388324679485, + "learning_rate": 8.046577001801248e-06, + "loss": 0.1003, + "step": 1753 + }, + { + "epoch": 1.25, + "grad_norm": 12.025194670403664, + "learning_rate": 8.044284812527949e-06, + "loss": 0.1011, + "step": 1754 + }, + { + "epoch": 1.25, + "grad_norm": 17.72776253943835, + "learning_rate": 8.041991606102507e-06, + "loss": 0.1263, + "step": 1755 + }, + { + "epoch": 1.25, + "grad_norm": 6.379107531262259, + "learning_rate": 8.039697383291127e-06, + "loss": 0.068, + "step": 1756 + }, + { + "epoch": 1.25, + "grad_norm": 10.098384373784489, + "learning_rate": 8.037402144860353e-06, + "loss": 0.0898, + "step": 1757 + }, + { + "epoch": 1.25, + "grad_norm": 11.389445025532513, + "learning_rate": 8.035105891577064e-06, + "loss": 0.0896, + "step": 1758 + }, + { + "epoch": 1.26, + "grad_norm": 8.922481057387875, + "learning_rate": 8.032808624208485e-06, + "loss": 0.1005, + "step": 1759 + }, + { + "epoch": 1.26, + "grad_norm": 25.126444637378192, + "learning_rate": 8.030510343522172e-06, + "loss": 0.0884, + "step": 1760 + }, + { + "epoch": 1.26, + "grad_norm": 12.30642758311427, + "learning_rate": 8.02821105028602e-06, + "loss": 0.076, + "step": 1761 + }, + { + "epoch": 1.26, + "grad_norm": 6.33267110220372, + "learning_rate": 8.025910745268276e-06, + "loss": 0.1035, + "step": 1762 + }, + { + "epoch": 1.26, + "grad_norm": 19.02072960195617, + "learning_rate": 8.023609429237504e-06, + "loss": 0.0708, + "step": 1763 + }, + { + "epoch": 1.26, + "grad_norm": 6.3968909866767545, + "learning_rate": 8.021307102962623e-06, + "loss": 0.0759, + "step": 1764 + }, + { + "epoch": 1.26, + "grad_norm": 23.20723618473892, + "learning_rate": 8.019003767212881e-06, + "loss": 0.0717, + "step": 1765 + }, + { + "epoch": 1.26, + "grad_norm": 32.55425111351312, + "learning_rate": 8.016699422757865e-06, + "loss": 0.126, + "step": 1766 + }, + { + "epoch": 1.26, + "grad_norm": 8.562249690813024, + "learning_rate": 8.014394070367499e-06, + "loss": 0.1367, + "step": 1767 + }, + { + "epoch": 1.26, + "grad_norm": 36.530412481845815, + "learning_rate": 8.012087710812047e-06, + "loss": 0.1062, + "step": 1768 + }, + { + "epoch": 1.26, + "grad_norm": 38.24443582138145, + "learning_rate": 8.009780344862101e-06, + "loss": 0.0983, + "step": 1769 + }, + { + "epoch": 1.26, + "grad_norm": 19.804828163884277, + "learning_rate": 8.0074719732886e-06, + "loss": 0.0815, + "step": 1770 + }, + { + "epoch": 1.26, + "grad_norm": 19.841573958043558, + "learning_rate": 8.005162596862812e-06, + "loss": 0.0888, + "step": 1771 + }, + { + "epoch": 1.26, + "grad_norm": 36.698744929178375, + "learning_rate": 8.002852216356343e-06, + "loss": 0.1433, + "step": 1772 + }, + { + "epoch": 1.27, + "grad_norm": 41.58220436938131, + "learning_rate": 8.000540832541132e-06, + "loss": 0.1224, + "step": 1773 + }, + { + "epoch": 1.27, + "grad_norm": 16.701232099627646, + "learning_rate": 7.99822844618946e-06, + "loss": 0.1053, + "step": 1774 + }, + { + "epoch": 1.27, + "grad_norm": 23.90585403265861, + "learning_rate": 7.995915058073933e-06, + "loss": 0.1041, + "step": 1775 + }, + { + "epoch": 1.27, + "grad_norm": 22.792947908502864, + "learning_rate": 7.9936006689675e-06, + "loss": 0.0842, + "step": 1776 + }, + { + "epoch": 1.27, + "grad_norm": 28.537044176142224, + "learning_rate": 7.99128527964344e-06, + "loss": 0.1215, + "step": 1777 + }, + { + "epoch": 1.27, + "grad_norm": 11.80089352447319, + "learning_rate": 7.988968890875368e-06, + "loss": 0.0922, + "step": 1778 + }, + { + "epoch": 1.27, + "grad_norm": 10.311869719605001, + "learning_rate": 7.986651503437233e-06, + "loss": 0.0958, + "step": 1779 + }, + { + "epoch": 1.27, + "grad_norm": 21.233417174050086, + "learning_rate": 7.984333118103318e-06, + "loss": 0.1084, + "step": 1780 + }, + { + "epoch": 1.27, + "grad_norm": 8.340066676153791, + "learning_rate": 7.982013735648235e-06, + "loss": 0.0981, + "step": 1781 + }, + { + "epoch": 1.27, + "grad_norm": 24.679907926446546, + "learning_rate": 7.979693356846937e-06, + "loss": 0.1503, + "step": 1782 + }, + { + "epoch": 1.27, + "grad_norm": 25.530690055700195, + "learning_rate": 7.977371982474705e-06, + "loss": 0.1339, + "step": 1783 + }, + { + "epoch": 1.27, + "grad_norm": 7.597467209345345, + "learning_rate": 7.975049613307151e-06, + "loss": 0.1124, + "step": 1784 + }, + { + "epoch": 1.27, + "grad_norm": 10.392568087679368, + "learning_rate": 7.972726250120225e-06, + "loss": 0.1146, + "step": 1785 + }, + { + "epoch": 1.27, + "grad_norm": 7.338556178755361, + "learning_rate": 7.970401893690202e-06, + "loss": 0.1012, + "step": 1786 + }, + { + "epoch": 1.28, + "grad_norm": 22.062178634771666, + "learning_rate": 7.968076544793696e-06, + "loss": 0.0973, + "step": 1787 + }, + { + "epoch": 1.28, + "grad_norm": 6.614753659360179, + "learning_rate": 7.965750204207647e-06, + "loss": 0.0793, + "step": 1788 + }, + { + "epoch": 1.28, + "grad_norm": 6.37429769749807, + "learning_rate": 7.96342287270933e-06, + "loss": 0.0891, + "step": 1789 + }, + { + "epoch": 1.28, + "grad_norm": 6.518825899555654, + "learning_rate": 7.96109455107635e-06, + "loss": 0.0751, + "step": 1790 + }, + { + "epoch": 1.28, + "grad_norm": 39.72404138798098, + "learning_rate": 7.958765240086639e-06, + "loss": 0.1064, + "step": 1791 + }, + { + "epoch": 1.28, + "grad_norm": 13.961553572881497, + "learning_rate": 7.956434940518468e-06, + "loss": 0.0696, + "step": 1792 + }, + { + "epoch": 1.28, + "grad_norm": 13.524395349313767, + "learning_rate": 7.954103653150432e-06, + "loss": 0.1025, + "step": 1793 + }, + { + "epoch": 1.28, + "grad_norm": 12.512822700266833, + "learning_rate": 7.951771378761455e-06, + "loss": 0.0912, + "step": 1794 + }, + { + "epoch": 1.28, + "grad_norm": 31.031749476523494, + "learning_rate": 7.949438118130797e-06, + "loss": 0.1554, + "step": 1795 + }, + { + "epoch": 1.28, + "grad_norm": 21.655240692631608, + "learning_rate": 7.94710387203804e-06, + "loss": 0.1049, + "step": 1796 + }, + { + "epoch": 1.28, + "grad_norm": 23.573349737519887, + "learning_rate": 7.944768641263101e-06, + "loss": 0.0951, + "step": 1797 + }, + { + "epoch": 1.28, + "grad_norm": 6.423633965336433, + "learning_rate": 7.942432426586224e-06, + "loss": 0.0883, + "step": 1798 + }, + { + "epoch": 1.28, + "grad_norm": 33.7924509461897, + "learning_rate": 7.94009522878798e-06, + "loss": 0.1217, + "step": 1799 + }, + { + "epoch": 1.28, + "grad_norm": 24.608886378571242, + "learning_rate": 7.937757048649274e-06, + "loss": 0.1155, + "step": 1800 + }, + { + "epoch": 1.29, + "grad_norm": 31.5944802631455, + "learning_rate": 7.935417886951332e-06, + "loss": 0.1301, + "step": 1801 + }, + { + "epoch": 1.29, + "grad_norm": 24.56355408273472, + "learning_rate": 7.933077744475713e-06, + "loss": 0.0983, + "step": 1802 + }, + { + "epoch": 1.29, + "grad_norm": 12.600335223994328, + "learning_rate": 7.930736622004301e-06, + "loss": 0.0992, + "step": 1803 + }, + { + "epoch": 1.29, + "grad_norm": 17.365260585180668, + "learning_rate": 7.928394520319311e-06, + "loss": 0.1122, + "step": 1804 + }, + { + "epoch": 1.29, + "grad_norm": 5.327789011223336, + "learning_rate": 7.926051440203278e-06, + "loss": 0.0663, + "step": 1805 + }, + { + "epoch": 1.29, + "grad_norm": 8.382747080442627, + "learning_rate": 7.923707382439073e-06, + "loss": 0.082, + "step": 1806 + }, + { + "epoch": 1.29, + "grad_norm": 9.124410943625675, + "learning_rate": 7.921362347809888e-06, + "loss": 0.1038, + "step": 1807 + }, + { + "epoch": 1.29, + "grad_norm": 19.289726689987205, + "learning_rate": 7.919016337099242e-06, + "loss": 0.1105, + "step": 1808 + }, + { + "epoch": 1.29, + "grad_norm": 7.964546128423192, + "learning_rate": 7.916669351090981e-06, + "loss": 0.0864, + "step": 1809 + }, + { + "epoch": 1.29, + "grad_norm": 4.824052539383046, + "learning_rate": 7.914321390569278e-06, + "loss": 0.0694, + "step": 1810 + }, + { + "epoch": 1.29, + "grad_norm": 8.33312536205575, + "learning_rate": 7.911972456318629e-06, + "loss": 0.1254, + "step": 1811 + }, + { + "epoch": 1.29, + "grad_norm": 6.507945976935379, + "learning_rate": 7.909622549123855e-06, + "loss": 0.0936, + "step": 1812 + }, + { + "epoch": 1.29, + "grad_norm": 13.143972556032457, + "learning_rate": 7.907271669770107e-06, + "loss": 0.0985, + "step": 1813 + }, + { + "epoch": 1.29, + "grad_norm": 12.491831143764811, + "learning_rate": 7.904919819042855e-06, + "loss": 0.1169, + "step": 1814 + }, + { + "epoch": 1.3, + "grad_norm": 13.22535591359354, + "learning_rate": 7.902566997727896e-06, + "loss": 0.1021, + "step": 1815 + }, + { + "epoch": 1.3, + "grad_norm": 11.970708252245554, + "learning_rate": 7.900213206611353e-06, + "loss": 0.1017, + "step": 1816 + }, + { + "epoch": 1.3, + "grad_norm": 26.696684320453635, + "learning_rate": 7.897858446479672e-06, + "loss": 0.1003, + "step": 1817 + }, + { + "epoch": 1.3, + "grad_norm": 31.522235837524857, + "learning_rate": 7.895502718119618e-06, + "loss": 0.1056, + "step": 1818 + }, + { + "epoch": 1.3, + "grad_norm": 11.01303572938718, + "learning_rate": 7.89314602231829e-06, + "loss": 0.0793, + "step": 1819 + }, + { + "epoch": 1.3, + "grad_norm": 34.25204366156014, + "learning_rate": 7.8907883598631e-06, + "loss": 0.1014, + "step": 1820 + }, + { + "epoch": 1.3, + "grad_norm": 40.92644841373539, + "learning_rate": 7.888429731541784e-06, + "loss": 0.1143, + "step": 1821 + }, + { + "epoch": 1.3, + "grad_norm": 5.3982661444680335, + "learning_rate": 7.886070138142407e-06, + "loss": 0.071, + "step": 1822 + }, + { + "epoch": 1.3, + "grad_norm": 32.32811463120643, + "learning_rate": 7.883709580453354e-06, + "loss": 0.1158, + "step": 1823 + }, + { + "epoch": 1.3, + "grad_norm": 19.031316596008647, + "learning_rate": 7.88134805926333e-06, + "loss": 0.0977, + "step": 1824 + }, + { + "epoch": 1.3, + "grad_norm": 15.045104813096087, + "learning_rate": 7.878985575361362e-06, + "loss": 0.0897, + "step": 1825 + }, + { + "epoch": 1.3, + "grad_norm": 21.035459837713777, + "learning_rate": 7.876622129536801e-06, + "loss": 0.1024, + "step": 1826 + }, + { + "epoch": 1.3, + "grad_norm": 13.660668150169178, + "learning_rate": 7.874257722579319e-06, + "loss": 0.0901, + "step": 1827 + }, + { + "epoch": 1.3, + "grad_norm": 13.858105852883831, + "learning_rate": 7.871892355278906e-06, + "loss": 0.0676, + "step": 1828 + }, + { + "epoch": 1.31, + "grad_norm": 30.715956025025108, + "learning_rate": 7.869526028425878e-06, + "loss": 0.1143, + "step": 1829 + }, + { + "epoch": 1.31, + "grad_norm": 20.527869229321144, + "learning_rate": 7.867158742810866e-06, + "loss": 0.0834, + "step": 1830 + }, + { + "epoch": 1.31, + "grad_norm": 7.21553644321985, + "learning_rate": 7.864790499224825e-06, + "loss": 0.069, + "step": 1831 + }, + { + "epoch": 1.31, + "grad_norm": 15.834412029043872, + "learning_rate": 7.86242129845903e-06, + "loss": 0.0689, + "step": 1832 + }, + { + "epoch": 1.31, + "grad_norm": 16.28611478117609, + "learning_rate": 7.860051141305074e-06, + "loss": 0.0878, + "step": 1833 + }, + { + "epoch": 1.31, + "grad_norm": 18.470539205773807, + "learning_rate": 7.857680028554873e-06, + "loss": 0.0988, + "step": 1834 + }, + { + "epoch": 1.31, + "grad_norm": 14.517688298308093, + "learning_rate": 7.855307961000656e-06, + "loss": 0.0839, + "step": 1835 + }, + { + "epoch": 1.31, + "grad_norm": 8.127093098505616, + "learning_rate": 7.852934939434977e-06, + "loss": 0.0699, + "step": 1836 + }, + { + "epoch": 1.31, + "grad_norm": 15.700240348518008, + "learning_rate": 7.850560964650707e-06, + "loss": 0.1311, + "step": 1837 + }, + { + "epoch": 1.31, + "grad_norm": 33.191617585225714, + "learning_rate": 7.848186037441035e-06, + "loss": 0.1548, + "step": 1838 + }, + { + "epoch": 1.31, + "grad_norm": 38.7790193219392, + "learning_rate": 7.845810158599467e-06, + "loss": 0.0975, + "step": 1839 + }, + { + "epoch": 1.31, + "grad_norm": 16.850557590964687, + "learning_rate": 7.84343332891983e-06, + "loss": 0.0818, + "step": 1840 + }, + { + "epoch": 1.31, + "grad_norm": 28.72144236111981, + "learning_rate": 7.841055549196267e-06, + "loss": 0.1003, + "step": 1841 + }, + { + "epoch": 1.31, + "grad_norm": 21.710573467867395, + "learning_rate": 7.838676820223234e-06, + "loss": 0.0881, + "step": 1842 + }, + { + "epoch": 1.32, + "grad_norm": 35.03115815820891, + "learning_rate": 7.836297142795515e-06, + "loss": 0.0961, + "step": 1843 + }, + { + "epoch": 1.32, + "grad_norm": 12.832217299261412, + "learning_rate": 7.833916517708203e-06, + "loss": 0.0826, + "step": 1844 + }, + { + "epoch": 1.32, + "grad_norm": 35.32560567226581, + "learning_rate": 7.831534945756703e-06, + "loss": 0.1127, + "step": 1845 + }, + { + "epoch": 1.32, + "grad_norm": 27.26210477910772, + "learning_rate": 7.82915242773675e-06, + "loss": 0.0912, + "step": 1846 + }, + { + "epoch": 1.32, + "grad_norm": 12.005708358639822, + "learning_rate": 7.826768964444384e-06, + "loss": 0.0798, + "step": 1847 + }, + { + "epoch": 1.32, + "grad_norm": 21.5695948891362, + "learning_rate": 7.824384556675966e-06, + "loss": 0.0976, + "step": 1848 + }, + { + "epoch": 1.32, + "grad_norm": 32.38044189186778, + "learning_rate": 7.821999205228168e-06, + "loss": 0.13, + "step": 1849 + }, + { + "epoch": 1.32, + "grad_norm": 78.64683053933734, + "learning_rate": 7.819612910897985e-06, + "loss": 0.2098, + "step": 1850 + }, + { + "epoch": 1.32, + "grad_norm": 18.63082381659104, + "learning_rate": 7.817225674482717e-06, + "loss": 0.0945, + "step": 1851 + }, + { + "epoch": 1.32, + "grad_norm": 36.11550459481589, + "learning_rate": 7.814837496779988e-06, + "loss": 0.0802, + "step": 1852 + }, + { + "epoch": 1.32, + "grad_norm": 57.52467349010068, + "learning_rate": 7.812448378587731e-06, + "loss": 0.1255, + "step": 1853 + }, + { + "epoch": 1.32, + "grad_norm": 38.526328381998304, + "learning_rate": 7.810058320704194e-06, + "loss": 0.11, + "step": 1854 + }, + { + "epoch": 1.32, + "grad_norm": 7.853053246927483, + "learning_rate": 7.807667323927941e-06, + "loss": 0.0726, + "step": 1855 + }, + { + "epoch": 1.32, + "grad_norm": 32.05414416545823, + "learning_rate": 7.80527538905785e-06, + "loss": 0.0781, + "step": 1856 + }, + { + "epoch": 1.33, + "grad_norm": 50.865898236236205, + "learning_rate": 7.802882516893106e-06, + "loss": 0.1028, + "step": 1857 + }, + { + "epoch": 1.33, + "grad_norm": 31.687847029880817, + "learning_rate": 7.800488708233219e-06, + "loss": 0.0911, + "step": 1858 + }, + { + "epoch": 1.33, + "grad_norm": 11.587216812760387, + "learning_rate": 7.798093963877998e-06, + "loss": 0.0892, + "step": 1859 + }, + { + "epoch": 1.33, + "grad_norm": 32.49642886667288, + "learning_rate": 7.795698284627575e-06, + "loss": 0.1219, + "step": 1860 + }, + { + "epoch": 1.33, + "grad_norm": 32.145261643797554, + "learning_rate": 7.793301671282391e-06, + "loss": 0.1083, + "step": 1861 + }, + { + "epoch": 1.33, + "grad_norm": 24.419766007483094, + "learning_rate": 7.7909041246432e-06, + "loss": 0.0844, + "step": 1862 + }, + { + "epoch": 1.33, + "grad_norm": 9.398067835525364, + "learning_rate": 7.788505645511065e-06, + "loss": 0.1044, + "step": 1863 + }, + { + "epoch": 1.33, + "grad_norm": 30.039981713715175, + "learning_rate": 7.786106234687362e-06, + "loss": 0.0833, + "step": 1864 + }, + { + "epoch": 1.33, + "grad_norm": 47.34351073345086, + "learning_rate": 7.783705892973782e-06, + "loss": 0.1405, + "step": 1865 + }, + { + "epoch": 1.33, + "grad_norm": 12.349109025106152, + "learning_rate": 7.78130462117232e-06, + "loss": 0.0813, + "step": 1866 + }, + { + "epoch": 1.33, + "grad_norm": 9.908058765643409, + "learning_rate": 7.778902420085289e-06, + "loss": 0.0681, + "step": 1867 + }, + { + "epoch": 1.33, + "grad_norm": 7.3494219707093515, + "learning_rate": 7.776499290515304e-06, + "loss": 0.0879, + "step": 1868 + }, + { + "epoch": 1.33, + "grad_norm": 11.040424391158401, + "learning_rate": 7.7740952332653e-06, + "loss": 0.0918, + "step": 1869 + }, + { + "epoch": 1.33, + "grad_norm": 24.22287383306475, + "learning_rate": 7.771690249138517e-06, + "loss": 0.0968, + "step": 1870 + }, + { + "epoch": 1.34, + "grad_norm": 11.109944144915485, + "learning_rate": 7.769284338938502e-06, + "loss": 0.1071, + "step": 1871 + }, + { + "epoch": 1.34, + "grad_norm": 9.840567242675737, + "learning_rate": 7.766877503469117e-06, + "loss": 0.1022, + "step": 1872 + }, + { + "epoch": 1.34, + "grad_norm": 7.497594965854643, + "learning_rate": 7.764469743534529e-06, + "loss": 0.0892, + "step": 1873 + }, + { + "epoch": 1.34, + "grad_norm": 34.64644715009004, + "learning_rate": 7.762061059939214e-06, + "loss": 0.0901, + "step": 1874 + }, + { + "epoch": 1.34, + "grad_norm": 17.020914763066287, + "learning_rate": 7.759651453487963e-06, + "loss": 0.0775, + "step": 1875 + }, + { + "epoch": 1.34, + "grad_norm": 41.443228475209594, + "learning_rate": 7.757240924985866e-06, + "loss": 0.1339, + "step": 1876 + }, + { + "epoch": 1.34, + "grad_norm": 22.513277159467158, + "learning_rate": 7.754829475238323e-06, + "loss": 0.105, + "step": 1877 + }, + { + "epoch": 1.34, + "grad_norm": 21.265724907023937, + "learning_rate": 7.752417105051051e-06, + "loss": 0.1527, + "step": 1878 + }, + { + "epoch": 1.34, + "grad_norm": 48.24610738893118, + "learning_rate": 7.750003815230062e-06, + "loss": 0.0941, + "step": 1879 + }, + { + "epoch": 1.34, + "grad_norm": 6.8333978374856015, + "learning_rate": 7.747589606581686e-06, + "loss": 0.0563, + "step": 1880 + }, + { + "epoch": 1.34, + "grad_norm": 17.172826245285943, + "learning_rate": 7.745174479912551e-06, + "loss": 0.1003, + "step": 1881 + }, + { + "epoch": 1.34, + "grad_norm": 10.693710087910775, + "learning_rate": 7.742758436029596e-06, + "loss": 0.1112, + "step": 1882 + }, + { + "epoch": 1.34, + "grad_norm": 18.75748741216541, + "learning_rate": 7.740341475740068e-06, + "loss": 0.0944, + "step": 1883 + }, + { + "epoch": 1.34, + "grad_norm": 29.562990385496306, + "learning_rate": 7.737923599851519e-06, + "loss": 0.0795, + "step": 1884 + }, + { + "epoch": 1.35, + "grad_norm": 6.523359606962466, + "learning_rate": 7.735504809171801e-06, + "loss": 0.0846, + "step": 1885 + }, + { + "epoch": 1.35, + "grad_norm": 6.171598232912832, + "learning_rate": 7.733085104509084e-06, + "loss": 0.0924, + "step": 1886 + }, + { + "epoch": 1.35, + "grad_norm": 29.73271224436981, + "learning_rate": 7.730664486671831e-06, + "loss": 0.1106, + "step": 1887 + }, + { + "epoch": 1.35, + "grad_norm": 30.655108989409246, + "learning_rate": 7.72824295646882e-06, + "loss": 0.0946, + "step": 1888 + }, + { + "epoch": 1.35, + "grad_norm": 11.90587683853757, + "learning_rate": 7.725820514709124e-06, + "loss": 0.0774, + "step": 1889 + }, + { + "epoch": 1.35, + "grad_norm": 23.959766879108276, + "learning_rate": 7.723397162202128e-06, + "loss": 0.1327, + "step": 1890 + }, + { + "epoch": 1.35, + "grad_norm": 18.9085622070494, + "learning_rate": 7.720972899757522e-06, + "loss": 0.0879, + "step": 1891 + }, + { + "epoch": 1.35, + "grad_norm": 23.68721606790896, + "learning_rate": 7.718547728185293e-06, + "loss": 0.1257, + "step": 1892 + }, + { + "epoch": 1.35, + "grad_norm": 12.866401919651024, + "learning_rate": 7.716121648295738e-06, + "loss": 0.0869, + "step": 1893 + }, + { + "epoch": 1.35, + "grad_norm": 12.203988683415128, + "learning_rate": 7.713694660899455e-06, + "loss": 0.087, + "step": 1894 + }, + { + "epoch": 1.35, + "grad_norm": 47.327087832020034, + "learning_rate": 7.711266766807345e-06, + "loss": 0.1295, + "step": 1895 + }, + { + "epoch": 1.35, + "grad_norm": 37.47630197076899, + "learning_rate": 7.708837966830615e-06, + "loss": 0.1118, + "step": 1896 + }, + { + "epoch": 1.35, + "grad_norm": 15.991272091833782, + "learning_rate": 7.706408261780769e-06, + "loss": 0.075, + "step": 1897 + }, + { + "epoch": 1.35, + "grad_norm": 35.91340982420784, + "learning_rate": 7.703977652469618e-06, + "loss": 0.1104, + "step": 1898 + }, + { + "epoch": 1.36, + "grad_norm": 31.129407279590517, + "learning_rate": 7.701546139709272e-06, + "loss": 0.0825, + "step": 1899 + }, + { + "epoch": 1.36, + "grad_norm": 6.731671111833922, + "learning_rate": 7.69911372431215e-06, + "loss": 0.1019, + "step": 1900 + }, + { + "epoch": 1.36, + "grad_norm": 7.665645173275369, + "learning_rate": 7.696680407090962e-06, + "loss": 0.1041, + "step": 1901 + }, + { + "epoch": 1.36, + "grad_norm": 14.15220002595494, + "learning_rate": 7.694246188858726e-06, + "loss": 0.087, + "step": 1902 + }, + { + "epoch": 1.36, + "grad_norm": 11.320413195892144, + "learning_rate": 7.691811070428758e-06, + "loss": 0.1177, + "step": 1903 + }, + { + "epoch": 1.36, + "grad_norm": 39.10964804895109, + "learning_rate": 7.689375052614681e-06, + "loss": 0.1274, + "step": 1904 + }, + { + "epoch": 1.36, + "grad_norm": 12.690397207526837, + "learning_rate": 7.686938136230408e-06, + "loss": 0.1031, + "step": 1905 + }, + { + "epoch": 1.36, + "grad_norm": 15.07798986443616, + "learning_rate": 7.684500322090162e-06, + "loss": 0.1309, + "step": 1906 + }, + { + "epoch": 1.36, + "grad_norm": 3.8068024583617093, + "learning_rate": 7.68206161100846e-06, + "loss": 0.0712, + "step": 1907 + }, + { + "epoch": 1.36, + "grad_norm": 7.903182113883539, + "learning_rate": 7.679622003800122e-06, + "loss": 0.0999, + "step": 1908 + }, + { + "epoch": 1.36, + "grad_norm": 41.30605521807284, + "learning_rate": 7.677181501280266e-06, + "loss": 0.0956, + "step": 1909 + }, + { + "epoch": 1.36, + "grad_norm": 13.709690616051338, + "learning_rate": 7.674740104264308e-06, + "loss": 0.0869, + "step": 1910 + }, + { + "epoch": 1.36, + "grad_norm": 13.260512003188966, + "learning_rate": 7.672297813567968e-06, + "loss": 0.1622, + "step": 1911 + }, + { + "epoch": 1.36, + "grad_norm": 21.84565087029452, + "learning_rate": 7.669854630007257e-06, + "loss": 0.1017, + "step": 1912 + }, + { + "epoch": 1.37, + "grad_norm": 33.420476181478065, + "learning_rate": 7.667410554398486e-06, + "loss": 0.098, + "step": 1913 + }, + { + "epoch": 1.37, + "grad_norm": 31.57040112746546, + "learning_rate": 7.664965587558271e-06, + "loss": 0.1077, + "step": 1914 + }, + { + "epoch": 1.37, + "grad_norm": 4.308046009484161, + "learning_rate": 7.662519730303517e-06, + "loss": 0.0771, + "step": 1915 + }, + { + "epoch": 1.37, + "grad_norm": 60.79999436143695, + "learning_rate": 7.660072983451433e-06, + "loss": 0.1399, + "step": 1916 + }, + { + "epoch": 1.37, + "grad_norm": 18.3987711534509, + "learning_rate": 7.657625347819522e-06, + "loss": 0.1149, + "step": 1917 + }, + { + "epoch": 1.37, + "grad_norm": 6.265288838813149, + "learning_rate": 7.655176824225582e-06, + "loss": 0.0807, + "step": 1918 + }, + { + "epoch": 1.37, + "grad_norm": 6.300914605261491, + "learning_rate": 7.652727413487716e-06, + "loss": 0.0993, + "step": 1919 + }, + { + "epoch": 1.37, + "grad_norm": 22.124179459804967, + "learning_rate": 7.650277116424313e-06, + "loss": 0.0684, + "step": 1920 + }, + { + "epoch": 1.37, + "grad_norm": 25.625468309309074, + "learning_rate": 7.647825933854063e-06, + "loss": 0.1117, + "step": 1921 + }, + { + "epoch": 1.37, + "grad_norm": 7.624552453874793, + "learning_rate": 7.645373866595953e-06, + "loss": 0.1179, + "step": 1922 + }, + { + "epoch": 1.37, + "grad_norm": 6.194508526818871, + "learning_rate": 7.642920915469265e-06, + "loss": 0.0785, + "step": 1923 + }, + { + "epoch": 1.37, + "grad_norm": 24.148000648762252, + "learning_rate": 7.640467081293573e-06, + "loss": 0.1417, + "step": 1924 + }, + { + "epoch": 1.37, + "grad_norm": 25.24596065135585, + "learning_rate": 7.638012364888751e-06, + "loss": 0.1062, + "step": 1925 + }, + { + "epoch": 1.37, + "grad_norm": 11.663735609649464, + "learning_rate": 7.635556767074965e-06, + "loss": 0.0919, + "step": 1926 + }, + { + "epoch": 1.38, + "grad_norm": 19.983238031321108, + "learning_rate": 7.633100288672674e-06, + "loss": 0.0861, + "step": 1927 + }, + { + "epoch": 1.38, + "grad_norm": 24.516987429952046, + "learning_rate": 7.630642930502634e-06, + "loss": 0.1084, + "step": 1928 + }, + { + "epoch": 1.38, + "grad_norm": 23.7079785548782, + "learning_rate": 7.628184693385896e-06, + "loss": 0.0987, + "step": 1929 + }, + { + "epoch": 1.38, + "grad_norm": 8.632779942934862, + "learning_rate": 7.625725578143801e-06, + "loss": 0.0803, + "step": 1930 + }, + { + "epoch": 1.38, + "grad_norm": 26.75562561931583, + "learning_rate": 7.6232655855979844e-06, + "loss": 0.12, + "step": 1931 + }, + { + "epoch": 1.38, + "grad_norm": 9.220766214367428, + "learning_rate": 7.620804716570376e-06, + "loss": 0.1036, + "step": 1932 + }, + { + "epoch": 1.38, + "grad_norm": 35.404659677885, + "learning_rate": 7.618342971883199e-06, + "loss": 0.1191, + "step": 1933 + }, + { + "epoch": 1.38, + "grad_norm": 5.539001124567162, + "learning_rate": 7.615880352358967e-06, + "loss": 0.0887, + "step": 1934 + }, + { + "epoch": 1.38, + "grad_norm": 33.35991052555321, + "learning_rate": 7.613416858820486e-06, + "loss": 0.0751, + "step": 1935 + }, + { + "epoch": 1.38, + "grad_norm": 43.77244661048231, + "learning_rate": 7.6109524920908575e-06, + "loss": 0.105, + "step": 1936 + }, + { + "epoch": 1.38, + "grad_norm": 8.095324647325247, + "learning_rate": 7.608487252993471e-06, + "loss": 0.1018, + "step": 1937 + }, + { + "epoch": 1.38, + "grad_norm": 10.482483420441799, + "learning_rate": 7.6060211423520095e-06, + "loss": 0.0607, + "step": 1938 + }, + { + "epoch": 1.38, + "grad_norm": 39.33723284296271, + "learning_rate": 7.6035541609904425e-06, + "loss": 0.1287, + "step": 1939 + }, + { + "epoch": 1.38, + "grad_norm": 16.80992526081807, + "learning_rate": 7.60108630973304e-06, + "loss": 0.0977, + "step": 1940 + }, + { + "epoch": 1.39, + "grad_norm": 14.940668855422615, + "learning_rate": 7.598617589404354e-06, + "loss": 0.0879, + "step": 1941 + }, + { + "epoch": 1.39, + "grad_norm": 7.542100439749129, + "learning_rate": 7.596148000829229e-06, + "loss": 0.1262, + "step": 1942 + }, + { + "epoch": 1.39, + "grad_norm": 17.47476515065765, + "learning_rate": 7.593677544832802e-06, + "loss": 0.1219, + "step": 1943 + }, + { + "epoch": 1.39, + "grad_norm": 7.605399845369206, + "learning_rate": 7.5912062222404965e-06, + "loss": 0.064, + "step": 1944 + }, + { + "epoch": 1.39, + "grad_norm": 30.299165014514305, + "learning_rate": 7.588734033878031e-06, + "loss": 0.1134, + "step": 1945 + }, + { + "epoch": 1.39, + "grad_norm": 12.53081257535877, + "learning_rate": 7.586260980571407e-06, + "loss": 0.0906, + "step": 1946 + }, + { + "epoch": 1.39, + "grad_norm": 28.76809322028387, + "learning_rate": 7.5837870631469165e-06, + "loss": 0.1503, + "step": 1947 + }, + { + "epoch": 1.39, + "grad_norm": 21.855556241959924, + "learning_rate": 7.581312282431143e-06, + "loss": 0.0753, + "step": 1948 + }, + { + "epoch": 1.39, + "grad_norm": 29.170849942865587, + "learning_rate": 7.578836639250958e-06, + "loss": 0.0962, + "step": 1949 + }, + { + "epoch": 1.39, + "grad_norm": 11.690758795177931, + "learning_rate": 7.576360134433517e-06, + "loss": 0.0917, + "step": 1950 + }, + { + "epoch": 1.39, + "grad_norm": 12.408711643478298, + "learning_rate": 7.5738827688062676e-06, + "loss": 0.0862, + "step": 1951 + }, + { + "epoch": 1.39, + "grad_norm": 13.868396523210292, + "learning_rate": 7.571404543196943e-06, + "loss": 0.1176, + "step": 1952 + }, + { + "epoch": 1.39, + "grad_norm": 26.57491798793351, + "learning_rate": 7.568925458433567e-06, + "loss": 0.1008, + "step": 1953 + }, + { + "epoch": 1.39, + "grad_norm": 5.974416690063627, + "learning_rate": 7.566445515344445e-06, + "loss": 0.0807, + "step": 1954 + }, + { + "epoch": 1.4, + "grad_norm": 14.981211780495261, + "learning_rate": 7.563964714758172e-06, + "loss": 0.0732, + "step": 1955 + }, + { + "epoch": 1.4, + "grad_norm": 30.306827143553917, + "learning_rate": 7.561483057503632e-06, + "loss": 0.1293, + "step": 1956 + }, + { + "epoch": 1.4, + "grad_norm": 4.761468592139051, + "learning_rate": 7.559000544409991e-06, + "loss": 0.0891, + "step": 1957 + }, + { + "epoch": 1.4, + "grad_norm": 10.534845276627891, + "learning_rate": 7.556517176306704e-06, + "loss": 0.0975, + "step": 1958 + }, + { + "epoch": 1.4, + "grad_norm": 18.880359885969494, + "learning_rate": 7.554032954023508e-06, + "loss": 0.0979, + "step": 1959 + }, + { + "epoch": 1.4, + "grad_norm": 15.143403477025016, + "learning_rate": 7.55154787839043e-06, + "loss": 0.1144, + "step": 1960 + }, + { + "epoch": 1.4, + "grad_norm": 15.521438570335969, + "learning_rate": 7.5490619502377805e-06, + "loss": 0.1129, + "step": 1961 + }, + { + "epoch": 1.4, + "grad_norm": 18.5242277495622, + "learning_rate": 7.546575170396153e-06, + "loss": 0.1074, + "step": 1962 + }, + { + "epoch": 1.4, + "grad_norm": 24.927697963385498, + "learning_rate": 7.544087539696427e-06, + "loss": 0.1021, + "step": 1963 + }, + { + "epoch": 1.4, + "grad_norm": 10.703714901569482, + "learning_rate": 7.541599058969766e-06, + "loss": 0.0956, + "step": 1964 + }, + { + "epoch": 1.4, + "grad_norm": 34.83739720762142, + "learning_rate": 7.539109729047619e-06, + "loss": 0.121, + "step": 1965 + }, + { + "epoch": 1.4, + "grad_norm": 10.37898184340536, + "learning_rate": 7.5366195507617155e-06, + "loss": 0.0914, + "step": 1966 + }, + { + "epoch": 1.4, + "grad_norm": 32.17544750214366, + "learning_rate": 7.534128524944071e-06, + "loss": 0.0848, + "step": 1967 + }, + { + "epoch": 1.4, + "grad_norm": 47.63891616778719, + "learning_rate": 7.531636652426985e-06, + "loss": 0.144, + "step": 1968 + }, + { + "epoch": 1.41, + "grad_norm": 12.949628774093883, + "learning_rate": 7.529143934043036e-06, + "loss": 0.1061, + "step": 1969 + }, + { + "epoch": 1.41, + "grad_norm": 32.96795634535262, + "learning_rate": 7.526650370625088e-06, + "loss": 0.1438, + "step": 1970 + }, + { + "epoch": 1.41, + "grad_norm": 55.11333254799953, + "learning_rate": 7.5241559630062896e-06, + "loss": 0.1469, + "step": 1971 + }, + { + "epoch": 1.41, + "grad_norm": 18.925573978897116, + "learning_rate": 7.5216607120200655e-06, + "loss": 0.0874, + "step": 1972 + }, + { + "epoch": 1.41, + "grad_norm": 21.262962891421868, + "learning_rate": 7.519164618500127e-06, + "loss": 0.0812, + "step": 1973 + }, + { + "epoch": 1.41, + "grad_norm": 41.7596554934021, + "learning_rate": 7.5166676832804655e-06, + "loss": 0.1268, + "step": 1974 + }, + { + "epoch": 1.41, + "grad_norm": 11.588135089329397, + "learning_rate": 7.514169907195352e-06, + "loss": 0.0864, + "step": 1975 + }, + { + "epoch": 1.41, + "grad_norm": 5.342692979719796, + "learning_rate": 7.511671291079342e-06, + "loss": 0.0705, + "step": 1976 + }, + { + "epoch": 1.41, + "grad_norm": 5.712686440388007, + "learning_rate": 7.509171835767268e-06, + "loss": 0.0907, + "step": 1977 + }, + { + "epoch": 1.41, + "grad_norm": 36.70804565028893, + "learning_rate": 7.506671542094246e-06, + "loss": 0.1252, + "step": 1978 + }, + { + "epoch": 1.41, + "grad_norm": 10.489002626915171, + "learning_rate": 7.504170410895668e-06, + "loss": 0.0996, + "step": 1979 + }, + { + "epoch": 1.41, + "grad_norm": 6.3444733677159295, + "learning_rate": 7.501668443007212e-06, + "loss": 0.0735, + "step": 1980 + }, + { + "epoch": 1.41, + "grad_norm": 11.093855896464442, + "learning_rate": 7.499165639264828e-06, + "loss": 0.115, + "step": 1981 + }, + { + "epoch": 1.41, + "grad_norm": 33.7558992291076, + "learning_rate": 7.496662000504752e-06, + "loss": 0.165, + "step": 1982 + }, + { + "epoch": 1.42, + "grad_norm": 10.907611777929603, + "learning_rate": 7.4941575275634945e-06, + "loss": 0.11, + "step": 1983 + }, + { + "epoch": 1.42, + "grad_norm": 28.17923318389303, + "learning_rate": 7.49165222127785e-06, + "loss": 0.0878, + "step": 1984 + }, + { + "epoch": 1.42, + "grad_norm": 15.100103791027154, + "learning_rate": 7.489146082484882e-06, + "loss": 0.0727, + "step": 1985 + }, + { + "epoch": 1.42, + "grad_norm": 12.562326160266043, + "learning_rate": 7.486639112021944e-06, + "loss": 0.0812, + "step": 1986 + }, + { + "epoch": 1.42, + "grad_norm": 31.277970594845105, + "learning_rate": 7.484131310726658e-06, + "loss": 0.0872, + "step": 1987 + }, + { + "epoch": 1.42, + "grad_norm": 14.473359713155952, + "learning_rate": 7.481622679436929e-06, + "loss": 0.0759, + "step": 1988 + }, + { + "epoch": 1.42, + "grad_norm": 7.391891563984436, + "learning_rate": 7.479113218990934e-06, + "loss": 0.0832, + "step": 1989 + }, + { + "epoch": 1.42, + "grad_norm": 6.8735457218956455, + "learning_rate": 7.4766029302271335e-06, + "loss": 0.108, + "step": 1990 + }, + { + "epoch": 1.42, + "grad_norm": 17.97877074755628, + "learning_rate": 7.474091813984261e-06, + "loss": 0.0837, + "step": 1991 + }, + { + "epoch": 1.42, + "grad_norm": 15.971760147819229, + "learning_rate": 7.471579871101326e-06, + "loss": 0.1023, + "step": 1992 + }, + { + "epoch": 1.42, + "grad_norm": 7.376728168558531, + "learning_rate": 7.4690671024176165e-06, + "loss": 0.1162, + "step": 1993 + }, + { + "epoch": 1.42, + "grad_norm": 9.360966837136843, + "learning_rate": 7.466553508772695e-06, + "loss": 0.0769, + "step": 1994 + }, + { + "epoch": 1.42, + "grad_norm": 8.398769699308323, + "learning_rate": 7.4640390910064e-06, + "loss": 0.093, + "step": 1995 + }, + { + "epoch": 1.42, + "grad_norm": 19.282073477802033, + "learning_rate": 7.461523849958845e-06, + "loss": 0.1223, + "step": 1996 + }, + { + "epoch": 1.43, + "grad_norm": 9.488837006383456, + "learning_rate": 7.459007786470418e-06, + "loss": 0.0822, + "step": 1997 + }, + { + "epoch": 1.43, + "grad_norm": 16.12399085356851, + "learning_rate": 7.4564909013817845e-06, + "loss": 0.1228, + "step": 1998 + }, + { + "epoch": 1.43, + "grad_norm": 7.449212798239846, + "learning_rate": 7.45397319553388e-06, + "loss": 0.0666, + "step": 1999 + }, + { + "epoch": 1.43, + "grad_norm": 30.988058304482948, + "learning_rate": 7.451454669767919e-06, + "loss": 0.1119, + "step": 2000 + }, + { + "epoch": 1.43, + "eval_avg_AUC": 0.7993414523799219, + "eval_avg_Accuracy": 0.6962864721485411, + "eval_avg_Accuracy-right": 0.895069779574801, + "eval_avg_Accuracy-wrong": 0.34967022970206957, + "eval_avg_Num questions with both labels": 523, + "eval_avg_Question-wise AUC": 0.6534253440240803, + "eval_last_AUC": 0.796335909721104, + "eval_last_Accuracy": 0.7389340185676393, + "eval_last_Accuracy-right": 0.8014868918742664, + "eval_last_Accuracy-wrong": 0.6298612690470775, + "eval_last_Num questions with both labels": 523, + "eval_last_Question-wise AUC": 0.6609678828685568, + "eval_max_AUC": 0.731932816756531, + "eval_max_Accuracy": 0.6443136604774535, + "eval_max_Accuracy-right": 0.9846745793661145, + "eval_max_Accuracy-wrong": 0.05083011143961792, + "eval_max_Num questions with both labels": 523, + "eval_max_Question-wise AUC": 0.597156835188284, + "eval_min_AUC": 0.8068730624550771, + "eval_min_Accuracy": 0.7409234084880637, + "eval_min_Accuracy-right": 0.7598800052171645, + "eval_min_Accuracy-wrong": 0.7078690015919945, + "eval_min_Num questions with both labels": 523, + "eval_min_Question-wise AUC": 0.662910126847143, + "eval_prod_AUC": 0.8081306191935946, + "eval_prod_Accuracy": 0.7138594164456233, + "eval_prod_Accuracy-right": 0.6302334681100822, + "eval_prod_Accuracy-wrong": 0.8596770525358198, + "eval_prod_Num questions with both labels": 523, + "eval_prod_Question-wise AUC": 0.653658027762353, + "eval_runtime": 246.3352, + "eval_samples_per_second": 97.948, + "eval_steps_per_second": 3.061, + "eval_sum_AUC": 0.6609696244629366, + "eval_sum_Accuracy": 0.6379310344827587, + "eval_sum_Accuracy-right": 0.9988261379940002, + "eval_sum_Accuracy-wrong": 0.008642256083693428, + "eval_sum_Num questions with both labels": 523, + "eval_sum_Question-wise AUC": 0.6421408268547636, + "step": 2000 + }, + { + "epoch": 1.43, + "grad_norm": 13.839120903654035, + "learning_rate": 7.448935324925386e-06, + "loss": 0.0573, + "step": 2001 + }, + { + "epoch": 1.43, + "grad_norm": 6.253220256521355, + "learning_rate": 7.446415161848043e-06, + "loss": 0.0869, + "step": 2002 + }, + { + "epoch": 1.43, + "grad_norm": 10.932247825123236, + "learning_rate": 7.443894181377921e-06, + "loss": 0.0869, + "step": 2003 + }, + { + "epoch": 1.43, + "grad_norm": 14.774479223409482, + "learning_rate": 7.441372384357328e-06, + "loss": 0.0901, + "step": 2004 + }, + { + "epoch": 1.43, + "grad_norm": 28.556495798503843, + "learning_rate": 7.438849771628844e-06, + "loss": 0.1105, + "step": 2005 + }, + { + "epoch": 1.43, + "grad_norm": 10.991255023773615, + "learning_rate": 7.43632634403532e-06, + "loss": 0.0887, + "step": 2006 + }, + { + "epoch": 1.43, + "grad_norm": 10.019756556058779, + "learning_rate": 7.433802102419878e-06, + "loss": 0.1019, + "step": 2007 + }, + { + "epoch": 1.43, + "grad_norm": 26.827197018870162, + "learning_rate": 7.431277047625918e-06, + "loss": 0.1045, + "step": 2008 + }, + { + "epoch": 1.43, + "grad_norm": 40.7772525438193, + "learning_rate": 7.428751180497104e-06, + "loss": 0.1014, + "step": 2009 + }, + { + "epoch": 1.43, + "grad_norm": 10.794657275533215, + "learning_rate": 7.426224501877376e-06, + "loss": 0.1172, + "step": 2010 + }, + { + "epoch": 1.44, + "grad_norm": 4.9017737328348465, + "learning_rate": 7.423697012610947e-06, + "loss": 0.0688, + "step": 2011 + }, + { + "epoch": 1.44, + "grad_norm": 32.11030414551294, + "learning_rate": 7.421168713542294e-06, + "loss": 0.1136, + "step": 2012 + }, + { + "epoch": 1.44, + "grad_norm": 23.50894816197612, + "learning_rate": 7.418639605516172e-06, + "loss": 0.103, + "step": 2013 + }, + { + "epoch": 1.44, + "grad_norm": 7.586007382354752, + "learning_rate": 7.416109689377603e-06, + "loss": 0.121, + "step": 2014 + }, + { + "epoch": 1.44, + "grad_norm": 8.23229798707242, + "learning_rate": 7.413578965971876e-06, + "loss": 0.0738, + "step": 2015 + }, + { + "epoch": 1.44, + "grad_norm": 5.177266007361741, + "learning_rate": 7.411047436144556e-06, + "loss": 0.0636, + "step": 2016 + }, + { + "epoch": 1.44, + "grad_norm": 7.750691106855549, + "learning_rate": 7.408515100741471e-06, + "loss": 0.1152, + "step": 2017 + }, + { + "epoch": 1.44, + "grad_norm": 33.35658858310154, + "learning_rate": 7.405981960608725e-06, + "loss": 0.0986, + "step": 2018 + }, + { + "epoch": 1.44, + "grad_norm": 12.949061067234727, + "learning_rate": 7.403448016592685e-06, + "loss": 0.0984, + "step": 2019 + }, + { + "epoch": 1.44, + "grad_norm": 30.906573797223782, + "learning_rate": 7.400913269539988e-06, + "loss": 0.119, + "step": 2020 + }, + { + "epoch": 1.44, + "grad_norm": 14.698468315096022, + "learning_rate": 7.398377720297541e-06, + "loss": 0.0934, + "step": 2021 + }, + { + "epoch": 1.44, + "grad_norm": 38.759538949910294, + "learning_rate": 7.39584136971252e-06, + "loss": 0.1078, + "step": 2022 + }, + { + "epoch": 1.44, + "grad_norm": 12.61303665133732, + "learning_rate": 7.393304218632364e-06, + "loss": 0.0614, + "step": 2023 + }, + { + "epoch": 1.44, + "grad_norm": 33.99016387504461, + "learning_rate": 7.390766267904783e-06, + "loss": 0.1074, + "step": 2024 + }, + { + "epoch": 1.45, + "grad_norm": 25.92261989754995, + "learning_rate": 7.3882275183777554e-06, + "loss": 0.1655, + "step": 2025 + }, + { + "epoch": 1.45, + "grad_norm": 40.11705918481798, + "learning_rate": 7.385687970899523e-06, + "loss": 0.1057, + "step": 2026 + }, + { + "epoch": 1.45, + "grad_norm": 10.286685919964656, + "learning_rate": 7.3831476263185965e-06, + "loss": 0.0775, + "step": 2027 + }, + { + "epoch": 1.45, + "grad_norm": 15.0192688550375, + "learning_rate": 7.380606485483751e-06, + "loss": 0.0768, + "step": 2028 + }, + { + "epoch": 1.45, + "grad_norm": 55.38449400925222, + "learning_rate": 7.378064549244031e-06, + "loss": 0.1389, + "step": 2029 + }, + { + "epoch": 1.45, + "grad_norm": 23.97034166915613, + "learning_rate": 7.375521818448741e-06, + "loss": 0.1333, + "step": 2030 + }, + { + "epoch": 1.45, + "grad_norm": 13.121072563372268, + "learning_rate": 7.372978293947459e-06, + "loss": 0.0995, + "step": 2031 + }, + { + "epoch": 1.45, + "grad_norm": 45.71363021283654, + "learning_rate": 7.3704339765900205e-06, + "loss": 0.1544, + "step": 2032 + }, + { + "epoch": 1.45, + "grad_norm": 29.422717589921906, + "learning_rate": 7.367888867226531e-06, + "loss": 0.1324, + "step": 2033 + }, + { + "epoch": 1.45, + "grad_norm": 38.96958949899909, + "learning_rate": 7.365342966707359e-06, + "loss": 0.1, + "step": 2034 + }, + { + "epoch": 1.45, + "grad_norm": 6.289954542505453, + "learning_rate": 7.362796275883135e-06, + "loss": 0.0754, + "step": 2035 + }, + { + "epoch": 1.45, + "grad_norm": 10.57359176786369, + "learning_rate": 7.360248795604758e-06, + "loss": 0.0861, + "step": 2036 + }, + { + "epoch": 1.45, + "grad_norm": 24.090251255820395, + "learning_rate": 7.3577005267233885e-06, + "loss": 0.0708, + "step": 2037 + }, + { + "epoch": 1.45, + "grad_norm": 37.93968118245042, + "learning_rate": 7.355151470090449e-06, + "loss": 0.1194, + "step": 2038 + }, + { + "epoch": 1.46, + "grad_norm": 13.449078150315941, + "learning_rate": 7.352601626557628e-06, + "loss": 0.0925, + "step": 2039 + }, + { + "epoch": 1.46, + "grad_norm": 18.95997171971615, + "learning_rate": 7.350050996976875e-06, + "loss": 0.1005, + "step": 2040 + }, + { + "epoch": 1.46, + "grad_norm": 26.92934782855485, + "learning_rate": 7.347499582200404e-06, + "loss": 0.0961, + "step": 2041 + }, + { + "epoch": 1.46, + "grad_norm": 22.73730072835748, + "learning_rate": 7.344947383080687e-06, + "loss": 0.1058, + "step": 2042 + }, + { + "epoch": 1.46, + "grad_norm": 22.503461102204398, + "learning_rate": 7.342394400470463e-06, + "loss": 0.0862, + "step": 2043 + }, + { + "epoch": 1.46, + "grad_norm": 20.649364677252795, + "learning_rate": 7.339840635222732e-06, + "loss": 0.0664, + "step": 2044 + }, + { + "epoch": 1.46, + "grad_norm": 11.339548594219169, + "learning_rate": 7.337286088190754e-06, + "loss": 0.0889, + "step": 2045 + }, + { + "epoch": 1.46, + "grad_norm": 11.400084265838423, + "learning_rate": 7.334730760228049e-06, + "loss": 0.0951, + "step": 2046 + }, + { + "epoch": 1.46, + "grad_norm": 14.486076375014688, + "learning_rate": 7.332174652188401e-06, + "loss": 0.1224, + "step": 2047 + }, + { + "epoch": 1.46, + "grad_norm": 34.7417013692277, + "learning_rate": 7.329617764925853e-06, + "loss": 0.1178, + "step": 2048 + }, + { + "epoch": 1.46, + "grad_norm": 30.777860474477603, + "learning_rate": 7.32706009929471e-06, + "loss": 0.1587, + "step": 2049 + }, + { + "epoch": 1.46, + "grad_norm": 18.592979828725138, + "learning_rate": 7.324501656149532e-06, + "loss": 0.1007, + "step": 2050 + }, + { + "epoch": 1.46, + "grad_norm": 42.347294863149365, + "learning_rate": 7.321942436345146e-06, + "loss": 0.1045, + "step": 2051 + }, + { + "epoch": 1.46, + "grad_norm": 21.001661806227776, + "learning_rate": 7.319382440736632e-06, + "loss": 0.1101, + "step": 2052 + }, + { + "epoch": 1.47, + "grad_norm": 19.577035199548465, + "learning_rate": 7.316821670179335e-06, + "loss": 0.1144, + "step": 2053 + }, + { + "epoch": 1.47, + "grad_norm": 11.230363742790843, + "learning_rate": 7.314260125528854e-06, + "loss": 0.1404, + "step": 2054 + }, + { + "epoch": 1.47, + "grad_norm": 17.499694248271904, + "learning_rate": 7.311697807641048e-06, + "loss": 0.0714, + "step": 2055 + }, + { + "epoch": 1.47, + "grad_norm": 23.806058024752186, + "learning_rate": 7.3091347173720386e-06, + "loss": 0.1033, + "step": 2056 + }, + { + "epoch": 1.47, + "grad_norm": 6.085617803027079, + "learning_rate": 7.3065708555781986e-06, + "loss": 0.0939, + "step": 2057 + }, + { + "epoch": 1.47, + "grad_norm": 34.50468476483311, + "learning_rate": 7.304006223116162e-06, + "loss": 0.1208, + "step": 2058 + }, + { + "epoch": 1.47, + "grad_norm": 13.365971501697732, + "learning_rate": 7.301440820842822e-06, + "loss": 0.09, + "step": 2059 + }, + { + "epoch": 1.47, + "grad_norm": 10.044928264199827, + "learning_rate": 7.298874649615327e-06, + "loss": 0.153, + "step": 2060 + }, + { + "epoch": 1.47, + "grad_norm": 28.362345328223054, + "learning_rate": 7.29630771029108e-06, + "loss": 0.0945, + "step": 2061 + }, + { + "epoch": 1.47, + "grad_norm": 6.658453340675622, + "learning_rate": 7.293740003727745e-06, + "loss": 0.0809, + "step": 2062 + }, + { + "epoch": 1.47, + "grad_norm": 15.121775553304609, + "learning_rate": 7.291171530783241e-06, + "loss": 0.1129, + "step": 2063 + }, + { + "epoch": 1.47, + "grad_norm": 11.416830583602653, + "learning_rate": 7.288602292315742e-06, + "loss": 0.1333, + "step": 2064 + }, + { + "epoch": 1.47, + "grad_norm": 5.084441019797188, + "learning_rate": 7.286032289183679e-06, + "loss": 0.1097, + "step": 2065 + }, + { + "epoch": 1.47, + "grad_norm": 15.642621559627102, + "learning_rate": 7.283461522245736e-06, + "loss": 0.1122, + "step": 2066 + }, + { + "epoch": 1.48, + "grad_norm": 41.35476137894392, + "learning_rate": 7.280889992360856e-06, + "loss": 0.1206, + "step": 2067 + }, + { + "epoch": 1.48, + "grad_norm": 21.18129028339432, + "learning_rate": 7.278317700388232e-06, + "loss": 0.1133, + "step": 2068 + }, + { + "epoch": 1.48, + "grad_norm": 19.902882472946708, + "learning_rate": 7.275744647187318e-06, + "loss": 0.1512, + "step": 2069 + }, + { + "epoch": 1.48, + "grad_norm": 15.939602818419818, + "learning_rate": 7.273170833617818e-06, + "loss": 0.1044, + "step": 2070 + }, + { + "epoch": 1.48, + "grad_norm": 31.533034131028536, + "learning_rate": 7.2705962605396895e-06, + "loss": 0.1394, + "step": 2071 + }, + { + "epoch": 1.48, + "grad_norm": 9.111943201249398, + "learning_rate": 7.268020928813147e-06, + "loss": 0.0912, + "step": 2072 + }, + { + "epoch": 1.48, + "grad_norm": 13.514191499285351, + "learning_rate": 7.265444839298656e-06, + "loss": 0.0923, + "step": 2073 + }, + { + "epoch": 1.48, + "grad_norm": 6.330749877489256, + "learning_rate": 7.262867992856934e-06, + "loss": 0.0975, + "step": 2074 + }, + { + "epoch": 1.48, + "grad_norm": 30.935621396553604, + "learning_rate": 7.260290390348956e-06, + "loss": 0.1388, + "step": 2075 + }, + { + "epoch": 1.48, + "grad_norm": 6.688613763402811, + "learning_rate": 7.257712032635946e-06, + "loss": 0.1047, + "step": 2076 + }, + { + "epoch": 1.48, + "grad_norm": 13.33335451161966, + "learning_rate": 7.255132920579382e-06, + "loss": 0.1041, + "step": 2077 + }, + { + "epoch": 1.48, + "grad_norm": 8.619130876094033, + "learning_rate": 7.252553055040991e-06, + "loss": 0.0897, + "step": 2078 + }, + { + "epoch": 1.48, + "grad_norm": 12.168562646783498, + "learning_rate": 7.249972436882756e-06, + "loss": 0.1006, + "step": 2079 + }, + { + "epoch": 1.48, + "grad_norm": 14.278860858221417, + "learning_rate": 7.247391066966909e-06, + "loss": 0.0814, + "step": 2080 + }, + { + "epoch": 1.49, + "grad_norm": 13.260089786437403, + "learning_rate": 7.244808946155933e-06, + "loss": 0.0886, + "step": 2081 + }, + { + "epoch": 1.49, + "grad_norm": 5.521768030802181, + "learning_rate": 7.242226075312564e-06, + "loss": 0.0858, + "step": 2082 + }, + { + "epoch": 1.49, + "grad_norm": 13.198034640001179, + "learning_rate": 7.239642455299787e-06, + "loss": 0.0963, + "step": 2083 + }, + { + "epoch": 1.49, + "grad_norm": 35.149002150189034, + "learning_rate": 7.237058086980835e-06, + "loss": 0.1558, + "step": 2084 + }, + { + "epoch": 1.49, + "grad_norm": 10.810236800782775, + "learning_rate": 7.234472971219197e-06, + "loss": 0.08, + "step": 2085 + }, + { + "epoch": 1.49, + "grad_norm": 14.523629121141207, + "learning_rate": 7.231887108878606e-06, + "loss": 0.1177, + "step": 2086 + }, + { + "epoch": 1.49, + "grad_norm": 9.493707671164756, + "learning_rate": 7.229300500823047e-06, + "loss": 0.1161, + "step": 2087 + }, + { + "epoch": 1.49, + "grad_norm": 7.760754279339331, + "learning_rate": 7.226713147916754e-06, + "loss": 0.0781, + "step": 2088 + }, + { + "epoch": 1.49, + "grad_norm": 10.837060450720395, + "learning_rate": 7.22412505102421e-06, + "loss": 0.0732, + "step": 2089 + }, + { + "epoch": 1.49, + "grad_norm": 21.583304857531907, + "learning_rate": 7.221536211010147e-06, + "loss": 0.0729, + "step": 2090 + }, + { + "epoch": 1.49, + "grad_norm": 12.445127735446684, + "learning_rate": 7.2189466287395425e-06, + "loss": 0.0918, + "step": 2091 + }, + { + "epoch": 1.49, + "grad_norm": 8.94159436880388, + "learning_rate": 7.216356305077625e-06, + "loss": 0.1051, + "step": 2092 + }, + { + "epoch": 1.49, + "grad_norm": 18.138755911165397, + "learning_rate": 7.21376524088987e-06, + "loss": 0.1355, + "step": 2093 + }, + { + "epoch": 1.49, + "grad_norm": 15.689142765328521, + "learning_rate": 7.211173437042001e-06, + "loss": 0.0781, + "step": 2094 + }, + { + "epoch": 1.5, + "grad_norm": 37.30887264410398, + "learning_rate": 7.208580894399986e-06, + "loss": 0.1034, + "step": 2095 + }, + { + "epoch": 1.5, + "grad_norm": 27.998517100165653, + "learning_rate": 7.205987613830043e-06, + "loss": 0.1226, + "step": 2096 + }, + { + "epoch": 1.5, + "grad_norm": 27.401516777324407, + "learning_rate": 7.203393596198635e-06, + "loss": 0.1133, + "step": 2097 + }, + { + "epoch": 1.5, + "grad_norm": 23.244970427895833, + "learning_rate": 7.200798842372472e-06, + "loss": 0.1119, + "step": 2098 + }, + { + "epoch": 1.5, + "grad_norm": 24.537751698472384, + "learning_rate": 7.198203353218508e-06, + "loss": 0.1145, + "step": 2099 + }, + { + "epoch": 1.5, + "grad_norm": 6.5385945727391395, + "learning_rate": 7.195607129603946e-06, + "loss": 0.0945, + "step": 2100 + }, + { + "epoch": 1.5, + "grad_norm": 17.09406318610685, + "learning_rate": 7.19301017239623e-06, + "loss": 0.0776, + "step": 2101 + }, + { + "epoch": 1.5, + "grad_norm": 7.953522177542275, + "learning_rate": 7.190412482463054e-06, + "loss": 0.1013, + "step": 2102 + }, + { + "epoch": 1.5, + "grad_norm": 31.262974276965277, + "learning_rate": 7.187814060672354e-06, + "loss": 0.1171, + "step": 2103 + }, + { + "epoch": 1.5, + "grad_norm": 10.270590362702018, + "learning_rate": 7.1852149078923105e-06, + "loss": 0.1014, + "step": 2104 + }, + { + "epoch": 1.5, + "grad_norm": 12.499466735424543, + "learning_rate": 7.1826150249913495e-06, + "loss": 0.131, + "step": 2105 + }, + { + "epoch": 1.5, + "grad_norm": 26.68680284528321, + "learning_rate": 7.18001441283814e-06, + "loss": 0.0918, + "step": 2106 + }, + { + "epoch": 1.5, + "grad_norm": 42.91007158573037, + "learning_rate": 7.1774130723015955e-06, + "loss": 0.1384, + "step": 2107 + }, + { + "epoch": 1.5, + "grad_norm": 8.298129546397226, + "learning_rate": 7.17481100425087e-06, + "loss": 0.0711, + "step": 2108 + }, + { + "epoch": 1.51, + "grad_norm": 22.30804777349356, + "learning_rate": 7.172208209555365e-06, + "loss": 0.077, + "step": 2109 + }, + { + "epoch": 1.51, + "grad_norm": 21.45735632031128, + "learning_rate": 7.1696046890847206e-06, + "loss": 0.1058, + "step": 2110 + }, + { + "epoch": 1.51, + "grad_norm": 21.40607494365893, + "learning_rate": 7.167000443708823e-06, + "loss": 0.1253, + "step": 2111 + }, + { + "epoch": 1.51, + "grad_norm": 7.216056021799717, + "learning_rate": 7.164395474297798e-06, + "loss": 0.067, + "step": 2112 + }, + { + "epoch": 1.51, + "grad_norm": 22.5190941505317, + "learning_rate": 7.161789781722016e-06, + "loss": 0.1038, + "step": 2113 + }, + { + "epoch": 1.51, + "grad_norm": 24.586064614156214, + "learning_rate": 7.159183366852085e-06, + "loss": 0.1046, + "step": 2114 + }, + { + "epoch": 1.51, + "grad_norm": 11.51113614776396, + "learning_rate": 7.156576230558859e-06, + "loss": 0.1046, + "step": 2115 + }, + { + "epoch": 1.51, + "grad_norm": 70.95164168879344, + "learning_rate": 7.153968373713429e-06, + "loss": 0.1827, + "step": 2116 + }, + { + "epoch": 1.51, + "grad_norm": 6.699926065717833, + "learning_rate": 7.1513597971871295e-06, + "loss": 0.0992, + "step": 2117 + }, + { + "epoch": 1.51, + "grad_norm": 11.794229177328235, + "learning_rate": 7.148750501851532e-06, + "loss": 0.0793, + "step": 2118 + }, + { + "epoch": 1.51, + "grad_norm": 18.355183637296523, + "learning_rate": 7.1461404885784545e-06, + "loss": 0.1051, + "step": 2119 + }, + { + "epoch": 1.51, + "grad_norm": 32.86488907052558, + "learning_rate": 7.1435297582399475e-06, + "loss": 0.1007, + "step": 2120 + }, + { + "epoch": 1.51, + "grad_norm": 21.387125829223244, + "learning_rate": 7.140918311708306e-06, + "loss": 0.0792, + "step": 2121 + }, + { + "epoch": 1.51, + "grad_norm": 13.590993567540462, + "learning_rate": 7.138306149856062e-06, + "loss": 0.075, + "step": 2122 + }, + { + "epoch": 1.52, + "grad_norm": 6.447054862968271, + "learning_rate": 7.1356932735559905e-06, + "loss": 0.085, + "step": 2123 + }, + { + "epoch": 1.52, + "grad_norm": 25.482506147936167, + "learning_rate": 7.133079683681099e-06, + "loss": 0.1274, + "step": 2124 + }, + { + "epoch": 1.52, + "grad_norm": 9.387675092660638, + "learning_rate": 7.130465381104635e-06, + "loss": 0.0701, + "step": 2125 + }, + { + "epoch": 1.52, + "grad_norm": 8.715433430188718, + "learning_rate": 7.1278503667000885e-06, + "loss": 0.0797, + "step": 2126 + }, + { + "epoch": 1.52, + "grad_norm": 8.423200658182854, + "learning_rate": 7.125234641341185e-06, + "loss": 0.0934, + "step": 2127 + }, + { + "epoch": 1.52, + "grad_norm": 6.710068469682492, + "learning_rate": 7.1226182059018835e-06, + "loss": 0.0822, + "step": 2128 + }, + { + "epoch": 1.52, + "grad_norm": 13.329201988437877, + "learning_rate": 7.120001061256387e-06, + "loss": 0.0792, + "step": 2129 + }, + { + "epoch": 1.52, + "grad_norm": 22.059536956159867, + "learning_rate": 7.1173832082791294e-06, + "loss": 0.1038, + "step": 2130 + }, + { + "epoch": 1.52, + "grad_norm": 13.826617022708637, + "learning_rate": 7.114764647844788e-06, + "loss": 0.1299, + "step": 2131 + }, + { + "epoch": 1.52, + "grad_norm": 17.18931427687854, + "learning_rate": 7.112145380828267e-06, + "loss": 0.0983, + "step": 2132 + }, + { + "epoch": 1.52, + "grad_norm": 12.456228718757368, + "learning_rate": 7.109525408104717e-06, + "loss": 0.125, + "step": 2133 + }, + { + "epoch": 1.52, + "grad_norm": 12.457600501785418, + "learning_rate": 7.106904730549517e-06, + "loss": 0.0661, + "step": 2134 + }, + { + "epoch": 1.52, + "grad_norm": 19.169864989415867, + "learning_rate": 7.104283349038285e-06, + "loss": 0.0616, + "step": 2135 + }, + { + "epoch": 1.52, + "grad_norm": 28.451596210521583, + "learning_rate": 7.101661264446875e-06, + "loss": 0.1067, + "step": 2136 + }, + { + "epoch": 1.53, + "grad_norm": 11.557672547776232, + "learning_rate": 7.099038477651371e-06, + "loss": 0.0716, + "step": 2137 + }, + { + "epoch": 1.53, + "grad_norm": 13.915155495212185, + "learning_rate": 7.096414989528095e-06, + "loss": 0.0714, + "step": 2138 + }, + { + "epoch": 1.53, + "grad_norm": 28.111132386929246, + "learning_rate": 7.093790800953606e-06, + "loss": 0.0987, + "step": 2139 + }, + { + "epoch": 1.53, + "grad_norm": 13.342771869341991, + "learning_rate": 7.091165912804693e-06, + "loss": 0.1157, + "step": 2140 + }, + { + "epoch": 1.53, + "grad_norm": 10.294032473258458, + "learning_rate": 7.088540325958379e-06, + "loss": 0.1033, + "step": 2141 + }, + { + "epoch": 1.53, + "grad_norm": 6.93465060964492, + "learning_rate": 7.085914041291921e-06, + "loss": 0.0911, + "step": 2142 + }, + { + "epoch": 1.53, + "grad_norm": 17.880292154170586, + "learning_rate": 7.08328705968281e-06, + "loss": 0.1404, + "step": 2143 + }, + { + "epoch": 1.53, + "grad_norm": 22.929359757090396, + "learning_rate": 7.080659382008772e-06, + "loss": 0.1053, + "step": 2144 + }, + { + "epoch": 1.53, + "grad_norm": 7.958105578684251, + "learning_rate": 7.078031009147759e-06, + "loss": 0.1392, + "step": 2145 + }, + { + "epoch": 1.53, + "grad_norm": 14.942761890735525, + "learning_rate": 7.075401941977961e-06, + "loss": 0.0994, + "step": 2146 + }, + { + "epoch": 1.53, + "grad_norm": 6.000079482176476, + "learning_rate": 7.072772181377798e-06, + "loss": 0.0935, + "step": 2147 + }, + { + "epoch": 1.53, + "grad_norm": 9.21102239643325, + "learning_rate": 7.070141728225922e-06, + "loss": 0.0652, + "step": 2148 + }, + { + "epoch": 1.53, + "grad_norm": 10.92020872783142, + "learning_rate": 7.067510583401217e-06, + "loss": 0.0845, + "step": 2149 + }, + { + "epoch": 1.53, + "grad_norm": 4.251963896243572, + "learning_rate": 7.0648787477827965e-06, + "loss": 0.0601, + "step": 2150 + }, + { + "epoch": 1.54, + "grad_norm": 7.8790995364041905, + "learning_rate": 7.062246222250005e-06, + "loss": 0.1079, + "step": 2151 + }, + { + "epoch": 1.54, + "grad_norm": 16.979064512752753, + "learning_rate": 7.05961300768242e-06, + "loss": 0.1107, + "step": 2152 + }, + { + "epoch": 1.54, + "grad_norm": 39.1816186306808, + "learning_rate": 7.056979104959847e-06, + "loss": 0.1908, + "step": 2153 + }, + { + "epoch": 1.54, + "grad_norm": 15.074164527605571, + "learning_rate": 7.054344514962319e-06, + "loss": 0.0779, + "step": 2154 + }, + { + "epoch": 1.54, + "grad_norm": 19.191667670584966, + "learning_rate": 7.051709238570106e-06, + "loss": 0.1405, + "step": 2155 + }, + { + "epoch": 1.54, + "grad_norm": 27.248491502820745, + "learning_rate": 7.0490732766637e-06, + "loss": 0.1021, + "step": 2156 + }, + { + "epoch": 1.54, + "grad_norm": 9.506985241043234, + "learning_rate": 7.046436630123826e-06, + "loss": 0.1384, + "step": 2157 + }, + { + "epoch": 1.54, + "grad_norm": 6.615642105259546, + "learning_rate": 7.043799299831438e-06, + "loss": 0.1138, + "step": 2158 + }, + { + "epoch": 1.54, + "grad_norm": 8.07347512799611, + "learning_rate": 7.041161286667713e-06, + "loss": 0.1147, + "step": 2159 + }, + { + "epoch": 1.54, + "grad_norm": 24.75914750945603, + "learning_rate": 7.038522591514061e-06, + "loss": 0.0977, + "step": 2160 + }, + { + "epoch": 1.54, + "grad_norm": 20.871761477687055, + "learning_rate": 7.035883215252123e-06, + "loss": 0.0725, + "step": 2161 + }, + { + "epoch": 1.54, + "grad_norm": 7.2177050883819165, + "learning_rate": 7.03324315876376e-06, + "loss": 0.0817, + "step": 2162 + }, + { + "epoch": 1.54, + "grad_norm": 10.886438037155624, + "learning_rate": 7.030602422931065e-06, + "loss": 0.0947, + "step": 2163 + }, + { + "epoch": 1.54, + "grad_norm": 32.18182550206542, + "learning_rate": 7.027961008636359e-06, + "loss": 0.1033, + "step": 2164 + }, + { + "epoch": 1.55, + "grad_norm": 10.8123263405742, + "learning_rate": 7.025318916762185e-06, + "loss": 0.0902, + "step": 2165 + }, + { + "epoch": 1.55, + "grad_norm": 5.74661964617569, + "learning_rate": 7.022676148191315e-06, + "loss": 0.1548, + "step": 2166 + }, + { + "epoch": 1.55, + "grad_norm": 11.66124484632883, + "learning_rate": 7.020032703806748e-06, + "loss": 0.1166, + "step": 2167 + }, + { + "epoch": 1.55, + "grad_norm": 5.129462459098309, + "learning_rate": 7.017388584491709e-06, + "loss": 0.0863, + "step": 2168 + }, + { + "epoch": 1.55, + "grad_norm": 19.70040390124027, + "learning_rate": 7.014743791129644e-06, + "loss": 0.1052, + "step": 2169 + }, + { + "epoch": 1.55, + "grad_norm": 4.783745164454452, + "learning_rate": 7.012098324604231e-06, + "loss": 0.0804, + "step": 2170 + }, + { + "epoch": 1.55, + "grad_norm": 5.83062015200628, + "learning_rate": 7.009452185799368e-06, + "loss": 0.0876, + "step": 2171 + }, + { + "epoch": 1.55, + "grad_norm": 8.96404912676571, + "learning_rate": 7.00680537559918e-06, + "loss": 0.0917, + "step": 2172 + }, + { + "epoch": 1.55, + "grad_norm": 26.88319898284892, + "learning_rate": 7.0041578948880155e-06, + "loss": 0.0736, + "step": 2173 + }, + { + "epoch": 1.55, + "grad_norm": 11.385232172270138, + "learning_rate": 7.001509744550446e-06, + "loss": 0.1053, + "step": 2174 + }, + { + "epoch": 1.55, + "grad_norm": 7.771562344479279, + "learning_rate": 6.998860925471267e-06, + "loss": 0.0853, + "step": 2175 + }, + { + "epoch": 1.55, + "grad_norm": 18.270461704058164, + "learning_rate": 6.9962114385355e-06, + "loss": 0.0884, + "step": 2176 + }, + { + "epoch": 1.55, + "grad_norm": 43.811495080381825, + "learning_rate": 6.993561284628388e-06, + "loss": 0.1189, + "step": 2177 + }, + { + "epoch": 1.55, + "grad_norm": 9.153350063653045, + "learning_rate": 6.990910464635395e-06, + "loss": 0.075, + "step": 2178 + }, + { + "epoch": 1.56, + "grad_norm": 16.548387260500697, + "learning_rate": 6.9882589794422105e-06, + "loss": 0.0931, + "step": 2179 + }, + { + "epoch": 1.56, + "grad_norm": 25.197771744639976, + "learning_rate": 6.9856068299347455e-06, + "loss": 0.1284, + "step": 2180 + }, + { + "epoch": 1.56, + "grad_norm": 30.717886675806074, + "learning_rate": 6.98295401699913e-06, + "loss": 0.0989, + "step": 2181 + }, + { + "epoch": 1.56, + "grad_norm": 22.815922730000807, + "learning_rate": 6.980300541521721e-06, + "loss": 0.1202, + "step": 2182 + }, + { + "epoch": 1.56, + "grad_norm": 8.881826965201908, + "learning_rate": 6.977646404389092e-06, + "loss": 0.0764, + "step": 2183 + }, + { + "epoch": 1.56, + "grad_norm": 74.805756583807, + "learning_rate": 6.9749916064880404e-06, + "loss": 0.1982, + "step": 2184 + }, + { + "epoch": 1.56, + "grad_norm": 15.273061210962908, + "learning_rate": 6.972336148705583e-06, + "loss": 0.0898, + "step": 2185 + }, + { + "epoch": 1.56, + "grad_norm": 26.839917671298732, + "learning_rate": 6.969680031928959e-06, + "loss": 0.1118, + "step": 2186 + }, + { + "epoch": 1.56, + "grad_norm": 18.784018802631863, + "learning_rate": 6.967023257045624e-06, + "loss": 0.1224, + "step": 2187 + }, + { + "epoch": 1.56, + "grad_norm": 42.06255919354335, + "learning_rate": 6.96436582494326e-06, + "loss": 0.0928, + "step": 2188 + }, + { + "epoch": 1.56, + "grad_norm": 38.1069266876009, + "learning_rate": 6.961707736509759e-06, + "loss": 0.1373, + "step": 2189 + }, + { + "epoch": 1.56, + "grad_norm": 29.757725569858078, + "learning_rate": 6.959048992633241e-06, + "loss": 0.0897, + "step": 2190 + }, + { + "epoch": 1.56, + "grad_norm": 4.726770945951311, + "learning_rate": 6.956389594202041e-06, + "loss": 0.0917, + "step": 2191 + }, + { + "epoch": 1.56, + "grad_norm": 36.79881925491694, + "learning_rate": 6.953729542104713e-06, + "loss": 0.1097, + "step": 2192 + }, + { + "epoch": 1.57, + "grad_norm": 44.56686035867685, + "learning_rate": 6.951068837230032e-06, + "loss": 0.1515, + "step": 2193 + }, + { + "epoch": 1.57, + "grad_norm": 27.897014911286362, + "learning_rate": 6.9484074804669865e-06, + "loss": 0.1221, + "step": 2194 + }, + { + "epoch": 1.57, + "grad_norm": 17.338158874149745, + "learning_rate": 6.945745472704786e-06, + "loss": 0.1188, + "step": 2195 + }, + { + "epoch": 1.57, + "grad_norm": 38.83775106356088, + "learning_rate": 6.943082814832858e-06, + "loss": 0.0985, + "step": 2196 + }, + { + "epoch": 1.57, + "grad_norm": 44.22872017990922, + "learning_rate": 6.940419507740843e-06, + "loss": 0.1453, + "step": 2197 + }, + { + "epoch": 1.57, + "grad_norm": 23.616953797426014, + "learning_rate": 6.937755552318606e-06, + "loss": 0.0958, + "step": 2198 + }, + { + "epoch": 1.57, + "grad_norm": 16.228217397227283, + "learning_rate": 6.935090949456219e-06, + "loss": 0.0956, + "step": 2199 + }, + { + "epoch": 1.57, + "grad_norm": 16.949120884184445, + "learning_rate": 6.93242570004398e-06, + "loss": 0.1097, + "step": 2200 + }, + { + "epoch": 1.57, + "grad_norm": 45.65816773721843, + "learning_rate": 6.929759804972394e-06, + "loss": 0.1361, + "step": 2201 + }, + { + "epoch": 1.57, + "grad_norm": 28.0710224178119, + "learning_rate": 6.92709326513219e-06, + "loss": 0.11, + "step": 2202 + }, + { + "epoch": 1.57, + "grad_norm": 13.390078383095538, + "learning_rate": 6.924426081414305e-06, + "loss": 0.088, + "step": 2203 + }, + { + "epoch": 1.57, + "grad_norm": 5.427364387155767, + "learning_rate": 6.921758254709897e-06, + "loss": 0.0829, + "step": 2204 + }, + { + "epoch": 1.57, + "grad_norm": 26.214900361222632, + "learning_rate": 6.919089785910336e-06, + "loss": 0.0849, + "step": 2205 + }, + { + "epoch": 1.57, + "grad_norm": 31.71611812858684, + "learning_rate": 6.916420675907207e-06, + "loss": 0.0957, + "step": 2206 + }, + { + "epoch": 1.58, + "grad_norm": 21.55026185223569, + "learning_rate": 6.9137509255923085e-06, + "loss": 0.1045, + "step": 2207 + }, + { + "epoch": 1.58, + "grad_norm": 5.845490265416999, + "learning_rate": 6.911080535857655e-06, + "loss": 0.0778, + "step": 2208 + }, + { + "epoch": 1.58, + "grad_norm": 18.755449635394918, + "learning_rate": 6.908409507595472e-06, + "loss": 0.114, + "step": 2209 + }, + { + "epoch": 1.58, + "grad_norm": 27.69838224283327, + "learning_rate": 6.905737841698201e-06, + "loss": 0.1097, + "step": 2210 + }, + { + "epoch": 1.58, + "grad_norm": 13.295405659497893, + "learning_rate": 6.903065539058496e-06, + "loss": 0.0918, + "step": 2211 + }, + { + "epoch": 1.58, + "grad_norm": 9.442426195870846, + "learning_rate": 6.900392600569219e-06, + "loss": 0.0647, + "step": 2212 + }, + { + "epoch": 1.58, + "grad_norm": 8.137208442510627, + "learning_rate": 6.897719027123451e-06, + "loss": 0.1304, + "step": 2213 + }, + { + "epoch": 1.58, + "grad_norm": 29.345677962603688, + "learning_rate": 6.895044819614484e-06, + "loss": 0.1, + "step": 2214 + }, + { + "epoch": 1.58, + "grad_norm": 17.490421643869904, + "learning_rate": 6.8923699789358185e-06, + "loss": 0.0791, + "step": 2215 + }, + { + "epoch": 1.58, + "grad_norm": 14.661343717167206, + "learning_rate": 6.88969450598117e-06, + "loss": 0.068, + "step": 2216 + }, + { + "epoch": 1.58, + "grad_norm": 18.33228470361208, + "learning_rate": 6.887018401644463e-06, + "loss": 0.1045, + "step": 2217 + }, + { + "epoch": 1.58, + "grad_norm": 11.382139849610716, + "learning_rate": 6.884341666819832e-06, + "loss": 0.0778, + "step": 2218 + }, + { + "epoch": 1.58, + "grad_norm": 9.342522202577735, + "learning_rate": 6.881664302401626e-06, + "loss": 0.0865, + "step": 2219 + }, + { + "epoch": 1.58, + "grad_norm": 11.14544309202564, + "learning_rate": 6.878986309284401e-06, + "loss": 0.0933, + "step": 2220 + }, + { + "epoch": 1.59, + "grad_norm": 6.725068137788035, + "learning_rate": 6.876307688362925e-06, + "loss": 0.0895, + "step": 2221 + }, + { + "epoch": 1.59, + "grad_norm": 5.054484543343092, + "learning_rate": 6.873628440532175e-06, + "loss": 0.0834, + "step": 2222 + }, + { + "epoch": 1.59, + "grad_norm": 10.231322227289729, + "learning_rate": 6.8709485666873375e-06, + "loss": 0.104, + "step": 2223 + }, + { + "epoch": 1.59, + "grad_norm": 5.956539872126496, + "learning_rate": 6.868268067723808e-06, + "loss": 0.1002, + "step": 2224 + }, + { + "epoch": 1.59, + "grad_norm": 17.94119750705428, + "learning_rate": 6.86558694453719e-06, + "loss": 0.1049, + "step": 2225 + }, + { + "epoch": 1.59, + "grad_norm": 7.710067333196714, + "learning_rate": 6.8629051980233e-06, + "loss": 0.0728, + "step": 2226 + }, + { + "epoch": 1.59, + "grad_norm": 13.507965465766109, + "learning_rate": 6.860222829078156e-06, + "loss": 0.0928, + "step": 2227 + }, + { + "epoch": 1.59, + "grad_norm": 15.267717469976887, + "learning_rate": 6.857539838597987e-06, + "loss": 0.0722, + "step": 2228 + }, + { + "epoch": 1.59, + "grad_norm": 6.716234484479027, + "learning_rate": 6.8548562274792325e-06, + "loss": 0.0989, + "step": 2229 + }, + { + "epoch": 1.59, + "grad_norm": 32.705783237669955, + "learning_rate": 6.8521719966185355e-06, + "loss": 0.1067, + "step": 2230 + }, + { + "epoch": 1.59, + "grad_norm": 9.096469392127245, + "learning_rate": 6.8494871469127474e-06, + "loss": 0.1183, + "step": 2231 + }, + { + "epoch": 1.59, + "grad_norm": 7.547206226988792, + "learning_rate": 6.846801679258926e-06, + "loss": 0.1047, + "step": 2232 + }, + { + "epoch": 1.59, + "grad_norm": 13.029631509644526, + "learning_rate": 6.844115594554338e-06, + "loss": 0.0861, + "step": 2233 + }, + { + "epoch": 1.59, + "grad_norm": 43.40421796011947, + "learning_rate": 6.841428893696453e-06, + "loss": 0.1598, + "step": 2234 + }, + { + "epoch": 1.6, + "grad_norm": 18.30412375137895, + "learning_rate": 6.838741577582946e-06, + "loss": 0.1125, + "step": 2235 + }, + { + "epoch": 1.6, + "grad_norm": 4.815879130885394, + "learning_rate": 6.836053647111701e-06, + "loss": 0.0886, + "step": 2236 + }, + { + "epoch": 1.6, + "grad_norm": 20.612409133400302, + "learning_rate": 6.833365103180806e-06, + "loss": 0.1157, + "step": 2237 + }, + { + "epoch": 1.6, + "grad_norm": 23.966186458842202, + "learning_rate": 6.830675946688552e-06, + "loss": 0.0892, + "step": 2238 + }, + { + "epoch": 1.6, + "grad_norm": 7.940623113295723, + "learning_rate": 6.827986178533437e-06, + "loss": 0.1117, + "step": 2239 + }, + { + "epoch": 1.6, + "grad_norm": 7.44672935549102, + "learning_rate": 6.825295799614163e-06, + "loss": 0.1162, + "step": 2240 + }, + { + "epoch": 1.6, + "grad_norm": 14.891138354626657, + "learning_rate": 6.822604810829634e-06, + "loss": 0.0913, + "step": 2241 + }, + { + "epoch": 1.6, + "grad_norm": 17.653225783254584, + "learning_rate": 6.819913213078961e-06, + "loss": 0.0951, + "step": 2242 + }, + { + "epoch": 1.6, + "grad_norm": 5.9242045134536525, + "learning_rate": 6.817221007261456e-06, + "loss": 0.1039, + "step": 2243 + }, + { + "epoch": 1.6, + "grad_norm": 14.31038138115803, + "learning_rate": 6.814528194276636e-06, + "loss": 0.0687, + "step": 2244 + }, + { + "epoch": 1.6, + "grad_norm": 8.40265784360439, + "learning_rate": 6.811834775024219e-06, + "loss": 0.1125, + "step": 2245 + }, + { + "epoch": 1.6, + "grad_norm": 20.076086550456807, + "learning_rate": 6.809140750404127e-06, + "loss": 0.0917, + "step": 2246 + }, + { + "epoch": 1.6, + "grad_norm": 23.9239943476738, + "learning_rate": 6.8064461213164825e-06, + "loss": 0.1105, + "step": 2247 + }, + { + "epoch": 1.6, + "grad_norm": 7.421604483486793, + "learning_rate": 6.803750888661611e-06, + "loss": 0.0867, + "step": 2248 + }, + { + "epoch": 1.61, + "grad_norm": 26.261644106440436, + "learning_rate": 6.8010550533400425e-06, + "loss": 0.1093, + "step": 2249 + }, + { + "epoch": 1.61, + "grad_norm": 24.854024525208718, + "learning_rate": 6.798358616252503e-06, + "loss": 0.09, + "step": 2250 + }, + { + "epoch": 1.61, + "grad_norm": 28.96708219096894, + "learning_rate": 6.795661578299924e-06, + "loss": 0.0673, + "step": 2251 + }, + { + "epoch": 1.61, + "grad_norm": 11.997934589159817, + "learning_rate": 6.792963940383436e-06, + "loss": 0.132, + "step": 2252 + }, + { + "epoch": 1.61, + "grad_norm": 51.52301595812061, + "learning_rate": 6.790265703404368e-06, + "loss": 0.1309, + "step": 2253 + }, + { + "epoch": 1.61, + "grad_norm": 33.198458196121244, + "learning_rate": 6.787566868264253e-06, + "loss": 0.1149, + "step": 2254 + }, + { + "epoch": 1.61, + "grad_norm": 39.63747128767255, + "learning_rate": 6.7848674358648195e-06, + "loss": 0.1301, + "step": 2255 + }, + { + "epoch": 1.61, + "grad_norm": 16.270808913561332, + "learning_rate": 6.782167407108001e-06, + "loss": 0.1317, + "step": 2256 + }, + { + "epoch": 1.61, + "grad_norm": 32.72395746940256, + "learning_rate": 6.779466782895926e-06, + "loss": 0.087, + "step": 2257 + }, + { + "epoch": 1.61, + "grad_norm": 48.91482478473609, + "learning_rate": 6.7767655641309234e-06, + "loss": 0.1608, + "step": 2258 + }, + { + "epoch": 1.61, + "grad_norm": 34.08193473685804, + "learning_rate": 6.7740637517155205e-06, + "loss": 0.1154, + "step": 2259 + }, + { + "epoch": 1.61, + "grad_norm": 12.943831638411528, + "learning_rate": 6.771361346552445e-06, + "loss": 0.1069, + "step": 2260 + }, + { + "epoch": 1.61, + "grad_norm": 7.135693851021515, + "learning_rate": 6.7686583495446164e-06, + "loss": 0.0879, + "step": 2261 + }, + { + "epoch": 1.61, + "grad_norm": 37.11558243569602, + "learning_rate": 6.765954761595161e-06, + "loss": 0.1265, + "step": 2262 + }, + { + "epoch": 1.62, + "grad_norm": 51.426289905430444, + "learning_rate": 6.763250583607392e-06, + "loss": 0.1113, + "step": 2263 + }, + { + "epoch": 1.62, + "grad_norm": 24.538812354381417, + "learning_rate": 6.7605458164848316e-06, + "loss": 0.1005, + "step": 2264 + }, + { + "epoch": 1.62, + "grad_norm": 13.374751197386294, + "learning_rate": 6.75784046113119e-06, + "loss": 0.1232, + "step": 2265 + }, + { + "epoch": 1.62, + "grad_norm": 19.870883488045923, + "learning_rate": 6.755134518450377e-06, + "loss": 0.1033, + "step": 2266 + }, + { + "epoch": 1.62, + "grad_norm": 9.625372295659911, + "learning_rate": 6.752427989346497e-06, + "loss": 0.1268, + "step": 2267 + }, + { + "epoch": 1.62, + "grad_norm": 10.575802908582206, + "learning_rate": 6.749720874723854e-06, + "loss": 0.0843, + "step": 2268 + }, + { + "epoch": 1.62, + "grad_norm": 25.526322460359744, + "learning_rate": 6.747013175486944e-06, + "loss": 0.1187, + "step": 2269 + }, + { + "epoch": 1.62, + "grad_norm": 7.939467806031972, + "learning_rate": 6.74430489254046e-06, + "loss": 0.088, + "step": 2270 + }, + { + "epoch": 1.62, + "grad_norm": 4.729990582136349, + "learning_rate": 6.741596026789288e-06, + "loss": 0.1049, + "step": 2271 + }, + { + "epoch": 1.62, + "grad_norm": 6.467119491306653, + "learning_rate": 6.7388865791385124e-06, + "loss": 0.0942, + "step": 2272 + }, + { + "epoch": 1.62, + "grad_norm": 9.876980913769884, + "learning_rate": 6.736176550493411e-06, + "loss": 0.1053, + "step": 2273 + }, + { + "epoch": 1.62, + "grad_norm": 20.755840347838348, + "learning_rate": 6.7334659417594514e-06, + "loss": 0.116, + "step": 2274 + }, + { + "epoch": 1.62, + "grad_norm": 17.06569796812762, + "learning_rate": 6.730754753842303e-06, + "loss": 0.1023, + "step": 2275 + }, + { + "epoch": 1.62, + "grad_norm": 7.8737047418848904, + "learning_rate": 6.728042987647818e-06, + "loss": 0.0779, + "step": 2276 + }, + { + "epoch": 1.63, + "grad_norm": 31.73870174511549, + "learning_rate": 6.725330644082054e-06, + "loss": 0.1567, + "step": 2277 + }, + { + "epoch": 1.63, + "grad_norm": 27.556880854348176, + "learning_rate": 6.7226177240512516e-06, + "loss": 0.085, + "step": 2278 + }, + { + "epoch": 1.63, + "grad_norm": 11.788183835441814, + "learning_rate": 6.7199042284618484e-06, + "loss": 0.093, + "step": 2279 + }, + { + "epoch": 1.63, + "grad_norm": 31.31737563867473, + "learning_rate": 6.717190158220475e-06, + "loss": 0.1143, + "step": 2280 + }, + { + "epoch": 1.63, + "grad_norm": 5.564357936167133, + "learning_rate": 6.714475514233951e-06, + "loss": 0.0759, + "step": 2281 + }, + { + "epoch": 1.63, + "grad_norm": 46.40291596398642, + "learning_rate": 6.71176029740929e-06, + "loss": 0.1226, + "step": 2282 + }, + { + "epoch": 1.63, + "grad_norm": 24.154317765086333, + "learning_rate": 6.709044508653697e-06, + "loss": 0.1367, + "step": 2283 + }, + { + "epoch": 1.63, + "grad_norm": 6.925533803113575, + "learning_rate": 6.706328148874568e-06, + "loss": 0.1111, + "step": 2284 + }, + { + "epoch": 1.63, + "grad_norm": 17.52523515570555, + "learning_rate": 6.703611218979488e-06, + "loss": 0.113, + "step": 2285 + }, + { + "epoch": 1.63, + "grad_norm": 32.63171057290881, + "learning_rate": 6.700893719876234e-06, + "loss": 0.1052, + "step": 2286 + }, + { + "epoch": 1.63, + "grad_norm": 35.04462192513845, + "learning_rate": 6.698175652472774e-06, + "loss": 0.0858, + "step": 2287 + }, + { + "epoch": 1.63, + "grad_norm": 18.72923605107616, + "learning_rate": 6.695457017677263e-06, + "loss": 0.1011, + "step": 2288 + }, + { + "epoch": 1.63, + "grad_norm": 14.768416928691146, + "learning_rate": 6.692737816398048e-06, + "loss": 0.1317, + "step": 2289 + }, + { + "epoch": 1.63, + "grad_norm": 21.649718208972313, + "learning_rate": 6.6900180495436664e-06, + "loss": 0.1016, + "step": 2290 + }, + { + "epoch": 1.64, + "grad_norm": 16.92349184199488, + "learning_rate": 6.68729771802284e-06, + "loss": 0.1027, + "step": 2291 + }, + { + "epoch": 1.64, + "grad_norm": 18.73784013126941, + "learning_rate": 6.6845768227444855e-06, + "loss": 0.0793, + "step": 2292 + }, + { + "epoch": 1.64, + "grad_norm": 10.89593946985034, + "learning_rate": 6.681855364617702e-06, + "loss": 0.0908, + "step": 2293 + }, + { + "epoch": 1.64, + "grad_norm": 12.205255707572324, + "learning_rate": 6.67913334455178e-06, + "loss": 0.0995, + "step": 2294 + }, + { + "epoch": 1.64, + "grad_norm": 16.757751110332908, + "learning_rate": 6.676410763456197e-06, + "loss": 0.1001, + "step": 2295 + }, + { + "epoch": 1.64, + "grad_norm": 7.269571127624379, + "learning_rate": 6.673687622240619e-06, + "loss": 0.088, + "step": 2296 + }, + { + "epoch": 1.64, + "grad_norm": 9.609560802494752, + "learning_rate": 6.670963921814896e-06, + "loss": 0.1106, + "step": 2297 + }, + { + "epoch": 1.64, + "grad_norm": 8.246135299481141, + "learning_rate": 6.668239663089069e-06, + "loss": 0.1433, + "step": 2298 + }, + { + "epoch": 1.64, + "grad_norm": 18.24562720984829, + "learning_rate": 6.665514846973361e-06, + "loss": 0.0901, + "step": 2299 + }, + { + "epoch": 1.64, + "grad_norm": 13.37161564678813, + "learning_rate": 6.662789474378186e-06, + "loss": 0.1213, + "step": 2300 + }, + { + "epoch": 1.64, + "grad_norm": 11.844017421917352, + "learning_rate": 6.6600635462141415e-06, + "loss": 0.0905, + "step": 2301 + }, + { + "epoch": 1.64, + "grad_norm": 13.504340961380517, + "learning_rate": 6.657337063392011e-06, + "loss": 0.1224, + "step": 2302 + }, + { + "epoch": 1.64, + "grad_norm": 5.623690274333707, + "learning_rate": 6.654610026822761e-06, + "loss": 0.0778, + "step": 2303 + }, + { + "epoch": 1.64, + "grad_norm": 10.443915001651478, + "learning_rate": 6.651882437417546e-06, + "loss": 0.1146, + "step": 2304 + }, + { + "epoch": 1.65, + "grad_norm": 8.261849012510766, + "learning_rate": 6.649154296087705e-06, + "loss": 0.1248, + "step": 2305 + }, + { + "epoch": 1.65, + "grad_norm": 9.712674765149288, + "learning_rate": 6.646425603744759e-06, + "loss": 0.1034, + "step": 2306 + }, + { + "epoch": 1.65, + "grad_norm": 8.076830070701273, + "learning_rate": 6.643696361300418e-06, + "loss": 0.1305, + "step": 2307 + }, + { + "epoch": 1.65, + "grad_norm": 16.23453792461202, + "learning_rate": 6.6409665696665715e-06, + "loss": 0.0792, + "step": 2308 + }, + { + "epoch": 1.65, + "grad_norm": 6.928315712867063, + "learning_rate": 6.638236229755292e-06, + "loss": 0.1429, + "step": 2309 + }, + { + "epoch": 1.65, + "grad_norm": 5.653617940327973, + "learning_rate": 6.635505342478838e-06, + "loss": 0.0847, + "step": 2310 + }, + { + "epoch": 1.65, + "grad_norm": 9.207528691085546, + "learning_rate": 6.632773908749649e-06, + "loss": 0.0923, + "step": 2311 + }, + { + "epoch": 1.65, + "grad_norm": 26.119937550135738, + "learning_rate": 6.630041929480349e-06, + "loss": 0.1027, + "step": 2312 + }, + { + "epoch": 1.65, + "grad_norm": 24.180449478429487, + "learning_rate": 6.627309405583741e-06, + "loss": 0.1044, + "step": 2313 + }, + { + "epoch": 1.65, + "grad_norm": 20.448643687176528, + "learning_rate": 6.624576337972815e-06, + "loss": 0.0922, + "step": 2314 + }, + { + "epoch": 1.65, + "grad_norm": 11.591680766177754, + "learning_rate": 6.621842727560737e-06, + "loss": 0.0912, + "step": 2315 + }, + { + "epoch": 1.65, + "grad_norm": 4.828986274241329, + "learning_rate": 6.6191085752608575e-06, + "loss": 0.0772, + "step": 2316 + }, + { + "epoch": 1.65, + "grad_norm": 9.954083488247607, + "learning_rate": 6.616373881986708e-06, + "loss": 0.1047, + "step": 2317 + }, + { + "epoch": 1.65, + "grad_norm": 5.6263248883934125, + "learning_rate": 6.613638648652002e-06, + "loss": 0.0961, + "step": 2318 + }, + { + "epoch": 1.66, + "grad_norm": 5.087116155084636, + "learning_rate": 6.610902876170631e-06, + "loss": 0.0953, + "step": 2319 + }, + { + "epoch": 1.66, + "grad_norm": 18.559528074367204, + "learning_rate": 6.608166565456666e-06, + "loss": 0.1322, + "step": 2320 + }, + { + "epoch": 1.66, + "grad_norm": 36.09006097390221, + "learning_rate": 6.605429717424359e-06, + "loss": 0.0972, + "step": 2321 + }, + { + "epoch": 1.66, + "grad_norm": 5.741647863961911, + "learning_rate": 6.602692332988143e-06, + "loss": 0.0908, + "step": 2322 + }, + { + "epoch": 1.66, + "grad_norm": 30.835808916611533, + "learning_rate": 6.5999544130626305e-06, + "loss": 0.0942, + "step": 2323 + }, + { + "epoch": 1.66, + "grad_norm": 14.628025775072933, + "learning_rate": 6.597215958562608e-06, + "loss": 0.1154, + "step": 2324 + }, + { + "epoch": 1.66, + "grad_norm": 19.30215107454651, + "learning_rate": 6.5944769704030465e-06, + "loss": 0.0925, + "step": 2325 + }, + { + "epoch": 1.66, + "grad_norm": 16.28763762345787, + "learning_rate": 6.591737449499092e-06, + "loss": 0.1129, + "step": 2326 + }, + { + "epoch": 1.66, + "grad_norm": 8.830045219544676, + "learning_rate": 6.58899739676607e-06, + "loss": 0.0909, + "step": 2327 + }, + { + "epoch": 1.66, + "grad_norm": 8.33628287139074, + "learning_rate": 6.586256813119482e-06, + "loss": 0.0975, + "step": 2328 + }, + { + "epoch": 1.66, + "grad_norm": 16.449451637998475, + "learning_rate": 6.583515699475009e-06, + "loss": 0.129, + "step": 2329 + }, + { + "epoch": 1.66, + "grad_norm": 6.413192128563511, + "learning_rate": 6.580774056748508e-06, + "loss": 0.0936, + "step": 2330 + }, + { + "epoch": 1.66, + "grad_norm": 24.55379335050217, + "learning_rate": 6.578031885856011e-06, + "loss": 0.0896, + "step": 2331 + }, + { + "epoch": 1.66, + "grad_norm": 16.570585530302036, + "learning_rate": 6.575289187713731e-06, + "loss": 0.0884, + "step": 2332 + }, + { + "epoch": 1.67, + "grad_norm": 16.171822796583307, + "learning_rate": 6.572545963238053e-06, + "loss": 0.0837, + "step": 2333 + }, + { + "epoch": 1.67, + "grad_norm": 7.644151539511672, + "learning_rate": 6.569802213345537e-06, + "loss": 0.1268, + "step": 2334 + }, + { + "epoch": 1.67, + "grad_norm": 14.08472143552305, + "learning_rate": 6.5670579389529255e-06, + "loss": 0.0915, + "step": 2335 + }, + { + "epoch": 1.67, + "grad_norm": 10.674980055295068, + "learning_rate": 6.56431314097713e-06, + "loss": 0.0601, + "step": 2336 + }, + { + "epoch": 1.67, + "grad_norm": 20.618948771998678, + "learning_rate": 6.561567820335236e-06, + "loss": 0.0776, + "step": 2337 + }, + { + "epoch": 1.67, + "grad_norm": 7.837958868858976, + "learning_rate": 6.558821977944508e-06, + "loss": 0.0777, + "step": 2338 + }, + { + "epoch": 1.67, + "grad_norm": 10.081314355572308, + "learning_rate": 6.556075614722383e-06, + "loss": 0.097, + "step": 2339 + }, + { + "epoch": 1.67, + "grad_norm": 12.979423171549898, + "learning_rate": 6.553328731586473e-06, + "loss": 0.1097, + "step": 2340 + }, + { + "epoch": 1.67, + "grad_norm": 31.475378687444337, + "learning_rate": 6.550581329454561e-06, + "loss": 0.1274, + "step": 2341 + }, + { + "epoch": 1.67, + "grad_norm": 19.29402235295992, + "learning_rate": 6.547833409244606e-06, + "loss": 0.0825, + "step": 2342 + }, + { + "epoch": 1.67, + "grad_norm": 17.39491613562142, + "learning_rate": 6.545084971874738e-06, + "loss": 0.1018, + "step": 2343 + }, + { + "epoch": 1.67, + "grad_norm": 6.751666154792371, + "learning_rate": 6.542336018263262e-06, + "loss": 0.0856, + "step": 2344 + }, + { + "epoch": 1.67, + "grad_norm": 28.598101139931043, + "learning_rate": 6.539586549328656e-06, + "loss": 0.1866, + "step": 2345 + }, + { + "epoch": 1.67, + "grad_norm": 9.960017160676509, + "learning_rate": 6.536836565989565e-06, + "loss": 0.0786, + "step": 2346 + }, + { + "epoch": 1.68, + "grad_norm": 12.28562563434325, + "learning_rate": 6.534086069164813e-06, + "loss": 0.1039, + "step": 2347 + }, + { + "epoch": 1.68, + "grad_norm": 11.822129310031302, + "learning_rate": 6.531335059773392e-06, + "loss": 0.0911, + "step": 2348 + }, + { + "epoch": 1.68, + "grad_norm": 33.14817099069893, + "learning_rate": 6.528583538734463e-06, + "loss": 0.115, + "step": 2349 + }, + { + "epoch": 1.68, + "grad_norm": 9.926580522705304, + "learning_rate": 6.525831506967361e-06, + "loss": 0.1115, + "step": 2350 + }, + { + "epoch": 1.68, + "grad_norm": 25.639367084236095, + "learning_rate": 6.523078965391592e-06, + "loss": 0.1034, + "step": 2351 + }, + { + "epoch": 1.68, + "grad_norm": 6.074092820606506, + "learning_rate": 6.520325914926831e-06, + "loss": 0.0667, + "step": 2352 + }, + { + "epoch": 1.68, + "grad_norm": 16.162347392614848, + "learning_rate": 6.517572356492922e-06, + "loss": 0.0854, + "step": 2353 + }, + { + "epoch": 1.68, + "grad_norm": 4.991411546020857, + "learning_rate": 6.514818291009881e-06, + "loss": 0.1069, + "step": 2354 + }, + { + "epoch": 1.68, + "grad_norm": 15.6323688780031, + "learning_rate": 6.512063719397894e-06, + "loss": 0.0802, + "step": 2355 + }, + { + "epoch": 1.68, + "grad_norm": 8.816757264191924, + "learning_rate": 6.5093086425773126e-06, + "loss": 0.1061, + "step": 2356 + }, + { + "epoch": 1.68, + "grad_norm": 11.042693972596268, + "learning_rate": 6.506553061468659e-06, + "loss": 0.0801, + "step": 2357 + }, + { + "epoch": 1.68, + "grad_norm": 7.60615774529474, + "learning_rate": 6.5037969769926256e-06, + "loss": 0.1073, + "step": 2358 + }, + { + "epoch": 1.68, + "grad_norm": 7.938469871759891, + "learning_rate": 6.501040390070071e-06, + "loss": 0.1047, + "step": 2359 + }, + { + "epoch": 1.68, + "grad_norm": 8.407806278240889, + "learning_rate": 6.498283301622022e-06, + "loss": 0.0829, + "step": 2360 + }, + { + "epoch": 1.69, + "grad_norm": 21.855833457945497, + "learning_rate": 6.495525712569673e-06, + "loss": 0.0896, + "step": 2361 + }, + { + "epoch": 1.69, + "grad_norm": 24.55266138680911, + "learning_rate": 6.492767623834385e-06, + "loss": 0.0958, + "step": 2362 + }, + { + "epoch": 1.69, + "grad_norm": 7.844762029942168, + "learning_rate": 6.490009036337687e-06, + "loss": 0.1097, + "step": 2363 + }, + { + "epoch": 1.69, + "grad_norm": 14.157061696422106, + "learning_rate": 6.487249951001276e-06, + "loss": 0.0968, + "step": 2364 + }, + { + "epoch": 1.69, + "grad_norm": 23.749244203785135, + "learning_rate": 6.484490368747012e-06, + "loss": 0.1128, + "step": 2365 + }, + { + "epoch": 1.69, + "grad_norm": 6.384281292680775, + "learning_rate": 6.4817302904969226e-06, + "loss": 0.1133, + "step": 2366 + }, + { + "epoch": 1.69, + "grad_norm": 12.27197718764886, + "learning_rate": 6.4789697171732024e-06, + "loss": 0.1234, + "step": 2367 + }, + { + "epoch": 1.69, + "grad_norm": 6.1705409111581115, + "learning_rate": 6.476208649698209e-06, + "loss": 0.1206, + "step": 2368 + }, + { + "epoch": 1.69, + "grad_norm": 9.676461914963904, + "learning_rate": 6.473447088994467e-06, + "loss": 0.0778, + "step": 2369 + }, + { + "epoch": 1.69, + "grad_norm": 26.76018722096519, + "learning_rate": 6.470685035984667e-06, + "loss": 0.1274, + "step": 2370 + }, + { + "epoch": 1.69, + "grad_norm": 12.24487360044837, + "learning_rate": 6.467922491591658e-06, + "loss": 0.0955, + "step": 2371 + }, + { + "epoch": 1.69, + "grad_norm": 27.638340021369807, + "learning_rate": 6.465159456738461e-06, + "loss": 0.0926, + "step": 2372 + }, + { + "epoch": 1.69, + "grad_norm": 4.785845976111559, + "learning_rate": 6.462395932348257e-06, + "loss": 0.0978, + "step": 2373 + }, + { + "epoch": 1.69, + "grad_norm": 8.312844920909349, + "learning_rate": 6.459631919344389e-06, + "loss": 0.067, + "step": 2374 + }, + { + "epoch": 1.7, + "grad_norm": 9.685594864488179, + "learning_rate": 6.456867418650366e-06, + "loss": 0.0913, + "step": 2375 + }, + { + "epoch": 1.7, + "grad_norm": 9.9810523595601, + "learning_rate": 6.454102431189859e-06, + "loss": 0.1007, + "step": 2376 + }, + { + "epoch": 1.7, + "grad_norm": 6.265965225581489, + "learning_rate": 6.4513369578867026e-06, + "loss": 0.104, + "step": 2377 + }, + { + "epoch": 1.7, + "grad_norm": 16.893745136728803, + "learning_rate": 6.448570999664894e-06, + "loss": 0.1005, + "step": 2378 + }, + { + "epoch": 1.7, + "grad_norm": 28.908957709905614, + "learning_rate": 6.4458045574485875e-06, + "loss": 0.1255, + "step": 2379 + }, + { + "epoch": 1.7, + "grad_norm": 24.22139522152318, + "learning_rate": 6.443037632162104e-06, + "loss": 0.0996, + "step": 2380 + }, + { + "epoch": 1.7, + "grad_norm": 26.572515462610134, + "learning_rate": 6.440270224729927e-06, + "loss": 0.0901, + "step": 2381 + }, + { + "epoch": 1.7, + "grad_norm": 12.207827481160598, + "learning_rate": 6.437502336076695e-06, + "loss": 0.1221, + "step": 2382 + }, + { + "epoch": 1.7, + "grad_norm": 19.41999789803127, + "learning_rate": 6.4347339671272155e-06, + "loss": 0.0783, + "step": 2383 + }, + { + "epoch": 1.7, + "grad_norm": 16.873084876197144, + "learning_rate": 6.431965118806449e-06, + "loss": 0.094, + "step": 2384 + }, + { + "epoch": 1.7, + "grad_norm": 16.372299469008183, + "learning_rate": 6.42919579203952e-06, + "loss": 0.1016, + "step": 2385 + }, + { + "epoch": 1.7, + "grad_norm": 27.497086153318044, + "learning_rate": 6.4264259877517124e-06, + "loss": 0.1262, + "step": 2386 + }, + { + "epoch": 1.7, + "grad_norm": 24.053964065962585, + "learning_rate": 6.423655706868468e-06, + "loss": 0.0745, + "step": 2387 + }, + { + "epoch": 1.7, + "grad_norm": 22.25200523393893, + "learning_rate": 6.4208849503153915e-06, + "loss": 0.0837, + "step": 2388 + }, + { + "epoch": 1.71, + "grad_norm": 11.764255564259074, + "learning_rate": 6.418113719018242e-06, + "loss": 0.1034, + "step": 2389 + }, + { + "epoch": 1.71, + "grad_norm": 30.75721694150146, + "learning_rate": 6.415342013902939e-06, + "loss": 0.1151, + "step": 2390 + }, + { + "epoch": 1.71, + "grad_norm": 5.427156665392917, + "learning_rate": 6.412569835895562e-06, + "loss": 0.0956, + "step": 2391 + }, + { + "epoch": 1.71, + "grad_norm": 18.046997378438945, + "learning_rate": 6.409797185922349e-06, + "loss": 0.1169, + "step": 2392 + }, + { + "epoch": 1.71, + "grad_norm": 7.538487314232114, + "learning_rate": 6.40702406490969e-06, + "loss": 0.1074, + "step": 2393 + }, + { + "epoch": 1.71, + "grad_norm": 14.150795863197285, + "learning_rate": 6.404250473784138e-06, + "loss": 0.0994, + "step": 2394 + }, + { + "epoch": 1.71, + "grad_norm": 24.75329036726323, + "learning_rate": 6.401476413472404e-06, + "loss": 0.1069, + "step": 2395 + }, + { + "epoch": 1.71, + "grad_norm": 12.43088365316308, + "learning_rate": 6.398701884901348e-06, + "loss": 0.1107, + "step": 2396 + }, + { + "epoch": 1.71, + "grad_norm": 8.74404078917541, + "learning_rate": 6.3959268889979956e-06, + "loss": 0.0984, + "step": 2397 + }, + { + "epoch": 1.71, + "grad_norm": 9.577324694920657, + "learning_rate": 6.393151426689522e-06, + "loss": 0.1191, + "step": 2398 + }, + { + "epoch": 1.71, + "grad_norm": 13.63480133602996, + "learning_rate": 6.390375498903263e-06, + "loss": 0.0992, + "step": 2399 + }, + { + "epoch": 1.71, + "grad_norm": 14.152036274866392, + "learning_rate": 6.387599106566705e-06, + "loss": 0.0937, + "step": 2400 + }, + { + "epoch": 1.71, + "grad_norm": 6.121175600708925, + "learning_rate": 6.384822250607495e-06, + "loss": 0.1039, + "step": 2401 + }, + { + "epoch": 1.71, + "grad_norm": 6.47698057810036, + "learning_rate": 6.382044931953431e-06, + "loss": 0.0828, + "step": 2402 + }, + { + "epoch": 1.72, + "grad_norm": 10.08821825459922, + "learning_rate": 6.379267151532467e-06, + "loss": 0.135, + "step": 2403 + }, + { + "epoch": 1.72, + "grad_norm": 3.9574752880861226, + "learning_rate": 6.376488910272709e-06, + "loss": 0.0742, + "step": 2404 + }, + { + "epoch": 1.72, + "grad_norm": 5.2069319299192065, + "learning_rate": 6.373710209102423e-06, + "loss": 0.1099, + "step": 2405 + }, + { + "epoch": 1.72, + "grad_norm": 26.51909679991199, + "learning_rate": 6.370931048950022e-06, + "loss": 0.0972, + "step": 2406 + }, + { + "epoch": 1.72, + "grad_norm": 18.736652018749133, + "learning_rate": 6.368151430744075e-06, + "loss": 0.1042, + "step": 2407 + }, + { + "epoch": 1.72, + "grad_norm": 15.498335426749598, + "learning_rate": 6.365371355413306e-06, + "loss": 0.1053, + "step": 2408 + }, + { + "epoch": 1.72, + "grad_norm": 35.0436335302218, + "learning_rate": 6.362590823886588e-06, + "loss": 0.094, + "step": 2409 + }, + { + "epoch": 1.72, + "grad_norm": 35.01179148908803, + "learning_rate": 6.359809837092947e-06, + "loss": 0.1014, + "step": 2410 + }, + { + "epoch": 1.72, + "grad_norm": 8.194118078049293, + "learning_rate": 6.357028395961566e-06, + "loss": 0.0864, + "step": 2411 + }, + { + "epoch": 1.72, + "grad_norm": 6.697601824559924, + "learning_rate": 6.354246501421777e-06, + "loss": 0.12, + "step": 2412 + }, + { + "epoch": 1.72, + "grad_norm": 15.721750841959683, + "learning_rate": 6.3514641544030575e-06, + "loss": 0.1021, + "step": 2413 + }, + { + "epoch": 1.72, + "grad_norm": 23.028362842896957, + "learning_rate": 6.348681355835043e-06, + "loss": 0.106, + "step": 2414 + }, + { + "epoch": 1.72, + "grad_norm": 23.07284177900744, + "learning_rate": 6.345898106647521e-06, + "loss": 0.1036, + "step": 2415 + }, + { + "epoch": 1.72, + "grad_norm": 14.759239504580846, + "learning_rate": 6.3431144077704245e-06, + "loss": 0.1005, + "step": 2416 + }, + { + "epoch": 1.73, + "grad_norm": 7.849400235997493, + "learning_rate": 6.340330260133839e-06, + "loss": 0.1296, + "step": 2417 + }, + { + "epoch": 1.73, + "grad_norm": 13.264559348008135, + "learning_rate": 6.337545664668001e-06, + "loss": 0.1018, + "step": 2418 + }, + { + "epoch": 1.73, + "grad_norm": 54.345532921290065, + "learning_rate": 6.334760622303294e-06, + "loss": 0.1444, + "step": 2419 + }, + { + "epoch": 1.73, + "grad_norm": 11.648785289643577, + "learning_rate": 6.331975133970255e-06, + "loss": 0.0713, + "step": 2420 + }, + { + "epoch": 1.73, + "grad_norm": 20.282763003405808, + "learning_rate": 6.329189200599566e-06, + "loss": 0.0861, + "step": 2421 + }, + { + "epoch": 1.73, + "grad_norm": 19.267431318750052, + "learning_rate": 6.326402823122059e-06, + "loss": 0.061, + "step": 2422 + }, + { + "epoch": 1.73, + "grad_norm": 16.487214621707068, + "learning_rate": 6.3236160024687134e-06, + "loss": 0.0946, + "step": 2423 + }, + { + "epoch": 1.73, + "grad_norm": 8.987316243770564, + "learning_rate": 6.3208287395706595e-06, + "loss": 0.1047, + "step": 2424 + }, + { + "epoch": 1.73, + "grad_norm": 23.44498703021866, + "learning_rate": 6.3180410353591735e-06, + "loss": 0.1006, + "step": 2425 + }, + { + "epoch": 1.73, + "grad_norm": 11.465144785371189, + "learning_rate": 6.315252890765678e-06, + "loss": 0.0963, + "step": 2426 + }, + { + "epoch": 1.73, + "grad_norm": 5.166961299823076, + "learning_rate": 6.312464306721745e-06, + "loss": 0.0905, + "step": 2427 + }, + { + "epoch": 1.73, + "grad_norm": 9.634991916602496, + "learning_rate": 6.309675284159093e-06, + "loss": 0.0927, + "step": 2428 + }, + { + "epoch": 1.73, + "grad_norm": 11.247409453302282, + "learning_rate": 6.306885824009585e-06, + "loss": 0.0801, + "step": 2429 + }, + { + "epoch": 1.73, + "grad_norm": 6.056553339693292, + "learning_rate": 6.3040959272052315e-06, + "loss": 0.0787, + "step": 2430 + }, + { + "epoch": 1.74, + "grad_norm": 13.238200555357215, + "learning_rate": 6.301305594678189e-06, + "loss": 0.0916, + "step": 2431 + }, + { + "epoch": 1.74, + "grad_norm": 5.245856258170428, + "learning_rate": 6.2985148273607586e-06, + "loss": 0.0818, + "step": 2432 + }, + { + "epoch": 1.74, + "grad_norm": 16.58705389948258, + "learning_rate": 6.29572362618539e-06, + "loss": 0.0864, + "step": 2433 + }, + { + "epoch": 1.74, + "grad_norm": 24.4358198678026, + "learning_rate": 6.292931992084672e-06, + "loss": 0.1365, + "step": 2434 + }, + { + "epoch": 1.74, + "grad_norm": 14.195560279980855, + "learning_rate": 6.290139925991345e-06, + "loss": 0.1036, + "step": 2435 + }, + { + "epoch": 1.74, + "grad_norm": 14.566630680835873, + "learning_rate": 6.287347428838289e-06, + "loss": 0.067, + "step": 2436 + }, + { + "epoch": 1.74, + "grad_norm": 10.846086245631618, + "learning_rate": 6.2845545015585275e-06, + "loss": 0.126, + "step": 2437 + }, + { + "epoch": 1.74, + "grad_norm": 6.710131680304093, + "learning_rate": 6.281761145085232e-06, + "loss": 0.0868, + "step": 2438 + }, + { + "epoch": 1.74, + "grad_norm": 9.77685196724837, + "learning_rate": 6.278967360351712e-06, + "loss": 0.0619, + "step": 2439 + }, + { + "epoch": 1.74, + "grad_norm": 22.26451062145235, + "learning_rate": 6.276173148291425e-06, + "loss": 0.1013, + "step": 2440 + }, + { + "epoch": 1.74, + "grad_norm": 4.273890399178918, + "learning_rate": 6.273378509837969e-06, + "loss": 0.0831, + "step": 2441 + }, + { + "epoch": 1.74, + "grad_norm": 22.997496293271507, + "learning_rate": 6.2705834459250825e-06, + "loss": 0.1004, + "step": 2442 + }, + { + "epoch": 1.74, + "grad_norm": 11.57569324607047, + "learning_rate": 6.2677879574866515e-06, + "loss": 0.0858, + "step": 2443 + }, + { + "epoch": 1.74, + "grad_norm": 33.78323186801771, + "learning_rate": 6.264992045456699e-06, + "loss": 0.111, + "step": 2444 + }, + { + "epoch": 1.75, + "grad_norm": 10.134971347540446, + "learning_rate": 6.262195710769391e-06, + "loss": 0.089, + "step": 2445 + }, + { + "epoch": 1.75, + "grad_norm": 26.861775030048683, + "learning_rate": 6.259398954359037e-06, + "loss": 0.1116, + "step": 2446 + }, + { + "epoch": 1.75, + "grad_norm": 7.188038510887187, + "learning_rate": 6.256601777160082e-06, + "loss": 0.1488, + "step": 2447 + }, + { + "epoch": 1.75, + "grad_norm": 23.487091229583946, + "learning_rate": 6.253804180107116e-06, + "loss": 0.1021, + "step": 2448 + }, + { + "epoch": 1.75, + "grad_norm": 12.589395392272444, + "learning_rate": 6.2510061641348695e-06, + "loss": 0.1018, + "step": 2449 + }, + { + "epoch": 1.75, + "grad_norm": 13.31139980832625, + "learning_rate": 6.248207730178211e-06, + "loss": 0.0819, + "step": 2450 + }, + { + "epoch": 1.75, + "grad_norm": 12.211913937617608, + "learning_rate": 6.245408879172148e-06, + "loss": 0.1106, + "step": 2451 + }, + { + "epoch": 1.75, + "grad_norm": 5.641616344245171, + "learning_rate": 6.24260961205183e-06, + "loss": 0.0833, + "step": 2452 + }, + { + "epoch": 1.75, + "grad_norm": 21.256785415127464, + "learning_rate": 6.239809929752544e-06, + "loss": 0.0796, + "step": 2453 + }, + { + "epoch": 1.75, + "grad_norm": 10.627318442838185, + "learning_rate": 6.237009833209715e-06, + "loss": 0.1066, + "step": 2454 + }, + { + "epoch": 1.75, + "grad_norm": 12.179171841920033, + "learning_rate": 6.2342093233589095e-06, + "loss": 0.1462, + "step": 2455 + }, + { + "epoch": 1.75, + "grad_norm": 14.930949343627839, + "learning_rate": 6.231408401135828e-06, + "loss": 0.1081, + "step": 2456 + }, + { + "epoch": 1.75, + "grad_norm": 27.796422103824614, + "learning_rate": 6.228607067476311e-06, + "loss": 0.1116, + "step": 2457 + }, + { + "epoch": 1.75, + "grad_norm": 19.282120894394577, + "learning_rate": 6.225805323316336e-06, + "loss": 0.1353, + "step": 2458 + }, + { + "epoch": 1.76, + "grad_norm": 24.664287072388348, + "learning_rate": 6.223003169592018e-06, + "loss": 0.1069, + "step": 2459 + }, + { + "epoch": 1.76, + "grad_norm": 11.014341949865857, + "learning_rate": 6.220200607239609e-06, + "loss": 0.1069, + "step": 2460 + }, + { + "epoch": 1.76, + "grad_norm": 9.682585859806537, + "learning_rate": 6.217397637195497e-06, + "loss": 0.1176, + "step": 2461 + }, + { + "epoch": 1.76, + "grad_norm": 5.254486314903181, + "learning_rate": 6.214594260396206e-06, + "loss": 0.078, + "step": 2462 + }, + { + "epoch": 1.76, + "grad_norm": 6.715958544218641, + "learning_rate": 6.211790477778399e-06, + "loss": 0.0914, + "step": 2463 + }, + { + "epoch": 1.76, + "grad_norm": 9.16222865335177, + "learning_rate": 6.208986290278866e-06, + "loss": 0.0664, + "step": 2464 + }, + { + "epoch": 1.76, + "grad_norm": 17.220240072763353, + "learning_rate": 6.206181698834544e-06, + "loss": 0.1018, + "step": 2465 + }, + { + "epoch": 1.76, + "grad_norm": 16.278402617080406, + "learning_rate": 6.2033767043824955e-06, + "loss": 0.0988, + "step": 2466 + }, + { + "epoch": 1.76, + "grad_norm": 16.789027698060593, + "learning_rate": 6.200571307859923e-06, + "loss": 0.1199, + "step": 2467 + }, + { + "epoch": 1.76, + "grad_norm": 16.858190409234904, + "learning_rate": 6.197765510204161e-06, + "loss": 0.1025, + "step": 2468 + }, + { + "epoch": 1.76, + "grad_norm": 20.311780934271166, + "learning_rate": 6.19495931235268e-06, + "loss": 0.0986, + "step": 2469 + }, + { + "epoch": 1.76, + "grad_norm": 8.24075864852246, + "learning_rate": 6.19215271524308e-06, + "loss": 0.0897, + "step": 2470 + }, + { + "epoch": 1.76, + "grad_norm": 7.537619256321832, + "learning_rate": 6.189345719813099e-06, + "loss": 0.1053, + "step": 2471 + }, + { + "epoch": 1.76, + "grad_norm": 19.985538588188874, + "learning_rate": 6.186538327000609e-06, + "loss": 0.0933, + "step": 2472 + }, + { + "epoch": 1.77, + "grad_norm": 10.951691550287652, + "learning_rate": 6.183730537743607e-06, + "loss": 0.0876, + "step": 2473 + }, + { + "epoch": 1.77, + "grad_norm": 14.796383785272676, + "learning_rate": 6.18092235298023e-06, + "loss": 0.1133, + "step": 2474 + }, + { + "epoch": 1.77, + "grad_norm": 8.222463592646822, + "learning_rate": 6.178113773648745e-06, + "loss": 0.0845, + "step": 2475 + }, + { + "epoch": 1.77, + "grad_norm": 9.583268825766998, + "learning_rate": 6.175304800687551e-06, + "loss": 0.0778, + "step": 2476 + }, + { + "epoch": 1.77, + "grad_norm": 6.546628402576362, + "learning_rate": 6.172495435035176e-06, + "loss": 0.1199, + "step": 2477 + }, + { + "epoch": 1.77, + "grad_norm": 6.515726741394918, + "learning_rate": 6.169685677630284e-06, + "loss": 0.0916, + "step": 2478 + }, + { + "epoch": 1.77, + "grad_norm": 6.737944386905074, + "learning_rate": 6.1668755294116655e-06, + "loss": 0.0611, + "step": 2479 + }, + { + "epoch": 1.77, + "grad_norm": 9.071570166337342, + "learning_rate": 6.1640649913182436e-06, + "loss": 0.0988, + "step": 2480 + }, + { + "epoch": 1.77, + "grad_norm": 5.529604937790912, + "learning_rate": 6.161254064289072e-06, + "loss": 0.084, + "step": 2481 + }, + { + "epoch": 1.77, + "grad_norm": 13.061035895196843, + "learning_rate": 6.158442749263332e-06, + "loss": 0.0971, + "step": 2482 + }, + { + "epoch": 1.77, + "grad_norm": 17.682527665030868, + "learning_rate": 6.155631047180337e-06, + "loss": 0.0814, + "step": 2483 + }, + { + "epoch": 1.77, + "grad_norm": 14.400108863552052, + "learning_rate": 6.152818958979529e-06, + "loss": 0.0972, + "step": 2484 + }, + { + "epoch": 1.77, + "grad_norm": 4.875457146337422, + "learning_rate": 6.1500064856004796e-06, + "loss": 0.0699, + "step": 2485 + }, + { + "epoch": 1.77, + "grad_norm": 15.446178764057269, + "learning_rate": 6.147193627982887e-06, + "loss": 0.099, + "step": 2486 + }, + { + "epoch": 1.78, + "grad_norm": 7.245784964129334, + "learning_rate": 6.144380387066581e-06, + "loss": 0.1026, + "step": 2487 + }, + { + "epoch": 1.78, + "grad_norm": 24.36888800556823, + "learning_rate": 6.141566763791518e-06, + "loss": 0.1219, + "step": 2488 + }, + { + "epoch": 1.78, + "grad_norm": 11.458177901064403, + "learning_rate": 6.138752759097778e-06, + "loss": 0.0728, + "step": 2489 + }, + { + "epoch": 1.78, + "grad_norm": 6.774247738507147, + "learning_rate": 6.135938373925576e-06, + "loss": 0.0879, + "step": 2490 + }, + { + "epoch": 1.78, + "grad_norm": 5.476440380457411, + "learning_rate": 6.133123609215249e-06, + "loss": 0.0761, + "step": 2491 + }, + { + "epoch": 1.78, + "grad_norm": 5.095496325152822, + "learning_rate": 6.130308465907263e-06, + "loss": 0.0991, + "step": 2492 + }, + { + "epoch": 1.78, + "grad_norm": 6.029873546120703, + "learning_rate": 6.127492944942209e-06, + "loss": 0.0861, + "step": 2493 + }, + { + "epoch": 1.78, + "grad_norm": 19.09411821013496, + "learning_rate": 6.124677047260805e-06, + "loss": 0.0801, + "step": 2494 + }, + { + "epoch": 1.78, + "grad_norm": 18.044110197346168, + "learning_rate": 6.121860773803895e-06, + "loss": 0.099, + "step": 2495 + }, + { + "epoch": 1.78, + "grad_norm": 9.097311899728787, + "learning_rate": 6.119044125512447e-06, + "loss": 0.082, + "step": 2496 + }, + { + "epoch": 1.78, + "grad_norm": 10.196462877637837, + "learning_rate": 6.116227103327559e-06, + "loss": 0.1102, + "step": 2497 + }, + { + "epoch": 1.78, + "grad_norm": 17.411823414220972, + "learning_rate": 6.113409708190447e-06, + "loss": 0.1019, + "step": 2498 + }, + { + "epoch": 1.78, + "grad_norm": 18.155557304557057, + "learning_rate": 6.1105919410424566e-06, + "loss": 0.1013, + "step": 2499 + }, + { + "epoch": 1.78, + "grad_norm": 16.792200022527275, + "learning_rate": 6.107773802825055e-06, + "loss": 0.1157, + "step": 2500 + }, + { + "epoch": 1.78, + "eval_avg_AUC": 0.8162853827527864, + "eval_avg_Accuracy": 0.7180039787798409, + "eval_avg_Accuracy-right": 0.8944176340159123, + "eval_avg_Accuracy-wrong": 0.4103934500795997, + "eval_avg_Num questions with both labels": 523, + "eval_avg_Question-wise AUC": 0.6879813648500559, + "eval_last_AUC": 0.8267063817923214, + "eval_last_Accuracy": 0.7504559018567639, + "eval_last_Accuracy-right": 0.8129646537107083, + "eval_last_Accuracy-wrong": 0.6414600864225608, + "eval_last_Num questions with both labels": 523, + "eval_last_Question-wise AUC": 0.6987671721798714, + "eval_max_AUC": 0.7501564696977459, + "eval_max_Accuracy": 0.6474635278514589, + "eval_max_Accuracy-right": 0.9788704838920047, + "eval_max_Accuracy-wrong": 0.0695929042528997, + "eval_max_Num questions with both labels": 523, + "eval_max_Question-wise AUC": 0.6173876633213748, + "eval_min_AUC": 0.824010385799939, + "eval_min_Accuracy": 0.7522795092838196, + "eval_min_Accuracy-right": 0.7631407330116082, + "eval_min_Accuracy-wrong": 0.7333409142597226, + "eval_min_Num questions with both labels": 523, + "eval_min_Question-wise AUC": 0.6985406564564841, + "eval_prod_AUC": 0.8230216036527745, + "eval_prod_Accuracy": 0.712367374005305, + "eval_prod_Accuracy-right": 0.617320986044085, + "eval_prod_Accuracy-wrong": 0.8780987036615875, + "eval_prod_Num questions with both labels": 523, + "eval_prod_Question-wise AUC": 0.6893611435134097, + "eval_runtime": 247.0285, + "eval_samples_per_second": 97.673, + "eval_steps_per_second": 3.052, + "eval_sum_AUC": 0.6795322314303073, + "eval_sum_Accuracy": 0.6392572944297082, + "eval_sum_Accuracy-right": 0.9985000652145559, + "eval_sum_Accuracy-wrong": 0.01284967022970207, + "eval_sum_Num questions with both labels": 523, + "eval_sum_Question-wise AUC": 0.6636898117156945, + "step": 2500 + }, + { + "epoch": 1.79, + "grad_norm": 27.66785481931168, + "learning_rate": 6.1049552944798355e-06, + "loss": 0.1031, + "step": 2501 + }, + { + "epoch": 1.79, + "grad_norm": 13.877238096501856, + "learning_rate": 6.102136416948513e-06, + "loss": 0.0922, + "step": 2502 + }, + { + "epoch": 1.79, + "grad_norm": 14.832991307429838, + "learning_rate": 6.099317171172929e-06, + "loss": 0.1277, + "step": 2503 + }, + { + "epoch": 1.79, + "grad_norm": 13.151505015978923, + "learning_rate": 6.0964975580950445e-06, + "loss": 0.088, + "step": 2504 + }, + { + "epoch": 1.79, + "grad_norm": 14.194545354333274, + "learning_rate": 6.093677578656946e-06, + "loss": 0.1083, + "step": 2505 + }, + { + "epoch": 1.79, + "grad_norm": 8.548368155403958, + "learning_rate": 6.090857233800839e-06, + "loss": 0.0778, + "step": 2506 + }, + { + "epoch": 1.79, + "grad_norm": 29.162206973113896, + "learning_rate": 6.0880365244690546e-06, + "loss": 0.1195, + "step": 2507 + }, + { + "epoch": 1.79, + "grad_norm": 8.958049048309908, + "learning_rate": 6.085215451604044e-06, + "loss": 0.0724, + "step": 2508 + }, + { + "epoch": 1.79, + "grad_norm": 29.68574871023764, + "learning_rate": 6.082394016148379e-06, + "loss": 0.0842, + "step": 2509 + }, + { + "epoch": 1.79, + "grad_norm": 34.56039583857383, + "learning_rate": 6.079572219044755e-06, + "loss": 0.1259, + "step": 2510 + }, + { + "epoch": 1.79, + "grad_norm": 12.146084312643838, + "learning_rate": 6.076750061235985e-06, + "loss": 0.0802, + "step": 2511 + }, + { + "epoch": 1.79, + "grad_norm": 17.542205198619733, + "learning_rate": 6.073927543665008e-06, + "loss": 0.1012, + "step": 2512 + }, + { + "epoch": 1.79, + "grad_norm": 28.234603954607753, + "learning_rate": 6.071104667274875e-06, + "loss": 0.1429, + "step": 2513 + }, + { + "epoch": 1.79, + "grad_norm": 22.631420233858943, + "learning_rate": 6.068281433008765e-06, + "loss": 0.1251, + "step": 2514 + }, + { + "epoch": 1.8, + "grad_norm": 8.105380110647712, + "learning_rate": 6.0654578418099715e-06, + "loss": 0.0927, + "step": 2515 + }, + { + "epoch": 1.8, + "grad_norm": 13.931020901993193, + "learning_rate": 6.062633894621909e-06, + "loss": 0.0729, + "step": 2516 + }, + { + "epoch": 1.8, + "grad_norm": 26.71305807576688, + "learning_rate": 6.0598095923881105e-06, + "loss": 0.1173, + "step": 2517 + }, + { + "epoch": 1.8, + "grad_norm": 23.253308243155512, + "learning_rate": 6.056984936052229e-06, + "loss": 0.092, + "step": 2518 + }, + { + "epoch": 1.8, + "grad_norm": 11.399153561239252, + "learning_rate": 6.054159926558033e-06, + "loss": 0.1056, + "step": 2519 + }, + { + "epoch": 1.8, + "grad_norm": 32.15751111995552, + "learning_rate": 6.051334564849413e-06, + "loss": 0.1127, + "step": 2520 + }, + { + "epoch": 1.8, + "grad_norm": 7.043809237704044, + "learning_rate": 6.048508851870372e-06, + "loss": 0.0915, + "step": 2521 + }, + { + "epoch": 1.8, + "grad_norm": 17.66369609169051, + "learning_rate": 6.045682788565036e-06, + "loss": 0.0673, + "step": 2522 + }, + { + "epoch": 1.8, + "grad_norm": 11.479436418010804, + "learning_rate": 6.042856375877644e-06, + "loss": 0.1146, + "step": 2523 + }, + { + "epoch": 1.8, + "grad_norm": 12.596499035596006, + "learning_rate": 6.040029614752551e-06, + "loss": 0.1011, + "step": 2524 + }, + { + "epoch": 1.8, + "grad_norm": 27.997863595749756, + "learning_rate": 6.037202506134234e-06, + "loss": 0.0824, + "step": 2525 + }, + { + "epoch": 1.8, + "grad_norm": 21.9014456038331, + "learning_rate": 6.03437505096728e-06, + "loss": 0.0857, + "step": 2526 + }, + { + "epoch": 1.8, + "grad_norm": 11.901316851446808, + "learning_rate": 6.0315472501963955e-06, + "loss": 0.1244, + "step": 2527 + }, + { + "epoch": 1.8, + "grad_norm": 8.351044648698084, + "learning_rate": 6.028719104766402e-06, + "loss": 0.0792, + "step": 2528 + }, + { + "epoch": 1.81, + "grad_norm": 21.352307377026413, + "learning_rate": 6.025890615622233e-06, + "loss": 0.1433, + "step": 2529 + }, + { + "epoch": 1.81, + "grad_norm": 26.395611117406183, + "learning_rate": 6.023061783708941e-06, + "loss": 0.0824, + "step": 2530 + }, + { + "epoch": 1.81, + "grad_norm": 8.387064879765818, + "learning_rate": 6.020232609971694e-06, + "loss": 0.093, + "step": 2531 + }, + { + "epoch": 1.81, + "grad_norm": 7.527848046304538, + "learning_rate": 6.017403095355766e-06, + "loss": 0.106, + "step": 2532 + }, + { + "epoch": 1.81, + "grad_norm": 30.56609985998804, + "learning_rate": 6.014573240806553e-06, + "loss": 0.1035, + "step": 2533 + }, + { + "epoch": 1.81, + "grad_norm": 18.801880253810776, + "learning_rate": 6.011743047269563e-06, + "loss": 0.1088, + "step": 2534 + }, + { + "epoch": 1.81, + "grad_norm": 9.532091172107885, + "learning_rate": 6.008912515690415e-06, + "loss": 0.0753, + "step": 2535 + }, + { + "epoch": 1.81, + "grad_norm": 12.658859923242405, + "learning_rate": 6.006081647014842e-06, + "loss": 0.0805, + "step": 2536 + }, + { + "epoch": 1.81, + "grad_norm": 29.195819838210074, + "learning_rate": 6.00325044218869e-06, + "loss": 0.1105, + "step": 2537 + }, + { + "epoch": 1.81, + "grad_norm": 18.51162600271363, + "learning_rate": 6.000418902157919e-06, + "loss": 0.1062, + "step": 2538 + }, + { + "epoch": 1.81, + "grad_norm": 44.1732405351717, + "learning_rate": 5.997587027868598e-06, + "loss": 0.1221, + "step": 2539 + }, + { + "epoch": 1.81, + "grad_norm": 29.309897683209353, + "learning_rate": 5.994754820266908e-06, + "loss": 0.0969, + "step": 2540 + }, + { + "epoch": 1.81, + "grad_norm": 8.87554200493296, + "learning_rate": 5.991922280299143e-06, + "loss": 0.0918, + "step": 2541 + }, + { + "epoch": 1.81, + "grad_norm": 7.609783728113712, + "learning_rate": 5.989089408911706e-06, + "loss": 0.0911, + "step": 2542 + }, + { + "epoch": 1.82, + "grad_norm": 14.418526098296795, + "learning_rate": 5.986256207051113e-06, + "loss": 0.1036, + "step": 2543 + }, + { + "epoch": 1.82, + "grad_norm": 58.75440816966157, + "learning_rate": 5.98342267566399e-06, + "loss": 0.1527, + "step": 2544 + }, + { + "epoch": 1.82, + "grad_norm": 32.69007877459673, + "learning_rate": 5.9805888156970714e-06, + "loss": 0.1378, + "step": 2545 + }, + { + "epoch": 1.82, + "grad_norm": 28.46074577433084, + "learning_rate": 5.977754628097203e-06, + "loss": 0.0883, + "step": 2546 + }, + { + "epoch": 1.82, + "grad_norm": 36.58388507387772, + "learning_rate": 5.97492011381134e-06, + "loss": 0.088, + "step": 2547 + }, + { + "epoch": 1.82, + "grad_norm": 40.783240549154485, + "learning_rate": 5.972085273786547e-06, + "loss": 0.0876, + "step": 2548 + }, + { + "epoch": 1.82, + "grad_norm": 27.81800708078887, + "learning_rate": 5.969250108969995e-06, + "loss": 0.1101, + "step": 2549 + }, + { + "epoch": 1.82, + "grad_norm": 8.59147754384461, + "learning_rate": 5.966414620308965e-06, + "loss": 0.092, + "step": 2550 + }, + { + "epoch": 1.82, + "grad_norm": 28.42033100421827, + "learning_rate": 5.9635788087508474e-06, + "loss": 0.1016, + "step": 2551 + }, + { + "epoch": 1.82, + "grad_norm": 34.74266526203714, + "learning_rate": 5.960742675243139e-06, + "loss": 0.0794, + "step": 2552 + }, + { + "epoch": 1.82, + "grad_norm": 21.553513330683376, + "learning_rate": 5.957906220733447e-06, + "loss": 0.0999, + "step": 2553 + }, + { + "epoch": 1.82, + "grad_norm": 5.114093682906385, + "learning_rate": 5.9550694461694806e-06, + "loss": 0.0754, + "step": 2554 + }, + { + "epoch": 1.82, + "grad_norm": 7.8221315937987494, + "learning_rate": 5.95223235249906e-06, + "loss": 0.099, + "step": 2555 + }, + { + "epoch": 1.82, + "grad_norm": 16.717018622807625, + "learning_rate": 5.949394940670112e-06, + "loss": 0.0839, + "step": 2556 + }, + { + "epoch": 1.83, + "grad_norm": 10.214246667729665, + "learning_rate": 5.946557211630667e-06, + "loss": 0.0936, + "step": 2557 + }, + { + "epoch": 1.83, + "grad_norm": 9.657749645898761, + "learning_rate": 5.943719166328864e-06, + "loss": 0.0894, + "step": 2558 + }, + { + "epoch": 1.83, + "grad_norm": 5.149493730592742, + "learning_rate": 5.940880805712945e-06, + "loss": 0.0777, + "step": 2559 + }, + { + "epoch": 1.83, + "grad_norm": 6.2592533001332225, + "learning_rate": 5.938042130731262e-06, + "loss": 0.1044, + "step": 2560 + }, + { + "epoch": 1.83, + "grad_norm": 32.71064285721712, + "learning_rate": 5.935203142332267e-06, + "loss": 0.1262, + "step": 2561 + }, + { + "epoch": 1.83, + "grad_norm": 6.8348479431905105, + "learning_rate": 5.932363841464519e-06, + "loss": 0.0815, + "step": 2562 + }, + { + "epoch": 1.83, + "grad_norm": 30.961681239814183, + "learning_rate": 5.9295242290766805e-06, + "loss": 0.1127, + "step": 2563 + }, + { + "epoch": 1.83, + "grad_norm": 18.416776056408292, + "learning_rate": 5.9266843061175216e-06, + "loss": 0.0973, + "step": 2564 + }, + { + "epoch": 1.83, + "grad_norm": 10.93811275476671, + "learning_rate": 5.92384407353591e-06, + "loss": 0.1251, + "step": 2565 + }, + { + "epoch": 1.83, + "grad_norm": 5.447480213973523, + "learning_rate": 5.921003532280822e-06, + "loss": 0.0801, + "step": 2566 + }, + { + "epoch": 1.83, + "grad_norm": 7.813801842459522, + "learning_rate": 5.918162683301336e-06, + "loss": 0.1125, + "step": 2567 + }, + { + "epoch": 1.83, + "grad_norm": 28.216557976157063, + "learning_rate": 5.91532152754663e-06, + "loss": 0.1176, + "step": 2568 + }, + { + "epoch": 1.83, + "grad_norm": 19.011803166843467, + "learning_rate": 5.91248006596599e-06, + "loss": 0.1005, + "step": 2569 + }, + { + "epoch": 1.83, + "grad_norm": 22.129204827206834, + "learning_rate": 5.909638299508798e-06, + "loss": 0.0659, + "step": 2570 + }, + { + "epoch": 1.84, + "grad_norm": 8.030892883456888, + "learning_rate": 5.906796229124543e-06, + "loss": 0.101, + "step": 2571 + }, + { + "epoch": 1.84, + "grad_norm": 16.213774164615035, + "learning_rate": 5.903953855762812e-06, + "loss": 0.0795, + "step": 2572 + }, + { + "epoch": 1.84, + "grad_norm": 5.430953096593536, + "learning_rate": 5.901111180373298e-06, + "loss": 0.1147, + "step": 2573 + }, + { + "epoch": 1.84, + "grad_norm": 43.58218646987675, + "learning_rate": 5.898268203905788e-06, + "loss": 0.1244, + "step": 2574 + }, + { + "epoch": 1.84, + "grad_norm": 36.70832267287658, + "learning_rate": 5.895424927310174e-06, + "loss": 0.1086, + "step": 2575 + }, + { + "epoch": 1.84, + "grad_norm": 23.88936445567082, + "learning_rate": 5.89258135153645e-06, + "loss": 0.1085, + "step": 2576 + }, + { + "epoch": 1.84, + "grad_norm": 43.53612470901018, + "learning_rate": 5.889737477534704e-06, + "loss": 0.103, + "step": 2577 + }, + { + "epoch": 1.84, + "grad_norm": 34.14388783488702, + "learning_rate": 5.886893306255129e-06, + "loss": 0.1014, + "step": 2578 + }, + { + "epoch": 1.84, + "grad_norm": 21.552187716115085, + "learning_rate": 5.884048838648017e-06, + "loss": 0.1003, + "step": 2579 + }, + { + "epoch": 1.84, + "grad_norm": 6.854450569018791, + "learning_rate": 5.881204075663755e-06, + "loss": 0.1073, + "step": 2580 + }, + { + "epoch": 1.84, + "grad_norm": 21.51464575703357, + "learning_rate": 5.878359018252831e-06, + "loss": 0.1342, + "step": 2581 + }, + { + "epoch": 1.84, + "grad_norm": 41.8483916842586, + "learning_rate": 5.8755136673658365e-06, + "loss": 0.1082, + "step": 2582 + }, + { + "epoch": 1.84, + "grad_norm": 37.636416874159046, + "learning_rate": 5.872668023953449e-06, + "loss": 0.0976, + "step": 2583 + }, + { + "epoch": 1.84, + "grad_norm": 7.552098998633584, + "learning_rate": 5.869822088966455e-06, + "loss": 0.0833, + "step": 2584 + }, + { + "epoch": 1.85, + "grad_norm": 10.713494577494489, + "learning_rate": 5.866975863355734e-06, + "loss": 0.0893, + "step": 2585 + }, + { + "epoch": 1.85, + "grad_norm": 23.2091103383476, + "learning_rate": 5.864129348072261e-06, + "loss": 0.1205, + "step": 2586 + }, + { + "epoch": 1.85, + "grad_norm": 31.823573317574535, + "learning_rate": 5.861282544067112e-06, + "loss": 0.1232, + "step": 2587 + }, + { + "epoch": 1.85, + "grad_norm": 22.21437732970618, + "learning_rate": 5.8584354522914555e-06, + "loss": 0.1088, + "step": 2588 + }, + { + "epoch": 1.85, + "grad_norm": 11.487315525706093, + "learning_rate": 5.855588073696559e-06, + "loss": 0.0837, + "step": 2589 + }, + { + "epoch": 1.85, + "grad_norm": 22.281223727113854, + "learning_rate": 5.852740409233785e-06, + "loss": 0.0732, + "step": 2590 + }, + { + "epoch": 1.85, + "grad_norm": 24.22884461950451, + "learning_rate": 5.849892459854588e-06, + "loss": 0.0881, + "step": 2591 + }, + { + "epoch": 1.85, + "grad_norm": 25.959868893514688, + "learning_rate": 5.847044226510524e-06, + "loss": 0.0851, + "step": 2592 + }, + { + "epoch": 1.85, + "grad_norm": 17.515409904214152, + "learning_rate": 5.84419571015324e-06, + "loss": 0.1182, + "step": 2593 + }, + { + "epoch": 1.85, + "grad_norm": 14.3143330651017, + "learning_rate": 5.8413469117344766e-06, + "loss": 0.1311, + "step": 2594 + }, + { + "epoch": 1.85, + "grad_norm": 13.137514401761287, + "learning_rate": 5.838497832206074e-06, + "loss": 0.0912, + "step": 2595 + }, + { + "epoch": 1.85, + "grad_norm": 6.325028784584776, + "learning_rate": 5.835648472519958e-06, + "loss": 0.0896, + "step": 2596 + }, + { + "epoch": 1.85, + "grad_norm": 24.617506819797416, + "learning_rate": 5.832798833628156e-06, + "loss": 0.0967, + "step": 2597 + }, + { + "epoch": 1.85, + "grad_norm": 5.056349053356649, + "learning_rate": 5.829948916482784e-06, + "loss": 0.0851, + "step": 2598 + }, + { + "epoch": 1.86, + "grad_norm": 5.059342030707879, + "learning_rate": 5.827098722036053e-06, + "loss": 0.0838, + "step": 2599 + }, + { + "epoch": 1.86, + "grad_norm": 21.709897870658843, + "learning_rate": 5.824248251240265e-06, + "loss": 0.1256, + "step": 2600 + }, + { + "epoch": 1.86, + "grad_norm": 21.868681173826214, + "learning_rate": 5.8213975050478155e-06, + "loss": 0.0929, + "step": 2601 + }, + { + "epoch": 1.86, + "grad_norm": 12.614721781840734, + "learning_rate": 5.818546484411191e-06, + "loss": 0.1243, + "step": 2602 + }, + { + "epoch": 1.86, + "grad_norm": 10.034829947869662, + "learning_rate": 5.815695190282974e-06, + "loss": 0.1393, + "step": 2603 + }, + { + "epoch": 1.86, + "grad_norm": 13.607835352795393, + "learning_rate": 5.81284362361583e-06, + "loss": 0.1298, + "step": 2604 + }, + { + "epoch": 1.86, + "grad_norm": 10.740168450428905, + "learning_rate": 5.809991785362525e-06, + "loss": 0.0995, + "step": 2605 + }, + { + "epoch": 1.86, + "grad_norm": 10.42600482469918, + "learning_rate": 5.8071396764759065e-06, + "loss": 0.1045, + "step": 2606 + }, + { + "epoch": 1.86, + "grad_norm": 14.34092985354534, + "learning_rate": 5.804287297908923e-06, + "loss": 0.099, + "step": 2607 + }, + { + "epoch": 1.86, + "grad_norm": 12.665461324372817, + "learning_rate": 5.801434650614601e-06, + "loss": 0.1346, + "step": 2608 + }, + { + "epoch": 1.86, + "grad_norm": 6.536614697320999, + "learning_rate": 5.798581735546066e-06, + "loss": 0.1066, + "step": 2609 + }, + { + "epoch": 1.86, + "grad_norm": 5.547830795525716, + "learning_rate": 5.79572855365653e-06, + "loss": 0.0737, + "step": 2610 + }, + { + "epoch": 1.86, + "grad_norm": 22.917140065123103, + "learning_rate": 5.792875105899294e-06, + "loss": 0.1536, + "step": 2611 + }, + { + "epoch": 1.86, + "grad_norm": 14.280858621166734, + "learning_rate": 5.790021393227747e-06, + "loss": 0.1257, + "step": 2612 + }, + { + "epoch": 1.87, + "grad_norm": 8.601450815452466, + "learning_rate": 5.787167416595369e-06, + "loss": 0.0939, + "step": 2613 + }, + { + "epoch": 1.87, + "grad_norm": 23.25278219450862, + "learning_rate": 5.784313176955726e-06, + "loss": 0.1099, + "step": 2614 + }, + { + "epoch": 1.87, + "grad_norm": 13.52299424961681, + "learning_rate": 5.781458675262472e-06, + "loss": 0.0918, + "step": 2615 + }, + { + "epoch": 1.87, + "grad_norm": 13.140181284280587, + "learning_rate": 5.778603912469349e-06, + "loss": 0.1211, + "step": 2616 + }, + { + "epoch": 1.87, + "grad_norm": 17.987639399325246, + "learning_rate": 5.775748889530187e-06, + "loss": 0.1158, + "step": 2617 + }, + { + "epoch": 1.87, + "grad_norm": 7.029185285872469, + "learning_rate": 5.772893607398901e-06, + "loss": 0.0793, + "step": 2618 + }, + { + "epoch": 1.87, + "grad_norm": 10.243106638432673, + "learning_rate": 5.770038067029496e-06, + "loss": 0.0837, + "step": 2619 + }, + { + "epoch": 1.87, + "grad_norm": 7.120185441810166, + "learning_rate": 5.76718226937606e-06, + "loss": 0.071, + "step": 2620 + }, + { + "epoch": 1.87, + "grad_norm": 17.79704560468372, + "learning_rate": 5.764326215392768e-06, + "loss": 0.0668, + "step": 2621 + }, + { + "epoch": 1.87, + "grad_norm": 23.092612411054002, + "learning_rate": 5.761469906033879e-06, + "loss": 0.0911, + "step": 2622 + }, + { + "epoch": 1.87, + "grad_norm": 5.923215155786091, + "learning_rate": 5.758613342253743e-06, + "loss": 0.0652, + "step": 2623 + }, + { + "epoch": 1.87, + "grad_norm": 12.23566762430646, + "learning_rate": 5.7557565250067896e-06, + "loss": 0.093, + "step": 2624 + }, + { + "epoch": 1.87, + "grad_norm": 29.68819365852448, + "learning_rate": 5.752899455247532e-06, + "loss": 0.0942, + "step": 2625 + }, + { + "epoch": 1.87, + "grad_norm": 25.11407988203367, + "learning_rate": 5.750042133930571e-06, + "loss": 0.1067, + "step": 2626 + }, + { + "epoch": 1.88, + "grad_norm": 13.029913042547735, + "learning_rate": 5.7471845620105925e-06, + "loss": 0.095, + "step": 2627 + }, + { + "epoch": 1.88, + "grad_norm": 8.765283553153818, + "learning_rate": 5.744326740442364e-06, + "loss": 0.1107, + "step": 2628 + }, + { + "epoch": 1.88, + "grad_norm": 14.7909670270782, + "learning_rate": 5.741468670180737e-06, + "loss": 0.1256, + "step": 2629 + }, + { + "epoch": 1.88, + "grad_norm": 16.345896107400005, + "learning_rate": 5.738610352180645e-06, + "loss": 0.1219, + "step": 2630 + }, + { + "epoch": 1.88, + "grad_norm": 6.525593347954734, + "learning_rate": 5.735751787397106e-06, + "loss": 0.0771, + "step": 2631 + }, + { + "epoch": 1.88, + "grad_norm": 29.13519994204438, + "learning_rate": 5.732892976785218e-06, + "loss": 0.1133, + "step": 2632 + }, + { + "epoch": 1.88, + "grad_norm": 10.037077891178535, + "learning_rate": 5.730033921300166e-06, + "loss": 0.0765, + "step": 2633 + }, + { + "epoch": 1.88, + "grad_norm": 11.01310603937196, + "learning_rate": 5.7271746218972105e-06, + "loss": 0.0965, + "step": 2634 + }, + { + "epoch": 1.88, + "grad_norm": 10.383120380608432, + "learning_rate": 5.724315079531697e-06, + "loss": 0.0765, + "step": 2635 + }, + { + "epoch": 1.88, + "grad_norm": 14.053650424576215, + "learning_rate": 5.721455295159053e-06, + "loss": 0.1095, + "step": 2636 + }, + { + "epoch": 1.88, + "grad_norm": 13.49823698629184, + "learning_rate": 5.7185952697347844e-06, + "loss": 0.1095, + "step": 2637 + }, + { + "epoch": 1.88, + "grad_norm": 15.100988227402953, + "learning_rate": 5.71573500421448e-06, + "loss": 0.0826, + "step": 2638 + }, + { + "epoch": 1.88, + "grad_norm": 18.69455842462284, + "learning_rate": 5.712874499553807e-06, + "loss": 0.1101, + "step": 2639 + }, + { + "epoch": 1.88, + "grad_norm": 8.131975095287098, + "learning_rate": 5.710013756708513e-06, + "loss": 0.1218, + "step": 2640 + }, + { + "epoch": 1.89, + "grad_norm": 18.309767202091365, + "learning_rate": 5.707152776634427e-06, + "loss": 0.0981, + "step": 2641 + }, + { + "epoch": 1.89, + "grad_norm": 17.760090104193882, + "learning_rate": 5.704291560287454e-06, + "loss": 0.1068, + "step": 2642 + }, + { + "epoch": 1.89, + "grad_norm": 17.356458991602008, + "learning_rate": 5.701430108623578e-06, + "loss": 0.0968, + "step": 2643 + }, + { + "epoch": 1.89, + "grad_norm": 8.346261757755903, + "learning_rate": 5.698568422598867e-06, + "loss": 0.0783, + "step": 2644 + }, + { + "epoch": 1.89, + "grad_norm": 10.936135220918278, + "learning_rate": 5.69570650316946e-06, + "loss": 0.0816, + "step": 2645 + }, + { + "epoch": 1.89, + "grad_norm": 12.619615725003502, + "learning_rate": 5.69284435129158e-06, + "loss": 0.105, + "step": 2646 + }, + { + "epoch": 1.89, + "grad_norm": 5.963461261946324, + "learning_rate": 5.689981967921523e-06, + "loss": 0.0933, + "step": 2647 + }, + { + "epoch": 1.89, + "grad_norm": 11.236246645246913, + "learning_rate": 5.6871193540156666e-06, + "loss": 0.0918, + "step": 2648 + }, + { + "epoch": 1.89, + "grad_norm": 12.81684610998191, + "learning_rate": 5.684256510530461e-06, + "loss": 0.1476, + "step": 2649 + }, + { + "epoch": 1.89, + "grad_norm": 11.153293940717711, + "learning_rate": 5.68139343842244e-06, + "loss": 0.0962, + "step": 2650 + }, + { + "epoch": 1.89, + "grad_norm": 14.765702957258346, + "learning_rate": 5.678530138648204e-06, + "loss": 0.0981, + "step": 2651 + }, + { + "epoch": 1.89, + "grad_norm": 7.541403950790154, + "learning_rate": 5.675666612164436e-06, + "loss": 0.1184, + "step": 2652 + }, + { + "epoch": 1.89, + "grad_norm": 27.66496804411558, + "learning_rate": 5.672802859927895e-06, + "loss": 0.093, + "step": 2653 + }, + { + "epoch": 1.89, + "grad_norm": 9.315476185301423, + "learning_rate": 5.669938882895412e-06, + "loss": 0.0898, + "step": 2654 + }, + { + "epoch": 1.9, + "grad_norm": 9.178791444866505, + "learning_rate": 5.667074682023896e-06, + "loss": 0.0924, + "step": 2655 + }, + { + "epoch": 1.9, + "grad_norm": 13.533905636009482, + "learning_rate": 5.664210258270331e-06, + "loss": 0.1217, + "step": 2656 + }, + { + "epoch": 1.9, + "grad_norm": 18.36169164512371, + "learning_rate": 5.661345612591771e-06, + "loss": 0.0782, + "step": 2657 + }, + { + "epoch": 1.9, + "grad_norm": 7.538301490553328, + "learning_rate": 5.6584807459453515e-06, + "loss": 0.0942, + "step": 2658 + }, + { + "epoch": 1.9, + "grad_norm": 10.264391649732847, + "learning_rate": 5.655615659288274e-06, + "loss": 0.1205, + "step": 2659 + }, + { + "epoch": 1.9, + "grad_norm": 6.794709350968707, + "learning_rate": 5.652750353577818e-06, + "loss": 0.1161, + "step": 2660 + }, + { + "epoch": 1.9, + "grad_norm": 13.99871974442599, + "learning_rate": 5.649884829771337e-06, + "loss": 0.1078, + "step": 2661 + }, + { + "epoch": 1.9, + "grad_norm": 7.867770861552083, + "learning_rate": 5.6470190888262545e-06, + "loss": 0.0922, + "step": 2662 + }, + { + "epoch": 1.9, + "grad_norm": 17.41387225248798, + "learning_rate": 5.644153131700067e-06, + "loss": 0.1163, + "step": 2663 + }, + { + "epoch": 1.9, + "grad_norm": 39.2761918554314, + "learning_rate": 5.6412869593503476e-06, + "loss": 0.1678, + "step": 2664 + }, + { + "epoch": 1.9, + "grad_norm": 9.993724717189073, + "learning_rate": 5.638420572734733e-06, + "loss": 0.0949, + "step": 2665 + }, + { + "epoch": 1.9, + "grad_norm": 10.59354372045396, + "learning_rate": 5.63555397281094e-06, + "loss": 0.0689, + "step": 2666 + }, + { + "epoch": 1.9, + "grad_norm": 24.937857294603422, + "learning_rate": 5.632687160536751e-06, + "loss": 0.0983, + "step": 2667 + }, + { + "epoch": 1.9, + "grad_norm": 41.37417208875139, + "learning_rate": 5.629820136870022e-06, + "loss": 0.1111, + "step": 2668 + }, + { + "epoch": 1.91, + "grad_norm": 7.437831972634287, + "learning_rate": 5.626952902768678e-06, + "loss": 0.1069, + "step": 2669 + }, + { + "epoch": 1.91, + "grad_norm": 14.905422981253997, + "learning_rate": 5.624085459190717e-06, + "loss": 0.0795, + "step": 2670 + }, + { + "epoch": 1.91, + "grad_norm": 18.705922511860052, + "learning_rate": 5.621217807094202e-06, + "loss": 0.0938, + "step": 2671 + }, + { + "epoch": 1.91, + "grad_norm": 31.833596818269967, + "learning_rate": 5.618349947437272e-06, + "loss": 0.1108, + "step": 2672 + }, + { + "epoch": 1.91, + "grad_norm": 18.633295676824503, + "learning_rate": 5.615481881178132e-06, + "loss": 0.1321, + "step": 2673 + }, + { + "epoch": 1.91, + "grad_norm": 4.9713241603062395, + "learning_rate": 5.612613609275054e-06, + "loss": 0.0859, + "step": 2674 + }, + { + "epoch": 1.91, + "grad_norm": 11.386906666975248, + "learning_rate": 5.609745132686383e-06, + "loss": 0.1326, + "step": 2675 + }, + { + "epoch": 1.91, + "grad_norm": 33.73569341071956, + "learning_rate": 5.60687645237053e-06, + "loss": 0.108, + "step": 2676 + }, + { + "epoch": 1.91, + "grad_norm": 37.4374522679626, + "learning_rate": 5.604007569285973e-06, + "loss": 0.135, + "step": 2677 + }, + { + "epoch": 1.91, + "grad_norm": 15.482062569547526, + "learning_rate": 5.6011384843912605e-06, + "loss": 0.098, + "step": 2678 + }, + { + "epoch": 1.91, + "grad_norm": 23.405290661541976, + "learning_rate": 5.598269198645008e-06, + "loss": 0.0983, + "step": 2679 + }, + { + "epoch": 1.91, + "grad_norm": 12.43065289610563, + "learning_rate": 5.5953997130058945e-06, + "loss": 0.1143, + "step": 2680 + }, + { + "epoch": 1.91, + "grad_norm": 25.630102473252506, + "learning_rate": 5.5925300284326715e-06, + "loss": 0.1412, + "step": 2681 + }, + { + "epoch": 1.91, + "grad_norm": 45.50344170828186, + "learning_rate": 5.5896601458841505e-06, + "loss": 0.1259, + "step": 2682 + }, + { + "epoch": 1.92, + "grad_norm": 30.06550381414141, + "learning_rate": 5.586790066319217e-06, + "loss": 0.0741, + "step": 2683 + }, + { + "epoch": 1.92, + "grad_norm": 7.271702734925442, + "learning_rate": 5.583919790696814e-06, + "loss": 0.0865, + "step": 2684 + }, + { + "epoch": 1.92, + "grad_norm": 24.493565366620036, + "learning_rate": 5.581049319975957e-06, + "loss": 0.0911, + "step": 2685 + }, + { + "epoch": 1.92, + "grad_norm": 10.872141679797997, + "learning_rate": 5.57817865511572e-06, + "loss": 0.1267, + "step": 2686 + }, + { + "epoch": 1.92, + "grad_norm": 31.622766470405587, + "learning_rate": 5.575307797075249e-06, + "loss": 0.1034, + "step": 2687 + }, + { + "epoch": 1.92, + "grad_norm": 33.281994290944546, + "learning_rate": 5.572436746813748e-06, + "loss": 0.1101, + "step": 2688 + }, + { + "epoch": 1.92, + "grad_norm": 9.921027242320786, + "learning_rate": 5.5695655052904905e-06, + "loss": 0.1333, + "step": 2689 + }, + { + "epoch": 1.92, + "grad_norm": 3.842511723248251, + "learning_rate": 5.566694073464812e-06, + "loss": 0.0715, + "step": 2690 + }, + { + "epoch": 1.92, + "grad_norm": 31.392960437519093, + "learning_rate": 5.56382245229611e-06, + "loss": 0.1133, + "step": 2691 + }, + { + "epoch": 1.92, + "grad_norm": 33.161077306234404, + "learning_rate": 5.560950642743847e-06, + "loss": 0.1158, + "step": 2692 + }, + { + "epoch": 1.92, + "grad_norm": 7.72182359708755, + "learning_rate": 5.558078645767547e-06, + "loss": 0.082, + "step": 2693 + }, + { + "epoch": 1.92, + "grad_norm": 33.264247772214716, + "learning_rate": 5.5552064623267986e-06, + "loss": 0.1349, + "step": 2694 + }, + { + "epoch": 1.92, + "grad_norm": 18.089787658256412, + "learning_rate": 5.5523340933812505e-06, + "loss": 0.1017, + "step": 2695 + }, + { + "epoch": 1.92, + "grad_norm": 15.398888267717105, + "learning_rate": 5.549461539890616e-06, + "loss": 0.0844, + "step": 2696 + }, + { + "epoch": 1.93, + "grad_norm": 24.67559393234702, + "learning_rate": 5.546588802814669e-06, + "loss": 0.0819, + "step": 2697 + }, + { + "epoch": 1.93, + "grad_norm": 18.90602279218803, + "learning_rate": 5.543715883113241e-06, + "loss": 0.0938, + "step": 2698 + }, + { + "epoch": 1.93, + "grad_norm": 26.617935820127663, + "learning_rate": 5.540842781746231e-06, + "loss": 0.0928, + "step": 2699 + }, + { + "epoch": 1.93, + "grad_norm": 9.410245020771018, + "learning_rate": 5.537969499673598e-06, + "loss": 0.1094, + "step": 2700 + }, + { + "epoch": 1.93, + "grad_norm": 12.46263647968909, + "learning_rate": 5.535096037855353e-06, + "loss": 0.0858, + "step": 2701 + }, + { + "epoch": 1.93, + "grad_norm": 24.402438809378204, + "learning_rate": 5.532222397251576e-06, + "loss": 0.0955, + "step": 2702 + }, + { + "epoch": 1.93, + "grad_norm": 18.28253890357763, + "learning_rate": 5.529348578822403e-06, + "loss": 0.132, + "step": 2703 + }, + { + "epoch": 1.93, + "grad_norm": 18.717809032260746, + "learning_rate": 5.526474583528032e-06, + "loss": 0.137, + "step": 2704 + }, + { + "epoch": 1.93, + "grad_norm": 24.233609934720455, + "learning_rate": 5.523600412328716e-06, + "loss": 0.1124, + "step": 2705 + }, + { + "epoch": 1.93, + "grad_norm": 12.27153406116996, + "learning_rate": 5.520726066184769e-06, + "loss": 0.1357, + "step": 2706 + }, + { + "epoch": 1.93, + "grad_norm": 36.491820592631434, + "learning_rate": 5.517851546056566e-06, + "loss": 0.0982, + "step": 2707 + }, + { + "epoch": 1.93, + "grad_norm": 24.71547500695131, + "learning_rate": 5.5149768529045355e-06, + "loss": 0.0795, + "step": 2708 + }, + { + "epoch": 1.93, + "grad_norm": 14.636866072311467, + "learning_rate": 5.512101987689168e-06, + "loss": 0.0864, + "step": 2709 + }, + { + "epoch": 1.93, + "grad_norm": 7.1436136261471574, + "learning_rate": 5.509226951371006e-06, + "loss": 0.0834, + "step": 2710 + }, + { + "epoch": 1.94, + "grad_norm": 20.77019856777048, + "learning_rate": 5.506351744910654e-06, + "loss": 0.0908, + "step": 2711 + }, + { + "epoch": 1.94, + "grad_norm": 20.18000054845474, + "learning_rate": 5.503476369268773e-06, + "loss": 0.0805, + "step": 2712 + }, + { + "epoch": 1.94, + "grad_norm": 11.97702212144314, + "learning_rate": 5.50060082540608e-06, + "loss": 0.0823, + "step": 2713 + }, + { + "epoch": 1.94, + "grad_norm": 18.93799132198841, + "learning_rate": 5.4977251142833445e-06, + "loss": 0.1274, + "step": 2714 + }, + { + "epoch": 1.94, + "grad_norm": 18.781493822249985, + "learning_rate": 5.494849236861397e-06, + "loss": 0.093, + "step": 2715 + }, + { + "epoch": 1.94, + "grad_norm": 25.586693957909443, + "learning_rate": 5.491973194101122e-06, + "loss": 0.1316, + "step": 2716 + }, + { + "epoch": 1.94, + "grad_norm": 33.83182887402268, + "learning_rate": 5.4890969869634606e-06, + "loss": 0.1128, + "step": 2717 + }, + { + "epoch": 1.94, + "grad_norm": 39.09734754951871, + "learning_rate": 5.486220616409403e-06, + "loss": 0.1139, + "step": 2718 + }, + { + "epoch": 1.94, + "grad_norm": 17.296007817801954, + "learning_rate": 5.4833440834e-06, + "loss": 0.0948, + "step": 2719 + }, + { + "epoch": 1.94, + "grad_norm": 36.970414189809944, + "learning_rate": 5.480467388896353e-06, + "loss": 0.1217, + "step": 2720 + }, + { + "epoch": 1.94, + "grad_norm": 41.98479812243881, + "learning_rate": 5.477590533859623e-06, + "loss": 0.1089, + "step": 2721 + }, + { + "epoch": 1.94, + "grad_norm": 39.39789221503169, + "learning_rate": 5.474713519251018e-06, + "loss": 0.1365, + "step": 2722 + }, + { + "epoch": 1.94, + "grad_norm": 10.022368374310624, + "learning_rate": 5.471836346031802e-06, + "loss": 0.0945, + "step": 2723 + }, + { + "epoch": 1.94, + "grad_norm": 11.411241170144551, + "learning_rate": 5.468959015163293e-06, + "loss": 0.0825, + "step": 2724 + }, + { + "epoch": 1.95, + "grad_norm": 17.404101960272982, + "learning_rate": 5.46608152760686e-06, + "loss": 0.0732, + "step": 2725 + }, + { + "epoch": 1.95, + "grad_norm": 39.62197496277501, + "learning_rate": 5.463203884323926e-06, + "loss": 0.1486, + "step": 2726 + }, + { + "epoch": 1.95, + "grad_norm": 27.49778841473844, + "learning_rate": 5.460326086275964e-06, + "loss": 0.1128, + "step": 2727 + }, + { + "epoch": 1.95, + "grad_norm": 6.379646370483621, + "learning_rate": 5.4574481344245015e-06, + "loss": 0.1173, + "step": 2728 + }, + { + "epoch": 1.95, + "grad_norm": 21.947214598255393, + "learning_rate": 5.454570029731115e-06, + "loss": 0.1093, + "step": 2729 + }, + { + "epoch": 1.95, + "grad_norm": 18.335934862211666, + "learning_rate": 5.451691773157431e-06, + "loss": 0.1183, + "step": 2730 + }, + { + "epoch": 1.95, + "grad_norm": 51.38289661985466, + "learning_rate": 5.448813365665129e-06, + "loss": 0.1513, + "step": 2731 + }, + { + "epoch": 1.95, + "grad_norm": 17.977264185528178, + "learning_rate": 5.44593480821594e-06, + "loss": 0.1239, + "step": 2732 + }, + { + "epoch": 1.95, + "grad_norm": 6.769399150938259, + "learning_rate": 5.443056101771643e-06, + "loss": 0.0939, + "step": 2733 + }, + { + "epoch": 1.95, + "grad_norm": 49.50787284234708, + "learning_rate": 5.44017724729407e-06, + "loss": 0.1066, + "step": 2734 + }, + { + "epoch": 1.95, + "grad_norm": 41.47491103746017, + "learning_rate": 5.437298245745093e-06, + "loss": 0.1199, + "step": 2735 + }, + { + "epoch": 1.95, + "grad_norm": 31.21920187501198, + "learning_rate": 5.434419098086645e-06, + "loss": 0.1379, + "step": 2736 + }, + { + "epoch": 1.95, + "grad_norm": 4.019317654949999, + "learning_rate": 5.431539805280702e-06, + "loss": 0.0836, + "step": 2737 + }, + { + "epoch": 1.95, + "grad_norm": 16.908346573560465, + "learning_rate": 5.428660368289289e-06, + "loss": 0.1027, + "step": 2738 + }, + { + "epoch": 1.96, + "grad_norm": 54.3446520419247, + "learning_rate": 5.42578078807448e-06, + "loss": 0.1442, + "step": 2739 + }, + { + "epoch": 1.96, + "grad_norm": 29.604460610263637, + "learning_rate": 5.422901065598395e-06, + "loss": 0.0917, + "step": 2740 + }, + { + "epoch": 1.96, + "grad_norm": 18.052870526381152, + "learning_rate": 5.4200212018232024e-06, + "loss": 0.0837, + "step": 2741 + }, + { + "epoch": 1.96, + "grad_norm": 7.485821680212534, + "learning_rate": 5.41714119771112e-06, + "loss": 0.0809, + "step": 2742 + }, + { + "epoch": 1.96, + "grad_norm": 23.671337108776125, + "learning_rate": 5.414261054224412e-06, + "loss": 0.0701, + "step": 2743 + }, + { + "epoch": 1.96, + "grad_norm": 13.628921621805322, + "learning_rate": 5.411380772325383e-06, + "loss": 0.132, + "step": 2744 + }, + { + "epoch": 1.96, + "grad_norm": 58.95361204923983, + "learning_rate": 5.408500352976392e-06, + "loss": 0.177, + "step": 2745 + }, + { + "epoch": 1.96, + "grad_norm": 13.789171355412767, + "learning_rate": 5.40561979713984e-06, + "loss": 0.0959, + "step": 2746 + }, + { + "epoch": 1.96, + "grad_norm": 8.334704571609194, + "learning_rate": 5.402739105778175e-06, + "loss": 0.1243, + "step": 2747 + }, + { + "epoch": 1.96, + "grad_norm": 46.74494923664595, + "learning_rate": 5.399858279853889e-06, + "loss": 0.1034, + "step": 2748 + }, + { + "epoch": 1.96, + "grad_norm": 37.41227408696971, + "learning_rate": 5.39697732032952e-06, + "loss": 0.0923, + "step": 2749 + }, + { + "epoch": 1.96, + "grad_norm": 41.32562404134608, + "learning_rate": 5.394096228167648e-06, + "loss": 0.1169, + "step": 2750 + }, + { + "epoch": 1.96, + "grad_norm": 11.165428505092693, + "learning_rate": 5.391215004330903e-06, + "loss": 0.0894, + "step": 2751 + }, + { + "epoch": 1.96, + "grad_norm": 25.320083374694267, + "learning_rate": 5.388333649781951e-06, + "loss": 0.121, + "step": 2752 + }, + { + "epoch": 1.97, + "grad_norm": 34.05814695395899, + "learning_rate": 5.3854521654835105e-06, + "loss": 0.1019, + "step": 2753 + }, + { + "epoch": 1.97, + "grad_norm": 37.729778469348126, + "learning_rate": 5.3825705523983366e-06, + "loss": 0.1252, + "step": 2754 + }, + { + "epoch": 1.97, + "grad_norm": 5.7768601785361735, + "learning_rate": 5.37968881148923e-06, + "loss": 0.1022, + "step": 2755 + }, + { + "epoch": 1.97, + "grad_norm": 21.434040827191126, + "learning_rate": 5.376806943719033e-06, + "loss": 0.106, + "step": 2756 + }, + { + "epoch": 1.97, + "grad_norm": 16.80181493949827, + "learning_rate": 5.373924950050633e-06, + "loss": 0.093, + "step": 2757 + }, + { + "epoch": 1.97, + "grad_norm": 48.37549473256851, + "learning_rate": 5.371042831446957e-06, + "loss": 0.172, + "step": 2758 + }, + { + "epoch": 1.97, + "grad_norm": 5.651227352253567, + "learning_rate": 5.3681605888709755e-06, + "loss": 0.0944, + "step": 2759 + }, + { + "epoch": 1.97, + "grad_norm": 8.240276586363413, + "learning_rate": 5.365278223285698e-06, + "loss": 0.1755, + "step": 2760 + }, + { + "epoch": 1.97, + "grad_norm": 13.781340769294333, + "learning_rate": 5.362395735654175e-06, + "loss": 0.1061, + "step": 2761 + }, + { + "epoch": 1.97, + "grad_norm": 36.77153746292403, + "learning_rate": 5.3595131269395015e-06, + "loss": 0.1104, + "step": 2762 + }, + { + "epoch": 1.97, + "grad_norm": 26.923003378387772, + "learning_rate": 5.356630398104811e-06, + "loss": 0.1163, + "step": 2763 + }, + { + "epoch": 1.97, + "grad_norm": 19.27470596602233, + "learning_rate": 5.353747550113274e-06, + "loss": 0.0679, + "step": 2764 + }, + { + "epoch": 1.97, + "grad_norm": 27.07027957372197, + "learning_rate": 5.350864583928106e-06, + "loss": 0.1111, + "step": 2765 + }, + { + "epoch": 1.97, + "grad_norm": 7.765524049007483, + "learning_rate": 5.347981500512558e-06, + "loss": 0.085, + "step": 2766 + }, + { + "epoch": 1.98, + "grad_norm": 41.86517901009098, + "learning_rate": 5.345098300829924e-06, + "loss": 0.1456, + "step": 2767 + }, + { + "epoch": 1.98, + "grad_norm": 5.349243010931892, + "learning_rate": 5.342214985843534e-06, + "loss": 0.0994, + "step": 2768 + }, + { + "epoch": 1.98, + "grad_norm": 16.401974170097617, + "learning_rate": 5.339331556516755e-06, + "loss": 0.0795, + "step": 2769 + }, + { + "epoch": 1.98, + "grad_norm": 16.03990289754173, + "learning_rate": 5.336448013812996e-06, + "loss": 0.1182, + "step": 2770 + }, + { + "epoch": 1.98, + "grad_norm": 17.74040652818955, + "learning_rate": 5.333564358695701e-06, + "loss": 0.0903, + "step": 2771 + }, + { + "epoch": 1.98, + "grad_norm": 41.62125267709523, + "learning_rate": 5.330680592128355e-06, + "loss": 0.1292, + "step": 2772 + }, + { + "epoch": 1.98, + "grad_norm": 11.822329634056095, + "learning_rate": 5.3277967150744755e-06, + "loss": 0.0974, + "step": 2773 + }, + { + "epoch": 1.98, + "grad_norm": 17.12557284468244, + "learning_rate": 5.324912728497621e-06, + "loss": 0.104, + "step": 2774 + }, + { + "epoch": 1.98, + "grad_norm": 51.98493528591735, + "learning_rate": 5.322028633361386e-06, + "loss": 0.1464, + "step": 2775 + }, + { + "epoch": 1.98, + "grad_norm": 38.343198477712235, + "learning_rate": 5.319144430629397e-06, + "loss": 0.1155, + "step": 2776 + }, + { + "epoch": 1.98, + "grad_norm": 13.395904291966062, + "learning_rate": 5.316260121265323e-06, + "loss": 0.1008, + "step": 2777 + }, + { + "epoch": 1.98, + "grad_norm": 12.23983059469622, + "learning_rate": 5.313375706232864e-06, + "loss": 0.1072, + "step": 2778 + }, + { + "epoch": 1.98, + "grad_norm": 6.743029425887316, + "learning_rate": 5.310491186495757e-06, + "loss": 0.0966, + "step": 2779 + }, + { + "epoch": 1.98, + "grad_norm": 28.054949717811493, + "learning_rate": 5.307606563017772e-06, + "loss": 0.0975, + "step": 2780 + }, + { + "epoch": 1.99, + "grad_norm": 15.050796478633394, + "learning_rate": 5.304721836762717e-06, + "loss": 0.0774, + "step": 2781 + }, + { + "epoch": 1.99, + "grad_norm": 6.720263367169756, + "learning_rate": 5.301837008694433e-06, + "loss": 0.0818, + "step": 2782 + }, + { + "epoch": 1.99, + "grad_norm": 11.712277096253679, + "learning_rate": 5.298952079776794e-06, + "loss": 0.0989, + "step": 2783 + }, + { + "epoch": 1.99, + "grad_norm": 7.4999136139393965, + "learning_rate": 5.296067050973709e-06, + "loss": 0.0798, + "step": 2784 + }, + { + "epoch": 1.99, + "grad_norm": 16.593799394745258, + "learning_rate": 5.29318192324912e-06, + "loss": 0.0981, + "step": 2785 + }, + { + "epoch": 1.99, + "grad_norm": 8.159019478221214, + "learning_rate": 5.290296697566999e-06, + "loss": 0.0942, + "step": 2786 + }, + { + "epoch": 1.99, + "grad_norm": 12.250548068310936, + "learning_rate": 5.287411374891356e-06, + "loss": 0.0811, + "step": 2787 + }, + { + "epoch": 1.99, + "grad_norm": 11.420608184090936, + "learning_rate": 5.284525956186231e-06, + "loss": 0.098, + "step": 2788 + }, + { + "epoch": 1.99, + "grad_norm": 6.178523711245514, + "learning_rate": 5.281640442415695e-06, + "loss": 0.1224, + "step": 2789 + }, + { + "epoch": 1.99, + "grad_norm": 14.432868868069074, + "learning_rate": 5.278754834543852e-06, + "loss": 0.0961, + "step": 2790 + }, + { + "epoch": 1.99, + "grad_norm": 19.832143314823682, + "learning_rate": 5.275869133534838e-06, + "loss": 0.0934, + "step": 2791 + }, + { + "epoch": 1.99, + "grad_norm": 6.173140621912715, + "learning_rate": 5.272983340352818e-06, + "loss": 0.0894, + "step": 2792 + }, + { + "epoch": 1.99, + "grad_norm": 8.664241283983232, + "learning_rate": 5.270097455961991e-06, + "loss": 0.0939, + "step": 2793 + }, + { + "epoch": 1.99, + "grad_norm": 6.199786160737331, + "learning_rate": 5.267211481326584e-06, + "loss": 0.0717, + "step": 2794 + }, + { + "epoch": 2.0, + "grad_norm": 9.359083770279224, + "learning_rate": 5.264325417410854e-06, + "loss": 0.0862, + "step": 2795 + }, + { + "epoch": 2.0, + "grad_norm": 8.999303639282669, + "learning_rate": 5.261439265179089e-06, + "loss": 0.0884, + "step": 2796 + }, + { + "epoch": 2.0, + "grad_norm": 9.531777669594215, + "learning_rate": 5.258553025595605e-06, + "loss": 0.1085, + "step": 2797 + }, + { + "epoch": 2.0, + "grad_norm": 10.564778248106444, + "learning_rate": 5.255666699624749e-06, + "loss": 0.0837, + "step": 2798 + }, + { + "epoch": 2.0, + "grad_norm": 8.144429532283462, + "learning_rate": 5.252780288230899e-06, + "loss": 0.0817, + "step": 2799 + }, + { + "epoch": 2.0, + "grad_norm": 8.479863646274058, + "learning_rate": 5.249893792378454e-06, + "loss": 0.1122, + "step": 2800 + }, + { + "epoch": 2.0, + "grad_norm": 8.219194155167031, + "learning_rate": 5.24700721303185e-06, + "loss": 0.1039, + "step": 2801 + }, + { + "epoch": 2.0, + "grad_norm": 16.285123515055172, + "learning_rate": 5.244120551155544e-06, + "loss": 0.1119, + "step": 2802 + }, + { + "epoch": 2.0, + "grad_norm": 13.901589114079679, + "learning_rate": 5.241233807714024e-06, + "loss": 0.054, + "step": 2803 + }, + { + "epoch": 2.0, + "grad_norm": 4.199611427046534, + "learning_rate": 5.238346983671805e-06, + "loss": 0.0583, + "step": 2804 + }, + { + "epoch": 2.0, + "grad_norm": 7.9097903544474075, + "learning_rate": 5.235460079993429e-06, + "loss": 0.0524, + "step": 2805 + }, + { + "epoch": 2.0, + "grad_norm": 9.10846903963757, + "learning_rate": 5.232573097643462e-06, + "loss": 0.0448, + "step": 2806 + }, + { + "epoch": 2.0, + "grad_norm": 8.33526310227755, + "learning_rate": 5.229686037586502e-06, + "loss": 0.0567, + "step": 2807 + }, + { + "epoch": 2.0, + "grad_norm": 14.233119308284538, + "learning_rate": 5.226798900787167e-06, + "loss": 0.0535, + "step": 2808 + }, + { + "epoch": 2.0, + "grad_norm": 7.672664735913163, + "learning_rate": 5.223911688210104e-06, + "loss": 0.0562, + "step": 2809 + }, + { + "epoch": 2.01, + "grad_norm": 11.614815160250359, + "learning_rate": 5.221024400819983e-06, + "loss": 0.0532, + "step": 2810 + }, + { + "epoch": 2.01, + "grad_norm": 14.154573219522113, + "learning_rate": 5.218137039581504e-06, + "loss": 0.0424, + "step": 2811 + }, + { + "epoch": 2.01, + "grad_norm": 6.576671964529106, + "learning_rate": 5.215249605459382e-06, + "loss": 0.0618, + "step": 2812 + }, + { + "epoch": 2.01, + "grad_norm": 11.287896142648293, + "learning_rate": 5.212362099418369e-06, + "loss": 0.0522, + "step": 2813 + }, + { + "epoch": 2.01, + "grad_norm": 6.582372740411856, + "learning_rate": 5.2094745224232306e-06, + "loss": 0.0511, + "step": 2814 + }, + { + "epoch": 2.01, + "grad_norm": 6.17959564383128, + "learning_rate": 5.206586875438759e-06, + "loss": 0.0565, + "step": 2815 + }, + { + "epoch": 2.01, + "grad_norm": 12.734580828635346, + "learning_rate": 5.203699159429773e-06, + "loss": 0.049, + "step": 2816 + }, + { + "epoch": 2.01, + "grad_norm": 13.886150462876547, + "learning_rate": 5.200811375361112e-06, + "loss": 0.0618, + "step": 2817 + }, + { + "epoch": 2.01, + "grad_norm": 9.75151912924947, + "learning_rate": 5.197923524197639e-06, + "loss": 0.0517, + "step": 2818 + }, + { + "epoch": 2.01, + "grad_norm": 8.18582132062964, + "learning_rate": 5.195035606904237e-06, + "loss": 0.0461, + "step": 2819 + }, + { + "epoch": 2.01, + "grad_norm": 4.949615940026982, + "learning_rate": 5.1921476244458135e-06, + "loss": 0.07, + "step": 2820 + }, + { + "epoch": 2.01, + "grad_norm": 5.522744474789711, + "learning_rate": 5.189259577787297e-06, + "loss": 0.0469, + "step": 2821 + }, + { + "epoch": 2.01, + "grad_norm": 13.237191723970465, + "learning_rate": 5.186371467893638e-06, + "loss": 0.0585, + "step": 2822 + }, + { + "epoch": 2.01, + "grad_norm": 22.63811767687695, + "learning_rate": 5.1834832957298075e-06, + "loss": 0.0612, + "step": 2823 + }, + { + "epoch": 2.02, + "grad_norm": 10.856725695302611, + "learning_rate": 5.180595062260797e-06, + "loss": 0.0328, + "step": 2824 + }, + { + "epoch": 2.02, + "grad_norm": 5.140432943530161, + "learning_rate": 5.177706768451619e-06, + "loss": 0.042, + "step": 2825 + }, + { + "epoch": 2.02, + "grad_norm": 14.324322382042538, + "learning_rate": 5.174818415267308e-06, + "loss": 0.0389, + "step": 2826 + }, + { + "epoch": 2.02, + "grad_norm": 12.103738058031697, + "learning_rate": 5.1719300036729135e-06, + "loss": 0.0532, + "step": 2827 + }, + { + "epoch": 2.02, + "grad_norm": 23.584660325094507, + "learning_rate": 5.169041534633511e-06, + "loss": 0.0512, + "step": 2828 + }, + { + "epoch": 2.02, + "grad_norm": 10.668112848453543, + "learning_rate": 5.166153009114188e-06, + "loss": 0.0463, + "step": 2829 + }, + { + "epoch": 2.02, + "grad_norm": 3.695483605532189, + "learning_rate": 5.163264428080057e-06, + "loss": 0.0368, + "step": 2830 + }, + { + "epoch": 2.02, + "grad_norm": 12.550637521816691, + "learning_rate": 5.160375792496246e-06, + "loss": 0.0579, + "step": 2831 + }, + { + "epoch": 2.02, + "grad_norm": 7.404223260758192, + "learning_rate": 5.157487103327901e-06, + "loss": 0.0544, + "step": 2832 + }, + { + "epoch": 2.02, + "grad_norm": 14.804123391013425, + "learning_rate": 5.1545983615401885e-06, + "loss": 0.0443, + "step": 2833 + }, + { + "epoch": 2.02, + "grad_norm": 2.9860623662205845, + "learning_rate": 5.151709568098289e-06, + "loss": 0.0437, + "step": 2834 + }, + { + "epoch": 2.02, + "grad_norm": 3.4808084788201925, + "learning_rate": 5.1488207239674036e-06, + "loss": 0.048, + "step": 2835 + }, + { + "epoch": 2.02, + "grad_norm": 9.244067790459159, + "learning_rate": 5.145931830112748e-06, + "loss": 0.0397, + "step": 2836 + }, + { + "epoch": 2.02, + "grad_norm": 6.7362343327123035, + "learning_rate": 5.1430428874995554e-06, + "loss": 0.0555, + "step": 2837 + }, + { + "epoch": 2.03, + "grad_norm": 9.004920379741428, + "learning_rate": 5.140153897093076e-06, + "loss": 0.0459, + "step": 2838 + }, + { + "epoch": 2.03, + "grad_norm": 8.516109389469001, + "learning_rate": 5.1372648598585725e-06, + "loss": 0.0656, + "step": 2839 + }, + { + "epoch": 2.03, + "grad_norm": 8.640361450205146, + "learning_rate": 5.134375776761329e-06, + "loss": 0.0576, + "step": 2840 + }, + { + "epoch": 2.03, + "grad_norm": 5.993357106722759, + "learning_rate": 5.131486648766642e-06, + "loss": 0.0432, + "step": 2841 + }, + { + "epoch": 2.03, + "grad_norm": 14.19048628173077, + "learning_rate": 5.1285974768398205e-06, + "loss": 0.0488, + "step": 2842 + }, + { + "epoch": 2.03, + "grad_norm": 3.711900865433462, + "learning_rate": 5.125708261946192e-06, + "loss": 0.0494, + "step": 2843 + }, + { + "epoch": 2.03, + "grad_norm": 7.5442087251704875, + "learning_rate": 5.122819005051096e-06, + "loss": 0.0534, + "step": 2844 + }, + { + "epoch": 2.03, + "grad_norm": 20.77714075942429, + "learning_rate": 5.119929707119889e-06, + "loss": 0.057, + "step": 2845 + }, + { + "epoch": 2.03, + "grad_norm": 24.696838420818487, + "learning_rate": 5.117040369117937e-06, + "loss": 0.071, + "step": 2846 + }, + { + "epoch": 2.03, + "grad_norm": 12.90688500272758, + "learning_rate": 5.114150992010621e-06, + "loss": 0.0461, + "step": 2847 + }, + { + "epoch": 2.03, + "grad_norm": 21.76698605488115, + "learning_rate": 5.1112615767633385e-06, + "loss": 0.0688, + "step": 2848 + }, + { + "epoch": 2.03, + "grad_norm": 12.042476678664588, + "learning_rate": 5.108372124341494e-06, + "loss": 0.0603, + "step": 2849 + }, + { + "epoch": 2.03, + "grad_norm": 8.711444498022269, + "learning_rate": 5.105482635710509e-06, + "loss": 0.045, + "step": 2850 + }, + { + "epoch": 2.03, + "grad_norm": 4.965522711679408, + "learning_rate": 5.102593111835815e-06, + "loss": 0.0609, + "step": 2851 + }, + { + "epoch": 2.04, + "grad_norm": 5.78715066740317, + "learning_rate": 5.099703553682854e-06, + "loss": 0.0474, + "step": 2852 + }, + { + "epoch": 2.04, + "grad_norm": 14.612001845478728, + "learning_rate": 5.096813962217086e-06, + "loss": 0.0386, + "step": 2853 + }, + { + "epoch": 2.04, + "grad_norm": 8.213767104804448, + "learning_rate": 5.093924338403971e-06, + "loss": 0.0436, + "step": 2854 + }, + { + "epoch": 2.04, + "grad_norm": 14.21885718408981, + "learning_rate": 5.091034683208988e-06, + "loss": 0.0649, + "step": 2855 + }, + { + "epoch": 2.04, + "grad_norm": 9.200406261001536, + "learning_rate": 5.088144997597627e-06, + "loss": 0.0344, + "step": 2856 + }, + { + "epoch": 2.04, + "grad_norm": 5.7563067974727264, + "learning_rate": 5.085255282535383e-06, + "loss": 0.0523, + "step": 2857 + }, + { + "epoch": 2.04, + "grad_norm": 17.092771383434464, + "learning_rate": 5.082365538987765e-06, + "loss": 0.0494, + "step": 2858 + }, + { + "epoch": 2.04, + "grad_norm": 6.860949499099525, + "learning_rate": 5.079475767920289e-06, + "loss": 0.0499, + "step": 2859 + }, + { + "epoch": 2.04, + "grad_norm": 22.180665634795407, + "learning_rate": 5.076585970298481e-06, + "loss": 0.0686, + "step": 2860 + }, + { + "epoch": 2.04, + "grad_norm": 14.512557797350894, + "learning_rate": 5.073696147087878e-06, + "loss": 0.0486, + "step": 2861 + }, + { + "epoch": 2.04, + "grad_norm": 6.190960927702063, + "learning_rate": 5.070806299254023e-06, + "loss": 0.0379, + "step": 2862 + }, + { + "epoch": 2.04, + "grad_norm": 7.028780537576393, + "learning_rate": 5.067916427762466e-06, + "loss": 0.0535, + "step": 2863 + }, + { + "epoch": 2.04, + "grad_norm": 22.0853084053586, + "learning_rate": 5.0650265335787685e-06, + "loss": 0.0513, + "step": 2864 + }, + { + "epoch": 2.04, + "grad_norm": 18.82402778771914, + "learning_rate": 5.062136617668497e-06, + "loss": 0.0328, + "step": 2865 + }, + { + "epoch": 2.05, + "grad_norm": 9.35572729496596, + "learning_rate": 5.059246680997228e-06, + "loss": 0.0499, + "step": 2866 + }, + { + "epoch": 2.05, + "grad_norm": 15.494949337550196, + "learning_rate": 5.05635672453054e-06, + "loss": 0.0523, + "step": 2867 + }, + { + "epoch": 2.05, + "grad_norm": 17.512400175601442, + "learning_rate": 5.053466749234023e-06, + "loss": 0.0384, + "step": 2868 + }, + { + "epoch": 2.05, + "grad_norm": 40.02758074357572, + "learning_rate": 5.050576756073272e-06, + "loss": 0.0751, + "step": 2869 + }, + { + "epoch": 2.05, + "grad_norm": 18.427268569788204, + "learning_rate": 5.047686746013888e-06, + "loss": 0.0413, + "step": 2870 + }, + { + "epoch": 2.05, + "grad_norm": 5.541127274636648, + "learning_rate": 5.044796720021474e-06, + "loss": 0.0522, + "step": 2871 + }, + { + "epoch": 2.05, + "grad_norm": 7.965720801969884, + "learning_rate": 5.041906679061643e-06, + "loss": 0.0359, + "step": 2872 + }, + { + "epoch": 2.05, + "grad_norm": 19.01943256008662, + "learning_rate": 5.039016624100013e-06, + "loss": 0.0659, + "step": 2873 + }, + { + "epoch": 2.05, + "grad_norm": 27.05389706378414, + "learning_rate": 5.036126556102202e-06, + "loss": 0.063, + "step": 2874 + }, + { + "epoch": 2.05, + "grad_norm": 5.472693277113262, + "learning_rate": 5.033236476033838e-06, + "loss": 0.042, + "step": 2875 + }, + { + "epoch": 2.05, + "grad_norm": 5.360766469879016, + "learning_rate": 5.0303463848605495e-06, + "loss": 0.0457, + "step": 2876 + }, + { + "epoch": 2.05, + "grad_norm": 16.162091546785724, + "learning_rate": 5.027456283547969e-06, + "loss": 0.0628, + "step": 2877 + }, + { + "epoch": 2.05, + "grad_norm": 21.94294781666631, + "learning_rate": 5.0245661730617344e-06, + "loss": 0.0573, + "step": 2878 + }, + { + "epoch": 2.05, + "grad_norm": 12.700207444520526, + "learning_rate": 5.0216760543674855e-06, + "loss": 0.0488, + "step": 2879 + }, + { + "epoch": 2.06, + "grad_norm": 10.814485131817554, + "learning_rate": 5.0187859284308635e-06, + "loss": 0.059, + "step": 2880 + }, + { + "epoch": 2.06, + "grad_norm": 19.09796261660822, + "learning_rate": 5.015895796217514e-06, + "loss": 0.056, + "step": 2881 + }, + { + "epoch": 2.06, + "grad_norm": 21.58846874899979, + "learning_rate": 5.013005658693083e-06, + "loss": 0.0521, + "step": 2882 + }, + { + "epoch": 2.06, + "grad_norm": 16.698159335405045, + "learning_rate": 5.01011551682322e-06, + "loss": 0.0566, + "step": 2883 + }, + { + "epoch": 2.06, + "grad_norm": 14.016096004860056, + "learning_rate": 5.007225371573573e-06, + "loss": 0.0602, + "step": 2884 + }, + { + "epoch": 2.06, + "grad_norm": 4.06089635411483, + "learning_rate": 5.004335223909797e-06, + "loss": 0.0458, + "step": 2885 + }, + { + "epoch": 2.06, + "grad_norm": 13.265950504045813, + "learning_rate": 5.0014450747975416e-06, + "loss": 0.0705, + "step": 2886 + }, + { + "epoch": 2.06, + "grad_norm": 19.71288313562799, + "learning_rate": 4.998554925202459e-06, + "loss": 0.0456, + "step": 2887 + }, + { + "epoch": 2.06, + "grad_norm": 18.997859716152803, + "learning_rate": 4.995664776090204e-06, + "loss": 0.0703, + "step": 2888 + }, + { + "epoch": 2.06, + "grad_norm": 12.003644856154164, + "learning_rate": 4.9927746284264275e-06, + "loss": 0.0547, + "step": 2889 + }, + { + "epoch": 2.06, + "grad_norm": 5.8612356586077965, + "learning_rate": 4.9898844831767826e-06, + "loss": 0.0421, + "step": 2890 + }, + { + "epoch": 2.06, + "grad_norm": 3.857541041219327, + "learning_rate": 4.98699434130692e-06, + "loss": 0.0409, + "step": 2891 + }, + { + "epoch": 2.06, + "grad_norm": 27.220331399521893, + "learning_rate": 4.984104203782488e-06, + "loss": 0.0518, + "step": 2892 + }, + { + "epoch": 2.06, + "grad_norm": 17.76518467751359, + "learning_rate": 4.981214071569139e-06, + "loss": 0.0568, + "step": 2893 + }, + { + "epoch": 2.07, + "grad_norm": 6.706506867503759, + "learning_rate": 4.978323945632515e-06, + "loss": 0.0346, + "step": 2894 + }, + { + "epoch": 2.07, + "grad_norm": 6.00943424052001, + "learning_rate": 4.975433826938267e-06, + "loss": 0.049, + "step": 2895 + }, + { + "epoch": 2.07, + "grad_norm": 6.204979629155921, + "learning_rate": 4.972543716452031e-06, + "loss": 0.0735, + "step": 2896 + }, + { + "epoch": 2.07, + "grad_norm": 10.632788181873034, + "learning_rate": 4.969653615139452e-06, + "loss": 0.0454, + "step": 2897 + }, + { + "epoch": 2.07, + "grad_norm": 5.839103370356773, + "learning_rate": 4.966763523966163e-06, + "loss": 0.0974, + "step": 2898 + }, + { + "epoch": 2.07, + "grad_norm": 10.849055825875318, + "learning_rate": 4.963873443897799e-06, + "loss": 0.0521, + "step": 2899 + }, + { + "epoch": 2.07, + "grad_norm": 7.708000075587648, + "learning_rate": 4.96098337589999e-06, + "loss": 0.0355, + "step": 2900 + }, + { + "epoch": 2.07, + "grad_norm": 5.723507585122161, + "learning_rate": 4.958093320938358e-06, + "loss": 0.0539, + "step": 2901 + }, + { + "epoch": 2.07, + "grad_norm": 8.158944443681586, + "learning_rate": 4.955203279978529e-06, + "loss": 0.0671, + "step": 2902 + }, + { + "epoch": 2.07, + "grad_norm": 9.759906642189957, + "learning_rate": 4.952313253986114e-06, + "loss": 0.0531, + "step": 2903 + }, + { + "epoch": 2.07, + "grad_norm": 6.11064821441867, + "learning_rate": 4.9494232439267296e-06, + "loss": 0.0496, + "step": 2904 + }, + { + "epoch": 2.07, + "grad_norm": 15.84805318936541, + "learning_rate": 4.946533250765977e-06, + "loss": 0.0455, + "step": 2905 + }, + { + "epoch": 2.07, + "grad_norm": 12.764198896161808, + "learning_rate": 4.943643275469461e-06, + "loss": 0.038, + "step": 2906 + }, + { + "epoch": 2.07, + "grad_norm": 8.974850741601227, + "learning_rate": 4.940753319002773e-06, + "loss": 0.053, + "step": 2907 + }, + { + "epoch": 2.08, + "grad_norm": 13.75551037130389, + "learning_rate": 4.937863382331504e-06, + "loss": 0.0544, + "step": 2908 + }, + { + "epoch": 2.08, + "grad_norm": 16.32267799964506, + "learning_rate": 4.934973466421234e-06, + "loss": 0.062, + "step": 2909 + }, + { + "epoch": 2.08, + "grad_norm": 10.103817381967463, + "learning_rate": 4.932083572237535e-06, + "loss": 0.0406, + "step": 2910 + }, + { + "epoch": 2.08, + "grad_norm": 6.213672979461658, + "learning_rate": 4.92919370074598e-06, + "loss": 0.0564, + "step": 2911 + }, + { + "epoch": 2.08, + "grad_norm": 9.834401409500478, + "learning_rate": 4.926303852912123e-06, + "loss": 0.0533, + "step": 2912 + }, + { + "epoch": 2.08, + "grad_norm": 6.141094774831048, + "learning_rate": 4.9234140297015204e-06, + "loss": 0.0361, + "step": 2913 + }, + { + "epoch": 2.08, + "grad_norm": 17.040575294136367, + "learning_rate": 4.920524232079712e-06, + "loss": 0.06, + "step": 2914 + }, + { + "epoch": 2.08, + "grad_norm": 18.436793853047202, + "learning_rate": 4.917634461012238e-06, + "loss": 0.0632, + "step": 2915 + }, + { + "epoch": 2.08, + "grad_norm": 11.958023047209824, + "learning_rate": 4.914744717464617e-06, + "loss": 0.0414, + "step": 2916 + }, + { + "epoch": 2.08, + "grad_norm": 6.374684872099406, + "learning_rate": 4.911855002402375e-06, + "loss": 0.0558, + "step": 2917 + }, + { + "epoch": 2.08, + "grad_norm": 8.56698629233553, + "learning_rate": 4.908965316791014e-06, + "loss": 0.0278, + "step": 2918 + }, + { + "epoch": 2.08, + "grad_norm": 8.288813639711439, + "learning_rate": 4.906075661596031e-06, + "loss": 0.0467, + "step": 2919 + }, + { + "epoch": 2.08, + "grad_norm": 25.748623981042726, + "learning_rate": 4.903186037782917e-06, + "loss": 0.0627, + "step": 2920 + }, + { + "epoch": 2.08, + "grad_norm": 4.1547283981098655, + "learning_rate": 4.900296446317146e-06, + "loss": 0.0426, + "step": 2921 + }, + { + "epoch": 2.09, + "grad_norm": 11.060180181270248, + "learning_rate": 4.897406888164187e-06, + "loss": 0.0411, + "step": 2922 + }, + { + "epoch": 2.09, + "grad_norm": 9.894982430864497, + "learning_rate": 4.8945173642894915e-06, + "loss": 0.0428, + "step": 2923 + }, + { + "epoch": 2.09, + "grad_norm": 10.318786752951437, + "learning_rate": 4.8916278756585074e-06, + "loss": 0.038, + "step": 2924 + }, + { + "epoch": 2.09, + "grad_norm": 23.226668767699405, + "learning_rate": 4.888738423236664e-06, + "loss": 0.0451, + "step": 2925 + }, + { + "epoch": 2.09, + "grad_norm": 7.663805162430875, + "learning_rate": 4.88584900798938e-06, + "loss": 0.0311, + "step": 2926 + }, + { + "epoch": 2.09, + "grad_norm": 11.873525027460229, + "learning_rate": 4.882959630882066e-06, + "loss": 0.0649, + "step": 2927 + }, + { + "epoch": 2.09, + "grad_norm": 21.51580574874802, + "learning_rate": 4.8800702928801124e-06, + "loss": 0.0541, + "step": 2928 + }, + { + "epoch": 2.09, + "grad_norm": 9.563175048514058, + "learning_rate": 4.8771809949489056e-06, + "loss": 0.0491, + "step": 2929 + }, + { + "epoch": 2.09, + "grad_norm": 9.177795863776534, + "learning_rate": 4.874291738053809e-06, + "loss": 0.058, + "step": 2930 + }, + { + "epoch": 2.09, + "grad_norm": 14.671314431344912, + "learning_rate": 4.871402523160181e-06, + "loss": 0.0529, + "step": 2931 + }, + { + "epoch": 2.09, + "grad_norm": 8.21054929459398, + "learning_rate": 4.868513351233359e-06, + "loss": 0.0407, + "step": 2932 + }, + { + "epoch": 2.09, + "grad_norm": 14.943488995962824, + "learning_rate": 4.865624223238672e-06, + "loss": 0.0537, + "step": 2933 + }, + { + "epoch": 2.09, + "grad_norm": 9.285607421042759, + "learning_rate": 4.862735140141428e-06, + "loss": 0.0526, + "step": 2934 + }, + { + "epoch": 2.09, + "grad_norm": 12.545169261685402, + "learning_rate": 4.859846102906927e-06, + "loss": 0.0347, + "step": 2935 + }, + { + "epoch": 2.1, + "grad_norm": 6.418869884194012, + "learning_rate": 4.856957112500446e-06, + "loss": 0.0605, + "step": 2936 + }, + { + "epoch": 2.1, + "grad_norm": 10.91319808557055, + "learning_rate": 4.854068169887254e-06, + "loss": 0.052, + "step": 2937 + }, + { + "epoch": 2.1, + "grad_norm": 7.162112299173172, + "learning_rate": 4.851179276032598e-06, + "loss": 0.0621, + "step": 2938 + }, + { + "epoch": 2.1, + "grad_norm": 17.64808170091903, + "learning_rate": 4.848290431901712e-06, + "loss": 0.0616, + "step": 2939 + }, + { + "epoch": 2.1, + "grad_norm": 5.721241146237817, + "learning_rate": 4.845401638459813e-06, + "loss": 0.0644, + "step": 2940 + }, + { + "epoch": 2.1, + "grad_norm": 5.215058023030627, + "learning_rate": 4.8425128966721e-06, + "loss": 0.0445, + "step": 2941 + }, + { + "epoch": 2.1, + "grad_norm": 8.470561168753651, + "learning_rate": 4.8396242075037555e-06, + "loss": 0.0458, + "step": 2942 + }, + { + "epoch": 2.1, + "grad_norm": 12.193270887378144, + "learning_rate": 4.836735571919946e-06, + "loss": 0.04, + "step": 2943 + }, + { + "epoch": 2.1, + "grad_norm": 6.51982708722715, + "learning_rate": 4.833846990885813e-06, + "loss": 0.0451, + "step": 2944 + }, + { + "epoch": 2.1, + "grad_norm": 14.275220224337577, + "learning_rate": 4.830958465366492e-06, + "loss": 0.06, + "step": 2945 + }, + { + "epoch": 2.1, + "grad_norm": 14.485618936801512, + "learning_rate": 4.828069996327088e-06, + "loss": 0.0452, + "step": 2946 + }, + { + "epoch": 2.1, + "grad_norm": 27.79698221697965, + "learning_rate": 4.825181584732695e-06, + "loss": 0.0803, + "step": 2947 + }, + { + "epoch": 2.1, + "grad_norm": 7.760801102621836, + "learning_rate": 4.822293231548382e-06, + "loss": 0.051, + "step": 2948 + }, + { + "epoch": 2.1, + "grad_norm": 18.186535510480933, + "learning_rate": 4.819404937739205e-06, + "loss": 0.0798, + "step": 2949 + }, + { + "epoch": 2.11, + "grad_norm": 13.657062142670078, + "learning_rate": 4.816516704270194e-06, + "loss": 0.0419, + "step": 2950 + }, + { + "epoch": 2.11, + "grad_norm": 17.763665719452103, + "learning_rate": 4.813628532106363e-06, + "loss": 0.041, + "step": 2951 + }, + { + "epoch": 2.11, + "grad_norm": 17.213024245970015, + "learning_rate": 4.810740422212705e-06, + "loss": 0.0514, + "step": 2952 + }, + { + "epoch": 2.11, + "grad_norm": 4.415365069317704, + "learning_rate": 4.807852375554188e-06, + "loss": 0.0439, + "step": 2953 + }, + { + "epoch": 2.11, + "grad_norm": 4.156693282692543, + "learning_rate": 4.804964393095765e-06, + "loss": 0.0477, + "step": 2954 + }, + { + "epoch": 2.11, + "grad_norm": 11.394625354113346, + "learning_rate": 4.802076475802362e-06, + "loss": 0.0494, + "step": 2955 + }, + { + "epoch": 2.11, + "grad_norm": 3.3842660382835024, + "learning_rate": 4.799188624638889e-06, + "loss": 0.0276, + "step": 2956 + }, + { + "epoch": 2.11, + "grad_norm": 4.446181745984596, + "learning_rate": 4.796300840570227e-06, + "loss": 0.0522, + "step": 2957 + }, + { + "epoch": 2.11, + "grad_norm": 15.29577470926494, + "learning_rate": 4.793413124561243e-06, + "loss": 0.0464, + "step": 2958 + }, + { + "epoch": 2.11, + "grad_norm": 8.414125335615022, + "learning_rate": 4.790525477576773e-06, + "loss": 0.0498, + "step": 2959 + }, + { + "epoch": 2.11, + "grad_norm": 4.031180542596354, + "learning_rate": 4.7876379005816325e-06, + "loss": 0.032, + "step": 2960 + }, + { + "epoch": 2.11, + "grad_norm": 3.6969083162862804, + "learning_rate": 4.784750394540619e-06, + "loss": 0.0463, + "step": 2961 + }, + { + "epoch": 2.11, + "grad_norm": 13.339709877305827, + "learning_rate": 4.781862960418498e-06, + "loss": 0.0376, + "step": 2962 + }, + { + "epoch": 2.11, + "grad_norm": 18.500857965632857, + "learning_rate": 4.778975599180019e-06, + "loss": 0.0535, + "step": 2963 + }, + { + "epoch": 2.12, + "grad_norm": 5.964132136696635, + "learning_rate": 4.776088311789897e-06, + "loss": 0.0497, + "step": 2964 + }, + { + "epoch": 2.12, + "grad_norm": 4.411962079455377, + "learning_rate": 4.773201099212835e-06, + "loss": 0.0402, + "step": 2965 + }, + { + "epoch": 2.12, + "grad_norm": 8.713256952124864, + "learning_rate": 4.770313962413499e-06, + "loss": 0.0499, + "step": 2966 + }, + { + "epoch": 2.12, + "grad_norm": 14.98799544031958, + "learning_rate": 4.767426902356539e-06, + "loss": 0.0379, + "step": 2967 + }, + { + "epoch": 2.12, + "grad_norm": 6.230504284482281, + "learning_rate": 4.7645399200065745e-06, + "loss": 0.0564, + "step": 2968 + }, + { + "epoch": 2.12, + "grad_norm": 7.904911066689615, + "learning_rate": 4.761653016328197e-06, + "loss": 0.0477, + "step": 2969 + }, + { + "epoch": 2.12, + "grad_norm": 15.071860947840747, + "learning_rate": 4.758766192285979e-06, + "loss": 0.0497, + "step": 2970 + }, + { + "epoch": 2.12, + "grad_norm": 21.895501390170683, + "learning_rate": 4.755879448844458e-06, + "loss": 0.0709, + "step": 2971 + }, + { + "epoch": 2.12, + "grad_norm": 13.52662158776065, + "learning_rate": 4.752992786968153e-06, + "loss": 0.0518, + "step": 2972 + }, + { + "epoch": 2.12, + "grad_norm": 18.273989464770818, + "learning_rate": 4.750106207621546e-06, + "loss": 0.0556, + "step": 2973 + }, + { + "epoch": 2.12, + "grad_norm": 12.57628275065163, + "learning_rate": 4.747219711769103e-06, + "loss": 0.0445, + "step": 2974 + }, + { + "epoch": 2.12, + "grad_norm": 8.238153462757012, + "learning_rate": 4.74433330037525e-06, + "loss": 0.0358, + "step": 2975 + }, + { + "epoch": 2.12, + "grad_norm": 5.815107502108913, + "learning_rate": 4.741446974404396e-06, + "loss": 0.0553, + "step": 2976 + }, + { + "epoch": 2.12, + "grad_norm": 15.551938145304423, + "learning_rate": 4.738560734820914e-06, + "loss": 0.0397, + "step": 2977 + }, + { + "epoch": 2.13, + "grad_norm": 11.898776772774422, + "learning_rate": 4.735674582589147e-06, + "loss": 0.0439, + "step": 2978 + }, + { + "epoch": 2.13, + "grad_norm": 10.619558463221429, + "learning_rate": 4.732788518673418e-06, + "loss": 0.0492, + "step": 2979 + }, + { + "epoch": 2.13, + "grad_norm": 9.889398543545939, + "learning_rate": 4.729902544038009e-06, + "loss": 0.0441, + "step": 2980 + }, + { + "epoch": 2.13, + "grad_norm": 5.05100282019345, + "learning_rate": 4.7270166596471825e-06, + "loss": 0.0447, + "step": 2981 + }, + { + "epoch": 2.13, + "grad_norm": 17.41976116286379, + "learning_rate": 4.724130866465163e-06, + "loss": 0.0475, + "step": 2982 + }, + { + "epoch": 2.13, + "grad_norm": 11.266177641235146, + "learning_rate": 4.721245165456149e-06, + "loss": 0.0494, + "step": 2983 + }, + { + "epoch": 2.13, + "grad_norm": 11.20597565365233, + "learning_rate": 4.7183595575843055e-06, + "loss": 0.069, + "step": 2984 + }, + { + "epoch": 2.13, + "grad_norm": 5.259086726470608, + "learning_rate": 4.715474043813771e-06, + "loss": 0.0356, + "step": 2985 + }, + { + "epoch": 2.13, + "grad_norm": 5.080850839428504, + "learning_rate": 4.712588625108645e-06, + "loss": 0.0481, + "step": 2986 + }, + { + "epoch": 2.13, + "grad_norm": 7.956793520224343, + "learning_rate": 4.709703302433003e-06, + "loss": 0.0567, + "step": 2987 + }, + { + "epoch": 2.13, + "grad_norm": 21.843191213401248, + "learning_rate": 4.706818076750883e-06, + "loss": 0.0645, + "step": 2988 + }, + { + "epoch": 2.13, + "grad_norm": 18.694211673739613, + "learning_rate": 4.703932949026291e-06, + "loss": 0.0492, + "step": 2989 + }, + { + "epoch": 2.13, + "grad_norm": 15.151416311688783, + "learning_rate": 4.701047920223207e-06, + "loss": 0.0446, + "step": 2990 + }, + { + "epoch": 2.13, + "grad_norm": 10.890580913948755, + "learning_rate": 4.6981629913055674e-06, + "loss": 0.0569, + "step": 2991 + }, + { + "epoch": 2.14, + "grad_norm": 7.892486306024457, + "learning_rate": 4.695278163237284e-06, + "loss": 0.0312, + "step": 2992 + }, + { + "epoch": 2.14, + "grad_norm": 12.299530495762703, + "learning_rate": 4.692393436982229e-06, + "loss": 0.0308, + "step": 2993 + }, + { + "epoch": 2.14, + "grad_norm": 9.153777049929731, + "learning_rate": 4.689508813504246e-06, + "loss": 0.0351, + "step": 2994 + }, + { + "epoch": 2.14, + "grad_norm": 3.108518173770672, + "learning_rate": 4.686624293767138e-06, + "loss": 0.0349, + "step": 2995 + }, + { + "epoch": 2.14, + "grad_norm": 17.853184369338187, + "learning_rate": 4.683739878734678e-06, + "loss": 0.0641, + "step": 2996 + }, + { + "epoch": 2.14, + "grad_norm": 6.466717945483939, + "learning_rate": 4.6808555693706045e-06, + "loss": 0.0453, + "step": 2997 + }, + { + "epoch": 2.14, + "grad_norm": 4.829172386888312, + "learning_rate": 4.677971366638616e-06, + "loss": 0.0391, + "step": 2998 + }, + { + "epoch": 2.14, + "grad_norm": 6.808172030984788, + "learning_rate": 4.67508727150238e-06, + "loss": 0.0492, + "step": 2999 + }, + { + "epoch": 2.14, + "grad_norm": 6.156095884400888, + "learning_rate": 4.672203284925525e-06, + "loss": 0.0414, + "step": 3000 + }, + { + "epoch": 2.14, + "eval_avg_AUC": 0.8227856069027939, + "eval_avg_Accuracy": 0.7180039787798409, + "eval_avg_Accuracy-right": 0.9098734837615756, + "eval_avg_Accuracy-wrong": 0.3834432567659768, + "eval_avg_Num questions with both labels": 523, + "eval_avg_Question-wise AUC": 0.6838678458638056, + "eval_last_AUC": 0.8253504247874757, + "eval_last_Accuracy": 0.7611903183023873, + "eval_last_Accuracy-right": 0.873288117907917, + "eval_last_Accuracy-wrong": 0.5657266317944053, + "eval_last_Num questions with both labels": 523, + "eval_last_Question-wise AUC": 0.6932099187920158, + "eval_max_AUC": 0.7682526969266754, + "eval_max_Accuracy": 0.6413710212201591, + "eval_max_Accuracy-right": 0.9846745793661145, + "eval_max_Accuracy-wrong": 0.042756424835114853, + "eval_max_Num questions with both labels": 523, + "eval_max_Question-wise AUC": 0.6326595141662732, + "eval_min_AUC": 0.8253761205386874, + "eval_min_Accuracy": 0.7610245358090185, + "eval_min_Accuracy-right": 0.808921351245598, + "eval_min_Accuracy-wrong": 0.6775073914032295, + "eval_min_Num questions with both labels": 523, + "eval_min_Question-wise AUC": 0.6877850984496956, + "eval_prod_AUC": 0.8274092551394246, + "eval_prod_Accuracy": 0.7385195623342176, + "eval_prod_Accuracy-right": 0.690556932307291, + "eval_prod_Accuracy-wrong": 0.8221514669092563, + "eval_prod_Num questions with both labels": 523, + "eval_prod_Question-wise AUC": 0.6858525438813741, + "eval_runtime": 246.8659, + "eval_samples_per_second": 97.737, + "eval_steps_per_second": 3.054, + "eval_sum_AUC": 0.6815349093354525, + "eval_sum_Accuracy": 0.6375994694960212, + "eval_sum_Accuracy-right": 0.9964784139820008, + "eval_sum_Accuracy-wrong": 0.011826245167159426, + "eval_sum_Num questions with both labels": 523, + "eval_sum_Question-wise AUC": 0.6539070349976671, + "step": 3000 + }, + { + "epoch": 2.14, + "grad_norm": 7.733951582678205, + "learning_rate": 4.669319407871647e-06, + "loss": 0.0526, + "step": 3001 + }, + { + "epoch": 2.14, + "grad_norm": 13.960495751854996, + "learning_rate": 4.666435641304301e-06, + "loss": 0.046, + "step": 3002 + }, + { + "epoch": 2.14, + "grad_norm": 8.77619391019296, + "learning_rate": 4.663551986187006e-06, + "loss": 0.0649, + "step": 3003 + }, + { + "epoch": 2.14, + "grad_norm": 6.884626530449051, + "learning_rate": 4.660668443483248e-06, + "loss": 0.0432, + "step": 3004 + }, + { + "epoch": 2.14, + "grad_norm": 5.3549337785724465, + "learning_rate": 4.657785014156468e-06, + "loss": 0.0498, + "step": 3005 + }, + { + "epoch": 2.15, + "grad_norm": 10.946093760776968, + "learning_rate": 4.654901699170077e-06, + "loss": 0.0597, + "step": 3006 + }, + { + "epoch": 2.15, + "grad_norm": 15.763447770059578, + "learning_rate": 4.652018499487442e-06, + "loss": 0.0491, + "step": 3007 + }, + { + "epoch": 2.15, + "grad_norm": 5.809896767903205, + "learning_rate": 4.649135416071896e-06, + "loss": 0.061, + "step": 3008 + }, + { + "epoch": 2.15, + "grad_norm": 15.298958843270297, + "learning_rate": 4.646252449886727e-06, + "loss": 0.0544, + "step": 3009 + }, + { + "epoch": 2.15, + "grad_norm": 10.962570611172566, + "learning_rate": 4.6433696018951915e-06, + "loss": 0.0578, + "step": 3010 + }, + { + "epoch": 2.15, + "grad_norm": 8.441821768445967, + "learning_rate": 4.640486873060501e-06, + "loss": 0.0522, + "step": 3011 + }, + { + "epoch": 2.15, + "grad_norm": 12.827070504932639, + "learning_rate": 4.6376042643458254e-06, + "loss": 0.0376, + "step": 3012 + }, + { + "epoch": 2.15, + "grad_norm": 9.437577892174438, + "learning_rate": 4.634721776714305e-06, + "loss": 0.0455, + "step": 3013 + }, + { + "epoch": 2.15, + "grad_norm": 9.188265760228681, + "learning_rate": 4.631839411129025e-06, + "loss": 0.061, + "step": 3014 + }, + { + "epoch": 2.15, + "grad_norm": 9.140831955733349, + "learning_rate": 4.628957168553044e-06, + "loss": 0.0463, + "step": 3015 + }, + { + "epoch": 2.15, + "grad_norm": 6.621373831056885, + "learning_rate": 4.6260750499493665e-06, + "loss": 0.0577, + "step": 3016 + }, + { + "epoch": 2.15, + "grad_norm": 16.423887046208293, + "learning_rate": 4.623193056280968e-06, + "loss": 0.0422, + "step": 3017 + }, + { + "epoch": 2.15, + "grad_norm": 8.94491278041153, + "learning_rate": 4.6203111885107735e-06, + "loss": 0.0443, + "step": 3018 + }, + { + "epoch": 2.15, + "grad_norm": 10.518796763313492, + "learning_rate": 4.617429447601665e-06, + "loss": 0.0434, + "step": 3019 + }, + { + "epoch": 2.16, + "grad_norm": 5.614949678699768, + "learning_rate": 4.614547834516492e-06, + "loss": 0.0467, + "step": 3020 + }, + { + "epoch": 2.16, + "grad_norm": 6.229969137081133, + "learning_rate": 4.6116663502180495e-06, + "loss": 0.0798, + "step": 3021 + }, + { + "epoch": 2.16, + "grad_norm": 5.066868988185442, + "learning_rate": 4.6087849956691e-06, + "loss": 0.0394, + "step": 3022 + }, + { + "epoch": 2.16, + "grad_norm": 11.331501605212363, + "learning_rate": 4.605903771832353e-06, + "loss": 0.0513, + "step": 3023 + }, + { + "epoch": 2.16, + "grad_norm": 5.839781229557823, + "learning_rate": 4.603022679670482e-06, + "loss": 0.041, + "step": 3024 + }, + { + "epoch": 2.16, + "grad_norm": 14.332881398684489, + "learning_rate": 4.6001417201461114e-06, + "loss": 0.0539, + "step": 3025 + }, + { + "epoch": 2.16, + "grad_norm": 16.291207017760183, + "learning_rate": 4.597260894221826e-06, + "loss": 0.0546, + "step": 3026 + }, + { + "epoch": 2.16, + "grad_norm": 11.921642342100183, + "learning_rate": 4.594380202860162e-06, + "loss": 0.0542, + "step": 3027 + }, + { + "epoch": 2.16, + "grad_norm": 13.932638931859428, + "learning_rate": 4.5914996470236094e-06, + "loss": 0.0531, + "step": 3028 + }, + { + "epoch": 2.16, + "grad_norm": 6.7911004552180465, + "learning_rate": 4.588619227674619e-06, + "loss": 0.031, + "step": 3029 + }, + { + "epoch": 2.16, + "grad_norm": 19.56600048222998, + "learning_rate": 4.58573894577559e-06, + "loss": 0.0653, + "step": 3030 + }, + { + "epoch": 2.16, + "grad_norm": 3.7997903160804594, + "learning_rate": 4.5828588022888815e-06, + "loss": 0.0432, + "step": 3031 + }, + { + "epoch": 2.16, + "grad_norm": 4.36456708146258, + "learning_rate": 4.5799787981767975e-06, + "loss": 0.0533, + "step": 3032 + }, + { + "epoch": 2.16, + "grad_norm": 12.061053723608204, + "learning_rate": 4.577098934401607e-06, + "loss": 0.0577, + "step": 3033 + }, + { + "epoch": 2.17, + "grad_norm": 5.047403317404933, + "learning_rate": 4.57421921192552e-06, + "loss": 0.043, + "step": 3034 + }, + { + "epoch": 2.17, + "grad_norm": 6.587631210630952, + "learning_rate": 4.5713396317107115e-06, + "loss": 0.0443, + "step": 3035 + }, + { + "epoch": 2.17, + "grad_norm": 5.787472009592416, + "learning_rate": 4.568460194719299e-06, + "loss": 0.0341, + "step": 3036 + }, + { + "epoch": 2.17, + "grad_norm": 6.7437175989326965, + "learning_rate": 4.565580901913356e-06, + "loss": 0.0533, + "step": 3037 + }, + { + "epoch": 2.17, + "grad_norm": 4.747331536456431, + "learning_rate": 4.562701754254909e-06, + "loss": 0.0473, + "step": 3038 + }, + { + "epoch": 2.17, + "grad_norm": 5.768433537381566, + "learning_rate": 4.559822752705933e-06, + "loss": 0.0473, + "step": 3039 + }, + { + "epoch": 2.17, + "grad_norm": 3.3083313751743146, + "learning_rate": 4.556943898228358e-06, + "loss": 0.0335, + "step": 3040 + }, + { + "epoch": 2.17, + "grad_norm": 14.995712674161572, + "learning_rate": 4.55406519178406e-06, + "loss": 0.0591, + "step": 3041 + }, + { + "epoch": 2.17, + "grad_norm": 12.571671708096863, + "learning_rate": 4.551186634334873e-06, + "loss": 0.0516, + "step": 3042 + }, + { + "epoch": 2.17, + "grad_norm": 3.8361711880156655, + "learning_rate": 4.54830822684257e-06, + "loss": 0.0463, + "step": 3043 + }, + { + "epoch": 2.17, + "grad_norm": 6.242235542738953, + "learning_rate": 4.545429970268888e-06, + "loss": 0.0573, + "step": 3044 + }, + { + "epoch": 2.17, + "grad_norm": 2.9572838572416, + "learning_rate": 4.542551865575499e-06, + "loss": 0.0381, + "step": 3045 + }, + { + "epoch": 2.17, + "grad_norm": 7.465321058449582, + "learning_rate": 4.539673913724037e-06, + "loss": 0.0504, + "step": 3046 + }, + { + "epoch": 2.17, + "grad_norm": 13.582177654022352, + "learning_rate": 4.5367961156760745e-06, + "loss": 0.0642, + "step": 3047 + }, + { + "epoch": 2.18, + "grad_norm": 13.106849831794028, + "learning_rate": 4.533918472393141e-06, + "loss": 0.0544, + "step": 3048 + }, + { + "epoch": 2.18, + "grad_norm": 8.258854894812476, + "learning_rate": 4.531040984836708e-06, + "loss": 0.0398, + "step": 3049 + }, + { + "epoch": 2.18, + "grad_norm": 6.1961201449818875, + "learning_rate": 4.5281636539682e-06, + "loss": 0.0563, + "step": 3050 + }, + { + "epoch": 2.18, + "grad_norm": 14.114915646585992, + "learning_rate": 4.5252864807489836e-06, + "loss": 0.0525, + "step": 3051 + }, + { + "epoch": 2.18, + "grad_norm": 3.184899979478378, + "learning_rate": 4.522409466140379e-06, + "loss": 0.0417, + "step": 3052 + }, + { + "epoch": 2.18, + "grad_norm": 15.445103578110677, + "learning_rate": 4.5195326111036475e-06, + "loss": 0.0515, + "step": 3053 + }, + { + "epoch": 2.18, + "grad_norm": 5.519785034152788, + "learning_rate": 4.5166559166000035e-06, + "loss": 0.0558, + "step": 3054 + }, + { + "epoch": 2.18, + "grad_norm": 11.233134348940904, + "learning_rate": 4.513779383590599e-06, + "loss": 0.0532, + "step": 3055 + }, + { + "epoch": 2.18, + "grad_norm": 6.353767355142566, + "learning_rate": 4.510903013036542e-06, + "loss": 0.0411, + "step": 3056 + }, + { + "epoch": 2.18, + "grad_norm": 5.602659776333459, + "learning_rate": 4.508026805898878e-06, + "loss": 0.0474, + "step": 3057 + }, + { + "epoch": 2.18, + "grad_norm": 3.55271578684028, + "learning_rate": 4.505150763138604e-06, + "loss": 0.0438, + "step": 3058 + }, + { + "epoch": 2.18, + "grad_norm": 5.4020523865517065, + "learning_rate": 4.502274885716656e-06, + "loss": 0.0441, + "step": 3059 + }, + { + "epoch": 2.18, + "grad_norm": 4.974542469049749, + "learning_rate": 4.499399174593923e-06, + "loss": 0.038, + "step": 3060 + }, + { + "epoch": 2.18, + "grad_norm": 13.183234894928061, + "learning_rate": 4.496523630731229e-06, + "loss": 0.0536, + "step": 3061 + }, + { + "epoch": 2.19, + "grad_norm": 7.012776400774284, + "learning_rate": 4.493648255089347e-06, + "loss": 0.053, + "step": 3062 + }, + { + "epoch": 2.19, + "grad_norm": 5.113694599229363, + "learning_rate": 4.490773048628997e-06, + "loss": 0.0441, + "step": 3063 + }, + { + "epoch": 2.19, + "grad_norm": 6.999872520790414, + "learning_rate": 4.487898012310834e-06, + "loss": 0.0551, + "step": 3064 + }, + { + "epoch": 2.19, + "grad_norm": 8.988865058697366, + "learning_rate": 4.485023147095466e-06, + "loss": 0.0759, + "step": 3065 + }, + { + "epoch": 2.19, + "grad_norm": 8.608399626100027, + "learning_rate": 4.482148453943434e-06, + "loss": 0.0555, + "step": 3066 + }, + { + "epoch": 2.19, + "grad_norm": 5.657772658940335, + "learning_rate": 4.479273933815232e-06, + "loss": 0.0548, + "step": 3067 + }, + { + "epoch": 2.19, + "grad_norm": 5.243732810688884, + "learning_rate": 4.476399587671285e-06, + "loss": 0.0759, + "step": 3068 + }, + { + "epoch": 2.19, + "grad_norm": 17.83601854064182, + "learning_rate": 4.47352541647197e-06, + "loss": 0.0443, + "step": 3069 + }, + { + "epoch": 2.19, + "grad_norm": 8.347841250675605, + "learning_rate": 4.470651421177599e-06, + "loss": 0.0452, + "step": 3070 + }, + { + "epoch": 2.19, + "grad_norm": 7.0436872734018765, + "learning_rate": 4.467777602748425e-06, + "loss": 0.0431, + "step": 3071 + }, + { + "epoch": 2.19, + "grad_norm": 24.860758320056135, + "learning_rate": 4.4649039621446495e-06, + "loss": 0.0797, + "step": 3072 + }, + { + "epoch": 2.19, + "grad_norm": 11.719607525022566, + "learning_rate": 4.462030500326403e-06, + "loss": 0.0443, + "step": 3073 + }, + { + "epoch": 2.19, + "grad_norm": 11.724827346428865, + "learning_rate": 4.459157218253769e-06, + "loss": 0.0717, + "step": 3074 + }, + { + "epoch": 2.19, + "grad_norm": 13.934654419830443, + "learning_rate": 4.456284116886758e-06, + "loss": 0.0496, + "step": 3075 + }, + { + "epoch": 2.2, + "grad_norm": 7.660664913105194, + "learning_rate": 4.453411197185334e-06, + "loss": 0.0442, + "step": 3076 + }, + { + "epoch": 2.2, + "grad_norm": 24.60532296154118, + "learning_rate": 4.450538460109384e-06, + "loss": 0.0543, + "step": 3077 + }, + { + "epoch": 2.2, + "grad_norm": 9.440853316535325, + "learning_rate": 4.447665906618751e-06, + "loss": 0.037, + "step": 3078 + }, + { + "epoch": 2.2, + "grad_norm": 2.5094571457438564, + "learning_rate": 4.444793537673204e-06, + "loss": 0.0406, + "step": 3079 + }, + { + "epoch": 2.2, + "grad_norm": 14.051552491510163, + "learning_rate": 4.441921354232455e-06, + "loss": 0.0598, + "step": 3080 + }, + { + "epoch": 2.2, + "grad_norm": 4.601130148270938, + "learning_rate": 4.439049357256156e-06, + "loss": 0.0391, + "step": 3081 + }, + { + "epoch": 2.2, + "grad_norm": 16.768683231807135, + "learning_rate": 4.436177547703891e-06, + "loss": 0.0461, + "step": 3082 + }, + { + "epoch": 2.2, + "grad_norm": 14.287404647048588, + "learning_rate": 4.433305926535189e-06, + "loss": 0.0694, + "step": 3083 + }, + { + "epoch": 2.2, + "grad_norm": 10.160614532165187, + "learning_rate": 4.430434494709509e-06, + "loss": 0.0536, + "step": 3084 + }, + { + "epoch": 2.2, + "grad_norm": 15.080982562292157, + "learning_rate": 4.427563253186253e-06, + "loss": 0.0343, + "step": 3085 + }, + { + "epoch": 2.2, + "grad_norm": 5.675753477001895, + "learning_rate": 4.424692202924754e-06, + "loss": 0.0356, + "step": 3086 + }, + { + "epoch": 2.2, + "grad_norm": 5.716909777073017, + "learning_rate": 4.421821344884281e-06, + "loss": 0.0341, + "step": 3087 + }, + { + "epoch": 2.2, + "grad_norm": 4.431358994048849, + "learning_rate": 4.418950680024046e-06, + "loss": 0.0457, + "step": 3088 + }, + { + "epoch": 2.2, + "grad_norm": 6.596427500698859, + "learning_rate": 4.416080209303187e-06, + "loss": 0.0395, + "step": 3089 + }, + { + "epoch": 2.21, + "grad_norm": 7.187035201109175, + "learning_rate": 4.413209933680786e-06, + "loss": 0.0556, + "step": 3090 + }, + { + "epoch": 2.21, + "grad_norm": 15.759502614264681, + "learning_rate": 4.410339854115849e-06, + "loss": 0.0658, + "step": 3091 + }, + { + "epoch": 2.21, + "grad_norm": 8.965449381966314, + "learning_rate": 4.407469971567331e-06, + "loss": 0.0542, + "step": 3092 + }, + { + "epoch": 2.21, + "grad_norm": 13.279415208911773, + "learning_rate": 4.4046002869941055e-06, + "loss": 0.0518, + "step": 3093 + }, + { + "epoch": 2.21, + "grad_norm": 5.9208827638453485, + "learning_rate": 4.401730801354994e-06, + "loss": 0.0453, + "step": 3094 + }, + { + "epoch": 2.21, + "grad_norm": 4.575198553252729, + "learning_rate": 4.39886151560874e-06, + "loss": 0.0413, + "step": 3095 + }, + { + "epoch": 2.21, + "grad_norm": 3.725306026650709, + "learning_rate": 4.395992430714028e-06, + "loss": 0.0352, + "step": 3096 + }, + { + "epoch": 2.21, + "grad_norm": 10.572986077833214, + "learning_rate": 4.393123547629472e-06, + "loss": 0.0573, + "step": 3097 + }, + { + "epoch": 2.21, + "grad_norm": 4.788737593661607, + "learning_rate": 4.390254867313619e-06, + "loss": 0.049, + "step": 3098 + }, + { + "epoch": 2.21, + "grad_norm": 14.988650115590175, + "learning_rate": 4.387386390724947e-06, + "loss": 0.041, + "step": 3099 + }, + { + "epoch": 2.21, + "grad_norm": 15.407848641378886, + "learning_rate": 4.38451811882187e-06, + "loss": 0.0631, + "step": 3100 + }, + { + "epoch": 2.21, + "grad_norm": 10.332737742350625, + "learning_rate": 4.3816500525627284e-06, + "loss": 0.0542, + "step": 3101 + }, + { + "epoch": 2.21, + "grad_norm": 3.3046432737592766, + "learning_rate": 4.3787821929057985e-06, + "loss": 0.0524, + "step": 3102 + }, + { + "epoch": 2.21, + "grad_norm": 21.62698520171683, + "learning_rate": 4.3759145408092855e-06, + "loss": 0.0541, + "step": 3103 + }, + { + "epoch": 2.22, + "grad_norm": 6.318176228409699, + "learning_rate": 4.373047097231324e-06, + "loss": 0.047, + "step": 3104 + }, + { + "epoch": 2.22, + "grad_norm": 5.161442887353655, + "learning_rate": 4.370179863129979e-06, + "loss": 0.0438, + "step": 3105 + }, + { + "epoch": 2.22, + "grad_norm": 7.9377251511338205, + "learning_rate": 4.367312839463251e-06, + "loss": 0.0496, + "step": 3106 + }, + { + "epoch": 2.22, + "grad_norm": 5.3131144070684595, + "learning_rate": 4.3644460271890614e-06, + "loss": 0.0415, + "step": 3107 + }, + { + "epoch": 2.22, + "grad_norm": 8.363837704096479, + "learning_rate": 4.361579427265268e-06, + "loss": 0.0644, + "step": 3108 + }, + { + "epoch": 2.22, + "grad_norm": 8.313467422358402, + "learning_rate": 4.358713040649654e-06, + "loss": 0.0499, + "step": 3109 + }, + { + "epoch": 2.22, + "grad_norm": 3.1434368387419873, + "learning_rate": 4.3558468682999336e-06, + "loss": 0.0358, + "step": 3110 + }, + { + "epoch": 2.22, + "grad_norm": 5.111253029616996, + "learning_rate": 4.352980911173747e-06, + "loss": 0.0492, + "step": 3111 + }, + { + "epoch": 2.22, + "grad_norm": 8.357180537855855, + "learning_rate": 4.350115170228664e-06, + "loss": 0.0505, + "step": 3112 + }, + { + "epoch": 2.22, + "grad_norm": 7.2716322758839755, + "learning_rate": 4.3472496464221845e-06, + "loss": 0.0474, + "step": 3113 + }, + { + "epoch": 2.22, + "grad_norm": 7.2404915807490315, + "learning_rate": 4.344384340711728e-06, + "loss": 0.0434, + "step": 3114 + }, + { + "epoch": 2.22, + "grad_norm": 5.148940982244169, + "learning_rate": 4.341519254054651e-06, + "loss": 0.0416, + "step": 3115 + }, + { + "epoch": 2.22, + "grad_norm": 5.229546559770345, + "learning_rate": 4.338654387408229e-06, + "loss": 0.0396, + "step": 3116 + }, + { + "epoch": 2.22, + "grad_norm": 20.48430154530679, + "learning_rate": 4.335789741729671e-06, + "loss": 0.0515, + "step": 3117 + }, + { + "epoch": 2.23, + "grad_norm": 10.359553267941436, + "learning_rate": 4.332925317976104e-06, + "loss": 0.0451, + "step": 3118 + }, + { + "epoch": 2.23, + "grad_norm": 12.04521250763658, + "learning_rate": 4.330061117104589e-06, + "loss": 0.0395, + "step": 3119 + }, + { + "epoch": 2.23, + "grad_norm": 9.16643051448259, + "learning_rate": 4.327197140072108e-06, + "loss": 0.0539, + "step": 3120 + }, + { + "epoch": 2.23, + "grad_norm": 11.188804325737475, + "learning_rate": 4.324333387835565e-06, + "loss": 0.048, + "step": 3121 + }, + { + "epoch": 2.23, + "grad_norm": 10.924204030628253, + "learning_rate": 4.321469861351799e-06, + "loss": 0.0456, + "step": 3122 + }, + { + "epoch": 2.23, + "grad_norm": 12.396543885172143, + "learning_rate": 4.318606561577562e-06, + "loss": 0.0711, + "step": 3123 + }, + { + "epoch": 2.23, + "grad_norm": 7.358079264086069, + "learning_rate": 4.31574348946954e-06, + "loss": 0.0432, + "step": 3124 + }, + { + "epoch": 2.23, + "grad_norm": 6.63910667690175, + "learning_rate": 4.312880645984334e-06, + "loss": 0.0473, + "step": 3125 + }, + { + "epoch": 2.23, + "grad_norm": 6.926846866544162, + "learning_rate": 4.310018032078479e-06, + "loss": 0.0427, + "step": 3126 + }, + { + "epoch": 2.23, + "grad_norm": 5.348927273635156, + "learning_rate": 4.307155648708421e-06, + "loss": 0.0398, + "step": 3127 + }, + { + "epoch": 2.23, + "grad_norm": 8.293095802027796, + "learning_rate": 4.304293496830542e-06, + "loss": 0.0552, + "step": 3128 + }, + { + "epoch": 2.23, + "grad_norm": 5.164416626873579, + "learning_rate": 4.301431577401136e-06, + "loss": 0.0558, + "step": 3129 + }, + { + "epoch": 2.23, + "grad_norm": 10.31913313745745, + "learning_rate": 4.298569891376423e-06, + "loss": 0.0557, + "step": 3130 + }, + { + "epoch": 2.23, + "grad_norm": 11.861633104275379, + "learning_rate": 4.2957084397125496e-06, + "loss": 0.0438, + "step": 3131 + }, + { + "epoch": 2.24, + "grad_norm": 11.33706132619583, + "learning_rate": 4.292847223365574e-06, + "loss": 0.0434, + "step": 3132 + }, + { + "epoch": 2.24, + "grad_norm": 15.917688181318276, + "learning_rate": 4.289986243291488e-06, + "loss": 0.0623, + "step": 3133 + }, + { + "epoch": 2.24, + "grad_norm": 4.856782780631129, + "learning_rate": 4.287125500446193e-06, + "loss": 0.0336, + "step": 3134 + }, + { + "epoch": 2.24, + "grad_norm": 16.31108504711644, + "learning_rate": 4.284264995785521e-06, + "loss": 0.0607, + "step": 3135 + }, + { + "epoch": 2.24, + "grad_norm": 9.762647669216687, + "learning_rate": 4.2814047302652155e-06, + "loss": 0.0361, + "step": 3136 + }, + { + "epoch": 2.24, + "grad_norm": 8.749071414210514, + "learning_rate": 4.278544704840948e-06, + "loss": 0.0448, + "step": 3137 + }, + { + "epoch": 2.24, + "grad_norm": 6.27371687383982, + "learning_rate": 4.275684920468306e-06, + "loss": 0.0473, + "step": 3138 + }, + { + "epoch": 2.24, + "grad_norm": 4.534335338717934, + "learning_rate": 4.272825378102791e-06, + "loss": 0.0475, + "step": 3139 + }, + { + "epoch": 2.24, + "grad_norm": 6.9101311343506655, + "learning_rate": 4.269966078699836e-06, + "loss": 0.0396, + "step": 3140 + }, + { + "epoch": 2.24, + "grad_norm": 3.184600894702316, + "learning_rate": 4.267107023214782e-06, + "loss": 0.0396, + "step": 3141 + }, + { + "epoch": 2.24, + "grad_norm": 10.089000430219308, + "learning_rate": 4.264248212602896e-06, + "loss": 0.0527, + "step": 3142 + }, + { + "epoch": 2.24, + "grad_norm": 6.60955809699945, + "learning_rate": 4.261389647819355e-06, + "loss": 0.0446, + "step": 3143 + }, + { + "epoch": 2.24, + "grad_norm": 13.226538812036294, + "learning_rate": 4.258531329819264e-06, + "loss": 0.0481, + "step": 3144 + }, + { + "epoch": 2.24, + "grad_norm": 10.865699000414827, + "learning_rate": 4.255673259557636e-06, + "loss": 0.0531, + "step": 3145 + }, + { + "epoch": 2.25, + "grad_norm": 14.364449970953885, + "learning_rate": 4.252815437989408e-06, + "loss": 0.055, + "step": 3146 + }, + { + "epoch": 2.25, + "grad_norm": 9.573885380724388, + "learning_rate": 4.24995786606943e-06, + "loss": 0.0589, + "step": 3147 + }, + { + "epoch": 2.25, + "grad_norm": 20.58280975888398, + "learning_rate": 4.24710054475247e-06, + "loss": 0.0551, + "step": 3148 + }, + { + "epoch": 2.25, + "grad_norm": 4.894893628781143, + "learning_rate": 4.244243474993214e-06, + "loss": 0.0312, + "step": 3149 + }, + { + "epoch": 2.25, + "grad_norm": 10.294083577388148, + "learning_rate": 4.241386657746257e-06, + "loss": 0.0512, + "step": 3150 + }, + { + "epoch": 2.25, + "grad_norm": 9.884297976636462, + "learning_rate": 4.2385300939661215e-06, + "loss": 0.0698, + "step": 3151 + }, + { + "epoch": 2.25, + "grad_norm": 20.580492702452048, + "learning_rate": 4.2356737846072326e-06, + "loss": 0.0423, + "step": 3152 + }, + { + "epoch": 2.25, + "grad_norm": 10.3250929805605, + "learning_rate": 4.232817730623941e-06, + "loss": 0.0622, + "step": 3153 + }, + { + "epoch": 2.25, + "grad_norm": 7.020014929183152, + "learning_rate": 4.229961932970505e-06, + "loss": 0.029, + "step": 3154 + }, + { + "epoch": 2.25, + "grad_norm": 7.624072768428424, + "learning_rate": 4.2271063926010995e-06, + "loss": 0.0357, + "step": 3155 + }, + { + "epoch": 2.25, + "grad_norm": 10.226850797566772, + "learning_rate": 4.224251110469814e-06, + "loss": 0.0622, + "step": 3156 + }, + { + "epoch": 2.25, + "grad_norm": 6.615838261687149, + "learning_rate": 4.221396087530652e-06, + "loss": 0.056, + "step": 3157 + }, + { + "epoch": 2.25, + "grad_norm": 8.170230954284976, + "learning_rate": 4.218541324737529e-06, + "loss": 0.0552, + "step": 3158 + }, + { + "epoch": 2.25, + "grad_norm": 5.928036811380873, + "learning_rate": 4.2156868230442756e-06, + "loss": 0.0472, + "step": 3159 + }, + { + "epoch": 2.26, + "grad_norm": 15.829254203511692, + "learning_rate": 4.212832583404632e-06, + "loss": 0.0587, + "step": 3160 + }, + { + "epoch": 2.26, + "grad_norm": 11.618419552979391, + "learning_rate": 4.2099786067722535e-06, + "loss": 0.0403, + "step": 3161 + }, + { + "epoch": 2.26, + "grad_norm": 2.1222508462469003, + "learning_rate": 4.207124894100707e-06, + "loss": 0.0364, + "step": 3162 + }, + { + "epoch": 2.26, + "grad_norm": 4.318748307419997, + "learning_rate": 4.2042714463434715e-06, + "loss": 0.0391, + "step": 3163 + }, + { + "epoch": 2.26, + "grad_norm": 9.36228620240982, + "learning_rate": 4.201418264453935e-06, + "loss": 0.0319, + "step": 3164 + }, + { + "epoch": 2.26, + "grad_norm": 3.3778091133940125, + "learning_rate": 4.198565349385402e-06, + "loss": 0.0536, + "step": 3165 + }, + { + "epoch": 2.26, + "grad_norm": 17.14676052388145, + "learning_rate": 4.195712702091079e-06, + "loss": 0.0535, + "step": 3166 + }, + { + "epoch": 2.26, + "grad_norm": 5.024792221872647, + "learning_rate": 4.192860323524094e-06, + "loss": 0.0481, + "step": 3167 + }, + { + "epoch": 2.26, + "grad_norm": 6.294621583070762, + "learning_rate": 4.190008214637476e-06, + "loss": 0.0591, + "step": 3168 + }, + { + "epoch": 2.26, + "grad_norm": 7.517307671125681, + "learning_rate": 4.187156376384171e-06, + "loss": 0.0526, + "step": 3169 + }, + { + "epoch": 2.26, + "grad_norm": 8.592352297300314, + "learning_rate": 4.184304809717027e-06, + "loss": 0.0632, + "step": 3170 + }, + { + "epoch": 2.26, + "grad_norm": 14.526658581579898, + "learning_rate": 4.18145351558881e-06, + "loss": 0.0461, + "step": 3171 + }, + { + "epoch": 2.26, + "grad_norm": 16.837649318976588, + "learning_rate": 4.178602494952187e-06, + "loss": 0.0806, + "step": 3172 + }, + { + "epoch": 2.26, + "grad_norm": 7.440201035737965, + "learning_rate": 4.175751748759737e-06, + "loss": 0.0474, + "step": 3173 + }, + { + "epoch": 2.27, + "grad_norm": 4.692217557688077, + "learning_rate": 4.1729012779639495e-06, + "loss": 0.0359, + "step": 3174 + }, + { + "epoch": 2.27, + "grad_norm": 5.1727632650733915, + "learning_rate": 4.170051083517217e-06, + "loss": 0.0648, + "step": 3175 + }, + { + "epoch": 2.27, + "grad_norm": 8.946075258402107, + "learning_rate": 4.167201166371846e-06, + "loss": 0.0524, + "step": 3176 + }, + { + "epoch": 2.27, + "grad_norm": 7.667399977486221, + "learning_rate": 4.164351527480042e-06, + "loss": 0.049, + "step": 3177 + }, + { + "epoch": 2.27, + "grad_norm": 5.274714349541263, + "learning_rate": 4.161502167793928e-06, + "loss": 0.0706, + "step": 3178 + }, + { + "epoch": 2.27, + "grad_norm": 10.214076104747047, + "learning_rate": 4.1586530882655226e-06, + "loss": 0.0504, + "step": 3179 + }, + { + "epoch": 2.27, + "grad_norm": 7.550207984999468, + "learning_rate": 4.155804289846762e-06, + "loss": 0.0385, + "step": 3180 + }, + { + "epoch": 2.27, + "grad_norm": 6.718996738912058, + "learning_rate": 4.152955773489479e-06, + "loss": 0.0577, + "step": 3181 + }, + { + "epoch": 2.27, + "grad_norm": 4.097197065235624, + "learning_rate": 4.150107540145413e-06, + "loss": 0.0457, + "step": 3182 + }, + { + "epoch": 2.27, + "grad_norm": 8.064906232599887, + "learning_rate": 4.147259590766219e-06, + "loss": 0.0574, + "step": 3183 + }, + { + "epoch": 2.27, + "grad_norm": 6.518517725711126, + "learning_rate": 4.144411926303442e-06, + "loss": 0.0383, + "step": 3184 + }, + { + "epoch": 2.27, + "grad_norm": 5.616507539227108, + "learning_rate": 4.141564547708546e-06, + "loss": 0.0521, + "step": 3185 + }, + { + "epoch": 2.27, + "grad_norm": 6.556983372205476, + "learning_rate": 4.138717455932888e-06, + "loss": 0.0537, + "step": 3186 + }, + { + "epoch": 2.27, + "grad_norm": 4.742026540119468, + "learning_rate": 4.13587065192774e-06, + "loss": 0.0479, + "step": 3187 + }, + { + "epoch": 2.28, + "grad_norm": 16.34355762704361, + "learning_rate": 4.133024136644269e-06, + "loss": 0.0531, + "step": 3188 + }, + { + "epoch": 2.28, + "grad_norm": 7.81542554393562, + "learning_rate": 4.130177911033546e-06, + "loss": 0.04, + "step": 3189 + }, + { + "epoch": 2.28, + "grad_norm": 13.76182597376324, + "learning_rate": 4.127331976046553e-06, + "loss": 0.0431, + "step": 3190 + }, + { + "epoch": 2.28, + "grad_norm": 6.879252527427723, + "learning_rate": 4.124486332634165e-06, + "loss": 0.067, + "step": 3191 + }, + { + "epoch": 2.28, + "grad_norm": 9.10827224827748, + "learning_rate": 4.121640981747169e-06, + "loss": 0.0434, + "step": 3192 + }, + { + "epoch": 2.28, + "grad_norm": 6.435745967654931, + "learning_rate": 4.118795924336245e-06, + "loss": 0.0596, + "step": 3193 + }, + { + "epoch": 2.28, + "grad_norm": 6.612178222538567, + "learning_rate": 4.115951161351985e-06, + "loss": 0.0446, + "step": 3194 + }, + { + "epoch": 2.28, + "grad_norm": 14.164355450798242, + "learning_rate": 4.113106693744871e-06, + "loss": 0.0479, + "step": 3195 + }, + { + "epoch": 2.28, + "grad_norm": 7.157081086345673, + "learning_rate": 4.110262522465298e-06, + "loss": 0.0531, + "step": 3196 + }, + { + "epoch": 2.28, + "grad_norm": 7.031758826153102, + "learning_rate": 4.107418648463553e-06, + "loss": 0.0307, + "step": 3197 + }, + { + "epoch": 2.28, + "grad_norm": 13.865702651660351, + "learning_rate": 4.104575072689827e-06, + "loss": 0.0684, + "step": 3198 + }, + { + "epoch": 2.28, + "grad_norm": 3.3223935942971883, + "learning_rate": 4.101731796094215e-06, + "loss": 0.0368, + "step": 3199 + }, + { + "epoch": 2.28, + "grad_norm": 9.926947145875099, + "learning_rate": 4.098888819626704e-06, + "loss": 0.0475, + "step": 3200 + }, + { + "epoch": 2.28, + "grad_norm": 13.764679326338504, + "learning_rate": 4.096046144237189e-06, + "loss": 0.0667, + "step": 3201 + }, + { + "epoch": 2.29, + "grad_norm": 4.877567877322836, + "learning_rate": 4.093203770875458e-06, + "loss": 0.0318, + "step": 3202 + }, + { + "epoch": 2.29, + "grad_norm": 3.0890455248333235, + "learning_rate": 4.090361700491203e-06, + "loss": 0.0391, + "step": 3203 + }, + { + "epoch": 2.29, + "grad_norm": 11.307502997946623, + "learning_rate": 4.087519934034011e-06, + "loss": 0.0579, + "step": 3204 + }, + { + "epoch": 2.29, + "grad_norm": 12.286021346963802, + "learning_rate": 4.084678472453371e-06, + "loss": 0.0425, + "step": 3205 + }, + { + "epoch": 2.29, + "grad_norm": 20.05845646784748, + "learning_rate": 4.081837316698665e-06, + "loss": 0.0714, + "step": 3206 + }, + { + "epoch": 2.29, + "grad_norm": 7.513715346915439, + "learning_rate": 4.078996467719179e-06, + "loss": 0.0566, + "step": 3207 + }, + { + "epoch": 2.29, + "grad_norm": 3.871357352532569, + "learning_rate": 4.076155926464091e-06, + "loss": 0.0322, + "step": 3208 + }, + { + "epoch": 2.29, + "grad_norm": 8.692473548212876, + "learning_rate": 4.07331569388248e-06, + "loss": 0.0452, + "step": 3209 + }, + { + "epoch": 2.29, + "grad_norm": 5.348891593350171, + "learning_rate": 4.07047577092332e-06, + "loss": 0.0622, + "step": 3210 + }, + { + "epoch": 2.29, + "grad_norm": 10.667994143010105, + "learning_rate": 4.067636158535483e-06, + "loss": 0.0429, + "step": 3211 + }, + { + "epoch": 2.29, + "grad_norm": 14.33390498257431, + "learning_rate": 4.064796857667734e-06, + "loss": 0.0602, + "step": 3212 + }, + { + "epoch": 2.29, + "grad_norm": 13.853874395384349, + "learning_rate": 4.0619578692687405e-06, + "loss": 0.0485, + "step": 3213 + }, + { + "epoch": 2.29, + "grad_norm": 10.967333214518202, + "learning_rate": 4.059119194287056e-06, + "loss": 0.0618, + "step": 3214 + }, + { + "epoch": 2.29, + "grad_norm": 19.491206872437058, + "learning_rate": 4.056280833671139e-06, + "loss": 0.0689, + "step": 3215 + }, + { + "epoch": 2.3, + "grad_norm": 15.545263997390558, + "learning_rate": 4.053442788369334e-06, + "loss": 0.0463, + "step": 3216 + }, + { + "epoch": 2.3, + "grad_norm": 8.404901220400674, + "learning_rate": 4.05060505932989e-06, + "loss": 0.049, + "step": 3217 + }, + { + "epoch": 2.3, + "grad_norm": 10.955981738026576, + "learning_rate": 4.04776764750094e-06, + "loss": 0.0467, + "step": 3218 + }, + { + "epoch": 2.3, + "grad_norm": 9.242743867660305, + "learning_rate": 4.04493055383052e-06, + "loss": 0.0492, + "step": 3219 + }, + { + "epoch": 2.3, + "grad_norm": 18.269012387060222, + "learning_rate": 4.042093779266553e-06, + "loss": 0.0473, + "step": 3220 + }, + { + "epoch": 2.3, + "grad_norm": 4.530310469645218, + "learning_rate": 4.0392573247568614e-06, + "loss": 0.0731, + "step": 3221 + }, + { + "epoch": 2.3, + "grad_norm": 10.337877336593875, + "learning_rate": 4.036421191249155e-06, + "loss": 0.0583, + "step": 3222 + }, + { + "epoch": 2.3, + "grad_norm": 11.665285153590094, + "learning_rate": 4.033585379691036e-06, + "loss": 0.101, + "step": 3223 + }, + { + "epoch": 2.3, + "grad_norm": 10.20335646310635, + "learning_rate": 4.030749891030008e-06, + "loss": 0.0428, + "step": 3224 + }, + { + "epoch": 2.3, + "grad_norm": 14.145496210000346, + "learning_rate": 4.0279147262134534e-06, + "loss": 0.0629, + "step": 3225 + }, + { + "epoch": 2.3, + "grad_norm": 10.358245511220993, + "learning_rate": 4.025079886188661e-06, + "loss": 0.0611, + "step": 3226 + }, + { + "epoch": 2.3, + "grad_norm": 11.004038035963342, + "learning_rate": 4.022245371902796e-06, + "loss": 0.0431, + "step": 3227 + }, + { + "epoch": 2.3, + "grad_norm": 13.706420262135827, + "learning_rate": 4.01941118430293e-06, + "loss": 0.0503, + "step": 3228 + }, + { + "epoch": 2.3, + "grad_norm": 5.96510652515189, + "learning_rate": 4.0165773243360105e-06, + "loss": 0.0313, + "step": 3229 + }, + { + "epoch": 2.31, + "grad_norm": 4.516895579432337, + "learning_rate": 4.0137437929488885e-06, + "loss": 0.0425, + "step": 3230 + }, + { + "epoch": 2.31, + "grad_norm": 6.163717649417404, + "learning_rate": 4.010910591088296e-06, + "loss": 0.0454, + "step": 3231 + }, + { + "epoch": 2.31, + "grad_norm": 10.726656037501574, + "learning_rate": 4.008077719700859e-06, + "loss": 0.0592, + "step": 3232 + }, + { + "epoch": 2.31, + "grad_norm": 13.5104992520412, + "learning_rate": 4.005245179733095e-06, + "loss": 0.045, + "step": 3233 + }, + { + "epoch": 2.31, + "grad_norm": 7.879892748265249, + "learning_rate": 4.002412972131403e-06, + "loss": 0.0378, + "step": 3234 + }, + { + "epoch": 2.31, + "grad_norm": 7.298147921669261, + "learning_rate": 3.999581097842082e-06, + "loss": 0.0468, + "step": 3235 + }, + { + "epoch": 2.31, + "grad_norm": 8.052193186678828, + "learning_rate": 3.99674955781131e-06, + "loss": 0.0628, + "step": 3236 + }, + { + "epoch": 2.31, + "grad_norm": 4.805780871133186, + "learning_rate": 3.99391835298516e-06, + "loss": 0.0336, + "step": 3237 + }, + { + "epoch": 2.31, + "grad_norm": 11.265202041373872, + "learning_rate": 3.991087484309586e-06, + "loss": 0.0347, + "step": 3238 + }, + { + "epoch": 2.31, + "grad_norm": 9.214043476468115, + "learning_rate": 3.988256952730439e-06, + "loss": 0.0771, + "step": 3239 + }, + { + "epoch": 2.31, + "grad_norm": 5.8998163572166895, + "learning_rate": 3.985426759193449e-06, + "loss": 0.0495, + "step": 3240 + }, + { + "epoch": 2.31, + "grad_norm": 10.144981153013562, + "learning_rate": 3.982596904644236e-06, + "loss": 0.0382, + "step": 3241 + }, + { + "epoch": 2.31, + "grad_norm": 5.985568117991297, + "learning_rate": 3.979767390028309e-06, + "loss": 0.0441, + "step": 3242 + }, + { + "epoch": 2.31, + "grad_norm": 17.97011714211926, + "learning_rate": 3.976938216291059e-06, + "loss": 0.0776, + "step": 3243 + }, + { + "epoch": 2.32, + "grad_norm": 3.31899818255434, + "learning_rate": 3.974109384377768e-06, + "loss": 0.0443, + "step": 3244 + }, + { + "epoch": 2.32, + "grad_norm": 10.545724201687834, + "learning_rate": 3.971280895233599e-06, + "loss": 0.0442, + "step": 3245 + }, + { + "epoch": 2.32, + "grad_norm": 5.238792598733231, + "learning_rate": 3.968452749803605e-06, + "loss": 0.0577, + "step": 3246 + }, + { + "epoch": 2.32, + "grad_norm": 3.7875985593995583, + "learning_rate": 3.965624949032723e-06, + "loss": 0.0409, + "step": 3247 + }, + { + "epoch": 2.32, + "grad_norm": 4.594499938098831, + "learning_rate": 3.962797493865767e-06, + "loss": 0.0485, + "step": 3248 + }, + { + "epoch": 2.32, + "grad_norm": 5.310912106765515, + "learning_rate": 3.959970385247451e-06, + "loss": 0.0536, + "step": 3249 + }, + { + "epoch": 2.32, + "grad_norm": 8.359734666912926, + "learning_rate": 3.957143624122359e-06, + "loss": 0.0649, + "step": 3250 + }, + { + "epoch": 2.32, + "grad_norm": 5.073298004365436, + "learning_rate": 3.954317211434966e-06, + "loss": 0.0496, + "step": 3251 + }, + { + "epoch": 2.32, + "grad_norm": 6.362507284188202, + "learning_rate": 3.951491148129628e-06, + "loss": 0.0522, + "step": 3252 + }, + { + "epoch": 2.32, + "grad_norm": 3.991116571447913, + "learning_rate": 3.948665435150589e-06, + "loss": 0.0389, + "step": 3253 + }, + { + "epoch": 2.32, + "grad_norm": 10.386760657877408, + "learning_rate": 3.945840073441967e-06, + "loss": 0.0495, + "step": 3254 + }, + { + "epoch": 2.32, + "grad_norm": 5.3677506790255665, + "learning_rate": 3.943015063947773e-06, + "loss": 0.0417, + "step": 3255 + }, + { + "epoch": 2.32, + "grad_norm": 7.163430214164329, + "learning_rate": 3.940190407611891e-06, + "loss": 0.0495, + "step": 3256 + }, + { + "epoch": 2.32, + "grad_norm": 17.871089845054883, + "learning_rate": 3.937366105378093e-06, + "loss": 0.0321, + "step": 3257 + }, + { + "epoch": 2.33, + "grad_norm": 7.221258871476373, + "learning_rate": 3.93454215819003e-06, + "loss": 0.0586, + "step": 3258 + }, + { + "epoch": 2.33, + "grad_norm": 8.203863757302269, + "learning_rate": 3.931718566991236e-06, + "loss": 0.0588, + "step": 3259 + }, + { + "epoch": 2.33, + "grad_norm": 6.2318167707341505, + "learning_rate": 3.9288953327251265e-06, + "loss": 0.0539, + "step": 3260 + }, + { + "epoch": 2.33, + "grad_norm": 8.561571820835148, + "learning_rate": 3.9260724563349935e-06, + "loss": 0.045, + "step": 3261 + }, + { + "epoch": 2.33, + "grad_norm": 10.489240447227571, + "learning_rate": 3.923249938764016e-06, + "loss": 0.0543, + "step": 3262 + }, + { + "epoch": 2.33, + "grad_norm": 5.63338501785195, + "learning_rate": 3.920427780955247e-06, + "loss": 0.0695, + "step": 3263 + }, + { + "epoch": 2.33, + "grad_norm": 7.206481099887481, + "learning_rate": 3.917605983851622e-06, + "loss": 0.0625, + "step": 3264 + }, + { + "epoch": 2.33, + "grad_norm": 12.970256281406726, + "learning_rate": 3.914784548395959e-06, + "loss": 0.048, + "step": 3265 + }, + { + "epoch": 2.33, + "grad_norm": 16.605000076839822, + "learning_rate": 3.911963475530948e-06, + "loss": 0.0655, + "step": 3266 + }, + { + "epoch": 2.33, + "grad_norm": 22.570657256087852, + "learning_rate": 3.909142766199163e-06, + "loss": 0.0626, + "step": 3267 + }, + { + "epoch": 2.33, + "grad_norm": 7.757888466221637, + "learning_rate": 3.906322421343055e-06, + "loss": 0.0389, + "step": 3268 + }, + { + "epoch": 2.33, + "grad_norm": 18.353788836285396, + "learning_rate": 3.903502441904956e-06, + "loss": 0.0423, + "step": 3269 + }, + { + "epoch": 2.33, + "grad_norm": 18.335337367703037, + "learning_rate": 3.900682828827072e-06, + "loss": 0.041, + "step": 3270 + }, + { + "epoch": 2.33, + "grad_norm": 20.560967998792574, + "learning_rate": 3.897863583051488e-06, + "loss": 0.0438, + "step": 3271 + }, + { + "epoch": 2.34, + "grad_norm": 9.189481542895964, + "learning_rate": 3.895044705520167e-06, + "loss": 0.0408, + "step": 3272 + }, + { + "epoch": 2.34, + "grad_norm": 6.240925968107035, + "learning_rate": 3.892226197174947e-06, + "loss": 0.0535, + "step": 3273 + }, + { + "epoch": 2.34, + "grad_norm": 11.330018925248265, + "learning_rate": 3.889408058957547e-06, + "loss": 0.0564, + "step": 3274 + }, + { + "epoch": 2.34, + "grad_norm": 22.462180664383062, + "learning_rate": 3.886590291809554e-06, + "loss": 0.0628, + "step": 3275 + }, + { + "epoch": 2.34, + "grad_norm": 25.572962458851126, + "learning_rate": 3.883772896672443e-06, + "loss": 0.0505, + "step": 3276 + }, + { + "epoch": 2.34, + "grad_norm": 16.781204089206188, + "learning_rate": 3.8809558744875534e-06, + "loss": 0.0385, + "step": 3277 + }, + { + "epoch": 2.34, + "grad_norm": 3.374198239697441, + "learning_rate": 3.878139226196107e-06, + "loss": 0.0457, + "step": 3278 + }, + { + "epoch": 2.34, + "grad_norm": 14.5149616122367, + "learning_rate": 3.875322952739196e-06, + "loss": 0.0605, + "step": 3279 + }, + { + "epoch": 2.34, + "grad_norm": 13.155372455067319, + "learning_rate": 3.872507055057793e-06, + "loss": 0.0413, + "step": 3280 + }, + { + "epoch": 2.34, + "grad_norm": 14.517540894737511, + "learning_rate": 3.8696915340927395e-06, + "loss": 0.0463, + "step": 3281 + }, + { + "epoch": 2.34, + "grad_norm": 12.853273973170033, + "learning_rate": 3.866876390784752e-06, + "loss": 0.0489, + "step": 3282 + }, + { + "epoch": 2.34, + "grad_norm": 16.08116245026929, + "learning_rate": 3.8640616260744266e-06, + "loss": 0.0584, + "step": 3283 + }, + { + "epoch": 2.34, + "grad_norm": 13.158844571849507, + "learning_rate": 3.861247240902223e-06, + "loss": 0.0468, + "step": 3284 + }, + { + "epoch": 2.34, + "grad_norm": 11.605770161229955, + "learning_rate": 3.858433236208485e-06, + "loss": 0.0745, + "step": 3285 + }, + { + "epoch": 2.35, + "grad_norm": 14.696854573310642, + "learning_rate": 3.85561961293342e-06, + "loss": 0.0485, + "step": 3286 + }, + { + "epoch": 2.35, + "grad_norm": 8.808771550272338, + "learning_rate": 3.852806372017115e-06, + "loss": 0.0525, + "step": 3287 + }, + { + "epoch": 2.35, + "grad_norm": 12.28812481591032, + "learning_rate": 3.849993514399521e-06, + "loss": 0.0457, + "step": 3288 + }, + { + "epoch": 2.35, + "grad_norm": 7.973779667671355, + "learning_rate": 3.847181041020472e-06, + "loss": 0.0442, + "step": 3289 + }, + { + "epoch": 2.35, + "grad_norm": 19.096436336706248, + "learning_rate": 3.844368952819666e-06, + "loss": 0.0495, + "step": 3290 + }, + { + "epoch": 2.35, + "grad_norm": 4.254182022740973, + "learning_rate": 3.84155725073667e-06, + "loss": 0.0621, + "step": 3291 + }, + { + "epoch": 2.35, + "grad_norm": 8.2181153756375, + "learning_rate": 3.838745935710931e-06, + "loss": 0.0423, + "step": 3292 + }, + { + "epoch": 2.35, + "grad_norm": 6.376471586863244, + "learning_rate": 3.835935008681757e-06, + "loss": 0.0704, + "step": 3293 + }, + { + "epoch": 2.35, + "grad_norm": 12.63158037914216, + "learning_rate": 3.833124470588336e-06, + "loss": 0.0662, + "step": 3294 + }, + { + "epoch": 2.35, + "grad_norm": 13.3387733700678, + "learning_rate": 3.830314322369717e-06, + "loss": 0.0459, + "step": 3295 + }, + { + "epoch": 2.35, + "grad_norm": 5.133497815442831, + "learning_rate": 3.827504564964825e-06, + "loss": 0.038, + "step": 3296 + }, + { + "epoch": 2.35, + "grad_norm": 8.710809474582618, + "learning_rate": 3.82469519931245e-06, + "loss": 0.0739, + "step": 3297 + }, + { + "epoch": 2.35, + "grad_norm": 7.0462004661285835, + "learning_rate": 3.8218862263512565e-06, + "loss": 0.0383, + "step": 3298 + }, + { + "epoch": 2.35, + "grad_norm": 5.6385692219029595, + "learning_rate": 3.819077647019772e-06, + "loss": 0.0709, + "step": 3299 + }, + { + "epoch": 2.36, + "grad_norm": 24.502025525734307, + "learning_rate": 3.816269462256394e-06, + "loss": 0.0513, + "step": 3300 + }, + { + "epoch": 2.36, + "grad_norm": 13.864706482022296, + "learning_rate": 3.813461672999394e-06, + "loss": 0.0684, + "step": 3301 + }, + { + "epoch": 2.36, + "grad_norm": 6.156507954757023, + "learning_rate": 3.8106542801869007e-06, + "loss": 0.0494, + "step": 3302 + }, + { + "epoch": 2.36, + "grad_norm": 6.807759895501928, + "learning_rate": 3.8078472847569215e-06, + "loss": 0.0603, + "step": 3303 + }, + { + "epoch": 2.36, + "grad_norm": 7.8161472508869725, + "learning_rate": 3.805040687647321e-06, + "loss": 0.0661, + "step": 3304 + }, + { + "epoch": 2.36, + "grad_norm": 4.692379712993815, + "learning_rate": 3.8022344897958402e-06, + "loss": 0.0519, + "step": 3305 + }, + { + "epoch": 2.36, + "grad_norm": 13.215818470193987, + "learning_rate": 3.799428692140077e-06, + "loss": 0.0547, + "step": 3306 + }, + { + "epoch": 2.36, + "grad_norm": 13.05567981160067, + "learning_rate": 3.7966232956175053e-06, + "loss": 0.0408, + "step": 3307 + }, + { + "epoch": 2.36, + "grad_norm": 11.836967320563483, + "learning_rate": 3.793818301165457e-06, + "loss": 0.0607, + "step": 3308 + }, + { + "epoch": 2.36, + "grad_norm": 24.41802793152467, + "learning_rate": 3.7910137097211345e-06, + "loss": 0.0675, + "step": 3309 + }, + { + "epoch": 2.36, + "grad_norm": 13.008738936268362, + "learning_rate": 3.788209522221604e-06, + "loss": 0.0508, + "step": 3310 + }, + { + "epoch": 2.36, + "grad_norm": 8.453167608671531, + "learning_rate": 3.7854057396037934e-06, + "loss": 0.0465, + "step": 3311 + }, + { + "epoch": 2.36, + "grad_norm": 7.942453598993944, + "learning_rate": 3.7826023628045037e-06, + "loss": 0.0429, + "step": 3312 + }, + { + "epoch": 2.36, + "grad_norm": 7.2021837650240546, + "learning_rate": 3.779799392760391e-06, + "loss": 0.0595, + "step": 3313 + }, + { + "epoch": 2.37, + "grad_norm": 8.110336108153568, + "learning_rate": 3.7769968304079833e-06, + "loss": 0.0392, + "step": 3314 + }, + { + "epoch": 2.37, + "grad_norm": 4.770513910427804, + "learning_rate": 3.7741946766836657e-06, + "loss": 0.0422, + "step": 3315 + }, + { + "epoch": 2.37, + "grad_norm": 5.485390555701271, + "learning_rate": 3.771392932523691e-06, + "loss": 0.0442, + "step": 3316 + }, + { + "epoch": 2.37, + "grad_norm": 3.776761654570514, + "learning_rate": 3.768591598864174e-06, + "loss": 0.0474, + "step": 3317 + }, + { + "epoch": 2.37, + "grad_norm": 8.255921845791628, + "learning_rate": 3.765790676641092e-06, + "loss": 0.0593, + "step": 3318 + }, + { + "epoch": 2.37, + "grad_norm": 7.84516772890703, + "learning_rate": 3.762990166790286e-06, + "loss": 0.0421, + "step": 3319 + }, + { + "epoch": 2.37, + "grad_norm": 21.7601097912396, + "learning_rate": 3.760190070247458e-06, + "loss": 0.0898, + "step": 3320 + }, + { + "epoch": 2.37, + "grad_norm": 8.031470132988193, + "learning_rate": 3.7573903879481714e-06, + "loss": 0.0466, + "step": 3321 + }, + { + "epoch": 2.37, + "grad_norm": 11.109101536744378, + "learning_rate": 3.754591120827854e-06, + "loss": 0.0467, + "step": 3322 + }, + { + "epoch": 2.37, + "grad_norm": 17.1446165094642, + "learning_rate": 3.7517922698217914e-06, + "loss": 0.0461, + "step": 3323 + }, + { + "epoch": 2.37, + "grad_norm": 14.65645178329707, + "learning_rate": 3.7489938358651334e-06, + "loss": 0.0511, + "step": 3324 + }, + { + "epoch": 2.37, + "grad_norm": 6.9909618648344605, + "learning_rate": 3.746195819892885e-06, + "loss": 0.0581, + "step": 3325 + }, + { + "epoch": 2.37, + "grad_norm": 18.094397161249283, + "learning_rate": 3.7433982228399205e-06, + "loss": 0.0445, + "step": 3326 + }, + { + "epoch": 2.37, + "grad_norm": 12.802062963646232, + "learning_rate": 3.7406010456409648e-06, + "loss": 0.05, + "step": 3327 + }, + { + "epoch": 2.38, + "grad_norm": 16.396058272502454, + "learning_rate": 3.73780428923061e-06, + "loss": 0.0441, + "step": 3328 + }, + { + "epoch": 2.38, + "grad_norm": 10.919646375990595, + "learning_rate": 3.7350079545433014e-06, + "loss": 0.0501, + "step": 3329 + }, + { + "epoch": 2.38, + "grad_norm": 6.922454337057674, + "learning_rate": 3.7322120425133497e-06, + "loss": 0.0319, + "step": 3330 + }, + { + "epoch": 2.38, + "grad_norm": 11.453873310916373, + "learning_rate": 3.729416554074917e-06, + "loss": 0.0383, + "step": 3331 + }, + { + "epoch": 2.38, + "grad_norm": 13.08051920997564, + "learning_rate": 3.726621490162033e-06, + "loss": 0.0369, + "step": 3332 + }, + { + "epoch": 2.38, + "grad_norm": 10.658265780826953, + "learning_rate": 3.7238268517085773e-06, + "loss": 0.0862, + "step": 3333 + }, + { + "epoch": 2.38, + "grad_norm": 15.404614991290057, + "learning_rate": 3.7210326396482893e-06, + "loss": 0.0499, + "step": 3334 + }, + { + "epoch": 2.38, + "grad_norm": 14.75043279711972, + "learning_rate": 3.718238854914771e-06, + "loss": 0.0703, + "step": 3335 + }, + { + "epoch": 2.38, + "grad_norm": 15.503977295964852, + "learning_rate": 3.7154454984414733e-06, + "loss": 0.0573, + "step": 3336 + }, + { + "epoch": 2.38, + "grad_norm": 6.946957800905822, + "learning_rate": 3.7126525711617135e-06, + "loss": 0.0673, + "step": 3337 + }, + { + "epoch": 2.38, + "grad_norm": 6.259494768027386, + "learning_rate": 3.7098600740086555e-06, + "loss": 0.0547, + "step": 3338 + }, + { + "epoch": 2.38, + "grad_norm": 11.546595002331136, + "learning_rate": 3.707068007915329e-06, + "loss": 0.056, + "step": 3339 + }, + { + "epoch": 2.38, + "grad_norm": 17.967131882842132, + "learning_rate": 3.704276373814611e-06, + "loss": 0.0409, + "step": 3340 + }, + { + "epoch": 2.38, + "grad_norm": 15.393071791424758, + "learning_rate": 3.7014851726392427e-06, + "loss": 0.0546, + "step": 3341 + }, + { + "epoch": 2.39, + "grad_norm": 7.001550477108224, + "learning_rate": 3.6986944053218143e-06, + "loss": 0.0545, + "step": 3342 + }, + { + "epoch": 2.39, + "grad_norm": 3.9519019922056784, + "learning_rate": 3.69590407279477e-06, + "loss": 0.043, + "step": 3343 + }, + { + "epoch": 2.39, + "grad_norm": 8.960410735070326, + "learning_rate": 3.6931141759904175e-06, + "loss": 0.0607, + "step": 3344 + }, + { + "epoch": 2.39, + "grad_norm": 14.847248534040741, + "learning_rate": 3.6903247158409077e-06, + "loss": 0.0394, + "step": 3345 + }, + { + "epoch": 2.39, + "grad_norm": 17.388659127488623, + "learning_rate": 3.687535693278256e-06, + "loss": 0.0561, + "step": 3346 + }, + { + "epoch": 2.39, + "grad_norm": 6.378086436756875, + "learning_rate": 3.6847471092343225e-06, + "loss": 0.049, + "step": 3347 + }, + { + "epoch": 2.39, + "grad_norm": 4.734911486197315, + "learning_rate": 3.681958964640828e-06, + "loss": 0.052, + "step": 3348 + }, + { + "epoch": 2.39, + "grad_norm": 3.652535209632719, + "learning_rate": 3.679171260429343e-06, + "loss": 0.0573, + "step": 3349 + }, + { + "epoch": 2.39, + "grad_norm": 18.331070603639798, + "learning_rate": 3.676383997531288e-06, + "loss": 0.0636, + "step": 3350 + }, + { + "epoch": 2.39, + "grad_norm": 10.55381413312245, + "learning_rate": 3.673597176877944e-06, + "loss": 0.0585, + "step": 3351 + }, + { + "epoch": 2.39, + "grad_norm": 8.280879713430096, + "learning_rate": 3.670810799400435e-06, + "loss": 0.0447, + "step": 3352 + }, + { + "epoch": 2.39, + "grad_norm": 6.519631412173078, + "learning_rate": 3.668024866029747e-06, + "loss": 0.0404, + "step": 3353 + }, + { + "epoch": 2.39, + "grad_norm": 11.576104508421272, + "learning_rate": 3.665239377696706e-06, + "loss": 0.0468, + "step": 3354 + }, + { + "epoch": 2.39, + "grad_norm": 4.760444512858353, + "learning_rate": 3.6624543353320006e-06, + "loss": 0.036, + "step": 3355 + }, + { + "epoch": 2.4, + "grad_norm": 4.6760439313381905, + "learning_rate": 3.659669739866162e-06, + "loss": 0.0276, + "step": 3356 + }, + { + "epoch": 2.4, + "grad_norm": 4.543274966456261, + "learning_rate": 3.6568855922295776e-06, + "loss": 0.0482, + "step": 3357 + }, + { + "epoch": 2.4, + "grad_norm": 5.3086205701499, + "learning_rate": 3.654101893352482e-06, + "loss": 0.0451, + "step": 3358 + }, + { + "epoch": 2.4, + "grad_norm": 13.06056766375821, + "learning_rate": 3.651318644164958e-06, + "loss": 0.0627, + "step": 3359 + }, + { + "epoch": 2.4, + "grad_norm": 8.710508596248417, + "learning_rate": 3.6485358455969454e-06, + "loss": 0.043, + "step": 3360 + }, + { + "epoch": 2.4, + "grad_norm": 5.694002302786408, + "learning_rate": 3.645753498578225e-06, + "loss": 0.0331, + "step": 3361 + }, + { + "epoch": 2.4, + "grad_norm": 8.42561338264928, + "learning_rate": 3.6429716040384346e-06, + "loss": 0.034, + "step": 3362 + }, + { + "epoch": 2.4, + "grad_norm": 6.1770297556861475, + "learning_rate": 3.6401901629070524e-06, + "loss": 0.0625, + "step": 3363 + }, + { + "epoch": 2.4, + "grad_norm": 5.919884188773968, + "learning_rate": 3.6374091761134147e-06, + "loss": 0.0616, + "step": 3364 + }, + { + "epoch": 2.4, + "grad_norm": 5.43484904633554, + "learning_rate": 3.6346286445866953e-06, + "loss": 0.0339, + "step": 3365 + }, + { + "epoch": 2.4, + "grad_norm": 3.2904973516040323, + "learning_rate": 3.6318485692559263e-06, + "loss": 0.0532, + "step": 3366 + }, + { + "epoch": 2.4, + "grad_norm": 4.091491571818965, + "learning_rate": 3.62906895104998e-06, + "loss": 0.0418, + "step": 3367 + }, + { + "epoch": 2.4, + "grad_norm": 7.015427483416067, + "learning_rate": 3.6262897908975787e-06, + "loss": 0.0275, + "step": 3368 + }, + { + "epoch": 2.4, + "grad_norm": 2.8745109759948946, + "learning_rate": 3.6235110897272917e-06, + "loss": 0.0345, + "step": 3369 + }, + { + "epoch": 2.41, + "grad_norm": 4.67694078014422, + "learning_rate": 3.620732848467535e-06, + "loss": 0.041, + "step": 3370 + }, + { + "epoch": 2.41, + "grad_norm": 7.963957573360893, + "learning_rate": 3.6179550680465703e-06, + "loss": 0.042, + "step": 3371 + }, + { + "epoch": 2.41, + "grad_norm": 13.579331192890951, + "learning_rate": 3.615177749392506e-06, + "loss": 0.0472, + "step": 3372 + }, + { + "epoch": 2.41, + "grad_norm": 3.9713676441301975, + "learning_rate": 3.6124008934332956e-06, + "loss": 0.0406, + "step": 3373 + }, + { + "epoch": 2.41, + "grad_norm": 4.852127016193872, + "learning_rate": 3.609624501096739e-06, + "loss": 0.0449, + "step": 3374 + }, + { + "epoch": 2.41, + "grad_norm": 4.550426276023454, + "learning_rate": 3.606848573310479e-06, + "loss": 0.0425, + "step": 3375 + }, + { + "epoch": 2.41, + "grad_norm": 12.497366697285011, + "learning_rate": 3.6040731110020065e-06, + "loss": 0.0394, + "step": 3376 + }, + { + "epoch": 2.41, + "grad_norm": 18.168602267642527, + "learning_rate": 3.6012981150986524e-06, + "loss": 0.0843, + "step": 3377 + }, + { + "epoch": 2.41, + "grad_norm": 12.307783885049966, + "learning_rate": 3.598523586527599e-06, + "loss": 0.0384, + "step": 3378 + }, + { + "epoch": 2.41, + "grad_norm": 4.59462797750005, + "learning_rate": 3.595749526215862e-06, + "loss": 0.0432, + "step": 3379 + }, + { + "epoch": 2.41, + "grad_norm": 14.017885187186423, + "learning_rate": 3.5929759350903117e-06, + "loss": 0.046, + "step": 3380 + }, + { + "epoch": 2.41, + "grad_norm": 24.792119713261677, + "learning_rate": 3.5902028140776524e-06, + "loss": 0.0671, + "step": 3381 + }, + { + "epoch": 2.41, + "grad_norm": 12.244242993831454, + "learning_rate": 3.5874301641044386e-06, + "loss": 0.0345, + "step": 3382 + }, + { + "epoch": 2.41, + "grad_norm": 7.018502392020034, + "learning_rate": 3.5846579860970632e-06, + "loss": 0.0583, + "step": 3383 + }, + { + "epoch": 2.42, + "grad_norm": 16.228689702919265, + "learning_rate": 3.58188628098176e-06, + "loss": 0.0566, + "step": 3384 + }, + { + "epoch": 2.42, + "grad_norm": 7.9160236332123395, + "learning_rate": 3.579115049684612e-06, + "loss": 0.0453, + "step": 3385 + }, + { + "epoch": 2.42, + "grad_norm": 6.865601457118527, + "learning_rate": 3.576344293131533e-06, + "loss": 0.0482, + "step": 3386 + }, + { + "epoch": 2.42, + "grad_norm": 14.727035098444215, + "learning_rate": 3.5735740122482896e-06, + "loss": 0.0439, + "step": 3387 + }, + { + "epoch": 2.42, + "grad_norm": 3.831666188573751, + "learning_rate": 3.570804207960481e-06, + "loss": 0.0428, + "step": 3388 + }, + { + "epoch": 2.42, + "grad_norm": 6.970567913151844, + "learning_rate": 3.5680348811935527e-06, + "loss": 0.0429, + "step": 3389 + }, + { + "epoch": 2.42, + "grad_norm": 4.700224034319951, + "learning_rate": 3.565266032872785e-06, + "loss": 0.05, + "step": 3390 + }, + { + "epoch": 2.42, + "grad_norm": 5.060450793558676, + "learning_rate": 3.5624976639233056e-06, + "loss": 0.0587, + "step": 3391 + }, + { + "epoch": 2.42, + "grad_norm": 9.739022910580005, + "learning_rate": 3.559729775270076e-06, + "loss": 0.0383, + "step": 3392 + }, + { + "epoch": 2.42, + "grad_norm": 5.282482394344644, + "learning_rate": 3.5569623678378972e-06, + "loss": 0.05, + "step": 3393 + }, + { + "epoch": 2.42, + "grad_norm": 7.146211172306711, + "learning_rate": 3.554195442551416e-06, + "loss": 0.0346, + "step": 3394 + }, + { + "epoch": 2.42, + "grad_norm": 7.117770381713717, + "learning_rate": 3.551429000335108e-06, + "loss": 0.0453, + "step": 3395 + }, + { + "epoch": 2.42, + "grad_norm": 4.501430969155091, + "learning_rate": 3.5486630421132983e-06, + "loss": 0.0406, + "step": 3396 + }, + { + "epoch": 2.42, + "grad_norm": 9.978686458274396, + "learning_rate": 3.5458975688101403e-06, + "loss": 0.0475, + "step": 3397 + }, + { + "epoch": 2.43, + "grad_norm": 9.504820462744869, + "learning_rate": 3.5431325813496352e-06, + "loss": 0.055, + "step": 3398 + }, + { + "epoch": 2.43, + "grad_norm": 13.297641172040194, + "learning_rate": 3.540368080655612e-06, + "loss": 0.064, + "step": 3399 + }, + { + "epoch": 2.43, + "grad_norm": 5.145894409950023, + "learning_rate": 3.5376040676517443e-06, + "loss": 0.0508, + "step": 3400 + }, + { + "epoch": 2.43, + "grad_norm": 4.6754745477101975, + "learning_rate": 3.5348405432615407e-06, + "loss": 0.0369, + "step": 3401 + }, + { + "epoch": 2.43, + "grad_norm": 9.183504955229806, + "learning_rate": 3.5320775084083425e-06, + "loss": 0.0362, + "step": 3402 + }, + { + "epoch": 2.43, + "grad_norm": 10.016899646390549, + "learning_rate": 3.529314964015336e-06, + "loss": 0.0524, + "step": 3403 + }, + { + "epoch": 2.43, + "grad_norm": 7.225269060493574, + "learning_rate": 3.526552911005533e-06, + "loss": 0.0651, + "step": 3404 + }, + { + "epoch": 2.43, + "grad_norm": 9.402695057707561, + "learning_rate": 3.523791350301793e-06, + "loss": 0.0508, + "step": 3405 + }, + { + "epoch": 2.43, + "grad_norm": 5.528168693207084, + "learning_rate": 3.5210302828267984e-06, + "loss": 0.0591, + "step": 3406 + }, + { + "epoch": 2.43, + "grad_norm": 4.978722816391329, + "learning_rate": 3.5182697095030795e-06, + "loss": 0.0323, + "step": 3407 + }, + { + "epoch": 2.43, + "grad_norm": 11.785501724674626, + "learning_rate": 3.5155096312529913e-06, + "loss": 0.0605, + "step": 3408 + }, + { + "epoch": 2.43, + "grad_norm": 8.863372588103983, + "learning_rate": 3.5127500489987252e-06, + "loss": 0.0441, + "step": 3409 + }, + { + "epoch": 2.43, + "grad_norm": 5.336343653134836, + "learning_rate": 3.5099909636623148e-06, + "loss": 0.0493, + "step": 3410 + }, + { + "epoch": 2.43, + "grad_norm": 9.86742725134051, + "learning_rate": 3.5072323761656163e-06, + "loss": 0.0493, + "step": 3411 + }, + { + "epoch": 2.44, + "grad_norm": 7.055618254025233, + "learning_rate": 3.5044742874303297e-06, + "loss": 0.0541, + "step": 3412 + }, + { + "epoch": 2.44, + "grad_norm": 7.776496254142825, + "learning_rate": 3.501716698377979e-06, + "loss": 0.0308, + "step": 3413 + }, + { + "epoch": 2.44, + "grad_norm": 7.122528698892253, + "learning_rate": 3.4989596099299306e-06, + "loss": 0.0427, + "step": 3414 + }, + { + "epoch": 2.44, + "grad_norm": 16.55113954313631, + "learning_rate": 3.496203023007374e-06, + "loss": 0.075, + "step": 3415 + }, + { + "epoch": 2.44, + "grad_norm": 9.731003512719141, + "learning_rate": 3.4934469385313418e-06, + "loss": 0.0554, + "step": 3416 + }, + { + "epoch": 2.44, + "grad_norm": 35.12656172388353, + "learning_rate": 3.490691357422689e-06, + "loss": 0.0719, + "step": 3417 + }, + { + "epoch": 2.44, + "grad_norm": 7.754354703696009, + "learning_rate": 3.487936280602108e-06, + "loss": 0.0235, + "step": 3418 + }, + { + "epoch": 2.44, + "grad_norm": 3.521163253822065, + "learning_rate": 3.4851817089901203e-06, + "loss": 0.033, + "step": 3419 + }, + { + "epoch": 2.44, + "grad_norm": 26.224926062833664, + "learning_rate": 3.4824276435070804e-06, + "loss": 0.0563, + "step": 3420 + }, + { + "epoch": 2.44, + "grad_norm": 15.61271153737091, + "learning_rate": 3.4796740850731716e-06, + "loss": 0.0356, + "step": 3421 + }, + { + "epoch": 2.44, + "grad_norm": 9.031891603705114, + "learning_rate": 3.47692103460841e-06, + "loss": 0.0486, + "step": 3422 + }, + { + "epoch": 2.44, + "grad_norm": 6.734795663073578, + "learning_rate": 3.474168493032641e-06, + "loss": 0.0525, + "step": 3423 + }, + { + "epoch": 2.44, + "grad_norm": 6.755868031059958, + "learning_rate": 3.4714164612655387e-06, + "loss": 0.0458, + "step": 3424 + }, + { + "epoch": 2.44, + "grad_norm": 7.708505272692726, + "learning_rate": 3.468664940226609e-06, + "loss": 0.0513, + "step": 3425 + }, + { + "epoch": 2.45, + "grad_norm": 4.506075218775547, + "learning_rate": 3.4659139308351885e-06, + "loss": 0.0344, + "step": 3426 + }, + { + "epoch": 2.45, + "grad_norm": 3.992509225070968, + "learning_rate": 3.4631634340104357e-06, + "loss": 0.044, + "step": 3427 + }, + { + "epoch": 2.45, + "grad_norm": 7.003251664486951, + "learning_rate": 3.460413450671346e-06, + "loss": 0.0434, + "step": 3428 + }, + { + "epoch": 2.45, + "grad_norm": 14.475401829587407, + "learning_rate": 3.457663981736739e-06, + "loss": 0.0854, + "step": 3429 + }, + { + "epoch": 2.45, + "grad_norm": 4.483836265471178, + "learning_rate": 3.4549150281252635e-06, + "loss": 0.0443, + "step": 3430 + }, + { + "epoch": 2.45, + "grad_norm": 11.13154384738619, + "learning_rate": 3.4521665907553957e-06, + "loss": 0.0337, + "step": 3431 + }, + { + "epoch": 2.45, + "grad_norm": 5.800442883625316, + "learning_rate": 3.4494186705454402e-06, + "loss": 0.0618, + "step": 3432 + }, + { + "epoch": 2.45, + "grad_norm": 16.842487232611695, + "learning_rate": 3.446671268413528e-06, + "loss": 0.056, + "step": 3433 + }, + { + "epoch": 2.45, + "grad_norm": 6.307882602308188, + "learning_rate": 3.443924385277617e-06, + "loss": 0.0401, + "step": 3434 + }, + { + "epoch": 2.45, + "grad_norm": 12.347319578908353, + "learning_rate": 3.4411780220554937e-06, + "loss": 0.049, + "step": 3435 + }, + { + "epoch": 2.45, + "grad_norm": 6.777317017761099, + "learning_rate": 3.4384321796647645e-06, + "loss": 0.044, + "step": 3436 + }, + { + "epoch": 2.45, + "grad_norm": 8.578839854101997, + "learning_rate": 3.4356868590228727e-06, + "loss": 0.08, + "step": 3437 + }, + { + "epoch": 2.45, + "grad_norm": 11.34847212257333, + "learning_rate": 3.4329420610470745e-06, + "loss": 0.0591, + "step": 3438 + }, + { + "epoch": 2.45, + "grad_norm": 16.488323097081217, + "learning_rate": 3.4301977866544634e-06, + "loss": 0.0469, + "step": 3439 + }, + { + "epoch": 2.46, + "grad_norm": 3.5345240893456635, + "learning_rate": 3.427454036761948e-06, + "loss": 0.037, + "step": 3440 + }, + { + "epoch": 2.46, + "grad_norm": 10.70351554080197, + "learning_rate": 3.4247108122862703e-06, + "loss": 0.049, + "step": 3441 + }, + { + "epoch": 2.46, + "grad_norm": 6.477152114166718, + "learning_rate": 3.4219681141439907e-06, + "loss": 0.0401, + "step": 3442 + }, + { + "epoch": 2.46, + "grad_norm": 18.807079075717237, + "learning_rate": 3.4192259432514934e-06, + "loss": 0.0406, + "step": 3443 + }, + { + "epoch": 2.46, + "grad_norm": 15.723153609547095, + "learning_rate": 3.4164843005249928e-06, + "loss": 0.0524, + "step": 3444 + }, + { + "epoch": 2.46, + "grad_norm": 12.39846598919014, + "learning_rate": 3.413743186880519e-06, + "loss": 0.0407, + "step": 3445 + }, + { + "epoch": 2.46, + "grad_norm": 7.3971140149986665, + "learning_rate": 3.4110026032339317e-06, + "loss": 0.0679, + "step": 3446 + }, + { + "epoch": 2.46, + "grad_norm": 20.316958331448777, + "learning_rate": 3.408262550500908e-06, + "loss": 0.0699, + "step": 3447 + }, + { + "epoch": 2.46, + "grad_norm": 19.17858909141001, + "learning_rate": 3.4055230295969556e-06, + "loss": 0.0484, + "step": 3448 + }, + { + "epoch": 2.46, + "grad_norm": 10.402764837758005, + "learning_rate": 3.4027840414373924e-06, + "loss": 0.0569, + "step": 3449 + }, + { + "epoch": 2.46, + "grad_norm": 7.135992378182201, + "learning_rate": 3.4000455869373716e-06, + "loss": 0.0426, + "step": 3450 + }, + { + "epoch": 2.46, + "grad_norm": 14.292267710034482, + "learning_rate": 3.397307667011859e-06, + "loss": 0.0685, + "step": 3451 + }, + { + "epoch": 2.46, + "grad_norm": 22.66042408586546, + "learning_rate": 3.394570282575642e-06, + "loss": 0.0729, + "step": 3452 + }, + { + "epoch": 2.46, + "grad_norm": 7.338566737526984, + "learning_rate": 3.3918334345433367e-06, + "loss": 0.0495, + "step": 3453 + }, + { + "epoch": 2.47, + "grad_norm": 8.259220087387108, + "learning_rate": 3.3890971238293703e-06, + "loss": 0.0544, + "step": 3454 + }, + { + "epoch": 2.47, + "grad_norm": 7.349858298914431, + "learning_rate": 3.386361351347999e-06, + "loss": 0.0507, + "step": 3455 + }, + { + "epoch": 2.47, + "grad_norm": 4.307434476542483, + "learning_rate": 3.3836261180132914e-06, + "loss": 0.0546, + "step": 3456 + }, + { + "epoch": 2.47, + "grad_norm": 5.554073968741056, + "learning_rate": 3.3808914247391437e-06, + "loss": 0.0492, + "step": 3457 + }, + { + "epoch": 2.47, + "grad_norm": 10.475716145946574, + "learning_rate": 3.3781572724392642e-06, + "loss": 0.0467, + "step": 3458 + }, + { + "epoch": 2.47, + "grad_norm": 11.333875703512863, + "learning_rate": 3.3754236620271876e-06, + "loss": 0.0606, + "step": 3459 + }, + { + "epoch": 2.47, + "grad_norm": 7.737852380637597, + "learning_rate": 3.3726905944162615e-06, + "loss": 0.0582, + "step": 3460 + }, + { + "epoch": 2.47, + "grad_norm": 8.626067005065318, + "learning_rate": 3.3699580705196527e-06, + "loss": 0.0547, + "step": 3461 + }, + { + "epoch": 2.47, + "grad_norm": 9.788902013697735, + "learning_rate": 3.367226091250353e-06, + "loss": 0.0514, + "step": 3462 + }, + { + "epoch": 2.47, + "grad_norm": 7.494197930351465, + "learning_rate": 3.3644946575211634e-06, + "loss": 0.0322, + "step": 3463 + }, + { + "epoch": 2.47, + "grad_norm": 6.72717147418577, + "learning_rate": 3.36176377024471e-06, + "loss": 0.0377, + "step": 3464 + }, + { + "epoch": 2.47, + "grad_norm": 9.106408875639595, + "learning_rate": 3.3590334303334293e-06, + "loss": 0.0645, + "step": 3465 + }, + { + "epoch": 2.47, + "grad_norm": 3.468657818143453, + "learning_rate": 3.356303638699583e-06, + "loss": 0.0349, + "step": 3466 + }, + { + "epoch": 2.47, + "grad_norm": 10.178729399836424, + "learning_rate": 3.35357439625524e-06, + "loss": 0.0491, + "step": 3467 + }, + { + "epoch": 2.48, + "grad_norm": 7.847892185842703, + "learning_rate": 3.3508457039122965e-06, + "loss": 0.0514, + "step": 3468 + }, + { + "epoch": 2.48, + "grad_norm": 12.475199644318261, + "learning_rate": 3.348117562582457e-06, + "loss": 0.0379, + "step": 3469 + }, + { + "epoch": 2.48, + "grad_norm": 4.425641585558723, + "learning_rate": 3.345389973177241e-06, + "loss": 0.0402, + "step": 3470 + }, + { + "epoch": 2.48, + "grad_norm": 4.957786046660219, + "learning_rate": 3.342662936607992e-06, + "loss": 0.0685, + "step": 3471 + }, + { + "epoch": 2.48, + "grad_norm": 10.682715401706243, + "learning_rate": 3.3399364537858594e-06, + "loss": 0.0541, + "step": 3472 + }, + { + "epoch": 2.48, + "grad_norm": 5.124834331996581, + "learning_rate": 3.3372105256218153e-06, + "loss": 0.0482, + "step": 3473 + }, + { + "epoch": 2.48, + "grad_norm": 6.155139065247913, + "learning_rate": 3.334485153026639e-06, + "loss": 0.0362, + "step": 3474 + }, + { + "epoch": 2.48, + "grad_norm": 11.387503864889187, + "learning_rate": 3.3317603369109332e-06, + "loss": 0.0608, + "step": 3475 + }, + { + "epoch": 2.48, + "grad_norm": 7.514201902512651, + "learning_rate": 3.3290360781851055e-06, + "loss": 0.0428, + "step": 3476 + }, + { + "epoch": 2.48, + "grad_norm": 10.254928567111829, + "learning_rate": 3.326312377759383e-06, + "loss": 0.0479, + "step": 3477 + }, + { + "epoch": 2.48, + "grad_norm": 8.581791189780155, + "learning_rate": 3.3235892365438038e-06, + "loss": 0.0291, + "step": 3478 + }, + { + "epoch": 2.48, + "grad_norm": 9.933864421671199, + "learning_rate": 3.3208666554482216e-06, + "loss": 0.0492, + "step": 3479 + }, + { + "epoch": 2.48, + "grad_norm": 5.1340261357847, + "learning_rate": 3.3181446353822997e-06, + "loss": 0.0361, + "step": 3480 + }, + { + "epoch": 2.48, + "grad_norm": 10.211747713964122, + "learning_rate": 3.315423177255516e-06, + "loss": 0.0725, + "step": 3481 + }, + { + "epoch": 2.49, + "grad_norm": 7.574056301175818, + "learning_rate": 3.312702281977161e-06, + "loss": 0.0604, + "step": 3482 + }, + { + "epoch": 2.49, + "grad_norm": 20.440276470356725, + "learning_rate": 3.3099819504563356e-06, + "loss": 0.0413, + "step": 3483 + }, + { + "epoch": 2.49, + "grad_norm": 11.787768676878581, + "learning_rate": 3.3072621836019535e-06, + "loss": 0.0383, + "step": 3484 + }, + { + "epoch": 2.49, + "grad_norm": 5.320753627246697, + "learning_rate": 3.3045429823227405e-06, + "loss": 0.0366, + "step": 3485 + }, + { + "epoch": 2.49, + "grad_norm": 3.0007273838143593, + "learning_rate": 3.3018243475272282e-06, + "loss": 0.0377, + "step": 3486 + }, + { + "epoch": 2.49, + "grad_norm": 15.551358111361543, + "learning_rate": 3.2991062801237683e-06, + "loss": 0.0653, + "step": 3487 + }, + { + "epoch": 2.49, + "grad_norm": 14.995269345180393, + "learning_rate": 3.296388781020513e-06, + "loss": 0.065, + "step": 3488 + }, + { + "epoch": 2.49, + "grad_norm": 16.071603280041327, + "learning_rate": 3.293671851125434e-06, + "loss": 0.0832, + "step": 3489 + }, + { + "epoch": 2.49, + "grad_norm": 11.916063649824876, + "learning_rate": 3.2909554913463034e-06, + "loss": 0.0473, + "step": 3490 + }, + { + "epoch": 2.49, + "grad_norm": 21.15674028802956, + "learning_rate": 3.2882397025907114e-06, + "loss": 0.0777, + "step": 3491 + }, + { + "epoch": 2.49, + "grad_norm": 16.156481807543038, + "learning_rate": 3.2855244857660497e-06, + "loss": 0.0443, + "step": 3492 + }, + { + "epoch": 2.49, + "grad_norm": 16.88354258053397, + "learning_rate": 3.2828098417795267e-06, + "loss": 0.0409, + "step": 3493 + }, + { + "epoch": 2.49, + "grad_norm": 22.697696452401754, + "learning_rate": 3.2800957715381537e-06, + "loss": 0.0652, + "step": 3494 + }, + { + "epoch": 2.49, + "grad_norm": 4.743314683992511, + "learning_rate": 3.2773822759487497e-06, + "loss": 0.0425, + "step": 3495 + }, + { + "epoch": 2.5, + "grad_norm": 14.222623429544777, + "learning_rate": 3.2746693559179483e-06, + "loss": 0.0468, + "step": 3496 + }, + { + "epoch": 2.5, + "grad_norm": 36.49482472083385, + "learning_rate": 3.2719570123521816e-06, + "loss": 0.0753, + "step": 3497 + }, + { + "epoch": 2.5, + "grad_norm": 18.21815675653801, + "learning_rate": 3.2692452461576997e-06, + "loss": 0.0572, + "step": 3498 + }, + { + "epoch": 2.5, + "grad_norm": 12.83475452863996, + "learning_rate": 3.266534058240548e-06, + "loss": 0.0497, + "step": 3499 + }, + { + "epoch": 2.5, + "grad_norm": 5.823480145703581, + "learning_rate": 3.2638234495065903e-06, + "loss": 0.0537, + "step": 3500 + }, + { + "epoch": 2.5, + "eval_avg_AUC": 0.832346751207196, + "eval_avg_Accuracy": 0.739597148541114, + "eval_avg_Accuracy-right": 0.8735489761314725, + "eval_avg_Accuracy-wrong": 0.5060268364794178, + "eval_avg_Num questions with both labels": 523, + "eval_avg_Question-wise AUC": 0.7006953165400592, + "eval_last_AUC": 0.8475061209281654, + "eval_last_Accuracy": 0.7807940981432361, + "eval_last_Accuracy-right": 0.8329855223685927, + "eval_last_Accuracy-wrong": 0.6897884921537412, + "eval_last_Num questions with both labels": 523, + "eval_last_Question-wise AUC": 0.7097495321532842, + "eval_max_AUC": 0.7701603116760396, + "eval_max_Accuracy": 0.6501574933687002, + "eval_max_Accuracy-right": 0.9781531237772271, + "eval_max_Accuracy-wrong": 0.07823516033659313, + "eval_max_Num questions with both labels": 523, + "eval_max_Question-wise AUC": 0.6369597827879677, + "eval_min_AUC": 0.8455461246669156, + "eval_min_Accuracy": 0.7731681034482759, + "eval_min_Accuracy-right": 0.7823138124429373, + "eval_min_Accuracy-wrong": 0.7572208323857176, + "eval_min_Num questions with both labels": 523, + "eval_min_Question-wise AUC": 0.7126465395097116, + "eval_prod_AUC": 0.8377160248849372, + "eval_prod_Accuracy": 0.7180868700265252, + "eval_prod_Accuracy-right": 0.6126255380200861, + "eval_prod_Accuracy-wrong": 0.9019786217875825, + "eval_prod_Num questions with both labels": 523, + "eval_prod_Question-wise AUC": 0.7055742832242811, + "eval_runtime": 247.3763, + "eval_samples_per_second": 97.536, + "eval_steps_per_second": 3.048, + "eval_sum_AUC": 0.712185339026256, + "eval_sum_Accuracy": 0.640873673740053, + "eval_sum_Accuracy-right": 0.9953697665318899, + "eval_sum_Accuracy-wrong": 0.02274277916761428, + "eval_sum_Num questions with both labels": 523, + "eval_sum_Question-wise AUC": 0.6744582768230599, + "step": 3500 + }, + { + "epoch": 2.5, + "grad_norm": 14.845003338865876, + "learning_rate": 3.261113420861487e-06, + "loss": 0.0569, + "step": 3501 + }, + { + "epoch": 2.5, + "grad_norm": 20.476101635024914, + "learning_rate": 3.258403973210713e-06, + "loss": 0.0443, + "step": 3502 + }, + { + "epoch": 2.5, + "grad_norm": 16.582413967496823, + "learning_rate": 3.2556951074595435e-06, + "loss": 0.0405, + "step": 3503 + }, + { + "epoch": 2.5, + "grad_norm": 7.163367417609349, + "learning_rate": 3.2529868245130577e-06, + "loss": 0.0509, + "step": 3504 + }, + { + "epoch": 2.5, + "grad_norm": 14.256405396539867, + "learning_rate": 3.250279125276148e-06, + "loss": 0.0443, + "step": 3505 + }, + { + "epoch": 2.5, + "grad_norm": 9.355654318932615, + "learning_rate": 3.2475720106535036e-06, + "loss": 0.0351, + "step": 3506 + }, + { + "epoch": 2.5, + "grad_norm": 15.4290622991359, + "learning_rate": 3.244865481549625e-06, + "loss": 0.0385, + "step": 3507 + }, + { + "epoch": 2.5, + "grad_norm": 5.540807483931574, + "learning_rate": 3.24215953886881e-06, + "loss": 0.0711, + "step": 3508 + }, + { + "epoch": 2.5, + "grad_norm": 3.2900577929188204, + "learning_rate": 3.2394541835151692e-06, + "loss": 0.0322, + "step": 3509 + }, + { + "epoch": 2.51, + "grad_norm": 5.839214385110157, + "learning_rate": 3.2367494163926095e-06, + "loss": 0.0355, + "step": 3510 + }, + { + "epoch": 2.51, + "grad_norm": 9.368493630197404, + "learning_rate": 3.234045238404841e-06, + "loss": 0.0588, + "step": 3511 + }, + { + "epoch": 2.51, + "grad_norm": 13.586715948198096, + "learning_rate": 3.2313416504553852e-06, + "loss": 0.0579, + "step": 3512 + }, + { + "epoch": 2.51, + "grad_norm": 7.231859040657891, + "learning_rate": 3.2286386534475568e-06, + "loss": 0.0381, + "step": 3513 + }, + { + "epoch": 2.51, + "grad_norm": 4.101218107837911, + "learning_rate": 3.2259362482844803e-06, + "loss": 0.0369, + "step": 3514 + }, + { + "epoch": 2.51, + "grad_norm": 5.651265778383636, + "learning_rate": 3.2232344358690765e-06, + "loss": 0.0352, + "step": 3515 + }, + { + "epoch": 2.51, + "grad_norm": 12.961817876373003, + "learning_rate": 3.220533217104075e-06, + "loss": 0.0605, + "step": 3516 + }, + { + "epoch": 2.51, + "grad_norm": 8.658443920435355, + "learning_rate": 3.217832592891999e-06, + "loss": 0.0306, + "step": 3517 + }, + { + "epoch": 2.51, + "grad_norm": 7.589369821386444, + "learning_rate": 3.2151325641351817e-06, + "loss": 0.0788, + "step": 3518 + }, + { + "epoch": 2.51, + "grad_norm": 7.192128941843156, + "learning_rate": 3.2124331317357506e-06, + "loss": 0.0422, + "step": 3519 + }, + { + "epoch": 2.51, + "grad_norm": 5.335264955696329, + "learning_rate": 3.2097342965956334e-06, + "loss": 0.064, + "step": 3520 + }, + { + "epoch": 2.51, + "grad_norm": 2.7148073824964096, + "learning_rate": 3.2070360596165667e-06, + "loss": 0.0297, + "step": 3521 + }, + { + "epoch": 2.51, + "grad_norm": 14.262742389282423, + "learning_rate": 3.204338421700076e-06, + "loss": 0.0448, + "step": 3522 + }, + { + "epoch": 2.51, + "grad_norm": 17.047759589845352, + "learning_rate": 3.201641383747498e-06, + "loss": 0.0615, + "step": 3523 + }, + { + "epoch": 2.52, + "grad_norm": 5.1887465158901795, + "learning_rate": 3.1989449466599574e-06, + "loss": 0.0709, + "step": 3524 + }, + { + "epoch": 2.52, + "grad_norm": 10.43183697384031, + "learning_rate": 3.1962491113383896e-06, + "loss": 0.0739, + "step": 3525 + }, + { + "epoch": 2.52, + "grad_norm": 7.760472498332863, + "learning_rate": 3.1935538786835183e-06, + "loss": 0.0476, + "step": 3526 + }, + { + "epoch": 2.52, + "grad_norm": 20.92702203320919, + "learning_rate": 3.1908592495958747e-06, + "loss": 0.053, + "step": 3527 + }, + { + "epoch": 2.52, + "grad_norm": 11.407485827565694, + "learning_rate": 3.1881652249757823e-06, + "loss": 0.0372, + "step": 3528 + }, + { + "epoch": 2.52, + "grad_norm": 4.8307280957120735, + "learning_rate": 3.185471805723365e-06, + "loss": 0.045, + "step": 3529 + }, + { + "epoch": 2.52, + "grad_norm": 8.019327740566352, + "learning_rate": 3.1827789927385444e-06, + "loss": 0.0433, + "step": 3530 + }, + { + "epoch": 2.52, + "grad_norm": 5.920055320148531, + "learning_rate": 3.18008678692104e-06, + "loss": 0.0533, + "step": 3531 + }, + { + "epoch": 2.52, + "grad_norm": 4.226924007230088, + "learning_rate": 3.1773951891703668e-06, + "loss": 0.052, + "step": 3532 + }, + { + "epoch": 2.52, + "grad_norm": 13.516443184870806, + "learning_rate": 3.1747042003858386e-06, + "loss": 0.0547, + "step": 3533 + }, + { + "epoch": 2.52, + "grad_norm": 8.21547792246494, + "learning_rate": 3.1720138214665643e-06, + "loss": 0.0491, + "step": 3534 + }, + { + "epoch": 2.52, + "grad_norm": 6.142068501852478, + "learning_rate": 3.1693240533114496e-06, + "loss": 0.0641, + "step": 3535 + }, + { + "epoch": 2.52, + "grad_norm": 4.150763465920459, + "learning_rate": 3.1666348968191955e-06, + "loss": 0.0401, + "step": 3536 + }, + { + "epoch": 2.52, + "grad_norm": 11.729987470097232, + "learning_rate": 3.1639463528883007e-06, + "loss": 0.0582, + "step": 3537 + }, + { + "epoch": 2.53, + "grad_norm": 10.409400310191932, + "learning_rate": 3.161258422417055e-06, + "loss": 0.0383, + "step": 3538 + }, + { + "epoch": 2.53, + "grad_norm": 6.526722578357467, + "learning_rate": 3.1585711063035496e-06, + "loss": 0.0496, + "step": 3539 + }, + { + "epoch": 2.53, + "grad_norm": 4.236653388596862, + "learning_rate": 3.155884405445663e-06, + "loss": 0.0424, + "step": 3540 + }, + { + "epoch": 2.53, + "grad_norm": 5.5839787734094095, + "learning_rate": 3.153198320741074e-06, + "loss": 0.0374, + "step": 3541 + }, + { + "epoch": 2.53, + "grad_norm": 13.274151144936681, + "learning_rate": 3.150512853087253e-06, + "loss": 0.0564, + "step": 3542 + }, + { + "epoch": 2.53, + "grad_norm": 9.510037031594369, + "learning_rate": 3.1478280033814657e-06, + "loss": 0.0466, + "step": 3543 + }, + { + "epoch": 2.53, + "grad_norm": 6.862413054864041, + "learning_rate": 3.14514377252077e-06, + "loss": 0.0373, + "step": 3544 + }, + { + "epoch": 2.53, + "grad_norm": 5.4795830545549435, + "learning_rate": 3.142460161402014e-06, + "loss": 0.0541, + "step": 3545 + }, + { + "epoch": 2.53, + "grad_norm": 14.338633368147894, + "learning_rate": 3.139777170921847e-06, + "loss": 0.0584, + "step": 3546 + }, + { + "epoch": 2.53, + "grad_norm": 8.975528387578118, + "learning_rate": 3.137094801976701e-06, + "loss": 0.0461, + "step": 3547 + }, + { + "epoch": 2.53, + "grad_norm": 7.4199338093489215, + "learning_rate": 3.1344130554628104e-06, + "loss": 0.0346, + "step": 3548 + }, + { + "epoch": 2.53, + "grad_norm": 10.49710065460678, + "learning_rate": 3.131731932276193e-06, + "loss": 0.0508, + "step": 3549 + }, + { + "epoch": 2.53, + "grad_norm": 5.33530821774696, + "learning_rate": 3.129051433312664e-06, + "loss": 0.0599, + "step": 3550 + }, + { + "epoch": 2.53, + "grad_norm": 5.796697346739547, + "learning_rate": 3.1263715594678257e-06, + "loss": 0.0528, + "step": 3551 + }, + { + "epoch": 2.54, + "grad_norm": 3.240653694412567, + "learning_rate": 3.1236923116370764e-06, + "loss": 0.0381, + "step": 3552 + }, + { + "epoch": 2.54, + "grad_norm": 2.9372123775383145, + "learning_rate": 3.121013690715601e-06, + "loss": 0.0376, + "step": 3553 + }, + { + "epoch": 2.54, + "grad_norm": 7.609415911504751, + "learning_rate": 3.118335697598376e-06, + "loss": 0.0358, + "step": 3554 + }, + { + "epoch": 2.54, + "grad_norm": 8.390856165835398, + "learning_rate": 3.1156583331801703e-06, + "loss": 0.0514, + "step": 3555 + }, + { + "epoch": 2.54, + "grad_norm": 8.449747018529868, + "learning_rate": 3.1129815983555387e-06, + "loss": 0.0518, + "step": 3556 + }, + { + "epoch": 2.54, + "grad_norm": 17.287771804155526, + "learning_rate": 3.1103054940188316e-06, + "loss": 0.0499, + "step": 3557 + }, + { + "epoch": 2.54, + "grad_norm": 3.918114158102355, + "learning_rate": 3.1076300210641814e-06, + "loss": 0.0447, + "step": 3558 + }, + { + "epoch": 2.54, + "grad_norm": 12.087731186438175, + "learning_rate": 3.1049551803855173e-06, + "loss": 0.0766, + "step": 3559 + }, + { + "epoch": 2.54, + "grad_norm": 4.23804644083563, + "learning_rate": 3.1022809728765486e-06, + "loss": 0.0511, + "step": 3560 + }, + { + "epoch": 2.54, + "grad_norm": 6.3009649089413236, + "learning_rate": 3.0996073994307825e-06, + "loss": 0.041, + "step": 3561 + }, + { + "epoch": 2.54, + "grad_norm": 4.219391874758236, + "learning_rate": 3.0969344609415076e-06, + "loss": 0.0388, + "step": 3562 + }, + { + "epoch": 2.54, + "grad_norm": 5.0531747625683945, + "learning_rate": 3.0942621583017994e-06, + "loss": 0.0349, + "step": 3563 + }, + { + "epoch": 2.54, + "grad_norm": 9.97710030264506, + "learning_rate": 3.0915904924045294e-06, + "loss": 0.0431, + "step": 3564 + }, + { + "epoch": 2.54, + "grad_norm": 9.397820259535381, + "learning_rate": 3.088919464142346e-06, + "loss": 0.0457, + "step": 3565 + }, + { + "epoch": 2.55, + "grad_norm": 4.754455760260489, + "learning_rate": 3.0862490744076928e-06, + "loss": 0.0633, + "step": 3566 + }, + { + "epoch": 2.55, + "grad_norm": 4.929988907403646, + "learning_rate": 3.0835793240927937e-06, + "loss": 0.0445, + "step": 3567 + }, + { + "epoch": 2.55, + "grad_norm": 3.6203167310866937, + "learning_rate": 3.0809102140896652e-06, + "loss": 0.0346, + "step": 3568 + }, + { + "epoch": 2.55, + "grad_norm": 8.143345190966631, + "learning_rate": 3.078241745290103e-06, + "loss": 0.0656, + "step": 3569 + }, + { + "epoch": 2.55, + "grad_norm": 9.331281186466539, + "learning_rate": 3.075573918585696e-06, + "loss": 0.0452, + "step": 3570 + }, + { + "epoch": 2.55, + "grad_norm": 10.05873032023487, + "learning_rate": 3.0729067348678127e-06, + "loss": 0.0308, + "step": 3571 + }, + { + "epoch": 2.55, + "grad_norm": 4.45197339309329, + "learning_rate": 3.0702401950276066e-06, + "loss": 0.0505, + "step": 3572 + }, + { + "epoch": 2.55, + "grad_norm": 6.511150541662428, + "learning_rate": 3.067574299956022e-06, + "loss": 0.0391, + "step": 3573 + }, + { + "epoch": 2.55, + "grad_norm": 6.19790668856283, + "learning_rate": 3.0649090505437804e-06, + "loss": 0.0618, + "step": 3574 + }, + { + "epoch": 2.55, + "grad_norm": 4.092533552558913, + "learning_rate": 3.062244447681396e-06, + "loss": 0.0362, + "step": 3575 + }, + { + "epoch": 2.55, + "grad_norm": 3.328037295873963, + "learning_rate": 3.0595804922591564e-06, + "loss": 0.0398, + "step": 3576 + }, + { + "epoch": 2.55, + "grad_norm": 11.908875157062429, + "learning_rate": 3.0569171851671436e-06, + "loss": 0.0689, + "step": 3577 + }, + { + "epoch": 2.55, + "grad_norm": 10.261183867138291, + "learning_rate": 3.054254527295215e-06, + "loss": 0.0539, + "step": 3578 + }, + { + "epoch": 2.55, + "grad_norm": 3.6021336026425312, + "learning_rate": 3.0515925195330148e-06, + "loss": 0.0393, + "step": 3579 + }, + { + "epoch": 2.56, + "grad_norm": 5.4869403375119425, + "learning_rate": 3.048931162769969e-06, + "loss": 0.0507, + "step": 3580 + }, + { + "epoch": 2.56, + "grad_norm": 16.354612895096743, + "learning_rate": 3.0462704578952874e-06, + "loss": 0.0511, + "step": 3581 + }, + { + "epoch": 2.56, + "grad_norm": 7.463260414069788, + "learning_rate": 3.0436104057979604e-06, + "loss": 0.0478, + "step": 3582 + }, + { + "epoch": 2.56, + "grad_norm": 8.65693289332545, + "learning_rate": 3.0409510073667602e-06, + "loss": 0.0355, + "step": 3583 + }, + { + "epoch": 2.56, + "grad_norm": 4.322328157842049, + "learning_rate": 3.038292263490242e-06, + "loss": 0.0651, + "step": 3584 + }, + { + "epoch": 2.56, + "grad_norm": 13.68926290517709, + "learning_rate": 3.035634175056742e-06, + "loss": 0.0544, + "step": 3585 + }, + { + "epoch": 2.56, + "grad_norm": 10.244078321532088, + "learning_rate": 3.0329767429543767e-06, + "loss": 0.0561, + "step": 3586 + }, + { + "epoch": 2.56, + "grad_norm": 4.0632698090854795, + "learning_rate": 3.030319968071043e-06, + "loss": 0.0345, + "step": 3587 + }, + { + "epoch": 2.56, + "grad_norm": 2.8950494704080074, + "learning_rate": 3.0276638512944177e-06, + "loss": 0.0347, + "step": 3588 + }, + { + "epoch": 2.56, + "grad_norm": 7.721157968690705, + "learning_rate": 3.025008393511961e-06, + "loss": 0.0644, + "step": 3589 + }, + { + "epoch": 2.56, + "grad_norm": 14.006638808045752, + "learning_rate": 3.022353595610909e-06, + "loss": 0.0735, + "step": 3590 + }, + { + "epoch": 2.56, + "grad_norm": 3.9140223429542447, + "learning_rate": 3.01969945847828e-06, + "loss": 0.043, + "step": 3591 + }, + { + "epoch": 2.56, + "grad_norm": 9.900182325928498, + "learning_rate": 3.017045983000871e-06, + "loss": 0.0489, + "step": 3592 + }, + { + "epoch": 2.56, + "grad_norm": 4.361887420728005, + "learning_rate": 3.014393170065256e-06, + "loss": 0.0556, + "step": 3593 + }, + { + "epoch": 2.57, + "grad_norm": 13.67604306112046, + "learning_rate": 3.0117410205577903e-06, + "loss": 0.0477, + "step": 3594 + }, + { + "epoch": 2.57, + "grad_norm": 9.916712836665532, + "learning_rate": 3.0090895353646053e-06, + "loss": 0.0385, + "step": 3595 + }, + { + "epoch": 2.57, + "grad_norm": 3.3840417799657025, + "learning_rate": 3.006438715371614e-06, + "loss": 0.0473, + "step": 3596 + }, + { + "epoch": 2.57, + "grad_norm": 7.2866366962959095, + "learning_rate": 3.0037885614645e-06, + "loss": 0.0517, + "step": 3597 + }, + { + "epoch": 2.57, + "grad_norm": 10.141648265814117, + "learning_rate": 3.001139074528735e-06, + "loss": 0.0262, + "step": 3598 + }, + { + "epoch": 2.57, + "grad_norm": 5.09068531568647, + "learning_rate": 2.9984902554495556e-06, + "loss": 0.0488, + "step": 3599 + }, + { + "epoch": 2.57, + "grad_norm": 9.031353090048116, + "learning_rate": 2.995842105111987e-06, + "loss": 0.0409, + "step": 3600 + }, + { + "epoch": 2.57, + "grad_norm": 6.990355483528864, + "learning_rate": 2.99319462440082e-06, + "loss": 0.0499, + "step": 3601 + }, + { + "epoch": 2.57, + "grad_norm": 4.2774820742083435, + "learning_rate": 2.990547814200633e-06, + "loss": 0.0457, + "step": 3602 + }, + { + "epoch": 2.57, + "grad_norm": 9.343295864443792, + "learning_rate": 2.987901675395771e-06, + "loss": 0.0399, + "step": 3603 + }, + { + "epoch": 2.57, + "grad_norm": 9.517038015733123, + "learning_rate": 2.985256208870357e-06, + "loss": 0.0621, + "step": 3604 + }, + { + "epoch": 2.57, + "grad_norm": 7.376829239076995, + "learning_rate": 2.982611415508294e-06, + "loss": 0.0432, + "step": 3605 + }, + { + "epoch": 2.57, + "grad_norm": 5.701404340515294, + "learning_rate": 2.9799672961932525e-06, + "loss": 0.0607, + "step": 3606 + }, + { + "epoch": 2.57, + "grad_norm": 4.540746888215145, + "learning_rate": 2.9773238518086866e-06, + "loss": 0.0315, + "step": 3607 + }, + { + "epoch": 2.58, + "grad_norm": 5.28757599597712, + "learning_rate": 2.974681083237816e-06, + "loss": 0.0513, + "step": 3608 + }, + { + "epoch": 2.58, + "grad_norm": 12.344327625926315, + "learning_rate": 2.972038991363643e-06, + "loss": 0.0467, + "step": 3609 + }, + { + "epoch": 2.58, + "grad_norm": 10.626885297699232, + "learning_rate": 2.9693975770689344e-06, + "loss": 0.0414, + "step": 3610 + }, + { + "epoch": 2.58, + "grad_norm": 5.12103907244829, + "learning_rate": 2.9667568412362415e-06, + "loss": 0.0318, + "step": 3611 + }, + { + "epoch": 2.58, + "grad_norm": 13.006583054701983, + "learning_rate": 2.9641167847478797e-06, + "loss": 0.0516, + "step": 3612 + }, + { + "epoch": 2.58, + "grad_norm": 7.09256099551172, + "learning_rate": 2.96147740848594e-06, + "loss": 0.0533, + "step": 3613 + }, + { + "epoch": 2.58, + "grad_norm": 6.204069778734054, + "learning_rate": 2.9588387133322903e-06, + "loss": 0.0599, + "step": 3614 + }, + { + "epoch": 2.58, + "grad_norm": 21.147619366359972, + "learning_rate": 2.9562007001685644e-06, + "loss": 0.0424, + "step": 3615 + }, + { + "epoch": 2.58, + "grad_norm": 9.26229905936863, + "learning_rate": 2.9535633698761755e-06, + "loss": 0.0731, + "step": 3616 + }, + { + "epoch": 2.58, + "grad_norm": 12.74094699586504, + "learning_rate": 2.9509267233363005e-06, + "loss": 0.0692, + "step": 3617 + }, + { + "epoch": 2.58, + "grad_norm": 4.182529756900972, + "learning_rate": 2.948290761429895e-06, + "loss": 0.0459, + "step": 3618 + }, + { + "epoch": 2.58, + "grad_norm": 13.62462515024402, + "learning_rate": 2.9456554850376805e-06, + "loss": 0.0695, + "step": 3619 + }, + { + "epoch": 2.58, + "grad_norm": 5.9507955856152925, + "learning_rate": 2.943020895040155e-06, + "loss": 0.0305, + "step": 3620 + }, + { + "epoch": 2.58, + "grad_norm": 34.83572138414297, + "learning_rate": 2.940386992317582e-06, + "loss": 0.073, + "step": 3621 + }, + { + "epoch": 2.59, + "grad_norm": 3.677257365244086, + "learning_rate": 2.937753777749996e-06, + "loss": 0.032, + "step": 3622 + }, + { + "epoch": 2.59, + "grad_norm": 13.762342920713616, + "learning_rate": 2.9351212522172056e-06, + "loss": 0.0811, + "step": 3623 + }, + { + "epoch": 2.59, + "grad_norm": 10.635154432789744, + "learning_rate": 2.9324894165987837e-06, + "loss": 0.0372, + "step": 3624 + }, + { + "epoch": 2.59, + "grad_norm": 11.482810263694958, + "learning_rate": 2.9298582717740797e-06, + "loss": 0.0728, + "step": 3625 + }, + { + "epoch": 2.59, + "grad_norm": 22.37942524618786, + "learning_rate": 2.9272278186222025e-06, + "loss": 0.0446, + "step": 3626 + }, + { + "epoch": 2.59, + "grad_norm": 9.388487880388006, + "learning_rate": 2.9245980580220405e-06, + "loss": 0.0342, + "step": 3627 + }, + { + "epoch": 2.59, + "grad_norm": 6.033256714344545, + "learning_rate": 2.921968990852242e-06, + "loss": 0.0581, + "step": 3628 + }, + { + "epoch": 2.59, + "grad_norm": 5.768022756457545, + "learning_rate": 2.9193406179912297e-06, + "loss": 0.0361, + "step": 3629 + }, + { + "epoch": 2.59, + "grad_norm": 8.174036176719323, + "learning_rate": 2.91671294031719e-06, + "loss": 0.037, + "step": 3630 + }, + { + "epoch": 2.59, + "grad_norm": 10.705237470263434, + "learning_rate": 2.91408595870808e-06, + "loss": 0.0402, + "step": 3631 + }, + { + "epoch": 2.59, + "grad_norm": 9.964496232913909, + "learning_rate": 2.9114596740416224e-06, + "loss": 0.0462, + "step": 3632 + }, + { + "epoch": 2.59, + "grad_norm": 10.145165458346774, + "learning_rate": 2.908834087195308e-06, + "loss": 0.046, + "step": 3633 + }, + { + "epoch": 2.59, + "grad_norm": 5.38954700874282, + "learning_rate": 2.9062091990463935e-06, + "loss": 0.0441, + "step": 3634 + }, + { + "epoch": 2.59, + "grad_norm": 13.879887451783997, + "learning_rate": 2.903585010471904e-06, + "loss": 0.0697, + "step": 3635 + }, + { + "epoch": 2.6, + "grad_norm": 14.995211769676875, + "learning_rate": 2.9009615223486297e-06, + "loss": 0.0709, + "step": 3636 + }, + { + "epoch": 2.6, + "grad_norm": 15.90824468650359, + "learning_rate": 2.898338735553128e-06, + "loss": 0.0475, + "step": 3637 + }, + { + "epoch": 2.6, + "grad_norm": 15.039762855178544, + "learning_rate": 2.895716650961714e-06, + "loss": 0.0751, + "step": 3638 + }, + { + "epoch": 2.6, + "grad_norm": 8.403369327327413, + "learning_rate": 2.8930952694504843e-06, + "loss": 0.0774, + "step": 3639 + }, + { + "epoch": 2.6, + "grad_norm": 2.68859343539843, + "learning_rate": 2.8904745918952833e-06, + "loss": 0.031, + "step": 3640 + }, + { + "epoch": 2.6, + "grad_norm": 7.863619642319471, + "learning_rate": 2.887854619171735e-06, + "loss": 0.0398, + "step": 3641 + }, + { + "epoch": 2.6, + "grad_norm": 7.469206438001187, + "learning_rate": 2.8852353521552135e-06, + "loss": 0.0381, + "step": 3642 + }, + { + "epoch": 2.6, + "grad_norm": 6.029345864226802, + "learning_rate": 2.8826167917208727e-06, + "loss": 0.0405, + "step": 3643 + }, + { + "epoch": 2.6, + "grad_norm": 3.487371662761291, + "learning_rate": 2.8799989387436137e-06, + "loss": 0.0357, + "step": 3644 + }, + { + "epoch": 2.6, + "grad_norm": 10.641719996018361, + "learning_rate": 2.8773817940981186e-06, + "loss": 0.0533, + "step": 3645 + }, + { + "epoch": 2.6, + "grad_norm": 6.635229300228402, + "learning_rate": 2.8747653586588183e-06, + "loss": 0.0632, + "step": 3646 + }, + { + "epoch": 2.6, + "grad_norm": 9.041375472119013, + "learning_rate": 2.872149633299913e-06, + "loss": 0.0475, + "step": 3647 + }, + { + "epoch": 2.6, + "grad_norm": 7.537386679553337, + "learning_rate": 2.8695346188953666e-06, + "loss": 0.0461, + "step": 3648 + }, + { + "epoch": 2.6, + "grad_norm": 5.14722877172696, + "learning_rate": 2.866920316318904e-06, + "loss": 0.048, + "step": 3649 + }, + { + "epoch": 2.61, + "grad_norm": 14.424463819672413, + "learning_rate": 2.8643067264440116e-06, + "loss": 0.0525, + "step": 3650 + }, + { + "epoch": 2.61, + "grad_norm": 13.853378771747693, + "learning_rate": 2.8616938501439384e-06, + "loss": 0.0692, + "step": 3651 + }, + { + "epoch": 2.61, + "grad_norm": 18.270438337227063, + "learning_rate": 2.8590816882916948e-06, + "loss": 0.0568, + "step": 3652 + }, + { + "epoch": 2.61, + "grad_norm": 18.947419081569944, + "learning_rate": 2.856470241760054e-06, + "loss": 0.0561, + "step": 3653 + }, + { + "epoch": 2.61, + "grad_norm": 4.429111310023221, + "learning_rate": 2.8538595114215472e-06, + "loss": 0.0626, + "step": 3654 + }, + { + "epoch": 2.61, + "grad_norm": 4.475739674756901, + "learning_rate": 2.8512494981484706e-06, + "loss": 0.0581, + "step": 3655 + }, + { + "epoch": 2.61, + "grad_norm": 7.771125239389673, + "learning_rate": 2.848640202812872e-06, + "loss": 0.0417, + "step": 3656 + }, + { + "epoch": 2.61, + "grad_norm": 23.062847695927893, + "learning_rate": 2.846031626286574e-06, + "loss": 0.0508, + "step": 3657 + }, + { + "epoch": 2.61, + "grad_norm": 11.972621678603849, + "learning_rate": 2.8434237694411414e-06, + "loss": 0.0488, + "step": 3658 + }, + { + "epoch": 2.61, + "grad_norm": 14.32588978216122, + "learning_rate": 2.840816633147917e-06, + "loss": 0.0611, + "step": 3659 + }, + { + "epoch": 2.61, + "grad_norm": 5.884934135428629, + "learning_rate": 2.8382102182779846e-06, + "loss": 0.0441, + "step": 3660 + }, + { + "epoch": 2.61, + "grad_norm": 9.351592698422403, + "learning_rate": 2.8356045257022037e-06, + "loss": 0.0354, + "step": 3661 + }, + { + "epoch": 2.61, + "grad_norm": 6.957720102055877, + "learning_rate": 2.832999556291177e-06, + "loss": 0.0375, + "step": 3662 + }, + { + "epoch": 2.61, + "grad_norm": 10.048379667158468, + "learning_rate": 2.8303953109152815e-06, + "loss": 0.0575, + "step": 3663 + }, + { + "epoch": 2.62, + "grad_norm": 16.31872426637752, + "learning_rate": 2.827791790444638e-06, + "loss": 0.045, + "step": 3664 + }, + { + "epoch": 2.62, + "grad_norm": 5.464392634847121, + "learning_rate": 2.8251889957491317e-06, + "loss": 0.0614, + "step": 3665 + }, + { + "epoch": 2.62, + "grad_norm": 7.507048236348415, + "learning_rate": 2.822586927698407e-06, + "loss": 0.0433, + "step": 3666 + }, + { + "epoch": 2.62, + "grad_norm": 9.366777000694293, + "learning_rate": 2.819985587161861e-06, + "loss": 0.0492, + "step": 3667 + }, + { + "epoch": 2.62, + "grad_norm": 18.342694251690276, + "learning_rate": 2.8173849750086513e-06, + "loss": 0.0646, + "step": 3668 + }, + { + "epoch": 2.62, + "grad_norm": 13.408885923969533, + "learning_rate": 2.8147850921076903e-06, + "loss": 0.072, + "step": 3669 + }, + { + "epoch": 2.62, + "grad_norm": 5.859363792224824, + "learning_rate": 2.8121859393276475e-06, + "loss": 0.0466, + "step": 3670 + }, + { + "epoch": 2.62, + "grad_norm": 6.544712210188832, + "learning_rate": 2.809587517536947e-06, + "loss": 0.0583, + "step": 3671 + }, + { + "epoch": 2.62, + "grad_norm": 10.49401514565673, + "learning_rate": 2.806989827603771e-06, + "loss": 0.053, + "step": 3672 + }, + { + "epoch": 2.62, + "grad_norm": 4.011757972273914, + "learning_rate": 2.8043928703960565e-06, + "loss": 0.0323, + "step": 3673 + }, + { + "epoch": 2.62, + "grad_norm": 2.960978152701493, + "learning_rate": 2.8017966467814933e-06, + "loss": 0.0303, + "step": 3674 + }, + { + "epoch": 2.62, + "grad_norm": 14.114822282619247, + "learning_rate": 2.7992011576275295e-06, + "loss": 0.0411, + "step": 3675 + }, + { + "epoch": 2.62, + "grad_norm": 7.15626451156831, + "learning_rate": 2.7966064038013657e-06, + "loss": 0.0564, + "step": 3676 + }, + { + "epoch": 2.62, + "grad_norm": 9.494513074846147, + "learning_rate": 2.7940123861699577e-06, + "loss": 0.0399, + "step": 3677 + }, + { + "epoch": 2.63, + "grad_norm": 5.069632762089934, + "learning_rate": 2.7914191056000147e-06, + "loss": 0.0523, + "step": 3678 + }, + { + "epoch": 2.63, + "grad_norm": 10.285203626675607, + "learning_rate": 2.788826562958e-06, + "loss": 0.0495, + "step": 3679 + }, + { + "epoch": 2.63, + "grad_norm": 12.855691316396912, + "learning_rate": 2.7862347591101326e-06, + "loss": 0.051, + "step": 3680 + }, + { + "epoch": 2.63, + "grad_norm": 8.43463699635779, + "learning_rate": 2.7836436949223755e-06, + "loss": 0.0451, + "step": 3681 + }, + { + "epoch": 2.63, + "grad_norm": 6.876214234656952, + "learning_rate": 2.78105337126046e-06, + "loss": 0.0444, + "step": 3682 + }, + { + "epoch": 2.63, + "grad_norm": 16.846348195822873, + "learning_rate": 2.7784637889898534e-06, + "loss": 0.0648, + "step": 3683 + }, + { + "epoch": 2.63, + "grad_norm": 5.6815124763738485, + "learning_rate": 2.7758749489757914e-06, + "loss": 0.0646, + "step": 3684 + }, + { + "epoch": 2.63, + "grad_norm": 10.381951130054908, + "learning_rate": 2.7732868520832455e-06, + "loss": 0.0453, + "step": 3685 + }, + { + "epoch": 2.63, + "grad_norm": 11.46923666061598, + "learning_rate": 2.770699499176954e-06, + "loss": 0.051, + "step": 3686 + }, + { + "epoch": 2.63, + "grad_norm": 5.726611333652352, + "learning_rate": 2.768112891121394e-06, + "loss": 0.0314, + "step": 3687 + }, + { + "epoch": 2.63, + "grad_norm": 15.030148911582819, + "learning_rate": 2.7655270287808045e-06, + "loss": 0.0557, + "step": 3688 + }, + { + "epoch": 2.63, + "grad_norm": 15.463593669759398, + "learning_rate": 2.762941913019166e-06, + "loss": 0.0581, + "step": 3689 + }, + { + "epoch": 2.63, + "grad_norm": 6.6863472657093315, + "learning_rate": 2.760357544700215e-06, + "loss": 0.0718, + "step": 3690 + }, + { + "epoch": 2.63, + "grad_norm": 14.381312588146585, + "learning_rate": 2.757773924687437e-06, + "loss": 0.0497, + "step": 3691 + }, + { + "epoch": 2.64, + "grad_norm": 8.724171215718298, + "learning_rate": 2.755191053844068e-06, + "loss": 0.0458, + "step": 3692 + }, + { + "epoch": 2.64, + "grad_norm": 8.172858308911552, + "learning_rate": 2.7526089330330925e-06, + "loss": 0.054, + "step": 3693 + }, + { + "epoch": 2.64, + "grad_norm": 8.290284625175802, + "learning_rate": 2.7500275631172455e-06, + "loss": 0.0347, + "step": 3694 + }, + { + "epoch": 2.64, + "grad_norm": 5.764092240236844, + "learning_rate": 2.74744694495901e-06, + "loss": 0.0269, + "step": 3695 + }, + { + "epoch": 2.64, + "grad_norm": 8.50315536403876, + "learning_rate": 2.74486707942062e-06, + "loss": 0.0414, + "step": 3696 + }, + { + "epoch": 2.64, + "grad_norm": 17.67269423715195, + "learning_rate": 2.7422879673640552e-06, + "loss": 0.0577, + "step": 3697 + }, + { + "epoch": 2.64, + "grad_norm": 13.286685334149144, + "learning_rate": 2.7397096096510467e-06, + "loss": 0.0527, + "step": 3698 + }, + { + "epoch": 2.64, + "grad_norm": 5.190362466075019, + "learning_rate": 2.7371320071430674e-06, + "loss": 0.0354, + "step": 3699 + }, + { + "epoch": 2.64, + "grad_norm": 7.222587422386342, + "learning_rate": 2.7345551607013475e-06, + "loss": 0.04, + "step": 3700 + }, + { + "epoch": 2.64, + "grad_norm": 4.454442361570336, + "learning_rate": 2.7319790711868545e-06, + "loss": 0.0438, + "step": 3701 + }, + { + "epoch": 2.64, + "grad_norm": 4.525361617331926, + "learning_rate": 2.7294037394603135e-06, + "loss": 0.0638, + "step": 3702 + }, + { + "epoch": 2.64, + "grad_norm": 6.9455554028374324, + "learning_rate": 2.7268291663821825e-06, + "loss": 0.0435, + "step": 3703 + }, + { + "epoch": 2.64, + "grad_norm": 4.868096567440874, + "learning_rate": 2.7242553528126842e-06, + "loss": 0.0508, + "step": 3704 + }, + { + "epoch": 2.64, + "grad_norm": 6.558699133066451, + "learning_rate": 2.72168229961177e-06, + "loss": 0.0406, + "step": 3705 + }, + { + "epoch": 2.65, + "grad_norm": 4.402340494429636, + "learning_rate": 2.7191100076391473e-06, + "loss": 0.0454, + "step": 3706 + }, + { + "epoch": 2.65, + "grad_norm": 12.420379796085895, + "learning_rate": 2.716538477754266e-06, + "loss": 0.0583, + "step": 3707 + }, + { + "epoch": 2.65, + "grad_norm": 6.1149424905859675, + "learning_rate": 2.713967710816323e-06, + "loss": 0.0345, + "step": 3708 + }, + { + "epoch": 2.65, + "grad_norm": 6.348271002030884, + "learning_rate": 2.7113977076842597e-06, + "loss": 0.0576, + "step": 3709 + }, + { + "epoch": 2.65, + "grad_norm": 7.584921780363273, + "learning_rate": 2.7088284692167604e-06, + "loss": 0.0529, + "step": 3710 + }, + { + "epoch": 2.65, + "grad_norm": 9.51983997797159, + "learning_rate": 2.7062599962722563e-06, + "loss": 0.0416, + "step": 3711 + }, + { + "epoch": 2.65, + "grad_norm": 7.319440057301574, + "learning_rate": 2.703692289708922e-06, + "loss": 0.0438, + "step": 3712 + }, + { + "epoch": 2.65, + "grad_norm": 26.01010056382804, + "learning_rate": 2.701125350384676e-06, + "loss": 0.0746, + "step": 3713 + }, + { + "epoch": 2.65, + "grad_norm": 11.765662745979094, + "learning_rate": 2.69855917915718e-06, + "loss": 0.0467, + "step": 3714 + }, + { + "epoch": 2.65, + "grad_norm": 23.418590205484847, + "learning_rate": 2.695993776883839e-06, + "loss": 0.0595, + "step": 3715 + }, + { + "epoch": 2.65, + "grad_norm": 14.703328795567954, + "learning_rate": 2.693429144421803e-06, + "loss": 0.0302, + "step": 3716 + }, + { + "epoch": 2.65, + "grad_norm": 23.578741481643345, + "learning_rate": 2.6908652826279623e-06, + "loss": 0.0658, + "step": 3717 + }, + { + "epoch": 2.65, + "grad_norm": 5.857687742436122, + "learning_rate": 2.688302192358952e-06, + "loss": 0.0638, + "step": 3718 + }, + { + "epoch": 2.65, + "grad_norm": 8.405158533466244, + "learning_rate": 2.6857398744711472e-06, + "loss": 0.0517, + "step": 3719 + }, + { + "epoch": 2.66, + "grad_norm": 20.846618406862838, + "learning_rate": 2.683178329820666e-06, + "loss": 0.074, + "step": 3720 + }, + { + "epoch": 2.66, + "grad_norm": 16.091758689157754, + "learning_rate": 2.680617559263368e-06, + "loss": 0.0647, + "step": 3721 + }, + { + "epoch": 2.66, + "grad_norm": 5.701177069467726, + "learning_rate": 2.6780575636548544e-06, + "loss": 0.0526, + "step": 3722 + }, + { + "epoch": 2.66, + "grad_norm": 13.59914895192763, + "learning_rate": 2.67549834385047e-06, + "loss": 0.0457, + "step": 3723 + }, + { + "epoch": 2.66, + "grad_norm": 6.660108186626482, + "learning_rate": 2.67293990070529e-06, + "loss": 0.0396, + "step": 3724 + }, + { + "epoch": 2.66, + "grad_norm": 3.818886071754114, + "learning_rate": 2.6703822350741483e-06, + "loss": 0.0388, + "step": 3725 + }, + { + "epoch": 2.66, + "grad_norm": 7.17966247766626, + "learning_rate": 2.6678253478116e-06, + "loss": 0.0617, + "step": 3726 + }, + { + "epoch": 2.66, + "grad_norm": 15.063926908466604, + "learning_rate": 2.665269239771953e-06, + "loss": 0.0401, + "step": 3727 + }, + { + "epoch": 2.66, + "grad_norm": 17.589769650176894, + "learning_rate": 2.662713911809248e-06, + "loss": 0.0502, + "step": 3728 + }, + { + "epoch": 2.66, + "grad_norm": 7.171926503654985, + "learning_rate": 2.6601593647772696e-06, + "loss": 0.0394, + "step": 3729 + }, + { + "epoch": 2.66, + "grad_norm": 11.235108267848334, + "learning_rate": 2.657605599529538e-06, + "loss": 0.055, + "step": 3730 + }, + { + "epoch": 2.66, + "grad_norm": 6.427120667435036, + "learning_rate": 2.6550526169193148e-06, + "loss": 0.0506, + "step": 3731 + }, + { + "epoch": 2.66, + "grad_norm": 3.6786946918606946, + "learning_rate": 2.6525004177995984e-06, + "loss": 0.0367, + "step": 3732 + }, + { + "epoch": 2.66, + "grad_norm": 11.498439922011594, + "learning_rate": 2.6499490030231255e-06, + "loss": 0.0499, + "step": 3733 + }, + { + "epoch": 2.67, + "grad_norm": 5.251312011644968, + "learning_rate": 2.6473983734423725e-06, + "loss": 0.0386, + "step": 3734 + }, + { + "epoch": 2.67, + "grad_norm": 8.575386085669063, + "learning_rate": 2.644848529909552e-06, + "loss": 0.0491, + "step": 3735 + }, + { + "epoch": 2.67, + "grad_norm": 8.930496092355702, + "learning_rate": 2.6422994732766124e-06, + "loss": 0.0331, + "step": 3736 + }, + { + "epoch": 2.67, + "grad_norm": 6.071195064464973, + "learning_rate": 2.6397512043952422e-06, + "loss": 0.0522, + "step": 3737 + }, + { + "epoch": 2.67, + "grad_norm": 7.623010868928134, + "learning_rate": 2.637203724116865e-06, + "loss": 0.0363, + "step": 3738 + }, + { + "epoch": 2.67, + "grad_norm": 9.317615708928468, + "learning_rate": 2.634657033292644e-06, + "loss": 0.0439, + "step": 3739 + }, + { + "epoch": 2.67, + "grad_norm": 4.679344162012358, + "learning_rate": 2.6321111327734693e-06, + "loss": 0.041, + "step": 3740 + }, + { + "epoch": 2.67, + "grad_norm": 10.628255304414862, + "learning_rate": 2.6295660234099816e-06, + "loss": 0.0474, + "step": 3741 + }, + { + "epoch": 2.67, + "grad_norm": 3.7547535821491653, + "learning_rate": 2.6270217060525416e-06, + "loss": 0.0542, + "step": 3742 + }, + { + "epoch": 2.67, + "grad_norm": 9.99181954191989, + "learning_rate": 2.624478181551261e-06, + "loss": 0.0369, + "step": 3743 + }, + { + "epoch": 2.67, + "grad_norm": 7.006118145287793, + "learning_rate": 2.62193545075597e-06, + "loss": 0.0498, + "step": 3744 + }, + { + "epoch": 2.67, + "grad_norm": 3.068279415859639, + "learning_rate": 2.6193935145162507e-06, + "loss": 0.0294, + "step": 3745 + }, + { + "epoch": 2.67, + "grad_norm": 8.203047800359975, + "learning_rate": 2.6168523736814035e-06, + "loss": 0.0416, + "step": 3746 + }, + { + "epoch": 2.67, + "grad_norm": 13.181075968942887, + "learning_rate": 2.6143120291004785e-06, + "loss": 0.0476, + "step": 3747 + }, + { + "epoch": 2.68, + "grad_norm": 5.395303454942057, + "learning_rate": 2.611772481622246e-06, + "loss": 0.0324, + "step": 3748 + }, + { + "epoch": 2.68, + "grad_norm": 8.82113177422998, + "learning_rate": 2.609233732095218e-06, + "loss": 0.0478, + "step": 3749 + }, + { + "epoch": 2.68, + "grad_norm": 14.848385000891767, + "learning_rate": 2.6066957813676375e-06, + "loss": 0.0469, + "step": 3750 + }, + { + "epoch": 2.68, + "grad_norm": 4.738432033718035, + "learning_rate": 2.604158630287482e-06, + "loss": 0.0406, + "step": 3751 + }, + { + "epoch": 2.68, + "grad_norm": 6.340406715674136, + "learning_rate": 2.60162227970246e-06, + "loss": 0.0437, + "step": 3752 + }, + { + "epoch": 2.68, + "grad_norm": 7.33847764367545, + "learning_rate": 2.5990867304600136e-06, + "loss": 0.0372, + "step": 3753 + }, + { + "epoch": 2.68, + "grad_norm": 3.166395335908612, + "learning_rate": 2.5965519834073172e-06, + "loss": 0.0388, + "step": 3754 + }, + { + "epoch": 2.68, + "grad_norm": 7.976267116665653, + "learning_rate": 2.5940180393912767e-06, + "loss": 0.0428, + "step": 3755 + }, + { + "epoch": 2.68, + "grad_norm": 13.601712677500139, + "learning_rate": 2.5914848992585293e-06, + "loss": 0.0574, + "step": 3756 + }, + { + "epoch": 2.68, + "grad_norm": 8.172580723390993, + "learning_rate": 2.588952563855448e-06, + "loss": 0.0438, + "step": 3757 + }, + { + "epoch": 2.68, + "grad_norm": 7.078598667043399, + "learning_rate": 2.5864210340281247e-06, + "loss": 0.041, + "step": 3758 + }, + { + "epoch": 2.68, + "grad_norm": 10.010293320385502, + "learning_rate": 2.5838903106224004e-06, + "loss": 0.0445, + "step": 3759 + }, + { + "epoch": 2.68, + "grad_norm": 8.32877855610212, + "learning_rate": 2.5813603944838283e-06, + "loss": 0.0509, + "step": 3760 + }, + { + "epoch": 2.68, + "grad_norm": 8.156187892342613, + "learning_rate": 2.578831286457708e-06, + "loss": 0.0385, + "step": 3761 + }, + { + "epoch": 2.69, + "grad_norm": 8.634329209857926, + "learning_rate": 2.5763029873890542e-06, + "loss": 0.0607, + "step": 3762 + }, + { + "epoch": 2.69, + "grad_norm": 10.967700021524228, + "learning_rate": 2.573775498122626e-06, + "loss": 0.0626, + "step": 3763 + }, + { + "epoch": 2.69, + "grad_norm": 7.039557541057048, + "learning_rate": 2.5712488195028972e-06, + "loss": 0.0526, + "step": 3764 + }, + { + "epoch": 2.69, + "grad_norm": 4.977594670286856, + "learning_rate": 2.5687229523740852e-06, + "loss": 0.0497, + "step": 3765 + }, + { + "epoch": 2.69, + "grad_norm": 14.378683618047381, + "learning_rate": 2.566197897580124e-06, + "loss": 0.0786, + "step": 3766 + }, + { + "epoch": 2.69, + "grad_norm": 11.70648699510383, + "learning_rate": 2.5636736559646824e-06, + "loss": 0.0514, + "step": 3767 + }, + { + "epoch": 2.69, + "grad_norm": 9.415184037952974, + "learning_rate": 2.5611502283711576e-06, + "loss": 0.0461, + "step": 3768 + }, + { + "epoch": 2.69, + "grad_norm": 5.61280952127481, + "learning_rate": 2.5586276156426726e-06, + "loss": 0.0415, + "step": 3769 + }, + { + "epoch": 2.69, + "grad_norm": 10.422217664943227, + "learning_rate": 2.55610581862208e-06, + "loss": 0.0533, + "step": 3770 + }, + { + "epoch": 2.69, + "grad_norm": 9.032558758185202, + "learning_rate": 2.553584838151959e-06, + "loss": 0.0513, + "step": 3771 + }, + { + "epoch": 2.69, + "grad_norm": 9.894389545613095, + "learning_rate": 2.5510646750746154e-06, + "loss": 0.0457, + "step": 3772 + }, + { + "epoch": 2.69, + "grad_norm": 9.655381132642049, + "learning_rate": 2.548545330232083e-06, + "loss": 0.0419, + "step": 3773 + }, + { + "epoch": 2.69, + "grad_norm": 5.7174887879327745, + "learning_rate": 2.5460268044661215e-06, + "loss": 0.0477, + "step": 3774 + }, + { + "epoch": 2.69, + "grad_norm": 16.20332294670294, + "learning_rate": 2.5435090986182176e-06, + "loss": 0.0663, + "step": 3775 + }, + { + "epoch": 2.7, + "grad_norm": 7.352189227072673, + "learning_rate": 2.5409922135295827e-06, + "loss": 0.0466, + "step": 3776 + }, + { + "epoch": 2.7, + "grad_norm": 5.547528471853324, + "learning_rate": 2.538476150041156e-06, + "loss": 0.032, + "step": 3777 + }, + { + "epoch": 2.7, + "grad_norm": 9.148875990223138, + "learning_rate": 2.5359609089936006e-06, + "loss": 0.0935, + "step": 3778 + }, + { + "epoch": 2.7, + "grad_norm": 7.890951821809732, + "learning_rate": 2.533446491227305e-06, + "loss": 0.0345, + "step": 3779 + }, + { + "epoch": 2.7, + "grad_norm": 11.425432370389863, + "learning_rate": 2.5309328975823834e-06, + "loss": 0.051, + "step": 3780 + }, + { + "epoch": 2.7, + "grad_norm": 7.139640801191179, + "learning_rate": 2.5284201288986744e-06, + "loss": 0.0556, + "step": 3781 + }, + { + "epoch": 2.7, + "grad_norm": 12.38512234939895, + "learning_rate": 2.5259081860157418e-06, + "loss": 0.0362, + "step": 3782 + }, + { + "epoch": 2.7, + "grad_norm": 6.283048395599437, + "learning_rate": 2.5233970697728673e-06, + "loss": 0.0679, + "step": 3783 + }, + { + "epoch": 2.7, + "grad_norm": 7.575758374522927, + "learning_rate": 2.520886781009068e-06, + "loss": 0.0566, + "step": 3784 + }, + { + "epoch": 2.7, + "grad_norm": 4.885508330549097, + "learning_rate": 2.5183773205630726e-06, + "loss": 0.0316, + "step": 3785 + }, + { + "epoch": 2.7, + "grad_norm": 15.694594828315246, + "learning_rate": 2.515868689273344e-06, + "loss": 0.0845, + "step": 3786 + }, + { + "epoch": 2.7, + "grad_norm": 9.405452769181828, + "learning_rate": 2.513360887978056e-06, + "loss": 0.0417, + "step": 3787 + }, + { + "epoch": 2.7, + "grad_norm": 3.6408201726978895, + "learning_rate": 2.510853917515119e-06, + "loss": 0.0554, + "step": 3788 + }, + { + "epoch": 2.7, + "grad_norm": 4.7474944897018405, + "learning_rate": 2.50834777872215e-06, + "loss": 0.036, + "step": 3789 + }, + { + "epoch": 2.71, + "grad_norm": 6.32046437072879, + "learning_rate": 2.505842472436506e-06, + "loss": 0.0375, + "step": 3790 + }, + { + "epoch": 2.71, + "grad_norm": 4.876168319824173, + "learning_rate": 2.5033379994952493e-06, + "loss": 0.0405, + "step": 3791 + }, + { + "epoch": 2.71, + "grad_norm": 3.4487084790696407, + "learning_rate": 2.5008343607351733e-06, + "loss": 0.0408, + "step": 3792 + }, + { + "epoch": 2.71, + "grad_norm": 21.368691291832302, + "learning_rate": 2.4983315569927895e-06, + "loss": 0.0717, + "step": 3793 + }, + { + "epoch": 2.71, + "grad_norm": 5.394868150765753, + "learning_rate": 2.495829589104333e-06, + "loss": 0.0406, + "step": 3794 + }, + { + "epoch": 2.71, + "grad_norm": 7.701013264562421, + "learning_rate": 2.493328457905755e-06, + "loss": 0.0374, + "step": 3795 + }, + { + "epoch": 2.71, + "grad_norm": 9.241059362038095, + "learning_rate": 2.490828164232732e-06, + "loss": 0.0476, + "step": 3796 + }, + { + "epoch": 2.71, + "grad_norm": 10.070938365629022, + "learning_rate": 2.4883287089206582e-06, + "loss": 0.0363, + "step": 3797 + }, + { + "epoch": 2.71, + "grad_norm": 9.776212143814869, + "learning_rate": 2.48583009280465e-06, + "loss": 0.0464, + "step": 3798 + }, + { + "epoch": 2.71, + "grad_norm": 3.2565285582268926, + "learning_rate": 2.483332316719535e-06, + "loss": 0.0363, + "step": 3799 + }, + { + "epoch": 2.71, + "grad_norm": 3.0244463261051457, + "learning_rate": 2.4808353814998747e-06, + "loss": 0.0432, + "step": 3800 + }, + { + "epoch": 2.71, + "grad_norm": 11.23437776694193, + "learning_rate": 2.4783392879799345e-06, + "loss": 0.0754, + "step": 3801 + }, + { + "epoch": 2.71, + "grad_norm": 7.504406874948566, + "learning_rate": 2.4758440369937125e-06, + "loss": 0.0468, + "step": 3802 + }, + { + "epoch": 2.71, + "grad_norm": 11.839846713260169, + "learning_rate": 2.4733496293749116e-06, + "loss": 0.0382, + "step": 3803 + }, + { + "epoch": 2.72, + "grad_norm": 15.580980042235241, + "learning_rate": 2.4708560659569665e-06, + "loss": 0.0377, + "step": 3804 + }, + { + "epoch": 2.72, + "grad_norm": 4.915771833170678, + "learning_rate": 2.4683633475730158e-06, + "loss": 0.0426, + "step": 3805 + }, + { + "epoch": 2.72, + "grad_norm": 3.7801116481824035, + "learning_rate": 2.465871475055931e-06, + "loss": 0.0416, + "step": 3806 + }, + { + "epoch": 2.72, + "grad_norm": 13.257844295682492, + "learning_rate": 2.4633804492382866e-06, + "loss": 0.0469, + "step": 3807 + }, + { + "epoch": 2.72, + "grad_norm": 2.321691675555728, + "learning_rate": 2.460890270952383e-06, + "loss": 0.0398, + "step": 3808 + }, + { + "epoch": 2.72, + "grad_norm": 5.925469089728795, + "learning_rate": 2.4584009410302357e-06, + "loss": 0.0483, + "step": 3809 + }, + { + "epoch": 2.72, + "grad_norm": 5.729798068275378, + "learning_rate": 2.4559124603035744e-06, + "loss": 0.0408, + "step": 3810 + }, + { + "epoch": 2.72, + "grad_norm": 8.12961700972875, + "learning_rate": 2.4534248296038488e-06, + "loss": 0.0488, + "step": 3811 + }, + { + "epoch": 2.72, + "grad_norm": 3.6114805207114986, + "learning_rate": 2.4509380497622208e-06, + "loss": 0.031, + "step": 3812 + }, + { + "epoch": 2.72, + "grad_norm": 10.032594319995873, + "learning_rate": 2.448452121609571e-06, + "loss": 0.0432, + "step": 3813 + }, + { + "epoch": 2.72, + "grad_norm": 6.816652452453811, + "learning_rate": 2.445967045976493e-06, + "loss": 0.0637, + "step": 3814 + }, + { + "epoch": 2.72, + "grad_norm": 8.219953176187644, + "learning_rate": 2.443482823693298e-06, + "loss": 0.0431, + "step": 3815 + }, + { + "epoch": 2.72, + "grad_norm": 6.300399021489175, + "learning_rate": 2.4409994555900125e-06, + "loss": 0.0393, + "step": 3816 + }, + { + "epoch": 2.72, + "grad_norm": 10.553580034207647, + "learning_rate": 2.4385169424963696e-06, + "loss": 0.0486, + "step": 3817 + }, + { + "epoch": 2.73, + "grad_norm": 3.250056961602972, + "learning_rate": 2.4360352852418305e-06, + "loss": 0.038, + "step": 3818 + }, + { + "epoch": 2.73, + "grad_norm": 2.5109869107834233, + "learning_rate": 2.4335544846555564e-06, + "loss": 0.0341, + "step": 3819 + }, + { + "epoch": 2.73, + "grad_norm": 3.141809200082095, + "learning_rate": 2.431074541566436e-06, + "loss": 0.0346, + "step": 3820 + }, + { + "epoch": 2.73, + "grad_norm": 8.93333143773801, + "learning_rate": 2.4285954568030566e-06, + "loss": 0.0596, + "step": 3821 + }, + { + "epoch": 2.73, + "grad_norm": 2.8270654820116854, + "learning_rate": 2.426117231193735e-06, + "loss": 0.0378, + "step": 3822 + }, + { + "epoch": 2.73, + "grad_norm": 6.114877716065037, + "learning_rate": 2.4236398655664834e-06, + "loss": 0.0361, + "step": 3823 + }, + { + "epoch": 2.73, + "grad_norm": 11.797891191634468, + "learning_rate": 2.4211633607490442e-06, + "loss": 0.0575, + "step": 3824 + }, + { + "epoch": 2.73, + "grad_norm": 10.73316573754624, + "learning_rate": 2.4186877175688576e-06, + "loss": 0.0678, + "step": 3825 + }, + { + "epoch": 2.73, + "grad_norm": 5.071528261076872, + "learning_rate": 2.4162129368530848e-06, + "loss": 0.053, + "step": 3826 + }, + { + "epoch": 2.73, + "grad_norm": 5.82366753919729, + "learning_rate": 2.413739019428595e-06, + "loss": 0.0371, + "step": 3827 + }, + { + "epoch": 2.73, + "grad_norm": 4.408441529976167, + "learning_rate": 2.41126596612197e-06, + "loss": 0.0352, + "step": 3828 + }, + { + "epoch": 2.73, + "grad_norm": 2.9553768058089136, + "learning_rate": 2.408793777759504e-06, + "loss": 0.0439, + "step": 3829 + }, + { + "epoch": 2.73, + "grad_norm": 7.364755485236182, + "learning_rate": 2.4063224551672e-06, + "loss": 0.0334, + "step": 3830 + }, + { + "epoch": 2.73, + "grad_norm": 7.684527850853966, + "learning_rate": 2.4038519991707725e-06, + "loss": 0.0483, + "step": 3831 + }, + { + "epoch": 2.74, + "grad_norm": 9.183703665017836, + "learning_rate": 2.4013824105956483e-06, + "loss": 0.054, + "step": 3832 + }, + { + "epoch": 2.74, + "grad_norm": 4.171026788107983, + "learning_rate": 2.3989136902669614e-06, + "loss": 0.0352, + "step": 3833 + }, + { + "epoch": 2.74, + "grad_norm": 19.60009405514915, + "learning_rate": 2.396445839009558e-06, + "loss": 0.0358, + "step": 3834 + }, + { + "epoch": 2.74, + "grad_norm": 4.711556978156083, + "learning_rate": 2.3939788576479926e-06, + "loss": 0.0421, + "step": 3835 + }, + { + "epoch": 2.74, + "grad_norm": 22.38928410529173, + "learning_rate": 2.39151274700653e-06, + "loss": 0.0849, + "step": 3836 + }, + { + "epoch": 2.74, + "grad_norm": 10.767567534815475, + "learning_rate": 2.389047507909143e-06, + "loss": 0.0384, + "step": 3837 + }, + { + "epoch": 2.74, + "grad_norm": 12.836118570888207, + "learning_rate": 2.3865831411795137e-06, + "loss": 0.0568, + "step": 3838 + }, + { + "epoch": 2.74, + "grad_norm": 15.689055796114143, + "learning_rate": 2.3841196476410337e-06, + "loss": 0.0585, + "step": 3839 + }, + { + "epoch": 2.74, + "grad_norm": 7.494309674312284, + "learning_rate": 2.3816570281168016e-06, + "loss": 0.0571, + "step": 3840 + }, + { + "epoch": 2.74, + "grad_norm": 16.871108256802874, + "learning_rate": 2.379195283429626e-06, + "loss": 0.0378, + "step": 3841 + }, + { + "epoch": 2.74, + "grad_norm": 9.679241876905133, + "learning_rate": 2.3767344144020164e-06, + "loss": 0.0595, + "step": 3842 + }, + { + "epoch": 2.74, + "grad_norm": 2.73030324100972, + "learning_rate": 2.374274421856202e-06, + "loss": 0.0332, + "step": 3843 + }, + { + "epoch": 2.74, + "grad_norm": 4.984926898926288, + "learning_rate": 2.371815306614104e-06, + "loss": 0.0441, + "step": 3844 + }, + { + "epoch": 2.74, + "grad_norm": 6.959555864531863, + "learning_rate": 2.3693570694973673e-06, + "loss": 0.0458, + "step": 3845 + }, + { + "epoch": 2.75, + "grad_norm": 19.08144651468584, + "learning_rate": 2.366899711327326e-06, + "loss": 0.0403, + "step": 3846 + }, + { + "epoch": 2.75, + "grad_norm": 7.685927254780913, + "learning_rate": 2.3644432329250374e-06, + "loss": 0.0418, + "step": 3847 + }, + { + "epoch": 2.75, + "grad_norm": 3.733683829988181, + "learning_rate": 2.3619876351112486e-06, + "loss": 0.0475, + "step": 3848 + }, + { + "epoch": 2.75, + "grad_norm": 7.2819284808845905, + "learning_rate": 2.3595329187064282e-06, + "loss": 0.041, + "step": 3849 + }, + { + "epoch": 2.75, + "grad_norm": 8.790602910243651, + "learning_rate": 2.3570790845307367e-06, + "loss": 0.0455, + "step": 3850 + }, + { + "epoch": 2.75, + "grad_norm": 5.762808139392179, + "learning_rate": 2.3546261334040475e-06, + "loss": 0.0621, + "step": 3851 + }, + { + "epoch": 2.75, + "grad_norm": 18.647824678033906, + "learning_rate": 2.352174066145938e-06, + "loss": 0.0724, + "step": 3852 + }, + { + "epoch": 2.75, + "grad_norm": 7.320240282215148, + "learning_rate": 2.3497228835756887e-06, + "loss": 0.0392, + "step": 3853 + }, + { + "epoch": 2.75, + "grad_norm": 12.389189567383427, + "learning_rate": 2.3472725865122854e-06, + "loss": 0.0609, + "step": 3854 + }, + { + "epoch": 2.75, + "grad_norm": 10.228654260988792, + "learning_rate": 2.344823175774418e-06, + "loss": 0.0463, + "step": 3855 + }, + { + "epoch": 2.75, + "grad_norm": 6.773649330102428, + "learning_rate": 2.3423746521804796e-06, + "loss": 0.0524, + "step": 3856 + }, + { + "epoch": 2.75, + "grad_norm": 5.435268080793936, + "learning_rate": 2.339927016548568e-06, + "loss": 0.0579, + "step": 3857 + }, + { + "epoch": 2.75, + "grad_norm": 7.41772531916488, + "learning_rate": 2.3374802696964842e-06, + "loss": 0.0454, + "step": 3858 + }, + { + "epoch": 2.75, + "grad_norm": 10.022754338495929, + "learning_rate": 2.3350344124417336e-06, + "loss": 0.035, + "step": 3859 + }, + { + "epoch": 2.76, + "grad_norm": 10.610656647476898, + "learning_rate": 2.3325894456015154e-06, + "loss": 0.0523, + "step": 3860 + }, + { + "epoch": 2.76, + "grad_norm": 18.39360281841152, + "learning_rate": 2.3301453699927477e-06, + "loss": 0.0579, + "step": 3861 + }, + { + "epoch": 2.76, + "grad_norm": 6.048783311715747, + "learning_rate": 2.3277021864320332e-06, + "loss": 0.0506, + "step": 3862 + }, + { + "epoch": 2.76, + "grad_norm": 7.751672566932973, + "learning_rate": 2.325259895735693e-06, + "loss": 0.0463, + "step": 3863 + }, + { + "epoch": 2.76, + "grad_norm": 18.50386797364024, + "learning_rate": 2.322818498719734e-06, + "loss": 0.044, + "step": 3864 + }, + { + "epoch": 2.76, + "grad_norm": 12.855762559723866, + "learning_rate": 2.3203779961998795e-06, + "loss": 0.0297, + "step": 3865 + }, + { + "epoch": 2.76, + "grad_norm": 13.336159487903306, + "learning_rate": 2.317938388991541e-06, + "loss": 0.0365, + "step": 3866 + }, + { + "epoch": 2.76, + "grad_norm": 16.833109489814944, + "learning_rate": 2.3154996779098405e-06, + "loss": 0.058, + "step": 3867 + }, + { + "epoch": 2.76, + "grad_norm": 3.4464387255304665, + "learning_rate": 2.313061863769594e-06, + "loss": 0.0407, + "step": 3868 + }, + { + "epoch": 2.76, + "grad_norm": 5.236811429649089, + "learning_rate": 2.310624947385322e-06, + "loss": 0.0554, + "step": 3869 + }, + { + "epoch": 2.76, + "grad_norm": 17.42376851125036, + "learning_rate": 2.3081889295712434e-06, + "loss": 0.0373, + "step": 3870 + }, + { + "epoch": 2.76, + "grad_norm": 28.16530722294004, + "learning_rate": 2.3057538111412765e-06, + "loss": 0.0786, + "step": 3871 + }, + { + "epoch": 2.76, + "grad_norm": 3.5925463205431445, + "learning_rate": 2.3033195929090404e-06, + "loss": 0.0395, + "step": 3872 + }, + { + "epoch": 2.76, + "grad_norm": 9.615138697340234, + "learning_rate": 2.300886275687852e-06, + "loss": 0.0433, + "step": 3873 + }, + { + "epoch": 2.77, + "grad_norm": 18.44213000447938, + "learning_rate": 2.298453860290728e-06, + "loss": 0.0474, + "step": 3874 + }, + { + "epoch": 2.77, + "grad_norm": 14.67959710316391, + "learning_rate": 2.296022347530384e-06, + "loss": 0.0471, + "step": 3875 + }, + { + "epoch": 2.77, + "grad_norm": 10.905366471260221, + "learning_rate": 2.293591738219233e-06, + "loss": 0.0513, + "step": 3876 + }, + { + "epoch": 2.77, + "grad_norm": 11.816111298594338, + "learning_rate": 2.2911620331693867e-06, + "loss": 0.0495, + "step": 3877 + }, + { + "epoch": 2.77, + "grad_norm": 10.254614678819744, + "learning_rate": 2.2887332331926555e-06, + "loss": 0.06, + "step": 3878 + }, + { + "epoch": 2.77, + "grad_norm": 16.736637604289378, + "learning_rate": 2.2863053391005462e-06, + "loss": 0.0427, + "step": 3879 + }, + { + "epoch": 2.77, + "grad_norm": 18.759961357816216, + "learning_rate": 2.2838783517042628e-06, + "loss": 0.0543, + "step": 3880 + }, + { + "epoch": 2.77, + "grad_norm": 17.80455205698793, + "learning_rate": 2.281452271814708e-06, + "loss": 0.063, + "step": 3881 + }, + { + "epoch": 2.77, + "grad_norm": 19.60942208840727, + "learning_rate": 2.2790271002424794e-06, + "loss": 0.0605, + "step": 3882 + }, + { + "epoch": 2.77, + "grad_norm": 12.453488405455872, + "learning_rate": 2.276602837797872e-06, + "loss": 0.0385, + "step": 3883 + }, + { + "epoch": 2.77, + "grad_norm": 6.709907947321404, + "learning_rate": 2.274179485290879e-06, + "loss": 0.0788, + "step": 3884 + }, + { + "epoch": 2.77, + "grad_norm": 20.038024659232907, + "learning_rate": 2.271757043531184e-06, + "loss": 0.056, + "step": 3885 + }, + { + "epoch": 2.77, + "grad_norm": 9.050003866061806, + "learning_rate": 2.2693355133281706e-06, + "loss": 0.0639, + "step": 3886 + }, + { + "epoch": 2.77, + "grad_norm": 18.58808666026702, + "learning_rate": 2.266914895490918e-06, + "loss": 0.0564, + "step": 3887 + }, + { + "epoch": 2.78, + "grad_norm": 27.52059173829719, + "learning_rate": 2.2644951908282e-06, + "loss": 0.0483, + "step": 3888 + }, + { + "epoch": 2.78, + "grad_norm": 10.592134606963889, + "learning_rate": 2.262076400148484e-06, + "loss": 0.0627, + "step": 3889 + }, + { + "epoch": 2.78, + "grad_norm": 8.496491370108698, + "learning_rate": 2.2596585242599333e-06, + "loss": 0.0794, + "step": 3890 + }, + { + "epoch": 2.78, + "grad_norm": 17.493489913800556, + "learning_rate": 2.257241563970405e-06, + "loss": 0.0555, + "step": 3891 + }, + { + "epoch": 2.78, + "grad_norm": 13.568365004774797, + "learning_rate": 2.254825520087451e-06, + "loss": 0.064, + "step": 3892 + }, + { + "epoch": 2.78, + "grad_norm": 4.508400191683394, + "learning_rate": 2.2524103934183154e-06, + "loss": 0.0353, + "step": 3893 + }, + { + "epoch": 2.78, + "grad_norm": 12.32194315514444, + "learning_rate": 2.249996184769938e-06, + "loss": 0.0521, + "step": 3894 + }, + { + "epoch": 2.78, + "grad_norm": 18.4710969837349, + "learning_rate": 2.2475828949489504e-06, + "loss": 0.0694, + "step": 3895 + }, + { + "epoch": 2.78, + "grad_norm": 10.766594698490506, + "learning_rate": 2.2451705247616774e-06, + "loss": 0.0493, + "step": 3896 + }, + { + "epoch": 2.78, + "grad_norm": 9.107591867740075, + "learning_rate": 2.2427590750141364e-06, + "loss": 0.0403, + "step": 3897 + }, + { + "epoch": 2.78, + "grad_norm": 10.375009405030395, + "learning_rate": 2.240348546512039e-06, + "loss": 0.0589, + "step": 3898 + }, + { + "epoch": 2.78, + "grad_norm": 16.177260041162715, + "learning_rate": 2.237938940060786e-06, + "loss": 0.0407, + "step": 3899 + }, + { + "epoch": 2.78, + "grad_norm": 10.180497058246122, + "learning_rate": 2.235530256465474e-06, + "loss": 0.0477, + "step": 3900 + }, + { + "epoch": 2.78, + "grad_norm": 5.248521888185715, + "learning_rate": 2.233122496530884e-06, + "loss": 0.0443, + "step": 3901 + }, + { + "epoch": 2.79, + "grad_norm": 3.581699527389943, + "learning_rate": 2.2307156610615e-06, + "loss": 0.0369, + "step": 3902 + }, + { + "epoch": 2.79, + "grad_norm": 9.5196018314941, + "learning_rate": 2.2283097508614837e-06, + "loss": 0.0368, + "step": 3903 + }, + { + "epoch": 2.79, + "grad_norm": 3.2933709812401295, + "learning_rate": 2.225904766734702e-06, + "loss": 0.0442, + "step": 3904 + }, + { + "epoch": 2.79, + "grad_norm": 12.630550369303542, + "learning_rate": 2.2235007094846963e-06, + "loss": 0.0475, + "step": 3905 + }, + { + "epoch": 2.79, + "grad_norm": 10.408411930140902, + "learning_rate": 2.2210975799147143e-06, + "loss": 0.0349, + "step": 3906 + }, + { + "epoch": 2.79, + "grad_norm": 4.291974830052436, + "learning_rate": 2.21869537882768e-06, + "loss": 0.0429, + "step": 3907 + }, + { + "epoch": 2.79, + "grad_norm": 7.91993089844047, + "learning_rate": 2.21629410702622e-06, + "loss": 0.0311, + "step": 3908 + }, + { + "epoch": 2.79, + "grad_norm": 7.232659660731117, + "learning_rate": 2.2138937653126393e-06, + "loss": 0.0378, + "step": 3909 + }, + { + "epoch": 2.79, + "grad_norm": 6.846272738804037, + "learning_rate": 2.2114943544889366e-06, + "loss": 0.0432, + "step": 3910 + }, + { + "epoch": 2.79, + "grad_norm": 6.8102817293655065, + "learning_rate": 2.2090958753568013e-06, + "loss": 0.0545, + "step": 3911 + }, + { + "epoch": 2.79, + "grad_norm": 15.66677503356628, + "learning_rate": 2.206698328717609e-06, + "loss": 0.0679, + "step": 3912 + }, + { + "epoch": 2.79, + "grad_norm": 8.851017334282632, + "learning_rate": 2.2043017153724253e-06, + "loss": 0.0357, + "step": 3913 + }, + { + "epoch": 2.79, + "grad_norm": 4.348470099639396, + "learning_rate": 2.2019060361220036e-06, + "loss": 0.0399, + "step": 3914 + }, + { + "epoch": 2.79, + "grad_norm": 18.351757414727047, + "learning_rate": 2.199511291766783e-06, + "loss": 0.0485, + "step": 3915 + }, + { + "epoch": 2.8, + "grad_norm": 13.228376315595304, + "learning_rate": 2.1971174831068944e-06, + "loss": 0.0478, + "step": 3916 + }, + { + "epoch": 2.8, + "grad_norm": 4.989234913705305, + "learning_rate": 2.1947246109421514e-06, + "loss": 0.0687, + "step": 3917 + }, + { + "epoch": 2.8, + "grad_norm": 7.578293430391057, + "learning_rate": 2.192332676072061e-06, + "loss": 0.0698, + "step": 3918 + }, + { + "epoch": 2.8, + "grad_norm": 19.50726319750263, + "learning_rate": 2.189941679295807e-06, + "loss": 0.0401, + "step": 3919 + }, + { + "epoch": 2.8, + "grad_norm": 7.056050545567341, + "learning_rate": 2.1875516214122723e-06, + "loss": 0.0625, + "step": 3920 + }, + { + "epoch": 2.8, + "grad_norm": 6.43155113737614, + "learning_rate": 2.185162503220013e-06, + "loss": 0.0531, + "step": 3921 + }, + { + "epoch": 2.8, + "grad_norm": 7.53621423016585, + "learning_rate": 2.182774325517285e-06, + "loss": 0.0531, + "step": 3922 + }, + { + "epoch": 2.8, + "grad_norm": 5.175804512791806, + "learning_rate": 2.180387089102016e-06, + "loss": 0.0406, + "step": 3923 + }, + { + "epoch": 2.8, + "grad_norm": 5.263521350393166, + "learning_rate": 2.1780007947718336e-06, + "loss": 0.0337, + "step": 3924 + }, + { + "epoch": 2.8, + "grad_norm": 6.01139176635299, + "learning_rate": 2.175615443324035e-06, + "loss": 0.0613, + "step": 3925 + }, + { + "epoch": 2.8, + "grad_norm": 11.818360574772607, + "learning_rate": 2.173231035555618e-06, + "loss": 0.0646, + "step": 3926 + }, + { + "epoch": 2.8, + "grad_norm": 3.039048127733819, + "learning_rate": 2.170847572263252e-06, + "loss": 0.0482, + "step": 3927 + }, + { + "epoch": 2.8, + "grad_norm": 6.788911571364, + "learning_rate": 2.1684650542432985e-06, + "loss": 0.0548, + "step": 3928 + }, + { + "epoch": 2.8, + "grad_norm": 4.333329054284375, + "learning_rate": 2.166083482291801e-06, + "loss": 0.0405, + "step": 3929 + }, + { + "epoch": 2.81, + "grad_norm": 7.014460540954454, + "learning_rate": 2.1637028572044867e-06, + "loss": 0.0438, + "step": 3930 + }, + { + "epoch": 2.81, + "grad_norm": 5.847965777925317, + "learning_rate": 2.1613231797767668e-06, + "loss": 0.066, + "step": 3931 + }, + { + "epoch": 2.81, + "grad_norm": 8.636014633570216, + "learning_rate": 2.158944450803736e-06, + "loss": 0.0629, + "step": 3932 + }, + { + "epoch": 2.81, + "grad_norm": 29.011178688159877, + "learning_rate": 2.1565666710801714e-06, + "loss": 0.0831, + "step": 3933 + }, + { + "epoch": 2.81, + "grad_norm": 4.645775512919015, + "learning_rate": 2.1541898414005343e-06, + "loss": 0.0553, + "step": 3934 + }, + { + "epoch": 2.81, + "grad_norm": 4.5723835589756465, + "learning_rate": 2.1518139625589663e-06, + "loss": 0.0328, + "step": 3935 + }, + { + "epoch": 2.81, + "grad_norm": 9.046555105138385, + "learning_rate": 2.1494390353492935e-06, + "loss": 0.0498, + "step": 3936 + }, + { + "epoch": 2.81, + "grad_norm": 5.551419544385709, + "learning_rate": 2.1470650605650235e-06, + "loss": 0.0357, + "step": 3937 + }, + { + "epoch": 2.81, + "grad_norm": 5.766470083781369, + "learning_rate": 2.144692038999345e-06, + "loss": 0.0416, + "step": 3938 + }, + { + "epoch": 2.81, + "grad_norm": 11.936429313135362, + "learning_rate": 2.142319971445129e-06, + "loss": 0.0496, + "step": 3939 + }, + { + "epoch": 2.81, + "grad_norm": 4.825676884200282, + "learning_rate": 2.139948858694926e-06, + "loss": 0.0468, + "step": 3940 + }, + { + "epoch": 2.81, + "grad_norm": 4.708123288700695, + "learning_rate": 2.137578701540971e-06, + "loss": 0.0472, + "step": 3941 + }, + { + "epoch": 2.81, + "grad_norm": 3.6296186806200073, + "learning_rate": 2.1352095007751754e-06, + "loss": 0.045, + "step": 3942 + }, + { + "epoch": 2.81, + "grad_norm": 5.596968703398852, + "learning_rate": 2.132841257189137e-06, + "loss": 0.0561, + "step": 3943 + }, + { + "epoch": 2.82, + "grad_norm": 3.7089513216445256, + "learning_rate": 2.1304739715741235e-06, + "loss": 0.0627, + "step": 3944 + }, + { + "epoch": 2.82, + "grad_norm": 3.45323572738163, + "learning_rate": 2.128107644721096e-06, + "loss": 0.0333, + "step": 3945 + }, + { + "epoch": 2.82, + "grad_norm": 5.788593095757397, + "learning_rate": 2.1257422774206816e-06, + "loss": 0.045, + "step": 3946 + }, + { + "epoch": 2.82, + "grad_norm": 9.321850934523075, + "learning_rate": 2.1233778704632002e-06, + "loss": 0.0383, + "step": 3947 + }, + { + "epoch": 2.82, + "grad_norm": 10.186691315669988, + "learning_rate": 2.1210144246386378e-06, + "loss": 0.0892, + "step": 3948 + }, + { + "epoch": 2.82, + "grad_norm": 8.396952907923813, + "learning_rate": 2.1186519407366725e-06, + "loss": 0.0443, + "step": 3949 + }, + { + "epoch": 2.82, + "grad_norm": 2.555619165023534, + "learning_rate": 2.1162904195466455e-06, + "loss": 0.0361, + "step": 3950 + }, + { + "epoch": 2.82, + "grad_norm": 8.059103832940904, + "learning_rate": 2.113929861857594e-06, + "loss": 0.0584, + "step": 3951 + }, + { + "epoch": 2.82, + "grad_norm": 5.033265961432398, + "learning_rate": 2.1115702684582177e-06, + "loss": 0.0424, + "step": 3952 + }, + { + "epoch": 2.82, + "grad_norm": 4.060885661594244, + "learning_rate": 2.1092116401369033e-06, + "loss": 0.0406, + "step": 3953 + }, + { + "epoch": 2.82, + "grad_norm": 2.7540242910146486, + "learning_rate": 2.1068539776817115e-06, + "loss": 0.0282, + "step": 3954 + }, + { + "epoch": 2.82, + "grad_norm": 4.013085063098618, + "learning_rate": 2.1044972818803816e-06, + "loss": 0.0482, + "step": 3955 + }, + { + "epoch": 2.82, + "grad_norm": 11.249338957476022, + "learning_rate": 2.1021415535203294e-06, + "loss": 0.0404, + "step": 3956 + }, + { + "epoch": 2.82, + "grad_norm": 11.67130080350898, + "learning_rate": 2.0997867933886467e-06, + "loss": 0.0558, + "step": 3957 + }, + { + "epoch": 2.83, + "grad_norm": 11.149614862400087, + "learning_rate": 2.0974330022721044e-06, + "loss": 0.0512, + "step": 3958 + }, + { + "epoch": 2.83, + "grad_norm": 3.687014650829844, + "learning_rate": 2.0950801809571466e-06, + "loss": 0.0612, + "step": 3959 + }, + { + "epoch": 2.83, + "grad_norm": 4.395001654489367, + "learning_rate": 2.0927283302298944e-06, + "loss": 0.0432, + "step": 3960 + }, + { + "epoch": 2.83, + "grad_norm": 5.284072609548135, + "learning_rate": 2.0903774508761477e-06, + "loss": 0.0447, + "step": 3961 + }, + { + "epoch": 2.83, + "grad_norm": 3.343071674546592, + "learning_rate": 2.0880275436813726e-06, + "loss": 0.0218, + "step": 3962 + }, + { + "epoch": 2.83, + "grad_norm": 6.360795545549215, + "learning_rate": 2.0856786094307247e-06, + "loss": 0.053, + "step": 3963 + }, + { + "epoch": 2.83, + "grad_norm": 6.794862698130726, + "learning_rate": 2.0833306489090186e-06, + "loss": 0.0465, + "step": 3964 + }, + { + "epoch": 2.83, + "grad_norm": 5.79082889481297, + "learning_rate": 2.08098366290076e-06, + "loss": 0.0308, + "step": 3965 + }, + { + "epoch": 2.83, + "grad_norm": 10.3999370599744, + "learning_rate": 2.078637652190112e-06, + "loss": 0.0356, + "step": 3966 + }, + { + "epoch": 2.83, + "grad_norm": 5.198526561829731, + "learning_rate": 2.0762926175609287e-06, + "loss": 0.045, + "step": 3967 + }, + { + "epoch": 2.83, + "grad_norm": 7.195220899911224, + "learning_rate": 2.0739485597967237e-06, + "loss": 0.064, + "step": 3968 + }, + { + "epoch": 2.83, + "grad_norm": 5.666052330346906, + "learning_rate": 2.0716054796806916e-06, + "loss": 0.0467, + "step": 3969 + }, + { + "epoch": 2.83, + "grad_norm": 4.662416911777877, + "learning_rate": 2.0692633779956998e-06, + "loss": 0.0317, + "step": 3970 + }, + { + "epoch": 2.83, + "grad_norm": 8.017085740979171, + "learning_rate": 2.0669222555242884e-06, + "loss": 0.0516, + "step": 3971 + }, + { + "epoch": 2.84, + "grad_norm": 7.265789210058505, + "learning_rate": 2.064582113048669e-06, + "loss": 0.0457, + "step": 3972 + }, + { + "epoch": 2.84, + "grad_norm": 8.795311756133641, + "learning_rate": 2.0622429513507275e-06, + "loss": 0.0427, + "step": 3973 + }, + { + "epoch": 2.84, + "grad_norm": 6.9365340050039865, + "learning_rate": 2.05990477121202e-06, + "loss": 0.063, + "step": 3974 + }, + { + "epoch": 2.84, + "grad_norm": 3.914484245239362, + "learning_rate": 2.0575675734137773e-06, + "loss": 0.0345, + "step": 3975 + }, + { + "epoch": 2.84, + "grad_norm": 5.314654954698931, + "learning_rate": 2.0552313587369003e-06, + "loss": 0.0421, + "step": 3976 + }, + { + "epoch": 2.84, + "grad_norm": 3.3533779142183926, + "learning_rate": 2.052896127961963e-06, + "loss": 0.0424, + "step": 3977 + }, + { + "epoch": 2.84, + "grad_norm": 19.476559355360912, + "learning_rate": 2.050561881869205e-06, + "loss": 0.0509, + "step": 3978 + }, + { + "epoch": 2.84, + "grad_norm": 6.938628294930286, + "learning_rate": 2.048228621238547e-06, + "loss": 0.0466, + "step": 3979 + }, + { + "epoch": 2.84, + "grad_norm": 10.94306529980738, + "learning_rate": 2.0458963468495692e-06, + "loss": 0.0442, + "step": 3980 + }, + { + "epoch": 2.84, + "grad_norm": 4.791706216599774, + "learning_rate": 2.0435650594815338e-06, + "loss": 0.0298, + "step": 3981 + }, + { + "epoch": 2.84, + "grad_norm": 20.151160656020064, + "learning_rate": 2.0412347599133607e-06, + "loss": 0.0598, + "step": 3982 + }, + { + "epoch": 2.84, + "grad_norm": 12.255216096956687, + "learning_rate": 2.0389054489236534e-06, + "loss": 0.0381, + "step": 3983 + }, + { + "epoch": 2.84, + "grad_norm": 4.855045793700611, + "learning_rate": 2.03657712729067e-06, + "loss": 0.0604, + "step": 3984 + }, + { + "epoch": 2.84, + "grad_norm": 8.182350191418786, + "learning_rate": 2.034249795792355e-06, + "loss": 0.0321, + "step": 3985 + }, + { + "epoch": 2.85, + "grad_norm": 7.707765374797664, + "learning_rate": 2.031923455206306e-06, + "loss": 0.0521, + "step": 3986 + }, + { + "epoch": 2.85, + "grad_norm": 6.104221297033886, + "learning_rate": 2.0295981063098e-06, + "loss": 0.0394, + "step": 3987 + }, + { + "epoch": 2.85, + "grad_norm": 11.93087451550472, + "learning_rate": 2.027273749879777e-06, + "loss": 0.0406, + "step": 3988 + }, + { + "epoch": 2.85, + "grad_norm": 5.2406447108361265, + "learning_rate": 2.02495038669285e-06, + "loss": 0.0346, + "step": 3989 + }, + { + "epoch": 2.85, + "grad_norm": 13.604224730402509, + "learning_rate": 2.0226280175252966e-06, + "loss": 0.0383, + "step": 3990 + }, + { + "epoch": 2.85, + "grad_norm": 2.555103971529308, + "learning_rate": 2.020306643153063e-06, + "loss": 0.0337, + "step": 3991 + }, + { + "epoch": 2.85, + "grad_norm": 6.369185477762121, + "learning_rate": 2.0179862643517657e-06, + "loss": 0.0542, + "step": 3992 + }, + { + "epoch": 2.85, + "grad_norm": 3.4873107821184797, + "learning_rate": 2.015666881896684e-06, + "loss": 0.0335, + "step": 3993 + }, + { + "epoch": 2.85, + "grad_norm": 6.806890531404817, + "learning_rate": 2.0133484965627683e-06, + "loss": 0.0486, + "step": 3994 + }, + { + "epoch": 2.85, + "grad_norm": 11.163092400988585, + "learning_rate": 2.0110311091246333e-06, + "loss": 0.0443, + "step": 3995 + }, + { + "epoch": 2.85, + "grad_norm": 8.775320024586614, + "learning_rate": 2.0087147203565614e-06, + "loss": 0.0558, + "step": 3996 + }, + { + "epoch": 2.85, + "grad_norm": 4.638711681258063, + "learning_rate": 2.0063993310325013e-06, + "loss": 0.0591, + "step": 3997 + }, + { + "epoch": 2.85, + "grad_norm": 11.201379197569311, + "learning_rate": 2.0040849419260682e-06, + "loss": 0.0336, + "step": 3998 + }, + { + "epoch": 2.85, + "grad_norm": 8.662046945406965, + "learning_rate": 2.0017715538105416e-06, + "loss": 0.05, + "step": 3999 + }, + { + "epoch": 2.86, + "grad_norm": 22.627187773041392, + "learning_rate": 1.9994591674588677e-06, + "loss": 0.0756, + "step": 4000 + }, + { + "epoch": 2.86, + "eval_avg_AUC": 0.8313536271084199, + "eval_avg_Accuracy": 0.7335875331564987, + "eval_avg_Accuracy-right": 0.8922655536715794, + "eval_avg_Accuracy-wrong": 0.4569024334773709, + "eval_avg_Num questions with both labels": 523, + "eval_avg_Question-wise AUC": 0.7065878301129395, + "eval_last_AUC": 0.8360746856019163, + "eval_last_Accuracy": 0.7791777188328912, + "eval_last_Accuracy-right": 0.8478544411112561, + "eval_last_Accuracy-wrong": 0.6594268819649761, + "eval_last_Num questions with both labels": 523, + "eval_last_Question-wise AUC": 0.7097758926367477, + "eval_max_AUC": 0.779076192285081, + "eval_max_Accuracy": 0.6495358090185677, + "eval_max_Accuracy-right": 0.9808921351245597, + "eval_max_Accuracy-wrong": 0.07175346827382306, + "eval_max_Num questions with both labels": 523, + "eval_max_Question-wise AUC": 0.6517312365875962, + "eval_min_AUC": 0.8398533477848513, + "eval_min_Accuracy": 0.7703498010610079, + "eval_min_Accuracy-right": 0.7896178427024912, + "eval_min_Accuracy-wrong": 0.7367523311348647, + "eval_min_Num questions with both labels": 523, + "eval_min_Question-wise AUC": 0.7123435959578684, + "eval_prod_AUC": 0.834872802991024, + "eval_prod_Accuracy": 0.7446949602122016, + "eval_prod_Accuracy-right": 0.6746445806704057, + "eval_prod_Accuracy-wrong": 0.8668410279736184, + "eval_prod_Num questions with both labels": 523, + "eval_prod_Question-wise AUC": 0.707421683014454, + "eval_runtime": 248.5729, + "eval_samples_per_second": 97.066, + "eval_steps_per_second": 3.033, + "eval_sum_AUC": 0.7054043711817337, + "eval_sum_Accuracy": 0.6395059681697612, + "eval_sum_Accuracy-right": 0.9969349158732229, + "eval_sum_Accuracy-wrong": 0.016261087104844214, + "eval_sum_Num questions with both labels": 523, + "eval_sum_Question-wise AUC": 0.682774019703003, + "step": 4000 + }, + { + "epoch": 2.86, + "grad_norm": 11.448506614643406, + "learning_rate": 1.9971477836436575e-06, + "loss": 0.0907, + "step": 4001 + }, + { + "epoch": 2.86, + "grad_norm": 17.106042357096115, + "learning_rate": 1.99483740313719e-06, + "loss": 0.046, + "step": 4002 + }, + { + "epoch": 2.86, + "grad_norm": 15.306913920279596, + "learning_rate": 1.9925280267114e-06, + "loss": 0.0353, + "step": 4003 + }, + { + "epoch": 2.86, + "grad_norm": 4.772152424590818, + "learning_rate": 1.9902196551379006e-06, + "loss": 0.0455, + "step": 4004 + }, + { + "epoch": 2.86, + "grad_norm": 15.68768042844194, + "learning_rate": 1.987912289187954e-06, + "loss": 0.0629, + "step": 4005 + }, + { + "epoch": 2.86, + "grad_norm": 8.371248909494858, + "learning_rate": 1.9856059296325027e-06, + "loss": 0.0479, + "step": 4006 + }, + { + "epoch": 2.86, + "grad_norm": 6.182577891454594, + "learning_rate": 1.9833005772421354e-06, + "loss": 0.0281, + "step": 4007 + }, + { + "epoch": 2.86, + "grad_norm": 10.517434691180428, + "learning_rate": 1.980996232787121e-06, + "loss": 0.044, + "step": 4008 + }, + { + "epoch": 2.86, + "grad_norm": 12.669315091660465, + "learning_rate": 1.978692897037377e-06, + "loss": 0.0345, + "step": 4009 + }, + { + "epoch": 2.86, + "grad_norm": 5.040911189426195, + "learning_rate": 1.9763905707624975e-06, + "loss": 0.0225, + "step": 4010 + }, + { + "epoch": 2.86, + "grad_norm": 6.772416865182909, + "learning_rate": 1.974089254731727e-06, + "loss": 0.0538, + "step": 4011 + }, + { + "epoch": 2.86, + "grad_norm": 6.111698799497366, + "learning_rate": 1.97178894971398e-06, + "loss": 0.0297, + "step": 4012 + }, + { + "epoch": 2.86, + "grad_norm": 5.376778629697909, + "learning_rate": 1.9694896564778317e-06, + "loss": 0.0508, + "step": 4013 + }, + { + "epoch": 2.87, + "grad_norm": 12.684429026836513, + "learning_rate": 1.9671913757915173e-06, + "loss": 0.0394, + "step": 4014 + }, + { + "epoch": 2.87, + "grad_norm": 10.673329195030192, + "learning_rate": 1.964894108422936e-06, + "loss": 0.0573, + "step": 4015 + }, + { + "epoch": 2.87, + "grad_norm": 9.962999370202702, + "learning_rate": 1.962597855139648e-06, + "loss": 0.0633, + "step": 4016 + }, + { + "epoch": 2.87, + "grad_norm": 5.930744399839919, + "learning_rate": 1.960302616708873e-06, + "loss": 0.0561, + "step": 4017 + }, + { + "epoch": 2.87, + "grad_norm": 2.99617559332668, + "learning_rate": 1.9580083938974937e-06, + "loss": 0.0344, + "step": 4018 + }, + { + "epoch": 2.87, + "grad_norm": 6.613066828727661, + "learning_rate": 1.9557151874720526e-06, + "loss": 0.0646, + "step": 4019 + }, + { + "epoch": 2.87, + "grad_norm": 14.130923755778449, + "learning_rate": 1.953422998198754e-06, + "loss": 0.0476, + "step": 4020 + }, + { + "epoch": 2.87, + "grad_norm": 5.093142217656671, + "learning_rate": 1.9511318268434554e-06, + "loss": 0.0422, + "step": 4021 + }, + { + "epoch": 2.87, + "grad_norm": 5.700589187453384, + "learning_rate": 1.9488416741716877e-06, + "loss": 0.0416, + "step": 4022 + }, + { + "epoch": 2.87, + "grad_norm": 11.290562714679753, + "learning_rate": 1.946552540948625e-06, + "loss": 0.0374, + "step": 4023 + }, + { + "epoch": 2.87, + "grad_norm": 11.076994659685992, + "learning_rate": 1.944264427939118e-06, + "loss": 0.064, + "step": 4024 + }, + { + "epoch": 2.87, + "grad_norm": 5.073760434659358, + "learning_rate": 1.941977335907659e-06, + "loss": 0.034, + "step": 4025 + }, + { + "epoch": 2.87, + "grad_norm": 12.555237380180236, + "learning_rate": 1.939691265618417e-06, + "loss": 0.0356, + "step": 4026 + }, + { + "epoch": 2.87, + "grad_norm": 13.260024757677835, + "learning_rate": 1.9374062178352036e-06, + "loss": 0.043, + "step": 4027 + }, + { + "epoch": 2.88, + "grad_norm": 8.049256823016284, + "learning_rate": 1.935122193321499e-06, + "loss": 0.0456, + "step": 4028 + }, + { + "epoch": 2.88, + "grad_norm": 4.427445039334935, + "learning_rate": 1.932839192840436e-06, + "loss": 0.0403, + "step": 4029 + }, + { + "epoch": 2.88, + "grad_norm": 11.153086971868019, + "learning_rate": 1.930557217154809e-06, + "loss": 0.0361, + "step": 4030 + }, + { + "epoch": 2.88, + "grad_norm": 6.049221219619234, + "learning_rate": 1.9282762670270693e-06, + "loss": 0.0492, + "step": 4031 + }, + { + "epoch": 2.88, + "grad_norm": 7.07032061230794, + "learning_rate": 1.925996343219323e-06, + "loss": 0.0557, + "step": 4032 + }, + { + "epoch": 2.88, + "grad_norm": 11.430527196674731, + "learning_rate": 1.923717446493336e-06, + "loss": 0.054, + "step": 4033 + }, + { + "epoch": 2.88, + "grad_norm": 16.819939441212135, + "learning_rate": 1.9214395776105297e-06, + "loss": 0.0526, + "step": 4034 + }, + { + "epoch": 2.88, + "grad_norm": 12.959453782895224, + "learning_rate": 1.919162737331983e-06, + "loss": 0.047, + "step": 4035 + }, + { + "epoch": 2.88, + "grad_norm": 4.702665094500624, + "learning_rate": 1.9168869264184296e-06, + "loss": 0.0423, + "step": 4036 + }, + { + "epoch": 2.88, + "grad_norm": 8.969495139042449, + "learning_rate": 1.9146121456302613e-06, + "loss": 0.038, + "step": 4037 + }, + { + "epoch": 2.88, + "grad_norm": 5.982997771608221, + "learning_rate": 1.9123383957275237e-06, + "loss": 0.0415, + "step": 4038 + }, + { + "epoch": 2.88, + "grad_norm": 21.08738392970205, + "learning_rate": 1.91006567746992e-06, + "loss": 0.0711, + "step": 4039 + }, + { + "epoch": 2.88, + "grad_norm": 6.035083987282505, + "learning_rate": 1.907793991616806e-06, + "loss": 0.0494, + "step": 4040 + }, + { + "epoch": 2.88, + "grad_norm": 8.001816643118843, + "learning_rate": 1.9055233389271955e-06, + "loss": 0.0515, + "step": 4041 + }, + { + "epoch": 2.89, + "grad_norm": 5.808783157897872, + "learning_rate": 1.9032537201597556e-06, + "loss": 0.0374, + "step": 4042 + }, + { + "epoch": 2.89, + "grad_norm": 7.463408759803654, + "learning_rate": 1.9009851360728077e-06, + "loss": 0.0522, + "step": 4043 + }, + { + "epoch": 2.89, + "grad_norm": 7.169003885468974, + "learning_rate": 1.898717587424328e-06, + "loss": 0.0566, + "step": 4044 + }, + { + "epoch": 2.89, + "grad_norm": 6.208200904132249, + "learning_rate": 1.8964510749719484e-06, + "loss": 0.0567, + "step": 4045 + }, + { + "epoch": 2.89, + "grad_norm": 16.35211171549844, + "learning_rate": 1.8941855994729497e-06, + "loss": 0.0486, + "step": 4046 + }, + { + "epoch": 2.89, + "grad_norm": 5.281197291589123, + "learning_rate": 1.8919211616842703e-06, + "loss": 0.0331, + "step": 4047 + }, + { + "epoch": 2.89, + "grad_norm": 4.747918718572736, + "learning_rate": 1.8896577623625017e-06, + "loss": 0.037, + "step": 4048 + }, + { + "epoch": 2.89, + "grad_norm": 12.046638045605734, + "learning_rate": 1.887395402263888e-06, + "loss": 0.0693, + "step": 4049 + }, + { + "epoch": 2.89, + "grad_norm": 27.115553942502387, + "learning_rate": 1.8851340821443248e-06, + "loss": 0.0374, + "step": 4050 + }, + { + "epoch": 2.89, + "grad_norm": 18.92193070578465, + "learning_rate": 1.882873802759362e-06, + "loss": 0.0794, + "step": 4051 + }, + { + "epoch": 2.89, + "grad_norm": 4.295273630927773, + "learning_rate": 1.8806145648642005e-06, + "loss": 0.04, + "step": 4052 + }, + { + "epoch": 2.89, + "grad_norm": 3.412350246216868, + "learning_rate": 1.8783563692136936e-06, + "loss": 0.0385, + "step": 4053 + }, + { + "epoch": 2.89, + "grad_norm": 17.67225398516106, + "learning_rate": 1.8760992165623465e-06, + "loss": 0.0724, + "step": 4054 + }, + { + "epoch": 2.89, + "grad_norm": 8.36568199802075, + "learning_rate": 1.873843107664316e-06, + "loss": 0.0547, + "step": 4055 + }, + { + "epoch": 2.9, + "grad_norm": 9.290828968139262, + "learning_rate": 1.87158804327341e-06, + "loss": 0.0535, + "step": 4056 + }, + { + "epoch": 2.9, + "grad_norm": 2.802306481599526, + "learning_rate": 1.8693340241430874e-06, + "loss": 0.0308, + "step": 4057 + }, + { + "epoch": 2.9, + "grad_norm": 10.43756004493309, + "learning_rate": 1.867081051026458e-06, + "loss": 0.057, + "step": 4058 + }, + { + "epoch": 2.9, + "grad_norm": 6.681088881904337, + "learning_rate": 1.8648291246762818e-06, + "loss": 0.0321, + "step": 4059 + }, + { + "epoch": 2.9, + "grad_norm": 8.252176413766616, + "learning_rate": 1.8625782458449693e-06, + "loss": 0.0511, + "step": 4060 + }, + { + "epoch": 2.9, + "grad_norm": 8.406152709881708, + "learning_rate": 1.860328415284583e-06, + "loss": 0.0521, + "step": 4061 + }, + { + "epoch": 2.9, + "grad_norm": 2.896995014403766, + "learning_rate": 1.8580796337468276e-06, + "loss": 0.0422, + "step": 4062 + }, + { + "epoch": 2.9, + "grad_norm": 3.8791001120200477, + "learning_rate": 1.8558319019830695e-06, + "loss": 0.0373, + "step": 4063 + }, + { + "epoch": 2.9, + "grad_norm": 5.53865330305396, + "learning_rate": 1.853585220744311e-06, + "loss": 0.0411, + "step": 4064 + }, + { + "epoch": 2.9, + "grad_norm": 12.29757774570903, + "learning_rate": 1.851339590781217e-06, + "loss": 0.0444, + "step": 4065 + }, + { + "epoch": 2.9, + "grad_norm": 17.39424270516312, + "learning_rate": 1.8490950128440877e-06, + "loss": 0.0834, + "step": 4066 + }, + { + "epoch": 2.9, + "grad_norm": 14.625296601227284, + "learning_rate": 1.8468514876828847e-06, + "loss": 0.0585, + "step": 4067 + }, + { + "epoch": 2.9, + "grad_norm": 18.641308415923643, + "learning_rate": 1.844609016047204e-06, + "loss": 0.0481, + "step": 4068 + }, + { + "epoch": 2.9, + "grad_norm": 9.87551268358642, + "learning_rate": 1.8423675986863054e-06, + "loss": 0.0361, + "step": 4069 + }, + { + "epoch": 2.91, + "grad_norm": 3.7267052490862635, + "learning_rate": 1.8401272363490818e-06, + "loss": 0.0443, + "step": 4070 + }, + { + "epoch": 2.91, + "grad_norm": 10.562034474063116, + "learning_rate": 1.8378879297840818e-06, + "loss": 0.0477, + "step": 4071 + }, + { + "epoch": 2.91, + "grad_norm": 5.802811004992916, + "learning_rate": 1.8356496797395002e-06, + "loss": 0.0434, + "step": 4072 + }, + { + "epoch": 2.91, + "grad_norm": 6.436346892902528, + "learning_rate": 1.8334124869631765e-06, + "loss": 0.0345, + "step": 4073 + }, + { + "epoch": 2.91, + "grad_norm": 5.010512163189531, + "learning_rate": 1.8311763522025994e-06, + "loss": 0.0364, + "step": 4074 + }, + { + "epoch": 2.91, + "grad_norm": 7.935289328219277, + "learning_rate": 1.828941276204903e-06, + "loss": 0.0379, + "step": 4075 + }, + { + "epoch": 2.91, + "grad_norm": 5.923070187953769, + "learning_rate": 1.8267072597168673e-06, + "loss": 0.0304, + "step": 4076 + }, + { + "epoch": 2.91, + "grad_norm": 3.8603791640597245, + "learning_rate": 1.8244743034849193e-06, + "loss": 0.0446, + "step": 4077 + }, + { + "epoch": 2.91, + "grad_norm": 3.915093269488006, + "learning_rate": 1.8222424082551303e-06, + "loss": 0.0691, + "step": 4078 + }, + { + "epoch": 2.91, + "grad_norm": 5.595291860210755, + "learning_rate": 1.820011574773221e-06, + "loss": 0.0426, + "step": 4079 + }, + { + "epoch": 2.91, + "grad_norm": 6.802755351874992, + "learning_rate": 1.8177818037845485e-06, + "loss": 0.0476, + "step": 4080 + }, + { + "epoch": 2.91, + "grad_norm": 3.6813348946660613, + "learning_rate": 1.8155530960341273e-06, + "loss": 0.0359, + "step": 4081 + }, + { + "epoch": 2.91, + "grad_norm": 18.71702922304564, + "learning_rate": 1.8133254522666033e-06, + "loss": 0.0569, + "step": 4082 + }, + { + "epoch": 2.91, + "grad_norm": 7.6921438088763985, + "learning_rate": 1.8110988732262808e-06, + "loss": 0.0419, + "step": 4083 + }, + { + "epoch": 2.92, + "grad_norm": 10.044378122355903, + "learning_rate": 1.8088733596570945e-06, + "loss": 0.0382, + "step": 4084 + }, + { + "epoch": 2.92, + "grad_norm": 6.114486237395475, + "learning_rate": 1.806648912302636e-06, + "loss": 0.061, + "step": 4085 + }, + { + "epoch": 2.92, + "grad_norm": 3.3399112721858772, + "learning_rate": 1.8044255319061287e-06, + "loss": 0.0387, + "step": 4086 + }, + { + "epoch": 2.92, + "grad_norm": 12.412698383313344, + "learning_rate": 1.8022032192104517e-06, + "loss": 0.0405, + "step": 4087 + }, + { + "epoch": 2.92, + "grad_norm": 30.82021867460914, + "learning_rate": 1.7999819749581154e-06, + "loss": 0.0749, + "step": 4088 + }, + { + "epoch": 2.92, + "grad_norm": 11.480240630642303, + "learning_rate": 1.797761799891281e-06, + "loss": 0.0581, + "step": 4089 + }, + { + "epoch": 2.92, + "grad_norm": 7.582474162510632, + "learning_rate": 1.7955426947517507e-06, + "loss": 0.059, + "step": 4090 + }, + { + "epoch": 2.92, + "grad_norm": 10.63345521267209, + "learning_rate": 1.793324660280968e-06, + "loss": 0.0526, + "step": 4091 + }, + { + "epoch": 2.92, + "grad_norm": 8.103239978569194, + "learning_rate": 1.7911076972200193e-06, + "loss": 0.0411, + "step": 4092 + }, + { + "epoch": 2.92, + "grad_norm": 11.925157615703743, + "learning_rate": 1.7888918063096334e-06, + "loss": 0.0408, + "step": 4093 + }, + { + "epoch": 2.92, + "grad_norm": 14.374780754417847, + "learning_rate": 1.7866769882901814e-06, + "loss": 0.0412, + "step": 4094 + }, + { + "epoch": 2.92, + "grad_norm": 6.109178998467284, + "learning_rate": 1.784463243901674e-06, + "loss": 0.0528, + "step": 4095 + }, + { + "epoch": 2.92, + "grad_norm": 9.609524368121408, + "learning_rate": 1.7822505738837648e-06, + "loss": 0.0651, + "step": 4096 + }, + { + "epoch": 2.92, + "grad_norm": 5.040866179019186, + "learning_rate": 1.7800389789757483e-06, + "loss": 0.0445, + "step": 4097 + }, + { + "epoch": 2.93, + "grad_norm": 5.714985544615724, + "learning_rate": 1.7778284599165597e-06, + "loss": 0.0487, + "step": 4098 + }, + { + "epoch": 2.93, + "grad_norm": 7.619378628354265, + "learning_rate": 1.7756190174447734e-06, + "loss": 0.0436, + "step": 4099 + }, + { + "epoch": 2.93, + "grad_norm": 3.2474864738073133, + "learning_rate": 1.7734106522986061e-06, + "loss": 0.0462, + "step": 4100 + }, + { + "epoch": 2.93, + "grad_norm": 4.3973092959361875, + "learning_rate": 1.7712033652159133e-06, + "loss": 0.0531, + "step": 4101 + }, + { + "epoch": 2.93, + "grad_norm": 7.4959152699641765, + "learning_rate": 1.7689971569341907e-06, + "loss": 0.0576, + "step": 4102 + }, + { + "epoch": 2.93, + "grad_norm": 3.749459966128013, + "learning_rate": 1.7667920281905738e-06, + "loss": 0.0277, + "step": 4103 + }, + { + "epoch": 2.93, + "grad_norm": 7.533700501363081, + "learning_rate": 1.764587979721838e-06, + "loss": 0.0647, + "step": 4104 + }, + { + "epoch": 2.93, + "grad_norm": 10.29526487200459, + "learning_rate": 1.7623850122643926e-06, + "loss": 0.0401, + "step": 4105 + }, + { + "epoch": 2.93, + "grad_norm": 12.913824847457555, + "learning_rate": 1.7601831265542968e-06, + "loss": 0.0405, + "step": 4106 + }, + { + "epoch": 2.93, + "grad_norm": 6.04341070181785, + "learning_rate": 1.7579823233272337e-06, + "loss": 0.0359, + "step": 4107 + }, + { + "epoch": 2.93, + "grad_norm": 19.592526482210516, + "learning_rate": 1.7557826033185404e-06, + "loss": 0.054, + "step": 4108 + }, + { + "epoch": 2.93, + "grad_norm": 5.540253325530484, + "learning_rate": 1.7535839672631772e-06, + "loss": 0.0508, + "step": 4109 + }, + { + "epoch": 2.93, + "grad_norm": 7.452509099451598, + "learning_rate": 1.7513864158957556e-06, + "loss": 0.0364, + "step": 4110 + }, + { + "epoch": 2.93, + "grad_norm": 20.761584265848015, + "learning_rate": 1.7491899499505122e-06, + "loss": 0.0438, + "step": 4111 + }, + { + "epoch": 2.94, + "grad_norm": 13.58680016006121, + "learning_rate": 1.746994570161334e-06, + "loss": 0.0384, + "step": 4112 + }, + { + "epoch": 2.94, + "grad_norm": 4.603506950318464, + "learning_rate": 1.7448002772617324e-06, + "loss": 0.0438, + "step": 4113 + }, + { + "epoch": 2.94, + "grad_norm": 3.8154557106353284, + "learning_rate": 1.7426070719848632e-06, + "loss": 0.0257, + "step": 4114 + }, + { + "epoch": 2.94, + "grad_norm": 5.747171185189265, + "learning_rate": 1.7404149550635173e-06, + "loss": 0.0524, + "step": 4115 + }, + { + "epoch": 2.94, + "grad_norm": 7.9740227323975335, + "learning_rate": 1.7382239272301221e-06, + "loss": 0.077, + "step": 4116 + }, + { + "epoch": 2.94, + "grad_norm": 10.989166783225153, + "learning_rate": 1.7360339892167404e-06, + "loss": 0.0374, + "step": 4117 + }, + { + "epoch": 2.94, + "grad_norm": 6.7143857340659165, + "learning_rate": 1.7338451417550712e-06, + "loss": 0.0756, + "step": 4118 + }, + { + "epoch": 2.94, + "grad_norm": 12.19837267726741, + "learning_rate": 1.7316573855764485e-06, + "loss": 0.092, + "step": 4119 + }, + { + "epoch": 2.94, + "grad_norm": 7.153980131842924, + "learning_rate": 1.7294707214118434e-06, + "loss": 0.0359, + "step": 4120 + }, + { + "epoch": 2.94, + "grad_norm": 7.588739529742861, + "learning_rate": 1.7272851499918603e-06, + "loss": 0.0444, + "step": 4121 + }, + { + "epoch": 2.94, + "grad_norm": 4.8112730950323765, + "learning_rate": 1.725100672046741e-06, + "loss": 0.0451, + "step": 4122 + }, + { + "epoch": 2.94, + "grad_norm": 7.260957042480349, + "learning_rate": 1.7229172883063556e-06, + "loss": 0.0417, + "step": 4123 + }, + { + "epoch": 2.94, + "grad_norm": 7.1770527506341235, + "learning_rate": 1.7207349995002192e-06, + "loss": 0.0321, + "step": 4124 + }, + { + "epoch": 2.94, + "grad_norm": 8.493330479432425, + "learning_rate": 1.7185538063574692e-06, + "loss": 0.0701, + "step": 4125 + }, + { + "epoch": 2.95, + "grad_norm": 18.78832049114435, + "learning_rate": 1.7163737096068883e-06, + "loss": 0.0322, + "step": 4126 + }, + { + "epoch": 2.95, + "grad_norm": 3.9921302247739954, + "learning_rate": 1.7141947099768818e-06, + "loss": 0.0453, + "step": 4127 + }, + { + "epoch": 2.95, + "grad_norm": 10.454142818755402, + "learning_rate": 1.7120168081955001e-06, + "loss": 0.0321, + "step": 4128 + }, + { + "epoch": 2.95, + "grad_norm": 7.925392060998192, + "learning_rate": 1.7098400049904163e-06, + "loss": 0.0514, + "step": 4129 + }, + { + "epoch": 2.95, + "grad_norm": 6.926450108002052, + "learning_rate": 1.707664301088941e-06, + "loss": 0.0338, + "step": 4130 + }, + { + "epoch": 2.95, + "grad_norm": 3.1024288350216556, + "learning_rate": 1.705489697218019e-06, + "loss": 0.0466, + "step": 4131 + }, + { + "epoch": 2.95, + "grad_norm": 9.17160382357441, + "learning_rate": 1.7033161941042248e-06, + "loss": 0.0503, + "step": 4132 + }, + { + "epoch": 2.95, + "grad_norm": 9.016353852398503, + "learning_rate": 1.7011437924737666e-06, + "loss": 0.0489, + "step": 4133 + }, + { + "epoch": 2.95, + "grad_norm": 9.854573725493205, + "learning_rate": 1.6989724930524843e-06, + "loss": 0.0579, + "step": 4134 + }, + { + "epoch": 2.95, + "grad_norm": 6.671723566772397, + "learning_rate": 1.6968022965658492e-06, + "loss": 0.0335, + "step": 4135 + }, + { + "epoch": 2.95, + "grad_norm": 2.720192606101922, + "learning_rate": 1.694633203738964e-06, + "loss": 0.0402, + "step": 4136 + }, + { + "epoch": 2.95, + "grad_norm": 5.995599861484541, + "learning_rate": 1.6924652152965632e-06, + "loss": 0.0549, + "step": 4137 + }, + { + "epoch": 2.95, + "grad_norm": 12.266299701613004, + "learning_rate": 1.690298331963014e-06, + "loss": 0.0572, + "step": 4138 + }, + { + "epoch": 2.95, + "grad_norm": 3.4813446495338654, + "learning_rate": 1.6881325544623067e-06, + "loss": 0.0452, + "step": 4139 + }, + { + "epoch": 2.96, + "grad_norm": 5.891263821479523, + "learning_rate": 1.6859678835180749e-06, + "loss": 0.0515, + "step": 4140 + }, + { + "epoch": 2.96, + "grad_norm": 3.6553475031782474, + "learning_rate": 1.6838043198535693e-06, + "loss": 0.0418, + "step": 4141 + }, + { + "epoch": 2.96, + "grad_norm": 5.408411863091127, + "learning_rate": 1.681641864191682e-06, + "loss": 0.0557, + "step": 4142 + }, + { + "epoch": 2.96, + "grad_norm": 5.8114627598252895, + "learning_rate": 1.6794805172549244e-06, + "loss": 0.0398, + "step": 4143 + }, + { + "epoch": 2.96, + "grad_norm": 7.53548148027711, + "learning_rate": 1.6773202797654486e-06, + "loss": 0.061, + "step": 4144 + }, + { + "epoch": 2.96, + "grad_norm": 5.913855035387853, + "learning_rate": 1.6751611524450235e-06, + "loss": 0.0335, + "step": 4145 + }, + { + "epoch": 2.96, + "grad_norm": 2.024414954794077, + "learning_rate": 1.6730031360150605e-06, + "loss": 0.0341, + "step": 4146 + }, + { + "epoch": 2.96, + "grad_norm": 9.593789286345464, + "learning_rate": 1.670846231196588e-06, + "loss": 0.0404, + "step": 4147 + }, + { + "epoch": 2.96, + "grad_norm": 4.308719832906545, + "learning_rate": 1.6686904387102692e-06, + "loss": 0.0405, + "step": 4148 + }, + { + "epoch": 2.96, + "grad_norm": 4.491244505461696, + "learning_rate": 1.6665357592763948e-06, + "loss": 0.0345, + "step": 4149 + }, + { + "epoch": 2.96, + "grad_norm": 14.320074989143352, + "learning_rate": 1.6643821936148834e-06, + "loss": 0.0335, + "step": 4150 + }, + { + "epoch": 2.96, + "grad_norm": 4.632814125620873, + "learning_rate": 1.6622297424452817e-06, + "loss": 0.0333, + "step": 4151 + }, + { + "epoch": 2.96, + "grad_norm": 8.547410451451574, + "learning_rate": 1.6600784064867625e-06, + "loss": 0.049, + "step": 4152 + }, + { + "epoch": 2.96, + "grad_norm": 11.406023385749469, + "learning_rate": 1.6579281864581275e-06, + "loss": 0.0543, + "step": 4153 + }, + { + "epoch": 2.97, + "grad_norm": 6.388068135398717, + "learning_rate": 1.6557790830778058e-06, + "loss": 0.0488, + "step": 4154 + }, + { + "epoch": 2.97, + "grad_norm": 4.4616777204213145, + "learning_rate": 1.6536310970638525e-06, + "loss": 0.0606, + "step": 4155 + }, + { + "epoch": 2.97, + "grad_norm": 20.563153827350366, + "learning_rate": 1.6514842291339494e-06, + "loss": 0.0524, + "step": 4156 + }, + { + "epoch": 2.97, + "grad_norm": 10.870505302773477, + "learning_rate": 1.6493384800054052e-06, + "loss": 0.0561, + "step": 4157 + }, + { + "epoch": 2.97, + "grad_norm": 9.278625304721448, + "learning_rate": 1.6471938503951546e-06, + "loss": 0.0536, + "step": 4158 + }, + { + "epoch": 2.97, + "grad_norm": 4.924475025051709, + "learning_rate": 1.6450503410197582e-06, + "loss": 0.0427, + "step": 4159 + }, + { + "epoch": 2.97, + "grad_norm": 14.546159414440977, + "learning_rate": 1.6429079525954023e-06, + "loss": 0.0796, + "step": 4160 + }, + { + "epoch": 2.97, + "grad_norm": 3.274576485929536, + "learning_rate": 1.6407666858378985e-06, + "loss": 0.0368, + "step": 4161 + }, + { + "epoch": 2.97, + "grad_norm": 4.750934336503444, + "learning_rate": 1.6386265414626834e-06, + "loss": 0.0559, + "step": 4162 + }, + { + "epoch": 2.97, + "grad_norm": 3.540640136850486, + "learning_rate": 1.636487520184822e-06, + "loss": 0.0367, + "step": 4163 + }, + { + "epoch": 2.97, + "grad_norm": 4.692046580807771, + "learning_rate": 1.6343496227189948e-06, + "loss": 0.0359, + "step": 4164 + }, + { + "epoch": 2.97, + "grad_norm": 10.53276192290032, + "learning_rate": 1.632212849779521e-06, + "loss": 0.0316, + "step": 4165 + }, + { + "epoch": 2.97, + "grad_norm": 5.302753116947668, + "learning_rate": 1.630077202080328e-06, + "loss": 0.0267, + "step": 4166 + }, + { + "epoch": 2.97, + "grad_norm": 5.259703313914846, + "learning_rate": 1.6279426803349828e-06, + "loss": 0.0547, + "step": 4167 + }, + { + "epoch": 2.98, + "grad_norm": 4.082893040513792, + "learning_rate": 1.6258092852566625e-06, + "loss": 0.0217, + "step": 4168 + }, + { + "epoch": 2.98, + "grad_norm": 8.23019416830836, + "learning_rate": 1.6236770175581807e-06, + "loss": 0.0509, + "step": 4169 + }, + { + "epoch": 2.98, + "grad_norm": 12.438101804745394, + "learning_rate": 1.62154587795196e-06, + "loss": 0.055, + "step": 4170 + }, + { + "epoch": 2.98, + "grad_norm": 5.427420456057769, + "learning_rate": 1.6194158671500616e-06, + "loss": 0.0518, + "step": 4171 + }, + { + "epoch": 2.98, + "grad_norm": 10.415775308376247, + "learning_rate": 1.6172869858641554e-06, + "loss": 0.0432, + "step": 4172 + }, + { + "epoch": 2.98, + "grad_norm": 8.480843745125458, + "learning_rate": 1.6151592348055433e-06, + "loss": 0.0427, + "step": 4173 + }, + { + "epoch": 2.98, + "grad_norm": 4.617772973040294, + "learning_rate": 1.6130326146851455e-06, + "loss": 0.0438, + "step": 4174 + }, + { + "epoch": 2.98, + "grad_norm": 5.919133113007816, + "learning_rate": 1.6109071262135056e-06, + "loss": 0.0423, + "step": 4175 + }, + { + "epoch": 2.98, + "grad_norm": 3.689039215465472, + "learning_rate": 1.608782770100789e-06, + "loss": 0.0372, + "step": 4176 + }, + { + "epoch": 2.98, + "grad_norm": 14.067237284232494, + "learning_rate": 1.6066595470567825e-06, + "loss": 0.0435, + "step": 4177 + }, + { + "epoch": 2.98, + "grad_norm": 5.854433151920558, + "learning_rate": 1.6045374577908944e-06, + "loss": 0.0504, + "step": 4178 + }, + { + "epoch": 2.98, + "grad_norm": 11.6646480285659, + "learning_rate": 1.6024165030121542e-06, + "loss": 0.0554, + "step": 4179 + }, + { + "epoch": 2.98, + "grad_norm": 6.011438841677108, + "learning_rate": 1.6002966834292116e-06, + "loss": 0.0504, + "step": 4180 + }, + { + "epoch": 2.98, + "grad_norm": 6.227447353492944, + "learning_rate": 1.5981779997503405e-06, + "loss": 0.0476, + "step": 4181 + }, + { + "epoch": 2.99, + "grad_norm": 10.349021587684007, + "learning_rate": 1.5960604526834266e-06, + "loss": 0.077, + "step": 4182 + }, + { + "epoch": 2.99, + "grad_norm": 6.211039507384463, + "learning_rate": 1.5939440429359888e-06, + "loss": 0.0554, + "step": 4183 + }, + { + "epoch": 2.99, + "grad_norm": 2.6513931652813705, + "learning_rate": 1.591828771215152e-06, + "loss": 0.0348, + "step": 4184 + }, + { + "epoch": 2.99, + "grad_norm": 9.179016643100248, + "learning_rate": 1.5897146382276752e-06, + "loss": 0.0577, + "step": 4185 + }, + { + "epoch": 2.99, + "grad_norm": 7.051485770699399, + "learning_rate": 1.587601644679922e-06, + "loss": 0.0298, + "step": 4186 + }, + { + "epoch": 2.99, + "grad_norm": 7.952697092704008, + "learning_rate": 1.58548979127789e-06, + "loss": 0.0339, + "step": 4187 + }, + { + "epoch": 2.99, + "grad_norm": 3.6748750294465, + "learning_rate": 1.5833790787271819e-06, + "loss": 0.0346, + "step": 4188 + }, + { + "epoch": 2.99, + "grad_norm": 5.611862793582695, + "learning_rate": 1.5812695077330325e-06, + "loss": 0.056, + "step": 4189 + }, + { + "epoch": 2.99, + "grad_norm": 11.09492276918743, + "learning_rate": 1.5791610790002838e-06, + "loss": 0.0451, + "step": 4190 + }, + { + "epoch": 2.99, + "grad_norm": 8.006779281484823, + "learning_rate": 1.577053793233403e-06, + "loss": 0.049, + "step": 4191 + }, + { + "epoch": 2.99, + "grad_norm": 10.760492669788933, + "learning_rate": 1.5749476511364726e-06, + "loss": 0.0446, + "step": 4192 + }, + { + "epoch": 2.99, + "grad_norm": 10.776948111144355, + "learning_rate": 1.5728426534131946e-06, + "loss": 0.0305, + "step": 4193 + }, + { + "epoch": 2.99, + "grad_norm": 4.0188505044934155, + "learning_rate": 1.5707388007668877e-06, + "loss": 0.0481, + "step": 4194 + }, + { + "epoch": 2.99, + "grad_norm": 8.979036207018861, + "learning_rate": 1.568636093900488e-06, + "loss": 0.0398, + "step": 4195 + }, + { + "epoch": 3.0, + "grad_norm": 6.689484772564502, + "learning_rate": 1.5665345335165488e-06, + "loss": 0.0412, + "step": 4196 + }, + { + "epoch": 3.0, + "grad_norm": 4.5182528489425104, + "learning_rate": 1.5644341203172415e-06, + "loss": 0.0522, + "step": 4197 + }, + { + "epoch": 3.0, + "grad_norm": 7.494580739110864, + "learning_rate": 1.5623348550043516e-06, + "loss": 0.0384, + "step": 4198 + }, + { + "epoch": 3.0, + "grad_norm": 6.568879027287965, + "learning_rate": 1.5602367382792839e-06, + "loss": 0.0331, + "step": 4199 + }, + { + "epoch": 3.0, + "grad_norm": 14.033675146688019, + "learning_rate": 1.5581397708430578e-06, + "loss": 0.0726, + "step": 4200 + }, + { + "epoch": 3.0, + "grad_norm": 4.394766382636879, + "learning_rate": 1.556043953396309e-06, + "loss": 0.0311, + "step": 4201 + }, + { + "epoch": 3.0, + "grad_norm": 11.599669876581697, + "learning_rate": 1.5539492866392891e-06, + "loss": 0.0349, + "step": 4202 + }, + { + "epoch": 3.0, + "grad_norm": 5.095777532033208, + "learning_rate": 1.551855771271865e-06, + "loss": 0.0338, + "step": 4203 + }, + { + "epoch": 3.0, + "grad_norm": 8.662438814706617, + "learning_rate": 1.5497634079935198e-06, + "loss": 0.0439, + "step": 4204 + }, + { + "epoch": 3.0, + "grad_norm": 1.5906055347775252, + "learning_rate": 1.5476721975033498e-06, + "loss": 0.0178, + "step": 4205 + }, + { + "epoch": 3.0, + "grad_norm": 3.918653409467006, + "learning_rate": 1.5455821405000703e-06, + "loss": 0.0265, + "step": 4206 + }, + { + "epoch": 3.0, + "grad_norm": 5.99063662385307, + "learning_rate": 1.5434932376820039e-06, + "loss": 0.034, + "step": 4207 + }, + { + "epoch": 3.0, + "grad_norm": 5.838954154742759, + "learning_rate": 1.5414054897470942e-06, + "loss": 0.0314, + "step": 4208 + }, + { + "epoch": 3.0, + "grad_norm": 1.5387521453912725, + "learning_rate": 1.5393188973928957e-06, + "loss": 0.0201, + "step": 4209 + }, + { + "epoch": 3.0, + "grad_norm": 3.3458673133915355, + "learning_rate": 1.5372334613165784e-06, + "loss": 0.0305, + "step": 4210 + }, + { + "epoch": 3.01, + "grad_norm": 5.3408721579010185, + "learning_rate": 1.5351491822149255e-06, + "loss": 0.0208, + "step": 4211 + }, + { + "epoch": 3.01, + "grad_norm": 5.0928277444132934, + "learning_rate": 1.533066060784333e-06, + "loss": 0.0292, + "step": 4212 + }, + { + "epoch": 3.01, + "grad_norm": 4.046163091652899, + "learning_rate": 1.5309840977208096e-06, + "loss": 0.0229, + "step": 4213 + }, + { + "epoch": 3.01, + "grad_norm": 2.3567429229007706, + "learning_rate": 1.5289032937199793e-06, + "loss": 0.0251, + "step": 4214 + }, + { + "epoch": 3.01, + "grad_norm": 4.662301159431529, + "learning_rate": 1.5268236494770772e-06, + "loss": 0.0248, + "step": 4215 + }, + { + "epoch": 3.01, + "grad_norm": 5.949991688960428, + "learning_rate": 1.5247451656869499e-06, + "loss": 0.0298, + "step": 4216 + }, + { + "epoch": 3.01, + "grad_norm": 4.549139241488894, + "learning_rate": 1.5226678430440588e-06, + "loss": 0.0263, + "step": 4217 + }, + { + "epoch": 3.01, + "grad_norm": 3.180555537523223, + "learning_rate": 1.5205916822424755e-06, + "loss": 0.0239, + "step": 4218 + }, + { + "epoch": 3.01, + "grad_norm": 3.443474683242361, + "learning_rate": 1.5185166839758836e-06, + "loss": 0.0295, + "step": 4219 + }, + { + "epoch": 3.01, + "grad_norm": 1.5504955901675597, + "learning_rate": 1.5164428489375789e-06, + "loss": 0.0229, + "step": 4220 + }, + { + "epoch": 3.01, + "grad_norm": 2.295258883006284, + "learning_rate": 1.5143701778204683e-06, + "loss": 0.0263, + "step": 4221 + }, + { + "epoch": 3.01, + "grad_norm": 3.551031768131113, + "learning_rate": 1.5122986713170712e-06, + "loss": 0.0274, + "step": 4222 + }, + { + "epoch": 3.01, + "grad_norm": 10.191936949992497, + "learning_rate": 1.510228330119512e-06, + "loss": 0.0294, + "step": 4223 + }, + { + "epoch": 3.01, + "grad_norm": 6.993474554478619, + "learning_rate": 1.5081591549195357e-06, + "loss": 0.0232, + "step": 4224 + }, + { + "epoch": 3.02, + "grad_norm": 9.562045171559191, + "learning_rate": 1.5060911464084864e-06, + "loss": 0.0357, + "step": 4225 + }, + { + "epoch": 3.02, + "grad_norm": 3.0924209278860673, + "learning_rate": 1.5040243052773312e-06, + "loss": 0.0326, + "step": 4226 + }, + { + "epoch": 3.02, + "grad_norm": 4.420905003070142, + "learning_rate": 1.5019586322166323e-06, + "loss": 0.0173, + "step": 4227 + }, + { + "epoch": 3.02, + "grad_norm": 4.811580679582127, + "learning_rate": 1.4998941279165773e-06, + "loss": 0.0158, + "step": 4228 + }, + { + "epoch": 3.02, + "grad_norm": 9.851045271456961, + "learning_rate": 1.4978307930669483e-06, + "loss": 0.024, + "step": 4229 + }, + { + "epoch": 3.02, + "grad_norm": 9.584372906933657, + "learning_rate": 1.4957686283571498e-06, + "loss": 0.0256, + "step": 4230 + }, + { + "epoch": 3.02, + "grad_norm": 6.770213359448928, + "learning_rate": 1.4937076344761858e-06, + "loss": 0.0271, + "step": 4231 + }, + { + "epoch": 3.02, + "grad_norm": 3.7727215155076355, + "learning_rate": 1.4916478121126732e-06, + "loss": 0.0224, + "step": 4232 + }, + { + "epoch": 3.02, + "grad_norm": 1.893841240458152, + "learning_rate": 1.4895891619548374e-06, + "loss": 0.0232, + "step": 4233 + }, + { + "epoch": 3.02, + "grad_norm": 5.161001637713427, + "learning_rate": 1.4875316846905113e-06, + "loss": 0.023, + "step": 4234 + }, + { + "epoch": 3.02, + "grad_norm": 11.061852811785181, + "learning_rate": 1.4854753810071364e-06, + "loss": 0.028, + "step": 4235 + }, + { + "epoch": 3.02, + "grad_norm": 8.109821052482838, + "learning_rate": 1.4834202515917628e-06, + "loss": 0.0312, + "step": 4236 + }, + { + "epoch": 3.02, + "grad_norm": 11.014754015899051, + "learning_rate": 1.4813662971310465e-06, + "loss": 0.032, + "step": 4237 + }, + { + "epoch": 3.02, + "grad_norm": 2.2753130698707484, + "learning_rate": 1.4793135183112523e-06, + "loss": 0.027, + "step": 4238 + }, + { + "epoch": 3.03, + "grad_norm": 4.690230867022709, + "learning_rate": 1.477261915818251e-06, + "loss": 0.0206, + "step": 4239 + }, + { + "epoch": 3.03, + "grad_norm": 5.281295741354212, + "learning_rate": 1.4752114903375243e-06, + "loss": 0.0131, + "step": 4240 + }, + { + "epoch": 3.03, + "grad_norm": 4.3490688386339516, + "learning_rate": 1.473162242554151e-06, + "loss": 0.0299, + "step": 4241 + }, + { + "epoch": 3.03, + "grad_norm": 8.960682672351583, + "learning_rate": 1.47111417315283e-06, + "loss": 0.0282, + "step": 4242 + }, + { + "epoch": 3.03, + "grad_norm": 8.66598782769347, + "learning_rate": 1.4690672828178532e-06, + "loss": 0.0362, + "step": 4243 + }, + { + "epoch": 3.03, + "grad_norm": 5.361898466333456, + "learning_rate": 1.467021572233131e-06, + "loss": 0.0323, + "step": 4244 + }, + { + "epoch": 3.03, + "grad_norm": 3.731815327650255, + "learning_rate": 1.4649770420821663e-06, + "loss": 0.0297, + "step": 4245 + }, + { + "epoch": 3.03, + "grad_norm": 5.073322490267396, + "learning_rate": 1.4629336930480813e-06, + "loss": 0.0276, + "step": 4246 + }, + { + "epoch": 3.03, + "grad_norm": 2.1603771617345915, + "learning_rate": 1.4608915258135914e-06, + "loss": 0.0354, + "step": 4247 + }, + { + "epoch": 3.03, + "grad_norm": 6.080643211063934, + "learning_rate": 1.4588505410610283e-06, + "loss": 0.0316, + "step": 4248 + }, + { + "epoch": 3.03, + "grad_norm": 4.017000843022467, + "learning_rate": 1.4568107394723175e-06, + "loss": 0.0245, + "step": 4249 + }, + { + "epoch": 3.03, + "grad_norm": 2.6670606172944034, + "learning_rate": 1.4547721217289972e-06, + "loss": 0.0292, + "step": 4250 + }, + { + "epoch": 3.03, + "grad_norm": 8.81886940865451, + "learning_rate": 1.4527346885122073e-06, + "loss": 0.0335, + "step": 4251 + }, + { + "epoch": 3.03, + "grad_norm": 6.820384061010361, + "learning_rate": 1.450698440502692e-06, + "loss": 0.022, + "step": 4252 + }, + { + "epoch": 3.04, + "grad_norm": 3.2055510470449873, + "learning_rate": 1.4486633783807997e-06, + "loss": 0.0227, + "step": 4253 + }, + { + "epoch": 3.04, + "grad_norm": 6.073201859461071, + "learning_rate": 1.4466295028264822e-06, + "loss": 0.0244, + "step": 4254 + }, + { + "epoch": 3.04, + "grad_norm": 2.993722905445915, + "learning_rate": 1.4445968145192951e-06, + "loss": 0.023, + "step": 4255 + }, + { + "epoch": 3.04, + "grad_norm": 2.6344293218691632, + "learning_rate": 1.4425653141383977e-06, + "loss": 0.0262, + "step": 4256 + }, + { + "epoch": 3.04, + "grad_norm": 3.3473254018552536, + "learning_rate": 1.4405350023625514e-06, + "loss": 0.0179, + "step": 4257 + }, + { + "epoch": 3.04, + "grad_norm": 2.0005642649862208, + "learning_rate": 1.4385058798701223e-06, + "loss": 0.0231, + "step": 4258 + }, + { + "epoch": 3.04, + "grad_norm": 2.6183840757874544, + "learning_rate": 1.4364779473390767e-06, + "loss": 0.0266, + "step": 4259 + }, + { + "epoch": 3.04, + "grad_norm": 3.7496408204859124, + "learning_rate": 1.4344512054469855e-06, + "loss": 0.0322, + "step": 4260 + }, + { + "epoch": 3.04, + "grad_norm": 5.046695349286583, + "learning_rate": 1.4324256548710202e-06, + "loss": 0.0212, + "step": 4261 + }, + { + "epoch": 3.04, + "grad_norm": 4.538256274604029, + "learning_rate": 1.430401296287955e-06, + "loss": 0.0237, + "step": 4262 + }, + { + "epoch": 3.04, + "grad_norm": 3.4393732090188815, + "learning_rate": 1.4283781303741662e-06, + "loss": 0.0169, + "step": 4263 + }, + { + "epoch": 3.04, + "grad_norm": 4.128065352323907, + "learning_rate": 1.4263561578056307e-06, + "loss": 0.0239, + "step": 4264 + }, + { + "epoch": 3.04, + "grad_norm": 3.7030098567807346, + "learning_rate": 1.4243353792579285e-06, + "loss": 0.028, + "step": 4265 + }, + { + "epoch": 3.04, + "grad_norm": 2.497292358728816, + "learning_rate": 1.4223157954062344e-06, + "loss": 0.0199, + "step": 4266 + }, + { + "epoch": 3.05, + "grad_norm": 6.153106787419073, + "learning_rate": 1.4202974069253362e-06, + "loss": 0.0301, + "step": 4267 + }, + { + "epoch": 3.05, + "grad_norm": 1.5508727812000602, + "learning_rate": 1.418280214489608e-06, + "loss": 0.0194, + "step": 4268 + }, + { + "epoch": 3.05, + "grad_norm": 3.9753775875375252, + "learning_rate": 1.416264218773038e-06, + "loss": 0.0207, + "step": 4269 + }, + { + "epoch": 3.05, + "grad_norm": 4.719703036216151, + "learning_rate": 1.4142494204492007e-06, + "loss": 0.0307, + "step": 4270 + }, + { + "epoch": 3.05, + "grad_norm": 1.8739011534547396, + "learning_rate": 1.412235820191285e-06, + "loss": 0.014, + "step": 4271 + }, + { + "epoch": 3.05, + "grad_norm": 1.6215340748185867, + "learning_rate": 1.4102234186720653e-06, + "loss": 0.0198, + "step": 4272 + }, + { + "epoch": 3.05, + "grad_norm": 1.1853722180193103, + "learning_rate": 1.4082122165639285e-06, + "loss": 0.0182, + "step": 4273 + }, + { + "epoch": 3.05, + "grad_norm": 1.4639779224320038, + "learning_rate": 1.4062022145388503e-06, + "loss": 0.0155, + "step": 4274 + }, + { + "epoch": 3.05, + "grad_norm": 3.0442440975692535, + "learning_rate": 1.4041934132684116e-06, + "loss": 0.034, + "step": 4275 + }, + { + "epoch": 3.05, + "grad_norm": 1.485809174565446, + "learning_rate": 1.4021858134237892e-06, + "loss": 0.0212, + "step": 4276 + }, + { + "epoch": 3.05, + "grad_norm": 2.050713547548853, + "learning_rate": 1.4001794156757598e-06, + "loss": 0.0276, + "step": 4277 + }, + { + "epoch": 3.05, + "grad_norm": 3.065013650984198, + "learning_rate": 1.398174220694699e-06, + "loss": 0.0283, + "step": 4278 + }, + { + "epoch": 3.05, + "grad_norm": 3.4201497917182877, + "learning_rate": 1.3961702291505791e-06, + "loss": 0.0275, + "step": 4279 + }, + { + "epoch": 3.05, + "grad_norm": 2.7541672840549887, + "learning_rate": 1.3941674417129714e-06, + "loss": 0.023, + "step": 4280 + }, + { + "epoch": 3.06, + "grad_norm": 2.246885890731216, + "learning_rate": 1.3921658590510434e-06, + "loss": 0.0336, + "step": 4281 + }, + { + "epoch": 3.06, + "grad_norm": 3.022641932858734, + "learning_rate": 1.3901654818335618e-06, + "loss": 0.0173, + "step": 4282 + }, + { + "epoch": 3.06, + "grad_norm": 6.8132174445899665, + "learning_rate": 1.3881663107288918e-06, + "loss": 0.0199, + "step": 4283 + }, + { + "epoch": 3.06, + "grad_norm": 1.867430595204692, + "learning_rate": 1.386168346404988e-06, + "loss": 0.0254, + "step": 4284 + }, + { + "epoch": 3.06, + "grad_norm": 3.233486849230603, + "learning_rate": 1.3841715895294138e-06, + "loss": 0.0185, + "step": 4285 + }, + { + "epoch": 3.06, + "grad_norm": 4.830562376779117, + "learning_rate": 1.3821760407693175e-06, + "loss": 0.0284, + "step": 4286 + }, + { + "epoch": 3.06, + "grad_norm": 1.5354005181744397, + "learning_rate": 1.3801817007914543e-06, + "loss": 0.0244, + "step": 4287 + }, + { + "epoch": 3.06, + "grad_norm": 3.8781541269057374, + "learning_rate": 1.3781885702621644e-06, + "loss": 0.0317, + "step": 4288 + }, + { + "epoch": 3.06, + "grad_norm": 3.305553379082568, + "learning_rate": 1.3761966498473956e-06, + "loss": 0.017, + "step": 4289 + }, + { + "epoch": 3.06, + "grad_norm": 5.335432721094165, + "learning_rate": 1.3742059402126818e-06, + "loss": 0.0297, + "step": 4290 + }, + { + "epoch": 3.06, + "grad_norm": 3.030662176498976, + "learning_rate": 1.3722164420231565e-06, + "loss": 0.0234, + "step": 4291 + }, + { + "epoch": 3.06, + "grad_norm": 2.2382330008234748, + "learning_rate": 1.370228155943548e-06, + "loss": 0.0209, + "step": 4292 + }, + { + "epoch": 3.06, + "grad_norm": 2.1132694656038105, + "learning_rate": 1.3682410826381816e-06, + "loss": 0.0181, + "step": 4293 + }, + { + "epoch": 3.06, + "grad_norm": 2.882412146996378, + "learning_rate": 1.366255222770973e-06, + "loss": 0.0223, + "step": 4294 + }, + { + "epoch": 3.07, + "grad_norm": 4.0879767931219435, + "learning_rate": 1.364270577005436e-06, + "loss": 0.0446, + "step": 4295 + }, + { + "epoch": 3.07, + "grad_norm": 3.898872989029239, + "learning_rate": 1.3622871460046778e-06, + "loss": 0.0242, + "step": 4296 + }, + { + "epoch": 3.07, + "grad_norm": 2.6033162498479463, + "learning_rate": 1.3603049304313992e-06, + "loss": 0.0349, + "step": 4297 + }, + { + "epoch": 3.07, + "grad_norm": 3.5165832344769536, + "learning_rate": 1.3583239309478953e-06, + "loss": 0.0233, + "step": 4298 + }, + { + "epoch": 3.07, + "grad_norm": 6.573066298874758, + "learning_rate": 1.3563441482160562e-06, + "loss": 0.0214, + "step": 4299 + }, + { + "epoch": 3.07, + "grad_norm": 2.235003481206468, + "learning_rate": 1.35436558289736e-06, + "loss": 0.0264, + "step": 4300 + }, + { + "epoch": 3.07, + "grad_norm": 5.948089488451964, + "learning_rate": 1.3523882356528883e-06, + "loss": 0.0163, + "step": 4301 + }, + { + "epoch": 3.07, + "grad_norm": 7.51398793823194, + "learning_rate": 1.350412107143303e-06, + "loss": 0.031, + "step": 4302 + }, + { + "epoch": 3.07, + "grad_norm": 2.0676717040990757, + "learning_rate": 1.3484371980288712e-06, + "loss": 0.0195, + "step": 4303 + }, + { + "epoch": 3.07, + "grad_norm": 2.0443453458688623, + "learning_rate": 1.3464635089694416e-06, + "loss": 0.0204, + "step": 4304 + }, + { + "epoch": 3.07, + "grad_norm": 2.4356846280920754, + "learning_rate": 1.344491040624466e-06, + "loss": 0.0255, + "step": 4305 + }, + { + "epoch": 3.07, + "grad_norm": 4.725705710515471, + "learning_rate": 1.3425197936529766e-06, + "loss": 0.0226, + "step": 4306 + }, + { + "epoch": 3.07, + "grad_norm": 4.276820785445097, + "learning_rate": 1.3405497687136098e-06, + "loss": 0.0295, + "step": 4307 + }, + { + "epoch": 3.07, + "grad_norm": 1.4908882433698298, + "learning_rate": 1.3385809664645827e-06, + "loss": 0.0192, + "step": 4308 + }, + { + "epoch": 3.08, + "grad_norm": 2.5380484947293613, + "learning_rate": 1.336613387563711e-06, + "loss": 0.0266, + "step": 4309 + }, + { + "epoch": 3.08, + "grad_norm": 2.2945673276968894, + "learning_rate": 1.3346470326683986e-06, + "loss": 0.0289, + "step": 4310 + }, + { + "epoch": 3.08, + "grad_norm": 2.4304115620560154, + "learning_rate": 1.3326819024356413e-06, + "loss": 0.0208, + "step": 4311 + }, + { + "epoch": 3.08, + "grad_norm": 2.8002395851902047, + "learning_rate": 1.3307179975220264e-06, + "loss": 0.0295, + "step": 4312 + }, + { + "epoch": 3.08, + "grad_norm": 2.275129136772214, + "learning_rate": 1.3287553185837298e-06, + "loss": 0.0194, + "step": 4313 + }, + { + "epoch": 3.08, + "grad_norm": 7.423216970876992, + "learning_rate": 1.3267938662765206e-06, + "loss": 0.0204, + "step": 4314 + }, + { + "epoch": 3.08, + "grad_norm": 2.6068477417666607, + "learning_rate": 1.324833641255755e-06, + "loss": 0.0209, + "step": 4315 + }, + { + "epoch": 3.08, + "grad_norm": 2.474988243701278, + "learning_rate": 1.3228746441763813e-06, + "loss": 0.0143, + "step": 4316 + }, + { + "epoch": 3.08, + "grad_norm": 2.441680844619571, + "learning_rate": 1.3209168756929363e-06, + "loss": 0.0305, + "step": 4317 + }, + { + "epoch": 3.08, + "grad_norm": 7.651310891463135, + "learning_rate": 1.3189603364595483e-06, + "loss": 0.0259, + "step": 4318 + }, + { + "epoch": 3.08, + "grad_norm": 1.8489347124819762, + "learning_rate": 1.3170050271299316e-06, + "loss": 0.0206, + "step": 4319 + }, + { + "epoch": 3.08, + "grad_norm": 2.050853437133708, + "learning_rate": 1.315050948357392e-06, + "loss": 0.0201, + "step": 4320 + }, + { + "epoch": 3.08, + "grad_norm": 3.8678126664888266, + "learning_rate": 1.3130981007948247e-06, + "loss": 0.0286, + "step": 4321 + }, + { + "epoch": 3.08, + "grad_norm": 3.5527708798067263, + "learning_rate": 1.3111464850947103e-06, + "loss": 0.0284, + "step": 4322 + }, + { + "epoch": 3.09, + "grad_norm": 6.799812210909939, + "learning_rate": 1.3091961019091216e-06, + "loss": 0.0285, + "step": 4323 + }, + { + "epoch": 3.09, + "grad_norm": 3.718954096429571, + "learning_rate": 1.3072469518897184e-06, + "loss": 0.0201, + "step": 4324 + }, + { + "epoch": 3.09, + "grad_norm": 7.454710537462165, + "learning_rate": 1.3052990356877444e-06, + "loss": 0.0204, + "step": 4325 + }, + { + "epoch": 3.09, + "grad_norm": 5.810987162547036, + "learning_rate": 1.3033523539540394e-06, + "loss": 0.0218, + "step": 4326 + }, + { + "epoch": 3.09, + "grad_norm": 2.6929964518246527, + "learning_rate": 1.3014069073390206e-06, + "loss": 0.0237, + "step": 4327 + }, + { + "epoch": 3.09, + "grad_norm": 1.6508887017292961, + "learning_rate": 1.2994626964927042e-06, + "loss": 0.0233, + "step": 4328 + }, + { + "epoch": 3.09, + "grad_norm": 1.694127839814189, + "learning_rate": 1.2975197220646807e-06, + "loss": 0.0146, + "step": 4329 + }, + { + "epoch": 3.09, + "grad_norm": 8.55682892697071, + "learning_rate": 1.29557798470414e-06, + "loss": 0.0198, + "step": 4330 + }, + { + "epoch": 3.09, + "grad_norm": 4.464695362177069, + "learning_rate": 1.293637485059847e-06, + "loss": 0.0292, + "step": 4331 + }, + { + "epoch": 3.09, + "grad_norm": 1.3157782950293537, + "learning_rate": 1.291698223780164e-06, + "loss": 0.0184, + "step": 4332 + }, + { + "epoch": 3.09, + "grad_norm": 2.6685124060299343, + "learning_rate": 1.2897602015130306e-06, + "loss": 0.0253, + "step": 4333 + }, + { + "epoch": 3.09, + "grad_norm": 3.4823603776992815, + "learning_rate": 1.287823418905977e-06, + "loss": 0.0304, + "step": 4334 + }, + { + "epoch": 3.09, + "grad_norm": 5.911506432130707, + "learning_rate": 1.2858878766061178e-06, + "loss": 0.0257, + "step": 4335 + }, + { + "epoch": 3.09, + "grad_norm": 2.086775123511386, + "learning_rate": 1.2839535752601551e-06, + "loss": 0.0324, + "step": 4336 + }, + { + "epoch": 3.1, + "grad_norm": 2.029837014664569, + "learning_rate": 1.2820205155143738e-06, + "loss": 0.0204, + "step": 4337 + }, + { + "epoch": 3.1, + "grad_norm": 7.437040162360332, + "learning_rate": 1.2800886980146453e-06, + "loss": 0.0263, + "step": 4338 + }, + { + "epoch": 3.1, + "grad_norm": 12.488964061218802, + "learning_rate": 1.2781581234064256e-06, + "loss": 0.033, + "step": 4339 + }, + { + "epoch": 3.1, + "grad_norm": 5.81702543197951, + "learning_rate": 1.276228792334756e-06, + "loss": 0.0384, + "step": 4340 + }, + { + "epoch": 3.1, + "grad_norm": 2.7625144524398144, + "learning_rate": 1.274300705444262e-06, + "loss": 0.0206, + "step": 4341 + }, + { + "epoch": 3.1, + "grad_norm": 1.9870680625000081, + "learning_rate": 1.2723738633791538e-06, + "loss": 0.0257, + "step": 4342 + }, + { + "epoch": 3.1, + "grad_norm": 1.9775795029648342, + "learning_rate": 1.2704482667832218e-06, + "loss": 0.0261, + "step": 4343 + }, + { + "epoch": 3.1, + "grad_norm": 7.743239827097478, + "learning_rate": 1.2685239162998485e-06, + "loss": 0.026, + "step": 4344 + }, + { + "epoch": 3.1, + "grad_norm": 3.6982912516993784, + "learning_rate": 1.2666008125719904e-06, + "loss": 0.0281, + "step": 4345 + }, + { + "epoch": 3.1, + "grad_norm": 5.896769242939257, + "learning_rate": 1.2646789562421975e-06, + "loss": 0.0206, + "step": 4346 + }, + { + "epoch": 3.1, + "grad_norm": 5.284872574990886, + "learning_rate": 1.2627583479525913e-06, + "loss": 0.0313, + "step": 4347 + }, + { + "epoch": 3.1, + "grad_norm": 6.165105344259164, + "learning_rate": 1.2608389883448896e-06, + "loss": 0.0283, + "step": 4348 + }, + { + "epoch": 3.1, + "grad_norm": 3.7245284863056223, + "learning_rate": 1.2589208780603795e-06, + "loss": 0.025, + "step": 4349 + }, + { + "epoch": 3.1, + "grad_norm": 2.567851032035511, + "learning_rate": 1.2570040177399435e-06, + "loss": 0.0274, + "step": 4350 + }, + { + "epoch": 3.11, + "grad_norm": 4.0847654157063555, + "learning_rate": 1.255088408024036e-06, + "loss": 0.0256, + "step": 4351 + }, + { + "epoch": 3.11, + "grad_norm": 4.834804601853921, + "learning_rate": 1.2531740495526989e-06, + "loss": 0.0204, + "step": 4352 + }, + { + "epoch": 3.11, + "grad_norm": 3.9081054310744254, + "learning_rate": 1.2512609429655553e-06, + "loss": 0.0241, + "step": 4353 + }, + { + "epoch": 3.11, + "grad_norm": 3.942714359609263, + "learning_rate": 1.249349088901809e-06, + "loss": 0.0239, + "step": 4354 + }, + { + "epoch": 3.11, + "grad_norm": 6.164893142313988, + "learning_rate": 1.247438488000247e-06, + "loss": 0.0227, + "step": 4355 + }, + { + "epoch": 3.11, + "grad_norm": 6.8002871593043235, + "learning_rate": 1.245529140899236e-06, + "loss": 0.0285, + "step": 4356 + }, + { + "epoch": 3.11, + "grad_norm": 14.128621503244876, + "learning_rate": 1.2436210482367245e-06, + "loss": 0.0278, + "step": 4357 + }, + { + "epoch": 3.11, + "grad_norm": 4.892012988671743, + "learning_rate": 1.2417142106502418e-06, + "loss": 0.0212, + "step": 4358 + }, + { + "epoch": 3.11, + "grad_norm": 6.258033264830363, + "learning_rate": 1.2398086287768969e-06, + "loss": 0.0276, + "step": 4359 + }, + { + "epoch": 3.11, + "grad_norm": 2.533324048190474, + "learning_rate": 1.237904303253381e-06, + "loss": 0.0209, + "step": 4360 + }, + { + "epoch": 3.11, + "grad_norm": 7.187459746139216, + "learning_rate": 1.236001234715965e-06, + "loss": 0.021, + "step": 4361 + }, + { + "epoch": 3.11, + "grad_norm": 3.648792787441624, + "learning_rate": 1.2340994238004987e-06, + "loss": 0.0202, + "step": 4362 + }, + { + "epoch": 3.11, + "grad_norm": 8.691909830938794, + "learning_rate": 1.2321988711424132e-06, + "loss": 0.0224, + "step": 4363 + }, + { + "epoch": 3.11, + "grad_norm": 6.995988388162933, + "learning_rate": 1.2302995773767174e-06, + "loss": 0.0349, + "step": 4364 + }, + { + "epoch": 3.12, + "grad_norm": 11.008840944917822, + "learning_rate": 1.2284015431380015e-06, + "loss": 0.0285, + "step": 4365 + }, + { + "epoch": 3.12, + "grad_norm": 3.373639756983513, + "learning_rate": 1.2265047690604354e-06, + "loss": 0.0184, + "step": 4366 + }, + { + "epoch": 3.12, + "grad_norm": 3.803315273323329, + "learning_rate": 1.2246092557777633e-06, + "loss": 0.0263, + "step": 4367 + }, + { + "epoch": 3.12, + "grad_norm": 3.4078700572104674, + "learning_rate": 1.2227150039233132e-06, + "loss": 0.032, + "step": 4368 + }, + { + "epoch": 3.12, + "grad_norm": 3.581845625792135, + "learning_rate": 1.2208220141299893e-06, + "loss": 0.0294, + "step": 4369 + }, + { + "epoch": 3.12, + "grad_norm": 10.077585671022081, + "learning_rate": 1.2189302870302755e-06, + "loss": 0.0233, + "step": 4370 + }, + { + "epoch": 3.12, + "grad_norm": 6.127715448474595, + "learning_rate": 1.2170398232562324e-06, + "loss": 0.0158, + "step": 4371 + }, + { + "epoch": 3.12, + "grad_norm": 8.996189920092641, + "learning_rate": 1.2151506234395e-06, + "loss": 0.0471, + "step": 4372 + }, + { + "epoch": 3.12, + "grad_norm": 8.082755597103926, + "learning_rate": 1.2132626882112935e-06, + "loss": 0.0246, + "step": 4373 + }, + { + "epoch": 3.12, + "grad_norm": 4.081561675170032, + "learning_rate": 1.211376018202408e-06, + "loss": 0.0271, + "step": 4374 + }, + { + "epoch": 3.12, + "grad_norm": 7.991360997398173, + "learning_rate": 1.2094906140432155e-06, + "loss": 0.0232, + "step": 4375 + }, + { + "epoch": 3.12, + "grad_norm": 5.043322011112299, + "learning_rate": 1.2076064763636641e-06, + "loss": 0.0167, + "step": 4376 + }, + { + "epoch": 3.12, + "grad_norm": 3.5060059412563933, + "learning_rate": 1.205723605793279e-06, + "loss": 0.0177, + "step": 4377 + }, + { + "epoch": 3.12, + "grad_norm": 3.3044991967413786, + "learning_rate": 1.2038420029611625e-06, + "loss": 0.0185, + "step": 4378 + }, + { + "epoch": 3.13, + "grad_norm": 2.5519361887925407, + "learning_rate": 1.2019616684959934e-06, + "loss": 0.0158, + "step": 4379 + }, + { + "epoch": 3.13, + "grad_norm": 1.5711224232775167, + "learning_rate": 1.2000826030260254e-06, + "loss": 0.0196, + "step": 4380 + }, + { + "epoch": 3.13, + "grad_norm": 3.845792898604567, + "learning_rate": 1.1982048071790903e-06, + "loss": 0.0327, + "step": 4381 + }, + { + "epoch": 3.13, + "grad_norm": 1.807799902555709, + "learning_rate": 1.1963282815825938e-06, + "loss": 0.0217, + "step": 4382 + }, + { + "epoch": 3.13, + "grad_norm": 3.5441110376934706, + "learning_rate": 1.194453026863519e-06, + "loss": 0.0205, + "step": 4383 + }, + { + "epoch": 3.13, + "grad_norm": 2.2644135155675387, + "learning_rate": 1.1925790436484219e-06, + "loss": 0.0182, + "step": 4384 + }, + { + "epoch": 3.13, + "grad_norm": 7.720254102788194, + "learning_rate": 1.1907063325634376e-06, + "loss": 0.0249, + "step": 4385 + }, + { + "epoch": 3.13, + "grad_norm": 2.967984156703408, + "learning_rate": 1.1888348942342697e-06, + "loss": 0.0239, + "step": 4386 + }, + { + "epoch": 3.13, + "grad_norm": 5.143287739884021, + "learning_rate": 1.1869647292862051e-06, + "loss": 0.0336, + "step": 4387 + }, + { + "epoch": 3.13, + "grad_norm": 6.7684831711996045, + "learning_rate": 1.1850958383440957e-06, + "loss": 0.0294, + "step": 4388 + }, + { + "epoch": 3.13, + "grad_norm": 8.714645017881613, + "learning_rate": 1.183228222032378e-06, + "loss": 0.0316, + "step": 4389 + }, + { + "epoch": 3.13, + "grad_norm": 5.477500456705119, + "learning_rate": 1.181361880975052e-06, + "loss": 0.0294, + "step": 4390 + }, + { + "epoch": 3.13, + "grad_norm": 3.500435025541258, + "learning_rate": 1.1794968157957026e-06, + "loss": 0.0218, + "step": 4391 + }, + { + "epoch": 3.13, + "grad_norm": 4.026778952227533, + "learning_rate": 1.1776330271174786e-06, + "loss": 0.0235, + "step": 4392 + }, + { + "epoch": 3.14, + "grad_norm": 1.910801405729195, + "learning_rate": 1.1757705155631072e-06, + "loss": 0.0224, + "step": 4393 + }, + { + "epoch": 3.14, + "grad_norm": 4.7870288481139776, + "learning_rate": 1.1739092817548887e-06, + "loss": 0.0263, + "step": 4394 + }, + { + "epoch": 3.14, + "grad_norm": 3.7988605929737616, + "learning_rate": 1.172049326314696e-06, + "loss": 0.0221, + "step": 4395 + }, + { + "epoch": 3.14, + "grad_norm": 4.220102301388364, + "learning_rate": 1.1701906498639741e-06, + "loss": 0.0193, + "step": 4396 + }, + { + "epoch": 3.14, + "grad_norm": 1.7825031992726328, + "learning_rate": 1.1683332530237423e-06, + "loss": 0.0278, + "step": 4397 + }, + { + "epoch": 3.14, + "grad_norm": 2.9360054998312166, + "learning_rate": 1.1664771364145905e-06, + "loss": 0.0293, + "step": 4398 + }, + { + "epoch": 3.14, + "grad_norm": 4.303354411436024, + "learning_rate": 1.1646223006566827e-06, + "loss": 0.0363, + "step": 4399 + }, + { + "epoch": 3.14, + "grad_norm": 2.194395308626148, + "learning_rate": 1.162768746369753e-06, + "loss": 0.0237, + "step": 4400 + }, + { + "epoch": 3.14, + "grad_norm": 2.116716075883209, + "learning_rate": 1.1609164741731105e-06, + "loss": 0.0273, + "step": 4401 + }, + { + "epoch": 3.14, + "grad_norm": 3.210931319165583, + "learning_rate": 1.1590654846856291e-06, + "loss": 0.0263, + "step": 4402 + }, + { + "epoch": 3.14, + "grad_norm": 2.119091545167854, + "learning_rate": 1.1572157785257643e-06, + "loss": 0.0143, + "step": 4403 + }, + { + "epoch": 3.14, + "grad_norm": 4.324074704437676, + "learning_rate": 1.1553673563115325e-06, + "loss": 0.0336, + "step": 4404 + }, + { + "epoch": 3.14, + "grad_norm": 4.170772991278056, + "learning_rate": 1.153520218660531e-06, + "loss": 0.038, + "step": 4405 + }, + { + "epoch": 3.14, + "grad_norm": 4.213138084530538, + "learning_rate": 1.1516743661899172e-06, + "loss": 0.0237, + "step": 4406 + }, + { + "epoch": 3.15, + "grad_norm": 11.656757681338371, + "learning_rate": 1.1498297995164305e-06, + "loss": 0.0331, + "step": 4407 + }, + { + "epoch": 3.15, + "grad_norm": 3.4983502997263547, + "learning_rate": 1.1479865192563683e-06, + "loss": 0.0318, + "step": 4408 + }, + { + "epoch": 3.15, + "grad_norm": 2.8807252835615254, + "learning_rate": 1.146144526025612e-06, + "loss": 0.0242, + "step": 4409 + }, + { + "epoch": 3.15, + "grad_norm": 2.6306843568728038, + "learning_rate": 1.1443038204396007e-06, + "loss": 0.0258, + "step": 4410 + }, + { + "epoch": 3.15, + "grad_norm": 4.192920419278408, + "learning_rate": 1.1424644031133502e-06, + "loss": 0.0187, + "step": 4411 + }, + { + "epoch": 3.15, + "grad_norm": 7.177668219309729, + "learning_rate": 1.1406262746614433e-06, + "loss": 0.026, + "step": 4412 + }, + { + "epoch": 3.15, + "grad_norm": 4.873333282003289, + "learning_rate": 1.1387894356980334e-06, + "loss": 0.0289, + "step": 4413 + }, + { + "epoch": 3.15, + "grad_norm": 2.9519728668908227, + "learning_rate": 1.1369538868368424e-06, + "loss": 0.0196, + "step": 4414 + }, + { + "epoch": 3.15, + "grad_norm": 3.8848350672996834, + "learning_rate": 1.1351196286911615e-06, + "loss": 0.0194, + "step": 4415 + }, + { + "epoch": 3.15, + "grad_norm": 4.635840605259712, + "learning_rate": 1.1332866618738498e-06, + "loss": 0.0232, + "step": 4416 + }, + { + "epoch": 3.15, + "grad_norm": 1.5581337612715147, + "learning_rate": 1.1314549869973363e-06, + "loss": 0.0232, + "step": 4417 + }, + { + "epoch": 3.15, + "grad_norm": 1.7583630718679997, + "learning_rate": 1.1296246046736176e-06, + "loss": 0.0185, + "step": 4418 + }, + { + "epoch": 3.15, + "grad_norm": 6.283893780931881, + "learning_rate": 1.1277955155142578e-06, + "loss": 0.0232, + "step": 4419 + }, + { + "epoch": 3.15, + "grad_norm": 7.862048061035702, + "learning_rate": 1.1259677201303905e-06, + "loss": 0.0239, + "step": 4420 + }, + { + "epoch": 3.16, + "grad_norm": 1.6230472483193787, + "learning_rate": 1.1241412191327155e-06, + "loss": 0.0157, + "step": 4421 + }, + { + "epoch": 3.16, + "grad_norm": 2.755073615396238, + "learning_rate": 1.1223160131315008e-06, + "loss": 0.0298, + "step": 4422 + }, + { + "epoch": 3.16, + "grad_norm": 2.73806500409641, + "learning_rate": 1.1204921027365818e-06, + "loss": 0.03, + "step": 4423 + }, + { + "epoch": 3.16, + "grad_norm": 3.1203954227660895, + "learning_rate": 1.1186694885573602e-06, + "loss": 0.0197, + "step": 4424 + }, + { + "epoch": 3.16, + "grad_norm": 3.5066645012673496, + "learning_rate": 1.1168481712028061e-06, + "loss": 0.0297, + "step": 4425 + }, + { + "epoch": 3.16, + "grad_norm": 4.2675581356015915, + "learning_rate": 1.115028151281457e-06, + "loss": 0.022, + "step": 4426 + }, + { + "epoch": 3.16, + "grad_norm": 3.4456080030453426, + "learning_rate": 1.1132094294014106e-06, + "loss": 0.0126, + "step": 4427 + }, + { + "epoch": 3.16, + "grad_norm": 6.579809040631279, + "learning_rate": 1.1113920061703416e-06, + "loss": 0.0318, + "step": 4428 + }, + { + "epoch": 3.16, + "grad_norm": 2.475266437209171, + "learning_rate": 1.1095758821954788e-06, + "loss": 0.0166, + "step": 4429 + }, + { + "epoch": 3.16, + "grad_norm": 2.0190295382576133, + "learning_rate": 1.107761058083629e-06, + "loss": 0.0327, + "step": 4430 + }, + { + "epoch": 3.16, + "grad_norm": 1.9898004038566273, + "learning_rate": 1.1059475344411535e-06, + "loss": 0.0185, + "step": 4431 + }, + { + "epoch": 3.16, + "grad_norm": 3.6520762370965887, + "learning_rate": 1.104135311873989e-06, + "loss": 0.012, + "step": 4432 + }, + { + "epoch": 3.16, + "grad_norm": 1.790944349770655, + "learning_rate": 1.1023243909876275e-06, + "loss": 0.0175, + "step": 4433 + }, + { + "epoch": 3.16, + "grad_norm": 1.4622862012678017, + "learning_rate": 1.1005147723871374e-06, + "loss": 0.0174, + "step": 4434 + }, + { + "epoch": 3.17, + "grad_norm": 2.29471929364547, + "learning_rate": 1.0987064566771405e-06, + "loss": 0.021, + "step": 4435 + }, + { + "epoch": 3.17, + "grad_norm": 6.60079633509637, + "learning_rate": 1.0968994444618313e-06, + "loss": 0.0331, + "step": 4436 + }, + { + "epoch": 3.17, + "grad_norm": 1.9722105559545458, + "learning_rate": 1.0950937363449659e-06, + "loss": 0.0176, + "step": 4437 + }, + { + "epoch": 3.17, + "grad_norm": 9.226928463154826, + "learning_rate": 1.0932893329298643e-06, + "loss": 0.0339, + "step": 4438 + }, + { + "epoch": 3.17, + "grad_norm": 2.7596617157692784, + "learning_rate": 1.0914862348194121e-06, + "loss": 0.0234, + "step": 4439 + }, + { + "epoch": 3.17, + "grad_norm": 5.353123299645156, + "learning_rate": 1.0896844426160575e-06, + "loss": 0.0313, + "step": 4440 + }, + { + "epoch": 3.17, + "grad_norm": 7.939115234782665, + "learning_rate": 1.0878839569218124e-06, + "loss": 0.0187, + "step": 4441 + }, + { + "epoch": 3.17, + "grad_norm": 3.5070933721087356, + "learning_rate": 1.0860847783382534e-06, + "loss": 0.0319, + "step": 4442 + }, + { + "epoch": 3.17, + "grad_norm": 7.566534679772082, + "learning_rate": 1.0842869074665186e-06, + "loss": 0.0226, + "step": 4443 + }, + { + "epoch": 3.17, + "grad_norm": 2.5892901109879563, + "learning_rate": 1.0824903449073115e-06, + "loss": 0.0241, + "step": 4444 + }, + { + "epoch": 3.17, + "grad_norm": 4.0522225755260655, + "learning_rate": 1.0806950912608937e-06, + "loss": 0.0407, + "step": 4445 + }, + { + "epoch": 3.17, + "grad_norm": 4.042715326868905, + "learning_rate": 1.0789011471270983e-06, + "loss": 0.0237, + "step": 4446 + }, + { + "epoch": 3.17, + "grad_norm": 5.5516387804921985, + "learning_rate": 1.0771085131053087e-06, + "loss": 0.0225, + "step": 4447 + }, + { + "epoch": 3.17, + "grad_norm": 3.6164402815909362, + "learning_rate": 1.0753171897944835e-06, + "loss": 0.0215, + "step": 4448 + }, + { + "epoch": 3.18, + "grad_norm": 6.6222474603434405, + "learning_rate": 1.0735271777931322e-06, + "loss": 0.022, + "step": 4449 + }, + { + "epoch": 3.18, + "grad_norm": 2.592791914105326, + "learning_rate": 1.0717384776993356e-06, + "loss": 0.0322, + "step": 4450 + }, + { + "epoch": 3.18, + "grad_norm": 5.227418309409531, + "learning_rate": 1.069951090110728e-06, + "loss": 0.0263, + "step": 4451 + }, + { + "epoch": 3.18, + "grad_norm": 2.77396491744881, + "learning_rate": 1.06816501562451e-06, + "loss": 0.0195, + "step": 4452 + }, + { + "epoch": 3.18, + "grad_norm": 1.8546536944682173, + "learning_rate": 1.0663802548374424e-06, + "loss": 0.0214, + "step": 4453 + }, + { + "epoch": 3.18, + "grad_norm": 3.6321220341074585, + "learning_rate": 1.064596808345847e-06, + "loss": 0.0237, + "step": 4454 + }, + { + "epoch": 3.18, + "grad_norm": 9.088425273383129, + "learning_rate": 1.0628146767456066e-06, + "loss": 0.031, + "step": 4455 + }, + { + "epoch": 3.18, + "grad_norm": 2.244296281743473, + "learning_rate": 1.061033860632164e-06, + "loss": 0.0244, + "step": 4456 + }, + { + "epoch": 3.18, + "grad_norm": 1.8747595056637634, + "learning_rate": 1.0592543606005235e-06, + "loss": 0.0254, + "step": 4457 + }, + { + "epoch": 3.18, + "grad_norm": 3.2313945845341676, + "learning_rate": 1.0574761772452486e-06, + "loss": 0.0342, + "step": 4458 + }, + { + "epoch": 3.18, + "grad_norm": 2.0815746855400103, + "learning_rate": 1.0556993111604635e-06, + "loss": 0.0226, + "step": 4459 + }, + { + "epoch": 3.18, + "grad_norm": 8.359714562424623, + "learning_rate": 1.0539237629398536e-06, + "loss": 0.0335, + "step": 4460 + }, + { + "epoch": 3.18, + "grad_norm": 3.709979629335406, + "learning_rate": 1.052149533176659e-06, + "loss": 0.0193, + "step": 4461 + }, + { + "epoch": 3.18, + "grad_norm": 2.239188178419034, + "learning_rate": 1.050376622463688e-06, + "loss": 0.0217, + "step": 4462 + }, + { + "epoch": 3.19, + "grad_norm": 4.238067345733504, + "learning_rate": 1.0486050313932972e-06, + "loss": 0.0246, + "step": 4463 + }, + { + "epoch": 3.19, + "grad_norm": 2.8338637835412612, + "learning_rate": 1.0468347605574137e-06, + "loss": 0.0266, + "step": 4464 + }, + { + "epoch": 3.19, + "grad_norm": 4.869316442569333, + "learning_rate": 1.0450658105475126e-06, + "loss": 0.0232, + "step": 4465 + }, + { + "epoch": 3.19, + "grad_norm": 3.5665750122251625, + "learning_rate": 1.0432981819546384e-06, + "loss": 0.0309, + "step": 4466 + }, + { + "epoch": 3.19, + "grad_norm": 8.936241009064924, + "learning_rate": 1.0415318753693837e-06, + "loss": 0.0236, + "step": 4467 + }, + { + "epoch": 3.19, + "grad_norm": 3.101588523930688, + "learning_rate": 1.0397668913819086e-06, + "loss": 0.0239, + "step": 4468 + }, + { + "epoch": 3.19, + "grad_norm": 4.295993293606268, + "learning_rate": 1.0380032305819243e-06, + "loss": 0.0223, + "step": 4469 + }, + { + "epoch": 3.19, + "grad_norm": 2.3409226213317025, + "learning_rate": 1.0362408935587026e-06, + "loss": 0.0309, + "step": 4470 + }, + { + "epoch": 3.19, + "grad_norm": 2.8493429852061487, + "learning_rate": 1.0344798809010748e-06, + "loss": 0.0246, + "step": 4471 + }, + { + "epoch": 3.19, + "grad_norm": 4.991729680693538, + "learning_rate": 1.0327201931974262e-06, + "loss": 0.0165, + "step": 4472 + }, + { + "epoch": 3.19, + "grad_norm": 1.8954580681443631, + "learning_rate": 1.0309618310357023e-06, + "loss": 0.0196, + "step": 4473 + }, + { + "epoch": 3.19, + "grad_norm": 10.27608130286604, + "learning_rate": 1.0292047950034046e-06, + "loss": 0.0312, + "step": 4474 + }, + { + "epoch": 3.19, + "grad_norm": 2.153159817327546, + "learning_rate": 1.0274490856875908e-06, + "loss": 0.0109, + "step": 4475 + }, + { + "epoch": 3.19, + "grad_norm": 1.5007061694216104, + "learning_rate": 1.0256947036748766e-06, + "loss": 0.0186, + "step": 4476 + }, + { + "epoch": 3.2, + "grad_norm": 1.7574699954655115, + "learning_rate": 1.0239416495514331e-06, + "loss": 0.0267, + "step": 4477 + }, + { + "epoch": 3.2, + "grad_norm": 3.1371640873189954, + "learning_rate": 1.0221899239029887e-06, + "loss": 0.0185, + "step": 4478 + }, + { + "epoch": 3.2, + "grad_norm": 5.019779692114384, + "learning_rate": 1.0204395273148277e-06, + "loss": 0.0157, + "step": 4479 + }, + { + "epoch": 3.2, + "grad_norm": 7.540699269714138, + "learning_rate": 1.0186904603717894e-06, + "loss": 0.027, + "step": 4480 + }, + { + "epoch": 3.2, + "grad_norm": 3.9237182973169595, + "learning_rate": 1.0169427236582702e-06, + "loss": 0.0377, + "step": 4481 + }, + { + "epoch": 3.2, + "grad_norm": 4.278480950923269, + "learning_rate": 1.0151963177582208e-06, + "loss": 0.0245, + "step": 4482 + }, + { + "epoch": 3.2, + "grad_norm": 2.312231473624651, + "learning_rate": 1.0134512432551492e-06, + "loss": 0.0183, + "step": 4483 + }, + { + "epoch": 3.2, + "grad_norm": 2.0986535225249416, + "learning_rate": 1.0117075007321152e-06, + "loss": 0.0143, + "step": 4484 + }, + { + "epoch": 3.2, + "grad_norm": 5.975648412284588, + "learning_rate": 1.009965090771739e-06, + "loss": 0.0197, + "step": 4485 + }, + { + "epoch": 3.2, + "grad_norm": 4.374236320725866, + "learning_rate": 1.0082240139561866e-06, + "loss": 0.0254, + "step": 4486 + }, + { + "epoch": 3.2, + "grad_norm": 6.532426395412988, + "learning_rate": 1.0064842708671908e-06, + "loss": 0.0208, + "step": 4487 + }, + { + "epoch": 3.2, + "grad_norm": 3.6018002154036113, + "learning_rate": 1.0047458620860251e-06, + "loss": 0.0195, + "step": 4488 + }, + { + "epoch": 3.2, + "grad_norm": 2.4807601063417293, + "learning_rate": 1.0030087881935308e-06, + "loss": 0.0158, + "step": 4489 + }, + { + "epoch": 3.2, + "grad_norm": 3.4131438838142536, + "learning_rate": 1.0012730497700912e-06, + "loss": 0.0293, + "step": 4490 + }, + { + "epoch": 3.21, + "grad_norm": 7.077550750630359, + "learning_rate": 9.995386473956531e-07, + "loss": 0.0251, + "step": 4491 + }, + { + "epoch": 3.21, + "grad_norm": 2.258377721601201, + "learning_rate": 9.978055816497084e-07, + "loss": 0.0227, + "step": 4492 + }, + { + "epoch": 3.21, + "grad_norm": 2.229914063381325, + "learning_rate": 9.960738531113118e-07, + "loss": 0.0209, + "step": 4493 + }, + { + "epoch": 3.21, + "grad_norm": 2.1564691341307487, + "learning_rate": 9.94343462359061e-07, + "loss": 0.014, + "step": 4494 + }, + { + "epoch": 3.21, + "grad_norm": 5.382295516557594, + "learning_rate": 9.926144099711138e-07, + "loss": 0.0277, + "step": 4495 + }, + { + "epoch": 3.21, + "grad_norm": 2.516122869741995, + "learning_rate": 9.90886696525179e-07, + "loss": 0.022, + "step": 4496 + }, + { + "epoch": 3.21, + "grad_norm": 7.685796145048834, + "learning_rate": 9.89160322598517e-07, + "loss": 0.0294, + "step": 4497 + }, + { + "epoch": 3.21, + "grad_norm": 2.327309331754003, + "learning_rate": 9.874352887679416e-07, + "loss": 0.0273, + "step": 4498 + }, + { + "epoch": 3.21, + "grad_norm": 2.8309636158087965, + "learning_rate": 9.857115956098196e-07, + "loss": 0.0287, + "step": 4499 + }, + { + "epoch": 3.21, + "grad_norm": 3.2811544904994556, + "learning_rate": 9.839892437000675e-07, + "loss": 0.0135, + "step": 4500 + }, + { + "epoch": 3.21, + "eval_avg_AUC": 0.8345286134092101, + "eval_avg_Accuracy": 0.7458968832891246, + "eval_avg_Accuracy-right": 0.888352680318247, + "eval_avg_Accuracy-wrong": 0.49749829429156245, + "eval_avg_Num questions with both labels": 523, + "eval_avg_Question-wise AUC": 0.7117713077832741, + "eval_last_AUC": 0.8350567927270806, + "eval_last_Accuracy": 0.7807526525198939, + "eval_last_Accuracy-right": 0.8413982000782575, + "eval_last_Accuracy-wrong": 0.6750056856947919, + "eval_last_Num questions with both labels": 523, + "eval_last_Question-wise AUC": 0.7121076559312289, + "eval_max_AUC": 0.7854278482735377, + "eval_max_Accuracy": 0.6475878647214854, + "eval_max_Accuracy-right": 0.9867614451545585, + "eval_max_Accuracy-wrong": 0.056174664544007276, + "eval_max_Num questions with both labels": 523, + "eval_max_Question-wise AUC": 0.6558197091469862, + "eval_min_AUC": 0.843139889983326, + "eval_min_Accuracy": 0.7734996684350133, + "eval_min_Accuracy-right": 0.7857049693491587, + "eval_min_Accuracy-wrong": 0.7522174209688424, + "eval_min_Num questions with both labels": 523, + "eval_min_Question-wise AUC": 0.715089706106881, + "eval_prod_AUC": 0.8407684428232383, + "eval_prod_Accuracy": 0.7391826923076923, + "eval_prod_Accuracy-right": 0.6629059606104083, + "eval_prod_Accuracy-wrong": 0.8721855810780077, + "eval_prod_Num questions with both labels": 523, + "eval_prod_Question-wise AUC": 0.7128298178825854, + "eval_runtime": 247.0161, + "eval_samples_per_second": 97.678, + "eval_steps_per_second": 3.052, + "eval_sum_AUC": 0.7186416616330681, + "eval_sum_Accuracy": 0.6397546419098143, + "eval_sum_Accuracy-right": 0.9967392722055562, + "eval_sum_Accuracy-wrong": 0.017284512167386856, + "eval_sum_Num questions with both labels": 523, + "eval_sum_Question-wise AUC": 0.6893739044164864, + "step": 4500 + }, + { + "epoch": 3.21, + "grad_norm": 1.4722866410217152, + "learning_rate": 9.822682336141558e-07, + "loss": 0.0178, + "step": 4501 + }, + { + "epoch": 3.21, + "grad_norm": 2.181579380218437, + "learning_rate": 9.805485659271064e-07, + "loss": 0.0168, + "step": 4502 + }, + { + "epoch": 3.21, + "grad_norm": 7.752924249109985, + "learning_rate": 9.788302412134931e-07, + "loss": 0.0352, + "step": 4503 + }, + { + "epoch": 3.21, + "grad_norm": 1.9281551566524828, + "learning_rate": 9.77113260047436e-07, + "loss": 0.0192, + "step": 4504 + }, + { + "epoch": 3.22, + "grad_norm": 1.9477930911053865, + "learning_rate": 9.753976230026158e-07, + "loss": 0.0234, + "step": 4505 + }, + { + "epoch": 3.22, + "grad_norm": 6.050182109498725, + "learning_rate": 9.736833306522537e-07, + "loss": 0.0257, + "step": 4506 + }, + { + "epoch": 3.22, + "grad_norm": 5.560668219829083, + "learning_rate": 9.719703835691314e-07, + "loss": 0.0216, + "step": 4507 + }, + { + "epoch": 3.22, + "grad_norm": 10.427582712165258, + "learning_rate": 9.702587823255715e-07, + "loss": 0.0413, + "step": 4508 + }, + { + "epoch": 3.22, + "grad_norm": 2.6181714476401536, + "learning_rate": 9.685485274934576e-07, + "loss": 0.0179, + "step": 4509 + }, + { + "epoch": 3.22, + "grad_norm": 9.156873614600924, + "learning_rate": 9.66839619644211e-07, + "loss": 0.0251, + "step": 4510 + }, + { + "epoch": 3.22, + "grad_norm": 3.4673271118101643, + "learning_rate": 9.651320593488162e-07, + "loss": 0.0191, + "step": 4511 + }, + { + "epoch": 3.22, + "grad_norm": 1.6137388839865046, + "learning_rate": 9.634258471777958e-07, + "loss": 0.0184, + "step": 4512 + }, + { + "epoch": 3.22, + "grad_norm": 3.6836288992334048, + "learning_rate": 9.617209837012287e-07, + "loss": 0.0264, + "step": 4513 + }, + { + "epoch": 3.22, + "grad_norm": 5.5935012176996635, + "learning_rate": 9.600174694887421e-07, + "loss": 0.0247, + "step": 4514 + }, + { + "epoch": 3.22, + "grad_norm": 7.298134909789238, + "learning_rate": 9.583153051095107e-07, + "loss": 0.0247, + "step": 4515 + }, + { + "epoch": 3.22, + "grad_norm": 5.308498768012288, + "learning_rate": 9.5661449113226e-07, + "loss": 0.0287, + "step": 4516 + }, + { + "epoch": 3.22, + "grad_norm": 1.8179009486270912, + "learning_rate": 9.549150281252633e-07, + "loss": 0.0206, + "step": 4517 + }, + { + "epoch": 3.22, + "grad_norm": 3.3811622578866727, + "learning_rate": 9.532169166563426e-07, + "loss": 0.0284, + "step": 4518 + }, + { + "epoch": 3.23, + "grad_norm": 3.8222062654617406, + "learning_rate": 9.515201572928689e-07, + "loss": 0.028, + "step": 4519 + }, + { + "epoch": 3.23, + "grad_norm": 3.0854257738751163, + "learning_rate": 9.49824750601761e-07, + "loss": 0.0316, + "step": 4520 + }, + { + "epoch": 3.23, + "grad_norm": 2.5397879239379098, + "learning_rate": 9.481306971494858e-07, + "loss": 0.0265, + "step": 4521 + }, + { + "epoch": 3.23, + "grad_norm": 3.2717124935629953, + "learning_rate": 9.464379975020576e-07, + "loss": 0.0278, + "step": 4522 + }, + { + "epoch": 3.23, + "grad_norm": 9.377789209888471, + "learning_rate": 9.447466522250393e-07, + "loss": 0.0318, + "step": 4523 + }, + { + "epoch": 3.23, + "grad_norm": 6.441122861364104, + "learning_rate": 9.430566618835407e-07, + "loss": 0.0314, + "step": 4524 + }, + { + "epoch": 3.23, + "grad_norm": 2.3160747535915265, + "learning_rate": 9.413680270422187e-07, + "loss": 0.0289, + "step": 4525 + }, + { + "epoch": 3.23, + "grad_norm": 1.6669925746710255, + "learning_rate": 9.396807482652775e-07, + "loss": 0.0241, + "step": 4526 + }, + { + "epoch": 3.23, + "grad_norm": 2.8847819825771777, + "learning_rate": 9.3799482611647e-07, + "loss": 0.0266, + "step": 4527 + }, + { + "epoch": 3.23, + "grad_norm": 4.401247157463592, + "learning_rate": 9.363102611590918e-07, + "loss": 0.0206, + "step": 4528 + }, + { + "epoch": 3.23, + "grad_norm": 4.810249161783182, + "learning_rate": 9.346270539559882e-07, + "loss": 0.0213, + "step": 4529 + }, + { + "epoch": 3.23, + "grad_norm": 7.9036297509881095, + "learning_rate": 9.329452050695497e-07, + "loss": 0.0221, + "step": 4530 + }, + { + "epoch": 3.23, + "grad_norm": 3.570781343969614, + "learning_rate": 9.312647150617144e-07, + "loss": 0.0207, + "step": 4531 + }, + { + "epoch": 3.23, + "grad_norm": 1.6397653229972227, + "learning_rate": 9.295855844939639e-07, + "loss": 0.012, + "step": 4532 + }, + { + "epoch": 3.24, + "grad_norm": 2.258414374329704, + "learning_rate": 9.279078139273279e-07, + "loss": 0.0181, + "step": 4533 + }, + { + "epoch": 3.24, + "grad_norm": 8.950313191107599, + "learning_rate": 9.262314039223802e-07, + "loss": 0.0312, + "step": 4534 + }, + { + "epoch": 3.24, + "grad_norm": 2.1815871351729617, + "learning_rate": 9.245563550392406e-07, + "loss": 0.0223, + "step": 4535 + }, + { + "epoch": 3.24, + "grad_norm": 4.859815723424763, + "learning_rate": 9.22882667837574e-07, + "loss": 0.0266, + "step": 4536 + }, + { + "epoch": 3.24, + "grad_norm": 2.355286650208844, + "learning_rate": 9.212103428765912e-07, + "loss": 0.0203, + "step": 4537 + }, + { + "epoch": 3.24, + "grad_norm": 2.0779801100488684, + "learning_rate": 9.19539380715046e-07, + "loss": 0.0234, + "step": 4538 + }, + { + "epoch": 3.24, + "grad_norm": 9.338632225920227, + "learning_rate": 9.178697819112381e-07, + "loss": 0.0253, + "step": 4539 + }, + { + "epoch": 3.24, + "grad_norm": 2.2899098228257464, + "learning_rate": 9.162015470230123e-07, + "loss": 0.0175, + "step": 4540 + }, + { + "epoch": 3.24, + "grad_norm": 1.5376658110359513, + "learning_rate": 9.145346766077562e-07, + "loss": 0.025, + "step": 4541 + }, + { + "epoch": 3.24, + "grad_norm": 1.9588815283252403, + "learning_rate": 9.128691712224025e-07, + "loss": 0.0208, + "step": 4542 + }, + { + "epoch": 3.24, + "grad_norm": 1.793777776979931, + "learning_rate": 9.112050314234272e-07, + "loss": 0.0204, + "step": 4543 + }, + { + "epoch": 3.24, + "grad_norm": 3.048832843843268, + "learning_rate": 9.0954225776685e-07, + "loss": 0.0146, + "step": 4544 + }, + { + "epoch": 3.24, + "grad_norm": 2.3438063416343393, + "learning_rate": 9.078808508082354e-07, + "loss": 0.0206, + "step": 4545 + }, + { + "epoch": 3.24, + "grad_norm": 1.9916762447835414, + "learning_rate": 9.06220811102691e-07, + "loss": 0.0257, + "step": 4546 + }, + { + "epoch": 3.25, + "grad_norm": 5.85438556591717, + "learning_rate": 9.045621392048637e-07, + "loss": 0.0235, + "step": 4547 + }, + { + "epoch": 3.25, + "grad_norm": 2.46668861130742, + "learning_rate": 9.029048356689507e-07, + "loss": 0.0255, + "step": 4548 + }, + { + "epoch": 3.25, + "grad_norm": 4.512065904072692, + "learning_rate": 9.012489010486835e-07, + "loss": 0.0242, + "step": 4549 + }, + { + "epoch": 3.25, + "grad_norm": 3.86233637448217, + "learning_rate": 8.995943358973463e-07, + "loss": 0.0244, + "step": 4550 + }, + { + "epoch": 3.25, + "grad_norm": 3.2961520762406575, + "learning_rate": 8.979411407677535e-07, + "loss": 0.0241, + "step": 4551 + }, + { + "epoch": 3.25, + "grad_norm": 4.216390230976483, + "learning_rate": 8.962893162122749e-07, + "loss": 0.0201, + "step": 4552 + }, + { + "epoch": 3.25, + "grad_norm": 8.348781851060899, + "learning_rate": 8.946388627828106e-07, + "loss": 0.0311, + "step": 4553 + }, + { + "epoch": 3.25, + "grad_norm": 6.084513848361836, + "learning_rate": 8.929897810308102e-07, + "loss": 0.025, + "step": 4554 + }, + { + "epoch": 3.25, + "grad_norm": 8.2559434065546, + "learning_rate": 8.913420715072619e-07, + "loss": 0.0396, + "step": 4555 + }, + { + "epoch": 3.25, + "grad_norm": 4.631280709242924, + "learning_rate": 8.896957347626966e-07, + "loss": 0.0321, + "step": 4556 + }, + { + "epoch": 3.25, + "grad_norm": 3.139609791677315, + "learning_rate": 8.880507713471853e-07, + "loss": 0.0157, + "step": 4557 + }, + { + "epoch": 3.25, + "grad_norm": 3.308998251970711, + "learning_rate": 8.864071818103415e-07, + "loss": 0.0179, + "step": 4558 + }, + { + "epoch": 3.25, + "grad_norm": 2.4597231253581624, + "learning_rate": 8.847649667013187e-07, + "loss": 0.0248, + "step": 4559 + }, + { + "epoch": 3.25, + "grad_norm": 4.1940520085889625, + "learning_rate": 8.831241265688112e-07, + "loss": 0.0284, + "step": 4560 + }, + { + "epoch": 3.26, + "grad_norm": 9.58472105522457, + "learning_rate": 8.814846619610545e-07, + "loss": 0.0199, + "step": 4561 + }, + { + "epoch": 3.26, + "grad_norm": 4.313842958685549, + "learning_rate": 8.79846573425826e-07, + "loss": 0.0267, + "step": 4562 + }, + { + "epoch": 3.26, + "grad_norm": 3.509297605663404, + "learning_rate": 8.782098615104373e-07, + "loss": 0.0182, + "step": 4563 + }, + { + "epoch": 3.26, + "grad_norm": 1.6077932936017227, + "learning_rate": 8.765745267617487e-07, + "loss": 0.0247, + "step": 4564 + }, + { + "epoch": 3.26, + "grad_norm": 2.9038110707158453, + "learning_rate": 8.749405697261515e-07, + "loss": 0.0228, + "step": 4565 + }, + { + "epoch": 3.26, + "grad_norm": 3.162924718811875, + "learning_rate": 8.733079909495868e-07, + "loss": 0.0229, + "step": 4566 + }, + { + "epoch": 3.26, + "grad_norm": 2.0279848737464015, + "learning_rate": 8.716767909775231e-07, + "loss": 0.028, + "step": 4567 + }, + { + "epoch": 3.26, + "grad_norm": 6.216485879376879, + "learning_rate": 8.700469703549802e-07, + "loss": 0.0333, + "step": 4568 + }, + { + "epoch": 3.26, + "grad_norm": 7.520962408278871, + "learning_rate": 8.684185296265074e-07, + "loss": 0.0197, + "step": 4569 + }, + { + "epoch": 3.26, + "grad_norm": 8.629919683440207, + "learning_rate": 8.667914693362006e-07, + "loss": 0.0187, + "step": 4570 + }, + { + "epoch": 3.26, + "grad_norm": 7.682867297333057, + "learning_rate": 8.651657900276878e-07, + "loss": 0.0235, + "step": 4571 + }, + { + "epoch": 3.26, + "grad_norm": 6.700794151870362, + "learning_rate": 8.635414922441398e-07, + "loss": 0.0226, + "step": 4572 + }, + { + "epoch": 3.26, + "grad_norm": 1.3609985270641234, + "learning_rate": 8.61918576528265e-07, + "loss": 0.0191, + "step": 4573 + }, + { + "epoch": 3.26, + "grad_norm": 8.333651139592504, + "learning_rate": 8.60297043422309e-07, + "loss": 0.0236, + "step": 4574 + }, + { + "epoch": 3.27, + "grad_norm": 5.2970319558403975, + "learning_rate": 8.586768934680572e-07, + "loss": 0.0227, + "step": 4575 + }, + { + "epoch": 3.27, + "grad_norm": 8.021689608056526, + "learning_rate": 8.570581272068307e-07, + "loss": 0.0183, + "step": 4576 + }, + { + "epoch": 3.27, + "grad_norm": 2.30192703714936, + "learning_rate": 8.554407451794905e-07, + "loss": 0.0168, + "step": 4577 + }, + { + "epoch": 3.27, + "grad_norm": 10.68080512987844, + "learning_rate": 8.538247479264327e-07, + "loss": 0.0336, + "step": 4578 + }, + { + "epoch": 3.27, + "grad_norm": 4.080574857143387, + "learning_rate": 8.522101359875934e-07, + "loss": 0.0234, + "step": 4579 + }, + { + "epoch": 3.27, + "grad_norm": 6.2548733609393015, + "learning_rate": 8.505969099024436e-07, + "loss": 0.0318, + "step": 4580 + }, + { + "epoch": 3.27, + "grad_norm": 2.452170699337819, + "learning_rate": 8.489850702099922e-07, + "loss": 0.0318, + "step": 4581 + }, + { + "epoch": 3.27, + "grad_norm": 5.3176103182624805, + "learning_rate": 8.473746174487846e-07, + "loss": 0.0326, + "step": 4582 + }, + { + "epoch": 3.27, + "grad_norm": 4.033919981368345, + "learning_rate": 8.457655521569036e-07, + "loss": 0.0287, + "step": 4583 + }, + { + "epoch": 3.27, + "grad_norm": 8.624185466236282, + "learning_rate": 8.441578748719676e-07, + "loss": 0.0274, + "step": 4584 + }, + { + "epoch": 3.27, + "grad_norm": 2.78523165440918, + "learning_rate": 8.425515861311312e-07, + "loss": 0.0197, + "step": 4585 + }, + { + "epoch": 3.27, + "grad_norm": 6.283027986139463, + "learning_rate": 8.409466864710858e-07, + "loss": 0.0248, + "step": 4586 + }, + { + "epoch": 3.27, + "grad_norm": 5.122071684243946, + "learning_rate": 8.393431764280591e-07, + "loss": 0.0175, + "step": 4587 + }, + { + "epoch": 3.27, + "grad_norm": 7.677661344943328, + "learning_rate": 8.377410565378097e-07, + "loss": 0.026, + "step": 4588 + }, + { + "epoch": 3.28, + "grad_norm": 6.845604358260919, + "learning_rate": 8.361403273356411e-07, + "loss": 0.0262, + "step": 4589 + }, + { + "epoch": 3.28, + "grad_norm": 1.9332179724262128, + "learning_rate": 8.345409893563816e-07, + "loss": 0.0208, + "step": 4590 + }, + { + "epoch": 3.28, + "grad_norm": 4.828115067765296, + "learning_rate": 8.329430431344043e-07, + "loss": 0.0283, + "step": 4591 + }, + { + "epoch": 3.28, + "grad_norm": 4.041522711176144, + "learning_rate": 8.313464892036083e-07, + "loss": 0.0182, + "step": 4592 + }, + { + "epoch": 3.28, + "grad_norm": 3.5931461402812284, + "learning_rate": 8.297513280974362e-07, + "loss": 0.0205, + "step": 4593 + }, + { + "epoch": 3.28, + "grad_norm": 7.322773741524985, + "learning_rate": 8.281575603488573e-07, + "loss": 0.0245, + "step": 4594 + }, + { + "epoch": 3.28, + "grad_norm": 2.2037482548752654, + "learning_rate": 8.265651864903823e-07, + "loss": 0.0228, + "step": 4595 + }, + { + "epoch": 3.28, + "grad_norm": 1.8543264539363935, + "learning_rate": 8.249742070540506e-07, + "loss": 0.0205, + "step": 4596 + }, + { + "epoch": 3.28, + "grad_norm": 4.841219706984167, + "learning_rate": 8.233846225714386e-07, + "loss": 0.0315, + "step": 4597 + }, + { + "epoch": 3.28, + "grad_norm": 1.9559510186530535, + "learning_rate": 8.217964335736556e-07, + "loss": 0.0196, + "step": 4598 + }, + { + "epoch": 3.28, + "grad_norm": 2.369932136393158, + "learning_rate": 8.202096405913462e-07, + "loss": 0.0211, + "step": 4599 + }, + { + "epoch": 3.28, + "grad_norm": 3.149653056045046, + "learning_rate": 8.186242441546866e-07, + "loss": 0.0208, + "step": 4600 + }, + { + "epoch": 3.28, + "grad_norm": 5.1208147167576294, + "learning_rate": 8.170402447933873e-07, + "loss": 0.0377, + "step": 4601 + }, + { + "epoch": 3.28, + "grad_norm": 2.50638995160429, + "learning_rate": 8.154576430366922e-07, + "loss": 0.0223, + "step": 4602 + }, + { + "epoch": 3.29, + "grad_norm": 5.384939207639812, + "learning_rate": 8.13876439413378e-07, + "loss": 0.028, + "step": 4603 + }, + { + "epoch": 3.29, + "grad_norm": 1.762765510713304, + "learning_rate": 8.122966344517536e-07, + "loss": 0.0239, + "step": 4604 + }, + { + "epoch": 3.29, + "grad_norm": 3.0740858313842443, + "learning_rate": 8.107182286796633e-07, + "loss": 0.0244, + "step": 4605 + }, + { + "epoch": 3.29, + "grad_norm": 3.9397427075031013, + "learning_rate": 8.091412226244771e-07, + "loss": 0.0172, + "step": 4606 + }, + { + "epoch": 3.29, + "grad_norm": 3.5915667776547204, + "learning_rate": 8.07565616813108e-07, + "loss": 0.0344, + "step": 4607 + }, + { + "epoch": 3.29, + "grad_norm": 3.9272905067394714, + "learning_rate": 8.059914117719897e-07, + "loss": 0.0253, + "step": 4608 + }, + { + "epoch": 3.29, + "grad_norm": 3.2510066913863462, + "learning_rate": 8.044186080270983e-07, + "loss": 0.0248, + "step": 4609 + }, + { + "epoch": 3.29, + "grad_norm": 1.2617444867153567, + "learning_rate": 8.028472061039322e-07, + "loss": 0.0192, + "step": 4610 + }, + { + "epoch": 3.29, + "grad_norm": 1.939700065819408, + "learning_rate": 8.012772065275304e-07, + "loss": 0.021, + "step": 4611 + }, + { + "epoch": 3.29, + "grad_norm": 2.1320831911446376, + "learning_rate": 7.997086098224555e-07, + "loss": 0.0199, + "step": 4612 + }, + { + "epoch": 3.29, + "grad_norm": 1.884317978445364, + "learning_rate": 7.981414165128065e-07, + "loss": 0.0151, + "step": 4613 + }, + { + "epoch": 3.29, + "grad_norm": 4.5621264328140905, + "learning_rate": 7.965756271222108e-07, + "loss": 0.0208, + "step": 4614 + }, + { + "epoch": 3.29, + "grad_norm": 1.440610499433646, + "learning_rate": 7.950112421738282e-07, + "loss": 0.0178, + "step": 4615 + }, + { + "epoch": 3.29, + "grad_norm": 2.258538996875376, + "learning_rate": 7.934482621903494e-07, + "loss": 0.0262, + "step": 4616 + }, + { + "epoch": 3.3, + "grad_norm": 3.0165939984057832, + "learning_rate": 7.91886687693994e-07, + "loss": 0.0165, + "step": 4617 + }, + { + "epoch": 3.3, + "grad_norm": 5.695857517771355, + "learning_rate": 7.903265192065141e-07, + "loss": 0.0197, + "step": 4618 + }, + { + "epoch": 3.3, + "grad_norm": 1.7055585895666883, + "learning_rate": 7.887677572491903e-07, + "loss": 0.0187, + "step": 4619 + }, + { + "epoch": 3.3, + "grad_norm": 7.5887897339828685, + "learning_rate": 7.872104023428339e-07, + "loss": 0.0238, + "step": 4620 + }, + { + "epoch": 3.3, + "grad_norm": 3.374951107007746, + "learning_rate": 7.856544550077883e-07, + "loss": 0.0257, + "step": 4621 + }, + { + "epoch": 3.3, + "grad_norm": 1.855484086420173, + "learning_rate": 7.840999157639195e-07, + "loss": 0.019, + "step": 4622 + }, + { + "epoch": 3.3, + "grad_norm": 2.792593588176287, + "learning_rate": 7.825467851306335e-07, + "loss": 0.0207, + "step": 4623 + }, + { + "epoch": 3.3, + "grad_norm": 9.742513544372374, + "learning_rate": 7.809950636268554e-07, + "loss": 0.0224, + "step": 4624 + }, + { + "epoch": 3.3, + "grad_norm": 5.9915718445374795, + "learning_rate": 7.794447517710485e-07, + "loss": 0.0223, + "step": 4625 + }, + { + "epoch": 3.3, + "grad_norm": 2.4603505001707378, + "learning_rate": 7.778958500811961e-07, + "loss": 0.0317, + "step": 4626 + }, + { + "epoch": 3.3, + "grad_norm": 2.349151356305383, + "learning_rate": 7.7634835907482e-07, + "loss": 0.0232, + "step": 4627 + }, + { + "epoch": 3.3, + "grad_norm": 5.788557646206046, + "learning_rate": 7.748022792689613e-07, + "loss": 0.0233, + "step": 4628 + }, + { + "epoch": 3.3, + "grad_norm": 4.051836245160571, + "learning_rate": 7.732576111801982e-07, + "loss": 0.0125, + "step": 4629 + }, + { + "epoch": 3.3, + "grad_norm": 2.8753818346406246, + "learning_rate": 7.717143553246298e-07, + "loss": 0.0306, + "step": 4630 + }, + { + "epoch": 3.31, + "grad_norm": 2.577591283510448, + "learning_rate": 7.701725122178871e-07, + "loss": 0.0186, + "step": 4631 + }, + { + "epoch": 3.31, + "grad_norm": 11.542060297374746, + "learning_rate": 7.686320823751298e-07, + "loss": 0.0478, + "step": 4632 + }, + { + "epoch": 3.31, + "grad_norm": 2.371685944489164, + "learning_rate": 7.670930663110426e-07, + "loss": 0.0251, + "step": 4633 + }, + { + "epoch": 3.31, + "grad_norm": 1.7468794803617154, + "learning_rate": 7.655554645398405e-07, + "loss": 0.0228, + "step": 4634 + }, + { + "epoch": 3.31, + "grad_norm": 1.245714311313327, + "learning_rate": 7.640192775752647e-07, + "loss": 0.0189, + "step": 4635 + }, + { + "epoch": 3.31, + "grad_norm": 4.9245303461145715, + "learning_rate": 7.624845059305836e-07, + "loss": 0.028, + "step": 4636 + }, + { + "epoch": 3.31, + "grad_norm": 3.7927382042335367, + "learning_rate": 7.609511501185929e-07, + "loss": 0.0305, + "step": 4637 + }, + { + "epoch": 3.31, + "grad_norm": 1.8226386480619419, + "learning_rate": 7.594192106516151e-07, + "loss": 0.024, + "step": 4638 + }, + { + "epoch": 3.31, + "grad_norm": 4.11296383175397, + "learning_rate": 7.578886880414999e-07, + "loss": 0.0342, + "step": 4639 + }, + { + "epoch": 3.31, + "grad_norm": 2.9392679931067014, + "learning_rate": 7.563595827996235e-07, + "loss": 0.0247, + "step": 4640 + }, + { + "epoch": 3.31, + "grad_norm": 3.0949822502136284, + "learning_rate": 7.548318954368883e-07, + "loss": 0.0252, + "step": 4641 + }, + { + "epoch": 3.31, + "grad_norm": 2.384999555074518, + "learning_rate": 7.533056264637228e-07, + "loss": 0.0207, + "step": 4642 + }, + { + "epoch": 3.31, + "grad_norm": 1.6244482154616275, + "learning_rate": 7.51780776390082e-07, + "loss": 0.0199, + "step": 4643 + }, + { + "epoch": 3.31, + "grad_norm": 5.849177252436853, + "learning_rate": 7.50257345725447e-07, + "loss": 0.0227, + "step": 4644 + }, + { + "epoch": 3.32, + "grad_norm": 7.411666454715126, + "learning_rate": 7.487353349788234e-07, + "loss": 0.0223, + "step": 4645 + }, + { + "epoch": 3.32, + "grad_norm": 2.944448161435966, + "learning_rate": 7.472147446587452e-07, + "loss": 0.0213, + "step": 4646 + }, + { + "epoch": 3.32, + "grad_norm": 4.2919752995505585, + "learning_rate": 7.456955752732659e-07, + "loss": 0.0292, + "step": 4647 + }, + { + "epoch": 3.32, + "grad_norm": 1.7148082801364484, + "learning_rate": 7.441778273299738e-07, + "loss": 0.0146, + "step": 4648 + }, + { + "epoch": 3.32, + "grad_norm": 5.5113167274528125, + "learning_rate": 7.426615013359706e-07, + "loss": 0.0242, + "step": 4649 + }, + { + "epoch": 3.32, + "grad_norm": 3.4930471463189185, + "learning_rate": 7.411465977978949e-07, + "loss": 0.0214, + "step": 4650 + }, + { + "epoch": 3.32, + "grad_norm": 2.7583759156057948, + "learning_rate": 7.396331172218996e-07, + "loss": 0.0263, + "step": 4651 + }, + { + "epoch": 3.32, + "grad_norm": 3.0835376005766184, + "learning_rate": 7.381210601136702e-07, + "loss": 0.0226, + "step": 4652 + }, + { + "epoch": 3.32, + "grad_norm": 7.939488044101162, + "learning_rate": 7.366104269784086e-07, + "loss": 0.0222, + "step": 4653 + }, + { + "epoch": 3.32, + "grad_norm": 7.71559173914037, + "learning_rate": 7.351012183208511e-07, + "loss": 0.0246, + "step": 4654 + }, + { + "epoch": 3.32, + "grad_norm": 3.3505208772325212, + "learning_rate": 7.335934346452484e-07, + "loss": 0.022, + "step": 4655 + }, + { + "epoch": 3.32, + "grad_norm": 2.522427789926864, + "learning_rate": 7.320870764553795e-07, + "loss": 0.0258, + "step": 4656 + }, + { + "epoch": 3.32, + "grad_norm": 4.625965296359435, + "learning_rate": 7.305821442545474e-07, + "loss": 0.0318, + "step": 4657 + }, + { + "epoch": 3.32, + "grad_norm": 1.666715669670613, + "learning_rate": 7.290786385455778e-07, + "loss": 0.0267, + "step": 4658 + }, + { + "epoch": 3.33, + "grad_norm": 7.423398252798393, + "learning_rate": 7.275765598308199e-07, + "loss": 0.0457, + "step": 4659 + }, + { + "epoch": 3.33, + "grad_norm": 4.94580570567993, + "learning_rate": 7.26075908612146e-07, + "loss": 0.0223, + "step": 4660 + }, + { + "epoch": 3.33, + "grad_norm": 2.6200308676820714, + "learning_rate": 7.245766853909519e-07, + "loss": 0.0302, + "step": 4661 + }, + { + "epoch": 3.33, + "grad_norm": 2.565484956644103, + "learning_rate": 7.230788906681558e-07, + "loss": 0.0139, + "step": 4662 + }, + { + "epoch": 3.33, + "grad_norm": 7.519074271081959, + "learning_rate": 7.215825249441982e-07, + "loss": 0.0438, + "step": 4663 + }, + { + "epoch": 3.33, + "grad_norm": 2.9349056048339865, + "learning_rate": 7.200875887190445e-07, + "loss": 0.0319, + "step": 4664 + }, + { + "epoch": 3.33, + "grad_norm": 1.7802836631328636, + "learning_rate": 7.185940824921772e-07, + "loss": 0.0256, + "step": 4665 + }, + { + "epoch": 3.33, + "grad_norm": 2.3165391573905887, + "learning_rate": 7.171020067626089e-07, + "loss": 0.0241, + "step": 4666 + }, + { + "epoch": 3.33, + "grad_norm": 10.891239008200467, + "learning_rate": 7.156113620288646e-07, + "loss": 0.0393, + "step": 4667 + }, + { + "epoch": 3.33, + "grad_norm": 8.083097463271788, + "learning_rate": 7.141221487890027e-07, + "loss": 0.0271, + "step": 4668 + }, + { + "epoch": 3.33, + "grad_norm": 4.11813397569581, + "learning_rate": 7.126343675405905e-07, + "loss": 0.0164, + "step": 4669 + }, + { + "epoch": 3.33, + "grad_norm": 5.230913755558933, + "learning_rate": 7.111480187807296e-07, + "loss": 0.0227, + "step": 4670 + }, + { + "epoch": 3.33, + "grad_norm": 2.4065813670459244, + "learning_rate": 7.096631030060308e-07, + "loss": 0.0265, + "step": 4671 + }, + { + "epoch": 3.33, + "grad_norm": 2.9175150140266624, + "learning_rate": 7.081796207126373e-07, + "loss": 0.0167, + "step": 4672 + }, + { + "epoch": 3.34, + "grad_norm": 5.43837835587297, + "learning_rate": 7.06697572396205e-07, + "loss": 0.0166, + "step": 4673 + }, + { + "epoch": 3.34, + "grad_norm": 5.613308008082632, + "learning_rate": 7.052169585519142e-07, + "loss": 0.0162, + "step": 4674 + }, + { + "epoch": 3.34, + "grad_norm": 4.7970690996692165, + "learning_rate": 7.037377796744666e-07, + "loss": 0.0214, + "step": 4675 + }, + { + "epoch": 3.34, + "grad_norm": 6.004092278249741, + "learning_rate": 7.022600362580817e-07, + "loss": 0.0234, + "step": 4676 + }, + { + "epoch": 3.34, + "grad_norm": 4.179910056219882, + "learning_rate": 7.007837287965024e-07, + "loss": 0.014, + "step": 4677 + }, + { + "epoch": 3.34, + "grad_norm": 5.786550535023316, + "learning_rate": 6.993088577829904e-07, + "loss": 0.0287, + "step": 4678 + }, + { + "epoch": 3.34, + "grad_norm": 8.49339045448722, + "learning_rate": 6.978354237103264e-07, + "loss": 0.0291, + "step": 4679 + }, + { + "epoch": 3.34, + "grad_norm": 1.9362629401618683, + "learning_rate": 6.963634270708137e-07, + "loss": 0.0211, + "step": 4680 + }, + { + "epoch": 3.34, + "grad_norm": 3.2084607601706954, + "learning_rate": 6.948928683562722e-07, + "loss": 0.0145, + "step": 4681 + }, + { + "epoch": 3.34, + "grad_norm": 6.388831706059872, + "learning_rate": 6.934237480580435e-07, + "loss": 0.0261, + "step": 4682 + }, + { + "epoch": 3.34, + "grad_norm": 8.92008557471272, + "learning_rate": 6.919560666669889e-07, + "loss": 0.025, + "step": 4683 + }, + { + "epoch": 3.34, + "grad_norm": 3.5596242184271523, + "learning_rate": 6.904898246734864e-07, + "loss": 0.024, + "step": 4684 + }, + { + "epoch": 3.34, + "grad_norm": 5.813474706944849, + "learning_rate": 6.890250225674361e-07, + "loss": 0.0238, + "step": 4685 + }, + { + "epoch": 3.34, + "grad_norm": 6.456049406535201, + "learning_rate": 6.875616608382562e-07, + "loss": 0.0235, + "step": 4686 + }, + { + "epoch": 3.35, + "grad_norm": 4.27019288004158, + "learning_rate": 6.860997399748792e-07, + "loss": 0.0241, + "step": 4687 + }, + { + "epoch": 3.35, + "grad_norm": 1.4468000545374635, + "learning_rate": 6.846392604657653e-07, + "loss": 0.0173, + "step": 4688 + }, + { + "epoch": 3.35, + "grad_norm": 2.0063895434320953, + "learning_rate": 6.831802227988843e-07, + "loss": 0.0128, + "step": 4689 + }, + { + "epoch": 3.35, + "grad_norm": 4.125314951911715, + "learning_rate": 6.817226274617283e-07, + "loss": 0.0184, + "step": 4690 + }, + { + "epoch": 3.35, + "grad_norm": 4.390828758784832, + "learning_rate": 6.802664749413079e-07, + "loss": 0.0222, + "step": 4691 + }, + { + "epoch": 3.35, + "grad_norm": 2.617942356146691, + "learning_rate": 6.788117657241506e-07, + "loss": 0.017, + "step": 4692 + }, + { + "epoch": 3.35, + "grad_norm": 5.244438256633183, + "learning_rate": 6.773585002963007e-07, + "loss": 0.0287, + "step": 4693 + }, + { + "epoch": 3.35, + "grad_norm": 5.848459710599114, + "learning_rate": 6.759066791433228e-07, + "loss": 0.025, + "step": 4694 + }, + { + "epoch": 3.35, + "grad_norm": 4.5353239904491645, + "learning_rate": 6.744563027502959e-07, + "loss": 0.0229, + "step": 4695 + }, + { + "epoch": 3.35, + "grad_norm": 6.973704437407207, + "learning_rate": 6.730073716018187e-07, + "loss": 0.0313, + "step": 4696 + }, + { + "epoch": 3.35, + "grad_norm": 3.2864116478612178, + "learning_rate": 6.715598861820055e-07, + "loss": 0.0179, + "step": 4697 + }, + { + "epoch": 3.35, + "grad_norm": 2.120343882460884, + "learning_rate": 6.701138469744883e-07, + "loss": 0.0195, + "step": 4698 + }, + { + "epoch": 3.35, + "grad_norm": 2.2826363640967333, + "learning_rate": 6.686692544624157e-07, + "loss": 0.0249, + "step": 4699 + }, + { + "epoch": 3.35, + "grad_norm": 2.2453281994267424, + "learning_rate": 6.672261091284526e-07, + "loss": 0.0298, + "step": 4700 + }, + { + "epoch": 3.36, + "grad_norm": 4.527463652856993, + "learning_rate": 6.657844114547812e-07, + "loss": 0.0332, + "step": 4701 + }, + { + "epoch": 3.36, + "grad_norm": 12.362619407093279, + "learning_rate": 6.643441619230989e-07, + "loss": 0.0338, + "step": 4702 + }, + { + "epoch": 3.36, + "grad_norm": 4.943448636464124, + "learning_rate": 6.629053610146202e-07, + "loss": 0.0238, + "step": 4703 + }, + { + "epoch": 3.36, + "grad_norm": 3.9420677318165227, + "learning_rate": 6.61468009210075e-07, + "loss": 0.0211, + "step": 4704 + }, + { + "epoch": 3.36, + "grad_norm": 2.069578664244523, + "learning_rate": 6.600321069897097e-07, + "loss": 0.0155, + "step": 4705 + }, + { + "epoch": 3.36, + "grad_norm": 6.509957366481949, + "learning_rate": 6.585976548332856e-07, + "loss": 0.0441, + "step": 4706 + }, + { + "epoch": 3.36, + "grad_norm": 3.550669987246946, + "learning_rate": 6.571646532200815e-07, + "loss": 0.0255, + "step": 4707 + }, + { + "epoch": 3.36, + "grad_norm": 2.07784447599088, + "learning_rate": 6.557331026288855e-07, + "loss": 0.0137, + "step": 4708 + }, + { + "epoch": 3.36, + "grad_norm": 2.33838698403416, + "learning_rate": 6.543030035380099e-07, + "loss": 0.017, + "step": 4709 + }, + { + "epoch": 3.36, + "grad_norm": 4.926223440067987, + "learning_rate": 6.528743564252737e-07, + "loss": 0.023, + "step": 4710 + }, + { + "epoch": 3.36, + "grad_norm": 3.0636490626676856, + "learning_rate": 6.514471617680184e-07, + "loss": 0.0229, + "step": 4711 + }, + { + "epoch": 3.36, + "grad_norm": 2.4741516925135745, + "learning_rate": 6.500214200430921e-07, + "loss": 0.017, + "step": 4712 + }, + { + "epoch": 3.36, + "grad_norm": 2.4767551496918587, + "learning_rate": 6.485971317268658e-07, + "loss": 0.0192, + "step": 4713 + }, + { + "epoch": 3.36, + "grad_norm": 1.7457587448524006, + "learning_rate": 6.471742972952172e-07, + "loss": 0.0236, + "step": 4714 + }, + { + "epoch": 3.37, + "grad_norm": 3.6635714256934637, + "learning_rate": 6.457529172235427e-07, + "loss": 0.0201, + "step": 4715 + }, + { + "epoch": 3.37, + "grad_norm": 3.108350647844174, + "learning_rate": 6.44332991986753e-07, + "loss": 0.0288, + "step": 4716 + }, + { + "epoch": 3.37, + "grad_norm": 1.5565324965470344, + "learning_rate": 6.429145220592703e-07, + "loss": 0.0173, + "step": 4717 + }, + { + "epoch": 3.37, + "grad_norm": 5.2053739429829005, + "learning_rate": 6.414975079150321e-07, + "loss": 0.0235, + "step": 4718 + }, + { + "epoch": 3.37, + "grad_norm": 6.228664595722526, + "learning_rate": 6.400819500274891e-07, + "loss": 0.0341, + "step": 4719 + }, + { + "epoch": 3.37, + "grad_norm": 6.550911531984451, + "learning_rate": 6.386678488696057e-07, + "loss": 0.0226, + "step": 4720 + }, + { + "epoch": 3.37, + "grad_norm": 1.3109577383123228, + "learning_rate": 6.372552049138591e-07, + "loss": 0.0179, + "step": 4721 + }, + { + "epoch": 3.37, + "grad_norm": 4.114856675378657, + "learning_rate": 6.358440186322401e-07, + "loss": 0.0256, + "step": 4722 + }, + { + "epoch": 3.37, + "grad_norm": 2.4993011122011786, + "learning_rate": 6.344342904962536e-07, + "loss": 0.0167, + "step": 4723 + }, + { + "epoch": 3.37, + "grad_norm": 2.1082760871594544, + "learning_rate": 6.330260209769124e-07, + "loss": 0.0158, + "step": 4724 + }, + { + "epoch": 3.37, + "grad_norm": 3.5954228865011024, + "learning_rate": 6.316192105447499e-07, + "loss": 0.0181, + "step": 4725 + }, + { + "epoch": 3.37, + "grad_norm": 3.250561893307712, + "learning_rate": 6.302138596698032e-07, + "loss": 0.0179, + "step": 4726 + }, + { + "epoch": 3.37, + "grad_norm": 5.034063607262308, + "learning_rate": 6.288099688216309e-07, + "loss": 0.0226, + "step": 4727 + }, + { + "epoch": 3.37, + "grad_norm": 3.382398513985597, + "learning_rate": 6.27407538469294e-07, + "loss": 0.0223, + "step": 4728 + }, + { + "epoch": 3.38, + "grad_norm": 3.5480387834752256, + "learning_rate": 6.260065690813754e-07, + "loss": 0.0244, + "step": 4729 + }, + { + "epoch": 3.38, + "grad_norm": 3.894566614171884, + "learning_rate": 6.246070611259603e-07, + "loss": 0.038, + "step": 4730 + }, + { + "epoch": 3.38, + "grad_norm": 4.908257487051644, + "learning_rate": 6.232090150706555e-07, + "loss": 0.0319, + "step": 4731 + }, + { + "epoch": 3.38, + "grad_norm": 2.2356350192362955, + "learning_rate": 6.218124313825696e-07, + "loss": 0.0118, + "step": 4732 + }, + { + "epoch": 3.38, + "grad_norm": 2.7833053267514285, + "learning_rate": 6.204173105283295e-07, + "loss": 0.0305, + "step": 4733 + }, + { + "epoch": 3.38, + "grad_norm": 3.0358977610764937, + "learning_rate": 6.190236529740701e-07, + "loss": 0.0213, + "step": 4734 + }, + { + "epoch": 3.38, + "grad_norm": 2.7799160284276017, + "learning_rate": 6.176314591854388e-07, + "loss": 0.024, + "step": 4735 + }, + { + "epoch": 3.38, + "grad_norm": 2.411428363895045, + "learning_rate": 6.162407296275936e-07, + "loss": 0.024, + "step": 4736 + }, + { + "epoch": 3.38, + "grad_norm": 4.987033320862642, + "learning_rate": 6.148514647652026e-07, + "loss": 0.0213, + "step": 4737 + }, + { + "epoch": 3.38, + "grad_norm": 7.294162310830876, + "learning_rate": 6.134636650624448e-07, + "loss": 0.0306, + "step": 4738 + }, + { + "epoch": 3.38, + "grad_norm": 1.9285035219081978, + "learning_rate": 6.120773309830108e-07, + "loss": 0.0151, + "step": 4739 + }, + { + "epoch": 3.38, + "grad_norm": 2.9779680103281083, + "learning_rate": 6.106924629900996e-07, + "loss": 0.023, + "step": 4740 + }, + { + "epoch": 3.38, + "grad_norm": 2.802445642894432, + "learning_rate": 6.09309061546422e-07, + "loss": 0.0204, + "step": 4741 + }, + { + "epoch": 3.38, + "grad_norm": 3.5712560891174503, + "learning_rate": 6.079271271141979e-07, + "loss": 0.0258, + "step": 4742 + }, + { + "epoch": 3.39, + "grad_norm": 2.7284199278884085, + "learning_rate": 6.065466601551578e-07, + "loss": 0.0249, + "step": 4743 + }, + { + "epoch": 3.39, + "grad_norm": 2.1930907915124753, + "learning_rate": 6.051676611305401e-07, + "loss": 0.0327, + "step": 4744 + }, + { + "epoch": 3.39, + "grad_norm": 3.0801008957709364, + "learning_rate": 6.037901305010951e-07, + "loss": 0.0156, + "step": 4745 + }, + { + "epoch": 3.39, + "grad_norm": 2.2322852894555716, + "learning_rate": 6.024140687270813e-07, + "loss": 0.0134, + "step": 4746 + }, + { + "epoch": 3.39, + "grad_norm": 3.7566488570881544, + "learning_rate": 6.010394762682659e-07, + "loss": 0.018, + "step": 4747 + }, + { + "epoch": 3.39, + "grad_norm": 2.266574212516343, + "learning_rate": 5.996663535839275e-07, + "loss": 0.0201, + "step": 4748 + }, + { + "epoch": 3.39, + "grad_norm": 2.644461947400835, + "learning_rate": 5.982947011328489e-07, + "loss": 0.0243, + "step": 4749 + }, + { + "epoch": 3.39, + "grad_norm": 2.973314350873358, + "learning_rate": 5.969245193733275e-07, + "loss": 0.0241, + "step": 4750 + }, + { + "epoch": 3.39, + "grad_norm": 4.151307716784977, + "learning_rate": 5.955558087631641e-07, + "loss": 0.021, + "step": 4751 + }, + { + "epoch": 3.39, + "grad_norm": 4.048781625741955, + "learning_rate": 5.941885697596734e-07, + "loss": 0.0216, + "step": 4752 + }, + { + "epoch": 3.39, + "grad_norm": 3.429703276468199, + "learning_rate": 5.928228028196714e-07, + "loss": 0.0236, + "step": 4753 + }, + { + "epoch": 3.39, + "grad_norm": 2.739052446340753, + "learning_rate": 5.914585083994906e-07, + "loss": 0.017, + "step": 4754 + }, + { + "epoch": 3.39, + "grad_norm": 3.04638508596632, + "learning_rate": 5.900956869549629e-07, + "loss": 0.0281, + "step": 4755 + }, + { + "epoch": 3.39, + "grad_norm": 4.775164840894267, + "learning_rate": 5.887343389414363e-07, + "loss": 0.0363, + "step": 4756 + }, + { + "epoch": 3.4, + "grad_norm": 1.9766077837059972, + "learning_rate": 5.873744648137592e-07, + "loss": 0.032, + "step": 4757 + }, + { + "epoch": 3.4, + "grad_norm": 1.932804696724512, + "learning_rate": 5.860160650262925e-07, + "loss": 0.0171, + "step": 4758 + }, + { + "epoch": 3.4, + "grad_norm": 2.4446932774769956, + "learning_rate": 5.846591400329021e-07, + "loss": 0.0123, + "step": 4759 + }, + { + "epoch": 3.4, + "grad_norm": 2.8494754763154027, + "learning_rate": 5.833036902869626e-07, + "loss": 0.0229, + "step": 4760 + }, + { + "epoch": 3.4, + "grad_norm": 3.070788144846567, + "learning_rate": 5.81949716241354e-07, + "loss": 0.0193, + "step": 4761 + }, + { + "epoch": 3.4, + "grad_norm": 3.3717997476125032, + "learning_rate": 5.805972183484654e-07, + "loss": 0.0272, + "step": 4762 + }, + { + "epoch": 3.4, + "grad_norm": 1.891084092238694, + "learning_rate": 5.792461970601903e-07, + "loss": 0.02, + "step": 4763 + }, + { + "epoch": 3.4, + "grad_norm": 1.6405267772372447, + "learning_rate": 5.778966528279306e-07, + "loss": 0.016, + "step": 4764 + }, + { + "epoch": 3.4, + "grad_norm": 3.494793807973005, + "learning_rate": 5.765485861025944e-07, + "loss": 0.0191, + "step": 4765 + }, + { + "epoch": 3.4, + "grad_norm": 2.096825436091188, + "learning_rate": 5.752019973345963e-07, + "loss": 0.0208, + "step": 4766 + }, + { + "epoch": 3.4, + "grad_norm": 1.678751370669293, + "learning_rate": 5.738568869738537e-07, + "loss": 0.0174, + "step": 4767 + }, + { + "epoch": 3.4, + "grad_norm": 4.166473159072722, + "learning_rate": 5.725132554697971e-07, + "loss": 0.024, + "step": 4768 + }, + { + "epoch": 3.4, + "grad_norm": 4.3633134523358335, + "learning_rate": 5.711711032713547e-07, + "loss": 0.0182, + "step": 4769 + }, + { + "epoch": 3.4, + "grad_norm": 2.4438998290312313, + "learning_rate": 5.698304308269686e-07, + "loss": 0.0225, + "step": 4770 + }, + { + "epoch": 3.41, + "grad_norm": 1.6826909960550536, + "learning_rate": 5.684912385845786e-07, + "loss": 0.0165, + "step": 4771 + }, + { + "epoch": 3.41, + "grad_norm": 4.223897905047371, + "learning_rate": 5.671535269916373e-07, + "loss": 0.0189, + "step": 4772 + }, + { + "epoch": 3.41, + "grad_norm": 4.451136477185219, + "learning_rate": 5.658172964950953e-07, + "loss": 0.0172, + "step": 4773 + }, + { + "epoch": 3.41, + "grad_norm": 2.8782735665276964, + "learning_rate": 5.644825475414162e-07, + "loss": 0.0149, + "step": 4774 + }, + { + "epoch": 3.41, + "grad_norm": 5.622361288092509, + "learning_rate": 5.631492805765609e-07, + "loss": 0.0232, + "step": 4775 + }, + { + "epoch": 3.41, + "grad_norm": 2.0095858180774, + "learning_rate": 5.618174960459999e-07, + "loss": 0.0343, + "step": 4776 + }, + { + "epoch": 3.41, + "grad_norm": 4.312258137394328, + "learning_rate": 5.604871943947071e-07, + "loss": 0.0214, + "step": 4777 + }, + { + "epoch": 3.41, + "grad_norm": 12.714949832938647, + "learning_rate": 5.591583760671609e-07, + "loss": 0.0416, + "step": 4778 + }, + { + "epoch": 3.41, + "grad_norm": 6.098583691916675, + "learning_rate": 5.578310415073451e-07, + "loss": 0.0201, + "step": 4779 + }, + { + "epoch": 3.41, + "grad_norm": 3.0148079241593546, + "learning_rate": 5.565051911587455e-07, + "loss": 0.0257, + "step": 4780 + }, + { + "epoch": 3.41, + "grad_norm": 2.1858188497490474, + "learning_rate": 5.551808254643543e-07, + "loss": 0.0237, + "step": 4781 + }, + { + "epoch": 3.41, + "grad_norm": 7.095925230827684, + "learning_rate": 5.538579448666675e-07, + "loss": 0.0175, + "step": 4782 + }, + { + "epoch": 3.41, + "grad_norm": 2.3749779288397534, + "learning_rate": 5.525365498076807e-07, + "loss": 0.0241, + "step": 4783 + }, + { + "epoch": 3.41, + "grad_norm": 2.1169352892214444, + "learning_rate": 5.51216640728901e-07, + "loss": 0.0263, + "step": 4784 + }, + { + "epoch": 3.42, + "grad_norm": 1.5113690124988577, + "learning_rate": 5.498982180713308e-07, + "loss": 0.0244, + "step": 4785 + }, + { + "epoch": 3.42, + "grad_norm": 5.743204991318629, + "learning_rate": 5.485812822754826e-07, + "loss": 0.0365, + "step": 4786 + }, + { + "epoch": 3.42, + "grad_norm": 5.539838806774285, + "learning_rate": 5.472658337813664e-07, + "loss": 0.0237, + "step": 4787 + }, + { + "epoch": 3.42, + "grad_norm": 2.172371017801228, + "learning_rate": 5.459518730285007e-07, + "loss": 0.0176, + "step": 4788 + }, + { + "epoch": 3.42, + "grad_norm": 7.723251710056527, + "learning_rate": 5.446394004559008e-07, + "loss": 0.0264, + "step": 4789 + }, + { + "epoch": 3.42, + "grad_norm": 3.6273060070583125, + "learning_rate": 5.43328416502093e-07, + "loss": 0.0194, + "step": 4790 + }, + { + "epoch": 3.42, + "grad_norm": 5.07385094981908, + "learning_rate": 5.420189216050969e-07, + "loss": 0.0348, + "step": 4791 + }, + { + "epoch": 3.42, + "grad_norm": 2.3283643337319133, + "learning_rate": 5.407109162024409e-07, + "loss": 0.0178, + "step": 4792 + }, + { + "epoch": 3.42, + "grad_norm": 1.9835055011656801, + "learning_rate": 5.394044007311544e-07, + "loss": 0.0307, + "step": 4793 + }, + { + "epoch": 3.42, + "grad_norm": 8.575120390262455, + "learning_rate": 5.380993756277675e-07, + "loss": 0.0278, + "step": 4794 + }, + { + "epoch": 3.42, + "grad_norm": 2.5516786025822182, + "learning_rate": 5.367958413283141e-07, + "loss": 0.0219, + "step": 4795 + }, + { + "epoch": 3.42, + "grad_norm": 9.296882286540368, + "learning_rate": 5.354937982683283e-07, + "loss": 0.0288, + "step": 4796 + }, + { + "epoch": 3.42, + "grad_norm": 3.5951696364880306, + "learning_rate": 5.341932468828481e-07, + "loss": 0.0207, + "step": 4797 + }, + { + "epoch": 3.42, + "grad_norm": 3.7362598014081576, + "learning_rate": 5.328941876064114e-07, + "loss": 0.0259, + "step": 4798 + }, + { + "epoch": 3.43, + "grad_norm": 2.4075304327331613, + "learning_rate": 5.315966208730578e-07, + "loss": 0.0219, + "step": 4799 + }, + { + "epoch": 3.43, + "grad_norm": 2.245070060364752, + "learning_rate": 5.30300547116328e-07, + "loss": 0.024, + "step": 4800 + }, + { + "epoch": 3.43, + "grad_norm": 4.6692741589089675, + "learning_rate": 5.290059667692655e-07, + "loss": 0.0308, + "step": 4801 + }, + { + "epoch": 3.43, + "grad_norm": 3.000431411370373, + "learning_rate": 5.277128802644133e-07, + "loss": 0.0195, + "step": 4802 + }, + { + "epoch": 3.43, + "grad_norm": 8.965965626231066, + "learning_rate": 5.264212880338154e-07, + "loss": 0.0414, + "step": 4803 + }, + { + "epoch": 3.43, + "grad_norm": 5.876899527347979, + "learning_rate": 5.251311905090167e-07, + "loss": 0.0178, + "step": 4804 + }, + { + "epoch": 3.43, + "grad_norm": 4.667587955450332, + "learning_rate": 5.238425881210624e-07, + "loss": 0.0205, + "step": 4805 + }, + { + "epoch": 3.43, + "grad_norm": 20.266107099324124, + "learning_rate": 5.225554813004996e-07, + "loss": 0.0611, + "step": 4806 + }, + { + "epoch": 3.43, + "grad_norm": 7.450685723335502, + "learning_rate": 5.21269870477375e-07, + "loss": 0.0396, + "step": 4807 + }, + { + "epoch": 3.43, + "grad_norm": 2.8697958072570726, + "learning_rate": 5.199857560812316e-07, + "loss": 0.0218, + "step": 4808 + }, + { + "epoch": 3.43, + "grad_norm": 5.772178641403427, + "learning_rate": 5.187031385411206e-07, + "loss": 0.0254, + "step": 4809 + }, + { + "epoch": 3.43, + "grad_norm": 6.398504975526057, + "learning_rate": 5.174220182855844e-07, + "loss": 0.0217, + "step": 4810 + }, + { + "epoch": 3.43, + "grad_norm": 7.595861533557351, + "learning_rate": 5.161423957426725e-07, + "loss": 0.0276, + "step": 4811 + }, + { + "epoch": 3.43, + "grad_norm": 10.706839343586863, + "learning_rate": 5.148642713399272e-07, + "loss": 0.0207, + "step": 4812 + }, + { + "epoch": 3.44, + "grad_norm": 9.248483629993162, + "learning_rate": 5.13587645504397e-07, + "loss": 0.0163, + "step": 4813 + }, + { + "epoch": 3.44, + "grad_norm": 4.5853381448697315, + "learning_rate": 5.123125186626227e-07, + "loss": 0.032, + "step": 4814 + }, + { + "epoch": 3.44, + "grad_norm": 2.6956221870281567, + "learning_rate": 5.110388912406517e-07, + "loss": 0.0199, + "step": 4815 + }, + { + "epoch": 3.44, + "grad_norm": 6.193292360315304, + "learning_rate": 5.097667636640241e-07, + "loss": 0.0251, + "step": 4816 + }, + { + "epoch": 3.44, + "grad_norm": 4.970186592154489, + "learning_rate": 5.084961363577817e-07, + "loss": 0.0285, + "step": 4817 + }, + { + "epoch": 3.44, + "grad_norm": 2.2733355420940127, + "learning_rate": 5.072270097464649e-07, + "loss": 0.0255, + "step": 4818 + }, + { + "epoch": 3.44, + "grad_norm": 1.282955595029532, + "learning_rate": 5.059593842541127e-07, + "loss": 0.0167, + "step": 4819 + }, + { + "epoch": 3.44, + "grad_norm": 2.9173904214804534, + "learning_rate": 5.04693260304262e-07, + "loss": 0.0229, + "step": 4820 + }, + { + "epoch": 3.44, + "grad_norm": 8.307122745953784, + "learning_rate": 5.034286383199488e-07, + "loss": 0.0229, + "step": 4821 + }, + { + "epoch": 3.44, + "grad_norm": 10.234033109502867, + "learning_rate": 5.021655187237067e-07, + "loss": 0.0214, + "step": 4822 + }, + { + "epoch": 3.44, + "grad_norm": 5.625721576167736, + "learning_rate": 5.009039019375672e-07, + "loss": 0.0244, + "step": 4823 + }, + { + "epoch": 3.44, + "grad_norm": 6.9193873894305, + "learning_rate": 4.996437883830596e-07, + "loss": 0.0136, + "step": 4824 + }, + { + "epoch": 3.44, + "grad_norm": 1.9958571703345365, + "learning_rate": 4.983851784812127e-07, + "loss": 0.0194, + "step": 4825 + }, + { + "epoch": 3.44, + "grad_norm": 3.457883216533768, + "learning_rate": 4.97128072652549e-07, + "loss": 0.0174, + "step": 4826 + }, + { + "epoch": 3.45, + "grad_norm": 3.264883698161827, + "learning_rate": 4.958724713170943e-07, + "loss": 0.0245, + "step": 4827 + }, + { + "epoch": 3.45, + "grad_norm": 1.6502594031454794, + "learning_rate": 4.946183748943639e-07, + "loss": 0.0254, + "step": 4828 + }, + { + "epoch": 3.45, + "grad_norm": 1.992537904277229, + "learning_rate": 4.933657838033795e-07, + "loss": 0.0189, + "step": 4829 + }, + { + "epoch": 3.45, + "grad_norm": 2.8779533325492452, + "learning_rate": 4.921146984626507e-07, + "loss": 0.0241, + "step": 4830 + }, + { + "epoch": 3.45, + "grad_norm": 3.3861004105405037, + "learning_rate": 4.908651192901926e-07, + "loss": 0.022, + "step": 4831 + }, + { + "epoch": 3.45, + "grad_norm": 1.7445944302458545, + "learning_rate": 4.896170467035089e-07, + "loss": 0.0226, + "step": 4832 + }, + { + "epoch": 3.45, + "grad_norm": 3.5655653643743537, + "learning_rate": 4.883704811196072e-07, + "loss": 0.0161, + "step": 4833 + }, + { + "epoch": 3.45, + "grad_norm": 8.66061468381288, + "learning_rate": 4.871254229549855e-07, + "loss": 0.0523, + "step": 4834 + }, + { + "epoch": 3.45, + "grad_norm": 2.657255822757317, + "learning_rate": 4.858818726256425e-07, + "loss": 0.028, + "step": 4835 + }, + { + "epoch": 3.45, + "grad_norm": 5.472995401509639, + "learning_rate": 4.846398305470712e-07, + "loss": 0.0291, + "step": 4836 + }, + { + "epoch": 3.45, + "grad_norm": 5.524011687090876, + "learning_rate": 4.833992971342604e-07, + "loss": 0.0233, + "step": 4837 + }, + { + "epoch": 3.45, + "grad_norm": 8.501747778235401, + "learning_rate": 4.821602728016955e-07, + "loss": 0.0182, + "step": 4838 + }, + { + "epoch": 3.45, + "grad_norm": 4.991085189378154, + "learning_rate": 4.809227579633585e-07, + "loss": 0.0291, + "step": 4839 + }, + { + "epoch": 3.45, + "grad_norm": 5.373771634991806, + "learning_rate": 4.796867530327249e-07, + "loss": 0.0316, + "step": 4840 + }, + { + "epoch": 3.46, + "grad_norm": 2.6554569867124536, + "learning_rate": 4.784522584227675e-07, + "loss": 0.0186, + "step": 4841 + }, + { + "epoch": 3.46, + "grad_norm": 5.663613995175922, + "learning_rate": 4.772192745459536e-07, + "loss": 0.0249, + "step": 4842 + }, + { + "epoch": 3.46, + "grad_norm": 2.1898976437413404, + "learning_rate": 4.7598780181424666e-07, + "loss": 0.0228, + "step": 4843 + }, + { + "epoch": 3.46, + "grad_norm": 4.016259077763735, + "learning_rate": 4.7475784063910404e-07, + "loss": 0.0166, + "step": 4844 + }, + { + "epoch": 3.46, + "grad_norm": 1.9221511010470675, + "learning_rate": 4.7352939143147927e-07, + "loss": 0.0231, + "step": 4845 + }, + { + "epoch": 3.46, + "grad_norm": 3.563250640079929, + "learning_rate": 4.72302454601819e-07, + "loss": 0.0238, + "step": 4846 + }, + { + "epoch": 3.46, + "grad_norm": 7.844911546765337, + "learning_rate": 4.7107703056006706e-07, + "loss": 0.0254, + "step": 4847 + }, + { + "epoch": 3.46, + "grad_norm": 5.403934146700979, + "learning_rate": 4.6985311971565806e-07, + "loss": 0.0185, + "step": 4848 + }, + { + "epoch": 3.46, + "grad_norm": 5.167903062966448, + "learning_rate": 4.6863072247752664e-07, + "loss": 0.016, + "step": 4849 + }, + { + "epoch": 3.46, + "grad_norm": 2.1577262913144835, + "learning_rate": 4.67409839254096e-07, + "loss": 0.0212, + "step": 4850 + }, + { + "epoch": 3.46, + "grad_norm": 1.3631800246611254, + "learning_rate": 4.66190470453286e-07, + "loss": 0.0197, + "step": 4851 + }, + { + "epoch": 3.46, + "grad_norm": 3.344043147865704, + "learning_rate": 4.6497261648251134e-07, + "loss": 0.02, + "step": 4852 + }, + { + "epoch": 3.46, + "grad_norm": 3.0874123639808926, + "learning_rate": 4.6375627774867925e-07, + "loss": 0.0212, + "step": 4853 + }, + { + "epoch": 3.46, + "grad_norm": 7.328165344305696, + "learning_rate": 4.6254145465819134e-07, + "loss": 0.0213, + "step": 4854 + }, + { + "epoch": 3.47, + "grad_norm": 3.224518234244968, + "learning_rate": 4.6132814761694234e-07, + "loss": 0.016, + "step": 4855 + }, + { + "epoch": 3.47, + "grad_norm": 4.097008960876435, + "learning_rate": 4.6011635703032075e-07, + "loss": 0.0252, + "step": 4856 + }, + { + "epoch": 3.47, + "grad_norm": 2.807322653482841, + "learning_rate": 4.589060833032083e-07, + "loss": 0.0268, + "step": 4857 + }, + { + "epoch": 3.47, + "grad_norm": 2.3328664724594423, + "learning_rate": 4.5769732683997983e-07, + "loss": 0.0319, + "step": 4858 + }, + { + "epoch": 3.47, + "grad_norm": 1.7226625611795265, + "learning_rate": 4.564900880445039e-07, + "loss": 0.021, + "step": 4859 + }, + { + "epoch": 3.47, + "grad_norm": 4.033978626071074, + "learning_rate": 4.552843673201407e-07, + "loss": 0.0255, + "step": 4860 + }, + { + "epoch": 3.47, + "grad_norm": 2.467384163882045, + "learning_rate": 4.540801650697446e-07, + "loss": 0.0245, + "step": 4861 + }, + { + "epoch": 3.47, + "grad_norm": 2.9446504009525656, + "learning_rate": 4.528774816956616e-07, + "loss": 0.0178, + "step": 4862 + }, + { + "epoch": 3.47, + "grad_norm": 2.235471595072864, + "learning_rate": 4.516763175997302e-07, + "loss": 0.0237, + "step": 4863 + }, + { + "epoch": 3.47, + "grad_norm": 3.5832326050990857, + "learning_rate": 4.5047667318328215e-07, + "loss": 0.0277, + "step": 4864 + }, + { + "epoch": 3.47, + "grad_norm": 7.581065600283904, + "learning_rate": 4.492785488471413e-07, + "loss": 0.0321, + "step": 4865 + }, + { + "epoch": 3.47, + "grad_norm": 2.6477077947678427, + "learning_rate": 4.480819449916224e-07, + "loss": 0.0253, + "step": 4866 + }, + { + "epoch": 3.47, + "grad_norm": 3.2013043262055714, + "learning_rate": 4.468868620165334e-07, + "loss": 0.0266, + "step": 4867 + }, + { + "epoch": 3.47, + "grad_norm": 7.4880088058210115, + "learning_rate": 4.4569330032117496e-07, + "loss": 0.033, + "step": 4868 + }, + { + "epoch": 3.48, + "grad_norm": 4.464721969976531, + "learning_rate": 4.445012603043347e-07, + "loss": 0.0327, + "step": 4869 + }, + { + "epoch": 3.48, + "grad_norm": 3.1361654089894224, + "learning_rate": 4.4331074236430014e-07, + "loss": 0.0221, + "step": 4870 + }, + { + "epoch": 3.48, + "grad_norm": 2.418337459504903, + "learning_rate": 4.421217468988409e-07, + "loss": 0.0188, + "step": 4871 + }, + { + "epoch": 3.48, + "grad_norm": 2.973188602787579, + "learning_rate": 4.409342743052264e-07, + "loss": 0.0176, + "step": 4872 + }, + { + "epoch": 3.48, + "grad_norm": 4.845221165052784, + "learning_rate": 4.3974832498020983e-07, + "loss": 0.0243, + "step": 4873 + }, + { + "epoch": 3.48, + "grad_norm": 2.687507603812843, + "learning_rate": 4.385638993200425e-07, + "loss": 0.0227, + "step": 4874 + }, + { + "epoch": 3.48, + "grad_norm": 5.577237802892938, + "learning_rate": 4.3738099772045963e-07, + "loss": 0.0234, + "step": 4875 + }, + { + "epoch": 3.48, + "grad_norm": 1.9561421206253133, + "learning_rate": 4.3619962057669216e-07, + "loss": 0.025, + "step": 4876 + }, + { + "epoch": 3.48, + "grad_norm": 4.175169583976657, + "learning_rate": 4.350197682834606e-07, + "loss": 0.0178, + "step": 4877 + }, + { + "epoch": 3.48, + "grad_norm": 5.29437999575849, + "learning_rate": 4.338414412349745e-07, + "loss": 0.0204, + "step": 4878 + }, + { + "epoch": 3.48, + "grad_norm": 2.0279366468727096, + "learning_rate": 4.3266463982493566e-07, + "loss": 0.0251, + "step": 4879 + }, + { + "epoch": 3.48, + "grad_norm": 3.208336780233766, + "learning_rate": 4.314893644465351e-07, + "loss": 0.0241, + "step": 4880 + }, + { + "epoch": 3.48, + "grad_norm": 3.0834492531396256, + "learning_rate": 4.303156154924537e-07, + "loss": 0.0139, + "step": 4881 + }, + { + "epoch": 3.48, + "grad_norm": 3.2142530911093683, + "learning_rate": 4.291433933548633e-07, + "loss": 0.0255, + "step": 4882 + }, + { + "epoch": 3.49, + "grad_norm": 2.640834983845967, + "learning_rate": 4.279726984254251e-07, + "loss": 0.0262, + "step": 4883 + }, + { + "epoch": 3.49, + "grad_norm": 3.651611286643107, + "learning_rate": 4.268035310952906e-07, + "loss": 0.0197, + "step": 4884 + }, + { + "epoch": 3.49, + "grad_norm": 2.2783581009506872, + "learning_rate": 4.256358917550979e-07, + "loss": 0.0192, + "step": 4885 + }, + { + "epoch": 3.49, + "grad_norm": 5.428102406876919, + "learning_rate": 4.244697807949805e-07, + "loss": 0.0196, + "step": 4886 + }, + { + "epoch": 3.49, + "grad_norm": 4.979308584900138, + "learning_rate": 4.2330519860455446e-07, + "loss": 0.0229, + "step": 4887 + }, + { + "epoch": 3.49, + "grad_norm": 4.1409739409071635, + "learning_rate": 4.2214214557293133e-07, + "loss": 0.0306, + "step": 4888 + }, + { + "epoch": 3.49, + "grad_norm": 4.79268503474881, + "learning_rate": 4.209806220887053e-07, + "loss": 0.03, + "step": 4889 + }, + { + "epoch": 3.49, + "grad_norm": 2.352736577614645, + "learning_rate": 4.1982062853996695e-07, + "loss": 0.0215, + "step": 4890 + }, + { + "epoch": 3.49, + "grad_norm": 3.0169446795812007, + "learning_rate": 4.1866216531428806e-07, + "loss": 0.0196, + "step": 4891 + }, + { + "epoch": 3.49, + "grad_norm": 4.629541748577467, + "learning_rate": 4.1750523279873613e-07, + "loss": 0.0195, + "step": 4892 + }, + { + "epoch": 3.49, + "grad_norm": 1.271554392634293, + "learning_rate": 4.1634983137986083e-07, + "loss": 0.0134, + "step": 4893 + }, + { + "epoch": 3.49, + "grad_norm": 2.0119672947069476, + "learning_rate": 4.151959614437046e-07, + "loss": 0.0247, + "step": 4894 + }, + { + "epoch": 3.49, + "grad_norm": 5.712184762810063, + "learning_rate": 4.1404362337579716e-07, + "loss": 0.029, + "step": 4895 + }, + { + "epoch": 3.49, + "grad_norm": 5.945272020870616, + "learning_rate": 4.128928175611546e-07, + "loss": 0.029, + "step": 4896 + }, + { + "epoch": 3.5, + "grad_norm": 4.408542849416339, + "learning_rate": 4.1174354438428434e-07, + "loss": 0.0188, + "step": 4897 + }, + { + "epoch": 3.5, + "grad_norm": 7.5322367428462025, + "learning_rate": 4.105958042291791e-07, + "loss": 0.0363, + "step": 4898 + }, + { + "epoch": 3.5, + "grad_norm": 1.6314161372537972, + "learning_rate": 4.0944959747931945e-07, + "loss": 0.0179, + "step": 4899 + }, + { + "epoch": 3.5, + "grad_norm": 3.334468123285942, + "learning_rate": 4.0830492451767566e-07, + "loss": 0.0211, + "step": 4900 + }, + { + "epoch": 3.5, + "grad_norm": 2.5645975778823953, + "learning_rate": 4.0716178572670405e-07, + "loss": 0.0224, + "step": 4901 + }, + { + "epoch": 3.5, + "grad_norm": 4.513557316836511, + "learning_rate": 4.060201814883474e-07, + "loss": 0.0137, + "step": 4902 + }, + { + "epoch": 3.5, + "grad_norm": 3.9386081706521936, + "learning_rate": 4.0488011218403844e-07, + "loss": 0.0279, + "step": 4903 + }, + { + "epoch": 3.5, + "grad_norm": 2.9682175362012773, + "learning_rate": 4.0374157819469406e-07, + "loss": 0.0221, + "step": 4904 + }, + { + "epoch": 3.5, + "grad_norm": 4.816340081088018, + "learning_rate": 4.0260457990072113e-07, + "loss": 0.0267, + "step": 4905 + }, + { + "epoch": 3.5, + "grad_norm": 1.652181073212692, + "learning_rate": 4.014691176820107e-07, + "loss": 0.0142, + "step": 4906 + }, + { + "epoch": 3.5, + "grad_norm": 2.860067524660608, + "learning_rate": 4.003351919179421e-07, + "loss": 0.0188, + "step": 4907 + }, + { + "epoch": 3.5, + "grad_norm": 1.120466782402688, + "learning_rate": 3.9920280298738125e-07, + "loss": 0.0195, + "step": 4908 + }, + { + "epoch": 3.5, + "grad_norm": 3.1345045016083892, + "learning_rate": 3.980719512686809e-07, + "loss": 0.0175, + "step": 4909 + }, + { + "epoch": 3.5, + "grad_norm": 3.7019204611349212, + "learning_rate": 3.969426371396773e-07, + "loss": 0.0179, + "step": 4910 + }, + { + "epoch": 3.51, + "grad_norm": 2.976700351913577, + "learning_rate": 3.9581486097769905e-07, + "loss": 0.0188, + "step": 4911 + }, + { + "epoch": 3.51, + "grad_norm": 2.486198283586228, + "learning_rate": 3.946886231595526e-07, + "loss": 0.0215, + "step": 4912 + }, + { + "epoch": 3.51, + "grad_norm": 2.020627642013814, + "learning_rate": 3.935639240615396e-07, + "loss": 0.0218, + "step": 4913 + }, + { + "epoch": 3.51, + "grad_norm": 4.969087632628006, + "learning_rate": 3.924407640594391e-07, + "loss": 0.0237, + "step": 4914 + }, + { + "epoch": 3.51, + "grad_norm": 2.887660959477525, + "learning_rate": 3.913191435285224e-07, + "loss": 0.0196, + "step": 4915 + }, + { + "epoch": 3.51, + "grad_norm": 1.867078635117958, + "learning_rate": 3.9019906284354145e-07, + "loss": 0.0118, + "step": 4916 + }, + { + "epoch": 3.51, + "grad_norm": 1.9465176519470353, + "learning_rate": 3.8908052237873863e-07, + "loss": 0.0207, + "step": 4917 + }, + { + "epoch": 3.51, + "grad_norm": 9.8498836097476, + "learning_rate": 3.879635225078371e-07, + "loss": 0.0282, + "step": 4918 + }, + { + "epoch": 3.51, + "grad_norm": 2.9397169943458836, + "learning_rate": 3.868480636040484e-07, + "loss": 0.0266, + "step": 4919 + }, + { + "epoch": 3.51, + "grad_norm": 4.960618492091106, + "learning_rate": 3.857341460400665e-07, + "loss": 0.0177, + "step": 4920 + }, + { + "epoch": 3.51, + "grad_norm": 4.712244580795838, + "learning_rate": 3.846217701880739e-07, + "loss": 0.0237, + "step": 4921 + }, + { + "epoch": 3.51, + "grad_norm": 1.6971508399719888, + "learning_rate": 3.835109364197348e-07, + "loss": 0.018, + "step": 4922 + }, + { + "epoch": 3.51, + "grad_norm": 1.7472712527944807, + "learning_rate": 3.8240164510620017e-07, + "loss": 0.0211, + "step": 4923 + }, + { + "epoch": 3.51, + "grad_norm": 3.5480848152651516, + "learning_rate": 3.81293896618104e-07, + "loss": 0.0257, + "step": 4924 + }, + { + "epoch": 3.52, + "grad_norm": 3.256572425168427, + "learning_rate": 3.8018769132556644e-07, + "loss": 0.0149, + "step": 4925 + }, + { + "epoch": 3.52, + "grad_norm": 2.4597451706002413, + "learning_rate": 3.790830295981912e-07, + "loss": 0.0182, + "step": 4926 + }, + { + "epoch": 3.52, + "grad_norm": 3.5212984506844682, + "learning_rate": 3.7797991180506643e-07, + "loss": 0.019, + "step": 4927 + }, + { + "epoch": 3.52, + "grad_norm": 3.4875509288086013, + "learning_rate": 3.768783383147623e-07, + "loss": 0.0197, + "step": 4928 + }, + { + "epoch": 3.52, + "grad_norm": 3.163210180212613, + "learning_rate": 3.757783094953382e-07, + "loss": 0.0292, + "step": 4929 + }, + { + "epoch": 3.52, + "grad_norm": 2.5198870342212296, + "learning_rate": 3.746798257143314e-07, + "loss": 0.029, + "step": 4930 + }, + { + "epoch": 3.52, + "grad_norm": 3.5847902090339847, + "learning_rate": 3.735828873387681e-07, + "loss": 0.0183, + "step": 4931 + }, + { + "epoch": 3.52, + "grad_norm": 3.338194522323422, + "learning_rate": 3.724874947351531e-07, + "loss": 0.0196, + "step": 4932 + }, + { + "epoch": 3.52, + "grad_norm": 1.9680640652690908, + "learning_rate": 3.7139364826948077e-07, + "loss": 0.0177, + "step": 4933 + }, + { + "epoch": 3.52, + "grad_norm": 1.7834906752501898, + "learning_rate": 3.7030134830722207e-07, + "loss": 0.0166, + "step": 4934 + }, + { + "epoch": 3.52, + "grad_norm": 2.341415129554604, + "learning_rate": 3.692105952133379e-07, + "loss": 0.0249, + "step": 4935 + }, + { + "epoch": 3.52, + "grad_norm": 4.6407785758751166, + "learning_rate": 3.681213893522667e-07, + "loss": 0.0352, + "step": 4936 + }, + { + "epoch": 3.52, + "grad_norm": 4.540938320445459, + "learning_rate": 3.670337310879335e-07, + "loss": 0.0237, + "step": 4937 + }, + { + "epoch": 3.52, + "grad_norm": 6.387135784803516, + "learning_rate": 3.6594762078374536e-07, + "loss": 0.0277, + "step": 4938 + }, + { + "epoch": 3.53, + "grad_norm": 3.665487214795183, + "learning_rate": 3.6486305880259085e-07, + "loss": 0.0257, + "step": 4939 + }, + { + "epoch": 3.53, + "grad_norm": 4.212800882402541, + "learning_rate": 3.6378004550684355e-07, + "loss": 0.0213, + "step": 4940 + }, + { + "epoch": 3.53, + "grad_norm": 3.2600196073500043, + "learning_rate": 3.626985812583572e-07, + "loss": 0.0193, + "step": 4941 + }, + { + "epoch": 3.53, + "grad_norm": 2.0264755974293966, + "learning_rate": 3.6161866641847007e-07, + "loss": 0.0271, + "step": 4942 + }, + { + "epoch": 3.53, + "grad_norm": 3.4707348585734024, + "learning_rate": 3.6054030134800243e-07, + "loss": 0.0162, + "step": 4943 + }, + { + "epoch": 3.53, + "grad_norm": 2.0961417452701587, + "learning_rate": 3.594634864072527e-07, + "loss": 0.0232, + "step": 4944 + }, + { + "epoch": 3.53, + "grad_norm": 4.063463726526555, + "learning_rate": 3.583882219560092e-07, + "loss": 0.032, + "step": 4945 + }, + { + "epoch": 3.53, + "grad_norm": 2.098516311167729, + "learning_rate": 3.57314508353534e-07, + "loss": 0.0345, + "step": 4946 + }, + { + "epoch": 3.53, + "grad_norm": 3.9001687600424275, + "learning_rate": 3.5624234595857787e-07, + "loss": 0.0237, + "step": 4947 + }, + { + "epoch": 3.53, + "grad_norm": 1.3522189099210975, + "learning_rate": 3.551717351293676e-07, + "loss": 0.0137, + "step": 4948 + }, + { + "epoch": 3.53, + "grad_norm": 5.448938058753399, + "learning_rate": 3.541026762236166e-07, + "loss": 0.0335, + "step": 4949 + }, + { + "epoch": 3.53, + "grad_norm": 3.39809976716535, + "learning_rate": 3.5303516959851405e-07, + "loss": 0.0267, + "step": 4950 + }, + { + "epoch": 3.53, + "grad_norm": 2.0973599945236923, + "learning_rate": 3.519692156107379e-07, + "loss": 0.02, + "step": 4951 + }, + { + "epoch": 3.53, + "grad_norm": 4.928976500976884, + "learning_rate": 3.509048146164401e-07, + "loss": 0.0232, + "step": 4952 + }, + { + "epoch": 3.54, + "grad_norm": 4.341400781737448, + "learning_rate": 3.4984196697125827e-07, + "loss": 0.021, + "step": 4953 + }, + { + "epoch": 3.54, + "grad_norm": 1.5588549990820906, + "learning_rate": 3.4878067303030836e-07, + "loss": 0.0135, + "step": 4954 + }, + { + "epoch": 3.54, + "grad_norm": 5.866237503465065, + "learning_rate": 3.4772093314818957e-07, + "loss": 0.0292, + "step": 4955 + }, + { + "epoch": 3.54, + "grad_norm": 2.889730819504992, + "learning_rate": 3.4666274767897967e-07, + "loss": 0.0207, + "step": 4956 + }, + { + "epoch": 3.54, + "grad_norm": 6.8785021783137665, + "learning_rate": 3.456061169762392e-07, + "loss": 0.0226, + "step": 4957 + }, + { + "epoch": 3.54, + "grad_norm": 2.4293314184044568, + "learning_rate": 3.44551041393007e-07, + "loss": 0.0219, + "step": 4958 + }, + { + "epoch": 3.54, + "grad_norm": 2.5630792149658586, + "learning_rate": 3.434975212818048e-07, + "loss": 0.0239, + "step": 4959 + }, + { + "epoch": 3.54, + "grad_norm": 1.6349579915872139, + "learning_rate": 3.424455569946317e-07, + "loss": 0.0134, + "step": 4960 + }, + { + "epoch": 3.54, + "grad_norm": 6.31085497252182, + "learning_rate": 3.4139514888296975e-07, + "loss": 0.0312, + "step": 4961 + }, + { + "epoch": 3.54, + "grad_norm": 5.08757040363162, + "learning_rate": 3.403462972977789e-07, + "loss": 0.0244, + "step": 4962 + }, + { + "epoch": 3.54, + "grad_norm": 4.263621591200577, + "learning_rate": 3.392990025895004e-07, + "loss": 0.0243, + "step": 4963 + }, + { + "epoch": 3.54, + "grad_norm": 2.1468229060472144, + "learning_rate": 3.3825326510805556e-07, + "loss": 0.0219, + "step": 4964 + }, + { + "epoch": 3.54, + "grad_norm": 3.37958648447092, + "learning_rate": 3.372090852028437e-07, + "loss": 0.0281, + "step": 4965 + }, + { + "epoch": 3.54, + "grad_norm": 1.7362602943935712, + "learning_rate": 3.361664632227446e-07, + "loss": 0.0188, + "step": 4966 + }, + { + "epoch": 3.55, + "grad_norm": 2.274637522937686, + "learning_rate": 3.3512539951611856e-07, + "loss": 0.0234, + "step": 4967 + }, + { + "epoch": 3.55, + "grad_norm": 5.0068330161997805, + "learning_rate": 3.3408589443080395e-07, + "loss": 0.0206, + "step": 4968 + }, + { + "epoch": 3.55, + "grad_norm": 2.6474551601173517, + "learning_rate": 3.3304794831411804e-07, + "loss": 0.0234, + "step": 4969 + }, + { + "epoch": 3.55, + "grad_norm": 2.496187749477123, + "learning_rate": 3.3201156151285994e-07, + "loss": 0.0258, + "step": 4970 + }, + { + "epoch": 3.55, + "grad_norm": 4.010290134183546, + "learning_rate": 3.309767343733028e-07, + "loss": 0.0176, + "step": 4971 + }, + { + "epoch": 3.55, + "grad_norm": 1.5675425735480626, + "learning_rate": 3.299434672412044e-07, + "loss": 0.0194, + "step": 4972 + }, + { + "epoch": 3.55, + "grad_norm": 2.539100465424783, + "learning_rate": 3.2891176046179583e-07, + "loss": 0.019, + "step": 4973 + }, + { + "epoch": 3.55, + "grad_norm": 2.0917188211506823, + "learning_rate": 3.278816143797919e-07, + "loss": 0.0194, + "step": 4974 + }, + { + "epoch": 3.55, + "grad_norm": 4.350767461798503, + "learning_rate": 3.2685302933938177e-07, + "loss": 0.0386, + "step": 4975 + }, + { + "epoch": 3.55, + "grad_norm": 3.8928169385095948, + "learning_rate": 3.2582600568423715e-07, + "loss": 0.0244, + "step": 4976 + }, + { + "epoch": 3.55, + "grad_norm": 5.787909162323301, + "learning_rate": 3.2480054375750305e-07, + "loss": 0.0193, + "step": 4977 + }, + { + "epoch": 3.55, + "grad_norm": 2.4001819336533567, + "learning_rate": 3.237766439018064e-07, + "loss": 0.02, + "step": 4978 + }, + { + "epoch": 3.55, + "grad_norm": 1.8709562877855663, + "learning_rate": 3.227543064592514e-07, + "loss": 0.0191, + "step": 4979 + }, + { + "epoch": 3.55, + "grad_norm": 1.9065701916885103, + "learning_rate": 3.2173353177142044e-07, + "loss": 0.0203, + "step": 4980 + }, + { + "epoch": 3.56, + "grad_norm": 2.5120744516072513, + "learning_rate": 3.207143201793722e-07, + "loss": 0.0187, + "step": 4981 + }, + { + "epoch": 3.56, + "grad_norm": 1.8222263981036755, + "learning_rate": 3.1969667202364496e-07, + "loss": 0.0223, + "step": 4982 + }, + { + "epoch": 3.56, + "grad_norm": 3.4179200252243804, + "learning_rate": 3.1868058764425337e-07, + "loss": 0.0249, + "step": 4983 + }, + { + "epoch": 3.56, + "grad_norm": 5.7498072907102635, + "learning_rate": 3.1766606738069084e-07, + "loss": 0.0275, + "step": 4984 + }, + { + "epoch": 3.56, + "grad_norm": 5.041392321955332, + "learning_rate": 3.166531115719268e-07, + "loss": 0.021, + "step": 4985 + }, + { + "epoch": 3.56, + "grad_norm": 1.822728818625865, + "learning_rate": 3.1564172055640994e-07, + "loss": 0.0185, + "step": 4986 + }, + { + "epoch": 3.56, + "grad_norm": 2.7917425319945153, + "learning_rate": 3.1463189467206166e-07, + "loss": 0.023, + "step": 4987 + }, + { + "epoch": 3.56, + "grad_norm": 4.366628592986311, + "learning_rate": 3.1362363425628763e-07, + "loss": 0.0171, + "step": 4988 + }, + { + "epoch": 3.56, + "grad_norm": 3.3852483744219595, + "learning_rate": 3.1261693964596275e-07, + "loss": 0.0244, + "step": 4989 + }, + { + "epoch": 3.56, + "grad_norm": 3.7515129198680555, + "learning_rate": 3.116118111774452e-07, + "loss": 0.0148, + "step": 4990 + }, + { + "epoch": 3.56, + "grad_norm": 1.9817429833762845, + "learning_rate": 3.106082491865647e-07, + "loss": 0.0157, + "step": 4991 + }, + { + "epoch": 3.56, + "grad_norm": 2.414884784646765, + "learning_rate": 3.0960625400863253e-07, + "loss": 0.0179, + "step": 4992 + }, + { + "epoch": 3.56, + "grad_norm": 5.250163439928367, + "learning_rate": 3.0860582597843137e-07, + "loss": 0.0252, + "step": 4993 + }, + { + "epoch": 3.56, + "grad_norm": 4.698750314158885, + "learning_rate": 3.0760696543022496e-07, + "loss": 0.0187, + "step": 4994 + }, + { + "epoch": 3.57, + "grad_norm": 12.932454054533897, + "learning_rate": 3.066096726977502e-07, + "loss": 0.0597, + "step": 4995 + }, + { + "epoch": 3.57, + "grad_norm": 6.094441081177688, + "learning_rate": 3.056139481142206e-07, + "loss": 0.0201, + "step": 4996 + }, + { + "epoch": 3.57, + "grad_norm": 7.143468273686904, + "learning_rate": 3.0461979201232674e-07, + "loss": 0.0393, + "step": 4997 + }, + { + "epoch": 3.57, + "grad_norm": 1.6228527593437256, + "learning_rate": 3.0362720472423503e-07, + "loss": 0.018, + "step": 4998 + }, + { + "epoch": 3.57, + "grad_norm": 8.117921391666709, + "learning_rate": 3.026361865815869e-07, + "loss": 0.036, + "step": 4999 + }, + { + "epoch": 3.57, + "grad_norm": 4.951489289600751, + "learning_rate": 3.016467379154997e-07, + "loss": 0.0274, + "step": 5000 + }, + { + "epoch": 3.57, + "eval_avg_AUC": 0.8360170574106709, + "eval_avg_Accuracy": 0.743824602122016, + "eval_avg_Accuracy-right": 0.8943524194600235, + "eval_avg_Accuracy-wrong": 0.4813509210825563, + "eval_avg_Num questions with both labels": 523, + "eval_avg_Question-wise AUC": 0.7115880873199548, + "eval_last_AUC": 0.8253922573221322, + "eval_last_Accuracy": 0.7799651856763926, + "eval_last_Accuracy-right": 0.8460284335463676, + "eval_last_Accuracy-wrong": 0.6647714350693654, + "eval_last_Num questions with both labels": 523, + "eval_last_Question-wise AUC": 0.706694551574502, + "eval_max_AUC": 0.7861393054105478, + "eval_max_Accuracy": 0.647256299734748, + "eval_max_Accuracy-right": 0.988065736272336, + "eval_max_Accuracy-wrong": 0.05299067546054128, + "eval_max_Num questions with both labels": 523, + "eval_max_Question-wise AUC": 0.6590420344459055, + "eval_min_AUC": 0.8438178165751404, + "eval_min_Accuracy": 0.7740384615384616, + "eval_min_Accuracy-right": 0.7930089996087126, + "eval_min_Accuracy-wrong": 0.7409597452808733, + "eval_min_Num questions with both labels": 523, + "eval_min_Question-wise AUC": 0.7097302187281208, + "eval_prod_AUC": 0.8398013852657344, + "eval_prod_Accuracy": 0.7416279840848806, + "eval_prod_Accuracy-right": 0.6696882744228512, + "eval_prod_Accuracy-wrong": 0.8670684557652946, + "eval_prod_Num questions with both labels": 523, + "eval_prod_Question-wise AUC": 0.7074705074291826, + "eval_runtime": 246.8796, + "eval_samples_per_second": 97.732, + "eval_steps_per_second": 3.054, + "eval_sum_AUC": 0.7124423224936764, + "eval_sum_Accuracy": 0.6394645225464191, + "eval_sum_Accuracy-right": 0.9969349158732229, + "eval_sum_Accuracy-wrong": 0.016147373209006142, + "eval_sum_Num questions with both labels": 523, + "eval_sum_Question-wise AUC": 0.6847078223460834, + "step": 5000 + }, + { + "epoch": 3.57, + "grad_norm": 3.654880951851883, + "learning_rate": 3.0065885905656733e-07, + "loss": 0.0203, + "step": 5001 + }, + { + "epoch": 3.57, + "grad_norm": 5.243962114831206, + "learning_rate": 2.99672550334858e-07, + "loss": 0.0173, + "step": 5002 + }, + { + "epoch": 3.57, + "grad_norm": 1.4092832840135632, + "learning_rate": 2.986878120799158e-07, + "loss": 0.0208, + "step": 5003 + }, + { + "epoch": 3.57, + "grad_norm": 1.554011978745569, + "learning_rate": 2.977046446207604e-07, + "loss": 0.0176, + "step": 5004 + }, + { + "epoch": 3.57, + "grad_norm": 4.669169960672686, + "learning_rate": 2.967230482858863e-07, + "loss": 0.0252, + "step": 5005 + }, + { + "epoch": 3.57, + "grad_norm": 6.6094996745112695, + "learning_rate": 2.957430234032627e-07, + "loss": 0.0215, + "step": 5006 + }, + { + "epoch": 3.57, + "grad_norm": 11.110930559363911, + "learning_rate": 2.947645703003338e-07, + "loss": 0.0281, + "step": 5007 + }, + { + "epoch": 3.57, + "grad_norm": 5.9334704383066175, + "learning_rate": 2.937876893040209e-07, + "loss": 0.0171, + "step": 5008 + }, + { + "epoch": 3.58, + "grad_norm": 4.423469270196388, + "learning_rate": 2.9281238074071463e-07, + "loss": 0.0315, + "step": 5009 + }, + { + "epoch": 3.58, + "grad_norm": 1.592156374591865, + "learning_rate": 2.9183864493628756e-07, + "loss": 0.0227, + "step": 5010 + }, + { + "epoch": 3.58, + "grad_norm": 4.119354327917944, + "learning_rate": 2.908664822160806e-07, + "loss": 0.0254, + "step": 5011 + }, + { + "epoch": 3.58, + "grad_norm": 4.884003650845199, + "learning_rate": 2.898958929049117e-07, + "loss": 0.0186, + "step": 5012 + }, + { + "epoch": 3.58, + "grad_norm": 1.961721413748107, + "learning_rate": 2.889268773270731e-07, + "loss": 0.0222, + "step": 5013 + }, + { + "epoch": 3.58, + "grad_norm": 2.7963462297389414, + "learning_rate": 2.879594358063303e-07, + "loss": 0.0233, + "step": 5014 + }, + { + "epoch": 3.58, + "grad_norm": 2.468003197724322, + "learning_rate": 2.869935686659248e-07, + "loss": 0.0169, + "step": 5015 + }, + { + "epoch": 3.58, + "grad_norm": 8.136562695225999, + "learning_rate": 2.8602927622856935e-07, + "loss": 0.0215, + "step": 5016 + }, + { + "epoch": 3.58, + "grad_norm": 5.485663261521494, + "learning_rate": 2.8506655881645305e-07, + "loss": 0.0159, + "step": 5017 + }, + { + "epoch": 3.58, + "grad_norm": 2.0505850995875723, + "learning_rate": 2.841054167512369e-07, + "loss": 0.0294, + "step": 5018 + }, + { + "epoch": 3.58, + "grad_norm": 7.481500531403321, + "learning_rate": 2.8314585035405683e-07, + "loss": 0.0284, + "step": 5019 + }, + { + "epoch": 3.58, + "grad_norm": 2.9777680960797746, + "learning_rate": 2.8218785994552136e-07, + "loss": 0.028, + "step": 5020 + }, + { + "epoch": 3.58, + "grad_norm": 1.8732685907666995, + "learning_rate": 2.8123144584571326e-07, + "loss": 0.0196, + "step": 5021 + }, + { + "epoch": 3.58, + "grad_norm": 5.086031806559671, + "learning_rate": 2.8027660837418813e-07, + "loss": 0.0253, + "step": 5022 + }, + { + "epoch": 3.59, + "grad_norm": 6.7694477711519685, + "learning_rate": 2.793233478499752e-07, + "loss": 0.0291, + "step": 5023 + }, + { + "epoch": 3.59, + "grad_norm": 3.4396884785592605, + "learning_rate": 2.7837166459157625e-07, + "loss": 0.0202, + "step": 5024 + }, + { + "epoch": 3.59, + "grad_norm": 5.189513945664995, + "learning_rate": 2.77421558916966e-07, + "loss": 0.0231, + "step": 5025 + }, + { + "epoch": 3.59, + "grad_norm": 2.550855581854903, + "learning_rate": 2.764730311435931e-07, + "loss": 0.014, + "step": 5026 + }, + { + "epoch": 3.59, + "grad_norm": 4.1416552070916355, + "learning_rate": 2.755260815883781e-07, + "loss": 0.0367, + "step": 5027 + }, + { + "epoch": 3.59, + "grad_norm": 2.011663446639588, + "learning_rate": 2.745807105677145e-07, + "loss": 0.0251, + "step": 5028 + }, + { + "epoch": 3.59, + "grad_norm": 7.231248835410802, + "learning_rate": 2.736369183974685e-07, + "loss": 0.02, + "step": 5029 + }, + { + "epoch": 3.59, + "grad_norm": 2.63783442077024, + "learning_rate": 2.726947053929768e-07, + "loss": 0.0271, + "step": 5030 + }, + { + "epoch": 3.59, + "grad_norm": 3.454747329266388, + "learning_rate": 2.7175407186905367e-07, + "loss": 0.017, + "step": 5031 + }, + { + "epoch": 3.59, + "grad_norm": 5.611368469459556, + "learning_rate": 2.708150181399788e-07, + "loss": 0.0202, + "step": 5032 + }, + { + "epoch": 3.59, + "grad_norm": 2.2270437823391966, + "learning_rate": 2.698775445195101e-07, + "loss": 0.0245, + "step": 5033 + }, + { + "epoch": 3.59, + "grad_norm": 2.6227149241256744, + "learning_rate": 2.689416513208726e-07, + "loss": 0.0145, + "step": 5034 + }, + { + "epoch": 3.59, + "grad_norm": 3.930124224787002, + "learning_rate": 2.6800733885676833e-07, + "loss": 0.0425, + "step": 5035 + }, + { + "epoch": 3.59, + "grad_norm": 3.6236506163310427, + "learning_rate": 2.6707460743936653e-07, + "loss": 0.0212, + "step": 5036 + }, + { + "epoch": 3.6, + "grad_norm": 7.174749354653931, + "learning_rate": 2.6614345738031014e-07, + "loss": 0.0288, + "step": 5037 + }, + { + "epoch": 3.6, + "grad_norm": 3.9949991370218254, + "learning_rate": 2.6521388899071467e-07, + "loss": 0.0237, + "step": 5038 + }, + { + "epoch": 3.6, + "grad_norm": 1.4321127479377347, + "learning_rate": 2.642859025811656e-07, + "loss": 0.017, + "step": 5039 + }, + { + "epoch": 3.6, + "grad_norm": 4.085364605840262, + "learning_rate": 2.633594984617199e-07, + "loss": 0.0229, + "step": 5040 + }, + { + "epoch": 3.6, + "grad_norm": 4.510319037477354, + "learning_rate": 2.624346769419078e-07, + "loss": 0.0276, + "step": 5041 + }, + { + "epoch": 3.6, + "grad_norm": 2.265080203783568, + "learning_rate": 2.6151143833072824e-07, + "loss": 0.0195, + "step": 5042 + }, + { + "epoch": 3.6, + "grad_norm": 2.381010118389464, + "learning_rate": 2.605897829366527e-07, + "loss": 0.0164, + "step": 5043 + }, + { + "epoch": 3.6, + "grad_norm": 2.1997765716885223, + "learning_rate": 2.596697110676233e-07, + "loss": 0.0253, + "step": 5044 + }, + { + "epoch": 3.6, + "grad_norm": 2.80303572995716, + "learning_rate": 2.5875122303105403e-07, + "loss": 0.0303, + "step": 5045 + }, + { + "epoch": 3.6, + "grad_norm": 2.042229652025857, + "learning_rate": 2.5783431913382673e-07, + "loss": 0.0197, + "step": 5046 + }, + { + "epoch": 3.6, + "grad_norm": 3.4939362138330483, + "learning_rate": 2.5691899968229904e-07, + "loss": 0.0174, + "step": 5047 + }, + { + "epoch": 3.6, + "grad_norm": 1.4919612882154958, + "learning_rate": 2.560052649822925e-07, + "loss": 0.0164, + "step": 5048 + }, + { + "epoch": 3.6, + "grad_norm": 2.9015314648170483, + "learning_rate": 2.5509311533910674e-07, + "loss": 0.0202, + "step": 5049 + }, + { + "epoch": 3.6, + "grad_norm": 2.6111830304001855, + "learning_rate": 2.5418255105750465e-07, + "loss": 0.0182, + "step": 5050 + }, + { + "epoch": 3.61, + "grad_norm": 3.25436373070333, + "learning_rate": 2.532735724417251e-07, + "loss": 0.0296, + "step": 5051 + }, + { + "epoch": 3.61, + "grad_norm": 2.0050448829571654, + "learning_rate": 2.52366179795473e-07, + "loss": 0.0205, + "step": 5052 + }, + { + "epoch": 3.61, + "grad_norm": 5.903582016575689, + "learning_rate": 2.5146037342192673e-07, + "loss": 0.0338, + "step": 5053 + }, + { + "epoch": 3.61, + "grad_norm": 3.84120253557257, + "learning_rate": 2.505561536237311e-07, + "loss": 0.0205, + "step": 5054 + }, + { + "epoch": 3.61, + "grad_norm": 3.1135264434051866, + "learning_rate": 2.496535207030043e-07, + "loss": 0.0112, + "step": 5055 + }, + { + "epoch": 3.61, + "grad_norm": 1.5988271065115593, + "learning_rate": 2.4875247496133234e-07, + "loss": 0.0239, + "step": 5056 + }, + { + "epoch": 3.61, + "grad_norm": 2.5038282984009146, + "learning_rate": 2.4785301669977116e-07, + "loss": 0.0233, + "step": 5057 + }, + { + "epoch": 3.61, + "grad_norm": 3.2800353162346583, + "learning_rate": 2.469551462188463e-07, + "loss": 0.0207, + "step": 5058 + }, + { + "epoch": 3.61, + "grad_norm": 2.5254055880544684, + "learning_rate": 2.460588638185535e-07, + "loss": 0.0244, + "step": 5059 + }, + { + "epoch": 3.61, + "grad_norm": 2.135851694029928, + "learning_rate": 2.45164169798357e-07, + "loss": 0.0271, + "step": 5060 + }, + { + "epoch": 3.61, + "grad_norm": 5.307448036798728, + "learning_rate": 2.4427106445719053e-07, + "loss": 0.0141, + "step": 5061 + }, + { + "epoch": 3.61, + "grad_norm": 3.9909825038257694, + "learning_rate": 2.4337954809345807e-07, + "loss": 0.0143, + "step": 5062 + }, + { + "epoch": 3.61, + "grad_norm": 5.963517100815372, + "learning_rate": 2.4248962100503095e-07, + "loss": 0.0257, + "step": 5063 + }, + { + "epoch": 3.61, + "grad_norm": 3.314407673144177, + "learning_rate": 2.416012834892506e-07, + "loss": 0.0271, + "step": 5064 + }, + { + "epoch": 3.62, + "grad_norm": 3.1641223902931577, + "learning_rate": 2.4071453584292693e-07, + "loss": 0.0187, + "step": 5065 + }, + { + "epoch": 3.62, + "grad_norm": 2.1701515875255235, + "learning_rate": 2.3982937836233954e-07, + "loss": 0.0261, + "step": 5066 + }, + { + "epoch": 3.62, + "grad_norm": 3.6782775860952124, + "learning_rate": 2.389458113432347e-07, + "loss": 0.0218, + "step": 5067 + }, + { + "epoch": 3.62, + "grad_norm": 3.2993387317839593, + "learning_rate": 2.380638350808301e-07, + "loss": 0.0245, + "step": 5068 + }, + { + "epoch": 3.62, + "grad_norm": 1.5026356633345086, + "learning_rate": 2.371834498698089e-07, + "loss": 0.0189, + "step": 5069 + }, + { + "epoch": 3.62, + "grad_norm": 3.729960127563879, + "learning_rate": 2.363046560043264e-07, + "loss": 0.0256, + "step": 5070 + }, + { + "epoch": 3.62, + "grad_norm": 3.429117472534288, + "learning_rate": 2.3542745377800046e-07, + "loss": 0.0197, + "step": 5071 + }, + { + "epoch": 3.62, + "grad_norm": 2.2548825921747713, + "learning_rate": 2.3455184348392446e-07, + "loss": 0.014, + "step": 5072 + }, + { + "epoch": 3.62, + "grad_norm": 5.749312446063484, + "learning_rate": 2.3367782541465268e-07, + "loss": 0.0181, + "step": 5073 + }, + { + "epoch": 3.62, + "grad_norm": 4.623364041936797, + "learning_rate": 2.3280539986221317e-07, + "loss": 0.0201, + "step": 5074 + }, + { + "epoch": 3.62, + "grad_norm": 2.473930184609748, + "learning_rate": 2.3193456711809837e-07, + "loss": 0.025, + "step": 5075 + }, + { + "epoch": 3.62, + "grad_norm": 3.597217833878548, + "learning_rate": 2.3106532747327104e-07, + "loss": 0.0147, + "step": 5076 + }, + { + "epoch": 3.62, + "grad_norm": 2.8255040242530125, + "learning_rate": 2.3019768121815777e-07, + "loss": 0.0132, + "step": 5077 + }, + { + "epoch": 3.62, + "grad_norm": 2.3758633766254955, + "learning_rate": 2.2933162864265836e-07, + "loss": 0.015, + "step": 5078 + }, + { + "epoch": 3.63, + "grad_norm": 4.64748860266476, + "learning_rate": 2.2846717003613462e-07, + "loss": 0.0245, + "step": 5079 + }, + { + "epoch": 3.63, + "grad_norm": 3.860781440775316, + "learning_rate": 2.2760430568741943e-07, + "loss": 0.021, + "step": 5080 + }, + { + "epoch": 3.63, + "grad_norm": 2.042023533674793, + "learning_rate": 2.2674303588481162e-07, + "loss": 0.0194, + "step": 5081 + }, + { + "epoch": 3.63, + "grad_norm": 1.7607599316113298, + "learning_rate": 2.258833609160771e-07, + "loss": 0.021, + "step": 5082 + }, + { + "epoch": 3.63, + "grad_norm": 4.441214972475646, + "learning_rate": 2.2502528106845e-07, + "loss": 0.0197, + "step": 5083 + }, + { + "epoch": 3.63, + "grad_norm": 4.545608016353152, + "learning_rate": 2.241687966286299e-07, + "loss": 0.0257, + "step": 5084 + }, + { + "epoch": 3.63, + "grad_norm": 1.6754502037667738, + "learning_rate": 2.233139078827845e-07, + "loss": 0.0189, + "step": 5085 + }, + { + "epoch": 3.63, + "grad_norm": 1.0124530351530259, + "learning_rate": 2.2246061511654816e-07, + "loss": 0.013, + "step": 5086 + }, + { + "epoch": 3.63, + "grad_norm": 1.3560170545749979, + "learning_rate": 2.2160891861502165e-07, + "loss": 0.0203, + "step": 5087 + }, + { + "epoch": 3.63, + "grad_norm": 3.614265986080177, + "learning_rate": 2.2075881866277348e-07, + "loss": 0.0145, + "step": 5088 + }, + { + "epoch": 3.63, + "grad_norm": 1.8367535225634963, + "learning_rate": 2.199103155438359e-07, + "loss": 0.0218, + "step": 5089 + }, + { + "epoch": 3.63, + "grad_norm": 2.112045433383258, + "learning_rate": 2.1906340954171212e-07, + "loss": 0.0324, + "step": 5090 + }, + { + "epoch": 3.63, + "grad_norm": 2.89243344264716, + "learning_rate": 2.1821810093936636e-07, + "loss": 0.0231, + "step": 5091 + }, + { + "epoch": 3.63, + "grad_norm": 2.5341079427529376, + "learning_rate": 2.1737439001923488e-07, + "loss": 0.0171, + "step": 5092 + }, + { + "epoch": 3.64, + "grad_norm": 12.959994215228567, + "learning_rate": 2.1653227706321388e-07, + "loss": 0.055, + "step": 5093 + }, + { + "epoch": 3.64, + "grad_norm": 2.8515096352994016, + "learning_rate": 2.156917623526722e-07, + "loss": 0.0296, + "step": 5094 + }, + { + "epoch": 3.64, + "grad_norm": 3.541543190563925, + "learning_rate": 2.1485284616843904e-07, + "loss": 0.0184, + "step": 5095 + }, + { + "epoch": 3.64, + "grad_norm": 3.7930353728065533, + "learning_rate": 2.140155287908141e-07, + "loss": 0.0248, + "step": 5096 + }, + { + "epoch": 3.64, + "grad_norm": 3.481361108056802, + "learning_rate": 2.131798104995586e-07, + "loss": 0.0275, + "step": 5097 + }, + { + "epoch": 3.64, + "grad_norm": 6.486064903589152, + "learning_rate": 2.123456915739025e-07, + "loss": 0.0203, + "step": 5098 + }, + { + "epoch": 3.64, + "grad_norm": 3.4582584714064426, + "learning_rate": 2.115131722925401e-07, + "loss": 0.0294, + "step": 5099 + }, + { + "epoch": 3.64, + "grad_norm": 1.3294381184143647, + "learning_rate": 2.1068225293363166e-07, + "loss": 0.023, + "step": 5100 + }, + { + "epoch": 3.64, + "grad_norm": 2.312292253078772, + "learning_rate": 2.0985293377480342e-07, + "loss": 0.0206, + "step": 5101 + }, + { + "epoch": 3.64, + "grad_norm": 3.2619168492415973, + "learning_rate": 2.0902521509314543e-07, + "loss": 0.0173, + "step": 5102 + }, + { + "epoch": 3.64, + "grad_norm": 3.81893103016964, + "learning_rate": 2.0819909716521426e-07, + "loss": 0.0235, + "step": 5103 + }, + { + "epoch": 3.64, + "grad_norm": 2.3150610495838744, + "learning_rate": 2.0737458026703182e-07, + "loss": 0.0168, + "step": 5104 + }, + { + "epoch": 3.64, + "grad_norm": 4.629763159973222, + "learning_rate": 2.0655166467408283e-07, + "loss": 0.0433, + "step": 5105 + }, + { + "epoch": 3.64, + "grad_norm": 5.583557149885652, + "learning_rate": 2.057303506613212e-07, + "loss": 0.0238, + "step": 5106 + }, + { + "epoch": 3.65, + "grad_norm": 3.0910682729647805, + "learning_rate": 2.049106385031602e-07, + "loss": 0.0221, + "step": 5107 + }, + { + "epoch": 3.65, + "grad_norm": 4.763394187633355, + "learning_rate": 2.0409252847348404e-07, + "loss": 0.022, + "step": 5108 + }, + { + "epoch": 3.65, + "grad_norm": 1.864205678587097, + "learning_rate": 2.032760208456358e-07, + "loss": 0.0174, + "step": 5109 + }, + { + "epoch": 3.65, + "grad_norm": 5.291056945540742, + "learning_rate": 2.0246111589242835e-07, + "loss": 0.0265, + "step": 5110 + }, + { + "epoch": 3.65, + "grad_norm": 1.921185917208169, + "learning_rate": 2.0164781388613386e-07, + "loss": 0.0202, + "step": 5111 + }, + { + "epoch": 3.65, + "grad_norm": 7.495862039740579, + "learning_rate": 2.0083611509849443e-07, + "loss": 0.0235, + "step": 5112 + }, + { + "epoch": 3.65, + "grad_norm": 1.3689942096771623, + "learning_rate": 2.0002601980071145e-07, + "loss": 0.0245, + "step": 5113 + }, + { + "epoch": 3.65, + "grad_norm": 4.366828293872557, + "learning_rate": 1.9921752826345397e-07, + "loss": 0.0237, + "step": 5114 + }, + { + "epoch": 3.65, + "grad_norm": 1.9223888609093516, + "learning_rate": 1.9841064075685367e-07, + "loss": 0.0229, + "step": 5115 + }, + { + "epoch": 3.65, + "grad_norm": 1.2881973687247583, + "learning_rate": 1.9760535755050715e-07, + "loss": 0.0125, + "step": 5116 + }, + { + "epoch": 3.65, + "grad_norm": 2.6967817037967867, + "learning_rate": 1.9680167891347356e-07, + "loss": 0.016, + "step": 5117 + }, + { + "epoch": 3.65, + "grad_norm": 3.439604203890696, + "learning_rate": 1.9599960511427761e-07, + "loss": 0.0323, + "step": 5118 + }, + { + "epoch": 3.65, + "grad_norm": 1.5596126183501706, + "learning_rate": 1.9519913642090715e-07, + "loss": 0.0179, + "step": 5119 + }, + { + "epoch": 3.65, + "grad_norm": 2.9106021839335043, + "learning_rate": 1.9440027310081323e-07, + "loss": 0.0309, + "step": 5120 + }, + { + "epoch": 3.66, + "grad_norm": 4.111574300372222, + "learning_rate": 1.9360301542091065e-07, + "loss": 0.0214, + "step": 5121 + }, + { + "epoch": 3.66, + "grad_norm": 3.430689782203096, + "learning_rate": 1.9280736364757912e-07, + "loss": 0.0167, + "step": 5122 + }, + { + "epoch": 3.66, + "grad_norm": 2.7827580688892324, + "learning_rate": 1.9201331804665934e-07, + "loss": 0.0135, + "step": 5123 + }, + { + "epoch": 3.66, + "grad_norm": 1.750804909510771, + "learning_rate": 1.9122087888345798e-07, + "loss": 0.0179, + "step": 5124 + }, + { + "epoch": 3.66, + "grad_norm": 3.102904067177262, + "learning_rate": 1.9043004642274266e-07, + "loss": 0.0181, + "step": 5125 + }, + { + "epoch": 3.66, + "grad_norm": 1.4913641627966314, + "learning_rate": 1.896408209287459e-07, + "loss": 0.0184, + "step": 5126 + }, + { + "epoch": 3.66, + "grad_norm": 1.2369411725617043, + "learning_rate": 1.888532026651624e-07, + "loss": 0.0197, + "step": 5127 + }, + { + "epoch": 3.66, + "grad_norm": 3.1080100506244572, + "learning_rate": 1.880671918951499e-07, + "loss": 0.016, + "step": 5128 + }, + { + "epoch": 3.66, + "grad_norm": 3.60563971987711, + "learning_rate": 1.8728278888132944e-07, + "loss": 0.0275, + "step": 5129 + }, + { + "epoch": 3.66, + "grad_norm": 2.013124995048135, + "learning_rate": 1.864999938857842e-07, + "loss": 0.0272, + "step": 5130 + }, + { + "epoch": 3.66, + "grad_norm": 3.569223411896495, + "learning_rate": 1.8571880717006218e-07, + "loss": 0.013, + "step": 5131 + }, + { + "epoch": 3.66, + "grad_norm": 2.717475672756776, + "learning_rate": 1.8493922899516902e-07, + "loss": 0.0257, + "step": 5132 + }, + { + "epoch": 3.66, + "grad_norm": 3.2812818109675796, + "learning_rate": 1.8416125962157971e-07, + "loss": 0.0374, + "step": 5133 + }, + { + "epoch": 3.66, + "grad_norm": 6.769023739742816, + "learning_rate": 1.8338489930922632e-07, + "loss": 0.0213, + "step": 5134 + }, + { + "epoch": 3.67, + "grad_norm": 5.479076444685034, + "learning_rate": 1.8261014831750633e-07, + "loss": 0.0232, + "step": 5135 + }, + { + "epoch": 3.67, + "grad_norm": 3.1296442552646995, + "learning_rate": 1.8183700690527717e-07, + "loss": 0.0199, + "step": 5136 + }, + { + "epoch": 3.67, + "grad_norm": 2.5439097741169814, + "learning_rate": 1.810654753308616e-07, + "loss": 0.0301, + "step": 5137 + }, + { + "epoch": 3.67, + "grad_norm": 1.373119819713574, + "learning_rate": 1.8029555385204067e-07, + "loss": 0.0166, + "step": 5138 + }, + { + "epoch": 3.67, + "grad_norm": 2.135208763283895, + "learning_rate": 1.795272427260608e-07, + "loss": 0.0168, + "step": 5139 + }, + { + "epoch": 3.67, + "grad_norm": 2.092430470504436, + "learning_rate": 1.7876054220962835e-07, + "loss": 0.0161, + "step": 5140 + }, + { + "epoch": 3.67, + "grad_norm": 2.8211956696294695, + "learning_rate": 1.779954525589128e-07, + "loss": 0.0222, + "step": 5141 + }, + { + "epoch": 3.67, + "grad_norm": 1.7382385750874663, + "learning_rate": 1.7723197402954419e-07, + "loss": 0.0143, + "step": 5142 + }, + { + "epoch": 3.67, + "grad_norm": 3.548820464246968, + "learning_rate": 1.7647010687661558e-07, + "loss": 0.0152, + "step": 5143 + }, + { + "epoch": 3.67, + "grad_norm": 1.9248454205191103, + "learning_rate": 1.757098513546801e-07, + "loss": 0.0228, + "step": 5144 + }, + { + "epoch": 3.67, + "grad_norm": 5.791954592906748, + "learning_rate": 1.74951207717754e-07, + "loss": 0.0251, + "step": 5145 + }, + { + "epoch": 3.67, + "grad_norm": 3.654095689991998, + "learning_rate": 1.7419417621931388e-07, + "loss": 0.021, + "step": 5146 + }, + { + "epoch": 3.67, + "grad_norm": 3.678062581923196, + "learning_rate": 1.7343875711229864e-07, + "loss": 0.0274, + "step": 5147 + }, + { + "epoch": 3.67, + "grad_norm": 2.4221492030173604, + "learning_rate": 1.7268495064910574e-07, + "loss": 0.0193, + "step": 5148 + }, + { + "epoch": 3.68, + "grad_norm": 2.665876583116908, + "learning_rate": 1.719327570815993e-07, + "loss": 0.0284, + "step": 5149 + }, + { + "epoch": 3.68, + "grad_norm": 4.023954591813659, + "learning_rate": 1.711821766610977e-07, + "loss": 0.0265, + "step": 5150 + }, + { + "epoch": 3.68, + "grad_norm": 2.666367457998923, + "learning_rate": 1.704332096383865e-07, + "loss": 0.0207, + "step": 5151 + }, + { + "epoch": 3.68, + "grad_norm": 4.801752871559416, + "learning_rate": 1.696858562637077e-07, + "loss": 0.014, + "step": 5152 + }, + { + "epoch": 3.68, + "grad_norm": 2.7854192881731974, + "learning_rate": 1.689401167867677e-07, + "loss": 0.0282, + "step": 5153 + }, + { + "epoch": 3.68, + "grad_norm": 2.964894722808635, + "learning_rate": 1.6819599145672993e-07, + "loss": 0.028, + "step": 5154 + }, + { + "epoch": 3.68, + "grad_norm": 3.7778238501101375, + "learning_rate": 1.674534805222222e-07, + "loss": 0.0338, + "step": 5155 + }, + { + "epoch": 3.68, + "grad_norm": 2.277752038612983, + "learning_rate": 1.667125842313305e-07, + "loss": 0.0182, + "step": 5156 + }, + { + "epoch": 3.68, + "grad_norm": 1.2367806497125986, + "learning_rate": 1.6597330283160184e-07, + "loss": 0.0113, + "step": 5157 + }, + { + "epoch": 3.68, + "grad_norm": 8.140836309513228, + "learning_rate": 1.6523563657004416e-07, + "loss": 0.0375, + "step": 5158 + }, + { + "epoch": 3.68, + "grad_norm": 2.5710975609049305, + "learning_rate": 1.644995856931253e-07, + "loss": 0.02, + "step": 5159 + }, + { + "epoch": 3.68, + "grad_norm": 2.969233743080048, + "learning_rate": 1.6376515044677354e-07, + "loss": 0.0219, + "step": 5160 + }, + { + "epoch": 3.68, + "grad_norm": 2.177218001293877, + "learning_rate": 1.630323310763776e-07, + "loss": 0.0217, + "step": 5161 + }, + { + "epoch": 3.68, + "grad_norm": 2.1841237881191446, + "learning_rate": 1.6230112782678608e-07, + "loss": 0.0266, + "step": 5162 + }, + { + "epoch": 3.69, + "grad_norm": 2.4817841006206276, + "learning_rate": 1.6157154094230744e-07, + "loss": 0.0182, + "step": 5163 + }, + { + "epoch": 3.69, + "grad_norm": 4.016931727249433, + "learning_rate": 1.6084357066670997e-07, + "loss": 0.0272, + "step": 5164 + }, + { + "epoch": 3.69, + "grad_norm": 1.879190602814266, + "learning_rate": 1.601172172432225e-07, + "loss": 0.0209, + "step": 5165 + }, + { + "epoch": 3.69, + "grad_norm": 2.778061083524743, + "learning_rate": 1.5939248091453252e-07, + "loss": 0.0218, + "step": 5166 + }, + { + "epoch": 3.69, + "grad_norm": 5.732208780209183, + "learning_rate": 1.5866936192278915e-07, + "loss": 0.0325, + "step": 5167 + }, + { + "epoch": 3.69, + "grad_norm": 1.4706232698161006, + "learning_rate": 1.5794786050959797e-07, + "loss": 0.0134, + "step": 5168 + }, + { + "epoch": 3.69, + "grad_norm": 6.98594884392636, + "learning_rate": 1.5722797691602842e-07, + "loss": 0.0319, + "step": 5169 + }, + { + "epoch": 3.69, + "grad_norm": 3.349426832853079, + "learning_rate": 1.5650971138260473e-07, + "loss": 0.0248, + "step": 5170 + }, + { + "epoch": 3.69, + "grad_norm": 4.976579295378491, + "learning_rate": 1.5579306414931493e-07, + "loss": 0.0274, + "step": 5171 + }, + { + "epoch": 3.69, + "grad_norm": 2.608275894549004, + "learning_rate": 1.5507803545560195e-07, + "loss": 0.0272, + "step": 5172 + }, + { + "epoch": 3.69, + "grad_norm": 3.1503623838936474, + "learning_rate": 1.543646255403719e-07, + "loss": 0.0266, + "step": 5173 + }, + { + "epoch": 3.69, + "grad_norm": 1.5556403344359921, + "learning_rate": 1.5365283464198743e-07, + "loss": 0.0189, + "step": 5174 + }, + { + "epoch": 3.69, + "grad_norm": 8.45265240922781, + "learning_rate": 1.529426629982711e-07, + "loss": 0.0317, + "step": 5175 + }, + { + "epoch": 3.69, + "grad_norm": 4.4790627118344535, + "learning_rate": 1.5223411084650476e-07, + "loss": 0.02, + "step": 5176 + }, + { + "epoch": 3.7, + "grad_norm": 2.2062655844358336, + "learning_rate": 1.5152717842342845e-07, + "loss": 0.0246, + "step": 5177 + }, + { + "epoch": 3.7, + "grad_norm": 2.6205269218486786, + "learning_rate": 1.5082186596524218e-07, + "loss": 0.0175, + "step": 5178 + }, + { + "epoch": 3.7, + "grad_norm": 3.0051366479489072, + "learning_rate": 1.501181737076035e-07, + "loss": 0.0238, + "step": 5179 + }, + { + "epoch": 3.7, + "grad_norm": 4.819673804209162, + "learning_rate": 1.4941610188562884e-07, + "loss": 0.0199, + "step": 5180 + }, + { + "epoch": 3.7, + "grad_norm": 5.138577134268833, + "learning_rate": 1.4871565073389382e-07, + "loss": 0.0246, + "step": 5181 + }, + { + "epoch": 3.7, + "grad_norm": 3.8138304855316023, + "learning_rate": 1.4801682048643183e-07, + "loss": 0.0242, + "step": 5182 + }, + { + "epoch": 3.7, + "grad_norm": 1.4542929428683837, + "learning_rate": 1.4731961137673555e-07, + "loss": 0.016, + "step": 5183 + }, + { + "epoch": 3.7, + "grad_norm": 4.104948196555313, + "learning_rate": 1.466240236377553e-07, + "loss": 0.0257, + "step": 5184 + }, + { + "epoch": 3.7, + "grad_norm": 3.5735418575307967, + "learning_rate": 1.4593005750189958e-07, + "loss": 0.0232, + "step": 5185 + }, + { + "epoch": 3.7, + "grad_norm": 1.8507033348331547, + "learning_rate": 1.4523771320103574e-07, + "loss": 0.0195, + "step": 5186 + }, + { + "epoch": 3.7, + "grad_norm": 1.8413134451311026, + "learning_rate": 1.4454699096648873e-07, + "loss": 0.0345, + "step": 5187 + }, + { + "epoch": 3.7, + "grad_norm": 3.3278075671092697, + "learning_rate": 1.4385789102904168e-07, + "loss": 0.0301, + "step": 5188 + }, + { + "epoch": 3.7, + "grad_norm": 2.3248996332848746, + "learning_rate": 1.4317041361893546e-07, + "loss": 0.0295, + "step": 5189 + }, + { + "epoch": 3.7, + "grad_norm": 3.4264176035418945, + "learning_rate": 1.4248455896587022e-07, + "loss": 0.0327, + "step": 5190 + }, + { + "epoch": 3.71, + "grad_norm": 1.728019296230289, + "learning_rate": 1.418003272990004e-07, + "loss": 0.0189, + "step": 5191 + }, + { + "epoch": 3.71, + "grad_norm": 1.489162413811022, + "learning_rate": 1.4111771884694315e-07, + "loss": 0.0203, + "step": 5192 + }, + { + "epoch": 3.71, + "grad_norm": 1.9524657898968014, + "learning_rate": 1.4043673383776825e-07, + "loss": 0.0123, + "step": 5193 + }, + { + "epoch": 3.71, + "grad_norm": 2.415123271352204, + "learning_rate": 1.3975737249900812e-07, + "loss": 0.0266, + "step": 5194 + }, + { + "epoch": 3.71, + "grad_norm": 2.8692636393398847, + "learning_rate": 1.3907963505764731e-07, + "loss": 0.0243, + "step": 5195 + }, + { + "epoch": 3.71, + "grad_norm": 2.8502413687478807, + "learning_rate": 1.384035217401325e-07, + "loss": 0.0139, + "step": 5196 + }, + { + "epoch": 3.71, + "grad_norm": 2.862718769125452, + "learning_rate": 1.3772903277236404e-07, + "loss": 0.0235, + "step": 5197 + }, + { + "epoch": 3.71, + "grad_norm": 2.1513904659754792, + "learning_rate": 1.370561683797028e-07, + "loss": 0.0271, + "step": 5198 + }, + { + "epoch": 3.71, + "grad_norm": 6.519218790872213, + "learning_rate": 1.363849287869645e-07, + "loss": 0.0378, + "step": 5199 + }, + { + "epoch": 3.71, + "grad_norm": 4.489049467119617, + "learning_rate": 1.3571531421842256e-07, + "loss": 0.0273, + "step": 5200 + }, + { + "epoch": 3.71, + "grad_norm": 2.3577172209252852, + "learning_rate": 1.3504732489780849e-07, + "loss": 0.0219, + "step": 5201 + }, + { + "epoch": 3.71, + "grad_norm": 5.7833943763627795, + "learning_rate": 1.3438096104830879e-07, + "loss": 0.0217, + "step": 5202 + }, + { + "epoch": 3.71, + "grad_norm": 1.988371201831818, + "learning_rate": 1.3371622289256869e-07, + "loss": 0.0202, + "step": 5203 + }, + { + "epoch": 3.71, + "grad_norm": 2.425503910065707, + "learning_rate": 1.3305311065269e-07, + "loss": 0.0198, + "step": 5204 + }, + { + "epoch": 3.72, + "grad_norm": 3.090722053136764, + "learning_rate": 1.323916245502299e-07, + "loss": 0.0245, + "step": 5205 + }, + { + "epoch": 3.72, + "grad_norm": 4.9627962283694345, + "learning_rate": 1.3173176480620442e-07, + "loss": 0.0244, + "step": 5206 + }, + { + "epoch": 3.72, + "grad_norm": 9.206592423133095, + "learning_rate": 1.3107353164108273e-07, + "loss": 0.0316, + "step": 5207 + }, + { + "epoch": 3.72, + "grad_norm": 2.7039312687504373, + "learning_rate": 1.3041692527479556e-07, + "loss": 0.022, + "step": 5208 + }, + { + "epoch": 3.72, + "grad_norm": 1.3752954379070126, + "learning_rate": 1.2976194592672465e-07, + "loss": 0.0183, + "step": 5209 + }, + { + "epoch": 3.72, + "grad_norm": 4.6190385954775905, + "learning_rate": 1.2910859381571327e-07, + "loss": 0.0233, + "step": 5210 + }, + { + "epoch": 3.72, + "grad_norm": 3.210019534830187, + "learning_rate": 1.284568691600563e-07, + "loss": 0.0186, + "step": 5211 + }, + { + "epoch": 3.72, + "grad_norm": 3.4280035373439843, + "learning_rate": 1.2780677217750949e-07, + "loss": 0.0217, + "step": 5212 + }, + { + "epoch": 3.72, + "grad_norm": 2.9802093187192633, + "learning_rate": 1.271583030852791e-07, + "loss": 0.0247, + "step": 5213 + }, + { + "epoch": 3.72, + "grad_norm": 1.385873022404817, + "learning_rate": 1.2651146210003406e-07, + "loss": 0.0176, + "step": 5214 + }, + { + "epoch": 3.72, + "grad_norm": 5.529817438941661, + "learning_rate": 1.2586624943789372e-07, + "loss": 0.0199, + "step": 5215 + }, + { + "epoch": 3.72, + "grad_norm": 2.3117896694300617, + "learning_rate": 1.2522266531443616e-07, + "loss": 0.0269, + "step": 5216 + }, + { + "epoch": 3.72, + "grad_norm": 7.010813791691153, + "learning_rate": 1.245807099446955e-07, + "loss": 0.0401, + "step": 5217 + }, + { + "epoch": 3.72, + "grad_norm": 6.309724081611123, + "learning_rate": 1.239403835431602e-07, + "loss": 0.0204, + "step": 5218 + }, + { + "epoch": 3.73, + "grad_norm": 3.991131983188111, + "learning_rate": 1.2330168632377514e-07, + "loss": 0.021, + "step": 5219 + }, + { + "epoch": 3.73, + "grad_norm": 3.3782189157497045, + "learning_rate": 1.2266461849994138e-07, + "loss": 0.0224, + "step": 5220 + }, + { + "epoch": 3.73, + "grad_norm": 4.944132812084641, + "learning_rate": 1.2202918028451527e-07, + "loss": 0.0171, + "step": 5221 + }, + { + "epoch": 3.73, + "grad_norm": 2.3020823968537423, + "learning_rate": 1.2139537188980753e-07, + "loss": 0.0197, + "step": 5222 + }, + { + "epoch": 3.73, + "grad_norm": 2.252902650158216, + "learning_rate": 1.207631935275866e-07, + "loss": 0.0306, + "step": 5223 + }, + { + "epoch": 3.73, + "grad_norm": 4.783919269935325, + "learning_rate": 1.2013264540907455e-07, + "loss": 0.0215, + "step": 5224 + }, + { + "epoch": 3.73, + "grad_norm": 2.5909512837118926, + "learning_rate": 1.1950372774494846e-07, + "loss": 0.0176, + "step": 5225 + }, + { + "epoch": 3.73, + "grad_norm": 4.136611119846169, + "learning_rate": 1.1887644074534244e-07, + "loss": 0.0186, + "step": 5226 + }, + { + "epoch": 3.73, + "grad_norm": 2.72318296618443, + "learning_rate": 1.182507846198444e-07, + "loss": 0.0245, + "step": 5227 + }, + { + "epoch": 3.73, + "grad_norm": 7.516987600595989, + "learning_rate": 1.1762675957749769e-07, + "loss": 0.0422, + "step": 5228 + }, + { + "epoch": 3.73, + "grad_norm": 2.5276403915062757, + "learning_rate": 1.1700436582680108e-07, + "loss": 0.0191, + "step": 5229 + }, + { + "epoch": 3.73, + "grad_norm": 4.752529791718225, + "learning_rate": 1.1638360357570654e-07, + "loss": 0.0293, + "step": 5230 + }, + { + "epoch": 3.73, + "grad_norm": 2.400911193761536, + "learning_rate": 1.157644730316243e-07, + "loss": 0.0157, + "step": 5231 + }, + { + "epoch": 3.73, + "grad_norm": 2.326937968268249, + "learning_rate": 1.1514697440141498e-07, + "loss": 0.0182, + "step": 5232 + }, + { + "epoch": 3.74, + "grad_norm": 2.4877722821251047, + "learning_rate": 1.1453110789139855e-07, + "loss": 0.0208, + "step": 5233 + }, + { + "epoch": 3.74, + "grad_norm": 3.9936599844023553, + "learning_rate": 1.1391687370734594e-07, + "loss": 0.018, + "step": 5234 + }, + { + "epoch": 3.74, + "grad_norm": 3.6287267410137187, + "learning_rate": 1.1330427205448579e-07, + "loss": 0.0285, + "step": 5235 + }, + { + "epoch": 3.74, + "grad_norm": 2.4006517456771665, + "learning_rate": 1.1269330313749715e-07, + "loss": 0.0214, + "step": 5236 + }, + { + "epoch": 3.74, + "grad_norm": 4.142343883359804, + "learning_rate": 1.1208396716051895e-07, + "loss": 0.0247, + "step": 5237 + }, + { + "epoch": 3.74, + "grad_norm": 2.711787066445167, + "learning_rate": 1.1147626432713943e-07, + "loss": 0.017, + "step": 5238 + }, + { + "epoch": 3.74, + "grad_norm": 1.7231843572349768, + "learning_rate": 1.1087019484040562e-07, + "loss": 0.0136, + "step": 5239 + }, + { + "epoch": 3.74, + "grad_norm": 6.191379324297485, + "learning_rate": 1.1026575890281443e-07, + "loss": 0.0314, + "step": 5240 + }, + { + "epoch": 3.74, + "grad_norm": 2.822860417056047, + "learning_rate": 1.0966295671632043e-07, + "loss": 0.0251, + "step": 5241 + }, + { + "epoch": 3.74, + "grad_norm": 2.203361342127882, + "learning_rate": 1.0906178848233029e-07, + "loss": 0.0172, + "step": 5242 + }, + { + "epoch": 3.74, + "grad_norm": 2.8531402032960096, + "learning_rate": 1.0846225440170611e-07, + "loss": 0.0256, + "step": 5243 + }, + { + "epoch": 3.74, + "grad_norm": 1.698947522501088, + "learning_rate": 1.0786435467476264e-07, + "loss": 0.0149, + "step": 5244 + }, + { + "epoch": 3.74, + "grad_norm": 4.130192879839065, + "learning_rate": 1.072680895012701e-07, + "loss": 0.0208, + "step": 5245 + }, + { + "epoch": 3.74, + "grad_norm": 4.923493442779139, + "learning_rate": 1.0667345908045135e-07, + "loss": 0.0206, + "step": 5246 + }, + { + "epoch": 3.75, + "grad_norm": 1.6193563517025997, + "learning_rate": 1.0608046361098356e-07, + "loss": 0.021, + "step": 5247 + }, + { + "epoch": 3.75, + "grad_norm": 5.093530894764852, + "learning_rate": 1.0548910329099771e-07, + "loss": 0.0214, + "step": 5248 + }, + { + "epoch": 3.75, + "grad_norm": 3.1583303655468944, + "learning_rate": 1.048993783180785e-07, + "loss": 0.0204, + "step": 5249 + }, + { + "epoch": 3.75, + "grad_norm": 4.86486432471581, + "learning_rate": 1.0431128888926222e-07, + "loss": 0.0346, + "step": 5250 + }, + { + "epoch": 3.75, + "grad_norm": 3.9913107188739105, + "learning_rate": 1.0372483520104337e-07, + "loss": 0.0359, + "step": 5251 + }, + { + "epoch": 3.75, + "grad_norm": 2.2381666578064294, + "learning_rate": 1.0314001744936409e-07, + "loss": 0.0213, + "step": 5252 + }, + { + "epoch": 3.75, + "grad_norm": 4.551116852075309, + "learning_rate": 1.0255683582962583e-07, + "loss": 0.0211, + "step": 5253 + }, + { + "epoch": 3.75, + "grad_norm": 1.057164791329309, + "learning_rate": 1.0197529053667721e-07, + "loss": 0.0178, + "step": 5254 + }, + { + "epoch": 3.75, + "grad_norm": 3.1358364708701503, + "learning_rate": 1.013953817648261e-07, + "loss": 0.0205, + "step": 5255 + }, + { + "epoch": 3.75, + "grad_norm": 4.521051074903339, + "learning_rate": 1.008171097078292e-07, + "loss": 0.0273, + "step": 5256 + }, + { + "epoch": 3.75, + "grad_norm": 2.9904025566820267, + "learning_rate": 1.0024047455889918e-07, + "loss": 0.0304, + "step": 5257 + }, + { + "epoch": 3.75, + "grad_norm": 3.778324379127207, + "learning_rate": 9.966547651069913e-08, + "loss": 0.0278, + "step": 5258 + }, + { + "epoch": 3.75, + "grad_norm": 10.272189356491223, + "learning_rate": 9.909211575534705e-08, + "loss": 0.0461, + "step": 5259 + }, + { + "epoch": 3.75, + "grad_norm": 2.2283715681606227, + "learning_rate": 9.852039248441414e-08, + "loss": 0.0171, + "step": 5260 + }, + { + "epoch": 3.76, + "grad_norm": 4.722409299384118, + "learning_rate": 9.79503068889226e-08, + "loss": 0.0255, + "step": 5261 + }, + { + "epoch": 3.76, + "grad_norm": 3.4508766889297706, + "learning_rate": 9.738185915935005e-08, + "loss": 0.0292, + "step": 5262 + }, + { + "epoch": 3.76, + "grad_norm": 4.496268757517527, + "learning_rate": 9.681504948562403e-08, + "loss": 0.0217, + "step": 5263 + }, + { + "epoch": 3.76, + "grad_norm": 5.404507854655809, + "learning_rate": 9.624987805712749e-08, + "loss": 0.0197, + "step": 5264 + }, + { + "epoch": 3.76, + "grad_norm": 2.622445035917951, + "learning_rate": 9.568634506269381e-08, + "loss": 0.0259, + "step": 5265 + }, + { + "epoch": 3.76, + "grad_norm": 4.884116982411068, + "learning_rate": 9.51244506906096e-08, + "loss": 0.0258, + "step": 5266 + }, + { + "epoch": 3.76, + "grad_norm": 1.8627897365580712, + "learning_rate": 9.45641951286158e-08, + "loss": 0.0194, + "step": 5267 + }, + { + "epoch": 3.76, + "grad_norm": 2.7000828431244783, + "learning_rate": 9.400557856390158e-08, + "loss": 0.0199, + "step": 5268 + }, + { + "epoch": 3.76, + "grad_norm": 3.956492664965638, + "learning_rate": 9.344860118311427e-08, + "loss": 0.0234, + "step": 5269 + }, + { + "epoch": 3.76, + "grad_norm": 1.425209166481016, + "learning_rate": 9.289326317234726e-08, + "loss": 0.0231, + "step": 5270 + }, + { + "epoch": 3.76, + "grad_norm": 4.097267096486783, + "learning_rate": 9.23395647171521e-08, + "loss": 0.0235, + "step": 5271 + }, + { + "epoch": 3.76, + "grad_norm": 2.028670905892367, + "learning_rate": 9.178750600252695e-08, + "loss": 0.0255, + "step": 5272 + }, + { + "epoch": 3.76, + "grad_norm": 4.966736021039718, + "learning_rate": 9.123708721292756e-08, + "loss": 0.028, + "step": 5273 + }, + { + "epoch": 3.76, + "grad_norm": 3.197737197596851, + "learning_rate": 9.06883085322574e-08, + "loss": 0.0216, + "step": 5274 + }, + { + "epoch": 3.77, + "grad_norm": 4.080727773730197, + "learning_rate": 9.014117014387424e-08, + "loss": 0.0352, + "step": 5275 + }, + { + "epoch": 3.77, + "grad_norm": 1.1296264280256079, + "learning_rate": 8.95956722305874e-08, + "loss": 0.0169, + "step": 5276 + }, + { + "epoch": 3.77, + "grad_norm": 4.057116883266143, + "learning_rate": 8.905181497465664e-08, + "loss": 0.0221, + "step": 5277 + }, + { + "epoch": 3.77, + "grad_norm": 3.9647187605506953, + "learning_rate": 8.850959855779662e-08, + "loss": 0.0233, + "step": 5278 + }, + { + "epoch": 3.77, + "grad_norm": 2.1040774256861408, + "learning_rate": 8.796902316117018e-08, + "loss": 0.0213, + "step": 5279 + }, + { + "epoch": 3.77, + "grad_norm": 4.65723542116145, + "learning_rate": 8.743008896539451e-08, + "loss": 0.0258, + "step": 5280 + }, + { + "epoch": 3.77, + "grad_norm": 4.876295934301376, + "learning_rate": 8.68927961505378e-08, + "loss": 0.0257, + "step": 5281 + }, + { + "epoch": 3.77, + "grad_norm": 2.8627960416897555, + "learning_rate": 8.635714489611868e-08, + "loss": 0.0314, + "step": 5282 + }, + { + "epoch": 3.77, + "grad_norm": 2.273725154988319, + "learning_rate": 8.582313538110898e-08, + "loss": 0.0161, + "step": 5283 + }, + { + "epoch": 3.77, + "grad_norm": 1.9282706264267362, + "learning_rate": 8.529076778393097e-08, + "loss": 0.0254, + "step": 5284 + }, + { + "epoch": 3.77, + "grad_norm": 2.628970273375051, + "learning_rate": 8.476004228245848e-08, + "loss": 0.0231, + "step": 5285 + }, + { + "epoch": 3.77, + "grad_norm": 3.0795335082550763, + "learning_rate": 8.42309590540169e-08, + "loss": 0.0233, + "step": 5286 + }, + { + "epoch": 3.77, + "grad_norm": 2.4895174177627926, + "learning_rate": 8.370351827538259e-08, + "loss": 0.0258, + "step": 5287 + }, + { + "epoch": 3.77, + "grad_norm": 7.5059982137655075, + "learning_rate": 8.317772012278347e-08, + "loss": 0.0257, + "step": 5288 + }, + { + "epoch": 3.78, + "grad_norm": 2.287349453785497, + "learning_rate": 8.26535647718979e-08, + "loss": 0.0153, + "step": 5289 + }, + { + "epoch": 3.78, + "grad_norm": 4.007946391371989, + "learning_rate": 8.213105239785691e-08, + "loss": 0.0142, + "step": 5290 + }, + { + "epoch": 3.78, + "grad_norm": 3.325150468656642, + "learning_rate": 8.161018317524139e-08, + "loss": 0.0186, + "step": 5291 + }, + { + "epoch": 3.78, + "grad_norm": 2.679684188837439, + "learning_rate": 8.109095727808269e-08, + "loss": 0.0303, + "step": 5292 + }, + { + "epoch": 3.78, + "grad_norm": 3.923277021778115, + "learning_rate": 8.057337487986427e-08, + "loss": 0.0271, + "step": 5293 + }, + { + "epoch": 3.78, + "grad_norm": 2.9481719065310883, + "learning_rate": 8.005743615352057e-08, + "loss": 0.022, + "step": 5294 + }, + { + "epoch": 3.78, + "grad_norm": 11.41032621498422, + "learning_rate": 7.954314127143481e-08, + "loss": 0.0336, + "step": 5295 + }, + { + "epoch": 3.78, + "grad_norm": 8.438022620378145, + "learning_rate": 7.903049040544453e-08, + "loss": 0.0272, + "step": 5296 + }, + { + "epoch": 3.78, + "grad_norm": 1.5690249006305896, + "learning_rate": 7.851948372683382e-08, + "loss": 0.0239, + "step": 5297 + }, + { + "epoch": 3.78, + "grad_norm": 1.8028194196876857, + "learning_rate": 7.801012140634167e-08, + "loss": 0.02, + "step": 5298 + }, + { + "epoch": 3.78, + "grad_norm": 1.8809847137659024, + "learning_rate": 7.750240361415362e-08, + "loss": 0.0178, + "step": 5299 + }, + { + "epoch": 3.78, + "grad_norm": 4.070256377359856, + "learning_rate": 7.69963305199084e-08, + "loss": 0.0211, + "step": 5300 + }, + { + "epoch": 3.78, + "grad_norm": 4.871741592906539, + "learning_rate": 7.64919022926941e-08, + "loss": 0.0213, + "step": 5301 + }, + { + "epoch": 3.78, + "grad_norm": 2.1231914716571154, + "learning_rate": 7.598911910105033e-08, + "loss": 0.0186, + "step": 5302 + }, + { + "epoch": 3.79, + "grad_norm": 2.0372035432976916, + "learning_rate": 7.548798111296552e-08, + "loss": 0.0222, + "step": 5303 + }, + { + "epoch": 3.79, + "grad_norm": 3.43083865516561, + "learning_rate": 7.498848849588015e-08, + "loss": 0.0245, + "step": 5304 + }, + { + "epoch": 3.79, + "grad_norm": 4.2401607366865335, + "learning_rate": 7.449064141668238e-08, + "loss": 0.0262, + "step": 5305 + }, + { + "epoch": 3.79, + "grad_norm": 2.638289900596031, + "learning_rate": 7.399444004171364e-08, + "loss": 0.0216, + "step": 5306 + }, + { + "epoch": 3.79, + "grad_norm": 5.203760317859933, + "learning_rate": 7.349988453676349e-08, + "loss": 0.0264, + "step": 5307 + }, + { + "epoch": 3.79, + "grad_norm": 1.1903840699355497, + "learning_rate": 7.300697506707254e-08, + "loss": 0.0174, + "step": 5308 + }, + { + "epoch": 3.79, + "grad_norm": 2.386179668964028, + "learning_rate": 7.251571179732963e-08, + "loss": 0.0213, + "step": 5309 + }, + { + "epoch": 3.79, + "grad_norm": 5.227249096322965, + "learning_rate": 7.202609489167734e-08, + "loss": 0.0252, + "step": 5310 + }, + { + "epoch": 3.79, + "grad_norm": 1.9320796097209536, + "learning_rate": 7.153812451370312e-08, + "loss": 0.0148, + "step": 5311 + }, + { + "epoch": 3.79, + "grad_norm": 3.182049411942898, + "learning_rate": 7.10518008264488e-08, + "loss": 0.0228, + "step": 5312 + }, + { + "epoch": 3.79, + "grad_norm": 3.3538825883283443, + "learning_rate": 7.056712399240274e-08, + "loss": 0.0275, + "step": 5313 + }, + { + "epoch": 3.79, + "grad_norm": 2.1138391051762473, + "learning_rate": 7.008409417350648e-08, + "loss": 0.0259, + "step": 5314 + }, + { + "epoch": 3.79, + "grad_norm": 4.7881945597014255, + "learning_rate": 6.960271153114706e-08, + "loss": 0.0272, + "step": 5315 + }, + { + "epoch": 3.79, + "grad_norm": 1.3152877442321909, + "learning_rate": 6.912297622616526e-08, + "loss": 0.0118, + "step": 5316 + }, + { + "epoch": 3.8, + "grad_norm": 2.201245036810093, + "learning_rate": 6.864488841884786e-08, + "loss": 0.0182, + "step": 5317 + }, + { + "epoch": 3.8, + "grad_norm": 1.7396078183985866, + "learning_rate": 6.816844826893431e-08, + "loss": 0.0211, + "step": 5318 + }, + { + "epoch": 3.8, + "grad_norm": 6.429755754943171, + "learning_rate": 6.769365593561117e-08, + "loss": 0.0239, + "step": 5319 + }, + { + "epoch": 3.8, + "grad_norm": 2.4797877328826865, + "learning_rate": 6.722051157751597e-08, + "loss": 0.0221, + "step": 5320 + }, + { + "epoch": 3.8, + "grad_norm": 4.190112748365354, + "learning_rate": 6.674901535273448e-08, + "loss": 0.0164, + "step": 5321 + }, + { + "epoch": 3.8, + "grad_norm": 3.4184760808635124, + "learning_rate": 6.627916741880291e-08, + "loss": 0.0369, + "step": 5322 + }, + { + "epoch": 3.8, + "grad_norm": 2.141574310849085, + "learning_rate": 6.581096793270625e-08, + "loss": 0.0137, + "step": 5323 + }, + { + "epoch": 3.8, + "grad_norm": 2.740988732465579, + "learning_rate": 6.534441705087768e-08, + "loss": 0.0214, + "step": 5324 + }, + { + "epoch": 3.8, + "grad_norm": 3.057761817664433, + "learning_rate": 6.487951492920141e-08, + "loss": 0.0271, + "step": 5325 + }, + { + "epoch": 3.8, + "grad_norm": 4.367102135231493, + "learning_rate": 6.441626172300986e-08, + "loss": 0.0225, + "step": 5326 + }, + { + "epoch": 3.8, + "grad_norm": 5.584248486877098, + "learning_rate": 6.395465758708419e-08, + "loss": 0.018, + "step": 5327 + }, + { + "epoch": 3.8, + "grad_norm": 5.53656132887622, + "learning_rate": 6.349470267565549e-08, + "loss": 0.0262, + "step": 5328 + }, + { + "epoch": 3.8, + "grad_norm": 2.128469356840069, + "learning_rate": 6.303639714240196e-08, + "loss": 0.0127, + "step": 5329 + }, + { + "epoch": 3.8, + "grad_norm": 4.588339566836977, + "learning_rate": 6.257974114045385e-08, + "loss": 0.0219, + "step": 5330 + }, + { + "epoch": 3.81, + "grad_norm": 1.3886881130197628, + "learning_rate": 6.212473482238635e-08, + "loss": 0.0196, + "step": 5331 + }, + { + "epoch": 3.81, + "grad_norm": 11.056473846052311, + "learning_rate": 6.167137834022785e-08, + "loss": 0.0398, + "step": 5332 + }, + { + "epoch": 3.81, + "grad_norm": 1.7596324104793302, + "learning_rate": 6.121967184545107e-08, + "loss": 0.0129, + "step": 5333 + }, + { + "epoch": 3.81, + "grad_norm": 1.6553378546687454, + "learning_rate": 6.076961548898086e-08, + "loss": 0.0172, + "step": 5334 + }, + { + "epoch": 3.81, + "grad_norm": 3.0913951303287965, + "learning_rate": 6.032120942118858e-08, + "loss": 0.0193, + "step": 5335 + }, + { + "epoch": 3.81, + "grad_norm": 3.909975477030834, + "learning_rate": 5.98744537918955e-08, + "loss": 0.0185, + "step": 5336 + }, + { + "epoch": 3.81, + "grad_norm": 1.5964964171735954, + "learning_rate": 5.9429348750371097e-08, + "loss": 0.02, + "step": 5337 + }, + { + "epoch": 3.81, + "grad_norm": 3.4138357917288413, + "learning_rate": 5.898589444533254e-08, + "loss": 0.0198, + "step": 5338 + }, + { + "epoch": 3.81, + "grad_norm": 4.010010339656211, + "learning_rate": 5.85440910249474e-08, + "loss": 0.0468, + "step": 5339 + }, + { + "epoch": 3.81, + "grad_norm": 2.884838335613086, + "learning_rate": 5.810393863682873e-08, + "loss": 0.0236, + "step": 5340 + }, + { + "epoch": 3.81, + "grad_norm": 2.5632337910539778, + "learning_rate": 5.7665437428041096e-08, + "loss": 0.0254, + "step": 5341 + }, + { + "epoch": 3.81, + "grad_norm": 6.461929942388153, + "learning_rate": 5.722858754509564e-08, + "loss": 0.0225, + "step": 5342 + }, + { + "epoch": 3.81, + "grad_norm": 2.358122356416235, + "learning_rate": 5.679338913395116e-08, + "loss": 0.0181, + "step": 5343 + }, + { + "epoch": 3.81, + "grad_norm": 7.677224673110656, + "learning_rate": 5.6359842340016904e-08, + "loss": 0.0262, + "step": 5344 + }, + { + "epoch": 3.82, + "grad_norm": 2.168935169589492, + "learning_rate": 5.5927947308147545e-08, + "loss": 0.0177, + "step": 5345 + }, + { + "epoch": 3.82, + "grad_norm": 3.466736981204486, + "learning_rate": 5.549770418264766e-08, + "loss": 0.0177, + "step": 5346 + }, + { + "epoch": 3.82, + "grad_norm": 3.4138530734232755, + "learning_rate": 5.5069113107270034e-08, + "loss": 0.0244, + "step": 5347 + }, + { + "epoch": 3.82, + "grad_norm": 2.065047272470145, + "learning_rate": 5.464217422521456e-08, + "loss": 0.0221, + "step": 5348 + }, + { + "epoch": 3.82, + "grad_norm": 2.8946037809002765, + "learning_rate": 5.421688767912936e-08, + "loss": 0.0186, + "step": 5349 + }, + { + "epoch": 3.82, + "grad_norm": 4.394964370403741, + "learning_rate": 5.3793253611110206e-08, + "loss": 0.0197, + "step": 5350 + }, + { + "epoch": 3.82, + "grad_norm": 2.608996497277368, + "learning_rate": 5.3371272162702214e-08, + "loss": 0.0177, + "step": 5351 + }, + { + "epoch": 3.82, + "grad_norm": 1.8025304112213554, + "learning_rate": 5.295094347489593e-08, + "loss": 0.0207, + "step": 5352 + }, + { + "epoch": 3.82, + "grad_norm": 2.5613629695229787, + "learning_rate": 5.253226768813235e-08, + "loss": 0.0164, + "step": 5353 + }, + { + "epoch": 3.82, + "grad_norm": 4.54003532307264, + "learning_rate": 5.211524494229736e-08, + "loss": 0.023, + "step": 5354 + }, + { + "epoch": 3.82, + "grad_norm": 1.904588615161526, + "learning_rate": 5.169987537672727e-08, + "loss": 0.0179, + "step": 5355 + }, + { + "epoch": 3.82, + "grad_norm": 1.9393310412596467, + "learning_rate": 5.128615913020385e-08, + "loss": 0.0222, + "step": 5356 + }, + { + "epoch": 3.82, + "grad_norm": 3.359468134735287, + "learning_rate": 5.087409634095819e-08, + "loss": 0.0238, + "step": 5357 + }, + { + "epoch": 3.82, + "grad_norm": 1.1449746630362878, + "learning_rate": 5.046368714666683e-08, + "loss": 0.0117, + "step": 5358 + }, + { + "epoch": 3.83, + "grad_norm": 4.222518213830071, + "learning_rate": 5.0054931684457296e-08, + "loss": 0.0383, + "step": 5359 + }, + { + "epoch": 3.83, + "grad_norm": 3.743647719670252, + "learning_rate": 4.964783009090035e-08, + "loss": 0.0293, + "step": 5360 + }, + { + "epoch": 3.83, + "grad_norm": 3.833891477000365, + "learning_rate": 4.9242382502017185e-08, + "loss": 0.0229, + "step": 5361 + }, + { + "epoch": 3.83, + "grad_norm": 4.168894379967887, + "learning_rate": 4.883858905327499e-08, + "loss": 0.019, + "step": 5362 + }, + { + "epoch": 3.83, + "grad_norm": 1.913254240680341, + "learning_rate": 4.843644987958862e-08, + "loss": 0.0188, + "step": 5363 + }, + { + "epoch": 3.83, + "grad_norm": 2.110753327008818, + "learning_rate": 4.8035965115320604e-08, + "loss": 0.0197, + "step": 5364 + }, + { + "epoch": 3.83, + "grad_norm": 8.56246472776577, + "learning_rate": 4.763713489428001e-08, + "loss": 0.0337, + "step": 5365 + }, + { + "epoch": 3.83, + "grad_norm": 3.119553272465091, + "learning_rate": 4.723995934972414e-08, + "loss": 0.0235, + "step": 5366 + }, + { + "epoch": 3.83, + "grad_norm": 3.8853086683100098, + "learning_rate": 4.684443861435572e-08, + "loss": 0.0244, + "step": 5367 + }, + { + "epoch": 3.83, + "grad_norm": 4.4695595709236855, + "learning_rate": 4.6450572820325727e-08, + "loss": 0.0252, + "step": 5368 + }, + { + "epoch": 3.83, + "grad_norm": 2.7874075602983672, + "learning_rate": 4.605836209923331e-08, + "loss": 0.0192, + "step": 5369 + }, + { + "epoch": 3.83, + "grad_norm": 2.203531606811125, + "learning_rate": 4.566780658212144e-08, + "loss": 0.0206, + "step": 5370 + }, + { + "epoch": 3.83, + "grad_norm": 3.160499631682135, + "learning_rate": 4.5278906399483516e-08, + "loss": 0.0169, + "step": 5371 + }, + { + "epoch": 3.83, + "grad_norm": 4.6089108849374805, + "learning_rate": 4.489166168125725e-08, + "loss": 0.0271, + "step": 5372 + }, + { + "epoch": 3.84, + "grad_norm": 4.866432991585118, + "learning_rate": 4.4506072556829704e-08, + "loss": 0.0189, + "step": 5373 + }, + { + "epoch": 3.84, + "grad_norm": 3.197986454347981, + "learning_rate": 4.4122139155031717e-08, + "loss": 0.0219, + "step": 5374 + }, + { + "epoch": 3.84, + "grad_norm": 4.3140025616477145, + "learning_rate": 4.373986160414345e-08, + "loss": 0.04, + "step": 5375 + }, + { + "epoch": 3.84, + "grad_norm": 0.9589388375969385, + "learning_rate": 4.335924003189107e-08, + "loss": 0.0173, + "step": 5376 + }, + { + "epoch": 3.84, + "grad_norm": 2.9736420672090085, + "learning_rate": 4.298027456544674e-08, + "loss": 0.0215, + "step": 5377 + }, + { + "epoch": 3.84, + "grad_norm": 3.0076067234022306, + "learning_rate": 4.260296533143027e-08, + "loss": 0.0311, + "step": 5378 + }, + { + "epoch": 3.84, + "grad_norm": 2.9163352666589164, + "learning_rate": 4.22273124559075e-08, + "loss": 0.0179, + "step": 5379 + }, + { + "epoch": 3.84, + "grad_norm": 5.350361549448945, + "learning_rate": 4.185331606439136e-08, + "loss": 0.0296, + "step": 5380 + }, + { + "epoch": 3.84, + "grad_norm": 3.5143808177875666, + "learning_rate": 4.148097628184078e-08, + "loss": 0.0268, + "step": 5381 + }, + { + "epoch": 3.84, + "grad_norm": 1.6488078338220415, + "learning_rate": 4.111029323266125e-08, + "loss": 0.0212, + "step": 5382 + }, + { + "epoch": 3.84, + "grad_norm": 2.9180480292411337, + "learning_rate": 4.07412670407048e-08, + "loss": 0.0167, + "step": 5383 + }, + { + "epoch": 3.84, + "grad_norm": 3.426743137102039, + "learning_rate": 4.037389782927059e-08, + "loss": 0.0261, + "step": 5384 + }, + { + "epoch": 3.84, + "grad_norm": 2.626920868414007, + "learning_rate": 4.000818572110265e-08, + "loss": 0.0187, + "step": 5385 + }, + { + "epoch": 3.84, + "grad_norm": 4.358340512942245, + "learning_rate": 3.964413083839269e-08, + "loss": 0.024, + "step": 5386 + }, + { + "epoch": 3.85, + "grad_norm": 2.5771043456304734, + "learning_rate": 3.9281733302778404e-08, + "loss": 0.0135, + "step": 5387 + }, + { + "epoch": 3.85, + "grad_norm": 3.7108362612772945, + "learning_rate": 3.892099323534293e-08, + "loss": 0.0251, + "step": 5388 + }, + { + "epoch": 3.85, + "grad_norm": 3.2641134315951765, + "learning_rate": 3.856191075661708e-08, + "loss": 0.0207, + "step": 5389 + }, + { + "epoch": 3.85, + "grad_norm": 4.7954663584761, + "learning_rate": 3.8204485986576e-08, + "loss": 0.0233, + "step": 5390 + }, + { + "epoch": 3.85, + "grad_norm": 3.0039195291483587, + "learning_rate": 3.784871904464249e-08, + "loss": 0.0214, + "step": 5391 + }, + { + "epoch": 3.85, + "grad_norm": 4.781319594897229, + "learning_rate": 3.7494610049684796e-08, + "loss": 0.0202, + "step": 5392 + }, + { + "epoch": 3.85, + "grad_norm": 2.2181342509203716, + "learning_rate": 3.714215912001773e-08, + "loss": 0.0237, + "step": 5393 + }, + { + "epoch": 3.85, + "grad_norm": 1.5307656638034723, + "learning_rate": 3.6791366373400974e-08, + "loss": 0.016, + "step": 5394 + }, + { + "epoch": 3.85, + "grad_norm": 2.9980957129316477, + "learning_rate": 3.6442231927041324e-08, + "loss": 0.0209, + "step": 5395 + }, + { + "epoch": 3.85, + "grad_norm": 2.614325470570068, + "learning_rate": 3.609475589759104e-08, + "loss": 0.0195, + "step": 5396 + }, + { + "epoch": 3.85, + "grad_norm": 1.2342786417833946, + "learning_rate": 3.574893840114835e-08, + "loss": 0.0195, + "step": 5397 + }, + { + "epoch": 3.85, + "grad_norm": 3.13513680493031, + "learning_rate": 3.5404779553257494e-08, + "loss": 0.0198, + "step": 5398 + }, + { + "epoch": 3.85, + "grad_norm": 15.078842855499811, + "learning_rate": 3.506227946890761e-08, + "loss": 0.042, + "step": 5399 + }, + { + "epoch": 3.85, + "grad_norm": 4.968249293917528, + "learning_rate": 3.4721438262534935e-08, + "loss": 0.0286, + "step": 5400 + }, + { + "epoch": 3.86, + "grad_norm": 2.300715233738611, + "learning_rate": 3.438225604802115e-08, + "loss": 0.0247, + "step": 5401 + }, + { + "epoch": 3.86, + "grad_norm": 3.2215687096264367, + "learning_rate": 3.404473293869226e-08, + "loss": 0.0225, + "step": 5402 + }, + { + "epoch": 3.86, + "grad_norm": 1.3402505655817432, + "learning_rate": 3.370886904732196e-08, + "loss": 0.0137, + "step": 5403 + }, + { + "epoch": 3.86, + "grad_norm": 1.0509369564653657, + "learning_rate": 3.33746644861288e-08, + "loss": 0.0152, + "step": 5404 + }, + { + "epoch": 3.86, + "grad_norm": 1.5200283370650576, + "learning_rate": 3.30421193667757e-08, + "loss": 0.0172, + "step": 5405 + }, + { + "epoch": 3.86, + "grad_norm": 6.978782111965761, + "learning_rate": 3.271123380037322e-08, + "loss": 0.0288, + "step": 5406 + }, + { + "epoch": 3.86, + "grad_norm": 4.070184829965542, + "learning_rate": 3.2382007897475695e-08, + "loss": 0.0289, + "step": 5407 + }, + { + "epoch": 3.86, + "grad_norm": 4.409008744759003, + "learning_rate": 3.2054441768083477e-08, + "loss": 0.0241, + "step": 5408 + }, + { + "epoch": 3.86, + "grad_norm": 2.2078652879223797, + "learning_rate": 3.1728535521643454e-08, + "loss": 0.0198, + "step": 5409 + }, + { + "epoch": 3.86, + "grad_norm": 2.187261175257129, + "learning_rate": 3.1404289267046305e-08, + "loss": 0.0185, + "step": 5410 + }, + { + "epoch": 3.86, + "grad_norm": 1.8104547975037084, + "learning_rate": 3.1081703112628146e-08, + "loss": 0.0207, + "step": 5411 + }, + { + "epoch": 3.86, + "grad_norm": 2.108463142618941, + "learning_rate": 3.0760777166172206e-08, + "loss": 0.0204, + "step": 5412 + }, + { + "epoch": 3.86, + "grad_norm": 4.41782265356589, + "learning_rate": 3.0441511534904934e-08, + "loss": 0.0266, + "step": 5413 + }, + { + "epoch": 3.86, + "grad_norm": 2.1563829583102483, + "learning_rate": 3.012390632549933e-08, + "loss": 0.0298, + "step": 5414 + }, + { + "epoch": 3.87, + "grad_norm": 7.78806885480321, + "learning_rate": 2.9807961644073294e-08, + "loss": 0.026, + "step": 5415 + }, + { + "epoch": 3.87, + "grad_norm": 3.120989169250971, + "learning_rate": 2.9493677596189595e-08, + "loss": 0.0237, + "step": 5416 + }, + { + "epoch": 3.87, + "grad_norm": 4.358905498727036, + "learning_rate": 2.9181054286855916e-08, + "loss": 0.0214, + "step": 5417 + }, + { + "epoch": 3.87, + "grad_norm": 4.6477516202899, + "learning_rate": 2.887009182052647e-08, + "loss": 0.0307, + "step": 5418 + }, + { + "epoch": 3.87, + "grad_norm": 6.1082992817988115, + "learning_rate": 2.8560790301098705e-08, + "loss": 0.0244, + "step": 5419 + }, + { + "epoch": 3.87, + "grad_norm": 1.275610383748102, + "learning_rate": 2.825314983191718e-08, + "loss": 0.0116, + "step": 5420 + }, + { + "epoch": 3.87, + "grad_norm": 8.547679089993268, + "learning_rate": 2.7947170515768562e-08, + "loss": 0.0333, + "step": 5421 + }, + { + "epoch": 3.87, + "grad_norm": 2.957300537052848, + "learning_rate": 2.7642852454887736e-08, + "loss": 0.0221, + "step": 5422 + }, + { + "epoch": 3.87, + "grad_norm": 2.033116201364948, + "learning_rate": 2.7340195750952813e-08, + "loss": 0.0148, + "step": 5423 + }, + { + "epoch": 3.87, + "grad_norm": 5.438523966512901, + "learning_rate": 2.703920050508624e-08, + "loss": 0.0304, + "step": 5424 + }, + { + "epoch": 3.87, + "grad_norm": 5.690418670617803, + "learning_rate": 2.673986681785645e-08, + "loss": 0.0202, + "step": 5425 + }, + { + "epoch": 3.87, + "grad_norm": 6.5518385685801555, + "learning_rate": 2.6442194789277342e-08, + "loss": 0.0322, + "step": 5426 + }, + { + "epoch": 3.87, + "grad_norm": 1.2401661063067848, + "learning_rate": 2.6146184518804908e-08, + "loss": 0.014, + "step": 5427 + }, + { + "epoch": 3.87, + "grad_norm": 3.209009421866879, + "learning_rate": 2.5851836105343363e-08, + "loss": 0.0168, + "step": 5428 + }, + { + "epoch": 3.88, + "grad_norm": 2.2479236401385623, + "learning_rate": 2.555914964723849e-08, + "loss": 0.0192, + "step": 5429 + }, + { + "epoch": 3.88, + "grad_norm": 3.664875088278122, + "learning_rate": 2.5268125242283724e-08, + "loss": 0.0293, + "step": 5430 + }, + { + "epoch": 3.88, + "grad_norm": 4.403075650273632, + "learning_rate": 2.4978762987714067e-08, + "loss": 0.0368, + "step": 5431 + }, + { + "epoch": 3.88, + "grad_norm": 6.9310695166002025, + "learning_rate": 2.469106298021273e-08, + "loss": 0.0316, + "step": 5432 + }, + { + "epoch": 3.88, + "grad_norm": 2.423140496516659, + "learning_rate": 2.4405025315904495e-08, + "loss": 0.024, + "step": 5433 + }, + { + "epoch": 3.88, + "grad_norm": 4.022916173686561, + "learning_rate": 2.412065009036013e-08, + "loss": 0.0212, + "step": 5434 + }, + { + "epoch": 3.88, + "grad_norm": 3.837813048775894, + "learning_rate": 2.3837937398594747e-08, + "loss": 0.0298, + "step": 5435 + }, + { + "epoch": 3.88, + "grad_norm": 2.1748941776797377, + "learning_rate": 2.3556887335067223e-08, + "loss": 0.0262, + "step": 5436 + }, + { + "epoch": 3.88, + "grad_norm": 6.561398321137626, + "learning_rate": 2.3277499993682452e-08, + "loss": 0.036, + "step": 5437 + }, + { + "epoch": 3.88, + "grad_norm": 4.357304631831137, + "learning_rate": 2.2999775467788532e-08, + "loss": 0.0223, + "step": 5438 + }, + { + "epoch": 3.88, + "grad_norm": 3.486845904406258, + "learning_rate": 2.272371385017902e-08, + "loss": 0.0272, + "step": 5439 + }, + { + "epoch": 3.88, + "grad_norm": 4.662953700822703, + "learning_rate": 2.244931523309013e-08, + "loss": 0.0278, + "step": 5440 + }, + { + "epoch": 3.88, + "grad_norm": 2.054534031716196, + "learning_rate": 2.2176579708204636e-08, + "loss": 0.0244, + "step": 5441 + }, + { + "epoch": 3.88, + "grad_norm": 3.9539255224153704, + "learning_rate": 2.190550736664798e-08, + "loss": 0.0359, + "step": 5442 + }, + { + "epoch": 3.89, + "grad_norm": 5.221943937345425, + "learning_rate": 2.163609829898994e-08, + "loss": 0.0316, + "step": 5443 + }, + { + "epoch": 3.89, + "grad_norm": 2.6919999185285426, + "learning_rate": 2.136835259524628e-08, + "loss": 0.0281, + "step": 5444 + }, + { + "epoch": 3.89, + "grad_norm": 3.252062666573824, + "learning_rate": 2.1102270344874887e-08, + "loss": 0.0428, + "step": 5445 + }, + { + "epoch": 3.89, + "grad_norm": 2.2687059601810353, + "learning_rate": 2.083785163677965e-08, + "loss": 0.0194, + "step": 5446 + }, + { + "epoch": 3.89, + "grad_norm": 3.2991800662002784, + "learning_rate": 2.0575096559306564e-08, + "loss": 0.0274, + "step": 5447 + }, + { + "epoch": 3.89, + "grad_norm": 3.6298689814703224, + "learning_rate": 2.0314005200248178e-08, + "loss": 0.0195, + "step": 5448 + }, + { + "epoch": 3.89, + "grad_norm": 3.218939239822081, + "learning_rate": 2.0054577646839156e-08, + "loss": 0.0142, + "step": 5449 + }, + { + "epoch": 3.89, + "grad_norm": 2.868578007944478, + "learning_rate": 1.979681398575961e-08, + "loss": 0.0172, + "step": 5450 + }, + { + "epoch": 3.89, + "grad_norm": 4.0210996974880056, + "learning_rate": 1.954071430313287e-08, + "loss": 0.0219, + "step": 5451 + }, + { + "epoch": 3.89, + "grad_norm": 2.3082378385282873, + "learning_rate": 1.9286278684526593e-08, + "loss": 0.0243, + "step": 5452 + }, + { + "epoch": 3.89, + "grad_norm": 3.362288093740942, + "learning_rate": 1.9033507214952784e-08, + "loss": 0.017, + "step": 5453 + }, + { + "epoch": 3.89, + "grad_norm": 4.605772008765052, + "learning_rate": 1.878239997886666e-08, + "loss": 0.023, + "step": 5454 + }, + { + "epoch": 3.89, + "grad_norm": 5.6829958574551505, + "learning_rate": 1.853295706016778e-08, + "loss": 0.0274, + "step": 5455 + }, + { + "epoch": 3.89, + "grad_norm": 2.357378781581144, + "learning_rate": 1.8285178542200022e-08, + "loss": 0.0222, + "step": 5456 + }, + { + "epoch": 3.9, + "grad_norm": 2.7214489224447185, + "learning_rate": 1.8039064507750503e-08, + "loss": 0.0131, + "step": 5457 + }, + { + "epoch": 3.9, + "grad_norm": 2.8725088439600706, + "learning_rate": 1.7794615039050665e-08, + "loss": 0.0144, + "step": 5458 + }, + { + "epoch": 3.9, + "grad_norm": 4.16619545847278, + "learning_rate": 1.7551830217775734e-08, + "loss": 0.0164, + "step": 5459 + }, + { + "epoch": 3.9, + "grad_norm": 2.0022748959715058, + "learning_rate": 1.7310710125044707e-08, + "loss": 0.0226, + "step": 5460 + }, + { + "epoch": 3.9, + "grad_norm": 2.3611659620346592, + "learning_rate": 1.7071254841419805e-08, + "loss": 0.0202, + "step": 5461 + }, + { + "epoch": 3.9, + "grad_norm": 3.864100176432943, + "learning_rate": 1.6833464446907588e-08, + "loss": 0.0208, + "step": 5462 + }, + { + "epoch": 3.9, + "grad_norm": 2.0424463543342584, + "learning_rate": 1.6597339020958393e-08, + "loss": 0.0192, + "step": 5463 + }, + { + "epoch": 3.9, + "grad_norm": 2.3858731170790857, + "learning_rate": 1.6362878642466328e-08, + "loss": 0.0205, + "step": 5464 + }, + { + "epoch": 3.9, + "grad_norm": 2.110909016473477, + "learning_rate": 1.6130083389768735e-08, + "loss": 0.0182, + "step": 5465 + }, + { + "epoch": 3.9, + "grad_norm": 3.7439060845341365, + "learning_rate": 1.5898953340646728e-08, + "loss": 0.029, + "step": 5466 + }, + { + "epoch": 3.9, + "grad_norm": 4.647959641308654, + "learning_rate": 1.5669488572325197e-08, + "loss": 0.0206, + "step": 5467 + }, + { + "epoch": 3.9, + "grad_norm": 2.5611865882124523, + "learning_rate": 1.5441689161472816e-08, + "loss": 0.0283, + "step": 5468 + }, + { + "epoch": 3.9, + "grad_norm": 3.131129293544498, + "learning_rate": 1.521555518420148e-08, + "loss": 0.0255, + "step": 5469 + }, + { + "epoch": 3.9, + "grad_norm": 2.906211714511769, + "learning_rate": 1.499108671606686e-08, + "loss": 0.0222, + "step": 5470 + }, + { + "epoch": 3.91, + "grad_norm": 1.3921197046262057, + "learning_rate": 1.4768283832067853e-08, + "loss": 0.0206, + "step": 5471 + }, + { + "epoch": 3.91, + "grad_norm": 3.859282967632713, + "learning_rate": 1.4547146606646578e-08, + "loss": 0.0236, + "step": 5472 + }, + { + "epoch": 3.91, + "grad_norm": 5.407318739007233, + "learning_rate": 1.4327675113690598e-08, + "loss": 0.0219, + "step": 5473 + }, + { + "epoch": 3.91, + "grad_norm": 4.811797317082379, + "learning_rate": 1.4109869426527368e-08, + "loss": 0.0205, + "step": 5474 + }, + { + "epoch": 3.91, + "grad_norm": 2.252930648915975, + "learning_rate": 1.3893729617931451e-08, + "loss": 0.0241, + "step": 5475 + }, + { + "epoch": 3.91, + "grad_norm": 1.4816278252113144, + "learning_rate": 1.3679255760118415e-08, + "loss": 0.0227, + "step": 5476 + }, + { + "epoch": 3.91, + "grad_norm": 6.986746169818905, + "learning_rate": 1.3466447924748716e-08, + "loss": 0.0249, + "step": 5477 + }, + { + "epoch": 3.91, + "grad_norm": 2.1997925480493876, + "learning_rate": 1.3255306182924365e-08, + "loss": 0.0179, + "step": 5478 + }, + { + "epoch": 3.91, + "grad_norm": 4.030533592119653, + "learning_rate": 1.3045830605192266e-08, + "loss": 0.0458, + "step": 5479 + }, + { + "epoch": 3.91, + "grad_norm": 3.088240734799316, + "learning_rate": 1.2838021261541988e-08, + "loss": 0.019, + "step": 5480 + }, + { + "epoch": 3.91, + "grad_norm": 3.0526266172301084, + "learning_rate": 1.263187822140688e-08, + "loss": 0.0228, + "step": 5481 + }, + { + "epoch": 3.91, + "grad_norm": 2.888270689172121, + "learning_rate": 1.2427401553662955e-08, + "loss": 0.0192, + "step": 5482 + }, + { + "epoch": 3.91, + "grad_norm": 2.4369666827080487, + "learning_rate": 1.2224591326628898e-08, + "loss": 0.015, + "step": 5483 + }, + { + "epoch": 3.91, + "grad_norm": 2.2435601031926296, + "learning_rate": 1.2023447608068283e-08, + "loss": 0.0208, + "step": 5484 + }, + { + "epoch": 3.92, + "grad_norm": 5.144528764350548, + "learning_rate": 1.182397046518735e-08, + "loss": 0.0224, + "step": 5485 + }, + { + "epoch": 3.92, + "grad_norm": 2.2211905772296, + "learning_rate": 1.1626159964633899e-08, + "loss": 0.0147, + "step": 5486 + }, + { + "epoch": 3.92, + "grad_norm": 2.2902390229673197, + "learning_rate": 1.1430016172501169e-08, + "loss": 0.0149, + "step": 5487 + }, + { + "epoch": 3.92, + "grad_norm": 4.665203417246369, + "learning_rate": 1.1235539154323405e-08, + "loss": 0.0213, + "step": 5488 + }, + { + "epoch": 3.92, + "grad_norm": 6.05200035827132, + "learning_rate": 1.1042728975079741e-08, + "loss": 0.0254, + "step": 5489 + }, + { + "epoch": 3.92, + "grad_norm": 6.912668006593784, + "learning_rate": 1.0851585699191425e-08, + "loss": 0.0291, + "step": 5490 + }, + { + "epoch": 3.92, + "grad_norm": 2.8237061465451463, + "learning_rate": 1.0662109390522924e-08, + "loss": 0.0197, + "step": 5491 + }, + { + "epoch": 3.92, + "grad_norm": 5.594205018270895, + "learning_rate": 1.047430011238193e-08, + "loss": 0.0262, + "step": 5492 + }, + { + "epoch": 3.92, + "grad_norm": 2.5381484863031885, + "learning_rate": 1.028815792751936e-08, + "loss": 0.0226, + "step": 5493 + }, + { + "epoch": 3.92, + "grad_norm": 3.127384381139182, + "learning_rate": 1.0103682898128241e-08, + "loss": 0.0184, + "step": 5494 + }, + { + "epoch": 3.92, + "grad_norm": 3.2830582442340375, + "learning_rate": 9.920875085845383e-09, + "loss": 0.0213, + "step": 5495 + }, + { + "epoch": 3.92, + "grad_norm": 2.7865544882843336, + "learning_rate": 9.739734551749703e-09, + "loss": 0.0185, + "step": 5496 + }, + { + "epoch": 3.92, + "grad_norm": 6.719662127295909, + "learning_rate": 9.560261356364452e-09, + "loss": 0.0387, + "step": 5497 + }, + { + "epoch": 3.92, + "grad_norm": 3.0048549069559707, + "learning_rate": 9.382455559654446e-09, + "loss": 0.026, + "step": 5498 + }, + { + "epoch": 3.93, + "grad_norm": 2.031697797024053, + "learning_rate": 9.206317221027717e-09, + "loss": 0.023, + "step": 5499 + }, + { + "epoch": 3.93, + "grad_norm": 3.903823119328278, + "learning_rate": 9.031846399336075e-09, + "loss": 0.0327, + "step": 5500 + }, + { + "epoch": 3.93, + "eval_avg_AUC": 0.8371702589944845, + "eval_avg_Accuracy": 0.7440318302387268, + "eval_avg_Accuracy-right": 0.8942872049041346, + "eval_avg_Accuracy-wrong": 0.48203320445758474, + "eval_avg_Num questions with both labels": 523, + "eval_avg_Question-wise AUC": 0.7103294303171083, + "eval_last_AUC": 0.8319023333640545, + "eval_last_Accuracy": 0.7803796419098143, + "eval_last_Accuracy-right": 0.8463545063258119, + "eval_last_Accuracy-wrong": 0.6653400045485558, + "eval_last_Num questions with both labels": 523, + "eval_last_Question-wise AUC": 0.7067685458768546, + "eval_max_AUC": 0.7867669380385187, + "eval_max_Accuracy": 0.6470490716180372, + "eval_max_Accuracy-right": 0.9887178818312248, + "eval_max_Accuracy-wrong": 0.05128496702297021, + "eval_max_Num questions with both labels": 523, + "eval_max_Question-wise AUC": 0.6553866744173231, + "eval_min_AUC": 0.8460472659735542, + "eval_min_Accuracy": 0.7755305039787799, + "eval_min_Accuracy-right": 0.7935307160558237, + "eval_min_Accuracy-wrong": 0.7441437343643393, + "eval_min_Num questions with both labels": 523, + "eval_min_Question-wise AUC": 0.711460753125292, + "eval_prod_AUC": 0.8437307587767714, + "eval_prod_Accuracy": 0.7427470159151194, + "eval_prod_Accuracy-right": 0.6701447763140733, + "eval_prod_Accuracy-wrong": 0.869342733682056, + "eval_prod_Num questions with both labels": 523, + "eval_prod_Question-wise AUC": 0.7105555644172705, + "eval_runtime": 247.3548, + "eval_samples_per_second": 97.544, + "eval_steps_per_second": 3.048, + "eval_sum_AUC": 0.7143932084431329, + "eval_sum_Accuracy": 0.6395474137931034, + "eval_sum_Accuracy-right": 0.996869701317334, + "eval_sum_Accuracy-wrong": 0.016488514896520354, + "eval_sum_Num questions with both labels": 523, + "eval_sum_Question-wise AUC": 0.6877217306105692, + "step": 5500 + }, + { + "epoch": 3.93, + "grad_norm": 3.3371950491903637, + "learning_rate": 8.859043152872892e-09, + "loss": 0.0194, + "step": 5501 + }, + { + "epoch": 3.93, + "grad_norm": 1.9530134838227862, + "learning_rate": 8.687907539375318e-09, + "loss": 0.0204, + "step": 5502 + }, + { + "epoch": 3.93, + "grad_norm": 1.9341448030769988, + "learning_rate": 8.518439616022057e-09, + "loss": 0.0219, + "step": 5503 + }, + { + "epoch": 3.93, + "grad_norm": 2.087075389938105, + "learning_rate": 8.350639439436703e-09, + "loss": 0.0242, + "step": 5504 + }, + { + "epoch": 3.93, + "grad_norm": 3.430222763743477, + "learning_rate": 8.184507065683855e-09, + "loss": 0.0174, + "step": 5505 + }, + { + "epoch": 3.93, + "grad_norm": 5.6908272831133395, + "learning_rate": 8.020042550271889e-09, + "loss": 0.02, + "step": 5506 + }, + { + "epoch": 3.93, + "grad_norm": 2.351915576486446, + "learning_rate": 7.857245948150183e-09, + "loss": 0.0186, + "step": 5507 + }, + { + "epoch": 3.93, + "grad_norm": 4.173408896062741, + "learning_rate": 7.696117313713559e-09, + "loss": 0.0271, + "step": 5508 + }, + { + "epoch": 3.93, + "grad_norm": 1.7164931573536757, + "learning_rate": 7.536656700797284e-09, + "loss": 0.0169, + "step": 5509 + }, + { + "epoch": 3.93, + "grad_norm": 2.99059556731782, + "learning_rate": 7.37886416268041e-09, + "loss": 0.0201, + "step": 5510 + }, + { + "epoch": 3.93, + "grad_norm": 3.3403066040208462, + "learning_rate": 7.222739752084096e-09, + "loss": 0.0251, + "step": 5511 + }, + { + "epoch": 3.93, + "grad_norm": 8.053052196324328, + "learning_rate": 7.068283521172725e-09, + "loss": 0.0359, + "step": 5512 + }, + { + "epoch": 3.94, + "grad_norm": 4.416499688532894, + "learning_rate": 6.915495521552795e-09, + "loss": 0.018, + "step": 5513 + }, + { + "epoch": 3.94, + "grad_norm": 4.387845883879492, + "learning_rate": 6.764375804274026e-09, + "loss": 0.0239, + "step": 5514 + }, + { + "epoch": 3.94, + "grad_norm": 5.03141411349809, + "learning_rate": 6.61492441982714e-09, + "loss": 0.0377, + "step": 5515 + }, + { + "epoch": 3.94, + "grad_norm": 3.757977750589518, + "learning_rate": 6.467141418147748e-09, + "loss": 0.0122, + "step": 5516 + }, + { + "epoch": 3.94, + "grad_norm": 2.001295426759492, + "learning_rate": 6.321026848613021e-09, + "loss": 0.0153, + "step": 5517 + }, + { + "epoch": 3.94, + "grad_norm": 3.9821414447648973, + "learning_rate": 6.176580760041684e-09, + "loss": 0.0194, + "step": 5518 + }, + { + "epoch": 3.94, + "grad_norm": 1.7433347366769623, + "learning_rate": 6.033803200696242e-09, + "loss": 0.018, + "step": 5519 + }, + { + "epoch": 3.94, + "grad_norm": 4.301461630452348, + "learning_rate": 5.892694218281869e-09, + "loss": 0.0371, + "step": 5520 + }, + { + "epoch": 3.94, + "grad_norm": 4.386923072523771, + "learning_rate": 5.753253859944741e-09, + "loss": 0.0165, + "step": 5521 + }, + { + "epoch": 3.94, + "grad_norm": 3.0793034972539677, + "learning_rate": 5.615482172275366e-09, + "loss": 0.0387, + "step": 5522 + }, + { + "epoch": 3.94, + "grad_norm": 6.778878857726523, + "learning_rate": 5.479379201305257e-09, + "loss": 0.0304, + "step": 5523 + }, + { + "epoch": 3.94, + "grad_norm": 4.863627634913964, + "learning_rate": 5.344944992509149e-09, + "loss": 0.0256, + "step": 5524 + }, + { + "epoch": 3.94, + "grad_norm": 2.509596156570167, + "learning_rate": 5.212179590803335e-09, + "loss": 0.0306, + "step": 5525 + }, + { + "epoch": 3.94, + "grad_norm": 2.796887352552083, + "learning_rate": 5.08108304054844e-09, + "loss": 0.0224, + "step": 5526 + }, + { + "epoch": 3.95, + "grad_norm": 4.106611953848664, + "learning_rate": 4.9516553855455395e-09, + "loss": 0.0308, + "step": 5527 + }, + { + "epoch": 3.95, + "grad_norm": 2.2897445226023216, + "learning_rate": 4.82389666903893e-09, + "loss": 0.0221, + "step": 5528 + }, + { + "epoch": 3.95, + "grad_norm": 2.7080187516169496, + "learning_rate": 4.697806933715021e-09, + "loss": 0.0269, + "step": 5529 + }, + { + "epoch": 3.95, + "grad_norm": 4.343144051787935, + "learning_rate": 4.573386221703446e-09, + "loss": 0.0267, + "step": 5530 + }, + { + "epoch": 3.95, + "grad_norm": 2.7596315781432397, + "learning_rate": 4.450634574574286e-09, + "loss": 0.0331, + "step": 5531 + }, + { + "epoch": 3.95, + "grad_norm": 4.324439345233048, + "learning_rate": 4.329552033341955e-09, + "loss": 0.0182, + "step": 5532 + }, + { + "epoch": 3.95, + "grad_norm": 2.8977509062535747, + "learning_rate": 4.210138638462424e-09, + "loss": 0.0135, + "step": 5533 + }, + { + "epoch": 3.95, + "grad_norm": 4.537391924016334, + "learning_rate": 4.0923944298337796e-09, + "loss": 0.0325, + "step": 5534 + }, + { + "epoch": 3.95, + "grad_norm": 5.661417948871261, + "learning_rate": 3.976319446795662e-09, + "loss": 0.0379, + "step": 5535 + }, + { + "epoch": 3.95, + "grad_norm": 2.8349607349929005, + "learning_rate": 3.8619137281326044e-09, + "loss": 0.0227, + "step": 5536 + }, + { + "epoch": 3.95, + "grad_norm": 5.200658373149944, + "learning_rate": 3.749177312068475e-09, + "loss": 0.0279, + "step": 5537 + }, + { + "epoch": 3.95, + "grad_norm": 1.8628376484778557, + "learning_rate": 3.63811023627092e-09, + "loss": 0.0193, + "step": 5538 + }, + { + "epoch": 3.95, + "grad_norm": 4.724184543652983, + "learning_rate": 3.528712537849144e-09, + "loss": 0.0267, + "step": 5539 + }, + { + "epoch": 3.95, + "grad_norm": 3.6749826001937764, + "learning_rate": 3.42098425335613e-09, + "loss": 0.0311, + "step": 5540 + }, + { + "epoch": 3.96, + "grad_norm": 4.909671343908744, + "learning_rate": 3.3149254187841985e-09, + "loss": 0.0288, + "step": 5541 + }, + { + "epoch": 3.96, + "grad_norm": 3.8677490201938785, + "learning_rate": 3.210536069571113e-09, + "loss": 0.024, + "step": 5542 + }, + { + "epoch": 3.96, + "grad_norm": 3.608231101955727, + "learning_rate": 3.1078162405939747e-09, + "loss": 0.0213, + "step": 5543 + }, + { + "epoch": 3.96, + "grad_norm": 3.272735238444045, + "learning_rate": 3.006765966174774e-09, + "loss": 0.0318, + "step": 5544 + }, + { + "epoch": 3.96, + "grad_norm": 3.1790589293641482, + "learning_rate": 2.907385280075392e-09, + "loss": 0.0182, + "step": 5545 + }, + { + "epoch": 3.96, + "grad_norm": 3.649186670073757, + "learning_rate": 2.80967421550038e-09, + "loss": 0.0286, + "step": 5546 + }, + { + "epoch": 3.96, + "grad_norm": 3.138795296256432, + "learning_rate": 2.7136328050980654e-09, + "loss": 0.0284, + "step": 5547 + }, + { + "epoch": 3.96, + "grad_norm": 3.9420052048363736, + "learning_rate": 2.6192610809566697e-09, + "loss": 0.0237, + "step": 5548 + }, + { + "epoch": 3.96, + "grad_norm": 2.182296768945293, + "learning_rate": 2.5265590746076373e-09, + "loss": 0.0113, + "step": 5549 + }, + { + "epoch": 3.96, + "grad_norm": 2.6773104562420236, + "learning_rate": 2.43552681702508e-09, + "loss": 0.0203, + "step": 5550 + }, + { + "epoch": 3.96, + "grad_norm": 3.367481619945359, + "learning_rate": 2.346164338624113e-09, + "loss": 0.0228, + "step": 5551 + }, + { + "epoch": 3.96, + "grad_norm": 5.067110242743501, + "learning_rate": 2.2584716692619636e-09, + "loss": 0.0237, + "step": 5552 + }, + { + "epoch": 3.96, + "grad_norm": 2.0718286609725105, + "learning_rate": 2.172448838239083e-09, + "loss": 0.0221, + "step": 5553 + }, + { + "epoch": 3.96, + "grad_norm": 2.662417149164622, + "learning_rate": 2.08809587429748e-09, + "loss": 0.0202, + "step": 5554 + }, + { + "epoch": 3.97, + "grad_norm": 3.989820634216304, + "learning_rate": 2.0054128056201662e-09, + "loss": 0.0137, + "step": 5555 + }, + { + "epoch": 3.97, + "grad_norm": 1.8775842103565406, + "learning_rate": 1.924399659833376e-09, + "loss": 0.014, + "step": 5556 + }, + { + "epoch": 3.97, + "grad_norm": 7.413735859817457, + "learning_rate": 1.8450564640054569e-09, + "loss": 0.046, + "step": 5557 + }, + { + "epoch": 3.97, + "grad_norm": 3.243264854700322, + "learning_rate": 1.7673832446463146e-09, + "loss": 0.0276, + "step": 5558 + }, + { + "epoch": 3.97, + "grad_norm": 7.71185976345551, + "learning_rate": 1.6913800277085225e-09, + "loss": 0.031, + "step": 5559 + }, + { + "epoch": 3.97, + "grad_norm": 6.885871743475731, + "learning_rate": 1.6170468385845462e-09, + "loss": 0.0285, + "step": 5560 + }, + { + "epoch": 3.97, + "grad_norm": 3.6615117870884992, + "learning_rate": 1.5443837021122954e-09, + "loss": 0.0207, + "step": 5561 + }, + { + "epoch": 3.97, + "grad_norm": 4.28734845963075, + "learning_rate": 1.473390642569017e-09, + "loss": 0.0173, + "step": 5562 + }, + { + "epoch": 3.97, + "grad_norm": 2.702589837418001, + "learning_rate": 1.4040676836746259e-09, + "loss": 0.0251, + "step": 5563 + }, + { + "epoch": 3.97, + "grad_norm": 5.090936212710113, + "learning_rate": 1.336414848591705e-09, + "loss": 0.0137, + "step": 5564 + }, + { + "epoch": 3.97, + "grad_norm": 5.178772005848657, + "learning_rate": 1.2704321599243951e-09, + "loss": 0.0318, + "step": 5565 + }, + { + "epoch": 3.97, + "grad_norm": 3.0897135807835494, + "learning_rate": 1.206119639718395e-09, + "loss": 0.0284, + "step": 5566 + }, + { + "epoch": 3.97, + "grad_norm": 6.956302961997545, + "learning_rate": 1.1434773094615158e-09, + "loss": 0.0202, + "step": 5567 + }, + { + "epoch": 3.97, + "grad_norm": 5.512949716160351, + "learning_rate": 1.0825051900842377e-09, + "loss": 0.0299, + "step": 5568 + }, + { + "epoch": 3.98, + "grad_norm": 4.34290757998301, + "learning_rate": 1.0232033019580423e-09, + "loss": 0.0247, + "step": 5569 + }, + { + "epoch": 3.98, + "grad_norm": 1.9398553518199555, + "learning_rate": 9.655716648970804e-10, + "loss": 0.0212, + "step": 5570 + }, + { + "epoch": 3.98, + "grad_norm": 3.452078753978198, + "learning_rate": 9.096102981570598e-10, + "loss": 0.0181, + "step": 5571 + }, + { + "epoch": 3.98, + "grad_norm": 1.1011444564361026, + "learning_rate": 8.553192204358018e-10, + "loss": 0.0116, + "step": 5572 + }, + { + "epoch": 3.98, + "grad_norm": 2.2777409632755012, + "learning_rate": 8.026984498726853e-10, + "loss": 0.0277, + "step": 5573 + }, + { + "epoch": 3.98, + "grad_norm": 4.323892977235624, + "learning_rate": 7.517480040497572e-10, + "loss": 0.0181, + "step": 5574 + }, + { + "epoch": 3.98, + "grad_norm": 1.8751831593629684, + "learning_rate": 7.024678999900669e-10, + "loss": 0.0309, + "step": 5575 + }, + { + "epoch": 3.98, + "grad_norm": 2.7347866857145986, + "learning_rate": 6.548581541593324e-10, + "loss": 0.0186, + "step": 5576 + }, + { + "epoch": 3.98, + "grad_norm": 2.8926365897897406, + "learning_rate": 6.08918782464829e-10, + "loss": 0.0178, + "step": 5577 + }, + { + "epoch": 3.98, + "grad_norm": 4.6764673593214505, + "learning_rate": 5.646498002553902e-10, + "loss": 0.0231, + "step": 5578 + }, + { + "epoch": 3.98, + "grad_norm": 5.802111064150024, + "learning_rate": 5.220512223219621e-10, + "loss": 0.0308, + "step": 5579 + }, + { + "epoch": 3.98, + "grad_norm": 2.0047609631552525, + "learning_rate": 4.81123062898714e-10, + "loss": 0.0163, + "step": 5580 + }, + { + "epoch": 3.98, + "grad_norm": 1.0730242231401868, + "learning_rate": 4.4186533565915293e-10, + "loss": 0.0146, + "step": 5581 + }, + { + "epoch": 3.98, + "grad_norm": 3.672437423453078, + "learning_rate": 4.042780537205637e-10, + "loss": 0.0192, + "step": 5582 + }, + { + "epoch": 3.99, + "grad_norm": 4.753438723651524, + "learning_rate": 3.6836122964178934e-10, + "loss": 0.0247, + "step": 5583 + }, + { + "epoch": 3.99, + "grad_norm": 2.127300877016818, + "learning_rate": 3.341148754232304e-10, + "loss": 0.0155, + "step": 5584 + }, + { + "epoch": 3.99, + "grad_norm": 7.34220557033025, + "learning_rate": 3.015390025068454e-10, + "loss": 0.039, + "step": 5585 + }, + { + "epoch": 3.99, + "grad_norm": 6.750258237012312, + "learning_rate": 2.706336217767058e-10, + "loss": 0.0253, + "step": 5586 + }, + { + "epoch": 3.99, + "grad_norm": 2.231439477264196, + "learning_rate": 2.4139874355955105e-10, + "loss": 0.0146, + "step": 5587 + }, + { + "epoch": 3.99, + "grad_norm": 3.31797560363478, + "learning_rate": 2.138343776231233e-10, + "loss": 0.0244, + "step": 5588 + }, + { + "epoch": 3.99, + "grad_norm": 4.782531101487246, + "learning_rate": 1.8794053317672255e-10, + "loss": 0.0175, + "step": 5589 + }, + { + "epoch": 3.99, + "grad_norm": 1.6196649621284065, + "learning_rate": 1.6371721887287196e-10, + "loss": 0.0192, + "step": 5590 + }, + { + "epoch": 3.99, + "grad_norm": 1.720724803964426, + "learning_rate": 1.4116444280398711e-10, + "loss": 0.0169, + "step": 5591 + }, + { + "epoch": 3.99, + "grad_norm": 3.9654605728386154, + "learning_rate": 1.2028221250570683e-10, + "loss": 0.015, + "step": 5592 + }, + { + "epoch": 3.99, + "grad_norm": 3.692236086799763, + "learning_rate": 1.0107053495522767e-10, + "loss": 0.0205, + "step": 5593 + }, + { + "epoch": 3.99, + "grad_norm": 2.5848778676255066, + "learning_rate": 8.35294165718592e-11, + "loss": 0.027, + "step": 5594 + }, + { + "epoch": 3.99, + "grad_norm": 5.861472760760072, + "learning_rate": 6.765886321646874e-11, + "loss": 0.021, + "step": 5595 + }, + { + "epoch": 3.99, + "grad_norm": 6.288974718765686, + "learning_rate": 5.345888019092638e-11, + "loss": 0.0381, + "step": 5596 + }, + { + "epoch": 4.0, + "grad_norm": 3.6870045895899226, + "learning_rate": 4.092947224032529e-11, + "loss": 0.0194, + "step": 5597 + }, + { + "epoch": 4.0, + "grad_norm": 7.961768982598845, + "learning_rate": 3.007064355076139e-11, + "loss": 0.0204, + "step": 5598 + }, + { + "epoch": 4.0, + "grad_norm": 2.888016420410043, + "learning_rate": 2.088239775044354e-11, + "loss": 0.0248, + "step": 5599 + }, + { + "epoch": 4.0, + "grad_norm": 6.1509337395528, + "learning_rate": 1.3364737909138392e-11, + "loss": 0.025, + "step": 5600 + }, + { + "epoch": 4.0, + "grad_norm": 3.4536548965699705, + "learning_rate": 7.517666539280654e-12, + "loss": 0.039, + "step": 5601 + }, + { + "epoch": 4.0, + "grad_norm": 1.643426850528451, + "learning_rate": 3.3411855937526273e-12, + "loss": 0.0276, + "step": 5602 + }, + { + "epoch": 4.0, + "grad_norm": 3.699342004217049, + "learning_rate": 8.352964681046516e-13, + "loss": 0.0218, + "step": 5603 + }, + { + "epoch": 4.0, + "grad_norm": 1.6873280750994422, + "learning_rate": 0.0, + "loss": 0.0218, + "step": 5604 + }, + { + "epoch": 4.0, + "step": 5604, + "total_flos": 750239094767616.0, + "train_loss": 0.1176493737971587, + "train_runtime": 15635.0784, + "train_samples_per_second": 22.932, + "train_steps_per_second": 0.358 + } + ], + "logging_steps": 1.0, + "max_steps": 5604, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 24000, + "total_flos": 750239094767616.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}